commit cc19a130a8e109df3726b56f8d95827af9abfde1 Author: vaydheesh Date: Mon Aug 12 10:17:45 2019 +0530 added: tagger in apertium diff --git a/README.md b/README.md index 7063459..1247425 100644 --- a/README.md +++ b/README.md @@ -61,11 +61,32 @@ In [1]: import apertium In [2]: apertium.append_pair_path('..') ``` +### Tagger +Method 1: One can create ```Tagger``` objects on which ```tag()``` function can be run. +```python +In [1]: import apertium +In [2]: tagger = apertium.Tagger('eng') +In [3]: tagger.tag('cats') +Out[3]: [cats/cat] +``` +Method 2: Running ```tag()``` directly. +```python +In [1]: import apertium +In [2]: apertium.tag('en', 'cats') +Out[2]: [cats/cat] +``` + ### Translation -Performing Translations +Method 1: One can create ```Translator``` objects on which ```translate()``` function can be run. ```python In [1]: import apertium In [2]: t = apertium.Translator('eng', 'spa') In [3]: t.translate('cats') Out[3]: 'Gatos' ``` +Method 2: Running ```translate()``` directly. +```python +In [1]: import apertium +In [2]: apertium.translate('en', 'spa', 'cats') +Out[2]: 'Gatos' +``` \ No newline at end of file diff --git a/apertium/__init__.py b/apertium/__init__.py index a2b1501..061ffb3 100644 --- a/apertium/__init__.py +++ b/apertium/__init__.py @@ -7,6 +7,7 @@ from apertium.analysis import analyze, Analyzer # noqa: F401 from apertium.generation import generate, Generator # noqa: F401 from apertium.installer import install_module # noqa: F401 from apertium.mode_search import search_path +from apertium.tagger import tag, Tagger # noqa: F401 from apertium.translation import translate, Translator # noqa: F401 @@ -29,6 +30,9 @@ def _update_modes(pair_path: str) -> None: if modes['generator']: for dirpath, modename, lang_pair in modes['generator']: generators[lang_pair] = (dirpath, modename) + if modes['tagger']: + for dirpath, modename, lang_pair in modes['tagger']: + taggers[lang_pair] = (dirpath, modename) def append_pair_path(pair_path: str) -> None: @@ 
-66,6 +70,7 @@ def update_path_windows() -> None: pair_paths = ['/usr/share/apertium', '/usr/local/share/apertium'] analyzers = {} # type: Dict[str, Tuple[str, str]] generators = {} # type: Dict[str, Tuple[str, str]] +taggers = {} # type: Dict[str, Tuple[str, str]] pairs = {} # type: Dict[str, str] for pair_path in pair_paths: _update_modes(pair_path) diff --git a/apertium/mode_search.py b/apertium/mode_search.py index 24e309e..b3bd8ce 100644 --- a/apertium/mode_search.py +++ b/apertium/mode_search.py @@ -51,11 +51,13 @@ def search_path(rootpath: str, include_pairs: bool = True) -> Dict[str, List[Tup 'analyzer': re.compile(r'(({0}(-{0})?)-(an)?mor(ph)?)\.mode'.format(lang_code)), 'generator': re.compile(r'(({0}(-{0})?)-gener[A-z]*)\.mode'.format(lang_code)), 'pair': re.compile(r'({0})-({0})\.mode'.format(lang_code)), + 'tagger': re.compile(r'(({0}(-{0})?)-tagger[A-z]*)\.mode'.format(lang_code)), } modes = { 'analyzer': [], 'generator': [], 'pair': [], + 'tagger': [], } # type: Dict[str, List[Tuple[str, str, str]]] real_root = os.path.abspath(os.path.realpath(rootpath)) diff --git a/apertium/tagger/__init__.py b/apertium/tagger/__init__.py new file mode 100644 index 0000000..2a04334 --- /dev/null +++ b/apertium/tagger/__init__.py @@ -0,0 +1,85 @@ +import os +from typing import Dict, List + +from streamparser import LexicalUnit, parse + +import apertium +from apertium.utils import execute_pipeline, parse_mode_file, to_alpha3_code + + +class Tagger: + """ + Attributes: + tagger_cmds (Dict[str, List[List[str]]]) + lang (str) + """ + + def __init__(self, lang: str) -> None: + """ + Args: + lang (str) + """ + self.tagger_cmds = {} # type: Dict[str, List[List[str]]] + self.lang = to_alpha3_code(lang) # type: str + if self.lang not in apertium.taggers: + raise apertium.ModeNotInstalled(self.lang) + else: + self.path, self.mode = apertium.taggers[self.lang] + + def _get_commands(self) -> List[List[str]]: + """ + Returns: + List[List[str]] + """ + if self.lang not in 
self.tagger_cmds: + mode_path, mode = apertium.taggers[self.lang] + abs_mode_path = os.path.join(mode_path, 'modes', '{}.mode'.format(mode)) + self.tagger_cmds[self.lang] = parse_mode_file(abs_mode_path) + + return self.tagger_cmds[self.lang] + + @staticmethod + def _postproc_text(result: str) -> List[LexicalUnit]: + """ + Parses the result string into a list of lexical units + + Args: + result (str) + + Returns: + List[LexicalUnit] + """ + lexical_units = list(parse(result)) + return lexical_units + + def tag(self, in_text: str, formatting: str = 'txt') -> List[LexicalUnit]: + """ + Runs the apertium tagger pipeline on the input + + Args: + in_text (str): text to tag + formatting (str): suffix of the apertium-des<formatting> deformatter run first + + Returns: + List[LexicalUnit] + """ + commands = list(self._get_commands()) # copy: never mutate the cached command list + deformatter = ['apertium-des{}'.format(formatting), '-n'] + if deformatter not in commands: + commands.insert(0, deformatter) + result = execute_pipeline(in_text, commands) + return self._postproc_text(result) + + +def tag(lang: str, in_text: str, formatting: str = 'txt') -> List[LexicalUnit]: + """ + Args: + lang (str) + in_text (str) + formatting (str) + + Returns: + List[LexicalUnit] + """ + tagger = Tagger(lang) + return tagger.tag(in_text, formatting) diff --git a/tests/__init__.py b/tests/__init__.py index 03a7377..c4f47a6 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -119,3 +119,24 @@ class TestTranslate(unittest.TestCase): translator = apertium.Translator('kaz', 'tat') translated = translator.translate('мысық') self.assertEqual(translated, 'мәче') + + +class TestTagger(unittest.TestCase): + def test_tagger_en(self): + tagger = apertium.Tagger('en') + lexical_units = tagger.tag('cats') + lexical_unit = lexical_units[0] + self.assertListEqual(lexical_unit.readings, [[SReading(baseform='cat', tags=['n', 'pl'])]]) + self.assertEqual(lexical_unit.wordform, 'cats') + self.assertEqual(lexical_unit.knownness, known) + + def test_tag_en(self): + lexical_units = apertium.tag('eng', 'cats') + lexical_unit = 
lexical_units[0] + self.assertListEqual(lexical_unit.readings, [[SReading(baseform='cat', tags=['n', 'pl'])]]) + self.assertEqual(lexical_unit.wordform, 'cats') + self.assertEqual(lexical_unit.knownness, known) + + def test_uninstalled_mode(self): + with self.assertRaises(apertium.ModeNotInstalled): + apertium.Tagger('spa')