commit 99fb09d1e6b62d5af47b051dec67498efcb0f9e8 Author: Lokendra Singh Date: Mon Aug 12 20:36:46 2019 +0530 added: Tagger class (#62) * added: tagger in apertium * updated: readme * readme: minor fix * fix: python syntax highlighting * remove: ToDo diff --git a/README.md b/README.md index 7063459..153aa9a 100644 --- a/README.md +++ b/README.md @@ -6,31 +6,28 @@ [![Coverage Status](https://coveralls.io/repos/github/apertium/apertium-python/badge.svg?branch=master)](https://coveralls.io/github/apertium/apertium-python?branch=master) ## Introduction -- The code-base is in development for the Gsoc '18 project called **Apertium API in Python** +- The code-base is in development for the GSoC '19 project called **Apertium API in Python** - The Apertium core modules are written in C++. - This project is an attempt to make the Apertium modules available in python, which because of it's simplicity is more appealing to users. ## About the Exisiting Code Base -- The exisiting code base has the subprocess implementation of the basic functions of Apertium. -- A branch called the ```windows``` has the implementation for the ```windows``` support and will soon be available on master. Detailed instructions can be found [here](https://gist.github.com/arghyatiger/c8aab476022158f4bdb3dbe45308cdb4) - -## Major things to do -- Subprocess implementation of the C++ functions in Apertium. To make the wrapper thinner. -- Other small issues can be found [here](https://github.com/apertium/apertium-python/issues) +- The exisiting code base has the subprocess and swig wrapper implementation of the basic functions of Apertium. ## Usage of library +- For multiple invocations `Method 1` is more performant, as the dictionary needs to be loaded only once. + ### Analysis Performing Morphological Analysis -Method 1: One can create ```Analyzer``` objects on which ```analyze()``` function can be run. +Method 1: Create an `Analyzer` object and call its `analyze` method. ```python In [1]: import apertium In [2]: a = apertium.Analyzer('en') In [3]: a.analyze('cats') Out[3]: [cats/cat, ./.] ``` -Method 2: Alternatively, the library provides an option to directly run the ```analyze``` method. +Method 2: Calling `analyze()` directly. ```python In [1]: import apertium In [2]: apertium.analyze('en', 'cats') @@ -40,15 +37,15 @@ Out[2]: cats/cat ### Generation Performing Morphological Generation -Method 1: Just like the ```Analyzer```, One can create ```Generator``` objects on which ```generate()``` function can be run. -```python +Method 1: Create a `Generator` object and call its `generate` method. +```python In [1]: import apertium In [2]: g = apertium.Generator('en') In [3]: g.generate('^cat$') Out[3]: 'cats' ``` -Method 2: Running ```generate()``` directly. -```python +Method 2: Calling `generate()` directly. +```python In [1]: import apertium In [2]: apertium.generate('en', '^cat$') Out[2]: 'cats' @@ -61,11 +58,32 @@ In [1]: import apertium In [2]: apertium.append_pair_path('..') ``` +### Tagger +Method 1: Create a `Tagger` object and call its `tag` method. +```python +In [1]: import apertium +In [2]: tagger = apertium.Tagger('eng') +In [3]: tagger.tag('cats') +Out[3]: [cats/cat] +``` +Method 2: Calling `tag()` directly. +```python +In [1]: import apertium +In [2]: apertium.tag('en', 'cats') +Out[2]: [cats/cat] +``` + ### Translation -Performing Translations +Method 1: Create a `Translator` object and call its `translate` method. ```python In [1]: import apertium In [2]: t = apertium.Translator('eng', 'spa') In [3]: t.translate('cats') Out[3]: 'Gatos' ``` +Method 2: Calling `translate()` directly. +```python +In [1]: import apertium +In [2]: apertium.translate('en', 'spa', 'cats') +Out[2]: 'Gatos' +``` \ No newline at end of file diff --git a/apertium/__init__.py b/apertium/__init__.py index a2b1501..061ffb3 100644 --- a/apertium/__init__.py +++ b/apertium/__init__.py @@ -7,6 +7,7 @@ from apertium.analysis import analyze, Analyzer # noqa: F401 from apertium.generation import generate, Generator # noqa: F401 from apertium.installer import install_module # noqa: F401 from apertium.mode_search import search_path +from apertium.tagger import tag, Tagger # noqa: F401 from apertium.translation import translate, Translator # noqa: F401 @@ -29,6 +30,9 @@ def _update_modes(pair_path: str) -> None: if modes['generator']: for dirpath, modename, lang_pair in modes['generator']: generators[lang_pair] = (dirpath, modename) + if modes['tagger']: + for dirpath, modename, lang_pair in modes['tagger']: + taggers[lang_pair] = (dirpath, modename) def append_pair_path(pair_path: str) -> None: @@ -66,6 +70,7 @@ def update_path_windows() -> None: pair_paths = ['/usr/share/apertium', '/usr/local/share/apertium'] analyzers = {} # type: Dict[str, Tuple[str, str]] generators = {} # type: Dict[str, Tuple[str, str]] +taggers = {} # type: Dict[str, Tuple[str, str]] pairs = {} # type: Dict[str, str] for pair_path in pair_paths: _update_modes(pair_path) diff --git a/apertium/mode_search.py b/apertium/mode_search.py index 24e309e..b3bd8ce 100644 --- a/apertium/mode_search.py +++ b/apertium/mode_search.py @@ -51,11 +51,13 @@ def search_path(rootpath: str, include_pairs: bool = True) -> Dict[str, List[Tup 'analyzer': re.compile(r'(({0}(-{0})?)-(an)?mor(ph)?)\.mode'.format(lang_code)), 'generator': re.compile(r'(({0}(-{0})?)-gener[A-z]*)\.mode'.format(lang_code)), 'pair': re.compile(r'({0})-({0})\.mode'.format(lang_code)), + 'tagger': re.compile(r'(({0}(-{0})?)-tagger[A-z]*)\.mode'.format(lang_code)), } modes = { 'analyzer': [], 'generator': [], 'pair': [], + 'tagger': [], } # type: Dict[str, List[Tuple[str, str, str]]] real_root = os.path.abspath(os.path.realpath(rootpath)) diff --git a/apertium/tagger/__init__.py b/apertium/tagger/__init__.py new file mode 100644 index 0000000..2a04334 --- /dev/null +++ b/apertium/tagger/__init__.py @@ -0,0 +1,85 @@ +import os +from typing import Dict, List + +from streamparser import LexicalUnit, parse + +import apertium +from apertium.utils import execute_pipeline, parse_mode_file, to_alpha3_code + + +class Tagger: + """ + Attributes: + tagger_cmds (Dict[str, List[List[str]]]) + lang (str) + """ + + def __init__(self, lang: str) -> None: + """ + Args: + lang (str) + """ + self.tagger_cmds = {} # type: Dict[str, List[List[str]]] + self.lang = to_alpha3_code(lang) # type: str + if self.lang not in apertium.taggers: + raise apertium.ModeNotInstalled(self.lang) + else: + self.path, self.mode = apertium.taggers[self.lang] + + def _get_commands(self) -> List[List[str]]: + """ + Returns: + List[List[str]] + """ + if self.lang not in self.tagger_cmds: + mode_path, mode = apertium.taggers[self.lang] + abs_mode_path = os.path.join(mode_path, 'modes', '{}.mode'.format(mode)) + self.tagger_cmds[self.lang] = parse_mode_file(abs_mode_path) + + return self.tagger_cmds[self.lang] + + @staticmethod + def _postproc_text(result: str) -> List[LexicalUnit]: + """ + Postprocesses the input + + Args: + result (str) + + Returns: + List[LexicalUnit] + """ + lexical_units = list(parse(result)) + return lexical_units + + def tag(self, in_text: str, formatting: str = 'txt') -> List[LexicalUnit]: + """ + Runs apertium to tagger the input + + Args: + in_text (str) + formatting (str) + + Returns: + List[LexicalUnit] + """ + self._get_commands() + deformatter = ['apertium-des{}'.format(formatting), '-n'] + if deformatter not in self.tagger_cmds[self.lang]: + self.tagger_cmds[self.lang].insert(0, deformatter) + result = execute_pipeline(in_text, self.tagger_cmds[self.lang]) + return self._postproc_text(result) + + +def tag(lang: str, in_text: str, formatting: str = 'txt') -> List[LexicalUnit]: + """ + Args: + lang (str) + in_text (str) + formatting (str) + + Returns: + List[LexicalUnit] + """ + tagger = Tagger(lang) + return tagger.tag(in_text, formatting) diff --git a/tests/__init__.py b/tests/__init__.py index 03a7377..c4f47a6 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -119,3 +119,24 @@ class TestTranslate(unittest.TestCase): translator = apertium.Translator('kaz', 'tat') translated = translator.translate('мысық') self.assertEqual(translated, 'мәче') + + +class TestTagger(unittest.TestCase): + def test_tagger_en(self): + tagger = apertium.Tagger('en') + lexical_units = tagger.tag('cats') + lexical_unit = lexical_units[0] + self.assertListEqual(lexical_unit.readings, [[SReading(baseform='cat', tags=['n', 'pl'])]]) + self.assertEqual(lexical_unit.wordform, 'cats') + self.assertEqual(lexical_unit.knownness, known) + + def test_tag_en(self): + lexical_units = apertium.tag('eng', 'cats') + lexical_unit = lexical_units[0] + self.assertListEqual(lexical_unit.readings, [[SReading(baseform='cat', tags=['n', 'pl'])]]) + self.assertEqual(lexical_unit.wordform, 'cats') + self.assertEqual(lexical_unit.knownness, known) + + def test_uninstalled_mode(self): + with self.assertRaises(apertium.ModeNotInstalled): + apertium.Tagger('spa')