commit c7490dc40d6507cd725ac1554b7fb82d619ca4ad Author: vivekvardhanadepu Date: Thu Jul 29 01:23:18 2021 +0530 init non-parallel corpora training diff --git a/.github/workflows/training.yml b/.github/workflows/training.yml index 2552d9b..2c70d5c 100644 --- a/.github/workflows/training.yml +++ b/.github/workflows/training.yml @@ -19,7 +19,7 @@ jobs: pip3 install -r requirements.txt - name: run - run: python3 check_config.py tests/training/config.toml + run: "! python3 check_config.py tests/training/config.toml" training: name: lexical selection training @@ -72,4 +72,4 @@ jobs: pip3 install -r requirements.txt - name: Training - run: python3 lexical_training.py tests/training/config.toml + run: python3 lexical_selection_training.py tests/training/config.toml diff --git a/.gitignore b/.gitignore index 8a3b797..ec53a12 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,7 @@ __pycache__/ /*.toml # corpora -europarl* \ No newline at end of file +europarl* + +# bash files +/*.sh \ No newline at end of file diff --git a/README.md b/README.md index 4601445..4809bb0 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,22 @@ for more, read https://wiki.apertium.org/wiki/Ideas_for_Google_Summer_of_Code/Us ## requirements +**parallel corpora:** + - [parallel corpus](https://wiki.apertium.org/wiki/Corpora) - [apertium-core](https://wiki.apertium.org/wiki/Installation) (install apertium-lex-tools with yasmet) - [fast_align](https://github.com/clab/fast_align) - [language pair](https://wiki.apertium.org/wiki/List_of_language_pairs) (install locally) - python dependencies in [requirements.txt](requirements.txt) +**non-parallel corpora:** +- [non-parallel corpus](https://wiki.apertium.org/wiki/Corpora) +- [apertium-core](https://wiki.apertium.org/wiki/Installation) +- [language pair](https://wiki.apertium.org/wiki/List_of_language_pairs) (install locally) +- [IRSTLM](https://wiki.apertium.org/wiki/IRSTLM) +- python dependencies in [requirements.txt](requirements.txt) + + ## how to use - install the requirements and download or clone this repo (`git clone https://github.com/vivekvardhanadepu/apertium-lexical-training.git`) @@ -34,3 +44,6 @@ This folder contains scripts and data for automated testing of the training scri Philipp Koehn. *Europarl: A Parallel Corpus for Statistical Machine Translation.* MT Summit 2005. + +[2] +https://www-i6.informatik.rwth-aachen.de/web/Software/YASMET.html diff --git a/config.toml.example b/config.toml.example index 9c9d15c..ed7ea17 100644 --- a/config.toml.example +++ b/config.toml.example @@ -18,7 +18,7 @@ CORPUS_TL = "europarl-v7.eng-spa.spa" # apertium-lex-tools scripts # LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" -# fast align build folder +# fast align build folder[not required for non-parallel training] FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" # apertium language data diff --git a/lexical_training.py b/lexical_selection_training.py similarity index 99% rename from lexical_training.py rename to lexical_selection_training.py index d91916e..eb6ec33 100644 --- a/lexical_training.py +++ b/lexical_selection_training.py @@ -252,7 +252,7 @@ def training(config, cache_dir, log): f0.seek(0) f1.truncate(0) # print(l) - cmds = [['grep', f'^{l}'], ['cut', '-f', '2'], ['head', '-1']] + cmds = [['grep', f'^{l}'], ['head', '-1'], ['cut', '-f', '2']] pipe(cmds, f0, f1, log).wait() f0.seek(0)