commit 52cc3be3994d777031e28f4dd015bddadf0de303 Author: vivekvardhanadepu Date: Sun Aug 1 15:54:07 2021 +0530 check_config and github actions for non-parallel training added diff --git a/.github/workflows/training.yml b/.github/workflows/training.yml index 2c70d5c..4477189 100644 --- a/.github/workflows/training.yml +++ b/.github/workflows/training.yml @@ -18,18 +18,18 @@ jobs: sudo apt-get -qfy install python3-pip pip3 install -r requirements.txt - - name: run + - name: Parallel run: "! python3 check_config.py tests/training/config.toml" - training: - name: lexical selection training + - name: Non-parallel + run: "! python3 check_config.py tests/training/config-np.toml" + + parallel_training: + name: parallel corpora training runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - # - name: Running check_config.py(before installation of prerequisites) - # run: python3 check_config.py tests/training/config.toml - - name: Installing apertium dependencies run: | sudo apt-get -qy update @@ -53,7 +53,7 @@ jobs: make -j4 VERBOSE=1 V=1 cd .. 
- - name: checking out apertium-eng-spa + - name: Checking out apertium-eng-spa uses: actions/checkout@v2 with: repository: apertium/apertium-eng-spa @@ -73,3 +73,52 @@ jobs: - name: Training run: python3 lexical_selection_training.py tests/training/config.toml + + non_parallel_training: + name: non-parallel corpora training + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Installing apertium dependencies + run: | + sudo apt-get -qy update + sudo apt-get -qfy install wget ca-certificates + wget -q https://apertium.projectjj.com/apt/install-nightly.sh -O - | sudo bash + sudo apt-get -qfy install --no-install-recommends apertium-all-dev + + - name: Checking out apertium-eng-spa + uses: actions/checkout@v2 + with: + repository: apertium/apertium-eng-spa + path: apertium-eng-spa + + - name: Installing apertium-eng-spa locally + working-directory: apertium-eng-spa + run: | + autoreconf -fvi + ./configure + make -j4 VERBOSE=1 V=1 + + - name: Installing python dependencies + run: | + sudo apt-get -qfy install python3-pip + pip3 install -r requirements.txt + + - name: Checking out IRSTLM + uses: actions/checkout@v2 + with: + repository: irstlm-team/irstlm + path: irstlm + + - name: Installing IRSTLM + working-directory: irstlm + run: | + sed -i 's/isystem/I/' src/Makefile.am + sh regenerate-makefiles.sh + ./configure + make -j4 VERBOSE=1 V=1 + sudo make install + + - name: Training + run: python3 lexical_selection_training.py tests/training/config-np.toml diff --git a/check_config.py b/check_config.py index 533483e..5beccdd 100644 --- a/check_config.py +++ b/check_config.py @@ -11,13 +11,14 @@ fast_align_url = "https://github.com/clab/fast_align" langs_url = "https://wiki.apertium.org/wiki/List_of_language_pairs" apertium_url = "https://wiki.apertium.org/wiki/Installation" yasmet_url = "https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules" +irstlm_url = "https://wiki.apertium.org/wiki/IRSTLM" -def check_config(filename='config.toml'): 
+def check_config(config_filename): misconfigured = False lex_tools_paths = ['/opt/local/share/apertium-lex-tools', '/usr/local/share/apertium-lex-tools', '/usr/share/apertium-lex-tools'] - with open(filename) as config_file: + with open(config_filename) as config_file: config_toml = config_file.read() config = parse(config_toml) @@ -25,7 +26,7 @@ def check_config(filename='config.toml'): assert config_toml == dumps(config) # changing the paths to absolute - for key in ['CORPUS_SL', 'CORPUS_TL', 'FAST_ALIGN', 'LANG_DATA']: + for key in ['CORPUS_SL', 'CORPUS_TL', 'LANG_DATA']: if not os.path.isabs(config[key]): config[key] = os.path.join(os.path.abspath('.'), config[key]) @@ -39,56 +40,19 @@ def check_config(filename='config.toml'): f"'{config['CORPUS_TL']}'(CORPUS_TL) is not a file, provide a valid file or \nto download, look {corpora_url}\n") misconfigured = True - is_lex_tools_present = False - for lex_tools in lex_tools_paths: - if os.path.isdir(lex_tools): - scripts = ['extract-sentences.py', 'extract-freq-lexicon.py', - 'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py', - 'ngrams-to-rules-me.py', 'common.py'] - - for script in scripts: - if not os.path.isfile(os.path.join(lex_tools, script)): - print( - f"'{script}' is not present in '{lex_tools}', re-install apertium-lex-tools {apertium_url}\n") - misconfigured = True - is_lex_tools_present = True - - if not is_lex_tools_present: - print( - f"'apertium_lex_tools'is not installed, to install apertium-lex-tools follow {apertium_url}\n") - misconfigured = True - - # assuming scripts are intact - # if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']): - # print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),", - # "provide a valid directory or \nto install, follow", lex_tools_url, '\n') - # misconfigured = True - - if not os.path.isfile(config['FAST_ALIGN']): - print( - f"'{config['FAST_ALIGN']}'(FAST_ALIGN) is not a file, provide a valid 
executable or \nto install, follow {fast_align_url}\n") - misconfigured = True - # else: - # if 'fast_align' not in os.listdir(config['FAST_ALIGN']): - # print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \ - # "provide a valid directory or \nto install, follow", fast_align_url, '\n') - # misconfigured = True - if not os.path.isdir(config['LANG_DATA']): print( f"'{config['LANG_DATA']}'(LANG_DATA) is not a directory, provide a valid directory or \nto install, follow {langs_url}\n") misconfigured = True else: - sl_tl_autobil = f"{config['SL']}-{config['TL']}.autobil.bin" - tl_sl_autobil = f"{config['TL']}-{config['SL']}.autobil.bin" - if sl_tl_autobil not in os.listdir(config['LANG_DATA']): - print(f"'{sl_tl_autobil}' is not in '{config['LANG_DATA']}'(LANG_DATA), \ - provide a valid directory or \nto install, follow {langs_url}\n") - misconfigured = True - if tl_sl_autobil not in os.listdir(config['LANG_DATA']): - print(f"'{tl_sl_autobil}' is not in '{config['LANG_DATA']}'(LANG_DATA), \ - provide a valid directory or \nto install, follow {langs_url}\n") - misconfigured = True + modules = [] + modules.append(f"{config['SL']}-{config['TL']}.autobil.bin") + modules.append(f"{config['TL']}-{config['SL']}.autobil.bin") + for module in modules: + if module not in os.listdir(config['LANG_DATA']): + print(f"'{module}' is not in '{config['LANG_DATA']}'(LANG_DATA), \ + provide a valid directory or \nto install, follow {langs_url}\n") + misconfigured = True apertium_present = False for path in os.environ["PATH"].split(os.pathsep): @@ -101,40 +65,159 @@ def check_config(filename='config.toml'): f"apertium is either not installed or not added to path, see {apertium_url}\n") misconfigured = True - yasmet_present = False - for path in os.environ["PATH"].split(os.pathsep): - if os.path.isfile(os.path.join(path, 'yasmet')): - yasmet_present = True - break - - if not yasmet_present: - print( - f"yasmet is either not installed or not added to path, install 
yasmet and add to the path, \ - {yasmet_url} or re-install apertium-lex-tools with yasmet, {apertium_url}\n") - misconfigured = True - - process_tagger_output_present = False - for path in os.environ["PATH"].split(os.pathsep): - if os.path.isfile(os.path.join(path, 'process-tagger-output')): - process_tagger_output_present = True - break - - if not process_tagger_output_present: + if not isinstance(config['TRAINING_LINES'], int): print( - f"process-tagger-output is either not installed or not added to path, re-install apertium-lex-tools {apertium_url}\n") + f"'{config['TRAINING_LINES']}'(TRAINING_LINES) is not an integer. pass an integer \n") misconfigured = True - if not isinstance(config['TRAINING_LINES'], int): + if not isinstance(config['IS_PARALLEL'], bool): print( - f"'{config['TRAINING_LINES']}'(TRAINING_LINES) is not an integer \n") + f"'{config['IS_PARALLEL']}'(IS_PARALLEL) is not an boolean. pass true or false \n") misconfigured = True + else: + if config['IS_PARALLEL']: + yasmet_present = False + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'yasmet')): + yasmet_present = True + break + + if not yasmet_present: + print( + f"yasmet is either not installed or not added to path, install yasmet and add to the path, \ + {yasmet_url} or \nre-install apertium-lex-tools with yasmet, {apertium_url}\n") + misconfigured = True + + process_tagger_output_present = False + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'process-tagger-output')): + process_tagger_output_present = True + break + + if not process_tagger_output_present: + print( + f"process-tagger-output is not installed, re-install apertium-lex-tools {apertium_url}\n") + misconfigured = True + + if not os.path.isabs(config['FAST_ALIGN']): + config['FAST_ALIGN'] = os.path.join( + os.path.abspath('.'), config['FAST_ALIGN']) + if not os.path.isfile(config['FAST_ALIGN']): + print( + f"'{config['FAST_ALIGN']}'(FAST_ALIGN) is 
not a file, provide a valid executable or \nto install, follow {fast_align_url}\n") + misconfigured = True + # else: + # if 'fast_align' not in os.listdir(config['FAST_ALIGN']): + # print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \ + # "provide a valid directory or \nto install, follow", fast_align_url, '\n') + # misconfigured = True + + is_lex_tools_present = False + for lex_tools in lex_tools_paths: + if os.path.isdir(lex_tools): + scripts = ['extract-sentences.py', 'extract-freq-lexicon.py', + 'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py', + 'ngrams-to-rules-me.py', 'common.py'] + + for script in scripts: + if not os.path.isfile(os.path.join(lex_tools, script)): + print( + f"'{script}' is not present in '{lex_tools}', re-install apertium-lex-tools {apertium_url}\n") + misconfigured = True + is_lex_tools_present = True + + if not is_lex_tools_present: + print( + f"'apertium_lex_tools' is not installed, to install apertium-lex-tools follow {apertium_url}\n") + misconfigured = True + + else: + if os.path.isdir(config['LANG_DATA']): + modules = [] + modules.append( + f"apertium-{config['SL']}-{config['TL']}.{config['SL']}-{config['TL']}.t1x") + modules.append(f"{config['SL']}-{config['TL']}.t1x.bin") + modules.append( + f"apertium-{config['SL']}-{config['TL']}.{config['SL']}-{config['TL']}.t2x") + modules.append(f"{config['SL']}-{config['TL']}.t2x.bin") + modules.append( + f"apertium-{config['SL']}-{config['TL']}.{config['SL']}-{config['TL']}.t3x") + modules.append(f"{config['SL']}-{config['TL']}.t3x.bin") + modules.append(f"{config['SL']}-{config['TL']}.autogen.bin") + modules.append(f"{config['SL']}-{config['TL']}.autopgen.bin") + for module in modules: + if module not in os.listdir(config['LANG_DATA']): + print(f"'{module}' is not in '{config['LANG_DATA']}'(LANG_DATA), \ + provide a valid directory or \nto install, follow {langs_url}\n") + misconfigured = True + + multitrans_present = False + 
for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'multitrans')): + multitrans_present = True + break + + if not multitrans_present: + print( + f"multitrans is not installed, re-install apertium-lex-tools {apertium_url}\n") + misconfigured = True + + ranker_present = False + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'irstlm-ranker')): + ranker_present = True + break + + if not ranker_present: + print( + f"irstlm-ranker is not installed, re-install apertium-lex-tools with irstlm {apertium_url}\n") + misconfigured = True + + # if not 'IRSTLM' in os.environ: + # print( + # f"IRSTLM is either not installed or not defined as an environment variable, see {irstlm_url}\n") + # misconfigured = True + + irstlm_present = False + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'build-lm.sh')): + irstlm_present = True + break + + if not irstlm_present: + print( + f"'build-lm.sh' is not installed or added to path, see {irstlm_url}\n") + misconfigured = True + + is_lex_tools_present = False + for lex_tools in lex_tools_paths: + if os.path.isdir(lex_tools): + scripts = ['biltrans-extract-frac-freq.py', 'extract-alig-lrx.py', + 'biltrans-count-patterns-ngrams.py', 'ngram-pruning-frac.py', 'ngrams-to-rules.py', + 'biltrans_count_common.py', 'common.py'] + + for script in scripts: + if not os.path.isfile(os.path.join(lex_tools, script)): + print( + f"'{script}' is not present in '{lex_tools}', re-install apertium-lex-tools {apertium_url}\n") + misconfigured = True + is_lex_tools_present = True + + if not is_lex_tools_present: + print( + f"'apertium_lex_tools' is not installed, to install apertium-lex-tools follow {apertium_url}\n") + misconfigured = True if misconfigured: exit(1) + else: + print("prerequisites are properly installed") return config if __name__ == '__main__': - if(len(sys.argv)==2): - check_config(sys.argv[1]) + config_file = 'config.toml' + 
if(len(sys.argv) == 2): + config_file = sys.argv[1] + check_config(config_file) diff --git a/config.toml.example b/config.toml.example index ed7ea17..a471413 100644 --- a/config.toml.example +++ b/config.toml.example @@ -3,10 +3,10 @@ # corpus name CORPUS = "europarl-v7" -# source language +# source language[it should match with the language codes of apertium] SL = "eng" -# target language +# target language[it should match with the language codes of apertium] TL = "spa" # source corpus @@ -18,11 +18,14 @@ CORPUS_TL = "europarl-v7.eng-spa.spa" # apertium-lex-tools scripts # LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" -# fast align build folder[not required for non-parallel training] -FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" - # apertium language data LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa" -# number of lines to be trained on +# number of lines to be trained on (do not enclose in quotes) TRAINING_LINES = 100000 + +# parallel(true) or non-parallel corpora(false) +IS_PARALLEL = true + +# fast align build folder[not required for non-parallel training] +FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" diff --git a/lexical_selection_training.py b/lexical_selection_training.py index 50df75f..c6465a9 100644 --- a/lexical_selection_training.py +++ b/lexical_selection_training.py @@ -64,7 +64,7 @@ def pipe(cmds, firstin, lastout, stderr): return procs[-1] -def training(config, cache_dir, log): +def parallel_training(config, cache_dir, log): MIN = 1 @@ -299,13 +299,21 @@ def training(config, cache_dir, log): # ngrams_to_rules(ngrams_all) +def non_parallel_training(config, cache_dir, log): + pass + + def main(config_file): print("validating configuration....") config = check_config(config_file) - # adding lex scripts to path - 
lex_tools = '/home/vivek/Documents/FOSS/apertium/lex-tools/scripts' - sys.path.insert(1, lex_tools) + # appending lex scripts' paths to environment path + sys.path.insert(0, '/usr/share/apertium-lex-tools') + sys.path.insert(0, '/opt/local/share/apertium-lex-tools') + sys.path.insert(0, '/usr/local/share/apertium-lex-tools') + + # remove after testing + sys.path.insert(0, '/home/vivek/Documents/FOSS/apertium/lex-tools/scripts') # cleaning the parallel corpus i.e. removing empty sentences, sentences only with '*', '.', or '°' print("cleaning corpus....") @@ -313,6 +321,8 @@ def main(config_file): cache_dir = f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}" + if not config['IS_PARALLEL']: + cache_dir = cache_dir + '-np' # the directory where all the intermediary outputs are stored if os.path.isdir(cache_dir): if not query(f"Do you want to overwrite the files in '{cache_dir}'"): @@ -325,7 +335,10 @@ def main(config_file): log = os.path.join(cache_dir, "training.log") with open(log, 'a') as log_file: - training(config, cache_dir, log_file) + if config['IS_PARALLEL']: + parallel_training(config, cache_dir, log_file) + else: + non_parallel_training(config, cache_dir, log_file) print("training complete!!") diff --git a/tests/training/config-np.toml b/tests/training/config-np.toml new file mode 100644 index 0000000..59b6af6 --- /dev/null +++ b/tests/training/config-np.toml @@ -0,0 +1,25 @@ +# configuration for lexical training + +# corpus name +CORPUS = "europarl-v7" + +# source language[it should match with the language codes of apertium] +SL = "eng" + +# target language[it should match with the language codes of apertium] +TL = "spa" + +# source corpus +CORPUS_SL = "tests/training/test.eng" + +# target corpus +CORPUS_TL = "tests/training/test.spa" + +# apertium language data +LANG_DATA = "apertium-eng-spa" + +# number of lines to be trained on (do not enclose in quotes) +TRAINING_LINES = 100 + +# parallel(true) or non-parallel corpora(false) +IS_PARALLEL = false 
diff --git a/tests/training/config.toml b/tests/training/config.toml index 30076f2..1ac5762 100644 --- a/tests/training/config.toml +++ b/tests/training/config.toml @@ -3,23 +3,29 @@ # corpus name CORPUS = "europarl-v7" -# source language -SL = "spa" +# source language[it should match with the language codes of apertium] +SL = "eng" -# target language -TL = "eng" +# target language[it should match with the language codes of apertium] +TL = "spa" # source corpus -CORPUS_SL = "tests/training/test.spa" +CORPUS_SL = "tests/training/test.eng" # target corpus -CORPUS_TL = "tests/training/test.eng" +CORPUS_TL = "tests/training/test.spa" -# fast align -FAST_ALIGN = "fast_align/build/fast_align" +# apertium-lex-tools scripts +# LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" # apertium language data LANG_DATA = "apertium-eng-spa" # number of lines to be trained on (do not enclose in quotes) TRAINING_LINES = 100 + +# parallel(true) or non-parallel corpora(false) +IS_PARALLEL = true + +# path to the fast_align executable [not required for non-parallel training] +FAST_ALIGN = "fast_align/build/fast_align"