commit 176557583f2342b2da0bacabbedd70f1dfcc21d1 Author: vivekvardhanadepu Date: Fri Aug 6 22:41:17 2021 +0530 IRSTLM export fix; adding PAIR, TL_MODEL to config diff --git a/.github/workflows/training.yml b/.github/workflows/training.yml index 4477189..6cd30a0 100644 --- a/.github/workflows/training.yml +++ b/.github/workflows/training.yml @@ -121,4 +121,6 @@ jobs: sudo make install - name: Training - run: python3 lexical_selection_training.py tests/training/config-np.toml + run: | + export IRSTLM="/usr/local" + python3 lexical_selection_training.py tests/training/config-np.toml diff --git a/check_config.py b/check_config.py index 7e46615..c9ccb5e 100644 --- a/check_config.py +++ b/check_config.py @@ -34,11 +34,11 @@ def check_config(config_filename): print( f"'{config['CORPUS_SL']}'(CORPUS_SL) is not a file, provide a valid file or \nto download, look {corpora_url}\n") misconfigured = True - - if not os.path.isfile(config['CORPUS_TL']): - print( - f"'{config['CORPUS_TL']}'(CORPUS_TL) is not a file, provide a valid file or \nto download, look {corpora_url}\n") - misconfigured = True + if 'TL_MODEL' not in config: + if not os.path.isfile(config['CORPUS_TL']): + print( + f"'{config['CORPUS_TL']}'(CORPUS_TL) is not a file, provide a valid file or \nto download, look {corpora_url}\n") + misconfigured = True if not os.path.isdir(config['LANG_DATA']): print( @@ -145,13 +145,13 @@ def check_config(config_filename): if os.path.isdir(config['LANG_DATA']): modules = [] modules.append( - f"apertium-{config['SL']}-{config['TL']}.{config['SL']}-{config['TL']}.t1x") + f"apertium-{config['PAIR']}.{config['SL']}-{config['TL']}.t1x") modules.append(f"{config['SL']}-{config['TL']}.t1x.bin") modules.append( - f"apertium-{config['SL']}-{config['TL']}.{config['SL']}-{config['TL']}.t2x") + f"apertium-{config['PAIR']}.{config['SL']}-{config['TL']}.t2x") modules.append(f"{config['SL']}-{config['TL']}.t2x.bin") modules.append( - f"apertium-{config['SL']}-{config['TL']}.{config['SL']}-{config['TL']}.t3x") + f"apertium-{config['PAIR']}.{config['SL']}-{config['TL']}.t3x") modules.append(f"{config['SL']}-{config['TL']}.t3x.bin") modules.append(f"{config['SL']}-{config['TL']}.autogen.bin") modules.append(f"{config['SL']}-{config['TL']}.autopgen.bin") @@ -161,6 +161,16 @@ def check_config(config_filename): provide a valid directory or \nto install, follow {langs_url}\n") misconfigured = True + if not 'IRSTLM' in os.environ: + print( + f"IRSTLM is either not installed or not defined as an environment variable, see {irstlm_url}\n") + misconfigured = True + else: + if not os.path.isfile(os.path.join(os.environ['IRSTLM'], 'bin/build-lm.sh')): + print( + f"'build-lm.sh' is not present in $IRSTLM('{os.environ['IRSTLM']}'), see {irstlm_url}\n") + misconfigured = True + multitrans_present = False for path in os.environ["PATH"].split(os.pathsep): if os.path.isfile(os.path.join(path, 'multitrans')): @@ -183,11 +193,6 @@ def check_config(config_filename): f"irstlm-ranker is not installed, re-install apertium-lex-tools with irstlm {apertium_url}\n") misconfigured = True - # if not 'IRSTLM' in os.environ: - # print( - # f"IRSTLM is either not installed or not defined as an environment variable, see {irstlm_url}\n") - # misconfigured = True - is_lex_tools_present = False for lex_tools in lex_tools_paths: if os.path.isdir(lex_tools): @@ -207,16 +212,11 @@ def check_config(config_filename): f"apertium_lex_tools scripts are not installed, re-install apertium-lex-tools {apertium_url}\n") misconfigured = True - irstlm_present = False - for path in os.environ["PATH"].split(os.pathsep): - if os.path.isfile(os.path.join(path, 'build-lm.sh')): - irstlm_present = True - break - - if not irstlm_present: - print( - f"'build-lm.sh' is not installed or added to path, see {irstlm_url}\n") - misconfigured = True + if 'TL_MODEL' in config: + if not os.path.isfile(config['TL_MODEL']): + print( + f"'{config['TL_MODEL']}'(TL_MODEL) is not a file, provide a valid file or \nto build, see {irstlm_url}\n") + misconfigured = True if misconfigured: exit(1) diff --git a/config.toml.example b/config.toml.example index a4bf92b..c8ad006 100644 --- a/config.toml.example +++ b/config.toml.example @@ -1,22 +1,26 @@ # configuration for lexical training +# Note: one of CORPUS_TL or TL_MODEL(binary language model) is required for non-parallel corpora training. If both are specified, CORPUS_TL is ignored -# parallel(true) or non-parallel corpora(false) -IS_PARALLEL = true +# parallel(true) or non-parallel(false) +IS_PARALLEL = false # corpus name CORPUS = "europarl-v7" # source language[it should match with the language codes of apertium] -SL = "eng" +SL = "spa" # target language[it should match with the language codes of apertium] -TL = "spa" +TL = "eng" + +# language pair code(as per apertium language codes) +PAIR = "eng-spa" # source corpus -CORPUS_SL = "europarl-v7.eng-spa.eng" +CORPUS_SL = "europarl-v7.eng-spa.spa" # target corpus -CORPUS_TL = "europarl-v7.eng-spa.spa" +CORPUS_TL = "europarl-v7.eng-spa.eng" # apertium-lex-tools scripts # LEX_TOOLS = "../apertium-lex-tools/scripts" @@ -25,7 +29,7 @@ CORPUS_TL = "europarl-v7.eng-spa.spa" LANG_DATA = "../apertium-eng-spa" # number of lines to be trained on (do not enclose in quotes) -TRAINING_LINES = 10 +TRAINING_LINES = 100 # fast align build folder[not required for non-parallel training] FAST_ALIGN = "../fast_align/build/fast_align" @@ -35,3 +39,6 @@ MAX_RULES = 3 # crisphold CRISPHOLD = 1.5 + +# TL binary language model[not required for parallel training] +TL_MODEL = "europarl-v7.eng-spa.eng.lm" \ No newline at end of file diff --git a/tests/training/config-np.toml b/tests/training/config-np.toml index 56b9cd3..d8b67dc 100644 --- a/tests/training/config-np.toml +++ b/tests/training/config-np.toml @@ -1,7 +1,8 @@ # configuration for lexical training +# Note: one of CORPUS_TL or TL_MODEL(binary language model) is required for non-parallel corpora training. If both are specified, CORPUS_TL is ignored -# parallel(true) or non-parallel corpora(false) -IS_PARALLEL = true +# parallel(true) or non-parallel(false) +IS_PARALLEL = false # corpus name CORPUS = "europarl-v7" @@ -12,6 +13,9 @@ SL = "eng" # target language[it should match with the language codes of apertium] TL = "spa" +# language pair code(as per apertium language codes) +PAIR = "eng-spa" + # source corpus CORPUS_SL = "tests/training/test.eng" @@ -32,3 +36,6 @@ MAX_RULES = 3 # crisphold CRISPHOLD = 1.5 + +# TL binary language model[not required for parallel training] +# TL_MODEL = "europarl-v7.eng-spa.eng.lm" \ No newline at end of file diff --git a/tests/training/config.toml b/tests/training/config.toml index d7e62d3..da2b0ba 100644 --- a/tests/training/config.toml +++ b/tests/training/config.toml @@ -1,6 +1,6 @@ # configuration for lexical training -# parallel(true) or non-parallel corpora(false) +# parallel(true) or non-parallel(false) IS_PARALLEL = true # corpus name @@ -12,6 +12,9 @@ SL = "eng" # target language[it should match with the language codes of apertium] TL = "spa" +# language pair code(as per apertium language codes) +PAIR = "eng-spa" + # source corpus CORPUS_SL = "tests/training/test.eng"