commit 98b0fc16718e7740dc0399496c36cad681aa60dd Author: vivekvardhanadepu Date: Fri Aug 6 21:38:19 2021 +0530 adding MAX_RULES and CRISPHOLD to config diff --git a/check_config.py b/check_config.py index 968f839..7e46615 100644 --- a/check_config.py +++ b/check_config.py @@ -70,6 +70,16 @@ def check_config(config_filename): f"'{config['TRAINING_LINES']}'(TRAINING_LINES) is not an integer. pass an integer \n") misconfigured = True + if not isinstance(config['MAX_RULES'], int): + print( + f"'{config['MAX_RULES']}'(MAX_RULES) is not an integer. pass an integer \n") + misconfigured = True + + if not (isinstance(config['CRISPHOLD'], int) or isinstance(config['CRISPHOLD'], float)): + print( + f"'{config['CRISPHOLD']}'(CRISPHOLD) is not an integer. pass an integer \n") + misconfigured = True + if not isinstance(config['IS_PARALLEL'], bool): print( f"'{config['IS_PARALLEL']}'(IS_PARALLEL) is not an boolean. pass true or false \n") diff --git a/config.toml.example b/config.toml.example index a471413..a4bf92b 100644 --- a/config.toml.example +++ b/config.toml.example @@ -1,5 +1,8 @@ # configuration for lexical training +# parallel(true) or non-parallel corpora(false) +IS_PARALLEL = true + # corpus name CORPUS = "europarl-v7" @@ -16,16 +19,19 @@ CORPUS_SL = "europarl-v7.eng-spa.eng" CORPUS_TL = "europarl-v7.eng-spa.spa" # apertium-lex-tools scripts -# LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" +# LEX_TOOLS = "../apertium-lex-tools/scripts" # apertium language data -LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa" +LANG_DATA = "../apertium-eng-spa" # number of lines to be trained on (do not enclose in quotes) -TRAINING_LINES = 100000 - -# parallel(true) or non-parallel corpora(false) -IS_PARALLEL = true +TRAINING_LINES = 10 # fast align build folder[not required for non-parallel training] -FAST_ALIGN = 
"/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" +FAST_ALIGN = "../fast_align/build/fast_align" + +# max number of rules +MAX_RULES = 3 + +# crisphold +CRISPHOLD = 1.5 diff --git a/lexical_selection_training.py b/lexical_selection_training.py index c6465a9..76c5d7f 100644 --- a/lexical_selection_training.py +++ b/lexical_selection_training.py @@ -225,18 +225,17 @@ def parallel_training(config, cache_dir, log): with open(freq_lex, 'w') as f, redirect_stdout(f), redirect_stderr(log): extract_freq_lexicon(candidates) - crisphold = 1.5 # count patterns mod = import_module('ngram-count-patterns') ngram_count_patterns = getattr(mod, 'ngram_count_patterns') with open(ngrams, 'w') as f, redirect_stdout(f), redirect_stderr(log): - ngram_count_patterns(freq_lex, candidates, crisphold) + ngram_count_patterns(freq_lex, candidates, config['CRISPHOLD'], config['MAX_RULES']) # ngrams to rules mod = import_module('ngrams-to-rules') ngrams_to_rules = getattr(mod, 'ngrams_to_rules') with open(rules, 'w') as f, redirect_stdout(f), redirect_stderr(log): - ngrams_to_rules(ngrams, crisphold) + ngrams_to_rules(ngrams, config['CRISPHOLD']) # # count patterns # mod = import_module('ngram-count-patterns-maxent2') diff --git a/tests/training/config-np.toml b/tests/training/config-np.toml index 59b6af6..56b9cd3 100644 --- a/tests/training/config-np.toml +++ b/tests/training/config-np.toml @@ -1,5 +1,8 @@ # configuration for lexical training +# parallel(true) or non-parallel corpora(false) +IS_PARALLEL = true + # corpus name CORPUS = "europarl-v7" @@ -15,11 +18,17 @@ CORPUS_SL = "tests/training/test.eng" # target corpus CORPUS_TL = "tests/training/test.spa" +# apertium-lex-tools scripts +# LEX_TOOLS = "apertium-lex-tools/scripts" + # apertium language data LANG_DATA = "apertium-eng-spa" # number of lines to be trained on (do not enclose in quotes) TRAINING_LINES = 100 -# parallel(true) or non-parallel corpora(false) -IS_PARALLEL = false +# 
max number of rules (integer, do not enclose in quotes) +MAX_RULES = 3 + +# crisphold (integer or float, do not enclose in quotes) +CRISPHOLD = 1.5 diff --git a/tests/training/config.toml b/tests/training/config.toml index 1ac5762..d7e62d3 100644 --- a/tests/training/config.toml +++ b/tests/training/config.toml @@ -1,5 +1,8 @@ # configuration for lexical training +# parallel(true) or non-parallel corpora(false) +IS_PARALLEL = true + # corpus name CORPUS = "europarl-v7" @@ -16,7 +19,7 @@ CORPUS_SL = "tests/training/test.eng" CORPUS_TL = "tests/training/test.spa" # apertium-lex-tools scripts -# LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" +# LEX_TOOLS = "../apertium-lex-tools/scripts" # apertium language data LANG_DATA = "apertium-eng-spa" @@ -24,8 +27,11 @@ LANG_DATA = "apertium-eng-spa" # number of lines to be trained on (do not enclose in quotes) TRAINING_LINES = 100 -# parallel(true) or non-parallel corpora(false) -IS_PARALLEL = true - # fast align build folder[not required for non-parallel training] FAST_ALIGN = "fast_align/build/fast_align" + +# max number of rules (integer, do not enclose in quotes) +MAX_RULES = 3 + +# crisphold (integer or float, do not enclose in quotes) +CRISPHOLD = 1.5