commit 98b0fc16718e7740dc0399496c36cad681aa60dd
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Fri Aug 6 21:38:19 2021 +0530

    adding MAX_RULES and CRISPHOLD to config

diff --git a/check_config.py b/check_config.py
index 968f839..7e46615 100644
--- a/check_config.py
+++ b/check_config.py
@@ -70,6 +70,16 @@ def check_config(config_filename):
             f"'{config['TRAINING_LINES']}'(TRAINING_LINES) is not an integer. pass an integer \n")
         misconfigured = True
 
+    if not isinstance(config['MAX_RULES'], int):
+        print(
+            f"'{config['MAX_RULES']}'(MAX_RULES) is not an integer. pass an integer \n")
+        misconfigured = True
+    
+    if not (isinstance(config['CRISPHOLD'], int) or isinstance(config['CRISPHOLD'], float)):
+        print(
+            f"'{config['CRISPHOLD']}'(CRISPHOLD) is not an integer. pass an integer \n")
+        misconfigured = True
+
     if not isinstance(config['IS_PARALLEL'], bool):
         print(
             f"'{config['IS_PARALLEL']}'(IS_PARALLEL) is not an boolean. pass true or false \n")
diff --git a/config.toml.example b/config.toml.example
index a471413..a4bf92b 100644
--- a/config.toml.example
+++ b/config.toml.example
@@ -1,5 +1,8 @@
 # configuration for lexical training
 
+# parallel(true) or non-parallel corpora(false)
+IS_PARALLEL = true
+
 # corpus name
 CORPUS = "europarl-v7"
 
@@ -16,16 +19,19 @@ CORPUS_SL = "europarl-v7.eng-spa.eng"
 CORPUS_TL = "europarl-v7.eng-spa.spa"
 
 # apertium-lex-tools scripts
-# LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
+# LEX_TOOLS = "../apertium-lex-tools/scripts"
 
 # apertium language data
-LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa"
+LANG_DATA = "../apertium-eng-spa"
 
 # number of lines to be trained on (do not enclose in quotes)
-TRAINING_LINES = 100000
-
-# parallel(true) or non-parallel corpora(false)
-IS_PARALLEL = true
+TRAINING_LINES = 10
 
 # fast align build folder[not required for non-parallel training]
-FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build"
+FAST_ALIGN = "../fast_align/build/fast_align"
+
+# maximum number of rules (integer, do not enclose in quotes); passed to ngram-count-patterns
+MAX_RULES = 3
+
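+# crisphold value (integer or float, do not enclose in quotes); passed to ngram-count-patterns and ngrams-to-rules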
+CRISPHOLD = 1.5
diff --git a/lexical_selection_training.py b/lexical_selection_training.py
index c6465a9..76c5d7f 100644
--- a/lexical_selection_training.py
+++ b/lexical_selection_training.py
@@ -225,18 +225,17 @@ def parallel_training(config, cache_dir, log):
     with open(freq_lex, 'w') as f, redirect_stdout(f), redirect_stderr(log):
         extract_freq_lexicon(candidates)
 
-    crisphold = 1.5
     # count patterns
     mod = import_module('ngram-count-patterns')
     ngram_count_patterns = getattr(mod, 'ngram_count_patterns')
     with open(ngrams, 'w') as f, redirect_stdout(f), redirect_stderr(log):
-        ngram_count_patterns(freq_lex, candidates, crisphold)
+        ngram_count_patterns(freq_lex, candidates, config['CRISPHOLD'], config['MAX_RULES'])
 
     # ngrams to rules
     mod = import_module('ngrams-to-rules')
     ngrams_to_rules = getattr(mod, 'ngrams_to_rules')
     with open(rules, 'w') as f, redirect_stdout(f), redirect_stderr(log):
-        ngrams_to_rules(ngrams, crisphold)
+        ngrams_to_rules(ngrams, config['CRISPHOLD'])
 
     # # count patterns
     # mod = import_module('ngram-count-patterns-maxent2')
diff --git a/tests/training/config-np.toml b/tests/training/config-np.toml
index 59b6af6..56b9cd3 100644
--- a/tests/training/config-np.toml
+++ b/tests/training/config-np.toml
@@ -1,5 +1,8 @@
 # configuration for lexical training
 
+# parallel(true) or non-parallel corpora(false)
+IS_PARALLEL = true
+
 # corpus name
 CORPUS = "europarl-v7"
 
@@ -15,11 +18,17 @@ CORPUS_SL = "tests/training/test.eng"
 # target corpus
 CORPUS_TL = "tests/training/test.spa"
 
+# apertium-lex-tools scripts
+# LEX_TOOLS = "apertium-lex-tools/scripts"
+
 # apertium language data
 LANG_DATA = "apertium-eng-spa"
 
 # number of lines to be trained on (do not enclose in quotes)
 TRAINING_LINES = 100
 
-# parallel(true) or non-parallel corpora(false)
-IS_PARALLEL = false
+# maximum number of rules (integer, do not enclose in quotes); passed to ngram-count-patterns
+MAX_RULES = 3
+
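+# crisphold value (integer or float, do not enclose in quotes); passed to ngram-count-patterns and ngrams-to-rules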
+CRISPHOLD = 1.5
diff --git a/tests/training/config.toml b/tests/training/config.toml
index 1ac5762..d7e62d3 100644
--- a/tests/training/config.toml
+++ b/tests/training/config.toml
@@ -1,5 +1,8 @@
 # configuration for lexical training
 
+# parallel(true) or non-parallel corpora(false)
+IS_PARALLEL = true
+
 # corpus name
 CORPUS = "europarl-v7"
 
@@ -16,7 +19,7 @@ CORPUS_SL = "tests/training/test.eng"
 CORPUS_TL = "tests/training/test.spa"
 
 # apertium-lex-tools scripts
-# LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
+# LEX_TOOLS = "../apertium-lex-tools/scripts"
 
 # apertium language data
 LANG_DATA = "apertium-eng-spa"
@@ -24,8 +27,11 @@ LANG_DATA = "apertium-eng-spa"
 # number of lines to be trained on (do not enclose in quotes)
 TRAINING_LINES = 100
 
-# parallel(true) or non-parallel corpora(false)
-IS_PARALLEL = true
-
 # fast align build folder[not required for non-parallel training]
 FAST_ALIGN = "fast_align/build/fast_align"
+
+# maximum number of rules (integer, do not enclose in quotes); passed to ngram-count-patterns
+MAX_RULES = 3
+
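+# crisphold value (integer or float, do not enclose in quotes); passed to ngram-count-patterns and ngrams-to-rules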
+CRISPHOLD = 1.5