commit 50a95aff515b9d6703e5795eca5d5d595523b421
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Thu Jul 15 09:27:22 2021 +0530

    Incorporate changes from apertium-lex-tools (60e6ae9920ddc1ba24c96e5d4fe6a66ee139321a)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8a3b797
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,15 @@
+# cache
+cache*
+__pycache__/
+
+# rules
+*.xml
+
+# logs
+*.log
+
+# configs
+/*.toml
+
+# corpora
+europarl*
\ No newline at end of file
diff --git a/README.md b/README.md
index d4efc15..1a039c7 100644
--- a/README.md
+++ b/README.md
@@ -7,9 +7,8 @@ for more, read https://wiki.apertium.org/wiki/Ideas_for_Google_Summer_of_Code/Us
 ## Requirements
 
 - [parallel corpus](https://wiki.apertium.org/wiki/Corpora)
-- [apertium](https://wiki.apertium.org/wiki/Installation)
+- [apertium-core](https://wiki.apertium.org/wiki/Installation) (install apertium-lex-tools with yasmet)
 - [fast_align](https://github.com/clab/fast_align)
-- [apertium-lex-tools](https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling) (if not installed already)
 - [language pair](https://wiki.apertium.org/wiki/List_of_language_pairs) (install locally)
 
 ## Installation steps
diff --git a/check_config.py b/check_config.py
index 1237eca..94a5c1a 100644
--- a/check_config.py
+++ b/check_config.py
@@ -5,7 +5,7 @@ import os
 
 # urls of the required tools and data
 corpora_url = "https://wiki.apertium.org/wiki/Corpora"
-lex_tools_url = "https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling"
+# lex_tools_url = "https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling"
 fast_align_url = "https://github.com/clab/fast_align"
 langs_url = "https://wiki.apertium.org/wiki/List_of_language_pairs"
 apertium_url = "https://wiki.apertium.org/wiki/Installation"
@@ -14,6 +14,7 @@ yasmet_url = "https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules"
 
 def check_config(filename='config.toml'):
     misconfigured = False
+    lex_tools = '/usr/share/apertium-lex-tools'
     with open(filename) as config_file:
         config_toml = config_file.read()
         config = parse(config_toml)
@@ -22,7 +23,7 @@ def check_config(filename='config.toml'):
     assert config_toml == dumps(config)
 
     # changing the paths to absolute
-    for key in ['CORPUS_SL', 'CORPUS_TL', 'LEX_TOOLS', 'FAST_ALIGN', 'LANG_DATA']:
+    for key in ['CORPUS_SL', 'CORPUS_TL', 'FAST_ALIGN', 'LANG_DATA']:
         if not os.path.isabs(config[key]):
             config[key] = os.path.join(os.path.abspath('.'), config[key])
 
@@ -36,22 +37,26 @@ def check_config(filename='config.toml'):
             f"'{config['CORPUS_TL']}'(CORPUS_TL) is not a file, provide a valid file or \nto download, look {corpora_url}\n")
         misconfigured = True
 
-    if not os.path.isdir(config['LEX_TOOLS']):
+    if not os.path.isdir(lex_tools):
         print(
-            f"'{config['LEX_TOOLS']}'(LEX_TOOLS) is not a directory, provide a valid directory or \nto install, follow {lex_tools_url}\n")
+            f"'{lex_tools}'is not a directory, install apertium-lex-tools {apertium_url}\n")
         misconfigured = True
     else:
-        # scripts = ['process-tagger-output', 'extract-sentences.py', 'extract-freq-lexicon.py', \
-        #                 'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py', \
-        #                     'ngrams-to-rules-me.py']
+        scripts = ['extract-sentences.py', 'extract-freq-lexicon.py',
+                   'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py',
+                   'ngrams-to-rules-me.py']
 
-        # for script in scripts:
+        for script in scripts:
+            if not os.path.isfile(os.path.join(lex_tools, script)):
+                print(
+                    f"'{script}' is present in '{lex_tools}', install apertium-lex-tools {apertium_url}\n")
+                misconfigured = True
 
         # assuming scripts are intact
-        if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']):
-            print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),",
-                  "provide a valid directory or \nto install, follow", lex_tools_url, '\n')
-            misconfigured = True
+        # if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']):
+        #     print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),",
+        #           "provide a valid directory or \nto install, follow", lex_tools_url, '\n')
+        #     misconfigured = True
 
     if not os.path.isfile(config['FAST_ALIGN']):
         print(
@@ -98,7 +103,19 @@ def check_config(filename='config.toml'):
 
     if not yasmet_present:
         print(
-            f"yasmet is either not installed or not added to path, see {yasmet_url}\n")
+            f"yasmet is either not installed or not added to path, install yasmet and add to the path, \
+                {yasmet_url} or re-install apertium-lex-tools with yasmet, {apertium_url}\n")
+        misconfigured = True
+
+    process_tagger_output_present = False
+    for path in os.environ["PATH"].split(os.pathsep):
+        if os.path.isfile(os.path.join(path, 'process-tagger-output')):
+            process_tagger_output_present = True
+            break
+
+    if not process_tagger_output_present:
+        print(
+            f"process-tagger-output is either not installed or not added to path, re-install apertium-lex-tools {apertium_url}\n")
         misconfigured = True
 
     if not isinstance(config['TRAINING_LINES'], int):
diff --git a/config.toml.example b/config.toml.example
index b9e537c..9c9d15c 100644
--- a/config.toml.example
+++ b/config.toml.example
@@ -16,7 +16,7 @@ CORPUS_SL = "europarl-v7.eng-spa.eng"
 CORPUS_TL = "europarl-v7.eng-spa.spa"
 
 # apertium-lex-tools scripts
-LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
+# LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
 
 # fast align build folder
 FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build"
diff --git a/lexical_training.py b/lexical_training.py
index 9bda975..47f263a 100644
--- a/lexical_training.py
+++ b/lexical_training.py
@@ -64,21 +64,20 @@ def pipe(cmds, firstin, lastout, stderr):
     return procs[-1]
 
 
-def training(config, log):
+def training(config, cache_dir, log):
 
     MIN = 1
 
-    # file/folder names
-    cache_dir = f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}"
+    # file names
     sl_tagged = os.path.join(
         cache_dir, f"{config['CORPUS']}.tagged.{config['SL']}")
     tl_tagged = os.path.join(
         cache_dir, f"{config['CORPUS']}.tagged.{config['TL']}")
-    lines = os.path.join(cache_dir, config['CORPUS']+'.lines')
+    lines = os.path.join(cache_dir, f"{config['CORPUS']}.lines")
     tagged_merged = os.path.join(
         cache_dir, f"{config['CORPUS']}.tagged-merged.{config['SL']}-{config['TL']}")
-    alignment = os.path.join(cache_dir, config['CORPUS'] +
-                             '.align.'+config['SL']+'-'+config['TL'])
+    alignment = os.path.join(
+        cache_dir, f"{config['CORPUS']}.align.{config['SL']}-{config['TL']}")
     clean_biltrans = os.path.join(
         cache_dir, f"{config['CORPUS']}.clean_biltrans.{config['SL']}-{config['TL']}")
     phrasetable = os.path.join(
@@ -101,15 +100,6 @@ def training(config, log):
         cache_dir, 'ngrams_all.txt')
     rules = f"{config['CORPUS']}-{config['SL']}-{config['TL']}.ngrams-lm-{MIN}.xml"
 
-    # the directory where all the intermediary outputs are stored
-    if os.path.isdir(cache_dir):
-        if not query(f"Do you want to overwrite the files in '{cache_dir}'"):
-            print(f"(re)move {cache_dir} and re-run lexical_training.py")
-            exit(1)
-        shutil.rmtree(cache_dir)
-
-    os.mkdir(cache_dir)
-
     if os.path.isfile(rules):
         if not query(f"Do you want to overwrite '{rules}'"):
             print(f"(re)move {rules} and re-run lexical_training.py")
@@ -198,18 +188,23 @@ def training(config, log):
 
     # phrasetable
     with open(tmp1, 'w') as f1, open(tmp2, 'w') as f2:
-        sl_tl_autobil = f"{config['SL']}-{config['TL']}.autobil.bin"
-        tl_sl_autobil = f"{config['TL']}-{config['SL']}.autobil.bin"
+        sl_tl_autobil = os.path.join(
+            config['LANG_DATA'], f"{config['SL']}-{config['TL']}.autobil.bin")
+        tl_sl_autobil = os.path.join(
+            config['LANG_DATA'], f"{config['TL']}-{config['SL']}.autobil.bin")
         with open(tl_tagged, 'r') as f:
-            call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
-                  os.path.join(config['LANG_DATA'], tl_sl_autobil)], stdin=f, stdout=f1, stderr=log)
+            # call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
+            call(['process-tagger-output', tl_sl_autobil],
+                 stdin=f, stdout=f1, stderr=log)
         with open(sl_tagged, 'r') as f:
-            call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
-                  os.path.join(config['LANG_DATA'], sl_tl_autobil)], stdin=f, stdout=f2, stderr=log)
+            # call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
+            call(['process-tagger-output', sl_tl_autobil],
+                 stdin=f, stdout=f2, stderr=log)
             f.seek(0)
             with open(clean_biltrans, 'w') as f0:
-                call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
-                      os.path.join(config['LANG_DATA'], sl_tl_autobil)], stdin=f, stdout=f0, stderr=log)
+                # call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
+                call(['process-tagger-output', sl_tl_autobil],
+                     stdin=f, stdout=f0, stderr=log)
 
     cmds = [['paste', tmp1, tmp2, alignment], ['sed', 's/\t/ ||| /g']]
     with open(phrasetable, 'w') as f:
@@ -296,17 +291,29 @@ def main():
     config = check_config()
 
     # adding lex scripts to path
-    sys.path.insert(1, config['LEX_TOOLS'])
+    lex_tools = '/usr/share/apertium-lex-tools'
+    sys.path.insert(1, lex_tools)
 
     # cleaning the parallel corpus i.e. removing empty sentences, sentences only with '*', '.', or '°'
     print("cleaning corpus....")
     # clean_corpus(config['CORPUS_SL'], config['CORPUS_TL'])
 
-    log = os.path.join(
-        f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}", 'training.log')
+    cache_dir = f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}"
+
+    # the directory where all the intermediary outputs are stored
+    if os.path.isdir(cache_dir):
+        if not query(f"Do you want to overwrite the files in '{cache_dir}'"):
+            print(f"(re)move {cache_dir} and re-run lexical_training.py")
+            exit(0)
+        shutil.rmtree(cache_dir)
+
+    os.mkdir(cache_dir)
+
+    log = os.path.join(cache_dir, "training.log")
 
     with open(log, 'a') as log_file:
-        training(config, log_file)
+        training(config, cache_dir, log_file)
+    print("training complete!!")
 
 
 if __name__ == '__main__':
diff --git a/tests/check_config_test.py b/tests/check_config_test.py
index 116cd64..7dee417 100644
--- a/tests/check_config_test.py
+++ b/tests/check_config_test.py
@@ -1,14 +1,14 @@
 # tests check_config.py
 from check_config import check_config
-import sys
 from tomlkit import parse, dumps
 import os
 import shutil
-
+import sys
 sys.path.append('../')
 
 
 def main(argc, argv):
+    tamper_string = 'abc'
 
     # Test 1
     config_file = open('config_test.toml', 'r')
@@ -16,7 +16,7 @@ def main(argc, argv):
     config = parse(config_toml)
     config_file.close()
 
-    print("Test 1 : wrong paths")
+    print("Test 1 : No installations")
     print("---------------------")
 
     for key in config:
@@ -24,43 +24,69 @@ def main(argc, argv):
             continue
         config[key] += "abc"
 
-    if os.fork() == 0:
-        with open('check_config_test.toml', 'w') as test_file:
-            test_file.write(dumps(config))
-        check_config('check_config_test.toml')
-        exit(0)
+    # if os.fork() == 0:
+    #     with open('check_config_test.toml', 'w') as test_file:
+    #         test_file.write(dumps(config))
+    #     check_config('check_config_test.toml')
+    #     exit(0)
 
-    _, _ = os.wait()
+    # _, _ = os.wait()
 
-    # Test 2
-    config_file = open('config_test.toml', 'r')
-    config_toml = config_file.read()
-    config = parse(config_toml)
-    config_file.close()
+    # # Test 2
+    # config_file = open('config_test.toml', 'r')
+    # config_toml = config_file.read()
+    # config = parse(config_toml)
+    # config_file.close()
+
+    # print("Test 2 : partial/no installations")
+    # print("----------------------------------")
 
-    print("Test 2 : partial/no installations")
-    print("----------------------------------")
+    # config['SL'] += "abc"
 
-    config['SL'] += "abc"
+    lex_tools = '/usr/share/apertium-lex-tools'
+    if os.path.isdir(lex_tools):
+        scripts = ['extract-sentences.py', 'extract-freq-lexicon.py',
+                   'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py',
+                   'ngrams-to-rules-me.py']
+
+        for script in scripts:
+            if os.path.isfile(os.path.join(lex_tools, script)):
+                shutil.move(os.path.join(lex_tools, script),
+                            os.path.join(lex_tools, script+tamper_string))
 
     for path in os.environ["PATH"].split(os.pathsep):
         if os.path.isfile(os.path.join(path, 'apertium')):
             shutil.move(os.path.join(path, 'apertium'),
-                        os.path.join(path, 'apertium'+'abc'))
+                        os.path.join(path, 'apertium'+tamper_string))
             break
 
     for path in os.environ["PATH"].split(os.pathsep):
         if os.path.isfile(os.path.join(path, 'yasmet')):
             shutil.move(os.path.join(path, 'yasmet'),
-                        os.path.join(path, 'yasmet'+'abc'))
+                        os.path.join(path, 'yasmet'+tamper_string))
             break
 
-    if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')):
-        shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
-                    os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'))
+    for path in os.environ["PATH"].split(os.pathsep):
+        if os.path.isfile(os.path.join(path, 'process-tagger-output')):
+            shutil.move(os.path.join(path, 'process-tagger-output'),
+                        os.path.join(path, 'process-tagger-output'+tamper_string))
+            break
+
+    if os.path.isdir(lex_tools):
+        scripts = ['extract-sentences.py', 'extract-freq-lexicon.py',
+                   'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py',
+                   'ngrams-to-rules-me.py']
+
+        for script in scripts:
+            if os.path.isfile(os.path.join(lex_tools, script+tamper_string)):
+                shutil.move(os.path.join(lex_tools, script+tamper_string),
+                            os.path.join(lex_tools, script))
+    # if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')):
+    #     shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
+    #                 os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+tamper_string))
 
     # if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')):
-    #     shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'))
+    #     shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+tamper_string))
 
     if os.fork() == 0:
         with open('check_config_test.toml', 'w') as test_file:
@@ -70,33 +96,39 @@ def main(argc, argv):
 
     _, _ = os.wait()
 
-    shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'),
-                os.path.join(config['LEX_TOOLS'], 'process-tagger-output'))
+    # shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+tamper_string),
+    #             os.path.join(config['LEX_TOOLS'], 'process-tagger-output'))
 
-    # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align'))
+    # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+tamper_string), os.path.join(config['FAST_ALIGN'], 'fast_align'))
 
     for path in os.environ["PATH"].split(os.pathsep):
-        if os.path.isfile(os.path.join(path, 'apertium'+'abc')):
-            shutil.move(os.path.join(path, 'apertium'+'abc'),
+        if os.path.isfile(os.path.join(path, 'apertium'+tamper_string)):
+            shutil.move(os.path.join(path, 'apertium'+tamper_string),
                         os.path.join(path, 'apertium'))
             break
 
     for path in os.environ["PATH"].split(os.pathsep):
-        if os.path.isfile(os.path.join(path, 'yasmet'+'abc')):
-            shutil.move(os.path.join(path, 'yasmet'+'abc'),
+        if os.path.isfile(os.path.join(path, 'yasmet'+tamper_string)):
+            shutil.move(os.path.join(path, 'yasmet'+tamper_string),
                         os.path.join(path, 'yasmet'))
             break
 
+    for path in os.environ["PATH"].split(os.pathsep):
+        if os.path.isfile(os.path.join(path, 'process-tagger-output'+tamper_string)):
+            shutil.move(os.path.join(path, 'process-tagger-output'+tamper_string),
+                        os.path.join(path, 'process-tagger-output'))
+            break
+
     # Test 3
     config_file = open('config_test.toml', 'r')
     config_toml = config_file.read()
     config = parse(config_toml)
     config_file.close()
 
-    print("Test 3 : wrong TRAINING_LINES")
+    print("Test 2 : wrong TRAINING_LINES")
     print("---------------------")
 
-    for value in ['abc', 1.00, 1e237892]:
+    for value in [tamper_string, 1.00, 1e237892]:
         config['TRAINING_LINES'] = value
         if os.fork() == 0:
             with open('check_config_test.toml', 'w') as test_file:
@@ -112,7 +144,7 @@ def main(argc, argv):
     config = parse(config_toml)
     config_file.close()
 
-    print("Test 4 : correct installations")
+    print("Test 3 : correct installations")
     print("-------------------------------")
 
     if os.fork() == 0: