commit 4fb5b89e497343037d659f39226a2243b4e92188
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Mon May 31 15:55:00 2021 +0530

    check_config:minor changes and optimizations

diff --git a/README.md b/README.md
index 02ac9f2..7115def 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,9 @@
-# apertium-lexical-training
\ No newline at end of file
+# apertium-lexical-training
+
+The procedure for lexical selection training is a bit messy: it involves various scripts that require a lot of manual tweaking, and many third-party tools have to be installed (e.g. IRSTLM, Moses, GIZA++). The goal of this project is to make the training procedure as streamlined and user-friendly as possible.
+
+For more details, see https://wiki.apertium.org/wiki/Ideas_for_Google_Summer_of_Code/User-friendly_lexical_selection_training
+
+## tests
+
+The `tests` folder contains scripts for automated testing of the helper scripts.
diff --git a/check_config.py b/check_config.py
index 599ebb8..5a2d5d4 100644
--- a/check_config.py
+++ b/check_config.py
@@ -11,7 +11,8 @@ langs_url = "https://wiki.apertium.org/wiki/List_of_language_pairs"
 apertium_url = "https://wiki.apertium.org/wiki/Installation"
 yasmet_url = "https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules"
 
-def parse_config(filename='config.toml'):
+def check_config(filename='config.toml'):
+    misconfigured = False
     with open(filename) as config_file:
         config_toml = config_file.read()
         config = parse(config_toml)
@@ -19,17 +20,25 @@ def parse_config(filename='config.toml'):
     # gives error if not parsed well
     assert config_toml == dumps(config)
 
+    # make relative paths absolute by resolving them against the current working directory
+    for key in ['CORPUS_SL', 'CORPUS_TL', 'LEX_TOOLS', 'FAST_ALIGN', 'LANG_DATA']:
+        if not os.path.isabs(config[key]):
+            config[key] = os.path.join(os.path.abspath('.'), config[key])
+
     if not os.path.isfile(config['CORPUS_SL']):
-        print(config['CORPUS_SL'], "is not a file, provide a valid file or \nto download, look", corpora_url)
-        exit(-1)
+        print("'"+config['CORPUS_SL']+"'(CORPUS_SL)","is not a file, provide a valid"+ \
+                    " file or \nto download, look", corpora_url, '\n')
+        misconfigured = True
 
     if not os.path.isfile(config['CORPUS_TL']):
-        print(config['CORPUS_TL'], "is not a file, provide a valid file or \nto download, look", corpora_url)
-        exit(-1)
+        print("'"+config['CORPUS_TL']+"'(CORPUS_TL)", "is not a file, provide a valid "+ \
+                    "file or \nto download, look", corpora_url, '\n')
+        misconfigured = True
 
     if not os.path.isdir(config['LEX_TOOLS']):
-        print(config['LEX_TOOLS'], "is not a directory, provide a valid directory or \nto install, follow", lex_tools_url)
-        exit(-1)
+        print("'"+config['LEX_TOOLS']+"'(LEX_TOOLS)", "is not a directory, provide a valid "+ \
+                    "directory or \nto install, follow", lex_tools_url, '\n')
+        misconfigured = True
     else:
         # scripts = ['process-tagger-output', 'extract-sentences.py', 'extract-freq-lexicon.py', \
         #                 'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py', \
@@ -39,31 +48,35 @@ def parse_config(filename='config.toml'):
 
         # assuming scripts are intact
         if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']):
-            print("process-tagger-output is not in", config['LEX_TOOLS'] + ",","provide a valid directory or \nto install, follow", lex_tools_url)
-            exit(-1)
+            print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),", \
+                        "provide a valid directory or \nto install, follow", lex_tools_url, '\n')
+            misconfigured = True
 
     if not os.path.isdir(config['FAST_ALIGN']):
-        print(config['FAST_ALIGN'], "is not a directory, provide a valid directory or \nto install, follow", fast_align_url)
-        exit(-1)
+        print("'"+config['FAST_ALIGN']+"'(FAST_ALIGN)", "is not a directory, provide"+ \
+                    " a valid directory or \nto install, follow", fast_align_url, '\n')
+        misconfigured = True
     else:
         if 'fast_align' not in os.listdir(config['FAST_ALIGN']):
-            print("fast_align is not present in", config['FAST_ALIGN']+ ",", "provide a valid directory or \nto install, follow", fast_align_url)
-            exit(-1)
+            print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \
+                            "provide a valid directory or \nto install, follow", fast_align_url, '\n')
+            misconfigured = True
     
     if not os.path.isdir(config['LANG_DATA']):
-        print(config['LANG_DATA'], "is not a directory, provide a valid directory or \nto install, follow", langs_url)
-        exit(-1)
+        print("'"+config['LANG_DATA']+"'(LANG_DATA)", "is not a directory, provide a valid "+ \
+                    "directory or \nto install, follow", langs_url, '\n')
+        misconfigured = True
     else:
         sl_tl_autobil = config['SL'] + '-' + config['TL'] + '.autobil.bin'
         tl_sl_autobil = config['TL'] + '-' + config['SL'] + '.autobil.bin'
-
         if sl_tl_autobil not in os.listdir(config['LANG_DATA']):
-            print(sl_tl_autobil, "is not in", config['LANG_DATA']+ ",", "provide a valid directory or \nto install, follow", langs_url)
-            exit(-1)
-
+            print("'"+sl_tl_autobil+"'", "is not in", "'"+config['LANG_DATA']+ "'(LANG_DATA),", \
+                        "provide a valid directory or \nto install, follow", langs_url, '\n')
+            misconfigured = True
         if tl_sl_autobil not in os.listdir(config['LANG_DATA']):
-            print(tl_sl_autobil, "is not in", config['LANG_DATA']+ ",", "provide a valid directory or \nto install, follow", langs_url)
-            exit(-1)
+            print("'"+tl_sl_autobil+"'", "is not in", "'"+config['LANG_DATA']+ "'(LANG_DATA),", \
+                        "provide a valid directory or \nto install, follow", langs_url, '\n')
+            misconfigured = True
 
     apertium_present = False
     for path in os.environ["PATH"].split(os.pathsep):
@@ -72,8 +85,8 @@ def parse_config(filename='config.toml'):
             break
 
     if not apertium_present:
-        print("apertium is either not installed or not added to path, see", apertium_url)
-        exit(-1)
+        print("apertium is either not installed or not added to path, see", apertium_url, '\n')
+        misconfigured = True
 
     yasmet_present = False
     for path in os.environ["PATH"].split(os.pathsep):
@@ -82,9 +95,13 @@ def parse_config(filename='config.toml'):
             break
         
     if not yasmet_present:
-        print("yasmet is either not installed or not added to path, see", yasmet_url)
-        exit(-1)
+        print("yasmet is either not installed or not added to path, see", yasmet_url, '\n')
+        misconfigured = True
+
+    if misconfigured:
+        exit(1)
+
     return config
 
 if __name__ == '__main__':
-    parse_config()
\ No newline at end of file
+    check_config()
\ No newline at end of file
diff --git a/clean_corpus.py b/clean_corpus.py
index f1b85d3..b131bf8 100644
--- a/clean_corpus.py
+++ b/clean_corpus.py
@@ -22,51 +22,16 @@ def main(argc, argv):
         # print(lines1, lines2)
         i = 0
         for i in range(len(lines1)):
-            # if not(lines1[i].strip()) and not(lines2[i].strip()):
-            #     continue
-                # if i > 0:
-                #     if i < len(lines1)-1:
-                #         del lines1[i-1], lines2[i-1]
-                #         del lines1[i-1], lines2[i-1]
-                #         del lines1[i-1], lines2[i-1]
-                #     else:
-                #         del lines1[i-1], lines2[i-1]
-                #         del lines1[i-1], lines2[i-1]
-                # else:
-                #     if i < len(lines1)-1:
-                #         del lines1[i], lines2[i]
-                #         del lines1[i], lines2[i]
-                #     else:
-                #         del lines1[i], lines2[i]
             if (not lines1[i].strip()) or (not lines2[i].strip()):
                 lines_to_remove.update([i-1, i, i+1])
                 continue
             
-            # removing lines only with '°' and '*'
-            if (not lines1[i].replace('°', ' ').replace('*', ' ').strip()) and (not lines2[i].replace('°', ' ').replace('*', ' ').strip()):
+            # remove lines that contain only '°', '*', '.' and whitespace in both files
+            if (not lines1[i].replace('°', '').replace('*', '').replace('.','').strip()) and \
+                        (not lines2[i].replace('°', '').replace('*', '').replace('.', '').strip()):
                 lines_to_remove.add(i)
             # print(lines1, lines2)
-
-        # assert len(lines1) == len(lines2)
-
-        # if len(lines1) == 0:
-        #     l1.seek(0)
-        #     l1.write('\n')
-        #     l1.truncate()
-
-        #     l2.seek(0)
-        #     l2.write('\n')
-        #     l2.truncate()
-
-        #     l1.close()
-        #     l2.close()
-        #     return
-
-        # if '\n' not in lines1[len(lines1)-1]:
-        #     lines1[len(lines1)-1] = lines1[len(lines1)-1] + '\n'
-        # if '\n' not in lines2[len(lines2)-1]:
-        #     lines2[len(lines2)-1] = lines2[len(lines2)-1] + '\n'
-
+            
         print(lines_to_remove)
 
         l1.seek(0)
diff --git a/config.toml b/config.toml
index 22158dd..df7c7a3 100644
--- a/config.toml
+++ b/config.toml
@@ -1,5 +1,4 @@
 # configuration for lexical training
-# Note: pass absolute paths
 
 # corpus name
 CORPUS = "europarl-v7"
@@ -17,10 +16,10 @@ CORPUS_SL = "europarl-v7.eng-spa.eng"
 CORPUS_TL = "europarl-v7.eng-spa.spa"
 
 # apertium-lex-tools scripts
-LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
+LEX_TOOLS = "../apertium-lex-tools/scripts"
 
 # fast align build folder
-FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build"
+FAST_ALIGN = "coding_challenges/fast_align/build"
 
 # apertium language data
-LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa"
+LANG_DATA = "coding_challenges/apertium-eng-spa"
diff --git a/config.toml.example b/config.toml.example
index 22158dd..5949a1f 100644
--- a/config.toml.example
+++ b/config.toml.example
@@ -1,5 +1,4 @@
 # configuration for lexical training
-# Note: pass absolute paths
 
 # corpus name
 CORPUS = "europarl-v7"
diff --git a/lexical_training.py b/lexical_training.py
index 72fdfa0..cb105b8 100644
--- a/lexical_training.py
+++ b/lexical_training.py
@@ -1,8 +1,8 @@
 # lexical training script
-from check_config import parse_config
+from check_config import check_config
 
 def main():
-    config = parse_config()
+    config = check_config()
     print("parsing complete")
     
 if __name__ == '__main__':
diff --git a/tests/check_config_test.py b/tests/check_config_test.py
new file mode 100644
index 0000000..08753b6
--- /dev/null
+++ b/tests/check_config_test.py
@@ -0,0 +1,98 @@
+# tests check_config.py
+import sys
+from tomlkit import parse, dumps
+import os
+import shutil
+
+sys.path.append('../')
+
+from check_config import check_config
+
+def main(argc, argv):  # argc and argv are accepted but not used in this function
+    
+    # Test 1: load the base test config and append "abc" to every value
+    config_file = open('config_test.toml', 'r')
+    config_toml = config_file.read()
+    config = parse(config_toml)
+    config_file.close()
+
+    print("Test 1 : wrong paths")
+    print("---------------------")
+
+    for key in config:
+        config[key]+="abc"
+
+    if os.fork() == 0:  # child process; presumably forked because check_config() calls exit() on misconfiguration
+        with open('check_config_test.toml', 'w') as test_file:
+            test_file.write(dumps(config))
+        check_config('check_config_test.toml')
+        exit(0)
+
+    _, _ = os.wait()  # wait for the child to finish; its pid and exit status are discarded
+
+    # Test 2: reload the base config, corrupt SL and temporarily rename the tools check_config() looks for
+    config_file = open('config_test.toml', 'r')
+    config_toml = config_file.read()
+    config = parse(config_toml)
+    config_file.close()
+
+    print("Test 2 : partial/no installations")
+    print("----------------------------------")
+
+    config['SL']+="abc"  # changes the '<SL>-<TL>.autobil.bin' file names check_config() searches for
+
+    for path in os.environ["PATH"].split(os.pathsep):  # rename the first 'apertium' file found on PATH
+        if os.path.isfile(os.path.join(path, 'apertium')):
+            shutil.move(os.path.join(path, 'apertium'), os.path.join(path, 'apertium'+'abc'))
+            break
+    
+    for path in os.environ["PATH"].split(os.pathsep):  # rename the first 'yasmet' file found on PATH
+        if os.path.isfile(os.path.join(path, 'yasmet')):
+            shutil.move(os.path.join(path, 'yasmet'), os.path.join(path, 'yasmet'+'abc'))
+            break
+
+    if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')):  # rename only if present
+        shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'))
+
+    if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')):  # rename only if present
+        shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'))
+
+    if os.fork() == 0:  # child process: writes the modified config and runs the check on it
+        with open('check_config_test.toml', 'w') as test_file:
+            test_file.write(dumps(config))
+        check_config('check_config_test.toml')
+        exit(0)
+
+    _, _ = os.wait()  # wait for the child before restoring the renamed files
+
+    shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output'))  # NOTE(review): unconditional restore, unlike the isfile()-guarded rename above
+
+    shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align'))  # NOTE(review): unconditional restore, unlike the isfile()-guarded rename above
+
+    for path in os.environ["PATH"].split(os.pathsep):  # restore the first renamed 'apertiumabc' found on PATH
+        if os.path.isfile(os.path.join(path, 'apertium'+'abc')):
+            shutil.move(os.path.join(path, 'apertium'+'abc'), os.path.join(path, 'apertium'))
+            break
+    
+    for path in os.environ["PATH"].split(os.pathsep):  # restore the first renamed 'yasmetabc' found on PATH
+        if os.path.isfile(os.path.join(path, 'yasmet'+'abc')):
+            shutil.move(os.path.join(path, 'yasmet'+'abc'), os.path.join(path, 'yasmet'))
+            break
+    
+    # Test 3: run check_config() on the unmodified base test config, in this process (no fork)
+    config_file = open('config_test.toml', 'r')
+    config_toml = config_file.read()
+    config = parse(config_toml)
+    config_file.close()
+
+    print("Test 3 : correct installations")
+    print("-------------------------------")
+
+    with open('check_config_test.toml', 'w') as test_file:
+        test_file.write(dumps(config))
+    check_config('check_config_test.toml')
+
+    os.remove('check_config_test.toml')  # delete the temporary config file written by the tests
+
+if __name__ == '__main__':
+    main(len(sys.argv), sys.argv)
\ No newline at end of file
diff --git a/tests/config_test.toml b/tests/config_test.toml
new file mode 100644
index 0000000..0d67a99
--- /dev/null
+++ b/tests/config_test.toml
@@ -0,0 +1,25 @@
+# test configuration read by check_config_test.py; relative paths are resolved against the current working directory
+
+# corpus name
+CORPUS = "europarl-v7"
+
+# source language
+SL = "eng"
+
+# target language
+TL = "spa"
+
+# source corpus
+CORPUS_SL = "../europarl-v7.eng-spa.eng"
+
+# target corpus
+CORPUS_TL = "../europarl-v7.eng-spa.spa"
+
+# apertium-lex-tools scripts
+LEX_TOOLS = "../../apertium-lex-tools/scripts"
+
+# fast align build folder
+FAST_ALIGN = "../coding_challenges/fast_align/build"
+
+# apertium language data
+LANG_DATA = "../coding_challenges/apertium-eng-spa"