commit aa27247b596eec6390037720fa169c06484ce789
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Tue May 25 19:12:41 2021 +0530

    scripts for checking config added

diff --git a/check_config.py b/check_config.py
new file mode 100644
index 0000000..599ebb8
--- /dev/null
+++ b/check_config.py
@@ -0,0 +1,87 @@
+# parses the config and checks that the required tools and data are present
+
+import os
+import shutil
+import sys
+
+from tomlkit import parse, dumps
+
+# urls of the required tools and data
+corpora_url = "https://wiki.apertium.org/wiki/Corpora"
+lex_tools_url = "https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling"
+fast_align_url = "https://github.com/clab/fast_align"
+langs_url = "https://wiki.apertium.org/wiki/List_of_language_pairs"
+apertium_url = "https://wiki.apertium.org/wiki/Installation"
+yasmet_url = "https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules"
+
+
+def _fail(*message):
+    """Print message the way print() would, then exit with status -1."""
+    print(*message)
+    sys.exit(-1)
+
+
+def parse_config(filename='config.toml'):
+    """Parse the TOML config and check that everything it refers to exists.
+
+    Checked in order: the source and target corpus files, the
+    apertium-lex-tools scripts directory, the fast_align build directory,
+    the language data directory (SL-TL and TL-SL .autobil.bin files) and
+    that apertium and yasmet can be found on PATH.
+
+    On the first failed check a message is printed and the process exits
+    with status -1.
+
+    filename: path of the TOML config file.
+    Returns the parsed config as returned by tomlkit.parse.
+    """
+    with open(filename, encoding='utf-8') as config_file:
+        config_toml = config_file.read()
+    config = parse(config_toml)
+
+    # round-trip check, done explicitly instead of with assert so that it
+    # is not skipped under python -O
+    if config_toml != dumps(config):
+        _fail(filename, "was not parsed correctly, check its syntax")
+
+    required_keys = ('SL', 'TL', 'CORPUS_SL', 'CORPUS_TL', 'LEX_TOOLS', 'FAST_ALIGN', 'LANG_DATA')
+    missing_keys = [key for key in required_keys if key not in config]
+    if missing_keys:
+        _fail("missing in", filename + ":", ", ".join(missing_keys))
+
+    for corpus in (config['CORPUS_SL'], config['CORPUS_TL']):
+        if not os.path.isfile(corpus):
+            _fail(corpus, "is not a file, provide a valid file or \nto download, look", corpora_url)
+
+    lex_tools = config['LEX_TOOLS']
+    if not os.path.isdir(lex_tools):
+        _fail(lex_tools, "is not a directory, provide a valid directory or \nto install, follow", lex_tools_url)
+    # only this one tool is checked, assuming the other scripts are intact
+    if not os.path.isfile(os.path.join(lex_tools, 'process-tagger-output')):
+        _fail("process-tagger-output is not in", lex_tools + ",", "provide a valid directory or \nto install, follow", lex_tools_url)
+
+    fast_align = config['FAST_ALIGN']
+    if not os.path.isdir(fast_align):
+        _fail(fast_align, "is not a directory, provide a valid directory or \nto install, follow", fast_align_url)
+    if not os.path.isfile(os.path.join(fast_align, 'fast_align')):
+        _fail("fast_align is not present in", fast_align + ",", "provide a valid directory or \nto install, follow", fast_align_url)
+
+    lang_data = config['LANG_DATA']
+    if not os.path.isdir(lang_data):
+        _fail(lang_data, "is not a directory, provide a valid directory or \nto install, follow", langs_url)
+    sl_tl_autobil = config['SL'] + '-' + config['TL'] + '.autobil.bin'
+    tl_sl_autobil = config['TL'] + '-' + config['SL'] + '.autobil.bin'
+    for autobil in (sl_tl_autobil, tl_sl_autobil):
+        if not os.path.isfile(os.path.join(lang_data, autobil)):
+            _fail(autobil, "is not in", lang_data + ",", "provide a valid directory or \nto install, follow", langs_url)
+
+    # shutil.which searches PATH and only accepts executable files
+    for program, url in (('apertium', apertium_url), ('yasmet', yasmet_url)):
+        if shutil.which(program) is None:
+            _fail(program, "is either not installed or not added to path, see", url)
+
+    return config
+
+
+if __name__ == '__main__':
+    parse_config()
diff --git a/clean_corpus.py b/clean_corpus.py
new file mode 100644
index 0000000..f1b85d3
--- /dev/null
+++ b/clean_corpus.py
@@ -0,0 +1,66 @@
+# cleans a pair of parallel corpora in place:
+# - removes every line that is empty in either corpus, together with the
+#   lines directly above and below it
+# - removes lines that contain only ° and * in both corpora
+# - strips leading and trailing whitespace from the lines that are kept
+
+import sys
+
+
+def _only_filler(line):
+    """Return True if line has nothing but '°', '*' and whitespace."""
+    return not line.replace('°', ' ').replace('*', ' ').strip()
+
+
+def _lines_to_remove(lines1, lines2):
+    """Return the indices of the line pairs that have to be dropped.
+
+    lines1 and lines2 are the lines of the two corpora and must have the
+    same length.
+    """
+    to_remove = set()
+    for i, (line1, line2) in enumerate(zip(lines1, lines2)):
+        if not line1.strip() or not line2.strip():
+            # the neighbours of an empty line go as well; the range is
+            # clamped so that only existing line numbers are recorded
+            to_remove.update(range(max(i - 1, 0), min(i + 2, len(lines1))))
+        elif _only_filler(line1) and _only_filler(line2):
+            to_remove.add(i)
+    return to_remove
+
+
+def main(argc, argv):
+    """Clean the two corpus files named in argv[1] and argv[2] in place.
+
+    argc: number of entries in argv, must be 3.
+    argv: [script name, corpus 1, corpus 2].
+
+    Exits with status -1 if the arguments are wrong or the corpora do not
+    have the same number of lines; the files are left untouched then.
+    """
+    if argc != 3:
+        print('usage: clean_corpus.py <corpus 1> <corpus 2>')
+        sys.exit(-1)
+
+    # read everything first so that nothing is overwritten if reading or
+    # validation fails
+    with open(argv[1], encoding='utf-8') as corpus1, open(argv[2], encoding='utf-8') as corpus2:
+        lines1 = corpus1.readlines()
+        lines2 = corpus2.readlines()
+
+    if len(lines1) != len(lines2):
+        print('corpora have different number of lines:', len(lines1), 'and', len(lines2))
+        sys.exit(-1)
+
+    to_remove = _lines_to_remove(lines1, lines2)
+    print('removing', len(to_remove), 'of', len(lines1), 'lines')
+
+    with open(argv[1], 'w', encoding='utf-8') as corpus1, open(argv[2], 'w', encoding='utf-8') as corpus2:
+        for i, (line1, line2) in enumerate(zip(lines1, lines2)):
+            if i not in to_remove:
+                corpus1.write(line1.strip() + '\n')
+                corpus2.write(line2.strip() + '\n')
+
+
+if __name__ == '__main__':
+    main(len(sys.argv), sys.argv)
diff --git a/config.toml b/config.toml
new file mode 100644
index 0000000..22158dd
--- /dev/null
+++ b/config.toml
@@ -0,0 +1,26 @@
+# configuration for lexical training
+# Note: use absolute paths; relative paths are resolved against the directory the scripts are run from
+
+# corpus name
+CORPUS = "europarl-v7"
+
+# source language
+SL = "eng"
+
+# target language
+TL = "spa"
+
+# source corpus
+CORPUS_SL = "europarl-v7.eng-spa.eng"
+
+# target corpus
+CORPUS_TL = "europarl-v7.eng-spa.spa"
+
+# apertium-lex-tools scripts directory (must contain process-tagger-output)
+LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
+
+# fast_align build directory (must contain fast_align)
+FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build"
+
+# apertium language data directory (must contain SL-TL.autobil.bin and TL-SL.autobil.bin)
+LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa"
diff --git a/config.toml.example b/config.toml.example
new file mode 100644
index 0000000..22158dd
--- /dev/null
+++ b/config.toml.example
@@ -0,0 +1,26 @@
+# configuration for lexical training
+# Note: use absolute paths; relative paths are resolved against the directory the scripts are run from
+
+# corpus name
+CORPUS = "europarl-v7"
+
+# source language
+SL = "eng"
+
+# target language
+TL = "spa"
+
+# source corpus
+CORPUS_SL = "europarl-v7.eng-spa.eng"
+
+# target corpus
+CORPUS_TL = "europarl-v7.eng-spa.spa"
+
+# apertium-lex-tools scripts directory (must contain process-tagger-output)
+LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
+
+# fast_align build directory (must contain fast_align)
+FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build"
+
+# apertium language data directory (must contain SL-TL.autobil.bin and TL-SL.autobil.bin)
+LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa"
diff --git a/lexical_training.py b/lexical_training.py
new file mode 100644
index 0000000..72fdfa0
--- /dev/null
+++ b/lexical_training.py
@@ -0,0 +1,12 @@
+# lexical training script
+from check_config import parse_config
+
+
+def main():
+    """Validate the config; parse_config exits the process on any problem."""
+    parse_config()
+    print("parsing complete")
+
+
+if __name__ == '__main__':
+    main()