commit 6b0079ecf254907c1ed00248f5deb2a67616262a
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Sat Jun 12 14:39:39 2021 +0530

    added code for alignment

diff --git a/check_config.py b/check_config.py
index 2455282..60b77db 100644
--- a/check_config.py
+++ b/check_config.py
@@ -52,15 +52,15 @@ def check_config(filename='config.toml'):
                         "provide a valid directory or \nto install, follow", lex_tools_url, '\n')
             misconfigured = True
 
-    if not os.path.isdir(config['FAST_ALIGN']):
-        print("'"+config['FAST_ALIGN']+"'(FAST_ALIGN)", "is not a directory, provide"+ \
-                    " a valid directory or \nto install, follow", fast_align_url, '\n')
+    if not os.path.isfile(config['FAST_ALIGN']):
+        print("'"+config['FAST_ALIGN']+"'(FAST_ALIGN)", "is not a file, provide"+ \
+                    " a valid executable or \nto install, follow", fast_align_url, '\n')
         misconfigured = True
-    else:
-        if 'fast_align' not in os.listdir(config['FAST_ALIGN']):
-            print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \
-                            "provide a valid directory or \nto install, follow", fast_align_url, '\n')
-            misconfigured = True
+    # else:
+    #     if 'fast_align' not in os.listdir(config['FAST_ALIGN']):
+    #         print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \
+    #                         "provide a valid directory or \nto install, follow", fast_align_url, '\n')
+    #         misconfigured = True
     
     if not os.path.isdir(config['LANG_DATA']):
         print("'"+config['LANG_DATA']+"'(LANG_DATA)", "is not a directory, provide a valid "+ \
diff --git a/config.toml b/config.toml
index 6e94ceb..7684198 100644
--- a/config.toml
+++ b/config.toml
@@ -18,8 +18,8 @@ CORPUS_TL = "europarl-v7.eng-spa.spa"
 # apertium-lex-tools scripts
 LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
 
-# fast align build folder
-FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build"
+# fast_align executable (full path to the binary itself, not the build folder)
+FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build/fast_align"
 
 # apertium language data
 LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa"
diff --git a/lexical_training.py b/lexical_training.py
index e0470b6..2bfb766 100644
--- a/lexical_training.py
+++ b/lexical_training.py
@@ -1,9 +1,39 @@
 # lexical training script
 import os
-from subprocess import Popen, PIPE
+from subprocess import Popen, PIPE, call
 from check_config import check_config
 from clean_corpus import clean_corpus
 
+def query(question, default="yes"):
+    """Ask a yes/no question via input() and return the answer as a bool.
+
+    "question" is a string that is presented to the user.
+    "default" is the presumed answer if the user just hits <Enter>: "yes"
+            (the default), "no", or None (an answer is then required).
+            Any other value is treated as "yes".
+
+    Returns True for "yes" and False for "no". An unrecognised reply
+    prints a hint and terminates the program with exit status 1.
+    """
+    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
+    if default is None:
+        # no default to advertise; building the hint from None would raise TypeError
+        prompt = " [y/n]?"
+    elif default == "no":
+        prompt = " [y/N](default='no')?"
+    else:
+        default = "yes"
+        prompt = " [Y/n](default='yes')?"
+
+    print(question + prompt)
+    choice = input().lower()
+    if default is not None and choice == "":
+        return valid[default]
+    if choice in valid:
+        return valid[choice]
+    print("Please respond with 'yes', 'no', 'y' or 'n'")
+    raise SystemExit(1)
+
 def main():
     print("validating configuration....")
     config = check_config()
@@ -12,18 +42,29 @@ def main():
     print("cleaning corpus....")
     # clean_corpus(config['CORPUS_SL'], config['CORPUS_TL'])
 
+    # file names
+    cache_dir = "cache-"+config['SL']+"-"+config['TL']
+    training_log_name = cache_dir+'/'+'training.log'
+    sl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['SL']
+    tl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['TL']
+    lines = cache_dir+'/'+config['CORPUS']+'.lines'
+    tagged_merged = cache_dir+'/'+config['CORPUS']+'.tagged-merged.'+config['SL']+'-'+config['TL']
+    alignment = cache_dir+'/'+config['CORPUS']+'.align.'+config['SL']+'-'+config['TL']
+
     with open(config['CORPUS_SL'], 'r') as corpus_sl:
         training_lines = min(config['TRAINING_LINES'], len(corpus_sl.readlines()))
     
     print('loading', training_lines, 'lines from the corpora')
 
     # the directory where all the intermediary outputs are stored
-    cache_dir = "cache-"+config['SL']+"-"+config['TL']
     if not os.path.isdir(cache_dir):
         os.mkdir(cache_dir)
+    else:
+        if not query("Do you want to overwrite the files in "+"'"+cache_dir+"'"):
+            print("remove", cache_dir, "and re-run lexical_training.py")
+            exit(1)
 
-    training_log_name = cache_dir+'/'+'training.log'
-    if os.path.isdir(training_log_name):
+    if os.path.isfile(training_log_name):
         os.remove(training_log_name)
 
     training_log = open(training_log_name, 'a')
@@ -33,32 +74,67 @@ def main():
     with open(config['CORPUS_SL']) as f:
         p1 = Popen(c1, stdin=f, stdout=PIPE, stderr=training_log)
 
-    c2 = ['apertium-destxt']
-    p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
+    # c2 = ['apertium-destxt']
+    # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
 
     c3 = ['apertium', '-d', config['LANG_DATA'], config['SL']+'-'+config['TL']+'-tagger']
-    p3 = Popen(c3, stdin=p2.stdout, stdout=PIPE, stderr=training_log)
+    p3 = Popen(c3, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
 
-    c4 = ['apertium-pretransfer']
-    sl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['SL']
+    c4 = ['sed', 's/ \+/ /g']
+    p4 = Popen(c4, stdin=p3.stdout, stdout=PIPE, stderr=training_log)
+
+    c5 = ['apertium-pretransfer']
     with open(sl_tagged, 'w') as f:
-        Popen(c4, stdin=p3.stdout, stdout=f, stderr=training_log)
+        Popen(c5, stdin=p4.stdout, stdout=f, stderr=training_log)
 
     # tagging the target side corpus
     c1 = ['head', '-n', str(config['TRAINING_LINES'])]
     with open(config['CORPUS_TL']) as f:
         p1 = Popen(c1, stdin=f, stdout=PIPE, stderr=training_log)
 
-    c2 = ['apertium-destxt']
-    p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
+    # c2 = ['apertium-destxt']
+    # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
 
     c3 = ['apertium', '-d', config['LANG_DATA'], config['TL']+'-'+config['SL']+'-tagger']
-    p3 = Popen(c3, stdin=p2.stdout, stdout=PIPE, stderr=training_log)
+    p3 = Popen(c3, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
 
-    c4 = ['apertium-pretransfer']
-    tl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['TL']
+    c4 = ['sed', 's/ \+/ /g']
+    p4 = Popen(c4, stdin=p3.stdout, stdout=PIPE, stderr=training_log)
+
+    c5 = ['apertium-pretransfer']
     with open(tl_tagged, 'w') as f:
-        Popen(c4, stdin=p3.stdout, stdout=f, stderr=training_log).wait()
+        Popen(c5, stdin=p4.stdout, stdout=f, stderr=training_log).wait()
+
+    # removing lines with no analyses
+    with open(lines, 'w+') as f0:
+        call(['seq', '1', str(config['TRAINING_LINES'])], stdout=f0, stderr=training_log)
+        clean_tagged = cache_dir+'/'+config['CORPUS']+'.clean_tagged'
+        with open(clean_tagged, 'w+') as f1:
+            p1 = Popen(['paste', lines, sl_tagged, tl_tagged], stdout=PIPE, stderr=training_log)
+            Popen(['grep', '<*\t*<'], stdin=p1.stdout, stdout=f1, stderr=training_log).wait()
+
+            call(['cut', '-f', '1'], stdin=f1, stdout=f0, stderr=training_log)
+
+            f1.seek(0)
+            with open(sl_tagged, 'w') as f2:
+                p1 = Popen(['cut', '-f', '2'], stdin=f1, stdout=PIPE, stderr=training_log)
+                p2 = Popen(['sed', 's/ /~/g'], stdin=p1.stdout, stdout=PIPE, stderr=training_log)
+                Popen(['sed', 's/\$[^\^]*/$ /g'], stdin=p2.stdout, stdout=f2, stderr=training_log)
+
+            f1.seek(0)
+            with open(tl_tagged, 'w') as f2:
+                p1 = Popen(['cut', '-f', '3'], stdin=f1, stdout=PIPE, stderr=training_log)
+                p2 = Popen(['sed', 's/ /~/g'], stdin=p1.stdout, stdout=PIPE, stderr=training_log)
+                Popen(['sed', 's/\$[^\^]*/$ /g'], stdin=p2.stdout, stdout=f2, stderr=training_log).wait()
+
+    os.remove(clean_tagged)
+
+    # aligning the parallel corpus
+    with open(tagged_merged, 'w+') as f:
+        with open(os.devnull, 'r') as f1:
+            call(['paste', '-d', '||| ', tl_tagged, '-', '-', '-', sl_tagged], stdin=f1, stdout=f, stderr=training_log)
+        with open(alignment, 'w') as f2:
+            call([config['FAST_ALIGN'], '-i', tagged_merged, '-d', '-o', '-v'], stdout=f2, stderr=training_log)
 
     training_log.close()
 
diff --git a/tests/check_config_test.py b/tests/check_config_test.py
index c1d5bb3..3f678d0 100644
--- a/tests/check_config_test.py
+++ b/tests/check_config_test.py
@@ -56,8 +56,8 @@ def main(argc, argv):
     if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')):
         shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'))
 
-    if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')):
-        shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'))
+    # if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')):
+    #     shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'))
 
     if os.fork() == 0:
         with open('check_config_test.toml', 'w') as test_file:
@@ -69,7 +69,7 @@ def main(argc, argv):
 
     shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output'))
 
-    shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align'))
+    # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align'))
 
     for path in os.environ["PATH"].split(os.pathsep):
         if os.path.isfile(os.path.join(path, 'apertium'+'abc')):