commit a1662f785d5b83425d5dcfbf8b762d50c2688464
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Mon Jun 14 14:30:29 2021 +0530

    code optimizations

diff --git a/lexical_training.py b/lexical_training.py
index 931bd29..b99ffc8 100644
--- a/lexical_training.py
+++ b/lexical_training.py
@@ -1,10 +1,10 @@
 # lexical training script
 import os
 from subprocess import Popen, PIPE, call
-from typing import List
 from check_config import check_config
 from clean_corpus import clean_corpus
 
+
 def query(question, default="yes"):
     """Ask a yes/no question via raw_input() and return their answer.
 
@@ -22,7 +22,7 @@ def query(question, default="yes"):
         prompt = " [y/N]"
     else:
         prompt = " [Y/n]"
-        default= "yes"
+        default = "yes"
 
     while True:
         print(question + prompt+"(default='"+default+"')?")
@@ -35,133 +35,130 @@ def query(question, default="yes"):
             print("Please respond with 'yes', 'no', 'y' or 'n'")
             exit(1)
 
-def main():
-    print("validating configuration....")
-    config = check_config()
 
-    # cleaning the parallel corpus i.e. removing empty sentences, sentences only with '*', '.', or '°'
-    print("cleaning corpus....")
-    # clean_corpus(config['CORPUS_SL'], config['CORPUS_TL'])
+def pipe(cmds, firstin, lastout, stderr):
+    """Open a list of commands as a simple shell pipe, using the same stderr
+    for all commands.
+    Returns None if the command list is empty.
+    Example usage:
+    >>> cmds = [['yes', 'olleh'], ['head', '-2'], ['rev']]
+    >>> with open('/tmp/foo', 'w') as outf:
+            p = pipe(cmds, None, outf, sys.stderr)
+            retcode = p.wait()
+    >>> print(open('/tmp/foo', 'r').read())
+    hello
+    hello
+    """
+    if cmds == []:
+        return
+    procs = []                  # type: List[Popen]
+    for i in range(len(cmds)):
+        cmd = cmds[i]
+        inp = procs[i-1].stdout if i > 0 else firstin
+        outp = PIPE if i+1 < len(cmds) else lastout
+        procs.append(Popen(cmd, stdin=inp, stdout=outp, stderr=stderr))
+    return procs[-1]
+
 
+def training(config, cache_dir, log):
     # file names
-    cache_dir = "cache-"+config['SL']+"-"+config['TL']
-    training_log_name = cache_dir+'/'+'training.log'
     sl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['SL']
     tl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['TL']
     lines = cache_dir+'/'+config['CORPUS']+'.lines'
-    tagged_merged = cache_dir+'/'+config['CORPUS']+'.tagged-merged.'+config['SL']+'-'+config['TL']
-    alignment = cache_dir+'/'+config['CORPUS']+'.align.'+config['SL']+'-'+config['TL']
+    tagged_merged = cache_dir+'/' + \
+        config['CORPUS']+'.tagged-merged.'+config['SL']+'-'+config['TL']
+    alignment = cache_dir+'/'+config['CORPUS'] + \
+        '.align.'+config['SL']+'-'+config['TL']
 
     with open(config['CORPUS_SL'], 'r') as corpus_sl:
-        training_lines = min(config['TRAINING_LINES'], len(corpus_sl.readlines()))
-    
-    print('loading', training_lines, 'lines from the corpora')
-
-    # the directory where all the intermediary outputs are stored
-    if not os.path.isdir(cache_dir):
-        os.mkdir(cache_dir)
-    else:
-        if not query("Do you want to overwrite the files in "+"'"+cache_dir+"'"):
-            print("remove", cache_dir, "and re-run lexical_training.py")
-            exit(1)
-
-    if os.path.isfile(training_log_name):
-        os.remove(training_log_name)
+        training_lines = min(config['TRAINING_LINES'],
+                             len(corpus_sl.readlines()))
 
-    training_log = open(training_log_name, 'a')
+    print('loading', training_lines, 'lines from the corpora')
 
     # tagging the source side corpus
-    c1 = ['head', '-n', str(config['TRAINING_LINES'])]
-    with open(config['CORPUS_SL']) as f:
-        p1 = Popen(c1, stdin=f, stdout=PIPE, stderr=training_log)
+    cmds = [['head', '-n', str(training_lines)],
+            ['apertium', '-d', config['LANG_DATA'],
+             config['SL']+'-'+config['TL']+'-tagger'],
+            ['sed', 's/ \+/ /g'], ['apertium-pretransfer']]
+    with open(config['CORPUS_SL']) as inp, open(sl_tagged, 'w') as outp:
+        pipe(cmds, inp, outp, log)
 
     # c2 = ['apertium-destxt']
     # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
 
-    c3 = ['apertium', '-d', config['LANG_DATA'], config['SL']+'-'+config['TL']+'-tagger']
-    p3 = Popen(c3, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
-
-    c4 = ['sed', 's/ \+/ /g']
-    p4 = Popen(c4, stdin=p3.stdout, stdout=PIPE, stderr=training_log)
-
-    c5 = ['apertium-pretransfer']
-    with open(sl_tagged, 'w') as f:
-        Popen(c5, stdin=p4.stdout, stdout=f, stderr=training_log)
-
     # tagging the target side corpus
-    c1 = ['head', '-n', str(config['TRAINING_LINES'])]
-    with open(config['CORPUS_TL']) as f:
-        p1 = Popen(c1, stdin=f, stdout=PIPE, stderr=training_log)
+    cmds = [['head', '-n', str(training_lines)],
+            ['apertium', '-d', config['LANG_DATA'],
+             config['TL']+'-'+config['SL']+'-tagger'],
+            ['sed', 's/ \+/ /g'], ['apertium-pretransfer']]
+    with open(config['CORPUS_TL']) as inp, open(tl_tagged, 'w') as outp:
+        pipe(cmds, inp, outp, log).wait()
 
     # c2 = ['apertium-destxt']
     # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
 
-    c3 = ['apertium', '-d', config['LANG_DATA'], config['TL']+'-'+config['SL']+'-tagger']
-    p3 = Popen(c3, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
-
-    c4 = ['sed', 's/ \+/ /g']
-    p4 = Popen(c4, stdin=p3.stdout, stdout=PIPE, stderr=training_log)
-
-    c5 = ['apertium-pretransfer']
-    with open(tl_tagged, 'w') as f:
-        Popen(c5, stdin=p4.stdout, stdout=f, stderr=training_log).wait()
-
     # removing lines with no analyses
     with open(lines, 'w+') as f0:
-        call(['seq', '1', str(config['TRAINING_LINES'])], stdout=f0, stderr=training_log)
+        call(['seq', '1', str(training_lines)],
+             stdout=f0, stderr=log)
         clean_tagged = cache_dir+'/'+config['CORPUS']+'.clean_tagged'
         with open(clean_tagged, 'w+') as f1:
-            p1 = Popen(['paste', lines, sl_tagged, tl_tagged], stdout=PIPE, stderr=training_log)
-            Popen(['grep', '<*\t*<'], stdin=p1.stdout, stdout=f1, stderr=training_log).wait()
+            cmds = [['paste', lines, sl_tagged, tl_tagged],
+                    ['grep', '<*\t*<']]
+            pipe(cmds, None, f1, log)
 
-            call(['cut', '-f', '1'], stdin=f1, stdout=f0, stderr=training_log)
+            call(['cut', '-f', '1'], stdin=f1, stdout=f0, stderr=log)
 
             f1.seek(0)
             with open(sl_tagged, 'w') as f2:
-                p1 = Popen(['cut', '-f', '2'], stdin=f1, stdout=PIPE, stderr=training_log)
-                p2 = Popen(['sed', 's/ /~/g'], stdin=p1.stdout, stdout=PIPE, stderr=training_log)
-                Popen(['sed', 's/\$[^\^]*/$ /g'], stdin=p2.stdout, stdout=f2, stderr=training_log)
+                cmds = [['cut', '-f', '2'], ['sed', 's/ /~/g'],
+                        ['sed', 's/\$[^\^]*/$ /g']]
+                pipe(cmds, f1, f2, log)
 
             f1.seek(0)
             with open(tl_tagged, 'w') as f2:
-                p1 = Popen(['cut', '-f', '3'], stdin=f1, stdout=PIPE, stderr=training_log)
-                p2 = Popen(['sed', 's/ /~/g'], stdin=p1.stdout, stdout=PIPE, stderr=training_log)
-                Popen(['sed', 's/\$[^\^]*/$ /g'], stdin=p2.stdout, stdout=f2, stderr=training_log).wait()
+                cmds = [['cut', '-f', '3'], ['sed', 's/ /~/g'],
+                        ['sed', 's/\$[^\^]*/$ /g']]
+                pipe(cmds, f1, f2, log).wait()
 
     os.remove(clean_tagged)
 
     # aligning the parallel corpus
     with open(tagged_merged, 'w+') as f:
         with open(os.devnull, 'r') as f1:
-            call(['paste', '-d', '||| ', tl_tagged, '-', '-', '-', sl_tagged], stdin=f1, stdout=f, stderr=training_log)
+            call(['paste', '-d', '||| ', tl_tagged, '-', '-', '-',
+                  sl_tagged], stdin=f1, stdout=f, stderr=log)
         with open(alignment, 'w') as f2:
-            call([config['FAST_ALIGN'], '-i', tagged_merged, '-d', '-o', '-v'], stdout=f2, stderr=training_log)
+            call([config['FAST_ALIGN'], '-i', tagged_merged, '-d',
+                  '-o', '-v'], stdout=f2, stderr=log)
 
-    training_log.close()
 
-if __name__ == '__main__':
-    main()
+def main():
+    print("validating configuration....")
+    config = check_config()
 
+    # cleaning the parallel corpus i.e. removing empty sentences, sentences only with '*', '.', or '°'
+    print("cleaning corpus....")
+    # clean_corpus(config['CORPUS_SL'], config['CORPUS_TL'])
 
-def pipe(cmds, firstin, lastout, stderr):
-    """Open a list of commands as a simple shell pipe, using the same stderr
-    for all commands.
-    Returns None if the command list is empty.
-    Example usage:
-    >>> cmds = [['yes', 'olleh'], ['head', '-2'], ['rev']]
-    >>> with open('/tmp/foo', 'w') as outf:
-        p = pipe(cmds, None, outf, sys.stderr)
-        retcode = p.wait()
-    >>> print(open('/tmp/foo', 'r').read())
-    hello
-    hello
-    """
-    if cmds == []:
-        return
-    procs = []                  # type: List[Popen]
-    for i in range(len(cmds)):
-        cmd = cmds[i]
-        inp = procs[i-1].stdout if i > 0 else firstin
-        outp = PIPE if i+1 < len(cmds) else lastout
-        procs.append(Popen(cmd, stdin=inp, stdout=outp, stderr=stderr))
-    return procs[-1]
+    cache_dir = "cache-"+config['SL']+"-"+config['TL']
+    log = cache_dir+'/'+'training.log'
+
+    # the directory where all the intermediary outputs are stored
+    if not os.path.isdir(cache_dir):
+        os.mkdir(cache_dir)
+    else:
+        if not query("Do you want to overwrite the files in "+"'"+cache_dir+"'"):
+            print("remove", cache_dir, "and re-run lexical_training.py")
+            exit(1)
+
+    if os.path.isfile(log):
+        os.remove(log)
+
+    with open(log, 'a') as log_file:
+        training(config, cache_dir, log_file)
+
+
+if __name__ == '__main__':
+    main()