commit 34b7b9c0d77995dc744601be0d96fb3618c59c2e
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Sat Jul 10 20:34:11 2021 +0530

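    Use '~~' as the space placeholder and tidy the training pipeline

    - Replace spaces with '~~' instead of '~' in the tagged corpora, and
      restore them with the matching '~~' -> ' ' substitution.
    - Remove the commented-out apertium-destxt Popen lines; keep inline
      comment hints for 'apertium-destxt' and '-f none' on the tagger
      pipelines instead.
    - Open tagged_merged with mode 'w' instead of 'w+'.
    - Dedent the phrasetable paste/sed step one level so it runs outside
      the enclosing block.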

diff --git a/lexical_training.py b/lexical_training.py
index bb75673..e0d7ade 100644
--- a/lexical_training.py
+++ b/lexical_training.py
@@ -113,27 +113,21 @@ def training(config, cache_dir, log):
     print('loading', training_lines, 'lines from the corpora')
 
     # tagging the source side corpus
-    cmds = [['head', '-n', str(training_lines)],
-            ['apertium', '-d', config['LANG_DATA'],
+    cmds = [['head', '-n', str(training_lines)],  # ['apertium-destxt'],
+            ['apertium', '-d', config['LANG_DATA'],  # '-f', 'none',
              config['SL']+'-'+config['TL']+'-tagger'],
             ['apertium-pretransfer']]
     with open(config['CORPUS_SL']) as inp, open(sl_tagged, 'w') as outp:
         pipe(cmds, inp, outp, log).wait()
 
-    # c2 = ['apertium-destxt']
-    # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
-
     # tagging the target side corpus
-    cmds = [['head', '-n', str(training_lines)],
-            ['apertium', '-d', config['LANG_DATA'],
+    cmds = [['head', '-n', str(training_lines)],  # ['apertium-destxt'],
+            ['apertium', '-d', config['LANG_DATA'],  # '-f', 'none',
              config['TL']+'-'+config['SL']+'-tagger'],
             ['apertium-pretransfer']]
     with open(config['CORPUS_TL']) as inp, open(tl_tagged, 'w') as outp:
         pipe(cmds, inp, outp, log).wait()
 
-    # c2 = ['apertium-destxt']
-    # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
-
     # removing lines with no analyses
     with open(lines, 'w') as f:
         call(['seq', '1', str(training_lines)],
@@ -152,20 +146,20 @@ def training(config, cache_dir, log):
 
         f0.seek(0)
         with open(sl_tagged, 'w') as f2:
-            cmds = [['cut', '-f', '2'], ['sed', 's/ /~/g'],
+            cmds = [['cut', '-f', '2'], ['sed', 's/ /~~/g'],
                     ['sed', 's/\$[^\^]*/$ /g']]
             pipe(cmds, f0, f2, log).wait()
 
         f0.seek(0)
         with open(tl_tagged, 'w') as f2:
-            cmds = [['cut', '-f', '3'], ['sed', 's/ /~/g'],
+            cmds = [['cut', '-f', '3'], ['sed', 's/ /~~/g'],
                     ['sed', 's/\$[^\^]*/$ /g']]
             pipe(cmds, f0, f2, log).wait()
 
     os.remove(clean_tagged)
 
     # aligning the parallel corpus
-    with open(tagged_merged, 'w+') as f:
+    with open(tagged_merged, 'w') as f:
         with open(os.devnull, 'r') as f1:
             call(['paste', '-d', '||| ', tl_tagged, '-', '-', '-',
                   sl_tagged], stdin=f1, stdout=f, stderr=log)
@@ -177,12 +171,12 @@ def training(config, cache_dir, log):
     with open(sl_tagged, 'r+') as f:
         data = f.read()
         f.seek(0)
-        f.write(data.replace('~', ' '))
+        f.write(data.replace('~~', ' '))
 
     with open(tl_tagged, 'r+') as f:
         data = f.read()
         f.seek(0)
-        f.write(data.replace('~', ' '))
+        f.write(data.replace('~~', ' '))
 
     # temp files
     tmp1 = 'tmp1'
@@ -202,9 +196,10 @@ def training(config, cache_dir, log):
             with open(clean_biltrans, 'w') as f0:
                 call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
                       os.path.join(config['LANG_DATA'], sl_tl_autobil)], stdin=f, stdout=f0, stderr=log)
-        cmds = [['paste', tmp1, tmp2, alignment], ['sed', 's/\t/ ||| /g']]
-        with open(phrasetable, 'w') as f:
-            pipe(cmds, None, f, log).wait()
+
+    cmds = [['paste', tmp1, tmp2, alignment], ['sed', 's/\t/ ||| /g']]
+    with open(phrasetable, 'w') as f:
+        pipe(cmds, None, f, log).wait()
 
     os.remove(tmp1)
     os.remove(tmp2)