commit 2de39d0334561ffaa69444687d6fd8b870f16585
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Thu Jun 24 16:03:00 2021 +0530

    minor fixes

diff --git a/clean_corpus.py b/clean_corpus.py
index ce84b89..2aac7fa 100644
--- a/clean_corpus.py
+++ b/clean_corpus.py
@@ -4,6 +4,7 @@
 
 
 import sys
+import re
 
 
 def clean_corpus(corpus1, corpus2):
@@ -17,44 +18,46 @@ def clean_corpus(corpus1, corpus2):
         lines2 = l2.readlines()
         assert len(lines1) == len(lines2)
         # print(lines1, lines2)
-        i = 0
         for i in range(len(lines1)):
-            if (not lines1[i].strip()) or (not lines2[i].strip()):
+            removal_map = "".maketrans('', '', '°.*')
+            if (not lines1[i].translate(removal_map).strip()) != (not lines2[i].translate(removal_map).strip()):
                 lines_to_remove.update([i-1, i, i+1])
                 continue
-            
+
             # removing lines only with '°', '*' and '.'
-            if (not lines1[i].replace('°', '').replace('*', '').replace('.','').strip()) and \
-                        (not lines2[i].replace('°', '').replace('*', '').replace('.', '').strip()):
-                lines_to_remove.add(i)
+            # if (not lines1[i].replace('°', '').replace('*', '').replace('.', '').strip()) and \
+            #         (not lines2[i].replace('°', '').replace('*', '').replace('.', '').strip()):
+            #     lines_to_remove.add(i)
             # print(lines1, lines2)
-            
+            if (not lines1[i].translate(removal_map).strip()) and \
+                    (not lines2[i].translate(removal_map).strip()):
+                lines_to_remove.add(i)
+
         # print(lines_to_remove)
 
-        l1.seek(0)
-        # l1.write(''.join(lines1))
-        l1.write('')
-        l1.truncate()
+        # l1.seek(0)
+        # # l1.write(''.join(lines1))
+        # l1.write('')
+        l1.truncate(0)
 
-        l2.seek(0)
-        l2.write('')
-        l2.truncate()
+        # l2.seek(0)
+        # l2.write('')
+        l2.truncate(0)
 
-    with open(corpus1, 'a') as l1, open(corpus2, 'a') as l2:
+    with open(corpus1, 'w') as l1, open(corpus2, 'w') as l2:
         lines_to_keep = set()
         lines_to_keep.update([i for i in range(len(lines1))])
         lines_to_keep = lines_to_keep - lines_to_remove
-        
-        for i in sorted(lines_to_keep):
-            # also removing leading and trailing spaces
-            l1.write(lines1[i].strip() + '\n')
-            l2.write(lines2[i].strip() + '\n')
-        
-        l1.truncate()
-        l2.truncate()
+
+        # also collapsing runs of spaces and removing leading and trailing whitespace
+        l1.writelines(
+            re.sub(' +', ' ', lines1[i]).strip()+'\n' for i in sorted(lines_to_keep))
+        l2.writelines(
+            re.sub(' +', ' ', lines2[i]).strip()+'\n' for i in sorted(lines_to_keep))
+
 
 if __name__ == '__main__':
     if len(sys.argv) != 3:
         print('usage: clean_corpus.py <corpus 1> <corpus 2>')
         exit(1)
-    clean_corpus(sys.argv[1], sys.argv[2])
\ No newline at end of file
+    clean_corpus(sys.argv[1], sys.argv[2])
diff --git a/lexical_training.py b/lexical_training.py
index 3be2eb8..d74a2af 100644
--- a/lexical_training.py
+++ b/lexical_training.py
@@ -111,7 +111,7 @@ def training(config, cache_dir, log):
     cmds = [['head', '-n', str(training_lines)],
             ['apertium', '-d', config['LANG_DATA'],
              config['SL']+'-'+config['TL']+'-tagger'],
-            ['sed', 's/ \+/ /g'], ['apertium-pretransfer']]
+            ['apertium-pretransfer']]
     with open(config['CORPUS_SL']) as inp, open(sl_tagged, 'w') as outp:
         pipe(cmds, inp, outp, log).wait()
 
@@ -122,7 +122,7 @@ def training(config, cache_dir, log):
     cmds = [['head', '-n', str(training_lines)],
             ['apertium', '-d', config['LANG_DATA'],
              config['TL']+'-'+config['SL']+'-tagger'],
-            ['sed', 's/ \+/ /g'], ['apertium-pretransfer']]
+            ['apertium-pretransfer']]
     with open(config['CORPUS_TL']) as inp, open(tl_tagged, 'w') as outp:
         pipe(cmds, inp, outp, log).wait()
 
@@ -130,30 +130,32 @@ def training(config, cache_dir, log):
     # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
 
     # removing lines with no analyses
-    with open(lines, 'w+') as f0:
+    with open(lines, 'w') as f:
         call(['seq', '1', str(training_lines)],
-             stdout=f0, stderr=log)
-        clean_tagged = os.path.join(
-            cache_dir, config['CORPUS']+'.clean_tagged')
-        with open(clean_tagged, 'w+') as f1:
-            cmds = [['paste', lines, sl_tagged, tl_tagged],
-                    ['grep', '<*\t*<']]
-            pipe(cmds, None, f1, log).wait()
-
-            # f1.seek(0)
-            call(['cut', '-f', '1'], stdin=f1, stdout=f0, stderr=log)
-
-            f1.seek(0)
-            with open(sl_tagged, 'w') as f2:
-                cmds = [['cut', '-f', '2'], ['sed', 's/ /~/g'],
-                        ['sed', 's/\$[^\^]*/$ /g']]
-                pipe(cmds, f1, f2, log).wait()
-
-            f1.seek(0)
-            with open(tl_tagged, 'w') as f2:
-                cmds = [['cut', '-f', '3'], ['sed', 's/ /~/g'],
-                        ['sed', 's/\$[^\^]*/$ /g']]
-                pipe(cmds, f1, f2, log).wait()
+             stdout=f, stderr=log)
+
+    clean_tagged = os.path.join(
+        cache_dir, config['CORPUS']+'.clean_tagged')
+    with open(clean_tagged, 'w') as f1:
+        cmds = [['paste', lines, sl_tagged, tl_tagged],
+                ['grep', '<*\t*<']]
+        pipe(cmds, None, f1, log).wait()
+
+    with open(clean_tagged, 'r') as f0:
+        with open(lines, 'w') as f1:
+            call(['cut', '-f', '1'], stdin=f0, stdout=f1, stderr=log)
+
+        f0.seek(0)
+        with open(sl_tagged, 'w') as f2:
+            cmds = [['cut', '-f', '2'], ['sed', 's/ /~/g'],
+                    ['sed', 's/\$[^\^]*/$ /g']]
+            pipe(cmds, f0, f2, log).wait()
+
+        f0.seek(0)
+        with open(tl_tagged, 'w') as f2:
+            cmds = [['cut', '-f', '3'], ['sed', 's/ /~/g'],
+                    ['sed', 's/\$[^\^]*/$ /g']]
+            pipe(cmds, f0, f2, log).wait()
 
     os.remove(clean_tagged)
 
@@ -162,6 +164,7 @@ def training(config, cache_dir, log):
         with open(os.devnull, 'r') as f1:
             call(['paste', '-d', '||| ', tl_tagged, '-', '-', '-',
                   sl_tagged], stdin=f1, stdout=f, stderr=log)
+
     with open(alignment, 'w') as f:
         call([config['FAST_ALIGN'], '-i', tagged_merged, '-d',
               '-o', '-v'], stdout=f, stderr=log)
@@ -170,6 +173,7 @@ def training(config, cache_dir, log):
         data = f.read()
         f.seek(0)
         f.write(data.replace('~', ' '))
+
     with open(tl_tagged, 'r+') as f:
         data = f.read()
         f.seek(0)