commit d2ff40b38796ec60720d8569e8ffacee0c9596e0
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Thu Jun 10 15:08:13 2021 +0530

    adding code for tagging

diff --git a/config.toml b/config.toml
index f52caff..6e94ceb 100644
--- a/config.toml
+++ b/config.toml
@@ -25,4 +25,4 @@ FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training
 LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa"
 
 # number of lines to be trained on (do not enclose in quotes)
-TRAINING_LINES = 1953934
+TRAINING_LINES = 100
diff --git a/lexical_training.py b/lexical_training.py
index dd6ca96..e0470b6 100644
--- a/lexical_training.py
+++ b/lexical_training.py
@@ -1,5 +1,6 @@
 # lexical training script
 import os
+from subprocess import Popen, PIPE
 from check_config import check_config
 from clean_corpus import clean_corpus
 
@@ -9,7 +10,7 @@ def main():
 
     # cleaning the parallel corpus i.e. removing empty sentences, sentences only with '*', '.', or '°'
     print("cleaning corpus....")
-    clean_corpus(config['CORPUS_SL'], config['CORPUS_TL'])
+    # clean_corpus(config['CORPUS_SL'], config['CORPUS_TL'])
 
     with open(config['CORPUS_SL'], 'r') as corpus_sl:
         training_lines = min(config['TRAINING_LINES'], len(corpus_sl.readlines()))
@@ -18,7 +19,48 @@ def main():
 
     # the directory where all the intermediary outputs are stored
     cache_dir = "cache-"+config['SL']+"-"+config['TL']
-    os.mkdir(cache_dir)
+    if not os.path.isdir(cache_dir):
+        os.mkdir(cache_dir)
+
+    training_log_name = cache_dir+'/'+'training.log'
+    if os.path.isdir(training_log_name):
+        os.remove(training_log_name)
+
+    training_log = open(training_log_name, 'a')
+
+    # tagging the source side corpus
+    c1 = ['head', '-n', str(config['TRAINING_LINES'])]
+    with open(config['CORPUS_SL']) as f:
+        p1 = Popen(c1, stdin=f, stdout=PIPE, stderr=training_log)
+
+    c2 = ['apertium-destxt']
+    p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
+
+    c3 = ['apertium', '-d', config['LANG_DATA'], config['SL']+'-'+config['TL']+'-tagger']
+    p3 = Popen(c3, stdin=p2.stdout, stdout=PIPE, stderr=training_log)
+
+    c4 = ['apertium-pretransfer']
+    sl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['SL']
+    with open(sl_tagged, 'w') as f:
+        Popen(c4, stdin=p3.stdout, stdout=f, stderr=training_log)
+
+    # tagging the target side corpus
+    c1 = ['head', '-n', str(config['TRAINING_LINES'])]
+    with open(config['CORPUS_TL']) as f:
+        p1 = Popen(c1, stdin=f, stdout=PIPE, stderr=training_log)
+
+    c2 = ['apertium-destxt']
+    p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
+
+    c3 = ['apertium', '-d', config['LANG_DATA'], config['TL']+'-'+config['SL']+'-tagger']
+    p3 = Popen(c3, stdin=p2.stdout, stdout=PIPE, stderr=training_log)
+
+    c4 = ['apertium-pretransfer']
+    tl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['TL']
+    with open(tl_tagged, 'w') as f:
+        Popen(c4, stdin=p3.stdout, stdout=f, stderr=training_log).wait()
+
+    training_log.close()
 
 if __name__ == '__main__':
     main()
\ No newline at end of file