commit 033c67d6092633886576b1b652f98fb7a487c6cf
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Wed Jun 9 16:03:43 2021 +0530

    added code for cleaning corpus in lexical_training.py

diff --git a/clean_corpus.py b/clean_corpus.py
index b131bf8..ce84b89 100644
--- a/clean_corpus.py
+++ b/clean_corpus.py
@@ -1,21 +1,18 @@
 # removes lines above and below the empty lines including the empty lines in each corpus
-# removes lines containing only ° and *
+# removes lines containing only '°', '*' or '.'
 # stripping trailing and leading spaces
 
 
 import sys
 
 
-def main(argc, argv):
-    if argc != 3:
-        print('usage: clean_corpus.py <corpus 1> <corpus 2>')
-        exit(-1)
+def clean_corpus(corpus1, corpus2):
 
     lines1 = []
     lines2 = []
     lines_to_remove = set()
 
-    with open(argv[1], 'r+') as l1, open(argv[2], 'r+') as l2:
+    with open(corpus1, 'r+') as l1, open(corpus2, 'r+') as l2:
         lines1 = l1.readlines()
         lines2 = l2.readlines()
         assert len(lines1) == len(lines2)
@@ -32,7 +29,7 @@ def main(argc, argv):
                 lines_to_remove.add(i)
             # print(lines1, lines2)
             
-        print(lines_to_remove)
+        # print(lines_to_remove)
 
         l1.seek(0)
         # l1.write(''.join(lines1))
@@ -43,7 +40,7 @@ def main(argc, argv):
         l2.write('')
         l2.truncate()
 
-    with open(argv[1], 'a') as l1, open(argv[2], 'a') as l2:
+    with open(corpus1, 'a') as l1, open(corpus2, 'a') as l2:
         lines_to_keep = set()
         lines_to_keep.update([i for i in range(len(lines1))])
         lines_to_keep = lines_to_keep - lines_to_remove
@@ -57,4 +54,7 @@ def main(argc, argv):
         l2.truncate()
 
 if __name__ == '__main__':
-    main(len(sys.argv), sys.argv)
\ No newline at end of file
+    if len(sys.argv) != 3:
+        print('usage: clean_corpus.py <corpus 1> <corpus 2>')
+        exit(1)
+    clean_corpus(sys.argv[1], sys.argv[2])
\ No newline at end of file
diff --git a/config.toml b/config.toml
index 6e5bc6c..f52caff 100644
--- a/config.toml
+++ b/config.toml
@@ -25,4 +25,4 @@ FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training
 LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa"
 
 # number of lines to be trained on (do not enclose in quotes)
-TRAINING_LINES = 100000
+TRAINING_LINES = 1953934
diff --git a/lexical_training.py b/lexical_training.py
index aa1a9a7..dd6ca96 100644
--- a/lexical_training.py
+++ b/lexical_training.py
@@ -1,9 +1,24 @@
 # lexical training script
+import os
 from check_config import check_config
+from clean_corpus import clean_corpus
 
 def main():
+    print("validating configuration....")
     config = check_config()
-    print("checking config is done")
+
+    # clean the parallel corpus, i.e. remove empty sentences and sentences consisting only of '*', '.', or '°'
+    print("cleaning corpus....")
+    clean_corpus(config['CORPUS_SL'], config['CORPUS_TL'])
+
+    with open(config['CORPUS_SL'], 'r') as corpus_sl:
+        training_lines = min(config['TRAINING_LINES'], len(corpus_sl.readlines()))
     
+    print('loading', training_lines, 'lines from the corpora')
+
+    # the directory where all intermediate outputs are stored
+    cache_dir = "cache-"+config['SL']+"-"+config['TL']
+    os.mkdir(cache_dir)
+
 if __name__ == '__main__':
     main()
\ No newline at end of file
diff --git a/tests/check_config_test.py b/tests/check_config_test.py
index ca42963..c1d5bb3 100644
--- a/tests/check_config_test.py
+++ b/tests/check_config_test.py
@@ -115,7 +115,7 @@ def main(argc, argv):
         check_config('check_config_test.toml')
         exit(0)
 
-        _, _ = os.wait()
+    _, _ = os.wait()
 
     os.remove('check_config_test.toml')