commit 6a789609f6776afbdadc4b1879c81c998fa04414
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Tue Jun 22 17:31:17 2021 +0530

    minor fixes (working on 7000 sentences)

diff --git a/README.md b/README.md
index 352c79b..bcd6d65 100644
--- a/README.md
+++ b/README.md
@@ -5,16 +5,19 @@ The procedure for lexical selection training is a bit messy, with various script
 for more, read https://wiki.apertium.org/wiki/Ideas_for_Google_Summer_of_Code/User-friendly_lexical_selection_training
 
 ## Requirements
-* [parallel corpus](https://wiki.apertium.org/wiki/Corpora)
-* [apertium](https://wiki.apertium.org/wiki/Installation)
-* [fast_align](https://github.com/clab/fast_align)
-* [apertium-lex-tools](https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling) (with yasmet and scripts)
-* [language pair](https://wiki.apertium.org/wiki/List_of_language_pairs) (install locally)
+
+- [parallel corpus](https://wiki.apertium.org/wiki/Corpora)
+- [apertium](https://wiki.apertium.org/wiki/Installation)
+- [fast_align](https://github.com/clab/fast_align)
+- [apertium-lex-tools](https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling) (with yasmet and scripts)
+- [language pair](https://wiki.apertium.org/wiki/List_of_language_pairs) (install locally)
 
 ## Installation steps
-* install the requirements and download or clone this repo (`git clone https://github.com/vivekvardhanadepu/apertium-lexical-training.git`)
-* provide tools' and corpus' paths in [config.toml](config.toml) (for ref, see [config.toml.example](config.toml.example))
-* run lexical_training.py
+
+- install the requirements and download or clone this repo (`git clone https://github.com/vivekvardhanadepu/apertium-lexical-training.git`)
+- create config.toml and provide tools' and corpus' paths in it (for ref, see [config.toml.example](config.toml.example))
+- run lexical_training.py
 
 ## tests
+
 This folder contains scripts for automated testing of the helper scripts
diff --git a/config.toml b/config.toml
deleted file mode 100644
index 6c42170..0000000
--- a/config.toml
+++ /dev/null
@@ -1,28 +0,0 @@
-# configuration for lexical training
-
-# corpus name
-CORPUS = "europarl-v7"
-
-# source language
-SL = "eng"
-
-# target language
-TL = "spa"
-
-# source corpus
-CORPUS_SL = "europarl-v7.eng-spa.eng"
-
-# target corpus
-CORPUS_TL = "europarl-v7.eng-spa.spa"
-
-# apertium-lex-tools scripts
-LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
-
-# fast align
-FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build/fast_align"
-
-# apertium language data
-LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa"
-
-# number of lines to be trained on (do not enclose in quotes)
-TRAINING_LINES = 7000
diff --git a/lexical_training.py b/lexical_training.py
index b375c7a..3be2eb8 100644
--- a/lexical_training.py
+++ b/lexical_training.py
@@ -1,6 +1,7 @@
 # lexical training script
 import os
 import sys
+import shutil
 
 from subprocess import Popen, PIPE, call
 from check_config import check_config
@@ -112,7 +113,7 @@ def training(config, cache_dir, log):
              config['SL']+'-'+config['TL']+'-tagger'],
             ['sed', 's/ \+/ /g'], ['apertium-pretransfer']]
     with open(config['CORPUS_SL']) as inp, open(sl_tagged, 'w') as outp:
-        pipe(cmds, inp, outp, log)
+        pipe(cmds, inp, outp, log).wait()
 
     # c2 = ['apertium-destxt']
     # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log)
@@ -137,15 +138,16 @@ def training(config, cache_dir, log):
         with open(clean_tagged, 'w+') as f1:
             cmds = [['paste', lines, sl_tagged, tl_tagged],
                     ['grep', '<*\t*<']]
-            pipe(cmds, None, f1, log)
+            pipe(cmds, None, f1, log).wait()
 
+            # f1.seek(0)
             call(['cut', '-f', '1'], stdin=f1, stdout=f0, stderr=log)
 
             f1.seek(0)
             with open(sl_tagged, 'w') as f2:
                 cmds = [['cut', '-f', '2'], ['sed', 's/ /~/g'],
                         ['sed', 's/\$[^\^]*/$ /g']]
-                pipe(cmds, f1, f2, log)
+                pipe(cmds, f1, f2, log).wait()
 
             f1.seek(0)
             with open(tl_tagged, 'w') as f2:
@@ -160,9 +162,9 @@ def training(config, cache_dir, log):
         with open(os.devnull, 'r') as f1:
             call(['paste', '-d', '||| ', tl_tagged, '-', '-', '-',
                   sl_tagged], stdin=f1, stdout=f, stderr=log)
-        with open(alignment, 'w') as f2:
-            call([config['FAST_ALIGN'], '-i', tagged_merged, '-d',
-                  '-o', '-v'], stdout=f2, stderr=log)
+    with open(alignment, 'w') as f:
+        call([config['FAST_ALIGN'], '-i', tagged_merged, '-d',
+              '-o', '-v'], stdout=f, stderr=log)
 
     with open(sl_tagged, 'r+') as f:
         data = f.read()
@@ -178,7 +180,7 @@ def training(config, cache_dir, log):
     tmp2 = 'tmp2'
 
     # phrasetable
-    with open(tmp1, 'w+') as f1, open(tmp2, 'w+') as f2:
+    with open(tmp1, 'w') as f1, open(tmp2, 'w') as f2:
         sl_tl_autobil = config['SL'] + '-' + config['TL'] + '.autobil.bin'
         tl_sl_autobil = config['TL'] + '-' + config['SL'] + '.autobil.bin'
         with open(tl_tagged, 'r') as f:
@@ -286,15 +288,13 @@ def main():
     log = os.path.join(cache_dir, 'training.log')
 
     # the directory where all the intermediary outputs are stored
-    if not os.path.isdir(cache_dir):
-        os.mkdir(cache_dir)
-    else:
+    if os.path.isdir(cache_dir):
         if not query("Do you want to overwrite the files in "+"'"+cache_dir+"'"):
             print("remove", cache_dir, "and re-run lexical_training.py")
             exit(1)
+        shutil.rmtree(cache_dir)
 
-    if os.path.isfile(log):
-        os.remove(log)
+    os.mkdir(cache_dir)
 
     with open(log, 'a') as log_file:
         training(config, cache_dir, log_file)