commit 88098f9a5553e02bddadd4e7d8fa6514bce06e41
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Mon Jul 12 00:59:40 2021 +0530

    Convert string concatenation to f-strings (f"")

diff --git a/check_config.py b/check_config.py
index 60b77db..1237eca 100644
--- a/check_config.py
+++ b/check_config.py
@@ -11,6 +11,7 @@ langs_url = "https://wiki.apertium.org/wiki/List_of_language_pairs"
 apertium_url = "https://wiki.apertium.org/wiki/Installation"
 yasmet_url = "https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules"
 
+
 def check_config(filename='config.toml'):
     misconfigured = False
     with open(filename) as config_file:
@@ -26,18 +27,18 @@ def check_config(filename='config.toml'):
             config[key] = os.path.join(os.path.abspath('.'), config[key])
 
     if not os.path.isfile(config['CORPUS_SL']):
-        print("'"+config['CORPUS_SL']+"'(CORPUS_SL)","is not a file, provide a valid"+ \
-                    " file or \nto download, look", corpora_url, '\n')
+        print(
+            f"'{config['CORPUS_SL']}'(CORPUS_SL) is not a file, provide a valid file or \nto download, look {corpora_url}\n")
         misconfigured = True
 
     if not os.path.isfile(config['CORPUS_TL']):
-        print("'"+config['CORPUS_TL']+"'(CORPUS_TL)", "is not a file, provide a valid "+ \
-                    "file or \nto download, look", corpora_url, '\n')
+        print(
+            f"'{config['CORPUS_TL']}'(CORPUS_TL) is not a file, provide a valid file or \nto download, look {corpora_url}\n")
         misconfigured = True
 
     if not os.path.isdir(config['LEX_TOOLS']):
-        print("'"+config['LEX_TOOLS']+"'(LEX_TOOLS)", "is not a directory, provide a valid "+ \
-                    "directory or \nto install, follow", lex_tools_url, '\n')
+        print(
+            f"'{config['LEX_TOOLS']}'(LEX_TOOLS) is not a directory, provide a valid directory or \nto install, follow {lex_tools_url}\n")
         misconfigured = True
     else:
         # scripts = ['process-tagger-output', 'extract-sentences.py', 'extract-freq-lexicon.py', \
@@ -48,34 +49,34 @@ def check_config(filename='config.toml'):
 
         # assuming scripts are intact
         if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']):
-            print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),", \
-                        "provide a valid directory or \nto install, follow", lex_tools_url, '\n')
+            print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),",
+                  "provide a valid directory or \nto install, follow", lex_tools_url, '\n')
             misconfigured = True
 
     if not os.path.isfile(config['FAST_ALIGN']):
-        print("'"+config['FAST_ALIGN']+"'(FAST_ALIGN)", "is not a file, provide"+ \
-                    " a valid executable or \nto install, follow", fast_align_url, '\n')
+        print(
+            f"'{config['FAST_ALIGN']}'(FAST_ALIGN) is not a file, provide a valid executable or \nto install, follow {fast_align_url}\n")
         misconfigured = True
     # else:
     #     if 'fast_align' not in os.listdir(config['FAST_ALIGN']):
     #         print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \
     #                         "provide a valid directory or \nto install, follow", fast_align_url, '\n')
     #         misconfigured = True
-    
+
     if not os.path.isdir(config['LANG_DATA']):
-        print("'"+config['LANG_DATA']+"'(LANG_DATA)", "is not a directory, provide a valid "+ \
-                    "directory or \nto install, follow", langs_url, '\n')
+        print(
+            f"'{config['LANG_DATA']}'(LANG_DATA) is not a directory, provide a valid directory or \nto install, follow {langs_url}\n")
         misconfigured = True
     else:
-        sl_tl_autobil = config['SL'] + '-' + config['TL'] + '.autobil.bin'
-        tl_sl_autobil = config['TL'] + '-' + config['SL'] + '.autobil.bin'
+        sl_tl_autobil = f"{config['SL']}-{config['TL']}.autobil.bin"
+        tl_sl_autobil = f"{config['TL']}-{config['SL']}.autobil.bin"
         if sl_tl_autobil not in os.listdir(config['LANG_DATA']):
-            print("'"+sl_tl_autobil+"'", "is not in", "'"+config['LANG_DATA']+ "'(LANG_DATA),", \
-                        "provide a valid directory or \nto install, follow", langs_url, '\n')
+            print(f"'{sl_tl_autobil}' is not in '{config['LANG_DATA']}'(LANG_DATA), "  # adjacent literals keep source indentation out of the message
+                  f"provide a valid directory or \nto install, follow {langs_url}\n")
             misconfigured = True
         if tl_sl_autobil not in os.listdir(config['LANG_DATA']):
-            print("'"+tl_sl_autobil+"'", "is not in", "'"+config['LANG_DATA']+ "'(LANG_DATA),", \
-                        "provide a valid directory or \nto install, follow", langs_url, '\n')
+            print(f"'{tl_sl_autobil}' is not in '{config['LANG_DATA']}'(LANG_DATA), "  # adjacent literals keep source indentation out of the message
+                  f"provide a valid directory or \nto install, follow {langs_url}\n")
             misconfigured = True
 
     apertium_present = False
@@ -85,7 +86,8 @@ def check_config(filename='config.toml'):
             break
 
     if not apertium_present:
-        print("apertium is either not installed or not added to path, see", apertium_url, '\n')
+        print(
+            f"apertium is either not installed or not added to path, see {apertium_url}\n")
         misconfigured = True
 
     yasmet_present = False
@@ -93,13 +95,15 @@ def check_config(filename='config.toml'):
         if os.path.isfile(os.path.join(path, 'yasmet')):
             yasmet_present = True
             break
-        
+
     if not yasmet_present:
-        print("yasmet is either not installed or not added to path, see", yasmet_url, '\n')
+        print(
+            f"yasmet is either not installed or not added to path, see {yasmet_url}\n")
         misconfigured = True
-    
+
     if not isinstance(config['TRAINING_LINES'], int):
-        print("'"+str(config['TRAINING_LINES'])+"'(TRAINING_LINES)", "is not an integer", '\n')
+        print(
+            f"'{config['TRAINING_LINES']}'(TRAINING_LINES) is not an integer \n")
         misconfigured = True
 
     if misconfigured:
@@ -107,5 +111,6 @@ def check_config(filename='config.toml'):
 
     return config
 
+
 if __name__ == '__main__':
-    check_config()
\ No newline at end of file
+    check_config()
diff --git a/clean_corpus.py b/clean_corpus.py
index 2aac7fa..dbc108f 100644
--- a/clean_corpus.py
+++ b/clean_corpus.py
@@ -51,13 +51,13 @@ def clean_corpus(corpus1, corpus2):
 
         # also removing leading and trailing spaces
         l1.writelines(
-            re.sub(' +', ' ', lines1[i]).strip()+'\n' for i in sorted(lines_to_keep))
+            f"{re.sub(' +', ' ', lines1[i]).strip()}\n" for i in sorted(lines_to_keep))
         l2.writelines(
-            re.sub(' +', ' ', lines2[i]).strip()+'\n' for i in sorted(lines_to_keep))
+            f"{re.sub(' +', ' ', lines2[i]).strip()}\n" for i in sorted(lines_to_keep))
 
 
 if __name__ == '__main__':
     if len(sys.argv) != 3:
-        print('usage: clean_corpus.py <corpus 1> <corpus 2>')
+        print('Usage: clean_corpus.py <corpus 1> <corpus 2>')
         exit(1)
     clean_corpus(sys.argv[1], sys.argv[2])
diff --git a/lexical_training.py b/lexical_training.py
index e0d7ade..9bda975 100644
--- a/lexical_training.py
+++ b/lexical_training.py
@@ -30,7 +30,7 @@ def query(question, default="yes"):
         default = "yes"
 
     while True:
-        print(question + prompt+"(default='"+default+"')?")
+        print(f"{question} {prompt} (default='{default}')?")
         choice = input().lower()
         if default is not None and choice == "":
             return valid[default]
@@ -38,7 +38,6 @@ def query(question, default="yes"):
             return valid[choice]
         else:
             print("Please respond with 'yes', 'no', 'y' or 'n'")
-            exit(1)
 
 
 def pipe(cmds, firstin, lastout, stderr):
@@ -65,28 +64,29 @@ def pipe(cmds, firstin, lastout, stderr):
     return procs[-1]
 
 
-def training(config, cache_dir, log):
+def training(config, log):
 
     MIN = 1
 
-    # file names
+    # file/folder names
+    cache_dir = f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}"
     sl_tagged = os.path.join(
-        cache_dir, config['CORPUS']+'.tagged.'+config['SL'])
+        cache_dir, f"{config['CORPUS']}.tagged.{config['SL']}")
     tl_tagged = os.path.join(
-        cache_dir, config['CORPUS']+'.tagged.'+config['TL'])
+        cache_dir, f"{config['CORPUS']}.tagged.{config['TL']}")
     lines = os.path.join(cache_dir, config['CORPUS']+'.lines')
     tagged_merged = os.path.join(
-        cache_dir, config['CORPUS']+'.tagged-merged.'+config['SL']+'-'+config['TL'])
+        cache_dir, f"{config['CORPUS']}.tagged-merged.{config['SL']}-{config['TL']}")
     alignment = os.path.join(cache_dir, config['CORPUS'] +
                              '.align.'+config['SL']+'-'+config['TL'])
     clean_biltrans = os.path.join(
-        cache_dir, config['CORPUS']+'.clean_biltrans.'+config['SL']+'-'+config['TL'])
+        cache_dir, f"{config['CORPUS']}.clean_biltrans.{config['SL']}-{config['TL']}")
     phrasetable = os.path.join(
-        cache_dir, config['CORPUS']+'.phrasetable.'+config['SL']+'-'+config['TL'])
+        cache_dir, f"{config['CORPUS']}.phrasetable.{config['SL']}-{config['TL']}")
     candidates = os.path.join(
-        cache_dir, config['CORPUS']+'.candidates.'+config['SL']+'-'+config['TL'])
+        cache_dir, f"{config['CORPUS']}.candidates.{config['SL']}-{config['TL']}")
     freq_lex = os.path.join(
-        cache_dir, config['CORPUS']+'.lex.'+config['SL']+'-'+config['TL'])
+        cache_dir, f"{config['CORPUS']}.lex.{config['SL']}-{config['TL']}")
     ngrams = os.path.join(
         cache_dir, 'ngrams')
     events = os.path.join(
@@ -99,23 +99,37 @@ def training(config, cache_dir, log):
         cache_dir, 'rules_all.txt')
     ngrams_all = os.path.join(
         cache_dir, 'ngrams_all.txt')
-    rules = config['CORPUS']+"-"+config['SL']+'-' + \
-        config['TL']+'.ngrams-lm-'+str(MIN)+'.xml'
+    rules = f"{config['CORPUS']}-{config['SL']}-{config['TL']}.ngrams-lm-{MIN}.xml"
+
+    # the directory where all the intermediary outputs are stored
+    if os.path.isdir(cache_dir):
+        if not query(f"Do you want to overwrite the files in '{cache_dir}'"):
+            print(f"(re)move {cache_dir} and re-run lexical_training.py")
+            exit(1)
+        shutil.rmtree(cache_dir)
+
+    os.mkdir(cache_dir)
+
+    if os.path.isfile(rules):
+        if not query(f"Do you want to overwrite '{rules}'"):
+            print(f"(re)move {rules} and re-run lexical_training.py")
+            exit(1)
+        os.remove(rules)
 
     with open(config['CORPUS_SL'], 'r') as corpus_sl:
         training_lines = len(corpus_sl.readlines())
         if config['TRAINING_LINES'] > training_lines:
-            print('Warning:', str(config['TRAINING_LINES']) +
-                  '(TRAINING_LINES) >', training_lines)
+            print(
+                f"Warning: {config['TRAINING_LINES']}(TRAINING_LINES) > {training_lines}")
         else:
             training_lines = config['TRAINING_LINES']
 
-    print('loading', training_lines, 'lines from the corpora')
+    print(f"loading {training_lines} lines from the corpora")
 
     # tagging the source side corpus
     cmds = [['head', '-n', str(training_lines)],  # ['apertium-destxt'],
             ['apertium', '-d', config['LANG_DATA'],  # '-f', 'none',
-             config['SL']+'-'+config['TL']+'-tagger'],
+             f"{config['SL']}-{config['TL']}-tagger"],
             ['apertium-pretransfer']]
     with open(config['CORPUS_SL']) as inp, open(sl_tagged, 'w') as outp:
         pipe(cmds, inp, outp, log).wait()
@@ -123,7 +137,7 @@ def training(config, cache_dir, log):
     # tagging the target side corpus
     cmds = [['head', '-n', str(training_lines)],  # ['apertium-destxt'],
             ['apertium', '-d', config['LANG_DATA'],  # '-f', 'none',
-             config['TL']+'-'+config['SL']+'-tagger'],
+             f"{config['TL']}-{config['SL']}-tagger"],
             ['apertium-pretransfer']]
     with open(config['CORPUS_TL']) as inp, open(tl_tagged, 'w') as outp:
         pipe(cmds, inp, outp, log).wait()
@@ -134,7 +148,7 @@ def training(config, cache_dir, log):
              stdout=f, stderr=log)
 
     clean_tagged = os.path.join(
-        cache_dir, config['CORPUS']+'.clean_tagged')
+        cache_dir, f"{config['CORPUS']}.clean_tagged")
     with open(clean_tagged, 'w') as f1:
         cmds = [['paste', lines, sl_tagged, tl_tagged],
                 ['grep', '<*\t*<']]
@@ -184,8 +198,8 @@ def training(config, cache_dir, log):
 
     # phrasetable
     with open(tmp1, 'w') as f1, open(tmp2, 'w') as f2:
-        sl_tl_autobil = config['SL'] + '-' + config['TL'] + '.autobil.bin'
-        tl_sl_autobil = config['TL'] + '-' + config['SL'] + '.autobil.bin'
+        sl_tl_autobil = f"{config['SL']}-{config['TL']}.autobil.bin"
+        tl_sl_autobil = f"{config['TL']}-{config['SL']}.autobil.bin"
         with open(tl_tagged, 'r') as f:
             call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
                   os.path.join(config['LANG_DATA'], tl_sl_autobil)], stdin=f, stdout=f1, stderr=log)
@@ -243,16 +257,16 @@ def training(config, cache_dir, log):
                 f0.seek(0)
                 f1.truncate(0)
                 # print(l)
-                cmds = [['grep', '^'+l], ['cut', '-f', '2'], ['head', '-1']]
+                cmds = [['grep', f'^{l}'], ['cut', '-f', '2'], ['head', '-1']]
                 pipe(cmds, f0, f1, log).wait()
                 f0.seek(0)
 
-                cmds = [['grep', '^'+l], ['cut', '-f', '3']]
+                cmds = [['grep', f'^{l}'], ['cut', '-f', '3']]
                 pipe(cmds, f0, f1, log).wait()
                 f1.seek(0)
 
                 cmds = [
-                    ['yasmet', '-red', str(MIN)], ['yasmet'], ['sed', 's/ /\t/g'], ['sed', 's/^/'+l+'\t/g']]
+                    ['yasmet', '-red', str(MIN)], ['yasmet'], ['sed', 's/ /\t/g'], ['sed', f's/^/{l}\t/g']]
                 pipe(cmds, f1, f2, log).wait()
 
     os.remove('tmp.yasmet')
@@ -288,20 +302,11 @@ def main():
     print("cleaning corpus....")
     # clean_corpus(config['CORPUS_SL'], config['CORPUS_TL'])
 
-    cache_dir = "cache-"+config['CORPUS']+"-"+config['SL']+"-"+config['TL']
-    log = os.path.join(cache_dir, 'training.log')
-
-    # the directory where all the intermediary outputs are stored
-    if os.path.isdir(cache_dir):
-        if not query("Do you want to overwrite the files in "+"'"+cache_dir+"'"):
-            print("remove", cache_dir, "and re-run lexical_training.py")
-            exit(1)
-        shutil.rmtree(cache_dir)
-
-    os.mkdir(cache_dir)
+    log = os.path.join(
+        f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}", 'training.log')
 
     with open(log, 'a') as log_file:
-        training(config, cache_dir, log_file)
+        training(config, log_file)
 
 
 if __name__ == '__main__':
diff --git a/tests/check_config_test.py b/tests/check_config_test.py
index 3f678d0..116cd64 100644
--- a/tests/check_config_test.py
+++ b/tests/check_config_test.py
@@ -1,4 +1,5 @@
 # tests check_config.py
+from check_config import check_config
 import sys
 from tomlkit import parse, dumps
 import os
@@ -6,10 +7,9 @@ import shutil
 
 sys.path.append('../')
 
-from check_config import check_config
 
 def main(argc, argv):
-    
+
     # Test 1
     config_file = open('config_test.toml', 'r')
     config_toml = config_file.read()
@@ -22,7 +22,7 @@ def main(argc, argv):
     for key in config:
         if key == 'TRAINING_LINES':
             continue
-        config[key]+="abc"
+        config[key] += "abc"
 
     if os.fork() == 0:
         with open('check_config_test.toml', 'w') as test_file:
@@ -41,20 +41,23 @@ def main(argc, argv):
     print("Test 2 : partial/no installations")
     print("----------------------------------")
 
-    config['SL']+="abc"
+    config['SL'] += "abc"
 
     for path in os.environ["PATH"].split(os.pathsep):
         if os.path.isfile(os.path.join(path, 'apertium')):
-            shutil.move(os.path.join(path, 'apertium'), os.path.join(path, 'apertium'+'abc'))
+            shutil.move(os.path.join(path, 'apertium'),
+                        os.path.join(path, 'apertium'+'abc'))
             break
-    
+
     for path in os.environ["PATH"].split(os.pathsep):
         if os.path.isfile(os.path.join(path, 'yasmet')):
-            shutil.move(os.path.join(path, 'yasmet'), os.path.join(path, 'yasmet'+'abc'))
+            shutil.move(os.path.join(path, 'yasmet'),
+                        os.path.join(path, 'yasmet'+'abc'))
             break
 
     if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')):
-        shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'))
+        shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'),
+                    os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'))
 
     # if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')):
     #     shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'))
@@ -67,20 +70,23 @@ def main(argc, argv):
 
     _, _ = os.wait()
 
-    shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output'))
+    shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'),
+                os.path.join(config['LEX_TOOLS'], 'process-tagger-output'))
 
     # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align'))
 
     for path in os.environ["PATH"].split(os.pathsep):
         if os.path.isfile(os.path.join(path, 'apertium'+'abc')):
-            shutil.move(os.path.join(path, 'apertium'+'abc'), os.path.join(path, 'apertium'))
+            shutil.move(os.path.join(path, 'apertium'+'abc'),
+                        os.path.join(path, 'apertium'))
             break
-    
+
     for path in os.environ["PATH"].split(os.pathsep):
         if os.path.isfile(os.path.join(path, 'yasmet'+'abc')):
-            shutil.move(os.path.join(path, 'yasmet'+'abc'), os.path.join(path, 'yasmet'))
+            shutil.move(os.path.join(path, 'yasmet'+'abc'),
+                        os.path.join(path, 'yasmet'))
             break
-    
+
     # Test 3
     config_file = open('config_test.toml', 'r')
     config_toml = config_file.read()
@@ -90,7 +96,7 @@ def main(argc, argv):
     print("Test 3 : wrong TRAINING_LINES")
     print("---------------------")
 
-    for value in ['abc', 1.00, 1e237892, "abc"]:
+    for value in ['abc', 1.00, 1e237892]:
         config['TRAINING_LINES'] = value
         if os.fork() == 0:
             with open('check_config_test.toml', 'w') as test_file:
@@ -119,5 +125,6 @@ def main(argc, argv):
 
     os.remove('check_config_test.toml')
 
+
 if __name__ == '__main__':
-    main(len(sys.argv), sys.argv)
\ No newline at end of file
+    main(len(sys.argv), sys.argv)