commit 88098f9a5553e02bddadd4e7d8fa6514bce06e41 Author: vivekvardhanadepu Date: Mon Jul 12 00:59:40 2021 +0530 formatted strings with f"" diff --git a/check_config.py b/check_config.py index 60b77db..1237eca 100644 --- a/check_config.py +++ b/check_config.py @@ -11,6 +11,7 @@ langs_url = "https://wiki.apertium.org/wiki/List_of_language_pairs" apertium_url = "https://wiki.apertium.org/wiki/Installation" yasmet_url = "https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules" + def check_config(filename='config.toml'): misconfigured = False with open(filename) as config_file: @@ -26,18 +27,18 @@ def check_config(filename='config.toml'): config[key] = os.path.join(os.path.abspath('.'), config[key]) if not os.path.isfile(config['CORPUS_SL']): - print("'"+config['CORPUS_SL']+"'(CORPUS_SL)","is not a file, provide a valid"+ \ - " file or \nto download, look", corpora_url, '\n') + print( + f"'{config['CORPUS_SL']}'(CORPUS_SL) is not a file, provide a valid file or \nto download, look {corpora_url}\n") misconfigured = True if not os.path.isfile(config['CORPUS_TL']): - print("'"+config['CORPUS_TL']+"'(CORPUS_TL)", "is not a file, provide a valid "+ \ - "file or \nto download, look", corpora_url, '\n') + print( + f"'{config['CORPUS_TL']}'(CORPUS_TL) is not a file, provide a valid file or \nto download, look {corpora_url}\n") misconfigured = True if not os.path.isdir(config['LEX_TOOLS']): - print("'"+config['LEX_TOOLS']+"'(LEX_TOOLS)", "is not a directory, provide a valid "+ \ - "directory or \nto install, follow", lex_tools_url, '\n') + print( + f"'{config['LEX_TOOLS']}'(LEX_TOOLS) is not a directory, provide a valid directory or \nto install, follow {lex_tools_url}\n") misconfigured = True else: # scripts = ['process-tagger-output', 'extract-sentences.py', 'extract-freq-lexicon.py', \ @@ -48,34 +49,34 @@ def check_config(filename='config.toml'): # assuming scripts are intact if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']): - print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),", \ - "provide a valid directory or \nto install, follow", lex_tools_url, '\n') + print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),", + "provide a valid directory or \nto install, follow", lex_tools_url, '\n') misconfigured = True if not os.path.isfile(config['FAST_ALIGN']): - print("'"+config['FAST_ALIGN']+"'(FAST_ALIGN)", "is not a file, provide"+ \ - " a valid executable or \nto install, follow", fast_align_url, '\n') + print( + f"'{config['FAST_ALIGN']}'(FAST_ALIGN) is not a file, provide a valid executable or \nto install, follow {fast_align_url}\n") misconfigured = True # else: # if 'fast_align' not in os.listdir(config['FAST_ALIGN']): # print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \ # "provide a valid directory or \nto install, follow", fast_align_url, '\n') # misconfigured = True - + if not os.path.isdir(config['LANG_DATA']): - print("'"+config['LANG_DATA']+"'(LANG_DATA)", "is not a directory, provide a valid "+ \ - "directory or \nto install, follow", langs_url, '\n') + print( + f"'{config['LANG_DATA']}'(LANG_DATA) is not a directory, provide a valid directory or \nto install, follow {langs_url}\n") misconfigured = True else: - sl_tl_autobil = config['SL'] + '-' + config['TL'] + '.autobil.bin' - tl_sl_autobil = config['TL'] + '-' + config['SL'] + '.autobil.bin' + sl_tl_autobil = f"{config['SL']}-{config['TL']}.autobil.bin" + tl_sl_autobil = f"{config['TL']}-{config['SL']}.autobil.bin" if sl_tl_autobil not in os.listdir(config['LANG_DATA']): - print("'"+sl_tl_autobil+"'", "is not in", "'"+config['LANG_DATA']+ "'(LANG_DATA),", \ - "provide a valid directory or \nto install, follow", langs_url, '\n') + print(f"'{sl_tl_autobil}' is not in '{config['LANG_DATA']}'(LANG_DATA), \ + provide a valid directory or \nto install, follow {langs_url}\n") misconfigured = True if tl_sl_autobil not in os.listdir(config['LANG_DATA']): - print("'"+tl_sl_autobil+"'", "is not in", "'"+config['LANG_DATA']+ "'(LANG_DATA),", \ - "provide a valid directory or \nto install, follow", langs_url, '\n') + print(f"'{tl_sl_autobil}' is not in '{config['LANG_DATA']}'(LANG_DATA), \ + provide a valid directory or \nto install, follow {langs_url}\n") misconfigured = True apertium_present = False @@ -85,7 +86,8 @@ def check_config(filename='config.toml'): break if not apertium_present: - print("apertium is either not installed or not added to path, see", apertium_url, '\n') + print( + f"apertium is either not installed or not added to path, see {apertium_url}\n") misconfigured = True yasmet_present = False @@ -93,13 +95,15 @@ def check_config(filename='config.toml'): if os.path.isfile(os.path.join(path, 'yasmet')): yasmet_present = True break - + if not yasmet_present: - print("yasmet is either not installed or not added to path, see", yasmet_url, '\n') + print( + f"yasmet is either not installed or not added to path, see {yasmet_url}\n") misconfigured = True - + if not isinstance(config['TRAINING_LINES'], int): - print("'"+str(config['TRAINING_LINES'])+"'(TRAINING_LINES)", "is not an integer", '\n') + print( + f"'{config['TRAINING_LINES']}'(TRAINING_LINES) is not an integer \n") misconfigured = True if misconfigured: @@ -107,5 +111,6 @@ def check_config(filename='config.toml'): return config + if __name__ == '__main__': - check_config() \ No newline at end of file + check_config() diff --git a/clean_corpus.py b/clean_corpus.py index 2aac7fa..dbc108f 100644 --- a/clean_corpus.py +++ b/clean_corpus.py @@ -51,13 +51,13 @@ def clean_corpus(corpus1, corpus2): # also removing leading and trailing spaces l1.writelines( - re.sub(' +', ' ', lines1[i]).strip()+'\n' for i in sorted(lines_to_keep)) + f"{re.sub(' +', ' ', lines1[i]).strip()}\n" for i in sorted(lines_to_keep)) l2.writelines( - re.sub(' +', ' ', lines2[i]).strip()+'\n' for i in sorted(lines_to_keep)) + f"{re.sub(' +', ' ', lines2[i]).strip()}\n" for i in sorted(lines_to_keep)) if __name__ == '__main__': if len(sys.argv) != 3: - print('usage: clean_corpus.py ') + print('Usage: clean_corpus.py ') exit(1) clean_corpus(sys.argv[1], sys.argv[2]) diff --git a/lexical_training.py b/lexical_training.py index e0d7ade..9bda975 100644 --- a/lexical_training.py +++ b/lexical_training.py @@ -30,7 +30,7 @@ def query(question, default="yes"): default = "yes" while True: - print(question + prompt+"(default='"+default+"')?") + print(f"{question} {prompt} (default='{default}')?") choice = input().lower() if default is not None and choice == "": return valid[default] @@ -38,7 +38,6 @@ def query(question, default="yes"): return valid[choice] else: print("Please respond with 'yes', 'no', 'y' or 'n'") - exit(1) def pipe(cmds, firstin, lastout, stderr): @@ -65,28 +64,29 @@ def pipe(cmds, firstin, lastout, stderr): return procs[-1] -def training(config, cache_dir, log): +def training(config, log): MIN = 1 - # file names + # file/folder names + cache_dir = f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}" sl_tagged = os.path.join( - cache_dir, config['CORPUS']+'.tagged.'+config['SL']) + cache_dir, f"{config['CORPUS']}.tagged.{config['SL']}") tl_tagged = os.path.join( - cache_dir, config['CORPUS']+'.tagged.'+config['TL']) + cache_dir, f"{config['CORPUS']}.tagged.{config['TL']}") lines = os.path.join(cache_dir, config['CORPUS']+'.lines') tagged_merged = os.path.join( - cache_dir, config['CORPUS']+'.tagged-merged.'+config['SL']+'-'+config['TL']) + cache_dir, f"{config['CORPUS']}.tagged-merged.{config['SL']}-{config['TL']}") alignment = os.path.join(cache_dir, config['CORPUS'] + '.align.'+config['SL']+'-'+config['TL']) clean_biltrans = os.path.join( - cache_dir, config['CORPUS']+'.clean_biltrans.'+config['SL']+'-'+config['TL']) + cache_dir, f"{config['CORPUS']}.clean_biltrans.{config['SL']}-{config['TL']}") phrasetable = os.path.join( - cache_dir, config['CORPUS']+'.phrasetable.'+config['SL']+'-'+config['TL']) + cache_dir, f"{config['CORPUS']}.phrasetable.{config['SL']}-{config['TL']}") candidates = os.path.join( - cache_dir, config['CORPUS']+'.candidates.'+config['SL']+'-'+config['TL']) + cache_dir, f"{config['CORPUS']}.candidates.{config['SL']}-{config['TL']}") freq_lex = os.path.join( - cache_dir, config['CORPUS']+'.lex.'+config['SL']+'-'+config['TL']) + cache_dir, f"{config['CORPUS']}.lex.{config['SL']}-{config['TL']}") ngrams = os.path.join( cache_dir, 'ngrams') events = os.path.join( @@ -99,23 +99,37 @@ def training(config, cache_dir, log): cache_dir, 'rules_all.txt') ngrams_all = os.path.join( cache_dir, 'ngrams_all.txt') - rules = config['CORPUS']+"-"+config['SL']+'-' + \ - config['TL']+'.ngrams-lm-'+str(MIN)+'.xml' + rules = f"{config['CORPUS']}-{config['SL']}-{config['TL']}.ngrams-lm-{MIN}.xml" + + # the directory where all the intermediary outputs are stored + if os.path.isdir(cache_dir): + if not query(f"Do you want to overwrite the files in '{cache_dir}'"): + print(f"(re)move {cache_dir} and re-run lexical_training.py") + exit(1) + shutil.rmtree(cache_dir) + + os.mkdir(cache_dir) + + if os.path.isfile(rules): + if not query(f"Do you want to overwrite '{rules}'"): + print(f"(re)move {rules} and re-run lexical_training.py") + exit(1) + os.remove(rules) with open(config['CORPUS_SL'], 'r') as corpus_sl: training_lines = len(corpus_sl.readlines()) if config['TRAINING_LINES'] > training_lines: - print('Warning:', str(config['TRAINING_LINES']) + - '(TRAINING_LINES) >', training_lines) + print( + f"Warning: {config['TRAINING_LINES']}(TRAINING_LINES) > {training_lines}") else: training_lines = config['TRAINING_LINES'] - print('loading', training_lines, 'lines from the corpora') + print(f"loading {training_lines} lines from the corpora") # tagging the source side corpus cmds = [['head', '-n', str(training_lines)], # ['apertium-destxt'], ['apertium', '-d', config['LANG_DATA'], # '-f', 'none', - config['SL']+'-'+config['TL']+'-tagger'], + f"{config['SL']}-{config['TL']}-tagger"], ['apertium-pretransfer']] with open(config['CORPUS_SL']) as inp, open(sl_tagged, 'w') as outp: pipe(cmds, inp, outp, log).wait() @@ -123,7 +137,7 @@ def training(config, cache_dir, log): # tagging the target side corpus cmds = [['head', '-n', str(training_lines)], # ['apertium-destxt'], ['apertium', '-d', config['LANG_DATA'], # '-f', 'none', - config['TL']+'-'+config['SL']+'-tagger'], + f"{config['TL']}-{config['SL']}-tagger"], ['apertium-pretransfer']] with open(config['CORPUS_TL']) as inp, open(tl_tagged, 'w') as outp: pipe(cmds, inp, outp, log).wait() @@ -134,7 +148,7 @@ def training(config, cache_dir, log): stdout=f, stderr=log) clean_tagged = os.path.join( - cache_dir, config['CORPUS']+'.clean_tagged') + cache_dir, f"{config['CORPUS']}.clean_tagged") with open(clean_tagged, 'w') as f1: cmds = [['paste', lines, sl_tagged, tl_tagged], ['grep', '<*\t*<']] @@ -184,8 +198,8 @@ def training(config, cache_dir, log): # phrasetable with open(tmp1, 'w') as f1, open(tmp2, 'w') as f2: - sl_tl_autobil = config['SL'] + '-' + config['TL'] + '.autobil.bin' - tl_sl_autobil = config['TL'] + '-' + config['SL'] + '.autobil.bin' + sl_tl_autobil = f"{config['SL']}-{config['TL']}.autobil.bin" + tl_sl_autobil = f"{config['TL']}-{config['SL']}.autobil.bin" with open(tl_tagged, 'r') as f: call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), os.path.join(config['LANG_DATA'], tl_sl_autobil)], stdin=f, stdout=f1, stderr=log) @@ -243,16 +257,16 @@ def training(config, cache_dir, log): f0.seek(0) f1.truncate(0) # print(l) - cmds = [['grep', '^'+l], ['cut', '-f', '2'], ['head', '-1']] + cmds = [['grep', f'^{l}'], ['cut', '-f', '2'], ['head', '-1']] pipe(cmds, f0, f1, log).wait() f0.seek(0) - cmds = [['grep', '^'+l], ['cut', '-f', '3']] + cmds = [['grep', f'^{l}'], ['cut', '-f', '3']] pipe(cmds, f0, f1, log).wait() f1.seek(0) cmds = [ - ['yasmet', '-red', str(MIN)], ['yasmet'], ['sed', 's/ /\t/g'], ['sed', 's/^/'+l+'\t/g']] + ['yasmet', '-red', str(MIN)], ['yasmet'], ['sed', 's/ /\t/g'], ['sed', f's/^/{l}\t/g']] pipe(cmds, f1, f2, log).wait() os.remove('tmp.yasmet') @@ -288,20 +302,11 @@ def main(): print("cleaning corpus....") # clean_corpus(config['CORPUS_SL'], config['CORPUS_TL']) - cache_dir = "cache-"+config['CORPUS']+"-"+config['SL']+"-"+config['TL'] - log = os.path.join(cache_dir, 'training.log') - - # the directory where all the intermediary outputs are stored - if os.path.isdir(cache_dir): - if not query("Do you want to overwrite the files in "+"'"+cache_dir+"'"): - print("remove", cache_dir, "and re-run lexical_training.py") - exit(1) - shutil.rmtree(cache_dir) - - os.mkdir(cache_dir) + log = os.path.join( + f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}", 'training.log') with open(log, 'a') as log_file: - training(config, cache_dir, log_file) + training(config, log_file) if __name__ == '__main__': diff --git a/tests/check_config_test.py b/tests/check_config_test.py index 3f678d0..116cd64 100644 --- a/tests/check_config_test.py +++ b/tests/check_config_test.py @@ -1,4 +1,5 @@ # tests check_config.py +from check_config import check_config import sys from tomlkit import parse, dumps import os @@ -6,10 +7,9 @@ import shutil sys.path.append('../') -from check_config import check_config def main(argc, argv): - + # Test 1 config_file = open('config_test.toml', 'r') config_toml = config_file.read() @@ -22,7 +22,7 @@ def main(argc, argv): for key in config: if key == 'TRAINING_LINES': continue - config[key]+="abc" + config[key] += "abc" if os.fork() == 0: with open('check_config_test.toml', 'w') as test_file: @@ -41,20 +41,23 @@ def main(argc, argv): print("Test 2 : partial/no installations") print("----------------------------------") - config['SL']+="abc" + config['SL'] += "abc" for path in os.environ["PATH"].split(os.pathsep): if os.path.isfile(os.path.join(path, 'apertium')): - shutil.move(os.path.join(path, 'apertium'), os.path.join(path, 'apertium'+'abc')) + shutil.move(os.path.join(path, 'apertium'), + os.path.join(path, 'apertium'+'abc')) break - + for path in os.environ["PATH"].split(os.pathsep): if os.path.isfile(os.path.join(path, 'yasmet')): - shutil.move(os.path.join(path, 'yasmet'), os.path.join(path, 'yasmet'+'abc')) + shutil.move(os.path.join(path, 'yasmet'), + os.path.join(path, 'yasmet'+'abc')) break if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')): - shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc')) + shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), + os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc')) # if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')): # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc')) @@ -67,20 +70,23 @@ def main(argc, argv): _, _ = os.wait() - shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output')) + shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'), + os.path.join(config['LEX_TOOLS'], 'process-tagger-output')) # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align')) for path in os.environ["PATH"].split(os.pathsep): if os.path.isfile(os.path.join(path, 'apertium'+'abc')): - shutil.move(os.path.join(path, 'apertium'+'abc'), os.path.join(path, 'apertium')) + shutil.move(os.path.join(path, 'apertium'+'abc'), + os.path.join(path, 'apertium')) break - + for path in os.environ["PATH"].split(os.pathsep): if os.path.isfile(os.path.join(path, 'yasmet'+'abc')): - shutil.move(os.path.join(path, 'yasmet'+'abc'), os.path.join(path, 'yasmet')) + shutil.move(os.path.join(path, 'yasmet'+'abc'), + os.path.join(path, 'yasmet')) break - + # Test 3 config_file = open('config_test.toml', 'r') config_toml = config_file.read() @@ -90,7 +96,7 @@ def main(argc, argv): print("Test 3 : wrong TRAINING_LINES") print("---------------------") - for value in ['abc', 1.00, 1e237892, "abc"]: + for value in ['abc', 1.00, 1e237892]: config['TRAINING_LINES'] = value if os.fork() == 0: with open('check_config_test.toml', 'w') as test_file: @@ -119,5 +125,6 @@ def main(argc, argv): os.remove('check_config_test.toml') + if __name__ == '__main__': - main(len(sys.argv), sys.argv) \ No newline at end of file + main(len(sys.argv), sys.argv)