commit 50a95aff515b9d6703e5795eca5d5d595523b421 Author: vivekvardhanadepu Date: Thu Jul 15 09:27:22 2021 +0530 incorporating changes of apertium-lex-tools(60e6ae9920ddc1ba24c96e5d4fe6a66ee139321a) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8a3b797 --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +# cache +cache* +__pycache__/ + +# rules +*.xml + +#logs +*.log + +# configs +/*.toml + +# corpora +europarl* \ No newline at end of file diff --git a/README.md b/README.md index d4efc15..1a039c7 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,8 @@ for more, read https://wiki.apertium.org/wiki/Ideas_for_Google_Summer_of_Code/Us ## Requirements - [parallel corpus](https://wiki.apertium.org/wiki/Corpora) -- [apertium](https://wiki.apertium.org/wiki/Installation) +- [apertium-core](https://wiki.apertium.org/wiki/Installation) (install apertium-lex-tools with yasmet) - [fast_align](https://github.com/clab/fast_align) -- [apertium-lex-tools](https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling) (if not installed already) - [language pair](https://wiki.apertium.org/wiki/List_of_language_pairs) (install locally) ## Installation steps diff --git a/check_config.py b/check_config.py index 1237eca..94a5c1a 100644 --- a/check_config.py +++ b/check_config.py @@ -5,7 +5,7 @@ import os # urls of the required tools and data corpora_url = "https://wiki.apertium.org/wiki/Corpora" -lex_tools_url = "https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling" +# lex_tools_url = "https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling" fast_align_url = "https://github.com/clab/fast_align" langs_url = "https://wiki.apertium.org/wiki/List_of_language_pairs" apertium_url = "https://wiki.apertium.org/wiki/Installation" @@ -14,6 +14,7 @@ yasmet_url = "https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules" def check_config(filename='config.toml'): misconfigured = False + lex_tools = '/usr/share/apertium-lex-tools' with 
open(filename) as config_file: config_toml = config_file.read() config = parse(config_toml) @@ -22,7 +23,7 @@ def check_config(filename='config.toml'): assert config_toml == dumps(config) # changing the paths to absolute - for key in ['CORPUS_SL', 'CORPUS_TL', 'LEX_TOOLS', 'FAST_ALIGN', 'LANG_DATA']: + for key in ['CORPUS_SL', 'CORPUS_TL', 'FAST_ALIGN', 'LANG_DATA']: if not os.path.isabs(config[key]): config[key] = os.path.join(os.path.abspath('.'), config[key]) @@ -36,22 +37,26 @@ def check_config(filename='config.toml'): f"'{config['CORPUS_TL']}'(CORPUS_TL) is not a file, provide a valid file or \nto download, look {corpora_url}\n") misconfigured = True - if not os.path.isdir(config['LEX_TOOLS']): + if not os.path.isdir(lex_tools): print( - f"'{config['LEX_TOOLS']}'(LEX_TOOLS) is not a directory, provide a valid directory or \nto install, follow {lex_tools_url}\n") + f"'{lex_tools}' is not a directory, install apertium-lex-tools {apertium_url}\n") misconfigured = True else: - # scripts = ['process-tagger-output', 'extract-sentences.py', 'extract-freq-lexicon.py', \ - # 'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py', \ - # 'ngrams-to-rules-me.py'] + scripts = ['extract-sentences.py', 'extract-freq-lexicon.py', + 'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py', + 'ngrams-to-rules-me.py'] - # for script in scripts: + for script in scripts: + if not os.path.isfile(os.path.join(lex_tools, script)): + print( + f"'{script}' is not present in '{lex_tools}', install apertium-lex-tools {apertium_url}\n") + misconfigured = True # assuming scripts are intact - if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']): - print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),", - "provide a valid directory or \nto install, follow", lex_tools_url, '\n') - misconfigured = True + # if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']): + # 
print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),", + # "provide a valid directory or \nto install, follow", lex_tools_url, '\n') + # misconfigured = True if not os.path.isfile(config['FAST_ALIGN']): print( @@ -98,7 +103,19 @@ def check_config(filename='config.toml'): if not yasmet_present: print( - f"yasmet is either not installed or not added to path, see {yasmet_url}\n") + f"yasmet is either not installed or not added to path, install yasmet and add to the path, \ + {yasmet_url} or re-install apertium-lex-tools with yasmet, {apertium_url}\n") + misconfigured = True + + process_tagger_output_present = False + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'process-tagger-output')): + process_tagger_output_present = True + break + + if not process_tagger_output_present: + print( + f"process-tagger-output is either not installed or not added to path, re-install apertium-lex-tools {apertium_url}\n") misconfigured = True if not isinstance(config['TRAINING_LINES'], int): diff --git a/config.toml.example b/config.toml.example index b9e537c..9c9d15c 100644 --- a/config.toml.example +++ b/config.toml.example @@ -16,7 +16,7 @@ CORPUS_SL = "europarl-v7.eng-spa.eng" CORPUS_TL = "europarl-v7.eng-spa.spa" # apertium-lex-tools scripts -LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" +# LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" # fast align build folder FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" diff --git a/lexical_training.py b/lexical_training.py index 9bda975..47f263a 100644 --- a/lexical_training.py +++ b/lexical_training.py @@ -64,21 +64,20 @@ def pipe(cmds, firstin, lastout, stderr): return procs[-1] -def training(config, log): +def training(config, cache_dir, log): MIN = 1 - # file/folder names - cache_dir = 
f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}" + # file names sl_tagged = os.path.join( cache_dir, f"{config['CORPUS']}.tagged.{config['SL']}") tl_tagged = os.path.join( cache_dir, f"{config['CORPUS']}.tagged.{config['TL']}") - lines = os.path.join(cache_dir, config['CORPUS']+'.lines') + lines = os.path.join(cache_dir, f"{config['CORPUS']}.lines") tagged_merged = os.path.join( cache_dir, f"{config['CORPUS']}.tagged-merged.{config['SL']}-{config['TL']}") - alignment = os.path.join(cache_dir, config['CORPUS'] + - '.align.'+config['SL']+'-'+config['TL']) + alignment = os.path.join( + cache_dir, f"{config['CORPUS']}.align.{config['SL']}-{config['TL']}") clean_biltrans = os.path.join( cache_dir, f"{config['CORPUS']}.clean_biltrans.{config['SL']}-{config['TL']}") phrasetable = os.path.join( @@ -101,15 +100,6 @@ def training(config, log): cache_dir, 'ngrams_all.txt') rules = f"{config['CORPUS']}-{config['SL']}-{config['TL']}.ngrams-lm-{MIN}.xml" - # the directory where all the intermediary outputs are stored - if os.path.isdir(cache_dir): - if not query(f"Do you want to overwrite the files in '{cache_dir}'"): - print(f"(re)move {cache_dir} and re-run lexical_training.py") - exit(1) - shutil.rmtree(cache_dir) - - os.mkdir(cache_dir) - if os.path.isfile(rules): if not query(f"Do you want to overwrite '{rules}'"): print(f"(re)move {rules} and re-run lexical_training.py") @@ -198,18 +188,23 @@ def training(config, log): # phrasetable with open(tmp1, 'w') as f1, open(tmp2, 'w') as f2: - sl_tl_autobil = f"{config['SL']}-{config['TL']}.autobil.bin" - tl_sl_autobil = f"{config['TL']}-{config['SL']}.autobil.bin" + sl_tl_autobil = os.path.join( + config['LANG_DATA'], f"{config['SL']}-{config['TL']}.autobil.bin") + tl_sl_autobil = os.path.join( + config['LANG_DATA'], f"{config['TL']}-{config['SL']}.autobil.bin") with open(tl_tagged, 'r') as f: - call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), - os.path.join(config['LANG_DATA'], tl_sl_autobil)], stdin=f, 
stdout=f1, stderr=log) + # call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), + call(['process-tagger-output', tl_sl_autobil], + stdin=f, stdout=f1, stderr=log) with open(sl_tagged, 'r') as f: - call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), - os.path.join(config['LANG_DATA'], sl_tl_autobil)], stdin=f, stdout=f2, stderr=log) + # call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), + call(['process-tagger-output', sl_tl_autobil], + stdin=f, stdout=f2, stderr=log) f.seek(0) with open(clean_biltrans, 'w') as f0: - call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), - os.path.join(config['LANG_DATA'], sl_tl_autobil)], stdin=f, stdout=f0, stderr=log) + # call([os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), + call(['process-tagger-output', sl_tl_autobil], + stdin=f, stdout=f0, stderr=log) cmds = [['paste', tmp1, tmp2, alignment], ['sed', 's/\t/ ||| /g']] with open(phrasetable, 'w') as f: @@ -296,17 +291,29 @@ def main(): config = check_config() # adding lex scripts to path - sys.path.insert(1, config['LEX_TOOLS']) + lex_tools = '/usr/share/apertium-lex-tools' + sys.path.insert(1, lex_tools) # cleaning the parallel corpus i.e. 
removing empty sentences, sentences only with '*', '.', or '°' print("cleaning corpus....") # clean_corpus(config['CORPUS_SL'], config['CORPUS_TL']) - log = os.path.join( - f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}", 'training.log') + cache_dir = f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}" + + # the directory where all the intermediary outputs are stored + if os.path.isdir(cache_dir): + if not query(f"Do you want to overwrite the files in '{cache_dir}'"): + print(f"(re)move {cache_dir} and re-run lexical_training.py") + exit(0) + shutil.rmtree(cache_dir) + + os.mkdir(cache_dir) + + log = os.path.join(cache_dir, "training.log") with open(log, 'a') as log_file: - training(config, log_file) + training(config, cache_dir, log_file) + print("training complete!!") if __name__ == '__main__': diff --git a/tests/check_config_test.py b/tests/check_config_test.py index 116cd64..7dee417 100644 --- a/tests/check_config_test.py +++ b/tests/check_config_test.py @@ -1,14 +1,14 @@ # tests check_config.py from check_config import check_config -import sys from tomlkit import parse, dumps import os import shutil - +import sys sys.path.append('../') def main(argc, argv): + tamper_string = 'abc' # Test 1 config_file = open('config_test.toml', 'r') @@ -16,7 +16,7 @@ def main(argc, argv): config = parse(config_toml) config_file.close() - print("Test 1 : wrong paths") + print("Test 1 : No installations") print("---------------------") for key in config: @@ -24,43 +24,69 @@ def main(argc, argv): continue config[key] += "abc" - if os.fork() == 0: - with open('check_config_test.toml', 'w') as test_file: - test_file.write(dumps(config)) - check_config('check_config_test.toml') - exit(0) + # if os.fork() == 0: + # with open('check_config_test.toml', 'w') as test_file: + # test_file.write(dumps(config)) + # check_config('check_config_test.toml') + # exit(0) - _, _ = os.wait() + # _, _ = os.wait() - # Test 2 - config_file = open('config_test.toml', 'r') - config_toml = 
config_file.read() - config = parse(config_toml) - config_file.close() + # # Test 2 + # config_file = open('config_test.toml', 'r') + # config_toml = config_file.read() + # config = parse(config_toml) + # config_file.close() + + # print("Test 2 : partial/no installations") + # print("----------------------------------") - print("Test 2 : partial/no installations") - print("----------------------------------") + # config['SL'] += "abc" - config['SL'] += "abc" + lex_tools = '/usr/share/apertium-lex-tools' + if os.path.isdir(lex_tools): + scripts = ['extract-sentences.py', 'extract-freq-lexicon.py', + 'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py', + 'ngrams-to-rules-me.py'] + + for script in scripts: + if os.path.isfile(os.path.join(lex_tools, script)): + shutil.move(os.path.join(lex_tools, script), + os.path.join(lex_tools, script+tamper_string)) for path in os.environ["PATH"].split(os.pathsep): if os.path.isfile(os.path.join(path, 'apertium')): shutil.move(os.path.join(path, 'apertium'), - os.path.join(path, 'apertium'+'abc')) + os.path.join(path, 'apertium'+tamper_string)) break for path in os.environ["PATH"].split(os.pathsep): if os.path.isfile(os.path.join(path, 'yasmet')): shutil.move(os.path.join(path, 'yasmet'), - os.path.join(path, 'yasmet'+'abc')) + os.path.join(path, 'yasmet'+tamper_string)) break - if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')): - shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), - os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc')) + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'process-tagger-output')): + shutil.move(os.path.join(path, 'process-tagger-output'), + os.path.join(path, 'process-tagger-output'+tamper_string)) + break + + if os.path.isdir(lex_tools): + scripts = ['extract-sentences.py', 'extract-freq-lexicon.py', + 'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 
'lambdas-to-rules.py', + 'ngrams-to-rules-me.py'] + + for script in scripts: + if os.path.isfile(os.path.join(lex_tools, script+tamper_string)): + shutil.move(os.path.join(lex_tools, script+tamper_string), + os.path.join(lex_tools, script)) + # if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')): + # shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), + # os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+tamper_string)) # if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')): - # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc')) + # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+tamper_string)) if os.fork() == 0: with open('check_config_test.toml', 'w') as test_file: @@ -70,33 +96,39 @@ def main(argc, argv): _, _ = os.wait() - shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'), - os.path.join(config['LEX_TOOLS'], 'process-tagger-output')) + # shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+tamper_string), + # os.path.join(config['LEX_TOOLS'], 'process-tagger-output')) - # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align')) + # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+tamper_string), os.path.join(config['FAST_ALIGN'], 'fast_align')) for path in os.environ["PATH"].split(os.pathsep): - if os.path.isfile(os.path.join(path, 'apertium'+'abc')): - shutil.move(os.path.join(path, 'apertium'+'abc'), + if os.path.isfile(os.path.join(path, 'apertium'+tamper_string)): + shutil.move(os.path.join(path, 'apertium'+tamper_string), os.path.join(path, 'apertium')) break for path in os.environ["PATH"].split(os.pathsep): - if os.path.isfile(os.path.join(path, 'yasmet'+'abc')): - shutil.move(os.path.join(path, 'yasmet'+'abc'), + if 
os.path.isfile(os.path.join(path, 'yasmet'+tamper_string)): + shutil.move(os.path.join(path, 'yasmet'+tamper_string), os.path.join(path, 'yasmet')) break + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'process-tagger-output'+tamper_string)): + shutil.move(os.path.join(path, 'process-tagger-output'+tamper_string), + os.path.join(path, 'process-tagger-output')) + break + # Test 3 config_file = open('config_test.toml', 'r') config_toml = config_file.read() config = parse(config_toml) config_file.close() - print("Test 3 : wrong TRAINING_LINES") + print("Test 2 : wrong TRAINING_LINES") print("---------------------") - for value in ['abc', 1.00, 1e237892]: + for value in [tamper_string, 1.00, 1e237892]: config['TRAINING_LINES'] = value if os.fork() == 0: with open('check_config_test.toml', 'w') as test_file: @@ -112,7 +144,7 @@ def main(argc, argv): config = parse(config_toml) config_file.close() - print("Test 4 : correct installations") + print("Test 3 : correct installations") print("-------------------------------") if os.fork() == 0: