commit 4fb5b89e497343037d659f39226a2243b4e92188 Author: vivekvardhanadepu Date: Mon May 31 15:55:00 2021 +0530 check_config:minor changes and optimizations diff --git a/README.md b/README.md index 02ac9f2..7115def 100644 --- a/README.md +++ b/README.md @@ -1 +1,9 @@ -# apertium-lexical-training \ No newline at end of file +# apertium-lexical-training + +The procedure for lexical selection training is a bit messy, with various scripts involved that require lots of manual tweaking, and many third party tools to be installed, e.g. irstlm, moses, gizapp. The goal of this task is to make the training procedure as streamlined and user-friendly as possible + +for more, read https://wiki.apertium.org/wiki/Ideas_for_Google_Summer_of_Code/User-friendly_lexical_selection_training + +## tests + +This folder contains scripts for automated testing of the helper scripts diff --git a/check_config.py b/check_config.py index 599ebb8..5a2d5d4 100644 --- a/check_config.py +++ b/check_config.py @@ -11,7 +11,8 @@ langs_url = "https://wiki.apertium.org/wiki/List_of_language_pairs" apertium_url = "https://wiki.apertium.org/wiki/Installation" yasmet_url = "https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules" -def parse_config(filename='config.toml'): +def check_config(filename='config.toml'): + misconfigured = False with open(filename) as config_file: config_toml = config_file.read() config = parse(config_toml) @@ -19,17 +20,25 @@ def parse_config(filename='config.toml'): # gives error if not parsed well assert config_toml == dumps(config) + # changing the paths to absolute + for key in ['CORPUS_SL', 'CORPUS_TL', 'LEX_TOOLS', 'FAST_ALIGN', 'LANG_DATA']: + if not os.path.isabs(config[key]): + config[key] = os.path.join(os.path.abspath('.'), config[key]) + if not os.path.isfile(config['CORPUS_SL']): - print(config['CORPUS_SL'], "is not a file, provide a valid file or \nto download, look", corpora_url) - exit(-1) + print("'"+config['CORPUS_SL']+"'(CORPUS_SL)","is not a file, provide a valid"+ \ + " file or \nto download, look", corpora_url, '\n') + misconfigured = True if not os.path.isfile(config['CORPUS_TL']): - print(config['CORPUS_TL'], "is not a file, provide a valid file or \nto download, look", corpora_url) - exit(-1) + print("'"+config['CORPUS_TL']+"'(CORPUS_TL)", "is not a file, provide a valid "+ \ + "file or \nto download, look", corpora_url, '\n') + misconfigured = True if not os.path.isdir(config['LEX_TOOLS']): - print(config['LEX_TOOLS'], "is not a directory, provide a valid directory or \nto install, follow", lex_tools_url) - exit(-1) + print("'"+config['LEX_TOOLS']+"'(LEX_TOOLS)", "is not a directory, provide a valid "+ \ + "directory or \nto install, follow", lex_tools_url, '\n') + misconfigured = True else: # scripts = ['process-tagger-output', 'extract-sentences.py', 'extract-freq-lexicon.py', \ # 'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py', \ @@ -39,31 +48,35 @@ def parse_config(filename='config.toml'): # assuming scripts are intact if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']): - print("process-tagger-output is not in", config['LEX_TOOLS'] + ",","provide a valid directory or \nto install, follow", lex_tools_url) - exit(-1) + print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),", \ + "provide a valid directory or \nto install, follow", lex_tools_url, '\n') + misconfigured = True if not os.path.isdir(config['FAST_ALIGN']): - print(config['FAST_ALIGN'], "is not a directory, provide a valid directory or \nto install, follow", fast_align_url) - exit(-1) + print("'"+config['FAST_ALIGN']+"'(FAST_ALIGN)", "is not a directory, provide"+ \ + " a valid directory or \nto install, follow", fast_align_url, '\n') + misconfigured = True else: if 'fast_align' not in os.listdir(config['FAST_ALIGN']): - print("fast_align is not present in", config['FAST_ALIGN']+ ",", "provide a valid directory or \nto install, follow", fast_align_url) - exit(-1) + print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \ + "provide a valid directory or \nto install, follow", fast_align_url, '\n') + misconfigured = True if not os.path.isdir(config['LANG_DATA']): - print(config['LANG_DATA'], "is not a directory, provide a valid directory or \nto install, follow", langs_url) - exit(-1) + print("'"+config['LANG_DATA']+"'(LANG_DATA)", "is not a directory, provide a valid "+ \ + "directory or \nto install, follow", langs_url, '\n') + misconfigured = True else: sl_tl_autobil = config['SL'] + '-' + config['TL'] + '.autobil.bin' tl_sl_autobil = config['TL'] + '-' + config['SL'] + '.autobil.bin' - if sl_tl_autobil not in os.listdir(config['LANG_DATA']): - print(sl_tl_autobil, "is not in", config['LANG_DATA']+ ",", "provide a valid directory or \nto install, follow", langs_url) - exit(-1) - + print("'"+sl_tl_autobil+"'", "is not in", "'"+config['LANG_DATA']+ "'(LANG_DATA),", \ + "provide a valid directory or \nto install, follow", langs_url, '\n') + misconfigured = True if tl_sl_autobil not in os.listdir(config['LANG_DATA']): - print(tl_sl_autobil, "is not in", config['LANG_DATA']+ ",", "provide a valid directory or \nto install, follow", langs_url) - exit(-1) + print("'"+tl_sl_autobil+"'", "is not in", "'"+config['LANG_DATA']+ "'(LANG_DATA),", \ + "provide a valid directory or \nto install, follow", langs_url, '\n') + misconfigured = True apertium_present = False for path in os.environ["PATH"].split(os.pathsep): @@ -72,8 +85,8 @@ def parse_config(filename='config.toml'): break if not apertium_present: - print("apertium is either not installed or not added to path, see", apertium_url) - exit(-1) + print("apertium is either not installed or not added to path, see", apertium_url, '\n') + misconfigured = True yasmet_present = False for path in os.environ["PATH"].split(os.pathsep): @@ -82,9 +95,13 @@ def parse_config(filename='config.toml'): break if not yasmet_present: - print("yasmet is either not installed or not added to path, see", yasmet_url) - exit(-1) + print("yasmet is either not installed or not added to path, see", yasmet_url, '\n') + misconfigured = True + + if misconfigured: + exit(1) + return config if __name__ == '__main__': - parse_config() \ No newline at end of file + check_config() \ No newline at end of file diff --git a/clean_corpus.py b/clean_corpus.py index f1b85d3..b131bf8 100644 --- a/clean_corpus.py +++ b/clean_corpus.py @@ -22,51 +22,16 @@ def main(argc, argv): # print(lines1, lines2) i = 0 for i in range(len(lines1)): - # if not(lines1[i].strip()) and not(lines2[i].strip()): - # continue - # if i > 0: - # if i < len(lines1)-1: - # del lines1[i-1], lines2[i-1] - # del lines1[i-1], lines2[i-1] - # del lines1[i-1], lines2[i-1] - # else: - # del lines1[i-1], lines2[i-1] - # del lines1[i-1], lines2[i-1] - # else: - # if i < len(lines1)-1: - # del lines1[i], lines2[i] - # del lines1[i], lines2[i] - # else: - # del lines1[i], lines2[i] if (not lines1[i].strip()) or (not lines2[i].strip()): lines_to_remove.update([i-1, i, i+1]) continue - # removing lines only with '°' and '*' - if (not lines1[i].replace('°', ' ').replace('*', ' ').strip()) and (not lines2[i].replace('°', ' ').replace('*', ' ').strip()): + # removing lines only with '°', '*' and '.' + if (not lines1[i].replace('°', '').replace('*', '').replace('.','').strip()) and \ + (not lines2[i].replace('°', '').replace('*', '').replace('.', '').strip()): lines_to_remove.add(i) # print(lines1, lines2) - - # assert len(lines1) == len(lines2) - - # if len(lines1) == 0: - # l1.seek(0) - # l1.write('\n') - # l1.truncate() - - # l2.seek(0) - # l2.write('\n') - # l2.truncate() - - # l1.close() - # l2.close() - # return - - # if '\n' not in lines1[len(lines1)-1]: - # lines1[len(lines1)-1] = lines1[len(lines1)-1] + '\n' - # if '\n' not in lines2[len(lines2)-1]: - # lines2[len(lines2)-1] = lines2[len(lines2)-1] + '\n' - + print(lines_to_remove) l1.seek(0) diff --git a/config.toml b/config.toml index 22158dd..df7c7a3 100644 --- a/config.toml +++ b/config.toml @@ -1,5 +1,4 @@ # configuration for lexical training -# Note: pass absolute paths # corpus name CORPUS = "europarl-v7" @@ -17,10 +16,10 @@ CORPUS_SL = "europarl-v7.eng-spa.eng" CORPUS_TL = "europarl-v7.eng-spa.spa" # apertium-lex-tools scripts -LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" +LEX_TOOLS = "../apertium-lex-tools/scripts" # fast align build folder -FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" +FAST_ALIGN = "coding_challenges/fast_align/build" # apertium language data -LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa" +LANG_DATA = "coding_challenges/apertium-eng-spa" diff --git a/config.toml.example b/config.toml.example index 22158dd..5949a1f 100644 --- a/config.toml.example +++ b/config.toml.example @@ -1,5 +1,4 @@ # configuration for lexical training -# Note: pass absolute paths # corpus name CORPUS = "europarl-v7" diff --git a/lexical_training.py b/lexical_training.py index 72fdfa0..cb105b8 100644 --- a/lexical_training.py +++ b/lexical_training.py @@ -1,8 +1,8 @@ # lexical training script -from check_config import parse_config +from check_config import check_config def main(): - config = parse_config() + config = check_config() print("parsing complete") if __name__ == '__main__': diff --git a/tests/check_config_test.py b/tests/check_config_test.py new file mode 100644 index 0000000..08753b6 --- /dev/null +++ b/tests/check_config_test.py @@ -0,0 +1,98 @@ +# tests check_config.py +import sys +from tomlkit import parse, dumps +import os +import shutil + +sys.path.append('../') + +from check_config import check_config + +def main(argc, argv): + + # Test 1 + config_file = open('config_test.toml', 'r') + config_toml = config_file.read() + config = parse(config_toml) + config_file.close() + + print("Test 1 : wrong paths") + print("---------------------") + + for key in config: + config[key]+="abc" + + if os.fork() == 0: + with open('check_config_test.toml', 'w') as test_file: + test_file.write(dumps(config)) + check_config('check_config_test.toml') + exit(0) + + _, _ = os.wait() + + # Test 2 + config_file = open('config_test.toml', 'r') + config_toml = config_file.read() + config = parse(config_toml) + config_file.close() + + print("Test 2 : partial/no installations") + print("----------------------------------") + + config['SL']+="abc" + + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'apertium')): + shutil.move(os.path.join(path, 'apertium'), os.path.join(path, 'apertium'+'abc')) + break + + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'yasmet')): + shutil.move(os.path.join(path, 'yasmet'), os.path.join(path, 'yasmet'+'abc')) + break + + if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')): + shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc')) + + if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')): + shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc')) + + if os.fork() == 0: + with open('check_config_test.toml', 'w') as test_file: + test_file.write(dumps(config)) + check_config('check_config_test.toml') + exit(0) + + _, _ = os.wait() + + shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output')) + + shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align')) + + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'apertium'+'abc')): + shutil.move(os.path.join(path, 'apertium'+'abc'), os.path.join(path, 'apertium')) + break + + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'yasmet'+'abc')): + shutil.move(os.path.join(path, 'yasmet'+'abc'), os.path.join(path, 'yasmet')) + break + + # Test 3 + config_file = open('config_test.toml', 'r') + config_toml = config_file.read() + config = parse(config_toml) + config_file.close() + + print("Test 3 : correct installations") + print("-------------------------------") + + with open('check_config_test.toml', 'w') as test_file: + test_file.write(dumps(config)) + check_config('check_config_test.toml') + + os.remove('check_config_test.toml') + +if __name__ == '__main__': + main(len(sys.argv), sys.argv) \ No newline at end of file diff --git a/tests/config_test.toml b/tests/config_test.toml new file mode 100644 index 0000000..0d67a99 --- /dev/null +++ b/tests/config_test.toml @@ -0,0 +1,25 @@ +# configuration for lexical training + +# corpus name +CORPUS = "europarl-v7" + +# source language +SL = "eng" + +# target language +TL = "spa" + +# source corpus +CORPUS_SL = "../europarl-v7.eng-spa.eng" + +# target corpus +CORPUS_TL = "../europarl-v7.eng-spa.spa" + +# apertium-lex-tools scripts +LEX_TOOLS = "../../apertium-lex-tools/scripts" + +# fast align build folder +FAST_ALIGN = "../coding_challenges/fast_align/build" + +# apertium language data +LANG_DATA = "../coding_challenges/apertium-eng-spa"