def _fail(*message_parts):
    """Print an error message (space-separated, on stdout) and exit with status -1."""
    # local import: the module header only imports os and tomlkit
    import sys

    print(*message_parts)
    sys.exit(-1)


def _require_file(path, help_url):
    """Abort unless ``path`` is an existing regular file."""
    if not os.path.isfile(path):
        _fail(path, "is not a file, provide a valid file or \nto download, look", help_url)


def _require_dir(path, help_url):
    """Abort unless ``path`` is a directory; return the set of its entry names."""
    if not os.path.isdir(path):
        _fail(path, "is not a directory, provide a valid directory or \nto install, follow", help_url)
    # listed once so callers can test several names without re-reading the directory
    return set(os.listdir(path))


def _require_on_path(tool, help_url):
    """Abort unless an executable called ``tool`` can be found through PATH."""
    # local import: the module header only imports os and tomlkit
    import shutil

    # shutil.which copes with an unset PATH and checks the executable bit,
    # which a manual os.environ["PATH"] scan does not
    if shutil.which(tool) is None:
        _fail(tool, "is either not installed or not added to path, see", help_url)


def parse_config(filename='config.toml'):
    """Parse the lexical-training config and verify that the tools it names exist.

    Checks are performed in this order: the two corpus files, the
    apertium-lex-tools scripts directory, the fast_align build directory,
    the language-pair data directory (both ``.autobil.bin`` files) and
    finally that ``apertium`` and ``yasmet`` are reachable through PATH.

    Args:
        filename: path of the TOML configuration file.

    Returns:
        The parsed configuration object produced by ``tomlkit.parse``.

    Raises:
        OSError: if ``filename`` cannot be opened (e.g. FileNotFoundError).
        SystemExit: with status -1, after printing an explanation, when an
            entry is missing or any of the checks fails.
    """
    # TOML documents are UTF-8, so do not depend on the locale's default encoding
    with open(filename, encoding='utf-8') as config_file:
        config_toml = config_file.read()

    config = parse(config_toml)

    # round-trip sanity check; an explicit test because `assert` is stripped under -O
    if config_toml != dumps(config):
        _fail(filename, "could not be parsed reliably, check its syntax")

    required_keys = ('SL', 'TL', 'CORPUS_SL', 'CORPUS_TL',
                     'LEX_TOOLS', 'FAST_ALIGN', 'LANG_DATA')
    missing_keys = [key for key in required_keys if key not in config]
    if missing_keys:
        _fail("missing entries in", filename + ":", ", ".join(missing_keys))

    _require_file(config['CORPUS_SL'], corpora_url)
    _require_file(config['CORPUS_TL'], corpora_url)

    # only one representative entry is checked; the remaining scripts are assumed intact
    lex_tools_entries = _require_dir(config['LEX_TOOLS'], lex_tools_url)
    if 'process-tagger-output' not in lex_tools_entries:
        _fail("process-tagger-output is not in", config['LEX_TOOLS'] + ",",
              "provide a valid directory or \nto install, follow", lex_tools_url)

    fast_align_entries = _require_dir(config['FAST_ALIGN'], fast_align_url)
    if 'fast_align' not in fast_align_entries:
        _fail("fast_align is not present in", config['FAST_ALIGN'] + ",",
              "provide a valid directory or \nto install, follow", fast_align_url)

    lang_data_entries = _require_dir(config['LANG_DATA'], langs_url)
    # the bilingual dictionary is required in both translation directions
    sl_tl_autobil = config['SL'] + '-' + config['TL'] + '.autobil.bin'
    tl_sl_autobil = config['TL'] + '-' + config['SL'] + '.autobil.bin'
    for autobil in (sl_tl_autobil, tl_sl_autobil):
        if autobil not in lang_data_entries:
            _fail(autobil, "is not in", config['LANG_DATA'] + ",",
                  "provide a valid directory or \nto install, follow", langs_url)

    _require_on_path('apertium', apertium_url)
    _require_on_path('yasmet', yasmet_url)

    return config
def _is_blank(line):
    """Return True when ``line`` holds nothing but whitespace."""
    return not line.strip()


def _is_symbol_only(line):
    """Return True when ``line`` holds nothing but '°', '*' and whitespace."""
    return not line.replace('°', ' ').replace('*', ' ').strip()


def _rows_to_drop(lines1, lines2):
    """Return the indices of the sentence pairs to remove from both corpora.

    A pair with an empty line on either side is dropped together with the
    pair directly above and below it. A pair in which both lines consist
    only of '°' and '*' is dropped on its own. The result may contain -1
    or len(lines1); callers only ever test real indices against it.
    """
    drop = set()
    for index, (first, second) in enumerate(zip(lines1, lines2)):
        if _is_blank(first) or _is_blank(second):
            drop.update((index - 1, index, index + 1))
        elif _is_symbol_only(first) and _is_symbol_only(second):
            drop.add(index)
    return drop


def main(argc, argv):
    """Clean two line-aligned corpus files in place.

    Removes empty lines together with the lines directly above and below
    them, removes pairs consisting only of '°' and '*', and strips leading
    and trailing whitespace from every line that is kept. The same line
    numbers are removed from both files.

    Args:
        argc: number of entries in ``argv``; must be 3.
        argv: ``[program_name, first_corpus_path, second_corpus_path]``.

    Raises:
        SystemExit: with status -1 on wrong usage or when the two files do
            not have the same number of lines (the files are left untouched).
        OSError: if either file cannot be opened for reading and writing.
    """
    if argc != 3:
        print('usage: clean_corpus.py <corpus_1> <corpus_2>')
        sys.exit(-1)

    # 'r+' verifies that both files are writable before anything is modified;
    # explicit UTF-8 because the filter matches the non-ASCII '°' character
    with open(argv[1], 'r+', encoding='utf-8') as corpus1, \
            open(argv[2], 'r+', encoding='utf-8') as corpus2:
        lines1 = corpus1.readlines()
        lines2 = corpus2.readlines()

        # explicit check instead of `assert`, which is stripped under -O
        if len(lines1) != len(lines2):
            print(argv[1], 'and', argv[2], 'do not have the same number of lines')
            sys.exit(-1)

        drop = _rows_to_drop(lines1, lines2)

        # rewrite each file through the handle that is already open, so there
        # is no window in which a file sits emptied and closed
        for handle, lines in ((corpus1, lines1), (corpus2, lines2)):
            handle.seek(0)
            handle.writelines(
                line.strip() + '\n'
                for index, line in enumerate(lines)
                if index not in drop
            )
            handle.truncate()
def main():
    """Run the configuration check and report that it finished."""
    # the parsed config is not used yet, so the return value is not kept
    parse_config()
    print("parsing complete")