commit 5313d297ce38d2db465dd8fc89d947da0be92c6b Author: vivekvardhanadepu Date: Tue May 25 17:44:38 2021 +0530 check_config.py now checks if the tools are present diff --git a/check_config.py b/check_config.py index 864d000..1954399 100644 --- a/check_config.py +++ b/check_config.py @@ -4,10 +4,12 @@ from tomlkit import parse, dumps import os # urls of the required tools and data -corpora_url = "" -lex_tools_url = "" -fast_align_url = "" -langs_url = "" +corpora_url = "https://wiki.apertium.org/wiki/Corpora" +lex_tools_url = "https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling" +fast_align_url = "https://github.com/clab/fast_align" +langs_url = "https://wiki.apertium.org/wiki/List_of_language_pairs" +apertium_url = "https://wiki.apertium.org/wiki/Installation" +yasmet_url = "https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules" def parse_config(filename='config.toml'): with open(filename) as config_file: @@ -18,22 +20,69 @@ def parse_config(filename='config.toml'): assert config_toml == dumps(config) if not os.path.isfile(config['CORPUS_SL']): - print(config['CORPUS_SL'], "is not a file. Provide a valid file or to download,\n look", corpora_url) + print(config['CORPUS_SL'], "is not a file, provide a valid file or \nto download, look", corpora_url) + exit(-1) if not os.path.isfile(config['CORPUS_TL']): - print(config['CORPUS_TL'], "is not a file. Provide a valid file or to download,\n look", corpora_url) + print(config['CORPUS_TL'], "is not a file, provide a valid file or \nto download, look", corpora_url) + exit(-1) if not os.path.isdir(config['LEX_TOOLS']): - print(config['LEX_TOOLS'], "is not a directory. Provide a valid directory or to install,\n follow", lex_tools_url) + print(config['LEX_TOOLS'], "is not a directory, provide a valid directory or \nto install, follow", lex_tools_url) + exit(-1) + else: + # scripts = ['process-tagger-output', 'extract-sentences.py', 'extract-freq-lexicon.py', \ + # 'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py', \ + # 'ngrams-to-rules-me.py'] - if not os.path.isdir(config['FAST_ALIGN']): - print(config['FAST_ALIGN'], "is not a directory. Provide a valid directory or to install,\n follow", fast_align_url) + # for script in scripts: - if not os.path.isdir(config['LANG_DATA']): - print(config['LANG_DATA'], "is not a directory. Provide a valid directory or to install,\n follow", langs_url) + # assuming scripts are intact + if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']): + print("process-tagger-output is not in", config['LEX_TOOLS'] + ",","provide a valid directory or \nto install, follow", lex_tools_url) + exit(-1) + if not os.path.isdir(config['FAST_ALIGN']): + print(config['FAST_ALIGN'], "is not a directory, provide a valid directory or \nto install, follow", fast_align_url) + exit(-1) + else: + if 'fast_align' not in os.listdir(config['FAST_ALIGN']): + print("fast_align is not present in", config['FAST_ALIGN']+ ",", "provide a valid directory or \nto install, follow", fast_align_url) + exit(-1) - + if not os.path.isdir(config['LANG_DATA']): + print(config['LANG_DATA'], "is not a directory, provide a valid directory or \nto install, follow", langs_url) + exit(-1) + else: + sl_tl_autobil = config['SL'] + '-' + config['TL'] + '.autobil.bin' + tl_sl_autobil = config['TL'] + '-' + config['SL'] + '.autobil.bin' + + if sl_tl_autobil not in os.listdir(config['LANG_DATA']): + print(sl_tl_autobil, "is not in", config['LANG_DATA']+ ",", "provide a valid directory or \nto install, follow", langs_url) + exit(-1) + + if tl_sl_autobil not in os.listdir(config['LANG_DATA']): + print(tl_sl_autobil, "is not in", config['LANG_DATA']+ ",", "provide a valid directory or \nto install, follow", langs_url) + exit(-1) + + apertium_present = False + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'apertium')): + apertium_present = True + break + + if not apertium_present: + print("apertium is either not installed or not added to path, see", apertium_url) + + yasmet_present = False + for path in os.environ["PATH"].split(os.pathsep): + if os.path.isfile(os.path.join(path, 'yasmet')): + yasmet_present = True + break + + if not yasmet_present: + print("yasmet is either not installed or not added to path, see", yasmet_url) + return config if __name__ == '__main__': diff --git a/config.toml b/config.toml index 8809d38..22158dd 100644 --- a/config.toml +++ b/config.toml @@ -19,8 +19,8 @@ CORPUS_TL = "europarl-v7.eng-spa.spa" # apertium-lex-tools scripts LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" +# fast align build folder +FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" + # apertium language data LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa" - -# fast align build folder -FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" \ No newline at end of file diff --git a/config.toml.example b/config.toml.example new file mode 100644 index 0000000..22158dd --- /dev/null +++ b/config.toml.example @@ -0,0 +1,26 @@ +# configuration for lexical training +# Note: pass absolute paths + +# corpus name +CORPUS = "europarl-v7" + +# source language +SL = "eng" + +# target language +TL = "spa" + +# source corpus +CORPUS_SL = "europarl-v7.eng-spa.eng" + +# target corpus +CORPUS_TL = "europarl-v7.eng-spa.spa" + +# apertium-lex-tools scripts +LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" + +# fast align build folder +FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" + +# apertium language data +LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa" diff --git a/lexical_training.log b/lexical_training.log index 2356a59..4718d6e 100644 --- a/lexical_training.log +++ b/lexical_training.log @@ -1 +1,2 @@ -{'CORPUS': 'europarl-v7', 'SL': 'eng', 'TL': 'spa', 'CORPUS_SL': 'europarl-v7.eng-spa.eng', 'CORPUS_TL': 'europarl-v7.eng-spa.spa', 'LEX_TOOLS': '/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts', 'DATA': '/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa', 'FAST_ALIGN': '/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build'} +europarl-v7.eng-spa.en is not a file, provide a valid file or +to download, look https://wiki.apertium.org/wiki/Corpora diff --git a/lexical_training.py b/lexical_training.py index ace48b3..72fdfa0 100644 --- a/lexical_training.py +++ b/lexical_training.py @@ -3,6 +3,7 @@ from check_config import parse_config def main(): config = parse_config() - + print("parsing complete") + if __name__ == '__main__': main() \ No newline at end of file