commit ccfe020961832bd42050cbefefee68f810688096 Author: vivekvardhanadepu Date: Fri Jun 4 09:43:04 2021 +0530 moving repo to apertium-lexical-training diff --git a/clean_corpus.py b/clean_corpus.py deleted file mode 100644 index b131bf8..0000000 --- a/clean_corpus.py +++ /dev/null @@ -1,60 +0,0 @@ -# removes lines above and below the empty lines including the empty lines in each corpus -# removes lines containing only ° and * -# stripping trailing and leading spaces - - -import sys - - -def main(argc, argv): - if argc != 3: - print('usage: clean_corpus.py ') - exit(-1) - - lines1 = [] - lines2 = [] - lines_to_remove = set() - - with open(argv[1], 'r+') as l1, open(argv[2], 'r+') as l2: - lines1 = l1.readlines() - lines2 = l2.readlines() - assert len(lines1) == len(lines2) - # print(lines1, lines2) - i = 0 - for i in range(len(lines1)): - if (not lines1[i].strip()) or (not lines2[i].strip()): - lines_to_remove.update([i-1, i, i+1]) - continue - - # removing lines only with '°', '*' and '.' - if (not lines1[i].replace('°', '').replace('*', '').replace('.','').strip()) and \ - (not lines2[i].replace('°', '').replace('*', '').replace('.', '').strip()): - lines_to_remove.add(i) - # print(lines1, lines2) - - print(lines_to_remove) - - l1.seek(0) - # l1.write(''.join(lines1)) - l1.write('') - l1.truncate() - - l2.seek(0) - l2.write('') - l2.truncate() - - with open(argv[1], 'a') as l1, open(argv[2], 'a') as l2: - lines_to_keep = set() - lines_to_keep.update([i for i in range(len(lines1))]) - lines_to_keep = lines_to_keep - lines_to_remove - - for i in sorted(lines_to_keep): - # also removing leading and trailing spaces - l1.write(lines1[i].strip() + '\n') - l2.write(lines2[i].strip() + '\n') - - l1.truncate() - l2.truncate() - -if __name__ == '__main__': - main(len(sys.argv), sys.argv) \ No newline at end of file diff --git a/config.toml b/config.toml deleted file mode 100644 index df7c7a3..0000000 --- a/config.toml +++ /dev/null @@ -1,25 +0,0 @@ -# configuration for lexical training - -# corpus name -CORPUS = "europarl-v7" - -# source language -SL = "eng" - -# target language -TL = "spa" - -# source corpus -CORPUS_SL = "europarl-v7.eng-spa.eng" - -# target corpus -CORPUS_TL = "europarl-v7.eng-spa.spa" - -# apertium-lex-tools scripts -LEX_TOOLS = "../apertium-lex-tools/scripts" - -# fast align build folder -FAST_ALIGN = "coding_challenges/fast_align/build" - -# apertium language data -LANG_DATA = "coding_challenges/apertium-eng-spa" diff --git a/config.toml.example b/config.toml.example deleted file mode 100644 index 5949a1f..0000000 --- a/config.toml.example +++ /dev/null @@ -1,25 +0,0 @@ -# configuration for lexical training - -# corpus name -CORPUS = "europarl-v7" - -# source language -SL = "eng" - -# target language -TL = "spa" - -# source corpus -CORPUS_SL = "europarl-v7.eng-spa.eng" - -# target corpus -CORPUS_TL = "europarl-v7.eng-spa.spa" - -# apertium-lex-tools scripts -LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" - -# fast align build folder -FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" - -# apertium language data -LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa" diff --git a/lexical_training.py b/lexical_training.py deleted file mode 100644 index cb105b8..0000000 --- a/lexical_training.py +++ /dev/null @@ -1,9 +0,0 @@ -# lexical training script -from check_config import check_config - -def main(): - config = check_config() - print("parsing complete") - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/tests/check_config_test.log b/tests/check_config_test.log deleted file mode 100644 index 0654e3c..0000000 --- a/tests/check_config_test.log +++ /dev/null @@ -1,43 +0,0 @@ -Test 1 : wrong paths ---------------------- -'/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/tests/../europarl-v7.eng-spa.engabc'(CORPUS_SL) is not a file, provide a valid file or -to download, look https://wiki.apertium.org/wiki/Corpora - -'/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/tests/../europarl-v7.eng-spa.spaabc'(CORPUS_TL) is not a file, provide a valid file or -to download, look https://wiki.apertium.org/wiki/Corpora - -'/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/tests/../../apertium-lex-tools/scriptsabc'(LEX_TOOLS) is not a directory, provide a valid directory or -to install, follow https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling - -'/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/tests/../coding_challenges/fast_align/buildabc'(FAST_ALIGN) is not a directory, provide a valid directory or -to install, follow https://github.com/clab/fast_align - -'/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/tests/../coding_challenges/apertium-eng-spaabc'(LANG_DATA) is not a directory, provide a valid directory or -to install, follow https://wiki.apertium.org/wiki/List_of_language_pairs - -Test 1 : wrong paths ---------------------- -Test 2 : partial/no installations ----------------------------------- -'process-tagger-output' is not in '/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/tests/../../apertium-lex-tools/scripts'(LEX_TOOLS), provide a valid directory or -to install, follow https://wiki.apertium.org/wiki/Install_Apertium_core_by_compiling - -fast_align is not present in '/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/tests/../coding_challenges/fast_align/build'(FAST_ALIGN), provide a valid directory or -to install, follow https://github.com/clab/fast_align - -'engabc-spa.autobil.bin' is not in '/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/tests/../coding_challenges/apertium-eng-spa'(LANG_DATA), provide a valid directory or -to install, follow https://wiki.apertium.org/wiki/List_of_language_pairs - -'spa-engabc.autobil.bin' is not in '/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/tests/../coding_challenges/apertium-eng-spa'(LANG_DATA), provide a valid directory or -to install, follow https://wiki.apertium.org/wiki/List_of_language_pairs - -apertium is either not installed or not added to path, see https://wiki.apertium.org/wiki/Installation - -yasmet is either not installed or not added to path, see https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules - -Test 1 : wrong paths ---------------------- -Test 2 : partial/no installations ----------------------------------- -Test 3 : correct installations -------------------------------- diff --git a/tests/check_config_test.py b/tests/check_config_test.py deleted file mode 100644 index 08753b6..0000000 --- a/tests/check_config_test.py +++ /dev/null @@ -1,98 +0,0 @@ -# tests check_config.py -import sys -from tomlkit import parse, dumps -import os -import shutil - -sys.path.append('../') - -from check_config import check_config - -def main(argc, argv): - - # Test 1 - config_file = open('config_test.toml', 'r') - config_toml = config_file.read() - config = parse(config_toml) - config_file.close() - - print("Test 1 : wrong paths") - print("---------------------") - - for key in config: - config[key]+="abc" - - if os.fork() == 0: - with open('check_config_test.toml', 'w') as test_file: - test_file.write(dumps(config)) - check_config('check_config_test.toml') - exit(0) - - _, _ = os.wait() - - # Test 2 - config_file = open('config_test.toml', 'r') - config_toml = config_file.read() - config = parse(config_toml) - config_file.close() - - print("Test 2 : partial/no installations") - print("----------------------------------") - - config['SL']+="abc" - - for path in os.environ["PATH"].split(os.pathsep): - if os.path.isfile(os.path.join(path, 'apertium')): - shutil.move(os.path.join(path, 'apertium'), os.path.join(path, 'apertium'+'abc')) - break - - for path in os.environ["PATH"].split(os.pathsep): - if os.path.isfile(os.path.join(path, 'yasmet')): - shutil.move(os.path.join(path, 'yasmet'), os.path.join(path, 'yasmet'+'abc')) - break - - if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')): - shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc')) - - if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')): - shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc')) - - if os.fork() == 0: - with open('check_config_test.toml', 'w') as test_file: - test_file.write(dumps(config)) - check_config('check_config_test.toml') - exit(0) - - _, _ = os.wait() - - shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output')) - - shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align')) - - for path in os.environ["PATH"].split(os.pathsep): - if os.path.isfile(os.path.join(path, 'apertium'+'abc')): - shutil.move(os.path.join(path, 'apertium'+'abc'), os.path.join(path, 'apertium')) - break - - for path in os.environ["PATH"].split(os.pathsep): - if os.path.isfile(os.path.join(path, 'yasmet'+'abc')): - shutil.move(os.path.join(path, 'yasmet'+'abc'), os.path.join(path, 'yasmet')) - break - - # Test 3 - config_file = open('config_test.toml', 'r') - config_toml = config_file.read() - config = parse(config_toml) - config_file.close() - - print("Test 3 : correct installations") - print("-------------------------------") - - with open('check_config_test.toml', 'w') as test_file: - test_file.write(dumps(config)) - check_config('check_config_test.toml') - - os.remove('check_config_test.toml') - -if __name__ == '__main__': - main(len(sys.argv), sys.argv) \ No newline at end of file diff --git a/tests/config_test.toml b/tests/config_test.toml deleted file mode 100644 index 0d67a99..0000000 --- a/tests/config_test.toml +++ /dev/null @@ -1,25 +0,0 @@ -# configuration for lexical training - -# corpus name -CORPUS = "europarl-v7" - -# source language -SL = "eng" - -# target language -TL = "spa" - -# source corpus -CORPUS_SL = "../europarl-v7.eng-spa.eng" - -# target corpus -CORPUS_TL = "../europarl-v7.eng-spa.spa" - -# apertium-lex-tools scripts -LEX_TOOLS = "../../apertium-lex-tools/scripts" - -# fast align build folder -FAST_ALIGN = "../coding_challenges/fast_align/build" - -# apertium language data -LANG_DATA = "../coding_challenges/apertium-eng-spa"