commit 033c67d6092633886576b1b652f98fb7a487c6cf Author: vivekvardhanadepu Date: Wed Jun 9 16:03:43 2021 +0530 added code for cleaning corpus in lexical_training.py diff --git a/clean_corpus.py b/clean_corpus.py index b131bf8..ce84b89 100644 --- a/clean_corpus.py +++ b/clean_corpus.py @@ -1,21 +1,18 @@ # removes lines above and below the empty lines including the empty lines in each corpus -# removes lines containing only ° and * +# removes lines containing only '°', '*' or '.' # stripping trailing and leading spaces import sys -def main(argc, argv): - if argc != 3: - print('usage: clean_corpus.py ') - exit(-1) +def clean_corpus(corpus1, corpus2): lines1 = [] lines2 = [] lines_to_remove = set() - with open(argv[1], 'r+') as l1, open(argv[2], 'r+') as l2: + with open(corpus1, 'r+') as l1, open(corpus2, 'r+') as l2: lines1 = l1.readlines() lines2 = l2.readlines() assert len(lines1) == len(lines2) @@ -32,7 +29,7 @@ def main(argc, argv): lines_to_remove.add(i) # print(lines1, lines2) - print(lines_to_remove) + # print(lines_to_remove) l1.seek(0) # l1.write(''.join(lines1)) @@ -43,7 +40,7 @@ def main(argc, argv): l2.write('') l2.truncate() - with open(argv[1], 'a') as l1, open(argv[2], 'a') as l2: + with open(corpus1, 'a') as l1, open(corpus2, 'a') as l2: lines_to_keep = set() lines_to_keep.update([i for i in range(len(lines1))]) lines_to_keep = lines_to_keep - lines_to_remove @@ -57,4 +54,7 @@ def main(argc, argv): l2.truncate() if __name__ == '__main__': - main(len(sys.argv), sys.argv) \ No newline at end of file + if len(sys.argv) != 3: + print('usage: clean_corpus.py ') + exit(1) + clean_corpus(sys.argv[1], sys.argv[2]) \ No newline at end of file diff --git a/config.toml b/config.toml index 6e5bc6c..f52caff 100644 --- a/config.toml +++ b/config.toml @@ -25,4 +25,4 @@ FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa" # number of lines to be trained on (do not enclose in quotes) -TRAINING_LINES = 100000 +TRAINING_LINES = 1953934 diff --git a/lexical_training.py b/lexical_training.py index aa1a9a7..dd6ca96 100644 --- a/lexical_training.py +++ b/lexical_training.py @@ -1,9 +1,24 @@ # lexical training script +import os from check_config import check_config +from clean_corpus import clean_corpus def main(): + print("validating configuration....") config = check_config() - print("checking config is done") + + # cleaning the parallel corpus i.e. removing empty sentences, sentences only with '*', '.', or '°' + print("cleaning corpus....") + clean_corpus(config['CORPUS_SL'], config['CORPUS_TL']) + + with open(config['CORPUS_SL'], 'r') as corpus_sl: + training_lines = min(config['TRAINING_LINES'], len(corpus_sl.readlines())) + print('loading', training_lines, 'lines from the corpora') + + # the directory where all the intermediary outputs are stored + cache_dir = "cache-"+config['SL']+"-"+config['TL'] + os.mkdir(cache_dir) + if __name__ == '__main__': main() \ No newline at end of file diff --git a/tests/check_config_test.py b/tests/check_config_test.py index ca42963..c1d5bb3 100644 --- a/tests/check_config_test.py +++ b/tests/check_config_test.py @@ -115,7 +115,7 @@ def main(argc, argv): check_config('check_config_test.toml') exit(0) - _, _ = os.wait() + _, _ = os.wait() os.remove('check_config_test.toml')