commit 6b0079ecf254907c1ed00248f5deb2a67616262a Author: vivekvardhanadepu Date: Sat Jun 12 14:39:39 2021 +0530 added code for alignment diff --git a/check_config.py b/check_config.py index 2455282..60b77db 100644 --- a/check_config.py +++ b/check_config.py @@ -52,15 +52,15 @@ def check_config(filename='config.toml'): "provide a valid directory or \nto install, follow", lex_tools_url, '\n') misconfigured = True - if not os.path.isdir(config['FAST_ALIGN']): - print("'"+config['FAST_ALIGN']+"'(FAST_ALIGN)", "is not a directory, provide"+ \ - " a valid directory or \nto install, follow", fast_align_url, '\n') + if not os.path.isfile(config['FAST_ALIGN']): + print("'"+config['FAST_ALIGN']+"'(FAST_ALIGN)", "is not a file, provide"+ \ + " a valid executable or \nto install, follow", fast_align_url, '\n') misconfigured = True - else: - if 'fast_align' not in os.listdir(config['FAST_ALIGN']): - print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \ - "provide a valid directory or \nto install, follow", fast_align_url, '\n') - misconfigured = True + # else: + # if 'fast_align' not in os.listdir(config['FAST_ALIGN']): + # print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \ + # "provide a valid directory or \nto install, follow", fast_align_url, '\n') + # misconfigured = True if not os.path.isdir(config['LANG_DATA']): print("'"+config['LANG_DATA']+"'(LANG_DATA)", "is not a directory, provide a valid "+ \ diff --git a/config.toml b/config.toml index 6e94ceb..7684198 100644 --- a/config.toml +++ b/config.toml @@ -18,8 +18,8 @@ CORPUS_TL = "europarl-v7.eng-spa.spa" # apertium-lex-tools scripts LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts" -# fast align build folder -FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build" +# fast align +FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build/fast_align" # apertium language data LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa" diff --git a/lexical_training.py b/lexical_training.py index e0470b6..2bfb766 100644 --- a/lexical_training.py +++ b/lexical_training.py @@ -1,9 +1,39 @@ # lexical training script import os -from subprocess import Popen, PIPE +from subprocess import Popen, PIPE, call from check_config import check_config from clean_corpus import clean_corpus +def query(question, default="yes"): + """Ask a yes/no question via raw_input() and return their answer. + + "question" is a string that is presented to the user. + "default" is the presumed answer if the user just hits . + It must be "yes" (the default), "no" or None (meaning + an answer is required of the user). + + The "answer" return value is True for "yes" or False for "no". + """ + valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} + if default is None: + prompt = " [y/n]" + elif default == "no": + prompt = " [y/N]" + else: + prompt = " [Y/n]" + default= "yes" + + while True: + print(question + prompt+"(default='"+default+"')?") + choice = input().lower() + if default is not None and choice == "": + return valid[default] + elif choice in valid: + return valid[choice] + else: + print("Please respond with 'yes', 'no', 'y' or 'n'") + exit(1) + def main(): print("validating configuration....") config = check_config() @@ -12,18 +42,29 @@ def main(): print("cleaning corpus....") # clean_corpus(config['CORPUS_SL'], config['CORPUS_TL']) + # file names + cache_dir = "cache-"+config['SL']+"-"+config['TL'] + training_log_name = cache_dir+'/'+'training.log' + sl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['SL'] + tl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['TL'] + lines = cache_dir+'/'+config['CORPUS']+'.lines' + tagged_merged = cache_dir+'/'+config['CORPUS']+'.tagged-merged.'+config['SL']+'-'+config['TL'] + alignment = cache_dir+'/'+config['CORPUS']+'.align.'+config['SL']+'-'+config['TL'] + with open(config['CORPUS_SL'], 'r') as corpus_sl: training_lines = min(config['TRAINING_LINES'], len(corpus_sl.readlines())) print('loading', training_lines, 'lines from the corpora') # the directory where all the intermediary outputs are stored - cache_dir = "cache-"+config['SL']+"-"+config['TL'] if not os.path.isdir(cache_dir): os.mkdir(cache_dir) + else: + if not query("Do you want to overwrite the files in "+"'"+cache_dir+"'"): + print("remove", cache_dir, "and re-run lexical_training.py") + exit(1) - training_log_name = cache_dir+'/'+'training.log' - if os.path.isdir(training_log_name): + if os.path.isfile(training_log_name): os.remove(training_log_name) training_log = open(training_log_name, 'a') @@ -33,32 +74,67 @@ def main(): with open(config['CORPUS_SL']) as f: p1 = Popen(c1, stdin=f, stdout=PIPE, stderr=training_log) - c2 = ['apertium-destxt'] - p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log) + # c2 = ['apertium-destxt'] + # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log) c3 = ['apertium', '-d', config['LANG_DATA'], config['SL']+'-'+config['TL']+'-tagger'] - p3 = Popen(c3, stdin=p2.stdout, stdout=PIPE, stderr=training_log) + p3 = Popen(c3, stdin=p1.stdout, stdout=PIPE, stderr=training_log) - c4 = ['apertium-pretransfer'] - sl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['SL'] + c4 = ['sed', 's/ \+/ /g'] + p4 = Popen(c4, stdin=p3.stdout, stdout=PIPE, stderr=training_log) + + c5 = ['apertium-pretransfer'] with open(sl_tagged, 'w') as f: - Popen(c4, stdin=p3.stdout, stdout=f, stderr=training_log) + Popen(c5, stdin=p4.stdout, stdout=f, stderr=training_log) # tagging the target side corpus c1 = ['head', '-n', str(config['TRAINING_LINES'])] with open(config['CORPUS_TL']) as f: p1 = Popen(c1, stdin=f, stdout=PIPE, stderr=training_log) - c2 = ['apertium-destxt'] - p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log) + # c2 = ['apertium-destxt'] + # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log) c3 = ['apertium', '-d', config['LANG_DATA'], config['TL']+'-'+config['SL']+'-tagger'] - p3 = Popen(c3, stdin=p2.stdout, stdout=PIPE, stderr=training_log) + p3 = Popen(c3, stdin=p1.stdout, stdout=PIPE, stderr=training_log) - c4 = ['apertium-pretransfer'] - tl_tagged = cache_dir+'/'+config['CORPUS']+'.tagged.'+config['TL'] + c4 = ['sed', 's/ \+/ /g'] + p4 = Popen(c4, stdin=p3.stdout, stdout=PIPE, stderr=training_log) + + c5 = ['apertium-pretransfer'] with open(tl_tagged, 'w') as f: - Popen(c4, stdin=p3.stdout, stdout=f, stderr=training_log).wait() + Popen(c5, stdin=p4.stdout, stdout=f, stderr=training_log).wait() + + # removing lines with no analyses + with open(lines, 'w+') as f0: + call(['seq', '1', str(config['TRAINING_LINES'])], stdout=f0, stderr=training_log) + clean_tagged = cache_dir+'/'+config['CORPUS']+'.clean_tagged' + with open(clean_tagged, 'w+') as f1: + p1 = Popen(['paste', lines, sl_tagged, tl_tagged], stdout=PIPE, stderr=training_log) + Popen(['grep', '<*\t*<'], stdin=p1.stdout, stdout=f1, stderr=training_log).wait() + + call(['cut', '-f', '1'], stdin=f1, stdout=f0, stderr=training_log) + + f1.seek(0) + with open(sl_tagged, 'w') as f2: + p1 = Popen(['cut', '-f', '2'], stdin=f1, stdout=PIPE, stderr=training_log) + p2 = Popen(['sed', 's/ /~/g'], stdin=p1.stdout, stdout=PIPE, stderr=training_log) + Popen(['sed', 's/\$[^\^]*/$ /g'], stdin=p2.stdout, stdout=f2, stderr=training_log) + + f1.seek(0) + with open(tl_tagged, 'w') as f2: + p1 = Popen(['cut', '-f', '3'], stdin=f1, stdout=PIPE, stderr=training_log) + p2 = Popen(['sed', 's/ /~/g'], stdin=p1.stdout, stdout=PIPE, stderr=training_log) + Popen(['sed', 's/\$[^\^]*/$ /g'], stdin=p2.stdout, stdout=f2, stderr=training_log).wait() + + os.remove(clean_tagged) + + # aligning the parallel corpus + with open(tagged_merged, 'w+') as f: + with open(os.devnull, 'r') as f1: + call(['paste', '-d', '||| ', tl_tagged, '-', '-', '-', sl_tagged], stdin=f1, stdout=f, stderr=training_log) + with open(alignment, 'w') as f2: + call([config['FAST_ALIGN'], '-i', tagged_merged, '-d', '-o', '-v'], stdout=f2, stderr=training_log) training_log.close() diff --git a/tests/check_config_test.py b/tests/check_config_test.py index c1d5bb3..3f678d0 100644 --- a/tests/check_config_test.py +++ b/tests/check_config_test.py @@ -56,8 +56,8 @@ def main(argc, argv): if os.path.isfile(os.path.join(config['LEX_TOOLS'], 'process-tagger-output')): shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc')) - if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')): - shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc')) + # if os.path.isfile(os.path.join(config['FAST_ALIGN'], 'fast_align')): + # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'), os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc')) if os.fork() == 0: with open('check_config_test.toml', 'w') as test_file: @@ -69,7 +69,7 @@ def main(argc, argv): shutil.move(os.path.join(config['LEX_TOOLS'], 'process-tagger-output'+'abc'), os.path.join(config['LEX_TOOLS'], 'process-tagger-output')) - shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align')) + # shutil.move(os.path.join(config['FAST_ALIGN'], 'fast_align'+'abc'), os.path.join(config['FAST_ALIGN'], 'fast_align')) for path in os.environ["PATH"].split(os.pathsep): if os.path.isfile(os.path.join(path, 'apertium'+'abc')):