Index: branches/weighted-transfer/apertium-weights-learner/rlister.py
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/rlister.py	(revision 71928)
+++ branches/weighted-transfer/apertium-weights-learner/rlister.py	(nonexistent)
@@ -1,38 +0,0 @@
-#! /usr/bin/python3
-
-import sys
-
-try: # see if lxml is installed
-    from lxml import etree as ET
-    print("Using lxml library happily ever after.")
-except ImportError: # it is not
-    import xml.etree.ElementTree as ET
-    print("lxml library not found. Falling back to xml.etree,\n"
-          "though it's highly recommended that you install lxml\n"
-          "as it works dramatically faster than xml.etree.")
-
-def list_rules(rfname):
-    """
-    List rules.
-    """
-    try:
-        transtree = ET.parse(rfname)
-    except FileNotFoundError:
-        print('Failed to locate rules file \'{}\'. '
-              'Have you misspelled the name?'.format(opts.rfname))
-        sys.exit(1)
-    except ET.ParseError:
-        print('Error parsing rules file \'{}\'. '
-              'Is there something wrong with it?'.format(opts.rfname))
-        sys.exit(1)
-
-    root = transtree.getroot()
-    for rnum, rule in enumerate(root.find('section-rules').findall('rule')):
-        print(rnum, rule.attrib['comment'])
-        print(' '.join(pattern_item.attrib['n'] for pattern_item in rule.find('pattern').findall('pattern-item')))
-        print()
-
-if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        sys.exit(1)
-    list_rules(sys.argv[1])

Property changes on: branches/weighted-transfer/apertium-weights-learner/rlister.py
___________________________________________________________________
Deleted: svn:executable
## -1 +0,0 ##
-*
\ No newline at end of property
Index: branches/weighted-transfer/apertium-weights-learner/pipelines.py
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/pipelines.py	(nonexistent)
+++ branches/weighted-transfer/apertium-weights-learner/pipelines.py	(revision 71929)
@@ -0,0 +1,134 @@
+from subprocess import Popen, PIPE
+
+class partialTranslator():
+    """
+    Part of an apertium pipeline going from bidix lookup
+    through generation, kept alive and queried
+    one sentence at a time using null flush.
+    """
+    def __init__(self, tixfname, binfname):
+        self.autobil = Popen(['lt-proc', '-b', '-z',
+                              binfname + '.autobil.bin'
+                             ],
+                             stdin = PIPE, stdout = PIPE)
+        self.transfer = Popen(['apertium-transfer', '-b', '-z',
+                               tixfname + '.t1x',
+                               binfname + '.t1x.bin'
+                              ],
+                              stdin = self.autobil.stdout, stdout = PIPE)
+        self.interchunk = Popen(['apertium-interchunk', '-z',
+                                 tixfname + '.t2x',
+                                 binfname + '.t2x.bin'
+                                ],
+                                stdin = self.transfer.stdout, stdout = PIPE)
+        self.postchunk = Popen(['apertium-postchunk', '-z',
+                                tixfname + '.t3x',
+                                binfname + '.t3x.bin'
+                               ],
+                               stdin = self.interchunk.stdout, stdout = PIPE)
+        self.autogen = Popen(['lt-proc', '-g', '-z',
+                              binfname + '.autogen.bin'
+                             ],
+                             stdin = self.postchunk.stdout, stdout = PIPE)
+
+    def translate(self, string):
+        if isinstance(string, bytes):
+            string = string.decode('utf-8')
+        # append the trailing superblanks a deformatter would emit
+        string = string.strip() + '[][\n]'
+        bstring = string.encode('utf-8')
+
+        self.autobil.stdin.write(bstring)
+        self.autobil.stdin.write(b'\0')
+        self.autobil.stdin.flush()
+
+        char = self.autogen.stdout.read(1)
+        output = []
+        while char and char != b'\0':
+            output.append(char)
+            char = self.autogen.stdout.read(1)
+
+        return (b''.join(output)).decode('utf-8').replace('[][\n]', '')
+
+class weightedPartialTranslator():
+    """
+    The same pipeline without the transfer step:
+    apertium-transfer is restarted inside translate()
+    so that it picks up a fresh weights file each time.
+    """
+    def __init__(self, tixfname, binfname):
+        self.tixfname = tixfname
+        self.binfname = binfname
+
+        self.autobil = Popen(['lt-proc', '-b', '-z',
+                              binfname + '.autobil.bin'
+                             ],
+                             stdin = PIPE, stdout = PIPE)
+
+        # transfer is deliberately missing here:
+        # it is launched per call in translate() with the supplied weights file
+
+        self.interchunk = Popen(['apertium-interchunk', '-z',
+                                 tixfname + '.t2x',
+                                 binfname + '.t2x.bin'
+                                ],
+                                stdin = PIPE, stdout = PIPE)
+        self.postchunk = Popen(['apertium-postchunk', '-z',
+                                tixfname + '.t3x',
+                                binfname + '.t3x.bin'
+                               ],
+                               stdin = self.interchunk.stdout, stdout = PIPE)
+        self.autogen = Popen(['lt-proc', '-g', '-z',
+                              binfname + '.autogen.bin'
+                             ],
+                             stdin = self.postchunk.stdout, stdout = PIPE)
+
+    def translate(self, string, wixfname):
+        # start null flush pipeline
+        if isinstance(string, bytes):
+            string = string.decode('utf-8')
+        string = string.strip() + '[][\n]'
+        bstring = string.encode('utf-8')
+
+        self.autobil.stdin.write(bstring)
+        self.autobil.stdin.write(b'\0')
+        self.autobil.stdin.flush()
+
+        char = self.autobil.stdout.read(1)
+        autobil_output = []
+        while char and char != b'\0':
+            autobil_output.append(char)
+            char = self.autobil.stdout.read(1)
+
+        # make weighted transfer
+        transfer = Popen(['apertium-transfer', '-bw',
+                          wixfname,
+                          self.tixfname + '.t1x',
+                          self.binfname + '.t1x.bin'
+                         ],
+                         stdin = PIPE, stdout = PIPE)
+
+        transfer_output, err = transfer.communicate(b''.join(autobil_output))
+
+        # resume null flush pipeline
+        self.interchunk.stdin.write(transfer_output)
+        self.interchunk.stdin.write(b'\0')
+        self.interchunk.stdin.flush()
+
+        char = self.autogen.stdout.read(1)
+        autogen_output = []
+        while char and char != b'\0':
+            autogen_output.append(char)
+            char = self.autogen.stdout.read(1)
+
+        return (b''.join(autogen_output)).decode('utf-8').replace('[][\n]', '')
+
+if __name__ == "__main__":
+    t = weightedPartialTranslator('../apertium-en-es/apertium-en-es.en-es', '../apertium-en-es/en-es')
+
+    with open('./tests/testfile.txt', 'r', encoding='utf-8') as ifile:
+        for line in ifile:
+            print('line:', line)
+            mo = t.translate(line, '../apertium-en-es/apertium-en-es.en-es.w1x')
+            print('mo:', mo)
+            print()
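
Both classes rely on the null-flush mode (-z) of the tools they wrap: the caller writes a NUL byte after each block of input, every process in the chain flushes its buffers on seeing it, and the answer is read back up to the NUL that comes out the other end. This is what lets one long-lived pipeline serve a whole corpus sentence by sentence instead of respawning five processes per call. A minimal sketch of that round trip against a single lt-proc process (the .autobil.bin path is illustrative, not from this commit):

    from subprocess import Popen, PIPE

    # illustrative path; use a compiled dictionary from an installed pair
    proc = Popen(['lt-proc', '-b', '-z', 'en-es.autobil.bin'],
                 stdin=PIPE, stdout=PIPE)

    def query(text):
        proc.stdin.write(text.encode('utf-8') + b'\0')
        proc.stdin.flush()
        answer = []
        char = proc.stdout.read(1)
        while char and char != b'\0':  # NUL marks the end of this answer
            answer.append(char)
            char = proc.stdout.read(1)
        return b''.join(answer).decode('utf-8')
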
Index: branches/weighted-transfer/apertium-weights-learner/tools/rlister.py
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/tools/rlister.py	(nonexistent)
+++ branches/weighted-transfer/apertium-weights-learner/tools/rlister.py	(revision 71929)
@@ -0,0 +1,40 @@
+#! /usr/bin/python3
+
+import sys
+
+try:
+    # see if lxml is installed
+    from lxml import etree as ET
+    print("Using lxml library happily ever after.", file=sys.stderr)
+except ImportError:
+    # it is not
+    import xml.etree.ElementTree as ET
+    print("lxml library not found. Falling back to xml.etree,\n"
+          "though it's highly recommended that you install lxml\n"
+          "as it works dramatically faster than xml.etree.", file=sys.stderr)
+
+def list_rules(rfname):
+    """
+    Print the number, id, comment and pattern of each rule in rfname.
+    """
+    try:
+        transtree = ET.parse(rfname)
+    except FileNotFoundError:
+        print('Failed to locate rules file \'{}\'. '
+              'Have you misspelled the name?'.format(rfname))
+        sys.exit(1)
+    except ET.ParseError:
+        print('Error parsing rules file \'{}\'. '
+              'Is there something wrong with it?'.format(rfname))
+        sys.exit(1)
+
+    root = transtree.getroot()
+    for rnum, rule in enumerate(root.find('section-rules').findall('rule')):
+        rule_id = rule.attrib.get('id', 'no id')
+        print(rnum+1, rule_id, rule.attrib['comment'], sep=' / ')
+        print(' '.join(pattern_item.attrib['n'] for pattern_item in rule.find('pattern').findall('pattern-item')))
+        print()
+
+if __name__ == "__main__":
+    if len(sys.argv) == 2:
+        list_rules(sys.argv[1])

Property changes on: branches/weighted-transfer/apertium-weights-learner/tools/rlister.py
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
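
Given a section-rules entry in a t1x file shaped like the following (the rule itself is made up for illustration):

    <rule comment="REGLA: DET NOM" id="det-nom">
      <pattern>
        <pattern-item n="det"/>
        <pattern-item n="nom"/>
      </pattern>
      <action>...</action>
    </rule>

the lister prints one block per rule:

    1 / det-nom / REGLA: DET NOM
    det nom
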
""" pattern_line = ' \n'.format(weight) for pattern_item in pattern: @@ -75,7 +92,7 @@ # make output file name ofname = '{}-tagged.txt'.format(prefix) - # create partial pipeline + # create pipeline pipe = pipes.Template() pipe.append('apertium -d "{}" {}-{}-tagger'.format(pair_data, source, target), '--') pipe.append('apertium-pretransfer', '--') @@ -86,24 +103,6 @@ print('Done in {:.2f}'.format(clock() - btime)) return ofname -def clean_tagged(corpus, prefix): - """ - Tokenize, lowercase and clean up tagged corpus. - """ - print('Cleaning up tagged source corpus.') - btime = clock() - - # make output file name - ofname = '{}-tagged-clean.txt'.format(prefix) - - with open(corpus, 'r', encoding='utf-8') as ifile, \ - open(ofname, 'w', encoding='utf-8') as ofile: - for line in ifile: - ofile.write(inter_re.sub('$ ^', line.replace('"', '').lower())) - - print('Done in {:.2f}'.format(clock() - btime)) - return ofname - def search_ambiguous(ambiguous_rules, coverage): """ Look for patterns covered by one of the ambiguous rules in ambiguous_rules. @@ -117,6 +116,9 @@ def detect_ambiguous(corpus, prefix, cat_dict, pattern_FST, ambiguous_rules, tixfname, binfname, rule_id_map): """ + Find sentences that contain ambiguous chunks. + Translate them in all possible ways. + Store the results. """ print('Looking for ambiguous sentences and translating them.') btime = clock() @@ -123,32 +125,38 @@ # make output file name ofname = '{}-ambiguous.txt'.format(prefix) - #sfname = '{}-sentences.txt'.format(prefix) + # initialize translator for translation with no weights + translator = partialTranslator(tixfname, binfname) + weighted_translator = weightedPartialTranslator(tixfname, binfname) + + # initialize statistics lines_count, total_sents_count, ambig_sents_count, ambig_chunks_count = 0, 0, 0, 0 botched_coverages = 0 lbtime = clock() with open(corpus, 'r', encoding='utf-8') as ifile, \ - open(ofname, 'w', encoding='utf-8') as ofile: #,\ - #open(sfname, 'w', encoding='utf-8') as sfile: + open(ofname, 'w', encoding='utf-8') as ofile: for line in ifile: lines_count += 1 if lines_count % 1000 == 0: - print('\n{} total lines\n{} total sentences\n{} ambiguous sentences\n{} ambiguous chunks\n{} botched coverages\nanother {:.4f} elapsed'.format(lines_count, total_sents_count, ambig_sents_count, ambig_chunks_count, botched_coverages, clock() - lbtime)) + print('\n{} total lines\n{} total sentences'.format(lines_count, total_sents_count)) + print('{} ambiguous sentences\n{} ambiguous chunks'.format(ambig_sents_count, ambig_chunks_count)) + print('{} botched coverages\nanother {:.4f} elapsed'.format(botched_coverages, clock() - lbtime)) gc.collect() lbtime = clock() + # look at each sentence in line for sent_match in sent_re.finditer(line.strip()): total_sents_count += 1 - #print(total_sents_count, sent_match.group(0)) + # get coverages coverage_list = pattern_FST.get_lrlm(sent_match.group(0), cat_dict) if coverage_list == []: botched_coverages += 1 else: + # look for ambiguous chunks coverage_item = coverage_list[0] - # look for ambiguous chunks pattern_list = search_ambiguous(ambiguous_rules, coverage_item) if pattern_list != []: #print(coverage_item) @@ -175,14 +183,13 @@ # first, translate each segment with default rules for sentence_segment in sentence_segments: - sentence_segment.append(translate(sentence_segment[2], tixfname, binfname)) + sentence_segment.append(apertium_re.sub('', translator.translate(sentence_segment[2]))) - # second, translate each segment with all the rules + # second, translate each 
@@ -123,32 +125,38 @@
 
     # make output file name
     ofname = '{}-ambiguous.txt'.format(prefix)
-    #sfname = '{}-sentences.txt'.format(prefix)
 
+    # initialize translators, with and without transfer weights
+    translator = partialTranslator(tixfname, binfname)
+    weighted_translator = weightedPartialTranslator(tixfname, binfname)
+
+    # initialize statistics
     lines_count, total_sents_count, ambig_sents_count, ambig_chunks_count = 0, 0, 0, 0
     botched_coverages = 0
     lbtime = clock()
 
     with open(corpus, 'r', encoding='utf-8') as ifile, \
-         open(ofname, 'w', encoding='utf-8') as ofile: #,\
-         #open(sfname, 'w', encoding='utf-8') as sfile:
+         open(ofname, 'w', encoding='utf-8') as ofile:
         for line in ifile:
             lines_count += 1
             if lines_count % 1000 == 0:
-                print('\n{} total lines\n{} total sentences\n{} ambiguous sentences\n{} ambiguous chunks\n{} botched coverages\nanother {:.4f} elapsed'.format(lines_count, total_sents_count, ambig_sents_count, ambig_chunks_count, botched_coverages, clock() - lbtime))
+                print('\n{} total lines\n{} total sentences'.format(lines_count, total_sents_count))
+                print('{} ambiguous sentences\n{} ambiguous chunks'.format(ambig_sents_count, ambig_chunks_count))
+                print('{} botched coverages\nanother {:.4f} elapsed'.format(botched_coverages, clock() - lbtime))
                 gc.collect()
                 lbtime = clock()
 
+            # look at each sentence in the line
             for sent_match in sent_re.finditer(line.strip()):
                 total_sents_count += 1
-                #print(total_sents_count, sent_match.group(0))
+                # get coverages
                 coverage_list = pattern_FST.get_lrlm(sent_match.group(0), cat_dict)
                 if coverage_list == []:
                     botched_coverages += 1
                 else:
+                    # look for ambiguous chunks
                     coverage_item = coverage_list[0]
-                    # look for ambiguous chunks
                     pattern_list = search_ambiguous(ambiguous_rules, coverage_item)
                     if pattern_list != []:
                         #print(coverage_item)
 
@@ -175,14 +183,13 @@
             # first, translate each segment with default rules
             for sentence_segment in sentence_segments:
-                sentence_segment.append(translate(sentence_segment[2], tixfname, binfname))
+                sentence_segment.append(apertium_re.sub('', translator.translate(sentence_segment[2])))
 
-            # second, translate each segment with all the rules
+            # second, translate each segment with each of the rules,
             # and make full sentence, where other segments are translated with default rules
             for j, sentence_segment in enumerate(sentence_segments):
-                translation_list = translate_ambiguous(ambiguous_rules[sentence_segment[0]],
-                                                       sentence_segment[1], sentence_segment[2],
-                                                       tixfname, binfname, rule_id_map)
+                translation_list = translate_ambiguous(weighted_translator, ambiguous_rules[sentence_segment[0]],
+                                                       sentence_segment[1], sentence_segment[2], rule_id_map)
                 output_list = []
                 for rule, translation in translation_list:
                     translated_sentence = ' '.join(sentence_segment[3] for sentence_segment in sentence_segments[:j]) +\
@@ -189,6 +196,7 @@
                                           ' ' + translation + ' ' +\
                                           ' '.join(sentence_segment[3] for sentence_segment in sentence_segments[j+1:])
                     output_list.append('{}\t{}'.format(rule, translated_sentence.strip(' ')))
 
+                # store results to a file
                 # first, print rule group number, pattern, and number of rules in the group
                 print('{}\t^{}$\t{}'.format(sentence_segment[0], '$ ^'.join(sentence_segment[1]), len(output_list)), file=ofile)
@@ -197,32 +205,8 @@
     print('Done in {:.2f}'.format(clock() - btime))
     return ofname
 
-def translate(sent_line, tixfname, binfname, weightsfname=None):
+def translate_ambiguous(weighted_translator, rule_group, pattern, sent_line, rule_id_map):
     """
-    Translate sent_line using given weights file.
-    """
-    # create pipeline
-    pipe = pipes.Template()
-    pipe.append('lt-proc -b {}'.format('.'.join((binfname, 'autobil.bin'))), '--')
-    # use weights file
-    if weightsfname is not None:
-        pipe.append('apertium-transfer -bw {} {} {}'.format(weightsfname, '.'.join((tixfname, 't1x')), '.'.join((binfname, 't1x.bin'))), '--')
-    # do not use weights file
-    else:
-        pipe.append('apertium-transfer -b {} {}'.format('.'.join((tixfname, 't1x')), '.'.join((binfname, 't1x.bin'))), '--')
-    pipe.append('apertium-interchunk {} {}'.format('.'.join((tixfname, 't2x')), '.'.join((binfname, 't2x.bin'))), '--')
-    pipe.append('apertium-postchunk {} {}'.format('.'.join((tixfname, 't3x')), '.'.join((binfname, 't3x.bin'))), '--')
-    pipe.append('lt-proc -g {}'.format('.'.join((binfname, 'autogen.bin'))), '--')
-    pipe.append('apertium-retxt', '--')
-
-    # translate
-    pipefile = pipe.open('pipefile', 'w')
-    pipefile.write(sent_line)
-    pipefile.close()
-    return apertium_re.sub('', open('pipefile').read().lower())
-
-def translate_ambiguous(rule_group, pattern, sent_line, tixfname, binfname, rule_id_map):
-    """
     Translate sent_line for each rule in rule_group.
     """
     translation_list = []
@@ -238,17 +222,18 @@
         weights_line += pattern
         weights_line += '    </rule>\n'
         weights_line += '  </rule-group>\n' + weights_tail
-        with open('tmpweights.w1x', 'w', encoding='utf-8') as wfile:
+        with open(tmpweights_fname, 'w', encoding='utf-8') as wfile:
             wfile.write(weights_line)
 
         # translate using created file
-        translation = translate(sent_line, tixfname, binfname, 'tmpweights.w1x')
+        translation = apertium_re.sub('', weighted_translator.translate(sent_line, tmpweights_fname))
         translation_list.append((focus_rule, translation))
+
    return translation_list
 
 def score_sentences(ambig_sentences_fname, model, prefix):
     """
-    Score translated sentences.
+    Score translated sentences against the language model.
""" print('Scoring ambiguous sentences.') btime, chunk_counter, sentence_counter = clock(), 0, 0 @@ -298,7 +283,7 @@ sorted_scores_fname = '{}-chunk-weights-sorted.txt'.format(prefix) ofname = '{}-rule-weights.w1x'.format(prefix) - # create pipe + # create pipeline pipe = pipes.Template() pipe.append('sort $IN > $OUT', 'ff') pipe.copy(scores_fname, sorted_scores_fname) @@ -305,10 +290,13 @@ with open(sorted_scores_fname, 'r', encoding='utf-8') as ifile,\ open(ofname, 'w', encoding='utf-8') as ofile: + # read and process the first line prev_group_number, prev_rule_number, prev_pattern, weight = ifile.readline().rstrip('\n').split('\t') total_pattern_weight = float(weight) ofile.write(weights_head) ofile.write(' \n \n'.format(rule_map[prev_rule_number])) + + # read and process other lines for line in ifile: group_number, rule_number, pattern, weight = line.rstrip('\n').split('\t') if pattern != prev_pattern: @@ -324,6 +312,9 @@ # add up rule-pattern weights total_pattern_weight += float(weight) prev_group_number, prev_rule_number, prev_pattern = group_number, rule_number, pattern + + # flush the last rule-pattern + ofile.write(pattern_to_xml(apertium_token_re.findall(prev_pattern), total_pattern_weight)) ofile.write(' \n \n') ofile.write(weights_tail) @@ -343,9 +334,6 @@ twlconfig.source_corpus, prefix, twlconfig.data_folder) - # clean up tagged corpus - #clean_fname = clean_tagged(tagged_fname, prefix) - # load rules, build rule FST tixbasepath, binbasepath, cat_dict, pattern_FST, ambiguous_rules, rule_id_map = \ load_rules(twlconfig.apertium_pair_data, twlconfig.source, twlconfig.target) @@ -369,4 +357,8 @@ # sum up weigths for rule-pattern and make final xml make_xml_rules(scores_fname, prefix, rule_id_map) + # clean up + if os.path.exists(tmpweights_fname): + os.remove(tmpweights_fname) + print('Performed in {:.2f}'.format(clock() - tbtime))