Index: branches/weighted-transfer/apertium-weights-learner/README.md
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/README.md	(revision 72173)
+++ branches/weighted-transfer/apertium-weights-learner/README.md	(revision 72174)
@@ -63,16 +63,16 @@
 The contents of the unpruned w1x file should look like the following:
 ```
 <?xml version="1.0" encoding="UTF-8"?>
 <transfer-weights>
   <rule-group>
-    <rule comment="..." id="...">
+    <rule comment="..." id="..." md5="...">
      <pattern weight="...">
        <pattern-item lemma="..." tags="..."/>
      </pattern>
    </rule>
-    <rule comment="..." id="...">
+    <rule comment="..." id="..." md5="...">
      <pattern weight="...">
        <pattern-item lemma="..." tags="..."/>
      </pattern>
    </rule>
-    <rule comment="..." id="...">
+    <rule comment="..." id="..." md5="...">
@@ -80,7 +80,7 @@
    </rule>
  </rule-group>
  <rule-group>
-    <rule comment="..." id="...">
+    <rule comment="..." id="..." md5="...">
      <pattern weight="...">
        <pattern-item lemma="..." tags="..."/>
@@ -92,7 +92,7 @@
    </rule>
  </rule-group>
  <rule-group>
-    <rule comment="..." id="...">
+    <rule comment="..." id="..." md5="...">
      <pattern weight="...">
        <pattern-item lemma="..." tags="..."/>
Index: branches/weighted-transfer/apertium-weights-learner/coverage.py
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/coverage.py	(revision 72173)
+++ branches/weighted-transfer/apertium-weights-learner/coverage.py	(revision 72174)
@@ -105,11 +105,12 @@
     # build pattern -> rules numbers dict (rules_dict),
     # and rule number -> rule id dict (rule_id_map)
-    rules_dict, rule_id_map = {}, {}
+    rules_dict, rule_xmls, rule_id_map = {}, {}, {}
     for i, rule in enumerate(root.find('section-rules').findall('rule')):
         if 'id' in rule.attrib:
             # rule has 'id' attribute: add it to rule_id_map
             rule_id_map[str(i)] = rule.attrib['id']
+            rule_xmls[str(i)] = rule
         # build pattern
         pattern = tuple(pattern_item.attrib['n']
                             for pattern_item in rule.find('pattern').findall('pattern-item'))
@@ -131,7 +132,7 @@
     # sort rules to optimize FST building
     rules.sort()
 
-    return rules, ambiguous_rule_groups, rule_id_map
+    return rules, ambiguous_rule_groups, rule_id_map, rule_xmls
 
 def prepare(rfname):
     """
@@ -149,9 +150,9 @@
         sys.exit(1)
 
     cat_dict = get_cat_dict(transtree)
-    rules, ambiguous_rules, rule_id_map = get_rules(transtree)
+    rules, ambiguous_rules, rule_id_map, rule_xmls = get_rules(transtree)
 
-    return cat_dict, rules, ambiguous_rules, rule_id_map
+    return cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls
 
 class FST:
     """
@@ -303,7 +304,7 @@
         return tuple([len(group[0]) for group in coverage])
 
 if __name__ == "__main__":
-    cat_dict, rules, ambiguous_rules, rule_id_map = prepare('../apertium-en-es/apertium-en-es.en-es.t1x')
+    cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls = prepare('../apertium-en-es/apertium-en-es.en-es.t1x')
     pattern_FST = FST(rules)
     coverages = pattern_FST.get_lrlm('^prpers$ ^want# to$ ^wait$ ^until$ ^prpers$ ^can$ ^offer$ ^what$ ^would$ ^be$ ^totally$ ^satisfy$ ^for$ ^consumer$^.$', cat_dict)
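
For orientation, the extended prepare() signature can be consumed as in the following minimal sketch; the pair path is reused from the __main__ block above, and the printed fields are chosen only for illustration:

    import coverage

    # prepare() now returns five values; rule_xmls maps a rule's ordinal
    # number (as a string) to its <rule> element from the t1x file, for
    # every rule that carries an 'id' attribute
    cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls = \
        coverage.prepare('../apertium-en-es/apertium-en-es.en-es.t1x')

    for number, rule in rule_xmls.items():
        # rule is an Element; its attributes come straight from the t1x file
        print(number, rule_id_map[number], rule.attrib.get('comment', ''))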
+ """ line = line.lower().replace('--', '—').replace(' - ', ' — ') line = quot_re.sub('"', line) line = beforedash_re.sub(r'\1— \2', afterdash_re.sub(r'\1 —\2', line)) line = beforepunc_re.sub(r'\1 \2', afterpunc_re.sub(r'\1 \2', line)) line = numfix_re.sub(r'\1\2', line) - return line + return line.lower() if __name__ == "__main__": with open(sys.argv[1], 'r', encoding='utf-8') as ifile,\ Index: branches/weighted-transfer/apertium-weights-learner/twlconfig.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/twlconfig.py (revision 72173) +++ branches/weighted-transfer/apertium-weights-learner/twlconfig.py (revision 72174) @@ -1,5 +1,5 @@ # full path to source corpus from which to learn the rules -#source_corpus = "/home/nm/source/apertium/weighted-transfer/apertium-weights-learner/data/2007-100-special.txt" +#source_corpus = "/home/nm/source/apertium/weighted-transfer/apertium-weights-learner/data/2007-en-100.txt" source_corpus = "/home/nm/source/apertium/weighted-transfer/apertium-weights-learner/data/new-software-sample.txt" # name of apertium language pair (not translation direction) Index: branches/weighted-transfer/apertium-weights-learner/twlearner.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/twlearner.py (revision 72173) +++ branches/weighted-transfer/apertium-weights-learner/twlearner.py (revision 72174) @@ -1,6 +1,6 @@ #! /usr/bin/python3 -import re, sys, os, pipes, gc +import re, sys, os, pipes, gc, hashlib from time import perf_counter as clock from math import exp # language model handling @@ -11,7 +11,21 @@ import coverage # apertium translator pipelines from pipelines import partialTranslator, weightedPartialTranslator +from tools.simpletok import normalize +try: # see if lxml is installed + from lxml import etree + if __name__ == "__main__": + print("Using lxml library happily ever after.") + using_lxml = True +except ImportError: # it is not + import xml.etree.ElementTree as etree + if __name__ == "__main__": + print("lxml library not found. Falling back to xml.etree,\n" + "though it's highly recommended that you install lxml\n" + "as it works dramatically faster than xml.etree.") + using_lxml = False + tmpweights_fname = 'tmpweights.w1x' # regular expression to cut out a sentence @@ -23,44 +37,8 @@ # apertium token (anything between ^ and $) apertium_token_re = re.compile(r'\^(.*?)\$') -# start and finish of weights file -weights_head = '\n\n' -weights_tail = '' +whitespace_re = re.compile('\s') -# regexes used to normalize lines -# for scoring against language model -beforepunc_re = re.compile(r'([¿("/])(\w)') -afterpunc_re = re.compile(r'(\w)([;:,.!?)"/—])') -quot_re = re.compile("[«»`'“”„‘’‛]") -numfix_re = re.compile('([0-9]) ([,.:][0-9])') -beforedash_re = re.compile(r'(\W)-(\w)') -afterdash_re = re.compile(r'(\w)-(\W)') - -def normalize(line): - """ - Tokenize and graphically normalize line - for scoring it against language model. - """ - line = line.replace('--', '—').replace(' - ', ' — ') - line = quot_re.sub('"', line) - line = beforedash_re.sub(r'\1— \2', afterdash_re.sub(r'\1 —\2', line)) - line = beforepunc_re.sub(r'\1 \2', afterpunc_re.sub(r'\1 \2', line)) - line = numfix_re.sub(r'\1\2', line) - return line.lower() - -def pattern_to_xml(pattern, weight=1.): - """ - Create a string with XML representation - of weighted pattern for weigths file. 
- """ - pattern_line = ' \n'.format(weight) - for pattern_item in pattern: - parts = pattern_item.split('<', maxsplit=1) + [''] - lemma, tags = parts[0], parts[1].strip('>') - pattern_line += ' \n'.format(lemma, tags.replace('><', '.')) - pattern_line += ' \n' - return pattern_line - def load_rules(pair_data, source, target): """ Load t1x transfer rules file from pair_data folder in source-target direction. @@ -69,10 +47,10 @@ tixbasepath = os.path.join(pair_data, tixbasename) binbasepath = os.path.join(pair_data, '{}-{}'.format(source, target)) tixfname = '.'.join((tixbasepath, 't1x')) - cat_dict, rules, ambiguous_rules, rule_id_map = coverage.prepare(tixfname) + cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls = coverage.prepare(tixfname) pattern_FST = coverage.FST(rules) - return tixbasepath, binbasepath, cat_dict, pattern_FST, ambiguous_rules, rule_id_map + return tixbasepath, binbasepath, cat_dict, pattern_FST, ambiguous_rules, rule_id_map, rule_xmls def make_prefix(corpus, data_folder): """ @@ -221,21 +199,22 @@ Translate sent_line for each rule in rule_group. """ translation_list = [] - pattern = pattern_to_xml(pattern) #for each rule for focus_rule in rule_group: # create weights file favoring that rule - weights_line = weights_head + ' \n' + oroot = etree.Element('transfer-weights') + et_rulegroup = etree.SubElement(oroot, 'rule-group') for rule in rule_group: - weights_line += ' \n'.format(rule_id_map[str(rule)]) + et_rule = make_et_rule(str(rule), et_rulegroup, rule_id_map) if rule == focus_rule: - weights_line += pattern - weights_line += ' \n' - weights_line += ' \n' + weights_tail - with open(tmpweights_fname, 'w', encoding='utf-8') as wfile: - wfile.write(weights_line) + et_pattern = make_et_pattern(et_rule, pattern) + if using_lxml: + etree.ElementTree(oroot).write(tmpweights_fname, pretty_print=True, encoding='utf-8', xml_declaration=True) + else: + etree.ElementTree(oroot).write(tmpweights_fname, encoding='utf-8', xml_declaration=True) + # translate using created weights file translation = weighted_translator.translate(sent_line, tmpweights_fname) translation_list.append((focus_rule, translation)) @@ -287,8 +266,45 @@ print('Scored {} chunks, {} sentences in {:.2f}'.format(chunk_counter, sentence_counter, clock() - btime)) return ofname -def make_xml_rules(scores_fname, prefix, rule_map): +def make_et_pattern(et_rule, tokens, weight=1.): """ + Make pattern element for xml tree + with pattern-item elements. + """ + if type(tokens) == type(''): + # if tokens is str, tokenize it + tokens = apertium_token_re.findall(tokens) + et_pattern = etree.SubElement(et_rule, 'pattern') + et_pattern.attrib['weight'] = str(weight) + for token in tokens: + et_pattern_item = etree.SubElement(et_pattern, 'pattern-item') + parts = token.split('<', maxsplit=1) + [''] + lemma, tags = parts[0], parts[1].strip('>').replace('><', '.') + et_pattern_item.attrib['lemma'] = lemma + et_pattern_item.attrib['tags'] = tags + return et_pattern + +def make_et_rule(rule_number, et_rulegroup, rule_map, rule_xmls=None): + """ + Make rule element for xml tree. 
+ """ + et_rule = etree.SubElement(et_rulegroup, 'rule') + if rule_xmls is not None: + # this part is used for final weights file + # copy rule attributes from transfer file + et_rule.attrib.update(rule_xmls[rule_number].attrib) + # calculate md5 sum of rule text without whitespace + # and add it as rule attribute + rule_text = etree.tostring(rule_xmls[rule_number], encoding='unicode') + clean_rule_text = whitespace_re.sub('', rule_text) + et_rule.attrib['md5'] = hashlib.md5(clean_rule_text.encode()).hexdigest() + else: + # this part is used for temporary weights file + et_rule.attrib['id'] = rule_map[rule_number] + return et_rule + +def make_xml_rules(scores_fname, prefix, rule_map, rule_xmls): + """ Sum up the weights for each rule-pattern pair, add the result to xml weights file. """ @@ -304,30 +320,33 @@ pipe.append('sort $IN > $OUT', 'ff') pipe.copy(scores_fname, sorted_scores_fname) - with open(sorted_scores_fname, 'r', encoding='utf-8') as ifile,\ - open(ofname, 'w', encoding='utf-8') as ofile: + # create empty output xml tree + oroot = etree.Element('transfer-weights') + et_newrulegroup = etree.SubElement(oroot, 'rule-group') + + with open(sorted_scores_fname, 'r', encoding='utf-8') as ifile: # read and process the first line prev_group_number, prev_rule_number, prev_pattern, weight = ifile.readline().rstrip('\n').split('\t') total_pattern_weight = float(weight) - ofile.write(weights_head) - ofile.write(' \n \n'.format(rule_map[prev_rule_number])) + et_newrule = make_et_rule(prev_rule_number, et_newrulegroup, rule_map, rule_xmls) # read and process other lines for line in ifile: group_number, rule_number, pattern, weight = line.rstrip('\n').split('\t') if group_number != prev_group_number: - # rule group changed, flush pattern, close previuos, open new - ofile.write(pattern_to_xml(apertium_token_re.findall(prev_pattern), total_pattern_weight)) + # rule group changed: flush pattern, close previuos, open new + et_newpattern = make_et_pattern(et_newrule, prev_pattern, total_pattern_weight) + et_newrulegroup = etree.SubElement(oroot, 'rule-group') + et_newrule = make_et_rule(rule_number, et_newrulegroup, rule_map, rule_xmls) total_pattern_weight = 0. - ofile.write(' \n \n \n \n'.format(rule_map[rule_number])) elif rule_number != prev_rule_number: - # rule changed, flush pattern, close previuos rule, open new - ofile.write(pattern_to_xml(apertium_token_re.findall(prev_pattern), total_pattern_weight)) + # rule changed: flush previous pattern, create new rule + et_newpattern = make_et_pattern(et_newrule, prev_pattern, total_pattern_weight) + et_newrule = make_et_rule(rule_number, et_newrulegroup, rule_map, rule_xmls) total_pattern_weight = 0. - ofile.write(' \n \n'.format(rule_map[rule_number])) elif pattern != prev_pattern: - # pattern changed, flush previous - ofile.write(pattern_to_xml(apertium_token_re.findall(prev_pattern), total_pattern_weight)) + # pattern changed: flush previous + et_newpattern = make_et_pattern(et_newrule, prev_pattern, total_pattern_weight) total_pattern_weight = 0. 
@@ -304,30 +320,33 @@
     pipe.append('sort $IN > $OUT', 'ff')
     pipe.copy(scores_fname, sorted_scores_fname)
 
-    with open(sorted_scores_fname, 'r', encoding='utf-8') as ifile,\
-         open(ofname, 'w', encoding='utf-8') as ofile:
+    # create empty output xml tree
+    oroot = etree.Element('transfer-weights')
+    et_newrulegroup = etree.SubElement(oroot, 'rule-group')
+
+    with open(sorted_scores_fname, 'r', encoding='utf-8') as ifile:
         # read and process the first line
         prev_group_number, prev_rule_number, prev_pattern, weight = ifile.readline().rstrip('\n').split('\t')
         total_pattern_weight = float(weight)
-        ofile.write(weights_head)
-        ofile.write('  <rule-group>\n    <rule id="{}">\n'.format(rule_map[prev_rule_number]))
+        et_newrule = make_et_rule(prev_rule_number, et_newrulegroup, rule_map, rule_xmls)
 
         # read and process other lines
         for line in ifile:
             group_number, rule_number, pattern, weight = line.rstrip('\n').split('\t')
             if group_number != prev_group_number:
-                # rule group changed, flush pattern, close previous, open new
-                ofile.write(pattern_to_xml(apertium_token_re.findall(prev_pattern), total_pattern_weight))
+                # rule group changed: flush pattern, close previous, open new
+                et_newpattern = make_et_pattern(et_newrule, prev_pattern, total_pattern_weight)
+                et_newrulegroup = etree.SubElement(oroot, 'rule-group')
+                et_newrule = make_et_rule(rule_number, et_newrulegroup, rule_map, rule_xmls)
                 total_pattern_weight = 0.
-                ofile.write('    </rule>\n  </rule-group>\n  <rule-group>\n    <rule id="{}">\n'.format(rule_map[rule_number]))
             elif rule_number != prev_rule_number:
-                # rule changed, flush pattern, close previous rule, open new
-                ofile.write(pattern_to_xml(apertium_token_re.findall(prev_pattern), total_pattern_weight))
+                # rule changed: flush previous pattern, create new rule
+                et_newpattern = make_et_pattern(et_newrule, prev_pattern, total_pattern_weight)
+                et_newrule = make_et_rule(rule_number, et_newrulegroup, rule_map, rule_xmls)
                 total_pattern_weight = 0.
-                ofile.write('    </rule>\n    <rule id="{}">\n'.format(rule_map[rule_number]))
             elif pattern != prev_pattern:
-                # pattern changed, flush previous
-                ofile.write(pattern_to_xml(apertium_token_re.findall(prev_pattern), total_pattern_weight))
+                # pattern changed: flush previous
+                et_newpattern = make_et_pattern(et_newrule, prev_pattern, total_pattern_weight)
                 total_pattern_weight = 0.
 
             # add up rule-pattern weights
             total_pattern_weight += float(weight)
@@ -334,10 +353,13 @@
             prev_group_number, prev_rule_number, prev_pattern = group_number, rule_number, pattern
 
         # flush the last rule-pattern
-        ofile.write(pattern_to_xml(apertium_token_re.findall(prev_pattern), total_pattern_weight))
-        ofile.write('    </rule>\n  </rule-group>\n')
-        ofile.write(weights_tail)
+        et_newpattern = make_et_pattern(et_newrule, prev_pattern, total_pattern_weight)
 
+    if using_lxml:
+        etree.ElementTree(oroot).write(ofname, pretty_print=True, encoding='utf-8', xml_declaration=True)
+    else:
+        etree.ElementTree(oroot).write(ofname, encoding='utf-8', xml_declaration=True)
+
     print('Done in {:.2f}'.format(clock() - btime))
 
     return ofname
@@ -355,8 +377,10 @@
                                    twlconfig.data_folder)
 
     # load rules, build rule FST
-    tixbasepath, binbasepath, cat_dict, pattern_FST, ambiguous_rules, rule_id_map = \
-        load_rules(twlconfig.apertium_pair_data, twlconfig.source, twlconfig.target)
+    tixbasepath, binbasepath, cat_dict, pattern_FST, \
+        ambiguous_rules, rule_id_map, rule_xmls = \
+            load_rules(twlconfig.apertium_pair_data,
+                       twlconfig.source, twlconfig.target)
 
     # detect and store sentences with ambiguity
     ambig_sentences_fname = detect_ambiguous(tagged_fname, prefix,
@@ -375,7 +399,7 @@
     scores_fname = score_sentences(ambig_sentences_fname, model, prefix)
 
     # sum up weights for rule-pattern and make final xml
-    make_xml_rules(scores_fname, prefix, rule_id_map)
+    make_xml_rules(scores_fname, prefix, rule_id_map, rule_xmls)
 
     # clean up temporary weights file
     if os.path.exists(tmpweights_fname):
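
make_xml_rules relies on the scores file being sorted, so all weights for one (group, rule, pattern) key are adjacent and can be summed in a single pass. A toy model of that aggregation; the tab-separated layout matches the split() above, while the tokens and numbers are made up:

    from itertools import groupby

    # hypothetical sorted scores lines: group \t rule \t pattern \t weight
    lines = [
        '0\t12\t^he<prn>$ ^want<vblex>$\t0.5',
        '0\t12\t^he<prn>$ ^want<vblex>$\t0.25',
        '0\t15\t^he<prn>$ ^want<vblex>$\t0.1',
    ]
    rows = [line.split('\t') for line in lines]
    for key, group in groupby(rows, key=lambda row: tuple(row[:3])):
        group_number, rule_number, pattern = key
        # summed weight becomes the pattern's weight attribute in the w1x file
        total = sum(float(row[3]) for row in group)
        print(rule_number, pattern, total)
    # 12 ^he<prn>$ ^want<vblex>$ 0.75
    # 15 ^he<prn>$ ^want<vblex>$ 0.1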