Index: branches/weighted-transfer/apertium-weights-learner/README.md
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/README.md	(revision 71869)
+++ branches/weighted-transfer/apertium-weights-learner/README.md	(revision 71870)
@@ -41,6 +41,3 @@
 ```
 ./twlearner.py
 ```
-
-## Known issues
-So far, learner script runs out of memory or something at approx. 7500 input lines while looking for ambiguous sentences and translating them and gets killed by the system.
Index: branches/weighted-transfer/apertium-weights-learner/coverage.py
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/coverage.py	(revision 71869)
+++ branches/weighted-transfer/apertium-weights-learner/coverage.py	(revision 71870)
@@ -100,204 +100,15 @@
         return cat_list
     return default_cat
 
-def get_pattern_FST(transtree):
+def process_line(line, cat_dict):
     """
-    From xml tree with transfer rules,
-    build an improvised pattern FST using nested dictionaries.
-    """
-    root = transtree.getroot()
-    pattern_FST = [{}, None]
-    ambiguous_rules = {}
-    for i, rule in enumerate(root.find('section-rules').findall('rule')):
-        curr_level = pattern_FST
-        for pattern_item in rule.find('pattern').findall('pattern-item'):
-            item_cat = pattern_item.attrib['n']
-            if not item_cat in curr_level[0]:
-                curr_level[0][item_cat] = [{}, None]
-            curr_level = curr_level[0][item_cat]
-        if curr_level[1] is not None:
-            # the rule is ambiguous as the pattern has already been seen
-            ambiguous_rules.setdefault(curr_level[1][0], [(curr_level[1][0], curr_level[1][2])])
-            ambiguous_rules[curr_level[1][0]].append((str(i), rule.attrib.get('id', '')))
-        else:
-            curr_level[1] = (str(i), rule.attrib.get('comment', ''), rule.attrib.get('id', ''))
-    return pattern_FST, ambiguous_rules
-
-def rebuild_pattern_r(pattern_FST):
-    """
-    Recursively rebuild all patterns from pattern FST, just in case.
-    """
-    rule_list = []
-    if pattern_FST[0] != {}:
-        for pattern_item in pattern_FST[0]:
-            pattern_list = [[pattern_item] + pattern_tail
-                                for pattern_tail in rebuild_pattern_r(pattern_FST[0][pattern_item])]
-            rule_list.extend(pattern_list)
-    if pattern_FST[1] is not None:
-        rule_list.append([pattern_FST[1]])
-    return rule_list
-
-def output_patterns(pattern_FST):
-    """
-    Output all patterns to file in linear fashion.
-    """
-    pattern_list = rebuild_pattern_r(pattern_FST)
-    pattern_list.sort(key=lambda x: int(x[-1][0]))
-    with open('rules.txt', 'w', encoding='utf-8') as rfile:
-        for pattern in pattern_list:
-            rfile.write('{: <4}\n {}\n {}\n'.format(pattern[-1][0], ' '.join(pattern[:-1]), pattern[-1][1]))
-
-def calculate_coverage_r(pattern_FST, line, state):
-    """
-    Recursively find all possible pattern combinations
-    for preprocessed line where each word has a list
-    of categories assigned to it.
-
-    Output is a list of lists, each list consists
-    of elements of (type, word/num, cat/comm),
-    where type is either 'w' (word) or 'r' (rule).
-
-    If type is 'w' then the next two items are the word
-    and the category assigned to it respectively.
-
-    If type is 'r' then the next two items are the rule
-    number and its commentary from the xml representation.
-
-    The rule entry in the list marks the end of pattern
-    for this rule.
-
-    line is current line with ambiguous categories assigned
-    to words, state is a state of our makeshift FST represented
-    by its level.
-    """
-    # the end of the line
-    if not line:
-        # check if it's also the end of pattern
-        if state[1] is not None:
-            return [[('r',) + state[1]]]
-        return []
-
-    coverage_list = []
-    current_item = line[0]
-
-    # continue the pattern for each category assigned to current word
-    for cat in (current_item[1] & set(state[0].keys())):
-        pattern_list = [[('w', current_item[0], cat)] + pattern_tail
-                            for pattern_tail
-                                in calculate_coverage_r(
-                                       pattern_FST,
-                                       line[1:],
-                                       state[0][cat])]
-        coverage_list.extend(pattern_list)
-
-    # check if it can be an end of the pattern
-    if state[1] is not None:
-        # if so, also try to start new pattern from here
-        pattern_list = [[('r',) + state[1]] + pattern_tail
-                            for pattern_tail
-                                in calculate_coverage_r(
-                                       pattern_FST,
-                                       line,
-                                       pattern_FST)]
-        coverage_list.extend(pattern_list)
-    return coverage_list
-
-def parse_coverage_list(coverage_list):
-    """
-    Get list of lists representing coverages
-    (as output by calculate_coverage_r) and return
-    a list of regrouped coverages.
-    """
-    return [parse_coverage(coverage) for coverage in coverage_list]
-
-def parse_coverage(coverage):
-    """
-    Get a list representing one coverage
-    (as output by calculate_coverage_r) and return
-    a list where elements are groups (list_of_words, rule).
-    """
-    groups = []
-    current_group = []
-    for token in coverage:
-        if token[0] == 'w':
-            current_group.append(token[1])
-        elif token[0] == 'r':
-            groups.append((current_group, token[1:]))
-            current_group = []
-    return groups
-
-def coverages_to_groups(coverage_list):
-    output_list = []
-    for coverage in coverage_list:
-        output_list.append(coverage_to_groups(coverage))
-    return output_list
-
-def output_all_coverages(coverage_list, output_stream):
-    for coverage in coverage_list:
-        output_stream.write(coverage_to_groups(coverage) + '\n')
-    output_stream.write('\n')
-
-def coverage_to_groups(coverage):
-    """
-    Output coverage with groups.
-    """
-    output_str = ''
-    for group in coverage:
-        output_str = output_str + \
-            ' ({} {})'.format(group[1][0], ' '.join(group[0]))
-    return output_str.strip()
-
-def signature(coverage):
-    """
-    Get coverage signature which is just a tuple
-    of lengths of groups comprising the coverage.
-    """
-    return tuple([len(group[0]) for group in coverage])
-
-def get_LRLM(coverage_list):
-    """
-    Get only LRLM coverages from list of all coverages
-    by sorting them lexicographycally by their signatures.
-    """
-    sorted_list = sorted(coverage_list, key=signature, reverse=True)
-    signature_max = signature(sorted_list[0])
-    LRLM_list = []
-    for item in sorted_list:
-        if signature(item) == signature_max:
-            LRLM_list.append(item)
-        else:
-            return LRLM_list
-    return LRLM_list
-
-def process_line(line, cat_dict, pattern_FST, output_stream, out_all, out_lrlm, print_out=True):
-    """
     Get line in stream format and print all coverages and LRLM only.
     """
-    if print_out:
-        output_stream.write(line + '\n')
-    btime = clock()
     line = get_cats_by_line(line, cat_dict)
-    #print('{:f} cats'.format(clock() - btime))
-    btime = clock()
-    coverage_list = calculate_coverage_r(pattern_FST, line, pattern_FST)
-    parsed_coverages = []
-    if coverage_list != []:
-        if out_all:
-            parsed_coverages = parse_coverage_list(coverage_list)
-            if print_out:
-                output_stream.write('All coverages:\n')
-                output_all_coverages(parsed_coverages, output_stream)
-        if out_lrlm:
-            parsed_coverages = get_LRLM(parse_coverage_list(coverage_list))
-            if print_out:
-                output_stream.write('LRLM only:\n')
-                output_all_coverages(parsed_coverages, output_stream)
-    else:
-        if print_out:
-            output_stream.write('No coverage found\n')
-    #print('{:f} coverages'.format(clock() - btime))
-    return parsed_coverages, line
+    print(line)
+    return line
+
 def get_options():
     """
     Parse commandline arguments
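The functions deleted in this hunk implemented coverage search as a recursive walk over a makeshift FST of nested dictionaries, where each node is a pair `[transitions, rule_info]`. For orientation, here is a minimal self-contained sketch of that removed data structure; the categories and rule numbers are toy values, not real `.t1x` content:

```python
# Toy illustration of the removed nested-dict pattern FST.
# Each node is a pair [transitions, rule_info]; rule_info is filled
# in when some rule's pattern may end at that node, as in the
# deleted get_pattern_FST above.

def add_pattern(fst, pattern, rule_info):
    node = fst
    for cat in pattern:
        node = node[0].setdefault(cat, [{}, None])
    if node[1] is None:  # first rule with this pattern wins
        node[1] = rule_info

pattern_fst = [{}, None]
add_pattern(pattern_fst, ['adj', 'nom'], ('0', 'adj-nom rule'))
add_pattern(pattern_fst, ['adj'], ('1', 'bare adj rule'))

node = pattern_fst
for cat in ['adj', 'nom']:
    node = node[0][cat]
print(node[1])  # -> ('0', 'adj-nom rule')
```

Enumerating every possible segmentation over this trie recursively appears to be what exhausted memory on long inputs (see the "Known issues" section removed from the README above); the revision replaces it with the flat, state-numbered `FST` class added further down.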
- """ - # the end of the line - if not line: - # check if it's also the end of pattern - if state[1] is not None: - return [[('r',) + state[1]]] - return [] - - coverage_list = [] - current_item = line[0] - - # continue the pattern for each category assigned to current word - for cat in (current_item[1] & set(state[0].keys())): - pattern_list = [[('w', current_item[0], cat)] + pattern_tail - for pattern_tail - in calculate_coverage_r( - pattern_FST, - line[1:], - state[0][cat])] - coverage_list.extend(pattern_list) - - # check if it can be an end of the pattern - if state[1] is not None: - # if so, also try to start new pattern from here - pattern_list = [[('r',) + state[1]] + pattern_tail - for pattern_tail - in calculate_coverage_r( - pattern_FST, - line, - pattern_FST)] - coverage_list.extend(pattern_list) - return coverage_list - -def parse_coverage_list(coverage_list): - """ - Get list of lists representing coverages - (as output by calculate_coverage_r) and return - a list of regrouped coverages. - """ - return [parse_coverage(coverage) for coverage in coverage_list] - -def parse_coverage(coverage): - """ - Get a list representing one coverage - (as output by calculate_coverage_r) and return - a list where elements are groups (list_of_words, rule). - """ - groups = [] - current_group = [] - for token in coverage: - if token[0] == 'w': - current_group.append(token[1]) - elif token[0] == 'r': - groups.append((current_group, token[1:])) - current_group = [] - return groups - -def coverages_to_groups(coverage_list): - output_list = [] - for coverage in coverage_list: - output_list.append(coverage_to_groups(coverage)) - return output_list - -def output_all_coverages(coverage_list, output_stream): - for coverage in coverage_list: - output_stream.write(coverage_to_groups(coverage) + '\n') - output_stream.write('\n') - -def coverage_to_groups(coverage): - """ - Output coverage with groups. - """ - output_str = '' - for group in coverage: - output_str = output_str + \ - ' ({} {})'.format(group[1][0], ' '.join(group[0])) - return output_str.strip() - -def signature(coverage): - """ - Get coverage signature which is just a tuple - of lengths of groups comprising the coverage. - """ - return tuple([len(group[0]) for group in coverage]) - -def get_LRLM(coverage_list): - """ - Get only LRLM coverages from list of all coverages - by sorting them lexicographycally by their signatures. - """ - sorted_list = sorted(coverage_list, key=signature, reverse=True) - signature_max = signature(sorted_list[0]) - LRLM_list = [] - for item in sorted_list: - if signature(item) == signature_max: - LRLM_list.append(item) - else: - return LRLM_list - return LRLM_list - -def process_line(line, cat_dict, pattern_FST, output_stream, out_all, out_lrlm, print_out=True): - """ Get line in stream format and print all coverages and LRLM only. 
""" - if print_out: - output_stream.write(line + '\n') - btime = clock() line = get_cats_by_line(line, cat_dict) - #print('{:f} cats'.format(clock() - btime)) - btime = clock() - coverage_list = calculate_coverage_r(pattern_FST, line, pattern_FST) - parsed_coverages = [] - if coverage_list != []: - if out_all: - parsed_coverages = parse_coverage_list(coverage_list) - if print_out: - output_stream.write('All coverages:\n') - output_all_coverages(parsed_coverages, output_stream) - if out_lrlm: - parsed_coverages = get_LRLM(parse_coverage_list(coverage_list)) - if print_out: - output_stream.write('LRLM only:\n') - output_all_coverages(parsed_coverages, output_stream) - else: - if print_out: - output_stream.write('No coverage found\n') - #print('{:f} coverages'.format(clock() - btime)) - return parsed_coverages, line + print(line) + return line + def get_options(): """ Parse commandline arguments @@ -341,6 +152,33 @@ return opts, args +def get_rules(transtree): + """ + From xml tree with transfer rules, + build an improvised pattern FST using nested dictionaries. + """ + root = transtree.getroot() + rules = [] + rule_id_map = {} + ambiguous_rule_groups = {} + prev_pattern, rule_group = [], -1 + for i, rule in enumerate(root.find('section-rules').findall('rule')): + if 'id' in rule.attrib: + rule_id_map[str(i)] = rule.attrib['id'] + pattern = ['start'] + for pattern_item in rule.find('pattern').findall('pattern-item'): + pattern.append(pattern_item.attrib['n']) + if pattern == prev_pattern: + ambiguous_rule_groups.setdefault(str(rule_group), {str(rule_group)}) + ambiguous_rule_groups[str(rule_group)].add(str(i)) + else: + rules.append(tuple(pattern) + (str(i),)) + rule_group = i + prev_pattern = pattern + + rules.sort() + return rules, ambiguous_rule_groups, rule_id_map + def prepare(rfname): """ Read transfer file and prepare pattern FST. 
Index: branches/weighted-transfer/apertium-weights-learner/twlconfig.py
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/twlconfig.py	(revision 71869)
+++ branches/weighted-transfer/apertium-weights-learner/twlconfig.py	(revision 71870)
@@ -1,5 +1,5 @@
 # full path to source corpus from which to learn the rules
-source_corpus = "/home/nm/source/apertium/weighted-transfer/apertium-weights-learner/data/2007-100000.txt"
+source_corpus = "/home/nm/source/apertium/weighted-transfer/apertium-weights-learner/data/2007-en-10000.txt"
 
 # name of apertium pair (not direction)
 apertium_pair_name = "en-es"
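Since `twlconfig` is imported as a plain Python module, the corpus swap above takes effect on the next run with no parsing step; the learner simply reads module attributes:

```python
import twlconfig

# attributes read by twlearner.py (see the main block below)
print(twlconfig.source_corpus)   # the smaller 10000-line English corpus
print(twlconfig.data_folder)     # working directory for intermediate files
```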
+ """ + return tuple([len(group[0]) for group in coverage]) + if __name__ == "__main__": opts, args = get_options() - cat_dict, pattern_FST, ambiguous_rules = prepare(opts.rfname) + cat_dict, rules, ambiguous_rules, rule_id_map = prepare(opts.rfname) + pattern_FST = FST(rules) + #for rule in rules: + # print(rule) + #print(rule_id_map) + + coverages = pattern_FST.get_lrlm('^proud$ ^culture$', cat_dict) + for coverage in coverages: + print(coverage) + + sys.exit(0) + if len(args) == 0: input_stream = sys.stdin elif len(args) == 1: Index: branches/weighted-transfer/apertium-weights-learner/twlconfig.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/twlconfig.py (revision 71869) +++ branches/weighted-transfer/apertium-weights-learner/twlconfig.py (revision 71870) @@ -1,5 +1,5 @@ # full path to source corpus from which to learn the rules -source_corpus = "/home/nm/source/apertium/weighted-transfer/apertium-weights-learner/data/2007-100000.txt" +source_corpus = "/home/nm/source/apertium/weighted-transfer/apertium-weights-learner/data/2007-en-10000.txt" # name of apertium pair (not direction) apertium_pair_name = "en-es" Index: branches/weighted-transfer/apertium-weights-learner/twlearner.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/twlearner.py (revision 71869) +++ branches/weighted-transfer/apertium-weights-learner/twlearner.py (revision 71870) @@ -1,6 +1,6 @@ #! /usr/bin/python3 -import re, sys, os, pipes +import re, sys, os, pipes, gc from math import exp import coverage import kenlm @@ -51,9 +51,10 @@ tixbasepath = os.path.join(pair_data, tixbasename) binbasepath = os.path.join(pair_data, '{}-{}'.format(source, target)) tixfname = '.'.join((tixbasepath, 't1x')) - cat_dict, pattern_FST, ambiguous_rules = coverage.prepare(tixfname) + cat_dict, rules, ambiguous_rules, rule_id_map = coverage.prepare(tixfname) + pattern_FST = coverage.FST(rules) - return tixbasepath, binbasepath, cat_dict, pattern_FST, ambiguous_rules + return tixbasepath, binbasepath, cat_dict, pattern_FST, ambiguous_rules, rule_id_map def make_prefix(corpus, data_folder): """ @@ -103,18 +104,18 @@ print('Done in {:.2f}'.format(clock() - btime)) return ofname -def search_ambiguous(ambiguous_rules, coverage_item): +def search_ambiguous(ambiguous_rules, coverage): """ Look for patterns covered by one of the ambiguous rules in ambiguous_rules. If found, return the rules and their patterns. 
""" pattern_list = [] - for i, part in enumerate(coverage_item): - if part[1][0] in ambiguous_rules: - pattern_list.append((i, part[1][0], tuple(part[0]))) + for i, part in enumerate(coverage): + if part[1] in ambiguous_rules: + pattern_list.append((i, part[1], tuple(part[0]))) return pattern_list -def detect_ambiguous(corpus, prefix, cat_dict, pattern_FST, ambiguous_rules, tixfname, binfname): +def detect_ambiguous(corpus, prefix, cat_dict, pattern_FST, ambiguous_rules, tixfname, binfname, rule_id_map): """ """ print('Looking for ambiguous sentences and translating them.') @@ -124,27 +125,45 @@ ofname = '{}-ambiguous.txt'.format(prefix) #sfname = '{}-sentences.txt'.format(prefix) + lines_count, total_sents_count, ambig_sents_count, ambig_chunks_count = 0, 0, 0, 0 + botched_coverages = 0 + lbtime = clock() + with open(corpus, 'r', encoding='utf-8') as ifile, \ open(ofname, 'w', encoding='utf-8') as ofile: #,\ #open(sfname, 'w', encoding='utf-8') as sfile: for line in ifile: + lines_count += 1 + if lines_count % 1000 == 0: + print('\n{} total lines\n{} total sentences\n{} ambiguous sentences\n{} ambiguous chunks\n{} botched coverages\nanother {:.4f} elapsed'.format(lines_count, total_sents_count, ambig_sents_count, ambig_chunks_count, botched_coverages, clock() - lbtime)) + gc.collect() + lbtime = clock() # look at each sentence in line for sent_match in sent_re.finditer(line.strip()): + total_sents_count += 1 + #print(total_sents_count, sent_match.group(0)) # get coverages - coverage_list, parsed_line = coverage.process_line(sent_match.group(0), - cat_dict, pattern_FST, - None, False, True, False) - if coverage_list != []: + coverage_list = pattern_FST.get_lrlm(sent_match.group(0), cat_dict) + if coverage_list == []: + botched_coverages += 1 + else: coverage_item = coverage_list[0] # look for ambiguous chunks pattern_list = search_ambiguous(ambiguous_rules, coverage_item) if pattern_list != []: + #print(coverage_item) + #print() + #print(pattern_list) + #print() + ambig_sents_count += 1 # segment the sentence into parts each containing one ambiguous chunk sentence_segments, prev = [], 0 for i, rule_group_number, pattern in pattern_list: + ambig_chunks_count += 1 piece_of_line = '^' + '$ ^'.join(sum([chunk[0] for chunk in coverage_item[prev:i+1]], [])) + '$' sentence_segments.append([rule_group_number, pattern, piece_of_line]) prev = i+1 + if sentence_segments != []: # add up the tail of the sentence if prev <= len(coverage_item): @@ -151,6 +170,9 @@ piece_of_line = ' ^' + '$ ^'.join(sum([chunk[0] for chunk in coverage_item[prev:]], [])) + '$' sentence_segments[-1][2] += piece_of_line + #print(sentence_segments) + #print() + # first, translate each segment with default rules for sentence_segment in sentence_segments: sentence_segment.append(translate(sentence_segment[2], tixfname, binfname)) @@ -160,13 +182,13 @@ for j, sentence_segment in enumerate(sentence_segments): translation_list = translate_ambiguous(ambiguous_rules[sentence_segment[0]], sentence_segment[1], sentence_segment[2], - tixfname, binfname) + tixfname, binfname, rule_id_map) output_list = [] for rule, translation in translation_list: translated_sentence = ' '.join(sentence_segment[3] for sentence_segment in sentence_segments[:j]) +\ ' ' + translation + ' ' +\ ' '.join(sentence_segment[3] for sentence_segment in sentence_segments[j+1:]) - output_list.append('{}\t{}'.format(rule[0], translated_sentence.strip(' '))) + output_list.append('{}\t{}'.format(rule, translated_sentence.strip(' '))) # store results to a file # first, 
@@ -322,20 +344,18 @@
                            twlconfig.data_folder)
 
     # clean up tagged corpus
-    clean_fname = clean_tagged(tagged_fname, prefix)
+    #clean_fname = clean_tagged(tagged_fname, prefix)
 
     # load rules, build rule FST
-    tixbasepath, binbasepath, cat_dict, pattern_FST, ambiguous_rules = \
+    tixbasepath, binbasepath, cat_dict, pattern_FST, ambiguous_rules, rule_id_map = \
         load_rules(twlconfig.apertium_pair_data, twlconfig.source, twlconfig.target)
-    rule_map = {}
-    for rule_group in ambiguous_rules.values():
-        rule_map.update({rule[0]:rule[1] for rule in rule_group})
 
     # detect and store sentences with ambiguity
-    ambig_sentences_fname = detect_ambiguous(clean_fname, prefix,
+    ambig_sentences_fname = detect_ambiguous(tagged_fname, prefix,
                                              cat_dict, pattern_FST, ambiguous_rules,
-                                             tixbasepath, binbasepath)
+                                             tixbasepath, binbasepath,
+                                             rule_id_map)
 
     # load language model
     print('Loading language model.')
@@ -347,6 +367,6 @@
     scores_fname = score_sentences(ambig_sentences_fname, model, prefix)
 
     # sum up weights for rule-pattern and make final xml
-    make_xml_rules(scores_fname, prefix, rule_map)
+    make_xml_rules(scores_fname, prefix, rule_id_map)
 
     print('Performed in {:.2f}'.format(clock() - tbtime))
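End to end, the revised main block runs on the tagged corpus directly (the `clean_tagged` step is commented out) and threads `rule_id_map` through to the final XML step. Schematically, with hypothetical names for the pieces not shown in this diff (`tag_corpus`, `twlconfig.language_model`):

```python
# Schematic only: prefix handling, timing and error handling omitted;
# tag_corpus and twlconfig.language_model are hypothetical stand-ins
# for code outside this diff.
tagged_fname = tag_corpus(twlconfig.source_corpus, twlconfig.data_folder)
tixbasepath, binbasepath, cat_dict, pattern_FST, ambiguous_rules, rule_id_map = \
    load_rules(twlconfig.apertium_pair_data, twlconfig.source, twlconfig.target)
ambig_fname = detect_ambiguous(tagged_fname, prefix, cat_dict, pattern_FST,
                               ambiguous_rules, tixbasepath, binbasepath, rule_id_map)
model = kenlm.Model(twlconfig.language_model)
scores_fname = score_sentences(ambig_fname, model, prefix)
make_xml_rules(scores_fname, prefix, rule_id_map)
```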