Index: branches/weighted-transfer/apertium-weights-learner/pipelines.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/pipelines.py (revision 72236) +++ branches/weighted-transfer/apertium-weights-learner/pipelines.py (nonexistent) @@ -1,163 +0,0 @@ -import sys, re -from subprocess import Popen, PIPE - -# apertium special symbols for removal -apertium_re = re.compile(r'[@#~*]') - -class partialTranslator(): - """ - Wrapper for part of Apertium pipeline - going from bidix lookup to the generation. - """ - def __init__(self, tixfname, binfname): - """ - On initialization, partial Apertium pipeline - is invoked with '-z' option (null flush) - and remains active waiting for input. - """ - self.autobil = Popen(['lt-proc', '-b', '-z', - binfname + '.autobil.bin' - ], - stdin = PIPE, stdout = PIPE) - self.transfer = Popen(['apertium-transfer', '-b', '-z', - tixfname + '.t1x', - binfname + '.t1x.bin' - ], - stdin = self.autobil.stdout, stdout = PIPE) - self.interchunk = Popen(['apertium-interchunk', '-z', - tixfname + '.t2x', - binfname + '.t2x.bin' - ], - stdin = self.transfer.stdout, stdout = PIPE) - self.postchunk = Popen(['apertium-postchunk', '-z', - tixfname + '.t2x', - binfname + '.t2x.bin' - ], - stdin = self.interchunk.stdout, stdout = PIPE) - self.autogen = Popen(['lt-proc', '-g', '-z', - binfname + '.autogen.bin' - ], - stdin = self.postchunk.stdout, stdout = PIPE) - - def translate(self, string): - """ - Convert input string to bytes, - send it to the pipeline, - return the result converted to utf-8. - """ - string = string.strip() + '[][\n]' - - if type(string) == type(''): - bstring = bytes(string, 'utf-8') - else: - bstring = string - - self.autobil.stdin.write(bstring) - self.autobil.stdin.write(b'\0') - self.autobil.stdin.flush() - - char = self.autogen.stdout.read(1) - output = [] - while char and char != b'\0': - output.append(char) - char = self.autogen.stdout.read(1) - - return apertium_re.sub('', (b''.join(output)).decode('utf-8').replace('[][\n]','')) - -class weightedPartialTranslator(): - """ - Wrapper for part of Apertium pipeline - going from bidix lookup to the generation. - It is missing 1st-stage transfer at init, - because transfer is invoked at translation - with provided weights file. - """ - def __init__(self, tixfname, binfname): - """ - On initialization, fragments of Apertium pipeline - are invoked with '-z' option (null flush) - and remain active waiting for input. - """ - self.tixfname = tixfname - self.binfname = binfname - - self.autobil = Popen(['lt-proc', '-b', '-z', - binfname + '.autobil.bin' - ], - stdin = PIPE, stdout = PIPE) - - # transfer is missing here - # it is invoked during translation - # using provided transfer weights file - - self.interchunk = Popen(['apertium-interchunk', '-z', - tixfname + '.t2x', - binfname + '.t2x.bin' - ], - stdin = PIPE, stdout = PIPE) - self.postchunk = Popen(['apertium-postchunk', '-z', - tixfname + '.t2x', - binfname + '.t2x.bin' - ], - stdin = self.interchunk.stdout, stdout = PIPE) - self.autogen = Popen(['lt-proc', '-g', '-z', - binfname + '.autogen.bin' - ], - stdin = self.postchunk.stdout, stdout = PIPE) - - def translate(self, string, wixfname): - """ - Convert input string to bytes, - send it to the pipeline, - return the result converted to utf-8. 
- """ - string = string.strip() + '[][\n]' - - if type(string) == type(''): - bstring = bytes(string, 'utf-8') - else: - bstring = string - - # start going through null flush pipeline - self.autobil.stdin.write(bstring) - self.autobil.stdin.write(b'\0') - self.autobil.stdin.flush() - - char = self.autobil.stdout.read(1) - autobil_output = [] - while char and char != b'\0': - autobil_output.append(char) - char = self.autobil.stdout.read(1) - - # make weighted transfer - transfer = Popen(['apertium-transfer', '-bw', - wixfname, - self.tixfname + '.t1x', - self.binfname + '.t1x.bin' - ], - stdin = PIPE, stdout = PIPE) - - transfer_output, err = transfer.communicate(b''.join(autobil_output)) - - # resume going through null flush pipeline - self.interchunk.stdin.write(transfer_output) - self.interchunk.stdin.write(b'\0') - self.interchunk.stdin.flush() - - char = self.autogen.stdout.read(1) - autogen_output = [] - while char and char != b'\0': - autogen_output.append(char) - char = self.autogen.stdout.read(1) - - return apertium_re.sub('', (b''.join(autogen_output)).decode('utf-8').replace('[][\n]','')) - -if __name__ == "__main__": - t = weightedPartialTranslator('../apertium-en-es/apertium-en-es.en-es', '../apertium-en-es/en-es') - - with open('./tests/testfile.txt', 'r', encoding='utf-8') as ifile: - for line in ifile: - print('line:', line) - mo = t.translate(line, '../apertium-en-es/apertium-en-es.en-es.w1x') - print('mo:', mo) - print() Index: branches/weighted-transfer/apertium-weights-learner/coverage.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/coverage.py (revision 72236) +++ branches/weighted-transfer/apertium-weights-learner/coverage.py (nonexistent) @@ -1,313 +0,0 @@ -#! /usr/bin/python3 - -import re, sys -from optparse import OptionParser, OptionGroup -from time import clock - -try: # see if lxml is installed - from lxml import etree as ET - if __name__ == "__main__": - print("Using lxml library happily ever after.") -except ImportError: # it is not - import xml.etree.ElementTree as ET - if __name__ == "__main__": - print("lxml library not found. Falling back to xml.etree,\n" - "though it's highly recommended that you install lxml\n" - "as it works dramatically faster than xml.etree.") - -# regex lines to build up rexes for cat-items -any_tag_re = '<[a-z0-9-]+>' -any_num_of_any_tags_re = '({})*'.format(any_tag_re) - -# apertium token (anything between ^ and $) -apertium_token_re = re.compile(r'\^(.*?)\$') - -def cat_item_to_re(cat_item): - """ - Get a pattern as specified in xml. - Output a regex line that matches what - is specified by the pattern. - - Attention: ^ and $ here are NOT Apertium start - and end of token, they are regex start and end - of line. Token is assumed to have been already - stripped of its ^ and $. - """ - - # start with the lemma (or with the lack of it) - re_line = '^' + cat_item.attrib.get('lemma', '[^<>]*') - - tags = cat_item.attrib['tags'] - - if tags == '': - # no tags: close regex line - return re_line + '$' - - tag_sequence = tags.split('.') - for tag in tag_sequence[:-1]: - if tag == '*': - # any tag - re_line += any_tag_re - else: - # specific tag - re_line += '<{}>'.format(tag) - - if tag_sequence[-1] == '*': - # any tags at the end - re_line += any_num_of_any_tags_re - else: - # specific tag at the end - re_line += '<{}>'.format(tag_sequence[-1]) - - return re_line + '$' - -def get_cat_dict(transtree): - """ - Get an xml tree with transfer rules. 
- Build an inverted index of the rules. - """ - root = transtree.getroot() - cat_dict = {} - for def_cat in root.find('section-def-cats').findall('def-cat'): - for cat_item in def_cat.findall('cat-item'): - # make a regex line to recognize lemma-tag pattern - re_line = cat_item_to_re(cat_item) - # add empty category list if there is none - cat_dict.setdefault(re_line, []) - # add category to the list - cat_dict[re_line].append(def_cat.attrib['n']) - return cat_dict - -def get_cats_by_line(line, cat_dict): - """ - Return all possible categories for each apertium token in line. - """ - return [get_cat(token, cat_dict) - for token in apertium_token_re.findall(line)] - -def get_cat(token, cat_dict): - """ - Return all possible categories for token. - """ - token_cat_list = [] - for cat_re, cat_list in cat_dict.items(): - if re.match(cat_re, token): - token_cat_list.extend(cat_list) - return (token, token_cat_list) - -def get_rules(transtree): - """ - From xml tree with transfer rules, - get rules, ambiguous rules, - and rule id to number map. - """ - root = transtree.getroot() - - # build pattern -> rules numbers dict (rules_dict), - # and rule number -> rule id dict (rule_id_map) - rules_dict, rule_xmls, rule_id_map = {}, {}, {} - for i, rule in enumerate(root.find('section-rules').findall('rule')): - if 'id' in rule.attrib: - # rule has 'id' attribute: add it to rule_id_map - rule_id_map[str(i)] = rule.attrib['id'] - rule_xmls[str(i)] = rule - # build pattern - pattern = tuple(pattern_item.attrib['n'] - for pattern_item in rule.find('pattern').findall('pattern-item')) - # add empty rules list for pattern - # if pattern was not in rules_dict - rules_dict.setdefault(pattern, []) - # add rule number to rules list - rules_dict[pattern].append(str(i)) - - # detect groups of ambiguous rules, - # and prepare rules for building FST - rules, ambiguous_rule_groups = [], {} - for pattern, rule_group in rules_dict.items(): - if all(rule in rule_id_map for rule in rule_group): - # all rules in group have ids: add group to ambiguous rules - ambiguous_rule_groups[rule_group[0]] = rule_group - # add pattern to rules using first rule as default - rules.append(pattern + (rule_group[0],)) - # sort rules to optimize FST building - rules.sort() - - return rules, ambiguous_rule_groups, rule_id_map, rule_xmls - -def prepare(rfname): - """ - Read transfer file and prepare pattern FST. - """ - try: - transtree = ET.parse(rfname) - except FileNotFoundError: - print('Failed to locate rules file \'{}\'. ' - 'Have you misspelled the name?'.format(opts.rfname)) - sys.exit(1) - except ET.ParseError: - print('Error parsing rules file \'{}\'. ' - 'Is there something wrong with it?'.format(opts.rfname)) - sys.exit(1) - - cat_dict = get_cat_dict(transtree) - rules, ambiguous_rules, rule_id_map, rule_xmls = get_rules(transtree) - - return cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls - -class FST: - """ - FST for coverage recognition. - """ - def __init__(self, init_rules): - """ - Initialize with patterns from init_rules. 
- """ - self.start_state = 0 - self.final_states = {} # final state: rule - self.transitions = {} # (state, input): state - - maxlen = max(len(rule) for rule in init_rules) - self.maxlen = maxlen - 1 - - # make rule table, where each pattern starts with ('start', 0) - rules = [[('start', self.start_state)] + list(rule) for rule in init_rules] - - state, prev_cat = self.start_state, '' - # look at each rule pattern at fixed position - for level in range(1, maxlen): - for rule in rules: - if len(rule) <= level: - # this rule already ended: increment state to keep it simple - state += 1 - elif len(rule) == level+1: - # end of the rule is here: add this state as a final - self.final_states[rule[level-1][1]] = rule[level] - else: - if rule[level] != prev_cat: - # rule patterns diverged: add new state - state += 1 - # add transition - self.transitions[(rule[level-1][1], rule[level])] = state - prev_cat = rule[level] - # add current state to current pattern element - rule[level] = (rule[level], state) - # change prev_cat to empty at the end of rules list - # to ensure state is changed at the start of next run through - prev_cat = '' - - def get_lrlm(self, line, cat_dict): - """ - Build all lrlm coverages for line. - - """ - # tokenize line and get all possible categories for each token - line = get_cats_by_line(line, cat_dict) - - # coverage and state lists are built dinamically - # each state from state_list is the state of FST - # at the end of corresponding coverage from coverage_list - coverage_list, state_list = [[]], [self.start_state] - - # go through all tokens in line - for token, cat_list in line: - new_coverage_list, new_state_list = [], [] - - # go through all cats for the token - for cat in cat_list: - - # try to continue each coverage obtained on the previous step - for coverage, state in zip(coverage_list, state_list): - - # first, check if we can go further along current pattern - if (state, cat) in self.transitions: - # current pattern can be made longer: add one more token - new_coverage_list.append(coverage + [('w', token)]) - new_state_list.append(self.transitions[(state, cat)]) - - # if not, check if we can finalize current pattern - elif state in self.final_states: - # current state is one of the final states: close previous pattern - new_coverage = coverage + [('r', self.final_states[state])] - - if (self.start_state, cat) in self.transitions: - # can start new pattern - new_coverage_list.append(new_coverage + [('w', token)]) - new_state_list.append(self.transitions[(self.start_state, cat)]) - elif '*' in token: - # can not start new pattern because of an unknown word - new_coverage_list.append(new_coverage + [('w', token), ('r', 'unknown')]) - new_state_list.append(self.start_state) - - # if not, check if it is just an unknown word - elif state == self.start_state and '*' in token: - # unknown word at start state: add it to pattern, start new - new_coverage_list.append(coverage + [('w', token), ('r', 'unknown')]) - new_state_list.append(self.start_state) - - # if nothing worked, just discard this coverage - - coverage_list, state_list = new_coverage_list, new_state_list - - # finalize coverages - new_coverage_list = [] - for coverage, state in zip(coverage_list, state_list): - if state in self.final_states: - # current state is one of the final states: close the last pattern - new_coverage_list.append(coverage + [('r', self.final_states[state])]) - elif coverage != [] and coverage[-1][0] == 'r': - # the last pattern is already closed - new_coverage_list.append(coverage) - 
# if nothing worked, just discard this coverage as incomplete - - if new_coverage_list == []: - # no coverages detected: no need to go further - return [] - - # convert coverage representation: - # [('r'/'w', rule_number/token), ...] -> [([token, token, ... ], rule_number), ...] - formatted_coverage_list = [] - for coverage in new_coverage_list: - pattern, formatted_coverage = [], [] - for element in coverage: - if element[0] == 'w': - pattern.append(element[1]) - else: - formatted_coverage.append((pattern, element[1])) - pattern = [] - formatted_coverage_list.append(formatted_coverage) - - # now we filter out some not-lrlm coverages - # that still got into - - # sort coverages by signature, which is a tuple - # of coverage part lengths - formatted_coverage_list.sort(key=signature, reverse=True) - signature_max = signature(formatted_coverage_list[0]) - - # keep only those with top signature - # they would be the LRLM ones - LRLM_list = [] - for coverage in formatted_coverage_list: - if signature(coverage) == signature_max: - # keep adding - LRLM_list.append(coverage) - else: - # no need to look further, others will be worse - return LRLM_list - return LRLM_list - -def signature(coverage): - """ - Get coverage signature which is just a tuple - of lengths of groups comprising the coverage. - """ - return tuple([len(group[0]) for group in coverage]) - -if __name__ == "__main__": - cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls = prepare('../apertium-en-es/apertium-en-es.en-es.t1x') - pattern_FST = FST(rules) - - coverages = pattern_FST.get_lrlm('^prpers$ ^want# to$ ^wait$ ^until$ ^prpers$ ^can$ ^offer$ ^what$ ^would$ ^be$ ^totally$ ^satisfy$ ^for$ ^consumer$^.$', cat_dict) - print('Coverages detected:') - for coverage in coverages: - print(coverage) Property changes on: branches/weighted-transfer/apertium-weights-learner/coverage.py ___________________________________________________________________ Deleted: svn:executable ## -1 +0,0 ## -* \ No newline at end of property Index: branches/weighted-transfer/apertium-weights-learner/tools/coverage.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/tools/coverage.py (nonexistent) +++ branches/weighted-transfer/apertium-weights-learner/tools/coverage.py (revision 72237) @@ -0,0 +1,314 @@ +#! /usr/bin/python3 + +import re, sys +from optparse import OptionParser, OptionGroup +from time import clock + +try: # see if lxml is installed + from lxml import etree as ET + if __name__ == "__main__": + print("Using lxml library happily ever after.") +except ImportError: # it is not + import xml.etree.ElementTree as ET + if __name__ == "__main__": + print("lxml library not found. Falling back to xml.etree,\n" + "though it's highly recommended that you install lxml\n" + "as it works dramatically faster than xml.etree.") + +# regex lines to build up rexes for cat-items +any_tag_re = '<[a-z0-9-]+>' +any_num_of_any_tags_re = '({})*'.format(any_tag_re) + +# apertium token (anything between ^ and $) +apertium_token_re = re.compile(r'\^(.*?)\$') + +def cat_item_to_re(cat_item): + """ + Get a pattern as specified in xml. + Output a regex line that matches what + is specified by the pattern. + + Attention: ^ and $ here are NOT Apertium start + and end of token, they are regex start and end + of line. Token is assumed to have been already + stripped of its ^ and $. 
+ """ + + # start with the lemma (or with the lack of it) + re_line = '^' + cat_item.attrib.get('lemma', '[^<>]*') + + tags = cat_item.attrib['tags'] + + if tags == '': + # no tags: close regex line + return re_line + '$' + + tag_sequence = tags.split('.') + for tag in tag_sequence[:-1]: + if tag == '*': + # any tag + re_line += any_tag_re + else: + # specific tag + re_line += '<{}>'.format(tag) + + if tag_sequence[-1] == '*': + # any tags at the end + re_line += any_num_of_any_tags_re + else: + # specific tag at the end + re_line += '<{}>'.format(tag_sequence[-1]) + + return re_line + '$' + +def get_cat_dict(transtree): + """ + Get an xml tree with transfer rules. + Build an inverted index of the rules. + """ + root = transtree.getroot() + cat_dict = {} + for def_cat in root.find('section-def-cats').findall('def-cat'): + for cat_item in def_cat.findall('cat-item'): + # make a regex line to recognize lemma-tag pattern + re_line = cat_item_to_re(cat_item) + # add empty category list if there is none + cat_dict.setdefault(re_line, []) + # add category to the list + cat_dict[re_line].append(def_cat.attrib['n']) + return cat_dict + +def get_cats_by_line(line, cat_dict): + """ + Return all possible categories for each apertium token in line. + """ + return [get_cat(token, cat_dict) + for token in apertium_token_re.findall(line)] + +def get_cat(token, cat_dict): + """ + Return all possible categories for token. + """ + token_cat_list = [] + for cat_re, cat_list in cat_dict.items(): + if re.match(cat_re, token): + token_cat_list.extend(cat_list) + return (token, token_cat_list) + +def get_rules(transtree): + """ + From xml tree with transfer rules, + get rules, ambiguous rules, + and rule id to number map. + """ + root = transtree.getroot() + + # build pattern -> rules numbers dict (rules_dict), + # and rule number -> rule id dict (rule_id_map) + rules_dict, rule_xmls, rule_id_map = {}, {}, {} + for i, rule in enumerate(root.find('section-rules').findall('rule')): + if 'id' in rule.attrib: + # rule has 'id' attribute: add it to rule_id_map + rule_id_map[str(i)] = rule.attrib['id'] + rule_xmls[str(i)] = rule + # build pattern + pattern = tuple(pattern_item.attrib['n'] + for pattern_item in rule.find('pattern').findall('pattern-item')) + # add empty rules list for pattern + # if pattern was not in rules_dict + rules_dict.setdefault(pattern, []) + # add rule number to rules list + rules_dict[pattern].append(str(i)) + + # detect groups of ambiguous rules, + # and prepare rules for building FST + rules, ambiguous_rule_groups = [], {} + for pattern, rule_group in rules_dict.items(): + if all(rule in rule_id_map for rule in rule_group): + # all rules in group have ids: add group to ambiguous rules + ambiguous_rule_groups[rule_group[0]] = rule_group + # add pattern to rules using first rule as default + rules.append(pattern + (rule_group[0],)) + # sort rules to optimize FST building + rules.sort() + + return rules, ambiguous_rule_groups, rule_id_map, rule_xmls + +def prepare(rfname): + """ + Read transfer file and prepare pattern FST. + """ + try: + transtree = ET.parse(rfname) + except FileNotFoundError: + print('Failed to locate rules file \'{}\'. ' + 'Have you misspelled the name?'.format(opts.rfname)) + sys.exit(1) + except ET.ParseError: + print('Error parsing rules file \'{}\'. 
' + 'Is there something wrong with it?'.format(opts.rfname)) + sys.exit(1) + + cat_dict = get_cat_dict(transtree) + rules, ambiguous_rules, rule_id_map, rule_xmls = get_rules(transtree) + + return cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls + +class FST: + """ + FST for coverage recognition. + """ + def __init__(self, init_rules): + """ + Initialize with patterns from init_rules. + """ + self.start_state = 0 + self.final_states = {} # final state: rule + self.transitions = {} # (state, input): state + + maxlen = max(len(rule) for rule in init_rules) + self.maxlen = maxlen - 1 + + # make rule table, where each pattern starts with ('start', 0) + rules = [[('start', self.start_state)] + list(rule) for rule in init_rules] + + state, prev_cat = self.start_state, '' + # look at each rule pattern at fixed position + for level in range(1, maxlen): + for rule in rules: + if len(rule) <= level: + # this rule already ended: increment state to keep it simple + state += 1 + elif len(rule) == level+1: + # end of the rule is here: add this state as a final + self.final_states[rule[level-1][1]] = rule[level] + else: + if rule[level] != prev_cat: + # rule patterns diverged: add new state + state += 1 + # add transition + self.transitions[(rule[level-1][1], rule[level])] = state + prev_cat = rule[level] + # add current state to current pattern element + rule[level] = (rule[level], state) + # change prev_cat to empty at the end of rules list + # to ensure state is changed at the start of next run through + prev_cat = '' + + def get_lrlm(self, line, cat_dict): + """ + Build all lrlm coverages for line. + + """ + # tokenize line and get all possible categories for each token + line = get_cats_by_line(line, cat_dict) + + # coverage and state lists are built dinamically + # each state from state_list is the state of FST + # at the end of corresponding coverage from coverage_list + coverage_list, state_list = [[]], [self.start_state] + + # go through all tokens in line + for token, cat_list in line: + new_coverage_list, new_state_list = [], [] + + # go through all cats for the token + for cat in cat_list: + + # try to continue each coverage obtained on the previous step + for coverage, state in zip(coverage_list, state_list): + + # first, check if we can go further along current pattern + if (state, cat) in self.transitions: + # current pattern can be made longer: add one more token + new_coverage_list.append(coverage + [('w', token)]) + new_state_list.append(self.transitions[(state, cat)]) + + # if not, check if we can finalize current pattern + elif state in self.final_states: + # current state is one of the final states: close previous pattern + new_coverage = coverage + [('r', self.final_states[state])] + + if (self.start_state, cat) in self.transitions: + # can start new pattern + new_coverage_list.append(new_coverage + [('w', token)]) + new_state_list.append(self.transitions[(self.start_state, cat)]) + elif '*' in token: + # can not start new pattern because of an unknown word + new_coverage_list.append(new_coverage + [('w', token), ('r', 'unknown')]) + new_state_list.append(self.start_state) + + # if not, check if it is just an unknown word + elif state == self.start_state and '*' in token: + # unknown word at start state: add it to pattern, start new + new_coverage_list.append(coverage + [('w', token), ('r', 'unknown')]) + new_state_list.append(self.start_state) + + # if nothing worked, just discard this coverage + + coverage_list, state_list = new_coverage_list, new_state_list + + # finalize 
coverages + new_coverage_list = [] + for coverage, state in zip(coverage_list, state_list): + if state in self.final_states: + # current state is one of the final states: close the last pattern + new_coverage_list.append(coverage + [('r', self.final_states[state])]) + elif coverage != [] and coverage[-1][0] == 'r': + # the last pattern is already closed + new_coverage_list.append(coverage) + # if nothing worked, just discard this coverage as incomplete + + if new_coverage_list == []: + # no coverages detected: no need to go further + return [] + + # convert coverage representation: + # [('r'/'w', rule_number/token), ...] -> [([token, token, ... ], rule_number), ...] + formatted_coverage_list = [] + for coverage in new_coverage_list: + pattern, formatted_coverage = [], [] + for element in coverage: + if element[0] == 'w': + pattern.append(element[1]) + else: + formatted_coverage.append((pattern, element[1])) + pattern = [] + formatted_coverage_list.append(formatted_coverage) + + # now we filter out some not-lrlm coverages + # that still got into + + # sort coverages by signature, which is a tuple + # of coverage part lengths + formatted_coverage_list.sort(key=signature, reverse=True) + signature_max = signature(formatted_coverage_list[0]) + + # keep only those with top signature + # they would be the LRLM ones + LRLM_list = [] + for coverage in formatted_coverage_list: + if signature(coverage) == signature_max: + # keep adding + LRLM_list.append(coverage) + else: + # no need to look further, others will be worse + return LRLM_list + return LRLM_list + +def signature(coverage): + """ + Get coverage signature which is just a tuple + of lengths of groups comprising the coverage. + """ + return tuple([len(group[0]) for group in coverage]) + +if __name__ == "__main__": + cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls = prepare(sys.argv[1]) + pattern_FST = FST(rules) + + coverages = pattern_FST.get_lrlm('^prpers$ ^want# to$ ^wait$ ^until$ ^prpers$ ^can$ ^offer$ ^what$ ^would$ ^be$ ^totally$ ^satisfy$ ^for$ ^consumer$^.$', cat_dict) + + print('Coverages detected:') + for coverage in coverages: + print(coverage) Property changes on: branches/weighted-transfer/apertium-weights-learner/tools/coverage.py ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Index: branches/weighted-transfer/apertium-weights-learner/tools/pipelines.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/tools/pipelines.py (nonexistent) +++ branches/weighted-transfer/apertium-weights-learner/tools/pipelines.py (revision 72237) @@ -0,0 +1,153 @@ +import sys, re +from subprocess import Popen, PIPE + +# apertium special symbols for removal +apertium_re = re.compile(r'[@#~*]') + +class partialTranslator(): + """ + Wrapper for part of Apertium pipeline + going from bidix lookup to the generation. + """ + def __init__(self, tixfname, binfname): + """ + On initialization, partial Apertium pipeline + is invoked with '-z' option (null flush) + and remains active waiting for input. 
+ """ + self.autobil = Popen(['lt-proc', '-b', '-z', + binfname + '.autobil.bin' + ], + stdin = PIPE, stdout = PIPE) + self.transfer = Popen(['apertium-transfer', '-b', '-z', + tixfname + '.t1x', + binfname + '.t1x.bin' + ], + stdin = self.autobil.stdout, stdout = PIPE) + self.interchunk = Popen(['apertium-interchunk', '-z', + tixfname + '.t2x', + binfname + '.t2x.bin' + ], + stdin = self.transfer.stdout, stdout = PIPE) + self.postchunk = Popen(['apertium-postchunk', '-z', + tixfname + '.t2x', + binfname + '.t2x.bin' + ], + stdin = self.interchunk.stdout, stdout = PIPE) + self.autogen = Popen(['lt-proc', '-g', '-z', + binfname + '.autogen.bin' + ], + stdin = self.postchunk.stdout, stdout = PIPE) + + def translate(self, string): + """ + Convert input string to bytes, + send it to the pipeline, + return the result converted to utf-8. + """ + string = string.strip() + '[][\n]' + + if type(string) == type(''): + bstring = bytes(string, 'utf-8') + else: + bstring = string + + self.autobil.stdin.write(bstring) + self.autobil.stdin.write(b'\0') + self.autobil.stdin.flush() + + char = self.autogen.stdout.read(1) + output = [] + while char and char != b'\0': + output.append(char) + char = self.autogen.stdout.read(1) + + return apertium_re.sub('', (b''.join(output)).decode('utf-8').replace('[][\n]','')) + +class weightedPartialTranslator(): + """ + Wrapper for part of Apertium pipeline + going from bidix lookup to the generation. + It is missing 1st-stage transfer at init, + because transfer is invoked at translation + with provided weights file. + """ + def __init__(self, tixfname, binfname): + """ + On initialization, fragments of Apertium pipeline + are invoked with '-z' option (null flush) + and remain active waiting for input. + """ + self.tixfname = tixfname + self.binfname = binfname + + self.autobil = Popen(['lt-proc', '-b', '-z', + binfname + '.autobil.bin' + ], + stdin = PIPE, stdout = PIPE) + + # transfer is missing here + # it is invoked during translation + # using provided transfer weights file + + self.interchunk = Popen(['apertium-interchunk', '-z', + tixfname + '.t2x', + binfname + '.t2x.bin' + ], + stdin = PIPE, stdout = PIPE) + self.postchunk = Popen(['apertium-postchunk', '-z', + tixfname + '.t2x', + binfname + '.t2x.bin' + ], + stdin = self.interchunk.stdout, stdout = PIPE) + self.autogen = Popen(['lt-proc', '-g', '-z', + binfname + '.autogen.bin' + ], + stdin = self.postchunk.stdout, stdout = PIPE) + + def translate(self, string, wixfname): + """ + Convert input string to bytes, + send it to the pipeline, + return the result converted to utf-8. 
+ """ + string = string.strip() + '[][\n]' + + if type(string) == type(''): + bstring = bytes(string, 'utf-8') + else: + bstring = string + + # start going through null flush pipeline + self.autobil.stdin.write(bstring) + self.autobil.stdin.write(b'\0') + self.autobil.stdin.flush() + + char = self.autobil.stdout.read(1) + autobil_output = [] + while char and char != b'\0': + autobil_output.append(char) + char = self.autobil.stdout.read(1) + + # make weighted transfer + transfer = Popen(['apertium-transfer', '-bw', + wixfname, + self.tixfname + '.t1x', + self.binfname + '.t1x.bin' + ], + stdin = PIPE, stdout = PIPE) + + transfer_output, err = transfer.communicate(b''.join(autobil_output)) + + # resume going through null flush pipeline + self.interchunk.stdin.write(transfer_output) + self.interchunk.stdin.write(b'\0') + self.interchunk.stdin.flush() + + char = self.autogen.stdout.read(1) + autogen_output = [] + while char and char != b'\0': + autogen_output.append(char) + char = self.autogen.stdout.read(1) + + return apertium_re.sub('', (b''.join(autogen_output)).decode('utf-8').replace('[][\n]','')) Index: branches/weighted-transfer/apertium-weights-learner/tools/prune.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/tools/prune.py (revision 72236) +++ branches/weighted-transfer/apertium-weights-learner/tools/prune.py (revision 72237) @@ -17,7 +17,7 @@ usage_line = 'Usage: ./prune.py INPUT_FILE [OUTPUT_FILE]' -def prune_transfer_weights(ifname, ofname): +def prune_xml_transfer_weights(using_lxml, ifname, ofname=None): """ Prune the transfer weights file provided in ifname. @@ -38,8 +38,11 @@ except etree.ParseError: print('Error parsing rules file \'{}\'. ' 'Is there something wrong with it?'.format(opts.rfname)) - sys.exit(1) + return None + if ofname is None: + ofname = ifname.rsplit('.', maxsplit=1)[0] + '-prunned.w1x' + # create (empty) output xml tree oroot = etree.Element('transfer-weights') # go through rule-groups @@ -88,6 +91,8 @@ else: etree.ElementTree(oroot).write(ofname, encoding='utf-8', xml_declaration=True) + return ofname + def xml_pattern_to_str(et_pattern): """ Convert xml pattern item into pattern string. 
@@ -114,8 +119,6 @@
         sys.exit(1)
 
     if len(sys.argv) == 2:
-        ofname = ifname.rsplit('.', maxsplit=1)[0] + '-prunned.w1x'
+        prune_xml_transfer_weights(using_lxml, ifname)
     else:
-        ofname = sys.argv[2]
-
-    prune_transfer_weights(ifname, ofname)
+        prune_xml_transfer_weights(using_lxml, ifname, sys.argv[2])
Index: branches/weighted-transfer/apertium-weights-learner/twlearner.py
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/twlearner.py (revision 72236)
+++ branches/weighted-transfer/apertium-weights-learner/twlearner.py (revision 72237)
@@ -8,10 +8,11 @@
 # simple config in python file
 import twlconfig
 # module for coverage calculation
-import coverage
+from tools import coverage
 # apertium translator pipelines
-from pipelines import partialTranslator, weightedPartialTranslator
+from tools.pipelines import partialTranslator, weightedPartialTranslator
 from tools.simpletok import normalize
+from tools.prune import prune_xml_transfer_weights
 
 try: # see if lxml is installed
     from lxml import etree
@@ -191,6 +192,10 @@
         gc.collect()
         lbtime = clock()
 
+    # clean up temporary weights file
+    if os.path.exists(tmpweights_fname):
+        os.remove(tmpweights_fname)
+
     print('Done in {:.2f}'.format(clock() - btime))
 
     return ofname
@@ -303,7 +308,7 @@
         et_rule.attrib['id'] = rule_map[rule_number]
     return et_rule
 
-def make_xml_rules(scores_fname, prefix, rule_map, rule_xmls):
+def make_xml_transfer_weights(scores_fname, prefix, rule_map, rule_xmls):
     """
     Sum up the weights for each rule-pattern pair,
     add the result to xml weights file.
@@ -398,11 +403,11 @@
     # estimate rule weights for each ambiguous chunk
     scores_fname = score_sentences(ambig_sentences_fname, model, prefix)
 
-    # sum up weigths for rule-pattern and make final xml
-    make_xml_rules(scores_fname, prefix, rule_id_map, rule_xmls)
+    # sum up weights for rule-pattern and make unpruned xml
+    weights_fname = make_xml_transfer_weights(scores_fname, prefix,
+                                              rule_id_map, rule_xmls)
 
-    # clean up temporary weights filem
-    if os.path.exists(tmpweights_fname):
-        os.remove(tmpweights_fname)
+    # prune weights file
+    prunned_fname = prune_xml_transfer_weights(using_lxml, weights_fname)
 
     print('Performed in {:.2f}'.format(clock() - tbtime))
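For quick reference, here is a minimal usage sketch of the modules as relocated by this revision. It is a sketch only: it mirrors the __main__ blocks and the new twlearner.py imports above, and it assumes an en-es pair built at ../apertium-en-es and a test file at tests/testfile.txt; the weights file name 'learned.w1x' is a placeholder, not part of this commit.

    # Illustrative sketch; paths and names not taken from the diff above are placeholders.
    from tools import coverage
    from tools.pipelines import weightedPartialTranslator
    from tools.prune import prune_xml_transfer_weights

    # build the pattern FST from the first-stage transfer rules
    cat_dict, rules, ambig_rules, rule_id_map, rule_xmls = coverage.prepare(
        '../apertium-en-es/apertium-en-es.en-es.t1x')
    pattern_FST = coverage.FST(rules)

    # list LRLM coverages for one line of Apertium tokens
    for cov in pattern_FST.get_lrlm('^prpers$ ^want# to$ ^wait$', cat_dict):
        print(cov)

    # translate lines through the partial pipeline, supplying a weights file
    translator = weightedPartialTranslator('../apertium-en-es/apertium-en-es.en-es',
                                           '../apertium-en-es/en-es')
    with open('tests/testfile.txt', 'r', encoding='utf-8') as ifile:
        for line in ifile:
            print(translator.translate(line, 'learned.w1x'))

    # prune the weights file; returns the pruned file name, or None on a parse error
    # (the first argument mirrors the using_lxml flag passed by twlearner.py)
    pruned_fname = prune_xml_transfer_weights(True, 'learned.w1x')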