Index: branches/weighted-transfer/apertium-weights-learner/coverage.py
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/coverage.py	(revision 72156)
+++ branches/weighted-transfer/apertium-weights-learner/coverage.py	(revision 72157)
@@ -15,168 +15,122 @@
           "though it's highly recommended that you install lxml\n"
           "as it works dramatically faster than xml.etree.")
 
+# regex pieces to build up regexes for cat-items
 any_tag_re = '<[a-z0-9-]+>'
 any_num_of_any_tags_re = '({})*'.format(any_tag_re)
-any_num_of_any_tags_line_re = '^{}$'.format(any_num_of_any_tags_re)
-default_cat = ['default']
 
-def tag_pattern_to_re(tag_pattern):
+# apertium token (anything between ^ and $)
+apertium_token_re = re.compile(r'\^(.*?)\$')
+
+def cat_item_to_re(cat_item):
     """
-    Get a tag pattern as specified in xml.
+    Get a pattern as specified in xml.
     Output a regex line that matches what
    is specified by the pattern.
+
+    Attention: ^ and $ here are NOT the Apertium start
+    and end of token, they are the regex start and end
+    of line. The token is assumed to have already been
+    stripped of its ^ and $.
     """
-    if tag_pattern == '': # no tags
-        return '^$'
-    re_line = '^'
-    tag_sequence = tag_pattern.split('.')
+
+    # start with the lemma (or with the lack of it)
+    re_line = '^' + cat_item.attrib.get('lemma', '[^<>]*')
+
+    tags = cat_item.attrib['tags']
+
+    if tags == '':
+        # no tags: close the regex line
+        return re_line + '$'
+
+    tag_sequence = tags.split('.')
     for tag in tag_sequence[:-1]:
+        if tag == '*':
             # any tag
-        if tag == '*':
-            re_line = re_line + any_tag_re
+            re_line += any_tag_re
+        else:
+            # specific tag
+            re_line += '<{}>'.format(tag)
+
+    if tag_sequence[-1] == '*':
+        # any tags at the end
+        re_line += any_num_of_any_tags_re
     else:
-        re_line = re_line + '<{}>'.format(tag)
-    # any tags at the end
-    if tag_sequence[-1] == '*':
-        re_line = re_line + any_num_of_any_tags_re
         # specific tag at the end
-    else:
-        re_line = re_line + '<{}>'.format(tag_sequence[-1])
+        re_line += '<{}>'.format(tag_sequence[-1])
+
     return re_line + '$'
 
 def get_cat_dict(transtree):
     """
-    Get an xml with transfer rules.
-    Build a makeshift inverted index of the rules.
+    Get an xml tree with transfer rules.
+    Build an inverted index of the rules.
     """
     root = transtree.getroot()
     cat_dict = {}
     for def_cat in root.find('section-def-cats').findall('def-cat'):
         for cat_item in def_cat.findall('cat-item'):
-            tag_re = tag_pattern_to_re(cat_item.attrib.get('tags', '*'))
-            lemma = cat_item.attrib.get('lemma', '')
-            if tag_re not in cat_dict:
-                cat_dict[tag_re] = {}
-            if lemma not in cat_dict[tag_re]:
-                cat_dict[tag_re][lemma] = []
-            cat_dict[tag_re][lemma].append(def_cat.attrib['n'])
+            # make a regex line to recognize the lemma-tag pattern
+            re_line = cat_item_to_re(cat_item)
+            # add an empty category list if there is none
+            cat_dict.setdefault(re_line, [])
+            # add the category to the list
+            cat_dict[re_line].append(def_cat.attrib['n'])
     return cat_dict
 
 def get_cats_by_line(line, cat_dict):
     """
-    Return all possible categories for ALU.
+    Return all possible categories for each apertium token in line.
     """
-    return [get_cats_by_ALU(ALU, cat_dict)
-            for ALU in re.findall(r'\^.*?\$', line)]
+    return [get_cat(token, cat_dict)
+            for token in apertium_token_re.findall(line)]
 
-def get_cats_by_ALU(ALU, cat_dict):
+def get_cat(token, cat_dict):
     """
-    Return set of all possible categories for ALU.
+    Return all possible categories for token.
""" - divided = ALU.lstrip('^').rstrip('$').split('/') - if len(divided) > 1: - lemma = divided[0] - LU_list = divided[1:] - return (lemma, set(sum([get_cats_by_LU(LU, cat_dict, lemma) - for LU in LU_list], []))) - if len(divided) == 1: - lemma = divided[0] #.split('<', 1)[0] - return (lemma, set(get_cats_by_LU(divided[0], cat_dict, lemma))) - return ('default', set(default_cat)) + token_cat_list = [] + for cat_re, cat_list in cat_dict.items(): + if re.match(cat_re, token): + token_cat_list.extend(cat_list) + return (token, token_cat_list) -def get_cats_by_LU(LU, cat_dict, lemma): - """ - Return list of all possible categories for LU. - """ - partial_lemma = LU.split('<', 1)[0] - tags = LU[len(partial_lemma):].split('#', 1)[0] - cat_list = [] - for tag_re in cat_dict: - if re.match(tag_re, tags): - cat_list.extend((cat_dict[tag_re].get(lemma, []))) - cat_list.extend((cat_dict[tag_re].get('', []))) - if cat_list: - return cat_list - return default_cat - -def process_line(line, cat_dict): - """ - Get line in stream format and print all coverages and LRLM only. - """ - line = get_cats_by_line(line, cat_dict) - print(line) - - return line - -def get_options(): - """ - Parse commandline arguments - """ - usage = "USAGE: ./%prog [-a|-l] [-o OUTPUT_FILE] -r RULES_FILE [INPUT_FILE]" - op = OptionParser(usage=usage) - - op.add_option("-o", "--out", dest="ofname", - help="output results to OUTPUT_FILE.", metavar="OUTPUT_FILE") - - op.add_option("-r", "--rules", dest="rfname", - help="use RULES_FILE t*x file for calculating coverages.", metavar="RULES_FILE") - - mode_group = OptionGroup(op, "output mode", - "Specify what coverages to output, all or LRLM. " - "If none specified, outputs both variants.") - - mode_group.add_option("-a", "--all", dest="all", action="store_true", - help="output all coverages") - - mode_group.add_option("-l", "--lrlm", dest="lrlm", action="store_true", - help="output LRLM coverages") - - op.add_option_group(mode_group) - - (opts, args) = op.parse_args() - - if opts.rfname is None: - op.error("specify t*x file containing rules with -r (--rules) option.") - op.print_help() - sys.exit(1) - - if len(args) > 1: - op.error("too many arguments.") - op.print_help() - sys.exit(1) - - if opts.all is None and opts.lrlm is None: - opts.all = True - opts.lrlm = True - - return opts, args - def get_rules(transtree): """ From xml tree with transfer rules, - build an improvised pattern FST using nested dictionaries. + get rules, ambiguous rules, + and rule id to number map. 
""" root = transtree.getroot() - rules = [] - rule_id_map = {} - ambiguous_rule_groups = {} - prev_pattern, rule_group = [], -1 + + # build pattern -> rules numbers dict (rules_dict), + # and rule number -> rule id dict (rule_id_map) + rules_dict, rule_id_map = {}, {} for i, rule in enumerate(root.find('section-rules').findall('rule')): if 'id' in rule.attrib: + # rule has 'id' attribute: add it to rule_id_map rule_id_map[str(i)] = rule.attrib['id'] - pattern = ['start'] - for pattern_item in rule.find('pattern').findall('pattern-item'): - pattern.append(pattern_item.attrib['n']) - if pattern == prev_pattern: - ambiguous_rule_groups.setdefault(str(rule_group), {str(rule_group)}) - ambiguous_rule_groups[str(rule_group)].add(str(i)) - else: - rules.append(tuple(pattern) + (str(i),)) - rule_group = i - prev_pattern = pattern + # build pattern + pattern = tuple(pattern_item.attrib['n'] + for pattern_item in rule.find('pattern').findall('pattern-item')) + # add empty rules list for pattern + # if pattern was not in rules_dict + rules_dict.setdefault(pattern, []) + # add rule number to rules list + rules_dict[pattern].append(str(i)) + # detect groups of ambiguous rules, + # and prepare rules for building FST + rules, ambiguous_rule_groups = [], {} + for pattern, rule_group in rules_dict.items(): + if all(rule in rule_id_map for rule in rule_group): + # all rules in group have ids: add group to ambiguous rules + ambiguous_rule_groups[rule_group[0]] = rule_group + # add pattern to rules using first rule as default + rules.append(pattern + (rule_group[0],)) + # sort rules to optimize FST building rules.sort() + return rules, ambiguous_rule_groups, rule_id_map def prepare(rfname): @@ -200,91 +154,138 @@ return cat_dict, rules, ambiguous_rules, rule_id_map class FST: + """ + FST for coverage recognition. + """ def __init__(self, init_rules): + """ + Initialize with patterns from init_rules. + """ self.start_state = 0 - self.final_states = {} - self.states = {0} - self.alphabet = set() - self.transitions = {} + self.final_states = {} # final state: rule + self.transitions = {} # (state, input): state - maxlen = max(len(rule) for rule in init_rules) - 1 + maxlen = max(len(rule) for rule in init_rules) self.maxlen = maxlen - 1 - state, prev = 0, '' - rules = [] - for rule in init_rules: - rules.append([(rule[0], 0)] + list(rule[1:])) + # make rule table, where each pattern starts with ('start', 0) + rules = [[('start', self.start_state)] + list(rule) for rule in init_rules] + state, prev_cat = self.start_state, '' + # look at each rule pattern at fixed position for level in range(1, maxlen): for rule in rules: - # end of the rule if len(rule) <= level: + # this rule already ended: increment state to keep it simple state += 1 elif len(rule) == level+1: + # end of the rule is here: add this state as a final self.final_states[rule[level-1][1]] = rule[level] else: - if rule[level] != prev: + if rule[level] != prev_cat: + # rule patterns diverged: add new state state += 1 + # add transition self.transitions[(rule[level-1][1], rule[level])] = state - prev = rule[level] + prev_cat = rule[level] + # add current state to current pattern element rule[level] = (rule[level], state) - prev = '' + # change prev_cat to empty at the end of rules list + # to ensure state is changed at the start of next run through + prev_cat = '' def get_lrlm(self, line, cat_dict): + """ + Build all lrlm coverages for line. 
+ + """ + # tokenize line and get all possible categories for each token line = get_cats_by_line(line, cat_dict) + + # coverage and state lists are built dinamically + # each state from state_list is the state of FST + # at the end of corresponding coverage from coverage_list coverage_list, state_list = [[]], [self.start_state] + + # go through all tokens in line for token, cat_list in line: new_coverage_list, new_state_list = [], [] + + # go through all cats for the token for cat in cat_list: + + # try to continue each coverage obtained on the previous step for coverage, state in zip(coverage_list, state_list): - if (state, cat) not in self.transitions: - if state in self.final_states: + + if (state, cat) in self.transitions: + # current pattern can be made longer: add one more token + new_coverage_list.append(coverage + [('w', token)]) + new_state_list.append(self.transitions[(state, cat)]) + + elif state in self.final_states: + # current state is one of the final states: close previous pattern + new_coverage = coverage + [('r', self.final_states[state])] + if (self.start_state, cat) in self.transitions: - new_coverage_list.append(coverage + [('r', self.final_states[state]), ('w', token)]) + # can start new pattern + new_coverage_list.append(new_coverage + [('w', token)]) new_state_list.append(self.transitions[(self.start_state, cat)]) - else: - # discard coverage - pass - #print('Unknown transition: ({}, {})'.format(state, cat)) - else: - # discard coverage - pass - #print('Unknown transition: ({}, {})'.format(state, cat)) - else: - new_coverage_list.append(coverage + [('w', token)]) - new_state_list.append(self.transitions[(state, cat)]) + elif '*' in token: + # can not start new pattern because of an unknown word + new_coverage_list.append(new_coverage + [('w', token), ('r', -1)]) + new_state_list.append(self.start_state) + + elif state == self.start_state and '*' in token: + # unknown word at start state: add it to pattern, start new + new_coverage_list.append(coverage + [('w', token), ('r', -1)]) + new_state_list.append(self.start_state) + + # if nothing worked, just discard this coverage + coverage_list, state_list = new_coverage_list, new_state_list + # finalize coverages new_coverage_list = [] for coverage, state in zip(coverage_list, state_list): if state in self.final_states: + # current state is one of the final states: close the last pattern new_coverage_list.append(coverage + [('r', self.final_states[state])]) - else: - # discard coverage - pass - #print('Unexpected end of pattern') + elif coverage[-1][0] == 'r': + # the last pattern is already closed + new_coverage_list.append(coverage) + # if nothing worked, just discard this coverage as incomplete if new_coverage_list == []: + # no coverages detected: no need to go further return [] - handsome_coverage_list = [] + # convert coverage representation: + # [('r'/'w', rule_number/token), ...] -> [([token, token, ... ], rule_number), ...] 
+        formatted_coverage_list = []
         for coverage in new_coverage_list:
-            pattern, handsome_coverage = [], []
+            pattern, formatted_coverage = [], []
             for element in coverage:
                 if element[0] == 'w':
                     pattern.append(element[1])
                 else:
-                    handsome_coverage.append((pattern, element[1]))
+                    formatted_coverage.append((pattern, element[1]))
                     pattern = []
-            handsome_coverage_list.append(handsome_coverage)
+            formatted_coverage_list.append(formatted_coverage)
 
-        handsome_coverage_list.sort(key=signature, reverse=True)
-        signature_max = signature(handsome_coverage_list[0])
+        # sort coverages by signature, which is a tuple
+        # of coverage part lengths
+        formatted_coverage_list.sort(key=signature, reverse=True)
+        signature_max = signature(formatted_coverage_list[0])
+
+        # keep only the coverages with the top signature:
+        # these are the LRLM ones
         LRLM_list = []
-        for coverage in handsome_coverage_list:
+        for coverage in formatted_coverage_list:
             if signature(coverage) == signature_max:
+                # keep adding
                 LRLM_list.append(coverage)
             else:
+                # no need to look further, the others will be worse
                 return LRLM_list
         return LRLM_list
@@ -296,37 +297,10 @@
     return tuple([len(group[0]) for group in coverage])
 
 if __name__ == "__main__":
-    opts, args = get_options()
-    cat_dict, rules, ambiguous_rules, rule_id_map = prepare(opts.rfname)
+    cat_dict, rules, ambiguous_rules, rule_id_map = prepare('../apertium-en-es/apertium-en-es.en-es.t1x')
     pattern_FST = FST(rules)
 
-    #for rule in rules:
-    #    print(rule)
-    #print(rule_id_map)
-
-    coverages = pattern_FST.get_lrlm('^proud$ ^culture$', cat_dict)
+    coverages = pattern_FST.get_lrlm('^publish$ ^in$ ^the$ ^journal$ ^of$ ^the$ ^american$ ^medical$ ^association$ ^the$ ^study$ ^track$ ^the$ ^mental$ ^health$ ^of$ ^88,000$ ^army$ ^combat$ ^veteran$ ^by$ ^compare$ ^their$ ^response$ ^in$ ^a$ ^mental$ ^health$ ^questionnaire$ ^fill$ ^out$ ^upon$ ^their$ ^return$ ^home$ ^with$ ^a$ ^second$ ^mental$ ^health$ ^screening$ ^three$ ^to$ ^six$ ^month$ ^later$^.$', cat_dict)
+    print('Coverages detected:')
     for coverage in coverages:
         print(coverage)
-
-    sys.exit(0)
-
-    if len(args) == 0:
-        input_stream = sys.stdin
-    elif len(args) == 1:
-        try:
-            input_stream = open(args[0], 'r', encoding='utf-8')
-        except FileNotFoundError:
-            print('Failed to locate input file \'{}\'. '
-                  'Have you misspelled the name?'.format(args[0]))
-            sys.exit(1)
-
-    if opts.ofname:
-        output_stream = open(opts.ofname, 'w', encoding='utf-8')
-    else:
-        output_stream = sys.stdout
-
-    for line in input_stream:
-        process_line(line, cat_dict, pattern_FST, output_stream, opts.all, opts.lrlm)
-
-    if opts.ofname:
-        output_stream.close()
Index: branches/weighted-transfer/apertium-weights-learner/pipelines.py
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/pipelines.py	(revision 72156)
+++ branches/weighted-transfer/apertium-weights-learner/pipelines.py	(revision 72157)
@@ -1,8 +1,20 @@
-import sys
+import sys, re
 from subprocess import Popen, PIPE
 
+# apertium special symbols for removal
+apertium_re = re.compile(r'[@#~*]')
+
 class partialTranslator():
+    """
+    Wrapper for the part of the Apertium pipeline
+    going from bidix lookup to generation.
+    """
     def __init__(self, tixfname, binfname):
+        """
+        On initialization, the partial Apertium pipeline
+        is invoked with the '-z' option (null flush)
+        and remains active waiting for input.
+ """ self.autobil = Popen(['lt-proc', '-b', '-z', binfname + '.autobil.bin' ], @@ -28,6 +40,11 @@ stdin = self.postchunk.stdout, stdout = PIPE) def translate(self, string): + """ + Convert input string to bytes, + send it to the pipeline, + return the result converted to utf-8. + """ string = string.strip() + '[][\n]' if type(string) == type(''): @@ -45,10 +62,22 @@ output.append(char) char = self.autogen.stdout.read(1) - return (b''.join(output)).decode('utf-8').replace('[][\n]','') + return apertium_re.sub('', (b''.join(output)).decode('utf-8').replace('[][\n]','')) class weightedPartialTranslator(): + """ + Wrapper for part of Apertium pipeline + going from bidix lookup to the generation. + It is missing 1st-stage transfer at init, + because transfer is invoked at translation + with provided weights file. + """ def __init__(self, tixfname, binfname): + """ + On initialization, fragments of Apertium pipeline + are invoked with '-z' option (null flush) + and remain active waiting for input. + """ self.tixfname = tixfname self.binfname = binfname @@ -58,6 +87,8 @@ stdin = PIPE, stdout = PIPE) # transfer is missing here + # it is invoked during translation + # using provided transfer weights file self.interchunk = Popen(['apertium-interchunk', '-z', tixfname + '.t2x', @@ -75,7 +106,11 @@ stdin = self.postchunk.stdout, stdout = PIPE) def translate(self, string, wixfname): - # start null flush pipeline + """ + Convert input string to bytes, + send it to the pipeline, + return the result converted to utf-8. + """ string = string.strip() + '[][\n]' if type(string) == type(''): @@ -83,6 +118,7 @@ else: bstring = string + # start going through null flush pipeline self.autobil.stdin.write(bstring) self.autobil.stdin.write(b'\0') self.autobil.stdin.flush() @@ -103,7 +139,7 @@ transfer_output, err = transfer.communicate(b''.join(autobil_output)) - # resume null flush pipeline + # resume going through null flush pipeline self.interchunk.stdin.write(transfer_output) self.interchunk.stdin.write(b'\0') self.interchunk.stdin.flush() @@ -114,7 +150,7 @@ autogen_output.append(char) char = self.autogen.stdout.read(1) - return (b''.join(autogen_output)).decode('utf-8').replace('[][\n]','') + return apertium_re.sub('', (b''.join(autogen_output)).decode('utf-8').replace('[][\n]','')) if __name__ == "__main__": t = weightedPartialTranslator('../apertium-en-es/apertium-en-es.en-es', '../apertium-en-es/en-es') Index: branches/weighted-transfer/apertium-weights-learner/twlearner.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/twlearner.py (revision 72156) +++ branches/weighted-transfer/apertium-weights-learner/twlearner.py (revision 72157) @@ -1,11 +1,15 @@ #! 
 #!/usr/bin/python3
 
 import re, sys, os, pipes, gc
+from time import perf_counter as clock
 from math import exp
-from time import perf_counter as clock
 
+# language model handling
 import kenlm
-import twlconfig # a simple config in python file
+# simple config in python file
+import twlconfig
+# module for coverage calculation
 import coverage
+# apertium translator pipelines
 from pipelines import partialTranslator, weightedPartialTranslator
 
 tmpweights_fname = 'tmpweights.w1x'
@@ -16,11 +20,8 @@
 # anything between $ and ^
 inter_re = re.compile(r'\$.*?\^')
 
-# apertium special symbols for removal
-apertium_re = re.compile(r'[@#~*]')
-
 # apertium token (anything between ^ and $)
-apertium_token_re = re.compile(r'\^.*?\$')
+apertium_token_re = re.compile(r'\^(.*?)\$')
 
 # start and finish of weights file
 weights_head = '<?xml version="1.0" encoding="UTF-8"?>\n<transfer-weights>\n'
@@ -50,13 +51,13 @@
 def pattern_to_xml(pattern, weight=1.):
     """
     Create a string with XML representation
-    of pattern with weight for weigths file.
+    of a weighted pattern for the weights file.
     """
     pattern_line = '    <pattern weight="{}">\n'.format(weight)
     for pattern_item in pattern:
-        parts = pattern_item.strip('^$').split('<', maxsplit=1)
+        parts = pattern_item.split('<', maxsplit=1) + ['']
         lemma, tags = parts[0], parts[1].strip('>')
-        pattern_line += '        <pattern-item lemma="{}" tags="{}"/>\n'.format(lemma, '.'.join(tags.split('><')))
+        pattern_line += '        <pattern-item lemma="{}" tags="{}"/>\n'.format(lemma, tags.replace('><', '.'))
     pattern_line += '    </pattern>\n'
     return pattern_line
@@ -90,7 +91,7 @@
     btime = clock()
 
     # make output file name
-    ofname = '{}-tagged.txt'.format(prefix)
+    ofname = prefix + '-tagged.txt'
 
     # create pipeline
     pipe = pipes.Template()
@@ -124,10 +125,12 @@
     btime = clock()
 
     # make output file name
-    ofname = '{}-ambiguous.txt'.format(prefix)
+    ofname = prefix + '-ambiguous.txt'
 
-    # initialize translator for translation with no weights
+    # initialize translators
+    # for translation with no weights
     translator = partialTranslator(tixfname, binfname)
+    # for weighted translation
     weighted_translator = weightedPartialTranslator(tixfname, binfname)
 
     # initialize statistics
@@ -138,13 +141,6 @@
     with open(corpus, 'r', encoding='utf-8') as ifile, \
          open(ofname, 'w', encoding='utf-8') as ofile:
         for line in ifile:
-            lines_count += 1
-            if lines_count % 1000 == 0:
-                print('\n{} total lines\n{} total sentences'.format(lines_count, total_sents_count))
-                print('{} ambiguous sentences\n{} ambiguous chunks'.format(ambig_sents_count, ambig_chunks_count))
-                print('{} botched coverages\nanother {:.4f} elapsed'.format(botched_coverages, clock() - lbtime))
-                gc.collect()
-                lbtime = clock()
 
             # look at each sentence in line
             for sent_match in sent_re.finditer(line.strip()):
@@ -154,28 +150,33 @@
                 coverage_list = pattern_FST.get_lrlm(sent_match.group(0), cat_dict)
                 if coverage_list == []:
                     botched_coverages += 1
+                    print('Botched coverage:', sent_match.group(0))
+                    print()
                 else:
                     # look for ambiguous chunks
                     coverage_item = coverage_list[0]
                     pattern_list = search_ambiguous(ambiguous_rules, coverage_item)
                     if pattern_list != []:
-                        #print(coverage_item)
-                        #print()
-                        #print(pattern_list)
-                        #print()
+                        print('Coverage:', coverage_item)
+                        print('Pattern list:', pattern_list)
+                        print()
                         ambig_sents_count += 1
 
                         # segment the sentence into parts each containing one ambiguous chunk
                         sentence_segments, prev = [], 0
                         for i, rule_group_number, pattern in pattern_list:
                             ambig_chunks_count += 1
-                            piece_of_line = '^' + '$ ^'.join(sum([chunk[0] for chunk in coverage_item[prev:i+1]], [])) + '$'
+                            list_with_chunk = sum([chunk[0]
+                                                   for chunk in coverage_item[prev:i+1]], [])
+                            piece_of_line = '^' + '$ ^'.join(list_with_chunk) + '$'
                             sentence_segments.append([rule_group_number, pattern, piece_of_line])
                             prev = i+1
 
                         if sentence_segments != []:
+                            if prev <= len(coverage_item):
                                 # add up the tail of the sentence
-                            if prev <= len(coverage_item):
-                                piece_of_line = ' ^' + '$ ^'.join(sum([chunk[0] for chunk in coverage_item[prev:]], [])) + '$'
+                                list_with_chunk = sum([chunk[0]
+                                                       for chunk in coverage_item[prev:]], [])
+                                piece_of_line = ' ^' + '$ ^'.join(list_with_chunk) + '$'
                                 sentence_segments[-1][2] += piece_of_line
                             #print(sentence_segments)
@@ -183,7 +184,7 @@
 
                             # first, translate each segment with default rules
                             for sentence_segment in sentence_segments:
-                                sentence_segment.append(apertium_re.sub('', translator.translate(sentence_segment[2])))
+                                sentence_segment.append(translator.translate(sentence_segment[2]))
 
                             # second, translate each segment with each of the rules,
                             # and make a full sentence where the other segments are translated with default rules
@@ -202,6 +203,15 @@
                                 print('{}\t^{}$\t{}'.format(sentence_segment[0], '$ ^'.join(sentence_segment[1]), len(output_list)), file=ofile)
                                 # then, output all the translations in the following way: rule number, then translated sentence
                                 print('\n'.join(output_list), file=ofile)
+
+            lines_count += 1
+            if lines_count % 100 == 0:
+                print('\n{} total lines\n{} total sentences'.format(lines_count, total_sents_count))
+                print('{} ambiguous sentences\n{} ambiguous chunks'.format(ambig_sents_count, ambig_chunks_count))
+                print('{} botched coverages\nanother {:.4f} elapsed'.format(botched_coverages, clock() - lbtime))
+                gc.collect()
+                lbtime = clock()
+
     print('Done in {:.2f}'.format(clock() - btime))
 
     return ofname
@@ -225,8 +235,8 @@
         with open(tmpweights_fname, 'w', encoding='utf-8') as wfile:
             wfile.write(weights_line)
 
-        # translate using created file
-        translation = apertium_re.sub('', weighted_translator.translate(sent_line, tmpweights_fname))
+        # translate using the created weights file
+        translation = weighted_translator.translate(sent_line, tmpweights_fname)
         translation_list.append((focus_rule, translation))
 
     return translation_list
@@ -239,7 +249,7 @@
     btime, chunk_counter, sentence_counter = clock(), 0, 0
 
     # make output file name
-    ofname = '{}-chunk-weights.txt'.format(prefix)
+    ofname = prefix + '-chunk-weights.txt'
 
     with open(ambig_sentences_fname, 'r', encoding='utf-8') as ifile, \
          open(ofname, 'w', encoding='utf-8') as ofile:
@@ -248,20 +258,24 @@
             try:
                 line = ifile.readline()
                 group_number, pattern, rulecount = line.rstrip('\n').split('\t')
-                weights_list = []
-                # score
-                total = 0.
+                weights_list, total = [], 0.
+
+                # read and process as many following lines as specified by rulecount
                 for i in range(int(rulecount)):
                     line = ifile.readline()
                     rule_number, sentence = line.rstrip('\n').split('\t')
+
+                    # score and add up
                     score = exp(model.score(normalize(sentence), bos = True, eos = True))
                     weights_list.append((rule_number, score))
                     total += score
                     sentence_counter += 1
+                # normalize and print out
                 for rule_number, score in weights_list:
                     print(group_number, rule_number, pattern, score / total, sep='\t', file=ofile)
                 chunk_counter += 1
+
             except ValueError:
                 reading = False
             except IndexError:
@@ -268,6 +282,7 @@
                 reading = False
             except EOFError:
                 reading = False
+
     print('Scored {} chunks, {} sentences in {:.2f}'.format(chunk_counter, sentence_counter, clock() - btime))
 
     return ofname
@@ -280,8 +295,8 @@
     btime = clock()
 
     # make output file names
-    sorted_scores_fname = '{}-chunk-weights-sorted.txt'.format(prefix)
-    ofname = '{}-rule-weights.w1x'.format(prefix)
+    sorted_scores_fname = prefix + '-chunk-weights-sorted.txt'
+    ofname = prefix + '-rule-weights.w1x'
 
     # create pipeline
     pipe = pipes.Template()
@@ -357,7 +372,7 @@
     # sum up weights for rule-pattern and make final xml
     make_xml_rules(scores_fname, prefix, rule_id_map)
 
-    # clean up
+    # clean up the temporary weights file
    if os.path.exists(tmpweights_fname):
        os.remove(tmpweights_fname)
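
A minimal standalone sketch of the regex construction that the new cat_item_to_re performs. The helper below takes plain lemma/tags arguments instead of an xml element, and the sample cat-items and tokens are invented for illustration; only the construction itself mirrors the patched code:

import re

any_tag_re = '<[a-z0-9-]+>'
any_num_of_any_tags_re = '({})*'.format(any_tag_re)

def cat_item_attrs_to_re(lemma, tags):
    # same construction as cat_item_to_re, minus the xml element
    re_line = '^' + (lemma if lemma is not None else '[^<>]*')
    if tags == '':
        return re_line + '$'
    tag_sequence = tags.split('.')
    for tag in tag_sequence[:-1]:
        # a mid-pattern '*' matches exactly one tag
        re_line += any_tag_re if tag == '*' else '<{}>'.format(tag)
    if tag_sequence[-1] == '*':
        # a trailing '*' matches any number of tags, including none
        re_line += any_num_of_any_tags_re
    else:
        re_line += '<{}>'.format(tag_sequence[-1])
    return re_line + '$'

# hypothetical <cat-item tags="n.*"/>: any lemma, noun, any further tags
noun_re = cat_item_attrs_to_re(None, 'n.*')
assert re.match(noun_re, 'journal<n><sg>')
assert not re.match(noun_re, 'publish<vblex><pri><p3><sg>')

# hypothetical <cat-item lemma="of" tags="pr"/>: fixed lemma, single tag
assert re.match(cat_item_attrs_to_re('of', 'pr'), 'of<pr>')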
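The LRLM selection in get_lrlm reduces to comparing signatures, i.e. tuples of chunk lengths, in reverse lexicographic order, so the coverage with the longer leftmost chunk wins. A toy illustration with made-up chunks (the token lists and rule numbers are invented):

def signature(coverage):
    # tuple of token-group lengths, as in coverage.py
    return tuple(len(group[0]) for group in coverage)

# two hypothetical coverages of the same four tokens
cov_a = [(['the', 'mental', 'health'], '12'), (['questionnaire'], '3')]  # signature (3, 1)
cov_b = [(['the'], '7'), (['mental', 'health', 'questionnaire'], '9')]   # signature (1, 3)

ranked = sorted([cov_a, cov_b], key=signature, reverse=True)
assert ranked[0] is cov_a  # (3, 1) > (1, 3): left-to-right longest match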
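The weight arithmetic in score_sentences, shown with made-up language-model scores (the rule numbers and score values below are invented): each candidate sentence's score is exponentiated, accumulated into a group total, and each rule's weight is then its share of that total, so the weights within an ambiguous-rule group sum to 1:

from math import exp

# hypothetical (rule number, model.score(...)) pairs for one chunk
scored = [('27', -19.2), ('28', -18.7), ('29', -20.1)]

weights_list, total = [], 0.
for rule_number, lm_score in scored:
    score = exp(lm_score)
    weights_list.append((rule_number, score))
    total += score

for rule_number, score in weights_list:
    # normalized weight for this rule within the group
    print(rule_number, score / total, sep='\t')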