Index: branches/weighted-transfer/apertium-weights-learner/pipelines.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/pipelines.py (revision 72236) +++ branches/weighted-transfer/apertium-weights-learner/pipelines.py (nonexistent) @@ -1,163 +0,0 @@ -import sys, re -from subprocess import Popen, PIPE - -# apertium special symbols for removal -apertium_re = re.compile(r'[@#~*]') - -class partialTranslator(): - """ - Wrapper for part of Apertium pipeline - going from bidix lookup to the generation. - """ - def __init__(self, tixfname, binfname): - """ - On initialization, partial Apertium pipeline - is invoked with '-z' option (null flush) - and remains active waiting for input. - """ - self.autobil = Popen(['lt-proc', '-b', '-z', - binfname + '.autobil.bin' - ], - stdin = PIPE, stdout = PIPE) - self.transfer = Popen(['apertium-transfer', '-b', '-z', - tixfname + '.t1x', - binfname + '.t1x.bin' - ], - stdin = self.autobil.stdout, stdout = PIPE) - self.interchunk = Popen(['apertium-interchunk', '-z', - tixfname + '.t2x', - binfname + '.t2x.bin' - ], - stdin = self.transfer.stdout, stdout = PIPE) - self.postchunk = Popen(['apertium-postchunk', '-z', - tixfname + '.t2x', - binfname + '.t2x.bin' - ], - stdin = self.interchunk.stdout, stdout = PIPE) - self.autogen = Popen(['lt-proc', '-g', '-z', - binfname + '.autogen.bin' - ], - stdin = self.postchunk.stdout, stdout = PIPE) - - def translate(self, string): - """ - Convert input string to bytes, - send it to the pipeline, - return the result converted to utf-8. - """ - string = string.strip() + '[][\n]' - - if type(string) == type(''): - bstring = bytes(string, 'utf-8') - else: - bstring = string - - self.autobil.stdin.write(bstring) - self.autobil.stdin.write(b'\0') - self.autobil.stdin.flush() - - char = self.autogen.stdout.read(1) - output = [] - while char and char != b'\0': - output.append(char) - char = self.autogen.stdout.read(1) - - return apertium_re.sub('', (b''.join(output)).decode('utf-8').replace('[][\n]','')) - -class weightedPartialTranslator(): - """ - Wrapper for part of Apertium pipeline - going from bidix lookup to the generation. - It is missing 1st-stage transfer at init, - because transfer is invoked at translation - with provided weights file. - """ - def __init__(self, tixfname, binfname): - """ - On initialization, fragments of Apertium pipeline - are invoked with '-z' option (null flush) - and remain active waiting for input. - """ - self.tixfname = tixfname - self.binfname = binfname - - self.autobil = Popen(['lt-proc', '-b', '-z', - binfname + '.autobil.bin' - ], - stdin = PIPE, stdout = PIPE) - - # transfer is missing here - # it is invoked during translation - # using provided transfer weights file - - self.interchunk = Popen(['apertium-interchunk', '-z', - tixfname + '.t2x', - binfname + '.t2x.bin' - ], - stdin = PIPE, stdout = PIPE) - self.postchunk = Popen(['apertium-postchunk', '-z', - tixfname + '.t2x', - binfname + '.t2x.bin' - ], - stdin = self.interchunk.stdout, stdout = PIPE) - self.autogen = Popen(['lt-proc', '-g', '-z', - binfname + '.autogen.bin' - ], - stdin = self.postchunk.stdout, stdout = PIPE) - - def translate(self, string, wixfname): - """ - Convert input string to bytes, - send it to the pipeline, - return the result converted to utf-8. 
- """ - string = string.strip() + '[][\n]' - - if type(string) == type(''): - bstring = bytes(string, 'utf-8') - else: - bstring = string - - # start going through null flush pipeline - self.autobil.stdin.write(bstring) - self.autobil.stdin.write(b'\0') - self.autobil.stdin.flush() - - char = self.autobil.stdout.read(1) - autobil_output = [] - while char and char != b'\0': - autobil_output.append(char) - char = self.autobil.stdout.read(1) - - # make weighted transfer - transfer = Popen(['apertium-transfer', '-bw', - wixfname, - self.tixfname + '.t1x', - self.binfname + '.t1x.bin' - ], - stdin = PIPE, stdout = PIPE) - - transfer_output, err = transfer.communicate(b''.join(autobil_output)) - - # resume going through null flush pipeline - self.interchunk.stdin.write(transfer_output) - self.interchunk.stdin.write(b'\0') - self.interchunk.stdin.flush() - - char = self.autogen.stdout.read(1) - autogen_output = [] - while char and char != b'\0': - autogen_output.append(char) - char = self.autogen.stdout.read(1) - - return apertium_re.sub('', (b''.join(autogen_output)).decode('utf-8').replace('[][\n]','')) - -if __name__ == "__main__": - t = weightedPartialTranslator('../apertium-en-es/apertium-en-es.en-es', '../apertium-en-es/en-es') - - with open('./tests/testfile.txt', 'r', encoding='utf-8') as ifile: - for line in ifile: - print('line:', line) - mo = t.translate(line, '../apertium-en-es/apertium-en-es.en-es.w1x') - print('mo:', mo) - print() Index: branches/weighted-transfer/apertium-weights-learner/coverage.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/coverage.py (revision 72236) +++ branches/weighted-transfer/apertium-weights-learner/coverage.py (nonexistent) @@ -1,313 +0,0 @@ -#! /usr/bin/python3 - -import re, sys -from optparse import OptionParser, OptionGroup -from time import clock - -try: # see if lxml is installed - from lxml import etree as ET - if __name__ == "__main__": - print("Using lxml library happily ever after.") -except ImportError: # it is not - import xml.etree.ElementTree as ET - if __name__ == "__main__": - print("lxml library not found. Falling back to xml.etree,\n" - "though it's highly recommended that you install lxml\n" - "as it works dramatically faster than xml.etree.") - -# regex lines to build up rexes for cat-items -any_tag_re = '<[a-z0-9-]+>' -any_num_of_any_tags_re = '({})*'.format(any_tag_re) - -# apertium token (anything between ^ and $) -apertium_token_re = re.compile(r'\^(.*?)\$') - -def cat_item_to_re(cat_item): - """ - Get a pattern as specified in xml. - Output a regex line that matches what - is specified by the pattern. - - Attention: ^ and $ here are NOT Apertium start - and end of token, they are regex start and end - of line. Token is assumed to have been already - stripped of its ^ and $. - """ - - # start with the lemma (or with the lack of it) - re_line = '^' + cat_item.attrib.get('lemma', '[^<>]*') - - tags = cat_item.attrib['tags'] - - if tags == '': - # no tags: close regex line - return re_line + '$' - - tag_sequence = tags.split('.') - for tag in tag_sequence[:-1]: - if tag == '*': - # any tag - re_line += any_tag_re - else: - # specific tag - re_line += '<{}>'.format(tag) - - if tag_sequence[-1] == '*': - # any tags at the end - re_line += any_num_of_any_tags_re - else: - # specific tag at the end - re_line += '<{}>'.format(tag_sequence[-1]) - - return re_line + '$' - -def get_cat_dict(transtree): - """ - Get an xml tree with transfer rules. 
- Build an inverted index of the rules. - """ - root = transtree.getroot() - cat_dict = {} - for def_cat in root.find('section-def-cats').findall('def-cat'): - for cat_item in def_cat.findall('cat-item'): - # make a regex line to recognize lemma-tag pattern - re_line = cat_item_to_re(cat_item) - # add empty category list if there is none - cat_dict.setdefault(re_line, []) - # add category to the list - cat_dict[re_line].append(def_cat.attrib['n']) - return cat_dict - -def get_cats_by_line(line, cat_dict): - """ - Return all possible categories for each apertium token in line. - """ - return [get_cat(token, cat_dict) - for token in apertium_token_re.findall(line)] - -def get_cat(token, cat_dict): - """ - Return all possible categories for token. - """ - token_cat_list = [] - for cat_re, cat_list in cat_dict.items(): - if re.match(cat_re, token): - token_cat_list.extend(cat_list) - return (token, token_cat_list) - -def get_rules(transtree): - """ - From xml tree with transfer rules, - get rules, ambiguous rules, - and rule id to number map. - """ - root = transtree.getroot() - - # build pattern -> rules numbers dict (rules_dict), - # and rule number -> rule id dict (rule_id_map) - rules_dict, rule_xmls, rule_id_map = {}, {}, {} - for i, rule in enumerate(root.find('section-rules').findall('rule')): - if 'id' in rule.attrib: - # rule has 'id' attribute: add it to rule_id_map - rule_id_map[str(i)] = rule.attrib['id'] - rule_xmls[str(i)] = rule - # build pattern - pattern = tuple(pattern_item.attrib['n'] - for pattern_item in rule.find('pattern').findall('pattern-item')) - # add empty rules list for pattern - # if pattern was not in rules_dict - rules_dict.setdefault(pattern, []) - # add rule number to rules list - rules_dict[pattern].append(str(i)) - - # detect groups of ambiguous rules, - # and prepare rules for building FST - rules, ambiguous_rule_groups = [], {} - for pattern, rule_group in rules_dict.items(): - if all(rule in rule_id_map for rule in rule_group): - # all rules in group have ids: add group to ambiguous rules - ambiguous_rule_groups[rule_group[0]] = rule_group - # add pattern to rules using first rule as default - rules.append(pattern + (rule_group[0],)) - # sort rules to optimize FST building - rules.sort() - - return rules, ambiguous_rule_groups, rule_id_map, rule_xmls - -def prepare(rfname): - """ - Read transfer file and prepare pattern FST. - """ - try: - transtree = ET.parse(rfname) - except FileNotFoundError: - print('Failed to locate rules file \'{}\'. ' - 'Have you misspelled the name?'.format(opts.rfname)) - sys.exit(1) - except ET.ParseError: - print('Error parsing rules file \'{}\'. ' - 'Is there something wrong with it?'.format(opts.rfname)) - sys.exit(1) - - cat_dict = get_cat_dict(transtree) - rules, ambiguous_rules, rule_id_map, rule_xmls = get_rules(transtree) - - return cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls - -class FST: - """ - FST for coverage recognition. - """ - def __init__(self, init_rules): - """ - Initialize with patterns from init_rules. 
- """ - self.start_state = 0 - self.final_states = {} # final state: rule - self.transitions = {} # (state, input): state - - maxlen = max(len(rule) for rule in init_rules) - self.maxlen = maxlen - 1 - - # make rule table, where each pattern starts with ('start', 0) - rules = [[('start', self.start_state)] + list(rule) for rule in init_rules] - - state, prev_cat = self.start_state, '' - # look at each rule pattern at fixed position - for level in range(1, maxlen): - for rule in rules: - if len(rule) <= level: - # this rule already ended: increment state to keep it simple - state += 1 - elif len(rule) == level+1: - # end of the rule is here: add this state as a final - self.final_states[rule[level-1][1]] = rule[level] - else: - if rule[level] != prev_cat: - # rule patterns diverged: add new state - state += 1 - # add transition - self.transitions[(rule[level-1][1], rule[level])] = state - prev_cat = rule[level] - # add current state to current pattern element - rule[level] = (rule[level], state) - # change prev_cat to empty at the end of rules list - # to ensure state is changed at the start of next run through - prev_cat = '' - - def get_lrlm(self, line, cat_dict): - """ - Build all lrlm coverages for line. - - """ - # tokenize line and get all possible categories for each token - line = get_cats_by_line(line, cat_dict) - - # coverage and state lists are built dinamically - # each state from state_list is the state of FST - # at the end of corresponding coverage from coverage_list - coverage_list, state_list = [[]], [self.start_state] - - # go through all tokens in line - for token, cat_list in line: - new_coverage_list, new_state_list = [], [] - - # go through all cats for the token - for cat in cat_list: - - # try to continue each coverage obtained on the previous step - for coverage, state in zip(coverage_list, state_list): - - # first, check if we can go further along current pattern - if (state, cat) in self.transitions: - # current pattern can be made longer: add one more token - new_coverage_list.append(coverage + [('w', token)]) - new_state_list.append(self.transitions[(state, cat)]) - - # if not, check if we can finalize current pattern - elif state in self.final_states: - # current state is one of the final states: close previous pattern - new_coverage = coverage + [('r', self.final_states[state])] - - if (self.start_state, cat) in self.transitions: - # can start new pattern - new_coverage_list.append(new_coverage + [('w', token)]) - new_state_list.append(self.transitions[(self.start_state, cat)]) - elif '*' in token: - # can not start new pattern because of an unknown word - new_coverage_list.append(new_coverage + [('w', token), ('r', 'unknown')]) - new_state_list.append(self.start_state) - - # if not, check if it is just an unknown word - elif state == self.start_state and '*' in token: - # unknown word at start state: add it to pattern, start new - new_coverage_list.append(coverage + [('w', token), ('r', 'unknown')]) - new_state_list.append(self.start_state) - - # if nothing worked, just discard this coverage - - coverage_list, state_list = new_coverage_list, new_state_list - - # finalize coverages - new_coverage_list = [] - for coverage, state in zip(coverage_list, state_list): - if state in self.final_states: - # current state is one of the final states: close the last pattern - new_coverage_list.append(coverage + [('r', self.final_states[state])]) - elif coverage != [] and coverage[-1][0] == 'r': - # the last pattern is already closed - new_coverage_list.append(coverage) - 
# if nothing worked, just discard this coverage as incomplete - - if new_coverage_list == []: - # no coverages detected: no need to go further - return [] - - # convert coverage representation: - # [('r'/'w', rule_number/token), ...] -> [([token, token, ... ], rule_number), ...] - formatted_coverage_list = [] - for coverage in new_coverage_list: - pattern, formatted_coverage = [], [] - for element in coverage: - if element[0] == 'w': - pattern.append(element[1]) - else: - formatted_coverage.append((pattern, element[1])) - pattern = [] - formatted_coverage_list.append(formatted_coverage) - - # now we filter out some not-lrlm coverages - # that still got into - - # sort coverages by signature, which is a tuple - # of coverage part lengths - formatted_coverage_list.sort(key=signature, reverse=True) - signature_max = signature(formatted_coverage_list[0]) - - # keep only those with top signature - # they would be the LRLM ones - LRLM_list = [] - for coverage in formatted_coverage_list: - if signature(coverage) == signature_max: - # keep adding - LRLM_list.append(coverage) - else: - # no need to look further, others will be worse - return LRLM_list - return LRLM_list - -def signature(coverage): - """ - Get coverage signature which is just a tuple - of lengths of groups comprising the coverage. - """ - return tuple([len(group[0]) for group in coverage]) - -if __name__ == "__main__": - cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls = prepare('../apertium-en-es/apertium-en-es.en-es.t1x') - pattern_FST = FST(rules) - - coverages = pattern_FST.get_lrlm('^prpers$ ^want# to$ ^wait$ ^until$ ^prpers$ ^can$ ^offer$ ^what$ ^would$ ^be$ ^totally$ ^satisfy$ ^for$ ^consumer$^.$', cat_dict) - print('Coverages detected:') - for coverage in coverages: - print(coverage) Property changes on: branches/weighted-transfer/apertium-weights-learner/coverage.py ___________________________________________________________________ Deleted: svn:executable ## -1 +0,0 ## -* \ No newline at end of property Index: branches/weighted-transfer/apertium-weights-learner/tools/coverage.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/tools/coverage.py (nonexistent) +++ branches/weighted-transfer/apertium-weights-learner/tools/coverage.py (revision 72237) @@ -0,0 +1,314 @@ +#! /usr/bin/python3 + +import re, sys +from optparse import OptionParser, OptionGroup +from time import clock + +try: # see if lxml is installed + from lxml import etree as ET + if __name__ == "__main__": + print("Using lxml library happily ever after.") +except ImportError: # it is not + import xml.etree.ElementTree as ET + if __name__ == "__main__": + print("lxml library not found. Falling back to xml.etree,\n" + "though it's highly recommended that you install lxml\n" + "as it works dramatically faster than xml.etree.") + +# regex lines to build up rexes for cat-items +any_tag_re = '<[a-z0-9-]+>' +any_num_of_any_tags_re = '({})*'.format(any_tag_re) + +# apertium token (anything between ^ and $) +apertium_token_re = re.compile(r'\^(.*?)\$') + +def cat_item_to_re(cat_item): + """ + Get a pattern as specified in xml. + Output a regex line that matches what + is specified by the pattern. + + Attention: ^ and $ here are NOT Apertium start + and end of token, they are regex start and end + of line. Token is assumed to have been already + stripped of its ^ and $. 
+ """ + + # start with the lemma (or with the lack of it) + re_line = '^' + cat_item.attrib.get('lemma', '[^<>]*') + + tags = cat_item.attrib['tags'] + + if tags == '': + # no tags: close regex line + return re_line + '$' + + tag_sequence = tags.split('.') + for tag in tag_sequence[:-1]: + if tag == '*': + # any tag + re_line += any_tag_re + else: + # specific tag + re_line += '<{}>'.format(tag) + + if tag_sequence[-1] == '*': + # any tags at the end + re_line += any_num_of_any_tags_re + else: + # specific tag at the end + re_line += '<{}>'.format(tag_sequence[-1]) + + return re_line + '$' + +def get_cat_dict(transtree): + """ + Get an xml tree with transfer rules. + Build an inverted index of the rules. + """ + root = transtree.getroot() + cat_dict = {} + for def_cat in root.find('section-def-cats').findall('def-cat'): + for cat_item in def_cat.findall('cat-item'): + # make a regex line to recognize lemma-tag pattern + re_line = cat_item_to_re(cat_item) + # add empty category list if there is none + cat_dict.setdefault(re_line, []) + # add category to the list + cat_dict[re_line].append(def_cat.attrib['n']) + return cat_dict + +def get_cats_by_line(line, cat_dict): + """ + Return all possible categories for each apertium token in line. + """ + return [get_cat(token, cat_dict) + for token in apertium_token_re.findall(line)] + +def get_cat(token, cat_dict): + """ + Return all possible categories for token. + """ + token_cat_list = [] + for cat_re, cat_list in cat_dict.items(): + if re.match(cat_re, token): + token_cat_list.extend(cat_list) + return (token, token_cat_list) + +def get_rules(transtree): + """ + From xml tree with transfer rules, + get rules, ambiguous rules, + and rule id to number map. + """ + root = transtree.getroot() + + # build pattern -> rules numbers dict (rules_dict), + # and rule number -> rule id dict (rule_id_map) + rules_dict, rule_xmls, rule_id_map = {}, {}, {} + for i, rule in enumerate(root.find('section-rules').findall('rule')): + if 'id' in rule.attrib: + # rule has 'id' attribute: add it to rule_id_map + rule_id_map[str(i)] = rule.attrib['id'] + rule_xmls[str(i)] = rule + # build pattern + pattern = tuple(pattern_item.attrib['n'] + for pattern_item in rule.find('pattern').findall('pattern-item')) + # add empty rules list for pattern + # if pattern was not in rules_dict + rules_dict.setdefault(pattern, []) + # add rule number to rules list + rules_dict[pattern].append(str(i)) + + # detect groups of ambiguous rules, + # and prepare rules for building FST + rules, ambiguous_rule_groups = [], {} + for pattern, rule_group in rules_dict.items(): + if all(rule in rule_id_map for rule in rule_group): + # all rules in group have ids: add group to ambiguous rules + ambiguous_rule_groups[rule_group[0]] = rule_group + # add pattern to rules using first rule as default + rules.append(pattern + (rule_group[0],)) + # sort rules to optimize FST building + rules.sort() + + return rules, ambiguous_rule_groups, rule_id_map, rule_xmls + +def prepare(rfname): + """ + Read transfer file and prepare pattern FST. + """ + try: + transtree = ET.parse(rfname) + except FileNotFoundError: + print('Failed to locate rules file \'{}\'. ' + 'Have you misspelled the name?'.format(opts.rfname)) + sys.exit(1) + except ET.ParseError: + print('Error parsing rules file \'{}\'. 
' + 'Is there something wrong with it?'.format(opts.rfname)) + sys.exit(1) + + cat_dict = get_cat_dict(transtree) + rules, ambiguous_rules, rule_id_map, rule_xmls = get_rules(transtree) + + return cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls + +class FST: + """ + FST for coverage recognition. + """ + def __init__(self, init_rules): + """ + Initialize with patterns from init_rules. + """ + self.start_state = 0 + self.final_states = {} # final state: rule + self.transitions = {} # (state, input): state + + maxlen = max(len(rule) for rule in init_rules) + self.maxlen = maxlen - 1 + + # make rule table, where each pattern starts with ('start', 0) + rules = [[('start', self.start_state)] + list(rule) for rule in init_rules] + + state, prev_cat = self.start_state, '' + # look at each rule pattern at fixed position + for level in range(1, maxlen): + for rule in rules: + if len(rule) <= level: + # this rule already ended: increment state to keep it simple + state += 1 + elif len(rule) == level+1: + # end of the rule is here: add this state as a final + self.final_states[rule[level-1][1]] = rule[level] + else: + if rule[level] != prev_cat: + # rule patterns diverged: add new state + state += 1 + # add transition + self.transitions[(rule[level-1][1], rule[level])] = state + prev_cat = rule[level] + # add current state to current pattern element + rule[level] = (rule[level], state) + # change prev_cat to empty at the end of rules list + # to ensure state is changed at the start of next run through + prev_cat = '' + + def get_lrlm(self, line, cat_dict): + """ + Build all lrlm coverages for line. + + """ + # tokenize line and get all possible categories for each token + line = get_cats_by_line(line, cat_dict) + + # coverage and state lists are built dinamically + # each state from state_list is the state of FST + # at the end of corresponding coverage from coverage_list + coverage_list, state_list = [[]], [self.start_state] + + # go through all tokens in line + for token, cat_list in line: + new_coverage_list, new_state_list = [], [] + + # go through all cats for the token + for cat in cat_list: + + # try to continue each coverage obtained on the previous step + for coverage, state in zip(coverage_list, state_list): + + # first, check if we can go further along current pattern + if (state, cat) in self.transitions: + # current pattern can be made longer: add one more token + new_coverage_list.append(coverage + [('w', token)]) + new_state_list.append(self.transitions[(state, cat)]) + + # if not, check if we can finalize current pattern + elif state in self.final_states: + # current state is one of the final states: close previous pattern + new_coverage = coverage + [('r', self.final_states[state])] + + if (self.start_state, cat) in self.transitions: + # can start new pattern + new_coverage_list.append(new_coverage + [('w', token)]) + new_state_list.append(self.transitions[(self.start_state, cat)]) + elif '*' in token: + # can not start new pattern because of an unknown word + new_coverage_list.append(new_coverage + [('w', token), ('r', 'unknown')]) + new_state_list.append(self.start_state) + + # if not, check if it is just an unknown word + elif state == self.start_state and '*' in token: + # unknown word at start state: add it to pattern, start new + new_coverage_list.append(coverage + [('w', token), ('r', 'unknown')]) + new_state_list.append(self.start_state) + + # if nothing worked, just discard this coverage + + coverage_list, state_list = new_coverage_list, new_state_list + + # finalize 
coverages + new_coverage_list = [] + for coverage, state in zip(coverage_list, state_list): + if state in self.final_states: + # current state is one of the final states: close the last pattern + new_coverage_list.append(coverage + [('r', self.final_states[state])]) + elif coverage != [] and coverage[-1][0] == 'r': + # the last pattern is already closed + new_coverage_list.append(coverage) + # if nothing worked, just discard this coverage as incomplete + + if new_coverage_list == []: + # no coverages detected: no need to go further + return [] + + # convert coverage representation: + # [('r'/'w', rule_number/token), ...] -> [([token, token, ... ], rule_number), ...] + formatted_coverage_list = [] + for coverage in new_coverage_list: + pattern, formatted_coverage = [], [] + for element in coverage: + if element[0] == 'w': + pattern.append(element[1]) + else: + formatted_coverage.append((pattern, element[1])) + pattern = [] + formatted_coverage_list.append(formatted_coverage) + + # now we filter out some not-lrlm coverages + # that still got into + + # sort coverages by signature, which is a tuple + # of coverage part lengths + formatted_coverage_list.sort(key=signature, reverse=True) + signature_max = signature(formatted_coverage_list[0]) + + # keep only those with top signature + # they would be the LRLM ones + LRLM_list = [] + for coverage in formatted_coverage_list: + if signature(coverage) == signature_max: + # keep adding + LRLM_list.append(coverage) + else: + # no need to look further, others will be worse + return LRLM_list + return LRLM_list + +def signature(coverage): + """ + Get coverage signature which is just a tuple + of lengths of groups comprising the coverage. + """ + return tuple([len(group[0]) for group in coverage]) + +if __name__ == "__main__": + cat_dict, rules, ambiguous_rules, rule_id_map, rule_xmls = prepare(sys.argv[1]) + pattern_FST = FST(rules) + + coverages = pattern_FST.get_lrlm('^prpers$ ^want# to$ ^wait$ ^until$ ^prpers$ ^can$ ^offer$ ^what$ ^would$ ^be$ ^totally$ ^satisfy$ ^for$ ^consumer$^.$', cat_dict) + + print('Coverages detected:') + for coverage in coverages: + print(coverage) Property changes on: branches/weighted-transfer/apertium-weights-learner/tools/coverage.py ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Index: branches/weighted-transfer/apertium-weights-learner/tools/pipelines.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/tools/pipelines.py (nonexistent) +++ branches/weighted-transfer/apertium-weights-learner/tools/pipelines.py (revision 72237) @@ -0,0 +1,153 @@ +import sys, re +from subprocess import Popen, PIPE + +# apertium special symbols for removal +apertium_re = re.compile(r'[@#~*]') + +class partialTranslator(): + """ + Wrapper for part of Apertium pipeline + going from bidix lookup to the generation. + """ + def __init__(self, tixfname, binfname): + """ + On initialization, partial Apertium pipeline + is invoked with '-z' option (null flush) + and remains active waiting for input. 
+ """ + self.autobil = Popen(['lt-proc', '-b', '-z', + binfname + '.autobil.bin' + ], + stdin = PIPE, stdout = PIPE) + self.transfer = Popen(['apertium-transfer', '-b', '-z', + tixfname + '.t1x', + binfname + '.t1x.bin' + ], + stdin = self.autobil.stdout, stdout = PIPE) + self.interchunk = Popen(['apertium-interchunk', '-z', + tixfname + '.t2x', + binfname + '.t2x.bin' + ], + stdin = self.transfer.stdout, stdout = PIPE) + self.postchunk = Popen(['apertium-postchunk', '-z', + tixfname + '.t2x', + binfname + '.t2x.bin' + ], + stdin = self.interchunk.stdout, stdout = PIPE) + self.autogen = Popen(['lt-proc', '-g', '-z', + binfname + '.autogen.bin' + ], + stdin = self.postchunk.stdout, stdout = PIPE) + + def translate(self, string): + """ + Convert input string to bytes, + send it to the pipeline, + return the result converted to utf-8. + """ + string = string.strip() + '[][\n]' + + if type(string) == type(''): + bstring = bytes(string, 'utf-8') + else: + bstring = string + + self.autobil.stdin.write(bstring) + self.autobil.stdin.write(b'\0') + self.autobil.stdin.flush() + + char = self.autogen.stdout.read(1) + output = [] + while char and char != b'\0': + output.append(char) + char = self.autogen.stdout.read(1) + + return apertium_re.sub('', (b''.join(output)).decode('utf-8').replace('[][\n]','')) + +class weightedPartialTranslator(): + """ + Wrapper for part of Apertium pipeline + going from bidix lookup to the generation. + It is missing 1st-stage transfer at init, + because transfer is invoked at translation + with provided weights file. + """ + def __init__(self, tixfname, binfname): + """ + On initialization, fragments of Apertium pipeline + are invoked with '-z' option (null flush) + and remain active waiting for input. + """ + self.tixfname = tixfname + self.binfname = binfname + + self.autobil = Popen(['lt-proc', '-b', '-z', + binfname + '.autobil.bin' + ], + stdin = PIPE, stdout = PIPE) + + # transfer is missing here + # it is invoked during translation + # using provided transfer weights file + + self.interchunk = Popen(['apertium-interchunk', '-z', + tixfname + '.t2x', + binfname + '.t2x.bin' + ], + stdin = PIPE, stdout = PIPE) + self.postchunk = Popen(['apertium-postchunk', '-z', + tixfname + '.t2x', + binfname + '.t2x.bin' + ], + stdin = self.interchunk.stdout, stdout = PIPE) + self.autogen = Popen(['lt-proc', '-g', '-z', + binfname + '.autogen.bin' + ], + stdin = self.postchunk.stdout, stdout = PIPE) + + def translate(self, string, wixfname): + """ + Convert input string to bytes, + send it to the pipeline, + return the result converted to utf-8. 
+ """ + string = string.strip() + '[][\n]' + + if type(string) == type(''): + bstring = bytes(string, 'utf-8') + else: + bstring = string + + # start going through null flush pipeline + self.autobil.stdin.write(bstring) + self.autobil.stdin.write(b'\0') + self.autobil.stdin.flush() + + char = self.autobil.stdout.read(1) + autobil_output = [] + while char and char != b'\0': + autobil_output.append(char) + char = self.autobil.stdout.read(1) + + # make weighted transfer + transfer = Popen(['apertium-transfer', '-bw', + wixfname, + self.tixfname + '.t1x', + self.binfname + '.t1x.bin' + ], + stdin = PIPE, stdout = PIPE) + + transfer_output, err = transfer.communicate(b''.join(autobil_output)) + + # resume going through null flush pipeline + self.interchunk.stdin.write(transfer_output) + self.interchunk.stdin.write(b'\0') + self.interchunk.stdin.flush() + + char = self.autogen.stdout.read(1) + autogen_output = [] + while char and char != b'\0': + autogen_output.append(char) + char = self.autogen.stdout.read(1) + + return apertium_re.sub('', (b''.join(autogen_output)).decode('utf-8').replace('[][\n]','')) Index: branches/weighted-transfer/apertium-weights-learner/tools/prune.py =================================================================== --- branches/weighted-transfer/apertium-weights-learner/tools/prune.py (revision 72236) +++ branches/weighted-transfer/apertium-weights-learner/tools/prune.py (revision 72237) @@ -17,7 +17,7 @@ usage_line = 'Usage: ./prune.py INPUT_FILE [OUTPUT_FILE]' -def prune_transfer_weights(ifname, ofname): +def prune_xml_transfer_weights(using_lxml, ifname, ofname=None): """ Prune the transfer weights file provided in ifname. @@ -38,8 +38,11 @@ except etree.ParseError: print('Error parsing rules file \'{}\'. ' 'Is there something wrong with it?'.format(opts.rfname)) - sys.exit(1) + return None + if ofname is None: + ofname = ifname.rsplit('.', maxsplit=1)[0] + '-prunned.w1x' + # create (empty) output xml tree oroot = etree.Element('transfer-weights') # go through rule-groups @@ -88,6 +91,8 @@ else: etree.ElementTree(oroot).write(ofname, encoding='utf-8', xml_declaration=True) + return ofname + def xml_pattern_to_str(et_pattern): """ Convert xml pattern item into pattern string. 
@@ -114,8 +119,6 @@
         sys.exit(1)
 
     if len(sys.argv) == 2:
-        ofname = ifname.rsplit('.', maxsplit=1)[0] + '-prunned.w1x'
+        prune_xml_transfer_weights(using_lxml, ifname)
     else:
-        ofname = sys.argv[2]
-
-    prune_transfer_weights(ifname, ofname)
+        prune_xml_transfer_weights(using_lxml, ifname, sys.argv[2])
Index: branches/weighted-transfer/apertium-weights-learner/twlearner.py
===================================================================
--- branches/weighted-transfer/apertium-weights-learner/twlearner.py (revision 72236)
+++ branches/weighted-transfer/apertium-weights-learner/twlearner.py (revision 72237)
@@ -8,10 +8,11 @@
 # simple config in python file
 import twlconfig
 # module for coverage calculation
-import coverage
+from tools import coverage
 # apertium translator pipelines
-from pipelines import partialTranslator, weightedPartialTranslator
+from tools.pipelines import partialTranslator, weightedPartialTranslator
 from tools.simpletok import normalize
+from tools.prune import prune_xml_transfer_weights
 
 try: # see if lxml is installed
     from lxml import etree
@@ -191,6 +192,10 @@
         gc.collect()
         lbtime = clock()
 
+    # clean up temporary weights file
+    if os.path.exists(tmpweights_fname):
+        os.remove(tmpweights_fname)
+
     print('Done in {:.2f}'.format(clock() - btime))
 
     return ofname
@@ -303,7 +308,7 @@
         et_rule.attrib['id'] = rule_map[rule_number]
     return et_rule
 
-def make_xml_rules(scores_fname, prefix, rule_map, rule_xmls):
+def make_xml_transfer_weights(scores_fname, prefix, rule_map, rule_xmls):
     """
     Sum up the weights for each rule-pattern pair,
     add the result to xml weights file.
@@ -398,11 +403,11 @@
     # estimate rule weights for each ambiguous chunk
     scores_fname = score_sentences(ambig_sentences_fname, model, prefix)
 
-    # sum up weigths for rule-pattern and make final xml
-    make_xml_rules(scores_fname, prefix, rule_id_map, rule_xmls)
+    # sum up weights for rule-pattern and make unpruned xml
+    weights_fname = make_xml_transfer_weights(scores_fname, prefix,
+                                              rule_id_map, rule_xmls)
 
-    # clean up temporary weights filem
-    if os.path.exists(tmpweights_fname):
-        os.remove(tmpweights_fname)
+    # prune weights file
+    prunned_fname = prune_xml_transfer_weights(using_lxml, weights_fname)
 
     print('Performed in {:.2f}'.format(clock() - tbtime))
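For quick reference, here is a minimal usage sketch of the modules as relocated by this revision. It is a sketch only: it mirrors the __main__ blocks and the new twlearner.py imports above, and it assumes an en-es pair built at ../apertium-en-es and a test file at tests/testfile.txt; the weights file name 'learned.w1x' is a placeholder, not part of this commit.

    # Illustrative sketch; paths and names not taken from the diff above are placeholders.
    from tools import coverage
    from tools.pipelines import weightedPartialTranslator
    from tools.prune import prune_xml_transfer_weights

    # build the pattern FST from the first-stage transfer rules
    cat_dict, rules, ambig_rules, rule_id_map, rule_xmls = coverage.prepare(
        '../apertium-en-es/apertium-en-es.en-es.t1x')
    pattern_FST = coverage.FST(rules)

    # list LRLM coverages for one line of Apertium tokens
    for cov in pattern_FST.get_lrlm('^prpers$ ^want# to$ ^wait$', cat_dict):
        print(cov)

    # translate lines through the partial pipeline, supplying a weights file
    translator = weightedPartialTranslator('../apertium-en-es/apertium-en-es.en-es',
                                           '../apertium-en-es/en-es')
    with open('tests/testfile.txt', 'r', encoding='utf-8') as ifile:
        for line in ifile:
            print(translator.translate(line, 'learned.w1x'))

    # prune the weights file; returns the pruned file name, or None on a parse error
    # (the first argument mirrors the using_lxml flag passed by twlearner.py)
    pruned_fname = prune_xml_transfer_weights(True, 'learned.w1x')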