Index: branches/apertium-tagger/experiments/evaluate_tagger.py
===================================================================
--- branches/apertium-tagger/experiments/evaluate_tagger.py	(nonexistent)
+++ branches/apertium-tagger/experiments/evaluate_tagger.py	(revision 68351)
@@ -0,0 +1,468 @@
+#!/usr/bin/env python3
+
+import sys
+
+skipUnknown = True
+testFunc = False
+
+# src: ^власти/власть/власть/власть/власть/власть$
+# ref: ^власти/власть<@P←>$
+# tst: ^власти/власть$
+
+
+def readings(w, testFunc):
+    readings = []
+    removed_readings = []
+    reading = ''
+    seen = False
+    for c in w:
+        if c == '/' and seen == False:
+            seen = True
+            continue
+        elif (c == '/' or c == '$') and seen:
+            if len(reading) < 1:
+                print('Error: ', w, file=sys.stderr)
+                continue  # skip the empty reading rather than crash on reading[0]
+            if reading[0] == '¬':
+                removed_readings.append(reading)
+            else:
+                if testFunc:
+                    readings.append(
+                        reading_lemma(reading) + reading_msd(reading) + reading_func(reading))
+                else:
+                    readings.append(
+                        reading_lemma(reading) + reading_msd(reading))
+            reading = ''
+            continue
+        if seen:
+            reading = reading + c
+    return (readings, removed_readings)
+
+
+def clean(s):
+    o = s.replace('¹', '').replace('²', '').replace('³', '').replace('⁻', '')
+    return o
+
+
+def reading_lemma(r):
+    r = clean(r)
+    return r.split('<')[0]
+
+
+def reading_pos(r):
+    if r.count('<') < 1:
+        return ''
+    return '<' + r.split('<')[1].split('>')[0] + '>'
+
+
+def reading_msd(r):
+    msd = ''
+    seen = False
+    tag = ''
+    for c in r:
+        if c == '<':
+            seen = True
+        if c == '>':
+            tag = tag + c
+            if tag.count(':') > 0 or len(tag) < 2:  # {
+                continue
+            elif tag[1] == '@':
+                continue
+            else:
+                msd = msd + tag
+            tag = ''
+            continue
+        if seen:
+            tag = tag + c
+    return msd
+
+
+def reading_func(r):
+    func = ''
+    seen = False
+    for c in r:
+        if c == '@':
+            seen = True
+        if c == '>':
+            seen = False
+        if seen:
+            func = func + c
+    func = '<' + func + '>'
+    return func.replace('<>', '')
+
+
+def readings_rules(readings):
+    rules = set()
+    readings_rules = {}
+
+    for r in readings:
+        reading = ''
+        seen = False
+        tag = ''
+        first = True
+        for c in r:
+            if c == '<' and first == False:
+                seen = True
+            elif c == '<' and first == True:
+                seen = True
+                first = False
+            if c == '+':
+                first = True
+            if c == '¬':
+                continue
+            if c == '>':
+                tag = tag + c
+                if tag.count(':'):
+                    rules.add(tag)
+                else:
+                    reading = reading + tag
+                tag = ''
+                seen = False
+            if seen and not first:
+                tag = tag + c
+            elif first:
+                reading = reading + c
+        if reading not in readings_rules:
+            readings_rules[reading] = []
+        readings_rules[reading] = list(rules)
+    return (rules, readings_rules)
+
+
+class TaggerEvaluator:
+    def __init__(self, src_fn, ref_fn, tst_fn):
+        self.src_f = open(src_fn)
+        self.ref_f = open(ref_fn)
+        self.tst_f = open(tst_fn)
+
+        # Sanity check: all three files must be line-aligned
+
+        src_l = len(self.src_f.readlines())
+        ref_l = len(self.ref_f.readlines())
+        tst_l = len(self.tst_f.readlines())
+
+        self.lines = -1
+
+        if not (src_l == ref_l == tst_l):
+            print(src_l, ref_l, tst_l, file=sys.stderr)
+        else:
+            self.lines = src_l
+
+        self.src_f.seek(0)
+        self.ref_f.seek(0)
+        self.tst_f.seek(0)
+
+    def run_analysis(self):
+        # feiler is module-level so that print_analyses (below) can report the
+        # line numbers on which each rule misfired
+        global feiler
+
+        self.n_tokens = 0
+        self.n_unknown = 0
+        n_line = 0
+        # tp tn fp fn
+        self.rules = {}  # rules['SELECT:462'] = (0, 0, 0, 0)
+
+        applic = {}  # applic['SELECT:462'] = 0
+
+        feiler = {}  # feiler['SELECT:462'] = [13, 45, 100]
+
+        self.n_truepositive = 0
+        self.n_truenegative = 0
+        self.n_falsepositive = 0
+        self.n_falsenegative = 0
+
+        self.n_ref_readings = 0
+        self.n_src_readings = 0
+        self.n_tst_readings = 0
+
+        self.n_tst_lema_correct = 0
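+        # Counters below: "tst" rows score the tagger output under test, while
+        # "bas" rows score a baseline that always keeps the first analysis of
+        # the ambiguous source (see the comparisons later in this method).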
self.n_tst_pos_correct = 0 + self.n_tst_lemapos_correct = 0 + self.n_tst_msd_correct = 0 + self.n_tst_lemamsd_correct = 0 + self.n_tst_func_correct = 0 + + self.n_bas_lema_correct = 0 + self.n_bas_pos_correct = 0 + self.n_bas_lemapos_correct = 0 + self.n_bas_msd_correct = 0 + self.n_bas_lemamsd_correct = 0 + self.n_bas_func_correct = 0 + + for line in range(0, self.lines): # { + n_line = n_line + 1 + src_w = self.src_f.readline() + ref_w = self.ref_f.readline() + tst_w = self.tst_f.readline() + + #print('src_w', src_w); + #print('ref_w', ref_w); + #print('tst_w', tst_w); + + if src_w.count('¶') > 0: # { + continue + #} + + self.n_tokens = self.n_tokens + 1 + + tst_readings = [] + tst_lema = '' + tst_pos = '' + tst_func = '' + tst_msd = '' + src_readings = [] + src_lema = '' + src_pos = '' + src_func = '' + src_msd = '' + ref_readings = [] + ref_lema = '' + ref_pos = '' + ref_func = '' + ref_msd = '' + + if tst_w.count('/*') < 1 and tst_w[0] == '^': # { + tst_readings, tst_removed = readings(tst_w, testFunc) + tst_lema = reading_lemma(tst_readings[0]) + tst_pos = reading_pos(tst_readings[0]) + tst_func = reading_func(tst_readings[0]) + tst_msd = reading_msd(tst_readings[0]) + + src_readings, src_removed = readings(src_w, testFunc) + src_lema = reading_lemma(src_readings[0]) + src_pos = reading_pos(src_readings[0]) + src_func = reading_func(src_readings[0]) + src_msd = reading_msd(src_readings[0]) + + self.n_src_readings = self.n_src_readings + len(src_readings) + self.n_tst_readings = self.n_tst_readings + len(tst_readings) + #} + + if ref_w.count('/*') < 1 and ref_w[0] == '^': # { + ref_readings, ref_removed = readings(ref_w, testFunc) + ref_lema = reading_lemma(ref_readings[0]) + ref_pos = reading_pos(ref_readings[0]) + ref_func = reading_func(ref_readings[0]) + ref_msd = reading_msd(ref_readings[0]) + #} + + if tst_w.count('/*') > 0 and skipUnknown == True: # { + print('*\t', ref_lema, ref_msd) + self.n_unknown = self.n_unknown + 1 + continue + #} + + self.n_ref_readings = self.n_ref_readings + 1 + + ####################################################################### + + tst_rules, tst_readings_rules = readings_rules(tst_readings + tst_removed) + #print('READINGS:', tst_readings); + #print('RULES_READINGS:', tst_readings_rules); + for rule in list(tst_rules): # { + if rule not in self.rules: # { + self.rules[rule] = (0, 0, 0, 0) + #} + #print('RULES:', rule, rules[rule]); + #} + + for tst_reading in tst_readings: # { + if tst_reading not in ref_readings: # { + self.n_falsepositive = self.n_falsepositive + 1 + for rule in tst_readings_rules[tst_reading]: # { + (tp, tn, fp, fn) = self.rules[rule] + fp = fp + 1 + self.rules[rule] = (tp, tn, fp, fn) + #} + else: # { + self.n_truepositive = self.n_truepositive + 1 + for rule in tst_readings_rules[tst_reading]: # { + (tp, tn, fp, fn) = self.rules[rule] + tp = tp + 1 + self.rules[rule] = (tp, tn, fp, fn) + #} + #} + #} + + for ref_reading in ref_readings: # { + if ref_reading not in tst_readings: # { + print('[' + str(n_line) + '] FALSENEG:', ref_reading, tst_readings) + self.n_falsenegative = self.n_falsenegative + 1 + if ref_reading not in tst_readings_rules: # { + continue + #} + for rule in tst_readings_rules[ref_reading]: # { + (tp, tn, fp, fn) = self.rules[rule] + fn = fn + 1 + self.rules[rule] = (tp, tn, fp, fn) + if rule not in feiler: # { + feiler[rule] = [] + #} + feiler[rule].append(n_line) + #} + #} + #} + + for src_reading in src_readings: # { + # { + if src_reading not in ref_readings and src_reading not in tst_readings: + 
self.n_truenegative = self.n_truenegative + 1 + if src_reading not in tst_readings_rules: # { + continue + #} + for rule in tst_readings_rules[src_reading]: # { + (tp, tn, fp, fn) = self.rules[rule] + tn = tn + 1 + self.rules[rule] = (tp, tn, fp, fn) + #} + #} + #} + + ####################################################################### + + if tst_lema == ref_lema and tst_msd == ref_msd: # { + print('=\t', tst_lema, tst_msd) + else: # { + #print('ref:', ref_readings, file=sys.stderr); + print('-\t', ref_lema, ref_msd, src_readings) + #print('tst:', tst_readings, file=sys.stderr); + print('+\t', tst_lema, tst_msd, tst_readings) + #} + + # { + if ref_lema + ref_msd not in tst_readings and ref_lema + ref_msd in src_readings: + print('!\t', ref_lema + ref_msd, tst_readings) + #} + + if src_lema == ref_lema: + self.n_bas_lema_correct = self.n_bas_lema_correct + 1 + if src_lema == ref_lema and src_pos == ref_pos: + self.n_bas_lemapos_correct = self.n_bas_lemapos_correct + 1 + if src_lema == ref_lema and src_msd == ref_msd: + self.n_bas_lemamsd_correct = self.n_bas_lemamsd_correct + 1 + if src_pos == ref_pos: + self.n_bas_pos_correct = self.n_bas_pos_correct + 1 + if src_msd == ref_msd: + self.n_bas_msd_correct = self.n_bas_msd_correct + 1 + + if tst_lema == ref_lema: + self.n_tst_lema_correct = self.n_tst_lema_correct + 1 + if tst_lema == ref_lema and tst_pos == ref_pos: + self.n_tst_lemapos_correct = self.n_tst_lemapos_correct + 1 + if tst_lema == ref_lema and tst_msd == ref_msd: + self.n_tst_lemamsd_correct = self.n_tst_lemamsd_correct + 1 + if tst_pos == ref_pos: + self.n_tst_pos_correct = self.n_tst_pos_correct + 1 + if tst_msd == ref_msd: + self.n_tst_msd_correct = self.n_tst_msd_correct + 1 + if tst_func == ref_func and ref_func != '': + self.n_tst_func_correct = self.n_tst_func_correct + 1 + + @property + def precision(self): + return float(self.n_truepositive) / (float(self.n_truepositive + self.n_falsepositive)) + + @property + def recall(self): + return float(self.n_truepositive) / (float(self.n_truepositive + self.n_falsenegative)) + + @property + def accuracy(self): + return float(self.n_truepositive + self.n_truenegative) / \ + (float(self.n_truepositive + self.n_falsenegative + + self.n_truenegative + self.n_falsepositive)) + + def print_analyses(self): + # print(""); + #} + + # Accuracy = number of correct analyses / number of analyses in ref; + # False positives + + # Lemma accuracy + # POS accuracy + # MSD accuracy + # Func accuracy + + print('') + + print('unknown :\t', self.n_unknown, + '(', (float(self.n_unknown) / float(self.n_ref_readings)) * 100.0, ')') + + print('') + + print('truepos :\t', self.n_truepositive) + print('trueneg :\t', self.n_truenegative) + print('falsepos :\t', self.n_falsepositive) + print('falseneg :\t', self.n_falsenegative) + + print('') + + print('precision:\t', self.precision, '\t( true pos / all pos )') + print('recall :\t', self.recall, '\t( true pos / (true pos + false neg) )') + print('accuracy :\t', self.accuracy, '\t((true pos + true neg) / (everything) )') + + print('') + + src_ambig_rate = float(self.n_src_readings) / float(self.n_ref_readings) + tst_ambig_rate = float(self.n_tst_readings) / float(self.n_ref_readings) + print('tokens :\t', self.n_tokens) + print('src_ambig:\t', src_ambig_rate) + print('tst_ambig:\t', tst_ambig_rate) + print('resolved :\t %.2f%%' % + (100.0 - (tst_ambig_rate / src_ambig_rate * 100.0))) + + print('') + + p_bas_lema_correct = float(self.n_bas_lema_correct) / float(self.n_ref_readings) * 100.0 + 
p_bas_pos_correct = float(self.n_bas_pos_correct) / float(self.n_ref_readings) * 100.0 + p_bas_lemapos_correct = float( + self.n_bas_lemapos_correct) / float(self.n_ref_readings) * 100.0 + p_bas_msd_correct = float(self.n_bas_msd_correct) / float(self.n_ref_readings) * 100.0 + p_bas_lemamsd_correct = float( + self.n_bas_lemamsd_correct) / float(self.n_ref_readings) * 100.0 + p_bas_func_correct = float(self.n_bas_func_correct) / float(self.n_ref_readings) * 100.0 + + print('lem :\t', p_bas_lema_correct) + #print('pos :\t',p_bas_pos_correct); + print('lem+pos :\t', p_bas_lemapos_correct) + #print('msd :\t',p_bas_msd_correct); + print('lem+msd :\t', p_bas_lemamsd_correct) + print('func :\t', p_bas_func_correct) + + print('') + + p_tst_lema_correct = float(self.n_tst_lema_correct) / float(self.n_ref_readings) * 100.0 + p_tst_pos_correct = float(self.n_tst_pos_correct) / float(self.n_ref_readings) * 100.0 + p_tst_lemapos_correct = float( + self.n_tst_lemapos_correct) / float(self.n_ref_readings) * 100.0 + p_tst_msd_correct = float(self.n_tst_msd_correct) / float(self.n_ref_readings) * 100.0 + p_tst_lemamsd_correct = float( + self.n_tst_lemamsd_correct) / float(self.n_ref_readings) * 100.0 + p_tst_func_correct = float(self.n_tst_func_correct) / float(self.n_ref_readings) * 100.0 + + print('lem :\t', p_tst_lema_correct, + '(', p_tst_lema_correct - p_bas_lema_correct, ')') + #print('pos :\t',p_tst_pos_correct, '(', p_tst_pos_correct-p_bas_pos_correct, ')'); + print('lem+pos :\t', p_tst_lemapos_correct, + '(', p_tst_lemapos_correct - p_bas_lemapos_correct, ')') + #print('msd :\t',p_tst_msd_correct, '(', p_tst_msd_correct-p_bas_msd_correct, ')'); + print('lem+msd :\t', p_tst_lemamsd_correct, + '(', p_tst_lemamsd_correct - p_bas_lemamsd_correct, ')') + print('func :\t', p_tst_func_correct, + '(', p_tst_func_correct - p_bas_func_correct, ')') + + rkeys = list(self.rules.keys()) + rkeys.sort() + print('') + print('Rule No.\tTP\tTN\tFP\tFN') + for rule in rkeys: + print('%s\t%d\t%d\t%d\t%d' % + (rule, self.rules[rule][0], self.rules[rule][1], self.rules[rule][2], self.rules[rule][3])) + print('') + for rule in rkeys: + if rule in feiler: + print(rule, '\t', feiler[rule]) + +def main(): + te = TaggerEvaluator(sys.argv[1], sys.argv[2], sys.argv[3]) + te.run_analysis() + te.print_analyses() + +if __name__ == '__main__': + main() Index: branches/apertium-tagger/experiments/run_experiment.py =================================================================== --- branches/apertium-tagger/experiments/run_experiment.py (nonexistent) +++ branches/apertium-tagger/experiments/run_experiment.py (revision 68351) @@ -0,0 +1,666 @@ +import statistics +import sys +from os import mkdir +from os.path import isdir, join as pjoin, exists as pexists +import subprocess +from subprocess import PIPE +import functools +import itertools +import aitertools +import argparse +import tabulate +import csv +import asyncio +from asyncio.subprocess import create_subprocess_exec +from collections import namedtuple +from contextlib import contextmanager +import os +from pprint import pprint + +from evaluate_tagger import TaggerEvaluator +from split_corpus_n import main as split_corpus_n + +loop = asyncio.get_event_loop() + +@contextmanager +def cd(newdir): + prevdir = os.getcwd() + os.chdir(os.path.expanduser(newdir)) + try: + yield + finally: + os.chdir(prevdir) + + +TMPDIR = 'experimenttmp' + +DEFAULT_TEXTS = { + 'cat': ['texts/miscellaneous.tagged.txt'], + 'spa': ['texts/miscellaneous.tagged.txt'], + 'hbs': 
['hbs-tagger-data/hbs.tagged.txt'], + 'rus': ['texts/son-smešnogo-čeloveka.ana.txt'], + 'kaz': ['eval/ref.1000.txt'], + 'por': [ + 'texts/bering.txt', + 'texts/cultura.txt', + 'texts/beringia.txt', + 'texts/raio.txt', + 'texts/música.txt', + 'texts/água.txt', + 'texts/akatsuki.txt', + ], +} + +DEFAULT_LANGUAGES = ['cat', 'spa', 'hbs', 'rus', 'kaz', 'por', 'swe'] + +def comma_list(s): + if s == '()': + return [] + return s.split(',') + +def comma_colon_dict(s): + d = {} + for bit in s.split(','): + pair = bit.split(':') + d[pair[0]] = pair[1].split(',') + return d + +def parse_args(): + parser = argparse.ArgumentParser(description="Runs a series of experiments on different part of speech taggers and different language data.") + parser.add_argument('languagesdir', help="Path to the directory containing all the individaul language data directories") + parser.add_argument('--languages', help="Only run experiments for these languages, comma separated", default=DEFAULT_LANGUAGES, type=comma_list) + parser.add_argument('--taggers', help="Only run experiments with these taggers, comma separated", default=DEFAULT_TAGGERS, type=comma_list) + parser.add_argument('--language-texts', help="Use different texts per language, coma seperated colon pairs", default=DEFAULT_TEXTS, type=comma_colon_dict) + parser.add_argument('--folds', help="Use x-fold validation instead of 10-fold", default=10, type=int) + parser.add_argument('--reuse', help="Reuse preprocesed dictionary and corpa from previous run", action='store_true') + + return parser.parse_args() + +def read1k_chunker(f): + def read1k(): + return f.read(1024) + return iter(read1k, '') + +def filter(func=None, iter_filter=False, input_chunker=None, output_separator=''): + if func is None: + def defer(func=None): + return filter(func, iter_filter=iter_filter, input_chunker=input_chunker, output_separator=output_separator) + return defer + + def generator(input_iter): + for line in input_iter: + filtered = func(line) + if filtered is not None: + yield filtered + output_separator + + @functools.wraps(func) + def wrapper(input, output=None): + input_file = None + if isinstance(input, str): + input_iter = input_file = open(input) + if input_chunker: + input_iter = input_chunker(input_file) + else: + input_iter = input_file.readlines() + else: + input_iter = input + if iter_filter: + gen = func(input_iter) + else: + gen = generator(input_iter) + if output is None: + return gen + output_file = open(output, 'w') + for line in gen: + output_file.write(line) + if input_file is not None: + input_file.close() + output_file.close() + return wrapper + + +class MapFilter: + def __init__(self, aiterable, pred=None, tran=None): + self.aiterable = aiterable + self.pred = pred + self.tran = tran + + async def __aiter__(self): + return self + + async def __anext__(self): + while True: + payload = await self.aiterable.__anext__() + if self.pred is None or self.pred(payload): + if self.tran is None: + return payload + else: + return self.tran(payload) + + +async def dir_in(input_fn, proc): + input_file = open(input_fn) + while 1: + b = input_file.read(1024) + if not len(b): + return + await proc.stdin.write(b) + + +async def dir_out(proc, output_fn): + output_file = open(output_fn, 'w') + while 1: + b = await in_proc.read(1024) + if not len(b): + return + output_file.write(b) + + +class Tee(MapFilter): + def __init__(self, aiterable, log_file): + self.log_file = open(log_file, 'wb') + super().__init__(aiterable, tran=self.tran) + + def tran(self, bit): + 
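+        # tee: log every chunk flowing through the pipe, then pass it on unchanged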
self.log_file.write(bit) + return bit + + +async def pipe(in_proc, out_proc): + while 1: + b = await in_proc.stdout.read(16384) + if b == b'': + return + out_proc.stdin.write(b) + #await out_proc.stdin.drain() + +#async def readlines(in_proc): +# while 1: +# b = await in_proc.stdout.readline() +# if not len(b): +# return +# b + + +async def writeiter(iter, out_proc): + i = 0 + async for block in iter: + if (i % 10000) == 0: + print(".", end="", flush=True) + out_proc.stdin.write(block) + #await out_proc.stdin.drain() + i += 1 + print("writeiter done") + out_proc.stdin.write_eof() + +def proc_filter(func=None, output_chunker=None): + if func is None: + def defer(func): + return proc_filter(func, output_chunker=output_chunker) + return defer + + @functools.wraps(func) + def wrapper(*args, **kwargs): + input = kwargs.pop('input', None) + output = kwargs.pop('output', None) + + if callable(func): + cmd = func(*args, **kwargs) + else: + cmd = func + + kwargs = {} + if isinstance(input, str): + kwargs['stdin'] = open(input) + else: + kwargs['stdin'] = subprocess.PIPE + if output: + kwargs['stdout'] = open(output, 'w') + else: + kwargs['stdout'] = subprocess.PIPE + print("RUNNING: ", ' '.join(cmd)) + proc = subprocess.Popen(cmd, universal_newlines=True, **kwargs) + print(type(input)) + if not isinstance(input, str) and input is not None: + for line in input: + if 'apertium-destxt' in cmd: + print('des', line) + #if not isinstance(input, bytes): + #line = line.encode('utf8') + #print(proc.stdin) + proc.stdin.write(line) + if not output: + if output_chunker: + return output_chunker(proc.stdout) + else: + return proc.stdout.readlines() + stdout, stderr = proc.communicate() + retcode = proc.poll() + return subprocess.CompletedProcess(proc.args, retcode, stdout, stderr) + + return wrapper + + +@filter +def strip_blanks_(line): + if line != '\n': + return line + + +@filter(output_separator='\n') +def extract_words(line): + return line.split('^')[1].split('/')[0] + + +@filter(output_separator='\n') +def extract_first_analysis(line): + return '/'.join(line.split('/')[0:2]).strip().rstrip('$') + '$' + +strip_blanks = functools.partial(MapFilter, pred=lambda line: line != '\n') + + +@filter +def passthrough(line): + return line + +def prun(*args, **kwargs): + for key in ('stdin', 'stdout', 'stderr'): + if key not in kwargs: + kwargs[key] = subprocess.PIPE + print('prun', args, kwargs) + return subprocess.run(*args, **kwargs) + + +def check_run(cmd, *args, **kwargs): + kwargs['check'] = True + print("RUNNING: ", ' '.join(cmd)) + return subprocess.run(cmd, *args, **kwargs) + + +@proc_filter +def lt_proc(morphology_fn, dictcase=False): + cmd = ['lt-proc', morphology_fn] + if dictcase: + cmd.insert(1, '--dictionary-case') + return cmd + + +def insert_model(cmd, model, tagging=False): + if model == 'bigram': + pass + elif model.startswith('unigram'): + cmd.insert(2, '-u') + cmd.insert(3, model[7:]) + elif model == 'lwsw': + cmd.insert(2, '--sliding-window') + return cmd + + + +@proc_filter +def tagger_train_sup(model_type, model_fn, train_fn, trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0): + cmd = ['apertium-tagger', '--supervised={}'.format(iterations), model_fn] + if not all((trainsrc_fn, dic_fn, tsx_fn)) and model_type != 'unigram': + raise ValueError("Optional arguments required for non-unigram models") + if model_type != 'unigram': + #apertium-tagger -s 0 /tmp/$DN.dic /tmp/spa.misc.$i.trainsrc $TSX /tmp/spa.misc.$i.prob /tmp/spa.misc.$i.train /tmp/spa.misc.$i.trainsrc + cmd[2:2] = [dic_fn, 
trainsrc_fn, tsx_fn] + cmd.append(trainsrc_fn) + insert_model(cmd, model_type) + cmd.insert(-1, train_fn) + return cmd + + +@proc_filter +def tagger_train_unsup(model_type, model_fn, trainsrc_fn, dic_fn, tsx_fn, iterations=0): + if model_fn == 'unigram': + raise ValueError("No unsupervised training for unigram models") + cmd = ['apertium-tagger', '--train={}'.format(iterations), dic_fn, trainsrc_fn, tsx_fn, model_fn] + insert_model(cmd, model_type) + return cmd + + +@proc_filter +def tagger_tag(model_type, model_fn): + cmd = ['apertium-tagger', '--tagger', '--show-superficial', model_fn] + insert_model(cmd, model_type, tagging=True) + return cmd + +filter_dix = functools.partial( + MapFilter, + pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line, + tran=lambda line: line.split(b":")[0] + b"\n") + +async def fix_dix(morphology_fn, tsx_fn, dix_fn, output_fn): + pipes = [] + + expand_proc = await create_subprocess_exec('lt-expand', dix_fn, stdout=PIPE) + filtered = filter_dix(expand_proc.stdout) + + extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿", "¡"] + for i, extra in enumerate(extras): + extras[i] = (extra + "\n").encode('utf-8') + with_extras = aitertools.chain(filtered, extras) + + lt_inpipe, destxt_outpipe = os.pipe() + destxt = await create_subprocess_exec('apertium-destxt', stdin=PIPE, stdout=destxt_outpipe) + os.close(destxt_outpipe) + + pipes.append(writeiter(with_extras, destxt)) + + filter_ambg_inpipe, lt_outpipe = os.pipe() + lt_proc = await create_subprocess_exec('lt-proc', morphology_fn, stdin=lt_inpipe, stdout=lt_outpipe) + os.close(lt_outpipe) + + filter_ambg = await create_subprocess_exec('apertium-filter-ambiguity', tsx_fn, stdin=filter_ambg_inpipe, stdout=open(output_fn, 'wb')) + pipes.append(filter_ambg.wait()) + return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes)) + + +#def lt_proc(morphology_fn, input_fn, output_fn=None, dictcase=False): + #cmd = ['lt-proc', morphology_fn] + #if dictcase: + #cmd.insert(1, '--dictionary-case') + #if isinstance(input_fn, str): + #cmd.append(input_fn) + #cmd.append(output_fn) + #return prun(cmd) + #else: + #kwargs = {'stdin': subprocess.PIPE} + #if output_fn: + ##kwargs['stdout'] = open(output_fn, 'w') + #with subprocess.Popen(cmd, **kwargs) as proc: + #if not isinstance(input_fn, str): + #for line in input_fn: + #if not isinstance(input_fn, bytes): + #line = line.encode('utf8') + #proc.stdin.write(line) + #stdout, stderr = proc.communicate() + #retcode = proc.poll() + #return subprocess.CompletedProcess(proc.args, retcode, stdout, stderr) + +@proc_filter +def cg_proc(grammar_fn, dictcase=True): + cmd = ['cg-proc', grammar_fn] + if dictcase: + cmd.insert(1, '-w') + return cmd + + +@filter(iter_filter=True) +def strip_unknown_sent(gen): + buff = [] + valid_sent = True + for line in gen: + if line.strip() == '': + if valid_sent: + for line in buff: + yield line + yield '\n' + buff = [] + valid_sent = True + else: + buff.append(line) + if '/*' in line: + valid_sent = False + +def split_n_r(corpus_fn, train_fn, ref_fn, n, r): + sentences = 0 + with open(corpus_fn) as corpus_file: + for line in corpus_file.readlines(): + if line.strip() == '': + sentences = sentences + 1 + + split_left = int(float(sentences) * r / n) + split_right = int(float(sentences) * (r + 1) / n) + + buffer = '' + index = 0 + + corpus_file.seek(0) + + with open(train_fn, 'w') as train_file, open(ref_fn, 'w') as ref_file: + for line in corpus_file.readlines(): + if line.strip() == '': + index = index + 1 + elif 
split_left <= index < split_right:
+                ref_file.write(line)
+            else:
+                train_file.write(line)
+
+
+class MissingLanguageDataException(Exception):
+    def __init__(self, fn):
+        self.fn = fn
+
+
+def xval_experiment(name):
+    def dec(func=None):
+        @functools.wraps(func)
+        def wrapper(self, *args, **kwargs):
+            accuracies = []
+            for i, xval_fns in enumerate(self.xval_fns):
+                #(prefix, train_fn, src_fn, ref_fn)
+                xval_fns['test'] = xval_fns['prefix'] + 'test.' + name
+                xval_fns['model'] = xval_fns['prefix'] + 'model.' + name
+                func(self, xval_fns)  # train_fn, src_fn, ref_fn, test_fn, model_fn
+                evaluator = TaggerEvaluator(xval_fns['src'], xval_fns['ref'], xval_fns['test'])
+                evaluator.run_analysis()
+                accuracies.append(evaluator.accuracy)
+            return (min(accuracies), max(accuracies), statistics.mean(accuracies), statistics.stdev(accuracies))
+        return wrapper
+    return dec
+
+
+def unigram_taggers(cls):
+    for do_cg in [False, True]:
+        for unigram_type in range(1, 4):
+            unigram_model = 'unigram' + str(unigram_type)
+            name = ('cg' if do_cg else '') + unigram_model
+
+            @cls.reg_experiment
+            @xval_experiment(name)
+            def experiment(self, xval_fns, do_cg=do_cg, unigram_model=unigram_model):
+                tagger_train_sup('unigram', unigram_model, xval_fns['model'], xval_fns['train'])
+                if do_cg:
+                    tagger_input = cg_proc(self.cg_fn, input=xval_fns['src'])
+                else:
+                    tagger_input = xval_fns['src']
+                tagger_tag(unigram_model, xval_fns['model'], input=tagger_input, output=xval_fns['test'])
+            setattr(cls, 'experiment_' + name, experiment)
+    return cls
+
+
+def extract_src(morphology_fn, input, output=None):
+    ref_words_iter = extract_words(input=input)
+    return lt_proc(morphology_fn, input=ref_words_iter, output=output)
+
+
+class LanguageTaggerExperimentor:
+    EVAL_PREFIX = 'experiment_'
+
+    experiments = {}
+
+    def __init__(self, lang, lang_root, texts, folds, reuse=False):
+        self.lang = lang
+        self.work_dir = pjoin(TMPDIR, lang)
+
+        pair_name = 'apertium-{0}.{0}'.format(lang)
+        self.morphology_fn = pjoin(lang_root, lang + '.automorf.bin')
+        self.cg_fn = pjoin(lang_root, lang + '.rlx.bin')
+        self.dix_fn = pjoin(lang_root, pair_name + '.dix')
+        self.tsx_fn = pjoin(lang_root, pair_name + '.tsx')
+
+        self.text_fns = [pjoin(lang_root, text) for text in texts]
+        self.joined_fn = pjoin(self.work_dir, 'joined')
+        self.ref_fn = pjoin(self.work_dir, 'ref')
+        self.src_fn = pjoin(self.work_dir, 'src')
+        self.dic_fn = pjoin(self.work_dir, 'filtered.dic')
+
+        self.xval_fns = []
+        self.folds = folds
+
+        for i in range(folds):
+            xval_prefix = pjoin(self.work_dir, 'xval.{}.'.format(i))
+            xval_src_fn = xval_prefix + 'src'
+            xval_trainsrc_fn = xval_prefix + 'trainsrc'
+            xval_train_fn = xval_prefix + 'train'
+            xval_ref_fn = xval_prefix + 'ref'
+
+            self.xval_fns.append({
+                'prefix': xval_prefix,
+                'train': xval_train_fn,
+                'src': xval_src_fn,
+                'ref': xval_ref_fn,
+                'trainsrc': xval_trainsrc_fn,
+            })
+
+        self.validate()
+
+        if not reuse:
+            self.do_preprocessing()
+
+    def validate(self):
+        check_run(["apertium-validate-dictionary", self.dix_fn])
+        check_run(["apertium-validate-tagger", self.tsx_fn])
+
+        for fn in [self.morphology_fn, self.cg_fn, self.tsx_fn, self.dix_fn]:
+            if not pexists(fn):
+                raise MissingLanguageDataException(fn=fn)
+
+    def do_preprocessing(self):
+        if not isdir(self.work_dir):
+            mkdir(self.work_dir)
+
+        strip_unknown_sent(itertools.chain(*(open(fn).readlines() for fn in self.text_fns)), self.joined_fn)
+        #loop.run_until_complete(strip_blanks(self.joined_fn, self.ref_fn))
+        strip_blanks_(self.joined_fn, self.ref_fn)
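+        # the reference side is now fixed; re-analyse its surface forms with
+        # lt-proc to regenerate the ambiguous source side (see extract_src)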
extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn) + loop.run_until_complete(fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn, output_fn=self.dic_fn)) + + for i, xval_fn in enumerate(self.xval_fns): + split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'], self.folds, i) + extract_src(self.morphology_fn, input=xval_fn['ref'], output=xval_fn['src']) + extract_src(self.morphology_fn, input=xval_fn['train'], output=xval_fn['trainsrc']) + + def _analyse(self, test_fn): + evaluator = TaggerEvaluator(self.src_fn, self.ref_fn, test_fn) + evaluator.run_analysis() + return evaluator.accuracy + + @classmethod + def reg_experiment(cls, name): + def reg(func): + LanguageTaggerExperimentor.experiments[name] = func + return reg + + @classmethod + def add_experiments(cls): + for do_cg in [False, True]: + for unigram_type in range(1,4): + unigram_model = 'unigram' + str(unigram_type) + name = ('cg' if do_cg else '') + unigram_model + + @cls.reg_experiment(name) + @xval_experiment(name) + def unigram_experiment(self, xval_fns, do_cg=do_cg, unigram_model=unigram_model): + tagger_train_sup('unigram', unigram_model, xval_fns['model'], xval_fns['train']) + if do_cg: + tagger_input = cg_proc(self.cg_fn, input=xval_fns['src']) + else: + tagger_input = xval_fns['src'] + tagger_tag(unigram_model, xval_fns['model'], input=tagger_input, output=xval_fns['test']) + + for do_cg in [False, True]: + name = ('cg' if do_cg else '') + '1st' + + @cls.reg_experiment(name) + def pick_first_experiment(self, do_cg=do_cg): + first_fn = pjoin(self.work_dir, 'test.' + name) + if do_cg: + tagger_input = cg_proc(self.cg_fn, input=self.src_fn) + else: + tagger_input = self.src_fn + extract_first_analysis(self.src_fn, first_fn) + return self._analyse(first_fn) + + for do_cg in [False, True]: + for is_supervised, model in [(True, 'bigram'), (False, 'bigram'), (False, 'lwsw')]: + for iterations in [0, 50, 250]: + name = "{cg}{sup}_{model}_i{iterations}".format( + cg='cg_' if do_cg else '', + sup='sup' if is_supervised else 'unsup', + model=model, + iterations=iterations) + + @cls.reg_experiment(name) + @xval_experiment(name) + def model_experiment(self, xval_fns, do_cg=do_cg, is_supervised=is_supervised, model=model, iterations=iterations): + if is_supervised: + tagger_train_sup( + model, xval_fns['model'], + train_fn=xval_fns['train'], + trainsrc_fn=xval_fns['trainsrc'], + dic_fn=self.dic_fn, + tsx_fn=self.tsx_fn, + iterations=iterations) + else: + tagger_train_unsup( + model, xval_fns['model'], + trainsrc_fn=xval_fns['trainsrc'], + dic_fn=self.dic_fn, + tsx_fn=self.tsx_fn, + iterations=iterations) + if do_cg: + tagger_input = cg_proc(self.cg_fn, input=xval_fns['src']) + else: + tagger_input = xval_fns['src'] + tagger_tag(model, xval_fns['model'], input=tagger_input, output=xval_fns['test']) + + @classmethod + def all_taggers(cls): + return cls.experiments.keys() + + def get_tagger(self, tagger): + return functools.partial(self.experiments[tagger], self) + + +LanguageTaggerExperimentor.add_experiments() + + +DEFAULT_TAGGERS = list(LanguageTaggerExperimentor.all_taggers()) + + +def main(): + args = parse_args() + if not isdir(TMPDIR): + mkdir(TMPDIR) + + languages_tagger_accuracies = {} + try: + for lang in args.languages: + taggers = args.language_texts[lang] + lang_root = pjoin(args.languagesdir, 'apertium-' + lang) + def mk_experimentor(): + return LanguageTaggerExperimentor(lang, lang_root, taggers, args.folds, reuse=args.reuse) + try: + experimentor = mk_experimentor() + except 
MissingLanguageDataException as e: + print("Missing {}... Trying to build it for you.".format(e.fn)) + with cd(lang_root): + check_run('./autogen.sh') + check_run('make') + experimentor = mk_experimentor() + languages_tagger_accuracies[lang] = {} + for tagger in args.taggers: + experiment = experimentor.get_tagger(tagger) + languages_tagger_accuracies[lang][tagger] = experiment() + finally: + pprint(languages_tagger_accuracies) + + +if __name__ == '__main__': + try: + main() + finally: + loop.close() Index: branches/apertium-tagger/experiments/split_corpus_n.py =================================================================== --- branches/apertium-tagger/experiments/split_corpus_n.py (nonexistent) +++ branches/apertium-tagger/experiments/split_corpus_n.py (revision 68351) @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import sys +import random + +def main(corpus, prefix): + tokens = 0.0 + sentences = 0.0 + + for line in open(corpus).readlines(): + + if line.strip() == '': + sentences = sentences + 1.0 + else: + tokens = tokens + 1.0 + + print(sentences, tokens, tokens / sentences) + + ids = [] + for i in range(0, int(sentences)): + ids.append(i) + random.shuffle(ids) + split = int(sentences / 10) + print(split, file=sys.stderr) + testing = ids[0:split] + training = ids[split:] + + train_file = open(prefix + 'train', 'w+') + test_file = open(prefix + 'ref', 'w+') + + buffer = '' + index = 0 + for line in open(corpus).readlines(): + if line.strip() == '': + index = index + 1 + buffer = buffer + '\n' + if index in testing: + test_file.write(buffer) + elif index in training: + train_file.write(buffer) + else: + print('ERROR: %d not in testing or training' % + (index), file=sys.stderr) + buffer = '' + else: + buffer = buffer + line + if index % int(tokens / 100) == 0: + sys.stderr.write('.') + sys.stderr.flush() + +if __name__ == '__main__': + corpus = sys.argv[1] + prefix = sys.argv[2] + main(corpus, prefix) Index: trunk/apertium/apertium/hmm.cc =================================================================== --- trunk/apertium/apertium/hmm.cc (revision 68349) +++ trunk/apertium/apertium/hmm.cc (revision 68351) @@ -204,14 +204,9 @@ if (tags.size()==0) { //This is an unknown word tags = tdhmm.getOpenClass(); } - else if (output.has_not(tags)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors+= L"Word '"+word->get_superficial_form()+L"' not found in the dictionary.\n"; - errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n"; - errors+= L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + else { + require_ambiguity_class(tdhmm, tags, *word); + } k2=output[tags]; @@ -366,15 +361,8 @@ if (word_untagged->get_tags().size()==0) { // Unknown word tags = tdhmm.getOpenClass(); } - else if (output.has_not(word_untagged->get_tags())) { //We are training, there is no problem - wstring errors; - errors = L"A new ambiguity class was found. 
I cannot continue.\n"; - errors+= L"Word '"+word_untagged->get_superficial_form()+L"' not found in the dictionary.\n"; - errors+= L"New ambiguity class: "+word_untagged->get_string_tags()+L"\n"; - errors+= L"Take a look at the dictionary, then retrain."; - fatal_error(errors); - } else { + require_ambiguity_class(tdhmm, word_untagged->get_tags(), *word_untagged); tags = word_untagged->get_tags(); } @@ -460,49 +448,12 @@ } void -HMM::read_dictionary (FILE *fdic) { - int i, k, nw=0; - TaggerWord *word=NULL; - set tags; - Collection &output = tdhmm.getOutput(); - - MorphoStream morpho_stream(fdic, true, &tdhmm); - - // In the input dictionary there must be all punctuation marks, including the end-of-sentece mark - - word = morpho_stream.get_next_word(); - - while (word) { - if (++nw%10000==0) wcerr<get_tags(); +HMM::read_dictionary(FILE *fdic) { + tagger_utils::read_dictionary(fdic, tdhmm); + int N = (tdhmm.getTagIndex()).size(); + int M = (tdhmm.getOutput()).size(); + wcerr << N << L" states and " << M <0) - k = output[tags]; - - delete word; - word = morpho_stream.get_next_word(); - } - wcerr< amb_class; - amb_class.insert(i); - k=output[amb_class]; - } - - int M = output.size(); - - wcerr<< N <get_superficial_form()+L"' not found in the dictionary.\n"; - errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n"; - errors+= L"Take a look at the dictionary, then retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdhmm, tags, *word); k = output[tags]; len = pending.size(); @@ -806,17 +750,7 @@ if (tags.size()==0) // This is an unknown word tags = tdhmm.getOpenClass(); - if (output.has_not(tags)) { // Encontrada una clase de ambigüedad desconocida hasta el momento - if (debug) { - wstring errors; - errors = L"A new ambiguity class was found. \n"; - errors+= L"Retraining the tagger is necessary so as to take it into account.\n"; - errors+= L"Word '"+word->get_superficial_form()+L"'.\n"; - errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n"; - wcerr< -HMM::find_similar_ambiguity_class(set c) { - int size_ret = -1; - set ret=tdhmm.getOpenClass(); //Se devolver si no encontramos ninguna clase mejor - bool skeep_class; - Collection &output = tdhmm.getOutput(); - - for(int k=0; k((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) { - skeep_class=false; - // Test if output[k] is a subset of class - for(set::const_iterator it=output[k].begin(); it!=output[k].end(); it++) { - if (c.find(*it)==c.end()) { - skeep_class=true; //output[k] is not a subset of class - break; - } - } - if (!skeep_class) { - size_ret = output[k].size(); - ret = output[k]; - } - } - } - return ret; -} Index: trunk/apertium/apertium/hmm.h =================================================================== --- trunk/apertium/apertium/hmm.h (revision 68349) +++ trunk/apertium/apertium/hmm.h (revision 68351) @@ -60,16 +60,6 @@ * @see: read_ambiguity_classes, read_dictionary */ void init(); - - /** This method returns a known ambiguity class that is a subset of - * the one received as a parameter. This is useful when a new - * ambiguity class is found because of changes in the morphological - * dictionary used by the MT system. 
- * @param c set of tags (ambiguity class) - * @return a known ambiguity class - */ - set find_similar_ambiguity_class(set c); - public: void deserialise(FILE *Serialised_FILE_Tagger); std::vector &getArrayTags(); Index: trunk/apertium/apertium/lswpost.cc =================================================================== --- trunk/apertium/apertium/lswpost.cc (revision 68349) +++ trunk/apertium/apertium/lswpost.cc (revision 68351) @@ -104,7 +104,6 @@ set tags_left, tags_mid, tags_right; set::iterator iter_left, iter_mid, iter_right; vector > > para_matrix(N, vector >(N, vector(N, 0))); - Collection &output = tdlsw.getOutput(); MorphoStream morpho_stream(ftxt, true, &tdlsw); int num_valid_seq = 0; @@ -114,14 +113,8 @@ if (tags_left.size()==0) { //This is an unknown word tags_left = tdlsw.getOpenClass(); } - if (output.has_not(tags_left)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + + require_ambiguity_class(tdlsw, tags_left, *word); ++nw; delete word; word = morpho_stream.get_next_word(); // word for tags mid @@ -129,14 +122,7 @@ if (tags_mid.size()==0) { //This is an unknown word tags_mid = tdlsw.getOpenClass(); } - if (output.has_not(tags_mid)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdlsw, tags_mid, *word); ++nw; delete word; if (morpho_stream.getEndOfFile()) { @@ -155,14 +141,7 @@ if (tags_right.size()==0) { //This is an unknown word tags_right = tdlsw.getOpenClass(); } - if (output.has_not(tags_right)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdlsw, tags_right, *word); num_valid_seq = tags_left.size() * tags_mid.size() * tags_right.size(); for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { @@ -247,48 +226,11 @@ void LSWPoST::read_dictionary(FILE *fdic) { - int i, k, nw = 0; - TaggerWord *word = NULL; - set tags; - Collection &output = tdlsw.getOutput(); - - MorphoStream morpho_stream(fdic, true, &tdlsw); - - // In the input dictionary there must be all punctuation marks, including the end-of-sentece mark - - word = morpho_stream.get_next_word(); - - while (word) { - if (++nw % 10000 == 0) - wcerr << L'.' << flush; - - tags = word->get_tags(); - - if (tags.size() > 0) - k = output[tags]; - - delete word; - word = morpho_stream.get_next_word(); - } - wcerr << L"\n"; - - // OPEN AMBIGUITY CLASS - // It contains all tags that are not closed. 
- // Unknown words are assigned the open ambiguity class - k = output[tdlsw.getOpenClass()]; - + tagger_utils::read_dictionary(fdic, tdlsw); int N = (tdlsw.getTagIndex()).size(); + int M = (tdlsw.getOutput()).size(); + wcerr << N << L" states and " << M < amb_class; - amb_class.insert(i); - k = output[amb_class]; - } - - wcerr << N << L" states\n"; - // set up the probability matrix of tdlsw, the pointer to the TaggerDataLSW object tdlsw.setProbabilities(N); } @@ -302,7 +244,6 @@ set tags_left, tags_mid, tags_right; set::iterator iter_left, iter_mid, iter_right; vector > > para_matrix_new(N, vector >(N, vector(N, 0))); - Collection &output = tdlsw.getOutput(); MorphoStream morpho_stream(ftxt, true, &tdlsw); word = new TaggerWord(); // word for tags left @@ -311,14 +252,7 @@ if (tags_left.size()==0) { //This is an unknown word tags_left = tdlsw.getOpenClass(); } - if (output.has_not(tags_left)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdlsw, tags_left, *word); ++nw; delete word; word = morpho_stream.get_next_word(); // word for tags mid @@ -326,14 +260,7 @@ if (tags_mid.size()==0) { //This is an unknown word tags_mid = tdlsw.getOpenClass(); } - if (output.has_not(tags_mid)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdlsw, tags_mid, *word); ++nw; delete word; if (morpho_stream.getEndOfFile()) { @@ -351,14 +278,7 @@ if (tags_right.size()==0) { //This is an unknown word tags_right = tdlsw.getOpenClass(); } - if (output.has_not(tags_right)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdlsw, tags_right, *word); double normalization = 0; @@ -416,38 +336,20 @@ set::iterator iter_left, iter_mid, iter_right; MorphoStream morpho_stream(Input, debug, &tdlsw); morpho_stream.setNullFlush(null_flush); - Collection &output = tdlsw.getOutput(); word_left = new TaggerWord(); // word left word_left->add_tag(eos, L"sent", tdlsw.getPreferRules()); word_left->set_show_sf(show_sf); tags_left = word_left->get_tags(); // tags left - if (output.has_not(tags_left)) { - if (debug) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word_left->get_superficial_form() + L"' not found Input the dictionary.\n"; - errors += L"New ambiguity class: " + word_left->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. 
Then, retrain."; - fatal_error(errors); - } - tags_left = find_similar_ambiguity_class(tags_left); - } + + tags_left = require_similar_ambiguity_class(tdlsw, tags_left, *word_left, debug); word_mid = morpho_stream.get_next_word(); // word mid word_mid->set_show_sf(show_sf); tags_mid = word_mid->get_tags(); // tags mid - if (output.has_not(tags_mid)) { - if (debug) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word_mid->get_superficial_form() + L"' not found Input the dictionary.\n"; - errors += L"New ambiguity class: " + word_mid->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } - tags_mid = find_similar_ambiguity_class(tags_mid); - } + + tags_mid = require_similar_ambiguity_class(tdlsw, tags_mid, *word_mid, debug); + if (morpho_stream.getEndOfFile()) { delete word_left; delete word_mid; @@ -461,17 +363,7 @@ while (word_right) { tags_right = word_right->get_tags(); - if (output.has_not(tags_right)) { - if (debug) { - wstring errors; - errors = L"A new ambiguity class was found. \n"; - errors+= L"Retraining the tagger is necessary so as to take it into account.\n"; - errors+= L"Word '"+word_right->get_superficial_form()+L"'.\n"; - errors+= L"New ambiguity class: "+word_right->get_string_tags()+L"\n"; - fatal_error(errors); - } - tags_right = find_similar_ambiguity_class(tags_right); - } + tags_right = require_similar_ambiguity_class(tdlsw, tags_right, *word_right, debug); double max = -1; TTag tag_max = *tags_mid.begin(); @@ -511,30 +403,3 @@ delete word_left; delete word_mid; } - -set -LSWPoST::find_similar_ambiguity_class(set c) { - int size_ret = -1; - set ret=tdlsw.getOpenClass(); // return open-class as default, if no better is found. - bool skip_class; - Collection &output = tdlsw.getOutput(); - - for(int k=0; k((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) { - skip_class=false; - // Test if output[k] is a subset of class - for(set::const_iterator it=output[k].begin(); it!=output[k].end(); it++) { - if (c.find(*it)==c.end()) { - skip_class=true; //output[k] is not a subset of class - break; - } - } - if (!skip_class) { - size_ret = output[k].size(); - ret = output[k]; - } - } - } - return ret; -} - Index: trunk/apertium/apertium/lswpost.h =================================================================== --- trunk/apertium/apertium/lswpost.h (revision 68349) +++ trunk/apertium/apertium/lswpost.h (revision 68351) @@ -107,14 +107,5 @@ /** Do the tagging */ void tagger(FILE *Input, FILE *Output, const bool &First = false); - - /** This method returns a known ambiguity class that is a subset of - * the one received as a parameter. This is useful when a new - * ambiguity class is found because of changes in the morphological - * dictionary used by the MT system. - * @param c set of tags (ambiguity class) - * @return a known ambiguity class - */ - set find_similar_ambiguity_class(set c); }; #endif Index: trunk/apertium/apertium/tagger_utils.cc =================================================================== --- trunk/apertium/apertium/tagger_utils.cc (revision 68349) +++ trunk/apertium/apertium/tagger_utils.cc (revision 68351) @@ -14,7 +14,9 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, see . 
 */
+
+#include
+#include
+
 #include
 #include
@@ -119,7 +121,103 @@
   return s;
 }
-
+
+void
+tagger_utils::read_dictionary(FILE *fdic, TaggerData &td) {
+  int i, k, nw = 0;
+  TaggerWord *word = NULL;
+  set<TTag> tags;
+  Collection &output = td.getOutput();
+
+  MorphoStream morpho_stream(fdic, true, &td);
+
+  // In the input dictionary there must be all punctuation marks, including the end-of-sentence mark
+
+  word = morpho_stream.get_next_word();
+
+  while (word) {
+    if (++nw % 10000 == 0)
+      wcerr << L'.' << flush;
+
+    tags = word->get_tags();
+
+    if (tags.size() > 0)
+      k = output[tags];
+
+    delete word;
+    word = morpho_stream.get_next_word();
+  }
+  wcerr << L"\n";
+
+  // OPEN AMBIGUITY CLASS
+  // It contains all tags that are not closed.
+  // Unknown words are assigned the open ambiguity class
+  k = output[td.getOpenClass()];
+
+  // Create an ambiguity class holding one single tag for each tag,
+  // if not created yet
+  int N = (td.getTagIndex()).size();
+  for(i = 0; i != N; i++) {
+    set<TTag> amb_class;
+    amb_class.insert(i);
+    k = output[amb_class];
+  }
+}
+
+set<TTag>
+tagger_utils::find_similar_ambiguity_class(TaggerData &td, set<TTag> &c) {
+  int size_ret = -1;
+  set<TTag> ret = td.getOpenClass(); // return open-class as default, if no better is found.
+  bool skip_class;
+  Collection &output = td.getOutput();
+
+  for(int k=0; k<output.size(); k++) {
+    if ((((int)output[k].size())>((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) {
+      skip_class = false;
+      // Test if output[k] is a subset of class
+      for(set<TTag>::const_iterator it=output[k].begin(); it!=output[k].end(); it++) {
+        if (c.find(*it)==c.end()) {
+          skip_class = true; //output[k] is not a subset of class
+          break;
+        }
+      }
+      if (!skip_class) {
+        size_ret = output[k].size();
+        ret = output[k];
+      }
+    }
+  }
+  return ret;
+}
+
+void
+tagger_utils::require_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word) {
+  if (td.getOutput().has_not(tags)) {
+    wstring errors;
+    errors = L"A new ambiguity class was found. I cannot continue.\n";
+    errors+= L"Word '" + word.get_superficial_form() + L"' not found in the dictionary.\n";
+    errors+= L"New ambiguity class: " + word.get_string_tags() + L"\n";
+    errors+= L"Take a look at the dictionary, then retrain.";
+    fatal_error(errors);
+  }
+}
+
+set<TTag>
+tagger_utils::require_similar_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, bool debug) {
+  if (td.getOutput().has_not(tags)) {
+    if (debug) {
+      wstring errors;
+      errors = L"A new ambiguity class was found. \n";
+      errors += L"Retraining the tagger is necessary so as to take it into account.\n";
+      errors += L"Word '" + word.get_superficial_form() + L"'.\n";
+      errors += L"New ambiguity class: " + word.get_string_tags() + L"\n";
+      wcerr << L"Error: " << errors;
+    }
+    return find_similar_ambiguity_class(td, tags);
+  }
+  return tags;
+}
+
 template <class T, class G>
 ostream& operator<< (ostream& os, const map<T, G> & f){
   typename map<T, G>::const_iterator it;
Index: trunk/apertium/apertium/tagger_utils.h
===================================================================
--- trunk/apertium/apertium/tagger_utils.h	(revision 68349)
+++ trunk/apertium/apertium/tagger_utils.h	(revision 68351)
@@ -25,6 +25,8 @@
 #include
 #include
 #include
+#include
+#include
 
 using namespace std;
 
@@ -66,6 +68,29 @@
  */
 int nguiones_fs(wstring const &cadena);
 
+/** Reads the expanded dictionary received as a parameter and collects the
+ * ambiguity classes that the tagger will manage.
+ * @param fdic the input stream with the expanded dictionary to read
+ * @param td the tagger data instance to mutate
+ */
+void read_dictionary(FILE *fdic, TaggerData &td);
+
+/** This method returns a known ambiguity class that is a subset of
+ * the one received as a parameter. This is useful when a new
+ * ambiguity class is found because of changes in the morphological
+ * dictionary used by the MT system.
+ * @param c set of tags (ambiguity class)
+ * @return a known ambiguity class
+ */
+set<TTag> find_similar_ambiguity_class(TaggerData &td, set<TTag> &c);
+
+/** Dies with an error message if the tags aren't in the tagger data */
+void require_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word);
+
+/** As with find_similar_ambiguity_class, but returns the tags unchanged if
+ * they are already known, and prints a warning when debug is set */
+set<TTag> require_similar_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, bool debug);
+
 wstring trim(wstring s);
 };
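
Usage sketch (not part of the diff; the corpus file names are illustrative
assumptions). evaluate_tagger.py compares three line-aligned streams: the
ambiguous source analyses, the hand-tagged reference, and the tagger output.
From the shell:

    python3 evaluate_tagger.py corpus.src corpus.ref corpus.tst

or from Python, which is how run_experiment.py drives it internally:

    from evaluate_tagger import TaggerEvaluator

    te = TaggerEvaluator('corpus.src', 'corpus.ref', 'corpus.tst')
    te.run_analysis()    # per-token diagnostics plus per-rule TP/TN/FP/FN counts
    te.print_analyses()  # summary: precision, recall, accuracy, ambiguity rates
    print(te.precision, te.recall, te.accuracy)

The batch driver applies the same evaluation over every language/tagger pair,
e.g. python3 run_experiment.py path/to/languages --languages rus,kaz --folds 10,
where the languages directory is assumed to hold apertium-<lang> checkouts.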