Index: branches/apertium-tagger/experiments/split_corpus_n.py
===================================================================
--- branches/apertium-tagger/experiments/split_corpus_n.py	(revision 68806)
+++ branches/apertium-tagger/experiments/split_corpus_n.py	(nonexistent)
@@ -1,54 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import random
-
-def main(corpus, prefix):
-    tokens = 0.0
-    sentences = 0.0
-
-    for line in open(corpus).readlines():
-
-        if line.strip() == '':
-            sentences = sentences + 1.0
-        else:
-            tokens = tokens + 1.0
-
-    print(sentences, tokens, tokens / sentences)
-
-    ids = []
-    for i in range(0, int(sentences)):
-        ids.append(i)
-    random.shuffle(ids)
-    split = int(sentences / 10)
-    print(split, file=sys.stderr)
-    testing = ids[0:split]
-    training = ids[split:]
-
-    train_file = open(prefix + 'train', 'w+')
-    test_file = open(prefix + 'ref', 'w+')
-
-    buffer = ''
-    index = 0
-    for line in open(corpus).readlines():
-        if line.strip() == '':
-            index = index + 1
-            buffer = buffer + '\n'
-            if index in testing:
-                test_file.write(buffer)
-            elif index in training:
-                train_file.write(buffer)
-            else:
-                print('ERROR: %d not in testing or training' %
-                      (index), file=sys.stderr)
-            buffer = ''
-        else:
-            buffer = buffer + line
-        if index % int(tokens / 100) == 0:
-            sys.stderr.write('.')
-            sys.stderr.flush()
-
-if __name__ == '__main__':
-    corpus = sys.argv[1]
-    prefix = sys.argv[2]
-    main(corpus, prefix)
Index: branches/apertium-tagger/experiments/add_to_wikitable.py
===================================================================
--- branches/apertium-tagger/experiments/add_to_wikitable.py	(nonexistent)
+++ branches/apertium-tagger/experiments/add_to_wikitable.py	(revision 68807)
@@ -0,0 +1,234 @@
+# -- encoding: utf-8 --
+"""Merge tagger accuracy results into an existing MediaWiki table.
+
+Usage: add_to_wikitable.py RESULTS_FILE < table.wiki > new_table.wiki
+
+The wikitable is read from stdin and the updated table is printed to stdout;
+debug tracing goes to stderr.  RESULTS_FILE must contain a Python literal
+mapping language code -> {experiment name -> value}.
+"""
+
+import ast
+import sys
+import mwparserfromhell
+from mwparserfromhell.nodes.tag import Tag
+from mwparserfromhell.nodes.text import Text
+from mwparserfromhell.wikicode import Wikicode
+
+# Row order of the taggers; also the substrings searched for in names.
+TAGGER_ORDER = ['1st', 'unigram1', 'unigram2', 'unigram3', 'bigram', 'lwsw']
+rdict = lambda d: {v: k for k, v in d.items()}
+
+LANG_CODE_NAME_MAP = {
+    'cat': 'Catalan',
+    'spa': 'Spanish',
+    'hbs': 'Serbo-Croatian',
+    'rus': 'Russian',
+    'kaz': 'Kazakh',
+    'por': 'Portuguese',
+    'swe': 'Swedish',
+}
+LANG_NAME_CODE_MAP = rdict(LANG_CODE_NAME_MAP)
+
+def name_to_attrs(name):
+    """Parse an experiment name into its attributes.
+
+    Returns a dict with 'tagger' (an entry of TAGGER_ORDER), 'cg' (bool),
+    'sup' (True/False, or None when not applicable) and 'iters' (int or
+    None).  Raises ValueError if the name contains no known tagger.
+    """
+    attrs = {}
+    # No break: when several entries match, the last one wins.
+    for tagger in TAGGER_ORDER:
+        if tagger in name:
+            attrs['tagger'] = tagger
+    if 'tagger' not in attrs:
+        raise ValueError(
+            "No known tagger in experiment name {!r}".format(name))
+
+    attrs['cg'] = 'cg' in name
+
+    if attrs['tagger'] == 'lwsw':
+        attrs['sup'] = None
+    elif 'unsup' in name:
+        # Checked before 'sup', which is a substring of 'unsup'.
+        attrs['sup'] = False
+    elif 'sup' in name:
+        attrs['sup'] = True
+    else:
+        attrs['sup'] = None
+
+    if '_i' in name:
+        attrs['iters'] = int(name.split('_i')[1])
+    else:
+        attrs['iters'] = None
+
+    return attrs
+
+def _none_first(value):
+    """Return a sort key that orders None before every other value."""
+    return (value is not None, 0 if value is None else value)
+
+def attrs_to_sort_tuple(attrs):
+    """Return the sort key for the row described by *attrs*.
+
+    'sup' and 'iters' may be None, which Python 3 cannot order against
+    bools or ints, so both are wrapped with _none_first().
+    """
+    # tagger; unsup, sup; nocg, cg; iters
+    return (TAGGER_ORDER.index(attrs['tagger']), _none_first(attrs['sup']),
+            attrs['cg'], _none_first(attrs['iters']))
+
+def attrs_to_str(attrs):
+    """Build the human-readable row title for *attrs*."""
+    if attrs['tagger'].startswith('unigram'):
+        out = 'Unigram model ' + attrs['tagger'][len('unigram'):]
+    elif attrs['tagger'] == '1st':
+        out = attrs['tagger']
+    else:
+        out = attrs['tagger'].title()
+
+    if attrs['cg']:
+        out = "CG→" + out
+
+    bits = []
+    if attrs['sup'] is not None:
+        bits.append('sup' if attrs['sup'] else 'unsup')
+    if attrs['iters'] is not None:
+        bits.append('{} iters'.format(attrs['iters']))
+    if bits:
+        out += ' ({})'.format(', '.join(bits))
+
+    return out
+
+def value_to_str(value):
+    """Format an accuracy value as a percentage string.
+
+    A plain number is scaled by 100.  An indexable value is rendered as
+    "item 2 ± item 3" of the scaled items; presumably it is a
+    (min, max, mean, stdev) tuple -- confirm against the results file.
+    """
+    if hasattr(value, "__getitem__"):
+        return "{2:.2f}±{3:.2f}".format(*(v * 100 for v in value))
+    else:
+        return "{0:.2f}".format(value * 100)
+
+
+def mk_title_td(title):
+    """Return the first cell of a row, holding *title* in bold markup."""
+    return Tag(
+        'td',
+        wiki_markup='|',
+        contents=" '''{}''' ".format(title),
+        closing_wiki_markup='')
+
+def mk_val_td(val, is_last=False):
+    """Return a right-aligned value cell (newline-terminated if is_last)."""
+    return Tag(
+        'td',
+        wiki_markup='||',
+        attrs=['align=right'],
+        contents=" {} {}".format(val, "\n" if is_last else ""),
+        wiki_style_separator='|',
+        closing_wiki_markup='')
+
+def mk_empty_td(is_last=False):
+    """Return an empty cell; the last cell of a row holds only a newline."""
+    return Tag(
+        'td',
+        wiki_markup='||',
+        contents="\n" if is_last else "",
+        closing_wiki_markup='')
+
+def mk_initial_tr(title):
+    """Return a new row holding the title cell and one empty last cell."""
+    return Tag(
+        'tr',
+        wiki_markup='|-\n',
+        contents=Wikicode([mk_title_td(title), mk_empty_td(is_last=True)]),
+        closing_wiki_markup='')
+
+input_table = sys.stdin.read()
+# The results file should hold only plain literals, so parse it with
+# literal_eval instead of executing it with eval().
+with open(sys.argv[1]) as input_file:
+    input_data = ast.literal_eval(input_file.read())
+
+# Language codes in the order their columns appear in the heading row.
+lang_order = []
+
+table = mwparserfromhell.parse(input_table.strip())
+table_inner = table.get(0).contents
+# NOTE(review): assumes node 2 of the table is the heading row -- confirm.
+headings = table_inner.get(2).contents.nodes
+for tag in headings:
+    if not isinstance(tag, Tag):
+        continue
+    title = tag.contents.strip()
+    if not title:
+        continue
+    lang_order.append(LANG_NAME_CODE_MAP[title])
+
+def _debug(*args):
+    """Print tracing output to stderr so stdout carries only the table."""
+    print(*args, file=sys.stderr)
+
+def insert_into_tr(tr, col_idx, val_str):
+    """Put *val_str* into column *col_idx* of row *tr*, in place.
+
+    A row that is too short is first padded with empty cells; the newline
+    that terminates the row is moved to the new last cell.
+    """
+    _debug("insert_into_tr", tr, col_idx, val_str)
+    _debug('tr.contents.nodes', tr.contents.nodes)
+    if len(tr.contents.nodes) <= col_idx:
+        last_td = tr.contents.get(-1)
+        if last_td.contents.endswith('\n'):
+            last_td.contents = last_td.contents[:-1]
+        while len(tr.contents.nodes) < col_idx:
+            tr.contents.append(mk_empty_td())
+        # The new last cell of the row takes over the newline.
+        tr.contents.append(mk_empty_td(is_last=True))
+    target_cell = tr.contents.get(col_idx)
+    has_newline = target_cell.contents.endswith('\n')
+    _debug('target_cell', target_cell)
+    val_td = mk_val_td(val_str, is_last=has_newline)
+    _debug('replacement cell', val_td)
+    tr.contents.set(col_idx, val_td)
+    _debug('tr after', tr)
+
+for lang, data in input_data.items():
+    lang_idx = lang_order.index(lang)
+    data = [(name_to_attrs(name), value_to_str(value))
+            for name, value in data.items()]
+    data = sorted(data, key=lambda pair: attrs_to_sort_tuple(pair[0]))
+    # NOTE(review): rows are scanned from node 4 on, presumably the first
+    # data row -- confirm against the table layout.
+    table_idx = 4
+    for attrs, val_str in data:
+        title_str = attrs_to_str(attrs)
+        # Column 0 is the row title, so language columns are offset by one.
+        col_idx = lang_idx + 1
+        # table_idx is not reset between titles: the values are sorted, and
+        # the search for each title resumes at the previously matched row.
+        while table_idx < len(table_inner.nodes):
+            tr = table_inner.get(table_idx)
+            if len(tr.contents) > 1:
+                cell_contents = tr.contents.get(0).contents
+                existing_title_str = str(cell_contents).strip(' ').strip("'")
+                if existing_title_str == title_str:
+                    # insert into existing
+                    insert_into_tr(tr, col_idx, val_str)
+                    break
+            else:
+                # NOTE(review): removing shifts the next node into table_idx,
+                # so the increment below skips it -- confirm this is intended.
+                table_inner.remove(tr, recursive=False)
+            table_idx += 1
+        else:
+            # append to end
+            tr = mk_initial_tr(title_str)
+            insert_into_tr(tr, col_idx, val_str)
+            table_inner.append(tr)
+
+print(table)
Index: branches/apertium-tagger/experiments/requirements.txt
===================================================================
--- branches/apertium-tagger/experiments/requirements.txt	(nonexistent)
+++ branches/apertium-tagger/experiments/requirements.txt	(revision 68807)
@@ -0,0 +1,3 @@
+aitertools==0.1.0
+mwparserfromhell>=0.4
+tabulate==0.7.5
Index: branches/apertium-tagger/experiments/run_experiment.py
===================================================================
--- branches/apertium-tagger/experiments/run_experiment.py	(revision 68806)
+++ branches/apertium-tagger/experiments/run_experiment.py	(revision 68807)
@@ -1,39 +1,26 @@
-import statistics
-import sys
+from statistics import mean, stdev
 from os import mkdir
 from os.path import isdir, join as pjoin, exists as pexists
-import subprocess
 from subprocess import PIPE
 import functools
 import itertools
 import aitertools
 import argparse
-import tabulate
-import csv
 import asyncio
 from asyncio.subprocess import create_subprocess_exec
-from collections import namedtuple
-from contextlib import contextmanager
 import os
-from pprint import pprint
+import re
+from pprint import pformat
+import datetime
 
 from evaluate_tagger import TaggerEvaluator
-from split_corpus_n import main as split_corpus_n
 
+from shell_utils import (
+    cd, filter, proc_filter, check_run, writeiter, MapFilter)
+
 loop = asyncio.get_event_loop()
 
-@contextmanager
-def cd(newdir):
-    prevdir = os.getcwd()
-    os.chdir(os.path.expanduser(newdir))
-    try:
-        yield
-    finally:
-        os.chdir(prevdir)
-
-
 TMPDIR = 'experimenttmp'
-
 DEFAULT_TEXTS = {
     'cat': ['texts/miscellaneous.tagged.txt'],
     'spa': ['texts/miscellaneous.tagged.txt'],
@@ -50,14 +37,21 @@
         'texts/akatsuki.txt',
     ],
 }
-
+TSX_MAP = {
+    'hbs': 'apertium-hbs.hbs-coarse.tsx',
+}
 DEFAULT_LANGUAGES = ['cat', 'spa', 'hbs', 'rus', 'kaz', 'por', 'swe']
+NO_TSX_LANGUAGES = ['rus']
+STRIP_AT_LANGUAGES = ['rus']
+AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>')
 
+
 def comma_list(s):
     if s == '()':
         return []
     return s.split(',')
 
+
 def comma_colon_dict(s):
     d = {}
     for bit in s.split(','):
@@ -65,184 +59,41 @@
         d[pair[0]] = pair[1].split(',')
     return d
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description="Runs a series of experiments on different part of speech taggers and different language data.")
-    parser.add_argument('languagesdir', help="Path to the directory containing all the individaul language data directories")
-    parser.add_argument('--languages', help="Only run experiments for these languages, comma separated", default=DEFAULT_LANGUAGES, type=comma_list)
-    parser.add_argument('--taggers',  help="Only run experiments with these taggers, comma separated", default=DEFAULT_TAGGERS, type=comma_list)
-    parser.add_argument('--language-texts',  help="Use different texts per language, coma seperated colon pairs", default=DEFAULT_TEXTS, type=comma_colon_dict)
-    parser.add_argument('--folds', help="Use x-fold validation instead of 10-fold", default=10, type=int)
-    parser.add_argument('--reuse', help="Reuse preprocesed dictionary and corpa from previous run", action='store_true')
+    parser = argparse.ArgumentParser(
+        description="Runs a series of experiments on different part of speech "
+                    "taggers and different language data.")
+    parser.add_argument(
+        'languagesdir',
+        help="Path to the directory containing all the individual language "
+             "data directories")
+    parser.add_argument(
+        '--languages',
+        help="Only run experiments for these languages, comma separated",
+        default=DEFAULT_LANGUAGES, type=comma_list)
+    parser.add_argument(
+        '--taggers',
+        help="Only run experiments with these taggers, comma separated",
+        default=DEFAULT_TAGGERS, type=comma_list)
+    parser.add_argument(
+        '--language-texts',
+        help="Use different texts per language, comma separated colon pairs",
+        default=DEFAULT_TEXTS, type=comma_colon_dict)
+    parser.add_argument(
+        '--folds',
+        help="Use x-fold validation instead of 10-fold",
+        default=10, type=int)
+    parser.add_argument(
+        '--reuse',
+        help="Reuse preprocessed dictionary and corpora from previous run",
+        action='store_true')
 
     return parser.parse_args()
 
-def read1k_chunker(f):
-    def read1k():
-        return f.read(1024)
-    return iter(read1k, '')
 
-def filter(func=None, iter_filter=False, input_chunker=None, output_separator=''):
-    if func is None:
-        def defer(func=None):
-            return filter(func, iter_filter=iter_filter, input_chunker=input_chunker, output_separator=output_separator)
-        return defer
-
-    def generator(input_iter):
-        for line in input_iter:
-            filtered = func(line)
-            if filtered is not None:
-                yield filtered + output_separator
-
-    @functools.wraps(func)
-    def wrapper(input, output=None):
-        input_file = None
-        if isinstance(input, str):
-            input_iter = input_file = open(input)
-            if input_chunker:
-                input_iter = input_chunker(input_file)
-            else:
-                input_iter = input_file.readlines()
-        else:
-            input_iter = input
-        if iter_filter:
-            gen = func(input_iter)
-        else:
-            gen = generator(input_iter)
-        if output is None:
-            return gen
-        output_file = open(output, 'w')
-        for line in gen:
-            output_file.write(line)
-        if input_file is not None:
-            input_file.close()
-        output_file.close()
-    return wrapper
-
-
-class MapFilter:
-    def __init__(self, aiterable, pred=None, tran=None):
-        self.aiterable = aiterable
-        self.pred = pred
-        self.tran = tran
-
-    async def __aiter__(self):
-        return self
-
-    async def __anext__(self):
-        while True:
-            payload = await self.aiterable.__anext__()
-            if self.pred is None or self.pred(payload):
-                if self.tran is None:
-                    return payload
-                else:
-                    return self.tran(payload)
-
-
-async def dir_in(input_fn, proc):
-    input_file = open(input_fn)
-    while 1:
-        b = input_file.read(1024)
-        if not len(b):
-            return
-        await proc.stdin.write(b)
-
-
-async def dir_out(proc, output_fn):
-    output_file = open(output_fn, 'w')
-    while 1:
-        b = await in_proc.read(1024)
-        if not len(b):
-            return
-        output_file.write(b)
-
-
-class Tee(MapFilter):
-    def __init__(self, aiterable, log_file):
-        self.log_file = open(log_file, 'wb')
-        super().__init__(aiterable, tran=self.tran)
-
-    def tran(self, bit):
-        self.log_file.write(bit)
-        return bit
-
-
-async def pipe(in_proc, out_proc):
-    while 1:
-        b = await in_proc.stdout.read(16384)
-        if b == b'':
-            return
-        out_proc.stdin.write(b)
-        #await out_proc.stdin.drain()
-
-#async def readlines(in_proc):
-#    while 1:
-#        b = await in_proc.stdout.readline()
-#        if not len(b):
-#            return
-#        b
-
-
-async def writeiter(iter, out_proc):
-    i = 0
-    async for block in iter:
-        if (i % 10000) == 0:
-            print(".", end="", flush=True)
-        out_proc.stdin.write(block)
-        #await out_proc.stdin.drain()
-        i += 1
-    print("writeiter done")
-    out_proc.stdin.write_eof()
-
-def proc_filter(func=None, output_chunker=None):
-    if func is None:
-        def defer(func):
-            return proc_filter(func, output_chunker=output_chunker)
-        return defer
-
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        input = kwargs.pop('input', None)
-        output = kwargs.pop('output', None)
-
-        if callable(func):
-            cmd = func(*args, **kwargs)
-        else:
-            cmd = func
-
-        kwargs = {}
-        if isinstance(input, str):
-            kwargs['stdin'] = open(input)
-        else:
-            kwargs['stdin'] = subprocess.PIPE
-        if output:
-            kwargs['stdout'] = open(output, 'w')
-        else:
-            kwargs['stdout'] = subprocess.PIPE
-        print("RUNNING: ", ' '.join(cmd))
-        proc = subprocess.Popen(cmd, universal_newlines=True, **kwargs)
-        print(type(input))
-        if not isinstance(input, str) and input is not None:
-            for line in input:
-                if 'apertium-destxt' in cmd:
-                    print('des', line)
-                #if not isinstance(input, bytes):
-                    #line = line.encode('utf8')
-                #print(proc.stdin)
-                proc.stdin.write(line)
-        if not output:
-            if output_chunker:
-                return output_chunker(proc.stdout)
-            else:
-                return proc.stdout.readlines()
-        stdout, stderr = proc.communicate()
-        retcode = proc.poll()
-        return subprocess.CompletedProcess(proc.args, retcode, stdout, stderr)
-
-    return wrapper
-
-
 @filter
-def strip_blanks_(line):
+def strip_blanks(line):
     if line != '\n':
         return line
 
@@ -256,27 +107,12 @@
 def extract_first_analysis(line):
     return '/'.join(line.split('/')[0:2]).strip().rstrip('$') + '$'
 
-strip_blanks = functools.partial(MapFilter, pred=lambda line: line != '\n')
 
-
 @filter
 def passthrough(line):
     return line
 
-def prun(*args, **kwargs):
-    for key in ('stdin', 'stdout', 'stderr'):
-        if key not in kwargs:
-            kwargs[key] = subprocess.PIPE
-    print('prun', args, kwargs)
-    return subprocess.run(*args, **kwargs)
 
-
-def check_run(cmd, *args, **kwargs):
-    kwargs['check'] = True
-    print("RUNNING: ", ' '.join(cmd))
-    return subprocess.run(cmd, *args, **kwargs)
-
-
 @proc_filter
 def lt_proc(morphology_fn, dictcase=False):
     cmd = ['lt-proc', morphology_fn]
@@ -296,26 +132,32 @@
     return cmd
 
 
-
 @proc_filter
-def tagger_train_sup(model_type, model_fn, train_fn, trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0):
+def tagger_train_sup(model_type, model_fn, train_fn,
+                     trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0):
     cmd = ['apertium-tagger', '--supervised={}'.format(iterations), model_fn]
-    if not all((trainsrc_fn, dic_fn, tsx_fn)) and model_type != 'unigram':
+    if (not all((trainsrc_fn, dic_fn, tsx_fn)) and
+            not model_type.startswith('unigram')):
         raise ValueError("Optional arguments required for non-unigram models")
-    if model_type != 'unigram':
-        #apertium-tagger -s 0 /tmp/$DN.dic /tmp/spa.misc.$i.trainsrc $TSX /tmp/spa.misc.$i.prob /tmp/spa.misc.$i.train /tmp/spa.misc.$i.trainsrc
+    if model_type == 'lwsw':
+        raise ValueError("No supervised training for lwsw model")
+    if model_type.startswith('unigram'):
+        cmd.append(train_fn)
+    else:
         cmd[2:2] = [dic_fn, trainsrc_fn, tsx_fn]
+        cmd.append(train_fn)
         cmd.append(trainsrc_fn)
     insert_model(cmd, model_type)
-    cmd.insert(-1, train_fn)
     return cmd
 
 
 @proc_filter
-def tagger_train_unsup(model_type, model_fn, trainsrc_fn, dic_fn, tsx_fn, iterations=0):
-    if model_fn == 'unigram':
+def tagger_train_unsup(model_type, model_fn, trainsrc_fn, dic_fn, tsx_fn,
+                       iterations=0):
+    if model_type.startswith('unigram'):  # guard on the type, not the path
         raise ValueError("No unsupervised training for unigram models")
-    cmd = ['apertium-tagger', '--train={}'.format(iterations), dic_fn, trainsrc_fn, tsx_fn, model_fn]
+    cmd = ['apertium-tagger', '--train={}'.format(iterations),
+           dic_fn, trainsrc_fn, tsx_fn, model_fn]
     insert_model(cmd, model_type)
     return cmd
 
@@ -334,51 +176,42 @@
 async def fix_dix(morphology_fn, tsx_fn, dix_fn, output_fn):
     pipes = []
 
-    expand_proc = await create_subprocess_exec('lt-expand', dix_fn, stdout=PIPE)
+    expand_proc = await create_subprocess_exec('lt-expand',
+                                               dix_fn, stdout=PIPE)
     filtered = filter_dix(expand_proc.stdout)
 
-    extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿", "¡"]
+    extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿",
+              "¡"]
     for i, extra in enumerate(extras):
         extras[i] = (extra + "\n").encode('utf-8')
     with_extras = aitertools.chain(filtered, extras)
 
     lt_inpipe, destxt_outpipe = os.pipe()
-    destxt = await create_subprocess_exec('apertium-destxt', stdin=PIPE, stdout=destxt_outpipe)
+    destxt = await create_subprocess_exec('apertium-destxt',
+                                          stdin=PIPE, stdout=destxt_outpipe)
     os.close(destxt_outpipe)
 
     pipes.append(writeiter(with_extras, destxt))
 
+    if tsx_fn is not None:
     filter_ambg_inpipe, lt_outpipe = os.pipe()
-    lt_proc = await create_subprocess_exec('lt-proc', morphology_fn, stdin=lt_inpipe, stdout=lt_outpipe)
+        lt_proc = await create_subprocess_exec(
+            'lt-proc', morphology_fn, stdin=lt_inpipe, stdout=lt_outpipe)
     os.close(lt_outpipe)
 
-    filter_ambg = await create_subprocess_exec('apertium-filter-ambiguity', tsx_fn, stdin=filter_ambg_inpipe, stdout=open(output_fn, 'wb'))
+        filter_ambg = await create_subprocess_exec(
+            'apertium-filter-ambiguity', tsx_fn,
+            stdin=filter_ambg_inpipe, stdout=open(output_fn, 'wb'))
     pipes.append(filter_ambg.wait())
+    else:
+        lt_proc = await create_subprocess_exec(
+            'lt-proc', morphology_fn,
+            stdin=lt_inpipe, stdout=open(output_fn, 'wb'))
+        pipes.append(lt_proc.wait())
+
     return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes))
 
 
-#def lt_proc(morphology_fn, input_fn, output_fn=None, dictcase=False):
-    #cmd = ['lt-proc', morphology_fn]
-    #if dictcase:
-        #cmd.insert(1, '--dictionary-case')
-    #if isinstance(input_fn, str):
-        #cmd.append(input_fn)
-        #cmd.append(output_fn)
-        #return prun(cmd)
-    #else:
-        #kwargs = {'stdin': subprocess.PIPE}
-        #if output_fn:
-            ##kwargs['stdout'] = open(output_fn, 'w')
-        #with subprocess.Popen(cmd, **kwargs) as proc:
-            #if not isinstance(input_fn, str):
-                #for line in input_fn:
-                    #if not isinstance(input_fn, bytes):
-                        #line = line.encode('utf8')
-                    #proc.stdin.write(line)
-            #stdout, stderr = proc.communicate()
-            #retcode = proc.poll()
-        #return subprocess.CompletedProcess(proc.args, retcode, stdout, stderr)
-
 @proc_filter
 def cg_proc(grammar_fn, dictcase=True):
     cmd = ['cg-proc', grammar_fn]
@@ -387,12 +220,17 @@
     return cmd
 
 
+@filter
+def strip_at_tag(line):
+    return AT_TAG_REGEX.sub('', line)
+
+
 @filter(iter_filter=True)
 def strip_unknown_sent(gen):
     buff = []
     valid_sent = True
     for line in gen:
-        if line.strip() == '':
+        if line.strip().strip('¶') == '':
             if valid_sent:
                 for line in buff:
                     yield line
@@ -404,6 +242,7 @@
             if '/*' in line:
                 valid_sent = False
 
+
 def split_n_r(corpus_fn, train_fn, ref_fn, n, r):
     sentences = 0
     with open(corpus_fn) as corpus_file:
@@ -414,7 +253,6 @@
         split_left = int(float(sentences) * r / n)
         split_right = int(float(sentences) * (r + 1) / n)
 
-        buffer = ''
         index = 0
 
         corpus_file.seek(0)
@@ -440,37 +278,19 @@
         def wrapper(self, *args, **kwargs):
             accuracies = []
             for i, xval_fns in enumerate(self.xval_fns):
-                #(prefix, train_fn, src_fn, ref_fn)
                 xval_fns['test'] = xval_fns['prefix'] + 'test.' + name
                 xval_fns['model'] = xval_fns['prefix'] + 'model.' + name
-                func(self, xval_fns) # train_fn, src_fn, ref_fn, test_fn, model_fn
-                evaluator = TaggerEvaluator(xval_fns['src'], xval_fns['ref'], xval_fns['test'])
+                func(self, xval_fns)
+                evaluator = TaggerEvaluator(
+                    xval_fns['src'], xval_fns['ref'], xval_fns['test'])
                 evaluator.run_analysis()
                 accuracies.append(evaluator.accuracy)
-            return (min(accuracies), max(accuracies), statistics.mean(accuracies), statistics.stddev(accuracies))
+            return (min(accuracies), max(accuracies),
+                    mean(accuracies), stdev(accuracies))
         return wrapper
     return dec
 
 
-def unigram_taggers(cls):
-    for do_cg in [False, True]:
-        for unigram_type in range(1,4):
-            unigram_model = 'unigram' + str(unigram_type)
-            name = ('cg' if do_cg else '') + unigram_model
-
-            @cls.reg_experiment
-            @xval_experiment(name)
-            def experiment(self, xval_fns, do_cg=do_cg, unigram_model=unigram_model):
-                tagger_train_sup('unigram', unigram_model, xval_fns['model'], xval_fns['train'])
-                if do_cg:
-                    tagger_input = cg_proc(self.cg_fn, input=xval_fns['src'])
-                else:
-                    tagger_input = xval_fns['src']
-                tagger_tag(unigram_model, xval_fns['model'], input=tagger_input, output=xval_fns['test'])
-            setattr(cls, 'experiment_' + name, experiment)
-    return cls
-
-
 def extract_src(morphology_fn, input, output=None):
     ref_words_iter = extract_words(input=input)
     return lt_proc(morphology_fn, input=ref_words_iter, output=output)
@@ -489,6 +309,13 @@
         self.morphology_fn = pjoin(lang_root, lang + '.automorf.bin')
         self.cg_fn = pjoin(lang_root, lang + '.rlx.bin')
         self.dix_fn = pjoin(lang_root, pair_name + '.dix')
+        if not pexists(self.dix_fn):
+            self.dix_fn = pjoin(lang_root, '.deps', pair_name + '.dix')
+        if lang in NO_TSX_LANGUAGES:
+            self.tsx_fn = None
+        elif lang in TSX_MAP:
+            self.tsx_fn = pjoin(lang_root, TSX_MAP[lang])
+        else:
         self.tsx_fn = pjoin(lang_root, pair_name + '.tsx')
 
         self.text_fns = [pjoin(lang_root, text) for text in texts]
@@ -521,27 +348,38 @@
             self.do_preprocessing()
 
     def validate(self):
+        for fn in [self.morphology_fn, self.cg_fn, self.tsx_fn, self.dix_fn]:
+            if fn is not None and not pexists(fn):
+                raise MissingLanguageDataException(fn=fn)
+
         check_run(["apertium-validate-dictionary", self.dix_fn])
+        if self.tsx_fn is not None:
         check_run(["apertium-validate-tagger", self.tsx_fn])
 
-        for fn in [self.morphology_fn, self.cg_fn, self.tsx_fn, self.dix_fn]:
-            if not pexists(fn):
-                raise MissingLanguageDataException(fn=fn)
-
     def do_preprocessing(self):
         if not isdir(self.work_dir):
             mkdir(self.work_dir)
 
-        strip_unknown_sent(itertools.chain(*(open(fn).readlines() for fn in self.text_fns)), self.joined_fn)
-        #loop.run_until_complete(strip_blanks(self.joined_fn, self.ref_fn))
-        strip_blanks_(self.joined_fn, self.ref_fn)
+        joined = itertools.chain(*(open(fn).readlines()
+                                   for fn in self.text_fns))
+        if self.lang in STRIP_AT_LANGUAGES:
+            strip_unknown_in = strip_at_tag(joined)
+        else:
+            strip_unknown_in = joined
+        strip_unknown_sent(strip_unknown_in, self.joined_fn)
+        strip_blanks(self.joined_fn, self.ref_fn)
         extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn)
-        loop.run_until_complete(fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn, output_fn=self.dic_fn))
+        loop.run_until_complete(
+            fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn,
+                    output_fn=self.dic_fn))
 
         for i, xval_fn in enumerate(self.xval_fns):
-            split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'], self.folds, i)
-            extract_src(self.morphology_fn, input=xval_fn['ref'], output=xval_fn['src'])
-            extract_src(self.morphology_fn, input=xval_fn['train'], output=xval_fn['trainsrc'])
+            split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'],
+                      self.folds, i)
+            extract_src(self.morphology_fn,
+                        input=xval_fn['ref'], output=xval_fn['src'])
+            extract_src(self.morphology_fn,
+                        input=xval_fn['train'], output=xval_fn['trainsrc'])
 
     def _analyse(self, test_fn):
         evaluator = TaggerEvaluator(self.src_fn, self.ref_fn, test_fn)
@@ -552,24 +390,35 @@
     def reg_experiment(cls, name):
         def reg(func):
             LanguageTaggerExperimentor.experiments[name] = func
+            return func
         return reg
 
     @classmethod
+    def needs_tsx(cls, func):
+        func.needs_tsx = True
+        return func
+
+    @classmethod
     def add_experiments(cls):
         for do_cg in [False, True]:
-            for unigram_type in range(1,4):
+            for unigram_type in range(1, 4):
                 unigram_model = 'unigram' + str(unigram_type)
                 name = ('cg' if do_cg else '') + unigram_model
 
                 @cls.reg_experiment(name)
                 @xval_experiment(name)
-                def unigram_experiment(self, xval_fns, do_cg=do_cg, unigram_model=unigram_model):
-                    tagger_train_sup('unigram', unigram_model, xval_fns['model'], xval_fns['train'])
+                def unigram_experiment(self, xval_fns,
+                                       do_cg=do_cg,
+                                       unigram_model=unigram_model):
+                    tagger_train_sup(unigram_model,
+                                     xval_fns['model'], xval_fns['train'])
                     if do_cg:
-                        tagger_input = cg_proc(self.cg_fn, input=xval_fns['src'])
+                        tagger_input = cg_proc(self.cg_fn,
+                                               input=xval_fns['src'])
                     else:
                         tagger_input = xval_fns['src']
-                    tagger_tag(unigram_model, xval_fns['model'], input=tagger_input, output=xval_fns['test'])
+                    tagger_tag(unigram_model, xval_fns['model'],
+                               input=tagger_input, output=xval_fns['test'])
 
         for do_cg in [False, True]:
             name = ('cg' if do_cg else '') + '1st'
@@ -581,7 +430,7 @@
                     tagger_input = cg_proc(self.cg_fn, input=self.src_fn)
                 else:
                     tagger_input = self.src_fn
-                extract_first_analysis(self.src_fn, first_fn)
+                extract_first_analysis(tagger_input, first_fn)
                 return self._analyse(first_fn)
 
         for do_cg in [False, True]:
@@ -595,6 +444,7 @@
 
                     @cls.reg_experiment(name)
                     @xval_experiment(name)
+                    @cls.needs_tsx
                     def model_experiment(self, xval_fns, do_cg=do_cg, is_supervised=is_supervised, model=model, iterations=iterations):
                         if is_supervised:
                             tagger_train_sup(
@@ -612,10 +462,12 @@
                                 tsx_fn=self.tsx_fn,
                                 iterations=iterations)
                         if do_cg:
-                            tagger_input = cg_proc(self.cg_fn, input=xval_fns['src'])
+                            tagger_input = cg_proc(self.cg_fn,
+                                                   input=xval_fns['src'])
                         else:
                             tagger_input = xval_fns['src']
-                        tagger_tag(model, xval_fns['model'], input=tagger_input, output=xval_fns['test'])
+                        tagger_tag(model, xval_fns['model'],
+                                   input=tagger_input, output=xval_fns['test'])
 
     @classmethod
     def all_taggers(cls):
@@ -622,6 +474,9 @@
         return cls.experiments.keys()
 
     def get_tagger(self, tagger):
+        tagger_func = self.experiments[tagger]
+        if self.tsx_fn is None and getattr(tagger_func, 'needs_tsx', False):
+            return None
         return functools.partial(self.experiments[tagger], self)
 
 
@@ -641,22 +496,32 @@
         for lang in args.languages:
             taggers = args.language_texts[lang]
             lang_root = pjoin(args.languagesdir, 'apertium-' + lang)
+
             def mk_experimentor():
-                return LanguageTaggerExperimentor(lang, lang_root, taggers, args.folds, reuse=args.reuse)
+                return LanguageTaggerExperimentor(lang, lang_root, taggers,
+                                                  args.folds, reuse=args.reuse)
             try:
                 experimentor = mk_experimentor()
             except MissingLanguageDataException as e:
                 print("Missing {}... Trying to build it for you.".format(e.fn))
                 with cd(lang_root):
-                    check_run('./autogen.sh')
-                    check_run('make')
+                    check_run(['./autogen.sh'])
+                    check_run(['make'])
                 experimentor = mk_experimentor()
             languages_tagger_accuracies[lang] = {}
             for tagger in args.taggers:
                 experiment = experimentor.get_tagger(tagger)
+                if experiment is None:
+                    print("Skipping {}/{} since it needs a tsx"
+                          .format(lang, tagger))
+                else:
                 languages_tagger_accuracies[lang][tagger] = experiment()
     finally:
-        pprint(languages_tagger_accuracies)
+        result_pretty = pformat(languages_tagger_accuracies)
+        print(result_pretty)
+        outf = pjoin(TMPDIR, 'result-{}.pyson'
+                     .format(datetime.datetime.now().isoformat()))
+        open(outf, 'w').write(result_pretty)
 
 
 if __name__ == '__main__':
Index: branches/apertium-tagger/experiments/shell_utils.py
===================================================================
--- branches/apertium-tagger/experiments/shell_utils.py	(nonexistent)
+++ branches/apertium-tagger/experiments/shell_utils.py	(revision 68807)
@@ -0,0 +1,317 @@
+"""Helpers for running shell commands and building text filters."""
+import io
+import os
+import functools
+from contextlib import contextmanager
+import subprocess
+
+
+@contextmanager
+def cd(newdir):
+    """Temporarily change the working directory to ``newdir``.
+
+    A leading ``~`` is expanded.  The previous working directory is restored
+    when the ``with`` block exits, even if its body raises.
+    """
+    prevdir = os.getcwd()
+    os.chdir(os.path.expanduser(newdir))
+    try:
+        yield
+    finally:
+        os.chdir(prevdir)
+
+
+def read1k_chunker(f):
+    """Return an iterator over ``f`` in chunks of at most 1024 characters.
+
+    Iteration stops at the first empty string read, so ``f`` is expected to
+    be a file opened in text mode.
+    """
+    def read1k():
+        return f.read(1024)
+    return iter(read1k, '')
+
+
+def filter(func=None, iter_filter=False,
+           input_chunker=None, output_separator=''):
+    """Decorator turning ``func`` into a filter over a file or an iterable.
+
+    Usable bare (``@filter``) or with options (``@filter(...)``).
+
+    :param func: called once per input item by default; items for which it
+        returns None are dropped and ``output_separator`` is appended to the
+        others.  With ``iter_filter=True`` it is called once with the whole
+        input iterable and its return value is used as the output iterable.
+    :param iter_filter: see ``func``.
+    :param input_chunker: optional callable turning the opened input file
+        into an iterable; without it the file is read with ``readlines()``.
+    :param output_separator: string appended to every item ``func`` keeps.
+
+    The decorated function is called as ``wrapper(input, output=None)``,
+    where ``input`` is a file name or an iterable.  When ``output`` is None
+    the filtered iterable is returned; otherwise it is written to the file
+    named ``output`` and None is returned.
+    """
+    if func is None:
+        def defer(func=None):
+            return filter(
+                func, iter_filter=iter_filter,
+                input_chunker=input_chunker, output_separator=output_separator)
+        return defer
+
+    def generator(input_iter):
+        for line in input_iter:
+            filtered = func(line)
+            if filtered is not None:
+                yield filtered + output_separator
+
+    @functools.wraps(func)
+    def wrapper(input, output=None):
+        input_file = None
+        if isinstance(input, str):
+            if input_chunker:
+                input_file = open(input)
+                input_iter = input_chunker(input_file)
+            else:
+                # Everything is read up front, so close the file right away.
+                with open(input) as eager_file:
+                    input_iter = eager_file.readlines()
+        else:
+            input_iter = input
+        if iter_filter:
+            gen = func(input_iter)
+        else:
+            gen = generator(input_iter)
+        if output is None:
+            # With an input_chunker the result may still read lazily from
+            # input_file, so that file is deliberately left open here.
+            return gen
+        try:
+            with open(output, 'w') as output_file:
+                for line in gen:
+                    output_file.write(line)
+        finally:
+            if input_file is not None:
+                input_file.close()
+    return wrapper
+
+
+class MapFilter:
+    """Asynchronous iterator that filters and/or transforms another one.
+
+    :param aiterable: async iterator whose ``__anext__`` supplies items.
+    :param pred: optional predicate; items for which it is falsy are skipped.
+    :param tran: optional function applied to each item that is kept.
+    """
+
+    def __init__(self, aiterable, pred=None, tran=None):
+        self.aiterable = aiterable
+        self.pred = pred
+        self.tran = tran
+
+    def __aiter__(self):
+        # Must be a plain method: since Python 3.7 ``async for`` rejects an
+        # __aiter__ that returns an awaitable instead of the iterator itself.
+        return self
+
+    async def __anext__(self):
+        # StopAsyncIteration raised by the underlying iterator propagates and
+        # ends the iteration.
+        while True:
+            payload = await self.aiterable.__anext__()
+            if self.pred is None or self.pred(payload):
+                if self.tran is None:
+                    return payload
+                else:
+                    return self.tran(payload)
+
+
+async def dir_in(input_fn, proc):
+    """Copy the file ``input_fn`` to ``proc.stdin`` in 1024-byte blocks.
+
+    NOTE(review): assumes ``proc`` is an asyncio subprocess with a piped
+    stdin, like the other coroutines in this module.  The stream is not
+    closed afterwards.
+    """
+    # Binary mode: asyncio stream writers only accept bytes.
+    with open(input_fn, 'rb') as input_file:
+        while 1:
+            b = input_file.read(1024)
+            if not len(b):
+                return
+            # write() is not a coroutine; drain() is what waits for the pipe.
+            proc.stdin.write(b)
+            await proc.stdin.drain()
+
+
+async def dir_out(proc, output_fn):
+    """Copy the output of ``proc`` to the file ``output_fn``.
+
+    ``proc`` may be a process object (its ``stdout`` stream is read) or the
+    stream itself.  Reading stops at the first empty read.
+    """
+    stream = getattr(proc, 'stdout', proc)
+    # Binary mode: asyncio stream readers return bytes.
+    with open(output_fn, 'wb') as output_file:
+        while 1:
+            b = await stream.read(1024)
+            if not len(b):
+                return
+            output_file.write(b)
+
+
+class Tee(MapFilter):
+    """Pass items through unchanged while also writing them to ``log_file``.
+
+    ``log_file`` is a file name; it is opened in binary mode, so the items
+    must be bytes.  The log is closed when the wrapped iterator is exhausted.
+    """
+
+    def __init__(self, aiterable, log_file):
+        self.log_file = open(log_file, 'wb')
+        super().__init__(aiterable, tran=self.tran)
+
+    def tran(self, bit):
+        self.log_file.write(bit)
+        return bit
+
+    async def __anext__(self):
+        try:
+            return await super().__anext__()
+        except StopAsyncIteration:
+            # Flush and release the log once the input is finished.
+            self.log_file.close()
+            raise
+
+
+async def pipe(in_proc, out_proc):
+    """Forward ``in_proc.stdout`` to ``out_proc.stdin`` until end of file.
+
+    ``out_proc.stdin`` is not closed afterwards.
+    """
+    while 1:
+        b = await in_proc.stdout.read(16384)
+        if b == b'':
+            return
+        out_proc.stdin.write(b)
+        # Apply back-pressure instead of buffering without bound.
+        await out_proc.stdin.drain()
+
+
+async def writeiter(iter, out_proc):
+    """Write every block of the async iterable ``iter`` to ``out_proc.stdin``.
+
+    A dot is printed every 10000 blocks as a progress indicator, and end of
+    file is signalled on ``out_proc.stdin`` once ``iter`` is exhausted.
+    """
+    i = 0
+    async for block in iter:
+        if (i % 10000) == 0:
+            print(".", end="", flush=True)
+        out_proc.stdin.write(block)
+        # Apply back-pressure instead of buffering without bound.
+        await out_proc.stdin.drain()
+        i += 1
+    print("writeiter done")
+    out_proc.stdin.write_eof()
+
+
+def proc_filter(func=None, output_chunker=None):
+    """Decorator turning a command line into a filter run as a subprocess.
+
+    Usable bare (``@proc_filter``) or with options (``@proc_filter(...)``).
+
+    :param func: either the command itself (a list of strings) or a callable
+        returning the command for the wrapper's remaining arguments.
+    :param output_chunker: optional callable applied to the process's stdout
+        when no output file is given; its result is returned.
+
+    The wrapper accepts the keyword arguments ``input`` (a file name, an
+    iterable of strings, or None) and ``output`` (a file name or None).
+    With ``output`` it returns a ``subprocess.CompletedProcess``; without it
+    it returns the list of output lines, or ``output_chunker(stdout)``.
+    """
+    if func is None:
+        def defer(func):
+            return proc_filter(func, output_chunker=output_chunker)
+        return defer
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        input = kwargs.pop('input', None)
+        output = kwargs.pop('output', None)
+
+        if callable(func):
+            cmd = func(*args, **kwargs)
+        else:
+            cmd = func
+
+        stdin_file = stdout_file = None
+        try:
+            if isinstance(input, str):
+                stdin_file = open(input)
+            if output:
+                stdout_file = open(output, 'w')
+            print("RUNNING: ", ' '.join(cmd))
+            proc = subprocess.Popen(
+                cmd, universal_newlines=True,
+                stdin=subprocess.PIPE if stdin_file is None else stdin_file,
+                stdout=subprocess.PIPE if stdout_file is None else stdout_file)
+        finally:
+            # The child holds its own copies of these descriptors.
+            if stdin_file is not None:
+                stdin_file.close()
+            if stdout_file is not None:
+                stdout_file.close()
+        # NOTE(review): this print and the 'des' one below look like leftover
+        # debugging output; kept so that the output is unchanged.
+        print(type(input))
+        feed = None
+        if stdin_file is None and input is not None:
+            lines = []
+            for line in input:
+                if 'apertium-destxt' in cmd:
+                    print('des', line)
+                lines.append(line)
+            feed = ''.join(lines)
+        if not output and output_chunker:
+            # The chunker may read lazily, so feed everything and close stdin
+            # now; otherwise the child would never see end of input.
+            if feed:
+                proc.stdin.write(feed)
+            if proc.stdin is not None:
+                proc.stdin.close()
+            return output_chunker(proc.stdout)
+        # communicate() feeds stdin, closes it and reads stdout concurrently,
+        # which avoids hanging on a full pipe, and then reaps the child.
+        stdout, stderr = proc.communicate(feed)
+        if not output:
+            return io.StringIO(stdout).readlines()
+        retcode = proc.poll()
+        return subprocess.CompletedProcess(proc.args, retcode, stdout, stderr)
+
+    return wrapper
+
+
+def prun(*args, **kwargs):
+    """Run ``subprocess.run`` with stdin, stdout and stderr piped by default.
+
+    Streams given explicitly by the caller are left untouched.
+    """
+    for key in ('stdin', 'stdout', 'stderr'):
+        if key not in kwargs:
+            kwargs[key] = subprocess.PIPE
+    print('prun', args, kwargs)
+    return subprocess.run(*args, **kwargs)
+
+
+def check_run(cmd, *args, **kwargs):
+    """Run ``cmd`` with ``subprocess.run``, raising if it exits non-zero.
+
+    ``check=True`` is always set, so a failure raises
+    ``subprocess.CalledProcessError``.
+    """
+    kwargs['check'] = True
+    # Accept a plain string too without printing it letter by letter.
+    print("RUNNING: ", cmd if isinstance(cmd, str) else ' '.join(cmd))
+    return subprocess.run(cmd, *args, **kwargs)