Index: branches/apertium-tagger/experiments/experiments.py
===================================================================
--- branches/apertium-tagger/experiments/experiments.py	(revision 69967)
+++ branches/apertium-tagger/experiments/experiments.py	(revision 69968)
@@ -6,7 +6,7 @@
 from shell_wrappers import (cg_proc, extract_first_analysis, tagger_tag,
                             tagger_train_sup, tagger_train_unsup)
 
-# Experiment db
+# Experiment registry
 experiments = {}
 
 
@@ -22,11 +22,21 @@
     return func
 
 
-def non_default(func):
-    func.non_default = True
+def meta(func):  # decorator: tags func and hands the same object back
+    func.meta = True  # marker attribute; its reader is not visible in this chunk -- TODO confirm
     return func
 
 
+def default(func):  # decorator: sets a `default` flag on func and returns it unchanged
+    func.default = True  # the runner reads it via getattr(experiment, 'default', False)
+    return func
+
+
+def cg_extra(func):  # decorator: sets a `cg_extra` flag on func and returns it unchanged
+    func.cg_extra = True  # the runner reads it via getattr(experiment, 'cg_extra', False) with --cg-extra
+    return func
+
+
 # Statistical helpers
 def aggregates(data):
     return (min(data), max(data),
@@ -66,6 +76,7 @@
 
         @reg_experiment(name)
         @xval_experiment(name)
+        @default
         def unigram_experiment(lab, xval_fns,
                                do_cg=do_cg,
                                unigram_model=unigram_model):
@@ -84,6 +95,7 @@
     name = ('cg' if do_cg else '') + '1st'
 
     @reg_experiment(name)
+    @default
     def pick_first_experiment(lab, do_cg=do_cg):
         first_fn = pjoin(lab.work_dir, 'test.' + name)
         if do_cg:
@@ -93,7 +105,14 @@
         extract_first_analysis(tagger_input, first_fn)
         return get_single_analysis(lab, first_fn)
 
-for do_cg in [False, True]:
+for cg_aug in [0, 1, 2, 3, (4, 0), (4, 5), (4, 10), (4, 20), (4, 30)]:
+    if isinstance(cg_aug, tuple):
+        cg_aug, cg_aug_t = cg_aug
+    else:
+        cg_aug_t = None
+    for do_cg in [None, 'in', 'dual', 'inv']:
+        if do_cg == 'inv':
+            continue
     for is_supervised, model in [(True, 'bigram'),
                                  (False, 'bigram'),
                                  (False, 'lwsw')]:
@@ -103,18 +122,20 @@
             iterations_list = [0, 50, 250]
         for iterations in iterations_list:
             name = (
-                "{cg}{sup}_{model}".format(
-                    cg='cg_' if do_cg else '',
+                    "{cgt}{cg}{sup}_{model}".format(
+                        cgt='cgt{}_'.format(cg_aug) if cg_aug else '',
+                        cg='cg{}_'.format(do_cg if do_cg != 'in' else '')
+                            if do_cg else '',
                     sup='sup' if is_supervised else 'unsup',
                     model=model
                 ) +
                 ("_i{iterations}".format(iterations=iterations)
-                 if len(iterations_list) > 1 else ""))
+                     if len(iterations_list) > 1 else "") +
+                    ("_j{iterations}".format(iterations=cg_aug_t)
+                     if cg_aug_t else ""))
 
-            @reg_experiment(name)
-            @xval_experiment(name)
             @needs_tsx
-            def model_experiment(lab, xval_fns, do_cg=do_cg,
+                def model_experiment(lab, xval_fns, cg_aug=cg_aug, do_cg=do_cg,
                                  is_supervised=is_supervised, model=model,
                                  iterations=iterations):
                 if is_supervised:
@@ -124,7 +145,9 @@
                         trainsrc_fn=xval_fns['trainsrc'],
                         dic_fn=lab.dic_fn,
                         tsx_fn=lab.tsx_fn,
-                        iterations=iterations)
+                            iterations=iterations,
+                            cg_aug=cg_aug,
+                            cgtrain_fn=xval_fns['traincgtag'])
                 else:
                     tagger_train_unsup(
                         model, xval_fns['model'],
@@ -131,18 +154,37 @@
                         trainsrc_fn=xval_fns['trainsrc'],
                         dic_fn=lab.dic_fn,
                         tsx_fn=lab.tsx_fn,
-                        iterations=iterations)
-                if do_cg:
-                    tagger_input = cg_proc(lab.cg_fn,
-                                           input=xval_fns['src'])
+                            iterations=iterations,
+                            cg_aug=cg_aug,
+                            cgtrain_fn=xval_fns['traincgtag'])
+                    if do_cg == 'in':
+                        tagger_input = xval_fns['cgtag']
                 else:
                     tagger_input = xval_fns['src']
+                    if do_cg in ['dual', 'inv']:
+                        cg_tagger_input = xval_fns['cgtag']
+                    else:
+                        cg_tagger_input = None
+                    tagger_cg_aug = None
+                    if do_cg == 'dual':
+                        tagger_cg_aug = 1
+                    elif do_cg == 'inv':
+                        tagger_cg_aug = 2
                 tagger_tag(
-                    model, xval_fns['model'], input=tagger_input,
-                    output=xval_fns['test']).check_returncode()
+                        model, xval_fns['model'], cg_fn=cg_tagger_input,
+                        cg_aug=tagger_cg_aug, input=tagger_input,
+                        output=xval_fns['test']
+                    ).check_returncode()
+                if not cg_aug and do_cg in [None, 'in']:
+                    model_experiment = default(model_experiment)
+                else:
+                    model_experiment = cg_extra(model_experiment)
+                model_experiment = xval_experiment(name)(model_experiment)
+                model_experiment = reg_experiment(name)(model_experiment)
 
 
 @reg_experiment('word_count')
+@default
 def word_count(lab):
     count = 0
     for line in open(lab.src_fn):
@@ -153,7 +195,7 @@
 
 @reg_experiment('new_cg_ambg')
 @needs_tsx
-@non_default
+@meta
 def new_cg_ambg(lab):
     model_fn = pjoin(lab.work_dir, 'new_cg_ambg.model')
     tagger_train_sup('bigram', model_fn,
Index: branches/apertium-tagger/experiments/run_experiment.py
===================================================================
--- branches/apertium-tagger/experiments/run_experiment.py	(revision 69967)
+++ branches/apertium-tagger/experiments/run_experiment.py	(revision 69968)
@@ -1,25 +1,24 @@
-from os import mkdir
-from os.path import isdir, exists as pexists, join as pjoin
-from subprocess import PIPE
+import argparse
+import datetime
 import functools
 import itertools
-import aitertools
-import argparse
-import asyncio
-from asyncio.subprocess import create_subprocess_exec
-import os
-import re
 import sys
+from os import mkdir
+from os.path import exists as pexists
+from os.path import join as pjoin
+from os.path import isdir
 from pprint import pformat
-import datetime
 
-from shell_utils import cd, filter, check_run, writeiter, MapFilter
-from shell_wrappers import extract_src
+import asyncio
 from experiments import experiments
+from shell_utils import cd, check_run
+from shell_wrappers import (cg_proc, extract_src, fix_dix, run_cg_conv_clean,
+                            split_n_r, strip_blanks, strip_unknown_sent,
+                            copy_blanks)
 
 loop = asyncio.get_event_loop()
 
-TMPDIR = 'experimenttmp'
+WORK_DIR = 'experiment_work'
 DEFAULT_TEXTS = {
     'cat': ['texts/miscellaneous.tagged.txt'],
     'spa': ['texts/miscellaneous.tagged.txt'],
@@ -97,6 +96,10 @@
         help="Reuse preprocesed dictionary from previous run",
         action='store_true')
     parser.add_argument(
+        '--cg-extra',
+        help="Run extra CG assisted tagging tests",
+        action='store_true')
+    parser.add_argument(
         '--output',
         help="Output file for the results of the experiment")
     parser.add_argument(
@@ -107,94 +110,8 @@
     return parser.parse_args()
 
 
-@filter
-def strip_blanks(line):
-    if line != '\n':
-        return line
-
-
-filter_dix = functools.partial(
-    MapFilter,
-    pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line,
-    tran=lambda line: line.split(b":")[0] + b"\n")
-
-
-async def fix_dix(morphology_fn, tsx_fn, dix_fn, output_fn):
-    pipes = []
-
-    expand_proc = await create_subprocess_exec('lt-expand',
-                                               dix_fn, stdout=PIPE)
-    filtered = filter_dix(expand_proc.stdout)
-
-    extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿",
-              "¡", "“", "”", "«", "»",
-              ]
-    for i, extra in enumerate(extras):
-        extras[i] = (extra + "\n").encode('utf-8')
-    with_extras = aitertools.chain(filtered, extras)
-
-    lt_inpipe, destxt_outpipe = os.pipe()
-    destxt = await create_subprocess_exec('apertium-destxt',
-                                          stdin=PIPE, stdout=destxt_outpipe)
-    os.close(destxt_outpipe)
-
-    pipes.append(writeiter(with_extras, destxt))
-
-    if tsx_fn is not None:
-        filter_ambg_inpipe, lt_outpipe = os.pipe()
-        lt_proc = await create_subprocess_exec(
-            'lt-proc', morphology_fn, stdin=lt_inpipe, stdout=lt_outpipe)
-        os.close(lt_outpipe)
-
-        filter_ambg = await create_subprocess_exec(
-            'apertium-filter-ambiguity', tsx_fn,
-            stdin=filter_ambg_inpipe, stdout=open(output_fn, 'wb'))
-        pipes.append(filter_ambg.wait())
-    else:
-        lt_proc = await create_subprocess_exec(
-            'lt-proc', morphology_fn,
-            stdin=lt_inpipe, stdout=open(output_fn, 'wb'))
-        pipes.append(lt_proc.wait())
-
-    return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes))
-
-
-filter_dix = functools.partial(
-    MapFilter,
-    pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line,
-    tran=lambda line: line.split(b":")[0] + b"\n")
-
-
-SENT_END_RE = re.compile(br'/[.!?]<sent>\$$')
-SELECT_RE = re.compile(br'<SELECT:[0-9]+>')
-
-
-async def cg_conv_clean(input, output):
-    cleanstream_inpipe, cg_conv_outpipe = os.pipe()
-    await create_subprocess_exec(
-        'cg-conv', '--in-cg', '--out-apertium', "--ltr",
-        stdin=open(input, 'r'), stdout=cg_conv_outpipe)
-    os.close(cg_conv_outpipe)
-    cleanstream = await create_subprocess_exec(
-        'apertium-cleanstream', '-n',
-        stdin=cleanstream_inpipe, stdout=PIPE)
-
-    output_f = open(output, 'wb')
-    await cleanstream.stdout.readline()
-    async for line in cleanstream.stdout:
-        line = SELECT_RE.sub(b'', line)
-        output_f.write(line)
-        if SENT_END_RE.search(line):
-            output_f.write(b'\n')
-
-
-def cg_conv_clean_(input, output):
-    return loop.run_until_complete(
-        cg_conv_clean(input, output))
-
-
 PREPROCESSER_MAP = {
-    'cg': cg_conv_clean_
+    'cg': run_cg_conv_clean  # blocking wrapper imported from shell_wrappers
 }
 
 
@@ -225,50 +142,6 @@
 }
 
 
-@filter(iter_filter=True)
-def strip_unknown_sent(gen, invalidate_func=None):
-    buff = []
-    valid_sent = True
-    for line in gen:
-        if line.strip().strip('¶') == '':
-            if valid_sent:
-                for line in buff:
-                    yield line
-                yield '\n'
-            buff = []
-            valid_sent = True
-        else:
-            buff.append(line)
-            if '/*' in line:
-                valid_sent = False
-            if invalidate_func is not None and invalidate_func(line):
-                valid_sent = False
-
-
-def split_n_r(corpus_fn, train_fn, ref_fn, n, r):
-    sentences = 0
-    with open(corpus_fn) as corpus_file:
-        for line in corpus_file.readlines():
-            if line.strip() == '':
-                sentences = sentences + 1
-
-        split_left = int(float(sentences) * r / n)
-        split_right = int(float(sentences) * (r + 1) / n)
-
-        index = 0
-
-        corpus_file.seek(0)
-
-        with open(train_fn, 'w') as train_file, open(ref_fn, 'w') as ref_file:
-            for line in corpus_file.readlines():
-                if line.strip() == '':
-                    index = index + 1
-                elif split_left <= index < split_right:
-                    ref_file.write(line)
-                else:
-                    train_file.write(line)
-
-
 class MissingLanguageDataException(Exception):
     def __init__(self, fn):
         self.fn = fn
@@ -278,7 +151,7 @@
     def __init__(self, lang, lang_root, texts, folds,
                  reuse=False, reuse_dic=False):
         self.lang = lang
-        self.work_dir = pjoin(TMPDIR, lang)
+        self.work_dir = pjoin(WORK_DIR, lang)
 
         pair_name = 'apertium-{0}.{0}'.format(lang)
         self.morphology_fn = pjoin(lang_root, lang + '.automorf.bin')
@@ -304,6 +177,9 @@
         self.joined_fn = pjoin(self.work_dir, 'joined')
         self.ref_fn = pjoin(self.work_dir, 'ref')
         self.src_fn = pjoin(self.work_dir, 'src')
+        self.src_blanks_fn = self.src_fn + '.blanks'
+        self.cgtag_fn = pjoin(self.work_dir, 'cgtag')
+        self.cgtag_blanks_fn = self.cgtag_fn + '.blanks'
         self.dic_fn = pjoin(self.work_dir, 'filtered.dic')
 
         self.xval_fns = []
@@ -311,17 +187,21 @@
 
         for i in range(folds):
             xval_prefix = pjoin(self.work_dir, 'xval.{}.'.format(i))
+            xval_ref_fn = xval_prefix + 'ref'
+            xval_train_fn = xval_prefix + 'train'
             xval_src_fn = xval_prefix + 'src'
             xval_trainsrc_fn = xval_prefix + 'trainsrc'
-            xval_train_fn = xval_prefix + 'train'
-            xval_ref_fn = xval_prefix + 'ref'
+            xval_cgtag_fn = xval_prefix + 'cgtag'
+            xval_traincg_fn = xval_prefix + 'traincgtag'
 
             self.xval_fns.append({
                 'prefix': xval_prefix,
+                'ref': xval_ref_fn,
                 'train': xval_train_fn,
                 'src': xval_src_fn,
-                'ref': xval_ref_fn,
                 'trainsrc': xval_trainsrc_fn,
+                'cgtag': xval_cgtag_fn,
+                'traincgtag': xval_traincg_fn,
             })
 
         self.validate()
@@ -362,6 +242,9 @@
                 self.lang, lambda x: False))
         strip_blanks(self.joined_fn, self.ref_fn)
         extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn)
+        copy_blanks(self.joined_fn, self.src_fn, self.src_blanks_fn)
+        cg_proc(self.cg_fn, input=self.src_fn, output=self.cgtag_fn)
+        copy_blanks(self.joined_fn, self.cgtag_fn, self.cgtag_blanks_fn)
         if not reuse_dic:
             loop.run_until_complete(
                 fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn,
@@ -370,10 +253,10 @@
         for i, xval_fn in enumerate(self.xval_fns):
             split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'],
                       self.folds, i)
-            extract_src(self.morphology_fn,
-                        input=xval_fn['ref'], output=xval_fn['src'])
-            extract_src(self.morphology_fn,
-                        input=xval_fn['train'], output=xval_fn['trainsrc'])
+            split_n_r(self.src_blanks_fn, xval_fn['trainsrc'], xval_fn['src'],
+                      self.folds, i)
+            split_n_r(self.cgtag_blanks_fn, xval_fn['traincgtag'],
+                      xval_fn['cgtag'], self.folds, i)
 
     def get_tagger(self, tagger):
         tagger_func = experiments[tagger]
@@ -388,8 +271,8 @@
 
 def main():
     args = parse_args()
-    if not isdir(TMPDIR):
-        mkdir(TMPDIR)
+    if not isdir(WORK_DIR):
+        mkdir(WORK_DIR)
     if args.notify:
         import notify2
 
@@ -399,24 +282,26 @@
             taggers = args.language_texts[lang]
             lang_root = pjoin(args.languagesdir, 'apertium-' + lang)
 
-            def mk_experimentor():
+            def mk_lab():
                 return LanguageTaggerLab(lang, lang_root, taggers,
                                          args.folds, reuse=args.reuse,
                                          reuse_dic=args.reuse_dic)
             try:
-                experimentor = mk_experimentor()
+                lab = mk_lab()
             except MissingLanguageDataException as e:
                 print("Missing {}... Trying to build it for you.".format(e.fn))
                 with cd(lang_root):
                     check_run(['./autogen.sh'])
                     check_run(['make'])
-                experimentor = mk_experimentor()
+                lab = mk_lab()
             languages_tagger_accuracies[lang] = {}
             for tagger in args.taggers:
-                experiment = experimentor.get_tagger(tagger)
+                experiment = lab.get_tagger(tagger)
                 if experiment is None:
                     print("Skipping {}/{} since it needs a tsx"
                           .format(lang, tagger))
+                elif getattr(experiment, 'default', False) and
+                    not (args.cg_extra and getattr(experiment, 'cg_extra', False)):
                 else:
                     languages_tagger_accuracies[lang][tagger] = experiment()
     finally:
@@ -425,7 +310,7 @@
         if args.output:
             outf = args.output
         else:
-            outf = pjoin(TMPDIR, 'result-{}.pyson'
+            outf = pjoin(WORK_DIR, 'result-{}.pyson'
                          .format(datetime.datetime.now().isoformat()))
         open(outf, 'w').write(result_pretty)
 
Index: branches/apertium-tagger/experiments/shell_wrappers.py
===================================================================
--- branches/apertium-tagger/experiments/shell_wrappers.py	(revision 69967)
+++ branches/apertium-tagger/experiments/shell_wrappers.py	(revision 69968)
@@ -1,6 +1,20 @@
-from shell_utils import filter, proc_filter
+import functools
+import os
+import re
+from subprocess import PIPE
 
+import aitertools
+import asyncio
+from asyncio.subprocess import create_subprocess_exec
+from shell_utils import MapFilter, filter, proc_filter, writeiter
 
+loop = asyncio.get_event_loop()
+
+BYTES_SENT_END_RE = re.compile(br'/[.!?]<sent>\$$')
+SENT_END_RE = re.compile(r'/[.!?]<sent>\$$')
+SELECT_RE = re.compile(br'<SELECT:[0-9]+>')
+
+
 @proc_filter
 def lt_proc(morphology_fn, dictcase=False):
     cmd = ['lt-proc', morphology_fn]
@@ -11,7 +25,10 @@
 
 @filter(output_separator='\n')
 def extract_words(line):
+    if line:  # an empty line would otherwise raise IndexError on split('^')[1]
-    return line.split('^')[1].split('/')[0]
+        return line.split('^')[1].split('/')[0]  # text between '^' and the first '/'
+    else:
+        return ''
 
 
 @filter(output_separator='\n')
@@ -19,6 +36,15 @@
     return '/'.join(line.split('/')[0:2]).strip().rstrip('$') + '$'
 
 
+@filter(iter_filter=True)
+def add_sentence_newlines(iter):  # generator over byte lines: every input line is emitted exactly once
+    for line in iter:
+        if BYTES_SENT_END_RE.search(line):
+            yield line + b'\n'  # extra newline after a sentence end, matching what cg_conv_clean writes
+        else:
+            yield line  # yield (not return) so the loop continues past the first line
+
+
 def extract_src(morphology_fn, input, output=None):
     ref_words_iter = extract_words(input=input)
     return lt_proc(morphology_fn, input=ref_words_iter, output=output)
@@ -45,7 +71,8 @@
 
 @proc_filter
 def tagger_train_sup(model_type, model_fn, train_fn,
-                     trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0):
+                     trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0,
+                     cg_aug=0, cgtrain_fn=None):
     cmd = ['apertium-tagger', '--supervised={}'.format(iterations), model_fn]
     if (not all((trainsrc_fn, dic_fn, tsx_fn)) and
             not model_type.startswith('unigram')):
@@ -58,6 +85,9 @@
         cmd[2:2] = [dic_fn, trainsrc_fn, tsx_fn]
         cmd.append(train_fn)
         cmd.append(trainsrc_fn)
+    if cg_aug:
+        cmd.insert(2, '--cg-augmented={}'.format(cg_aug))
+        cmd.insert(5, cgtrain_fn)
     insert_model(cmd, model_type)
     return cmd
 
@@ -64,19 +94,176 @@
 
 @proc_filter
 def tagger_train_unsup(model_type, model_fn, trainsrc_fn, dic_fn, tsx_fn,
-                       iterations=0):
+                       iterations=0, cg_aug=0, ambg_classes=10,
+                       cgtrain_fn=None):
-    if model_fn.startswith('unigram'):
+    if model_type.startswith('unigram'):  # test the model type (as tagger_train_sup does), not the file path
         raise ValueError("No unsupervised training for unigram models")
-    cmd = ['apertium-tagger', '--train={}'.format(iterations),
-           dic_fn, trainsrc_fn, tsx_fn, model_fn]
-    insert_model(cmd, model_type)
+    cmd = ['apertium-tagger', '--train={}'.format(iterations)]  # remaining argv is appended in order below
+    if model_type == 'lwsw':
+        cmd.append('--sliding-window')
+    if cg_aug:
+        cmd.append('--cg-augmented={}'.format(cg_aug))
+    if cg_aug == 4:
+        cmd.append(str(ambg_classes))  # mode 4 additionally passes ambg_classes as an argument
+    cmd.append(dic_fn)
+    if cg_aug != 1 and cg_aug != 4:
+        cmd.append(trainsrc_fn)  # modes 1 and 4 leave out the plain training source
+    if cg_aug != 0:
+        cmd.append(cgtrain_fn)  # NOTE(review): caller must pass cgtrain_fn whenever cg_aug is set
+    cmd.extend([tsx_fn, model_fn])
     return cmd
 
 
 @proc_filter
-def tagger_tag(model_type, model_fn, debug=False):
+def tagger_tag(model_type, model_fn, cg_fn=None, cg_aug=None, debug=False):  # returns the apertium-tagger argv
     cmd = ['apertium-tagger', '--tagger', '--show-superficial', model_fn]
+    if cg_aug:  # None or 0 leaves the flag off
+        cmd.append('--cg-augmented={}'.format(cg_aug))
+    if cg_fn:  # optional path appended after model_fn (and after the flag when present)
+        cmd.append(cg_fn)
     if debug:
         cmd.insert(1, '--debug')
     insert_model(cmd, model_type, tagging=True)
     return cmd
+
+
+@proc_filter
+def cleanstream():  # returns the argv for apertium-cleanstream; takes no options
+    return ['apertium-cleanstream', '-n']
+
+
+async def cg_conv_clean(input, output):  # cg-conv piped into apertium-cleanstream -n, post-processed into `output`
+    cleanstream_inpipe, cg_conv_outpipe = os.pipe()
+    await create_subprocess_exec(
+        'cg-conv', '--in-cg', '--out-apertium', "--ltr",
+        stdin=open(input, 'r'), stdout=cg_conv_outpipe)
+    os.close(cg_conv_outpipe)  # parent closes its copy of the pipe's write end
+    cleanstream_proc = await create_subprocess_exec(
+        'apertium-cleanstream', '-n',
+        stdin=cleanstream_inpipe, stdout=PIPE)
+    # `with` closes and flushes the output file even if the loop raises.
+    with open(output, 'wb') as output_f:
+        await cleanstream_proc.stdout.readline()  # the first output line is read and discarded
+        async for line in cleanstream_proc.stdout:
+            line = SELECT_RE.sub(b'', line)  # remove <SELECT:n> tags
+            output_f.write(line)
+            if BYTES_SENT_END_RE.search(line):
+                output_f.write(b'\n')  # blank line after each sentence-final token
+
+
+def run_cg_conv_clean(input, output):  # blocking wrapper: runs the cg_conv_clean coroutine on the module loop
+    return loop.run_until_complete(
+        cg_conv_clean(input, output))
+
+
+@filter
+def strip_blanks(line):  # per-line filter: returns every line except a bare '\n'
+    if line != '\n':  # a bare newline falls through to an implicit None; presumably @filter drops it -- confirm
+        return line
+
+
+filter_dix = functools.partial(  # MapFilter preconfigured for the lt-expand output consumed in fix_dix
+    MapFilter,
+    pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line,  # true when neither marker occurs
+    tran=lambda line: line.split(b":")[0] + b"\n")  # keep only the bytes before the first ':'
+
+
+async def fix_dix(morphology_fn, tsx_fn, dix_fn, output_fn):  # lt-expand -> destxt -> lt-proc [-> filter-ambiguity]
+    pipes = []  # awaitables gathered at the end
+
+    expand_proc = await create_subprocess_exec('lt-expand',
+                                               dix_fn, stdout=PIPE)
+    filtered = filter_dix(expand_proc.stdout)
+    # Extra entries appended after the expanded dictionary lines.
+    extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿",
+              "¡", "“", "”", "«", "»",
+              ]
+    for i, extra in enumerate(extras):
+        extras[i] = (extra + "\n").encode('utf-8')  # filtered lines are bytes, so the extras must be too
+    with_extras = aitertools.chain(filtered, extras)
+
+    lt_inpipe, destxt_outpipe = os.pipe()
+    destxt = await create_subprocess_exec('apertium-destxt',
+                                          stdin=PIPE, stdout=destxt_outpipe)
+    os.close(destxt_outpipe)  # parent closes its copy of the write end
+
+    pipes.append(writeiter(with_extras, destxt))
+
+    if tsx_fn is not None:  # with a tsx, lt-proc output goes through apertium-filter-ambiguity
+        filter_ambg_inpipe, lt_outpipe = os.pipe()
+        lt_proc = await create_subprocess_exec(
+            'lt-proc', morphology_fn, stdin=lt_inpipe, stdout=lt_outpipe)
+        os.close(lt_outpipe)
+
+        filter_ambg = await create_subprocess_exec(
+            'apertium-filter-ambiguity', tsx_fn,
+            stdin=filter_ambg_inpipe, stdout=open(output_fn, 'wb'))  # NOTE(review): handle is never closed here
+        pipes.append(filter_ambg.wait())
+    else:  # without a tsx, lt-proc writes output_fn directly
+        lt_proc = await create_subprocess_exec(
+            'lt-proc', morphology_fn,
+            stdin=lt_inpipe, stdout=open(output_fn, 'wb'))  # NOTE(review): handle is never closed here
+        pipes.append(lt_proc.wait())
+
+    return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes))
+
+
+filter_dix = functools.partial(  # NOTE(review): identical to the filter_dix assignment before fix_dix; redundant
+    MapFilter,
+    pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line,
+    tran=lambda line: line.split(b":")[0] + b"\n")
+
+
+@filter(iter_filter=True)
+def strip_unknown_sent(gen, invalidate_func=None):  # generator: re-emits only sentences with no invalidated line
+    buff = []  # lines of the sentence currently being collected
+    valid_sent = True
+    for line in gen:  # a final sentence with no closing separator line is never emitted
+        if line.strip().strip('¶') == '':  # empty after stripping whitespace and '¶': sentence separator
+            if valid_sent:
+                for line in buff:
+                    yield line
+                yield '\n'  # one blank line terminates each emitted sentence
+            buff = []
+            valid_sent = True
+        else:
+            buff.append(line)
+            if '/*' in line:  # presumably an unknown-word analysis marker -- confirm
+                valid_sent = False
+            if invalidate_func is not None and invalidate_func(line):  # caller-supplied veto per line
+                valid_sent = False
+
+
+def split_n_r(corpus_fn, train_fn, ref_fn, n, r):  # fold r of n goes to ref_fn, all other sentences to train_fn
+    sentences = 0
+    with open(corpus_fn) as corpus_file:
+        for line in corpus_file.readlines():
+            if line.strip() == '':  # each whitespace-only line counts as one sentence boundary
+                sentences += 1
+        # Sentence-index window [split_left, split_right) selects the held-out fold.
+        split_left = int(float(sentences) * r / n)
+        split_right = int(float(sentences) * (r + 1) / n)
+
+        index = 0
+
+        corpus_file.seek(0)  # rewind for the second pass over the same file
+
+        with open(train_fn, 'w') as train_file, open(ref_fn, 'w') as ref_file:
+            for line in corpus_file.readlines():
+                if line.strip() == '':  # boundary lines advance the index and are written to neither file
+                    index += 1
+                elif split_left <= index < split_right:
+                    ref_file.write(line)
+                else:
+                    train_file.write(line)
+
+
+def copy_blanks(blanks_fn, input_fn, output_fn):  # writes input_fn's lines with blanks_fn's blank lines re-inserted
+    # The context manager closes (and flushes) all three files on exit.
+    with open(blanks_fn) as blanks, open(input_fn) as input, \
+            open(output_fn, 'w') as output:
+        for line in blanks:
+            if line.strip() == '':
+                output.write('\n')
+            else:
+                output.write(input.readline())  # the next input line fills each non-blank slot