Index: branches/apertium-tagger/experiments/add_to_wikitable.py
===================================================================
--- branches/apertium-tagger/experiments/add_to_wikitable.py (revision 69202)
+++ branches/apertium-tagger/experiments/add_to_wikitable.py (revision 69203)
@@ -1,6 +1,7 @@
 # -*- encoding: utf-8 -*-
 import sys
+import locale
 import mwparserfromhell
 from mwparserfromhell.nodes.tag import Tag
 from mwparserfromhell.nodes.text import Text
@@ -79,6 +80,10 @@
     return "{0:.2f}".format(value * 100)
 
 
+def result_to_str(result):
+    return '{}, {}'.format(result[0], result[1])
+
+
 def mk_title_td(title):
     return Tag(
         'td',
@@ -102,6 +107,13 @@
         contents="\n" if is_last else "",
         closing_wiki_markup='')
 
+def mk_wc_td(val, is_first=False, is_last=False):
+    return Tag(
+        'td',
+        wiki_markup='!' if is_first else '!!',
+        contents=" {}{}".format(val, "\n" if is_last else " "),
+        closing_wiki_markup='')
+
 def mk_initial_tr(title):
     return Tag(
         'tr',
@@ -110,7 +122,6 @@
         closing_wiki_markup='')
 
 input_table = sys.stdin.read()
-input_data = eval(open(sys.argv[1]).read())
 
 lang_order = []
 
@@ -126,8 +137,6 @@
         lang_order.append(LANG_NAME_CODE_MAP[title])
 
 def insert_into_tr(tr, col_idx, val_str):
-    print("insert_into_tr", tr, col_idx, val_str)
-    print('tr.contents.nodes', tr.contents.nodes)
     if len(tr.contents.nodes) <= col_idx:
         last_td = tr.contents.get(-1)
         if last_td.contents.endswith('\n'):
@@ -137,20 +146,50 @@
             tr.contents.append(mk_empty_td(is_last=True))
     target_cell = tr.contents.get(col_idx)
     has_newline = target_cell.contents.endswith('\n')
-    print('target_cell', target_cell)
     val_td = mk_val_td(val_str, is_last=has_newline)
-    print('replacement cell', val_td)
     tr.contents.set(col_idx, val_td)
-    print('tr after', tr)
 
+def insert_into_wc(tr, col_idx, val_str):
+    target_cell = tr.contents.get(col_idx)
+    has_newline = target_cell.contents.endswith('\n')
+    is_first = len(target_cell.wiki_markup) == 1
+    val_td = mk_wc_td(val_str, is_first=is_first, is_last=has_newline)
+    tr.contents.set(col_idx, val_td)
+
+def format_word_count(word_count):
+    locale.setlocale(locale.LC_ALL, 'en_US')
+    number = locale.format("%d", word_count, grouping=True)
+    return "{}".format(number)
+
+if sys.argv[1] in LANG_CODE_NAME_MAP:
+    # blank out column
+    col_idx = lang_order.index(sys.argv[1]) + 1
+    table_idx = 3
+    while table_idx < len(table_inner.nodes):
+        tr = table_inner.get(table_idx)
+        if len(tr.contents.nodes) > col_idx:
+            if tr.contents.get(col_idx).contents.endswith('\n'):
+                tr.contents.get(col_idx).contents = '\n'
+            else:
+                tr.contents.get(col_idx).contents = ''
+        table_idx += 1
+    print(table)
+    sys.exit()
+
+input_data = eval(open(sys.argv[1]).read())
+
 for lang, data in input_data.items():
     lang_idx = lang_order.index(lang)
-    data = [(name_to_attrs(name), value_to_str(value)) for name, value in data.items()]
+    col_idx = lang_idx + 1
+    word_count = data.pop('word_count', None)
+    if word_count is not None:
+        word_count_tr = table_inner.get(3)
+        insert_into_wc(word_count_tr, col_idx, format_word_count(word_count))
+    data = [(name_to_attrs(name), result_to_str(value)) for name, value in data.items()]
     data = sorted(data, key=lambda pair: attrs_to_sort_tuple(pair[0]))
     table_idx = 4
     for attrs, val_str in data:
         title_str = attrs_to_str(attrs)
-        col_idx = lang_idx + 1
         while table_idx < len(table_inner.nodes):
             tr = table_inner.get(table_idx)
             if len(tr.contents) > 1:
Index: branches/apertium-tagger/experiments/evaluate_tagger.py
===================================================================
--- branches/apertium-tagger/experiments/evaluate_tagger.py (revision 69202)
+++ branches/apertium-tagger/experiments/evaluate_tagger.py (revision 69203)
@@ -165,6 +165,8 @@
         self.n_truenegative = 0
         self.n_falsepositive = 0
         self.n_falsenegative = 0
+        self.n_analysis_available = 0
+        self.n_analysis_unavailable = 0
         self.n_ref_readings = 0
         self.n_src_readings = 0
@@ -279,6 +281,13 @@
         #}
         #}
 
+        for ref_reading in ref_readings:
+            if ref_reading not in src_readings:
+                print('[' + str(n_line) + '] UNAVAILABLE:', ref_reading, src_readings)
+                self.n_analysis_unavailable += 1
+            else:
+                self.n_analysis_available += 1
+
         for ref_reading in ref_readings:  # {
             if ref_reading not in tst_readings:  # {
                 print('[' + str(n_line) + '] FALSENEG:', ref_reading, tst_readings)
@@ -362,6 +371,10 @@
         return float(self.n_truepositive) / (float(self.n_truepositive + self.n_falsenegative))
 
     @property
+    def recall_available(self):
+        return float(self.n_truepositive) / float(self.n_analysis_available)
+
+    @property
     def accuracy(self):
         return float(self.n_truepositive + self.n_truenegative) / \
             (float(self.n_truepositive + self.n_falsenegative +
@@ -390,11 +403,14 @@
         print('trueneg :\t', self.n_truenegative)
         print('falsepos :\t', self.n_falsepositive)
         print('falseneg :\t', self.n_falsenegative)
+        print('analysis available :\t', self.n_analysis_available)
+        print('analysis unavailable :\t', self.n_analysis_unavailable)
         print('')
         print('precision:\t', self.precision, '\t( true pos / all pos )')
         print('recall :\t', self.recall, '\t( true pos / (true pos + false neg) )')
+        print('recall available :\t', self.recall_available, '\t( true pos / (src in ref) )')
         print('accuracy :\t', self.accuracy, '\t((true pos + true neg) / (everything) )')
         print('')
Index: branches/apertium-tagger/experiments/requirements.txt
===================================================================
--- branches/apertium-tagger/experiments/requirements.txt (revision 69202)
+++ branches/apertium-tagger/experiments/requirements.txt (revision 69203)
@@ -1,3 +1,5 @@
 aitertools==0.1.0
 tabulate==0.7.5
 -e git+https://github.com/frankier/streamparser.git@setup-py#egg=streamparser
+dbus-python==1.2.4
+notify2==0.3
Index: branches/apertium-tagger/experiments/run_experiment.py
===================================================================
--- branches/apertium-tagger/experiments/run_experiment.py (revision 69202)
+++ branches/apertium-tagger/experiments/run_experiment.py (revision 69203)
@@ -10,6 +10,7 @@
 from asyncio.subprocess import create_subprocess_exec
 import os
 import re
+import sys
 from pprint import pformat
 import datetime
 
@@ -26,18 +27,22 @@
     'spa': ['texts/miscellaneous.tagged.txt'],
     'hbs': ['hbs-tagger-data/hbs.tagged.txt'],
     'rus': [
-        # 'texts/son-smešnogo-čeloveka.ana.txt', incorrect format?
-        'texts/dva-samoubijstva.ana.txt'
+        # seems to be only one in correct format?
+        'texts/dva-samoubijstva.ana.txt'
     ],
     'kaz': ['eval/ref.1000.txt'],
     'por': [
-        'texts/água.txt',  # ambiguous
-        'texts/raio.txt',  # ambiguous
-        # 'texts/música.txt', ambiguous, syntax errors
-        'texts/akatsuki.txt',  # ambiguous
-        'texts/beringia.txt',  # ambiguous
-        'texts/cultura.txt',  # ambiguous
-        'texts/bering.txt',  # ambiguous
+        'cg:água.tagged.txt',
+        'cg:anfetamina.tagged.txt',
+        'cg:bering.tagged.txt',
+        'cg:cultura.tagged.txt',
+        'cg:myanmar.tagged.txt',
+        'cg:wikitravel.tagged.txt',
+        'cg:akatsuki.tagged.txt',
+        'cg:beringia.tagged.txt',
+        'cg:catalunha.tagged.txt',
+        'cg:música.tagged.txt',
+        'cg:raio.tagged.txt',
     ],
 }
 TSX_MAP = {
@@ -90,9 +95,16 @@
         help="Reuse preprocesed dictionary and corpa from previous run",
         action='store_true')
     parser.add_argument(
+        '--reuse-dic',
+        help="Reuse preprocessed dictionary from previous run",
+        action='store_true')
+    parser.add_argument(
         '--output',
-        help="Output file for the results of the experiment",
-        action='store_string')
+        help="Output file for the results of the experiment")
+    parser.add_argument(
+        '--notify',
+        help="Produce a desktop notification when done",
+        action='store_true')
     return parser.parse_args()
@@ -186,7 +198,8 @@
     filtered = filter_dix(expand_proc.stdout)
     extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿",
-              "¡"]
+              "¡", "“", "”", "«", "»",
+              ]
     for i, extra in enumerate(extras):
         extras[i] = (extra + "\n").encode('utf-8')
     with_extras = aitertools.chain(filtered, extras)
@@ -217,6 +230,45 @@
     return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes))
 
 
+filter_dix = functools.partial(
+    MapFilter,
+    pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line,
+    tran=lambda line: line.split(b":")[0] + b"\n")
+
+
+SENT_END_RE = re.compile(br'/[.!?]\$$')
+SELECT_RE = re.compile(br'')
+
+
+async def cg_conv_clean(input, output):
+    cleanstream_inpipe, cg_conv_outpipe = os.pipe()
+    await create_subprocess_exec(
+        'cg-conv', '--in-cg', '--out-apertium', "--ltr",
+        stdin=open(input, 'r'), stdout=cg_conv_outpipe)
+    os.close(cg_conv_outpipe)
+    cleanstream = await create_subprocess_exec(
+        'apertium-cleanstream', '-n',
+        stdin=cleanstream_inpipe, stdout=PIPE)
+
+    output_f = open(output, 'wb')
+    await cleanstream.stdout.readline()
+    async for line in cleanstream.stdout:
+        line = SELECT_RE.sub(b'', line)
+        output_f.write(line)
+        if SENT_END_RE.search(line):
+            output_f.write(b'\n')
+
+
+def cg_conv_clean_(input, output):
+    return loop.run_until_complete(
+        cg_conv_clean(input, output))
+
+
+PREPROCESSER_MAP = {
+    'cg': cg_conv_clean_
+}
+
+
 @proc_filter
 def cg_proc(grammar_fn, dictcase=True):
     cmd = ['cg-proc', grammar_fn]
@@ -225,22 +277,33 @@
     return cmd
 
 
-AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>')
-PRPERS_TAG = re.compile(r'/\+')
+#AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>')
+#PRPERS_TAG = re.compile(r'/\+')
 
 
-@filter
-def cleanup_rus(line):
-    return PRPERS_TAG.sub('/', AT_TAG_REGEX.sub('', line))
+#@filter
+#def cleanup_rus(line):
+    #return PRPERS_TAG.sub('/', AT_TAG_REGEX.sub('', line))
 
 
-TEXT_CLEANUP_MAP = {
-    'rus': cleanup_rus,
+#TEXT_CLEANUP_MAP = {
+    #'rus': cleanup_rus,
+#}
+
+def invalidate_por(line):
+    # All these are the wrong way around!
+    return ('/$' in line
+            or line.startswith('^beta-fenetilamina/')  # gets analysed as two words
+            or line.startswith('^km²/')  # ^2 ends up outside analysis
+            or ('./' in line and '<sent>' not in line))  # ends up as a multiword
+
+LANGUAGE_INVALIDATOR_MAP = {
+    'por': invalidate_por
 }
 
 
 @filter(iter_filter=True)
-def strip_unknown_sent(gen):
+def strip_unknown_sent(gen, invalidate_func=None):
     buff = []
     valid_sent = True
     for line in gen:
@@ -255,6 +318,8 @@
             buff.append(line)
         if '/*' in line:
             valid_sent = False
+        if invalidate_func is not None and invalidate_func(line):
+            valid_sent = False
 
 
 def split_n_r(corpus_fn, train_fn, ref_fn, n, r):
@@ -286,11 +351,16 @@
         self.fn = fn
 
 
+def aggregates(data):
+    return (min(data), max(data),
+            mean(data), stdev(data))
+
 def xval_experiment(name):
     def dec(func=None):
         @functools.wraps(func)
         def wrapper(self, *args, **kwargs):
-            accuracies = []
+            recall = []
+            recall_available = []
             for i, xval_fns in enumerate(self.xval_fns):
                 xval_fns['test'] = xval_fns['prefix'] + 'test.' + name
                 xval_fns['model'] = xval_fns['prefix'] + 'model.' + name
@@ -298,9 +368,9 @@
                 evaluator = TaggerEvaluator(
                     xval_fns['src'], xval_fns['ref'], xval_fns['test'])
                 evaluator.run_analysis()
-                accuracies.append(evaluator.accuracy)
-            return (min(accuracies), max(accuracies),
-                    mean(accuracies), stdev(accuracies))
+                recall.append(evaluator.recall)
+                recall_available.append(evaluator.recall_available)
+            return (aggregates(recall), aggregates(recall_available))
         return wrapper
     return dec
@@ -315,7 +385,7 @@
 
     experiments = {}
 
-    def __init__(self, lang, lang_root, texts, folds, reuse=False):
+    def __init__(self, lang, lang_root, texts, folds, reuse=False, reuse_dic=False):
         self.lang = lang
 
         self.work_dir = pjoin(TMPDIR, lang)
@@ -332,7 +402,14 @@
         else:
             self.tsx_fn = pjoin(lang_root, pair_name + '.tsx')
 
-        self.text_fns = [pjoin(lang_root, text) for text in texts]
+        self.text_fns = []
+        for text in texts:
+            if ':' in text:
+                preprocesser_name, text = text.split(':', 1)
+            else:
+                preprocesser_name = None
+            self.text_fns.append((preprocesser_name,
+                                  pjoin(lang_root, 'texts', text)))
         self.joined_fn = pjoin(self.work_dir, 'joined')
         self.ref_fn = pjoin(self.work_dir, 'ref')
         self.src_fn = pjoin(self.work_dir, 'src')
@@ -359,7 +436,7 @@
         self.validate()
 
         if not reuse:
-            self.do_preprocessing()
+            self.do_preprocessing(reuse_dic=reuse_dic)
 
     def validate(self):
         for fn in [self.morphology_fn, self.cg_fn, self.tsx_fn, self.dix_fn]:
@@ -370,19 +447,28 @@
         if self.tsx_fn is not None:
             check_run(["apertium-validate-tagger", self.tsx_fn])
 
-    def do_preprocessing(self):
+    def do_preprocessing(self, reuse_dic=False):
         if not isdir(self.work_dir):
             mkdir(self.work_dir)
+        preprocessed_texts = []
+        for i, (preprocesser_name, fn) in enumerate(self.text_fns):
+            if preprocesser_name:
+                preprocesser = PREPROCESSER_MAP.get(preprocesser_name)
+                cleaned_fn = pjoin(self.work_dir, 'cleaned.{}.{}.txt'.format(i, preprocesser_name))
+                preprocesser(input=fn, output=cleaned_fn)
+                preprocessed_texts.append(cleaned_fn)
+            else:
+                preprocessed_texts.append(fn)
+
         joined = itertools.chain(*(open(fn).readlines()
-                                   for fn in self.text_fns))
-        if self.lang in TEXT_CLEANUP_MAP:
-            strip_unknown_in = TEXT_CLEANUP_MAP[self.lang](joined)
-        else:
-            strip_unknown_in = joined
-        strip_unknown_sent(strip_unknown_in, self.joined_fn)
+                                   for fn in preprocessed_texts))
+        strip_unknown_sent(
+            joined, self.joined_fn,
+            invalidate_func=LANGUAGE_INVALIDATOR_MAP.get(self.lang, lambda x: False))
         strip_blanks(self.joined_fn, self.ref_fn)
         extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn)
-        loop.run_until_complete(
-            fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn,
-                    output_fn=self.dic_fn))
+        if not reuse_dic:
+            loop.run_until_complete(
+                fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn,
+                        output_fn=self.dic_fn))
@@ -398,7 +484,7 @@
     def _analyse(self, test_fn):
         evaluator = TaggerEvaluator(self.src_fn, self.ref_fn, test_fn)
         evaluator.run_analysis()
-        return evaluator.accuracy
+        return (evaluator.recall, evaluator.recall_available)
 
     @classmethod
     def reg_experiment(cls, name):
@@ -432,7 +518,7 @@
             else:
                 tagger_input = xval_fns['src']
             tagger_tag(unigram_model, xval_fns['model'],
-                       input=tagger_input, output=xval_fns['test'])
+                       input=tagger_input, output=xval_fns['test']).check_returncode()
 
     for do_cg in [False, True]:
         name = ('cg' if do_cg else '') + '1st'
@@ -449,12 +535,19 @@
     for do_cg in [False, True]:
         for is_supervised, model in [(True, 'bigram'), (False, 'bigram'),
                                      (False, 'lwsw')]:
-            for iterations in [0, 50, 250]:
-                name = "{cg}{sup}_{model}_i{iterations}".format(
+            if is_supervised:
+                iterations_list = [0]
+            else:
+                iterations_list = [0, 50, 250]
+            for iterations in iterations_list:
+                name = (
+                    "{cg}{sup}_{model}".format(
                         cg='cg_' if do_cg else '',
                         sup='sup' if is_supervised else 'unsup',
-                        model=model,
-                        iterations=iterations)
+                        model=model
+                    )
+                    + ("_i{iterations}".format(iterations=iterations)
+                       if len(iterations_list) > 1 else ""))
 
                 @cls.reg_experiment(name)
                 @xval_experiment(name)
@@ -481,8 +574,16 @@
             else:
                 tagger_input = xval_fns['src']
             tagger_tag(model, xval_fns['model'],
-                       input=tagger_input, output=xval_fns['test'])
+                       input=tagger_input, output=xval_fns['test']).check_returncode()
 
+    @cls.reg_experiment('word_count')
+    def word_count(self):
+        count = 0
+        for line in open(self.src_fn):
+            if line.strip() != '':
+                count += 1
+        return count
+
     @classmethod
     def all_taggers(cls):
         return cls.experiments.keys()
@@ -504,6 +605,8 @@
     args = parse_args()
     if not isdir(TMPDIR):
         mkdir(TMPDIR)
+    if args.notify:
+        import notify2
 
     languages_tagger_accuracies = {}
     try:
@@ -513,7 +616,8 @@
 
             def mk_experimentor():
                 return LanguageTaggerExperimentor(lang, lang_root, taggers,
-                                                  args.folds, reuse=args.reuse)
+                                                  args.folds, reuse=args.reuse,
+                                                  reuse_dic=args.reuse_dic)
             try:
                 experimentor = mk_experimentor()
             except MissingLanguageDataException as e:
@@ -540,7 +644,12 @@
                 .format(datetime.datetime.now().isoformat()))
         open(outf, 'w').write(result_pretty)
 
+    if args.notify:
+        notify2.init("Tagger experiment finished")
+        notice = notify2.Notification(' '.join(sys.argv), "Tagger experiment finished")
+        notice.show()
+
 
 if __name__ == '__main__':
     try:
         main()
Index: branches/apertium-tagger/experiments/shell_utils.py
===================================================================
--- branches/apertium-tagger/experiments/shell_utils.py (revision 69202)
+++ branches/apertium-tagger/experiments/shell_utils.py (revision 69203)
@@ -29,14 +29,14 @@
                           input_chunker=input_chunker,
                           output_separator=output_separator)
         return defer
 
-    def generator(input_iter):
+    def generator(input_iter, **kwargs):
         for line in input_iter:
-            filtered = func(line)
+            filtered = func(line, **kwargs)
            if filtered is not None:
                 yield filtered + output_separator
 
     @functools.wraps(func)
-    def wrapper(input, output=None):
+    def wrapper(input, output=None, **kwargs):
         input_file = None
         if isinstance(input, str):
             input_iter = input_file = open(input)
@@ -47,9 +47,9 @@
         else:
             input_iter = input
         if iter_filter:
-            gen = func(input_iter)
+            gen = func(input_iter, **kwargs)
         else:
-            gen = generator(input_iter)
+            gen = generator(input_iter, **kwargs)
         if output is None:
             return gen
         output_file = open(output, 'w')
@@ -127,7 +127,7 @@
         out_proc.stdin.write_eof()
 
 
-def proc_filter(func=None, output_chunker=None):
+def proc_filter(func=None, output_chunker=None, check=False):
     if func is None:
         def defer(func):
             return proc_filter(func, output_chunker=output_chunker)