Index: branches/apertium-tagger/experiments/add_to_wikitable.py
===================================================================
--- branches/apertium-tagger/experiments/add_to_wikitable.py (revision 69202)
+++ branches/apertium-tagger/experiments/add_to_wikitable.py (revision 69203)
@@ -1,6 +1,7 @@
# -- encoding: utf-8 --
import sys
+import locale
import mwparserfromhell
from mwparserfromhell.nodes.tag import Tag
from mwparserfromhell.nodes.text import Text
@@ -79,6 +80,10 @@
return "{0:.2f}".format(value * 100)
+def result_to_str(result):  # render a 2-element result as "a, b" — presumably the (recall, recall_available) pair; confirm against producer
+    return '{}, {}'.format(result[0], result[1])
+
+
def mk_title_td(title):
return Tag(
'td',
@@ -102,6 +107,13 @@
contents="\n" if is_last else "",
closing_wiki_markup='')
+def mk_wc_td(val, is_first=False, is_last=False):  # build a header-style table cell for the word-count row
+    return Tag(
+        'td',
+        wiki_markup='!' if is_first else '!!',  # '!' opens the header row, '!!' separates subsequent header cells
+        contents=" {}{}".format(val, "\n" if is_last else " "),  # trailing newline terminates the row's final cell
+        closing_wiki_markup='')
+
def mk_initial_tr(title):
return Tag(
'tr',
@@ -110,7 +122,6 @@
closing_wiki_markup='')
input_table = sys.stdin.read()
-input_data = eval(open(sys.argv[1]).read())
lang_order = []
@@ -126,8 +137,6 @@
lang_order.append(LANG_NAME_CODE_MAP[title])
def insert_into_tr(tr, col_idx, val_str):
- print("insert_into_tr", tr, col_idx, val_str)
- print('tr.contents.nodes', tr.contents.nodes)
if len(tr.contents.nodes) <= col_idx:
last_td = tr.contents.get(-1)
if last_td.contents.endswith('\n'):
@@ -137,20 +146,50 @@
tr.contents.append(mk_empty_td(is_last=True))
target_cell = tr.contents.get(col_idx)
has_newline = target_cell.contents.endswith('\n')
- print('target_cell', target_cell)
val_td = mk_val_td(val_str, is_last=has_newline)
- print('replacement cell', val_td)
tr.contents.set(col_idx, val_td)
- print('tr after', tr)
+def insert_into_wc(tr, col_idx, val_str):  # overwrite cell col_idx of the word-count (header) row with val_str
+    target_cell = tr.contents.get(col_idx)
+    has_newline = target_cell.contents.endswith('\n')  # a trailing newline marks the last cell in the row
+    is_first = len(target_cell.wiki_markup) == 1  # '!' (length 1) means first header cell; '!!' means a later one
+    val_td = mk_wc_td(val_str, is_first=is_first, is_last=has_newline)
+    tr.contents.set(col_idx, val_td)
+
+
+def format_word_count(word_count):
+    # Group thousands with the format mini-language instead of mutating the
+    # process-wide locale: 'en_US' is often not installed (locale.Error), and
+    # locale.format() is deprecated. Output matches en_US grouping (e.g. 1,234,567).
+    return '{:,}'.format(word_count)
+
+if sys.argv[1] in LANG_CODE_NAME_MAP:
+ # blank out column
+ col_idx = lang_order.index(sys.argv[1]) + 1
+ table_idx = 3
+ while table_idx < len(table_inner.nodes):
+ tr = table_inner.get(table_idx)
+ if len(tr.contents.nodes) > col_idx:
+ if tr.contents.get(col_idx).contents.endswith('\n'):
+ tr.contents.get(col_idx).contents = '\n'
+ else:
+ tr.contents.get(col_idx).contents = ''
+ table_idx += 1
+ print(table)
+ sys.exit()
+
+input_data = eval(open(sys.argv[1]).read())
+
for lang, data in input_data.items():
lang_idx = lang_order.index(lang)
- data = [(name_to_attrs(name), value_to_str(value)) for name, value in data.items()]
+ col_idx = lang_idx + 1
+ word_count = data.pop('word_count', None)
+ if word_count is not None:
+ word_count_tr = table_inner.get(3)
+ insert_into_wc(word_count_tr, col_idx, format_word_count(word_count))
+ data = [(name_to_attrs(name), result_to_str(value)) for name, value in data.items()]
data = sorted(data, key=lambda pair: attrs_to_sort_tuple(pair[0]))
table_idx = 4
for attrs, val_str in data:
title_str = attrs_to_str(attrs)
- col_idx = lang_idx + 1
while table_idx < len(table_inner.nodes):
tr = table_inner.get(table_idx)
if len(tr.contents) > 1:
Index: branches/apertium-tagger/experiments/evaluate_tagger.py
===================================================================
--- branches/apertium-tagger/experiments/evaluate_tagger.py (revision 69202)
+++ branches/apertium-tagger/experiments/evaluate_tagger.py (revision 69203)
@@ -165,6 +165,8 @@
self.n_truenegative = 0
self.n_falsepositive = 0
self.n_falsenegative = 0
+ self.n_analysis_available = 0
+ self.n_analysis_unavailable = 0
self.n_ref_readings = 0
self.n_src_readings = 0
@@ -279,6 +281,13 @@
#}
#}
+ for ref_reading in ref_readings:
+ if ref_reading not in src_readings:
+ print('[' + str(n_line) + '] UNAVAILABLE:', ref_reading, src_readings)
+ self.n_analysis_unavailable += 1
+ else:
+ self.n_analysis_available += 1
+
for ref_reading in ref_readings: # {
if ref_reading not in tst_readings: # {
print('[' + str(n_line) + '] FALSENEG:', ref_reading, tst_readings)
@@ -362,6 +371,10 @@
return float(self.n_truepositive) / (float(self.n_truepositive + self.n_falsenegative))
@property
+    def recall_available(self):  # true positives over ref readings the source analyser actually offered (n_analysis_available)
+        return float(self.n_truepositive) / float(self.n_analysis_available)  # NOTE(review): ZeroDivisionError when nothing was available — same exposure as sibling `recall`
+
+ @property
def accuracy(self):
return float(self.n_truepositive + self.n_truenegative) / \
(float(self.n_truepositive + self.n_falsenegative +
@@ -390,11 +403,14 @@
print('trueneg :\t', self.n_truenegative)
print('falsepos :\t', self.n_falsepositive)
print('falseneg :\t', self.n_falsenegative)
+ print('analysis available :\t', self.n_analysis_available)
+ print('analysis unavailable :\t', self.n_analysis_unavailable)
print('')
print('precision:\t', self.precision, '\t( true pos / all pos )')
print('recall :\t', self.recall, '\t( true pos / (true pos + false neg) )')
+ print('recall available :\t', self.recall_available, '\t( true pos / (src in ref) )')
print('accuracy :\t', self.accuracy, '\t((true pos + true neg) / (everything) )')
print('')
Index: branches/apertium-tagger/experiments/requirements.txt
===================================================================
--- branches/apertium-tagger/experiments/requirements.txt (revision 69202)
+++ branches/apertium-tagger/experiments/requirements.txt (revision 69203)
@@ -1,3 +1,5 @@
aitertools==0.1.0
tabulate==0.7.5
-e git+https://github.com/frankier/streamparser.git@setup-py#egg=streamparser
+dbus-python==1.2.4
+notify2==0.3
Index: branches/apertium-tagger/experiments/run_experiment.py
===================================================================
--- branches/apertium-tagger/experiments/run_experiment.py (revision 69202)
+++ branches/apertium-tagger/experiments/run_experiment.py (revision 69203)
@@ -10,6 +10,7 @@
from asyncio.subprocess import create_subprocess_exec
import os
import re
+import sys
from pprint import pformat
import datetime
@@ -26,18 +27,22 @@
'spa': ['texts/miscellaneous.tagged.txt'],
'hbs': ['hbs-tagger-data/hbs.tagged.txt'],
'rus': [
- # 'texts/son-smešnogo-čeloveka.ana.txt', incorrect format?
- 'texts/dva-samoubijstva.ana.txt' # seems to be only one in correct format?
+ # seems to be only one in correct format?
+ 'texts/dva-samoubijstva.ana.txt'
],
'kaz': ['eval/ref.1000.txt'],
'por': [
- 'texts/água.txt', # ambiguous
- 'texts/raio.txt', # ambiguous
- # 'texts/música.txt', ambiguous, syntax errors
- 'texts/akatsuki.txt', # ambiguous
- 'texts/beringia.txt', # ambiguous
- 'texts/cultura.txt', # ambiguous
- 'texts/bering.txt', # ambiguous
+ 'cg:água.tagged.txt',
+ 'cg:anfetamina.tagged.txt',
+ 'cg:bering.tagged.txt',
+ 'cg:cultura.tagged.txt',
+ 'cg:myanmar.tagged.txt',
+ 'cg:wikitravel.tagged.txt',
+ 'cg:akatsuki.tagged.txt',
+ 'cg:beringia.tagged.txt',
+ 'cg:catalunha.tagged.txt',
+ 'cg:música.tagged.txt',
+ 'cg:raio.tagged.txt',
],
}
TSX_MAP = {
@@ -90,9 +95,16 @@
help="Reuse preprocesed dictionary and corpa from previous run",
action='store_true')
parser.add_argument(
+ '--reuse-dic',
+ help="Reuse preprocesed dictionary from previous run",
+ action='store_true')
+ parser.add_argument(
'--output',
- help="Output file for the results of the experiment",
- action='store_string')
+ help="Output file for the results of the experiment")
+ parser.add_argument(
+ '--notify',
+ help="Produce a desktop notification when done",
+ action='store_true')
return parser.parse_args()
@@ -186,7 +198,8 @@
filtered = filter_dix(expand_proc.stdout)
extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿",
- "¡"]
+ "¡", "“", "”", "«", "»",
+ ]
for i, extra in enumerate(extras):
extras[i] = (extra + "\n").encode('utf-8')
with_extras = aitertools.chain(filtered, extras)
@@ -217,6 +230,45 @@
return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes))
+filter_dix = functools.partial(
+ MapFilter,
+ pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line,
+ tran=lambda line: line.split(b":")[0] + b"\n")
+
+
+SENT_END_RE = re.compile(br'/[.!?]\$$')  # a reading of '.', '!' or '?' just before the closing '$' ends a sentence
+SELECT_RE = re.compile(br'')  # NOTE(review): empty pattern — the sub() in cg_conv_clean is a no-op; the intended pattern appears to have been lost, confirm
+
+
+async def cg_conv_clean(input, output):  # convert a CG-format corpus to apertium-stream format, blank line after each sentence
+    cleanstream_inpipe, cg_conv_outpipe = os.pipe()
+    await create_subprocess_exec(
+        'cg-conv', '--in-cg', '--out-apertium', "--ltr",
+        stdin=open(input, 'r'), stdout=cg_conv_outpipe)
+    os.close(cg_conv_outpipe)  # drop the parent's copy; cg-conv holds its own
+    cleanstream = await create_subprocess_exec(
+        'apertium-cleanstream', '-n',
+        stdin=cleanstream_inpipe, stdout=PIPE)
+    os.close(cleanstream_inpipe)  # fix: parent no longer needs this fd (was leaked)
+    with open(output, 'wb') as output_f:  # fix: close the output file when done (was leaked)
+        await cleanstream.stdout.readline()  # discard apertium-cleanstream's first line
+        async for line in cleanstream.stdout:
+            line = SELECT_RE.sub(b'', line)
+            output_f.write(line)
+            if SENT_END_RE.search(line):
+                output_f.write(b'\n')
+
+
+def cg_conv_clean_(input, output):
+ return loop.run_until_complete(
+ cg_conv_clean(input, output))
+
+
+PREPROCESSER_MAP = {
+ 'cg': cg_conv_clean_
+}
+
+
@proc_filter
def cg_proc(grammar_fn, dictcase=True):
cmd = ['cg-proc', grammar_fn]
@@ -225,22 +277,33 @@
return cmd
-AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>')
-PRPERS_TAG = re.compile(r'/\+')
+#AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>')
+#PRPERS_TAG = re.compile(r'/\+')
-@filter
-def cleanup_rus(line):
- return PRPERS_TAG.sub('/', AT_TAG_REGEX.sub('', line))
+#@filter
+#def cleanup_rus(line):
+ #return PRPERS_TAG.sub('/', AT_TAG_REGEX.sub('', line))
-TEXT_CLEANUP_MAP = {
- 'rus': cleanup_rus,
+#TEXT_CLEANUP_MAP = {
+ #'rus': cleanup_rus,
+#}
+
+def invalidate_por(line):  # True when a Portuguese corpus line's analysis is known to be malformed
+    # All these are the wrong way around!
+    return ('/$' in line \
+            or line.startswith('^beta-fenetilamina/')  # gets analysed as two words
+            or line.startswith('^km²/')  # ^2 ends up outside analysis
+            or ('./' in line and '' not in line))  # ends up as a multiword — NOTE(review): '' is in every str, so this clause is always False; a marker string was likely lost, confirm
+
+LANGUAGE_INVALIDATOR_MAP = {
+ 'por': invalidate_por
}
@filter(iter_filter=True)
-def strip_unknown_sent(gen):
+def strip_unknown_sent(gen, invalidate_func=None):
buff = []
valid_sent = True
for line in gen:
@@ -255,6 +318,8 @@
buff.append(line)
if '/*' in line:
valid_sent = False
+ if invalidate_func is not None and invalidate_func(line):
+ valid_sent = False
def split_n_r(corpus_fn, train_fn, ref_fn, n, r):
@@ -286,11 +351,16 @@
self.fn = fn
+def aggregates(data):  # (min, max, mean, stdev) summary of a list of per-fold scores
+    # Fix: the body referenced 'accuracies', a local of the *caller* — NameError on every call.
+    return (min(data), max(data), mean(data), stdev(data))
+
def xval_experiment(name):
def dec(func=None):
@functools.wraps(func)
def wrapper(self, *args, **kwargs):
- accuracies = []
+ recall = []
+ recall_available = []
for i, xval_fns in enumerate(self.xval_fns):
xval_fns['test'] = xval_fns['prefix'] + 'test.' + name
xval_fns['model'] = xval_fns['prefix'] + 'model.' + name
@@ -298,9 +368,9 @@
evaluator = TaggerEvaluator(
xval_fns['src'], xval_fns['ref'], xval_fns['test'])
evaluator.run_analysis()
- accuracies.append(evaluator.accuracy)
- return (min(accuracies), max(accuracies),
- mean(accuracies), stdev(accuracies))
+ recall.append(evaluator.recall)
+ recall_available.append(evaluator.recall_available)
+ return (aggregates(recall), aggregates(recall_available))
return wrapper
return dec
@@ -315,7 +385,7 @@
experiments = {}
- def __init__(self, lang, lang_root, texts, folds, reuse=False):
+ def __init__(self, lang, lang_root, texts, folds, reuse=False, reuse_dic=False):
self.lang = lang
self.work_dir = pjoin(TMPDIR, lang)
@@ -332,7 +402,14 @@
else:
self.tsx_fn = pjoin(lang_root, pair_name + '.tsx')
- self.text_fns = [pjoin(lang_root, text) for text in texts]
+ self.text_fns = []
+ for text in texts:
+ if ':' in text:
+ preprocesser_name, text = text.split(':', 1)
+ else:
+ preprocesser_name = None
+ self.text_fns.append((preprocesser_name,
+ pjoin(lang_root, 'texts', text)))
self.joined_fn = pjoin(self.work_dir, 'joined')
self.ref_fn = pjoin(self.work_dir, 'ref')
self.src_fn = pjoin(self.work_dir, 'src')
@@ -359,7 +436,7 @@
self.validate()
if not reuse:
- self.do_preprocessing()
+ self.do_preprocessing(reuse_dic=reuse_dic)
def validate(self):
for fn in [self.morphology_fn, self.cg_fn, self.tsx_fn, self.dix_fn]:
@@ -370,19 +447,28 @@
if self.tsx_fn is not None:
check_run(["apertium-validate-tagger", self.tsx_fn])
- def do_preprocessing(self):
+ def do_preprocessing(self, reuse_dic=False):
if not isdir(self.work_dir):
mkdir(self.work_dir)
+ preprocessed_texts = []
+ for i, (preprocesser_name, fn) in enumerate(self.text_fns):
+ if preprocesser_name:
+ preprocesser = PREPROCESSER_MAP.get(preprocesser_name)
+ cleaned_fn = pjoin(self.work_dir, 'cleaned.{}.{}.txt'.format(i, preprocesser_name))
+ preprocesser(input=fn, output=cleaned_fn)
+ preprocessed_texts.append(cleaned_fn)
+ else:
+ preprocessed_texts.append(fn)
+
joined = itertools.chain(*(open(fn).readlines()
- for fn in self.text_fns))
- if self.lang in TEXT_CLEANUP_MAP:
- strip_unknown_in = TEXT_CLEANUP_MAP[self.lang](joined)
- else:
- strip_unknown_in = joined
- strip_unknown_sent(strip_unknown_in, self.joined_fn)
+ for fn in preprocessed_texts))
+ strip_unknown_sent(
+ joined, self.joined_fn,
+ invalidate_func=LANGUAGE_INVALIDATOR_MAP.get(self.lang, lambda x: False))
strip_blanks(self.joined_fn, self.ref_fn)
extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn)
+ if not reuse_dic:
loop.run_until_complete(
fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn,
output_fn=self.dic_fn))
@@ -398,7 +484,7 @@
def _analyse(self, test_fn):
evaluator = TaggerEvaluator(self.src_fn, self.ref_fn, test_fn)
evaluator.run_analysis()
- return evaluator.accuracy
+ return (evaluator.recall, evaluator.recall_available)
@classmethod
def reg_experiment(cls, name):
@@ -432,7 +518,7 @@
else:
tagger_input = xval_fns['src']
tagger_tag(unigram_model, xval_fns['model'],
- input=tagger_input, output=xval_fns['test'])
+ input=tagger_input, output=xval_fns['test']).check_returncode()
for do_cg in [False, True]:
name = ('cg' if do_cg else '') + '1st'
@@ -449,12 +535,19 @@
for do_cg in [False, True]:
for is_supervised, model in [(True, 'bigram'), (False, 'bigram'), (False, 'lwsw')]:
- for iterations in [0, 50, 250]:
- name = "{cg}{sup}_{model}_i{iterations}".format(
+ if is_supervised:
+ iterations_list = [0]
+ else:
+ iterations_list = [0, 50, 250]
+ for iterations in iterations_list:
+ name = (
+ "{cg}{sup}_{model}".format(
cg='cg_' if do_cg else '',
sup='sup' if is_supervised else 'unsup',
- model=model,
- iterations=iterations)
+ model=model
+ ) +
+ ("_i{iterations}".format(iterations=iterations)
+ if len(iterations_list) > 1 else ""))
@cls.reg_experiment(name)
@xval_experiment(name)
@@ -481,8 +574,16 @@
else:
tagger_input = xval_fns['src']
tagger_tag(model, xval_fns['model'],
- input=tagger_input, output=xval_fns['test'])
+ input=tagger_input, output=xval_fns['test']).check_returncode()
+ @cls.reg_experiment('word_count')
+ def word_count(self):
+ count = 0
+ for line in open(self.src_fn):
+ if line.strip() != '':
+ count += 1
+ return count
+
@classmethod
def all_taggers(cls):
return cls.experiments.keys()
@@ -504,6 +605,8 @@
args = parse_args()
if not isdir(TMPDIR):
mkdir(TMPDIR)
+ if args.notify:
+ import notify2
languages_tagger_accuracies = {}
try:
@@ -513,7 +616,8 @@
def mk_experimentor():
return LanguageTaggerExperimentor(lang, lang_root, taggers,
- args.folds, reuse=args.reuse)
+ args.folds, reuse=args.reuse,
+ reuse_dic=args.reuse_dic)
try:
experimentor = mk_experimentor()
except MissingLanguageDataException as e:
@@ -540,7 +644,12 @@
.format(datetime.datetime.now().isoformat()))
open(outf, 'w').write(result_pretty)
+ if args.notify:
+ notify2.init("Tagger experiment finished")
+ notice = notify2.Notification(' '.join(sys.argv), "Tagger experiment finished")
+ notice.show()
+
if __name__ == '__main__':
try:
main()
Index: branches/apertium-tagger/experiments/shell_utils.py
===================================================================
--- branches/apertium-tagger/experiments/shell_utils.py (revision 69202)
+++ branches/apertium-tagger/experiments/shell_utils.py (revision 69203)
@@ -29,14 +29,14 @@
input_chunker=input_chunker, output_separator=output_separator)
return defer
- def generator(input_iter):
+ def generator(input_iter, **kwargs):
for line in input_iter:
- filtered = func(line)
+ filtered = func(line, **kwargs)
if filtered is not None:
yield filtered + output_separator
@functools.wraps(func)
- def wrapper(input, output=None):
+ def wrapper(input, output=None, **kwargs):
input_file = None
if isinstance(input, str):
input_iter = input_file = open(input)
@@ -47,9 +47,9 @@
else:
input_iter = input
if iter_filter:
- gen = func(input_iter)
+ gen = func(input_iter, **kwargs)
else:
- gen = generator(input_iter)
+ gen = generator(input_iter, **kwargs)
if output is None:
return gen
output_file = open(output, 'w')
@@ -127,7 +127,7 @@
out_proc.stdin.write_eof()
-def proc_filter(func=None, output_chunker=None):
+def proc_filter(func=None, output_chunker=None, check=False):
if func is None:
def defer(func):
return proc_filter(func, output_chunker=output_chunker)