Index: branches/apertium-tagger/experiments/add_to_wikitable.py
===================================================================
--- branches/apertium-tagger/experiments/add_to_wikitable.py (revision 69202)
+++ branches/apertium-tagger/experiments/add_to_wikitable.py (revision 69203)
@@ -1,6 +1,7 @@
 # -*- encoding: utf-8 -*-
 import sys
+import locale
 import mwparserfromhell
 from mwparserfromhell.nodes.tag import Tag
 from mwparserfromhell.nodes.text import Text
@@ -79,6 +80,10 @@
     return "{0:.2f}".format(value * 100)
 
 
+def result_to_str(result):
+    return '{}, {}'.format(result[0], result[1])
+
+
 def mk_title_td(title):
     return Tag(
         'td',
@@ -102,6 +107,13 @@
         contents="\n" if is_last else "",
         closing_wiki_markup='')
 
+def mk_wc_td(val, is_first=False, is_last=False):
+    return Tag(
+        'td',
+        wiki_markup='!' if is_first else '!!',
+        contents=" {}{}".format(val, "\n" if is_last else " "),
+        closing_wiki_markup='')
+
 def mk_initial_tr(title):
     return Tag(
         'tr',
@@ -110,7 +122,6 @@
         closing_wiki_markup='')
 
 input_table = sys.stdin.read()
-input_data = eval(open(sys.argv[1]).read())
 
 lang_order = []
 
@@ -126,8 +137,6 @@
         lang_order.append(LANG_NAME_CODE_MAP[title])
 
 def insert_into_tr(tr, col_idx, val_str):
-    print("insert_into_tr", tr, col_idx, val_str)
-    print('tr.contents.nodes', tr.contents.nodes)
     if len(tr.contents.nodes) <= col_idx:
         last_td = tr.contents.get(-1)
         if last_td.contents.endswith('\n'):
@@ -137,20 +146,50 @@
             tr.contents.append(mk_empty_td(is_last=True))
     target_cell = tr.contents.get(col_idx)
     has_newline = target_cell.contents.endswith('\n')
-    print('target_cell', target_cell)
     val_td = mk_val_td(val_str, is_last=has_newline)
-    print('replacement cell', val_td)
     tr.contents.set(col_idx, val_td)
-    print('tr after', tr)
 
+def insert_into_wc(tr, col_idx, val_str):
+    target_cell = tr.contents.get(col_idx)
+    has_newline = target_cell.contents.endswith('\n')
+    is_first = len(target_cell.wiki_markup) == 1
+    val_td = mk_wc_td(val_str, is_first=is_first, is_last=has_newline)
+    tr.contents.set(col_idx, val_td)
+
+def format_word_count(word_count):
+    locale.setlocale(locale.LC_ALL, 'en_US')
+    number = locale.format("%d", word_count, grouping=True)
+    return "{}".format(number)
+
+if sys.argv[1] in LANG_CODE_NAME_MAP:
+    # blank out column
+    col_idx = lang_order.index(sys.argv[1]) + 1
+    table_idx = 3
+    while table_idx < len(table_inner.nodes):
+        tr = table_inner.get(table_idx)
+        if len(tr.contents.nodes) > col_idx:
+            if tr.contents.get(col_idx).contents.endswith('\n'):
+                tr.contents.get(col_idx).contents = '\n'
+            else:
+                tr.contents.get(col_idx).contents = ''
+        table_idx += 1
+    print(table)
+    sys.exit()
+
+input_data = eval(open(sys.argv[1]).read())
+
 for lang, data in input_data.items():
     lang_idx = lang_order.index(lang)
-    data = [(name_to_attrs(name), value_to_str(value)) for name, value in data.items()]
+    col_idx = lang_idx + 1
+    word_count = data.pop('word_count', None)
+    if word_count is not None:
+        word_count_tr = table_inner.get(3)
+        insert_into_wc(word_count_tr, col_idx, format_word_count(word_count))
+    data = [(name_to_attrs(name), result_to_str(value)) for name, value in data.items()]
     data = sorted(data, key=lambda pair: attrs_to_sort_tuple(pair[0]))
     table_idx = 4
     for attrs, val_str in data:
         title_str = attrs_to_str(attrs)
-        col_idx = lang_idx + 1
         while table_idx < len(table_inner.nodes):
             tr = table_inner.get(table_idx)
             if len(tr.contents) > 1:
Index: branches/apertium-tagger/experiments/evaluate_tagger.py
===================================================================
--- branches/apertium-tagger/experiments/evaluate_tagger.py (revision 69202)
+++ branches/apertium-tagger/experiments/evaluate_tagger.py (revision 69203)
@@ -165,6 +165,8 @@
         self.n_truenegative = 0
         self.n_falsepositive = 0
         self.n_falsenegative = 0
+        self.n_analysis_available = 0
+        self.n_analysis_unavailable = 0
         self.n_ref_readings = 0
         self.n_src_readings = 0
@@ -279,6 +281,13 @@
         #}
         #}
 
+        for ref_reading in ref_readings:
+            if ref_reading not in src_readings:
+                print('[' + str(n_line) + '] UNAVAILABLE:', ref_reading, src_readings)
+                self.n_analysis_unavailable += 1
+            else:
+                self.n_analysis_available += 1
+
         for ref_reading in ref_readings:  # {
             if ref_reading not in tst_readings:  # {
                 print('[' + str(n_line) + '] FALSENEG:', ref_reading, tst_readings)
@@ -362,6 +371,10 @@
         return float(self.n_truepositive) / (float(self.n_truepositive + self.n_falsenegative))
 
     @property
+    def recall_available(self):
+        return float(self.n_truepositive) / float(self.n_analysis_available)
+
+    @property
     def accuracy(self):
         return float(self.n_truepositive + self.n_truenegative) / \
             (float(self.n_truepositive + self.n_falsenegative +
@@ -390,11 +403,14 @@
         print('trueneg :\t', self.n_truenegative)
         print('falsepos :\t', self.n_falsepositive)
         print('falseneg :\t', self.n_falsenegative)
+        print('analysis available :\t', self.n_analysis_available)
+        print('analysis unavailable :\t', self.n_analysis_unavailable)
         print('')
         print('precision:\t', self.precision, '\t( true pos / all pos )')
         print('recall :\t', self.recall, '\t( true pos / (true pos + false neg) )')
+        print('recall available :\t', self.recall_available, '\t( true pos / (src in ref) )')
         print('accuracy :\t', self.accuracy, '\t((true pos + true neg) / (everything) )')
         print('')
Index: branches/apertium-tagger/experiments/requirements.txt
===================================================================
--- branches/apertium-tagger/experiments/requirements.txt (revision 69202)
+++ branches/apertium-tagger/experiments/requirements.txt (revision 69203)
@@ -1,3 +1,5 @@
 aitertools==0.1.0
 tabulate==0.7.5
 -e git+https://github.com/frankier/streamparser.git@setup-py#egg=streamparser
+dbus-python==1.2.4
+notify2==0.3
Index: branches/apertium-tagger/experiments/run_experiment.py
===================================================================
--- branches/apertium-tagger/experiments/run_experiment.py (revision 69202)
+++ branches/apertium-tagger/experiments/run_experiment.py (revision 69203)
@@ -10,6 +10,7 @@
 from asyncio.subprocess import create_subprocess_exec
 import os
 import re
+import sys
 from pprint import pformat
 import datetime
 
@@ -26,18 +27,22 @@
     'spa': ['texts/miscellaneous.tagged.txt'],
     'hbs': ['hbs-tagger-data/hbs.tagged.txt'],
     'rus': [
-        # 'texts/son-smešnogo-čeloveka.ana.txt', incorrect format?
-        'texts/dva-samoubijstva.ana.txt'
+        # seems to be only one in correct format?
+        'texts/dva-samoubijstva.ana.txt'
     ],
     'kaz': ['eval/ref.1000.txt'],
     'por': [
-        'texts/água.txt',  # ambiguous
-        'texts/raio.txt',  # ambiguous
-        # 'texts/música.txt', ambiguous, syntax errors
-        'texts/akatsuki.txt',  # ambiguous
-        'texts/beringia.txt',  # ambiguous
-        'texts/cultura.txt',  # ambiguous
-        'texts/bering.txt',  # ambiguous
+        'cg:água.tagged.txt',
+        'cg:anfetamina.tagged.txt',
+        'cg:bering.tagged.txt',
+        'cg:cultura.tagged.txt',
+        'cg:myanmar.tagged.txt',
+        'cg:wikitravel.tagged.txt',
+        'cg:akatsuki.tagged.txt',
+        'cg:beringia.tagged.txt',
+        'cg:catalunha.tagged.txt',
+        'cg:música.tagged.txt',
+        'cg:raio.tagged.txt',
     ],
 }
 TSX_MAP = {
@@ -90,9 +95,16 @@
         help="Reuse preprocesed dictionary and corpa from previous run",
         action='store_true')
     parser.add_argument(
+        '--reuse-dic',
+        help="Reuse preprocessed dictionary from previous run",
+        action='store_true')
+    parser.add_argument(
         '--output',
-        help="Output file for the results of the experiment",
-        action='store_string')
+        help="Output file for the results of the experiment")
+    parser.add_argument(
+        '--notify',
+        help="Produce a desktop notification when done",
+        action='store_true')
     return parser.parse_args()
@@ -186,7 +198,8 @@
     filtered = filter_dix(expand_proc.stdout)
     extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿",
-              "¡"]
+              "¡", "“", "”", "«", "»",
+              ]
     for i, extra in enumerate(extras):
         extras[i] = (extra + "\n").encode('utf-8')
     with_extras = aitertools.chain(filtered, extras)
@@ -217,6 +230,45 @@
     return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes))
 
 
+filter_dix = functools.partial(
+    MapFilter,
+    pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line,
+    tran=lambda line: line.split(b":")[0] + b"\n")
+
+
+SENT_END_RE = re.compile(br'/[.!?]\$$')
+SELECT_RE = re.compile(br'')
+
+
+async def cg_conv_clean(input, output):
+    cleanstream_inpipe, cg_conv_outpipe = os.pipe()
+    await create_subprocess_exec(
+        'cg-conv', '--in-cg', '--out-apertium', "--ltr",
+        stdin=open(input, 'r'), stdout=cg_conv_outpipe)
+    os.close(cg_conv_outpipe)
+    cleanstream = await create_subprocess_exec(
+        'apertium-cleanstream', '-n',
+        stdin=cleanstream_inpipe, stdout=PIPE)
+
+    output_f = open(output, 'wb')
+    await cleanstream.stdout.readline()
+    async for line in cleanstream.stdout:
+        line = SELECT_RE.sub(b'', line)
+        output_f.write(line)
+        if SENT_END_RE.search(line):
+            output_f.write(b'\n')
+
+
+def cg_conv_clean_(input, output):
+    return loop.run_until_complete(
+        cg_conv_clean(input, output))
+
+
+PREPROCESSER_MAP = {
+    'cg': cg_conv_clean_
+}
+
+
 @proc_filter
 def cg_proc(grammar_fn, dictcase=True):
     cmd = ['cg-proc', grammar_fn]
@@ -225,22 +277,33 @@
     return cmd
 
 
-AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>')
-PRPERS_TAG = re.compile(r'/\+')
+#AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>')
+#PRPERS_TAG = re.compile(r'/\+')
 
 
-@filter
-def cleanup_rus(line):
-    return PRPERS_TAG.sub('/', AT_TAG_REGEX.sub('', line))
+#@filter
+#def cleanup_rus(line):
+    #return PRPERS_TAG.sub('/', AT_TAG_REGEX.sub('', line))
 
 
-TEXT_CLEANUP_MAP = {
-    'rus': cleanup_rus,
+#TEXT_CLEANUP_MAP = {
+    #'rus': cleanup_rus,
+#}
+
+def invalidate_por(line):
+    # All these are the wrong way around!
+    return ('/$' in line
+            or line.startswith('^beta-fenetilamina/')  # gets analysed as two words
+            or line.startswith('^km²/')  # ^2 ends up outside analysis
+            or ('./' in line and '<sent>' not in line))  # ends up as a multiword
+
+LANGUAGE_INVALIDATOR_MAP = {
+    'por': invalidate_por
 }
 
 
 @filter(iter_filter=True)
-def strip_unknown_sent(gen):
+def strip_unknown_sent(gen, invalidate_func=None):
     buff = []
     valid_sent = True
     for line in gen:
@@ -255,6 +318,8 @@
             buff.append(line)
         if '/*' in line:
             valid_sent = False
+        if invalidate_func is not None and invalidate_func(line):
+            valid_sent = False
 
 
 def split_n_r(corpus_fn, train_fn, ref_fn, n, r):
@@ -286,11 +351,16 @@
         self.fn = fn
 
 
+def aggregates(data):
+    return (min(data), max(data),
+            mean(data), stdev(data))
+
 def xval_experiment(name):
     def dec(func=None):
         @functools.wraps(func)
         def wrapper(self, *args, **kwargs):
-            accuracies = []
+            recall = []
+            recall_available = []
             for i, xval_fns in enumerate(self.xval_fns):
                 xval_fns['test'] = xval_fns['prefix'] + 'test.' + name
                 xval_fns['model'] = xval_fns['prefix'] + 'model.' + name
@@ -298,9 +368,9 @@
                 evaluator = TaggerEvaluator(
                     xval_fns['src'], xval_fns['ref'], xval_fns['test'])
                 evaluator.run_analysis()
-                accuracies.append(evaluator.accuracy)
-            return (min(accuracies), max(accuracies),
-                    mean(accuracies), stdev(accuracies))
+                recall.append(evaluator.recall)
+                recall_available.append(evaluator.recall_available)
+            return (aggregates(recall), aggregates(recall_available))
         return wrapper
     return dec
@@ -315,7 +385,7 @@
 
     experiments = {}
 
-    def __init__(self, lang, lang_root, texts, folds, reuse=False):
+    def __init__(self, lang, lang_root, texts, folds, reuse=False, reuse_dic=False):
         self.lang = lang
 
         self.work_dir = pjoin(TMPDIR, lang)
@@ -332,7 +402,14 @@
         else:
             self.tsx_fn = pjoin(lang_root, pair_name + '.tsx')
 
-        self.text_fns = [pjoin(lang_root, text) for text in texts]
+        self.text_fns = []
+        for text in texts:
+            if ':' in text:
+                preprocesser_name, text = text.split(':', 1)
+            else:
+                preprocesser_name = None
+            self.text_fns.append((preprocesser_name,
+                                  pjoin(lang_root, 'texts', text)))
         self.joined_fn = pjoin(self.work_dir, 'joined')
         self.ref_fn = pjoin(self.work_dir, 'ref')
         self.src_fn = pjoin(self.work_dir, 'src')
@@ -359,7 +436,7 @@
         self.validate()
 
         if not reuse:
-            self.do_preprocessing()
+            self.do_preprocessing(reuse_dic=reuse_dic)
 
     def validate(self):
         for fn in [self.morphology_fn, self.cg_fn, self.tsx_fn, self.dix_fn]:
@@ -370,19 +447,28 @@
         if self.tsx_fn is not None:
             check_run(["apertium-validate-tagger", self.tsx_fn])
 
-    def do_preprocessing(self):
+    def do_preprocessing(self, reuse_dic=False):
         if not isdir(self.work_dir):
             mkdir(self.work_dir)
+        preprocessed_texts = []
+        for i, (preprocesser_name, fn) in enumerate(self.text_fns):
+            if preprocesser_name:
+                preprocesser = PREPROCESSER_MAP.get(preprocesser_name)
+                cleaned_fn = pjoin(self.work_dir, 'cleaned.{}.{}.txt'.format(i, preprocesser_name))
+                preprocesser(input=fn, output=cleaned_fn)
+                preprocessed_texts.append(cleaned_fn)
+            else:
+                preprocessed_texts.append(fn)
+
         joined = itertools.chain(*(open(fn).readlines()
-                                   for fn in self.text_fns))
-        if self.lang in TEXT_CLEANUP_MAP:
-            strip_unknown_in = TEXT_CLEANUP_MAP[self.lang](joined)
-        else:
-            strip_unknown_in = joined
-        strip_unknown_sent(strip_unknown_in, self.joined_fn)
+                                   for fn in preprocessed_texts))
+        strip_unknown_sent(
+            joined, self.joined_fn,
+            invalidate_func=LANGUAGE_INVALIDATOR_MAP.get(self.lang, lambda x: False))
         strip_blanks(self.joined_fn, self.ref_fn)
         extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn)
-        loop.run_until_complete(
-            fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn,
-                    output_fn=self.dic_fn))
+        if not reuse_dic:
+            loop.run_until_complete(
+                fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn,
+                        output_fn=self.dic_fn))
@@ -398,7 +484,7 @@
     def _analyse(self, test_fn):
         evaluator = TaggerEvaluator(self.src_fn, self.ref_fn, test_fn)
         evaluator.run_analysis()
-        return evaluator.accuracy
+        return (evaluator.recall, evaluator.recall_available)
 
     @classmethod
     def reg_experiment(cls, name):
@@ -432,7 +518,7 @@
             else:
                 tagger_input = xval_fns['src']
             tagger_tag(unigram_model, xval_fns['model'],
-                       input=tagger_input, output=xval_fns['test'])
+                       input=tagger_input, output=xval_fns['test']).check_returncode()
 
     for do_cg in [False, True]:
         name = ('cg' if do_cg else '') + '1st'
@@ -449,12 +535,19 @@
     for do_cg in [False, True]:
         for is_supervised, model in [(True, 'bigram'), (False, 'bigram'),
                                      (False, 'lwsw')]:
-            for iterations in [0, 50, 250]:
-                name = "{cg}{sup}_{model}_i{iterations}".format(
+            if is_supervised:
+                iterations_list = [0]
+            else:
+                iterations_list = [0, 50, 250]
+            for iterations in iterations_list:
+                name = (
+                    "{cg}{sup}_{model}".format(
                         cg='cg_' if do_cg else '',
                         sup='sup' if is_supervised else 'unsup',
-                        model=model,
-                        iterations=iterations)
+                        model=model
+                    )
+                    + ("_i{iterations}".format(iterations=iterations)
+                       if len(iterations_list) > 1 else ""))
 
                 @cls.reg_experiment(name)
                 @xval_experiment(name)
@@ -481,8 +574,16 @@
             else:
                 tagger_input = xval_fns['src']
             tagger_tag(model, xval_fns['model'],
-                       input=tagger_input, output=xval_fns['test'])
+                       input=tagger_input, output=xval_fns['test']).check_returncode()
 
+    @cls.reg_experiment('word_count')
+    def word_count(self):
+        count = 0
+        for line in open(self.src_fn):
+            if line.strip() != '':
+                count += 1
+        return count
+
     @classmethod
     def all_taggers(cls):
         return cls.experiments.keys()
@@ -504,6 +605,8 @@
     args = parse_args()
     if not isdir(TMPDIR):
         mkdir(TMPDIR)
+    if args.notify:
+        import notify2
 
     languages_tagger_accuracies = {}
     try:
@@ -513,7 +616,8 @@
 
             def mk_experimentor():
                 return LanguageTaggerExperimentor(lang, lang_root, taggers,
-                                                  args.folds, reuse=args.reuse)
+                                                  args.folds, reuse=args.reuse,
+                                                  reuse_dic=args.reuse_dic)
             try:
                 experimentor = mk_experimentor()
             except MissingLanguageDataException as e:
@@ -540,7 +644,12 @@
                 .format(datetime.datetime.now().isoformat()))
         open(outf, 'w').write(result_pretty)
 
+    if args.notify:
+        notify2.init("Tagger experiment finished")
+        notice = notify2.Notification(' '.join(sys.argv), "Tagger experiment finished")
+        notice.show()
+
 
 if __name__ == '__main__':
     try:
         main()
Index: branches/apertium-tagger/experiments/shell_utils.py
===================================================================
--- branches/apertium-tagger/experiments/shell_utils.py (revision 69202)
+++ branches/apertium-tagger/experiments/shell_utils.py (revision 69203)
@@ -29,14 +29,14 @@
                           input_chunker=input_chunker,
                           output_separator=output_separator)
         return defer
 
-    def generator(input_iter):
+    def generator(input_iter, **kwargs):
         for line in input_iter:
-            filtered = func(line)
+            filtered = func(line, **kwargs)
            if filtered is not None:
                 yield filtered + output_separator
 
     @functools.wraps(func)
-    def wrapper(input, output=None):
+    def wrapper(input, output=None, **kwargs):
         input_file = None
         if isinstance(input, str):
             input_iter = input_file = open(input)
@@ -47,9 +47,9 @@
         else:
             input_iter = input
         if iter_filter:
-            gen = func(input_iter)
+            gen = func(input_iter, **kwargs)
         else:
-            gen = generator(input_iter)
+            gen = generator(input_iter, **kwargs)
         if output is None:
             return gen
         output_file = open(output, 'w')
@@ -127,7 +127,7 @@
         out_proc.stdin.write_eof()
 
 
-def proc_filter(func=None, output_chunker=None):
+def proc_filter(func=None, output_chunker=None, check=False):
     if func is None:
         def defer(func):
             return proc_filter(func, output_chunker=output_chunker)