Index: branches/apertium-tagger/experiments/split_corpus_n.py =================================================================== --- branches/apertium-tagger/experiments/split_corpus_n.py (revision 68806) +++ branches/apertium-tagger/experiments/split_corpus_n.py (nonexistent) @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import random - -def main(corpus, prefix): - tokens = 0.0 - sentences = 0.0 - - for line in open(corpus).readlines(): - - if line.strip() == '': - sentences = sentences + 1.0 - else: - tokens = tokens + 1.0 - - print(sentences, tokens, tokens / sentences) - - ids = [] - for i in range(0, int(sentences)): - ids.append(i) - random.shuffle(ids) - split = int(sentences / 10) - print(split, file=sys.stderr) - testing = ids[0:split] - training = ids[split:] - - train_file = open(prefix + 'train', 'w+') - test_file = open(prefix + 'ref', 'w+') - - buffer = '' - index = 0 - for line in open(corpus).readlines(): - if line.strip() == '': - index = index + 1 - buffer = buffer + '\n' - if index in testing: - test_file.write(buffer) - elif index in training: - train_file.write(buffer) - else: - print('ERROR: %d not in testing or training' % - (index), file=sys.stderr) - buffer = '' - else: - buffer = buffer + line - if index % int(tokens / 100) == 0: - sys.stderr.write('.') - sys.stderr.flush() - -if __name__ == '__main__': - corpus = sys.argv[1] - prefix = sys.argv[2] - main(corpus, prefix) Index: branches/apertium-tagger/experiments/add_to_wikitable.py =================================================================== --- branches/apertium-tagger/experiments/add_to_wikitable.py (nonexistent) +++ branches/apertium-tagger/experiments/add_to_wikitable.py (revision 68807) @@ -0,0 +1,172 @@ +# -- encoding: utf-8 -- + +import sys +import mwparserfromhell +from mwparserfromhell.nodes.tag import Tag +from mwparserfromhell.nodes.text import Text +from mwparserfromhell.wikicode import Wikicode + +TAGGER_ORDER = ['1st', 'unigram1', 'unigram2', 'unigram3', 
# Order in which tagger families are listed; also the set of substrings used
# to recognise the family inside an experiment name.
TAGGER_ORDER = ['1st', 'unigram1', 'unigram2', 'unigram3', 'bigram', 'lwsw']


def rdict(d):
    """Return a copy of mapping *d* with keys and values swapped."""
    return {v: k for k, v in d.items()}


LANG_CODE_NAME_MAP = {
    'cat': 'Catalan',
    'spa': 'Spanish',
    'hbs': 'Serbo-Croatian',
    'rus': 'Russian',
    'kaz': 'Kazakh',
    'por': 'Portuguese',
    'swe': 'Swedish',
}
LANG_NAME_CODE_MAP = rdict(LANG_CODE_NAME_MAP)


def _debug(*args):
    """Print a diagnostic line to stderr so stdout carries only the table."""
    print(*args, file=sys.stderr)


def name_to_attrs(name):
    """Decode an experiment name such as ``cgbigram_unsup_i50``.

    Returns a dict with the keys:
      * ``tagger`` - the last ``TAGGER_ORDER`` entry occurring in *name*
      * ``cg``     - True when ``'cg'`` occurs in *name*
      * ``sup``    - False for ``unsup``, True for ``sup``, None when neither
                     occurs or when the tagger is ``lwsw``
      * ``iters``  - the integer following ``_i``, or None

    Raises KeyError when no ``TAGGER_ORDER`` entry occurs in *name*.
    """
    attrs = {}
    for tagger in TAGGER_ORDER:
        if tagger in name:
            attrs['tagger'] = tagger

    attrs['cg'] = 'cg' in name

    if attrs['tagger'] == 'lwsw':
        attrs['sup'] = None
    elif 'unsup' in name:
        # Checked before 'sup' because 'unsup' contains 'sup'.
        attrs['sup'] = False
    elif 'sup' in name:
        attrs['sup'] = True
    else:
        attrs['sup'] = None

    attrs['iters'] = int(name.split('_i')[1]) if '_i' in name else None
    return attrs


def _none_first(value):
    """Wrap *value* so that None sorts before every real value.

    Python 3 refuses to order None against bool/int, so the raw attribute
    cannot be placed in a sort key directly.
    """
    if value is None:
        return (False, 0)
    return (True, value)


def attrs_to_sort_tuple(attrs):
    """Return a sort key: tagger; none, unsup, sup; nocg, cg; iters."""
    return (
        TAGGER_ORDER.index(attrs['tagger']),
        _none_first(attrs['sup']),
        attrs['cg'],
        _none_first(attrs['iters']),
    )


def attrs_to_str(attrs):
    """Render decoded attributes as the row title used in the wiki table."""
    if attrs['tagger'].startswith('unigram'):
        out = 'Unigram model ' + attrs['tagger'][len('unigram'):]
    elif attrs['tagger'] == '1st':
        out = attrs['tagger']
    else:
        out = attrs['tagger'].title()

    if attrs['cg']:
        out = "CG→" + out

    bits = []
    if attrs['sup'] is not None:
        bits.append('sup' if attrs['sup'] else 'unsup')
    if attrs['iters'] is not None:
        bits.append('{} iters'.format(attrs['iters']))
    if bits:
        out += ' ({})'.format(', '.join(bits))

    return out


def value_to_str(value):
    """Format a result as a percentage string.

    An indexable *value* is rendered from its elements 2 and 3 as
    ``"{2}±{3}"``; a plain number is rendered on its own.
    """
    if hasattr(value, "__getitem__"):
        return "{2:.2f}±{3:.2f}".format(*(v * 100 for v in value))
    else:
        return "{0:.2f}".format(value * 100)


def mk_title_td(title):
    """Build the first cell of a row, holding the bold row title."""
    return Tag(
        'td',
        wiki_markup='|',
        contents=" '''{}''' ".format(title),
        closing_wiki_markup='')


def mk_val_td(val, is_last=False):
    """Build a right-aligned value cell; the last cell ends with a newline."""
    return Tag(
        'td',
        wiki_markup='||',
        attrs=['align=right'],
        contents=" {} {}".format(val, "\n" if is_last else ""),
        wiki_style_separator='|',
        closing_wiki_markup='')


def mk_empty_td(is_last=False):
    """Build an empty cell; the last cell of a row contains a newline."""
    return Tag(
        'td',
        wiki_markup='||',
        contents="\n" if is_last else "",
        closing_wiki_markup='')


def mk_initial_tr(title):
    """Build a new row holding the title cell and one empty last cell."""
    return Tag(
        'tr',
        wiki_markup='|-\n',
        contents=Wikicode([mk_title_td(title), mk_empty_td(is_last=True)]),
        closing_wiki_markup='')


def insert_into_tr(tr, col_idx, val_str):
    """Put *val_str* into column *col_idx* of row *tr*, padding if needed.

    When the row is too short it is extended with empty cells; the trailing
    newline is moved from the old last cell to the new last cell.
    """
    _debug("insert_into_tr", tr, col_idx, val_str)
    _debug('tr.contents.nodes', tr.contents.nodes)
    if len(tr.contents.nodes) <= col_idx:
        last_td = tr.contents.get(-1)
        if last_td.contents.endswith('\n'):
            last_td.contents = last_td.contents[:-1]
        while len(tr.contents.nodes) < col_idx:
            tr.contents.append(mk_empty_td())
        # The keyword is ``is_last``; the previous ``last_node=True`` raised
        # TypeError whenever a row had to be padded.
        tr.contents.append(mk_empty_td(is_last=True))
    target_cell = tr.contents.get(col_idx)
    has_newline = target_cell.contents.endswith('\n')
    _debug('target_cell', target_cell)
    val_td = mk_val_td(val_str, is_last=has_newline)
    _debug('replacement cell', val_td)
    tr.contents.set(col_idx, val_td)
    _debug('tr after', tr)


def main():
    """Read a wiki table from stdin, merge results into it, print it.

    ``sys.argv[1]`` names a file holding a Python literal that maps language
    code to ``{experiment name: value}``.
    """
    input_table = sys.stdin.read()
    # NOTE(review): eval() executes whatever the file contains. Only feed it
    # result files you produced yourself; consider ast.literal_eval.
    with open(sys.argv[1]) as data_file:
        input_data = eval(data_file.read())

    table = mwparserfromhell.parse(input_table.strip())
    table_inner = table.get(0).contents
    # NOTE(review): node 2 is taken to be the heading row -- confirm against
    # the layout of the wiki table being edited.
    headings = table_inner.get(2).contents.nodes
    lang_order = []
    for tag in headings:
        if not isinstance(tag, Tag):
            continue
        title = tag.contents.strip()
        if not title:
            continue
        lang_order.append(LANG_NAME_CODE_MAP[title])

    for lang, data in input_data.items():
        lang_idx = lang_order.index(lang)
        data = [(name_to_attrs(name), value_to_str(value))
                for name, value in data.items()]
        data = sorted(data, key=lambda pair: attrs_to_sort_tuple(pair[0]))
        # NOTE(review): scanning starts at node 4, presumably the first data
        # row -- confirm.
        table_idx = 4
        for attrs, val_str in data:
            title_str = attrs_to_str(attrs)
            col_idx = lang_idx + 1
            while table_idx < len(table_inner.nodes):
                tr = table_inner.get(table_idx)
                if len(tr.contents) > 1:
                    cell_contents = tr.contents.get(0).contents
                    existing_title_str = (
                        str(cell_contents).strip(' ').strip("'"))
                    if existing_title_str == title_str:
                        # insert into existing
                        insert_into_tr(tr, col_idx, val_str)
                        break
                else:
                    table_inner.remove(tr, recursive=False)
                table_idx += 1
            else:
                # append to end
                tr = mk_initial_tr(title_str)
                insert_into_tr(tr, col_idx, val_str)
                table_inner.append(tr)

    print(table)


if __name__ == '__main__':
    main()
').strip("'") + if existing_title_str == title_str: + # insert into existing + insert_into_tr(tr, col_idx, val_str) + break + else: + table_inner.remove(tr, recursive=False) + table_idx += 1 + else: + # append to end + tr = mk_initial_tr(title_str) + insert_into_tr(tr, col_idx, val_str) + table_inner.append(tr) + +print(table) Index: branches/apertium-tagger/experiments/requirements.txt =================================================================== --- branches/apertium-tagger/experiments/requirements.txt (nonexistent) +++ branches/apertium-tagger/experiments/requirements.txt (revision 68807) @@ -0,0 +1,2 @@ +aitertools==0.1.0 +tabulate==0.7.5 Index: branches/apertium-tagger/experiments/run_experiment.py =================================================================== --- branches/apertium-tagger/experiments/run_experiment.py (revision 68806) +++ branches/apertium-tagger/experiments/run_experiment.py (revision 68807) @@ -1,39 +1,26 @@ -import statistics -import sys +from statistics import mean, stdev from os import mkdir from os.path import isdir, join as pjoin, exists as pexists -import subprocess from subprocess import PIPE import functools import itertools import aitertools import argparse -import tabulate -import csv import asyncio from asyncio.subprocess import create_subprocess_exec -from collections import namedtuple -from contextlib import contextmanager import os -from pprint import pprint +import re +from pprint import pformat +import datetime from evaluate_tagger import TaggerEvaluator -from split_corpus_n import main as split_corpus_n +from shell_utils import ( + cd, filter, proc_filter, check_run, writeiter, MapFilter) + loop = asyncio.get_event_loop() -@contextmanager -def cd(newdir): - prevdir = os.getcwd() - os.chdir(os.path.expanduser(newdir)) - try: - yield - finally: - os.chdir(prevdir) - - TMPDIR = 'experimenttmp' - DEFAULT_TEXTS = { 'cat': ['texts/miscellaneous.tagged.txt'], 'spa': ['texts/miscellaneous.tagged.txt'], @@ -50,14 +37,21 @@ 
'texts/akatsuki.txt', ], } - +TSX_MAP = { + 'hbs': 'apertium-hbs.hbs-coarse.tsx', +} DEFAULT_LANGUAGES = ['cat', 'spa', 'hbs', 'rus', 'kaz', 'por', 'swe'] +NO_TSX_LANGUAGES = ['rus'] +STRIP_AT_LANGUAGES = ['rus'] +AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>') + def comma_list(s): if s == '()': return [] return s.split(',') + def comma_colon_dict(s): d = {} for bit in s.split(','): @@ -65,184 +59,41 @@ d[pair[0]] = pair[1].split(',') return d + def parse_args(): - parser = argparse.ArgumentParser(description="Runs a series of experiments on different part of speech taggers and different language data.") - parser.add_argument('languagesdir', help="Path to the directory containing all the individaul language data directories") - parser.add_argument('--languages', help="Only run experiments for these languages, comma separated", default=DEFAULT_LANGUAGES, type=comma_list) - parser.add_argument('--taggers', help="Only run experiments with these taggers, comma separated", default=DEFAULT_TAGGERS, type=comma_list) - parser.add_argument('--language-texts', help="Use different texts per language, coma seperated colon pairs", default=DEFAULT_TEXTS, type=comma_colon_dict) - parser.add_argument('--folds', help="Use x-fold validation instead of 10-fold", default=10, type=int) - parser.add_argument('--reuse', help="Reuse preprocesed dictionary and corpa from previous run", action='store_true') + parser = argparse.ArgumentParser( + description="Runs a series of experiments on different part of speech " + "taggers and different language data.") + parser.add_argument( + 'languagesdir', + help="Path to the directory containing all the individaul language " + "data directories") + parser.add_argument( + '--languages', + help="Only run experiments for these languages, comma separated", + default=DEFAULT_LANGUAGES, type=comma_list) + parser.add_argument( + '--taggers', + help="Only run experiments with these taggers, comma separated", + default=DEFAULT_TAGGERS, type=comma_list) 
+ parser.add_argument( + '--language-texts', + help="Use different texts per language, coma seperated colon pairs", + default=DEFAULT_TEXTS, type=comma_colon_dict) + parser.add_argument( + '--folds', + help="Use x-fold validation instead of 10-fold", + default=10, type=int) + parser.add_argument( + '--reuse', + help="Reuse preprocesed dictionary and corpa from previous run", + action='store_true') return parser.parse_args() -def read1k_chunker(f): - def read1k(): - return f.read(1024) - return iter(read1k, '') -def filter(func=None, iter_filter=False, input_chunker=None, output_separator=''): - if func is None: - def defer(func=None): - return filter(func, iter_filter=iter_filter, input_chunker=input_chunker, output_separator=output_separator) - return defer - - def generator(input_iter): - for line in input_iter: - filtered = func(line) - if filtered is not None: - yield filtered + output_separator - - @functools.wraps(func) - def wrapper(input, output=None): - input_file = None - if isinstance(input, str): - input_iter = input_file = open(input) - if input_chunker: - input_iter = input_chunker(input_file) - else: - input_iter = input_file.readlines() - else: - input_iter = input - if iter_filter: - gen = func(input_iter) - else: - gen = generator(input_iter) - if output is None: - return gen - output_file = open(output, 'w') - for line in gen: - output_file.write(line) - if input_file is not None: - input_file.close() - output_file.close() - return wrapper - - -class MapFilter: - def __init__(self, aiterable, pred=None, tran=None): - self.aiterable = aiterable - self.pred = pred - self.tran = tran - - async def __aiter__(self): - return self - - async def __anext__(self): - while True: - payload = await self.aiterable.__anext__() - if self.pred is None or self.pred(payload): - if self.tran is None: - return payload - else: - return self.tran(payload) - - -async def dir_in(input_fn, proc): - input_file = open(input_fn) - while 1: - b = input_file.read(1024) - if 
not len(b): - return - await proc.stdin.write(b) - - -async def dir_out(proc, output_fn): - output_file = open(output_fn, 'w') - while 1: - b = await in_proc.read(1024) - if not len(b): - return - output_file.write(b) - - -class Tee(MapFilter): - def __init__(self, aiterable, log_file): - self.log_file = open(log_file, 'wb') - super().__init__(aiterable, tran=self.tran) - - def tran(self, bit): - self.log_file.write(bit) - return bit - - -async def pipe(in_proc, out_proc): - while 1: - b = await in_proc.stdout.read(16384) - if b == b'': - return - out_proc.stdin.write(b) - #await out_proc.stdin.drain() - -#async def readlines(in_proc): -# while 1: -# b = await in_proc.stdout.readline() -# if not len(b): -# return -# b - - -async def writeiter(iter, out_proc): - i = 0 - async for block in iter: - if (i % 10000) == 0: - print(".", end="", flush=True) - out_proc.stdin.write(block) - #await out_proc.stdin.drain() - i += 1 - print("writeiter done") - out_proc.stdin.write_eof() - -def proc_filter(func=None, output_chunker=None): - if func is None: - def defer(func): - return proc_filter(func, output_chunker=output_chunker) - return defer - - @functools.wraps(func) - def wrapper(*args, **kwargs): - input = kwargs.pop('input', None) - output = kwargs.pop('output', None) - - if callable(func): - cmd = func(*args, **kwargs) - else: - cmd = func - - kwargs = {} - if isinstance(input, str): - kwargs['stdin'] = open(input) - else: - kwargs['stdin'] = subprocess.PIPE - if output: - kwargs['stdout'] = open(output, 'w') - else: - kwargs['stdout'] = subprocess.PIPE - print("RUNNING: ", ' '.join(cmd)) - proc = subprocess.Popen(cmd, universal_newlines=True, **kwargs) - print(type(input)) - if not isinstance(input, str) and input is not None: - for line in input: - if 'apertium-destxt' in cmd: - print('des', line) - #if not isinstance(input, bytes): - #line = line.encode('utf8') - #print(proc.stdin) - proc.stdin.write(line) - if not output: - if output_chunker: - return 
output_chunker(proc.stdout) - else: - return proc.stdout.readlines() - stdout, stderr = proc.communicate() - retcode = proc.poll() - return subprocess.CompletedProcess(proc.args, retcode, stdout, stderr) - - return wrapper - - @filter -def strip_blanks_(line): +def strip_blanks(line): if line != '\n': return line @@ -256,27 +107,12 @@ def extract_first_analysis(line): return '/'.join(line.split('/')[0:2]).strip().rstrip('$') + '$' -strip_blanks = functools.partial(MapFilter, pred=lambda line: line != '\n') - @filter def passthrough(line): return line -def prun(*args, **kwargs): - for key in ('stdin', 'stdout', 'stderr'): - if key not in kwargs: - kwargs[key] = subprocess.PIPE - print('prun', args, kwargs) - return subprocess.run(*args, **kwargs) - -def check_run(cmd, *args, **kwargs): - kwargs['check'] = True - print("RUNNING: ", ' '.join(cmd)) - return subprocess.run(cmd, *args, **kwargs) - - @proc_filter def lt_proc(morphology_fn, dictcase=False): cmd = ['lt-proc', morphology_fn] @@ -296,26 +132,32 @@ return cmd - @proc_filter -def tagger_train_sup(model_type, model_fn, train_fn, trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0): +def tagger_train_sup(model_type, model_fn, train_fn, + trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0): cmd = ['apertium-tagger', '--supervised={}'.format(iterations), model_fn] - if not all((trainsrc_fn, dic_fn, tsx_fn)) and model_type != 'unigram': + if (not all((trainsrc_fn, dic_fn, tsx_fn)) and + not model_type.startswith('unigram')): raise ValueError("Optional arguments required for non-unigram models") - if model_type != 'unigram': - #apertium-tagger -s 0 /tmp/$DN.dic /tmp/spa.misc.$i.trainsrc $TSX /tmp/spa.misc.$i.prob /tmp/spa.misc.$i.train /tmp/spa.misc.$i.trainsrc + if model_type == 'lwsw': + raise ValueError("No supervised training for lwsw model") + if model_type.startswith('unigram'): + cmd.append(train_fn) + else: cmd[2:2] = [dic_fn, trainsrc_fn, tsx_fn] + cmd.append(train_fn) cmd.append(trainsrc_fn) 
@proc_filter
def tagger_train_unsup(model_type, model_fn, trainsrc_fn, dic_fn, tsx_fn,
                       iterations=0):
    """Build the ``apertium-tagger --train`` command for unsupervised training.

    The ``proc_filter`` decorator runs the returned argument list as a
    subprocess.

    Args:
        model_type: tagger family name; names starting with ``unigram`` are
            rejected.
        model_fn: path of the model file passed as the last argument.
        trainsrc_fn: path of the training corpus argument.
        dic_fn: path of the dictionary argument.
        tsx_fn: path of the tagger definition (.tsx) argument.
        iterations: value placed in ``--train=<iterations>``.

    Raises:
        ValueError: if *model_type* names a unigram model.
    """
    # The check must look at the model *type*; it previously inspected
    # ``model_fn`` (a file path), so it never fired for unigram models.
    if model_type.startswith('unigram'):
        raise ValueError("No unsupervised training for unigram models")
    cmd = ['apertium-tagger', '--train={}'.format(iterations),
           dic_fn, trainsrc_fn, tsx_fn, model_fn]
    # insert_model is defined elsewhere in this file; presumably it adds the
    # flag selecting model_type to cmd -- confirm there.
    insert_model(cmd, model_type)
    return cmd
create_subprocess_exec( + 'apertium-filter-ambiguity', tsx_fn, + stdin=filter_ambg_inpipe, stdout=open(output_fn, 'wb')) pipes.append(filter_ambg.wait()) + else: + lt_proc = await create_subprocess_exec( + 'lt-proc', morphology_fn, + stdin=lt_inpipe, stdout=open(output_fn, 'wb')) + pipes.append(lt_proc.wait()) + return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes)) -#def lt_proc(morphology_fn, input_fn, output_fn=None, dictcase=False): - #cmd = ['lt-proc', morphology_fn] - #if dictcase: - #cmd.insert(1, '--dictionary-case') - #if isinstance(input_fn, str): - #cmd.append(input_fn) - #cmd.append(output_fn) - #return prun(cmd) - #else: - #kwargs = {'stdin': subprocess.PIPE} - #if output_fn: - ##kwargs['stdout'] = open(output_fn, 'w') - #with subprocess.Popen(cmd, **kwargs) as proc: - #if not isinstance(input_fn, str): - #for line in input_fn: - #if not isinstance(input_fn, bytes): - #line = line.encode('utf8') - #proc.stdin.write(line) - #stdout, stderr = proc.communicate() - #retcode = proc.poll() - #return subprocess.CompletedProcess(proc.args, retcode, stdout, stderr) - @proc_filter def cg_proc(grammar_fn, dictcase=True): cmd = ['cg-proc', grammar_fn] @@ -387,12 +220,17 @@ return cmd +@filter +def strip_at_tag(line): + return AT_TAG_REGEX.sub('', line) + + @filter(iter_filter=True) def strip_unknown_sent(gen): buff = [] valid_sent = True for line in gen: - if line.strip() == '': + if line.strip().strip('¶') == '': if valid_sent: for line in buff: yield line @@ -404,6 +242,7 @@ if '/*' in line: valid_sent = False + def split_n_r(corpus_fn, train_fn, ref_fn, n, r): sentences = 0 with open(corpus_fn) as corpus_file: @@ -414,7 +253,6 @@ split_left = int(float(sentences) * r / n) split_right = int(float(sentences) * (r + 1) / n) - buffer = '' index = 0 corpus_file.seek(0) @@ -440,37 +278,19 @@ def wrapper(self, *args, **kwargs): accuracies = [] for i, xval_fns in enumerate(self.xval_fns): - #(prefix, train_fn, src_fn, ref_fn) xval_fns['test'] = 
xval_fns['prefix'] + 'test.' + name xval_fns['model'] = xval_fns['prefix'] + 'model.' + name - func(self, xval_fns) # train_fn, src_fn, ref_fn, test_fn, model_fn - evaluator = TaggerEvaluator(xval_fns['src'], xval_fns['ref'], xval_fns['test']) + func(self, xval_fns) + evaluator = TaggerEvaluator( + xval_fns['src'], xval_fns['ref'], xval_fns['test']) evaluator.run_analysis() accuracies.append(evaluator.accuracy) - return (min(accuracies), max(accuracies), statistics.mean(accuracies), statistics.stddev(accuracies)) + return (min(accuracies), max(accuracies), + mean(accuracies), stdev(accuracies)) return wrapper return dec -def unigram_taggers(cls): - for do_cg in [False, True]: - for unigram_type in range(1,4): - unigram_model = 'unigram' + str(unigram_type) - name = ('cg' if do_cg else '') + unigram_model - - @cls.reg_experiment - @xval_experiment(name) - def experiment(self, xval_fns, do_cg=do_cg, unigram_model=unigram_model): - tagger_train_sup('unigram', unigram_model, xval_fns['model'], xval_fns['train']) - if do_cg: - tagger_input = cg_proc(self.cg_fn, input=xval_fns['src']) - else: - tagger_input = xval_fns['src'] - tagger_tag(unigram_model, xval_fns['model'], input=tagger_input, output=xval_fns['test']) - setattr(cls, 'experiment_' + name, experiment) - return cls - - def extract_src(morphology_fn, input, output=None): ref_words_iter = extract_words(input=input) return lt_proc(morphology_fn, input=ref_words_iter, output=output) @@ -489,6 +309,13 @@ self.morphology_fn = pjoin(lang_root, lang + '.automorf.bin') self.cg_fn = pjoin(lang_root, lang + '.rlx.bin') self.dix_fn = pjoin(lang_root, pair_name + '.dix') + if not pexists(self.dix_fn): + self.dix_fn = pjoin(lang_root, '.deps', pair_name + '.dix') + if lang in NO_TSX_LANGUAGES: + self.tsx_fn = None + elif lang in TSX_MAP: + self.tsx_fn = pjoin(lang_root, TSX_MAP[lang]) + else: self.tsx_fn = pjoin(lang_root, pair_name + '.tsx') self.text_fns = [pjoin(lang_root, text) for text in texts] @@ -521,27 +348,38 @@ 
self.do_preprocessing() def validate(self): + for fn in [self.morphology_fn, self.cg_fn, self.tsx_fn, self.dix_fn]: + if fn is not None and not pexists(fn): + raise MissingLanguageDataException(fn=fn) + check_run(["apertium-validate-dictionary", self.dix_fn]) + if self.tsx_fn is not None: check_run(["apertium-validate-tagger", self.tsx_fn]) - for fn in [self.morphology_fn, self.cg_fn, self.tsx_fn, self.dix_fn]: - if not pexists(fn): - raise MissingLanguageDataException(fn=fn) - def do_preprocessing(self): if not isdir(self.work_dir): mkdir(self.work_dir) - strip_unknown_sent(itertools.chain(*(open(fn).readlines() for fn in self.text_fns)), self.joined_fn) - #loop.run_until_complete(strip_blanks(self.joined_fn, self.ref_fn)) - strip_blanks_(self.joined_fn, self.ref_fn) + joined = itertools.chain(*(open(fn).readlines() + for fn in self.text_fns)) + if self.lang in STRIP_AT_LANGUAGES: + strip_unknown_in = strip_at_tag(joined) + else: + strip_unknown_in = joined + strip_unknown_sent(strip_unknown_in, self.joined_fn) + strip_blanks(self.joined_fn, self.ref_fn) extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn) - loop.run_until_complete(fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn, output_fn=self.dic_fn)) + loop.run_until_complete( + fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn, + output_fn=self.dic_fn)) for i, xval_fn in enumerate(self.xval_fns): - split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'], self.folds, i) - extract_src(self.morphology_fn, input=xval_fn['ref'], output=xval_fn['src']) - extract_src(self.morphology_fn, input=xval_fn['train'], output=xval_fn['trainsrc']) + split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'], + self.folds, i) + extract_src(self.morphology_fn, + input=xval_fn['ref'], output=xval_fn['src']) + extract_src(self.morphology_fn, + input=xval_fn['train'], output=xval_fn['trainsrc']) def _analyse(self, test_fn): evaluator = TaggerEvaluator(self.src_fn, self.ref_fn, test_fn) @@ -552,24 
+390,35 @@ def reg_experiment(cls, name): def reg(func): LanguageTaggerExperimentor.experiments[name] = func + return func return reg @classmethod + def needs_tsx(cls, func): + func.needs_tsx = True + return func + + @classmethod def add_experiments(cls): for do_cg in [False, True]: - for unigram_type in range(1,4): + for unigram_type in range(1, 4): unigram_model = 'unigram' + str(unigram_type) name = ('cg' if do_cg else '') + unigram_model @cls.reg_experiment(name) @xval_experiment(name) - def unigram_experiment(self, xval_fns, do_cg=do_cg, unigram_model=unigram_model): - tagger_train_sup('unigram', unigram_model, xval_fns['model'], xval_fns['train']) + def unigram_experiment(self, xval_fns, + do_cg=do_cg, + unigram_model=unigram_model): + tagger_train_sup(unigram_model, + xval_fns['model'], xval_fns['train']) if do_cg: - tagger_input = cg_proc(self.cg_fn, input=xval_fns['src']) + tagger_input = cg_proc(self.cg_fn, + input=xval_fns['src']) else: tagger_input = xval_fns['src'] - tagger_tag(unigram_model, xval_fns['model'], input=tagger_input, output=xval_fns['test']) + tagger_tag(unigram_model, xval_fns['model'], + input=tagger_input, output=xval_fns['test']) for do_cg in [False, True]: name = ('cg' if do_cg else '') + '1st' @@ -581,7 +430,7 @@ tagger_input = cg_proc(self.cg_fn, input=self.src_fn) else: tagger_input = self.src_fn - extract_first_analysis(self.src_fn, first_fn) + extract_first_analysis(tagger_input, first_fn) return self._analyse(first_fn) for do_cg in [False, True]: @@ -595,6 +444,7 @@ @cls.reg_experiment(name) @xval_experiment(name) + @cls.needs_tsx def model_experiment(self, xval_fns, do_cg=do_cg, is_supervised=is_supervised, model=model, iterations=iterations): if is_supervised: tagger_train_sup( @@ -612,10 +462,12 @@ tsx_fn=self.tsx_fn, iterations=iterations) if do_cg: - tagger_input = cg_proc(self.cg_fn, input=xval_fns['src']) + tagger_input = cg_proc(self.cg_fn, + input=xval_fns['src']) else: tagger_input = xval_fns['src'] - 
tagger_tag(model, xval_fns['model'], input=tagger_input, output=xval_fns['test']) + tagger_tag(model, xval_fns['model'], + input=tagger_input, output=xval_fns['test']) @classmethod def all_taggers(cls): @@ -622,6 +474,9 @@ return cls.experiments.keys() def get_tagger(self, tagger): + tagger_func = self.experiments[tagger] + if self.tsx_fn is None and getattr(tagger_func, 'needs_tsx', False): + return None return functools.partial(self.experiments[tagger], self) @@ -641,22 +496,32 @@ for lang in args.languages: taggers = args.language_texts[lang] lang_root = pjoin(args.languagesdir, 'apertium-' + lang) + def mk_experimentor(): - return LanguageTaggerExperimentor(lang, lang_root, taggers, args.folds, reuse=args.reuse) + return LanguageTaggerExperimentor(lang, lang_root, taggers, + args.folds, reuse=args.reuse) try: experimentor = mk_experimentor() except MissingLanguageDataException as e: print("Missing {}... Trying to build it for you.".format(e.fn)) with cd(lang_root): - check_run('./autogen.sh') - check_run('make') + check_run(['./autogen.sh']) + check_run(['make']) experimentor = mk_experimentor() languages_tagger_accuracies[lang] = {} for tagger in args.taggers: experiment = experimentor.get_tagger(tagger) + if experiment is None: + print("Skipping {}/{} since it needs a tsx" + .format(lang, tagger)) + else: languages_tagger_accuracies[lang][tagger] = experiment() finally: - pprint(languages_tagger_accuracies) + result_pretty = pformat(languages_tagger_accuracies) + print(result_pretty) + outf = pjoin(TMPDIR, 'result-{}.pyson' + .format(datetime.datetime.now().isoformat())) + open(outf, 'w').write(result_pretty) if __name__ == '__main__': Index: branches/apertium-tagger/experiments/shell_utils.py =================================================================== --- branches/apertium-tagger/experiments/shell_utils.py (nonexistent) +++ branches/apertium-tagger/experiments/shell_utils.py (revision 68807) @@ -0,0 +1,186 @@ +import os +import functools +from 
import os
import functools
from contextlib import contextmanager
import subprocess


@contextmanager
def cd(newdir):
    """Run the ``with`` body inside *newdir* (``~`` is expanded).

    The previous working directory is restored on exit, even on error.
    """
    prevdir = os.getcwd()
    os.chdir(os.path.expanduser(newdir))
    try:
        yield
    finally:
        os.chdir(prevdir)


def read1k_chunker(f):
    """Yield successive ``f.read(1024)`` chunks until an empty one is read.

    Works for text and binary files alike; comparing against ``''`` only, as
    before, never terminated on binary files because ``b'' != ''``.
    """
    while True:
        chunk = f.read(1024)
        if not chunk:
            return
        yield chunk


def _close_after(items, resource):
    """Yield everything from *items*, then close *resource*."""
    try:
        yield from items
    finally:
        resource.close()


def filter(func=None, iter_filter=False,
           input_chunker=None, output_separator=''):
    """Decorator turning *func* into a line/stream filter.

    The decorated function is called as ``wrapper(input, output=None)``:

    * *input* is a file name (str) or any iterable of pieces. A file is read
      with ``readlines()`` unless *input_chunker* is given, in which case
      ``input_chunker(file)`` supplies the pieces.
    * With ``iter_filter=False`` *func* is applied to each piece; ``None``
      results are dropped and *output_separator* is appended to the others.
      With ``iter_filter=True`` *func* receives the whole iterable and must
      return an iterable of output pieces.
    * If *output* is None the (lazy) result is returned; otherwise the pieces
      are written to the file named *output* and None is returned.

    May be used bare (``@filter``) or with options (``@filter(...)``).
    """
    if func is None:
        def defer(func=None):
            return filter(
                func, iter_filter=iter_filter,
                input_chunker=input_chunker, output_separator=output_separator)
        return defer

    def generator(input_iter):
        for line in input_iter:
            filtered = func(line)
            if filtered is not None:
                yield filtered + output_separator

    @functools.wraps(func)
    def wrapper(input, output=None):
        input_file = None
        if isinstance(input, str):
            if input_chunker:
                # Chunks are read lazily, so the file must stay open until
                # the result has been consumed.
                input_file = open(input)
                input_iter = input_chunker(input_file)
            else:
                # Read eagerly (as before) so the file can be closed at once
                # and *output* may safely name the same path.
                with open(input) as whole_file:
                    input_iter = whole_file.readlines()
        else:
            input_iter = input
        gen = func(input_iter) if iter_filter else generator(input_iter)
        if input_file is not None:
            gen = _close_after(gen, input_file)
        if output is None:
            return gen
        with open(output, 'w') as output_file:
            for piece in gen:
                output_file.write(piece)
    return wrapper


class MapFilter:
    """Asynchronous iterator that filters and/or transforms another one.

    Items of *aiterable* for which ``pred(item)`` is false are skipped; the
    remaining ones are passed through ``tran`` when it is given.
    """

    def __init__(self, aiterable, pred=None, tran=None):
        self.aiterable = aiterable
        self.pred = pred
        self.tran = tran

    def __aiter__(self):
        # Must be a plain method: ``async for`` on Python >= 3.7 rejects an
        # ``__aiter__`` that returns an awaitable.
        return self

    async def __anext__(self):
        while True:
            payload = await self.aiterable.__anext__()
            if self.pred is None or self.pred(payload):
                if self.tran is None:
                    return payload
                else:
                    return self.tran(payload)


async def dir_in(input_fn, proc):
    """Feed the file *input_fn* into ``proc.stdin`` in 1 KiB blocks.

    Assumes ``proc.stdin`` is an asyncio ``StreamWriter`` (bytes, with a
    ``drain()`` coroutine). EOF is not signalled; the caller decides when to
    close the stream.
    """
    with open(input_fn, 'rb') as input_file:
        while True:
            block = input_file.read(1024)
            if not block:
                return
            # write() is not a coroutine; drain() is what applies
            # back-pressure.
            proc.stdin.write(block)
            await proc.stdin.drain()


async def dir_out(proc, output_fn):
    """Copy everything read from ``proc.stdout`` into the file *output_fn*.

    Assumes ``proc.stdout`` is an asyncio ``StreamReader`` yielding bytes.
    """
    with open(output_fn, 'wb') as output_file:
        while True:
            block = await proc.stdout.read(1024)
            if not block:
                return
            output_file.write(block)


class Tee(MapFilter):
    """Pass items through unchanged while also writing them to *log_file*.

    The log file is opened in binary mode and closed once the wrapped
    iterator is exhausted.
    """

    def __init__(self, aiterable, log_file):
        self.log_file = open(log_file, 'wb')
        super().__init__(aiterable, tran=self.tran)

    def tran(self, bit):
        self.log_file.write(bit)
        return bit

    async def __anext__(self):
        try:
            return await super().__anext__()
        except StopAsyncIteration:
            # End of stream: flush and release the log file.
            self.log_file.close()
            raise


async def pipe(in_proc, out_proc):
    """Copy ``in_proc.stdout`` to ``out_proc.stdin`` until EOF is read.

    NOTE(review): writes are not drained and EOF is not forwarded to
    ``out_proc``; confirm this is what callers expect.
    """
    while True:
        block = await in_proc.stdout.read(16384)
        if block == b'':
            return
        out_proc.stdin.write(block)


async def writeiter(iter, out_proc):
    """Write every block of the async iterable *iter* to ``out_proc.stdin``.

    Prints a progress dot every 10000 blocks and signals EOF at the end.
    """
    i = 0
    async for block in iter:
        if (i % 10000) == 0:
            print(".", end="", flush=True)
        out_proc.stdin.write(block)
        i += 1
    print("writeiter done")
    out_proc.stdin.write_eof()


def proc_filter(func=None, output_chunker=None):
    """Decorator turning a command builder into a subprocess-backed filter.

    *func* returns an argument list (or is one). The decorated callable
    accepts two extra keyword arguments:

    * ``input``  - a file name used as the child's stdin, or an iterable of
      strings written to it, or None.
    * ``output`` - a file name receiving the child's stdout. When omitted,
      the output is returned: ``output_chunker(proc.stdout)`` if a chunker
      was given, otherwise the list of output lines.

    With ``output`` given, a ``subprocess.CompletedProcess`` is returned.
    """
    if func is None:
        def defer(func):
            return proc_filter(func, output_chunker=output_chunker)
        return defer

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        input = kwargs.pop('input', None)
        output = kwargs.pop('output', None)

        cmd = func(*args, **kwargs) if callable(func) else func

        stdin_file = open(input) if isinstance(input, str) else None
        try:
            stdout_file = open(output, 'w') if output else None
            try:
                print("RUNNING: ", ' '.join(cmd))
                proc = subprocess.Popen(
                    cmd, universal_newlines=True,
                    stdin=(subprocess.PIPE if stdin_file is None
                           else stdin_file),
                    stdout=(subprocess.PIPE if stdout_file is None
                            else stdout_file))
            finally:
                # The child holds its own copies of these descriptors.
                if stdout_file is not None:
                    stdout_file.close()
        finally:
            if stdin_file is not None:
                stdin_file.close()

        if proc.stdin is not None:
            # NOTE(review): all input is written before any output is read;
            # a child producing more output than the pipe buffer holds while
            # still being fed could block.
            if input is not None:
                for line in input:
                    proc.stdin.write(line)
            # Without EOF on stdin the child never finishes and reading its
            # stdout below would hang.
            proc.stdin.close()

        if not output:
            if output_chunker:
                return output_chunker(proc.stdout)
            lines = proc.stdout.readlines()
            proc.stdout.close()
            proc.wait()
            return lines
        retcode = proc.wait()
        return subprocess.CompletedProcess(proc.args, retcode, None, None)

    return wrapper


def prun(*args, **kwargs):
    """``subprocess.run`` with stdin/stdout/stderr defaulting to pipes."""
    for key in ('stdin', 'stdout', 'stderr'):
        kwargs.setdefault(key, subprocess.PIPE)
    print('prun', args, kwargs)
    return subprocess.run(*args, **kwargs)


def check_run(cmd, *args, **kwargs):
    """Log *cmd* and run it with ``check=True``.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit status.
    """
    kwargs['check'] = True
    # A plain string must not be joined character by character.
    print("RUNNING: ", cmd if isinstance(cmd, str) else ' '.join(cmd))
    return subprocess.run(cmd, *args, **kwargs)