Index: branches/apertium-tagger/experiments/experiments.py =================================================================== --- branches/apertium-tagger/experiments/experiments.py (revision 69967) +++ branches/apertium-tagger/experiments/experiments.py (revision 69968) @@ -6,7 +6,7 @@ from shell_wrappers import (cg_proc, extract_first_analysis, tagger_tag, tagger_train_sup, tagger_train_unsup) -# Experiment db +# Experiment registry experiments = {} @@ -22,11 +22,21 @@ return func -def non_default(func): - func.non_default = True +def meta(func): + func.meta = True return func +def default(func): + func.default = True + return func + + +def cg_extra(func): + func.cg_extra = True + return func + + # Statistical helpers def aggregates(data): return (min(data), max(data), @@ -66,6 +76,7 @@ @reg_experiment(name) @xval_experiment(name) + @default def unigram_experiment(lab, xval_fns, do_cg=do_cg, unigram_model=unigram_model): @@ -84,6 +95,7 @@ name = ('cg' if do_cg else '') + '1st' @reg_experiment(name) + @default def pick_first_experiment(lab, do_cg=do_cg): first_fn = pjoin(lab.work_dir, 'test.' + name) if do_cg: @@ -93,7 +105,14 @@ extract_first_analysis(tagger_input, first_fn) return get_single_analysis(lab, first_fn) -for do_cg in [False, True]: +for cg_aug in [0, 1, 2, 3, (4, 0), (4, 5), (4, 10), (4, 20), (4, 30)]: + if isinstance(cg_aug, tuple): + cg_aug, cg_aug_t = cg_aug + else: + cg_aug_t = None + for do_cg in [None, 'in', 'dual', 'inv']: + if do_cg == 'inv': + continue for is_supervised, model in [(True, 'bigram'), (False, 'bigram'), (False, 'lwsw')]: @@ -103,18 +122,20 @@ iterations_list = [0, 50, 250] for iterations in iterations_list: name = ( - "{cg}{sup}_{model}".format( - cg='cg_' if do_cg else '', + "{cgt}{cg}{sup}_{model}".format( + cgt='cgt{}_'.format(cg_aug) if cg_aug else '', + cg='cg{}_'.format(do_cg if do_cg != 'in' else '') + if do_cg else '', sup='sup' if is_supervised else 'unsup', model=model ) + ("_i{iterations}".format(iterations=iterations) - if len(iterations_list) > 1 else "")) + if len(iterations_list) > 1 else "") + + ("_j{iterations}".format(iterations=cg_aug_t) + if cg_aug_t else "")) - @reg_experiment(name) - @xval_experiment(name) @needs_tsx - def model_experiment(lab, xval_fns, do_cg=do_cg, + def model_experiment(lab, xval_fns, cg_aug=cg_aug, do_cg=do_cg, is_supervised=is_supervised, model=model, iterations=iterations): if is_supervised: @@ -124,7 +145,9 @@ trainsrc_fn=xval_fns['trainsrc'], dic_fn=lab.dic_fn, tsx_fn=lab.tsx_fn, - iterations=iterations) + iterations=iterations, + cg_aug=cg_aug, + cgtrain_fn=xval_fns['traincgtag']) else: tagger_train_unsup( model, xval_fns['model'], @@ -131,18 +154,37 @@ trainsrc_fn=xval_fns['trainsrc'], dic_fn=lab.dic_fn, tsx_fn=lab.tsx_fn, - iterations=iterations) - if do_cg: - tagger_input = cg_proc(lab.cg_fn, - input=xval_fns['src']) + iterations=iterations, + cg_aug=cg_aug, + cgtrain_fn=xval_fns['traincgtag']) + if do_cg == 'in': + tagger_input = xval_fns['cgtag'] else: tagger_input = xval_fns['src'] + if do_cg in ['dual', 'inv']: + cg_tagger_input = xval_fns['cgtag'] + else: + cg_tagger_input = None + tagger_cg_aug = None + if do_cg == 'dual': + tagger_cg_aug = 1 + elif do_cg == 'inv': + tagger_cg_aug = 2 tagger_tag( - model, xval_fns['model'], input=tagger_input, - output=xval_fns['test']).check_returncode() + model, xval_fns['model'], cg_fn=cg_tagger_input, + cg_aug=tagger_cg_aug, input=tagger_input, + output=xval_fns['test'] + ).check_returncode() + if not cg_aug and do_cg in [None, 'in']: + model_experiment = default(model_experiment) + else: + model_experiment = cg_extra(model_experiment) + model_experiment = xval_experiment(name)(model_experiment) + model_experiment = reg_experiment(name)(model_experiment) @reg_experiment('word_count') +@default def word_count(lab): count = 0 for line in open(lab.src_fn): @@ -153,7 +195,7 @@ @reg_experiment('new_cg_ambg') @needs_tsx -@non_default +@meta def new_cg_ambg(lab): model_fn = pjoin(lab.work_dir, 'new_cg_ambg.model') tagger_train_sup('bigram', model_fn, Index: branches/apertium-tagger/experiments/run_experiment.py =================================================================== --- branches/apertium-tagger/experiments/run_experiment.py (revision 69967) +++ branches/apertium-tagger/experiments/run_experiment.py (revision 69968) @@ -1,25 +1,24 @@ -from os import mkdir -from os.path import isdir, exists as pexists, join as pjoin -from subprocess import PIPE +import argparse +import datetime import functools import itertools -import aitertools -import argparse -import asyncio -from asyncio.subprocess import create_subprocess_exec -import os -import re import sys +from os import mkdir +from os.path import exists as pexists +from os.path import join as pjoin +from os.path import isdir from pprint import pformat -import datetime -from shell_utils import cd, filter, check_run, writeiter, MapFilter -from shell_wrappers import extract_src +import asyncio from experiments import experiments +from shell_utils import cd, check_run +from shell_wrappers import (cg_proc, extract_src, fix_dix, run_cg_conv_clean, + split_n_r, strip_blanks, strip_unknown_sent, + copy_blanks) loop = asyncio.get_event_loop() -TMPDIR = 'experimenttmp' +WORK_DIR = 'experiment_work' DEFAULT_TEXTS = { 'cat': ['texts/miscellaneous.tagged.txt'], 'spa': ['texts/miscellaneous.tagged.txt'], @@ -97,6 +96,10 @@ help="Reuse preprocesed dictionary from previous run", action='store_true') parser.add_argument( + '--cg-extra', + help="Run extra CG assisted tagging tests", + action='store_true') + parser.add_argument( '--output', help="Output file for the results of the experiment") parser.add_argument( @@ -107,94 +110,8 @@ return parser.parse_args() -@filter -def strip_blanks(line): - if line != '\n': - return line - - -filter_dix = functools.partial( - MapFilter, - pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line, - tran=lambda line: line.split(b":")[0] + b"\n") - - -async def fix_dix(morphology_fn, tsx_fn, dix_fn, output_fn): - pipes = [] - - expand_proc = await create_subprocess_exec('lt-expand', - dix_fn, stdout=PIPE) - filtered = filter_dix(expand_proc.stdout) - - extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿", - "¡", "“", "”", "«", "»", - ] - for i, extra in enumerate(extras): - extras[i] = (extra + "\n").encode('utf-8') - with_extras = aitertools.chain(filtered, extras) - - lt_inpipe, destxt_outpipe = os.pipe() - destxt = await create_subprocess_exec('apertium-destxt', - stdin=PIPE, stdout=destxt_outpipe) - os.close(destxt_outpipe) - - pipes.append(writeiter(with_extras, destxt)) - - if tsx_fn is not None: - filter_ambg_inpipe, lt_outpipe = os.pipe() - lt_proc = await create_subprocess_exec( - 'lt-proc', morphology_fn, stdin=lt_inpipe, stdout=lt_outpipe) - os.close(lt_outpipe) - - filter_ambg = await create_subprocess_exec( - 'apertium-filter-ambiguity', tsx_fn, - stdin=filter_ambg_inpipe, stdout=open(output_fn, 'wb')) - pipes.append(filter_ambg.wait()) - else: - lt_proc = await create_subprocess_exec( - 'lt-proc', morphology_fn, - stdin=lt_inpipe, stdout=open(output_fn, 'wb')) - pipes.append(lt_proc.wait()) - - return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes)) - - -filter_dix = functools.partial( - MapFilter, - pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line, - tran=lambda line: line.split(b":")[0] + b"\n") - - -SENT_END_RE = re.compile(br'/[.!?]\$$') -SELECT_RE = re.compile(br'') - - -async def cg_conv_clean(input, output): - cleanstream_inpipe, cg_conv_outpipe = os.pipe() - await create_subprocess_exec( - 'cg-conv', '--in-cg', '--out-apertium', "--ltr", - stdin=open(input, 'r'), stdout=cg_conv_outpipe) - os.close(cg_conv_outpipe) - cleanstream = await create_subprocess_exec( - 'apertium-cleanstream', '-n', - stdin=cleanstream_inpipe, stdout=PIPE) - - output_f = open(output, 'wb') - await cleanstream.stdout.readline() - async for line in cleanstream.stdout: - line = SELECT_RE.sub(b'', line) - output_f.write(line) - if SENT_END_RE.search(line): - output_f.write(b'\n') - - -def cg_conv_clean_(input, output): - return loop.run_until_complete( - cg_conv_clean(input, output)) - - PREPROCESSER_MAP = { - 'cg': cg_conv_clean_ + 'cg': run_cg_conv_clean } @@ -225,50 +142,6 @@ } -@filter(iter_filter=True) -def strip_unknown_sent(gen, invalidate_func=None): - buff = [] - valid_sent = True - for line in gen: - if line.strip().strip('¶') == '': - if valid_sent: - for line in buff: - yield line - yield '\n' - buff = [] - valid_sent = True - else: - buff.append(line) - if '/*' in line: - valid_sent = False - if invalidate_func is not None and invalidate_func(line): - valid_sent = False - - -def split_n_r(corpus_fn, train_fn, ref_fn, n, r): - sentences = 0 - with open(corpus_fn) as corpus_file: - for line in corpus_file.readlines(): - if line.strip() == '': - sentences = sentences + 1 - - split_left = int(float(sentences) * r / n) - split_right = int(float(sentences) * (r + 1) / n) - - index = 0 - - corpus_file.seek(0) - - with open(train_fn, 'w') as train_file, open(ref_fn, 'w') as ref_file: - for line in corpus_file.readlines(): - if line.strip() == '': - index = index + 1 - elif split_left <= index < split_right: - ref_file.write(line) - else: - train_file.write(line) - - class MissingLanguageDataException(Exception): def __init__(self, fn): self.fn = fn @@ -278,7 +151,7 @@ def __init__(self, lang, lang_root, texts, folds, reuse=False, reuse_dic=False): self.lang = lang - self.work_dir = pjoin(TMPDIR, lang) + self.work_dir = pjoin(WORK_DIR, lang) pair_name = 'apertium-{0}.{0}'.format(lang) self.morphology_fn = pjoin(lang_root, lang + '.automorf.bin') @@ -304,6 +177,9 @@ self.joined_fn = pjoin(self.work_dir, 'joined') self.ref_fn = pjoin(self.work_dir, 'ref') self.src_fn = pjoin(self.work_dir, 'src') + self.src_blanks_fn = self.src_fn + '.blanks' + self.cgtag_fn = pjoin(self.work_dir, 'cgtag') + self.cgtag_blanks_fn = self.cgtag_fn + '.blanks' self.dic_fn = pjoin(self.work_dir, 'filtered.dic') self.xval_fns = [] @@ -311,17 +187,21 @@ for i in range(folds): xval_prefix = pjoin(self.work_dir, 'xval.{}.'.format(i)) + xval_ref_fn = xval_prefix + 'ref' + xval_train_fn = xval_prefix + 'train' xval_src_fn = xval_prefix + 'src' xval_trainsrc_fn = xval_prefix + 'trainsrc' - xval_train_fn = xval_prefix + 'train' - xval_ref_fn = xval_prefix + 'ref' + xval_cgtag_fn = xval_prefix + 'cgtag' + xval_traincg_fn = xval_prefix + 'traincgtag' self.xval_fns.append({ 'prefix': xval_prefix, + 'ref': xval_ref_fn, 'train': xval_train_fn, 'src': xval_src_fn, - 'ref': xval_ref_fn, 'trainsrc': xval_trainsrc_fn, + 'cgtag': xval_cgtag_fn, + 'traincgtag': xval_traincg_fn, }) self.validate() @@ -362,6 +242,9 @@ self.lang, lambda x: False)) strip_blanks(self.joined_fn, self.ref_fn) extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn) + copy_blanks(self.joined_fn, self.src_fn, self.src_blanks_fn) + cg_proc(self.cg_fn, input=self.src_fn, output=self.cgtag_fn) + copy_blanks(self.joined_fn, self.cgtag_fn, self.cgtag_blanks_fn) if not reuse_dic: loop.run_until_complete( fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn, @@ -370,10 +253,10 @@ for i, xval_fn in enumerate(self.xval_fns): split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'], self.folds, i) - extract_src(self.morphology_fn, - input=xval_fn['ref'], output=xval_fn['src']) - extract_src(self.morphology_fn, - input=xval_fn['train'], output=xval_fn['trainsrc']) + split_n_r(self.src_blanks_fn, xval_fn['trainsrc'], xval_fn['src'], + self.folds, i) + split_n_r(self.cgtag_blanks_fn, xval_fn['traincgtag'], + xval_fn['cgtag'], self.folds, i) def get_tagger(self, tagger): tagger_func = experiments[tagger] @@ -388,8 +271,8 @@ def main(): args = parse_args() - if not isdir(TMPDIR): - mkdir(TMPDIR) + if not isdir(WORK_DIR): + mkdir(WORK_DIR) if args.notify: import notify2 @@ -399,24 +282,26 @@ taggers = args.language_texts[lang] lang_root = pjoin(args.languagesdir, 'apertium-' + lang) - def mk_experimentor(): + def mk_lab(): return LanguageTaggerLab(lang, lang_root, taggers, args.folds, reuse=args.reuse, reuse_dic=args.reuse_dic) try: - experimentor = mk_experimentor() + lab = mk_lab() except MissingLanguageDataException as e: print("Missing {}... Trying to build it for you.".format(e.fn)) with cd(lang_root): check_run(['./autogen.sh']) check_run(['make']) - experimentor = mk_experimentor() + lab = mk_lab() languages_tagger_accuracies[lang] = {} for tagger in args.taggers: - experiment = experimentor.get_tagger(tagger) + experiment = lab.get_tagger(tagger) if experiment is None: print("Skipping {}/{} since it needs a tsx" .format(lang, tagger)) + elif getattr(experiment, 'default', False) and + not (args.cg_extra and getattr(experiment, 'cg_extra', False)): else: languages_tagger_accuracies[lang][tagger] = experiment() finally: @@ -425,7 +310,7 @@ if args.output: outf = args.output else: - outf = pjoin(TMPDIR, 'result-{}.pyson' + outf = pjoin(WORK_DIR, 'result-{}.pyson' .format(datetime.datetime.now().isoformat())) open(outf, 'w').write(result_pretty) Index: branches/apertium-tagger/experiments/shell_wrappers.py =================================================================== --- branches/apertium-tagger/experiments/shell_wrappers.py (revision 69967) +++ branches/apertium-tagger/experiments/shell_wrappers.py (revision 69968) @@ -1,6 +1,20 @@ -from shell_utils import filter, proc_filter +import functools +import os +import re +from subprocess import PIPE +import aitertools +import asyncio +from asyncio.subprocess import create_subprocess_exec +from shell_utils import MapFilter, filter, proc_filter, writeiter +loop = asyncio.get_event_loop() + +BYTES_SENT_END_RE = re.compile(br'/[.!?]\$$') +SENT_END_RE = re.compile(r'/[.!?]\$$') +SELECT_RE = re.compile(br'') + + @proc_filter def lt_proc(morphology_fn, dictcase=False): cmd = ['lt-proc', morphology_fn] @@ -11,7 +25,10 @@ @filter(output_separator='\n') def extract_words(line): + if line: return line.split('^')[1].split('/')[0] + else: + return '' @filter(output_separator='\n') @@ -19,6 +36,15 @@ return '/'.join(line.split('/')[0:2]).strip().rstrip('$') + '$' +@filter(iter_filter=True) +def add_sentence_newlines(iter): + for line in iter: + if BYTES_SENT_END_RE.search(line): + return line + else: + return line + b'\n' + + def extract_src(morphology_fn, input, output=None): ref_words_iter = extract_words(input=input) return lt_proc(morphology_fn, input=ref_words_iter, output=output) @@ -45,7 +71,8 @@ @proc_filter def tagger_train_sup(model_type, model_fn, train_fn, - trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0): + trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0, + cg_aug=0, cgtrain_fn=None): cmd = ['apertium-tagger', '--supervised={}'.format(iterations), model_fn] if (not all((trainsrc_fn, dic_fn, tsx_fn)) and not model_type.startswith('unigram')): @@ -58,6 +85,9 @@ cmd[2:2] = [dic_fn, trainsrc_fn, tsx_fn] cmd.append(train_fn) cmd.append(trainsrc_fn) + if cg_aug: + cmd.insert(2, '--cg-augmented={}'.format(cg_aug)) + cmd.insert(5, cgtrain_fn) insert_model(cmd, model_type) return cmd @@ -64,19 +94,176 @@ @proc_filter def tagger_train_unsup(model_type, model_fn, trainsrc_fn, dic_fn, tsx_fn, - iterations=0): + iterations=0, cg_aug=0, ambg_classes=10, + cgtrain_fn=None): if model_fn.startswith('unigram'): raise ValueError("No unsupervised training for unigram models") - cmd = ['apertium-tagger', '--train={}'.format(iterations), - dic_fn, trainsrc_fn, tsx_fn, model_fn] - insert_model(cmd, model_type) + cmd = ['apertium-tagger', '--train={}'.format(iterations)] + if model_type == 'lwsw': + cmd.append('--sliding-window') + if cg_aug: + cmd.append('--cg-augmented={}'.format(cg_aug)) + if cg_aug == 4: + cmd.append(str(ambg_classes)) + cmd.append(dic_fn) + if cg_aug != 1 and cg_aug != 4: + cmd.append(trainsrc_fn) + if cg_aug != 0: + cmd.append(cgtrain_fn) + cmd.extend([tsx_fn, model_fn]) return cmd @proc_filter -def tagger_tag(model_type, model_fn, debug=False): +def tagger_tag(model_type, model_fn, cg_fn=None, cg_aug=None, debug=False): cmd = ['apertium-tagger', '--tagger', '--show-superficial', model_fn] + if cg_aug: + cmd.append('--cg-augmented={}'.format(cg_aug)) + if cg_fn: + cmd.append(cg_fn) if debug: cmd.insert(1, '--debug') insert_model(cmd, model_type, tagging=True) return cmd + + +@proc_filter +def cleanstream(): + return ['apertium-cleanstream', '-n'] + + +async def cg_conv_clean(input, output): + cleanstream_inpipe, cg_conv_outpipe = os.pipe() + await create_subprocess_exec( + 'cg-conv', '--in-cg', '--out-apertium', "--ltr", + stdin=open(input, 'r'), stdout=cg_conv_outpipe) + os.close(cg_conv_outpipe) + cleanstream_proc = await create_subprocess_exec( + 'apertium-cleanstream', '-n', + stdin=cleanstream_inpipe, stdout=PIPE) + + output_f = open(output, 'wb') + await cleanstream_proc.stdout.readline() + async for line in cleanstream_proc.stdout: + line = SELECT_RE.sub(b'', line) + output_f.write(line) + if BYTES_SENT_END_RE.search(line): + output_f.write(b'\n') + + +def run_cg_conv_clean(input, output): + return loop.run_until_complete( + cg_conv_clean(input, output)) + + +@filter +def strip_blanks(line): + if line != '\n': + return line + + +filter_dix = functools.partial( + MapFilter, + pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line, + tran=lambda line: line.split(b":")[0] + b"\n") + + +async def fix_dix(morphology_fn, tsx_fn, dix_fn, output_fn): + pipes = [] + + expand_proc = await create_subprocess_exec('lt-expand', + dix_fn, stdout=PIPE) + filtered = filter_dix(expand_proc.stdout) + + extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿", + "¡", "“", "”", "«", "»", + ] + for i, extra in enumerate(extras): + extras[i] = (extra + "\n").encode('utf-8') + with_extras = aitertools.chain(filtered, extras) + + lt_inpipe, destxt_outpipe = os.pipe() + destxt = await create_subprocess_exec('apertium-destxt', + stdin=PIPE, stdout=destxt_outpipe) + os.close(destxt_outpipe) + + pipes.append(writeiter(with_extras, destxt)) + + if tsx_fn is not None: + filter_ambg_inpipe, lt_outpipe = os.pipe() + lt_proc = await create_subprocess_exec( + 'lt-proc', morphology_fn, stdin=lt_inpipe, stdout=lt_outpipe) + os.close(lt_outpipe) + + filter_ambg = await create_subprocess_exec( + 'apertium-filter-ambiguity', tsx_fn, + stdin=filter_ambg_inpipe, stdout=open(output_fn, 'wb')) + pipes.append(filter_ambg.wait()) + else: + lt_proc = await create_subprocess_exec( + 'lt-proc', morphology_fn, + stdin=lt_inpipe, stdout=open(output_fn, 'wb')) + pipes.append(lt_proc.wait()) + + return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes)) + + +filter_dix = functools.partial( + MapFilter, + pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line, + tran=lambda line: line.split(b":")[0] + b"\n") + + +@filter(iter_filter=True) +def strip_unknown_sent(gen, invalidate_func=None): + buff = [] + valid_sent = True + for line in gen: + if line.strip().strip('¶') == '': + if valid_sent: + for line in buff: + yield line + yield '\n' + buff = [] + valid_sent = True + else: + buff.append(line) + if '/*' in line: + valid_sent = False + if invalidate_func is not None and invalidate_func(line): + valid_sent = False + + +def split_n_r(corpus_fn, train_fn, ref_fn, n, r): + sentences = 0 + with open(corpus_fn) as corpus_file: + for line in corpus_file.readlines(): + if line.strip() == '': + sentences += 1 + + split_left = int(float(sentences) * r / n) + split_right = int(float(sentences) * (r + 1) / n) + + index = 0 + + corpus_file.seek(0) + + with open(train_fn, 'w') as train_file, open(ref_fn, 'w') as ref_file: + for line in corpus_file.readlines(): + if line.strip() == '': + index += 1 + elif split_left <= index < split_right: + ref_file.write(line) + else: + train_file.write(line) + + +def copy_blanks(blanks_fn, input_fn, output_fn): + blanks = open(blanks_fn) + input = open(input_fn) + output = open(output_fn, 'w') + for line in blanks: + if line.strip() == '': + output.write('\n') + else: + output.write(input.readline())