Index: branches/apertium-tagger/experiments/experiments.py =================================================================== --- branches/apertium-tagger/experiments/experiments.py (nonexistent) +++ branches/apertium-tagger/experiments/experiments.py (revision 69428) @@ -0,0 +1,174 @@ +import functools +from os.path import join as pjoin +from statistics import mean, stdev + +from evaluate_tagger import TaggerEvaluator +from shell_wrappers import (cg_proc, extract_first_analysis, tagger_tag, + tagger_train_sup, tagger_train_unsup) + +# Experiment db +experiments = {} + + +def reg_experiment(name): + def reg(func): + experiments[name] = func + return func + return reg + + +def needs_tsx(func): + func.needs_tsx = True + return func + + +def non_default(func): + func.non_default = True + return func + + +# Statistical helpers +def aggregates(data): + return (min(data), max(data), + mean(data), stdev(data)) + + +def xval_experiment(name): + def dec(func=None): + @functools.wraps(func) + def wrapper(lab, *args, **kwargs): + recall = [] + recall_available = [] + for i, xval_fns in enumerate(lab.xval_fns): + xval_fns['test'] = xval_fns['prefix'] + 'test.' + name + xval_fns['model'] = xval_fns['prefix'] + 'model.' 
+ name + func(lab, xval_fns) + evaluator = TaggerEvaluator( + xval_fns['src'], xval_fns['ref'], xval_fns['test']) + evaluator.run_analysis() + recall.append(evaluator.recall) + recall_available.append(evaluator.recall_available) + return (aggregates(recall), aggregates(recall_available)) + return wrapper + return dec + + +def get_single_analysis(lab, test_fn): + evaluator = TaggerEvaluator(lab.src_fn, lab.ref_fn, test_fn) + evaluator.run_analysis() + return (evaluator.recall, evaluator.recall_available) + +# Experiments +for do_cg in [False, True]: + for unigram_type in range(1, 4): + unigram_model = 'unigram' + str(unigram_type) + name = ('cg' if do_cg else '') + unigram_model + + @reg_experiment(name) + @xval_experiment(name) + def unigram_experiment(lab, xval_fns, + do_cg=do_cg, + unigram_model=unigram_model): + tagger_train_sup(unigram_model, + xval_fns['model'], xval_fns['train']) + if do_cg: + tagger_input = cg_proc(lab.cg_fn, + input=xval_fns['src']) + else: + tagger_input = xval_fns['src'] + tagger_tag( + unigram_model, xval_fns['model'], + input=tagger_input, output=xval_fns['test']).check_returncode() + +for do_cg in [False, True]: + name = ('cg' if do_cg else '') + '1st' + + @reg_experiment(name) + def pick_first_experiment(lab, do_cg=do_cg, name=name): + first_fn = pjoin(lab.work_dir, 'test.' 
+ name) + if do_cg: + tagger_input = cg_proc(lab.cg_fn, input=lab.src_fn) + else: + tagger_input = lab.src_fn + extract_first_analysis(tagger_input, first_fn) + return get_single_analysis(lab, first_fn) + +for do_cg in [False, True]: + for is_supervised, model in [(True, 'bigram'), + (False, 'bigram'), + (False, 'lwsw')]: + if is_supervised: + iterations_list = [0] + else: + iterations_list = [0, 50, 250] + for iterations in iterations_list: + name = ( + "{cg}{sup}_{model}".format( + cg='cg_' if do_cg else '', + sup='sup' if is_supervised else 'unsup', + model=model + ) + + ("_i{iterations}".format(iterations=iterations) + if len(iterations_list) > 1 else "")) + + @reg_experiment(name) + @xval_experiment(name) + @needs_tsx + def model_experiment(lab, xval_fns, do_cg=do_cg, + is_supervised=is_supervised, model=model, + iterations=iterations): + if is_supervised: + tagger_train_sup( + model, xval_fns['model'], + train_fn=xval_fns['train'], + trainsrc_fn=xval_fns['trainsrc'], + dic_fn=lab.dic_fn, + tsx_fn=lab.tsx_fn, + iterations=iterations) + else: + tagger_train_unsup( + model, xval_fns['model'], + trainsrc_fn=xval_fns['trainsrc'], + dic_fn=lab.dic_fn, + tsx_fn=lab.tsx_fn, + iterations=iterations) + if do_cg: + tagger_input = cg_proc(lab.cg_fn, + input=xval_fns['src']) + else: + tagger_input = xval_fns['src'] + tagger_tag( + model, xval_fns['model'], input=tagger_input, + output=xval_fns['test']).check_returncode() + + +@reg_experiment('word_count') +def word_count(lab): + count = 0 + for line in open(lab.src_fn): + if line.strip() != '': + count += 1 + return count + + +@reg_experiment('new_cg_ambg') +@needs_tsx +@non_default +def new_cg_ambg(lab): + model_fn = pjoin(lab.work_dir, 'new_cg_ambg.model') + tagger_train_sup('bigram', model_fn, + train_fn=lab.ref_fn, + trainsrc_fn=lab.src_fn, + dic_fn=lab.dic_fn, + tsx_fn=lab.tsx_fn, + iterations=0) + print('trained') + tagger_input = cg_proc(lab.cg_fn, input=lab.src_fn) + tag_completed_proc = tagger_tag( + 'bigram', 
model_fn, input=tagger_input, output='/dev/null') + tag_completed_proc.check_returncode() + count = 0 + for line in tag_completed_proc.stderr.split("\n"): + if "Error: A new ambiguity class was found." in line.strip(): + count += 1 + return count Index: branches/apertium-tagger/experiments/run_experiment.py =================================================================== --- branches/apertium-tagger/experiments/run_experiment.py (revision 69427) +++ branches/apertium-tagger/experiments/run_experiment.py (revision 69428) @@ -1,6 +1,5 @@ -from statistics import mean, stdev from os import mkdir -from os.path import isdir, join as pjoin, exists as pexists +from os.path import isdir, exists as pexists, join as pjoin from subprocess import PIPE import functools import itertools @@ -14,11 +13,10 @@ from pprint import pformat import datetime -from evaluate_tagger import TaggerEvaluator +from shell_utils import cd, filter, check_run, writeiter, MapFilter +from shell_wrappers import extract_src +from experiments import experiments -from shell_utils import ( - cd, filter, proc_filter, check_run, writeiter, MapFilter) - loop = asyncio.get_event_loop() TMPDIR = 'experimenttmp' @@ -115,81 +113,12 @@ return line -@filter(output_separator='\n') -def extract_words(line): - return line.split('^')[1].split('/')[0] - - -@filter(output_separator='\n') -def extract_first_analysis(line): - return '/'.join(line.split('/')[0:2]).strip().rstrip('$') + '$' - - -@filter -def passthrough(line): - return line - - -@proc_filter -def lt_proc(morphology_fn, dictcase=False): - cmd = ['lt-proc', morphology_fn] - if dictcase: - cmd.insert(1, '--dictionary-case') - return cmd - - -def insert_model(cmd, model, tagging=False): - if model == 'bigram': - pass - elif model.startswith('unigram'): - cmd.insert(2, '-u') - cmd.insert(3, model[7:]) - elif model == 'lwsw': - cmd.insert(2, '--sliding-window') - return cmd - - -@proc_filter -def tagger_train_sup(model_type, model_fn, train_fn, - 
trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0): - cmd = ['apertium-tagger', '--supervised={}'.format(iterations), model_fn] - if (not all((trainsrc_fn, dic_fn, tsx_fn)) and - not model_type.startswith('unigram')): - raise ValueError("Optional arguments required for non-unigram models") - if model_type == 'lwsw': - raise ValueError("No supervised training for lwsw model") - if model_type.startswith('unigram'): - cmd.append(train_fn) - else: - cmd[2:2] = [dic_fn, trainsrc_fn, tsx_fn] - cmd.append(train_fn) - cmd.append(trainsrc_fn) - insert_model(cmd, model_type) - return cmd - - -@proc_filter -def tagger_train_unsup(model_type, model_fn, trainsrc_fn, dic_fn, tsx_fn, - iterations=0): - if model_fn.startswith('unigram'): - raise ValueError("No unsupervised training for unigram models") - cmd = ['apertium-tagger', '--train={}'.format(iterations), - dic_fn, trainsrc_fn, tsx_fn, model_fn] - insert_model(cmd, model_type) - return cmd - - -@proc_filter -def tagger_tag(model_type, model_fn): - cmd = ['apertium-tagger', '--tagger', '--show-superficial', model_fn] - insert_model(cmd, model_type, tagging=True) - return cmd - filter_dix = functools.partial( MapFilter, pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line, tran=lambda line: line.split(b":")[0] + b"\n") + async def fix_dix(morphology_fn, tsx_fn, dix_fn, output_fn): pipes = [] @@ -269,14 +198,6 @@ } -@proc_filter -def cg_proc(grammar_fn, dictcase=True): - cmd = ['cg-proc', grammar_fn] - if dictcase: - cmd.insert(1, '-w') - return cmd - - #AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>') #PRPERS_TAG = re.compile(r'/\+') @@ -291,11 +212,13 @@ #} def invalidate_por(line): - # All these are the wrong way around! 
- return ('/$' in line \ - or line.startswith('^beta-fenetilamina/') # gets analysed as two words - or line.startswith('^km²/') # ^2 ends up outside analysis - or ('./' in line and '' not in line)) # ends up as an multiword + return ('/$' in line or + # gets analysed as two words + line.startswith('^beta-fenetilamina/') or + # ^2 ends up outside analysis + line.startswith('^km²/') or + # ends up as an multiword + ('./' in line and '' not in line)) LANGUAGE_INVALIDATOR_MAP = { 'por': invalidate_por @@ -351,41 +274,9 @@ self.fn = fn -def aggregates(data): - return (min(accuracies), max(accuracies), - mean(accuracies), stdev(accuracies)) - -def xval_experiment(name): - def dec(func=None): - @functools.wraps(func) - def wrapper(self, *args, **kwargs): - recall = [] - recall_available = [] - for i, xval_fns in enumerate(self.xval_fns): - xval_fns['test'] = xval_fns['prefix'] + 'test.' + name - xval_fns['model'] = xval_fns['prefix'] + 'model.' + name - func(self, xval_fns) - evaluator = TaggerEvaluator( - xval_fns['src'], xval_fns['ref'], xval_fns['test']) - evaluator.run_analysis() - recall.append(evaluator.recall) - recall_available.append(evaluator.recall_available) - return (aggregates(recall), aggregates(recall_available)) - return wrapper - return dec - - -def extract_src(morphology_fn, input, output=None): - ref_words_iter = extract_words(input=input) - return lt_proc(morphology_fn, input=ref_words_iter, output=output) - - -class LanguageTaggerExperimentor: - EVAL_PREFIX = 'experiment_' - - experiments = {} - - def __init__(self, lang, lang_root, texts, folds, reuse=False, reuse_dic=False): +class LanguageTaggerLab: + def __init__(self, lang, lang_root, texts, folds, + reuse=False, reuse_dic=False): self.lang = lang self.work_dir = pjoin(TMPDIR, lang) @@ -455,7 +346,9 @@ for i, (preprocesser_name, fn) in enumerate(self.text_fns): if preprocesser_name: preprocesser = PREPROCESSER_MAP.get(preprocesser_name) - cleaned_fn = pjoin(self.work_dir, 
'cleaned.{}.{}.txt'.format(i, preprocesser_name)) + cleaned_fn = pjoin( + self.work_dir, + 'cleaned.{}.{}.txt'.format(i, preprocesser_name)) preprocesser(input=fn, output=cleaned_fn) preprocessed_texts.append(cleaned_fn) else: @@ -465,7 +358,8 @@ for fn in preprocessed_texts)) strip_unknown_sent( joined, self.joined_fn, - invalidate_func=LANGUAGE_INVALIDATOR_MAP.get(self.lang, lambda x: False)) + invalidate_func=LANGUAGE_INVALIDATOR_MAP.get( + self.lang, lambda x: False)) strip_blanks(self.joined_fn, self.ref_fn) extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn) if not reuse_dic: @@ -481,126 +375,17 @@ extract_src(self.morphology_fn, input=xval_fn['train'], output=xval_fn['trainsrc']) - def _analyse(self, test_fn): - evaluator = TaggerEvaluator(self.src_fn, self.ref_fn, test_fn) - evaluator.run_analysis() - return (evaluator.recall, evaluator.recall_available) - - @classmethod - def reg_experiment(cls, name): - def reg(func): - LanguageTaggerExperimentor.experiments[name] = func - return func - return reg - - @classmethod - def needs_tsx(cls, func): - func.needs_tsx = True - return func - - @classmethod - def add_experiments(cls): - for do_cg in [False, True]: - for unigram_type in range(1, 4): - unigram_model = 'unigram' + str(unigram_type) - name = ('cg' if do_cg else '') + unigram_model - - @cls.reg_experiment(name) - @xval_experiment(name) - def unigram_experiment(self, xval_fns, - do_cg=do_cg, - unigram_model=unigram_model): - tagger_train_sup(unigram_model, - xval_fns['model'], xval_fns['train']) - if do_cg: - tagger_input = cg_proc(self.cg_fn, - input=xval_fns['src']) - else: - tagger_input = xval_fns['src'] - tagger_tag(unigram_model, xval_fns['model'], - input=tagger_input, output=xval_fns['test']).check_returncode() - - for do_cg in [False, True]: - name = ('cg' if do_cg else '') + '1st' - - @cls.reg_experiment(name) - def pick_first_experiment(self, do_cg=do_cg): - first_fn = pjoin(self.work_dir, 'test.' 
+ name) - if do_cg: - tagger_input = cg_proc(self.cg_fn, input=self.src_fn) - else: - tagger_input = self.src_fn - extract_first_analysis(tagger_input, first_fn) - return self._analyse(first_fn) - - for do_cg in [False, True]: - for is_supervised, model in [(True, 'bigram'), (False, 'bigram'), (False, 'lwsw')]: - if is_supervised: - iterations_list = [0] - else: - iterations_list = [0, 50, 250] - for iterations in iterations_list: - name = ( - "{cg}{sup}_{model}".format( - cg='cg_' if do_cg else '', - sup='sup' if is_supervised else 'unsup', - model=model - ) + - ("_i{iterations}".format(iterations=iterations) - if len(iterations_list) > 1 else "")) - - @cls.reg_experiment(name) - @xval_experiment(name) - @cls.needs_tsx - def model_experiment(self, xval_fns, do_cg=do_cg, is_supervised=is_supervised, model=model, iterations=iterations): - if is_supervised: - tagger_train_sup( - model, xval_fns['model'], - train_fn=xval_fns['train'], - trainsrc_fn=xval_fns['trainsrc'], - dic_fn=self.dic_fn, - tsx_fn=self.tsx_fn, - iterations=iterations) - else: - tagger_train_unsup( - model, xval_fns['model'], - trainsrc_fn=xval_fns['trainsrc'], - dic_fn=self.dic_fn, - tsx_fn=self.tsx_fn, - iterations=iterations) - if do_cg: - tagger_input = cg_proc(self.cg_fn, - input=xval_fns['src']) - else: - tagger_input = xval_fns['src'] - tagger_tag(model, xval_fns['model'], - input=tagger_input, output=xval_fns['test']).check_returncode() - - @cls.reg_experiment('word_count') - def word_count(self): - count = 0 - for line in open(self.src_fn): - if line.strip() != '': - count += 1 - return count - - @classmethod - def all_taggers(cls): - return cls.experiments.keys() - def get_tagger(self, tagger): - tagger_func = self.experiments[tagger] + tagger_func = experiments[tagger] if self.tsx_fn is None and getattr(tagger_func, 'needs_tsx', False): return None - return functools.partial(self.experiments[tagger], self) + return functools.partial(experiments[tagger], self) 
-LanguageTaggerExperimentor.add_experiments() +DEFAULT_TAGGERS = [k for k, v in experiments.items() + if not getattr(v, 'non_default', False)] -DEFAULT_TAGGERS = list(LanguageTaggerExperimentor.all_taggers()) - - def main(): args = parse_args() if not isdir(TMPDIR): @@ -615,7 +400,7 @@ lang_root = pjoin(args.languagesdir, 'apertium-' + lang) def mk_experimentor(): - return LanguageTaggerExperimentor(lang, lang_root, taggers, + return LanguageTaggerLab(lang, lang_root, taggers, args.folds, reuse=args.reuse, reuse_dic=args.reuse_dic) try: @@ -646,7 +431,8 @@ if args.notify: notify2.init("Tagger experiment finished") - notice = notify2.Notification(' '.join(sys.argv), "Tagger experiment finished") + notice = notify2.Notification( + ' '.join(sys.argv), "Tagger experiment finished") notice.show() Index: branches/apertium-tagger/experiments/shell_utils.py =================================================================== --- branches/apertium-tagger/experiments/shell_utils.py (revision 69427) +++ branches/apertium-tagger/experiments/shell_utils.py (revision 69428) @@ -150,15 +150,13 @@ kwargs['stdin'] = subprocess.PIPE if output: kwargs['stdout'] = open(output, 'w') + kwargs['stderr'] = subprocess.PIPE else: kwargs['stdout'] = subprocess.PIPE print("RUNNING: ", ' '.join(cmd)) proc = subprocess.Popen(cmd, universal_newlines=True, **kwargs) - print(type(input)) if not isinstance(input, str) and input is not None: for line in input: - if 'apertium-destxt' in cmd: - print('des', line) proc.stdin.write(line) if not output: if output_chunker: Index: branches/apertium-tagger/experiments/shell_wrappers.py =================================================================== --- branches/apertium-tagger/experiments/shell_wrappers.py (nonexistent) +++ branches/apertium-tagger/experiments/shell_wrappers.py (revision 69428) @@ -0,0 +1,80 @@ +from shell_utils import filter, proc_filter + + +@proc_filter +def lt_proc(morphology_fn, dictcase=False): + cmd = ['lt-proc', morphology_fn] + 
if dictcase: + cmd.insert(1, '--dictionary-case') + return cmd + + +@filter(output_separator='\n') +def extract_words(line): + return line.split('^')[1].split('/')[0] + + +@filter(output_separator='\n') +def extract_first_analysis(line): + return '/'.join(line.split('/')[0:2]).strip().rstrip('$') + '$' + + +def extract_src(morphology_fn, input, output=None): + ref_words_iter = extract_words(input=input) + return lt_proc(morphology_fn, input=ref_words_iter, output=output) + + +def insert_model(cmd, model, tagging=False): + if model == 'bigram': + pass + elif model.startswith('unigram'): + cmd.insert(2, '-u') + cmd.insert(3, model[7:]) + elif model == 'lwsw': + cmd.insert(2, '--sliding-window') + return cmd + + +@proc_filter +def cg_proc(grammar_fn, dictcase=True): + cmd = ['cg-proc', grammar_fn] + if dictcase: + cmd.insert(1, '-w') + return cmd + + +@proc_filter +def tagger_train_sup(model_type, model_fn, train_fn, + trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0): + cmd = ['apertium-tagger', '--supervised={}'.format(iterations), model_fn] + if (not all((trainsrc_fn, dic_fn, tsx_fn)) and + not model_type.startswith('unigram')): + raise ValueError("Optional arguments required for non-unigram models") + if model_type == 'lwsw': + raise ValueError("No supervised training for lwsw model") + if model_type.startswith('unigram'): + cmd.append(train_fn) + else: + cmd[2:2] = [dic_fn, trainsrc_fn, tsx_fn] + cmd.append(train_fn) + cmd.append(trainsrc_fn) + insert_model(cmd, model_type) + return cmd + + +@proc_filter +def tagger_train_unsup(model_type, model_fn, trainsrc_fn, dic_fn, tsx_fn, + iterations=0): + if model_type.startswith('unigram'): + raise ValueError("No unsupervised training for unigram models") + cmd = ['apertium-tagger', '--train={}'.format(iterations), + dic_fn, trainsrc_fn, tsx_fn, model_fn] + insert_model(cmd, model_type) + return cmd + + +@proc_filter +def tagger_tag(model_type, model_fn): + cmd = ['apertium-tagger', '--tagger', 
'--show-superficial', model_fn] + insert_model(cmd, model_type, tagging=True) + return cmd