Index: branches/apertium-tagger/experiments/run_experiment.py =================================================================== --- branches/apertium-tagger/experiments/run_experiment.py (revision 70296) +++ branches/apertium-tagger/experiments/run_experiment.py (revision 70298) @@ -4,6 +4,7 @@ import functools import itertools import sys +import traceback from os import mkdir from os.path import exists as pexists from os.path import join as pjoin @@ -11,11 +12,11 @@ from pprint import pformat import asyncio -from experiments import experiments, experiment_groups +from experiments import experiment_groups, experiments from shell_utils import cd, check_run -from shell_wrappers import (cg_proc, extract_src, fix_dix, run_cg_conv_clean, - split_n_r, strip_blanks, strip_unknown_sent, - copy_blanks) +from shell_wrappers import (cg_proc, copy_blanks, extract_src, fix_dix, + run_cg_conv_clean, split_n_r, strip_blanks, + strip_unknown_sent) loop = asyncio.get_event_loop() @@ -54,7 +55,7 @@ 'cg:texts/raio.tagged.txt', ], 'swe': [ - 'cg:texts/tid.tagged.txt' + 'cgr:texts/tid.tagged.txt' ], } TSX_MAP = { @@ -61,7 +62,8 @@ 'hbs': 'apertium-hbs.hbs-coarse.tsx', } DEFAULT_LANGUAGES = ['cat', 'spa', 'hbs', 'rus', 'kaz', 'por', 'swe'] -NO_TSX_LANGUAGES = ['rus', 'kaz'] +NO_TSX_LANGUAGES = ['rus', 'kaz', 'swe'] +CRP_NOT_DIC_LANGUAGES = ['swe'] def comma_list(s): @@ -131,7 +133,8 @@ PREPROCESSER_MAP = { - 'cg': run_cg_conv_clean + 'cg': run_cg_conv_clean, + 'cgr': functools.partial(run_cg_conv_clean, rtl=True), } @@ -188,6 +191,7 @@ self.morphology_fn = pjoin(lang_root, lang + '.automorf.bin') self.cg_fn = pjoin(lang_root, lang + '.rlx.bin') self.no_tsx = lang in NO_TSX_LANGUAGES + self.crp_not_dic = lang in CRP_NOT_DIC_LANGUAGES if self.no_tsx: self.dix_fn = None else: @@ -215,6 +219,9 @@ self.src_blanks_fn = self.src_fn + '.blanks' self.cgtag_fn = pjoin(self.work_dir, 'cgtag') self.cgtag_blanks_fn = self.cgtag_fn + '.blanks' + if self.crp_not_dic: + self.dic_fn = self.src_fn + else: self.dic_fn = pjoin(self.work_dir, 'filtered.dic') self.xval_fns = [] @@ -274,7 +281,7 @@ copy_blanks(self.joined_fn, self.src_fn, self.src_blanks_fn) cg_proc(self.cg_fn, input=self.src_fn, output=self.cgtag_fn) copy_blanks(self.joined_fn, self.cgtag_fn, self.cgtag_blanks_fn) - if not reuse_dic and not self.no_tsx: + if not reuse_dic and not self.crp_not_dic and not self.no_tsx: loop.run_until_complete( fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn, output_fn=self.dic_fn)) @@ -324,7 +331,11 @@ if args.dry: print("Running {}/{}".format(lang, tagger)) else: + try: languages_tagger_accuracies[lang][tagger] = experiment(lab) + except: + languages_tagger_accuracies[lang][tagger] = None + traceback.print_exc() else: print("Skipping {}/{} since it needs a tsx" .format(lang, tagger)) Index: branches/apertium-tagger/experiments/shell_wrappers.py =================================================================== --- branches/apertium-tagger/experiments/shell_wrappers.py (revision 70296) +++ branches/apertium-tagger/experiments/shell_wrappers.py (revision 70298) @@ -72,8 +72,8 @@ @proc_filter def tagger_train_sup(model_type, model_fn, train_fn, trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0, - cg_aug=0, cgtrain_fn=None): - cmd = ['apertium-tagger', '--supervised={}'.format(iterations), model_fn] + ambg_classes=10, cg_aug=0, cgtrain_fn=None): + cmd = ['apertium-tagger', '--supervised={}'.format(iterations)] if (not all((trainsrc_fn, dic_fn, tsx_fn)) and not model_type.startswith('unigram')): raise ValueError("Optional arguments required for non-unigram models") @@ -80,15 +80,22 @@ if model_type == 'lwsw': raise ValueError("No supervised training for lwsw model") if model_type.startswith('unigram'): - cmd.append(train_fn) + cmd.append('-u') + cmd.append(model_type[7:]) + cmd.extend([model_fn, train_fn]) else: - cmd[2:2] = [dic_fn, trainsrc_fn, tsx_fn] - cmd.append(train_fn) + if cg_aug: + cmd.append('--cg-augmented={}'.format(cg_aug)) + if cg_aug == 4: + cmd.append(str(ambg_classes)) + cmd.append(dic_fn) + if cg_aug == 0: cmd.append(trainsrc_fn) - if cg_aug: - cmd.insert(2, '--cg-augmented={}'.format(cg_aug)) - cmd.insert(5, cgtrain_fn) - insert_model(cmd, model_type) + cmd.extend([tsx_fn, model_fn, train_fn]) + if cg_aug != 1 and cg_aug != 4: + cmd.append(trainsrc_fn) + if cg_aug != 0: + cmd.append(cgtrain_fn) return cmd @@ -132,10 +139,10 @@ return ['apertium-cleanstream', '-n'] -async def cg_conv_clean(input, output): +async def cg_conv_clean(input, output, rtl=False): cleanstream_inpipe, cg_conv_outpipe = os.pipe() await create_subprocess_exec( - 'cg-conv', '--in-cg', '--out-apertium', "--ltr", + 'cg-conv', '--in-cg', '--out-apertium', "--rtl" if rtl else "--ltr", stdin=open(input, 'r'), stdout=cg_conv_outpipe) os.close(cg_conv_outpipe) cleanstream_proc = await create_subprocess_exec( @@ -151,9 +158,9 @@ output_f.write(b'\n') -def run_cg_conv_clean(input, output): +def run_cg_conv_clean(input, output, **kwargs): return loop.run_until_complete( - cg_conv_clean(input, output)) + cg_conv_clean(input, output, **kwargs)) @filter Index: branches/apertium-tagger/experiments/experiments.py =================================================================== --- branches/apertium-tagger/experiments/experiments.py (revision 70296) +++ branches/apertium-tagger/experiments/experiments.py (revision 70298) @@ -145,6 +145,7 @@ tsx_fn=lab.tsx_fn, iterations=iterations, cg_aug=cg_aug, + ambg_classes=cg_aug_t, cgtrain_fn=xval_fns['traincgtag']) else: tagger_train_unsup( @@ -154,6 +155,7 @@ tsx_fn=lab.tsx_fn, iterations=iterations, cg_aug=cg_aug, + ambg_classes=cg_aug_t, cgtrain_fn=xval_fns['traincgtag']) if do_cg == 'in': tagger_input = xval_fns['cgtag']