Index: branches/apertium-tagger/experiments/evaluate_tagger.py =================================================================== --- branches/apertium-tagger/experiments/evaluate_tagger.py (revision 71356) +++ branches/apertium-tagger/experiments/evaluate_tagger.py (revision 71357) @@ -219,7 +219,10 @@ ref_msd = '' if tst_w.count('/*') < 1 and tst_w[0] == '^': # { + print('tst_w', tst_w) tst_readings, tst_removed = readings(tst_w, testFunc) + print('tst_readings', tst_readings) + print('tst_removed', tst_removed) tst_lema = reading_lemma(tst_readings[0]) tst_pos = reading_pos(tst_readings[0]) tst_func = reading_func(tst_readings[0]) Index: branches/apertium-tagger/experiments/experiments.py =================================================================== --- branches/apertium-tagger/experiments/experiments.py (revision 71356) +++ branches/apertium-tagger/experiments/experiments.py (revision 71357) @@ -4,7 +4,8 @@ from evaluate_tagger import TaggerEvaluator from shell_wrappers import (cg_proc, extract_first_analysis, tagger_tag, - tagger_train_sup, tagger_train_unsup) + tagger_train_sup, tagger_train_unsup, + tagger_train_percep) # Experiment registry experiments = {} @@ -215,3 +216,30 @@ count += 1 ambg_classes.add(line) return count, len(ambg_classes) + + +for do_cg in [False, True]: + for mtx_basename in ['kaztags', 'unigram_model3', 'spacyflattags']: + mtx_fn = 'mtx/' + mtx_basename + '.mtx' + name = ('cg_' if do_cg else '') + mtx_basename + '_percep' + + @reg + @group('percep') + @xval_experiment + @exp_name(name) + def percep_experiment(lab, xval_fns, + do_cg=do_cg, mtx_fn=mtx_fn): + tagger_train_percep(xval_fns['model'], + train_fn=xval_fns['train'], + trainsrc_fn=xval_fns['trainsrc'], + mtx_fn=mtx_fn, + sent_seg=lab.sent_seg) + if do_cg: + tagger_input = xval_fns['cgtag'] + else: + tagger_input = xval_fns['src'] + print('tagger_input', tagger_input, 'model', xval_fns['model']) + tagger_tag( + 'percep', xval_fns['model'], input=tagger_input, + output=xval_fns['test'] + ).check_returncode() Index: branches/apertium-tagger/experiments/mtx/allflattagswrdbigram.mtx =================================================================== --- branches/apertium-tagger/experiments/mtx/allflattagswrdbigram.mtx (revision 71356) +++ branches/apertium-tagger/experiments/mtx/allflattagswrdbigram.mtx (revision 71357) @@ -1,5 +1,7 @@ + - + + @@ -17,6 +19,7 @@ + @@ -26,6 +29,7 @@ + @@ -32,9 +36,12 @@ + + - + + Index: branches/apertium-tagger/experiments/mtx/commondefns.mtx =================================================================== --- branches/apertium-tagger/experiments/mtx/commondefns.mtx (revision 71356) +++ branches/apertium-tagger/experiments/mtx/commondefns.mtx (revision 71357) @@ -2,11 +2,14 @@ - + + + + - + @@ -17,13 +20,13 @@ - + - + @@ -34,15 +37,104 @@ - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/experiments/mtx/kaztags.mtx =================================================================== --- branches/apertium-tagger/experiments/mtx/kaztags.mtx (nonexistent) +++ branches/apertium-tagger/experiments/mtx/kaztags.mtx (revision 71357) @@ -0,0 +1,192 @@ + + +]> + + + + &commondefns; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/experiments/mtx/morphodita.mtx =================================================================== --- branches/apertium-tagger/experiments/mtx/morphodita.mtx (revision 71356) +++ branches/apertium-tagger/experiments/mtx/morphodita.mtx (revision 71357) @@ -1,3 +1,7 @@ + + +]> - - -]> - + + @@ -64,9 +65,9 @@ - + &commondefns; - + @@ -102,20 +103,20 @@ - - + + - - + + - + @@ -122,7 +123,7 @@ - + @@ -129,7 +130,7 @@ - + @@ -137,7 +138,7 @@ - + @@ -145,7 +146,7 @@ - + @@ -152,7 +153,7 @@ - + @@ -159,7 +160,7 @@ - + @@ -167,7 +168,7 @@ - + @@ -174,7 +175,7 @@ - + @@ -181,7 +182,7 @@ - + @@ -189,7 +190,7 @@ - + @@ -196,7 +197,7 @@ - + @@ -204,7 +205,7 @@ - + @@ -211,7 +212,7 @@ - + @@ -218,7 +219,7 @@ - + @@ -225,7 +226,7 @@ - + @@ -232,7 +233,7 @@ - + @@ -239,7 +240,7 @@ - + @@ -246,7 +247,7 @@ - + @@ -253,7 +254,7 @@ - + @@ -260,7 +261,7 @@ - + @@ -267,7 +268,7 @@ - + @@ -274,7 +275,7 @@ - + @@ -281,7 +282,7 @@ - + @@ -288,7 +289,7 @@ - + @@ -295,7 +296,7 @@ - + @@ -302,7 +303,7 @@ - + @@ -309,7 +310,7 @@ - + @@ -316,7 +317,7 @@ - + @@ -323,7 +324,7 @@ - + @@ -330,7 +331,7 @@ - + @@ -337,7 +338,7 @@ - + @@ -344,7 +345,7 @@ - + @@ -351,37 +352,50 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/experiments/mtx/proposed.mtx =================================================================== --- branches/apertium-tagger/experiments/mtx/proposed.mtx (revision 71356) +++ branches/apertium-tagger/experiments/mtx/proposed.mtx (revision 71357) @@ -37,6 +37,8 @@ + + @@ -48,3 +50,9 @@ + + + + + + Index: branches/apertium-tagger/experiments/mtx/spacyflattags.mtx =================================================================== --- branches/apertium-tagger/experiments/mtx/spacyflattags.mtx (revision 71356) +++ branches/apertium-tagger/experiments/mtx/spacyflattags.mtx (revision 71357) @@ -1,15 +1,22 @@ + + +]> - - -]> - + + &commondefns; - - + + + + + + + + @@ -20,122 +27,124 @@ - - - + + + - - - - - - + + + + + + - - - - + + + + + + - - - - + + + + + + - - - - - + + + + + + + + + - - - - - - + + + + + + - - - - - - - + + + + + + + + + - - - - - - + + + + + + - - - - - - - - + + + + + + - - - - - - + + + + + + - - - - - - + + + + + + - - - - - - - - + + + + + + - - - - - - - - - - - + + + + + + - + + Index: branches/apertium-tagger/experiments/mtx/unigram_model3.mtx =================================================================== --- branches/apertium-tagger/experiments/mtx/unigram_model3.mtx (revision 71356) +++ branches/apertium-tagger/experiments/mtx/unigram_model3.mtx (revision 71357) @@ -1,20 +1,41 @@ - + + + + + - + + + - + - + - + + + + + + + + + + + + + + Index: branches/apertium-tagger/experiments/run_experiment.py =================================================================== --- branches/apertium-tagger/experiments/run_experiment.py (revision 71356) +++ branches/apertium-tagger/experiments/run_experiment.py (revision 71357) @@ -16,7 +16,7 @@ from shell_utils import cd, check_run from shell_wrappers import (cg_proc, copy_blanks, extract_src, fix_dix, cg_conv_clean, split_n_r, strip_blanks, - strip_unknown_sent) + strip_unknown_sent, strip_cg_comments) loop = asyncio.get_event_loop() @@ -131,6 +131,16 @@ '--notify', help="Produce a desktop notification when done", action='store_true') + parser.add_argument( + '--sent-seg', + help="Segment input sentences on blank lines rather than on " + "tags", + action='store_true') + parser.add_argument( + '--use-cg-src', + help="Get ambiguous stream by uncommenting commented out lines in " + "input CG", + action='store_true') return parser.parse_args() @@ -154,6 +164,7 @@ #'rus': cleanup_rus, #} + def invalidate_por(line): return ('/$' in line or # gets analysed as two words @@ -163,16 +174,20 @@ # ends up as an multiword ('./' in line and '' not in line)) + def invalidate_hbs(line): return line.startswith('+') + def invalidate_kaz(line): # Odd... if '/' not in line: return True left, right = line.split('/', 1) + # '-' in line or return '<' in left or '<' not in right + LANGUAGE_INVALIDATOR_MAP = { 'por': invalidate_por, 'kaz': invalidate_kaz, @@ -187,6 +202,7 @@ class LanguageTaggerLab: def __init__(self, lang, lang_root, texts, folds, + sent_seg=False, use_cg_src=False, reuse=False, reuse_dic=False): self.lang = lang self.work_dir = pjoin(WORK_DIR, lang) @@ -243,6 +259,9 @@ self.validate() + self.sent_seg = sent_seg + self.use_cg_src = use_cg_src + if not reuse: self.do_preprocessing(reuse_dic=reuse_dic) @@ -281,6 +300,31 @@ invalidate_func=LANGUAGE_INVALIDATOR_MAP.get( self.lang, lambda x: False)) strip_blanks(self.joined_fn, self.ref_fn) + if self.use_cg_src: + assert all(pp_name in ['cg', 'cgr'] + for (pp_name, _) in self.text_fns),\ + "Can only get ambiguous input from CG "\ + "if all input files are in CG format" + texts = [] + for i, (preprocessor_name, fn) in enumerate(self.text_fns): + ambg_cg_text = pjoin( + self.work_dir, + 'ambg.{}.{}.txt'.format( + i, preprocessor_name)) + strip_cg_comments(fn, ambg_cg_text) + preprocessor = PREPROCESSOR_MAP.get(preprocessor_name) + cleaned_fn = pjoin( + self.work_dir, + 'cleaned.src.{}.{}.txt'.format(i, preprocessor_name)) + preprocessor(input=ambg_cg_text, output=cleaned_fn) + texts.append(cleaned_fn) + joined = itertools.chain(*(open(fn).readlines() for fn in texts)) + strip_unknown_sent( + joined, self.src_blanks_fn, + invalidate_func=LANGUAGE_INVALIDATOR_MAP.get( + self.lang, lambda x: False)) + strip_blanks(self.src_blanks_fn, self.src_fn) + else: extract_src(self.morphology_fn, input_fn=self.ref_fn, output_fn=self.src_fn) copy_blanks(self.joined_fn, self.src_fn, self.src_blanks_fn) @@ -292,11 +336,12 @@ for i, xval_fn in enumerate(self.xval_fns): split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'], - self.folds, i) + self.folds, i, write_blanks=self.sent_seg) split_n_r(self.src_blanks_fn, xval_fn['trainsrc'], xval_fn['src'], - self.folds, i) + self.folds, i, write_blanks=self.sent_seg) split_n_r(self.cgtag_blanks_fn, xval_fn['traincgtag'], - xval_fn['cgtag'], self.folds, i) + xval_fn['cgtag'], self.folds, i, + write_blanks=self.sent_seg) def can_run_experiment(self, experiment_func): if self.no_tsx and getattr(experiment_func, 'needs_tsx', False): @@ -318,9 +363,10 @@ lang_root = pjoin(args.languagesdir, 'apertium-' + lang) def mk_lab(): - return LanguageTaggerLab(lang, lang_root, taggers, - args.folds, reuse=args.reuse, - reuse_dic=args.reuse_dic) + return LanguageTaggerLab( + lang, lang_root, taggers, args.folds, + sent_seg=args.sent_seg, use_cg_src=args.use_cg_src, + reuse=args.reuse, reuse_dic=args.reuse_dic) try: lab = mk_lab() except MissingLanguageDataException as e: @@ -329,6 +375,7 @@ check_run(['./autogen.sh']) check_run(['make']) lab = mk_lab() + def run_tagger(tagger): experiment = experiments[tagger] if lab.can_run_experiment(experiment): @@ -336,7 +383,8 @@ print("Running {}/{}".format(lang, tagger)) else: try: - languages_tagger_accuracies[lang][tagger] = experiment(lab) + languages_tagger_accuracies[lang][tagger] = \ + experiment(lab) except: languages_tagger_accuracies[lang][tagger] = None traceback.print_exc() Index: branches/apertium-tagger/experiments/shell_wrappers.py =================================================================== --- branches/apertium-tagger/experiments/shell_wrappers.py (revision 71356) +++ branches/apertium-tagger/experiments/shell_wrappers.py (revision 71357) @@ -12,7 +12,7 @@ BYTES_SENT_END_RE = re.compile(br'/[.!?]\$$') SENT_END_RE = re.compile(r'/[.!?]\$$') -SELECT_RE = re.compile(br'') +CG_TAG_RE = re.compile(br'()|()') def run(func): @@ -92,6 +92,8 @@ cmd.insert(3, model[7:]) elif model == 'lwsw': cmd.insert(2, '--sliding-window') + elif model == 'percep': + cmd.insert(2, '--perceptron') return cmd @@ -104,6 +106,23 @@ @proc_filter +def tagger_train_percep(model_fn, train_fn, sent_seg=False, + trainsrc_fn=None, mtx_fn=None, iterations=10): + cmd = [ + 'apertium-tagger', + '--skip-on-error', + '-xs', + str(iterations), + model_fn, + train_fn, + trainsrc_fn, + mtx_fn] + if sent_seg: + cmd.insert(1, '--sent-seg') + return cmd + + +@proc_filter def tagger_train_sup(model_type, model_fn, train_fn, trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0, ambg_classes=10, cg_aug=0, cgtrain_fn=None): @@ -187,7 +206,7 @@ output_f = open(output, 'wb') await cleanstream_proc.stdout.readline() async for line in cleanstream_proc.stdout: - line = SELECT_RE.sub(b'', line) + line = CG_TAG_RE.sub(b'', line) output_f.write(line) if BYTES_SENT_END_RE.search(line): output_f.write(b'\n') @@ -199,6 +218,16 @@ return line +@filter +def strip_cg_comments(line): + # Empty analysis will get stripped from ref + # so must also be stripped from src + if line.startswith(';') and '""' not in line: + return line[1:] + else: + return line + + filter_dix = functools.partial( MapFilter, pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line, @@ -260,6 +289,7 @@ valid_sent = True else: buff.append(line) + # XXX: Perceptron should be trained with unknowns if '/*' in line: valid_sent = False if invalidate_func is not None and invalidate_func(line): @@ -266,7 +296,7 @@ valid_sent = False -def split_n_r(corpus_fn, train_fn, ref_fn, n, r): +def split_n_r(corpus_fn, train_fn, ref_fn, n, r, write_blanks=True): sentences = 0 with open(corpus_fn) as corpus_file: for line in corpus_file.readlines(): @@ -283,6 +313,11 @@ with open(train_fn, 'w') as train_file, open(ref_fn, 'w') as ref_file: for line in corpus_file.readlines(): if line.strip() == '': + if write_blanks: + if split_left <= index < split_right: + ref_file.write('\n') + else: + train_file.write('\n') index += 1 elif split_left <= index < split_right: ref_file.write(line)