Index: branches/apertium-tagger/experiments/experiments.py =================================================================== --- branches/apertium-tagger/experiments/experiments.py (revision 72010) +++ branches/apertium-tagger/experiments/experiments.py (revision 72011) @@ -28,6 +28,13 @@ return inner +def needs_tsx_when(pred): + def inner(func): + func.needs_tsx = pred + return func + return inner + + def needs_tsx(func): func.needs_tsx = True return func @@ -37,11 +44,14 @@ pass -def has_needs_tsx(spec): +def has_needs_tsx(lab, spec): lspec, kvspec = spec assert(len(lspec) >= 1) name, *lspec = lspec - return getattr(experiment_func_map[name], 'needs_tsx', False) + needs_tsx = getattr(experiment_func_map[name], 'needs_tsx', False) + if isinstance(needs_tsx, bool): + return needs_tsx + return needs_tsx(lab, spec) def run_experiment(spec, lab): @@ -49,6 +59,7 @@ assert(len(lspec) >= 1) name, *lspec = lspec experiment_func = experiment_func_map[name] + print("call experiment_func", experiment_func, lab, lspec, spec, kvspec) return experiment_func(lab, *lspec, spec=spec, **kvspec) @@ -293,10 +304,16 @@ experiment_groups['meta'].append((['new_cg_ambg'], {})) +def percep_needs_tsx(lab, spec): + mtx_fn = pjoin(lab.lang_root, spec[1]['mtx'] + '.mtx') + return "= best_mean or r_max >= best_max: + results[cur_mtx] = (r_mean, r_max) + + # Enqueue more + for remove_i in range(delim, orig_num_feats): + trimmed_tree = deepcopy(mtx_tree) + to_remove = trimmed_tree.xpath( + FEAT_XPATH)[remove_i - len(removed_feats)] + to_remove.getparent().remove(to_remove) + new_mtx = "{}X{}".format(cur_mtx, remove_i) + print(mtx_fn(new_mtx)) + trimmed_tree.write(mtx_fn(new_mtx), method='html') + mtx_queue.append(( + new_mtx, removed_feats + (remove_i,), remove_i + 1, + max(r_mean, best_mean), max(r_max, best_max))) + + print_best(results, print=print_both) + + +def main(): + args = parse_args() + if not isdir(WORK_DIR): + mkdir(WORK_DIR) + if args.notify: + import notify2 + + run_search(args) + + if args.notify: + notify2.init("Grid search finished") + notice = notify2.Notification( + ' '.join(sys.argv), "Grid search finished") + notice.show() + + +if __name__ == '__main__': + main() Index: branches/apertium-tagger/experiments/mtx/coarsebigram.mtx =================================================================== --- branches/apertium-tagger/experiments/mtx/coarsebigram.mtx (nonexistent) +++ branches/apertium-tagger/experiments/mtx/coarsebigram.mtx (revision 72011) @@ -0,0 +1,91 @@ + + +]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/experiments/mtx/kaztags.mtx =================================================================== --- branches/apertium-tagger/experiments/mtx/kaztags.mtx (revision 72010) +++ branches/apertium-tagger/experiments/mtx/kaztags.mtx (revision 72011) @@ -9,7 +9,7 @@ &commondefns; - + @@ -17,8 +17,8 @@ - - + + @@ -26,8 +26,8 @@ - - + + @@ -38,8 +38,8 @@ - - + + @@ -53,8 +53,8 @@ - - + + @@ -68,8 +68,8 @@ - - + + @@ -77,8 +77,8 @@ - - + + @@ -86,19 +86,19 @@ - - + + - - + + - + Index: branches/apertium-tagger/experiments/mtx/spacycoarsetags.mtx =================================================================== --- branches/apertium-tagger/experiments/mtx/spacycoarsetags.mtx (nonexistent) +++ branches/apertium-tagger/experiments/mtx/spacycoarsetags.mtx (revision 72011) @@ -0,0 +1,185 @@ + + +]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/experiments/requirements.txt =================================================================== --- branches/apertium-tagger/experiments/requirements.txt (revision 72010) +++ branches/apertium-tagger/experiments/requirements.txt (revision 72011) @@ -1,3 +1,4 @@ aitertools==0.1.0 tabulate==0.7.5 +lxml==3.6.1 -e git+https://github.com/frankier/streamparser.git@setup-py#egg=streamparser Index: branches/apertium-tagger/experiments/run_experiment.py =================================================================== --- branches/apertium-tagger/experiments/run_experiment.py (revision 72010) +++ branches/apertium-tagger/experiments/run_experiment.py (revision 72011) @@ -2,22 +2,24 @@ import argparse import datetime import functools +import glob import itertools +import string import sys import traceback from os import mkdir from os.path import exists as pexists from os.path import join as pjoin -from os.path import isdir +from os.path import basename, isdir from pprint import pformat import asyncio -from experiments import experiment_groups, str_to_spec, spec_to_str,\ - has_needs_tsx, run_experiment +from experiments import (experiment_groups, has_needs_tsx, run_experiment, + spec_to_str, str_to_spec) from shell_utils import cd, check_run -from shell_wrappers import (cg_proc, copy_blanks, extract_src, fix_dix, - cg_conv_clean, split_n_r, strip_blanks, - strip_unknown_sent, strip_cg_comments) +from shell_wrappers import (cg_conv_clean, cg_proc, copy_blanks, extract_src, + fix_dix, split_n_r, strip_blanks, + strip_cg_comments, strip_unknown_sent) loop = asyncio.get_event_loop() @@ -84,40 +86,12 @@ return d -def parse_args(): - tagger_info = [] - for group_name, group_taggers in experiment_groups.items(): - tagger_info.append("{}:".format(group_name)) - for tagger in group_taggers: - tagger_info.append(" {}".format(spec_to_str(tagger))) - tagger_info.append("") - parser = argparse.ArgumentParser( - description="Runs a series of experiments on different part of speech " - "taggers and different language data.\n\nGROUPS:\n\n" + - "\n".join(tagger_info), - formatter_class=argparse.RawDescriptionHelpFormatter) +def add_common_args(parser): parser.add_argument( 'languagesdir', help="Path to the directory containing all the individaul language " "data directories") parser.add_argument( - '--languages', - help="Only run experiments for these languages, comma separated", - default=DEFAULT_LANGUAGES, type=comma_list) - parser.add_argument( - '--taggers', - help="Only run experiments with these taggers, comma separated", - default=[], type=comma_list) - parser.add_argument( - '--tagger-groups', - help="Only run these experiment groups. " - "Available: default, cg-extra, meta.", - default=['default'], type=comma_list) - parser.add_argument( - '--language-texts', - help="Use different texts per language, coma seperated colon pairs", - default=DEFAULT_TEXTS, type=comma_colon_dict) - parser.add_argument( '--folds', help="Use x-fold validation instead of 10-fold", default=10, type=int) @@ -155,6 +129,39 @@ help="Suppress errors during processing individual experiments", action='store_true') + +def parse_args(): + tagger_info = [] + for group_name, group_taggers in experiment_groups.items(): + tagger_info.append("{}:".format(group_name)) + for tagger in group_taggers: + tagger_info.append(" {}".format(spec_to_str(tagger))) + tagger_info.append("") + parser = argparse.ArgumentParser( + description="Runs a series of experiments on different part of speech " + "taggers and different language data.\n\nGROUPS:\n\n" + + "\n".join(tagger_info), + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument( + '--taggers', + help="Only run experiments with these taggers, comma separated", + default=[], type=comma_list) + parser.add_argument( + '--tagger-groups', + help="Only run these experiment groups. " + "Available: default, cg-extra, meta.", + default=['default'], type=comma_list) + parser.add_argument( + '--languages', + help="Only run experiments for these languages, comma separated", + default=DEFAULT_LANGUAGES, type=comma_list) + parser.add_argument( + '--language-texts', + help="Use different texts per language, coma seperated colon pairs", + default=DEFAULT_TEXTS, type=comma_colon_dict) + add_common_args(parser) + return parser.parse_args() @@ -221,6 +228,7 @@ self.work_dir = pjoin(WORK_DIR, lang) pair_name = 'apertium-{0}.{0}'.format(lang) + self.lang_root = lang_root self.morphology_fn = pjoin(lang_root, lang + '.automorf.bin') self.cg_fn = pjoin(lang_root, lang + '.rlx.bin') self.no_tsx = lang in NO_TSX_LANGUAGES @@ -275,6 +283,14 @@ self.sent_seg = sent_seg self.use_cg_src = use_cg_src + for fn in glob.glob("mtx/*.mtx"): + template = string.Template(open(fn).read()) + mtx_dest = pjoin(lang_root, basename(fn)) + mtx_content = template.substitute({ + 'tsx_fn': self.tsx_fn + }) + open(mtx_dest, 'w').write(mtx_content) + if not reuse: self.do_preprocessing(reuse_dic=reuse_dic) @@ -357,18 +373,18 @@ write_blanks=self.sent_seg) def can_run_experiment(self, spec): - if self.no_tsx and has_needs_tsx(spec): + if self.no_tsx and has_needs_tsx(self, spec): return False return True def run_lang(lang, args): - taggers = args.language_texts[lang] + texts = args.language_texts[lang] lang_root = pjoin(args.languagesdir, 'apertium-' + lang) def mk_lab(): return LanguageTaggerLab( - lang, lang_root, taggers, args.folds, + lang, lang_root, texts, args.folds, sent_seg=args.sent_seg, use_cg_src=args.use_cg_src, reuse=args.reuse, reuse_dic=args.reuse_dic) try: