Index: branches/apertium-tagger/experiments/experiment_spec.py =================================================================== --- branches/apertium-tagger/experiments/experiment_spec.py (nonexistent) +++ branches/apertium-tagger/experiments/experiment_spec.py (revision 72285) @@ -0,0 +1,53 @@ +import re + +NUM_KV_REGEX = re.compile(r'(\D+)(\d+)') +KV_REGEX = re.compile(r'([^-]+)-([^-]+)') +BOOL_OPTIONS = ['cg', 'sup'] + + +class BadSpecStrException(Exception): + pass + + +def str_to_spec(s): + spec = [], {} + bits = s.split('_') + for option in BOOL_OPTIONS: + spec[1][option] = False + got_kv = False + for bit in bits: + match = NUM_KV_REGEX.match(bit) + if match: + spec[1][match.group(1)] = match.group(2) + got_kv = True + continue + match = KV_REGEX.match(bit) + if match: + spec[1][match.group(1)] = match.group(2) + got_kv = True + continue + if bit in BOOL_OPTIONS: + spec[1][bit] = True + got_kv = True + continue + if got_kv: + BadSpecStrException("Args must preceed kwargs") + spec[0].append(bit) + return spec + + +def spec_to_str(spec): + lspec, kvspec = spec + result_bits = lspec[:] + for k, v in kvspec.items(): + if not v: + continue + if k in BOOL_OPTIONS: + if v: + result_bits.append(k) + continue + if isinstance(v, int): + result_bits.append("{}{}".format(k, v)) + continue + result_bits.append("{}-{}".format(k, v)) + return "_".join(result_bits) Index: branches/apertium-tagger/experiments/experiments.py =================================================================== --- branches/apertium-tagger/experiments/experiments.py (revision 72281) +++ branches/apertium-tagger/experiments/experiments.py (revision 72285) @@ -1,19 +1,16 @@ +import functools from collections import defaultdict -import re -import functools -from os.path import abspath, dirname, join as pjoin +from os.path import join as pjoin +from os.path import abspath, dirname from statistics import mean, stdev from evaluate_tagger import TaggerEvaluator +from experiment_spec import spec_to_str, str_to_spec from shell_wrappers import (cg_proc, extract_first_analysis, tagger_tag, - tagger_train_sup, tagger_train_unsup, - tagger_train_percep) + tagger_train_percep, tagger_train_sup, + tagger_train_unsup) - MODULE_PATH = abspath(dirname(__file__)) -NUM_KV_REGEX = re.compile(r'(\D+)(\d+)') -KV_REGEX = re.compile(r'([^-]+)-([^-]+)') -BOOL_OPTIONS = ['cg', 'sup'] # Experiment registry @@ -40,10 +37,6 @@ return func -class BadSpecStrException(Exception): - pass - - def has_needs_tsx(lab, spec): lspec, kvspec = spec assert(len(lspec) >= 1) @@ -63,50 +56,6 @@ return experiment_func(lab, *lspec, spec=spec, **kvspec) -def str_to_spec(s): - spec = [], {} - bits = s.split('_') - for option in BOOL_OPTIONS: - spec[1][option] = False - got_kv = False - for bit in bits: - match = NUM_KV_REGEX.match(bit) - if match: - spec[1][match.group(1)] = match.group(2) - got_kv = True - continue - match = KV_REGEX.match(bit) - if match: - spec[1][match.group(1)] = match.group(2) - got_kv = True - continue - if bit in BOOL_OPTIONS: - spec[1][bit] = True - got_kv = True - continue - if got_kv: - BadSpecStrException("Args must preceed kwargs") - spec[0].append(bit) - return spec - - -def spec_to_str(spec): - lspec, kvspec = spec - result_bits = lspec[:] - for k, v in kvspec.items(): - if not v: - continue - if k in BOOL_OPTIONS: - if v: - result_bits.append(k) - continue - if isinstance(v, int): - result_bits.append("{}{}".format(k, v)) - continue - result_bits.append("{}-{}".format(k, v)) - return "_".join(result_bits) - - # Statistical helpers def aggregates(data): return (min(data), max(data), Index: branches/apertium-tagger/experiments/run_experiment.py =================================================================== --- branches/apertium-tagger/experiments/run_experiment.py (revision 72281) +++ branches/apertium-tagger/experiments/run_experiment.py (revision 72285) @@ -10,12 +10,12 @@ from os import mkdir from os.path import exists as pexists from os.path import join as pjoin -from os.path import basename, isdir, dirname +from os.path import basename, dirname, isdir from pprint import pformat import asyncio -from experiments import (experiment_groups, has_needs_tsx, run_experiment, - spec_to_str, str_to_spec) +from experiment_spec import spec_to_str, str_to_spec +from experiments import experiment_groups, has_needs_tsx, run_experiment from shell_utils import MissingLanguageDataException, check_run, mk_built_lab from shell_wrappers import (cg_conv_clean, cg_proc, copy_blanks, extract_src, fix_dix, split_n_r, strip_blanks, @@ -70,6 +70,7 @@ DEFAULT_LANGUAGES = ['cat', 'spa', 'hbs', 'rus', 'kaz', 'por', 'swe'] NO_TSX_LANGUAGES = ['rus', 'kaz', 'swe'] CRP_NOT_DIC_LANGUAGES = ['swe'] +CG_SRC_LANGS = ['kaz', 'swe'] def comma_list(s): @@ -120,11 +121,6 @@ "tags", action='store_true') parser.add_argument( - '--use-cg-src', - help="Get ambiguous stream by uncommenting commented out lines in " - "input CG", - action='store_true') - parser.add_argument( '--suppress-errors', help="Suppress errors during processing individual experiments", action='store_true') @@ -217,8 +213,7 @@ class LanguageTaggerLab: def __init__(self, lang, lang_root, texts, folds, - sent_seg=False, use_cg_src=False, - reuse=False, reuse_dic=False): + sent_seg=False, reuse=False, reuse_dic=False): self.lang = lang self.work_dir = pjoin(WORK_DIR, lang) @@ -276,7 +271,7 @@ self.validate() self.sent_seg = sent_seg - self.use_cg_src = use_cg_src + self.use_cg_src = lang in CG_SRC_LANGS for fn in glob.glob(pjoin(dirname(__file__), "mtx/*.mtx")): template = string.Template(open(fn).read()) @@ -380,8 +375,8 @@ def mk_lab(): return LanguageTaggerLab( lang, lang_root, texts, args.folds, - sent_seg=args.sent_seg, use_cg_src=args.use_cg_src, - reuse=args.reuse, reuse_dic=args.reuse_dic) + sent_seg=args.sent_seg, reuse=args.reuse, + reuse_dic=args.reuse_dic) lab = mk_built_lab(mk_lab, lang_root)