Index: branches/apertium-tagger/experiments/evaluate_tagger.py
===================================================================
--- branches/apertium-tagger/experiments/evaluate_tagger.py (revision 71356)
+++ branches/apertium-tagger/experiments/evaluate_tagger.py (revision 71357)
@@ -219,7 +219,10 @@
ref_msd = ''
if tst_w.count('/*') < 1 and tst_w[0] == '^': # {
+ print('tst_w', tst_w)
tst_readings, tst_removed = readings(tst_w, testFunc)
+ print('tst_readings', tst_readings)
+ print('tst_removed', tst_removed)
tst_lema = reading_lemma(tst_readings[0])
tst_pos = reading_pos(tst_readings[0])
tst_func = reading_func(tst_readings[0])
Index: branches/apertium-tagger/experiments/experiments.py
===================================================================
--- branches/apertium-tagger/experiments/experiments.py (revision 71356)
+++ branches/apertium-tagger/experiments/experiments.py (revision 71357)
@@ -4,7 +4,8 @@
from evaluate_tagger import TaggerEvaluator
from shell_wrappers import (cg_proc, extract_first_analysis, tagger_tag,
- tagger_train_sup, tagger_train_unsup)
+ tagger_train_sup, tagger_train_unsup,
+ tagger_train_percep)
# Experiment registry
experiments = {}
@@ -215,3 +216,30 @@
count += 1
ambg_classes.add(line)
return count, len(ambg_classes)
+
+
+for do_cg in [False, True]:
+ for mtx_basename in ['kaztags', 'unigram_model3', 'spacyflattags']:
+ mtx_fn = 'mtx/' + mtx_basename + '.mtx'
+ name = ('cg_' if do_cg else '') + mtx_basename + '_percep'
+
+ @reg
+ @group('percep')
+ @xval_experiment
+ @exp_name(name)
+ def percep_experiment(lab, xval_fns,
+ do_cg=do_cg, mtx_fn=mtx_fn):
+ tagger_train_percep(xval_fns['model'],
+ train_fn=xval_fns['train'],
+ trainsrc_fn=xval_fns['trainsrc'],
+ mtx_fn=mtx_fn,
+ sent_seg=lab.sent_seg)
+ if do_cg:
+ tagger_input = xval_fns['cgtag']
+ else:
+ tagger_input = xval_fns['src']
+ print('tagger_input', tagger_input, 'model', xval_fns['model'])
+ tagger_tag(
+ 'percep', xval_fns['model'], input=tagger_input,
+ output=xval_fns['test']
+ ).check_returncode()
Index: branches/apertium-tagger/experiments/mtx/allflattagswrdbigram.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/allflattagswrdbigram.mtx (revision 71356)
+++ branches/apertium-tagger/experiments/mtx/allflattagswrdbigram.mtx (revision 71357)
@@ -1,5 +1,7 @@
+
-
+
+
@@ -17,6 +19,7 @@
+
@@ -26,6 +29,7 @@
+
@@ -32,9 +36,12 @@
+
+
-
+
+
Index: branches/apertium-tagger/experiments/mtx/commondefns.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/commondefns.mtx (revision 71356)
+++ branches/apertium-tagger/experiments/mtx/commondefns.mtx (revision 71357)
@@ -2,11 +2,14 @@
-
+
+
+
+
-
+
@@ -17,13 +20,13 @@
-
+
-
+
@@ -34,15 +37,104 @@
-
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/kaztags.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/kaztags.mtx (nonexistent)
+++ branches/apertium-tagger/experiments/mtx/kaztags.mtx (revision 71357)
@@ -0,0 +1,192 @@
+
+
+]>
+
+
+
+ &commondefns;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/morphodita.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/morphodita.mtx (revision 71356)
+++ branches/apertium-tagger/experiments/mtx/morphodita.mtx (revision 71357)
@@ -1,3 +1,7 @@
+
+
+]>
-
-
-]>
-
+
+
@@ -64,9 +65,9 @@
-
+
&commondefns;
-
+
@@ -102,20 +103,20 @@
-
-
+
+
-
-
+
+
-
+
@@ -122,7 +123,7 @@
-
+
@@ -129,7 +130,7 @@
-
+
@@ -137,7 +138,7 @@
-
+
@@ -145,7 +146,7 @@
-
+
@@ -152,7 +153,7 @@
-
+
@@ -159,7 +160,7 @@
-
+
@@ -167,7 +168,7 @@
-
+
@@ -174,7 +175,7 @@
-
+
@@ -181,7 +182,7 @@
-
+
@@ -189,7 +190,7 @@
-
+
@@ -196,7 +197,7 @@
-
+
@@ -204,7 +205,7 @@
-
+
@@ -211,7 +212,7 @@
-
+
@@ -218,7 +219,7 @@
-
+
@@ -225,7 +226,7 @@
-
+
@@ -232,7 +233,7 @@
-
+
@@ -239,7 +240,7 @@
-
+
@@ -246,7 +247,7 @@
-
+
@@ -253,7 +254,7 @@
-
+
@@ -260,7 +261,7 @@
-
+
@@ -267,7 +268,7 @@
-
+
@@ -274,7 +275,7 @@
-
+
@@ -281,7 +282,7 @@
-
+
@@ -288,7 +289,7 @@
-
+
@@ -295,7 +296,7 @@
-
+
@@ -302,7 +303,7 @@
-
+
@@ -309,7 +310,7 @@
-
+
@@ -316,7 +317,7 @@
-
+
@@ -323,7 +324,7 @@
-
+
@@ -330,7 +331,7 @@
-
+
@@ -337,7 +338,7 @@
-
+
@@ -344,7 +345,7 @@
-
+
@@ -351,37 +352,50 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/proposed.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/proposed.mtx (revision 71356)
+++ branches/apertium-tagger/experiments/mtx/proposed.mtx (revision 71357)
@@ -37,6 +37,8 @@
+
+
@@ -48,3 +50,9 @@
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/spacyflattags.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/spacyflattags.mtx (revision 71356)
+++ branches/apertium-tagger/experiments/mtx/spacyflattags.mtx (revision 71357)
@@ -1,15 +1,22 @@
+
+
+]>
-
-
-]>
-
+
+
&commondefns;
-
-
+
+
+
+
+
+
+
+
@@ -20,122 +27,124 @@
-
-
-
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
+
+
+
+
+
+
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
-
+
+
Index: branches/apertium-tagger/experiments/mtx/unigram_model3.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/unigram_model3.mtx (revision 71356)
+++ branches/apertium-tagger/experiments/mtx/unigram_model3.mtx (revision 71357)
@@ -1,20 +1,41 @@
-
+
+
+
+
+
-
+
+
+
-
+
-
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/run_experiment.py
===================================================================
--- branches/apertium-tagger/experiments/run_experiment.py (revision 71356)
+++ branches/apertium-tagger/experiments/run_experiment.py (revision 71357)
@@ -16,7 +16,7 @@
from shell_utils import cd, check_run
from shell_wrappers import (cg_proc, copy_blanks, extract_src, fix_dix,
cg_conv_clean, split_n_r, strip_blanks,
- strip_unknown_sent)
+ strip_unknown_sent, strip_cg_comments)
loop = asyncio.get_event_loop()
@@ -131,6 +131,16 @@
'--notify',
help="Produce a desktop notification when done",
action='store_true')
+ parser.add_argument(
+ '--sent-seg',
+ help="Segment input sentences on blank lines rather than on "
+ "tags",
+ action='store_true')
+ parser.add_argument(
+ '--use-cg-src',
+ help="Get ambiguous stream by uncommenting commented out lines in "
+ "input CG",
+ action='store_true')
return parser.parse_args()
@@ -154,6 +164,7 @@
#'rus': cleanup_rus,
#}
+
def invalidate_por(line):
return ('/$' in line or
# gets analysed as two words
@@ -163,16 +174,20 @@
# ends up as an multiword
('./' in line and '' not in line))
+
def invalidate_hbs(line):
return line.startswith('+')
+
def invalidate_kaz(line):
# Odd...
if '/' not in line:
return True
left, right = line.split('/', 1)
+ # '-' in line or
return '<' in left or '<' not in right
+
LANGUAGE_INVALIDATOR_MAP = {
'por': invalidate_por,
'kaz': invalidate_kaz,
@@ -187,6 +202,7 @@
class LanguageTaggerLab:
def __init__(self, lang, lang_root, texts, folds,
+ sent_seg=False, use_cg_src=False,
reuse=False, reuse_dic=False):
self.lang = lang
self.work_dir = pjoin(WORK_DIR, lang)
@@ -243,6 +259,9 @@
self.validate()
+ self.sent_seg = sent_seg
+ self.use_cg_src = use_cg_src
+
if not reuse:
self.do_preprocessing(reuse_dic=reuse_dic)
@@ -281,6 +300,31 @@
invalidate_func=LANGUAGE_INVALIDATOR_MAP.get(
self.lang, lambda x: False))
strip_blanks(self.joined_fn, self.ref_fn)
+ if self.use_cg_src:
+ assert all(pp_name in ['cg', 'cgr']
+ for (pp_name, _) in self.text_fns),\
+ "Can only get ambiguous input from CG "\
+ "if all input files are in CG format"
+ texts = []
+ for i, (preprocessor_name, fn) in enumerate(self.text_fns):
+ ambg_cg_text = pjoin(
+ self.work_dir,
+ 'ambg.{}.{}.txt'.format(
+ i, preprocessor_name))
+ strip_cg_comments(fn, ambg_cg_text)
+ preprocessor = PREPROCESSOR_MAP.get(preprocessor_name)
+ cleaned_fn = pjoin(
+ self.work_dir,
+ 'cleaned.src.{}.{}.txt'.format(i, preprocessor_name))
+ preprocessor(input=ambg_cg_text, output=cleaned_fn)
+ texts.append(cleaned_fn)
+ joined = itertools.chain(*(open(fn).readlines() for fn in texts))
+ strip_unknown_sent(
+ joined, self.src_blanks_fn,
+ invalidate_func=LANGUAGE_INVALIDATOR_MAP.get(
+ self.lang, lambda x: False))
+ strip_blanks(self.src_blanks_fn, self.src_fn)
+ else:
extract_src(self.morphology_fn,
input_fn=self.ref_fn, output_fn=self.src_fn)
copy_blanks(self.joined_fn, self.src_fn, self.src_blanks_fn)
@@ -292,11 +336,12 @@
for i, xval_fn in enumerate(self.xval_fns):
split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'],
- self.folds, i)
+ self.folds, i, write_blanks=self.sent_seg)
split_n_r(self.src_blanks_fn, xval_fn['trainsrc'], xval_fn['src'],
- self.folds, i)
+ self.folds, i, write_blanks=self.sent_seg)
split_n_r(self.cgtag_blanks_fn, xval_fn['traincgtag'],
- xval_fn['cgtag'], self.folds, i)
+ xval_fn['cgtag'], self.folds, i,
+ write_blanks=self.sent_seg)
def can_run_experiment(self, experiment_func):
if self.no_tsx and getattr(experiment_func, 'needs_tsx', False):
@@ -318,9 +363,10 @@
lang_root = pjoin(args.languagesdir, 'apertium-' + lang)
def mk_lab():
- return LanguageTaggerLab(lang, lang_root, taggers,
- args.folds, reuse=args.reuse,
- reuse_dic=args.reuse_dic)
+ return LanguageTaggerLab(
+ lang, lang_root, taggers, args.folds,
+ sent_seg=args.sent_seg, use_cg_src=args.use_cg_src,
+ reuse=args.reuse, reuse_dic=args.reuse_dic)
try:
lab = mk_lab()
except MissingLanguageDataException as e:
@@ -329,6 +375,7 @@
check_run(['./autogen.sh'])
check_run(['make'])
lab = mk_lab()
+
def run_tagger(tagger):
experiment = experiments[tagger]
if lab.can_run_experiment(experiment):
@@ -336,7 +383,8 @@
print("Running {}/{}".format(lang, tagger))
else:
try:
- languages_tagger_accuracies[lang][tagger] = experiment(lab)
+ languages_tagger_accuracies[lang][tagger] = \
+ experiment(lab)
except:
languages_tagger_accuracies[lang][tagger] = None
traceback.print_exc()
Index: branches/apertium-tagger/experiments/shell_wrappers.py
===================================================================
--- branches/apertium-tagger/experiments/shell_wrappers.py (revision 71356)
+++ branches/apertium-tagger/experiments/shell_wrappers.py (revision 71357)
@@ -12,7 +12,7 @@
BYTES_SENT_END_RE = re.compile(br'/[.!?]\$$')
SENT_END_RE = re.compile(r'/[.!?]\$$')
-SELECT_RE = re.compile(br'')
+CG_TAG_RE = re.compile(br'()|()')
def run(func):
@@ -92,6 +92,8 @@
cmd.insert(3, model[7:])
elif model == 'lwsw':
cmd.insert(2, '--sliding-window')
+ elif model == 'percep':
+ cmd.insert(2, '--perceptron')
return cmd
@@ -104,6 +106,23 @@
@proc_filter
+def tagger_train_percep(model_fn, train_fn, sent_seg=False,
+ trainsrc_fn=None, mtx_fn=None, iterations=10):
+ cmd = [
+ 'apertium-tagger',
+ '--skip-on-error',
+ '-xs',
+ str(iterations),
+ model_fn,
+ train_fn,
+ trainsrc_fn,
+ mtx_fn]
+ if sent_seg:
+ cmd.insert(1, '--sent-seg')
+ return cmd
+
+
+@proc_filter
def tagger_train_sup(model_type, model_fn, train_fn,
trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0,
ambg_classes=10, cg_aug=0, cgtrain_fn=None):
@@ -187,7 +206,7 @@
output_f = open(output, 'wb')
await cleanstream_proc.stdout.readline()
async for line in cleanstream_proc.stdout:
- line = SELECT_RE.sub(b'', line)
+ line = CG_TAG_RE.sub(b'', line)
output_f.write(line)
if BYTES_SENT_END_RE.search(line):
output_f.write(b'\n')
@@ -199,6 +218,16 @@
return line
+@filter
+def strip_cg_comments(line):
+ # Empty analysis will get stripped from ref
+ # so must also be stripped from src
+ if line.startswith(';') and '""' not in line:
+ return line[1:]
+ else:
+ return line
+
+
filter_dix = functools.partial(
MapFilter,
pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line,
@@ -260,6 +289,7 @@
valid_sent = True
else:
buff.append(line)
+ # XXX: Perceptron should be trained with unknowns
if '/*' in line:
valid_sent = False
if invalidate_func is not None and invalidate_func(line):
@@ -266,7 +296,7 @@
valid_sent = False
-def split_n_r(corpus_fn, train_fn, ref_fn, n, r):
+def split_n_r(corpus_fn, train_fn, ref_fn, n, r, write_blanks=True):
sentences = 0
with open(corpus_fn) as corpus_file:
for line in corpus_file.readlines():
@@ -283,6 +313,11 @@
with open(train_fn, 'w') as train_file, open(ref_fn, 'w') as ref_file:
for line in corpus_file.readlines():
if line.strip() == '':
+ if write_blanks:
+ if split_left <= index < split_right:
+ ref_file.write('\n')
+ else:
+ train_file.write('\n')
index += 1
elif split_left <= index < split_right:
ref_file.write(line)