Index: branches/apertium-tagger/experiments/experiments.py =================================================================== --- branches/apertium-tagger/experiments/experiments.py (revision 70327) +++ branches/apertium-tagger/experiments/experiments.py (revision 70328) @@ -109,8 +109,6 @@ else: cg_aug_t = None for do_cg in [None, 'in', 'dual', 'inv']: - if do_cg == 'inv': - continue for is_supervised, model in [(True, 'bigram'), (False, 'bigram'), (False, 'lwsw')]: Index: branches/apertium-tagger/experiments/run_experiment.py =================================================================== --- branches/apertium-tagger/experiments/run_experiment.py (revision 70327) +++ branches/apertium-tagger/experiments/run_experiment.py (revision 70328) @@ -15,7 +15,7 @@ from experiments import experiment_groups, experiments from shell_utils import cd, check_run from shell_wrappers import (cg_proc, copy_blanks, extract_src, fix_dix, - run_cg_conv_clean, split_n_r, strip_blanks, + cg_conv_clean, split_n_r, strip_blanks, strip_unknown_sent) loop = asyncio.get_event_loop() @@ -133,8 +133,8 @@ PREPROCESSER_MAP = { - 'cg': run_cg_conv_clean, - 'cgr': functools.partial(run_cg_conv_clean, rtl=True), + 'cg': cg_conv_clean, + 'cgr': functools.partial(cg_conv_clean, rtl=True), } @@ -278,14 +278,14 @@ invalidate_func=LANGUAGE_INVALIDATOR_MAP.get( self.lang, lambda x: False)) strip_blanks(self.joined_fn, self.ref_fn) - extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn) + extract_src(self.morphology_fn, + input_fn=self.ref_fn, output_fn=self.src_fn) copy_blanks(self.joined_fn, self.src_fn, self.src_blanks_fn) cg_proc(self.cg_fn, input=self.src_fn, output=self.cgtag_fn) copy_blanks(self.joined_fn, self.cgtag_fn, self.cgtag_blanks_fn) if not reuse_dic and not self.crp_not_dic and not self.no_tsx: - loop.run_until_complete( fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn, - output_fn=self.dic_fn)) + output_fn=self.dic_fn) for i, xval_fn in enumerate(self.xval_fns): split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'], Index: branches/apertium-tagger/experiments/shell_utils.py =================================================================== --- branches/apertium-tagger/experiments/shell_utils.py (revision 70327) +++ branches/apertium-tagger/experiments/shell_utils.py (revision 70328) @@ -89,13 +89,9 @@ await proc.stdin.write(b) -async def dir_out(proc, output_fn): - output_file = open(output_fn, 'w') - while 1: - b = await proc.read(1024) - if not len(b): - return - output_file.write(b) +async def dir_out(aiter, output_file): + async for chunk in aiter: + output_file.write(chunk) class Tee(MapFilter): Index: branches/apertium-tagger/experiments/shell_wrappers.py =================================================================== --- branches/apertium-tagger/experiments/shell_wrappers.py (revision 70327) +++ branches/apertium-tagger/experiments/shell_wrappers.py (revision 70328) @@ -6,7 +6,7 @@ import aitertools import asyncio from asyncio.subprocess import create_subprocess_exec -from shell_utils import MapFilter, filter, proc_filter, writeiter +from shell_utils import MapFilter, filter, proc_filter, writeiter, dir_out loop = asyncio.get_event_loop() @@ -15,22 +15,23 @@ SELECT_RE = re.compile(br'') -@proc_filter -def lt_proc(morphology_fn, dictcase=False): - cmd = ['lt-proc', morphology_fn] - if dictcase: - cmd.insert(1, '--dictionary-case') - return cmd +def run(func): + @functools.wraps(func) + def inner(*args, **kwargs): + return loop.run_until_complete(func(*args, **kwargs)) + return inner -@filter(output_separator='\n') def extract_words(line): if line: - return line.split('^')[1].split('/')[0] + return line.split(b'^')[1].split(b'/')[0] + b'\n' else: return '' +extract_words_async = functools.partial(MapFilter, tran=extract_words) + + @filter(output_separator='\n') def extract_first_analysis(line): return '/'.join(line.split('/')[0:2]).strip().rstrip('$') + '$' @@ -45,11 +46,44 @@ return line + b'\n' -def extract_src(morphology_fn, input, output=None): - ref_words_iter = extract_words(input=input) - return lt_proc(morphology_fn, input=ref_words_iter, output=output) +def push_in_stack(orig_line): + line = orig_line.decode('utf-8') + dirty = False + surf, *analyses = line.strip().strip('^$').split('/') + for i, analysis in enumerate(analyses): + if '># ' in analysis: + dirty = True + lemma, tags = analysis.split('<', 1) + tags, stack = tags.rsplit('># ', 1) + analyses[i] = "{}# {}<{}>".format(lemma, stack, tags) + if dirty: + res = "^{}/{}$\n".format(surf, '/'.join(analyses)).encode('utf-8') + if res.count(b'$') == 2: + print(repr(orig_line), repr(surf), repr(analyses), repr(res)) + return res + return orig_line +push_in_stack_async = functools.partial(MapFilter, tran=push_in_stack) + + +@run +async def extract_src(morphology_fn, input_fn, output_fn): + print("lt-proc", morphology_fn, input_fn, output_fn) + pipes = [] + + out = open(output_fn, 'wb') + + in_words = extract_words_async(aitertools.chain(open(input_fn, 'rb'))) + lt_proc = await create_subprocess_exec('lt-proc', morphology_fn, + stdin=PIPE, stdout=PIPE) + pipes.append(lt_proc.wait()) + pipes.append(writeiter(in_words, lt_proc)) + pipes.append(dir_out(push_in_stack_async(lt_proc.stdout), out)) + + return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes)) + + def insert_model(cmd, model, tagging=False): if model == 'bigram': pass @@ -139,6 +173,7 @@ return ['apertium-cleanstream', '-n'] +@run async def cg_conv_clean(input, output, rtl=False): cleanstream_inpipe, cg_conv_outpipe = os.pipe() await create_subprocess_exec( @@ -158,11 +193,6 @@ output_f.write(b'\n') -def run_cg_conv_clean(input, output, **kwargs): - return loop.run_until_complete( - cg_conv_clean(input, output, **kwargs)) - - @filter def strip_blanks(line): if line != '\n': @@ -175,6 +205,7 @@ tran=lambda line: line.split(b":")[0] + b"\n") +@run async def fix_dix(morphology_fn, tsx_fn, dix_fn, output_fn): pipes = [] @@ -215,12 +246,6 @@ return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes)) -filter_dix = functools.partial( - MapFilter, - pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line, - tran=lambda line: line.split(b":")[0] + b"\n") - - @filter(iter_filter=True) def strip_unknown_sent(gen, invalidate_func=None): buff = []