Index: branches/apertium-tagger/experiments/evaluate_tagger.py
===================================================================
--- branches/apertium-tagger/experiments/evaluate_tagger.py	(nonexistent)
+++ branches/apertium-tagger/experiments/evaluate_tagger.py	(revision 68351)
@@ -0,0 +1,468 @@
+#!/usr/bin/env python3
+
+import sys
+
+skipUnknown = True
+testFunc = False
+
+# src: ^власти/власть/власть/власть/власть/власть$
+# ref: ^власти/власть<@P←>$
+# tst: ^власти/власть$
+
+
+def readings(w, testFunc):
+    readings = []
+    removed_readings = []
+    reading = ''
+    seen = False
+    for c in w:
+        if c == '/' and seen == False:
+            seen = True
+            continue
+        elif (c == '/' or c == '$') and seen:
+            if len(reading) < 1:
+                print('Error: ', w, file=sys.stderr)
+                continue  # skip the empty reading rather than crash on reading[0]
+            if reading[0] == '¬':
+                removed_readings.append(reading)
+            else:
+                if testFunc:
+                    readings.append(
+                        reading_lemma(reading) + reading_msd(reading) + reading_func(reading))
+                else:
+                    readings.append(
+                        reading_lemma(reading) + reading_msd(reading))
+            reading = ''
+            continue
+        if seen:
+            reading = reading + c
+    return (readings, removed_readings)
+
+
+def clean(s):
+    o = s.replace('¹', '').replace('²', '').replace('³', '').replace('⁻', '')
+    return o
+
+
+def reading_lemma(r):
+    r = clean(r)
+    return r.split('<')[0]
+
+
+def reading_pos(r):
+    if r.count('<') < 1:
+        return ''
+    return '<' + r.split('<')[1].split('>')[0] + '>'
+
+
+def reading_msd(r):
+    msd = ''
+    seen = False
+    tag = ''
+    for c in r:
+        if c == '<':
+            seen = True
+        if c == '>':
+            tag = tag + c
+            if tag.count(':') > 0 or len(tag) < 2:  # {
+                continue
+            elif tag[1] == '@':
+                continue
+            else:
+                msd = msd + tag
+            tag = ''
+            continue
+        if seen:
+            tag = tag + c
+    return msd
+
+
+def reading_func(r):
+    func = ''
+    seen = False
+    for c in r:
+        if c == '@':
+            seen = True
+        if c == '>':
+            seen = False
+        if seen:
+            func = func + c
+    func = '<' + func + '>'
+    return func.replace('<>', '')
+
+
+def readings_rules(readings):
+    rules = set()
+    readings_rules = {}
+
+    for r in readings:
+        reading = ''
+        seen = False
+        tag = ''
+        first = True
+        for c in r:
+            if c == '<' and first == False:
+                seen = True
+            elif c == '<' and first == True:
+                seen = True
+                first = False
+            if c == '+':
+                first = True
+            if c == '¬':
+                continue
+            if c == '>':
+                tag = tag + c
+                if tag.count(':'):
+                    rules.add(tag)
+                else:
+                    reading = reading + tag
+                tag = ''
+                seen = False
+            if seen and not first:
+                tag = tag + c
+            elif first:
+                reading = reading + c
+        if reading not in readings_rules:
+            readings_rules[reading] = []
+        readings_rules[reading] = list(rules)
+    return (rules, readings_rules)
+
+
+class TaggerEvaluator:
+    def __init__(self, src_fn, ref_fn, tst_fn):
+        self.src_f = open(src_fn)
+        self.ref_f = open(ref_fn)
+        self.tst_f = open(tst_fn)
+
+        # Sanity check: all three files must be line-aligned
+
+        src_l = len(self.src_f.readlines())
+        ref_l = len(self.ref_f.readlines())
+        tst_l = len(self.tst_f.readlines())
+
+        self.lines = -1
+
+        if not (src_l == ref_l == tst_l):
+            print(src_l, ref_l, tst_l, file=sys.stderr)
+        else:
+            self.lines = src_l
+
+        self.src_f.seek(0)
+        self.ref_f.seek(0)
+        self.tst_f.seek(0)
+
+    def run_analysis(self):
+        # feiler is module-level so that print_analyses (below) can report the
+        # line numbers on which each rule misfired
+        global feiler
+
+        self.n_tokens = 0
+        self.n_unknown = 0
+        n_line = 0
+        # tp tn fp fn
+        self.rules = {}  # rules['SELECT:462'] = (0, 0, 0, 0)
+
+        applic = {}  # applic['SELECT:462'] = 0
+
+        feiler = {}  # feiler['SELECT:462'] = [13, 45, 100]
+
+        self.n_truepositive = 0
+        self.n_truenegative = 0
+        self.n_falsepositive = 0
+        self.n_falsenegative = 0
+
+        self.n_ref_readings = 0
+        self.n_src_readings = 0
+        self.n_tst_readings = 0
+
+        self.n_tst_lema_correct = 0
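+        # Counters below: "tst" rows score the tagger output under test, while
+        # "bas" rows score a baseline that always keeps the first analysis of
+        # the ambiguous source (see the comparisons later in this method).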
self.n_tst_pos_correct = 0 + self.n_tst_lemapos_correct = 0 + self.n_tst_msd_correct = 0 + self.n_tst_lemamsd_correct = 0 + self.n_tst_func_correct = 0 + + self.n_bas_lema_correct = 0 + self.n_bas_pos_correct = 0 + self.n_bas_lemapos_correct = 0 + self.n_bas_msd_correct = 0 + self.n_bas_lemamsd_correct = 0 + self.n_bas_func_correct = 0 + + for line in range(0, self.lines): # { + n_line = n_line + 1 + src_w = self.src_f.readline() + ref_w = self.ref_f.readline() + tst_w = self.tst_f.readline() + + #print('src_w', src_w); + #print('ref_w', ref_w); + #print('tst_w', tst_w); + + if src_w.count('¶') > 0: # { + continue + #} + + self.n_tokens = self.n_tokens + 1 + + tst_readings = [] + tst_lema = '' + tst_pos = '' + tst_func = '' + tst_msd = '' + src_readings = [] + src_lema = '' + src_pos = '' + src_func = '' + src_msd = '' + ref_readings = [] + ref_lema = '' + ref_pos = '' + ref_func = '' + ref_msd = '' + + if tst_w.count('/*') < 1 and tst_w[0] == '^': # { + tst_readings, tst_removed = readings(tst_w, testFunc) + tst_lema = reading_lemma(tst_readings[0]) + tst_pos = reading_pos(tst_readings[0]) + tst_func = reading_func(tst_readings[0]) + tst_msd = reading_msd(tst_readings[0]) + + src_readings, src_removed = readings(src_w, testFunc) + src_lema = reading_lemma(src_readings[0]) + src_pos = reading_pos(src_readings[0]) + src_func = reading_func(src_readings[0]) + src_msd = reading_msd(src_readings[0]) + + self.n_src_readings = self.n_src_readings + len(src_readings) + self.n_tst_readings = self.n_tst_readings + len(tst_readings) + #} + + if ref_w.count('/*') < 1 and ref_w[0] == '^': # { + ref_readings, ref_removed = readings(ref_w, testFunc) + ref_lema = reading_lemma(ref_readings[0]) + ref_pos = reading_pos(ref_readings[0]) + ref_func = reading_func(ref_readings[0]) + ref_msd = reading_msd(ref_readings[0]) + #} + + if tst_w.count('/*') > 0 and skipUnknown == True: # { + print('*\t', ref_lema, ref_msd) + self.n_unknown = self.n_unknown + 1 + continue + #} + + self.n_ref_readings = self.n_ref_readings + 1 + + ####################################################################### + + tst_rules, tst_readings_rules = readings_rules(tst_readings + tst_removed) + #print('READINGS:', tst_readings); + #print('RULES_READINGS:', tst_readings_rules); + for rule in list(tst_rules): # { + if rule not in self.rules: # { + self.rules[rule] = (0, 0, 0, 0) + #} + #print('RULES:', rule, rules[rule]); + #} + + for tst_reading in tst_readings: # { + if tst_reading not in ref_readings: # { + self.n_falsepositive = self.n_falsepositive + 1 + for rule in tst_readings_rules[tst_reading]: # { + (tp, tn, fp, fn) = self.rules[rule] + fp = fp + 1 + self.rules[rule] = (tp, tn, fp, fn) + #} + else: # { + self.n_truepositive = self.n_truepositive + 1 + for rule in tst_readings_rules[tst_reading]: # { + (tp, tn, fp, fn) = self.rules[rule] + tp = tp + 1 + self.rules[rule] = (tp, tn, fp, fn) + #} + #} + #} + + for ref_reading in ref_readings: # { + if ref_reading not in tst_readings: # { + print('[' + str(n_line) + '] FALSENEG:', ref_reading, tst_readings) + self.n_falsenegative = self.n_falsenegative + 1 + if ref_reading not in tst_readings_rules: # { + continue + #} + for rule in tst_readings_rules[ref_reading]: # { + (tp, tn, fp, fn) = self.rules[rule] + fn = fn + 1 + self.rules[rule] = (tp, tn, fp, fn) + if rule not in feiler: # { + feiler[rule] = [] + #} + feiler[rule].append(n_line) + #} + #} + #} + + for src_reading in src_readings: # { + # { + if src_reading not in ref_readings and src_reading not in tst_readings: + 
self.n_truenegative = self.n_truenegative + 1 + if src_reading not in tst_readings_rules: # { + continue + #} + for rule in tst_readings_rules[src_reading]: # { + (tp, tn, fp, fn) = self.rules[rule] + tn = tn + 1 + self.rules[rule] = (tp, tn, fp, fn) + #} + #} + #} + + ####################################################################### + + if tst_lema == ref_lema and tst_msd == ref_msd: # { + print('=\t', tst_lema, tst_msd) + else: # { + #print('ref:', ref_readings, file=sys.stderr); + print('-\t', ref_lema, ref_msd, src_readings) + #print('tst:', tst_readings, file=sys.stderr); + print('+\t', tst_lema, tst_msd, tst_readings) + #} + + # { + if ref_lema + ref_msd not in tst_readings and ref_lema + ref_msd in src_readings: + print('!\t', ref_lema + ref_msd, tst_readings) + #} + + if src_lema == ref_lema: + self.n_bas_lema_correct = self.n_bas_lema_correct + 1 + if src_lema == ref_lema and src_pos == ref_pos: + self.n_bas_lemapos_correct = self.n_bas_lemapos_correct + 1 + if src_lema == ref_lema and src_msd == ref_msd: + self.n_bas_lemamsd_correct = self.n_bas_lemamsd_correct + 1 + if src_pos == ref_pos: + self.n_bas_pos_correct = self.n_bas_pos_correct + 1 + if src_msd == ref_msd: + self.n_bas_msd_correct = self.n_bas_msd_correct + 1 + + if tst_lema == ref_lema: + self.n_tst_lema_correct = self.n_tst_lema_correct + 1 + if tst_lema == ref_lema and tst_pos == ref_pos: + self.n_tst_lemapos_correct = self.n_tst_lemapos_correct + 1 + if tst_lema == ref_lema and tst_msd == ref_msd: + self.n_tst_lemamsd_correct = self.n_tst_lemamsd_correct + 1 + if tst_pos == ref_pos: + self.n_tst_pos_correct = self.n_tst_pos_correct + 1 + if tst_msd == ref_msd: + self.n_tst_msd_correct = self.n_tst_msd_correct + 1 + if tst_func == ref_func and ref_func != '': + self.n_tst_func_correct = self.n_tst_func_correct + 1 + + @property + def precision(self): + return float(self.n_truepositive) / (float(self.n_truepositive + self.n_falsepositive)) + + @property + def recall(self): + return float(self.n_truepositive) / (float(self.n_truepositive + self.n_falsenegative)) + + @property + def accuracy(self): + return float(self.n_truepositive + self.n_truenegative) / \ + (float(self.n_truepositive + self.n_falsenegative + + self.n_truenegative + self.n_falsepositive)) + + def print_analyses(self): + # print(""); + #} + + # Accuracy = number of correct analyses / number of analyses in ref; + # False positives + + # Lemma accuracy + # POS accuracy + # MSD accuracy + # Func accuracy + + print('') + + print('unknown :\t', self.n_unknown, + '(', (float(self.n_unknown) / float(self.n_ref_readings)) * 100.0, ')') + + print('') + + print('truepos :\t', self.n_truepositive) + print('trueneg :\t', self.n_truenegative) + print('falsepos :\t', self.n_falsepositive) + print('falseneg :\t', self.n_falsenegative) + + print('') + + print('precision:\t', self.precision, '\t( true pos / all pos )') + print('recall :\t', self.recall, '\t( true pos / (true pos + false neg) )') + print('accuracy :\t', self.accuracy, '\t((true pos + true neg) / (everything) )') + + print('') + + src_ambig_rate = float(self.n_src_readings) / float(self.n_ref_readings) + tst_ambig_rate = float(self.n_tst_readings) / float(self.n_ref_readings) + print('tokens :\t', self.n_tokens) + print('src_ambig:\t', src_ambig_rate) + print('tst_ambig:\t', tst_ambig_rate) + print('resolved :\t %.2f%%' % + (100.0 - (tst_ambig_rate / src_ambig_rate * 100.0))) + + print('') + + p_bas_lema_correct = float(self.n_bas_lema_correct) / float(self.n_ref_readings) * 100.0 + 
p_bas_pos_correct = float(self.n_bas_pos_correct) / float(self.n_ref_readings) * 100.0 + p_bas_lemapos_correct = float( + self.n_bas_lemapos_correct) / float(self.n_ref_readings) * 100.0 + p_bas_msd_correct = float(self.n_bas_msd_correct) / float(self.n_ref_readings) * 100.0 + p_bas_lemamsd_correct = float( + self.n_bas_lemamsd_correct) / float(self.n_ref_readings) * 100.0 + p_bas_func_correct = float(self.n_bas_func_correct) / float(self.n_ref_readings) * 100.0 + + print('lem :\t', p_bas_lema_correct) + #print('pos :\t',p_bas_pos_correct); + print('lem+pos :\t', p_bas_lemapos_correct) + #print('msd :\t',p_bas_msd_correct); + print('lem+msd :\t', p_bas_lemamsd_correct) + print('func :\t', p_bas_func_correct) + + print('') + + p_tst_lema_correct = float(self.n_tst_lema_correct) / float(self.n_ref_readings) * 100.0 + p_tst_pos_correct = float(self.n_tst_pos_correct) / float(self.n_ref_readings) * 100.0 + p_tst_lemapos_correct = float( + self.n_tst_lemapos_correct) / float(self.n_ref_readings) * 100.0 + p_tst_msd_correct = float(self.n_tst_msd_correct) / float(self.n_ref_readings) * 100.0 + p_tst_lemamsd_correct = float( + self.n_tst_lemamsd_correct) / float(self.n_ref_readings) * 100.0 + p_tst_func_correct = float(self.n_tst_func_correct) / float(self.n_ref_readings) * 100.0 + + print('lem :\t', p_tst_lema_correct, + '(', p_tst_lema_correct - p_bas_lema_correct, ')') + #print('pos :\t',p_tst_pos_correct, '(', p_tst_pos_correct-p_bas_pos_correct, ')'); + print('lem+pos :\t', p_tst_lemapos_correct, + '(', p_tst_lemapos_correct - p_bas_lemapos_correct, ')') + #print('msd :\t',p_tst_msd_correct, '(', p_tst_msd_correct-p_bas_msd_correct, ')'); + print('lem+msd :\t', p_tst_lemamsd_correct, + '(', p_tst_lemamsd_correct - p_bas_lemamsd_correct, ')') + print('func :\t', p_tst_func_correct, + '(', p_tst_func_correct - p_bas_func_correct, ')') + + rkeys = list(self.rules.keys()) + rkeys.sort() + print('') + print('Rule No.\tTP\tTN\tFP\tFN') + for rule in rkeys: + print('%s\t%d\t%d\t%d\t%d' % + (rule, self.rules[rule][0], self.rules[rule][1], self.rules[rule][2], self.rules[rule][3])) + print('') + for rule in rkeys: + if rule in feiler: + print(rule, '\t', feiler[rule]) + +def main(): + te = TaggerEvaluator(sys.argv[1], sys.argv[2], sys.argv[3]) + te.run_analysis() + te.print_analyses() + +if __name__ == '__main__': + main() Index: branches/apertium-tagger/experiments/run_experiment.py =================================================================== --- branches/apertium-tagger/experiments/run_experiment.py (nonexistent) +++ branches/apertium-tagger/experiments/run_experiment.py (revision 68351) @@ -0,0 +1,666 @@ +import statistics +import sys +from os import mkdir +from os.path import isdir, join as pjoin, exists as pexists +import subprocess +from subprocess import PIPE +import functools +import itertools +import aitertools +import argparse +import tabulate +import csv +import asyncio +from asyncio.subprocess import create_subprocess_exec +from collections import namedtuple +from contextlib import contextmanager +import os +from pprint import pprint + +from evaluate_tagger import TaggerEvaluator +from split_corpus_n import main as split_corpus_n + +loop = asyncio.get_event_loop() + +@contextmanager +def cd(newdir): + prevdir = os.getcwd() + os.chdir(os.path.expanduser(newdir)) + try: + yield + finally: + os.chdir(prevdir) + + +TMPDIR = 'experimenttmp' + +DEFAULT_TEXTS = { + 'cat': ['texts/miscellaneous.tagged.txt'], + 'spa': ['texts/miscellaneous.tagged.txt'], + 'hbs': 
['hbs-tagger-data/hbs.tagged.txt'], + 'rus': ['texts/son-smešnogo-čeloveka.ana.txt'], + 'kaz': ['eval/ref.1000.txt'], + 'por': [ + 'texts/bering.txt', + 'texts/cultura.txt', + 'texts/beringia.txt', + 'texts/raio.txt', + 'texts/música.txt', + 'texts/água.txt', + 'texts/akatsuki.txt', + ], +} + +DEFAULT_LANGUAGES = ['cat', 'spa', 'hbs', 'rus', 'kaz', 'por', 'swe'] + +def comma_list(s): + if s == '()': + return [] + return s.split(',') + +def comma_colon_dict(s): + d = {} + for bit in s.split(','): + pair = bit.split(':') + d[pair[0]] = pair[1].split(',') + return d + +def parse_args(): + parser = argparse.ArgumentParser(description="Runs a series of experiments on different part of speech taggers and different language data.") + parser.add_argument('languagesdir', help="Path to the directory containing all the individaul language data directories") + parser.add_argument('--languages', help="Only run experiments for these languages, comma separated", default=DEFAULT_LANGUAGES, type=comma_list) + parser.add_argument('--taggers', help="Only run experiments with these taggers, comma separated", default=DEFAULT_TAGGERS, type=comma_list) + parser.add_argument('--language-texts', help="Use different texts per language, coma seperated colon pairs", default=DEFAULT_TEXTS, type=comma_colon_dict) + parser.add_argument('--folds', help="Use x-fold validation instead of 10-fold", default=10, type=int) + parser.add_argument('--reuse', help="Reuse preprocesed dictionary and corpa from previous run", action='store_true') + + return parser.parse_args() + +def read1k_chunker(f): + def read1k(): + return f.read(1024) + return iter(read1k, '') + +def filter(func=None, iter_filter=False, input_chunker=None, output_separator=''): + if func is None: + def defer(func=None): + return filter(func, iter_filter=iter_filter, input_chunker=input_chunker, output_separator=output_separator) + return defer + + def generator(input_iter): + for line in input_iter: + filtered = func(line) + if filtered is not None: + yield filtered + output_separator + + @functools.wraps(func) + def wrapper(input, output=None): + input_file = None + if isinstance(input, str): + input_iter = input_file = open(input) + if input_chunker: + input_iter = input_chunker(input_file) + else: + input_iter = input_file.readlines() + else: + input_iter = input + if iter_filter: + gen = func(input_iter) + else: + gen = generator(input_iter) + if output is None: + return gen + output_file = open(output, 'w') + for line in gen: + output_file.write(line) + if input_file is not None: + input_file.close() + output_file.close() + return wrapper + + +class MapFilter: + def __init__(self, aiterable, pred=None, tran=None): + self.aiterable = aiterable + self.pred = pred + self.tran = tran + + async def __aiter__(self): + return self + + async def __anext__(self): + while True: + payload = await self.aiterable.__anext__() + if self.pred is None or self.pred(payload): + if self.tran is None: + return payload + else: + return self.tran(payload) + + +async def dir_in(input_fn, proc): + input_file = open(input_fn) + while 1: + b = input_file.read(1024) + if not len(b): + return + await proc.stdin.write(b) + + +async def dir_out(proc, output_fn): + output_file = open(output_fn, 'w') + while 1: + b = await in_proc.read(1024) + if not len(b): + return + output_file.write(b) + + +class Tee(MapFilter): + def __init__(self, aiterable, log_file): + self.log_file = open(log_file, 'wb') + super().__init__(aiterable, tran=self.tran) + + def tran(self, bit): + 
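+        # tee: log every chunk flowing through the pipe, then pass it on unchanged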
self.log_file.write(bit) + return bit + + +async def pipe(in_proc, out_proc): + while 1: + b = await in_proc.stdout.read(16384) + if b == b'': + return + out_proc.stdin.write(b) + #await out_proc.stdin.drain() + +#async def readlines(in_proc): +# while 1: +# b = await in_proc.stdout.readline() +# if not len(b): +# return +# b + + +async def writeiter(iter, out_proc): + i = 0 + async for block in iter: + if (i % 10000) == 0: + print(".", end="", flush=True) + out_proc.stdin.write(block) + #await out_proc.stdin.drain() + i += 1 + print("writeiter done") + out_proc.stdin.write_eof() + +def proc_filter(func=None, output_chunker=None): + if func is None: + def defer(func): + return proc_filter(func, output_chunker=output_chunker) + return defer + + @functools.wraps(func) + def wrapper(*args, **kwargs): + input = kwargs.pop('input', None) + output = kwargs.pop('output', None) + + if callable(func): + cmd = func(*args, **kwargs) + else: + cmd = func + + kwargs = {} + if isinstance(input, str): + kwargs['stdin'] = open(input) + else: + kwargs['stdin'] = subprocess.PIPE + if output: + kwargs['stdout'] = open(output, 'w') + else: + kwargs['stdout'] = subprocess.PIPE + print("RUNNING: ", ' '.join(cmd)) + proc = subprocess.Popen(cmd, universal_newlines=True, **kwargs) + print(type(input)) + if not isinstance(input, str) and input is not None: + for line in input: + if 'apertium-destxt' in cmd: + print('des', line) + #if not isinstance(input, bytes): + #line = line.encode('utf8') + #print(proc.stdin) + proc.stdin.write(line) + if not output: + if output_chunker: + return output_chunker(proc.stdout) + else: + return proc.stdout.readlines() + stdout, stderr = proc.communicate() + retcode = proc.poll() + return subprocess.CompletedProcess(proc.args, retcode, stdout, stderr) + + return wrapper + + +@filter +def strip_blanks_(line): + if line != '\n': + return line + + +@filter(output_separator='\n') +def extract_words(line): + return line.split('^')[1].split('/')[0] + + +@filter(output_separator='\n') +def extract_first_analysis(line): + return '/'.join(line.split('/')[0:2]).strip().rstrip('$') + '$' + +strip_blanks = functools.partial(MapFilter, pred=lambda line: line != '\n') + + +@filter +def passthrough(line): + return line + +def prun(*args, **kwargs): + for key in ('stdin', 'stdout', 'stderr'): + if key not in kwargs: + kwargs[key] = subprocess.PIPE + print('prun', args, kwargs) + return subprocess.run(*args, **kwargs) + + +def check_run(cmd, *args, **kwargs): + kwargs['check'] = True + print("RUNNING: ", ' '.join(cmd)) + return subprocess.run(cmd, *args, **kwargs) + + +@proc_filter +def lt_proc(morphology_fn, dictcase=False): + cmd = ['lt-proc', morphology_fn] + if dictcase: + cmd.insert(1, '--dictionary-case') + return cmd + + +def insert_model(cmd, model, tagging=False): + if model == 'bigram': + pass + elif model.startswith('unigram'): + cmd.insert(2, '-u') + cmd.insert(3, model[7:]) + elif model == 'lwsw': + cmd.insert(2, '--sliding-window') + return cmd + + + +@proc_filter +def tagger_train_sup(model_type, model_fn, train_fn, trainsrc_fn=None, dic_fn=None, tsx_fn=None, iterations=0): + cmd = ['apertium-tagger', '--supervised={}'.format(iterations), model_fn] + if not all((trainsrc_fn, dic_fn, tsx_fn)) and model_type != 'unigram': + raise ValueError("Optional arguments required for non-unigram models") + if model_type != 'unigram': + #apertium-tagger -s 0 /tmp/$DN.dic /tmp/spa.misc.$i.trainsrc $TSX /tmp/spa.misc.$i.prob /tmp/spa.misc.$i.train /tmp/spa.misc.$i.trainsrc + cmd[2:2] = [dic_fn, 
trainsrc_fn, tsx_fn] + cmd.append(trainsrc_fn) + insert_model(cmd, model_type) + cmd.insert(-1, train_fn) + return cmd + + +@proc_filter +def tagger_train_unsup(model_type, model_fn, trainsrc_fn, dic_fn, tsx_fn, iterations=0): + if model_fn == 'unigram': + raise ValueError("No unsupervised training for unigram models") + cmd = ['apertium-tagger', '--train={}'.format(iterations), dic_fn, trainsrc_fn, tsx_fn, model_fn] + insert_model(cmd, model_type) + return cmd + + +@proc_filter +def tagger_tag(model_type, model_fn): + cmd = ['apertium-tagger', '--tagger', '--show-superficial', model_fn] + insert_model(cmd, model_type, tagging=True) + return cmd + +filter_dix = functools.partial( + MapFilter, + pred=lambda line: b"__REGEXP__" not in line and b":<:" not in line, + tran=lambda line: line.split(b":")[0] + b"\n") + +async def fix_dix(morphology_fn, tsx_fn, dix_fn, output_fn): + pipes = [] + + expand_proc = await create_subprocess_exec('lt-expand', dix_fn, stdout=PIPE) + filtered = filter_dix(expand_proc.stdout) + + extras = [".", "?", ";", ":", "!", "42", ",", "(", "\\[", ")", "\\]", "¿", "¡"] + for i, extra in enumerate(extras): + extras[i] = (extra + "\n").encode('utf-8') + with_extras = aitertools.chain(filtered, extras) + + lt_inpipe, destxt_outpipe = os.pipe() + destxt = await create_subprocess_exec('apertium-destxt', stdin=PIPE, stdout=destxt_outpipe) + os.close(destxt_outpipe) + + pipes.append(writeiter(with_extras, destxt)) + + filter_ambg_inpipe, lt_outpipe = os.pipe() + lt_proc = await create_subprocess_exec('lt-proc', morphology_fn, stdin=lt_inpipe, stdout=lt_outpipe) + os.close(lt_outpipe) + + filter_ambg = await create_subprocess_exec('apertium-filter-ambiguity', tsx_fn, stdin=filter_ambg_inpipe, stdout=open(output_fn, 'wb')) + pipes.append(filter_ambg.wait()) + return await asyncio.gather(*(asyncio.ensure_future(cr) for cr in pipes)) + + +#def lt_proc(morphology_fn, input_fn, output_fn=None, dictcase=False): + #cmd = ['lt-proc', morphology_fn] + #if dictcase: + #cmd.insert(1, '--dictionary-case') + #if isinstance(input_fn, str): + #cmd.append(input_fn) + #cmd.append(output_fn) + #return prun(cmd) + #else: + #kwargs = {'stdin': subprocess.PIPE} + #if output_fn: + ##kwargs['stdout'] = open(output_fn, 'w') + #with subprocess.Popen(cmd, **kwargs) as proc: + #if not isinstance(input_fn, str): + #for line in input_fn: + #if not isinstance(input_fn, bytes): + #line = line.encode('utf8') + #proc.stdin.write(line) + #stdout, stderr = proc.communicate() + #retcode = proc.poll() + #return subprocess.CompletedProcess(proc.args, retcode, stdout, stderr) + +@proc_filter +def cg_proc(grammar_fn, dictcase=True): + cmd = ['cg-proc', grammar_fn] + if dictcase: + cmd.insert(1, '-w') + return cmd + + +@filter(iter_filter=True) +def strip_unknown_sent(gen): + buff = [] + valid_sent = True + for line in gen: + if line.strip() == '': + if valid_sent: + for line in buff: + yield line + yield '\n' + buff = [] + valid_sent = True + else: + buff.append(line) + if '/*' in line: + valid_sent = False + +def split_n_r(corpus_fn, train_fn, ref_fn, n, r): + sentences = 0 + with open(corpus_fn) as corpus_file: + for line in corpus_file.readlines(): + if line.strip() == '': + sentences = sentences + 1 + + split_left = int(float(sentences) * r / n) + split_right = int(float(sentences) * (r + 1) / n) + + buffer = '' + index = 0 + + corpus_file.seek(0) + + with open(train_fn, 'w') as train_file, open(ref_fn, 'w') as ref_file: + for line in corpus_file.readlines(): + if line.strip() == '': + index = index + 1 + elif 
split_left <= index < split_right:
+                ref_file.write(line)
+            else:
+                train_file.write(line)
+
+
+class MissingLanguageDataException(Exception):
+    def __init__(self, fn):
+        self.fn = fn
+
+
+def xval_experiment(name):
+    def dec(func=None):
+        @functools.wraps(func)
+        def wrapper(self, *args, **kwargs):
+            accuracies = []
+            for i, xval_fns in enumerate(self.xval_fns):
+                #(prefix, train_fn, src_fn, ref_fn)
+                xval_fns['test'] = xval_fns['prefix'] + 'test.' + name
+                xval_fns['model'] = xval_fns['prefix'] + 'model.' + name
+                func(self, xval_fns)  # train_fn, src_fn, ref_fn, test_fn, model_fn
+                evaluator = TaggerEvaluator(xval_fns['src'], xval_fns['ref'], xval_fns['test'])
+                evaluator.run_analysis()
+                accuracies.append(evaluator.accuracy)
+            return (min(accuracies), max(accuracies), statistics.mean(accuracies), statistics.stdev(accuracies))
+        return wrapper
+    return dec
+
+
+def unigram_taggers(cls):
+    for do_cg in [False, True]:
+        for unigram_type in range(1, 4):
+            unigram_model = 'unigram' + str(unigram_type)
+            name = ('cg' if do_cg else '') + unigram_model
+
+            @cls.reg_experiment
+            @xval_experiment(name)
+            def experiment(self, xval_fns, do_cg=do_cg, unigram_model=unigram_model):
+                tagger_train_sup('unigram', unigram_model, xval_fns['model'], xval_fns['train'])
+                if do_cg:
+                    tagger_input = cg_proc(self.cg_fn, input=xval_fns['src'])
+                else:
+                    tagger_input = xval_fns['src']
+                tagger_tag(unigram_model, xval_fns['model'], input=tagger_input, output=xval_fns['test'])
+            setattr(cls, 'experiment_' + name, experiment)
+    return cls
+
+
+def extract_src(morphology_fn, input, output=None):
+    ref_words_iter = extract_words(input=input)
+    return lt_proc(morphology_fn, input=ref_words_iter, output=output)
+
+
+class LanguageTaggerExperimentor:
+    EVAL_PREFIX = 'experiment_'
+
+    experiments = {}
+
+    def __init__(self, lang, lang_root, texts, folds, reuse=False):
+        self.lang = lang
+        self.work_dir = pjoin(TMPDIR, lang)
+
+        pair_name = 'apertium-{0}.{0}'.format(lang)
+        self.morphology_fn = pjoin(lang_root, lang + '.automorf.bin')
+        self.cg_fn = pjoin(lang_root, lang + '.rlx.bin')
+        self.dix_fn = pjoin(lang_root, pair_name + '.dix')
+        self.tsx_fn = pjoin(lang_root, pair_name + '.tsx')
+
+        self.text_fns = [pjoin(lang_root, text) for text in texts]
+        self.joined_fn = pjoin(self.work_dir, 'joined')
+        self.ref_fn = pjoin(self.work_dir, 'ref')
+        self.src_fn = pjoin(self.work_dir, 'src')
+        self.dic_fn = pjoin(self.work_dir, 'filtered.dic')
+
+        self.xval_fns = []
+        self.folds = folds
+
+        for i in range(folds):
+            xval_prefix = pjoin(self.work_dir, 'xval.{}.'.format(i))
+            xval_src_fn = xval_prefix + 'src'
+            xval_trainsrc_fn = xval_prefix + 'trainsrc'
+            xval_train_fn = xval_prefix + 'train'
+            xval_ref_fn = xval_prefix + 'ref'
+
+            self.xval_fns.append({
+                'prefix': xval_prefix,
+                'train': xval_train_fn,
+                'src': xval_src_fn,
+                'ref': xval_ref_fn,
+                'trainsrc': xval_trainsrc_fn,
+            })
+
+        self.validate()
+
+        if not reuse:
+            self.do_preprocessing()
+
+    def validate(self):
+        check_run(["apertium-validate-dictionary", self.dix_fn])
+        check_run(["apertium-validate-tagger", self.tsx_fn])
+
+        for fn in [self.morphology_fn, self.cg_fn, self.tsx_fn, self.dix_fn]:
+            if not pexists(fn):
+                raise MissingLanguageDataException(fn=fn)
+
+    def do_preprocessing(self):
+        if not isdir(self.work_dir):
+            mkdir(self.work_dir)
+
+        strip_unknown_sent(itertools.chain(*(open(fn).readlines() for fn in self.text_fns)), self.joined_fn)
+        #loop.run_until_complete(strip_blanks(self.joined_fn, self.ref_fn))
+        strip_blanks_(self.joined_fn, self.ref_fn)
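+        # the reference side is now fixed; re-analyse its surface forms with
+        # lt-proc to regenerate the ambiguous source side (see extract_src)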
extract_src(self.morphology_fn, input=self.ref_fn, output=self.src_fn) + loop.run_until_complete(fix_dix(self.morphology_fn, self.tsx_fn, self.dix_fn, output_fn=self.dic_fn)) + + for i, xval_fn in enumerate(self.xval_fns): + split_n_r(self.joined_fn, xval_fn['train'], xval_fn['ref'], self.folds, i) + extract_src(self.morphology_fn, input=xval_fn['ref'], output=xval_fn['src']) + extract_src(self.morphology_fn, input=xval_fn['train'], output=xval_fn['trainsrc']) + + def _analyse(self, test_fn): + evaluator = TaggerEvaluator(self.src_fn, self.ref_fn, test_fn) + evaluator.run_analysis() + return evaluator.accuracy + + @classmethod + def reg_experiment(cls, name): + def reg(func): + LanguageTaggerExperimentor.experiments[name] = func + return reg + + @classmethod + def add_experiments(cls): + for do_cg in [False, True]: + for unigram_type in range(1,4): + unigram_model = 'unigram' + str(unigram_type) + name = ('cg' if do_cg else '') + unigram_model + + @cls.reg_experiment(name) + @xval_experiment(name) + def unigram_experiment(self, xval_fns, do_cg=do_cg, unigram_model=unigram_model): + tagger_train_sup('unigram', unigram_model, xval_fns['model'], xval_fns['train']) + if do_cg: + tagger_input = cg_proc(self.cg_fn, input=xval_fns['src']) + else: + tagger_input = xval_fns['src'] + tagger_tag(unigram_model, xval_fns['model'], input=tagger_input, output=xval_fns['test']) + + for do_cg in [False, True]: + name = ('cg' if do_cg else '') + '1st' + + @cls.reg_experiment(name) + def pick_first_experiment(self, do_cg=do_cg): + first_fn = pjoin(self.work_dir, 'test.' + name) + if do_cg: + tagger_input = cg_proc(self.cg_fn, input=self.src_fn) + else: + tagger_input = self.src_fn + extract_first_analysis(self.src_fn, first_fn) + return self._analyse(first_fn) + + for do_cg in [False, True]: + for is_supervised, model in [(True, 'bigram'), (False, 'bigram'), (False, 'lwsw')]: + for iterations in [0, 50, 250]: + name = "{cg}{sup}_{model}_i{iterations}".format( + cg='cg_' if do_cg else '', + sup='sup' if is_supervised else 'unsup', + model=model, + iterations=iterations) + + @cls.reg_experiment(name) + @xval_experiment(name) + def model_experiment(self, xval_fns, do_cg=do_cg, is_supervised=is_supervised, model=model, iterations=iterations): + if is_supervised: + tagger_train_sup( + model, xval_fns['model'], + train_fn=xval_fns['train'], + trainsrc_fn=xval_fns['trainsrc'], + dic_fn=self.dic_fn, + tsx_fn=self.tsx_fn, + iterations=iterations) + else: + tagger_train_unsup( + model, xval_fns['model'], + trainsrc_fn=xval_fns['trainsrc'], + dic_fn=self.dic_fn, + tsx_fn=self.tsx_fn, + iterations=iterations) + if do_cg: + tagger_input = cg_proc(self.cg_fn, input=xval_fns['src']) + else: + tagger_input = xval_fns['src'] + tagger_tag(model, xval_fns['model'], input=tagger_input, output=xval_fns['test']) + + @classmethod + def all_taggers(cls): + return cls.experiments.keys() + + def get_tagger(self, tagger): + return functools.partial(self.experiments[tagger], self) + + +LanguageTaggerExperimentor.add_experiments() + + +DEFAULT_TAGGERS = list(LanguageTaggerExperimentor.all_taggers()) + + +def main(): + args = parse_args() + if not isdir(TMPDIR): + mkdir(TMPDIR) + + languages_tagger_accuracies = {} + try: + for lang in args.languages: + taggers = args.language_texts[lang] + lang_root = pjoin(args.languagesdir, 'apertium-' + lang) + def mk_experimentor(): + return LanguageTaggerExperimentor(lang, lang_root, taggers, args.folds, reuse=args.reuse) + try: + experimentor = mk_experimentor() + except 
MissingLanguageDataException as e: + print("Missing {}... Trying to build it for you.".format(e.fn)) + with cd(lang_root): + check_run('./autogen.sh') + check_run('make') + experimentor = mk_experimentor() + languages_tagger_accuracies[lang] = {} + for tagger in args.taggers: + experiment = experimentor.get_tagger(tagger) + languages_tagger_accuracies[lang][tagger] = experiment() + finally: + pprint(languages_tagger_accuracies) + + +if __name__ == '__main__': + try: + main() + finally: + loop.close() Index: branches/apertium-tagger/experiments/split_corpus_n.py =================================================================== --- branches/apertium-tagger/experiments/split_corpus_n.py (nonexistent) +++ branches/apertium-tagger/experiments/split_corpus_n.py (revision 68351) @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import sys +import random + +def main(corpus, prefix): + tokens = 0.0 + sentences = 0.0 + + for line in open(corpus).readlines(): + + if line.strip() == '': + sentences = sentences + 1.0 + else: + tokens = tokens + 1.0 + + print(sentences, tokens, tokens / sentences) + + ids = [] + for i in range(0, int(sentences)): + ids.append(i) + random.shuffle(ids) + split = int(sentences / 10) + print(split, file=sys.stderr) + testing = ids[0:split] + training = ids[split:] + + train_file = open(prefix + 'train', 'w+') + test_file = open(prefix + 'ref', 'w+') + + buffer = '' + index = 0 + for line in open(corpus).readlines(): + if line.strip() == '': + index = index + 1 + buffer = buffer + '\n' + if index in testing: + test_file.write(buffer) + elif index in training: + train_file.write(buffer) + else: + print('ERROR: %d not in testing or training' % + (index), file=sys.stderr) + buffer = '' + else: + buffer = buffer + line + if index % int(tokens / 100) == 0: + sys.stderr.write('.') + sys.stderr.flush() + +if __name__ == '__main__': + corpus = sys.argv[1] + prefix = sys.argv[2] + main(corpus, prefix) Index: trunk/apertium/apertium/hmm.cc =================================================================== --- trunk/apertium/apertium/hmm.cc (revision 68349) +++ trunk/apertium/apertium/hmm.cc (revision 68351) @@ -204,14 +204,9 @@ if (tags.size()==0) { //This is an unknown word tags = tdhmm.getOpenClass(); } - else if (output.has_not(tags)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors+= L"Word '"+word->get_superficial_form()+L"' not found in the dictionary.\n"; - errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n"; - errors+= L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + else { + require_ambiguity_class(tdhmm, tags, *word); + } k2=output[tags]; @@ -366,15 +361,8 @@ if (word_untagged->get_tags().size()==0) { // Unknown word tags = tdhmm.getOpenClass(); } - else if (output.has_not(word_untagged->get_tags())) { //We are training, there is no problem - wstring errors; - errors = L"A new ambiguity class was found. 
I cannot continue.\n"; - errors+= L"Word '"+word_untagged->get_superficial_form()+L"' not found in the dictionary.\n"; - errors+= L"New ambiguity class: "+word_untagged->get_string_tags()+L"\n"; - errors+= L"Take a look at the dictionary, then retrain."; - fatal_error(errors); - } else { + require_ambiguity_class(tdhmm, word_untagged->get_tags(), *word_untagged); tags = word_untagged->get_tags(); } @@ -460,49 +448,12 @@ } void -HMM::read_dictionary (FILE *fdic) { - int i, k, nw=0; - TaggerWord *word=NULL; - set tags; - Collection &output = tdhmm.getOutput(); - - MorphoStream morpho_stream(fdic, true, &tdhmm); - - // In the input dictionary there must be all punctuation marks, including the end-of-sentece mark - - word = morpho_stream.get_next_word(); - - while (word) { - if (++nw%10000==0) wcerr<get_tags(); +HMM::read_dictionary(FILE *fdic) { + tagger_utils::read_dictionary(fdic, tdhmm); + int N = (tdhmm.getTagIndex()).size(); + int M = (tdhmm.getOutput()).size(); + wcerr << N << L" states and " << M <0) - k = output[tags]; - - delete word; - word = morpho_stream.get_next_word(); - } - wcerr< amb_class; - amb_class.insert(i); - k=output[amb_class]; - } - - int M = output.size(); - - wcerr<< N <get_superficial_form()+L"' not found in the dictionary.\n"; - errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n"; - errors+= L"Take a look at the dictionary, then retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdhmm, tags, *word); k = output[tags]; len = pending.size(); @@ -806,17 +750,7 @@ if (tags.size()==0) // This is an unknown word tags = tdhmm.getOpenClass(); - if (output.has_not(tags)) { // Encontrada una clase de ambigüedad desconocida hasta el momento - if (debug) { - wstring errors; - errors = L"A new ambiguity class was found. \n"; - errors+= L"Retraining the tagger is necessary so as to take it into account.\n"; - errors+= L"Word '"+word->get_superficial_form()+L"'.\n"; - errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n"; - wcerr< -HMM::find_similar_ambiguity_class(set c) { - int size_ret = -1; - set ret=tdhmm.getOpenClass(); //Se devolver si no encontramos ninguna clase mejor - bool skeep_class; - Collection &output = tdhmm.getOutput(); - - for(int k=0; k((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) { - skeep_class=false; - // Test if output[k] is a subset of class - for(set::const_iterator it=output[k].begin(); it!=output[k].end(); it++) { - if (c.find(*it)==c.end()) { - skeep_class=true; //output[k] is not a subset of class - break; - } - } - if (!skeep_class) { - size_ret = output[k].size(); - ret = output[k]; - } - } - } - return ret; -} Index: trunk/apertium/apertium/hmm.h =================================================================== --- trunk/apertium/apertium/hmm.h (revision 68349) +++ trunk/apertium/apertium/hmm.h (revision 68351) @@ -60,16 +60,6 @@ * @see: read_ambiguity_classes, read_dictionary */ void init(); - - /** This method returns a known ambiguity class that is a subset of - * the one received as a parameter. This is useful when a new - * ambiguity class is found because of changes in the morphological - * dictionary used by the MT system. 
- * @param c set of tags (ambiguity class) - * @return a known ambiguity class - */ - set find_similar_ambiguity_class(set c); - public: void deserialise(FILE *Serialised_FILE_Tagger); std::vector &getArrayTags(); Index: trunk/apertium/apertium/lswpost.cc =================================================================== --- trunk/apertium/apertium/lswpost.cc (revision 68349) +++ trunk/apertium/apertium/lswpost.cc (revision 68351) @@ -104,7 +104,6 @@ set tags_left, tags_mid, tags_right; set::iterator iter_left, iter_mid, iter_right; vector > > para_matrix(N, vector >(N, vector(N, 0))); - Collection &output = tdlsw.getOutput(); MorphoStream morpho_stream(ftxt, true, &tdlsw); int num_valid_seq = 0; @@ -114,14 +113,8 @@ if (tags_left.size()==0) { //This is an unknown word tags_left = tdlsw.getOpenClass(); } - if (output.has_not(tags_left)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + + require_ambiguity_class(tdlsw, tags_left, *word); ++nw; delete word; word = morpho_stream.get_next_word(); // word for tags mid @@ -129,14 +122,7 @@ if (tags_mid.size()==0) { //This is an unknown word tags_mid = tdlsw.getOpenClass(); } - if (output.has_not(tags_mid)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdlsw, tags_mid, *word); ++nw; delete word; if (morpho_stream.getEndOfFile()) { @@ -155,14 +141,7 @@ if (tags_right.size()==0) { //This is an unknown word tags_right = tdlsw.getOpenClass(); } - if (output.has_not(tags_right)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdlsw, tags_right, *word); num_valid_seq = tags_left.size() * tags_mid.size() * tags_right.size(); for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { @@ -247,48 +226,11 @@ void LSWPoST::read_dictionary(FILE *fdic) { - int i, k, nw = 0; - TaggerWord *word = NULL; - set tags; - Collection &output = tdlsw.getOutput(); - - MorphoStream morpho_stream(fdic, true, &tdlsw); - - // In the input dictionary there must be all punctuation marks, including the end-of-sentece mark - - word = morpho_stream.get_next_word(); - - while (word) { - if (++nw % 10000 == 0) - wcerr << L'.' << flush; - - tags = word->get_tags(); - - if (tags.size() > 0) - k = output[tags]; - - delete word; - word = morpho_stream.get_next_word(); - } - wcerr << L"\n"; - - // OPEN AMBIGUITY CLASS - // It contains all tags that are not closed. 
- // Unknown words are assigned the open ambiguity class - k = output[tdlsw.getOpenClass()]; - + tagger_utils::read_dictionary(fdic, tdlsw); int N = (tdlsw.getTagIndex()).size(); + int M = (tdlsw.getOutput()).size(); + wcerr << N << L" states and " << M < amb_class; - amb_class.insert(i); - k = output[amb_class]; - } - - wcerr << N << L" states\n"; - // set up the probability matrix of tdlsw, the pointer to the TaggerDataLSW object tdlsw.setProbabilities(N); } @@ -302,7 +244,6 @@ set tags_left, tags_mid, tags_right; set::iterator iter_left, iter_mid, iter_right; vector > > para_matrix_new(N, vector >(N, vector(N, 0))); - Collection &output = tdlsw.getOutput(); MorphoStream morpho_stream(ftxt, true, &tdlsw); word = new TaggerWord(); // word for tags left @@ -311,14 +252,7 @@ if (tags_left.size()==0) { //This is an unknown word tags_left = tdlsw.getOpenClass(); } - if (output.has_not(tags_left)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdlsw, tags_left, *word); ++nw; delete word; word = morpho_stream.get_next_word(); // word for tags mid @@ -326,14 +260,7 @@ if (tags_mid.size()==0) { //This is an unknown word tags_mid = tdlsw.getOpenClass(); } - if (output.has_not(tags_mid)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdlsw, tags_mid, *word); ++nw; delete word; if (morpho_stream.getEndOfFile()) { @@ -351,14 +278,7 @@ if (tags_right.size()==0) { //This is an unknown word tags_right = tdlsw.getOpenClass(); } - if (output.has_not(tags_right)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n"; - errors += L"New ambiguity class: " + word->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } + require_ambiguity_class(tdlsw, tags_right, *word); double normalization = 0; @@ -416,38 +336,20 @@ set::iterator iter_left, iter_mid, iter_right; MorphoStream morpho_stream(Input, debug, &tdlsw); morpho_stream.setNullFlush(null_flush); - Collection &output = tdlsw.getOutput(); word_left = new TaggerWord(); // word left word_left->add_tag(eos, L"sent", tdlsw.getPreferRules()); word_left->set_show_sf(show_sf); tags_left = word_left->get_tags(); // tags left - if (output.has_not(tags_left)) { - if (debug) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word_left->get_superficial_form() + L"' not found Input the dictionary.\n"; - errors += L"New ambiguity class: " + word_left->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. 
Then, retrain."; - fatal_error(errors); - } - tags_left = find_similar_ambiguity_class(tags_left); - } + + tags_left = require_similar_ambiguity_class(tdlsw, tags_left, *word_left, debug); word_mid = morpho_stream.get_next_word(); // word mid word_mid->set_show_sf(show_sf); tags_mid = word_mid->get_tags(); // tags mid - if (output.has_not(tags_mid)) { - if (debug) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors += L"Word '" + word_mid->get_superficial_form() + L"' not found Input the dictionary.\n"; - errors += L"New ambiguity class: " + word_mid->get_string_tags() + L"\n"; - errors += L"Take a look at the dictionary and at the training corpus. Then, retrain."; - fatal_error(errors); - } - tags_mid = find_similar_ambiguity_class(tags_mid); - } + + tags_mid = require_similar_ambiguity_class(tdlsw, tags_mid, *word_mid, debug); + if (morpho_stream.getEndOfFile()) { delete word_left; delete word_mid; @@ -461,17 +363,7 @@ while (word_right) { tags_right = word_right->get_tags(); - if (output.has_not(tags_right)) { - if (debug) { - wstring errors; - errors = L"A new ambiguity class was found. \n"; - errors+= L"Retraining the tagger is necessary so as to take it into account.\n"; - errors+= L"Word '"+word_right->get_superficial_form()+L"'.\n"; - errors+= L"New ambiguity class: "+word_right->get_string_tags()+L"\n"; - fatal_error(errors); - } - tags_right = find_similar_ambiguity_class(tags_right); - } + tags_right = require_similar_ambiguity_class(tdlsw, tags_right, *word_right, debug); double max = -1; TTag tag_max = *tags_mid.begin(); @@ -511,30 +403,3 @@ delete word_left; delete word_mid; } - -set -LSWPoST::find_similar_ambiguity_class(set c) { - int size_ret = -1; - set ret=tdlsw.getOpenClass(); // return open-class as default, if no better is found. - bool skip_class; - Collection &output = tdlsw.getOutput(); - - for(int k=0; k((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) { - skip_class=false; - // Test if output[k] is a subset of class - for(set::const_iterator it=output[k].begin(); it!=output[k].end(); it++) { - if (c.find(*it)==c.end()) { - skip_class=true; //output[k] is not a subset of class - break; - } - } - if (!skip_class) { - size_ret = output[k].size(); - ret = output[k]; - } - } - } - return ret; -} - Index: trunk/apertium/apertium/lswpost.h =================================================================== --- trunk/apertium/apertium/lswpost.h (revision 68349) +++ trunk/apertium/apertium/lswpost.h (revision 68351) @@ -107,14 +107,5 @@ /** Do the tagging */ void tagger(FILE *Input, FILE *Output, const bool &First = false); - - /** This method returns a known ambiguity class that is a subset of - * the one received as a parameter. This is useful when a new - * ambiguity class is found because of changes in the morphological - * dictionary used by the MT system. - * @param c set of tags (ambiguity class) - * @return a known ambiguity class - */ - set find_similar_ambiguity_class(set c); }; #endif Index: trunk/apertium/apertium/tagger_utils.cc =================================================================== --- trunk/apertium/apertium/tagger_utils.cc (revision 68349) +++ trunk/apertium/apertium/tagger_utils.cc (revision 68351) @@ -14,7 +14,9 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, see . 
 */
+
+#include
+#include
+
 #include
 #include
@@ -119,7 +121,103 @@
   return s;
 }
-
+
+void
+tagger_utils::read_dictionary(FILE *fdic, TaggerData &td) {
+  int i, k, nw = 0;
+  TaggerWord *word = NULL;
+  set<TTag> tags;
+  Collection &output = td.getOutput();
+
+  MorphoStream morpho_stream(fdic, true, &td);
+
+  // In the input dictionary there must be all punctuation marks, including the end-of-sentence mark
+
+  word = morpho_stream.get_next_word();
+
+  while (word) {
+    if (++nw % 10000 == 0)
+      wcerr << L'.' << flush;
+
+    tags = word->get_tags();
+
+    if (tags.size() > 0)
+      k = output[tags];
+
+    delete word;
+    word = morpho_stream.get_next_word();
+  }
+  wcerr << L"\n";
+
+  // OPEN AMBIGUITY CLASS
+  // It contains all tags that are not closed.
+  // Unknown words are assigned the open ambiguity class
+  k = output[td.getOpenClass()];
+
+  // Create an ambiguity class holding one single tag for each tag,
+  // if not created yet
+  int N = (td.getTagIndex()).size();
+  for(i = 0; i != N; i++) {
+    set<TTag> amb_class;
+    amb_class.insert(i);
+    k = output[amb_class];
+  }
+}
+
+set<TTag>
+tagger_utils::find_similar_ambiguity_class(TaggerData &td, set<TTag> &c) {
+  int size_ret = -1;
+  set<TTag> ret = td.getOpenClass(); // return open-class as default, if no better is found.
+  bool skip_class;
+  Collection &output = td.getOutput();
+
+  for(int k=0; k<output.size(); k++) {
+    if ((((int)output[k].size())>((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) {
+      skip_class = false;
+      // Test if output[k] is a subset of class
+      for(set<TTag>::const_iterator it=output[k].begin(); it!=output[k].end(); it++) {
+        if (c.find(*it)==c.end()) {
+          skip_class = true; //output[k] is not a subset of class
+          break;
+        }
+      }
+      if (!skip_class) {
+        size_ret = output[k].size();
+        ret = output[k];
+      }
+    }
+  }
+  return ret;
+}
+
+void
+tagger_utils::require_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word) {
+  if (td.getOutput().has_not(tags)) {
+    wstring errors;
+    errors = L"A new ambiguity class was found. I cannot continue.\n";
+    errors+= L"Word '" + word.get_superficial_form() + L"' not found in the dictionary.\n";
+    errors+= L"New ambiguity class: " + word.get_string_tags() + L"\n";
+    errors+= L"Take a look at the dictionary, then retrain.";
+    fatal_error(errors);
+  }
+}
+
+set<TTag>
+tagger_utils::require_similar_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, bool debug) {
+  if (td.getOutput().has_not(tags)) {
+    if (debug) {
+      wstring errors;
+      errors = L"A new ambiguity class was found. \n";
+      errors += L"Retraining the tagger is necessary so as to take it into account.\n";
+      errors += L"Word '" + word.get_superficial_form() + L"'.\n";
+      errors += L"New ambiguity class: " + word.get_string_tags() + L"\n";
+      wcerr << L"Error: " << errors;
+    }
+    return find_similar_ambiguity_class(td, tags);
+  }
+  return tags;
+}
+
 template <class T, class G>
 ostream& operator<< (ostream& os, const map<T, G> & f){
   typename map<T, G>::const_iterator it;
Index: trunk/apertium/apertium/tagger_utils.h
===================================================================
--- trunk/apertium/apertium/tagger_utils.h	(revision 68349)
+++ trunk/apertium/apertium/tagger_utils.h	(revision 68351)
@@ -25,6 +25,8 @@
 #include
 #include
 #include
+#include
+#include
 
 using namespace std;
 
@@ -66,6 +68,29 @@
  */
 int nguiones_fs(wstring const &cadena);
 
+/** Reads the expanded dictionary received as a parameter and collects the
+ * ambiguity classes that the tagger will manage.
+ * @param fdic the input stream with the expanded dictionary to read
+ * @param td the tagger data instance to mutate
+ */
+void read_dictionary(FILE *fdic, TaggerData &td);
+
+/** This method returns a known ambiguity class that is a subset of
+ * the one received as a parameter. This is useful when a new
+ * ambiguity class is found because of changes in the morphological
+ * dictionary used by the MT system.
+ * @param c set of tags (ambiguity class)
+ * @return a known ambiguity class
+ */
+set<TTag> find_similar_ambiguity_class(TaggerData &td, set<TTag> &c);
+
+/** Dies with an error message if the tags aren't in the tagger data */
+void require_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word);
+
+/** As with find_similar_ambiguity_class, but returns the tags unchanged if
+ * they are already known, and prints a warning when debug is set */
+set<TTag> require_similar_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, bool debug);
+
 wstring trim(wstring s);
 };
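
Usage sketch (not part of the diff; the corpus file names are illustrative
assumptions). evaluate_tagger.py compares three line-aligned streams: the
ambiguous source analyses, the hand-tagged reference, and the tagger output.
From the shell:

    python3 evaluate_tagger.py corpus.src corpus.ref corpus.tst

or from Python, which is how run_experiment.py drives it internally:

    from evaluate_tagger import TaggerEvaluator

    te = TaggerEvaluator('corpus.src', 'corpus.ref', 'corpus.tst')
    te.run_analysis()    # per-token diagnostics plus per-rule TP/TN/FP/FN counts
    te.print_analyses()  # summary: precision, recall, accuracy, ambiguity rates
    print(te.precision, te.recall, te.accuracy)

The batch driver applies the same evaluation over every language/tagger pair,
e.g. python3 run_experiment.py path/to/languages --languages rus,kaz --folds 10,
where the languages directory is assumed to hold apertium-<lang> checkouts.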