Index: branches/apertium-tagger/apertium/apertium/lswpost.cc =================================================================== --- branches/apertium-tagger/apertium/apertium/lswpost.cc (revision 69429) +++ branches/apertium-tagger/apertium/apertium/lswpost.cc (revision 69432) @@ -342,14 +342,12 @@ word_left->set_show_sf(show_sf); tags_left = word_left->get_tags(); // tags left - tags_left = require_similar_ambiguity_class(tdlsw, tags_left, *word_left, debug); - + warn_absent_ambiguity_class(tdlsw, tags_left, *word_left, debug); word_mid = morpho_stream.get_next_word(); // word mid word_mid->set_show_sf(show_sf); tags_mid = word_mid->get_tags(); // tags mid - tags_mid = require_similar_ambiguity_class(tdlsw, tags_mid, *word_mid, debug); - + warn_absent_ambiguity_class(tdlsw, tags_mid, *word_mid, debug); if (morpho_stream.getEndOfFile()) { delete word_left; delete word_mid; @@ -361,9 +359,8 @@ wstring micad; while (word_right) { - tags_right = word_right->get_tags(); - tags_right = require_similar_ambiguity_class(tdlsw, tags_right, *word_right, debug); + warn_absent_ambiguity_class(tdlsw, tags_right, *word_right, debug); double max = -1; TTag tag_max = *tags_mid.begin(); Index: branches/apertium-tagger/apertium/apertium/tagger_utils.cc =================================================================== --- branches/apertium-tagger/apertium/apertium/tagger_utils.cc (revision 69429) +++ branches/apertium-tagger/apertium/apertium/tagger_utils.cc (revision 69432) @@ -20,6 +20,8 @@ #include #include +#include +#include #include #ifdef _MSC_VER #define wcstok wcstok_s @@ -167,27 +169,18 @@ set tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { - int size_ret = -1; - set ret = td.getOpenClass(); // return open-class as default, if no better is found. - bool skip_class; + set &ret = td.getOpenClass(); Collection &output = td.getOutput(); - for(int k=0; k((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) { - skip_class = false; - // Test if output[k] is a subset of class - for(set::const_iterator it=output[k].begin(); it!=output[k].end(); it++) { - if (c.find(*it)==c.end()) { - skip_class = true; //output[k] is not a subset of class - break; + for (int k=0; k &ambg_class = output[k]; + if (ambg_class.size() >= ret.size()) { + continue; } + if (includes(ambg_class.begin(), ambg_class.end(), c.begin(), c.end())) { + ret = ambg_class; } - if (!skip_class) { - size_ret = output[k].size(); - ret = output[k]; } - } - } return ret; } @@ -208,10 +201,7 @@ } } -set -tagger_utils::require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { - if (td.getOutput().has_not(tags)) { - if (debug) { +static void _warn_absent_ambiguity_class(TaggerWord &word) { wstring errors; errors = L"A new ambiguity class was found. \n"; errors += L"Retraining the tagger is necessary so as to take it into account.\n"; @@ -218,6 +208,13 @@ errors += L"Word '" + word.get_superficial_form() + L"'.\n"; errors += L"New ambiguity class: " + word.get_string_tags() + L"\n"; wcerr << L"Error: " << errors; +} + +set +tagger_utils::require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { + if (td.getOutput().has_not(tags)) { + if (debug) { + _warn_absent_ambiguity_class(word); } return find_similar_ambiguity_class(td, tags); } @@ -224,6 +221,13 @@ return tags; } +void +tagger_utils::warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { + if (td.getOutput().has_not(tags) && debug) { + _warn_absent_ambiguity_class(word); + } +} + template ostream& operator<< (ostream& os, const map & f){ typename map ::const_iterator it; Index: branches/apertium-tagger/apertium/apertium/tagger_utils.h =================================================================== --- branches/apertium-tagger/apertium/apertium/tagger_utils.h (revision 69429) +++ branches/apertium-tagger/apertium/apertium/tagger_utils.h (revision 69432) @@ -91,6 +91,9 @@ * & prints a warning if debug */ set require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug); +/** Just prints a warning if debug */ +void warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug); + wstring trim(wstring s); }; Index: branches/apertium-tagger/apertium/apertium/hmm.cc =================================================================== --- branches/apertium-tagger/apertium/apertium/hmm.cc (revision 69429) +++ branches/apertium-tagger/apertium/apertium/hmm.cc (revision 69432) @@ -710,7 +710,7 @@ TaggerWord *word=NULL; TTag tag; - set tags, pretags; + set ambg_class_tags, tags, pretags; set ::iterator itag, jtag; double prob, loli, x; @@ -750,9 +750,9 @@ if (tags.size()==0) // This is an unknown word tags = tdhmm.getOpenClass(); - tags = require_similar_ambiguity_class(tdhmm, tags, *word, debug); + ambg_class_tags = require_similar_ambiguity_class(tdhmm, tags, *word, debug); - k = output[tags]; //Ambiguity class the word belongs to + k = output[ambg_class_tags]; //Ambiguity class the word belongs to #ifdef __GNUC__ clear_array_double(alpha[nwpend%2], N); Index: branches/apertium-tagger/apertium/tests/tagger/streamparser.py =================================================================== --- branches/apertium-tagger/apertium/tests/tagger/streamparser.py (revision 69429) +++ branches/apertium-tagger/apertium/tests/tagger/streamparser.py (nonexistent) @@ -1,171 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- -""" -Usage: streamparser.py [FILE] - -Consumes input from a file (first argument) or stdin, parsing and pretty printing the readings of lexical units found. -""" - -import re, pprint, sys, itertools, fileinput -from enum import Enum -from collections import namedtuple - - -Knownness = Enum('Knownness', 'known unknown biunknown genunknown') -try: - Knownness.__doc__ = """Level of knowledge associated with a lexical unit. - Values: - known - unknown: Denoted by '*', analysis not available. - biunknown: Denoted by '@', translation not available. - genunknown: Denoted by '#', generated form not available. -""" -except AttributeError: - # Python 3.2 users have to read the source - pass - -SReading = namedtuple('SReading', ['baseform', 'tags']) -try: - SReading.__doc__ = """A single subreading of an analysis of a token. - Fields: - baseform (str): The base form (lemma, lexical form, citation form) of the reading. - tags (list of str): The morphological tags associated with the reading. -""" -except AttributeError: - # Python 3.2 users have to read the source - pass - -def subreadingToString(sub): - return sub.baseform+"".join("<"+t+">" for t in sub.tags) - -def readingToString(reading): - return "+".join(subreadingToString(sub) for sub in reading) - -def mainpos(reading, ltr=False): - """Return the first part-of-speech tag of a reading. If there are - several subreadings, by default give the first tag of the last - subreading. If ltr=True, give the first tag of the first - subreading, see - http://beta.visl.sdu.dk/cg3/single/#sub-stream-apertium for more - information. - - """ - if ltr: - return reading[0].tags[0] - else: - return reading[-1].tags[0] - -class LexicalUnit: - - """A lexical unit consisting of a lemma and its readings. - - Attributes: - lexicalUnit (str): The lexical unit in Apertium stream format. - wordform (str): The word form (surface form) of the lexical unit. - readings (list of list of SReading): The analyses of the lexical unit with sublists containing all subreadings. - knownness (Knownness): The level of knowledge of the lexical unit. - """ - - knownness = Knownness.known - def __init__(self, lexicalUnit): - self.lexicalUnit = lexicalUnit - - cohort = re.split(r'(?]+>)+)', reading) - for subreading in subreadingParts: - baseform = subreading[0].lstrip('+') - tags = re.findall(r'<([^>]+)>', subreading[1]) - - subreadings.append(SReading(baseform=baseform, tags=tags)) - - self.readings.append(subreadings) - else: - self.knownness = {'*': Knownness.unknown, '@': Knownness.biunknown, '#': Knownness.genunknown}[readings[0][0]] - - def __repr__(self): - return self.lexicalUnit - - -def parse(stream, withText=False): - """Generates lexical units from a character stream. - - Args: - stream (iterable): A character stream containing lexical units, superblanks and other text. - withText (bool, optional): A boolean defining whether to output preceding text with each lexical unit. - - Yields: - LexicalUnit: The next lexical unit found in the character stream. (if withText is False) - (str, LexicalUnit): The next lexical unit found in the character stream and the the text that seperated it from the prior unit in a tuple. (if withText is True) - """ - - buffer = '' - textBuffer = '' - inLexicalUnit = False - inSuperblank = False - - for char in stream: - - if inSuperblank: - if char == ']': - inSuperblank = False - textBuffer += char - elif char == '\\': - textBuffer += char - textBuffer += next(stream) - else: - textBuffer += char - elif inLexicalUnit: - if char == '$': - if withText: - yield (textBuffer, LexicalUnit(buffer)) - else: - yield LexicalUnit(buffer) - buffer = '' - textBuffer = '' - inLexicalUnit = False - elif char == '\\': - buffer += char - buffer += next(stream) - else: - buffer += char - else: - if char == '[': - inSuperblank = True - textBuffer += char - elif char == '^': - inLexicalUnit = True - elif char == '\\': - textBuffer += char - textBuffer += next(stream) - else: - textBuffer += char - - -def parse_file(f, withText=False): - """Generates lexical units from a file. - - Args: - f (file): A file containing lexical units, superblanks and other text. - - Yields: - LexicalUnit: The next lexical unit found in the file. - """ - - return parse(itertools.chain.from_iterable(f), withText) - - -if __name__ == '__main__': - lexicalUnits = parse_file(fileinput.input()) - - for lexicalUnit in lexicalUnits: - pprint.pprint(lexicalUnit.readings, width=120) Index: branches/apertium-tagger/apertium/tests/tagger/__init__.py =================================================================== --- branches/apertium-tagger/apertium/tests/tagger/__init__.py (revision 69429) +++ branches/apertium-tagger/apertium/tests/tagger/__init__.py (revision 69432) @@ -9,9 +9,7 @@ from subprocess import (check_call, check_output, Popen, PIPE, DEVNULL, TimeoutExpired, CalledProcessError) -from . import streamparser - # Utilities def tmp(contents): t = tempfile.NamedTemporaryFile(mode='w', delete=False) @@ -169,10 +167,12 @@ ^booked/book/book$ ^books/book/book$ ^./.$ + ^Close/close/close/close/close/close$ ^the/the$ ^books/book/book$ ^./.$ + ^The/the$ ^falling/fall/fall/fall$ ^cat/cat$ @@ -189,10 +189,12 @@ ^booked/book$ ^books/book$ ^./.$ + ^Close/close$ ^the/the$ ^books/book$ ^./.$ + ^The/the$ ^falling/fall$ ^cat/cat$ @@ -259,7 +261,7 @@ def test_changing_class_hmm_sup(self): model_fn = tmp("") untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) - tagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + tagged = tmp(TRAIN_NO_PROBLEM_TAGGED) check_call( [APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn, model_fn, tagged, untagged]) Index: branches/apertium-tagger/experiments/experiments.py =================================================================== --- branches/apertium-tagger/experiments/experiments.py (revision 69429) +++ branches/apertium-tagger/experiments/experiments.py (revision 69432) @@ -165,10 +165,12 @@ print('trained') tagger_input = cg_proc(lab.cg_fn, input=lab.src_fn) tag_completed_proc = tagger_tag( - 'bigram', model_fn, input=tagger_input, output='/dev/null') + 'bigram', model_fn, debug=True, input=tagger_input, output='/dev/null') tag_completed_proc.check_returncode() count = 0 + ambg_classes = set() for line in tag_completed_proc.stderr.split("\n"): - if "Error: A new ambiguity class was found." in line.strip(): + if line.startswith("New ambiguity class:"): count += 1 - return count + ambg_classes.add(line) + return count, len(ambg_classes) Index: branches/apertium-tagger/experiments/shell_wrappers.py =================================================================== --- branches/apertium-tagger/experiments/shell_wrappers.py (revision 69429) +++ branches/apertium-tagger/experiments/shell_wrappers.py (revision 69432) @@ -74,7 +74,9 @@ @proc_filter -def tagger_tag(model_type, model_fn): +def tagger_tag(model_type, model_fn, debug=False): cmd = ['apertium-tagger', '--tagger', '--show-superficial', model_fn] + if debug: + cmd.insert(1, '--debug') insert_model(cmd, model_type, tagging=True) return cmd