Index: branches/apertium-tagger/apertium/Jenkinsfile =================================================================== --- branches/apertium-tagger/apertium/Jenkinsfile (nonexistent) +++ branches/apertium-tagger/apertium/Jenkinsfile (revision 69391) @@ -0,0 +1,10 @@ +node { + stage 'Checkout' + checkout scm + + stage 'Build' + sh "./autogen.sh && make clean && make" + + stage 'Test' + sh "make test" +} Index: branches/apertium-tagger/apertium/Makefile.am =================================================================== --- branches/apertium-tagger/apertium/Makefile.am (revision 69388) +++ branches/apertium-tagger/apertium/Makefile.am (revision 69391) @@ -1,5 +1,5 @@ -SUBDIRS = $(GENERIC_LIBRARY_NAME) -DIST_SUBDIRS = $(GENERIC_LIBRARY_NAME) +SUBDIRS = $(GENERIC_LIBRARY_NAME) tests +DIST_SUBDIRS = $(GENERIC_LIBRARY_NAME) tests modesdir=$(prefix)/share/apertium/modes Index: branches/apertium-tagger/apertium/configure.ac =================================================================== --- branches/apertium-tagger/apertium/configure.ac (revision 69388) +++ branches/apertium-tagger/apertium/configure.ac (revision 69391) @@ -186,4 +186,4 @@ AM_CONDITIONAL([WINDOWS], [test x$version_type = xwindows]) AS_IF([test x$version_type = xwindows], [AC_DEFINE(HAVE_GETOPT_LONG,0)], []) -AC_OUTPUT([Makefile apertium.pc apertium/Makefile]) +AC_OUTPUT([Makefile apertium.pc apertium/Makefile tests/Makefile tests/tagger/Makefile]) Index: branches/apertium-tagger/apertium/tests/Makefile.am =================================================================== --- branches/apertium-tagger/apertium/tests/Makefile.am (nonexistent) +++ branches/apertium-tagger/apertium/tests/Makefile.am (revision 69391) @@ -0,0 +1 @@ +SUBDIRS = tagger Index: branches/apertium-tagger/apertium/tests/run_tests.py =================================================================== --- branches/apertium-tagger/apertium/tests/run_tests.py (revision 69388) +++ branches/apertium-tagger/apertium/tests/run_tests.py (revision 69391) @@ -6,11 +6,12 @@ import unittest import pretransfer +import tagger if __name__ == "__main__": os.chdir(os.path.dirname(__file__)) failures = 0 - for module in [pretransfer]: + for module in [pretransfer, tagger]: suite = unittest.TestLoader().loadTestsFromModule(module) res = unittest.TextTestRunner(verbosity = 2).run(suite) failures += len(res.failures) Index: branches/apertium-tagger/apertium/tests/tagger/Makefile.am =================================================================== --- branches/apertium-tagger/apertium/tests/tagger/Makefile.am (nonexistent) +++ branches/apertium-tagger/apertium/tests/tagger/Makefile.am (revision 69391) @@ -0,0 +1,14 @@ +library_includedir = $(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME) + +bin_PROGRAMS = test-find-similar-ambiguity-class +bin_SCRIPTS = $(GENERATEDSCRIPTS) + +AM_CPPFLAGS = -I$(top_srcdir) + +apertiumdir = $(prefix)/share/apertium +apertiuminclude = $(prefix)/include/apertium-$(GENERIC_API_VERSION) +apertiumlib = $(prefix)/lib +apertiumsysconf = $(prefix)/etc/apertium + +test_find_similar_ambiguity_class_SOURCES = test_find_similar_ambiguity_classes.cc +test_find_similar_ambiguity_class_LDADD = -L$(top_srcdir)/$(GENERIC_LIBRARY_NAME)/.libs/ $(APERTIUM_LIBS) -l$(GENERIC_LIBRARY_NAME)$(GENERIC_MAJOR_VERSION) Index: branches/apertium-tagger/apertium/tests/tagger/__init__.py =================================================================== --- branches/apertium-tagger/apertium/tests/tagger/__init__.py (nonexistent) +++ branches/apertium-tagger/apertium/tests/tagger/__init__.py (revision 69391) @@ -0,0 +1,269 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import functools +import unittest +import tempfile +from os.path import join as pjoin +from os.path import abspath, dirname +from subprocess import check_call, check_output, run, PIPE, DEVNULL + +from . import streamparser + +## Utilities +def tmp(contents): + t = tempfile.NamedTemporaryFile(mode='w', delete=False) + t.write(contents) + return t.name + + +def rel(fn): + return abspath(pjoin(dirname(abspath(__file__)), fn)) + + +APERTIUM_TAGGER = rel("../../apertium/apertium-tagger") + + +def check_stderr(*popenargs, timeout=None, **kwargs): + return run(*popenargs, stderr=PIPE, timeout=timeout, check=True, + **kwargs).stderr + + +def trace_dec(f): + @functools.wraps(f) + def inner(*args, **kwargs): + if len(args) > 0: + print("run " + " ".join(args[0])) + return f(*args, **kwargs) + return inner + +check_call = functools.partial(trace_dec(check_call), universal_newlines=True) +check_output = functools.partial(trace_dec(check_output), universal_newlines=True) +check_stderr = functools.partial(trace_dec(check_stderr), universal_newlines=True) + +## Test files +DIC = """ +^the/the$ +^books/book/book$ +^has/have$ +^booked/book/book$ +^close/close/close/close/close/close$ +^cat/cat$ +^room/room$ +^red/red$ +^./.$ +""".strip() + +TSX = """ + + + + + + + + + + + + + + + + + + + + +""".strip() + +TRAIN_NO_PROBLEM_UNTAGGED = """ +^The/the$ +^cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book/book$ +^the/the$ +^red/red$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +TRAIN_NO_PROBLEM_TAGGED = """ +^The/the$ +^cat/cat$ +^books/book$ +^the/the$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book$ +^the/the$ +^red/red$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +TRAIN_CAT_TO_BE_A_VERB_UNTAGGED = """ +^The/The$ +^falling/fall/fall/fall$ +^cat/cat$ +^has/have$ +^booked/book/book$ +^books/book/book$ +^./.$ +^Close/close/close/close/close/close$ +^the/the$ +^books/book/book$ +^./.$ +^The/the$ +^falling/fall/fall/fall$ +^cat/cat$ +^has/have$ +^books/book/book$ +^./.$ +""".strip() + +TRAIN_CAT_TO_BE_A_VERB_TAGGED = """ +^The/The$ +^falling/fall$ +^cat/cat$ +^has/have$ +^booked/book$ +^books/book$ +^./.$ +^Close/close$ +^the/the$ +^books/book$ +^./.$ +^The/the$ +^falling/fall$ +^cat/cat$ +^has/have$ +^books/book$ +^./.$ +""".strip() + +TEST_SUCCESS = """ +^The/the$ +^cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +TEST_NEW_AMBG_CLASS = """ +^The/the$ +^cat/cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +## Expected strings +EXPECTED_SUBST = """ +Error: A new ambiguity class was found. +Retraining the tagger is necessary so as to take it into account. +Word 'cat'. +New ambiguity class: {NOUN,ADJ} +""".strip() + +## Tests +class AmbiguityClassTest(unittest.TestCase): + def setUp(self): + self.tsx_fn = tmp(TSX) + self.dic_fn = tmp(DIC) + + def changing_class_impl(self, flags, model_fn): + test1 = tmp(TEST_SUCCESS) + test2 = tmp(TEST_NEW_AMBG_CLASS) + success_stderr = check_stderr( + [APERTIUM_TAGGER, '-d'] + flags + + ['-g', model_fn, test1], + stdout=DEVNULL) + self.assertEqual(success_stderr.strip(), "") + subst_stderr = check_stderr( + [APERTIUM_TAGGER, '-d'] + flags + + ['-g', model_fn, test2], + stdout=DEVNULL) + self.assertEqual(subst_stderr.strip(), EXPECTED_SUBST) + ambg_class = check_output( + [rel('test-find-similar-ambiguity-class'), model_fn], + input="NOUN ADJ\n") + substituted_class = set(ambg_class.split(" ")) + # Should get open class + self.assertSetEqual(substituted_class, set(("VERB", "NOUN", "ADJ"))) + + def test_changing_class_hmm_sup(self): + model_fn = tmp("") + untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + tagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + check_call( + [APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn, + model_fn, tagged, untagged]) + self.changing_class_impl([], model_fn) + + def test_changing_class_hmm_unsup(self): + model_fn = tmp("") + untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + check_call( + [APERTIUM_TAGGER, '-t', '1', self.dic_fn, untagged, self.tsx_fn, + model_fn]) + self.changing_class_impl([], model_fn) + + def test_changing_class_sliding_window(self): + model_fn = tmp("") + untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + check_call( + [APERTIUM_TAGGER, '--sliding-window', '-t', '1', self.dic_fn, + untagged, self.tsx_fn, model_fn]) + self.changing_class_impl(['--sliding-window'], model_fn) + + def test_cat_is_a_verb(self): + model_fn = tmp("") + untagged = tmp(TRAIN_CAT_TO_BE_A_VERB_UNTAGGED) + tagged = tmp(TRAIN_CAT_TO_BE_A_VERB_TAGGED) + new_ambg_class = tmp(TEST_NEW_AMBG_CLASS) + check_call( + [APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn, + model_fn, tagged, untagged]) + subst_stdout = check_output( + [APERTIUM_TAGGER, '-d', '-g', model_fn, new_ambg_class], stderr=DEVNULL) + lexical_units = streamparser.parse(subst_stdout) + acceptable = False + for lexical_unit in lexical_units: + for reading in lexical_unit.readings: + if reading.baseform == 'cat' and \ + ('adj' in reading.tags or 'n' in reading.tags): + acceptable = True + self.assertTrue( + acceptable, + "'cat' must be output and tagged as an adjective or a noun.\n" + + "Actual output:\n{}".format(subst_stdout)) Property changes on: branches/apertium-tagger/apertium/tests/tagger/__init__.py ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Index: branches/apertium-tagger/apertium/tests/tagger/streamparser.py =================================================================== --- branches/apertium-tagger/apertium/tests/tagger/streamparser.py (nonexistent) +++ branches/apertium-tagger/apertium/tests/tagger/streamparser.py (revision 69391) @@ -0,0 +1,171 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +""" +Usage: streamparser.py [FILE] + +Consumes input from a file (first argument) or stdin, parsing and pretty printing the readings of lexical units found. +""" + +import re, pprint, sys, itertools, fileinput +from enum import Enum +from collections import namedtuple + + +Knownness = Enum('Knownness', 'known unknown biunknown genunknown') +try: + Knownness.__doc__ = """Level of knowledge associated with a lexical unit. + Values: + known + unknown: Denoted by '*', analysis not available. + biunknown: Denoted by '@', translation not available. + genunknown: Denoted by '#', generated form not available. +""" +except AttributeError: + # Python 3.2 users have to read the source + pass + +SReading = namedtuple('SReading', ['baseform', 'tags']) +try: + SReading.__doc__ = """A single subreading of an analysis of a token. + Fields: + baseform (str): The base form (lemma, lexical form, citation form) of the reading. + tags (list of str): The morphological tags associated with the reading. +""" +except AttributeError: + # Python 3.2 users have to read the source + pass + +def subreadingToString(sub): + return sub.baseform+"".join("<"+t+">" for t in sub.tags) + +def readingToString(reading): + return "+".join(subreadingToString(sub) for sub in reading) + +def mainpos(reading, ltr=False): + """Return the first part-of-speech tag of a reading. If there are + several subreadings, by default give the first tag of the last + subreading. If ltr=True, give the first tag of the first + subreading, see + http://beta.visl.sdu.dk/cg3/single/#sub-stream-apertium for more + information. + + """ + if ltr: + return reading[0].tags[0] + else: + return reading[-1].tags[0] + +class LexicalUnit: + + """A lexical unit consisting of a lemma and its readings. + + Attributes: + lexicalUnit (str): The lexical unit in Apertium stream format. + wordform (str): The word form (surface form) of the lexical unit. + readings (list of list of SReading): The analyses of the lexical unit with sublists containing all subreadings. + knownness (Knownness): The level of knowledge of the lexical unit. + """ + + knownness = Knownness.known + def __init__(self, lexicalUnit): + self.lexicalUnit = lexicalUnit + + cohort = re.split(r'(?]+>)+)', reading) + for subreading in subreadingParts: + baseform = subreading[0].lstrip('+') + tags = re.findall(r'<([^>]+)>', subreading[1]) + + subreadings.append(SReading(baseform=baseform, tags=tags)) + + self.readings.append(subreadings) + else: + self.knownness = {'*': Knownness.unknown, '@': Knownness.biunknown, '#': Knownness.genunknown}[readings[0][0]] + + def __repr__(self): + return self.lexicalUnit + + +def parse(stream, withText=False): + """Generates lexical units from a character stream. + + Args: + stream (iterable): A character stream containing lexical units, superblanks and other text. + withText (bool, optional): A boolean defining whether to output preceding text with each lexical unit. + + Yields: + LexicalUnit: The next lexical unit found in the character stream. (if withText is False) + (str, LexicalUnit): The next lexical unit found in the character stream and the the text that seperated it from the prior unit in a tuple. (if withText is True) + """ + + buffer = '' + textBuffer = '' + inLexicalUnit = False + inSuperblank = False + + for char in stream: + + if inSuperblank: + if char == ']': + inSuperblank = False + textBuffer += char + elif char == '\\': + textBuffer += char + textBuffer += next(stream) + else: + textBuffer += char + elif inLexicalUnit: + if char == '$': + if withText: + yield (textBuffer, LexicalUnit(buffer)) + else: + yield LexicalUnit(buffer) + buffer = '' + textBuffer = '' + inLexicalUnit = False + elif char == '\\': + buffer += char + buffer += next(stream) + else: + buffer += char + else: + if char == '[': + inSuperblank = True + textBuffer += char + elif char == '^': + inLexicalUnit = True + elif char == '\\': + textBuffer += char + textBuffer += next(stream) + else: + textBuffer += char + + +def parse_file(f, withText=False): + """Generates lexical units from a file. + + Args: + f (file): A file containing lexical units, superblanks and other text. + + Yields: + LexicalUnit: The next lexical unit found in the file. + """ + + return parse(itertools.chain.from_iterable(f), withText) + + +if __name__ == '__main__': + lexicalUnits = parse_file(fileinput.input()) + + for lexicalUnit in lexicalUnits: + pprint.pprint(lexicalUnit.readings, width=120) Index: branches/apertium-tagger/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc =================================================================== --- branches/apertium-tagger/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc (nonexistent) +++ branches/apertium-tagger/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc (revision 69391) @@ -0,0 +1,61 @@ +#include "apertium/utf_converter.h" +#include "apertium/tagger_utils.h" +#include "apertium/tagger_data_hmm.h" +#include "apertium/tagger_data.h" +#include +#include +#include +#include + +void print_ambiguity_class(const vector &array_tags, const set &abgset) +{ + unsigned int j; + set::const_iterator abgseti; + for (abgseti=abgset.begin(), j=0; abgseti!=abgset.end(); abgseti++, j++) { + wcout << array_tags[*abgseti]; + if (j < abgset.size() - 1) { + wcout << " "; + } + } +} + +void find_similar_ambiguity_class_io(TaggerData &td) +{ + vector &array_tags = td.getArrayTags(); + wstring line = L""; + getline(wcin, line, L'\n'); + + wstringstream line_stream(line); + set ambiguity_class; + wstring tag_name; + while (line_stream >> tag_name) { + vector::iterator it; + it = find(array_tags.begin(), array_tags.end(), tag_name); + if (it == array_tags.end()) { + wcerr << L"Tag not in model: " << tag_name << L'\n'; + exit(-3); + } + ambiguity_class.insert(it - array_tags.begin()); + } + set similar_ambiguity_class = tagger_utils::find_similar_ambiguity_class(td, ambiguity_class); + print_ambiguity_class(array_tags, similar_ambiguity_class); +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) { + cerr<<"Usage: "<\n"; + exit(-1); + } + char* probfile = argv[1]; + TaggerDataHMM tagger_data_hmm; + FILE* fin = fopen(probfile, "r"); + if (!fin) { + cerr<<"Error: cannot open file '"<