Index: branches/apertium-tagger/apertium/Jenkinsfile
===================================================================
--- branches/apertium-tagger/apertium/Jenkinsfile	(nonexistent)
+++ branches/apertium-tagger/apertium/Jenkinsfile	(revision 69391)
@@ -0,0 +1,10 @@
+node {
+   stage 'Checkout'  // check out the sources from the SCM this job is bound to
+   checkout scm
+
+   stage 'Build'  // run autogen.sh, then rebuild from a clean tree
+   sh "./autogen.sh && make clean && make"
+
+   stage 'Test'  // run the "test" make target
+   sh "make test"
+}
Index: branches/apertium-tagger/apertium/Makefile.am
===================================================================
--- branches/apertium-tagger/apertium/Makefile.am	(revision 69388)
+++ branches/apertium-tagger/apertium/Makefile.am	(revision 69391)
@@ -1,5 +1,5 @@
-SUBDIRS = $(GENERIC_LIBRARY_NAME)
-DIST_SUBDIRS = $(GENERIC_LIBRARY_NAME)
+SUBDIRS = $(GENERIC_LIBRARY_NAME) tests
+DIST_SUBDIRS = $(GENERIC_LIBRARY_NAME) tests
 
 modesdir=$(prefix)/share/apertium/modes
 
Index: branches/apertium-tagger/apertium/configure.ac
===================================================================
--- branches/apertium-tagger/apertium/configure.ac	(revision 69388)
+++ branches/apertium-tagger/apertium/configure.ac	(revision 69391)
@@ -186,4 +186,4 @@
 AM_CONDITIONAL([WINDOWS], [test x$version_type = xwindows])
 AS_IF([test x$version_type = xwindows], [AC_DEFINE(HAVE_GETOPT_LONG,0)], [])
 
-AC_OUTPUT([Makefile apertium.pc apertium/Makefile])
+AC_OUTPUT([Makefile apertium.pc apertium/Makefile tests/Makefile tests/tagger/Makefile])
Index: branches/apertium-tagger/apertium/tests/Makefile.am
===================================================================
--- branches/apertium-tagger/apertium/tests/Makefile.am	(nonexistent)
+++ branches/apertium-tagger/apertium/tests/Makefile.am	(revision 69391)
@@ -0,0 +1 @@
+SUBDIRS = tagger
Index: branches/apertium-tagger/apertium/tests/run_tests.py
===================================================================
--- branches/apertium-tagger/apertium/tests/run_tests.py	(revision 69388)
+++ branches/apertium-tagger/apertium/tests/run_tests.py	(revision 69391)
@@ -6,11 +6,12 @@
 
 import unittest
 import pretransfer
+import tagger
 
 if __name__ == "__main__":
     os.chdir(os.path.dirname(__file__))
     failures = 0
-    for module in [pretransfer]:
+    for module in [pretransfer, tagger]:
         suite = unittest.TestLoader().loadTestsFromModule(module)
         res = unittest.TextTestRunner(verbosity = 2).run(suite)
         failures += len(res.failures)
Index: branches/apertium-tagger/apertium/tests/tagger/Makefile.am
===================================================================
--- branches/apertium-tagger/apertium/tests/tagger/Makefile.am	(nonexistent)
+++ branches/apertium-tagger/apertium/tests/tagger/Makefile.am	(revision 69391)
@@ -0,0 +1,14 @@
+library_includedir = $(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME)
+# Test-only helper: noinst_ builds it with "make" but keeps it out of $(bindir).
+noinst_PROGRAMS = test-find-similar-ambiguity-class
+bin_SCRIPTS =  $(GENERATEDSCRIPTS)
+
+AM_CPPFLAGS = -I$(top_srcdir)
+
+apertiumdir = $(prefix)/share/apertium
+apertiuminclude = $(prefix)/include/apertium-$(GENERIC_API_VERSION)
+apertiumlib = $(prefix)/lib
+apertiumsysconf = $(prefix)/etc/apertium
+# The library is built under the build tree (.libs), so link via top_builddir.
+test_find_similar_ambiguity_class_SOURCES = test_find_similar_ambiguity_classes.cc
+test_find_similar_ambiguity_class_LDADD = -L$(top_builddir)/$(GENERIC_LIBRARY_NAME)/.libs/ $(APERTIUM_LIBS) -l$(GENERIC_LIBRARY_NAME)$(GENERIC_MAJOR_VERSION)
Index: branches/apertium-tagger/apertium/tests/tagger/__init__.py
===================================================================
--- branches/apertium-tagger/apertium/tests/tagger/__init__.py	(nonexistent)
+++ branches/apertium-tagger/apertium/tests/tagger/__init__.py	(revision 69391)
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import functools
+import unittest
+import tempfile
+from os.path import join as pjoin
+from os.path import abspath, dirname
+from subprocess import check_call, check_output, run, PIPE, DEVNULL
+
+from . import streamparser
+
+## Utilities
+def tmp(contents):
+    with tempfile.NamedTemporaryFile(mode='w', delete=False) as t:  # leaving the block closes => flushes
+        t.write(contents)
+    return t.name  # delete=False keeps the file on disk; the caller gets its path
+
+
+def rel(fn):  # absolute path of fn, resolved against the directory containing this file
+    return abspath(pjoin(dirname(abspath(__file__)), fn))
+
+
+APERTIUM_TAGGER = rel("../../apertium/apertium-tagger")
+
+
+def check_stderr(*popenargs, timeout=None, **kwargs):  # run the command and return what it wrote to stderr
+    return run(*popenargs, stderr=PIPE, timeout=timeout, check=True,  # check=True: non-zero exit raises
+               **kwargs).stderr
+
+
+def trace_dec(f):  # decorator: print "run <argv>" before delegating to f
+    @functools.wraps(f)
+    def inner(*args, **kwargs):
+        if len(args) > 0:
+            print("run " + " ".join(args[0]))  # assumes args[0] is a list of str (an argv), not a shell string
+        return f(*args, **kwargs)
+    return inner
+
+check_call = functools.partial(trace_dec(check_call), universal_newlines=True)  # traced, text-mode variants
+check_output = functools.partial(trace_dec(check_output), universal_newlines=True)
+check_stderr = functools.partial(trace_dec(check_stderr), universal_newlines=True)
+
+## Test files
+DIC = """
+^the/the<det><def><sp>$
+^books/book<n><pl>/book<vblex><pri><p3><sg>$
+^has/have<vbhaver><pres><p3><sg>$
+^booked/book<vblex><pp>/book<vblex><past>$
+^close/close<adj><sint>/close<n><sg>/close<vblex><inf>/close<vblex><pres>/close<vblex><imp>$
+^cat/cat<n><sg>$
+^room/room<n><sg>$
+^red/red<adj><sint>$
+^./.<sent>$
+""".strip()
+
+TSX = """
+<?xml version="1.0" encoding="utf-8"?>
+<tagger name="test">
+  <tagset>
+    <def-label name="DET" closed="true">
+      <tags-item tags="det.*"/>
+      <tags-item tags="det.*.*"/>
+    </def-label> 
+    <def-label name="VERB">
+      <tags-item tags="vblex.*"/>
+      <tags-item tags="vbhaver.*"/>
+    </def-label> 
+    <def-label name="NOUN">
+      <tags-item tags="n.*"/>
+    </def-label> 
+    <def-label name="ADJ">
+      <tags-item tags="adj.*"/>
+      <tags-item tags="adj"/>
+    </def-label> 
+  </tagset>
+</tagger>
+""".strip()
+
+TRAIN_NO_PROBLEM_UNTAGGED = """
+^The/the<det><def><sp>$
+^cat/cat<n><sg>$
+^books/book<n><pl>/book<vblex><pri><p3><sg>$
+^the/the<det><def><sp>$
+^room/room<n><sg>$
+^./.<sent>$
+
+^The/the<det><def><sp>$
+^red/red<adj><sint>$
+^cat/cat<n><sg>$
+^books/book<n><pl>/book<vblex><pri><p3><sg>$
+^the/the<det><def><sp>$
+^red/red<adj><sint>$
+^room/room<n><sg>$
+^./.<sent>$
+
+^The/the<det><def><sp>$
+^red/red<adj><sint>$
+^cat/cat<n><sg>$
+^books/book<n><pl>/book<vblex><pri><p3><sg>$
+^the/the<det><def><sp>$
+^room/room<n><sg>$
+^./.<sent>$
+""".strip()
+
+TRAIN_NO_PROBLEM_TAGGED = """
+^The/the<det><def><sp>$
+^cat/cat<n><sg>$
+^books/book<vblex><pri><p3><sg>$
+^the/the<det><def><sp>$
+^room/room<n><sg>$
+^./.<sent>$
+
+^The/the<det><def><sp>$
+^red/red<adj><sint>$
+^cat/cat<n><sg>$
+^books/book<vblex><pri><p3><sg>$
+^the/the<det><def><sp>$
+^red/red<adj><sint>$
+^room/room<n><sg>$
+^./.<sent>$
+
+^The/the<det><def><sp>$
+^red/red<adj><sint>$
+^cat/cat<n><sg>$
+^books/book<vblex><pri><p3><sg>$
+^the/the<det><def><sp>$
+^room/room<n><sg>$
+^./.<sent>$
+""".strip()
+
+TRAIN_CAT_TO_BE_A_VERB_UNTAGGED = """
+^The/The<det><def><sp>$
+^falling/fall<vblex><pprs>/fall<vblex><ger>/fall<vblex><subs>$
+^cat/cat<n><sg>$
+^has/have<vbhaver><pres><p3><sg>$
+^booked/book<vblex><pp>/book<vblex><past>$
+^books/book<n><pl>/book<vblex><pres><p3><sg>$
+^./.<sent>$
+^Close/close<adj><sint>/close<n><sg>/close<vblex><inf>/close<vblex><pres>/close<vblex><imp>$
+^the/the<det><def><sp>$
+^books/book<n><pl>/book<vblex><pri><p3><sg>$
+^./.<sent>$
+^The/the<det><def><sp>$
+^falling/fall<vblex><pprs>/fall<vblex><ger>/fall<vblex><subs>$
+^cat/cat<n><sg>$
+^has/have<vbhaver><pres><p3><sg>$
+^books/book<n><pl>/book<vblex><pres><p3><sg>$
+^./.<sent>$
+""".strip()
+
+TRAIN_CAT_TO_BE_A_VERB_TAGGED = """
+^The/The<det><def><sp>$
+^falling/fall<vblex><pprs>$
+^cat/cat<n><sg>$
+^has/have<vbhaver><pres><p3><sg>$
+^booked/book<vblex><pp>$
+^books/book<n><pl>$
+^./.<sent>$
+^Close/close<vblex><imp>$
+^the/the<det><def><sp>$
+^books/book<n><pl>$
+^./.<sent>$
+^The/the<det><def><sp>$
+^falling/fall<vblex><pprs>$
+^cat/cat<n><sg>$
+^has/have<vbhaver><pres><p3><sg>$
+^books/book<n><pl>$
+^./.<sent>$
+""".strip()
+
+TEST_SUCCESS = """
+^The/the<det><def><sp>$
+^cat/cat<n><sg>$
+^books/book<n><pl>/book<vblex><pri><p3><sg>$
+^the/the<det><def><sp>$
+^room/room<n><sg>$
+^./.<sent>$
+""".strip()
+
+TEST_NEW_AMBG_CLASS = """
+^The/the<det><def><sp>$
+^cat/cat<n><sg>/cat<adj>$
+^books/book<n><pl>/book<vblex><pri><p3><sg>$
+^the/the<det><def><sp>$
+^room/room<n><sg>$
+^./.<sent>$
+""".strip()
+
+## Expected strings
+EXPECTED_SUBST = """
+Error: A new ambiguity class was found. 
+Retraining the tagger is necessary so as to take it into account.
+Word 'cat'.
+New ambiguity class: {NOUN,ADJ}
+""".strip()
+
+## Tests
+class AmbiguityClassTest(unittest.TestCase):  # runs apertium-tagger on inputs containing an unseen ambiguity class
+    def setUp(self):
+        self.tsx_fn = tmp(TSX)
+        self.dic_fn = tmp(DIC)
+
+    def changing_class_impl(self, flags, model_fn):  # shared checks for a model already trained into model_fn
+        test1 = tmp(TEST_SUCCESS)
+        test2 = tmp(TEST_NEW_AMBG_CLASS)
+        success_stderr = check_stderr(
+            [APERTIUM_TAGGER, '-d'] + flags +
+            ['-g', model_fn, test1],
+            stdout=DEVNULL)
+        self.assertEqual(success_stderr.strip(), "")
+        subst_stderr = check_stderr(
+            [APERTIUM_TAGGER, '-d'] + flags +
+            ['-g', model_fn, test2],
+            stdout=DEVNULL)
+        self.assertEqual(subst_stderr.strip(), EXPECTED_SUBST)
+        ambg_class = check_output(
+           [rel('test-find-similar-ambiguity-class'), model_fn],
+           input="NOUN ADJ\n")
+        substituted_class = set(ambg_class.split(" "))
+        # Should get open class
+        self.assertSetEqual(substituted_class, set(("VERB", "NOUN", "ADJ")))
+
+    def test_changing_class_hmm_sup(self):
+        model_fn = tmp("")
+        untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED)
+        tagged = tmp(TRAIN_NO_PROBLEM_TAGGED)  # supervised training needs the tagged corpus, not a second untagged copy
+        check_call(
+            [APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn,
+             model_fn, tagged, untagged])
+        self.changing_class_impl([], model_fn)
+
+    def test_changing_class_hmm_unsup(self):
+        model_fn = tmp("")
+        untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED)
+        check_call(
+            [APERTIUM_TAGGER, '-t', '1', self.dic_fn, untagged, self.tsx_fn,
+             model_fn])
+        self.changing_class_impl([], model_fn)
+
+    def test_changing_class_sliding_window(self):
+        model_fn = tmp("")
+        untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED)
+        check_call(
+            [APERTIUM_TAGGER, '--sliding-window', '-t', '1', self.dic_fn,
+             untagged, self.tsx_fn, model_fn])
+        self.changing_class_impl(['--sliding-window'], model_fn)
+
+    def test_cat_is_a_verb(self):
+        model_fn = tmp("")
+        untagged = tmp(TRAIN_CAT_TO_BE_A_VERB_UNTAGGED)
+        tagged = tmp(TRAIN_CAT_TO_BE_A_VERB_TAGGED)
+        new_ambg_class = tmp(TEST_NEW_AMBG_CLASS)
+        check_call(
+            [APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn,
+             model_fn, tagged, untagged])
+        subst_stdout = check_output(
+            [APERTIUM_TAGGER, '-d', '-g', model_fn, new_ambg_class], stderr=DEVNULL)
+        lexical_units = streamparser.parse(subst_stdout)
+        acceptable = False
+        for lexical_unit in lexical_units:
+            for reading in lexical_unit.readings:
+                if reading.baseform == 'cat' and \
+                        ('adj' in reading.tags or 'n' in reading.tags):
+                    acceptable = True
+        self.assertTrue(
+            acceptable,
+            "'cat' must be output and tagged as an adjective or a noun.\n" +
+            "Actual output:\n{}".format(subst_stdout))

Property changes on: branches/apertium-tagger/apertium/tests/tagger/__init__.py
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
Index: branches/apertium-tagger/apertium/tests/tagger/streamparser.py
===================================================================
--- branches/apertium-tagger/apertium/tests/tagger/streamparser.py	(nonexistent)
+++ branches/apertium-tagger/apertium/tests/tagger/streamparser.py	(revision 69391)
@@ -0,0 +1,171 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""
+Usage: streamparser.py [FILE]
+
+Consumes input from a file (first argument) or stdin, parsing and pretty printing the readings of lexical units found.
+"""
+
+import re, pprint, sys, itertools, fileinput
+from enum import Enum
+from collections import namedtuple
+
+
+Knownness = Enum('Knownness', 'known unknown biunknown genunknown')
+try:
+    Knownness.__doc__ = """Level of knowledge associated with a lexical unit.
+    Values:
+        known
+        unknown: Denoted by '*', analysis not available.
+        biunknown: Denoted by '@', translation not available.
+        genunknown: Denoted by '#', generated form not available.
+"""
+except AttributeError:
+    # __doc__ was not assignable (Python 3.2, per the original note); the text above is then the only documentation
+    pass
+
+SReading = namedtuple('SReading', ['baseform', 'tags'])
+try:
+    SReading.__doc__ = """A single subreading of an analysis of a token.
+    Fields:
+        baseform (str): The base form (lemma, lexical form, citation form) of the reading.
+        tags (list of str): The morphological tags associated with the reading.
+"""
+except AttributeError:
+    # __doc__ was not assignable (Python 3.2, per the original note); the text above is then the only documentation
+    pass
+
+def subreadingToString(sub):  # baseform followed by each tag wrapped in <>, e.g. 'book<n><pl>'
+    return sub.baseform+"".join("<"+t+">" for t in sub.tags)
+
+def readingToString(reading):  # the reading's subreadings, each stringified, joined with '+'
+    return "+".join(subreadingToString(sub) for sub in reading)
+
+def mainpos(reading, ltr=False):
+    """Give the leading (part-of-speech) tag of a reading.
+
+    With more than one subreading, the last subreading is the one
+    consulted by default; pass ltr=True to consult the first one
+    instead. Background on subreading order:
+    http://beta.visl.sdu.dk/cg3/single/#sub-stream-apertium
+
+    """
+    # 0 selects the first subreading (ltr), -1 the last one
+    position = 0 if ltr else -1
+    chosen = reading[position]
+    return chosen.tags[0]
+
+class LexicalUnit:
+
+    """A lexical unit consisting of a lemma and its readings.
+
+    Attributes:
+        lexicalUnit (str): The lexical unit in Apertium stream format.
+        wordform (str): The word form (surface form) of the lexical unit.
+        readings (list of list of SReading): The analyses of the lexical unit with sublists containing all subreadings.
+        knownness (Knownness): The level of knowledge of the lexical unit.
+    """
+
+    knownness = Knownness.known
+    def __init__(self, lexicalUnit):
+        self.lexicalUnit = lexicalUnit
+
+        cohort = re.split(r'(?<!\\)/', lexicalUnit)  # split on '/' unless it is backslash-escaped
+        self.wordform = cohort[0]
+        readings = cohort[1:]
+
+        self.readings = []
+        for reading in readings:
+            if len(reading) < 1:
+                print("WARNING: Empty readings for {}".format(self.lexicalUnit), file=sys.stderr)
+            elif reading[0] not in '*#@':
+                subreadings = []
+
+                subreadingParts = re.findall(r'([^<]+)((?:<[^>]+>)+)', reading)
+                for subreading in subreadingParts:
+                    baseform = subreading[0].lstrip('+')
+                    tags = re.findall(r'<([^>]+)>', subreading[1])
+
+                    subreadings.append(SReading(baseform=baseform, tags=tags))
+
+                self.readings.append(subreadings)
+            else:
+                # key on the marker of *this* reading; readings[0] may be a known (or empty) reading
+                self.knownness = {'*': Knownness.unknown, '@': Knownness.biunknown, '#': Knownness.genunknown}[reading[0]]
+    def __repr__(self):
+        return self.lexicalUnit
+
+
+def parse(stream, withText=False):
+    """Generates lexical units from a character stream.
+
+    Args:
+        stream (iterable): A character stream containing lexical units, superblanks and other text.
+        withText (bool, optional): A boolean defining whether to output preceding text with each lexical unit.
+
+    Yields:
+        LexicalUnit: The next lexical unit found in the character stream. (if withText is False)
+        (str, LexicalUnit): The next lexical unit found in the character stream and the text that separated it from the prior unit in a tuple. (if withText is True)
+    """
+
+    buffer = ''
+    textBuffer = ''
+    inLexicalUnit = False
+    inSuperblank = False
+    stream = iter(stream)  # next() below needs an iterator; a plain str is only an iterable
+    for char in stream:
+
+        if inSuperblank:
+            if char == ']':
+                inSuperblank = False
+                textBuffer += char
+            elif char == '\\':
+                textBuffer += char
+                textBuffer += next(stream, '')  # '' default: a dangling escape ends the stream quietly
+            else:
+                textBuffer += char
+        elif inLexicalUnit:
+            if char == '$':
+                if withText:
+                    yield (textBuffer, LexicalUnit(buffer))
+                else:
+                    yield LexicalUnit(buffer)
+                buffer = ''
+                textBuffer = ''
+                inLexicalUnit = False
+            elif char == '\\':
+                buffer += char
+                buffer += next(stream, '')
+            else:
+                buffer += char
+        else:
+            if char == '[':
+                inSuperblank = True
+                textBuffer += char
+            elif char == '^':
+                inLexicalUnit = True
+            elif char == '\\':
+                textBuffer += char
+                textBuffer += next(stream, '')
+            else:
+                textBuffer += char
+
+
+def parse_file(f, withText=False):
+    """Generates lexical units from a file.
+
+    Args:
+        f (file): A file containing lexical units, superblanks and other text.
+        withText (bool, optional): Passed through to parse() unchanged.
+    Yields:
+        LexicalUnit: The next lexical unit found in the file.
+    """
+    # chain.from_iterable flattens the items f yields (lines, for a file) into one character stream
+    return parse(itertools.chain.from_iterable(f), withText)
+
+
+if __name__ == '__main__':  # script mode: pretty-print the readings of every unit read via fileinput
+    lexicalUnits = parse_file(fileinput.input())
+
+    for lexicalUnit in lexicalUnits:
+        pprint.pprint(lexicalUnit.readings, width=120)
Index: branches/apertium-tagger/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc
===================================================================
--- branches/apertium-tagger/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc	(nonexistent)
+++ branches/apertium-tagger/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc	(revision 69391)
@@ -0,0 +1,61 @@
+#include "apertium/utf_converter.h"
+#include "apertium/tagger_utils.h"
+#include "apertium/tagger_data_hmm.h"
+#include "apertium/tagger_data.h"
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+
+void print_ambiguity_class(const vector<wstring> &array_tags, const set<TTag> &abgset)
+{  // writes the tag names of abgset to wcout, space-separated, with no trailing newline
+  unsigned int j;  // position within abgset, used only to place the separators
+  set<TTag>::const_iterator abgseti;
+  for (abgseti=abgset.begin(), j=0; abgseti!=abgset.end(); abgseti++, j++) {
+    wcout << array_tags[*abgseti];  // each TTag is used as an index into array_tags
+    if (j < abgset.size() - 1) {  // separator after every element except the last
+      wcout << " ";
+    }
+  }
+}
+
+void find_similar_ambiguity_class_io(TaggerData &td)
+{  // reads one line of tag names from wcin and prints the similar ambiguity class found for them
+  vector<wstring> &array_tags = td.getArrayTags();
+  wstring line = L"";
+  getline(wcin, line, L'\n');
+
+  wstringstream line_stream(line);
+  set<TTag> ambiguity_class;
+  wstring tag_name;
+  while (line_stream >> tag_name) {  // tag names are whitespace-separated
+    vector<wstring>::iterator it;
+    it = find(array_tags.begin(), array_tags.end(), tag_name);
+    if (it == array_tags.end()) {
+        wcerr << L"Tag not in model: " << tag_name << L'\n';
+        exit(-3);
+    }
+    ambiguity_class.insert(it - array_tags.begin());  // store the tag's index in array_tags
+  }
+  set<TTag> similar_ambiguity_class = tagger_utils::find_similar_ambiguity_class(td, ambiguity_class);
+  print_ambiguity_class(array_tags, similar_ambiguity_class);
+}
+
+int main(int argc, char *argv[])  // usage: <prog> <probfile>; the tag names to look up come from stdin
+{
+  if (argc < 2) {
+    cerr<<"Usage: "<<argv[0]<<" <probfile>\n";
+    exit(-1);
+  }
+  char* probfile = argv[1];
+  TaggerDataHMM tagger_data_hmm;
+  FILE* fin = fopen(probfile, "rb");  // the model file is binary data: open without newline translation
+  if (!fin) {
+    cerr<<"Error: cannot open file '"<<probfile<<"'\n";
+    exit(-2);
+  }
+  tagger_data_hmm.read(fin);
+  fclose(fin);
+
+  find_similar_ambiguity_class_io((TaggerData&)tagger_data_hmm);
+}
Index: branches/apertium-tagger/apertium/apertium/apertium_tagger.cc
===================================================================
--- branches/apertium-tagger/apertium/apertium/apertium_tagger.cc	(revision 69388)
+++ branches/apertium-tagger/apertium/apertium/apertium_tagger.cc	(revision 69391)
@@ -734,6 +734,7 @@
   }
 
   FILE_Tagger_.set_debug(TheFlags.getDebug());
+  TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags());
   TaggerWord::generate_marks = TheFlags.getMark();
   FILE_Tagger_.set_show_sf(TheFlags.getShowSuperficial());
   FILE_Tagger_.setNullFlush(TheFlags.getNullFlush());