commit 8d3d8f2211b910450f45d628d642884a24b192d9
Author: Daniel Swanson <popcorn.tomato.dude@gmail.com>
Date:   Wed Jun 30 08:53:05 2021 -0500

    use ICU (#40)
    
    ICU
    - convert all `std::wstring`s and related types to `UString`
    - use `lttoolbox/input_file.h` for reading UTF-8 input with nulls
    - use `UFILE*` for output
    
    efficiency, cleanliness and code style
    - move constant initializers to header file
    - store references to special transducer symbols to save `alphabet` lookups
    - use range-for loops when possible
    - prefer `std::vector` to `std::list`
    - prefer `str.empty()` to `str == ""`
    - drop old, unused file in `src/`
    
    helper functions
    - `XMLParseUtil` has more specialized functions now
    - `XMLParseUtil` is now in lttoolbox, so drop apertium dependency

diff --git a/.gitignore b/.gitignore
index 94e8d70..d16ffc9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,4 @@ stamp-h1
 src/*.o
 src/lsx-comp
 src/lsx-proc
+*.pyc
diff --git a/configure.ac b/configure.ac
index 5b60c37..b6b36b2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,10 +1,9 @@
 AC_PREREQ(2.61)
 
 m4_define([required_libxml_version], [2.6.17])
-m4_define([required_apertium_version], [3.7.0])
-m4_define([required_lttoolbox_version], [3.5.3])
+m4_define([required_lttoolbox_version], [3.6.0])
 
-AC_INIT([apertium-separable], [0.3.6], [apertium-stuff@lists.sourceforge.net])
+AC_INIT([apertium-separable], [0.4.0], [apertium-stuff@lists.sourceforge.net])
 AM_INIT_AUTOMAKE
 AC_CONFIG_MACRO_DIR([m4])
 
@@ -28,23 +27,23 @@ PKG_CHECK_MODULES([LTTOOLBOX], [lttoolbox >= required_lttoolbox_version])
 AC_SUBST(LTTOOLBOX_CFLAGS)
 AC_SUBST(LTTOOLBOX_LIBS)
 
-PKG_CHECK_MODULES([APERTIUM], [apertium >= required_apertium_version])
-
-AC_SUBST(APERTIUM_CFLAGS)
-AC_SUBST(APERTIUM_LIBS)
-
 PKG_CHECK_MODULES([LIBXML], [libxml-2.0 >= required_libxml_version])
 
 AC_SUBST(LIBXML_CFLAGS)
 AC_SUBST(LIBXML_LIBS)
 
+PKG_CHECK_MODULES([ICU], [icu-i18n, icu-io, icu-uc])
+
+AC_SUBST(ICU_CFLAGS)
+AC_SUBST(ICU_LIBS)
+
 # Checks for libraries.
 AC_CHECK_LIB(xml2, xmlReaderForFile)
 
-AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked])
+AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])])
 
-CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS"
-LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS"
+CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS"
+LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS $ICU_LIBS"
 
 # Checks for highest supported C++ standard
 AC_LANG(C++)
diff --git a/src/lsx_comp.cc b/src/lsx_comp.cc
index 4cbfff4..84a9905 100644
--- a/src/lsx_comp.cc
+++ b/src/lsx_comp.cc
@@ -2,6 +2,7 @@
 #include <cerrno>
 #include <iostream>
 #include <stdlib.h>
+#include <cstring>
 
 #include <lsx_compiler.h>
 #include <lttoolbox/lt_locale.h>
@@ -29,7 +30,7 @@ int main (int argc, char** argv)
 
   Compiler c;
 
-  wstring dir;
+  UString dir;
 
   if(strcmp(argv[1], "lr") == 0)
   {
@@ -54,7 +55,7 @@ int main (int argc, char** argv)
   FILE* fst = fopen(argv[argc-1], "w+");
   if(!fst)
   {
-    wcerr << "Error: Cannot open file '" << fst << "'." << endl;
+    cerr << "Error: Cannot open file '" << argv[argc-1] << "'." << endl; // print the path; fst is null here
     exit(EXIT_FAILURE);
   }
   c.write(fst);
diff --git a/src/lsx_compiler.cc b/src/lsx_compiler.cc
index 3ecb3f1..73dd340 100644
--- a/src/lsx_compiler.cc
+++ b/src/lsx_compiler.cc
@@ -19,7 +19,7 @@
 #include <lttoolbox/entry_token.h>
 #include <lttoolbox/lt_locale.h>
 #include <lttoolbox/xml_parse_util.h>
-#include <apertium/string_utils.h>
+#include <lttoolbox/string_utils.h>
 
 #include <cstdlib>
 #include <iostream>
@@ -29,35 +29,27 @@ using namespace std;
 
 // Removed static globals copied from lttoolbox's compiler.cc. Same namespace, same mangling, bad result.
 
-wstring const Compiler::COMPILER_ANYTAG_ELEM        = L"t";
-wstring const Compiler::COMPILER_ANYCHAR_ELEM       = L"w";
-wstring const Compiler::COMPILER_WB_ELEM            = L"j";
-
-Compiler::Compiler() :
-reader(0),
-verbose(false),
-first_element(false)
-{
-}
-
-Compiler::~Compiler()
-{
-}
+UString const Compiler::COMPILER_ANYTAG_ELEM        = "t"_u;
+UString const Compiler::COMPILER_ANYCHAR_ELEM       = "w"_u;
+UString const Compiler::COMPILER_WB_ELEM            = "j"_u;
 
 void
-Compiler::parse(string const &fichero, wstring const &dir)
+Compiler::parse(string const &fichero, UString const &dir)
 {
     direction = dir;
     reader = xmlReaderForFile(fichero.c_str(), NULL, 0);
     if(reader == NULL)
     {
-        wcerr << "Error: Cannot open '" << fichero.c_str() << "'." << endl;
+        cerr << "Error: Cannot open '" << fichero.c_str() << "'." << endl;
         exit(EXIT_FAILURE);
     }
 
-    alphabet.includeSymbol(L"<ANY_TAG>");
-    alphabet.includeSymbol(L"<ANY_CHAR>");
-    alphabet.includeSymbol(L"<$>");
+    alphabet.includeSymbol(Transducer::ANY_TAG_SYMBOL);
+    alphabet.includeSymbol(Transducer::ANY_CHAR_SYMBOL);
+    alphabet.includeSymbol(Transducer::LSX_BOUNDARY_SYMBOL);
+    any_tag       = alphabet(Transducer::ANY_TAG_SYMBOL);
+    any_char      = alphabet(Transducer::ANY_CHAR_SYMBOL);
+    word_boundary = alphabet(Transducer::LSX_BOUNDARY_SYMBOL);
 
     int ret = xmlTextReaderRead(reader);
     while(ret == 1)
@@ -69,7 +61,7 @@ Compiler::parse(string const &fichero, wstring const &dir)
 
     if(ret != 0)
     {
-        wcerr << L"Error: Parse error at the end of input." << endl;
+        cerr << "Error: Parse error at the end of input." << endl;
     }
 
     xmlFreeTextReader(reader);
@@ -77,19 +69,16 @@ Compiler::parse(string const &fichero, wstring const &dir)
 
 
     // Minimize transducers and ensure that all paths end with <$>
-    int end_trans = alphabet(alphabet(L"<$>"), alphabet(L"<$>"));
-    for(map<wstring, Transducer, Ltstr>::iterator it = sections.begin(),
-        limit = sections.end();
-        it != limit; it++)
-    {
-        (it->second).minimize();
+    int end_trans = alphabet(word_boundary, word_boundary);
+    for (auto& it : sections) {
+        it.second.minimize();
         // any paths which did not already end with <$> now will
         // having 2 finals isn't a problem because -separable only checks
         // for finals when it reads $, and you can't have 2 of those in a row
-        for(auto fin : (it->second).getFinals())
+        for(auto fin : it.second.getFinals())
         {
-          int end_state = (it->second).insertSingleTransduction(end_trans, fin.first);
-          (it->second).setFinal(end_state);
+          int end_state = it.second.insertSingleTransduction(end_trans, fin.first);
+          it.second.setFinal(end_state);
         }
     }
 }
@@ -105,8 +94,7 @@ Compiler::procAlphabet()
         int ret = xmlTextReaderRead(reader);
         if(ret == 1)
         {
-            xmlChar const *valor = xmlTextReaderConstValue(reader);
-            letters = XMLParseUtil::towstring(valor);
+            letters = XMLParseUtil::readValue(reader); // assign the member; a local declaration would shadow it
             bool espai = true;
             for(unsigned int i = 0; i < letters.length(); i++)
             {
@@ -118,13 +106,13 @@ Compiler::procAlphabet()
             }
             if(espai == true)  // libxml2 returns '\n' for <alphabet></alphabet>, should be empty
             {
-                letters = L"";
+              letters.clear();
             }
         }
         else
         {
-            wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-            wcerr << L"): Missing alphabet symbols." << endl;
+            cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+            cerr << "): Missing alphabet symbols." << endl;
             exit(EXIT_FAILURE);
         }
     }
@@ -133,7 +121,11 @@ Compiler::procAlphabet()
 void
 Compiler::procSDef()
 {
-    alphabet.includeSymbol(L"<"+attrib(COMPILER_N_ATTR)+L">");
+  UString s;
+  s += '<';
+  s.append(attrib(COMPILER_N_ATTR));
+  s += '>';
+  alphabet.includeSymbol(s);
 }
 
 void
@@ -151,15 +143,15 @@ Compiler::procParDef()
         {
             paradigms[current_paradigm].minimize();
             paradigms[current_paradigm].joinFinals();
-            current_paradigm = L"";
+            current_paradigm.clear();
         }
     }
 }
 
 int
-Compiler::matchTransduction(list<int> const &pi, list<int> const &pd, int estado, Transducer &t)
+Compiler::matchTransduction(vector<int> const &pi, vector<int> const &pd, int estado, Transducer &t)
 {
-    list<int>::const_iterator izqda, dcha, limizqda, limdcha;
+    vector<int>::const_iterator izqda, dcha, limizqda, limdcha;
 
     if(direction == COMPILER_RESTRICTION_LR_VAL)
     {
@@ -183,8 +175,6 @@ Compiler::matchTransduction(list<int> const &pi, list<int> const &pd, int estado
     }
     else
     {
-        int rsymbol = 0;
-
         while(true)
         {
             int etiqueta;
@@ -202,33 +192,31 @@ Compiler::matchTransduction(list<int> const &pi, list<int> const &pd, int estado
             else if(dcha == limdcha)
             {
                 etiqueta = alphabet(*izqda, 0);
-                rsymbol = 0;
                 izqda++;
             }
             else
             {
                 etiqueta = alphabet(*izqda, *dcha);
-                rsymbol = *dcha;
                 izqda++;
                 dcha++;
             }
 
-            if(etiqueta == alphabet(0, alphabet(L"<ANY_TAG>")) ||
-               etiqueta == alphabet(0, alphabet(L"<ANY_CHAR>"))
+            if(etiqueta == alphabet(0, any_tag) ||
+               etiqueta == alphabet(0, any_char)
               )
             {
               // rl compilation of a badly written rule
               // having an epsilon with wildcard output will produce
               // garbage output -- see https://github.com/apertium/apertium-separable/issues/8
-              wcerr << L"Warning: Cannot insert <t/> from empty input. Ignoring. (You probably want to specify exact tags when deleting a word.)" << endl;
+              cerr << "Warning: Cannot insert <t/> from empty input. Ignoring. (You probably want to specify exact tags when deleting a word.)" << endl;
               continue;
             }
 
             int nuevo_estado = t.insertSingleTransduction(etiqueta, estado);
-            if(etiqueta == alphabet(alphabet(L"<ANY_TAG>"),alphabet(L"<ANY_TAG>"))
-               || etiqueta == alphabet(alphabet(L"<ANY_CHAR>"),alphabet(L"<ANY_CHAR>"))
-               || etiqueta == alphabet(alphabet(L"<ANY_TAG>"), 0)
-               || etiqueta == alphabet(alphabet(L"<ANY_CHAR>"), 0)
+            if(etiqueta == alphabet(any_tag, any_tag)
+               || etiqueta == alphabet(any_char, any_char)
+               || etiqueta == alphabet(any_tag, 0)
+               || etiqueta == alphabet(any_char, 0)
               )
             {
                 t.linkStates(nuevo_estado, nuevo_estado, etiqueta);
@@ -242,12 +230,12 @@ Compiler::matchTransduction(list<int> const &pi, list<int> const &pd, int estado
 
 
 void
-Compiler::requireEmptyError(wstring const &name)
+Compiler::requireEmptyError(UString const &name)
 {
     if(!xmlTextReaderIsEmptyElement(reader))
     {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): Non-empty element '<" << name << ">' should be empty." << endl;
         exit(EXIT_FAILURE);
     }
 }
@@ -255,139 +243,137 @@ Compiler::requireEmptyError(wstring const &name)
 bool
 Compiler::allBlanks()
 {
-    bool flag = true;
-    wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader));
-
-    for(unsigned int i = 0, limit = text.size(); i < limit; i++)
-    {
-        flag = flag && iswspace(text[i]);
-    }
-
-    return flag;
+  vector<int32_t> text;
+  XMLParseUtil::readValueInto32(reader, text);
+  for (auto& it : text) {
+    if (!u_isspace(it)) {
+      return false;
+    }
+  }
+  return true;
 }
 
 void
-Compiler::readString(list<int> &result, wstring const &name)
+Compiler::readString(vector<int> &result, UString const &name)
 {
-    if(name == L"#text")
+    if(name == "#text"_u)
     {
-        wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader));
-        for(unsigned int i = 0, limit = value.size(); i < limit; i++)
-        {
-            result.push_back(static_cast<int>(value[i]));
-        }
+      XMLParseUtil::readValueInto32(reader, result);
     }
     else if(name == COMPILER_BLANK_ELEM)
     {
         requireEmptyError(name);
-        result.push_back(static_cast<int>(L' '));
+        result.push_back(static_cast<int>(' '));
     }
     else if(name == COMPILER_POSTGENERATOR_ELEM)
     {
         requireEmptyError(name);
-        result.push_back(static_cast<int>(L'~'));
+        result.push_back(static_cast<int>('~'));
     }
     else if(name == COMPILER_GROUP_ELEM)
     {
         int tipo=xmlTextReaderNodeType(reader);
         if(tipo != XML_READER_TYPE_END_ELEMENT)
         {
-            result.push_back(static_cast<int>(L'#'));
+            result.push_back(static_cast<int>('#'));
         }
     }
     else if(name == COMPILER_S_ELEM)
     {
         requireEmptyError(name);
-        wstring symbol = L"<" + attrib(COMPILER_N_ATTR) + L">";
+        UString symbol;
+        symbol += '<';
+        symbol.append(attrib(COMPILER_N_ATTR));
+        symbol += '>';
 
         if(!alphabet.isSymbolDefined(symbol))
         {
-            wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-            wcerr << L"): Undefined symbol '" << symbol << L"'." << endl;
+            cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+            cerr << "): Undefined symbol '" << symbol << "'." << endl;
             exit(EXIT_FAILURE);
         }
         result.push_back(alphabet(symbol));
     }
     else if(name == COMPILER_ANYTAG_ELEM)
     {
-        result.push_back(alphabet(L"<ANY_TAG>"));
+        result.push_back(any_tag);
     }
     else if(name == COMPILER_ANYCHAR_ELEM)
     {
-        result.push_back(alphabet(L"<ANY_CHAR>"));
+        result.push_back(any_char);
     }
     else if(name == COMPILER_WB_ELEM)
     {
         requireEmptyError(name);
-        result.push_back(alphabet(L"<$>"));
+        result.push_back(word_boundary);
     }
 
     else
     {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): Invalid specification of element '<" << name;
-        wcerr << L">' in this context." << endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): Invalid specification of element '<" << name;
+        cerr << ">' in this context." << endl;
         exit(EXIT_FAILURE);
     }
 }
 
 void
-Compiler::skipBlanks(wstring &name)
+Compiler::skipBlanks(UString &name)
 {
-    while(name == L"#text" || name == L"#comment")
+    while(name == "#text"_u || name == "#comment"_u)
     {
-        if(name != L"#comment")
+        if(name != "#comment"_u)
         {
             if(!allBlanks())
             {
-                wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-                wcerr << L"): Invalid construction." << endl;
+                cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+                cerr << "): Invalid construction." << endl;
                 exit(EXIT_FAILURE);
             }
         }
 
         xmlTextReaderRead(reader);
-        name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+        name = XMLParseUtil::readName(reader);
     }
 }
 
 void
-Compiler::skip(wstring &name, wstring const &elem)
+Compiler::skip(UString &name, UString const &elem)
 {
     skip(name, elem, true);
 }
 
 void
-Compiler::skip(wstring &name, wstring const &elem, bool open)
+Compiler::skip(UString &name, UString const &elem, bool open)
 {
     xmlTextReaderRead(reader);
-    name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
-    wstring slash;
+    name = XMLParseUtil::readName(reader);
+    UString slash;
 
     if(!open)
     {
-        slash = L"/";
+        slash = "/"_u;
     }
 
-    while(name == L"#text" || name == L"#comment")
+    while(name == "#text"_u || name == "#comment"_u)
     {
-        if(name != L"#comment")
+        if(name != "#comment"_u)
         {
             if(!allBlanks())
             {
-                wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-                wcerr << L"): Invalid construction." << endl;
+                cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+                cerr << "): Invalid construction." << endl;
                 exit(EXIT_FAILURE);
             }
         }
         xmlTextReaderRead(reader);
-        name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+        name = XMLParseUtil::readName(reader);
     }
 
     if(name != elem)
     {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): Expected '<" << slash << elem << L">'." << endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): Expected '<" << slash << elem << ">'." << endl;
         exit(EXIT_FAILURE);
     }
 }
@@ -395,16 +381,16 @@ Compiler::skip(wstring &name, wstring const &elem, bool open)
 EntryToken
 Compiler::procIdentity()
 {
-    list<int> both_sides;
+    vector<int> both_sides;
 
     if(!xmlTextReaderIsEmptyElement(reader))
     {
-        wstring name = L"";
+        UString name;
 
         while(true)
         {
             xmlTextReaderRead(reader);
-            name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+            name = XMLParseUtil::readName(reader);
             if(name == COMPILER_IDENTITY_ELEM)
             {
                 break;
@@ -413,10 +399,10 @@ Compiler::procIdentity()
         }
     }
 
-    if(verbose && first_element && (both_sides.front() == (int)L' '))
+    if(verbose && first_element && !both_sides.empty() && (both_sides.front() == (int)' ')) // front() on an empty vector is undefined
     {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): Entry begins with space." << endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): Entry begins with space." << endl;
     }
     first_element = false;
     EntryToken e;
@@ -427,18 +413,18 @@ Compiler::procIdentity()
 EntryToken
 Compiler::procTransduction()
 {
-    list<int> lhs, rhs;
-    wstring name;
+    vector<int> lhs, rhs;
+    UString name;
 
     skip(name, COMPILER_LEFT_ELEM);
 
     if(!xmlTextReaderIsEmptyElement(reader))
     {
-        name = L"";
+      name.clear();
         while(true)
         {
             xmlTextReaderRead(reader);
-            name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+            name = XMLParseUtil::readName(reader);
             if(name == COMPILER_LEFT_ELEM)
             {
                 break;
@@ -447,10 +433,10 @@ Compiler::procTransduction()
         }
     }
 
-    if(verbose && first_element && (lhs.front() == (int)L' '))
+    if(verbose && first_element && !lhs.empty() && (lhs.front() == (int)' ')) // front() on an empty vector is undefined
     {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): Entry begins with space." << endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): Entry begins with space." << endl;
     }
     first_element = false;
 
@@ -458,11 +444,11 @@ Compiler::procTransduction()
 
     if(!xmlTextReaderIsEmptyElement(reader))
     {
-        name = L"";
+      name.clear();
         while(true)
         {
             xmlTextReaderRead(reader);
-            name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+            name = XMLParseUtil::readName(reader);
             if(name == COMPILER_RIGHT_ELEM)
             {
                 break;
@@ -479,8 +465,8 @@ Compiler::procTransduction()
     return e;
 }
 
-wstring
-Compiler::attrib(wstring const &name)
+UString
+Compiler::attrib(UString const &name)
 {
     return XMLParseUtil::attrib(reader, name);
 }
@@ -489,20 +475,20 @@ EntryToken
 Compiler::procPar()
 {
     EntryToken e;
-    wstring nomparadigma = attrib(COMPILER_N_ATTR);
+    UString nomparadigma = attrib(COMPILER_N_ATTR);
     first_element = false;
 
-    if(current_paradigm != L"" && nomparadigma == current_paradigm)
+    if(!current_paradigm.empty() && nomparadigma == current_paradigm)
     {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): Paradigm refers to itself '" << nomparadigma << L"'." <<endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): Paradigm refers to itself '" << nomparadigma << "'." <<endl;
         exit(EXIT_FAILURE);
     }
 
     if(paradigms.find(nomparadigma) == paradigms.end())
     {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): Undefined paradigm '" << nomparadigma << L"'." << endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): Undefined paradigm '" << nomparadigma << "'." << endl;
         exit(EXIT_FAILURE);
     }
     e.setParadigm(nomparadigma);
@@ -512,8 +498,7 @@ Compiler::procPar()
 void
 Compiler::insertEntryTokens(vector<EntryToken> const &elements)
 {
-    if(current_paradigm != L"")
-    {
+  if(!current_paradigm.empty()) {
         // compilation of paradigms
         Transducer &t = paradigms[current_paradigm];
         int e = t.getInitial();
@@ -537,8 +522,8 @@ Compiler::insertEntryTokens(vector<EntryToken> const &elements)
             }
             else
             {
-                wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-                wcerr << L"): Invalid entry token." << endl;
+                cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+                cerr << "): Invalid entry token." << endl;
                 exit(EXIT_FAILURE);
             }
         }
@@ -608,15 +593,14 @@ Compiler::insertEntryTokens(vector<EntryToken> const &elements)
 
 
 void
-Compiler::requireAttribute(wstring const &value, wstring const &attrname,
-                           wstring const &elemname)
+Compiler::requireAttribute(UString const &value, UString const &attrname,
+                           UString const &elemname)
 {
-    if(value == L"")
-    {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): '<" << elemname;
-        wcerr << L"' element must specify non-void '";
-        wcerr << attrname << L"' attribute." << endl;
+  if(value.empty()) {
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): '<" << elemname;
+        cerr << "' element must specify non-void '";
+        cerr << attrname << "' attribute." << endl;
         exit(EXIT_FAILURE);
     }
 }
@@ -629,46 +613,46 @@ Compiler::procSection()
 
     if(tipo != XML_READER_TYPE_END_ELEMENT)
     {
-        wstring const &id = attrib(COMPILER_ID_ATTR);
-        wstring const &type = attrib(COMPILER_TYPE_ATTR);
+        UString const &id = attrib(COMPILER_ID_ATTR);
+        UString const &type = attrib(COMPILER_TYPE_ATTR);
         requireAttribute(id, COMPILER_ID_ATTR, COMPILER_SECTION_ELEM);
         requireAttribute(type, COMPILER_TYPE_ATTR, COMPILER_SECTION_ELEM);
 
         current_section = id;
-        current_section += L"@";
+        current_section += '@';
         current_section.append(type);
     }
     else
     {
-        current_section = L"";
+      current_section.clear();
     }
 }
 
 void
 Compiler::procEntry()
 {
-    wstring atributo=this->attrib(COMPILER_RESTRICTION_ATTR);
-    wstring ignore = this->attrib(COMPILER_IGNORE_ATTR);
-    wstring altval = this->attrib(COMPILER_ALT_ATTR);
-    wstring varval = this->attrib(COMPILER_V_ATTR);
-    wstring varl   = this->attrib(COMPILER_VL_ATTR);
-    wstring varr   = this->attrib(COMPILER_VR_ATTR);
-
-    //���if entry is masked by a restriction of direction or an ignore mark
-    if((atributo != L"" && atributo != direction)
+    UString atributo=this->attrib(COMPILER_RESTRICTION_ATTR);
+    UString ignore = this->attrib(COMPILER_IGNORE_ATTR);
+    UString altval = this->attrib(COMPILER_ALT_ATTR);
+    UString varval = this->attrib(COMPILER_V_ATTR);
+    UString varl   = this->attrib(COMPILER_VL_ATTR);
+    UString varr   = this->attrib(COMPILER_VR_ATTR);
+
+    // if entry is masked by a restriction of direction or an ignore mark
+    if((!atributo.empty() && atributo != direction)
        || ignore == COMPILER_IGNORE_YES_VAL
-       || (altval != L"" && altval != alt)
-       || (direction == COMPILER_RESTRICTION_RL_VAL && varval != L"" && varval != variant)
-       || (direction == COMPILER_RESTRICTION_RL_VAL && varl != L"" && varl != variant_left)
-       || (direction == COMPILER_RESTRICTION_LR_VAL && varr != L"" && varr != variant_right))
+       || (!altval.empty() && altval != alt)
+       || (direction == COMPILER_RESTRICTION_RL_VAL && !varval.empty() && varval != variant)
+       || (direction == COMPILER_RESTRICTION_RL_VAL && !varl.empty() && varl != variant_left)
+       || (direction == COMPILER_RESTRICTION_LR_VAL && !varr.empty() && varr != variant_right))
     {
         // parse to the end of the entry
-        wstring name = L"";
+        UString name;
 
         while(name != COMPILER_ENTRY_ELEM)
         {
             xmlTextReaderRead(reader);
-            name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+            name = XMLParseUtil::readName(reader);
         }
 
         return;
@@ -681,14 +665,14 @@ Compiler::procEntry()
         int ret = xmlTextReaderRead(reader);
         if(ret != 1)
         {
-            wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-            wcerr << L"): Parse error." << endl;
+            cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+            cerr << "): Parse error." << endl;
             exit(EXIT_FAILURE);
         }
-        wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+        UString name = XMLParseUtil::readName(reader);
         skipBlanks(name);
 
-        if(current_paradigm == L"" && verbose)
+        if(current_paradigm.empty() && verbose)
         {
             first_element = true;
         }
@@ -712,12 +696,12 @@ Compiler::procEntry()
 
             // detecci���n del uso de paradigmas no definidos
 
-            wstring const &p = elements.rbegin()->paradigmName();
+            UString const &p = elements.rbegin()->paradigmName();
 
             if(paradigms.find(p) == paradigms.end())
             {
-                wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-                wcerr << L"): Undefined paradigm '" << p << L"'." <<endl;
+                cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+                cerr << "): Undefined paradigm '" << p << "'." <<endl;
                 exit(EXIT_FAILURE);
             }
             // descartar entradas con paradigms vac���os (por las direciones,
@@ -727,7 +711,7 @@ Compiler::procEntry()
                 while(name != COMPILER_ENTRY_ELEM || tipo != XML_READER_TYPE_END_ELEMENT)
                 {
                     xmlTextReaderRead(reader);
-                    name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+                    name = XMLParseUtil::readName(reader);
                     tipo = xmlTextReaderNodeType(reader);
                 }
                 return;
@@ -736,8 +720,8 @@ Compiler::procEntry()
         else if(name == COMPILER_ENTRY_ELEM && tipo == XML_READER_TYPE_END_ELEMENT)
         {
             /* INSERTING FINAL <$> HERE */
-            // list<int> wb;
-            // wb.push_back(alphabet(L"<$>"));
+            // vector<int> wb;
+            // wb.push_back(word_boundary);
             // EntryToken e;
             // e.setSingleTransduction(wb, wb);
             // elements.push_back(e);
@@ -748,9 +732,9 @@ Compiler::procEntry()
         }
         else
         {
-            wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-            wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << COMPILER_ENTRY_ELEM;
-            wcerr << L">'." << endl;
+            cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+            cerr << "): Invalid inclusion of '<" << name << ">' into '<" << COMPILER_ENTRY_ELEM;
+            cerr << ">'." << endl;
             exit(EXIT_FAILURE);
         }
 
@@ -760,12 +744,11 @@ Compiler::procEntry()
 void
 Compiler::procNode()
 {
-    xmlChar const *xnombre = xmlTextReaderConstName(reader);
-    wstring nombre = XMLParseUtil::towstring(xnombre);
+    UString nombre = XMLParseUtil::readName(reader);
 
-    // HACER: optimizar el orden de ejecuci���n de esta ristra de "ifs"
+    // HACER: optimizar el orden de ejecución de esta ristra de "ifs"
 
-    if(nombre == L"#text")
+    if(nombre == "#text"_u)
     {
         /* ignorar */
     }
@@ -801,14 +784,14 @@ Compiler::procNode()
     {
         procSection();
     }
-    else if(nombre == L"#comment")
+    else if(nombre == "#comment"_u)
     {
         /* ignorar */
     }
     else
     {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): Invalid node '<" << nombre << L">'." << endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): Invalid node '<" << nombre << ">'." << endl;
         exit(EXIT_FAILURE);
     }
 }
@@ -818,7 +801,7 @@ Compiler::procRegexp()
 {
     EntryToken et;
     xmlTextReaderRead(reader);
-    wstring re = XMLParseUtil::towstring(xmlTextReaderConstValue(reader));
+    UString re = XMLParseUtil::readValue(reader);
     et.setRegexp(re);
     xmlTextReaderRead(reader);
     return et;
@@ -828,7 +811,7 @@ void
 Compiler::write(FILE *output)
 {
     // letters
-    Compression::wstring_write(letters, output);
+    Compression::string_write(letters, output);
 
     // symbols
     alphabet.write(output);
@@ -836,16 +819,11 @@ Compiler::write(FILE *output)
     // transducers
     Compression::multibyte_write(sections.size(), output);
 
-    int conta=0;
-    for(map<wstring, Transducer, Ltstr>::iterator it = sections.begin(),
-        limit = sections.end();
-        it != limit; it++)
-    {
-        conta++;
-        wcout << it->first << " " << it->second.size();
-        wcout << " " << it->second.numberOfTransitions() << endl;
-        Compression::wstring_write(it->first, output);
-        it->second.write(output);
+    for (auto& it : sections) {
+        cout << it.first << " " << it.second.size();
+        cout << " " << it.second.numberOfTransitions() << endl;
+        Compression::string_write(it.first, output);
+        it.second.write(output);
     }
 }
 
diff --git a/src/lsx_compiler.h b/src/lsx_compiler.h
index 971c6fb..3956f16 100644
--- a/src/lsx_compiler.h
+++ b/src/lsx_compiler.h
@@ -20,12 +20,12 @@
 #include <lttoolbox/alphabet.h>
 #include <lttoolbox/regexp_compiler.h>
 #include <lttoolbox/entry_token.h>
-#include <lttoolbox/ltstr.h>
 #include <lttoolbox/transducer.h>
 
 #include <map>
 #include <string>
 #include <set>
+#include <cstdint>
 #include <libxml/xmlreader.h>
 
 using namespace std;
@@ -39,88 +39,95 @@ private:
     /**
      * The libxml2's XML reader
      */
-    xmlTextReaderPtr reader;
+    xmlTextReaderPtr reader = nullptr;
 
     /**
      * The alt value
      */
-    wstring alt;
+    UString alt;
 
     /**
      * The variant value (monodix)
      */
-    wstring variant;
+    UString variant;
 
     /**
      * The variant value (left side of bidix)
      */
-    wstring variant_left;
+    UString variant_left;
 
     /**
      * The variant value (right side of bidix)
      */
-    wstring variant_right;
+    UString variant_right;
 
     /**
      * The paradigm being compiled
      */
-    wstring current_paradigm;
+    UString current_paradigm;
 
     /**
      * The dictionary section being compiled
      */
-    wstring current_section;
+    UString current_section;
 
     /**
      * The direction of the compilation, 'lr' (left-to-right) or 'rl'
      * (right-to-left)
      */
-    wstring direction;
+    UString direction;
 
     /**
      * List of characters to be considered alphabetic
      */
-    wstring letters;
+    UString letters;
 
     /**
      * Set verbose mode: warnings which may or may not be correct
      */
-    bool verbose;
+    bool verbose = false;
 
     /**
      * First element (of an entry)
      */
-    bool first_element;
+    bool first_element = false;
 
     /**
      * Identifier of all the symbols during the compilation
      */
     Alphabet alphabet;
 
+    /**
+     * Special symbols
+     */
+    int32_t any_tag = 0;
+    int32_t any_char = 0;
+    int32_t word_boundary = 0;
+
     /**
      * List of named transducers-paradigms
      */
-    map<wstring, Transducer, Ltstr> paradigms;
+    map<UString, Transducer> paradigms;
 
     /**
      * List of named dictionary sections
      */
-    map<wstring, Transducer, Ltstr> sections;
+    map<UString, Transducer> sections;
 
     /**
      * List of named prefix copy of a paradigm
      */
-    map<wstring, map<wstring, int, Ltstr>, Ltstr> prefix_paradigms;
+    map<UString, map<UString, int>> prefix_paradigms;
 
     /**
      * List of named suffix copy of a paradigm
      */
-    map<wstring, map<wstring, int, Ltstr>, Ltstr> suffix_paradigms;
+    map<UString, map<UString, int>> suffix_paradigms;
 
     /**
      * List of named endings of a suffix copy of a paradgim
      */
-    map<wstring, map<wstring, int, Ltstr>, Ltstr> postsuffix_paradigms;
+    map<UString, map<UString, int>> postsuffix_paradigms;
 
 
     /*
@@ -175,7 +182,7 @@ private:
      * @param name the name of the attribute
      * @return the value of the attribute
      */
-    wstring attrib(wstring const &name);
+    UString attrib(UString const &name);
 
     /**
      * Construct symbol pairs by align left side of both parts and insert
@@ -186,7 +193,7 @@ private:
      * @param t the transducer
      * @return the last state of the inserted transduction
      */
-    int matchTransduction(list<int> const &lp, list<int> const &rp,
+    int matchTransduction(vector<int> const &lp, vector<int> const &rp,
                           int state, Transducer &t);
     /**
      * Parse the &lt;p&lt; element
@@ -217,7 +224,7 @@ private:
      * @param name the name of the node
      * @param elem the name of the expected node
      */
-    void skip(wstring &name, wstring const &elem);
+    void skip(UString &name, UString const &elem);
 
     /**
      * Skip all document #text nodes before "elem"
@@ -225,22 +232,22 @@ private:
      * @param elem the name of the expected node
      * @param open true for open element, false for closed
      */
-    void skip(wstring &name, wstring const &elem, bool open);
+    void skip(UString &name, UString const &elem, bool open);
 
     /**
      * Skip all blank #text nodes before "name"
      * @param name the name of the node
      */
-    void skipBlanks(wstring &name);
+    void skipBlanks(UString &name);
 
 
-    void readString(list<int> &result, wstring const &name);
+    void readString(vector<int> &result, UString const &name);
 
     /**
      * Force an element to be empty, and check for it
      * @param name the element
      */
-    void requireEmptyError(wstring const &name);
+    void requireEmptyError(UString const &name);
 
     /**
      * Force an attribute to be specified, amd check for it
@@ -249,8 +256,8 @@ private:
      * @param elemname the parent of the attribute
      */
 
-    void requireAttribute(wstring const &value, wstring const &attrname,
-                          wstring const &elemname);
+    void requireAttribute(UString const &value, UString const &attrname,
+                          UString const &elemname);
     /**
      * True if all the elements in the current node are blanks
      * @return true if all are blanks
@@ -263,60 +270,50 @@ public:
      * Constants to represent the element and the attributes of
      * dictionaries
      */
-    static wstring const COMPILER_DICTIONARY_ELEM;
-    static wstring const COMPILER_ALPHABET_ELEM;
-    static wstring const COMPILER_SDEFS_ELEM;
-    static wstring const COMPILER_SDEF_ELEM;
-    static wstring const COMPILER_N_ATTR;
-    static wstring const COMPILER_PARDEFS_ELEM;
-    static wstring const COMPILER_PARDEF_ELEM;
-    static wstring const COMPILER_PAR_ELEM;
-    static wstring const COMPILER_ENTRY_ELEM;
-    static wstring const COMPILER_RESTRICTION_ATTR;
-    static wstring const COMPILER_RESTRICTION_LR_VAL;
-    static wstring const COMPILER_RESTRICTION_RL_VAL;
-    static wstring const COMPILER_PAIR_ELEM;
-    static wstring const COMPILER_LEFT_ELEM;
-    static wstring const COMPILER_RIGHT_ELEM;
-    static wstring const COMPILER_S_ELEM;
-    static wstring const COMPILER_REGEXP_ELEM;
-    static wstring const COMPILER_SECTION_ELEM;
-    static wstring const COMPILER_ID_ATTR;
-    static wstring const COMPILER_TYPE_ATTR;
-    static wstring const COMPILER_IDENTITY_ELEM;
-    static wstring const COMPILER_JOIN_ELEM;
-    static wstring const COMPILER_BLANK_ELEM;
-    static wstring const COMPILER_POSTGENERATOR_ELEM;
-    static wstring const COMPILER_GROUP_ELEM;
-    static wstring const COMPILER_LEMMA_ATTR;
-    static wstring const COMPILER_IGNORE_ATTR;
-    static wstring const COMPILER_IGNORE_YES_VAL;
-    static wstring const COMPILER_ALT_ATTR;
-    static wstring const COMPILER_V_ATTR;
-    static wstring const COMPILER_VL_ATTR;
-    static wstring const COMPILER_VR_ATTR;
-
-    static wstring const COMPILER_ANYTAG_ELEM;
-    static wstring const COMPILER_ANYCHAR_ELEM;
-    static wstring const COMPILER_WB_ELEM;
-
-
-    /**
-     * Constructor
-     */
-    Compiler();
-
-    /**
-     * Destructor
-     */
-    ~Compiler();
+    static UString const COMPILER_DICTIONARY_ELEM;
+    static UString const COMPILER_ALPHABET_ELEM;
+    static UString const COMPILER_SDEFS_ELEM;
+    static UString const COMPILER_SDEF_ELEM;
+    static UString const COMPILER_N_ATTR;
+    static UString const COMPILER_PARDEFS_ELEM;
+    static UString const COMPILER_PARDEF_ELEM;
+    static UString const COMPILER_PAR_ELEM;
+    static UString const COMPILER_ENTRY_ELEM;
+    static UString const COMPILER_RESTRICTION_ATTR;
+    static UString const COMPILER_RESTRICTION_LR_VAL;
+    static UString const COMPILER_RESTRICTION_RL_VAL;
+    static UString const COMPILER_PAIR_ELEM;
+    static UString const COMPILER_LEFT_ELEM;
+    static UString const COMPILER_RIGHT_ELEM;
+    static UString const COMPILER_S_ELEM;
+    static UString const COMPILER_REGEXP_ELEM;
+    static UString const COMPILER_SECTION_ELEM;
+    static UString const COMPILER_ID_ATTR;
+    static UString const COMPILER_TYPE_ATTR;
+    static UString const COMPILER_IDENTITY_ELEM;
+    static UString const COMPILER_JOIN_ELEM;
+    static UString const COMPILER_BLANK_ELEM;
+    static UString const COMPILER_POSTGENERATOR_ELEM;
+    static UString const COMPILER_GROUP_ELEM;
+    static UString const COMPILER_LEMMA_ATTR;
+    static UString const COMPILER_IGNORE_ATTR;
+    static UString const COMPILER_IGNORE_YES_VAL;
+    static UString const COMPILER_ALT_ATTR;
+    static UString const COMPILER_V_ATTR;
+    static UString const COMPILER_VL_ATTR;
+    static UString const COMPILER_VR_ATTR;
+
+    static UString const COMPILER_ANYTAG_ELEM;
+    static UString const COMPILER_ANYCHAR_ELEM;
+    static UString const COMPILER_WB_ELEM;
+
 
     /**
      * Compile dictionary to letter transducers
      * @param fichero file
      * @param dir direction
      */
-    void parse(string const &fichero, wstring const &dir);
+    void parse(string const &fichero, UString const &dir);
 
     //  auto getAlt();
     //  auto getInt();
diff --git a/src/lsx_proc.cc b/src/lsx_proc.cc
index cdc8094..eaf327c 100644
--- a/src/lsx_proc.cc
+++ b/src/lsx_proc.cc
@@ -28,8 +28,8 @@ int main (int argc, char** argv)
   LtLocale::tryToSetLocale();
   
   LSXProcessor fstp;
-  FILE* input = stdin;
-  FILE* output = stdout;
+  InputFile input;
+  UFILE* output = u_finit(stdout, NULL, NULL);
 
 #if HAVE_GETOPT_LONG
   static struct option long_options[]=
@@ -72,22 +72,18 @@ int main (int argc, char** argv)
   }
   FILE* fst = fopen(argv[optind], "rb");
   if(!fst) {
-    wcerr << "Error: Cannot open file '" << argv[optind] << "' for reading." << endl;
+    cerr << "Error: Cannot open file '" << argv[optind] << "' for reading." << endl;
     exit(EXIT_FAILURE);
   }
   fstp.load(fst);
 
   if (optind <= (argc - 2)) {
-    input = fopen(argv[optind+1], "rb");
-    if (input == NULL || ferror(input)) {
-      wcerr << "Error: Cannot open file '" << argv[optind+1] << "' for reading." << endl;
-      exit(EXIT_FAILURE);
-    }
+    input.open_or_exit(argv[optind+1]);
   }
   if (optind <= (argc - 3)) {
-    output = fopen(argv[optind+2], "wb");
-    if (output == NULL || ferror(output)) {
-      wcerr << "Error: Cannot open file '" << argv[optind+2] << "' for writing." << endl;
+    output = u_fopen(argv[optind+2], "w", NULL, NULL);
+    if (output == NULL) {
+      cerr << "Error: Cannot open file '" << argv[optind+2] << "' for writing." << endl;
     }
   }
   
diff --git a/src/lsx_processor.cc b/src/lsx_processor.cc
index 7f68dc5..ce1eaf4 100644
--- a/src/lsx_processor.cc
+++ b/src/lsx_processor.cc
@@ -1,20 +1,21 @@
 #include "lsx_processor.h"
 
 #include <lttoolbox/compression.h>
+#include <cstring>
 
 LSXProcessor::LSXProcessor()
 {
-  escaped_chars.insert(L'[');
-  escaped_chars.insert(L']');
-  escaped_chars.insert(L'{');
-  escaped_chars.insert(L'}');
-  escaped_chars.insert(L'^');
-  escaped_chars.insert(L'$');
-  escaped_chars.insert(L'/');
-  escaped_chars.insert(L'\\');
-  escaped_chars.insert(L'@');
-  escaped_chars.insert(L'<');
-  escaped_chars.insert(L'>');
+  escaped_chars.insert('[');
+  escaped_chars.insert(']');
+  escaped_chars.insert('{');
+  escaped_chars.insert('}');
+  escaped_chars.insert('^');
+  escaped_chars.insert('$');
+  escaped_chars.insert('/');
+  escaped_chars.insert('\\');
+  escaped_chars.insert('@');
+  escaped_chars.insert('<');
+  escaped_chars.insert('>');
 
   null_flush = false;
   dictionary_case = false;
@@ -46,18 +47,18 @@ LSXProcessor::load(FILE *input)
   int len = Compression::multibyte_read(input);
   while(len > 0)
   {
-    alphabetic_chars.insert(static_cast<wchar_t>(Compression::multibyte_read(input)));
+    alphabetic_chars.insert(static_cast<UChar32>(Compression::multibyte_read(input)));
     len--;
   }
 
   // symbols
   alphabet.read(input);
-  word_boundary = alphabet(L"<$>");
-  any_char = alphabet(L"<ANY_CHAR>");
-  any_tag = alphabet(L"<ANY_TAG>");
+  word_boundary = alphabet("<$>"_u);
+  any_char = alphabet("<ANY_CHAR>"_u);
+  any_tag = alphabet("<ANY_TAG>"_u);
 
   len = Compression::multibyte_read(input);
-  Compression::wstring_read(input); // name
+  Compression::string_read(input); // name
   // there should only be 1 transducer in the file
   // so ignore any subsequent ones
   trans.read(input, alphabet);
@@ -67,65 +68,65 @@ LSXProcessor::load(FILE *input)
 }
 
 void
-LSXProcessor::readNextLU(FILE* input)
+LSXProcessor::readNextLU(InputFile& input)
 {
-  vector<wstring> parts = vector<wstring>(3);
+  vector<UString> parts = vector<UString>(3);
   int loc = 0; // 0 = blank, 1 = bound blank, 2 = LU
   bool box = false; // are we in a [ ] blank
-  while(!feof(input))
+  while(!input.eof())
   {
-    wchar_t c = fgetwc_unlocked(input);
-    if ((unsigned int)c == WEOF) {
+    UChar32 c = input.get();
+    if ((unsigned int)c == U_EOF) {
         break;
     }
-    if(null_flush && c == L'\0')
+    if(null_flush && c == '\0')
     {
       at_end = true;
       at_null = true;
       break;
     }
-    else if(c == L'\\')
+    else if(c == '\\')
     {
       parts[loc] += c;
-      c = fgetwc_unlocked(input);
+      c = input.get();
       parts[loc] += c;
     }
     else if(loc == 0 && box)
     {
-      if(c == L']')
+      if(c == ']')
       {
         box = false;
       }
       parts[loc] += c;
     }
-    else if(loc == 0 && c == L'[')
+    else if(loc == 0 && c == '[')
     {
-      c = fgetwc_unlocked(input);
-      if(c == L'[')
+      c = input.get();
+      if(c == '[')
       {
         loc = 1;
       }
       else
       {
-        parts[loc] += L'[';
+        parts[loc] += '[';
         parts[loc] += c;
-        if(c != L']')
+        if(c != ']')
         {
           box = true;
         }
-        if(c == L'\\')
+        if(c == '\\')
         {
-          parts[loc] += fgetwc_unlocked(input);
+          parts[loc] += input.get();
         }
       }
     }
-    else if(loc == 1 && c == L']')
+    else if(loc == 1 && c == ']')
     {
-      c = fgetwc_unlocked(input);
-      if(c == L']')
+      c = input.get();
+      if(c == ']')
       {
-        c = fgetwc_unlocked(input);
-        if(c == L'^')
+        c = input.get();
+        if(c == '^')
         {
           loc = 2;
         }
@@ -134,25 +135,25 @@ LSXProcessor::readNextLU(FILE* input)
           // this situation is invalid
           // but I like making parsers harder to break than required
           // by the standard
-          parts[loc] += L"]]";
+          parts[loc] += "]]"_u;
           parts[loc] += c;
         }
       }
       else
       {
-        parts[loc] += L']';
+        parts[loc] += ']';
         parts[loc] += c;
-        if(c == L'\\')
+        if(c == '\\')
         {
-          parts[loc] += fgetwc_unlocked(input);
+          parts[loc] += input.get();
         }
       }
     }
-    else if(loc == 0 && c == L'^')
+    else if(loc == 0 && c == '^')
     {
       loc = 2;
     }
-    else if(loc == 2 && c == L'$')
+    else if(loc == 2 && c == '$')
     {
       break;
     }
@@ -161,7 +162,7 @@ LSXProcessor::readNextLU(FILE* input)
       parts[loc] += c;
     }
   }
-  if(feof(input))
+  if(input.eof())
   {
     at_end = true;
   }
@@ -171,7 +172,7 @@ LSXProcessor::readNextLU(FILE* input)
 }
 
 void
-LSXProcessor::processWord(FILE* input, FILE* output)
+LSXProcessor::processWord(InputFile& input, UFILE* output)
 {
   if(lu_queue.size() == 0)
   {
@@ -180,14 +181,14 @@ LSXProcessor::processWord(FILE* input, FILE* output)
   if(at_end && lu_queue.size() == 1 && lu_queue.back().size() == 0)
   {
     // we're at the final blank, no more work to do
-    fputws_unlocked(blank_queue.back().c_str(), output);
+    write(blank_queue.back(), output);
     blank_queue.pop_front();
     bound_blank_queue.pop_front();
     lu_queue.pop_front();
     return;
   }
   size_t last_final = 0;
-  wstring last_final_out;
+  UString last_final_out;
   State s;
   s.init(trans.getInitial());
   size_t idx = 0;
@@ -203,7 +204,7 @@ LSXProcessor::processWord(FILE* input, FILE* output)
       }
       readNextLU(input);
     }
-    wstring lu = lu_queue[idx];
+    UString lu = lu_queue[idx];
     if(lu.size() == 0)
     {
       break;
@@ -214,22 +215,22 @@ LSXProcessor::processWord(FILE* input, FILE* output)
     }
     for(size_t i = 0; i < lu.size(); i++)
     {
-      if(lu[i] == L'<')
+      if(lu[i] == '<')
       {
         size_t j = i+1;
         for(; j < lu.size(); j++)
         {
-          if(lu[j] == L'\\')
+          if(lu[j] == '\\')
           {
             j++;
           }
-          else if(lu[j] == L'>')
+          else if(lu[j] == '>')
           {
             j++;
             break;
           }
         }
-        wstring tag = lu.substr(i, j-i);
+        UString tag = lu.substr(i, j-i);
         i = j-1;
         if(!alphabet.isSymbolDefined(tag))
         {
@@ -239,7 +240,7 @@ LSXProcessor::processWord(FILE* input, FILE* output)
       }
       else
       {
-        if(lu[i] == L'\\')
+        if(lu[i] == '\\')
         {
           i++;
         }
@@ -258,28 +259,24 @@ LSXProcessor::processWord(FILE* input, FILE* output)
   }
   if(last_final == 0)
   {
-    fputws_unlocked(blank_queue.front().c_str(), output);
+    write(blank_queue.front(), output);
     blank_queue.pop_front();
-    if(bound_blank_queue.front().size() > 0)
+    if(!bound_blank_queue.front().empty())
     {
-      fputws_unlocked(L"[[", output);
-      fputws_unlocked(bound_blank_queue.front().c_str(), output);
-      fputws_unlocked(L"]]", output);
+      u_fprintf(output, "[[%S]]", bound_blank_queue.front().c_str());
     }
     bound_blank_queue.pop_front();
-    fputwc_unlocked(L'^', output);
-    fputws_unlocked(lu_queue.front().c_str(), output);
-    fputwc_unlocked(L'$', output);
+    u_fprintf(output, "^%S$", lu_queue.front().c_str());
     lu_queue.pop_front();
     return;
   }
-  vector<wstring> out_lus;
+  vector<UString> out_lus;
   size_t pos = 0;
-  while(pos != wstring::npos && pos != last_final_out.size())
+  while(pos != UString::npos && pos != last_final_out.size())
   {
     size_t start = pos;
-    pos = last_final_out.find(L"<$>", start);
-    if(pos == wstring::npos)
+    pos = last_final_out.find("<$>"_u, start);
+    if(pos == UString::npos)
     {
       out_lus.push_back(last_final_out.substr(start));
     }
@@ -290,26 +287,26 @@ LSXProcessor::processWord(FILE* input, FILE* output)
     }
   }
   
-  wstring wblank;
+  UString wblank;
   for(size_t i = 0; i < last_final; i++)
   {
     if(!bound_blank_queue[i].empty())
     {
       if(wblank.empty())
       {
-        wblank += L"[[";
+        wblank += "[["_u;
       }
       else
       {
-        wblank += L"; ";
+        wblank += "; "_u;
       }
       
-      wblank += bound_blank_queue[i].c_str();
+      wblank += bound_blank_queue[i];
     }
   }
   if(!wblank.empty())
   {
-    wblank += L"]]";
+    wblank += "]]"_u;
   }
   
   size_t i = 0;
@@ -317,22 +314,22 @@ LSXProcessor::processWord(FILE* input, FILE* output)
   {
     if(i < last_final)
     {
-      fputws_unlocked(blank_queue[i].c_str(), output);
+      write(blank_queue[i], output);
     }
     else
     {
-      fputwc_unlocked(L' ', output);
+      u_fputc(' ', output);
     }
-    fputws_unlocked(wblank.c_str(), output);
-    fputwc_unlocked(L'^', output);
-    fputws_unlocked(out_lus[i].c_str(), output);
-    fputwc_unlocked(L'$', output);
+    write(wblank, output);
+    u_fputc('^', output);
+    write(out_lus[i], output);
+    u_fputc('$', output);
   }
   for(; i < last_final; i++)
   {
-    if(blank_queue[i] != L" ")
+    if(blank_queue[i] != " "_u)
     {
-      fputws_unlocked(blank_queue[i].c_str(), output);
+      write(blank_queue[i], output);
     }
   }
   blank_queue.erase(blank_queue.begin(), blank_queue.begin()+last_final);
@@ -341,7 +338,7 @@ LSXProcessor::processWord(FILE* input, FILE* output)
 }
 
 void
-LSXProcessor::process(FILE* input, FILE* output)
+LSXProcessor::process(InputFile& input, UFILE* output)
 {
   while(true)
   {
@@ -351,12 +348,8 @@ LSXProcessor::process(FILE* input, FILE* output)
     }
     if(at_null)
     {
-      fputwc_unlocked(L'\0', output);
-      int code = fflush(output);
-      if(code != 0)
-      {
-        wcerr << L"Could not flush output " << errno << endl;
-      }
+      u_fputc('\0', output);
+      u_fflush(output);
       at_end = false;
       at_null = false;
     }
diff --git a/src/lsx_processor.h b/src/lsx_processor.h
index 90d5a47..264a5dd 100644
--- a/src/lsx_processor.h
+++ b/src/lsx_processor.h
@@ -2,10 +2,11 @@
 #define _LSX_PROCESSOR_H_
 
 #include <lttoolbox/alphabet.h>
-#include <lttoolbox/ltstr.h>
+#include <lttoolbox/input_file.h>
 #include <lttoolbox/my_stdio.h>
 #include <lttoolbox/state.h>
 #include <lttoolbox/trans_exe.h>
+#include <unicode/ustdio.h>
 #include <deque>
 
 class LSXProcessor
@@ -13,8 +14,8 @@ class LSXProcessor
 private:
   TransExe trans;
   State initial_state;
-  set<wchar_t> escaped_chars;
-  set<wchar_t> alphabetic_chars;
+  set<UChar32> escaped_chars;
+  set<UChar32> alphabetic_chars;
   map<Node *, double> all_finals;
   Alphabet alphabet;
   bool null_flush;
@@ -22,12 +23,12 @@ private:
   bool at_end;
   bool at_null;
 
-  deque<wstring> blank_queue;
-  deque<wstring> bound_blank_queue;
-  deque<wstring> lu_queue;
+  deque<UString> blank_queue;
+  deque<UString> bound_blank_queue;
+  deque<UString> lu_queue;
 
-  void readNextLU(FILE* input);
-  void processWord(FILE* input, FILE* output);
+  void readNextLU(InputFile& input);
+  void processWord(InputFile& input, UFILE* output);
 
   int word_boundary;
   int any_char;
@@ -35,7 +36,7 @@ private:
 public:
   LSXProcessor();
   void load(FILE* input);
-  void process(FILE* input, FILE* output);
+  void process(InputFile& input, UFILE* output);
   void setNullFlush(bool val)
   {
     null_flush = val;
diff --git a/src/processor.cc b/src/processor.cc
deleted file mode 100644
index aab265e..0000000
--- a/src/processor.cc
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <cwchar>
-#include <cstdio>
-#include <cerrno>
-#include <string>
-#include <iostream>
-#include <list>
-#include <set>
-
-#include <lttoolbox/ltstr.h>
-#include <lttoolbox/lt_locale.h>
-#include <lttoolbox/transducer.h>
-#include <lttoolbox/compression.h>
-#include <lttoolbox/alphabet.h>
-#include <lttoolbox/state.h>
-#include <lttoolbox/trans_exe.h>
-
-wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2);
-
-
-/* get the text between delim1 and delim2 */
-/* next_token() */
-wstring
-readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2)
-{
-    wstring result = L"";
-    result += delim1;
-    wchar_t c = delim1;
-
-    while(!feof(input) && c != delim2)
-    {
-        c = static_cast<wchar_t>(fgetwc(input)); //fget_unlocked
-        result += c;
-    }
-
-    return result;
-}
-
-/***
-main
-***/
-int main (int argc, char** argv)
-{
-    Alphabet alphabet;
-    TransExe transducer;
-
-    LtLocale::tryToSetLocale();
-    FILE *fst = fopen(argv[1], "r");
-
-    set<wchar_t> alphabetic_chars;
-    int len = Compression::multibyte_read(fst);
-    while(len > 0)
-    {
-        alphabetic_chars.insert(static_cast<wchar_t>(Compression::multibyte_read(fst)));
-        len--;
-    }
-
-    alphabet.read(fst);
-    wcout << L"alphabet_size: " << alphabet.size() << endl;
-
-    len = Compression::multibyte_read(fst);
-    len = Compression::multibyte_read(fst);
-    wcout << len << endl;
-    wstring name = L"";
-    while(len > 0)
-    {
-        name += static_cast<wchar_t>(Compression::multibyte_read(fst));
-        len--;
-    }
-    wcout << name << endl;
-
-    transducer.read(fst, alphabet);
-
-    FILE *input = stdin;
-    FILE *output = stdout;
-
-    /* preparing for processing */
-    vector<State> alive_states; //A set of alive states is maintained to compute all the possible ways to
-    set<Node *> anfinals; //alive node finals ?
-    set<wchar_t> escaped_chars;
-
-    State* initial_state = new State();
-    initial_state->init(transducer.getInitial()); // getInitial() returns an int
-    anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end());
-
-    set<int> final_states = transducer.getFinals();
-    for(auto final_state : final_states) {
-        final_state.init(transducer.getInitial()); //initialize
-    }
-
-
-    /* processing */
-
-    vector<State> new_states;
-    alive_states.push_back(*initial_state);
-    // TODO: insert the other states
-    // TODO: insert the final state
-
-    int line_number = 0;
-    bool accepted = true;
-    while(!feof(input)) // while true
-    {
-        //initialize conditions
-        int tag_count = 0;
-        State* current_state = initial_state;
-        bool in_lemma = false;
-        bool in_take = false;
-        bool in_out = false;
-
-        while (alive_states.size() > 1 and !isFinal(current_state)) {
-            //get the next token
-            int val = fgetwc(input); // read 1 wide char
-            bool is_tag = false;
-            if(val == L'<') // if in tag, get the whole tag
-            {
-                in_lemma = false;
-                is_tag = true;
-                wstring tag = L"";
-                tag = readFullBlock(input, L'<', L'>');
-                val = static_cast<int>(alphabet(tag));
-
-                tag_count++;
-
-                cout << "val before: " << val << endl;
-                cout << "tag_count: " << tag_count << endl;
-
-                if(val == 0 && tag_count > 2) //TODO: val==0?
-                {
-                    val = static_cast<int>(alphabet(L"<ANY_TAG>"));
-                }
-
-                cout << "val after: " << val << endl;
-                fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val);
-
-                if (tag == '<sent>') {
-                    accepted = true;
-                }
-            }
-            else if(in_lemma && !in_take && !in_out) {
-                val == static_cast<int>(alphabet(L"&"));
-            }
-
-            // if (current_state == initial_state && not eof) {
-                //successfully reached eof
-                //exit()
-
-            if (current_state == initial_state && val != '\n') {
-                accepted = true;
-                break;
-            } else if (val == '\n') { //or sent
-                accepted = true;
-            }
-
-            //step into the next state
-            for(vector<State>::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) { //step //for every state in alive_states
-                State s = *it;
-
-                if (tag_count > 2) {
-                    s.step(val, alphabet(L"<ANY_TAG>"));
-                } else {
-                    s.step(val)
-                }
-
-                if(s.size() > 0)
-                {
-                    new_states.push_back(s);
-                }
-                wcout << (wchar_t) val << L" " << L"size: " << s.size() << L" final: " << s.isFinal(anfinals) << endl;
-            }
-
-            alive_states.swap(new_states);
-        }
-        return 0;
-    }
diff --git a/src/transducer.py b/src/transducer.py
deleted file mode 100644
index 77fc1b4..0000000
--- a/src/transducer.py
+++ /dev/null
@@ -1,189 +0,0 @@
-#usage: python transducer.py testfile.txt
-
-import sys
-
-transitions = {
-    (-1,'^') : 0,
-    (0,'t') : 1,
-    (1,'a') : 2,
-    (2,'k') : 3,
-    (3,'e') : 4,
-    (4,'<vblex>') : 5,
-    (5,'<ANY_TAG>') : 6,
-    (6,'<ANY_TAG>') : 7,
-    (6,'$') : 8,
-    (7,'<ANY_TAG>') : 7,
-    (7,'$'): 8,
-    (8,' ') : 9,
-    (9,'^') : 10,
-    (10,'&') : 11,
-    (11,'&') : 11,
-    (11,'<n>') : 12,
-    (11,'<adj>') : 13,
-    (11,'<det>') : 14,
-    (11,'<prn>') : 15,
-    (11,'<np>'): 16,
-    (12,'<ANY_TAG>') : 200,
-    (200,'<ANY_TAG>') : 201,
-    (200,'$') : 17,
-    (201,'<ANY_TAG>') : 201,
-    (201,'$') : 17,
-    (13,'<ANY_TAG>') : 225,
-    (13,'$') : 250,
-    (225,'<ANY_TAG>') : 225,
-    (225,'$') : 250,
-    (250,' '):251,
-    (251,'^'):252,
-    (252,'&'):253,
-    (253,'&'):253,
-    (253,'<n>'):12,
-    (253,'<adj>'):13,
-    (14,'<ANY_TAG>') : 275,
-    (275,'<ANY_TAG>') : 276,
-    (275,'$') : 250,
-    (276,'<ANY_TAG>') : 276,
-    (276,'$') : 250,
-    (15,'<ANY_TAG>') : 200,
-    (16,'<ANY_TAG>'): 200,
-    (100,'<ANY_TAG>') : 100,
-    (100,'$') : 17,
-    (17,' ') : 18, #do not go to state 17 unless you are expecting 'out' to be the next word
-    (18,'^') : 19,
-    (19,'o') : 20,
-    (20,'u') : 21,
-    (21,'t') : 22,
-    (22,'<adv>') : 23,
-    (22,'<pr>') : 24,
-    (23,'$') : 25,
-    (24,'$') : 25,
-    (25,'') : 26,
-    (25,' ') : 26,
-    (25,'\n') : 26,
-    (25,'^') : 27,
-    (27,'.') : 28,
-    (28,'<sent>') : 29,
-    (29,'$') : 25
-}
-
-#<ANY_TAG_A> is required
-#<ANY_TAG_B> is optional
-states = {
-    -1 : '',
-    0 : '^',
-    1 : 't',
-    2 : 'a',
-    3 : 'k',
-    4 : 'e',
-    5 : '<vblex>',
-    6 : '<ANY_TAG_A>', #secondary tag is necessary
-    7 : '<ANY_TAG_B>', #third, fourth, fifth...tags are optional
-    8 : '$',
-    9 : ' ',
-    10 : '^',
-    11 : '&', #represents any character 'ANY_CHAR
-    12 : '<n>',
-    13 : '<adj>',
-    14 : '<det>',
-    15 : '<prn>',
-    16 : '<np>',
-    100: '<ANY_TAG_B>',
-    200: '<ANY_TAG_A>',
-    201: '<ANY_TAG_B>',
-    225: '<ANY_TAG_B>',
-    250: '$',
-    251: ' ',
-    252: '^',
-    253: '&',
-    275: '<ANY_TAG_A>',
-    276: '<ANY_TAG_B>',
-    17 : '$',
-    18 : ' ',
-    19 : '^',
-    20 : 'o',
-    21 : 'u',
-    22 : 't',
-    23 : '<adv>',
-    24 : '<pr>',
-    25 : '$',
-    26 : '\n',
-    27 : '^',
-    28 : '.',
-    29 : '<sent>',
-
-}
-
-def next_token(file, subsequent_tag, in_lemma, in_take, in_out):
-    original_token = file.read(1)
-    modified_token = original_token
-    if original_token == '<': #if in tag
-        in_lemma = False
-        c = ''
-        while c != '>':
-            c = file.read(1)
-            original_token += c
-            modified_token += c
-        if subsequent_tag:
-            modified_token = '<ANY_TAG>'
-    if in_lemma and not in_take and not in_out:
-        modified_token = '&' #ANY_CHAR
-    return original_token, modified_token
-
-def step(state, token): #token is at the next state
-    next_state = transitions.get((state,token))
-    output_token = states.get(next_state)
-    return next_state, output_token #return the next state, or None if it doesn't exist
-
-def main():
-    f = open(sys.argv[1])
-    line_number = 0
-    accepted = True
-    while True:
-        line = ''
-        if accepted:
-            line_number += 1
-        current_state = -1
-
-        subsequent_tag = False
-        in_lemma = False
-        in_take = False
-        in_out = False
-
-        while states.get(current_state) != None and current_state != 26:
-            original_token, modified_token = next_token(f, subsequent_tag, in_lemma, in_take, in_out)
-            if current_state == -1 and modified_token == '':
-                print('successfully reached end of file')
-                exit(0)
-            elif current_state == -1 and modified_token == '\n':
-                accepted = True
-                break
-            elif modified_token == '\n':
-                accepted = True
-
-            current_state, output_token = step(current_state, modified_token)
-            if output_token == None:
-                break
-
-            line += original_token
-
-            subsequent_tag = current_state in [5, 6, 7, 12, 13, 14, 15, 16, 100, 200, 201, 225, 275, 276]
-            in_lemma = current_state in [1, 2, 3, 10, 11, 252, 253, 19, 20, 21, 22]
-            in_take = current_state in [1, 2, 3, 4]
-            if current_state == 19:
-                pos = f.tell() #store the current buffer position
-                peek = f.read(4) #read in the next 4 chars
-                f.seek(pos) #return to the original position
-                if peek == 'out<':
-                    in_out = True
-
-        if current_state == 26:
-            print str(line_number) + '   ' + line
-            accepted = True
-        else:
-            if accepted:
-                print str(line_number) + '   string not accepted \n'
-                accepted = False
-                current_state = -1
-                line_number += 1
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
diff --git a/src/transducer2.cc b/src/transducer2.cc
deleted file mode 100644
index 7042095..0000000
--- a/src/transducer2.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-#include <cwchar>
-#include <cstdio>
-#include <cerrno>
-#include <string>
-#include <iostream>
-#include <list>
-#include <set>
-#include <regex>
-
-#include <lttoolbox/ltstr.h>
-#include <lttoolbox/lt_locale.h>
-#include <lttoolbox/transducer.h>
-#include <lttoolbox/compression.h>
-#include <lttoolbox/alphabet.h>
-#include <lttoolbox/state.h>
-#include <lttoolbox/trans_exe.h>
-
-using namespace std;
-
-int main (int argc, char** argv) {
-    Alphabet alphabet;
-
-    LtLocale::tryToSetLocale();
-
-    alphabet.includeSymbol(L"<vblex>");
-    alphabet.includeSymbol(L"<n>");
-    alphabet.includeSymbol(L"<adj>");
-    alphabet.includeSymbol(L"<det>");
-    alphabet.includeSymbol(L"<prn>");
-    alphabet.includeSymbol(L"<np>");
-
-    alphabet.includeSymbol(L"<ANY_TAG>");
-    alphabet.includeSymbol(L"<ANY_CHAR>");
-    alphabet.includeSymbol(L"<$>");
-
-    int vblex_sym = alphabet(L"<vblex>");
-    int n_sym = alphabet(L"<n>");
-    int adj_sym = alphabet(L"<adj>");
-    int det_sym = alphabet(L"<det>");
-    int prn_sym = alphabet(L"<prn>");
-    int np_sym = alphabet(L"<np>");
-
-    int any_tag = alphabet(L"<ANY_TAG>");
-    int any_char = alphabet(L"<ANY_CHAR>");
-    int wb_sym = alphabet(L"<$>");
-
-    /* reap from input file */
-    for (string line; getline(cin, line);) {
-        Transducer t;
-        string first_token = line.substr(0, line.find(' '));
-        string second_token = line.substr(line.find(' ') + 1);
-
-        /* noun phrase acceptor: see README */
-
-        int initial = t.getInitial();
-        int take_out = initial;
-        for (wchar_t c : first_token) {
-            take_out = t.insertSingleTransduction(alphabet(c,c), take_out);
-        }
-        take_out = t.insertSingleTransduction(alphabet(0,L'#'), take_out);
-        take_out = t.insertSingleTransduction(alphabet(0,L' '), take_out);
-        for (wchar_t c : second_token) {
-            take_out = t.insertSingleTransduction(alphabet(0,c), take_out);
-        }
-        take_out = t.insertSingleTransduction(alphabet(vblex_sym,vblex_sym), take_out);
-        int loop = take_out;
-        take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop);
-        t.linkStates(take_out, loop, 0);
-        take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out);
-
-        int after_takeout = take_out;
-
-        /* no det */
-        int from_nodet = after_takeout;
-
-        /* first lemma */
-        loop = after_takeout;
-        take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop);
-        t.linkStates(take_out, loop, 0);
-
-        int first_lm = take_out;
-
-        /* prn */
-        take_out = t.insertSingleTransduction(alphabet(prn_sym,prn_sym), first_lm);
-
-        loop = take_out;
-        take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop);
-        t.linkStates(take_out, loop, 0);
-
-        take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out);
-
-        int after_prn = take_out;
-
-        /* np */
-        take_out = t.insertSingleTransduction(alphabet(np_sym,np_sym), first_lm);
-
-        loop = take_out;
-        take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop);
-        t.linkStates(take_out, loop, 0);
-
-        take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out);
-
-        int after_np = take_out;
-
-        /* det */
-        take_out = t.insertSingleTransduction(alphabet(det_sym,det_sym), first_lm);
-
-        loop = take_out;
-        take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop);
-        t.linkStates(take_out, loop, 0);
-
-        take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out);
-
-        int after_det = take_out;
-
-        /* no adj */
-        int from_noadj = take_out; //same as after_det
-
-        /* lemma for the adj */
-        loop = after_det;
-        take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop);
-        t.linkStates(take_out, loop, 0);
-
-        int lm_adj = take_out;
-
-        /* adj */
-        take_out = t.insertSingleTransduction(alphabet(adj_sym,adj_sym), lm_adj);
-
-        int optional_adj = take_out;
-
-        loop = take_out;
-        take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop);
-        t.linkStates(take_out, loop, 0);
-
-        //may not have a second tag
-        t.linkStates(optional_adj, take_out, 0);
-
-        take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out);
-
-        int after_adj = take_out;
-
-        /* lemma for the noun */
-        loop = after_adj;
-        take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop);
-        t.linkStates(take_out, loop, 0);
-
-        int lm_noun = take_out;
-
-        /* possible subsequent adj */
-        t.linkStates(lm_noun, lm_adj, alphabet(adj_sym,adj_sym));
-
-        /* n */
-        take_out = t.insertSingleTransduction(alphabet(n_sym,n_sym), lm_noun);
-
-        loop = take_out;
-        take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop);
-        t.linkStates(take_out, loop, 0);
-
-        take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out);
-
-        /* out */
-        int before_out = take_out;
-
-        for (wchar_t c : second_token) {
-            take_out = t.insertSingleTransduction(alphabet(c,0), take_out);
-        }
-        take_out = t.insertSingleTransduction(alphabet(any_tag, 0), take_out);
-        take_out = t.insertSingleTransduction(alphabet(wb_sym,0), take_out);
-
-        t.setFinal(take_out);
-
-        /* final link states */
-        t.linkStates(after_takeout, before_out, 0);
-        t.linkStates(after_prn, before_out, 0);
-        t.linkStates(after_np, before_out, 0);
-        t.linkStates(from_nodet, after_det, 0);
-        t.linkStates(from_noadj, after_adj, 0);
-
-        string filename = regex_replace(line,std::regex("\\s+"), "") + ".fst";
-        FILE* fst = fopen(filename.c_str(), "w+");
-        // First write the letter symbols of the alphabet
-        Compression::wstring_write(L"abcdefghijklmnopqrstuvwxyz", fst);
-        // Then write the multicharacter symbols
-        alphabet.write(fst);
-        // Then write then number of transducers
-        Compression::multibyte_write(1, fst);
-        // Then write the name of the transducer
-        Compression::wstring_write(L"main@standard", fst);
-        // Then write the transducer
-        t.write(fst);
-        cout << line << " t.size(): " << t.size() << endl ;
-        fclose(fst);
-    }
-
-    return 0;
-}
\ No newline at end of file