commit 7be2cae1285391ddb214101fac72ae92ca39f594
Author: Daniel Swanson <popcorn.tomato.dude@gmail.com>
Date:   Fri Jun 11 18:38:07 2021 -0500

    use ICU UString/UChar in place of wstring/wchar_t

diff --git a/configure.ac b/configure.ac
index 735e785..0e34469 100644
--- a/configure.ac
+++ b/configure.ac
@@ -58,15 +58,22 @@ PKG_CHECK_MODULES([LIBXML], [libxml-2.0 >= required_libxml_version])
 AC_SUBST(LIBXML_CFLAGS)
 AC_SUBST(LIBXML_LIBS)
 
+PKG_CHECK_MODULES([ICU], [icu-i18n, icu-io, icu-uc])
+
+AC_SUBST(ICU_CFLAGS)
+AC_SUBST(ICU_LIBS)
+
 # Checks for libraries.
 AC_CHECK_LIB(xml2, xmlReaderForFile)
 
+AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])])
+
 AC_CHECK_FUNCS([setlocale strdup])
 
 AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked])
 
-CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS"
-LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS -lz"
+CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS"
+LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS $ICU_LIBS -lz"
 
 # Checks for highest supported C++ standard
 AC_LANG(C++)
diff --git a/src/lrx_comp.cc b/src/lrx_comp.cc
index c1d5b46..41a28da 100644
--- a/src/lrx_comp.cc
+++ b/src/lrx_comp.cc
@@ -56,7 +56,9 @@ int main (int argc, char **argv)
       compiler.setDebugMode(true);
     }
 
+    cerr << "parse!" << endl;
     compiler.parse(argv[2]);
+    cerr << "write!" << endl;
     FILE *output = fopen(argv[3], "wb");
     compiler.write(output);
   }
diff --git a/src/lrx_compiler.cc b/src/lrx_compiler.cc
index 3fb4e6a..9e17f5b 100644
--- a/src/lrx_compiler.cc
+++ b/src/lrx_compiler.cc
@@ -21,67 +21,55 @@
 
 using namespace std;
 
-wstring const LRXCompiler::LRX_COMPILER_LRX_ELEM        = L"lrx";
-wstring const LRXCompiler::LRX_COMPILER_DEFSEQS_ELEM    = L"def-seqs";
-wstring const LRXCompiler::LRX_COMPILER_DEFSEQ_ELEM     = L"def-seq";
-wstring const LRXCompiler::LRX_COMPILER_RULES_ELEM      = L"rules";
-wstring const LRXCompiler::LRX_COMPILER_RULE_ELEM       = L"rule";
-wstring const LRXCompiler::LRX_COMPILER_MATCH_ELEM      = L"match";
-wstring const LRXCompiler::LRX_COMPILER_SELECT_ELEM     = L"select";
-wstring const LRXCompiler::LRX_COMPILER_REMOVE_ELEM     = L"remove";
-wstring const LRXCompiler::LRX_COMPILER_OR_ELEM         = L"or";
-wstring const LRXCompiler::LRX_COMPILER_REPEAT_ELEM     = L"repeat";
-wstring const LRXCompiler::LRX_COMPILER_SEQ_ELEM        = L"seq";
-
-wstring const LRXCompiler::LRX_COMPILER_LEMMA_ATTR      = L"lemma";
-wstring const LRXCompiler::LRX_COMPILER_SUFFIX_ATTR     = L"suffix";
-wstring const LRXCompiler::LRX_COMPILER_CONTAINS_ATTR   = L"contains";
-wstring const LRXCompiler::LRX_COMPILER_CASE_ATTR       = L"case";
-wstring const LRXCompiler::LRX_COMPILER_SURFACE_ATTR    = L"surface";
-wstring const LRXCompiler::LRX_COMPILER_TAGS_ATTR       = L"tags";
-wstring const LRXCompiler::LRX_COMPILER_WEIGHT_ATTR     = L"weight";
-wstring const LRXCompiler::LRX_COMPILER_COMMENT_ATTR    = L"c";
-wstring const LRXCompiler::LRX_COMPILER_NAME_ATTR       = L"n";
-wstring const LRXCompiler::LRX_COMPILER_FROM_ATTR       = L"from";
-wstring const LRXCompiler::LRX_COMPILER_UPTO_ATTR       = L"upto";
-
-wstring const LRXCompiler::LRX_COMPILER_TYPE_SELECT     = L"select";
-wstring const LRXCompiler::LRX_COMPILER_TYPE_REMOVE     = L"remove";
-wstring const LRXCompiler::LRX_COMPILER_TYPE_SKIP       = L"skip";
+UString const LRXCompiler::LRX_COMPILER_LRX_ELEM        = "lrx"_u;
+UString const LRXCompiler::LRX_COMPILER_DEFSEQS_ELEM    = "def-seqs"_u;
+UString const LRXCompiler::LRX_COMPILER_DEFSEQ_ELEM     = "def-seq"_u;
+UString const LRXCompiler::LRX_COMPILER_RULES_ELEM      = "rules"_u;
+UString const LRXCompiler::LRX_COMPILER_RULE_ELEM       = "rule"_u;
+UString const LRXCompiler::LRX_COMPILER_MATCH_ELEM      = "match"_u;
+UString const LRXCompiler::LRX_COMPILER_SELECT_ELEM     = "select"_u;
+UString const LRXCompiler::LRX_COMPILER_REMOVE_ELEM     = "remove"_u;
+UString const LRXCompiler::LRX_COMPILER_OR_ELEM         = "or"_u;
+UString const LRXCompiler::LRX_COMPILER_REPEAT_ELEM     = "repeat"_u;
+UString const LRXCompiler::LRX_COMPILER_SEQ_ELEM        = "seq"_u;
+
+UString const LRXCompiler::LRX_COMPILER_LEMMA_ATTR      = "lemma"_u;
+UString const LRXCompiler::LRX_COMPILER_SUFFIX_ATTR     = "suffix"_u;
+UString const LRXCompiler::LRX_COMPILER_CONTAINS_ATTR   = "contains"_u;
+UString const LRXCompiler::LRX_COMPILER_CASE_ATTR       = "case"_u;
+UString const LRXCompiler::LRX_COMPILER_SURFACE_ATTR    = "surface"_u;
+UString const LRXCompiler::LRX_COMPILER_TAGS_ATTR       = "tags"_u;
+UString const LRXCompiler::LRX_COMPILER_WEIGHT_ATTR     = "weight"_u;
+UString const LRXCompiler::LRX_COMPILER_COMMENT_ATTR    = "c"_u;
+UString const LRXCompiler::LRX_COMPILER_NAME_ATTR       = "n"_u;
+UString const LRXCompiler::LRX_COMPILER_FROM_ATTR       = "from"_u;
+UString const LRXCompiler::LRX_COMPILER_UPTO_ATTR       = "upto"_u;
+
+UString const LRXCompiler::LRX_COMPILER_TYPE_SELECT     = "select"_u;
+UString const LRXCompiler::LRX_COMPILER_TYPE_REMOVE     = "remove"_u;
+UString const LRXCompiler::LRX_COMPILER_TYPE_SKIP       = "skip"_u;
 
 double const  LRXCompiler::LRX_COMPILER_DEFAULT_WEIGHT  = 1.0;
 
-wstring
-LRXCompiler::itow(int i)
-{
-  // Convert an int to a wstring
-  wchar_t buf[50];
-  memset(buf, '\0', sizeof(buf));
-  swprintf(buf, 50, L"%d", i);
-  wstring id(buf);
-  return id;
-}
-
-int
-LRXCompiler::wtoi(wstring w)
+void
+LRXCompiler::debug(const char* fmt, ...)
 {
-  // Convert a wstring to an int
-  wistringstream wstrm(w);
-  int i_name = -numeric_limits<int>::max();
-  wstrm >> i_name;
-
-  return i_name;
+  if (debugMode) {
+    va_list argptr;
+    va_start(argptr, fmt);
+    u_vfprintf(debug_output, fmt, argptr);
+    va_end(argptr);
+  }
 }
 
-double
-LRXCompiler::wtod(wstring w)
+UString
+LRXCompiler::itow(int i)
 {
-  // Convert a wstring to a double
-  wistringstream wstrm(w);
-  double d_name = -numeric_limits<double>::max();
-  wstrm >> d_name;
-
-  return d_name;
+  // Convert an int to a UString
+  UChar buf[50];
+  u_snprintf(buf, 50, "%d", i);
+  UString id(buf);
+  return id;
 }
 
 LRXCompiler::LRXCompiler()
@@ -90,6 +78,7 @@ LRXCompiler::LRXCompiler()
 
   debugMode = false;
   outputGraph = false;
+  debug_output = u_finit(stderr, NULL, NULL);
 
   currentRuleId = 0;
 
@@ -99,15 +88,15 @@ LRXCompiler::LRXCompiler()
 
   canSelect = true;
 
-  alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_SELECT + L">");
-  alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_REMOVE + L">");
-  alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_SKIP + L">");
+  alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_SELECT + ">"_u);
+  alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_REMOVE + ">"_u);
+  alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_SKIP + ">"_u);
 
-  alphabet.includeSymbol(L"<ANY_TAG>");
-  alphabet.includeSymbol(L"<ANY_CHAR>");
-  alphabet.includeSymbol(L"<ANY_UPPER>");
-  alphabet.includeSymbol(L"<ANY_LOWER>");
-  alphabet.includeSymbol(L"<$>");
+  alphabet.includeSymbol("<ANY_TAG>"_u);
+  alphabet.includeSymbol("<ANY_CHAR>"_u);
+  alphabet.includeSymbol("<ANY_UPPER>"_u);
+  alphabet.includeSymbol("<ANY_LOWER>"_u);
+  alphabet.includeSymbol("<$>"_u);
 
 }
 
@@ -129,64 +118,47 @@ LRXCompiler::setOutputGraph(bool o)
 }
 
 void
-LRXCompiler::skipBlanks(wstring &name)
+LRXCompiler::skipBlanks(UString &name)
 {
-  while(name == L"#text" || name == L"#comment")
+  while(name == "#text"_u || name == "#comment"_u)
   {
-    if(name != L"#comment")
+    if(name != "#comment"_u)
     {
       if(!allBlanks())
       {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): Invalid construction." << endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): Invalid construction." << endl;
         exit(EXIT_FAILURE);
       }
     }
 
     xmlTextReaderRead(reader);
-    name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    name = XMLParseUtil::readName(reader);
   }
 }
 
-wstring
-LRXCompiler::attrib(wstring const &name)
+UString
+LRXCompiler::attrib(UString const &name)
 {
   return XMLParseUtil::attrib(reader, name);
 }
 
-wstring
-LRXCompiler::attrib(wstring const &name, const wstring fallback)
+UString
+LRXCompiler::attrib(UString const &name, const UString fallback)
 {
-  string mystr = "";
-  for (int i = 0, limit = name.size(); i != limit; i++) {
-    mystr += static_cast<char>(name[i]);
-  }
-
-  xmlChar *attrname = xmlCharStrdup(mystr.c_str());
-  xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname);
-  wstring result = XMLParseUtil::towstring(myattr);
-  xmlFree(myattr);
-  xmlFree(attrname);
-  if(myattr == NULL) {
-    return fallback;
-  }
-  else {
-    return result;
-  }
+  return XMLParseUtil::attrib(reader, name, fallback);
 }
 
 bool
 LRXCompiler::allBlanks()
 {
-  bool flag = true;
-  wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader));
-
-  for(unsigned int i = 0, limit = text.size(); i < limit; i++)
-  {
-    flag = flag && iswspace(text[i]);
+  UString text = XMLParseUtil::readValue(reader);
+  for (auto& c : text) {
+    if (!u_isspace(c)) {
+      return false;
+    }
   }
-
-  return flag;
+  return true;
 }
 
 void
@@ -210,7 +182,7 @@ LRXCompiler::parse(string const &fitxer)
 
   if(ret != 0)
   {
-    wcerr << L"Error: Parse error at the end of input." << endl;
+    cerr << "Error: Parse error at the end of input." << endl;
   }
 
 }
@@ -218,14 +190,13 @@ LRXCompiler::parse(string const &fitxer)
 void
 LRXCompiler::procNode()
 {
-  xmlChar const *xnombre = xmlTextReaderConstName(reader);
-  wstring nombre = XMLParseUtil::towstring(xnombre);
+  UString nombre = XMLParseUtil::readName(reader);
 
-  if(nombre == L"#text")
+  if(nombre == "#text"_u)
   {
     /* ignorar */
   }
-  else if(nombre== L"#comment")
+  else if(nombre== "#comment"_u)
   {
     /* ignorar */
   }
@@ -251,8 +222,8 @@ LRXCompiler::procNode()
   }
   else
   {
-    wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-    wcerr << L"): Invalid node '<" << nombre << L">'." << endl;
+    cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+    cerr << "): Invalid node '<" << nombre << ">'." << endl;
     exit(EXIT_FAILURE);
   }
 
@@ -262,10 +233,13 @@ LRXCompiler::procNode()
 void
 LRXCompiler::procRule()
 {
-  wstring comment = this->attrib(LRX_COMPILER_COMMENT_ATTR);
-  wstring xweight = this->attrib(LRX_COMPILER_WEIGHT_ATTR);
-  wstring nombre = this->attrib(LRX_COMPILER_NAME_ATTR);
-  double weight =  wtod (xweight);
+  UString comment = this->attrib(LRX_COMPILER_COMMENT_ATTR);
+  UString xweight = this->attrib(LRX_COMPILER_WEIGHT_ATTR);
+  UString nombre = this->attrib(LRX_COMPILER_NAME_ATTR);
+  double weight = LRX_COMPILER_DEFAULT_WEIGHT;
+  if (!xweight.empty()) {
+    weight = stod(xweight);
+  }
 
   if(weight <= -numeric_limits<int>::max())
   {
@@ -276,25 +250,22 @@ LRXCompiler::procRule()
   currentState = transducer.insertNewSingleTransduction(alphabet(0, 0), currentState);
 
   currentRuleId++;
-  wstring ruleId = L"<" + itow(currentRuleId) + L">";
+  UString ruleId = "<"_u + itow(currentRuleId) + ">"_u;
   weights[currentRuleId] = weight;
 
-  if(debugMode)
-  {
-    fwprintf(stderr, L"  rule: %d, weight: %.2f \n", currentRuleId, weight);
-  }
+  debug("  rule: %d, weight: %.2f \n", currentRuleId, weight);
 
   while(true)
   {
     int ret = xmlTextReaderRead(reader);
     if(ret != 1)
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Parse error." << endl;
+      cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+      cerr << "): Parse error." << endl;
       exit(EXIT_FAILURE);
     }
 
-    wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    UString name = XMLParseUtil::readName(reader);
     skipBlanks(name);
 
     if(name == LRX_COMPILER_MATCH_ELEM)
@@ -316,7 +287,7 @@ LRXCompiler::procRule()
     }
     else if(name == LRX_COMPILER_RULE_ELEM)
     {
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(alphabet("<$>"_u), alphabet("<$>"_u)), currentState);
       if(!alphabet.isSymbolDefined(ruleId.c_str()))
       {
         alphabet.includeSymbol(ruleId.c_str());
@@ -328,9 +299,9 @@ LRXCompiler::procRule()
     }
     else
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_RULE_ELEM;
-      wcerr << L">'." << endl;
+      cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+      cerr << "): Invalid inclusion of '<" << name << ">' into '<" << LRX_COMPILER_RULE_ELEM;
+      cerr << ">'." << endl;
       exit(EXIT_FAILURE);
     }
   }
@@ -343,10 +314,7 @@ void
 LRXCompiler::procOr()
 {
 
-  if(debugMode)
-  {
-    fwprintf(stderr, L"    or: \n");
-  }
+  debug("    or: \n");
 
   int or_initial_state = currentState;
   vector<int> reachedStates;
@@ -355,12 +323,12 @@ LRXCompiler::procOr()
     int ret = xmlTextReaderRead(reader);
     if(ret != 1)
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Parse error." << endl;
+      cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+      cerr << "): Parse error." << endl;
       exit(EXIT_FAILURE);
     }
 
-    wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    UString name = XMLParseUtil::readName(reader);
     skipBlanks(name);
 
     if(name == LRX_COMPILER_MATCH_ELEM)
@@ -392,9 +360,9 @@ LRXCompiler::procOr()
     }
     else
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_OR_ELEM;
-      wcerr << L">'." << endl;
+      cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+      cerr << "): Invalid inclusion of '<" << name << ">' into '<" << LRX_COMPILER_OR_ELEM;
+      cerr << ">'." << endl;
       exit(EXIT_FAILURE);
     }
   }
@@ -412,18 +380,18 @@ LRXCompiler::procDefSeq()
   int oldstate = currentState;
   currentState = initialState;
   lastState = initialState;
-  wstring seqname = this->attrib(LRX_COMPILER_NAME_ATTR);
+  UString seqname = this->attrib(LRX_COMPILER_NAME_ATTR);
   while(true)
   {
     int ret = xmlTextReaderRead(reader);
     if(ret != 1)
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Parse error." << endl;
+      cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+      cerr << "): Parse error." << endl;
       exit(EXIT_FAILURE);
     }
 
-    wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    UString name = XMLParseUtil::readName(reader);
     skipBlanks(name);
 
     if(name == LRX_COMPILER_MATCH_ELEM)
@@ -450,9 +418,9 @@ LRXCompiler::procDefSeq()
     }
     else
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_REPEAT_ELEM;
-      wcerr << L">'." << endl;
+      cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+      cerr << "): Invalid inclusion of '<" << name << ">' into '<" << LRX_COMPILER_REPEAT_ELEM;
+      cerr << ">'." << endl;
       exit(EXIT_FAILURE);
     }
   }
@@ -468,22 +436,19 @@ void
 LRXCompiler::procMatch()
 {
   // These are mutually exclusive
-  wstring lemma = this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*");
-  wstring contains = this->attrib(LRX_COMPILER_SUFFIX_ATTR);
-  wstring suffix = this->attrib(LRX_COMPILER_CONTAINS_ATTR);
-  wstring _case = this->attrib(LRX_COMPILER_CASE_ATTR); // This could potentially be non-exclusive
+  UString lemma = this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u);
+  UString contains = this->attrib(LRX_COMPILER_SUFFIX_ATTR);
+  UString suffix = this->attrib(LRX_COMPILER_CONTAINS_ATTR);
+  UString _case = this->attrib(LRX_COMPILER_CASE_ATTR); // This could potentially be non-exclusive
 
   // This is currently disabled: Future use
-  wstring surface = this->attrib(LRX_COMPILER_SURFACE_ATTR);
+  UString surface = this->attrib(LRX_COMPILER_SURFACE_ATTR);
 
-  wstring tags = this->attrib(LRX_COMPILER_TAGS_ATTR, L"*");
+  UString tags = this->attrib(LRX_COMPILER_TAGS_ATTR, "*"_u);
 
-  if(surface != L"")
+  if(!surface.empty())
   {
-    if(debugMode)
-    {
-      fwprintf(stderr, L"      match: %S\n", surface.c_str());
-    }
+    debug("      match: %S\n", surface.c_str());
 
     for(auto& it : surface)
     {
@@ -492,70 +457,64 @@ LRXCompiler::procMatch()
   }
   else
   {
-    if(debugMode)
-    {
-      fwprintf(stderr, L"      match: [%S, %S, %S, %S] %S\n", lemma.c_str(), suffix.c_str(), contains.c_str(), _case.c_str(), tags.c_str());
-    }
+    debug("      match: [%S, %S, %S, %S] %S\n", lemma.c_str(), suffix.c_str(), contains.c_str(), _case.c_str(), tags.c_str());
 
-    if(_case != L"")
+    if(_case != ""_u)
     {
-      if(_case == L"AA") // <ANY_UPPER>+
+      if(_case == "AA"_u) // <ANY_UPPER>+
       {
         int localLast = currentState;
-        currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_UPPER>"), 0), currentState);
+        currentState = transducer.insertSingleTransduction(alphabet(alphabet("<ANY_UPPER>"_u), 0), currentState);
         transducer.linkStates(currentState, localLast, 0);
       }
-      else if(_case == L"aa")  // <ANY_LOWER>+
+      else if(_case == "aa"_u)  // <ANY_LOWER>+
       {
         int localLast = currentState;
-        currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_LOWER>"), 0), currentState);
+        currentState = transducer.insertSingleTransduction(alphabet(alphabet("<ANY_LOWER>"_u), 0), currentState);
         transducer.linkStates(currentState, localLast, 0);
       }
-      else if(_case == L"Aa") // <ANY_UPPER>+ <ANY_LOWER>+
+      else if(_case == "Aa"_u) // <ANY_UPPER>+ <ANY_LOWER>+
       {
-        currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_UPPER>"), 0), currentState);
+        currentState = transducer.insertSingleTransduction(alphabet(alphabet("<ANY_UPPER>"_u), 0), currentState);
         int localLast = currentState;
-        currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_LOWER>"), 0), currentState);
+        currentState = transducer.insertSingleTransduction(alphabet(alphabet("<ANY_LOWER>"_u), 0), currentState);
         transducer.linkStates(currentState, localLast, 0);
       }
     }
-    if(lemma == L"*" && suffix == L"" && contains == L"" && _case == L"")
+    if(lemma == "*"_u && suffix.empty() && contains.empty() && _case.empty())
     {
       // This is only if there is no suffix or case or contains
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        char: -\n");
-      }
+      debug("        char: -\n");
       int localLast = currentState;
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"), 0), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(alphabet("<ANY_CHAR>"_u), 0), currentState);
       transducer.linkStates(currentState, localLast, 0);
     }
-    else if(suffix != L"")
+    else if(suffix != ""_u)
     {
       // A suffix is <ANY_CHAR> any amount of times followed by whatever is in the suffix
       int localLast = currentState;
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"), 0), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(alphabet("<ANY_CHAR>"_u), 0), currentState);
       transducer.linkStates(currentState, localLast, 0);
       for(auto& it : suffix)
       {
         currentState = transducer.insertSingleTransduction(alphabet(it, 0), currentState);
       }
     }
-    else if(contains != L"")
+    else if(!contains.empty())
     {
       // A contains is <ANY_CHAR> any amount of times followed by whatever is in the attribute
       // followed by <ANY_CHAR> any amount of times
       int localLast = currentState;
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"), 0), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(alphabet("<ANY_CHAR>"_u), 0), currentState);
       transducer.linkStates(currentState, localLast, 0);
       for(auto& it : suffix)
       {
         currentState = transducer.insertSingleTransduction(alphabet(it, 0), currentState);
       }
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"), 0), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(alphabet("<ANY_CHAR>"_u), 0), currentState);
       transducer.linkStates(currentState, localLast, 0);
     }
-    else if(lemma != L"*")
+    else if(lemma != "*"_u)
     {
       for(auto& it : lemma)
       {
@@ -564,66 +523,57 @@ LRXCompiler::procMatch()
     }
     else
     {
-      fwprintf(stderr, L"Something surprising happened in <match> compilation\n");
+      cerr << "Something surprising happened in <match> compilation\n";
     }
 
-    wstring tag = L"";
+    UString tag;
     for(auto& it : tags)
     {
-      if(it == L'.')
+      if(it == '.')
       {
-        if(tag == L"")
+        if(tag.empty())
         {
           continue;
         }
-        tag = L"<" + tag + L">";
+        tag = "<"_u + tag + ">"_u;
         if(!alphabet.isSymbolDefined(tag.c_str()))
         {
           alphabet.includeSymbol(tag.c_str());
         }
-        if(debugMode)
-        {
-          fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-        }
-        if(tag == L"<*>")
+        debug("        tag: %S\n", tag.c_str());
+        if(tag == "<*>"_u)
         {
           int localLast = currentState;
-          currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"), 0), currentState);
+          currentState = transducer.insertSingleTransduction(alphabet(alphabet("<ANY_TAG>"_u), 0), currentState);
           transducer.linkStates(currentState, localLast, 0);
         }
         else
         {
           currentState = transducer.insertSingleTransduction(alphabet(alphabet(tag.c_str()), 0), currentState);
         }
-        tag = L"";
+        tag = ""_u;
         continue;
       }
       tag = tag + it;
     }
-    if(tag == L"*")
+    if(tag == "*"_u)
     {
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-      }
+      debug("        tag: %S\n", tag.c_str());
       int localLast = currentState;
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"), 0), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(alphabet("<ANY_TAG>"_u), 0), currentState);
       transducer.linkStates(currentState, localLast, 0);
     }
-    else if(tag == L"")
+    else if(tag.empty())
     {
     }
     else
     {
-      tag = L"<" + tag + L">";
+      tag = "<"_u + tag + ">"_u;
       if(!alphabet.isSymbolDefined(tag.c_str()))
       {
         alphabet.includeSymbol(tag.c_str());
       }
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-      }
+      debug("        tag: %S\n", tag.c_str());
       currentState = transducer.insertSingleTransduction(alphabet(alphabet(tag.c_str()), 0), currentState);
     }
   }
@@ -631,31 +581,31 @@ LRXCompiler::procMatch()
   if(xmlTextReaderIsEmptyElement(reader))
   {
     // If self-closing
-    currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState);
-    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<skip>")), currentState);
+    currentState = transducer.insertSingleTransduction(alphabet(alphabet("<$>"_u), alphabet("<$>"_u)), currentState);
+    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<skip>"_u)), currentState);
     return;
   }
 
-  wstring name = L"";
+  UString name = ""_u;
   while(true)
   {
     int ret = xmlTextReaderRead(reader);
     if(ret != 1)
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Parse error." << endl;
+      cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+      cerr << "): Parse error." << endl;
       exit(EXIT_FAILURE);
     }
 
-    name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    name = XMLParseUtil::readName(reader);
     skipBlanks(name);
 
     if(name == LRX_COMPILER_SELECT_ELEM)
     {
       if(!canSelect)
       {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): <select> is not permitted inside <repeat>." << endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): <select> is not permitted inside <repeat>." << endl;
         exit(EXIT_FAILURE);
       }
       procSelect();
@@ -664,8 +614,8 @@ LRXCompiler::procMatch()
     {
       if(!canSelect)
       {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): <remove> is not permitted inside <repeat>." << endl;
+        cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+        cerr << "): <remove> is not permitted inside <repeat>." << endl;
         exit(EXIT_FAILURE);
       }
       procRemove();
@@ -676,9 +626,9 @@ LRXCompiler::procMatch()
     }
     else
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_MATCH_ELEM;
-      wcerr << L">'." << endl;
+      cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+      cerr << "): Invalid inclusion of '<" << name << ">' into '<" << LRX_COMPILER_MATCH_ELEM;
+      cerr << ">'." << endl;
       exit(EXIT_FAILURE);
     }
   }
@@ -691,11 +641,11 @@ void
 LRXCompiler::procSelect()
 {
 
-  wstring lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*");
-  wstring tags =this->attrib(LRX_COMPILER_TAGS_ATTR);
+  UString lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u);
+  UString tags =this->attrib(LRX_COMPILER_TAGS_ATTR);
 
-  wstring key = L"<" + LRX_COMPILER_TYPE_SELECT + L">";
-  if(lemma != L"*")
+  UString key = "<"_u + LRX_COMPILER_TYPE_SELECT + ">"_u;
+  if(lemma != "*"_u)
   {
     key += lemma;
   }
@@ -703,22 +653,19 @@ LRXCompiler::procSelect()
   Transducer recogniser;
   int localCurrentState = recogniser.getInitial();
 
-  if(debugMode)
-  {
-    fwprintf(stderr, L"        select: %S, %S\n", lemma.c_str(), tags.c_str());
-  }
+  debug("        select: %S, %S\n", lemma.c_str(), tags.c_str());
 
-  currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState);
-  currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<" + LRX_COMPILER_TYPE_SELECT + L">")), currentState);
+  currentState = transducer.insertSingleTransduction(alphabet(alphabet("<$>"_u), alphabet("<$>"_u)), currentState);
+  currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<"_u + LRX_COMPILER_TYPE_SELECT + ">"_u)), currentState);
 
 
-  if(lemma == L"*")
+  if(lemma == "*"_u)
   {
-    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_CHAR>")), currentState);
+    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<ANY_CHAR>"_u)), currentState);
     int localLast = localCurrentState;
-    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"),0), localCurrentState);
+    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet("<ANY_CHAR>"_u),0), localCurrentState);
     recogniser.linkStates(localCurrentState, localLast, 0);
-    key = key + L"<ANY_CHAR>";
+    key = key + "<ANY_CHAR>"_u;
   }
   else {
     for (auto &it : lemma) {
@@ -727,29 +674,26 @@ LRXCompiler::procSelect()
     }
   }
 
-  if(tags != L"")
+  if(tags != ""_u)
   {
-    wstring tag = L"";
+    UString tag = ""_u;
     for(auto& it : tags)
     {
-      if(it == L'.')
+      if(it == '.')
       {
-        tag = L"<" + tag + L">";
+        tag = "<"_u + tag + ">"_u;
         if(!alphabet.isSymbolDefined(tag.c_str()))
         {
           alphabet.includeSymbol(tag.c_str());
         }
-        if(debugMode)
-        {
-          fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-        }
-        if(tag == L"<*>")
+        debug("        tag: %S\n", tag.c_str());
+        if(tag == "<*>"_u)
         {
-          currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+          currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<ANY_TAG>"_u)), currentState);
 	  int localLast = localCurrentState;
-          localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+          localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet("<ANY_TAG>"_u),0), localCurrentState);
 	  recogniser.linkStates(localCurrentState, localLast, 0);
-          key = key + L"<ANY_TAG>";
+          key = key + "<ANY_TAG>"_u;
         }
         else
         {
@@ -757,34 +701,28 @@ LRXCompiler::procSelect()
           localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState);
           key = key + tag;
         }
-        tag = L"";
+        tag = ""_u;
         continue;
       }
       tag = tag + it;
     }
-    if(tag == L"*")
+    if(tag == "*"_u)
     {
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-      }
-      currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+      debug("        tag: %S\n", tag.c_str());
+      currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<ANY_TAG>"_u)), currentState);
       int localLast = localCurrentState;
-      localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+      localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet("<ANY_TAG>"_u),0), localCurrentState);
       recogniser.linkStates(localCurrentState, localLast, 0);
-      key = key + L"<ANY_TAG>";
+      key = key + "<ANY_TAG>"_u;
     }
     else
     {
-      tag = L"<" + tag + L">";
+      tag = "<"_u + tag + ">"_u;
       if(!alphabet.isSymbolDefined(tag.c_str()))
       {
         alphabet.includeSymbol(tag.c_str());
       }
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-      }
+      debug("        tag: %S\n", tag.c_str());
       currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(tag.c_str())), currentState);
       localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState);
       key = key + tag;
@@ -792,26 +730,20 @@ LRXCompiler::procSelect()
   }
   else
   {
-    if(debugMode)
-    {
-      fwprintf(stderr, L"        tag: -\n");
-    }
-    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+    debug("        tag: -\n");
+    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<ANY_TAG>"_u)), currentState);
     int localLast = localCurrentState;
-    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet("<ANY_TAG>"_u),0), localCurrentState);
     recogniser.linkStates(localCurrentState, localLast, 0);
-    key = key + L"<ANY_TAG>";
+    key = key + "<ANY_TAG>"_u;
   }
 
 
   recogniser.setFinal(localCurrentState);
 
   recognisers[key] = recogniser;
-  if(debugMode)
-  {
-    fwprintf(stderr, L"        select: %d\n", recognisers[key].size());
-  }
-  //currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState);
+  debug("        select: %d\n", recognisers[key].size());
+  //currentState = transducer.insertSingleTransduction(alphabet(alphabet("<$>"_u), alphabet("<$>"_u)), currentState);
 
   return;
 }
@@ -820,11 +752,11 @@ void
 LRXCompiler::procRemove()
 {
 
-  wstring lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*");
-  wstring tags =this->attrib(LRX_COMPILER_TAGS_ATTR);
+  UString lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u);
+  UString tags =this->attrib(LRX_COMPILER_TAGS_ATTR);
 
-  wstring key = L"<" + LRX_COMPILER_TYPE_REMOVE + L">";
-  if(lemma != L"*")
+  UString key = "<"_u + LRX_COMPILER_TYPE_REMOVE + ">"_u;
+  if(lemma != "*"_u)
   {
     key += lemma;
   }
@@ -832,21 +764,18 @@ LRXCompiler::procRemove()
   Transducer recogniser;
   int localCurrentState = recogniser.getInitial();
 
-  if(debugMode)
-  {
-    fwprintf(stderr, L"        remove: %S, %S\n", lemma.c_str(), tags.c_str());
-  }
+  debug("        remove: %S, %S\n", lemma.c_str(), tags.c_str());
 
-  currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState);
-  currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<" + LRX_COMPILER_TYPE_REMOVE + L">")), currentState);
+  currentState = transducer.insertSingleTransduction(alphabet(alphabet("<$>"_u), alphabet("<$>"_u)), currentState);
+  currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<"_u + LRX_COMPILER_TYPE_REMOVE + ">"_u)), currentState);
 
-  if(lemma == L"*")
+  if(lemma == "*"_u)
   {
-    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_CHAR>")), currentState);
+    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<ANY_CHAR>"_u)), currentState);
     int localLast = localCurrentState;
-    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"),0), localCurrentState);
+    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet("<ANY_CHAR>"_u),0), localCurrentState);
     recogniser.linkStates(localCurrentState, localLast, 0);
-    key = key + L"<ANY_CHAR>";
+    key = key + "<ANY_CHAR>"_u;
   }
   else
   {
@@ -857,29 +786,26 @@ LRXCompiler::procRemove()
     }
   }
 
-  if(tags != L"")
+  if(tags != ""_u)
   {
-    wstring tag = L"";
+    UString tag = ""_u;
     for(auto& it : tags)
     {
-      if(it == L'.')
+      if(it == '.')
       {
-        tag = L"<" + tag + L">";
+        tag = "<"_u + tag + ">"_u;
         if(!alphabet.isSymbolDefined(tag.c_str()))
         {
           alphabet.includeSymbol(tag.c_str());
         }
-        if(debugMode)
-        {
-          fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-        }
-        if(tag == L"<*>")
+        debug("        tag: %S\n", tag.c_str());
+        if(tag == "<*>"_u)
         {
-          currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+          currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<ANY_TAG>"_u)), currentState);
 	  int localLast = localCurrentState;
-          localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+          localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet("<ANY_TAG>"_u),0), localCurrentState);
 	  recogniser.linkStates(localCurrentState, localLast, 0);
-          key = key + L"<ANY_TAG>";
+          key = key + "<ANY_TAG>"_u;
         }
         else
         {
@@ -887,34 +813,28 @@ LRXCompiler::procRemove()
           localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState);
           key = key + tag;
         }
-        tag = L"";
+        tag = ""_u;
         continue;
       }
       tag = tag + it;
     }
-    if(tag == L"*")
+    if(tag == "*"_u)
     {
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-      }
-      currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+      debug("        tag: %S\n", tag.c_str());
+      currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<ANY_TAG>"_u)), currentState);
       int localLast = localCurrentState;
-      localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+      localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet("<ANY_TAG>"_u),0), localCurrentState);
       recogniser.linkStates(localCurrentState, localLast, 0);
-      key = key + L"<ANY_TAG>";
+      key = key + "<ANY_TAG>"_u;
     }
     else
     {
-      tag = L"<" + tag + L">";
+      tag = "<"_u + tag + ">"_u;
       if(!alphabet.isSymbolDefined(tag.c_str()))
       {
-        alphabet.includeSymbol(tag.c_str());
-      }
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
+        alphabet.includeSymbol(tag);
       }
+      debug("        tag: %S\n", tag.c_str());
       currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(tag.c_str())), currentState);
       localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState);
       key = key + tag;
@@ -922,25 +842,19 @@ LRXCompiler::procRemove()
   }
   else
   {
-    if(debugMode)
-    {
-      fwprintf(stderr, L"        tag: -\n");
-    }
-    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+    debug("        tag: -\n");
+    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<ANY_TAG>"_u)), currentState);
     int localLast = localCurrentState;
-    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet("<ANY_TAG>"_u),0), localCurrentState);
     recogniser.linkStates(localCurrentState, localLast, 0);
-    key = key + L"<ANY_TAG>";
+    key = key + "<ANY_TAG>"_u;
   }
 
 
   recogniser.setFinal(localCurrentState);
 
   recognisers[key] = recogniser;
-  if(debugMode)
-  {
-    fwprintf(stderr, L"        remove: %d\n", recognisers[key].size());
-  }
+  debug("        remove: %d\n", recognisers[key].size());
 
   return;
 }
@@ -951,20 +865,20 @@ LRXCompiler::procRepeat()
 {
   bool couldSelect = canSelect;
   canSelect = false;
-  wstring xfrom = this->attrib(LRX_COMPILER_FROM_ATTR);
-  wstring xupto = this->attrib(LRX_COMPILER_UPTO_ATTR);
+  UString xfrom = this->attrib(LRX_COMPILER_FROM_ATTR);
+  UString xupto = this->attrib(LRX_COMPILER_UPTO_ATTR);
   int from = stoi(xfrom);
   int upto = stoi(xupto);
   if(from < 0 || upto < 0)
   {
-    wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-    wcerr << L"): Number of repetitions cannot be negative." << endl;
+    cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+    cerr << "): Number of repetitions cannot be negative." << endl;
     exit(EXIT_FAILURE);
   }
   else if(from > upto)
   {
-    wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-    wcerr << L"): Lower bound on number of repetitions cannot be larger than upper bound." << endl;
+    cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+    cerr << "): Lower bound on number of repetitions cannot be larger than upper bound." << endl;
     exit(EXIT_FAILURE);
   }
   int count = upto - from;
@@ -978,12 +892,12 @@ LRXCompiler::procRepeat()
     int ret = xmlTextReaderRead(reader);
     if(ret != 1)
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Parse error." << endl;
+      cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+      cerr << "): Parse error." << endl;
       exit(EXIT_FAILURE);
     }
 
-    wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    UString name = XMLParseUtil::readName(reader);
     skipBlanks(name);
 
     if(name == LRX_COMPILER_MATCH_ELEM)
@@ -1006,9 +920,9 @@ LRXCompiler::procRepeat()
     }
     else
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_REPEAT_ELEM;
-      wcerr << L">'." << endl;
+      cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+      cerr << "): Invalid inclusion of '<" << name << ">' into '<" << LRX_COMPILER_REPEAT_ELEM;
+      cerr << ">'." << endl;
       exit(EXIT_FAILURE);
     }
   }
@@ -1031,11 +945,11 @@ LRXCompiler::procRepeat()
 void
 LRXCompiler::procSeq()
 {
-  wstring name = this->attrib(LRX_COMPILER_NAME_ATTR);
+  UString name = this->attrib(LRX_COMPILER_NAME_ATTR);
   if(sequences.find(name) == sequences.end())
   {
-    wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-    wcerr << L"): Sequence '" << name << L"' not defined." << endl;
+    cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader);
+    cerr << "): Sequence '" << name << "' not defined." << endl;
     exit(EXIT_FAILURE);
   }
   currentState = transducer.insertTransducer(currentState, sequences[name]);
@@ -1050,28 +964,24 @@ LRXCompiler::write(FILE *fst)
   Compression::multibyte_write(recognisers.size(), fst);
   for(auto& it : recognisers)
   {
-    Compression::wstring_write(it.first, fst);
-    if(debugMode)
-    {
-      fwprintf(stderr, L"+ %d => %S\n", it.second.size(), it.first.c_str());
-      it.second.show(alphabet, stderr, 0, false);
+    Compression::string_write(it.first, fst);
+    debug("+ %d => %S\n", it.second.size(), it.first.c_str());
+    if (debugMode) {
+      it.second.show(alphabet, debug_output, 0, false);
     }
     it.second.write(fst);
   }
 
-  Compression::wstring_write(L"main", fst);
+  Compression::string_write("main"_u, fst);
   if(outputGraph)
   {
-    transducer.show(alphabet, stderr, 0, false);
+    transducer.show(alphabet, debug_output, 0, false);
   }
   transducer.write(fst);
 
   for(auto& it : weights)
   {
-    if(debugMode)
-    {
-      fwprintf(stderr, L"%.4f %d\n", it.second, it.first);
-    }
+    debug("%.4f %d\n", it.second, it.first);
     weight record{it.first, "", it.second};
     weight_to_le(record);
     fwrite((void *)&record, 1, sizeof(weight), fst);
@@ -1079,6 +989,6 @@ LRXCompiler::write(FILE *fst)
 
   if(!outputGraph)
   {
-    fwprintf(stderr, L"%d: %d@%d\n", currentRuleId, transducer.size(), transducer.numberOfTransitions());
+    u_fprintf(debug_output, "%d: %d@%d\n", currentRuleId, transducer.size(), transducer.numberOfTransitions());
   }
 }
diff --git a/src/lrx_compiler.h b/src/lrx_compiler.h
index 099c4a7..04cbe1e 100644
--- a/src/lrx_compiler.h
+++ b/src/lrx_compiler.h
@@ -43,6 +43,8 @@
 #include <lttoolbox/trans_exe.h>
 #include <lttoolbox/my_stdio.h>
 
+#include <unicode/ustdio.h>
+
 using namespace std;
 
 class LRXCompiler
@@ -52,10 +54,10 @@ private:
   Alphabet alphabet;
   Transducer transducer;
 
-  map<wstring, Transducer> recognisers; // keyed on pattern
+  map<UString, Transducer> recognisers; // keyed on pattern
   map<int, double> weights; // keyed on rule id
 
-  map<wstring, Transducer> sequences;
+  map<UString, Transducer> sequences;
 
   int initialState;
   int lastState;
@@ -66,9 +68,11 @@ private:
 
   bool debugMode;
   bool outputGraph;
+  UFILE* debug_output;
+  void debug(const char* fmt, ...);
   bool allBlanks();
 
-  void skipBlanks(wstring &name);
+  void skipBlanks(UString &name);
   void procNode();
   void procList();
   void procListMatch();
@@ -82,43 +86,43 @@ private:
   void procSeq();
 
   /* If attrib does not exist (or other error), returns an empty string: */
-  wstring attrib(wstring const &name);
+  UString attrib(UString const &name);
 
   /* If attrib does not exist (or other error), returns fallback: */
-  wstring attrib(wstring const &name, const wstring fallback);
+  UString attrib(UString const &name, const UString fallback);
 
-  wstring itow(int i);
-  int wtoi(wstring);
-  double wtod(wstring);
+  UString itow(int i);
+  int wtoi(UString);
+  double wtod(UString);
 
 public:
-  static wstring const LRX_COMPILER_LRX_ELEM;
-  static wstring const LRX_COMPILER_DEFSEQS_ELEM;
-  static wstring const LRX_COMPILER_DEFSEQ_ELEM;
-  static wstring const LRX_COMPILER_RULES_ELEM;
-  static wstring const LRX_COMPILER_RULE_ELEM;
-  static wstring const LRX_COMPILER_MATCH_ELEM;
-  static wstring const LRX_COMPILER_SELECT_ELEM;
-  static wstring const LRX_COMPILER_REMOVE_ELEM;
-  static wstring const LRX_COMPILER_OR_ELEM;
-  static wstring const LRX_COMPILER_REPEAT_ELEM;
-  static wstring const LRX_COMPILER_SEQ_ELEM;
-
-  static wstring const LRX_COMPILER_SURFACE_ATTR;
-  static wstring const LRX_COMPILER_SUFFIX_ATTR;
-  static wstring const LRX_COMPILER_LEMMA_ATTR;
-  static wstring const LRX_COMPILER_CONTAINS_ATTR;
-  static wstring const LRX_COMPILER_CASE_ATTR;
-  static wstring const LRX_COMPILER_TAGS_ATTR;
-  static wstring const LRX_COMPILER_COMMENT_ATTR;
-  static wstring const LRX_COMPILER_NAME_ATTR;
-  static wstring const LRX_COMPILER_WEIGHT_ATTR;
-  static wstring const LRX_COMPILER_FROM_ATTR;
-  static wstring const LRX_COMPILER_UPTO_ATTR;
-
-  static wstring const LRX_COMPILER_TYPE_SELECT;
-  static wstring const LRX_COMPILER_TYPE_REMOVE;
-  static wstring const LRX_COMPILER_TYPE_SKIP;
+  static UString const LRX_COMPILER_LRX_ELEM;
+  static UString const LRX_COMPILER_DEFSEQS_ELEM;
+  static UString const LRX_COMPILER_DEFSEQ_ELEM;
+  static UString const LRX_COMPILER_RULES_ELEM;
+  static UString const LRX_COMPILER_RULE_ELEM;
+  static UString const LRX_COMPILER_MATCH_ELEM;
+  static UString const LRX_COMPILER_SELECT_ELEM;
+  static UString const LRX_COMPILER_REMOVE_ELEM;
+  static UString const LRX_COMPILER_OR_ELEM;
+  static UString const LRX_COMPILER_REPEAT_ELEM;
+  static UString const LRX_COMPILER_SEQ_ELEM;
+
+  static UString const LRX_COMPILER_SURFACE_ATTR;
+  static UString const LRX_COMPILER_SUFFIX_ATTR;
+  static UString const LRX_COMPILER_LEMMA_ATTR;
+  static UString const LRX_COMPILER_CONTAINS_ATTR;
+  static UString const LRX_COMPILER_CASE_ATTR;
+  static UString const LRX_COMPILER_TAGS_ATTR;
+  static UString const LRX_COMPILER_COMMENT_ATTR;
+  static UString const LRX_COMPILER_NAME_ATTR;
+  static UString const LRX_COMPILER_WEIGHT_ATTR;
+  static UString const LRX_COMPILER_FROM_ATTR;
+  static UString const LRX_COMPILER_UPTO_ATTR;
+
+  static UString const LRX_COMPILER_TYPE_SELECT;
+  static UString const LRX_COMPILER_TYPE_REMOVE;
+  static UString const LRX_COMPILER_TYPE_SKIP;
 
   static double  const LRX_COMPILER_DEFAULT_WEIGHT;
 
diff --git a/src/lrx_proc.cc b/src/lrx_proc.cc
index bd77260..32ac345 100644
--- a/src/lrx_proc.cc
+++ b/src/lrx_proc.cc
@@ -92,7 +92,8 @@ int main(int argc, char *argv[])
     }
   }
 
-  FILE *input = stdin, *output = stdout;
+  InputFile input;
+  UFILE* output = u_finit(stdout, NULL, NULL);
   LtLocale::tryToSetLocale();
 
   if(optind == (argc - 3))
@@ -103,14 +104,12 @@ int main(int argc, char *argv[])
       endProgram(argv[0]);
     }
 
-    input = fopen(argv[optind+1], "rb");
-    if(input == NULL || ferror(input))
-    {
+    if (!input.open(argv[optind+1])) {
       endProgram(argv[0]);
     }
 
-    output= fopen(argv[optind+2], "wb");
-    if(output == NULL || ferror(output))
+    output = u_fopen(argv[optind+2], "w", NULL, NULL);
+    if(output == NULL)
     {
       endProgram(argv[0]);
     }
@@ -126,9 +125,7 @@ int main(int argc, char *argv[])
       endProgram(argv[0]);
     }
 
-    input = fopen(argv[optind+1], "rb");
-    if(input == NULL || ferror(input))
-    {
+    if (!input.open(argv[optind+1])) {
       endProgram(argv[0]);
     }
 
@@ -150,14 +147,8 @@ int main(int argc, char *argv[])
     endProgram(argv[0]);
   }
 
-#ifdef _MSC_VER
-        _setmode(_fileno(input), _O_U8TEXT);
-        _setmode(_fileno(output), _O_U8TEXT);
-#endif
-
   lrxp.init();
   lrxp.process(input, output);
-  fclose(input);
-  fclose(output);
+  u_fclose(output);
   return EXIT_SUCCESS;
 }
diff --git a/src/lrx_processor.cc b/src/lrx_processor.cc
index 276c6ba..3501d02 100644
--- a/src/lrx_processor.cc
+++ b/src/lrx_processor.cc
@@ -20,18 +20,17 @@
 #include <cstdint>
 using namespace std;
 
-wstring const LRXProcessor::LRX_PROCESSOR_TAG_SELECT     = L"<select>";
-wstring const LRXProcessor::LRX_PROCESSOR_TAG_REMOVE     = L"<remove>";
-wstring const LRXProcessor::LRX_PROCESSOR_TAG_SKIP       = L"<skip>";
+UString const LRXProcessor::LRX_PROCESSOR_TAG_SELECT     = "<select>"_u;
+UString const LRXProcessor::LRX_PROCESSOR_TAG_REMOVE     = "<remove>"_u;
+UString const LRXProcessor::LRX_PROCESSOR_TAG_SKIP       = "<skip>"_u;
 
-wstring
+UString
 LRXProcessor::itow(int i)
 {
-  // Convert an int to a wstring
-  wchar_t buf[50];
-  memset(buf, '\0', sizeof(buf));
-  swprintf(buf, 50, L"%d", i);
-  wstring id(buf);
+  // Convert an int to a UString
+  UChar buf[50];
+  u_snprintf(buf, 50, "%d", i);
+  UString id(buf);
   return id;
 }
 
@@ -82,34 +81,21 @@ LRXProcessor::load(FILE *in)
 
   while(len > 0)
   {
-    int len2 = Compression::multibyte_read(in);
-    wstring name = L"";
-    while(len2 > 0)
-    {
-      name += static_cast<wchar_t>(Compression::multibyte_read(in));
-      len2--;
-    }
+    UString name = Compression::string_read(in);
     recognisers[name].read(in, alphabet);
     if(debugMode)
     {
-      fwprintf(stderr, L"Recogniser: %S, [finals: %d]\n", name.c_str(), recognisers[name].getFinals().size());
+      cerr << "Recogniser: " << name << ", [finals: " << recognisers[name].getFinals().size() << "]\n";
     }
     len--;
   }
 
   if(debugMode)
   {
-    fwprintf(stderr, L"recognisers: %d\n", recognisers.size());
+    cerr << "recognisers: " << recognisers.size() << endl;
   }
 
-  int len3 = Compression::multibyte_read(in);
-
-  wstring name = L"";
-  while(len3 > 0)
-  {
-    name += static_cast<wchar_t>(Compression::multibyte_read(in));
-    len3--;
-  }
+  UString name = Compression::string_read(in);
 
   transducer.read(in, alphabet);
 
@@ -118,13 +104,15 @@ LRXProcessor::load(FILE *in)
   while(fread(&record, sizeof(weight), 1, in))
   {
     weight_from_le(record);
-    wstring sid = L"<" + itow(record.id) + L">";
+    UString sid = "<"_u + itow(record.id) + ">"_u;
     weights[sid] = record.pisu;
 
+    /*
     if(debugMode)
     {
-      //fwprintf(stderr, L"%S %d weight(%.4f)\n", sid.c_str(), record.id, record.pisu);
+      cerr << sid << " " << record.id << " weight(" << record.pisu << ")\n";
     }
+    */
   }
 
   return;
@@ -137,42 +125,26 @@ LRXProcessor::init()
 
   anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end());
 
-  escaped_chars.insert(L'[');
-  escaped_chars.insert(L']');
-  escaped_chars.insert(L'{');
-  escaped_chars.insert(L'}');
-  escaped_chars.insert(L'^');
-  escaped_chars.insert(L'$');
-  escaped_chars.insert(L'/');
-  escaped_chars.insert(L'\\');
-  escaped_chars.insert(L'@');
-  escaped_chars.insert(L'<');
-  escaped_chars.insert(L'>');
+  escaped_chars.insert('[');
+  escaped_chars.insert(']');
+  escaped_chars.insert('{');
+  escaped_chars.insert('}');
+  escaped_chars.insert('^');
+  escaped_chars.insert('$');
+  escaped_chars.insert('/');
+  escaped_chars.insert('\\');
+  escaped_chars.insert('@');
+  escaped_chars.insert('<');
+  escaped_chars.insert('>');
 
 }
 
-wstring
-LRXProcessor::readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2)
-{
-  wstring result = L"";
-  result += delim1;
-  wchar_t c = delim1;
-
-  while(!feof(input) && c != delim2)
-  {
-    c = static_cast<wchar_t>(fgetwc_unlocked(input));
-    result += c;
-  }
-
-  return result;
-}
-
 bool
-LRXProcessor::recognisePattern(const wstring lu, const wstring op)
+LRXProcessor::recognisePattern(const UString lu, const UString op)
 {
   if(recognisers.count(op) < 1)
   {
-    fwprintf(stderr, L"WARNING: Recogniser not found for key %S, skipping... [LU: %S]\n", op.c_str(), lu.c_str());
+    cerr << "WARNING: Recogniser not found for key " << op << ", skipping... [LU: " << lu << "]" << endl;
     return false;
   }
 
@@ -184,14 +156,14 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op)
   end_states.insert(recognisers[op].getFinals().begin(), recognisers[op].getFinals().end());
 
   bool readingTag = false;
-  wstring tag = L"";
+  UString tag;
   int val = 0;
   for(auto& it : lu)
   {
 /*
     if(debugMode)
     {
-      fwprintf(stderr, L"alive: %d\n", cur.size());
+      cerr << "alive: " << cur.size() << endl;
     }
 */
     if(cur.size() < 1)  // I think that any time we have 0 alive states,
@@ -199,29 +171,29 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op)
     {
       return false;
     }
-    if(it == L'<')
+    if(it == '<')
     {
-      tag = L"";
+      tag.clear();
       readingTag = true;
-      tag = tag + it;
+      tag += it;
       continue;
     }
-    if(it == L'>')
+    if(it == '>')
     {
       tag = tag + it;
       val = static_cast<int>(alphabet(tag));
       if(val == 0)
       {
-        val = static_cast<int>(alphabet(L"<ANY_TAG>"));
+        val = static_cast<int>(alphabet("<ANY_TAG>"_u));
       }
 /*
       if(debugMode)
       {
-        fwprintf(stderr, L":: tag %S: %d\n", tag.c_str(), val);
-        fwprintf(stderr, L"  step: %S\n", tag.c_str());
+        cerr << ":: tag " << tag << ": " << val << endl;
+        cerr << "  step: " << tag << endl;
       }
 */
-      cur.step(val, alphabet(L"<ANY_TAG>"));
+      cur.step(val, alphabet("<ANY_TAG>"_u));
       readingTag = false;
       continue;
     }
@@ -236,21 +208,21 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op)
 /*
       if(debugMode)
       {
-        fwprintf(stderr, L"  step: %C\n", val);
+        cerr << "  step: " << val << endl;
       }
 */
-      //cur.step(val, a(L"<ANY_CHAR>"));
+      //cur.step(val, a("<ANY_CHAR>"));
       //cur.step(val);
       set<int> alts;
       if(!iswupper(val))
       {
-        alts.insert(alphabet(L"<ANY_CHAR>"));
-        alts.insert(alphabet(L"<ANY_LOWER>"));
+        alts.insert(alphabet("<ANY_CHAR>"_u));
+        alts.insert(alphabet("<ANY_LOWER>"_u));
       }
       else
       {
-        alts.insert(alphabet(L"<ANY_CHAR>"));
-        alts.insert(alphabet(L"<ANY_UPPER>"));
+        alts.insert(alphabet("<ANY_CHAR>"_u));
+        alts.insert(alphabet("<ANY_UPPER>"_u));
         alts.insert(towlower(val));
       }
       cur.step(val, alts);
@@ -261,7 +233,7 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op)
 /*
   if(debugMode)
   {
-    fwprintf(stderr, L">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n");
+    cerr << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n";
   }
 */
   if(cur.isFinal(end_states))
@@ -272,541 +244,29 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op)
   return false;
 }
 
-/*
-void
-LRXProcessor::processFlush(FILE *output,
-                           map<int, wstring > &sl,
-                           map<int, vector<wstring> > &tl,
-                           map<int, wstring > &blanks,
-                           map<int, pair<double, vector<State> > > &covers,
-                           pair<double, vector<State> > &empty_seq,
-                           map<pair<int, int>, vector<State> > &spans,
-                           int last_final)
-{
-  if(debugMode)
-  {
-    fwprintf(stderr, L"FLUSH:\n");
-  }
-
-  map<int, pair<double, vector<State> > >::iterator it;
-  map<int, pair<wstring, wstring> > operations;
-
-  for(it = covers.begin(); it != covers.end(); it++)
-  {
-    pair<double, vector<State> > best = it->second;
-    if(debugMode)
-    {
-      fwprintf(stderr, L"===================================================\n");
-      fwprintf(stderr, L"[%d][%d] covers[%d] best (score: %d, size: %d)\n", pos, last_final, it->first, best.first, best.second.size());
-    }
-
-    // return M[i-1]
-    if(it->first == last_final)
-    {
-      vector<State>::iterator it2;
-      for(it2 = best.second.begin(); it2 != best.second.end(); it2++)
-      {
-        if(debugMode)
-        {
-          wstring out = it2->filterFinals(anfinals, alphabet, escaped_chars);
-          fwprintf(stderr, L"!!!    filter_finals: %S\n", out.c_str());
-        }
-        set<pair<wstring, vector<wstring> > > outpaths;
-        outpaths = it2->filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0);
-
-        int j = 1;
-        set<pair<wstring, vector<wstring> > >::iterator it3;
-        for(it3 = outpaths.begin(); it3 != outpaths.end(); it3++)
-        {
-          wstring id = it3->first;
-          vector<wstring> ops = it3->second;
-          vector<wstring>::iterator op;
-          for(op = ops.begin(); op != ops.end(); op++)
-          {
-            if(*op != LRX_PROCESSOR_TAG_SKIP)
-            {
-              int starting_point = -1;
-              map<pair<int, int>, vector<State> >::iterator ix;
-              for(ix = spans.begin(); ix != spans.end(); ix++)
-              {
-                vector<State>::iterator iy;
-                for(iy = ix->second.begin(); iy != ix->second.end(); iy++)
-                {
-                  set<pair<wstring, vector<wstring> > > y;
-                  y = iy->filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0);
-                  if(y == outpaths)
-                  {
-                    starting_point = ix->first.first;
-                  }
-                }
-              }
-              if(debugMode)
-              {
-                fwprintf(stderr, L"=> APPLY [pos: %d, dep: %d, j: %d, start: %d, len: %d]: %S // %S\n", pos, starting_point, j, starting_point+j, ops.size(), id.c_str(), op->c_str());
-              }
-              operations[starting_point+j].first = id;
-              operations[starting_point+j].second = *op;
-            }
-            j++;
-          }
-        }
-        if(debugMode)
-        {
-          fwprintf(stderr, L"[best: %d, outpaths: %d]\n", best.first, outpaths.size());
-        }
-      }
-    }
-  }
-
-  covers.clear();
-  covers[-1] = empty_seq;
-  covers[-1].first = 0;
-
-  // Here we actually apply the rules that we've matched
-
-  unsigned int spos = 0;
-  for(spos = 0; spos <= pos; spos++)
-  {
-    if(sl[spos] == L"")
-    {
-      continue;
-    }
-    wstring  op = operations[spos].second;
-    wstring  tipus = L"";
-    if(op.find(LRX_PROCESSOR_TAG_SELECT) != wstring::npos)
-    {
-      tipus = LRX_PROCESSOR_TAG_SELECT;
-    }
-    if(op.find(LRX_PROCESSOR_TAG_REMOVE) != wstring::npos)
-    {
-      tipus = LRX_PROCESSOR_TAG_REMOVE;
-    }
-    if(debugMode)
-    {
-      fwprintf(stderr, L"#APPL%S. %S\n", tipus.c_str(), op.c_str());
-    }
-
-    fwprintf(output, L"%S^%S/", blanks[spos].c_str(), sl[spos].c_str());
-
-    vector<wstring>::iterator ti;
-    vector<wstring>::iterator penum = tl[spos].end(); penum--;
-
-    if(tipus == LRX_PROCESSOR_TAG_SELECT && tl[spos].size() > 1)
-    {
-      bool matched = true;
-      bool selected = false;
-      for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
-      {
-        matched = recognisePattern(*ti, op);
-        if(matched)
-        {
-          if(traceMode || debugMode)
-          {
-            fwprintf(stderr, L"%d:SELECT%S:%S:%S\n", lineno, operations[spos].first.c_str(), sl[spos].c_str(), op.c_str());
-          }
-          fwprintf(output, L"%S", ti->c_str());
-          selected = true;
-          break;
-        }
-      }
-      if(!selected)
-      {
-        for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
-        {
-          fwprintf(output, L"%S", ti->c_str());
-          if(ti != penum)
-          {
-            fwprintf(output, L"/");
-          }
-        }
-      }
-    }
-    else if(tipus == LRX_PROCESSOR_TAG_REMOVE && tl[spos].size() > 1)
-    {
-      bool matched = true;
-      vector<wstring> new_tl;  // The new list of TL translations
-      for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
-      {
-        matched = recognisePattern(*ti, op);
-        if(matched)
-        {
-          if(traceMode || debugMode)
-          {
-            fwprintf(stderr, L"%d:REMOVE%S:%S:%S\n", lineno, operations[spos].first.c_str(), sl[spos].c_str(), op.c_str());
-          }
-          continue;
-        }
-        new_tl.push_back(*ti);
-      }
-      vector<wstring>::iterator nti;
-      vector<wstring>::iterator npenum = new_tl.end(); npenum--;
-      for(nti = new_tl.begin(); nti != new_tl.end(); nti++)
-      {
-        fwprintf(output, L"%S", nti->c_str());
-        if(nti != npenum)
-        {
-          fwprintf(output, L"/");
-        }
-      }
-      new_tl.clear();
-    }
-    else
-    {
-      for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
-      {
-        fwprintf(output, L"%S", ti->c_str());
-        if(ti != penum)
-        {
-          fwprintf(output, L"/");
-        }
-      }
-    }
-    fwprintf(output, L"$");
-    if(debugMode)
-    {
-      fwprintf(output, L"%d", spos);
-    }
-  }
-}
-*/
-
-/*
-void
-LRXProcessor::process(FILE *input, FILE *output)
-{
-  bool isEscaped = false;
-
-  map<int, wstring > sl; // map of SL words
-  map<int, vector<wstring> > tl; // map of vectors of TL translations
-  map<int, wstring > blanks; // map of the superblanks
-
-  map<int, pair<double, vector<State> > > covers ;
-  pair<double, vector<State> > empty_seq;
-  map<pair<int, int>, vector<State> > spans ;
-
-  covers[-1] = empty_seq;
-  covers[-1].first = 1.0;
-
-  vector<State> alive_states_clean ;
-  vector<State> alive_states = alive_states_clean ;
-  alive_states.push_back(*initial_state);
-  vector<State> new_states;
-
-  int last_final = -1; // check what we actually use this for
-
-  while(!feof(input))
-  {
-    int val = fgetwc_unlocked(input);
-
-    if(nullFlush && val == L'\0')
-    {
-      processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final);
-      fwprintf(output, L"%S", blanks[pos].c_str());
-      pos = 0;
-      last_final = 0;
-      tl.clear();
-      sl.clear();
-      blanks.clear();
-      spans.clear();
-
-      fputwc_unlocked(val, output);
-      fflush(output);
-      continue;
-    }
-
-    // We're starting to read a new lexical form
-    if(val == L'^' && !isEscaped && outOfWord)
-    {
-      outOfWord = false;
-      continue;
-    }
-
-    // We've seen the surface form
-    if(val == L'/' && !isEscaped && !outOfWord)
-    {
-      // Read in target equivalences
-      wstring trad = L"";
-      val = fgetwc_unlocked(input);
-      while(val != L'$')
-      {
-        if(val != L'$')
-        {
-          trad += static_cast<wchar_t>(val);
-        }
-        if(val == L'/')
-        {
-          tl[pos].push_back(trad.substr(0, trad.length()-1));
-          trad = L"";
-        }
-        val = fgetwc_unlocked(input);
-      }
-      tl[pos].push_back(trad);
-
-      if(debugMode)
-      {
-        for(vector<wstring>::iterator it = tl[pos].begin(); it != tl[pos].end(); it++)
-        {
-          fwprintf(stderr, L"trad[%d]: %S\n", pos, it->c_str());
-        }
-      }
-    }
-
-    // We've finished reading a lexical form
-    if((feof(input) || val == L'$') && !isEscaped && !outOfWord)
-    {
-      if(debugMode)
-      {
-        fwprintf(stderr, L"[POS] %d: [sl %d ; tl %d ; bl %d]\n", pos, sl[pos].size(), tl[pos].size(), blanks[pos].size());
-      }
-
-      new_states.clear(); // alive_states_new
-      pair<double, vector<State> > new_best_cover;
-      new_best_cover.first = -numeric_limits<int>::max();
-
-      vector<int> matched_rules;
-
-      // \forall s \in A
-      for(vector<State>::const_iterator it = alive_states.begin(); it != alive_states.end(); it++)
-      {
-        State s = *it;
-        // \IF \exists c \in Q : \delta(s, sent[i]) = c
-        s.step(alphabet(L"<$>"));
-
-        // A \gets A \cup {c}
-        if(s.size() > 0) // If the current state has outgoing transitions,
-                         // add it to the new alive states
-        {
-          new_states.push_back(s);
-        }
-        s.step(alphabet(L"<$>"));
-
-        // \IF c \in F
-        if(s.isFinal(anfinals))
-        {
-          // We've reached a final state, so we need to evaluate the rule we've matched
-          if(debugMode)
-          {
-            wstring out = s.filterFinals(anfinals, alphabet, escaped_chars);
-            fwprintf(stderr, L"    filter_finals: %S\n", out.c_str());
-          }
-
-          set<pair<wstring, vector<wstring> > > outpaths;
-          outpaths = s.filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0);
-
-          set<pair<wstring, vector<wstring> > >::iterator it;
-          for(it = outpaths.begin(); it != outpaths.end(); it++)
-          {
-            vector<State> reached;
-
-            vector<wstring> path = (*it).second;
-            wstring id = (*it).first;
-
-            if(debugMode)
-            {
-              fwprintf(stderr, L"id:      %S:\n", id.c_str());
-              for(vector<wstring>::iterator it2 = path.begin(); it2 != path.end(); it2++)
-              {
-                fwprintf(stderr, L"op:        %S\n", it2->c_str());
-              }
-              fwprintf(stderr, L"#SPAN[%d, %d]\n", (pos-path.size()), pos);
-            }
-
-            spans[make_pair((pos-path.size()), pos)].push_back(s);
-
-            // M[i-ChunkLength(c)]
-            pair<double, vector<State> > newseq = covers[(pos - path.size())];
-            newseq.first = newseq.first + path.size() ;
-
-            if(newseq.first > new_best_cover.first)
-            {
-              State new_state;
-              new_state = s;
-              reached.push_back(new_state);
-              map<int, pair<double, vector<State> > >::iterator k;
-              for(k = covers.begin(); k != covers.end(); k++)
-              {
-                vector<State>::iterator l;
-                pair<double, vector<State> > p = k->second;
-                for(l = p.second.begin(); l != p.second.end(); l++)
-                {
-                  if(debugMode)
-                  {
-                    fwprintf(stderr, L"= [cov: %d][len: %d][pos: %d][pat: %d] INCLUDE FINALS?\n", k->first, p.first, pos, path.size());
-                  }
-                  if(k->first <= (pos - path.size()))
-                  {
-                    if(debugMode)
-                    {
-                      wstring out2 = l->filterFinals(anfinals, alphabet, escaped_chars);
-                      fwprintf(stderr, L"    == INCLUDE FINALS: %S\n", out2.c_str());
-                    }
-                    reached.push_back(*l);
-                  }
-                }
-              }
-              newseq.second = reached;
-              new_best_cover = newseq;
-              covers[pos] = newseq;
-              if(debugMode)
-              {
-                fwprintf(stderr, L"++ FINALS(%d) covers[%d] [%d, %d] BEST: %.4f > %.4f\n", newseq.second.size(), (pos - path.size()), pos, path.size(), newseq.first, new_best_cover.first);
-              }
-            }
-
-            last_final = pos;
-          }
-        }
-      }
-
-      alive_states.swap(new_states);
-      alive_states.push_back(*initial_state);
-
-      if(debugMode)
-      {
-        fwprintf(stderr, L"#CURRENT_ALIVE: %d\n", alive_states.size());
-      }
-
-      if(alive_states.size() == 1)
-      {
-        // If we have only a single alive state, it means no rules are
-        // active, and we can flush the buffers.
-        processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final);
-
-        pos = 0;
-        last_final = 0;
-        tl.clear();
-        sl.clear();
-        blanks.clear();
-        spans.clear();
-      }
-
-      pos++;
-      if(debugMode)
-      {
-        fwprintf(stderr, L"==> new pos: %d\n", pos);
-      }
-
-      outOfWord = true;
-      continue;
-    }
-
-
-    // We're reading a tag
-    if(val == L'<' && !isEscaped && !outOfWord)
-    {
-      wstring tag = L"";
-      tag = readFullBlock(input, L'<', L'>');
-      sl[pos] = sl[pos] + tag;
-      val = static_cast<int>(alphabet(tag));
-      if(val == 0)
-      {
-        val = static_cast<int>(alphabet(L"<ANY_TAG>"));
-      }
-      if(debugMode)
-      {
-        fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val);
-      }
-    }
-
-    if(!outOfWord)
-    {
-      if(debugMode)
-      {
-        fwprintf(stderr, L"outOfWord = false\n");
-      }
-
-      new_states.clear();
-      wstring res = L"";
-      for(vector<State>::const_iterator it = alive_states.begin(); it != alive_states.end(); it++)
-      {
-        res = L"";
-        State s = *it;
-        if(val < 0)
-        {
-          alphabet.getSymbol(res, val,  false);
-          if(debugMode)
-          {
-            fwprintf(stderr, L"  step: %S\n", res.c_str());
-          }
-          s.step(val, alphabet(L"<ANY_TAG>"));
-        }
-        else
-        {
-          if(debugMode)
-          {
-            fwprintf(stderr, L"  step: %C\n", val);
-          }
-          s.step_case(val, alphabet(L"<ANY_CHAR>"), false);
-        }
-        if(s.size() > 0) // If the current state has outgoing transitions, add it to the new alive states
-        {
-          new_states.push_back(s);
-        }
-      }
-      if(debugMode)
-      {
-        fwprintf(stderr, L"new_states: %d\n", new_states.size());
-      }
-      alive_states.swap(new_states);
-      alive_states.push_back(*initial_state);
-
-    }
-
-    // We're still reading a surface form
-    if(val > 0 && val != L'$' && !isEscaped && !outOfWord)
-    {
-      sl[pos] = sl[pos] + static_cast<wchar_t>(val);
-    }
-
-    // Reading a superblank
-    if(outOfWord)
-    {
-      if(!feof(input))
-      {
-        blanks[pos] = blanks[pos] + static_cast<wchar_t>(val);
-      }
-      if(debugMode)
-      {
-        //fwprintf(stderr, L"blanks[%d] = %S\n", pos, blanks[pos].c_str());
-      }
-    }
-
-    // Increment the current line number (for rule tracing)
-    if(val == L'\n')
-    {
-      lineno++;
-    }
-  }
-
-  processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final);
-
-  fwprintf(output, L"%S", blanks[pos].c_str());
-}
-*/
-
 void
-LRXProcessor::process(FILE *input, FILE *output)
+LRXProcessor::process(InputFile& input, UFILE *output)
 {
   bool isEscaped = false;
 
-  map<int, wstring > sl; // map of SL words
-  map<int, vector<wstring> > tl; // map of vectors of TL translations
-  map<int, wstring > blanks; // map of the superblanks
+  map<int, UString > sl; // map of SL words
+  map<int, vector<UString> > tl; // map of vectors of TL translations
+  map<int, UString > blanks; // map of the superblanks
 
-  map<int, map<wstring, double> > scores; //
-  map<int, map<wstring, OpType> > operations;
+  map<int, map<UString, double> > scores; //
+  map<int, map<UString, OpType> > operations;
 
   vector<State*> alive_states ;
   alive_states.push_back(new State(*initial_state));
 
-  int val = 0;
-  while((val = fgetwc_unlocked(input)) != EOF && val != WEOF)
+  int32_t val = 0;
+  while((val = input.get()) != U_EOF)
   {
 
-    if(nullFlush && val == L'\0')
+    if(nullFlush && val == '\0')
     {
       processFlush(output, sl, tl, blanks, scores, operations);
-      fwprintf(output, L"%S", blanks[pos].c_str());
+      u_fprintf(output, "%S", blanks[pos].c_str());
       pos = 0;
       tl.clear();
       sl.clear();
@@ -816,63 +276,62 @@ LRXProcessor::process(FILE *input, FILE *output)
       alive_states.clear();
       alive_states.push_back(new State(*initial_state));
 
-      fputwc_unlocked(val, output);
-      fflush(output);
+      u_fputc(val, output);
+      u_fflush(output);
       continue;
     }
 
     // We're starting to read a new lexical form
-    if(val == L'^' && !isEscaped && outOfWord)
+    if(val == '^' && !isEscaped && outOfWord)
     {
       outOfWord = false;
       continue;
     }
 
     // We've seen the surface form
-    if(val == L'/' && !isEscaped && !outOfWord)
+    if(val == '/' && !isEscaped && !outOfWord)
     {
       // Read in target equivalences
-      wstring trad = L"";
-      val = fgetwc_unlocked(input);
-      while(val != L'$' && val != EOF && val != WEOF)
+      UString trad;
+      val = input.get();
+      while(val != '$' && val != U_EOF)
       {
-        if(val != L'$')
+        if(val != '$')
         {
-          trad += static_cast<wchar_t>(val);
+          trad += val;
         }
-        if(val == L'/')
+        if(val == '/')
         {
           tl[pos].push_back(trad.substr(0, trad.length()-1));
-          trad = L"";
+          trad.clear();
         }
-        val = fgetwc_unlocked(input);
+        val = input.get();
       }
       tl[pos].push_back(trad);
 
       if(debugMode)
       {
-        for(auto& it : tl[pos])
-        {
-          fwprintf(stderr, L"trad[%d]: %S\n", pos, it.c_str());
+        for(auto& it : tl[pos]) {
+          cerr << "trad[" << pos << "]: " << it << endl;
         }
       }
     }
 
-    if((feof(input) || val == L'$') && !isEscaped && !outOfWord)
+    if((input.eof() || val == '$') && !isEscaped && !outOfWord)
     {
       if(debugMode)
       {
-        fwprintf(stderr, L"[POS] %d: [sl %d ; tl %d ; bl %d]: %S\n", pos, sl[pos].size(), tl[pos].size(), blanks[pos].size(), sl[pos].c_str());
+        cerr << "[POS] " << pos << ": [sl " << sl[pos].size() << " ; tl " << tl[pos].size() << " ; bl " << blanks[pos].size() << "]: " << sl[pos] << endl;
       }
       {
         vector<State *> new_states; // TODO: Can we avoid the State-copying here?
         // \forall s \in A
-        set<wstring> seen_ids;
+        set<UString> seen_ids;
         for(auto& it : alive_states)
         {
           State s = *it;
           // \IF \exists c \in Q : \delta(s, sent[i]) = c
-          s.step(alphabet(L"<$>"));
+          s.step(alphabet("<$>"_u));
 
           // A \gets A \cup {c}
           if (s.size() > 0) // If the current state has outgoing transitions,
@@ -880,7 +339,7 @@ LRXProcessor::process(FILE *input, FILE *output)
           {
             new_states.push_back(new State(s));
           }
-          s.step(alphabet(L"<$>"));
+          s.step(alphabet("<$>"_u));
 
           // \IF c \in F
           if (s.isFinal(anfinals))
@@ -888,18 +347,18 @@ LRXProcessor::process(FILE *input, FILE *output)
             // We've reached a final state, so we need to evaluate the rule we've matched
             if (debugMode)
             {
-              wstring out = s.filterFinals(anfinals, alphabet, escaped_chars);
-              fwprintf(stderr, L"    filter_finals: %S\n", out.c_str());
+              UString out = s.filterFinals(anfinals, alphabet, escaped_chars);
+              cerr << "    filter_finals: " << out << endl;
             }
 
-            set<pair<wstring, vector<wstring>>> outpaths;
+            set<pair<UString, vector<UString>>> outpaths;
             outpaths = s.filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0);
 
             for (auto& it : outpaths)
             {
               vector<State> reached;
-              vector<wstring> path = it.second;
-              wstring id = it.first;
+              vector<UString> path = it.second;
+              UString id = it.first;
 
               if (seen_ids.find(id) != seen_ids.end())
               {
@@ -911,13 +370,14 @@ LRXProcessor::process(FILE *input, FILE *output)
 
               if (debugMode)
               {
-                fwprintf(stderr, L"id:      %S: (lambda: %.5f)\n", id.c_str(), weights[id.c_str()]);
+                cerr << "id:      " << id << ": (lambda: ";
+                cerr << weights[id] << ")\n";
               }
               for (auto& it2 : path)
               {
                 if (debugMode)
                 {
-                  fwprintf(stderr, L"op:        %S\n", it2.c_str());
+                  cerr << "op:        " << it2 << endl;
                 }
                 if (it2 != LRX_PROCESSOR_TAG_SKIP)
                 {
@@ -928,9 +388,10 @@ LRXProcessor::process(FILE *input, FILE *output)
                   scores[j][it2] += weights[id.c_str()];
                   if (debugMode)
                   {
-                    fwprintf(stderr, L"#[%d]SCORE %.5f / %S\n", j, scores[j][it2], it2.c_str());
+                    cerr << "#[" << j << "]SCORE " << scores[j][it2] << " / ";
+                    cerr << it2 << endl;
                   }
-                  if(it2.at(0) == L'<' && it2.at(1) == L'r') {
+                  if(it2.at(0) == '<' && it2.at(1) == 'r') {
                     operations[j][it2] = Remove;
                   }
                   else {
@@ -939,7 +400,7 @@ LRXProcessor::process(FILE *input, FILE *output)
                 }
                 j++;
               }
-              // fwprintf(stderr, L"#SPAN[%d, %d]\n", (pos-path.size()), pos);
+              // cerr << "#SPAN[" << (pos-path.size()) << ", " << pos << "]\n";
             }
           }
         }
@@ -953,13 +414,12 @@ LRXProcessor::process(FILE *input, FILE *output)
 
         if (debugMode)
         {
-          fwprintf(stderr, L"seen:");
-          for (auto& it : seen_ids)
-          {
-            fwprintf(stderr, L" %S ", it.c_str());
+          cerr << "seen:";
+          for (auto& it : seen_ids) {
+            cerr << " " << it << " ";
           }
-          fwprintf(stderr, L"\n");
-          fwprintf(stderr, L"#CURRENT_ALIVE: %d\n", alive_states.size());
+          cerr << endl;
+          cerr << "#CURRENT_ALIVE: " << alive_states.size() << endl;
         }
       }
 
@@ -970,7 +430,7 @@ LRXProcessor::process(FILE *input, FILE *output)
 
         if(debugMode)
         {
-          fwprintf(stderr, L"FLUSH:\n");
+          cerr << "FLUSH:" << endl;
         }
 
 
@@ -988,7 +448,7 @@ LRXProcessor::process(FILE *input, FILE *output)
       pos++;
       if(debugMode)
       {
-        fwprintf(stderr, L"==> new pos: %d\n", pos);
+        cerr << "==> new pos: " << pos << endl;
       }
 
       outOfWord = true;
@@ -996,19 +456,18 @@ LRXProcessor::process(FILE *input, FILE *output)
     }
 
     // We're reading a tag
-    if(val == L'<' && !isEscaped && !outOfWord)
+    if(val == '<' && !isEscaped && !outOfWord)
     {
-      wstring tag = L"";
-      tag = readFullBlock(input, L'<', L'>');
+      UString tag = input.readBlock('<', '>');
       sl[pos] = sl[pos] + tag;
       val = static_cast<int>(alphabet(tag));
       if(val == 0)
       {
-        val = static_cast<int>(alphabet(L"<ANY_TAG>"));
+        val = static_cast<int>(alphabet("<ANY_TAG>"_u));
       }
       if(debugMode)
       {
-        fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val);
+        cerr << "tag " << tag << ": " << val << "\n";
       }
     }
 
@@ -1016,39 +475,39 @@ LRXProcessor::process(FILE *input, FILE *output)
     {
       if(debugMode)
       {
-        fwprintf(stderr, L"outOfWord = false\n");
+        cerr << "outOfWord = false\n";
       }
 
-      wstring res = L"";
+      UString res;
       for(auto& s : alive_states)
       {
-        res = L"";
+        res.clear();
         if(val < 0)
         {
           alphabet.getSymbol(res, val,  false);
           if(debugMode)
           {
-            fwprintf(stderr, L"  step: %S\n", res.c_str());
+            cerr << "  step: " << res << endl;
           }
-          s->step(val, alphabet(L"<ANY_TAG>"));
+          s->step(val, alphabet("<ANY_TAG>"_u));
         }
         else
         {
 
           set<int> alts;
-          alts.insert(alphabet(L"<ANY_CHAR>"));
+          alts.insert(alphabet("<ANY_CHAR>"_u));
           if(iswupper(val))
           {
             alts.insert(towlower(val));
-            alts.insert(alphabet(L"<ANY_UPPER>"));
+            alts.insert(alphabet("<ANY_UPPER>"_u));
           }
           else
           {
-            alts.insert(alphabet(L"<ANY_LOWER>"));
+            alts.insert(alphabet("<ANY_LOWER>"_u));
           }
           if(debugMode)
           {
-            fwprintf(stderr, L"  step: %C [alts: %d]\n", val, alts.size());
+            cerr << "  step: " << val << " [alts: " << alts.size() << "]\n";
           }
           s->step(val, alts);
         }
@@ -1057,26 +516,28 @@ LRXProcessor::process(FILE *input, FILE *output)
     }
 
     // We're still reading a surface form
-    if(val > 0 && val != L'$' && !isEscaped && !outOfWord)
+    if(val > 0 && val != '$' && !isEscaped && !outOfWord)
     {
-      sl[pos] = sl[pos] + static_cast<wchar_t>(val);
+      sl[pos] += val;
     }
 
     // Reading a superblank
     if(outOfWord)
     {
-      if(!feof(input))
+      if(!input.eof())
       {
-        blanks[pos] = blanks[pos] + static_cast<wchar_t>(val);
+        blanks[pos] += val;
       }
+      /*
       if(debugMode)
       {
-        //fwprintf(stderr, L"blanks[%d] = %S\n", pos, blanks[pos].c_str());
+        cerr << "blanks[" << pos << "] = " << blanks[pos] << endl;
       }
+      */
     }
 
     // Increment the current line number (for rule tracing)
-    if(val == L'\n')
+    if(val == '\n')
     {
       lineno++;
     }
@@ -1084,42 +545,42 @@ LRXProcessor::process(FILE *input, FILE *output)
   }
 
   processFlush(output, sl, tl, blanks, scores, operations);
-  fwprintf(output, L"%S", blanks[pos].c_str());
+  write(blanks[pos], output);
 }
 
 void
-LRXProcessor::processFlush(FILE *output,
-                             map<int, wstring > &sl,
-                             map<int, vector<wstring> > &tl,
-                             map<int, wstring > &blanks,
-                             map<int, map<wstring, double> > &scores,
-                             map<int, map<wstring, OpType> > &operations) {
-
+LRXProcessor::processFlush(UFILE *output,
+                           map<int, UString > &sl,
+                           map<int, vector<UString> > &tl,
+                           map<int, UString > &blanks,
+                           map<int, map<UString, double> > &scores,
+                           map<int, map<UString, OpType> > &operations) {
+  
   struct ScoredMatch {
       OpType op;
-      wstring* ti;              // matched target translation
+      UString* ti;              // matched target translation
       double weight;
   };
 
   unsigned int spos = 0;
   for(spos = 0; spos <= pos; spos++)
   {
-    if(sl[spos] == L"")
+    if(sl[spos].empty())
     {
       continue;
     }
 
-    fwprintf(output, L"%S^%S/", blanks[spos].c_str(), sl[spos].c_str());
+    u_fprintf(output, "%S^%S/", blanks[spos].c_str(), sl[spos].c_str());
 
-    vector<wstring>::iterator ti;
+    vector<UString>::iterator ti;
     auto penum = tl[spos].end();
     penum--;
 
     if(tl[spos].size() > 1)
     {
       //--
-      set<wstring*> ti_keep;
-      set<wstring*> ti_removed;
+      set<UString*> ti_keep;
+      set<UString*> ti_removed;
       vector<ScoredMatch> spos_matches;
       for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
       {
@@ -1128,9 +589,13 @@ LRXProcessor::processFlush(FILE *output,
           bool matched = recognisePattern(*ti, si.first);
           OpType op = operations[spos][si.first];
           if (debugMode) {
-            wstring checks = matched ? L"✔️ " : L"❎";
-            fwprintf(stderr, L"%S >>> %d -> %S -> %.5f\n", checks.c_str(), spos,
-                     si.first.c_str(), si.second);
+            if (matched) {
+              cerr << "✔️ ";
+            } else {
+              cerr << "❎";
+            }
+            cerr << " >>> " << spos << " -> ";
+            cerr << si.first << " -> " << si.second << endl;
           }
           if(matched) {
             spos_matches.push_back({ op, &*ti, si.second });
@@ -1144,15 +609,10 @@ LRXProcessor::processFlush(FILE *output,
              [](const auto &a, const auto &b) { return a.weight > b.weight; });
         for (const auto &m : spos_matches) {
           if (traceMode || debugMode) {
-            wstring op = (m.op == Select ? L"SELECT" : L"REMOVE");
-            fwprintf(
-                stderr, L"%d:%S:%.5f:%S:%d:%S\n",
-                lineno,
-                op.c_str(),
-                m.weight,
-                sl[spos].c_str(),
-                ti_keep.size(),
-                m.ti->c_str());
+            std::string op = (m.op == Select ? "SELECT" : "REMOVE");
+            cerr << lineno << ":" << op << ":" << m.weight;
+            cerr << ":" << sl[spos] << ":" << ti_keep.size();
+            cerr << ":" << *m.ti << endl; // m.ti is a UString*: print the translation text, not its address
           }
           // We have to keep track of translations that have been removed so
           // that we don't end up adding back a translation that was removed.
@@ -1168,9 +628,9 @@ LRXProcessor::processFlush(FILE *output,
         bool printed = false;
         for(const auto& ti_max : ti_keep) {
           if(printed) {
-            fwprintf(output, L"/");
+            u_fprintf(output, "/");
           }
-          fwprintf(output, L"%S", ti_max->c_str());
+          u_fprintf(output, "%S", ti_max->c_str());
           printed = true;
         }
       }
@@ -1178,10 +638,10 @@ LRXProcessor::processFlush(FILE *output,
       {
         for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
         {
-          fwprintf(output, L"%S", ti->c_str());
+          u_fprintf(output, "%S", ti->c_str());
           if(ti != penum)
           {
-            fwprintf(output, L"/");
+            u_fprintf(output, "/");
           }
         }
       }
@@ -1190,18 +650,18 @@ LRXProcessor::processFlush(FILE *output,
     {
       for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
       {
-        fwprintf(output, L"%S", ti->c_str());
+        u_fprintf(output, "%S", ti->c_str());
         if(ti != penum)
         {
-          fwprintf(output, L"/");
+          u_fprintf(output, "/");
         }
       }
     }
 
-    fwprintf(output, L"$");
+    u_fprintf(output, "$");
     if(debugMode)
     {
-      fwprintf(output, L"%d", spos);
+      u_fprintf(output, "%d", spos);
     }
 
 
diff --git a/src/lrx_processor.h b/src/lrx_processor.h
index 26973aa..cad60f9 100644
--- a/src/lrx_processor.h
+++ b/src/lrx_processor.h
@@ -34,7 +34,6 @@
 
 #include <libxml/xmlreader.h>
 
-#include <lttoolbox/ltstr.h>
 #include <lttoolbox/lt_locale.h>
 #include <lttoolbox/transducer.h>
 #include <lttoolbox/xml_parse_util.h>
@@ -46,46 +45,23 @@
 #include <lttoolbox/match_exe.h>
 #include <lttoolbox/trans_exe.h>
 #include <lttoolbox/my_stdio.h>
+#include <lttoolbox/input_file.h>
 
 using namespace std;
-/*
-class BiltransToken {
-public:
-	bool isEOF = false;
-	wstring source;
-	wstring blanks;
-	vector<wstring> target;
-
-	wstring toString(bool delim) {
-		wstring out = source;
-		for(int i = 0; i < target.size(); i++) {
-			out += L'/' + target[i];
-		}
-		if (delim && (source.size() > 0 || target.size() > 0)) {
-			out = blanks + L'^' + out + L'$';
-		} else {
-			out = blanks + out;
-		}
-		return out;
-	}
-};
 
-*/
 class LRXProcessor
 {
 private:
 
   Alphabet alphabet;
   TransExe transducer;
-  map<wstring, TransExe> recognisers;
-  map<wstring, double> weights;
-
-//  map<int, BiltransToken> bts;
+  map<UString, TransExe> recognisers;
+  map<UString, double> weights;
 
   vector<State> alive_states;
 
   map<Node *, double> anfinals;
-  set<wchar_t> escaped_chars;
+  set<UChar32> escaped_chars;
   State *initial_state;
 
   bool traceMode;
@@ -96,39 +72,27 @@ private:
   unsigned int pos;
   unsigned long lineno;
 
-  wstring itow(int i);
-  bool recognisePattern(const wstring lu, const wstring op);
-  wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2);
-
-//  BiltransToken readBiltransToken(FILE *input = stdin);
+  UString itow(int i);
+  bool recognisePattern(const UString lu, const UString op);
+  UString readFullBlock(InputFile& input, UChar32 const delim1, UChar32 const delim2);
 
   void makeTransition(int);
   void filterFinals();
   void evaluateRules();
 
-/*
-  void processFlush(FILE *output,
-                    map<int, wstring > &sl,
-                    map<int, vector<wstring> > &tl,
-                    map<int, wstring > &blanks,
-                    map<int, pair<double, vector<State> > > &covers,
-                    pair<double, vector<State> > &empty_seq,
-                    map<pair<int, int>, vector<State> > &spans,
-                    int last_final);
-*/
   enum OpType { Select, Remove };
 
-  void processFlush(FILE *output,
-                      map<int, wstring > &sl,
-                      map<int, vector<wstring> > &tl,
-                      map<int, wstring > &blanks,
-                      map<int, map<wstring, double> > &scores,
-                      map<int, map<wstring, OpType> > &operations);
+  void processFlush(UFILE *output,
+                      map<int, UString > &sl,
+                      map<int, vector<UString> > &tl,
+                      map<int, UString > &blanks,
+                      map<int, map<UString, double> > &scores,
+                      map<int, map<UString, OpType> > &operations);
 
 public:
-  static wstring const LRX_PROCESSOR_TAG_SELECT;
-  static wstring const LRX_PROCESSOR_TAG_REMOVE;
-  static wstring const LRX_PROCESSOR_TAG_SKIP;
+  static UString const LRX_PROCESSOR_TAG_SELECT;
+  static UString const LRX_PROCESSOR_TAG_REMOVE;
+  static UString const LRX_PROCESSOR_TAG_SKIP;
 
   LRXProcessor();
   ~LRXProcessor();
@@ -139,9 +103,7 @@ public:
 
   void init();
   void load(FILE *input);
-  void process(FILE *input, FILE *output);
-//  void processME(FILE *input, FILE *output);
-
+  void process(InputFile& input, UFILE *output);
 };
 
 #endif /* __LRX_PROCESSOR_H__ */
diff --git a/src/multi_translator.cc b/src/multi_translator.cc
index 7e2ad1e..4f2e7c3 100644
--- a/src/multi_translator.cc
+++ b/src/multi_translator.cc
@@ -30,10 +30,10 @@ int MultiTranslator::calculateFertility(vector<BiltransToken> sent) {
 }
 
 
-BiltransToken MultiTranslator::parseBiltransToken(wstring bt) {
+BiltransToken MultiTranslator::parseBiltransToken(UString bt) {
 
 	BiltransToken token;
-	vector<wstring> tokens = wsplit(bt, L'/');
+	vector<UString> tokens = wsplit(bt, '/');
 
 	token.sourceToken = parseTaggerToken(tokens[0]);
 
@@ -49,9 +49,9 @@ bool MultiTranslator::isPosAmbig(BiltransToken bt) {
 	bool isPos;
 	if (bt.sourceToken.tags.size() > 0) {
 		isPos =
-		bt.sourceToken.tags[0] == L"n" ||
-		bt.sourceToken.tags[0] == L"vblex" ||
-		bt.sourceToken.tags[0] == L"adj";
+		bt.sourceToken.tags[0] == "n"_u ||
+		bt.sourceToken.tags[0] == "vblex"_u ||
+		bt.sourceToken.tags[0] == "adj"_u;
 	} else {
 		isPos = false;
 	}
@@ -60,10 +60,10 @@ bool MultiTranslator::isPosAmbig(BiltransToken bt) {
 
 }
 
-BiltransToken MultiTranslator::getFullToken(wstring source) {
+BiltransToken MultiTranslator::getFullToken(UString source) {
 
 	BiltransToken token;
-	if (source[0] == L'*') {
+	if (source[0] == '*') {
 		token.sourceToken.lemma = source;
 		TaggerToken tmp;
 		tmp.lemma = source;
@@ -71,21 +71,22 @@ BiltransToken MultiTranslator::getFullToken(wstring source) {
 		return token;
 	}
 
-	wstring target = bilingual.biltrans(source, false);
-	if (target == L"") {
-		target = L"@" + source;
+	UString target = bilingual.biltrans(source, false);
+	if (target.empty()) {
+      target += '@';
+      target.append(source);
 	}
-	token = parseBiltransToken(source + L"/" + target);
+	token = parseBiltransToken(source + "/"_u + target);
 	return token;
 
 }
 
-BiltransToken MultiTranslator::getTrimmedToken(wstring source)
+BiltransToken MultiTranslator::getTrimmedToken(UString source)
 {
 	BiltransToken ttoken;
 	BiltransToken ftoken;
 
-	if (source[0] == L'*') {
+	if (source[0] == '*') {
 		ttoken.sourceToken.lemma = source;
 		TaggerToken tmp;
 		tmp.lemma = source;
@@ -99,8 +100,8 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source)
         // the bilingual.* methods in FSTProcessor. Unknown why we get the
         // leaks in the first place...
 
-        wstring fstr = L"";
-        wstring tstr = L"";
+        UString fstr;
+        UString tstr;
 
 	if((f_cache.find(source) == f_cache.end()))
         {
@@ -116,37 +117,39 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source)
 
         /*---------------------------------------------*/
 
-	if (fstr == L"") {
-		fstr = L"@" + source;
-	}
-	if (tstr == L"") {
-		tstr = L"@" + source;
-	}
+        if (fstr.empty()) {
+          fstr += '@';
+          fstr.append(source);
+        }
+        if (tstr.empty()) {
+          tstr += '@';
+          tstr.append(source);
+        }
 
-	ttoken = parseBiltransToken(source + L"/" + tstr);
-	ftoken = parseBiltransToken(source + L"/" + fstr);
+	ttoken = parseBiltransToken(source + "/"_u + tstr);
+	ftoken = parseBiltransToken(source + "/"_u + fstr);
 
 
 	if(this->trimmed) {
 		for(size_t i = 0; i < ftoken.targetTokens.size(); ++i ) {
 			if(ttoken.targetTokens[i].tags.size() <
 			   ftoken.targetTokens[i].tags.size()) {
-				ttoken.targetTokens[i].tags.push_back(L"*");
+				ttoken.targetTokens[i].tags.push_back("*"_u);
 			}
 		}
 	}
 
-	vector<wstring> newTags;
+	vector<UString> newTags;
 	//bool sourceTrimmed = false;
 	for(size_t i = 0; i < ttoken.sourceToken.tags.size(); ++i) {
-		wstring tag = ttoken.sourceToken.tags[i];
+		UString tag = ttoken.sourceToken.tags[i];
 		if (find(ttoken.targetTokens[0].tags, tag) ==
 			find(ftoken.targetTokens[0].tags, tag)) {
 			newTags.push_back(tag);
 		}
 	}
 	if(ttoken.sourceToken.tags.size() > newTags.size()) {
-		newTags.push_back(L"*");
+		newTags.push_back("*"_u);
 	}
 	ttoken.sourceToken.tags = newTags;
 
@@ -154,50 +157,50 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source)
 }
 
 void MultiTranslator::biltransToMultiTranslator(int sn, int &tn, unsigned int idx,
-	vector<BiltransToken> s, wstring buffer)
+	vector<BiltransToken> s, UString buffer)
 {
 
 	if (idx == s.size() ) {
-		wcout << L".[][" <<  sn << L" " << tn << L"].[]\t" << buffer << endl;
+		cout << ".[][" <<  sn << " " << tn << "].[]\t" << buffer << endl;
 		tn += 1;
 		return;
 	}
 	auto n = s[idx].targetTokens.size();
-	wstring base;
-	base = s[idx].sourceToken.toString(false) + L"/";
+	UString base;
+	base = s[idx].sourceToken.toString(false) + "/"_u;
 	for(size_t i = 0; i < n; ++i) {
-		wstring token = L"^" + base + s[idx].targetTokens[i].toString(false) + L"$";
+		UString token = "^"_u + base + s[idx].targetTokens[i].toString(false) + "$"_u;
 		if(idx != s.size() - 1) {
-			token += L" ";
+			token += ' ';
 		}
 		biltransToMultiTranslator(sn, tn, idx+1, s, buffer + token);
 	}
 }
 void MultiTranslator::printBiltransSentence(int n, vector<BiltransToken> s) {
 	if (number_lines) {
-		wcout << n << "\t";
+		cout << n << "\t";
 	}
 	for(size_t i = 0; i < s.size(); ++i) {
-		wcout << s[i].toString(true);
+		cout << s[i].toString(true);
 		if (i != s.size() - 1) {
-			wcout << L" ";
+			cout << " ";
 		}
 	}
-	wcout << endl;
+	cout << endl;
 }
 
 void MultiTranslator::printTaggerOutput(int n, vector<BiltransToken> sentence) {
 	if (number_lines) {
-		wcout << n << "\t";
+		cout << n << "\t";
 	}
 
 	for(size_t i = 0; i < sentence.size(); ++i) {
-		wcout << sentence[i].sourceToken.toString(true);
+		cout << sentence[i].sourceToken.toString(true);
 		if (i != sentence.size() -1) {
-			wcout << L" ";
+			cout << " ";
 		}
 	}
-	wcout << endl;
+	cout << endl;
 }
 
 void MultiTranslator::processSentence(vector<TaggerToken> sentence) {
@@ -207,8 +210,8 @@ void MultiTranslator::processSentence(vector<TaggerToken> sentence) {
 	int numberOfUnknown = 0;
 	int fertility = 1;
 	for(size_t i = 0; i < sentence.size(); ++i) {
-		wstring token = sentence[i].toString(false);
-		wstring target;
+		UString token = sentence[i].toString(false);
+		UString target;
 
 		BiltransToken bt;
 		if(this->trimmed){
@@ -220,7 +223,7 @@ void MultiTranslator::processSentence(vector<TaggerToken> sentence) {
 		if (isPosAmbig(bt)) {
 			hasAmbigPos = true;
 		}
-		if(token[0] == L'*') {
+		if(token[0] == '*') {
 			numberOfUnknown ++;
 		}
 		fertility *= bt.targetTokens.size();
@@ -240,7 +243,7 @@ void MultiTranslator::processSentence(vector<TaggerToken> sentence) {
 		} else if(mode == "-b") {
 			printBiltransSentence(this->sn, outputSentence);
 		} else if (mode == "-m") {
-			wstring outBuffer = L"";
+			UString outBuffer;
 			int tn = 0;
 			biltransToMultiTranslator(this->sn, tn, 0, outputSentence, outBuffer);
 		}
diff --git a/src/multi_translator.h b/src/multi_translator.h
index d4d69cd..b2574e5 100644
--- a/src/multi_translator.h
+++ b/src/multi_translator.h
@@ -7,33 +7,38 @@
 
 class BiltransToken {
 public:
-	TaggerToken sourceToken;
-	vector<TaggerToken> targetTokens;
-	wstring blanks;
-
-	bool isEOF;
-
-	BiltransToken() {
-		isEOF = false;
-	}
-
-	wstring toString(bool delimiter) {
-		wstring out = sourceToken.toString(false);
-		for(unsigned int i = 0; i < targetTokens.size(); i++) {
-			out += L'/' + targetTokens[i].toString(false);
-		}
-		if (delimiter) {
-			out = L"^" + out + L"$";
-		}
-		return out;
-	}
+  TaggerToken sourceToken;
+  vector<TaggerToken> targetTokens;
+  UString blanks;
+  
+  bool isEOF;
+  
+  BiltransToken() {
+    isEOF = false;
+  }
+
+  UString toString(bool delimiter) {
+    UString out;
+    if (delimiter) {
+      out += '^';
+    }
+    out.append(sourceToken.toString(false));
+    for (auto& tok : targetTokens) {
+      out += '/';
+      out.append(tok.toString(false));
+    }
+    if (delimiter) {
+      out += '$';
+    }
+    return out;
+  }
 };
 
 class MultiTranslator : public TaggerOutputProcessor {
 private:
 	FSTProcessor bilingual;
-	map<wstring, wstring> f_cache;
-	map<wstring, wstring> t_cache;
+	map<UString, UString> f_cache;
+	map<UString, UString> t_cache;
 	string path;
 
 	bool trimmed;
@@ -44,10 +49,10 @@ private:
 
 	bool isPosAmbig(BiltransToken token);
 
-	BiltransToken getTrimmedToken(wstring str);
-	BiltransToken getFullToken(wstring str);
+	BiltransToken getTrimmedToken(UString str);
+	BiltransToken getFullToken(UString str);
 
-	BiltransToken parseBiltransToken(wstring bt);
+	BiltransToken parseBiltransToken(UString bt);
 
 	void processSentence(vector<TaggerToken> s);
 
@@ -56,7 +61,7 @@ private:
 	void printTaggerOutput(int i, vector<BiltransToken> s);
 
 	void biltransToMultiTranslator(int sn, int &tn, unsigned int idx,
-			vector<BiltransToken> s, wstring buffer);
+			vector<BiltransToken> s, UString buffer);
 
 
 
diff --git a/src/tagger_output_processor.cc b/src/tagger_output_processor.cc
index 63b07f8..52d5c7c 100644
--- a/src/tagger_output_processor.cc
+++ b/src/tagger_output_processor.cc
@@ -9,7 +9,7 @@ TaggerOutputProcessor::~TaggerOutputProcessor() {
 
 }
 
-int TaggerOutputProcessor::find(vector<wstring> xs, wstring x) {
+int TaggerOutputProcessor::find(vector<UString> xs, UString x) {
 	for (size_t i = 0; i < xs.size(); ++i) {
 		if (xs[i] == x)
 			return i;
@@ -17,10 +17,10 @@ int TaggerOutputProcessor::find(vector<wstring> xs, wstring x) {
 	return -1;
 }
 
-TaggerToken TaggerOutputProcessor::parseTaggerToken(wstring str) {
+TaggerToken TaggerOutputProcessor::parseTaggerToken(UString str) {
 	TaggerToken token;
 	int state = 0; // lemma;
-	wstring buffer;
+	UString buffer;
 	for (auto& c : str) {
 		if(c == L'<' && state == 0) {
 			state = 1;
@@ -41,10 +41,10 @@ TaggerToken TaggerOutputProcessor::parseTaggerToken(wstring str) {
 	return token;
 }
 
-vector<wstring> TaggerOutputProcessor::parseTags(wstring token) {
+vector<UString> TaggerOutputProcessor::parseTags(UString token) {
 	int state = 0; // outside
-	vector<wstring> tags;
-	wstring buffer;
+	vector<UString> tags;
+	UString buffer;
 	for (auto& c : token) {
 		if (state == 0) {
 			if (c == '<') {
@@ -53,7 +53,7 @@ vector<wstring> TaggerOutputProcessor::parseTags(wstring token) {
 		} else if (state == 1) {
 			if (c == '>') {
 				tags.push_back(buffer);
-				buffer = L"";
+				buffer.clear();
 				state = 0;
 			} else {
 				buffer += c;
@@ -63,26 +63,26 @@ vector<wstring> TaggerOutputProcessor::parseTags(wstring token) {
 	return tags;
 }
 
-vector<wstring> TaggerOutputProcessor::wsplit(wstring wstr, wchar_t delim) {
-	vector<wstring> tokens;
-	wstring buffer;
+vector<UString> TaggerOutputProcessor::wsplit(UString wstr, wchar_t delim) {
+	vector<UString> tokens;
+	UString buffer;
 
 	for(size_t i = 0; i < wstr.size(); ++i) {
 		if(wstr[i] == delim && (i == 0 || wstr[i-1] != L'\\')) {
 			tokens.push_back(buffer);
-			buffer = L"";
+			buffer.clear();
 		} else {
 			buffer += wstr[i];
 		}
 	}
-	if(buffer != L"") {
+	if(!buffer.empty()) {
 		tokens.push_back(buffer);
 	}
 	return tokens;
 }
 
-wstring TaggerOutputProcessor::getLemma(wstring token) {
-	wstring buffer;
+UString TaggerOutputProcessor::getLemma(UString token) {
+	UString buffer;
 	for (auto& c : token) {
 		if(c != '<') {
 			buffer += c;
@@ -94,7 +94,7 @@ wstring TaggerOutputProcessor::getLemma(wstring token) {
 }
 
 void TaggerOutputProcessor::processTaggerOutput(bool nullFlush) {
-	wstring buffer;
+	UString buffer;
 	vector<TaggerToken> sentence;
 	bool escaped = false;
 	int state = 0; // outside
@@ -126,7 +126,7 @@ void TaggerOutputProcessor::processTaggerOutput(bool nullFlush) {
 		} else if (state == 1) {
 			if(c == L'$' && !escaped) {
 				sentence.push_back(parseTaggerToken(buffer));
-				buffer = L"";
+				buffer.clear();
 				state = 0;
 			} else if (c == '\\' && !escaped) {
 				escaped = true;
diff --git a/src/tagger_output_processor.h b/src/tagger_output_processor.h
index 40c00ad..cdcba9d 100644
--- a/src/tagger_output_processor.h
+++ b/src/tagger_output_processor.h
@@ -18,30 +18,36 @@ using namespace std;
 
 class TaggerToken {
 public:
-	wstring lemma;
-	vector<wstring> tags;
-	wstring toString(bool delimiters) {
-		wstring out = lemma;
-		for (auto& tag : tags) {
-			out += L"<" + tag + L">";
-		}
-		if (delimiters) {
-			out = L"^" + out + L"$";
-		}
-		return out;
-	}
+  UString lemma;
+  vector<UString> tags;
+  UString toString(bool delimiters) {
+    UString out;
+    if (delimiters) {
+      out += '^';
+    }
+    out.append(lemma);
+    for (auto& tag : tags) {
+      out += '<';
+      out.append(tag);
+      out += '>';
+    }
+    if (delimiters) {
+      out += '$';
+    }
+    return out;
+  }
 };
 
 class TaggerOutputProcessor {
 protected:
 	int sn;
 
-	vector<wstring> parseTags(wstring token);
-	vector<wstring> wsplit(wstring wstr, wchar_t delim);
-	TaggerToken parseTaggerToken(wstring buffer);
+	vector<UString> parseTags(UString token);
+	vector<UString> wsplit(UString wstr, wchar_t delim);
+	TaggerToken parseTaggerToken(UString buffer);
 
-	int find(vector<wstring> xs, wstring x);
-	wstring getLemma(wstring token);
+	int find(vector<UString> xs, UString x);
+	UString getLemma(UString token);
 
 	virtual void processSentence(vector<TaggerToken>) =0;
 public: