commit 6ed619ecb9637b83e98f433f6cb55ef75ef4919c
Author: Daniel Swanson <popcorn.tomato.dude@gmail.com>
Date:   Wed Jun 30 08:53:51 2021 -0500

    use ICU (#71)
    
    ICU changes
    - convert all `std::wstring`s and related types to `UString`
    - use `lttoolbox/input_file.h` for reading UTF-8 with nulls
    - use `UFILE*` for writing output
    
    efficiency, readability, and code style changes
    - copy `.editorconfig` file from lttoolbox
    - move locale setting from constructor to CLI interface
    - move constant initializers to class headers
    - store values of special transducer symbols rather than repeatedly looking them up
    - prefer `str.empty()` to `str == ""`
    - remove unused `#include`s
    - delete long section of commented out code in `lrx_processor.cc`
    
    helper function and dependency changes
    - all needed helper functions have moved to lttoolbox, so drop apertium dependency
    - rely on `StringUtils` for converting strings to numbers
    - add `debug` and `error` `printf`-like functions in `lrx_compiler`
    - use `XMLParseUtil` specialized functions

diff --git a/.editorconfig b/.editorconfig
new file mode 100755
index 0000000..dd10a25
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,18 @@
+# https://editorconfig.org/
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+indent_size = 4
+indent_style = tab
+insert_final_newline = true
+trim_trailing_whitespace = true
+
+[**.cc]
+indent_size = 2
+indent_style = space
+
+[**.h]
+indent_size = 2
+indent_style = space
diff --git a/.gitignore b/.gitignore
index 5010f84..0104dde 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,6 +61,8 @@ src/lrx-proc
 multitrans
 stamp-h1
 
+/python/apertium_lex_tools.py
+/python/apertium_lex_tools_wrap.cpp
 /python/lex_tools_wrap.cpp
 /python/lextools.py
 /python/setup.py
diff --git a/configure.ac b/configure.ac
index 735e785..8004204 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,10 +1,9 @@
 AC_PREREQ(2.61)
 
 m4_define([required_libxml_version], [2.6.17])
-m4_define([required_apertium_version], [3.7.1])
-m4_define([required_lttoolbox_version], [3.5.3])
+m4_define([required_lttoolbox_version], [3.6.0])
 
-AC_INIT([apertium-lex-tools], [0.2.7], [apertium-stuff@lists.sourceforge.net])
+AC_INIT([apertium-lex-tools], [0.3.0], [apertium-stuff@lists.sourceforge.net])
 AM_INIT_AUTOMAKE
 AC_CONFIG_MACRO_DIR([m4])
 
@@ -48,25 +47,27 @@ PKG_CHECK_MODULES([LTTOOLBOX], [lttoolbox >= required_lttoolbox_version])
 AC_SUBST(LTTOOLBOX_CFLAGS)
 AC_SUBST(LTTOOLBOX_LIBS)
 
-PKG_CHECK_MODULES([APERTIUM], [apertium >= required_apertium_version])
-
-AC_SUBST(APERTIUM_CFLAGS)
-AC_SUBST(APERTIUM_LIBS)
-
 PKG_CHECK_MODULES([LIBXML], [libxml-2.0 >= required_libxml_version])
 
 AC_SUBST(LIBXML_CFLAGS)
 AC_SUBST(LIBXML_LIBS)
 
+PKG_CHECK_MODULES([ICU], [icu-i18n, icu-io, icu-uc])
+
+AC_SUBST(ICU_CFLAGS)
+AC_SUBST(ICU_LIBS)
+
 # Checks for libraries.
 AC_CHECK_LIB(xml2, xmlReaderForFile)
 
+AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])])
+
 AC_CHECK_FUNCS([setlocale strdup])
 
-AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked])
+AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked])
 
-CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS"
-LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS -lz"
+CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS"
+LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS $ICU_LIBS -lz"
 
 # Checks for highest supported C++ standard
 AC_LANG(C++)
diff --git a/python/apertium_lex_tools.i b/python/apertium_lex_tools.i
index 051346f..9304bda 100644
--- a/python/apertium_lex_tools.i
+++ b/python/apertium_lex_tools.i
@@ -51,8 +51,9 @@ public:
 
   void lrx_proc(int argc, char **argv, char *input_path, char *output_path)
   {
-    FILE* input = fopen(input_path, "rb");
-    FILE* output = fopen(output_path, "wb");
+    InputFile input;
+    input.open(input_path);
+    UFILE* output = u_fopen(output_path, "w", NULL, NULL); // closed with u_fclose() after process()
     optind = 1;
     while(true)
     {
@@ -83,8 +84,7 @@ public:
       }
     }
     process(input, output);
-    fclose(input);
-    fclose(output);
+    u_fclose(output);
   }
 };
 
diff --git a/python/setup.py.in b/python/setup.py.in
index 9da20b3..85973a7 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -34,7 +34,7 @@ def get_include_dirs():
 apertium_lex_tools_module = Extension(
     name='_apertium_lex_tools',
     sources=get_sources(),
-    swig_opts=['-c++', '-I../src', '-Wall']+'@LTTOOLBOX_CFLAGS@'.split()+'@LIBXML_CFLAGS@'.split(),
+    swig_opts=['-c++', '-I../src', '-Wall']+'@LTTOOLBOX_CFLAGS@'.split()+'@LIBXML_CFLAGS@'.split()+'@ICU_CFLAGS@'.split(),
     include_dirs=get_include_dirs(),
     library_dirs=['/usr/include/libxml2', '/usr/local/lib'],
     extra_compile_args='@CXXFLAGS@'.split(),
diff --git a/src/biltrans-without-queue.cpp b/src/biltrans-without-queue.cpp
index 9dc5d55..d394a8c 100644
--- a/src/biltrans-without-queue.cpp
+++ b/src/biltrans-without-queue.cpp
@@ -3,8 +3,8 @@
 int main(int argc, char** argv) {
 
 	if (argc != 2 && argc != 3) {
-		wcout << "Usage: " << argv[0];
-		wcout << "<path to a binary bilingual transducer> [--trimmed | -t]" << endl;
+		cout << "Usage: " << argv[0];
+		cout << "<path to a binary bilingual transducer> [--trimmed | -t]" << endl;
 		exit(1);
 	}
 	string path(argv[1]);
diff --git a/src/irstlm_ranker.cpp b/src/irstlm_ranker.cpp
index b50c31b..9a047a2 100644
--- a/src/irstlm_ranker.cpp
+++ b/src/irstlm_ranker.cpp
@@ -19,7 +19,6 @@ IrstlmRanker::IrstlmRanker(const string &filePath,
 		exit(-1);
 	}
     cout.precision(10);
-    wcout.precision(10);
 
 	lineno = 0;
 	sublineno = 0;
@@ -387,7 +386,7 @@ int main(int argc, char ** argv) {
 	// I don't know :)
 
     if(setlocale(LC_CTYPE, "") == NULL) {
-        wcerr << L"Warning: unsupported locale, fallback to \"C\"" << endl;
+        cerr << "Warning: unsupported locale, fallback to \"C\"" << endl;
         setlocale(LC_ALL, "C");
     }
 
@@ -410,4 +409,3 @@ int main(int argc, char ** argv) {
 
     return 0;
 }
-
diff --git a/src/ldx_proc.cc b/src/ldx_proc.cc
index 9525bb5..b3fbd01 100644
--- a/src/ldx_proc.cc
+++ b/src/ldx_proc.cc
@@ -25,114 +25,53 @@
 
 #include <lttoolbox/exception.h>
 #include <lttoolbox/fst_processor.h>
-#include <lttoolbox/ltstr.h>
 #include <lttoolbox/lt_locale.h>
+#include <lttoolbox/input_file.h>
+#include <unicode/uchar.h>
+#include <unicode/ustdio.h>
 
 using namespace std;
 
 
-int readGeneration(FILE *input, FILE *output);
-void skipUntil(FILE *input, FILE *output, wint_t const character);
-wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2);
-wchar_t readEscaped(FILE *input);
-void streamError();
+int32_t readGeneration(InputFile& input, UFILE *output);
+void skipUntil(InputFile& input, UFILE *output, UChar32 const character);
 
 
 FSTProcessor fstp;
 bool outOfWord = true;
-set<wchar_t> escaped_chars;
+set<int32_t> escaped_chars;
 
 
 void
-streamError()
-{
-  throw Exception("Error: Malformed input stream.");
-}
-
-wchar_t
-readEscaped(FILE *input)
-{
-  if(feof(input))
-  {
-    streamError();
-  }
-
-  wchar_t val = static_cast<wchar_t>(fgetwc_unlocked(input));
-
-  if(feof(input) || escaped_chars.find(val) == escaped_chars.end())
-  {
-    streamError();
-  }
-
-  return val;
-}
-
-
-wstring
-readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2)
-{
-  wstring result = L"";
-  result += delim1;
-  wchar_t c = delim1;
-
-  while(!feof(input) && c != delim2)
-  {
-    c = static_cast<wchar_t>(fgetwc_unlocked(input));
-    result += c;
-    if(c != L'\\')
-    {
-      continue;
-    }
-    else
-    {
-      result += static_cast<wchar_t>(readEscaped(input));
-    }
-  }
-
-  if(c != delim2)
-  {
-    streamError();
-  }
-
-  return result;
-}
-
-
-void
-skipUntil(FILE *input, FILE *output, wint_t const character)
+skipUntil(InputFile& input, UFILE* output, UChar32 const character)
 {
   while(true)
   {
-    wint_t val = fgetwc_unlocked(input);
-    if(feof(input))
-    {
+    UChar32 val = input.get();
+    if (input.eof()) {
       return;
     }
 
     switch(val)
     {
-      case L'\\':
-        val = fgetwc_unlocked(input);
-        if(feof(input))
-        {
+      case '\\':
+        val = input.get();
+        if (input.eof()) {
           return;
         }
-        fputwc_unlocked(L'\\', output);
-        fputwc_unlocked(val, output);
+        u_fputc('\\', output); // re-emit the backslash, then the character it escapes
+        u_fputc(val, output);
         break;
 
-      case L'\0':
-        fputwc_unlocked(val, output);
+      case '\0':
+        u_fputc(val, output);
         break;
 
       default:
-        if(val == character)
-        {
+        if (val == character) {
           return;
-        }
-        else
-        {
-          fputwc_unlocked(val, output);
+        } else {
+          u_fputc(val, output);
         }
         break;
     }
@@ -140,48 +79,47 @@ skipUntil(FILE *input, FILE *output, wint_t const character)
 }
 
 
-int
-readGeneration(FILE *input, FILE *output)
+int32_t
+readGeneration(InputFile& input, UFILE* output)
 {
-  wint_t val = fgetwc_unlocked(input);
+  UChar32 val = input.get();
 
-  if(feof(input))
-  {
+  if (input.eof()) {
     return 0x7fffffff;
   }
 
   if(outOfWord)
   {
-    if(val == L'^')
+    if(val == '^')
     {
-      val = fgetwc_unlocked(input);
-      if(feof(input))
+      val = input.get();
+      if(input.eof())
       {
         return 0x7fffffff;
       }
     }
-    else if(val == L'\\')
+    else if(val == '\\')
     {
-      fputwc_unlocked(val, output);
-      val = fgetwc_unlocked(input);
-      if(feof(input))
+      u_fputc(val, output); // val is the backslash here; echo it before reading the escaped character
+      val = input.get();
+      if(input.eof())
       {
         return 0x7fffffff;
       }
-      fputwc_unlocked(val,output);
-      skipUntil(input, output, L'^');
-      val = fgetwc_unlocked(input);
-      if(feof(input))
+      u_fputc(val,output);
+      skipUntil(input, output, '^');
+      val = input.get();
+      if(input.eof())
       {
         return 0x7fffffff;
       }
     }
     else
     {
-      fputwc_unlocked(val, output);
-      skipUntil(input, output, L'^');
-      val = fgetwc_unlocked(input);
-      if(feof(input))
+      u_fputc(val, output);
+      skipUntil(input, output, '^');
+      val = input.get();
+      if(input.eof())
       {
         return 0x7fffffff;
       }
@@ -189,24 +127,24 @@ readGeneration(FILE *input, FILE *output)
     outOfWord = false;
   }
 
-  if(val == L'\\')
+  if(val == '\\')
   {
-    val = fgetwc_unlocked(input);
-    return static_cast<int>(val);
+    val = input.get();
+    return static_cast<int32_t>(val);
   }
-  else if(val == L'$')
+  else if(val == '$')
   {
     outOfWord = true;
-    return static_cast<int>(L'$');
+    return static_cast<int32_t>('$');
   }
-  else if(val == L'[')
+  else if(val == '[')
   {
-    fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output);
+    write(input.readBlock('[', ']'), output);
     return readGeneration(input, output);
   }
   else
   {
-    return static_cast<int>(val);
+    return static_cast<int32_t>(val);
   }
 
   return 0x7fffffff;
@@ -215,7 +153,8 @@ readGeneration(FILE *input, FILE *output)
 
 int main(int argc, char **argv)
 {
-  FILE *input = stdin, *output = stdout;
+  InputFile input;
+  UFILE* output = u_finit(stdout, NULL, NULL);
 
   LtLocale::tryToSetLocale();
 
@@ -226,17 +165,17 @@ int main(int argc, char **argv)
     exit(-1);
   }
 
-  escaped_chars.insert(L'[');
-  escaped_chars.insert(L']');
-  escaped_chars.insert(L'{');
-  escaped_chars.insert(L'}');
-  escaped_chars.insert(L'^');
-  escaped_chars.insert(L'$');
-  escaped_chars.insert(L'/');
-  escaped_chars.insert(L'\\');
-  escaped_chars.insert(L'@');
-  escaped_chars.insert(L'<');
-  escaped_chars.insert(L'>');
+  escaped_chars.insert('[');
+  escaped_chars.insert(']');
+  escaped_chars.insert('{');
+  escaped_chars.insert('}');
+  escaped_chars.insert('^');
+  escaped_chars.insert('$');
+  escaped_chars.insert('/');
+  escaped_chars.insert('\\');
+  escaped_chars.insert('@');
+  escaped_chars.insert('<');
+  escaped_chars.insert('>');
 
 
   FILE *t_rl = fopen(argv[1], "rb");
@@ -253,25 +192,25 @@ int main(int argc, char **argv)
   // read until '/', then read each from '/' adding to a map, then look up first in transducer, and if the result
   // is found in the map, then output it, otherwise error.
 
-  int val = 0, i = 0;
+  int32_t val = 0, i = 0;
   bool seenFirst = false;
-  wstring sl = L"";
-  wstring tl = L"";
-  set<wstring> tllu;
-  set<wstring> tllu_defaults;
+  UString sl;
+  UString tl;
+  set<UString> tllu;
+  set<UString> tllu_defaults;
 
-  skipUntil(input, output, L'^');
+  skipUntil(input, output, '^');
   outOfWord = false;
 
   while((val = readGeneration(input, output)) != 0x7fffffff)
   {
     switch(val)
     {
-      case L'^':
+      case '^':
         outOfWord = false;
-	val = readGeneration(input, output);
+        val = readGeneration(input, output);
         break;
-      case L'/':
+      case '/':
         if(!seenFirst)
         {
           seenFirst = true;
@@ -281,13 +220,13 @@ int main(int argc, char **argv)
           tllu.insert(tl);
         }
         i++;
-        tl = L"";
-	val = readGeneration(input, output);
-        if(val != L'$')
+        tl.clear();
+        val = readGeneration(input, output);
+        if(val != '$')
         {
           break;
         }
-      case L'$':
+      case '$':
         outOfWord = true;
         if(!seenFirst)
         {
@@ -299,23 +238,28 @@ int main(int argc, char **argv)
         }
 
         seenFirst = false;
-        fputws_unlocked(L"^", output);
-        fputws_unlocked(sl.c_str(), output);
+        u_fputc('^', output);
+        write(sl, output);
         if(tllu.size() > 1)
         {
-          tl = L"";
-          wstring in = L"^" + sl + L"$";
-          wstring trad = fstp.biltrans(in);
+          tl.clear();
+          UString in;
+          in += '^';
+          in.append(sl);
+          in += '$';
+          UString trad = fstp.biltrans(in);
           int j = 0;
           bool tlout = false;
           for(auto& it : tllu)
           {
-            wstring t = L"^" + it + L"$";
+            UString t;
+            t += '^';
+            t.append(it);
+            t += '$';
             if(t == trad)
             {
-              fputws_unlocked(L"/", output);
-              wstring to = t.substr(1, wcslen(t.c_str())-2);
-              fputws_unlocked(to.c_str(), output);
+              u_fputc('/', output);
+              write(it, output);
               tlout = true;
               break;
             }
@@ -328,36 +272,35 @@ int main(int argc, char **argv)
             {
               if(it != tllu.end())
               {
-                fputws_unlocked(L"/", output);
+                u_fputc('/', output);
               }
-              fputws_unlocked(it->c_str(), output);
+              write(*it, output);
             }
           }
 
         }
         else
         {
-          fputws_unlocked(L"/", output);
-          fputws_unlocked(tl.c_str(), output);
+          u_fputc('/', output);
+          write(tl, output);
         }
-        fputws_unlocked(L"$", output);
+        u_fputc('$', output);
 
-        sl = L""; tl = L"";
+        sl.clear();
+        tl.clear();
         tllu.clear();
         i = 0;
         break;
     }
     if(!seenFirst && !outOfWord)
     {
-      sl.append(1, static_cast<wchar_t>(val));
+      sl += static_cast<UChar32>(val); // text before the first '/'. NOTE(review): presumably UString stores 16-bit UChar, so a code point above U+FFFF would be narrowed here — confirm
     }
     else if(!outOfWord)
     {
-      tl.append(1, static_cast<wchar_t>(val));
+      tl += static_cast<UChar32>(val); // text after a '/'. NOTE(review): same possible narrowing of code points above U+FFFF — confirm
     }
   }
 
   return 0;
 }
-
-
diff --git a/src/lrx_comp.cc b/src/lrx_comp.cc
index c1d5b46..c9fe3bb 100644
--- a/src/lrx_comp.cc
+++ b/src/lrx_comp.cc
@@ -16,6 +16,9 @@
  */
 
 #include <lrx_compiler.h>
+#include <cstring>
+#include <iostream>
+#include <lttoolbox/lt_locale.h>
 
 using namespace std;
 
@@ -31,6 +34,8 @@ void endProgram(char *name)
 
 int main (int argc, char **argv)
 {
+  LtLocale::tryToSetLocale();
+
   LRXCompiler compiler;
 
   if(argc != 3 && argc != 4)
diff --git a/src/lrx_compiler.cc b/src/lrx_compiler.cc
index 3fb4e6a..b2cf64c 100644
--- a/src/lrx_compiler.cc
+++ b/src/lrx_compiler.cc
@@ -15,100 +15,92 @@
  * along with this program; if not, see <https://www.gnu.org/licenses/>.
  */
 
-#include <weight.h>
 #include <lrx_compiler.h>
-#include <cstdint>
+#include <weight.h>
+#include <lttoolbox/string_utils.h>
+#include <lttoolbox/xml_parse_util.h>
+#include <lttoolbox/compression.h>
+#include <iostream>
 
 using namespace std;
 
-wstring const LRXCompiler::LRX_COMPILER_LRX_ELEM        = L"lrx";
-wstring const LRXCompiler::LRX_COMPILER_DEFSEQS_ELEM    = L"def-seqs";
-wstring const LRXCompiler::LRX_COMPILER_DEFSEQ_ELEM     = L"def-seq";
-wstring const LRXCompiler::LRX_COMPILER_RULES_ELEM      = L"rules";
-wstring const LRXCompiler::LRX_COMPILER_RULE_ELEM       = L"rule";
-wstring const LRXCompiler::LRX_COMPILER_MATCH_ELEM      = L"match";
-wstring const LRXCompiler::LRX_COMPILER_SELECT_ELEM     = L"select";
-wstring const LRXCompiler::LRX_COMPILER_REMOVE_ELEM     = L"remove";
-wstring const LRXCompiler::LRX_COMPILER_OR_ELEM         = L"or";
-wstring const LRXCompiler::LRX_COMPILER_REPEAT_ELEM     = L"repeat";
-wstring const LRXCompiler::LRX_COMPILER_SEQ_ELEM        = L"seq";
-
-wstring const LRXCompiler::LRX_COMPILER_LEMMA_ATTR      = L"lemma";
-wstring const LRXCompiler::LRX_COMPILER_SUFFIX_ATTR     = L"suffix";
-wstring const LRXCompiler::LRX_COMPILER_CONTAINS_ATTR   = L"contains";
-wstring const LRXCompiler::LRX_COMPILER_CASE_ATTR       = L"case";
-wstring const LRXCompiler::LRX_COMPILER_SURFACE_ATTR    = L"surface";
-wstring const LRXCompiler::LRX_COMPILER_TAGS_ATTR       = L"tags";
-wstring const LRXCompiler::LRX_COMPILER_WEIGHT_ATTR     = L"weight";
-wstring const LRXCompiler::LRX_COMPILER_COMMENT_ATTR    = L"c";
-wstring const LRXCompiler::LRX_COMPILER_NAME_ATTR       = L"n";
-wstring const LRXCompiler::LRX_COMPILER_FROM_ATTR       = L"from";
-wstring const LRXCompiler::LRX_COMPILER_UPTO_ATTR       = L"upto";
-
-wstring const LRXCompiler::LRX_COMPILER_TYPE_SELECT     = L"select";
-wstring const LRXCompiler::LRX_COMPILER_TYPE_REMOVE     = L"remove";
-wstring const LRXCompiler::LRX_COMPILER_TYPE_SKIP       = L"skip";
+UString const LRXCompiler::LRX_COMPILER_LRX_ELEM        = "lrx"_u;
+UString const LRXCompiler::LRX_COMPILER_DEFSEQS_ELEM    = "def-seqs"_u;
+UString const LRXCompiler::LRX_COMPILER_DEFSEQ_ELEM     = "def-seq"_u;
+UString const LRXCompiler::LRX_COMPILER_RULES_ELEM      = "rules"_u;
+UString const LRXCompiler::LRX_COMPILER_RULE_ELEM       = "rule"_u;
+UString const LRXCompiler::LRX_COMPILER_MATCH_ELEM      = "match"_u;
+UString const LRXCompiler::LRX_COMPILER_SELECT_ELEM     = "select"_u;
+UString const LRXCompiler::LRX_COMPILER_REMOVE_ELEM     = "remove"_u;
+UString const LRXCompiler::LRX_COMPILER_OR_ELEM         = "or"_u;
+UString const LRXCompiler::LRX_COMPILER_REPEAT_ELEM     = "repeat"_u;
+UString const LRXCompiler::LRX_COMPILER_SEQ_ELEM        = "seq"_u;
+
+UString const LRXCompiler::LRX_COMPILER_LEMMA_ATTR      = "lemma"_u;
+UString const LRXCompiler::LRX_COMPILER_SUFFIX_ATTR     = "suffix"_u;
+UString const LRXCompiler::LRX_COMPILER_CONTAINS_ATTR   = "contains"_u;
+UString const LRXCompiler::LRX_COMPILER_CASE_ATTR       = "case"_u;
+UString const LRXCompiler::LRX_COMPILER_SURFACE_ATTR    = "surface"_u;
+UString const LRXCompiler::LRX_COMPILER_TAGS_ATTR       = "tags"_u;
+UString const LRXCompiler::LRX_COMPILER_WEIGHT_ATTR     = "weight"_u;
+UString const LRXCompiler::LRX_COMPILER_COMMENT_ATTR    = "c"_u;
+UString const LRXCompiler::LRX_COMPILER_NAME_ATTR       = "n"_u;
+UString const LRXCompiler::LRX_COMPILER_FROM_ATTR       = "from"_u;
+UString const LRXCompiler::LRX_COMPILER_UPTO_ATTR       = "upto"_u;
+
+UString const LRXCompiler::LRX_COMPILER_TYPE_SELECT     = "select"_u;
+UString const LRXCompiler::LRX_COMPILER_TYPE_REMOVE     = "remove"_u;
+UString const LRXCompiler::LRX_COMPILER_TYPE_SKIP       = "skip"_u;
 
 double const  LRXCompiler::LRX_COMPILER_DEFAULT_WEIGHT  = 1.0;
 
-wstring
-LRXCompiler::itow(int i)
-{
-  // Convert an int to a wstring
-  wchar_t buf[50];
-  memset(buf, '\0', sizeof(buf));
-  swprintf(buf, 50, L"%d", i);
-  wstring id(buf);
-  return id;
-}
-
-int
-LRXCompiler::wtoi(wstring w)
+void
+LRXCompiler::debug(const char* fmt, ...) // printf-style trace message; written to debug_output only when debugMode is set
 {
-  // Convert a wstring to an int
-  wistringstream wstrm(w);
-  int i_name = -numeric_limits<int>::max();
-  wstrm >> i_name;
-
-  return i_name;
+  if (debugMode) {
+    va_list argptr;
+    va_start(argptr, fmt);
+    u_vfprintf(debug_output, fmt, argptr); // fmt and the variadic arguments are forwarded unchanged
+    va_end(argptr);
+  }
 }
 
-double
-LRXCompiler::wtod(wstring w)
+void
+LRXCompiler::error(const char* fmt, ...) // printf-style fatal error: prints the message, then terminates the process
 {
-  // Convert a wstring to a double
-  wistringstream wstrm(w);
-  double d_name = -numeric_limits<double>::max();
-  wstrm >> d_name;
-
-  return d_name;
+  u_fprintf(debug_output, "Error (line %d): ", // prefix with the XML reader's current parser line number
+            xmlTextReaderGetParserLineNumber(reader));
+  va_list argptr;
+  va_start(argptr, fmt);
+  u_vfprintf(debug_output, fmt, argptr);
+  va_end(argptr);
+  u_fputc('\n', debug_output); // callers pass the message without a trailing newline
+  exit(EXIT_FAILURE); // never returns to the caller
 }
 
 LRXCompiler::LRXCompiler()
 {
-  LtLocale::tryToSetLocale();
-
-  debugMode = false;
-  outputGraph = false;
-
-  currentRuleId = 0;
+  debug_output = u_finit(stderr, NULL, NULL);
 
   initialState = transducer.getInitial();
   currentState = initialState;
   lastState = initialState;
 
-  canSelect = true;
-
-  alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_SELECT + L">");
-  alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_REMOVE + L">");
-  alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_SKIP + L">");
-
-  alphabet.includeSymbol(L"<ANY_TAG>");
-  alphabet.includeSymbol(L"<ANY_CHAR>");
-  alphabet.includeSymbol(L"<ANY_UPPER>");
-  alphabet.includeSymbol(L"<ANY_LOWER>");
-  alphabet.includeSymbol(L"<$>");
-
+  alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_SELECT + ">"_u);
+  alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_REMOVE + ">"_u);
+  alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_SKIP + ">"_u);
+
+  alphabet.includeSymbol("<ANY_TAG>"_u);
+  alphabet.includeSymbol("<ANY_CHAR>"_u);
+  alphabet.includeSymbol("<ANY_UPPER>"_u);
+  alphabet.includeSymbol("<ANY_LOWER>"_u);
+  alphabet.includeSymbol("<$>"_u);
+
+  any_tag        = alphabet("<ANY_TAG>"_u);
+  any_char       = alphabet("<ANY_CHAR>"_u);
+  any_upper      = alphabet("<ANY_UPPER>"_u);
+  any_lower      = alphabet("<ANY_LOWER>"_u);
+  word_boundary  = alphabet(alphabet("<$>"_u), alphabet("<$>"_u));
 }
 
 LRXCompiler::~LRXCompiler()
@@ -129,64 +121,45 @@ LRXCompiler::setOutputGraph(bool o)
 }
 
 void
-LRXCompiler::skipBlanks(wstring &name)
+LRXCompiler::skipBlanks(UString &name)
 {
-  while(name == L"#text" || name == L"#comment")
+  while(name == "#text"_u || name == "#comment"_u)
   {
-    if(name != L"#comment")
+    if(name != "#comment"_u)
     {
       if(!allBlanks())
       {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): Invalid construction." << endl;
-        exit(EXIT_FAILURE);
+        error("Invalid construction.");
       }
     }
 
     xmlTextReaderRead(reader);
-    name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    name = XMLParseUtil::readName(reader);
   }
 }
 
-wstring
-LRXCompiler::attrib(wstring const &name)
+UString
+LRXCompiler::attrib(UString const &name)
 {
   return XMLParseUtil::attrib(reader, name);
 }
 
-wstring
-LRXCompiler::attrib(wstring const &name, const wstring fallback)
+UString
+LRXCompiler::attrib(UString const &name, const UString fallback)
 {
-  string mystr = "";
-  for (int i = 0, limit = name.size(); i != limit; i++) {
-    mystr += static_cast<char>(name[i]);
-  }
-
-  xmlChar *attrname = xmlCharStrdup(mystr.c_str());
-  xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname);
-  wstring result = XMLParseUtil::towstring(myattr);
-  xmlFree(myattr);
-  xmlFree(attrname);
-  if(myattr == NULL) {
-    return fallback;
-  }
-  else {
-    return result;
-  }
+  return XMLParseUtil::attrib(reader, name, fallback);
 }
 
 bool
 LRXCompiler::allBlanks()
 {
-  bool flag = true;
-  wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader));
-
-  for(unsigned int i = 0, limit = text.size(); i < limit; i++)
-  {
-    flag = flag && iswspace(text[i]);
+  UString text = XMLParseUtil::readValue(reader);
+  for (auto& c : text) {
+    if (!u_isspace(c)) {
+      return false;
+    }
   }
-
-  return flag;
+  return true;
 }
 
 void
@@ -210,7 +183,7 @@ LRXCompiler::parse(string const &fitxer)
 
   if(ret != 0)
   {
-    wcerr << L"Error: Parse error at the end of input." << endl;
+    cerr << "Error: Parse error at the end of input." << endl;
   }
 
 }
@@ -218,14 +191,13 @@ LRXCompiler::parse(string const &fitxer)
 void
 LRXCompiler::procNode()
 {
-  xmlChar const *xnombre = xmlTextReaderConstName(reader);
-  wstring nombre = XMLParseUtil::towstring(xnombre);
+  UString nombre = XMLParseUtil::readName(reader);
 
-  if(nombre == L"#text")
+  if(nombre == "#text"_u)
   {
     /* ignorar */
   }
-  else if(nombre== L"#comment")
+  else if(nombre== "#comment"_u)
   {
     /* ignorar */
   }
@@ -251,9 +223,7 @@ LRXCompiler::procNode()
   }
   else
   {
-    wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-    wcerr << L"): Invalid node '<" << nombre << L">'." << endl;
-    exit(EXIT_FAILURE);
+    error("Invalid node '<%S>'.", nombre.c_str());
   }
 
   return;
@@ -262,10 +232,13 @@ LRXCompiler::procNode()
 void
 LRXCompiler::procRule()
 {
-  wstring comment = this->attrib(LRX_COMPILER_COMMENT_ATTR);
-  wstring xweight = this->attrib(LRX_COMPILER_WEIGHT_ATTR);
-  wstring nombre = this->attrib(LRX_COMPILER_NAME_ATTR);
-  double weight =  wtod (xweight);
+  UString comment = this->attrib(LRX_COMPILER_COMMENT_ATTR);
+  UString xweight = this->attrib(LRX_COMPILER_WEIGHT_ATTR);
+  UString nombre = this->attrib(LRX_COMPILER_NAME_ATTR);
+  double weight = LRX_COMPILER_DEFAULT_WEIGHT;
+  if (!xweight.empty()) {
+    weight = StringUtils::stod(xweight);
+  }
 
   if(weight <= -numeric_limits<int>::max())
   {
@@ -276,25 +249,19 @@ LRXCompiler::procRule()
   currentState = transducer.insertNewSingleTransduction(alphabet(0, 0), currentState);
 
   currentRuleId++;
-  wstring ruleId = L"<" + itow(currentRuleId) + L">";
+  UString ruleId = "<"_u + StringUtils::itoa(currentRuleId) + ">"_u;
   weights[currentRuleId] = weight;
 
-  if(debugMode)
-  {
-    fwprintf(stderr, L"  rule: %d, weight: %.2f \n", currentRuleId, weight);
-  }
+  debug("  rule: %d, weight: %.2f \n", currentRuleId, weight);
 
   while(true)
   {
     int ret = xmlTextReaderRead(reader);
-    if(ret != 1)
-    {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Parse error." << endl;
-      exit(EXIT_FAILURE);
+    if(ret != 1) {
+      error("Parse error.");
     }
 
-    wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    UString name = XMLParseUtil::readName(reader);
     skipBlanks(name);
 
     if(name == LRX_COMPILER_MATCH_ELEM)
@@ -316,7 +283,7 @@ LRXCompiler::procRule()
     }
     else if(name == LRX_COMPILER_RULE_ELEM)
     {
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState);
+      currentState = transducer.insertSingleTransduction(word_boundary, currentState);
       if(!alphabet.isSymbolDefined(ruleId.c_str()))
       {
         alphabet.includeSymbol(ruleId.c_str());
@@ -328,39 +295,26 @@ LRXCompiler::procRule()
     }
     else
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_RULE_ELEM;
-      wcerr << L">'." << endl;
-      exit(EXIT_FAILURE);
+      error("Invalid inclusion of '<%S>' into '<rule>'.", name.c_str());
     }
   }
-
-
-  return;
 }
 
 void
 LRXCompiler::procOr()
 {
-
-  if(debugMode)
-  {
-    fwprintf(stderr, L"    or: \n");
-  }
+  debug("    or: \n");
 
   int or_initial_state = currentState;
   vector<int> reachedStates;
   while(true)
   {
     int ret = xmlTextReaderRead(reader);
-    if(ret != 1)
-    {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Parse error." << endl;
-      exit(EXIT_FAILURE);
+    if(ret != 1) {
+      error("Parse error.");
     }
 
-    wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    UString name = XMLParseUtil::readName(reader);
     skipBlanks(name);
 
     if(name == LRX_COMPILER_MATCH_ELEM)
@@ -392,10 +346,7 @@ LRXCompiler::procOr()
     }
     else
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_OR_ELEM;
-      wcerr << L">'." << endl;
-      exit(EXIT_FAILURE);
+      error("Invalid inclusion of '<%S>' into '<or>'.", name.c_str());
     }
   }
 
@@ -412,18 +363,15 @@ LRXCompiler::procDefSeq()
   int oldstate = currentState;
   currentState = initialState;
   lastState = initialState;
-  wstring seqname = this->attrib(LRX_COMPILER_NAME_ATTR);
+  UString seqname = this->attrib(LRX_COMPILER_NAME_ATTR);
   while(true)
   {
     int ret = xmlTextReaderRead(reader);
-    if(ret != 1)
-    {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Parse error." << endl;
-      exit(EXIT_FAILURE);
+    if(ret != 1) {
+      error("Parse error.");
     }
 
-    wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    UString name = XMLParseUtil::readName(reader);
     skipBlanks(name);
 
     if(name == LRX_COMPILER_MATCH_ELEM)
@@ -450,10 +398,7 @@ LRXCompiler::procDefSeq()
     }
     else
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_REPEAT_ELEM;
-      wcerr << L">'." << endl;
-      exit(EXIT_FAILURE);
+      error("Invalid inclusion of '<%S>' into '<repeat>'.", name.c_str());
     }
   }
   sequences[seqname] = transducer;
@@ -468,22 +413,19 @@ void
 LRXCompiler::procMatch()
 {
   // These are mutually exclusive
-  wstring lemma = this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*");
-  wstring contains = this->attrib(LRX_COMPILER_SUFFIX_ATTR);
-  wstring suffix = this->attrib(LRX_COMPILER_CONTAINS_ATTR);
-  wstring _case = this->attrib(LRX_COMPILER_CASE_ATTR); // This could potentially be non-exclusive
+  UString lemma = this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u);
+  UString contains = this->attrib(LRX_COMPILER_SUFFIX_ATTR); // NOTE(review): 'contains' is read from the suffix attribute — same as before this change; confirm intended
+  UString suffix = this->attrib(LRX_COMPILER_CONTAINS_ATTR); // NOTE(review): 'suffix' is read from the contains attribute — confirm intended
+  UString _case = this->attrib(LRX_COMPILER_CASE_ATTR); // This could potentially be non-exclusive
 
   // This is currently disabled: Future use
-  wstring surface = this->attrib(LRX_COMPILER_SURFACE_ATTR);
+  UString surface = this->attrib(LRX_COMPILER_SURFACE_ATTR);
 
-  wstring tags = this->attrib(LRX_COMPILER_TAGS_ATTR, L"*");
+  UString tags = this->attrib(LRX_COMPILER_TAGS_ATTR, "*"_u);
 
-  if(surface != L"")
+  if(!surface.empty())
   {
-    if(debugMode)
-    {
-      fwprintf(stderr, L"      match: %S\n", surface.c_str());
-    }
+    debug("      match: %S\n", surface.c_str());
 
     for(auto& it : surface)
     {
@@ -492,70 +434,64 @@ LRXCompiler::procMatch()
   }
   else
   {
-    if(debugMode)
-    {
-      fwprintf(stderr, L"      match: [%S, %S, %S, %S] %S\n", lemma.c_str(), suffix.c_str(), contains.c_str(), _case.c_str(), tags.c_str());
-    }
+    debug("      match: [%S, %S, %S, %S] %S\n", lemma.c_str(), suffix.c_str(), contains.c_str(), _case.c_str(), tags.c_str());
 
-    if(_case != L"")
+    if(_case != ""_u)
     {
-      if(_case == L"AA") // <ANY_UPPER>+
+      if(_case == "AA"_u) // <ANY_UPPER>+
       {
         int localLast = currentState;
-        currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_UPPER>"), 0), currentState);
+        currentState = transducer.insertSingleTransduction(alphabet(any_upper, 0), currentState);
         transducer.linkStates(currentState, localLast, 0);
       }
-      else if(_case == L"aa")  // <ANY_LOWER>+
+      else if(_case == "aa"_u)  // <ANY_LOWER>+
       {
         int localLast = currentState;
-        currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_LOWER>"), 0), currentState);
+        currentState = transducer.insertSingleTransduction(alphabet(any_lower, 0), currentState);
         transducer.linkStates(currentState, localLast, 0);
       }
-      else if(_case == L"Aa") // <ANY_UPPER>+ <ANY_LOWER>+
+      else if(_case == "Aa"_u) // <ANY_UPPER>+ <ANY_LOWER>+
       {
-        currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_UPPER>"), 0), currentState);
+        currentState = transducer.insertSingleTransduction(alphabet(any_upper, 0), currentState);
         int localLast = currentState;
-        currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_LOWER>"), 0), currentState);
+        currentState = transducer.insertSingleTransduction(alphabet(any_lower, 0), currentState);
         transducer.linkStates(currentState, localLast, 0);
       }
     }
-    if(lemma == L"*" && suffix == L"" && contains == L"" && _case == L"")
+    if(lemma == "*"_u && suffix.empty() && contains.empty() && _case.empty())
     {
       // This is only if there is no suffix or case or contains
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        char: -\n");
-      }
+      debug("        char: -\n");
       int localLast = currentState;
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"), 0), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(any_char, 0), currentState);
       transducer.linkStates(currentState, localLast, 0);
     }
-    else if(suffix != L"")
+    else if(suffix != ""_u)
     {
       // A suffix is <ANY_CHAR> any amount of times followed by whatever is in the suffix
       int localLast = currentState;
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"), 0), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(any_char, 0), currentState);
       transducer.linkStates(currentState, localLast, 0);
       for(auto& it : suffix)
       {
         currentState = transducer.insertSingleTransduction(alphabet(it, 0), currentState);
       }
     }
-    else if(contains != L"")
+    else if(!contains.empty())
     {
       // A contains is <ANY_CHAR> any amount of times followed by whatever is in the attribute
       // followed by <ANY_CHAR> any amount of times
       int localLast = currentState;
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"), 0), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(any_char, 0), currentState);
       transducer.linkStates(currentState, localLast, 0);
       for(auto& it : suffix)
       {
         currentState = transducer.insertSingleTransduction(alphabet(it, 0), currentState);
       }
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"), 0), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(any_char, 0), currentState);
       transducer.linkStates(currentState, localLast, 0);
     }
-    else if(lemma != L"*")
+    else if(lemma != "*"_u)
     {
       for(auto& it : lemma)
       {
@@ -564,66 +500,57 @@ LRXCompiler::procMatch()
     }
     else
     {
-      fwprintf(stderr, L"Something surprising happened in <match> compilation\n");
+      cerr << "Something surprising happened in <match> compilation\n";
     }
 
-    wstring tag = L"";
+    UString tag;
     for(auto& it : tags)
     {
-      if(it == L'.')
+      if(it == '.')
       {
-        if(tag == L"")
+        if(tag.empty())
         {
           continue;
         }
-        tag = L"<" + tag + L">";
+        tag = "<"_u + tag + ">"_u;
         if(!alphabet.isSymbolDefined(tag.c_str()))
         {
           alphabet.includeSymbol(tag.c_str());
         }
-        if(debugMode)
-        {
-          fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-        }
-        if(tag == L"<*>")
+        debug("        tag: %S\n", tag.c_str());
+        if(tag == "<*>"_u)
         {
           int localLast = currentState;
-          currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"), 0), currentState);
+          currentState = transducer.insertSingleTransduction(alphabet(any_tag, 0), currentState);
           transducer.linkStates(currentState, localLast, 0);
         }
         else
         {
           currentState = transducer.insertSingleTransduction(alphabet(alphabet(tag.c_str()), 0), currentState);
         }
-        tag = L"";
+        tag = ""_u;
         continue;
       }
       tag = tag + it;
     }
-    if(tag == L"*")
+    if(tag == "*"_u)
     {
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-      }
+      debug("        tag: %S\n", tag.c_str());
       int localLast = currentState;
-      currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"), 0), currentState);
+      currentState = transducer.insertSingleTransduction(alphabet(any_tag, 0), currentState);
       transducer.linkStates(currentState, localLast, 0);
     }
-    else if(tag == L"")
+    else if(tag.empty())
     {
     }
     else
     {
-      tag = L"<" + tag + L">";
+      tag = "<"_u + tag + ">"_u;
       if(!alphabet.isSymbolDefined(tag.c_str()))
       {
         alphabet.includeSymbol(tag.c_str());
       }
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-      }
+      debug("        tag: %S\n", tag.c_str());
       currentState = transducer.insertSingleTransduction(alphabet(alphabet(tag.c_str()), 0), currentState);
     }
   }
@@ -631,42 +558,33 @@ LRXCompiler::procMatch()
   if(xmlTextReaderIsEmptyElement(reader))
   {
     // If self-closing
-    currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState);
-    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<skip>")), currentState);
+    currentState = transducer.insertSingleTransduction(word_boundary, currentState);
+    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<skip>"_u)), currentState);
     return;
   }
 
-  wstring name = L"";
+  UString name = ""_u;
   while(true)
   {
     int ret = xmlTextReaderRead(reader);
-    if(ret != 1)
-    {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Parse error." << endl;
-      exit(EXIT_FAILURE);
+    if(ret != 1) {
+      error("Parse error.");
     }
 
-    name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    name = XMLParseUtil::readName(reader);
     skipBlanks(name);
 
     if(name == LRX_COMPILER_SELECT_ELEM)
     {
-      if(!canSelect)
-      {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): <select> is not permitted inside <repeat>." << endl;
-        exit(EXIT_FAILURE);
+      if(!canSelect) {
+        error("<select> is not permitted inside <repeat>.");
       }
       procSelect();
     }
     else if(name == LRX_COMPILER_REMOVE_ELEM)
     {
-      if(!canSelect)
-      {
-        wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-        wcerr << L"): <remove> is not permitted inside <repeat>." << endl;
-        exit(EXIT_FAILURE);
+      if(!canSelect) {
+        error("<remove> is not permitted inside <repeat>.");
       }
       procRemove();
     }
@@ -676,10 +594,7 @@ LRXCompiler::procMatch()
     }
     else
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_MATCH_ELEM;
-      wcerr << L">'." << endl;
-      exit(EXIT_FAILURE);
+      error("Invalid inclusion of '<%S>' into '<match>'.", name.c_str());
     }
   }
 
@@ -691,11 +606,11 @@ void
 LRXCompiler::procSelect()
 {
 
-  wstring lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*");
-  wstring tags =this->attrib(LRX_COMPILER_TAGS_ATTR);
+  UString lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u);
+  UString tags =this->attrib(LRX_COMPILER_TAGS_ATTR);
 
-  wstring key = L"<" + LRX_COMPILER_TYPE_SELECT + L">";
-  if(lemma != L"*")
+  UString key = "<"_u + LRX_COMPILER_TYPE_SELECT + ">"_u;
+  if(lemma != "*"_u)
   {
     key += lemma;
   }
@@ -703,22 +618,19 @@ LRXCompiler::procSelect()
   Transducer recogniser;
   int localCurrentState = recogniser.getInitial();
 
-  if(debugMode)
-  {
-    fwprintf(stderr, L"        select: %S, %S\n", lemma.c_str(), tags.c_str());
-  }
+  debug("        select: %S, %S\n", lemma.c_str(), tags.c_str());
 
-  currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState);
-  currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<" + LRX_COMPILER_TYPE_SELECT + L">")), currentState);
+  currentState = transducer.insertSingleTransduction(word_boundary, currentState);
+  currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<"_u + LRX_COMPILER_TYPE_SELECT + ">"_u)), currentState);
 
 
-  if(lemma == L"*")
+  if(lemma == "*"_u)
   {
-    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_CHAR>")), currentState);
+    currentState = transducer.insertSingleTransduction(alphabet(0, any_char), currentState);
     int localLast = localCurrentState;
-    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"),0), localCurrentState);
+    localCurrentState = recogniser.insertSingleTransduction(alphabet(any_char ,0), localCurrentState);
     recogniser.linkStates(localCurrentState, localLast, 0);
-    key = key + L"<ANY_CHAR>";
+    key = key + "<ANY_CHAR>"_u;
   }
   else {
     for (auto &it : lemma) {
@@ -727,29 +639,24 @@ LRXCompiler::procSelect()
     }
   }
 
-  if(tags != L"")
-  {
-    wstring tag = L"";
-    for(auto& it : tags)
-    {
-      if(it == L'.')
+  if(!tags.empty()) {
+    UString tag;
+    for(auto& it : tags) {
+      if(it == '.')
       {
-        tag = L"<" + tag + L">";
+        tag = "<"_u + tag + ">"_u;
         if(!alphabet.isSymbolDefined(tag.c_str()))
         {
           alphabet.includeSymbol(tag.c_str());
         }
-        if(debugMode)
+        debug("        tag: %S\n", tag.c_str());
+        if(tag == "<*>"_u)
         {
-          fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-        }
-        if(tag == L"<*>")
-        {
-          currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+          currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState);
 	  int localLast = localCurrentState;
-          localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+          localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag ,0), localCurrentState);
 	  recogniser.linkStates(localCurrentState, localLast, 0);
-          key = key + L"<ANY_TAG>";
+          key = key + "<ANY_TAG>"_u;
         }
         else
         {
@@ -757,34 +664,28 @@ LRXCompiler::procSelect()
           localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState);
           key = key + tag;
         }
-        tag = L"";
+        tag = ""_u;
         continue;
       }
       tag = tag + it;
     }
-    if(tag == L"*")
+    if(tag == "*"_u)
     {
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-      }
-      currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+      debug("        tag: %S\n", tag.c_str());
+      currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState);
       int localLast = localCurrentState;
-      localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+      localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag ,0), localCurrentState);
       recogniser.linkStates(localCurrentState, localLast, 0);
-      key = key + L"<ANY_TAG>";
+      key = key + "<ANY_TAG>"_u;
     }
     else
     {
-      tag = L"<" + tag + L">";
+      tag = "<"_u + tag + ">"_u;
       if(!alphabet.isSymbolDefined(tag.c_str()))
       {
         alphabet.includeSymbol(tag.c_str());
       }
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-      }
+      debug("        tag: %S\n", tag.c_str());
       currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(tag.c_str())), currentState);
       localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState);
       key = key + tag;
@@ -792,26 +693,20 @@ LRXCompiler::procSelect()
   }
   else
   {
-    if(debugMode)
-    {
-      fwprintf(stderr, L"        tag: -\n");
-    }
-    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+    debug("        tag: -\n");
+    currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState);
     int localLast = localCurrentState;
-    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+    localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag ,0), localCurrentState);
     recogniser.linkStates(localCurrentState, localLast, 0);
-    key = key + L"<ANY_TAG>";
+    key = key + "<ANY_TAG>"_u;
   }
 
 
   recogniser.setFinal(localCurrentState);
 
   recognisers[key] = recogniser;
-  if(debugMode)
-  {
-    fwprintf(stderr, L"        select: %d\n", recognisers[key].size());
-  }
-  //currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState);
+  debug("        select: %d\n", recognisers[key].size());
+  //currentState = transducer.insertSingleTransduction(word_boundary, currentState);
 
   return;
 }
@@ -820,11 +715,11 @@ void
 LRXCompiler::procRemove()
 {
 
-  wstring lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*");
-  wstring tags =this->attrib(LRX_COMPILER_TAGS_ATTR);
+  UString lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u);
+  UString tags =this->attrib(LRX_COMPILER_TAGS_ATTR);
 
-  wstring key = L"<" + LRX_COMPILER_TYPE_REMOVE + L">";
-  if(lemma != L"*")
+  UString key = "<"_u + LRX_COMPILER_TYPE_REMOVE + ">"_u;
+  if(lemma != "*"_u)
   {
     key += lemma;
   }
@@ -832,21 +727,18 @@ LRXCompiler::procRemove()
   Transducer recogniser;
   int localCurrentState = recogniser.getInitial();
 
-  if(debugMode)
-  {
-    fwprintf(stderr, L"        remove: %S, %S\n", lemma.c_str(), tags.c_str());
-  }
+  debug("        remove: %S, %S\n", lemma.c_str(), tags.c_str());
 
-  currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState);
-  currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<" + LRX_COMPILER_TYPE_REMOVE + L">")), currentState);
+  currentState = transducer.insertSingleTransduction(word_boundary, currentState);
+  currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<"_u + LRX_COMPILER_TYPE_REMOVE + ">"_u)), currentState);
 
-  if(lemma == L"*")
+  if(lemma == "*"_u)
   {
-    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_CHAR>")), currentState);
+    currentState = transducer.insertSingleTransduction(alphabet(0, any_char), currentState);
     int localLast = localCurrentState;
-    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_CHAR>"),0), localCurrentState);
+    localCurrentState = recogniser.insertSingleTransduction(alphabet(any_char ,0), localCurrentState);
     recogniser.linkStates(localCurrentState, localLast, 0);
-    key = key + L"<ANY_CHAR>";
+    key = key + "<ANY_CHAR>"_u;
   }
   else
   {
@@ -857,29 +749,26 @@ LRXCompiler::procRemove()
     }
   }
 
-  if(tags != L"")
+  if(tags != ""_u)
   {
-    wstring tag = L"";
+    UString tag = ""_u;
     for(auto& it : tags)
     {
-      if(it == L'.')
+      if(it == '.')
       {
-        tag = L"<" + tag + L">";
+        tag = "<"_u + tag + ">"_u;
         if(!alphabet.isSymbolDefined(tag.c_str()))
         {
           alphabet.includeSymbol(tag.c_str());
         }
-        if(debugMode)
-        {
-          fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-        }
-        if(tag == L"<*>")
+        debug("        tag: %S\n", tag.c_str());
+        if(tag == "<*>"_u)
         {
-          currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+          currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState);
 	  int localLast = localCurrentState;
-          localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+          localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag, 0), localCurrentState);
 	  recogniser.linkStates(localCurrentState, localLast, 0);
-          key = key + L"<ANY_TAG>";
+          key = key + "<ANY_TAG>"_u;
         }
         else
         {
@@ -887,34 +776,28 @@ LRXCompiler::procRemove()
           localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState);
           key = key + tag;
         }
-        tag = L"";
+        tag = ""_u;
         continue;
       }
       tag = tag + it;
     }
-    if(tag == L"*")
+    if(tag == "*"_u)
     {
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
-      }
-      currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+      debug("        tag: %S\n", tag.c_str());
+      currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState);
       int localLast = localCurrentState;
-      localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+      localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag, 0), localCurrentState);
       recogniser.linkStates(localCurrentState, localLast, 0);
-      key = key + L"<ANY_TAG>";
+      key = key + "<ANY_TAG>"_u;
     }
     else
     {
-      tag = L"<" + tag + L">";
+      tag = "<"_u + tag + ">"_u;
       if(!alphabet.isSymbolDefined(tag.c_str()))
       {
-        alphabet.includeSymbol(tag.c_str());
-      }
-      if(debugMode)
-      {
-        fwprintf(stderr, L"        tag: %S\n", tag.c_str());
+        alphabet.includeSymbol(tag);
       }
+      debug("        tag: %S\n", tag.c_str());
       currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(tag.c_str())), currentState);
       localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState);
       key = key + tag;
@@ -922,25 +805,19 @@ LRXCompiler::procRemove()
   }
   else
   {
-    if(debugMode)
-    {
-      fwprintf(stderr, L"        tag: -\n");
-    }
-    currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<ANY_TAG>")), currentState);
+    debug("        tag: -\n");
+    currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState);
     int localLast = localCurrentState;
-    localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L"<ANY_TAG>"),0), localCurrentState);
+    localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag,0), localCurrentState);
     recogniser.linkStates(localCurrentState, localLast, 0);
-    key = key + L"<ANY_TAG>";
+    key = key + "<ANY_TAG>"_u;
   }
 
 
   recogniser.setFinal(localCurrentState);
 
   recognisers[key] = recogniser;
-  if(debugMode)
-  {
-    fwprintf(stderr, L"        remove: %d\n", recognisers[key].size());
-  }
+  debug("        remove: %d\n", recognisers[key].size());
 
   return;
 }
@@ -951,21 +828,14 @@ LRXCompiler::procRepeat()
 {
   bool couldSelect = canSelect;
   canSelect = false;
-  wstring xfrom = this->attrib(LRX_COMPILER_FROM_ATTR);
-  wstring xupto = this->attrib(LRX_COMPILER_UPTO_ATTR);
-  int from = stoi(xfrom);
-  int upto = stoi(xupto);
-  if(from < 0 || upto < 0)
-  {
-    wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-    wcerr << L"): Number of repetitions cannot be negative." << endl;
-    exit(EXIT_FAILURE);
-  }
-  else if(from > upto)
-  {
-    wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-    wcerr << L"): Lower bound on number of repetitions cannot be larger than upper bound." << endl;
-    exit(EXIT_FAILURE);
+  UString xfrom = this->attrib(LRX_COMPILER_FROM_ATTR);
+  UString xupto = this->attrib(LRX_COMPILER_UPTO_ATTR);
+  int from = StringUtils::stoi(xfrom);
+  int upto = StringUtils::stoi(xupto);
+  if(from < 0 || upto < 0) {
+    error("Number of repetitions cannot be negative.");
+  } else if(from > upto) {
+    error("Lower bound on number of repetitions cannot be larger than upper bound.");
   }
   int count = upto - from;
   int oldstate = currentState;
@@ -976,14 +846,11 @@ LRXCompiler::procRepeat()
   while(true)
   {
     int ret = xmlTextReaderRead(reader);
-    if(ret != 1)
-    {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Parse error." << endl;
-      exit(EXIT_FAILURE);
+    if(ret != 1) {
+      error("Parse error.");
     }
 
-    wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+    UString name = XMLParseUtil::readName(reader);
     skipBlanks(name);
 
     if(name == LRX_COMPILER_MATCH_ELEM)
@@ -1006,10 +873,7 @@ LRXCompiler::procRepeat()
     }
     else
     {
-      wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-      wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_REPEAT_ELEM;
-      wcerr << L">'." << endl;
-      exit(EXIT_FAILURE);
+      error("Invalid inclusion of '<%S>' into '<repeat>'.", name.c_str());
     }
   }
   for(int i = 0; i < from; i++)
@@ -1031,12 +895,10 @@ LRXCompiler::procRepeat()
 void
 LRXCompiler::procSeq()
 {
-  wstring name = this->attrib(LRX_COMPILER_NAME_ATTR);
+  UString name = this->attrib(LRX_COMPILER_NAME_ATTR);
   if(sequences.find(name) == sequences.end())
   {
-    wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader);
-    wcerr << L"): Sequence '" << name << L"' not defined." << endl;
-    exit(EXIT_FAILURE);
+    error("Sequence '%S' is not defined.", name.c_str());
   }
   currentState = transducer.insertTransducer(currentState, sequences[name]);
 }
@@ -1050,28 +912,24 @@ LRXCompiler::write(FILE *fst)
   Compression::multibyte_write(recognisers.size(), fst);
   for(auto& it : recognisers)
   {
-    Compression::wstring_write(it.first, fst);
-    if(debugMode)
-    {
-      fwprintf(stderr, L"+ %d => %S\n", it.second.size(), it.first.c_str());
-      it.second.show(alphabet, stderr, 0, false);
+    Compression::string_write(it.first, fst);
+    debug("+ %d => %S\n", it.second.size(), it.first.c_str());
+    if (debugMode) {
+      it.second.show(alphabet, debug_output, 0, false);
     }
     it.second.write(fst);
   }
 
-  Compression::wstring_write(L"main", fst);
+  Compression::string_write("main"_u, fst);
   if(outputGraph)
   {
-    transducer.show(alphabet, stderr, 0, false);
+    transducer.show(alphabet, debug_output, 0, false);
   }
   transducer.write(fst);
 
   for(auto& it : weights)
   {
-    if(debugMode)
-    {
-      fwprintf(stderr, L"%.4f %d\n", it.second, it.first);
-    }
+    debug("%.4f %d\n", it.second, it.first);
     weight record{it.first, "", it.second};
     weight_to_le(record);
     fwrite((void *)&record, 1, sizeof(weight), fst);
@@ -1079,6 +937,6 @@ LRXCompiler::write(FILE *fst)
 
   if(!outputGraph)
   {
-    fwprintf(stderr, L"%d: %d@%d\n", currentRuleId, transducer.size(), transducer.numberOfTransitions());
+    u_fprintf(debug_output, "%d: %d@%d\n", currentRuleId, transducer.size(), transducer.numberOfTransitions());
   }
 }
diff --git a/src/lrx_compiler.h b/src/lrx_compiler.h
index 099c4a7..6752215 100644
--- a/src/lrx_compiler.h
+++ b/src/lrx_compiler.h
@@ -18,30 +18,12 @@
 #ifndef __LRX_COMPILER_H__
 #define __LRX_COMPILER_H__
 
-#include <cwchar>
-#include <cstdio>
-#include <libgen.h>
-#include <cerrno>
 #include <string>
-#include <iostream>
-#include <limits>
-#include <sstream>
-#include <cstdlib>
-#include <list>
-#include <set>
-
+#include <cstdint>
 #include <libxml/xmlreader.h>
-
-#include <lttoolbox/ltstr.h>
-#include <lttoolbox/lt_locale.h>
 #include <lttoolbox/transducer.h>
-#include <lttoolbox/xml_parse_util.h>
 #include <lttoolbox/alphabet.h>
-#include <lttoolbox/compression.h>
-#include <lttoolbox/regexp_compiler.h>
-#include <lttoolbox/state.h>
-#include <lttoolbox/trans_exe.h>
-#include <lttoolbox/my_stdio.h>
+#include <unicode/ustdio.h>
 
 using namespace std;
 
@@ -52,23 +34,33 @@ private:
   Alphabet alphabet;
   Transducer transducer;
 
-  map<wstring, Transducer> recognisers; // keyed on pattern
-  map<int, double> weights; // keyed on rule id
+  map<UString, Transducer> recognisers; // keyed on pattern
+  map<int32_t, double> weights; // keyed on rule id
 
-  map<wstring, Transducer> sequences;
+  map<UString, Transducer> sequences;
 
-  int initialState;
-  int lastState;
-  int currentState;
-  bool canSelect; // disallow <select>, <remove> inside <def-seq>, <repeat>
+  int32_t initialState = 0;
+  int32_t lastState = 0;
+  int32_t currentState = 0;
+  // disallow <select>, <remove> inside <def-seq>, <repeat>
+  bool canSelect = true;
 
-  int currentRuleId;
+  int32_t currentRuleId = 0;
 
-  bool debugMode;
-  bool outputGraph;
+  int32_t any_tag = 0;
+  int32_t any_char = 0;
+  int32_t any_upper = 0;
+  int32_t any_lower = 0;
+  int32_t word_boundary = 0;
+
+  bool debugMode = false;
+  bool outputGraph = false;
+  UFILE* debug_output = nullptr; // stream write() passes to show()/u_fprintf
+  void debug(const char* fmt, ...);
+  void error(const char* fmt, ...);
   bool allBlanks();
 
-  void skipBlanks(wstring &name);
+  void skipBlanks(UString &name);
   void procNode();
   void procList();
   void procListMatch();
@@ -82,43 +74,39 @@ private:
   void procSeq();
 
   /* If attrib does not exist (or other error), returns an empty string: */
-  wstring attrib(wstring const &name);
+  UString attrib(UString const &name);
 
   /* If attrib does not exist (or other error), returns fallback: */
-  wstring attrib(wstring const &name, const wstring fallback);
-
-  wstring itow(int i);
-  int wtoi(wstring);
-  double wtod(wstring);
+  UString attrib(UString const &name, const UString fallback);
 
 public:
-  static wstring const LRX_COMPILER_LRX_ELEM;
-  static wstring const LRX_COMPILER_DEFSEQS_ELEM;
-  static wstring const LRX_COMPILER_DEFSEQ_ELEM;
-  static wstring const LRX_COMPILER_RULES_ELEM;
-  static wstring const LRX_COMPILER_RULE_ELEM;
-  static wstring const LRX_COMPILER_MATCH_ELEM;
-  static wstring const LRX_COMPILER_SELECT_ELEM;
-  static wstring const LRX_COMPILER_REMOVE_ELEM;
-  static wstring const LRX_COMPILER_OR_ELEM;
-  static wstring const LRX_COMPILER_REPEAT_ELEM;
-  static wstring const LRX_COMPILER_SEQ_ELEM;
-
-  static wstring const LRX_COMPILER_SURFACE_ATTR;
-  static wstring const LRX_COMPILER_SUFFIX_ATTR;
-  static wstring const LRX_COMPILER_LEMMA_ATTR;
-  static wstring const LRX_COMPILER_CONTAINS_ATTR;
-  static wstring const LRX_COMPILER_CASE_ATTR;
-  static wstring const LRX_COMPILER_TAGS_ATTR;
-  static wstring const LRX_COMPILER_COMMENT_ATTR;
-  static wstring const LRX_COMPILER_NAME_ATTR;
-  static wstring const LRX_COMPILER_WEIGHT_ATTR;
-  static wstring const LRX_COMPILER_FROM_ATTR;
-  static wstring const LRX_COMPILER_UPTO_ATTR;
-
-  static wstring const LRX_COMPILER_TYPE_SELECT;
-  static wstring const LRX_COMPILER_TYPE_REMOVE;
-  static wstring const LRX_COMPILER_TYPE_SKIP;
+  static UString const LRX_COMPILER_LRX_ELEM;
+  static UString const LRX_COMPILER_DEFSEQS_ELEM;
+  static UString const LRX_COMPILER_DEFSEQ_ELEM;
+  static UString const LRX_COMPILER_RULES_ELEM;
+  static UString const LRX_COMPILER_RULE_ELEM;
+  static UString const LRX_COMPILER_MATCH_ELEM;
+  static UString const LRX_COMPILER_SELECT_ELEM;
+  static UString const LRX_COMPILER_REMOVE_ELEM;
+  static UString const LRX_COMPILER_OR_ELEM;
+  static UString const LRX_COMPILER_REPEAT_ELEM;
+  static UString const LRX_COMPILER_SEQ_ELEM;
+
+  static UString const LRX_COMPILER_SURFACE_ATTR;
+  static UString const LRX_COMPILER_SUFFIX_ATTR;
+  static UString const LRX_COMPILER_LEMMA_ATTR;
+  static UString const LRX_COMPILER_CONTAINS_ATTR;
+  static UString const LRX_COMPILER_CASE_ATTR;
+  static UString const LRX_COMPILER_TAGS_ATTR;
+  static UString const LRX_COMPILER_COMMENT_ATTR;
+  static UString const LRX_COMPILER_NAME_ATTR;
+  static UString const LRX_COMPILER_WEIGHT_ATTR;
+  static UString const LRX_COMPILER_FROM_ATTR;
+  static UString const LRX_COMPILER_UPTO_ATTR;
+
+  static UString const LRX_COMPILER_TYPE_SELECT;
+  static UString const LRX_COMPILER_TYPE_REMOVE;
+  static UString const LRX_COMPILER_TYPE_SKIP;
 
   static double  const LRX_COMPILER_DEFAULT_WEIGHT;
 
diff --git a/src/lrx_proc.cc b/src/lrx_proc.cc
index bd77260..db713ed 100644
--- a/src/lrx_proc.cc
+++ b/src/lrx_proc.cc
@@ -20,11 +20,7 @@
 #include <getopt.h>
 #include <iostream>
 #include <libgen.h>
-
-#ifdef _MSC_VER
-#include <io.h>
-#include <fcntl.h>
-#endif
+#include <lttoolbox/lt_locale.h>
 
 using namespace std;
 
@@ -92,7 +88,8 @@ int main(int argc, char *argv[])
     }
   }
 
-  FILE *input = stdin, *output = stdout;
+  InputFile input;
+  UFILE* output = u_finit(stdout, NULL, NULL);
   LtLocale::tryToSetLocale();
 
   if(optind == (argc - 3))
@@ -103,14 +100,12 @@ int main(int argc, char *argv[])
       endProgram(argv[0]);
     }
 
-    input = fopen(argv[optind+1], "rb");
-    if(input == NULL || ferror(input))
-    {
+    if (!input.open(argv[optind+1])) {
       endProgram(argv[0]);
     }
 
-    output= fopen(argv[optind+2], "wb");
-    if(output == NULL || ferror(output))
+    output = u_fopen(argv[optind+2], "w", NULL, NULL);
+    if(output == NULL)
     {
       endProgram(argv[0]);
     }
@@ -126,9 +121,7 @@ int main(int argc, char *argv[])
       endProgram(argv[0]);
     }
 
-    input = fopen(argv[optind+1], "rb");
-    if(input == NULL || ferror(input))
-    {
+    if (!input.open(argv[optind+1])) {
       endProgram(argv[0]);
     }
 
@@ -150,14 +143,8 @@ int main(int argc, char *argv[])
     endProgram(argv[0]);
   }
 
-#ifdef _MSC_VER
-        _setmode(_fileno(input), _O_U8TEXT);
-        _setmode(_fileno(output), _O_U8TEXT);
-#endif
-
   lrxp.init();
   lrxp.process(input, output);
-  fclose(input);
-  fclose(output);
+  u_fclose(output);
   return EXIT_SUCCESS;
 }
diff --git a/src/lrx_processor.cc b/src/lrx_processor.cc
index 276c6ba..8715097 100644
--- a/src/lrx_processor.cc
+++ b/src/lrx_processor.cc
@@ -17,21 +17,28 @@
 
 #include <weight.h>
 #include <lrx_processor.h>
-#include <cstdint>
+#include <iostream>
+#include <lttoolbox/compression.h>
+
 using namespace std;
 
-wstring const LRXProcessor::LRX_PROCESSOR_TAG_SELECT     = L"<select>";
-wstring const LRXProcessor::LRX_PROCESSOR_TAG_REMOVE     = L"<remove>";
-wstring const LRXProcessor::LRX_PROCESSOR_TAG_SKIP       = L"<skip>";
+UString const LRXProcessor::LRX_PROCESSOR_TAG_SELECT     = "<select>"_u;
+UString const LRXProcessor::LRX_PROCESSOR_TAG_REMOVE     = "<remove>"_u;
+UString const LRXProcessor::LRX_PROCESSOR_TAG_SKIP       = "<skip>"_u;  // process() does not score ops equal to this
+
+UString const LRXProcessor::LRX_PROCESSOR_TAG_ANY_CHAR       = "<ANY_CHAR>"_u;   // load() caches its alphabet code as any_char; offered as an alternative on every character step
+UString const LRXProcessor::LRX_PROCESSOR_TAG_ANY_TAG        = "<ANY_TAG>"_u;    // cached as any_tag; used when alphabet(tag) yields 0
+UString const LRXProcessor::LRX_PROCESSOR_TAG_ANY_UPPER      = "<ANY_UPPER>"_u;  // cached as any_upper; alternative when u_isupper() holds for the character
+UString const LRXProcessor::LRX_PROCESSOR_TAG_ANY_LOWER      = "<ANY_LOWER>"_u;  // cached as any_lower; alternative for every other character
+UString const LRXProcessor::LRX_PROCESSOR_TAG_WORD_BOUNDARY  = "<$>"_u;          // cached as word_boundary; stepped in process() when '$' or EOF ends a lexical unit
 
-wstring
+UString
 LRXProcessor::itow(int i)
 {
-  // Convert an int to a wstring
-  wchar_t buf[50];
-  memset(buf, '\0', sizeof(buf));
-  swprintf(buf, 50, L"%d", i);
-  wstring id(buf);
+  // Convert an int to a UString holding its "%d" (decimal) rendering
+  UChar buf[50];  // scratch space for the formatted digits
+  u_snprintf(buf, 50, "%d", i);
+  UString id(buf);  // pointer constructor: copies buf up to its NUL terminator, so it relies on u_snprintf having written one
   return id;
 }
 
@@ -77,39 +84,31 @@ void
 LRXProcessor::load(FILE *in)
 {
   alphabet.read(in);
+  any_char      = alphabet(LRX_PROCESSOR_TAG_ANY_CHAR);
+  any_tag       = alphabet(LRX_PROCESSOR_TAG_ANY_TAG);
+  any_upper     = alphabet(LRX_PROCESSOR_TAG_ANY_UPPER);
+  any_lower     = alphabet(LRX_PROCESSOR_TAG_ANY_LOWER);
+  word_boundary = alphabet(LRX_PROCESSOR_TAG_WORD_BOUNDARY);
 
   int len = Compression::multibyte_read(in);
 
   while(len > 0)
   {
-    int len2 = Compression::multibyte_read(in);
-    wstring name = L"";
-    while(len2 > 0)
-    {
-      name += static_cast<wchar_t>(Compression::multibyte_read(in));
-      len2--;
-    }
+    UString name = Compression::string_read(in);
     recognisers[name].read(in, alphabet);
     if(debugMode)
     {
-      fwprintf(stderr, L"Recogniser: %S, [finals: %d]\n", name.c_str(), recognisers[name].getFinals().size());
+      cerr << "Recogniser: " << name << ", [finals: " << recognisers[name].getFinals().size() << "]\n";
     }
     len--;
   }
 
   if(debugMode)
   {
-    fwprintf(stderr, L"recognisers: %d\n", recognisers.size());
+    cerr << "recognisers: " << recognisers.size() << endl;
   }
 
-  int len3 = Compression::multibyte_read(in);
-
-  wstring name = L"";
-  while(len3 > 0)
-  {
-    name += static_cast<wchar_t>(Compression::multibyte_read(in));
-    len3--;
-  }
+  UString name = Compression::string_read(in);
 
   transducer.read(in, alphabet);
 
@@ -118,13 +117,15 @@ LRXProcessor::load(FILE *in)
   while(fread(&record, sizeof(weight), 1, in))
   {
     weight_from_le(record);
-    wstring sid = L"<" + itow(record.id) + L">";
+    UString sid = "<"_u + itow(record.id) + ">"_u;
     weights[sid] = record.pisu;
 
+    /*
     if(debugMode)
     {
-      //fwprintf(stderr, L"%S %d weight(%.4f)\n", sid.c_str(), record.id, record.pisu);
+      cerr << sid << " " << record.id << " weight(" << record.pisu << ")\n";
     }
+    */
   }
 
   return;
@@ -137,42 +138,26 @@ LRXProcessor::init()
 
   anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end());
 
-  escaped_chars.insert(L'[');
-  escaped_chars.insert(L']');
-  escaped_chars.insert(L'{');
-  escaped_chars.insert(L'}');
-  escaped_chars.insert(L'^');
-  escaped_chars.insert(L'$');
-  escaped_chars.insert(L'/');
-  escaped_chars.insert(L'\\');
-  escaped_chars.insert(L'@');
-  escaped_chars.insert(L'<');
-  escaped_chars.insert(L'>');
-
-}
-
-wstring
-LRXProcessor::readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2)
-{
-  wstring result = L"";
-  result += delim1;
-  wchar_t c = delim1;
+  escaped_chars.insert('[');
+  escaped_chars.insert(']');
+  escaped_chars.insert('{');
+  escaped_chars.insert('}');
+  escaped_chars.insert('^');
+  escaped_chars.insert('$');
+  escaped_chars.insert('/');
+  escaped_chars.insert('\\');
+  escaped_chars.insert('@');
+  escaped_chars.insert('<');
+  escaped_chars.insert('>');
 
-  while(!feof(input) && c != delim2)
-  {
-    c = static_cast<wchar_t>(fgetwc_unlocked(input));
-    result += c;
-  }
-
-  return result;
 }
 
 bool
-LRXProcessor::recognisePattern(const wstring lu, const wstring op)
+LRXProcessor::recognisePattern(const UString lu, const UString op)
 {
   if(recognisers.count(op) < 1)
   {
-    fwprintf(stderr, L"WARNING: Recogniser not found for key %S, skipping... [LU: %S]\n", op.c_str(), lu.c_str());
+    cerr << "WARNING: Recogniser not found for key " << op << ", skipping... [LU: " << lu << "]" << endl;
     return false;
   }
 
@@ -184,14 +169,14 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op)
   end_states.insert(recognisers[op].getFinals().begin(), recognisers[op].getFinals().end());
 
   bool readingTag = false;
-  wstring tag = L"";
+  UString tag;
   int val = 0;
   for(auto& it : lu)
   {
 /*
     if(debugMode)
     {
-      fwprintf(stderr, L"alive: %d\n", cur.size());
+      cerr << "alive: " << cur.size() << endl;
     }
 */
     if(cur.size() < 1)  // I think that any time we have 0 alive states,
@@ -199,35 +184,35 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op)
     {
       return false;
     }
-    if(it == L'<')
+    if(it == '<')
     {
-      tag = L"";
+      tag.clear();
       readingTag = true;
-      tag = tag + it;
+      tag += it;
       continue;
     }
-    if(it == L'>')
+    if(it == '>')
     {
       tag = tag + it;
-      val = static_cast<int>(alphabet(tag));
+      val = alphabet(tag);
       if(val == 0)
       {
-        val = static_cast<int>(alphabet(L"<ANY_TAG>"));
+        val = any_tag;
       }
 /*
       if(debugMode)
       {
-        fwprintf(stderr, L":: tag %S: %d\n", tag.c_str(), val);
-        fwprintf(stderr, L"  step: %S\n", tag.c_str());
+        cerr << ":: tag " << tag << ": " << val << endl;
+        cerr << "  step: " << tag << endl;
       }
 */
-      cur.step(val, alphabet(L"<ANY_TAG>"));
+      cur.step(val, any_tag);
       readingTag = false;
       continue;
     }
     if(readingTag)
     {
-      tag = tag + it;
+      tag += it;
     }
     else
     {
@@ -236,22 +221,21 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op)
 /*
       if(debugMode)
       {
-        fwprintf(stderr, L"  step: %C\n", val);
+        cerr << "  step: " << val << endl;
       }
 */
-      //cur.step(val, a(L"<ANY_CHAR>"));
+      //cur.step(val, a("<ANY_CHAR>"));
       //cur.step(val);
       set<int> alts;
-      if(!iswupper(val))
+      alts.insert(any_char);
+      if(!u_isupper(val))
       {
-        alts.insert(alphabet(L"<ANY_CHAR>"));
-        alts.insert(alphabet(L"<ANY_LOWER>"));
+        alts.insert(any_lower);
       }
       else
       {
-        alts.insert(alphabet(L"<ANY_CHAR>"));
-        alts.insert(alphabet(L"<ANY_UPPER>"));
-        alts.insert(towlower(val));
+        alts.insert(any_upper);
+        alts.insert(u_tolower(val));
       }
       cur.step(val, alts);
 
@@ -261,7 +245,7 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op)
 /*
   if(debugMode)
   {
-    fwprintf(stderr, L">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n");
+    cerr << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n";
   }
 */
   if(cur.isFinal(end_states))
@@ -272,541 +256,29 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op)
   return false;
 }
 
-/*
-void
-LRXProcessor::processFlush(FILE *output,
-                           map<int, wstring > &sl,
-                           map<int, vector<wstring> > &tl,
-                           map<int, wstring > &blanks,
-                           map<int, pair<double, vector<State> > > &covers,
-                           pair<double, vector<State> > &empty_seq,
-                           map<pair<int, int>, vector<State> > &spans,
-                           int last_final)
-{
-  if(debugMode)
-  {
-    fwprintf(stderr, L"FLUSH:\n");
-  }
-
-  map<int, pair<double, vector<State> > >::iterator it;
-  map<int, pair<wstring, wstring> > operations;
-
-  for(it = covers.begin(); it != covers.end(); it++)
-  {
-    pair<double, vector<State> > best = it->second;
-    if(debugMode)
-    {
-      fwprintf(stderr, L"===================================================\n");
-      fwprintf(stderr, L"[%d][%d] covers[%d] best (score: %d, size: %d)\n", pos, last_final, it->first, best.first, best.second.size());
-    }
-
-    // return M[i-1]
-    if(it->first == last_final)
-    {
-      vector<State>::iterator it2;
-      for(it2 = best.second.begin(); it2 != best.second.end(); it2++)
-      {
-        if(debugMode)
-        {
-          wstring out = it2->filterFinals(anfinals, alphabet, escaped_chars);
-          fwprintf(stderr, L"!!!    filter_finals: %S\n", out.c_str());
-        }
-        set<pair<wstring, vector<wstring> > > outpaths;
-        outpaths = it2->filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0);
-
-        int j = 1;
-        set<pair<wstring, vector<wstring> > >::iterator it3;
-        for(it3 = outpaths.begin(); it3 != outpaths.end(); it3++)
-        {
-          wstring id = it3->first;
-          vector<wstring> ops = it3->second;
-          vector<wstring>::iterator op;
-          for(op = ops.begin(); op != ops.end(); op++)
-          {
-            if(*op != LRX_PROCESSOR_TAG_SKIP)
-            {
-              int starting_point = -1;
-              map<pair<int, int>, vector<State> >::iterator ix;
-              for(ix = spans.begin(); ix != spans.end(); ix++)
-              {
-                vector<State>::iterator iy;
-                for(iy = ix->second.begin(); iy != ix->second.end(); iy++)
-                {
-                  set<pair<wstring, vector<wstring> > > y;
-                  y = iy->filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0);
-                  if(y == outpaths)
-                  {
-                    starting_point = ix->first.first;
-                  }
-                }
-              }
-              if(debugMode)
-              {
-                fwprintf(stderr, L"=> APPLY [pos: %d, dep: %d, j: %d, start: %d, len: %d]: %S // %S\n", pos, starting_point, j, starting_point+j, ops.size(), id.c_str(), op->c_str());
-              }
-              operations[starting_point+j].first = id;
-              operations[starting_point+j].second = *op;
-            }
-            j++;
-          }
-        }
-        if(debugMode)
-        {
-          fwprintf(stderr, L"[best: %d, outpaths: %d]\n", best.first, outpaths.size());
-        }
-      }
-    }
-  }
-
-  covers.clear();
-  covers[-1] = empty_seq;
-  covers[-1].first = 0;
-
-  // Here we actually apply the rules that we've matched
-
-  unsigned int spos = 0;
-  for(spos = 0; spos <= pos; spos++)
-  {
-    if(sl[spos] == L"")
-    {
-      continue;
-    }
-    wstring  op = operations[spos].second;
-    wstring  tipus = L"";
-    if(op.find(LRX_PROCESSOR_TAG_SELECT) != wstring::npos)
-    {
-      tipus = LRX_PROCESSOR_TAG_SELECT;
-    }
-    if(op.find(LRX_PROCESSOR_TAG_REMOVE) != wstring::npos)
-    {
-      tipus = LRX_PROCESSOR_TAG_REMOVE;
-    }
-    if(debugMode)
-    {
-      fwprintf(stderr, L"#APPL%S. %S\n", tipus.c_str(), op.c_str());
-    }
-
-    fwprintf(output, L"%S^%S/", blanks[spos].c_str(), sl[spos].c_str());
-
-    vector<wstring>::iterator ti;
-    vector<wstring>::iterator penum = tl[spos].end(); penum--;
-
-    if(tipus == LRX_PROCESSOR_TAG_SELECT && tl[spos].size() > 1)
-    {
-      bool matched = true;
-      bool selected = false;
-      for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
-      {
-        matched = recognisePattern(*ti, op);
-        if(matched)
-        {
-          if(traceMode || debugMode)
-          {
-            fwprintf(stderr, L"%d:SELECT%S:%S:%S\n", lineno, operations[spos].first.c_str(), sl[spos].c_str(), op.c_str());
-          }
-          fwprintf(output, L"%S", ti->c_str());
-          selected = true;
-          break;
-        }
-      }
-      if(!selected)
-      {
-        for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
-        {
-          fwprintf(output, L"%S", ti->c_str());
-          if(ti != penum)
-          {
-            fwprintf(output, L"/");
-          }
-        }
-      }
-    }
-    else if(tipus == LRX_PROCESSOR_TAG_REMOVE && tl[spos].size() > 1)
-    {
-      bool matched = true;
-      vector<wstring> new_tl;  // The new list of TL translations
-      for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
-      {
-        matched = recognisePattern(*ti, op);
-        if(matched)
-        {
-          if(traceMode || debugMode)
-          {
-            fwprintf(stderr, L"%d:REMOVE%S:%S:%S\n", lineno, operations[spos].first.c_str(), sl[spos].c_str(), op.c_str());
-          }
-          continue;
-        }
-        new_tl.push_back(*ti);
-      }
-      vector<wstring>::iterator nti;
-      vector<wstring>::iterator npenum = new_tl.end(); npenum--;
-      for(nti = new_tl.begin(); nti != new_tl.end(); nti++)
-      {
-        fwprintf(output, L"%S", nti->c_str());
-        if(nti != npenum)
-        {
-          fwprintf(output, L"/");
-        }
-      }
-      new_tl.clear();
-    }
-    else
-    {
-      for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
-      {
-        fwprintf(output, L"%S", ti->c_str());
-        if(ti != penum)
-        {
-          fwprintf(output, L"/");
-        }
-      }
-    }
-    fwprintf(output, L"$");
-    if(debugMode)
-    {
-      fwprintf(output, L"%d", spos);
-    }
-  }
-}
-*/
-
-/*
 void
-LRXProcessor::process(FILE *input, FILE *output)
+LRXProcessor::process(InputFile& input, UFILE *output)
 {
   bool isEscaped = false;
 
-  map<int, wstring > sl; // map of SL words
-  map<int, vector<wstring> > tl; // map of vectors of TL translations
-  map<int, wstring > blanks; // map of the superblanks
-
-  map<int, pair<double, vector<State> > > covers ;
-  pair<double, vector<State> > empty_seq;
-  map<pair<int, int>, vector<State> > spans ;
-
-  covers[-1] = empty_seq;
-  covers[-1].first = 1.0;
-
-  vector<State> alive_states_clean ;
-  vector<State> alive_states = alive_states_clean ;
-  alive_states.push_back(*initial_state);
-  vector<State> new_states;
-
-  int last_final = -1; // check what we actually use this for
-
-  while(!feof(input))
-  {
-    int val = fgetwc_unlocked(input);
-
-    if(nullFlush && val == L'\0')
-    {
-      processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final);
-      fwprintf(output, L"%S", blanks[pos].c_str());
-      pos = 0;
-      last_final = 0;
-      tl.clear();
-      sl.clear();
-      blanks.clear();
-      spans.clear();
-
-      fputwc_unlocked(val, output);
-      fflush(output);
-      continue;
-    }
-
-    // We're starting to read a new lexical form
-    if(val == L'^' && !isEscaped && outOfWord)
-    {
-      outOfWord = false;
-      continue;
-    }
+  map<int, UString > sl; // map of SL words
+  map<int, vector<UString> > tl; // map of vectors of TL translations
+  map<int, UString > blanks; // map of the superblanks
 
-    // We've seen the surface form
-    if(val == L'/' && !isEscaped && !outOfWord)
-    {
-      // Read in target equivalences
-      wstring trad = L"";
-      val = fgetwc_unlocked(input);
-      while(val != L'$')
-      {
-        if(val != L'$')
-        {
-          trad += static_cast<wchar_t>(val);
-        }
-        if(val == L'/')
-        {
-          tl[pos].push_back(trad.substr(0, trad.length()-1));
-          trad = L"";
-        }
-        val = fgetwc_unlocked(input);
-      }
-      tl[pos].push_back(trad);
-
-      if(debugMode)
-      {
-        for(vector<wstring>::iterator it = tl[pos].begin(); it != tl[pos].end(); it++)
-        {
-          fwprintf(stderr, L"trad[%d]: %S\n", pos, it->c_str());
-        }
-      }
-    }
-
-    // We've finished reading a lexical form
-    if((feof(input) || val == L'$') && !isEscaped && !outOfWord)
-    {
-      if(debugMode)
-      {
-        fwprintf(stderr, L"[POS] %d: [sl %d ; tl %d ; bl %d]\n", pos, sl[pos].size(), tl[pos].size(), blanks[pos].size());
-      }
-
-      new_states.clear(); // alive_states_new
-      pair<double, vector<State> > new_best_cover;
-      new_best_cover.first = -numeric_limits<int>::max();
-
-      vector<int> matched_rules;
-
-      // \forall s \in A
-      for(vector<State>::const_iterator it = alive_states.begin(); it != alive_states.end(); it++)
-      {
-        State s = *it;
-        // \IF \exists c \in Q : \delta(s, sent[i]) = c
-        s.step(alphabet(L"<$>"));
-
-        // A \gets A \cup {c}
-        if(s.size() > 0) // If the current state has outgoing transitions,
-                         // add it to the new alive states
-        {
-          new_states.push_back(s);
-        }
-        s.step(alphabet(L"<$>"));
-
-        // \IF c \in F
-        if(s.isFinal(anfinals))
-        {
-          // We've reached a final state, so we need to evaluate the rule we've matched
-          if(debugMode)
-          {
-            wstring out = s.filterFinals(anfinals, alphabet, escaped_chars);
-            fwprintf(stderr, L"    filter_finals: %S\n", out.c_str());
-          }
-
-          set<pair<wstring, vector<wstring> > > outpaths;
-          outpaths = s.filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0);
-
-          set<pair<wstring, vector<wstring> > >::iterator it;
-          for(it = outpaths.begin(); it != outpaths.end(); it++)
-          {
-            vector<State> reached;
-
-            vector<wstring> path = (*it).second;
-            wstring id = (*it).first;
-
-            if(debugMode)
-            {
-              fwprintf(stderr, L"id:      %S:\n", id.c_str());
-              for(vector<wstring>::iterator it2 = path.begin(); it2 != path.end(); it2++)
-              {
-                fwprintf(stderr, L"op:        %S\n", it2->c_str());
-              }
-              fwprintf(stderr, L"#SPAN[%d, %d]\n", (pos-path.size()), pos);
-            }
-
-            spans[make_pair((pos-path.size()), pos)].push_back(s);
-
-            // M[i-ChunkLength(c)]
-            pair<double, vector<State> > newseq = covers[(pos - path.size())];
-            newseq.first = newseq.first + path.size() ;
-
-            if(newseq.first > new_best_cover.first)
-            {
-              State new_state;
-              new_state = s;
-              reached.push_back(new_state);
-              map<int, pair<double, vector<State> > >::iterator k;
-              for(k = covers.begin(); k != covers.end(); k++)
-              {
-                vector<State>::iterator l;
-                pair<double, vector<State> > p = k->second;
-                for(l = p.second.begin(); l != p.second.end(); l++)
-                {
-                  if(debugMode)
-                  {
-                    fwprintf(stderr, L"= [cov: %d][len: %d][pos: %d][pat: %d] INCLUDE FINALS?\n", k->first, p.first, pos, path.size());
-                  }
-                  if(k->first <= (pos - path.size()))
-                  {
-                    if(debugMode)
-                    {
-                      wstring out2 = l->filterFinals(anfinals, alphabet, escaped_chars);
-                      fwprintf(stderr, L"    == INCLUDE FINALS: %S\n", out2.c_str());
-                    }
-                    reached.push_back(*l);
-                  }
-                }
-              }
-              newseq.second = reached;
-              new_best_cover = newseq;
-              covers[pos] = newseq;
-              if(debugMode)
-              {
-                fwprintf(stderr, L"++ FINALS(%d) covers[%d] [%d, %d] BEST: %.4f > %.4f\n", newseq.second.size(), (pos - path.size()), pos, path.size(), newseq.first, new_best_cover.first);
-              }
-            }
-
-            last_final = pos;
-          }
-        }
-      }
-
-      alive_states.swap(new_states);
-      alive_states.push_back(*initial_state);
-
-      if(debugMode)
-      {
-        fwprintf(stderr, L"#CURRENT_ALIVE: %d\n", alive_states.size());
-      }
-
-      if(alive_states.size() == 1)
-      {
-        // If we have only a single alive state, it means no rules are
-        // active, and we can flush the buffers.
-        processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final);
-
-        pos = 0;
-        last_final = 0;
-        tl.clear();
-        sl.clear();
-        blanks.clear();
-        spans.clear();
-      }
-
-      pos++;
-      if(debugMode)
-      {
-        fwprintf(stderr, L"==> new pos: %d\n", pos);
-      }
-
-      outOfWord = true;
-      continue;
-    }
-
-
-    // We're reading a tag
-    if(val == L'<' && !isEscaped && !outOfWord)
-    {
-      wstring tag = L"";
-      tag = readFullBlock(input, L'<', L'>');
-      sl[pos] = sl[pos] + tag;
-      val = static_cast<int>(alphabet(tag));
-      if(val == 0)
-      {
-        val = static_cast<int>(alphabet(L"<ANY_TAG>"));
-      }
-      if(debugMode)
-      {
-        fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val);
-      }
-    }
-
-    if(!outOfWord)
-    {
-      if(debugMode)
-      {
-        fwprintf(stderr, L"outOfWord = false\n");
-      }
-
-      new_states.clear();
-      wstring res = L"";
-      for(vector<State>::const_iterator it = alive_states.begin(); it != alive_states.end(); it++)
-      {
-        res = L"";
-        State s = *it;
-        if(val < 0)
-        {
-          alphabet.getSymbol(res, val,  false);
-          if(debugMode)
-          {
-            fwprintf(stderr, L"  step: %S\n", res.c_str());
-          }
-          s.step(val, alphabet(L"<ANY_TAG>"));
-        }
-        else
-        {
-          if(debugMode)
-          {
-            fwprintf(stderr, L"  step: %C\n", val);
-          }
-          s.step_case(val, alphabet(L"<ANY_CHAR>"), false);
-        }
-        if(s.size() > 0) // If the current state has outgoing transitions, add it to the new alive states
-        {
-          new_states.push_back(s);
-        }
-      }
-      if(debugMode)
-      {
-        fwprintf(stderr, L"new_states: %d\n", new_states.size());
-      }
-      alive_states.swap(new_states);
-      alive_states.push_back(*initial_state);
-
-    }
-
-    // We're still reading a surface form
-    if(val > 0 && val != L'$' && !isEscaped && !outOfWord)
-    {
-      sl[pos] = sl[pos] + static_cast<wchar_t>(val);
-    }
-
-    // Reading a superblank
-    if(outOfWord)
-    {
-      if(!feof(input))
-      {
-        blanks[pos] = blanks[pos] + static_cast<wchar_t>(val);
-      }
-      if(debugMode)
-      {
-        //fwprintf(stderr, L"blanks[%d] = %S\n", pos, blanks[pos].c_str());
-      }
-    }
-
-    // Increment the current line number (for rule tracing)
-    if(val == L'\n')
-    {
-      lineno++;
-    }
-  }
-
-  processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final);
-
-  fwprintf(output, L"%S", blanks[pos].c_str());
-}
-*/
-
-void
-LRXProcessor::process(FILE *input, FILE *output)
-{
-  bool isEscaped = false;
-
-  map<int, wstring > sl; // map of SL words
-  map<int, vector<wstring> > tl; // map of vectors of TL translations
-  map<int, wstring > blanks; // map of the superblanks
-
-  map<int, map<wstring, double> > scores; //
-  map<int, map<wstring, OpType> > operations;
+  map<int, map<UString, double> > scores; //
+  map<int, map<UString, OpType> > operations;
 
   vector<State*> alive_states ;
   alive_states.push_back(new State(*initial_state));
 
-  int val = 0;
-  while((val = fgetwc_unlocked(input)) != EOF && val != WEOF)
+  int32_t val = 0;
+  while((val = input.get()) != U_EOF)
   {
 
-    if(nullFlush && val == L'\0')
+    if(nullFlush && val == '\0')
     {
       processFlush(output, sl, tl, blanks, scores, operations);
-      fwprintf(output, L"%S", blanks[pos].c_str());
+      u_fprintf(output, "%S", blanks[pos].c_str());
       pos = 0;
       tl.clear();
       sl.clear();
@@ -816,63 +288,62 @@ LRXProcessor::process(FILE *input, FILE *output)
       alive_states.clear();
       alive_states.push_back(new State(*initial_state));
 
-      fputwc_unlocked(val, output);
-      fflush(output);
+      u_fputc(val, output);
+      u_fflush(output);
       continue;
     }
 
     // We're starting to read a new lexical form
-    if(val == L'^' && !isEscaped && outOfWord)
+    if(val == '^' && !isEscaped && outOfWord)
     {
       outOfWord = false;
       continue;
     }
 
     // We've seen the surface form
-    if(val == L'/' && !isEscaped && !outOfWord)
+    if(val == '/' && !isEscaped && !outOfWord)
     {
       // Read in target equivalences
-      wstring trad = L"";
-      val = fgetwc_unlocked(input);
-      while(val != L'$' && val != EOF && val != WEOF)
+      UString trad;
+      val = input.get();
+      while(val != '$' && val != U_EOF)
       {
-        if(val != L'$')
+        if(val != '$')
         {
-          trad += static_cast<wchar_t>(val);
+          trad += val;
         }
-        if(val == L'/')
+        if(val == '/')
         {
           tl[pos].push_back(trad.substr(0, trad.length()-1));
-          trad = L"";
+          trad.clear();
         }
-        val = fgetwc_unlocked(input);
+        val = input.get();
       }
       tl[pos].push_back(trad);
 
       if(debugMode)
       {
-        for(auto& it : tl[pos])
-        {
-          fwprintf(stderr, L"trad[%d]: %S\n", pos, it.c_str());
+        for(auto& it : tl[pos]) {
+          cerr << "trad[" << pos << "]: " << it << endl;
         }
       }
     }
 
-    if((feof(input) || val == L'$') && !isEscaped && !outOfWord)
+    if((input.eof() || val == '$') && !isEscaped && !outOfWord)
     {
       if(debugMode)
       {
-        fwprintf(stderr, L"[POS] %d: [sl %d ; tl %d ; bl %d]: %S\n", pos, sl[pos].size(), tl[pos].size(), blanks[pos].size(), sl[pos].c_str());
+        cerr << "[POS] " << pos << ": [sl " << sl[pos].size() << " ; tl " << tl[pos].size() << " ; bl " << blanks[pos].size() << "]: " << sl[pos] << endl;
       }
       {
         vector<State *> new_states; // TODO: Can we avoid the State-copying here?
         // \forall s \in A
-        set<wstring> seen_ids;
+        set<UString> seen_ids;
         for(auto& it : alive_states)
         {
           State s = *it;
           // \IF \exists c \in Q : \delta(s, sent[i]) = c
-          s.step(alphabet(L"<$>"));
+          s.step(word_boundary);
 
           // A \gets A \cup {c}
           if (s.size() > 0) // If the current state has outgoing transitions,
@@ -880,7 +351,7 @@ LRXProcessor::process(FILE *input, FILE *output)
           {
             new_states.push_back(new State(s));
           }
-          s.step(alphabet(L"<$>"));
+          s.step(word_boundary);
 
           // \IF c \in F
           if (s.isFinal(anfinals))
@@ -888,18 +359,18 @@ LRXProcessor::process(FILE *input, FILE *output)
             // We've reached a final state, so we need to evaluate the rule we've matched
             if (debugMode)
             {
-              wstring out = s.filterFinals(anfinals, alphabet, escaped_chars);
-              fwprintf(stderr, L"    filter_finals: %S\n", out.c_str());
+              UString out = s.filterFinals(anfinals, alphabet, escaped_chars);
+              cerr << "    filter_finals: " << out << endl;
             }
 
-            set<pair<wstring, vector<wstring>>> outpaths;
+            set<pair<UString, vector<UString>>> outpaths;
             outpaths = s.filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0);
 
             for (auto& it : outpaths)
             {
               vector<State> reached;
-              vector<wstring> path = it.second;
-              wstring id = it.first;
+              vector<UString> path = it.second;
+              UString id = it.first;
 
               if (seen_ids.find(id) != seen_ids.end())
               {
@@ -911,13 +382,14 @@ LRXProcessor::process(FILE *input, FILE *output)
 
               if (debugMode)
               {
-                fwprintf(stderr, L"id:      %S: (lambda: %.5f)\n", id.c_str(), weights[id.c_str()]);
+                cerr << "id:      " << id << ": (lambda: ";
+                cerr << weights[id] << ")\n";
               }
               for (auto& it2 : path)
               {
                 if (debugMode)
                 {
-                  fwprintf(stderr, L"op:        %S\n", it2.c_str());
+                  cerr << "op:        " << it2 << endl;
                 }
                 if (it2 != LRX_PROCESSOR_TAG_SKIP)
                 {
@@ -928,9 +400,10 @@ LRXProcessor::process(FILE *input, FILE *output)
                   scores[j][it2] += weights[id.c_str()];
                   if (debugMode)
                   {
-                    fwprintf(stderr, L"#[%d]SCORE %.5f / %S\n", j, scores[j][it2], it2.c_str());
+                    cerr << "#[" << j << "]SCORE " << scores[j][it2] << " / ";
+                    cerr << it2 << endl;
                   }
-                  if(it2.at(0) == L'<' && it2.at(1) == L'r') {
+                  if(it2.at(0) == '<' && it2.at(1) == 'r') {
                     operations[j][it2] = Remove;
                   }
                   else {
@@ -939,7 +412,7 @@ LRXProcessor::process(FILE *input, FILE *output)
                 }
                 j++;
               }
-              // fwprintf(stderr, L"#SPAN[%d, %d]\n", (pos-path.size()), pos);
+              // cerr << "#SPAN[" << (pos-path.size()) << ", " << pos << "]\n";
             }
           }
         }
@@ -953,13 +426,12 @@ LRXProcessor::process(FILE *input, FILE *output)
 
         if (debugMode)
         {
-          fwprintf(stderr, L"seen:");
-          for (auto& it : seen_ids)
-          {
-            fwprintf(stderr, L" %S ", it.c_str());
+          cerr << "seen:";
+          for (auto& it : seen_ids) {
+            cerr << " " << it << " ";
           }
-          fwprintf(stderr, L"\n");
-          fwprintf(stderr, L"#CURRENT_ALIVE: %d\n", alive_states.size());
+          cerr << endl;
+          cerr << "#CURRENT_ALIVE: " << alive_states.size() << endl;
         }
       }
 
@@ -970,7 +442,7 @@ LRXProcessor::process(FILE *input, FILE *output)
 
         if(debugMode)
         {
-          fwprintf(stderr, L"FLUSH:\n");
+          cerr << "FLUSH:" << endl;
         }
 
 
@@ -988,7 +460,7 @@ LRXProcessor::process(FILE *input, FILE *output)
       pos++;
       if(debugMode)
       {
-        fwprintf(stderr, L"==> new pos: %d\n", pos);
+        cerr << "==> new pos: " << pos << endl;
       }
 
       outOfWord = true;
@@ -996,19 +468,17 @@ LRXProcessor::process(FILE *input, FILE *output)
     }
 
     // We're reading a tag
-    if(val == L'<' && !isEscaped && !outOfWord)
+    if(val == '<' && !isEscaped && !outOfWord)
     {
-      wstring tag = L"";
-      tag = readFullBlock(input, L'<', L'>');
+      UString tag = input.readBlock('<', '>');
       sl[pos] = sl[pos] + tag;
-      val = static_cast<int>(alphabet(tag));
-      if(val == 0)
-      {
-        val = static_cast<int>(alphabet(L"<ANY_TAG>"));
+      val = alphabet(tag);
+      if (val == 0) {
+        val = any_tag;
       }
       if(debugMode)
       {
-        fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val);
+        cerr << "tag " << tag << ": " << val << "\n";
       }
     }
 
@@ -1016,39 +486,39 @@ LRXProcessor::process(FILE *input, FILE *output)
     {
       if(debugMode)
       {
-        fwprintf(stderr, L"outOfWord = false\n");
+        cerr << "outOfWord = false\n";
       }
 
-      wstring res = L"";
+      UString res;
       for(auto& s : alive_states)
       {
-        res = L"";
+        res.clear();
         if(val < 0)
         {
           alphabet.getSymbol(res, val,  false);
           if(debugMode)
           {
-            fwprintf(stderr, L"  step: %S\n", res.c_str());
+            cerr << "  step: " << res << endl;
           }
-          s->step(val, alphabet(L"<ANY_TAG>"));
+          s->step(val, any_tag);
         }
         else
         {
 
           set<int> alts;
-          alts.insert(alphabet(L"<ANY_CHAR>"));
-          if(iswupper(val))
+          alts.insert(any_char);
+          if(u_isupper(val))
           {
-            alts.insert(towlower(val));
-            alts.insert(alphabet(L"<ANY_UPPER>"));
+            alts.insert(u_tolower(val));
+            alts.insert(any_upper);
           }
           else
           {
-            alts.insert(alphabet(L"<ANY_LOWER>"));
+            alts.insert(any_lower);
           }
           if(debugMode)
           {
-            fwprintf(stderr, L"  step: %C [alts: %d]\n", val, alts.size());
+            cerr << "  step: " << val << " [alts: " << alts.size() << "]\n";
           }
           s->step(val, alts);
         }
@@ -1057,26 +527,28 @@ LRXProcessor::process(FILE *input, FILE *output)
     }
 
     // We're still reading a surface form
-    if(val > 0 && val != L'$' && !isEscaped && !outOfWord)
+    if(val > 0 && val != '$' && !isEscaped && !outOfWord)
     {
-      sl[pos] = sl[pos] + static_cast<wchar_t>(val);
+      sl[pos] += val;
     }
 
     // Reading a superblank
     if(outOfWord)
     {
-      if(!feof(input))
+      if(!input.eof())
       {
-        blanks[pos] = blanks[pos] + static_cast<wchar_t>(val);
+        blanks[pos] += val;
       }
+      /*
       if(debugMode)
       {
-        //fwprintf(stderr, L"blanks[%d] = %S\n", pos, blanks[pos].c_str());
+        cerr << "blanks[" << pos << "] = " << blanks[pos] << endl;
       }
+      */
     }
 
     // Increment the current line number (for rule tracing)
-    if(val == L'\n')
+    if(val == '\n')
     {
       lineno++;
     }
@@ -1084,42 +556,42 @@ LRXProcessor::process(FILE *input, FILE *output)
   }
 
   processFlush(output, sl, tl, blanks, scores, operations);
-  fwprintf(output, L"%S", blanks[pos].c_str());
+  write(blanks[pos], output);
 }
 
 void
-LRXProcessor::processFlush(FILE *output,
-                             map<int, wstring > &sl,
-                             map<int, vector<wstring> > &tl,
-                             map<int, wstring > &blanks,
-                             map<int, map<wstring, double> > &scores,
-                             map<int, map<wstring, OpType> > &operations) {
+LRXProcessor::processFlush(UFILE *output,
+                           map<int, UString > &sl,
+                           map<int, vector<UString> > &tl,
+                           map<int, UString > &blanks,
+                           map<int, map<UString, double> > &scores,
+                           map<int, map<UString, OpType> > &operations) {
 
   struct ScoredMatch {
       OpType op;
-      wstring* ti;              // matched target translation
+      UString* ti;              // matched target translation
       double weight;
   };
 
   unsigned int spos = 0;
   for(spos = 0; spos <= pos; spos++)
   {
-    if(sl[spos] == L"")
+    if(sl[spos].empty())
     {
       continue;
     }
 
-    fwprintf(output, L"%S^%S/", blanks[spos].c_str(), sl[spos].c_str());
+    u_fprintf(output, "%S^%S/", blanks[spos].c_str(), sl[spos].c_str());
 
-    vector<wstring>::iterator ti;
+    vector<UString>::iterator ti;
     auto penum = tl[spos].end();
     penum--;
 
     if(tl[spos].size() > 1)
     {
       //--
-      set<wstring*> ti_keep;
-      set<wstring*> ti_removed;
+      set<UString*> ti_keep;
+      set<UString*> ti_removed;
       vector<ScoredMatch> spos_matches;
       for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
       {
@@ -1128,9 +600,13 @@ LRXProcessor::processFlush(FILE *output,
           bool matched = recognisePattern(*ti, si.first);
           OpType op = operations[spos][si.first];
           if (debugMode) {
-            wstring checks = matched ? L"✔️ " : L"❎";
-            fwprintf(stderr, L"%S >>> %d -> %S -> %.5f\n", checks.c_str(), spos,
-                     si.first.c_str(), si.second);
+            if (matched) {
+              cerr << "✔️ ";
+            } else {
+              cerr << "❎";
+            }
+            cerr << " >>> " << spos << " -> ";
+            cerr << si.first << " -> " << si.second << endl;
           }
           if(matched) {
             spos_matches.push_back({ op, &*ti, si.second });
@@ -1144,15 +620,10 @@ LRXProcessor::processFlush(FILE *output,
              [](const auto &a, const auto &b) { return a.weight > b.weight; });
         for (const auto &m : spos_matches) {
           if (traceMode || debugMode) {
-            wstring op = (m.op == Select ? L"SELECT" : L"REMOVE");
-            fwprintf(
-                stderr, L"%d:%S:%.5f:%S:%d:%S\n",
-                lineno,
-                op.c_str(),
-                m.weight,
-                sl[spos].c_str(),
-                ti_keep.size(),
-                m.ti->c_str());
+            std::string op = (m.op == Select ? "SELECT" : "REMOVE");
+            cerr << lineno << ":" << op << ":" << m.weight;
+            cerr << ":" << sl[spos] << ":" << ti_keep.size();
+            cerr << ":" << *m.ti << endl;  // m.ti is a UString*: print the text, not the address
           }
           // We have to keep track of translations that have been removed so
           // that we don't end up adding back a translation that was removed.
@@ -1168,9 +639,9 @@ LRXProcessor::processFlush(FILE *output,
         bool printed = false;
         for(const auto& ti_max : ti_keep) {
           if(printed) {
-            fwprintf(output, L"/");
+            u_fprintf(output, "/");
           }
-          fwprintf(output, L"%S", ti_max->c_str());
+          u_fprintf(output, "%S", ti_max->c_str());
           printed = true;
         }
       }
@@ -1178,10 +649,10 @@ LRXProcessor::processFlush(FILE *output,
       {
         for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
         {
-          fwprintf(output, L"%S", ti->c_str());
+          u_fprintf(output, "%S", ti->c_str());
           if(ti != penum)
           {
-            fwprintf(output, L"/");
+            u_fprintf(output, "/");
           }
         }
       }
@@ -1190,18 +661,18 @@ LRXProcessor::processFlush(FILE *output,
     {
       for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++)
       {
-        fwprintf(output, L"%S", ti->c_str());
+        u_fprintf(output, "%S", ti->c_str());
         if(ti != penum)
         {
-          fwprintf(output, L"/");
+          u_fputc('/', output);
         }
       }
     }
 
-    fwprintf(output, L"$");
+    u_fputc('$', output);
     if(debugMode)
     {
-      fwprintf(output, L"%d", spos);
+      u_fprintf(output, "%d", spos);
     }
 
 
diff --git a/src/lrx_processor.h b/src/lrx_processor.h
index 26973aa..1a03d86 100644
--- a/src/lrx_processor.h
+++ b/src/lrx_processor.h
@@ -18,74 +18,33 @@
 #ifndef __LRX_PROCESSOR_H__
 #define __LRX_PROCESSOR_H__
 
-#include <cwchar>
 #include <cstdio>
 #include <libgen.h>
-#include <cerrno>
-#include <string>
-#include <iostream>
-#include <cmath>
-#include <sstream>
-#include <limits>
-#include <cstdlib>
-#include <list>
-#include <algorithm>
 #include <set>
+#include <cstdint>
 
 #include <libxml/xmlreader.h>
 
-#include <lttoolbox/ltstr.h>
-#include <lttoolbox/lt_locale.h>
-#include <lttoolbox/transducer.h>
-#include <lttoolbox/xml_parse_util.h>
 #include <lttoolbox/alphabet.h>
-#include <lttoolbox/exception.h>
-#include <lttoolbox/compression.h>
-#include <lttoolbox/regexp_compiler.h>
 #include <lttoolbox/state.h>
-#include <lttoolbox/match_exe.h>
 #include <lttoolbox/trans_exe.h>
-#include <lttoolbox/my_stdio.h>
+#include <lttoolbox/input_file.h>
 
 using namespace std;
-/*
-class BiltransToken {
-public:
-	bool isEOF = false;
-	wstring source;
-	wstring blanks;
-	vector<wstring> target;
-
-	wstring toString(bool delim) {
-		wstring out = source;
-		for(int i = 0; i < target.size(); i++) {
-			out += L'/' + target[i];
-		}
-		if (delim && (source.size() > 0 || target.size() > 0)) {
-			out = blanks + L'^' + out + L'$';
-		} else {
-			out = blanks + out;
-		}
-		return out;
-	}
-};
 
-*/
 class LRXProcessor
 {
 private:
 
   Alphabet alphabet;
   TransExe transducer;
-  map<wstring, TransExe> recognisers;
-  map<wstring, double> weights;
-
-//  map<int, BiltransToken> bts;
+  map<UString, TransExe> recognisers;
+  map<UString, double> weights;
 
   vector<State> alive_states;
 
   map<Node *, double> anfinals;
-  set<wchar_t> escaped_chars;
+  set<UChar32> escaped_chars;
   State *initial_state;
 
   bool traceMode;
@@ -93,42 +52,41 @@ private:
   bool nullFlush;
   bool outOfWord;
 
+  int32_t any_char;
+  int32_t any_upper;
+  int32_t any_lower;
+  int32_t any_tag;
+  int32_t word_boundary;
+
   unsigned int pos;
   unsigned long lineno;
 
-  wstring itow(int i);
-  bool recognisePattern(const wstring lu, const wstring op);
-  wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2);
-
-//  BiltransToken readBiltransToken(FILE *input = stdin);
+  UString itow(int i);
+  bool recognisePattern(const UString lu, const UString op);
+  UString readFullBlock(InputFile& input, UChar32 const delim1, UChar32 const delim2);
 
   void makeTransition(int);
   void filterFinals();
   void evaluateRules();
 
-/*
-  void processFlush(FILE *output,
-                    map<int, wstring > &sl,
-                    map<int, vector<wstring> > &tl,
-                    map<int, wstring > &blanks,
-                    map<int, pair<double, vector<State> > > &covers,
-                    pair<double, vector<State> > &empty_seq,
-                    map<pair<int, int>, vector<State> > &spans,
-                    int last_final);
-*/
   enum OpType { Select, Remove };
 
-  void processFlush(FILE *output,
-                      map<int, wstring > &sl,
-                      map<int, vector<wstring> > &tl,
-                      map<int, wstring > &blanks,
-                      map<int, map<wstring, double> > &scores,
-                      map<int, map<wstring, OpType> > &operations);
+  void processFlush(UFILE *output,
+                      map<int, UString > &sl,
+                      map<int, vector<UString> > &tl,
+                      map<int, UString > &blanks,
+                      map<int, map<UString, double> > &scores,
+                      map<int, map<UString, OpType> > &operations);
 
 public:
-  static wstring const LRX_PROCESSOR_TAG_SELECT;
-  static wstring const LRX_PROCESSOR_TAG_REMOVE;
-  static wstring const LRX_PROCESSOR_TAG_SKIP;
+  static UString const LRX_PROCESSOR_TAG_SELECT;
+  static UString const LRX_PROCESSOR_TAG_REMOVE;
+  static UString const LRX_PROCESSOR_TAG_SKIP;
+  static UString const LRX_PROCESSOR_TAG_ANY_CHAR;
+  static UString const LRX_PROCESSOR_TAG_ANY_TAG;
+  static UString const LRX_PROCESSOR_TAG_ANY_UPPER;
+  static UString const LRX_PROCESSOR_TAG_ANY_LOWER;
+  static UString const LRX_PROCESSOR_TAG_WORD_BOUNDARY;
 
   LRXProcessor();
   ~LRXProcessor();
@@ -139,9 +97,7 @@ public:
 
   void init();
   void load(FILE *input);
-  void process(FILE *input, FILE *output);
-//  void processME(FILE *input, FILE *output);
-
+  void process(InputFile& input, UFILE *output);
 };
 
 #endif /* __LRX_PROCESSOR_H__ */
diff --git a/src/multi_translator.cc b/src/multi_translator.cc
index 7e2ad1e..ea98145 100644
--- a/src/multi_translator.cc
+++ b/src/multi_translator.cc
@@ -1,4 +1,5 @@
 #include "multi_translator.h"
+#include <iostream>
 
 MultiTranslator::MultiTranslator(string path, string mode, bool trimmed, bool filter, bool number_lines) {
 	this->trimmed = trimmed;
@@ -30,10 +31,10 @@ int MultiTranslator::calculateFertility(vector<BiltransToken> sent) {
 }
 
 
-BiltransToken MultiTranslator::parseBiltransToken(wstring bt) {
+BiltransToken MultiTranslator::parseBiltransToken(UString bt) {
 
 	BiltransToken token;
-	vector<wstring> tokens = wsplit(bt, L'/');
+	vector<UString> tokens = wsplit(bt, '/');
 
 	token.sourceToken = parseTaggerToken(tokens[0]);
 
@@ -49,9 +50,9 @@ bool MultiTranslator::isPosAmbig(BiltransToken bt) {
 	bool isPos;
 	if (bt.sourceToken.tags.size() > 0) {
 		isPos =
-		bt.sourceToken.tags[0] == L"n" ||
-		bt.sourceToken.tags[0] == L"vblex" ||
-		bt.sourceToken.tags[0] == L"adj";
+		bt.sourceToken.tags[0] == "n"_u ||
+		bt.sourceToken.tags[0] == "vblex"_u ||
+		bt.sourceToken.tags[0] == "adj"_u;
 	} else {
 		isPos = false;
 	}
@@ -60,10 +61,10 @@ bool MultiTranslator::isPosAmbig(BiltransToken bt) {
 
 }
 
-BiltransToken MultiTranslator::getFullToken(wstring source) {
+BiltransToken MultiTranslator::getFullToken(UString source) {
 
 	BiltransToken token;
-	if (source[0] == L'*') {
+	if (source[0] == '*') {
 		token.sourceToken.lemma = source;
 		TaggerToken tmp;
 		tmp.lemma = source;
@@ -71,21 +72,22 @@ BiltransToken MultiTranslator::getFullToken(wstring source) {
 		return token;
 	}
 
-	wstring target = bilingual.biltrans(source, false);
-	if (target == L"") {
-		target = L"@" + source;
+	UString target = bilingual.biltrans(source, false);
+	if (target.empty()) {
+      target += '@';
+      target.append(source);
 	}
-	token = parseBiltransToken(source + L"/" + target);
+	token = parseBiltransToken(source + "/"_u + target);
 	return token;
 
 }
 
-BiltransToken MultiTranslator::getTrimmedToken(wstring source)
+BiltransToken MultiTranslator::getTrimmedToken(UString source)
 {
 	BiltransToken ttoken;
 	BiltransToken ftoken;
 
-	if (source[0] == L'*') {
+	if (source[0] == '*') {
 		ttoken.sourceToken.lemma = source;
 		TaggerToken tmp;
 		tmp.lemma = source;
@@ -99,8 +101,8 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source)
         // the bilingual.* methods in FSTProcessor. Unknown why we get the
         // leaks in the first place...
 
-        wstring fstr = L"";
-        wstring tstr = L"";
+        UString fstr;
+        UString tstr;
 
 	if((f_cache.find(source) == f_cache.end()))
         {
@@ -116,37 +118,39 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source)
 
         /*---------------------------------------------*/
 
-	if (fstr == L"") {
-		fstr = L"@" + source;
-	}
-	if (tstr == L"") {
-		tstr = L"@" + source;
-	}
+        if (fstr.empty()) {
+          fstr += '@';
+          fstr.append(source);
+        }
+        if (tstr.empty()) {
+          tstr += '@';
+          tstr.append(source);
+        }
 
-	ttoken = parseBiltransToken(source + L"/" + tstr);
-	ftoken = parseBiltransToken(source + L"/" + fstr);
+	ttoken = parseBiltransToken(source + "/"_u + tstr);
+	ftoken = parseBiltransToken(source + "/"_u + fstr);
 
 
 	if(this->trimmed) {
 		for(size_t i = 0; i < ftoken.targetTokens.size(); ++i ) {
 			if(ttoken.targetTokens[i].tags.size() <
 			   ftoken.targetTokens[i].tags.size()) {
-				ttoken.targetTokens[i].tags.push_back(L"*");
+				ttoken.targetTokens[i].tags.push_back("*"_u);
 			}
 		}
 	}
 
-	vector<wstring> newTags;
+	vector<UString> newTags;
 	//bool sourceTrimmed = false;
 	for(size_t i = 0; i < ttoken.sourceToken.tags.size(); ++i) {
-		wstring tag = ttoken.sourceToken.tags[i];
+		UString tag = ttoken.sourceToken.tags[i];
 		if (find(ttoken.targetTokens[0].tags, tag) ==
 			find(ftoken.targetTokens[0].tags, tag)) {
 			newTags.push_back(tag);
 		}
 	}
 	if(ttoken.sourceToken.tags.size() > newTags.size()) {
-		newTags.push_back(L"*");
+		newTags.push_back("*"_u);
 	}
 	ttoken.sourceToken.tags = newTags;
 
@@ -154,50 +158,50 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source)
 }
 
 void MultiTranslator::biltransToMultiTranslator(int sn, int &tn, unsigned int idx,
-	vector<BiltransToken> s, wstring buffer)
+	vector<BiltransToken> s, UString buffer)
 {
 
 	if (idx == s.size() ) {
-		wcout << L".[][" <<  sn << L" " << tn << L"].[]\t" << buffer << endl;
+		cout << ".[][" <<  sn << " " << tn << "].[]\t" << buffer << endl;
 		tn += 1;
 		return;
 	}
 	auto n = s[idx].targetTokens.size();
-	wstring base;
-	base = s[idx].sourceToken.toString(false) + L"/";
+	UString base;
+	base = s[idx].sourceToken.toString(false) + "/"_u;
 	for(size_t i = 0; i < n; ++i) {
-		wstring token = L"^" + base + s[idx].targetTokens[i].toString(false) + L"$";
+		UString token = "^"_u + base + s[idx].targetTokens[i].toString(false) + "$"_u;
 		if(idx != s.size() - 1) {
-			token += L" ";
+			token += ' ';
 		}
 		biltransToMultiTranslator(sn, tn, idx+1, s, buffer + token);
 	}
 }
 void MultiTranslator::printBiltransSentence(int n, vector<BiltransToken> s) {
 	if (number_lines) {
-		wcout << n << "\t";
+		cout << n << "\t";
 	}
 	for(size_t i = 0; i < s.size(); ++i) {
-		wcout << s[i].toString(true);
+		cout << s[i].toString(true);
 		if (i != s.size() - 1) {
-			wcout << L" ";
+			cout << " ";
 		}
 	}
-	wcout << endl;
+	cout << endl;
 }
 
 void MultiTranslator::printTaggerOutput(int n, vector<BiltransToken> sentence) {
 	if (number_lines) {
-		wcout << n << "\t";
+		cout << n << "\t";
 	}
 
 	for(size_t i = 0; i < sentence.size(); ++i) {
-		wcout << sentence[i].sourceToken.toString(true);
+		cout << sentence[i].sourceToken.toString(true);
 		if (i != sentence.size() -1) {
-			wcout << L" ";
+			cout << " ";
 		}
 	}
-	wcout << endl;
+	cout << endl;
 }
 
 void MultiTranslator::processSentence(vector<TaggerToken> sentence) {
@@ -207,8 +211,8 @@ void MultiTranslator::processSentence(vector<TaggerToken> sentence) {
 	int numberOfUnknown = 0;
 	int fertility = 1;
 	for(size_t i = 0; i < sentence.size(); ++i) {
-		wstring token = sentence[i].toString(false);
-		wstring target;
+		UString token = sentence[i].toString(false);
+		UString target;
 
 		BiltransToken bt;
 		if(this->trimmed){
@@ -220,7 +224,7 @@ void MultiTranslator::processSentence(vector<TaggerToken> sentence) {
 		if (isPosAmbig(bt)) {
 			hasAmbigPos = true;
 		}
-		if(token[0] == L'*') {
+		if(token[0] == '*') {
 			numberOfUnknown ++;
 		}
 		fertility *= bt.targetTokens.size();
@@ -240,7 +244,7 @@ void MultiTranslator::processSentence(vector<TaggerToken> sentence) {
 		} else if(mode == "-b") {
 			printBiltransSentence(this->sn, outputSentence);
 		} else if (mode == "-m") {
-			wstring outBuffer = L"";
+			UString outBuffer;
 			int tn = 0;
 			biltransToMultiTranslator(this->sn, tn, 0, outputSentence, outBuffer);
 		}
diff --git a/src/multi_translator.h b/src/multi_translator.h
index d4d69cd..30ec426 100644
--- a/src/multi_translator.h
+++ b/src/multi_translator.h
@@ -4,36 +4,42 @@
 #define BILTRANS_WITHOUT_QUEUE
 
 #include "tagger_output_processor.h"
+#include <lttoolbox/fst_processor.h>
 
 class BiltransToken {
 public:
-	TaggerToken sourceToken;
-	vector<TaggerToken> targetTokens;
-	wstring blanks;
-
-	bool isEOF;
-
-	BiltransToken() {
-		isEOF = false;
-	}
-
-	wstring toString(bool delimiter) {
-		wstring out = sourceToken.toString(false);
-		for(unsigned int i = 0; i < targetTokens.size(); i++) {
-			out += L'/' + targetTokens[i].toString(false);
-		}
-		if (delimiter) {
-			out = L"^" + out + L"$";
-		}
-		return out;
-	}
+  TaggerToken sourceToken;
+  vector<TaggerToken> targetTokens;
+  UString blanks;
+
+  bool isEOF;
+
+  BiltransToken() {
+    isEOF = false;
+  }
+
+  UString toString(bool delimiter) {
+    UString out;
+    if (delimiter) {
+      out += '^';
+    }
+    out.append(sourceToken.toString(false));
+    for (auto& tok : targetTokens) {
+      out += '/';
+      out.append(tok.toString(false));
+    }
+    if (delimiter) {
+      out += '$';
+    }
+    return out;
+  }
 };
 
 class MultiTranslator : public TaggerOutputProcessor {
 private:
 	FSTProcessor bilingual;
-	map<wstring, wstring> f_cache;
-	map<wstring, wstring> t_cache;
+	map<UString, UString> f_cache;
+	map<UString, UString> t_cache;
 	string path;
 
 	bool trimmed;
@@ -44,10 +50,10 @@ private:
 
 	bool isPosAmbig(BiltransToken token);
 
-	BiltransToken getTrimmedToken(wstring str);
-	BiltransToken getFullToken(wstring str);
+	BiltransToken getTrimmedToken(UString str);
+	BiltransToken getFullToken(UString str);
 
-	BiltransToken parseBiltransToken(wstring bt);
+	BiltransToken parseBiltransToken(UString bt);
 
 	void processSentence(vector<TaggerToken> s);
 
@@ -56,7 +62,7 @@ private:
 	void printTaggerOutput(int i, vector<BiltransToken> s);
 
 	void biltransToMultiTranslator(int sn, int &tn, unsigned int idx,
-			vector<BiltransToken> s, wstring buffer);
+			vector<BiltransToken> s, UString buffer);
 
 
 
@@ -69,4 +75,3 @@ public:
 };
 
 #endif
-
diff --git a/src/multitrans.cc b/src/multitrans.cc
index a4643bc..ad94ae1 100644
--- a/src/multitrans.cc
+++ b/src/multitrans.cc
@@ -1,4 +1,5 @@
 #include "multi_translator.h"
+#include <lttoolbox/lt_locale.h>
 
 bool trim = false;
 bool filter = false;
@@ -9,18 +10,18 @@ string path;
 string mode;
 
 void printError(char *name) {
-    wcout << "Usage: " << name << " ";
-    wcout << "<mode> [options] <path to a binary bilingual transducer>" << endl;
-    wcout << "Modes: " << endl;
-    wcout << "  --biltrans           | -b" << endl;
-    wcout << "  --multitrans         | -m" << endl;
-    wcout << "  --trim-tagger-output | -p" << endl;
+    cout << "Usage: " << name << " ";
+    cout << "<mode> [options] <path to a binary bilingual transducer>" << endl;
+    cout << "Modes: " << endl;
+    cout << "  --biltrans           | -b" << endl;
+    cout << "  --multitrans         | -m" << endl;
+    cout << "  --trim-tagger-output | -p" << endl;
 
-    wcout << "Options: " << endl;
-    wcout << "  --filter-lines | -f" << endl;
-    wcout << "  --trim-lines   | -t" << endl;
-    wcout << "  --number-lines | -n" << endl;
-    wcout << "  --null-flush   | -z" << endl;
+    cout << "Options: " << endl;
+    cout << "  --filter-lines | -f" << endl;
+    cout << "  --trim-lines   | -t" << endl;
+    cout << "  --number-lines | -n" << endl;
+    cout << "  --null-flush   | -z" << endl;
 }
 
 void parseArguments(int argc, char **argv) {
@@ -59,6 +60,7 @@ void parseArguments(int argc, char **argv) {
 }
 
 int main(int argc, char** argv) {
+  LtLocale::tryToSetLocale();
     parseArguments(argc, argv);
 
     MultiTranslator mt(path, mode, trim, filter, number_lines);
diff --git a/src/tagger_output_processor.cc b/src/tagger_output_processor.cc
index 63b07f8..859aae3 100644
--- a/src/tagger_output_processor.cc
+++ b/src/tagger_output_processor.cc
@@ -1,15 +1,8 @@
 #include "tagger_output_processor.h"
+#include <lttoolbox/string_utils.h>
+#include <lttoolbox/input_file.h>
 
-TaggerOutputProcessor::TaggerOutputProcessor() {
-	sn = 0;
-	LtLocale::tryToSetLocale();
-}
-
-TaggerOutputProcessor::~TaggerOutputProcessor() {
-
-}
-
-int TaggerOutputProcessor::find(vector<wstring> xs, wstring x) {
+int TaggerOutputProcessor::find(vector<UString> xs, UString x) {
 	for (size_t i = 0; i < xs.size(); ++i) {
 		if (xs[i] == x)
 			return i;
@@ -17,21 +10,21 @@ int TaggerOutputProcessor::find(vector<wstring> xs, wstring x) {
 	return -1;
 }
 
-TaggerToken TaggerOutputProcessor::parseTaggerToken(wstring str) {
+TaggerToken TaggerOutputProcessor::parseTaggerToken(UString str) {
 	TaggerToken token;
 	int state = 0; // lemma;
-	wstring buffer;
+	UString buffer;
 	for (auto& c : str) {
-		if(c == L'<' && state == 0) {
+		if(c == '<' && state == 0) {
 			state = 1;
 			token.lemma = buffer;
 			buffer.clear();
 		}
 
-		if (c == L'>') {
+		if (c == '>') {
 			token.tags.push_back(buffer);
 			buffer.clear();
-		} else if (c != L'<') {
+		} else if (c != '<') {
 			buffer += c;
 		}
 	}
@@ -41,10 +34,10 @@ TaggerToken TaggerOutputProcessor::parseTaggerToken(wstring str) {
 	return token;
 }
 
-vector<wstring> TaggerOutputProcessor::parseTags(wstring token) {
+vector<UString> TaggerOutputProcessor::parseTags(UString token) {
 	int state = 0; // outside
-	vector<wstring> tags;
-	wstring buffer;
+	vector<UString> tags;
+	UString buffer;
 	for (auto& c : token) {
 		if (state == 0) {
 			if (c == '<') {
@@ -53,7 +46,7 @@ vector<wstring> TaggerOutputProcessor::parseTags(wstring token) {
 		} else if (state == 1) {
 			if (c == '>') {
 				tags.push_back(buffer);
-				buffer = L"";
+				buffer.clear();
 				state = 0;
 			} else {
 				buffer += c;
@@ -63,26 +56,26 @@ vector<wstring> TaggerOutputProcessor::parseTags(wstring token) {
 	return tags;
 }
 
-vector<wstring> TaggerOutputProcessor::wsplit(wstring wstr, wchar_t delim) {
-	vector<wstring> tokens;
-	wstring buffer;
+vector<UString> TaggerOutputProcessor::wsplit(UString wstr, UChar delim) {
+	vector<UString> tokens;
+	UString buffer;
 
 	for(size_t i = 0; i < wstr.size(); ++i) {
-		if(wstr[i] == delim && (i == 0 || wstr[i-1] != L'\\')) {
+		if(wstr[i] == delim && (i == 0 || wstr[i-1] != '\\')) {
 			tokens.push_back(buffer);
-			buffer = L"";
+			buffer.clear();
 		} else {
 			buffer += wstr[i];
 		}
 	}
-	if(buffer != L"") {
+	if(!buffer.empty()) {
 		tokens.push_back(buffer);
 	}
 	return tokens;
 }
 
-wstring TaggerOutputProcessor::getLemma(wstring token) {
-	wstring buffer;
+UString TaggerOutputProcessor::getLemma(UString token) {
+	UString buffer;
 	for (auto& c : token) {
 		if(c != '<') {
 			buffer += c;
@@ -94,47 +87,26 @@ wstring TaggerOutputProcessor::getLemma(wstring token) {
 }
 
 void TaggerOutputProcessor::processTaggerOutput(bool nullFlush) {
-	wstring buffer;
 	vector<TaggerToken> sentence;
-	bool escaped = false;
-	int state = 0; // outside
-	wchar_t c;
-	while((c = fgetwc(stdin))) {
-		if (c == -1) {
-			break;
-		}
+	UChar32 c;
+  InputFile in;
+	while (!in.eof()) {
+    c = in.get();
 
-		if (nullFlush && c == L'\0') {
+		if ((c == '\n') || (nullFlush && c == '\0')) {
 		  processSentence(sentence);
 		  sentence.clear();
-		  buffer.clear();
-		}
-
-		if(c == L'\n') {
-			processSentence(sentence);
-			sentence.clear();
-			buffer.clear();
-		}
-		if (state == 0) {
-			if (c == '^' && !escaped) {
-				state = 1; // inside
-			} else if (c == '\\' && !escaped) {
-				escaped = true;
-			} else {
-				escaped = false;
-			}
-		} else if (state == 1) {
-			if(c == L'$' && !escaped) {
-				sentence.push_back(parseTaggerToken(buffer));
-				buffer = L"";
-				state = 0;
-			} else if (c == '\\' && !escaped) {
-				escaped = true;
-				buffer += c;
-			} else {
-				buffer += c;
-				escaped = false;
-			}
+		} else if (c == '\\') {
+      in.get();
+    } else if (c == '^') {
+      // readBlock() returns the unit with its '^' and '$' still attached
+      // (lrx_processor.cc relies on the same for '<'...'>'); strip them so
+      // the lemma does not start with '^' and "*unknown" checks still work.
+      UString lu = in.readBlock('^', '$');
+      if (lu.back() == '$') {
+        lu.pop_back();
+      }
+      sentence.push_back(parseTaggerToken(lu.substr(1)));
 		}
 	}
 }
diff --git a/src/tagger_output_processor.h b/src/tagger_output_processor.h
index 40c00ad..0219ccf 100644
--- a/src/tagger_output_processor.h
+++ b/src/tagger_output_processor.h
@@ -2,54 +2,46 @@
 #define TAGGER_OUTPUT_PROCESSOR
 
 #include <stdio.h>
-#include <string>
-#include <iostream>
-
-#include <lttoolbox/fst_processor.h>
-#include <lttoolbox/lt_locale.h>
-#include <lttoolbox/ltstr.h>
-
-#include <cwchar>
-#include <set>
-#include <apertium/tsx_reader.h>
-#include <apertium/string_utils.h>
+#include <lttoolbox/ustring.h>
 
 using namespace std;
 
 class TaggerToken {
 public:
-	wstring lemma;
-	vector<wstring> tags;
-	wstring toString(bool delimiters) {
-		wstring out = lemma;
-		for (auto& tag : tags) {
-			out += L"<" + tag + L">";
-		}
-		if (delimiters) {
-			out = L"^" + out + L"$";
-		}
-		return out;
-	}
+  UString lemma;
+  vector<UString> tags;
+  UString toString(bool delimiters) {
+    UString out;
+    if (delimiters) {
+      out += '^';
+    }
+    out.append(lemma);
+    for (auto& tag : tags) {
+      out += '<';
+      out.append(tag);
+      out += '>';
+    }
+    if (delimiters) {
+      out += '$';
+    }
+    return out;
+  }
 };
 
 class TaggerOutputProcessor {
 protected:
-	int sn;
+	int sn = 0;
 
-	vector<wstring> parseTags(wstring token);
-	vector<wstring> wsplit(wstring wstr, wchar_t delim);
-	TaggerToken parseTaggerToken(wstring buffer);
+	vector<UString> parseTags(UString token);
+	vector<UString> wsplit(UString wstr, UChar delim);
+	TaggerToken parseTaggerToken(UString buffer);
 
-	int find(vector<wstring> xs, wstring x);
-	wstring getLemma(wstring token);
+	int find(vector<UString> xs, UString x);
+	UString getLemma(UString token);
 
 	virtual void processSentence(vector<TaggerToken>) =0;
 public:
-	TaggerOutputProcessor();
-	~TaggerOutputProcessor();
-
 	void processTaggerOutput(bool nullFlush=false);
-
 };
 
 #endif
diff --git a/src/yasmet.cc b/src/yasmet.cc
index d203555..ae25a2e 100644
--- a/src/yasmet.cc
+++ b/src/yasmet.cc
@@ -1,4 +1,3 @@
-#include <cwchar>
 #include <cstdio>
 #include <libgen.h>
 #include <cerrno>