commit ad15367786575821a004efe19854b4f539074aaa
Author: Daniel Swanson <popcorn.tomato.dude@gmail.com>
Date:   Thu May 27 18:46:38 2021 -0500

    the long march part 1

diff --git a/.gitignore b/.gitignore
index 6972eaf..95e0253 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,3 +80,5 @@
 *.egg-info/
 *.egg
 **/.mypy_cache/
+
+*~
diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am
index 501b04a..6ade398 100644
--- a/lttoolbox/Makefile.am
+++ b/lttoolbox/Makefile.am
@@ -3,12 +3,13 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h  \
             deserialiser.h entry_token.h expander.h fst_processor.h lt_locale.h \
             ltstr.h match_exe.h match_node.h match_state.h my_stdio.h node.h \
             pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h \
+			string_utils.h \
             transducer.h trans_exe.h xml_parse_util.h exception.h tmx_compiler.h \
             string_to_wostream.h
 cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \
              expander.cc fst_processor.cc lt_locale.cc match_exe.cc \
              match_node.cc match_state.cc node.cc pattern_list.cc \
-             regexp_compiler.cc sorted_vector.cc state.cc transducer.cc \
+             regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \
              trans_exe.cc xml_parse_util.cc tmx_compiler.cc
 
 library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME)
diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc
index 6a47095..122e0e3 100644
--- a/lttoolbox/alphabet.cc
+++ b/lttoolbox/alphabet.cc
@@ -26,11 +26,10 @@
 #include <cwchar>
 #include <cwctype>
 
-#if defined(_WIN32) && !defined(_MSC_VER)
-#include <utf8_fwrap.h>
-#endif
+#include "string_utils.h"
 
 using namespace std;
+using namespace icu;
 
 Alphabet::Alphabet()
 {
@@ -74,7 +73,7 @@ Alphabet::copy(Alphabet const &a)
 }
 
 void
-Alphabet::includeSymbol(wstring const &s)
+Alphabet::includeSymbol(UnicodeString const &s)
 {
   if(slexic.find(s) == slexic.end())
   {
@@ -99,13 +98,13 @@ Alphabet::operator()(int const c1, int const c2)
 }
 
 int
-Alphabet::operator()(wstring const &s)
+Alphabet::operator()(UnicodeString const &s)
 {
   return slexic[s];
 }
 
 int
-Alphabet::operator()(wstring const &s) const
+Alphabet::operator()(UnicodeString const &s) const
 {
   auto it = slexic.find(s);
   if (it == slexic.end()) {
@@ -115,7 +114,7 @@ Alphabet::operator()(wstring const &s) const
 }
 
 bool
-Alphabet::isSymbolDefined(wstring const &s)
+Alphabet::isSymbolDefined(UnicodeString const &s)
 {
   return slexic.find(s) != slexic.end();
 }
@@ -133,7 +132,7 @@ Alphabet::write(FILE *output)
   Compression::multibyte_write(slexicinv.size(), output);  // taglist size
   for(unsigned int i = 0, limit = slexicinv.size(); i < limit; i++)
   {
-    Compression::wstring_write(slexicinv[i].substr(1, slexicinv[i].size()-2), output);
+    Compression::string_write(slexicinv[i].tempSubString(1, slexicinv[i].length()-2), output);
   }
 
   // Then we write the list of pairs
@@ -160,7 +159,7 @@ Alphabet::read(FILE *input)
   while(tam > 0)
   {
     tam--;
-    wstring mytag = L"<" + Compression::wstring_read(input) + L">";
+    UnicodeString mytag = "<" + Compression::string_read(input) + ">";
     a_new.slexicinv.push_back(mytag);
     a_new.slexic[mytag]= -a_new.slexicinv.size(); // ToDo: This does not turn the result negative due to unsigned semantics
   }
@@ -185,7 +184,7 @@ Alphabet::read(FILE *input)
 void
 Alphabet::serialise(std::ostream &serialised) const
 {
-  Serialiser<const vector<wstring> >::serialise(slexicinv, serialised);
+  Serialiser<const vector<UnicodeString> >::serialise(slexicinv, serialised);
   Serialiser<vector<pair<int, int> > >::serialise(spairinv, serialised);
 }
 
@@ -196,7 +195,7 @@ Alphabet::deserialise(std::istream &serialised)
   slexic.clear();
   spairinv.clear();
   spair.clear();
-  slexicinv = Deserialiser<vector<wstring> >::deserialise(serialised);
+  slexicinv = Deserialiser<vector<UnicodeString> >::deserialise(serialised);
   for (size_t i = 0; i < slexicinv.size(); i++) {
     slexic[slexicinv[i]] = -i - 1; // ToDo: This does not turn the result negative due to unsigned semantics
   }
@@ -207,20 +206,20 @@ Alphabet::deserialise(std::istream &serialised)
 }
 
 void
-Alphabet::writeSymbol(int const symbol, FILE *output) const
+Alphabet::writeSymbol(int const symbol, UFILE *output) const
 {
   if(symbol < 0)
   {
-    fputws_unlocked(slexicinv[-symbol-1].c_str(), output);
+    u_fputs(slexicinv[-symbol-1], output);
   }
   else
   {
-    fputwc_unlocked(static_cast<wchar_t>(symbol), output);
+    u_fputc(static_cast<UChar32>(symbol), output);  // u_fputc takes a UChar32; a UChar cast would truncate code points above U+FFFF
   }
 }
 
 void
-Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const
+Alphabet::getSymbol(UnicodeString &result, int const symbol, bool uppercase) const
 {
   if(symbol == 0)
   {
@@ -231,7 +230,7 @@ Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const
   {
     if(symbol >= 0)
     {
-      result += static_cast<wchar_t>(symbol);
+      result += static_cast<UChar32>(symbol);  // append the whole code point rather than a truncated UTF-16 unit
     }
     else
     {
@@ -240,7 +239,7 @@ Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const
   }
   else if(symbol >= 0)
   {
-    result += static_cast<wchar_t>(towupper(static_cast<wint_t>(symbol)));
+    result += u_toupper(static_cast<UChar32>(symbol));  // ICU case mapping; C toupper() is only defined for unsigned char values
   }
   else
   {
@@ -261,7 +260,7 @@ Alphabet::decode(int const code) const
 }
 
 set<int>
-Alphabet::symbolsWhereLeftIs(wchar_t l) const {
+Alphabet::symbolsWhereLeftIs(UChar l) const {
   set<int> eps;
   for(const auto& sp: spair) {  // [(l, r) : tag]
     if(sp.first.first == l) {
@@ -271,7 +270,7 @@ Alphabet::symbolsWhereLeftIs(wchar_t l) const {
   return eps;
 }
 
-void Alphabet::setSymbol(int symbol, wstring newSymbolString) {
+void Alphabet::setSymbol(int symbol, UnicodeString newSymbolString) {
   //Should be a special character!
   if (symbol < 0) slexicinv[-symbol-1] = newSymbolString;
 }
diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h
index 3218334..9d59da1 100644
--- a/lttoolbox/alphabet.h
+++ b/lttoolbox/alphabet.h
@@ -22,10 +22,11 @@
 #include <map>
 #include <set>
 #include <vector>
-
-#include <lttoolbox/ltstr.h>
+#include <unicode/unistr.h>
+#include <unicode/ustdio.h>
 
 using namespace std;
+using namespace icu;
 
 /**
  * Alphabet class.
@@ -38,13 +39,13 @@ private:
    * Symbol-identifier relationship. Only contains <tags>.
    * @see slexicinv
    */
-  map<wstring, int, Ltstr> slexic;
+  map<UnicodeString, int> slexic;
 
   /**
    * Identifier-symbol relationship. Only contains <tags>.
    * @see slexic
    */
-  vector<wstring> slexicinv;
+  vector<UnicodeString> slexicinv;
 
 
   /**
@@ -89,7 +90,7 @@ public:
   /**
    * Include a symbol into the alphabet.
    */
-  void includeSymbol(wstring const &s);
+  void includeSymbol(UnicodeString const &s);
 
   /**
    * Get an unique code for every symbol pair.  This flavour is for
@@ -99,7 +100,7 @@ public:
    * @return code for (c1, c2).
    */
   int operator()(int const c1, int const c2);
-  int operator()(wstring const &s) const;
+  int operator()(UnicodeString const &s) const;
 
   /**
    * Gets the individual symbol identifier. Assumes it already exists!
@@ -107,14 +108,14 @@ public:
    * @param s symbol to be identified.
    * @return symbol identifier.
    */
-  int operator()(wstring const &s);
+  int operator()(UnicodeString const &s);
 
   /**
    * Check wether the symbol is defined in the alphabet.
    * @param s symbol
    * @return true if defined
    */
-  bool isSymbolDefined(wstring const &s);
+  bool isSymbolDefined(UnicodeString const &s);
 
   /**
    * Returns the size of the alphabet (number of symbols).
@@ -142,7 +143,7 @@ public:
    * @param symbol symbol code.
    * @param output output stream.
    */
-  void writeSymbol(int const symbol, FILE *output) const;
+  void writeSymbol(int const symbol, UFILE *output) const;
 
   /**
    * Concat a symbol in the string that is passed by reference.
@@ -150,7 +151,7 @@ public:
    * @param symbol code of the symbol
    * @param uppercase true if we want an uppercase symbol
    */
-  void getSymbol(wstring &result, int const symbol,
+  void getSymbol(UnicodeString &result, int const symbol,
 		 bool uppercase = false) const;
 
   /**
@@ -165,7 +166,7 @@ public:
    * @param symbol the code of the symbol to set
    * @param newSymbolString the new string for this symbol
    */
-  void setSymbol(int symbol, wstring newSymbolString);
+  void setSymbol(int symbol, UnicodeString newSymbolString);
 
   /**
    * Note: both the symbol int and int-pair are specific to this alphabet instance.
@@ -178,7 +179,7 @@ public:
   /**
    * Get all symbols where the left-hand side of the symbol-pair is l.
    */
-  set<int> symbolsWhereLeftIs(wchar_t l) const;
+  set<int> symbolsWhereLeftIs(UChar l) const;
 
   enum Side
   {
diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc
index a511f5a..6d33b96 100644
--- a/lttoolbox/att_compiler.cc
+++ b/lttoolbox/att_compiler.cc
@@ -22,17 +22,30 @@
 #include <lttoolbox/string_to_wostream.h>
 #include <algorithm>
 #include <stack>
+#include <unicode/unistr.h>
+#include <unicode/numfmt.h>
 
 using namespace std;
+using namespace icu;
 
 AttCompiler::AttCompiler() :
 starting_state(0),
 default_weight(0.0000)
 {
+  UErrorCode status = U_ZERO_ERROR;
+  int_parser = NumberFormat::createInstance(status);
+  float_parser = NumberFormat::createInstance(status);
+  if (U_FAILURE(status)) {  // ICU warning codes are non-zero but still mean success
+    cerr << "Error: unable to set up numeric converter." << endl;
+    exit(EXIT_FAILURE);
+  }
+  int_parser->setParseIntegerOnly(true);  // configured only after both parsers are known to be valid
 }
 
 AttCompiler::~AttCompiler()
 {
+  delete int_parser;
+  delete float_parser;
 }
 
 void
@@ -46,26 +59,52 @@ AttCompiler::clear()
   alphabet = Alphabet();
 }
 
+int
+AttCompiler::parse_state(const UnicodeString& s, int line)
+{
+  UErrorCode status = U_ZERO_ERROR;
+  Formattable result;
+  int_parser->parse(s, result, status);
+  if (U_FAILURE(status)) {  // only real failures; ICU warnings are non-zero too
+    cerr << "ERROR: Unable to parse state number on line " << line << "." << endl;
+    // TODO: error messages should also print file names
+  }
+  return result.getLong();
+}
+
+double
+AttCompiler::parse_weight(const UnicodeString& s, int line)
+{
+  UErrorCode status = U_ZERO_ERROR;
+  Formattable result;
+  float_parser->parse(s, result, status);
+  if (U_FAILURE(status)) {  // only real failures; ICU warnings are non-zero too
+    cerr << "ERROR: Unable to parse weight on line " << line << "." << endl;
+    // TODO: error messages should also print file names
+  }
+  return result.getDouble();
+}
+
 /**
  * Converts symbols like @0@ to epsilon, @_SPACE_@ to space, etc.
  * @todo Are there other special symbols? If so, add them, and maybe use a map
  *       for conversion?
  */
 void
-AttCompiler::convert_hfst(wstring& symbol)
+AttCompiler::convert_hfst(UnicodeString& symbol)
 {
-  if (symbol == L"@0@" || symbol == L"ε")
+  if (symbol == "@0@" || symbol == u"ε")  // UTF-16 literal: a char* "ε" would be decoded with ICU's default codepage
   {
-    symbol = L"";
+    symbol = "";
   }
-  else if (symbol == L"@_SPACE_@")
+  else if (symbol == "@_SPACE_@")
   {
-    symbol = L" ";
+    symbol = " ";
   }
 }
 
 bool
-AttCompiler::is_word_punct(wchar_t symbol)
+AttCompiler::is_word_punct(UChar symbol)
 {
   // https://en.wikipedia.org/wiki/Combining_character#Unicode_ranges
   if((symbol >= 0x0300 && symbol <= 0x036F) // Combining Diacritics
@@ -90,12 +129,12 @@ AttCompiler::is_word_punct(wchar_t symbol)
  *         only) character otherwise.
  */
 int
-AttCompiler::symbol_code(const wstring& symbol)
+AttCompiler::symbol_code(const UnicodeString& symbol)
 {
   if (symbol.length() > 1) {
     alphabet.includeSymbol(symbol);
     return alphabet(symbol);
-  } else if (symbol == L"") {
+  } else if (symbol == "") {
     return 0;
   } else if ((iswpunct(symbol[0]) || iswspace(symbol[0])) && !is_word_punct(symbol[0])) {
     return symbol[0];
@@ -128,77 +167,85 @@ AttCompiler::has_multiple_fsts(string const &file_name)
 }
 
 void
-AttCompiler::parse(string const &file_name, wstring const &dir)
+AttCompiler::parse(UnicodeString const &file_name, UnicodeString const &dir)
 {
   clear();
 
-  wifstream infile(file_name.c_str());  // TODO: error checking
-  vector<wstring> tokens;
-  wstring line;
+  UFILE* infile = u_fopen_u(file_name, "r");
+  if (infile == NULL) {
+    cerr << "Error: unable to open '" << file_name << "' for reading." << endl;
+  }
+  vector<UnicodeString> tokens;
   bool first_line_in_fst = true;       // First line -- see below
-  int state_id_offset = 0;
+  bool multiple_transducers = false;
+  int state_id_offset = 1;
   int largest_seen_state_id = 0;
+  int line_number = 0;
 
-  if (has_multiple_fsts(file_name)){
-    wcerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << endl;
-
-    // Set the starting state to 0 (Epsilon transtions will be added later)
-    starting_state = 0;
-    state_id_offset = 1;
-  }
-
-  while (getline(infile, line))
+  while (!u_feof(infile))
   {
+    line_number++;
     tokens.clear();
+    tokens.push_back("");
+    do {
+      UChar32 c = u_fgetcx(infile);
+      if (c == '\n' || c == U_EOF) {  // U_EOF is the end-of-input sentinel, not text to store in a token
+        break;
+      } else if (c == '\t') {
+        tokens.push_back("");
+      } else {
+        tokens.back() += c;
+      }
+    } while (!u_feof(infile));
+
     int from, to;
     wstring upper, lower;
     double weight;
 
-    if (line.length() == 0 && first_line_in_fst)
+    if (tokens[0].length() == 0 && first_line_in_fst)
     {
-      wcerr << "Error: empty file '" << file_name << "'." << endl;
+      cerr << "Error: empty file '" << file_name << "'." << endl;
       exit(EXIT_FAILURE);
     }
-    if (first_line_in_fst && line.find(L"\t") == wstring::npos)
+    if (first_line_in_fst && tokens.size() == 1)
     {
-      wcerr << "Error: invalid format '" << file_name << "'." << endl;
+      cerr << "Error: invalid format in file '" << file_name << "' on line " << line_number << "." << endl;
       exit(EXIT_FAILURE);
     }
 
     /* Empty line. */
-    if (line.length() == 0)
+    if (tokens.size() == 1 && tokens[0].length() == 0)
     {
       continue;
     }
-    split(line, L'\t', tokens);
 
     if (tokens[0].find('-') == 0)
     {
+      if (state_id_offset == 1) {
+        // this is the first split we've seen
+        cerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << endl;
+        multiple_transducers = true;
+      }
       // Update the offset for the new FST
       state_id_offset = largest_seen_state_id + 1;
       first_line_in_fst = true;
       continue;
     }
 
-    from = stoi(tokens[0]) + state_id_offset;
+    from = parse_state(tokens[0]) + state_id_offset;
     largest_seen_state_id = max(largest_seen_state_id, from);
 
     AttNode* source = get_node(from);
     /* First line: the initial state is of both types. */
     if (first_line_in_fst)
     {
-      // If the file has a single FST - No need for state id mapping
-      if (state_id_offset == 0)
-        starting_state = from;
-      else{
-        AttNode * starting_node = get_node(starting_state);
-
-        // Add an Epsilon transition from the new starting state
-        starting_node->transductions.push_back(
-          Transduction(from, L"", L"",
-            alphabet(symbol_code(L""), symbol_code(L"")),
-            default_weight));
-      }
+      AttNode * starting_node = get_node(starting_state);
+
+      // Add an Epsilon transition from the new starting state
+      starting_node->transductions.push_back(
+        Transduction(from, L"", L"",
+                     alphabet(symbol_code(L""), symbol_code(L"")),
+                     default_weight));
       first_line_in_fst = false;
     }
 
@@ -207,7 +254,7 @@ AttCompiler::parse(string const &file_name, wstring const &dir)
     {
       if (tokens.size() > 1)
       {
-        weight = stod(tokens[1]);
+        weight = parse_weight(tokens[1]);
       }
       else
       {
@@ -217,9 +264,9 @@ AttCompiler::parse(string const &file_name, wstring const &dir)
     }
     else
     {
-      to = stoi(tokens[1]) + state_id_offset;
+      to = parse_state(tokens[1]) + state_id_offset;
       largest_seen_state_id = max(largest_seen_state_id, to);
-      if(dir == L"RL")
+      if(dir == "RL")
       {
         upper = tokens[3];
         lower = tokens[2];
@@ -247,12 +294,19 @@ AttCompiler::parse(string const &file_name, wstring const &dir)
     }
   }
 
+  if (!multiple_transducers) {
+    starting_state = 1;
+    // if we aren't disjuncting multiple transducers
+    // then we have an extra epsilon transduction at the beginning
+    // so skip it
+  }
+
   /* Classify the nodes of the graph. */
   classify_forwards();
   set<int> path;
   classify_backwards(starting_state, path);
 
-  infile.close();
+  u_fclose(infile);
 }
 
 /** Extracts the sub-transducer made of states of type @p type. */
diff --git a/lttoolbox/att_compiler.h b/lttoolbox/att_compiler.h
index 126ca56..a03179a 100644
--- a/lttoolbox/att_compiler.h
+++ b/lttoolbox/att_compiler.h
@@ -28,6 +28,9 @@
 #include <lttoolbox/transducer.h>
 #include <lttoolbox/compression.h>
 
+#include <unicode/numfmt.h>
+#include <unicode/unistr.h>
+
 #include <cstdlib>
 
 #define UNDECIDED 0
@@ -36,25 +39,11 @@
 #define BOTH      3
 
 using namespace std;
+using namespace icu;
 
 /** Bitmask; 1 = WORD, 2 = PUNCT, 3 = BOTH. */
 typedef unsigned int TransducerType;
 
-namespace
-{
-  /** Splits a string into fields. */
-  inline vector<wstring>& split(const wstring& s, wchar_t delim, vector<wstring> &out)
-  {
-      wistringstream ss(s);
-      wstring item;
-      while (getline(ss, item, delim))
-      {
-        out.push_back(item);
-      }
-      return out;
-  }
-};
-
 /**
  * Converts transducers from AT&T text format to lt binary format.
  *
@@ -91,7 +80,7 @@ public:
    * Reads the AT&T format file @p file_name. The transducer and the alphabet
    * are both cleared before reading the new file.
    */
-  void parse(string const &file_name, wstring const &dir);
+  void parse(UnicodeString const &file_name, UnicodeString const &dir);
 
   /** Writes the transducer to @p file_name in lt binary format. */
 
@@ -113,20 +102,20 @@ private:
 
   Alphabet alphabet;
   /** All non-multicharacter symbols. */
-  set<wchar_t> letters;
+  set<UChar> letters;
 
   /** Used in AttNode. */
   struct Transduction
   {
     int            to;
-    wstring        upper;
-    wstring        lower;
+    UnicodeString  upper;
+    UnicodeString  lower;
     int            tag;
     double         weight;
     TransducerType type;
 
-    Transduction(int to, wstring upper, wstring lower, int tag, double weight,
-                 TransducerType type=UNDECIDED) :
+    Transduction(int to, UnicodeString upper, UnicodeString lower, int tag,
+                 double weight, TransducerType type=UNDECIDED) :
       to(to), upper(upper), lower(lower), tag(tag), weight(weight), type(type) {}
   };
 
@@ -170,7 +159,7 @@ private:
    * Returns true for combining diacritics and modifier letters
    *
    */
-  bool is_word_punct(wchar_t symbol);
+  bool is_word_punct(UChar symbol);
 
   /**
    * Determines initial type of single transition
@@ -186,7 +175,7 @@ private:
    * @todo Are there other special symbols? If so, add them, and maybe use a map
    *       for conversion?
    */
-  void convert_hfst(wstring& symbol);
+  void convert_hfst(UnicodeString& symbol);
 
   /**
    * Returns the code of the symbol in the alphabet. Run after convert_hfst has
@@ -197,12 +186,15 @@ private:
    * @return the code of the symbol, if @p symbol is multichar; its first (and
    *         only) character otherwise.
    */
-  int symbol_code(const wstring& symbol);
+  int symbol_code(const UnicodeString& symbol);
 
   /**
-   * Finds whether an at&t file contains multiple FSTs or not
-  */
-  bool has_multiple_fsts(string const &file_name);
+   * Wrappers around ICU number parsing functions
+   */
+  NumberFormat* int_parser;
+  NumberFormat* float_parser;
+  int parse_state(const UnicodeString& s, int line);
+  double parse_weight(const UnicodeString& s, int line);
 };
 
 #endif /* _MYATT_COMPILER_ */
diff --git a/lttoolbox/compression.cc b/lttoolbox/compression.cc
index 0ba78b5..bcafbdc 100644
--- a/lttoolbox/compression.cc
+++ b/lttoolbox/compression.cc
@@ -254,7 +254,7 @@ Compression::multibyte_read(istream &input)
 
 
 void
-Compression::wstring_write(wstring const &str, FILE *output)
+Compression::string_write(UnicodeString const &str, FILE *output)
 {
   Compression::multibyte_write(str.size(), output);
   for(auto c : str)
@@ -264,38 +264,14 @@ Compression::wstring_write(wstring const &str, FILE *output)
 }
 
 wstring
-Compression::wstring_read(FILE *input)
-{
-  wstring retval = L"";
-
-  for(unsigned int i = 0, limit = Compression::multibyte_read(input);
-      i != limit; i++)
-  {
-    retval += static_cast<wchar_t>(Compression::multibyte_read(input));
-  }
-
-  return retval;
-}
-
-void
-Compression::string_write(string const &str, FILE *output)
-{
-  Compression::multibyte_write(str.size(), output);
-  for(auto c : str)
-  {
-    Compression::multibyte_write(static_cast<int>(c), output);
-  }
-}
-
-string
 Compression::string_read(FILE *input)
 {
-  string retval = "";
+  UnicodeString retval = "";
 
   for(unsigned int i = 0, limit = Compression::multibyte_read(input);
       i != limit; i++)
   {
-    retval += static_cast<char>(Compression::multibyte_read(input));
+    retval += static_cast<UChar>(Compression::multibyte_read(input));
   }
 
   return retval;
diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h
index 8af6cf9..80798b4 100644
--- a/lttoolbox/compression.h
+++ b/lttoolbox/compression.h
@@ -19,11 +19,12 @@
 
 #include <cstdio>
 #include <cstdint>
-#include <string>
+#include <unicode/unistr.h>
 #include <iostream>
 #include <stdexcept>
 
 using namespace std;
+using namespace icu;
 
 // Global lttoolbox features
 constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'};
@@ -174,23 +175,6 @@ public:
    */
   static unsigned int multibyte_read(istream &is);
 
-  /**
-   * This method allows to write a wide string to an output stream
-   * using its UCSencoding as integer.
-   * @see wstring_read()
-   * @param str the string to write.
-   * @param output the output stream.
-   */
-  static void wstring_write(wstring const &str, FILE *output);
-
-  /**
-   * This method reads a wide string from the input stream.
-   * @see wstring_write()
-   * @param input the input stream.
-   * @return the wide string read.
-   */
-  static wstring wstring_read(FILE *input);
-
   /**
    * This method allows to write a plain string to an output stream
    * using its UCSencoding as integer.
@@ -198,7 +182,7 @@ public:
    * @param str the string to write.
    * @param output the output stream.
    */
-  static void string_write(string const &str, FILE *output);
+  static void string_write(UnicodeString const &str, FILE *output);
 
   /**
    * This method reads a plain string from the input stream.
@@ -206,7 +190,7 @@ public:
    * @param input the input stream.
    * @return the string read.
    */
-  static string string_read(FILE *input);
+  static UnicodeString string_read(FILE *input);
 
   /**
    * Encodes a double value and writes it into the output stream
diff --git a/lttoolbox/deserialiser.h b/lttoolbox/deserialiser.h
index 4697640..c0c5c53 100644
--- a/lttoolbox/deserialiser.h
+++ b/lttoolbox/deserialiser.h
@@ -33,6 +33,8 @@
 #include <type_traits>
 #include <iterator>
 
+#include <unicode/unistr.h>
+
 template <typename DeserialisedType> class Deserialiser;
 
 template <typename value_type>
@@ -111,6 +113,13 @@ Deserialiser<std::basic_string<value_type> >::deserialise(
   return SerialisedType_;
 }
 
+template <>
+inline icu::UnicodeString  // inline: an explicit specialization defined in a header must not be emitted once per translation unit
+Deserialiser<icu::UnicodeString>::deserialise(std::istream &Stream_) {
+  std::string s = Deserialiser<std::string>::deserialise(Stream_);
+  return icu::UnicodeString::fromUTF8(s);
+}
+
 template <typename first_type, typename second_type>
 std::pair<first_type, second_type>
 Deserialiser<std::pair<first_type, second_type> >::deserialise(
diff --git a/lttoolbox/serialiser.h b/lttoolbox/serialiser.h
index 01abb3e..99f85b1 100644
--- a/lttoolbox/serialiser.h
+++ b/lttoolbox/serialiser.h
@@ -30,6 +30,8 @@
 #include <utility>
 #include <vector>
 
+#include <unicode/unistr.h>
+
 namespace {
 template <typename SerialisedType>
 static unsigned char compressedSize(const SerialisedType &SerialisedType_) {
@@ -143,6 +145,13 @@ void Serialiser<std::basic_string<value_type> >::serialise(
   }
 }
 
+template <>  // declared inline below: a header-defined explicit specialization is otherwise a multiple-definition error
+inline void Serialiser<icu::UnicodeString>::serialise(const icu::UnicodeString& s,
+                                                      std::ostream& Output) {
+  std::string temp;
+  ::serialise(s.toUTF8String(temp), Output);
+}
+
 template <typename first_type, typename second_type>
 void Serialiser<std::pair<first_type, second_type> >::serialise(
     const std::pair<first_type, second_type> &SerialisedType_,
diff --git a/lttoolbox/string_utils.cc b/lttoolbox/string_utils.cc
new file mode 100644
index 0000000..3ee4e94
--- /dev/null
+++ b/lttoolbox/string_utils.cc
@@ -0,0 +1,7 @@
+#include "string_utils.h"
+
+void
+u_fputs(const icu::UnicodeString str, UFILE* output)
+{
+  u_file_write(str.getBuffer(), str.length(), output);  // getTerminatedBuffer() is non-const and cannot be called on a const string
+}
diff --git a/lttoolbox/string_utils.h b/lttoolbox/string_utils.h
new file mode 100644
index 0000000..5cf6b9e
--- /dev/null
+++ b/lttoolbox/string_utils.h
@@ -0,0 +1,9 @@
+#ifndef _LT_STRING_UTILS_H_
+#define _LT_STRING_UTILS_H_
+
+#include <unicode/unistr.h>
+#include <unicode/ustdio.h>
+
+void u_fputs(const icu::UnicodeString str, UFILE* output);  // qualified: this header has no using-directive for icu
+
+#endif
\ No newline at end of file