commit ac7867f9a3fbd34fc928468e0d5d9ea16d6e1e2b
Author: Daniel Swanson <popcorn.tomato.dude@gmail.com>
Date:   Thu Jun 3 11:14:15 2021 -0500

    use utf-32 sometimes and some type cleanup

diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc
index 9ac7cd1..a01524e 100644
--- a/lttoolbox/alphabet.cc
+++ b/lttoolbox/alphabet.cc
@@ -33,8 +33,8 @@ using namespace icu;
 
 Alphabet::Alphabet()
 {
-  spair[pair<int, int>(0,0)] = 0;
-  spairinv.push_back(pair<int, int>(0,0));
+  spair[pair<int32_t, int32_t>(0,0)] = 0; // a new alphabet starts with the (0,0) pair mapped to code 0
+  spairinv.push_back(pair<int32_t, int32_t>(0,0)); // inverse table: index 0 holds that same (0,0) pair
 }
 
 Alphabet::~Alphabet()
@@ -77,19 +77,19 @@ Alphabet::includeSymbol(UString const &s)
 {
   if(slexic.find(s) == slexic.end())
   {
-    int slexic_size = slexic.size();
+    int32_t slexic_size = slexic.size();
     slexic[s] = -(slexic_size+1);
     slexicinv.push_back(s);
   }
 }
 
-int
-Alphabet::operator()(int const c1, int const c2)
+int32_t
+Alphabet::operator()(int32_t const c1, int32_t const c2)
 {
   auto tmp = make_pair(c1, c2);
   if(spair.find(tmp) == spair.end())
   {
-    int spair_size = spair.size();
+    int32_t spair_size = spair.size();
     spair[tmp] = spair_size;
     spairinv.push_back(tmp);
   }
@@ -97,13 +97,13 @@ Alphabet::operator()(int const c1, int const c2)
   return spair[tmp];
 }
 
-int
+int32_t
 Alphabet::operator()(UString const &s)
 {
   return slexic[s];
 }
 
-int
+int32_t
 Alphabet::operator()(UString const &s) const
 {
   auto it = slexic.find(s);
@@ -119,7 +119,7 @@ Alphabet::isSymbolDefined(UString const &s)
   return slexic.find(s) != slexic.end();
 }
 
-int
+int32_t
 Alphabet::size() const
 {
   return slexic.size();
@@ -130,16 +130,16 @@ Alphabet::write(FILE *output)
 {
   // First, we write the taglist
   Compression::multibyte_write(slexicinv.size(), output);  // taglist size
-  for(unsigned int i = 0, limit = slexicinv.size(); i < limit; i++)
+  for(size_t i = 0, limit = slexicinv.size(); i < limit; i++)
   {
     Compression::string_write(slexicinv[i].substr(1, slexicinv[i].size()-2), output);
   }
 
   // Then we write the list of pairs
   // All numbers are biased + slexicinv.size() to be positive or zero
-  unsigned int bias = slexicinv.size();
+  size_t bias = slexicinv.size();
   Compression::multibyte_write(spairinv.size(), output);
-  for(unsigned int i = 0, limit = spairinv.size(); i != limit; i++)
+  for(size_t i = 0, limit = spairinv.size(); i != limit; i++)
   {
     Compression::multibyte_write(spairinv[i].first + bias, output);
     Compression::multibyte_write(spairinv[i].second + bias, output);
@@ -154,8 +154,8 @@ Alphabet::read(FILE *input)
   a_new.spair.clear();
 
   // Reading of taglist
-  int tam = Compression::multibyte_read(input);
-  map<int, string> tmp;
+  int32_t tam = Compression::multibyte_read(input);
+  map<int32_t, string> tmp;
   while(tam > 0)
   {
     tam--;
@@ -167,15 +167,15 @@ Alphabet::read(FILE *input)
   }
 
   // Reading of pairlist
-  unsigned int bias = a_new.slexicinv.size();
+  size_t bias = a_new.slexicinv.size();
   tam = Compression::multibyte_read(input);
   while(tam > 0)
   {
     tam--;
-    int first = Compression::multibyte_read(input);
-    int second = Compression::multibyte_read(input);
-    pair<int, int> tmp(first - bias, second - bias);
-    int spair_size = a_new.spair.size();
+    int32_t first = Compression::multibyte_read(input);
+    int32_t second = Compression::multibyte_read(input);
+    pair<int32_t, int32_t> tmp(first - bias, second - bias);
+    int32_t spair_size = a_new.spair.size();
     a_new.spair[tmp] = spair_size;
     a_new.spairinv.push_back(tmp);
   }
@@ -187,7 +187,7 @@ void
 Alphabet::serialise(std::ostream &serialised) const
 {
   Serialiser<const vector<UString> >::serialise(slexicinv, serialised);
-  Serialiser<vector<pair<int, int> > >::serialise(spairinv, serialised);
+  Serialiser<vector<pair<int32_t, int32_t> > >::serialise(spairinv, serialised);
 }
 
 void
@@ -201,14 +201,14 @@ Alphabet::deserialise(std::istream &serialised)
   for (size_t i = 0; i < slexicinv.size(); i++) {
     slexic[slexicinv[i]] = -i - 1; // ToDo: This does not turn the result negative due to unsigned semantics
   }
-  spairinv = Deserialiser<vector<pair<int, int> > >::deserialise(serialised);
+  spairinv = Deserialiser<vector<pair<int32_t, int32_t> > >::deserialise(serialised);
   for (size_t i = 0; i < slexicinv.size(); i++) {
     spair[spairinv[i]] = i;
   }
 }
 
 void
-Alphabet::writeSymbol(int const symbol, UFILE *output) const
+Alphabet::writeSymbol(int32_t const symbol, UFILE *output) const
 {
   if(symbol < 0)
   {
@@ -221,7 +221,7 @@ Alphabet::writeSymbol(int const symbol, UFILE *output) const
 }
 
 void
-Alphabet::getSymbol(UString &result, int const symbol, bool uppercase) const
+Alphabet::getSymbol(UString &result, int32_t const symbol, bool uppercase) const
 {
   if(symbol == 0)
   {
@@ -232,7 +232,7 @@ Alphabet::getSymbol(UString &result, int const symbol, bool uppercase) const
   {
     if(symbol >= 0)
     {
-      result += static_cast<UChar>(symbol);
+      result += static_cast<UChar32>(symbol);
     }
     else
     {
@@ -241,7 +241,7 @@ Alphabet::getSymbol(UString &result, int const symbol, bool uppercase) const
   }
   else if(symbol >= 0)
   {
-    result += u_toupper(static_cast<UChar>(symbol));
+    result += u_toupper(static_cast<UChar32>(symbol));
   }
   else
   {
@@ -250,20 +250,20 @@ Alphabet::getSymbol(UString &result, int const symbol, bool uppercase) const
 }
 
 bool
-Alphabet::isTag(int const symbol) const
+Alphabet::isTag(int32_t const symbol) const
 {
   return symbol < 0;
 }
 
-pair<int, int> const &
-Alphabet::decode(int const code) const
+pair<int32_t, int32_t> const &
+Alphabet::decode(int32_t const code) const
 {
   return spairinv[code];
 }
 
-set<int>
+set<int32_t>
 Alphabet::symbolsWhereLeftIs(UChar l) const {
-  set<int> eps;
+  set<int32_t> eps;
   for(const auto& sp: spair) {  // [(l, r) : tag]
     if(sp.first.first == l) {
       eps.insert(sp.second);
@@ -272,17 +272,17 @@ Alphabet::symbolsWhereLeftIs(UChar l) const {
   return eps;
 }
 
-void Alphabet::setSymbol(int symbol, UString newSymbolString) {
+void Alphabet::setSymbol(int32_t symbol, UString newSymbolString) {
   //Should be a special character!
   if (symbol < 0) slexicinv[-symbol-1] = newSymbolString;
 }
 
 void
-Alphabet::createLoopbackSymbols(set<int> &symbols, Alphabet &basis, Side s, bool nonTagsToo)
+Alphabet::createLoopbackSymbols(set<int32_t> &symbols, Alphabet &basis, Side s, bool nonTagsToo)
 {
-  // Non-tag letters get the same int in spairinv across alphabets,
+  // Non-tag letters get the same int32_t in spairinv across alphabets,
   // but tags may differ, so do those separately afterwards.
-  set<int> tags;
+  set<int32_t> tags;
   for(auto& it : basis.spairinv)
   {
     if(s == left) {
diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h
index 807b656..a300242 100644
--- a/lttoolbox/alphabet.h
+++ b/lttoolbox/alphabet.h
@@ -22,8 +22,7 @@
 #include <map>
 #include <set>
 #include <vector>
-#include <unicode/unistr.h>
-#include <unicode/ustdio.h>
+#include <cstdint>
 #include "ustring.h"
 
 using namespace std;
@@ -40,7 +39,7 @@ private:
    * Symbol-identifier relationship. Only contains <tags>.
    * @see slexicinv
    */
-  map<UString, int> slexic;
+  map<UString, int32_t> slexic;
 
   /**
    * Identifier-symbol relationship. Only contains <tags>.
@@ -54,13 +53,13 @@ private:
    * other characters are wchar_t's casted to ints.
    * @see spairinv
    */
-  map<pair<int,int>, int> spair;
+  map<pair<int32_t, int32_t>, int32_t> spair;
 
   /**
    * All symbol-pairs (both <tags> and letters).
    * @see spair
    */
-  vector<pair<int, int> > spairinv;
+  vector<pair<int32_t, int32_t> > spairinv;
 
 
   void copy(Alphabet const &a);
@@ -100,8 +99,8 @@ public:
    * @param c2 right symbol.
    * @return code for (c1, c2).
    */
-  int operator()(int const c1, int const c2);
-  int operator()(UString const &s) const;
+  int32_t operator()(int32_t const c1, int32_t const c2);
+  int32_t operator()(UString const &s) const;
 
   /**
    * Gets the individual symbol identifier. Assumes it already exists!
@@ -109,7 +108,7 @@ public:
    * @param s symbol to be identified.
    * @return symbol identifier.
    */
-  int operator()(UString const &s);
+  int32_t operator()(UString const &s);
 
   /**
    * Check wether the symbol is defined in the alphabet.
@@ -122,7 +121,7 @@ public:
    * Returns the size of the alphabet (number of symbols).
    * @return number of symbols.
    */
-  int size() const;
+  int32_t size() const;
 
   /**
    * Write method.
@@ -144,7 +143,7 @@ public:
    * @param symbol symbol code.
    * @param output output stream.
    */
-  void writeSymbol(int const symbol, UFILE *output) const;
+  void writeSymbol(int32_t const symbol, UFILE *output) const;
 
   /**
    * Concat a symbol in the string that is passed by reference.
@@ -152,7 +151,7 @@ public:
    * @param symbol code of the symbol
    * @param uppercase true if we want an uppercase symbol
    */
-  void getSymbol(UString &result, int const symbol,
+  void getSymbol(UString &result, int32_t const symbol,
 		 bool uppercase = false) const;
 
   /**
@@ -160,14 +159,14 @@ public:
    * @param symbol the code of the symbol
    * @return true if the symbol is a tag
    */
-  bool isTag(int const symbol) const;
+  bool isTag(int32_t const symbol) const;
 
   /**
    * Sets an already existing symbol to represent a new value.
    * @param symbol the code of the symbol to set
    * @param newSymbolString the new string for this symbol
    */
-  void setSymbol(int symbol, UString newSymbolString);
+  void setSymbol(int32_t symbol, UString newSymbolString);
 
   /**
    * Note: both the symbol int and int-pair are specific to this alphabet instance.
@@ -175,12 +174,12 @@ public:
    * @param code a symbol
    * @return the pair which code represents in this alphabet
    */
-  pair<int, int> const & decode(int const code) const;
+  pair<int32_t, int32_t> const & decode(int32_t const code) const;
 
   /**
    * Get all symbols where the left-hand side of the symbol-pair is l.
    */
-  set<int> symbolsWhereLeftIs(UChar l) const;
+  set<int32_t> symbolsWhereLeftIs(UChar l) const;
 
   enum Side
   {
@@ -197,7 +196,7 @@ public:
    * @param s whether to loopback on the left or right side of the symbol-pair
    * @param nonTagsToo by default only tags are included, but if this is true we include all symbols
    */
-  void createLoopbackSymbols(set<int> &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = false);
+  void createLoopbackSymbols(set<int32_t> &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = false);
 };
 
 #endif
diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc
index ee01b42..7e14624 100644
--- a/lttoolbox/att_compiler.cc
+++ b/lttoolbox/att_compiler.cc
@@ -25,6 +25,8 @@
 #include <unicode/unistr.h>
 #include <unicode/numfmt.h>
 #include <unicode/uchar.h>
+#include <unicode/ustring.h>
+#include <utf8.h>
 
 using namespace std;
 using namespace icu;
@@ -95,24 +97,31 @@ AttCompiler::is_word_punct(UChar symbol)
 int
 AttCompiler::symbol_code(const UString& symbol)
 {
-  if (symbol.length() > 1) {
+  if (u_strHasMoreChar32Than(symbol.c_str(), -1, 1)) { // more than one code point: register as a multichar symbol
     alphabet.includeSymbol(symbol);
     return alphabet(symbol);
   } else if (symbol.empty()) {
     return 0;
-  } else if ((u_ispunct(symbol[0]) || u_isspace(symbol[0])) && !is_word_punct(symbol[0])) {
-    return symbol[0];
   } else {
-    letters.insert(symbol[0]);
-    if(u_islower(symbol[0]))
-    {
-      letters.insert(u_toupper(symbol[0]));
+    UChar32 c = symbol[0];
+    // A lead+trail surrogate pair is a single supplementary code point.
+    // Combine the two UTF-16 units directly with ICU's U16_* macros rather
+    // than transcoding the string through UTF-8 into two temporary vectors;
+    // any other input keeps its first code unit as the symbol.
+    if (symbol.size() > 1 && U16_IS_LEAD(symbol[0]) && U16_IS_TRAIL(symbol[1])) {
+      c = U16_GET_SUPPLEMENTARY(symbol[0], symbol[1]);
     }
-    else if(u_isupper(symbol[0]))
-    {
-      letters.insert(u_tolower(symbol[0]));
+    if ((u_ispunct(c) || u_isspace(c)) && !is_word_punct(c)) {
+      return c; // punctuation/space that is not word punctuation: not added to letters
+    } else {
+      letters.insert(c);
+      if(u_islower(c)) { // also register the other-case form of a cased letter
+        letters.insert(u_toupper(c));
+      } else if(u_isupper(c)) {
+        letters.insert(u_tolower(c));
+      }
+      return c;
     }
-    return symbol[0];
   }
 }
 
@@ -138,7 +147,7 @@ AttCompiler::parse(string const &file_name, bool read_rl)
     tokens.clear();
     tokens.push_back(""_u);
     do {
-      UChar32 c = u_fgetcx(infile);
+      UChar c = u_fgetc(infile);
       if (c == '\n') {
         break;
       } else if (c == '\t') {
@@ -355,11 +364,12 @@ AttCompiler::_extract_transducer(TransducerType type, int from,
 void
 AttCompiler::classify_single_transition(Transduction& t)
 {
-  if (t.upper.length() == 1) {
-    if (letters.find(t.upper[0]) != letters.end()) {
+  int32_t sym = alphabet.decode(t.tag).first;
+  if (sym > 0) {
+    if (letters.find(sym) != letters.end()) {
       t.type |= WORD;
     }
-    if (u_ispunct(t.upper[0])) {
+    if (u_ispunct(sym)) {
       t.type |= PUNCT;
     }
   }
@@ -453,14 +463,14 @@ AttCompiler::write(FILE *output)
   Compression::string_write("main@standard"_u, output);
   Transducer word_fst = extract_transducer(WORD);
   word_fst.write(output);
-  wcout << L"main@standard" << " " << word_fst.size();
-  wcout << " " << word_fst.numberOfTransitions() << endl;
+  cout << "main@standard" << " " << word_fst.size();
+  cout << " " << word_fst.numberOfTransitions() << endl;
   Compression::string_write("final@inconditional"_u, output);
   if(punct_fst.numberOfTransitions() != 0)
   {
     punct_fst.write(output);
-    wcout << L"final@inconditional" << " " << punct_fst.size();
-    wcout << " " << punct_fst.numberOfTransitions() << endl;
+    cout << "final@inconditional" << " " << punct_fst.size();
+    cout << " " << punct_fst.numberOfTransitions() << endl;
   }
 //  fclose(output);
 }
diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc
index c60a6d2..849ac63 100644
--- a/lttoolbox/fst_processor.cc
+++ b/lttoolbox/fst_processor.cc
@@ -334,8 +334,8 @@ FSTProcessor::readAnalysis(InputFile& input)
     return input_buffer.next();
   }
 
-  UChar val = input.get();
-  int altval = 0;
+  UChar32 val = input.get();
+  int32_t altval = 0;
   if(input.eof())
   {
     input_buffer.add(0);        // so it's treated like the NUL byte
@@ -347,7 +347,7 @@ FSTProcessor::readAnalysis(InputFile& input)
   if((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end())
   {
     input_buffer.add(val);
-    val = static_cast<UChar>(input.get());
+    val = input.get();
   }
 
   if(escaped_chars.find(val) != escaped_chars.end())
@@ -355,12 +355,12 @@ FSTProcessor::readAnalysis(InputFile& input)
     switch(val)
     {
       case '<':
-        altval = static_cast<int>(alphabet(readFullBlock(input, '<', '>')));
+        altval = alphabet(readFullBlock(input, '<', '>'));
         input_buffer.add(altval);
         return altval;
 
       case '[':
-        val = static_cast<UChar>(input.get());
+        val = input.get();
 
         if(val == '[')
         {
@@ -372,12 +372,12 @@ FSTProcessor::readAnalysis(InputFile& input)
           blankqueue.push(readFullBlock(input, '[', ']'));
         }
 
-        input_buffer.add(static_cast<int>(' '));
-        return static_cast<int>(' ');
+        input_buffer.add(static_cast<int32_t>(' '));
+        return static_cast<int32_t>(' ');
 
       case '\\':
-        val = static_cast<UChar>(input.get());
-        input_buffer.add(static_cast<int>(val));
+        val = input.get();
+        input_buffer.add(static_cast<int32_t>(val));
         return val;
 
       default:
@@ -1223,7 +1223,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
   bool firstupper = false, uppercase = false;
   map<int, set<int> >::iterator rcx_map_ptr;
 
-  UChar val;
+  UChar32 val;
   do
   {
     val = readAnalysis(input);
diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h
index f4bd51e..5eb0a76 100644
--- a/lttoolbox/fst_processor.h
+++ b/lttoolbox/fst_processor.h
@@ -32,6 +32,7 @@
 #include <queue>
 #include <set>
 #include <string>
+#include <cstdint>
 
 using namespace std;
 
@@ -142,7 +143,7 @@ private:
   /**
    * Input buffer
    */
-  Buffer<int> input_buffer;
+  Buffer<int32_t> input_buffer;
 
   /**
    * Begin of the transducer
@@ -220,7 +221,7 @@ private:
   /**
    * Show or not the controls symbols (as compoundRSymbol)
    */
-   bool showControlSymbols;
+  bool showControlSymbols;
 
   /**
    * Max compound elements
diff --git a/lttoolbox/input_file.cc b/lttoolbox/input_file.cc
index 81ded8e..ac6eda5 100644
--- a/lttoolbox/input_file.cc
+++ b/lttoolbox/input_file.cc
@@ -79,17 +79,11 @@ InputFile::internal_read()
     break;
   }
   memset(ubuffer, 0, 3*sizeof(UChar));
-  utf8::utf8to16(cbuffer, cbuffer+i, ubuffer+1);
-  if (ubuffer[2]) {
-    ubuffer[0] = ubuffer[2];
-    buffer_size = 2;
-  } else {
-    ubuffer[0] = ubuffer[1];
-    buffer_size = 1;
-  }
+  utf8::utf8to32(cbuffer, cbuffer+i, ubuffer);
+  buffer_size = 1;
 }
 
-UChar
+UChar32
 InputFile::get()
 {
   if (!buffer_size) {
@@ -98,7 +92,7 @@ InputFile::get()
   return ubuffer[--buffer_size];
 }
 
-UChar
+UChar32
 InputFile::peek()
 {
   if (!buffer_size) {
@@ -108,7 +102,7 @@ InputFile::peek()
 }
 
 void
-InputFile::unget(UChar c)
+InputFile::unget(UChar32 c)
 {
   // this will probably segfault if called multiple times
   ubuffer[buffer_size++] = c;
diff --git a/lttoolbox/input_file.h b/lttoolbox/input_file.h
index c2d7c35..56608ca 100644
--- a/lttoolbox/input_file.h
+++ b/lttoolbox/input_file.h
@@ -8,7 +8,7 @@ class InputFile
 {
 private:
   FILE* infile;
-  UChar ubuffer[3];
+  UChar32 ubuffer[3];
   char cbuffer[4];
   int buffer_size;
   void internal_read();
@@ -17,9 +17,9 @@ public:
   ~InputFile();
   bool open(char* fname);
   void close();
-  UChar get();
-  UChar peek();
-  void unget(UChar c);
+  UChar32 get();
+  UChar32 peek();
+  void unget(UChar32 c);
   bool eof();
 };
 
diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h
index 435bb0a..907e5e9 100644
--- a/lttoolbox/ustring.h
+++ b/lttoolbox/ustring.h
@@ -40,4 +40,14 @@ inline UString operator "" _u(const char* str, std::size_t len) {
 	return us;
 }
 
+inline void operator+=(UString& str, UChar32 c) // append code point c to str, encoded as UTF-16
+{ // `inline` (not `static`): header-defined, so share one definition instead of a private copy per TU
+  if (c <= 0xFFFF) { // value fits in one UTF-16 code unit
+    str += static_cast<UChar>(c);
+  } else { // supplementary code point: lead surrogate first, then trail surrogate
+    str += static_cast<UChar>(0xD800 + ((c - 0x10000) >> 10));
+    str += static_cast<UChar>(0xDC00 + (c & 0x3FF));
+  }
+}
+
 #endif
diff --git a/tests/data/non-bmp.att b/tests/data/non-bmp.att
new file mode 100644
index 0000000..1a1f661
--- /dev/null
+++ b/tests/data/non-bmp.att
@@ -0,0 +1,34 @@
+0	1	𐅀	𐅀	0.000
+0	1	𐅁	𐅁	0.000
+0	1	𐅂	𐅂	0.000
+0	1	𐅃	𐅃	0.000
+0	1	𐅄	𐅄	0.000
+0	1	𐅅	𐅅	0.000
+0	1	𐅆	𐅆	0.000
+0	1	𐅇	𐅇	0.000
+0	1	𐅈	𐅈	0.000
+0	1	𐅉	𐅉	0.000
+0	1	𐅊	𐅊	0.000
+0	1	𐅋	𐅋	0.000
+0	1	𐅌	𐅌	0.000
+0	1	𐅍	𐅍	0.000
+0	1	𐅎	𐅎	0.000
+0	1	𐅏	𐅏	0.000
+1	1	𐅀	𐅀	0.000
+1	1	𐅁	𐅁	0.000
+1	1	𐅂	𐅂	0.000
+1	1	𐅃	𐅃	0.000
+1	1	𐅄	𐅄	0.000
+1	1	𐅅	𐅅	0.000
+1	1	𐅆	𐅆	0.000
+1	1	𐅇	𐅇	0.000
+1	1	𐅈	𐅈	0.000
+1	1	𐅉	𐅉	0.000
+1	1	𐅊	𐅊	0.000
+1	1	𐅋	𐅋	0.000
+1	1	𐅌	𐅌	0.000
+1	1	𐅍	𐅍	0.000
+1	1	𐅎	𐅎	0.000
+1	1	𐅏	𐅏	0.000
+1	2	@0@	<num>	0.000
+2	0.000
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py
index cfbda32..44c7e13 100644
--- a/tests/lt_proc/__init__.py
+++ b/tests/lt_proc/__init__.py
@@ -220,11 +220,16 @@ class SpaceAtEOF(ProcTest):
     flushing = False
 
 
-class NonBMPTest(ProcTest):
+class NonBMPDixTest(ProcTest):
 	procdix = "data/non-bmp.dix"
 	inputs = ['𐅁𐅃𐅅', '𐅂𐅄𐅆']
 	expectedOutputs = ['^𐅁𐅃𐅅/𐅁𐅃𐅅<num>$', '^𐅂𐅄𐅆/𐅂𐅄𐅆<num>$']
 
 
+class NonBMPATTTest(ProcTest):
+	procdix = "data/non-bmp.att"
+	inputs = ['𐅁𐅃𐅅', '𐅂𐅄𐅆']
+	expectedOutputs = ['^𐅁𐅃𐅅/𐅁𐅃𐅅<num>$', '^𐅂𐅄𐅆/𐅂𐅄𐅆<num>$']
+
 # These fail on some systems:
 #from null_flush_invalid_stream_format import *