commit ac7867f9a3fbd34fc928468e0d5d9ea16d6e1e2b Author: Daniel Swanson Date: Thu Jun 3 11:14:15 2021 -0500 use utf-32 sometimes and some type cleanup diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc index 9ac7cd1..a01524e 100644 --- a/lttoolbox/alphabet.cc +++ b/lttoolbox/alphabet.cc @@ -33,8 +33,8 @@ using namespace icu; Alphabet::Alphabet() { - spair[pair(0,0)] = 0; - spairinv.push_back(pair(0,0)); + spair[pair(0,0)] = 0; + spairinv.push_back(pair(0,0)); } Alphabet::~Alphabet() @@ -77,19 +77,19 @@ Alphabet::includeSymbol(UString const &s) { if(slexic.find(s) == slexic.end()) { - int slexic_size = slexic.size(); + int32_t slexic_size = slexic.size(); slexic[s] = -(slexic_size+1); slexicinv.push_back(s); } } -int -Alphabet::operator()(int const c1, int const c2) +int32_t +Alphabet::operator()(int32_t const c1, int32_t const c2) { auto tmp = make_pair(c1, c2); if(spair.find(tmp) == spair.end()) { - int spair_size = spair.size(); + int32_t spair_size = spair.size(); spair[tmp] = spair_size; spairinv.push_back(tmp); } @@ -97,13 +97,13 @@ Alphabet::operator()(int const c1, int const c2) return spair[tmp]; } -int +int32_t Alphabet::operator()(UString const &s) { return slexic[s]; } -int +int32_t Alphabet::operator()(UString const &s) const { auto it = slexic.find(s); @@ -119,7 +119,7 @@ Alphabet::isSymbolDefined(UString const &s) return slexic.find(s) != slexic.end(); } -int +int32_t Alphabet::size() const { return slexic.size(); @@ -130,16 +130,16 @@ Alphabet::write(FILE *output) { // First, we write the taglist Compression::multibyte_write(slexicinv.size(), output); // taglist size - for(unsigned int i = 0, limit = slexicinv.size(); i < limit; i++) + for(size_t i = 0, limit = slexicinv.size(); i < limit; i++) { Compression::string_write(slexicinv[i].substr(1, slexicinv[i].size()-2), output); } // Then we write the list of pairs // All numbers are biased + slexicinv.size() to be positive or zero - unsigned int bias = slexicinv.size(); + size_t bias = slexicinv.size(); Compression::multibyte_write(spairinv.size(), output); - for(unsigned int i = 0, limit = spairinv.size(); i != limit; i++) + for(size_t i = 0, limit = spairinv.size(); i != limit; i++) { Compression::multibyte_write(spairinv[i].first + bias, output); Compression::multibyte_write(spairinv[i].second + bias, output); @@ -154,8 +154,8 @@ Alphabet::read(FILE *input) a_new.spair.clear(); // Reading of taglist - int tam = Compression::multibyte_read(input); - map tmp; + int32_t tam = Compression::multibyte_read(input); + map tmp; while(tam > 0) { tam--; @@ -167,15 +167,15 @@ Alphabet::read(FILE *input) } // Reading of pairlist - unsigned int bias = a_new.slexicinv.size(); + size_t bias = a_new.slexicinv.size(); tam = Compression::multibyte_read(input); while(tam > 0) { tam--; - int first = Compression::multibyte_read(input); - int second = Compression::multibyte_read(input); - pair tmp(first - bias, second - bias); - int spair_size = a_new.spair.size(); + int32_t first = Compression::multibyte_read(input); + int32_t second = Compression::multibyte_read(input); + pair tmp(first - bias, second - bias); + int32_t spair_size = a_new.spair.size(); a_new.spair[tmp] = spair_size; a_new.spairinv.push_back(tmp); } @@ -187,7 +187,7 @@ void Alphabet::serialise(std::ostream &serialised) const { Serialiser >::serialise(slexicinv, serialised); - Serialiser > >::serialise(spairinv, serialised); + Serialiser > >::serialise(spairinv, serialised); } void @@ -201,14 +201,14 @@ Alphabet::deserialise(std::istream &serialised) for (size_t i = 0; i < slexicinv.size(); i++) { slexic[slexicinv[i]] = -i - 1; // ToDo: This does not turn the result negative due to unsigned semantics } - spairinv = Deserialiser > >::deserialise(serialised); + spairinv = Deserialiser > >::deserialise(serialised); for (size_t i = 0; i < slexicinv.size(); i++) { spair[spairinv[i]] = i; } } void -Alphabet::writeSymbol(int const symbol, UFILE *output) const +Alphabet::writeSymbol(int32_t const symbol, UFILE *output) const { if(symbol < 0) { @@ -221,7 +221,7 @@ Alphabet::writeSymbol(int const symbol, UFILE *output) const } void -Alphabet::getSymbol(UString &result, int const symbol, bool uppercase) const +Alphabet::getSymbol(UString &result, int32_t const symbol, bool uppercase) const { if(symbol == 0) { @@ -232,7 +232,7 @@ Alphabet::getSymbol(UString &result, int const symbol, bool uppercase) const { if(symbol >= 0) { - result += static_cast(symbol); + result += static_cast(symbol); } else { @@ -241,7 +241,7 @@ Alphabet::getSymbol(UString &result, int const symbol, bool uppercase) const } else if(symbol >= 0) { - result += u_toupper(static_cast(symbol)); + result += u_toupper(static_cast(symbol)); } else { @@ -250,20 +250,20 @@ Alphabet::getSymbol(UString &result, int const symbol, bool uppercase) const } bool -Alphabet::isTag(int const symbol) const +Alphabet::isTag(int32_t const symbol) const { return symbol < 0; } -pair const & -Alphabet::decode(int const code) const +pair const & +Alphabet::decode(int32_t const code) const { return spairinv[code]; } -set +set Alphabet::symbolsWhereLeftIs(UChar l) const { - set eps; + set eps; for(const auto& sp: spair) { // [(l, r) : tag] if(sp.first.first == l) { eps.insert(sp.second); @@ -272,17 +272,17 @@ Alphabet::symbolsWhereLeftIs(UChar l) const { return eps; } -void Alphabet::setSymbol(int symbol, UString newSymbolString) { +void Alphabet::setSymbol(int32_t symbol, UString newSymbolString) { //Should be a special character! if (symbol < 0) slexicinv[-symbol-1] = newSymbolString; } void -Alphabet::createLoopbackSymbols(set &symbols, Alphabet &basis, Side s, bool nonTagsToo) +Alphabet::createLoopbackSymbols(set &symbols, Alphabet &basis, Side s, bool nonTagsToo) { - // Non-tag letters get the same int in spairinv across alphabets, + // Non-tag letters get the same int32_t in spairinv across alphabets, // but tags may differ, so do those separately afterwards. - set tags; + set tags; for(auto& it : basis.spairinv) { if(s == left) { diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index 807b656..a300242 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -22,8 +22,7 @@ #include #include #include -#include -#include +#include #include "ustring.h" using namespace std; @@ -40,7 +39,7 @@ private: * Symbol-identifier relationship. Only contains . * @see slexicinv */ - map slexic; + map slexic; /** * Identifier-symbol relationship. Only contains . @@ -54,13 +53,13 @@ private: * other characters are wchar_t's casted to ints. * @see spairinv */ - map, int> spair; + map, int32_t> spair; /** * All symbol-pairs (both and letters). * @see spair */ - vector > spairinv; + vector > spairinv; void copy(Alphabet const &a); @@ -100,8 +99,8 @@ public: * @param c2 right symbol. * @return code for (c1, c2). */ - int operator()(int const c1, int const c2); - int operator()(UString const &s) const; + int32_t operator()(int32_t const c1, int32_t const c2); + int32_t operator()(UString const &s) const; /** * Gets the individual symbol identifier. Assumes it already exists! @@ -109,7 +108,7 @@ public: * @param s symbol to be identified. * @return symbol identifier. */ - int operator()(UString const &s); + int32_t operator()(UString const &s); /** * Check wether the symbol is defined in the alphabet. @@ -122,7 +121,7 @@ public: * Returns the size of the alphabet (number of symbols). * @return number of symbols. */ - int size() const; + int32_t size() const; /** * Write method. @@ -144,7 +143,7 @@ public: * @param symbol symbol code. * @param output output stream. */ - void writeSymbol(int const symbol, UFILE *output) const; + void writeSymbol(int32_t const symbol, UFILE *output) const; /** * Concat a symbol in the string that is passed by reference. @@ -152,7 +151,7 @@ public: * @param symbol code of the symbol * @param uppercase true if we want an uppercase symbol */ - void getSymbol(UString &result, int const symbol, + void getSymbol(UString &result, int32_t const symbol, bool uppercase = false) const; /** @@ -160,14 +159,14 @@ public: * @param symbol the code of the symbol * @return true if the symbol is a tag */ - bool isTag(int const symbol) const; + bool isTag(int32_t const symbol) const; /** * Sets an already existing symbol to represent a new value. * @param symbol the code of the symbol to set * @param newSymbolString the new string for this symbol */ - void setSymbol(int symbol, UString newSymbolString); + void setSymbol(int32_t symbol, UString newSymbolString); /** * Note: both the symbol int and int-pair are specific to this alphabet instance. @@ -175,12 +174,12 @@ public: * @param code a symbol * @return the pair which code represents in this alphabet */ - pair const & decode(int const code) const; + pair const & decode(int32_t const code) const; /** * Get all symbols where the left-hand side of the symbol-pair is l. */ - set symbolsWhereLeftIs(UChar l) const; + set symbolsWhereLeftIs(UChar l) const; enum Side { @@ -197,7 +196,7 @@ public: * @param s whether to loopback on the left or right side of the symbol-pair * @param nonTagsToo by default only tags are included, but if this is true we include all symbols */ - void createLoopbackSymbols(set &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = false); + void createLoopbackSymbols(set &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = false); }; #endif diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index ee01b42..7e14624 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -25,6 +25,8 @@ #include #include #include +#include +#include using namespace std; using namespace icu; @@ -95,24 +97,31 @@ AttCompiler::is_word_punct(UChar symbol) int AttCompiler::symbol_code(const UString& symbol) { - if (symbol.length() > 1) { + if (u_strHasMoreChar32Than(symbol.c_str(), -1, 1)) { alphabet.includeSymbol(symbol); return alphabet(symbol); } else if (symbol.empty()) { return 0; - } else if ((u_ispunct(symbol[0]) || u_isspace(symbol[0])) && !is_word_punct(symbol[0])) { - return symbol[0]; } else { - letters.insert(symbol[0]); - if(u_islower(symbol[0])) - { - letters.insert(u_toupper(symbol[0])); + UChar32 c = symbol[0]; + if (symbol.size() > 1) { + vector v8; + vector v32; + utf8::utf16to8(symbol.begin(), symbol.end(), std::back_inserter(v8)); + utf8::utf8to32(v8.begin(), v8.end(), std::back_inserter(v32)); + c = v32[0]; } - else if(u_isupper(symbol[0])) - { - letters.insert(u_tolower(symbol[0])); + if ((u_ispunct(c) || u_isspace(c)) && !is_word_punct(c)) { + return c; + } else { + letters.insert(c); + if(u_islower(c)) { + letters.insert(u_toupper(c)); + } else if(u_isupper(c)) { + letters.insert(u_tolower(c)); + } + return c; } - return symbol[0]; } } @@ -138,7 +147,7 @@ AttCompiler::parse(string const &file_name, bool read_rl) tokens.clear(); tokens.push_back(""_u); do { - UChar32 c = u_fgetcx(infile); + UChar c = u_fgetc(infile); if (c == '\n') { break; } else if (c == '\t') { @@ -355,11 +364,12 @@ AttCompiler::_extract_transducer(TransducerType type, int from, void AttCompiler::classify_single_transition(Transduction& t) { - if (t.upper.length() == 1) { - if (letters.find(t.upper[0]) != letters.end()) { + int32_t sym = alphabet.decode(t.tag).first; + if (sym > 0) { + if (letters.find(sym) != letters.end()) { t.type |= WORD; } - if (u_ispunct(t.upper[0])) { + if (u_ispunct(sym)) { t.type |= PUNCT; } } @@ -453,14 +463,14 @@ AttCompiler::write(FILE *output) Compression::string_write("main@standard"_u, output); Transducer word_fst = extract_transducer(WORD); word_fst.write(output); - wcout << L"main@standard" << " " << word_fst.size(); - wcout << " " << word_fst.numberOfTransitions() << endl; + cout << "main@standard" << " " << word_fst.size(); + cout << " " << word_fst.numberOfTransitions() << endl; Compression::string_write("final@inconditional"_u, output); if(punct_fst.numberOfTransitions() != 0) { punct_fst.write(output); - wcout << L"final@inconditional" << " " << punct_fst.size(); - wcout << " " << punct_fst.numberOfTransitions() << endl; + cout << "final@inconditional" << " " << punct_fst.size(); + cout << " " << punct_fst.numberOfTransitions() << endl; } // fclose(output); } diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index c60a6d2..849ac63 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -334,8 +334,8 @@ FSTProcessor::readAnalysis(InputFile& input) return input_buffer.next(); } - UChar val = input.get(); - int altval = 0; + UChar32 val = input.get(); + int32_t altval = 0; if(input.eof()) { input_buffer.add(0); // so it's treated like the NUL byte @@ -347,7 +347,7 @@ FSTProcessor::readAnalysis(InputFile& input) if((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end()) { input_buffer.add(val); - val = static_cast(input.get()); + val = input.get(); } if(escaped_chars.find(val) != escaped_chars.end()) @@ -355,12 +355,12 @@ FSTProcessor::readAnalysis(InputFile& input) switch(val) { case '<': - altval = static_cast(alphabet(readFullBlock(input, '<', '>'))); + altval = alphabet(readFullBlock(input, '<', '>')); input_buffer.add(altval); return altval; case '[': - val = static_cast(input.get()); + val = input.get(); if(val == '[') { @@ -372,12 +372,12 @@ FSTProcessor::readAnalysis(InputFile& input) blankqueue.push(readFullBlock(input, '[', ']')); } - input_buffer.add(static_cast(' ')); - return static_cast(' '); + input_buffer.add(static_cast(' ')); + return static_cast(' '); case '\\': - val = static_cast(input.get()); - input_buffer.add(static_cast(val)); + val = input.get(); + input_buffer.add(static_cast(val)); return val; default: @@ -1223,7 +1223,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output) bool firstupper = false, uppercase = false; map >::iterator rcx_map_ptr; - UChar val; + UChar32 val; do { val = readAnalysis(input); diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index f4bd51e..5eb0a76 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -32,6 +32,7 @@ #include #include #include +#include using namespace std; @@ -142,7 +143,7 @@ private: /** * Input buffer */ - Buffer input_buffer; + Buffer input_buffer; /** * Begin of the transducer @@ -220,7 +221,7 @@ private: /** * Show or not the controls symbols (as compoundRSymbol) */ - bool showControlSymbols; + bool showControlSymbols; /** * Max compound elements diff --git a/lttoolbox/input_file.cc b/lttoolbox/input_file.cc index 81ded8e..ac6eda5 100644 --- a/lttoolbox/input_file.cc +++ b/lttoolbox/input_file.cc @@ -79,17 +79,11 @@ InputFile::internal_read() break; } memset(ubuffer, 0, 3*sizeof(UChar)); - utf8::utf8to16(cbuffer, cbuffer+i, ubuffer+1); - if (ubuffer[2]) { - ubuffer[0] = ubuffer[2]; - buffer_size = 2; - } else { - ubuffer[0] = ubuffer[1]; - buffer_size = 1; - } + utf8::utf8to32(cbuffer, cbuffer+i, ubuffer); + buffer_size = 1; } -UChar +UChar32 InputFile::get() { if (!buffer_size) { @@ -98,7 +92,7 @@ InputFile::get() return ubuffer[--buffer_size]; } -UChar +UChar32 InputFile::peek() { if (!buffer_size) { @@ -108,7 +102,7 @@ InputFile::peek() } void -InputFile::unget(UChar c) +InputFile::unget(UChar32 c) { // this will probably segfault if called multiple times ubuffer[buffer_size++] = c; diff --git a/lttoolbox/input_file.h b/lttoolbox/input_file.h index c2d7c35..56608ca 100644 --- a/lttoolbox/input_file.h +++ b/lttoolbox/input_file.h @@ -8,7 +8,7 @@ class InputFile { private: FILE* infile; - UChar ubuffer[3]; + UChar32 ubuffer[3]; char cbuffer[4]; int buffer_size; void internal_read(); @@ -17,9 +17,9 @@ public: ~InputFile(); bool open(char* fname); void close(); - UChar get(); - UChar peek(); - void unget(UChar c); + UChar32 get(); + UChar32 peek(); + void unget(UChar32 c); bool eof(); }; diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h index 435bb0a..907e5e9 100644 --- a/lttoolbox/ustring.h +++ b/lttoolbox/ustring.h @@ -40,4 +40,14 @@ inline UString operator "" _u(const char* str, std::size_t len) { return us; } +static void operator+=(UString& str, UChar32 c) +{ + if (c <= 0xFFFF) { + str += static_cast(c); + } else { + str += static_cast(0xD800 + ((c - 0x10000) >> 10)); + str += static_cast(0xDC00 + (c & 0x3FF)); + } +} + #endif diff --git a/tests/data/non-bmp.att b/tests/data/non-bmp.att new file mode 100644 index 0000000..1a1f661 --- /dev/null +++ b/tests/data/non-bmp.att @@ -0,0 +1,34 @@ +0 1 𐅀 𐅀 0.000 +0 1 𐅁 𐅁 0.000 +0 1 𐅂 𐅂 0.000 +0 1 𐅃 𐅃 0.000 +0 1 𐅄 𐅄 0.000 +0 1 𐅅 𐅅 0.000 +0 1 𐅆 𐅆 0.000 +0 1 𐅇 𐅇 0.000 +0 1 𐅈 𐅈 0.000 +0 1 𐅉 𐅉 0.000 +0 1 𐅊 𐅊 0.000 +0 1 𐅋 𐅋 0.000 +0 1 𐅌 𐅌 0.000 +0 1 𐅍 𐅍 0.000 +0 1 𐅎 𐅎 0.000 +0 1 𐅏 𐅏 0.000 +1 1 𐅀 𐅀 0.000 +1 1 𐅁 𐅁 0.000 +1 1 𐅂 𐅂 0.000 +1 1 𐅃 𐅃 0.000 +1 1 𐅄 𐅄 0.000 +1 1 𐅅 𐅅 0.000 +1 1 𐅆 𐅆 0.000 +1 1 𐅇 𐅇 0.000 +1 1 𐅈 𐅈 0.000 +1 1 𐅉 𐅉 0.000 +1 1 𐅊 𐅊 0.000 +1 1 𐅋 𐅋 0.000 +1 1 𐅌 𐅌 0.000 +1 1 𐅍 𐅍 0.000 +1 1 𐅎 𐅎 0.000 +1 1 𐅏 𐅏 0.000 +1 2 @0@ 0.000 +2 0.000 diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index cfbda32..44c7e13 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -220,11 +220,16 @@ class SpaceAtEOF(ProcTest): flushing = False -class NonBMPTest(ProcTest): +class NonBMPDixTest(ProcTest): procdix = "data/non-bmp.dix" inputs = ['𐅁𐅃𐅅', '𐅂𐅄𐅆'] expectedOutputs = ['^𐅁𐅃𐅅/𐅁𐅃𐅅$', '^𐅂𐅄𐅆/𐅂𐅄𐅆$'] +class NonBMPATTTest(ProcTest): + procdix = "data/non-bmp.att" + inputs = ['𐅁𐅃𐅅', '𐅂𐅄𐅆'] + expectedOutputs = ['^𐅁𐅃𐅅/𐅁𐅃𐅅$', '^𐅂𐅄𐅆/𐅂𐅄𐅆$'] + # These fail on some systems: #from null_flush_invalid_stream_format import *