commit e317a7884d83aa31bc91a5ca540651740c7f061b Author: Daniel Swanson Date: Fri Jun 4 17:36:25 2021 -0500 assorted nits diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index 5a7ec03..2f16c76 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -24,6 +24,7 @@ #include #include #include +#include using namespace std; using namespace icu; @@ -102,13 +103,8 @@ AttCompiler::symbol_code(const UString& symbol) } else if (symbol.empty()) { return 0; } else { - UChar32 c = symbol[0]; - if (symbol.size() > 1) { - // it's 2 UTF-16 code units, - // so combine them into a single UTF-32 codepoint - c = ((c - 0xD800) << 10) + 0x10000; - c += (symbol[1] - 0xDC00); - } + UChar32 c; + U16_GET(symbol, 0, 0, symbol.size(), c); if ((u_ispunct(c) || u_isspace(c)) && !is_word_punct(c)) { return c; } else { diff --git a/lttoolbox/compression.cc b/lttoolbox/compression.cc index 8b66be7..42c4d2b 100644 --- a/lttoolbox/compression.cc +++ b/lttoolbox/compression.cc @@ -258,14 +258,12 @@ Compression::multibyte_read(istream &input) void Compression::string_write(UString const &str, FILE *output) { - vector vec; - string temp; - utf8::utf16to8(str.begin(), str.end(), std::back_inserter(temp)); - utf8::utf8to32(temp.begin(), temp.end(), std::back_inserter(vec)); + vector vec; + ustring_to_vec32(str, vec); Compression::multibyte_write(vec.size(), output); for(auto c : vec) { - Compression::multibyte_write(static_cast(c), output); + Compression::multibyte_write(c, output); } } @@ -273,17 +271,12 @@ UString Compression::string_read(FILE *input) { UString retval; - std::vector vec; + unsigned int limit = Compression::multibyte_read(input); + retval.reserve(limit); - for(unsigned int i = 0, limit = Compression::multibyte_read(input); - i != limit; i++) - { - vec.push_back(static_cast(Compression::multibyte_read(input))); - //retval += static_cast(Compression::multibyte_read(input)); + for(unsigned int i = 0; i != limit; i++) { + retval += static_cast(Compression::multibyte_read(input)); } - string temp; - utf8::utf32to8(vec.begin(), vec.end(), std::back_inserter(temp)); - utf8::utf8to16(temp.begin(), temp.end(), std::back_inserter(retval)); return retval; } diff --git a/lttoolbox/entry_token.cc b/lttoolbox/entry_token.cc index da00e9a..f9401ab 100644 --- a/lttoolbox/entry_token.cc +++ b/lttoolbox/entry_token.cc @@ -79,8 +79,8 @@ EntryToken::setSingleTransduction(vector const &pi, vector const &pd, void EntryToken::setRegexp(UString const &r) { - //myregexp = r; - myregexp = vector(r.begin(), r.end()); + myregexp.clear(); + ustring_to_vec32(r, myregexp); type = regexp; } diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 99aa12f..1d21b76 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -153,7 +153,7 @@ FSTProcessor::procNodeICX() } else { - cerr << "Error in ICX UFILE (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } @@ -191,7 +191,7 @@ FSTProcessor::procNodeRCX() } else { - cerr << "Error in RCX UFILE (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } @@ -1110,7 +1110,7 @@ FSTProcessor::compoundAnalysis(UString input_word, bool uppercase, bool firstupp for(unsigned int i=0; i const &finals, result += it->first; if(display_weights) { - UChar* temp = new UChar[16]; + UChar temp[16]{}; // if anyone wants a weight of 10000, this will not be enough u_sprintf(temp, "", it->second); result += temp; - delete temp; } } diff --git a/lttoolbox/tmx_compiler.cc b/lttoolbox/tmx_compiler.cc index 5ca3b8d..db0394b 100644 --- a/lttoolbox/tmx_compiler.cc +++ b/lttoolbox/tmx_compiler.cc @@ -522,13 +522,12 @@ TMXCompiler::align(vector &origin, vector &meta) { modified_meta.push_back('@'); modified_meta.push_back('('); - UChar* valor = new UChar[8]; - u_snprintf(valor, 8, "%d", j+1); - for(int k = 0, limit3 = u_strlen(valor); k != limit3; k++) + UChar valor[8]{}; + int limit3 = u_snprintf(valor, 8, "%d", j+1); + for(int k = 0; k != limit3; k++) { modified_meta.push_back(valor[k]); } - delete[] valor; modified_meta.push_back(')'); i += nl-1; tocado = true; diff --git a/lttoolbox/ustring.cc b/lttoolbox/ustring.cc index b2081a3..b537a78 100644 --- a/lttoolbox/ustring.cc +++ b/lttoolbox/ustring.cc @@ -20,6 +20,7 @@ #include #include #include +#include using namespace icu; @@ -61,3 +62,20 @@ to_ustring(const char* s) utf8::utf8to16(s, s+sz, std::back_inserter(ret)); return ret; } + +void +ustring_to_vec32(const UString& str, std::vector& vec) +{ + if (str.empty()) { + return; + } + + size_t i = 0; + size_t len = str.size(); + vec.reserve(vec.size() + str.size()); + int32_t c; + while (i < str.size()) { + U16_NEXT(str, i, len, c); + vec.push_back(c); + } +} diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h index b808ed2..da4b3f6 100644 --- a/lttoolbox/ustring.h +++ b/lttoolbox/ustring.h @@ -18,9 +18,11 @@ #ifndef _LT_USTRING_H_ #define _LT_USTRING_H_ -#include #include #include +#include +#include +#include typedef std::basic_string UString; @@ -35,11 +37,13 @@ double stod(const UString& str); // for command-line arguments UString to_ustring(const char* str); -static std::ostream& operator<<(std::ostream& ostr, const UString& str) +// append UTF-16 string to UTF-32 vector of symbols +void ustring_to_vec32(const UString& str, std::vector& vec); + +inline std::ostream& operator<<(std::ostream& ostr, const UString& str) { std::string res; - icu::UnicodeString temp = str.c_str(); - temp.toUTF8String(res); + utf8::utf16to8(str.begin(), str.end(), std::back_inserter(res)); ostr << res; return ostr; } @@ -52,7 +56,7 @@ inline UString operator "" _u(const char* str, std::size_t len) { return us; } -static void operator+=(UString& str, UChar32 c) +inline void operator+=(UString& str, UChar32 c) { if (c <= 0xFFFF) { str += static_cast(c); diff --git a/python/lttoolbox.i.in b/python/lttoolbox.i.in index d3630c0..f57fe92 100644 --- a/python/lttoolbox.i.in +++ b/python/lttoolbox.i.in @@ -54,8 +54,8 @@ public: void lt_proc(int argc, char **argv, char *input_path, char *output_path) { - InputFile input; - input.open(input_path); + InputFile input; + input.open(input_path); UFILE* output = u_fopen(output_path, "w", NULL, NULL); int cmd = 0; int c = 0;