commit ad15367786575821a004efe19854b4f539074aaa Author: Daniel Swanson Date: Thu May 27 18:46:38 2021 -0500 the long march part 1 diff --git a/.gitignore b/.gitignore index 6972eaf..95e0253 100644 --- a/.gitignore +++ b/.gitignore @@ -80,3 +80,5 @@ *.egg-info/ *.egg **/.mypy_cache/ + +*~ diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 501b04a..6ade398 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -3,12 +3,13 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h entry_token.h expander.h fst_processor.h lt_locale.h \ ltstr.h match_exe.h match_node.h match_state.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h \ + string_utils.h \ transducer.h trans_exe.h xml_parse_util.h exception.h tmx_compiler.h \ string_to_wostream.h cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc node.cc pattern_list.cc \ - regexp_compiler.cc sorted_vector.cc state.cc transducer.cc \ + regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \ trans_exe.cc xml_parse_util.cc tmx_compiler.cc library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc index 6a47095..122e0e3 100644 --- a/lttoolbox/alphabet.cc +++ b/lttoolbox/alphabet.cc @@ -26,11 +26,10 @@ #include #include -#if defined(_WIN32) && !defined(_MSC_VER) -#include -#endif +#include "string_utils.h" using namespace std; +using namespace icu; Alphabet::Alphabet() { @@ -74,7 +73,7 @@ Alphabet::copy(Alphabet const &a) } void -Alphabet::includeSymbol(wstring const &s) +Alphabet::includeSymbol(UnicodeString const &s) { if(slexic.find(s) == slexic.end()) { @@ -99,13 +98,13 @@ Alphabet::operator()(int const c1, int const c2) } int -Alphabet::operator()(wstring const &s) +Alphabet::operator()(UnicodeString const &s) { return slexic[s]; } int -Alphabet::operator()(wstring const &s) const +Alphabet::operator()(UnicodeString const &s) const { auto it = slexic.find(s); if (it == slexic.end()) { @@ -115,7 +114,7 @@ Alphabet::operator()(wstring const &s) const } bool -Alphabet::isSymbolDefined(wstring const &s) +Alphabet::isSymbolDefined(UnicodeString const &s) { return slexic.find(s) != slexic.end(); } @@ -133,7 +132,7 @@ Alphabet::write(FILE *output) Compression::multibyte_write(slexicinv.size(), output); // taglist size for(unsigned int i = 0, limit = slexicinv.size(); i < limit; i++) { - Compression::wstring_write(slexicinv[i].substr(1, slexicinv[i].size()-2), output); + Compression::string_write(slexicinv[i].tempSubString(1, slexicinv[i].length()-2), output); } // Then we write the list of pairs @@ -160,7 +159,7 @@ Alphabet::read(FILE *input) while(tam > 0) { tam--; - wstring mytag = L"<" + Compression::wstring_read(input) + L">"; + UnicodeString mytag = "<" + Compression::string_read(input) + ">"; a_new.slexicinv.push_back(mytag); a_new.slexic[mytag]= -a_new.slexicinv.size(); // ToDo: This does not turn the result negative due to unsigned semantics } @@ -185,7 +184,7 @@ Alphabet::read(FILE *input) void Alphabet::serialise(std::ostream &serialised) const { - Serialiser >::serialise(slexicinv, serialised); + Serialiser >::serialise(slexicinv, serialised); Serialiser > >::serialise(spairinv, serialised); } @@ -196,7 +195,7 @@ Alphabet::deserialise(std::istream &serialised) slexic.clear(); spairinv.clear(); spair.clear(); - slexicinv = Deserialiser >::deserialise(serialised); + slexicinv = Deserialiser >::deserialise(serialised); for (size_t i = 0; i < slexicinv.size(); i++) { slexic[slexicinv[i]] = -i - 1; // ToDo: This does not turn the result negative due to unsigned semantics } @@ -207,20 +206,20 @@ Alphabet::deserialise(std::istream &serialised) } void -Alphabet::writeSymbol(int const symbol, FILE *output) const +Alphabet::writeSymbol(int const symbol, UFILE *output) const { if(symbol < 0) { - fputws_unlocked(slexicinv[-symbol-1].c_str(), output); + u_fputs(slexicinv[-symbol-1], output); } else { - fputwc_unlocked(static_cast(symbol), output); + u_fputc(static_cast(symbol), output); } } void -Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const +Alphabet::getSymbol(UnicodeString &result, int const symbol, bool uppercase) const { if(symbol == 0) { @@ -231,7 +230,7 @@ Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const { if(symbol >= 0) { - result += static_cast(symbol); + result += static_cast(symbol); } else { @@ -240,7 +239,7 @@ Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const } else if(symbol >= 0) { - result += static_cast(towupper(static_cast(symbol))); + result += static_cast(toupper(static_cast(symbol))); } else { @@ -261,7 +260,7 @@ Alphabet::decode(int const code) const } set -Alphabet::symbolsWhereLeftIs(wchar_t l) const { +Alphabet::symbolsWhereLeftIs(UChar l) const { set eps; for(const auto& sp: spair) { // [(l, r) : tag] if(sp.first.first == l) { @@ -271,7 +270,7 @@ Alphabet::symbolsWhereLeftIs(wchar_t l) const { return eps; } -void Alphabet::setSymbol(int symbol, wstring newSymbolString) { +void Alphabet::setSymbol(int symbol, UnicodeString newSymbolString) { //Should be a special character! if (symbol < 0) slexicinv[-symbol-1] = newSymbolString; } diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index 3218334..9d59da1 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -22,10 +22,11 @@ #include #include #include - -#include +#include +#include using namespace std; +using namespace icu; /** * Alphabet class. @@ -38,13 +39,13 @@ private: * Symbol-identifier relationship. Only contains . * @see slexicinv */ - map slexic; + map slexic; /** * Identifier-symbol relationship. Only contains . * @see slexic */ - vector slexicinv; + vector slexicinv; /** @@ -89,7 +90,7 @@ public: /** * Include a symbol into the alphabet. */ - void includeSymbol(wstring const &s); + void includeSymbol(UnicodeString const &s); /** * Get an unique code for every symbol pair. This flavour is for @@ -99,7 +100,7 @@ public: * @return code for (c1, c2). */ int operator()(int const c1, int const c2); - int operator()(wstring const &s) const; + int operator()(UnicodeString const &s) const; /** * Gets the individual symbol identifier. Assumes it already exists! @@ -107,14 +108,14 @@ public: * @param s symbol to be identified. * @return symbol identifier. */ - int operator()(wstring const &s); + int operator()(UnicodeString const &s); /** * Check wether the symbol is defined in the alphabet. * @param s symbol * @return true if defined */ - bool isSymbolDefined(wstring const &s); + bool isSymbolDefined(UnicodeString const &s); /** * Returns the size of the alphabet (number of symbols). @@ -142,7 +143,7 @@ public: * @param symbol symbol code. * @param output output stream. */ - void writeSymbol(int const symbol, FILE *output) const; + void writeSymbol(int const symbol, UFILE *output) const; /** * Concat a symbol in the string that is passed by reference. @@ -150,7 +151,7 @@ public: * @param symbol code of the symbol * @param uppercase true if we want an uppercase symbol */ - void getSymbol(wstring &result, int const symbol, + void getSymbol(UnicodeString &result, int const symbol, bool uppercase = false) const; /** @@ -165,7 +166,7 @@ public: * @param symbol the code of the symbol to set * @param newSymbolString the new string for this symbol */ - void setSymbol(int symbol, wstring newSymbolString); + void setSymbol(int symbol, UnicodeString newSymbolString); /** * Note: both the symbol int and int-pair are specific to this alphabet instance. @@ -178,7 +179,7 @@ public: /** * Get all symbols where the left-hand side of the symbol-pair is l. */ - set symbolsWhereLeftIs(wchar_t l) const; + set symbolsWhereLeftIs(UChar l) const; enum Side { diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index a511f5a..6d33b96 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -22,17 +22,30 @@ #include #include #include +#include +#include using namespace std; +using namespace icu; AttCompiler::AttCompiler() : starting_state(0), default_weight(0.0000) { + UErrorCode status = U_ZERO_ERROR; + int_parser = NumberFormat::createInstance(status); + int_parser->setParseIntergerOnly(true); + float_parser = NumberFormat::createInstance(status); + if (status != U_ZERO_ERROR) { + cerr << "Error: unable to set up numeric converter." << endl; + exit(EXIT_FAILURE); + } } AttCompiler::~AttCompiler() { + delete int_parser; + delete float_parser; } void @@ -46,26 +59,52 @@ AttCompiler::clear() alphabet = Alphabet(); } +int +AttCompiler::parse_state(const UnicodeString& s, int line) +{ + UErrorCode status = U_ZERO_ERROR; + Formattable result; + int_parser->parse(s, result, status); + if (status != U_ZERO_ERROR) { + cerr << "ERROR: Unable to parse state number on line " << line << "." << endl; + // TODO: error messages should also print file names + } + return result.getLong(); +} + +double +AttCompiler::parse_weight(const UnicodeString& s, int line) +{ + UErrorCode status = U_ZERO_ERROR; + Formattable result; + float_parser->parse(s, result, status); + if (status != U_ZERO_ERROR) { + cerr << "ERROR: Unable to parse state number on line " << line << "." << endl; + // TODO: error messages should also print file names + } + return result.getDouble(); +} + /** * Converts symbols like @0@ to epsilon, @_SPACE_@ to space, etc. * @todo Are there other special symbols? If so, add them, and maybe use a map * for conversion? */ void -AttCompiler::convert_hfst(wstring& symbol) +AttCompiler::convert_hfst(UnicodeString& symbol) { - if (symbol == L"@0@" || symbol == L"ε") + if (symbol == "@0@" || symbol == "ε") { - symbol = L""; + symbol = ""; } - else if (symbol == L"@_SPACE_@") + else if (symbol == "@_SPACE_@") { - symbol = L" "; + symbol = " "; } } bool -AttCompiler::is_word_punct(wchar_t symbol) +AttCompiler::is_word_punct(UChar symbol) { // https://en.wikipedia.org/wiki/Combining_character#Unicode_ranges if((symbol >= 0x0300 && symbol <= 0x036F) // Combining Diacritics @@ -90,12 +129,12 @@ AttCompiler::is_word_punct(wchar_t symbol) * only) character otherwise. */ int -AttCompiler::symbol_code(const wstring& symbol) +AttCompiler::symbol_code(const UnicodeString& symbol) { if (symbol.length() > 1) { alphabet.includeSymbol(symbol); return alphabet(symbol); - } else if (symbol == L"") { + } else if (symbol == "") { return 0; } else if ((iswpunct(symbol[0]) || iswspace(symbol[0])) && !is_word_punct(symbol[0])) { return symbol[0]; @@ -128,77 +167,85 @@ AttCompiler::has_multiple_fsts(string const &file_name) } void -AttCompiler::parse(string const &file_name, wstring const &dir) +AttCompiler::parse(UnicodeString const &file_name, UnicodeString const &dir) { clear(); - wifstream infile(file_name.c_str()); // TODO: error checking - vector tokens; - wstring line; + UFILE* infile = u_fopen_u(file_name, "r"); + if (infile == NULL) { + cerr << "Error: unable to open '" << file_name << "' for reading." << endl; + } + vector tokens; bool first_line_in_fst = true; // First line -- see below - int state_id_offset = 0; + bool multiple_transducers = false; + int state_id_offset = 1; int largest_seen_state_id = 0; + int line_number = 0; - if (has_multiple_fsts(file_name)){ - wcerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << endl; - - // Set the starting state to 0 (Epsilon transtions will be added later) - starting_state = 0; - state_id_offset = 1; - } - - while (getline(infile, line)) + while (!u_feof(infile)) { + lint_number++; tokens.clear(); + tokens.push_back(""); + do { + UChar32 c = u_fgetcx(infile); + if (c == '\n') { + break; + } else if (c == '\t') { + tokens.push_back(""); + } else { + tokens.back() += c; + } + } while (!u_feof(infile)); + int from, to; wstring upper, lower; double weight; - if (line.length() == 0 && first_line_in_fst) + if (tokens[0].length() == 0 && first_line_in_fst) { - wcerr << "Error: empty file '" << file_name << "'." << endl; + cerr << "Error: empty file '" << file_name << "'." << endl; exit(EXIT_FAILURE); } - if (first_line_in_fst && line.find(L"\t") == wstring::npos) + if (first_line_in_fst && tokens.size() == 1) { - wcerr << "Error: invalid format '" << file_name << "'." << endl; + cerr << "Error: invalid format in file '" << file_name << "' on line " << line_number << "." << endl; exit(EXIT_FAILURE); } /* Empty line. */ - if (line.length() == 0) + if (tokens.size() == 1 && tokens[0].length() == 0) { continue; } - split(line, L'\t', tokens); if (tokens[0].find('-') == 0) { + if (state_id_offset == 1) { + // this is the first split we've seen + cerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << endl; + multiple_transducers = true; + } // Update the offset for the new FST state_id_offset = largest_seen_state_id + 1; first_line_in_fst = true; continue; } - from = stoi(tokens[0]) + state_id_offset; + from = parse_state(tokens[0]) + state_id_offset; largest_seen_state_id = max(largest_seen_state_id, from); AttNode* source = get_node(from); /* First line: the initial state is of both types. */ if (first_line_in_fst) { - // If the file has a single FST - No need for state id mapping - if (state_id_offset == 0) - starting_state = from; - else{ - AttNode * starting_node = get_node(starting_state); - - // Add an Epsilon transition from the new starting state - starting_node->transductions.push_back( - Transduction(from, L"", L"", - alphabet(symbol_code(L""), symbol_code(L"")), - default_weight)); - } + AttNode * starting_node = get_node(starting_state); + + // Add an Epsilon transition from the new starting state + starting_node->transductions.push_back( + Transduction(from, L"", L"", + alphabet(symbol_code(L""), symbol_code(L"")), + default_weight)); first_line_in_fst = false; } @@ -207,7 +254,7 @@ AttCompiler::parse(string const &file_name, wstring const &dir) { if (tokens.size() > 1) { - weight = stod(tokens[1]); + weight = parse_weight(tokens[1]); } else { @@ -217,9 +264,9 @@ AttCompiler::parse(string const &file_name, wstring const &dir) } else { - to = stoi(tokens[1]) + state_id_offset; + to = parse_state(tokens[1]) + state_id_offset; largest_seen_state_id = max(largest_seen_state_id, to); - if(dir == L"RL") + if(dir == "RL") { upper = tokens[3]; lower = tokens[2]; @@ -247,12 +294,19 @@ AttCompiler::parse(string const &file_name, wstring const &dir) } } + if (!multiple_transducers) { + starting_state = 1; + // if we aren't disjuncting multiple transducers + // then we have an extra epsilon transduction at the beginning + // so skip it + } + /* Classify the nodes of the graph. */ classify_forwards(); set path; classify_backwards(starting_state, path); - infile.close(); + u_fclose(infile); } /** Extracts the sub-transducer made of states of type @p type. */ diff --git a/lttoolbox/att_compiler.h b/lttoolbox/att_compiler.h index 126ca56..a03179a 100644 --- a/lttoolbox/att_compiler.h +++ b/lttoolbox/att_compiler.h @@ -28,6 +28,9 @@ #include #include +#include +#include + #include #define UNDECIDED 0 @@ -36,25 +39,11 @@ #define BOTH 3 using namespace std; +using namespace icu; /** Bitmask; 1 = WORD, 2 = PUNCT, 3 = BOTH. */ typedef unsigned int TransducerType; -namespace -{ - /** Splits a string into fields. */ - inline vector& split(const wstring& s, wchar_t delim, vector &out) - { - wistringstream ss(s); - wstring item; - while (getline(ss, item, delim)) - { - out.push_back(item); - } - return out; - } -}; - /** * Converts transducers from AT&T text format to lt binary format. * @@ -91,7 +80,7 @@ public: * Reads the AT&T format file @p file_name. The transducer and the alphabet * are both cleared before reading the new file. */ - void parse(string const &file_name, wstring const &dir); + void parse(UnicodeString const &file_name, UnicodeString const &dir); /** Writes the transducer to @p file_name in lt binary format. */ @@ -113,20 +102,20 @@ private: Alphabet alphabet; /** All non-multicharacter symbols. */ - set letters; + set letters; /** Used in AttNode. */ struct Transduction { int to; - wstring upper; - wstring lower; + UnicodeString upper; + UnicodeString lower; int tag; double weight; TransducerType type; - Transduction(int to, wstring upper, wstring lower, int tag, double weight, - TransducerType type=UNDECIDED) : + Transduction(int to, UnicodeString upper, UnicodeString lower, int tag, + double weight, TransducerType type=UNDECIDED) : to(to), upper(upper), lower(lower), tag(tag), weight(weight), type(type) {} }; @@ -170,7 +159,7 @@ private: * Returns true for combining diacritics and modifier letters * */ - bool is_word_punct(wchar_t symbol); + bool is_word_punct(UChar symbol); /** * Determines initial type of single transition @@ -186,7 +175,7 @@ private: * @todo Are there other special symbols? If so, add them, and maybe use a map * for conversion? */ - void convert_hfst(wstring& symbol); + void convert_hfst(UnicodeString& symbol); /** * Returns the code of the symbol in the alphabet. Run after convert_hfst has @@ -197,12 +186,15 @@ private: * @return the code of the symbol, if @p symbol is multichar; its first (and * only) character otherwise. */ - int symbol_code(const wstring& symbol); + int symbol_code(const UnicodeString& symbol); /** - * Finds whether an at&t file contains multiple FSTs or not - */ - bool has_multiple_fsts(string const &file_name); + * Wrappers around ICU number parsing functions + */ + NumberFormat* int_parser; + NumberFormat* float_parser; + int parse_state(const UnicodeString& s, int line); + double parse_weight(const UnicodeString& s, int line); }; #endif /* _MYATT_COMPILER_ */ diff --git a/lttoolbox/compression.cc b/lttoolbox/compression.cc index 0ba78b5..bcafbdc 100644 --- a/lttoolbox/compression.cc +++ b/lttoolbox/compression.cc @@ -254,7 +254,7 @@ Compression::multibyte_read(istream &input) void -Compression::wstring_write(wstring const &str, FILE *output) +Compression::string_write(UnicodeString const &str, FILE *output) { Compression::multibyte_write(str.size(), output); for(auto c : str) @@ -264,38 +264,14 @@ Compression::wstring_write(wstring const &str, FILE *output) } wstring -Compression::wstring_read(FILE *input) -{ - wstring retval = L""; - - for(unsigned int i = 0, limit = Compression::multibyte_read(input); - i != limit; i++) - { - retval += static_cast(Compression::multibyte_read(input)); - } - - return retval; -} - -void -Compression::string_write(string const &str, FILE *output) -{ - Compression::multibyte_write(str.size(), output); - for(auto c : str) - { - Compression::multibyte_write(static_cast(c), output); - } -} - -string Compression::string_read(FILE *input) { - string retval = ""; + UnicodeString retval = ""; for(unsigned int i = 0, limit = Compression::multibyte_read(input); i != limit; i++) { - retval += static_cast(Compression::multibyte_read(input)); + retval += static_cast(Compression::multibyte_read(input)); } return retval; diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h index 8af6cf9..80798b4 100644 --- a/lttoolbox/compression.h +++ b/lttoolbox/compression.h @@ -19,11 +19,12 @@ #include #include -#include +#include #include #include using namespace std; +using namespace icu; // Global lttoolbox features constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; @@ -174,23 +175,6 @@ public: */ static unsigned int multibyte_read(istream &is); - /** - * This method allows to write a wide string to an output stream - * using its UCSencoding as integer. - * @see wstring_read() - * @param str the string to write. - * @param output the output stream. - */ - static void wstring_write(wstring const &str, FILE *output); - - /** - * This method reads a wide string from the input stream. - * @see wstring_write() - * @param input the input stream. - * @return the wide string read. - */ - static wstring wstring_read(FILE *input); - /** * This method allows to write a plain string to an output stream * using its UCSencoding as integer. @@ -198,7 +182,7 @@ public: * @param str the string to write. * @param output the output stream. */ - static void string_write(string const &str, FILE *output); + static void string_write(UnicodeString const &str, FILE *output); /** * This method reads a plain string from the input stream. @@ -206,7 +190,7 @@ public: * @param input the input stream. * @return the string read. */ - static string string_read(FILE *input); + static UnicodeString string_read(FILE *input); /** * Encodes a double value and writes it into the output stream diff --git a/lttoolbox/deserialiser.h b/lttoolbox/deserialiser.h index 4697640..c0c5c53 100644 --- a/lttoolbox/deserialiser.h +++ b/lttoolbox/deserialiser.h @@ -33,6 +33,8 @@ #include #include +#include + template class Deserialiser; template @@ -111,6 +113,13 @@ Deserialiser >::deserialise( return SerialisedType_; } +template <> +icu::UnicodeString +Deserialiser::deserialise(std::istream &Stream_) { + std::string s = Deserialiser::deserialise(Stream_); + return icu::UnicodeString::fromUTF8(s); +} + template std::pair Deserialiser >::deserialise( diff --git a/lttoolbox/serialiser.h b/lttoolbox/serialiser.h index 01abb3e..99f85b1 100644 --- a/lttoolbox/serialiser.h +++ b/lttoolbox/serialiser.h @@ -30,6 +30,8 @@ #include #include +#include + namespace { template static unsigned char compressedSize(const SerialisedType &SerialisedType_) { @@ -143,6 +145,13 @@ void Serialiser >::serialise( } } +template <> +void Serialiser::serialise(const icu::UnicodeString& s, + std::ostream& Output) { + std::string temp; + ::serialise(s.toUTF8String(temp), Output); +} + template void Serialiser >::serialise( const std::pair &SerialisedType_, diff --git a/lttoolbox/string_utils.cc b/lttoolbox/string_utils.cc new file mode 100644 index 0000000..3ee4e94 --- /dev/null +++ b/lttoolbox/string_utils.cc @@ -0,0 +1,7 @@ +#include "string_utils.h" + +void +u_fputs(const UnicodeString str, UFILE* output) +{ + u_fprintf(output, "%S", str.getTerminatedBuffer()); +} diff --git a/lttoolbox/string_utils.h b/lttoolbox/string_utils.h new file mode 100644 index 0000000..5cf6b9e --- /dev/null +++ b/lttoolbox/string_utils.h @@ -0,0 +1,9 @@ +#ifndef _LT_STRING_UTILS_H_ +#define _LT_STRING_UTILS_H_ + +#include +#include + +void u_fputs(const UnicodeString str, UFILE* output); + +#endif \ No newline at end of file