commit 44ef72a8fb396ca0279ec698144cc5c830a1c06e Author: Daniel Swanson Date: Tue Jun 1 12:58:27 2021 -0500 the long march part 2 diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 6ade398..cba2bc6 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -3,14 +3,13 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h entry_token.h expander.h fst_processor.h lt_locale.h \ ltstr.h match_exe.h match_node.h match_state.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h \ - string_utils.h \ transducer.h trans_exe.h xml_parse_util.h exception.h tmx_compiler.h \ - string_to_wostream.h + string_to_wostream.h ustring.h cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc node.cc pattern_list.cc \ - regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \ - trans_exe.cc xml_parse_util.cc tmx_compiler.cc + regexp_compiler.cc sorted_vector.cc state.cc transducer.cc \ + trans_exe.cc xml_parse_util.cc tmx_compiler.cc ustring.cc library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) library_include_HEADERS = $(h_sources) diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc index 122e0e3..5345538 100644 --- a/lttoolbox/alphabet.cc +++ b/lttoolbox/alphabet.cc @@ -26,7 +26,7 @@ #include #include -#include "string_utils.h" +#include using namespace std; using namespace icu; @@ -73,7 +73,7 @@ Alphabet::copy(Alphabet const &a) } void -Alphabet::includeSymbol(UnicodeString const &s) +Alphabet::includeSymbol(UString const &s) { if(slexic.find(s) == slexic.end()) { @@ -98,13 +98,13 @@ Alphabet::operator()(int const c1, int const c2) } int -Alphabet::operator()(UnicodeString const &s) +Alphabet::operator()(UString const &s) { return slexic[s]; } int -Alphabet::operator()(UnicodeString const &s) const +Alphabet::operator()(UString const &s) const { auto it = slexic.find(s); if (it == slexic.end()) { @@ -114,7 +114,7 @@ Alphabet::operator()(UnicodeString const &s) const } bool -Alphabet::isSymbolDefined(UnicodeString const &s) +Alphabet::isSymbolDefined(UString const &s) { return slexic.find(s) != slexic.end(); } @@ -132,7 +132,7 @@ Alphabet::write(FILE *output) Compression::multibyte_write(slexicinv.size(), output); // taglist size for(unsigned int i = 0, limit = slexicinv.size(); i < limit; i++) { - Compression::string_write(slexicinv[i].tempSubString(1, slexicinv[i].length()-2), output); + Compression::string_write(slexicinv[i].substr(1, slexicinv[i].size()-2), output); } // Then we write the list of pairs @@ -159,7 +159,9 @@ Alphabet::read(FILE *input) while(tam > 0) { tam--; - UnicodeString mytag = "<" + Compression::string_read(input) + ">"; + UString mytag{'<'}; + mytag.append(Compression::string_read(input)); + mytag += (UChar)'>'; a_new.slexicinv.push_back(mytag); a_new.slexic[mytag]= -a_new.slexicinv.size(); // ToDo: This does not turn the result negative due to unsigned semantics } @@ -184,7 +186,7 @@ Alphabet::read(FILE *input) void Alphabet::serialise(std::ostream &serialised) const { - Serialiser >::serialise(slexicinv, serialised); + Serialiser >::serialise(slexicinv, serialised); Serialiser > >::serialise(spairinv, serialised); } @@ -195,7 +197,7 @@ Alphabet::deserialise(std::istream &serialised) slexic.clear(); spairinv.clear(); spair.clear(); - slexicinv = Deserialiser >::deserialise(serialised); + slexicinv = Deserialiser >::deserialise(serialised); for (size_t i = 0; i < slexicinv.size(); i++) { slexic[slexicinv[i]] = -i - 1; // ToDo: This does not turn the result negative due to unsigned semantics } @@ -219,7 +221,7 @@ Alphabet::writeSymbol(int const symbol, UFILE *output) const } void -Alphabet::getSymbol(UnicodeString &result, int const symbol, bool uppercase) const +Alphabet::getSymbol(UString &result, int const symbol, bool uppercase) const { if(symbol == 0) { @@ -239,7 +241,7 @@ Alphabet::getSymbol(UnicodeString &result, int const symbol, bool uppercase) con } else if(symbol >= 0) { - result += static_cast(toupper(static_cast(symbol))); + result += u_toupper(static_cast(symbol)); } else { @@ -270,7 +272,7 @@ Alphabet::symbolsWhereLeftIs(UChar l) const { return eps; } -void Alphabet::setSymbol(int symbol, UnicodeString newSymbolString) { +void Alphabet::setSymbol(int symbol, UString newSymbolString) { //Should be a special character! if (symbol < 0) slexicinv[-symbol-1] = newSymbolString; } diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index 9d59da1..52fe76c 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -24,6 +24,7 @@ #include #include #include +#include "ustring.h" using namespace std; using namespace icu; @@ -39,13 +40,13 @@ private: * Symbol-identifier relationship. Only contains . * @see slexicinv */ - map slexic; + map slexic; /** * Identifier-symbol relationship. Only contains . * @see slexic */ - vector slexicinv; + vector slexicinv; /** @@ -90,7 +91,7 @@ public: /** * Include a symbol into the alphabet. */ - void includeSymbol(UnicodeString const &s); + void includeSymbol(UString const &s); /** * Get an unique code for every symbol pair. This flavour is for @@ -100,7 +101,7 @@ public: * @return code for (c1, c2). */ int operator()(int const c1, int const c2); - int operator()(UnicodeString const &s) const; + int operator()(UString const &s) const; /** * Gets the individual symbol identifier. Assumes it already exists! @@ -108,14 +109,14 @@ public: * @param s symbol to be identified. * @return symbol identifier. */ - int operator()(UnicodeString const &s); + int operator()(UString const &s); /** * Check wether the symbol is defined in the alphabet. * @param s symbol * @return true if defined */ - bool isSymbolDefined(UnicodeString const &s); + bool isSymbolDefined(UString const &s); /** * Returns the size of the alphabet (number of symbols). @@ -151,7 +152,7 @@ public: * @param symbol code of the symbol * @param uppercase true if we want an uppercase symbol */ - void getSymbol(UnicodeString &result, int const symbol, + void getSymbol(UString &result, int const symbol, bool uppercase = false) const; /** @@ -166,7 +167,7 @@ public: * @param symbol the code of the symbol to set * @param newSymbolString the new string for this symbol */ - void setSymbol(int symbol, UnicodeString newSymbolString); + void setSymbol(int symbol, UString newSymbolString); /** * Note: both the symbol int and int-pair are specific to this alphabet instance. diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index 6d33b96..7d5ce10 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -31,22 +31,10 @@ using namespace icu; AttCompiler::AttCompiler() : starting_state(0), default_weight(0.0000) -{ - UErrorCode status = U_ZERO_ERROR; - int_parser = NumberFormat::createInstance(status); - int_parser->setParseIntergerOnly(true); - float_parser = NumberFormat::createInstance(status); - if (status != U_ZERO_ERROR) { - cerr << "Error: unable to set up numeric converter." << endl; - exit(EXIT_FAILURE); - } -} +{} AttCompiler::~AttCompiler() -{ - delete int_parser; - delete float_parser; -} +{} void AttCompiler::clear() @@ -59,47 +47,21 @@ AttCompiler::clear() alphabet = Alphabet(); } -int -AttCompiler::parse_state(const UnicodeString& s, int line) -{ - UErrorCode status = U_ZERO_ERROR; - Formattable result; - int_parser->parse(s, result, status); - if (status != U_ZERO_ERROR) { - cerr << "ERROR: Unable to parse state number on line " << line << "." << endl; - // TODO: error messages should also print file names - } - return result.getLong(); -} - -double -AttCompiler::parse_weight(const UnicodeString& s, int line) -{ - UErrorCode status = U_ZERO_ERROR; - Formattable result; - float_parser->parse(s, result, status); - if (status != U_ZERO_ERROR) { - cerr << "ERROR: Unable to parse state number on line " << line << "." << endl; - // TODO: error messages should also print file names - } - return result.getDouble(); -} - /** * Converts symbols like @0@ to epsilon, @_SPACE_@ to space, etc. * @todo Are there other special symbols? If so, add them, and maybe use a map * for conversion? */ void -AttCompiler::convert_hfst(UnicodeString& symbol) +AttCompiler::convert_hfst(UString& symbol) { - if (symbol == "@0@" || symbol == "ε") + if (symbol == (const UChar*)"@0@" || symbol == (const UChar*)"ε") { - symbol = ""; + symbol = (const UChar*)""; } - else if (symbol == "@_SPACE_@") + else if (symbol == (const UChar*)"@_SPACE_@") { - symbol = " "; + symbol = (const UChar*)" "; } } @@ -129,12 +91,12 @@ AttCompiler::is_word_punct(UChar symbol) * only) character otherwise. */ int -AttCompiler::symbol_code(const UnicodeString& symbol) +AttCompiler::symbol_code(const UString& symbol) { if (symbol.length() > 1) { alphabet.includeSymbol(symbol); return alphabet(symbol); - } else if (symbol == "") { + } else if (symbol.empty()) { return 0; } else if ((iswpunct(symbol[0]) || iswspace(symbol[0])) && !is_word_punct(symbol[0])) { return symbol[0]; @@ -152,30 +114,16 @@ AttCompiler::symbol_code(const UnicodeString& symbol) } } -bool -AttCompiler::has_multiple_fsts(string const &file_name) -{ - wifstream infile(file_name.c_str()); // TODO: error checking - wstring line; - - while(getline(infile, line)){ - if (line.find('-') == 0) - return true; - } - - return false; -} - void -AttCompiler::parse(UnicodeString const &file_name, UnicodeString const &dir) +AttCompiler::parse(string const &file_name, bool read_rl) { clear(); - UFILE* infile = u_fopen_u(file_name, "r"); + UFILE* infile = u_fopen(file_name.c_str(), "r", NULL, NULL); if (infile == NULL) { cerr << "Error: unable to open '" << file_name << "' for reading." << endl; } - vector tokens; + vector tokens; bool first_line_in_fst = true; // First line -- see below bool multiple_transducers = false; int state_id_offset = 1; @@ -184,22 +132,22 @@ AttCompiler::parse(UnicodeString const &file_name, UnicodeString const &dir) while (!u_feof(infile)) { - lint_number++; + line_number++; tokens.clear(); - tokens.push_back(""); + tokens.push_back((UChar*)""); do { UChar32 c = u_fgetcx(infile); if (c == '\n') { break; } else if (c == '\t') { - tokens.push_back(""); + tokens.push_back((UChar*)""); } else { tokens.back() += c; } } while (!u_feof(infile)); int from, to; - wstring upper, lower; + UString upper, lower; double weight; if (tokens[0].length() == 0 && first_line_in_fst) @@ -232,7 +180,7 @@ AttCompiler::parse(UnicodeString const &file_name, UnicodeString const &dir) continue; } - from = parse_state(tokens[0]) + state_id_offset; + from = stoi(tokens[0]) + state_id_offset; largest_seen_state_id = max(largest_seen_state_id, from); AttNode* source = get_node(from); @@ -243,9 +191,8 @@ AttCompiler::parse(UnicodeString const &file_name, UnicodeString const &dir) // Add an Epsilon transition from the new starting state starting_node->transductions.push_back( - Transduction(from, L"", L"", - alphabet(symbol_code(L""), symbol_code(L"")), - default_weight)); + Transduction(from, (const UChar*)"", (const UChar*)"", + 0, default_weight)); first_line_in_fst = false; } @@ -254,7 +201,7 @@ AttCompiler::parse(UnicodeString const &file_name, UnicodeString const &dir) { if (tokens.size() > 1) { - weight = parse_weight(tokens[1]); + weight = stod(tokens[1]); } else { @@ -264,9 +211,9 @@ AttCompiler::parse(UnicodeString const &file_name, UnicodeString const &dir) } else { - to = parse_state(tokens[1]) + state_id_offset; + to = stoi(tokens[1]) + state_id_offset; largest_seen_state_id = max(largest_seen_state_id, to); - if(dir == "RL") + if(read_rl) { upper = tokens[3]; lower = tokens[2]; @@ -490,7 +437,7 @@ AttCompiler::write(FILE *output) Transducer punct_fst = extract_transducer(PUNCT); /* Non-multichar symbols. */ - Compression::wstring_write(wstring(letters.begin(), letters.end()), output); + Compression::string_write(UString(letters.begin(), letters.end()), output); /* Multichar symbols. */ alphabet.write(output); /* And now the FST. */ @@ -502,12 +449,12 @@ AttCompiler::write(FILE *output) { Compression::multibyte_write(2, output); } - Compression::wstring_write(L"main@standard", output); + Compression::string_write((const UChar*)"main@standard", output); Transducer word_fst = extract_transducer(WORD); word_fst.write(output); wcout << L"main@standard" << " " << word_fst.size(); wcout << " " << word_fst.numberOfTransitions() << endl; - Compression::wstring_write(L"final@inconditional", output); + Compression::string_write((const UChar*)"final@inconditional", output); if(punct_fst.numberOfTransitions() != 0) { punct_fst.write(output); diff --git a/lttoolbox/att_compiler.h b/lttoolbox/att_compiler.h index a03179a..29f2071 100644 --- a/lttoolbox/att_compiler.h +++ b/lttoolbox/att_compiler.h @@ -19,18 +19,15 @@ #include #include -#include #include #include #include +#include #include #include #include -#include -#include - #include #define UNDECIDED 0 @@ -79,8 +76,9 @@ public: /** * Reads the AT&T format file @p file_name. The transducer and the alphabet * are both cleared before reading the new file. + * If read_rl = true then the second tape is used as the input */ - void parse(UnicodeString const &file_name, UnicodeString const &dir); + void parse(string const &file_name, bool read_rl); /** Writes the transducer to @p file_name in lt binary format. */ @@ -108,13 +106,13 @@ private: struct Transduction { int to; - UnicodeString upper; - UnicodeString lower; + UString upper; + UString lower; int tag; double weight; TransducerType type; - Transduction(int to, UnicodeString upper, UnicodeString lower, int tag, + Transduction(int to, UString upper, UString lower, int tag, double weight, TransducerType type=UNDECIDED) : to(to), upper(upper), lower(lower), tag(tag), weight(weight), type(type) {} }; @@ -175,7 +173,7 @@ private: * @todo Are there other special symbols? If so, add them, and maybe use a map * for conversion? */ - void convert_hfst(UnicodeString& symbol); + void convert_hfst(UString& symbol); /** * Returns the code of the symbol in the alphabet. Run after convert_hfst has @@ -186,15 +184,7 @@ private: * @return the code of the symbol, if @p symbol is multichar; its first (and * only) character otherwise. */ - int symbol_code(const UnicodeString& symbol); - - /** - * Wrappers around ICU number parsing functions - */ - NumberFormat* int_parser; - NumberFormat* float_parser; - int parse_state(const UnicodeString& s, int line); - double parse_weight(const UnicodeString& s, int line); + int symbol_code(const UString& symbol); }; #endif /* _MYATT_COMPILER_ */ diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 00a6287..9408191 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -28,41 +28,47 @@ using namespace std; -wstring const Compiler::COMPILER_DICTIONARY_ELEM = L"dictionary"; -wstring const Compiler::COMPILER_ALPHABET_ELEM = L"alphabet"; -wstring const Compiler::COMPILER_SDEFS_ELEM = L"sdefs"; -wstring const Compiler::COMPILER_SDEF_ELEM = L"sdef"; -wstring const Compiler::COMPILER_N_ATTR = L"n"; -wstring const Compiler::COMPILER_PARDEFS_ELEM = L"pardefs"; -wstring const Compiler::COMPILER_PARDEF_ELEM = L"pardef"; -wstring const Compiler::COMPILER_PAR_ELEM = L"par"; -wstring const Compiler::COMPILER_ENTRY_ELEM = L"e"; -wstring const Compiler::COMPILER_RESTRICTION_ATTR = L"r"; -wstring const Compiler::COMPILER_RESTRICTION_LR_VAL = L"LR"; -wstring const Compiler::COMPILER_RESTRICTION_RL_VAL = L"RL"; -wstring const Compiler::COMPILER_PAIR_ELEM = L"p"; -wstring const Compiler::COMPILER_LEFT_ELEM = L"l"; -wstring const Compiler::COMPILER_RIGHT_ELEM = L"r"; -wstring const Compiler::COMPILER_S_ELEM = L"s"; -wstring const Compiler::COMPILER_M_ELEM = L"m"; -wstring const Compiler::COMPILER_REGEXP_ELEM = L"re"; -wstring const Compiler::COMPILER_SECTION_ELEM = L"section"; -wstring const Compiler::COMPILER_ID_ATTR = L"id"; -wstring const Compiler::COMPILER_TYPE_ATTR = L"type"; -wstring const Compiler::COMPILER_IDENTITY_ELEM = L"i"; -wstring const Compiler::COMPILER_IDENTITYGROUP_ELEM = L"ig"; -wstring const Compiler::COMPILER_JOIN_ELEM = L"j"; -wstring const Compiler::COMPILER_BLANK_ELEM = L"b"; -wstring const Compiler::COMPILER_POSTGENERATOR_ELEM = L"a"; -wstring const Compiler::COMPILER_GROUP_ELEM = L"g"; -wstring const Compiler::COMPILER_LEMMA_ATTR = L"lm"; -wstring const Compiler::COMPILER_IGNORE_ATTR = L"i"; -wstring const Compiler::COMPILER_IGNORE_YES_VAL = L"yes"; -wstring const Compiler::COMPILER_ALT_ATTR = L"alt"; -wstring const Compiler::COMPILER_V_ATTR = L"v"; -wstring const Compiler::COMPILER_VL_ATTR = L"vl"; -wstring const Compiler::COMPILER_VR_ATTR = L"vr"; -wstring const Compiler::COMPILER_WEIGHT_ATTR = L"w"; +UString const Compiler::COMPILER_DICTIONARY_ELEM = (const UChar*)"dictionary"; +UString const Compiler::COMPILER_ALPHABET_ELEM = (const UChar*)"alphabet"; +UString const Compiler::COMPILER_SDEFS_ELEM = (const UChar*)"sdefs"; +UString const Compiler::COMPILER_SDEF_ELEM = (const UChar*)"sdef"; +UString const Compiler::COMPILER_N_ATTR = (const UChar*)"n"; +UString const Compiler::COMPILER_PARDEFS_ELEM = (const UChar*)"pardefs"; +UString const Compiler::COMPILER_PARDEF_ELEM = (const UChar*)"pardef"; +UString const Compiler::COMPILER_PAR_ELEM = (const UChar*)"par"; +UString const Compiler::COMPILER_ENTRY_ELEM = (const UChar*)"e"; +UString const Compiler::COMPILER_RESTRICTION_ATTR = (const UChar*)"r"; +UString const Compiler::COMPILER_RESTRICTION_LR_VAL = (const UChar*)"LR"; +UString const Compiler::COMPILER_RESTRICTION_RL_VAL = (const UChar*)"RL"; +UString const Compiler::COMPILER_PAIR_ELEM = (const UChar*)"p"; +UString const Compiler::COMPILER_LEFT_ELEM = (const UChar*)"l"; +UString const Compiler::COMPILER_RIGHT_ELEM = (const UChar*)"r"; +UString const Compiler::COMPILER_S_ELEM = (const UChar*)"s"; +UString const Compiler::COMPILER_M_ELEM = (const UChar*)"m"; +UString const Compiler::COMPILER_REGEXP_ELEM = (const UChar*)"re"; +UString const Compiler::COMPILER_SECTION_ELEM = (const UChar*)"section"; +UString const Compiler::COMPILER_ID_ATTR = (const UChar*)"id"; +UString const Compiler::COMPILER_TYPE_ATTR = (const UChar*)"type"; +UString const Compiler::COMPILER_IDENTITY_ELEM = (const UChar*)"i"; +UString const Compiler::COMPILER_IDENTITYGROUP_ELEM = (const UChar*)"ig"; +UString const Compiler::COMPILER_JOIN_ELEM = (const UChar*)"j"; +UString const Compiler::COMPILER_BLANK_ELEM = (const UChar*)"b"; +UString const Compiler::COMPILER_POSTGENERATOR_ELEM = (const UChar*)"a"; +UString const Compiler::COMPILER_GROUP_ELEM = (const UChar*)"g"; +UString const Compiler::COMPILER_LEMMA_ATTR = (const UChar*)"lm"; +UString const Compiler::COMPILER_IGNORE_ATTR = (const UChar*)"i"; +UString const Compiler::COMPILER_IGNORE_YES_VAL = (const UChar*)"yes"; +UString const Compiler::COMPILER_ALT_ATTR = (const UChar*)"alt"; +UString const Compiler::COMPILER_V_ATTR = (const UChar*)"v"; +UString const Compiler::COMPILER_VL_ATTR = (const UChar*)"vl"; +UString const Compiler::COMPILER_VR_ATTR = (const UChar*)"vr"; +UString const Compiler::COMPILER_WEIGHT_ATTR = (const UChar*)"w"; +UString const Compiler::COMPILER_TEXT_NODE = (const UChar*)"#text"; +UString const Compiler::COMPILER_COMMENT_NODE = (const UChar*)"#comment"; +UString const Compiler::COMPILER_ACX_ANALYSIS_ELEM = (const UChar*)"analysis-chars"; +UString const Compiler::COMPILER_ACX_CHAR_ELEM = (const UChar*)"char"; +UString const Compiler::COMPILER_ACX_EQUIV_CHAR_ELEM= (const UChar*)"equiv-char"; +UString const Compiler::COMPILER_ACX_VALUE_ATTR = (const UChar*)"value"; Compiler::Compiler() : reader(0), @@ -78,14 +84,14 @@ Compiler::~Compiler() } void -Compiler::parseACX(string const &file, wstring const &dir) +Compiler::parseACX(string const &file, UString const &dir) { if(dir == COMPILER_RESTRICTION_LR_VAL) { reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - wcerr << "Error: cannot open '" << file << "'." << endl; + cerr << "Error: cannot open '" << file << "'." << endl; exit(EXIT_FAILURE); } int ret = xmlTextReaderRead(reader); @@ -98,13 +104,13 @@ Compiler::parseACX(string const &file, wstring const &dir) } void -Compiler::parse(string const &file, wstring const &dir) +Compiler::parse(string const &file, UString const &dir) { direction = dir; reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - wcerr << "Error: Cannot open '" << file << "'." << endl; + cerr << "Error: Cannot open '" << file << "'." << endl; exit(EXIT_FAILURE); } @@ -117,7 +123,7 @@ Compiler::parse(string const &file, wstring const &dir) if(ret != 0) { - wcerr << L"Error: Parse error at the end of input." << endl; + cerr << "Error: Parse error at the end of input." << endl; } xmlFreeTextReader(reader); @@ -136,9 +142,9 @@ Compiler::parse(string const &file, wstring const &dir) } bool -Compiler::valid(wstring const& dir) const +Compiler::valid(UString const& dir) const { - const wstring side = dir == COMPILER_RESTRICTION_RL_VAL ? L"right" : L"left"; + const char* side = dir == COMPILER_RESTRICTION_RL_VAL ? "right" : "left"; const set epsilonSymbols = alphabet.symbolsWhereLeftIs(0); const set spaceSymbols = alphabet.symbolsWhereLeftIs(L' '); for (auto §ion : sections) { @@ -147,11 +153,11 @@ Compiler::valid(wstring const& dir) const auto initial = fst.getInitial(); for(const auto i : fst.closure(initial, epsilonSymbols)) { if (finals.count(i)) { - wcerr << L"Error: Invalid dictionary (hint: the " << side << " side of an entry is empty)" << endl; + cerr << "Error: Invalid dictionary (hint: the " << side << " side of an entry is empty)" << endl; return false; } if(fst.closure(i, spaceSymbols).size() > 1) { // >1 since closure always includes self - wcerr << L"Error: Invalid dictionary (hint: entry on the " << side << " beginning with whitespace)" << endl; + cerr << "Error: Invalid dictionary (hint: entry on the " << side << " beginning with whitespace)" << endl; return false; } } @@ -170,7 +176,7 @@ Compiler::procAlphabet() if(ret == 1) { xmlChar const *value = xmlTextReaderConstValue(reader); - letters = XMLParseUtil::towstring(value); + letters = XMLParseUtil::toUString(value); bool space = true; for(unsigned int i = 0; i < letters.length(); i++) { @@ -182,13 +188,13 @@ Compiler::procAlphabet() } if(space == true) // libxml2 returns '\n' for , should be empty { - letters = L""; + letters.clear(); } } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Missing alphabet symbols." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Missing alphabet symbols." << endl; exit(EXIT_FAILURE); } } @@ -197,7 +203,7 @@ Compiler::procAlphabet() void Compiler::procSDef() { - alphabet.includeSymbol(L"<"+attrib(COMPILER_N_ATTR)+L">"); + alphabet.includeSymbol((const UChar*)"<"+attrib(COMPILER_N_ATTR)+(const UChar*)">"); } void @@ -215,7 +221,7 @@ Compiler::procParDef() { paradigms[current_paradigm].minimize(); paradigms[current_paradigm].joinFinals(); - current_paradigm = L""; + current_paradigm.clear(); } } } @@ -313,12 +319,12 @@ Compiler::matchTransduction(list const &pi, void -Compiler::requireEmptyError(wstring const &name) +Compiler::requireEmptyError(UString const &name) { if(!xmlTextReaderIsEmptyElement(reader)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; exit(EXIT_FAILURE); } } @@ -327,7 +333,7 @@ bool Compiler::allBlanks() { bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString text = XMLParseUtil::toUString(xmlTextReaderConstValue(reader)); for(auto c : text) { @@ -338,11 +344,11 @@ Compiler::allBlanks() } void -Compiler::readString(list &result, wstring const &name) +Compiler::readString(list &result, UString const &name) { - if(name == L"#text") + if(name == COMPILER_TEXT_NODE) { - wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString value = XMLParseUtil::toUString(xmlTextReaderConstValue(reader)); for(unsigned int i = 0, limit = value.size(); i < limit; i++) { result.push_back(static_cast(value[i])); @@ -382,12 +388,12 @@ Compiler::readString(list &result, wstring const &name) else if(name == COMPILER_S_ELEM) { requireEmptyError(name); - wstring symbol = L"<" + attrib(COMPILER_N_ATTR) + L">"; + UString symbol = (const UChar*)"<" + attrib(COMPILER_N_ATTR) + (const UChar*)">"; if(!alphabet.isSymbolDefined(symbol)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined symbol '" << symbol << L"'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Undefined symbol '" << symbol << "'." << endl; exit(EXIT_FAILURE); } @@ -395,88 +401,88 @@ Compiler::readString(list &result, wstring const &name) } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid specification of element '<" << name; - wcerr << L">' in this context." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid specification of element '<" << name; + cerr << ">' in this context." << endl; exit(EXIT_FAILURE); } } void -Compiler::skipBlanks(wstring &name) +Compiler::skipBlanks(UString &name) { - while(name == L"#text" || name == L"#comment") + while(name == COMPILER_TEXT_NODE || name == COMPILER_COMMENT_NODE) { - if(name != L"#comment") + if(name != COMPILER_COMMENT_NODE) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); } } void -Compiler::skip(wstring &name, wstring const &elem) +Compiler::skip(UString &name, UString const &elem) { skip(name, elem, true); } void -Compiler::skip(wstring &name, wstring const &elem, bool open) +Compiler::skip(UString &name, UString const &elem, bool open) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - wstring slash; + name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); + UString slash; if(!open) { - slash = L"/"; + slash = (const UChar*)"/"; } - while(name == L"#text" || name == L"#comment") + while(name == COMPILER_TEXT_NODE || name == COMPILER_COMMENT_NODE) { - if(name != L"#comment") + if(name != COMPILER_COMMENT_NODE) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); } if(name != elem) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Expected '<" << slash << elem << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Expected '<" << slash << elem << ">'." << endl; exit(EXIT_FAILURE); } } EntryToken -Compiler::procIdentity(wstring const &wsweight, bool ig) +Compiler::procIdentity(UString const &wsweight, bool ig) { list both_sides; double entry_weight = stod(wsweight); if(!xmlTextReaderIsEmptyElement(reader)) { - wstring name = L""; + UString name; while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); if(name == COMPILER_IDENTITY_ELEM || name == COMPILER_IDENTITYGROUP_ELEM) { break; @@ -487,8 +493,8 @@ Compiler::procIdentity(wstring const &wsweight, bool ig) if(verbose && first_element && (both_sides.front() == (int)L' ')) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Entry begins with space." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Entry begins with space." << endl; } first_element = false; EntryToken e; @@ -507,21 +513,21 @@ Compiler::procIdentity(wstring const &wsweight, bool ig) } EntryToken -Compiler::procTransduction(wstring const &wsweight) +Compiler::procTransduction(UString const &wsweight) { list lhs, rhs; double entry_weight = stod(wsweight); - wstring name; + UString name; skip(name, COMPILER_LEFT_ELEM); if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); if(name == COMPILER_LEFT_ELEM) { break; @@ -532,8 +538,8 @@ Compiler::procTransduction(wstring const &wsweight) if(verbose && first_element && (lhs.front() == (int)L' ')) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Entry begins with space." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Entry begins with space." << endl; } first_element = false; @@ -541,11 +547,11 @@ Compiler::procTransduction(wstring const &wsweight) if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); if(name == COMPILER_RIGHT_ELEM) { break; @@ -561,8 +567,8 @@ Compiler::procTransduction(wstring const &wsweight) return e; } -wstring -Compiler::attrib(wstring const &name) +UString +Compiler::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } @@ -571,20 +577,20 @@ EntryToken Compiler::procPar() { EntryToken e; - wstring paradigm_name = attrib(COMPILER_N_ATTR); + UString paradigm_name = attrib(COMPILER_N_ATTR); first_element = false; - if(current_paradigm != L"" && paradigm_name == current_paradigm) + if(!current_paradigm.empty() && paradigm_name == current_paradigm) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Paradigm refers to itself '" << paradigm_name << L"'." < const &elements) { - if(current_paradigm != L"") + if(!current_paradigm.empty()) { // compilation of paradigms Transducer &t = paradigms[current_paradigm]; @@ -620,8 +626,8 @@ Compiler::insertEntryTokens(vector const &elements) } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid entry token." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid entry token." << endl; exit(EXIT_FAILURE); } } @@ -691,15 +697,15 @@ Compiler::insertEntryTokens(vector const &elements) void -Compiler::requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname) +Compiler::requireAttribute(UString const &value, UString const &attrname, + UString const &elemname) { - if(value == L"") + if(value.empty()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): '<" << elemname; - wcerr << L"' element must specify non-void '"; - wcerr << attrname << L"' attribute." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): '<" << elemname; + cerr << "' element must specify non-void '"; + cerr << attrname << "' attribute." << endl; exit(EXIT_FAILURE); } } @@ -712,55 +718,55 @@ Compiler::procSection() if(type != XML_READER_TYPE_END_ELEMENT) { - wstring const &id = attrib(COMPILER_ID_ATTR); - wstring const &type = attrib(COMPILER_TYPE_ATTR); + UString const &id = attrib(COMPILER_ID_ATTR); + UString const &type = attrib(COMPILER_TYPE_ATTR); requireAttribute(id, COMPILER_ID_ATTR, COMPILER_SECTION_ELEM); requireAttribute(type, COMPILER_TYPE_ATTR, COMPILER_SECTION_ELEM); current_section = id; - current_section += L"@"; + current_section += (const UChar*)"@"; current_section.append(type); } else { - current_section = L""; + current_section.clear(); } } void Compiler::procEntry() { - wstring attribute = this->attrib(COMPILER_RESTRICTION_ATTR); - wstring ignore = this->attrib(COMPILER_IGNORE_ATTR); - wstring altval = this->attrib(COMPILER_ALT_ATTR); - wstring varval = this->attrib(COMPILER_V_ATTR); - wstring varl = this->attrib(COMPILER_VL_ATTR); - wstring varr = this->attrib(COMPILER_VR_ATTR); - wstring wsweight = this->attrib(COMPILER_WEIGHT_ATTR); + UString attribute = this->attrib(COMPILER_RESTRICTION_ATTR); + UString ignore = this->attrib(COMPILER_IGNORE_ATTR); + UString altval = this->attrib(COMPILER_ALT_ATTR); + UString varval = this->attrib(COMPILER_V_ATTR); + UString varl = this->attrib(COMPILER_VL_ATTR); + UString varr = this->attrib(COMPILER_VR_ATTR); + UString wsweight = this->attrib(COMPILER_WEIGHT_ATTR); // if entry is masked by a restriction of direction or an ignore mark - if((attribute != L"" && attribute != direction) + if((!attribute.empty() && attribute != direction) || ignore == COMPILER_IGNORE_YES_VAL - || (altval != L"" && altval != alt) - || (direction == COMPILER_RESTRICTION_RL_VAL && varval != L"" && varval != variant) - || (direction == COMPILER_RESTRICTION_RL_VAL && varl != L"" && varl != variant_left) - || (direction == COMPILER_RESTRICTION_LR_VAL && varr != L"" && varr != variant_right)) + || (!altval.empty() && altval != alt) + || (direction == COMPILER_RESTRICTION_RL_VAL && !varval.empty() && varval != variant) + || (direction == COMPILER_RESTRICTION_RL_VAL && !varl.empty() && varl != variant_left) + || (direction == COMPILER_RESTRICTION_LR_VAL && !varr.empty() && varr != variant_right)) { // parse to the end of the entry - wstring name = L""; + UString name; while(name != COMPILER_ENTRY_ELEM) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); } return; } - if(wsweight == L"") + if(wsweight.empty()) { - wsweight = L"0.0000"; + wsweight = (const UChar*)"0.0000"; } vector elements; @@ -770,14 +776,14 @@ Compiler::procEntry() int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); skipBlanks(name); - if(current_paradigm == L"" && verbose) + if(current_paradigm.empty() && verbose) { first_element = true; } @@ -805,12 +811,12 @@ Compiler::procEntry() // detection of the use of undefined paradigms - wstring const &p = elements.rbegin()->paradigmName(); + UString const &p = elements.rbegin()->paradigmName(); if(paradigms.find(p) == paradigms.end()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined paradigm '" << p << L"'." <' into '<" << COMPILER_ENTRY_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << COMPILER_ENTRY_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } } @@ -848,31 +854,31 @@ void Compiler::procNodeACX() { xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); - if(name == L"#text") + UString name = XMLParseUtil::toUString(xname); + if(name == COMPILER_TEXT_NODE) { /* ignore */ } - else if(name == L"analysis-chars") + else if(name == COMPILER_ACX_ANALYSIS_ELEM) { /* ignore */ } - else if(name == L"char") + else if(name == COMPILER_ACX_CHAR_ELEM) { - acx_current_char = static_cast(attrib(L"value")[0]); + acx_current_char = static_cast(attrib(COMPILER_ACX_VALUE_ATTR)[0]); } - else if(name == L"equiv-char") + else if(name == COMPILER_ACX_EQUIV_CHAR_ELEM) { - acx_map[acx_current_char].insert(static_cast(attrib(L"value")[0])); + acx_map[acx_current_char].insert(static_cast(attrib(COMPILER_ACX_VALUE_ATTR)[0])); } - else if(name == L"#comment") + else if(name == COMPILER_COMMENT_NODE) { /* ignore */ } else { - wcerr << L"Error in ACX file (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error in ACX file (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } @@ -881,11 +887,11 @@ void Compiler::procNode() { xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); + UString name = XMLParseUtil::toUString(xname); // TODO: optimize the execution order of the string "ifs" - if(name == L"#text") + if(name == COMPILER_TEXT_NODE) { /* ignore */ } @@ -921,14 +927,14 @@ Compiler::procNode() { procSection(); } - else if(name== L"#comment") + else if(name== COMPILER_COMMENT_NODE) { /* ignore */ } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } @@ -938,7 +944,7 @@ Compiler::procRegexp() { EntryToken et; xmlTextReaderRead(reader); - wstring re = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString re = XMLParseUtil::toUString(xmlTextReaderConstValue(reader)); et.setRegexp(re); xmlTextReaderRead(reader); return et; @@ -952,7 +958,7 @@ Compiler::write(FILE *output) write_le(output, features); // letters - Compression::wstring_write(letters, output); + Compression::string_write(letters, output); // symbols alphabet.write(output); @@ -964,35 +970,35 @@ Compiler::write(FILE *output) for(auto& it : sections) { count++; - wcout << it.first << " " << it.second.size(); - wcout << " " << it.second.numberOfTransitions() << endl; - Compression::wstring_write(it.first, output); + cout << it.first << " " << it.second.size(); + cout << " " << it.second.numberOfTransitions() << endl; + Compression::string_write(it.first, output); it.second.write(output); } } void -Compiler::setAltValue(string const &a) +Compiler::setAltValue(UString const &a) { - alt = XMLParseUtil::stows(a); + alt = a; } void -Compiler::setVariantValue(string const &v) +Compiler::setVariantValue(UString const &v) { - variant = XMLParseUtil::stows(v); + variant = v; } void -Compiler::setVariantLeftValue(string const &v) +Compiler::setVariantLeftValue(UString const &v) { - variant_left = XMLParseUtil::stows(v); + variant_left = v; } void -Compiler::setVariantRightValue(string const &v) +Compiler::setVariantRightValue(UString const &v) { - variant_right = XMLParseUtil::stows(v); + variant_right = v; } void diff --git a/lttoolbox/compiler.h b/lttoolbox/compiler.h index acd8b7a..b03c14a 100644 --- a/lttoolbox/compiler.h +++ b/lttoolbox/compiler.h @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include #include @@ -54,43 +54,43 @@ private: /** * The alt value */ - wstring alt; + UString alt; /** * The variant value (monodix) */ - wstring variant; + UString variant; /** * The variant value (left side of bidix) */ - wstring variant_left; + UString variant_left; /** * The variant value (right side of bidix) */ - wstring variant_right; + UString variant_right; /** * The paradigm being compiled */ - wstring current_paradigm; + UString current_paradigm; /** * The dictionary section being compiled */ - wstring current_section; + UString current_section; /** * The direction of the compilation, 'lr' (left-to-right) or 'rl' * (right-to-left) */ - wstring direction; + UString direction; /** * List of characters to be considered alphabetic */ - wstring letters; + UString letters; /** * Set verbose mode: warnings which may or may not be correct @@ -121,27 +121,27 @@ private: /** * List of named transducers-paradigms */ - map paradigms; + map paradigms; /** * List of named dictionary sections */ - map sections; + map sections; /** * List of named prefix copy of a paradigm */ - map, Ltstr> prefix_paradigms; + map > prefix_paradigms; /** * List of named suffix copy of a paradigm */ - map, Ltstr> suffix_paradigms; + map > suffix_paradigms; /** * List of named endings of a suffix copy of a paradgim */ - map, Ltstr> postsuffix_paradigms; + map > postsuffix_paradigms; /** * Mapping of aliases of characters specified in ACX files @@ -205,7 +205,7 @@ private: * @param name the name of the attribute * @return the value of the attribute */ - wstring attrib(wstring const &name); + UString attrib(UString const &name); /** * Construct symbol pairs by align left side of both parts and insert @@ -222,13 +222,13 @@ private: * Parse the <p> element * @return a list of tokens from the dictionary's entry */ - EntryToken procTransduction(wstring const &wsweight); + EntryToken procTransduction(UString const &wsweight); /** * Parse the <i> element * @return a list of tokens from the dictionary's entry */ - EntryToken procIdentity(wstring const &wsweight, bool ig = false); + EntryToken procIdentity(UString const &wsweight, bool ig = false); /** * Parse the <par> element @@ -247,7 +247,7 @@ private: * @param name the name of the node * @param elem the name of the expected node */ - void skip(wstring &name, wstring const &elem); + void skip(UString &name, UString const &elem); /** * Skip all document #text nodes before "elem" @@ -255,22 +255,22 @@ private: * @param elem the name of the expected node * @param open true for open element, false for closed */ - void skip(wstring &name, wstring const &elem, bool open); + void skip(UString &name, UString const &elem, bool open); /** * Skip all blank #text nodes before "name" * @param name the name of the node */ - void skipBlanks(wstring &name); + void skipBlanks(UString &name); - void readString(list &result, wstring const &name); + void readString(list &result, UString const &name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(wstring const &name); + void requireEmptyError(UString const &name); /** * Force an attribute to be specified, amd check for it @@ -278,8 +278,8 @@ private: * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname); + void requireAttribute(UString const &value, UString const &attrname, + UString const &elemname); /** * True if all the elements in the current node are blanks @@ -287,7 +287,7 @@ private: */ bool allBlanks(); - bool valid(wstring const& dir) const; + bool valid(UString const& dir) const; public: @@ -295,41 +295,47 @@ public: * Constants to represent the element and the attributes of * dictionaries */ - LTTOOLBOX_IMPORTS static wstring const COMPILER_DICTIONARY_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_ALPHABET_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_SDEFS_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_SDEF_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_N_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_PARDEFS_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_PARDEF_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_PAR_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_ENTRY_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_RESTRICTION_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_RESTRICTION_LR_VAL; - LTTOOLBOX_IMPORTS static wstring const COMPILER_RESTRICTION_RL_VAL; - LTTOOLBOX_IMPORTS static wstring const COMPILER_PAIR_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_LEFT_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_RIGHT_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_S_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_M_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_REGEXP_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_SECTION_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_ID_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_TYPE_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_IDENTITY_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_IDENTITYGROUP_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_JOIN_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_BLANK_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_POSTGENERATOR_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_GROUP_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_LEMMA_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_IGNORE_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_IGNORE_YES_VAL; - LTTOOLBOX_IMPORTS static wstring const COMPILER_ALT_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_V_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_VL_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_VR_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_WEIGHT_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_DICTIONARY_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ALPHABET_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_SDEFS_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_SDEF_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_N_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_PARDEFS_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_PARDEF_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_PAR_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ENTRY_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_LR_VAL; + LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_RL_VAL; + LTTOOLBOX_IMPORTS static UString const COMPILER_PAIR_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_LEFT_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_RIGHT_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_S_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_M_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_REGEXP_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_SECTION_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ID_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_TYPE_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_IDENTITY_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_IDENTITYGROUP_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_JOIN_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_BLANK_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_POSTGENERATOR_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_GROUP_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_LEMMA_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_IGNORE_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_IGNORE_YES_VAL; + LTTOOLBOX_IMPORTS static UString const COMPILER_ALT_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_V_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_VL_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_VR_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_WEIGHT_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_TEXT_NODE; + LTTOOLBOX_IMPORTS static UString const COMPILER_COMMENT_NODE; + LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_ANALYSIS_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_CHAR_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_EQUIV_CHAR_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_VALUE_ATTR; /** * Constructor @@ -344,12 +350,12 @@ public: /** * Compile dictionary to letter transducers */ - void parse(string const &file, wstring const &dir); + void parse(string const &file, UString const &dir); /** * Read ACX file */ - void parseACX(string const &file, wstring const &dir); + void parseACX(string const &file, UString const &dir); /** @@ -372,25 +378,25 @@ public: * Set the alt value to use in compilation * @param a the value */ - void setAltValue(string const &a); + void setAltValue(UString const &a); /** * Set the variant value to use in compilation * @param v the value */ - void setVariantValue(string const &v); + void setVariantValue(UString const &v); /** * Set the variant_left value to use in compilation * @param v the value */ - void setVariantLeftValue(string const &v); + void setVariantLeftValue(UString const &v); /** * Set the variant_right value to use in compilation * @param v the value */ - void setVariantRightValue(string const &v); + void setVariantRightValue(UString const &v); }; diff --git a/lttoolbox/compression.cc b/lttoolbox/compression.cc index bcafbdc..455a95e 100644 --- a/lttoolbox/compression.cc +++ b/lttoolbox/compression.cc @@ -254,7 +254,7 @@ Compression::multibyte_read(istream &input) void -Compression::string_write(UnicodeString const &str, FILE *output) +Compression::string_write(UString const &str, FILE *output) { Compression::multibyte_write(str.size(), output); for(auto c : str) @@ -263,10 +263,10 @@ Compression::string_write(UnicodeString const &str, FILE *output) } } -wstring +UString Compression::string_read(FILE *input) { - UnicodeString retval = ""; + UString retval; for(unsigned int i = 0, limit = Compression::multibyte_read(input); i != limit; i++) diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h index 80798b4..c3b20c4 100644 --- a/lttoolbox/compression.h +++ b/lttoolbox/compression.h @@ -19,12 +19,11 @@ #include #include -#include #include #include +#include using namespace std; -using namespace icu; // Global lttoolbox features constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; @@ -182,7 +181,7 @@ public: * @param str the string to write. * @param output the output stream. */ - static void string_write(UnicodeString const &str, FILE *output); + static void string_write(UString const &str, FILE *output); /** * This method reads a plain string from the input stream. @@ -190,7 +189,7 @@ public: * @param input the input stream. * @return the string read. */ - static UnicodeString string_read(FILE *input); + static UString string_read(FILE *input); /** * Encodes a double value and writes it into the output stream diff --git a/lttoolbox/deserialiser.h b/lttoolbox/deserialiser.h index c0c5c53..3551023 100644 --- a/lttoolbox/deserialiser.h +++ b/lttoolbox/deserialiser.h @@ -33,7 +33,7 @@ #include #include -#include +#include template class Deserialiser; @@ -88,6 +88,11 @@ public: inline static char deserialise(std::istream &Stream_); }; +template <> class Deserialiser { +public: + inline static char deserialise(std::istream &Stream_); +}; + template<> class Deserialiser { public: inline static double deserialise(std::istream &Stream_); @@ -113,13 +118,6 @@ Deserialiser >::deserialise( return SerialisedType_; } -template <> -icu::UnicodeString -Deserialiser::deserialise(std::istream &Stream_) { - std::string s = Deserialiser::deserialise(Stream_); - return icu::UnicodeString::fromUTF8(s); -} - template std::pair Deserialiser >::deserialise( @@ -185,6 +183,10 @@ char Deserialiser::deserialise(std::istream &Stream_) { return int_deserialise(Stream_); } +char Deserialiser::deserialise(std::istream &Stream_) { + return int_deserialise(Stream_); +} + double Deserialiser::deserialise(std::istream &Stream_) { union { uint64_t i; diff --git a/lttoolbox/entry_token.cc b/lttoolbox/entry_token.cc index f03bca5..e23db8a 100644 --- a/lttoolbox/entry_token.cc +++ b/lttoolbox/entry_token.cc @@ -61,7 +61,7 @@ EntryToken::destroy() } void -EntryToken::setParadigm(wstring const &np) +EntryToken::setParadigm(UString const &np) { parName = np; type = paradigm; @@ -77,7 +77,7 @@ EntryToken::setSingleTransduction(list const &pi, list const &pd, doub } void -EntryToken::setRegexp(wstring const &r) +EntryToken::setRegexp(UString const &r) { myregexp = r; type = regexp; @@ -101,7 +101,7 @@ EntryToken::isRegexp() const return type == regexp; } -wstring const & +UString const & EntryToken::paradigmName() const { return parName; @@ -119,7 +119,7 @@ EntryToken::right() const return rightSide; } -wstring const & +UString const & EntryToken::regExp() const { return myregexp; diff --git a/lttoolbox/entry_token.h b/lttoolbox/entry_token.h index 6b2886c..804162e 100644 --- a/lttoolbox/entry_token.h +++ b/lttoolbox/entry_token.h @@ -19,7 +19,7 @@ #include -#include +#include using namespace std; @@ -42,7 +42,7 @@ private: /** * Name of the paradigm (if it is of 'paradigm' 'type') */ - wstring parName; + UString parName; /** * Weight value for the entry (default_weight if unspecified) @@ -62,7 +62,7 @@ private: /** * Regular expression (if 'regexp') */ - wstring myregexp; + UString myregexp; /** * copy method @@ -99,7 +99,7 @@ public: * Sets the name of the paradigm. * @param np the paradigm name */ - void setParadigm(wstring const &np); + void setParadigm(UString const &np); /** * Set both parts of a single transduction. @@ -113,7 +113,7 @@ public: * Set regular expression. * @param r the regular expression specification. */ - void setRegexp(wstring const &r); + void setRegexp(UString const &r); /** * eTest EntryToken to detect if is a paradigm. @@ -137,7 +137,7 @@ public: * Retrieve the name of the paradigm. * @return the name of the paradigm. */ - wstring const & paradigmName() const; + UString const & paradigmName() const; /** * Retrieve the left part of the paradigm. @@ -155,7 +155,7 @@ public: * Retrieve the regular expression specification. * @return the regular expression specification. */ - wstring const & regExp() const; + UString const & regExp() const; /** * Retrieve the weight value of the entry. diff --git a/lttoolbox/expander.cc b/lttoolbox/expander.cc index 8592331..2e4de84 100644 --- a/lttoolbox/expander.cc +++ b/lttoolbox/expander.cc @@ -42,12 +42,12 @@ Expander::~Expander() } void -Expander::expand(string const &file, FILE *output) +Expander::expand(string const &file, UFILE* output) { reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - wcerr << "Error: Cannot open '" << file << "'." << endl; + cerr << "Error: Cannot open '" << file << "'." << endl; exit(EXIT_FAILURE); } @@ -60,7 +60,7 @@ Expander::expand(string const &file, FILE *output) if(ret != 0) { - wcerr << L"Error: Parse error at the end of input." << endl; + cerr << "Error: Parse error at the end of input." << endl; } xmlFreeTextReader(reader); @@ -78,17 +78,17 @@ Expander::procParDef() } else { - current_paradigm = L""; + current_paradigm.clear(); } } void -Expander::requireEmptyError(wstring const &name) +Expander::requireEmptyError(UString const &name) { if(!xmlTextReaderIsEmptyElement(reader)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; exit(EXIT_FAILURE); } } @@ -97,7 +97,7 @@ bool Expander::allBlanks() { bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString text = to_ustring((char*)xmlTextReaderConstValue(reader)); for(auto c : text) { @@ -108,16 +108,16 @@ Expander::allBlanks() } void -Expander::readString(wstring &result, wstring const &name) +Expander::readString(UString &result, UString const &name) { - if(name == L"#text") + if(name == Compiler::COMPILER_TEXT_NODE) { - wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - wstring escaped = L"^$/<>{}\\*@#+~:"; + UString value = to_ustring((char*)xmlTextReaderConstValue(reader)); + UString escaped = (const UChar*)"^$/<>{}\\*@#+~:"; for(size_t i = value.size()-1; i > 0; i--) { - if(escaped.find(value[i]) != wstring::npos) { - value.insert(value.begin()+i, L'\\'); + if(escaped.find(value[i]) != UString::npos) { + value.insert(value.begin()+i, '\\'); } } result.append(value); @@ -125,105 +125,105 @@ Expander::readString(wstring &result, wstring const &name) else if(name == Compiler::COMPILER_BLANK_ELEM) { requireEmptyError(name); - result += L' '; + result += ' '; } else if(name == Compiler::COMPILER_M_ELEM) { requireEmptyError(name); if(keep_boundaries) { - result += L'>'; + result += '>'; } } else if(name == Compiler::COMPILER_JOIN_ELEM) { requireEmptyError(name); - result += L'+'; + result += '+'; } else if(name == Compiler::COMPILER_POSTGENERATOR_ELEM) { requireEmptyError(name); - result += L'~'; + result += '~'; } else if(name == Compiler::COMPILER_GROUP_ELEM) { int type=xmlTextReaderNodeType(reader); if(type != XML_READER_TYPE_END_ELEMENT) { - result += L'#'; + result += '#'; } } else if(name == Compiler::COMPILER_S_ELEM) { requireEmptyError(name); - result += L'<'; + result += '<'; result.append(attrib(Compiler::COMPILER_N_ATTR)); - result += L'>'; + result += '>'; } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid specification of element '<" << name; - wcerr << L">' in this context." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid specification of element '<" << name; + cerr << ">' in this context." << endl; exit(EXIT_FAILURE); } } void -Expander::skipBlanks(wstring &name) +Expander::skipBlanks(UString &name) { - if(name == L"#text") + if(name == Compiler::COMPILER_TEXT_NODE) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = to_ustring((char*)xmlTextReaderConstName(reader)); } } void -Expander::skip(wstring &name, wstring const &elem) +Expander::skip(UString &name, UString const &elem) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = to_ustring((char*)xmlTextReaderConstName(reader)); - if(name == L"#text") + if(name == Compiler::COMPILER_TEXT_NODE) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = to_ustring((char*)xmlTextReaderConstName(reader)); } if(name != elem) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Expected '<" << elem << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Expected '<" << elem << ">'." << endl; exit(EXIT_FAILURE); } } -wstring +UString Expander::procIdentity() { - wstring both_sides = L""; + UString both_sides; if(!xmlTextReaderIsEmptyElement(reader)) { - wstring name = L""; + UString name; while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = to_ustring((char*)xmlTextReaderConstName(reader)); if(name == Compiler::COMPILER_IDENTITY_ELEM) { break; @@ -234,21 +234,21 @@ Expander::procIdentity() return both_sides; } -pair +pair Expander::procIdentityGroup() { - wstring lhs = L""; - wstring rhs = L"#"; - wstring both_sides = L""; + UString lhs; + UString rhs = (const UChar*)"#"; + UString both_sides; if(!xmlTextReaderIsEmptyElement(reader)) { - wstring name = L""; + UString name; while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = to_ustring((char*)xmlTextReaderConstName(reader)); if(name == Compiler::COMPILER_IDENTITYGROUP_ELEM) { break; @@ -259,25 +259,25 @@ Expander::procIdentityGroup() lhs += both_sides; rhs += both_sides; - pair e(lhs, rhs); + pair e(lhs, rhs); return e; } -pair +pair Expander::procTransduction() { - wstring lhs = L"", rhs = L""; - wstring name = L""; + UString lhs, rhs; + UString name; skip(name, Compiler::COMPILER_LEFT_ELEM); if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = to_ustring((char*)xmlTextReaderConstName(reader)); if(name == Compiler::COMPILER_LEFT_ELEM) { break; @@ -290,11 +290,11 @@ Expander::procTransduction() if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = to_ustring((char*)xmlTextReaderConstName(reader)); if(name == Compiler::COMPILER_RIGHT_ELEM) { break; @@ -305,67 +305,67 @@ Expander::procTransduction() skip(name, Compiler::COMPILER_PAIR_ELEM); - pair e(lhs, rhs); + pair e(lhs, rhs); return e; } -wstring -Expander::attrib(wstring const &name) +UString +Expander::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } -wstring +UString Expander::procPar() { EntryToken e; - wstring paradigm_name = attrib(Compiler::COMPILER_N_ATTR); + UString paradigm_name = attrib(Compiler::COMPILER_N_ATTR); return paradigm_name; } void -Expander::requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname) +Expander::requireAttribute(UString const &value, UString const &attrname, + UString const &elemname) { - if(value == L"") + if(value.empty()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): '<" << elemname; - wcerr << L"' element must specify non-void '"; - wcerr<< attrname << L"' attribute." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): '<" << elemname; + cerr << "' element must specify non-void '"; + cerr<< attrname << "' attribute." << endl; exit(EXIT_FAILURE); } } void -Expander::procEntry(FILE *output) +Expander::procEntry(UFILE* output) { - wstring attribute = this->attrib(Compiler::COMPILER_RESTRICTION_ATTR); - wstring entrname = this->attrib(Compiler::COMPILER_LEMMA_ATTR); - wstring altval = this->attrib(Compiler::COMPILER_ALT_ATTR); - wstring varval = this->attrib(Compiler::COMPILER_V_ATTR); - wstring varl = this->attrib(Compiler::COMPILER_VL_ATTR); - wstring varr = this->attrib(Compiler::COMPILER_VR_ATTR); - wstring wsweight = this->attrib(Compiler::COMPILER_WEIGHT_ATTR); - - wstring myname = L""; - if(this->attrib(Compiler::COMPILER_IGNORE_ATTR) == L"yes" - || (altval != L"" && altval != alt) - || (varval != L"" && varval != variant && attribute == Compiler::COMPILER_RESTRICTION_RL_VAL) - || ((varl != L"" && varl != variant_left) && (varr != L"" && varr != variant_right)) - || (varl != L"" && varl != variant_left && attribute == Compiler::COMPILER_RESTRICTION_RL_VAL) - || (varr != L"" && varr != variant_right && attribute == Compiler::COMPILER_RESTRICTION_LR_VAL)) + UString attribute = this->attrib(Compiler::COMPILER_RESTRICTION_ATTR); + UString entrname = this->attrib(Compiler::COMPILER_LEMMA_ATTR); + UString altval = this->attrib(Compiler::COMPILER_ALT_ATTR); + UString varval = this->attrib(Compiler::COMPILER_V_ATTR); + UString varl = this->attrib(Compiler::COMPILER_VL_ATTR); + UString varr = this->attrib(Compiler::COMPILER_VR_ATTR); + UString wsweight = this->attrib(Compiler::COMPILER_WEIGHT_ATTR); + + UString myname; + if(this->attrib(Compiler::COMPILER_IGNORE_ATTR) == Compiler::COMPILER_IGNORE_YES_VAL + || (!altval.empty() && altval != alt) + || (!varval.empty() && varval != variant && attribute == Compiler::COMPILER_RESTRICTION_RL_VAL) + || ((!varl.empty() && varl != variant_left) && (!varr.empty() && varr != variant_right)) + || (!varl.empty() && varl != variant_left && attribute == Compiler::COMPILER_RESTRICTION_RL_VAL) + || (!varr.empty() && varr != variant_right && attribute == Compiler::COMPILER_RESTRICTION_LR_VAL)) { do { int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - myname = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + myname = to_ustring((char*)xmlTextReaderConstName(reader)); } while(myname != Compiler::COMPILER_ENTRY_ELEM); return; @@ -373,19 +373,19 @@ Expander::procEntry(FILE *output) EntList items, items_lr, items_rl; if(attribute == Compiler::COMPILER_RESTRICTION_LR_VAL - || (varval != L"" && varval != variant && attribute != Compiler::COMPILER_RESTRICTION_RL_VAL) - || (varl != L"" && varl != variant_left)) + || (!varval.empty() && varval != variant && attribute != Compiler::COMPILER_RESTRICTION_RL_VAL) + || (!varl.empty() && varl != variant_left)) { - items_lr.push_back(make_pair(L"", L"")); + items_lr.push_back(make_pair((const UChar*)"", (const UChar*)"")); } else if(attribute == Compiler::COMPILER_RESTRICTION_RL_VAL - || (varr != L"" && varr != variant_right)) + || (!varr.empty() && varr != variant_right)) { - items_rl.push_back(make_pair(L"", L"")); + items_rl.push_back(make_pair((const UChar*)"", (const UChar*)"")); } else { - items.push_back(make_pair(L"", L"")); + items.push_back(make_pair((const UChar*)"", (const UChar*)"")); } while(true) @@ -393,53 +393,53 @@ Expander::procEntry(FILE *output) int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = to_ustring((char*)xmlTextReaderConstName(reader)); skipBlanks(name); int type = xmlTextReaderNodeType(reader); if(name == Compiler::COMPILER_PAIR_ELEM) { - pair p = procTransduction(); + pair p = procTransduction(); append(items, p); append(items_lr, p); append(items_rl, p); } else if(name == Compiler::COMPILER_IDENTITY_ELEM) { - wstring val = procIdentity(); + UString val = procIdentity(); append(items, val); append(items_lr, val); append(items_rl, val); } else if(name == Compiler::COMPILER_IDENTITYGROUP_ELEM) { - pair p = procIdentityGroup(); + pair p = procIdentityGroup(); append(items, p); append(items_lr, p); append(items_rl, p); } else if(name == Compiler::COMPILER_REGEXP_ELEM) { - wstring val = L"__REGEXP__" + procRegexp(); + UString val = (const UChar*)"__REGEXP__" + procRegexp(); append(items, val); append(items_lr, val); append(items_rl, val); } else if(name == Compiler::COMPILER_PAR_ELEM) { - wstring p = procPar(); + UString p = procPar(); // detection of the use of undefined paradigms if(paradigm.find(p) == paradigm.end() && paradigm_lr.find(p) == paradigm_lr.end() && paradigm_rl.find(p) == paradigm_rl.end()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined paradigm '" << p << L"'." <', output); - fputwc_unlocked(L':', output); - fputws_unlocked(it.second.c_str(), output); - fputwc_unlocked(L'\n', output); + u_fputs(it.first, output); + u_fputc(':', output); + u_fputc('>', output); + u_fputc(':', output); + u_fputs(it.second, output); + u_fputc('\n', output); } for(auto& it : items_rl) { - fputws_unlocked(it.first.c_str(), output); - fputwc_unlocked(L':', output); - fputwc_unlocked(L'<', output); - fputwc_unlocked(L':', output); - fputws_unlocked(it.second.c_str(), output); - fputwc_unlocked(L'\n', output); + u_fputs(it.first, output); + u_fputc(':', output); + u_fputc('<', output); + u_fputc(':', output); + u_fputs(it.second, output); + u_fputc('\n', output); } } else @@ -531,31 +531,30 @@ Expander::procEntry(FILE *output) return; } - else if(name == L"#text" && allBlanks()) + else if(name == Compiler::COMPILER_TEXT_NODE && allBlanks()) { } - else if(name == L"#comment") + else if(name == Compiler::COMPILER_COMMENT_NODE) { } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << Compiler::COMPILER_ENTRY_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << Compiler::COMPILER_ENTRY_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } } } void -Expander::procNode(FILE *output) +Expander::procNode(UFILE *output) { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); + UString name = to_ustring((char*)xmlTextReaderConstName(reader)); // DO: optimize the execution order of this string "ifs" - if(name == L"#text") + if(name == Compiler::COMPILER_TEXT_NODE) { /* ignorar */ } @@ -591,23 +590,23 @@ Expander::procNode(FILE *output) { /* ignorar */ } - else if(name == L"#comment") + else if(name == Compiler::COMPILER_COMMENT_NODE) { /* ignorar */ } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } -wstring +UString Expander::procRegexp() { xmlTextReaderRead(reader); - wstring re = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString re = to_ustring((char*)xmlTextReaderConstValue(reader)); xmlTextReaderRead(reader); return re; } @@ -622,7 +621,7 @@ Expander::append(EntList &result, { for(auto& it2 : endings) { - temp.push_back(pair(it.first + it2.first, + temp.push_back(pair(it.first + it2.first, it.second + it2.second)); } } @@ -631,7 +630,7 @@ Expander::append(EntList &result, } void -Expander::append(EntList &result, wstring const &endings) +Expander::append(EntList &result, UString const &endings) { for(auto& it : result) { @@ -642,7 +641,7 @@ Expander::append(EntList &result, wstring const &endings) void Expander::append(EntList &result, - pair const &endings) + pair const &endings) { for(auto& it : result) { @@ -652,27 +651,27 @@ Expander::append(EntList &result, } void -Expander::setAltValue(string const &a) +Expander::setAltValue(UString const &a) { - alt = XMLParseUtil::stows(a); + alt = a; } void -Expander::setVariantValue(string const &v) +Expander::setVariantValue(UString const &v) { - variant = XMLParseUtil::stows(v); + variant = v; } void -Expander::setVariantLeftValue(string const &v) +Expander::setVariantLeftValue(UString const &v) { - variant_left = XMLParseUtil::stows(v); + variant_left = v; } void -Expander::setVariantRightValue(string const &v) +Expander::setVariantRightValue(UString const &v) { - variant_right = XMLParseUtil::stows(v); + variant_right = v; } void @@ -680,4 +679,3 @@ Expander::setKeepBoundaries(bool keep) { keep_boundaries = keep; } - diff --git a/lttoolbox/expander.h b/lttoolbox/expander.h index 74da7e2..127929f 100644 --- a/lttoolbox/expander.h +++ b/lttoolbox/expander.h @@ -27,7 +27,7 @@ using namespace std; -typedef list > EntList; +typedef list > EntList; /** * An expander of dictionaries @@ -43,33 +43,33 @@ private: /** * The alt value */ - wstring alt; + UString alt; /** * The variant value (monodix) */ - wstring variant; + UString variant; /** * The variant value (left side of bidix) */ - wstring variant_left; + UString variant_left; /** * The variant value (right side of bidix) */ - wstring variant_right; + UString variant_right; /** * The paradigm being compiled */ - wstring current_paradigm; + UString current_paradigm; /** * The direction of the compilation, 'lr' (left-to-right) or 'rl' * (right-to-left) */ - wstring direction; + UString direction; /** * Do we print boundaries or not? @@ -79,16 +79,16 @@ private: /** * Paradigms */ - map paradigm; + map paradigm; - map paradigm_lr; + map paradigm_lr; - map paradigm_rl; + map paradigm_rl; /** * Method to parse an XML Node */ - void procNode(FILE *output); + void procNode(UFILE* output); /** * Parse the <pardef> element @@ -98,67 +98,67 @@ private: /** * Parse the <e> element */ - void procEntry(FILE *output); + void procEntry(UFILE* output); /** * Parse the <re> element * @return the string representing the regular expression */ - wstring procRegexp(); + UString procRegexp(); /** * Gets an attribute value with their name and the current context * @param name the name of the attribute * @return the value of the attribute */ - wstring attrib(wstring const &name); + UString attrib(UString const &name); /** * Parse the <p> element * @return a pair of strings, left part and right part of a transduction */ - pair procTransduction(); + pair procTransduction(); /** * Parse the <i> element * @return a string from the dictionary's entry */ - wstring procIdentity(); + UString procIdentity(); /** * Parse the <ig> element * @return a pair of strings, whose right part begins with '#' * but are otherwise identical */ - pair procIdentityGroup(); + pair procIdentityGroup(); /** * Parse the <par> element * @return the name of the paradigm */ - wstring procPar(); + UString procPar(); /** * Skip all document #text nodes before "elem" * @param name the name of the node * @param elem the name of the expected node */ - void skip(wstring &name, wstring const &elem); + void skip(UString &name, UString const &elem); /** * Skip all blank #text nodes before "name" * @param name the name of the node */ - void skipBlanks(wstring &name); + void skipBlanks(UString &name); - void readString(wstring &result, wstring const &name); + void readString(UString &result, UString const &name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(wstring const &name); + void requireEmptyError(UString const &name); /** * Force an attribute to be specified, amd check for it @@ -166,8 +166,8 @@ private: * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname); + void requireAttribute(UString const &value, UString const &attrname, + UString const &elemname); /** * True if all the elements in the current node are blanks @@ -181,8 +181,8 @@ private: * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(list > &result, - list > const &endings); + static void append(list > &result, + list > const &endings); /** * Append a list of endings to a list of current transductions. @@ -190,8 +190,8 @@ private: * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(list > &result, - wstring const &endings); + static void append(list > &result, + UString const &endings); /** * Append a list of endings to a list of current transductions. @@ -199,8 +199,8 @@ private: * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(list > &result, - pair const &endings); + static void append(list > &result, + pair const &endings); public: /** @@ -216,31 +216,31 @@ public: /** * Compile dictionary to letter transducers */ - void expand(string const &file, FILE *output); + void expand(string const &file, UFILE* output); /** * Set the alt value to use in compilation * @param a the value */ - void setAltValue(string const &a); + void setAltValue(UString const &a); /** * Set the variant value to use in expansion * @param v the value */ - void setVariantValue(string const &v); + void setVariantValue(UString const &v); /** * Set the variant_left value to use in expansion * @param v the value */ - void setVariantLeftValue(string const &v); + void setVariantLeftValue(UString const &v); /** * Set the variant_right value to use in expansion * @param v the value */ - void setVariantRightValue(string const &v); + void setVariantRightValue(UString const &v); /** * Set if we are going to keep morpheme boundaries diff --git a/lttoolbox/lt_comp.cc b/lttoolbox/lt_comp.cc index 9d05c21..ae31314 100644 --- a/lttoolbox/lt_comp.cc +++ b/lttoolbox/lt_comp.cc @@ -103,21 +103,21 @@ int main(int argc, char *argv[]) switch (cnt) { case 'a': - c.setAltValue(optarg); + c.setAltValue(to_ustring(optarg)); break; case 'v': - c.setVariantValue(optarg); + c.setVariantValue(to_ustring(optarg)); break; case 'l': vl = optarg; - c.setVariantLeftValue(vl); + c.setVariantLeftValue(to_ustring(optarg)); break; case 'r': vr = optarg; - c.setVariantRightValue(vr); + c.setVariantRightValue(to_ustring(optarg)); break; case 'm': @@ -192,7 +192,7 @@ int main(int argc, char *argv[]) if(ttype == 'a') { LtLocale::tryToSetLocale(); - a.parse(infile, Compiler::COMPILER_RESTRICTION_LR_VAL); + a.parse(infile, false); } else { @@ -214,7 +214,7 @@ int main(int argc, char *argv[]) if(ttype == 'a') { LtLocale::tryToSetLocale(); - a.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL); + a.parse(infile, true); } else { diff --git a/lttoolbox/regexp_compiler.cc b/lttoolbox/regexp_compiler.cc index e94ee9f..4e04c0d 100644 --- a/lttoolbox/regexp_compiler.cc +++ b/lttoolbox/regexp_compiler.cc @@ -74,17 +74,17 @@ RegexpCompiler::isReserved(int const t) { switch(t) { - case L'(': - case L')': - case L'[': - case L']': - case L'*': - case L'?': - case L'+': - case L'-': - case L'^': - case L'\\': - case L'|': + case '(': + case ')': + case '[': + case ']': + case '*': + case '?': + case '+': + case '-': + case '^': + case '\\': + case '|': case FIN_FICHERO: return true; @@ -129,7 +129,7 @@ RegexpCompiler::consume(int const t) } void -RegexpCompiler::compile(wstring const &er) +RegexpCompiler::compile(UString const &er) { input = er; token = static_cast(input[0]); @@ -141,7 +141,7 @@ RegexpCompiler::compile(wstring const &er) void RegexpCompiler::S() { - if(token == L'(' || token == L'[' || !isReserved(token) || token == L'\\') + if(token == '(' || token == '[' || !isReserved(token) || token == '\\') { RExpr(); Cola(); @@ -155,7 +155,7 @@ RegexpCompiler::S() void RegexpCompiler::RExpr() { - if(token == L'(' || token == L'[' || !isReserved(token) || token == L'\\') + if(token == '(' || token == '[' || !isReserved(token) || token == '\\') { Term(); RExprp(); @@ -169,14 +169,14 @@ RegexpCompiler::RExpr() void RegexpCompiler::Cola() { - if(token == FIN_FICHERO || token == L')') + if(token == FIN_FICHERO || token == ')') { } - else if(token == L'|') + else if(token == '|') { int e = state; state = transducer.getInitial(); - consume(L'|'); + consume('|'); RExpr(); Cola(); @@ -192,7 +192,7 @@ RegexpCompiler::Cola() void RegexpCompiler::Term() { - if(!isReserved(token) || token == L'\\') + if(!isReserved(token) || token == '\\') { Transducer t; int e = t.getInitial(); @@ -216,15 +216,15 @@ RegexpCompiler::Term() postop = L""; state = transducer.insertTransducer(state, t, (*alphabet)(0, 0)); } - else if(token == L'(') + else if(token == '(') { Transducer t = transducer; int e = state; transducer.clear(); state = transducer.getInitial(); - consume(L'('); + consume('('); S(); - consume(L')'); + consume(')'); transducer.setFinal(state, default_weight); Postop(); if(postop == L"*") @@ -244,9 +244,9 @@ RegexpCompiler::Term() state = t.insertTransducer(e, transducer, (*alphabet)(0, 0)); transducer = t; } - else if(token == L'[') + else if(token == '[') { - consume(L'['); + consume('['); Esp(); } else @@ -258,12 +258,12 @@ RegexpCompiler::Term() void RegexpCompiler::RExprp() { - if(token == L'(' || token == L'[' || !isReserved(token) || token == L'\\') + if(token == '(' || token == '[' || !isReserved(token) || token == '\\') { Term(); RExprp(); } - else if(token == L'|' || token == FIN_FICHERO || token == L')') + else if(token == '|' || token == FIN_FICHERO || token == ')') { } else @@ -280,9 +280,9 @@ RegexpCompiler::Letra() letter = token; consume(token); } - else if(token == L'\\') + else if(token == '\\') { - consume(L'\\'); + consume('\\'); letter = token; Reservado(); } @@ -295,24 +295,24 @@ RegexpCompiler::Letra() void RegexpCompiler::Postop() { - if(token == L'*') + if(token == '*') { - consume(L'*'); + consume('*'); postop = L"*"; } - else if(token == L'?') + else if(token == '?') { - consume(L'?'); + consume('?'); postop = L"?"; } - else if(token == L'+') + else if(token == '+') { - consume(L'+'); + consume('+'); postop = L"+"; } - else if(token == L'(' || token == L'[' || !isReserved(token) || - token == L'\\' || token == L'|' || token == FIN_FICHERO || - token == L')') + else if(token == '(' || token == '[' || !isReserved(token) || + token == '\\' || token == '|' || token == FIN_FICHERO || + token == ')') { } else @@ -325,10 +325,10 @@ void RegexpCompiler::Esp() { Transducer t; - if(!isReserved(token) || token == L'\\' || token == L']') + if(!isReserved(token) || token == '\\' || token == ']') { Lista(); - consume(L']'); + consume(']'); Postop(); for(set::iterator it = brackets.begin(); @@ -342,11 +342,11 @@ RegexpCompiler::Esp() t.joinFinals((*alphabet)(0, 0)); } - else if(token == L'^') + else if(token == '^') { - consume(L'^'); + consume('^'); Lista(); - consume(L']'); + consume(']'); Postop(); for(int i = 0; i < 256 ;i++) @@ -388,12 +388,12 @@ RegexpCompiler::Esp() void RegexpCompiler::Lista() { - if(!isReserved(token) || token == L'\\') + if(!isReserved(token) || token == '\\') { Elem(); Lista(); } - else if(token == L']') + else if(token == ']') { } else @@ -418,7 +418,7 @@ RegexpCompiler::Reservado() void RegexpCompiler::Elem() { - if(!isReserved(token) || token == L'\\') + if(!isReserved(token) || token == '\\') { Letra(); int rango1 = letter; @@ -446,12 +446,12 @@ RegexpCompiler::Elem() void RegexpCompiler::ColaLetra() { - if(token == L'-') + if(token == '-') { - consume(L'-'); + consume('-'); Letra(); } - else if(!isReserved(token) || token == L'\\' || token == L']') + else if(!isReserved(token) || token == '\\' || token == ']') { } else diff --git a/lttoolbox/regexp_compiler.h b/lttoolbox/regexp_compiler.h index dd11ca9..da428d9 100644 --- a/lttoolbox/regexp_compiler.h +++ b/lttoolbox/regexp_compiler.h @@ -17,6 +17,7 @@ #ifndef _REGEXP_COMPILER_ #define _REGEXP_COMPILER_ +#include #include #include @@ -41,7 +42,7 @@ private: /** * Input string */ - wstring input; + UString input; /** * Alphabet to encode symbols @@ -66,7 +67,7 @@ private: /** * Post-operator: '+', '?', '*' */ - wstring postop; + UString postop; /** * Default value of weight @@ -200,7 +201,7 @@ public: * Function that parses a regular expression and produces a transducer * @param er the regular expression */ - void compile(wstring const &er); + void compile(UString const &er); /** * Set the decoder of symbols diff --git a/lttoolbox/serialiser.h b/lttoolbox/serialiser.h index 99f85b1..833b06b 100644 --- a/lttoolbox/serialiser.h +++ b/lttoolbox/serialiser.h @@ -29,8 +29,7 @@ #include #include #include - -#include +#include namespace { template @@ -110,6 +109,12 @@ public: std::ostream &Output); }; +template <> class Serialiser { +public: + inline static void serialise(const UChar &SerialisedType_, + std::ostream &Output); +}; + template<> class Serialiser { public: inline static void serialise(const double &SerialisedType_, @@ -145,13 +150,6 @@ void Serialiser >::serialise( } } -template <> -void Serialiser::serialise(const icu::UnicodeString& s, - std::ostream& Output) { - std::string temp; - ::serialise(s.toUTF8String(temp), Output); -} - template void Serialiser >::serialise( const std::pair &SerialisedType_, @@ -232,6 +230,11 @@ void Serialiser::serialise(const char &SerialisedType_, int_serialise((uint8_t)SerialisedType_, Output); } +void Serialiser::serialise(const UChar &SerialisedType_, + std::ostream &Output) { + int_serialise((uint16_t)SerialisedType_, Output); +} + void Serialiser::serialise(const double &SerialisedType_, std::ostream &Output) { union { diff --git a/lttoolbox/string_utils.cc b/lttoolbox/string_utils.cc deleted file mode 100644 index 3ee4e94..0000000 --- a/lttoolbox/string_utils.cc +++ /dev/null @@ -1,7 +0,0 @@ -#include "string_utils.h" - -void -u_fputs(const UnicodeString str, UFILE* output) -{ - u_fprintf(output, "%S", str.getTerminatedBuffer()); -} diff --git a/lttoolbox/string_utils.h b/lttoolbox/string_utils.h deleted file mode 100644 index 5cf6b9e..0000000 --- a/lttoolbox/string_utils.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _LT_STRING_UTILS_H_ -#define _LT_STRING_UTILS_H_ - -#include -#include - -void u_fputs(const UnicodeString str, UFILE* output); - -#endif \ No newline at end of file diff --git a/lttoolbox/ustring.cc b/lttoolbox/ustring.cc new file mode 100644 index 0000000..a20ccd9 --- /dev/null +++ b/lttoolbox/ustring.cc @@ -0,0 +1,61 @@ +#include "ustring.h" + +#include +#include + +using namespace icu; + +void +u_fputs(const UString& str, UFILE* output) +{ + u_fputs(str.c_str(), output); +} + +int +stoi(const UString& str) +{ + int ret; + int c = u_sscanf(str.c_str(), "%d", &ret); + if (c != 1) { + throw std::invalid_argument(); + } + return ret; +} + +double +stod(const UString& str) +{ + double ret; + int c = u_sscanf(str.c_str(), "%f", &ret); + if (c != 1) { + throw std::invalid_argument(); + } + return ret; +} + +UString +to_ustring(const char* s) +{ + UnicodeString temp = UnicodeString::fromUTF8(s); + UString ret = temp.getTerminatedBuffer(); + return ret; +} + +char* +to_char(const UString& str) +{ + std::string stemp; + UnicodeString utemp = str.c_str(); + utemp.toUTF8String(stemp); + return stemp.c_str(); +} + +static std::ostream& +operator<<(std::ostream& ostr, const UString& str) +{ + std::string res; + UnicodeString temp = str.c_str(); + temp.toUTF8String(res); + ostr << res; + return ostr; +} diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h new file mode 100644 index 0000000..34c4b0c --- /dev/null +++ b/lttoolbox/ustring.h @@ -0,0 +1,26 @@ +#ifndef _LT_USTRING_H_ +#define _LT_USTRING_H_ + +#include +#include +#include + +typedef std::basic_string UString; + +void u_fputs(const UString& str, UFILE* output); + +// like std::stoi, throws invalid_argument if unable to parse +int stoi(const UString& str); + +// like std::stoi, throws invalid_argument if unable to parse +double stod(const UString& str); + +// for command-line arguments +UString to_ustring(const char* str); + +// for interfacing with e.g. XML library +char* to_char(const UString& str); + +static std::ostream& operator<<(std::ostream& ostr, const UString& str); + +#endif diff --git a/lttoolbox/xml_parse_util.cc b/lttoolbox/xml_parse_util.cc index 3149900..2a82701 100644 --- a/lttoolbox/xml_parse_util.cc +++ b/lttoolbox/xml_parse_util.cc @@ -21,34 +21,23 @@ using namespace std; -wstring -XMLParseUtil::attrib(xmlTextReaderPtr reader, wstring const &name) +UString +XMLParseUtil::attrib(xmlTextReaderPtr reader, UString const &name) { - string mystr = ""; - for(int i = 0, limit = name.size(); i != limit; i++) - { - mystr += static_cast(name[i]); - } - - xmlChar *attrname = xmlCharStrdup(mystr.c_str()); + xmlChar *attrname = xmlCharStrdup(to_char(name)); xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); - wstring result = towstring(myattr); + UString result = to_ustring(myattr); xmlFree(myattr); xmlFree(attrname); return result; } -wstring -XMLParseUtil::attrib(xmlTextReaderPtr reader, wstring const &name, const wstring fallback) +UString +XMLParseUtil::attrib(xmlTextReaderPtr reader, UString const &name, const UString fallback) { - string mystr = ""; - for (int i = 0, limit = name.size(); i != limit; i++) { - mystr += static_cast(name[i]); - } - - xmlChar *attrname = xmlCharStrdup(mystr.c_str()); + xmlChar *attrname = xmlCharStrdup(to_char(name)); xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); - wstring result = XMLParseUtil::towstring(myattr); + UString result = to_ustring(myattr); xmlFree(myattr); xmlFree(attrname); if(myattr == NULL) { @@ -58,88 +47,3 @@ XMLParseUtil::attrib(xmlTextReaderPtr reader, wstring const &name, const wstring return result; } } - - -string -XMLParseUtil::latin1(xmlChar const *input) -{ - if(input == NULL) - { - return ""; - } - - int outputlen = xmlStrlen(input) + 1; - int inputlen = xmlStrlen(input); - - unsigned char* output = new unsigned char[outputlen]; - - if(UTF8Toisolat1(output, &outputlen, input, &inputlen) != 0) - { - } - - output[outputlen] = 0; - string result = reinterpret_cast(output); - delete[] output; - return result; -} - -wstring -XMLParseUtil::towstring(xmlChar const * input) -{ - wstring result = L""; - - for(int i = 0, limit = xmlStrlen(input); i != limit; i++) - { - int val = 0; - if(((unsigned char) input[i] & 0x80) == 0x0) - { - val = static_cast(input[i]); - } - else if(((unsigned char) input[i] & 0xE0) == 0xC0) - { - val = (input[i] & 0x1F) << 6; - i++; - val += input[i] & 0x7F; - } - else if(((unsigned char) input[i] & 0xF0) == 0xE0) - { - val = (input[i] & 0x0F) << 6; - i++; - val += input[i] & 0x7F; - val = val << 6; - i++; - val += input[i] & 0x7F; - } - else if(((unsigned char) input[i] & 0xF8) == 0xF0) - { - val = (input[i] & 0x07) << 6; - i++; - val += input[i] & 0x7F; - val = val << 6; - i++; - val += input[i] & 0x7F; - val = val << 6; - i++; - val += input[i] & 0x7F; - } - else - { - wcerr << L"UTF-8 invalid string" << endl; - exit(EXIT_FAILURE); - } - - result += static_cast(val); - } - return result; -} - -wstring -XMLParseUtil::stows(string const &str) -{ - wchar_t* result = new wchar_t[str.size()+1]; - size_t retval = mbstowcs(result, str.c_str(), str.size()); - result[retval] = L'\0'; - wstring result2 = result; - delete[] result; - return result2; -} diff --git a/lttoolbox/xml_parse_util.h b/lttoolbox/xml_parse_util.h index beca741..eaf34a0 100644 --- a/lttoolbox/xml_parse_util.h +++ b/lttoolbox/xml_parse_util.h @@ -19,8 +19,7 @@ #include #include -#include -#include +#include using namespace std; @@ -29,14 +28,12 @@ class XMLParseUtil public: /* If attrib does not exist (or other error), returns an empty string: */ - static wstring attrib(xmlTextReaderPtr reader, wstring const &name); + static UString attrib(xmlTextReaderPtr reader, UString const &name); /* If attrib does not exist (or other error), returns fallback: */ - static wstring attrib(xmlTextReaderPtr reader, wstring const &name, const wstring fallback); + static UString attrib(xmlTextReaderPtr reader, UString const &name, const UString fallback); - static string latin1(xmlChar const * input); // mark for deletion - static wstring towstring(xmlChar const * input); - static wstring stows(string const &str); + static UString toUString(xmlChar const * input); }; #endif