commit 9ece019c9d7e2a2e70d56a9bbd8f9d06b41d86c5 Author: Daniel Swanson Date: Fri Jun 11 14:24:15 2021 -0500 use ICU diff --git a/configure.ac b/configure.ac index 5b60c37..b200861 100644 --- a/configure.ac +++ b/configure.ac @@ -38,13 +38,19 @@ PKG_CHECK_MODULES([LIBXML], [libxml-2.0 >= required_libxml_version]) AC_SUBST(LIBXML_CFLAGS) AC_SUBST(LIBXML_LIBS) +PKG_CHECK_MODULES([ICU], [icu-i18n, icu-io, icu-uc]) + +AC_SUBST(ICU_CFLAGS) +AC_SUBST(ICU_LIBS) + # Checks for libraries. AC_CHECK_LIB(xml2, xmlReaderForFile) -AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked]) +AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked]) +AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) -CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS" -LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS" +CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS" +LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS $ICU_LIBS" # Checks for highest supported C++ standard AC_LANG(C++) diff --git a/src/lsx_comp.cc b/src/lsx_comp.cc index 4f9f5a6..905395e 100644 --- a/src/lsx_comp.cc +++ b/src/lsx_comp.cc @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -29,7 +30,7 @@ int main (int argc, char** argv) Compiler c; - wstring dir; + UString dir; if(strcmp(argv[1], "lr") == 0) { diff --git a/src/lsx_compiler.cc b/src/lsx_compiler.cc index 1e5b4c8..069c890 100644 --- a/src/lsx_compiler.cc +++ b/src/lsx_compiler.cc @@ -29,14 +29,15 @@ using namespace std; // Removed static globals copied from lttoolbox's compiler.cc. Same namespace, same mangling, bad result. -wstring const Compiler::COMPILER_ANYTAG_ELEM = L"t"; -wstring const Compiler::COMPILER_ANYCHAR_ELEM = L"w"; -wstring const Compiler::COMPILER_WB_ELEM = L"j"; +UString const Compiler::COMPILER_ANYTAG_ELEM = "t"_u; +UString const Compiler::COMPILER_ANYCHAR_ELEM = "w"_u; +UString const Compiler::COMPILER_WB_ELEM = "j"_u; Compiler::Compiler() : reader(0), verbose(false), -first_element(false) +first_element(false), +any_tag(0), any_char(0), word_boundary(0) { } @@ -45,19 +46,22 @@ Compiler::~Compiler() } void -Compiler::parse(string const &fichero, wstring const &dir) +Compiler::parse(string const &fichero, UString const &dir) { direction = dir; reader = xmlReaderForFile(fichero.c_str(), NULL, 0); if(reader == NULL) { - wcerr << "Error: Cannot open '" << fichero.c_str() << "'." << endl; + cerr << "Error: Cannot open '" << fichero.c_str() << "'." << endl; exit(EXIT_FAILURE); } - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L"<$>"); + alphabet.includeSymbol(Transducer::ANY_TAG_SYMBOL); + alphabet.includeSymbol(Transducer::ANY_CHAR_SYMBOL); + alphabet.includeSymbol(Transducer::LSX_BOUNDARY_SYMBOL); + any_tag = alphabet(Transducer::ANY_TAG_SYMBOL); + any_char = alphabet(Transducer::ANY_CHAR_SYMBOL); + word_boundary = alphabet(Transducer::LSX_BOUNDARY_SYMBOL); int ret = xmlTextReaderRead(reader); while(ret == 1) @@ -69,7 +73,7 @@ Compiler::parse(string const &fichero, wstring const &dir) if(ret != 0) { - wcerr << L"Error: Parse error at the end of input." << endl; + cerr << "Error: Parse error at the end of input." << endl; } xmlFreeTextReader(reader); @@ -77,19 +81,16 @@ Compiler::parse(string const &fichero, wstring const &dir) // Minimize transducers and ensure that all paths end with <$> - int end_trans = alphabet(alphabet(L"<$>"), alphabet(L"<$>")); - for(map::iterator it = sections.begin(), - limit = sections.end(); - it != limit; it++) - { - (it->second).minimize(); + int end_trans = alphabet(word_boundary, word_boundary); + for (auto& it : sections) { + it.second.minimize(); // any paths which did not already end with <$> now will // having 2 finals isn't a problem because -separable only checks // for finals when it reads $, and you can't have 2 of those in a row - for(auto fin : (it->second).getFinals()) + for(auto fin : it.second.getFinals()) { - int end_state = (it->second).insertSingleTransduction(end_trans, fin.first); - (it->second).setFinal(end_state); + int end_state = it.second.insertSingleTransduction(end_trans, fin.first); + it.second.setFinal(end_state); } } } @@ -105,8 +106,7 @@ Compiler::procAlphabet() int ret = xmlTextReaderRead(reader); if(ret == 1) { - xmlChar const *valor = xmlTextReaderConstValue(reader); - letters = XMLParseUtil::towstring(valor); + UString letters = XMLParseUtil::readValue(reader); bool espai = true; for(unsigned int i = 0; i < letters.length(); i++) { @@ -118,13 +118,13 @@ Compiler::procAlphabet() } if(espai == true) // libxml2 returns '\n' for , should be empty { - letters = L""; + letters.clear(); } } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Missing alphabet symbols." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Missing alphabet symbols." << endl; exit(EXIT_FAILURE); } } @@ -133,7 +133,11 @@ Compiler::procAlphabet() void Compiler::procSDef() { - alphabet.includeSymbol(L"<"+attrib(COMPILER_N_ATTR)+L">"); + UString s; + s += '<'; + s.append(attrib(COMPILER_N_ATTR)); + s += '>'; + alphabet.includeSymbol(s); } void @@ -151,15 +155,15 @@ Compiler::procParDef() { paradigms[current_paradigm].minimize(); paradigms[current_paradigm].joinFinals(); - current_paradigm = L""; + current_paradigm.clear(); } } } int -Compiler::matchTransduction(list const &pi, list const &pd, int estado, Transducer &t) +Compiler::matchTransduction(vector const &pi, vector const &pd, int estado, Transducer &t) { - list::const_iterator izqda, dcha, limizqda, limdcha; + vector::const_iterator izqda, dcha, limizqda, limdcha; if(direction == COMPILER_RESTRICTION_LR_VAL) { @@ -183,8 +187,6 @@ Compiler::matchTransduction(list const &pi, list const &pd, int estado } else { - int rsymbol = 0; - while(true) { int etiqueta; @@ -202,33 +204,31 @@ Compiler::matchTransduction(list const &pi, list const &pd, int estado else if(dcha == limdcha) { etiqueta = alphabet(*izqda, 0); - rsymbol = 0; izqda++; } else { etiqueta = alphabet(*izqda, *dcha); - rsymbol = *dcha; izqda++; dcha++; } - if(etiqueta == alphabet(0, alphabet(L"")) || - etiqueta == alphabet(0, alphabet(L"")) + if(etiqueta == alphabet(0, any_tag) || + etiqueta == alphabet(0, any_char) ) { // rl compilation of a badly written rule // having an epsilon with wildcard output will produce // garbage output -- see https://github.com/apertium/apertium-separable/issues/8 - wcerr << L"Warning: Cannot insert from empty input. Ignoring. (You probably want to specify exact tags when deleting a word.)" << endl; + cerr << "Warning: Cannot insert from empty input. Ignoring. (You probably want to specify exact tags when deleting a word.)" << endl; continue; } int nuevo_estado = t.insertSingleTransduction(etiqueta, estado); - if(etiqueta == alphabet(alphabet(L""),alphabet(L"")) - || etiqueta == alphabet(alphabet(L""),alphabet(L"")) - || etiqueta == alphabet(alphabet(L""), 0) - || etiqueta == alphabet(alphabet(L""), 0) + if(etiqueta == alphabet(any_tag, any_tag) + || etiqueta == alphabet(any_char, any_char) + || etiqueta == alphabet(any_tag, 0) + || etiqueta == alphabet(any_char, 0) ) { t.linkStates(nuevo_estado, estado, 0); @@ -242,12 +242,12 @@ Compiler::matchTransduction(list const &pi, list const &pd, int estado void -Compiler::requireEmptyError(wstring const &name) +Compiler::requireEmptyError(UString const &name) { if(!xmlTextReaderIsEmptyElement(reader)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; exit(EXIT_FAILURE); } } @@ -255,139 +255,137 @@ Compiler::requireEmptyError(wstring const &name) bool Compiler::allBlanks() { - bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - - for(unsigned int i = 0, limit = text.size(); i < limit; i++) - { - flag = flag && iswspace(text[i]); - } - - return flag; + vector text; + XMLParseUtil::readValueInto32(reader, text); + for (auto& it : text) { + if (!u_isspace(it)) { + return false; + } + } + return true; } void -Compiler::readString(list &result, wstring const &name) +Compiler::readString(vector &result, UString const &name) { - if(name == L"#text") + if(name == "#text"_u) { - wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - for(unsigned int i = 0, limit = value.size(); i < limit; i++) - { - result.push_back(static_cast(value[i])); - } + XMLParseUtil::readValueInto32(reader, result); } else if(name == COMPILER_BLANK_ELEM) { requireEmptyError(name); - result.push_back(static_cast(L' ')); + result.push_back(static_cast(' ')); } else if(name == COMPILER_POSTGENERATOR_ELEM) { requireEmptyError(name); - result.push_back(static_cast(L'~')); + result.push_back(static_cast('~')); } else if(name == COMPILER_GROUP_ELEM) { int tipo=xmlTextReaderNodeType(reader); if(tipo != XML_READER_TYPE_END_ELEMENT) { - result.push_back(static_cast(L'#')); + result.push_back(static_cast('#')); } } else if(name == COMPILER_S_ELEM) { requireEmptyError(name); - wstring symbol = L"<" + attrib(COMPILER_N_ATTR) + L">"; + UString symbol; + symbol += '<'; + symbol.append(attrib(COMPILER_N_ATTR)); + symbol += '>'; if(!alphabet.isSymbolDefined(symbol)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined symbol '" << symbol << L"'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Undefined symbol '" << symbol << "'." << endl; exit(EXIT_FAILURE); } result.push_back(alphabet(symbol)); } else if(name == COMPILER_ANYTAG_ELEM) { - result.push_back(alphabet(L"")); + result.push_back(any_tag); } else if(name == COMPILER_ANYCHAR_ELEM) { - result.push_back(alphabet(L"")); + result.push_back(any_char); } else if(name == COMPILER_WB_ELEM) { requireEmptyError(name); - result.push_back(alphabet(L"<$>")); + result.push_back(word_boundary); } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid specification of element '<" << name; - wcerr << L">' in this context." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid specification of element '<" << name; + cerr << ">' in this context." << endl; exit(EXIT_FAILURE); } } void -Compiler::skipBlanks(wstring &name) +Compiler::skipBlanks(UString &name) { - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { - if(name != L"#comment") + if(name != "#comment"_u) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } } void -Compiler::skip(wstring &name, wstring const &elem) +Compiler::skip(UString &name, UString const &elem) { skip(name, elem, true); } void -Compiler::skip(wstring &name, wstring const &elem, bool open) +Compiler::skip(UString &name, UString const &elem, bool open) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - wstring slash; + name = XMLParseUtil::readName(reader); + UString slash; if(!open) { - slash = L"/"; + slash = "/"_u; } - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { - if(name != L"#comment") + if(name != "#comment"_u) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } if(name != elem) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Expected '<" << slash << elem << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Expected '<" << slash << elem << ">'." << endl; exit(EXIT_FAILURE); } } @@ -395,16 +393,16 @@ Compiler::skip(wstring &name, wstring const &elem, bool open) EntryToken Compiler::procIdentity() { - list both_sides; + vector both_sides; if(!xmlTextReaderIsEmptyElement(reader)) { - wstring name = L""; + UString name; while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_IDENTITY_ELEM) { break; @@ -413,10 +411,10 @@ Compiler::procIdentity() } } - if(verbose && first_element && (both_sides.front() == (int)L' ')) + if(verbose && first_element && (both_sides.front() == (int)' ')) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Entry begins with space." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Entry begins with space." << endl; } first_element = false; EntryToken e; @@ -427,18 +425,18 @@ Compiler::procIdentity() EntryToken Compiler::procTransduction() { - list lhs, rhs; - wstring name; + vector lhs, rhs; + UString name; skip(name, COMPILER_LEFT_ELEM); if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_LEFT_ELEM) { break; @@ -447,10 +445,10 @@ Compiler::procTransduction() } } - if(verbose && first_element && (lhs.front() == (int)L' ')) + if(verbose && first_element && (lhs.front() == (int)' ')) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Entry begins with space." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Entry begins with space." << endl; } first_element = false; @@ -458,11 +456,11 @@ Compiler::procTransduction() if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_RIGHT_ELEM) { break; @@ -479,8 +477,8 @@ Compiler::procTransduction() return e; } -wstring -Compiler::attrib(wstring const &name) +UString +Compiler::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } @@ -489,20 +487,20 @@ EntryToken Compiler::procPar() { EntryToken e; - wstring nomparadigma = attrib(COMPILER_N_ATTR); + UString nomparadigma = attrib(COMPILER_N_ATTR); first_element = false; - if(current_paradigm != L"" && nomparadigma == current_paradigm) + if(!current_paradigm.empty() && nomparadigma == current_paradigm) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Paradigm refers to itself '" << nomparadigma << L"'." < const &elements) { - if(current_paradigm != L"") - { + if(!current_paradigm.empty()) { // compilation of paradigms Transducer &t = paradigms[current_paradigm]; int e = t.getInitial(); @@ -537,8 +534,8 @@ Compiler::insertEntryTokens(vector const &elements) } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid entry token." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid entry token." << endl; exit(EXIT_FAILURE); } } @@ -608,15 +605,14 @@ Compiler::insertEntryTokens(vector const &elements) void -Compiler::requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname) +Compiler::requireAttribute(UString const &value, UString const &attrname, + UString const &elemname) { - if(value == L"") - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): '<" << elemname; - wcerr << L"' element must specify non-void '"; - wcerr << attrname << L"' attribute." << endl; + if(value.empty()) { + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): '<" << elemname; + cerr << "' element must specify non-void '"; + cerr << attrname << "' attribute." << endl; exit(EXIT_FAILURE); } } @@ -629,46 +625,46 @@ Compiler::procSection() if(tipo != XML_READER_TYPE_END_ELEMENT) { - wstring const &id = attrib(COMPILER_ID_ATTR); - wstring const &type = attrib(COMPILER_TYPE_ATTR); + UString const &id = attrib(COMPILER_ID_ATTR); + UString const &type = attrib(COMPILER_TYPE_ATTR); requireAttribute(id, COMPILER_ID_ATTR, COMPILER_SECTION_ELEM); requireAttribute(type, COMPILER_TYPE_ATTR, COMPILER_SECTION_ELEM); current_section = id; - current_section += L"@"; + current_section += '@'; current_section.append(type); } else { - current_section = L""; + current_section.clear(); } } void Compiler::procEntry() { - wstring atributo=this->attrib(COMPILER_RESTRICTION_ATTR); - wstring ignore = this->attrib(COMPILER_IGNORE_ATTR); - wstring altval = this->attrib(COMPILER_ALT_ATTR); - wstring varval = this->attrib(COMPILER_V_ATTR); - wstring varl = this->attrib(COMPILER_VL_ATTR); - wstring varr = this->attrib(COMPILER_VR_ATTR); - - //���if entry is masked by a restriction of direction or an ignore mark - if((atributo != L"" && atributo != direction) + UString atributo=this->attrib(COMPILER_RESTRICTION_ATTR); + UString ignore = this->attrib(COMPILER_IGNORE_ATTR); + UString altval = this->attrib(COMPILER_ALT_ATTR); + UString varval = this->attrib(COMPILER_V_ATTR); + UString varl = this->attrib(COMPILER_VL_ATTR); + UString varr = this->attrib(COMPILER_VR_ATTR); + + // if entry is masked by a restriction of direction or an ignore mark + if((!atributo.empty() && atributo != direction) || ignore == COMPILER_IGNORE_YES_VAL - || (altval != L"" && altval != alt) - || (direction == COMPILER_RESTRICTION_RL_VAL && varval != L"" && varval != variant) - || (direction == COMPILER_RESTRICTION_RL_VAL && varl != L"" && varl != variant_left) - || (direction == COMPILER_RESTRICTION_LR_VAL && varr != L"" && varr != variant_right)) + || (!altval.empty() && altval != alt) + || (direction == COMPILER_RESTRICTION_RL_VAL && !varval.empty() && varval != variant) + || (direction == COMPILER_RESTRICTION_RL_VAL && !varl.empty() && varl != variant_left) + || (direction == COMPILER_RESTRICTION_LR_VAL && !varr.empty() && varr != variant_right)) { // parse to the end of the entry - wstring name = L""; + UString name; while(name != COMPILER_ENTRY_ELEM) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } return; @@ -681,14 +677,14 @@ Compiler::procEntry() int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); - if(current_paradigm == L"" && verbose) + if(current_paradigm.empty() && verbose) { first_element = true; } @@ -712,12 +708,12 @@ Compiler::procEntry() // detecci���n del uso de paradigmas no definidos - wstring const &p = elements.rbegin()->paradigmName(); + UString const &p = elements.rbegin()->paradigmName(); if(paradigms.find(p) == paradigms.end()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined paradigm '" << p << L"'." < HERE */ - // list wb; - // wb.push_back(alphabet(L"<$>")); + // vector wb; + // wb.push_back(word_boundary); // EntryToken e; // e.setSingleTransduction(wb, wb); // elements.push_back(e); @@ -748,9 +744,9 @@ Compiler::procEntry() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << COMPILER_ENTRY_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << COMPILER_ENTRY_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } @@ -760,12 +756,11 @@ Compiler::procEntry() void Compiler::procNode() { - xmlChar const *xnombre = xmlTextReaderConstName(reader); - wstring nombre = XMLParseUtil::towstring(xnombre); + UString nombre = XMLParseUtil::readName(reader); - // HACER: optimizar el orden de ejecuci���n de esta ristra de "ifs" + // HACER: optimizar el orden de ejecución de esta ristra de "ifs" - if(nombre == L"#text") + if(nombre == "#text"_u) { /* ignorar */ } @@ -801,14 +796,14 @@ Compiler::procNode() { procSection(); } - else if(nombre == L"#comment") + else if(nombre == "#comment"_u) { /* ignorar */ } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << nombre << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << nombre << ">'." << endl; exit(EXIT_FAILURE); } } @@ -818,7 +813,7 @@ Compiler::procRegexp() { EntryToken et; xmlTextReaderRead(reader); - wstring re = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString re = XMLParseUtil::readValue(reader); et.setRegexp(re); xmlTextReaderRead(reader); return et; @@ -828,7 +823,7 @@ void Compiler::write(FILE *output) { // letters - Compression::wstring_write(letters, output); + Compression::string_write(letters, output); // symbols alphabet.write(output); @@ -836,16 +831,11 @@ Compiler::write(FILE *output) // transducers Compression::multibyte_write(sections.size(), output); - int conta=0; - for(map::iterator it = sections.begin(), - limit = sections.end(); - it != limit; it++) - { - conta++; - wcout << it->first << " " << it->second.size(); - wcout << " " << it->second.numberOfTransitions() << endl; - Compression::wstring_write(it->first, output); - it->second.write(output); + for (auto& it : sections) { + cout << it.first << " " << it.second.size(); + cout << " " << it.second.numberOfTransitions() << endl; + Compression::string_write(it.first, output); + it.second.write(output); } } diff --git a/src/lsx_compiler.h b/src/lsx_compiler.h index 971c6fb..10a5aa2 100644 --- a/src/lsx_compiler.h +++ b/src/lsx_compiler.h @@ -20,12 +20,12 @@ #include #include #include -#include #include #include #include #include +#include #include using namespace std; @@ -44,43 +44,43 @@ private: /** * The alt value */ - wstring alt; + UString alt; /** * The variant value (monodix) */ - wstring variant; + UString variant; /** * The variant value (left side of bidix) */ - wstring variant_left; + UString variant_left; /** * The variant value (right side of bidix) */ - wstring variant_right; + UString variant_right; /** * The paradigm being compiled */ - wstring current_paradigm; + UString current_paradigm; /** * The dictionary section being compiled */ - wstring current_section; + UString current_section; /** * The direction of the compilation, 'lr' (left-to-right) or 'rl' * (right-to-left) */ - wstring direction; + UString direction; /** * List of characters to be considered alphabetic */ - wstring letters; + UString letters; /** * Set verbose mode: warnings which may or may not be correct @@ -97,30 +97,37 @@ private: */ Alphabet alphabet; + /** + * Special symbols + */ + int32_t any_tag; + int32_t any_char; + int32_t word_boundary; + /** * List of named transducers-paradigms */ - map paradigms; + map paradigms; /** * List of named dictionary sections */ - map sections; + map sections; /** * List of named prefix copy of a paradigm */ - map, Ltstr> prefix_paradigms; + map> prefix_paradigms; /** * List of named suffix copy of a paradigm */ - map, Ltstr> suffix_paradigms; + map> suffix_paradigms; /** * List of named endings of a suffix copy of a paradgim */ - map, Ltstr> postsuffix_paradigms; + map> postsuffix_paradigms; /* @@ -175,7 +182,7 @@ private: * @param name the name of the attribute * @return the value of the attribute */ - wstring attrib(wstring const &name); + UString attrib(UString const &name); /** * Construct symbol pairs by align left side of both parts and insert @@ -186,7 +193,7 @@ private: * @param t the transducer * @return the last state of the inserted transduction */ - int matchTransduction(list const &lp, list const &rp, + int matchTransduction(vector const &lp, vector const &rp, int state, Transducer &t); /** * Parse the <p< element @@ -217,7 +224,7 @@ private: * @param name the name of the node * @param elem the name of the expected node */ - void skip(wstring &name, wstring const &elem); + void skip(UString &name, UString const &elem); /** * Skip all document #text nodes before "elem" @@ -225,22 +232,22 @@ private: * @param elem the name of the expected node * @param open true for open element, false for closed */ - void skip(wstring &name, wstring const &elem, bool open); + void skip(UString &name, UString const &elem, bool open); /** * Skip all blank #text nodes before "name" * @param name the name of the node */ - void skipBlanks(wstring &name); + void skipBlanks(UString &name); - void readString(list &result, wstring const &name); + void readString(vector &result, UString const &name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(wstring const &name); + void requireEmptyError(UString const &name); /** * Force an attribute to be specified, amd check for it @@ -249,8 +256,8 @@ private: * @param elemname the parent of the attribute */ - void requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname); + void requireAttribute(UString const &value, UString const &attrname, + UString const &elemname); /** * True if all the elements in the current node are blanks * @return true if all are blanks @@ -263,42 +270,42 @@ public: * Constants to represent the element and the attributes of * dictionaries */ - static wstring const COMPILER_DICTIONARY_ELEM; - static wstring const COMPILER_ALPHABET_ELEM; - static wstring const COMPILER_SDEFS_ELEM; - static wstring const COMPILER_SDEF_ELEM; - static wstring const COMPILER_N_ATTR; - static wstring const COMPILER_PARDEFS_ELEM; - static wstring const COMPILER_PARDEF_ELEM; - static wstring const COMPILER_PAR_ELEM; - static wstring const COMPILER_ENTRY_ELEM; - static wstring const COMPILER_RESTRICTION_ATTR; - static wstring const COMPILER_RESTRICTION_LR_VAL; - static wstring const COMPILER_RESTRICTION_RL_VAL; - static wstring const COMPILER_PAIR_ELEM; - static wstring const COMPILER_LEFT_ELEM; - static wstring const COMPILER_RIGHT_ELEM; - static wstring const COMPILER_S_ELEM; - static wstring const COMPILER_REGEXP_ELEM; - static wstring const COMPILER_SECTION_ELEM; - static wstring const COMPILER_ID_ATTR; - static wstring const COMPILER_TYPE_ATTR; - static wstring const COMPILER_IDENTITY_ELEM; - static wstring const COMPILER_JOIN_ELEM; - static wstring const COMPILER_BLANK_ELEM; - static wstring const COMPILER_POSTGENERATOR_ELEM; - static wstring const COMPILER_GROUP_ELEM; - static wstring const COMPILER_LEMMA_ATTR; - static wstring const COMPILER_IGNORE_ATTR; - static wstring const COMPILER_IGNORE_YES_VAL; - static wstring const COMPILER_ALT_ATTR; - static wstring const COMPILER_V_ATTR; - static wstring const COMPILER_VL_ATTR; - static wstring const COMPILER_VR_ATTR; - - static wstring const COMPILER_ANYTAG_ELEM; - static wstring const COMPILER_ANYCHAR_ELEM; - static wstring const COMPILER_WB_ELEM; + static UString const COMPILER_DICTIONARY_ELEM; + static UString const COMPILER_ALPHABET_ELEM; + static UString const COMPILER_SDEFS_ELEM; + static UString const COMPILER_SDEF_ELEM; + static UString const COMPILER_N_ATTR; + static UString const COMPILER_PARDEFS_ELEM; + static UString const COMPILER_PARDEF_ELEM; + static UString const COMPILER_PAR_ELEM; + static UString const COMPILER_ENTRY_ELEM; + static UString const COMPILER_RESTRICTION_ATTR; + static UString const COMPILER_RESTRICTION_LR_VAL; + static UString const COMPILER_RESTRICTION_RL_VAL; + static UString const COMPILER_PAIR_ELEM; + static UString const COMPILER_LEFT_ELEM; + static UString const COMPILER_RIGHT_ELEM; + static UString const COMPILER_S_ELEM; + static UString const COMPILER_REGEXP_ELEM; + static UString const COMPILER_SECTION_ELEM; + static UString const COMPILER_ID_ATTR; + static UString const COMPILER_TYPE_ATTR; + static UString const COMPILER_IDENTITY_ELEM; + static UString const COMPILER_JOIN_ELEM; + static UString const COMPILER_BLANK_ELEM; + static UString const COMPILER_POSTGENERATOR_ELEM; + static UString const COMPILER_GROUP_ELEM; + static UString const COMPILER_LEMMA_ATTR; + static UString const COMPILER_IGNORE_ATTR; + static UString const COMPILER_IGNORE_YES_VAL; + static UString const COMPILER_ALT_ATTR; + static UString const COMPILER_V_ATTR; + static UString const COMPILER_VL_ATTR; + static UString const COMPILER_VR_ATTR; + + static UString const COMPILER_ANYTAG_ELEM; + static UString const COMPILER_ANYCHAR_ELEM; + static UString const COMPILER_WB_ELEM; /** @@ -316,7 +323,7 @@ public: * @param fichero file * @param dir direction */ - void parse(string const &fichero, wstring const &dir); + void parse(string const &fichero, UString const &dir); // auto getAlt(); // auto getInt(); diff --git a/src/lsx_proc.cc b/src/lsx_proc.cc index fbaedba..eaf327c 100644 --- a/src/lsx_proc.cc +++ b/src/lsx_proc.cc @@ -1,6 +1,7 @@ #include #include #include +#include #include "lsx_processor.h" @@ -27,8 +28,8 @@ int main (int argc, char** argv) LtLocale::tryToSetLocale(); LSXProcessor fstp; - FILE* input = stdin; - FILE* output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); #if HAVE_GETOPT_LONG static struct option long_options[]= @@ -71,22 +72,18 @@ int main (int argc, char** argv) } FILE* fst = fopen(argv[optind], "rb"); if(!fst) { - wcerr << "Error: Cannot open file '" << argv[optind] << "' for reading." << endl; + cerr << "Error: Cannot open file '" << argv[optind] << "' for reading." << endl; exit(EXIT_FAILURE); } fstp.load(fst); if (optind <= (argc - 2)) { - input = fopen(argv[optind+1], "rb"); - if (input == NULL || ferror(input)) { - wcerr << "Error: Cannot open file '" << argv[optind+1] << "' for reading." << endl; - exit(EXIT_FAILURE); - } + input.open_or_exit(argv[optind+1]); } if (optind <= (argc - 3)) { - output = fopen(argv[optind+2], "wb"); - if (output == NULL || ferror(output)) { - wcerr << "Error: Cannot open file '" << argv[optind+2] << "' for writing." << endl; + output = u_fopen(argv[optind+2], "w", NULL, NULL); + if (output == NULL) { + cerr << "Error: Cannot open file '" << argv[optind+2] << "' for writing." << endl; } } diff --git a/src/lsx_processor.cc b/src/lsx_processor.cc index 7f68dc5..043cf2c 100644 --- a/src/lsx_processor.cc +++ b/src/lsx_processor.cc @@ -1,20 +1,21 @@ #include "lsx_processor.h" #include +#include LSXProcessor::LSXProcessor() { - escaped_chars.insert(L'['); - escaped_chars.insert(L']'); - escaped_chars.insert(L'{'); - escaped_chars.insert(L'}'); - escaped_chars.insert(L'^'); - escaped_chars.insert(L'$'); - escaped_chars.insert(L'/'); - escaped_chars.insert(L'\\'); - escaped_chars.insert(L'@'); - escaped_chars.insert(L'<'); - escaped_chars.insert(L'>'); + escaped_chars.insert('['); + escaped_chars.insert(']'); + escaped_chars.insert('{'); + escaped_chars.insert('}'); + escaped_chars.insert('^'); + escaped_chars.insert('$'); + escaped_chars.insert('/'); + escaped_chars.insert('\\'); + escaped_chars.insert('@'); + escaped_chars.insert('<'); + escaped_chars.insert('>'); null_flush = false; dictionary_case = false; @@ -52,12 +53,12 @@ LSXProcessor::load(FILE *input) // symbols alphabet.read(input); - word_boundary = alphabet(L"<$>"); - any_char = alphabet(L""); - any_tag = alphabet(L""); + word_boundary = alphabet("<$>"_u); + any_char = alphabet(""_u); + any_tag = alphabet(""_u); len = Compression::multibyte_read(input); - Compression::wstring_read(input); // name + Compression::string_read(input); // name // there should only be 1 transducer in the file // so ignore any subsequent ones trans.read(input, alphabet); @@ -67,65 +68,65 @@ LSXProcessor::load(FILE *input) } void -LSXProcessor::readNextLU(FILE* input) +LSXProcessor::readNextLU(InputFile& input) { - vector parts = vector(3); + vector parts = vector(3); int loc = 0; // 0 = blank, 1 = bound blank, 2 = LU bool box = false; // are we in a [ ] blank - while(!feof(input)) + while(!input.eof()) { - wchar_t c = fgetwc_unlocked(input); - if ((unsigned int)c == WEOF) { + UChar32 c = input.get(); + if ((unsigned int)c == U_EOF) { break; } - if(null_flush && c == L'\0') + if(null_flush && c == '\0') { at_end = true; at_null = true; break; } - else if(c == L'\\') + else if(c == '\\') { parts[loc] += c; - c = fgetwc_unlocked(input); + c = input.get(); parts[loc] += c; } else if(loc == 0 && box) { - if(c == L']') + if(c == ']') { box = false; } parts[loc] += c; } - else if(loc == 0 && c == L'[') + else if(loc == 0 && c == '[') { - c = fgetwc_unlocked(input); - if(c == L'[') + c = input.get(); + if(c == '[') { loc = 1; } else { - parts[loc] += L'['; + parts[loc] += '['; parts[loc] += c; - if(c != L']') + if(c != ']') { box = true; } - if(c == L'\\') + if(c == '\\') { - parts[loc] += fgetwc_unlocked(input); + parts[loc] += input.get(); } } } - else if(loc == 1 && c == L']') + else if(loc == 1 && c == ']') { - c = fgetwc_unlocked(input); - if(c == L']') + c = input.get(); + if(c == ']') { - c = fgetwc_unlocked(input); - if(c == L'^') + c = input.get(); + if(c == '^') { loc = 2; } @@ -134,25 +135,25 @@ LSXProcessor::readNextLU(FILE* input) // this situation is invalid // but I like making parsers harder to break than required // by the standard - parts[loc] += L"]]"; + parts[loc] += "]]"_u; parts[loc] += c; } } else { - parts[loc] += L']'; + parts[loc] += ']'; parts[loc] += c; - if(c == L'\\') + if(c == '\\') { - parts[loc] += fgetwc_unlocked(input); + parts[loc] += input.get(); } } } - else if(loc == 0 && c == L'^') + else if(loc == 0 && c == '^') { loc = 2; } - else if(loc == 2 && c == L'$') + else if(loc == 2 && c == '$') { break; } @@ -161,7 +162,7 @@ LSXProcessor::readNextLU(FILE* input) parts[loc] += c; } } - if(feof(input)) + if(input.eof()) { at_end = true; } @@ -171,7 +172,7 @@ LSXProcessor::readNextLU(FILE* input) } void -LSXProcessor::processWord(FILE* input, FILE* output) +LSXProcessor::processWord(InputFile& input, UFILE* output) { if(lu_queue.size() == 0) { @@ -180,14 +181,14 @@ LSXProcessor::processWord(FILE* input, FILE* output) if(at_end && lu_queue.size() == 1 && lu_queue.back().size() == 0) { // we're at the final blank, no more work to do - fputws_unlocked(blank_queue.back().c_str(), output); + write(blank_queue.back(), output); blank_queue.pop_front(); bound_blank_queue.pop_front(); lu_queue.pop_front(); return; } size_t last_final = 0; - wstring last_final_out; + UString last_final_out; State s; s.init(trans.getInitial()); size_t idx = 0; @@ -203,7 +204,7 @@ LSXProcessor::processWord(FILE* input, FILE* output) } readNextLU(input); } - wstring lu = lu_queue[idx]; + UString lu = lu_queue[idx]; if(lu.size() == 0) { break; @@ -214,22 +215,22 @@ LSXProcessor::processWord(FILE* input, FILE* output) } for(size_t i = 0; i < lu.size(); i++) { - if(lu[i] == L'<') + if(lu[i] == '<') { size_t j = i+1; for(; j < lu.size(); j++) { - if(lu[j] == L'\\') + if(lu[j] == '\\') { j++; } - else if(lu[j] == L'>') + else if(lu[j] == '>') { j++; break; } } - wstring tag = lu.substr(i, j-i); + UString tag = lu.substr(i, j-i); i = j-1; if(!alphabet.isSymbolDefined(tag)) { @@ -239,7 +240,7 @@ LSXProcessor::processWord(FILE* input, FILE* output) } else { - if(lu[i] == L'\\') + if(lu[i] == '\\') { i++; } @@ -258,28 +259,24 @@ LSXProcessor::processWord(FILE* input, FILE* output) } if(last_final == 0) { - fputws_unlocked(blank_queue.front().c_str(), output); + write(blank_queue.front(), output); blank_queue.pop_front(); - if(bound_blank_queue.front().size() > 0) + if(!bound_blank_queue.front().empty()) { - fputws_unlocked(L"[[", output); - fputws_unlocked(bound_blank_queue.front().c_str(), output); - fputws_unlocked(L"]]", output); + u_fprintf(output, "[[%S]]", bound_blank_queue.front().c_str()); } bound_blank_queue.pop_front(); - fputwc_unlocked(L'^', output); - fputws_unlocked(lu_queue.front().c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "^%S$", lu_queue.front().c_str()); lu_queue.pop_front(); return; } - vector out_lus; + vector out_lus; size_t pos = 0; - while(pos != wstring::npos && pos != last_final_out.size()) + while(pos != UString::npos && pos != last_final_out.size()) { size_t start = pos; - pos = last_final_out.find(L"<$>", start); - if(pos == wstring::npos) + pos = last_final_out.find("<$>"_u, start); + if(pos == UString::npos) { out_lus.push_back(last_final_out.substr(start)); } @@ -290,26 +287,26 @@ LSXProcessor::processWord(FILE* input, FILE* output) } } - wstring wblank; + UString wblank; for(size_t i = 0; i < last_final; i++) { if(!bound_blank_queue[i].empty()) { if(wblank.empty()) { - wblank += L"[["; + wblank += "[["_u; } else { - wblank += L"; "; + wblank += "; "_u; } - wblank += bound_blank_queue[i].c_str(); + wblank += bound_blank_queue[i]; } } if(!wblank.empty()) { - wblank += L"]]"; + wblank += "]]"_u; } size_t i = 0; @@ -317,22 +314,22 @@ LSXProcessor::processWord(FILE* input, FILE* output) { if(i < last_final) { - fputws_unlocked(blank_queue[i].c_str(), output); + write(blank_queue[i], output); } else { - fputwc_unlocked(L' ', output); + u_fputc(' ', output); } - fputws_unlocked(wblank.c_str(), output); - fputwc_unlocked(L'^', output); - fputws_unlocked(out_lus[i].c_str(), output); - fputwc_unlocked(L'$', output); + write(wblank, output); + u_fputc('^', output); + write(out_lus[i], output); + u_fputc('$', output); } for(; i < last_final; i++) { - if(blank_queue[i] != L" ") + if(blank_queue[i] != " "_u) { - fputws_unlocked(blank_queue[i].c_str(), output); + write(blank_queue[i], output); } } blank_queue.erase(blank_queue.begin(), blank_queue.begin()+last_final); @@ -341,7 +338,7 @@ LSXProcessor::processWord(FILE* input, FILE* output) } void -LSXProcessor::process(FILE* input, FILE* output) +LSXProcessor::process(InputFile& input, UFILE* output) { while(true) { @@ -351,12 +348,8 @@ LSXProcessor::process(FILE* input, FILE* output) } if(at_null) { - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); at_end = false; at_null = false; } diff --git a/src/lsx_processor.h b/src/lsx_processor.h index 90d5a47..264a5dd 100644 --- a/src/lsx_processor.h +++ b/src/lsx_processor.h @@ -2,10 +2,11 @@ #define _LSX_PROCESSOR_H_ #include -#include +#include #include #include #include +#include #include class LSXProcessor @@ -13,8 +14,8 @@ class LSXProcessor private: TransExe trans; State initial_state; - set escaped_chars; - set alphabetic_chars; + set escaped_chars; + set alphabetic_chars; map all_finals; Alphabet alphabet; bool null_flush; @@ -22,12 +23,12 @@ private: bool at_end; bool at_null; - deque blank_queue; - deque bound_blank_queue; - deque lu_queue; + deque blank_queue; + deque bound_blank_queue; + deque lu_queue; - void readNextLU(FILE* input); - void processWord(FILE* input, FILE* output); + void readNextLU(InputFile& input); + void processWord(InputFile& input, UFILE* output); int word_boundary; int any_char; @@ -35,7 +36,7 @@ private: public: LSXProcessor(); void load(FILE* input); - void process(FILE* input, FILE* output); + void process(InputFile& input, UFILE* output); void setNullFlush(bool val) { null_flush = val;