commit 8d3d8f2211b910450f45d628d642884a24b192d9 Author: Daniel Swanson Date: Wed Jun 30 08:53:05 2021 -0500 use ICU (#40) ICU - convert all `std::wstring`s and related types to `UString` - use `lttoolbox/input_file.h` for reading UTF-8 input with nulls - use `UFILE*` for output efficiency, cleanliness and code style - move constant initializers to header file - store references to special transducer symbols to save `alphabet` lookups - use range-for loops when possible - prefer `std::vector` to `std::list` - prefer `str.empty()` to `str == ""` - drop old, unused file in `src/` helper functions - `XMLParseUtil` has more specialized functions now - `XMLParseUtil` is now in lttoolbox, so drop apertium dependency diff --git a/.gitignore b/.gitignore index 94e8d70..d16ffc9 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ stamp-h1 src/*.o src/lsx-comp src/lsx-proc +*.pyc diff --git a/configure.ac b/configure.ac index 5b60c37..b6b36b2 100644 --- a/configure.ac +++ b/configure.ac @@ -1,10 +1,9 @@ AC_PREREQ(2.61) m4_define([required_libxml_version], [2.6.17]) -m4_define([required_apertium_version], [3.7.0]) -m4_define([required_lttoolbox_version], [3.5.3]) +m4_define([required_lttoolbox_version], [3.6.0]) -AC_INIT([apertium-separable], [0.3.6], [apertium-stuff@lists.sourceforge.net]) +AC_INIT([apertium-separable], [0.4.0], [apertium-stuff@lists.sourceforge.net]) AM_INIT_AUTOMAKE AC_CONFIG_MACRO_DIR([m4]) @@ -28,23 +27,23 @@ PKG_CHECK_MODULES([LTTOOLBOX], [lttoolbox >= required_lttoolbox_version]) AC_SUBST(LTTOOLBOX_CFLAGS) AC_SUBST(LTTOOLBOX_LIBS) -PKG_CHECK_MODULES([APERTIUM], [apertium >= required_apertium_version]) - -AC_SUBST(APERTIUM_CFLAGS) -AC_SUBST(APERTIUM_LIBS) - PKG_CHECK_MODULES([LIBXML], [libxml-2.0 >= required_libxml_version]) AC_SUBST(LIBXML_CFLAGS) AC_SUBST(LIBXML_LIBS) +PKG_CHECK_MODULES([ICU], [icu-i18n, icu-io, icu-uc]) + +AC_SUBST(ICU_CFLAGS) +AC_SUBST(ICU_LIBS) + # Checks for libraries. AC_CHECK_LIB(xml2, xmlReaderForFile) -AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked]) +AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) -CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS" -LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS" +CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS" +LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS $ICU_LIBS" # Checks for highest supported C++ standard AC_LANG(C++) diff --git a/src/lsx_comp.cc b/src/lsx_comp.cc index 4cbfff4..84a9905 100644 --- a/src/lsx_comp.cc +++ b/src/lsx_comp.cc @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -29,7 +30,7 @@ int main (int argc, char** argv) Compiler c; - wstring dir; + UString dir; if(strcmp(argv[1], "lr") == 0) { @@ -54,7 +55,7 @@ int main (int argc, char** argv) FILE* fst = fopen(argv[argc-1], "w+"); if(!fst) { - wcerr << "Error: Cannot open file '" << fst << "'." << endl; + cerr << "Error: Cannot open file '" << fst << "'." << endl; exit(EXIT_FAILURE); } c.write(fst); diff --git a/src/lsx_compiler.cc b/src/lsx_compiler.cc index 3ecb3f1..73dd340 100644 --- a/src/lsx_compiler.cc +++ b/src/lsx_compiler.cc @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -29,35 +29,27 @@ using namespace std; // Removed static globals copied from lttoolbox's compiler.cc. Same namespace, same mangling, bad result. -wstring const Compiler::COMPILER_ANYTAG_ELEM = L"t"; -wstring const Compiler::COMPILER_ANYCHAR_ELEM = L"w"; -wstring const Compiler::COMPILER_WB_ELEM = L"j"; - -Compiler::Compiler() : -reader(0), -verbose(false), -first_element(false) -{ -} - -Compiler::~Compiler() -{ -} +UString const Compiler::COMPILER_ANYTAG_ELEM = "t"_u; +UString const Compiler::COMPILER_ANYCHAR_ELEM = "w"_u; +UString const Compiler::COMPILER_WB_ELEM = "j"_u; void -Compiler::parse(string const &fichero, wstring const &dir) +Compiler::parse(string const &fichero, UString const &dir) { direction = dir; reader = xmlReaderForFile(fichero.c_str(), NULL, 0); if(reader == NULL) { - wcerr << "Error: Cannot open '" << fichero.c_str() << "'." << endl; + cerr << "Error: Cannot open '" << fichero.c_str() << "'." << endl; exit(EXIT_FAILURE); } - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L"<$>"); + alphabet.includeSymbol(Transducer::ANY_TAG_SYMBOL); + alphabet.includeSymbol(Transducer::ANY_CHAR_SYMBOL); + alphabet.includeSymbol(Transducer::LSX_BOUNDARY_SYMBOL); + any_tag = alphabet(Transducer::ANY_TAG_SYMBOL); + any_char = alphabet(Transducer::ANY_CHAR_SYMBOL); + word_boundary = alphabet(Transducer::LSX_BOUNDARY_SYMBOL); int ret = xmlTextReaderRead(reader); while(ret == 1) @@ -69,7 +61,7 @@ Compiler::parse(string const &fichero, wstring const &dir) if(ret != 0) { - wcerr << L"Error: Parse error at the end of input." << endl; + cerr << "Error: Parse error at the end of input." << endl; } xmlFreeTextReader(reader); @@ -77,19 +69,16 @@ Compiler::parse(string const &fichero, wstring const &dir) // Minimize transducers and ensure that all paths end with <$> - int end_trans = alphabet(alphabet(L"<$>"), alphabet(L"<$>")); - for(map::iterator it = sections.begin(), - limit = sections.end(); - it != limit; it++) - { - (it->second).minimize(); + int end_trans = alphabet(word_boundary, word_boundary); + for (auto& it : sections) { + it.second.minimize(); // any paths which did not already end with <$> now will // having 2 finals isn't a problem because -separable only checks // for finals when it reads $, and you can't have 2 of those in a row - for(auto fin : (it->second).getFinals()) + for(auto fin : it.second.getFinals()) { - int end_state = (it->second).insertSingleTransduction(end_trans, fin.first); - (it->second).setFinal(end_state); + int end_state = it.second.insertSingleTransduction(end_trans, fin.first); + it.second.setFinal(end_state); } } } @@ -105,8 +94,7 @@ Compiler::procAlphabet() int ret = xmlTextReaderRead(reader); if(ret == 1) { - xmlChar const *valor = xmlTextReaderConstValue(reader); - letters = XMLParseUtil::towstring(valor); + UString letters = XMLParseUtil::readValue(reader); bool espai = true; for(unsigned int i = 0; i < letters.length(); i++) { @@ -118,13 +106,13 @@ Compiler::procAlphabet() } if(espai == true) // libxml2 returns '\n' for , should be empty { - letters = L""; + letters.clear(); } } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Missing alphabet symbols." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Missing alphabet symbols." << endl; exit(EXIT_FAILURE); } } @@ -133,7 +121,11 @@ Compiler::procAlphabet() void Compiler::procSDef() { - alphabet.includeSymbol(L"<"+attrib(COMPILER_N_ATTR)+L">"); + UString s; + s += '<'; + s.append(attrib(COMPILER_N_ATTR)); + s += '>'; + alphabet.includeSymbol(s); } void @@ -151,15 +143,15 @@ Compiler::procParDef() { paradigms[current_paradigm].minimize(); paradigms[current_paradigm].joinFinals(); - current_paradigm = L""; + current_paradigm.clear(); } } } int -Compiler::matchTransduction(list const &pi, list const &pd, int estado, Transducer &t) +Compiler::matchTransduction(vector const &pi, vector const &pd, int estado, Transducer &t) { - list::const_iterator izqda, dcha, limizqda, limdcha; + vector::const_iterator izqda, dcha, limizqda, limdcha; if(direction == COMPILER_RESTRICTION_LR_VAL) { @@ -183,8 +175,6 @@ Compiler::matchTransduction(list const &pi, list const &pd, int estado } else { - int rsymbol = 0; - while(true) { int etiqueta; @@ -202,33 +192,31 @@ Compiler::matchTransduction(list const &pi, list const &pd, int estado else if(dcha == limdcha) { etiqueta = alphabet(*izqda, 0); - rsymbol = 0; izqda++; } else { etiqueta = alphabet(*izqda, *dcha); - rsymbol = *dcha; izqda++; dcha++; } - if(etiqueta == alphabet(0, alphabet(L"")) || - etiqueta == alphabet(0, alphabet(L"")) + if(etiqueta == alphabet(0, any_tag) || + etiqueta == alphabet(0, any_char) ) { // rl compilation of a badly written rule // having an epsilon with wildcard output will produce // garbage output -- see https://github.com/apertium/apertium-separable/issues/8 - wcerr << L"Warning: Cannot insert from empty input. Ignoring. (You probably want to specify exact tags when deleting a word.)" << endl; + cerr << "Warning: Cannot insert from empty input. Ignoring. (You probably want to specify exact tags when deleting a word.)" << endl; continue; } int nuevo_estado = t.insertSingleTransduction(etiqueta, estado); - if(etiqueta == alphabet(alphabet(L""),alphabet(L"")) - || etiqueta == alphabet(alphabet(L""),alphabet(L"")) - || etiqueta == alphabet(alphabet(L""), 0) - || etiqueta == alphabet(alphabet(L""), 0) + if(etiqueta == alphabet(any_tag, any_tag) + || etiqueta == alphabet(any_char, any_char) + || etiqueta == alphabet(any_tag, 0) + || etiqueta == alphabet(any_char, 0) ) { t.linkStates(nuevo_estado, nuevo_estado, etiqueta); @@ -242,12 +230,12 @@ Compiler::matchTransduction(list const &pi, list const &pd, int estado void -Compiler::requireEmptyError(wstring const &name) +Compiler::requireEmptyError(UString const &name) { if(!xmlTextReaderIsEmptyElement(reader)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; exit(EXIT_FAILURE); } } @@ -255,139 +243,137 @@ Compiler::requireEmptyError(wstring const &name) bool Compiler::allBlanks() { - bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - - for(unsigned int i = 0, limit = text.size(); i < limit; i++) - { - flag = flag && iswspace(text[i]); - } - - return flag; + vector text; + XMLParseUtil::readValueInto32(reader, text); + for (auto& it : text) { + if (!u_isspace(it)) { + return false; + } + } + return true; } void -Compiler::readString(list &result, wstring const &name) +Compiler::readString(vector &result, UString const &name) { - if(name == L"#text") + if(name == "#text"_u) { - wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - for(unsigned int i = 0, limit = value.size(); i < limit; i++) - { - result.push_back(static_cast(value[i])); - } + XMLParseUtil::readValueInto32(reader, result); } else if(name == COMPILER_BLANK_ELEM) { requireEmptyError(name); - result.push_back(static_cast(L' ')); + result.push_back(static_cast(' ')); } else if(name == COMPILER_POSTGENERATOR_ELEM) { requireEmptyError(name); - result.push_back(static_cast(L'~')); + result.push_back(static_cast('~')); } else if(name == COMPILER_GROUP_ELEM) { int tipo=xmlTextReaderNodeType(reader); if(tipo != XML_READER_TYPE_END_ELEMENT) { - result.push_back(static_cast(L'#')); + result.push_back(static_cast('#')); } } else if(name == COMPILER_S_ELEM) { requireEmptyError(name); - wstring symbol = L"<" + attrib(COMPILER_N_ATTR) + L">"; + UString symbol; + symbol += '<'; + symbol.append(attrib(COMPILER_N_ATTR)); + symbol += '>'; if(!alphabet.isSymbolDefined(symbol)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined symbol '" << symbol << L"'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Undefined symbol '" << symbol << "'." << endl; exit(EXIT_FAILURE); } result.push_back(alphabet(symbol)); } else if(name == COMPILER_ANYTAG_ELEM) { - result.push_back(alphabet(L"")); + result.push_back(any_tag); } else if(name == COMPILER_ANYCHAR_ELEM) { - result.push_back(alphabet(L"")); + result.push_back(any_char); } else if(name == COMPILER_WB_ELEM) { requireEmptyError(name); - result.push_back(alphabet(L"<$>")); + result.push_back(word_boundary); } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid specification of element '<" << name; - wcerr << L">' in this context." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid specification of element '<" << name; + cerr << ">' in this context." << endl; exit(EXIT_FAILURE); } } void -Compiler::skipBlanks(wstring &name) +Compiler::skipBlanks(UString &name) { - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { - if(name != L"#comment") + if(name != "#comment"_u) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } } void -Compiler::skip(wstring &name, wstring const &elem) +Compiler::skip(UString &name, UString const &elem) { skip(name, elem, true); } void -Compiler::skip(wstring &name, wstring const &elem, bool open) +Compiler::skip(UString &name, UString const &elem, bool open) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - wstring slash; + name = XMLParseUtil::readName(reader); + UString slash; if(!open) { - slash = L"/"; + slash = "/"_u; } - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { - if(name != L"#comment") + if(name != "#comment"_u) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } if(name != elem) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Expected '<" << slash << elem << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Expected '<" << slash << elem << ">'." << endl; exit(EXIT_FAILURE); } } @@ -395,16 +381,16 @@ Compiler::skip(wstring &name, wstring const &elem, bool open) EntryToken Compiler::procIdentity() { - list both_sides; + vector both_sides; if(!xmlTextReaderIsEmptyElement(reader)) { - wstring name = L""; + UString name; while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_IDENTITY_ELEM) { break; @@ -413,10 +399,10 @@ Compiler::procIdentity() } } - if(verbose && first_element && (both_sides.front() == (int)L' ')) + if(verbose && first_element && (both_sides.front() == (int)' ')) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Entry begins with space." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Entry begins with space." << endl; } first_element = false; EntryToken e; @@ -427,18 +413,18 @@ Compiler::procIdentity() EntryToken Compiler::procTransduction() { - list lhs, rhs; - wstring name; + vector lhs, rhs; + UString name; skip(name, COMPILER_LEFT_ELEM); if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_LEFT_ELEM) { break; @@ -447,10 +433,10 @@ Compiler::procTransduction() } } - if(verbose && first_element && (lhs.front() == (int)L' ')) + if(verbose && first_element && (lhs.front() == (int)' ')) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Entry begins with space." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Entry begins with space." << endl; } first_element = false; @@ -458,11 +444,11 @@ Compiler::procTransduction() if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_RIGHT_ELEM) { break; @@ -479,8 +465,8 @@ Compiler::procTransduction() return e; } -wstring -Compiler::attrib(wstring const &name) +UString +Compiler::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } @@ -489,20 +475,20 @@ EntryToken Compiler::procPar() { EntryToken e; - wstring nomparadigma = attrib(COMPILER_N_ATTR); + UString nomparadigma = attrib(COMPILER_N_ATTR); first_element = false; - if(current_paradigm != L"" && nomparadigma == current_paradigm) + if(!current_paradigm.empty() && nomparadigma == current_paradigm) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Paradigm refers to itself '" << nomparadigma << L"'." < const &elements) { - if(current_paradigm != L"") - { + if(!current_paradigm.empty()) { // compilation of paradigms Transducer &t = paradigms[current_paradigm]; int e = t.getInitial(); @@ -537,8 +522,8 @@ Compiler::insertEntryTokens(vector const &elements) } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid entry token." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid entry token." << endl; exit(EXIT_FAILURE); } } @@ -608,15 +593,14 @@ Compiler::insertEntryTokens(vector const &elements) void -Compiler::requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname) +Compiler::requireAttribute(UString const &value, UString const &attrname, + UString const &elemname) { - if(value == L"") - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): '<" << elemname; - wcerr << L"' element must specify non-void '"; - wcerr << attrname << L"' attribute." << endl; + if(value.empty()) { + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): '<" << elemname; + cerr << "' element must specify non-void '"; + cerr << attrname << "' attribute." << endl; exit(EXIT_FAILURE); } } @@ -629,46 +613,46 @@ Compiler::procSection() if(tipo != XML_READER_TYPE_END_ELEMENT) { - wstring const &id = attrib(COMPILER_ID_ATTR); - wstring const &type = attrib(COMPILER_TYPE_ATTR); + UString const &id = attrib(COMPILER_ID_ATTR); + UString const &type = attrib(COMPILER_TYPE_ATTR); requireAttribute(id, COMPILER_ID_ATTR, COMPILER_SECTION_ELEM); requireAttribute(type, COMPILER_TYPE_ATTR, COMPILER_SECTION_ELEM); current_section = id; - current_section += L"@"; + current_section += '@'; current_section.append(type); } else { - current_section = L""; + current_section.clear(); } } void Compiler::procEntry() { - wstring atributo=this->attrib(COMPILER_RESTRICTION_ATTR); - wstring ignore = this->attrib(COMPILER_IGNORE_ATTR); - wstring altval = this->attrib(COMPILER_ALT_ATTR); - wstring varval = this->attrib(COMPILER_V_ATTR); - wstring varl = this->attrib(COMPILER_VL_ATTR); - wstring varr = this->attrib(COMPILER_VR_ATTR); - - //���if entry is masked by a restriction of direction or an ignore mark - if((atributo != L"" && atributo != direction) + UString atributo=this->attrib(COMPILER_RESTRICTION_ATTR); + UString ignore = this->attrib(COMPILER_IGNORE_ATTR); + UString altval = this->attrib(COMPILER_ALT_ATTR); + UString varval = this->attrib(COMPILER_V_ATTR); + UString varl = this->attrib(COMPILER_VL_ATTR); + UString varr = this->attrib(COMPILER_VR_ATTR); + + // if entry is masked by a restriction of direction or an ignore mark + if((!atributo.empty() && atributo != direction) || ignore == COMPILER_IGNORE_YES_VAL - || (altval != L"" && altval != alt) - || (direction == COMPILER_RESTRICTION_RL_VAL && varval != L"" && varval != variant) - || (direction == COMPILER_RESTRICTION_RL_VAL && varl != L"" && varl != variant_left) - || (direction == COMPILER_RESTRICTION_LR_VAL && varr != L"" && varr != variant_right)) + || (!altval.empty() && altval != alt) + || (direction == COMPILER_RESTRICTION_RL_VAL && !varval.empty() && varval != variant) + || (direction == COMPILER_RESTRICTION_RL_VAL && !varl.empty() && varl != variant_left) + || (direction == COMPILER_RESTRICTION_LR_VAL && !varr.empty() && varr != variant_right)) { // parse to the end of the entry - wstring name = L""; + UString name; while(name != COMPILER_ENTRY_ELEM) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } return; @@ -681,14 +665,14 @@ Compiler::procEntry() int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); - if(current_paradigm == L"" && verbose) + if(current_paradigm.empty() && verbose) { first_element = true; } @@ -712,12 +696,12 @@ Compiler::procEntry() // detecci���n del uso de paradigmas no definidos - wstring const &p = elements.rbegin()->paradigmName(); + UString const &p = elements.rbegin()->paradigmName(); if(paradigms.find(p) == paradigms.end()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined paradigm '" << p << L"'." < HERE */ - // list wb; - // wb.push_back(alphabet(L"<$>")); + // vector wb; + // wb.push_back(word_boundary); // EntryToken e; // e.setSingleTransduction(wb, wb); // elements.push_back(e); @@ -748,9 +732,9 @@ Compiler::procEntry() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << COMPILER_ENTRY_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << COMPILER_ENTRY_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } @@ -760,12 +744,11 @@ Compiler::procEntry() void Compiler::procNode() { - xmlChar const *xnombre = xmlTextReaderConstName(reader); - wstring nombre = XMLParseUtil::towstring(xnombre); + UString nombre = XMLParseUtil::readName(reader); - // HACER: optimizar el orden de ejecuci���n de esta ristra de "ifs" + // HACER: optimizar el orden de ejecución de esta ristra de "ifs" - if(nombre == L"#text") + if(nombre == "#text"_u) { /* ignorar */ } @@ -801,14 +784,14 @@ Compiler::procNode() { procSection(); } - else if(nombre == L"#comment") + else if(nombre == "#comment"_u) { /* ignorar */ } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << nombre << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << nombre << ">'." << endl; exit(EXIT_FAILURE); } } @@ -818,7 +801,7 @@ Compiler::procRegexp() { EntryToken et; xmlTextReaderRead(reader); - wstring re = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString re = XMLParseUtil::readValue(reader); et.setRegexp(re); xmlTextReaderRead(reader); return et; @@ -828,7 +811,7 @@ void Compiler::write(FILE *output) { // letters - Compression::wstring_write(letters, output); + Compression::string_write(letters, output); // symbols alphabet.write(output); @@ -836,16 +819,11 @@ Compiler::write(FILE *output) // transducers Compression::multibyte_write(sections.size(), output); - int conta=0; - for(map::iterator it = sections.begin(), - limit = sections.end(); - it != limit; it++) - { - conta++; - wcout << it->first << " " << it->second.size(); - wcout << " " << it->second.numberOfTransitions() << endl; - Compression::wstring_write(it->first, output); - it->second.write(output); + for (auto& it : sections) { + cout << it.first << " " << it.second.size(); + cout << " " << it.second.numberOfTransitions() << endl; + Compression::string_write(it.first, output); + it.second.write(output); } } diff --git a/src/lsx_compiler.h b/src/lsx_compiler.h index 971c6fb..3956f16 100644 --- a/src/lsx_compiler.h +++ b/src/lsx_compiler.h @@ -20,12 +20,12 @@ #include #include #include -#include #include #include #include #include +#include #include using namespace std; @@ -39,88 +39,95 @@ private: /** * The libxml2's XML reader */ - xmlTextReaderPtr reader; + xmlTextReaderPtr reader = nullptr; /** * The alt value */ - wstring alt; + UString alt; /** * The variant value (monodix) */ - wstring variant; + UString variant; /** * The variant value (left side of bidix) */ - wstring variant_left; + UString variant_left; /** * The variant value (right side of bidix) */ - wstring variant_right; + UString variant_right; /** * The paradigm being compiled */ - wstring current_paradigm; + UString current_paradigm; /** * The dictionary section being compiled */ - wstring current_section; + UString current_section; /** * The direction of the compilation, 'lr' (left-to-right) or 'rl' * (right-to-left) */ - wstring direction; + UString direction; /** * List of characters to be considered alphabetic */ - wstring letters; + UString letters; /** * Set verbose mode: warnings which may or may not be correct */ - bool verbose; + bool verbose = false; /** * First element (of an entry) */ - bool first_element; + bool first_element = false; /** * Identifier of all the symbols during the compilation */ Alphabet alphabet; + /** + * Special symbols + */ + int32_t any_tag = 0; + int32_t any_char = 0; + int32_t word_boundary = 0; + /** * List of named transducers-paradigms */ - map paradigms; + map paradigms; /** * List of named dictionary sections */ - map sections; + map sections; /** * List of named prefix copy of a paradigm */ - map, Ltstr> prefix_paradigms; + map> prefix_paradigms; /** * List of named suffix copy of a paradigm */ - map, Ltstr> suffix_paradigms; + map> suffix_paradigms; /** * List of named endings of a suffix copy of a paradgim */ - map, Ltstr> postsuffix_paradigms; + map> postsuffix_paradigms; /* @@ -175,7 +182,7 @@ private: * @param name the name of the attribute * @return the value of the attribute */ - wstring attrib(wstring const &name); + UString attrib(UString const &name); /** * Construct symbol pairs by align left side of both parts and insert @@ -186,7 +193,7 @@ private: * @param t the transducer * @return the last state of the inserted transduction */ - int matchTransduction(list const &lp, list const &rp, + int matchTransduction(vector const &lp, vector const &rp, int state, Transducer &t); /** * Parse the <p< element @@ -217,7 +224,7 @@ private: * @param name the name of the node * @param elem the name of the expected node */ - void skip(wstring &name, wstring const &elem); + void skip(UString &name, UString const &elem); /** * Skip all document #text nodes before "elem" @@ -225,22 +232,22 @@ private: * @param elem the name of the expected node * @param open true for open element, false for closed */ - void skip(wstring &name, wstring const &elem, bool open); + void skip(UString &name, UString const &elem, bool open); /** * Skip all blank #text nodes before "name" * @param name the name of the node */ - void skipBlanks(wstring &name); + void skipBlanks(UString &name); - void readString(list &result, wstring const &name); + void readString(vector &result, UString const &name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(wstring const &name); + void requireEmptyError(UString const &name); /** * Force an attribute to be specified, amd check for it @@ -249,8 +256,8 @@ private: * @param elemname the parent of the attribute */ - void requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname); + void requireAttribute(UString const &value, UString const &attrname, + UString const &elemname); /** * True if all the elements in the current node are blanks * @return true if all are blanks @@ -263,60 +270,50 @@ public: * Constants to represent the element and the attributes of * dictionaries */ - static wstring const COMPILER_DICTIONARY_ELEM; - static wstring const COMPILER_ALPHABET_ELEM; - static wstring const COMPILER_SDEFS_ELEM; - static wstring const COMPILER_SDEF_ELEM; - static wstring const COMPILER_N_ATTR; - static wstring const COMPILER_PARDEFS_ELEM; - static wstring const COMPILER_PARDEF_ELEM; - static wstring const COMPILER_PAR_ELEM; - static wstring const COMPILER_ENTRY_ELEM; - static wstring const COMPILER_RESTRICTION_ATTR; - static wstring const COMPILER_RESTRICTION_LR_VAL; - static wstring const COMPILER_RESTRICTION_RL_VAL; - static wstring const COMPILER_PAIR_ELEM; - static wstring const COMPILER_LEFT_ELEM; - static wstring const COMPILER_RIGHT_ELEM; - static wstring const COMPILER_S_ELEM; - static wstring const COMPILER_REGEXP_ELEM; - static wstring const COMPILER_SECTION_ELEM; - static wstring const COMPILER_ID_ATTR; - static wstring const COMPILER_TYPE_ATTR; - static wstring const COMPILER_IDENTITY_ELEM; - static wstring const COMPILER_JOIN_ELEM; - static wstring const COMPILER_BLANK_ELEM; - static wstring const COMPILER_POSTGENERATOR_ELEM; - static wstring const COMPILER_GROUP_ELEM; - static wstring const COMPILER_LEMMA_ATTR; - static wstring const COMPILER_IGNORE_ATTR; - static wstring const COMPILER_IGNORE_YES_VAL; - static wstring const COMPILER_ALT_ATTR; - static wstring const COMPILER_V_ATTR; - static wstring const COMPILER_VL_ATTR; - static wstring const COMPILER_VR_ATTR; - - static wstring const COMPILER_ANYTAG_ELEM; - static wstring const COMPILER_ANYCHAR_ELEM; - static wstring const COMPILER_WB_ELEM; - - - /** - * Constructor - */ - Compiler(); - - /** - * Destructor - */ - ~Compiler(); + static UString const COMPILER_DICTIONARY_ELEM; + static UString const COMPILER_ALPHABET_ELEM; + static UString const COMPILER_SDEFS_ELEM; + static UString const COMPILER_SDEF_ELEM; + static UString const COMPILER_N_ATTR; + static UString const COMPILER_PARDEFS_ELEM; + static UString const COMPILER_PARDEF_ELEM; + static UString const COMPILER_PAR_ELEM; + static UString const COMPILER_ENTRY_ELEM; + static UString const COMPILER_RESTRICTION_ATTR; + static UString const COMPILER_RESTRICTION_LR_VAL; + static UString const COMPILER_RESTRICTION_RL_VAL; + static UString const COMPILER_PAIR_ELEM; + static UString const COMPILER_LEFT_ELEM; + static UString const COMPILER_RIGHT_ELEM; + static UString const COMPILER_S_ELEM; + static UString const COMPILER_REGEXP_ELEM; + static UString const COMPILER_SECTION_ELEM; + static UString const COMPILER_ID_ATTR; + static UString const COMPILER_TYPE_ATTR; + static UString const COMPILER_IDENTITY_ELEM; + static UString const COMPILER_JOIN_ELEM; + static UString const COMPILER_BLANK_ELEM; + static UString const COMPILER_POSTGENERATOR_ELEM; + static UString const COMPILER_GROUP_ELEM; + static UString const COMPILER_LEMMA_ATTR; + static UString const COMPILER_IGNORE_ATTR; + static UString const COMPILER_IGNORE_YES_VAL; + static UString const COMPILER_ALT_ATTR; + static UString const COMPILER_V_ATTR; + static UString const COMPILER_VL_ATTR; + static UString const COMPILER_VR_ATTR; + + static UString const COMPILER_ANYTAG_ELEM; + static UString const COMPILER_ANYCHAR_ELEM; + static UString const COMPILER_WB_ELEM; + /** * Compile dictionary to letter transducers * @param fichero file * @param dir direction */ - void parse(string const &fichero, wstring const &dir); + void parse(string const &fichero, UString const &dir); // auto getAlt(); // auto getInt(); diff --git a/src/lsx_proc.cc b/src/lsx_proc.cc index cdc8094..eaf327c 100644 --- a/src/lsx_proc.cc +++ b/src/lsx_proc.cc @@ -28,8 +28,8 @@ int main (int argc, char** argv) LtLocale::tryToSetLocale(); LSXProcessor fstp; - FILE* input = stdin; - FILE* output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); #if HAVE_GETOPT_LONG static struct option long_options[]= @@ -72,22 +72,18 @@ int main (int argc, char** argv) } FILE* fst = fopen(argv[optind], "rb"); if(!fst) { - wcerr << "Error: Cannot open file '" << argv[optind] << "' for reading." << endl; + cerr << "Error: Cannot open file '" << argv[optind] << "' for reading." << endl; exit(EXIT_FAILURE); } fstp.load(fst); if (optind <= (argc - 2)) { - input = fopen(argv[optind+1], "rb"); - if (input == NULL || ferror(input)) { - wcerr << "Error: Cannot open file '" << argv[optind+1] << "' for reading." << endl; - exit(EXIT_FAILURE); - } + input.open_or_exit(argv[optind+1]); } if (optind <= (argc - 3)) { - output = fopen(argv[optind+2], "wb"); - if (output == NULL || ferror(output)) { - wcerr << "Error: Cannot open file '" << argv[optind+2] << "' for writing." << endl; + output = u_fopen(argv[optind+2], "w", NULL, NULL); + if (output == NULL) { + cerr << "Error: Cannot open file '" << argv[optind+2] << "' for writing." << endl; } } diff --git a/src/lsx_processor.cc b/src/lsx_processor.cc index 7f68dc5..ce1eaf4 100644 --- a/src/lsx_processor.cc +++ b/src/lsx_processor.cc @@ -1,20 +1,21 @@ #include "lsx_processor.h" #include +#include LSXProcessor::LSXProcessor() { - escaped_chars.insert(L'['); - escaped_chars.insert(L']'); - escaped_chars.insert(L'{'); - escaped_chars.insert(L'}'); - escaped_chars.insert(L'^'); - escaped_chars.insert(L'$'); - escaped_chars.insert(L'/'); - escaped_chars.insert(L'\\'); - escaped_chars.insert(L'@'); - escaped_chars.insert(L'<'); - escaped_chars.insert(L'>'); + escaped_chars.insert('['); + escaped_chars.insert(']'); + escaped_chars.insert('{'); + escaped_chars.insert('}'); + escaped_chars.insert('^'); + escaped_chars.insert('$'); + escaped_chars.insert('/'); + escaped_chars.insert('\\'); + escaped_chars.insert('@'); + escaped_chars.insert('<'); + escaped_chars.insert('>'); null_flush = false; dictionary_case = false; @@ -46,18 +47,18 @@ LSXProcessor::load(FILE *input) int len = Compression::multibyte_read(input); while(len > 0) { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); len--; } // symbols alphabet.read(input); - word_boundary = alphabet(L"<$>"); - any_char = alphabet(L""); - any_tag = alphabet(L""); + word_boundary = alphabet("<$>"_u); + any_char = alphabet(""_u); + any_tag = alphabet(""_u); len = Compression::multibyte_read(input); - Compression::wstring_read(input); // name + Compression::string_read(input); // name // there should only be 1 transducer in the file // so ignore any subsequent ones trans.read(input, alphabet); @@ -67,65 +68,65 @@ LSXProcessor::load(FILE *input) } void -LSXProcessor::readNextLU(FILE* input) +LSXProcessor::readNextLU(InputFile& input) { - vector parts = vector(3); + vector parts = vector(3); int loc = 0; // 0 = blank, 1 = bound blank, 2 = LU bool box = false; // are we in a [ ] blank - while(!feof(input)) + while(!input.eof()) { - wchar_t c = fgetwc_unlocked(input); - if ((unsigned int)c == WEOF) { + UChar32 c = input.get(); + if ((unsigned int)c == U_EOF) { break; } - if(null_flush && c == L'\0') + if(null_flush && c == '\0') { at_end = true; at_null = true; break; } - else if(c == L'\\') + else if(c == '\\') { parts[loc] += c; - c = fgetwc_unlocked(input); + c = input.get(); parts[loc] += c; } else if(loc == 0 && box) { - if(c == L']') + if(c == ']') { box = false; } parts[loc] += c; } - else if(loc == 0 && c == L'[') + else if(loc == 0 && c == '[') { - c = fgetwc_unlocked(input); - if(c == L'[') + c = input.get(); + if(c == '[') { loc = 1; } else { - parts[loc] += L'['; + parts[loc] += '['; parts[loc] += c; - if(c != L']') + if(c != ']') { box = true; } - if(c == L'\\') + if(c == '\\') { - parts[loc] += fgetwc_unlocked(input); + parts[loc] += input.get(); } } } - else if(loc == 1 && c == L']') + else if(loc == 1 && c == ']') { - c = fgetwc_unlocked(input); - if(c == L']') + c = input.get(); + if(c == ']') { - c = fgetwc_unlocked(input); - if(c == L'^') + c = input.get(); + if(c == '^') { loc = 2; } @@ -134,25 +135,25 @@ LSXProcessor::readNextLU(FILE* input) // this situation is invalid // but I like making parsers harder to break than required // by the standard - parts[loc] += L"]]"; + parts[loc] += "]]"_u; parts[loc] += c; } } else { - parts[loc] += L']'; + parts[loc] += ']'; parts[loc] += c; - if(c == L'\\') + if(c == '\\') { - parts[loc] += fgetwc_unlocked(input); + parts[loc] += input.get(); } } } - else if(loc == 0 && c == L'^') + else if(loc == 0 && c == '^') { loc = 2; } - else if(loc == 2 && c == L'$') + else if(loc == 2 && c == '$') { break; } @@ -161,7 +162,7 @@ LSXProcessor::readNextLU(FILE* input) parts[loc] += c; } } - if(feof(input)) + if(input.eof()) { at_end = true; } @@ -171,7 +172,7 @@ LSXProcessor::readNextLU(FILE* input) } void -LSXProcessor::processWord(FILE* input, FILE* output) +LSXProcessor::processWord(InputFile& input, UFILE* output) { if(lu_queue.size() == 0) { @@ -180,14 +181,14 @@ LSXProcessor::processWord(FILE* input, FILE* output) if(at_end && lu_queue.size() == 1 && lu_queue.back().size() == 0) { // we're at the final blank, no more work to do - fputws_unlocked(blank_queue.back().c_str(), output); + write(blank_queue.back(), output); blank_queue.pop_front(); bound_blank_queue.pop_front(); lu_queue.pop_front(); return; } size_t last_final = 0; - wstring last_final_out; + UString last_final_out; State s; s.init(trans.getInitial()); size_t idx = 0; @@ -203,7 +204,7 @@ LSXProcessor::processWord(FILE* input, FILE* output) } readNextLU(input); } - wstring lu = lu_queue[idx]; + UString lu = lu_queue[idx]; if(lu.size() == 0) { break; @@ -214,22 +215,22 @@ LSXProcessor::processWord(FILE* input, FILE* output) } for(size_t i = 0; i < lu.size(); i++) { - if(lu[i] == L'<') + if(lu[i] == '<') { size_t j = i+1; for(; j < lu.size(); j++) { - if(lu[j] == L'\\') + if(lu[j] == '\\') { j++; } - else if(lu[j] == L'>') + else if(lu[j] == '>') { j++; break; } } - wstring tag = lu.substr(i, j-i); + UString tag = lu.substr(i, j-i); i = j-1; if(!alphabet.isSymbolDefined(tag)) { @@ -239,7 +240,7 @@ LSXProcessor::processWord(FILE* input, FILE* output) } else { - if(lu[i] == L'\\') + if(lu[i] == '\\') { i++; } @@ -258,28 +259,24 @@ LSXProcessor::processWord(FILE* input, FILE* output) } if(last_final == 0) { - fputws_unlocked(blank_queue.front().c_str(), output); + write(blank_queue.front(), output); blank_queue.pop_front(); - if(bound_blank_queue.front().size() > 0) + if(!bound_blank_queue.front().empty()) { - fputws_unlocked(L"[[", output); - fputws_unlocked(bound_blank_queue.front().c_str(), output); - fputws_unlocked(L"]]", output); + u_fprintf(output, "[[%S]]", bound_blank_queue.front().c_str()); } bound_blank_queue.pop_front(); - fputwc_unlocked(L'^', output); - fputws_unlocked(lu_queue.front().c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "^%S$", lu_queue.front().c_str()); lu_queue.pop_front(); return; } - vector out_lus; + vector out_lus; size_t pos = 0; - while(pos != wstring::npos && pos != last_final_out.size()) + while(pos != UString::npos && pos != last_final_out.size()) { size_t start = pos; - pos = last_final_out.find(L"<$>", start); - if(pos == wstring::npos) + pos = last_final_out.find("<$>"_u, start); + if(pos == UString::npos) { out_lus.push_back(last_final_out.substr(start)); } @@ -290,26 +287,26 @@ LSXProcessor::processWord(FILE* input, FILE* output) } } - wstring wblank; + UString wblank; for(size_t i = 0; i < last_final; i++) { if(!bound_blank_queue[i].empty()) { if(wblank.empty()) { - wblank += L"[["; + wblank += "[["_u; } else { - wblank += L"; "; + wblank += "; "_u; } - wblank += bound_blank_queue[i].c_str(); + wblank += bound_blank_queue[i]; } } if(!wblank.empty()) { - wblank += L"]]"; + wblank += "]]"_u; } size_t i = 0; @@ -317,22 +314,22 @@ LSXProcessor::processWord(FILE* input, FILE* output) { if(i < last_final) { - fputws_unlocked(blank_queue[i].c_str(), output); + write(blank_queue[i], output); } else { - fputwc_unlocked(L' ', output); + u_fputc(' ', output); } - fputws_unlocked(wblank.c_str(), output); - fputwc_unlocked(L'^', output); - fputws_unlocked(out_lus[i].c_str(), output); - fputwc_unlocked(L'$', output); + write(wblank, output); + u_fputc('^', output); + write(out_lus[i], output); + u_fputc('$', output); } for(; i < last_final; i++) { - if(blank_queue[i] != L" ") + if(blank_queue[i] != " "_u) { - fputws_unlocked(blank_queue[i].c_str(), output); + write(blank_queue[i], output); } } blank_queue.erase(blank_queue.begin(), blank_queue.begin()+last_final); @@ -341,7 +338,7 @@ LSXProcessor::processWord(FILE* input, FILE* output) } void -LSXProcessor::process(FILE* input, FILE* output) +LSXProcessor::process(InputFile& input, UFILE* output) { while(true) { @@ -351,12 +348,8 @@ LSXProcessor::process(FILE* input, FILE* output) } if(at_null) { - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); at_end = false; at_null = false; } diff --git a/src/lsx_processor.h b/src/lsx_processor.h index 90d5a47..264a5dd 100644 --- a/src/lsx_processor.h +++ b/src/lsx_processor.h @@ -2,10 +2,11 @@ #define _LSX_PROCESSOR_H_ #include -#include +#include #include #include #include +#include #include class LSXProcessor @@ -13,8 +14,8 @@ class LSXProcessor private: TransExe trans; State initial_state; - set escaped_chars; - set alphabetic_chars; + set escaped_chars; + set alphabetic_chars; map all_finals; Alphabet alphabet; bool null_flush; @@ -22,12 +23,12 @@ private: bool at_end; bool at_null; - deque blank_queue; - deque bound_blank_queue; - deque lu_queue; + deque blank_queue; + deque bound_blank_queue; + deque lu_queue; - void readNextLU(FILE* input); - void processWord(FILE* input, FILE* output); + void readNextLU(InputFile& input); + void processWord(InputFile& input, UFILE* output); int word_boundary; int any_char; @@ -35,7 +36,7 @@ private: public: LSXProcessor(); void load(FILE* input); - void process(FILE* input, FILE* output); + void process(InputFile& input, UFILE* output); void setNullFlush(bool val) { null_flush = val; diff --git a/src/processor.cc b/src/processor.cc deleted file mode 100644 index aab265e..0000000 --- a/src/processor.cc +++ /dev/null @@ -1,173 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); - - -/* get the text between delim1 and delim2 */ -/* next_token() */ -wstring -readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) -{ - wstring result = L""; - result += delim1; - wchar_t c = delim1; - - while(!feof(input) && c != delim2) - { - c = static_cast(fgetwc(input)); //fget_unlocked - result += c; - } - - return result; -} - -/*** -main -***/ -int main (int argc, char** argv) -{ - Alphabet alphabet; - TransExe transducer; - - LtLocale::tryToSetLocale(); - FILE *fst = fopen(argv[1], "r"); - - set alphabetic_chars; - int len = Compression::multibyte_read(fst); - while(len > 0) - { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(fst))); - len--; - } - - alphabet.read(fst); - wcout << L"alphabet_size: " << alphabet.size() << endl; - - len = Compression::multibyte_read(fst); - len = Compression::multibyte_read(fst); - wcout << len << endl; - wstring name = L""; - while(len > 0) - { - name += static_cast(Compression::multibyte_read(fst)); - len--; - } - wcout << name << endl; - - transducer.read(fst, alphabet); - - FILE *input = stdin; - FILE *output = stdout; - - /* preparing for processing */ - vector alive_states; //A set of alive states is maintained to compute all the possible ways to - set anfinals; //alive node finals ? - set escaped_chars; - - State* initial_state = new State(); - initial_state->init(transducer.getInitial()); // getInitial() returns an int - anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end()); - - set final_states = transducer.getFinals(); - for(auto final_state : final_states) { - final_state.init(transducer.getInitial()); //initialize - } - - - /* processing */ - - vector new_states; - alive_states.push_back(*initial_state); - // TODO: insert the other states - // TODO: insert the final state - - int line_number = 0; - bool accepted = true; - while(!feof(input)) // while true - { - //initialize conditions - int tag_count = 0; - State* current_state = initial_state; - bool in_lemma = false; - bool in_take = false; - bool in_out = false; - - while (alive_states.size() > 1 and !isFinal(current_state)) { - //get the next token - int val = fgetwc(input); // read 1 wide char - bool is_tag = false; - if(val == L'<') // if in tag, get the whole tag - { - in_lemma = false; - is_tag = true; - wstring tag = L""; - tag = readFullBlock(input, L'<', L'>'); - val = static_cast(alphabet(tag)); - - tag_count++; - - cout << "val before: " << val << endl; - cout << "tag_count: " << tag_count << endl; - - if(val == 0 && tag_count > 2) //TODO: val==0? - { - val = static_cast(alphabet(L"")); - } - - cout << "val after: " << val << endl; - fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); - - if (tag == '') { - accepted = true; - } - } - else if(in_lemma && !in_take && !in_out) { - val == static_cast(alphabet(L"&")); - } - - // if (current_state == initial_state && not eof) { - //successfully reached eof - //exit() - - if (current_state == initial_state && val != '\n') { - accepted = true; - break; - } else if (val == '\n') { //or sent - accepted = true; - } - - //step into the next state - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) { //step //for every state in alive_states - State s = *it; - - if (tag_count > 2) { - s.step(val, alphabet(L"")); - } else { - s.step(val) - } - - if(s.size() > 0) - { - new_states.push_back(s); - } - wcout << (wchar_t) val << L" " << L"size: " << s.size() << L" final: " << s.isFinal(anfinals) << endl; - } - - alive_states.swap(new_states); - } - return 0; - } diff --git a/src/transducer.py b/src/transducer.py deleted file mode 100644 index 77fc1b4..0000000 --- a/src/transducer.py +++ /dev/null @@ -1,189 +0,0 @@ -#usage: python transducer.py testfile.txt - -import sys - -transitions = { - (-1,'^') : 0, - (0,'t') : 1, - (1,'a') : 2, - (2,'k') : 3, - (3,'e') : 4, - (4,'') : 5, - (5,'') : 6, - (6,'') : 7, - (6,'$') : 8, - (7,'') : 7, - (7,'$'): 8, - (8,' ') : 9, - (9,'^') : 10, - (10,'&') : 11, - (11,'&') : 11, - (11,'') : 12, - (11,'') : 13, - (11,'') : 14, - (11,'') : 15, - (11,''): 16, - (12,'') : 200, - (200,'') : 201, - (200,'$') : 17, - (201,'') : 201, - (201,'$') : 17, - (13,'') : 225, - (13,'$') : 250, - (225,'') : 225, - (225,'$') : 250, - (250,' '):251, - (251,'^'):252, - (252,'&'):253, - (253,'&'):253, - (253,''):12, - (253,''):13, - (14,'') : 275, - (275,'') : 276, - (275,'$') : 250, - (276,'') : 276, - (276,'$') : 250, - (15,'') : 200, - (16,''): 200, - (100,'') : 100, - (100,'$') : 17, - (17,' ') : 18, #do not go to state 17 unless you are expecting 'out' to be the next word - (18,'^') : 19, - (19,'o') : 20, - (20,'u') : 21, - (21,'t') : 22, - (22,'') : 23, - (22,'') : 24, - (23,'$') : 25, - (24,'$') : 25, - (25,'') : 26, - (25,' ') : 26, - (25,'\n') : 26, - (25,'^') : 27, - (27,'.') : 28, - (28,'') : 29, - (29,'$') : 25 -} - -# is required -# is optional -states = { - -1 : '', - 0 : '^', - 1 : 't', - 2 : 'a', - 3 : 'k', - 4 : 'e', - 5 : '', - 6 : '', #secondary tag is necessary - 7 : '', #third, fourth, fifth...tags are optional - 8 : '$', - 9 : ' ', - 10 : '^', - 11 : '&', #represents any character 'ANY_CHAR - 12 : '', - 13 : '', - 14 : '', - 15 : '', - 16 : '', - 100: '', - 200: '', - 201: '', - 225: '', - 250: '$', - 251: ' ', - 252: '^', - 253: '&', - 275: '', - 276: '', - 17 : '$', - 18 : ' ', - 19 : '^', - 20 : 'o', - 21 : 'u', - 22 : 't', - 23 : '', - 24 : '', - 25 : '$', - 26 : '\n', - 27 : '^', - 28 : '.', - 29 : '', - -} - -def next_token(file, subsequent_tag, in_lemma, in_take, in_out): - original_token = file.read(1) - modified_token = original_token - if original_token == '<': #if in tag - in_lemma = False - c = '' - while c != '>': - c = file.read(1) - original_token += c - modified_token += c - if subsequent_tag: - modified_token = '' - if in_lemma and not in_take and not in_out: - modified_token = '&' #ANY_CHAR - return original_token, modified_token - -def step(state, token): #token is at the next state - next_state = transitions.get((state,token)) - output_token = states.get(next_state) - return next_state, output_token #return the next state, or None if it doesn't exist - -def main(): - f = open(sys.argv[1]) - line_number = 0 - accepted = True - while True: - line = '' - if accepted: - line_number += 1 - current_state = -1 - - subsequent_tag = False - in_lemma = False - in_take = False - in_out = False - - while states.get(current_state) != None and current_state != 26: - original_token, modified_token = next_token(f, subsequent_tag, in_lemma, in_take, in_out) - if current_state == -1 and modified_token == '': - print('successfully reached end of file') - exit(0) - elif current_state == -1 and modified_token == '\n': - accepted = True - break - elif modified_token == '\n': - accepted = True - - current_state, output_token = step(current_state, modified_token) - if output_token == None: - break - - line += original_token - - subsequent_tag = current_state in [5, 6, 7, 12, 13, 14, 15, 16, 100, 200, 201, 225, 275, 276] - in_lemma = current_state in [1, 2, 3, 10, 11, 252, 253, 19, 20, 21, 22] - in_take = current_state in [1, 2, 3, 4] - if current_state == 19: - pos = f.tell() #store the current buffer position - peek = f.read(4) #read in the next 4 chars - f.seek(pos) #return to the original position - if peek == 'out<': - in_out = True - - if current_state == 26: - print str(line_number) + ' ' + line - accepted = True - else: - if accepted: - print str(line_number) + ' string not accepted \n' - accepted = False - current_state = -1 - line_number += 1 - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/src/transducer2.cc b/src/transducer2.cc deleted file mode 100644 index 7042095..0000000 --- a/src/transducer2.cc +++ /dev/null @@ -1,196 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -int main (int argc, char** argv) { - Alphabet alphabet; - - LtLocale::tryToSetLocale(); - - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L"<$>"); - - int vblex_sym = alphabet(L""); - int n_sym = alphabet(L""); - int adj_sym = alphabet(L""); - int det_sym = alphabet(L""); - int prn_sym = alphabet(L""); - int np_sym = alphabet(L""); - - int any_tag = alphabet(L""); - int any_char = alphabet(L""); - int wb_sym = alphabet(L"<$>"); - - /* reap from input file */ - for (string line; getline(cin, line);) { - Transducer t; - string first_token = line.substr(0, line.find(' ')); - string second_token = line.substr(line.find(' ') + 1); - - /* noun phrase acceptor: see README */ - - int initial = t.getInitial(); - int take_out = initial; - for (wchar_t c : first_token) { - take_out = t.insertSingleTransduction(alphabet(c,c), take_out); - } - take_out = t.insertSingleTransduction(alphabet(0,L'#'), take_out); - take_out = t.insertSingleTransduction(alphabet(0,L' '), take_out); - for (wchar_t c : second_token) { - take_out = t.insertSingleTransduction(alphabet(0,c), take_out); - } - take_out = t.insertSingleTransduction(alphabet(vblex_sym,vblex_sym), take_out); - int loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - int after_takeout = take_out; - - /* no det */ - int from_nodet = after_takeout; - - /* first lemma */ - loop = after_takeout; - take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop); - t.linkStates(take_out, loop, 0); - - int first_lm = take_out; - - /* prn */ - take_out = t.insertSingleTransduction(alphabet(prn_sym,prn_sym), first_lm); - - loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - int after_prn = take_out; - - /* np */ - take_out = t.insertSingleTransduction(alphabet(np_sym,np_sym), first_lm); - - loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - int after_np = take_out; - - /* det */ - take_out = t.insertSingleTransduction(alphabet(det_sym,det_sym), first_lm); - - loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - int after_det = take_out; - - /* no adj */ - int from_noadj = take_out; //same as after_det - - /* lemma for the adj */ - loop = after_det; - take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop); - t.linkStates(take_out, loop, 0); - - int lm_adj = take_out; - - /* adj */ - take_out = t.insertSingleTransduction(alphabet(adj_sym,adj_sym), lm_adj); - - int optional_adj = take_out; - - loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - - //may not have a second tag - t.linkStates(optional_adj, take_out, 0); - - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - int after_adj = take_out; - - /* lemma for the noun */ - loop = after_adj; - take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop); - t.linkStates(take_out, loop, 0); - - int lm_noun = take_out; - - /* possible subsequent adj */ - t.linkStates(lm_noun, lm_adj, alphabet(adj_sym,adj_sym)); - - /* n */ - take_out = t.insertSingleTransduction(alphabet(n_sym,n_sym), lm_noun); - - loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - /* out */ - int before_out = take_out; - - for (wchar_t c : second_token) { - take_out = t.insertSingleTransduction(alphabet(c,0), take_out); - } - take_out = t.insertSingleTransduction(alphabet(any_tag, 0), take_out); - take_out = t.insertSingleTransduction(alphabet(wb_sym,0), take_out); - - t.setFinal(take_out); - - /* final link states */ - t.linkStates(after_takeout, before_out, 0); - t.linkStates(after_prn, before_out, 0); - t.linkStates(after_np, before_out, 0); - t.linkStates(from_nodet, after_det, 0); - t.linkStates(from_noadj, after_adj, 0); - - string filename = regex_replace(line,std::regex("\\s+"), "") + ".fst"; - FILE* fst = fopen(filename.c_str(), "w+"); - // First write the letter symbols of the alphabet - Compression::wstring_write(L"abcdefghijklmnopqrstuvwxyz", fst); - // Then write the multicharacter symbols - alphabet.write(fst); - // Then write then number of transducers - Compression::multibyte_write(1, fst); - // Then write the name of the transducer - Compression::wstring_write(L"main@standard", fst); - // Then write the transducer - t.write(fst); - cout << line << " t.size(): " << t.size() << endl ; - fclose(fst); - } - - return 0; -} \ No newline at end of file