commit 7be2cae1285391ddb214101fac72ae92ca39f594 Author: Daniel Swanson Date: Fri Jun 11 18:38:07 2021 -0500 use ICU diff --git a/configure.ac b/configure.ac index 735e785..0e34469 100644 --- a/configure.ac +++ b/configure.ac @@ -58,15 +58,22 @@ PKG_CHECK_MODULES([LIBXML], [libxml-2.0 >= required_libxml_version]) AC_SUBST(LIBXML_CFLAGS) AC_SUBST(LIBXML_LIBS) +PKG_CHECK_MODULES([ICU], [icu-i18n, icu-io, icu-uc]) + +AC_SUBST(ICU_CFLAGS) +AC_SUBST(ICU_LIBS) + # Checks for libraries. AC_CHECK_LIB(xml2, xmlReaderForFile) +AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) + AC_CHECK_FUNCS([setlocale strdup]) AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked]) -CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS" -LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS -lz" +CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS" +LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS $ICU_LIBS -lz" # Checks for highest supported C++ standard AC_LANG(C++) diff --git a/src/lrx_comp.cc b/src/lrx_comp.cc index c1d5b46..41a28da 100644 --- a/src/lrx_comp.cc +++ b/src/lrx_comp.cc @@ -56,7 +56,9 @@ int main (int argc, char **argv) compiler.setDebugMode(true); } + cerr << "parse!" << endl; compiler.parse(argv[2]); + cerr << "write!" << endl; FILE *output = fopen(argv[3], "wb"); compiler.write(output); } diff --git a/src/lrx_compiler.cc b/src/lrx_compiler.cc index 3fb4e6a..9e17f5b 100644 --- a/src/lrx_compiler.cc +++ b/src/lrx_compiler.cc @@ -21,67 +21,55 @@ using namespace std; -wstring const LRXCompiler::LRX_COMPILER_LRX_ELEM = L"lrx"; -wstring const LRXCompiler::LRX_COMPILER_DEFSEQS_ELEM = L"def-seqs"; -wstring const LRXCompiler::LRX_COMPILER_DEFSEQ_ELEM = L"def-seq"; -wstring const LRXCompiler::LRX_COMPILER_RULES_ELEM = L"rules"; -wstring const LRXCompiler::LRX_COMPILER_RULE_ELEM = L"rule"; -wstring const LRXCompiler::LRX_COMPILER_MATCH_ELEM = L"match"; -wstring const LRXCompiler::LRX_COMPILER_SELECT_ELEM = L"select"; -wstring const LRXCompiler::LRX_COMPILER_REMOVE_ELEM = L"remove"; -wstring const LRXCompiler::LRX_COMPILER_OR_ELEM = L"or"; -wstring const LRXCompiler::LRX_COMPILER_REPEAT_ELEM = L"repeat"; -wstring const LRXCompiler::LRX_COMPILER_SEQ_ELEM = L"seq"; - -wstring const LRXCompiler::LRX_COMPILER_LEMMA_ATTR = L"lemma"; -wstring const LRXCompiler::LRX_COMPILER_SUFFIX_ATTR = L"suffix"; -wstring const LRXCompiler::LRX_COMPILER_CONTAINS_ATTR = L"contains"; -wstring const LRXCompiler::LRX_COMPILER_CASE_ATTR = L"case"; -wstring const LRXCompiler::LRX_COMPILER_SURFACE_ATTR = L"surface"; -wstring const LRXCompiler::LRX_COMPILER_TAGS_ATTR = L"tags"; -wstring const LRXCompiler::LRX_COMPILER_WEIGHT_ATTR = L"weight"; -wstring const LRXCompiler::LRX_COMPILER_COMMENT_ATTR = L"c"; -wstring const LRXCompiler::LRX_COMPILER_NAME_ATTR = L"n"; -wstring const LRXCompiler::LRX_COMPILER_FROM_ATTR = L"from"; -wstring const LRXCompiler::LRX_COMPILER_UPTO_ATTR = L"upto"; - -wstring const LRXCompiler::LRX_COMPILER_TYPE_SELECT = L"select"; -wstring const LRXCompiler::LRX_COMPILER_TYPE_REMOVE = L"remove"; -wstring const LRXCompiler::LRX_COMPILER_TYPE_SKIP = L"skip"; +UString const LRXCompiler::LRX_COMPILER_LRX_ELEM = "lrx"_u; +UString const LRXCompiler::LRX_COMPILER_DEFSEQS_ELEM = "def-seqs"_u; +UString const LRXCompiler::LRX_COMPILER_DEFSEQ_ELEM = "def-seq"_u; +UString const LRXCompiler::LRX_COMPILER_RULES_ELEM = "rules"_u; +UString const LRXCompiler::LRX_COMPILER_RULE_ELEM = "rule"_u; +UString const LRXCompiler::LRX_COMPILER_MATCH_ELEM = "match"_u; +UString const LRXCompiler::LRX_COMPILER_SELECT_ELEM = "select"_u; +UString const LRXCompiler::LRX_COMPILER_REMOVE_ELEM = "remove"_u; +UString const LRXCompiler::LRX_COMPILER_OR_ELEM = "or"_u; +UString const LRXCompiler::LRX_COMPILER_REPEAT_ELEM = "repeat"_u; +UString const LRXCompiler::LRX_COMPILER_SEQ_ELEM = "seq"_u; + +UString const LRXCompiler::LRX_COMPILER_LEMMA_ATTR = "lemma"_u; +UString const LRXCompiler::LRX_COMPILER_SUFFIX_ATTR = "suffix"_u; +UString const LRXCompiler::LRX_COMPILER_CONTAINS_ATTR = "contains"_u; +UString const LRXCompiler::LRX_COMPILER_CASE_ATTR = "case"_u; +UString const LRXCompiler::LRX_COMPILER_SURFACE_ATTR = "surface"_u; +UString const LRXCompiler::LRX_COMPILER_TAGS_ATTR = "tags"_u; +UString const LRXCompiler::LRX_COMPILER_WEIGHT_ATTR = "weight"_u; +UString const LRXCompiler::LRX_COMPILER_COMMENT_ATTR = "c"_u; +UString const LRXCompiler::LRX_COMPILER_NAME_ATTR = "n"_u; +UString const LRXCompiler::LRX_COMPILER_FROM_ATTR = "from"_u; +UString const LRXCompiler::LRX_COMPILER_UPTO_ATTR = "upto"_u; + +UString const LRXCompiler::LRX_COMPILER_TYPE_SELECT = "select"_u; +UString const LRXCompiler::LRX_COMPILER_TYPE_REMOVE = "remove"_u; +UString const LRXCompiler::LRX_COMPILER_TYPE_SKIP = "skip"_u; double const LRXCompiler::LRX_COMPILER_DEFAULT_WEIGHT = 1.0; -wstring -LRXCompiler::itow(int i) -{ - // Convert an int to a wstring - wchar_t buf[50]; - memset(buf, '\0', sizeof(buf)); - swprintf(buf, 50, L"%d", i); - wstring id(buf); - return id; -} - -int -LRXCompiler::wtoi(wstring w) +void +LRXCompiler::debug(const char* fmt, ...) { - // Convert a wstring to an int - wistringstream wstrm(w); - int i_name = -numeric_limits::max(); - wstrm >> i_name; - - return i_name; + if (debugMode) { + va_list argptr; + va_start(argptr, fmt); + u_vfprintf(debug_output, fmt, argptr); + va_end(argptr); + } } -double -LRXCompiler::wtod(wstring w) +UString +LRXCompiler::itow(int i) { - // Convert a wstring to a double - wistringstream wstrm(w); - double d_name = -numeric_limits::max(); - wstrm >> d_name; - - return d_name; + // Convert an int to a UString + UChar buf[50]; + u_snprintf(buf, 50, "%d", i); + UString id(buf); + return id; } LRXCompiler::LRXCompiler() @@ -90,6 +78,7 @@ LRXCompiler::LRXCompiler() debugMode = false; outputGraph = false; + debug_output = u_finit(stderr, NULL, NULL); currentRuleId = 0; @@ -99,15 +88,15 @@ LRXCompiler::LRXCompiler() canSelect = true; - alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_SELECT + L">"); - alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_REMOVE + L">"); - alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_SKIP + L">"); + alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_SELECT + ">"_u); + alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_REMOVE + ">"_u); + alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_SKIP + ">"_u); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L"<$>"); + alphabet.includeSymbol(""_u); + alphabet.includeSymbol(""_u); + alphabet.includeSymbol(""_u); + alphabet.includeSymbol(""_u); + alphabet.includeSymbol("<$>"_u); } @@ -129,64 +118,47 @@ LRXCompiler::setOutputGraph(bool o) } void -LRXCompiler::skipBlanks(wstring &name) +LRXCompiler::skipBlanks(UString &name) { - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { - if(name != L"#comment") + if(name != "#comment"_u) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } } -wstring -LRXCompiler::attrib(wstring const &name) +UString +LRXCompiler::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } -wstring -LRXCompiler::attrib(wstring const &name, const wstring fallback) +UString +LRXCompiler::attrib(UString const &name, const UString fallback) { - string mystr = ""; - for (int i = 0, limit = name.size(); i != limit; i++) { - mystr += static_cast(name[i]); - } - - xmlChar *attrname = xmlCharStrdup(mystr.c_str()); - xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); - wstring result = XMLParseUtil::towstring(myattr); - xmlFree(myattr); - xmlFree(attrname); - if(myattr == NULL) { - return fallback; - } - else { - return result; - } + return XMLParseUtil::attrib(reader, name, fallback); } bool LRXCompiler::allBlanks() { - bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - - for(unsigned int i = 0, limit = text.size(); i < limit; i++) - { - flag = flag && iswspace(text[i]); + UString text = XMLParseUtil::readValue(reader); + for (auto& c : text) { + if (!u_isspace(c)) { + return false; + } } - - return flag; + return true; } void @@ -210,7 +182,7 @@ LRXCompiler::parse(string const &fitxer) if(ret != 0) { - wcerr << L"Error: Parse error at the end of input." << endl; + cerr << "Error: Parse error at the end of input." << endl; } } @@ -218,14 +190,13 @@ LRXCompiler::parse(string const &fitxer) void LRXCompiler::procNode() { - xmlChar const *xnombre = xmlTextReaderConstName(reader); - wstring nombre = XMLParseUtil::towstring(xnombre); + UString nombre = XMLParseUtil::readName(reader); - if(nombre == L"#text") + if(nombre == "#text"_u) { /* ignorar */ } - else if(nombre== L"#comment") + else if(nombre== "#comment"_u) { /* ignorar */ } @@ -251,8 +222,8 @@ LRXCompiler::procNode() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << nombre << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << nombre << ">'." << endl; exit(EXIT_FAILURE); } @@ -262,10 +233,13 @@ LRXCompiler::procNode() void LRXCompiler::procRule() { - wstring comment = this->attrib(LRX_COMPILER_COMMENT_ATTR); - wstring xweight = this->attrib(LRX_COMPILER_WEIGHT_ATTR); - wstring nombre = this->attrib(LRX_COMPILER_NAME_ATTR); - double weight = wtod (xweight); + UString comment = this->attrib(LRX_COMPILER_COMMENT_ATTR); + UString xweight = this->attrib(LRX_COMPILER_WEIGHT_ATTR); + UString nombre = this->attrib(LRX_COMPILER_NAME_ATTR); + double weight = LRX_COMPILER_DEFAULT_WEIGHT; + if (!xweight.empty()) { + weight = stod(xweight); + } if(weight <= -numeric_limits::max()) { @@ -276,25 +250,22 @@ LRXCompiler::procRule() currentState = transducer.insertNewSingleTransduction(alphabet(0, 0), currentState); currentRuleId++; - wstring ruleId = L"<" + itow(currentRuleId) + L">"; + UString ruleId = "<"_u + itow(currentRuleId) + ">"_u; weights[currentRuleId] = weight; - if(debugMode) - { - fwprintf(stderr, L" rule: %d, weight: %.2f \n", currentRuleId, weight); - } + debug(" rule: %d, weight: %.2f \n", currentRuleId, weight); while(true) { int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); if(name == LRX_COMPILER_MATCH_ELEM) @@ -316,7 +287,7 @@ LRXCompiler::procRule() } else if(name == LRX_COMPILER_RULE_ELEM) { - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet("<$>"_u), alphabet("<$>"_u)), currentState); if(!alphabet.isSymbolDefined(ruleId.c_str())) { alphabet.includeSymbol(ruleId.c_str()); @@ -328,9 +299,9 @@ LRXCompiler::procRule() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_RULE_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << LRX_COMPILER_RULE_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } } @@ -343,10 +314,7 @@ void LRXCompiler::procOr() { - if(debugMode) - { - fwprintf(stderr, L" or: \n"); - } + debug(" or: \n"); int or_initial_state = currentState; vector reachedStates; @@ -355,12 +323,12 @@ LRXCompiler::procOr() int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); if(name == LRX_COMPILER_MATCH_ELEM) @@ -392,9 +360,9 @@ LRXCompiler::procOr() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_OR_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << LRX_COMPILER_OR_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } } @@ -412,18 +380,18 @@ LRXCompiler::procDefSeq() int oldstate = currentState; currentState = initialState; lastState = initialState; - wstring seqname = this->attrib(LRX_COMPILER_NAME_ATTR); + UString seqname = this->attrib(LRX_COMPILER_NAME_ATTR); while(true) { int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); if(name == LRX_COMPILER_MATCH_ELEM) @@ -450,9 +418,9 @@ LRXCompiler::procDefSeq() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_REPEAT_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << LRX_COMPILER_REPEAT_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } } @@ -468,22 +436,19 @@ void LRXCompiler::procMatch() { // These are mutually exclusive - wstring lemma = this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*"); - wstring contains = this->attrib(LRX_COMPILER_SUFFIX_ATTR); - wstring suffix = this->attrib(LRX_COMPILER_CONTAINS_ATTR); - wstring _case = this->attrib(LRX_COMPILER_CASE_ATTR); // This could potentially be non-exclusive + UString lemma = this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u); + UString contains = this->attrib(LRX_COMPILER_SUFFIX_ATTR); + UString suffix = this->attrib(LRX_COMPILER_CONTAINS_ATTR); + UString _case = this->attrib(LRX_COMPILER_CASE_ATTR); // This could potentially be non-exclusive // This is currently disabled: Future use - wstring surface = this->attrib(LRX_COMPILER_SURFACE_ATTR); + UString surface = this->attrib(LRX_COMPILER_SURFACE_ATTR); - wstring tags = this->attrib(LRX_COMPILER_TAGS_ATTR, L"*"); + UString tags = this->attrib(LRX_COMPILER_TAGS_ATTR, "*"_u); - if(surface != L"") + if(!surface.empty()) { - if(debugMode) - { - fwprintf(stderr, L" match: %S\n", surface.c_str()); - } + debug(" match: %S\n", surface.c_str()); for(auto& it : surface) { @@ -492,70 +457,64 @@ LRXCompiler::procMatch() } else { - if(debugMode) - { - fwprintf(stderr, L" match: [%S, %S, %S, %S] %S\n", lemma.c_str(), suffix.c_str(), contains.c_str(), _case.c_str(), tags.c_str()); - } + debug(" match: [%S, %S, %S, %S] %S\n", lemma.c_str(), suffix.c_str(), contains.c_str(), _case.c_str(), tags.c_str()); - if(_case != L"") + if(_case != ""_u) { - if(_case == L"AA") // + + if(_case == "AA"_u) // + { int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet(""_u), 0), currentState); transducer.linkStates(currentState, localLast, 0); } - else if(_case == L"aa") // + + else if(_case == "aa"_u) // + { int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet(""_u), 0), currentState); transducer.linkStates(currentState, localLast, 0); } - else if(_case == L"Aa") // + + + else if(_case == "Aa"_u) // + + { - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet(""_u), 0), currentState); int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet(""_u), 0), currentState); transducer.linkStates(currentState, localLast, 0); } } - if(lemma == L"*" && suffix == L"" && contains == L"" && _case == L"") + if(lemma == "*"_u && suffix.empty() && contains.empty() && _case.empty()) { // This is only if there is no suffix or case or contains - if(debugMode) - { - fwprintf(stderr, L" char: -\n"); - } + debug(" char: -\n"); int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet(""_u), 0), currentState); transducer.linkStates(currentState, localLast, 0); } - else if(suffix != L"") + else if(suffix != ""_u) { // A suffix is any amount of times followed by whatever is in the suffix int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet(""_u), 0), currentState); transducer.linkStates(currentState, localLast, 0); for(auto& it : suffix) { currentState = transducer.insertSingleTransduction(alphabet(it, 0), currentState); } } - else if(contains != L"") + else if(!contains.empty()) { // A contains is any amount of times followed by whatever is in the attribute // followed by any amount of times int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet(""_u), 0), currentState); transducer.linkStates(currentState, localLast, 0); for(auto& it : suffix) { currentState = transducer.insertSingleTransduction(alphabet(it, 0), currentState); } - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet(""_u), 0), currentState); transducer.linkStates(currentState, localLast, 0); } - else if(lemma != L"*") + else if(lemma != "*"_u) { for(auto& it : lemma) { @@ -564,66 +523,57 @@ LRXCompiler::procMatch() } else { - fwprintf(stderr, L"Something surprising happened in compilation\n"); + cerr << "Something surprising happened in compilation\n"; } - wstring tag = L""; + UString tag; for(auto& it : tags) { - if(it == L'.') + if(it == '.') { - if(tag == L"") + if(tag.empty()) { continue; } - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { alphabet.includeSymbol(tag.c_str()); } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } - if(tag == L"<*>") + debug(" tag: %S\n", tag.c_str()); + if(tag == "<*>"_u) { int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet(""_u), 0), currentState); transducer.linkStates(currentState, localLast, 0); } else { currentState = transducer.insertSingleTransduction(alphabet(alphabet(tag.c_str()), 0), currentState); } - tag = L""; + tag = ""_u; continue; } tag = tag + it; } - if(tag == L"*") + if(tag == "*"_u) { - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } + debug(" tag: %S\n", tag.c_str()); int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet(""_u), 0), currentState); transducer.linkStates(currentState, localLast, 0); } - else if(tag == L"") + else if(tag.empty()) { } else { - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { alphabet.includeSymbol(tag.c_str()); } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } + debug(" tag: %S\n", tag.c_str()); currentState = transducer.insertSingleTransduction(alphabet(alphabet(tag.c_str()), 0), currentState); } } @@ -631,31 +581,31 @@ LRXCompiler::procMatch() if(xmlTextReaderIsEmptyElement(reader)) { // If self-closing - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState); - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet("<$>"_u), alphabet("<$>"_u)), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(""_u)), currentState); return; } - wstring name = L""; + UString name = ""_u; while(true) { int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); skipBlanks(name); if(name == LRX_COMPILER_SELECT_ELEM) { if(!canSelect) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): is not permitted inside ." << endl; exit(EXIT_FAILURE); } procSelect(); @@ -664,8 +614,8 @@ LRXCompiler::procMatch() { if(!canSelect) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): is not permitted inside ." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): is not permitted inside ." << endl; exit(EXIT_FAILURE); } procRemove(); @@ -676,9 +626,9 @@ LRXCompiler::procMatch() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_MATCH_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << LRX_COMPILER_MATCH_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } } @@ -691,11 +641,11 @@ void LRXCompiler::procSelect() { - wstring lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*"); - wstring tags =this->attrib(LRX_COMPILER_TAGS_ATTR); + UString lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u); + UString tags =this->attrib(LRX_COMPILER_TAGS_ATTR); - wstring key = L"<" + LRX_COMPILER_TYPE_SELECT + L">"; - if(lemma != L"*") + UString key = "<"_u + LRX_COMPILER_TYPE_SELECT + ">"_u; + if(lemma != "*"_u) { key += lemma; } @@ -703,22 +653,19 @@ LRXCompiler::procSelect() Transducer recogniser; int localCurrentState = recogniser.getInitial(); - if(debugMode) - { - fwprintf(stderr, L" select: %S, %S\n", lemma.c_str(), tags.c_str()); - } + debug(" select: %S, %S\n", lemma.c_str(), tags.c_str()); - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState); - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<" + LRX_COMPILER_TYPE_SELECT + L">")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet("<$>"_u), alphabet("<$>"_u)), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<"_u + LRX_COMPILER_TYPE_SELECT + ">"_u)), currentState); - if(lemma == L"*") + if(lemma == "*"_u) { - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(""_u)), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(""_u),0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { for (auto &it : lemma) { @@ -727,29 +674,26 @@ LRXCompiler::procSelect() } } - if(tags != L"") + if(tags != ""_u) { - wstring tag = L""; + UString tag = ""_u; for(auto& it : tags) { - if(it == L'.') + if(it == '.') { - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { alphabet.includeSymbol(tag.c_str()); } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } - if(tag == L"<*>") + debug(" tag: %S\n", tag.c_str()); + if(tag == "<*>"_u) { - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(""_u)), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(""_u),0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { @@ -757,34 +701,28 @@ LRXCompiler::procSelect() localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState); key = key + tag; } - tag = L""; + tag = ""_u; continue; } tag = tag + it; } - if(tag == L"*") + if(tag == "*"_u) { - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + debug(" tag: %S\n", tag.c_str()); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(""_u)), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(""_u),0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { alphabet.includeSymbol(tag.c_str()); } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } + debug(" tag: %S\n", tag.c_str()); currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(tag.c_str())), currentState); localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState); key = key + tag; @@ -792,26 +730,20 @@ LRXCompiler::procSelect() } else { - if(debugMode) - { - fwprintf(stderr, L" tag: -\n"); - } - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + debug(" tag: -\n"); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(""_u)), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(""_u),0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } recogniser.setFinal(localCurrentState); recognisers[key] = recogniser; - if(debugMode) - { - fwprintf(stderr, L" select: %d\n", recognisers[key].size()); - } - //currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState); + debug(" select: %d\n", recognisers[key].size()); + //currentState = transducer.insertSingleTransduction(alphabet(alphabet("<$>"_u), alphabet("<$>"_u)), currentState); return; } @@ -820,11 +752,11 @@ void LRXCompiler::procRemove() { - wstring lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*"); - wstring tags =this->attrib(LRX_COMPILER_TAGS_ATTR); + UString lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u); + UString tags =this->attrib(LRX_COMPILER_TAGS_ATTR); - wstring key = L"<" + LRX_COMPILER_TYPE_REMOVE + L">"; - if(lemma != L"*") + UString key = "<"_u + LRX_COMPILER_TYPE_REMOVE + ">"_u; + if(lemma != "*"_u) { key += lemma; } @@ -832,21 +764,18 @@ LRXCompiler::procRemove() Transducer recogniser; int localCurrentState = recogniser.getInitial(); - if(debugMode) - { - fwprintf(stderr, L" remove: %S, %S\n", lemma.c_str(), tags.c_str()); - } + debug(" remove: %S, %S\n", lemma.c_str(), tags.c_str()); - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState); - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<" + LRX_COMPILER_TYPE_REMOVE + L">")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(alphabet("<$>"_u), alphabet("<$>"_u)), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<"_u + LRX_COMPILER_TYPE_REMOVE + ">"_u)), currentState); - if(lemma == L"*") + if(lemma == "*"_u) { - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(""_u)), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(""_u),0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { @@ -857,29 +786,26 @@ LRXCompiler::procRemove() } } - if(tags != L"") + if(tags != ""_u) { - wstring tag = L""; + UString tag = ""_u; for(auto& it : tags) { - if(it == L'.') + if(it == '.') { - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { alphabet.includeSymbol(tag.c_str()); } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } - if(tag == L"<*>") + debug(" tag: %S\n", tag.c_str()); + if(tag == "<*>"_u) { - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(""_u)), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(""_u),0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { @@ -887,34 +813,28 @@ LRXCompiler::procRemove() localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState); key = key + tag; } - tag = L""; + tag = ""_u; continue; } tag = tag + it; } - if(tag == L"*") + if(tag == "*"_u) { - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + debug(" tag: %S\n", tag.c_str()); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(""_u)), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(""_u),0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { - alphabet.includeSymbol(tag.c_str()); - } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); + alphabet.includeSymbol(tag); } + debug(" tag: %S\n", tag.c_str()); currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(tag.c_str())), currentState); localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState); key = key + tag; @@ -922,25 +842,19 @@ LRXCompiler::procRemove() } else { - if(debugMode) - { - fwprintf(stderr, L" tag: -\n"); - } - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + debug(" tag: -\n"); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(""_u)), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(""_u),0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } recogniser.setFinal(localCurrentState); recognisers[key] = recogniser; - if(debugMode) - { - fwprintf(stderr, L" remove: %d\n", recognisers[key].size()); - } + debug(" remove: %d\n", recognisers[key].size()); return; } @@ -951,20 +865,20 @@ LRXCompiler::procRepeat() { bool couldSelect = canSelect; canSelect = false; - wstring xfrom = this->attrib(LRX_COMPILER_FROM_ATTR); - wstring xupto = this->attrib(LRX_COMPILER_UPTO_ATTR); + UString xfrom = this->attrib(LRX_COMPILER_FROM_ATTR); + UString xupto = this->attrib(LRX_COMPILER_UPTO_ATTR); int from = stoi(xfrom); int upto = stoi(xupto); if(from < 0 || upto < 0) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Number of repetitions cannot be negative." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Number of repetitions cannot be negative." << endl; exit(EXIT_FAILURE); } else if(from > upto) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Lower bound on number of repetitions cannot be larger than upper bound." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Lower bound on number of repetitions cannot be larger than upper bound." << endl; exit(EXIT_FAILURE); } int count = upto - from; @@ -978,12 +892,12 @@ LRXCompiler::procRepeat() int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); if(name == LRX_COMPILER_MATCH_ELEM) @@ -1006,9 +920,9 @@ LRXCompiler::procRepeat() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_REPEAT_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << LRX_COMPILER_REPEAT_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } } @@ -1031,11 +945,11 @@ LRXCompiler::procRepeat() void LRXCompiler::procSeq() { - wstring name = this->attrib(LRX_COMPILER_NAME_ATTR); + UString name = this->attrib(LRX_COMPILER_NAME_ATTR); if(sequences.find(name) == sequences.end()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Sequence '" << name << L"' not defined." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Sequence '" << name << "' not defined." << endl; exit(EXIT_FAILURE); } currentState = transducer.insertTransducer(currentState, sequences[name]); @@ -1050,28 +964,24 @@ LRXCompiler::write(FILE *fst) Compression::multibyte_write(recognisers.size(), fst); for(auto& it : recognisers) { - Compression::wstring_write(it.first, fst); - if(debugMode) - { - fwprintf(stderr, L"+ %d => %S\n", it.second.size(), it.first.c_str()); - it.second.show(alphabet, stderr, 0, false); + Compression::string_write(it.first, fst); + debug("+ %d => %S\n", it.second.size(), it.first.c_str()); + if (debugMode) { + it.second.show(alphabet, debug_output, 0, false); } it.second.write(fst); } - Compression::wstring_write(L"main", fst); + Compression::string_write("main"_u, fst); if(outputGraph) { - transducer.show(alphabet, stderr, 0, false); + transducer.show(alphabet, debug_output, 0, false); } transducer.write(fst); for(auto& it : weights) { - if(debugMode) - { - fwprintf(stderr, L"%.4f %d\n", it.second, it.first); - } + debug("%.4f %d\n", it.second, it.first); weight record{it.first, "", it.second}; weight_to_le(record); fwrite((void *)&record, 1, sizeof(weight), fst); @@ -1079,6 +989,6 @@ LRXCompiler::write(FILE *fst) if(!outputGraph) { - fwprintf(stderr, L"%d: %d@%d\n", currentRuleId, transducer.size(), transducer.numberOfTransitions()); + u_fprintf(debug_output, "%d: %d@%d\n", currentRuleId, transducer.size(), transducer.numberOfTransitions()); } } diff --git a/src/lrx_compiler.h b/src/lrx_compiler.h index 099c4a7..04cbe1e 100644 --- a/src/lrx_compiler.h +++ b/src/lrx_compiler.h @@ -43,6 +43,8 @@ #include #include +#include + using namespace std; class LRXCompiler @@ -52,10 +54,10 @@ private: Alphabet alphabet; Transducer transducer; - map recognisers; // keyed on pattern + map recognisers; // keyed on pattern map weights; // keyed on rule id - map sequences; + map sequences; int initialState; int lastState; @@ -66,9 +68,11 @@ private: bool debugMode; bool outputGraph; + UFILE* debug_output; + void debug(const char* fmt, ...); bool allBlanks(); - void skipBlanks(wstring &name); + void skipBlanks(UString &name); void procNode(); void procList(); void procListMatch(); @@ -82,43 +86,43 @@ private: void procSeq(); /* If attrib does not exist (or other error), returns an empty string: */ - wstring attrib(wstring const &name); + UString attrib(UString const &name); /* If attrib does not exist (or other error), returns fallback: */ - wstring attrib(wstring const &name, const wstring fallback); + UString attrib(UString const &name, const UString fallback); - wstring itow(int i); - int wtoi(wstring); - double wtod(wstring); + UString itow(int i); + int wtoi(UString); + double wtod(UString); public: - static wstring const LRX_COMPILER_LRX_ELEM; - static wstring const LRX_COMPILER_DEFSEQS_ELEM; - static wstring const LRX_COMPILER_DEFSEQ_ELEM; - static wstring const LRX_COMPILER_RULES_ELEM; - static wstring const LRX_COMPILER_RULE_ELEM; - static wstring const LRX_COMPILER_MATCH_ELEM; - static wstring const LRX_COMPILER_SELECT_ELEM; - static wstring const LRX_COMPILER_REMOVE_ELEM; - static wstring const LRX_COMPILER_OR_ELEM; - static wstring const LRX_COMPILER_REPEAT_ELEM; - static wstring const LRX_COMPILER_SEQ_ELEM; - - static wstring const LRX_COMPILER_SURFACE_ATTR; - static wstring const LRX_COMPILER_SUFFIX_ATTR; - static wstring const LRX_COMPILER_LEMMA_ATTR; - static wstring const LRX_COMPILER_CONTAINS_ATTR; - static wstring const LRX_COMPILER_CASE_ATTR; - static wstring const LRX_COMPILER_TAGS_ATTR; - static wstring const LRX_COMPILER_COMMENT_ATTR; - static wstring const LRX_COMPILER_NAME_ATTR; - static wstring const LRX_COMPILER_WEIGHT_ATTR; - static wstring const LRX_COMPILER_FROM_ATTR; - static wstring const LRX_COMPILER_UPTO_ATTR; - - static wstring const LRX_COMPILER_TYPE_SELECT; - static wstring const LRX_COMPILER_TYPE_REMOVE; - static wstring const LRX_COMPILER_TYPE_SKIP; + static UString const LRX_COMPILER_LRX_ELEM; + static UString const LRX_COMPILER_DEFSEQS_ELEM; + static UString const LRX_COMPILER_DEFSEQ_ELEM; + static UString const LRX_COMPILER_RULES_ELEM; + static UString const LRX_COMPILER_RULE_ELEM; + static UString const LRX_COMPILER_MATCH_ELEM; + static UString const LRX_COMPILER_SELECT_ELEM; + static UString const LRX_COMPILER_REMOVE_ELEM; + static UString const LRX_COMPILER_OR_ELEM; + static UString const LRX_COMPILER_REPEAT_ELEM; + static UString const LRX_COMPILER_SEQ_ELEM; + + static UString const LRX_COMPILER_SURFACE_ATTR; + static UString const LRX_COMPILER_SUFFIX_ATTR; + static UString const LRX_COMPILER_LEMMA_ATTR; + static UString const LRX_COMPILER_CONTAINS_ATTR; + static UString const LRX_COMPILER_CASE_ATTR; + static UString const LRX_COMPILER_TAGS_ATTR; + static UString const LRX_COMPILER_COMMENT_ATTR; + static UString const LRX_COMPILER_NAME_ATTR; + static UString const LRX_COMPILER_WEIGHT_ATTR; + static UString const LRX_COMPILER_FROM_ATTR; + static UString const LRX_COMPILER_UPTO_ATTR; + + static UString const LRX_COMPILER_TYPE_SELECT; + static UString const LRX_COMPILER_TYPE_REMOVE; + static UString const LRX_COMPILER_TYPE_SKIP; static double const LRX_COMPILER_DEFAULT_WEIGHT; diff --git a/src/lrx_proc.cc b/src/lrx_proc.cc index bd77260..32ac345 100644 --- a/src/lrx_proc.cc +++ b/src/lrx_proc.cc @@ -92,7 +92,8 @@ int main(int argc, char *argv[]) } } - FILE *input = stdin, *output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); LtLocale::tryToSetLocale(); if(optind == (argc - 3)) @@ -103,14 +104,12 @@ int main(int argc, char *argv[]) endProgram(argv[0]); } - input = fopen(argv[optind+1], "rb"); - if(input == NULL || ferror(input)) - { + if (!input.open(argv[optind+1])) { endProgram(argv[0]); } - output= fopen(argv[optind+2], "wb"); - if(output == NULL || ferror(output)) + output = u_fopen(argv[optind+2], "w", NULL, NULL); + if(output == NULL) { endProgram(argv[0]); } @@ -126,9 +125,7 @@ int main(int argc, char *argv[]) endProgram(argv[0]); } - input = fopen(argv[optind+1], "rb"); - if(input == NULL || ferror(input)) - { + if (!input.open(argv[optind+1])) { endProgram(argv[0]); } @@ -150,14 +147,8 @@ int main(int argc, char *argv[]) endProgram(argv[0]); } -#ifdef _MSC_VER - _setmode(_fileno(input), _O_U8TEXT); - _setmode(_fileno(output), _O_U8TEXT); -#endif - lrxp.init(); lrxp.process(input, output); - fclose(input); - fclose(output); + u_fclose(output); return EXIT_SUCCESS; } diff --git a/src/lrx_processor.cc b/src/lrx_processor.cc index 276c6ba..3501d02 100644 --- a/src/lrx_processor.cc +++ b/src/lrx_processor.cc @@ -20,18 +20,17 @@ #include using namespace std; -wstring const LRXProcessor::LRX_PROCESSOR_TAG_SELECT = L""_u; +UString const LRXProcessor::LRX_PROCESSOR_TAG_REMOVE = ""_u; +UString const LRXProcessor::LRX_PROCESSOR_TAG_SKIP = ""_u; -wstring +UString LRXProcessor::itow(int i) { - // Convert an int to a wstring - wchar_t buf[50]; - memset(buf, '\0', sizeof(buf)); - swprintf(buf, 50, L"%d", i); - wstring id(buf); + // Convert an int to a UString + UChar buf[50]; + u_snprintf(buf, 50, "%d", i); + UString id(buf); return id; } @@ -82,34 +81,21 @@ LRXProcessor::load(FILE *in) while(len > 0) { - int len2 = Compression::multibyte_read(in); - wstring name = L""; - while(len2 > 0) - { - name += static_cast(Compression::multibyte_read(in)); - len2--; - } + UString name = Compression::string_read(in); recognisers[name].read(in, alphabet); if(debugMode) { - fwprintf(stderr, L"Recogniser: %S, [finals: %d]\n", name.c_str(), recognisers[name].getFinals().size()); + cerr << "Recogniser: " << name << ", [finals: " << recognisers[name].getFinals().size() << "]\n"; } len--; } if(debugMode) { - fwprintf(stderr, L"recognisers: %d\n", recognisers.size()); + cerr << "recognisers: " << recognisers.size() << endl; } - int len3 = Compression::multibyte_read(in); - - wstring name = L""; - while(len3 > 0) - { - name += static_cast(Compression::multibyte_read(in)); - len3--; - } + UString name = Compression::string_read(in); transducer.read(in, alphabet); @@ -118,13 +104,15 @@ LRXProcessor::load(FILE *in) while(fread(&record, sizeof(weight), 1, in)) { weight_from_le(record); - wstring sid = L"<" + itow(record.id) + L">"; + UString sid = "<"_u + itow(record.id) + ">"_u; weights[sid] = record.pisu; + /* if(debugMode) { - //fwprintf(stderr, L"%S %d weight(%.4f)\n", sid.c_str(), record.id, record.pisu); + cerr << sid << " " << record.id << " weight(" << record.pisu << ")\n"; } + */ } return; @@ -137,42 +125,26 @@ LRXProcessor::init() anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end()); - escaped_chars.insert(L'['); - escaped_chars.insert(L']'); - escaped_chars.insert(L'{'); - escaped_chars.insert(L'}'); - escaped_chars.insert(L'^'); - escaped_chars.insert(L'$'); - escaped_chars.insert(L'/'); - escaped_chars.insert(L'\\'); - escaped_chars.insert(L'@'); - escaped_chars.insert(L'<'); - escaped_chars.insert(L'>'); + escaped_chars.insert('['); + escaped_chars.insert(']'); + escaped_chars.insert('{'); + escaped_chars.insert('}'); + escaped_chars.insert('^'); + escaped_chars.insert('$'); + escaped_chars.insert('/'); + escaped_chars.insert('\\'); + escaped_chars.insert('@'); + escaped_chars.insert('<'); + escaped_chars.insert('>'); } -wstring -LRXProcessor::readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) -{ - wstring result = L""; - result += delim1; - wchar_t c = delim1; - - while(!feof(input) && c != delim2) - { - c = static_cast(fgetwc_unlocked(input)); - result += c; - } - - return result; -} - bool -LRXProcessor::recognisePattern(const wstring lu, const wstring op) +LRXProcessor::recognisePattern(const UString lu, const UString op) { if(recognisers.count(op) < 1) { - fwprintf(stderr, L"WARNING: Recogniser not found for key %S, skipping... [LU: %S]\n", op.c_str(), lu.c_str()); + cerr << "WARNING: Recogniser not found for key " << op << ", skipping... [LU: " << lu << "]" << endl; return false; } @@ -184,14 +156,14 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op) end_states.insert(recognisers[op].getFinals().begin(), recognisers[op].getFinals().end()); bool readingTag = false; - wstring tag = L""; + UString tag; int val = 0; for(auto& it : lu) { /* if(debugMode) { - fwprintf(stderr, L"alive: %d\n", cur.size()); + cerr << "alive: " << cur.size() << endl; } */ if(cur.size() < 1) // I think that any time we have 0 alive states, @@ -199,29 +171,29 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op) { return false; } - if(it == L'<') + if(it == '<') { - tag = L""; + tag.clear(); readingTag = true; - tag = tag + it; + tag += it; continue; } - if(it == L'>') + if(it == '>') { tag = tag + it; val = static_cast(alphabet(tag)); if(val == 0) { - val = static_cast(alphabet(L"")); + val = static_cast(alphabet(""_u)); } /* if(debugMode) { - fwprintf(stderr, L":: tag %S: %d\n", tag.c_str(), val); - fwprintf(stderr, L" step: %S\n", tag.c_str()); + cerr << ":: tag " << tag << ": " << val << endl; + cerr << " step: " << tag << endl; } */ - cur.step(val, alphabet(L"")); + cur.step(val, alphabet(""_u)); readingTag = false; continue; } @@ -236,21 +208,21 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op) /* if(debugMode) { - fwprintf(stderr, L" step: %C\n", val); + cerr << " step: " << val << endl; } */ - //cur.step(val, a(L"")); + //cur.step(val, a("")); //cur.step(val); set alts; if(!iswupper(val)) { - alts.insert(alphabet(L"")); - alts.insert(alphabet(L"")); + alts.insert(alphabet(""_u)); + alts.insert(alphabet(""_u)); } else { - alts.insert(alphabet(L"")); - alts.insert(alphabet(L"")); + alts.insert(alphabet(""_u)); + alts.insert(alphabet(""_u)); alts.insert(towlower(val)); } cur.step(val, alts); @@ -261,7 +233,7 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op) /* if(debugMode) { - fwprintf(stderr, L">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); + cerr << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"; } */ if(cur.isFinal(end_states)) @@ -272,541 +244,29 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op) return false; } -/* -void -LRXProcessor::processFlush(FILE *output, - map &sl, - map > &tl, - map &blanks, - map > > &covers, - pair > &empty_seq, - map, vector > &spans, - int last_final) -{ - if(debugMode) - { - fwprintf(stderr, L"FLUSH:\n"); - } - - map > >::iterator it; - map > operations; - - for(it = covers.begin(); it != covers.end(); it++) - { - pair > best = it->second; - if(debugMode) - { - fwprintf(stderr, L"===================================================\n"); - fwprintf(stderr, L"[%d][%d] covers[%d] best (score: %d, size: %d)\n", pos, last_final, it->first, best.first, best.second.size()); - } - - // return M[i-1] - if(it->first == last_final) - { - vector::iterator it2; - for(it2 = best.second.begin(); it2 != best.second.end(); it2++) - { - if(debugMode) - { - wstring out = it2->filterFinals(anfinals, alphabet, escaped_chars); - fwprintf(stderr, L"!!! filter_finals: %S\n", out.c_str()); - } - set > > outpaths; - outpaths = it2->filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0); - - int j = 1; - set > >::iterator it3; - for(it3 = outpaths.begin(); it3 != outpaths.end(); it3++) - { - wstring id = it3->first; - vector ops = it3->second; - vector::iterator op; - for(op = ops.begin(); op != ops.end(); op++) - { - if(*op != LRX_PROCESSOR_TAG_SKIP) - { - int starting_point = -1; - map, vector >::iterator ix; - for(ix = spans.begin(); ix != spans.end(); ix++) - { - vector::iterator iy; - for(iy = ix->second.begin(); iy != ix->second.end(); iy++) - { - set > > y; - y = iy->filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0); - if(y == outpaths) - { - starting_point = ix->first.first; - } - } - } - if(debugMode) - { - fwprintf(stderr, L"=> APPLY [pos: %d, dep: %d, j: %d, start: %d, len: %d]: %S // %S\n", pos, starting_point, j, starting_point+j, ops.size(), id.c_str(), op->c_str()); - } - operations[starting_point+j].first = id; - operations[starting_point+j].second = *op; - } - j++; - } - } - if(debugMode) - { - fwprintf(stderr, L"[best: %d, outpaths: %d]\n", best.first, outpaths.size()); - } - } - } - } - - covers.clear(); - covers[-1] = empty_seq; - covers[-1].first = 0; - - // Here we actually apply the rules that we've matched - - unsigned int spos = 0; - for(spos = 0; spos <= pos; spos++) - { - if(sl[spos] == L"") - { - continue; - } - wstring op = operations[spos].second; - wstring tipus = L""; - if(op.find(LRX_PROCESSOR_TAG_SELECT) != wstring::npos) - { - tipus = LRX_PROCESSOR_TAG_SELECT; - } - if(op.find(LRX_PROCESSOR_TAG_REMOVE) != wstring::npos) - { - tipus = LRX_PROCESSOR_TAG_REMOVE; - } - if(debugMode) - { - fwprintf(stderr, L"#APPL%S. %S\n", tipus.c_str(), op.c_str()); - } - - fwprintf(output, L"%S^%S/", blanks[spos].c_str(), sl[spos].c_str()); - - vector::iterator ti; - vector::iterator penum = tl[spos].end(); penum--; - - if(tipus == LRX_PROCESSOR_TAG_SELECT && tl[spos].size() > 1) - { - bool matched = true; - bool selected = false; - for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) - { - matched = recognisePattern(*ti, op); - if(matched) - { - if(traceMode || debugMode) - { - fwprintf(stderr, L"%d:SELECT%S:%S:%S\n", lineno, operations[spos].first.c_str(), sl[spos].c_str(), op.c_str()); - } - fwprintf(output, L"%S", ti->c_str()); - selected = true; - break; - } - } - if(!selected) - { - for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) - { - fwprintf(output, L"%S", ti->c_str()); - if(ti != penum) - { - fwprintf(output, L"/"); - } - } - } - } - else if(tipus == LRX_PROCESSOR_TAG_REMOVE && tl[spos].size() > 1) - { - bool matched = true; - vector new_tl; // The new list of TL translations - for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) - { - matched = recognisePattern(*ti, op); - if(matched) - { - if(traceMode || debugMode) - { - fwprintf(stderr, L"%d:REMOVE%S:%S:%S\n", lineno, operations[spos].first.c_str(), sl[spos].c_str(), op.c_str()); - } - continue; - } - new_tl.push_back(*ti); - } - vector::iterator nti; - vector::iterator npenum = new_tl.end(); npenum--; - for(nti = new_tl.begin(); nti != new_tl.end(); nti++) - { - fwprintf(output, L"%S", nti->c_str()); - if(nti != npenum) - { - fwprintf(output, L"/"); - } - } - new_tl.clear(); - } - else - { - for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) - { - fwprintf(output, L"%S", ti->c_str()); - if(ti != penum) - { - fwprintf(output, L"/"); - } - } - } - fwprintf(output, L"$"); - if(debugMode) - { - fwprintf(output, L"%d", spos); - } - } -} -*/ - -/* -void -LRXProcessor::process(FILE *input, FILE *output) -{ - bool isEscaped = false; - - map sl; // map of SL words - map > tl; // map of vectors of TL translations - map blanks; // map of the superblanks - - map > > covers ; - pair > empty_seq; - map, vector > spans ; - - covers[-1] = empty_seq; - covers[-1].first = 1.0; - - vector alive_states_clean ; - vector alive_states = alive_states_clean ; - alive_states.push_back(*initial_state); - vector new_states; - - int last_final = -1; // check what we actually use this for - - while(!feof(input)) - { - int val = fgetwc_unlocked(input); - - if(nullFlush && val == L'\0') - { - processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final); - fwprintf(output, L"%S", blanks[pos].c_str()); - pos = 0; - last_final = 0; - tl.clear(); - sl.clear(); - blanks.clear(); - spans.clear(); - - fputwc_unlocked(val, output); - fflush(output); - continue; - } - - // We're starting to read a new lexical form - if(val == L'^' && !isEscaped && outOfWord) - { - outOfWord = false; - continue; - } - - // We've seen the surface form - if(val == L'/' && !isEscaped && !outOfWord) - { - // Read in target equivalences - wstring trad = L""; - val = fgetwc_unlocked(input); - while(val != L'$') - { - if(val != L'$') - { - trad += static_cast(val); - } - if(val == L'/') - { - tl[pos].push_back(trad.substr(0, trad.length()-1)); - trad = L""; - } - val = fgetwc_unlocked(input); - } - tl[pos].push_back(trad); - - if(debugMode) - { - for(vector::iterator it = tl[pos].begin(); it != tl[pos].end(); it++) - { - fwprintf(stderr, L"trad[%d]: %S\n", pos, it->c_str()); - } - } - } - - // We've finished reading a lexical form - if((feof(input) || val == L'$') && !isEscaped && !outOfWord) - { - if(debugMode) - { - fwprintf(stderr, L"[POS] %d: [sl %d ; tl %d ; bl %d]\n", pos, sl[pos].size(), tl[pos].size(), blanks[pos].size()); - } - - new_states.clear(); // alive_states_new - pair > new_best_cover; - new_best_cover.first = -numeric_limits::max(); - - vector matched_rules; - - // \forall s \in A - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - State s = *it; - // \IF \exists c \in Q : \delta(s, sent[i]) = c - s.step(alphabet(L"<$>")); - - // A \gets A \cup {c} - if(s.size() > 0) // If the current state has outgoing transitions, - // add it to the new alive states - { - new_states.push_back(s); - } - s.step(alphabet(L"<$>")); - - // \IF c \in F - if(s.isFinal(anfinals)) - { - // We've reached a final state, so we need to evaluate the rule we've matched - if(debugMode) - { - wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); - fwprintf(stderr, L" filter_finals: %S\n", out.c_str()); - } - - set > > outpaths; - outpaths = s.filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0); - - set > >::iterator it; - for(it = outpaths.begin(); it != outpaths.end(); it++) - { - vector reached; - - vector path = (*it).second; - wstring id = (*it).first; - - if(debugMode) - { - fwprintf(stderr, L"id: %S:\n", id.c_str()); - for(vector::iterator it2 = path.begin(); it2 != path.end(); it2++) - { - fwprintf(stderr, L"op: %S\n", it2->c_str()); - } - fwprintf(stderr, L"#SPAN[%d, %d]\n", (pos-path.size()), pos); - } - - spans[make_pair((pos-path.size()), pos)].push_back(s); - - // M[i-ChunkLength(c)] - pair > newseq = covers[(pos - path.size())]; - newseq.first = newseq.first + path.size() ; - - if(newseq.first > new_best_cover.first) - { - State new_state; - new_state = s; - reached.push_back(new_state); - map > >::iterator k; - for(k = covers.begin(); k != covers.end(); k++) - { - vector::iterator l; - pair > p = k->second; - for(l = p.second.begin(); l != p.second.end(); l++) - { - if(debugMode) - { - fwprintf(stderr, L"= [cov: %d][len: %d][pos: %d][pat: %d] INCLUDE FINALS?\n", k->first, p.first, pos, path.size()); - } - if(k->first <= (pos - path.size())) - { - if(debugMode) - { - wstring out2 = l->filterFinals(anfinals, alphabet, escaped_chars); - fwprintf(stderr, L" == INCLUDE FINALS: %S\n", out2.c_str()); - } - reached.push_back(*l); - } - } - } - newseq.second = reached; - new_best_cover = newseq; - covers[pos] = newseq; - if(debugMode) - { - fwprintf(stderr, L"++ FINALS(%d) covers[%d] [%d, %d] BEST: %.4f > %.4f\n", newseq.second.size(), (pos - path.size()), pos, path.size(), newseq.first, new_best_cover.first); - } - } - - last_final = pos; - } - } - } - - alive_states.swap(new_states); - alive_states.push_back(*initial_state); - - if(debugMode) - { - fwprintf(stderr, L"#CURRENT_ALIVE: %d\n", alive_states.size()); - } - - if(alive_states.size() == 1) - { - // If we have only a single alive state, it means no rules are - // active, and we can flush the buffers. - processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final); - - pos = 0; - last_final = 0; - tl.clear(); - sl.clear(); - blanks.clear(); - spans.clear(); - } - - pos++; - if(debugMode) - { - fwprintf(stderr, L"==> new pos: %d\n", pos); - } - - outOfWord = true; - continue; - } - - - // We're reading a tag - if(val == L'<' && !isEscaped && !outOfWord) - { - wstring tag = L""; - tag = readFullBlock(input, L'<', L'>'); - sl[pos] = sl[pos] + tag; - val = static_cast(alphabet(tag)); - if(val == 0) - { - val = static_cast(alphabet(L"")); - } - if(debugMode) - { - fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); - } - } - - if(!outOfWord) - { - if(debugMode) - { - fwprintf(stderr, L"outOfWord = false\n"); - } - - new_states.clear(); - wstring res = L""; - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - res = L""; - State s = *it; - if(val < 0) - { - alphabet.getSymbol(res, val, false); - if(debugMode) - { - fwprintf(stderr, L" step: %S\n", res.c_str()); - } - s.step(val, alphabet(L"")); - } - else - { - if(debugMode) - { - fwprintf(stderr, L" step: %C\n", val); - } - s.step_case(val, alphabet(L""), false); - } - if(s.size() > 0) // If the current state has outgoing transitions, add it to the new alive states - { - new_states.push_back(s); - } - } - if(debugMode) - { - fwprintf(stderr, L"new_states: %d\n", new_states.size()); - } - alive_states.swap(new_states); - alive_states.push_back(*initial_state); - - } - - // We're still reading a surface form - if(val > 0 && val != L'$' && !isEscaped && !outOfWord) - { - sl[pos] = sl[pos] + static_cast(val); - } - - // Reading a superblank - if(outOfWord) - { - if(!feof(input)) - { - blanks[pos] = blanks[pos] + static_cast(val); - } - if(debugMode) - { - //fwprintf(stderr, L"blanks[%d] = %S\n", pos, blanks[pos].c_str()); - } - } - - // Increment the current line number (for rule tracing) - if(val == L'\n') - { - lineno++; - } - } - - processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final); - - fwprintf(output, L"%S", blanks[pos].c_str()); -} -*/ - void -LRXProcessor::process(FILE *input, FILE *output) +LRXProcessor::process(InputFile& input, UFILE *output) { bool isEscaped = false; - map sl; // map of SL words - map > tl; // map of vectors of TL translations - map blanks; // map of the superblanks + map sl; // map of SL words + map > tl; // map of vectors of TL translations + map blanks; // map of the superblanks - map > scores; // - map > operations; + map > scores; // + map > operations; vector alive_states ; alive_states.push_back(new State(*initial_state)); - int val = 0; - while((val = fgetwc_unlocked(input)) != EOF && val != WEOF) + int32_t val = 0; + while((val = input.get()) != U_EOF) { - if(nullFlush && val == L'\0') + if(nullFlush && val == '\0') { processFlush(output, sl, tl, blanks, scores, operations); - fwprintf(output, L"%S", blanks[pos].c_str()); + u_fprintf(output, "%S", blanks[pos].c_str()); pos = 0; tl.clear(); sl.clear(); @@ -816,63 +276,62 @@ LRXProcessor::process(FILE *input, FILE *output) alive_states.clear(); alive_states.push_back(new State(*initial_state)); - fputwc_unlocked(val, output); - fflush(output); + u_fputc(val, output); + u_fflush(output); continue; } // We're starting to read a new lexical form - if(val == L'^' && !isEscaped && outOfWord) + if(val == '^' && !isEscaped && outOfWord) { outOfWord = false; continue; } // We've seen the surface form - if(val == L'/' && !isEscaped && !outOfWord) + if(val == '/' && !isEscaped && !outOfWord) { // Read in target equivalences - wstring trad = L""; - val = fgetwc_unlocked(input); - while(val != L'$' && val != EOF && val != WEOF) + UString trad; + val = input.get(); + while(val != '$' && val != U_EOF) { - if(val != L'$') + if(val != '$') { - trad += static_cast(val); + trad += val; } - if(val == L'/') + if(val == '/') { tl[pos].push_back(trad.substr(0, trad.length()-1)); - trad = L""; + trad.clear(); } - val = fgetwc_unlocked(input); + val = input.get(); } tl[pos].push_back(trad); if(debugMode) { - for(auto& it : tl[pos]) - { - fwprintf(stderr, L"trad[%d]: %S\n", pos, it.c_str()); + for(auto& it : tl[pos]) { + cerr << "trad[" << pos << "]: " << it << endl; } } } - if((feof(input) || val == L'$') && !isEscaped && !outOfWord) + if((input.eof() || val == '$') && !isEscaped && !outOfWord) { if(debugMode) { - fwprintf(stderr, L"[POS] %d: [sl %d ; tl %d ; bl %d]: %S\n", pos, sl[pos].size(), tl[pos].size(), blanks[pos].size(), sl[pos].c_str()); + cerr << "[POS] " << pos << ": [sl " << sl[pos].size() << " ; tl " << tl[pos].size() << " ; bl " << blanks[pos].size() << "]: " << sl[pos] << endl; } { vector new_states; // TODO: Can we avoid the State-copying here? // \forall s \in A - set seen_ids; + set seen_ids; for(auto& it : alive_states) { State s = *it; // \IF \exists c \in Q : \delta(s, sent[i]) = c - s.step(alphabet(L"<$>")); + s.step(alphabet("<$>"_u)); // A \gets A \cup {c} if (s.size() > 0) // If the current state has outgoing transitions, @@ -880,7 +339,7 @@ LRXProcessor::process(FILE *input, FILE *output) { new_states.push_back(new State(s)); } - s.step(alphabet(L"<$>")); + s.step(alphabet("<$>"_u)); // \IF c \in F if (s.isFinal(anfinals)) @@ -888,18 +347,18 @@ LRXProcessor::process(FILE *input, FILE *output) // We've reached a final state, so we need to evaluate the rule we've matched if (debugMode) { - wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); - fwprintf(stderr, L" filter_finals: %S\n", out.c_str()); + UString out = s.filterFinals(anfinals, alphabet, escaped_chars); + cerr << " filter_finals: " << out << endl; } - set>> outpaths; + set>> outpaths; outpaths = s.filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0); for (auto& it : outpaths) { vector reached; - vector path = it.second; - wstring id = it.first; + vector path = it.second; + UString id = it.first; if (seen_ids.find(id) != seen_ids.end()) { @@ -911,13 +370,14 @@ LRXProcessor::process(FILE *input, FILE *output) if (debugMode) { - fwprintf(stderr, L"id: %S: (lambda: %.5f)\n", id.c_str(), weights[id.c_str()]); + cerr << "id: " << id << ": (lambda: "; + cerr << weights[id] << ")\n"; } for (auto& it2 : path) { if (debugMode) { - fwprintf(stderr, L"op: %S\n", it2.c_str()); + cerr << "op: " << it2 << endl; } if (it2 != LRX_PROCESSOR_TAG_SKIP) { @@ -928,9 +388,10 @@ LRXProcessor::process(FILE *input, FILE *output) scores[j][it2] += weights[id.c_str()]; if (debugMode) { - fwprintf(stderr, L"#[%d]SCORE %.5f / %S\n", j, scores[j][it2], it2.c_str()); + cerr << "#[" << j << "]SCORE " << scores[j][it2] << " / "; + cerr << it2 << endl; } - if(it2.at(0) == L'<' && it2.at(1) == L'r') { + if(it2.at(0) == '<' && it2.at(1) == 'r') { operations[j][it2] = Remove; } else { @@ -939,7 +400,7 @@ LRXProcessor::process(FILE *input, FILE *output) } j++; } - // fwprintf(stderr, L"#SPAN[%d, %d]\n", (pos-path.size()), pos); + // cerr << "#SPAN[" << (pos-path.size()) << ", " << pos << "]\n"; } } } @@ -953,13 +414,12 @@ LRXProcessor::process(FILE *input, FILE *output) if (debugMode) { - fwprintf(stderr, L"seen:"); - for (auto& it : seen_ids) - { - fwprintf(stderr, L" %S ", it.c_str()); + cerr << "seen:"; + for (auto& it : seen_ids) { + cerr << " " << it << " "; } - fwprintf(stderr, L"\n"); - fwprintf(stderr, L"#CURRENT_ALIVE: %d\n", alive_states.size()); + cerr << endl; + cerr << "#CURRENT_ALIVE: " << alive_states.size() << endl; } } @@ -970,7 +430,7 @@ LRXProcessor::process(FILE *input, FILE *output) if(debugMode) { - fwprintf(stderr, L"FLUSH:\n"); + cerr << "FLUSH:" << endl; } @@ -988,7 +448,7 @@ LRXProcessor::process(FILE *input, FILE *output) pos++; if(debugMode) { - fwprintf(stderr, L"==> new pos: %d\n", pos); + cerr << "==> new pos: " << pos << endl; } outOfWord = true; @@ -996,19 +456,18 @@ LRXProcessor::process(FILE *input, FILE *output) } // We're reading a tag - if(val == L'<' && !isEscaped && !outOfWord) + if(val == '<' && !isEscaped && !outOfWord) { - wstring tag = L""; - tag = readFullBlock(input, L'<', L'>'); + UString tag = input.readBlock('<', '>'); sl[pos] = sl[pos] + tag; val = static_cast(alphabet(tag)); if(val == 0) { - val = static_cast(alphabet(L"")); + val = static_cast(alphabet(""_u)); } if(debugMode) { - fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); + cerr << "tag " << tag << ": " << val << "\n"; } } @@ -1016,39 +475,39 @@ LRXProcessor::process(FILE *input, FILE *output) { if(debugMode) { - fwprintf(stderr, L"outOfWord = false\n"); + cerr << "outOfWord = false\n"; } - wstring res = L""; + UString res; for(auto& s : alive_states) { - res = L""; + res.clear(); if(val < 0) { alphabet.getSymbol(res, val, false); if(debugMode) { - fwprintf(stderr, L" step: %S\n", res.c_str()); + cerr << " step: " << res << endl; } - s->step(val, alphabet(L"")); + s->step(val, alphabet(""_u)); } else { set alts; - alts.insert(alphabet(L"")); + alts.insert(alphabet(""_u)); if(iswupper(val)) { alts.insert(towlower(val)); - alts.insert(alphabet(L"")); + alts.insert(alphabet(""_u)); } else { - alts.insert(alphabet(L"")); + alts.insert(alphabet(""_u)); } if(debugMode) { - fwprintf(stderr, L" step: %C [alts: %d]\n", val, alts.size()); + cerr << " step: " << val << " [alts: " << alts.size() << "]\n"; } s->step(val, alts); } @@ -1057,26 +516,28 @@ LRXProcessor::process(FILE *input, FILE *output) } // We're still reading a surface form - if(val > 0 && val != L'$' && !isEscaped && !outOfWord) + if(val > 0 && val != '$' && !isEscaped && !outOfWord) { - sl[pos] = sl[pos] + static_cast(val); + sl[pos] += val; } // Reading a superblank if(outOfWord) { - if(!feof(input)) + if(!input.eof()) { - blanks[pos] = blanks[pos] + static_cast(val); + blanks[pos] += val; } + /* if(debugMode) { - //fwprintf(stderr, L"blanks[%d] = %S\n", pos, blanks[pos].c_str()); + cerr << "blanks[" << pos << "] = " << blanks[pos] << endl; } + */ } // Increment the current line number (for rule tracing) - if(val == L'\n') + if(val == '\n') { lineno++; } @@ -1084,42 +545,42 @@ LRXProcessor::process(FILE *input, FILE *output) } processFlush(output, sl, tl, blanks, scores, operations); - fwprintf(output, L"%S", blanks[pos].c_str()); + write(blanks[pos], output); } void -LRXProcessor::processFlush(FILE *output, - map &sl, - map > &tl, - map &blanks, - map > &scores, - map > &operations) { - +LRXProcessor::processFlush(UFILE *output, + map &sl, + map > &tl, + map &blanks, + map > &scores, + map > &operations) { + struct ScoredMatch { OpType op; - wstring* ti; // matched target translation + UString* ti; // matched target translation double weight; }; unsigned int spos = 0; for(spos = 0; spos <= pos; spos++) { - if(sl[spos] == L"") + if(sl[spos].empty()) { continue; } - fwprintf(output, L"%S^%S/", blanks[spos].c_str(), sl[spos].c_str()); + u_fprintf(output, "%S^%S/", blanks[spos].c_str(), sl[spos].c_str()); - vector::iterator ti; + vector::iterator ti; auto penum = tl[spos].end(); penum--; if(tl[spos].size() > 1) { //-- - set ti_keep; - set ti_removed; + set ti_keep; + set ti_removed; vector spos_matches; for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) { @@ -1128,9 +589,13 @@ LRXProcessor::processFlush(FILE *output, bool matched = recognisePattern(*ti, si.first); OpType op = operations[spos][si.first]; if (debugMode) { - wstring checks = matched ? L"✔️ " : L"❎"; - fwprintf(stderr, L"%S >>> %d -> %S -> %.5f\n", checks.c_str(), spos, - si.first.c_str(), si.second); + if (matched) { + cerr << "✔️ "; + } else { + cerr << "❎"; + } + cerr << " >>> " << spos << " -> "; + cerr << si.first << " -> " << si.second << endl; } if(matched) { spos_matches.push_back({ op, &*ti, si.second }); @@ -1144,15 +609,10 @@ LRXProcessor::processFlush(FILE *output, [](const auto &a, const auto &b) { return a.weight > b.weight; }); for (const auto &m : spos_matches) { if (traceMode || debugMode) { - wstring op = (m.op == Select ? L"SELECT" : L"REMOVE"); - fwprintf( - stderr, L"%d:%S:%.5f:%S:%d:%S\n", - lineno, - op.c_str(), - m.weight, - sl[spos].c_str(), - ti_keep.size(), - m.ti->c_str()); + std::string op = (m.op == Select ? "SELECT" : "REMOVE"); + cerr << lineno << ":" << op << ":" << m.weight; + cerr << ":" << sl[spos] << ":" << ti_keep.size(); + cerr << ":" << m.ti << endl; } // We have to keep track of translations that have been removed so // that we don't end up adding back a translation that was removed. @@ -1168,9 +628,9 @@ LRXProcessor::processFlush(FILE *output, bool printed = false; for(const auto& ti_max : ti_keep) { if(printed) { - fwprintf(output, L"/"); + u_fprintf(output, "/"); } - fwprintf(output, L"%S", ti_max->c_str()); + u_fprintf(output, "%S", ti_max->c_str()); printed = true; } } @@ -1178,10 +638,10 @@ LRXProcessor::processFlush(FILE *output, { for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) { - fwprintf(output, L"%S", ti->c_str()); + u_fprintf(output, "%S", ti->c_str()); if(ti != penum) { - fwprintf(output, L"/"); + u_fprintf(output, "/"); } } } @@ -1190,18 +650,18 @@ LRXProcessor::processFlush(FILE *output, { for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) { - fwprintf(output, L"%S", ti->c_str()); + u_fprintf(output, "%S", ti->c_str()); if(ti != penum) { - fwprintf(output, L"/"); + u_fprintf(output, "/"); } } } - fwprintf(output, L"$"); + u_fprintf(output, "$"); if(debugMode) { - fwprintf(output, L"%d", spos); + u_fprintf(output, "%d", spos); } diff --git a/src/lrx_processor.h b/src/lrx_processor.h index 26973aa..cad60f9 100644 --- a/src/lrx_processor.h +++ b/src/lrx_processor.h @@ -34,7 +34,6 @@ #include -#include #include #include #include @@ -46,46 +45,23 @@ #include #include #include +#include using namespace std; -/* -class BiltransToken { -public: - bool isEOF = false; - wstring source; - wstring blanks; - vector target; - - wstring toString(bool delim) { - wstring out = source; - for(int i = 0; i < target.size(); i++) { - out += L'/' + target[i]; - } - if (delim && (source.size() > 0 || target.size() > 0)) { - out = blanks + L'^' + out + L'$'; - } else { - out = blanks + out; - } - return out; - } -}; -*/ class LRXProcessor { private: Alphabet alphabet; TransExe transducer; - map recognisers; - map weights; - -// map bts; + map recognisers; + map weights; vector alive_states; map anfinals; - set escaped_chars; + set escaped_chars; State *initial_state; bool traceMode; @@ -96,39 +72,27 @@ private: unsigned int pos; unsigned long lineno; - wstring itow(int i); - bool recognisePattern(const wstring lu, const wstring op); - wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); - -// BiltransToken readBiltransToken(FILE *input = stdin); + UString itow(int i); + bool recognisePattern(const UString lu, const UString op); + UString readFullBlock(InputFile& input, UChar32 const delim1, UChar32 const delim2); void makeTransition(int); void filterFinals(); void evaluateRules(); -/* - void processFlush(FILE *output, - map &sl, - map > &tl, - map &blanks, - map > > &covers, - pair > &empty_seq, - map, vector > &spans, - int last_final); -*/ enum OpType { Select, Remove }; - void processFlush(FILE *output, - map &sl, - map > &tl, - map &blanks, - map > &scores, - map > &operations); + void processFlush(UFILE *output, + map &sl, + map > &tl, + map &blanks, + map > &scores, + map > &operations); public: - static wstring const LRX_PROCESSOR_TAG_SELECT; - static wstring const LRX_PROCESSOR_TAG_REMOVE; - static wstring const LRX_PROCESSOR_TAG_SKIP; + static UString const LRX_PROCESSOR_TAG_SELECT; + static UString const LRX_PROCESSOR_TAG_REMOVE; + static UString const LRX_PROCESSOR_TAG_SKIP; LRXProcessor(); ~LRXProcessor(); @@ -139,9 +103,7 @@ public: void init(); void load(FILE *input); - void process(FILE *input, FILE *output); -// void processME(FILE *input, FILE *output); - + void process(InputFile& input, UFILE *output); }; #endif /* __LRX_PROCESSOR_H__ */ diff --git a/src/multi_translator.cc b/src/multi_translator.cc index 7e2ad1e..4f2e7c3 100644 --- a/src/multi_translator.cc +++ b/src/multi_translator.cc @@ -30,10 +30,10 @@ int MultiTranslator::calculateFertility(vector sent) { } -BiltransToken MultiTranslator::parseBiltransToken(wstring bt) { +BiltransToken MultiTranslator::parseBiltransToken(UString bt) { BiltransToken token; - vector tokens = wsplit(bt, L'/'); + vector tokens = wsplit(bt, '/'); token.sourceToken = parseTaggerToken(tokens[0]); @@ -49,9 +49,9 @@ bool MultiTranslator::isPosAmbig(BiltransToken bt) { bool isPos; if (bt.sourceToken.tags.size() > 0) { isPos = - bt.sourceToken.tags[0] == L"n" || - bt.sourceToken.tags[0] == L"vblex" || - bt.sourceToken.tags[0] == L"adj"; + bt.sourceToken.tags[0] == "n"_u || + bt.sourceToken.tags[0] == "vblex"_u || + bt.sourceToken.tags[0] == "adj"_u; } else { isPos = false; } @@ -60,10 +60,10 @@ bool MultiTranslator::isPosAmbig(BiltransToken bt) { } -BiltransToken MultiTranslator::getFullToken(wstring source) { +BiltransToken MultiTranslator::getFullToken(UString source) { BiltransToken token; - if (source[0] == L'*') { + if (source[0] == '*') { token.sourceToken.lemma = source; TaggerToken tmp; tmp.lemma = source; @@ -71,21 +71,22 @@ BiltransToken MultiTranslator::getFullToken(wstring source) { return token; } - wstring target = bilingual.biltrans(source, false); - if (target == L"") { - target = L"@" + source; + UString target = bilingual.biltrans(source, false); + if (target.empty()) { + target += '@'; + target.append(source); } - token = parseBiltransToken(source + L"/" + target); + token = parseBiltransToken(source + "/"_u + target); return token; } -BiltransToken MultiTranslator::getTrimmedToken(wstring source) +BiltransToken MultiTranslator::getTrimmedToken(UString source) { BiltransToken ttoken; BiltransToken ftoken; - if (source[0] == L'*') { + if (source[0] == '*') { ttoken.sourceToken.lemma = source; TaggerToken tmp; tmp.lemma = source; @@ -99,8 +100,8 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source) // the bilingual.* methods in FSTProcessor. Unknown why we get the // leaks in the first place... - wstring fstr = L""; - wstring tstr = L""; + UString fstr; + UString tstr; if((f_cache.find(source) == f_cache.end())) { @@ -116,37 +117,39 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source) /*---------------------------------------------*/ - if (fstr == L"") { - fstr = L"@" + source; - } - if (tstr == L"") { - tstr = L"@" + source; - } + if (fstr.empty()) { + fstr += '@'; + fstr.append(source); + } + if (tstr.empty()) { + tstr += '@'; + tstr.append(source); + } - ttoken = parseBiltransToken(source + L"/" + tstr); - ftoken = parseBiltransToken(source + L"/" + fstr); + ttoken = parseBiltransToken(source + "/"_u + tstr); + ftoken = parseBiltransToken(source + "/"_u + fstr); if(this->trimmed) { for(size_t i = 0; i < ftoken.targetTokens.size(); ++i ) { if(ttoken.targetTokens[i].tags.size() < ftoken.targetTokens[i].tags.size()) { - ttoken.targetTokens[i].tags.push_back(L"*"); + ttoken.targetTokens[i].tags.push_back("*"_u); } } } - vector newTags; + vector newTags; //bool sourceTrimmed = false; for(size_t i = 0; i < ttoken.sourceToken.tags.size(); ++i) { - wstring tag = ttoken.sourceToken.tags[i]; + UString tag = ttoken.sourceToken.tags[i]; if (find(ttoken.targetTokens[0].tags, tag) == find(ftoken.targetTokens[0].tags, tag)) { newTags.push_back(tag); } } if(ttoken.sourceToken.tags.size() > newTags.size()) { - newTags.push_back(L"*"); + newTags.push_back("*"_u); } ttoken.sourceToken.tags = newTags; @@ -154,50 +157,50 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source) } void MultiTranslator::biltransToMultiTranslator(int sn, int &tn, unsigned int idx, - vector s, wstring buffer) + vector s, UString buffer) { if (idx == s.size() ) { - wcout << L".[][" << sn << L" " << tn << L"].[]\t" << buffer << endl; + cout << ".[][" << sn << " " << tn << "].[]\t" << buffer << endl; tn += 1; return; } auto n = s[idx].targetTokens.size(); - wstring base; - base = s[idx].sourceToken.toString(false) + L"/"; + UString base; + base = s[idx].sourceToken.toString(false) + "/"_u; for(size_t i = 0; i < n; ++i) { - wstring token = L"^" + base + s[idx].targetTokens[i].toString(false) + L"$"; + UString token = "^"_u + base + s[idx].targetTokens[i].toString(false) + "$"_u; if(idx != s.size() - 1) { - token += L" "; + token += ' '; } biltransToMultiTranslator(sn, tn, idx+1, s, buffer + token); } } void MultiTranslator::printBiltransSentence(int n, vector s) { if (number_lines) { - wcout << n << "\t"; + cout << n << "\t"; } for(size_t i = 0; i < s.size(); ++i) { - wcout << s[i].toString(true); + cout << s[i].toString(true); if (i != s.size() - 1) { - wcout << L" "; + cout << " "; } } - wcout << endl; + cout << endl; } void MultiTranslator::printTaggerOutput(int n, vector sentence) { if (number_lines) { - wcout << n << "\t"; + cout << n << "\t"; } for(size_t i = 0; i < sentence.size(); ++i) { - wcout << sentence[i].sourceToken.toString(true); + cout << sentence[i].sourceToken.toString(true); if (i != sentence.size() -1) { - wcout << L" "; + cout << " "; } } - wcout << endl; + cout << endl; } void MultiTranslator::processSentence(vector sentence) { @@ -207,8 +210,8 @@ void MultiTranslator::processSentence(vector sentence) { int numberOfUnknown = 0; int fertility = 1; for(size_t i = 0; i < sentence.size(); ++i) { - wstring token = sentence[i].toString(false); - wstring target; + UString token = sentence[i].toString(false); + UString target; BiltransToken bt; if(this->trimmed){ @@ -220,7 +223,7 @@ void MultiTranslator::processSentence(vector sentence) { if (isPosAmbig(bt)) { hasAmbigPos = true; } - if(token[0] == L'*') { + if(token[0] == '*') { numberOfUnknown ++; } fertility *= bt.targetTokens.size(); @@ -240,7 +243,7 @@ void MultiTranslator::processSentence(vector sentence) { } else if(mode == "-b") { printBiltransSentence(this->sn, outputSentence); } else if (mode == "-m") { - wstring outBuffer = L""; + UString outBuffer; int tn = 0; biltransToMultiTranslator(this->sn, tn, 0, outputSentence, outBuffer); } diff --git a/src/multi_translator.h b/src/multi_translator.h index d4d69cd..b2574e5 100644 --- a/src/multi_translator.h +++ b/src/multi_translator.h @@ -7,33 +7,38 @@ class BiltransToken { public: - TaggerToken sourceToken; - vector targetTokens; - wstring blanks; - - bool isEOF; - - BiltransToken() { - isEOF = false; - } - - wstring toString(bool delimiter) { - wstring out = sourceToken.toString(false); - for(unsigned int i = 0; i < targetTokens.size(); i++) { - out += L'/' + targetTokens[i].toString(false); - } - if (delimiter) { - out = L"^" + out + L"$"; - } - return out; - } + TaggerToken sourceToken; + vector targetTokens; + UString blanks; + + bool isEOF; + + BiltransToken() { + isEOF = false; + } + + UString toString(bool delimiter) { + UString out; + if (delimiter) { + out += '^'; + } + out.append(sourceToken.toString(false)); + for (auto& tok : targetTokens) { + out += '/'; + out.append(tok.toString(false)); + } + if (delimiter) { + out += '$'; + } + return out; + } }; class MultiTranslator : public TaggerOutputProcessor { private: FSTProcessor bilingual; - map f_cache; - map t_cache; + map f_cache; + map t_cache; string path; bool trimmed; @@ -44,10 +49,10 @@ private: bool isPosAmbig(BiltransToken token); - BiltransToken getTrimmedToken(wstring str); - BiltransToken getFullToken(wstring str); + BiltransToken getTrimmedToken(UString str); + BiltransToken getFullToken(UString str); - BiltransToken parseBiltransToken(wstring bt); + BiltransToken parseBiltransToken(UString bt); void processSentence(vector s); @@ -56,7 +61,7 @@ private: void printTaggerOutput(int i, vector s); void biltransToMultiTranslator(int sn, int &tn, unsigned int idx, - vector s, wstring buffer); + vector s, UString buffer); diff --git a/src/tagger_output_processor.cc b/src/tagger_output_processor.cc index 63b07f8..52d5c7c 100644 --- a/src/tagger_output_processor.cc +++ b/src/tagger_output_processor.cc @@ -9,7 +9,7 @@ TaggerOutputProcessor::~TaggerOutputProcessor() { } -int TaggerOutputProcessor::find(vector xs, wstring x) { +int TaggerOutputProcessor::find(vector xs, UString x) { for (size_t i = 0; i < xs.size(); ++i) { if (xs[i] == x) return i; @@ -17,10 +17,10 @@ int TaggerOutputProcessor::find(vector xs, wstring x) { return -1; } -TaggerToken TaggerOutputProcessor::parseTaggerToken(wstring str) { +TaggerToken TaggerOutputProcessor::parseTaggerToken(UString str) { TaggerToken token; int state = 0; // lemma; - wstring buffer; + UString buffer; for (auto& c : str) { if(c == L'<' && state == 0) { state = 1; @@ -41,10 +41,10 @@ TaggerToken TaggerOutputProcessor::parseTaggerToken(wstring str) { return token; } -vector TaggerOutputProcessor::parseTags(wstring token) { +vector TaggerOutputProcessor::parseTags(UString token) { int state = 0; // outside - vector tags; - wstring buffer; + vector tags; + UString buffer; for (auto& c : token) { if (state == 0) { if (c == '<') { @@ -53,7 +53,7 @@ vector TaggerOutputProcessor::parseTags(wstring token) { } else if (state == 1) { if (c == '>') { tags.push_back(buffer); - buffer = L""; + buffer.clear(); state = 0; } else { buffer += c; @@ -63,26 +63,26 @@ vector TaggerOutputProcessor::parseTags(wstring token) { return tags; } -vector TaggerOutputProcessor::wsplit(wstring wstr, wchar_t delim) { - vector tokens; - wstring buffer; +vector TaggerOutputProcessor::wsplit(UString wstr, wchar_t delim) { + vector tokens; + UString buffer; for(size_t i = 0; i < wstr.size(); ++i) { if(wstr[i] == delim && (i == 0 || wstr[i-1] != L'\\')) { tokens.push_back(buffer); - buffer = L""; + buffer.clear(); } else { buffer += wstr[i]; } } - if(buffer != L"") { + if(!buffer.empty()) { tokens.push_back(buffer); } return tokens; } -wstring TaggerOutputProcessor::getLemma(wstring token) { - wstring buffer; +UString TaggerOutputProcessor::getLemma(UString token) { + UString buffer; for (auto& c : token) { if(c != '<') { buffer += c; @@ -94,7 +94,7 @@ wstring TaggerOutputProcessor::getLemma(wstring token) { } void TaggerOutputProcessor::processTaggerOutput(bool nullFlush) { - wstring buffer; + UString buffer; vector sentence; bool escaped = false; int state = 0; // outside @@ -126,7 +126,7 @@ void TaggerOutputProcessor::processTaggerOutput(bool nullFlush) { } else if (state == 1) { if(c == L'$' && !escaped) { sentence.push_back(parseTaggerToken(buffer)); - buffer = L""; + buffer.clear(); state = 0; } else if (c == '\\' && !escaped) { escaped = true; diff --git a/src/tagger_output_processor.h b/src/tagger_output_processor.h index 40c00ad..cdcba9d 100644 --- a/src/tagger_output_processor.h +++ b/src/tagger_output_processor.h @@ -18,30 +18,36 @@ using namespace std; class TaggerToken { public: - wstring lemma; - vector tags; - wstring toString(bool delimiters) { - wstring out = lemma; - for (auto& tag : tags) { - out += L"<" + tag + L">"; - } - if (delimiters) { - out = L"^" + out + L"$"; - } - return out; - } + UString lemma; + vector tags; + UString toString(bool delimiters) { + UString out; + if (delimiters) { + out += '^'; + } + out.append(lemma); + for (auto& tag : tags) { + out += '<'; + out.append(tag); + out += '>'; + } + if (delimiters) { + out += '$'; + } + return out; + } }; class TaggerOutputProcessor { protected: int sn; - vector parseTags(wstring token); - vector wsplit(wstring wstr, wchar_t delim); - TaggerToken parseTaggerToken(wstring buffer); + vector parseTags(UString token); + vector wsplit(UString wstr, wchar_t delim); + TaggerToken parseTaggerToken(UString buffer); - int find(vector xs, wstring x); - wstring getLemma(wstring token); + int find(vector xs, UString x); + UString getLemma(UString token); virtual void processSentence(vector) =0; public: