commit 6ed619ecb9637b83e98f433f6cb55ef75ef4919c Author: Daniel Swanson Date: Wed Jun 30 08:53:51 2021 -0500 use ICU (#71) ICU changes - convert all `std::wstring`s and related types to `UString` - use `lttoolbox/input_file.h` for reading UTF-8 with nulls - use `UFILE*` for writing output efficiency, readability, and code style changes - copy `.editorconfig` file from lttoolbox - move locale setting from constructor to CLI interface - move constant initializers to class headers - store values of special transducer symbols rather than repeatedly looking them up - prefer `str.empty()` to `str == ""` - remove unused `#include`s - delete long section of commented out code in `lrx_processor.cc` helper function and dependency changes - all needed helper functions have moved to lttoolbox, so drop apertium dependency - rely on `StringUtils` for converting strings to numbers - add `debug` and `error` `printf`-like functions in `lrx_compiler` - use `XMLParseUtil` specialized functions diff --git a/.editorconfig b/.editorconfig new file mode 100755 index 0000000..dd10a25 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,18 @@ +# https://editorconfig.org/ +root = yes + +[*] +charset = utf-8 +end_of_line = lf +indent_size = 4 +indent_style = tab +insert_final_newline = true +trim_trailing_whitespace = true + +[**.cc] +indent_size = 2 +indent_style = space + +[**.h] +indent_size = 2 +indent_style = space diff --git a/.gitignore b/.gitignore index 5010f84..0104dde 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,8 @@ src/lrx-proc multitrans stamp-h1 +/python/apertium_lex_tools.py +/python/apertium_lex_tools_wrap.cpp /python/lex_tools_wrap.cpp /python/lextools.py /python/setup.py diff --git a/configure.ac b/configure.ac index 735e785..8004204 100644 --- a/configure.ac +++ b/configure.ac @@ -1,10 +1,9 @@ AC_PREREQ(2.61) m4_define([required_libxml_version], [2.6.17]) -m4_define([required_apertium_version], [3.7.1]) -m4_define([required_lttoolbox_version], [3.5.3]) +m4_define([required_lttoolbox_version], [3.6.0]) -AC_INIT([apertium-lex-tools], [0.2.7], [apertium-stuff@lists.sourceforge.net]) +AC_INIT([apertium-lex-tools], [0.3.0], [apertium-stuff@lists.sourceforge.net]) AM_INIT_AUTOMAKE AC_CONFIG_MACRO_DIR([m4]) @@ -48,25 +47,27 @@ PKG_CHECK_MODULES([LTTOOLBOX], [lttoolbox >= required_lttoolbox_version]) AC_SUBST(LTTOOLBOX_CFLAGS) AC_SUBST(LTTOOLBOX_LIBS) -PKG_CHECK_MODULES([APERTIUM], [apertium >= required_apertium_version]) - -AC_SUBST(APERTIUM_CFLAGS) -AC_SUBST(APERTIUM_LIBS) - PKG_CHECK_MODULES([LIBXML], [libxml-2.0 >= required_libxml_version]) AC_SUBST(LIBXML_CFLAGS) AC_SUBST(LIBXML_LIBS) +PKG_CHECK_MODULES([ICU], [icu-i18n, icu-io, icu-uc]) + +AC_SUBST(ICU_CFLAGS) +AC_SUBST(ICU_LIBS) + # Checks for libraries. AC_CHECK_LIB(xml2, xmlReaderForFile) +AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) + AC_CHECK_FUNCS([setlocale strdup]) -AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked]) +AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked]) -CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS" -LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS -lz" +CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS" +LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS $ICU_LIBS -lz" # Checks for highest supported C++ standard AC_LANG(C++) diff --git a/python/apertium_lex_tools.i b/python/apertium_lex_tools.i index 051346f..9304bda 100644 --- a/python/apertium_lex_tools.i +++ b/python/apertium_lex_tools.i @@ -51,8 +51,9 @@ public: void lrx_proc(int argc, char **argv, char *input_path, char *output_path) { - FILE* input = fopen(input_path, "rb"); - FILE* output = fopen(output_path, "wb"); + InputFile input; + input.open(input_path); + UFILE* output = u_fopen(output_path, "w", NULL, NULL); optind = 1; while(true) { @@ -83,8 +84,7 @@ public: } } process(input, output); - fclose(input); - fclose(output); + u_fclose(output); } }; diff --git a/python/setup.py.in b/python/setup.py.in index 9da20b3..85973a7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -34,7 +34,7 @@ def get_include_dirs(): apertium_lex_tools_module = Extension( name='_apertium_lex_tools', sources=get_sources(), - swig_opts=['-c++', '-I../src', '-Wall']+'@LTTOOLBOX_CFLAGS@'.split()+'@LIBXML_CFLAGS@'.split(), + swig_opts=['-c++', '-I../src', '-Wall']+'@LTTOOLBOX_CFLAGS@'.split()+'@LIBXML_CFLAGS@'.split()+'@ICU_CFLAGS@'.split(), include_dirs=get_include_dirs(), library_dirs=['/usr/include/libxml2', '/usr/local/lib'], extra_compile_args='@CXXFLAGS@'.split(), diff --git a/src/biltrans-without-queue.cpp b/src/biltrans-without-queue.cpp index 9dc5d55..d394a8c 100644 --- a/src/biltrans-without-queue.cpp +++ b/src/biltrans-without-queue.cpp @@ -3,8 +3,8 @@ int main(int argc, char** argv) { if (argc != 2 && argc != 3) { - wcout << "Usage: " << argv[0]; - wcout << " [--trimmed | -t]" << endl; + cout << "Usage: " << argv[0]; + cout << " [--trimmed | -t]" << endl; exit(1); } string path(argv[1]); diff --git a/src/irstlm_ranker.cpp b/src/irstlm_ranker.cpp index b50c31b..9a047a2 100644 --- a/src/irstlm_ranker.cpp +++ b/src/irstlm_ranker.cpp @@ -19,7 +19,6 @@ IrstlmRanker::IrstlmRanker(const string &filePath, exit(-1); } cout.precision(10); - wcout.precision(10); lineno = 0; sublineno = 0; @@ -387,7 +386,7 @@ int main(int argc, char ** argv) { // I don't know :) if(setlocale(LC_CTYPE, "") == NULL) { - wcerr << L"Warning: unsupported locale, fallback to \"C\"" << endl; + cerr << "Warning: unsupported locale, fallback to \"C\"" << endl; setlocale(LC_ALL, "C"); } @@ -410,4 +409,3 @@ int main(int argc, char ** argv) { return 0; } - diff --git a/src/ldx_proc.cc b/src/ldx_proc.cc index 9525bb5..b3fbd01 100644 --- a/src/ldx_proc.cc +++ b/src/ldx_proc.cc @@ -25,114 +25,53 @@ #include #include -#include #include +#include +#include +#include using namespace std; -int readGeneration(FILE *input, FILE *output); -void skipUntil(FILE *input, FILE *output, wint_t const character); -wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); -wchar_t readEscaped(FILE *input); -void streamError(); +int32_t readGeneration(InputFile& input, UFILE *output); +void skipUntil(InputFile& input, UFILE *output, UChar32 const character); FSTProcessor fstp; bool outOfWord = true; -set escaped_chars; +set escaped_chars; void -streamError() -{ - throw Exception("Error: Malformed input stream."); -} - -wchar_t -readEscaped(FILE *input) -{ - if(feof(input)) - { - streamError(); - } - - wchar_t val = static_cast(fgetwc_unlocked(input)); - - if(feof(input) || escaped_chars.find(val) == escaped_chars.end()) - { - streamError(); - } - - return val; -} - - -wstring -readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) -{ - wstring result = L""; - result += delim1; - wchar_t c = delim1; - - while(!feof(input) && c != delim2) - { - c = static_cast(fgetwc_unlocked(input)); - result += c; - if(c != L'\\') - { - continue; - } - else - { - result += static_cast(readEscaped(input)); - } - } - - if(c != delim2) - { - streamError(); - } - - return result; -} - - -void -skipUntil(FILE *input, FILE *output, wint_t const character) +skipUntil(InputFile& input, UFILE* output, UChar32 const character) { while(true) { - wint_t val = fgetwc_unlocked(input); - if(feof(input)) - { + UChar32 val = input.get(); + if (input.eof()) { return; } switch(val) { - case L'\\': - val = fgetwc_unlocked(input); - if(feof(input)) - { + case '\\': + val = input.get(); + if (input.eof()) { return; } - fputwc_unlocked(L'\\', output); - fputwc_unlocked(val, output); + u_fputc('\\', ouput); + u_fputc(val, output); break; - case L'\0': - fputwc_unlocked(val, output); + case '\0': + u_fputc(val, output); break; default: - if(val == character) - { + if (val == character) { return; - } - else - { - fputwc_unlocked(val, output); + } else { + u_fputc(val, output); } break; } @@ -140,48 +79,47 @@ skipUntil(FILE *input, FILE *output, wint_t const character) } -int -readGeneration(FILE *input, FILE *output) +int32_t +readGeneration(InputFile& input, UFILE* output) { - wint_t val = fgetwc_unlocked(input); + UChar32 val = input.get(); - if(feof(input)) - { + if (input.eof()) { return 0x7fffffff; } if(outOfWord) { - if(val == L'^') + if(val == '^') { - val = fgetwc_unlocked(input); - if(feof(input)) + val = input.get(); + if(input.eof()) { return 0x7fffffff; } } - else if(val == L'\\') + else if(val == '\\') { - fputwc_unlocked(val, output); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, ouput); + val = input.get(); + if(input.eof()) { return 0x7fffffff; } - fputwc_unlocked(val,output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val,output); + skipUntil(input, output, '^'); + val = input.get(); + if(input.eof()) { return 0x7fffffff; } } else { - fputwc_unlocked(val, output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + skipUntil(input, output, '^'); + val = input.get(); + if(input.eof()) { return 0x7fffffff; } @@ -189,24 +127,24 @@ readGeneration(FILE *input, FILE *output) outOfWord = false; } - if(val == L'\\') + if(val == '\\') { - val = fgetwc_unlocked(input); - return static_cast(val); + val = input.get(); + return static_cast(val); } - else if(val == L'$') + else if(val == '$') { outOfWord = true; - return static_cast(L'$'); + return static_cast('$'); } - else if(val == L'[') + else if(val == '[') { - fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); + write(input.readBlock('[', ']'), output); return readGeneration(input, output); } else { - return static_cast(val); + return static_cast(val); } return 0x7fffffff; @@ -215,7 +153,8 @@ readGeneration(FILE *input, FILE *output) int main(int argc, char **argv) { - FILE *input = stdin, *output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); LtLocale::tryToSetLocale(); @@ -226,17 +165,17 @@ int main(int argc, char **argv) exit(-1); } - escaped_chars.insert(L'['); - escaped_chars.insert(L']'); - escaped_chars.insert(L'{'); - escaped_chars.insert(L'}'); - escaped_chars.insert(L'^'); - escaped_chars.insert(L'$'); - escaped_chars.insert(L'/'); - escaped_chars.insert(L'\\'); - escaped_chars.insert(L'@'); - escaped_chars.insert(L'<'); - escaped_chars.insert(L'>'); + escaped_chars.insert('['); + escaped_chars.insert(']'); + escaped_chars.insert('{'); + escaped_chars.insert('}'); + escaped_chars.insert('^'); + escaped_chars.insert('$'); + escaped_chars.insert('/'); + escaped_chars.insert('\\'); + escaped_chars.insert('@'); + escaped_chars.insert('<'); + escaped_chars.insert('>'); FILE *t_rl = fopen(argv[1], "rb"); @@ -253,25 +192,25 @@ int main(int argc, char **argv) // read until '/', then read each from '/' adding to a map, then look up first in transducer, and if the result // is found in the map, then output it, otherwise error. - int val = 0, i = 0; + int32_t val = 0, i = 0; bool seenFirst = false; - wstring sl = L""; - wstring tl = L""; - set tllu; - set tllu_defaults; + UString sl; + UString tl; + set tllu; + set tllu_defaults; - skipUntil(input, output, L'^'); + skipUntil(input, output, '^'); outOfWord = false; while((val = readGeneration(input, output)) != 0x7fffffff) { switch(val) { - case L'^': + case '^': outOfWord = false; - val = readGeneration(input, output); + val = readGeneration(input, output); break; - case L'/': + case '/': if(!seenFirst) { seenFirst = true; @@ -281,13 +220,13 @@ int main(int argc, char **argv) tllu.insert(tl); } i++; - tl = L""; - val = readGeneration(input, output); - if(val != L'$') + tl.clear(); + val = readGeneration(input, output); + if(val != '$') { break; } - case L'$': + case '$': outOfWord = true; if(!seenFirst) { @@ -299,23 +238,28 @@ int main(int argc, char **argv) } seenFirst = false; - fputws_unlocked(L"^", output); - fputws_unlocked(sl.c_str(), output); + u_fputc('^', output); + write(sl, output); if(tllu.size() > 1) { - tl = L""; - wstring in = L"^" + sl + L"$"; - wstring trad = fstp.biltrans(in); + tl.clear(); + UString in; + in += '^'; + in.append(sl); + in += '$'; + UString trad = fstp.biltrans(in); int j = 0; bool tlout = false; for(auto& it : tllu) { - wstring t = L"^" + it + L"$"; + UString t; + t += '^'; + t.append(it); + t += '$'; if(t == trad) { - fputws_unlocked(L"/", output); - wstring to = t.substr(1, wcslen(t.c_str())-2); - fputws_unlocked(to.c_str(), output); + u_fputc('/', output); + write(it, output); tlout = true; break; } @@ -328,36 +272,35 @@ int main(int argc, char **argv) { if(it != tllu.end()) { - fputws_unlocked(L"/", output); + u_fputc('/', output); } - fputws_unlocked(it->c_str(), output); + write(*it, output); } } } else { - fputws_unlocked(L"/", output); - fputws_unlocked(tl.c_str(), output); + u_fputc('/', output); + write(tl, output); } - fputws_unlocked(L"$", output); + u_fputc('$', output); - sl = L""; tl = L""; + sl.clear(); + tl.clear(); tllu.clear(); i = 0; break; } if(!seenFirst && !outOfWord) { - sl.append(1, static_cast(val)); + sl += static_cast(val); } else if(!outOfWord) { - tl.append(1, static_cast(val)); + tl += static_cast(val); } } return 0; } - - diff --git a/src/lrx_comp.cc b/src/lrx_comp.cc index c1d5b46..c9fe3bb 100644 --- a/src/lrx_comp.cc +++ b/src/lrx_comp.cc @@ -16,6 +16,9 @@ */ #include +#include +#include +#include using namespace std; @@ -31,6 +34,8 @@ void endProgram(char *name) int main (int argc, char **argv) { + LtLocale::tryToSetLocale(); + LRXCompiler compiler; if(argc != 3 && argc != 4) diff --git a/src/lrx_compiler.cc b/src/lrx_compiler.cc index 3fb4e6a..b2cf64c 100644 --- a/src/lrx_compiler.cc +++ b/src/lrx_compiler.cc @@ -15,100 +15,92 @@ * along with this program; if not, see . */ -#include #include -#include +#include +#include +#include +#include +#include using namespace std; -wstring const LRXCompiler::LRX_COMPILER_LRX_ELEM = L"lrx"; -wstring const LRXCompiler::LRX_COMPILER_DEFSEQS_ELEM = L"def-seqs"; -wstring const LRXCompiler::LRX_COMPILER_DEFSEQ_ELEM = L"def-seq"; -wstring const LRXCompiler::LRX_COMPILER_RULES_ELEM = L"rules"; -wstring const LRXCompiler::LRX_COMPILER_RULE_ELEM = L"rule"; -wstring const LRXCompiler::LRX_COMPILER_MATCH_ELEM = L"match"; -wstring const LRXCompiler::LRX_COMPILER_SELECT_ELEM = L"select"; -wstring const LRXCompiler::LRX_COMPILER_REMOVE_ELEM = L"remove"; -wstring const LRXCompiler::LRX_COMPILER_OR_ELEM = L"or"; -wstring const LRXCompiler::LRX_COMPILER_REPEAT_ELEM = L"repeat"; -wstring const LRXCompiler::LRX_COMPILER_SEQ_ELEM = L"seq"; - -wstring const LRXCompiler::LRX_COMPILER_LEMMA_ATTR = L"lemma"; -wstring const LRXCompiler::LRX_COMPILER_SUFFIX_ATTR = L"suffix"; -wstring const LRXCompiler::LRX_COMPILER_CONTAINS_ATTR = L"contains"; -wstring const LRXCompiler::LRX_COMPILER_CASE_ATTR = L"case"; -wstring const LRXCompiler::LRX_COMPILER_SURFACE_ATTR = L"surface"; -wstring const LRXCompiler::LRX_COMPILER_TAGS_ATTR = L"tags"; -wstring const LRXCompiler::LRX_COMPILER_WEIGHT_ATTR = L"weight"; -wstring const LRXCompiler::LRX_COMPILER_COMMENT_ATTR = L"c"; -wstring const LRXCompiler::LRX_COMPILER_NAME_ATTR = L"n"; -wstring const LRXCompiler::LRX_COMPILER_FROM_ATTR = L"from"; -wstring const LRXCompiler::LRX_COMPILER_UPTO_ATTR = L"upto"; - -wstring const LRXCompiler::LRX_COMPILER_TYPE_SELECT = L"select"; -wstring const LRXCompiler::LRX_COMPILER_TYPE_REMOVE = L"remove"; -wstring const LRXCompiler::LRX_COMPILER_TYPE_SKIP = L"skip"; +UString const LRXCompiler::LRX_COMPILER_LRX_ELEM = "lrx"_u; +UString const LRXCompiler::LRX_COMPILER_DEFSEQS_ELEM = "def-seqs"_u; +UString const LRXCompiler::LRX_COMPILER_DEFSEQ_ELEM = "def-seq"_u; +UString const LRXCompiler::LRX_COMPILER_RULES_ELEM = "rules"_u; +UString const LRXCompiler::LRX_COMPILER_RULE_ELEM = "rule"_u; +UString const LRXCompiler::LRX_COMPILER_MATCH_ELEM = "match"_u; +UString const LRXCompiler::LRX_COMPILER_SELECT_ELEM = "select"_u; +UString const LRXCompiler::LRX_COMPILER_REMOVE_ELEM = "remove"_u; +UString const LRXCompiler::LRX_COMPILER_OR_ELEM = "or"_u; +UString const LRXCompiler::LRX_COMPILER_REPEAT_ELEM = "repeat"_u; +UString const LRXCompiler::LRX_COMPILER_SEQ_ELEM = "seq"_u; + +UString const LRXCompiler::LRX_COMPILER_LEMMA_ATTR = "lemma"_u; +UString const LRXCompiler::LRX_COMPILER_SUFFIX_ATTR = "suffix"_u; +UString const LRXCompiler::LRX_COMPILER_CONTAINS_ATTR = "contains"_u; +UString const LRXCompiler::LRX_COMPILER_CASE_ATTR = "case"_u; +UString const LRXCompiler::LRX_COMPILER_SURFACE_ATTR = "surface"_u; +UString const LRXCompiler::LRX_COMPILER_TAGS_ATTR = "tags"_u; +UString const LRXCompiler::LRX_COMPILER_WEIGHT_ATTR = "weight"_u; +UString const LRXCompiler::LRX_COMPILER_COMMENT_ATTR = "c"_u; +UString const LRXCompiler::LRX_COMPILER_NAME_ATTR = "n"_u; +UString const LRXCompiler::LRX_COMPILER_FROM_ATTR = "from"_u; +UString const LRXCompiler::LRX_COMPILER_UPTO_ATTR = "upto"_u; + +UString const LRXCompiler::LRX_COMPILER_TYPE_SELECT = "select"_u; +UString const LRXCompiler::LRX_COMPILER_TYPE_REMOVE = "remove"_u; +UString const LRXCompiler::LRX_COMPILER_TYPE_SKIP = "skip"_u; double const LRXCompiler::LRX_COMPILER_DEFAULT_WEIGHT = 1.0; -wstring -LRXCompiler::itow(int i) -{ - // Convert an int to a wstring - wchar_t buf[50]; - memset(buf, '\0', sizeof(buf)); - swprintf(buf, 50, L"%d", i); - wstring id(buf); - return id; -} - -int -LRXCompiler::wtoi(wstring w) +void +LRXCompiler::debug(const char* fmt, ...) { - // Convert a wstring to an int - wistringstream wstrm(w); - int i_name = -numeric_limits::max(); - wstrm >> i_name; - - return i_name; + if (debugMode) { + va_list argptr; + va_start(argptr, fmt); + u_vfprintf(debug_output, fmt, argptr); + va_end(argptr); + } } -double -LRXCompiler::wtod(wstring w) +void +LRXCompiler::error(const char* fmt, ...) { - // Convert a wstring to a double - wistringstream wstrm(w); - double d_name = -numeric_limits::max(); - wstrm >> d_name; - - return d_name; + u_fprintf(debug_output, "Error (line %d): ", + xmlTextReaderGetParserLineNumber(reader)); + va_list argptr; + va_start(argptr, fmt); + u_vfprintf(debug_output, fmt, argptr); + va_end(argptr); + u_fputc('\n', debug_output); + exit(EXIT_FAILURE); } LRXCompiler::LRXCompiler() { - LtLocale::tryToSetLocale(); - - debugMode = false; - outputGraph = false; - - currentRuleId = 0; + debug_output = u_finit(stderr, NULL, NULL); initialState = transducer.getInitial(); currentState = initialState; lastState = initialState; - canSelect = true; - - alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_SELECT + L">"); - alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_REMOVE + L">"); - alphabet.includeSymbol(L"<"+ LRX_COMPILER_TYPE_SKIP + L">"); - - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L"<$>"); - + alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_SELECT + ">"_u); + alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_REMOVE + ">"_u); + alphabet.includeSymbol("<"_u+ LRX_COMPILER_TYPE_SKIP + ">"_u); + + alphabet.includeSymbol(""_u); + alphabet.includeSymbol(""_u); + alphabet.includeSymbol(""_u); + alphabet.includeSymbol(""_u); + alphabet.includeSymbol("<$>"_u); + + any_tag = alphabet(""_u); + any_char = alphabet(""_u); + any_upper = alphabet(""_u); + any_lower = alphabet(""_u); + word_boundary = alphabet(alphabet("<$>"_u), alphabet("<$>"_u)); } LRXCompiler::~LRXCompiler() @@ -129,64 +121,45 @@ LRXCompiler::setOutputGraph(bool o) } void -LRXCompiler::skipBlanks(wstring &name) +LRXCompiler::skipBlanks(UString &name) { - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { - if(name != L"#comment") + if(name != "#comment"_u) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; - exit(EXIT_FAILURE); + error("Invalid construction."); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } } -wstring -LRXCompiler::attrib(wstring const &name) +UString +LRXCompiler::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } -wstring -LRXCompiler::attrib(wstring const &name, const wstring fallback) +UString +LRXCompiler::attrib(UString const &name, const UString fallback) { - string mystr = ""; - for (int i = 0, limit = name.size(); i != limit; i++) { - mystr += static_cast(name[i]); - } - - xmlChar *attrname = xmlCharStrdup(mystr.c_str()); - xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); - wstring result = XMLParseUtil::towstring(myattr); - xmlFree(myattr); - xmlFree(attrname); - if(myattr == NULL) { - return fallback; - } - else { - return result; - } + return XMLParseUtil::attrib(reader, name, fallback); } bool LRXCompiler::allBlanks() { - bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - - for(unsigned int i = 0, limit = text.size(); i < limit; i++) - { - flag = flag && iswspace(text[i]); + UString text = XMLParseUtil::readValue(reader); + for (auto& c : text) { + if (!u_isspace(c)) { + return false; + } } - - return flag; + return true; } void @@ -210,7 +183,7 @@ LRXCompiler::parse(string const &fitxer) if(ret != 0) { - wcerr << L"Error: Parse error at the end of input." << endl; + cerr << "Error: Parse error at the end of input." << endl; } } @@ -218,14 +191,13 @@ LRXCompiler::parse(string const &fitxer) void LRXCompiler::procNode() { - xmlChar const *xnombre = xmlTextReaderConstName(reader); - wstring nombre = XMLParseUtil::towstring(xnombre); + UString nombre = XMLParseUtil::readName(reader); - if(nombre == L"#text") + if(nombre == "#text"_u) { /* ignorar */ } - else if(nombre== L"#comment") + else if(nombre== "#comment"_u) { /* ignorar */ } @@ -251,9 +223,7 @@ LRXCompiler::procNode() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << nombre << L">'." << endl; - exit(EXIT_FAILURE); + error("Invalid node '<%S>'.", nombre.c_str()); } return; @@ -262,10 +232,13 @@ LRXCompiler::procNode() void LRXCompiler::procRule() { - wstring comment = this->attrib(LRX_COMPILER_COMMENT_ATTR); - wstring xweight = this->attrib(LRX_COMPILER_WEIGHT_ATTR); - wstring nombre = this->attrib(LRX_COMPILER_NAME_ATTR); - double weight = wtod (xweight); + UString comment = this->attrib(LRX_COMPILER_COMMENT_ATTR); + UString xweight = this->attrib(LRX_COMPILER_WEIGHT_ATTR); + UString nombre = this->attrib(LRX_COMPILER_NAME_ATTR); + double weight = LRX_COMPILER_DEFAULT_WEIGHT; + if (!xweight.empty()) { + weight = StringUtils::stod(xweight); + } if(weight <= -numeric_limits::max()) { @@ -276,25 +249,19 @@ LRXCompiler::procRule() currentState = transducer.insertNewSingleTransduction(alphabet(0, 0), currentState); currentRuleId++; - wstring ruleId = L"<" + itow(currentRuleId) + L">"; + UString ruleId = "<"_u + StringUtils::itoa(currentRuleId) + ">"_u; weights[currentRuleId] = weight; - if(debugMode) - { - fwprintf(stderr, L" rule: %d, weight: %.2f \n", currentRuleId, weight); - } + debug(" rule: %d, weight: %.2f \n", currentRuleId, weight); while(true) { int ret = xmlTextReaderRead(reader); - if(ret != 1) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; - exit(EXIT_FAILURE); + if(ret != 1) { + error("Parse error."); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); if(name == LRX_COMPILER_MATCH_ELEM) @@ -316,7 +283,7 @@ LRXCompiler::procRule() } else if(name == LRX_COMPILER_RULE_ELEM) { - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState); + currentState = transducer.insertSingleTransduction(word_boundary, currentState); if(!alphabet.isSymbolDefined(ruleId.c_str())) { alphabet.includeSymbol(ruleId.c_str()); @@ -328,39 +295,26 @@ LRXCompiler::procRule() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_RULE_ELEM; - wcerr << L">'." << endl; - exit(EXIT_FAILURE); + error("Invalid inclusion of '<%S>' into ''.", name.c_str()); } } - - - return; } void LRXCompiler::procOr() { - - if(debugMode) - { - fwprintf(stderr, L" or: \n"); - } + debug(" or: \n"); int or_initial_state = currentState; vector reachedStates; while(true) { int ret = xmlTextReaderRead(reader); - if(ret != 1) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; - exit(EXIT_FAILURE); + if(ret != 1) { + error("Parse error."); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); if(name == LRX_COMPILER_MATCH_ELEM) @@ -392,10 +346,7 @@ LRXCompiler::procOr() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_OR_ELEM; - wcerr << L">'." << endl; - exit(EXIT_FAILURE); + error("Invalid inclusion of '<%S>' into ''.", name.c_str()); } } @@ -412,18 +363,15 @@ LRXCompiler::procDefSeq() int oldstate = currentState; currentState = initialState; lastState = initialState; - wstring seqname = this->attrib(LRX_COMPILER_NAME_ATTR); + UString seqname = this->attrib(LRX_COMPILER_NAME_ATTR); while(true) { int ret = xmlTextReaderRead(reader); - if(ret != 1) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; - exit(EXIT_FAILURE); + if(ret != 1) { + error("Parse error."); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); if(name == LRX_COMPILER_MATCH_ELEM) @@ -450,10 +398,7 @@ LRXCompiler::procDefSeq() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_REPEAT_ELEM; - wcerr << L">'." << endl; - exit(EXIT_FAILURE); + error("Invalid inclusion of '<%S>' into ''.", name.c_str()); } } sequences[seqname] = transducer; @@ -468,22 +413,19 @@ void LRXCompiler::procMatch() { // These are mutually exclusive - wstring lemma = this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*"); - wstring contains = this->attrib(LRX_COMPILER_SUFFIX_ATTR); - wstring suffix = this->attrib(LRX_COMPILER_CONTAINS_ATTR); - wstring _case = this->attrib(LRX_COMPILER_CASE_ATTR); // This could potentially be non-exclusive + UString lemma = this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u); + UString contains = this->attrib(LRX_COMPILER_SUFFIX_ATTR); + UString suffix = this->attrib(LRX_COMPILER_CONTAINS_ATTR); + UString _case = this->attrib(LRX_COMPILER_CASE_ATTR); // This could potentially be non-exclusive // This is currently disabled: Future use - wstring surface = this->attrib(LRX_COMPILER_SURFACE_ATTR); + UString surface = this->attrib(LRX_COMPILER_SURFACE_ATTR); - wstring tags = this->attrib(LRX_COMPILER_TAGS_ATTR, L"*"); + UString tags = this->attrib(LRX_COMPILER_TAGS_ATTR, "*"_u); - if(surface != L"") + if(!surface.empty()) { - if(debugMode) - { - fwprintf(stderr, L" match: %S\n", surface.c_str()); - } + debug(" match: %S\n", surface.c_str()); for(auto& it : surface) { @@ -492,70 +434,64 @@ LRXCompiler::procMatch() } else { - if(debugMode) - { - fwprintf(stderr, L" match: [%S, %S, %S, %S] %S\n", lemma.c_str(), suffix.c_str(), contains.c_str(), _case.c_str(), tags.c_str()); - } + debug(" match: [%S, %S, %S, %S] %S\n", lemma.c_str(), suffix.c_str(), contains.c_str(), _case.c_str(), tags.c_str()); - if(_case != L"") + if(_case != ""_u) { - if(_case == L"AA") // + + if(_case == "AA"_u) // + { int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(any_upper, 0), currentState); transducer.linkStates(currentState, localLast, 0); } - else if(_case == L"aa") // + + else if(_case == "aa"_u) // + { int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(any_lower, 0), currentState); transducer.linkStates(currentState, localLast, 0); } - else if(_case == L"Aa") // + + + else if(_case == "Aa"_u) // + + { - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(any_upper, 0), currentState); int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(any_lower, 0), currentState); transducer.linkStates(currentState, localLast, 0); } } - if(lemma == L"*" && suffix == L"" && contains == L"" && _case == L"") + if(lemma == "*"_u && suffix.empty() && contains.empty() && _case.empty()) { // This is only if there is no suffix or case or contains - if(debugMode) - { - fwprintf(stderr, L" char: -\n"); - } + debug(" char: -\n"); int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(any_char, 0), currentState); transducer.linkStates(currentState, localLast, 0); } - else if(suffix != L"") + else if(suffix != ""_u) { // A suffix is any amount of times followed by whatever is in the suffix int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(any_char, 0), currentState); transducer.linkStates(currentState, localLast, 0); for(auto& it : suffix) { currentState = transducer.insertSingleTransduction(alphabet(it, 0), currentState); } } - else if(contains != L"") + else if(!contains.empty()) { // A contains is any amount of times followed by whatever is in the attribute // followed by any amount of times int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(any_char, 0), currentState); transducer.linkStates(currentState, localLast, 0); for(auto& it : suffix) { currentState = transducer.insertSingleTransduction(alphabet(it, 0), currentState); } - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(any_char, 0), currentState); transducer.linkStates(currentState, localLast, 0); } - else if(lemma != L"*") + else if(lemma != "*"_u) { for(auto& it : lemma) { @@ -564,66 +500,57 @@ LRXCompiler::procMatch() } else { - fwprintf(stderr, L"Something surprising happened in compilation\n"); + cerr << "Something surprising happened in compilation\n"; } - wstring tag = L""; + UString tag; for(auto& it : tags) { - if(it == L'.') + if(it == '.') { - if(tag == L"") + if(tag.empty()) { continue; } - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { alphabet.includeSymbol(tag.c_str()); } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } - if(tag == L"<*>") + debug(" tag: %S\n", tag.c_str()); + if(tag == "<*>"_u) { int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(any_tag, 0), currentState); transducer.linkStates(currentState, localLast, 0); } else { currentState = transducer.insertSingleTransduction(alphabet(alphabet(tag.c_str()), 0), currentState); } - tag = L""; + tag = ""_u; continue; } tag = tag + it; } - if(tag == L"*") + if(tag == "*"_u) { - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } + debug(" tag: %S\n", tag.c_str()); int localLast = currentState; - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L""), 0), currentState); + currentState = transducer.insertSingleTransduction(alphabet(any_tag, 0), currentState); transducer.linkStates(currentState, localLast, 0); } - else if(tag == L"") + else if(tag.empty()) { } else { - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { alphabet.includeSymbol(tag.c_str()); } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } + debug(" tag: %S\n", tag.c_str()); currentState = transducer.insertSingleTransduction(alphabet(alphabet(tag.c_str()), 0), currentState); } } @@ -631,42 +558,33 @@ LRXCompiler::procMatch() if(xmlTextReaderIsEmptyElement(reader)) { // If self-closing - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState); - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + currentState = transducer.insertSingleTransduction(word_boundary, currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(""_u)), currentState); return; } - wstring name = L""; + UString name = ""_u; while(true) { int ret = xmlTextReaderRead(reader); - if(ret != 1) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; - exit(EXIT_FAILURE); + if(ret != 1) { + error("Parse error."); } - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); skipBlanks(name); if(name == LRX_COMPILER_SELECT_ELEM) { - if(!canSelect) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): is not permitted inside ."); } procSelect(); } else if(name == LRX_COMPILER_REMOVE_ELEM) { - if(!canSelect) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): is not permitted inside ." << endl; - exit(EXIT_FAILURE); + if(!canSelect) { + error(" is not permitted inside ."); } procRemove(); } @@ -676,10 +594,7 @@ LRXCompiler::procMatch() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_MATCH_ELEM; - wcerr << L">'." << endl; - exit(EXIT_FAILURE); + error("Invalid inclusion of '<%S>' into ''."); } } @@ -691,11 +606,11 @@ void LRXCompiler::procSelect() { - wstring lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*"); - wstring tags =this->attrib(LRX_COMPILER_TAGS_ATTR); + UString lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u); + UString tags =this->attrib(LRX_COMPILER_TAGS_ATTR); - wstring key = L"<" + LRX_COMPILER_TYPE_SELECT + L">"; - if(lemma != L"*") + UString key = "<"_u + LRX_COMPILER_TYPE_SELECT + ">"_u; + if(lemma != "*"_u) { key += lemma; } @@ -703,22 +618,19 @@ LRXCompiler::procSelect() Transducer recogniser; int localCurrentState = recogniser.getInitial(); - if(debugMode) - { - fwprintf(stderr, L" select: %S, %S\n", lemma.c_str(), tags.c_str()); - } + debug(" select: %S, %S\n", lemma.c_str(), tags.c_str()); - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState); - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<" + LRX_COMPILER_TYPE_SELECT + L">")), currentState); + currentState = transducer.insertSingleTransduction(word_boundary, currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<"_u + LRX_COMPILER_TYPE_SELECT + ">"_u)), currentState); - if(lemma == L"*") + if(lemma == "*"_u) { - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, any_char), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(any_char ,0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { for (auto &it : lemma) { @@ -727,29 +639,24 @@ LRXCompiler::procSelect() } } - if(tags != L"") - { - wstring tag = L""; - for(auto& it : tags) - { - if(it == L'.') + if(!tags.empty()) { + UString tag; + for(auto& it : tags) { + if(it == '.') { - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { alphabet.includeSymbol(tag.c_str()); } - if(debugMode) + debug(" tag: %S\n", tag.c_str()); + if(tag == "<*>"_u) { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } - if(tag == L"<*>") - { - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag ,0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { @@ -757,34 +664,28 @@ LRXCompiler::procSelect() localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState); key = key + tag; } - tag = L""; + tag = ""_u; continue; } tag = tag + it; } - if(tag == L"*") + if(tag == "*"_u) { - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + debug(" tag: %S\n", tag.c_str()); + currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag ,0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { alphabet.includeSymbol(tag.c_str()); } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } + debug(" tag: %S\n", tag.c_str()); currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(tag.c_str())), currentState); localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState); key = key + tag; @@ -792,26 +693,20 @@ LRXCompiler::procSelect() } else { - if(debugMode) - { - fwprintf(stderr, L" tag: -\n"); - } - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + debug(" tag: -\n"); + currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag ,0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } recogniser.setFinal(localCurrentState); recognisers[key] = recogniser; - if(debugMode) - { - fwprintf(stderr, L" select: %d\n", recognisers[key].size()); - } - //currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState); + debug(" select: %d\n", recognisers[key].size()); + //currentState = transducer.insertSingleTransduction(word_boundary, currentState); return; } @@ -820,11 +715,11 @@ void LRXCompiler::procRemove() { - wstring lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, L"*"); - wstring tags =this->attrib(LRX_COMPILER_TAGS_ATTR); + UString lemma =this->attrib(LRX_COMPILER_LEMMA_ATTR, "*"_u); + UString tags =this->attrib(LRX_COMPILER_TAGS_ATTR); - wstring key = L"<" + LRX_COMPILER_TYPE_REMOVE + L">"; - if(lemma != L"*") + UString key = "<"_u + LRX_COMPILER_TYPE_REMOVE + ">"_u; + if(lemma != "*"_u) { key += lemma; } @@ -832,21 +727,18 @@ LRXCompiler::procRemove() Transducer recogniser; int localCurrentState = recogniser.getInitial(); - if(debugMode) - { - fwprintf(stderr, L" remove: %S, %S\n", lemma.c_str(), tags.c_str()); - } + debug(" remove: %S, %S\n", lemma.c_str(), tags.c_str()); - currentState = transducer.insertSingleTransduction(alphabet(alphabet(L"<$>"), alphabet(L"<$>")), currentState); - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"<" + LRX_COMPILER_TYPE_REMOVE + L">")), currentState); + currentState = transducer.insertSingleTransduction(word_boundary, currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, alphabet("<"_u + LRX_COMPILER_TYPE_REMOVE + ">"_u)), currentState); - if(lemma == L"*") + if(lemma == "*"_u) { - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, any_char), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(any_char ,0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { @@ -857,29 +749,26 @@ LRXCompiler::procRemove() } } - if(tags != L"") + if(tags != ""_u) { - wstring tag = L""; + UString tag = ""_u; for(auto& it : tags) { - if(it == L'.') + if(it == '.') { - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { alphabet.includeSymbol(tag.c_str()); } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } - if(tag == L"<*>") + debug(" tag: %S\n", tag.c_str()); + if(tag == "<*>"_u) { - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag, 0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { @@ -887,34 +776,28 @@ LRXCompiler::procRemove() localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState); key = key + tag; } - tag = L""; + tag = ""_u; continue; } tag = tag + it; } - if(tag == L"*") + if(tag == "*"_u) { - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); - } - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + debug(" tag: %S\n", tag.c_str()); + currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag, 0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } else { - tag = L"<" + tag + L">"; + tag = "<"_u + tag + ">"_u; if(!alphabet.isSymbolDefined(tag.c_str())) { - alphabet.includeSymbol(tag.c_str()); - } - if(debugMode) - { - fwprintf(stderr, L" tag: %S\n", tag.c_str()); + alphabet.includeSymbol(tag); } + debug(" tag: %S\n", tag.c_str()); currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(tag.c_str())), currentState); localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(tag.c_str()),0), localCurrentState); key = key + tag; @@ -922,25 +805,19 @@ LRXCompiler::procRemove() } else { - if(debugMode) - { - fwprintf(stderr, L" tag: -\n"); - } - currentState = transducer.insertSingleTransduction(alphabet(0, alphabet(L"")), currentState); + debug(" tag: -\n"); + currentState = transducer.insertSingleTransduction(alphabet(0, any_tag), currentState); int localLast = localCurrentState; - localCurrentState = recogniser.insertSingleTransduction(alphabet(alphabet(L""),0), localCurrentState); + localCurrentState = recogniser.insertSingleTransduction(alphabet(any_tag,0), localCurrentState); recogniser.linkStates(localCurrentState, localLast, 0); - key = key + L""; + key = key + ""_u; } recogniser.setFinal(localCurrentState); recognisers[key] = recogniser; - if(debugMode) - { - fwprintf(stderr, L" remove: %d\n", recognisers[key].size()); - } + debug(" remove: %d\n", recognisers[key].size()); return; } @@ -951,21 +828,14 @@ LRXCompiler::procRepeat() { bool couldSelect = canSelect; canSelect = false; - wstring xfrom = this->attrib(LRX_COMPILER_FROM_ATTR); - wstring xupto = this->attrib(LRX_COMPILER_UPTO_ATTR); - int from = stoi(xfrom); - int upto = stoi(xupto); - if(from < 0 || upto < 0) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Number of repetitions cannot be negative." << endl; - exit(EXIT_FAILURE); - } - else if(from > upto) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Lower bound on number of repetitions cannot be larger than upper bound." << endl; - exit(EXIT_FAILURE); + UString xfrom = this->attrib(LRX_COMPILER_FROM_ATTR); + UString xupto = this->attrib(LRX_COMPILER_UPTO_ATTR); + int from = StringUtils::stoi(xfrom); + int upto = StringUtils::stoi(xupto); + if(from < 0 || upto < 0) { + error("Number of repetitions cannot be negative."); + } else if(from > upto) { + error("Lower bound on number of repetitions cannot be larger than upper bound."); } int count = upto - from; int oldstate = currentState; @@ -976,14 +846,11 @@ LRXCompiler::procRepeat() while(true) { int ret = xmlTextReaderRead(reader); - if(ret != 1) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; - exit(EXIT_FAILURE); + if(ret != 1) { + error("Parse error."); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); if(name == LRX_COMPILER_MATCH_ELEM) @@ -1006,10 +873,7 @@ LRXCompiler::procRepeat() } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << LRX_COMPILER_REPEAT_ELEM; - wcerr << L">'." << endl; - exit(EXIT_FAILURE); + error("Invalid inclusion of '<%S>' into ''.", name.c_str()); } } for(int i = 0; i < from; i++) @@ -1031,12 +895,10 @@ LRXCompiler::procRepeat() void LRXCompiler::procSeq() { - wstring name = this->attrib(LRX_COMPILER_NAME_ATTR); + UString name = this->attrib(LRX_COMPILER_NAME_ATTR); if(sequences.find(name) == sequences.end()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Sequence '" << name << L"' not defined." << endl; - exit(EXIT_FAILURE); + error("Sequence '%S' is not defined.", name.c_str()); } currentState = transducer.insertTransducer(currentState, sequences[name]); } @@ -1050,28 +912,24 @@ LRXCompiler::write(FILE *fst) Compression::multibyte_write(recognisers.size(), fst); for(auto& it : recognisers) { - Compression::wstring_write(it.first, fst); - if(debugMode) - { - fwprintf(stderr, L"+ %d => %S\n", it.second.size(), it.first.c_str()); - it.second.show(alphabet, stderr, 0, false); + Compression::string_write(it.first, fst); + debug("+ %d => %S\n", it.second.size(), it.first.c_str()); + if (debugMode) { + it.second.show(alphabet, debug_output, 0, false); } it.second.write(fst); } - Compression::wstring_write(L"main", fst); + Compression::string_write("main"_u, fst); if(outputGraph) { - transducer.show(alphabet, stderr, 0, false); + transducer.show(alphabet, debug_output, 0, false); } transducer.write(fst); for(auto& it : weights) { - if(debugMode) - { - fwprintf(stderr, L"%.4f %d\n", it.second, it.first); - } + debug("%.4f %d\n", it.second, it.first); weight record{it.first, "", it.second}; weight_to_le(record); fwrite((void *)&record, 1, sizeof(weight), fst); @@ -1079,6 +937,6 @@ LRXCompiler::write(FILE *fst) if(!outputGraph) { - fwprintf(stderr, L"%d: %d@%d\n", currentRuleId, transducer.size(), transducer.numberOfTransitions()); + u_fprintf(debug_output, "%d: %d@%d\n", currentRuleId, transducer.size(), transducer.numberOfTransitions()); } } diff --git a/src/lrx_compiler.h b/src/lrx_compiler.h index 099c4a7..6752215 100644 --- a/src/lrx_compiler.h +++ b/src/lrx_compiler.h @@ -18,30 +18,12 @@ #ifndef __LRX_COMPILER_H__ #define __LRX_COMPILER_H__ -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include - +#include #include - -#include -#include #include -#include #include -#include -#include -#include -#include -#include +#include using namespace std; @@ -52,23 +34,33 @@ private: Alphabet alphabet; Transducer transducer; - map recognisers; // keyed on pattern - map weights; // keyed on rule id + map recognisers; // keyed on pattern + map weights; // keyed on rule id - map sequences; + map sequences; - int initialState; - int lastState; - int currentState; - bool canSelect; // disallow , inside , + bool canSelect = true; - int currentRuleId; + int32_t currentRuleId = 0; - bool debugMode; - bool outputGraph; + int32_t any_tag = 0; + int32_t any_char = 0; + int32_t any_upper = 0; + int32_t any_lower = 0; + int32_t word_boundary = 0; + + bool debugMode = false; + bool outputGraph = false; + UFILE* debug_output; + void debug(const char* fmt, ...); + void error(const char* fmt, ...); bool allBlanks(); - void skipBlanks(wstring &name); + void skipBlanks(UString &name); void procNode(); void procList(); void procListMatch(); @@ -82,43 +74,39 @@ private: void procSeq(); /* If attrib does not exist (or other error), returns an empty string: */ - wstring attrib(wstring const &name); + UString attrib(UString const &name); /* If attrib does not exist (or other error), returns fallback: */ - wstring attrib(wstring const &name, const wstring fallback); - - wstring itow(int i); - int wtoi(wstring); - double wtod(wstring); + UString attrib(UString const &name, const UString fallback); public: - static wstring const LRX_COMPILER_LRX_ELEM; - static wstring const LRX_COMPILER_DEFSEQS_ELEM; - static wstring const LRX_COMPILER_DEFSEQ_ELEM; - static wstring const LRX_COMPILER_RULES_ELEM; - static wstring const LRX_COMPILER_RULE_ELEM; - static wstring const LRX_COMPILER_MATCH_ELEM; - static wstring const LRX_COMPILER_SELECT_ELEM; - static wstring const LRX_COMPILER_REMOVE_ELEM; - static wstring const LRX_COMPILER_OR_ELEM; - static wstring const LRX_COMPILER_REPEAT_ELEM; - static wstring const LRX_COMPILER_SEQ_ELEM; - - static wstring const LRX_COMPILER_SURFACE_ATTR; - static wstring const LRX_COMPILER_SUFFIX_ATTR; - static wstring const LRX_COMPILER_LEMMA_ATTR; - static wstring const LRX_COMPILER_CONTAINS_ATTR; - static wstring const LRX_COMPILER_CASE_ATTR; - static wstring const LRX_COMPILER_TAGS_ATTR; - static wstring const LRX_COMPILER_COMMENT_ATTR; - static wstring const LRX_COMPILER_NAME_ATTR; - static wstring const LRX_COMPILER_WEIGHT_ATTR; - static wstring const LRX_COMPILER_FROM_ATTR; - static wstring const LRX_COMPILER_UPTO_ATTR; - - static wstring const LRX_COMPILER_TYPE_SELECT; - static wstring const LRX_COMPILER_TYPE_REMOVE; - static wstring const LRX_COMPILER_TYPE_SKIP; + static UString const LRX_COMPILER_LRX_ELEM; + static UString const LRX_COMPILER_DEFSEQS_ELEM; + static UString const LRX_COMPILER_DEFSEQ_ELEM; + static UString const LRX_COMPILER_RULES_ELEM; + static UString const LRX_COMPILER_RULE_ELEM; + static UString const LRX_COMPILER_MATCH_ELEM; + static UString const LRX_COMPILER_SELECT_ELEM; + static UString const LRX_COMPILER_REMOVE_ELEM; + static UString const LRX_COMPILER_OR_ELEM; + static UString const LRX_COMPILER_REPEAT_ELEM; + static UString const LRX_COMPILER_SEQ_ELEM; + + static UString const LRX_COMPILER_SURFACE_ATTR; + static UString const LRX_COMPILER_SUFFIX_ATTR; + static UString const LRX_COMPILER_LEMMA_ATTR; + static UString const LRX_COMPILER_CONTAINS_ATTR; + static UString const LRX_COMPILER_CASE_ATTR; + static UString const LRX_COMPILER_TAGS_ATTR; + static UString const LRX_COMPILER_COMMENT_ATTR; + static UString const LRX_COMPILER_NAME_ATTR; + static UString const LRX_COMPILER_WEIGHT_ATTR; + static UString const LRX_COMPILER_FROM_ATTR; + static UString const LRX_COMPILER_UPTO_ATTR; + + static UString const LRX_COMPILER_TYPE_SELECT; + static UString const LRX_COMPILER_TYPE_REMOVE; + static UString const LRX_COMPILER_TYPE_SKIP; static double const LRX_COMPILER_DEFAULT_WEIGHT; diff --git a/src/lrx_proc.cc b/src/lrx_proc.cc index bd77260..db713ed 100644 --- a/src/lrx_proc.cc +++ b/src/lrx_proc.cc @@ -20,11 +20,7 @@ #include #include #include - -#ifdef _MSC_VER -#include -#include -#endif +#include using namespace std; @@ -92,7 +88,8 @@ int main(int argc, char *argv[]) } } - FILE *input = stdin, *output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); LtLocale::tryToSetLocale(); if(optind == (argc - 3)) @@ -103,14 +100,12 @@ int main(int argc, char *argv[]) endProgram(argv[0]); } - input = fopen(argv[optind+1], "rb"); - if(input == NULL || ferror(input)) - { + if (!input.open(argv[optind+1])) { endProgram(argv[0]); } - output= fopen(argv[optind+2], "wb"); - if(output == NULL || ferror(output)) + output = u_fopen(argv[optind+2], "w", NULL, NULL); + if(output == NULL) { endProgram(argv[0]); } @@ -126,9 +121,7 @@ int main(int argc, char *argv[]) endProgram(argv[0]); } - input = fopen(argv[optind+1], "rb"); - if(input == NULL || ferror(input)) - { + if (!input.open(argv[optind+1])) { endProgram(argv[0]); } @@ -150,14 +143,8 @@ int main(int argc, char *argv[]) endProgram(argv[0]); } -#ifdef _MSC_VER - _setmode(_fileno(input), _O_U8TEXT); - _setmode(_fileno(output), _O_U8TEXT); -#endif - lrxp.init(); lrxp.process(input, output); - fclose(input); - fclose(output); + u_fclose(output); return EXIT_SUCCESS; } diff --git a/src/lrx_processor.cc b/src/lrx_processor.cc index 276c6ba..8715097 100644 --- a/src/lrx_processor.cc +++ b/src/lrx_processor.cc @@ -17,21 +17,28 @@ #include #include -#include +#include +#include + using namespace std; -wstring const LRXProcessor::LRX_PROCESSOR_TAG_SELECT = L""_u; +UString const LRXProcessor::LRX_PROCESSOR_TAG_REMOVE = ""_u; +UString const LRXProcessor::LRX_PROCESSOR_TAG_SKIP = ""_u; + +UString const LRXProcessor::LRX_PROCESSOR_TAG_ANY_CHAR = ""_u; +UString const LRXProcessor::LRX_PROCESSOR_TAG_ANY_TAG = ""_u; +UString const LRXProcessor::LRX_PROCESSOR_TAG_ANY_UPPER = ""_u; +UString const LRXProcessor::LRX_PROCESSOR_TAG_ANY_LOWER = ""_u; +UString const LRXProcessor::LRX_PROCESSOR_TAG_WORD_BOUNDARY = "<$>"_u; -wstring +UString LRXProcessor::itow(int i) { - // Convert an int to a wstring - wchar_t buf[50]; - memset(buf, '\0', sizeof(buf)); - swprintf(buf, 50, L"%d", i); - wstring id(buf); + // Convert an int to a UString + UChar buf[50]; + u_snprintf(buf, 50, "%d", i); + UString id(buf); return id; } @@ -77,39 +84,31 @@ void LRXProcessor::load(FILE *in) { alphabet.read(in); + any_char = alphabet(LRX_PROCESSOR_TAG_ANY_CHAR); + any_tag = alphabet(LRX_PROCESSOR_TAG_ANY_TAG); + any_upper = alphabet(LRX_PROCESSOR_TAG_ANY_UPPER); + any_lower = alphabet(LRX_PROCESSOR_TAG_ANY_LOWER); + word_boundary = alphabet(LRX_PROCESSOR_TAG_WORD_BOUNDARY); int len = Compression::multibyte_read(in); while(len > 0) { - int len2 = Compression::multibyte_read(in); - wstring name = L""; - while(len2 > 0) - { - name += static_cast(Compression::multibyte_read(in)); - len2--; - } + UString name = Compression::string_read(in); recognisers[name].read(in, alphabet); if(debugMode) { - fwprintf(stderr, L"Recogniser: %S, [finals: %d]\n", name.c_str(), recognisers[name].getFinals().size()); + cerr << "Recogniser: " << name << ", [finals: " << recognisers[name].getFinals().size() << "]\n"; } len--; } if(debugMode) { - fwprintf(stderr, L"recognisers: %d\n", recognisers.size()); + cerr << "recognisers: " << recognisers.size() << endl; } - int len3 = Compression::multibyte_read(in); - - wstring name = L""; - while(len3 > 0) - { - name += static_cast(Compression::multibyte_read(in)); - len3--; - } + UString name = Compression::string_read(in); transducer.read(in, alphabet); @@ -118,13 +117,15 @@ LRXProcessor::load(FILE *in) while(fread(&record, sizeof(weight), 1, in)) { weight_from_le(record); - wstring sid = L"<" + itow(record.id) + L">"; + UString sid = "<"_u + itow(record.id) + ">"_u; weights[sid] = record.pisu; + /* if(debugMode) { - //fwprintf(stderr, L"%S %d weight(%.4f)\n", sid.c_str(), record.id, record.pisu); + cerr << sid << " " << record.id << " weight(" << record.pisu << ")\n"; } + */ } return; @@ -137,42 +138,26 @@ LRXProcessor::init() anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end()); - escaped_chars.insert(L'['); - escaped_chars.insert(L']'); - escaped_chars.insert(L'{'); - escaped_chars.insert(L'}'); - escaped_chars.insert(L'^'); - escaped_chars.insert(L'$'); - escaped_chars.insert(L'/'); - escaped_chars.insert(L'\\'); - escaped_chars.insert(L'@'); - escaped_chars.insert(L'<'); - escaped_chars.insert(L'>'); - -} - -wstring -LRXProcessor::readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) -{ - wstring result = L""; - result += delim1; - wchar_t c = delim1; + escaped_chars.insert('['); + escaped_chars.insert(']'); + escaped_chars.insert('{'); + escaped_chars.insert('}'); + escaped_chars.insert('^'); + escaped_chars.insert('$'); + escaped_chars.insert('/'); + escaped_chars.insert('\\'); + escaped_chars.insert('@'); + escaped_chars.insert('<'); + escaped_chars.insert('>'); - while(!feof(input) && c != delim2) - { - c = static_cast(fgetwc_unlocked(input)); - result += c; - } - - return result; } bool -LRXProcessor::recognisePattern(const wstring lu, const wstring op) +LRXProcessor::recognisePattern(const UString lu, const UString op) { if(recognisers.count(op) < 1) { - fwprintf(stderr, L"WARNING: Recogniser not found for key %S, skipping... [LU: %S]\n", op.c_str(), lu.c_str()); + cerr << "WARNING: Recogniser not found for key " << op << ", skipping... [LU: " << lu << "]" << endl; return false; } @@ -184,14 +169,14 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op) end_states.insert(recognisers[op].getFinals().begin(), recognisers[op].getFinals().end()); bool readingTag = false; - wstring tag = L""; + UString tag; int val = 0; for(auto& it : lu) { /* if(debugMode) { - fwprintf(stderr, L"alive: %d\n", cur.size()); + cerr << "alive: " << cur.size() << endl; } */ if(cur.size() < 1) // I think that any time we have 0 alive states, @@ -199,35 +184,35 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op) { return false; } - if(it == L'<') + if(it == '<') { - tag = L""; + tag.clear(); readingTag = true; - tag = tag + it; + tag += it; continue; } - if(it == L'>') + if(it == '>') { tag = tag + it; - val = static_cast(alphabet(tag)); + val = alphabet(tag); if(val == 0) { - val = static_cast(alphabet(L"")); + val = any_tag; } /* if(debugMode) { - fwprintf(stderr, L":: tag %S: %d\n", tag.c_str(), val); - fwprintf(stderr, L" step: %S\n", tag.c_str()); + cerr << ":: tag " << tag << ": " << val << endl; + cerr << " step: " << tag << endl; } */ - cur.step(val, alphabet(L"")); + cur.step(val, any_tag); readingTag = false; continue; } if(readingTag) { - tag = tag + it; + tag += it; } else { @@ -236,22 +221,21 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op) /* if(debugMode) { - fwprintf(stderr, L" step: %C\n", val); + cerr << " step: " << val << endl; } */ - //cur.step(val, a(L"")); + //cur.step(val, a("")); //cur.step(val); set alts; - if(!iswupper(val)) + alts.insert(any_char); + if(!u_isupper(val)) { - alts.insert(alphabet(L"")); - alts.insert(alphabet(L"")); + alts.insert(any_lower); } else { - alts.insert(alphabet(L"")); - alts.insert(alphabet(L"")); - alts.insert(towlower(val)); + alts.insert(any_upper); + alts.insert(u_tolower(val)); } cur.step(val, alts); @@ -261,7 +245,7 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op) /* if(debugMode) { - fwprintf(stderr, L">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"); + cerr << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"; } */ if(cur.isFinal(end_states)) @@ -272,541 +256,29 @@ LRXProcessor::recognisePattern(const wstring lu, const wstring op) return false; } -/* -void -LRXProcessor::processFlush(FILE *output, - map &sl, - map > &tl, - map &blanks, - map > > &covers, - pair > &empty_seq, - map, vector > &spans, - int last_final) -{ - if(debugMode) - { - fwprintf(stderr, L"FLUSH:\n"); - } - - map > >::iterator it; - map > operations; - - for(it = covers.begin(); it != covers.end(); it++) - { - pair > best = it->second; - if(debugMode) - { - fwprintf(stderr, L"===================================================\n"); - fwprintf(stderr, L"[%d][%d] covers[%d] best (score: %d, size: %d)\n", pos, last_final, it->first, best.first, best.second.size()); - } - - // return M[i-1] - if(it->first == last_final) - { - vector::iterator it2; - for(it2 = best.second.begin(); it2 != best.second.end(); it2++) - { - if(debugMode) - { - wstring out = it2->filterFinals(anfinals, alphabet, escaped_chars); - fwprintf(stderr, L"!!! filter_finals: %S\n", out.c_str()); - } - set > > outpaths; - outpaths = it2->filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0); - - int j = 1; - set > >::iterator it3; - for(it3 = outpaths.begin(); it3 != outpaths.end(); it3++) - { - wstring id = it3->first; - vector ops = it3->second; - vector::iterator op; - for(op = ops.begin(); op != ops.end(); op++) - { - if(*op != LRX_PROCESSOR_TAG_SKIP) - { - int starting_point = -1; - map, vector >::iterator ix; - for(ix = spans.begin(); ix != spans.end(); ix++) - { - vector::iterator iy; - for(iy = ix->second.begin(); iy != ix->second.end(); iy++) - { - set > > y; - y = iy->filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0); - if(y == outpaths) - { - starting_point = ix->first.first; - } - } - } - if(debugMode) - { - fwprintf(stderr, L"=> APPLY [pos: %d, dep: %d, j: %d, start: %d, len: %d]: %S // %S\n", pos, starting_point, j, starting_point+j, ops.size(), id.c_str(), op->c_str()); - } - operations[starting_point+j].first = id; - operations[starting_point+j].second = *op; - } - j++; - } - } - if(debugMode) - { - fwprintf(stderr, L"[best: %d, outpaths: %d]\n", best.first, outpaths.size()); - } - } - } - } - - covers.clear(); - covers[-1] = empty_seq; - covers[-1].first = 0; - - // Here we actually apply the rules that we've matched - - unsigned int spos = 0; - for(spos = 0; spos <= pos; spos++) - { - if(sl[spos] == L"") - { - continue; - } - wstring op = operations[spos].second; - wstring tipus = L""; - if(op.find(LRX_PROCESSOR_TAG_SELECT) != wstring::npos) - { - tipus = LRX_PROCESSOR_TAG_SELECT; - } - if(op.find(LRX_PROCESSOR_TAG_REMOVE) != wstring::npos) - { - tipus = LRX_PROCESSOR_TAG_REMOVE; - } - if(debugMode) - { - fwprintf(stderr, L"#APPL%S. %S\n", tipus.c_str(), op.c_str()); - } - - fwprintf(output, L"%S^%S/", blanks[spos].c_str(), sl[spos].c_str()); - - vector::iterator ti; - vector::iterator penum = tl[spos].end(); penum--; - - if(tipus == LRX_PROCESSOR_TAG_SELECT && tl[spos].size() > 1) - { - bool matched = true; - bool selected = false; - for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) - { - matched = recognisePattern(*ti, op); - if(matched) - { - if(traceMode || debugMode) - { - fwprintf(stderr, L"%d:SELECT%S:%S:%S\n", lineno, operations[spos].first.c_str(), sl[spos].c_str(), op.c_str()); - } - fwprintf(output, L"%S", ti->c_str()); - selected = true; - break; - } - } - if(!selected) - { - for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) - { - fwprintf(output, L"%S", ti->c_str()); - if(ti != penum) - { - fwprintf(output, L"/"); - } - } - } - } - else if(tipus == LRX_PROCESSOR_TAG_REMOVE && tl[spos].size() > 1) - { - bool matched = true; - vector new_tl; // The new list of TL translations - for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) - { - matched = recognisePattern(*ti, op); - if(matched) - { - if(traceMode || debugMode) - { - fwprintf(stderr, L"%d:REMOVE%S:%S:%S\n", lineno, operations[spos].first.c_str(), sl[spos].c_str(), op.c_str()); - } - continue; - } - new_tl.push_back(*ti); - } - vector::iterator nti; - vector::iterator npenum = new_tl.end(); npenum--; - for(nti = new_tl.begin(); nti != new_tl.end(); nti++) - { - fwprintf(output, L"%S", nti->c_str()); - if(nti != npenum) - { - fwprintf(output, L"/"); - } - } - new_tl.clear(); - } - else - { - for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) - { - fwprintf(output, L"%S", ti->c_str()); - if(ti != penum) - { - fwprintf(output, L"/"); - } - } - } - fwprintf(output, L"$"); - if(debugMode) - { - fwprintf(output, L"%d", spos); - } - } -} -*/ - -/* void -LRXProcessor::process(FILE *input, FILE *output) +LRXProcessor::process(InputFile& input, UFILE *output) { bool isEscaped = false; - map sl; // map of SL words - map > tl; // map of vectors of TL translations - map blanks; // map of the superblanks - - map > > covers ; - pair > empty_seq; - map, vector > spans ; - - covers[-1] = empty_seq; - covers[-1].first = 1.0; - - vector alive_states_clean ; - vector alive_states = alive_states_clean ; - alive_states.push_back(*initial_state); - vector new_states; - - int last_final = -1; // check what we actually use this for - - while(!feof(input)) - { - int val = fgetwc_unlocked(input); - - if(nullFlush && val == L'\0') - { - processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final); - fwprintf(output, L"%S", blanks[pos].c_str()); - pos = 0; - last_final = 0; - tl.clear(); - sl.clear(); - blanks.clear(); - spans.clear(); - - fputwc_unlocked(val, output); - fflush(output); - continue; - } - - // We're starting to read a new lexical form - if(val == L'^' && !isEscaped && outOfWord) - { - outOfWord = false; - continue; - } + map sl; // map of SL words + map > tl; // map of vectors of TL translations + map blanks; // map of the superblanks - // We've seen the surface form - if(val == L'/' && !isEscaped && !outOfWord) - { - // Read in target equivalences - wstring trad = L""; - val = fgetwc_unlocked(input); - while(val != L'$') - { - if(val != L'$') - { - trad += static_cast(val); - } - if(val == L'/') - { - tl[pos].push_back(trad.substr(0, trad.length()-1)); - trad = L""; - } - val = fgetwc_unlocked(input); - } - tl[pos].push_back(trad); - - if(debugMode) - { - for(vector::iterator it = tl[pos].begin(); it != tl[pos].end(); it++) - { - fwprintf(stderr, L"trad[%d]: %S\n", pos, it->c_str()); - } - } - } - - // We've finished reading a lexical form - if((feof(input) || val == L'$') && !isEscaped && !outOfWord) - { - if(debugMode) - { - fwprintf(stderr, L"[POS] %d: [sl %d ; tl %d ; bl %d]\n", pos, sl[pos].size(), tl[pos].size(), blanks[pos].size()); - } - - new_states.clear(); // alive_states_new - pair > new_best_cover; - new_best_cover.first = -numeric_limits::max(); - - vector matched_rules; - - // \forall s \in A - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - State s = *it; - // \IF \exists c \in Q : \delta(s, sent[i]) = c - s.step(alphabet(L"<$>")); - - // A \gets A \cup {c} - if(s.size() > 0) // If the current state has outgoing transitions, - // add it to the new alive states - { - new_states.push_back(s); - } - s.step(alphabet(L"<$>")); - - // \IF c \in F - if(s.isFinal(anfinals)) - { - // We've reached a final state, so we need to evaluate the rule we've matched - if(debugMode) - { - wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); - fwprintf(stderr, L" filter_finals: %S\n", out.c_str()); - } - - set > > outpaths; - outpaths = s.filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0); - - set > >::iterator it; - for(it = outpaths.begin(); it != outpaths.end(); it++) - { - vector reached; - - vector path = (*it).second; - wstring id = (*it).first; - - if(debugMode) - { - fwprintf(stderr, L"id: %S:\n", id.c_str()); - for(vector::iterator it2 = path.begin(); it2 != path.end(); it2++) - { - fwprintf(stderr, L"op: %S\n", it2->c_str()); - } - fwprintf(stderr, L"#SPAN[%d, %d]\n", (pos-path.size()), pos); - } - - spans[make_pair((pos-path.size()), pos)].push_back(s); - - // M[i-ChunkLength(c)] - pair > newseq = covers[(pos - path.size())]; - newseq.first = newseq.first + path.size() ; - - if(newseq.first > new_best_cover.first) - { - State new_state; - new_state = s; - reached.push_back(new_state); - map > >::iterator k; - for(k = covers.begin(); k != covers.end(); k++) - { - vector::iterator l; - pair > p = k->second; - for(l = p.second.begin(); l != p.second.end(); l++) - { - if(debugMode) - { - fwprintf(stderr, L"= [cov: %d][len: %d][pos: %d][pat: %d] INCLUDE FINALS?\n", k->first, p.first, pos, path.size()); - } - if(k->first <= (pos - path.size())) - { - if(debugMode) - { - wstring out2 = l->filterFinals(anfinals, alphabet, escaped_chars); - fwprintf(stderr, L" == INCLUDE FINALS: %S\n", out2.c_str()); - } - reached.push_back(*l); - } - } - } - newseq.second = reached; - new_best_cover = newseq; - covers[pos] = newseq; - if(debugMode) - { - fwprintf(stderr, L"++ FINALS(%d) covers[%d] [%d, %d] BEST: %.4f > %.4f\n", newseq.second.size(), (pos - path.size()), pos, path.size(), newseq.first, new_best_cover.first); - } - } - - last_final = pos; - } - } - } - - alive_states.swap(new_states); - alive_states.push_back(*initial_state); - - if(debugMode) - { - fwprintf(stderr, L"#CURRENT_ALIVE: %d\n", alive_states.size()); - } - - if(alive_states.size() == 1) - { - // If we have only a single alive state, it means no rules are - // active, and we can flush the buffers. - processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final); - - pos = 0; - last_final = 0; - tl.clear(); - sl.clear(); - blanks.clear(); - spans.clear(); - } - - pos++; - if(debugMode) - { - fwprintf(stderr, L"==> new pos: %d\n", pos); - } - - outOfWord = true; - continue; - } - - - // We're reading a tag - if(val == L'<' && !isEscaped && !outOfWord) - { - wstring tag = L""; - tag = readFullBlock(input, L'<', L'>'); - sl[pos] = sl[pos] + tag; - val = static_cast(alphabet(tag)); - if(val == 0) - { - val = static_cast(alphabet(L"")); - } - if(debugMode) - { - fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); - } - } - - if(!outOfWord) - { - if(debugMode) - { - fwprintf(stderr, L"outOfWord = false\n"); - } - - new_states.clear(); - wstring res = L""; - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - res = L""; - State s = *it; - if(val < 0) - { - alphabet.getSymbol(res, val, false); - if(debugMode) - { - fwprintf(stderr, L" step: %S\n", res.c_str()); - } - s.step(val, alphabet(L"")); - } - else - { - if(debugMode) - { - fwprintf(stderr, L" step: %C\n", val); - } - s.step_case(val, alphabet(L""), false); - } - if(s.size() > 0) // If the current state has outgoing transitions, add it to the new alive states - { - new_states.push_back(s); - } - } - if(debugMode) - { - fwprintf(stderr, L"new_states: %d\n", new_states.size()); - } - alive_states.swap(new_states); - alive_states.push_back(*initial_state); - - } - - // We're still reading a surface form - if(val > 0 && val != L'$' && !isEscaped && !outOfWord) - { - sl[pos] = sl[pos] + static_cast(val); - } - - // Reading a superblank - if(outOfWord) - { - if(!feof(input)) - { - blanks[pos] = blanks[pos] + static_cast(val); - } - if(debugMode) - { - //fwprintf(stderr, L"blanks[%d] = %S\n", pos, blanks[pos].c_str()); - } - } - - // Increment the current line number (for rule tracing) - if(val == L'\n') - { - lineno++; - } - } - - processFlush(output, sl, tl, blanks, covers, empty_seq, spans, last_final); - - fwprintf(output, L"%S", blanks[pos].c_str()); -} -*/ - -void -LRXProcessor::process(FILE *input, FILE *output) -{ - bool isEscaped = false; - - map sl; // map of SL words - map > tl; // map of vectors of TL translations - map blanks; // map of the superblanks - - map > scores; // - map > operations; + map > scores; // + map > operations; vector alive_states ; alive_states.push_back(new State(*initial_state)); - int val = 0; - while((val = fgetwc_unlocked(input)) != EOF && val != WEOF) + int32_t val = 0; + while((val = input.get()) != U_EOF) { - if(nullFlush && val == L'\0') + if(nullFlush && val == '\0') { processFlush(output, sl, tl, blanks, scores, operations); - fwprintf(output, L"%S", blanks[pos].c_str()); + u_fprintf(output, "%S", blanks[pos].c_str()); pos = 0; tl.clear(); sl.clear(); @@ -816,63 +288,62 @@ LRXProcessor::process(FILE *input, FILE *output) alive_states.clear(); alive_states.push_back(new State(*initial_state)); - fputwc_unlocked(val, output); - fflush(output); + u_fputc(val, output); + u_fflush(output); continue; } // We're starting to read a new lexical form - if(val == L'^' && !isEscaped && outOfWord) + if(val == '^' && !isEscaped && outOfWord) { outOfWord = false; continue; } // We've seen the surface form - if(val == L'/' && !isEscaped && !outOfWord) + if(val == '/' && !isEscaped && !outOfWord) { // Read in target equivalences - wstring trad = L""; - val = fgetwc_unlocked(input); - while(val != L'$' && val != EOF && val != WEOF) + UString trad; + val = input.get(); + while(val != '$' && val != U_EOF) { - if(val != L'$') + if(val != '$') { - trad += static_cast(val); + trad += val; } - if(val == L'/') + if(val == '/') { tl[pos].push_back(trad.substr(0, trad.length()-1)); - trad = L""; + trad.clear(); } - val = fgetwc_unlocked(input); + val = input.get(); } tl[pos].push_back(trad); if(debugMode) { - for(auto& it : tl[pos]) - { - fwprintf(stderr, L"trad[%d]: %S\n", pos, it.c_str()); + for(auto& it : tl[pos]) { + cerr << "trad[" << pos << "]: " << it << endl; } } } - if((feof(input) || val == L'$') && !isEscaped && !outOfWord) + if((input.eof() || val == '$') && !isEscaped && !outOfWord) { if(debugMode) { - fwprintf(stderr, L"[POS] %d: [sl %d ; tl %d ; bl %d]: %S\n", pos, sl[pos].size(), tl[pos].size(), blanks[pos].size(), sl[pos].c_str()); + cerr << "[POS] " << pos << ": [sl " << sl[pos].size() << " ; tl " << tl[pos].size() << " ; bl " << blanks[pos].size() << "]: " << sl[pos] << endl; } { vector new_states; // TODO: Can we avoid the State-copying here? // \forall s \in A - set seen_ids; + set seen_ids; for(auto& it : alive_states) { State s = *it; // \IF \exists c \in Q : \delta(s, sent[i]) = c - s.step(alphabet(L"<$>")); + s.step(word_boundary); // A \gets A \cup {c} if (s.size() > 0) // If the current state has outgoing transitions, @@ -880,7 +351,7 @@ LRXProcessor::process(FILE *input, FILE *output) { new_states.push_back(new State(s)); } - s.step(alphabet(L"<$>")); + s.step(word_boundary); // \IF c \in F if (s.isFinal(anfinals)) @@ -888,18 +359,18 @@ LRXProcessor::process(FILE *input, FILE *output) // We've reached a final state, so we need to evaluate the rule we've matched if (debugMode) { - wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); - fwprintf(stderr, L" filter_finals: %S\n", out.c_str()); + UString out = s.filterFinals(anfinals, alphabet, escaped_chars); + cerr << " filter_finals: " << out << endl; } - set>> outpaths; + set>> outpaths; outpaths = s.filterFinalsLRX(anfinals, alphabet, escaped_chars, false, false, 0); for (auto& it : outpaths) { vector reached; - vector path = it.second; - wstring id = it.first; + vector path = it.second; + UString id = it.first; if (seen_ids.find(id) != seen_ids.end()) { @@ -911,13 +382,14 @@ LRXProcessor::process(FILE *input, FILE *output) if (debugMode) { - fwprintf(stderr, L"id: %S: (lambda: %.5f)\n", id.c_str(), weights[id.c_str()]); + cerr << "id: " << id << ": (lambda: "; + cerr << weights[id] << ")\n"; } for (auto& it2 : path) { if (debugMode) { - fwprintf(stderr, L"op: %S\n", it2.c_str()); + cerr << "op: " << it2 << endl; } if (it2 != LRX_PROCESSOR_TAG_SKIP) { @@ -928,9 +400,10 @@ LRXProcessor::process(FILE *input, FILE *output) scores[j][it2] += weights[id.c_str()]; if (debugMode) { - fwprintf(stderr, L"#[%d]SCORE %.5f / %S\n", j, scores[j][it2], it2.c_str()); + cerr << "#[" << j << "]SCORE " << scores[j][it2] << " / "; + cerr << it2 << endl; } - if(it2.at(0) == L'<' && it2.at(1) == L'r') { + if(it2.at(0) == '<' && it2.at(1) == 'r') { operations[j][it2] = Remove; } else { @@ -939,7 +412,7 @@ LRXProcessor::process(FILE *input, FILE *output) } j++; } - // fwprintf(stderr, L"#SPAN[%d, %d]\n", (pos-path.size()), pos); + // cerr << "#SPAN[" << (pos-path.size()) << ", " << pos << "]\n"; } } } @@ -953,13 +426,12 @@ LRXProcessor::process(FILE *input, FILE *output) if (debugMode) { - fwprintf(stderr, L"seen:"); - for (auto& it : seen_ids) - { - fwprintf(stderr, L" %S ", it.c_str()); + cerr << "seen:"; + for (auto& it : seen_ids) { + cerr << " " << it << " "; } - fwprintf(stderr, L"\n"); - fwprintf(stderr, L"#CURRENT_ALIVE: %d\n", alive_states.size()); + cerr << endl; + cerr << "#CURRENT_ALIVE: " << alive_states.size() << endl; } } @@ -970,7 +442,7 @@ LRXProcessor::process(FILE *input, FILE *output) if(debugMode) { - fwprintf(stderr, L"FLUSH:\n"); + cerr << "FLUSH:" << endl; } @@ -988,7 +460,7 @@ LRXProcessor::process(FILE *input, FILE *output) pos++; if(debugMode) { - fwprintf(stderr, L"==> new pos: %d\n", pos); + cerr << "==> new pos: " << pos << endl; } outOfWord = true; @@ -996,19 +468,17 @@ LRXProcessor::process(FILE *input, FILE *output) } // We're reading a tag - if(val == L'<' && !isEscaped && !outOfWord) + if(val == '<' && !isEscaped && !outOfWord) { - wstring tag = L""; - tag = readFullBlock(input, L'<', L'>'); + UString tag = input.readBlock('<', '>'); sl[pos] = sl[pos] + tag; - val = static_cast(alphabet(tag)); - if(val == 0) - { - val = static_cast(alphabet(L"")); + val = alphabet(tag); + if (val == 0) { + val = any_tag; } if(debugMode) { - fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); + cerr << "tag " << tag << ": " << val << "\n"; } } @@ -1016,39 +486,39 @@ LRXProcessor::process(FILE *input, FILE *output) { if(debugMode) { - fwprintf(stderr, L"outOfWord = false\n"); + cerr << "outOfWord = false\n"; } - wstring res = L""; + UString res; for(auto& s : alive_states) { - res = L""; + res.clear(); if(val < 0) { alphabet.getSymbol(res, val, false); if(debugMode) { - fwprintf(stderr, L" step: %S\n", res.c_str()); + cerr << " step: " << res << endl; } - s->step(val, alphabet(L"")); + s->step(val, any_tag); } else { set alts; - alts.insert(alphabet(L"")); - if(iswupper(val)) + alts.insert(any_char); + if(u_isupper(val)) { - alts.insert(towlower(val)); - alts.insert(alphabet(L"")); + alts.insert(u_tolower(val)); + alts.insert(any_upper); } else { - alts.insert(alphabet(L"")); + alts.insert(any_lower); } if(debugMode) { - fwprintf(stderr, L" step: %C [alts: %d]\n", val, alts.size()); + cerr << " step: " << val << " [alts: " << alts.size() << "]\n"; } s->step(val, alts); } @@ -1057,26 +527,28 @@ LRXProcessor::process(FILE *input, FILE *output) } // We're still reading a surface form - if(val > 0 && val != L'$' && !isEscaped && !outOfWord) + if(val > 0 && val != '$' && !isEscaped && !outOfWord) { - sl[pos] = sl[pos] + static_cast(val); + sl[pos] += val; } // Reading a superblank if(outOfWord) { - if(!feof(input)) + if(!input.eof()) { - blanks[pos] = blanks[pos] + static_cast(val); + blanks[pos] += val; } + /* if(debugMode) { - //fwprintf(stderr, L"blanks[%d] = %S\n", pos, blanks[pos].c_str()); + cerr << "blanks[" << pos << "] = " << blanks[pos] << endl; } + */ } // Increment the current line number (for rule tracing) - if(val == L'\n') + if(val == '\n') { lineno++; } @@ -1084,42 +556,42 @@ LRXProcessor::process(FILE *input, FILE *output) } processFlush(output, sl, tl, blanks, scores, operations); - fwprintf(output, L"%S", blanks[pos].c_str()); + write(blanks[pos], output); } void -LRXProcessor::processFlush(FILE *output, - map &sl, - map > &tl, - map &blanks, - map > &scores, - map > &operations) { +LRXProcessor::processFlush(UFILE *output, + map &sl, + map > &tl, + map &blanks, + map > &scores, + map > &operations) { struct ScoredMatch { OpType op; - wstring* ti; // matched target translation + UString* ti; // matched target translation double weight; }; unsigned int spos = 0; for(spos = 0; spos <= pos; spos++) { - if(sl[spos] == L"") + if(sl[spos].empty()) { continue; } - fwprintf(output, L"%S^%S/", blanks[spos].c_str(), sl[spos].c_str()); + u_fprintf(output, "%S^%S/", blanks[spos].c_str(), sl[spos].c_str()); - vector::iterator ti; + vector::iterator ti; auto penum = tl[spos].end(); penum--; if(tl[spos].size() > 1) { //-- - set ti_keep; - set ti_removed; + set ti_keep; + set ti_removed; vector spos_matches; for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) { @@ -1128,9 +600,13 @@ LRXProcessor::processFlush(FILE *output, bool matched = recognisePattern(*ti, si.first); OpType op = operations[spos][si.first]; if (debugMode) { - wstring checks = matched ? L"✔️ " : L"❎"; - fwprintf(stderr, L"%S >>> %d -> %S -> %.5f\n", checks.c_str(), spos, - si.first.c_str(), si.second); + if (matched) { + cerr << "✔️ "; + } else { + cerr << "❎"; + } + cerr << " >>> " << spos << " -> "; + cerr << si.first << " -> " << si.second << endl; } if(matched) { spos_matches.push_back({ op, &*ti, si.second }); @@ -1144,15 +620,10 @@ LRXProcessor::processFlush(FILE *output, [](const auto &a, const auto &b) { return a.weight > b.weight; }); for (const auto &m : spos_matches) { if (traceMode || debugMode) { - wstring op = (m.op == Select ? L"SELECT" : L"REMOVE"); - fwprintf( - stderr, L"%d:%S:%.5f:%S:%d:%S\n", - lineno, - op.c_str(), - m.weight, - sl[spos].c_str(), - ti_keep.size(), - m.ti->c_str()); + std::string op = (m.op == Select ? "SELECT" : "REMOVE"); + cerr << lineno << ":" << op << ":" << m.weight; + cerr << ":" << sl[spos] << ":" << ti_keep.size(); + cerr << ":" << m.ti << endl; } // We have to keep track of translations that have been removed so // that we don't end up adding back a translation that was removed. @@ -1168,9 +639,9 @@ LRXProcessor::processFlush(FILE *output, bool printed = false; for(const auto& ti_max : ti_keep) { if(printed) { - fwprintf(output, L"/"); + u_fprintf(output, "/"); } - fwprintf(output, L"%S", ti_max->c_str()); + u_fprintf(output, "%S", ti_max->c_str()); printed = true; } } @@ -1178,10 +649,10 @@ LRXProcessor::processFlush(FILE *output, { for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) { - fwprintf(output, L"%S", ti->c_str()); + u_fprintf(output, "%S", ti->c_str()); if(ti != penum) { - fwprintf(output, L"/"); + u_fprintf(output, "/"); } } } @@ -1190,18 +661,18 @@ LRXProcessor::processFlush(FILE *output, { for(ti = tl[spos].begin(); ti != tl[spos].end(); ti++) { - fwprintf(output, L"%S", ti->c_str()); + u_fprintf(output, "%S", ti->c_str()); if(ti != penum) { - fwprintf(output, L"/"); + u_fputc('/', output); } } } - fwprintf(output, L"$"); + u_fputc('$', output); if(debugMode) { - fwprintf(output, L"%d", spos); + u_fprintf(output, "%d", spos); } diff --git a/src/lrx_processor.h b/src/lrx_processor.h index 26973aa..1a03d86 100644 --- a/src/lrx_processor.h +++ b/src/lrx_processor.h @@ -18,74 +18,33 @@ #ifndef __LRX_PROCESSOR_H__ #define __LRX_PROCESSOR_H__ -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include +#include #include -#include -#include -#include -#include #include -#include -#include -#include #include -#include #include -#include +#include using namespace std; -/* -class BiltransToken { -public: - bool isEOF = false; - wstring source; - wstring blanks; - vector target; - - wstring toString(bool delim) { - wstring out = source; - for(int i = 0; i < target.size(); i++) { - out += L'/' + target[i]; - } - if (delim && (source.size() > 0 || target.size() > 0)) { - out = blanks + L'^' + out + L'$'; - } else { - out = blanks + out; - } - return out; - } -}; -*/ class LRXProcessor { private: Alphabet alphabet; TransExe transducer; - map recognisers; - map weights; - -// map bts; + map recognisers; + map weights; vector alive_states; map anfinals; - set escaped_chars; + set escaped_chars; State *initial_state; bool traceMode; @@ -93,42 +52,41 @@ private: bool nullFlush; bool outOfWord; + int32_t any_char; + int32_t any_upper; + int32_t any_lower; + int32_t any_tag; + int32_t word_boundary; + unsigned int pos; unsigned long lineno; - wstring itow(int i); - bool recognisePattern(const wstring lu, const wstring op); - wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); - -// BiltransToken readBiltransToken(FILE *input = stdin); + UString itow(int i); + bool recognisePattern(const UString lu, const UString op); + UString readFullBlock(InputFile& input, UChar32 const delim1, UChar32 const delim2); void makeTransition(int); void filterFinals(); void evaluateRules(); -/* - void processFlush(FILE *output, - map &sl, - map > &tl, - map &blanks, - map > > &covers, - pair > &empty_seq, - map, vector > &spans, - int last_final); -*/ enum OpType { Select, Remove }; - void processFlush(FILE *output, - map &sl, - map > &tl, - map &blanks, - map > &scores, - map > &operations); + void processFlush(UFILE *output, + map &sl, + map > &tl, + map &blanks, + map > &scores, + map > &operations); public: - static wstring const LRX_PROCESSOR_TAG_SELECT; - static wstring const LRX_PROCESSOR_TAG_REMOVE; - static wstring const LRX_PROCESSOR_TAG_SKIP; + static UString const LRX_PROCESSOR_TAG_SELECT; + static UString const LRX_PROCESSOR_TAG_REMOVE; + static UString const LRX_PROCESSOR_TAG_SKIP; + static UString const LRX_PROCESSOR_TAG_ANY_CHAR; + static UString const LRX_PROCESSOR_TAG_ANY_TAG; + static UString const LRX_PROCESSOR_TAG_ANY_UPPER; + static UString const LRX_PROCESSOR_TAG_ANY_LOWER; + static UString const LRX_PROCESSOR_TAG_WORD_BOUNDARY; LRXProcessor(); ~LRXProcessor(); @@ -139,9 +97,7 @@ public: void init(); void load(FILE *input); - void process(FILE *input, FILE *output); -// void processME(FILE *input, FILE *output); - + void process(InputFile& input, UFILE *output); }; #endif /* __LRX_PROCESSOR_H__ */ diff --git a/src/multi_translator.cc b/src/multi_translator.cc index 7e2ad1e..ea98145 100644 --- a/src/multi_translator.cc +++ b/src/multi_translator.cc @@ -1,4 +1,5 @@ #include "multi_translator.h" +#include MultiTranslator::MultiTranslator(string path, string mode, bool trimmed, bool filter, bool number_lines) { this->trimmed = trimmed; @@ -30,10 +31,10 @@ int MultiTranslator::calculateFertility(vector sent) { } -BiltransToken MultiTranslator::parseBiltransToken(wstring bt) { +BiltransToken MultiTranslator::parseBiltransToken(UString bt) { BiltransToken token; - vector tokens = wsplit(bt, L'/'); + vector tokens = wsplit(bt, '/'); token.sourceToken = parseTaggerToken(tokens[0]); @@ -49,9 +50,9 @@ bool MultiTranslator::isPosAmbig(BiltransToken bt) { bool isPos; if (bt.sourceToken.tags.size() > 0) { isPos = - bt.sourceToken.tags[0] == L"n" || - bt.sourceToken.tags[0] == L"vblex" || - bt.sourceToken.tags[0] == L"adj"; + bt.sourceToken.tags[0] == "n"_u || + bt.sourceToken.tags[0] == "vblex"_u || + bt.sourceToken.tags[0] == "adj"_u; } else { isPos = false; } @@ -60,10 +61,10 @@ bool MultiTranslator::isPosAmbig(BiltransToken bt) { } -BiltransToken MultiTranslator::getFullToken(wstring source) { +BiltransToken MultiTranslator::getFullToken(UString source) { BiltransToken token; - if (source[0] == L'*') { + if (source[0] == '*') { token.sourceToken.lemma = source; TaggerToken tmp; tmp.lemma = source; @@ -71,21 +72,22 @@ BiltransToken MultiTranslator::getFullToken(wstring source) { return token; } - wstring target = bilingual.biltrans(source, false); - if (target == L"") { - target = L"@" + source; + UString target = bilingual.biltrans(source, false); + if (target.empty()) { + target += '@'; + target.append(source); } - token = parseBiltransToken(source + L"/" + target); + token = parseBiltransToken(source + "/"_u + target); return token; } -BiltransToken MultiTranslator::getTrimmedToken(wstring source) +BiltransToken MultiTranslator::getTrimmedToken(UString source) { BiltransToken ttoken; BiltransToken ftoken; - if (source[0] == L'*') { + if (source[0] == '*') { ttoken.sourceToken.lemma = source; TaggerToken tmp; tmp.lemma = source; @@ -99,8 +101,8 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source) // the bilingual.* methods in FSTProcessor. Unknown why we get the // leaks in the first place... - wstring fstr = L""; - wstring tstr = L""; + UString fstr; + UString tstr; if((f_cache.find(source) == f_cache.end())) { @@ -116,37 +118,39 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source) /*---------------------------------------------*/ - if (fstr == L"") { - fstr = L"@" + source; - } - if (tstr == L"") { - tstr = L"@" + source; - } + if (fstr.empty()) { + fstr += '@'; + fstr.append(source); + } + if (tstr.empty()) { + tstr += '@'; + tstr.append(source); + } - ttoken = parseBiltransToken(source + L"/" + tstr); - ftoken = parseBiltransToken(source + L"/" + fstr); + ttoken = parseBiltransToken(source + "/"_u + tstr); + ftoken = parseBiltransToken(source + "/"_u + fstr); if(this->trimmed) { for(size_t i = 0; i < ftoken.targetTokens.size(); ++i ) { if(ttoken.targetTokens[i].tags.size() < ftoken.targetTokens[i].tags.size()) { - ttoken.targetTokens[i].tags.push_back(L"*"); + ttoken.targetTokens[i].tags.push_back("*"_u); } } } - vector newTags; + vector newTags; //bool sourceTrimmed = false; for(size_t i = 0; i < ttoken.sourceToken.tags.size(); ++i) { - wstring tag = ttoken.sourceToken.tags[i]; + UString tag = ttoken.sourceToken.tags[i]; if (find(ttoken.targetTokens[0].tags, tag) == find(ftoken.targetTokens[0].tags, tag)) { newTags.push_back(tag); } } if(ttoken.sourceToken.tags.size() > newTags.size()) { - newTags.push_back(L"*"); + newTags.push_back("*"_u); } ttoken.sourceToken.tags = newTags; @@ -154,50 +158,50 @@ BiltransToken MultiTranslator::getTrimmedToken(wstring source) } void MultiTranslator::biltransToMultiTranslator(int sn, int &tn, unsigned int idx, - vector s, wstring buffer) + vector s, UString buffer) { if (idx == s.size() ) { - wcout << L".[][" << sn << L" " << tn << L"].[]\t" << buffer << endl; + cout << ".[][" << sn << " " << tn << "].[]\t" << buffer << endl; tn += 1; return; } auto n = s[idx].targetTokens.size(); - wstring base; - base = s[idx].sourceToken.toString(false) + L"/"; + UString base; + base = s[idx].sourceToken.toString(false) + "/"_u; for(size_t i = 0; i < n; ++i) { - wstring token = L"^" + base + s[idx].targetTokens[i].toString(false) + L"$"; + UString token = "^"_u + base + s[idx].targetTokens[i].toString(false) + "$"_u; if(idx != s.size() - 1) { - token += L" "; + token += ' '; } biltransToMultiTranslator(sn, tn, idx+1, s, buffer + token); } } void MultiTranslator::printBiltransSentence(int n, vector s) { if (number_lines) { - wcout << n << "\t"; + cout << n << "\t"; } for(size_t i = 0; i < s.size(); ++i) { - wcout << s[i].toString(true); + cout << s[i].toString(true); if (i != s.size() - 1) { - wcout << L" "; + cout << " "; } } - wcout << endl; + cout << endl; } void MultiTranslator::printTaggerOutput(int n, vector sentence) { if (number_lines) { - wcout << n << "\t"; + cout << n << "\t"; } for(size_t i = 0; i < sentence.size(); ++i) { - wcout << sentence[i].sourceToken.toString(true); + cout << sentence[i].sourceToken.toString(true); if (i != sentence.size() -1) { - wcout << L" "; + cout << " "; } } - wcout << endl; + cout << endl; } void MultiTranslator::processSentence(vector sentence) { @@ -207,8 +211,8 @@ void MultiTranslator::processSentence(vector sentence) { int numberOfUnknown = 0; int fertility = 1; for(size_t i = 0; i < sentence.size(); ++i) { - wstring token = sentence[i].toString(false); - wstring target; + UString token = sentence[i].toString(false); + UString target; BiltransToken bt; if(this->trimmed){ @@ -220,7 +224,7 @@ void MultiTranslator::processSentence(vector sentence) { if (isPosAmbig(bt)) { hasAmbigPos = true; } - if(token[0] == L'*') { + if(token[0] == '*') { numberOfUnknown ++; } fertility *= bt.targetTokens.size(); @@ -240,7 +244,7 @@ void MultiTranslator::processSentence(vector sentence) { } else if(mode == "-b") { printBiltransSentence(this->sn, outputSentence); } else if (mode == "-m") { - wstring outBuffer = L""; + UString outBuffer; int tn = 0; biltransToMultiTranslator(this->sn, tn, 0, outputSentence, outBuffer); } diff --git a/src/multi_translator.h b/src/multi_translator.h index d4d69cd..30ec426 100644 --- a/src/multi_translator.h +++ b/src/multi_translator.h @@ -4,36 +4,42 @@ #define BILTRANS_WITHOUT_QUEUE #include "tagger_output_processor.h" +#include class BiltransToken { public: - TaggerToken sourceToken; - vector targetTokens; - wstring blanks; - - bool isEOF; - - BiltransToken() { - isEOF = false; - } - - wstring toString(bool delimiter) { - wstring out = sourceToken.toString(false); - for(unsigned int i = 0; i < targetTokens.size(); i++) { - out += L'/' + targetTokens[i].toString(false); - } - if (delimiter) { - out = L"^" + out + L"$"; - } - return out; - } + TaggerToken sourceToken; + vector targetTokens; + UString blanks; + + bool isEOF; + + BiltransToken() { + isEOF = false; + } + + UString toString(bool delimiter) { + UString out; + if (delimiter) { + out += '^'; + } + out.append(sourceToken.toString(false)); + for (auto& tok : targetTokens) { + out += '/'; + out.append(tok.toString(false)); + } + if (delimiter) { + out += '$'; + } + return out; + } }; class MultiTranslator : public TaggerOutputProcessor { private: FSTProcessor bilingual; - map f_cache; - map t_cache; + map f_cache; + map t_cache; string path; bool trimmed; @@ -44,10 +50,10 @@ private: bool isPosAmbig(BiltransToken token); - BiltransToken getTrimmedToken(wstring str); - BiltransToken getFullToken(wstring str); + BiltransToken getTrimmedToken(UString str); + BiltransToken getFullToken(UString str); - BiltransToken parseBiltransToken(wstring bt); + BiltransToken parseBiltransToken(UString bt); void processSentence(vector s); @@ -56,7 +62,7 @@ private: void printTaggerOutput(int i, vector s); void biltransToMultiTranslator(int sn, int &tn, unsigned int idx, - vector s, wstring buffer); + vector s, UString buffer); @@ -69,4 +75,3 @@ public: }; #endif - diff --git a/src/multitrans.cc b/src/multitrans.cc index a4643bc..ad94ae1 100644 --- a/src/multitrans.cc +++ b/src/multitrans.cc @@ -1,4 +1,5 @@ #include "multi_translator.h" +#include bool trim = false; bool filter = false; @@ -9,18 +10,18 @@ string path; string mode; void printError(char *name) { - wcout << "Usage: " << name << " "; - wcout << " [options] " << endl; - wcout << "Modes: " << endl; - wcout << " --biltrans | -b" << endl; - wcout << " --multitrans | -m" << endl; - wcout << " --trim-tagger-output | -p" << endl; + cout << "Usage: " << name << " "; + cout << " [options] " << endl; + cout << "Modes: " << endl; + cout << " --biltrans | -b" << endl; + cout << " --multitrans | -m" << endl; + cout << " --trim-tagger-output | -p" << endl; - wcout << "Options: " << endl; - wcout << " --filter-lines | -f" << endl; - wcout << " --trim-lines | -t" << endl; - wcout << " --number-lines | -n" << endl; - wcout << " --null-flush | -z" << endl; + cout << "Options: " << endl; + cout << " --filter-lines | -f" << endl; + cout << " --trim-lines | -t" << endl; + cout << " --number-lines | -n" << endl; + cout << " --null-flush | -z" << endl; } void parseArguments(int argc, char **argv) { @@ -59,6 +60,7 @@ void parseArguments(int argc, char **argv) { } int main(int argc, char** argv) { + LtLocale::tryToSetLocale(); parseArguments(argc, argv); MultiTranslator mt(path, mode, trim, filter, number_lines); diff --git a/src/tagger_output_processor.cc b/src/tagger_output_processor.cc index 63b07f8..859aae3 100644 --- a/src/tagger_output_processor.cc +++ b/src/tagger_output_processor.cc @@ -1,15 +1,8 @@ #include "tagger_output_processor.h" +#include +#include -TaggerOutputProcessor::TaggerOutputProcessor() { - sn = 0; - LtLocale::tryToSetLocale(); -} - -TaggerOutputProcessor::~TaggerOutputProcessor() { - -} - -int TaggerOutputProcessor::find(vector xs, wstring x) { +int TaggerOutputProcessor::find(vector xs, UString x) { for (size_t i = 0; i < xs.size(); ++i) { if (xs[i] == x) return i; @@ -17,21 +10,21 @@ int TaggerOutputProcessor::find(vector xs, wstring x) { return -1; } -TaggerToken TaggerOutputProcessor::parseTaggerToken(wstring str) { +TaggerToken TaggerOutputProcessor::parseTaggerToken(UString str) { TaggerToken token; int state = 0; // lemma; - wstring buffer; + UString buffer; for (auto& c : str) { - if(c == L'<' && state == 0) { + if(c == '<' && state == 0) { state = 1; token.lemma = buffer; buffer.clear(); } - if (c == L'>') { + if (c == '>') { token.tags.push_back(buffer); buffer.clear(); - } else if (c != L'<') { + } else if (c != '<') { buffer += c; } } @@ -41,10 +34,10 @@ TaggerToken TaggerOutputProcessor::parseTaggerToken(wstring str) { return token; } -vector TaggerOutputProcessor::parseTags(wstring token) { +vector TaggerOutputProcessor::parseTags(UString token) { int state = 0; // outside - vector tags; - wstring buffer; + vector tags; + UString buffer; for (auto& c : token) { if (state == 0) { if (c == '<') { @@ -53,7 +46,7 @@ vector TaggerOutputProcessor::parseTags(wstring token) { } else if (state == 1) { if (c == '>') { tags.push_back(buffer); - buffer = L""; + buffer.clear(); state = 0; } else { buffer += c; @@ -63,26 +56,26 @@ vector TaggerOutputProcessor::parseTags(wstring token) { return tags; } -vector TaggerOutputProcessor::wsplit(wstring wstr, wchar_t delim) { - vector tokens; - wstring buffer; +vector TaggerOutputProcessor::wsplit(UString wstr, UChar delim) { + vector tokens; + UString buffer; for(size_t i = 0; i < wstr.size(); ++i) { - if(wstr[i] == delim && (i == 0 || wstr[i-1] != L'\\')) { + if(wstr[i] == delim && (i == 0 || wstr[i-1] != '\\')) { tokens.push_back(buffer); - buffer = L""; + buffer.clear(); } else { buffer += wstr[i]; } } - if(buffer != L"") { + if(!buffer.empty()) { tokens.push_back(buffer); } return tokens; } -wstring TaggerOutputProcessor::getLemma(wstring token) { - wstring buffer; +UString TaggerOutputProcessor::getLemma(UString token) { + UString buffer; for (auto& c : token) { if(c != '<') { buffer += c; @@ -94,47 +87,19 @@ wstring TaggerOutputProcessor::getLemma(wstring token) { } void TaggerOutputProcessor::processTaggerOutput(bool nullFlush) { - wstring buffer; vector sentence; - bool escaped = false; - int state = 0; // outside - wchar_t c; - while((c = fgetwc(stdin))) { - if (c == -1) { - break; - } + UChar32 c; + InputFile in; + while (!in.eof()) { + c = in.get(); - if (nullFlush && c == L'\0') { + if ((c == '\n') || (nullFlush && c == '\0')) { processSentence(sentence); sentence.clear(); - buffer.clear(); - } - - if(c == L'\n') { - processSentence(sentence); - sentence.clear(); - buffer.clear(); - } - if (state == 0) { - if (c == '^' && !escaped) { - state = 1; // inside - } else if (c == '\\' && !escaped) { - escaped = true; - } else { - escaped = false; - } - } else if (state == 1) { - if(c == L'$' && !escaped) { - sentence.push_back(parseTaggerToken(buffer)); - buffer = L""; - state = 0; - } else if (c == '\\' && !escaped) { - escaped = true; - buffer += c; - } else { - buffer += c; - escaped = false; - } + } else if (c == '\\') { + in.get(); + } else if (c == '^') { + sentence.push_back(parseTaggerToken(in.readBlock('^', '$'))); } } } diff --git a/src/tagger_output_processor.h b/src/tagger_output_processor.h index 40c00ad..0219ccf 100644 --- a/src/tagger_output_processor.h +++ b/src/tagger_output_processor.h @@ -2,54 +2,46 @@ #define TAGGER_OUTPUT_PROCESSOR #include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include +#include using namespace std; class TaggerToken { public: - wstring lemma; - vector tags; - wstring toString(bool delimiters) { - wstring out = lemma; - for (auto& tag : tags) { - out += L"<" + tag + L">"; - } - if (delimiters) { - out = L"^" + out + L"$"; - } - return out; - } + UString lemma; + vector tags; + UString toString(bool delimiters) { + UString out; + if (delimiters) { + out += '^'; + } + out.append(lemma); + for (auto& tag : tags) { + out += '<'; + out.append(tag); + out += '>'; + } + if (delimiters) { + out += '$'; + } + return out; + } }; class TaggerOutputProcessor { protected: - int sn; + int sn = 0; - vector parseTags(wstring token); - vector wsplit(wstring wstr, wchar_t delim); - TaggerToken parseTaggerToken(wstring buffer); + vector parseTags(UString token); + vector wsplit(UString wstr, UChar delim); + TaggerToken parseTaggerToken(UString buffer); - int find(vector xs, wstring x); - wstring getLemma(wstring token); + int find(vector xs, UString x); + UString getLemma(UString token); virtual void processSentence(vector) =0; public: - TaggerOutputProcessor(); - ~TaggerOutputProcessor(); - void processTaggerOutput(bool nullFlush=false); - }; #endif diff --git a/src/yasmet.cc b/src/yasmet.cc index d203555..ae25a2e 100644 --- a/src/yasmet.cc +++ b/src/yasmet.cc @@ -1,4 +1,3 @@ -#include #include #include #include