commit b74306d181734ee6ba92493a227ff8aeee334a68 Author: Daniel Swanson Date: Wed Jun 30 08:55:14 2021 -0500 use ICU (#74) ICU changes - convert `std::wstring` to `UString` and `wchar_t` to `UChar` - note: if there exist compiled files with non-BMP string literals, this might be a breaking change, but I don't think any such files exist - use `lttoolbox/input_file.h` and ICU `UFILE*` for stream I/O - rely on shared case functions efficiency, readability, and code style changes - move constant initializers to class headers - prefer `str.empty()` to `str == ""` - prefer range-for loops - remove unused `#include`s - make `die()` and `warn()` printf-like in `trx_compiler.cc` helper function and dependency changes - use apertium's `apertium_re` wrapper rather than maintaining a separate copy - move regex optimization code to apertium to share it with t*x - use XML iterators from `lttoolbox/xml_walk_util.h` in `trx_compiler.cc` diff --git a/configure.ac b/configure.ac index 9ac4f1d..58e2008 100644 --- a/configure.ac +++ b/configure.ac @@ -1,10 +1,10 @@ AC_PREREQ(2.61) m4_define([required_libxml_version], [2.6.17]) -m4_define([required_apertium_version], [3.7.0]) -m4_define([required_lttoolbox_version], [3.5.3]) +m4_define([required_apertium_version], [3.8.0]) +m4_define([required_lttoolbox_version], [3.6.0]) -AC_INIT([apertium-recursive], [1.0.1], [awesomeevildudes@gmail.com]) +AC_INIT([apertium-recursive], [1.1.0], [awesomeevildudes@gmail.com]) AM_INIT_AUTOMAKE AC_CONFIG_HEADER([src/auto_config.h]) AC_CONFIG_MACRO_DIR([m4]) @@ -38,17 +38,22 @@ PKG_CHECK_MODULES([LIBXML], [libxml-2.0 >= required_libxml_version]) AC_SUBST(LIBXML_CFLAGS) AC_SUBST(LIBXML_LIBS) -PKG_CHECK_MODULES(PCRE, [libpcre >= 6.4]) +PKG_CHECK_MODULES([ICU], [icu-i18n, icu-io, icu-uc]) + +AC_SUBST(ICU_CFLAGS) +AC_SUBST(ICU_LIBS) # Checks for libraries. AC_CHECK_LIB(xml2, xmlReaderForFile) AC_CHECK_FUNCS([setlocale strdup getopt_long]) -AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, getopt_long, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked]) +AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) + +AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, getopt_long]) -CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS $PCRE_CFLAGS" -LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS $PCRE_LIBS" +CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS" +LIBS="$LIBS $LTTOOLBOX_LIBS $APERTIUM_LIBS $LIBXML_LIBS $ICU_LIBS" # Checks for highest supported C++ standard AC_LANG(C++) diff --git a/src/Makefile.am b/src/Makefile.am index c5b81de..1293423 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2,9 +2,9 @@ AM_LDFLAGS=$(LIBS) bin_PROGRAMS = rtx-comp rtx-proc rtx-decomp random-path -rtx_comp_SOURCES = rtx_comp.cc rtx_compiler.cc trx_compiler.cc pattern.cc apertium_re.cc +rtx_comp_SOURCES = rtx_comp.cc rtx_compiler.cc trx_compiler.cc pattern.cc -rtx_proc_SOURCES = rtx_proc.cc rtx_processor.cc apertium_re.cc chunk.cc +rtx_proc_SOURCES = rtx_proc.cc rtx_processor.cc chunk.cc rtx_decomp_SOURCES = rtx_decomp.cc diff --git a/src/apertium_re.cc b/src/apertium_re.cc deleted file mode 100644 index fa22667..0000000 --- a/src/apertium_re.cc +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ -#include -#include -#include -#include -#include -#include - -using namespace Apertium; -using namespace std; - -ApertiumRE::ApertiumRE() : -re(0) -{ - empty = true; -} - -ApertiumRE::~ApertiumRE() -{ - if(!empty) - { - pcre_free(re); - } - empty = true; -} - -void -ApertiumRE::read(FILE *input) -{ - unsigned int size = Compression::multibyte_read(input); - re = static_cast(pcre_malloc(size)); - if(size != fread(re, 1, size, input)) - { - wcerr << L"Error reading regexp" << endl; - exit(EXIT_FAILURE); - } - - empty = false; -} - -void -ApertiumRE::compile(string const &str) -{ - const char *error; - int erroroffset; - re = pcre_compile(str.c_str(), PCRE_DOTALL|PCRE_EXTENDED|PCRE_UTF8, - &error, &erroroffset, NULL); - if(re == NULL) - { - wcerr << L"Error: pcre_compile "; - wcerr << error << endl; - exit(EXIT_FAILURE); - } - - empty = false; -} - -void -ApertiumRE::write(FILE *output) const -{ - if(empty) - { - wcerr << L"Error, cannot write empty regexp" << endl; - exit(EXIT_FAILURE); - } - - size_t size; - int rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size); - if(rc < 0) - { - wcerr << L"Error calling pcre_fullinfo()\n" << endl; - exit(EXIT_FAILURE); - } - - Compression::multibyte_write(size, output); - - size_t rc2 = fwrite(re, 1, size, output); - if(rc2 != size) - { - wcerr << L"Error writing precompiled regex\n" << endl; - exit(EXIT_FAILURE); - } -} - -string -ApertiumRE::match(string const &str) const -{ - if(empty) - { - return ""; - } - - int result[3]; - int workspace[4096]; -// int rc = pcre_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3); - int rc = pcre_dfa_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3, workspace, 4096); - - if(rc < 0) - { - switch(rc) - { - case PCRE_ERROR_NOMATCH: - return ""; - - default: - wcerr << L"Error: Unknown error matching regexp (code " << rc << L")" << endl; - exit(EXIT_FAILURE); - } - } - - return str.substr(result[0], result[1]-result[0]); -} - -void -ApertiumRE::replace(string &str, string const &value) const -{ - if(empty) - { - return; - } - - int result[3]; - int workspace[4096]; - // int rc = pcre_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3); - int rc = pcre_dfa_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3, workspace, 4096); - if(rc < 0) - { - switch(rc) - { - case PCRE_ERROR_NOMATCH: - return; - - default: - wcerr << L"Error: Unknown error matching regexp (code " << rc << L")" << endl; - exit(EXIT_FAILURE); - } - } - - string res = str.substr(0, result[0]); - res.append(value); - res.append(str.substr(result[1])); - str = res; -} diff --git a/src/apertium_re.h b/src/apertium_re.h deleted file mode 100644 index ee73df3..0000000 --- a/src/apertium_re.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#ifndef _APERTIUM_RE_ -#define _APERTIUM_RE_ - -#include -#include -#include -#include - -using namespace std; - -class ApertiumRE -{ -private: - bool empty; - pcre *re; -public: - ApertiumRE(); - ~ApertiumRE(); - void read(FILE *); - void write(FILE *) const; - string match(string const &str) const; - void replace(string &str, string const &value) const; - void compile(string const &str); -}; - -#endif diff --git a/src/bytecode.h b/src/bytecode.h index f8b738d..65acc92 100644 --- a/src/bytecode.h +++ b/src/bytecode.h @@ -2,104 +2,105 @@ #define __RTXBYTECODE__ #include +#include // Stack Operations -static const wchar_t DROP = L'd'; -static const wchar_t DUP = L'*'; -static const wchar_t OVER = L'o'; -static const wchar_t SWAP = L'w'; +static const UChar DROP = 'd'; +static const UChar DUP = '*'; +static const UChar OVER = 'o'; +static const UChar SWAP = 'w'; // Literals -static const wchar_t STRING = L's'; -static const wchar_t INT = L'i'; -static const wchar_t PUSHFALSE = L'f'; -static const wchar_t PUSHTRUE = L't'; -static const wchar_t PUSHNULL = L'0'; +static const UChar STRING = 's'; +static const UChar INT = 'i'; +static const UChar PUSHFALSE = 'f'; +static const UChar PUSHTRUE = 't'; +static const UChar PUSHNULL = '0'; // Jumps -static const wchar_t JUMP = L'j'; -static const wchar_t JUMPONTRUE = L'J'; -static const wchar_t JUMPONFALSE = L'?'; +static const UChar JUMP = 'j'; +static const UChar JUMPONTRUE = 'J'; +static const UChar JUMPONFALSE = '?'; // Logical Operators -static const wchar_t AND = L'&'; -static const wchar_t OR = L'|'; -static const wchar_t NOT = L'!'; +static const UChar AND = '&'; +static const UChar OR = '|'; +static const UChar NOT = '!'; // String Comparisons -static const wchar_t EQUAL = L'='; -static const wchar_t ISPREFIX = L'('; -static const wchar_t ISSUFFIX = L')'; -static const wchar_t ISSUBSTRING = L'c'; +static const UChar EQUAL = '='; +static const UChar ISPREFIX = '('; +static const UChar ISSUFFIX = ')'; +static const UChar ISSUBSTRING = 'c'; // Caseless String Comparisons -static const wchar_t EQUALCL = L'q'; -static const wchar_t ISPREFIXCL = L'p'; -static const wchar_t ISSUFFIXCL = L'u'; -static const wchar_t ISSUBSTRINGCL = L'r'; +static const UChar EQUALCL = 'q'; +static const UChar ISPREFIXCL = 'p'; +static const UChar ISSUFFIXCL = 'u'; +static const UChar ISSUBSTRINGCL = 'r'; // List Comparisons -static const wchar_t HASPREFIX = L'['; -static const wchar_t HASSUFFIX = L']'; -static const wchar_t IN = L'n'; +static const UChar HASPREFIX = '['; +static const UChar HASSUFFIX = ']'; +static const UChar IN = 'n'; // Caseless List Comparisons -static const wchar_t HASPREFIXCL = L'{'; -static const wchar_t HASSUFFIXCL = L'}'; -static const wchar_t INCL = L'N'; +static const UChar HASPREFIXCL = '{'; +static const UChar HASSUFFIXCL = '}'; +static const UChar INCL = 'N'; // Case Operations -static const wchar_t GETCASE = L'a'; -static const wchar_t SETCASE = L'A'; +static const UChar GETCASE = 'a'; +static const UChar SETCASE = 'A'; // Variables -static const wchar_t FETCHVAR = L'v'; -static const wchar_t SETVAR = L'$'; -static const wchar_t FETCHCHUNK = L'5'; -static const wchar_t SETCHUNK = L'6'; +static const UChar FETCHVAR = 'v'; +static const UChar SETVAR = '$'; +static const UChar FETCHCHUNK = '5'; +static const UChar SETCHUNK = '6'; // Clips -static const wchar_t SOURCECLIP = L'S'; -static const wchar_t TARGETCLIP = L'T'; -static const wchar_t REFERENCECLIP = L'R'; -static const wchar_t SETCLIP = L'>'; +static const UChar SOURCECLIP = 'S'; +static const UChar TARGETCLIP = 'T'; +static const UChar REFERENCECLIP = 'R'; +static const UChar SETCLIP = '>'; // Chunks -static const wchar_t CHUNK = L'C'; -static const wchar_t APPENDCHILD = L'1'; -static const wchar_t APPENDSURFACE = L'2'; -static const wchar_t APPENDALLCHILDREN = L'3'; -static const wchar_t APPENDALLINPUT = L'4'; -static const wchar_t PUSHINPUT = L'7'; -static const wchar_t APPENDSURFACESL = L'8'; -static const wchar_t APPENDSURFACEREF = L'9'; +static const UChar CHUNK = 'C'; +static const UChar APPENDCHILD = '1'; +static const UChar APPENDSURFACE = '2'; +static const UChar APPENDALLCHILDREN = '3'; +static const UChar APPENDALLINPUT = '4'; +static const UChar PUSHINPUT = '7'; +static const UChar APPENDSURFACESL = '8'; +static const UChar APPENDSURFACEREF = '9'; // Output -static const wchar_t OUTPUT = L'<'; -static const wchar_t BLANK = L'b'; -static const wchar_t OUTPUTALL = L'@'; -static const wchar_t CONJOIN = L'+'; +static const UChar OUTPUT = '<'; +static const UChar BLANK = 'b'; +static const UChar OUTPUTALL = '@'; +static const UChar CONJOIN = '+'; // Other -static const wchar_t CONCAT = L'-'; -static const wchar_t REJECTRULE = L'X'; -static const wchar_t DISTAG = L'D'; -static const wchar_t GETRULE = L'^'; -static const wchar_t SETRULE = L'%'; -static const wchar_t LUCOUNT = L'#'; +static const UChar CONCAT = '-'; +static const UChar REJECTRULE = 'X'; +static const UChar DISTAG = 'D'; +static const UChar GETRULE = '^'; +static const UChar SETRULE = '%'; +static const UChar LUCOUNT = '#'; #endif diff --git a/src/chunk.cc b/src/chunk.cc index 135d4db..8c7a8c2 100644 --- a/src/chunk.cc +++ b/src/chunk.cc @@ -1,12 +1,11 @@ #include #include -#include -#include +#include #include -wstring -combineWblanks(wstring wblank_current, wstring wblank_to_add) +UString +combineWblanks(UString wblank_current, UString wblank_to_add) { if(wblank_current.empty() && wblank_to_add.empty()) { @@ -21,8 +20,8 @@ combineWblanks(wstring wblank_current, wstring wblank_to_add) return wblank_current; } - wstring new_out_wblank; - for(wstring::const_iterator it = wblank_current.begin(); it != wblank_current.end(); it++) + UString new_out_wblank; + for(UString::const_iterator it = wblank_current.begin(); it != wblank_current.end(); it++) { if(*it == '\\') { @@ -44,7 +43,7 @@ combineWblanks(wstring wblank_current, wstring wblank_to_add) } } - for(wstring::const_iterator it = wblank_to_add.begin(); it != wblank_to_add.end(); it++) + for(UString::const_iterator it = wblank_to_add.begin(); it != wblank_to_add.end(); it++) { if(*it == '\\') { @@ -69,67 +68,48 @@ combineWblanks(wstring wblank_current, wstring wblank_to_add) return new_out_wblank; } -wstring +UString Chunk::chunkPart(ApertiumRE const &part, const ClipType side) { - string chunk; switch(side) { case SourceClip: - chunk = UtfConverter::toUtf8(source); + return part.match(source); break; case TargetClip: - chunk = UtfConverter::toUtf8(target); + return part.match(target); break; case ReferenceClip: - chunk = UtfConverter::toUtf8(coref); + return part.match(coref); break; } - string result = part.match(chunk); - if(result.size() == 0) - { - return wstring(L""); - } - else - { - return UtfConverter::fromUtf8(result); - } + return ""_u; } void -Chunk::setChunkPart(ApertiumRE const &part, wstring const &value) +Chunk::setChunkPart(ApertiumRE const &part, UString const &value) { - string surf = UtfConverter::toUtf8(target); - if(part.match(surf).size() == 0) - { - //target += value; - } - else - { - string val = UtfConverter::toUtf8(value); - part.replace(surf, val); - target = UtfConverter::fromUtf8(surf); - } + part.replace(target, value); } -vector -Chunk::getTags(const vector& parentTags) +vector +Chunk::getTags(const vector& parentTags) { unsigned int last = 0; - vector ret; + vector ret; for(unsigned int i = 0, limit = target.size(); i < limit; i++) { - if(target[i] == L'<') + if(target[i] == '<') { last = i; bool isNum = true; for(unsigned int j = i+1; j < limit; j++) { - if(target[j] == L'>') + if(target[j] == '>') { if(isNum) { - unsigned int n = stoul(target.substr(last+1, j-last-1)); + unsigned int n = StringUtils::stoi(target.substr(last+1, j-last-1)); if(n != 0 && n <= parentTags.size()) { ret.push_back(parentTags[n-1]); @@ -137,7 +117,7 @@ Chunk::getTags(const vector& parentTags) break; } } - wstring tag = target.substr(last, j-last+1); + UString tag = target.substr(last, j-last+1); ret.push_back(tag); last = j+1; break; @@ -148,7 +128,7 @@ Chunk::getTags(const vector& parentTags) } } } - else if(target[i] == L'\\') + else if(target[i] == '\\') { i++; } @@ -157,27 +137,27 @@ Chunk::getTags(const vector& parentTags) } void -Chunk::updateTags(const vector& parentTags) +Chunk::updateTags(const vector& parentTags) { if(isBlank) return; unsigned int last = 0; - wstring result; + UString result; result.reserve(target.size() + (2*parentTags.size())); // a rough estimate - works if most number tags are 1 digit and most new tags are 3 chars or less for(unsigned int i = 0, limit = target.size(); i < limit; i++) { - if(target[i] == L'<') + if(target[i] == '<') { result += target.substr(last, i-last); last = i; bool isNum = true; for(unsigned int j = i+1; j < limit; j++) { - if(target[j] == L'>') + if(target[j] == '>') { if(isNum) { - unsigned int n = stoul(target.substr(last+1, j-last-1)); + unsigned int n = StringUtils::stoi(target.substr(last+1, j-last-1)); if(n != 0 && n <= parentTags.size()) { result += parentTags[n-1]; @@ -196,7 +176,7 @@ Chunk::updateTags(const vector& parentTags) } } } - else if(target[i] == L'\\') + else if(target[i] == '\\') { i++; } @@ -209,11 +189,11 @@ Chunk::updateTags(const vector& parentTags) } void -Chunk::output(const vector& parentTags, FILE* out = NULL) +Chunk::output(const vector& parentTags, UFILE* out = NULL) { if(contents.size() > 0) { - vector tags = getTags(parentTags); + vector tags = getTags(parentTags); for(unsigned int i = 0; i < contents.size(); i++) { contents[i]->output(tags, out); @@ -223,11 +203,11 @@ Chunk::output(const vector& parentTags, FILE* out = NULL) { if(out == NULL) { - cout << UtfConverter::toUtf8(target); + cout << target; } else { - fputs_unlocked(UtfConverter::toUtf8(target).c_str(), out); + write(target, out); } } else @@ -238,29 +218,26 @@ Chunk::output(const vector& parentTags, FILE* out = NULL) } else if(out == NULL) { - cout << UtfConverter::toUtf8(wblank); + cout << wblank; cout << "^"; - cout << UtfConverter::toUtf8(target); + cout << target; cout << "$"; } else { - fputs_unlocked(UtfConverter::toUtf8(wblank).c_str(), out); - fputc_unlocked('^', out); - fputs_unlocked(UtfConverter::toUtf8(target).c_str(), out); - fputc_unlocked('$', out); + u_fprintf(out, "%S^%S$", wblank.c_str(), target.c_str()); } } } void -Chunk::output(FILE* out) +Chunk::output(UFILE* out) { - vector tags; + vector tags; output(tags, out); } -wstring +UString Chunk::matchSurface() { if(contents.size() == 0) @@ -282,22 +259,22 @@ Chunk::conjoin(Chunk* other) unsigned int lemq_loc = 0; for(; lemq_loc < target.size(); lemq_loc++) { - if(target[lemq_loc] == L'\\') + if(target[lemq_loc] == '\\') { lemq_loc++; continue; } - else if(target[lemq_loc] == L'#') + else if(target[lemq_loc] == '#') { break; } } - target.insert(lemq_loc, L"+" + other->target); + target.insert(lemq_loc, "+"_u + other->target); wblank = combineWblanks(other->wblank, wblank); } void -Chunk::writeTree(TreeMode mode, FILE* out) +Chunk::writeTree(TreeMode mode, UFILE* out) { switch(mode) { @@ -305,21 +282,21 @@ Chunk::writeTree(TreeMode mode, FILE* out) case TreeModeNest: writeTreePlain(out, 0); break; case TreeModeLatex: if(isBlank) return; - writeString(L"\\begin{forest}\n%where n children=0{tier=word}{}\n", out); - writeString(L"% Uncomment the preceding line to make the LUs bottom-aligned.\n", out); + writeString("\\begin{forest}\n%where n children=0{tier=word}{}\n"_u, out); + writeString("% Uncomment the preceding line to make the LUs bottom-aligned.\n"_u, out); writeTreeLatex(out); - writeString(L"\n\\end{forest}\n", out); + writeString("\n\\end{forest}\n"_u, out); break; case TreeModeDot: if(isBlank) return; - writeString(L"digraph {", out); + writeString("digraph {"_u, out); writeTreeDot(out); - writeString(L"}\n", out); + writeString("}\n"_u, out); break; case TreeModeBox: { if(isBlank) return; - vector> tree = writeTreeBox(); + vector> tree = writeTreeBox(); if(tree.size() == 0) return; unsigned int tr = 4, sl = 12, st = 11, tl = 12, tt = 11, rl = 0, rt = 0; for(unsigned int i = 0; i < tree.size(); i++) @@ -335,56 +312,57 @@ Chunk::writeTree(TreeMode mode, FILE* out) bool doCoref = (rl > 0 || rt > 0); if(doCoref && rl < 17) rl = 17; if(doCoref && rt < 16) rt = 16; - writeString(L"Tree" + wstring(tr-3, L' '), out); - writeString(L"Source Lemma" + wstring(sl - 11, L' '), out); - writeString(L"Source Tags" + wstring(st - 10, L' '), out); - writeString(L"Target Lemma" + wstring(tl - 11, L' '), out); - writeString(L"Target Tags" + wstring(tt - 10, L' '), out); + writeString("Tree"_u + UString(tr-3, ' '), out); + writeString("Source Lemma"_u + UString(sl - 11, ' '), out); + writeString("Source Tags"_u + UString(st - 10, ' '), out); + writeString("Target Lemma"_u + UString(tl - 11, ' '), out); + writeString("Target Tags"_u + UString(tt - 10, ' '), out); if(doCoref) { - writeString(L"Coreference Lemma" + wstring(rl - 16, L' '), out); - writeString(L"Coreference Tags", out); - if(rt > 16) writeString(wstring(rt - 16, L' '), out); + writeString("Coreference Lemma"_u + UString(rl - 16, ' '), out); + writeString("Coreference Tags"_u, out); + if(rt > 16) writeString(UString(rt - 16, ' '), out); } - writeString(L"\n", out); - writeString(wstring(tr, L'─') + L" ", out); - writeString(wstring(sl, L'─') + L" ", out); - writeString(wstring(st, L'─') + L" ", out); - writeString(wstring(tl, L'─') + L" ", out); - writeString(wstring(tt, L'─'), out); - if(doCoref) writeString(L" " + wstring(rl, L'─'), out); - if(doCoref) writeString(L" " + wstring(rt, L'─'), out); - writeString(L"\n", out); + writeString("\n"_u, out); + UChar dash = u'\u2500'; // '─' + writeString(UString(tr, dash) + " "_u, out); + writeString(UString(sl, dash) + " "_u, out); + writeString(UString(st, dash) + " "_u, out); + writeString(UString(tl, dash) + " "_u, out); + writeString(UString(tt, dash), out); + if(doCoref) writeString(" "_u + UString(rl, dash), out); + if(doCoref) writeString(" "_u + UString(rt, dash), out); + writeString("\n"_u, out); for(unsigned int i = 0; i < tree.size(); i++) { - writeString(wstring(tr - tree[i][0].size(), L' ') + tree[i][0] + L" ", out); - writeString(tree[i][1] + wstring(sl - tree[i][1].size() + 1, L' '), out); - writeString(tree[i][2] + wstring(st - tree[i][2].size() + 1, L' '), out); - writeString(tree[i][3] + wstring(tl - tree[i][3].size() + 1, L' '), out); - writeString(tree[i][4] + wstring(tt - tree[i][4].size(), L' '), out); + writeString(UString(tr - tree[i][0].size(), ' ') + tree[i][0] + " "_u, out); + writeString(tree[i][1] + UString(sl - tree[i][1].size() + 1, ' '), out); + writeString(tree[i][2] + UString(st - tree[i][2].size() + 1, ' '), out); + writeString(tree[i][3] + UString(tl - tree[i][3].size() + 1, ' '), out); + writeString(tree[i][4] + UString(tt - tree[i][4].size(), ' '), out); if(doCoref) { - writeString(L" " + tree[i][5] + wstring(rl - tree[i][5].size(), L' '), out); - writeString(L" " + tree[i][6], out); + writeString(" "_u + tree[i][5] + UString(rl - tree[i][5].size(), ' '), out); + writeString(" "_u + tree[i][6], out); } - writeString(L"\n", out); + writeString("\n"_u, out); } - writeString(L"\n", out); + writeString("\n"_u, out); } break; default: - wcerr << L"That tree mode has not yet been implemented." << endl; + wcerr << "That tree mode has not yet been implemented." << endl; } } -pair -Chunk::chopString(wstring s) +pair +Chunk::chopString(UString s) { - wstring lem; - wstring tags; + UString lem; + UString tags; for(unsigned int i = 0; i < s.size(); i++) { - if(s[i] == L'<') + if(s[i] == '<') { lem = s.substr(0, i); tags = s.substr(i+1, s.size()-i-2); @@ -395,24 +373,24 @@ Chunk::chopString(wstring s) { lem = s; } - return make_pair(lem, StringUtils::substitute(tags, L"><", L".")); + return make_pair(lem, StringUtils::substitute(tags, "><"_u, "."_u)); } void -Chunk::writeString(wstring s, FILE* out) +Chunk::writeString(UString s, UFILE* out) { - if(out == NULL) wcerr << s; - else fputs_unlocked(UtfConverter::toUtf8(s).c_str(), out); + if(out == NULL) cerr << s; + else write(s, out); } void -Chunk::writeTreePlain(FILE* out, int depth) +Chunk::writeTreePlain(UFILE* out, int depth) { if(depth >= 0 && isBlank) return; - wstring base; + UString base; for(int i = 0; i < depth; i++) { - base += L'\t'; + base += '\t'; } if(!isBlank) { @@ -420,21 +398,21 @@ Chunk::writeTreePlain(FILE* out, int depth) { base += wblank; } - base += L"^"; + base += '^'; } if(source.size() > 0) { - base += source + L"/"; + base += source + "/"_u; } base += target; if(coref.size() > 0) { - base += L"/" + coref; + base += "/"_u + coref; } writeString(base, out); if(contents.size() > 0) { - writeString((depth == -1) ? L"{" : L"{\n", out); + writeString((depth == -1) ? "{"_u : "{\n"_u, out); int newdepth = (depth == -1) ? -1 : depth + 1; for(unsigned int i = 0; i < contents.size(); i++) { @@ -442,111 +420,114 @@ Chunk::writeTreePlain(FILE* out, int depth) } for(int i = 0; i < depth; i++) { - writeString(L"\t", out); + writeString("\t"_u, out); } - writeString(L"}", out); + writeString("}"_u, out); } - if(!isBlank) writeString(L"$", out); - if(depth != -1) writeString(L"\n", out); + if(!isBlank) writeString("$"_u, out); + if(depth != -1) writeString("\n"_u, out); } void -Chunk::writeTreeLatex(FILE* out) +Chunk::writeTreeLatex(UFILE* out) { if(isBlank) return; - wstring nl = L" \\\\ "; - wstring base; - pair p; + UString nl = " \\\\ "_u; + UString base; + pair p; if(source.size() > 0) { p = chopString(source); - base += L"\\textbf{" + p.first + L"}" + nl + L"\\texttt{" + p.second + L"}" + nl; + base += "\\textbf{"_u + p.first + "}"_u + nl + "\\texttt{"_u + p.second + "}"_u + nl; } p = chopString(target); if(contents.size() == 0) { - base += L"\\textit{" + p.first + L"}" + nl + L"\\texttt{" + p.second + L"}"; + base += "\\textit{"_u + p.first + "}"_u + nl + "\\texttt{"_u + p.second + "}"_u; } else { unsigned int i = 0; for(; i < p.second.size(); i++) { - if(p.second[i] == L'.') break; + if(p.second[i] == '.') break; } if(i < p.second.size()) { - base += p.second.substr(0, i) + nl + L"\\textit{" + p.first + L"}"; - base += nl + L"\\texttt{" + p.second.substr(i+1) + L"}"; + base += p.second.substr(0, i) + nl + "\\textit{"_u + p.first + "}"_u; + base += nl + "\\texttt{"_u + p.second.substr(i+1) + "}"_u; } else { - base += p.second + nl + L"\\textit{" + p.first + L"}"; + base += p.second + nl + "\\textit{"_u + p.first + "}"_u; } } if(coref.size() > 0) { p = chopString(coref); - base += nl + L"\\textit{" + p.first + L"}" + nl + L"\\texttt{" + p.second + L"}"; + base += nl + "\\textit{"_u + p.first + "}"_u + nl + "\\texttt{"_u + p.second + "}"_u; } - base = L"[{ \\begin{tabular}{c} " + base + L" \\end{tabular} } "; - base = StringUtils::substitute(base, L"_", L"\\_"); + base = "[{ \\begin{tabular}{c} "_u + base + " \\end{tabular} } "_u; + base = StringUtils::substitute(base, "_"_u, "\\_"_u); writeString(base, out); for(unsigned int i = 0; i < contents.size(); i++) contents[i]->writeTreeLatex(out); - writeString(L" ]", out); + writeString(" ]"_u, out); } -wstring -Chunk::writeTreeDot(FILE* out) +UString +Chunk::writeTreeDot(UFILE* out) { - if(isBlank) return L""; + if(isBlank) return ""_u; static int nodeId = 0; nodeId++; - wstring name = L"n" + to_wstring(nodeId); - wstring node = name + L" \\[label=\""; + UString name = "n"_u + StringUtils::itoa(nodeId); + UString node = name; + node += " \\[label=\""_u; if(source.size() > 0) { - node += source + L"\\\\n"; + node += source; + node += "\\\\n"_u; } node += target; if(coref.size() > 0) { - node += L"\\\\n" + coref; + node += "\\\\n"_u; + node += coref; } - node += L"\"\\];"; + node += "\"\\];"_u; writeString(node, out); for(unsigned int i = 0; i < contents.size(); i++) { - wstring kid = contents[i]->writeTreeDot(out); - if(kid.size() > 0) writeString(name + L" -> " + kid + L";", out); + UString kid = contents[i]->writeTreeDot(out); + if(kid.size() > 0) writeString(name + " -> "_u + kid + ";"_u, out); } return name; } -vector> +vector> Chunk::writeTreeBox() { if(contents.size() == 0) { - vector ret; + vector ret; ret.resize(7); - pair p = chopString(source); + pair p = chopString(source); ret[1] = p.first; ret[2] = p.second; p = chopString(target); ret[3] = p.first; ret[4] = p.second; p = chopString(coref); ret[5] = p.first; ret[6] = p.second; - return vector>(1, ret); + return vector>(1, ret); } else { vector> bounds; - vector> tree; + vector> tree; for(unsigned int i = 0; i < contents.size(); i++) { if(!contents[i]->isBlank) { - vector> temp = contents[i]->writeTreeBox(); + vector> temp = contents[i]->writeTreeBox(); tree.insert(tree.end(), temp.begin(), temp.end()); if(temp.size() == 1) { @@ -556,8 +537,8 @@ Chunk::writeTreeBox() int first = -1, last = -1; for(unsigned int j = tree.size() - temp.size(); j < tree.size(); j++) { - if(first == -1 && tree[j][0][0] != L' ') first = j; - else if(first != -1 && last == -1 && tree[j][0][0] == L' ') last = j-1; + if(first == -1 && tree[j][0][0] != ' ') first = j; + else if(first != -1 && last == -1 && tree[j][0][0] == ' ') last = j-1; } first = (first == -1) ? tree.size() - temp.size() : first; last = (last == -1) ? tree.size() - 1 : last; @@ -566,7 +547,7 @@ Chunk::writeTreeBox() } if(tree.size() == 1) { - tree[0][0] = L"─" + tree[0][0]; + tree[0][0] = u'\u2500' + tree[0][0]; // '─' return tree; } unsigned int center = tree.size() / 2; @@ -589,7 +570,7 @@ Chunk::writeTreeBox() unsigned int sz = tree[i][0].size(); if(lines.count(i) == 0) { - tree[i][0] = wstring(len - sz, L' ') + tree[i][0]; + tree[i][0] = UString(len - sz, ' ') + tree[i][0]; } else { @@ -597,24 +578,36 @@ Chunk::writeTreeBox() { switch(tree[i][0][0]) { - case L'│': tree[i][0][0] = L'┤'; break; - case L'├': tree[i][0][0] = L'┼'; break; - case L'┌': tree[i][0][0] = L'┬'; break; - case L'└': tree[i][0][0] = L'┴'; break; - default: break; + case u'\u2502': // '│' + tree[i][0][0] = u'\u2524'; break; // '┤' + case u'\u251c': // '├' + tree[i][0][0] = u'\u253c'; break; // '┼' + case u'\u250c': // '┌' + tree[i][0][0] = u'\u252c'; break; // '┬' + case u'\u2514': // '└' + tree[i][0][0] = u'\u2534'; break; // '┴' + default: break; } } - tree[i][0] = wstring(len - sz, L'─') + tree[i][0]; + tree[i][0] = UString(len - sz, u'\u2500') + tree[i][0]; // '─' } - if(i < firstLine || i > lastLine) tree[i][0] = L' ' + tree[i][0]; - else if(i == firstLine && i == lastLine) tree[i][0] = L'─' + tree[i][0]; - else if(i == firstLine) tree[i][0] = L'┌' + tree[i][0]; - else if(i > firstLine && i < lastLine) - { - if(lines.count(i) == 0) tree[i][0] = L'│' + tree[i][0]; - else tree[i][0] = L'├' + tree[i][0]; + UChar prefix = ' '; + if (i > firstLine && i < lastLine) { + if (lines.count(i) == 0) { + prefix = u'\u2502'; // '│' + } else { + prefix = u'\u251c'; // '├' + } + } else if (i == firstLine) { + if (i == lastLine) { + prefix = u'\u2500'; // '─' + } else { + prefix = u'\u250c'; // '┌' + } + } else if (i == lastLine) { + prefix = u'\u2514'; // '└' } - else if(i == lastLine) tree[i][0] = L'└' + tree[i][0]; + tree[i][0] = prefix + tree[i][0]; } return tree; } diff --git a/src/chunk.h b/src/chunk.h index 214dc8d..9e6f6d0 100644 --- a/src/chunk.h +++ b/src/chunk.h @@ -2,8 +2,7 @@ #define __RTXCHUNK__ #include -#include -#include +#include #include #include @@ -28,10 +27,10 @@ enum TreeMode class Chunk { public: - wstring source; - wstring target; - wstring coref; - wstring wblank; + UString source; + UString target; + UString coref; + UString wblank; bool isBlank; bool isJoiner; vector contents; @@ -40,13 +39,13 @@ public: Chunk() : isBlank(false), isJoiner(false), rule(-1) {} - Chunk(wstring blankContent) + Chunk(UString blankContent) : target(blankContent), isBlank(true), isJoiner(false), rule(-1) {} - Chunk(wstring src, wstring dest, wstring cor, wstring wbl) + Chunk(UString src, UString dest, UString cor, UString wbl) : source(src), target(dest), coref(cor), wblank(wbl), isBlank(false), isJoiner(false), rule(-1) {} - Chunk(wstring dest, vector& children, int r = -1) + Chunk(UString dest, vector& children, int r = -1) : target(dest), isBlank(false), isJoiner(false), contents(children), rule(r) {} Chunk(Chunk& other) // copy constructor @@ -100,29 +99,29 @@ public: return ret; } - wstring chunkPart(ApertiumRE const &part, const ClipType side); - void setChunkPart(ApertiumRE const &part, wstring const &value); - vector getTags(const vector& parentTags); - void updateTags(const vector& parentTags); - void output(const vector& parentTags, FILE* out); - void output(FILE* out); - wstring matchSurface(); + UString chunkPart(ApertiumRE const &part, const ClipType side); + void setChunkPart(ApertiumRE const &part, UString const &value); + vector getTags(const vector& parentTags); + void updateTags(const vector& parentTags); + void output(const vector& parentTags, UFILE* out); + void output(UFILE* out); + UString matchSurface(); void appendChild(Chunk* kid); void conjoin(Chunk* other); - void writeTree(TreeMode mode, FILE* out); + void writeTree(TreeMode mode, UFILE* out); private: - static pair chopString(wstring s); - static void writeString(wstring s, FILE* out); - void writeTreePlain(FILE* out, int depth); - void writeTreeLatex(FILE* out); - wstring writeTreeDot(FILE* out); - vector> writeTreeBox(); + static pair chopString(UString s); + static void writeString(UString s, UFILE* out); + void writeTreePlain(UFILE* out, int depth); + void writeTreeLatex(UFILE* out); + UString writeTreeDot(UFILE* out); + vector> writeTreeBox(); }; /** * Combines two wordbound blanks and returns it */ -wstring combineWblanks(wstring wblank_current, wstring wblank_to_add); +UString combineWblanks(UString wblank_current, UString wblank_to_add); #endif diff --git a/src/matcher.h b/src/matcher.h index da69cab..a1c8f78 100644 --- a/src/matcher.h +++ b/src/matcher.h @@ -6,6 +6,7 @@ #include #include #include +#include using namespace std; @@ -134,9 +135,9 @@ public: initial = t.getInitial(); - any_char = (*a)(L""); - any_tag = (*a)(L""); - lookahead = (*a)(L""); + any_char = (*a)(""_u); + any_tag = (*a)(""_u); + lookahead = (*a)(""_u); prematchIdx = 0; } @@ -173,26 +174,26 @@ public: } void matchBlank(int* state, int& first, int& last) { - step(state, first, last, L' '); + step(state, first, last, ' '); } - void matchChunk(int* state, int& first, int& last, const wstring& ch, bool addInit = true) + void matchChunk(int* state, int& first, int& last, const UString& ch, bool addInit = true) { - step(state, first, last, L'^'); + step(state, first, last, '^'); if(addInit) { - applySymbol(initial, L'^', state, last); + applySymbol(initial, '^', state, last); } for(unsigned int i = 0, limit = ch.size(); i < limit; i++) { switch(ch[i]) { - case L'\\': + case '\\': step(state, first, last, towlower(ch[++i]), any_char); break; - case L'<': + case '<': for(unsigned int j = i+1; j < ch.size(); j++) { - if(ch[j] == L'>') + if(ch[j] == '>') { int symbol = (*alpha)(ch.substr(i, j-i+1)); if(symbol) @@ -213,23 +214,23 @@ public: break; } } - step(state, first, last, L'$'); + step(state, first, last, '$'); } - void prepareChunk(const wstring& chunk) + void prepareChunk(const UString& chunk) { prematchIdx = 0; for(unsigned int i = 0, limit = chunk.size(); i < limit; i++) { switch(chunk[i]) { - case L'\\': + case '\\': prematchAlt[prematchIdx] = any_char; prematch[prematchIdx++] = towlower(chunk[++i]); break; - case L'<': + case '<': for(unsigned int j = i+1; j < chunk.size(); j++) { - if(chunk[j] == L'>') + if(chunk[j] == '>') { int symbol = (*alpha)(chunk.substr(i, j-i+1)); prematchAlt[prematchIdx] = any_tag; @@ -248,8 +249,8 @@ public: } void matchPreparedChunk(int* state, int& first, int& last) { - step(state, first, last, L'^'); - applySymbol(initial, L'^', state, last); + step(state, first, last, '^'); + applySymbol(initial, '^', state, last); for(int i = 0; i < prematchIdx; i++) { if(prematch[i] == any_tag) @@ -261,20 +262,20 @@ public: step(state, first, last, prematch[i], prematchAlt[i]); } } - step(state, first, last, L'$'); + step(state, first, last, '$'); } bool shouldShift(int* state, int first, int last) { for(int i = first; i != last; i = (i+1)%RTXStateSize) { - if(nodes[state[i]].search(L' ') != -1) + if(nodes[state[i]].search(' ') != -1) { return true; } } return false; } - bool shouldShift(int* state, int first, int last, const wstring& chunk) + bool shouldShift(int* state, int first, int last, const UString& chunk) { int local_state[RTXStateSize]; memcpy(local_state, state, RTXStateSize*sizeof(int)); @@ -357,8 +358,8 @@ public: int firstWord; int lastWord; int id; - map stringVars; - map wblankVars; + map stringVars; + map wblankVars; vector chunkVars; ParseNode() : first(0), last(0), firstWord(0), lastWord(0), id(-1) diff --git a/src/pattern.cc b/src/pattern.cc index 32d5312..1e3acd7 100644 --- a/src/pattern.cc +++ b/src/pattern.cc @@ -3,9 +3,9 @@ #include #include -#include -#include -#include +#include +#include +#include #include #include @@ -14,28 +14,26 @@ using namespace std; PatternBuilder::PatternBuilder() { - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - attr_items[L"lem"] = L"^(([^<]|\"\\<\")+)"; - attr_items[L"lemq"] = L"\\#[- _][^<]+"; - attr_items[L"lemh"] = L"^(([^<#]|\"\\<\"|\"\\#\")+)"; - attr_items[L"whole"] = L"(.+)"; - attr_items[L"tags"] = L"((<[^>]+>)+)"; - attr_items[L"chname"] = L"({([^/]+)\\/)"; // includes delimiters { and / !!! - attr_items[L"chcontent"] = L"(\\{.+)"; - attr_items[L"content"] = L"(\\{.+)"; - attr_items[L"pos_tag"] = L"(<[^>]+>)"; - starCanBeEmpty = false; - chunkVarCount = 0; + alphabet.includeSymbol(""_u); + alphabet.includeSymbol(""_u); + alphabet.includeSymbol(""_u); + attr_items["lem"_u] = "^(([^<]|\"\\<\")+)"_u; + attr_items["lemq"_u] = "\\#[- _][^<]+"_u; + attr_items["lemh"_u] = "^(([^<#]|\"\\<\"|\"\\#\")+)"_u; + attr_items["whole"_u] = "(.+)"_u; + attr_items["tags"_u] = "((<[^>]+>)+)"_u; + attr_items["chname"_u] = "(\\{([^/]+)\\/)"_u; // includes delimiters { and / !!! + attr_items["chcontent"_u] = "(\\{.+)"_u; + attr_items["content"_u] = "(\\{.+)"_u; + attr_items["pos_tag"_u] = "(<[^>]+>)"_u; } int -PatternBuilder::insertLemma(int const base, wstring const &lemma) +PatternBuilder::insertLemma(int const base, UString const &lemma) { int retval = base; - static int const any_char = alphabet(L""); - if(lemma == L"") + static int const any_char = alphabet(""_u); + if(lemma.empty()) { retval = transducer.insertSingleTransduction(any_char, retval); transducer.linkStates(retval, retval, any_char); @@ -44,14 +42,14 @@ PatternBuilder::insertLemma(int const base, wstring const &lemma) { for(unsigned int i = 0, limit = lemma.size(); i != limit; i++) { - if(lemma[i] == L'\\') + if(lemma[i] == '\\') { - //retval = transducer.insertSingleTransduction(L'\\', retval); + //retval = transducer.insertSingleTransduction('\\', retval); i++; retval = transducer.insertSingleTransduction(int(lemma[i]), retval); } - else if(lemma[i] == L'*') + else if(lemma[i] == '*') { retval = transducer.insertSingleTransduction(any_char, retval); transducer.linkStates(retval, retval, any_char); @@ -68,13 +66,13 @@ PatternBuilder::insertLemma(int const base, wstring const &lemma) } int -PatternBuilder::insertTags(int const base, const vector& tags) +PatternBuilder::insertTags(int const base, const vector& tags) { int retval = base; - static int const any_tag = alphabet(L""); + static int const any_tag = alphabet(""_u); for(unsigned int i = 0; i < tags.size(); i++) { - if(tags[i] == L"*") + if(tags[i] == "*"_u) { if(!starCanBeEmpty) { @@ -84,10 +82,10 @@ PatternBuilder::insertTags(int const base, const vector& tags) } else { - vector tgs = StringUtils::split_wstring(tags[i], L"."); + vector tgs = StringUtils::split(tags[i], "."_u); for(auto t : tgs) { - wstring tg = L"<" + t + L">"; + UString tg = "<"_u + t + ">"_u; alphabet.includeSymbol(tg); retval = transducer.insertSingleTransduction(alphabet(tg), retval); } @@ -99,106 +97,15 @@ PatternBuilder::insertTags(int const base, const vector& tags) int PatternBuilder::countToFinalSymbol(const int count) { - const wstring count_sym = L""; + const UString count_sym = ""_u; alphabet.includeSymbol(count_sym); const int symbol = alphabet(count_sym); if(count != -1) final_symbols.insert(symbol); return symbol; } -vector -PatternBuilder::buildTrie(vector parts) -{ - vector ret; - vector> p2; - for(auto p : parts) - { - if(p.size() == 0) continue; - bool found = false; - for(unsigned int t = 0; t < p2.size(); t++) - { - if(ret[t]->self == p[0]) - { - p2[t].push_back(p.substr(1)); - found = true; - break; - } - } - if(!found) - { - TrieNode* t = new TrieNode; - t->self = p[0]; - ret.push_back(t); - p2.push_back(vector(1, p.substr(1))); - } - } - for(unsigned int i = 0; i < ret.size(); i++) - { - ret[i]->next = buildTrie(p2[i]); - } - return ret; -} - -wstring -PatternBuilder::unbuildTrie(PatternBuilder::TrieNode* t) -{ - if(t->self == L'\0') return L""; - wstring single; - bool end = false; - vector groups; - int ct = t->next.size(); - for(auto it : t->next) - { - wstring blob = unbuildTrie(it); - if(blob.size() == 0) - { - end = true; - ct--; - } - else if(blob.size() == 1) - { - if(single.size() > 0) ct--; - single += blob; - } - else groups.push_back(blob); - } - wstring ret; - if(t->self == L'#') ret += L'\\'; - ret += t->self; - if(single.size() == 0 && groups.size() == 0) return ret; - if(single.size() > 1) single = L"[" + single + L"]"; - if(ct > 1 || (groups.size() == 1 && end)) ret += L"(?:"; - for(unsigned int i = 0; i < groups.size(); i++) - { - if(i > 0) ret += L"|"; - ret += groups[i]; - } - if(single.size() > 0) - { - if(groups.size() > 0) ret += L"|"; - ret += single; - } - if(ct > 1 || (groups.size() == 1 && end)) ret += L")"; - if(end) ret += L"?"; - return ret; -} - -wstring -PatternBuilder::trie(vector parts) -{ - if(parts.size() == 0) return L""; - for(unsigned int i = 0; i < parts.size(); i++) - { - parts[i] = L"<" + parts[i]; - parts[i] += L'\0'; - } - vector l = buildTrie(parts); - // they all start with L'<', so there will only be 1. - return L"(" + unbuildTrie(l[0]) + L">)"; -} - void -PatternBuilder::addPattern(vector> pat, int rule, double weight, bool isLex) +PatternBuilder::addPattern(const vector>& pat, int rule, double weight, bool isLex) { int state = transducer.getInitial(); for(unsigned int p = 0; p < pat.size(); p++) @@ -214,17 +121,17 @@ PatternBuilder::addPattern(vector> pat, int rule, double if(pe->tags.size() > 0) lookahead[state].push_back(pe->tags[0]); } } - state = transducer.insertSingleTransduction(L' ', state); + state = transducer.insertSingleTransduction(' ', state); } - state = transducer.insertNewSingleTransduction(L'^', state); + state = transducer.insertNewSingleTransduction('^', state); int end = insertLemma(state, pat[p][0]->lemma); end = insertTags(end, pat[p][0]->tags); - end = transducer.insertSingleTransduction(L'$', end); + end = transducer.insertSingleTransduction('$', end); for(unsigned int i = 1; i < pat[p].size(); i++) { int temp = insertLemma(state, pat[p][i]->lemma); temp = insertTags(temp, pat[p][i]->tags); - transducer.linkStates(temp, end, L'$'); + transducer.linkStates(temp, end, '$'); } state = end; } @@ -234,7 +141,7 @@ PatternBuilder::addPattern(vector> pat, int rule, double } void -PatternBuilder::addRule(int rule, double weight, vector> pattern, vector firstChunk, wstring name) +PatternBuilder::addRule(int rule, double weight, const vector>& pattern, const vector& firstChunk, const UString& name) { rules[rule] = make_pair(firstChunk, pattern); addPattern(pattern, rule, weight, false); @@ -255,82 +162,64 @@ PatternBuilder::addRule(int rule, double weight, vector> } void -PatternBuilder::addList(wstring name, set vals) +PatternBuilder::addList(const UString& name, const set& vals) { lists[name] = vals; } void -PatternBuilder::addAttr(wstring name, set vals) +PatternBuilder::addAttr(const UString& name, const set& vals) { - /*wstring pat = L"("; - for(set::iterator it = vals.begin(); it != vals.end(); it++) - { - if(pat.size() > 1) - { - pat += L"|"; - } - pat += L"<" + StringUtils::substitute(*it, L".", L"><") + L">"; - } - pat += L")"; - attr_items[name] = pat;*/ - vector pat; - for(auto it : vals) - { - wstring p = StringUtils::substitute(it, L"\\.", L"<>"); - p = StringUtils::substitute(p, L".", L"><"); - pat.push_back(StringUtils::substitute(p, L"<>", L"\\.")); - } - wstring pt = trie(pat); - //wcerr << name << "\t" << pt << endl; - attr_items[name] = pt; + vector pat; + pat.assign(vals.begin(), vals.end()); + attr_items[name] = optimize_regex(pat); } bool -PatternBuilder::isAttrDefined(wstring name) +PatternBuilder::isAttrDefined(const UString& name) { return attr_items.find(name) != attr_items.end(); } void -PatternBuilder::addVar(wstring name, wstring val) +PatternBuilder::addVar(const UString& name, const UString& val) { variables[name] = val; } -wstring -PatternBuilder::BCstring(const wstring& s) +UString +PatternBuilder::BCstring(const UString& s) { - wstring ret; + UString ret; ret += STRING; - ret += (wchar_t)s.size(); + ret += (UChar)s.size(); ret += s; return ret; } -wstring -PatternBuilder::BCifthenelse(const wstring& cond, const wstring& yes, const wstring& no) +UString +PatternBuilder::BCifthenelse(const UString& cond, const UString& yes, const UString& no) { - wstring ret = cond; + UString ret = cond; if(yes.size() == 0) { ret += JUMPONTRUE; - ret += (wchar_t)no.size(); + ret += (UChar)no.size(); ret += no; } else if(no.size() == 0) { ret += JUMPONFALSE; - ret += (wchar_t)yes.size(); + ret += (UChar)yes.size(); ret += yes; } else { ret += JUMPONFALSE; - ret += (wchar_t)(yes.size() + 2); + ret += (UChar)(yes.size() + 2); ret += yes; ret += JUMP; - ret += (wchar_t)no.size(); + ret += (UChar)no.size(); ret += no; } return ret; @@ -342,11 +231,11 @@ PatternBuilder::buildLookahead() for(auto it : firstSet) { firstSet[it.first].insert(it.first); - vector todo; + vector todo; for(auto op : it.second) todo.push_back(op); while(todo.size() > 0) { - wstring cur = todo.back(); + UString cur = todo.back(); todo.pop_back(); if(cur != it.first && firstSet.find(cur) != firstSet.end()) { @@ -363,9 +252,9 @@ PatternBuilder::buildLookahead() } for(auto it : lookahead) { - int state = transducer.insertSingleTransduction(alphabet(L""), it.first); - state = transducer.insertSingleTransduction(L'^', state); - transducer.linkStates(state, state, alphabet(L"")); + int state = transducer.insertSingleTransduction(alphabet(""_u), it.first); + state = transducer.insertSingleTransduction('^', state); + transducer.linkStates(state, state, alphabet(""_u)); int end = -1; for(auto next : it.second) { @@ -373,19 +262,19 @@ PatternBuilder::buildLookahead() for(auto tag : firstSet[next]) { int temp = state; - if(tag != L"*") + if(tag != "*"_u) { - temp = transducer.insertSingleTransduction(alphabet(L"<" + tag + L">"), temp); + temp = transducer.insertSingleTransduction(alphabet("<"_u + tag + ">"_u), temp); } - transducer.linkStates(temp, temp, alphabet(L"")); + transducer.linkStates(temp, temp, alphabet(""_u)); if(end == -1) { - end = transducer.insertSingleTransduction(L'$', temp); + end = transducer.insertSingleTransduction('$', temp); transducer.setFinal(end); } else { - transducer.linkStates(temp, end, L'$'); + transducer.linkStates(temp, end, '$'); } } } @@ -402,7 +291,7 @@ PatternBuilder::isPrefix(const vector>& rule, const vect for(auto r : rule[i]) { if(r->tags.size() == 0) continue; - else if(r->tags[0] == L"*") + else if(r->tags[0] == "*"_u) { found = true; break; @@ -410,7 +299,7 @@ PatternBuilder::isPrefix(const vector>& rule, const vect for(auto p : prefix[i]) { if(p->tags.size() == 0) continue; - else if(p->tags[0] == L"*" || p->tags[0] == r->tags[0]) + else if(p->tags[0] == "*"_u || p->tags[0] == r->tags[0]) { found = true; break; @@ -430,7 +319,7 @@ PatternBuilder::buildFallback() starCanBeEmpty = true; vector fallback; PatternElement* fall = new PatternElement; - fall->tags.push_back(L"FALL:BACK"); + fall->tags.push_back("FALL:BACK"_u); fallback.push_back(fall); for(auto rule : rules) { @@ -439,13 +328,13 @@ PatternBuilder::buildFallback() { PatternElement* pe = new PatternElement; pe->tags.push_back(tg); - pe->tags.push_back(L"*"); + pe->tags.push_back("*"_u); result.push_back(pe); } vector> resultPat; resultPat.push_back(result); - set patPrefix; - set resultPrefix; + set patPrefix; + set resultPrefix; for(auto rule2 : rules) { if(isPrefix(rule2.second.second, resultPat)) @@ -475,7 +364,7 @@ PatternBuilder::buildFallback() { PatternElement* pe = new PatternElement; pe->tags.push_back(it); - pe->tags.push_back(L"*"); + pe->tags.push_back("*"_u); add.push_back(pe); } resultPat.push_back(add); @@ -492,33 +381,24 @@ PatternBuilder::buildFallback() void PatternBuilder::loadLexFile(const string& fname) { - wifstream lex; - lex.open(fname); - if(!lex.is_open()) - { - wcerr << "Unable to open file " << fname.c_str() << " for reading." << endl; - exit(EXIT_FAILURE); - } - while(!lex.eof()) - { - wstring name; - while(!lex.eof() && lex.peek() != L'\t') name += lex.get(); + InputFile lex; + lex.open_or_exit(fname.c_str()); + while(!lex.eof()) { + UString name; + while(!lex.eof() && lex.peek() != '\t') name += lex.get(); lex.get(); - wstring weight; - while(!lex.eof() && lex.peek() != L'\t') weight += lex.get(); + UString weight; + while(!lex.eof() && lex.peek() != '\t') weight += lex.get(); lex.get(); if(lex.eof()) break; vector> pat; - while(!lex.eof() && lex.peek() != L'\n') - { + while(!lex.eof() && lex.peek() != '\n') { PatternElement* p = new PatternElement; - while(lex.peek() != L'@') p->lemma += towlower(lex.get()); + while(lex.peek() != '@') p->lemma += u_tolower(lex.get()); lex.get(); - wstring tag; - while(lex.peek() != L' ' && lex.peek() != L'\n') - { - if(lex.peek() == L'.') - { + UString tag; + while(lex.peek() != ' ' && lex.peek() != '\n') { + if(lex.peek() == '.') { lex.get(); p->tags.push_back(tag); tag.clear(); @@ -526,29 +406,29 @@ PatternBuilder::loadLexFile(const string& fname) else tag += lex.get(); } p->tags.push_back(tag); - if(lex.peek() == L' ') lex.get(); + if(lex.peek() == ' ') lex.get(); pat.push_back(vector(1, p)); } lex.get(); - lexicalizations[name].push_back(make_pair(stod(weight), pat)); + lexicalizations[name].push_back(make_pair(StringUtils::stod(weight), pat)); } } void -PatternBuilder::write(FILE* output, int longest, vector> inputBytecode, vector outputBytecode) +PatternBuilder::write(FILE* output, int longest, vector> inputBytecode, vector outputBytecode) { Compression::multibyte_write(longest, output); Compression::multibyte_write(inputBytecode.size(), output); for(unsigned int i = 0; i < inputBytecode.size(); i++) { Compression::multibyte_write(inputBytecode[i].first, output); - Compression::wstring_write(inputBytecode[i].second, output); + Compression::string_write(inputBytecode[i].second, output); } Compression::multibyte_write(outputBytecode.size(), output); for(unsigned int i = 0; i < outputBytecode.size(); i++) { - Compression::wstring_write(outputBytecode[i], output); + Compression::string_write(outputBytecode[i], output); } Compression::multibyte_write(chunkVarCount, output); @@ -565,7 +445,7 @@ PatternBuilder::write(FILE* output, int longest, vector> inpu // Find all arcs with "final_symbols" in the transitions, let their source node instead be final, // and extract the rule number from the arc. Record relation between source node and rule number // in finals_rules. It is now no longer safe to minimize -- but we already did that. - const wstring rule_sym_pre = L" > >::const_iterator it = transitions.begin(), limit = transitions.end(); it != limit; ++it) { @@ -579,12 +459,12 @@ PatternBuilder::write(FILE* output, int longest, vector> inpu continue; } // Extract the rule number encoded by countToFinalSymbol(): - wstring s; + UString s; alphabet.getSymbol(s, symbol); if(s.compare(0, rule_sym_pre.size(), rule_sym_pre) != 0) { continue; } - const int rule_num = stoi(s.substr(rule_sym_pre.size())); + const int rule_num = StringUtils::stoi(s.substr(rule_sym_pre.size())); transducer.setFinal(src); finals_rules.insert(make_pair(src, make_pair(rule_num, wgt))); } @@ -610,54 +490,42 @@ PatternBuilder::write(FILE* output, int longest, vector> inpu // attr_items - // precompiled regexps - Compression::string_write(string(pcre_version()), output); + // empty version number since we're not on PCRE anymore + Compression::multibyte_write(0, output); Compression::multibyte_write(attr_items.size(), output); - map::iterator it, limit; - for(it = attr_items.begin(), limit = attr_items.end(); it != limit; it++) - { - Compression::wstring_write(it->first, output); - ApertiumRE my_re; - my_re.compile(UtfConverter::toUtf8(it->second)); - my_re.write(output); - Compression::wstring_write(it->second, output); + for (auto& it : attr_items) { + Compression::string_write(it.first, output); + Compression::multibyte_write(0, output); // empty binary form of regex + Compression::string_write(it.second, output); } // variables Compression::multibyte_write(variables.size(), output); - for(map::const_iterator it = variables.begin(), limit = variables.end(); - it != limit; it++) - { - Compression::wstring_write(it->first, output); - Compression::wstring_write(it->second, output); + for (auto& it : variables) { + Compression::string_write(it.first, output); + Compression::string_write(it.second, output); } // lists Compression::multibyte_write(lists.size(), output); - for(map, Ltstr>::const_iterator it = lists.begin(), limit = lists.end(); - it != limit; it++) - { - Compression::wstring_write(it->first, output); - Compression::multibyte_write(it->second.size(), output); + for (auto& it : lists) { + Compression::string_write(it.first, output); + Compression::multibyte_write(it.second.size(), output); - for(set::const_iterator it2 = it->second.begin(), limit2 = it->second.end(); - it2 != limit2; it2++) - { - Compression::wstring_write(*it2, output); + for (auto& it2 : it.second) { + Compression::string_write(it2, output); } } // rule names Compression::multibyte_write(inRuleNames.size(), output); - for(unsigned int i = 0; i < inRuleNames.size(); i++) - { - Compression::wstring_write(inRuleNames[i], output); + for (auto& name : inRuleNames) { + Compression::string_write(name, output); } Compression::multibyte_write(outRuleNames.size(), output); - for(unsigned int i = 0; i < outRuleNames.size(); i++) - { - Compression::wstring_write(outRuleNames[i], output); + for (auto& name : outRuleNames) { + Compression::string_write(name, output); } } diff --git a/src/pattern.h b/src/pattern.h index 0e92d0a..ed48cc6 100644 --- a/src/pattern.h +++ b/src/pattern.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include @@ -15,8 +15,8 @@ using namespace std; struct PatternElement { - wstring lemma; - vector tags; + UString lemma; + vector tags; }; class PatternBuilder @@ -31,27 +31,27 @@ private: * Attribute categories * name => regex */ - map attr_items; + map attr_items; /** * Lists * name => { values } */ - map, Ltstr> lists; + map> lists; /** * Global string variables * name => initial value */ - map variables; + map variables; /** * Symbols marking ends of rules in pattern transducer */ set final_symbols; - map> lookahead; - map> firstSet; + map> lookahead; + map> firstSet; /** * Alphabet of pattern transducer @@ -67,9 +67,9 @@ private: * Lexicalized weights for rules * rule id => [ ( weight, processed pattern ) ... ] */ - map>>>> lexicalizations; + map>>>> lexicalizations; - map, vector>>> rules; + map, vector>>> rules; ////////// // TRANSDUCER PATH BUILDING @@ -79,13 +79,13 @@ private: * Starting from base, add path for lemma * @return end state */ - int insertLemma(int const base, wstring const &lemma); + int insertLemma(int const base, UString const &lemma); /** * Starting from base, insert each tag in tags * @return end state */ - int insertTags(int const base, const vector& tags); + int insertTags(int const base, const vector& tags); /** * Generate symbol of the form L"" to mark rule end @@ -95,7 +95,7 @@ private: /** * Build complete path */ - void addPattern(vector> pat, int rule, double weight, bool isLex); + void addPattern(const vector>& pat, int rule, double weight, bool isLex); void buildLookahead(); @@ -103,31 +103,6 @@ private: void buildFallback(); - ////////// - // ATTRIBUTE COMPRESSION - ////////// - - struct TrieNode - { - wchar_t self; - vector next; - }; - - /** - * Construct tries for a set of inputs, return one for each initial character - */ - vector buildTrie(vector parts); - - /** - * Convert trie to regex - */ - wstring unbuildTrie(TrieNode* t); - - /** - * Wrapper around buildTrie() and unbuildTrie() - */ - wstring trie(vector parts); - public: ////////// @@ -137,40 +112,39 @@ public: // false: * = 1 or more tags, true: * = 0 or more tags /** * If false, L"*" must match at least one tag, otherwise it can match 0 - * Default: false */ - bool starCanBeEmpty; + bool starCanBeEmpty = false; /** * Number of global Chunk* variables to allocate space for */ - unsigned int chunkVarCount; + unsigned int chunkVarCount = 0; /** * Debug names for input-time rules */ - vector inRuleNames; + vector inRuleNames; /** * Debug names for output-time rules */ - vector outRuleNames; + vector outRuleNames; PatternBuilder(); - void addRule(int rule, double weight, vector> pattern, vector firstChunk, wstring name); - void addList(wstring name, set vals); - void addAttr(wstring name, set vals); - bool isAttrDefined(wstring name); - void addVar(wstring name, wstring val); + void addRule(int rule, double weight, const vector>& pattern, const vector& firstChunk, const UString& name); + void addList(const UString& name, const set& vals); + void addAttr(const UString& name, const set& vals); + bool isAttrDefined(const UString& name); + void addVar(const UString& name, const UString& val); void loadLexFile(const string& fname); - void write(FILE* output, int longest, vector> inputBytecode, vector outputBytecode); + void write(FILE* output, int longest, vector> inputBytecode, vector outputBytecode); ////////// // BYTECODE CONSTRUCTION ////////// - wstring BCstring(const wstring& s); - wstring BCifthenelse(const wstring& cond, const wstring& yes, const wstring& no); + UString BCstring(const UString& s); + UString BCifthenelse(const UString& cond, const UString& yes, const UString& no); }; #endif diff --git a/src/randpath.cc b/src/randpath.cc index a2ed32e..b8ae2da 100644 --- a/src/randpath.cc +++ b/src/randpath.cc @@ -3,18 +3,18 @@ #include #include #include -#include #include #include #include #include +#include using namespace std; Alphabet A; Transducer T; -wstring prefix; -vector> paths; +UString prefix; +vector> paths; unsigned int donecount = 0; bool load(FILE* input) @@ -53,13 +53,7 @@ bool load(FILE* input) while(len > 0) { - int len2 = Compression::multibyte_read(input); - wstring name = L""; - while(len2 > 0) - { - name += static_cast(Compression::multibyte_read(input)); - len2--; - } + UString name = Compression::string_read(input); T.read(input); len--; return true; @@ -97,14 +91,14 @@ void followPath(int idx) { paths.push_back(make_pair(ops[i].second, paths[idx].second)); A.getSymbol(paths.back().second, ops[i].first); - if(paths.back().second.size() > 0 && paths.back().second.back() == L'+') + if(paths.back().second.size() > 0 && paths.back().second.back() == '+') { paths.pop_back(); } } state = ops[0].second; A.getSymbol(paths[idx].second, ops[0].first); - if(paths[idx].second.size() > 0 && paths[idx].second.back() == L'+') + if(paths[idx].second.size() > 0 && paths[idx].second.back() == '+') { paths.erase(paths.begin() + idx); return; @@ -120,12 +114,12 @@ void generatePaths() for(unsigned int i = 0; i < prefix.size(); i++) { int sym = prefix[i]; - int sym2 = towlower(prefix[i]); - if(prefix[i] == L'<') + int sym2 = u_tolower(prefix[i]); + if(prefix[i] == '<') { for(unsigned int j = i+1; j < prefix.size(); j++) { - if(prefix[j] == L'>') + if(prefix[j] == '>') { sym = A(prefix.substr(i, j-i+1)); i = j; @@ -151,7 +145,7 @@ void generatePaths() } for(auto s : states) { - paths.push_back(make_pair(s, L"")); + paths.push_back(make_pair(s, ""_u)); followPath(paths.size() - 1); } while(donecount < paths.size()) @@ -172,30 +166,30 @@ int main(int argc, char *argv[]) LtLocale::tryToSetLocale(); if(argc != 3) { - wcerr << "Usage: " << argv[0] << " transducer prefix" << endl; + cerr << "Usage: " << argv[0] << " transducer prefix" << endl; return EXIT_FAILURE; } FILE* tf = fopen(argv[1], "rb"); if(tf == NULL) { - wcerr << "Unable to open " << argv[1] << " for reading." << endl; + cerr << "Unable to open " << argv[1] << " for reading." << endl; return EXIT_FAILURE; } if(!load(tf)) { - wcerr << "Unable to read transducer." << endl; + cerr << "Unable to read transducer." << endl; return EXIT_FAILURE; } - prefix = UtfConverter::fromUtf8(argv[2]); + prefix = to_ustring(argv[2]); generatePaths(); if(paths.size() == 0) { - wcerr << "No paths begin with that prefix." << endl; + cerr << "No paths begin with that prefix." << endl; return EXIT_FAILURE; } //seed_seq s (prefix.begin(), prefix.end()); unsigned s = chrono::system_clock::now().time_since_epoch().count(); minstd_rand0 g (s); - wcout << prefix << paths[g() % paths.size()].second << endl; + cout << prefix << paths[g() % paths.size()].second << endl; return EXIT_SUCCESS; } diff --git a/src/rtx_comp.cc b/src/rtx_comp.cc index 145a05a..d9deb8e 100644 --- a/src/rtx_comp.cc +++ b/src/rtx_comp.cc @@ -3,14 +3,11 @@ #include #include #include -#include -#include #include #include #include #include -using namespace Apertium; using namespace std; void endProgram(char *name) @@ -51,7 +48,7 @@ int main(int argc, char *argv[]) bool stats = false; bool summary = false; - vector exclude; + vector exclude; vector lexFiles; while(true) @@ -71,7 +68,7 @@ int main(int argc, char *argv[]) switch(c) { case 'e': - exclude.push_back(UtfConverter::fromUtf8(optarg)); + exclude.push_back(to_ustring(optarg)); break; case 'l': diff --git a/src/rtx_compiler.cc b/src/rtx_compiler.cc index 4318cb8..8993d1a 100644 --- a/src/rtx_compiler.cc +++ b/src/rtx_compiler.cc @@ -1,15 +1,14 @@ #include #include -#include -#include +#include using namespace std; -wstring const -RTXCompiler::ANY_TAG = L""; +UString const +RTXCompiler::ANY_TAG = ""_u; -wstring const -RTXCompiler::ANY_CHAR = L""; +UString const +RTXCompiler::ANY_CHAR = ""_u; RTXCompiler::RTXCompiler() { @@ -23,48 +22,48 @@ RTXCompiler::RTXCompiler() currentLocType = LocTypeNone; PB.starCanBeEmpty = true; summarizing = false; - outputRules[L"UNKNOWN:INTERNAL"] = vector(1, L"_"); + outputRules["UNKNOWN:INTERNAL"_u] = vector(1, "_"_u); } -wstring const -RTXCompiler::SPECIAL_CHARS = L"!@$%()={}[]|/:;<>,.→"; +UString const +RTXCompiler::SPECIAL_CHARS = "!@$%()={}[]|/:;<>,.→"_u; void -RTXCompiler::die(wstring message) +RTXCompiler::die(UString message) { if(errorsAreSyntax) { - wcerr << L"Syntax error on line " << currentLine << L" of "; + cerr << "Syntax error on line " << currentLine << " of "; } else { - wcerr << L"Error in "; + cerr << "Error in "; while(macroNameStack.size() > 0) { - wcerr << "macro '" << macroNameStack.back() << "', invoked by "; + cerr << "macro '" << macroNameStack.back() << "', invoked by "; macroNameStack.pop_back(); } - wcerr << L"rule beginning on line " << currentRule->line << L" of "; + cerr << "rule beginning on line " << currentRule->line << " of "; } - wcerr << UtfConverter::fromUtf8(sourceFile) << L": " << message << endl; + cerr << sourceFile << ": " << message << endl; if(errorsAreSyntax && !source.eof()) { - wstring arr = wstring(recentlyRead.size()-2, L' '); + UString arr = UString(recentlyRead.size()-2, ' '); recentlyRead += unreadbuf; - while(!source.eof() && peekchar() != L'\n') + while(!source.eof() && peekchar() != '\n') { recentlyRead += source.get(); } - wcerr << recentlyRead << endl; - wcerr << arr << L"^^^" << endl; + cerr << recentlyRead << endl; + cerr << arr << "^^^" << endl; } exit(EXIT_FAILURE); } -wchar_t +UChar RTXCompiler::getchar() { - wchar_t c; + UChar c; if(unreadbuf.size() > 0) { c = unreadbuf[0]; @@ -75,7 +74,7 @@ RTXCompiler::getchar() return c; } -wchar_t +UChar RTXCompiler::peekchar() { if(unreadbuf.size() > 0) return unreadbuf[0]; @@ -98,12 +97,12 @@ RTXCompiler::unread() void RTXCompiler::eatSpaces() { - wchar_t c; + UChar c; bool inComment = false; while(!source.eof()) { c = peekchar(); - if(c == L'\n') + if(c == '\n') { getchar(); inComment = false; @@ -115,7 +114,7 @@ RTXCompiler::eatSpaces() { getchar(); } - else if(c == L'!') + else if(c == '!') { getchar(); inComment = true; @@ -127,62 +126,61 @@ RTXCompiler::eatSpaces() } } -wstring +UString RTXCompiler::nextTokenNoSpace() { if(source.eof()) { - die(L"Unexpected end of file"); + die("Unexpected end of file"_u); } - wchar_t c = getchar(); - wchar_t next = peekchar(); - wstring ret; - if(c == L'→') - { - ret = L"->"; + UChar c = getchar(); + UChar next = peekchar(); + UString ret; + if (c == u'\u2192') { // '→' + ret = "->"_u; } else if(SPECIAL_CHARS.find(c) != string::npos) { - ret = wstring(1, c); + ret = UString(1, c); } - else if(c == L'-' && next == L'>') + else if(c == '-' && next == '>') { getchar(); - ret = wstring(1, c) + wstring(1, next); + ret = UString(1, c) + UString(1, next); } else if(isspace(c)) { - die(L"unexpected space"); + die("unexpected space"_u); } - else if(c == L'!') + else if(c == '!') { - die(L"unexpected comment"); + die("unexpected comment"_u); } - else if(c == L'"') + else if(c == '"') { next = getchar(); - while(!source.eof() && next != L'"') + while(!source.eof() && next != '"') { - if(next == L'\\') next = getchar(); + if(next == '\\') next = getchar(); ret += next; - if(source.eof()) die(L"Unexpected end of file."); + if(source.eof()) die("Unexpected end of file."_u); next = getchar(); } } else { - ret = wstring(1, c); + ret = UString(1, c); while(!source.eof()) { c = peekchar(); - if(c == L'\\') + if(c == '\\') { getchar(); ret += getchar(); } else if(SPECIAL_CHARS.find(c) == string::npos && !isspace(c)) { - ret += wstring(1, getchar()); + ret += UString(1, getchar()); } else { @@ -194,7 +192,7 @@ RTXCompiler::nextTokenNoSpace() } bool -RTXCompiler::isNextToken(wchar_t c) +RTXCompiler::isNextToken(UChar c) { if(peekchar() == c) { @@ -204,46 +202,46 @@ RTXCompiler::isNextToken(wchar_t c) return false; } -wstring -RTXCompiler::nextToken(wstring check1 = L"", wstring check2 = L"") +UString +RTXCompiler::nextToken(UString check1 = ""_u, UString check2 = ""_u) { eatSpaces(); - wstring tok = nextTokenNoSpace(); - if(tok == check1 || tok == check2 || (check1 == L"" && check2 == L"")) + UString tok = nextTokenNoSpace(); + if(tok == check1 || tok == check2 || (check1.empty() && check2.empty())) { } - else if(check1 != L"" && check2 != L"") + else if(!check1.empty() && !check2.empty()) { - die(L"expected '" + check1 + L"' or '" + check2 + L"', found '" + tok + L"'"); + die("expected '"_u + check1 + "' or '"_u + check2 + "', found '"_u + tok + "'"_u); } - else if(check1 != L"") + else if(!check1.empty()) { - die(L"expected '" + check1 + L"', found '" + tok + L"'"); + die("expected '"_u + check1 + "', found '"_u + tok + "'"_u); } else { - die(L"expected '" + check2 + L"', found '" + tok + L"'"); + die("expected '"_u + check2 + "', found '"_u + tok + "'"_u); } return tok; } -wstring +UString RTXCompiler::parseIdent(bool prespace = false) { if(prespace) { eatSpaces(); } - wchar_t next = peekchar(); - wstring ret = nextTokenNoSpace(); - if(next == L'"') + UChar next = peekchar(); + UString ret = nextTokenNoSpace(); + if(next == '"') { // so that quoted special characters don't fail the next check return ret; } - if(ret == L"->" || (ret.size() == 1 && SPECIAL_CHARS.find(ret[0]) != string::npos)) + if(ret == "->"_u || (ret.size() == 1 && SPECIAL_CHARS.find(ret[0]) != string::npos)) { - die(L"expected identifier, found '" + ret + L"'"); + die("expected identifier, found '"_u + ret + "'"_u); } return ret; } @@ -251,35 +249,30 @@ RTXCompiler::parseIdent(bool prespace = false) unsigned int RTXCompiler::parseInt() { - wstring ret; + UString ret; while(isdigit(peekchar())) { ret += getchar(); } - return stoul(ret); + return StringUtils::stoi(ret); } float RTXCompiler::parseWeight() { - wstring ret; - while(isdigit(peekchar()) || peekchar() == L'.') + UString ret; + while(isdigit(peekchar()) || peekchar() == '.') { ret += getchar(); } float r; try { - wstring::size_type loc; - r = stof(ret, &loc); - if(loc != ret.size()) - { - die(L"unable to parse weight: " + ret); - } + r = StringUtils::stod(ret); } catch(const invalid_argument& ia) { - die(L"unable to parse weight: " + ret); + die("unable to parse weight: "_u + ret); } return r; } @@ -287,17 +280,17 @@ RTXCompiler::parseWeight() void RTXCompiler::parseRule() { - wstring firstLabel = parseIdent(); - wstring next = nextToken(); - if(next == L":") + UString firstLabel = parseIdent(); + UString next = nextToken(); + if(next == ":"_u) { parseOutputRule(firstLabel); } - else if(next == L">") + else if(next == ">"_u) { parseRetagRule(firstLabel); } - else if(next == L"=") + else if(next == "="_u) { parseAttrRule(firstLabel); } @@ -308,74 +301,74 @@ RTXCompiler::parseRule() } void -RTXCompiler::parseOutputRule(wstring pattern) +RTXCompiler::parseOutputRule(UString pattern) { - nodeIsSurface[pattern] = !isNextToken(L':'); + nodeIsSurface[pattern] = !isNextToken(':'); eatSpaces(); - vector output; - if(peekchar() == L'(') + vector output; + if(peekchar() == '(') { LocationType typewas = currentLocType; Location locwas = currentLoc; currentLoc = LocChunk; currentLocType = LocTypeMacro; macros[pattern] = parseOutputCond(); - output.push_back(L"macro"); + output.push_back("macro"_u); currentLocType = typewas; currentLoc = locwas; - nextToken(L";"); + nextToken(";"_u); } - else if(peekchar() == L'%') + else if(peekchar() == '%') { - output.push_back(L"%"); - nextToken(L"%"); - nextToken(L";"); + output.push_back("%"_u); + nextToken("%"_u); + nextToken(";"_u); } else { - wstring cur; + UString cur; while(!source.eof()) { cur = nextToken(); - if(cur == L"<") + if(cur == "<"_u) { cur = cur + parseIdent(); - cur += nextToken(L">"); + cur += nextToken(">"_u); } output.push_back(cur); - if(nextToken(L".", L";") == L";") + if(nextToken("."_u, ";"_u) == ";"_u) { break; } } if(output.size() == 0) { - die(L"empty tag order rule"); + die("empty tag order rule"_u); } } outputRules[pattern] = output; } void -RTXCompiler::parseRetagRule(wstring srcTag) +RTXCompiler::parseRetagRule(UString srcTag) { - wstring destTag = parseIdent(true); - nextToken(L":"); - vector> rule; - rule.push_back(pair(srcTag, destTag)); + UString destTag = parseIdent(true); + nextToken(":"_u); + vector> rule; + rule.push_back(pair(srcTag, destTag)); while(!source.eof()) { eatSpaces(); - bool list = isNextToken(L'['); - wstring cs = parseIdent(true); + bool list = isNextToken('['); + UString cs = parseIdent(true); if(list) { - nextToken(L"]"); - cs = L"[]" + cs; + nextToken("]"_u); + cs = "[]"_u + cs; } - wstring cd = parseIdent(true); - rule.push_back(pair(cs, cd)); - if(nextToken(L";", L",") == L";") + UString cd = parseIdent(true); + rule.push_back(pair(cs, cd)); + if(nextToken(";"_u, ","_u) == ";"_u) { break; } @@ -386,7 +379,7 @@ RTXCompiler::parseRetagRule(wstring srcTag) if(other[0].first == srcTag && other[0].second == destTag) { found = true; - wcerr << "Warning: Tag-rewrite rule '" << srcTag << "' > '" << destTag << "' is defined multiple times. Mappings in earlier definition may be overwritten." << endl; + cerr << "Warning: Tag-rewrite rule '" << srcTag << "' > '" << destTag << "' is defined multiple times. Mappings in earlier definition may be overwritten." << endl; other.insert(other.begin()+1, rule.begin()+1, rule.end()); break; } @@ -403,38 +396,38 @@ RTXCompiler::parseRetagRule(wstring srcTag) } void -RTXCompiler::parseAttrRule(wstring categoryName) +RTXCompiler::parseAttrRule(UString categoryName) { if(collections.find(categoryName) != collections.end() || PB.isAttrDefined(categoryName)) { - die(L"Redefinition of attribute category '" + categoryName + L"'."); + die("Redefinition of attribute category '"_u + categoryName + "'."_u); } eatSpaces(); - if(isNextToken(L'(')) + if(isNextToken('(')) { - wstring undef = parseIdent(true); - wstring def = parseIdent(true); + UString undef = parseIdent(true); + UString def = parseIdent(true); attrDefaults[categoryName] = make_pair(undef, def); - nextToken(L")"); + nextToken(")"_u); } - vector members; - vector noOver; + vector members; + vector noOver; while(true) { eatSpaces(); - if(isNextToken(L';')) + if(isNextToken(';')) { break; } - if(isNextToken(L'[')) + if(isNextToken('[')) { - wstring other = parseIdent(true); + UString other = parseIdent(true); if(collections.find(other) == collections.end()) { - die(L"Use of category '" + other + L"' in set arithmetic before definition."); + die("Use of category '"_u + other + "' in set arithmetic before definition."_u); } - vector otherstuff = collections[other]; + vector otherstuff = collections[other]; for(unsigned int i = 0; i < otherstuff.size(); i++) { members.push_back(otherstuff[i]); @@ -444,11 +437,11 @@ RTXCompiler::parseAttrRule(wstring categoryName) { noOver.push_back(otherstuff[i]); } - nextToken(L"]"); + nextToken("]"_u); } - else if(isNextToken(L'@')) + else if(isNextToken('@')) { - wstring next = parseIdent(); + UString next = parseIdent(); members.push_back(next); noOver.push_back(next); } @@ -459,18 +452,18 @@ RTXCompiler::parseAttrRule(wstring categoryName) } if(members.size() == 0) { - die(L"empty attribute list"); + die("empty attribute list"_u); } - collections.insert(pair>(categoryName, members)); - noOverwrite.insert(pair>(categoryName, noOver)); + collections.insert(pair>(categoryName, members)); + noOverwrite.insert(pair>(categoryName, noOver)); if(noOver.size() > 0) { for(unsigned int i = 0; i < noOver.size(); i++) { - noOver[i] = L"<" + noOver[i] + L">"; + noOver[i] = "<"_u + noOver[i] + ">"_u; } } - collections.insert(make_pair(categoryName + L" over", noOver)); + collections.insert(make_pair(categoryName + " over"_u, noOver)); } RTXCompiler::Clip* @@ -483,26 +476,26 @@ RTXCompiler::parseClip(int src = -2) { ret->src = src; } - else if(isNextToken(L'>')) + else if(isNextToken('>')) { ret->src = parseInt(); - nextToken(L"."); + nextToken("."_u); bounds = false; } else if(isdigit(peekchar())) { ret->src = parseInt(); - nextToken(L"."); + nextToken("."_u); } - else if(isNextToken(L'$')) + else if(isNextToken('$')) { - if(isNextToken(L'$')) + if(isNextToken('$')) { ret->src = ChunkVarClip; ret->varName = parseIdent(); - nextToken(L"."); + nextToken("."_u); } - else if(isNextToken(L'%')) + else if(isNextToken('%')) { ret->src = StringVarClip; ret->varName = parseIdent(); @@ -512,11 +505,11 @@ RTXCompiler::parseClip(int src = -2) ret->src = ParentClip; if(currentLocType != LocTypeOutput) { - die(L"Chunk tags can only be accessed from output sections of reduction rules."); + die("Chunk tags can only be accessed from output sections of reduction rules."_u); } } } - else if(peekchar() == L'(') + else if(peekchar() == '(') { OutputChunk* chunkwas = currentChunk; OutputChoice* choicewas = currentChoice; @@ -541,54 +534,54 @@ RTXCompiler::parseClip(int src = -2) { if(ret->src == ParentClip || ret->src > 1) { - die(L"Macros can only access their single argument."); + die("Macros can only access their single argument."_u); } } else if(bounds && src == -2 && ret->src > (int)currentRule->pattern.size()) { - die(L"Clip source is out of bounds (position " + to_wstring(ret->src) + L" requested, but rule has only " + to_wstring(currentRule->pattern.size()) + L" elements in its pattern)."); + die("Clip source is out of bounds (position "_u + StringUtils::itoa(ret->src) + " requested, but rule has only "_u + StringUtils::itoa(currentRule->pattern.size()) + " elements in its pattern)."_u); } if(ret->src != StringVarClip) { ret->part = (src == -3) ? nextToken() : parseIdent(); } - if(isNextToken(L'/')) + if(isNextToken('/')) { if(ret->src == ConstantClip) { - die(L"literal value cannot have a side"); + die("literal value cannot have a side"_u); } else if(ret->src == StringVarClip) { - die(L"variable cannot have a side"); + die("variable cannot have a side"_u); } ret->side = parseIdent(); } else if(ret->src == ParentClip) { - ret->side = L"tl"; + ret->side = "tl"_u; } - if(isNextToken(L'>')) + if(isNextToken('>')) { if(ret->src == ConstantClip) { - die(L"literal value cannot be rewritten"); + die("literal value cannot be rewritten"_u); } else if(ret->src == ParentClip || ret->src == StringVarClip) { - die(L"variable cannot be rewritten"); + die("variable cannot be rewritten"_u); } ret->rewrite.push_back(parseIdent()); } return ret; } -wchar_t -RTXCompiler::lookupOperator(wstring op) +UChar +RTXCompiler::lookupOperator(UString op) { - wstring key = StringUtils::tolower(op); - key = StringUtils::substitute(key, L"-", L""); - key = StringUtils::substitute(key, L"_", L""); + UString key = StringUtils::tolower(op); + key = StringUtils::substitute(key, "-"_u, ""_u); + key = StringUtils::substitute(key, "_"_u, ""_u); for(unsigned int i = 0; i < OPERATORS.size(); i++) { if(key == OPERATORS[i].first) @@ -602,12 +595,12 @@ RTXCompiler::lookupOperator(wstring op) RTXCompiler::Cond* RTXCompiler::parseCond() { - nextToken(L"("); + nextToken("("_u); eatSpaces(); vector parts; - while(!source.eof() && peekchar() != L')') + while(!source.eof() && peekchar() != ')') { - if(peekchar() == L'(') + if(peekchar() == '(') { parts.push_back(parseCond()); } @@ -620,8 +613,8 @@ RTXCompiler::parseCond() } eatSpaces(); } - nextToken(L")"); - if(parts.size() == 0) die(L"Empty conditional."); + nextToken(")"_u); + if(parts.size() == 0) die("Empty conditional."_u); vector> denot; bool negated = false; for(unsigned int i = 0; i < parts.size(); i++) @@ -629,7 +622,7 @@ RTXCompiler::parseCond() if(i != parts.size() - 1 && parts[i]->op == 0 && parts[i]->val->src == 0) { - wchar_t op = lookupOperator(parts[i]->val->part); + UChar op = lookupOperator(parts[i]->val->part); if(op == NOT) { negated = !negated; @@ -645,14 +638,14 @@ RTXCompiler::parseCond() if(i != 0 && i != denot.size() - 1 && denot[i].second->op == 0 && denot[i].second->val->src == 0) { - wchar_t op = lookupOperator(denot[i].second->val->part); + UChar op = lookupOperator(denot[i].second->val->part); if(op != 0 && op != AND && op != OR && op != NOT) { if(destring.back().second->op == 0 && denot[i+1].second->op == 0) { if(destring.back().first || denot[i+1].first) { - die(L"Cannot negate string (I can't parse 'not a = b', use 'not (a = b)' or 'a not = b' instead)."); + die("Cannot negate string (I can't parse 'not a = b', use 'not (a = b)' or 'a not = b' instead)."_u); } denot[i].second->left = destring.back().second; denot[i].second->right = denot[i+1].second; @@ -675,13 +668,13 @@ RTXCompiler::parseCond() ret->right = destring[0].second; } else ret = destring[0].second; - if(destring.size() % 2 == 0) die(L"ANDs, ORs, and conditions don't come out evenly."); + if(destring.size() % 2 == 0) die("ANDs, ORs, and conditions don't come out evenly."_u); for(unsigned int i = 1; i < destring.size(); i += 2) { - if(destring[i].second->op != 0) die(L"Expected operator, found condition."); - if(destring[i].second->val->src != 0) die(L"Expected operator, found clip."); - wchar_t op = lookupOperator(destring[i].second->val->part); - if(op == 0) die(L"Unknown operator '" + destring[i].second->val->part + L"'."); + if(destring[i].second->op != 0) die("Expected operator, found condition."_u); + if(destring[i].second->val->src != 0) die("Expected operator, found clip."_u); + UChar op = lookupOperator(destring[i].second->val->part); + if(op == 0) die("Unknown operator '"_u + destring[i].second->val->part + "'."_u); Cond* temp = ret; ret = new Cond; ret->left = temp; @@ -707,64 +700,64 @@ RTXCompiler::parseCond() void RTXCompiler::parsePatternElement(Rule* rule) { - vector pat; - if(isNextToken(L'%')) + vector pat; + if(isNextToken('%')) { rule->grab_all = rule->pattern.size()+1; } - wstring t1 = nextToken(); - if(t1 == L"$") + UString t1 = nextToken(); + if(t1 == "$"_u) { t1 += parseIdent(); } - else if(t1 == L"[") + else if(t1 == "["_u) { - t1 = L"$" + parseIdent(); - if(!isNextToken(L']')) die(L"expected closing bracket after lemma category"); + t1 = "$"_u + parseIdent(); + if(!isNextToken(']')) die("expected closing bracket after lemma category"_u); } - if(isNextToken(L'@')) + if(isNextToken('@')) { pat.push_back(t1); pat.push_back(parseIdent()); } - else if(t1[0] == L'$') + else if(t1[0] == '$') { - die(L"first tag in pattern element must be literal"); + die("first tag in pattern element must be literal"_u); } else { - pat.push_back(L""); + pat.push_back(""_u); pat.push_back(t1); } while(!source.eof()) { - if(!isNextToken(L'.')) + if(!isNextToken('.')) { break; } - wstring cur = nextToken(); - if(cur == L"$") + UString cur = nextToken(); + if(cur == "$"_u) { Clip* cl = parseClip(rule->pattern.size()+1); if(rule->vars.find(cl->part) != rule->vars.end()) { - die(L"rule has multiple sources for attribute " + cl->part); + die("rule has multiple sources for attribute "_u + cl->part); } rule->vars[cl->part] = cl; } - else if(cur == L"[") + else if(cur == "["_u) { - pat.push_back(L"[" + parseIdent() + L"]"); - nextToken(L"]"); + pat.push_back("["_u + parseIdent() + "]"_u); + nextToken("]"_u); } else { pat.push_back(cur); } } - if(pat.size() == 2 && pat[1] == L"*") + if(pat.size() == 2 && pat[1] == "*"_u) { - pat[1] = L"UNKNOWN:INTERNAL"; + pat[1] = "UNKNOWN:INTERNAL"_u; } rule->pattern.push_back(pat); eatSpaces(); @@ -784,59 +777,59 @@ RTXCompiler::OutputChunk* RTXCompiler::parseOutputElement() { OutputChunk* ret = new OutputChunk; - ret->conjoined = isNextToken(L'+'); + ret->conjoined = isNextToken('+'); ret->interpolated = false; - if(!ret->conjoined) ret->interpolated = isNextToken(L'<'); + if(!ret->conjoined) ret->interpolated = isNextToken('<'); ret->nextConjoined = false; if(ret->conjoined || ret->interpolated) { - wstring verb = (ret->conjoined ? L"conjoin" : L"interpolate"); + UString verb = (ret->conjoined ? "conjoin"_u : "interpolate"_u); if(currentChunk == NULL) { - die(L"Cannot " + verb + L" from within if statement."); + die("Cannot "_u + verb + " from within if statement."_u); } if(currentChunk->children.size() == 0) { - die(L"Cannot " + verb + L" first element."); + die("Cannot "_u + verb + " first element."_u); } if(currentChunk->children.back()->conds.size() > 0) { - die(L"Cannot " + verb + L" to something in an if statement."); + die("Cannot "_u + verb + " to something in an if statement."_u); } if(currentChunk->children.back()->chunks.size() == 0) { - die(L"Cannot " + verb + L" inside and outside of if statement and cannot " + verb + L" first element."); + die("Cannot "_u + verb + " inside and outside of if statement and cannot "_u + verb + " first element."_u); } - if(currentChunk->children.back()->chunks[0]->mode == L"_") + if(currentChunk->children.back()->chunks[0]->mode == "_"_u) { - die(L"Cannot " + verb + L" to a blank."); + die("Cannot "_u + verb + " to a blank."_u); } eatSpaces(); if(ret->interpolated) currentChunk->children.back()->chunks[0]->nextConjoined = true; } - bool isInterp = isNextToken(L'>'); + bool isInterp = isNextToken('>'); eatSpaces(); - ret->getall = isNextToken(L'%'); - if(peekchar() == L'_') + ret->getall = isNextToken('%'); + if(peekchar() == '_') { if(ret->getall) { - die(L"% cannot be used on blanks"); + die("% cannot be used on blanks"_u); } - ret->mode = L"_"; + ret->mode = "_"_u; getchar(); if(isdigit(peekchar())) { ret->pos = parseInt(); if(currentRule->pattern.size() == 1) { - die(L"Cannot output indexed blank because pattern is one element long and thus does not include blanks."); + die("Cannot output indexed blank because pattern is one element long and thus does not include blanks."_u); } if(ret->pos < 1 || ret->pos >= currentRule->pattern.size()) { - die(L"Position index of blank out of bounds, expected an integer from 1 to " + to_wstring(currentRule->pattern.size()-1) + L"."); + die("Position index of blank out of bounds, expected an integer from 1 to "_u + StringUtils::itoa(currentRule->pattern.size()-1) + "."_u); } - wcerr << L"Warning: Use of indexed blank on line " << currentLine << L" is deprecated." << endl; + cerr << "Warning: Use of indexed blank on line " << currentLine << " is deprecated." << endl; } else { @@ -845,81 +838,81 @@ RTXCompiler::parseOutputElement() } else if(isdigit(peekchar())) { - ret->mode = L"#"; + ret->mode = "#"_u; ret->pos = parseInt(); if(ret->pos == 0) { - die(L"There is no position 0."); + die("There is no position 0."_u); } else if(currentLocType != LocTypeMacro && !isInterp && ret->pos > currentRule->pattern.size()) { - die(L"There are only " + to_wstring(currentRule->pattern.size()) + L" elements in the pattern."); + die("There are only "_u + StringUtils::itoa(currentRule->pattern.size()) + " elements in the pattern."_u); } - if(peekchar() == L'(') + if(peekchar() == '(') { - nextToken(L"("); + nextToken("("_u); ret->pattern = parseIdent(); - nextToken(L")"); + nextToken(")"_u); } else if(currentLocType == LocTypeMacro) { - die(L"Outputs in a macro must specify a pattern."); + die("Outputs in a macro must specify a pattern."_u); } } - else if(isNextToken(L'*')) + else if(isNextToken('*')) { - if(peekchar() != L'(') + if(peekchar() != '(') { - die(L"No macro name specified."); + die("No macro name specified."_u); } - nextToken(L"("); + nextToken("("_u); ret->pattern = parseIdent(true); - nextToken(L")"); + nextToken(")"_u); ret->pos = 0; - ret->mode = L"#"; + ret->mode = "#"_u; } - else if(isNextToken(L'$')) + else if(isNextToken('$')) { - if(isInterp) die(L"Interpolating a global variable does not make sense."); - if(ret->getall) die(L"Using % with a global variable does not make sense."); - nextToken(L"$"); - ret->mode = L"$$"; + if(isInterp) die("Interpolating a global variable does not make sense."_u); + if(ret->getall) die("Using % with a global variable does not make sense."_u); + nextToken("$"_u); + ret->mode = "$$"_u; ret->pattern = parseIdent(true); } else { ret->lemma = parseIdent(); ret->pos = 0; - wstring mode = nextToken(L"@", L"("); - if(mode == L"@") + UString mode = nextToken("@"_u, "("_u); + if(mode == "@"_u) { if(ret->getall) { - die(L"% not supported on output literals with @. Use %lemma(pos)."); + die("% not supported on output literals with @. Use %lemma(pos)."_u); } - ret->mode = L"@"; + ret->mode = "@"_u; while(true) { - wstring cur = nextToken(); - wstring var = to_wstring(ret->tags.size()); + UString cur = nextToken(); + UString var = StringUtils::itoa(ret->tags.size()); ret->tags.push_back(var); Clip* cl = new Clip; - if(cur == L"$") + if(cur == "$"_u) { cl->src = -1; cl->part = parseIdent(); } - else if(cur == L"[") + else if(cur == "["_u) { cl = parseClip(); - nextToken(L"]"); + nextToken("]"_u); } - else if(cur == L"{") + else if(cur == "{"_u) { ret->tags.pop_back(); - var = L"lemcase"; + var = "lemcase"_u; cl = parseClip(); - nextToken(L"}"); + nextToken("}"_u); } else { @@ -927,7 +920,7 @@ RTXCompiler::parseOutputElement() cl->part = cur; } ret->vars[var] = cl; - if(!isNextToken(L'.')) + if(!isNextToken('.')) { break; } @@ -935,58 +928,58 @@ RTXCompiler::parseOutputElement() } else { - ret->mode = L"#@"; + ret->mode = "#@"_u; ret->pattern = parseIdent(true); - nextToken(L")"); + nextToken(")"_u); Clip* pos = new Clip; pos->src = 0; pos->part = ret->pattern; - pos->rewrite.push_back(L"pos_tag"); - ret->vars[L"pos_tag"] = pos; + pos->rewrite.push_back("pos_tag"_u); + ret->vars["pos_tag"_u] = pos; unsigned int i = 0; for(; i < ret->lemma.size(); i++) { - if(ret->lemma[i] == L'#') break; + if(ret->lemma[i] == '#') break; } Clip* lemh = new Clip; lemh->part = ret->lemma.substr(0, i); lemh->src = 0; - lemh->rewrite.push_back(L"lemh"); - ret->vars[L"lemh"] = lemh; + lemh->rewrite.push_back("lemh"_u); + ret->vars["lemh"_u] = lemh; if(i < ret->lemma.size()) { Clip* lemq = new Clip; lemq->part = ret->lemma.substr(i+2); lemq->src = 0; - lemq->rewrite.push_back(L"lemq"); - ret->vars[L"lemq"] = lemq; + lemq->rewrite.push_back("lemq"_u); + ret->vars["lemq"_u] = lemq; } Clip* lem = new Clip; lem->part = ret->lemma; lem->src = 0; - lem->rewrite.push_back(L"lem"); - ret->vars[L"lem"] = lem; + lem->rewrite.push_back("lem"_u); + ret->vars["lem"_u] = lem; } } - if(isNextToken(L'[')) + if(isNextToken('[')) { - while(!source.eof() && peekchar() != L']') + while(!source.eof() && peekchar() != ']') { eatSpaces(); - wstring var = parseIdent(); - nextToken(L"="); + UString var = parseIdent(); + nextToken("="_u); eatSpaces(); Clip* cl = parseClip(); - if(cl->part == L"_") + if(cl->part == "_"_u) { - cl->part = L""; + cl->part.clear(); } if(cl->src != 0 && cl->src != -2) { cl->rewrite.push_back(var); } ret->vars[var] = cl; - if(nextToken(L",", L"]") == L"]") + if(nextToken(","_u, "]"_u) == "]"_u) { break; } @@ -999,7 +992,7 @@ RTXCompiler::parseOutputElement() RTXCompiler::OutputChoice* RTXCompiler::parseOutputCond() { - nextToken(L"("); + nextToken("("_u); OutputChoice* choicewas = currentChoice; OutputChunk* chunkwas = currentChunk; Clip* clipwas = currentClip; @@ -1009,31 +1002,31 @@ RTXCompiler::parseOutputCond() currentClip = NULL; while(true) { - wstring mode = StringUtils::tolower(nextToken()); - mode = StringUtils::substitute(mode, L"-", L""); - mode = StringUtils::substitute(mode, L"_", L""); - if(ret->conds.size() == 0 && mode != L"if" && mode != L"always") + UString mode = StringUtils::tolower(nextToken()); + mode = StringUtils::substitute(mode, "-"_u, ""_u); + mode = StringUtils::substitute(mode, "_"_u, ""_u); + if(ret->conds.size() == 0 && mode != "if"_u && mode != "always"_u) { - die(L"If statement must begin with 'if'."); + die("If statement must begin with 'if'."_u); } - if(ret->conds.size() > 0 && mode == L"always") + if(ret->conds.size() > 0 && mode == "always"_u) { - die(L"Always clause must be only clause."); + die("Always clause must be only clause."_u); } - if(mode == L"if" || mode == L"elif" || mode == L"elseif") + if(mode == "if"_u || mode == "elif"_u || mode == "elseif"_u) { ret->conds.push_back(parseCond()); } - else if(mode == L")") + else if(mode == ")"_u) { break; } - else if(mode != L"else" && mode != L"otherwise" && mode != L"always") + else if(mode != "else"_u && mode != "otherwise"_u && mode != "always"_u) { - die(L"Unknown statement: '" + mode + L"'."); + die("Unknown statement: '"_u + mode + "'."_u); } eatSpaces(); - if(peekchar() == L'(') + if(peekchar() == '(') { ret->nest.push_back(parseOutputCond()); ret->chunks.push_back(NULL); @@ -1045,29 +1038,29 @@ RTXCompiler::parseOutputCond() ret->chunks.push_back(NULL); ret->nest.push_back(NULL); } - else if(peekchar() == L'{') + else if(peekchar() == '{') { if(currentLoc == LocChunk) { - die(L"Nested chunks are currently not allowed."); + die("Nested chunks are currently not allowed."_u); } else if(currentLocType == LocTypeMacro) { - die(L"Macros cannot generate entire chunks."); + die("Macros cannot generate entire chunks."_u); } else if(currentLoc == LocVarSet) { - die(L"Global variables cannot be set to chunks."); + die("Global variables cannot be set to chunks."_u); } ret->nest.push_back(NULL); ret->clips.push_back(NULL); ret->chunks.push_back(parseOutputChunk()); } - else if(peekchar() == L'[') + else if(peekchar() == '[') { if(currentLoc == LocVarSet) { - die(L"Global variables must be set to single nodes."); + die("Global variables must be set to single nodes."_u); } ret->nest.push_back(NULL); ret->clips.push_back(NULL); @@ -1077,15 +1070,15 @@ RTXCompiler::parseOutputCond() { if(currentLoc != LocChunk && currentLoc != LocVarSet) { - die(L"Conditional non-chunk output current not possible."); + die("Conditional non-chunk output current not possible."_u); } ret->chunks.push_back(parseOutputElement()); ret->nest.push_back(NULL); ret->clips.push_back(NULL); } - if(mode == L"else" || mode == L"otherwise" || mode == L"always") + if(mode == "else"_u || mode == "otherwise"_u || mode == "always"_u) { - nextToken(L")"); + nextToken(")"_u); break; } } @@ -1094,29 +1087,29 @@ RTXCompiler::parseOutputCond() currentClip = clipwas; if(ret->chunks.size() == 0) { - die(L"If statement cannot be empty."); + die("If statement cannot be empty."_u); } if(ret->conds.size() == ret->nest.size()) { if(currentLoc == LocChunk && currentLocType == LocTypeMacro) { - wcerr << L"Warning: if statement without else in macro on line " << currentLine << L"." << endl; - wcerr << L" This may fail to produce output and cause crashes at runtime." << endl; + cerr << "Warning: if statement without else in macro on line " << currentLine << "." << endl; + cerr << " This may fail to produce output and cause crashes at runtime." << endl; } - //die(L"If statement has no else clause and thus could produce no output."); + //die("If statement has no else clause and thus could produce no output."_u); ret->nest.push_back(NULL); if(currentLoc == LocClip) { Clip* blank = new Clip; blank->src = 0; - blank->part = L""; + blank->part.clear(); ret->clips.push_back(blank); ret->chunks.push_back(NULL); } else { OutputChunk* temp = new OutputChunk; - temp->mode = L"[]"; + temp->mode = "[]"_u; temp->pos = 0; temp->conjoined = false; ret->chunks.push_back(temp); @@ -1133,20 +1126,20 @@ RTXCompiler::parseOutputChunk() int end; OutputChunk* ch = new OutputChunk; ch->conjoined = false; - if(nextToken(L"{", L"[") == L"{") + if(nextToken("{"_u, "["_u) == "{"_u) { currentLoc = LocChunk; - ch->mode = L"{}"; - end = L'}'; + ch->mode = "{}"_u; + end = '}'; } else { if(currentLoc != LocChunk) { - die(L"Output grouping with [] only valid inside chunks."); + die("Output grouping with [] only valid inside chunks."_u); } - ch->mode = L"[]"; - end = L']'; + ch->mode = "[]"_u; + end = ']'; } eatSpaces(); OutputChunk* chunkwas = currentChunk; @@ -1156,7 +1149,7 @@ RTXCompiler::parseOutputChunk() ch->pos = 0; while(peekchar() != end) { - if(peekchar() == L'(') + if(peekchar() == '(') { ch->children.push_back(parseOutputCond()); } @@ -1165,8 +1158,8 @@ RTXCompiler::parseOutputChunk() ch->children.push_back(chunkToCond(parseOutputElement())); } } - nextToken(wstring(1, end)); - if(end == L'}') currentLoc = LocTopLevel; + nextToken(UString(1, end)); + if(end == '}') currentLoc = LocTopLevel; eatSpaces(); currentChunk = chunkwas; currentChoice = choicewas; @@ -1174,18 +1167,18 @@ RTXCompiler::parseOutputChunk() } void -RTXCompiler::parseReduceRule(wstring output, wstring next) +RTXCompiler::parseReduceRule(UString output, UString next) { - vector outNodes; + vector outNodes; outNodes.push_back(output); - if(next != L"->") + if(next != "->"_u) { - wstring cur = next; - while(cur != L"->") + UString cur = next; + while(cur != "->"_u) { - if(SPECIAL_CHARS.find(cur) != wstring::npos) + if(SPECIAL_CHARS.find(cur) != UString::npos) { - die(L"Chunk names must be identifiers. (I think I'm parsing a reduction rule.)\nIf this error doesn't make sense to you, a common reason is that on the line before this you have ; instead of |"); + die("Chunk names must be identifiers. (I think I'm parsing a reduction rule.)\nIf this error doesn't make sense to you, a common reason is that on the line before this you have ; instead of |"_u); } outNodes.push_back(cur); cur = nextToken(); @@ -1204,11 +1197,11 @@ RTXCompiler::parseReduceRule(wstring output, wstring next) rule->line = currentLine; currentLocType = LocTypeInput; currentLoc = LocTopLevel; - if(!source.eof() && peekchar() == L'"') + if(!source.eof() && peekchar() == '"') { setUnreadMark(); - wstring nm = parseIdent(); - if(peekchar() == L'@') + UString nm = parseIdent(); + if(peekchar() == '@') { unread(); } @@ -1221,41 +1214,41 @@ RTXCompiler::parseReduceRule(wstring output, wstring next) if(isdigit(peekchar())) { rule->weight = parseWeight(); - nextToken(L":"); + nextToken(":"_u); eatSpaces(); } else { rule->weight = 0; } - while(!source.eof() && peekchar() != L'{' && peekchar() != L'(' && peekchar() != L'?') + while(!source.eof() && peekchar() != '{' && peekchar() != '(' && peekchar() != '?') { - if(peekchar() == L'[') + if(peekchar() == '[') { setUnreadMark(); getchar(); - wchar_t next = peekchar(); + UChar next = peekchar(); unread(); - if(next == L'$' || isspace(next)) break; + if(next == '$' || isspace(next)) break; } parsePatternElement(rule); } if(rule->pattern.size() == 0) { - die(L"empty pattern"); + die("empty pattern"_u); } eatSpaces(); - if(isNextToken(L'?')) + if(isNextToken('?')) { rule->cond = parseCond(); eatSpaces(); } - if(isNextToken(L'[')) + if(isNextToken('[')) { while(!source.eof()) { eatSpaces(); - if(!isNextToken(L'$')) + if(!isNextToken('$')) { unsigned int idx = 1; if(isdigit(peekchar())) @@ -1264,20 +1257,20 @@ RTXCompiler::parseReduceRule(wstring output, wstring next) } if(idx == 0 || idx > outNodes.size()) { - die(L"Chunk index for setting source or reference is out of range."); + die("Chunk index for setting source or reference is out of range."_u); } - nextToken(L"/"); - bool sl = (nextToken(L"sl", L"ref") == L"sl"); - nextToken(L"="); + nextToken("/"_u); + bool sl = (nextToken("sl"_u, "ref"_u) == "sl"_u); + nextToken("="_u); currentLoc = LocVarSet; OutputChoice* cond; - if(peekchar() == L'(') cond = parseOutputCond(); + if(peekchar() == '(') cond = parseOutputCond(); else cond = chunkToCond(parseOutputElement()); if(sl) { if(rule->output_sl[idx-1] != NULL) { - die(L"Rule sets chunk source multiple times."); + die("Rule sets chunk source multiple times."_u); } rule->output_sl[idx-1] = cond; } @@ -1285,21 +1278,21 @@ RTXCompiler::parseReduceRule(wstring output, wstring next) { if(rule->output_ref[idx-1] != NULL) { - die(L"Rule sets chunk reference multiple times."); + die("Rule sets chunk reference multiple times."_u); } rule->output_ref[idx-1] = cond; } } - else if(isNextToken(L'$')) + else if(isNextToken('$')) { - wstring var = parseIdent(); + UString var = parseIdent(); if(rule->globals.find(var) != rule->globals.end()) { - die(L"Rule sets global variable $$" + var + L" multiple times."); + die("Rule sets global variable $$"_u + var + " multiple times."_u); } - nextToken(L"="); + nextToken("="_u); currentLoc = LocVarSet; - if(peekchar() == L'(') rule->globals[var] = parseOutputCond(); + if(peekchar() == '(') rule->globals[var] = parseOutputCond(); else rule->globals[var] = chunkToCond(parseOutputElement()); currentLoc = LocTopLevel; if(globalVarNames.find(var) == globalVarNames.end()) @@ -1308,27 +1301,27 @@ RTXCompiler::parseReduceRule(wstring output, wstring next) globalVarNames[var] = temp; } } - else if(isNextToken(L'%')) + else if(isNextToken('%')) { - wstring var = parseIdent(); + UString var = parseIdent(); if(rule->stringGlobals.find(var) != rule->stringGlobals.end()) { - die(L"Rule sets global variable $%" + var + L" multiple times."); + die("Rule sets global variable $%"_u + var + " multiple times."_u); } - nextToken(L"="); + nextToken("="_u); rule->stringGlobals[var] = parseClip(); } else { - wstring var = parseIdent(); + UString var = parseIdent(); if(rule->vars.find(var) != rule->vars.end()) { - die(L"rule has multiple sources for attribute " + var); + die("rule has multiple sources for attribute "_u + var); } - nextToken(L"="); + nextToken("="_u); rule->vars[var] = parseClip(); } - if(nextToken(L",", L"]") == L"]") + if(nextToken(","_u, "]"_u) == "]"_u) { break; } @@ -1338,34 +1331,34 @@ RTXCompiler::parseReduceRule(wstring output, wstring next) currentLocType = LocTypeOutput; if(rule->result.size() > 1) { - nextToken(L"{"); + nextToken("{"_u); } unsigned int chunk_count = 0; while(chunk_count < rule->result.size()) { eatSpaces(); - if(source.eof()) die(L"Unexpected end of file."); + if(source.eof()) die("Unexpected end of file."_u); switch(peekchar()) { - case L'(': + case '(': rule->output.push_back(parseOutputCond()); chunk_count++; break; - case L'{': + case '{': rule->output.push_back(chunkToCond(parseOutputChunk())); chunk_count++; break; - case L'_': + case '_': rule->output.push_back(chunkToCond(parseOutputElement())); break; - case L'}': + case '}': if(rule->result.size() == 1) { - die(L"Unexpected } in output pattern."); + die("Unexpected } in output pattern."_u); } else if(chunk_count < rule->result.size()) { - die(L"Output pattern does not have enough chunks."); + die("Output pattern does not have enough chunks."_u); } break; default: @@ -1376,10 +1369,10 @@ RTXCompiler::parseReduceRule(wstring output, wstring next) } if(rule->result.size() > 1) { - nextToken(L"}"); + nextToken("}"_u); } reductionRules.push_back(rule); - if(nextToken(L"|", L";") == L";") + if(nextToken("|"_u, ";"_u) == ";"_u) { break; } @@ -1392,28 +1385,28 @@ RTXCompiler::processRetagRules() { for(auto rule : retagRules) { - map> vals; - wstring src = rule[0].first; - wstring dest = rule[0].second; + map> vals; + UString src = rule[0].first; + UString dest = rule[0].second; if(!PB.isAttrDefined(src) && collections.find(src) == collections.end()) { - wcerr << L"Warning: Source category for tag-rewrite rule '" << src << "' > '" << dest << "' is undefined." << endl; + cerr << "Warning: Source category for tag-rewrite rule '" << src << "' > '" << dest << "' is undefined." << endl; continue; } if(!PB.isAttrDefined(dest) && collections.find(dest) == collections.end()) { - wcerr << L"Warning: Destination category for tag-rewrite rule '" << src << "' > '" << dest << "' is undefined." << endl; + cerr << "Warning: Destination category for tag-rewrite rule '" << src << "' > '" << dest << "' is undefined." << endl; continue; } if(collections.find(src) == collections.end() || collections.find(dest) == collections.end()) continue; for(unsigned int i = 1; i < rule.size(); i++) { - if(rule[i].first[0] == L'[') + if(rule[i].first[0] == '[') { - wstring cat = rule[i].first.substr(2); + UString cat = rule[i].first.substr(2); if(collections.find(cat) == collections.end()) { - wcerr << L"Warning: Tag-rewrite rule '" << src << "' > '" << dest << "' contains mapping from undefined category '" << cat << "'." << endl; + cerr << "Warning: Tag-rewrite rule '" << src << "' > '" << dest << "' contains mapping from undefined category '" << cat << "'." << endl; continue; } for(auto v : collections[cat]) vals[v].push_back(rule[i].second); @@ -1440,14 +1433,14 @@ RTXCompiler::processRetagRules() } if(!found) { - wcerr << L"Warning: Tag-rewrite rule '" << src << "' > '" << dest << "' does not convert '" << a << "'." << endl; + cerr << "Warning: Tag-rewrite rule '" << src << "' > '" << dest << "' does not convert '" << a << "'." << endl; } } else if(vals[a].size() > 1) { - wcerr << L"Warning: Tag-rewrite rule '" << src << "' > '" << dest << "' converts '" << a << "' to multiple values: "; - for(auto b : vals[a]) wcerr << "'" << b << "', "; - wcerr << "defaulting to '" << vals[a][0] << "'." << endl; + cerr << "Warning: Tag-rewrite rule '" << src << "' > '" << dest << "' converts '" << a << "' to multiple values: "; + for(auto b : vals[a]) cerr << "'" << b << "', "; + cerr << "defaulting to '" << vals[a][0] << "'." << endl; } } } @@ -1465,24 +1458,24 @@ RTXCompiler::makePattern(int ruleid) vector> pat; for(unsigned int i = 0; i < rule->pattern.size(); i++) { - vector> tags; - tags.push_back(vector()); + vector> tags; + tags.push_back(vector()); for(unsigned int j = 1; j < rule->pattern[i].size(); j++) { - wstring tg = rule->pattern[i][j]; - if(rule->pattern[i][j][0] == L'[') + UString tg = rule->pattern[i][j]; + if(rule->pattern[i][j][0] == '[') { tg = tg.substr(1, tg.size()-2); if(collections.find(tg) == collections.end()) { - die(L"unknown attribute category '" + tg + L"'"); + die("unknown attribute category '"_u + tg + "'"_u); } - vector> tmp; + vector> tmp; for(auto tls : tags) { for(auto t : collections[tg]) { - vector tmp2; + vector tmp2; tmp2.assign(tls.begin(), tls.end()); tmp2.push_back(t); tmp.push_back(tmp2); @@ -1498,9 +1491,9 @@ RTXCompiler::makePattern(int ruleid) } } } - for(unsigned int t = 0; t < tags.size(); t++) tags[t].push_back(L"*"); - wstring lem = rule->pattern[i][0]; - if(lem.size() == 0 || lem[0] != L'$') + for(unsigned int t = 0; t < tags.size(); t++) tags[t].push_back("*"_u); + UString lem = rule->pattern[i][0]; + if(lem.size() == 0 || lem[0] != '$') { vector pel; for(auto tls : tags) @@ -1514,7 +1507,7 @@ RTXCompiler::makePattern(int ruleid) } else { - vector lems = collections[lem.substr(1)]; + vector lems = collections[lem.substr(1)]; vector el; for(unsigned int j = 0; j < lems.size(); j++) { @@ -1531,45 +1524,45 @@ RTXCompiler::makePattern(int ruleid) } if(excluded.find(rule->name) == excluded.end()) { - PB.addRule(ruleid+1, rule->weight, pat, vector(1, rule->result[0]), rule->name); + PB.addRule(ruleid+1, rule->weight, pat, vector(1, rule->result[0]), rule->name); } } -wstring -RTXCompiler::compileString(wstring s) +UString +RTXCompiler::compileString(UString s) { - wstring ret; + UString ret; ret += STRING; - ret += (wchar_t)s.size(); + ret += (UChar)s.size(); ret += s; return ret; } -wstring -RTXCompiler::compileTag(wstring s) +UString +RTXCompiler::compileTag(UString s) { if(s.size() == 0) { return compileString(s); } - wstring tag; - tag += L'<'; + UString tag; + tag += '<'; tag += s; - tag += L'>'; - return compileString(StringUtils::substitute(tag, L".", L"><")); + tag += '>'; + return compileString(StringUtils::substitute(tag, "."_u, "><"_u)); } -wstring -RTXCompiler::compileClip(Clip* c, wstring _dest = L"") +UString +RTXCompiler::compileClip(Clip* c, UString _dest = ""_u) { - if(c->src == -1 && c->part == L"lu-count") + if(c->src == -1 && c->part == "lu-count"_u) { - return wstring(1, LUCOUNT); + return UString(1, LUCOUNT); } if(c->src == -2) { - wstring ret = processOutputChoice(c->choice); - if(_dest == L"lem" || _dest == L"lemh" || _dest == L"lemq" || _dest == L"lemcase") + UString ret = processOutputChoice(c->choice); + if(_dest == "lem"_u || _dest == "lemh"_u || _dest == "lemq"_u || _dest == "lemcase"_u) { ret += DISTAG; } @@ -1579,18 +1572,18 @@ RTXCompiler::compileClip(Clip* c, wstring _dest = L"") { return PB.BCstring(c->varName) + FETCHVAR; } - if(c->src != 0 && !(c->part == L"lemcase" || + if(c->src != 0 && !(c->part == "lemcase"_u || collections.find(c->part) != collections.end() || PB.isAttrDefined(c->part))) { - die(L"Attempt to clip undefined attribute '" + c->part + L"'."); + die("Attempt to clip undefined attribute '"_u + c->part + "'."_u); } int src = (c->src == -1) ? 0 : c->src; bool useReplace = (currentLocType == LocTypeOutput); - wstring cl; + UString cl; if(src == -4) { cl += INT; - cl += globalVarNames[c->varName]; + cl += (UChar)globalVarNames[c->varName]; cl += FETCHCHUNK; } else @@ -1599,15 +1592,15 @@ RTXCompiler::compileClip(Clip* c, wstring _dest = L"") cl += src; cl += PUSHINPUT; } - if(c->part == L"whole" || c->part == L"chcontent") return cl; - cl += (c->part == L"lemcase") ? compileString(L"lem") : compileString(c->part); - wstring ret = cl; - wstring undeftag; - wstring deftag; - wstring thedefault; - wstring blank; + if(c->part == "whole"_u || c->part == "chcontent"_u) return cl; + cl += (c->part == "lemcase"_u) ? compileString("lem"_u) : compileString(c->part); + UString ret = cl; + UString undeftag; + UString deftag; + UString thedefault; + UString blank; blank += DUP; - blank += compileString(L""); + blank += compileString(""_u); blank += EQUAL; if(useReplace && undeftag.size() > 0) { @@ -1633,60 +1626,60 @@ RTXCompiler::compileClip(Clip* c, wstring _dest = L"") blank += JUMPONFALSE; if(c->src == 0) { - if(_dest == L"lem" || _dest == L"lemh" || _dest == L"lemq" || _dest == L"lemcase" || + if(_dest == "lem"_u || _dest == "lemh"_u || _dest == "lemq"_u || _dest == "lemcase"_u || (c->rewrite.size() > 0 && - (c->rewrite.back() == L"lem" || c->rewrite.back() == L"lemh" || - c->rewrite.back() == L"lemq" || c->rewrite.back() == L"lemcase"))) + (c->rewrite.back() == "lem"_u || c->rewrite.back() == "lemh"_u || + c->rewrite.back() == "lemq"_u || c->rewrite.back() == "lemcase"_u))) { return compileString(c->part); } else return compileTag(c->part); } - else if(c->side == L"sl") + else if(c->side == "sl"_u) { ret += SOURCECLIP; ret += blank; - ret += (wchar_t)thedefault.size(); + ret += (UChar)thedefault.size(); ret += thedefault; } - else if(c->side == L"ref") + else if(c->side == "ref"_u) { ret += REFERENCECLIP; ret += blank; - ret += (wchar_t)thedefault.size(); + ret += (UChar)thedefault.size(); ret += thedefault; } - else if(c->side == L"tl" || c->part == L"lemcase" || + else if(c->side == "tl"_u || c->part == "lemcase"_u || (c->src > 0 && !nodeIsSurface[currentRule->pattern[c->src-1][1]])) { ret += TARGETCLIP; ret += blank; - ret += (wchar_t)thedefault.size(); + ret += (UChar)thedefault.size(); ret += thedefault; } else { ret += TARGETCLIP; ret += blank; - ret += (wchar_t)(6 + 2*cl.size() + 2*blank.size() + thedefault.size()); + ret += (UChar)(6 + 2*cl.size() + 2*blank.size() + thedefault.size()); ret += DROP; ret += cl; ret += REFERENCECLIP; ret += blank; - ret += (wchar_t)(3 + cl.size() + blank.size() + thedefault.size()); + ret += (UChar)(3 + cl.size() + blank.size() + thedefault.size()); ret += DROP; ret += cl; ret += SOURCECLIP; ret += blank; - ret += (wchar_t)thedefault.size(); + ret += (UChar)thedefault.size(); ret += thedefault; } - if(c->part == L"lemcase") + if(c->part == "lemcase"_u) { ret += GETCASE; } - wstring src_cat = c->part; - vector rewrite = c->rewrite; + UString src_cat = c->part; + vector rewrite = c->rewrite; if(_dest.size() > 0 && rewrite.size() == 0 && currentLocType == LocTypeOutput) { rewrite.push_back(_dest); @@ -1694,7 +1687,7 @@ RTXCompiler::compileClip(Clip* c, wstring _dest = L"") for(auto dest : rewrite) { bool found = false; - vector> rule; + vector> rule; for(unsigned int i = 0; i < retagRules.size(); i++) { if(retagRules[i][0].first == src_cat && retagRules[i][0].second == dest) @@ -1706,21 +1699,21 @@ RTXCompiler::compileClip(Clip* c, wstring _dest = L"") } if(!found && dest != src_cat) { - if(dest == L"lem" || dest == L"lemh" || dest == L"lemq") + if(dest == "lem"_u || dest == "lemh"_u || dest == "lemq"_u) { ret += DISTAG; return ret; } - die(L"There is no tag-rewrite rule from '" + src_cat + L"' to '" + dest + L"'."); + die("There is no tag-rewrite rule from '"_u + src_cat + "' to '"_u + dest + "'."_u); } - wstring check; + UString check; for(unsigned int i = 1; i < rule.size(); i++) { - wstring cur; + UString cur; cur += DUP; cur += DISTAG; if(rule[i].first.size() > 2 && - rule[i].first[0] == L'[' && rule[i].first[1] == L']') + rule[i].first[0] == '[' && rule[i].first[1] == ']') { cur += compileString(rule[i].first.substr(2)); cur += IN; @@ -1731,31 +1724,31 @@ RTXCompiler::compileClip(Clip* c, wstring _dest = L"") cur += EQUAL; } cur += JUMPONFALSE; - cur += (wchar_t)(rule[i].second.size() + (i == 1 ? 5 : 7)); + cur += (UChar)(rule[i].second.size() + (i == 1 ? 5 : 7)); cur += DROP; cur += compileTag(rule[i].second); if(i != 1) { cur += JUMP; - cur += (wchar_t)check.size(); + cur += (UChar)check.size(); } check = cur + check; } ret += check; - if(dest == L"lemh" || dest == L"lem" || dest == L"lemq") + if(dest == "lemh"_u || dest == "lem"_u || dest == "lemq"_u) { if(dest != dest) ret += DISTAG; } } - if(_dest == L"lem" || _dest == L"lemh" || _dest == L"lemq" || _dest == L"lemcase") + if(_dest == "lem"_u || _dest == "lemh"_u || _dest == "lemq"_u || _dest == "lemcase"_u) { ret += DISTAG; } return ret; } -wstring -RTXCompiler::compileClip(wstring part, int pos, wstring side = L"") +UString +RTXCompiler::compileClip(UString part, int pos, UString side = ""_u) { Clip cl; cl.part = part; @@ -1791,7 +1784,7 @@ RTXCompiler::processMacroClip(Clip* mac, OutputChunk* arg) } else { - die(L"Macro not given value for attribute '" + mac->part + L"'."); + die("Macro not given value for attribute '"_u + mac->part + "'."_u); } } else ret->src = arg->pos; @@ -1839,7 +1832,7 @@ RTXCompiler::processMacroChunk(OutputChunk* mac, OutputChunk* arg) { ret->children.push_back(processMacroChoice(mac->children[i], arg)); } - for(map::iterator it = mac->vars.begin(), + for(map::iterator it = mac->vars.begin(), limit = mac->vars.end(); it != limit; it++) { ret->vars[it->first] = processMacroClip(it->second, arg); @@ -1847,7 +1840,7 @@ RTXCompiler::processMacroChunk(OutputChunk* mac, OutputChunk* arg) if(mac->pos == 1) { ret->pos = arg->pos; - for(map::iterator it = arg->vars.begin(), + for(map::iterator it = arg->vars.begin(), limit = arg->vars.end(); it != limit; it++) { if(ret->vars.find(it->first) == ret->vars.end() || arg->pos == 0) @@ -1887,29 +1880,29 @@ RTXCompiler::processMacroChoice(OutputChoice* mac, OutputChunk* arg) return ret; } -wstring +UString RTXCompiler::processOutputChunk(OutputChunk* r) { - wstring ret; + UString ret; if(r->conjoined && currentLocType == LocTypeOutput) { ret += CONJOIN; ret += OUTPUT; } - if(r->mode == L"_") + if(r->mode == "_"_u) { ret += INT; - ret += (wchar_t)r->pos; + ret += (UChar)r->pos; ret += BLANK; if(currentLocType == LocTypeOutput) { ret += OUTPUT; } } - else if(r->mode == L"$$") + else if(r->mode == "$$"_u) { ret += INT; - ret += (wchar_t)globalVarNames[r->pattern]; + ret += (UChar)globalVarNames[r->pattern]; ret += FETCHCHUNK; if(r->interpolated) ret += APPENDCHILD; if(currentLocType == LocTypeOutput) @@ -1917,39 +1910,39 @@ RTXCompiler::processOutputChunk(OutputChunk* r) ret += OUTPUT; } } - else if(r->mode == L"{}" || r->mode == L"[]" || r->mode == L"") + else if(r->mode == "{}"_u || r->mode == "[]"_u || r->mode.empty()) { for(unsigned int i = 0; i < r->children.size(); i++) { ret += processOutputChoice(r->children[i]); } } - else if(r->mode == L"#" || r->mode == L"#@") + else if(r->mode == "#"_u || r->mode == "#@"_u) { bool interp = r->pos > currentRule->pattern.size(); - wstring pos; + UString pos; if(!interp && r->pos != 0) { if(currentRule->pattern[r->pos-1].size() < 2) { - die(L"could not find tag order for element " + to_wstring(r->pos)); + die("could not find tag order for element "_u + StringUtils::itoa(r->pos)); } pos = currentRule->pattern[r->pos-1][1]; } - wstring patname = (r->pattern != L"") ? r->pattern : pos; - pos = (pos != L"") ? pos : patname; + UString patname = (r->pattern.empty()) ? pos : r->pattern; + pos = (pos.empty()) ? patname : pos; if(outputRules.find(patname) == outputRules.end()) { if(interp) { - ret += compileClip(L"whole", r->pos, L"tl"); + ret += compileClip("whole"_u, r->pos, "tl"_u); if(r->interpolated) ret += APPENDCHILD; ret += OUTPUT; return ret; } - die(L"Could not find output pattern '" + patname + L"'."); + die("Could not find output pattern '"_u + patname + "'."_u); } - vector pattern = outputRules[patname]; + vector pattern = outputRules[patname]; if(r->getall) { @@ -1968,20 +1961,20 @@ RTXCompiler::processOutputChunk(OutputChunk* r) if(r->interpolated) { ret += INT; - ret += (wchar_t)0; + ret += (UChar)0; ret += BLANK; ret += APPENDCHILD; } - if(pattern.size() == 1 && pattern[0] == L"macro") + if(pattern.size() == 1 && pattern[0] == "macro"_u) { macroNameStack.push_back(patname); ret += processOutputChoice(processMacroChoice(macros[patname], r)); macroNameStack.pop_back(); return ret; } - if(pattern.size() == 1 && pattern[0] == L"%") + if(pattern.size() == 1 && pattern[0] == "%"_u) { - ret += compileClip(L"whole", r->pos, L"tl"); + ret += compileClip("whole"_u, r->pos, "tl"_u); if(currentLocType == LocTypeOutput && !r->nextConjoined) { ret += OUTPUT; @@ -1992,12 +1985,12 @@ RTXCompiler::processOutputChunk(OutputChunk* r) { ret += CHUNK; } - if(r->mode == L"#@") + if(r->mode == "#@"_u) { unsigned int j; for(j = 0; j < r->lemma.size(); j++) { - if(r->lemma[j] == L'#') break; + if(r->lemma[j] == '#') break; } if(j < r->lemma.size()) { @@ -2005,57 +1998,57 @@ RTXCompiler::processOutputChunk(OutputChunk* r) Clip* c = new Clip; c->part = r->lemma.substr(j); c->src = 0; - c->rewrite.push_back(L"lemq"); - r->vars[L"lemq"] = c; + c->rewrite.push_back("lemq"_u); + r->vars["lemq"_u] = c; } else ret += compileString(r->lemma); } - else if(r->vars.find(L"lem") != r->vars.end()) + else if(r->vars.find("lem"_u) != r->vars.end()) { - ret += compileClip(r->vars[L"lem"], L"lem"); + ret += compileClip(r->vars["lem"_u], "lem"_u); } - else if(r->vars.find(L"lemh") != r->vars.end()) + else if(r->vars.find("lemh"_u) != r->vars.end()) { - ret += compileClip(r->vars[L"lemh"], L"lemh"); + ret += compileClip(r->vars["lemh"_u], "lemh"_u); } else if(r->pos == 0) { if(currentRule->grab_all != -1) { - ret += compileClip(L"lem", currentRule->grab_all, L"tl"); + ret += compileClip("lem"_u, currentRule->grab_all, "tl"_u); } else { - ret += compileString(L"default"); + ret += compileString("default"_u); } } else { Clip* c = new Clip; - c->part = L"lemh"; + c->part = "lemh"_u; c->src = r->pos; - c->side = L"tl"; - c->rewrite.push_back(L"lemh"); - ret += compileClip(c, L"lemh"); + c->side = "tl"_u; + c->rewrite.push_back("lemh"_u); + ret += compileClip(c, "lemh"_u); } - if(r->vars.find(L"lemcase") != r->vars.end()) + if(r->vars.find("lemcase"_u) != r->vars.end()) { - ret += compileClip(r->vars[L"lemcase"], L"lemcase"); + ret += compileClip(r->vars["lemcase"_u], "lemcase"_u); ret += SETCASE; } ret += currentSurface; for(unsigned int i = 0; i < pattern.size(); i++) { - if(pattern[i] == L"_") + if(pattern[i] == "_"_u) { - if(r->vars.find(L"pos_tag") != r->vars.end()) + if(r->vars.find("pos_tag"_u) != r->vars.end()) { - ret += compileClip(r->vars[L"pos_tag"]); + ret += compileClip(r->vars["pos_tag"_u]); } else if(r->pos != 0) { //ret += compileTag(currentRule->pattern[r->pos-1][1]); - ret += compileClip(L"pos_tag", r->pos, L"tl"); + ret += compileClip("pos_tag"_u, r->pos, "tl"_u); } else { @@ -2063,20 +2056,20 @@ RTXCompiler::processOutputChunk(OutputChunk* r) } ret += currentSurface; } - else if(pattern[i][0] == L'<') + else if(pattern[i][0] == '<') { ret += compileString(pattern[i]); ret += currentSurface; } else { - wstring ret_temp; - vector ops = altAttrs[pattern[i]]; + UString ret_temp; + vector ops = altAttrs[pattern[i]]; if(ops.size() == 0) { ops.push_back(pattern[i]); } - wstring var; + UString var; for(unsigned int v = 0; v < ops.size(); v++) { if(r->vars.find(ops[v]) != r->vars.end()) @@ -2085,7 +2078,7 @@ RTXCompiler::processOutputChunk(OutputChunk* r) break; } } - if(var == L"" && r->pos != 0) + if(var.empty() && r->pos != 0) { Clip* cl = new Clip; cl->src = r->pos; @@ -2093,14 +2086,14 @@ RTXCompiler::processOutputChunk(OutputChunk* r) if(currentLocType == LocTypeOutput) cl->rewrite.push_back(pattern[i]); ret_temp += compileClip(cl, pattern[i]); } - else if(var == L"") + else if(var.empty()) { bool found = false; for(unsigned int t = 0; t < parentTags.size(); t++) { if(parentTags[t] == pattern[i]) { - ret_temp += compileTag(to_wstring(t+1)); + ret_temp += compileTag(StringUtils::itoa(t+1)); found = true; break; } @@ -2117,7 +2110,7 @@ RTXCompiler::processOutputChunk(OutputChunk* r) } else if(r->pos == 0) { - die(L"Cannot find source for tag '" + pattern[i] + L"'."); + die("Cannot find source for tag '"_u + pattern[i] + "'."_u); } else { @@ -2131,37 +2124,37 @@ RTXCompiler::processOutputChunk(OutputChunk* r) } if(currentLocType == LocTypeOutput && noOverwrite[pattern[i]].size() > 0) { - ret += compileClip(pattern[i], r->pos, L"tl"); + ret += compileClip(pattern[i], r->pos, "tl"_u); ret += DUP; - ret += compileString(pattern[i] + L" over"); + ret += compileString(pattern[i] + " over"_u); ret += IN; ret += JUMPONTRUE; - ret += (wchar_t)(1+ret_temp.size()); + ret += (UChar)(1+ret_temp.size()); ret += DROP; } ret += ret_temp; ret += currentSurface; } } - if(r->vars.find(L"lemq") != r->vars.end()) + if(r->vars.find("lemq"_u) != r->vars.end()) { - ret += compileClip(r->vars[L"lemq"], L"lemq"); + ret += compileClip(r->vars["lemq"_u], "lemq"_u); ret += currentSurface; } else if(r->pos != 0) { - ret += compileClip(L"lemq", r->pos, L"tl"); + ret += compileClip("lemq"_u, r->pos, "tl"_u); ret += currentSurface; } if(r->pos != 0) { - ret += compileClip(L"whole", r->pos, L"tl"); + ret += compileClip("whole"_u, r->pos, "tl"_u); ret += APPENDALLCHILDREN; ret += INT; - ret += (wchar_t)r->pos; + ret += (UChar)r->pos; ret += GETRULE; ret += INT; - ret += (wchar_t)0; + ret += (UChar)0; ret += SETRULE; } if(r->interpolated) ret += APPENDCHILD; @@ -2172,7 +2165,7 @@ RTXCompiler::processOutputChunk(OutputChunk* r) else if(currentLoc == LocVarSet) { ret += INT; - ret += (wchar_t)currentVar; + ret += (UChar)currentVar; ret += SETCHUNK; } } @@ -2181,15 +2174,15 @@ RTXCompiler::processOutputChunk(OutputChunk* r) if(r->interpolated) { ret += INT; - ret += (wchar_t)0; + ret += (UChar)0; ret += BLANK; ret += APPENDCHILD; } ret += CHUNK; ret += compileString(r->lemma); - if(r->vars.find(L"lemcase") != r->vars.end()) + if(r->vars.find("lemcase"_u) != r->vars.end()) { - ret += compileClip(r->vars[L"lemcase"]); + ret += compileClip(r->vars["lemcase"_u]); ret += SETCASE; } ret += currentSurface; @@ -2209,17 +2202,17 @@ RTXCompiler::processOutputChunk(OutputChunk* r) else if(currentLoc == LocVarSet) { ret += INT; - ret += (wchar_t)currentVar; + ret += (UChar)currentVar; ret += SETCHUNK; } } return ret; } -wstring +UString RTXCompiler::processCond(Cond* cond) { - wstring ret; + UString ret; if(cond == NULL) { ret += PUSHTRUE; @@ -2229,31 +2222,31 @@ RTXCompiler::processCond(Cond* cond) { if(cond->left->op == 0 || cond->right->op == 0) { - die(L"Cannot evaluate AND with string as operand (try adding parentheses)."); + die("Cannot evaluate AND with string as operand (try adding parentheses)."_u); } } else if(cond->op == OR) { if(cond->left->op == 0 || cond->right->op == 0) { - die(L"Cannot evaluate OR with string as operand (try adding parentheses)."); + die("Cannot evaluate OR with string as operand (try adding parentheses)."_u); } } else if(cond->op == NOT) { if(cond->right->op == 0) { - die(L"Attempt to negate string value."); + die("Attempt to negate string value."_u); } } else if(cond->op != 0 && (cond->left->op != 0 || cond->right->op != 0)) { - die(L"String operator cannot take condition as operand."); + die("String operator cannot take condition as operand."_u); } else if(cond->op == EQUAL) { - wstring lit; - wstring attr; + UString lit; + UString attr; bool rew = false; Clip* l = cond->left->val; if(l->src == 0) lit = l->part; @@ -2281,7 +2274,7 @@ RTXCompiler::processCond(Cond* cond) break; } } - if(!found) die(L"'" + lit + L"' is not an element of list '" + attr + L"', so this check will always fail."); + if(!found) die("'"_u + lit + "' is not an element of list '"_u + attr + "', so this check will always fail."_u); } } if(cond->op == 0) @@ -2293,7 +2286,7 @@ RTXCompiler::processCond(Cond* cond) else { ret = compileClip(cond->val); - if(cond->val->part != L"lem" && cond->val->part != L"lemh" && cond->val->part != L"lemq") + if(cond->val->part != "lem"_u && cond->val->part != "lemh"_u && cond->val->part != "lemq"_u) { ret += DISTAG; } @@ -2313,10 +2306,10 @@ RTXCompiler::processCond(Cond* cond) return ret; } -wstring +UString RTXCompiler::processOutputChoice(OutputChoice* choice) { - wstring ret; + UString ret; if(choice->nest.back() != NULL) { ret += processOutputChoice(choice->nest.back()); @@ -2332,7 +2325,7 @@ RTXCompiler::processOutputChoice(OutputChoice* choice) int n = choice->conds.size(); for(int i = 1; i <= n; i++) { - wstring act; + UString act; if(choice->nest[n-i] != NULL) { act = processOutputChoice(choice->nest[n-i]); @@ -2346,10 +2339,10 @@ RTXCompiler::processOutputChoice(OutputChoice* choice) act = processOutputChunk(choice->chunks[n-i]); } act += JUMP; - act += (wchar_t)ret.size(); - wstring cond = processCond(choice->conds[n-i]); + act += (UChar)ret.size(); + UString cond = processCond(choice->conds[n-i]); cond += JUMPONFALSE; - cond += (wchar_t)act.size(); + cond += (UChar)act.size(); ret = cond + act + ret; } return ret; @@ -2364,21 +2357,21 @@ RTXCompiler::processRules() rule = reductionRules[ruleid]; if(summarizing) { - if(rule->name.size() > 0) wcerr << "\"" << rule->name << "\": "; - for(auto it : rule->result) wcerr << it << " "; - wcerr << "->"; - for(auto it : rule->pattern) wcerr << " " << it[1]; - wcerr << endl; + if(rule->name.size() > 0) cerr << "\"" << rule->name << "\": "; + for(auto it : rule->result) cerr << it << " "; + cerr << "->"; + for(auto it : rule->pattern) cerr << " " << it[1]; + cerr << endl; } currentRule = rule; currentChunk = NULL; currentChoice = NULL; makePattern(ruleid); - wstring comp; + UString comp; if(rule->cond != NULL) { currentLocType = LocTypeInput; - comp = processCond(rule->cond) + JUMPONTRUE + (wchar_t)1 + REJECTRULE; + comp = processCond(rule->cond) + JUMPONTRUE + (UChar)1 + REJECTRULE; } for(auto it : rule->globals) { @@ -2395,7 +2388,7 @@ RTXCompiler::processRules() comp += SETVAR; } currentLoc = LocTopLevel; - vector outcomp; + vector outcomp; outcomp.resize(rule->pattern.size()); parentTags.clear(); unsigned int patidx = 0; @@ -2403,12 +2396,12 @@ RTXCompiler::processRules() { currentLocType = LocTypeInput; OutputChoice* cur = rule->output[i]; - if(cur->chunks.size() == 1 && cur->chunks[0]->mode == L"_") + if(cur->chunks.size() == 1 && cur->chunks[0]->mode == "_"_u) { currentSurface = APPENDSURFACE; comp += processOutputChoice(cur); } - else if(cur->chunks.size() == 1 && cur->chunks[0]->mode == L"#") + else if(cur->chunks.size() == 1 && cur->chunks[0]->mode == "#"_u) { currentSurface = APPENDSURFACE; comp += processOutputChoice(cur); @@ -2417,17 +2410,17 @@ RTXCompiler::processRules() else { OutputChunk* ch = new OutputChunk; - ch->mode = L"#"; + ch->mode = "#"_u; ch->pos = 0; ch->getall = true; ch->vars = rule->vars; - if(ch->vars.find(L"lemcase") == ch->vars.end()) + if(ch->vars.find("lemcase"_u) == ch->vars.end()) { Clip* lemcase = new Clip; lemcase->src = 1; - lemcase->part = L"lemcase"; - lemcase->side = L"tl"; - ch->vars[L"lemcase"] = lemcase; + lemcase->part = "lemcase"_u; + lemcase->side = "tl"_u; + ch->vars["lemcase"_u] = lemcase; } ch->conjoined = false; ch->interpolated = false; @@ -2446,9 +2439,9 @@ RTXCompiler::processRules() comp += processOutputChoice(rule->output_ref[patidx]); } comp += INT; - comp += (wchar_t)outputBytecode.size(); + comp += (UChar)outputBytecode.size(); comp += INT; - comp += (wchar_t)0; + comp += (UChar)0; comp += SETRULE; comp += APPENDALLINPUT; parentTags = outputRules[ch->pattern]; @@ -2457,11 +2450,11 @@ RTXCompiler::processRules() outputBytecode.push_back(processOutputChoice(cur)); if(rule->name.size() > 0) { - PB.outRuleNames.push_back(rule->name + L" - line " + to_wstring(rule->line)); + PB.outRuleNames.push_back(rule->name + " - line "_u + StringUtils::itoa(rule->line)); } else { - PB.outRuleNames.push_back(L"line " + to_wstring(rule->line)); + PB.outRuleNames.push_back("line "_u + StringUtils::itoa(rule->line)); } parentTags.clear(); patidx++; @@ -2486,7 +2479,7 @@ RTXCompiler::read(const string &fname) source.open(fname); if(!source.is_open()) { - wcerr << L"Unable to open file " << fname.c_str() << " for reading." << endl; + cerr << "Unable to open file " << fname.c_str() << " for reading." << endl; exit(EXIT_FAILURE); } while(true) @@ -2502,15 +2495,13 @@ RTXCompiler::read(const string &fname) errorsAreSyntax = false; processRetagRules(); processRules(); - for(map>::iterator it=collections.begin(); it != collections.end(); ++it) - { - set vals; - for(unsigned int i = 0; i < it->second.size(); i++) - { - vals.insert(it->second[i]); + for (auto& it : collections) { + set vals; + for (auto& it2 : it.second) { + vals.insert(it2); } - PB.addList(it->first, vals); - PB.addAttr(it->first, vals); + PB.addList(it.first, vals); + PB.addAttr(it.first, vals); } } @@ -2524,18 +2515,18 @@ RTXCompiler::write(const string &fname) exit(EXIT_FAILURE); } - vector> inRules; + vector> inRules; for(unsigned int i = 0; i < reductionRules.size(); i++) { inRules.push_back(make_pair(2*reductionRules[i]->pattern.size() - 1, reductionRules[i]->compiled)); if(reductionRules[i]->name.size() > 0) { - PB.inRuleNames.push_back(reductionRules[i]->name + L" - line " + to_wstring(reductionRules[i]->line)); + PB.inRuleNames.push_back(reductionRules[i]->name + " - line "_u + StringUtils::itoa(reductionRules[i]->line)); } else { - PB.inRuleNames.push_back(L"line " + to_wstring(reductionRules[i]->line)); + PB.inRuleNames.push_back("line "_u + StringUtils::itoa(reductionRules[i]->line)); } } diff --git a/src/rtx_compiler.h b/src/rtx_compiler.h index 9bd0924..6d333c6 100644 --- a/src/rtx_compiler.h +++ b/src/rtx_compiler.h @@ -36,16 +36,16 @@ private: struct Clip { int src; - wstring part; - wstring side; - vector rewrite; + UString part; + UString side; + vector rewrite; OutputChoice* choice; - wstring varName; + UString varName; }; struct Cond { - wchar_t op; + UChar op; Clip* val; Cond* left; Cond* right; @@ -53,13 +53,13 @@ private: struct OutputChunk { - wstring mode; + UString mode; unsigned int pos; - wstring lemma; - vector tags; + UString lemma; + vector tags; bool getall; - map vars; - wstring pattern; + map vars; + UString pattern; vector children; bool conjoined; bool interpolated; @@ -79,16 +79,16 @@ private: int line; int grab_all; float weight; - wstring name; - vector> pattern; + UString name; + vector> pattern; vector output; vector output_sl; vector output_ref; - map vars; - map globals; - map stringGlobals; - vector result; - wstring compiled; + map vars; + map globals; + map stringGlobals; + vector result; + UString compiled; Cond* cond; }; @@ -121,7 +121,7 @@ private: /** * Names of rules that should be excluded from the pattern transducer */ - set excluded; + set excluded; ////////// // COLLECTIONS AND DATA STRUCTURES @@ -130,10 +130,10 @@ private: /** * All characters not allowed in identifiers */ - static wstring const SPECIAL_CHARS; + static UString const SPECIAL_CHARS; - static wstring const ANY_TAG; - static wstring const ANY_CHAR; + static UString const ANY_TAG; + static UString const ANY_CHAR; /** * Pattern-file generator @@ -143,20 +143,20 @@ private: /** * Map of names to attribute lists */ - map> collections; + map> collections; /** * Map of attribute names to default and replacement values * First value of pair is value to return if the attribute is not found * Second value is value to overwrite it with if it's still there at output */ - map> attrDefaults; + map> attrDefaults; /** * Map of attribute names to values that should never be modified * Note: This is not currently used */ - map> noOverwrite; + map> noOverwrite; /** * List of tag-replacement rules @@ -164,33 +164,33 @@ private: * Followed by some number of pair * Note: This is not currently used */ - vector>> retagRules; + vector>> retagRules; /** * Map key => [ value ] * Where key and value both name attribute lists * Where for each value, there is a tag-replacement rule from value to key */ - map> altAttrs; + map> altAttrs; /** * Map of pattern names to output patterns * Where '_' represents "lemh" and the part of speech tag * (which is usually the pattern name) * "lemq" is automatically appended to the end - * If the contents of the vector is L"macro", look at macros + * If the contents of the vector is "macro"_u, look at macros */ - map> outputRules; + map> outputRules; /** * Map of pattern names to conditioned output patterns */ - map macros; + map macros; /** * Names of global chunk-type variables and corresponding indecies */ - map globalVarNames; + map globalVarNames; /** * Map of pattern names to booleans @@ -198,7 +198,7 @@ private: * and thus all clips should be target clips * true indicates both surface only and unspecified */ - map nodeIsSurface; + map nodeIsSurface; /** * List of all reduction rules in the order they were parsed @@ -209,7 +209,7 @@ private: * List of compiled forms of output-time rules * in the order they were generated */ - vector outputBytecode; + vector outputBytecode; /** * Either the current rule being parsed or the current rule being compiled @@ -240,13 +240,13 @@ private: * Which surface of a chunk is being assigned to * one of APPENDSURFACE, APPENDSURFACESL, APPENDSURFACEREF */ - wchar_t currentSurface; + UChar currentSurface; /** * All attributes which can be clipped from the chunk whose children * are currently being compiled */ - vector parentTags; + vector parentTags; /** * Current construct being parsed or compiled @@ -266,7 +266,7 @@ private: /** * Input stream */ - wifstream source; + ifstream source; ////////// // ERROR REPORTING @@ -274,19 +274,19 @@ private: // for generating error messages int currentLine; - wstring recentlyRead; - wstring unreadbuf; + UString recentlyRead; + UString unreadbuf; int unreadmark; bool errorsAreSyntax; string sourceFile; - vector macroNameStack; + vector macroNameStack; /** * Report an error in the input file and exit * if errorsAreSyntax == true, will also print the most recently read line * with a marker of the approximate location of the error */ - void die(wstring message); + void die(UString message); ////////// // TOKENIZATION @@ -298,7 +298,7 @@ private: * to ensure that recentlyRead gets updated properly * @return character */ - wchar_t getchar(); + UChar getchar(); /** * Return the next character in the input stream without reading @@ -306,7 +306,7 @@ private: * in order to properly manage unreadbuf * @ return character */ - wchar_t peekchar(); + UChar peekchar(); /** * Mark the current location so that it can be jumped back to with unread() @@ -328,7 +328,7 @@ private: * Report a syntax error if it is preceded by spaces * @return token */ - wstring nextTokenNoSpace(); + UString nextTokenNoSpace(); /** * Parse the next token @@ -337,14 +337,14 @@ private: * report a syntax error * @return token */ - wstring nextToken(wstring check1, wstring check2); + UString nextToken(UString check1, UString check2); /** * Parse an identifier * Calls eatSpaces() beforehand if prespace == true * @return identifier */ - wstring parseIdent(bool prespace); + UString parseIdent(bool prespace); /** * Parse an integer @@ -362,7 +362,7 @@ private: * If the next character in the input stream is c, consume it and return true * Otherwise return false */ - bool isNextToken(wchar_t c); + bool isNextToken(UChar c); ////////// // COMPONENT PARSING @@ -392,106 +392,106 @@ private: /** * Convert a string to an operator * @param op - the string from the rule - * @return bytecode for corresponding operation or L'\0' if not found - */ - wchar_t lookupOperator(wstring op); - - const vector> OPERATORS = { - make_pair(L"and", AND), - make_pair(L"&", AND), - - make_pair(L"or", OR), - make_pair(L"|", OR), - - make_pair(L"not", NOT), - make_pair(L"~", NOT), - make_pair(L"⌐", NOT), - - make_pair(L"equal", EQUAL), - make_pair(L"=", EQUAL), - - make_pair(L"isprefix", ISPREFIX), - make_pair(L"startswith", ISPREFIX), - make_pair(L"beginswith", ISPREFIX), - - make_pair(L"issuffix", ISSUFFIX), - make_pair(L"endswith", ISSUFFIX), - - make_pair(L"issubstring", ISSUBSTRING), - make_pair(L"contains", ISSUBSTRING), - - make_pair(L"equalcl", EQUALCL), - make_pair(L"equalcaseless", EQUALCL), - make_pair(L"equalfold", EQUALCL), - make_pair(L"equalfoldcase", EQUALCL), - - make_pair(L"isprefixcl", ISPREFIXCL), - make_pair(L"startswithcl", ISPREFIXCL), - make_pair(L"beginswithcl", ISPREFIXCL), - make_pair(L"isprefixcaseless", ISPREFIXCL), - make_pair(L"startswithcaseless", ISPREFIXCL), - make_pair(L"beginswithcaseless", ISPREFIXCL), - make_pair(L"isprefixfold", ISPREFIXCL), - make_pair(L"startswithfold", ISPREFIXCL), - make_pair(L"beginswithfold", ISPREFIXCL), - make_pair(L"isprefixfoldcase", ISPREFIXCL), - make_pair(L"startswithfoldcase", ISPREFIXCL), - make_pair(L"beginswithfoldcase", ISPREFIXCL), - - make_pair(L"issuffixcl", ISSUFFIXCL), - make_pair(L"endswithcl", ISSUFFIXCL), - make_pair(L"issuffixcaseless", ISSUFFIXCL), - make_pair(L"endswithcaseless", ISSUFFIXCL), - make_pair(L"issuffixfold", ISSUFFIXCL), - make_pair(L"endswithfold", ISSUFFIXCL), - make_pair(L"issuffixfoldcase", ISSUFFIXCL), - make_pair(L"endswithfoldcase", ISSUFFIXCL), - - make_pair(L"issubstringcl", ISSUBSTRINGCL), - make_pair(L"issubstringcaseless", ISSUBSTRINGCL), - make_pair(L"issubstringfold", ISSUBSTRINGCL), - make_pair(L"issubstringfoldcase", ISSUBSTRINGCL), - - make_pair(L"hasprefix", HASPREFIX), - make_pair(L"startswithlist", HASPREFIX), - make_pair(L"beginswithlist", HASPREFIX), - - make_pair(L"hassuffix", HASSUFFIX), - make_pair(L"endswithlist", HASSUFFIX), - - make_pair(L"in", IN), - make_pair(L"∈", IN), - - make_pair(L"hasprefixcl", HASPREFIXCL), - make_pair(L"startswithlistcl", HASPREFIXCL), - make_pair(L"beginswithlistcl", HASPREFIXCL), - make_pair(L"hasprefixcaseless", HASPREFIXCL), - make_pair(L"startswithlistcaseless", HASPREFIXCL), - make_pair(L"beginswithlistcaseless", HASPREFIXCL), - make_pair(L"hasprefixfold", HASPREFIXCL), - make_pair(L"startswithlistfold", HASPREFIXCL), - make_pair(L"beginswithlistfold", HASPREFIXCL), - make_pair(L"hasprefixfoldcase", HASPREFIXCL), - make_pair(L"startswithlistfoldcase", HASPREFIXCL), - make_pair(L"beginswithlistfoldcase", HASPREFIXCL), - - make_pair(L"hassuffixcl", HASSUFFIXCL), - make_pair(L"endswithlistcl", HASSUFFIXCL), - make_pair(L"hassuffixcaseless", HASSUFFIXCL), - make_pair(L"endswithlistcaseless", HASSUFFIXCL), - make_pair(L"hassuffixfold", HASSUFFIXCL), - make_pair(L"endswithlistfold", HASSUFFIXCL), - make_pair(L"hassuffixfoldcase", HASSUFFIXCL), - make_pair(L"endswithlistfoldcase", HASSUFFIXCL), - - make_pair(L"incl", INCL), - make_pair(L"∈cl", INCL), // why you would want to use ∈ here I'm not sure - make_pair(L"incaseless", INCL), - make_pair(L"∈caseless", INCL), // but the documentation implies they exist - make_pair(L"infold", INCL), - make_pair(L"∈fold", INCL), // so here they are - make_pair(L"infoldcase", INCL), - make_pair(L"∈foldcase", INCL) + * @return bytecode for corresponding operation or '\0' if not found + */ + UChar lookupOperator(UString op); + + const vector> OPERATORS = { + make_pair("and"_u, AND), + make_pair("&"_u, AND), + + make_pair("or"_u, OR), + make_pair("|"_u, OR), + + make_pair("not"_u, NOT), + make_pair("~"_u, NOT), + make_pair("⌐"_u, NOT), + + make_pair("equal"_u, EQUAL), + make_pair("="_u, EQUAL), + + make_pair("isprefix"_u, ISPREFIX), + make_pair("startswith"_u, ISPREFIX), + make_pair("beginswith"_u, ISPREFIX), + + make_pair("issuffix"_u, ISSUFFIX), + make_pair("endswith"_u, ISSUFFIX), + + make_pair("issubstring"_u, ISSUBSTRING), + make_pair("contains"_u, ISSUBSTRING), + + make_pair("equalcl"_u, EQUALCL), + make_pair("equalcaseless"_u, EQUALCL), + make_pair("equalfold"_u, EQUALCL), + make_pair("equalfoldcase"_u, EQUALCL), + + make_pair("isprefixcl"_u, ISPREFIXCL), + make_pair("startswithcl"_u, ISPREFIXCL), + make_pair("beginswithcl"_u, ISPREFIXCL), + make_pair("isprefixcaseless"_u, ISPREFIXCL), + make_pair("startswithcaseless"_u, ISPREFIXCL), + make_pair("beginswithcaseless"_u, ISPREFIXCL), + make_pair("isprefixfold"_u, ISPREFIXCL), + make_pair("startswithfold"_u, ISPREFIXCL), + make_pair("beginswithfold"_u, ISPREFIXCL), + make_pair("isprefixfoldcase"_u, ISPREFIXCL), + make_pair("startswithfoldcase"_u, ISPREFIXCL), + make_pair("beginswithfoldcase"_u, ISPREFIXCL), + + make_pair("issuffixcl"_u, ISSUFFIXCL), + make_pair("endswithcl"_u, ISSUFFIXCL), + make_pair("issuffixcaseless"_u, ISSUFFIXCL), + make_pair("endswithcaseless"_u, ISSUFFIXCL), + make_pair("issuffixfold"_u, ISSUFFIXCL), + make_pair("endswithfold"_u, ISSUFFIXCL), + make_pair("issuffixfoldcase"_u, ISSUFFIXCL), + make_pair("endswithfoldcase"_u, ISSUFFIXCL), + + make_pair("issubstringcl"_u, ISSUBSTRINGCL), + make_pair("issubstringcaseless"_u, ISSUBSTRINGCL), + make_pair("issubstringfold"_u, ISSUBSTRINGCL), + make_pair("issubstringfoldcase"_u, ISSUBSTRINGCL), + + make_pair("hasprefix"_u, HASPREFIX), + make_pair("startswithlist"_u, HASPREFIX), + make_pair("beginswithlist"_u, HASPREFIX), + + make_pair("hassuffix"_u, HASSUFFIX), + make_pair("endswithlist"_u, HASSUFFIX), + + make_pair("in"_u, IN), + make_pair("∈"_u, IN), + + make_pair("hasprefixcl"_u, HASPREFIXCL), + make_pair("startswithlistcl"_u, HASPREFIXCL), + make_pair("beginswithlistcl"_u, HASPREFIXCL), + make_pair("hasprefixcaseless"_u, HASPREFIXCL), + make_pair("startswithlistcaseless"_u, HASPREFIXCL), + make_pair("beginswithlistcaseless"_u, HASPREFIXCL), + make_pair("hasprefixfold"_u, HASPREFIXCL), + make_pair("startswithlistfold"_u, HASPREFIXCL), + make_pair("beginswithlistfold"_u, HASPREFIXCL), + make_pair("hasprefixfoldcase"_u, HASPREFIXCL), + make_pair("startswithlistfoldcase"_u, HASPREFIXCL), + make_pair("beginswithlistfoldcase"_u, HASPREFIXCL), + + make_pair("hassuffixcl"_u, HASSUFFIXCL), + make_pair("endswithlistcl"_u, HASSUFFIXCL), + make_pair("hassuffixcaseless"_u, HASSUFFIXCL), + make_pair("endswithlistcaseless"_u, HASSUFFIXCL), + make_pair("hassuffixfold"_u, HASSUFFIXCL), + make_pair("endswithlistfold"_u, HASSUFFIXCL), + make_pair("hassuffixfoldcase"_u, HASSUFFIXCL), + make_pair("endswithlistfoldcase"_u, HASSUFFIXCL), + + make_pair("incl"_u, INCL), + make_pair("∈cl"_u, INCL), // why you would want to use ∈ here I'm not sure + make_pair("incaseless"_u, INCL), + make_pair("∈caseless"_u, INCL), // but the documentation implies they exist + make_pair("infold"_u, INCL), + make_pair("∈fold"_u, INCL), // so here they are + make_pair("infoldcase"_u, INCL), + make_pair("∈foldcase"_u, INCL) }; /** @@ -532,23 +532,23 @@ private: /** * Parse a tag-order rule */ - void parseOutputRule(wstring pattern); + void parseOutputRule(UString pattern); /** * Parse a tag-replacement rule * Note: these rules currently have no effect */ - void parseRetagRule(wstring srcTag); + void parseRetagRule(UString srcTag); /** * Parse an attribute category */ - void parseAttrRule(wstring name); + void parseAttrRule(UString name); /** * Parse a reduction rule and append it to reductionRules */ - void parseReduceRule(wstring firstnode, wstring next); + void parseReduceRule(UString firstnode, UString next); ////////// // ANALYSIS @@ -574,14 +574,14 @@ private: * @param s - the string * @return bytecode */ - wstring compileString(wstring s); + UString compileString(UString s); /** * Compiles a string as to a literal tag * @param s - the tag * @return bytecode */ - wstring compileTag(wstring s); + UString compileTag(UString s); /** * Compile a Clip object @@ -591,12 +591,12 @@ private: * @param dest - the destination attribute * @return bytecode */ - wstring compileClip(Clip* c, wstring dest); + UString compileClip(Clip* c, UString dest); /** * Wrapper around compileClip(Clip*) */ - wstring compileClip(wstring part, int pos, wstring side); + UString compileClip(UString part, int pos, UString side); // TODO Clip* processMacroClip(Clip* mac, OutputChunk* arg); @@ -609,28 +609,28 @@ private: * @param ch - the element * @return bytecode */ - wstring processOutputChunk(OutputChunk* ch); + UString processOutputChunk(OutputChunk* ch); /** * Compile and the output rule for a chunk * @param chunk - the chunk * @return bytecode */ - wstring processOutput(OutputChunk* chunk); + UString processOutput(OutputChunk* chunk); /** * Compile the output rule for an if statement * @param chunk - the chunk * @return bytecode */ - wstring processOutputChoice(OutputChoice* choice); + UString processOutputChoice(OutputChoice* choice); /** * Compile a Cond object * @param cond - the conditional * @return bytecode */ - wstring processCond(Cond* cond); + UString processCond(Cond* cond); /** * Iterate over reductionRules, compiling them @@ -646,7 +646,7 @@ public: { summarizing = value; } - void excludeRule(wstring name) + void excludeRule(UString name) { excluded.insert(name); } diff --git a/src/rtx_decomp.cc b/src/rtx_decomp.cc index f64b24b..9ecbc21 100644 --- a/src/rtx_decomp.cc +++ b/src/rtx_decomp.cc @@ -22,9 +22,9 @@ void endProgram(char *name) exit(EXIT_FAILURE); } -void writeRule(wstring rule, FILE* out) +void writeRule(UString rule, FILE* out) { - wstring line; + UString line; for(unsigned int i = 0; i < rule.size(); i++) { line.clear(); @@ -51,7 +51,7 @@ void writeRule(wstring rule, FILE* out) { fputwc(rule[++i], out); } - //wstring s = rule.substr(i+1, len); + //UString s = rule.substr(i+1, len); fwprintf(out, L"\"\n"); } break; @@ -278,11 +278,11 @@ int main(int argc, char *argv[]) fwprintf(out, L"Input rules:\n"); fwprintf(out, L"Longest pattern: %d chunks\nNumber of rules: %d\n\n", longestPattern, count); int patlen; - wstring cur; + UString cur; for(int i = 0; i < count; i++) { patlen = Compression::multibyte_read(in); - cur = Compression::wstring_read(in); + cur = Compression::string_read(in); fwprintf(out, L"Rule %d (%d bytes, pattern %d chunks)\n", i+1, cur.size(), patlen); writeRule(cur, out); } @@ -291,7 +291,7 @@ int main(int argc, char *argv[]) fwprintf(out, L"Output rules:\nNumber of rules: %d\n\n", count); for(int i = 0; i < count; i++) { - cur = Compression::wstring_read(in); + cur = Compression::string_read(in); fwprintf(out, L"Rule %d (%d bytes)\n", i, cur.size()); writeRule(cur, out); } diff --git a/src/rtx_proc.cc b/src/rtx_proc.cc index fb18746..63093cb 100644 --- a/src/rtx_proc.cc +++ b/src/rtx_proc.cc @@ -3,6 +3,7 @@ #include #include #include +#include void endProgram(char *name) { @@ -147,23 +148,24 @@ int main(int argc, char *argv[]) p.read(argv[optind]); - FILE *input = stdin, *output = stdout; + FILE *input = stdin; + UFILE* output = u_finit(stdout, NULL, NULL); if(optind <= (argc - 2)) { input = fopen(argv[optind+1], "rb"); if(input == NULL) { - wcerr << "Unable to open " << argv[optind+1] << " for reading." << endl; + cerr << "Unable to open " << argv[optind+1] << " for reading." << endl; exit(EXIT_FAILURE); } } if(optind <= (argc - 3)) { - output = fopen(argv[optind+2], "wb"); + output = u_fopen(argv[optind+2], "w", NULL, NULL); if(input == NULL) { - wcerr << "Unable to open " << argv[optind+2] << " for writing." << endl; + cerr << "Unable to open " << argv[optind+2] << " for writing." << endl; exit(EXIT_FAILURE); } } @@ -171,6 +173,6 @@ int main(int argc, char *argv[]) p.process(input, output); fclose(input); - fclose(output); + u_fclose(output); return EXIT_SUCCESS; } diff --git a/src/rtx_processor.cc b/src/rtx_processor.cc index f2056a8..65532b4 100644 --- a/src/rtx_processor.cc +++ b/src/rtx_processor.cc @@ -1,35 +1,17 @@ #include #include #include -#include -#include +//#include #include #include -#include +#include //#include -using namespace Apertium; using namespace std; RTXProcessor::RTXProcessor() { - furtherInput = true; - inword = false; - inwblank = false; - printingSteps = false; - printingRules = false; - printingBranches = false; - printingAll = false; - noCoref = true; - isLinear = false; - null_flush = false; - printingTrees = false; - printingText = true; - treePrintMode = TreeModeNest; - newBranchId = 0; - noFilter = true; - currentBranch = NULL; } RTXProcessor::~RTXProcessor() @@ -43,7 +25,7 @@ RTXProcessor::read(string const &filename) FILE *in = fopen(filename.c_str(), "rb"); if(in == NULL) { - wcerr << "Unable to open file " << filename.c_str() << endl; + cerr << "Unable to open file " << filename.c_str() << endl; exit(EXIT_FAILURE); } @@ -54,13 +36,13 @@ RTXProcessor::read(string const &filename) for(int i = 0; i < count; i++) { pat_size.push_back(Compression::multibyte_read(in)); - rule_map.push_back(Compression::wstring_read(in)); + rule_map.push_back(Compression::string_read(in)); } count = Compression::multibyte_read(in); output_rules.reserve(count); for(int i = 0; i < count; i++) { - output_rules.push_back(Compression::wstring_read(in)); + output_rules.push_back(Compression::string_read(in)); } varCount = Compression::multibyte_read(in); @@ -88,32 +70,35 @@ RTXProcessor::read(string const &filename) delete t; // attr_items - bool recompile_attrs = Compression::string_read(in) != string(pcre_version()); + bool recompile_attrs = !Compression::string_read(in).empty(); for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - wstring const cad_k = Compression::wstring_read(in); + UString const cad_k = Compression::string_read(in); attr_items[cad_k].read(in); - wstring fallback = Compression::wstring_read(in); - if(recompile_attrs) { - attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); + UString fallback = Compression::string_read(in); + if (recompile_attrs && cad_k == "chname"_u) { + // chname was previously "({([^/]+)\\/)" + // which was fine for PCRE, but ICU chokes on the unmatched bracket + fallback = "(\\{([^/]+)\\/)"_u; } + attr_items[cad_k].compile(fallback); } // variables for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - wstring const cad_k = Compression::wstring_read(in); - variables[cad_k] = Compression::wstring_read(in); + UString const cad_k = Compression::string_read(in); + variables[cad_k] = Compression::string_read(in); } // lists for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - wstring const cad_k = Compression::wstring_read(in); + UString const cad_k = Compression::string_read(in); for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) { - wstring const cad_v = Compression::wstring_read(in); + UString const cad_v = Compression::string_read(in); lists[cad_k].insert(cad_v); listslow[cad_k].insert(StringUtils::tolower(cad_v)); } @@ -122,19 +107,19 @@ RTXProcessor::read(string const &filename) int nameCount = Compression::multibyte_read(in); for(int i = 0; i < nameCount; i++) { - inRuleNames.push_back(Compression::wstring_read(in)); + inRuleNames.push_back(Compression::string_read(in)); } nameCount = Compression::multibyte_read(in); for(int i = 0; i < nameCount; i++) { - outRuleNames.push_back(Compression::wstring_read(in)); + outRuleNames.push_back(Compression::string_read(in)); } fclose(in); } bool -RTXProcessor::beginsWith(wstring const &s1, wstring const &s2) const +RTXProcessor::beginsWith(UString const &s1, UString const &s2) const { int const limit = s2.size(), constraint = s1.size(); @@ -154,7 +139,7 @@ RTXProcessor::beginsWith(wstring const &s1, wstring const &s2) const } bool -RTXProcessor::endsWith(wstring const &s1, wstring const &s2) const +RTXProcessor::endsWith(UString const &s1, UString const &s2) const { int const limit = s2.size(), constraint = s1.size(); @@ -173,43 +158,6 @@ RTXProcessor::endsWith(wstring const &s1, wstring const &s2) const return true; } -wstring -RTXProcessor::copycase(wstring const &source_word, wstring const &target_word) -{ - wstring result; - - bool firstupper = iswupper(source_word[0]); - bool uppercase = firstupper && iswupper(source_word[source_word.size()-1]); - bool sizeone = source_word.size() == 1; - - if(!uppercase || (sizeone && uppercase)) - { - if(isLinear) - { - result = target_word; - result[0] = towlower(result[0]); - } - else result = StringUtils::tolower(target_word); - } - else - { - result = StringUtils::toupper(target_word); - } - - if(firstupper) - { - result[0] = towupper(result[0]); - } - - return result; -} - -wstring -RTXProcessor::caseOf(wstring const &s) -{ - return copycase(s, wstring(L"aa")); -} - inline bool RTXProcessor::popBool() { @@ -219,7 +167,7 @@ RTXProcessor::popBool() } else { - wcerr << "tried to pop bool but mode is " << theStack[stackIdx].mode << endl; + cerr << "tried to pop bool but mode is " << theStack[stackIdx].mode << endl; exit(1); } } @@ -233,12 +181,12 @@ RTXProcessor::popInt() } else { - wcerr << "tried to pop int but mode is " << theStack[stackIdx].mode << endl; + cerr << "tried to pop int but mode is " << theStack[stackIdx].mode << endl; exit(1); } } -inline wstring +inline UString RTXProcessor::popString() { if(theStack[stackIdx].mode == 2) @@ -251,13 +199,13 @@ RTXProcessor::popString() } else { - wcerr << "tried to pop wstring but mode is " << theStack[stackIdx].mode << endl; + cerr << "tried to pop UString but mode is " << theStack[stackIdx].mode << endl; exit(1); } } inline void -RTXProcessor::popString(wstring& dest) +RTXProcessor::popString(UString& dest) { if(theStack[stackIdx].mode == 2) { @@ -269,7 +217,7 @@ RTXProcessor::popString(wstring& dest) } else { - wcerr << "tried to pop wstring but mode is " << theStack[stackIdx].mode << endl; + cerr << "tried to pop UString but mode is " << theStack[stackIdx].mode << endl; exit(1); } } @@ -283,8 +231,8 @@ RTXProcessor::popChunk() } else { - wcerr << "tried to pop Chunk but mode is " << theStack[stackIdx].mode << endl; - wcerr << "The most common reason for getting this error is a macro that is missing an else clause." << endl; + cerr << "tried to pop Chunk but mode is " << theStack[stackIdx].mode << endl; + cerr << "The most common reason for getting this error is a macro that is missing an else clause." << endl; exit(1); } } @@ -312,43 +260,43 @@ RTXProcessor::stackCopy(int src, int dest) theWblankStack[dest] = theWblankStack[src]; break; default: - wcerr << "Unknown StackElement mode " << theStack[src].mode; + cerr << "Unknown StackElement mode " << theStack[src].mode; break; } } bool -RTXProcessor::gettingLemmaFromWord(wstring attr) +RTXProcessor::gettingLemmaFromWord(UString attr) { - return (attr.compare(L"lem") == 0 || attr.compare(L"lemh") == 0 || attr.compare(L"whole") == 0); + return (attr.compare("lem"_u) == 0 || attr.compare("lemh"_u) == 0 || attr.compare("whole"_u) == 0); } bool -RTXProcessor::applyRule(const wstring& rule) +RTXProcessor::applyRule(const UString& rule) { stackIdx = 0; vector editted = vector(currentInput.size(), false); - const wchar_t* rule_data = rule.data(); + const UChar* rule_data = rule.data(); for(unsigned int i = 0, rule_size = rule.size(); i < rule_size; i++) { switch(rule_data[i]) { case DROP: - if(printingSteps) { wcerr << "drop" << endl; } + if(printingSteps) { cerr << "drop" << endl; } stackIdx--; break; case DUP: - if(printingSteps) { wcerr << "dup" << endl; } + if(printingSteps) { cerr << "dup" << endl; } stackCopy(stackIdx, stackIdx+1); stackIdx++; break; case OVER: - if(printingSteps) { wcerr << "over" << endl; } + if(printingSteps) { cerr << "over" << endl; } stackCopy(stackIdx-1, stackIdx+1); stackIdx++; break; case SWAP: - if(printingSteps) { wcerr << "swap" << endl; } + if(printingSteps) { cerr << "swap" << endl; } { stackCopy(stackIdx, stackIdx+1); stackCopy(stackIdx-1, stackIdx); @@ -357,67 +305,67 @@ RTXProcessor::applyRule(const wstring& rule) break; case STRING: { - if(printingSteps) { wcerr << "string" << endl; } + if(printingSteps) { cerr << "string" << endl; } int ct = rule_data[++i]; stackIdx++; theStack[stackIdx].mode = 2; theStack[stackIdx].s.assign(rule, i+1, ct); //pushStack(rule.substr(i+1, ct)); i += ct; - if(printingSteps) { wcerr << " -> " << theStack[stackIdx].s << endl; } + if(printingSteps) { cerr << " -> " << theStack[stackIdx].s << endl; } } break; case INT: - if(printingSteps) { wcerr << "int " << (int)rule[i+1] << endl; } + if(printingSteps) { cerr << "int " << (int)rule[i+1] << endl; } pushStack((int)rule_data[++i]); break; case PUSHFALSE: - if(printingSteps) { wcerr << "pushfalse" << endl; } + if(printingSteps) { cerr << "pushfalse" << endl; } pushStack(false); break; case PUSHTRUE: - if(printingSteps) { wcerr << "pushtrue" << endl; } + if(printingSteps) { cerr << "pushtrue" << endl; } pushStack(true); break; case PUSHNULL: - if(printingSteps) { wcerr << "pushnull" << endl; } + if(printingSteps) { cerr << "pushnull" << endl; } pushStack((Chunk*)NULL); break; case JUMP: - if(printingSteps) { wcerr << "jump" << endl; } + if(printingSteps) { cerr << "jump" << endl; } ++i; i += rule_data[i]; break; case JUMPONTRUE: - if(printingSteps) { wcerr << "jumpontrue" << endl; } + if(printingSteps) { cerr << "jumpontrue" << endl; } if(!popBool()) { i++; - if(printingSteps) { wcerr << " -> false" << endl; } + if(printingSteps) { cerr << " -> false" << endl; } } else { ++i; i += rule_data[i]; - if(printingSteps) { wcerr << " -> true, jumping" << endl; } + if(printingSteps) { cerr << " -> true, jumping" << endl; } } break; case JUMPONFALSE: - if(printingSteps) { wcerr << "jumponfalse" << endl; } + if(printingSteps) { cerr << "jumponfalse" << endl; } if(popBool()) { i++; - if(printingSteps) { wcerr << " -> true" << endl; } + if(printingSteps) { cerr << " -> true" << endl; } } else { ++i; i += rule_data[i]; - if(printingSteps) { wcerr << " -> false, jumping" << endl; } + if(printingSteps) { cerr << " -> false, jumping" << endl; } } break; case AND: - if(printingSteps) { wcerr << "and" << endl; } + if(printingSteps) { cerr << "and" << endl; } { bool a = popBool(); bool b = popBool(); @@ -425,7 +373,7 @@ RTXProcessor::applyRule(const wstring& rule) } break; case OR: - if(printingSteps) { wcerr << "or" << endl; } + if(printingSteps) { cerr << "or" << endl; } { bool a = popBool(); bool b = popBool(); @@ -433,16 +381,16 @@ RTXProcessor::applyRule(const wstring& rule) } break; case NOT: - if(printingSteps) { wcerr << "not" << endl; } + if(printingSteps) { cerr << "not" << endl; } theStack[stackIdx].b = !theStack[stackIdx].b; break; case EQUAL: case EQUALCL: - if(printingSteps) { wcerr << "equal" << endl; } + if(printingSteps) { cerr << "equal" << endl; } { - wstring a; + UString a; popString(a); - wstring b; + UString b; popString(b); if(rule_data[i] == EQUALCL) { @@ -450,15 +398,15 @@ RTXProcessor::applyRule(const wstring& rule) b = StringUtils::tolower(b); } pushStack(a == b); - if(printingSteps) { wcerr << " -> " << (a == b ? "true" : "false") << endl; } + if(printingSteps) { cerr << " -> " << (a == b ? "true" : "false") << endl; } } break; case ISPREFIX: case ISPREFIXCL: - if(printingSteps) { wcerr << "isprefix" << endl; } + if(printingSteps) { cerr << "isprefix" << endl; } { - wstring substr = popString(); - wstring str = popString(); + UString substr = popString(); + UString str = popString(); if(rule[i] == ISPREFIXCL) { pushStack(beginsWith(StringUtils::tolower(str), StringUtils::tolower(substr))); @@ -471,10 +419,10 @@ RTXProcessor::applyRule(const wstring& rule) break; case ISSUFFIX: case ISSUFFIXCL: - if(printingSteps) { wcerr << "issuffix" << endl; } + if(printingSteps) { cerr << "issuffix" << endl; } { - wstring substr = popString(); - wstring str = popString(); + UString substr = popString(); + UString str = popString(); if(rule[i] == ISSUFFIXCL) { pushStack(endsWith(StringUtils::tolower(str), StringUtils::tolower(substr))); @@ -487,11 +435,11 @@ RTXProcessor::applyRule(const wstring& rule) break; case HASPREFIX: case HASPREFIXCL: - if(printingSteps) { wcerr << "hasprefix" << endl; } + if(printingSteps) { cerr << "hasprefix" << endl; } { - wstring list = popString(); - wstring needle = popString(); - set::iterator it, limit; + UString list = popString(); + UString needle = popString(); + set::iterator it, limit; if(rule[i] == HASPREFIX) { @@ -519,11 +467,11 @@ RTXProcessor::applyRule(const wstring& rule) break; case HASSUFFIX: case HASSUFFIXCL: - if(printingSteps) { wcerr << "hassuffix" << endl; } + if(printingSteps) { cerr << "hassuffix" << endl; } { - wstring list = popString(); - wstring needle = popString(); - set::iterator it, limit; + UString list = popString(); + UString needle = popString(); + set::iterator it, limit; if(rule[i] == HASSUFFIX) { @@ -551,50 +499,50 @@ RTXProcessor::applyRule(const wstring& rule) break; case ISSUBSTRING: case ISSUBSTRINGCL: - if(printingSteps) { wcerr << "issubstring" << endl; } + if(printingSteps) { cerr << "issubstring" << endl; } { - wstring needle = popString(); - wstring haystack = popString(); + UString needle = popString(); + UString haystack = popString(); if(rule[i] == ISSUBSTRINGCL) { needle = StringUtils::tolower(needle); haystack = StringUtils::tolower(haystack); } - pushStack(haystack.find(needle) != wstring::npos); + pushStack(haystack.find(needle) != UString::npos); } break; case IN: case INCL: - if(printingSteps) { wcerr << "in" << endl; } + if(printingSteps) { cerr << "in" << endl; } { - wstring list = popString(); - wstring str = popString(); + UString list = popString(); + UString str = popString(); if(rule[i] == INCL) { str = StringUtils::tolower(str); - set &myset = listslow[list]; + set &myset = listslow[list]; pushStack(myset.find(str) != myset.end()); } else { - set &myset = lists[list]; + set &myset = lists[list]; pushStack(myset.find(str) != myset.end()); } } break; case SETVAR: - if(printingSteps) { wcerr << "setvar" << endl; } + if(printingSteps) { cerr << "setvar" << endl; } { - wstring var = popString(); - wstring val = popString(); + UString var = popString(); + UString val = popString(); currentBranch->stringVars[var] = val; currentBranch->wblankVars[var] = theWblankStack[stackIdx+1]; theWblankStack[stackIdx+1].clear(); - if(printingSteps) { wcerr << " -> " << var << " = '" << val << "'" << endl; } + if(printingSteps) { cerr << " -> " << var << " = '" << val << "'" << endl; } } break; case OUTPUT: - if(printingSteps) { wcerr << "output" << endl; } + if(printingSteps) { cerr << "output" << endl; } { Chunk* ch = popChunk(); if(ch == NULL) break; // FETCHCHUNK @@ -602,14 +550,14 @@ RTXProcessor::applyRule(const wstring& rule) { bool word = true; unsigned int last = 0; - const wchar_t* targ = ch->target.data(); + const UChar* targ = ch->target.data(); bool chunk = false; for(unsigned int c = 0, limit = ch->target.size(); c < limit; c++) { - if(targ[c] == L'\\') c++; - else if((targ[c] == L'{' || targ[c] == L'$') && word) + if(targ[c] == '\\') c++; + else if((targ[c] == '{' || targ[c] == '$') && word) { - if(targ[c] == L'{') chunk = true; + if(targ[c] == '{') chunk = true; Chunk* temp = chunkPool.next(); temp->isBlank = false; temp->target = ch->target.substr(last, c-last); @@ -620,7 +568,7 @@ RTXProcessor::applyRule(const wstring& rule) last = c+1; word = false; } - else if((targ[c] == L'^' || targ[c] == L'}') && !word) + else if((targ[c] == '^' || targ[c] == '}') && !word) { if(c > last) { @@ -630,7 +578,7 @@ RTXProcessor::applyRule(const wstring& rule) if(chunk) currentOutput.back()->contents.push_back(temp); else currentOutput.push_back(temp); } - if(targ[c] == L'}') chunk = false; + if(targ[c] == '}') chunk = false; last = c+1; word = true; } @@ -656,12 +604,12 @@ RTXProcessor::applyRule(const wstring& rule) } break; case OUTPUTALL: - if(printingSteps) { wcerr << "outputall" << endl; } + if(printingSteps) { cerr << "outputall" << endl; } currentOutput = currentInput; return true; break; case PUSHINPUT: - if(printingSteps) { wcerr << "pushinput" << endl; } + if(printingSteps) { cerr << "pushinput" << endl; } { int loc = popInt(); int pos = 2*(loc-1); @@ -682,7 +630,7 @@ RTXProcessor::applyRule(const wstring& rule) } if(ch == NULL) { - //wcerr << L"Clip index is out of bounds." << endl; + //cerr << "Clip index is out of bounds." << endl; //exit(EXIT_FAILURE); ch = currentInput.back(); } @@ -691,12 +639,12 @@ RTXProcessor::applyRule(const wstring& rule) } break; case SOURCECLIP: - if(printingSteps) { wcerr << "sourceclip" << endl; } + if(printingSteps) { cerr << "sourceclip" << endl; } { - wstring part; + UString part; popString(part); Chunk* ch = popChunk(); - if(ch == NULL) pushStack(L""); + if(ch == NULL) pushStack(""); else { if(gettingLemmaFromWord(part)) @@ -708,16 +656,16 @@ RTXProcessor::applyRule(const wstring& rule) pushStack(ch->chunkPart(attr_items[part], SourceClip)); } } - if(printingSteps) { wcerr << " -> " << theStack[stackIdx].s << endl; } + if(printingSteps) { cerr << " -> " << theStack[stackIdx].s << endl; } } break; case TARGETCLIP: - if(printingSteps) { wcerr << "targetclip" << endl; } + if(printingSteps) { cerr << "targetclip" << endl; } { - wstring part; + UString part; popString(part); Chunk* ch = popChunk(); - if(ch == NULL) pushStack(L""); + if(ch == NULL) pushStack(""); else { if(gettingLemmaFromWord(part)) @@ -729,25 +677,25 @@ RTXProcessor::applyRule(const wstring& rule) pushStack(ch->chunkPart(attr_items[part], TargetClip)); } } - if(printingSteps) { wcerr << " -> " << theStack[stackIdx].s << endl; } + if(printingSteps) { cerr << " -> " << theStack[stackIdx].s << endl; } } break; case REFERENCECLIP: - if(printingSteps) { wcerr << "referenceclip" << endl; } + if(printingSteps) { cerr << "referenceclip" << endl; } { - wstring part; + UString part; popString(part); Chunk* ch = popChunk(); - if(ch == NULL) pushStack(L""); + if(ch == NULL) pushStack(""); else pushStack(ch->chunkPart(attr_items[part], ReferenceClip)); - if(printingSteps) { wcerr << " -> " << theStack[stackIdx].s << endl; } + if(printingSteps) { cerr << " -> " << theStack[stackIdx].s << endl; } } break; case SETCLIP: - if(printingSteps) { wcerr << "setclip" << endl; } + if(printingSteps) { cerr << "setclip" << endl; } { int pos = 2*(popInt()-1); - wstring part = popString(); + UString part = popString(); if(pos >= 0) { if(!editted[pos]) @@ -756,7 +704,7 @@ RTXProcessor::applyRule(const wstring& rule) editted[pos] = true; } currentInput[pos]->setChunkPart(attr_items[part], popString()); - if(printingSteps) { wcerr << " -> " << currentInput[pos]->target << endl; } + if(printingSteps) { cerr << " -> " << currentInput[pos]->target << endl; } } else { @@ -765,46 +713,46 @@ RTXProcessor::applyRule(const wstring& rule) } break; case FETCHVAR: - if(printingSteps) { wcerr << "fetchvar" << endl; } + if(printingSteps) { cerr << "fetchvar" << endl; } { - wstring name = popString(); - wstring val = currentBranch->stringVars[name]; - wstring wblank_val = currentBranch->wblankVars[name]; + UString name = popString(); + UString val = currentBranch->stringVars[name]; + UString wblank_val = currentBranch->wblankVars[name]; pushStack(val, wblank_val); - if(printingSteps) { wcerr << " -> " << name << " = " << val << endl; } + if(printingSteps) { cerr << " -> " << name << " = " << val << endl; } } break; case FETCHCHUNK: - if(printingSteps) { wcerr << "fetchchunk" << endl; } + if(printingSteps) { cerr << "fetchchunk" << endl; } pushStack(currentBranch->chunkVars[popInt()]); break; case SETCHUNK: - if(printingSteps) { wcerr << "setchunk" << endl; } + if(printingSteps) { cerr << "setchunk" << endl; } { int pos = popInt(); currentBranch->chunkVars[pos] = popChunk(); } break; case GETCASE: - if(printingSteps) { wcerr << "getcase" << endl; } - pushStack(caseOf(popString())); - if(printingSteps) { wcerr << " -> " << theStack[stackIdx].s << endl; } + if(printingSteps) { cerr << "getcase" << endl; } + pushStack(StringUtils::getcase(popString())); + if(printingSteps) { cerr << " -> " << theStack[stackIdx].s << endl; } break; case SETCASE: - if(printingSteps) { wcerr << "setcase" << endl; } + if(printingSteps) { cerr << "setcase" << endl; } { - wstring src = popString(); - wstring dest = popString(); - pushStack(copycase(src, dest)); + UString src = popString(); + UString dest = popString(); + pushStack(StringUtils::copycase(src, dest)); } - if(printingSteps) { wcerr << " -> " << theStack[stackIdx].s << endl; } + if(printingSteps) { cerr << " -> " << theStack[stackIdx].s << endl; } break; case CONCAT: - if(printingSteps) { wcerr << "concat" << endl; } + if(printingSteps) { cerr << "concat" << endl; } { if(theStack[stackIdx].mode != 2 || theStack[stackIdx-1].mode != 2) { - wcerr << L"Cannot CONCAT non-strings." << endl; + cerr << "Cannot CONCAT non-strings." << endl; exit(EXIT_FAILURE); } stackIdx--; @@ -812,7 +760,7 @@ RTXProcessor::applyRule(const wstring& rule) } break; case CHUNK: - if(printingSteps) { wcerr << "chunk" << endl; } + if(printingSteps) { cerr << "chunk" << endl; } { Chunk* ch = chunkPool.next(); ch->isBlank = false; @@ -820,15 +768,15 @@ RTXProcessor::applyRule(const wstring& rule) } break; case APPENDCHILD: - if(printingSteps) { wcerr << "appendchild" << endl; } + if(printingSteps) { cerr << "appendchild" << endl; } { Chunk* kid = popChunk(); - if(isLinear && kid->target[0] == L'^') + if(isLinear && kid->target[0] == '^') { unsigned int j = 0; for(; j < kid->target.size(); j++) { - if(kid->target[j] == L'$') break; + if(kid->target[j] == '$') break; } Chunk* ch = chunkPool.next(); ch->isBlank = false; @@ -847,21 +795,21 @@ RTXProcessor::applyRule(const wstring& rule) out_wblank.clear(); theStack[stackIdx].c->contents.push_back(kid); } - if(printingSteps) { wcerr << " -> child with surface '" << kid->target << L"' appended" << endl; } + if(printingSteps) { cerr << " -> child with surface '" << kid->target << "' appended" << endl; } } break; case APPENDSURFACE: - if(printingSteps) { wcerr << "appendsurface" << endl; } + if(printingSteps) { cerr << "appendsurface" << endl; } { if(theStack[stackIdx].mode != 2 && theStack[stackIdx].mode != 3) { - wcerr << L"Cannot append non-string to chunk surface." << endl; + cerr << "Cannot append non-string to chunk surface." << endl; exit(EXIT_FAILURE); } stackIdx--; if(theStack[stackIdx].mode != 3) { - wcerr << L"Cannot APPENDSURFACE to non-chunk." << endl; + cerr << "Cannot APPENDSURFACE to non-chunk." << endl; exit(EXIT_FAILURE); } if(theStack[stackIdx+1].mode == 2) @@ -875,21 +823,21 @@ RTXProcessor::applyRule(const wstring& rule) theStack[stackIdx].c->target += theStack[stackIdx+1].c->target; theStack[stackIdx].c->wblank += theStack[stackIdx+1].c->wblank; } - if(printingSteps) { wcerr << " -> " << theStack[stackIdx+1].s << endl; } + if(printingSteps) { cerr << " -> " << theStack[stackIdx+1].s << endl; } } break; case APPENDSURFACESL: - if(printingSteps) { wcerr << "appendsurfacesl" << endl; } + if(printingSteps) { cerr << "appendsurfacesl" << endl; } { if(theStack[stackIdx].mode != 2 && theStack[stackIdx].mode != 3) { - wcerr << L"Cannot append non-string to chunk surface." << endl; + cerr << "Cannot append non-string to chunk surface." << endl; exit(EXIT_FAILURE); } stackIdx--; if(theStack[stackIdx].mode != 3) { - wcerr << L"Cannot APPENDSURFACESL to non-chunk." << endl; + cerr << "Cannot APPENDSURFACESL to non-chunk." << endl; exit(EXIT_FAILURE); } if(theStack[stackIdx+1].mode == 2) @@ -903,21 +851,21 @@ RTXProcessor::applyRule(const wstring& rule) theStack[stackIdx].c->source += theStack[stackIdx+1].c->source; theStack[stackIdx].c->wblank += theStack[stackIdx+1].c->wblank; } - if(printingSteps) { wcerr << " -> " << theStack[stackIdx+1].s << endl; } + if(printingSteps) { cerr << " -> " << theStack[stackIdx+1].s << endl; } } break; case APPENDSURFACEREF: - if(printingSteps) { wcerr << "appendsurfaceref" << endl; } + if(printingSteps) { cerr << "appendsurfaceref" << endl; } { if(theStack[stackIdx].mode != 2 && theStack[stackIdx].mode != 3) { - wcerr << L"Cannot append non-string to chunk surface." << endl; + cerr << "Cannot append non-string to chunk surface." << endl; exit(EXIT_FAILURE); } stackIdx--; if(theStack[stackIdx].mode != 3) { - wcerr << L"Cannot APPENDSURFACEREF to non-chunk." << endl; + cerr << "Cannot APPENDSURFACEREF to non-chunk." << endl; exit(EXIT_FAILURE); } if(theStack[stackIdx+1].mode == 2) @@ -928,11 +876,11 @@ RTXProcessor::applyRule(const wstring& rule) { theStack[stackIdx].c->coref += theStack[stackIdx+1].c->coref; } - if(printingSteps) { wcerr << " -> " << theStack[stackIdx+1].s << endl; } + if(printingSteps) { cerr << " -> " << theStack[stackIdx+1].s << endl; } } break; case APPENDALLCHILDREN: - if(printingSteps) { wcerr << "appendallchildren" << endl; } + if(printingSteps) { cerr << "appendallchildren" << endl; } { Chunk* ch = popChunk(); for(unsigned int k = 0; k < ch->contents.size(); k++) @@ -942,20 +890,20 @@ RTXProcessor::applyRule(const wstring& rule) } break; case APPENDALLINPUT: - if(printingSteps) { wcerr << "appendallinput" << endl; } + if(printingSteps) { cerr << "appendallinput" << endl; } { vector& vec = theStack[stackIdx].c->contents; vec.insert(vec.end(), currentInput.begin(), currentInput.end()); } break; case BLANK: - if(printingSteps) { wcerr << "blank" << endl; } + if(printingSteps) { cerr << "blank" << endl; } { int loc = 2*(popInt()-1) + 1; if(loc == -1) { Chunk* ch = chunkPool.next(); - ch->target = L" "; + ch->target = " "_u; ch->isBlank = true; pushStack(ch); } @@ -966,43 +914,43 @@ RTXProcessor::applyRule(const wstring& rule) } break; case CONJOIN: - if(printingSteps) { wcerr << "conjoin" << endl; } + if(printingSteps) { cerr << "conjoin" << endl; } { Chunk* join = chunkPool.next(); join->isBlank = true; join->isJoiner = true; - join->target = L"+"; + join->target = "+"_u; pushStack(join); } break; case REJECTRULE: - if(printingSteps) { wcerr << "rejectrule" << endl; } + if(printingSteps) { cerr << "rejectrule" << endl; } return false; break; case DISTAG: - if(printingSteps) { wcerr << "distag" << endl; } + if(printingSteps) { cerr << "distag" << endl; } { if(theStack[stackIdx].mode != 2) { - wcerr << L"Cannot DISTAG non-string." << endl; + cerr << "Cannot DISTAG non-string." << endl; exit(EXIT_FAILURE); } - wstring& s = theStack[stackIdx].s; - if(s.size() > 0 && s[0] == L'<' && s[s.size()-1] == L'>') + UString& s = theStack[stackIdx].s; + if(s.size() > 0 && s[0] == '<' && s[s.size()-1] == '>') { - s = StringUtils::substitute(s.substr(1, s.size()-2), L"><", L"."); + s = StringUtils::substitute(s.substr(1, s.size()-2), "><"_u, "."_u); } } break; case GETRULE: - if(printingSteps) { wcerr << "getrule" << endl; } + if(printingSteps) { cerr << "getrule" << endl; } { int pos = 2*(popInt()-1); pushStack(currentInput[pos]->rule); } break; case SETRULE: - if(printingSteps) { wcerr << "setrule" << endl; } + if(printingSteps) { cerr << "setrule" << endl; } { int pos = 2*(popInt()-1); int rl = popInt(); @@ -1010,9 +958,9 @@ RTXProcessor::applyRule(const wstring& rule) { if(stackIdx == 0 || theStack[stackIdx].mode != 3) { - wcerr << "Empty stack or top item is not chunk." << endl; - wcerr << "Check for conditionals that might not generate output" << endl; - wcerr << "and ensure that lists of attributes are complete." << endl; + cerr << "Empty stack or top item is not chunk." << endl; + cerr << "Check for conditionals that might not generate output" << endl; + cerr << "and ensure that lists of attributes are complete." << endl; exit(1); } theStack[stackIdx].c->rule = rl; @@ -1024,11 +972,11 @@ RTXProcessor::applyRule(const wstring& rule) } break; case LUCOUNT: - if(printingSteps) { wcerr << "lucount" << endl; } - pushStack(to_wstring((currentInput.size() + 1) / 2)); + if(printingSteps) { cerr << "lucount" << endl; } + pushStack(StringUtils::itoa((currentInput.size() + 1) / 2)); break; default: - wcerr << "unknown instruction: " << rule[i] << endl; + cerr << "unknown instruction: " << rule[i] << endl; exit(1); } } @@ -1036,37 +984,35 @@ RTXProcessor::applyRule(const wstring& rule) } Chunk * -RTXProcessor::readToken(FILE *in) +RTXProcessor::readToken() { int pos = 0; - wstring cur; - wstring wbl; - wstring src; - wstring dest; - wstring coref; + UString cur; + UString wbl; + UString src; + UString dest; + UString coref; cur.reserve(256); - bool inSquare = false; while(true) { - int val = fgetwc_unlocked(in); - if(feof(in) || (null_flush && val == 0)) - { + UChar32 val = infile.get(); + if (infile.eof() || (null_flush && val == '\0')) { furtherInput = false; Chunk* ret = chunkPool.next(); ret->target = cur; ret->isBlank = true; return ret; } - else if(val == L'\\') + else if(val == '\\') { - cur += L'\\'; - cur += wchar_t(fgetwc_unlocked(in)); + cur += '\\'; + cur += infile.get(); } - else if(val == L'[' && !inword) + else if(val == '[' && !inword) { - val = fgetwc_unlocked(in); + val = infile.get(); - if(val == L'[') + if(val == '[') { inwblank = true; Chunk* ret = chunkPool.next(); @@ -1076,64 +1022,42 @@ RTXProcessor::readToken(FILE *in) } else { - cur += L'['; - inSquare = true; - - if(val == L'\\') - { - cur += L'\\'; - cur += static_cast(fgetwc_unlocked(in)); - } - else - { - cur += val; - if(val == L']') - { - inSquare = false; - } - } - } - } - else if(inSquare) - { - cur += val; - if(val == L']') - { - inSquare = false; + infile.unget(val); + cur += infile.readBlock('[', ']'); } } else if(inwblank) { - if(val == L']') + if(val == ']') { cur += val; - val = fgetwc_unlocked(in); + val = infile.get(); - if(val == L'\\') + if(val == '\\') { - cur += L'\\'; - cur += static_cast(fgetwc_unlocked(in)); + cur += '\\'; + cur += infile.get(); } - else if(val == L']') + else if(val == ']') { cur += val; - val = fgetwc_unlocked(in); + val = infile.get(); - if(val == L'\\') + if(val == '\\') { - cur += L'\\'; - cur += static_cast(fgetwc_unlocked(in)); + cur += '\\'; + cur += infile.get(); } - else if(val == L'^') + else if(val == '^') { inwblank = false; - cur = L"[[" + cur; + cur = "[["_u + cur; wbl.swap(cur); inword = true; } else { - wcerr << L"Parse Error: Wordbound blank should be immediately followed by a Lexical Unit -> [[..]]^..$" << endl; + cerr << "Parse Error: Wordbound blank should be immediately followed by a Lexical Unit -> [[..]]^..$" << endl; exit(EXIT_FAILURE); } } @@ -1147,7 +1071,7 @@ RTXProcessor::readToken(FILE *in) cur += val; } } - else if(inword && (val == L'$' || val == L'/')) + else if(inword && (val == '$' || val == '/')) { if(pos == 0) { @@ -1157,7 +1081,7 @@ RTXProcessor::readToken(FILE *in) { dest.swap(cur); } - else if(pos >= 2 && !noCoref && val == L'$') + else if(pos >= 2 && !noCoref && val == '$') { coref.swap(cur); } @@ -1166,7 +1090,7 @@ RTXProcessor::readToken(FILE *in) cur.clear(); } pos++; - if(val == L'$') + if(val == '$') { inword = false; Chunk* ret = chunkPool.next(); @@ -1175,10 +1099,10 @@ RTXProcessor::readToken(FILE *in) ret->target = dest; ret->coref = coref; ret->isBlank = false; - if(src.size() > 0 && src[0] == L'*' && dest.size() > 0 && dest[0] == L'*') + if(src.size() > 0 && src[0] == '*' && dest.size() > 0 && dest[0] == '*') { Chunk* ret2 = chunkPool.next(); - ret2->target = ret->target.substr(1) + L""; + ret2->target = ret->target.substr(1) + ""_u; ret2->contents.push_back(ret); ret2->rule = -1; ret2->isBlank = false; @@ -1187,7 +1111,7 @@ RTXProcessor::readToken(FILE *in) return ret; } } - else if(!inword && val == L'^') + else if(!inword && val == '^') { inword = true; Chunk* ret = chunkPool.next(); @@ -1197,7 +1121,7 @@ RTXProcessor::readToken(FILE *in) } else { - cur += wchar_t(val); + cur += val; } } } @@ -1284,7 +1208,7 @@ RTXProcessor::lookahead(ParseNode* node) void RTXProcessor::checkForReduce(vector& result, ParseNode* node) { - if(printingAll) wcerr << "Checking for reductions for branch " << node->id << endl; + if(printingAll) cerr << "Checking for reductions for branch " << node->id << endl; mx->resetRejected(); pair rule_and_weight = node->getRule(); int rule = rule_and_weight.first; @@ -1299,28 +1223,28 @@ RTXProcessor::checkForReduce(vector& result, ParseNode* node) node->getChunks(currentInput, len-1); currentOutput.clear(); if(printingRules || printingAll) { - if(printingAll && treePrintMode == TreeModeLatex) wcerr << "\\subsection{"; - else wcerr << endl; - wcerr << "Applying rule " << rule; + if(printingAll && treePrintMode == TreeModeLatex) cerr << "\\subsection{"; + else cerr << endl; + cerr << "Applying rule " << rule; if(rule <= (int)inRuleNames.size()) { - wcerr << " (" << inRuleNames[rule-1] << ")"; + cerr << " (" << inRuleNames[rule-1] << ")"; } - if(printingAll) wcerr << " to branch " << node->id << " with weight " << rule_and_weight.second; - if(printingAll && treePrintMode == TreeModeLatex) wcerr << "}" << endl << endl; - else wcerr << ": "; + if(printingAll) cerr << " to branch " << node->id << " with weight " << rule_and_weight.second; + if(printingAll && treePrintMode == TreeModeLatex) cerr << "}" << endl << endl; + else cerr << ": "; for(unsigned int i = 0; i < currentInput.size(); i++) { currentInput[i]->writeTree((printingAll ? treePrintMode : TreeModeFlat), NULL); } - wcerr << endl; + cerr << endl; } if(applyRule(rule_map[rule-1])) { if(printingAll) { for(auto c : currentOutput) c->writeTree(treePrintMode, NULL); - wcerr << endl; + cerr << endl; } vector temp; temp.reserve(currentOutput.size()); @@ -1383,8 +1307,8 @@ RTXProcessor::checkForReduce(vector& result, ParseNode* node) } else { - if(printingRules) { wcerr << " -> rule was rejected" << endl; } - if(printingAll) wcerr << "This rule was rejeced." << endl << endl; + if(printingRules) { cerr << " -> rule was rejected" << endl; } + if(printingAll) cerr << "This rule was rejeced." << endl << endl; mx->rejectRule(rule); rule_and_weight = node->getRule(); rule = rule_and_weight.first; @@ -1393,19 +1317,19 @@ RTXProcessor::checkForReduce(vector& result, ParseNode* node) } if(rule == -1) { - if(printingAll) wcerr << "No further reductions possible for branch " << node->id << "." << endl; + if(printingAll) cerr << "No further reductions possible for branch " << node->id << "." << endl; result.push_back(node); } else if(lookahead(node)) { node->id = ++newBranchId; - if(printingAll) wcerr << endl << "Splitting stack and creating branch " << node->id << endl; + if(printingAll) cerr << endl << "Splitting stack and creating branch " << node->id << endl; result.push_back(node); } } void -RTXProcessor::outputAll(FILE* out) +RTXProcessor::outputAll(UFILE* out) { unsigned int queueSize = outputQueue.size() - 1; bool conjoining = false; @@ -1416,31 +1340,31 @@ RTXProcessor::outputAll(FILE* out) outputQueue.pop_front(); if(printingTrees && outputQueue.size() == queueSize) { - if(printingText) fputc_unlocked('\n', out); + if(printingText) u_fputc('\n', out); queueSize--; ch->writeTree(treePrintMode, out); - fflush(out); + u_fflush(out); if(!printingText) continue; } if(ch->rule == -1) { if(printingRules && !ch->isBlank) { - fflush(out); - wcerr << endl << "No rule specified: "; + u_fflush(out); + cerr << endl << "No rule specified: "; ch->writeTree(TreeModeFlat, NULL); - wcerr << endl; + cerr << endl; } if(printingAll && !ch->isBlank) { - if(treePrintMode == TreeModeLatex) wcerr << "\\subsubsection{Output Node}" << endl; - else wcerr << "Output Node:" << endl; + if(treePrintMode == TreeModeLatex) cerr << "\\subsubsection{Output Node}" << endl; + else cerr << "Output Node:" << endl; ch->writeTree(treePrintMode, NULL); - wcerr << endl; + cerr << endl; } if(ch->contents.size() > 0) { - vector tags = ch->getTags(vector()); + vector tags = ch->getTags(vector()); for(auto it = ch->contents.rbegin(); it != ch->contents.rend(); it++) { (*it)->updateTags(tags); @@ -1473,7 +1397,7 @@ RTXProcessor::outputAll(FILE* out) else { parentChunk = ch; - vector tags = ch->getTags(vector()); + vector tags = ch->getTags(vector()); currentInput = ch->contents; for(unsigned int i = 0; i < currentInput.size(); i++) { @@ -1481,40 +1405,40 @@ RTXProcessor::outputAll(FILE* out) } currentOutput.clear(); if(printingRules) { - fflush(out); - wcerr << endl << "Applying output rule " << ch->rule; + u_fflush(out); + cerr << endl << "Applying output rule " << ch->rule; if(ch->rule < (int)outRuleNames.size()) { - wcerr << " (" << outRuleNames[ch->rule] << ")"; + cerr << " (" << outRuleNames[ch->rule] << ")"; } - wcerr << ": " << parentChunk->target << " -> "; + cerr << ": " << parentChunk->target << " -> "; for(unsigned int i = 0; i < currentInput.size(); i++) { currentInput[i]->writeTree(TreeModeFlat, NULL); } - wcerr << endl; + cerr << endl; } if(printingAll) { if(treePrintMode == TreeModeLatex) { - wcerr << "\\subsubsection{Applying Output Rule " << ch->rule; + cerr << "\\subsubsection{Applying Output Rule " << ch->rule; if(ch->rule < (int)outRuleNames.size()) { - wcerr << ": " << outRuleNames[ch->rule] << "}" << endl << endl; + cerr << ": " << outRuleNames[ch->rule] << "}" << endl << endl; } } else { - wcerr << "Applying Output Rule " << ch->rule; + cerr << "Applying Output Rule " << ch->rule; if(ch->rule < (int)outRuleNames.size()) { - wcerr << ": " << outRuleNames[ch->rule] << endl << endl; + cerr << ": " << outRuleNames[ch->rule] << endl << endl; } } ch->writeTree(treePrintMode, NULL); } - fflush(out); + u_fflush(out); applyRule(output_rules[ch->rule]); for(vector::reverse_iterator it = currentOutput.rbegin(), limit = currentOutput.rend(); it != limit; it++) @@ -1526,7 +1450,7 @@ RTXProcessor::outputAll(FILE* out) if(tojoin != NULL) tojoin->output(out); while(!blankQueue.empty()) { - if(blankQueue.front() == L" ") + if(blankQueue.front() == " "_u) { blankQueue.pop_front(); } @@ -1538,17 +1462,22 @@ RTXProcessor::outputAll(FILE* out) } void -RTXProcessor::writeBlank(FILE* out) +RTXProcessor::writeBlank(UFILE* out) { if(blankQueue.empty()) { - blankQueue.push_back(L" "); + blankQueue.push_back(" "_u); } Chunk* blank = chunkPool.next(); blank->target = blankQueue.front(); blankQueue.pop_front(); blank->isBlank = true; - blank->output(out); + if (printingText) { + blank->output(out); + } + if (printingTrees) { + blank->writeTree(treePrintMode, out); + } } bool @@ -1558,9 +1487,9 @@ RTXProcessor::filterParseGraph() { if(treePrintMode == TreeModeLatex) { - wcerr << "\\subsection{Filtering Branches}\n\n\\begin{itemize}" << endl; + cerr << "\\subsection{Filtering Branches}\n\n\\begin{itemize}" << endl; } - else wcerr << endl << "Filtering Branches:" << endl; + else cerr << endl << "Filtering Branches:" << endl; } bool shouldOutput = !furtherInput && inputBuffer.size() == 1; int state[parseGraph.size()]; @@ -1584,9 +1513,9 @@ RTXProcessor::filterParseGraph() { if(treePrintMode == TreeModeLatex) { - wcerr << L"\\item No branch can accept further input." << endl; + cerr << "\\item No branch can accept further input." << endl; } - else wcerr << L"No branch can accept further input." << endl; + else cerr << "No branch can accept further input." << endl; } shouldOutput = true; memset(state, 1, N*sizeof(int)); @@ -1597,33 +1526,33 @@ RTXProcessor::filterParseGraph() { if(treePrintMode == TreeModeLatex) { - wcerr << "\\item Input buffer is empty." << endl; + cerr << "\\item Input buffer is empty." << endl; } - else wcerr << L"Input buffer is empty." << endl; + else cerr << "Input buffer is empty." << endl; } int min = -1; ParseNode* minNode = NULL; ParseNode* cur = NULL; map> filter; - if(printingBranches) { wcerr << L"shouldOutput: " << shouldOutput << L" branch count: " << N << endl; } + if(printingBranches) { cerr << "shouldOutput: " << shouldOutput << " branch count: " << N << endl; } for(int i = 0; i < N; i++) { - if(printingBranches) { wcerr << "examining node " << i << "(length: " << parseGraph[i]->length << ", weight: " << parseGraph[i]->weight << ") ... "; } + if(printingBranches) { cerr << "examining node " << i << "(length: " << parseGraph[i]->length << ", weight: " << parseGraph[i]->weight << ") ... "; } if(printingAll) { - if(treePrintMode == TreeModeLatex) wcerr << "\\item "; - wcerr << "Branch " << parseGraph[i]->id << " "; + if(treePrintMode == TreeModeLatex) cerr << "\\item "; + cerr << "Branch " << parseGraph[i]->id << " "; } if(state[i] == 0) { - if(printingAll) wcerr << " has no possible continuations." << endl; + if(printingAll) cerr << " has no possible continuations." << endl; continue; } else if(noFilter && !shouldOutput) continue; if(min == -1) { - if(printingAll) wcerr << " has no active branch to compare to." << endl; - if(printingBranches) { wcerr << "FIRST!" << endl; } + if(printingAll) cerr << " has no active branch to compare to." << endl; + if(printingBranches) { cerr << "FIRST!" << endl; } min = i; minNode = parseGraph[i]; cur = minNode; @@ -1637,8 +1566,8 @@ RTXProcessor::filterParseGraph() if(cur->length < minNode->length || (cur->length == minNode->length && cur->weight >= minNode->weight)) { - if(printingBranches) { wcerr << i << L" beats " << min << " in length or weight" << endl; } - if(printingAll) wcerr << " has fewer partial parses or a higher weight than branch " << minNode->id << "." << endl; + if(printingBranches) { cerr << i << " beats " << min << " in length or weight" << endl; } + if(printingAll) cerr << " has fewer partial parses or a higher weight than branch " << minNode->id << "." << endl; state[min] = 0; min = i; minNode = cur; @@ -1646,16 +1575,16 @@ RTXProcessor::filterParseGraph() else { state[i] = 0; - if(printingBranches) {wcerr << min << L" beats " << i << " in length or weight" << endl; } - if(printingAll) wcerr << " has more partial parses or a lower weight than branch " << minNode->id << "." << endl; + if(printingBranches) {cerr << min << " beats " << i << " in length or weight" << endl; } + if(printingAll) cerr << " has more partial parses or a lower weight than branch " << minNode->id << "." << endl; } count--; } else if(filter.find(cur->firstWord) == filter.end()) { filter[cur->firstWord].push_back(i); - if(printingBranches) { wcerr << i << " has nothing to compare with" << endl; } - if(printingAll) wcerr << " has no prior branch covering the same final span." << endl; + if(printingBranches) { cerr << i << " has nothing to compare with" << endl; } + if(printingAll) cerr << " has no prior branch covering the same final span." << endl; } else { @@ -1663,19 +1592,19 @@ RTXProcessor::filterParseGraph() double w = parseGraph[other[0]]->weight; if(w > cur->weight) { - if(printingBranches) { wcerr << i << L" has lower weight - discarding." << endl; } - if(printingAll) wcerr << " has a lower weight than branch " << parseGraph[other[0]]->id << " and will be discarded." << endl; + if(printingBranches) { cerr << i << " has lower weight - discarding." << endl; } + if(printingAll) cerr << " has a lower weight than branch " << parseGraph[other[0]]->id << " and will be discarded." << endl; state[i] = 0; count--; } else if(w < cur->weight) { - if(printingBranches) { wcerr << i << L" has higher weight - discarding others." << endl; } + if(printingBranches) { cerr << i << " has higher weight - discarding others." << endl; } if(printingAll) { - wcerr << " has a higher weight than "; - for(auto it : other) wcerr << "branch " << parseGraph[it]->id << ", "; - wcerr << "which will be discarded." << endl; + cerr << " has a higher weight than "; + for(auto it : other) cerr << "branch " << parseGraph[it]->id << ", "; + cerr << "which will be discarded." << endl; } for(vector::iterator it = other.begin(), limit = other.end(); it != limit; it++) @@ -1688,14 +1617,14 @@ RTXProcessor::filterParseGraph() } else { - if(printingBranches) { wcerr << i << " has same weight - keeping all." << endl; } - if(printingAll) wcerr << " has the same weight as branch " << parseGraph[other[0]]->id << "." << endl; + if(printingBranches) { cerr << i << " has same weight - keeping all." << endl; } + if(printingAll) cerr << " has the same weight as branch " << parseGraph[other[0]]->id << "." << endl; other.push_back(i); } } } } - if(printingAll && treePrintMode == TreeModeLatex) wcerr << "\\end{itemize}" << endl << endl; + if(printingAll && treePrintMode == TreeModeLatex) cerr << "\\end{itemize}" << endl << endl; if(count == N) return shouldOutput; if(count > 100 && filter.size() > 0) { @@ -1718,34 +1647,34 @@ RTXProcessor::filterParseGraph() temp.push_back(parseGraph[i]); if(printingBranches) { - wcerr << L"keeping branch " << i << " first word: " << parseGraph[i]->firstWord << " ending with "; + cerr << "keeping branch " << i << " first word: " << parseGraph[i]->firstWord << " ending with "; parseGraph[i]->chunk->writeTree(TreeModeFlat, NULL); - wcerr << endl; + cerr << endl; } } else if(printingBranches) { - wcerr << L"discarding branch " << i << " first word: " << parseGraph[i]->firstWord << " ending with "; + cerr << "discarding branch " << i << " first word: " << parseGraph[i]->firstWord << " ending with "; parseGraph[i]->chunk->writeTree(TreeModeFlat, NULL); - wcerr << endl; + cerr << endl; } } - if(printingBranches) { wcerr << L"remaining branches: " << temp.size() << endl << endl; } + if(printingBranches) { cerr << "remaining branches: " << temp.size() << endl << endl; } parseGraph.swap(temp); return shouldOutput; } void -RTXProcessor::processGLR(FILE *in, FILE *out) +RTXProcessor::processGLR(UFILE *out) { int sentenceId = 1; if(printingAll && treePrintMode == TreeModeLatex) { - wcerr << "\\section{Sentence " << sentenceId << "}" << endl << endl; + cerr << "\\section{Sentence " << sentenceId << "}" << endl << endl; } while(furtherInput && inputBuffer.size() < 5) { - inputBuffer.push_back(readToken(in)); + inputBuffer.push_back(readToken()); } bool real_printingAll = printingAll; while(true) @@ -1766,11 +1695,11 @@ RTXProcessor::processGLR(FILE *in, FILE *out) } if(printingAll) { - wcerr << endl; - if(treePrintMode == TreeModeLatex) wcerr << "\\subsection{Reading Input}" << endl << endl; - else wcerr << "Reading Input:" << endl; + cerr << endl; + if(treePrintMode == TreeModeLatex) cerr << "\\subsection{Reading Input}" << endl << endl; + else cerr << "Reading Input:" << endl; next->writeTree(treePrintMode, NULL); - wcerr << endl; + cerr << endl; } inputBuffer.pop_front(); if(parseGraph.size() == 0) @@ -1778,15 +1707,20 @@ RTXProcessor::processGLR(FILE *in, FILE *out) // skip parseGraph stuff if a blank is the only thing being processed if(next->isBlank) { - next->output(out); + if (printingText) { + next->output(out); + } + if (printingTrees) { + next->writeTree(treePrintMode, out); + } if(furtherInput) { - inputBuffer.push_back(readToken(in)); + inputBuffer.push_back(readToken()); } if(inputBuffer.empty()) { - wcerr.flush(); - fflush(out); + cerr.flush(); + u_fflush(out); break; } continue; @@ -1820,45 +1754,45 @@ RTXProcessor::processGLR(FILE *in, FILE *out) { for(auto branch : parseGraph) { - wcerr << "Branch " << branch->id << ": " << branch->length << " nodes, weight = " << branch->weight << endl; + cerr << "Branch " << branch->id << ": " << branch->length << " nodes, weight = " << branch->weight << endl; vector parts; parts.resize(branch->length); branch->getChunks(parts, branch->length-1); for(auto node : parts) { - if(node->isBlank) wcerr << "[Blank]: " << endl; - else wcerr << "[Chunk]: " << endl; + if(node->isBlank) cerr << "[Blank]: " << endl; + else cerr << "[Chunk]: " << endl; node->writeTree(treePrintMode, NULL); } } } - if(furtherInput) inputBuffer.push_back(readToken(in)); + if(furtherInput) inputBuffer.push_back(readToken()); if(filterParseGraph()) { - wcerr.flush(); + cerr.flush(); if(printingAll) { - if(treePrintMode == TreeModeLatex) wcerr << "\\subsection{Outputting Branch " << parseGraph[0]->id << "}" << endl << endl; + if(treePrintMode == TreeModeLatex) cerr << "\\subsection{Outputting Branch " << parseGraph[0]->id << "}" << endl << endl; else { - wcerr << endl; - wcerr << "************************************************************" << endl; - wcerr << "************************************************************" << endl; - wcerr << "************************************************************" << endl; - wcerr << "Outputting Branch " << parseGraph[0]->id << endl << endl; + cerr << endl; + cerr << "************************************************************" << endl; + cerr << "************************************************************" << endl; + cerr << "************************************************************" << endl; + cerr << "Outputting Branch " << parseGraph[0]->id << endl << endl; vector parts; parts.resize(parseGraph[0]->length); parseGraph[0]->getChunks(parts, parseGraph[0]->length-1); for(auto node : parts) { - if(node->isBlank) wcerr << "[Blank]: " << endl; - else wcerr << "[Chunk]: " << endl; + if(node->isBlank) cerr << "[Blank]: " << endl; + else cerr << "[Chunk]: " << endl; node->writeTree(treePrintMode, NULL); } - wcerr << "************************************************************" << endl; - wcerr << "************************************************************" << endl; - wcerr << "************************************************************" << endl; - wcerr << endl; + cerr << "************************************************************" << endl; + cerr << "************************************************************" << endl; + cerr << "************************************************************" << endl; + cerr << endl; } } currentBranch = parseGraph[0]; @@ -1867,11 +1801,11 @@ RTXProcessor::processGLR(FILE *in, FILE *out) outputAll(out); variables = currentBranch->stringVars; wblank_variables = currentBranch->wblankVars; - fflush(out); - vector wblanks; - vector sources; - vector targets; - vector corefs; + u_fflush(out); + vector wblanks; + vector sources; + vector targets; + vector corefs; vector blanks; vector unknowns; int N = inputBuffer.size(); @@ -1894,15 +1828,15 @@ RTXProcessor::processGLR(FILE *in, FILE *out) blanks.push_back(temp->isBlank); inputBuffer.pop_front(); } - //wcerr << "clearing chunkPool, size was " << chunkPool.size() << endl; - //wcerr << "clearing parsePool, size was " << parsePool.size() << endl; + //cerr << "clearing chunkPool, size was " << chunkPool.size() << endl; + //cerr << "clearing parsePool, size was " << parsePool.size() << endl; chunkPool.reset(); parsePool.reset(); newBranchId = 0; if(printingAll) sentenceId++; if((furtherInput || inputBuffer.size() > 1) && printingAll && treePrintMode == TreeModeLatex) { - wcerr << endl << endl << "\\section{Sentence " << sentenceId << "}" << endl << endl; + cerr << endl << endl << "\\section{Sentence " << sentenceId << "}" << endl << endl; } for(int i = 0; i < N; i++) { @@ -1915,7 +1849,7 @@ RTXProcessor::processGLR(FILE *in, FILE *out) if(unknowns[i]) { Chunk* c2 = chunkPool.next(); - c2->target = targets[i].substr(1) + L""; + c2->target = targets[i].substr(1) + ""_u; c2->contents.push_back(c); c = c2; } @@ -1926,11 +1860,11 @@ RTXProcessor::processGLR(FILE *in, FILE *out) if(!furtherInput && inputBuffer.size() == 1) { // if stream is empty, the last token is definitely a blank - wcerr.flush(); + cerr.flush(); inputBuffer.front()->output(out); blankQueue.clear(); inputBuffer.pop_front(); - fflush(out); + u_fflush(out); break; } else if(!furtherInput && inputBuffer.size() == 0) break; @@ -2003,17 +1937,17 @@ RTXProcessor::processTRXLayer(list& t1x, list& t2x) } currentOutput.clear(); if(printingRules) { - wcerr << endl << "Applying rule " << rule; + cerr << endl << "Applying rule " << rule; if(rule <= (int)inRuleNames.size()) { - wcerr << " (" << inRuleNames[rule-1] << ")"; + cerr << " (" << inRuleNames[rule-1] << ")"; } - wcerr << ": "; + cerr << ": "; for(unsigned int i = 0; i < currentInput.size(); i++) { currentInput[i]->writeTree(TreeModeFlat, NULL); } - wcerr << endl; + cerr << endl; } if(applyRule(rule_map[rule-1])) { @@ -2035,7 +1969,7 @@ RTXProcessor::processTRXLayer(list& t1x, list& t2x) } void -RTXProcessor::processTRX(FILE *in, FILE *out) +RTXProcessor::processTRX(UFILE *out) { list t1x; list t2x; @@ -2044,7 +1978,7 @@ RTXProcessor::processTRX(FILE *in, FILE *out) { while(furtherInput && t1x.size() < 2*longestPattern) { - t1x.push_back(readToken(in)); + t1x.push_back(readToken()); } if(furtherInput) { @@ -2066,7 +2000,7 @@ RTXProcessor::processTRX(FILE *in, FILE *out) { Chunk* cur = t3x.front(); t3x.pop_front(); - vector tags = cur->getTags(vector()); + vector tags = cur->getTags(vector()); if(cur->rule == -1) { if(cur->contents.size() == 0) cur->output(out); @@ -2083,14 +2017,14 @@ RTXProcessor::processTRX(FILE *in, FILE *out) else { if(printingRules) { - wcerr << endl << L"Applying output rule " << cur->rule; + cerr << endl << "Applying output rule " << cur->rule; if(cur->rule < (int)outRuleNames.size()) { - wcerr << " (" << outRuleNames[cur->rule] << ")"; + cerr << " (" << outRuleNames[cur->rule] << ")"; } - wcerr << ": "; + cerr << ": "; cur->writeTree(TreeModeFlat, NULL); - wcerr << endl; + cerr << endl; } parentChunk = cur; currentInput = cur->contents; @@ -2110,32 +2044,33 @@ RTXProcessor::processTRX(FILE *in, FILE *out) } void -RTXProcessor::process(FILE* in, FILE* out) +RTXProcessor::process(FILE* in, UFILE* out) { if(printingAll && treePrintMode == TreeModeLatex) { - wcerr << "\\documentclass{article}" << endl; - wcerr << "\\usepackage{fontspec}" << endl; - wcerr << "\\setmainfont{FreeSans}" << endl; - wcerr << "\\usepackage{forest}" << endl; - wcerr << "\\usepackage[cm]{fullpage}" << endl << endl; - wcerr << "\\begin{document}" << endl << endl; + cerr << "\\documentclass{article}" << endl; + cerr << "\\usepackage{fontspec}" << endl; + cerr << "\\setmainfont{FreeSans}" << endl; + cerr << "\\usepackage{forest}" << endl; + cerr << "\\usepackage[cm]{fullpage}" << endl << endl; + cerr << "\\begin{document}" << endl << endl; } + infile.wrap(in); if(null_flush) { - while(!feof(in)) + while(!infile.eof()) { furtherInput = true; if(isLinear) { - processTRX(in, out); + processTRX(out); } else { - processGLR(in, out); + processGLR(out); } - fputc_unlocked('\0', out); - fflush(out); + u_fputc('\0', out); + u_fflush(out); chunkPool.reset(); parsePool.reset(); inputBuffer.clear(); @@ -2148,14 +2083,14 @@ RTXProcessor::process(FILE* in, FILE* out) } else if(isLinear) { - processTRX(in, out); + processTRX(out); } else { - processGLR(in, out); + processGLR(out); } if(printingAll && treePrintMode == TreeModeLatex) { - wcerr << endl << endl << "\\end{document}" << endl; + cerr << endl << endl << "\\end{document}" << endl; } } diff --git a/src/rtx_processor.h b/src/rtx_processor.h index afb5920..7607e1c 100644 --- a/src/rtx_processor.h +++ b/src/rtx_processor.h @@ -2,16 +2,13 @@ #define __RTXPROCESSOR__ #include -#include -#include +#include #include -#include -#include #include #include #include +#include -#include #include #include #include @@ -24,7 +21,7 @@ struct StackElement int mode; bool b; int i; - wstring s; + UString s; Chunk* c; }; @@ -49,52 +46,52 @@ private: /** * Attribute category regular expressions */ - map attr_items; + map attr_items; /** * Virtual machine global variables * name => value */ - map variables; + map variables; /** * Virtual machine global variables to wblank map * name => value */ - map wblank_variables; + map wblank_variables; /** * Lists * name => { values } */ - map, Ltstr> lists; + map> lists; /** * Lists, but all values are converted to lower case * Used for case-insensitive comparison * name => { values } */ - map, Ltstr> listslow; + map> listslow; /** * Bytecode for input-time rules */ - vector rule_map; + vector rule_map; /** * Bytecode for output-time rules */ - vector output_rules; + vector output_rules; /** * Debug names for input-time rules (may be empty) */ - vector inRuleNames; + vector inRuleNames; /** * Debug names for output-time rules (may be empty) */ - vector outRuleNames; + vector outRuleNames; /** * Length of pattern of each input-time rule, including blanks @@ -114,7 +111,7 @@ private: /** * false if EOF or \0 has been reached in the input stream, true otherwise */ - bool furtherInput; + bool furtherInput = true; /** * The stack used by the virtual machine @@ -133,12 +130,12 @@ private: * A parallel stack to store wordbound blanks that mimics the operations * of the main stack. wblanks are added everytime lemmas are clipped */ - wstring theWblankStack[32]; + UString theWblankStack[32]; /** * wordbound blank to be output */ - wstring out_wblank; + UString out_wblank; /** * Input to the virtual machine @@ -173,7 +170,7 @@ private: * then we want to output it directly, particularly if it's empty * and because of lookahead, only processGLR() knows which blanks are which */ - list blankQueue; + list blankQueue; /** * The parse stack @@ -211,7 +208,7 @@ private: * Branch of parseGraph currently being operated on * Needed by applyRule() for FETCHCHUNK and SETCHUNK */ - ParseNode* currentBranch; + ParseNode* currentBranch = nullptr; ////////// // SETTINGS @@ -219,122 +216,95 @@ private: /** * true if the next input token should be parsed as an LU, false otherwise - * Initial value: false */ - bool inword; + bool inword = false; /** * true if the next input token should be parsed as a wordbound blank, false otherwise - * Initial value: false */ - bool inwblank; + bool inwblank = false; /** * Whether output should flush on \0 - * Default: false */ - bool null_flush; + bool null_flush = false; /** * If true, each instruction of virtual machine will be printed to wcerr - * Default: false */ - bool printingSteps; + bool printingSteps = false; /** * If true, each rule that is applied will be printed to wcerr - * Default: false */ - bool printingRules; + bool printingRules = false; /** * If true, each action of filterParseGraph() will be logged to wcerr - * Default: false */ - bool printingBranches; + bool printingBranches = false; /** * If true, produce a full report, similar to (printingRules && printingBranches) * Affected by treePrintMode - * Default: false */ - bool printingAll; + bool printingAll = false; /** * false if input comes from apertium-anaphora, true otherwise - * Default: true */ - bool noCoref; + bool noCoref = true; /** * true if rule application should mimic the chunker-interchunk-postchunk * pipeline, false otherwise - * Default: false */ - bool isLinear; + bool isLinear = false; /** * If true, parse tree will be printed according to treePrintMode * before output-time rules are applied - * Default: false */ - bool printingTrees; + bool printingTrees = false; /** * If false, output-time rules will not be applied and linear output * will not be produced - * Default: true */ - bool printingText; + bool printingText = true; /** * Manner in which to print trees * Set by setOutputMode() * Enum defined in chunk.h - * Default: TreeModeNest */ - TreeMode treePrintMode; + TreeMode treePrintMode = TreeModeNest; /** * Counter used to give distinct, consistent identifiers to ParseNodes * for tracing purposes */ - int newBranchId; + int newBranchId = 0; /** * If this is set to true, filterParseGraph() will only discard branches * on parse error */ - bool noFilter; + bool noFilter = true; ////////// // VIRTUAL MACHINE ////////// - /** - * Determine capitalization of a string - * @param str - input string - * @return L"AA", L"Aa", or L"aa" - */ - wstring caseOf(wstring const &str); - - /** - * Produce a version of target_word with the case of source_word - * @param source_word - source of case - * @param target_word - source of content - * @return generated string - */ - wstring copycase(wstring const &source_word, wstring const &target_word); - /** * Return whether str1 begins with str2 */ - bool beginsWith(wstring const &str1, wstring const &str2) const; + bool beginsWith(UString const &str1, UString const &str2) const; /** * Return whether str1 ends with str2 */ - bool endsWith(wstring const &str1, wstring const &str2) const; + bool endsWith(UString const &str1, UString const &str2) const; /** * The virtual machine @@ -343,7 +313,7 @@ private: * @param rule - bytecode for rule to be applied * @return false if REJECTRULE was executed, true otherwise */ - bool applyRule(const wstring& rule); + bool applyRule(const UString& rule); /** * Pop and return a boolean from theStack @@ -358,20 +328,20 @@ private: int popInt(); /** - * Pop and return a wstring from theStack - * Log error and call exit(1) if top element is not a wstring + * Pop and return a UString from theStack + * Log error and call exit(1) if top element is not a UString */ - wstring popString(); + UString popString(); /** * Equivalent to popString(), but with called as - * wstring x; popString(x); + * UString x; popString(x); * rather than - * wstring x = popString(); + * UString x = popString(); * This uses a swap to save an allocation and a copy, which is almost twice * as fast, which has a noticeable impact on overall speed */ - void popString(wstring& dest); + void popString(UString& dest); /** * Pop and return a Chunk pointer from theStack @@ -391,7 +361,7 @@ private: theStack[stackIdx].i = i; theWblankStack[stackIdx].clear(); } - inline void pushStack(const wstring& s, wstring wbl = L"") + inline void pushStack(const UString& s, UString wbl = ""_u) { theStack[++stackIdx].mode = 2; theStack[stackIdx].s.assign(s); @@ -413,13 +383,15 @@ private: // RULE SELECTION AND I/O ////////// + InputFile infile; + /** * Read an LU or a blank * Modifies: furtherInput * @param in - input stream * @return pointer to token read */ - Chunk* readToken(FILE *in); + Chunk* readToken(); bool lookahead(ParseNode* node); @@ -434,13 +406,13 @@ private: /** * Output the next blank in blankQueue, or a space if the queue is empty */ - void writeBlank(FILE* out); + void writeBlank(UFILE* out); /** * Apply output-time rules and write nodes to output stream * @param out - output stream */ - void outputAll(FILE* out); + void outputAll(UFILE* out); /** * Prune any ParseNodes that have reached error states @@ -453,7 +425,7 @@ private: * Process input as a GLR parser * Read input, call checkForReduce(), call filterParseGraph(), call outputAll() */ - void processGLR(FILE* in, FILE* out); + void processGLR(UFILE* out); /** * Apply longest rule matching the beginning of t1x and append the result to t2x @@ -464,19 +436,19 @@ private: * Mimic apertium-transfer | apertium-interchunk | apertium-postchunk * Read input, call processTRXLayer twice, apply output-time rules, output */ - void processTRX(FILE* in, FILE* out); + void processTRX(UFILE* out); /** * True if clipping lem/lemh/whole */ - bool gettingLemmaFromWord(wstring attr); + bool gettingLemmaFromWord(UString attr); public: RTXProcessor(); ~RTXProcessor(); void read(string const &filename); - void process(FILE *in, FILE *out); + void process(FILE *in, UFILE *out); bool getNullFlush(void); void setNullFlush(bool null_flush); void printSteps(bool val) diff --git a/src/trx_compiler.cc b/src/trx_compiler.cc index f0f5959..8b01907 100644 --- a/src/trx_compiler.cc +++ b/src/trx_compiler.cc @@ -8,9 +8,9 @@ #include #include #include -#include +#include +#include -using namespace Apertium; using namespace std; TRXCompiler::TRXCompiler() @@ -23,19 +23,43 @@ TRXCompiler::~TRXCompiler() // TODO } +UString +name(xmlNode* node) +{ + return to_ustring((const char*) node->name); +} + +bool +nameIs(xmlNode* node, const char* name) +{ + return !xmlStrcmp(node->name, (const xmlChar*) name); +} + void -TRXCompiler::die(xmlNode* node, wstring msg) +TRXCompiler::die(xmlNode* node, const char* fmt, ...) { - wcerr << "Error in " << UtfConverter::fromUtf8((char*) curDoc->URL); - wcerr << " on line " << node->line << ": " << msg << endl; + UFILE* out = u_finit(stderr, NULL, NULL); + u_fprintf(out, "Error in %S on line %d: ", + to_ustring((char*) curDoc->URL).c_str(), node->line); + va_list argptr; + va_start(argptr, fmt); + u_vfprintf(out, fmt, argptr); + va_end(argptr); + u_fputc('\n', out); exit(EXIT_FAILURE); } void -TRXCompiler::warn(xmlNode* node, wstring msg) +TRXCompiler::warn(xmlNode* node, const char* fmt, ...) { - wcerr << "Warning in " << UtfConverter::fromUtf8((char*) curDoc->URL); - wcerr << " on line " << node->line << ": " << msg << endl; + UFILE* out = u_finit(stderr, NULL, NULL); + u_fprintf(out, "Warning in %S on line %d: ", + to_ustring((char*) curDoc->URL).c_str(), node->line); + va_list argptr; + va_start(argptr, fmt); + u_vfprintf(out, fmt, argptr); + va_end(argptr); + u_fputc('\n', out); } void @@ -44,7 +68,7 @@ TRXCompiler::compile(string file) curDoc = xmlReadFile(file.c_str(), NULL, 0); if(curDoc == NULL) { - wcerr << "Error: Could not parse file '" << file << "'." << endl; + cerr << "Error: Could not parse file '" << file << "'." << endl; exit(EXIT_FAILURE); } processFile(xmlDocGetRootElement(curDoc)); @@ -53,90 +77,49 @@ TRXCompiler::compile(string file) void TRXCompiler::processFile(xmlNode* node) { - for(xmlNode* i = node->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar*) "section-def-cats")) - { - processCats(i); - } - else if(!xmlStrcmp(i->name, (const xmlChar*) "section-def-attrs")) - { - processAttrs(i); - } - else if(!xmlStrcmp(i->name, (const xmlChar*) "section-def-vars")) - { - processVars(i); - } - else if(!xmlStrcmp(i->name, (const xmlChar*) "section-def-lists")) - { - processLists(i); - } - else if(!xmlStrcmp(i->name, (const xmlChar*) "section-def-macros")) - { - gatherMacros(i); - } - else if(!xmlStrcmp(i->name, (const xmlChar*) "section-rules")) - { - processRules(i); - } - } - } -} - -xmlChar* -TRXCompiler::requireAttr(xmlNode* node, const xmlChar* attr) -{ - for(xmlAttr* a = node->properties; a != NULL; a = a->next) - { - if(!xmlStrcmp(a->name, attr)) - { - return a->children->content; + for (auto i : children(node)) { + if(nameIs(i, "section-def-cats")) { + processCats(i); + } else if(nameIs(i, "section-def-attrs")) { + processAttrs(i); + } else if(nameIs(i, "section-def-vars")) { + processVars(i); + } else if(nameIs(i, "section-def-lists")) { + processLists(i); + } else if(nameIs(i, "section-def-macros")) { + gatherMacros(i); + } else if(nameIs(i, "section-rules")) { + processRules(i); } } - die(node, L"Expected attribute '" + UtfConverter::fromUtf8((const char*) attr) + L"'"); - return NULL; - // since die() ends the process, NULL will never be returned, - // but this keeps the compiler from complaining about the lack of a return statement } -xmlChar* -TRXCompiler::getAttr(xmlNode* node, const xmlChar* attr) +UString +TRXCompiler::requireAttr(xmlNode* node, const char* attr) { - for(xmlAttr* a = node->properties; a != NULL; a = a->next) - { - if(!xmlStrcmp(a->name, attr)) - { - return a->children->content; + for (xmlAttr* a = node->properties; a != NULL; a = a->next) { + if (!xmlStrcmp(a->name, (const xmlChar*) attr)) { + return to_ustring((const char*) a->children->content); } } - return NULL; -} - -inline wstring -TRXCompiler::toWstring(const xmlChar* s) -{ - return (s == NULL) ? L"" : UtfConverter::fromUtf8((char*) s); + die(node, "Expected attribute '%S'", to_ustring(attr).c_str()); + return ""_u; // since die() exits, this will not be returned + // but we each do our part to keep the typechecker happy... } int TRXCompiler::getPos(xmlNode* node, bool isBlank = false) { - wstring v; - if(!xmlStrcmp(node->name, (const xmlChar*) "b")) - { - v = toWstring(getAttr(node, (const xmlChar*) "pos")); - if(v == L"") - { + UString v; + if(nameIs(node, "b")) { + v = getattr(node, "pos"); + if (v.empty()) { return 0; } + } else { + v = requireAttr(node, "pos"); } - else - { - v = toWstring(requireAttr(node, (const xmlChar*) "pos")); - } - if(v.size() == 0) + if(v.empty()) { if(isBlank) { @@ -144,7 +127,7 @@ TRXCompiler::getPos(xmlNode* node, bool isBlank = false) } else { - die(node, L"Cannot interpret empty pos attribute."); + die(node, "Cannot interpret empty pos attribute."); } } for(unsigned int i = 0; i < v.size(); i++) @@ -153,13 +136,13 @@ TRXCompiler::getPos(xmlNode* node, bool isBlank = false) { if(isBlank) { - warn(node, L"Disregarding non-integer position."); + warn(node, "Disregarding non-integer position."); return 0; } - die(node, L"Position must be an integer."); + die(node, "Position must be an integer."); } } - int ret = stoi(v); + int ret = StringUtils::stoi(v); if(inOutput && ret == 0) { return ret; @@ -173,10 +156,10 @@ TRXCompiler::getPos(xmlNode* node, bool isBlank = false) { if(isBlank) { - warn(node, L"Disregarding out-of-bounds position."); + warn(node, "Disregarding out-of-bounds position."); return 0; } - die(node, L"Position " + to_wstring(ret) + L" is out of bounds."); + die(node, "Position %d is out of bounds.", ret); } if(macroPosShift.size() > 0) { @@ -188,70 +171,52 @@ TRXCompiler::getPos(xmlNode* node, bool isBlank = false) void TRXCompiler::processCats(xmlNode* node) { - for(xmlNode* cat = node->children; cat != NULL; cat = cat->next) - { - if(cat->type == XML_ELEMENT_NODE) - { - if(xmlStrcmp(cat->name, (const xmlChar*) "def-cat")) - { - warn(cat, L"Unexpected tag in section-def-cats - ignoring"); + for (auto cat : children(node)) { + if (!nameIs(cat, "def-cat")) { + warn(cat, "Unexpected tag in section-def-cats - ignoring"); + continue; + } + UString pat_name = requireAttr(cat, "n"); + vector pat; + for (auto item : children(cat)) { + if (!nameIs(item, "cat-item")) { + warn(cat, "Unexpected tag <%S> in def-cat - ignoring", name(item).c_str()); continue; } - wstring name = toWstring(requireAttr(cat, (const xmlChar*) "n")); - vector pat; - for(xmlNode* item = cat->children; item != NULL; item = item->next) - { - if(item->type != XML_ELEMENT_NODE) continue; - if(xmlStrcmp(item->name, (const xmlChar*) "cat-item")) - { - warn(cat, L"Unexpected tag <" + toWstring(item->name) + L"> in def-cat - ignoring"); - continue; - } - PatternElement* cur = new PatternElement; - cur->lemma = toWstring(getAttr(item, (const xmlChar*) "lemma")); - wstring tags = toWstring(requireAttr(item, (const xmlChar*) "tags")); - if(tags == L"") tags = L"UNKNOWN:INTERNAL"; - cur->tags = StringUtils::split_wstring(tags, L"."); - pat.push_back(cur); - } - if(patterns.find(name) != patterns.end()) - { - warn(cat, L"Redefinition of pattern '" + name + L"', using later value"); - } - patterns[name] = pat; + PatternElement* cur = new PatternElement; + cur->lemma = getattr(item, "lemma"); + UString tags = requireAttr(item, "tags"); + if(tags.empty()) tags = "UNKNOWN:INTERNAL"_u; + cur->tags = StringUtils::split(tags, "."_u); + pat.push_back(cur); } + if(patterns.find(pat_name) != patterns.end()) { + warn(cat, "Redefinition of pattern '%S', using later value", pat_name.c_str()); + } + patterns[pat_name] = pat; } } void TRXCompiler::processAttrs(xmlNode* node) { - for(xmlNode* cat = node->children; cat != NULL; cat = cat->next) - { - if(cat->type != XML_ELEMENT_NODE) - { - continue; - } - if(xmlStrcmp(cat->name, (const xmlChar*) "def-attr")) - { - warn(cat, L"Unexpected tag in section-def-attrs - ignoring"); + for (auto cat : children(node)) { + if (!nameIs(cat, "def-attr")) { + warn(cat, "Unexpected tag in section-def-attrs - ignoring"); continue; } - wstring name = toWstring(getAttr(cat, (const xmlChar*) "n")); - set ats; - for(xmlNode* item = cat->children; item != NULL; item = item->next) - { - if(item->type != XML_ELEMENT_NODE) continue; - if(xmlStrcmp(item->name, (const xmlChar*) "attr-item")) - { - warn(item, L"Unexpected tag in def-attr - ignoring"); + UString name = getattr(cat, "n"); + set ats; + for (auto item : children(cat)) { + if (!nameIs(item, "attr-item")) { + warn(item, "Unexpected tag in def-attr - ignoring"); continue; } - ats.insert(toWstring(getAttr(item, (const xmlChar*) "tags"))); + ats.insert(getattr(item, "tags")); } if(PB.isAttrDefined(name)) { - warn(cat, L"Redefinition of attribute '" + name + L"' - using later definition"); + warn(cat, "Redefinition of attribute '%S' - using later definition", name.c_str()); } PB.addAttr(name, ats); } @@ -260,16 +225,13 @@ TRXCompiler::processAttrs(xmlNode* node) void TRXCompiler::processVars(xmlNode* node) { - for(xmlNode* var = node->children; var != NULL; var = var->next) - { - if(var->type != XML_ELEMENT_NODE) continue; - if(xmlStrcmp(var->name, (const xmlChar*) "def-var")) - { - warn(var, L"Unexpected tag in section-def-vars - ignoring"); + for (auto var : children(node)) { + if (!nameIs(var, "def-var")) { + warn(var, "Unexpected tag in section-def-vars - ignoring"); continue; } - wstring name = toWstring(requireAttr(var, (const xmlChar*) "n")); - vars[name] = toWstring(getAttr(var, (const xmlChar*) "v")); + UString name = requireAttr(var, "n"); + vars[name] = getattr(var, "v"); PB.addVar(name, vars[name]); } } @@ -277,32 +239,23 @@ TRXCompiler::processVars(xmlNode* node) void TRXCompiler::processLists(xmlNode* node) { - for(xmlNode* cat = node->children; cat != NULL; cat = cat->next) - { - if(cat->type != XML_ELEMENT_NODE) - { - continue; - } - if(xmlStrcmp(cat->name, (const xmlChar*) "def-list")) - { - warn(cat, L"Unexpected tag in section-def-lists - ignoring"); + for (auto cat : children(node)) { + if (!nameIs(cat, "def-list")) { + warn(cat, "Unexpected tag in section-def-lists - ignoring"); continue; } - wstring name = toWstring(getAttr(cat, (const xmlChar*) "n")); - set ats; - for(xmlNode* item = cat->children; item != NULL; item = item->next) - { - if(item->type != XML_ELEMENT_NODE) continue; - if(xmlStrcmp(item->name, (const xmlChar*) "list-item")) - { - warn(item, L"Unexpected tag in def-list - ignoring"); + UString name = getattr(cat, "n"); + set ats; + for (auto item : children(cat)) { + if (!nameIs(item, "list-item")) { + warn(item, "Unexpected tag in def-list - ignoring"); continue; } - ats.insert(toWstring(getAttr(item, (const xmlChar*) "v"))); + ats.insert(getattr(item, "v")); } if(lists.find(name) != lists.end()) { - warn(cat, L"Redefinition of list '" + name + L"' - using later definition"); + warn(cat, "Redefinition of list '%S' - using later definition", name.c_str()); } lists[name] = ats; PB.addList(name, ats); @@ -312,19 +265,16 @@ TRXCompiler::processLists(xmlNode* node) void TRXCompiler::gatherMacros(xmlNode* node) { - for(xmlNode* mac = node->children; mac != NULL; mac = mac->next) - { - if(mac->type != XML_ELEMENT_NODE) continue; - if(xmlStrcmp(mac->name, (const xmlChar*) "def-macro")) - { - warn(mac, L"Unexpected tag in section-def-macros - ignoring"); + for (auto mac : children(node)) { + if (!nameIs(mac, "def-macro")) { + warn(mac, "Unexpected tag in section-def-macros - ignoring"); continue; } - wstring name = toWstring(requireAttr(mac, (const xmlChar*) "n")); - int npar = atoi((const char*) requireAttr(mac, (const xmlChar*) "npar")); + UString name = requireAttr(mac, "n"); + int npar = StringUtils::stoi(requireAttr(mac, "npar")); if(macros.find(name) != macros.end()) { - warn(mac, L"Redefinition of macro '" + name + L"' - using later definition"); + warn(mac, "Redefinition of macro '%S' - using later definition", name.c_str()); } macros[name] = make_pair(npar, mac); } @@ -333,65 +283,54 @@ TRXCompiler::gatherMacros(xmlNode* node) void TRXCompiler::processRules(xmlNode* node) { - for(xmlNode* rule = node->children; rule != NULL; rule = rule->next) - { - if(rule->type != XML_ELEMENT_NODE) continue; + for (auto rule : children(node)) { if(xmlStrcmp(rule->name, (const xmlChar*) "rule")) { - warn(rule, L"Ignoring non- element in ."); + warn(rule, "Ignoring non- element in ."); continue; } - if(!xmlStrcmp(getAttr(rule, (const xmlChar*) "i"), (const xmlChar*) "yes")) - { + if (getattr(rule, "i") == "yes"_u) { continue; } curPatternSize = 0; localVars.clear(); - wstring id = toWstring(getAttr(rule, (const xmlChar*) "id")); - wstring weight = toWstring(getAttr(rule, (const xmlChar*) "weight")); - wstring firstChunk = toWstring(getAttr(rule, (const xmlChar*) "firstChunk")); - if(firstChunk == L"") firstChunk = L"*"; + UString id = getattr(rule, "id"); + UString weight = getattr(rule, "weight"); + UString firstChunk = getattr(rule, "firstChunk"); + if(firstChunk.empty()) firstChunk = "*"_u; xmlNode* action = NULL; - wstring outputAction; + UString outputAction; bool pat = false; - wstring assertClause = L""; - for(xmlNode* part = rule->children; part != NULL; part = part->next) - { - if(part->type != XML_ELEMENT_NODE) continue; - if(!xmlStrcmp(part->name, (const xmlChar*) "local")) + UString assertClause; + for (auto part : children(rule)) { + if(nameIs(part, "local")) { - for(xmlNode* var = rule->children; var != NULL; var = var->next) - { - if(var->type == XML_ELEMENT_NODE && - !xmlStrcmp(var->name, (const xmlChar*) "var")) - { - localVars.insert(toWstring(requireAttr(var, (const xmlChar*) "n"))); + for (auto var : children(rule)) { + if(nameIs(var, "var")) { + localVars.insert(requireAttr(var, "n")); } } } - else if(!xmlStrcmp(part->name, (const xmlChar*) "pattern")) + else if(nameIs(part, "pattern")) { if(pat) { - die(rule, L"Rule cannot have multiple s."); + die(rule, "Rule cannot have multiple s."); } pat = true; vector> pls; - for(xmlNode* pi = part->children; pi != NULL; pi = pi->next) - { - if(pi->type != XML_ELEMENT_NODE) continue; - if(xmlStrcmp(pi->name, (const xmlChar*) "pattern-item")) - { - warn(pi, L"Ignoring non- in ."); + for (auto pi : children(part)) { + if (!nameIs(pi, "pattern-item")) { + warn(pi, "Ignoring non- in ."); continue; } curPatternSize++; - wstring name = toWstring(requireAttr(pi, (const xmlChar*) "n")); + UString name = requireAttr(pi, "n"); if(patterns.find(name) == patterns.end()) { - die(pi, L"Unknown pattern '" + name + L"'."); + die(pi, "Unknown pattern '%S'.", name.c_str()); } else { @@ -400,7 +339,7 @@ TRXCompiler::processRules(xmlNode* node) } if(curPatternSize == 0) { - die(rule, L"Rule cannot have empty pattern."); + die(rule, "Rule cannot have empty pattern."); } if(curPatternSize > longestPattern) { @@ -408,16 +347,14 @@ TRXCompiler::processRules(xmlNode* node) } if(excludedRules.find(id) == excludedRules.end()) { - PB.addRule(inputRules.size() + 1, (weight.size() > 0 ? stod(weight) : 0.0), pls, StringUtils::split_wstring(firstChunk, L" "), id); + PB.addRule(inputRules.size() + 1, (weight.size() > 0 ? StringUtils::stod(weight) : 0.0), pls, StringUtils::split(firstChunk, " "_u), id); } inputRuleSizes.push_back(pls.size()); } - else if(!xmlStrcmp(part->name, (const xmlChar*) "assert")) + else if(nameIs(part, "assert")) { bool firstAssert = (assertClause.size() == 0); - for(xmlNode* clause = part->children; clause != NULL; clause = clause->next) - { - if(clause->type != XML_ELEMENT_NODE) continue; + for (auto clause : children(part)) { assertClause += processCond(clause); if(!firstAssert) { @@ -426,38 +363,37 @@ TRXCompiler::processRules(xmlNode* node) firstAssert = false; } } - else if(!xmlStrcmp(part->name, (const xmlChar*) "action")) + else if(nameIs(part, "action")) { if(action != NULL) { - die(rule, L"Rule cannot have multiple s."); + die(rule, "Rule cannot have multiple s."); } action = part; } - else if(!xmlStrcmp(part->name, (const xmlChar*) "output-action")) + else if(nameIs(part, "output-action")) { if(outputAction.size() > 0) { - die(part, L"Rule cannot have multiple s."); + die(part, "Rule cannot have multiple s."); } inOutput = true; - for(xmlNode* state = part->children; state != NULL; state = state->next) - { - if(state->type == XML_ELEMENT_NODE) outputAction += processStatement(state); + for (auto state : children(part)) { + outputAction += processStatement(state); } } else { - warn(part, L"Unknown element <" + toWstring(part->name) + L"> in , ignoring."); + warn(part, "Unknown element <%S> in , ignoring.", name(part).c_str()); } } if(!pat) { - die(rule, L"Rule must have ."); + die(rule, "Rule must have ."); } if(action == NULL) { - die(rule, L"Rule must have ."); + die(rule, "Rule must have ."); } else { @@ -471,17 +407,15 @@ TRXCompiler::processRules(xmlNode* node) currentOutputRule = -1; } inOutput = false; - wstring actionStr; + UString actionStr; if(assertClause.size() > 0) { actionStr = assertClause; actionStr += JUMPONTRUE; - actionStr += (wchar_t)1; + actionStr += (UChar)1; actionStr += REJECTRULE; } - for(xmlNode* state = action->children; state != NULL; state = state->next) - { - if(state->type != XML_ELEMENT_NODE) continue; + for (auto state : children(action)) { actionStr += processStatement(state); } inputRules.push_back(actionStr); @@ -489,53 +423,46 @@ TRXCompiler::processRules(xmlNode* node) } } -wstring +UString TRXCompiler::processStatement(xmlNode* node) { - if(!xmlStrcmp(getAttr(node, (const xmlChar*) "i"), (const xmlChar*) "yes")) - { - return L""; + if (getattr(node, "i") == "yes"_u) { + return ""_u; } - wstring ret; - if(!xmlStrcmp(node->name, (const xmlChar*) "let") || - !xmlStrcmp(node->name, (const xmlChar*) "modify-case")) - { - wstring name = toWstring(node->name); + UString ret; + if(nameIs(node, "let") || nameIs(node, "modify-case")) { xmlNode* var = NULL; - wstring val; + UString val; bool val_is_clip = false; - for(xmlNode* n = node->children; n != NULL; n = n->next) - { - if(n->type != XML_ELEMENT_NODE) continue; + for (auto n : children(node)) { if(var == NULL) { var = n; } else if(val.size() == 0) { - val_is_clip = (!xmlStrcmp(n->name, (const xmlChar*) "clip")); + val_is_clip = (nameIs(n, "clip")); val = processValue(n); } - else - { - die(node, L"<" + name + L"> cannot have more than two children."); + else { + die(node, "<%S> cannot have more than two children.", name(node).c_str()); } } if(val.size() == 0) { - die(node, L"<" + name + L"> must have two children."); + die(node, "<%S> must have two children.", name(node).c_str()); } - if(!xmlStrcmp(var->name, (const xmlChar*) "var")) + if(nameIs(var, "var")) { - wstring vname = toWstring(requireAttr(var, (const xmlChar*) "n")); + UString vname = requireAttr(var, "n"); if(vars.find(vname) == vars.end()) { - die(var, L"Undefined variable '" + vname + L"'."); + die(var, "Undefined variable '%S'.", vname.c_str()); } - if(name == L"modify-case") + if(nameIs(node, "modify-case")) { ret += STRING; - ret += (wchar_t)vname.size(); + ret += (UChar)vname.size(); ret += vname; ret += FETCHVAR; ret += val; @@ -546,34 +473,34 @@ TRXCompiler::processStatement(xmlNode* node) ret += val; } ret += STRING; - ret += (wchar_t)vname.size(); + ret += (UChar)vname.size(); ret += vname; ret += SETVAR; } - else if(!xmlStrcmp(var->name, (const xmlChar*) "clip")) + else if(nameIs(var, "clip")) { - wstring side = toWstring(getAttr(var, (const xmlChar*) "side")); - if(!(side == L"" || side == L"tl")) + UString side = getattr(var, "side"); + if(!(side.empty() || side == "tl"_u)) { - warn(var, L"Cannot set side '" + side + L"', setting 'tl' instead."); + warn(var, "Cannot set side '%S', setting 'tl' instead.", side.c_str()); } - wstring part = toWstring(requireAttr(var, (const xmlChar*) "part")); + UString part = requireAttr(var, "part"); if(!PB.isAttrDefined(part)) { - die(var, L"Unknown attribute '" + part + L"'"); + die(var, "Unknown attribute '%S'", part.c_str()); } - wstring set_str; + UString set_str; set_str += PB.BCstring(part); set_str += INT; - set_str += (wchar_t)getPos(var); + set_str += (UChar)getPos(var); set_str += SETCLIP; - if(name == L"modify-case") + if(nameIs(node, "modify-case")) { ret += INT; - ret += (wchar_t)getPos(var); + ret += (UChar)getPos(var); ret += PUSHINPUT; ret += STRING; - ret += (wchar_t)part.size(); + ret += (UChar)part.size(); ret += part; ret += TARGETCLIP; ret += val; @@ -585,11 +512,11 @@ TRXCompiler::processStatement(xmlNode* node) ret = val; if(val_is_clip) { - wstring cond; + UString cond; cond += DUP; - cond += PB.BCstring(L""); + cond += PB.BCstring(""_u); cond += EQUAL; - ret += PB.BCifthenelse(cond, wstring(1, DROP), set_str); + ret += PB.BCifthenelse(cond, UString(1, DROP), set_str); } else { @@ -599,344 +526,301 @@ TRXCompiler::processStatement(xmlNode* node) } else { - die(node, L"Cannot set value of <" + toWstring(var->name) + L">."); + die(node, "Cannot set value of <%S>.", name(var).c_str()); } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "out")) + else if(nameIs(node, "out")) { - for(xmlNode* o = node->children; o != NULL; o = o->next) - { - if(o->type == XML_ELEMENT_NODE) - { - ret += processValue(o); - ret += OUTPUT; - } + for (auto o : children(node)) { + ret += processValue(o); + ret += OUTPUT; } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "choose")) + else if(nameIs(node, "choose")) { ret = processChoose(node); } - else if(!xmlStrcmp(node->name, (const xmlChar*) "call-macro")) + else if(nameIs(node, "call-macro")) { // TODO: DTD implies number of arguments can be variable - wstring name = toWstring(requireAttr(node, (const xmlChar*) "n")); + UString name = requireAttr(node, "n"); if(macros.find(name) == macros.end()) { - die(node, L"Unknown macro '" + name + L"'."); + die(node, "Unknown macro '%S'.", name.c_str()); } vector temp; - for(xmlNode* param = node->children; param != NULL; param = param->next) - { - if(param->type != XML_ELEMENT_NODE) continue; - if(xmlStrcmp(param->name, (const xmlChar*) "with-param")) - { - warn(param, L"Ignoring non- in "); - } - else - { + for (auto param : children(node)) { + if (nameIs(param, "with-param")) { temp.push_back(getPos(param)); + } else { + warn(param, "Ignoring non- in "); } } unsigned int shouldbe = macros[name].first; if(shouldbe < temp.size()) { - die(node, L"Too many parameters, macro '" + name + L"' expects " + to_wstring(shouldbe) + L", got " + to_wstring(temp.size()) + L"."); + die(node, "Too many parameters, macro '%S' expects %d, got %d.", name.c_str(), shouldbe, temp.size()); } if(shouldbe > temp.size()) { - die(node, L"Not enough parameters, macro '" + name + L"' expects " + to_wstring(shouldbe) + L", got " + to_wstring(temp.size()) + L"."); + die(node, "Not enough parameters, macro '%S' expects %d, got %d.", name.c_str(), shouldbe, temp.size()); } macroPosShift.push_back(temp); xmlNode* mac = macros[name].second; - for(xmlNode* state = mac->children; state != NULL; state = state->next) - { - if(state->type != XML_ELEMENT_NODE) continue; + for (auto state : children(mac)) { ret += processStatement(state); } macroPosShift.pop_back(); } - else if(!xmlStrcmp(node->name, (const xmlChar*) "append")) + else if(nameIs(node, "append")) { // TODO: DTD says this can append to a clip - wstring name = toWstring(requireAttr(node, (const xmlChar*) "n")); + UString name = requireAttr(node, "n"); if(vars.find(name) == vars.end() && localVars.find(name) == localVars.end()) { - die(node, L"Unknown variable '" + name + L"'."); + die(node, "Unknown variable '%S'.", name.c_str()); } ret += STRING; - ret += (wchar_t)name.size(); + ret += (UChar)name.size(); ret += name; ret += FETCHVAR; - for(xmlNode* part = node->children; part != NULL; part = part->next) - { - if(part->type == XML_ELEMENT_NODE) - { - ret += processValue(part); - ret += CONCAT; - } + for (auto part : children(node)) { + ret += processValue(part); + ret += CONCAT; } ret += STRING; - ret += (wchar_t)name.size(); + ret += (UChar)name.size(); ret += name; ret += SETVAR; } - else if(!xmlStrcmp(node->name, (const xmlChar*) "reject-current-rule")) + else if(nameIs(node, "reject-current-rule")) { ret += REJECTRULE; } else { - die(node, L"Unrecognized statement '" + toWstring(node->name) + L"'"); + die(node, "Unrecognized statement '%S'", name(node).c_str()); } return ret; } -wstring +UString TRXCompiler::processValue(xmlNode* node) { - wstring ret; - if(!xmlStrcmp(node->name, (const xmlChar*) "b")) + UString ret; + if(nameIs(node, "b")) { ret += INT; - ret += (wchar_t)getPos(node); + ret += (UChar)getPos(node); ret += BLANK; } - else if(!xmlStrcmp(node->name, (const xmlChar*) "clip")) + else if(nameIs(node, "clip")) { ret += INT; - ret += (wchar_t)getPos(node); + ret += (UChar)getPos(node); ret += PUSHINPUT; ret += STRING; - wstring part = toWstring(requireAttr(node, (const xmlChar*) "part")); + UString part = requireAttr(node, "part"); if(!PB.isAttrDefined(part)) { - die(node, L"Unknown attribute '" + part + L"'"); + die(node, "Unknown attribute '%S'", part.c_str()); } - ret += (wchar_t)part.size(); + ret += (UChar)part.size(); ret += part; - wstring side = toWstring(getAttr(node, (const xmlChar*) "side")); - if(side == L"sl") + UString side = getattr(node, "side"); + if(side == "sl"_u) { ret += SOURCECLIP; } - else if(side == L"tl" || side == L"") + else if(side == "tl"_u || side.empty()) { ret += TARGETCLIP; } - else if(side == L"ref") + else if(side == "ref"_u) { ret += REFERENCECLIP; } else { - warn(node, L"Unknown clip side '" + side + L"', defaulting to 'tl'."); + warn(node, "Unknown clip side '%S', defaulting to 'tl'.", side.c_str()); ret += TARGETCLIP; } - wstring link = toWstring(getAttr(node, (const xmlChar*) "link-to")); + UString link = getattr(node, "link-to"); if(link.size() > 0) { ret += DUP; ret += STRING; - ret += (wchar_t)0; + ret += (UChar)0; ret += EQUAL; ret += JUMPONTRUE; - ret += (wchar_t)(link.size() + 5); + ret += (UChar)(link.size() + 5); ret += DROP; ret += STRING; - ret += (wchar_t)(link.size() + 2); - ret += L'<'; + ret += (UChar)(link.size() + 2); + ret += '<'; ret += link; - ret += L'>'; + ret += '>'; } // TODO: what does attribute "queue" do? } - else if(!xmlStrcmp(node->name, (const xmlChar*) "lit")) + else if(nameIs(node, "lit")) { ret += STRING; - wstring v = toWstring(requireAttr(node, (const xmlChar*) "v")); - ret += (wchar_t)v.size(); + UString v = requireAttr(node, "v"); + ret += (UChar)v.size(); ret += v; } - else if(!xmlStrcmp(node->name, (const xmlChar*) "lit-tag")) + else if(nameIs(node, "lit-tag")) { ret += STRING; - wstring v = L"<" + toWstring(requireAttr(node, (const xmlChar*) "v")) + L">"; - v = StringUtils::substitute(v, L".", L"><"); - if(v == L"<>") + UString v = "<"_u + requireAttr(node, "v") + ">"_u; + v = StringUtils::substitute(v, "."_u, "><"_u); + if(v == "<>"_u) { - v = L""; + v.clear(); } - ret += (wchar_t)v.size(); + ret += (UChar)v.size(); ret += v; } - else if(!xmlStrcmp(node->name, (const xmlChar*) "var")) + else if(nameIs(node, "var")) { ret += STRING; - wstring v = toWstring(requireAttr(node, (const xmlChar*) "n")); + UString v = requireAttr(node, "n"); if(vars.find(v) == vars.end() && localVars.find(v) == localVars.end()) { - die(node, L"Unknown variable '" + v + L"'."); + die(node, "Unknown variable '%S'.", v.c_str()); } - ret += (wchar_t)v.size(); + ret += (UChar)v.size(); ret += v; ret += FETCHVAR; } - else if(!xmlStrcmp(node->name, (const xmlChar*) "get-case-from")) + else if(nameIs(node, "get-case-from")) { - for(xmlNode* c = node->children; c != NULL; c = c->next) - { - if(c->type == XML_ELEMENT_NODE) - { - if(ret.size() > 0) - { - die(node, L" cannot have multiple children."); - } - ret += processValue(c); + for (auto c : children(node)) { + if (ret.empty()) { + ret.append(processValue(c)); + } else { + die(node, " cannot have multiple children."); } } if(ret.size() == 0) { - die(node, L" cannot be empty."); + die(node, " cannot be empty."); } ret += INT; - ret += (wchar_t)getPos(node); + ret += (UChar)getPos(node); ret += PUSHINPUT; ret += STRING; - ret += (wchar_t)3; - ret += L"lem"; + ret += (UChar)3; + ret += "lem"_u; ret += (inOutput ? TARGETCLIP : SOURCECLIP); ret += SETCASE; } - else if(!xmlStrcmp(node->name, (const xmlChar*) "case-of")) + else if(nameIs(node, "case-of")) { ret += INT; ret += getPos(node); ret += PUSHINPUT; ret += STRING; - wstring part = toWstring(requireAttr(node, (const xmlChar*) "part")); - ret += (wchar_t)part.size(); + UString part = requireAttr(node, "part"); + ret += (UChar)part.size(); ret += part; - wstring side = toWstring(getAttr(node, (const xmlChar*) "side")); - if(side == L"sl") + UString side = getattr(node, "side"); + if(side == "sl"_u) { ret += SOURCECLIP; } - else if(side == L"tl" || side == L"") + else if(side == "tl"_u || side.empty()) { ret += TARGETCLIP; } - else if(side == L"ref") + else if(side == "ref"_u) { ret += REFERENCECLIP; } else { - warn(node, L"Unknown side '" + side + L"', defaulting to target."); + warn(node, "Unknown side '%S', defaulting to target.", side.c_str()); ret += TARGETCLIP; } ret += GETCASE; } - else if(!xmlStrcmp(node->name, (const xmlChar*) "concat")) + else if(nameIs(node, "concat")) { - for(xmlNode* c = node->children; c != NULL; c = c->next) - { + for (auto c : children(node)) { unsigned int l = ret.size(); - if(c->type == XML_ELEMENT_NODE) - { - ret += processValue(c); - if(l > 0 && ret.size() > l) - { - ret += CONCAT; - } + ret += processValue(c); + if(l > 0 && ret.size() > l) { + ret += CONCAT; } } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "lu")) + else if(nameIs(node, "lu")) { ret += CHUNK; - wstring children; - for(xmlNode* p = node->children; p != NULL; p = p->next) - { - if(p->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(p->name, (const xmlChar*) "clip")) - { - wstring part = toWstring(getAttr(p, (const xmlChar*) "part")); - if(part == L"whole" || part == L"chcontent" || part == L"content") - { - children += INT; - children += (wchar_t)getPos(p); - children += PUSHINPUT; - children += APPENDALLCHILDREN; - if(part != L"whole") continue; - } + UString children_str; + for (auto p : children(node)) { + if(nameIs(p, "clip")) { + UString part = getattr(p, "part"); + if(part == "whole"_u || part == "chcontent"_u || part == "content"_u) { + children_str += INT; + children_str += (UChar)getPos(p); + children_str += PUSHINPUT; + children_str += APPENDALLCHILDREN; + if(part != "whole"_u) continue; } - ret += processValue(p); - ret += APPENDSURFACE; } + ret += processValue(p); + ret += APPENDSURFACE; } - ret += children; + ret += children_str; } - else if(!xmlStrcmp(node->name, (const xmlChar*) "mlu")) + else if(nameIs(node, "mlu")) { ret += CHUNK; - for(xmlNode* lu = node->children; lu != NULL; lu = lu->next) - { - if(lu->type != XML_ELEMENT_NODE) continue; - if(xmlStrcmp(lu->name, (const xmlChar*) "lu")) - { - die(node, L" can only contain s."); + for (auto lu : children(node)) { + if (!nameIs(lu, "lu")) { + die(node, " can only contain s."); } if(ret.size() > 1) { ret += CONJOIN; ret += APPENDCHILD; - // apertium/transfer.cc has checks against appending '' wstring or '+#' + // apertium/transfer.cc has checks against appending '' UString or '+#' // TODO? } ret += processValue(lu); ret += APPENDCHILD; } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "chunk")) + else if(nameIs(node, "chunk")) { ret += CHUNK; - for(xmlNode* part = node->children; part != NULL; part = part->next) - { - if(part->type != XML_ELEMENT_NODE) continue; - if(!xmlStrcmp(part->name, (const xmlChar*) "source")) + for (auto part : children(node)) { + if(nameIs(part, "source")) { - for(xmlNode* seg = part->children; seg != NULL; seg = seg->next) - { - if(seg->type != XML_ELEMENT_NODE) continue; + for (auto seg : children(part)) { ret += processValue(seg); ret += APPENDSURFACESL; } } - else if(!xmlStrcmp(part->name, (const xmlChar*) "target")) + else if(nameIs(part, "target")) { - for(xmlNode* seg = part->children; seg != NULL; seg = seg->next) - { - if(seg->type != XML_ELEMENT_NODE) continue; + for (auto seg : children(part)) { ret += processValue(seg); ret += APPENDSURFACE; } } - else if(!xmlStrcmp(part->name, (const xmlChar*) "reference")) + else if(nameIs(part, "reference")) { - for(xmlNode* seg = part->children; seg != NULL; seg = seg->next) - { - if(seg->type != XML_ELEMENT_NODE) continue; + for (auto seg : children(part)) { ret += processValue(seg); ret += APPENDSURFACEREF; } } - else if(!xmlStrcmp(part->name, (const xmlChar*) "contents")) + else if(nameIs(part, "contents")) { - for(xmlNode* seg = part->children; seg != NULL; seg = seg->next) - { - if(seg->type != XML_ELEMENT_NODE) continue; + for (auto seg : children(part)) { ret += processValue(seg); ret += APPENDCHILD; } @@ -944,31 +828,29 @@ TRXCompiler::processValue(xmlNode* node) } if(!inOutput && currentOutputRule != -1) { - ret += (wchar_t)currentOutputRule; - ret += (wchar_t)0; + ret += (UChar)currentOutputRule; + ret += (UChar)0; ret += SETRULE; } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "lu-count")) + else if(nameIs(node, "lu-count")) { ret += LUCOUNT; } else { - die(node, L"Unrecognized expression '" + toWstring(node->name) + L"'"); + die(node, "Unrecognized expression '%S'", name(node).c_str()); } return ret; } -wstring +UString TRXCompiler::processCond(xmlNode* node) { - wstring ret; - if(!xmlStrcmp(node->name, (const xmlChar*) "and")) + UString ret; + if(nameIs(node, "and")) { - for(xmlNode* op = node->children; op != NULL; op = op->next) - { - if(op->type != XML_ELEMENT_NODE) continue; + for (auto op : children(node)) { unsigned int len = ret.size(); ret += processCond(op); if(len > 0 && ret.size() > len) @@ -977,11 +859,9 @@ TRXCompiler::processCond(xmlNode* node) } } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "or")) + else if(nameIs(node, "or")) { - for(xmlNode* op = node->children; op != NULL; op = op->next) - { - if(op->type != XML_ELEMENT_NODE) continue; + for (auto op : children(node)) { unsigned int len = ret.size(); ret += processCond(op); if(len > 0 && ret.size() > len) @@ -990,14 +870,12 @@ TRXCompiler::processCond(xmlNode* node) } } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "not")) + else if(nameIs(node, "not")) { - for(xmlNode* op = node->children; op != NULL; op = op->next) - { - if(op->type != XML_ELEMENT_NODE) continue; + for (auto op : children(node)) { if(ret.size() > 0) { - die(node, L" cannot have multiple children"); + die(node, " cannot have multiple children"); } else { @@ -1006,22 +884,18 @@ TRXCompiler::processCond(xmlNode* node) } } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "equal")) + else if(nameIs(node, "equal")) { int i = 0; - for(xmlNode* op = node->children; op != NULL; op = op->next) - { - if(op->type == XML_ELEMENT_NODE) - { - ret += processValue(op); - i++; - } + for (auto op : children(node)) { + ret += processValue(op); + i++; } if(i != 2) { - die(node, L" must have exactly two children"); + die(node, " must have exactly two children"); } - if(toWstring(getAttr(node, (const xmlChar*) "caseless")) == L"yes") + if(getattr(node, "caseless") == "yes"_u) { ret += EQUALCL; } @@ -1030,22 +904,18 @@ TRXCompiler::processCond(xmlNode* node) ret += EQUAL; } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "begins-with")) + else if(nameIs(node, "begins-with")) { int i = 0; - for(xmlNode* op = node->children; op != NULL; op = op->next) - { - if(op->type == XML_ELEMENT_NODE) - { - ret += processValue(op); - i++; - } + for (auto op : children(node)) { + ret += processValue(op); + i++; } if(i != 2) { - die(node, L" must have exactly two children"); + die(node, " must have exactly two children"); } - if(toWstring(getAttr(node, (const xmlChar*) "caseless")) == L"yes") + if(getattr(node, "caseless") == "yes"_u) { ret += ISPREFIXCL; } @@ -1054,42 +924,40 @@ TRXCompiler::processCond(xmlNode* node) ret += ISPREFIX; } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "begins-with-list")) + else if(nameIs(node, "begins-with-list")) { bool list = false; - for(xmlNode* op = node->children; op != NULL; op = op->next) - { - if(op->type != XML_ELEMENT_NODE) continue; + for (auto op : children(node)) { if(ret.size() == 0) { ret += processValue(op); } else if(list) { - die(node, L" cannot have more than two children."); + die(node, " cannot have more than two children."); } else if(xmlStrcmp(op->name, (const xmlChar*) "list")) { - die(op, L"Expected , found <" + toWstring(op->name) + L"> instead."); + die(op, "Expected , found <%S> instead.", to_ustring((const char*)op->name).c_str()); } else { - wstring name = toWstring(requireAttr(op, (const xmlChar*) "n")); + UString name = requireAttr(op, "n"); if(lists.find(name) == lists.end()) { - die(op, L"Unknown list '" + name + L"'."); + die(op, "Unknown list '%S'.", name.c_str()); } ret += STRING; - ret += (wchar_t)name.size(); + ret += (UChar)name.size(); ret += name; list = true; } } if(!list) { - die(node, L" must have two children."); + die(node, " must have two children."); } - if(toWstring(getAttr(node, (const xmlChar*) "caseless")) == L"yes") + if(getattr(node, "caseless") == "yes"_u) { ret += HASPREFIXCL; } @@ -1098,22 +966,18 @@ TRXCompiler::processCond(xmlNode* node) ret += HASPREFIX; } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "ends-with")) + else if(nameIs(node, "ends-with")) { int i = 0; - for(xmlNode* op = node->children; op != NULL; op = op->next) - { - if(op->type == XML_ELEMENT_NODE) - { - ret += processValue(op); - i++; - } + for (auto op : children(node)) { + ret += processValue(op); + i++; } if(i != 2) { - die(node, L" must have exactly two children"); + die(node, " must have exactly two children"); } - if(toWstring(getAttr(node, (const xmlChar*) "caseless")) == L"yes") + if(getattr(node, "caseless") == "yes"_u) { ret += ISSUFFIXCL; } @@ -1122,42 +986,40 @@ TRXCompiler::processCond(xmlNode* node) ret += ISSUFFIX; } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "ends-with-list")) + else if(nameIs(node, "ends-with-list")) { bool list = false; - for(xmlNode* op = node->children; op != NULL; op = op->next) - { - if(op->type != XML_ELEMENT_NODE) continue; + for (auto op : children(node)) { if(ret.size() == 0) { ret += processValue(op); } else if(list) { - die(node, L" cannot have more than two children."); + die(node, " cannot have more than two children."); } else if(xmlStrcmp(op->name, (const xmlChar*) "list")) { - die(op, L"Expected , found <" + toWstring(op->name) + L"> instead."); + die(op, "Expected , found <%S> instead.", name(op).c_str()); } else { - wstring name = toWstring(requireAttr(op, (const xmlChar*) "n")); + UString name = requireAttr(op, "n"); if(lists.find(name) == lists.end()) { - die(op, L"Unknown list '" + name + L"'."); + die(op, "Unknown list '%S'.", name.c_str()); } ret += STRING; - ret += (wchar_t)name.size(); + ret += (UChar)name.size(); ret += name; list = true; } } if(!list) { - die(node, L" must have two children."); + die(node, " must have two children."); } - if(toWstring(getAttr(node, (const xmlChar*) "caseless")) == L"yes") + if(getattr(node, "caseless") == "yes"_u) { ret += HASSUFFIXCL; } @@ -1166,22 +1028,18 @@ TRXCompiler::processCond(xmlNode* node) ret += HASSUFFIX; } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "contains-substring")) + else if(nameIs(node, "contains-substring")) { int i = 0; - for(xmlNode* op = node->children; op != NULL; op = op->next) - { - if(op->type == XML_ELEMENT_NODE) - { - ret += processValue(op); - i++; - } + for (auto op : children(node)) { + ret += processValue(op); + i++; } if(i != 2) { - die(node, L" must have exactly two children"); + die(node, " must have exactly two children"); } - if(toWstring(getAttr(node, (const xmlChar*) "caseless")) == L"yes") + if(getattr(node, "caseless") == "yes"_u) { ret += ISSUBSTRINGCL; } @@ -1190,42 +1048,41 @@ TRXCompiler::processCond(xmlNode* node) ret += ISSUBSTRING; } } - else if(!xmlStrcmp(node->name, (const xmlChar*) "in")) + else if(nameIs(node, "in")) { bool list = false; - for(xmlNode* op = node->children; op != NULL; op = op->next) - { - if(op->type != XML_ELEMENT_NODE) continue; + for (auto op : children(node)) { if(ret.size() == 0) { ret += processValue(op); } else if(list) { - die(node, L" cannot have more than two children."); + die(node, " cannot have more than two children."); } else if(xmlStrcmp(op->name, (const xmlChar*) "list")) { - die(op, L"Expected , found <" + toWstring(op->name) + L"> instead."); + die(op, "Expected , found <%S> instead.", + name(op).c_str()); } else { - wstring name = toWstring(requireAttr(op, (const xmlChar*) "n")); + UString name = requireAttr(op, "n"); if(lists.find(name) == lists.end()) { - die(op, L"Unknown list '" + name + L"'."); + die(op, "Unknown list '%S'.", name.c_str()); } ret += STRING; - ret += (wchar_t)name.size(); + ret += (UChar)name.size(); ret += name; list = true; } } if(!list) { - die(node, L" must have two children."); + die(node, " must have two children."); } - if(toWstring(getAttr(node, (const xmlChar*) "caseless")) == L"yes") + if(getattr(node, "caseless") == "yes"_u) { ret += INCL; } @@ -1236,111 +1093,101 @@ TRXCompiler::processCond(xmlNode* node) } else { - die(node, L"Unrecognized condition '" + toWstring(node->name) + L"'"); + die(node, "Unrecognized condition '%S'", name(node).c_str()); } return ret; } -wstring +UString TRXCompiler::processChoose(xmlNode* node) { - vector> clauses; + vector> clauses; int when = 0; int otherwise = 0; - for(xmlNode* cl = node->children; cl != NULL; cl = cl->next) - { - if(cl->type != XML_ELEMENT_NODE) continue; - if(!xmlStrcmp(cl->name, (const xmlChar*) "when")) + for (auto cl : children(node)) { + if(nameIs(cl, "when")) { if(otherwise > 0) { - warn(cl, L"Clauses after will not be executed."); + warn(cl, "Clauses after will not be executed."); continue; } when++; - wstring test, block; - for(xmlNode* n = cl->children; n != NULL; n = n->next) - { - if(n->type != XML_ELEMENT_NODE) continue; - if(!xmlStrcmp(n->name, (const xmlChar*) "test")) + UString test, block; + for (auto n : children(cl)) { + if(nameIs(n, "test")) { if(test.size() != 0) { - die(n, L"Cannot have multiple s in a clause."); + die(n, "Cannot have multiple s in a clause."); } - for(xmlNode* t = n->children; t != NULL; t = t->next) - { - if(t->type != XML_ELEMENT_NODE) continue; + for (auto t : children(n)) { if(test.size() == 0) { test = processCond(t); } else { - die(t, L" must have exactly one child."); + die(t, " must have exactly one child."); } } if(test.size() == 0) { - die(n, L" cannot be empty."); + die(n, " cannot be empty."); } } else { if(test.size() == 0) { - die(n, L" clause must begin with ."); + die(n, " clause must begin with ."); } block += processStatement(n); } } clauses.push_back(make_pair(test, block)); } - else if(!xmlStrcmp(cl->name, (const xmlChar*) "otherwise")) + else if(nameIs(cl, "otherwise")) { otherwise++; if(otherwise > 1) { - warn(cl, L"Multiple clauses will not be executed."); + warn(cl, "Multiple clauses will not be executed."); continue; } - wstring block; - for(xmlNode* state = cl->children; state != NULL; state = state->next) - { - if(state->type == XML_ELEMENT_NODE) - { - block += processStatement(state); - } + UString block; + for (auto state : children(cl)) { + block += processStatement(state); } if(block.size() > 0) { - clauses.push_back(make_pair(L"", block)); + clauses.push_back(make_pair(""_u, block)); } else { - warn(cl, L"Empty clause."); + warn(cl, "Empty clause."); } } else { - warn(cl, L"Ignoring unexpected clause in ."); + warn(cl, "Ignoring unexpected clause in ."); } } - wstring ret; - for(vector>::reverse_iterator it = clauses.rbegin(), + UString ret; + for(vector>::reverse_iterator it = clauses.rbegin(), limit = clauses.rend(); it != limit; it++) { - wstring act = it->second; + UString act = it->second; if(ret.size() > 0) { act += JUMP; - act += (wchar_t)ret.size(); + act += (UChar)ret.size(); } - wstring test = it->first; + UString test = it->first; if(test.size() > 0) { test += JUMPONFALSE; - test += (wchar_t)act.size(); + test += (UChar)act.size(); } ret = test + act + ret; } @@ -1359,10 +1206,10 @@ TRXCompiler::write(const char* binfile) FILE* bin = fopen(binfile, "wb"); if(bin == NULL) { - wcerr << L"Error: Cannot open " << binfile << L" for writing." << endl; + cerr << "Error: Cannot open " << binfile << " for writing." << endl; exit(EXIT_FAILURE); } - vector> inRules; + vector> inRules; for(unsigned int i = 0; i < inputRules.size(); i++) { inRules.push_back(make_pair((inputRuleSizes[i]*2 - 1), inputRules[i])); @@ -1374,6 +1221,6 @@ TRXCompiler::write(const char* binfile) void TRXCompiler::printStats() { - wcout << "Rules: " << inputRules.size() << endl; - wcout << "Macros: " << macros.size() << endl; + cout << "Rules: " << inputRules.size() << endl; + cout << "Macros: " << macros.size() << endl; } diff --git a/src/trx_compiler.h b/src/trx_compiler.h index 7f17fc7..86cb3dc 100644 --- a/src/trx_compiler.h +++ b/src/trx_compiler.h @@ -4,12 +4,12 @@ #include #include #include -#include #include #include #include #include +#include using namespace std; @@ -26,38 +26,38 @@ private: * Macros defined in the current file * name => ( parameters, xml node ) */ - map, Ltstr> macros; + map> macros; /** * Patterns defined in the current file */ - map, Ltstr> patterns; + map> patterns; /** * Global string variables * name => initial value */ - map vars; + map vars; /** * Rule-specific string variable names */ - set localVars; + set localVars; /** * All lists */ - map, Ltstr> lists; + map> lists; /** * Ids of rules which should not be compiled */ - set excludedRules; + set excludedRules; /** * Bytecode for non-postchunk rules */ - vector inputRules; + vector inputRules; /** * Sizes of patterns for non-postchunk rules @@ -67,7 +67,7 @@ private: /** * Bytecode for postchunk rules */ - vector outputRules; + vector outputRules; /** * Remapped positions within macros @@ -109,40 +109,25 @@ private: * Report a fatal error and exit * @param node - xml element closest to the error */ - void die(xmlNode* node, wstring msg); + void die(xmlNode* node, const char* fmt, ...); /** * Report a non-fatal error * @param node - xml element closest to the error */ - void warn(xmlNode* node, wstring msg); + void warn(xmlNode* node, const char* fmt, ...); ////////// // PARSING UTILITIES ////////// - /** - * Return the value of an attribute or an empty string - * @param node - xml element - * @param attr - name of attribute - * @return attribute value or empty string - */ - xmlChar* getAttr(xmlNode* node, const xmlChar* attr); - /** * getAttr(), but calls die() if attribute isn't found * @param node - xml element * @param attr - name of attribute * @return attribute value */ - xmlChar* requireAttr(xmlNode* node, const xmlChar* attr); - - /** - * Convert a the libxml string format to std::wstring - * @param s - libxml string - * @return equivalent wstring - */ - wstring toWstring(const xmlChar* s); + UString requireAttr(xmlNode* node, const char* attr); /** * Parse pos attribute and convert appropriately if in a macro @@ -169,7 +154,7 @@ private: * @param ats - category elements * @return inserted name, may or may not be equal to name */ - wstring insertAttr(wstring name, set ats); + UString insertAttr(UString name, set ats); /** * Pass a list to PatternBuilder, name-mangling if necessary @@ -177,7 +162,7 @@ private: * @param ats - list elements * @return inserted name, may or may not be equal to name */ - wstring insertList(wstring name, set ats); + UString insertList(UString name, set ats); ////////// // XML PARSING @@ -224,7 +209,7 @@ private: * , , , , , , * @return bytecode */ - wstring processStatement(xmlNode* node); + UString processStatement(xmlNode* node); /** * Parse and compile one of @@ -232,7 +217,7 @@ private: * , , , , * @return bytecode */ - wstring processValue(xmlNode* node); + UString processValue(xmlNode* node); /** * Parse and compile one of @@ -240,13 +225,13 @@ private: * , , , * @return bytecode */ - wstring processCond(xmlNode* node); + UString processCond(xmlNode* node); /** * Parse and compile * @return bytecode */ - wstring processChoose(xmlNode* node); + UString processChoose(xmlNode* node); public: TRXCompiler(); @@ -254,7 +239,7 @@ public: void loadLex(const string& fname); void compile(string file); void write(const char* binfile); - void excludeRule(wstring name) + void excludeRule(UString name) { excludedRules.insert(name); }