commit a0518792bf879df98957a9ce4411873a024e1f65 Author: Daniel Swanson Date: Tue Jun 8 13:21:44 2021 -0500 compiles now diff --git a/apertium/apertium-postlatex.l b/apertium/apertium-postlatex.l index c33673d..4cf5edf 100644 --- a/apertium/apertium-postlatex.l +++ b/apertium/apertium-postlatex.l @@ -19,11 +19,11 @@ extern "C" { } #include -#include #ifndef GENFORMAT #include "apertium_config.h" #endif #include +#include #ifdef _WIN32 #include #include @@ -32,37 +32,6 @@ extern "C" { using namespace std; AccentsMap accentsMap(true); -wstring closesym = L""; -string memconv = ""; - -wstring convertir(string const &multibyte, int const length) -{ - memconv.append(multibyte.c_str(), length); - int tam = memconv.size(); - wchar_t *retval = new wchar_t[tam+1]; - size_t l = mbstowcs(retval, memconv.c_str(), tam); - - if(l == ((size_t) -1)) - { - delete[] retval; - if(memconv.size() >= 4) - { - wcerr << L"Warning: wrong encoding" << endl; - } - return L""; - } - else - { - memconv = ""; - retval[l] = 0; - wstring ret = retval; - delete[] retval; - return ret; - } -} - - - %} @@ -79,104 +48,105 @@ wstring convertir(string const &multibyte, int const length) " { - fputws(L"\"",yyout); + fputs("\"",yyout); } ' { - fputws(L"\'",yyout); + fputs("\'",yyout); } < { - fputws(L"<",yyout); + fputs("<",yyout); } > { - fputws(L">",yyout); + fputs(">",yyout); } & { - fputws(L"\\&",yyout); + fputs("\\&",yyout); } \ { - fputws(L"&",yyout); + fputs("&",yyout); } \ { - fputws(L"\\{", yyout); + fputs("\\{", yyout); } \ { - fputws(L"\\}", yyout); + fputs("\\}", yyout); } \ { - fputws(L"\\%", yyout); + fputs("\\%", yyout); } ¿ { - fputws(L"?`",yyout); + fputs("?`",yyout); } ¡ { - fputws(L"!`",yyout); + fputs("!`",yyout); } \ { BEGIN(mathenv); - fputws(L"$$",yyout); + fputs("$$",yyout); } \<\/MATH_DOLLARS\> { - fputws(L"$$",yyout); + fputs("$$",yyout); BEGIN(0); } \ { BEGIN(mathenv); - fputws(L"$",yyout); + fputs("$",yyout); } \<\/MATH_DOLLAR\> { - fputws(L"$",yyout); + fputs("$",yyout); BEGIN(0); } \ { - fputws(L"\\(",yyout); + fputs("\\(",yyout); } \<\/MATH_PAR\> { - fputws(L"\\)",yyout); + fputs("\\)",yyout); } \ { - fputws(L"\\[",yyout); + fputs("\\[",yyout); } \<\/MATH_BRA\> { - fputws(L"\\]",yyout); + fputs("\\]",yyout); } \ { - fputws(L"{",yyout); + fputs("{",yyout); } \<\/CONTENTS\> { - fputws(L"}",yyout); + fputs("}",yyout); } &NBSP; { - fputws(L"~",yyout); + fputs("~",yyout); } \ { - fputws(L"\\\\",yyout); + fputs("\\\\",yyout); } \[^\<]* { - fputws((wstring(L"\%")+convertir(yytext+9,yyleng-9)).c_str(),yyout); + fputs("\%", yyout); + fputrange(yytext, 9, yyleng-9, yyout); } \<\/COMMENT\> { @@ -184,14 +154,15 @@ wstring convertir(string const &multibyte, int const length) \[^\<]* { - fputws((wstring(L"[")+convertir(yytext+7,yyleng-7)).c_str(),yyout); + fputc('[', yyout); + fputrange(yytext, 7, yyleng-7, yyout); } \<\/PARAM\> { - fputws(L"]", yyout); + fputs("]", yyout); } \ { - fputws(L"\\verb", yyout); + fputs("\\verb", yyout); } \<\/VERB\> { @@ -201,99 +172,118 @@ wstring convertir(string const &multibyte, int const length) ł { - fputws(L"\\l", yyout); + fputs("\\l", yyout); } œ { - fputws(L"{\\oe}",yyout); + fputs("{\\oe}",yyout); } Œ { - fputws(L"{\\OE}",yyout); + fputs("{\\OE}",yyout); } æ { - fputws(L"{\\ae}",yyout); + fputs("{\\ae}",yyout); } Æ { - fputws(L"{\\AE}",yyout); + fputs("{\\AE}",yyout); } å { - fputws(L"{\\aa}",yyout); + fputs("{\\aa}",yyout); } Å { - fputws(L"{\\AA}",yyout); + fputs("{\\AA}",yyout); } ø { - fputws(L"{\\o}",yyout); + fputs("{\\o}",yyout); } Ø { - fputws(L"{\\O}",yyout); + fputs("{\\O}",yyout); } ß { - fputws(L"{\\ss}",yyout); + fputs("{\\ss}",yyout); } \<[a-zA-Z0-9]+\> { - fputws((wstring(L"\\begin{")+convertir(yytext+1,yyleng-2)+wstring(L"}")).c_str(),yyout); + fputs("\\begin{", yyout); + fputrange(yytext, 1, yyleng-2, yyout); + fputc('}', yyout); } \ { - fputws((wstring(L"\\#")+convertir(yytext+6,yyleng-8)).c_str(),yyout); + fputs("\\#", yyout); + fputrange(yytext, 6, yyleng-8, yyout); } \ { - fputws(L"\\#", yyout); + fputs("\\#", yyout); } \<[a-zA-Z0-9]+_STAR\> { - fputws((wstring(L"\\begin{")+convertir(yytext+1,yyleng-7)+wstring(L"*}")).c_str(),yyout); + fputs("\\begin{", yyout); + fputrange(yytext, 1, yyleng-7, yyout); + fputs("*}", yyout); } \<\/[a-zA-Z0-9]+\> { - fputws((wstring(L"\\end{")+convertir(yytext+2,yyleng-3)+wstring(L"}")).c_str(),yyout); + fputs("\\end{", yyout); + fputrange(yytext, 2, yyleng-3, yyout); + fputc('}', yyout); } \<\/[a-zA-Z0-9]+_STAR\> { - fputws((wstring(L"\\end{")+convertir(yytext+2,yyleng-8)+wstring(L"*}")).c_str(),yyout); + fputs("\\end{", yyout); + fputrange(yytext, 2, yyleng-8, yyout); + fputs("*}", yyout); } \<[a-zA-Z0-9]+\/\> { - fputws((wstring(L"\\")+convertir(yytext+1,yyleng-3)).c_str(),yyout); + fputc('\\', yyout); + fputrange(yytext, 1, yyleng-3, yyout); } \<[a-zA-Z0-9]+_STAR\/\> { - fputws((wstring(L"\\")+convertir(yytext+1,yyleng-8)+wstring(L"*")).c_str(),yyout); + fputc('\\', yyout); + fputrange(yytext, 1, yyleng-8, yyout); + fputc('*', yyout); } /*NO ENTIENDO ESTA REGLA \# { - fputws(L"\\#", yyout); + fputs("\\#", yyout); }*/ [^A-Za-z\n] { - wstring wt = convertir(yytext,yyleng); - wstring wa = accentsMap.get(wt); - if( wa == L"" ) - fputws(wt.c_str(),yyout); - else - fputws(wstring(L"\\"+wa.substr(0,1)+L"{"+wa.substr(1)+L"}").c_str(),yyout); + UString wt = to_ustring(yytext); + UString wa = accentsMap.get(wt); + if (wa.empty()) { + fputus(wt, yyout); + } else { + UString tmp; + tmp += '\\'; + tmp += wa[0]; + tmp += '{'; + tmp += wa.substr(1); + tmp += '}'; + fputus(tmp, yyout); + } } (.|\n) { - fputws(convertir(yytext,yyleng).c_str(),yyout); + fputs(yytext, yyout); } (.|\n) { - fputws(convertir(yytext,yyleng).c_str(),yyout); + fputs(yytext, yyout); } @@ -323,7 +313,7 @@ int main(int argc, char *argv[]) base++; } - if((argc-base) > 4) + if((argc-base) > 4) { usage(argv[0]); } @@ -347,10 +337,6 @@ int main(int argc, char *argv[]) break; } -#ifdef _MSC_VER - _setmode(_fileno(yyin), _O_U8TEXT); - _setmode(_fileno(yyout), _O_U8TEXT); -#endif // prevent warning message yy_push_state(1); yy_top_state(); diff --git a/apertium/apertium-prelatex.l b/apertium/apertium-prelatex.l index 2bf7243..075581f 100644 --- a/apertium/apertium-prelatex.l +++ b/apertium/apertium-prelatex.l @@ -20,7 +20,6 @@ extern "C" { } #include -#include #ifndef GENFORMAT #include "apertium_config.h" #endif @@ -33,38 +32,10 @@ extern "C" { using namespace std; AccentsMap accentsMap(false); -wstring closesym = L""; -string memconv = ""; +UString closesym; //For german babel detection bool ngermanbabel = false; -wstring convertir(string const &multibyte, int const length) -{ - memconv.append(multibyte.c_str(), length); - int tam = memconv.size(); - wchar_t *retval = new wchar_t[tam+1]; - size_t l = mbstowcs(retval, memconv.c_str(), tam); - - if(l == ((size_t) -1)) - { - delete[] retval; - if(memconv.size() >= 4) - { - wcerr << L"Warning: wrong encoding" << endl; - } - return L""; - } - else - { - memconv = ""; - retval[l] = 0; - wstring ret = retval; - delete[] retval; - return ret; - } -} - - %} @@ -84,25 +55,25 @@ wstring convertir(string const &multibyte, int const length) \\t\{..\} { //This information is lost - fputws(convertir(yytext+3,yyleng-4).c_str(),yyout); + fputrange(yytext, 3, yyleng-4, yyout); } \\l { - fputws(L"ł", yyout); + fputs("ł", yyout); } \"[oOaAuUsS] { //When usepackage[ngerman]{babel} is present (not checked). - if(!ngermanbabel) - fputws(convertir(yytext,yyleng).c_str(),yyout); - else { + if(!ngermanbabel) { + fputs(yytext, yyout); + } else { switch(yytext[1]){ - case 'o': fputws(L"ö", yyout); break; - case 'O': fputws(L"Ö", yyout); break; - case 'a': fputws(L"ä", yyout); break; - case 'A': fputws(L"Ä", yyout); break; - case 'u': fputws(L"ü", yyout); break; - case 'U': fputws(L"Ü", yyout); break; - case 's': fputws(L"ß", yyout); break; - case 'S': fputws(L"ß", yyout); break; + case 'o': fputs("ö", yyout); break; + case 'O': fputs("Ö", yyout); break; + case 'a': fputs("ä", yyout); break; + case 'A': fputs("Ä", yyout); break; + case 'u': fputs("ü", yyout); break; + case 'U': fputs("Ü", yyout); break; + case 's': fputs("ß", yyout); break; + case 'S': fputs("ß", yyout); break; } } } @@ -113,288 +84,304 @@ wstring convertir(string const &multibyte, int const length) switch(yytext[1]){ case '^': if(yytext[4]=='i') - fputws(L"î", yyout); + fputs("î", yyout); else - fputws(L"ĵ",yyout); + fputs("ĵ",yyout); break; case '\"': if(yytext[4]=='i') - fputws(L"ï",yyout); + fputs("ï",yyout); else - fputws(L"j",yyout); //should actually be j with umlaut + fputs("j",yyout); //should actually be j with umlaut break; case '\'': if(yytext[4]=='i') - fputws(L"í",yyout); + fputs("í",yyout); else - fputws(L"j",yyout); //should actually be j with accent + fputs("j",yyout); //should actually be j with accent break; case '`': if(yytext[4]=='i') - fputws(L"ì",yyout); + fputs("ì",yyout); else - fputws(L"k",yyout); //should actually be j with accent + fputs("k",yyout); //should actually be j with accent break; } } \{\\oe\} { - fputws(L"œ",yyout); + fputs("œ",yyout); } \{\\OE\} { - fputws(L"Œ",yyout); + fputs("Œ",yyout); } \{\\ae\} { - fputws(L"æ",yyout); + fputs("æ",yyout); } \{\\AE\} { - fputws(L"Æ",yyout); + fputs("Æ",yyout); } \{\\aa\} { - fputws(L"å",yyout); + fputs("å",yyout); } \{\\AA\} { - fputws(L"Å",yyout); + fputs("Å",yyout); } \{\\o\} { - fputws(L"ø",yyout); + fputs("ø",yyout); } \{\\O\} { - fputws(L"Ø",yyout); + fputs("Ø",yyout); } \{\\ss\} { - fputws(L"ß",yyout); + fputs("ß",yyout); } \\#[0-9]+ { - fputws((wstring(L"")).c_str(),yyout); + fputs("", yyout); } \\# { - fputws(L"", yyout); + fputs("", yyout); } \\[`'\^\"H~ck=b.druv]((\{.\})|(.)) { - wstring ws = convertir(yytext,yyleng).c_str(); - - wstring result = accentsMap.get( - L""+ws.substr(1,1)+ ( - (yyleng==3)? ws.substr(2,1) : ws.substr(3,1) - )); - - if(result == L"") - { - fputws((wstring(L"<")+convertir(yytext+1,yyleng)+wstring(L"/>")).c_str(),yyout); - } - else - { - fputws(result.c_str(), yyout); - } + UString ws = to_ustring(yytext); + UString key; + key += ws[1]; + key += (yyleng == 3) ? ws[2] : ws[3]; + + UString result = accentsMap.get(key); + if (result.empty()) { + fputc('<', yyout); + fputrange(yytext, 1, yyleng, yyout); + fputs("/>", yyout); + } else { + fputus(result, yyout); + } } \\\\ { - fputws(L"
",yyout); + fputs("
",yyout); } \%.* { - if(yytext[yyleng-1]=='\r') - fputws((wstring(L"")+convertir(yytext+1,yyleng-2)+wstring(L"\r")).c_str(),yyout); - else - fputws((wstring(L"")+convertir(yytext+1,yyleng-1)+wstring(L"")).c_str(),yyout); + fputs("", yyout); + if (yytext[yyleng-1] == '\r') { + fputrange(yytext, 1, yyleng-2, yyout); + fputs("\r", yyout); + } else { + fputrange(yytext, 1, yyleng-1, yyout); + fputs("", yyout); + } } \\usepackage\[[^\]]*\] { - wstring ws = convertir(yytext+12,yyleng-13); - fputws((wstring(L"")+ws+wstring(L"")).c_str(), yyout); - if(ws.find(L"ngerman") != wstring::npos) - ngermanbabel = true; + fputs("", yyout); + // this is maybe wrong, but hopefully no one puts non-ASCII + // characters in their package names + UString ws = to_ustring(yytext).substr(12, yyleng-13); + fputus(ws, yyout); + fputs("", yyout); + if(ws.find("ngerman"_u) != UString::npos) + ngermanbabel = true; } \[[^\]]*\] { - fputws((wstring(L"")+convertir(yytext+1,yyleng-2)+wstring(L"")).c_str(), yyout); + fputs("", yyout); + fputrange(yytext, 1, yyleng-2, yyout); + fputs("", yyout); } \\begin[^a-zA-Z0-9_] { BEGIN(readbrackets); - closesym = L""; + closesym = ""_u; } \\end[^a-zA-Z0-9_] { BEGIN(readbrackets); - closesym = L"/"; + closesym = "/"_u; } [ \n\r\t]*\{?[ \n\r\t]* { - wstring ws = convertir(yytext,yyleng); - int i = ws.find(L'{'); //remove it - if(i>=0) - ws = ws.substr(0,i)+ws.substr(i+1); - fputws(ws.c_str(),yyout); + UString ws = to_ustring(yytext); + int i = ws.find('{'); + if (i >= 0) { + ws = ws.substr(0, i) + ws.substr(i+1); + } + fputus(ws, yyout); } [a-zA-Z0-9]+\* { - fputws((wstring(L"<")+closesym+convertir(yytext,yyleng-1)+wstring(L"_STAR>")).c_str(),yyout); + fputc('<', yyout); + fputus(closesym, yyout); + fputrange(yytext, 0, yyleng-1, yyout); + fputs("_STAR>", yyout); } [a-zA-Z0-9]+ { - fputws((wstring(L"<")+closesym+convertir(yytext,yyleng)+wstring(L">")).c_str(),yyout); + fputc('<', yyout); + fputus(closesym, yyout); + fputs(yytext, yyout); + fputc('>', yyout); } [ \n\r\t]*\}[ \n\r\t]* { BEGIN(0); - wstring ws = convertir(yytext,yyleng); - int i = ws.find(L'}'); //remove it - if(i>=0) - ws = ws.substr(0,i)+ws.substr(i+1); - fputws(ws.c_str(),yyout); + UString ws = to_ustring(yytext); + int i = ws.find('}'); + if (i >= 0) { + ws = ws.substr(0, i) + ws.substr(i+1); + } + fputus(ws, yyout); } \\[A-Za-z]+\* { - fputws((wstring(L"<")+convertir(yytext+1,yyleng-2)+wstring(L"_STAR/>")).c_str(),yyout); + fputc('<', yyout); + fputrange(yytext, 1, yyleng-2, yyout); + fputs("_STAR/>", yyout); } \\[A-Za-z]+ { - fputws((wstring(L"<")+convertir(yytext+1,yyleng)+wstring(L"/>")).c_str(),yyout); + fputc('<', yyout); + fputrange(yytext, 1, yyleng, yyout); + fputs("/>", yyout); } \\\{ { - fputws(L"", yyout); + fputs("", yyout); } \\\{ { - fputws(L"", yyout); + fputs("", yyout); } \\\% { - fputws(L"", yyout); + fputs("", yyout); } \{ { - fputws(L"",yyout); + fputs("",yyout); } \} { - fputws((wstring(L"")).c_str(),yyout); + fputs("", yyout); } ~ { - fputws(L"&NBSP;",yyout); + fputs("&NBSP;",yyout); } \$\$ { BEGIN(mathenv); - fputws(L"",yyout); + fputs("",yyout); } \$\$ { - fputws(L"",yyout); + fputs("",yyout); BEGIN(0); } \$ { BEGIN(mathenv); - fputws(L"",yyout); + fputs("",yyout); } \$ { - fputws(L"",yyout); + fputs("",yyout); BEGIN(0); } \\verb[|][^|]+[|] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("", yyout); + fputrange(yytext, 5, yyleng-5, yyout); + fputs("", yyout); } \\verb[!][^!]+[!] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("",yyout); + fputrange(yytext, 5, yyleng-5, yyout); + fputs("", yyout); } \\verb[?][^?]+[?] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("",yyout); + fputrange(yytext, 5, yyleng-5, yyout); + fputs("", yyout); } \\verb[/][^/]+[/] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("",yyout); + fputrange(yytext, 5, yyleng-5, yyout); + fputs("", yyout); } \\verb[#][^#]+[#] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("",yyout); + fputrange(yytext, 5, yyleng-5, yyout); + fputs("", yyout); } \\verb[+][^+]+[+] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("",yyout); + fputrange(yytext, 5, yyleng-5, yyout); + fputs("", yyout); } \\\( { - fputws(L"",yyout); + fputs("",yyout); } \\\) { - fputws(L"",yyout); + fputs("",yyout); } \\\[ { - fputws(L"",yyout); + fputs("",yyout); } \\\] { - fputws(L"",yyout); + fputs("",yyout); } \?` { - fputws(L"¿",yyout); + fputs("¿",yyout); } !` { - fputws(L"¡",yyout); + fputs("¡",yyout); } \" { - fputws(L""",yyout); + fputs(""",yyout); } \' { - fputws(L"'",yyout); + fputs("'",yyout); } \< { - fputws(L"<",yyout); + fputs("<",yyout); } \> { - fputws(L">",yyout); + fputs(">",yyout); } \\\& { - fputws(L"&",yyout); + fputs("&",yyout); } \& { - fputws(L"",yyout); + fputs("",yyout); } @@ -402,11 +389,11 @@ wstring convertir(string const &multibyte, int const length) (.|\n|\r) { - fputws(convertir(yytext,yyleng).c_str(),yyout); + fputs(yytext, yyout); } (.|\n) { - fputws(convertir(yytext,yyleng).c_str(),yyout); + fputs(yytext, yyout); } diff --git a/apertium/apertium_interchunk.cc b/apertium/apertium_interchunk.cc index 63dbf09..e2dcb9f 100644 --- a/apertium/apertium_interchunk.cc +++ b/apertium/apertium_interchunk.cc @@ -74,7 +74,7 @@ UFILE * open_output(const char* filename) if(!output) { cerr << "Error: can't open output file '"; - cerr << filename.c_str() << "'." << endl; + cerr << filename << "'." << endl; exit(EXIT_FAILURE); } return output; @@ -120,7 +120,8 @@ int main(int argc, char *argv[]) InputFile input; UFILE* output = u_finit(stdout, NULL, NULL); - string f1, f2; + const char* f1; + const char* f2; switch(argc - optind + 1) { case 5: diff --git a/apertium/apertium_perceptron_trace.cc b/apertium/apertium_perceptron_trace.cc index 92e05c4..f006a93 100644 --- a/apertium/apertium_perceptron_trace.cc +++ b/apertium/apertium_perceptron_trace.cc @@ -41,11 +41,11 @@ int perceptron_trace(int argc, char* argv[]) PerceptronTagger pt(flags); pt.read_spec(argv[2]); - std::wifstream untagged_stream; + std::ifstream untagged_stream; try_open_fstream("UNTAGGED_CORPUS", argv[3], untagged_stream); Stream untagged(flags, untagged_stream, argv[3]); - std::wifstream tagged_stream; + std::ifstream tagged_stream; try_open_fstream("TAGGED_CORPUS", argv[4], tagged_stream); Stream tagged(flags, tagged_stream, argv[4]); diff --git a/apertium/apertium_postchunk.cc b/apertium/apertium_postchunk.cc index ae8ea12..26204c3 100644 --- a/apertium/apertium_postchunk.cc +++ b/apertium/apertium_postchunk.cc @@ -47,10 +47,10 @@ void message(char *progname) exit(EXIT_FAILURE); } -void testfile(string const &filename) +void testfile(const char* filename) { struct stat mybuf; - if(stat(filename.c_str(), &mybuf) == -1) + if(stat(filename, &mybuf) == -1) { cerr << "Error: can't stat file '"; cerr << filename << "'." << endl; @@ -110,7 +110,8 @@ int main(int argc, char *argv[]) InputFile input; UFILE* output = u_finit(stdout, NULL, NULL); - string f1, f2; + const char* f1; + const char* f2; switch(argc - optind + 1) { case 5: diff --git a/apertium/apertium_posttransfer.cc b/apertium/apertium_posttransfer.cc index af40c60..8b95db8 100644 --- a/apertium/apertium_posttransfer.cc +++ b/apertium/apertium_posttransfer.cc @@ -49,15 +49,15 @@ void processStream(InputFile& in, UFILE* out, bool null_flush) { if (!((c == ' ') && (prev == ' '))) { - putc(c, out); + u_fputc(c, out); } if (c == 0 && null_flush) { - fflush(out); - putc(c, out); + u_fflush(out); + u_fputc(c, out); } prev = c; - c = fgetc(in); + c = in.get(); } } diff --git a/apertium/apertium_pretransfer.cc b/apertium/apertium_pretransfer.cc index a073afc..b67fac8 100644 --- a/apertium/apertium_pretransfer.cc +++ b/apertium/apertium_pretransfer.cc @@ -27,6 +27,7 @@ #endif #include #include +#include using namespace Apertium; using namespace std; diff --git a/apertium/apertium_tagger_readwords.cc b/apertium/apertium_tagger_readwords.cc index 7f0a14c..01469a0 100644 --- a/apertium/apertium_tagger_readwords.cc +++ b/apertium/apertium_tagger_readwords.cc @@ -17,7 +17,6 @@ */ #include "getopt_long.h" -#include #include #include #include @@ -41,8 +40,8 @@ void check_file(FILE *f, const string& path) { } } -void readwords (FILE *is, int corpus_length) { - FileMorphoStream lexmorfo(is, true, &tagger_data_hmm); +void readwords (int corpus_length) { + FileMorphoStream lexmorfo(NULL, true, &tagger_data_hmm); TaggerWord *word=NULL; int nwords=0; @@ -50,15 +49,15 @@ void readwords (FILE *is, int corpus_length) { while(word) { nwords++; - cout<get_superficial_form())<<" "<get_string_tags())<<"\n"; + cout << word->get_superficial_form() << " " << word->get_string_tags() << "\n"; if (check_ambclasses) { int k=tagger_data_hmm.getOutput()[word->get_tags()]; if ((k>=tagger_data_hmm.getM())||(k<0)) { cerr<<"Error: Ambiguity class number out of range: "<get_superficial_form())<<"\n"; - cerr<<"Ambiguity class: "<get_string_tags())<<"\n"; + cerr<<"Word: "<< word->get_superficial_form() << "\n"; + cerr<<"Ambiguity class: "<< word->get_string_tags() << "\n"; } } @@ -191,5 +190,5 @@ int main(int argc, char* argv[]) { TaggerWord::setArrayTags(tagger_data_hmm.getArrayTags()); - readwords(stdin, corpus_length); + readwords(corpus_length); } diff --git a/apertium/apertium_tmxbuild.cc b/apertium/apertium_tmxbuild.cc index 7118471..06588d8 100644 --- a/apertium/apertium_tmxbuild.cc +++ b/apertium/apertium_tmxbuild.cc @@ -64,7 +64,7 @@ int main(int argc, char *argv[]) LtLocale::tryToSetLocale(); string output_file = ""; string doc1 = "", doc2 = ""; - string lang1 = "", lang2 = ""; + UString lang1, lang2; double percent = 0.85; int low_limit = 15; @@ -174,8 +174,8 @@ int main(int argc, char *argv[]) case 5: doc1 = argv[optind - 1 + 3]; doc2 = argv[optind - 1 + 4]; - lang1 = argv[optind - 1 + 1]; - lang2 = argv[optind - 1 + 2]; + lang1 = to_ustring(argv[optind - 1 + 1]); + lang2 = to_ustring(argv[optind - 1 + 2]); break; default: @@ -183,7 +183,7 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } - TMXBuilder tmxb(UtfConverter::fromUtf8(lang1), UtfConverter::fromUtf8(lang2)); + TMXBuilder tmxb(lang1, lang2); // if(!tmxb.check(doc1, doc2)) // { // cerr << "Error: The two files are incompatible for building a TMX." << endl; diff --git a/apertium/interchunk.cc b/apertium/interchunk.cc index 3afb9d3..249a235 100644 --- a/apertium/interchunk.cc +++ b/apertium/interchunk.cc @@ -15,421 +15,167 @@ * along with this program; if not, see . */ #include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include -#include "apertium_config.h" -#include + +#include using namespace Apertium; using namespace std; -void -Interchunk::destroy() -{ - delete me; - me = NULL; - - if(doc) - { - xmlFreeDoc(doc); - doc = NULL; - } -} - -Interchunk::Interchunk() : -word(0), -lword(0), -last_lword(0), -output(0), -any_char(0), -any_tag(0), -nwords(0) -{ - me = NULL; - doc = NULL; - root_element = NULL; - lastrule = NULL; - inword = false; - null_flush = false; - internal_null_flush = false; - trace = false; - in_out = false; -} - -Interchunk::~Interchunk() -{ - destroy(); -} +Interchunk::Interchunk() + : word(0), last_lword(0), inword(false) +{} -void -Interchunk::readData(FILE *in) +bool +Interchunk::checkIndex(xmlNode *element, int index, int limit) { - alphabet.read(in); - any_char = alphabet(TRXReader::ANY_CHAR); - any_tag = alphabet(TRXReader::ANY_TAG); - - Transducer t; - t.read(in, alphabet.size()); - - map finals; - - // finals - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - int key = Compression::multibyte_read(in); - finals[key] = Compression::multibyte_read(in); - } - - me = new MatchExe(t, finals); - - // attr_items - //bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); - Compression::string_read(in); // version - bool recompile_attrs = true; - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - UString const cad_k = Compression::string_read(in); - attr_items[cad_k].read(in); - UString fallback = Compression::string_read(in); - if(recompile_attrs) { - //attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); - // TODO regexs - } - } - - // variables - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + if(index >= limit) { - UString const cad_k = Compression::string_read(in); - variables[cad_k] = Compression::string_read(in); + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index >= limit" << endl; + return false; } - - // macros - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - UString const cad_k = Compression::string_read(in); - macros[cad_k] = Compression::multibyte_read(in); + if(index < 0) { + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index < 0" << endl; + return false; } - - // lists - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + if(word[index] == 0) { - UString const cad_k = Compression::string_read(in); - - for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) - { - UString const cad_v = Compression::string_read(in); - lists[cad_k].insert(cad_v); - listslow[cad_k].insert(StringUtils::tolower(cad_v)); - } + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": Null access at word[index]" << endl; + return false; } + return true; } -void -Interchunk::read(const char* transferfile, const char* datafile) -{ - readInterchunk(transferfile); - - // datafile - FILE *in = fopen(datafile, "rb"); - if(!in) - { - cerr << "Error: Could not open file '" << datafile << "'." << endl; - exit(EXIT_FAILURE); +UString +Interchunk::evalCachedString(xmlNode* element) +{ + TransferInstr& ti = evalStringCache[element]; + switch (ti.getType()) { + case ti_clip_tl: + if (checkIndex(element, ti.getPos(), lword)) { + if (ti.getContent() == "content"_u) { + UString wf = word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + return wf.substr(1, wf.length()-2); // trim { and } + } else { + return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + } + } + break; + + case ti_var: + return variables[ti.getContent()]; + + case ti_lit_tag: + case ti_lit: + return ti.getContent(); + + case ti_b: + if (!blank_queue.empty()) { + UString retblank = blank_queue.front(); + if (in_out) { + blank_queue.pop(); + } + return retblank; + } else { + return " "_u; + } + break; + + case ti_get_case_from: + if (checkIndex(element, ti.getPos(), lword)) { + return StringUtils::copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]), + evalString((xmlNode*) ti.getPointer())); + } + break; + + case ti_case_of_tl: + if (checkIndex(element, ti.getPos(), lword)) { + return StringUtils::getcase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()])); + } + break; + + default: + return ""_u; } - readData(in); - fclose(in); - + return ""_u; } void -Interchunk::readInterchunk(const char* in) +Interchunk::processClip(xmlNode* element) { - doc = xmlReadFile(in, NULL, 0); - - if(doc == NULL) - { - cerr << "Error: Could not parse file '" << in << "'." << endl; - exit(EXIT_FAILURE); - } - - root_element = xmlDocGetRootElement(doc); - - // search for macros & rules - for(xmlNode *i = root_element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) - { - collectMacros(i); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) - { - collectRules(i); - } + int pos = 0; + UString part; + for (xmlAttr* i = element->properties; i != NULL; i = i->next) { + if (!xmlStrcmp(i->name, (const xmlChar*) "part")) { + part = to_ustring((const char*) i->children->content); + } else if (!xmlStrcmp(i->name, (const xmlChar*) "pos")) { + pos = atoi((const char*) i->children->content) - 1; } } + evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL); } void -Interchunk::collectRules(xmlNode *localroot) +Interchunk::processBlank(xmlNode* element) { - for(xmlNode *rule = localroot->children; rule != NULL; rule = rule->next) - { - if(rule->type == XML_ELEMENT_NODE) - { - size_t line = rule->line; - for(xmlNode *rulechild = rule->children; ; rulechild = rulechild->next) - { - if(rulechild->type == XML_ELEMENT_NODE && !xmlStrcmp(rulechild->name, (const xmlChar *) "action")) - { - rule_map.push_back(rulechild); - rule_lines.push_back(line); - break; - } - } - } + if (element->properties == NULL) { + evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); + } else { + int pos = atoi((const char*) element->properties->children->content) - 1; + evalStringCache[element] = TransferInstr(ti_b, ""_u, pos); } } void -Interchunk::collectMacros(xmlNode *localroot) +Interchunk::processLuCount(xmlNode* element) { - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - macro_map.push_back(i); - } - } + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); } -bool -Interchunk::checkIndex(xmlNode *element, int index, int limit) +UString +Interchunk::processLu(xmlNode* element) { - if(index >= limit) - { - cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index >= limit" << endl; - return false; - } - if(index < 0) { - cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index < 0" << endl; - return false; - } - if(word[index] == 0) - { - cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": Null access at word[index]" << endl; - return false; - } - return true; + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + return ""_u; // make the type checker happy } - UString -Interchunk::evalString(xmlNode *element) +Interchunk::processMlu(xmlNode* element) { - if (element == 0) - { - throw "Interchunk::evalString() was passed a NULL element"; - } - - map::iterator it; - it = evalStringCache.find(element); - if(it != evalStringCache.end()) - { - TransferInstr &ti = it->second; - switch(ti.getType()) - { - case ti_clip_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(ti.getContent() == "content"_u) // jacob's new 'part' - { - UString wf = word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); - return wf.substr(1, wf.length()-2); // trim away the { and } - } - else - { - return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); - } - } - break; - - case ti_var: - return variables[ti.getContent()]; - - case ti_lit_tag: - case ti_lit: - return ti.getContent(); - - case ti_b: - if(!blank_queue.empty()) - { - UString retblank = blank_queue.front(); - - if(in_out) - { - blank_queue.pop(); - } - - return retblank; - } - else - { - return " "_u; - } - break; - - case ti_get_case_from: - if(checkIndex(element, ti.getPos(), lword)) - { - return copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]), - evalString((xmlNode *) ti.getPointer())); - } - break; - - case ti_case_of_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - return caseOf(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()])); - } - break; - - default: - return ""_u; - } - return ""_u; - } - - if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) - { - int pos = 0; - UString part; - - for(xmlAttr *i = element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = to_ustring((char*)i->children->content); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *)i->children->content) - 1; - } - } - - evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) - { - evalStringCache[element] = TransferInstr(ti_lit_tag, - tags(to_ustring((const char *) element->properties->children->content)), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) - { - evalStringCache[element] = TransferInstr(ti_lit, to_ustring((const char *) element->properties->children->content), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) - { - if(element->properties == NULL) - { - evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); - } - else - { - int pos = atoi((const char *) element->properties->children->content) - 1; - evalStringCache[element] = TransferInstr(ti_b, ""_u, pos); - } - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) - { - int pos = atoi((const char *) element->properties->children->content) - 1; - xmlNode *param = NULL; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - param = i; - break; - } - } - - evalStringCache[element] = TransferInstr(ti_get_case_from, "lem"_u, pos, param); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) - { - evalStringCache[element] = TransferInstr(ti_var, to_ustring((const char *) element->properties->children->content), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) - { - int pos = 0; - UString part; - - for(xmlAttr *i = element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = to_ustring((char*)i->children->content); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *) i->children->content) - 1; - } - } + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + return ""_u; // make the type checker happy +} - evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) - { - UString value; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - value.append(evalString(i)); - } +void +Interchunk::processCaseOf(xmlNode* element) +{ + int pos = 0; + UString part; + for (xmlAttr* i = element->properties; i != NULL; i = i->next) { + if (!xmlStrcmp(i->name, (const xmlChar*) "part")) { + part = to_ustring((char*) i->children->content); + } else if (!xmlStrcmp(i->name, (const xmlChar*) "pos")) { + pos = atoi((const char*) i->children->content) - 1; } - return value; } - else if(!xmlStrcmp(element->name, (const xmlChar *) "chunk")) - { - return processChunk(element); - } - else - { - cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; - exit(EXIT_FAILURE); - } - - return evalString(element); + evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); } void Interchunk::processOut(xmlNode *localroot) { in_out = true; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) - { - write(processChunk(i), output); - } - else // 'b' - { - write(evalString(i), output); - } + + for (auto i : children(localroot)) { + if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) { + write(processChunk(i), output); + } else { // 'b' + write(evalString(i), output); } } @@ -442,65 +188,25 @@ Interchunk::processChunk(xmlNode *localroot) UString result; result.append("^"_u); - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - result.append(evalString(i)); - } + for (auto i : children(localroot)) { + result.append(evalString(i)); } result.append("$"_u); return result; } -void -Interchunk::processInstruction(xmlNode *localroot) -{ - if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) - { - processChoose(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) - { - processLet(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) - { - processAppend(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) - { - processOut(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) - { - processCallMacro(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) - { - processModifyCase(localroot); - } -} - void Interchunk::processLet(xmlNode *localroot) { xmlNode *leftSide = NULL, *rightSide = NULL; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(leftSide == NULL) - { - leftSide = i; - } - else - { - rightSide = i; - break; - } + for (auto i : children(localroot)) { + if(leftSide == NULL) { + leftSide = i; + } else { + rightSide = i; + break; } } @@ -564,46 +270,17 @@ Interchunk::processLet(xmlNode *localroot) } } -void -Interchunk::processAppend(xmlNode *localroot) -{ - UString name; - for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "n")) - { - name = to_ustring((char *) i->children->content); - break; - } - } - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - variables[name].append(evalString(i)); - } - } -} - void Interchunk::processModifyCase(xmlNode *localroot) { xmlNode *leftSide = NULL, *rightSide = NULL; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(leftSide == NULL) - { - leftSide = i; - } - else - { - rightSide = i; - break; - } + for (auto i : children(localroot)) { + if(leftSide == NULL) { + leftSide = i; + } else { + rightSide = i; + break; } } @@ -624,7 +301,7 @@ Interchunk::processModifyCase(xmlNode *localroot) } } - UString const result = copycase(evalString(rightSide), + UString const result = StringUtils::copycase(evalString(rightSide), word[pos]->chunkPart(attr_items[part])); bool match = word[pos]->setChunkPart(attr_items[part], result); if(!match && trace) @@ -635,7 +312,7 @@ Interchunk::processModifyCase(xmlNode *localroot) else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { UString const val = to_ustring((const char *) leftSide->properties->children->content); - variables[val] = copycase(evalString(rightSide), variables[val]); + variables[val] = StringUtils::copycase(evalString(rightSide), variables[val]); } } @@ -659,16 +336,11 @@ Interchunk::processCallMacro(xmlNode *localroot) // ToDo: Is it at all valid if npar <= 0 ? InterchunkWord **myword = NULL; + int idx = 0; if(npar > 0) { myword = new InterchunkWord *[npar]; - } - - int idx = 0; - for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { + for (auto i : children(localroot)) { int pos = atoi((const char *) i->properties->children->content)-1; myword[idx] = word[pos]; idx++; @@ -678,12 +350,8 @@ Interchunk::processCallMacro(xmlNode *localroot) swap(myword, word); swap(npar, lword); - for(xmlNode *i = macro->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - processInstruction(i); - } + for (auto i : children(macro)) { + processInstruction(i); } swap(myword, word); @@ -692,603 +360,6 @@ Interchunk::processCallMacro(xmlNode *localroot) delete[] myword; } -void -Interchunk::processChoose(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "when")) - { - bool picked_option = false; - - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(j->name, (const xmlChar *) "test")) - { - if(!processTest(j)) - { - break; - } - else - { - picked_option = true; - } - } - else - { - processInstruction(j); - } - } - } - if(picked_option) - { - return; - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) - { - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - processInstruction(j); - } - } - } - } - } -} - -bool -Interchunk::processLogical(xmlNode *localroot) -{ - if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) - { - return processEqual(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) - { - return processBeginsWith(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) - { - return processBeginsWithList(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) - { - return processEndsWith(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) - { - return processEndsWithList(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) - { - return processContainsSubstring(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) - { - return processOr(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) - { - return processAnd(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) - { - return processNot(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) - { - return processIn(localroot); - } - - return false; -} - -bool -Interchunk::processIn(xmlNode *localroot) -{ - xmlNode *value = NULL; - UString idlist; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(value == NULL) - { - value = i; - } - else - { - idlist = to_ustring((char*)i->properties->children->content); - break; - } - } - } - - UString sval = evalString(value); - - if(localroot->properties != NULL) - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - set &myset = listslow[idlist]; - return (myset.find(tolower(sval)) != myset.end()); - } - } - - set &myset = lists[idlist]; - return (myset.find(sval) != myset.end()); -} - -bool -Interchunk::processTest(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - return processLogical(i); - } - } - return false; -} - -bool -Interchunk::processAnd(xmlNode *localroot) -{ - bool val = true; - for(xmlNode *i = localroot->children; val && i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - val = val && processLogical(i); - } - } - - return val; -} - -bool -Interchunk::processOr(xmlNode *localroot) -{ - bool val = false; - for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - val = val || processLogical(i); - } - } - - return val; -} - -bool -Interchunk::processNot(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - return !processLogical(i); - } - } - return false; -} - -bool -Interchunk::processEqual(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return evalString(first) == evalString(second); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return tolower(evalString(first)) == tolower(evalString(second)); - } - else - { - return evalString(first) == evalString(second); - } - } -} - -bool -Interchunk::beginsWith(UString const &s1, UString const &s2) const -{ - int const limit = s2.size(), constraint = s1.size(); - - if(constraint < limit) - { - return false; - } - for(int i = 0; i != limit; i++) - { - if(s1[i] != s2[i]) - { - return false; - } - } - - return true; -} - -bool -Interchunk::endsWith(UString const &s1, UString const &s2) const -{ - int const limit = s2.size(), constraint = s1.size(); - - if(constraint < limit) - { - return false; - } - for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) - { - if(s1[j] != s2[i]) - { - return false; - } - } - - return true; -} - - -bool -Interchunk::processBeginsWith(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return beginsWith(evalString(first), evalString(second)); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return beginsWith(tolower(evalString(first)), tolower(evalString(second))); - } - else - { - return beginsWith(evalString(first), evalString(second)); - } - } -} - -bool -Interchunk::processEndsWith(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return endsWith(evalString(first), evalString(second)); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return endsWith(tolower(evalString(first)), tolower(evalString(second))); - } - else - { - return endsWith(evalString(first), evalString(second)); - } - } -} - -bool -Interchunk::processBeginsWithList(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - UString idlist = to_ustring((char*)second->properties->children->content); - UString needle = evalString(first); - set::iterator it, limit; - - if(localroot->properties == NULL || - xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) - { - it = lists[idlist].begin(); - limit = lists[idlist].end(); - } - else - { - needle = tolower(needle); - it = listslow[idlist].begin(); - limit = listslow[idlist].end(); - } - - for(; it != limit; it++) - { - if(beginsWith(needle, *it)) - { - return true; - } - } - return false; -} - -bool -Interchunk::processEndsWithList(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - UString idlist = to_ustring((char*)second->properties->children->content); - UString needle = evalString(first); - set::iterator it, limit; - - if(localroot->properties == NULL || - xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) - { - it = lists[idlist].begin(); - limit = lists[idlist].end(); - } - else - { - needle = tolower(needle); - it = listslow[idlist].begin(); - limit = listslow[idlist].end(); - } - - for(; it != limit; it++) - { - if(endsWith(needle, *it)) - { - return true; - } - } - return false; -} - -bool -Interchunk::processContainsSubstring(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return evalString(first).find(evalString(second)) != UString::npos; - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return tolower(evalString(first)).find(tolower(evalString(second))) != UString::npos; - } - else - { - return evalString(first).find(evalString(second)) != UString::npos; - } - } -} - -UString -Interchunk::copycase(UString const &source_word, UString const &target_word) -{ - UString result; - UString const s_word = source_word; - UString const t_word = target_word; - - bool firstupper = iswupper(s_word[0]); - bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); - bool sizeone = s_word.size() == 1; - - if(!uppercase || (sizeone && uppercase)) - { - result = StringUtils::tolower(t_word); - } - else - { - result = StringUtils::toupper(t_word); - } - - if(firstupper) - { - result[0] = towupper(result[0]); - } - - return result; -} - -UString -Interchunk::caseOf(UString const &s) -{ - if(s.size() > 1) - { - if(!iswupper(s[0])) - { - return "aa"_u; - } - else if(!iswupper(s[s.size()-1])) - { - return "Aa"_u; - } - else - { - return "AA"_u; - } - } - else if(s.size() == 1) - { - if(!iswupper(s[0])) - { - return "aa"_u; - } - else - { - return "Aa"_u; - } - } - else - { - return "aa"_u; - } -} - -UString -Interchunk::tolower(UString const &str) const -{ - return StringUtils::tolower(str); -} - -UString -Interchunk::tags(UString const &str) const -{ - UString result = "<"_u; - - for(unsigned int i = 0, limit = str.size(); i != limit; i++) - { - if(str[i] == '.') - { - result.append("><"_u); - } - else - { - result += str[i]; - } - } - - result += '>'; - - return result; -} - -void -Interchunk::processRule(xmlNode *localroot) -{ - // localroot is suposed to be an 'action' tag - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - processInstruction(i); - } - } - - while(!blank_queue.empty()) //flush remaining blanks that are not spaces - { - if(blank_queue.front().compare(" "_u) != 0) { - write(blank_queue.front(), output); - } - blank_queue.pop(); - } -} - TransferToken & Interchunk::readToken(InputFile& in) { @@ -1363,24 +434,6 @@ Interchunk::readToken(InputFile& in) } } -bool -Interchunk::getNullFlush(void) -{ - return null_flush; -} - -void -Interchunk::setNullFlush(bool null_flush) -{ - this->null_flush = null_flush; -} - -void -Interchunk::setTrace(bool trace) -{ - this->trace = trace; -} - void Interchunk::interchunk_wrapper_null_flush(InputFile& in, UFILE* out) { diff --git a/apertium/interchunk.h b/apertium/interchunk.h index dc940f8..bad25f9 100644 --- a/apertium/interchunk.h +++ b/apertium/interchunk.h @@ -17,102 +17,36 @@ #ifndef _INTERCHUNK_ #define _INTERCHUNK_ -#include -#include +#include + #include -#include -#include -#include #include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include using namespace std; -class Interchunk +class Interchunk : public TransferBase { private: - - Alphabet alphabet; - MatchExe *me; - MatchState ms; - map attr_items; - map variables; - map macros; - map> lists; - map> listslow; - vector macro_map; - vector rule_map; - vector rule_lines; - xmlDoc *doc; - xmlNode *root_element; InterchunkWord **word; - queue blank_queue; - int lword; int last_lword; - Buffer input_buffer; - vector tmpword; - vector tmpblank; - - UFILE* output; - int any_char; - int any_tag; - xmlNode *lastrule; - unsigned int nwords; - - map evalStringCache; bool inword; - bool null_flush; - bool internal_null_flush; - bool trace; - bool in_out; - - void destroy(); - void readData(FILE *input); - void readInterchunk(const char* input); - void collectMacros(xmlNode *localroot); - void collectRules(xmlNode *localroot); - UString caseOf(UString const &str); - UString copycase(UString const &source_word, UString const &target_word); void processLet(xmlNode *localroot); - void processAppend(xmlNode *localroot); void processOut(xmlNode *localroot); void processCallMacro(xmlNode *localroot); void processModifyCase(xmlNode *localroot); - bool processLogical(xmlNode *localroot); - bool processTest(xmlNode *localroot); - bool processAnd(xmlNode *localroot); - bool processOr(xmlNode *localroot); - bool processEqual(xmlNode *localroot); - bool processBeginsWith(xmlNode *localroot); - bool processBeginsWithList(xmlNode *localroot); - bool processEndsWith(xmlNode *localroot); - bool processEndsWithList(xmlNode *localroot); - bool processContainsSubstring(xmlNode *localroot); - bool processNot(xmlNode *localroot); - bool processIn(xmlNode *localroot); - void processRule(xmlNode *localroot); - UString evalString(xmlNode *localroot); - void processInstruction(xmlNode *localroot); - void processChoose(xmlNode *localroot); UString processChunk(xmlNode *localroot); + void processClip(xmlNode* localroot); + void processBlank(xmlNode* localroot); + void processCaseOf(xmlNode* localroot); + + void processLuCount(xmlNode* localroot); + UString processLu(xmlNode* localroot); + UString processMlu(xmlNode* localroot); + + UString evalCachedString(xmlNode* element); - bool beginsWith(UString const &str1, UString const &str2) const; - bool endsWith(UString const &str1, UString const &str2) const; - UString tolower(UString const &str) const; - UString tags(UString const &str) const; UString readWord(InputFile& in); UString readBlank(InputFile& in); UString readUntil(InputFile& in, int const symbol) const; @@ -124,13 +58,8 @@ private: public: Interchunk(); - ~Interchunk(); - void read(const char* transferfile, const char* datafile); void interchunk(InputFile& in, UFILE* out); - bool getNullFlush(void); - void setNullFlush(bool null_flush); - void setTrace(bool trace); }; #endif diff --git a/apertium/latex_accentsmap.cc b/apertium/latex_accentsmap.cc index acf20b3..d8c660f 100644 --- a/apertium/latex_accentsmap.cc +++ b/apertium/latex_accentsmap.cc @@ -206,7 +206,16 @@ void AccentsMap::init_locale(){ ]*/ +void fputus(const UString& s, FILE* out) +{ + string temp; + temp.reserve(s.size()*2); + utf8::utf16to8(s.begin(), s.end(), std::back_inserter(temp)); + fputs(temp.c_str(), out); +} - - - +void fputrange(const char* s, int start, int len, FILE* out) +{ + string temp = s; + fputs(temp.substr(start, len).c_str(), out); +} diff --git a/apertium/latex_accentsmap.h b/apertium/latex_accentsmap.h index 5a61daf..64a13cb 100644 --- a/apertium/latex_accentsmap.h +++ b/apertium/latex_accentsmap.h @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -25,15 +24,6 @@ using namespace std; -/*struct Ltstr // Already in lttoolbox/ltstr.h -{ - bool operator()(UString const &s1, UString const &s2) const - { - return wcscmp(s1.c_str(), s2.c_str()) < 0; - } -}; -*/ - class AccentsMap { typedef std::map acmap; private: @@ -53,3 +43,5 @@ class AccentsMap { UString get(UString input); }; +void fputus(const UString& s, FILE* out); +void fputrange(const char* s, int start, int len, FILE* out); diff --git a/apertium/postchunk.cc b/apertium/postchunk.cc index e066642..893ea2b 100644 --- a/apertium/postchunk.cc +++ b/apertium/postchunk.cc @@ -15,34 +15,18 @@ * along with this program; if not, see . */ #include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include -#include "apertium_config.h" -#include + +#include using namespace Apertium; using namespace std; -Postchunk::Postchunk() : -word(0), -lword(0), -output(0), -nwords(0) -{ - lastrule = NULL; - inword = false; - in_out = false; - in_let_var = false; - in_wblank = false; -} +Postchunk::Postchunk() + : word(0), in_wblank(false), inword(false) +{} bool Postchunk::checkIndex(xmlNode *element, int index, int limit) @@ -65,359 +49,200 @@ Postchunk::checkIndex(xmlNode *element, int index, int limit) } UString -Postchunk::evalString(xmlNode *element) +Postchunk::evalCachedString(xmlNode* element) { - map::iterator it; - it = evalStringCache.find(element); - if(it != evalStringCache.end()) - { - TransferInstr &ti = it->second; - switch(ti.getType()) - { - case ti_clip_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(gettingLemmaFromWord(ti.getContent()) && lword > 1) - { - if(in_lu) - { - out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); - } - else if(in_let_var) - { - var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); - } - } - - return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + TransferInstr& ti = evalStringCache[element]; + switch (ti.getType()) { + case ti_clip_tl: + if (checkIndex(element, ti.getPos(), lword)) { + if (gettingLemmaFromWord(ti.getContent()) && lword > 1) { + if (in_lu) { + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); + } else if (in_let_var) { + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], + word[ti.getPos()]->getWblank()); } - break; - - case ti_lu_count: - return StringUtils::itoa(tmpword.size()); - - case ti_var: - if(lword > 1) - { - out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); - } - - return variables[ti.getContent()]; - - case ti_lit_tag: - case ti_lit: - return ti.getContent(); - - case ti_b: - if(!blank_queue.empty()) - { - UString retblank = blank_queue.front(); - if(in_out) - { - blank_queue.pop(); - } - - return retblank; - } - else - { - return " "_u; - } - break; - - case ti_get_case_from: - if(checkIndex(element, ti.getPos(), lword)) - { - return copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]), - evalString((xmlNode *) ti.getPointer())); - } - break; + } + return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + } + break; - case ti_case_of_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - return caseOf(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()])); - } - break; + case ti_lu_count: + return StringUtils::itoa(tmpword.size()); - default: - return ""_u; + case ti_var: + if (lword > 1) { + out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); } - return ""_u; - } + return variables[ti.getContent()]; - if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) - { - int pos = 0; - UString part; + case ti_lit_tag: + case ti_lit: + return ti.getContent(); - for(xmlAttr *i = element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = to_ustring((const char*)i->children->content); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *)i->children->content); + case ti_b: + if (!blank_queue.empty()) { + UString retblank = blank_queue.front(); + if (in_out) { + blank_queue.pop(); } + return retblank; + } else { + return " "_u; } + break; - evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) - { - evalStringCache[element] = TransferInstr(ti_lit_tag, - tags(to_ustring((const char *) element->properties->children->content)), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) - { - evalStringCache[element] = TransferInstr(ti_lit, to_ustring((const char *) element->properties->children->content), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) - { - if(element->properties == NULL) - { - evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); - } - else - { - int pos = atoi((const char *) element->properties->children->content) - 1; - evalStringCache[element] = TransferInstr(ti_b, ""_u, pos); + case ti_get_case_from: + if (checkIndex(element, ti.getPos(), lword)) { + return StringUtils::copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]), + evalString((xmlNode*) ti.getPointer())); } - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) - { - int pos = atoi((const char *) element->properties->children->content); - xmlNode *param = NULL; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - param = i; - break; - } + break; + + case ti_case_of_tl: + if (checkIndex(element, ti.getPos(), lword)) { + return StringUtils::getcase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()])); } + break; - evalStringCache[element] = TransferInstr(ti_get_case_from, "lem"_u, pos, param); + default: + return ""_u; } - else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) - { - evalStringCache[element] = TransferInstr(ti_var, to_ustring((const char *) element->properties->children->content), 0); + return ""_u; +} + +void +Postchunk::processClip(xmlNode* element) +{ + int pos = 0; + UString part; + for(xmlAttr* i = element->properties; i != NULL; i = i->next) { + if (!xmlStrcmp(i->name, (const xmlChar*) "part")) { + part = to_ustring((const char*) i->children->content); + } else if (!xmlStrcmp(i->name, (const xmlChar*) "pos")) { + pos = atoi((const char *)i->children->content); + } } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lu-count")) - { - evalStringCache[element] = TransferInstr(ti_lu_count, ""_u, 0); + evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL); +} + +void +Postchunk::processBlank(xmlNode* element) +{ + if (element->properties == NULL) { + evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); + } else { + int pos = atoi((const char *) element->properties->children->content) - 1; + evalStringCache[element] = TransferInstr(ti_b, ""_u, pos); } - else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) - { - int pos = 0; - UString part; +} - for(xmlAttr *i = element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = to_ustring((const char*)i->children->content); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *) i->children->content); - } +void +Postchunk::processLuCount(xmlNode* element) +{ + evalStringCache[element] = TransferInstr(ti_lu_count, ""_u, 0); +} + +void +Postchunk::processCaseOf(xmlNode* element) +{ + int pos = 0; + UString part; + for (xmlAttr* i = element->properties; i != NULL; i = i->next) { + if (!xmlStrcmp(i->name, (const xmlChar*) "part")) { + part = to_ustring((const char*) i->children->content); + } else if(!xmlStrcmp(i->name, (const xmlChar*) "pos")) { + pos = atoi((const char *) i->children->content); } + } + evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); +} - evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); +UString +Postchunk::processLu(xmlNode* element) +{ + in_lu = true; + out_wblank.clear(); + + UString myword; + for (auto i : children(element)) { + myword.append(evalString(i)); } - else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) - { - UString value; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - value.append(evalString(i)); - } - } - return value; + in_lu = false; + + if (lword == 1) { + out_wblank = word[1]->getWblank(); } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lu")) - { - in_lu = true; - out_wblank.clear(); - - UString myword; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - myword.append(evalString(i)); - } - } - - in_lu = false; - - if(lword == 1) - { - out_wblank = word[1]->getWblank(); - } - if(myword.empty()) { - return ""_u; - } else { - return out_wblank+"^"_u+myword+"$"_u; - } + if (myword.empty()) { + return ""_u; + } else { + return out_wblank+"^"_u+myword+"$"_u; } - else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu")) - { - UString value; +} - bool first_time = true; - out_wblank.clear(); +UString +Postchunk::processMlu(xmlNode* element) +{ + UString value; + + bool first_time = true; + out_wblank.clear(); + in_lu = true; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - in_lu = true; - - UString myword; + for (auto i : children(element)) { + UString myword; - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - myword.append(evalString(j)); - } - } + for (auto j : children(i)) { + myword.append(evalString(j)); + } - in_lu = false; - - if(!first_time) - { - if(!myword.empty() && myword[0] != '#') //'+#' problem - { + if (!first_time) { + if(!myword.empty() && myword[0] != '#') { //'+#' problem value += '+'; } - } - else - { + } else { if (!myword.empty()) { - first_time = false; + first_time = false; } - } + } value.append(myword); - } - } - - if(lword == 1) - { - out_wblank = word[1]->getWblank(); - } + } - if (value.empty()) { - return ""_u; - } else { - return out_wblank+"^"_u+value+"$"_u; - } + in_lu = false; + + if (lword == 1) { + out_wblank = word[1]->getWblank(); } - else - { - cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; - exit(EXIT_FAILURE); + if (value.empty()) { + return ""_u; + } else { + return out_wblank+"^"_u+value+"$"_u; } +} - return evalString(element); +UString +Postchunk::processChunk(xmlNode* element) +{ + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + return ""_u; // make the type checker happy } void Postchunk::processOut(xmlNode *localroot) { in_out = true; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) - { - in_lu = true; - out_wblank.clear(); - - UString myword; - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - myword.append(evalString(j)); - } - } - - in_lu = false; - - if(lword == 1) - { - out_wblank = word[1]->getWblank(); - } - if (!myword.empty()) { - u_fprintf(output, "%S^%S$", out_wblank.c_str(), myword.c_str()); - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) - { - UString myword; - bool first_time = true; - out_wblank.clear(); - - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - in_lu = true; - - UString mylocalword; - for(xmlNode *k = j->children; k != NULL; k = k->next) - { - if(k->type == XML_ELEMENT_NODE) - { - mylocalword.append(evalString(k)); - } - } - - in_lu = false; - - if(!first_time) - { - if(!mylocalword.empty()) - { - myword += '+'; - } - } - else - { - if(!mylocalword.empty()) - { - first_time = false; - } - } - - myword.append(mylocalword); - } - } - - if(lword == 1) - { - out_wblank = word[1]->getWblank(); - } - - u_fprintf(output, "%S^%S$", out_wblank.c_str(), myword.c_str()); - } - else // 'b' - { - write(evalString(i), output); - } + for (auto i : children(localroot)) { + if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { + write(processLu(i), output); + } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { + write(processMlu(i), output); + } else { // 'b' + write(evalString(i), output); } } @@ -427,19 +252,10 @@ Postchunk::processOut(xmlNode *localroot) void Postchunk::processTags(xmlNode *localroot) { - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (xmlChar const *) "tag")) - { - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - write(evalString(j), output); - } - } + for (auto i : children(localroot)) { + if(!xmlStrcmp(i->name, (xmlChar const *) "tag")) { + for (auto j : children(i)) { + write(evalString(j), output); } } } @@ -450,19 +266,12 @@ Postchunk::processLet(xmlNode *localroot) { xmlNode *leftSide = NULL, *rightSide = NULL; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(leftSide == NULL) - { - leftSide = i; - } - else - { - rightSide = i; - break; - } + for (auto i : children(localroot)) { + if(leftSide == NULL) { + leftSide = i; + } else { + rightSide = i; + break; } } @@ -543,19 +352,12 @@ Postchunk::processModifyCase(xmlNode *localroot) { xmlNode *leftSide = NULL, *rightSide = NULL; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(leftSide == NULL) - { - leftSide = i; - } - else - { - rightSide = i; - break; - } + for (auto i : children(localroot)) { + if(leftSide == NULL) { + leftSide = i; + } else { + rightSide = i; + break; } } @@ -576,7 +378,7 @@ Postchunk::processModifyCase(xmlNode *localroot) } } - UString const result = copycase(evalString(rightSide), + UString const result = StringUtils::copycase(evalString(rightSide), word[pos]->chunkPart(attr_items[part])); bool match = word[pos]->setChunkPart(attr_items[part], result); @@ -588,7 +390,7 @@ Postchunk::processModifyCase(xmlNode *localroot) else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { UString const val = to_ustring((const char *) leftSide->properties->children->content); - variables[val] = copycase(evalString(rightSide), variables[val]); + variables[val] = StringUtils::copycase(evalString(rightSide), variables[val]); } } @@ -624,30 +426,22 @@ Postchunk::processCallMacro(xmlNode *localroot) bool indexesOK = true; int idx = 1; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - int pos = atoi((const char *) i->properties->children->content); - if(!checkIndex(localroot, pos, lword)) { - indexesOK = false; // avoid segfaulting on empty chunks, e.g. ^x{}$ - pos = 1; - } - myword[idx] = word[pos]; - idx++; + for (auto i : children(localroot)) { + int pos = atoi((const char *) i->properties->children->content); + if(!checkIndex(localroot, pos, lword)) { + indexesOK = false; // avoid segfaulting on empty chunks, e.g. ^x{}$ + pos = 1; } + myword[idx] = word[pos]; + idx++; } swap(myword, word); swap(npar, lword); if(indexesOK) { - for(xmlNode *i = macro->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - processInstruction(i); - } + for (auto i : children(macro)) { + processInstruction(i); } } else { @@ -660,119 +454,6 @@ Postchunk::processCallMacro(xmlNode *localroot) delete[] myword; } -UString -Postchunk::copycase(UString const &source_word, UString const &target_word) -{ - UString result; - - bool firstupper = iswupper(source_word[0]); - bool uppercase = firstupper && iswupper(source_word[source_word.size()-1]); - bool sizeone = source_word.size() == 1; - - if(!uppercase || (sizeone && uppercase)) - { - result = StringUtils::tolower(target_word); - } - else - { - result = StringUtils::toupper(target_word); - } - - if(firstupper) - { - // TODO: 32 - result[0] = u_toupper(result[0]); - } - - return result; -} - -UString -Postchunk::caseOf(UString const &s) -{ - if(s.size() > 1) - { - if(!iswupper(s[0])) - { - return "aa"_u; - } - else if(!iswupper(s[s.size()-1])) - { - return "Aa"_u; - } - else - { - return "AA"_u; - } - } - else if(s.size() == 1) - { - if(!iswupper(s[0])) - { - return "aa"_u; - } - else - { - return "Aa"_u; - } - } - else - { - return "aa"_u; - } -} - -UString -Postchunk::tolower(UString const &str) const -{ - return StringUtils::tolower(str); -} - -UString -Postchunk::tags(UString const &str) const -{ - UString result = "<"_u; - - for(unsigned int i = 0, limit = str.size(); i != limit; i++) - { - if(str[i] == '.') - { - result.append("><"_u); - } - else - { - result += str[i]; - } - } - - result += '>'; - - return result; -} - -int -Postchunk::processRule(xmlNode *localroot) -{ - // localroot is suposed to be an 'action' tag - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - processInstruction(i); - } - } - - while(!blank_queue.empty()) //flush remaining blanks that are not spaces - { - if(blank_queue.front().compare(" "_u) != 0) - { - write(blank_queue.front(), output); - } - blank_queue.pop(); - } - return -1; -} - TransferToken & Postchunk::readToken(InputFile& in) { @@ -847,24 +528,6 @@ Postchunk::readToken(InputFile& in) } } -bool -Postchunk::getNullFlush(void) -{ - return null_flush; -} - -void -Postchunk::setNullFlush(bool null_flush) -{ - this->null_flush = null_flush; -} - -void -Postchunk::setTrace(bool trace) -{ - this->trace = trace; -} - void Postchunk::postchunk_wrapper_null_flush(InputFile& in, UFILE* out) { @@ -1159,7 +822,7 @@ void Postchunk::unchunk(UString const &chunk, UFILE* output) { vector vectags = getVecTags(chunk); - UString case_info = caseOf(pseudolemma(chunk)); + UString case_info = StringUtils::getcase(pseudolemma(chunk)); bool uppercase_all = false; bool uppercase_first = false; @@ -1264,7 +927,7 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, vector &blanks) { vector vectags = getVecTags(chunk); - UString case_info = caseOf(pseudolemma(chunk)); + UString case_info = StringUtils::getcase(pseudolemma(chunk)); bool uppercase_all = false; bool uppercase_first = false; bool lastblank = true; diff --git a/apertium/postchunk.h b/apertium/postchunk.h index 7a8ba51..43ad7b1 100644 --- a/apertium/postchunk.h +++ b/apertium/postchunk.h @@ -19,57 +19,39 @@ #include -#include #include -#include #include -#include -#include -#include - using namespace std; -class Postchunk : TransferBase +class Postchunk : public TransferBase { private: InterchunkWord **word; - queue blank_queue; - int lword; - Buffer input_buffer; - vector tmpword; - vector tmpblank; - bool in_out; bool in_lu; bool in_wblank; UString out_wblank; map var_out_wblank; - UFILE *output; - - xmlNode *lastrule; - unsigned int nwords; - bool inword; - static UString caseOf(UString const &str); - UString copycase(UString const &source_word, UString const &target_word); - + UString evalCachedString(xmlNode* element); + void processClip(xmlNode* element); + void processBlank(xmlNode* element); + void processLuCount(xmlNode* element); + void processCaseOf(xmlNode* element); + UString processLu(xmlNode* element); + UString processMlu(xmlNode* element); + + UString processChunk(xmlNode* element); + void processLet(xmlNode *localroot); void processOut(xmlNode *localroot); void processCallMacro(xmlNode *localroot); void processModifyCase(xmlNode *localroot); - UString evalString(xmlNode *localroot); - int processRule(xmlNode* localroot); - void processInstruction(xmlNode *localroot); - void processChoose(xmlNode *localroot); void processTags(xmlNode *localroot); - bool beginsWith(UString const &str1, UString const &str2) const; - bool endsWith(UString const &str1, UString const &str2) const; - UString tolower(UString const &str) const; - UString tags(UString const &str) const; UString readWord(InputFile& in); UString readBlank(InputFile& in); UString readUntil(InputFile& in, int const symbol) const; @@ -87,18 +69,11 @@ private: static UString wordzero(UString const &chunk); bool checkIndex(xmlNode *element, int index, int limit); void postchunk_wrapper_null_flush(InputFile& in, UFILE* out); - bool gettingLemmaFromWord(UString attr); - UString combineWblanks(UString wblank_current, UString wblank_to_add); public: Postchunk(); - ~Postchunk(); - void read(string const &transferfile, string const &datafile); void postchunk(InputFile& in, UFILE* out); - bool getNullFlush(void); - void setNullFlush(bool null_flush); - void setTrace(bool trace); }; #endif diff --git a/apertium/transfer.cc b/apertium/transfer.cc index b2fcdbb..40ef727 100644 --- a/apertium/transfer.cc +++ b/apertium/transfer.cc @@ -15,38 +15,20 @@ * along with this program; if not, see . */ #include -#include -#include + #include -#include -#include #include -#include #include -#include -#include using namespace Apertium; using namespace std; -Transfer::Transfer() : -word(0), -lword(0), -last_lword(0), -output(0), -nwords(0) -{ - lastrule = NULL; - defaultAttrs = lu; - useBilingual = true; - preBilingual = false; - isExtended = false; - trace_att = false; - in_lu = false; - in_out = false; - in_wblank = false; -} +Transfer::Transfer() + : word(nullptr), last_lword(0), in_lu(false), in_wblank(false), + isExtended(false), defaultAttrs(lu), preBilingual(false), + useBilingual(true), trace_att(false) +{} void Transfer::readBil(string const &fstfile) @@ -113,544 +95,316 @@ Transfer::checkIndex(xmlNode *element, int index, int limit) } UString -Transfer::evalString(xmlNode *element) +Transfer::evalCachedString(xmlNode *element) { - map::iterator it; - it = evalStringCache.find(element); - if(it != evalStringCache.end()) - { - TransferInstr &ti = it->second; - switch(ti.getType()) - { - case ti_clip_sl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(gettingLemmaFromWord(ti.getContent()) && last_lword > 1) - { - if(in_lu) - { - out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); - } - else if(in_let_var) - { - var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); - } - } - - return word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()); - } - break; - - case ti_clip_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(gettingLemmaFromWord(ti.getContent()) && last_lword > 1) - { - if(in_lu) - { - out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); - } - else if(in_let_var) - { - var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); - } - } - - return word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()); - } - break; - - case ti_clip_ref: - if(checkIndex(element, ti.getPos(), lword)) - { - return word[ti.getPos()]->reference(attr_items[ti.getContent()], ti.getCondition()); - } - break; - - case ti_linkto_sl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(!word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()).empty()) - { - UString ret; - ret += '<'; - ret += UString((UChar*) ti.getPointer()); - ret += '>'; - return ret; - } - else - { - return ""_u; - } - } - break; - - case ti_linkto_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(!word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()).empty()) - { - UString ret; - ret += '<'; - ret += UString((UChar*) ti.getPointer()); - ret += '>'; - return ret; - } - else - { - return ""_u; - } + TransferInstr& ti = evalStringCache[element]; + switch (ti.getType()) { + case ti_clip_sl: + if (checkIndex(element, ti.getPos(), lword)) { + if (gettingLemmaFromWord(ti.getContent()) && last_lword > 1) { + if(in_lu) { + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); + } else if (in_let_var) { + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); } - break; - - case ti_linkto_ref: - if(checkIndex(element, ti.getPos(), lword)) - { - if(!word[ti.getPos()]->reference(attr_items[ti.getContent()], ti.getCondition()).empty()) - { - UString ret; - ret += '<'; - ret += UString((UChar*) ti.getPointer()); - ret += '>'; - return ret; - } - else - { - return ""_u; - } - } - break; - - case ti_var: - if(last_lword > 1) - { - out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); - } - return variables[ti.getContent()]; - - case ti_lit_tag: - case ti_lit: - return ti.getContent(); - - case ti_b: - if(!blank_queue.empty()) - { - UString retblank = blank_queue.front(); - if(in_out) - { - blank_queue.pop(); - } + } - return retblank; - } - else - { - return " "_u; - } - break; + return word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()); + } + break; - case ti_get_case_from: - if(checkIndex(element, ti.getPos(), lword)) - { - return StringUtils::copycase(word[ti.getPos()]->source(attr_items[ti.getContent()]), - evalString((xmlNode *) ti.getPointer())); + case ti_clip_tl: + if(checkIndex(element, ti.getPos(), lword)) { + if(gettingLemmaFromWord(ti.getContent()) && last_lword > 1) { + if(in_lu) { + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); + } else if(in_let_var) { + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); } - break; + } + + return word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()); + } + break; - case ti_case_of_sl: - if(checkIndex(element, ti.getPos(), lword)) - { - return StringUtils::getcase(word[ti.getPos()]->source(attr_items[ti.getContent()])); - } - break; + case ti_clip_ref: + if(checkIndex(element, ti.getPos(), lword)) { + return word[ti.getPos()]->reference(attr_items[ti.getContent()], ti.getCondition()); + } + break; - case ti_case_of_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - return StringUtils::getcase(word[ti.getPos()]->target(attr_items[ti.getContent()])); - } - break; + case ti_linkto_sl: + if(checkIndex(element, ti.getPos(), lword)) { + if(!word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()).empty()) { + UString ret; + ret += '<'; + ret += UString((UChar*) ti.getPointer()); + ret += '>'; + return ret; + } else { + return ""_u; + } + } + break; - case ti_case_of_ref: - if(checkIndex(element, ti.getPos(), lword)) - { - return StringUtils::getcase(word[ti.getPos()]->reference(attr_items[ti.getContent()])); - } - break; + case ti_linkto_tl: + if(checkIndex(element, ti.getPos(), lword)) { + if(!word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()).empty()) { + UString ret; + ret += '<'; + ret += UString((UChar*) ti.getPointer()); + ret += '>'; + return ret; + } else { + return ""_u; + } + } + break; - default: + case ti_linkto_ref: + if(checkIndex(element, ti.getPos(), lword)) { + if(!word[ti.getPos()]->reference(attr_items[ti.getContent()], ti.getCondition()).empty()) { + UString ret; + ret += '<'; + ret += UString((UChar*) ti.getPointer()); + ret += '>'; + return ret; + } else { return ""_u; + } } - return ""_u; - } + break; - if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) - { - int pos = 0; - xmlChar *side = NULL, *as = NULL; - UString part; - bool queue = true; + case ti_var: + if(last_lword > 1) { + out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); + } + return variables[ti.getContent()]; - for(xmlAttr *i = element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "side")) - { - side = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = to_ustring((const char*) i->children->content); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *)i->children->content) - 1; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) - { - if(!xmlStrcmp(i->children->content, (const xmlChar *) "no")) - { - queue = false; - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) - { - as = i->children->content; + case ti_lit_tag: + case ti_lit: + return ti.getContent(); + + case ti_b: + if(!blank_queue.empty()) { + UString retblank = blank_queue.front(); + if(in_out) { + blank_queue.pop(); } + + return retblank; + } else { + return " "_u; } + break; - if(as != NULL) - { - if(!xmlStrcmp(side, (const xmlChar *) "sl")) - { - evalStringCache[element] = TransferInstr(ti_linkto_sl, part, pos, (void *) as, queue); - } - else if(!xmlStrcmp(side, (const xmlChar *) "ref")) - { - evalStringCache[element] = TransferInstr(ti_linkto_ref, part, pos, (void *) as, queue); - } - else - { - evalStringCache[element] = TransferInstr(ti_linkto_tl, part, pos, (void *) as, queue); - } + case ti_get_case_from: + if(checkIndex(element, ti.getPos(), lword)) { + return StringUtils::copycase(word[ti.getPos()]->source(attr_items[ti.getContent()]), + evalString((xmlNode *) ti.getPointer())); } - else if(!xmlStrcmp(side, (const xmlChar *) "sl")) - { - evalStringCache[element] = TransferInstr(ti_clip_sl, part, pos, NULL, queue); + break; + + case ti_case_of_sl: + if(checkIndex(element, ti.getPos(), lword)) { + return StringUtils::getcase(word[ti.getPos()]->source(attr_items[ti.getContent()])); } - else if(!xmlStrcmp(side, (const xmlChar *) "ref")) - { - evalStringCache[element] = TransferInstr(ti_clip_ref, part, pos, NULL, queue); + break; + + case ti_case_of_tl: + if(checkIndex(element, ti.getPos(), lword)) { + return StringUtils::getcase(word[ti.getPos()]->target(attr_items[ti.getContent()])); } - else - { - evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL, queue); + break; + + case ti_case_of_ref: + if(checkIndex(element, ti.getPos(), lword)) { + return StringUtils::getcase(word[ti.getPos()]->reference(attr_items[ti.getContent()])); } + break; + + default: + return ""_u; } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) - { - evalStringCache[element] = TransferInstr(ti_lit_tag, - tags(to_ustring((const char *) element->properties->children->content)), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) - { - evalStringCache[element] = TransferInstr(ti_lit, to_ustring((const char *) element->properties->children->content), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) - { - evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) - { - int pos = atoi((const char *) element->properties->children->content) - 1; - xmlNode *param = NULL; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - param = i; - break; + return ""_u; +} + +void +Transfer::processClip(xmlNode* element) +{ + int pos = 0; + xmlChar *side = NULL, *as = NULL; + UString part; + bool queue = true; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) { + side = i->children->content; + } else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { + part = to_ustring((const char*) i->children->content); + } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { + pos = atoi((const char *)i->children->content) - 1; + } else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) { + if(!xmlStrcmp(i->children->content, (const xmlChar *) "no")) { + queue = false; } + } else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) { + as = i->children->content; } - - evalStringCache[element] = TransferInstr(ti_get_case_from, "lem"_u, pos, param); } - else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) - { - evalStringCache[element] = TransferInstr(ti_var, getattr(element, "v"), 0); + + if(as != NULL) { + if(!xmlStrcmp(side, (const xmlChar *) "sl")) { + evalStringCache[element] = TransferInstr(ti_linkto_sl, part, pos, (void *) as, queue); + } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { + evalStringCache[element] = TransferInstr(ti_linkto_ref, part, pos, (void *) as, queue); + } else { + evalStringCache[element] = TransferInstr(ti_linkto_tl, part, pos, (void *) as, queue); + } + } else if(!xmlStrcmp(side, (const xmlChar *) "sl")) { + evalStringCache[element] = TransferInstr(ti_clip_sl, part, pos, NULL, queue); + } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { + evalStringCache[element] = TransferInstr(ti_clip_ref, part, pos, NULL, queue); + } else { + evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL, queue); } - else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) - { - int pos = 0; - xmlChar *side = NULL; - UString part; +} - for(xmlAttr *i = element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "side")) - { - side = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = to_ustring((const char*) i->children->content); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *) i->children->content) - 1; - } - } +void +Transfer::processBlank(xmlNode* element) +{ + evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); +} - if(!xmlStrcmp(side, (const xmlChar *) "sl")) - { - evalStringCache[element] = TransferInstr(ti_case_of_sl, part, pos); - } - else if(!xmlStrcmp(side, (const xmlChar *) "ref")) - { - evalStringCache[element] = TransferInstr(ti_case_of_ref, part, pos); - } - else - { - evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); +void +Transfer::processCaseOf(xmlNode* element) +{ + int pos = 0; + xmlChar *side = NULL; + UString part; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) { + side = i->children->content; + } else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { + part = to_ustring((const char*) i->children->content); + } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { + pos = atoi((const char *) i->children->content) - 1; } } - else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) - { - UString value; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - value.append(evalString(i)); - } - } - return value; + + if(!xmlStrcmp(side, (const xmlChar *) "sl")) { + evalStringCache[element] = TransferInstr(ti_case_of_sl, part, pos); + } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { + evalStringCache[element] = TransferInstr(ti_case_of_ref, part, pos); + } else { + evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lu")) - { - in_lu = true; - out_wblank.clear(); +} + +UString +Transfer::processLu(xmlNode* element) +{ + in_lu = true; + out_wblank.clear(); - UString myword; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - myword.append(evalString(i)); - } - } + UString myword; + for (auto i : children(element)) { + myword.append(evalString(i)); + } - in_lu = false; + in_lu = false; - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } + if(last_lword == 1) { + out_wblank = word[0]->getWblank(); + } - if(!myword.empty()) - { - if(myword[0] != '[' || myword[1] != '[') - { - UString ret = out_wblank; - ret += '^'; - ret += myword; - ret += '$'; - return ret; - } - else - { - myword += '$'; - return myword; - } - } - else - { - return ""_u; + if(!myword.empty()) { + if(myword[0] != '[' || myword[1] != '[') { + UString ret = out_wblank; + ret += '^'; + ret += myword; + ret += '$'; + return ret; + } else { + myword += '$'; + return myword; } + } else { + return ""_u; } - else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu")) - { - UString value; - - bool first_time = true; - out_wblank.clear(); +} - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - in_lu = true; - - UString myword; +UString +Transfer::processMlu(xmlNode* element) +{ + UString value; - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - myword.append(evalString(j)); - } - } - - in_lu = false; + bool first_time = true; + out_wblank.clear(); + + in_lu = true; + for (auto i : children(element)) { + UString myword; + for (auto j : children(i)) { + myword.append(evalString(j)); + } - if(!first_time) - { - if(!myword.empty() && myword[0] != '#') //'+#' problem - { + if (!first_time) { + if(!myword.empty() && myword[0] != '#') { //'+#' problem value += '+'; } - } - else - { + } else { if (!myword.empty()) { first_time = false; } } value.append(myword); - } - } - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(!value.empty()) - { - UString ret = out_wblank; - ret += '^'; - ret += value; - ret += '$'; - return ret; - } - else - { - return ""_u; - } } - else if(!xmlStrcmp(element->name, (const xmlChar *) "chunk")) - { - return processChunk(element); + + if(last_lword == 1) { + out_wblank = word[0]->getWblank(); } - else - { - cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; - exit(EXIT_FAILURE); + + if(!value.empty()) { + UString ret = out_wblank; + ret += '^'; + ret += value; + ret += '$'; + return ret; + } else { + return ""_u; } +} - return evalString(element); +void +Transfer::processLuCount(xmlNode* element) +{ + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); } void Transfer::processOut(xmlNode *localroot) { in_out = true; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(defaultAttrs == lu) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) - { - in_lu = true; - out_wblank.clear(); - - UString myword; - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - myword.append(evalString(j)); - } - } - - in_lu = false; - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(!myword.empty()) - { - if(myword[0] != '[' || myword[1] != '[') - { - u_fprintf(output, "%S^", out_wblank.c_str()); - } - u_fprintf(output, "%S$", myword.c_str()); - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) - { - UString myword; - bool first_time = true; - out_wblank.clear(); - - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - in_lu = true; - - UString mylocalword; - for(xmlNode *k = j->children; k != NULL; k = k->next) - { - if(k->type == XML_ELEMENT_NODE) - { - mylocalword.append(evalString(k)); - } - } - - in_lu = false; - if(!first_time) - { - if(!mylocalword.empty() && mylocalword[0] != '#') //'+#' problem - { - myword += '+'; - } - } - else - { - if(!mylocalword.empty()) - { - first_time = false; - } - } - - myword.append(mylocalword); - } - } - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(!myword.empty()) { - u_fprintf(output, "%S^%S$", out_wblank.c_str(), myword.c_str()); - } - } - else { // 'b' - write(evalString(i), output); - } + for (auto i : children(localroot)) { + if(defaultAttrs == lu) { + if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { + write(processLu(i), output); + } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { + write(processMlu(i), output); } - else - { - if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) - { - write(processChunk(i), output); - } - else // 'b' - { - write(evalString(i), output); - } + } else { + if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) { + write(processChunk(i), output); + } else { // 'b' + write(evalString(i), output); } } } - in_out = false; } @@ -711,100 +465,16 @@ Transfer::processChunk(xmlNode *localroot) } } - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "tags")) - { - result.append(processTags(i)); - result += '{'; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) - { - in_lu = true; - out_wblank.clear(); - - UString myword; - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - myword.append(evalString(j)); - } - } - - in_lu = false; - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(!myword.empty()) - { - result.append(out_wblank); - result += '^'; - result.append(myword); - result += '$'; - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) - { - bool first_time = true; - UString myword; - - out_wblank.clear(); - - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - UString mylocalword; - if(j->type == XML_ELEMENT_NODE) - { - in_lu = true; - - for(xmlNode *k = j->children; k != NULL; k = k->next) - { - if(k->type == XML_ELEMENT_NODE) - { - mylocalword.append(evalString(k)); - } - } - - in_lu = false; - - if(!first_time) - { - if(!mylocalword.empty() && mylocalword[0] != '#') // '+#' problem - { - myword += '+'; - } - } - else - { - first_time = false; - } - } - myword.append(mylocalword); - } - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(!myword.empty()) - { - result.append(out_wblank); - result += '^'; - result.append(myword); - result += '$'; - } - } - else // 'b' - { - result.append(evalString(i)); - } + for (auto i : children(localroot)) { + if(!xmlStrcmp(i->name, (const xmlChar *) "tags")) { + result.append(processTags(i)); + result += '{'; + } else if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { + result.append(processLu(i)); + } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { + result.append(processMlu(i)); + } else { // 'b' + result.append(evalString(i)); } } result += '}'; @@ -816,19 +486,10 @@ UString Transfer::processTags(xmlNode *localroot) { UString result; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (xmlChar const *) "tag")) - { - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - result.append(evalString(j)); - } - } + for (auto i : children(localroot)) { + if (!xmlStrcmp(i->name, (const xmlChar*) "tag")) { + for (auto j : children(i)) { + result.append(evalString(j)); } } } @@ -840,19 +501,12 @@ Transfer::processLet(xmlNode *localroot) { xmlNode *leftSide = NULL, *rightSide = NULL; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(leftSide == NULL) - { - leftSide = i; - } - else - { - rightSide = i; - break; - } + for (auto i : children(localroot)) { + if(leftSide == NULL) { + leftSide = i; + } else { + rightSide = i; + break; } } @@ -1000,19 +654,12 @@ Transfer::processModifyCase(xmlNode *localroot) { xmlNode *leftSide = NULL, *rightSide = NULL; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(leftSide == NULL) - { - leftSide = i; - } - else - { - rightSide = i; - break; - } + for (auto i : children(localroot)) { + if(leftSide == NULL) { + leftSide = i; + } else { + rightSide = i; + break; } } @@ -1108,20 +755,15 @@ Transfer::processCallMacro(xmlNode *localroot) // ToDo: Is it at all valid if npar <= 0 ? TransferWord **myword = NULL; + int idx = 0; if(npar > 0) { myword = new TransferWord *[npar]; std::fill(myword, myword+npar, (TransferWord *)(0)); - } - - int idx = 0; - for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { + for (auto i : children(localroot)) { if (idx >= npar) { - cerr << "Error: processCallMacro() number of arguments >= npar at line " << i->line << endl; - return; + cerr << "Error: processCallMacro() number of arguments >= npar at line " << i->line << endl; + return; } int pos = atoi((const char *) i->properties->children->content)-1; myword[idx] = word[pos]; @@ -1133,12 +775,8 @@ Transfer::processCallMacro(xmlNode *localroot) swap(myword, word); swap(npar, lword); - for(xmlNode *i = macro->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - processInstruction(i); - } + for (auto i : children(macro)) { + processInstruction(i); } swap(myword, word); @@ -1147,37 +785,6 @@ Transfer::processCallMacro(xmlNode *localroot) delete[] myword; } -int -Transfer::processRule(xmlNode *localroot) -{ - int instruction_return, words_to_consume = -1; - // localroot is suposed to be an 'action' tag - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - instruction_return = processInstruction(i); - // When an instruction which modifies the number of words to be consumed - // from the input is found, execution of the rule is stopped - if(instruction_return != -1) - { - words_to_consume = instruction_return; - break; - } - } - } - - while(!blank_queue.empty()) //flush remaining blanks that are not spaces - { - if(blank_queue.front().compare(" "_u) != 0) { - write(blank_queue.front(), output); - } - blank_queue.pop(); - } - - return words_to_consume; -} - TransferToken & Transfer::readToken(InputFile& in) { diff --git a/apertium/transfer.h b/apertium/transfer.h index 5955daa..9cb77be 100644 --- a/apertium/transfer.h +++ b/apertium/transfer.h @@ -19,32 +19,19 @@ #include -#include #include -#include #include - -#include -#include -#include +#include using namespace std; -class Transfer : TransferBase +class Transfer : public TransferBase { private: TransferWord **word; - queue blank_queue; - int lword; int last_lword; - Buffer input_buffer; - vector tmpword; - vector tmpblank; - - bool in_out; bool in_lu; - bool in_wblank; UString out_wblank; map var_out_wblank; @@ -52,10 +39,6 @@ private: FSTProcessor fstp; FSTProcessor extended; bool isExtended; - UFILE *output; - - xmlNode *lastrule; - unsigned int nwords; enum OutputType{lu,chunk}; @@ -68,28 +51,19 @@ private: void readBil(string const &filename); void processLet(xmlNode *localroot); - void processAppend(xmlNode *localroot); void processOut(xmlNode *localroot); void processCallMacro(xmlNode *localroot); void processModifyCase(xmlNode *localroot); - bool processLogical(xmlNode *localroot); - bool processTest(xmlNode *localroot); - bool processAnd(xmlNode *localroot); - bool processOr(xmlNode *localroot); - bool processEqual(xmlNode *localroot); - bool processBeginsWith(xmlNode *localroot); - bool processBeginsWithList(xmlNode *localroot); - bool processEndsWith(xmlNode *localroot); - bool processEndsWithList(xmlNode *local); - bool processContainsSubstring(xmlNode *localroot); - bool processNot(xmlNode *localroot); - bool processIn(xmlNode *localroot); - int processRule(xmlNode *localroot); - UString evalString(xmlNode *localroot); - int processInstruction(xmlNode *localroot); - int processChoose(xmlNode *localroot); + UString evalCachedString(xmlNode *localroot); UString processChunk(xmlNode *localroot); UString processTags(xmlNode *localroot); + void processClip(xmlNode* element); + void processBlank(xmlNode* element); + void processCaseOf(xmlNode* element); + UString processLu(xmlNode* element); + UString processMlu(xmlNode* element); + + void processLuCount(xmlNode* element); UString readWord(InputFile& in); UString readBlank(InputFile& in); @@ -102,7 +76,6 @@ private: void tmp_clear(); public: Transfer(); - ~Transfer(); void read(string const &transferfile, string const &datafile, string const &fstfile = ""); diff --git a/apertium/transfer_base.cc b/apertium/transfer_base.cc index 6b96e04..f48511f 100644 --- a/apertium/transfer_base.cc +++ b/apertium/transfer_base.cc @@ -9,7 +9,8 @@ using namespace std; TransferBase::TransferBase() : me(nullptr), doc(nullptr), root_element(nullptr), - any_char(0), any_tag(0), in_let_var(false), + lword(0), nwords(0), output(nullptr), + any_char(0), any_tag(0), in_let_var(false), in_out(false), null_flush(false), internal_null_flush(false), trace(false) {} @@ -193,7 +194,7 @@ TransferBase::evalString(xmlNode* element) processBlank(element); } else if (!xmlStrcmp(element->name, (const xmlChar*) "get-case-from")) { int pos = atoi((const char*) element->properties->children->content); - xmlNode* param; + xmlNode* param = NULL; for (auto it : children(element)) { param = it; break; @@ -202,9 +203,9 @@ TransferBase::evalString(xmlNode* element) } else if (!xmlStrcmp(element->name, (const xmlChar*) "var")) { evalStringCache[element] = TransferInstr(ti_var, getattr(element, "n"), 0); } else if (!xmlStrcmp(element->name, (const xmlChar*) "lu-count")) { - evalLuCount(element); + processLuCount(element); } else if (!xmlStrcmp(element->name, (const xmlChar*) "case-of")) { - evalCaseOf(element); + processCaseOf(element); } else if (!xmlStrcmp(element->name, (const xmlChar*) "concat")) { UString value; for (auto it : children(element)) { @@ -212,11 +213,11 @@ TransferBase::evalString(xmlNode* element) } return value; } else if (!xmlStrcmp(element->name, (const xmlChar*) "lu")) { - evalLu(element); + return processLu(element); } else if (!xmlStrcmp(element->name, (const xmlChar*) "mlu")) { - evalMlu(element); + return processMlu(element); } else if (!xmlStrcmp(element->name, (const xmlChar*) "chunk")) { - evalChunk(element); + return processChunk(element); } else { cerr << "Error: unexpected expression: '" << element->name << "'" << endl; exit(EXIT_FAILURE); @@ -224,6 +225,29 @@ TransferBase::evalString(xmlNode* element) return evalCachedString(element); } +int +TransferBase::processRule(xmlNode* localroot) +{ + int words_to_consume = -1; + // iterating over the tag + for (auto i : children(localroot)) { + words_to_consume = processInstruction(i); + // When an instruction which modifies the number of words to be consumed + // from the input is found, execution of the rule is stopped + if (words_to_consume != -1) { + break; + } + } + // flush remaining non-space blanks + while (!blank_queue.empty()) { + if (blank_queue.front() != " "_u) { + write(blank_queue.front(), output); + } + blank_queue.pop(); + } + return words_to_consume; +} + int TransferBase::processInstruction(xmlNode* localroot) { diff --git a/apertium/transfer_base.h b/apertium/transfer_base.h index d5adfde..88ae2d4 100644 --- a/apertium/transfer_base.h +++ b/apertium/transfer_base.h @@ -3,17 +3,21 @@ #include #include +#include #include #include #include #include +#include #include #include #include #include +#include +#include using namespace std; @@ -34,10 +38,21 @@ protected: xmlDoc* doc; xmlNode* root_element; + queue blank_queue; + Buffer input_buffer; + int lword; + vector tmpword; + vector tmpblank; + xmlNode* lastrule; + unsigned int nwords; + + UFILE* output; + int32_t any_char; int32_t any_tag; bool in_let_var; + bool in_out; UString var_val; map evalStringCache; @@ -56,12 +71,13 @@ protected: virtual void processClip(xmlNode* element) = 0; virtual void processBlank(xmlNode* element) = 0; - virtual void evalLuCount(xmlNode* element) = 0; - virtual void evalCaseOf(xmlNode* element) = 0; - virtual void evalLu(xmlNode* element) = 0; - virtual void evalMlu(xmlNode* element) = 0; - virtual void evalChunk(xmlNode* element) = 0; + virtual void processLuCount(xmlNode* element) = 0; + virtual void processCaseOf(xmlNode* element) = 0; + virtual UString processLu(xmlNode* element) = 0; + virtual UString processMlu(xmlNode* element) = 0; + virtual UString processChunk(xmlNode* element) = 0; + int processRule(xmlNode* localroot); int processInstruction(xmlNode* localroot); int processRejectCurrentRule(xmlNode* localroot); int processChoose(xmlNode* localroot); @@ -91,8 +107,6 @@ protected: bool processEqual(xmlNode *localroot); bool processIn(xmlNode *localroot); - virtual int processRule(xmlNode *localroot) = 0; - UString tags(const UString& s) const; public: diff --git a/apertium/trx_reader.cc b/apertium/trx_reader.cc index 6d811de..4e1a8c8 100644 --- a/apertium/trx_reader.cc +++ b/apertium/trx_reader.cc @@ -23,11 +23,9 @@ #include using namespace Apertium; -UString const -TRXReader::ANY_TAG = ""; -UString const -TRXReader::ANY_CHAR = ""; +UString const TRXReader::ANY_TAG = ""_u; +UString const TRXReader::ANY_CHAR = ""_u; TRXReader::TRXReader() { @@ -40,7 +38,7 @@ TRXReader::insertLemma(int const base, UString const &lemma) { int retval = base; static int const any_char = td.getAlphabet()(ANY_CHAR); - if(lemma == "") + if(lemma.empty()) { retval = td.getTransducer().insertSingleTransduction(any_char, retval); td.getTransducer().linkStates(retval, retval, any_char); @@ -91,7 +89,7 @@ TRXReader::insertTags(int const base, UString const &tags) } else { - UString symbol = "<"; + UString symbol = "<"_u; for(unsigned int j = i; j != limit; j++) { if(tags[j] == '.') @@ -102,7 +100,7 @@ TRXReader::insertTags(int const base, UString const &tags) } } - if(symbol == "<") + if(symbol == "<"_u) { symbol.append(tags.substr(i)); i = limit; @@ -126,56 +124,56 @@ TRXReader::parse() { procDefCats(); step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } - if(name == "section-def-attrs") + if(name == "section-def-attrs"_u) { procDefAttrs(); step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == "section-def-vars") + if(name == "section-def-vars"_u) { procDefVars(); step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == "section-def-lists") + if(name == "section-def-lists"_u) { procDefLists(); step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == "section-def-macros") + if(name == "section-def-macros"_u) { procDefMacros(); step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == "section-rules") + if(name == "section-rules"_u) { procRules(); step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } @@ -189,17 +187,17 @@ TRXReader::procRules() set alive_states; while(type != XML_READER_TYPE_END_ELEMENT || - name != "section-rules") + name != "section-rules"_u) { step(); - if(name == "rule") + if(name == "rule"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { count++; } } - else if(name == "pattern") + else if(name == "pattern"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { @@ -229,18 +227,18 @@ TRXReader::procRules() } } } - else if(name == "pattern-item") + else if(name == "pattern-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { pair::iterator, multimap::iterator> range; - range = cat_items.equal_range(attrib("n")); + range = cat_items.equal_range(attrib("n"_u)); if(range.first == range.second) { - parseError("Undefined cat-item '" + attrib("n")); + parseError("Undefined cat-item '"_u + attrib("n"_u)); } // new code @@ -277,18 +275,18 @@ TRXReader::procRules() alive_states = alive_states_new; } } - else if(name == "let") + else if(name == "let"_u) { int count = 0; int lineno = xmlTextReaderGetParserLineNumber(reader); - while(name != "let" || type != XML_READER_TYPE_END_ELEMENT) + while(name != "let"_u || type != XML_READER_TYPE_END_ELEMENT) { step(); if(type == XML_ELEMENT_NODE) { count++; - if(name == "clip" && attrib("side") == "sl") + if(name == "clip"_u && attrib("side"_u) == "sl"_u) { cerr << "Warning (" << lineno; cerr << "): assignment to 'sl' side has no effect." << endl; @@ -327,38 +325,38 @@ TRXReader::procDefAttrs() UString attrname; while(type != XML_READER_TYPE_END_ELEMENT || - name != "section-def-attrs") + name != "section-def-attrs"_u) { step(); - if(name == "attr-item") + if(name == "attr-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - insertAttrItem(attrname, attrib("tags")); + insertAttrItem(attrname, attrib("tags"_u)); } } - else if(name == "def-attr") + else if(name == "def-attr"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - attrname = attrib("n"); + attrname = attrib("n"_u); } else { UString all = td.getAttrItems()[attrname]; - td.getAttrItems()[attrname] = "(" + all + ")"; - attrname = ""; + td.getAttrItems()[attrname] = "("_u + all + ")"_u; + attrname.clear(); } } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == "section-def-attrs") + else if(name == "section-def-attrs"_u) { // do nothing } @@ -372,11 +370,11 @@ TRXReader::procDefAttrs() void TRXReader::procDefCats() { - while(type == XML_READER_TYPE_END_ELEMENT || !(name == "transfer" || name == "interchunk" || name == "postchunk")) + while(type == XML_READER_TYPE_END_ELEMENT || !(name == "transfer"_u || name == "interchunk"_u || name == "postchunk"_u)) { step(); - if(name != "#text" && name != "transfer" && name != "interchunk" && - name != "postchunk" && name != "section-def-cats" && name != "#comment") + if(name != "#text"_u && name != "transfer"_u && name != "interchunk"_u && + name != "postchunk"_u && name != "section-def-cats"_u && name != "#comment"_u) { unexpectedTag(); } @@ -385,43 +383,43 @@ TRXReader::procDefCats() UString catname; while(type != XML_READER_TYPE_END_ELEMENT || - name != "section-def-cats") + name != "section-def-cats"_u) { step(); - if(name == "cat-item") + if(name == "cat-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - if(attrib("tags") != "") + if(!attrib("tags"_u).empty()) { - insertCatItem(catname, attrib("lemma"), attrib("tags")); + insertCatItem(catname, attrib("lemma"_u), attrib("tags"_u)); } else { - insertCatItem(catname, attrib("name"), ""); + insertCatItem(catname, attrib("name"_u), ""_u); } } } - else if(name == "def-cat") + else if(name == "def-cat"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - catname = attrib("n"); + catname = attrib("n"_u); } else { - catname = ""; + catname.clear(); } } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == "section-def-cats") + else if(name == "section-def-cats"_u) { // do nothing } @@ -436,25 +434,25 @@ void TRXReader::procDefVars() { while(type != XML_READER_TYPE_END_ELEMENT || - name != "section-def-vars") + name != "section-def-vars"_u) { step(); - if(name == "def-var") + if(name == "def-var"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - createVar(attrib("n"), attrib("v")); + createVar(attrib("n"_u), attrib("v"_u)); } } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == "section-def-vars") + else if(name == "section-def-vars"_u) { // do nothing } @@ -471,36 +469,36 @@ TRXReader::procDefLists() UString listname; while(type != XML_READER_TYPE_END_ELEMENT || - name != "section-def-lists") + name != "section-def-lists"_u) { step(); - if(name == "list-item") + if(name == "list-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - insertListItem(listname, attrib("v")); + insertListItem(listname, attrib("v"_u)); } } - else if(name == "def-list") + else if(name == "def-list"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - listname = attrib("n"); + listname = attrib("n"_u); } else { - listname = ""; + listname.clear(); } } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == "section-def-lists") + else if(name == "section-def-lists"_u) { // do nothing } @@ -516,14 +514,14 @@ TRXReader::procDefMacros() { int count = 0; while(type != XML_READER_TYPE_END_ELEMENT || - name != "section-def-macros") + name != "section-def-macros"_u) { step(); - if(name == "def-macro") + if(name == "def-macro"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - createMacro(attrib("n"), count++); + createMacro(attrib("n"_u), count++); } } } @@ -534,7 +532,7 @@ TRXReader::createMacro(UString const &name, int const value) { if(td.getMacros().find(name) != td.getMacros().end()) { - parseError("Macro '" + name + "' defined at least twice"); + parseError("Macro '"_u + name + "' defined at least twice"_u); } td.getMacros()[name] = value; } @@ -575,7 +573,7 @@ TRXReader::insertAttrItem(UString const &name, UString const &tags) { if(tags[i] == '.') { - td.getAttrItems()[name].append("><"); + td.getAttrItems()[name].append("><"_u); } else { diff --git a/apertium/tsx_reader.cc b/apertium/tsx_reader.cc index 92f0d5a..a6c051a 100644 --- a/apertium/tsx_reader.cc +++ b/apertium/tsx_reader.cc @@ -61,13 +61,13 @@ TSXReader::clearTagIndex() { tag_index->clear(); array_tags->clear(); - newTagIndex("LPAR"); - newTagIndex("RPAR"); - newTagIndex("LQUEST"); - newTagIndex("CM"); - newTagIndex("SENT"); - newTagIndex("kEOF"); - newTagIndex("kUNDEF"); + newTagIndex("LPAR"_u); + newTagIndex("RPAR"_u); + newTagIndex("LQUEST"_u); + newTagIndex("CM"_u); + newTagIndex("SENT"_u); + newTagIndex("kEOF"_u); + newTagIndex("kUNDEF"_u); } TSXReader & @@ -84,25 +84,25 @@ TSXReader::operator =(TSXReader const &o) void TSXReader::newTagIndex(UString const &tag) { - if(tag_index->find("TAG_" + tag) != tag_index->end()) + if(tag_index->find("TAG_"_u + tag) != tag_index->end()) { - parseError("'" + tag + "' already defined"); + parseError("'"_u + tag + "' already defined"_u); } - array_tags->push_back("TAG_" + tag); - (*tag_index)["TAG_" + tag] = array_tags->size() - 1; + array_tags->push_back("TAG_"_u + tag); + (*tag_index)["TAG_"_u + tag] = array_tags->size() - 1; } void TSXReader::newDefTag(UString const &tag) { - if(tag_index->find("TAG_" + tag) != tag_index->end()) + if(tag_index->find("TAG_"_u + tag) != tag_index->end()) { - parseError("'" + tag + "' already defined"); + parseError("'"_u + tag + "' already defined"_u); } array_tags->push_back(tag); - (*tag_index)["TAG_" + tag] = array_tags->size() - 1; + (*tag_index)["TAG_"_u + tag] = array_tags->size() - 1; } void @@ -115,26 +115,26 @@ TSXReader::newConstant(UString const &constant) void TSXReader::procDiscardOnAmbiguity() { - while(type != XML_READER_TYPE_END_ELEMENT || name != "discard-on-ambiguity") + while(type != XML_READER_TYPE_END_ELEMENT || name != "discard-on-ambiguity"_u) { step(); - if(name == "discard") + if(name == "discard"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - tdata.addDiscard("<" + StringUtils::substitute(attrib("tags"), ".", "><") + ">"); + tdata.addDiscard("<"_u + StringUtils::substitute(attrib("tags"_u), "."_u, "><"_u) + ">"_u); } } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == "discard-on-ambiguity") + else if(name == "discard-on-ambiguity"_u) { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -142,7 +142,7 @@ TSXReader::procDiscardOnAmbiguity() } else { - parseError("Unexpected 'discard-on-ambiguity' open tag"); + parseError("Unexpected 'discard-on-ambiguity' open tag"_u); } } else @@ -155,36 +155,36 @@ TSXReader::procDiscardOnAmbiguity() void TSXReader::procDefLabel() { - UString name_attr = attrib("name"); - UString closed_attr = attrib("closed"); + UString name_attr = attrib("name"_u); + UString closed_attr = attrib("closed"_u); newDefTag(name_attr); - if(closed_attr != "true") + if(closed_attr != "true"_u) { - open_class->insert((*tag_index)["TAG_"+name_attr]); + open_class->insert((*tag_index)["TAG_"_u + name_attr]); } - while(type != XML_READER_TYPE_END_ELEMENT || name != "def-label") + while(type != XML_READER_TYPE_END_ELEMENT || name != "def-label"_u) { step(); - if(name == "tags-item") + if(name == "tags-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - plist->insert((*tag_index)["TAG_"+name_attr], attrib("lemma"), - attrib("tags")); + plist->insert((*tag_index)["TAG_"_u + name_attr], attrib("lemma"_u), + attrib("tags"_u)); } } - else if(name == "def-label") + else if(name == "def-label"_u) { return; } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } @@ -198,50 +198,50 @@ TSXReader::procDefLabel() void TSXReader::procDefMult() { - UString name_attr = attrib("name"); - UString closed_attr = attrib("closed"); + UString name_attr = attrib("name"_u); + UString closed_attr = attrib("closed"_u); newDefTag(name_attr); - if(closed_attr != "true") + if(closed_attr != "true"_u) { - open_class->insert((*tag_index)["TAG_"+name_attr]); + open_class->insert((*tag_index)["TAG_"_u + name_attr]); } - while(type != XML_READER_TYPE_END_ELEMENT || name != "def-mult") + while(type != XML_READER_TYPE_END_ELEMENT || name != "def-mult"_u) { step(); - if(name == "sequence") + if(name == "sequence"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { plist->beginSequence(); - while(type != XML_READER_TYPE_END_ELEMENT || name != "sequence") + while(type != XML_READER_TYPE_END_ELEMENT || name != "sequence"_u) { step(); - if(name == "label-item") + if(name == "label-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - plist->insert((*tag_index)["TAG_"+name_attr], - (*tag_index)["TAG_"+attrib("label")]); + plist->insert((*tag_index)["TAG_"_u + name_attr], + (*tag_index)["TAG_"_u + attrib("label"_u)]); } } - else if(name == "tags-item") + else if(name == "tags-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - plist->insert((*tag_index)["TAG_"+name_attr], - attrib("lemma"), attrib("tags")); + plist->insert((*tag_index)["TAG_"_u + name_attr], + attrib("lemma"_u), attrib("tags"_u)); } } - else if(name == "sequence") + else if(name == "sequence"_u) { break; } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } @@ -249,15 +249,15 @@ TSXReader::procDefMult() plist->endSequence(); } } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == "def-mult") + else if(name == "def-mult"_u) { // do nothing } @@ -271,41 +271,41 @@ TSXReader::procDefMult() void TSXReader::procTagset() { - while(type == XML_READER_TYPE_END_ELEMENT || name != "tagset") + while(type == XML_READER_TYPE_END_ELEMENT || name != "tagset"_u) { step(); - if(name != "#text" && name != "tagger" && name != "tagset") + if(name != "#text"_u && name != "tagger"_u && name != "tagset"_u) { unexpectedTag(); } } - while(type != XML_READER_TYPE_END_ELEMENT || name != "tagset") + while(type != XML_READER_TYPE_END_ELEMENT || name != "tagset"_u) { step(); - if(name == "def-label") + if(name == "def-label"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { procDefLabel(); } } - else if(name == "def-mult") + else if(name == "def-mult"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { procDefMult(); } } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == "tagset") + else if(name == "tagset"_u) { // do nothing } @@ -323,27 +323,27 @@ TSXReader::procLabelSequence() TForbidRule forbid_rule; step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } - if(name != "label-item") + if(name != "label-item"_u) { - parseError(" tag expected"); + parseError(" tag expected"_u); } - forbid_rule.tagi = (*tag_index)["TAG_" + attrib("label")]; + forbid_rule.tagi = (*tag_index)["TAG_"_u + attrib("label"_u)]; step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } - if(name != "label-item") + if(name != "label-item"_u) { - parseError(" tag expected"); + parseError(" tag expected"_u); } - forbid_rule.tagj = (*tag_index)["TAG_" + attrib("label")]; + forbid_rule.tagj = (*tag_index)["TAG_"_u + attrib("label"_u)]; forbid_rules->push_back(forbid_rule); } @@ -351,25 +351,25 @@ TSXReader::procLabelSequence() void TSXReader::procForbid() { - while(type != XML_READER_TYPE_END_ELEMENT || name != "forbid") + while(type != XML_READER_TYPE_END_ELEMENT || name != "forbid"_u) { step(); - if(name == "label-sequence") + if(name == "label-sequence"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { procLabelSequence(); } } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == "forbid") + else if(name == "forbid"_u) { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -377,12 +377,12 @@ TSXReader::procForbid() } else { - parseError("Unexpected '" + name + "' open tag"); + parseError("Unexpected '"_u + name + "' open tag"_u); } } else { - parseError("Unexpected '" + name + "' tag"); + parseError("Unexpected '"_u + name + "' tag"_u); } } } @@ -391,14 +391,14 @@ void TSXReader::procEnforce() { TEnforceAfterRule aux; - while(type != XML_READER_TYPE_END_ELEMENT || name != "enforce-rules") + while(type != XML_READER_TYPE_END_ELEMENT || name != "enforce-rules"_u) { step(); - if(name == "enforce-after") + if(name == "enforce-after"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - aux.tagi = (*tag_index)["TAG_" + attrib("label")]; + aux.tagi = (*tag_index)["TAG_"_u + attrib("label"_u)]; } else { @@ -406,26 +406,26 @@ TSXReader::procEnforce() aux.tagsj.clear(); } } - else if(name == "label-set") + else if(name == "label-set"_u) { // do nothing } - else if(name == "label-item") + else if(name == "label-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - aux.tagsj.push_back((*tag_index)["TAG_" + attrib("label")]); + aux.tagsj.push_back((*tag_index)["TAG_"_u + attrib("label"_u)]); } } - else if(name == "#text") + else if(name == "#text"_u) { // do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == "enforce-rules") + else if(name == "enforce-rules"_u) { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -433,12 +433,12 @@ TSXReader::procEnforce() } else { - parseError("Unexpected 'enforce-rules' open tag"); + parseError("Unexpected 'enforce-rules' open tag"_u); } } else { - parseError("Unexpected '" + name + "' tag"); + parseError("Unexpected '"_u + name + "' tag"_u); } } } @@ -446,26 +446,26 @@ TSXReader::procEnforce() void TSXReader::procPreferences() { - while(type != XML_READER_TYPE_END_ELEMENT || name != "preferences") + while(type != XML_READER_TYPE_END_ELEMENT || name != "preferences"_u) { step(); - if(name == "prefer") + if(name == "prefer"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - UString const tags = "<" + StringUtils::substitute(attrib("tags"), ".", "><") + ">"; + UString const tags = "<"_u + StringUtils::substitute(attrib("tags"_u), "."_u, "><"_u) + ">"_u; prefer_rules->push_back(tags); } } - else if(name == "#text") + else if(name == "#text"_u) { //do nothing } - else if(name == "#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == "preferences") + else if(name == "preferences"_u) { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -473,12 +473,12 @@ TSXReader::procPreferences() } else { - parseError("Unexpected 'preferences' open tag"); + parseError("Unexpected 'preferences' open tag"_u); } } else { - parseError("Unexpected '" + name + "' tag"); + parseError("Unexpected '"_u + name + "' tag"_u); } } } @@ -494,38 +494,38 @@ TSXReader::parse() procTagset(); step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } - if(name == "forbid") + if(name == "forbid"_u) { procForbid(); step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == "enforce-rules") + if(name == "enforce-rules"_u) { procEnforce(); step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == "preferences") + if(name == "preferences"_u) { procPreferences(); step(); - while(name == "#text" || name == "#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == "discard-on-ambiguity") + if(name == "discard-on-ambiguity"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { @@ -533,20 +533,20 @@ TSXReader::parse() } } - newConstant("kMOT"); - newConstant("kDOLLAR"); - newConstant("kBARRA"); - newConstant("kMAS"); - newConstant("kIGNORAR"); - newConstant("kBEGIN"); - newConstant("kUNKNOWN"); - - plist->insert((*tag_index)["TAG_LPAR"], "", "lpar"); - plist->insert((*tag_index)["TAG_RPAR"], "", "rpar"); - plist->insert((*tag_index)["TAG_LQUEST"], "", "lquest"); - plist->insert((*tag_index)["TAG_CM"], "", "cm"); - plist->insert((*tag_index)["TAG_SENT"], "", "sent"); -// plist->insert((*tag_index)["TAG_kMAS"], "+", ""); + newConstant("kMOT"_u); + newConstant("kDOLLAR"_u); + newConstant("kBARRA"_u); + newConstant("kMAS"_u); + newConstant("kIGNORAR"_u); + newConstant("kBEGIN"_u); + newConstant("kUNKNOWN"_u); + + plist->insert((*tag_index)["TAG_LPAR"_u], ""_u, "lpar"_u); + plist->insert((*tag_index)["TAG_RPAR"_u], ""_u, "rpar"_u); + plist->insert((*tag_index)["TAG_LQUEST"_u], ""_u, "lquest"_u); + plist->insert((*tag_index)["TAG_CM"_u], ""_u, "cm"_u); + plist->insert((*tag_index)["TAG_SENT"_u], ""_u, "sent"_u); +// plist->insert((*tag_index)["TAG_kMAS"_u], "+"_u, ""_u); plist->buildTransducer(); } diff --git a/apertium/xml_reader.cc b/apertium/xml_reader.cc index ccd3c3c..f0a542b 100644 --- a/apertium/xml_reader.cc +++ b/apertium/xml_reader.cc @@ -25,7 +25,7 @@ XMLReader::XMLReader() : reader(0), type(0) {} void XMLReader::stepToTag() { - while (name == "#text" || name == "#comment") { + while (name == "#text"_u || name == "#comment"_u) { step(); } } @@ -36,7 +36,7 @@ XMLReader::step() int retval = xmlTextReaderRead(reader); if (retval != 1) { - parseError("unexpected EOF"); + parseError("unexpected EOF"_u); } name = XMLParseUtil::readName(reader); type = xmlTextReaderNodeType(reader); @@ -69,12 +69,6 @@ XMLReader::attrib(UString const &name) return XMLParseUtil::attrib(reader, name); } -string -XMLReader::attrib(string const &name) -{ - return UtfConverter::toUtf8(attrib(UtfConverter::fromUtf8(name))); -} - void XMLReader::parseError(UString const &message) { @@ -87,7 +81,7 @@ XMLReader::parseError(UString const &message) void XMLReader::unexpectedTag() { - parseError("unexpected '<" + name + ">' tag"); + parseError("unexpected '<"_u + name + ">' tag"_u); } void diff --git a/apertium/xml_reader.h b/apertium/xml_reader.h index 46dfa7b..e2e3909 100644 --- a/apertium/xml_reader.h +++ b/apertium/xml_reader.h @@ -36,7 +36,6 @@ protected: int type; UString name; UString attrib(UString const &name); - string attrib(string const &name); void parseError(UString const &message); void unexpectedTag(); void stepToTag(); diff --git a/apertium/xml_walk_util.cc b/apertium/xml_walk_util.cc index 9a122ec..be2b10e 100644 --- a/apertium/xml_walk_util.cc +++ b/apertium/xml_walk_util.cc @@ -1,6 +1,6 @@ #include -children::children(const xmlNode* node_) +children::children(xmlNode* node_) : node(node_), cur(node->children) {} @@ -20,6 +20,7 @@ children::operator++() cur = cur->next; } } + return *this; } children diff --git a/apertium/xml_walk_util.h b/apertium/xml_walk_util.h index d715ebb..13ca6a4 100644 --- a/apertium/xml_walk_util.h +++ b/apertium/xml_walk_util.h @@ -12,8 +12,9 @@ private: xmlNode* node; xmlNode* cur; public: - children(const xmlNode* node); + children(xmlNode* node); children(const children& it); + ~children(); children& operator++(); children begin();