commit 09429754c11bc63786d2e12dbd1d2540de96b4e6 Author: Daniel Swanson Date: Wed Jun 30 08:50:26 2021 -0500 ICU Stuff (#127) ICU changes - replace all `std::wstring` and related types with `UString` - move input streams to `lttoolbox/input_file.h` - move output streams to `UFILE*` - switch transfer regular expressions from PCRE to ICU - use lttoolbox case functions efficiency, readability, and code style changes - copy `.editorconfig` file from lttoolbox - eliminate use of `void*` in transfer - prefer `.clear()` and `.empty()` over `= ""` and `== ""` - prefer range-for loops - simplify file-closing code in `tagger.cc` - remove unused `transfer_word_list` helper function, dependency, and code structure changes - unbundle utfcpp and rely on installed version - incorporate m4 changes suggested in #125 - move `StringUtils` to lttoolbox - add transfer regex optimizer from apertium-recursive - add `transfer_base` to combine shared code from `transfer`, `interchunk`, and `postchunk` - use iterator from `lttoolbox/xml_walk_util.h` to simplify transfer code other - add `` in interchunk and postchunk (closes #34) - reset transfer variables on null flush (closes #101) diff --git a/.editorconfig b/.editorconfig new file mode 100755 index 0000000..dd10a25 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,18 @@ +# https://editorconfig.org/ +root = yes + +[*] +charset = utf-8 +end_of_line = lf +indent_size = 4 +indent_style = tab +insert_final_newline = true +trim_trailing_whitespace = true + +[**.cc] +indent_size = 2 +indent_style = space + +[**.h] +indent_size = 2 +indent_style = space diff --git a/Makefile.am b/Makefile.am index 444db77..dc6873f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -15,7 +15,7 @@ if HAVE_PYTHON_BINDINGS SUBDIRS += python endif -EXTRA_DIST=autogen.sh README-MODES apertium.m4 utf8 tests +EXTRA_DIST=autogen.sh README-MODES apertium.m4 tests install-data-local: mkdir -p $(DESTDIR)$(modesdir) diff --git a/apertium.m4 b/apertium.m4 index 46c8a6a..6b8012e 100644 --- a/apertium.m4 +++ b/apertium.m4 @@ -109,19 +109,25 @@ AC_DEFUN([AP_MKINCLUDE], cat >$srcdir/ap_include.am <@/@name' modes.xml | sed 's/ *name="\(@<:@^"@:>@*\)"/\1.mode /g'\`; \\ + modes=\`xmllint --xpath '//mode@<:@@install="yes"@:>@/@name' \@S|@< | sed 's/ *name="\(@<:@^"@:>@*\)"/\1.mode /g'\`; \\ if test -n "\$\$modes"; then \\ \$(INSTALL_DATA) \$\$modes \$(DESTDIR)\$(apertium_modesdir); \\ rm \$\$modes; \\ fi +uninstall-modes: modes.xml + files=\`xmllint --xpath '//mode@<:@@install="yes"@:>@/@name' \@S|@< | sed 's/ *name="\(@<:@^"@:>@*\)"/\1.mode /g'\`; \\ + if test -n "\$\$files"; then \\ + dir=\$(DESTDIR)\$(apertium_modesdir); \$(am__uninstall_files_from_dir) + fi + .deps/.d: \$(MKDIR_P) .deps touch \$[]@ diff --git a/apertium/Makefile.am b/apertium/Makefile.am index 48223c4..b731e60 100644 --- a/apertium/Makefile.am +++ b/apertium/Makefile.am @@ -38,7 +38,6 @@ h_sources = a.h \ stream_tagger.h \ streamed_type.h \ string_to_wostream.h \ - string_utils.h \ shell_utils.h \ tag.h \ tagger.h \ @@ -63,19 +62,19 @@ h_sources = a.h \ tmx_trail_postprocessors.h \ tmx_translate.h \ tmx_words.h \ + transfer_base.h \ transfer_data.h \ transfer.h \ transfer_instr.h \ transfer_mult.h \ + transfer_regex.h \ transfer_token.h \ transfer_word.h \ - transfer_word_list.h \ trx_reader.h \ tsx_reader.h \ ttag.h \ unigram_tagger.h \ unlocked_cstdio.h \ - utf_converter.h \ utils.h \ xml_reader.h @@ -109,7 +108,6 @@ cc_sources = a.cc \ sentence_stream.cc \ stream.cc \ stream_tagger.cc \ - string_utils.cc \ shell_utils.cc \ tag.cc \ tagger.cc \ @@ -130,16 +128,16 @@ cc_sources = a.cc \ tmx_trail_postprocessors.cc \ tmx_translate.cc \ transfer.cc \ + transfer_base.cc \ transfer_data.cc \ transfer_instr.cc \ transfer_mult.cc \ + transfer_regex.cc \ transfer_token.cc \ transfer_word.cc \ - transfer_word_list.cc \ trx_reader.cc \ tsx_reader.cc \ unigram_tagger.cc \ - utf_converter.cc \ xml_reader.cc library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) @@ -301,7 +299,7 @@ apertium_gen_modes_SOURCES = gen_modes.cc apertium_gen_modes_LDADD = -lapertium$(VERSION_MAJOR) $(lib_LTLIBRARIES) if WINDOWS -AM_CPPFLAGS = -I$(top_srcdir)/utf8 -I$(top_srcdir)/apertium/win32 -I$(top_srcdir) $(APERTIUM_CFLAGS) $(ICU_CFLAGS) +AM_CPPFLAGS = -I$(top_srcdir)/apertium/win32 -I$(top_srcdir) $(APERTIUM_CFLAGS) $(ICU_CFLAGS) else AM_CPPFLAGS = -I$(top_srcdir) $(APERTIUM_CFLAGS) $(ICU_CFLAGS) endif diff --git a/apertium/adapt_docx.cc b/apertium/adapt_docx.cc index 4699a0c..91f4937 100644 --- a/apertium/adapt_docx.cc +++ b/apertium/adapt_docx.cc @@ -30,7 +30,7 @@ #include #endif -#include "utf8/utf8.h" +#include #include "unicode/uchar.h" using namespace std; diff --git a/apertium/align.cc b/apertium/align.cc index 4b814a5..68f0bd3 100644 --- a/apertium/align.cc +++ b/apertium/align.cc @@ -15,7 +15,7 @@ #include "align.h" #include "linebreak.h" -#include +#include #include #include @@ -33,7 +33,7 @@ void align::align_( for (std::vector >::const_iterator i_ = string_.begin(); i_ != string_.end(); ++i_) { - std::wcerr << " " << std::setw(width_) << std::left << i_->first + std::cerr << " " << std::setw(width_) << std::left << i_->first << std::setw(0) << linebreak::linebreak_(i_->second, width_ + 2, width_ + 4) << '\n'; diff --git a/apertium/analysis.cc b/apertium/analysis.cc index 5e4e241..b3394ba 100644 --- a/apertium/analysis.cc +++ b/apertium/analysis.cc @@ -16,14 +16,12 @@ #include "analysis.h" #include "exception.h" -#include "morpheme.h" - -#include -#include namespace Apertium { -std::wostream &operator<<(std::wostream &Stream_, const Analysis &Analysis_) { - Stream_ << static_cast(Analysis_); +std::ostream &operator<<(std::ostream &Stream_, const Analysis &Analysis_) { + ::operator<<(Stream_, static_cast(Analysis_)); + //Stream_ << static_cast(Analysis_); + // namespace issue return Stream_; } @@ -35,21 +33,22 @@ bool operator<(const Analysis &a, const Analysis &b) { return a.TheMorphemes < b.TheMorphemes; } -Analysis::operator std::wstring() const { +Analysis::operator UString() const { if (TheMorphemes.empty()) throw Exception::Analysis::TheMorphemes_empty( "can't convert Analysis comprising empty Morpheme std::vector to " - "std::wstring"); + "UString"); std::vector::const_iterator Morpheme_ = TheMorphemes.begin(); - std::wstring wstring_ = *Morpheme_; + UString UString_ = *Morpheme_; ++Morpheme_; // Call .end() each iteration to save memory. for (; Morpheme_ != TheMorphemes.end(); ++Morpheme_) { - wstring_ += L"+" + static_cast(*Morpheme_); + UString_ += '+'; + UString_ += static_cast(*Morpheme_); } - return wstring_; + return UString_; } } diff --git a/apertium/analysis.h b/apertium/analysis.h index f70a966..194bc60 100644 --- a/apertium/analysis.h +++ b/apertium/analysis.h @@ -21,15 +21,16 @@ #include #include #include +#include namespace Apertium { class Analysis { public: - friend std::wostream &operator<<(std::wostream &Stream_, - const Analysis &Analysis_); + friend std::ostream &operator<<(std::ostream &Stream_, + const Analysis &Analysis_); friend bool operator==(const Analysis &a, const Analysis &b); friend bool operator<(const Analysis &a, const Analysis &b); - operator std::wstring() const; + operator UString() const; std::vector TheMorphemes; }; } diff --git a/apertium/apertium-multiple-translations.cc b/apertium/apertium-multiple-translations.cc index ecdebab..f7a6443 100644 --- a/apertium/apertium-multiple-translations.cc +++ b/apertium/apertium-multiple-translations.cc @@ -61,30 +61,25 @@ int main(int argc, char *argv[]) } } - FILE *input = stdin, *output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); if(argc >= 4) { - input = fopen(argv[3], "r"); - if(!input) - { + if (!input.open(argv[3])) { cerr << "Error: can't open input file '" << argv[3] << "'." << endl; exit(EXIT_FAILURE); } if(argc == 5) { - output = fopen(argv[4], "w"); + output = u_fopen(argv[4], "w", NULL, NULL); if(!output) { - cerr << "Error: can't open output file '"; - cerr << argv[4] << "'." << endl; - exit(EXIT_FAILURE); + cerr << "Error: can't open output file '"; + cerr << argv[4] << "'." << endl; + exit(EXIT_FAILURE); } } } -#ifdef _MSC_VER - _setmode(_fileno(input), _O_U8TEXT); - _setmode(_fileno(output), _O_U8TEXT); -#endif TransferMult t; t.read(argv[1], argv[2]); diff --git a/apertium/apertium-postlatex-raw.l b/apertium/apertium-postlatex-raw.l index a7eb161..05a4094 100644 --- a/apertium/apertium-postlatex-raw.l +++ b/apertium/apertium-postlatex-raw.l @@ -6,10 +6,7 @@ #include #include -#include #include -#include -#include extern "C" { #if !defined(__STDC__) @@ -19,51 +16,13 @@ extern "C" { } #include -#include #ifndef GENFORMAT #include "apertium_config.h" #endif #include -#ifdef _WIN32 -#include -#include -#endif using namespace std; -AccentsMap accentsMap(true); -wstring closesym = L""; -string memconv = ""; - -wstring convertir(string const &multibyte, int const length) -{ - memconv.append(multibyte.c_str(), length); - int tam = memconv.size(); - wchar_t *retval = new wchar_t[tam+1]; - size_t l = mbstowcs(retval, memconv.c_str(), tam); - - if(l == ((size_t) -1)) - { - delete[] retval; - if(memconv.size() >= 4) - { - wcerr << L"Warning: wrong encoding" << endl; - } - return L""; - } - else - { - memconv = ""; - retval[l] = 0; - wstring ret = retval; - delete[] retval; - return ret; - } -} - - - - %} @@ -79,106 +38,107 @@ wstring convertir(string const &multibyte, int const length) " { - fputws(L"\"",yyout); + fputs_unlocked("\"",yyout); } ' { - fputws(L"\'",yyout); + fputs_unlocked("\'",yyout); } < { - fputws(L"<",yyout); + fputs_unlocked("<",yyout); } > { - fputws(L">",yyout); + fputs_unlocked(">",yyout); } & { - fputws(L"\\&",yyout); + fputs_unlocked("\\&",yyout); } \ { - fputws(L"&",yyout); + fputs_unlocked("&",yyout); } \ { - fputws(L"\\{", yyout); + fputs_unlocked("\\{", yyout); } \ { - fputws(L"\\}", yyout); + fputs_unlocked("\\}", yyout); } \ { - fputws(L"\\%", yyout); + fputs_unlocked("\\%", yyout); } ¿ { - fputws(L"?`",yyout); + fputs_unlocked("?`",yyout); } ¡ { - fputws(L"!`",yyout); + fputs_unlocked("!`",yyout); } \ { BEGIN(mathenv); - fputws(L"$$",yyout); + fputs_unlocked("$$",yyout); } \<\/MATH_DOLLARS\> { - fputws(L"$$",yyout); + fputs_unlocked("$$",yyout); BEGIN(0); } \ { BEGIN(mathenv); - fputws(L"$",yyout); + fputs_unlocked("$",yyout); } \<\/MATH_DOLLAR\> { - fputws(L"$",yyout); + fputs_unlocked("$",yyout); BEGIN(0); } \ { - fputws(L"\\(",yyout); + fputs_unlocked("\\(",yyout); } \<\/MATH_PAR\> { - fputws(L"\\)",yyout); + fputs_unlocked("\\)",yyout); } \ { - fputws(L"\\[",yyout); + fputs_unlocked("\\[",yyout); } \<\/MATH_BRA\> { - fputws(L"\\]",yyout); + fputs_unlocked("\\]",yyout); } \ { - fputws(L"{",yyout); + fputs_unlocked("{",yyout); } \<\/CONTENTS\> { - fputws(L"}",yyout); + fputs_unlocked("}",yyout); } &NBSP; { - fputws(L"~",yyout); + fputs_unlocked("~",yyout); } \ { - fputws(L"\\\\",yyout); + fputs_unlocked("\\\\",yyout); } \[^\<]* { - fputws((wstring(L"\%")+convertir(yytext+9,yyleng-9)).c_str(),yyout); + fputs_unlocked("\%", yyout); + fwrite(yytext+9, 1, yyleng-9, yyout); } \<\/COMMENT\> { @@ -186,14 +146,15 @@ wstring convertir(string const &multibyte, int const length) \[^\<]* { - fputws((wstring(L"[")+convertir(yytext+7,yyleng-7)).c_str(),yyout); + fputs_unlocked("[", yyout); + fwrite(yytext+7, 1, yyleng-7, yyout); } \<\/PARAM\> { - fputws(L"]", yyout); + fputs_unlocked("]", yyout); } \ { - fputws(L"\\verb", yyout); + fputs_unlocked("\\verb", yyout); } \<\/VERB\> { @@ -202,40 +163,51 @@ wstring convertir(string const &multibyte, int const length) \<[a-zA-Z0-9]+\> { - fputws((wstring(L"\\begin{")+convertir(yytext+1,yyleng-2)+wstring(L"}")).c_str(),yyout); + fputs_unlocked("\\begin{", yyout); + fwrite(yytext+1, 1, yyleng-2, yyout); + fputs_unlocked("}", yyout); } \<[a-zA-Z0-9]+_STAR\> { - fputws((wstring(L"\\begin{")+convertir(yytext+1,yyleng-7)+wstring(L"*}")).c_str(),yyout); + fputs_unlocked("\\begin{", yyout); + fwrite(yytext+1, 1, yyleng-7, yyout); + fputs_unlocked("*}", yyout); } \<\/[a-zA-Z0-9]+\> { - fputws((wstring(L"\\end{")+convertir(yytext+2,yyleng-3)+wstring(L"}")).c_str(),yyout); + fputs_unlocked("\\end{", yyout); + fwrite(yytext+2, 1, yyleng-3, yyout); + fputs_unlocked("}", yyout); } \<\/[a-zA-Z0-9]+_STAR\> { - fputws((wstring(L"\\end{")+convertir(yytext+2,yyleng-8)+wstring(L"*}")).c_str(),yyout); + fputs_unlocked("\\end{", yyout); + fwrite(yytext+2, 1, yyleng-8, yyout); + fputs_unlocked("*}", yyout); } \<[a-zA-Z0-9]+\/\> { - fputws((wstring(L"\\")+convertir(yytext+1,yyleng-3)).c_str(),yyout); + fputs_unlocked("\\", yyout); + fwrite(yytext+1, 1, yyleng-3, yyout); } \<[a-zA-Z0-9]+_STAR\/\> { - fputws((wstring(L"\\")+convertir(yytext+1,yyleng-8)+wstring(L"*")).c_str(),yyout); + fputc_unlocked('\\', yyout); + fwrite(yytext+1, 1, yyleng-8, yyout); + fputc_unlocked('*', yyout); } \# { - fputws(L"\\#", yyout); + fputs_unlocked("\\#", yyout); } (.|\n) { - fputws(convertir(yytext,yyleng).c_str(),yyout); + fwrite(yytext, 1, yyleng, yyout); } (.|\n) { - fputws(convertir(yytext,yyleng).c_str(),yyout); + fwrite(yytext, 1, yyleng, yyout); } @@ -289,10 +261,6 @@ int main(int argc, char *argv[]) break; } -#ifdef _MSC_VER - _setmode(_fileno(yyin), _O_U8TEXT); - _setmode(_fileno(yyout), _O_U8TEXT); -#endif // prevent warning message yy_push_state(1); yy_top_state(); diff --git a/apertium/apertium-postlatex.l b/apertium/apertium-postlatex.l index c33673d..4bf1457 100644 --- a/apertium/apertium-postlatex.l +++ b/apertium/apertium-postlatex.l @@ -19,11 +19,11 @@ extern "C" { } #include -#include #ifndef GENFORMAT #include "apertium_config.h" #endif #include +#include #ifdef _WIN32 #include #include @@ -32,37 +32,6 @@ extern "C" { using namespace std; AccentsMap accentsMap(true); -wstring closesym = L""; -string memconv = ""; - -wstring convertir(string const &multibyte, int const length) -{ - memconv.append(multibyte.c_str(), length); - int tam = memconv.size(); - wchar_t *retval = new wchar_t[tam+1]; - size_t l = mbstowcs(retval, memconv.c_str(), tam); - - if(l == ((size_t) -1)) - { - delete[] retval; - if(memconv.size() >= 4) - { - wcerr << L"Warning: wrong encoding" << endl; - } - return L""; - } - else - { - memconv = ""; - retval[l] = 0; - wstring ret = retval; - delete[] retval; - return ret; - } -} - - - %} @@ -79,104 +48,105 @@ wstring convertir(string const &multibyte, int const length) " { - fputws(L"\"",yyout); + fputs("\"",yyout); } ' { - fputws(L"\'",yyout); + fputs("\'",yyout); } < { - fputws(L"<",yyout); + fputs("<",yyout); } > { - fputws(L">",yyout); + fputs(">",yyout); } & { - fputws(L"\\&",yyout); + fputs("\\&",yyout); } \ { - fputws(L"&",yyout); + fputs("&",yyout); } \ { - fputws(L"\\{", yyout); + fputs("\\{", yyout); } \ { - fputws(L"\\}", yyout); + fputs("\\}", yyout); } \ { - fputws(L"\\%", yyout); + fputs("\\%", yyout); } ¿ { - fputws(L"?`",yyout); + fputs("?`",yyout); } ¡ { - fputws(L"!`",yyout); + fputs("!`",yyout); } \ { BEGIN(mathenv); - fputws(L"$$",yyout); + fputs("$$",yyout); } \<\/MATH_DOLLARS\> { - fputws(L"$$",yyout); + fputs("$$",yyout); BEGIN(0); } \ { BEGIN(mathenv); - fputws(L"$",yyout); + fputs("$",yyout); } \<\/MATH_DOLLAR\> { - fputws(L"$",yyout); + fputs("$",yyout); BEGIN(0); } \ { - fputws(L"\\(",yyout); + fputs("\\(",yyout); } \<\/MATH_PAR\> { - fputws(L"\\)",yyout); + fputs("\\)",yyout); } \ { - fputws(L"\\[",yyout); + fputs("\\[",yyout); } \<\/MATH_BRA\> { - fputws(L"\\]",yyout); + fputs("\\]",yyout); } \ { - fputws(L"{",yyout); + fputs("{",yyout); } \<\/CONTENTS\> { - fputws(L"}",yyout); + fputs("}",yyout); } &NBSP; { - fputws(L"~",yyout); + fputs("~",yyout); } \ { - fputws(L"\\\\",yyout); + fputs("\\\\",yyout); } \[^\<]* { - fputws((wstring(L"\%")+convertir(yytext+9,yyleng-9)).c_str(),yyout); + fputs("\%", yyout); + fwrite(yytext+9, 1, yyleng-9, yyout); } \<\/COMMENT\> { @@ -184,14 +154,15 @@ wstring convertir(string const &multibyte, int const length) \[^\<]* { - fputws((wstring(L"[")+convertir(yytext+7,yyleng-7)).c_str(),yyout); + fputc('[', yyout); + fwrite(yytext+7, 1, yyleng-7, yyout); } \<\/PARAM\> { - fputws(L"]", yyout); + fputs("]", yyout); } \ { - fputws(L"\\verb", yyout); + fputs("\\verb", yyout); } \<\/VERB\> { @@ -201,99 +172,118 @@ wstring convertir(string const &multibyte, int const length) ł { - fputws(L"\\l", yyout); + fputs("\\l", yyout); } œ { - fputws(L"{\\oe}",yyout); + fputs("{\\oe}",yyout); } Œ { - fputws(L"{\\OE}",yyout); + fputs("{\\OE}",yyout); } æ { - fputws(L"{\\ae}",yyout); + fputs("{\\ae}",yyout); } Æ { - fputws(L"{\\AE}",yyout); + fputs("{\\AE}",yyout); } å { - fputws(L"{\\aa}",yyout); + fputs("{\\aa}",yyout); } Å { - fputws(L"{\\AA}",yyout); + fputs("{\\AA}",yyout); } ø { - fputws(L"{\\o}",yyout); + fputs("{\\o}",yyout); } Ø { - fputws(L"{\\O}",yyout); + fputs("{\\O}",yyout); } ß { - fputws(L"{\\ss}",yyout); + fputs("{\\ss}",yyout); } \<[a-zA-Z0-9]+\> { - fputws((wstring(L"\\begin{")+convertir(yytext+1,yyleng-2)+wstring(L"}")).c_str(),yyout); + fputs("\\begin{", yyout); + fwrite(yytext+1, 1, yyleng-2, yyout); + fputc('}', yyout); } \ { - fputws((wstring(L"\\#")+convertir(yytext+6,yyleng-8)).c_str(),yyout); + fputs("\\#", yyout); + fwrite(yytext+6, 1, yyleng-8, yyout); } \ { - fputws(L"\\#", yyout); + fputs("\\#", yyout); } \<[a-zA-Z0-9]+_STAR\> { - fputws((wstring(L"\\begin{")+convertir(yytext+1,yyleng-7)+wstring(L"*}")).c_str(),yyout); + fputs("\\begin{", yyout); + fwrite(yytext+1, 1, yyleng-7, yyout); + fputs("*}", yyout); } \<\/[a-zA-Z0-9]+\> { - fputws((wstring(L"\\end{")+convertir(yytext+2,yyleng-3)+wstring(L"}")).c_str(),yyout); + fputs("\\end{", yyout); + fwrite(yytext+2, 1, yyleng-3, yyout); + fputc('}', yyout); } \<\/[a-zA-Z0-9]+_STAR\> { - fputws((wstring(L"\\end{")+convertir(yytext+2,yyleng-8)+wstring(L"*}")).c_str(),yyout); + fputs("\\end{", yyout); + fwrite(yytext+2, 1, yyleng-8, yyout); + fputs("*}", yyout); } \<[a-zA-Z0-9]+\/\> { - fputws((wstring(L"\\")+convertir(yytext+1,yyleng-3)).c_str(),yyout); + fputc('\\', yyout); + fwrite(yytext+1, 1, yyleng-3, yyout); } \<[a-zA-Z0-9]+_STAR\/\> { - fputws((wstring(L"\\")+convertir(yytext+1,yyleng-8)+wstring(L"*")).c_str(),yyout); + fputc('\\', yyout); + fwrite(yytext+1, 1, yyleng-8, yyout); + fputc('*', yyout); } /*NO ENTIENDO ESTA REGLA \# { - fputws(L"\\#", yyout); + fputs("\\#", yyout); }*/ [^A-Za-z\n] { - wstring wt = convertir(yytext,yyleng); - wstring wa = accentsMap.get(wt); - if( wa == L"" ) - fputws(wt.c_str(),yyout); - else - fputws(wstring(L"\\"+wa.substr(0,1)+L"{"+wa.substr(1)+L"}").c_str(),yyout); + UString wt = to_ustring(yytext); + UString wa = accentsMap.get(wt); + if (wa.empty()) { + fputus(wt, yyout); + } else { + UString tmp; + tmp += '\\'; + tmp += wa[0]; + tmp += '{'; + tmp += wa.substr(1); + tmp += '}'; + fputus(tmp, yyout); + } } (.|\n) { - fputws(convertir(yytext,yyleng).c_str(),yyout); + fputs(yytext, yyout); } (.|\n) { - fputws(convertir(yytext,yyleng).c_str(),yyout); + fputs(yytext, yyout); } @@ -323,7 +313,7 @@ int main(int argc, char *argv[]) base++; } - if((argc-base) > 4) + if((argc-base) > 4) { usage(argv[0]); } @@ -347,10 +337,6 @@ int main(int argc, char *argv[]) break; } -#ifdef _MSC_VER - _setmode(_fileno(yyin), _O_U8TEXT); - _setmode(_fileno(yyout), _O_U8TEXT); -#endif // prevent warning message yy_push_state(1); yy_top_state(); diff --git a/apertium/apertium-prelatex.l b/apertium/apertium-prelatex.l index 2bf7243..13faec1 100644 --- a/apertium/apertium-prelatex.l +++ b/apertium/apertium-prelatex.l @@ -10,7 +10,8 @@ #include #include #include -#include +#include +#include extern "C" { #if !defined(__STDC__) @@ -20,7 +21,6 @@ extern "C" { } #include -#include #ifndef GENFORMAT #include "apertium_config.h" #endif @@ -33,38 +33,10 @@ extern "C" { using namespace std; AccentsMap accentsMap(false); -wstring closesym = L""; -string memconv = ""; +UString closesym; //For german babel detection bool ngermanbabel = false; -wstring convertir(string const &multibyte, int const length) -{ - memconv.append(multibyte.c_str(), length); - int tam = memconv.size(); - wchar_t *retval = new wchar_t[tam+1]; - size_t l = mbstowcs(retval, memconv.c_str(), tam); - - if(l == ((size_t) -1)) - { - delete[] retval; - if(memconv.size() >= 4) - { - wcerr << L"Warning: wrong encoding" << endl; - } - return L""; - } - else - { - memconv = ""; - retval[l] = 0; - wstring ret = retval; - delete[] retval; - return ret; - } -} - - %} @@ -84,25 +56,25 @@ wstring convertir(string const &multibyte, int const length) \\t\{..\} { //This information is lost - fputws(convertir(yytext+3,yyleng-4).c_str(),yyout); + fwrite(yytext+3, 1, yyleng-4, yyout); } \\l { - fputws(L"ł", yyout); + fputs("ł", yyout); } \"[oOaAuUsS] { //When usepackage[ngerman]{babel} is present (not checked). - if(!ngermanbabel) - fputws(convertir(yytext,yyleng).c_str(),yyout); - else { + if(!ngermanbabel) { + fputs(yytext, yyout); + } else { switch(yytext[1]){ - case 'o': fputws(L"ö", yyout); break; - case 'O': fputws(L"Ö", yyout); break; - case 'a': fputws(L"ä", yyout); break; - case 'A': fputws(L"Ä", yyout); break; - case 'u': fputws(L"ü", yyout); break; - case 'U': fputws(L"Ü", yyout); break; - case 's': fputws(L"ß", yyout); break; - case 'S': fputws(L"ß", yyout); break; + case 'o': fputs("ö", yyout); break; + case 'O': fputs("Ö", yyout); break; + case 'a': fputs("ä", yyout); break; + case 'A': fputs("Ä", yyout); break; + case 'u': fputs("ü", yyout); break; + case 'U': fputs("Ü", yyout); break; + case 's': fputs("ß", yyout); break; + case 'S': fputs("ß", yyout); break; } } } @@ -113,288 +85,304 @@ wstring convertir(string const &multibyte, int const length) switch(yytext[1]){ case '^': if(yytext[4]=='i') - fputws(L"î", yyout); + fputs("î", yyout); else - fputws(L"ĵ",yyout); + fputs("ĵ",yyout); break; case '\"': if(yytext[4]=='i') - fputws(L"ï",yyout); + fputs("ï",yyout); else - fputws(L"j",yyout); //should actually be j with umlaut + fputs("j",yyout); //should actually be j with umlaut break; case '\'': if(yytext[4]=='i') - fputws(L"í",yyout); + fputs("í",yyout); else - fputws(L"j",yyout); //should actually be j with accent + fputs("j",yyout); //should actually be j with accent break; case '`': if(yytext[4]=='i') - fputws(L"ì",yyout); + fputs("ì",yyout); else - fputws(L"k",yyout); //should actually be j with accent + fputs("k",yyout); //should actually be j with accent break; } } \{\\oe\} { - fputws(L"œ",yyout); + fputs("œ",yyout); } \{\\OE\} { - fputws(L"Œ",yyout); + fputs("Œ",yyout); } \{\\ae\} { - fputws(L"æ",yyout); + fputs("æ",yyout); } \{\\AE\} { - fputws(L"Æ",yyout); + fputs("Æ",yyout); } \{\\aa\} { - fputws(L"å",yyout); + fputs("å",yyout); } \{\\AA\} { - fputws(L"Å",yyout); + fputs("Å",yyout); } \{\\o\} { - fputws(L"ø",yyout); + fputs("ø",yyout); } \{\\O\} { - fputws(L"Ø",yyout); + fputs("Ø",yyout); } \{\\ss\} { - fputws(L"ß",yyout); + fputs("ß",yyout); } \\#[0-9]+ { - fputws((wstring(L"")).c_str(),yyout); + fputs("", yyout); } \\# { - fputws(L"", yyout); + fputs("", yyout); } \\[`'\^\"H~ck=b.druv]((\{.\})|(.)) { - wstring ws = convertir(yytext,yyleng).c_str(); - - wstring result = accentsMap.get( - L""+ws.substr(1,1)+ ( - (yyleng==3)? ws.substr(2,1) : ws.substr(3,1) - )); - - if(result == L"") - { - fputws((wstring(L"<")+convertir(yytext+1,yyleng)+wstring(L"/>")).c_str(),yyout); - } - else - { - fputws(result.c_str(), yyout); - } + UString ws = to_ustring(yytext); + UString key; + key += ws[1]; + key += (yyleng == 3) ? ws[2] : ws[3]; + + UString result = accentsMap.get(key); + if (result.empty()) { + fputc('<', yyout); + fwrite(yytext+1, 1, yyleng-1, yyout); + fputs("/>", yyout); + } else { + fputus(result, yyout); + } } \\\\ { - fputws(L"
",yyout); + fputs("
",yyout); } \%.* { - if(yytext[yyleng-1]=='\r') - fputws((wstring(L"")+convertir(yytext+1,yyleng-2)+wstring(L"\r")).c_str(),yyout); - else - fputws((wstring(L"")+convertir(yytext+1,yyleng-1)+wstring(L"")).c_str(),yyout); + fputs("", yyout); + if (yytext[yyleng-1] == '\r') { + fwrite(yytext+1, 1, yyleng-2, yyout); + fputs("\r", yyout); + } else { + fwrite(yytext+1, 1, yyleng-1, yyout); + fputs("", yyout); + } } \\usepackage\[[^\]]*\] { - wstring ws = convertir(yytext+12,yyleng-13); - fputws((wstring(L"")+ws+wstring(L"")).c_str(), yyout); - if(ws.find(L"ngerman") != wstring::npos) - ngermanbabel = true; + fputs("", yyout); + // this is maybe wrong, but hopefully no one puts non-ASCII + // characters in their package names + UString ws = to_ustring(yytext).substr(12, yyleng-13); + fputus(ws, yyout); + fputs("", yyout); + if(ws.find("ngerman"_u) != UString::npos) + ngermanbabel = true; } \[[^\]]*\] { - fputws((wstring(L"")+convertir(yytext+1,yyleng-2)+wstring(L"")).c_str(), yyout); + fputs("", yyout); + fwrite(yytext+1, 1, yyleng-2, yyout); + fputs("", yyout); } \\begin[^a-zA-Z0-9_] { BEGIN(readbrackets); - closesym = L""; + closesym = ""_u; } \\end[^a-zA-Z0-9_] { BEGIN(readbrackets); - closesym = L"/"; + closesym = "/"_u; } [ \n\r\t]*\{?[ \n\r\t]* { - wstring ws = convertir(yytext,yyleng); - int i = ws.find(L'{'); //remove it - if(i>=0) - ws = ws.substr(0,i)+ws.substr(i+1); - fputws(ws.c_str(),yyout); + UString ws = to_ustring(yytext); + int i = ws.find('{'); + if (i >= 0) { + ws = ws.substr(0, i) + ws.substr(i+1); + } + fputus(ws, yyout); } [a-zA-Z0-9]+\* { - fputws((wstring(L"<")+closesym+convertir(yytext,yyleng-1)+wstring(L"_STAR>")).c_str(),yyout); + fputc('<', yyout); + fputus(closesym, yyout); + fwrite(yytext+0, 1, yyleng-1, yyout); + fputs("_STAR>", yyout); } [a-zA-Z0-9]+ { - fputws((wstring(L"<")+closesym+convertir(yytext,yyleng)+wstring(L">")).c_str(),yyout); + fputc('<', yyout); + fputus(closesym, yyout); + fputs(yytext, yyout); + fputc('>', yyout); } [ \n\r\t]*\}[ \n\r\t]* { BEGIN(0); - wstring ws = convertir(yytext,yyleng); - int i = ws.find(L'}'); //remove it - if(i>=0) - ws = ws.substr(0,i)+ws.substr(i+1); - fputws(ws.c_str(),yyout); + UString ws = to_ustring(yytext); + int i = ws.find('}'); + if (i >= 0) { + ws = ws.substr(0, i) + ws.substr(i+1); + } + fputus(ws, yyout); } \\[A-Za-z]+\* { - fputws((wstring(L"<")+convertir(yytext+1,yyleng-2)+wstring(L"_STAR/>")).c_str(),yyout); + fputc('<', yyout); + fwrite(yytext+1, 1, yyleng-2, yyout); + fputs("_STAR/>", yyout); } \\[A-Za-z]+ { - fputws((wstring(L"<")+convertir(yytext+1,yyleng)+wstring(L"/>")).c_str(),yyout); + fputc('<', yyout); + fwrite(yytext+1, 1, yyleng-1, yyout); + fputs("/>", yyout); } \\\{ { - fputws(L"", yyout); + fputs("", yyout); } \\\{ { - fputws(L"", yyout); + fputs("", yyout); } \\\% { - fputws(L"", yyout); + fputs("", yyout); } \{ { - fputws(L"",yyout); + fputs("",yyout); } \} { - fputws((wstring(L"")).c_str(),yyout); + fputs("", yyout); } ~ { - fputws(L"&NBSP;",yyout); + fputs("&NBSP;",yyout); } \$\$ { BEGIN(mathenv); - fputws(L"",yyout); + fputs("",yyout); } \$\$ { - fputws(L"",yyout); + fputs("",yyout); BEGIN(0); } \$ { BEGIN(mathenv); - fputws(L"",yyout); + fputs("",yyout); } \$ { - fputws(L"",yyout); + fputs("",yyout); BEGIN(0); } \\verb[|][^|]+[|] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("", yyout); + fwrite(yytext+5, 1, yyleng-5, yyout); + fputs("", yyout); } \\verb[!][^!]+[!] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("",yyout); + fwrite(yytext+5, 1, yyleng-5, yyout); + fputs("", yyout); } \\verb[?][^?]+[?] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("",yyout); + fwrite(yytext+5, 1, yyleng-5, yyout); + fputs("", yyout); } \\verb[/][^/]+[/] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("",yyout); + fwrite(yytext+5, 1, yyleng-5, yyout); + fputs("", yyout); } \\verb[#][^#]+[#] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("",yyout); + fwrite(yytext+5, 1, yyleng-5, yyout); + fputs("", yyout); } \\verb[+][^+]+[+] { - fputws(L"",yyout); - wstring ws = convertir(yytext, yyleng); - fputws(ws.substr(5, ws.size()-5).c_str(), yyout); - fputws(L"", yyout); + fputs("",yyout); + fwrite(yytext+5, 1, yyleng-5, yyout); + fputs("", yyout); } \\\( { - fputws(L"",yyout); + fputs("",yyout); } \\\) { - fputws(L"",yyout); + fputs("",yyout); } \\\[ { - fputws(L"",yyout); + fputs("",yyout); } \\\] { - fputws(L"",yyout); + fputs("",yyout); } \?` { - fputws(L"¿",yyout); + fputs("¿",yyout); } !` { - fputws(L"¡",yyout); + fputs("¡",yyout); } \" { - fputws(L""",yyout); + fputs(""",yyout); } \' { - fputws(L"'",yyout); + fputs("'",yyout); } \< { - fputws(L"<",yyout); + fputs("<",yyout); } \> { - fputws(L">",yyout); + fputs(">",yyout); } \\\& { - fputws(L"&",yyout); + fputs("&",yyout); } \& { - fputws(L"",yyout); + fputs("",yyout); } @@ -402,11 +390,11 @@ wstring convertir(string const &multibyte, int const length) (.|\n|\r) { - fputws(convertir(yytext,yyleng).c_str(),yyout); + fputs(yytext, yyout); } (.|\n) { - fputws(convertir(yytext,yyleng).c_str(),yyout); + fputs(yytext, yyout); } @@ -420,9 +408,9 @@ wstring convertir(string const &multibyte, int const length) void usage(string const &progname) { - wcerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl; + cerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl; - wcerr << "LaTeX format preprocessor " << endl; + cerr << "LaTeX format preprocessor " << endl; exit(EXIT_SUCCESS); } diff --git a/apertium/apertium_cleanstream.cc b/apertium/apertium_cleanstream.cc index bd43a0b..38aa316 100644 --- a/apertium/apertium_cleanstream.cc +++ b/apertium/apertium_cleanstream.cc @@ -22,6 +22,9 @@ #include #include #include +#include +#include +#include #ifdef __MINGW32__ #include @@ -29,154 +32,73 @@ using namespace std; -#ifndef fputwc_unlocked -#define fputwc_unlocked fputwc -#endif - -#ifndef fputws_unlocked -#define fputws_unlocked fputws -#endif - -#ifndef fgetwc_unlocked -#define fgetwc_unlocked getwc -#endif - - -void -tryToSetLocale() -{ -#if !defined(__CYGWIN__) && !defined (__MINGW32__) - if(setlocale(LC_CTYPE, "") != NULL) - { - return; - } - - wcerr << "Warning: unsupported locale, fallback to \"C\"" << endl; - - setlocale(LC_ALL, "C"); -#endif -#ifdef __CYGWIN__ - setlocale(LC_ALL, "C.UTF-8"); -#endif -#ifdef __MINGW32__ - //SetConsoleInputCP(65001); - SetConsoleOutputCP(65001); -#endif -} - -wstring -readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) -{ - wstring result = L""; - result += delim1; - wchar_t c = delim1; - - while(!feof(input) && c != delim2) - { - c = static_cast(fgetwc_unlocked(input)); - result += c; - if(c != L'\\') - { - continue; - } - else - { - result += L'\\'; - c = static_cast(fgetwc(input)); - result += c; - } - } - - if(c != delim2) - { - wcerr << "Error: expected: " << delim2 << ", saw: " << c << endl; - } - - return result; -} - int main (int argc, char** argv) { - wstring buf = L""; - wstring blanktmp = L""; + UString buf; + UString blanktmp; bool keepblank = false; bool spaced = true; bool intoken = false; - wchar_t ws = L' '; + UChar32 ws = ' '; for(int i=1; i #include #include -#include +#include +#include #include #include @@ -36,49 +37,29 @@ using namespace Apertium; using namespace std; -FILE * open_file(char const *filename, char const *mode) -{ - FILE *retval; - - struct stat var; - if(stat(filename, &var)) - { - wcerr << "Can't stat '" << filename << "'" << endl; - exit(EXIT_FAILURE); - } - - retval = fopen(filename, mode); - - if(!retval) - { - wcerr << "Can't open '" << filename << "'" << endl; - exit(EXIT_FAILURE); - } -#ifdef _MSC_VER - _setmode(_fileno(retval), _O_U8TEXT); -#endif - - return retval; -} - int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); if(argc < 2 || argc > 4) { - wcerr << "USAGE: " << basename(argv[0]) << " tsx_file [input [output]" << endl; + cerr << "USAGE: " << basename(argv[0]) << " tsx_file [input [output]" << endl; exit(EXIT_FAILURE); } - FILE *input = stdin, *output = stdout; + char* input = NULL; + UFILE* output = u_finit(stdout, NULL, NULL); switch(argc) { case 4: - output = open_file(argv[3], "w"); + output = u_fopen(argv[3], "w", NULL, NULL); + if (!output) { + cerr << "Error: Unable to open '" << argv[3] << "' for writing." << endl; + exit(EXIT_FAILURE); + } // no break case 3: - input = open_file(argv[2], "r"); + input = argv[2]; // no break case 2: default: diff --git a/apertium/apertium_interchunk.cc b/apertium/apertium_interchunk.cc index 73bc555..b6b6c8b 100644 --- a/apertium/apertium_interchunk.cc +++ b/apertium/apertium_interchunk.cc @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include "getopt_long.h" #ifdef _MSC_VER @@ -31,19 +31,18 @@ #include #endif -using namespace Apertium; using namespace std; void message(char *progname) { - wcerr << "USAGE: " << basename(progname) << " [-tz] t2x preproc [input [output]]" << endl; - wcerr << " t2x t2x rules file" << endl; - wcerr << " preproc result of preprocess trules file" << endl; - wcerr << " input input file, standard input by default" << endl; - wcerr << " output output file, standard output by default" << endl; - wcerr << "OPTIONS" < saved_token = tagged_sent[token_idx]; tagged_sent[token_idx] = lu.TheAnalyses[analy_idx]; - std::wcout << L"LU:" << tagged_sent[token_idx] << std::endl ; + std::cout << "LU:" << tagged_sent[token_idx] << std::endl ; std::vector &wordoids = lu.TheAnalyses[analy_idx].TheMorphemes; for (wrd_idx=0; wrd_idx\n"; - std::wcout << "Output features and weights from a model file.\n"; - std::wcout << argv[0] << " mtx \n"; - std::wcout << "Output macros and features from an mtx file.\n"; - std::wcout << argv[0] << " path \n"; - std::wcout << "Trace a particular path through giving which features fire " + std::cout << "Run with one of:\n"; + std::cout << argv[0] << " model \n"; + std::cout << "Output features and weights from a model file.\n"; + std::cout << argv[0] << " mtx \n"; + std::cout << "Output macros and features from an mtx file.\n"; + std::cout << argv[0] << " path \n"; + std::cout << "Trace a particular path through giving which features fire " << "and the resulting score. Useful for interactively " << "designing feature sets.\n"; } diff --git a/apertium/apertium_postchunk.cc b/apertium/apertium_postchunk.cc index bde462e..8c3cfe8 100644 --- a/apertium/apertium_postchunk.cc +++ b/apertium/apertium_postchunk.cc @@ -24,60 +24,46 @@ #include #include #include -#include +#include #ifdef _MSC_VER #include #include #endif -using namespace Apertium; using namespace std; void message(char *progname) { - wcerr << "USAGE: " << basename(progname) << " [-z] t3x preproc [input [output]]" << endl; - wcerr << " t3x t3x rules file" << endl; - wcerr << " preproc result of preprocess trules file" << endl; - wcerr << " input input file, standard input by default" << endl; - wcerr << " output output file, standard output by default" << endl; - wcerr << "OPTIONS" < #include #endif -#include +#include #include +#include -using namespace Apertium; using namespace std; void usage(char *progname) { - wcerr << L"USAGE: " << basename(progname) << L" [input_file [output_file]]" << endl; - wcerr << L" -z null-flushing output on '\0'" << endl; - wcerr << L" -h shows this message" << endl; + cerr << "USAGE: " << basename(progname) << " [input_file [output_file]]" << endl; + cerr << " -z null-flushing output on '\0'" << endl; + cerr << " -h shows this message" << endl; exit(EXIT_FAILURE); } -void processStream(FILE *in, FILE *out, bool null_flush) +void processStream(InputFile& in, UFILE* out, bool null_flush) { int prev = -1; - int c = fgetc(in); + UChar32 c = in.get(); while (c != EOF) { if (!((c == ' ') && (prev == ' '))) { - putc(c, out); + u_fputc(c, out); } if (c == 0 && null_flush) { - fflush(out); - putc(c, out); + u_fflush(out); + u_fputc(c, out); } prev = c; - c = fgetc(in); + c = in.get(); } } @@ -98,44 +98,33 @@ int main(int argc, char *argv[]) usage(argv[0]); } - FILE *input, *output; + InputFile input; + UFILE* output; if((argc-optind+1) == 1) { - input = stdin; - output = stdout; + output = u_finit(stdout, NULL, NULL); } else if ((argc-optind+1) == 2) { - input = fopen(argv[argc-1], "r"); - if(!input) - { + if (!input.open(argv[argc-1])) { usage(argv[0]); } - output = stdout; + output = u_finit(stdout, NULL, NULL); } else { - input = fopen(argv[argc-2], "r"); - output = fopen(argv[argc-1], "w"); - - if(!input || !output) - { + output = u_fopen(argv[argc-1], "w", NULL, NULL); + if (!output || !input.open(argv[argc-2])) { usage(argv[0]); } } - if(feof(input)) + if(input.eof()) { - wcerr << L"ERROR: Can't read file '" << argv[1] << L"'" << endl; + cerr << "ERROR: Can't read file '" << argv[1] << "'" << endl; exit(EXIT_FAILURE); } processStream(input, output, null_flush); - -#ifdef _MSC_VER - _setmode(_fileno(input), _O_U8TEXT); - _setmode(_fileno(output), _O_U8TEXT); -#endif - } diff --git a/apertium/apertium_pretransfer.cc b/apertium/apertium_pretransfer.cc index 95c2c4c..f5c07f0 100644 --- a/apertium/apertium_pretransfer.cc +++ b/apertium/apertium_pretransfer.cc @@ -26,18 +26,18 @@ #include #endif #include -#include +#include +#include -using namespace Apertium; using namespace std; void usage(char *progname) { - wcerr << L"USAGE: " << basename(progname) << L" [input_file [output_file]]" << endl; - wcerr << L" -n assume no surface forms" << endl; - wcerr << L" -e treat ~ as compound separator" << endl; - wcerr << L" -z null-flushing output on '\0'" << endl; - wcerr << L" -h shows this message" << endl; + cerr << "USAGE: " << basename(progname) << " [input_file [output_file]]" << endl; + cerr << " -n assume no surface forms" << endl; + cerr << " -e treat ~ as compound separator" << endl; + cerr << " -z null-flushing output on '\\0'" << endl; + cerr << " -h shows this message" << endl; exit(EXIT_FAILURE); } @@ -90,43 +90,35 @@ int main(int argc, char *argv[]) usage(argv[0]); } - FILE *input, *output; + InputFile input; + UFILE* output; if((argc-optind+1) == 1) { - input = stdin; - output = stdout; + output = u_finit(stdout, NULL, NULL); } else if ((argc-optind+1) == 2) { - input = fopen(argv[argc-1], "r"); - if(!input) - { + if(!input.open(argv[argc-1])) { usage(argv[0]); } - output = stdout; + output = u_finit(stdout, NULL, NULL); } else { - input = fopen(argv[argc-2], "r"); - output = fopen(argv[argc-1], "w"); + output = u_fopen(argv[argc-1], "w", NULL, NULL); - if(!input || !output) + if(!output || !input.open(argv[argc-2])) { usage(argv[0]); } } - if(feof(input)) + if(input.eof()) { - wcerr << L"ERROR: Can't read file '" << argv[1] << L"'" << endl; + cerr << "ERROR: Can't read file '" << argv[1] << "'" << endl; exit(EXIT_FAILURE); } -#ifdef _MSC_VER - _setmode(_fileno(input), _O_U8TEXT); - _setmode(_fileno(output), _O_U8TEXT); -#endif - processStream(input, output, null_flush, surface_forms, compound_sep); } diff --git a/apertium/apertium_re.cc b/apertium/apertium_re.cc index 7182614..01599b3 100644 --- a/apertium/apertium_re.cc +++ b/apertium/apertium_re.cc @@ -18,155 +18,114 @@ #include #include #include -#include +#include -using namespace Apertium; using namespace std; +using namespace icu; -std::string& pcre_version_endian() { - static std::string pve; - if (pve.empty()) { - pve = pcre_version(); -#ifdef WORDS_BIGENDIAN - pve += "-be"; -#else - pve += "-le"; -#endif - } - return pve; -} - -ApertiumRE::ApertiumRE() : -re(0) -{ - empty = true; -} +ApertiumRE::ApertiumRE() {} ApertiumRE::~ApertiumRE() { - if(!empty) - { - pcre_free(re); + if (re != nullptr) { + delete re; } - empty = true; } void ApertiumRE::read(FILE *input) { unsigned int size = Compression::multibyte_read(input); - re = static_cast(pcre_malloc(size)); - if(size != fread(re, 1, size, input)) - { - wcerr << L"Error reading regexp" << endl; + if (fseek(input, size, SEEK_CUR) != 0) { + cerr << "Error reading regexp" << endl; exit(EXIT_FAILURE); } - - empty = false; } void -ApertiumRE::compile(string const &str) +ApertiumRE::compile(UString const &str) { - const char *error; - int erroroffset; - re = pcre_compile(str.c_str(), PCRE_DOTALL|PCRE_CASELESS|PCRE_EXTENDED|PCRE_UTF8, - &error, &erroroffset, NULL); - if(re == NULL) - { - wcerr << L"Error: pcre_compile "; - wcerr << error << endl; + if (re != nullptr) { + delete re; + } + UnicodeString s = str.c_str(); + UErrorCode err = U_ZERO_ERROR; + re = RegexPattern::compile(s, UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, err); + if(U_FAILURE(err)) { + cerr << "Error: unable to compile regular expression '" << str << "'." << endl; + cerr << "error code: " << u_errorName(err) << endl; exit(EXIT_FAILURE); } - - empty = false; } void ApertiumRE::write(FILE *output) const { - if(empty) - { - wcerr << L"Error, cannot write empty regexp" << endl; + if (re == nullptr) { + cerr << "Error, cannot write empty regexp" << endl; exit(EXIT_FAILURE); } + // for backwards compatibility, write empty binary form + Compression::multibyte_write(0, output); +} - size_t size; - int rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size); - if(rc < 0) - { - wcerr << L"Error calling pcre_fullinfo()\n" << endl; - exit(EXIT_FAILURE); +UString +ApertiumRE::match(UString const &str) const +{ + if(re == nullptr) { + return ""_u; } - Compression::multibyte_write(size, output); + UnicodeString s = str.c_str(); + UErrorCode err = U_ZERO_ERROR; + RegexMatcher* m = re->matcher(s, err); - size_t rc2 = fwrite(re, 1, size, output); - if(rc2 != size) - { - wcerr << L"Error writing precompiled regex\n" << endl; + if (U_FAILURE(err)) { + cerr << "Error: Unable to apply regexp" << endl; + cerr << "error code: " << u_errorName(err) << endl; exit(EXIT_FAILURE); } -} -string -ApertiumRE::match(string const &str) const -{ - if(empty) - { - return ""; + if (!m->find()) { + return ""_u; } - int result[3]; - int workspace[4096]; -// int rc = pcre_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3); - int rc = pcre_dfa_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3, workspace, 4096); - - if(rc < 0) - { - switch(rc) - { - case PCRE_ERROR_NOMATCH: - return ""; - - default: - wcerr << L"Error: Unknown error matching regexp (code " << rc << L")" << endl; - exit(EXIT_FAILURE); - } + UString ret = m->group(err).getTerminatedBuffer(); + if (U_FAILURE(err)) { + cerr << "Error: Unable to extract substring from regexp match" << endl; + cerr << "error code: " << u_errorName(err) << endl; + exit(EXIT_FAILURE); } - return str.substr(result[0], result[1]-result[0]); + return ret; } // Return true if something was replaced and false otherwise bool -ApertiumRE::replace(string &str, string const &value) const +ApertiumRE::replace(UString &str, UString const &value) const { - if(empty) - { + if(re == nullptr) { return false; } - int result[3]; - int workspace[4096]; - // int rc = pcre_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3); - int rc = pcre_dfa_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3, workspace, 4096); - if(rc < 0) - { - switch(rc) - { - case PCRE_ERROR_NOMATCH: - return false; - - default: - wcerr << L"Error: Unknown error matching regexp (code " << rc << L")" << endl; - exit(EXIT_FAILURE); - } + UnicodeString s = str.c_str(); + UErrorCode err = U_ZERO_ERROR; + RegexMatcher* m = re->matcher(s, err); + + if (U_FAILURE(err)) { + cerr << "Error: Unable to apply regexp" << endl; + cerr << "error code: " << u_errorName(err) << endl; + exit(EXIT_FAILURE); } - string res = str.substr(0, result[0]); + // do this manually rather than call m->replaceFirst() + // because we want to know that a match happened + if (!m->find()) { + return false; + } + UString res = str.substr(0, m->start(err)); res.append(value); - res.append(str.substr(result[1])); - str = res; + res.append(str.substr(m->end(err))); + res.swap(str); return true; } diff --git a/apertium/apertium_re.h b/apertium/apertium_re.h index c9cb8c0..3cca42b 100644 --- a/apertium/apertium_re.h +++ b/apertium/apertium_re.h @@ -18,27 +18,24 @@ #ifndef _APERTIUM_RE_ #define _APERTIUM_RE_ -#include #include -#include +#include +#include using namespace std; -std::string& pcre_version_endian(); - class ApertiumRE { private: - bool empty; - pcre *re; + icu::RegexPattern* re = nullptr; public: ApertiumRE(); ~ApertiumRE(); void read(FILE *); void write(FILE *) const; - string match(string const &str) const; - bool replace(string &str, string const &value) const; - void compile(string const &str); + UString match(UString const &str) const; + bool replace(UString &str, UString const &value) const; + void compile(UString const &str); }; #endif diff --git a/apertium/apertium_tagger.cc b/apertium/apertium_tagger.cc index 8f6bb74..a384330 100644 --- a/apertium/apertium_tagger.cc +++ b/apertium/apertium_tagger.cc @@ -33,7 +33,7 @@ int main(int argc, char **argv) { try { Apertium::apertium_tagger(argc, argv); } catch (const Apertium::Exception::apertium_tagger::err_Exception &err_Exception_) { - std::wcerr << "Try 'apertium-tagger --help' for more information." << std::endl; + std::cerr << "Try 'apertium-tagger --help' for more information." << std::endl; return 1; } catch (...) { throw; diff --git a/apertium/apertium_tagger_apply_new_rules.cc b/apertium/apertium_tagger_apply_new_rules.cc index a8238e6..93974ac 100644 --- a/apertium/apertium_tagger_apply_new_rules.cc +++ b/apertium/apertium_tagger_apply_new_rules.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include using namespace Apertium; @@ -38,17 +38,17 @@ TTag eos; //End-of-sentence tag void check_file(FILE *f, const string& path) { if (!f) { - wcerr<<"Error: cannot open file '"< #include #include #include @@ -25,7 +24,7 @@ #include #include -#include +#include using namespace std; @@ -36,13 +35,13 @@ bool check_ambclasses; void check_file(FILE *f, const string& path) { if (!f) { - wcerr<<"Error: cannot open file '"<get_superficial_form())<<" "<get_string_tags())<<"\n"; + cout << word->get_superficial_form() << " " << word->get_string_tags() << "\n"; if (check_ambclasses) { int k=tagger_data_hmm.getOutput()[word->get_tags()]; if ((k>=tagger_data_hmm.getM())||(k<0)) { - wcerr<<"Error: Ambiguity class number out of range: "<get_superficial_form())<<"\n"; - wcerr<<"Ambiguity class: "<get_string_tags())<<"\n"; + cerr<<"Error: Ambiguity class number out of range: "<get_superficial_form() << "\n"; + cerr<<"Ambiguity class: "<< word->get_string_tags() << "\n"; } } @@ -69,15 +68,15 @@ void readwords (FILE *is, int corpus_length) { word=lexmorfo.get_next_word(); } - wcerr<] < file.crp \n\n"; + cerr<<"USAGE:\n"; + cerr<] < file.crp \n\n"; - wcerr<<"ARGUMENTS: \n" + cerr<<"ARGUMENTS: \n" <<" --tsxfile|-x: Specify a tagger specification file\n" <<" --probfile|-p: Specify a tagger parameter file\n" <<" --clength|-l: Specify the length of the corpus to process\n"; @@ -92,12 +91,12 @@ int main(int argc, char* argv[]) { int c; int option_index=0; - wcerr<<"LOCALE: "< #include -#include -#include +#include #include "apertium_config.h" #include -using namespace Apertium; using namespace std; void usage(char *progname) { - wcerr << L"USAGE: " << basename(progname) << L" [options] code1 code2 doc1 doc2 [output_file]" << endl; - wcerr << L"Options:" << endl; - wcerr << L" -p percent number 0 < n <= 1 to set margin of confidence of TU's " << endl; - wcerr << L" (0.85 by default) in length terms" << endl; - wcerr << L" -e edit number 0 < n <= 1 to set margin of confidence of TU's " << endl; - wcerr << L" (0.30 by default) in edit distance terms" << endl; - wcerr << L" -l low-limit ignore percent if the segment is less than lowlimit" < #include #include -#include +#include #include "getopt_long.h" #ifdef _MSC_VER #include #include #endif -using namespace Apertium; using namespace std; void message(char *progname) { - wcerr << "USAGE: " << basename(progname) << " trules preproc biltrans [input [output]]" << endl; - wcerr << " " << basename(progname) << " -b trules preproc [input [output]]" << endl; - wcerr << " " << basename(progname) << " -n trules preproc [input [output]]" << endl; - wcerr << " " << basename(progname) << " -x extended trules preproc biltrans [input [output]]" << endl; - wcerr << " " << basename(progname) << " -c trules preproc biltrans [input [output]]" << endl; - wcerr << " " << basename(progname) << " -t trules preproc biltrans [input [output]]" << endl; - wcerr << " trules transfer rules file" << endl; - wcerr << " preproc result of preprocess trules file" << endl; - wcerr << " biltrans bilingual letter transducer file" << endl; - wcerr << " input input file, standard input by default" << endl; - wcerr << " output output file, standard output by default" << endl; - wcerr << " -b input from lexical transfer" << endl; - wcerr << " -n don't use bilingual dictionary" << endl; - wcerr << " -x bindix extended mode with user dictionary" << endl; - wcerr << " -c case-sensitiveness while accessing bilingual dictionary" << endl; - wcerr << " -t trace (show rule numbers and patterns matched)" << endl; - wcerr << " -T trace, for apertium-transfer-tools (also sets -t)" << endl; - wcerr << " -z null-flushing output on '\0'" << endl; - wcerr << " -h shows this message" << endl; + cerr << "USAGE: " << basename(progname) << " trules preproc biltrans [input [output]]" << endl; + cerr << " " << basename(progname) << " -b trules preproc [input [output]]" << endl; + cerr << " " << basename(progname) << " -n trules preproc [input [output]]" << endl; + cerr << " " << basename(progname) << " -x extended trules preproc biltrans [input [output]]" << endl; + cerr << " " << basename(progname) << " -c trules preproc biltrans [input [output]]" << endl; + cerr << " " << basename(progname) << " -t trules preproc biltrans [input [output]]" << endl; + cerr << " trules transfer rules file" << endl; + cerr << " preproc result of preprocess trules file" << endl; + cerr << " biltrans bilingual letter transducer file" << endl; + cerr << " input input file, standard input by default" << endl; + cerr << " output output file, standard output by default" << endl; + cerr << " -b input from lexical transfer" << endl; + cerr << " -n don't use bilingual dictionary" << endl; + cerr << " -x bindix extended mode with user dictionary" << endl; + cerr << " -c case-sensitiveness while accessing bilingual dictionary" << endl; + cerr << " -t trace (show rule numbers and patterns matched)" << endl; + cerr << " -T trace, for apertium-transfer-tools (also sets -t)" << endl; + cerr << " -z null-flushing output on '\0'" << endl; + cerr << " -h shows this message" << endl; exit(EXIT_FAILURE); @@ -64,32 +63,27 @@ void testfile(string const &filename) struct stat mybuf; if(stat(filename.c_str(), &mybuf) == -1) { - wcerr << "Error: can't stat file '"; - wcerr << filename << "'." << endl; + cerr << "Error: can't stat file '"; + cerr << filename << "'." << endl; exit(EXIT_FAILURE); } } -FILE * open_input(string const &filename) +void open_input(InputFile& input, const char* filename) { - FILE *input = fopen(filename.c_str(), "r"); - if(!input) - { - wcerr << "Error: can't open input file '"; - wcerr << filename.c_str() << "'." << endl; + if (!input.open(filename)) { + cerr << "Error: can't open input file '"; + cerr << filename << "'." << endl; exit(EXIT_FAILURE); } - - return input; } -FILE * open_output(string const &filename) +UFILE* open_output(const char* filename) { - FILE *output = fopen(filename.c_str(), "w"); - if(!output) - { - wcerr << "Error: can't open output file '"; - wcerr << filename.c_str() << "'." << endl; + UFILE* output = u_fopen(filename, "w", NULL, NULL); + if(!output) { + cerr << "Error: can't open output file '"; + cerr << filename << "'." << endl; exit(EXIT_FAILURE); } return output; @@ -107,13 +101,13 @@ int main(int argc, char *argv[]) static struct option long_options[] = { {"from-bilingual", no_argument, 0, 'b'}, - {"no-bilingual", no_argument, 0, 'n'}, + {"no-bilingual", no_argument, 0, 'n'}, {"extended", required_argument, 0, 'x'}, - {"case-sensitive", no_argument, 0, 'c'}, - {"null-flush", no_argument, 0, 'z'}, - {"trace", no_argument, 0, 't'}, - {"trace_att", no_argument, 0, 'T'}, - {"help", no_argument, 0, 'h'}, + {"case-sensitive", no_argument, 0, 'c'}, + {"null-flush", no_argument, 0, 'z'}, + {"trace", no_argument, 0, 't'}, + {"trace_att", no_argument, 0, 'T'}, + {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; @@ -160,13 +154,14 @@ int main(int argc, char *argv[]) } } - FILE *input = stdin, *output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); switch(argc - optind + 1) { case 6: output = open_output(argv[argc-1]); - input = open_input(argv[argc-2]); + open_input(input, argv[argc-2]); testfile(argv[argc-3]); testfile(argv[argc-4]); testfile(argv[argc-5]); @@ -177,14 +172,14 @@ int main(int argc, char *argv[]) if(t.getUseBilingual() == false || t.getPreBilingual() == true) { output = open_output(argv[argc-1]); - input = open_input(argv[argc-2]); + open_input(input, argv[argc-2]); testfile(argv[argc-3]); testfile(argv[argc-4]); t.read(argv[argc-4], argv[argc-3]); } else { - input = open_input(argv[argc-1]); + open_input(input, argv[argc-1]); testfile(argv[argc-2]); testfile(argv[argc-3]); testfile(argv[argc-4]); @@ -195,7 +190,7 @@ int main(int argc, char *argv[]) case 4: if(t.getUseBilingual() == false || t.getPreBilingual() == true) { - input = open_input(argv[argc-1]); + open_input(input, argv[argc-1]); testfile(argv[argc-2]); testfile(argv[argc-3]); t.read(argv[argc-3], argv[argc-2]); @@ -226,11 +221,6 @@ int main(int argc, char *argv[]) break; } -#ifdef _MSC_VER - _setmode(_fileno(input), _O_U8TEXT); - _setmode(_fileno(output), _O_U8TEXT); -#endif - t.transfer(input, output); return EXIT_SUCCESS; } diff --git a/apertium/collection.cc b/apertium/collection.cc index 995c14a..ab80607 100644 --- a/apertium/collection.cc +++ b/apertium/collection.cc @@ -16,7 +16,7 @@ */ #include #include -#include +#include #include #include diff --git a/apertium/constant_manager.cc b/apertium/constant_manager.cc index c4ddb32..5302171 100644 --- a/apertium/constant_manager.cc +++ b/apertium/constant_manager.cc @@ -16,7 +16,7 @@ */ #include #include -#include +#include #include #include @@ -57,13 +57,13 @@ ConstantManager::operator =(ConstantManager const &o) return *this; } void -ConstantManager::setConstant(wstring const &constant, int const value) +ConstantManager::setConstant(UString const &constant, int const value) { constants[constant] = value; } int -ConstantManager::getConstant(wstring const &constant) +ConstantManager::getConstant(UString const &constant) { return constants[constant]; } @@ -73,10 +73,10 @@ ConstantManager::write(FILE *output) { Compression::multibyte_write(constants.size(), output); - for(map::const_iterator it = constants.begin(), limit = constants.end(); + for(map::const_iterator it = constants.begin(), limit = constants.end(); it != limit; it++) { - Compression::wstring_write(it->first, output); + Compression::string_write(it->first, output); Compression::multibyte_write(it->second, output); } } @@ -88,7 +88,7 @@ ConstantManager::read(FILE *input) int size = Compression::multibyte_read(input); for(int i = 0; i != size; i++) { - wstring mystr = Compression::wstring_read(input); + UString mystr = Compression::string_read(input); constants[mystr] = Compression::multibyte_read(input); } } @@ -96,11 +96,11 @@ ConstantManager::read(FILE *input) void ConstantManager::serialise(std::ostream &serialised) const { - Serialiser >::serialise(constants, serialised); + Serialiser >::serialise(constants, serialised); } void ConstantManager::deserialise(std::istream &serialised) { - constants = Deserialiser >::deserialise(serialised); + constants = Deserialiser >::deserialise(serialised); } diff --git a/apertium/constant_manager.h b/apertium/constant_manager.h index d8ed3f7..395edae 100644 --- a/apertium/constant_manager.h +++ b/apertium/constant_manager.h @@ -20,13 +20,14 @@ #include #include #include +#include using namespace std; class ConstantManager { private: - map constants; + map constants; void copy(ConstantManager const &o); void destroy(); @@ -36,8 +37,8 @@ public: ConstantManager(ConstantManager const &o); ConstantManager & operator =(ConstantManager const &o); - void setConstant(wstring const &constant, int const value); - int getConstant(wstring const &constant); + void setConstant(UString const &constant, int const value); + int getConstant(UString const &constant); void write(FILE *output); void read(FILE *input); void serialise(std::ostream &serialised) const; diff --git a/apertium/deformat.xsl b/apertium/deformat.xsl index fd93401..b3da49b 100644 --- a/apertium/deformat.xsl +++ b/apertium/deformat.xsl @@ -90,15 +90,12 @@ - + - - - - + @@ -164,50 +161,45 @@ extern "C" { } #include <lttoolbox/lt_locale.h> -#include <lttoolbox/ltstr.h> -#include <apertium/string_to_wostream.h> +#include <lttoolbox/ustring.h> #ifndef GENFORMAT #include "apertium_config.h" #endif -#include <utf8/utf8.h> +#include <utf8.h> #include <apertium/unlocked_cstdio.h> -#ifdef _WIN32 -#include <io.h> -#include <fcntl.h> -#define utf8to32 utf8to16 -#define utf32to8 utf16to8 -#endif using namespace std; -wstring buffer; +UString buffer; string symbuf; bool isDot, isEoh, hasWrite_dot, hasWrite_white; bool eosIncond; bool noDot; bool markEoh; -FILE *formatfile; +UFILE* formatfile; string last; int current; long int offset; vector<long int> offsets; -vector<wstring> tags; +vector<UString> tags; vector<int> orders; regex_t escape_chars; regex_t names_regexp; -void bufferAppend(wstring &buf, string const &str) +void bufferAppend(UString &buf, string const &str) { - symbuf.append(str); - if (utf8::is_valid(symbuf.begin(), symbuf.end())) { - utf8::utf8to32(symbuf.begin(), symbuf.end(), std::back_inserter(buf)); - symbuf.clear(); - } + buf += to_ustring(str.c_str()); } +void put(const UString& str, FILE* f) +{ + string temp; + utf8::utf16to8(str.begin(), str.end(), std::back_inserter(temp)); + fputs_unlocked(temp.c_str(), f); +} void init_escape() { @@ -218,7 +210,7 @@ void init_escape() ", REG_EXTENDED)) { - wcerr << "ERROR: Illegal regular expression for escape characters" << endl; + cerr << "ERROR: Illegal regular expression for escape characters" << endl; exit(EXIT_FAILURE); } } @@ -232,7 +224,7 @@ void init_tagNames() ", REG_EXTENDED)) { - wcerr << "ERROR: Illegal regular expression for tag-names" << endl; + cerr << "ERROR: Illegal regular expression for tag-names" << endl; exit(EXIT_FAILURE); } } @@ -254,20 +246,20 @@ string backslash(string const &str) } -wstring escape(string const &str) +UString escape(string const &str) { regmatch_t pmatch; char const *mystring = str.c_str(); int base = 0; - wstring result; + UString result; while(!regexec(&escape_chars, mystring + base, 1, &pmatch, 0)) { bufferAppend(result, str.substr(base, pmatch.rm_so)); - result += L'\\'; + result += '\\'; const char *mb = str.c_str() + base + pmatch.rm_so; - wchar_t micaracter = utf8::next(mb, mb+4); + UChar32 micaracter = utf8::next(mb, mb+4); result += micaracter; base += pmatch.rm_eo; @@ -277,10 +269,10 @@ wstring escape(string const &str) return result; } -wstring escape(wstring const &str) +UString escape(UString const &str) { string dest; - utf8::utf32to8(str.begin(), str.end(), std::back_inserter(dest)); + utf8::utf16to8(str.begin(), str.end(), std::back_inserter(dest)); return escape(dest); } @@ -302,7 +294,7 @@ string get_tagName(string tag){ - + @@ -313,9 +305,9 @@ string get_tagName(string tag){ - + - + @@ -328,7 +320,7 @@ int get_index(string end_tag){ for (int i=tags.size()-1; i >= 0; i--) { new_end_tag.clear(); - utf8::utf32to8(tags[i].begin(), tags[i].end(), std::back_inserter(new_end_tag)); + utf8::utf16to8(tags[i].begin(), tags[i].end(), std::back_inserter(new_end_tag)); if (get_tagName(end_tag) == get_tagName(new_end_tag)) return i; @@ -338,15 +330,8 @@ int get_index(string end_tag){ } void print_emptyTags() { - wchar_t tag[250]; - for (size_t i=0; i < tags.size(); i++) { - swprintf(tag, 250, L"<format-tag offset=\"%d\" order= \"%d\"><![CDATA[", offsets[i], orders[i]); - fputws(tag, formatfile); - fputws(tags[i].c_str(), formatfile); - fputwc(L']', formatfile); - swprintf(tag, 250, L"]></format-tag>\n"); - fputws(tag, formatfile); + u_fprintf(formatfile, "<format-tag offset=\"%d\" order= \"%d\"><![CDATA[%S]></format-tag>\n", offsets[i], orders[i], tags[i].c_str()); } } @@ -355,14 +340,11 @@ void print_emptyTags() { void printBuffer(int ind=-1, string end_tag="") { - wchar_t tag[250]; - wstring etiketa; - wstring wend_tag; + UString etiketa; + UString wend_tag = to_ustring(end_tag.c_str()); size_t pos; int num; - utf8::utf8to32(end_tag.begin(), end_tag.end(), std::back_inserter(wend_tag)); - if (ind != -1 && ind == tags.size()-1 && offsets[ind] == offset && orders[ind] == current) { @@ -372,7 +354,7 @@ void printBuffer(int ind=-1, string end_tag="") offsets.pop_back(); orders.pop_back(); } - else if (ind == -1 && wend_tag != L"") + else if (ind == -1 && !wend_tag.empty()) { last = "buffer"; buffer = buffer + wend_tag; @@ -382,10 +364,9 @@ void printBuffer(int ind=-1, string end_tag="") // isEoh handling TODO matxin format if (hasWrite_dot && isDot) { - swprintf(tag, 250, L"<empty-tag offset=\"%d\"/>\n", offset+1); - fputws(tag, formatfile); + u_fprintf(formatfile, "<empty-tag offset=\"%d\"/>\n", offset+1); - fputws(L" .\n", yyout); + fputs(" .\n", yyout); offset += 2; hasWrite_dot = false; } @@ -396,24 +377,22 @@ void printBuffer(int ind=-1, string end_tag="") { if (hasWrite_white) { - fputws(L" ", yyout); + fputs(" ", yyout); offset++; hasWrite_white = false; } current++; - swprintf(tag, 250, L"<format-tag offset=\"%d\" order=\"%d\"><![CDATA[", offset, current); - fputws(tag, formatfile); - while ((pos = buffer.find(L"]]>")) != wstring::npos) - buffer.replace(pos, 3, L"\\]\\]\\>"); - fputws(buffer.c_str(), formatfile); - swprintf(tag, 250, L"]]></format-tag>\n"); - fputws(tag, formatfile); + u_fprintf(formatfile, "<format-tag offset=\"%d\" order=\"%d\"><![CDATA[", offset, current); + while ((pos = buffer.find("]]>")) != UString::npos) + buffer.replace(pos, 3, "\\]\\]\\>"_u); + write(buffer, formatfile); + u_fprintf(formatfile, "]]></format-tag>\n"); } else { - fputws(buffer.c_str(), yyout); + put(buffer, yyout); offset += buffer.size(); } @@ -422,30 +401,27 @@ void printBuffer(int ind=-1, string end_tag="") { if (hasWrite_white) { - fputws(L" ", yyout); + fputc(' ', yyout); offset++; hasWrite_white = false; } - num = swprintf(tag, 250, L"<open-close-tag>\n"); - swprintf(tag + num, 250 - num, L"<open-tag offset=\"%d\" order=\"%d\"><![CDATA[", offsets[ind], orders[ind]); - fputws(tag, formatfile); + u_fprintf(formatfile, "<open-close-tag>\n"); + u_fprintf(formatfile, "<open-tag offset=\"%d\"order=\"%d\"><![CDATA[", offsets[ind], orders[ind]); etiketa = tags[ind]; - while ((pos = etiketa.find(L"]]>")) != wstring::npos) - etiketa.replace(pos, 3, L"\\]\\]\\>"); - fputws(etiketa.c_str(), formatfile); + while ((pos = etiketa.find("]]>"_u)) != UString::npos) + etiketa.replace(pos, 3, "\\]\\]\\>"_u); + write(etiketa, formatfile); current++; - num = swprintf(tag, 250, L"]]></open-tag>\n"); - swprintf(tag + num, 250 - num, L"<close-tag offset=\"%d\" order=\"%d\"><![CDATA[", offset, current); - fputws(tag, formatfile); - while ((pos = wend_tag.find(L"]]>")) != wstring::npos) - wend_tag.replace(pos, 3, L"\\]\\]\\>"); - fputws(wend_tag.c_str(), formatfile); - num = swprintf(tag, 250, L"]]></close-tag>\n"); - swprintf(tag + num, 250 - num, L"</open-close-tag>\n"); - fputws(tag, formatfile); + u_fprintf(formatfile, "]]></open-tag>\n"); + u_fprintf(formatfile, "<close-tag offset=\"%d\" order=\"%d\"><![CDATA[", offset, current); + while ((pos = wend_tag.find("]]>"_u)) != UString::npos) + wend_tag.replace(pos, 3, "\\]\\]\\>"_u); + write(wend_tag, formatfile); + u_fprintf(formatfile, "]]></close-tag>\n"); + u_fprintf(formatfile, "</open-close-tag>\n"); tags.erase(tags.begin() + ind); offsets.erase(offsets.begin() + ind); @@ -454,7 +430,7 @@ void printBuffer(int ind=-1, string end_tag="") last = "buffer"; - buffer = L""; + buffer.clear(); } } @@ -467,11 +443,11 @@ void preDot() { if(noDot) { - fputws_unlocked(L"[]", yyout); + fputs_unlocked("[]", yyout); } else { - fputws_unlocked(L".[]", yyout); + fputs_unlocked(".[]", yyout); } } } @@ -480,66 +456,64 @@ void printBuffer() { if(isEoh && markEoh) { - fputws_unlocked(L"[]\x2761", yyout); + put(u"[]\u2761", yyout); isEoh = false; } if(isDot && !eosIncond) { if(noDot) { - fputws_unlocked(L"[]", yyout); + fputs_unlocked("[]", yyout); } else { - fputws_unlocked(L".[]", yyout); + fputs_unlocked(".[]", yyout); } isDot = false; } if(buffer.size() > ) { string filename = tmpnam(NULL); - FILE *largeblock = fopen(filename.c_str(), "wb"); - fputws_unlocked(buffer.c_str(), largeblock); - fclose(largeblock); + UFILE *largeblock = u_fopen(filename.c_str(), "wb", NULL, NULL); + write(buffer, largeblock); + u_fclose(largeblock); preDot(); - fputwc_unlocked(L'[', yyout); - fputwc_unlocked(L'@', yyout); - std::wstring cad; - utf8::utf8to32(filename.begin(), filename.end(), std::back_inserter(cad)); - fputws_unlocked(cad.c_str(), yyout); - fputwc_unlocked(L']', yyout); + fputc_unlocked('[', yyout); + fputc_unlocked('@', yyout); + fputs_unlocked(filename.c_str(), yyout); + fputc_unlocked(']', yyout); } else if(buffer.size() > 1) { preDot(); - fputwc_unlocked(L'[', yyout); - wstring const tmp = escape(buffer); - if(tmp[0] == L'@') + fputc_unlocked('[', yyout); + UString const tmp = escape(buffer); + if(tmp[0] == '@') { - fputwc_unlocked(L'\\', yyout); + fputc_unlocked('\\', yyout); } - fputws_unlocked(tmp.c_str(), yyout); - fputwc_unlocked(L']', yyout); + put(tmp, yyout); + fputc_unlocked(']', yyout); } - else if(buffer.size() == 1 && buffer[0] != L' ') + else if(buffer.size() == 1 && buffer[0] != ' ') { preDot(); - fputwc_unlocked(L'[', yyout); - wstring const tmp = escape(buffer); - if(tmp[0] == L'@') + fputc_unlocked('[', yyout); + UString const tmp = escape(buffer); + if(tmp[0] == '@') { - fputwc_unlocked(L'\\', yyout); + fputc_unlocked('\\', yyout); } - fputws_unlocked(tmp.c_str(), yyout); + put(tmp, yyout); - fputwc_unlocked(L']', yyout); + fputc_unlocked(']', yyout); } else { - fputws_unlocked(buffer.c_str(), yyout); + put(buffer, yyout); } - buffer = L""; + buffer.clear(); } @@ -618,9 +592,9 @@ void printBuffer() - + - + @@ -637,12 +611,11 @@ void printBuffer() { printBuffer(); - fputwc_unlocked(L'\\', yyout); + fputc_unlocked('\\', yyout); offset++; const char *mb = yytext; - wchar_t symbol = utf8::next(mb, mb+4); - - fputwc_unlocked(symbol, yyout); + UChar32 symbol = utf8::next(mb, mb+4); + put(UString(1, symbol), yyout); offset++; hasWrite_dot = hasWrite_white = true; @@ -654,9 +627,9 @@ void printBuffer() if (utf8::is_valid(symbuf.begin(), symbuf.end())) { const char *mb = symbuf.c_str(); - wchar_t symbol = utf8::next(mb, mb+4); + UChar32 symbol = utf8::next(mb, mb+4); symbuf.clear(); - fputwc_unlocked(symbol, yyout); + put(UString(1, symbol), yyout); offset++; hasWrite_dot = hasWrite_white = true; } @@ -677,20 +650,20 @@ void usage(string const &progname) { - wcerr << "USAGE: " << progname << " format_file [input_file [output_file]" << ']' << endl; + cerr << "USAGE: " << progname << " format_file [input_file [output_file]" << ']' << endl; - wcerr << "USAGE: " << progname << " [ -h | -o | -i | -n ] [input_file [output_file]" << ']' << endl; + cerr << "USAGE: " << progname << " [ -h | -o | -i | -n ] [input_file [output_file]" << ']' << endl; - wcerr << " format processor " << endl; + cerr << " format processor " << endl; exit(EXIT_SUCCESS); } int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); - size_t base = 0; + int base = 0; eosIncond = false; if(argc >= 2) @@ -740,7 +713,7 @@ int main(int argc, char *argv[]) usage(argv[0]); } case 2: - formatfile = fopen(argv[1+base], "wb"); + formatfile = u_fopen(argv[1+base], "wb", NULL, NULL); if(!formatfile) { usage(argv[0]); @@ -751,35 +724,23 @@ int main(int argc, char *argv[]) } - if((argc-base) > 4) - { + if((argc-base) > 4) { usage(argv[0]); } - - switch(argc-base) - { - case 3: - yyout = fopen(argv[2+base], "wb"); - if(!yyout) - { - usage(argv[0]); - } - case 2: - yyin = fopen(argv[1+base], "rb"); - if(!yyin) - { - usage(argv[0]); - } - break; - default: - break; + if ((argc - base) == 3) { + yyout = fopen(argv[2 + base], "wb"); + if (!yyout) { + usage(argv[0]); + } + } + if ((argc - base) >= 2) { + yyin = fopen(argv[1 + base], "rb"); + if (!yyin) { + usage(argv[0]); + } } -#ifdef _MSC_VER - _setmode(_fileno(yyin), _O_U8TEXT); - _setmode(_fileno(yyout), _O_U8TEXT); -#endif // prevent warning message yy_push_state(1); yy_top_state(); @@ -792,8 +753,8 @@ int main(int argc, char *argv[]) - fputws(L"<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n", formatfile); - fputws(L"<format>\n", formatfile); + write("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"_u, formatfile); + write("<format>\n"_u, formatfile); last.clear(); @@ -807,7 +768,7 @@ int main(int argc, char *argv[]) print_emptyTags(); - fputws(L"</format>", formatfile); + write("</format>"_u, formatfile); fclose(formatfile); fclose(yyin); diff --git a/apertium/deserialiser.h b/apertium/deserialiser.h index ae40972..2f90ea2 100644 --- a/apertium/deserialiser.h +++ b/apertium/deserialiser.h @@ -90,13 +90,13 @@ i Deserialiser::deserialise(std::istream &Stream_) { Lemma Deserialiser::deserialise(std::istream &Stream_) { Lemma StreamedType_; - StreamedType_.TheLemma = Deserialiser::deserialise(Stream_); + StreamedType_.TheLemma = Deserialiser::deserialise(Stream_); return StreamedType_; } Morpheme Deserialiser::deserialise(std::istream &Stream_) { Morpheme SerialisedType_; - SerialisedType_.TheLemma = Deserialiser::deserialise(Stream_); + SerialisedType_.TheLemma = Deserialiser::deserialise(Stream_); SerialisedType_.TheTags = Deserialiser >::deserialise(Stream_); return SerialisedType_; @@ -104,7 +104,7 @@ Morpheme Deserialiser::deserialise(std::istream &Stream_) { Tag Deserialiser::deserialise(std::istream &Stream_) { Tag SerialisedType_; - SerialisedType_.TheTag = Deserialiser::deserialise(Stream_); + SerialisedType_.TheTag = Deserialiser::deserialise(Stream_); return SerialisedType_; } diff --git a/apertium/exception.h b/apertium/exception.h index 2bda473..3b97a76 100644 --- a/apertium/exception.h +++ b/apertium/exception.h @@ -27,9 +27,8 @@ namespace Exception { EXCEPTION_TYPE(const char *const what_) : ExceptionType(what_) {} \ EXCEPTION_TYPE(const std::string &what_) : ExceptionType(what_) {} \ EXCEPTION_TYPE(const std::stringstream &what_) : ExceptionType(what_) {} \ - EXCEPTION_TYPE(const wchar_t *const what_) : ExceptionType(what_) {} \ - EXCEPTION_TYPE(const std::wstring &what_) : ExceptionType(what_) {} \ - EXCEPTION_TYPE(const std::wstringstream &what_) : ExceptionType(what_) {} \ + EXCEPTION_TYPE(const UChar *const what_) : ExceptionType(what_) {} \ + EXCEPTION_TYPE(const UString &what_) : ExceptionType(what_) {} \ ~EXCEPTION_TYPE() throw() {} \ }; diff --git a/apertium/exception_type.cc b/apertium/exception_type.cc index 0f32b45..7c1eec8 100644 --- a/apertium/exception_type.cc +++ b/apertium/exception_type.cc @@ -15,34 +15,31 @@ #include "exception_type.h" -#include "utf_converter.h" #include #include namespace Apertium { -ExceptionType::ExceptionType(const char *const what_) : what_(what_) {} +ExceptionType::ExceptionType(const char *const what_) + : what_(to_ustring(what_)) {} -ExceptionType::ExceptionType(const std::string &what_) : what_(what_) {} +ExceptionType::ExceptionType(const std::string &what_) + : what_(to_ustring(what_.c_str())) {} ExceptionType::ExceptionType(const std::stringstream &what_) - : what_(what_.str()) {} + : what_(to_ustring(what_.str().c_str())) {} -ExceptionType::ExceptionType(const wchar_t *const what_) -{ - this->what_ = UtfConverter::toUtf8(what_); -} +ExceptionType::ExceptionType(const UChar *const what_) + : what_(what_) {} -ExceptionType::ExceptionType(const std::wstring &what_) -{ - this->what_ = UtfConverter::toUtf8(what_); -} - -ExceptionType::ExceptionType(const std::wstringstream &what_) -{ - this->what_ = UtfConverter::toUtf8(what_.str()); -} +ExceptionType::ExceptionType(const UString &what_) + : what_(what_) {} ExceptionType::~ExceptionType() throw() {} -const char *ExceptionType::what() const throw() { return what_.c_str(); } +const char *ExceptionType::what() const throw() +{ + std::string res; + utf8::utf16to8(what_.begin(), what_.end(), std::back_inserter(res)); + return res.c_str(); +} } diff --git a/apertium/exception_type.h b/apertium/exception_type.h index a780b75..9ee46ac 100644 --- a/apertium/exception_type.h +++ b/apertium/exception_type.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace Apertium { class ExceptionType : public std::exception { @@ -26,14 +27,13 @@ public: ExceptionType(const char *const what_); ExceptionType(const std::string &what_); ExceptionType(const std::stringstream &what_); - ExceptionType(const wchar_t *wchar_t_what_); - ExceptionType(const std::wstring &wchar_t_what_); - ExceptionType(const std::wstringstream &wchar_t_what_); + ExceptionType(const UChar *wchar_t_what_); + ExceptionType(const UString &wchar_t_what_); virtual ~ExceptionType() throw() = 0; const char *what() const throw(); protected: - std::string what_; + UString what_; }; } diff --git a/apertium/feature_vec.cc b/apertium/feature_vec.cc index fc95d10..1119134 100644 --- a/apertium/feature_vec.cc +++ b/apertium/feature_vec.cc @@ -68,7 +68,7 @@ operator<<(OStream & out, FeatureVec const &fv) out << std::dec << (int)(*(bc_it++))[0] << "; "; for (;bc_it != feat_it->first.end(); bc_it++) { - out << bc_it->c_str(); + out << *bc_it; if (bc_it + 1 != feat_it->first.end()) { out << ", "; @@ -79,9 +79,6 @@ operator<<(OStream & out, FeatureVec const &fv) return out; } -template std::wostream& -operator<<(std::wostream& out, FeatureVec const &fv); - template std::ostream& operator<<(std::ostream& out, FeatureVec const &fv); diff --git a/apertium/feature_vec.h b/apertium/feature_vec.h index a4dcd6a..0df6b69 100644 --- a/apertium/feature_vec.h +++ b/apertium/feature_vec.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace Apertium { diff --git a/apertium/file_morpho_stream.cc b/apertium/file_morpho_stream.cc index 5040216..82d264f 100644 --- a/apertium/file_morpho_stream.cc +++ b/apertium/file_morpho_stream.cc @@ -21,34 +21,33 @@ */ #include -#include +#include #include "apertium_config.h" #include -using namespace Apertium; -FileMorphoStream::FileMorphoStream(FILE *ftxt, bool d, TaggerData *t) : +FileMorphoStream::FileMorphoStream(const char* ftxt, bool d, TaggerData *t) : ms() { foundEOF = false; debug=d; td = t; me = td->getPatternList().newMatchExe(); alphabet = td->getPatternList().getAlphabet(); - input = ftxt; + input.open(ftxt); ca_any_char = alphabet(PatternList::ANY_CHAR); ca_any_tag = alphabet(PatternList::ANY_TAG); ConstantManager &constants = td->getConstants(); - ca_kignorar = constants.getConstant(L"kIGNORAR"); - ca_kbarra = constants.getConstant(L"kBARRA"); - ca_kdollar = constants.getConstant(L"kDOLLAR"); - ca_kbegin = constants.getConstant(L"kBEGIN"); - ca_kmot = constants.getConstant(L"kMOT"); - ca_kmas = constants.getConstant(L"kMAS"); - ca_kunknown = constants.getConstant(L"kUNKNOWN"); + ca_kignorar = constants.getConstant("kIGNORAR"_u); + ca_kbarra = constants.getConstant("kBARRA"_u); + ca_kdollar = constants.getConstant("kDOLLAR"_u); + ca_kbegin = constants.getConstant("kBEGIN"_u); + ca_kmot = constants.getConstant("kMOT"_u); + ca_kmas = constants.getConstant("kMAS"_u); + ca_kunknown = constants.getConstant("kUNKNOWN"_u); - map &tag_index = td->getTagIndex(); - ca_tag_keof = tag_index[L"TAG_kEOF"]; - ca_tag_kundef = tag_index[L"TAG_kUNDEF"]; + map &tag_index = td->getTagIndex(); + ca_tag_keof = tag_index["TAG_kEOF"_u]; + ca_tag_kundef = tag_index["TAG_kUNDEF"_u]; end_of_file = false; null_flush = false; @@ -69,7 +68,7 @@ FileMorphoStream::get_next_word() if(word->isAmbiguous()) { - vector &ref = td->getDiscardRules(); + vector &ref = td->getDiscardRules(); for(unsigned int i = 0; i < ref.size(); i++) { word->discardOnAmbiguity(ref[i]); @@ -79,7 +78,7 @@ FileMorphoStream::get_next_word() return word; } - if(feof(input)) + if(input.eof()) { return NULL; } @@ -89,77 +88,68 @@ FileMorphoStream::get_next_word() while(true) { - int symbol = fgetwc_unlocked(input); - if(feof(input) || (null_flush && symbol == L'\0')) + UChar32 symbol = input.get(); + if(input.eof() || (null_flush && symbol == '\0')) { end_of_file = true; - vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); return get_next_word(); } - if(symbol == L'^') + if(symbol == '^') { readRestOfWord(ivwords); return get_next_word(); } else { - wstring str = L""; - if(symbol == L'\\') + UString str = ""_u; + if(symbol == '\\') { - symbol = fgetwc_unlocked(input); - str += L'\\'; - str += static_cast(symbol); - symbol = L'\\'; + symbol = input.get(); + str += '\\'; + str += symbol; + symbol = '\\'; } else { - str += static_cast(symbol); + str += symbol; } - while(symbol != L'^') + while(symbol != '^') { - symbol = fgetwc_unlocked(input); - if(feof(input) || (null_flush && symbol == L'\0')) - { - end_of_file = true; - vwords[ivwords]->add_ignored_string(str); - vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); - return get_next_word(); - } - else if(symbol == L'\\') - { - str += L'\\'; - symbol = fgetwc_unlocked(input); - if(feof(input) || (null_flush && symbol == L'\0')) - { - end_of_file = true; - vwords[ivwords]->add_ignored_string(str); - vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); - return get_next_word(); - } - str += static_cast(symbol); - symbol = L'\\'; - } - else if(symbol == L'^') - { - if(str.size() > 0) - { - vwords[ivwords]->add_ignored_string(str); + symbol = input.get(); + if(input.eof() || (null_flush && symbol == '\0')) { + end_of_file = true; + vwords[ivwords]->add_ignored_string(str); + vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); + return get_next_word(); + } else if(symbol == '\\') { + str += '\\'; + symbol = input.get(); + if(input.eof() || (null_flush && symbol == '\0')) { + end_of_file = true; + vwords[ivwords]->add_ignored_string(str); + vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); + return get_next_word(); } - readRestOfWord(ivwords); - return get_next_word(); - } - else - { - str += static_cast(symbol); - } + str += symbol; + symbol = '\\'; + } else if(symbol == '^') { + if(str.size() > 0) { + vwords[ivwords]->add_ignored_string(str); + } + readRestOfWord(ivwords); + return get_next_word(); + } else { + str += symbol; + } } } } } void -FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) +FileMorphoStream::lrlmClassify(UString const &str, int &ivwords) { int floor = 0; int last_type = -1; @@ -168,9 +158,9 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) ms.init(me->getInitial()); for(int i = 0, limit = str.size(); i != limit; i++) { - if(str[i] != L'<') + if(str[i] != '<') { - if(str[i] == L'+') + if(str[i] == '+') { int val = ms.classifyFinals(me->getFinals()); if(val != -1) @@ -179,18 +169,18 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) last_type = val; } } - ms.step(towlower(str[i]), ca_any_char); + ms.step(u_tolower(str[i]), ca_any_char); } else { - wstring tag = L""; + UString tag; for(int j = i+1; j != limit; j++) { - if(str[j] == L'\\') + if(str[j] == '\\') { j++; } - else if(str[j] == L'>') + else if(str[j] == '>') { tag = str.substr(i, j-i+1); i = j; @@ -216,7 +206,7 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) vwords[ivwords]->add_tag(last_type, str.substr(floor, last_pos - floor + 1), td->getPreferRules()); - if(str[last_pos+1] == L'+' && last_pos+1 < limit ) + if(str[last_pos+1] == '+' && last_pos+1 < limit ) { floor = last_pos + 1; last_pos = floor + 1; @@ -232,8 +222,8 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) { if (debug) { - wcerr<add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules()); return; @@ -248,7 +238,7 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) vwords[ivwords]->add_tag(last_type, str.substr(floor, last_pos - floor + 1), td->getPreferRules()); - if(str[last_pos+1] == L'+' && last_pos+1 < limit ) + if(str[last_pos+1] == '+' && last_pos+1 < limit ) { floor = last_pos + 1; last_pos = floor; @@ -264,8 +254,8 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) { if (debug) { - wcerr<add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules()); return; @@ -280,8 +270,8 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) val = ca_tag_kundef; if (debug) { - wcerr< 0) { vwords[ivwords]->add_ignored_string(str); - wcerr<get_superficial_form()<get_superficial_form()<<"\n"; + cerr<<"Debug: "<< str <<"\n"; } - vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); return; } - else if(symbol == L'\\') + else if(symbol == '\\') { - symbol = fgetwc_unlocked(input); - str += L'\\'; - str += static_cast(symbol); + symbol = input.get(); + str += '\\'; + str += symbol; } - else if(symbol == L'/') + else if(symbol == '/') { vwords[ivwords]->set_superficial_form(str); - str = L""; + str.clear(); break; } - else if(symbol == L'$') + else if(symbol == '$') { vwords[ivwords]->set_superficial_form(str); - vwords[ivwords]->add_ignored_string(L"$"); + vwords[ivwords]->add_ignored_string("$"_u); break; } else { - str += static_cast(symbol); + str += symbol; } } @@ -338,45 +328,45 @@ FileMorphoStream::readRestOfWord(int &ivwords) while(true) { - int symbol = fgetwc_unlocked(input); - if(feof(input) || (null_flush && symbol == L'\0')) + UChar32 symbol = input.get(); + if(input.eof() || (null_flush && symbol == '\0')) { end_of_file = true; if(str.size() > 0) { vwords[ivwords]->add_ignored_string(str); - wcerr<get_superficial_form()<get_superficial_form()<<"\n"; + cerr<<"Debug: "<< str <<"\n"; } - vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); return; } - else if(symbol == L'\\') + else if(symbol == '\\') { - symbol = fgetwc_unlocked(input); - str += L'\\'; - str += static_cast(symbol); - symbol = L'\\'; // to prevent exiting with '\$' + symbol = input.get(); + str += '\\'; + str += symbol; + symbol = '\\'; // to prevent exiting with '\$' } - else if(symbol == L'/') + else if(symbol == '/') { lrlmClassify(str, ivwords); - str = L""; + str.clear(); ivwords = 0; continue; } - else if(symbol == L'$') + else if(symbol == '$') { - if(str[0] != L'*')// do nothing with unknown words + if(str[0] != '*')// do nothing with unknown words { - lrlmClassify(str, ivwords); + lrlmClassify(str, ivwords); } return; } else { - str += static_cast(symbol); + str += symbol; } } } @@ -402,6 +392,6 @@ FileMorphoStream::setEndOfFile(bool eof) void FileMorphoStream::rewind() { - std::fseek(input, 0, SEEK_SET); + input.rewind(); end_of_file = false; } diff --git a/apertium/file_morpho_stream.h b/apertium/file_morpho_stream.h index 3d40802..fdf8871 100644 --- a/apertium/file_morpho_stream.h +++ b/apertium/file_morpho_stream.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -47,9 +48,9 @@ using namespace std; class FileMorphoStream : public MorphoStream { private: bool foundEOF; - wstring last_string_tag; + UString last_string_tag; bool debug; - FILE *input; + InputFile input; int ca_any_char; int ca_any_tag; int ca_kignorar; @@ -74,13 +75,13 @@ private: bool end_of_file; void readRestOfWord(int &ivwords); - void lrlmClassify(wstring const &str, int &ivwords); + void lrlmClassify(UString const &str, int &ivwords); public: /** Constructor * @param is the input stream. */ - FileMorphoStream(FILE *ftxt, bool d, TaggerData *t); + FileMorphoStream(const char* ftxt, bool d, TaggerData *t); /** * Destructor diff --git a/apertium/file_tagger.cc b/apertium/file_tagger.cc index cdce82c..f272a72 100644 --- a/apertium/file_tagger.cc +++ b/apertium/file_tagger.cc @@ -40,8 +40,8 @@ void FILE_Tagger::setNullFlush(const bool &NullFlush) { TheFlags.setNullFlush(NullFlush); } -void FILE_Tagger::tagger(FILE *Input, FILE *Output) { - FileMorphoStream morpho_stream(Input, TheFlags.getDebug(), &get_tagger_data()); +void FILE_Tagger::tagger(const char* input_file, UFILE *Output) { + FileMorphoStream morpho_stream(input_file, TheFlags.getDebug(), &get_tagger_data()); tagger(morpho_stream, Output); } @@ -51,13 +51,13 @@ void FILE_Tagger::init_and_train(MorphoStream &lexmorfo, unsigned long count) { train(lexmorfo, count); } -void FILE_Tagger::init_and_train(FILE *corpus, unsigned long count) { - init_probabilities_kupiec_(corpus); - train(corpus, count); +void FILE_Tagger::init_and_train(const char* corpus_file, unsigned long count) { + init_probabilities_kupiec_(corpus_file); + train(corpus_file, count); } -void FILE_Tagger::train(FILE *corpus, unsigned long count) { - FileMorphoStream lexmorfo(corpus, true, &get_tagger_data()); +void FILE_Tagger::train(const char* corpus_file, unsigned long count) { + FileMorphoStream lexmorfo(corpus_file, true, &get_tagger_data()); train(lexmorfo, count); } @@ -67,19 +67,20 @@ void FILE_Tagger::deserialise(string const &TaggerSpecificationFilename) { deserialise(TaggerSpecificationReader_.getTaggerData()); } -void FILE_Tagger::init_probabilities_from_tagged_text_(FILE *TaggedCorpus, - FILE *Corpus) { - FileMorphoStream stream_tagged(TaggedCorpus, true, &get_tagger_data()); - FileMorphoStream stream_untagged(Corpus, true, &get_tagger_data()); +void FILE_Tagger::init_probabilities_from_tagged_text_( + const char* tagged_file, const char* untagged_file) +{ + FileMorphoStream stream_tagged(tagged_file, true, &get_tagger_data()); + FileMorphoStream stream_untagged(untagged_file, true, &get_tagger_data()); init_probabilities_from_tagged_text_(stream_tagged, stream_untagged); } -void FILE_Tagger::init_probabilities_kupiec_(FILE *Corpus) { - FileMorphoStream lexmorfo(Corpus, true, &get_tagger_data()); +void FILE_Tagger::init_probabilities_kupiec_(const char* corpus_file) { + FileMorphoStream lexmorfo(corpus_file, true, &get_tagger_data()); init_probabilities_kupiec_(lexmorfo); } -void FILE_Tagger::read_dictionary(FILE *fdic) { +void FILE_Tagger::read_dictionary(const char* fdic) { tagger_utils::scan_for_ambg_classes(fdic, get_tagger_data()); tagger_utils::add_neccesary_ambg_classes(get_tagger_data()); post_ambg_class_scan(); diff --git a/apertium/file_tagger.h b/apertium/file_tagger.h index dc9a543..00496e9 100644 --- a/apertium/file_tagger.h +++ b/apertium/file_tagger.h @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -34,29 +36,29 @@ public: void set_debug(const bool &Debug); void set_show_sf(const bool &ShowSuperficial); void setNullFlush(const bool &NullFlush); - virtual void tagger(FILE *Input, FILE *Output); - virtual void tagger(MorphoStream &morpho_stream, FILE *Output) = 0; - virtual std::vector &getArrayTags() = 0; + virtual void tagger(const char* input_file, UFILE* Output); + virtual void tagger(MorphoStream &morpho_stream, UFILE* Output) = 0; + virtual std::vector &getArrayTags() = 0; void init_and_train(MorphoStream &lexmorfo, unsigned long Count); - void init_and_train(FILE *Corpus, unsigned long Count); - virtual void train(FILE *Corpus, unsigned long Count); + void init_and_train(const char* corpus_file, unsigned long Count); + virtual void train(const char* corpus_file, unsigned long Count); virtual void train(MorphoStream &lexmorpho, unsigned long count) = 0; virtual void train(MorphoStream &lexmorpho) = 0; virtual void serialise(FILE *Stream_) = 0; void deserialise(string const &TaggerSpecificationFilename); virtual void init_probabilities_from_tagged_text_( - FILE *TaggedCorpus, FILE *Corpus); + const char* tagged_file, const char* untagged_file); virtual void init_probabilities_from_tagged_text_( MorphoStream &stream_tagged, MorphoStream &stream_untagged) = 0; - virtual void init_probabilities_kupiec_(FILE *Corpus); + virtual void init_probabilities_kupiec_(const char* corpus_file); virtual void init_probabilities_kupiec_(MorphoStream &lexmorfo) = 0; /** It reads the expanded dictionary received as a parameter and calculates * the set of ambiguity classes that the tagger will manage. - * @param is the input stream with the expanded dictionary to read + * @param is the filename of expanded dictionary to read (or NULL for stdin) */ - void read_dictionary(FILE *is); + void read_dictionary(const char* is); virtual TaggerData& get_tagger_data() = 0; diff --git a/apertium/gen_modes.cc b/apertium/gen_modes.cc index 1dc7f8f..6dbb78c 100644 --- a/apertium/gen_modes.cc +++ b/apertium/gen_modes.cc @@ -21,8 +21,7 @@ #include #include #include -#include "string_utils.h" -#include "utf_converter.h" +#include #include #include #include @@ -32,7 +31,6 @@ #include #include -using namespace Apertium; using namespace std; void endProgram(char *name) diff --git a/apertium/hmm.cc b/apertium/hmm.cc index 9285217..3dd9414 100644 --- a/apertium/hmm.cc +++ b/apertium/hmm.cc @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include inline bool p_isnan(double v) { @@ -58,10 +58,10 @@ TaggerData& HMM::get_tagger_data() { void HMM::deserialise(FILE *Serialised_FILE_Tagger) { tdhmm.read(Serialised_FILE_Tagger); - eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; + eos = (tdhmm.getTagIndex())["TAG_SENT"_u]; } -std::vector &HMM::getArrayTags() { +std::vector &HMM::getArrayTags() { return tdhmm.getArrayTags(); } @@ -69,7 +69,7 @@ void HMM::serialise(FILE *Stream_) { tdhmm.write(Stream_); } void HMM::deserialise(const TaggerData &Deserialised_FILE_Tagger) { tdhmm = TaggerDataHMM(Deserialised_FILE_Tagger); - eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; + eos = (tdhmm.getTagIndex())["TAG_SENT"_u]; } void HMM::init_probabilities_from_tagged_text_(MorphoStream &stream_tagged, @@ -99,7 +99,7 @@ HMM::HMM(TaggerFlags& Flags_) : FILE_Tagger(Flags_) {} HMM::HMM(TaggerDataHMM _tdhmm) : tdhmm(_tdhmm) { - eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; + eos = (tdhmm.getTagIndex())["TAG_SENT"_u]; } HMM::HMM(TaggerDataHMM *tdhmm) : tdhmm(*tdhmm) {} @@ -193,7 +193,7 @@ HMM::init_probabilities_kupiec(MorphoStream &lexmorfo) //We count for each ambiguity class the number of ocurrences word = lexmorfo.get_next_word(); while((word)) { - if (++nw%10000==0) wcerr<get_tags(); @@ -265,7 +265,7 @@ HMM::init_probabilities_kupiec(MorphoStream &lexmorfo) } } } - wcerr<get_superficial_form()!=word_untagged->get_superficial_form()) { - wcerr<get_tags().size()==0) // Unknown word tag1 = -1; else if (word_tagged->get_tags().size()>1) // Ambiguous word - wcerr<get_superficial_form()<get_superficial_form()<<"\n"; else tag1 = *(word_tagged->get_tags()).begin(); @@ -368,7 +368,7 @@ HMM::init_probabilities_from_tagged_text(MorphoStream &stream_tagged, } } - wcerr< > ambiguity_classes; - FileMorphoStream morpho_stream(in, true, &tdhmm); + FileMorphoStream morpho_stream(input_file, true, &tdhmm); TaggerWord *word = morpho_stream.get_next_word(); @@ -434,7 +434,7 @@ HMM::filter_ambiguity_classes(FILE *in, FILE *out) { if(ambiguity_classes.find(tags) == ambiguity_classes.end()) { ambiguity_classes.insert(tags); word->outputOriginal(out); - //wcerr<get_string_tags()<get_string_tags()<<"\n"; } } delete word; @@ -474,12 +474,12 @@ HMM::train(MorphoStream &morpho_stream) { while (word) { - //wcerr<1) { @@ -521,8 +521,8 @@ HMM::train(MorphoStream &morpho_stream) { prob = alpha[len][tag]; - //wcerr<<"prob="<1) || ((tag!=eos)&&(tag != (tdhmm.getTagIndex())[L"TAG_kEOF"]))) { - wcerr << L"Warning: The last tag is not the end-of-sentence-tag " - << L"but rather " << tdhmm.getArrayTags()[tag] << L". Line: " << nw - << L". Pending: " << pending.size() << ". Tags: "; - wcerr << "\n"; + if ((pending.size()>1) || ((tag!=eos)&&(tag != (tdhmm.getTagIndex())["TAG_kEOF"_u]))) { + cerr << "Warning: The last tag is not the end-of-sentence-tag " + << "but rather " << tdhmm.getArrayTags()[tag] << ". Line: " << nw + << ". Pending: " << pending.size() << ". Tags: "; + cerr << "\n"; } int N = tdhmm.getN(); @@ -597,24 +597,24 @@ HMM::train(MorphoStream &morpho_stream) { j = jt->first; if (xsi[i][j]>0) { if (gamma[i]==0) { - wcerr<get_superficial_form()<get_string_tags()<get_superficial_form()<<"' "<get_string_tags()<<"\n"; } for (unsigned t=0; t1)&&(TheFlags.getDebug())) { - wstring errors; - errors = L"The text to disambiguate has finished, but there are ambiguous words that has not been disambiguated.\n"; - errors+= L"This message should never appears. If you are reading this ..... these are very bad news.\n"; - wcerr< &getArrayTags(); + std::vector &getArrayTags(); void serialise(FILE *Stream_); void deserialise(const TaggerData &Deserialised_FILE_Tagger); void init_probabilities_from_tagged_text_(MorphoStream &stream_tagged, @@ -143,7 +143,7 @@ public: * @param in the input stream with the untagged text to tag * @param out the output stream with the tagged text */ - void tagger(MorphoStream &morpho_stream, FILE *Output); + void tagger(MorphoStream &morpho_stream, UFILE* Output); /** Prints the A matrix. */ @@ -157,7 +157,7 @@ public: */ void print_ambiguity_classes(); - void filter_ambiguity_classes(FILE *in, FILE *out); + void filter_ambiguity_classes(const char* input_file, UFILE* output); }; #endif diff --git a/apertium/interchunk.cc b/apertium/interchunk.cc index 019577d..e52c500 100644 --- a/apertium/interchunk.cc +++ b/apertium/interchunk.cc @@ -15,489 +15,197 @@ * along with this program; if not, see . */ #include -#include -#include -#include -#include -#include -#include +#include +#include + #include -#include -#include -#include "apertium_config.h" -#include -using namespace Apertium; using namespace std; -void -Interchunk::destroy() -{ - delete me; - me = NULL; +Interchunk::Interchunk() + : word(0), last_lword(0), inword(false) +{} - if(doc) +bool +Interchunk::checkIndex(xmlNode *element, int index, int limit) +{ + if(index >= limit) { - xmlFreeDoc(doc); - doc = NULL; + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index >= limit" << endl; + return false; } -} - -Interchunk::Interchunk() : -word(0), -lword(0), -last_lword(0), -output(0), -any_char(0), -any_tag(0), -nwords(0) -{ - me = NULL; - doc = NULL; - root_element = NULL; - lastrule = NULL; - inword = false; - null_flush = false; - internal_null_flush = false; - trace = false; - in_out = false; -} - -Interchunk::~Interchunk() -{ - destroy(); -} - -void -Interchunk::readData(FILE *in) -{ - alphabet.read(in); - any_char = alphabet(TRXReader::ANY_CHAR); - any_tag = alphabet(TRXReader::ANY_TAG); - - Transducer t; - t.read(in, alphabet.size()); - - map finals; - - // finals - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + if(index < 0) { + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index < 0" << endl; + return false; + } + if(word[index] == 0) { - int key = Compression::multibyte_read(in); - finals[key] = Compression::multibyte_read(in); + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": Null access at word[index]" << endl; + return false; } + return true; +} - me = new MatchExe(t, finals); - - // attr_items - bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); - attr_items[cad_k].read(in); - wstring fallback = Compression::wstring_read(in); - if(recompile_attrs) { - attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); +UString +Interchunk::evalCachedString(xmlNode* element) +{ + TransferInstr& ti = evalStringCache[element]; + switch (ti.getType()) { + case ti_clip_tl: + if (checkIndex(element, ti.getPos(), lword)) { + if (ti.getContent() == "content"_u) { + UString wf = word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + return wf.substr(1, wf.length()-2); // trim { and } + } else { + return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + } } - } + break; - // variables - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); - variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); - } + case ti_var: + return variables[ti.getContent()]; - // macros - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); - macros[cad_k] = Compression::multibyte_read(in); - } + case ti_lit_tag: + case ti_lit: + return ti.getContent(); - // lists - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + case ti_b: + if (!blank_queue.empty()) { + UString retblank = blank_queue.front(); + if (in_out) { + blank_queue.pop(); + } + return retblank; + } else { + return " "_u; + } + break; - for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) - { - wstring const cad_v = Compression::wstring_read(in); - lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); - listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + case ti_get_case_from: + if (checkIndex(element, ti.getPos(), lword)) { + return StringUtils::copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]), + evalString(ti.getPointer())); } - } -} + break; -void -Interchunk::read(string const &transferfile, string const &datafile) -{ - readInterchunk(transferfile); + case ti_case_of_tl: + if (checkIndex(element, ti.getPos(), lword)) { + return StringUtils::getcase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()])); + } + break; - // datafile - FILE *in = fopen(datafile.c_str(), "rb"); - if(!in) - { - wcerr << "Error: Could not open file '" << datafile << "'." << endl; - exit(EXIT_FAILURE); + default: + return ""_u; } - readData(in); - fclose(in); - + return ""_u; } void -Interchunk::readInterchunk(string const &in) +Interchunk::processClip(xmlNode* element) { - doc = xmlReadFile(in.c_str(), NULL, 0); - - if(doc == NULL) - { - wcerr << "Error: Could not parse file '" << in << "'." << endl; - exit(EXIT_FAILURE); - } - - root_element = xmlDocGetRootElement(doc); - - // search for macros & rules - for(xmlNode *i = root_element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) - { - collectMacros(i); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) - { - collectRules(i); - } + int pos = 0; + UString part; + for (xmlAttr* i = element->properties; i != NULL; i = i->next) { + if (!xmlStrcmp(i->name, (const xmlChar*) "part")) { + part = to_ustring((const char*) i->children->content); + } else if (!xmlStrcmp(i->name, (const xmlChar*) "pos")) { + pos = atoi((const char*) i->children->content) - 1; } } + evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL); } void -Interchunk::collectRules(xmlNode *localroot) +Interchunk::processBlank(xmlNode* element) { - for(xmlNode *rule = localroot->children; rule != NULL; rule = rule->next) - { - if(rule->type == XML_ELEMENT_NODE) - { - size_t line = rule->line; - for(xmlNode *rulechild = rule->children; ; rulechild = rulechild->next) - { - if(rulechild->type == XML_ELEMENT_NODE && !xmlStrcmp(rulechild->name, (const xmlChar *) "action")) - { - rule_map.push_back(rulechild); - rule_lines.push_back(line); - break; - } - } - } + if (element->properties == NULL) { + evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); + } else { + int pos = atoi((const char*) element->properties->children->content) - 1; + evalStringCache[element] = TransferInstr(ti_b, ""_u, pos); } } void -Interchunk::collectMacros(xmlNode *localroot) +Interchunk::processLuCount(xmlNode* element) { - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - macro_map.push_back(i); - } - } + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); } -bool -Interchunk::checkIndex(xmlNode *element, int index, int limit) +UString +Interchunk::processLu(xmlNode* element) { - if(index >= limit) - { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": index >= limit" << endl; - return false; - } - if(index < 0) { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": index < 0" << endl; - return false; - } - if(word[index] == 0) - { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": Null access at word[index]" << endl; - return false; - } - return true; + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + return ""_u; // make the type checker happy } - -string -Interchunk::evalString(xmlNode *element) +UString +Interchunk::processMlu(xmlNode* element) { - if (element == 0) - { - throw "Interchunk::evalString() was passed a NULL element"; - } - - map::iterator it; - it = evalStringCache.find(element); - if(it != evalStringCache.end()) - { - TransferInstr &ti = it->second; - switch(ti.getType()) - { - case ti_clip_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(ti.getContent() == "content") // jacob's new 'part' - { - string wf = word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); - return wf.substr(1, wf.length()-2); // trim away the { and } - } - else - { - return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); - } - } - break; - - case ti_var: - return variables[ti.getContent()]; - - case ti_lit_tag: - case ti_lit: - return ti.getContent(); - - case ti_b: - if(!blank_queue.empty()) - { - string retblank = blank_queue.front(); - - if(in_out) - { - blank_queue.pop(); - } - - return retblank; - } - else - { - return " "; - } - break; - - case ti_get_case_from: - if(checkIndex(element, ti.getPos(), lword)) - { - return copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]), - evalString((xmlNode *) ti.getPointer())); - } - break; - - case ti_case_of_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - return caseOf(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()])); - } - break; - - default: - return ""; - } - return ""; - } - - if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) - { - int pos = 0; - xmlChar *part = NULL; - - for(xmlAttr *i = element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *)i->children->content) - 1; - } - } - - evalStringCache[element] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) - { - evalStringCache[element] = TransferInstr(ti_lit_tag, - tags((const char *) element->properties->children->content), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) - { - evalStringCache[element] = TransferInstr(ti_lit, ((const char *) element->properties->children->content), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) - { - if(element->properties == NULL) - { - evalStringCache[element] = TransferInstr(ti_b, " ", -1); - } - else - { - int pos = atoi((const char *) element->properties->children->content) - 1; - evalStringCache[element] = TransferInstr(ti_b, "", pos); - } - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) - { - int pos = atoi((const char *) element->properties->children->content) - 1; - xmlNode *param = NULL; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - param = i; - break; - } - } - - evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) - { - evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) - { - int pos = 0; - xmlChar *part = NULL; - - for(xmlAttr *i = element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *) i->children->content) - 1; - } - } + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + return ""_u; // make the type checker happy +} - evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) - { - string value; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - value.append(evalString(i)); - } +void +Interchunk::processCaseOf(xmlNode* element) +{ + int pos = 0; + UString part; + for (xmlAttr* i = element->properties; i != NULL; i = i->next) { + if (!xmlStrcmp(i->name, (const xmlChar*) "part")) { + part = to_ustring((char*) i->children->content); + } else if (!xmlStrcmp(i->name, (const xmlChar*) "pos")) { + pos = atoi((const char*) i->children->content) - 1; } - return value; } - else if(!xmlStrcmp(element->name, (const xmlChar *) "chunk")) - { - return processChunk(element); - } - else - { - wcerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; - exit(EXIT_FAILURE); - } - - return evalString(element); + evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); } void Interchunk::processOut(xmlNode *localroot) { in_out = true; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) - { - fputws_unlocked(UtfConverter::fromUtf8(processChunk(i)).c_str(), output); - } - else // 'b' - { - fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output); - } + + for (auto i : children(localroot)) { + if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) { + write(processChunk(i), output); + } else { // 'b' + write(evalString(i), output); } } - + in_out = false; } -string +UString Interchunk::processChunk(xmlNode *localroot) { - string result; - result.append("^"); + UString result; + result.append("^"_u); - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - result.append(evalString(i)); - } + for (auto i : children(localroot)) { + result.append(evalString(i)); } - result.append("$"); + result.append("$"_u); return result; } -void -Interchunk::processInstruction(xmlNode *localroot) -{ - if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) - { - processChoose(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) - { - processLet(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) - { - processAppend(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) - { - processOut(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) - { - processCallMacro(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) - { - processModifyCase(localroot); - } -} - void Interchunk::processLet(xmlNode *localroot) { xmlNode *leftSide = NULL, *rightSide = NULL; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(leftSide == NULL) - { - leftSide = i; - } - else - { - rightSide = i; - break; - } + for (auto i : children(localroot)) { + if(leftSide == NULL) { + leftSide = i; + } else { + rightSide = i; + break; } } @@ -516,7 +224,7 @@ Interchunk::processLet(xmlNode *localroot) bool match = word[ti.getPos()]->setChunkPart(attr_items[ti.getContent()], evalString(rightSide)); if(!match && trace) { - wcerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; } } return; @@ -527,119 +235,90 @@ Interchunk::processLet(xmlNode *localroot) } if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { - string const val = (const char *) leftSide->properties->children->content; + UString const val = to_ustring((const char *) leftSide->properties->children->content); variables[val] = evalString(rightSide); evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); } else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL; + UString part; for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { - pos = atoi((const char *) i->children->content) - 1; + pos = atoi((const char *) i->children->content) - 1; } } - bool match = word[pos]->setChunkPart(attr_items[(const char *) part], + bool match = word[pos]->setChunkPart(attr_items[part], evalString(rightSide)); if(!match && trace) { - wcerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; } evalStringCache[leftSide] = TransferInstr(ti_clip_tl, - (const char *) part, + part, pos, NULL); } } -void -Interchunk::processAppend(xmlNode *localroot) -{ - string name; - for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "n")) - { - name = (char *) i->children->content; - break; - } - } - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - variables[name].append(evalString(i)); - } - } -} - void Interchunk::processModifyCase(xmlNode *localroot) { xmlNode *leftSide = NULL, *rightSide = NULL; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(leftSide == NULL) - { - leftSide = i; - } - else - { - rightSide = i; - break; - } + for (auto i : children(localroot)) { + if(leftSide == NULL) { + leftSide = i; + } else { + rightSide = i; + break; } } if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL; + UString part; for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { - pos = atoi((const char *) i->children->content) - 1; + pos = atoi((const char *) i->children->content) - 1; } } - string const result = copycase(evalString(rightSide), - word[pos]->chunkPart(attr_items[(const char *) part])); - bool match = word[pos]->setChunkPart(attr_items[(const char *) part], result); + UString const result = StringUtils::copycase(evalString(rightSide), + word[pos]->chunkPart(attr_items[part])); + bool match = word[pos]->setChunkPart(attr_items[part], result); if(!match && trace) { - wcerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; } } else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { - string const val = (const char *) leftSide->properties->children->content; - variables[val] = copycase(evalString(rightSide), variables[val]); + UString const val = to_ustring((const char *) leftSide->properties->children->content); + variables[val] = StringUtils::copycase(evalString(rightSide), variables[val]); } } void Interchunk::processCallMacro(xmlNode *localroot) { - const char *n = (const char *) localroot->properties->children->content; + UString n = to_ustring((const char *) localroot->properties->children->content); int npar = 0; xmlNode *macro = macro_map[macros[n]]; @@ -656,16 +335,11 @@ Interchunk::processCallMacro(xmlNode *localroot) // ToDo: Is it at all valid if npar <= 0 ? InterchunkWord **myword = NULL; + int idx = 0; if(npar > 0) { myword = new InterchunkWord *[npar]; - } - - int idx = 0; - for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { + for (auto i : children(localroot)) { int pos = atoi((const char *) i->properties->children->content)-1; myword[idx] = word[pos]; idx++; @@ -675,12 +349,8 @@ Interchunk::processCallMacro(xmlNode *localroot) swap(myword, word); swap(npar, lword); - for(xmlNode *i = macro->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - processInstruction(i); - } + for (auto i : children(macro)) { + processInstruction(i); } swap(myword, word); @@ -689,741 +359,91 @@ Interchunk::processCallMacro(xmlNode *localroot) delete[] myword; } -void -Interchunk::processChoose(xmlNode *localroot) +TransferToken & +Interchunk::readToken(InputFile& in) { - for(xmlNode *i = localroot->children; i != NULL; i = i->next) + if(!input_buffer.isEmpty()) { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "when")) - { - bool picked_option = false; + return input_buffer.next(); + } - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(j->name, (const xmlChar *) "test")) - { - if(!processTest(j)) - { - break; - } - else - { - picked_option = true; - } - } - else - { - processInstruction(j); - } - } - } - if(picked_option) - { - return; - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) - { - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - processInstruction(j); - } - } - } - } - } -} - -bool -Interchunk::processLogical(xmlNode *localroot) -{ - if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) - { - return processEqual(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) - { - return processBeginsWith(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) - { - return processBeginsWithList(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) - { - return processEndsWith(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) - { - return processEndsWithList(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) - { - return processContainsSubstring(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) - { - return processOr(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) - { - return processAnd(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) - { - return processNot(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) - { - return processIn(localroot); - } - - return false; -} - -bool -Interchunk::processIn(xmlNode *localroot) -{ - xmlNode *value = NULL; - xmlChar *idlist = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(value == NULL) - { - value = i; - } - else - { - idlist = i->properties->children->content; - break; - } - } - } - - string sval = evalString(value); - - if(localroot->properties != NULL) - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - set &myset = listslow[(const char *) idlist]; - if(myset.find(tolower(sval)) != myset.end()) - { - return true; - } - else - { - return false; - } - } - } - - set &myset = lists[(const char *) idlist]; - if(myset.find(sval) != myset.end()) - { - return true; - } - else - { - return false; - } -} - -bool -Interchunk::processTest(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - return processLogical(i); - } - } - return false; -} - -bool -Interchunk::processAnd(xmlNode *localroot) -{ - bool val = true; - for(xmlNode *i = localroot->children; val && i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - val = val && processLogical(i); - } - } - - return val; -} - -bool -Interchunk::processOr(xmlNode *localroot) -{ - bool val = false; - for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - val = val || processLogical(i); - } - } - - return val; -} - -bool -Interchunk::processNot(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - return !processLogical(i); - } - } - return false; -} - -bool -Interchunk::processEqual(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return evalString(first) == evalString(second); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return tolower(evalString(first)) == tolower(evalString(second)); - } - else - { - return evalString(first) == evalString(second); - } - } -} - -bool -Interchunk::beginsWith(string const &s1, string const &s2) const -{ - int const limit = s2.size(), constraint = s1.size(); - - if(constraint < limit) - { - return false; - } - for(int i = 0; i != limit; i++) - { - if(s1[i] != s2[i]) - { - return false; - } - } - - return true; -} - -bool -Interchunk::endsWith(string const &s1, string const &s2) const -{ - int const limit = s2.size(), constraint = s1.size(); - - if(constraint < limit) - { - return false; - } - for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) - { - if(s1[j] != s2[i]) - { - return false; - } - } - - return true; -} - - -bool -Interchunk::processBeginsWith(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return beginsWith(evalString(first), evalString(second)); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return beginsWith(tolower(evalString(first)), tolower(evalString(second))); - } - else - { - return beginsWith(evalString(first), evalString(second)); - } - } -} - -bool -Interchunk::processEndsWith(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return endsWith(evalString(first), evalString(second)); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return endsWith(tolower(evalString(first)), tolower(evalString(second))); - } - else - { - return endsWith(evalString(first), evalString(second)); - } - } -} - -bool -Interchunk::processBeginsWithList(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - xmlChar *idlist = second->properties->children->content; - string needle = evalString(first); - set::iterator it, limit; - - if(localroot->properties == NULL || - xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) - { - it = lists[(const char *) idlist].begin(); - limit = lists[(const char *) idlist].end(); - } - else - { - needle = tolower(needle); - it = listslow[(const char *) idlist].begin(); - limit = listslow[(const char *) idlist].end(); - } - - for(; it != limit; it++) - { - if(beginsWith(needle, *it)) - { - return true; - } - } - return false; -} - -bool -Interchunk::processEndsWithList(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - xmlChar *idlist = second->properties->children->content; - string needle = evalString(first); - set::iterator it, limit; - - if(localroot->properties == NULL || - xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) - { - it = lists[(const char *) idlist].begin(); - limit = lists[(const char *) idlist].end(); - } - else - { - needle = tolower(needle); - it = listslow[(const char *) idlist].begin(); - limit = listslow[(const char *) idlist].end(); - } - - for(; it != limit; it++) - { - if(endsWith(needle, *it)) - { - return true; - } - } - return false; -} - -bool -Interchunk::processContainsSubstring(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return evalString(first).find(evalString(second)) != string::npos; - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return tolower(evalString(first)).find(tolower(evalString(second))) != string::npos; - } - else - { - return evalString(first).find(evalString(second)) != string::npos; - } - } -} - -string -Interchunk::copycase(string const &source_word, string const &target_word) -{ - wstring result; - wstring const s_word = UtfConverter::fromUtf8(source_word); - wstring const t_word = UtfConverter::fromUtf8(target_word); - - bool firstupper = iswupper(s_word[0]); - bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); - bool sizeone = s_word.size() == 1; - - if(!uppercase || (sizeone && uppercase)) - { - result = StringUtils::tolower(t_word); - } - else - { - result = StringUtils::toupper(t_word); - } - - if(firstupper) - { - result[0] = towupper(result[0]); - } - - return UtfConverter::toUtf8(result); -} - -string -Interchunk::caseOf(string const &str) -{ - wstring const s = UtfConverter::fromUtf8(str); - - if(s.size() > 1) - { - if(!iswupper(s[0])) - { - return "aa"; - } - else if(!iswupper(s[s.size()-1])) - { - return "Aa"; - } - else - { - return "AA"; - } - } - else if(s.size() == 1) - { - if(!iswupper(s[0])) - { - return "aa"; - } - else - { - return "Aa"; - } - } - else - { - return "aa"; - } -} - -string -Interchunk::tolower(string const &str) const -{ - return UtfConverter::toUtf8(StringUtils::tolower(UtfConverter::fromUtf8(str))); -} - -string -Interchunk::tags(string const &str) const -{ - string result = "<"; - - for(unsigned int i = 0, limit = str.size(); i != limit; i++) - { - if(str[i] == '.') - { - result.append("><"); - } - else - { - result += str[i]; - } - } - - result += '>'; - - return result; -} - -void -Interchunk::processRule(xmlNode *localroot) -{ - // localroot is suposed to be an 'action' tag - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - processInstruction(i); - } - } - - while(!blank_queue.empty()) //flush remaining blanks that are not spaces - { - if(blank_queue.front().compare(" ") != 0) - { - fputws_unlocked(UtfConverter::fromUtf8(blank_queue.front()).c_str(), output); - } - blank_queue.pop(); - } -} - -TransferToken & -Interchunk::readToken(FILE *in) -{ - if(!input_buffer.isEmpty()) - { - return input_buffer.next(); - } - - wstring content; + UString content; while(true) { - int val = fgetwc_unlocked(in); - if(feof(in) || (internal_null_flush && val == 0)) + int val = in.get(); + if(in.eof() || (internal_null_flush && val == 0)) { return input_buffer.add(TransferToken(content, tt_eof)); } - if(val == L'\\') + if(val == '\\') { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += '\\'; + content += in.get(); } - else if(val == L'[') + else if(val == '[') { - content += L'['; + content += '['; while(true) { - int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') - { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); - } - else if(val2 == L']') - { - content += L']'; - break; - } - else - { - content += wchar_t(val2); - } + UChar32 val2 = in.get(); + if(val2 == '\\') { + content += '\\'; + content += in.get(); + } else if(val2 == ']') { + content += ']'; + break; + } else { + content += val2; + } } } - else if(inword && val == L'{') - { - content += L'{'; - while(true) - { - int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') - { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); - } - else if(val2 == L'}') - { - wint_t val3 = wchar_t(fgetwc_unlocked(in)); - ungetwc(val3, in); + else if(inword && val == '{') { + content += '{'; + while(true) { + UChar32 val2 = in.get(); + if(val2 == '\\') { + content += '\\'; + content += in.get(); + } else if(val2 == '}') { + UChar32 val3 = in.peek(); - content += L'}'; - if(val3 == L'$') - { - break; - } - } - else - { - content += wchar_t(val2); - } + content += '}'; + if(val3 == '$') { + break; + } + } else { + content += val2; + } } } - else if(inword && val == L'$') + else if(inword && val == '$') { inword = false; return input_buffer.add(TransferToken(content, tt_word)); } - else if(val == L'^') + else if(val == '^') { inword = true; return input_buffer.add(TransferToken(content, tt_blank)); } else { - content += wchar_t(val); + content += val; } } } -bool -Interchunk::getNullFlush(void) -{ - return null_flush; -} - void -Interchunk::setNullFlush(bool null_flush) -{ - this->null_flush = null_flush; -} - -void -Interchunk::setTrace(bool trace) -{ - this->trace = trace; -} - -void -Interchunk::interchunk_wrapper_null_flush(FILE *in, FILE *out) +Interchunk::interchunk_wrapper_null_flush(InputFile& in, UFILE* out) { null_flush = false; internal_null_flush = true; - while(!feof(in)) - { + while(!in.eof()) { interchunk(in, out); - fputwc_unlocked(L'\0', out); - int code = fflush(out); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', out); + u_fflush(out); + variables = variable_defaults; } internal_null_flush = false; null_flush = true; @@ -1431,7 +451,7 @@ Interchunk::interchunk_wrapper_null_flush(FILE *in, FILE *out) void -Interchunk::interchunk(FILE *in, FILE *out) +Interchunk::interchunk(InputFile& in, UFILE* out) { if(getNullFlush()) { @@ -1439,6 +459,9 @@ Interchunk::interchunk(FILE *in, FILE *out) } unsigned int last = input_buffer.getPos(); + unsigned int prev_last = last; + int lastrule_id = -1; + set banned_rules; output = out; ms.init(me->getInitial()); @@ -1449,52 +472,69 @@ Interchunk::interchunk(FILE *in, FILE *out) { if(lastrule != NULL) { - applyRule(); - input_buffer.setPos(last); + int words_to_consume = applyRule(); + if (words_to_consume == -1) { + banned_rules.clear(); + input_buffer.setPos(last); + } else if (words_to_consume == 1) { + banned_rules.clear(); + if (prev_last >= input_buffer.getSize()) { + input_buffer.setPos(0); + } else { + input_buffer.setPos(prev_last+1); + } + while (true) { + TransferToken& tt = input_buffer.next(); + if (tt.getType() == tt_word) { + break; + } + } + } else { + banned_rules.insert(lastrule_id); + input_buffer.setPos(prev_last); + input_buffer.next(); + last = input_buffer.getPos(); + } + lastrule_id = -1; } else { - if(tmpword.size() != 0) - { - fputwc_unlocked(L'^', output); - fputws_unlocked(tmpword[0]->c_str(), output); - fputwc_unlocked(L'$', output); - tmpword.clear(); - input_buffer.setPos(last); - input_buffer.next(); - last = input_buffer.getPos(); - ms.init(me->getInitial()); - } - else if(tmpblank.size() != 0) - { - fputws_unlocked(tmpblank[0]->c_str(), output); - tmpblank.clear(); - last = input_buffer.getPos(); - ms.init(me->getInitial()); - } + if(tmpword.size() != 0) { + u_fprintf(output, "^%S$", tmpword[0]->c_str()); + tmpword.clear(); + input_buffer.setPos(last); + input_buffer.next(); + prev_last = last; + banned_rules.clear(); + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + else if(tmpblank.size() != 0) { + write(*tmpblank[0], output); + tmpblank.clear(); + prev_last = last; + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } } } - int val = ms.classifyFinals(me->getFinals()); + int val = ms.classifyFinals(me->getFinals(), banned_rules); if(val != -1) { size_t lastrule_line = rule_lines[val-1]; lastrule = rule_map[val-1]; last = input_buffer.getPos(); - + lastrule_id = val; + last_lword = tmpword.size(); if(trace) { - wcerr << endl << L"apertium-interchunk: Rule " << val << L" line " << lastrule_line << L" "; - for (unsigned int ind = 0; ind < tmpword.size(); ind++) - { - if (ind != 0) - { - wcerr << L" "; - } - fputws_unlocked(tmpword[ind]->c_str(), stderr); + cerr << endl << "apertium-interchunk: Rule " << val << " line " << lastrule_line; + for (auto& it : tmpword) { + cerr << " " << *it; } - wcerr << endl; + cerr << endl; } } @@ -1502,38 +542,36 @@ Interchunk::interchunk(FILE *in, FILE *out) switch(current.getType()) { - case tt_word: - applyWord(current.getContent()); - tmpword.push_back(¤t.getContent()); - break; + case tt_word: + applyWord(current.getContent()); + tmpword.push_back(¤t.getContent()); + break; - case tt_blank: - ms.step(L' '); - tmpblank.push_back(¤t.getContent()); - break; + case tt_blank: + ms.step(' '); + tmpblank.push_back(¤t.getContent()); + break; - case tt_eof: - if(tmpword.size() != 0) - { - tmpblank.push_back(¤t.getContent()); - ms.clear(); - } - else - { - fputws_unlocked(current.getContent().c_str(), output); - tmpblank.clear(); - return; - } - break; + case tt_eof: + if(tmpword.size() != 0) { + tmpblank.push_back(¤t.getContent()); + ms.clear(); + } + else { + write(current.getContent(), output); + tmpblank.clear(); + return; + } + break; - default: - wcerr << "Error: Unknown input token." << endl; - return; + default: + cerr << "Error: Unknown input token." << endl; + return; } } } -void +int Interchunk::applyRule() { unsigned int limit = tmpword.size(); @@ -1549,15 +587,15 @@ Interchunk::applyRule() { if(int(blank_queue.size()) < last_lword - 1) { - string blank_to_add = string(UtfConverter::toUtf8(*tmpblank[i-1])); + UString blank_to_add = UString(*tmpblank[i-1]); blank_queue.push(blank_to_add); } } - word[i] = new InterchunkWord(UtfConverter::toUtf8(*tmpword[i])); + word[i] = new InterchunkWord(*tmpword[i]); } - processRule(lastrule); + int words_to_consume = processRule(lastrule); lastrule = NULL; if(word) @@ -1573,25 +611,26 @@ Interchunk::applyRule() tmpword.clear(); tmpblank.clear(); ms.init(me->getInitial()); + return words_to_consume; } void -Interchunk::applyWord(wstring const &word_str) +Interchunk::applyWord(UString const &word_str) { - ms.step(L'^'); + ms.step('^'); for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) { switch(word_str[i]) { - case L'\\': + case '\\': i++; - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; - case L'<': + case '<': for(unsigned int j = i+1; j != limit; j++) { - if(word_str[j] == L'>') + if(word_str[j] == '>') { int symbol = alphabet(word_str.substr(i, j-i+1)); if(symbol) @@ -1608,14 +647,14 @@ Interchunk::applyWord(wstring const &word_str) } break; - case L'{': // ignore the unmodifiable part of the chunk - ms.step(L'$'); + case '{': // ignore the unmodifiable part of the chunk + ms.step('$'); return; default: - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; } } - ms.step(L'$'); + ms.step('$'); } diff --git a/apertium/interchunk.dtd b/apertium/interchunk.dtd index 25e2f66..91af7ec 100644 --- a/apertium/interchunk.dtd +++ b/apertium/interchunk.dtd @@ -23,7 +23,7 @@ - + @@ -425,6 +425,16 @@ get-case-from --> + + + + + + + + diff --git a/apertium/postchunk.h b/apertium/postchunk.h index 9f3a254..70044e0 100644 --- a/apertium/postchunk.h +++ b/apertium/postchunk.h @@ -17,136 +17,63 @@ #ifndef _POSTCHUNK_ #define _POSTCHUNK_ -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include -#include -#include -#include -#include -#include -#include -#include +#include +#include using namespace std; -class Postchunk +class Postchunk : public TransferBase { private: - Alphabet alphabet; - MatchExe *me; - MatchState ms; - map attr_items; - map variables; - map macros; - map, Ltstr> lists; - map, Ltstr> listslow; - vector macro_map; - vector rule_map; - vector rule_lines; - xmlDoc *doc; - xmlNode *root_element; InterchunkWord **word; - queue blank_queue; - int lword; - Buffer input_buffer; - vector tmpword; - vector tmpblank; - - bool in_out; + bool in_lu; - bool in_let_var; - string var_val; bool in_wblank; - string out_wblank; - map var_out_wblank; - - FILE *output; - int any_char; - int any_tag; - - xmlNode *lastrule; - unsigned int nwords; - - map evalStringCache; + UString out_wblank; + map var_out_wblank; bool inword; - bool null_flush; - bool internal_null_flush; - bool trace; - void destroy(); - void readData(FILE *input); - void readPostchunk(string const &input); - void collectMacros(xmlNode *localroot); - void collectRules(xmlNode *localroot); - static string caseOf(string const &str); - static wstring caseOf(wstring const &str); - string copycase(string const &source_word, string const &target_word); + UString evalCachedString(xmlNode* element); + void processClip(xmlNode* element); + void processBlank(xmlNode* element); + void processLuCount(xmlNode* element); + void processCaseOf(xmlNode* element); + UString processLu(xmlNode* element); + UString processMlu(xmlNode* element); + + UString processChunk(xmlNode* element); void processLet(xmlNode *localroot); - void processAppend(xmlNode *localroot); void processOut(xmlNode *localroot); void processCallMacro(xmlNode *localroot); void processModifyCase(xmlNode *localroot); - bool processLogical(xmlNode *localroot); - bool processTest(xmlNode *localroot); - bool processAnd(xmlNode *localroot); - bool processOr(xmlNode *localroot); - bool processEqual(xmlNode *localroot); - bool processBeginsWith(xmlNode *localroot); - bool processBeginsWithList(xmlNode *localroot); - bool processEndsWith(xmlNode *localroot); - bool processEndsWithList(xmlNode *localroot); - bool processContainsSubstring(xmlNode *localroot); - bool processNot(xmlNode *localroot); - bool processIn(xmlNode *localroot); - void processRule(xmlNode *localroot); - string evalString(xmlNode *localroot); - void processInstruction(xmlNode *localroot); - void processChoose(xmlNode *localroot); void processTags(xmlNode *localroot); - bool beginsWith(string const &str1, string const &str2) const; - bool endsWith(string const &str1, string const &str2) const; - string tolower(string const &str) const; - string tags(string const &str) const; - string readWord(FILE *in); - string readBlank(FILE *in); - string readUntil(FILE *in, int const symbol) const; - void applyWord(wstring const &word_str); - void applyRule(); - TransferToken & readToken(FILE *in); - static void unchunk(wstring const &chunk, FILE *output); - static vector getVecTags(wstring const &chunk); - static int beginChunk(wstring const &chunk); - static int endChunk(wstring const &chunk); - static void splitWordsAndBlanks(wstring const &chunk, - vector &words, - vector &blanks); - static wstring pseudolemma(wstring const &chunk); - static wstring wordzero(wstring const &chunk); + UString readWord(InputFile& in); + UString readBlank(InputFile& in); + UString readUntil(InputFile& in, int const symbol) const; + void applyWord(UString const &word_str); + int applyRule(); + TransferToken & readToken(InputFile& in); + static void unchunk(UString const &chunk, UFILE *output); + static vector getVecTags(UString const &chunk); + static int beginChunk(UString const &chunk); + static int endChunk(UString const &chunk); + static void splitWordsAndBlanks(UString const &chunk, + vector &words, + vector &blanks); + static UString pseudolemma(UString const &chunk); + static UString wordzero(UString const &chunk); bool checkIndex(xmlNode *element, int index, int limit); - void postchunk_wrapper_null_flush(FILE *in, FILE *out); - bool gettingLemmaFromWord(string attr); - string combineWblanks(string wblank_current, string wblank_to_add); + void postchunk_wrapper_null_flush(InputFile& in, UFILE* out); public: Postchunk(); - ~Postchunk(); - void read(string const &transferfile, string const &datafile); - void postchunk(FILE *in, FILE *out); - bool getNullFlush(void); - void setNullFlush(bool null_flush); - void setTrace(bool trace); + void postchunk(InputFile& in, UFILE* out); }; #endif diff --git a/apertium/pretransfer.cc b/apertium/pretransfer.cc index 764b3cf..6c8c778 100644 --- a/apertium/pretransfer.cc +++ b/apertium/pretransfer.cc @@ -3,74 +3,74 @@ #include #include #include -#include +#include #include #include -wstring storeAndWriteWblank(FILE *input, FILE *output) +UString storeAndWriteWblank(InputFile& input, UFILE* output) { int mychar; - wstring content = L"[["; + UString content = "[["_u; while(true) { - mychar = fgetwc_unlocked(input); - if(feof(input)) + mychar = input.get(); + if(input.eof()) { - wcerr << L"ERROR: Unexpected EOF" << endl; + cerr << "ERROR: Unexpected EOF" << endl; exit(EXIT_FAILURE); } - + content += mychar; - fputwc_unlocked(mychar, output); - - if(mychar == L'\\') + u_fputc(mychar, output); + + if(mychar == '\\') { - mychar = fgetwc(input); + mychar = input.get(); content += mychar; - fputwc(mychar, output); + u_fputc(mychar, output); } - else if(mychar == L']') + else if(mychar == ']') { - mychar = fgetwc(input); - - if(mychar == L']') + mychar = input.get(); + + if(mychar == ']') { content += mychar; - fputwc(mychar, output); + u_fputc(mychar, output); break; } } } - + return content; } -void readAndWriteUntil(FILE *input, FILE *output, int const charcode) +void readAndWriteUntil(InputFile& input, UFILE* output, int const charcode) { int mychar; - while((mychar = fgetwc_unlocked(input)) != charcode) + while((mychar = input.get()) != charcode) { - if(feof(input)) + if(input.eof()) { - wcerr << L"ERROR: Unexpected EOF" << endl; + cerr << "ERROR: Unexpected EOF" << endl; exit(EXIT_FAILURE); } - fputwc_unlocked(mychar, output); - if(mychar == L'\\') + u_fputc(mychar, output); + if(mychar == '\\') { - mychar = fgetwc(input); - fputwc(mychar, output); + mychar = input.get(); + u_fputc(mychar, output); } } } -void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, wstring wblank = L"") +void procWord(InputFile& input, UFILE* output, bool surface_forms, bool compound_sep, UString wblank = ""_u) { int mychar; - wstring buffer = L""; + UString buffer; bool buffer_mode = false; bool in_tag = false; @@ -78,20 +78,20 @@ void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, if(surface_forms) { - while((mychar = fgetwc_unlocked(input)) != L'/') ; + while((mychar = input.get()) != '/') ; } - while((mychar = fgetwc_unlocked(input)) != L'$') + while((mychar = input.get()) != '$') { - if(feof(input)) + if(input.eof()) { - wcerr << L"ERROR: Unexpected EOF" << endl; + cerr << "ERROR: Unexpected EOF" << endl; exit(EXIT_FAILURE); } switch(mychar) { - case L'<': + case '<': in_tag = true; if(!buffer_mode) { @@ -99,11 +99,11 @@ void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, } break; - case L'>': + case '>': in_tag = false; break; - case L'#': + case '#': if(buffer_mode) { buffer_mode = false; @@ -114,106 +114,106 @@ void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, if(buffer_mode) { - if((mychar != L'+' || (mychar == L'+' && in_tag == true)) && - (mychar != L'~' || (mychar == L'~' && in_tag == true))) + if((mychar != '+' || (mychar == '+' && in_tag == true)) && + (mychar != '~' || (mychar == '~' && in_tag == true))) { - buffer += static_cast(mychar); + buffer += mychar; } - else if(in_tag == false && mychar == L'+') + else if(in_tag == false && mychar == '+') { - buffer.append(L"$ "); + buffer.append("$ "_u); buffer.append(wblank); - buffer.append(L"^"); + buffer.append("^"_u); } - else if(in_tag == false && mychar == L'~' and compound_sep == true) + else if(in_tag == false && mychar == '~' and compound_sep == true) { - buffer.append(L"$"); + buffer.append("$"_u); buffer.append(wblank); - buffer.append(L"^"); + buffer.append("^"_u); } } else { - if(mychar == L'+' && queuing == true) + if(mychar == '+' && queuing == true) { - buffer.append(L"$ "); + buffer.append("$ "_u); buffer.append(wblank); - buffer.append(L"^"); + buffer.append("^"_u); buffer_mode = true; } else { - fputwc_unlocked(mychar, output); + u_fputc(mychar, output); } } } - fputws_unlocked(buffer.c_str(), output); + write(buffer, output); } -void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms, bool compound_sep) +void processStream(InputFile& input, UFILE* output, bool null_flush, bool surface_forms, bool compound_sep) { while(true) { - int mychar = fgetwc_unlocked(input); - if(feof(input)) + int mychar = input.get(); + if(input.eof()) { break; } switch(mychar) { - case L'[': - fputwc_unlocked(L'[', output); - mychar = fgetwc_unlocked(input); - - if(mychar == L'[') + case '[': + u_fputc('[', output); + mychar = input.get(); + + if(mychar == '[') { - fputwc_unlocked(L'[', output); - wstring wblank = storeAndWriteWblank(input, output); - mychar = fgetwc_unlocked(input); - - if(mychar == L'^') + u_fputc('[', output); + UString wblank = storeAndWriteWblank(input, output); + mychar = input.get(); + + if(mychar == '^') { - fputwc_unlocked(mychar, output); + u_fputc(mychar, output); procWord(input, output, surface_forms, compound_sep, wblank); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } else { - wcerr << L"ERROR: Wordbound blank isn't immediately followed by the Lexical Unit." << endl; + cerr << "ERROR: Wordbound blank isn't immediately followed by the Lexical Unit." << endl; exit(EXIT_FAILURE); } } else { - ungetwc(mychar, input); - readAndWriteUntil(input, output, L']'); - fputwc_unlocked(L']', output); + input.unget(mychar); + readAndWriteUntil(input, output, ']'); + u_fputc(']', output); } break; - case L'\\': - fputwc_unlocked(mychar, output); - fputwc_unlocked(fgetwc_unlocked(input), output); + case '\\': + u_fputc(mychar, output); + u_fputc(input.get(), output); break; - case L'^': - fputwc_unlocked(mychar, output); + case '^': + u_fputc(mychar, output); procWord(input, output, surface_forms, compound_sep); - fputwc_unlocked(L'$', output); + u_fputc('$', output); break; - case L'\0': - fputwc_unlocked(mychar, output); + case '\0': + u_fputc(mychar, output); if(null_flush) { - fflush(output); + u_fflush(output); } break; default: - fputwc_unlocked(mychar, output); + u_fputc(mychar, output); break; } } diff --git a/apertium/pretransfer.h b/apertium/pretransfer.h index d664b13..ffc9737 100644 --- a/apertium/pretransfer.h +++ b/apertium/pretransfer.h @@ -16,12 +16,13 @@ #ifndef PRETRANSFER_H #define PRETRANSFER_H -#include -#include +#include +#include +#include -wstring storeAndWriteWblank(FILE *input, FILE *output); -void readAndWriteUntil(FILE *input, FILE *output, int const charcode); -void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, wstring wblank); -void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms, bool compound_sep); +UString storeAndWriteWblank(InputFile& input, UFILE *output); +void readAndWriteUntil(InputFile& input, UFILE *output, int const charcode); +void procWord(InputFile& input, UFILE *output, bool surface_forms, bool compound_sep, UString wblank); +void processStream(InputFile& input, UFILE *output, bool null_flush, bool surface_forms, bool compound_sep); #endif diff --git a/apertium/reformat.xsl b/apertium/reformat.xsl index 3316b42..f471823 100644 --- a/apertium/reformat.xsl +++ b/apertium/reformat.xsl @@ -26,7 +26,6 @@ #ifndef GENFORMAT #include "apertium_config.h" #endif -#include <utf8/utf8.h> #include <apertium/unlocked_cstdio.h> #include <cstdlib> @@ -36,22 +35,13 @@ #include <string> #include <unistd.h> #include <lttoolbox/lt_locale.h> -#include <lttoolbox/ltstr.h> -#include <apertium/string_to_wostream.h> -#include <wchar.h> -#ifdef _WIN32 -#include <io.h> -#include <fcntl.h> -#define utf8to32 utf8to16 -#define utf32to8 utf16to8 -#endif using namespace std; - + @@ -61,9 +51,9 @@ using namespace std; - + - + @@ -72,19 +62,6 @@ using namespace std; -string memconv; - -wstring convertir(char const *multibyte, int const length) -{ - std::wstring rv; - memconv.append(multibyte, length); - if (utf8::is_valid(memconv.begin(), memconv.end())) { - utf8::utf8to32(memconv.begin(), memconv.end(), std::back_inserter(rv)); - memconv.clear(); - } - return rv; -} - %} %option nounput @@ -101,26 +78,22 @@ wstring convertir(char const *multibyte, int const length) string filename = yytext; filename = filename.substr(2, filename.size()-3); FILE *temp = fopen(filename.c_str(), "rb"); - wint_t mychar; -#ifdef _MSC_VER - _setmode(_fileno(temp), _O_U8TEXT); -#endif + int mychar; if(!temp) { - wcerr << "ERROR: File '" << filename <<"' not found." << endl; + cerr << "ERROR: File '" << filename <<"' not found." << endl; exit(EXIT_FAILURE); } - while(static_cast<int>(mychar = fgetwc_unlocked(temp)) != EOF) - { - fputwc_unlocked(mychar, yyout); + while((mychar = fgetc(temp)) != EOF) { + fputc_unlocked(mychar, yyout); } fclose(temp); unlink(filename.c_str()); } "[\\@" { - fputwc_unlocked(L'@', yyout); + fputc_unlocked('@', yyout); } ".[]" { @@ -128,13 +101,13 @@ wstring convertir(char const *multibyte, int const length) } "\\" { - fputws_unlocked(convertir(yytext+1, yyleng-1).c_str(), yyout); + fwrite(yytext+1, 1, yyleng-1, yyout); } .|\n { - wstring yytext_conv = convertir(yytext, yyleng); + string yytext_conv = yytext; @@ -148,7 +121,7 @@ wstring convertir(char const *multibyte, int const length) - + @@ -157,7 +130,7 @@ wstring convertir(char const *multibyte, int const length) - + @@ -171,8 +144,8 @@ wstring convertir(char const *multibyte, int const length) void usage(string const &progname) { - wcerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl; - wcerr << " format processor " << endl; + cerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl; + cerr << " format processor " << endl; exit(EXIT_SUCCESS); } @@ -203,10 +176,6 @@ int main(int argc, char *argv[]) default: break; } -#ifdef _MSC_VER - _setmode(_fileno(yyin), _O_U8TEXT); - _setmode(_fileno(yyout), _O_U8TEXT); -#endif diff --git a/apertium/sentence_stream.cc b/apertium/sentence_stream.cc index c83450b..a90e56e 100644 --- a/apertium/sentence_stream.cc +++ b/apertium/sentence_stream.cc @@ -21,7 +21,7 @@ bool isSentenceEnd(StreamedType &token) { return false; } Tag &tag = *tags.begin(); - if (tag.TheTag != L"sent") { + if (tag.TheTag != "sent"_u) { return false; } return true; @@ -37,7 +37,7 @@ bool isSentenceEnd(StreamedType tok, Stream &in, bool sent_seg) { SentenceTagger::SentenceTagger() {} -void SentenceTagger::tag(Stream &in, std::wostream &out, bool sent_seg) { +void SentenceTagger::tag(Stream &in, std::ostream &out, bool sent_seg) { clearBuffers(); while (true) { @@ -67,7 +67,7 @@ void SentenceTagger::clearBuffers() const { flushes.clear(); } -void SentenceTagger::tagAndPutSentence(std::wostream &out) { +void SentenceTagger::tagAndPutSentence(std::ostream &out) { TaggedSentence tagged_sent = tagSentence(lexical_sent); TaggedSentence::const_iterator ts_it = tagged_sent.begin(); @@ -103,17 +103,17 @@ TrainingCorpus::TrainingCorpus(Stream &tagged, Stream &untagged, untagged_line++; if (!tagged_token.TheLexicalUnit || !untagged_token.TheLexicalUnit) { if (tagged_token.TheLexicalUnit || untagged_token.TheLexicalUnit) { - std::wcerr << "Normal perm\n"; - std::wcerr << "tagged: " << tagged_line << " " << (!!tagged_token.TheLexicalUnit) << "\n"; - std::wcerr << "untagged: " << untagged_line << " " << (!!untagged_token.TheLexicalUnit) << "\n"; + std::cerr << "Normal perm\n"; + std::cerr << "tagged: " << tagged_line << " " << (!!tagged_token.TheLexicalUnit) << "\n"; + std::cerr << "untagged: " << untagged_line << " " << (!!untagged_token.TheLexicalUnit) << "\n"; prematureEnd(); } break; } - //std::wcerr << tagged_token.TheLexicalUnit->TheSurfaceForm << " || " << untagged_token.TheLexicalUnit->TheSurfaceForm << "\n"; + //std::cerr << tagged_token.TheLexicalUnit->TheSurfaceForm << " || " << untagged_token.TheLexicalUnit->TheSurfaceForm << "\n"; if (untagged_token.TheLexicalUnit->TheSurfaceForm != tagged_token.TheLexicalUnit->TheSurfaceForm) { if (!skip_on_error) { - std::wstringstream what_; + std::stringstream what_; what_ << "Streams diverged at line " << tagged_line << "\n"; what_ << "Untagged token: " << untagged_token.TheLexicalUnit->TheSurfaceForm << "\n"; @@ -127,18 +127,18 @@ TrainingCorpus::TrainingCorpus(Stream &tagged, Stream &untagged, training_sentence->first.clear(); training_sentence->second.clear(); - std::wcerr << "fast forward\n"; + std::cerr << "fast forward\n"; bool tagged_ended = contToEndOfSent(tagged, tagged_token, tagged_line); bool untagged_ended = contToEndOfSent(untagged, untagged_token, untagged_line); if (tagged_ended || untagged_ended) { if (!tagged_ended || !untagged_ended) { - std::wcerr << "fast forward prem\n"; + std::cerr << "fast forward prem\n"; prematureEnd(); } - std::wcerr << "fast forward finish\n"; + std::cerr << "fast forward finish\n"; break; } - std::wcerr << "fast forwarded\n"; + std::cerr << "fast forwarded\n"; continue; } if (was_sentence_end) { @@ -169,7 +169,7 @@ bool TrainingCorpus::contToEndOfSent(Stream &stream, StreamedType token, if (isSentenceEnd(token, stream, sent_seg)) { return false; } - std::wcerr << "Skip " << token.TheLexicalUnit->TheSurfaceForm << "\n"; + std::cerr << "Skip " << token.TheLexicalUnit->TheSurfaceForm << "\n"; token = stream.get(); line++; } diff --git a/apertium/sentence_stream.h b/apertium/sentence_stream.h index aea298e..133f6ba 100644 --- a/apertium/sentence_stream.h +++ b/apertium/sentence_stream.h @@ -20,18 +20,18 @@ namespace SentenceStream { bool isSentenceEnd(Stream &in, bool sent_seg = false); class SentenceTagger { public: - void tag(Stream &in, std::wostream &out, bool sent_seg); + void tag(Stream &in, std::ostream &out, bool sent_seg); SentenceTagger(); protected: virtual TaggedSentence tagSentence(const Sentence &untagged) = 0; virtual void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output) = 0; + std::ostream &output) = 0; private: void clearBuffers() const; - void tagAndPutSentence(std::wostream &out); + void tagAndPutSentence(std::ostream &out); void putTaggedSent( - std::wostream &out, TaggedSentence &tagged_sent, Sentence &full_sent, + std::ostream &out, TaggedSentence &tagged_sent, Sentence &full_sent, std::vector &flushes) const; mutable Sentence full_sent; mutable Sentence lexical_sent; diff --git a/apertium/shell_utils.cc b/apertium/shell_utils.cc index cccfa5b..e9d040c 100644 --- a/apertium/shell_utils.cc +++ b/apertium/shell_utils.cc @@ -73,12 +73,14 @@ FILE *try_open_file(const char *metavar, const char *filename, return f; } -FILE *try_open_file_utf8(const char *metavar, const char *filename, +UFILE* try_open_file_utf8(const char *metavar, const char *filename, const char *flags) { - FILE *f = try_open_file(metavar, filename, flags); -#ifdef _MSC_VER - _setmode(_fileno(f), _O_U8TEXT); -#endif // _MSC_VER + UFILE* f = u_fopen(filename, flags, NULL, NULL); + if (f == NULL) { + std::stringstream what_; + what_ << "can't open " << metavar << " file \"" << filename << "\""; + throw Exception::Shell::FopenError(what_); + } return f; } diff --git a/apertium/shell_utils.h b/apertium/shell_utils.h index 11b7a36..f2ca314 100644 --- a/apertium/shell_utils.h +++ b/apertium/shell_utils.h @@ -3,6 +3,7 @@ #include #include +#include namespace Apertium { namespace ShellUtils { @@ -22,7 +23,7 @@ FILE* try_open_file(const char *metavar, const char *filename, const char *flags); -FILE* +UFILE* try_open_file_utf8(const char *metavar, const char *filename, const char *flags); diff --git a/apertium/stream.cc b/apertium/stream.cc index 4d83a3e..3d8897b 100644 --- a/apertium/stream.cc +++ b/apertium/stream.cc @@ -25,23 +25,23 @@ namespace Apertium { Stream::Stream(TaggerFlags &Flags_) - : TheLineNumber(1), TheCharacterStream(std::wcin), TheFilename(), TheLine(), + : TheLineNumber(1), TheCharacterStream(std::cin), TheFilename(), TheLine(), TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} Stream::Stream(TaggerFlags &Flags_, - std::wifstream &CharacterStream_, const char *const Filename_) + std::ifstream &CharacterStream_, const char *const Filename_) : TheLineNumber(1), TheCharacterStream(CharacterStream_), TheFilename(Filename_), TheLine(), TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} Stream::Stream(TaggerFlags &Flags_, - std::wifstream &CharacterStream_, const std::string &Filename_) + std::ifstream &CharacterStream_, const std::string &Filename_) : TheLineNumber(1), TheCharacterStream(CharacterStream_), TheFilename(Filename_), TheLine(), TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} Stream::Stream(TaggerFlags &Flags_, - std::wifstream &CharacterStream_, + std::ifstream &CharacterStream_, const std::stringstream &Filename_) : TheLineNumber(1), TheCharacterStream(CharacterStream_), TheFilename(Filename_.str()), TheLine(), TheFlags(Flags_), private_flush_(false), @@ -49,13 +49,13 @@ Stream::Stream(TaggerFlags &Flags_, StreamedType Stream::get() { StreamedType TheStreamedType; - std::wstring Lemma; + UString Lemma; private_flush_ = false; //TheCharacterStream.clear(); if (!is_eof_throw_if_not_TheCharacterStream_good()) { while (true) { - const wchar_t Character_ = TheCharacterStream.get(); + const UChar Character_ = TheCharacterStream.get(); if (is_eof_throw_if_not_TheCharacterStream_good(TheStreamedType, Lemma, Character_)) @@ -64,21 +64,21 @@ StreamedType Stream::get() { TheLine.push_back(Character_); switch (Character_) { - case L'\\': // <\> 92, Hex 5c, Octal 134 + case '\\': // <\> 92, Hex 5c, Octal 134 case_0x5c(TheStreamedType, Lemma, Character_); continue; - case L'[': + case '[': if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': - case L'$': + case '[': + case ']': + case '$': break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '[' expected to follow '[', ']' or '$'"; + << "', '[' expected to follow '[', ']' or '$'"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } } @@ -86,43 +86,43 @@ StreamedType Stream::get() { push_back_Character(TheStreamedType, Lemma, Character_); ThePreviousCase = PreviousCaseType(Character_); continue; - case L']': + case ']': if (!ThePreviousCase) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"', ']' expected to follow '['"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "', ']' expected to follow '['"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': + case '[': + case ']': push_back_Character(TheStreamedType, Lemma, Character_); ThePreviousCase = PreviousCaseType(Character_); continue; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', ']' expected to follow '[' or ']'"; + << "', ']' expected to follow '[' or ']'"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } std::abort(); - case L'^': + case '^': if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L']': - case L'$': + case ']': + case '$': break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '^' expected to follow '[', ']', or '$'"; + << "', '^' expected to follow '[', ']', or '$'"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } } @@ -130,39 +130,39 @@ StreamedType Stream::get() { TheStreamedType.TheLexicalUnit = LexicalUnit(); ThePreviousCase = PreviousCaseType(Character_); continue; - case L'/': + case '/': if (!ThePreviousCase) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"', '/' expected to follow '[', to follow '>' " - L"immediately, or to follow '^' or '#' not immediately"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "', '/' expected to follow '[', to follow '>' " + "immediately, or to follow '^' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'^': + case '^': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '/' expected to follow '[', to follow '>' " - L"immediately, or to follow '^' or '#' not immediately"; + << "', '/' expected to follow '[', to follow '>' " + "immediately, or to follow '^' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } ThePreviousCase = PreviousCaseType(Character_); { - const wchar_t Character_ = TheCharacterStream.get(); + const UChar Character_ = TheCharacterStream.get(); if (is_eof_throw_if_not_TheCharacterStream_good( TheStreamedType, Lemma, Character_)) { - std::wstringstream Message; - Message << L"unexpected end-of-file following '" + std::stringstream Message; + Message << "unexpected end-of-file following '" << ThePreviousCase->ThePreviousCase << "', end-of-file expected to follow ']' or '$'"; throw Exception::Stream::UnexpectedEndOfFile( @@ -172,24 +172,24 @@ StreamedType Stream::get() { TheLine.push_back(Character_); switch (Character_) { - case L'\\': + case '\\': TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); TheStreamedType.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.push_back(Morpheme()); case_0x5c(TheStreamedType, Lemma, Character_); continue; - case L'*': + case '*': ThePreviousCase = PreviousCaseType(Character_); continue; - case L'\n': { - std::wstringstream Message; - Message << L"unexpected newline following '" + case '\n': { + std::stringstream Message; + Message << "unexpected newline following '" << ThePreviousCase->ThePreviousCase << "', newline expected to follow '[', ']', or '$'"; throw Exception::Stream::UnexpectedCharacter( Message_what(Message)); }; - case L'<': + case '<': TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); TheStreamedType.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.push_back(Morpheme()); @@ -198,18 +198,18 @@ StreamedType Stream::get() { .TheTags.push_back(Tag()); ThePreviousCase = PreviousCaseType(Character_); continue; - - case L'[': - case L']': - case L'^': - case L'#': - case L'>': - case L'+': - case L'$': { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" - << ThePreviousCase->ThePreviousCase << L"', expected '*'"; + + case '[': + case ']': + case '^': + case '#': + case '>': + case '+': + case '$': { + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" + << ThePreviousCase->ThePreviousCase << "', expected '*'"; throw Exception::Stream::UnexpectedPreviousCase( Message_what(Message)); } @@ -223,37 +223,37 @@ StreamedType Stream::get() { } continue; - case L'>': + case '>': if (!ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' not immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' not immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '/' expected to follow '[', to follow '>' " - L"immediately, or to follow '^' or '#' not immediately"; + << "', '/' expected to follow '[', to follow '>' " + "immediately, or to follow '^' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; - case L'#': + case '#': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '/' expected to follow '[', to follow '>' " - L"immediately, or to follow '^' or '#' not immediately"; + << "', '/' expected to follow '[', to follow '>' " + "immediately, or to follow '^' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '/' expected to follow '[', to follow '>' " - L"immediately, or to follow '^' or '#' not immediately"; + << "', '/' expected to follow '[', to follow '>' " + "immediately, or to follow '^' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -262,65 +262,65 @@ StreamedType Stream::get() { .TheMorphemes.push_back(Morpheme()); ThePreviousCase = PreviousCaseType(Character_); continue; - case L'*': + case '*': if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': - case L'$': + case '[': + case ']': + case '$': break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '*' expected to follow '[', ']', or '$' or to " - L"follow '/' immediately"; + << "', '*' expected to follow '[', ']', or '$' or to " + "follow '/' immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } } push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'<': + case '<': if (!ThePreviousCase) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"', '<' expected to follow '[', to follow '>' " - L"immediately, or to follow '#', '/' or '+' not " - L"immediately"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "', '<' expected to follow '[', to follow '>' " + "immediately, or to follow '#', '/' or '+' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'/': + case '/': break; - case L'#': - //std::wcerr << L"[306] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ; - case L'+': + case '#': + //std::cerr << "[306] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; + case '+': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '<' expected to follow '[', '/', '>'" - L"immediately, or to follow '#' or '+' not " - L"immediately"; + << "', '<' expected to follow '[', '/', '>'" + "immediately, or to follow '#' or '+' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; - case L'>': + case '>': break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '<' expected to follow '[', to follow '>' " - L"immediately, or to follow '#', '/' or '+' not " - L"immediately"; + << "', '<' expected to follow '[', to follow '>' " + "immediately, or to follow '#', '/' or '+' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -329,137 +329,137 @@ StreamedType Stream::get() { .TheTags.push_back(Tag()); ThePreviousCase = PreviousCaseType(Character_); continue; - case L'>': + case '>': if (!ThePreviousCase) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"', '>' expected to follow '[' or to follow '<' not " - L"immediately"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "', '>' expected to follow '[' or to follow '<' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'<': + case '<': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '>' expected to follow '[' or to follow '<' not " - L"immediately"; + << "', '>' expected to follow '[' or to follow '<' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } ThePreviousCase = PreviousCaseType(Character_); continue; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '>' expected to follow '[' or to follow '<' not " - L"immediately"; + << "', '>' expected to follow '[' or to follow '<' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } std::abort(); - case L'#': - //std::wcerr << L"[391] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ; + case '#': + //std::cerr << "[391] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': - case L'^': - case L'$': + case '[': + case ']': + case '^': + case '$': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'/': + case '/': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '#' expected to follow '[', ']', or '$', to " - L"follow '>' immediately, or to follow '/' not " - L"immediately"; + << "', '#' expected to follow '[', ']', or '$', to " + "follow '>' immediately, or to follow '/' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; - case L'>': + case '>': if (!ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' not immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' not immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '#' expected to follow '[', ']', or '$', to " - L"follow '>' immediately, or to follow '/' not " - L"immediately"; + << "', '#' expected to follow '[', ']', or '$', to " + "follow '>' immediately, or to follow '/' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '#' expected to follow '[', ']', or '$', to follow " - L"'>' immediately, or to follow '/' not immediately"; + << "', '#' expected to follow '[', ']', or '$', to follow " + "'>' immediately, or to follow '/' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } ThePreviousCase = PreviousCaseType(Character_); push_back_Character(TheStreamedType, Lemma, Character_); - //std::wcerr << L"[440] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ; + //std::cerr << "[440] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; continue; } push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'+': + case '+': if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': - case L'^': - case L'/': - case L'$': + case '[': + case ']': + case '^': + case '/': + case '$': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'>': + case '>': if (!ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' not immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' not immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '+' expected to follow '[', ']', '^', '/' or " - L"'$', to follow '>' immediately, or to follow '#' " - L"not immediately"; + << "', '+' expected to follow '[', ']', '^', '/' or " + "'$', to follow '>' immediately, or to follow '#' " + "not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; - case L'#': + case '#': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '+' expected to follow '[', ']', or '$', to " - L"follow '>' immediately, or to follow '#' not " - L"immediately"; + << "', '+' expected to follow '[', ']', or '$', to " + "follow '>' immediately, or to follow '#' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; default: { - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '+' expected to follow '[', ']', or '$', to follow " - L"'>' immediately, or to follow '#' not immediately"; + << "', '+' expected to follow '[', ']', or '$', to follow " + "'>' immediately, or to follow '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } } @@ -472,87 +472,87 @@ StreamedType Stream::get() { push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'$': + case '$': if (!ThePreviousCase) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"', '$' expected to follow '[', to follow '>' " - L"immediately, or to follow '*' or '#' not immediately"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "', '$' expected to follow '[', to follow '>' " + "immediately, or to follow '*' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'*': + case '*': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '$' expected to follow '[', to follow '>' " - L"immediately, or to follow '*' or '#' not immediately"; + << "', '$' expected to follow '[', to follow '>' " + "immediately, or to follow '*' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } if (TheFlags.getDebug()) { if (Lemma != TheStreamedType.TheLexicalUnit->TheSurfaceForm) - std::wcerr << L"unexpected lemma \"" << Lemma - << L"\", expected \"" + std::cerr << "unexpected lemma \"" << Lemma + << "\", expected \"" << TheStreamedType.TheLexicalUnit->TheSurfaceForm - << L"\"\n"; + << "\"\n"; } ThePreviousCase = PreviousCaseType(Character_); return TheStreamedType; - case L'>': + case '>': if (!ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' not immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' not immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '$' expected to follow '[', to follow '>' " - L"immediately, or to follow '*' or '#' not immediately"; + << "', '$' expected to follow '[', to follow '>' " + "immediately, or to follow '*' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; - case L'#': + case '#': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '$' expected to follow '[', to follow '>' " - L"immediately, or to follow '*' or '#' not immediately"; + << "', '$' expected to follow '[', to follow '>' " + "immediately, or to follow '*' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '$' expected to follow '[', to follow '>' " - L"immediately, or to follow '*' or '#' not immediately"; + << "', '$' expected to follow '[', to follow '>' " + "immediately, or to follow '*' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } ThePreviousCase = PreviousCaseType(Character_); return TheStreamedType; - case L'\n': + case '\n': if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': - case L'$': + case '[': + case ']': + case '$': break; default: - std::wstringstream Message; - Message << L"unexpected newline following '" + std::stringstream Message; + Message << "unexpected newline following '" << ThePreviousCase->ThePreviousCase - << L"', newline expected to follow '[', ']', or '$'"; + << "', newline expected to follow '[', ']', or '$'"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } } @@ -572,14 +572,14 @@ StreamedType Stream::get() { if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L']': - case L'$': + case ']': + case '$': break; default: - std::wstringstream Message; - Message << L"unexpected end-of-file following '" + std::stringstream Message; + Message << "unexpected end-of-file following '" << ThePreviousCase->ThePreviousCase - << L"', end-of-file expected to follow ']' or '$'"; + << "', end-of-file expected to follow ']' or '$'"; throw Exception::Stream::UnexpectedEndOfFile(Message_what(Message)); } } @@ -604,38 +604,38 @@ bool Stream::peekIsBlank() { std::ios::iostate state = TheCharacterStream.rdstate(); int pos = TheCharacterStream.tellg(); - const wchar_t newline1 = TheCharacterStream.get(); - const wchar_t newline2 = TheCharacterStream.get(); + const UChar newline1 = TheCharacterStream.get(); + const UChar newline2 = TheCharacterStream.get(); TheCharacterStream.clear(state); TheCharacterStream.seekg(pos); - return newline1 == L'\n' && newline2 == L'\n'; + return newline1 == '\n' && newline2 == '\n'; } bool Stream::flush_() const { return private_flush_; } void Stream::outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output, TaggerFlags &flags) { + std::ostream &output, TaggerFlags &flags) { using namespace std::rel_ops; - output << L"^"; + output << "^"; if (lexical_unit.TheAnalyses.empty() || !analysis) { if (flags.getShowSuperficial()) - output << lexical_unit.TheSurfaceForm << L"/"; + output << lexical_unit.TheSurfaceForm << "/"; - output << L"*" << lexical_unit.TheSurfaceForm << L"$"; + output << "*" << lexical_unit.TheSurfaceForm << "$"; return; } if (flags.getMark()) { if (lexical_unit.TheAnalyses.size() != 1) - output << L"="; + output << "="; } if (flags.getShowSuperficial()) - output << lexical_unit.TheSurfaceForm << L"/"; + output << lexical_unit.TheSurfaceForm << "/"; output << *analysis; @@ -645,14 +645,14 @@ void Stream::outputLexicalUnit( // Call .end() each iteration to save memory. other_analysis != lexical_unit.TheAnalyses.end(); ++other_analysis) { if (*other_analysis != *analysis) - output << L"/" << *other_analysis; + output << "/" << *other_analysis; } } - output << L"$"; + output << "$"; } -Stream::PreviousCaseType::PreviousCaseType(const wchar_t &PreviousCase_) +Stream::PreviousCaseType::PreviousCaseType(const UChar &PreviousCase_) : ThePreviousCase(PreviousCase_), isPreviousCharacter(true) {} bool Stream::is_eof_throw_if_not_TheCharacterStream_good() const { @@ -660,12 +660,12 @@ bool Stream::is_eof_throw_if_not_TheCharacterStream_good() const { return true; if (!TheCharacterStream) { - std::wcerr << L"State bad " << TheCharacterStream.good() << " " + std::cerr << "State bad " << TheCharacterStream.good() << " " << TheCharacterStream.eof() << " " << TheCharacterStream.fail() << " " << TheCharacterStream.bad() << "\n"; - std::wstringstream Message; - Message << L"can't get const wchar_t: TheCharacterStream not good"; + std::stringstream Message; + Message << "can't get const UChar: TheCharacterStream not good"; throw Exception::Stream::TheCharacterStream_not_good( Message_what(Message)); } @@ -673,28 +673,28 @@ bool Stream::is_eof_throw_if_not_TheCharacterStream_good() const { return false; } -std::wstring Stream::Message_what(const std::wstringstream &Message) const { - std::wstringstream what_; +UString Stream::Message_what(const std::stringstream &Message) const { + std::stringstream what_; if (TheFilename) - what_ << std::wstring(TheFilename->begin(), TheFilename->end()) << L": "; + what_ << UString(TheFilename->begin(), TheFilename->end()) << ": "; - what_ << TheLineNumber << L":" << TheLine.size() << L": " << Message.str() - << L'\n' << TheLine << L'\n' << std::wstring(TheLine.size() - 1, L' ') - << L'^'; - return what_.str(); + what_ << TheLineNumber << ":" << TheLine.size() << ": " << Message.str() + << '\n' << TheLine << '\n' << UString(TheLine.size() - 1, ' ') + << '^'; + return to_ustring(what_.str().c_str()); } bool Stream::is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, - std::wstring &Lemma, - const wchar_t &Character_) { + UString &Lemma, + const UChar &Character_) { if (isTheCharacterStream_eof(StreamedType_, Lemma, Character_)) return true; if (!TheCharacterStream) { - std::wstringstream Message; - Message << L"can't get const wchar_t: TheCharacterStream not good"; + std::stringstream Message; + Message << "can't get const UChar: TheCharacterStream not good"; throw Exception::Stream::TheCharacterStream_not_good( Message_what(Message)); } @@ -703,13 +703,13 @@ Stream::is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, } bool Stream::isTheCharacterStream_eof(StreamedType &StreamedType_, - std::wstring &Lemma, - const wchar_t &Character_) { + UString &Lemma, + const UChar &Character_) { if (TheCharacterStream.eof()) return true; if (TheFlags.getNullFlush()) { - if (Character_ == L'\0') { + if (Character_ == '\0') { push_back_Character(StreamedType_, Lemma, Character_); private_flush_ = true; return true; @@ -720,55 +720,55 @@ bool Stream::isTheCharacterStream_eof(StreamedType &StreamedType_, } void Stream::push_back_Character(StreamedType &StreamedType_, - std::wstring &Lemma, - const wchar_t &Character_) { + UString &Lemma, + const UChar &Character_) { if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': StreamedType_.TheString += Character_; break; - case L']': + case ']': StreamedType_.TheString += Character_; break; - case L'^': + case '^': StreamedType_.TheLexicalUnit->TheSurfaceForm += Character_; break; - case L'/': + case '/': StreamedType_.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.back() .TheLemma.push_back(Character_); break; - case L'*': + case '*': Lemma += Character_; break; - case L'<': + case '<': StreamedType_.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.back() .TheTags.back() .TheTag += Character_; break; - case L'>': + case '>': StreamedType_.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.back() .TheLemma.push_back(Character_); break; - case L'#': + case '#': StreamedType_.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.back() .TheLemma.push_back(Character_); break; - case L'+': + case '+': StreamedType_.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.back() .TheLemma.push_back(Character_); break; - case L'$': + case '$': StreamedType_.TheString += Character_; break; default: - std::wstringstream Message; - Message << L"unexpected previous reserved or special character '" - << ThePreviousCase->ThePreviousCase << L"'"; + std::stringstream Message; + Message << "unexpected previous reserved or special character '" + << ThePreviousCase->ThePreviousCase << "'"; throw Exception::Stream::UnexpectedPreviousCase(Message_what(Message)); } @@ -779,18 +779,18 @@ void Stream::push_back_Character(StreamedType &StreamedType_, StreamedType_.TheString += Character_; } -void Stream::case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma, - const wchar_t &Character_) { +void Stream::case_0x5c(StreamedType &StreamedType_, UString &Lemma, + const UChar &Character_) { push_back_Character(StreamedType_, Lemma, Character_); { - const wchar_t Character_ = TheCharacterStream.get(); + const UChar Character_ = TheCharacterStream.get(); if (is_eof_throw_if_not_TheCharacterStream_good(StreamedType_, Lemma, Character_)) { - std::wstringstream Message; - Message << L"unexpected end-of-file following '\\', end-of-file " - L"expected to follow ']' or '$'"; + std::stringstream Message; + Message << "unexpected end-of-file following '\\', end-of-file " + "expected to follow ']' or '$'"; throw Exception::Stream::UnexpectedEndOfFile(Message_what(Message)); } diff --git a/apertium/stream.h b/apertium/stream.h index e1603ea..69c266a 100644 --- a/apertium/stream.h +++ b/apertium/stream.h @@ -30,11 +30,11 @@ namespace Apertium { class Stream { public: Stream(TaggerFlags &Flags_); - Stream(TaggerFlags &Flags_, std::wifstream &CharacterStream_, + Stream(TaggerFlags &Flags_, std::ifstream &CharacterStream_, const char *const Filename_); - Stream(TaggerFlags &Flags_, std::wifstream &CharacterStream_, + Stream(TaggerFlags &Flags_, std::ifstream &CharacterStream_, const std::string &Filename_); - Stream(TaggerFlags &Flags_, std::wifstream &CharacterStream_, + Stream(TaggerFlags &Flags_, std::ifstream &CharacterStream_, const std::stringstream &Filename_); StreamedType get(); StreamedType peek(); @@ -43,30 +43,30 @@ public: static void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output, TaggerFlags &flags); + std::ostream &output, TaggerFlags &flags); std::size_t TheLineNumber; private: class PreviousCaseType { public: - PreviousCaseType(const wchar_t &PreviousCase_); - wchar_t ThePreviousCase; + PreviousCaseType(const UChar &PreviousCase_); + UChar ThePreviousCase; bool isPreviousCharacter : 1; }; bool is_eof_throw_if_not_TheCharacterStream_good() const; - std::wstring Message_what(const std::wstringstream &Message) const; + UString Message_what(const std::stringstream &Message) const; bool is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, - std::wstring &Lemma, - const wchar_t &Character_); + UString &Lemma, + const UChar &Character_); bool isTheCharacterStream_eof(StreamedType &StreamedType_, - std::wstring &Lemma, const wchar_t &Character_); - void push_back_Character(StreamedType &StreamedType_, std::wstring &Lemma, - const wchar_t &Character_); - void case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma, - const wchar_t &Character_); - std::wistream &TheCharacterStream; + UString &Lemma, const UChar &Character_); + void push_back_Character(StreamedType &StreamedType_, UString &Lemma, + const UChar &Character_); + void case_0x5c(StreamedType &StreamedType_, UString &Lemma, + const UChar &Character_); + std::istream &TheCharacterStream; Optional TheFilename; - std::wstring TheLine; + UString TheLine; TaggerFlags &TheFlags; bool private_flush_ : 1; Optional ThePreviousCase; diff --git a/apertium/stream_tagger.cc b/apertium/stream_tagger.cc index 617588e..a00735c 100644 --- a/apertium/stream_tagger.cc +++ b/apertium/stream_tagger.cc @@ -9,7 +9,7 @@ StreamTagger::~StreamTagger() {} void StreamTagger::outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output) { + std::ostream &output) { Stream::outputLexicalUnit(lexical_unit, analysis, output, TheFlags); } } diff --git a/apertium/stream_tagger.h b/apertium/stream_tagger.h index 11e93ca..2d0e123 100644 --- a/apertium/stream_tagger.h +++ b/apertium/stream_tagger.h @@ -15,11 +15,11 @@ public: virtual ~StreamTagger(); virtual void serialise(std::ostream &Serialised_basic_Tagger) const = 0; virtual void deserialise(std::istream &Serialised_basic_Tagger) = 0; - virtual void tag(Stream &Input, std::wostream &Output) = 0; + virtual void tag(Stream &Input, std::ostream &Output) = 0; virtual void train(Stream &TaggedCorpus) = 0; void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output); + std::ostream &output); }; } diff --git a/apertium/streamed_type.h b/apertium/streamed_type.h index f73ec21..8a70b44 100644 --- a/apertium/streamed_type.h +++ b/apertium/streamed_type.h @@ -24,7 +24,7 @@ namespace Apertium { class StreamedType { public: - std::wstring TheString; + UString TheString; Optional TheLexicalUnit; }; } diff --git a/apertium/string_utils.cc b/apertium/string_utils.cc deleted file mode 100644 index 8ae064b..0000000 --- a/apertium/string_utils.cc +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (C) 2006 Universitat d'Alacant / Universidad de Alicante - * author: Felipe Sánchez-Martínez - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#include -#include -#include -#include - -#ifdef _MSC_VER -#define snprintf _snprintf -#endif - -//Delete white spaces from the end and the begining of the string -wstring -StringUtils::trim(wstring const &str) -{ - if(str == L"") - { - return L""; - } - - int begin = 0, end = str.size() - 1; - - while(begin < end && iswspace(str[begin])) - { - begin++; - } - - while(end > begin && iswspace(str[end])) - { - end--; - } - - if(!iswspace(str[end])) - { - end++; - } - - return str.substr(begin, end-begin); -} - -vector -StringUtils::split_wstring(wstring const &input, wstring const &delimiter) -{ - unsigned pos; - int new_pos; - vector result; - wstring s = L""; - pos=0; - - while(pos const &v) -{ - wstring s = L""; - for(unsigned i=0; i0) - s+=L' '; - s.append(v[i]); - } - return s; -} - -wstring -StringUtils::substitute(wstring const &source, wstring const &olds, wstring const &news) { - wstring s = source; - - unsigned int p=s.find(olds , 0); - while (p!=static_cast(wstring::npos)) - { - s.replace(p, olds.length(), news); - p+=news.length(); - p=s.find(olds,p); - } - - return s; -} - -wstring -StringUtils::itoa(int n) -{ - return XMLParseUtil::stows(itoa_string(n)); -} - -string -StringUtils::itoa_string(int n) -{ - char str[256]; - snprintf(str, 256, "%d", n); - return str; -} - -wstring -StringUtils::ftoa(double f) -{ - char str[256]; - sprintf(str, "%f",f); - return XMLParseUtil::stows(str); -} - -wstring -StringUtils::tolower(wstring const &s) -{ - wstring l=s; - for(unsigned i=0; i. - */ -#ifndef __STRINGUTILS_H_ -#define __STRINGUTILS_H_ - -#include -#include -#include - -using namespace std; - -namespace Apertium -{ - bool operator==(string const &s1, string const &s2); - bool operator==(string const &s1, char const *s2); - bool operator==(char const *s1, string const &s2); - bool operator!=(string const &s1, string const &s2); - bool operator!=(string const &s1, char const *s2); - bool operator!=(char const *s1, string const &s2); -} - -class StringUtils { - public: - - static wstring trim(wstring const &str); - - static vector split_wstring(wstring const &input, wstring const &delimiter); - - static wstring vector2wstring(vector const &v); - - //Replace each ocurrence of the string 'olds' by the string 'news' in string 'source' - static wstring substitute(const wstring &source, const wstring &olds, const wstring &news); - - static wstring itoa(int n); - - static string itoa_string(int n); - - static wstring ftoa(double f); - - static wstring tolower(wstring const &s); - - static wstring toupper(wstring const &s); -}; - -std::wostream & operator<< (std::wostream & ostr, std::string const & str); - -#endif diff --git a/apertium/tag.cc b/apertium/tag.cc index 0aba018..2983f94 100644 --- a/apertium/tag.cc +++ b/apertium/tag.cc @@ -24,11 +24,16 @@ bool operator==(const Tag &a, const Tag &b) { return a.TheTag == b.TheTag; } bool operator<(const Tag &a, const Tag &b) { return a.TheTag < b.TheTag; } -Tag::operator std::wstring() const { +Tag::operator UString() const { if (TheTag.empty()) throw Exception::Tag::TheTags_empty("can't convert Tag comprising empty " - "TheTag std::wstring to std::wstring"); + "TheTag UString to UString"); - return L"<" + TheTag + L">"; + UString ret; + ret.reserve(TheTag.size() + 2); + ret += '<'; + ret.append(TheTag); + ret += '>'; + return ret; } } diff --git a/apertium/tag.h b/apertium/tag.h index 147ea0d..62698e8 100644 --- a/apertium/tag.h +++ b/apertium/tag.h @@ -16,15 +16,15 @@ #ifndef TAG_H #define TAG_H -#include +#include namespace Apertium { class Tag { public: friend bool operator==(const Tag &a, const Tag &b); friend bool operator<(const Tag &a, const Tag &b); - operator std::wstring() const; - std::wstring TheTag; + operator UString() const; + UString TheTag; }; } diff --git a/apertium/tagger.cc b/apertium/tagger.cc index a7f3cc0..6e8372c 100644 --- a/apertium/tagger.cc +++ b/apertium/tagger.cc @@ -285,7 +285,7 @@ apertium_tagger::apertium_tagger(int &argc, char **&argv) std::abort(); } } catch (const ExceptionType &ExceptionType_) { - std::wcerr << "apertium-tagger: " << ExceptionType_.what() << std::endl; + std::cerr << "apertium-tagger: " << ExceptionType_.what() << std::endl; throw Exception::apertium_tagger::err_Exception(""); } } @@ -294,7 +294,7 @@ apertium_tagger::~apertium_tagger() {} void apertium_tagger::help() { - std::wcerr << + std::cerr << "Usage: apertium-tagger [OPTION]... -g SERIALISED_TAGGER \\\n" " [INPUT \\\n" " [OUTPUT]]\n" @@ -339,27 +339,27 @@ void apertium_tagger::help() { options_description_.push_back(std::make_pair("-p, --show-superficial", "with -g, output each lexical unit's surface form")); options_description_.push_back(std::make_pair("-z, --null-flush", "with -g, flush the output after getting each null character")); align::align_(options_description_); - std::wcerr << '\n'; + std::cerr << '\n'; options_description_.clear(); options_description_.push_back(std::make_pair("-u, --unigram=MODEL", "use unigram algorithm MODEL from ")); align::align_(options_description_); - std::wcerr << '\n'; + std::cerr << '\n'; options_description_.clear(); options_description_.push_back(std::make_pair("-w, --sliding-window", "use the Light Sliding Window algorithm")); options_description_.push_back(std::make_pair("-x, --perceptron", "use the averaged perceptron algorithm")); options_description_.push_back(std::make_pair("-e, --skip-on-error", "with -xs, ignore certain types of errors with the training corpus")); align::align_(options_description_); - std::wcerr << '\n'; + std::cerr << '\n'; options_description_.clear(); options_description_.push_back(std::make_pair("-g, --tagger", "disambiguate the input")); align::align_(options_description_); - std::wcerr << '\n'; + std::cerr << '\n'; options_description_.clear(); options_description_.push_back(std::make_pair("-r, --retrain=ITERATIONS", "with -u: exit;\notherwise: retrain the tagger with ITERATIONS unsupervised iterations")); options_description_.push_back(std::make_pair("-s, --supervised=ITERATIONS", "with -u: train the tagger with a hand-tagged corpus;\nwith -w: exit;\notherwise: initialise the tagger with a hand-tagged corpus and retrain it with ITERATIONS unsupervised iterations")); options_description_.push_back(std::make_pair("-t, --train=ITERATIONS", "with -u: exit;\notherwise: train the tagger with ITERATIONS unsupervised iterations")); align::align_(options_description_); - std::wcerr << '\n'; + std::cerr << '\n'; options_description_.clear(); options_description_.push_back(std::make_pair("-h, --help", "display this help and exit")); align::align_(options_description_); @@ -550,24 +550,12 @@ void apertium_tagger::init_FILE_Tagger(FILE_Tagger &FILE_Tagger_, string const & MorphoStream* apertium_tagger::setup_untagged_morpho_stream( FILE_Tagger &FILE_Tagger_, char *DicFn, char *UntaggedFn, - FILE **Dictionary, FILE **UntaggedCorpus) { - if (*TheFunctionType != Retrain) { - *Dictionary = try_open_file_utf8("DICTIONARY", DicFn, "r"); - } + UFILE* *UntaggedCorpus) { *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", UntaggedFn, "r"); - FILE_Tagger_.read_dictionary(*Dictionary); - - return new FileMorphoStream(*UntaggedCorpus, true, &FILE_Tagger_.get_tagger_data()); -} + FILE_Tagger_.read_dictionary(DicFn); -void apertium_tagger::close_untagged_files( - char *DicFn, char *UntaggedFn, - FILE *Dictionary, FILE *UntaggedCorpus) { - if (*TheFunctionType == Supervised || *TheFunctionType == Train) { - try_close_file("DICTIONARY", DicFn, Dictionary); - } - try_close_file("UNTAGGED_CORPUS", UntaggedFn, UntaggedCorpus); + return new FileMorphoStream(UntaggedFn, true, &FILE_Tagger_.get_tagger_data()); } /** Implementation of flags/subcommands */ @@ -591,20 +579,20 @@ void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { } if (nonoptarg < 2) { Stream Input(TheFlags); - StreamTagger_.tag(Input, std::wcout); + StreamTagger_.tag(Input, std::cout); return; } - std::wifstream Input_stream; + std::ifstream Input_stream; try_open_fstream("INPUT", argv[optind + 1], Input_stream); if (nonoptarg < 3) { Stream Input(TheFlags, Input_stream, argv[optind + 1]); - StreamTagger_.tag(Input, std::wcout); + StreamTagger_.tag(Input, std::cout); return; } - std::wofstream Output_stream; + std::ofstream Output_stream; try_open_fstream("OUTPUT", argv[optind + 2], Input_stream); Stream Input(TheFlags, Input_stream, argv[optind + 1]); @@ -628,12 +616,12 @@ void apertium_tagger::s_StreamTaggerTrainer( expect_file_arguments(nonoptarg, 2); } - std::wifstream TaggedCorpus_stream; + std::ifstream TaggedCorpus_stream; try_open_fstream("TAGGED_CORPUS", argv[optind + 1], TaggedCorpus_stream); Stream TaggedCorpus(TheFlags, TaggedCorpus_stream, argv[optind + 1]); if (*TheFunctionTypeType == Perceptron) { - std::wifstream UntaggedCorpus_stream; + std::ifstream UntaggedCorpus_stream; try_open_fstream("UNTAGGED_CORPUS", argv[optind + 2], UntaggedCorpus_stream); Stream UntaggedCorpus(TheFlags, UntaggedCorpus_stream, argv[optind + 2]); @@ -661,21 +649,16 @@ void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger); TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); TaggerWord::generate_marks = TheFlags.getMark(); - if (nonoptarg < 2) - FILE_Tagger_.tagger(stdin, stdout); - else { - FILE *Input = try_open_file("INPUT", argv[optind + 1], "r"); - - if (nonoptarg < 3) - FILE_Tagger_.tagger(Input, stdout); - else { - FILE *Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); - FILE_Tagger_.tagger(Input, Output); - try_close_file("OUTPUT", argv[optind + 2], Output); + const char* infile = NULL; + UFILE* Output = u_finit(stdout, NULL, NULL); + if (nonoptarg >= 2) { + infile = argv[optind + 1]; + if (nonoptarg >= 3) { + Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); } - - try_close_file("INPUT", argv[optind + 1], Input); } + FILE_Tagger_.tagger(infile, Output); + u_fclose(Output); } void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { @@ -697,17 +680,15 @@ void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); - FILE *UntaggedCorpus; + UFILE* UntaggedCorpus; MorphoStream* ms = setup_untagged_morpho_stream( FILE_Tagger_, NULL, UntaggedFn, - NULL, &UntaggedCorpus); + &UntaggedCorpus); FILE_Tagger_.train(*ms, TheFunctionTypeOptionArgument); delete ms; - close_untagged_files( - NULL, UntaggedFn, - NULL, UntaggedCorpus); + u_fclose(UntaggedCorpus); Serialised_FILE_Tagger = try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); @@ -732,26 +713,20 @@ void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { &TsxFn, &ProbFn); init_FILE_Tagger(FILE_Tagger_, TsxFn); - FILE *Dictionary, *UntaggedCorpus; + UFILE* UntaggedCorpus; MorphoStream* ms = setup_untagged_morpho_stream( FILE_Tagger_, DicFn, UntaggedFn, - &Dictionary, &UntaggedCorpus); - FILE *TaggedCorpus = try_open_file("TAGGED_CORPUS", TaggedFn, "r"); - FileMorphoStream tms(TaggedCorpus, true, &FILE_Tagger_.get_tagger_data()); + &UntaggedCorpus); + FileMorphoStream tms(TaggedFn, true, &FILE_Tagger_.get_tagger_data()); FILE_Tagger_.init_probabilities_from_tagged_text_(tms, *ms); - try_close_file("TAGGED_CORPUS", TaggedFn, TaggedCorpus); delete ms; - close_untagged_files( - DicFn, UntaggedFn, - Dictionary, UntaggedCorpus); + u_fclose(UntaggedCorpus); if (do_unsup) { - FILE *Corpus = try_open_file_utf8("CORPUS", CrpFn, "r"); - FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); - try_close_file("CORPUS", CrpFn, Corpus); - } + FILE_Tagger_.train(CrpFn, TheFunctionTypeOptionArgument); + } FILE *Serialised_FILE_Tagger = try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); @@ -773,17 +748,15 @@ void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { &TsxFn, &ProbFn); init_FILE_Tagger(FILE_Tagger_, TsxFn); - FILE *Dictionary, *UntaggedCorpus; + UFILE* UntaggedCorpus; MorphoStream* ms = setup_untagged_morpho_stream( FILE_Tagger_, DicFn, UntaggedFn, - &Dictionary, &UntaggedCorpus); + &UntaggedCorpus); FILE_Tagger_.init_and_train(*ms, TheFunctionTypeOptionArgument); delete ms; - close_untagged_files( - DicFn, UntaggedFn, - Dictionary, UntaggedCorpus); + u_fclose(UntaggedCorpus); FILE *Serialised_FILE_Tagger = try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); diff --git a/apertium/tagger.h b/apertium/tagger.h index 1d00620..cf3017b 100644 --- a/apertium/tagger.h +++ b/apertium/tagger.h @@ -62,10 +62,7 @@ private: MorphoStream* setup_untagged_morpho_stream( FILE_Tagger &FILE_Tagger_, char *DicFn, char *UntaggedFn, - FILE **Dictionary, FILE **UntaggedCorpus); - void close_untagged_files( - char *DicFn, char *UntaggedFn, - FILE *Dictionary, FILE *UntaggedCorpus); + UFILE **UntaggedCorpus); void g_StreamTagger(StreamTagger &StreamTagger_); void s_StreamTaggerTrainer(StreamTagger &StreamTaggerTrainer_); diff --git a/apertium/tagger_data.cc b/apertium/tagger_data.cc index 759642d..0eb93ec 100644 --- a/apertium/tagger_data.cc +++ b/apertium/tagger_data.cc @@ -17,9 +17,7 @@ #include #include #include -#include - -using namespace Apertium; +#include void TaggerData::copy(TaggerData const &o) @@ -94,38 +92,38 @@ TaggerData::setForbidRules(vector &fr) forbid_rules = fr; } -map & +map & TaggerData::getTagIndex() { return tag_index; } -const map & +const map & TaggerData::getTagIndex() const { return tag_index; } void -TaggerData::setTagIndex(map const &ti) +TaggerData::setTagIndex(map const &ti) { tag_index = ti; } -vector & +vector & TaggerData::getArrayTags() { return array_tags; } -const vector & +const vector & TaggerData::getArrayTags() const { return array_tags; } void -TaggerData::setArrayTags(vector const &at) +TaggerData::setArrayTags(vector const &at) { array_tags = at; } @@ -148,38 +146,38 @@ TaggerData::setEnforceRules(vector const &tear) enforce_rules = tear; } -vector & +vector & TaggerData::getPreferRules() { return prefer_rules; } -const vector & +const vector & TaggerData::getPreferRules() const { return prefer_rules; } void -TaggerData::setPreferRules(vector const &pr) +TaggerData::setPreferRules(vector const &pr) { prefer_rules = pr; } -vector & +vector & TaggerData::getDiscardRules() { return discard; } -const vector & +const vector & TaggerData::getDiscardRules() const { return discard; } void -TaggerData::setDiscardRules(vector const &v) +TaggerData::setDiscardRules(vector const &v) { discard = v; } @@ -233,7 +231,7 @@ TaggerData::setPatternList(PatternList const &pl) } void -TaggerData::addDiscard(wstring const &tags) +TaggerData::addDiscard(UString const &tags) { discard.push_back(tags); } diff --git a/apertium/tagger_data.h b/apertium/tagger_data.h index 2190c8d..2b7dd26 100644 --- a/apertium/tagger_data.h +++ b/apertium/tagger_data.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -35,14 +34,14 @@ class TaggerData protected: set open_class; vector forbid_rules; - map tag_index; - vector array_tags; + map tag_index; + vector array_tags; vector enforce_rules; - vector prefer_rules; + vector prefer_rules; ConstantManager constants; Collection output; PatternList plist; - vector discard; + vector discard; void copy(TaggerData const &o); public: @@ -59,25 +58,25 @@ public: const vector & getForbidRules() const; void setForbidRules(vector &fr); - map & getTagIndex(); - const map & getTagIndex() const; - void setTagIndex(map const &ti); + map & getTagIndex(); + const map & getTagIndex() const; + void setTagIndex(map const &ti); - vector & getArrayTags(); - const vector & getArrayTags() const; - void setArrayTags(vector const &at); + vector & getArrayTags(); + const vector & getArrayTags() const; + void setArrayTags(vector const &at); vector & getEnforceRules(); const vector & getEnforceRules() const; void setEnforceRules(vector const &tear); - vector & getPreferRules(); - const vector & getPreferRules() const; - void setPreferRules(vector const &pr); + vector & getPreferRules(); + const vector & getPreferRules() const; + void setPreferRules(vector const &pr); - vector & getDiscardRules(); - const vector & getDiscardRules() const; - void setDiscardRules(vector const &dr); + vector & getDiscardRules(); + const vector & getDiscardRules() const; + void setDiscardRules(vector const &dr); ConstantManager & getConstants(); const ConstantManager & getConstants() const; @@ -91,7 +90,7 @@ public: PatternList & getPatternList(); const PatternList & getPatternList() const; - void addDiscard(wstring const &tags); + void addDiscard(UString const &tags); }; #endif diff --git a/apertium/tagger_data_hmm.cc b/apertium/tagger_data_hmm.cc index 0b60776..759ac73 100644 --- a/apertium/tagger_data_hmm.cc +++ b/apertium/tagger_data_hmm.cc @@ -18,7 +18,7 @@ #include #include #include -#include +#include using namespace Apertium; @@ -191,13 +191,13 @@ TaggerDataHMM::read(FILE *in) // array_tags for(int i = Compression::multibyte_read(in); i != 0; i--) { - array_tags.push_back(Compression::wstring_read(in)); + array_tags.push_back(Compression::string_read(in)); } // tag_index for(int i = Compression::multibyte_read(in); i != 0; i--) { - wstring tmp = Compression::wstring_read(in); + UString tmp = Compression::string_read(in); tag_index[tmp] = Compression::multibyte_read(in); } @@ -216,7 +216,7 @@ TaggerDataHMM::read(FILE *in) // prefer_rules for(int i = Compression::multibyte_read(in); i != 0; i--) { - prefer_rules.push_back(Compression::wstring_read(in)); + prefer_rules.push_back(Compression::string_read(in)); } // constants @@ -280,7 +280,7 @@ TaggerDataHMM::read(FILE *in) for(unsigned int i = 0; i < limit; i++) { - discard.push_back(Compression::wstring_read(in)); + discard.push_back(Compression::string_read(in)); } } @@ -310,16 +310,14 @@ TaggerDataHMM::write(FILE *out) Compression::multibyte_write(array_tags.size(), out); for(unsigned int i = 0, limit = array_tags.size(); i != limit; i++) { - Compression::wstring_write(array_tags[i], out); + Compression::string_write(array_tags[i], out); } // tag_index Compression::multibyte_write(tag_index.size(), out); - for(map::iterator it = tag_index.begin(), limit = tag_index.end(); - it != limit; it++) - { - Compression::wstring_write(it->first, out); - Compression::multibyte_write(it->second, out); + for (auto& it : tag_index) { + Compression::string_write(it.first, out); + Compression::multibyte_write(it.second, out); } // enforce_rules @@ -338,7 +336,7 @@ TaggerDataHMM::write(FILE *out) Compression::multibyte_write(prefer_rules.size(), out); for(unsigned int i = 0, limit = prefer_rules.size(); i != limit; i++) { - Compression::wstring_write(prefer_rules[i], out); + Compression::string_write(prefer_rules[i], out); } // constants @@ -396,7 +394,7 @@ TaggerDataHMM::write(FILE *out) Compression::multibyte_write(discard.size(), out); for(unsigned int i = 0, limit = discard.size(); i != limit; i++) { - Compression::wstring_write(discard[i], out); + Compression::string_write(discard[i], out); } } } diff --git a/apertium/tagger_data_lsw.cc b/apertium/tagger_data_lsw.cc index 3a79aca..d8a521f 100644 --- a/apertium/tagger_data_lsw.cc +++ b/apertium/tagger_data_lsw.cc @@ -18,7 +18,7 @@ #include #include #include -#include +#include using namespace Apertium; @@ -136,13 +136,13 @@ TaggerDataLSW::read(FILE *in) // array_tags for(int i = Compression::multibyte_read(in); i != 0; i--) { - array_tags.push_back(Compression::wstring_read(in)); + array_tags.push_back(Compression::string_read(in)); } // tag_index for(int i = Compression::multibyte_read(in); i != 0; i--) { - wstring tmp = Compression::wstring_read(in); + UString tmp = Compression::string_read(in); tag_index[tmp] = Compression::multibyte_read(in); } @@ -161,7 +161,7 @@ TaggerDataLSW::read(FILE *in) // prefer_rules for(int i = Compression::multibyte_read(in); i != 0; i--) { - prefer_rules.push_back(Compression::wstring_read(in)); + prefer_rules.push_back(Compression::string_read(in)); } // constants @@ -212,7 +212,7 @@ TaggerDataLSW::read(FILE *in) for(unsigned int i = 0; i < limit; i++) { - discard.push_back(Compression::wstring_read(in)); + discard.push_back(Compression::string_read(in)); } } @@ -242,16 +242,14 @@ TaggerDataLSW::write(FILE *out) Compression::multibyte_write(array_tags.size(), out); for(unsigned int i = 0, limit = array_tags.size(); i != limit; i++) { - Compression::wstring_write(array_tags[i], out); + Compression::string_write(array_tags[i], out); } // tag_index Compression::multibyte_write(tag_index.size(), out); - for(map::iterator it = tag_index.begin(), limit = tag_index.end(); - it != limit; it++) - { - Compression::wstring_write(it->first, out); - Compression::multibyte_write(it->second, out); + for(auto& it : tag_index) { + Compression::string_write(it.first, out); + Compression::multibyte_write(it.second, out); } // enforce_rules @@ -270,7 +268,7 @@ TaggerDataLSW::write(FILE *out) Compression::multibyte_write(prefer_rules.size(), out); for(unsigned int i = 0, limit = prefer_rules.size(); i != limit; i++) { - Compression::wstring_write(prefer_rules[i], out); + Compression::string_write(prefer_rules[i], out); } // constants @@ -317,7 +315,7 @@ TaggerDataLSW::write(FILE *out) Compression::multibyte_write(discard.size(), out); for(unsigned int i = 0, limit = discard.size(); i != limit; i++) { - Compression::wstring_write(discard[i], out); + Compression::string_write(discard[i], out); } } } diff --git a/apertium/tagger_data_percep_coarse_tags.cc b/apertium/tagger_data_percep_coarse_tags.cc index aa3080d..89ce084 100644 --- a/apertium/tagger_data_percep_coarse_tags.cc +++ b/apertium/tagger_data_percep_coarse_tags.cc @@ -27,8 +27,8 @@ TaggerDataPercepCoarseTags::~TaggerDataPercepCoarseTags() {} void TaggerDataPercepCoarseTags::serialise(std::ostream &serialised) const { Serialiser >::serialise(open_class, serialised); - Serialiser >::serialise(array_tags, serialised); - Serialiser >::serialise(tag_index, serialised); + Serialiser >::serialise(array_tags, serialised); + Serialiser >::serialise(tag_index, serialised); constants.serialise(serialised); output.serialise(serialised); plist.serialise(serialised); @@ -37,14 +37,14 @@ void TaggerDataPercepCoarseTags::serialise(std::ostream &serialised) const void TaggerDataPercepCoarseTags::deserialise(std::istream &serialised) { open_class = Deserialiser >::deserialise(serialised); - array_tags = Deserialiser >::deserialise(serialised); - tag_index = Deserialiser >::deserialise(serialised); + array_tags = Deserialiser >::deserialise(serialised); + tag_index = Deserialiser >::deserialise(serialised); constants.deserialise(serialised); output.deserialise(serialised); plist.deserialise(serialised); } -const wstring& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd) const +const UString& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd) const { // Init fine -> coarse tags matching machinary MatchState ms; @@ -52,16 +52,20 @@ const wstring& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd const Alphabet alphabet = plist.getAlphabet(); int ca_any_char = alphabet(PatternList::ANY_CHAR); int ca_any_tag = alphabet(PatternList::ANY_TAG); - map::const_iterator undef_it = tag_index.find(L"TAG_kUNDEF"); + map::const_iterator undef_it = tag_index.find("TAG_kUNDEF"_u); int ca_tag_kundef = undef_it->second; // Input lemma ms.init(me->getInitial()); for (size_t i = 0; i < wrd.TheLemma.size(); i++) { - ms.step(std::towlower(wrd.TheLemma[i]), ca_any_char); + ms.step(u_tolower(wrd.TheLemma[i]), ca_any_char); } // Input fine tags for (size_t i = 0; i < wrd.TheTags.size(); i++) { - int symbol = alphabet(L"<" + wrd.TheTags[i].TheTag + L">"); + UString tag; + tag += '<'; + tag.append(wrd.TheTags[i].TheTag); + tag += '>'; + int symbol = alphabet(tag); if (symbol) { ms.step(symbol, ca_any_tag); } diff --git a/apertium/tagger_data_percep_coarse_tags.h b/apertium/tagger_data_percep_coarse_tags.h index 08317bf..6a44b0a 100644 --- a/apertium/tagger_data_percep_coarse_tags.h +++ b/apertium/tagger_data_percep_coarse_tags.h @@ -14,7 +14,7 @@ public: virtual ~TaggerDataPercepCoarseTags(); void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); - const wstring& coarsen(const Apertium::Morpheme &wrd) const; + const UString& coarsen(const Apertium::Morpheme &wrd) const; }; #endif diff --git a/apertium/tagger_utils.cc b/apertium/tagger_utils.cc index 9892866..3da0415 100644 --- a/apertium/tagger_utils.cc +++ b/apertium/tagger_utils.cc @@ -22,30 +22,16 @@ #include #include #include -#include -#ifdef _MSC_VER -#define wcstok wcstok_s -#endif -#ifdef __MINGW32__ - -wchar_t *_wcstok(wchar_t *wcs, const wchar_t *delim, wchar_t **ptr) { - (void)ptr; - return wcstok(wcs, delim); -} - -#define wcstok _wcstok -#endif +#include -using namespace Apertium; - -void tagger_utils::fatal_error (wstring const &s) { - wcerr< v[], int l) { v[i].clear(); } -int tagger_utils::ntokens_multiword(wstring const &s) +int tagger_utils::ntokens_multiword(UString const &s) { - wchar_t *news = new wchar_t[s.size()+1]; - wcscpy(news, s.c_str()); - news[s.size()] = 0; - wcerr << news << endl; - - wchar_t const *delim = L"_"; - wchar_t *ptr; - int n=0; - - if (wcstok(news, delim, &ptr)) - n++; - while (wcstok(NULL, delim, &ptr)) - n++; - - delete[] news; - - return n; + vector tmp = StringUtils::split(s, "_"_u); + int n = 0; + for (auto& it : tmp) { + if (!it.empty()) { + n++; + } + } + return n; } -int tagger_utils::nguiones_fs(wstring const & s) { - wchar_t *news = new wchar_t[s.size()+1]; - wcscpy(news, s.c_str()); - news[s.size()] = 0; - wcerr << news << endl; - wchar_t const *delim = L"-"; - wchar_t *ptr; - int n=0; - - if (wcstok(news, delim, &ptr)) - n++; - while (wcstok(NULL, delim, &ptr)) - n++; - - delete[] news; - - return n; +int tagger_utils::nguiones_fs(UString const & s) { + vector tmp = StringUtils::split(s, "-"_u); + int n = 0; + for (auto& it : tmp) { + if (!it.empty()) { + n++; + } + } + return n; } -wstring tagger_utils::trim(wstring s) +UString tagger_utils::trim(UString s) { - if (s.length()==0) - return L""; + if (s.empty()) { + return ""_u; + } for (unsigned int i=0; i<(s.length()-1); i++) { - if ((s.at(i)==L' ')&&(s.at(i+1)==L' ')) { + if ((s.at(i)==' ')&&(s.at(i+1)==' ')) { s.erase(i,1); i--; } } - if ((s.length()>0)&&(s.at(s.length()-1)==L' ')) + if ((s.length()>0)&&(s.at(s.length()-1)==' ')) s.erase(s.length()-1,1); - if ((s.length()>0)&&(s.at(0)==L' ')) + if ((s.length()>0)&&(s.at(0)==' ')) s.erase(0,1); return s; } -void tagger_utils::scan_for_ambg_classes(FILE *fdic, TaggerData &td) { +void tagger_utils::scan_for_ambg_classes(const char* fdic, TaggerData &td) { Collection &output = td.getOutput(); FileMorphoStream morpho_stream(fdic, true, &td); tagger_utils::scan_for_ambg_classes(output, morpho_stream); @@ -142,7 +112,7 @@ void tagger_utils::scan_for_ambg_classes(Collection &output, MorphoStream &morph while (word) { if (++nw % 10000 == 0) - wcerr << L'.' << flush; + cerr << '.' << flush; tags = word->get_tags(); @@ -152,7 +122,7 @@ void tagger_utils::scan_for_ambg_classes(Collection &output, MorphoStream &morph delete word; word = morpho_stream.get_next_word(); } - wcerr << L"\n"; + cerr << "\n"; } void @@ -179,7 +149,6 @@ set & tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { set &ret = td.getOpenClass(); Collection &output = td.getOutput(); - int ret_idx = output[ret]; for (int k=0; k &ambg_class = output[k]; @@ -188,7 +157,6 @@ tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { continue; } if (includes(ambg_class.begin(), ambg_class.end(), c.begin(), c.end())) { - ret_idx = k; ret = ambg_class; } } @@ -198,27 +166,30 @@ tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { void tagger_utils::require_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, int nw) { if (td.getOutput().has_not(tags)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors+= L"Word '" + word.get_superficial_form() + L"' not found in the dictionary.\n"; - errors+= L"New ambiguity class: " + word.get_string_tags() + L"\n"; + UString errors; + errors = "A new ambiguity class was found. I cannot continue.\nWord '"_u; + errors += word.get_superficial_form(); + errors += "' not found in the dictionary.\n"_u; + errors += "New ambiguity class: "_u; + errors += word.get_string_tags(); + errors += '\n'; if (nw >= 0) { - std::wostringstream ws; + std::ostringstream ws; ws << (nw + 1); - errors+= L"Line number: " + ws.str() + L"\n"; + errors += "Line number: "_u; + errors += to_ustring(ws.str().c_str()); + errors += '\n'; } - errors+= L"Take a look at the dictionary, then retrain."; + errors += "Take a look at the dictionary, then retrain."_u; fatal_error(errors); } } static void _warn_absent_ambiguity_class(TaggerWord &word) { - wstring errors; - errors = L"A new ambiguity class was found. \n"; - errors += L"Retraining the tagger is necessary so as to take it into account.\n"; - errors += L"Word '" + word.get_superficial_form() + L"'.\n"; - errors += L"New ambiguity class: " + word.get_string_tags() + L"\n"; - wcerr << L"Error: " << errors; + cerr << "Error: A new ambiguity class was found. \n"; + cerr << "Retraining the tagger is necessary so as to take it into account.\n"; + cerr << "Word '" << word.get_superficial_form() << "'.\n"; + cerr << "New ambiguity class: " << word.get_string_tags() << "\n"; } set & @@ -265,7 +236,7 @@ istream& operator>> (istream& is, map & f) { is>>i; // warning: does not work if both is>>f[i]; // lines merged in a single one } - if (is.bad()) tagger_utils::fatal_error(L"reading map"); + if (is.bad()) tagger_utils::fatal_error("reading map"_u); return is; } @@ -280,4 +251,3 @@ ostream& operator<< (ostream& os, const set& s) { os<<'}'; return os; } - diff --git a/apertium/tagger_utils.h b/apertium/tagger_utils.h index f895735..31b6458 100644 --- a/apertium/tagger_utils.h +++ b/apertium/tagger_utils.h @@ -36,7 +36,7 @@ namespace tagger_utils /** Print a fatal error message * @param s the error message to print */ -void fatal_error (wstring const &s); +void fatal_error (UString const &s); /** Print a fatal error message related to a file * @param s the file name to be printted in the error message @@ -63,18 +63,18 @@ void clear_array_vector(vector v[], int l); /** Return the number of tokens in the multiword unit */ - int ntokens_multiword(wstring const &s); + int ntokens_multiword(UString const &s); /** Devuelve el nº de guiones que contiene la cadena pasada como argumento */ -int nguiones_fs(wstring const &cadena); +int nguiones_fs(UString const &cadena); /** Reads the expanded dictionary received as a parameter puts the resulting * ambiguity classes that the tagger will manage. * @param fdic the input stream with the expanded dictionary to read * @param td the tagger data instance to mutate */ -void scan_for_ambg_classes(FILE *fdic, TaggerData &td); +void scan_for_ambg_classes(const char* fdic, TaggerData &td); void scan_for_ambg_classes(Collection &output, MorphoStream &morpho_stream); void add_neccesary_ambg_classes(TaggerData &td); @@ -105,7 +105,7 @@ set & require_similar_ambiguity_class(TaggerData &td, set &tags); /** Just prints a warning if warn */ void warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool warn); -wstring trim(wstring s); +UString trim(UString s); }; diff --git a/apertium/tagger_word.cc b/apertium/tagger_word.cc index e7982ec..ff44de4 100644 --- a/apertium/tagger_word.cc +++ b/apertium/tagger_word.cc @@ -15,25 +15,21 @@ * along with this program; if not, see . */ #include -#include -#include +#include #include "apertium_config.h" #include -using namespace Apertium; - bool TaggerWord::generate_marks=false; -vector TaggerWord::array_tags; +vector TaggerWord::array_tags; bool TaggerWord::show_ignored_string=true; -map TaggerWord::patterns; +map TaggerWord::patterns; TaggerWord::TaggerWord(bool prev_plus_cut) : show_sf(false) { - ignored_string = L""; plus_cut=false; previous_plus_cut=prev_plus_cut; } @@ -62,46 +58,45 @@ TaggerWord::get_show_sf(){ } void -TaggerWord::set_superficial_form(const wstring &sf){ +TaggerWord::set_superficial_form(const UString &sf){ superficial_form = sf; } -wstring& +UString& TaggerWord::get_superficial_form() { return superficial_form; } bool -TaggerWord::match(wstring const &s, wstring const &pattern) +TaggerWord::match(UString const &s, UString const &pattern) { - map::iterator it = patterns.find(pattern); - string const utfs = UtfConverter::toUtf8(s); + map::iterator it = patterns.find(pattern); if(it == patterns.end()) { - string utfpattern = UtfConverter::toUtf8(pattern); - string regexp = ""; + UString utfpattern = pattern; + UString regexp; while(true) { - size_t pos = utfpattern.find("<*>"); - if(pos == string::npos) + size_t pos = utfpattern.find("<*>"_u); + if(pos == UString::npos) { break; } - utfpattern.replace(pos, 3, "(<[^>]+>)+"); + utfpattern.replace(pos, 3, "(<[^>]+>)+"_u); } patterns[pattern].compile(utfpattern); - return patterns[pattern].match(utfs) != ""; + return !patterns[pattern].match(s).empty(); } else { - return it->second.match(utfs) != ""; + return !it->second.match(s).empty(); } } void -TaggerWord::add_tag(TTag &t, const wstring &lf, vector const &prefer_rules){ +TaggerWord::add_tag(TTag &t, const UString &lf, vector const &prefer_rules){ //Tag is added only is it is not present yet //Sometime one word can have more than one lexical form assigned to the same tag @@ -132,25 +127,25 @@ TaggerWord::isAmbiguous() const return tags.size() > 1; } -wstring +UString TaggerWord::get_string_tags() { - wstring st; + UString st; set::iterator itag = tags.begin(); - st=L"{"; + st += '{'; for(itag=tags.begin(); itag!=tags.end(); itag++) { if (itag!=tags.begin()) - st+=L','; + st+=','; st+=array_tags[*itag]; } - st += L'}'; + st += '}'; return st; } -wstring +UString TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { - wstring ret= L""; + UString ret; if (show_ignored_string) ret.append(ignored_string); @@ -158,30 +153,27 @@ TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { if(t==TAG_kEOF) return ret; - if (!previous_plus_cut){ - if(TaggerWord::generate_marks && isAmbiguous()) - { - ret.append(L"^="); - } - else - { - ret += L'^'; + if (!previous_plus_cut) { + if(TaggerWord::generate_marks && isAmbiguous()) { + ret.append("^="_u); + } else { + ret += '^'; } - if(get_show_sf()){ // append the superficial form + if(get_show_sf()) { // append the superficial form ret.append(superficial_form); - ret+=L'/'; + ret += '/'; } } if (lexical_forms.size()==0) { // This is an UNKNOWN WORD - ret +=L'*'; + ret += '*'; ret.append(superficial_form); - } else if ((*lexical_forms.begin()).second[0]==L'*') { //This is an + } else if ((*lexical_forms.begin()).second[0]=='*') { //This is an //unknown word //that has //been guessed - ret += L'*'; + ret += '*'; ret.append(superficial_form); } else if (lexical_forms.size()>1) { //This is an ambiguous word ret.append(lexical_forms[t]); @@ -191,9 +183,9 @@ TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { if (ret != ignored_string) { if (plus_cut) - ret+=L'+'; + ret += '+'; else { - ret += L'$'; + ret += '$'; } } @@ -207,52 +199,49 @@ TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { return ret; } -wstring +UString TaggerWord::get_all_chosen_tag_first(TTag &t, int const TAG_kEOF) { - wstring ret=L""; + UString ret; - if (show_ignored_string) + if (show_ignored_string) { ret.append(ignored_string); + } - if(t==TAG_kEOF) + if(t==TAG_kEOF) { return ret; + } - if (!previous_plus_cut) - { - if(TaggerWord::generate_marks && isAmbiguous()) - { - ret.append(L"^="); - } - else - { - ret += L'^'; + if (!previous_plus_cut) { + if(TaggerWord::generate_marks && isAmbiguous()) { + ret.append("^="_u); + } else { + ret += '^'; } } ret.append(superficial_form); if (lexical_forms.size()==0) { // This is an UNKNOWN WORD - ret+=L"/*"; + ret += "/*"_u; ret.append(superficial_form); } else { - ret+=L"/"; + ret+="/"_u; ret.append(lexical_forms[t]); if (lexical_forms.size()>1) { - set::iterator it; - for (it=tags.begin(); it!=tags.end(); it++) { - if (*it != t) { - ret+=L"/"; - ret.append(lexical_forms[*it]); - } + for (auto& it : tags) { + if (it != t) { + ret += '/'; + ret.append(lexical_forms[it]); + } } } } if (ret != ignored_string) { - if (plus_cut) - ret+=L"+"; - else { - ret+=L"$"; + if (plus_cut) { + ret += '+'; + } else { + ret += '$'; } } @@ -260,29 +249,30 @@ TaggerWord::get_all_chosen_tag_first(TTag &t, int const TAG_kEOF) { } //OBSOLETE -wstring +UString TaggerWord::get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF) { - wstring ret; + UString ret; - if(t==TAG_kEOF) + if(t==TAG_kEOF) { return ret; + } if (lexical_forms.size()==0) { //This is an unknown word - ret.append(L"*^"); - ret.append(superficial_form); + ret.append("*^"_u); + ret.append(superficial_form); } else if ((*lexical_forms.begin()).second[0]=='*') { //This is an unknown word that has been guessed - ret.append(L"*^"); + ret.append("*^"_u); ret.append(superficial_form); } else { - ret += L'^'; + ret += '^'; ret.append(lexical_forms[t]); } if (ret.length() != 0) { if (plus_cut) - ret+=L'+'; + ret += '+'; else { - ret +=L'$'; + ret += '$'; } } @@ -290,7 +280,7 @@ TaggerWord::get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF) } void -TaggerWord::add_ignored_string(wstring const &s) { +TaggerWord::add_ignored_string(UString const &s) { ignored_string.append(s); } @@ -304,14 +294,14 @@ TaggerWord::get_plus_cut() { return plus_cut; } -wostream& -operator<< (wostream& os, TaggerWord &w) { - os< const &at) +TaggerWord::setArrayTags(vector const &at) { array_tags = at; } @@ -319,42 +309,37 @@ TaggerWord::setArrayTags(vector const &at) void TaggerWord::print() { - wcout << L"[#" << superficial_form << L"# "; + cout << "[#" << superficial_form << "# "; for(set::iterator it=tags.begin(), limit = tags.end(); it != limit; it++) { - wcout << L"(" << *it << L" " << lexical_forms[*it] << L") "; + cout << "(" << *it << " " << lexical_forms[*it] << ") "; } - wcout << L"\b]\n"; + cout << "\b]\n"; } void -TaggerWord::outputOriginal(FILE *output) { +TaggerWord::outputOriginal(UFILE *output) { - wstring s=superficial_form; + UString s=superficial_form; - map::iterator it; - for(it=lexical_forms.begin(); it!=lexical_forms.end(); it++) { - if (it->second.length()>0) - { - s+=L'/'; - s.append(it->second); + for (auto& it : lexical_forms) { + if (!it.second.empty()) { + s += '/'; + s.append(it.second); } } - if (s.length()>0) - { - s=L"^"+s+L"$\n"; + if (!s.empty()) { + u_fprintf(output, "^%S$\n", s.c_str()); } - - fputws_unlocked(s.c_str(), output); } void -TaggerWord::discardOnAmbiguity(wstring const &tags) +TaggerWord::discardOnAmbiguity(UString const &tags) { if(isAmbiguous()) { - map::iterator it = lexical_forms.begin(), + map::iterator it = lexical_forms.begin(), limit = lexical_forms.end(); set newsettag; while(it != limit) diff --git a/apertium/tagger_word.h b/apertium/tagger_word.h index 601481e..560500a 100644 --- a/apertium/tagger_word.h +++ b/apertium/tagger_word.h @@ -23,7 +23,6 @@ #include #include -#include #include #include @@ -36,12 +35,12 @@ using namespace std; */ class TaggerWord{ private: - wstring superficial_form; + UString superficial_form; set tags; //Set of all possible tags - map lexical_forms; //For a given coarse tag it stores the fine tag + map lexical_forms; //For a given coarse tag it stores the fine tag //delevered by the morphological analyzer - wstring ignored_string; + UString ignored_string; bool plus_cut; //Flag to distinguish the way in which the word was ended. //If it was done by '$' its value should be false @@ -50,12 +49,12 @@ private: //previous word was ended. It has the same //plus_cut meaning bool show_sf; // Show the superficial form in the output - static map patterns; + static map patterns; - bool match(wstring const &s, wstring const &pattern); + bool match(UString const &s, UString const &pattern); public: static bool generate_marks; - static vector array_tags; + static vector array_tags; static bool show_ignored_string; @@ -77,47 +76,47 @@ public: /** Set the superficial form of the word. * @param s the superficial form */ - void set_superficial_form(const wstring &s); + void set_superficial_form(const UString &s); /** Get the superficial form of the word * */ - wstring& get_superficial_form(); + UString& get_superficial_form(); /** Add a new tag to the set of all possible tags of the word. * @param t the coarse tag * @param lf the lexical form (fine tag) */ - virtual void add_tag(TTag &t, const wstring &lf, vector const &prefer_rules); + virtual void add_tag(TTag &t, const UString &lf, vector const &prefer_rules); /** Get the set of tags of this word. * @return set of tags. */ virtual set& get_tags(); - /** Get a wstring with the set of tags + /** Get a UString with the set of tags */ - virtual wstring get_string_tags(); + virtual UString get_string_tags(); /** Get the lexical form (fine tag) for a given tag (coarse one) * @param t the tag * @return the lexical form of tag t */ - virtual wstring get_lexical_form(TTag &t, int const TAG_kEOF); + virtual UString get_lexical_form(TTag &t, int const TAG_kEOF); - wstring get_all_chosen_tag_first(TTag &t, int const TAG_kEOF); + UString get_all_chosen_tag_first(TTag &t, int const TAG_kEOF); /** Get the lexical form (fine tag) for a given tag (coarse one) * @param t the tag * @return the lexical form of tag t without other text that * is ignored. */ - wstring get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF); + UString get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF); /** Add text to the ignored string * */ - void add_ignored_string(wstring const &s); + void add_ignored_string(UString const &s); /** Set the flag plus_cut to a certain value. If this flag is set to true means * that there were a '+' between this word and the next one @@ -135,18 +134,18 @@ public: /** Output operator */ - friend wostream& operator<< (wostream& os, TaggerWord &w); + friend ostream& operator<< (ostream& os, TaggerWord &w); - static void setArrayTags(vector const &at); + static void setArrayTags(vector const &at); void print(); - void outputOriginal(FILE *output); + void outputOriginal(UFILE *output); bool isAmbiguous() const; // CAUTION: unknown words are not considered to // be ambiguous by this method - void discardOnAmbiguity(wstring const &tags); + void discardOnAmbiguity(UString const &tags); }; #endif diff --git a/apertium/tmx_aligner_tool.cc b/apertium/tmx_aligner_tool.cc index c595b6e..9dc7e7b 100644 --- a/apertium/tmx_aligner_tool.cc +++ b/apertium/tmx_aligner_tool.cc @@ -10,7 +10,7 @@ * * *************************************************************************/ #include -#include +#include namespace TMXAligner { @@ -28,7 +28,7 @@ void readTrailOrBisentenceList( std::istream& is, Trail& trail ) is >> huPos; if (is.peek()!=' ') { - std::wcerr << "no space in line" << std::endl; + std::cerr << "no space in line" << std::endl; throw "data error"; } is.ignore(); @@ -36,7 +36,7 @@ void readTrailOrBisentenceList( std::istream& is, Trail& trail ) is >> enPos; if (is.peek()!='\n') { - std::wcerr << "too much data in line" << std::endl; + std::cerr << "too much data in line" << std::endl; throw "data error"; } is.ignore(); @@ -99,7 +99,7 @@ void collectBisentences( const Trail& bestTrail, const AlignMatrix& dynMatrix, enBisentences.push_back( enSentenceListPretty[ bisentenceList[i].second ] ); } -// std::wcerr << huBisentences.size() << " bisentences collected." << std::endl; +// std::cerr << huBisentences.size() << " bisentences collected." << std::endl; } @@ -152,11 +152,11 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, setSentenceValues( enSentenceList, enLength, alignParameters.utfCharCountingMode ); bool quasiglobal_stopwordRemoval = false; -// std::wcerr << "quasiglobal_stopwordRemoval is set to " << quasiglobal_stopwordRemoval << std::endl; +// std::cerr << "quasiglobal_stopwordRemoval is set to " << quasiglobal_stopwordRemoval << std::endl; if (quasiglobal_stopwordRemoval) { removeStopwords( huSentenceListPretty, enSentenceList ); -// std::wcerr << "Stopwords removed." << std::endl; +// std::cerr << "Stopwords removed." << std::endl; } SentenceList huSentenceListGarbled, enSentenceListGarbled; @@ -186,9 +186,9 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, if (thickness>maximalThickness) { -// std::wcerr << "WARNING: Downgrading planned thickness " << thickness << " to " << maximalThickness ; -// std::wcerr << " to obey memory constraint of " << quasiglobal_maximalSizeInMegabytes << " megabytes " << std::endl; -// std::wcerr << "You should recompile if you have much more physical RAM than that. People of the near-future, forgive me for the inconvenience." << std::endl; +// std::cerr << "WARNING: Downgrading planned thickness " << thickness << " to " << maximalThickness ; +// std::cerr << " to obey memory constraint of " << quasiglobal_maximalSizeInMegabytes << " megabytes " << std::endl; +// std::cerr << "You should recompile if you have much more physical RAM than that. People of the near-future, forgive me for the inconvenience." << std::endl; thickness = maximalThickness; } @@ -196,20 +196,20 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, AlignMatrix similarityMatrix( huBookSize, enBookSize, thickness, outsideOfRadiusValue ); sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix ); -// std::wcerr << std::endl; -// std::wcerr << "Rough translation-based similarity matrix ready." << std::endl; +// std::cerr << std::endl; +// std::cerr << "Rough translation-based similarity matrix ready." << std::endl; Trail bestTrail; AlignMatrix dynMatrix( huBookSize+1, enBookSize+1, thickness, 1e30 ); align( similarityMatrix, huLength, enLength, bestTrail, dynMatrix ); -// std::wcerr << "Align ready." << std::endl; +// std::cerr << "Align ready." << std::endl; double globalQuality; globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); - // std::wcerr << "Global quality of unfiltered align " << globalQuality << std::endl; + // std::cerr << "Global quality of unfiltered align " << globalQuality << std::endl; if (alignParameters.realignType==AlignParameters::NoRealign) { @@ -222,11 +222,11 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, if (!success) { -// std::wcerr << "Realign zone too close to quasidiagonal border. Abandoning realign. The align itself is suspicious." << std::endl; +// std::cerr << "Realign zone too close to quasidiagonal border. Abandoning realign. The align itself is suspicious." << std::endl; } else { -// std::wcerr << "Border of realign zone determined." << std::endl; +// std::cerr << "Border of realign zone determined." << std::endl; switch (alignParameters.realignType) { @@ -237,24 +237,24 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, SentenceList huBisentences,enBisentences; throw "unimplemented"; -// std::wcerr << "Plausible bisentences filtered." << std::endl; +// std::cerr << "Plausible bisentences filtered." << std::endl; modelOne.build(huBisentences,enBisentences); -// std::wcerr << "IBM Model I ready." << std::endl; +// std::cerr << "IBM Model I ready." << std::endl; sentenceListsToAlignMatrixIBMModelOne( huSentenceListPretty, enSentenceList, modelOne, similarityMatrixDetailed ); -// std::wcerr << "IBM Model I based similarity matrix ready." << std::endl; +// std::cerr << "IBM Model I based similarity matrix ready." << std::endl; break; } case AlignParameters::FineTranslationRealign: { TransLex transLex; transLex.build(dictionary); -// std::wcerr << "Hashtable for dictionary ready." << std::endl; +// std::cerr << "Hashtable for dictionary ready." << std::endl; sentenceListsToAlignMatrixTranslation( huSentenceListPretty, enSentenceList, transLex, similarityMatrixDetailed ); -// std::wcerr << "Fine translation-based similarity matrix ready." << std::endl; +// std::cerr << "Fine translation-based similarity matrix ready." << std::endl; break; } @@ -268,7 +268,7 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, Trail bestTrailDetailed; AlignMatrix dynMatrixDetailed( huBookSize+1, enBookSize+1, thickness, 1e30 ); align( similarityMatrixDetailed, huLength, enLength, bestTrailDetailed, dynMatrixDetailed ); -// std::wcerr << "Detail realign ready." << std::endl; +// std::cerr << "Detail realign ready." << std::endl; bestTrail = bestTrailDetailed; dynMatrix = dynMatrixDetailed; @@ -276,7 +276,7 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); - // std::wcerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; + // std::cerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; } } @@ -285,27 +285,27 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, if ( alignParameters.postprocessTrailQualityThreshold != -1 ) { postprocessTrail( bestTrail, trailScoresInterval, alignParameters.postprocessTrailQualityThreshold ); -// std::wcerr << "Trail start and end postprocessed by score." << std::endl; +// std::cerr << "Trail start and end postprocessed by score." << std::endl; } if ( alignParameters.postprocessTrailStartAndEndQualityThreshold != -1 ) { postprocessTrailStartAndEnd( bestTrail, trailScoresInterval, alignParameters.postprocessTrailStartAndEndQualityThreshold ); -// std::wcerr << "Trail start and end postprocessed by score." << std::endl; +// std::cerr << "Trail start and end postprocessed by score." << std::endl; } if ( alignParameters.postprocessTrailByTopologyQualityThreshold != -1 ) { postprocessTrailByTopology( bestTrail, alignParameters.postprocessTrailByTopologyQualityThreshold ); -// std::wcerr << "Trail postprocessed by topology." << std::endl; +// std::cerr << "Trail postprocessed by topology." << std::endl; } bool quasiglobal_spaceOutBySentenceLength = true; -// std::wcerr << "quasiglobal_spaceOutBySentenceLength is set to " << quasiglobal_spaceOutBySentenceLength << std::endl; +// std::cerr << "quasiglobal_spaceOutBySentenceLength is set to " << quasiglobal_spaceOutBySentenceLength << std::endl; if (quasiglobal_spaceOutBySentenceLength) { spaceOutBySentenceLength( bestTrail, huSentenceListPretty, enSentenceList, alignParameters.utfCharCountingMode ); -// std::wcerr << "Trail spaced out by sentence length." << std::endl; +// std::cerr << "Trail spaced out by sentence length." << std::endl; } // In cautious mode, auto-aligned rundles are thrown away if @@ -313,13 +313,13 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, if (alignParameters.cautiousMode) { cautiouslyFilterTrail( bestTrail ); -// std::wcerr << "Trail filtered by topology." << std::endl; +// std::cerr << "Trail filtered by topology." << std::endl; } globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); - // std::wcerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; + // std::cerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; bool textual = ! alignParameters.justSentenceIds ; @@ -429,17 +429,17 @@ void alignerToolWithFilenames( const DictionaryItems& dictionary, std::ifstream hus(huFilename.c_str()); SentenceList huSentenceListPretty; huSentenceListPretty.readNoIds( hus ); -// std::wcerr << huSentenceListPretty.size() << " hungarian sentences read." << std::endl; +// std::cerr << huSentenceListPretty.size() << " hungarian sentences read." << std::endl; std::ifstream ens(enFilename.c_str()); SentenceList enSentenceList; enSentenceList.readNoIds( ens ); -// std::wcerr << enSentenceList.size() << " english sentences read." << std::endl; +// std::cerr << enSentenceList.size() << " english sentences read." << std::endl; if ( (enSentenceList. size() < huSentenceListPretty.size()/5) || (huSentenceListPretty.size() < enSentenceList. size()/5) ) { -// std::wcerr << "Sizes differing too much. Ignoring files to avoid a rare loop bug." << std::endl; +// std::cerr << "Sizes differing too much. Ignoring files to avoid a rare loop bug." << std::endl; return; } @@ -448,7 +448,7 @@ void alignerToolWithFilenames( const DictionaryItems& dictionary, /* double globalQuality = */alignerToolWithObjects ( dictionary, huSentenceListPretty, enSentenceList, alignParameters, std::cout ); -// std::wcerr << "Quality " << globalQuality << std::endl ; +// std::cerr << "Quality " << globalQuality << std::endl ; } else @@ -458,7 +458,7 @@ void alignerToolWithFilenames( const DictionaryItems& dictionary, ( dictionary, huSentenceListPretty, enSentenceList, alignParameters, os ); // If you want to collect global quality information in batch mode, grep "^Quality" of stderr must do. -// std::wcerr << "Quality\t" << outputFilename << "\t" << globalQuality << std::endl ; +// std::cerr << "Quality\t" << outputFilename << "\t" << globalQuality << std::endl ; } } @@ -474,7 +474,7 @@ void fillPercentParameter( Arguments& args, const std::string& argName, double& void main_alignerToolUsage() { - std::wcerr << "Usage (either):\n\ + std::cerr << "Usage (either):\n\ alignerTool [ common_arguments ] [ -hand=hand_align_file ] dictionary_file source_text target_text\n\ \n\ or:\n\ @@ -586,8 +586,8 @@ int main_alignerTool(int argC, char* argV[]) if (batchMode && (remains.size()!=2) ) { - std::wcerr << "Batch mode requires exactly two file arguments." << std::endl; - std::wcerr << std::endl; + std::cerr << "Batch mode requires exactly two file arguments." << std::endl; + std::cerr << std::endl; main_alignerToolUsage(); throw "argument error"; @@ -598,7 +598,7 @@ int main_alignerTool(int argC, char* argV[]) { if (batchMode) { - std::wcerr << "-batch and -" << handArgumentname << " are incompatible switches." << std::endl; + std::cerr << "-batch and -" << handArgumentname << " are incompatible switches." << std::endl; throw "argument error"; } else @@ -608,7 +608,7 @@ int main_alignerTool(int argC, char* argV[]) if (alignParameters.handAlignFilename.empty()) { - std::wcerr << "-" << handArgumentname << " switch requires a filename value." << std::endl; + std::cerr << "-" << handArgumentname << " switch requires a filename value." << std::endl; throw "argument error"; } } @@ -619,7 +619,7 @@ int main_alignerTool(int argC, char* argV[]) { if (batchMode) { - std::wcerr << "-batch and -" << autoDictDumpArgumentname << " are incompatible switches." << std::endl; + std::cerr << "-batch and -" << autoDictDumpArgumentname << " are incompatible switches." << std::endl; throw "argument error"; } else @@ -629,7 +629,7 @@ int main_alignerTool(int argC, char* argV[]) if (alignParameters.autoDictionaryDumpFilename.empty()) { - std::wcerr << "-" << autoDictDumpArgumentname << " switch requires a filename value." << std::endl; + std::cerr << "-" << autoDictDumpArgumentname << " switch requires a filename value." << std::endl; throw "argument error"; } } @@ -637,8 +637,8 @@ int main_alignerTool(int argC, char* argV[]) if (!batchMode && (remains.size()!=3) ) { - std::wcerr << "Nonbatch mode requires exactly three file arguments." << std::endl; - std::wcerr << std::endl; + std::cerr << "Nonbatch mode requires exactly three file arguments." << std::endl; + std::cerr << std::endl; main_alignerToolUsage(); throw "argument error"; @@ -650,13 +650,13 @@ int main_alignerTool(int argC, char* argV[]) } catch (...) { - std::wcerr << std::endl; + std::cerr << std::endl; main_alignerToolUsage(); throw "argument error"; } -// std::wcerr << "Reading dictionary..." << std::endl; +// std::cerr << "Reading dictionary..." << std::endl; const char* dicFilename = remains[0] ; DictionaryItems dictionary; std::ifstream dis(dicFilename); @@ -677,7 +677,7 @@ int main_alignerTool(int argC, char* argV[]) if (words.size()!=3) { - std::wcerr << "Batch file has incorrect format." << std::endl; + std::cerr << "Batch file has incorrect format." << std::endl; throw "data error"; } @@ -686,7 +686,7 @@ int main_alignerTool(int argC, char* argV[]) enFilename = words[1]; outFilename = words[2]; -// std::wcerr << "Processing " << outFilename << std::endl; +// std::cerr << "Processing " << outFilename << std::endl; bool failed = false; try { @@ -694,23 +694,23 @@ int main_alignerTool(int argC, char* argV[]) } catch ( const char* errorType ) { - std::wcerr << errorType << std::endl; + std::cerr << errorType << std::endl; failed = true; } catch ( std::exception& e ) { - std::wcerr << "some failed assertion:" << e.what() << std::endl; + std::cerr << "some failed assertion:" << e.what() << std::endl; failed = true; } catch ( ... ) { - std::wcerr << "some unknown failed assertion..." << std::endl; + std::cerr << "some unknown failed assertion..." << std::endl; failed = true; } if (failed) { - std::wcerr << "Align failed for " << outFilename << std::endl; + std::cerr << "Align failed for " << outFilename << std::endl; } } } @@ -725,17 +725,17 @@ int main_alignerTool(int argC, char* argV[]) #ifndef _DEBUG catch ( const char* errorType ) { - std::wcerr << errorType << std::endl; + std::cerr << errorType << std::endl; return -1; } catch ( std::exception& e ) { - std::wcerr << "some failed assertion:" << e.what() << std::endl; + std::cerr << "some failed assertion:" << e.what() << std::endl; return -1; } catch ( ... ) { - std::wcerr << "some unknown failed assertion..." << std::endl; + std::cerr << "some unknown failed assertion..." << std::endl; return -1; } #endif diff --git a/apertium/tmx_alignment.cc b/apertium/tmx_alignment.cc index 8b556fc..97ae617 100644 --- a/apertium/tmx_alignment.cc +++ b/apertium/tmx_alignment.cc @@ -13,7 +13,7 @@ #include // For SentenceList #include // For FrequencyMap -#include +#include #include #include @@ -21,7 +21,7 @@ #include // Copypaste-elve. TODO Elhelyezni. -#define massert(e) if (!(e)) { std::wcerr << #e << " failed" << std::endl; throw "assert"; } +#define massert(e) if (!(e)) { std::cerr << #e << " failed" << std::endl; throw "assert"; } std::ostream& operator<<( std::ostream& os, std::pair p ) { @@ -241,7 +241,7 @@ void trelliToLadder( const TrelliMatrix& trellis, Trail& bestTrail ) bool logging = false; - if (logging) std::wcerr << std::endl; + if (logging) std::cerr << std::endl; bool over = false; bool hopelesslyBadTrail = false; @@ -304,7 +304,7 @@ void trelliToLadder( const TrelliMatrix& trellis, Trail& bestTrail ) if (logging) { - std::wcerr << huPos << " \t" << enPos << std::endl; + std::cerr << huPos << " \t" << enPos << std::endl; } } @@ -314,7 +314,7 @@ void trelliToLadder( const TrelliMatrix& trellis, Trail& bestTrail ) bestTrail.clear(); bestTrail.push_back(std::make_pair(huBookSize,enBookSize)); bestTrail.push_back(std::make_pair(0,0)); - std::wcerr << "Error: hopelessly bad trail." << std::endl; + std::cerr << "Error: hopelessly bad trail." << std::endl; } std::reverse(bestTrail.begin(), bestTrail.end() ); @@ -335,11 +335,11 @@ void align( const AlignMatrix& w, const SentenceValues& huLength, const Sentence buildDynProgMatrix( w, huLength, enLength, v, trellis ); -// std::wcerr << "Matrix built." << std::endl; +// std::cerr << "Matrix built." << std::endl; trelliToLadder( trellis, bestTrail ); -// std::wcerr << "Trail found." << std::endl; +// std::cerr << "Trail found." << std::endl; } @@ -383,10 +383,10 @@ double scoreTrailOrBisentenceList( const Trail& trailAuto, const Trail& trailHan { int score = countIntersectionOfTrails( trailAuto, trailHand ); - std::wcerr << trailAuto.size()-score << " misaligned out of " << trailHand.size() << " correct items, " + std::cerr << trailAuto.size()-score << " misaligned out of " << trailHand.size() << " correct items, " << trailAuto.size() << " bets." << std::endl; - std::wcerr << "Precision: " << 1.0*score/trailAuto.size() + std::cerr << "Precision: " << 1.0*score/trailAuto.size() << ", Recall: " << 1.0*score/trailHand.size() << std::endl; double ratio = 1.0*(trailAuto.size()-score)/trailAuto.size(); @@ -494,7 +494,7 @@ bool borderDetailedAlignMatrix( AlignMatrix& alignMatrix, const Trail& trail, in } } - std::wcerr << numberOfEvaluatedItems << " items inside the border." << std::endl; + std::cerr << numberOfEvaluatedItems << " items inside the border." << std::endl; } return true; diff --git a/apertium/tmx_arguments_parser.cc b/apertium/tmx_arguments_parser.cc index 5498a7d..acd00ec 100644 --- a/apertium/tmx_arguments_parser.cc +++ b/apertium/tmx_arguments_parser.cc @@ -10,7 +10,7 @@ * * *************************************************************************/ #include -#include +#include #include #include @@ -27,7 +27,7 @@ bool Arguments::read( int argc, char **argv ) std::string p = argv[i]; if (p.empty() || p[0]!='-') { - std::wcerr << p << ": unable to parse argument\n"; + std::cerr << p << ": unable to parse argument\n"; throw "argument error"; return false; } @@ -35,7 +35,7 @@ bool Arguments::read( int argc, char **argv ) if (p.empty()) { - std::wcerr << "Empty argument\n"; + std::cerr << "Empty argument\n"; throw "argument error"; return false; } @@ -86,7 +86,7 @@ bool Arguments::read( int argc, char **argv, std::vector& remains ) if (p.empty()) { - std::wcerr << "Empty argument\n"; + std::cerr << "Empty argument\n"; throw "argument error"; return false; } @@ -124,13 +124,13 @@ bool Arguments::getNumericParam( const std::string& name, int& num ) const_iterator it=find(name); if (it==end()) { - // std::wcerr << "Argument -" << name << " missing.\n"; + // std::cerr << "Argument -" << name << " missing.\n"; return false; } if (it->second.kind != AnyData::Int) { - std::wcerr << "Argument -" << name << ": integer expected.\n"; + std::cerr << "Argument -" << name << ": integer expected.\n"; throw "argument error"; } @@ -149,7 +149,7 @@ bool Arguments::getSwitchConst( const ArgName& name, bool& sw ) const } else if (! it->second.dString.empty()) { - std::wcerr << "Argument -" << name << ": value is not allowed.\n"; + std::cerr << "Argument -" << name << ": value is not allowed.\n"; return false; } else @@ -179,7 +179,7 @@ bool Arguments::getSwitchCompact( const ArgName& name ) } else { - std::wcerr << "No value is allowed for argument -" << name << ".\n"; + std::cerr << "No value is allowed for argument -" << name << ".\n"; throw "argument error"; } } @@ -188,16 +188,16 @@ void Arguments::checkEmptyArgs() const { if (!empty()) { - std::wcerr << "Invalid argument: "; + std::cerr << "Invalid argument: "; for ( Arguments::const_iterator it=begin(); it!=end(); ++it ) { - std::wcerr << "-" << it->first; + std::cerr << "-" << it->first; if (!it->second.dString.empty()) - std::wcerr << "=" << it->second.dString; - std::wcerr << " "; + std::cerr << "=" << it->second.dString; + std::cerr << " "; } - std::wcerr << std::endl; + std::cerr << std::endl; throw "argument error"; } diff --git a/apertium/tmx_book_to_matrix.cc b/apertium/tmx_book_to_matrix.cc index fb37b79..c115afb 100644 --- a/apertium/tmx_book_to_matrix.cc +++ b/apertium/tmx_book_to_matrix.cc @@ -185,7 +185,7 @@ void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, con if (!rarelyLogging || (huPos%100==0)) { - // std::wcerr << huPos << " "; + // std::cerr << huPos << " "; } } } @@ -255,7 +255,7 @@ void sentenceListsToAlignMatrixTranslation( if (!rarelyLogging || (huPos%100==0)) { - // std::wcerr << huPos << " (" << numberOfEvaluatedItems << ") "; + // std::cerr << huPos << " (" << numberOfEvaluatedItems << ") "; } } } @@ -301,7 +301,7 @@ void sentenceListsToAlignMatrixIBMModelOne( if (!rarelyLogging || (huPos%100==0)) { - // std::wcerr << huPos << " "; + // std::cerr << huPos << " "; } } } diff --git a/apertium/tmx_builder.cc b/apertium/tmx_builder.cc index 15d84fa..c62204d 100644 --- a/apertium/tmx_builder.cc +++ b/apertium/tmx_builder.cc @@ -15,10 +15,8 @@ * along with this program; if not, see . */ #include -#include -#include +#include #include -#include #include @@ -37,10 +35,9 @@ #include #endif -using namespace Apertium; using namespace std; -TMXBuilder::TMXBuilder(wstring const &l1, wstring const &l2): +TMXBuilder::TMXBuilder(UString const &l1, UString const &l2): low_limit(0) { lang1 = l1; @@ -62,61 +59,34 @@ TMXBuilder::~TMXBuilder() { } -wstring -TMXBuilder::restOfBlank(FILE *input) +UString +TMXBuilder::restOfBlank(InputFile& input) { - wstring result = L"["; - - while(true) - { - wint_t val = fgetwc(input); - if(feof(input)) - { - return L""; - } - switch(val) - { - case L'\\': - result += L'\\'; - val = fgetwc(input); - if(feof(input)) - { - return L""; - } - result += static_cast(val); - break; - - case L']': - result += L']'; - return result; - - default: - result += static_cast(val); - break; - } + UString result = input.readBlock('[', ']'); + if (result[result.size()-1] == ']') { + return result; + } else { + return ""_u; } - - return L""; } -wstring -TMXBuilder::nextBlank(FILE *input) +UString +TMXBuilder::nextBlank(InputFile& input) { - wstring result = L""; + UString result; while(true) { - wint_t val = fgetwc(input); - if(feof(input)) - { - return L""; + UChar32 val = input.get(); + if(input.eof()) { + return ""_u; } switch(val) { - case L'\\': - fgetwc(input); + case '\\': + input.get(); break; - case L'[': + case '[': result = restOfBlank(input); return result; @@ -125,12 +95,12 @@ TMXBuilder::nextBlank(FILE *input) } bool -TMXBuilder::compatible(FILE *f1, FILE *f2, bool lazy) +TMXBuilder::compatible(InputFile& f1, InputFile& f2, bool lazy) { - wstring s1 = nextBlank(f1), s2 = nextBlank(f2); + UString s1 = nextBlank(f1), s2 = nextBlank(f2); if(!lazy) { - while(!feof(f1) && !feof(f2)) + while(!f1.eof() && !f2.eof()) { if(s1 != s2) { @@ -142,7 +112,7 @@ TMXBuilder::compatible(FILE *f1, FILE *f2, bool lazy) } else { - while(!feof(f1) && !feof(f2)) + while(!f1.eof() && !f2.eof()) { if(s1.size() < s2.size()*(1-0.05) || s1.size() > s2.size()*(1+0.05)) { @@ -158,112 +128,87 @@ TMXBuilder::compatible(FILE *f1, FILE *f2, bool lazy) bool TMXBuilder::check(string const &file1, string const &file2, bool lazy) { - FILE *f1 = fopen(file1.c_str(), "rb"); - FILE *f2 = fopen(file2.c_str(), "rb"); - if(!f1 && !f2) + InputFile f1, f2; + bool bf1 = f1.open(file1.c_str()); + bool bf2 = f2.open(file2.c_str()); + if(!bf1 && !bf2) { - wcerr << L"Error: Cannot access files '" << UtfConverter::fromUtf8(file1); - wcerr << L"' and '" << UtfConverter::fromUtf8(file2) << "'" << endl; + cerr << "Error: Cannot access files '" << file1; + cerr << "' and '" << file2 << "'" << endl; return false; } - else if(!f1) + else if(!bf1) { - wcerr << L"Error: Cannot access file '"; - wcerr << UtfConverter::fromUtf8(file2); - wcerr << "'" << endl; - fclose(f2); + cerr << "Error: Cannot access file '" << file1 << "'" << endl; return false; } - else if(!f2) + else if(!bf2) { - wcerr << L"Error: Cannot access file '"; - wcerr << UtfConverter::fromUtf8(file2); - wcerr << "'" << endl; - fclose(f1); + cerr << "Error: Cannot access file '" << file2 << "'" << endl; return false; } bool retval = compatible(f1, f2, lazy); - fclose(f1); - fclose(f2); return retval; } -wstring -TMXBuilder::nextTU(FILE *input) +UString +TMXBuilder::nextTU(InputFile& input) { - wstring current_tu = L""; - wstring tmp; + UString current_tu; + UString tmp; while(true) { - wint_t symbol = fgetwc_unlocked(input); - if(feof(input)) - { - if(current_tu == L"") - { - return L""; - } - else - { - return current_tu; - } + UChar32 symbol = input.get(); + if(input.eof()) { + return current_tu; } switch(symbol) { - case L'\\': - symbol = fgetwc_unlocked(input); - if(feof(input)) - { - if(current_tu == L"") - { - return L""; - } - else - { - return current_tu; - } + case '\\': + symbol = input.get(); + if(input.eof()) { + return current_tu; } // continued down default: - current_tu += static_cast(symbol); + current_tu += symbol; break; - case L'[': + case '[': tmp = restOfBlank(input); - if(tmp.substr(0,2) == L"[ ") + if(tmp.substr(0,2) == "[ "_u) { - current_tu.append(L" "); + current_tu += ' '; } - current_tu.append(L""); - if(tmp.substr(tmp.size()-2, 2) == L" ]") + current_tu.append(""_u); + if(tmp.substr(tmp.size()-2, 2) == " ]"_u) { - current_tu.append(L" "); + current_tu += ' '; } break; - case L'.': - current_tu += L'.'; - symbol = fgetwc_unlocked(input); + case '.': + current_tu += '.'; + symbol = input.get(); - if(symbol != L'[' && !iswspace(symbol)) + if(symbol != '[' && !u_isspace(symbol)) { - if(!feof(input)) - { - ungetwc(symbol, input); + if (!input.eof()) { + input.unget(symbol); } } else { - if(!feof(input)) - { - ungetwc(symbol, input); + if (!input.eof()) { + input.unget(symbol); } return current_tu; /* size_t idx = current_tu.size()-1; - while(current_tu[idx] == L'.') + while(current_tu[idx] == '.') { idx--; } @@ -271,9 +216,9 @@ TMXBuilder::nextTU(FILE *input) } break; - case L'?': - case L'!': - current_tu += static_cast(symbol); + case '?': + case '!': + current_tu += symbol; return current_tu; } } @@ -281,34 +226,34 @@ TMXBuilder::nextTU(FILE *input) return current_tu; } -wstring -TMXBuilder::xmlize(wstring const &str) +UString +TMXBuilder::xmlize(UString const &str) { - wstring result = L""; + UString result; for(size_t i = 0, limit = str.size(); i < limit; i++) { switch(str[i]) { - case L'<': - if(i + 5 <= limit && str.substr(i,5)==L"") + case '<': + if(i + 5 <= limit && str.substr(i,5)==""_u) { - result.append(L""); + result.append(""_u); i += 4; break; } else { - result.append(L"<"); + result.append("<"_u); } break; - case L'>': - result.append(L">"); + case '>': + result.append(">"_u); break; - case L'&': - result.append(L"&"); + case '&': + result.append("&"_u); break; default: @@ -323,12 +268,12 @@ TMXBuilder::xmlize(wstring const &str) while(cambio == true) { cambio = false; - while(result.size() >= 5 && result.substr(0,5) == L"") + while(result.size() >= 5 && result.substr(0,5) == ""_u) { result = result.substr(5); cambio = true; } - while(result.size() > 0 && !iswalnum(result[0]) && !iswpunct(result[0])) + while(result.size() > 0 && !u_isalnum(result[0]) && !u_ispunct(result[0])) { result = result.substr(1); cambio = true; @@ -340,12 +285,12 @@ TMXBuilder::xmlize(wstring const &str) while(cambio == true) { cambio = false; - while(result.size() > 5 && result.substr(result.size()-5) == L"") + while(result.size() > 5 && result.substr(result.size()-5) == ""_u) { result = result.substr(0, result.size()-5); cambio = true; } - while(result.size() > 0 && !iswalnum(result[result.size()-1]) && !iswpunct(result[result.size()-1])) + while(result.size() > 0 && !u_isalnum(result[result.size()-1]) && !u_ispunct(result[result.size()-1])) { result = result.substr(0, result.size()-1); cambio = true; @@ -376,50 +321,32 @@ void TMXBuilder::generate(string const &file1, string const &file2, string const &outfile) { - FILE *output = stdout; + UFILE* output = u_finit(stdout, NULL, NULL); - if(outfile != "") + if(!outfile.empty()) { - output = fopen(outfile.c_str(), "w"); + output = u_fopen(outfile.c_str(), "w", NULL, NULL); if(!output) { - wcerr << L"Error: file '" << UtfConverter::fromUtf8(outfile); - wcerr << L"' cannot be opened for writing" << endl; + cerr << "Error: file '" << outfile; + cerr << "' cannot be opened for writing" << endl; exit(EXIT_FAILURE); } } -#ifdef _MSC_VER - _setmode(_fileno(output), _O_U8TEXT); -#endif - - FILE *f1 = fopen(file1.c_str(), "r"); - if(!f1) - { - wcerr << L"Error: file '" << UtfConverter::fromUtf8(file1); - wcerr << L"' cannot be opened for reading" << endl; - exit(EXIT_FAILURE); - } - FILE *f2 = fopen(file2.c_str(), "r"); - if(!f2) - { - wcerr << L"Error: file '" << UtfConverter::fromUtf8(file2); - wcerr << L"' cannot be opened for reading" << endl; - exit(EXIT_FAILURE); - } + InputFile f1; + f1.open_or_exit(file1.c_str()); -#ifdef _MSC_VER - _setmode(_fileno(f1), _O_U8TEXT); - _setmode(_fileno(f2), _O_U8TEXT); -#endif + InputFile f2; + f2.open_or_exit(file2.c_str()); generateTMX(f1, f2, output); } -vector -TMXBuilder::reverseList(vector const &v) +vector +TMXBuilder::reverseList(vector const &v) { - vector retval(v.size()); + vector retval(v.size()); for(int j = v.size() - 1, i = 0; j >=0; j--, i++) { @@ -429,16 +356,15 @@ TMXBuilder::reverseList(vector const &v) return retval; } -vector -TMXBuilder::sentenceList(FILE *file) +vector +TMXBuilder::sentenceList(InputFile& file) { - vector retval; + vector retval; while(true) { - wstring f = nextTU(file); - if(feof(file)) - { + UString f = nextTU(file); + if(file.eof()) { break; } retval.push_back(f); @@ -447,10 +373,10 @@ TMXBuilder::sentenceList(FILE *file) return retval; } -vector -TMXBuilder::extractFragment(vector const &text, unsigned int base, unsigned int width) +vector +TMXBuilder::extractFragment(vector const &text, unsigned int base, unsigned int width) { - vector result; + vector result; for(unsigned int i = base; i < (base + width) && i < text.size(); i++) { @@ -485,21 +411,21 @@ TMXBuilder::argmin(int nw, int n, int w) } void -TMXBuilder::generateTMX(FILE *f1, FILE *f2, FILE *output) -{ - fprintf(output, "\n"); - fprintf(output, "\n"); - fprintf(output, "
\n"); - fprintf(output, "
\n"); - fprintf(output, "\n"); +TMXBuilder::generateTMX(InputFile& f1, InputFile& f2, UFILE* output) +{ + u_fprintf(output, "\n"); + u_fprintf(output, "\n"); + u_fprintf(output, "
\n"); + u_fprintf(output, "
\n"); + u_fprintf(output, "\n"); outputTU(f1, f2, output); - fprintf(output, "\n
\n"); + u_fprintf(output, "\n
\n"); } @@ -512,17 +438,17 @@ TMXBuilder::printTable(int *table, unsigned int nrows, unsigned int ncols) { if(j != 0) { - wcerr << L" "; + cerr << " "; } - wcerr << setw(10) << table[i*ncols + j]; + cerr << setw(10) << table[i*ncols + j]; } - wcerr << endl; + cerr << endl; } } void -TMXBuilder::printTUCond(FILE *output, wstring const &tu1, wstring const &tu2, bool secure_zone) +TMXBuilder::printTUCond(UFILE *output, UString const &tu1, UString const &tu2, bool secure_zone) { if(secure_zone && similar(tu1, tu2)) { @@ -531,30 +457,26 @@ TMXBuilder::printTUCond(FILE *output, wstring const &tu1, wstring const &tu2, bo } void -TMXBuilder::splitAndMove(FILE *f1, string const &filename) +TMXBuilder::splitAndMove(InputFile& f1, string const &filename) { - FILE *stream = fopen(filename.c_str(), "w"); - vector fichero_por_cadenas = sentenceList(f1); - for(size_t i = 0; i < fichero_por_cadenas.size(); i++) - { - fputws_unlocked(fichero_por_cadenas[i].c_str(), stream); - fputws_unlocked(L"\n", stream); + UFILE* stream = u_fopen(filename.c_str(), "w", NULL, NULL); + vector fichero_por_cadenas = sentenceList(f1); + for (auto& it : fichero_por_cadenas) { + u_fprintf(stream, "%S\n", it.c_str()); } - fclose(stream); + u_fclose(stream); } void -TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) +TMXBuilder::outputTU(InputFile& f1, InputFile& f2, UFILE* output) { string left = tmpnam(NULL); string right = tmpnam(NULL); string out = tmpnam(NULL); splitAndMove(f1, left); - fclose(f1); splitAndMove(f2, right); - fclose(f2); TMXAligner::DictionaryItems dict; AlignParameters ap; @@ -565,29 +487,25 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) TMXAligner::alignerToolWithFilenames(dict, left, right, ap, out); - FILE *stream = fopen(out.c_str(), "r"); + InputFile stream; + stream.open(out.c_str()); int conta = 0; - wstring partes[2]; - while(true) + UString partes[2]; + while(!stream.eof()) { - wchar_t val = fgetwc(stream); - if(feof(stream)) - { - break; - } + UChar32 val = stream.get(); - if(val == L'\t') + if(val == '\t') { conta++; } - else if(val == L'\n') + else if(val == '\n') { - if(partes[0] != L"" && partes[1] != L"") - { + if (!partes[0].empty() && !partes[1].empty()) { printTU(output, partes[0], partes[1]); } - partes[0] = L""; - partes[1] = L""; + partes[0].clear(); + partes[1].clear(); conta = 0; } if(conta < 2) @@ -605,7 +523,7 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) int base_i = 0, base_j = 0; - vector lista1 = reverseList(sentenceList(f1)), + vector lista1 = reverseList(sentenceList(f1)), lista2 = reverseList(sentenceList(f2)), lista3; if(freference != NULL) @@ -615,8 +533,8 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) while(true) { - vector l1 = extractFragment(lista1, base_i, window_size); - vector l2 = extractFragment(lista2, base_j, window_size) , l3; + vector l1 = extractFragment(lista1, base_i, window_size); + vector l2 = extractFragment(lista2, base_j, window_size) , l3; if(lista3.size() != 0) { @@ -696,7 +614,7 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) } } - // wcerr << L"[" << i << L" " << j << L"]" << endl; + // cerr << "[" << i << " " << j << "]" << endl; break; case 3: @@ -755,13 +673,13 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) } int -TMXBuilder::weight(wstring const &s) +TMXBuilder::weight(UString const &s) { return s.size()*2; // just the size of the string } int * -TMXBuilder::levenshteinTable(vector &l1, vector &l2, +TMXBuilder::levenshteinTable(vector &l1, vector &l2, unsigned int diagonal_width, unsigned int max_edit) { unsigned int const nrows = l1.size() + 1; @@ -809,19 +727,19 @@ TMXBuilder::levenshteinTable(vector &l1, vector &l2, return table; } -wstring -TMXBuilder::filter(wstring const &tu) +UString +TMXBuilder::filter(UString const &tu) { bool has_text = false; unsigned int count_blank = 0; for(unsigned int i = 0, limit = tu.size(); i != limit; i++) { - if(iswalpha(tu[i])) + if(u_isalpha(tu[i])) { has_text = true; } - else if(has_text && iswspace(tu[i])) + else if(has_text && u_isspace(tu[i])) { count_blank++; } @@ -829,28 +747,24 @@ TMXBuilder::filter(wstring const &tu) if(!has_text || count_blank <= 2 || tu.size() == 0) { - return L""; + return ""_u; } return xmlize(tu); } void -TMXBuilder::printTU(FILE *output, wstring const &tu1, wstring const &tu2) const +TMXBuilder::printTU(UFILE* output, UString const &tu1, UString const &tu2) const { - wstring tu1_filtered = filter(tu1); - wstring tu2_filtered = filter(tu2); - - if(tu1_filtered != L"" && tu2_filtered != L"") - { + UString tu1_filtered = filter(tu1); + UString tu2_filtered = filter(tu2); - fprintf(output, "\n %s\n", - UtfConverter::toUtf8(lang1).c_str(), - UtfConverter::toUtf8(tu1_filtered).c_str()); + if (tu1_filtered.empty() && !tu2_filtered.empty()) { + u_fprintf(output, "\n %S\n", + lang1.c_str(), tu1_filtered.c_str()); - fprintf(output, " %s\n\n", - UtfConverter::toUtf8(lang2).c_str(), - UtfConverter::toUtf8(tu2_filtered).c_str()); + u_fprintf(output, " %S\n\n", + lang2.c_str(), tu2_filtered.c_str()); } } @@ -892,7 +806,7 @@ TMXBuilder::min2(int i1, int i2) } int -TMXBuilder::editDistance(wstring const &s1, wstring const &s2, unsigned int max_edit) +TMXBuilder::editDistance(UString const &s1, UString const &s2, unsigned int max_edit) { int const nrows = min2(s1.size() + 1, max_edit); int const ncols = min2(s2.size() + 1, max_edit); @@ -974,13 +888,13 @@ TMXBuilder::setEditDistancePercent(double e) } bool -TMXBuilder::isRemovablePunct(wchar_t const &c) +TMXBuilder::isRemovablePunct(UChar32 const &c) { - return c == L'.'; + return c == '.'; } bool -TMXBuilder::similar(wstring const &s1, wstring const &s2) +TMXBuilder::similar(UString const &s1, UString const &s2) { unsigned int l1 = s1.size(); unsigned int l2 = s2.size(); @@ -1012,8 +926,8 @@ TMXBuilder::setTranslation(string const &filename) freference = fopen(filename.c_str(), "r"); if(!freference) { - wcerr << L"Error: file '" << UtfConverter::fromUtf8(filename); - wcerr << L"' cannot be opened for reading" << endl; + cerr << "Error: file '" << filename; + cerr << "' cannot be opened for reading" << endl; freference = NULL; } diff --git a/apertium/tmx_builder.h b/apertium/tmx_builder.h index 7e92d40..2aca2de 100644 --- a/apertium/tmx_builder.h +++ b/apertium/tmx_builder.h @@ -20,14 +20,15 @@ #include #include #include +#include using namespace std; class TMXBuilder { private: - wstring lang1; - wstring lang2; + UString lang1; + UString lang2; unsigned int max_edit; unsigned int diagonal_width; unsigned int window_size; @@ -37,35 +38,35 @@ private: unsigned int low_limit; FILE *freference; - static wstring nextTU(FILE *input); - static wstring restOfBlank(FILE *input); - static wstring nextBlank(FILE *input); - static wstring xmlize(wstring const &str); - static bool compatible(FILE *input, FILE *output, bool lazy = false); - void generateTMX(FILE *f1, FILE *f2, FILE *output); - void outputTU(FILE *f1, FILE *f2, FILE *output); - static vector reverseList(vector const &v); - static vector sentenceList(FILE *file); + static UString nextTU(InputFile& input); + static UString restOfBlank(InputFile& input); + static UString nextBlank(InputFile& input); + static UString xmlize(UString const &str); + static bool compatible(InputFile& input, InputFile& output, bool lazy = false); + void generateTMX(InputFile& f1, InputFile& f2, UFILE* output); + void outputTU(InputFile& f1, InputFile& f2, UFILE* output); + static vector reverseList(vector const &v); + static vector sentenceList(InputFile& file); static int argmin(int nw, int n, int w); - static int * levenshteinTable(vector &l1, vector &l2, + static int * levenshteinTable(vector &l1, vector &l2, unsigned int diagonal_width, unsigned int max_edit); - void printTU(FILE *output, wstring const &tu1, wstring const &tu2) const; - static wstring filter(wstring const &s); - static int weight(wstring const &s); + void printTU(UFILE* output, UString const &tu1, UString const &tu2) const; + static UString filter(UString const &s); + static int weight(UString const &s); static void printTable(int *table, unsigned int nrows, unsigned int ncols); - static int editDistance(wstring const &s1, wstring const &s2, unsigned int max_edit); + static int editDistance(UString const &s1, UString const &s2, unsigned int max_edit); static int min3(int i1, int i2, int i3); static int min2(int i1, int i2); - void printTUCond(FILE *output, wstring const &s1, wstring const &s2, bool secure_zone); - static vector extractFragment(vector const &text, unsigned int base, + void printTUCond(UFILE* output, UString const &s1, UString const &s2, bool secure_zone); + static vector extractFragment(vector const &text, unsigned int base, unsigned int width); - static bool isRemovablePunct(wchar_t const &c); - bool similar(wstring const &s1, wstring const &s2); + static bool isRemovablePunct(UChar32 const &c); + bool similar(UString const &s1, UString const &s2); - void splitAndMove(FILE *file, string const &filename); + void splitAndMove(InputFile& file, string const &filename); public: - TMXBuilder(wstring const &l1, wstring const &l2); + TMXBuilder(UString const &l1, UString const &l2); ~TMXBuilder(); static bool check(string const &file1, string const &file2, bool lazy = false); void generate(string const &file1, string const &file2, diff --git a/apertium/tmx_dic_tree.h b/apertium/tmx_dic_tree.h index 9a0545b..957de53 100644 --- a/apertium/tmx_dic_tree.h +++ b/apertium/tmx_dic_tree.h @@ -106,7 +106,7 @@ DicTree& DicTree::add( const Atom& word, con if ( ( v->id != 0 ) && ( id != 0 ) ) { if (WarnOnConflict) - std::wcerr << "warning: conflict in tree" << std::endl; + std::cerr << "warning: conflict in tree" << std::endl; } if ( id != 0 ) { @@ -165,7 +165,7 @@ void SubsetLookup::add( const Atoms& words, const Identifier& else { if (DicTree::WarnOnConflict) - std::wcerr << "warning: conflict in tree" << std::endl; + std::cerr << "warning: conflict in tree" << std::endl; } } diff --git a/apertium/tmx_dictionary.cc b/apertium/tmx_dictionary.cc index f36c65a..70944ea 100644 --- a/apertium/tmx_dictionary.cc +++ b/apertium/tmx_dictionary.cc @@ -22,7 +22,7 @@ #include -#define massert(e) if (!(e)) { std::wcerr << #e << " failed" << std::endl; throw "assert"; } +#define massert(e) if (!(e)) { std::cerr << #e << " failed" << std::endl; throw "assert"; } namespace TMXAligner { @@ -151,7 +151,7 @@ void readBicorpus( std::istream& is, SentenceList& huSentenceList, SentenceList& split( line, halfs ); if (halfs.size()!=2) { - std::wcerr << "Incorrect bicorpus file: " << halfs.size() << " records in line " << huSentenceList.size() << std::endl; + std::cerr << "Incorrect bicorpus file: " << halfs.size() << " records in line " << huSentenceList.size() << std::endl; throw "data error"; } @@ -565,7 +565,7 @@ void TransLex::build( const DictionaryItems& dictionaryItems ) ++ignored; } } - std::wcerr << added << " items added to TransLex, " << ignored << " multiword items ignored." << std::endl; + std::cerr << added << " items added to TransLex, " << ignored << " multiword items ignored." << std::endl; } TransLex::DictInterval TransLex::lookupLeftWord ( const Word& huWord ) const diff --git a/apertium/tmx_trail_postprocessors.cc b/apertium/tmx_trail_postprocessors.cc index 4bde3ed..d0b7312 100644 --- a/apertium/tmx_trail_postprocessors.cc +++ b/apertium/tmx_trail_postprocessors.cc @@ -280,7 +280,7 @@ void postprocessTrailStart( Trail& bestTrail, { if (global_postprocessLogging) { - std::wcerr << "Thrown away at position " << pos + std::cerr << "Thrown away at position " << pos << ", avarage " << avg << ", threshold " << qualityThreshold << std::endl; } @@ -319,7 +319,7 @@ void postprocessTrailEnd( Trail& bestTrail, { if (global_postprocessLogging) { - std::wcerr << "Thrown away at position " << pos + std::cerr << "Thrown away at position " << pos << ", avarage " << avg << ", threshold " << qualityThreshold << std::endl; } @@ -365,7 +365,7 @@ void postprocessTrail( Trail& bestTrail, const TrailScoresInterval& trailScoresI { if (global_postprocessLogging) { - std::wcerr << "Thrown away at position " << pos + std::cerr << "Thrown away at position " << pos << ", avarage " << avg << ", threshold " << qualityThreshold << std::endl; } @@ -404,7 +404,7 @@ void postprocessTrailByTopology( Trail& bestTrail, double qualityThreshold ) { if (global_postprocessLogging) { - std::wcerr << "Thrown away at position " << pos + std::cerr << "Thrown away at position " << pos << ", avarage " << avg << std::endl; } diff --git a/apertium/tmx_translate.cc b/apertium/tmx_translate.cc index 06db477..dd37da5 100644 --- a/apertium/tmx_translate.cc +++ b/apertium/tmx_translate.cc @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -34,7 +34,7 @@ void buildDumbDictionary( const DictionaryItems& dictionary, DumbDictionary& dum if (hu.size()==1) { dumbDictionary[ hu[0] ] = en ; - // std::wcerr << hu[0] << "\t" << en << std::endl; + // std::cerr << hu[0] << "\t" << en << std::endl; } } } @@ -98,7 +98,7 @@ void buildDumbDictionary( TMXAligner::DumbDictionary& dumbDictionary, { std::ifstream is( dictionaryFilename.c_str() ); dictionary.read( is ); - std::wcerr << dictionary.size() << " dictionary items read." << std::endl; + std::cerr << dictionary.size() << " dictionary items read." << std::endl; } if (!enSentenceList.empty()) @@ -267,7 +267,7 @@ void naiveTranslate( { subsetLookup.add( dictionary[i].second, i+1 ); // !!! i+1 } - std::wcerr << "Index tree built." << std::endl; + std::cerr << "Index tree built." << std::endl; } for ( size_t i=0; i. */ #include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef _WIN32 -#include -#endif - -using namespace Apertium; -using namespace std; - -void -Transfer::destroy() -{ - if(me) - { - delete me; - me = NULL; - } - if(doc) - { - xmlFreeDoc(doc); - doc = NULL; - } -} - -Transfer::Transfer() : -word(0), -lword(0), -last_lword(0), -output(0), -any_char(0), -any_tag(0), -nwords(0) -{ - me = NULL; - doc = NULL; - root_element = NULL; - lastrule = NULL; - defaultAttrs = lu; - useBilingual = true; - preBilingual = false; - isExtended = false; - null_flush = false; - internal_null_flush = false; - trace = false; - trace_att = false; - in_lu = false; - in_let_var = false; - in_out = false; - in_wblank = false; -} - -Transfer::~Transfer() -{ - destroy(); -} - -void -Transfer::readData(FILE *in) -{ - alphabet.read(in); - any_char = alphabet(TRXReader::ANY_CHAR); - any_tag = alphabet(TRXReader::ANY_TAG); - - Transducer t; - t.read(in, alphabet.size()); - - map finals; - - // finals - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - int key = Compression::multibyte_read(in); - finals[key] = Compression::multibyte_read(in); - } - me = new MatchExe(t, finals); +#include +#include - // attr_items - bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); - attr_items[cad_k].read(in); - wstring fallback = Compression::wstring_read(in); - if(recompile_attrs) { - attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); - } - } - - // variables - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); - variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); - } - - // macros - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); - macros[cad_k] = Compression::multibyte_read(in); - } +#include - // lists - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); +using namespace std; - for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) - { - wstring const cad_v = Compression::wstring_read(in); - lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); - listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); - } - } -} +Transfer::Transfer() + : word(nullptr), last_lword(0), in_lu(false), in_wblank(false), + isExtended(false), defaultAttrs(lu), preBilingual(false), + useBilingual(true), trace_att(false) +{} void Transfer::readBil(string const &fstfile) @@ -148,7 +35,7 @@ Transfer::readBil(string const &fstfile) FILE *in = fopen(fstfile.c_str(), "rb"); if(!in) { - wcerr << "Error: Could not open file '" << fstfile << "'." << endl; + cerr << "Error: Could not open file '" << fstfile << "'." << endl; exit(EXIT_FAILURE); } fstp.load(in); @@ -162,7 +49,7 @@ Transfer::setExtendedDictionary(string const &fstfile) FILE *in = fopen(fstfile.c_str(), "rb"); if(!in) { - wcerr << "Error: Could not open extended dictionary file '" << fstfile << "'." << endl; + cerr << "Error: Could not open extended dictionary file '" << fstfile << "'." << endl; exit(EXIT_FAILURE); } extended.load(in); @@ -175,2049 +62,829 @@ void Transfer::read(string const &transferfile, string const &datafile, string const &fstfile) { - readTransfer(transferfile); + TransferBase::read(transferfile.c_str(), datafile.c_str()); + if (getattr(root_element, "default") == "chunk"_u) { + defaultAttrs = chunk; + } else { + defaultAttrs = lu; + } + if (!fstfile.empty()) { + readBil(fstfile); + } +} - // datafile - FILE *in = fopen(datafile.c_str(), "rb"); - if(!in) +bool +Transfer::checkIndex(xmlNode *element, int index, int limit) +{ + if(index >= limit) { - wcerr << "Error: Could not open file '" << datafile << "'." << endl; - exit(EXIT_FAILURE); + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index >= limit" << endl; + return false; } - readData(in); - fclose(in); - - if(fstfile != "") + if(index < 0) { + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index < 0" << endl; + return false; + } + if(word[index] == 0) { - readBil(fstfile); + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": Null access at word[index]" << endl; + return false; } + return true; } -void -Transfer::readTransfer(string const &in) +UString +Transfer::evalCachedString(xmlNode *element) { - doc = xmlReadFile(in.c_str(), NULL, 0); + TransferInstr& ti = evalStringCache[element]; + switch (ti.getType()) { + case ti_clip_sl: + if (checkIndex(element, ti.getPos(), lword)) { + if (gettingLemmaFromWord(ti.getContent()) && last_lword > 1) { + if(in_lu) { + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); + } else if (in_let_var) { + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); + } + } - if(doc == NULL) - { - wcerr << "Error: Could not parse file '" << in << "'." << endl; - exit(EXIT_FAILURE); - } + return word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()); + } + break; - root_element = xmlDocGetRootElement(doc); + case ti_clip_tl: + if(checkIndex(element, ti.getPos(), lword)) { + if(gettingLemmaFromWord(ti.getContent()) && last_lword > 1) { + if(in_lu) { + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); + } else if(in_let_var) { + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); + } + } - // search for root element attributes - for(xmlAttr *i = root_element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "default")) - { - if(!xmlStrcmp(i->children->content, (const xmlChar *) "chunk")) - { - defaultAttrs = chunk; + return word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()); + } + break; + + case ti_clip_ref: + if(checkIndex(element, ti.getPos(), lword)) { + return word[ti.getPos()]->reference(attr_items[ti.getContent()], ti.getCondition()); + } + break; + + case ti_linkto_sl: + if(checkIndex(element, ti.getPos(), lword)) { + if(!word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()).empty()) { + UString ret; + ret += '<'; + ret += ti.getStrval(); + ret += '>'; + return ret; + } else { + return ""_u; } - else - { - defaultAttrs = lu; // default value for 'default' + } + break; + + case ti_linkto_tl: + if(checkIndex(element, ti.getPos(), lword)) { + if(!word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()).empty()) { + UString ret; + ret += '<'; + ret += ti.getStrval(); + ret += '>'; + return ret; + } else { + return ""_u; } } - } + break; - // search for macros & rules - for(xmlNode *i = root_element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) - { - collectMacros(i); + case ti_linkto_ref: + if(checkIndex(element, ti.getPos(), lword)) { + if(!word[ti.getPos()]->reference(attr_items[ti.getContent()], ti.getCondition()).empty()) { + UString ret; + ret += '<'; + ret += ti.getStrval(); + ret += '>'; + return ret; + } else { + return ""_u; } - else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) - { - collectRules(i); + } + break; + + case ti_var: + if(last_lword > 1) { + out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); + } + return variables[ti.getContent()]; + + case ti_lit_tag: + case ti_lit: + return ti.getContent(); + + case ti_b: + if(!blank_queue.empty()) { + UString retblank = blank_queue.front(); + if(in_out) { + blank_queue.pop(); } + + return retblank; + } else { + return " "_u; + } + break; + + case ti_get_case_from: + if(checkIndex(element, ti.getPos(), lword)) { + return StringUtils::copycase(word[ti.getPos()]->source(attr_items[ti.getContent()]), + evalString((xmlNode *) ti.getPointer())); + } + break; + + case ti_case_of_sl: + if(checkIndex(element, ti.getPos(), lword)) { + return StringUtils::getcase(word[ti.getPos()]->source(attr_items[ti.getContent()])); + } + break; + + case ti_case_of_tl: + if(checkIndex(element, ti.getPos(), lword)) { + return StringUtils::getcase(word[ti.getPos()]->target(attr_items[ti.getContent()])); + } + break; + + case ti_case_of_ref: + if(checkIndex(element, ti.getPos(), lword)) { + return StringUtils::getcase(word[ti.getPos()]->reference(attr_items[ti.getContent()])); } + break; + + default: + return ""_u; } + return ""_u; } void -Transfer::collectRules(xmlNode *localroot) +Transfer::processClip(xmlNode* element) { - for(xmlNode *rule = localroot->children; rule != NULL; rule = rule->next) - { - if(rule->type == XML_ELEMENT_NODE) - { - size_t line = rule->line; - for(xmlNode *rulechild = rule->children; ; rulechild = rulechild->next) - { - if(rulechild->type == XML_ELEMENT_NODE && !xmlStrcmp(rulechild->name, (const xmlChar *) "action")) - { - rule_map.push_back(rulechild); - rule_lines.push_back(line); - break; - } - } - } + int pos = 0; + xmlChar *side = NULL; + UString as; + UString part; + bool queue = true; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) { + side = i->children->content; + } else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { + part = to_ustring((const char*) i->children->content); + } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { + pos = atoi((const char *)i->children->content) - 1; + } else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) { + if(!xmlStrcmp(i->children->content, (const xmlChar *) "no")) { + queue = false; + } + } else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) { + as = to_ustring((const char*)i->children->content); + } + } + + if(!as.empty()) { + if(!xmlStrcmp(side, (const xmlChar *) "sl")) { + evalStringCache[element] = TransferInstr(ti_linkto_sl, part, pos, NULL, queue, as); + } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { + evalStringCache[element] = TransferInstr(ti_linkto_ref, part, pos, NULL, queue, as); + } else { + evalStringCache[element] = TransferInstr(ti_linkto_tl, part, pos, NULL, queue, as); + } + } else if(!xmlStrcmp(side, (const xmlChar *) "sl")) { + evalStringCache[element] = TransferInstr(ti_clip_sl, part, pos, NULL, queue); + } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { + evalStringCache[element] = TransferInstr(ti_clip_ref, part, pos, NULL, queue); + } else { + evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL, queue); } } void -Transfer::collectMacros(xmlNode *localroot) +Transfer::processBlank(xmlNode* element) { - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - macro_map.push_back(i); + evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); +} + +void +Transfer::processCaseOf(xmlNode* element) +{ + int pos = 0; + xmlChar *side = NULL; + UString part; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) { + side = i->children->content; + } else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { + part = to_ustring((const char*) i->children->content); + } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { + pos = atoi((const char *) i->children->content) - 1; } } + + if(!xmlStrcmp(side, (const xmlChar *) "sl")) { + evalStringCache[element] = TransferInstr(ti_case_of_sl, part, pos); + } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { + evalStringCache[element] = TransferInstr(ti_case_of_ref, part, pos); + } else { + evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); + } } -bool -Transfer::checkIndex(xmlNode *element, int index, int limit) +UString +Transfer::processLu(xmlNode* element) { - if(index >= limit) - { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": index >= limit" << endl; - return false; + in_lu = true; + out_wblank.clear(); + + UString myword; + for (auto i : children(element)) { + myword.append(evalString(i)); } - if(index < 0) { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": index < 0" << endl; - return false; + + in_lu = false; + + if(last_lword == 1) { + out_wblank = word[0]->getWblank(); } - if(word[index] == 0) - { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": Null access at word[index]" << endl; - return false; + + if(!myword.empty()) { + if(myword[0] != '[' || myword[1] != '[') { + UString ret = out_wblank; + ret += '^'; + ret += myword; + ret += '$'; + return ret; + } else { + myword += '$'; + return myword; + } + } else { + return ""_u; } - return true; } -bool -Transfer::gettingLemmaFromWord(string attr) +UString +Transfer::processMlu(xmlNode* element) { - return (attr.compare("lem") == 0 || attr.compare("lemh") == 0 || attr.compare("whole") == 0); + UString value; + + bool first_time = true; + out_wblank.clear(); + + in_lu = true; + for (auto i : children(element)) { + UString myword; + for (auto j : children(i)) { + myword.append(evalString(j)); + } + + if (!first_time) { + if(!myword.empty() && myword[0] != '#') { //'+#' problem + value += '+'; + } + } else { + if (!myword.empty()) { + first_time = false; + } + } + + value.append(myword); + } + + if(last_lword == 1) { + out_wblank = word[0]->getWblank(); + } + + if(!value.empty()) { + UString ret = out_wblank; + ret += '^'; + ret += value; + ret += '$'; + return ret; + } else { + return ""_u; + } } -string -Transfer::combineWblanks(string wblank_current, string wblank_to_add) +void +Transfer::processLuCount(xmlNode* element) { - if(wblank_current.empty() && wblank_to_add.empty()) - { - return wblank_current; - } - else if(wblank_current.empty()) - { - return wblank_to_add; + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); +} + +void +Transfer::processOut(xmlNode *localroot) +{ + in_out = true; + + for (auto i : children(localroot)) { + if(defaultAttrs == lu) { + if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { + write(processLu(i), output); + } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { + write(processMlu(i), output); + } + } else { + if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) { + write(processChunk(i), output); + } else { // 'b' + write(evalString(i), output); + } + } } - else if(wblank_to_add.empty()) + in_out = false; +} + +UString +Transfer::processChunk(xmlNode *localroot) +{ + UString name, namefrom; + UString caseofchunk = "aa"_u; + UString result; + + for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) { - return wblank_current; + if(!xmlStrcmp(i->name, (const xmlChar *) "name")) + { + name = to_ustring((const char *) i->children->content); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "namefrom")) + { + namefrom = to_ustring((const char *) i->children->content); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "case")) + { + caseofchunk = to_ustring((const char *) i->children->content); + } } - - string new_out_wblank; - for(string::const_iterator it = wblank_current.begin(); it != wblank_current.end(); it++) + + result += '^'; + if(!caseofchunk.empty()) { - if(*it == '\\') + if(!name.empty()) { - new_out_wblank += *it; - it++; - new_out_wblank += *it; + result.append(StringUtils::copycase(variables[caseofchunk], name)); } - else if(*it == ']') + else if(!namefrom.empty()) { - if(*(it+1) == ']') - { - new_out_wblank += ';'; - break; - } + result.append(StringUtils::copycase(variables[caseofchunk], variables[namefrom])); } else { - new_out_wblank += *it; + cerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; + exit(EXIT_FAILURE); } } - - for(string::const_iterator it = wblank_to_add.begin(); it != wblank_to_add.end(); it++) + else { - if(*it == '\\') + if(!name.empty()) { - new_out_wblank += *it; - it++; - new_out_wblank += *it; + result.append(name); } - else if(*it == '[') + else if(!namefrom.empty()) { - if(*(it+1) == '[') - { - new_out_wblank += ' '; - it++; - } + result.append(variables[namefrom]); } else { - new_out_wblank += *it; + cerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; + exit(EXIT_FAILURE); + } + } + + for (auto i : children(localroot)) { + if(!xmlStrcmp(i->name, (const xmlChar *) "tags")) { + result.append(processTags(i)); + result += '{'; + } else if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { + result.append(processLu(i)); + } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { + result.append(processMlu(i)); + } else { // 'b' + result.append(evalString(i)); + } + } + result += '}'; + result += '$'; + return result; +} + +UString +Transfer::processTags(xmlNode *localroot) +{ + UString result; + for (auto i : children(localroot)) { + if (!xmlStrcmp(i->name, (const xmlChar*) "tag")) { + for (auto j : children(i)) { + result.append(evalString(j)); + } } } - - return new_out_wblank; + return result; } -string -Transfer::evalString(xmlNode *element) +void +Transfer::processLet(xmlNode *localroot) { - map::iterator it; - it = evalStringCache.find(element); + xmlNode *leftSide = NULL, *rightSide = NULL; + + for (auto i : children(localroot)) { + if(leftSide == NULL) { + leftSide = i; + } else { + rightSide = i; + break; + } + } + + map::iterator it = evalStringCache.find(leftSide); if(it != evalStringCache.end()) { TransferInstr &ti = it->second; switch(ti.getType()) { + case ti_var: + in_let_var = true; + var_val = ti.getContent(); + + var_out_wblank[var_val].clear(); + + variables[ti.getContent()] = evalString(rightSide); + + in_let_var = false; + + return; + case ti_clip_sl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(gettingLemmaFromWord(ti.getContent()) && last_lword > 1) + if (checkIndex(leftSide, ti.getPos(), lword)) { + bool match = word[ti.getPos()]->setSource(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); + if (!match && trace) { - if(in_lu) - { - out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); - } - else if(in_let_var) - { - var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); - } + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } - - return word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()); } - break; + return; case ti_clip_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(gettingLemmaFromWord(ti.getContent()) && last_lword > 1) + if (checkIndex(leftSide, ti.getPos(), lword)) { + bool match = word[ti.getPos()]->setTarget(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); + if (!match && trace) { - if(in_lu) - { - out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); - } - else if(in_let_var) - { - var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); - } - } - - return word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()); - } - break; - - case ti_clip_ref: - if(checkIndex(element, ti.getPos(), lword)) - { - return word[ti.getPos()]->reference(attr_items[ti.getContent()], ti.getCondition()); - } - break; - - case ti_linkto_sl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()) != "") - { - return "<" + string((char *) ti.getPointer()) + ">"; - } - else - { - return ""; - } - } - break; - - case ti_linkto_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - if(word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()) != "") - { - return "<" + string((char *) ti.getPointer()) + ">"; - } - else - { - return ""; - } - } - break; - - case ti_linkto_ref: - if(checkIndex(element, ti.getPos(), lword)) - { - if(word[ti.getPos()]->reference(attr_items[ti.getContent()], ti.getCondition()) != "") - { - return "<" + string((char *) ti.getPointer()) + ">"; - } - else - { - return ""; - } - } - break; - - case ti_var: - if(last_lword > 1) - { - out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); - } - return variables[ti.getContent()]; - - case ti_lit_tag: - case ti_lit: - return ti.getContent(); - - case ti_b: - if(!blank_queue.empty()) - { - string retblank = blank_queue.front(); - if(in_out) - { - blank_queue.pop(); - } - - return retblank; - } - else - { - return " "; - } - break; - - case ti_get_case_from: - if(checkIndex(element, ti.getPos(), lword)) - { - return copycase(word[ti.getPos()]->source(attr_items[ti.getContent()]), - evalString((xmlNode *) ti.getPointer())); - } - break; - - case ti_case_of_sl: - if(checkIndex(element, ti.getPos(), lword)) - { - return caseOf(word[ti.getPos()]->source(attr_items[ti.getContent()])); - } - break; - - case ti_case_of_tl: - if(checkIndex(element, ti.getPos(), lword)) - { - return caseOf(word[ti.getPos()]->target(attr_items[ti.getContent()])); - } - break; - - case ti_case_of_ref: - if(checkIndex(element, ti.getPos(), lword)) - { - return caseOf(word[ti.getPos()]->reference(attr_items[ti.getContent()])); - } - break; - - default: - return ""; - } - return ""; - } - - if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) - { - int pos = 0; - xmlChar *part = NULL, *side = NULL, *as = NULL; - bool queue = true; - - for(xmlAttr *i = element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "side")) - { - side = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *)i->children->content) - 1; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) - { - if(!xmlStrcmp(i->children->content, (const xmlChar *) "no")) - { - queue = false; - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) - { - as = i->children->content; - } - } - - if(as != NULL) - { - if(!xmlStrcmp(side, (const xmlChar *) "sl")) - { - evalStringCache[element] = TransferInstr(ti_linkto_sl, (const char *) part, pos, (void *) as, queue); - } - else if(!xmlStrcmp(side, (const xmlChar *) "ref")) - { - evalStringCache[element] = TransferInstr(ti_linkto_ref, (const char *) part, pos, (void *) as, queue); - } - else - { - evalStringCache[element] = TransferInstr(ti_linkto_tl, (const char *) part, pos, (void *) as, queue); - } - } - else if(!xmlStrcmp(side, (const xmlChar *) "sl")) - { - evalStringCache[element] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue); - } - else if(!xmlStrcmp(side, (const xmlChar *) "ref")) - { - evalStringCache[element] = TransferInstr(ti_clip_ref, (const char *) part, pos, NULL, queue); - } - else - { - evalStringCache[element] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL, queue); - } - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) - { - evalStringCache[element] = TransferInstr(ti_lit_tag, - tags((const char *) element->properties->children->content), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) - { - evalStringCache[element] = TransferInstr(ti_lit, string((char *) element->properties->children->content), 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) - { - evalStringCache[element] = TransferInstr(ti_b, " ", -1); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) - { - int pos = atoi((const char *) element->properties->children->content) - 1; - xmlNode *param = NULL; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - param = i; - break; - } - } - - evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) - { - evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0); - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) - { - int pos = 0; - xmlChar *part = NULL, *side = NULL; - - for(xmlAttr *i = element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "side")) - { - side = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *) i->children->content) - 1; - } - } - - if(!xmlStrcmp(side, (const xmlChar *) "sl")) - { - evalStringCache[element] = TransferInstr(ti_case_of_sl, (const char *) part, pos); - } - else if(!xmlStrcmp(side, (const xmlChar *) "ref")) - { - evalStringCache[element] = TransferInstr(ti_case_of_ref, (const char *) part, pos); - } - else - { - evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos); - } - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) - { - string value; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - value.append(evalString(i)); - } - } - return value; - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "lu")) - { - in_lu = true; - out_wblank.clear(); - - string myword; - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - myword.append(evalString(i)); - } - } - - in_lu = false; - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(myword != "") - { - if(myword[0] != L'[' || myword[1] != L'[') - { - return out_wblank+"^"+myword+"$"; - } - else - { - return myword+"$"; - } - } - else - { - return ""; - } - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu")) - { - string value; - - bool first_time = true; - out_wblank.clear(); - - for(xmlNode *i = element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - in_lu = true; - - string myword; - - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - myword.append(evalString(j)); - } - } - - in_lu = false; - - if(!first_time) - { - if(myword != "" && myword[0] != '#') //'+#' problem - { - value.append("+"); - } - } - else - { - if(myword != "") - { - first_time = false; - } - } - - value.append(myword); - } - } - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(value != "") - { - return out_wblank+"^"+value+"$"; - } - else - { - return ""; - } - } - else if(!xmlStrcmp(element->name, (const xmlChar *) "chunk")) - { - return processChunk(element); - } - else - { - wcerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; - exit(EXIT_FAILURE); - } - - return evalString(element); -} - -void -Transfer::processOut(xmlNode *localroot) -{ - in_out = true; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(defaultAttrs == lu) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) - { - in_lu = true; - out_wblank.clear(); - - string myword; - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - myword.append(evalString(j)); - } - } - - in_lu = false; - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(myword != "") - { - if(myword[0] != L'[' || myword[1] != L'[') - { - fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); - fputwc_unlocked(L'^', output); - } - - fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); - fputwc_unlocked(L'$', output); - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) - { - string myword; - bool first_time = true; - out_wblank.clear(); - - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - in_lu = true; - - string mylocalword; - for(xmlNode *k = j->children; k != NULL; k = k->next) - { - if(k->type == XML_ELEMENT_NODE) - { - mylocalword.append(evalString(k)); - } - } - - in_lu = false; - - if(!first_time) - { - if(mylocalword != "" && mylocalword[0] != '#') //'+#' problem - { - myword += '+'; - } - } - else - { - if(mylocalword != "") - { - first_time = false; - } - } - - myword.append(mylocalword); - } - } - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(myword != "") - { - fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); - fputwc_unlocked('^', output); - fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); - fputwc_unlocked(L'$', output); - } - } - else // 'b' - { - fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), - output); - } - } - else - { - if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) - { - fputws_unlocked(UtfConverter::fromUtf8(processChunk(i)).c_str(), output); - } - else // 'b' - { - fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output); - } - } - } - } - - in_out = false; -} - -string -Transfer::processChunk(xmlNode *localroot) -{ - string name, namefrom; - string caseofchunk = "aa"; - string result; - - for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "name")) - { - name = (const char *) i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "namefrom")) - { - namefrom = (const char *) i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "case")) - { - caseofchunk = (const char *) i->children->content; - } - } - - result.append("^"); - if(caseofchunk != "") - { - if(name != "") - { - result.append(copycase(variables[caseofchunk], name)); - } - else if(namefrom != "") - { - result.append(copycase(variables[caseofchunk], variables[namefrom])); - } - else - { - wcerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; - exit(EXIT_FAILURE); - } - } - else - { - if(name != "") - { - result.append(name); - } - else if(namefrom != "") - { - result.append(variables[namefrom]); - } - else - { - wcerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; - exit(EXIT_FAILURE); - } - } - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "tags")) - { - result.append(processTags(i)); - result.append("{"); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) - { - in_lu = true; - out_wblank.clear(); - - string myword; - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - myword.append(evalString(j)); - } - } - - in_lu = false; - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(myword != "") - { - result.append(out_wblank); - result.append("^"); - result.append(myword); - result.append("$"); - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) - { - bool first_time = true; - string myword; - - out_wblank.clear(); - - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - string mylocalword; - if(j->type == XML_ELEMENT_NODE) - { - in_lu = true; - - for(xmlNode *k = j->children; k != NULL; k = k->next) - { - if(k->type == XML_ELEMENT_NODE) - { - mylocalword.append(evalString(k)); - } - } - - in_lu = false; - - if(!first_time) - { - if(mylocalword != "" && mylocalword[0] != '#') // '+#' problem - { - myword += '+'; - } - } - else - { - first_time = false; - } - } - myword.append(mylocalword); - } - - if(last_lword == 1) - { - out_wblank = word[0]->getWblank(); - } - - if(myword != "") - { - result.append(out_wblank); - result.append("^"); - result.append(myword); - result.append("$"); - } - } - else // 'b' - { - result.append(evalString(i)); - } - } - } - result.append("}$"); - return result; -} - -string -Transfer::processTags(xmlNode *localroot) -{ - string result; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (xmlChar const *) "tag")) - { - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - result.append(evalString(j)); - } - } - } - } - } - return result; -} - -int -Transfer::processInstruction(xmlNode *localroot) -{ - int words_to_consume = -1; - if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) - { - words_to_consume = processChoose(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) - { - processLet(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) - { - processAppend(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) - { - processOut(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) - { - processCallMacro(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) - { - processModifyCase(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "reject-current-rule")) - { - words_to_consume = processRejectCurrentRule(localroot); - } - return words_to_consume; -} - -int -Transfer::processRejectCurrentRule(xmlNode *localroot) -{ - bool shifting = true; - string value; - for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "shifting")) - { - value = (char *) i->children->content; - break; - } - } - - if(value == "no") - { - shifting = false; - } - - return shifting ? 1 : 0; -} - -void -Transfer::processLet(xmlNode *localroot) -{ - xmlNode *leftSide = NULL, *rightSide = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(leftSide == NULL) - { - leftSide = i; - } - else - { - rightSide = i; - break; - } - } - } - - map::iterator it = evalStringCache.find(leftSide); - if(it != evalStringCache.end()) - { - TransferInstr &ti = it->second; - switch(ti.getType()) - { - case ti_var: - in_let_var = true; - var_val = ti.getContent(); - - var_out_wblank[var_val].clear(); - - variables[ti.getContent()] = evalString(rightSide); - - in_let_var = false; - - return; - - case ti_clip_sl: - if (checkIndex(leftSide, ti.getPos(), lword)) { - bool match = word[ti.getPos()]->setSource(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); - if (!match && trace) - { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; - } - } - return; - - case ti_clip_tl: - if (checkIndex(leftSide, ti.getPos(), lword)) { - bool match = word[ti.getPos()]->setTarget(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); - if (!match && trace) - { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; - } - } - return; - - case ti_clip_ref: - if (checkIndex(leftSide, ti.getPos(), lword)) { - bool match = word[ti.getPos()]->setReference(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); - if (!match && trace) - { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; - } - } - return; - - default: - return; - } - } - if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "var")) - { - in_let_var = true; - - string const val = (const char *) leftSide->properties->children->content; - - var_val = val; - var_out_wblank[var_val].clear(); - - variables[val] = evalString(rightSide); - - in_let_var = false; - evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); - } - else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) - { - int pos = 0; - xmlChar *part = NULL, *side = NULL, *as = NULL; - bool queue = true; - - for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "side")) - { - side = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *) i->children->content) - 1; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) - { - if(!xmlStrcmp(i->children->content, (const xmlChar *) "no")) - { - queue = false; - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) - { - as = i->children->content; // TODO: set but never read - } - } - - if (pos >= lword) { - wcerr << L"Error: Transfer::processLet() bad access on pos >= lword" << endl; - return; - } - if (word[pos] == 0) { - wcerr << L"Error: Transfer::processLet() null access on word[pos]" << endl; - return; - } - - if(!xmlStrcmp(side, (const xmlChar *) "tl")) - { - bool match = word[pos]->setTarget(attr_items[(const char *) part], evalString(rightSide), queue); - if(!match && trace) - { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; - } - evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL, queue); - } - else if(!xmlStrcmp(side, (const xmlChar *) "ref")) - { - bool match = word[pos]->setReference(attr_items[(const char *) part], evalString(rightSide), queue); - if(!match && trace) - { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; - } - evalStringCache[leftSide] = TransferInstr(ti_clip_ref, (const char *) part, pos, NULL, queue); - } - else - { - bool match = word[pos]->setSource(attr_items[(const char *) part], evalString(rightSide), queue); - if(!match && trace) - { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; - } - evalStringCache[leftSide] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue); - } - } -} - -void -Transfer::processAppend(xmlNode *localroot) -{ - string name; - for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "n")) - { - name = (char *) i->children->content; - break; - } - } - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - in_let_var = true; - var_val = name; - variables[name].append(evalString(i)); - in_let_var = false; - } - } -} - -void -Transfer::processModifyCase(xmlNode *localroot) -{ - xmlNode *leftSide = NULL, *rightSide = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(leftSide == NULL) - { - leftSide = i; - } - else - { - rightSide = i; - break; - } - } - } - - if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) - { - int pos = 0; - xmlChar *part = NULL, *side = NULL, *as = NULL; - bool queue = true; - - for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "side")) - { - side = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) - { - part = i->children->content; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) - { - pos = atoi((const char *) i->children->content) - 1; - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) - { - if(!xmlStrcmp(i->children->content, (xmlChar const *) "no")) - { - queue = false; - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) - { - as = i->children->content; - (void)as; // ToDo, remove "as" and the whole else? - } - } - if(!xmlStrcmp(side, (const xmlChar *) "sl")) - { - string const result = copycase(evalString(rightSide), - word[pos]->source(attr_items[(const char *) part], queue)); - bool match = word[pos]->setSource(attr_items[(const char *) part], result); - if(!match && trace) - { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; - } - } - else if(!xmlStrcmp(side, (const xmlChar *) "ref")) - { - string const result = copycase(evalString(rightSide), - word[pos]->reference(attr_items[(const char *) part], queue)); - bool match = word[pos]->setReference(attr_items[(const char *) part], result); - if(!match && trace) - { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; - } - } - else - { - string const result = copycase(evalString(rightSide), - word[pos]->target(attr_items[(const char *) part], queue)); - bool match = word[pos]->setTarget(attr_items[(const char *) part], result); - if(!match && trace) - { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; - } - } - } - else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) - { - string const val = (const char *) leftSide->properties->children->content; - variables[val] = copycase(evalString(rightSide), variables[val]); - } -} - -void -Transfer::processCallMacro(xmlNode *localroot) -{ - string const n = (const char *) localroot->properties->children->content; - int npar = 0; - - xmlNode *macro = macro_map[macros[n]]; - - for(xmlAttr *i = macro->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "npar")) - { - npar = atoi((const char *) i->children->content); - break; - } - } - - // ToDo: Is it at all valid if npar <= 0 ? - - TransferWord **myword = NULL; - if(npar > 0) - { - myword = new TransferWord *[npar]; - std::fill(myword, myword+npar, (TransferWord *)(0)); - } - - int idx = 0; - for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if (idx >= npar) { - wcerr << L"Error: processCallMacro() number of arguments >= npar at line " << i->line << endl; - return; - } - int pos = atoi((const char *) i->properties->children->content)-1; - myword[idx] = word[pos]; - - idx++; - } - } - - swap(myword, word); - swap(npar, lword); - - for(xmlNode *i = macro->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - processInstruction(i); - } - } - - swap(myword, word); - swap(npar, lword); - - delete[] myword; -} - -int -Transfer::processChoose(xmlNode *localroot) -{ - int words_to_consume = -1; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "when")) - { - bool picked_option = false; - - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(j->name, (const xmlChar *) "test")) - { - if(!processTest(j)) - { - break; - } - else - { - picked_option = true; - } - } - else - { - words_to_consume = processInstruction(j); - if(words_to_consume != -1) - { - return words_to_consume; - } - } - } - } - if(picked_option) - { - return words_to_consume; - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) - { - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - words_to_consume = processInstruction(j); - if(words_to_consume != -1) - { - return words_to_consume; - } - } - } - } - } - } - return words_to_consume; -} - -bool -Transfer::processLogical(xmlNode *localroot) -{ - if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) - { - return processEqual(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) - { - return processBeginsWith(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) - { - return processBeginsWithList(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) - { - return processEndsWith(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) - { - return processEndsWithList(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) - { - return processContainsSubstring(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) - { - return processOr(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) - { - return processAnd(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) - { - return processNot(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) - { - return processIn(localroot); - } - - return false; -} - -bool -Transfer::processIn(xmlNode *localroot) -{ - xmlNode *value = NULL; - xmlChar *idlist = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(value == NULL) - { - value = i; - } - else - { - idlist = i->properties->children->content; - break; - } - } - } - - string sval = evalString(value); - - if(localroot->properties != NULL) - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - set &myset = listslow[(const char *) idlist]; - if(myset.find(tolower(sval)) != myset.end()) - { - return true; - } - else - { - return false; - } - } - } - - set &myset = lists[(const char *) idlist]; - if(myset.find(sval) != myset.end()) - { - return true; - } - else - { - return false; - } -} - -bool -Transfer::processTest(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - return processLogical(i); - } - } - return false; -} - -bool -Transfer::processAnd(xmlNode *localroot) -{ - bool val = true; - for(xmlNode *i = localroot->children; val && i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - val = val && processLogical(i); - } - } - - return val; -} - -bool -Transfer::processOr(xmlNode *localroot) -{ - bool val = false; - for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - val = val || processLogical(i); - } - } - - return val; -} - -bool -Transfer::processNot(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - return !processLogical(i); - } - } - return false; -} - -bool -Transfer::processEqual(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return evalString(first) == evalString(second); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return tolower(evalString(first)) == tolower(evalString(second)); - } - else - { - return evalString(first) == evalString(second); - } - } -} - -bool -Transfer::beginsWith(string const &s1, string const &s2) const -{ - int const limit = s2.size(), constraint = s1.size(); - - if(constraint < limit) - { - return false; - } - for(int i = 0; i != limit; i++) - { - if(s1[i] != s2[i]) - { - return false; - } - } - - return true; -} - -bool -Transfer::endsWith(string const &s1, string const &s2) const -{ - int const limit = s2.size(), constraint = s1.size(); - - if(constraint < limit) - { - return false; - } - for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) - { - if(s1[j] != s2[i]) - { - return false; - } - } - - return true; -} - - -bool -Transfer::processBeginsWith(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return beginsWith(evalString(first), evalString(second)); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return beginsWith(tolower(evalString(first)), tolower(evalString(second))); - } - else - { - return beginsWith(evalString(first), evalString(second)); - } - } -} - -bool -Transfer::processEndsWith(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return endsWith(evalString(first), evalString(second)); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return endsWith(tolower(evalString(first)), tolower(evalString(second))); - } - else - { - return endsWith(evalString(first), evalString(second)); - } - } -} - -bool -Transfer::processBeginsWithList(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + } + } + return; - xmlChar *idlist = second->properties->children->content; - string needle = evalString(first); - set::iterator it, limit; + case ti_clip_ref: + if (checkIndex(leftSide, ti.getPos(), lword)) { + bool match = word[ti.getPos()]->setReference(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); + if (!match && trace) + { + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + } + } + return; - if(localroot->properties == NULL || - xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) - { - it = lists[(const char *) idlist].begin(); - limit = lists[(const char *) idlist].end(); + default: + return; + } } - else + if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { - needle = tolower(needle); - it = listslow[(const char *) idlist].begin(); - limit = listslow[(const char *) idlist].end(); - } + in_let_var = true; - for(; it != limit; it++) - { - if(beginsWith(needle, *it)) - { - return true; - } - } - return false; -} + UString const val = to_ustring((const char *) leftSide->properties->children->content); + var_val = val; + var_out_wblank[var_val].clear(); -bool -Transfer::processEndsWithList(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; + variables[val] = evalString(rightSide); - for(xmlNode *i = localroot->children; i != NULL; i = i->next) + in_let_var = false; + evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) { - if(i->type == XML_ELEMENT_NODE) + int pos = 0; + xmlChar *side = NULL, *as = NULL; + UString part; + bool queue = true; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) { - if(first == NULL) + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) { - first = i; + side = i->children->content; } - else + else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - second = i; - break; + part = to_ustring((const char*) i->children->content); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) + { + if(!xmlStrcmp(i->children->content, (const xmlChar *) "no")) + { + queue = false; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) + { + as = i->children->content; // TODO: set but never read } } - } - - xmlChar *idlist = second->properties->children->content; - string needle = evalString(first); - set::iterator it, limit; - - if(localroot->properties == NULL || - xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) - { - it = lists[(const char *) idlist].begin(); - limit = lists[(const char *) idlist].end(); - } - else - { - needle = tolower(needle); - it = listslow[(const char *) idlist].begin(); - limit = listslow[(const char *) idlist].end(); - } - for(; it != limit; it++) - { - if(endsWith(needle, *it)) - { - return true; + if (pos >= lword) { + cerr << "Error: Transfer::processLet() bad access on pos >= lword" << endl; + return; + } + if (word[pos] == 0) { + cerr << "Error: Transfer::processLet() null access on word[pos]" << endl; + return; } - } - return false; -} - -bool -Transfer::processContainsSubstring(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) + if(!xmlStrcmp(side, (const xmlChar *) "tl")) { - if(first == NULL) - { - first = i; - } - else + bool match = word[pos]->setTarget(attr_items[part], evalString(rightSide), queue); + if(!match && trace) { - second = i; - break; + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } + evalStringCache[leftSide] = TransferInstr(ti_clip_tl, part, pos, NULL, queue); } - } - - if(localroot->properties == NULL) - { - return evalString(first).find(evalString(second)) != string::npos; - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) + else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { - return tolower(evalString(first)).find(tolower(evalString(second))) != string::npos; + bool match = word[pos]->setReference(attr_items[part], evalString(rightSide), queue); + if(!match && trace) + { + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + } + evalStringCache[leftSide] = TransferInstr(ti_clip_ref, part, pos, NULL, queue); } else { - return evalString(first).find(evalString(second)) != string::npos; + bool match = word[pos]->setSource(attr_items[part], evalString(rightSide), queue); + if(!match && trace) + { + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + } + evalStringCache[leftSide] = TransferInstr(ti_clip_sl, part, pos, NULL, queue); } } } -string -Transfer::copycase(string const &source_word, string const &target_word) +void +Transfer::processModifyCase(xmlNode *localroot) { - wstring result; - wstring const s_word = UtfConverter::fromUtf8(source_word); - wstring const t_word = UtfConverter::fromUtf8(target_word); - - bool firstupper = iswupper(s_word[0]); - bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); - bool sizeone = s_word.size() == 1; + xmlNode *leftSide = NULL, *rightSide = NULL; - if(!uppercase || (sizeone && uppercase)) - { - result = t_word; - result[0] = towlower(result[0]); - //result = StringUtils::tolower(t_word); - } - else - { - result = StringUtils::toupper(t_word); + for (auto i : children(localroot)) { + if(leftSide == NULL) { + leftSide = i; + } else { + rightSide = i; + break; + } } - if(firstupper) + if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) { - result[0] = towupper(result[0]); - } - - return UtfConverter::toUtf8(result); -} - -string -Transfer::caseOf(string const &str) -{ - wstring const s = UtfConverter::fromUtf8(str); + int pos = 0; + xmlChar *side = NULL, *as = NULL; + UString part; + bool queue = true; - if(s.size() > 1) - { - if(!iswupper(s[0])) - { - return "aa"; - } - else if(!iswupper(s[s.size()-1])) + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) { - return "Aa"; + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) + { + side = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = to_ustring((const char*)i->children->content); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) + { + if(!xmlStrcmp(i->children->content, (xmlChar const *) "no")) + { + queue = false; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) + { + as = i->children->content; + (void)as; // ToDo, remove "as" and the whole else? + } } - else + if(!xmlStrcmp(side, (const xmlChar *) "sl")) { - return "AA"; + UString const result = StringUtils::copycase(evalString(rightSide), + word[pos]->source(attr_items[part], queue)); + bool match = word[pos]->setSource(attr_items[part], result); + if(!match && trace) + { + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + } } - } - else if(s.size() == 1) - { - if(!iswupper(s[0])) + else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { - return "aa"; + UString const result = StringUtils::copycase(evalString(rightSide), + word[pos]->reference(attr_items[part], queue)); + bool match = word[pos]->setReference(attr_items[part], result); + if(!match && trace) + { + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + } } else { - return "Aa"; + UString const result = StringUtils::copycase(evalString(rightSide), + word[pos]->target(attr_items[part], queue)); + bool match = word[pos]->setTarget(attr_items[part], result); + if(!match && trace) + { + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + } } } - else + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { - return "aa"; + UString const val = to_ustring((const char *) leftSide->properties->children->content); + variables[val] = StringUtils::copycase(evalString(rightSide), variables[val]); } } -string -Transfer::tolower(string const &str) const +void +Transfer::processCallMacro(xmlNode *localroot) { - return UtfConverter::toUtf8(StringUtils::tolower(UtfConverter::fromUtf8(str))); -} + UString const n = to_ustring((const char *) localroot->properties->children->content); + int npar = 0; -string -Transfer::tags(string const &str) const -{ - string result = "<"; + xmlNode *macro = macro_map[macros[n]]; - for(unsigned int i = 0, limit = str.size(); i != limit; i++) + for(xmlAttr *i = macro->properties; i != NULL; i = i->next) { - if(str[i] == '.') - { - result.append("><"); - } - else + if(!xmlStrcmp(i->name, (const xmlChar *) "npar")) { - result += str[i]; + npar = atoi((const char *) i->children->content); + break; } } - result += '>'; - - return result; -} + // ToDo: Is it at all valid if npar <= 0 ? -int -Transfer::processRule(xmlNode *localroot) -{ - int instruction_return, words_to_consume = -1; - // localroot is suposed to be an 'action' tag - for(xmlNode *i = localroot->children; i != NULL; i = i->next) + TransferWord **myword = NULL; + int idx = 0; + if(npar > 0) { - if(i->type == XML_ELEMENT_NODE) - { - instruction_return = processInstruction(i); - // When an instruction which modifies the number of words to be consumed - // from the input is found, execution of the rule is stopped - if(instruction_return != -1) - { - words_to_consume = instruction_return; - break; + myword = new TransferWord *[npar]; + std::fill(myword, myword+npar, (TransferWord *)(0)); + for (auto i : children(localroot)) { + if (idx >= npar) { + cerr << "Error: processCallMacro() number of arguments >= npar at line " << i->line << endl; + return; } + int pos = atoi((const char *) i->properties->children->content)-1; + myword[idx] = word[pos]; + + idx++; } } - - while(!blank_queue.empty()) //flush remaining blanks that are not spaces - { - if(blank_queue.front().compare(" ") != 0) - { - fputws_unlocked(UtfConverter::fromUtf8(blank_queue.front()).c_str(), output); - } - blank_queue.pop(); + + swap(myword, word); + swap(npar, lword); + + for (auto i : children(macro)) { + processInstruction(i); } - - return words_to_consume; + + swap(myword, word); + swap(npar, lword); + + delete[] myword; } TransferToken & -Transfer::readToken(FILE *in) +Transfer::readToken(InputFile& in) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wstring content; + UString content; while(true) { - int val = fgetwc_unlocked(in); - if(feof(in) || (val == 0 && internal_null_flush)) + UChar32 val = in.get(); + if(in.eof() || (val == 0 && internal_null_flush)) { in_wblank = false; return input_buffer.add(TransferToken(content, tt_eof)); } if(in_wblank) { - content = L"[["; - content+= wchar_t(val); - + content = "[["_u; + content += val; + while(true) { - int val3 = fgetwc_unlocked(in); - if(val3 == L'\\') + UChar32 val3 = in.get(); + if(val3 == '\\') { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += '\\'; + content += in.get(); } - else if(val3 == L'$') //[[..]]^..$ is the LU + else if(val3 == '$') //[[..]]^..$ is the LU { in_wblank = false; return input_buffer.add(TransferToken(content, tt_word)); } - else if(val3 == L'\0' && null_flush) + else if(val3 == '\0' && null_flush) { in_wblank = false; - fflush(output); + u_fflush(output); } else { - content += wchar_t(val3); + content += val3; } } } if(val == '\\') { - content += L'\\'; - content += (wchar_t) fgetwc_unlocked(in); + content += '\\'; + content += in.get(); } - else if(val == L'[') + else if(val == '[') { - content += L'['; + content += '['; while(true) { - int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') + UChar32 val2 = in.get(); + if(val2 == '\\') { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += '\\'; + content += in.get(); } - else if(val2 == L'[') + else if(val2 == '[') { //wordbound blank in_wblank = true; content.pop_back(); - + return input_buffer.add(TransferToken(content, tt_blank)); } - else if(val2 == L']') + else if(val2 == ']') { - content += L']'; + content += ']'; break; } else { - content += wchar_t(val2); + content += val2; } } } - else if(val == L'$') + else if(val == '$') { return input_buffer.add(TransferToken(content, tt_word)); } - else if(val == L'^') + else if(val == '^') { return input_buffer.add(TransferToken(content, tt_blank)); } - else if(val == L'\0' && null_flush) + else if(val == '\0' && null_flush) { in_wblank = false; - fflush(output); + u_fflush(output); } else { - content += wchar_t(val); + content += val; } } } -bool -Transfer::getNullFlush(void) -{ - return null_flush; -} - -void -Transfer::setNullFlush(bool null_flush) -{ - this->null_flush = null_flush; -} - -void -Transfer::setTrace(bool trace) -{ - this->trace = trace; -} - void Transfer::setTraceATT(bool trace) { @@ -2229,24 +896,21 @@ Transfer::tmp_clear() { tmpblank.clear(); tmpword.clear(); + variables = variable_defaults; } void -Transfer::transfer_wrapper_null_flush(FILE *in, FILE *out) +Transfer::transfer_wrapper_null_flush(InputFile& in, UFILE* out) { null_flush = false; internal_null_flush = true; - while(!feof(in)) + while(!in.eof()) { tmp_clear(); transfer(in, out); - fputwc_unlocked(L'\0', out); - int code = fflush(out); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', out); + u_fflush(out); } internal_null_flush = false; @@ -2254,7 +918,7 @@ Transfer::transfer_wrapper_null_flush(FILE *in, FILE *out) } void -Transfer::transfer(FILE *in, FILE *out) +Transfer::transfer(InputFile& in, UFILE* out) { if(getNullFlush()) { @@ -2274,31 +938,31 @@ Transfer::transfer(FILE *in, FILE *out) { if(trace_att) { - wcerr << "Loop start " << endl; - wcerr << "ms.size: " << ms.size() << endl; + cerr << "Loop start " << endl; + cerr << "ms.size: " << ms.size() << endl; - wcerr << "tmpword.size(): " << tmpword.size() << endl; + cerr << "tmpword.size(): " << tmpword.size() << endl; for (unsigned int ind = 0; ind < tmpword.size(); ind++) { if(ind != 0) { - wcerr << L" "; + cerr << " "; } - wcerr << *tmpword[ind]; + cerr << *tmpword[ind]; } - wcerr << endl; + cerr << endl; - wcerr << "tmpblank.size(): " << tmpblank.size() << endl; + cerr << "tmpblank.size(): " << tmpblank.size() << endl; for (unsigned int ind = 0; ind < tmpblank.size(); ind++) { - wcerr << L"'"; - wcerr << *tmpblank[ind]; - wcerr << L"' "; + cerr << "'"; + cerr << *tmpblank[ind]; + cerr << "' "; } - wcerr << endl; + cerr << endl; - wcerr << "last: " << last << endl; - wcerr << "prev_last: " << prev_last << endl << endl; + cerr << "last: " << last << endl; + cerr << "prev_last: " << prev_last << endl << endl; } if(ms.size() == 0) @@ -2309,7 +973,7 @@ Transfer::transfer(FILE *in, FILE *out) if(trace_att) { - wcerr << "num_words_to_consume: " << num_words_to_consume << endl; + cerr << "num_words_to_consume: " << num_words_to_consume << endl; } //Consume all the words from the input which matched the rule. @@ -2356,41 +1020,38 @@ Transfer::transfer(FILE *in, FILE *out) { if(trace_att) { - wcerr << "printing tmpword[0]" < tr; - wstring tr_wblank; + pair tr; + UString tr_wblank; if(useBilingual && preBilingual == false) { - if(isExtended && (*tmpword[0])[0] == L'*') - { - tr = extended.biltransWithQueue((*tmpword[0]).substr(1), false); - if(tr.first[0] == L'@') - { - tr.first[0] = L'*'; - } - else - { - tr.first = L"%" + tr.first; + if(isExtended && (*tmpword[0])[0] == '*') { + tr = extended.biltransWithQueue((*tmpword[0]).substr(1), false); + if(tr.first[0] == '@') { + tr.first[0] = '*'; + } else { + UString temp; + temp += '%'; + temp.append(tr.first); + temp.swap(tr.first); } - } - else - { - tr = fstp.biltransWithQueue(*tmpword[0], false); + } else { + tr = fstp.biltransWithQueue(*tmpword[0], false); } } else if(preBilingual) { - wstring sl; - wstring tl; - wstring ref; - wstring wblank; + UString sl; + UString tl; + UString ref; + UString wblank; int seenSlash = 0; - for(wstring::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++) + for(UString::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++) { - if(*it == L'\\') + if(*it == '\\') { if(seenSlash == 0) { @@ -2412,19 +1073,19 @@ Transfer::transfer(FILE *in, FILE *out) } continue; } - else if(*it == L'[') + else if(*it == '[') { - if(*(it+1) == L'[') //wordbound blank + if(*(it+1) == '[') //wordbound blank { while(true) { - if(*it == L'\\') + if(*it == '\\') { wblank.push_back(*it); it++; wblank.push_back(*it); } - else if(*it == L'^' && *(it-1) == L']' && *(it-2) == L']') + else if(*it == '^' && *(it-1) == ']' && *(it-2) == ']') { break; } @@ -2432,7 +1093,7 @@ Transfer::transfer(FILE *in, FILE *out) { wblank.push_back(*it); } - + it++; } } @@ -2453,7 +1114,7 @@ Transfer::transfer(FILE *in, FILE *out) } continue; } - else if(*it == L'/') + else if(*it == '/') { seenSlash++; @@ -2474,44 +1135,29 @@ Transfer::transfer(FILE *in, FILE *out) } } //tmpword[0]->assign(sl); - tr = pair(tl, false); + tr = pair(tl, false); tr_wblank = wblank; - //wcerr << L"pb: " << *tmpword[0] << L" :: " << sl << L" >> " << tl << endl ; + //cerr << "pb: " << *tmpword[0] << " :: " << sl << " >> " << tl << endl ; } else { - tr = pair(*tmpword[0], 0); + tr = pair(*tmpword[0], 0); } - if(tr.first.size() != 0) - { - if(defaultAttrs == lu) - { - if(tr.first[0] != L'[' || tr.first[1] != L'[') - { - fputws_unlocked(tr_wblank.c_str(), output); - fputwc_unlocked(L'^', output); - } - fputws_unlocked(tr.first.c_str(), output); - fputwc_unlocked(L'$', output); - } - else - { - if(tr.first[0] == '*') - { - fputws_unlocked(L"^unknown{", output); - fputws_unlocked(tr_wblank.c_str(), output); - fputwc_unlocked(L'^', output); - } - else - { - fputws_unlocked(L"^default{", output); - fputws_unlocked(tr_wblank.c_str(), output); - fputwc_unlocked(L'^', output); + if(tr.first.size() != 0) { + if(defaultAttrs == lu) { + if(tr.first[0] != '[' || tr.first[1] != '[') { + u_fprintf(output, "%S^", tr_wblank.c_str()); + } + u_fprintf(output, "%S$", tr.first.c_str()); + } else { + if(tr.first[0] == '*') { + u_fprintf(output, "^unknown{%S^", tr_wblank.c_str()); + } else { + u_fprintf(output, "^default{%S^", tr_wblank.c_str()); + } + u_fprintf(output, "%S$}$", tr.first.c_str()); } - fputws_unlocked(tr.first.c_str(), output); - fputws_unlocked(L"$}$", output); - } } banned_rules.clear(); tmpword.clear(); @@ -2523,11 +1169,10 @@ Transfer::transfer(FILE *in, FILE *out) } else if(tmpblank.size() != 0) { - if(trace_att) - { - wcerr << "printing tmpblank[0]" <c_str(), output); + write(*tmpblank[0], output); tmpblank.clear(); prev_last = last; last = input_buffer.getPos(); @@ -2544,18 +1189,12 @@ Transfer::transfer(FILE *in, FILE *out) last = input_buffer.getPos(); last_lword = tmpword.size(); - if(trace) - { - wcerr << endl << L"apertium-transfer: Rule " << val << L" line " << lastrule_line << L" "; - for (unsigned int ind = 0; ind < tmpword.size(); ind++) - { - if (ind != 0) - { - wcerr << L" "; - } - fputws_unlocked(tmpword[ind]->c_str(), stderr); + if(trace) { + cerr << endl << "apertium-transfer: Rule " << val << " line " << lastrule_line; + for (auto& it : tmpword) { + cerr << " " << *it; } - wcerr << endl; + cerr << endl; } } @@ -2569,7 +1208,7 @@ Transfer::transfer(FILE *in, FILE *out) break; case tt_blank: - ms.step(L' '); + ms.step(' '); tmpblank.push_back(¤t.getContent()); break; @@ -2581,13 +1220,13 @@ Transfer::transfer(FILE *in, FILE *out) } else { - fputws_unlocked(current.getContent().c_str(), output); + write(current.getContent(), output); return; } break; default: - wcerr << "Error: Unknown input token." << endl; + cerr << "Error: Unknown input token." << endl; return; } } @@ -2598,7 +1237,7 @@ Transfer::applyRule() { int words_to_consume; unsigned int limit = tmpword.size(); - //wcerr << L"applyRule: " << tmpword.size() << endl; + //cerr << "applyRule: " << tmpword.size() << endl; for(unsigned int i = 0; i != limit; i++) { @@ -2612,33 +1251,27 @@ Transfer::applyRule() { if(int(blank_queue.size()) < last_lword - 1) { - string blank_to_add = string(UtfConverter::toUtf8(*tmpblank[i-1])); - blank_queue.push(blank_to_add); + blank_queue.push(*tmpblank[i-1]); } } - pair tr; + pair tr; if(useBilingual && preBilingual == false) { tr = fstp.biltransWithQueue(*tmpword[i], false); - wstring refx,wblankx; - word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), - UtfConverter::toUtf8(tr.first), - UtfConverter::toUtf8(refx), - UtfConverter::toUtf8(wblankx), - tr.second); + word[i] = new TransferWord(*tmpword[i], tr.first, ""_u, ""_u, tr.second); } else if(preBilingual) { - wstring sl; - wstring tl; - wstring ref; - wstring wblank; + UString sl; + UString tl; + UString ref; + UString wblank; int seenSlash = 0; - for(wstring::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++) + for(UString::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++) { - if(*it == L'\\') + if(*it == '\\') { if(seenSlash == 0) { @@ -2660,19 +1293,19 @@ Transfer::applyRule() } continue; } - else if(*it == L'[') + else if(*it == '[') { - if(*(it+1) == L'[') //wordbound blank + if(*(it+1) == '[') //wordbound blank { while(true) { - if(*it == L'\\') + if(*it == '\\') { wblank.push_back(*it); it++; wblank.push_back(*it); } - else if(*it == L'^' && *(it-1) == L']' && *(it-2) == L']') + else if(*it == '^' && *(it-1) == ']' && *(it-2) == ']') { break; } @@ -2680,7 +1313,7 @@ Transfer::applyRule() { wblank.push_back(*it); } - + it++; } } @@ -2702,7 +1335,7 @@ Transfer::applyRule() continue; } - if(*it == L'/') + if(*it == '/') { seenSlash++; @@ -2722,22 +1355,13 @@ Transfer::applyRule() ref.push_back(*it); } } - tr = pair(tl, false); - word[i] = new TransferWord(UtfConverter::toUtf8(sl), - UtfConverter::toUtf8(tr.first), - UtfConverter::toUtf8(ref), - UtfConverter::toUtf8(wblank), - tr.second); + tr = pair(tl, false); + word[i] = new TransferWord(sl, tr.first, ref, wblank, tr.second); } else // neither useBilingual nor preBilingual (sl==tl) { - tr = pair(*tmpword[i], false); - wstring refx,wblankx; - word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), - UtfConverter::toUtf8(tr.first), - UtfConverter::toUtf8(refx), - UtfConverter::toUtf8(wblankx), - tr.second); + tr = pair(*tmpword[i], false); + word[i] = new TransferWord(*tmpword[i], tr.first, ""_u, ""_u, tr.second); } } @@ -2762,53 +1386,53 @@ Transfer::applyRule() /* HERE */ void -Transfer::applyWord(wstring const &word_str) +Transfer::applyWord(UString const &word_str) { - ms.step(L'^'); - + ms.step('^'); + for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) { switch(word_str[i]) { - case L'\\': + case '\\': i++; - ms.step(towlower(word_str[i]), any_char); - break; - - case L'[': - if(word_str[i+1] == L'[') + ms.step(u_tolower(word_str[i]), any_char); + break; + + case '[': + if(word_str[i+1] == '[') { while(true) { - if(word_str[i] == L'\\') + if(word_str[i] == '\\') { i++; } else if(i >= 4) { - if(word_str[i] == L'^' && word_str[i-1] == L']' && word_str[i-2] == L']') + if(word_str[i] == '^' && word_str[i-1] == ']' && word_str[i-2] == ']') { break; } } - + i++; } } else { - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); } break; - - case L'/': + + case '/': i = limit; break; - case L'<': + case '<': for(unsigned int j = i+1; j != limit; j++) { - if(word_str[j] == L'>') + if(word_str[j] == '>') { int symbol = alphabet(word_str.substr(i, j-i+1)); if(symbol) @@ -2826,11 +1450,11 @@ Transfer::applyWord(wstring const &word_str) break; default: - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; } } - ms.step(L'$'); + ms.step('$'); } void diff --git a/apertium/transfer.h b/apertium/transfer.h index 80ff1bc..9cb77be 100644 --- a/apertium/transfer.h +++ b/apertium/transfer.h @@ -17,150 +17,75 @@ #ifndef _TRANSFER_ #define _TRANSFER_ -#include -#include +#include + #include -#include -#include -#include #include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include +#include using namespace std; -class Transfer +class Transfer : public TransferBase { private: - Alphabet alphabet; - MatchExe *me; - MatchState ms; - map attr_items; - map variables; - map macros; - map, Ltstr> lists; - map, Ltstr> listslow; - vector macro_map; - vector rule_map; - vector rule_lines; - xmlDoc *doc; - xmlNode *root_element; TransferWord **word; - queue blank_queue; - int lword; int last_lword; - Buffer input_buffer; - vector tmpword; - vector tmpblank; - - bool in_out; bool in_lu; - bool in_let_var; - string var_val; //stores the name of the variable being processed (in let or append) - bool in_wblank; - string out_wblank; - map var_out_wblank; + UString out_wblank; + map var_out_wblank; - bool gettingLemmaFromWord(string attr); - string combineWblanks(string wblank_current, string wblank_to_add); - FSTProcessor fstp; FSTProcessor extended; bool isExtended; - FILE *output; - int any_char; - int any_tag; - - xmlNode *lastrule; - unsigned int nwords; - - map evalStringCache; enum OutputType{lu,chunk}; OutputType defaultAttrs; bool preBilingual; bool useBilingual; - bool null_flush; - bool internal_null_flush; - bool trace; bool trace_att; - string emptyblank; + UString emptyblank; - void destroy(); - void readData(FILE *input); void readBil(string const &filename); - void readTransfer(string const &input); - void collectMacros(xmlNode *localroot); - void collectRules(xmlNode *localroot); - string caseOf(string const &str); - string copycase(string const &source_word, string const &target_word); void processLet(xmlNode *localroot); - void processAppend(xmlNode *localroot); - int processRejectCurrentRule(xmlNode *localroot); void processOut(xmlNode *localroot); void processCallMacro(xmlNode *localroot); void processModifyCase(xmlNode *localroot); - bool processLogical(xmlNode *localroot); - bool processTest(xmlNode *localroot); - bool processAnd(xmlNode *localroot); - bool processOr(xmlNode *localroot); - bool processEqual(xmlNode *localroot); - bool processBeginsWith(xmlNode *localroot); - bool processBeginsWithList(xmlNode *localroot); - bool processEndsWith(xmlNode *localroot); - bool processEndsWithList(xmlNode *local); - bool processContainsSubstring(xmlNode *localroot); - bool processNot(xmlNode *localroot); - bool processIn(xmlNode *localroot); - int processRule(xmlNode *localroot); - string evalString(xmlNode *localroot); - int processInstruction(xmlNode *localroot); - int processChoose(xmlNode *localroot); - string processChunk(xmlNode *localroot); - string processTags(xmlNode *localroot); + UString evalCachedString(xmlNode *localroot); + UString processChunk(xmlNode *localroot); + UString processTags(xmlNode *localroot); + void processClip(xmlNode* element); + void processBlank(xmlNode* element); + void processCaseOf(xmlNode* element); + UString processLu(xmlNode* element); + UString processMlu(xmlNode* element); + + void processLuCount(xmlNode* element); - bool beginsWith(string const &str1, string const &str2) const; - bool endsWith(string const &str1, string const &str2) const; - string tolower(string const &str) const; - string tags(string const &str) const; - wstring readWord(FILE *in); - wstring readBlank(FILE *in); - wstring readUntil(FILE *in, int const symbol) const; - void applyWord(wstring const &word_str); + UString readWord(InputFile& in); + UString readBlank(InputFile& in); + UString readUntil(InputFile& in, int const symbol) const; + void applyWord(UString const &word_str); int applyRule(); - TransferToken & readToken(FILE *in); + TransferToken & readToken(InputFile& in); bool checkIndex(xmlNode *element, int index, int limit); - void transfer_wrapper_null_flush(FILE *in, FILE *out); + void transfer_wrapper_null_flush(InputFile& in, UFILE* out); void tmp_clear(); public: Transfer(); - ~Transfer(); void read(string const &transferfile, string const &datafile, string const &fstfile = ""); - void transfer(FILE *in, FILE *out); + void transfer(InputFile& in, UFILE* out); void setUseBilingual(bool value); bool getUseBilingual(void) const; void setPreBilingual(bool value); bool getPreBilingual(void) const; void setExtendedDictionary(string const &fstfile); void setCaseSensitiveness(bool value); - bool getNullFlush(void); - void setNullFlush(bool null_flush); - void setTrace(bool trace); void setTraceATT(bool trace); }; diff --git a/apertium/transfer_base.cc b/apertium/transfer_base.cc new file mode 100644 index 0000000..9feb672 --- /dev/null +++ b/apertium/transfer_base.cc @@ -0,0 +1,603 @@ +#include +#include +#include +#include +#include + +using namespace std; + +TransferBase::TransferBase() + : me(nullptr), doc(nullptr), root_element(nullptr), + lword(0), lastrule(nullptr), nwords(0), output(nullptr), + any_char(0), any_tag(0), in_let_var(false), in_out(false), + null_flush(false), internal_null_flush(false), trace(false) +{} + +TransferBase::~TransferBase() +{ + if (me) { + delete me; + me = nullptr; + } + if (doc) { + xmlFreeDoc(doc); + doc = nullptr; + } +} + +void +TransferBase::read(const char* transferfile, const char* datafile) +{ + doc = xmlReadFile(transferfile, NULL, 0); + if (doc == NULL) { + cerr << "Error: Could not parse file '" << transferfile << "'." << endl; + exit(EXIT_FAILURE); + } + root_element = xmlDocGetRootElement(doc); + + for (auto i : children(root_element)) { + if (!xmlStrcmp(i->name, (const xmlChar*) "section-def-macros")) { + collectMacros(i); + } else if (!xmlStrcmp(i->name, (const xmlChar*) "section-rules")) { + collectRules(i); + } + } + + + FILE* in = fopen(datafile, "rb"); + if (!in) { + cerr << "Error: Could not open file '" << datafile << "' for reading." << endl; + exit(EXIT_FAILURE); + } + + alphabet.read(in); + any_char = alphabet(TRXReader::ANY_CHAR); + any_tag = alphabet(TRXReader::ANY_TAG); + + Transducer t; + t.read(in, alphabet.size()); + + map finals; + + // finals + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + int key = Compression::multibyte_read(in); + finals[key] = Compression::multibyte_read(in); + } + + me = new MatchExe(t, finals); + + // attr_items + bool icu = Compression::string_read(in).empty(); + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + UString const cad_k = Compression::string_read(in); + attr_items[cad_k].read(in); + UString fallback = Compression::string_read(in); + if (!icu && cad_k == "chname"_u) { + // chname was previously "({([^/]+)\\/)" + // which is fine for PCRE, but ICU chokes on the unmatched bracket + fallback = "(\\{([^/]+)\\/)"_u; + } + attr_items[cad_k].compile(fallback); + } + + // variables + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + UString const cad_k = Compression::string_read(in); + variables[cad_k] = Compression::string_read(in); + variable_defaults[cad_k] = variables[cad_k]; + } + + // macros + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + UString const cad_k = Compression::string_read(in); + macros[cad_k] = Compression::multibyte_read(in); + } + + // lists + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + UString const cad_k = Compression::string_read(in); + + for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) + { + UString const cad_v = Compression::string_read(in); + lists[cad_k].insert(cad_v); + listslow[cad_k].insert(StringUtils::tolower(cad_v)); + } + } +} + +void +TransferBase::collectRules(xmlNode* localroot) +{ + for (auto rule : children(localroot)) { + size_t line = rule->line; + for (auto rulechild : children(rule)) { + if(!xmlStrcmp(rulechild->name, (const xmlChar *) "action")) { + rule_map.push_back(rulechild); + rule_lines.push_back(line); + break; + } + } + } +} + +void +TransferBase::collectMacros(xmlNode* localroot) +{ + for (auto i : children(localroot)) { + macro_map.push_back(i); + } +} + +bool +TransferBase::gettingLemmaFromWord(const UString& attr) +{ + return attr == "lem"_u || attr == "lemh"_u || attr == "whole"_u; +} + +UString +TransferBase::combineWblanks(const UString& first, const UString& second) +{ + if (first.empty()) { + return second; + } else if (second.empty()) { + return first; + } + UString ret; + ret.reserve(first.size() + second.size()); + if (endsWith(first, "]]"_u)) { + if (first.size() > 2) { + size_t i = first.size() - 3; + bool esc = false; + while (first[i] == '\\') { + i--; + esc = !esc; + } + if (esc) { + ret.append(first); + } else { + ret.append(first.substr(0, first.size()-2)); + } + } else { + ret.append(first.substr(0, first.size()-2)); + } + } else { + ret.append(first); + } + ret += ';'; + ret += ' '; + if (beginsWith(second, "[["_u)) { + ret.append(second.substr(2)); + } else { + ret.append(second); + } + return ret; +} + +UString +TransferBase::evalString(xmlNode* element) +{ + if (!element) { + throw "evalString() was called on a NULL element"; + } + if (evalStringCache.find(element) != evalStringCache.end()) { + return evalCachedString(element); + } + if (!xmlStrcmp(element->name, (const xmlChar*) "clip")) { + processClip(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "lit-tag")) { + evalStringCache[element] = TransferInstr(ti_lit_tag, tags(getattr(element, "v")), 0); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "lit")) { + evalStringCache[element] = TransferInstr(ti_lit, getattr(element, "v"), 0); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "b")) { + processBlank(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "get-case-from")) { + int pos = atoi((const char*) element->properties->children->content); + xmlNode* param = NULL; + for (auto it : children(element)) { + param = it; + break; + } + evalStringCache[element] = TransferInstr(ti_get_case_from, "lem"_u, pos, param); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "var")) { + evalStringCache[element] = TransferInstr(ti_var, getattr(element, "n"), 0); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "lu-count")) { + processLuCount(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "case-of")) { + processCaseOf(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "concat")) { + UString value; + for (auto it : children(element)) { + value.append(evalString(it)); + } + return value; + } else if (!xmlStrcmp(element->name, (const xmlChar*) "lu")) { + return processLu(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "mlu")) { + return processMlu(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "chunk")) { + return processChunk(element); + } else { + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + } + return evalCachedString(element); +} + +int +TransferBase::processRule(xmlNode* localroot) +{ + int words_to_consume = -1; + // iterating over the tag + for (auto i : children(localroot)) { + words_to_consume = processInstruction(i); + // When an instruction which modifies the number of words to be consumed + // from the input is found, execution of the rule is stopped + if (words_to_consume != -1) { + break; + } + } + // flush remaining non-space blanks + while (!blank_queue.empty()) { + if (blank_queue.front() != " "_u) { + write(blank_queue.front(), output); + } + blank_queue.pop(); + } + return words_to_consume; +} + +int +TransferBase::processInstruction(xmlNode* localroot) +{ + int words_to_consume = -1; + if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) + { + words_to_consume = processChoose(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) + { + processLet(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) + { + processAppend(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) + { + processOut(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) + { + processCallMacro(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) + { + processModifyCase(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "reject-current-rule")) + { + words_to_consume = processRejectCurrentRule(localroot); + } + return words_to_consume; +} + +int +TransferBase::processRejectCurrentRule(xmlNode* localroot) +{ + bool shifting = (getattr(localroot, "shifting") == "yes"_u); + return shifting ? 1 : 0; +} + +int +TransferBase::processChoose(xmlNode* localroot) +{ + int words_to_consume = -1; + for (auto option : children(localroot)) { + if (!xmlStrcmp(option->name, (const xmlChar*) "when")) { + bool picked = false; + for (auto it : children(option)) { + if (!xmlStrcmp(it->name, (const xmlChar*) "test")) { + if (!processTest(it)) { + break; + } else { + picked = true; + } + } else { + words_to_consume = processInstruction(it); + if (words_to_consume != -1) { + return words_to_consume; + } + } + } + if (picked) { + return words_to_consume; + } + } else if (!xmlStrcmp(option->name, (const xmlChar*) "otherwise")) { + for (auto it : children(option)) { + words_to_consume = processInstruction(it); + if (words_to_consume != -1) { + return words_to_consume; + } + } + } + } + return words_to_consume; +} + +void +TransferBase::processAppend(xmlNode* localroot) +{ + UString name = getattr(localroot, "n"); + for (auto i : children(localroot)) { + in_let_var = true; + var_val = name; + variables[name].append(evalString(i)); + in_let_var = false; + } +} + +bool +TransferBase::processLogical(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) + { + return processEqual(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) + { + return processBeginsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) + { + return processBeginsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) + { + return processEndsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) + { + return processEndsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) + { + return processContainsSubstring(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) + { + return processOr(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) + { + return processAnd(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) + { + return processNot(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) + { + return processIn(localroot); + } + + return false; +} + +bool +TransferBase::processTest(xmlNode* localroot) +{ + for (auto i : children(localroot)) { + return processLogical(i); + } + return false; +} + +bool +TransferBase::processAnd(xmlNode* localroot) +{ + for (auto i : children(localroot)) { + if (!processLogical(i)) { + return false; + } + } + return true; +} + +bool +TransferBase::processOr(xmlNode* localroot) +{ + for (auto i : children(localroot)) { + if (processLogical(i)) { + return true; + } + } + return false; +} + +bool +TransferBase::processNot(xmlNode* localroot) +{ + for (auto i : children(localroot)) { + return !processLogical(i); + } + return false; +} + +bool +TransferBase::beginsWith(const UString& haystack, const UString& needle) +{ + const size_t hlen = haystack.size(); + const size_t nlen = needle.size(); + if (hlen < nlen) { + return false; + } + for (size_t i = 0; i < nlen; i++) { + if (haystack[i] != needle[i]) { + return false; + } + } + return true; +} + +bool +TransferBase::endsWith(const UString& haystack, const UString& needle) +{ + if (needle.size() > haystack.size()) { + return false; + } + for (int h = haystack.size()-1, n = needle.size()-1; n >= 0; h--, n--) { + if (haystack[h] != needle[n]) { + return false; + } + } + return true; +} + +pair +TransferBase::twoChildren(xmlNode* localroot) +{ + xmlNode* first = nullptr; + xmlNode* second = nullptr; + for (auto i : children(localroot)) { + if (!first) { + first = i; + } else { + second = i; + break; + } + } + return make_pair(first, second); +} + +bool +TransferBase::processBeginsWith(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + if (getattr(localroot, "caseless") == "yes"_u) { + return beginsWith(StringUtils::tolower(evalString(ch.first)), + StringUtils::tolower(evalString(ch.second))); + } else { + return beginsWith(evalString(ch.first), evalString(ch.second)); + } +} + +bool +TransferBase::processBeginsWithList(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + UString needle = evalString(ch.first); + UString idlist = getattr(ch.second, "n"); + bool caseless = (getattr(localroot, "caseless") == "yes"_u); + if (caseless) { + needle = StringUtils::tolower(needle); + } + for (auto it : (caseless ? listslow[idlist] : lists[idlist])) { + if (beginsWith(needle, it)) { + return true; + } + } + return false; +} + +bool +TransferBase::processEndsWith(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + if (getattr(localroot, "caseless") == "yes"_u) { + return endsWith(StringUtils::tolower(evalString(ch.first)), + StringUtils::tolower(evalString(ch.second))); + } else { + return endsWith(evalString(ch.first), evalString(ch.second)); + } +} + +bool +TransferBase::processEndsWithList(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + UString needle = evalString(ch.first); + UString idlist = getattr(ch.second, "n"); + bool caseless = (getattr(localroot, "caseless") == "yes"_u); + if (caseless) { + needle = StringUtils::tolower(needle); + } + for (auto it : (caseless ? listslow[idlist] : lists[idlist])) { + if (endsWith(needle, it)) { + return true; + } + } + return false; +} + +bool +TransferBase::processContainsSubstring(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + if (getattr(localroot, "caseless") == "yes"_u) { + return StringUtils::tolower(evalString(ch.first)).find(StringUtils::tolower(evalString(ch.second))) != UString::npos; + } else { + return evalString(ch.first).find(evalString(ch.second)) != UString::npos; + } +} + +bool +TransferBase::processEqual(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + if (getattr(localroot, "caseless") == "yes"_u) { + return StringUtils::tolower(evalString(ch.first)) == StringUtils::tolower(evalString(ch.second)); + } else { + return evalString(ch.first) == evalString(ch.second); + } +} + +bool +TransferBase::processIn(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + UString sval = evalString(ch.first); + UString idlist = getattr(ch.second, "n"); + if (getattr(localroot, "caseless") == "yes"_u) { + set& myset = listslow[idlist]; + return (myset.find(StringUtils::tolower(sval)) != myset.end()); + } else { + set& myset = lists[idlist]; + return (myset.find(sval) != myset.end()); + } +} + +UString +TransferBase::tags(const UString& str) const +{ + UString ret; + ret.reserve(str.size()+2); + ret += '<'; + ret.append(StringUtils::substitute(str, "."_u, "><"_u)); + ret += '>'; + return ret; +} + +bool +TransferBase::getNullFlush(void) +{ + return null_flush; +} + +void +TransferBase::setNullFlush(bool val) +{ + null_flush = val; +} + +void +TransferBase::setTrace(bool val) +{ + trace = val; +} diff --git a/apertium/transfer_base.h b/apertium/transfer_base.h new file mode 100644 index 0000000..e283329 --- /dev/null +++ b/apertium/transfer_base.h @@ -0,0 +1,123 @@ +#ifndef _APERTIUM_TRANSFER_BASE_ +#define _APERTIUM_TRANSFER_BASE_ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +using namespace std; + +class TransferBase +{ +protected: + Alphabet alphabet; + MatchExe* me; + MatchState ms; + map attr_items; + map variables; + map variable_defaults; + map macros; + map> lists; + map> listslow; + vector macro_map; + vector rule_map; + vector rule_lines; + xmlDoc* doc; + xmlNode* root_element; + + queue blank_queue; + Buffer input_buffer; + int lword; + vector tmpword; + vector tmpblank; + xmlNode* lastrule; + unsigned int nwords; + + UFILE* output; + + int32_t any_char; + int32_t any_tag; + + bool in_let_var; + bool in_out; + UString var_val; + map evalStringCache; + + bool null_flush; + bool internal_null_flush; + bool trace; + + void collectMacros(xmlNode *localroot); + void collectRules(xmlNode *localroot); + + bool gettingLemmaFromWord(const UString& attr); + UString combineWblanks(const UString& first, const UString& second); + + UString evalString(xmlNode* element); + virtual UString evalCachedString(xmlNode* element) = 0; + + virtual void processClip(xmlNode* element) = 0; + virtual void processBlank(xmlNode* element) = 0; + virtual void processLuCount(xmlNode* element) = 0; + virtual void processCaseOf(xmlNode* element) = 0; + virtual UString processLu(xmlNode* element) = 0; + virtual UString processMlu(xmlNode* element) = 0; + virtual UString processChunk(xmlNode* element) = 0; + + int processRule(xmlNode* localroot); + int processInstruction(xmlNode* localroot); + int processRejectCurrentRule(xmlNode* localroot); + int processChoose(xmlNode* localroot); + void processAppend(xmlNode* localroot); + + virtual void processLet(xmlNode* localroot) = 0; + virtual void processOut(xmlNode* localroot) = 0; + virtual void processCallMacro(xmlNode* localroot) = 0; + virtual void processModifyCase(xmlNode* localroot) = 0; + + bool processLogical(xmlNode *localroot); + bool processTest(xmlNode *localroot); + bool processAnd(xmlNode *localroot); + bool processOr(xmlNode *localroot); + bool processNot(xmlNode *localroot); + + bool beginsWith(const UString& haystack, const UString& needle); + bool endsWith(const UString& haystack, const UString& needle); + + pair twoChildren(xmlNode* localroot); + + bool processBeginsWith(xmlNode *localroot); + bool processBeginsWithList(xmlNode *localroot); + bool processEndsWith(xmlNode *localroot); + bool processEndsWithList(xmlNode *localroot); + bool processContainsSubstring(xmlNode *localroot); + bool processEqual(xmlNode *localroot); + bool processIn(xmlNode *localroot); + + UString tags(const UString& s) const; + +public: + TransferBase(); + ~TransferBase(); + + void read(const char* transferfile, const char* datafile); + bool getNullFlush(void); + void setNullFlush(bool null_flush); + void setTrace(bool trace); +}; + +#endif diff --git a/apertium/transfer_data.cc b/apertium/transfer_data.cc index fc08552..6e818a3 100644 --- a/apertium/transfer_data.cc +++ b/apertium/transfer_data.cc @@ -17,12 +17,10 @@ #include #include -#include #include #include -#include +#include -using namespace Apertium; using namespace std; void @@ -46,14 +44,14 @@ TransferData::destroy() TransferData::TransferData() { // adding fixed attr_items - attr_items[L"lem"] = L"^(([^<]|\"\\<\")+)"; - attr_items[L"lemq"] = L"\\#[- _][^<]+"; - attr_items[L"lemh"] = L"^(([^<#]|\"\\<\"|\"\\#\")+)"; - attr_items[L"whole"] = L"(.+)"; - attr_items[L"tags"] = L"((<[^>]+>)+)"; - attr_items[L"chname"] = L"({([^/]+)\\/)"; // includes delimiters { and / !!! - attr_items[L"chcontent"] = L"(\\{.+)"; - attr_items[L"content"] = L"(\\{.+)"; + attr_items["lem"_u] = "^(([^<]|\"\\<\")+)"_u; + attr_items["lemq"_u] = "\\#[- _][^<]+"_u; + attr_items["lemh"_u] = "^(([^<#]|\"\\<\"|\"\\#\")+)"_u; + attr_items["whole"_u] = "(.+)"_u; + attr_items["tags"_u] = "((<[^>]+>)+)"_u; + attr_items["chname"_u] = "(\\{([^/]+)\\/)"_u; // includes delimiters { and / !!! + attr_items["chcontent"_u] = "(\\{.+)"_u; + attr_items["content"_u] = "(\\{.+)"_u; } TransferData::~TransferData() @@ -89,25 +87,25 @@ TransferData::getTransducer() return transducer; } -map & +map & TransferData::getAttrItems() { return attr_items; } -map & +map & TransferData::getMacros() { return macros; } -map, Ltstr> & +map> & TransferData::getLists() { return lists; } -map & +map & TransferData::getVariables() { return variables; @@ -115,7 +113,9 @@ TransferData::getVariables() int TransferData::countToFinalSymbol(const int count) { - const wstring count_sym = L""; + UChar buf[64]; + u_snprintf(buf, 64, "", count); + UString count_sym = buf; alphabet.includeSymbol(count_sym); const int symbol = alphabet(count_sym); final_symbols.insert(symbol); @@ -134,7 +134,7 @@ TransferData::write(FILE *output) // Find all arcs with "final_symbols" in the transitions, let their source node instead be final, // and extract the rule number from the arc. Record relation between source node and rule number // in finals_rules. It is now no longer safe to minimize -- but we already did that. - const wstring rule_sym_pre = L" > >::const_iterator it = transitions.begin(), limit = transitions.end(); it != limit; ++it) { @@ -152,12 +152,12 @@ TransferData::write(FILE *output) continue; } // Extract the rule number encoded by countToFinalSymbol(): - wstring s; + UString s; alphabet.getSymbol(s, symbol); if(s.compare(0, rule_sym_pre.size(), rule_sym_pre) != 0) { continue; } - const int rule_num = stoi(s.substr(rule_sym_pre.size())); + const int rule_num = StringUtils::stoi(s.substr(rule_sym_pre.size())); transducer.setFinal(src, wgt); finals_rules[src] = rule_num; } @@ -188,34 +188,34 @@ TransferData::write(FILE *output) // variables Compression::multibyte_write(variables.size(), output); - for(map::const_iterator it = variables.begin(), limit = variables.end(); + for(map::const_iterator it = variables.begin(), limit = variables.end(); it != limit; it++) { - Compression::wstring_write(it->first, output); - Compression::wstring_write(it->second, output); + Compression::string_write(it->first, output); + Compression::string_write(it->second, output); } // macros Compression::multibyte_write(macros.size(), output); - for(map::const_iterator it = macros.begin(), limit = macros.end(); + for(map::const_iterator it = macros.begin(), limit = macros.end(); it != limit; it++) { - Compression::wstring_write(it->first, output); + Compression::string_write(it->first, output); Compression::multibyte_write(it->second, output); } // lists Compression::multibyte_write(lists.size(), output); - for(map, Ltstr>::const_iterator it = lists.begin(), limit = lists.end(); + for(map>::const_iterator it = lists.begin(), limit = lists.end(); it != limit; it++) { - Compression::wstring_write(it->first, output); + Compression::string_write(it->first, output); Compression::multibyte_write(it->second.size(), output); - for(set::const_iterator it2 = it->second.begin(), limit2 = it->second.end(); + for(set::const_iterator it2 = it->second.begin(), limit2 = it->second.end(); it2 != limit2; it2++) { - Compression::wstring_write(*it2, output); + Compression::string_write(*it2, output); } } @@ -224,16 +224,16 @@ TransferData::write(FILE *output) void TransferData::writeRegexps(FILE *output) { - Compression::string_write(pcre_version_endian(), output); + // since ICU doesn't have a binary form, it doesn't matter + // what the version is, so leave it blank + Compression::string_write(""_u, output); Compression::multibyte_write(attr_items.size(), output); - map::iterator it, limit; - for(it = attr_items.begin(), limit = attr_items.end(); it != limit; it++) - { - Compression::wstring_write(it->first, output); - ApertiumRE my_re; - my_re.compile(UtfConverter::toUtf8(it->second)); - my_re.write(output); - Compression::wstring_write(it->second, output); + for (auto& it : attr_items) { + Compression::string_write(it.first, output); + // empty binary form, since ICU doesn't have a dump function + // like PCRE did + Compression::multibyte_write(0, output); + Compression::string_write(it.second, output); } } diff --git a/apertium/transfer_data.h b/apertium/transfer_data.h index 49b5755..1c8ffea 100644 --- a/apertium/transfer_data.h +++ b/apertium/transfer_data.h @@ -18,7 +18,6 @@ #define _TRANSFERDATA_ #include -#include #include #include @@ -32,10 +31,10 @@ private: void copy(TransferData const &o); void destroy(); - map attr_items; - map macros; - map, Ltstr> lists; - map variables; + map attr_items; + map macros; + map> lists; + map variables; set final_symbols; Alphabet alphabet; @@ -50,13 +49,13 @@ private: Alphabet & getAlphabet(); Transducer & getTransducer(); - map & getAttrItems(); + map & getAttrItems(); map seen_rules; - map & getMacros(); - map, Ltstr> & getLists(); - map & getVariables(); + map & getMacros(); + map> & getLists(); + map & getVariables(); /** * Encode the rule count in an arc label/symbol (later extracted by diff --git a/apertium/transfer_instr.cc b/apertium/transfer_instr.cc index 7efee52..0b2e5c1 100644 --- a/apertium/transfer_instr.cc +++ b/apertium/transfer_instr.cc @@ -15,9 +15,8 @@ * along with this program; if not, see . */ #include -#include +#include -using namespace Apertium; void TransferInstr::copy(TransferInstr const &o) { @@ -26,6 +25,7 @@ TransferInstr::copy(TransferInstr const &o) pos = o.pos; pointer = o.pointer; condition = o.condition; + strval = o.strval; } void @@ -33,14 +33,16 @@ TransferInstr::destroy() { } -TransferInstr::TransferInstr(TransferInstrType t, string const &c, - int const p, void *ptr, bool cond) +TransferInstr::TransferInstr(TransferInstrType t, UString const &c, + int const p, xmlNode* ptr, bool cond, + const UString& sv) { type = t; content = c; pos = p; pointer = ptr; condition = cond; + strval = sv; } TransferInstr::~TransferInstr() @@ -70,7 +72,7 @@ TransferInstr::getType() return type; } -string const & +UString const & TransferInstr::getContent() { return content; @@ -82,7 +84,7 @@ TransferInstr::getPos() return pos; } -void * +xmlNode* TransferInstr::getPointer() { return pointer; @@ -93,3 +95,9 @@ TransferInstr::getCondition() { return condition; } + +const UString& +TransferInstr::getStrval() +{ + return strval; +} diff --git a/apertium/transfer_instr.h b/apertium/transfer_instr.h index 977b112..3d51b28 100644 --- a/apertium/transfer_instr.h +++ b/apertium/transfer_instr.h @@ -17,7 +17,9 @@ #ifndef _TRANSFERINSTR_ #define _TRANSFERINSTR_ -#include +#include +#include +#include using namespace std; @@ -44,10 +46,11 @@ class TransferInstr { private: TransferInstrType type; - string content; + UString content; int pos; - void *pointer; + xmlNode* pointer; bool condition; + UString strval; void copy(TransferInstr const &o); void destroy(); @@ -58,18 +61,19 @@ public: pointer(0), condition(false) {} - TransferInstr(TransferInstrType t, string const &c, int const p, - void *ptr=NULL, bool cond = true); + TransferInstr(TransferInstrType t, UString const &c, int const p, + xmlNode* ptr=NULL, bool cond = true, const UString& sv = ""_u); ~TransferInstr(); TransferInstr(TransferInstr const &o); TransferInstr & operator =(TransferInstr const &o); TransferInstrType getType(); - string const & getContent(); + UString const & getContent(); int getPos(); - void * getPointer(); + xmlNode* getPointer(); bool getCondition(); + const UString& getStrval(); }; #endif diff --git a/apertium/transfer_mult.cc b/apertium/transfer_mult.cc index 6491c53..b28c4f0 100644 --- a/apertium/transfer_mult.cc +++ b/apertium/transfer_mult.cc @@ -18,17 +18,12 @@ #include #include #include -#include -#include +#include #include #include #include -#ifdef _WIN32 -#include -#endif - using namespace std; void @@ -60,18 +55,6 @@ TransferMult::~TransferMult() destroy(); } -string -TransferMult::tolower(string const &str) const -{ - string result = str; - for(unsigned int i = 0, limit = str.size(); i != limit; i++) - { - result[i] = ::tolower(result[i]); - } - - return result; -} - void TransferMult::readData(FILE *in) { @@ -94,41 +77,39 @@ TransferMult::readData(FILE *in) me = new MatchExe(t, finals); // attr_items - bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); + Compression::string_read(in); // PCRE version placeholder for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + UString const cad_k = Compression::string_read(in); attr_items[cad_k].read(in); - wstring fallback = Compression::wstring_read(in); - if(recompile_attrs) { - attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); - } + UString fallback = Compression::string_read(in); + attr_items[cad_k].compile(fallback); } // variables for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); - variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); + UString const cad_k = Compression::string_read(in); + variables[cad_k] = Compression::string_read(in); } // macros for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + UString const cad_k = Compression::string_read(in); macros[cad_k] = Compression::multibyte_read(in); } // lists for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + UString const cad_k = Compression::string_read(in); for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) { - wstring const cad_v = Compression::wstring_read(in); - lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); - listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + UString const cad_v = Compression::string_read(in); + lists[cad_k].insert(cad_v); + listslow[cad_k].insert(StringUtils::tolower(cad_v)); } } } @@ -139,7 +120,7 @@ TransferMult::readBil(string const &fstfile) FILE *in = fopen(fstfile.c_str(), "r"); if(!in) { - wcerr << "Error: Could not open file '" << fstfile << "'." << endl; + cerr << "Error: Could not open file '" << fstfile << "'." << endl; exit(EXIT_FAILURE); } fstp.load(in); @@ -154,7 +135,7 @@ TransferMult::read(string const &datafile, string const &fstfile) FILE *in = fopen(datafile.c_str(), "r"); if(!in) { - wcerr << "Error: Could not open file '" << datafile << "'." << endl; + cerr << "Error: Could not open file '" << datafile << "'." << endl; exit(EXIT_FAILURE); } readData(in); @@ -164,65 +145,65 @@ TransferMult::read(string const &datafile, string const &fstfile) } TransferToken & -TransferMult::readToken(FILE *in) +TransferMult::readToken(InputFile& in) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wstring content = L""; + UString content; while(true) { - int val = fgetwc_unlocked(in); - if(feof(in)) + UChar32 val = in.get(); + if(in.eof()) { return input_buffer.add(TransferToken(content, tt_eof)); } - if(val == L'\\') + if(val == '\\') { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += '\\'; + content += in.get(); } - else if(val == L'[') + else if(val == '[') { - content += L'['; + content += '['; while(true) { - int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') + UChar32 val2 = in.get(); + if(val2 == '\\') { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += '\\'; + content += in.get(); } - else if(val2 == L']') + else if(val2 == ']') { - content += L']'; + content += ']'; break; } else { - content += wchar_t(val2); + content += val2; } } } - else if(val == L'$') + else if(val == '$') { return input_buffer.add(TransferToken(content, tt_word)); } - else if(val == L'^') + else if(val == '^') { return input_buffer.add(TransferToken(content, tt_blank)); } else { - content += wchar_t(val); + content += val; } } } void -TransferMult::transfer(FILE *in, FILE *out) +TransferMult::transfer(InputFile& in, UFILE* out) { int last = 0; @@ -243,28 +224,25 @@ TransferMult::transfer(FILE *in, FILE *out) { if(tmpword.size() != 0) { - pair tr = fstp.biltransWithQueue(*tmpword[0], false); + pair tr = fstp.biltransWithQueue(*tmpword[0], false); if(tr.first.size() != 0) { - vector multiword = acceptions(tr.first); - if(multiword.size() > 1) - { - fputws_unlocked(L"[{]", output); + vector multiword = acceptions(tr.first); + if(multiword.size() > 1) { + write("[{]"_u, output); } for(unsigned int i = 0, limit = multiword.size(); i != limit; i++) { if(i > 0) { - fputws_unlocked(L"[|]", output); + write("[|]"_u, output); } - fputwc_unlocked(L'^', output); - fputws_unlocked(multiword[i].c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "^%S$", multiword[i].c_str()); } if(multiword.size() > 1) { - fputws_unlocked(L".[][}]", output); - } + write(".[][}]"_u, output); + } } tmpword.clear(); isRule = false; @@ -275,7 +253,7 @@ TransferMult::transfer(FILE *in, FILE *out) } else if(tmpblank.size() != 0) { - fputws_unlocked(tmpblank[0]->c_str(), output); + write(*tmpblank[0], output); tmpblank.clear(); last = input_buffer.getPos(); ms.init(me->getInitial()); @@ -300,7 +278,7 @@ TransferMult::transfer(FILE *in, FILE *out) break; case tt_blank: - ms.step(L' '); + ms.step(' '); tmpblank.push_back(¤t.getContent()); break; @@ -312,45 +290,45 @@ TransferMult::transfer(FILE *in, FILE *out) } else { - fputws_unlocked(current.getContent().c_str(), output); + write(current.getContent(), output); return; } break; default: - wcerr << L"Error: Unknown input token." << endl; + cerr << "Error: Unknown input token." << endl; return; } } } bool -TransferMult::isDefaultWord(wstring const &str) +TransferMult::isDefaultWord(UString const &str) { - return str.find(L" D<"); + return str.find(" D<"_u) != UString::npos; } -vector -TransferMult::acceptions(wstring str) +vector +TransferMult::acceptions(UString str) { - vector result; + vector result; int low = 0; // removing '@' - if(str[0] == L'@') + if(str[0] == '@') { str = str.substr(1); } for(unsigned int i = 0, limit = str.size(); i != limit; i++) { - if(str[i] == L'\\') + if(str[i] == '\\') { i++; } - else if(str[i] == L'/') + else if(str[i] == '/') { - wstring new_word = str.substr(low, i-low); + UString new_word = str.substr(low, i-low); if(result.size() > 1 && isDefaultWord(new_word)) { @@ -365,7 +343,7 @@ TransferMult::acceptions(wstring str) } } - wstring otherword = str.substr(low); + UString otherword = str.substr(low); if(result.size() > 0 && isDefaultWord(otherword)) { result.push_back(result[0]); @@ -379,10 +357,10 @@ TransferMult::acceptions(wstring str) // eliminar las acepciones sin sentido marcado if(result.size() >= 2) { - vector result2; + vector result2; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { - if(result[i].find(L"__") != wstring::npos) + if(result[i].find("__"_u) != UString::npos) { result2.push_back(result[i]); } @@ -397,22 +375,22 @@ TransferMult::acceptions(wstring str) } void -TransferMult::writeMultiple(list >::iterator itwords, - list::iterator itblanks, - list >::const_iterator limitwords, - wstring acum , bool multiple) +TransferMult::writeMultiple(list >::iterator itwords, + list::iterator itblanks, + list >::const_iterator limitwords, + UString acum , bool multiple) { if(itwords == limitwords) { if(multiple) { - output_string.append(L"[|]"); + output_string.append("[|]"_u); } output_string.append(acum); } else { - vector &refword = *itwords; + vector &refword = *itwords; itwords++; @@ -420,19 +398,27 @@ TransferMult::writeMultiple(list >::iterator itwords, { for(unsigned int i = 0, limit = refword.size(); i != limit; i++) { - writeMultiple(itwords, itblanks, limitwords, - acum + L"^" + refword[i] + L"$", multiple || (i > 0)); + UString temp = acum; + temp += '^'; + temp += refword[i]; + temp += '$'; + writeMultiple(itwords, itblanks, limitwords, temp, multiple || (i > 0)); } } else { - wstring &refblank = *itblanks; + UString &refblank = *itblanks; itblanks++; for(unsigned int i = 0, limit = refword.size(); i != limit; i++) { + UString temp = acum; + temp += '^'; + temp += refword[i]; + temp += '$'; + temp += refblank; writeMultiple(itwords, itblanks, limitwords, - acum + L"^" + refword[i] + L"$" + refblank, + temp, multiple || (i > 0)); } } @@ -442,31 +428,26 @@ TransferMult::writeMultiple(list >::iterator itwords, void TransferMult::applyRule() { - list blanks; - list > words; + list blanks; + list > words; - pair tr = fstp.biltransWithQueue(*tmpword[0], false); + pair tr = fstp.biltransWithQueue(*tmpword[0], false); words.push_back(acceptions(tr.first)); for(unsigned int i = 1; i != numwords; i++) { blanks.push_back(*tmpblank[i-1]); - pair tr = fstp.biltransWithQueue(*tmpword[i], false); + pair tr = fstp.biltransWithQueue(*tmpword[i], false); words.push_back(acceptions(tr.first)); } - output_string = L""; + output_string.clear(); writeMultiple(words.begin(), blanks.begin(), words.end()); - if(output_string.find(L"[|]") != wstring::npos) - { - fputws_unlocked(L"[{]", output); - fputws_unlocked(output_string.c_str(), output); - fputws_unlocked(L".[][}]", output); - } - else - { - fputws_unlocked(output_string.c_str(), output); + if(output_string.find("[|]"_u) != UString::npos) { + u_fprintf(output, "[{]%S.[][}]", output_string.c_str()); + } else { + write(output_string, output); } ms.init(me->getInitial()); @@ -477,22 +458,22 @@ TransferMult::applyRule() } void -TransferMult::applyWord(wstring const &word_str) +TransferMult::applyWord(UString const &word_str) { - ms.step(L'^'); + ms.step('^'); for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) { switch(word_str[i]) { - case L'\\': + case '\\': i++; - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; - case L'<': + case '<': for(unsigned int j = i+1; j != limit; j++) { - if(word_str[j] == L'>') + if(word_str[j] == '>') { int symbol = alphabet(word_str.substr(i, j-i+1)); if(symbol) @@ -510,9 +491,9 @@ TransferMult::applyWord(wstring const &word_str) break; default: - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; } } - ms.step(L'$'); + ms.step('$'); } diff --git a/apertium/transfer_mult.h b/apertium/transfer_mult.h index c6c8920..02d2963 100644 --- a/apertium/transfer_mult.h +++ b/apertium/transfer_mult.h @@ -40,20 +40,20 @@ private: Alphabet alphabet; MatchExe *me; MatchState ms; - map attr_items; - map variables; - map macros; - map, Ltstr> lists; - map, Ltstr> listslow; + map attr_items; + map variables; + map macros; + map> lists; + map> listslow; TransferWord **word; - string **blank; + UString **blank; Buffer input_buffer; - vector tmpword; - vector tmpblank; - wstring output_string; + vector tmpword; + vector tmpblank; + UString output_string; FSTProcessor fstp; - FILE *output; + UFILE* output; int any_char; int any_tag; bool isRule; @@ -66,33 +66,33 @@ private: OutputType defaultAttrs; void destroy(); - void readData(FILE *input); + void readData(FILE* input); void readBil(string const &filename); - string caseOf(string const &str); - string copycase(string const &source_word, string const &target_word); + UString caseOf(UString const &str); + UString copycase(UString const &source_word, UString const &target_word); - bool beginsWith(string const &str1, string const &str2) const; - bool endsWith(string const &str1, string const &str2) const; - string tolower(string const &str) const; - string tags(string const &str) const; - wstring readWord(FILE *in); - wstring readBlank(FILE *in); - wstring readUntil(FILE *in, int const symbol) const; - void applyWord(wstring const &word_str); + bool beginsWith(UString const &str1, UString const &str2) const; + bool endsWith(UString const &str1, UString const &str2) const; + UString tolower(UString const &str) const; + UString tags(UString const &str) const; + UString readWord(InputFile& in); + UString readBlank(InputFile& in); + UString readUntil(InputFile& in, int const symbol) const; + void applyWord(UString const &word_str); void applyRule(); - TransferToken & readToken(FILE *in); - void writeMultiple(list >::iterator itwords, - list::iterator itblanks, - list >::const_iterator limitwords, - wstring acum = L"", bool multiple = false); - vector acceptions(wstring str); - bool isDefaultWord(wstring const &str); + TransferToken & readToken(InputFile& in); + void writeMultiple(list >::iterator itwords, + list::iterator itblanks, + list >::const_iterator limitwords, + UString acum = ""_u, bool multiple = false); + vector acceptions(UString str); + bool isDefaultWord(UString const &str); public: TransferMult(); ~TransferMult(); void read(string const &datafile, string const &fstfile); - void transfer(FILE *in, FILE *out); + void transfer(InputFile& in, UFILE* out); }; #endif diff --git a/apertium/transfer_regex.cc b/apertium/transfer_regex.cc new file mode 100644 index 0000000..f2cf207 --- /dev/null +++ b/apertium/transfer_regex.cc @@ -0,0 +1,109 @@ +#include + +struct TrieNode { + UChar32 c; + std::vector next; +}; + +TrieNode* +add_char(TrieNode* root, UChar32 c) +{ + for (auto node : root->next) { + if (node->c == c) { + return node; + } + } + TrieNode* t = new TrieNode; + t->c = c; + root->next.push_back(t); + return t; +} + +void +add_entry(TrieNode* root, const std::vector& vec) +{ + bool escape = false; + TrieNode* cur = root; + for (auto c : vec) { + if (!escape) { + if (c == '\\') { + escape = true; + continue; + } else if (c == '.') { + cur = add_char(cur, '>'); + cur = add_char(cur, '<'); + continue; + } + } + escape = false; + cur = add_char(cur, c); + } + add_char(cur, '\0'); +} + +UString +unbuildTrie(TrieNode* root) +{ + UString single; + single += '['; + std::vector groups; + bool end = false; + int single_count = 0; + for (auto it : root->next) { + if (it->next.empty()) { + end = true; + } else if (it->next.size() == 1 && it->next[0]->c == '\0') { + single += it->c; + single_count++; + } else { + groups.push_back(unbuildTrie(it)); + } + } + if (single_count > 0) { + if (single_count == 1) { + groups.push_back(single.substr(1)); + } else { + single += ']'; + groups.push_back(single); + } + } + UString ret; + ret += root->c; + if (groups.empty()) { + return ret; + } else if (groups.size() == 1) { + ret += groups[0]; + } else { + ret += '('; ret += '?'; ret += ':'; + for (size_t i = 0; i < groups.size(); i++) { + if (i > 0) { + ret += '|'; + } + ret += groups[i]; + } + ret += ')'; + } + if (end) { + ret += '?'; + } + return ret; +} + +UString +optimize_regex(const std::vector& options) +{ + TrieNode* root = new TrieNode; + root->c = '<'; + std::vector v; + for (auto& s : options) { + v.clear(); + ustring_to_vec32(s, v); + add_entry(root, v); + } + UString ret; + ret += '('; + ret.append(unbuildTrie(root)); + ret += '>'; + ret += ')'; + return ret; +} diff --git a/apertium/transfer_regex.h b/apertium/transfer_regex.h new file mode 100644 index 0000000..63543a1 --- /dev/null +++ b/apertium/transfer_regex.h @@ -0,0 +1,9 @@ +#ifndef __TRANSFER_REGEX_OPTIMIZER__ +#define __TRANSFER_REGEX_OPTIMIZER__ + +#include +#include + +UString optimize_regex(const std::vector& options); + +#endif // __TRANSFER_REGEX_OPTIMIZER__ diff --git a/apertium/transfer_token.cc b/apertium/transfer_token.cc index d5b4858..a679f07 100644 --- a/apertium/transfer_token.cc +++ b/apertium/transfer_token.cc @@ -15,9 +15,7 @@ * along with this program; if not, see . */ #include -#include - -using namespace Apertium; +#include void TransferToken::copy(TransferToken const &o) @@ -36,7 +34,7 @@ type(tt_eof) { } -TransferToken::TransferToken(wstring const &content, +TransferToken::TransferToken(UString const &content, TransferTokenType type) { this->content = content; @@ -70,7 +68,7 @@ TransferToken::getType() return type; } -wstring & +UString & TransferToken::getContent() { return content; @@ -83,8 +81,7 @@ TransferToken::setType(TransferTokenType type) } void -TransferToken::setContent(wstring const &content) +TransferToken::setContent(UString const &content) { this->content = content; } - diff --git a/apertium/transfer_token.h b/apertium/transfer_token.h index 039e7d6..a0ca3fc 100644 --- a/apertium/transfer_token.h +++ b/apertium/transfer_token.h @@ -18,6 +18,7 @@ #define _TRANSFERTOKEN_ #include +#include using namespace std; @@ -33,20 +34,20 @@ class TransferToken { private: TransferTokenType type; - wstring content; + UString content; void copy(TransferToken const &o); void destroy(); public: TransferToken(); - TransferToken(wstring const &content, TransferTokenType type); + TransferToken(UString const &content, TransferTokenType type); ~TransferToken(); TransferToken(TransferToken const &o); TransferToken & operator =(TransferToken const &o); TransferTokenType getType(); - wstring & getContent(); + UString & getContent(); void setType(TransferTokenType type); - void setContent(wstring const &content); + void setContent(UString const &content); }; #endif diff --git a/apertium/transfer_word.cc b/apertium/transfer_word.cc index bca3232..a3ce8c7 100644 --- a/apertium/transfer_word.cc +++ b/apertium/transfer_word.cc @@ -17,9 +17,8 @@ #include #include -#include +#include -using namespace Apertium; void TransferWord::copy(TransferWord const &o) { @@ -40,7 +39,7 @@ queue_length(0) { } -TransferWord::TransferWord(string const &src, string const &tgt, string const &ref, string const &wblank, int queue) +TransferWord::TransferWord(UString const &src, UString const &tgt, UString const &ref, UString const &wblank, int queue) { init(src, tgt, ref, wblank); queue_length = queue; @@ -68,7 +67,7 @@ TransferWord::operator =(TransferWord const &o) } void -TransferWord::init(string const &src, string const &tgt, string const &ref, string const &wblank) +TransferWord::init(UString const &src, UString const &tgt, UString const &ref, UString const &wblank) { s_str = src; t_str = tgt; @@ -76,7 +75,7 @@ TransferWord::init(string const &src, string const &tgt, string const &ref, stri wb_str = wblank; } -string +UString TransferWord::source(ApertiumRE const &part, bool with_queue) { if(with_queue) @@ -89,7 +88,7 @@ TransferWord::source(ApertiumRE const &part, bool with_queue) } } -string +UString TransferWord::target(ApertiumRE const &part, bool with_queue) { if(with_queue) @@ -102,7 +101,7 @@ TransferWord::target(ApertiumRE const &part, bool with_queue) } } -string +UString TransferWord::reference(ApertiumRE const &part, bool with_queue) { if(with_queue) @@ -115,14 +114,14 @@ TransferWord::reference(ApertiumRE const &part, bool with_queue) } } -string +UString TransferWord::getWblank() { return wb_str; } bool -TransferWord::setSource(ApertiumRE const &part, string const &value, +TransferWord::setSource(ApertiumRE const &part, UString const &value, bool with_queue) { if(with_queue) @@ -131,7 +130,7 @@ TransferWord::setSource(ApertiumRE const &part, string const &value, } else { - string mystring = s_str.substr(0, s_str.size() - queue_length); + UString mystring = s_str.substr(0, s_str.size() - queue_length); bool ret = part.replace(mystring, value); s_str = mystring + s_str.substr(s_str.size() - queue_length); return ret; @@ -139,7 +138,7 @@ TransferWord::setSource(ApertiumRE const &part, string const &value, } bool -TransferWord::setTarget(ApertiumRE const &part, string const &value, +TransferWord::setTarget(ApertiumRE const &part, UString const &value, bool with_queue) { if(with_queue) @@ -148,7 +147,7 @@ TransferWord::setTarget(ApertiumRE const &part, string const &value, } else { - string mystring = t_str.substr(0, t_str.size() - queue_length); + UString mystring = t_str.substr(0, t_str.size() - queue_length); bool ret = part.replace(mystring, value); t_str = mystring + t_str.substr(t_str.size() - queue_length); return ret; @@ -156,7 +155,7 @@ TransferWord::setTarget(ApertiumRE const &part, string const &value, } bool -TransferWord::setReference(ApertiumRE const &part, string const &value, +TransferWord::setReference(ApertiumRE const &part, UString const &value, bool with_queue) { if(with_queue) @@ -165,7 +164,7 @@ TransferWord::setReference(ApertiumRE const &part, string const &value, } else { - string mystring = r_str.substr(0, r_str.size() - queue_length); + UString mystring = r_str.substr(0, r_str.size() - queue_length); bool ret = part.replace(mystring, value); r_str = mystring + r_str.substr(r_str.size() - queue_length); return ret; diff --git a/apertium/transfer_word.h b/apertium/transfer_word.h index 5e63133..c286edf 100644 --- a/apertium/transfer_word.h +++ b/apertium/transfer_word.h @@ -22,6 +22,7 @@ #include #include #include +#include using namespace std; @@ -34,22 +35,22 @@ private: /** * Source language word */ - string s_str; + UString s_str; /** * Target language word */ - string t_str; + UString t_str; /** * Reference word */ - string r_str; + UString r_str; /** * Wordbound blank */ - string wb_str; + UString wb_str; /** * Queue length @@ -73,7 +74,7 @@ private: * @param part regular expression to match/access * @return reference to matched/accessed string */ - string access(string const &str, ApertiumRE const &part); + UString access(UString const &str, ApertiumRE const &part); /** * Assings a value to the source/target/reference side of a word using the @@ -82,7 +83,7 @@ private: * @param part regular expression to match/access * @param value the string to be assigned */ - void assign(string &str, ApertiumRE const &part, string const &value); + void assign(UString &str, ApertiumRE const &part, UString const &value); public: /** @@ -108,7 +109,7 @@ public: * @param wblank wordbound blank * @param queue queue lenght */ - TransferWord(string const &src, string const &tgt, string const &ref, string const &wblank, int queue = 0); + TransferWord(UString const &src, UString const &tgt, UString const &ref, UString const &wblank, int queue = 0); /** * Assignment operator @@ -125,7 +126,7 @@ public: * @param ref reference word * @param wblank wordbound blank */ - void init(string const &src, string const &tgt, string const &ref, string const &wblank); + void init(UString const &src, UString const &tgt, UString const &ref, UString const &wblank); /** * Reference a source language word part @@ -133,7 +134,7 @@ public: * @param with_queue access taking into account the queue * @returns reference to the part of string matched */ - string source(ApertiumRE const &part, bool with_queue = true); + UString source(ApertiumRE const &part, bool with_queue = true); /** * Reference a target language word part @@ -141,7 +142,7 @@ public: * @param with_queue access taking into account the queue * @returns reference to the part of string matched */ - string target(ApertiumRE const &part, bool with_queue = true); + UString target(ApertiumRE const &part, bool with_queue = true); /** * Reference the reference word part @@ -149,13 +150,13 @@ public: * @param with_queue access taking into account the queue * @returns reference to the part of string matched */ - string reference(ApertiumRE const &part, bool with_queue = true); + UString reference(ApertiumRE const &part, bool with_queue = true); /** * Reference the wordbound blank part * @returns reference to the wordbound blank */ - string getWblank(); + UString getWblank(); /** * Sets a value for a source language word part @@ -164,7 +165,7 @@ public: * @param with_queue access taking or not into account the queue * @returns whether part matched */ - bool setSource(ApertiumRE const &part, string const &value, + bool setSource(ApertiumRE const &part, UString const &value, bool with_queue = true); /** @@ -174,7 +175,7 @@ public: * @param with_queue access taking or not into account the queue * @returns whether part matched */ - bool setTarget(ApertiumRE const &part, string const &value, + bool setTarget(ApertiumRE const &part, UString const &value, bool with_queue = true); /** @@ -184,7 +185,7 @@ public: * @param with_queue access taking or not into account the queue * @returns whether part matched */ - bool setReference(ApertiumRE const &part, string const &value, + bool setReference(ApertiumRE const &part, UString const &value, bool with_queue = true); }; diff --git a/apertium/transfer_word_list.cc b/apertium/transfer_word_list.cc deleted file mode 100644 index 9b730dd..0000000 --- a/apertium/transfer_word_list.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ -#include -#include - -using namespace Apertium; -void -TransferWordList::copy(TransferWordList const &o) -{ - casefull_set = o.casefull_set; - caseless_set = o.caseless_set; -} - -void -TransferWordList::destroy() -{ -} - -TransferWordList::TransferWordList() -{ -} - -TransferWordList::~TransferWordList() -{ - destroy(); -} - -TransferWordList::TransferWordList(TransferWordList const &o) -{ - copy(o); -} - -TransferWordList & -TransferWordList::operator =(TransferWordList const &o) -{ - if(this != &o) - { - destroy(); - copy(o); - } - return *this; -} - -bool -TransferWordList::search(string const &cad, bool caseless) -{ - if(caseless) - { - return caseless_set.find(cad) != caseless_set.end(); - } - else - { - return casefull_set.find(cad) != casefull_set.end(); - } -} - -void -TransferWordList::addWord(string const &cad) -{ - casefull_set.insert(cad); - caseless_set.insert(cad); -} diff --git a/apertium/transfer_word_list.h b/apertium/transfer_word_list.h deleted file mode 100644 index 6e3f602..0000000 --- a/apertium/transfer_word_list.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ -#ifndef _TRANSFERWORDLIST_ -#define _TRANSFERWORDLIST_ - -#include -#include -#include -#ifdef _MSC_VER -#define strcasecmp _stricmp -#endif - -using namespace std; - -struct ltstr -{ - bool operator()(string const &s1, string const &s2) const - { - return s1 < s2; - } -}; - -struct ltstri -{ - bool operator()(string const &s1, string const &s2) const - { - return strcasecmp(s1.c_str(), s2.c_str()) < 0; - } -}; - -class TransferWordList -{ -private: - set casefull_set; - set caseless_set; - - void copy(TransferWordList const &o); - void destroy(); -public: - TransferWordList(); - ~TransferWordList(); - TransferWordList(TransferWordList const &o); - TransferWordList & operator =(TransferWordList const &o); - - bool search(string const &cad, bool caseless = false); - void addWord(string const &cad); -}; - -#endif diff --git a/apertium/transferpp.cc b/apertium/transferpp.cc index 62cf712..278da6b 100644 --- a/apertium/transferpp.cc +++ b/apertium/transferpp.cc @@ -18,10 +18,9 @@ #include #include #include -#include +#include #include -using namespace Apertium; using namespace std; int main(int argc, char *argv[]) @@ -30,7 +29,7 @@ int main(int argc, char *argv[]) if(argc != 3) { - wcerr << "USAGE: " << basename(argv[0]) << " rules_file transfer_file" << endl; + cerr << "USAGE: " << basename(argv[0]) << " rules_file transfer_file" << endl; exit(EXIT_FAILURE); } diff --git a/apertium/trx_reader.cc b/apertium/trx_reader.cc index 8cc0e2d..e400cc0 100644 --- a/apertium/trx_reader.cc +++ b/apertium/trx_reader.cc @@ -20,14 +20,10 @@ #include #include -#include +#include -using namespace Apertium; -wstring const -TRXReader::ANY_TAG = L""; - -wstring const -TRXReader::ANY_CHAR = L""; +UString const TRXReader::ANY_TAG = ""_u; +UString const TRXReader::ANY_CHAR = ""_u; TRXReader::TRXReader() { @@ -36,29 +32,29 @@ TRXReader::TRXReader() } int -TRXReader::insertLemma(int const base, wstring const &lemma) +TRXReader::insertLemma(int const base, UString const &lemma) { int retval = base; static int const any_char = td.getAlphabet()(ANY_CHAR); - if(lemma == L"") + if(lemma.empty()) { retval = td.getTransducer().insertSingleTransduction(any_char, retval); td.getTransducer().linkStates(retval, retval, any_char); - int another = td.getTransducer().insertSingleTransduction(L'\\', retval); + int another = td.getTransducer().insertSingleTransduction('\\', retval); td.getTransducer().linkStates(another, retval, any_char); } else { for(unsigned int i = 0, limit = lemma.size(); i != limit; i++) { - if(lemma[i] == L'\\') + if(lemma[i] == '\\') { - retval = td.getTransducer().insertSingleTransduction(L'\\', retval); + retval = td.getTransducer().insertSingleTransduction('\\', retval); i++; retval = td.getTransducer().insertSingleTransduction(int(lemma[i]), retval); } - else if(lemma[i] == L'*') + else if(lemma[i] == '*') { retval = td.getTransducer().insertSingleTransduction(any_char, retval); td.getTransducer().linkStates(retval, retval, any_char); @@ -75,7 +71,7 @@ TRXReader::insertLemma(int const base, wstring const &lemma) } int -TRXReader::insertTags(int const base, wstring const &tags) +TRXReader::insertTags(int const base, UString const &tags) { int retval = base; static int const any_tag = td.getAlphabet()(ANY_TAG); @@ -83,7 +79,7 @@ TRXReader::insertTags(int const base, wstring const &tags) { for(unsigned int i = 0, limit = tags.size(); i < limit; i++) { - if(tags[i] == L'*') + if(tags[i] == '*') { retval = td.getTransducer().insertSingleTransduction(any_tag, retval); td.getTransducer().linkStates(retval, retval, any_tag); @@ -91,10 +87,10 @@ TRXReader::insertTags(int const base, wstring const &tags) } else { - wstring symbol = L"<"; + UString symbol = "<"_u; for(unsigned int j = i; j != limit; j++) { - if(tags[j] == L'.') + if(tags[j] == '.') { symbol.append(tags.substr(i, j-i)); i = j; @@ -102,12 +98,12 @@ TRXReader::insertTags(int const base, wstring const &tags) } } - if(symbol == L"<") + if(symbol == "<"_u) { symbol.append(tags.substr(i)); i = limit; } - symbol += L'>'; + symbol += '>'; td.getAlphabet().includeSymbol(symbol); retval = td.getTransducer().insertSingleTransduction(td.getAlphabet()(symbol), retval); } @@ -126,56 +122,56 @@ TRXReader::parse() { procDefCats(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } - if(name == L"section-def-attrs") + if(name == "section-def-attrs"_u) { procDefAttrs(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == L"section-def-vars") + if(name == "section-def-vars"_u) { procDefVars(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == L"section-def-lists") + if(name == "section-def-lists"_u) { procDefLists(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == L"section-def-macros") + if(name == "section-def-macros"_u) { procDefMacros(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == L"section-rules") + if(name == "section-rules"_u) { procRules(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } @@ -189,17 +185,17 @@ TRXReader::procRules() set alive_states; while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-rules") + name != "section-rules"_u) { step(); - if(name == L"rule") + if(name == "rule"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { count++; } } - else if(name == L"pattern") + else if(name == "pattern"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { @@ -220,27 +216,27 @@ TRXReader::procRules() } else { - wcerr << L"Warning (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): " - << L"Paths to rule " << count << " blocked by rule " << td.seen_rules[*it] - << L"." << endl; + cerr << "Warning (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): " + << "Paths to rule " << count << " blocked by rule " << td.seen_rules[*it] + << "." << endl; } } } } - else if(name == L"pattern-item") + else if(name == "pattern-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - pair::iterator, - multimap::iterator> range; + pair::iterator, + multimap::iterator> range; - range = cat_items.equal_range(attrib(L"n")); + range = cat_items.equal_range(attrib("n"_u)); if(range.first == range.second) { - parseError(L"Undefined cat-item '" + attrib(L"n")); + parseError("Undefined cat-item '"_u + attrib("n"_u)); } // new code @@ -253,12 +249,12 @@ TRXReader::procRules() it != limit; it++) { // mark of begin of word - int tmp = td.getTransducer().insertSingleTransduction(L'^', *it); + int tmp = td.getTransducer().insertSingleTransduction('^', *it); if(*it != td.getTransducer().getInitial()) { // insert optional blank between two words - int alt = td.getTransducer().insertSingleTransduction(L' ', *it); - td.getTransducer().linkStates(alt, tmp, L'^'); + int alt = td.getTransducer().insertSingleTransduction(' ', *it); + td.getTransducer().linkStates(alt, tmp, '^'); } // insert word @@ -266,7 +262,7 @@ TRXReader::procRules() tmp = insertTags(tmp, range.first->second.tags); // insert mark of end of word - tmp = td.getTransducer().insertSingleTransduction(L'$', tmp); + tmp = td.getTransducer().insertSingleTransduction('$', tmp); // set as alive_state alive_states_new.insert(tmp); @@ -277,21 +273,21 @@ TRXReader::procRules() alive_states = alive_states_new; } } - else if(name == L"let") + else if(name == "let"_u) { int count = 0; int lineno = xmlTextReaderGetParserLineNumber(reader); - while(name != L"let" || type != XML_READER_TYPE_END_ELEMENT) + while(name != "let"_u || type != XML_READER_TYPE_END_ELEMENT) { step(); if(type == XML_ELEMENT_NODE) { count++; - if(name == L"clip" && attrib(L"side") == L"sl") + if(name == "clip"_u && attrib("side"_u) == "sl"_u) { - wcerr << L"Warning (" << lineno; - wcerr << L"): assignment to 'sl' side has no effect." << endl; + cerr << "Warning (" << lineno; + cerr << "): assignment to 'sl' side has no effect." << endl; } } @@ -311,8 +307,8 @@ TRXReader::write(string const &filename) FILE *out = fopen(filename.c_str(), "wb"); if(!out) { - wcerr << "Error: cannot open '" << filename; - wcerr << "' for writing" << endl; + cerr << "Error: cannot open '" << filename; + cerr << "' for writing" << endl; exit(EXIT_FAILURE); } @@ -324,41 +320,42 @@ TRXReader::write(string const &filename) void TRXReader::procDefAttrs() { - wstring attrname; + UString attrname; + vector items; while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-def-attrs") + name != "section-def-attrs"_u) { step(); - if(name == L"attr-item") + if(name == "attr-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - insertAttrItem(attrname, attrib(L"tags")); + items.push_back(attrib("tags"_u)); } } - else if(name == L"def-attr") + else if(name == "def-attr"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - attrname = attrib(L"n"); + attrname = attrib("n"_u); } else { - wstring all = td.getAttrItems()[attrname]; - td.getAttrItems()[attrname] = L"(" + all + L")"; - attrname = L""; + td.getAttrItems()[attrname] = optimize_regex(items); + items.clear(); + attrname.clear(); } } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == L"section-def-attrs") + else if(name == "section-def-attrs"_u) { // do nothing } @@ -372,56 +369,56 @@ TRXReader::procDefAttrs() void TRXReader::procDefCats() { - while(type == XML_READER_TYPE_END_ELEMENT || !(name == L"transfer" || name == L"interchunk" || name == L"postchunk")) + while(type == XML_READER_TYPE_END_ELEMENT || !(name == "transfer"_u || name == "interchunk"_u || name == "postchunk"_u)) { step(); - if(name != L"#text" && name != L"transfer" && name != L"interchunk" && - name != L"postchunk" && name != L"section-def-cats" && name != L"#comment") + if(name != "#text"_u && name != "transfer"_u && name != "interchunk"_u && + name != "postchunk"_u && name != "section-def-cats"_u && name != "#comment"_u) { unexpectedTag(); } } - wstring catname; + UString catname; while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-def-cats") + name != "section-def-cats"_u) { step(); - if(name == L"cat-item") + if(name == "cat-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - if(attrib(L"tags") != L"") + if(!attrib("tags"_u).empty()) { - insertCatItem(catname, attrib(L"lemma"), attrib(L"tags")); + insertCatItem(catname, attrib("lemma"_u), attrib("tags"_u)); } else { - insertCatItem(catname, attrib(L"name"), L""); + insertCatItem(catname, attrib("name"_u), ""_u); } } } - else if(name == L"def-cat") + else if(name == "def-cat"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - catname = attrib(L"n"); + catname = attrib("n"_u); } else { - catname = L""; + catname.clear(); } } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == L"section-def-cats") + else if(name == "section-def-cats"_u) { // do nothing } @@ -436,25 +433,25 @@ void TRXReader::procDefVars() { while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-def-vars") + name != "section-def-vars"_u) { step(); - if(name == L"def-var") + if(name == "def-var"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - createVar(attrib(L"n"), attrib(L"v")); + createVar(attrib("n"_u), attrib("v"_u)); } } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == L"section-def-vars") + else if(name == "section-def-vars"_u) { // do nothing } @@ -468,39 +465,39 @@ TRXReader::procDefVars() void TRXReader::procDefLists() { - wstring listname; + UString listname; while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-def-lists") + name != "section-def-lists"_u) { step(); - if(name == L"list-item") + if(name == "list-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - insertListItem(listname, attrib(L"v")); + insertListItem(listname, attrib("v"_u)); } } - else if(name == L"def-list") + else if(name == "def-list"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - listname = attrib(L"n"); + listname = attrib("n"_u); } else { - listname = L""; + listname.clear(); } } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == L"section-def-lists") + else if(name == "section-def-lists"_u) { // do nothing } @@ -516,72 +513,47 @@ TRXReader::procDefMacros() { int count = 0; while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-def-macros") + name != "section-def-macros"_u) { step(); - if(name == L"def-macro") + if(name == "def-macro"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - createMacro(attrib(L"n"), count++); + createMacro(attrib("n"_u), count++); } } } } void -TRXReader::createMacro(wstring const &name, int const value) +TRXReader::createMacro(UString const &name, int const value) { if(td.getMacros().find(name) != td.getMacros().end()) { - parseError(L"Macro '" + name + L"' defined at least twice"); + parseError("Macro '"_u + name + "' defined at least twice"_u); } td.getMacros()[name] = value; } void -TRXReader::insertListItem(wstring const &name, wstring const &value) +TRXReader::insertListItem(UString const &name, UString const &value) { td.getLists()[name].insert(value); } void -TRXReader::createVar(wstring const &name, wstring const &initial_value) +TRXReader::createVar(UString const &name, UString const &initial_value) { td.getVariables()[name] = initial_value; } void -TRXReader::insertCatItem(wstring const &name, wstring const &lemma, - wstring const &tags) +TRXReader::insertCatItem(UString const &name, UString const &lemma, + UString const &tags) { LemmaTags lt; lt.lemma = lemma; lt.tags = tags; - cat_items.insert(pair(name, lt)); -} - -void -TRXReader::insertAttrItem(wstring const &name, wstring const &tags) -{ - if(td.getAttrItems()[name].size() != 0) - { - td.getAttrItems()[name] += L'|'; - } - - td.getAttrItems()[name] += '<'; - - for(unsigned int i = 0, limit = tags.size(); i != limit; i++) - { - if(tags[i] == L'.') - { - td.getAttrItems()[name].append(L"><"); - } - else - { - td.getAttrItems()[name] += tags[i]; - } - } - td.getAttrItems()[name] += L'>'; - + cat_items.insert(pair(name, lt)); } diff --git a/apertium/trx_reader.h b/apertium/trx_reader.h index 18ebef2..7766123 100644 --- a/apertium/trx_reader.h +++ b/apertium/trx_reader.h @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -32,11 +31,11 @@ class TRXReader : public XMLReader private: struct LemmaTags { - wstring lemma; - wstring tags; + UString lemma; + UString tags; }; - multimap cat_items; + multimap cat_items; TransferData td; void destroy(); @@ -50,22 +49,21 @@ private: void procDefMacros(); void procRules(); - void insertCatItem(wstring const &name, wstring const &lemma, - wstring const &tags); - void insertAttrItem(wstring const &name, wstring const &tags); - void createVar(wstring const &name, wstring const &initial_value); - void insertListItem(wstring const &name, wstring const &value); - void createMacro(wstring const &name, int const val); + void insertCatItem(UString const &name, UString const &lemma, + UString const &tags); + void createVar(UString const &name, UString const &initial_value); + void insertListItem(UString const &name, UString const &value); + void createMacro(UString const &name, int const val); - int insertLemma(int const base, wstring const &lemma); - int insertTags(int const base, wstring const &tags); + int insertLemma(int const base, UString const &lemma); + int insertTags(int const base, UString const &tags); protected: virtual void parse(); public: - static wstring const ANY_TAG; - static wstring const ANY_CHAR; + static UString const ANY_TAG; + static UString const ANY_CHAR; TRXReader(); diff --git a/apertium/tsx_reader.cc b/apertium/tsx_reader.cc index 750bbf4..ef20298 100644 --- a/apertium/tsx_reader.cc +++ b/apertium/tsx_reader.cc @@ -17,12 +17,11 @@ #include #include #include -#include +#include #include #include -using namespace Apertium; void TSXReader::copy(TSXReader const &o) { @@ -61,13 +60,13 @@ TSXReader::clearTagIndex() { tag_index->clear(); array_tags->clear(); - newTagIndex(L"LPAR"); - newTagIndex(L"RPAR"); - newTagIndex(L"LQUEST"); - newTagIndex(L"CM"); - newTagIndex(L"SENT"); - newTagIndex(L"kEOF"); - newTagIndex(L"kUNDEF"); + newTagIndex("LPAR"_u); + newTagIndex("RPAR"_u); + newTagIndex("LQUEST"_u); + newTagIndex("CM"_u); + newTagIndex("SENT"_u); + newTagIndex("kEOF"_u); + newTagIndex("kUNDEF"_u); } TSXReader & @@ -82,31 +81,31 @@ TSXReader::operator =(TSXReader const &o) } void -TSXReader::newTagIndex(wstring const &tag) +TSXReader::newTagIndex(UString const &tag) { - if(tag_index->find(L"TAG_" + tag) != tag_index->end()) + if(tag_index->find("TAG_"_u + tag) != tag_index->end()) { - parseError(L"'" + tag + L"' already defined"); + parseError("'"_u + tag + "' already defined"_u); } - array_tags->push_back(L"TAG_" + tag); - (*tag_index)[L"TAG_" + tag] = array_tags->size() - 1; + array_tags->push_back("TAG_"_u + tag); + (*tag_index)["TAG_"_u + tag] = array_tags->size() - 1; } void -TSXReader::newDefTag(wstring const &tag) +TSXReader::newDefTag(UString const &tag) { - if(tag_index->find(L"TAG_" + tag) != tag_index->end()) + if(tag_index->find("TAG_"_u + tag) != tag_index->end()) { - parseError(L"'" + tag + L"' already defined"); + parseError("'"_u + tag + "' already defined"_u); } array_tags->push_back(tag); - (*tag_index)[L"TAG_" + tag] = array_tags->size() - 1; + (*tag_index)["TAG_"_u + tag] = array_tags->size() - 1; } void -TSXReader::newConstant(wstring const &constant) +TSXReader::newConstant(UString const &constant) { constants->setConstant(constant, array_tags->size()); array_tags->push_back(constant); @@ -115,26 +114,26 @@ TSXReader::newConstant(wstring const &constant) void TSXReader::procDiscardOnAmbiguity() { - while(type != XML_READER_TYPE_END_ELEMENT || name != L"discard-on-ambiguity") + while(type != XML_READER_TYPE_END_ELEMENT || name != "discard-on-ambiguity"_u) { step(); - if(name == L"discard") + if(name == "discard"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - tdata.addDiscard(L"<" + StringUtils::substitute(attrib(L"tags"), L".", L"><") + L">"); + tdata.addDiscard("<"_u + StringUtils::substitute(attrib("tags"_u), "."_u, "><"_u) + ">"_u); } } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == L"discard-on-ambiguity") + else if(name == "discard-on-ambiguity"_u) { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -142,7 +141,7 @@ TSXReader::procDiscardOnAmbiguity() } else { - parseError(L"Unexpected 'discard-on-ambiguity' open tag"); + parseError("Unexpected 'discard-on-ambiguity' open tag"_u); } } else @@ -155,36 +154,36 @@ TSXReader::procDiscardOnAmbiguity() void TSXReader::procDefLabel() { - wstring name_attr = attrib(L"name"); - wstring closed_attr = attrib(L"closed"); + UString name_attr = attrib("name"_u); + UString closed_attr = attrib("closed"_u); newDefTag(name_attr); - if(closed_attr != L"true") + if(closed_attr != "true"_u) { - open_class->insert((*tag_index)[L"TAG_"+name_attr]); + open_class->insert((*tag_index)["TAG_"_u + name_attr]); } - while(type != XML_READER_TYPE_END_ELEMENT || name != L"def-label") + while(type != XML_READER_TYPE_END_ELEMENT || name != "def-label"_u) { step(); - if(name == L"tags-item") + if(name == "tags-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - plist->insert((*tag_index)[L"TAG_"+name_attr], attrib(L"lemma"), - attrib(L"tags")); + plist->insert((*tag_index)["TAG_"_u + name_attr], attrib("lemma"_u), + attrib("tags"_u)); } } - else if(name == L"def-label") + else if(name == "def-label"_u) { return; } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } @@ -198,50 +197,50 @@ TSXReader::procDefLabel() void TSXReader::procDefMult() { - wstring name_attr = attrib(L"name"); - wstring closed_attr = attrib(L"closed"); + UString name_attr = attrib("name"_u); + UString closed_attr = attrib("closed"_u); newDefTag(name_attr); - if(closed_attr != L"true") + if(closed_attr != "true"_u) { - open_class->insert((*tag_index)[L"TAG_"+name_attr]); + open_class->insert((*tag_index)["TAG_"_u + name_attr]); } - while(type != XML_READER_TYPE_END_ELEMENT || name != L"def-mult") + while(type != XML_READER_TYPE_END_ELEMENT || name != "def-mult"_u) { step(); - if(name == L"sequence") + if(name == "sequence"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { plist->beginSequence(); - while(type != XML_READER_TYPE_END_ELEMENT || name != L"sequence") + while(type != XML_READER_TYPE_END_ELEMENT || name != "sequence"_u) { step(); - if(name == L"label-item") + if(name == "label-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - plist->insert((*tag_index)[L"TAG_"+name_attr], - (*tag_index)[L"TAG_"+attrib(L"label")]); + plist->insert((*tag_index)["TAG_"_u + name_attr], + (*tag_index)["TAG_"_u + attrib("label"_u)]); } } - else if(name == L"tags-item") + else if(name == "tags-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - plist->insert((*tag_index)[L"TAG_"+name_attr], - attrib(L"lemma"), attrib(L"tags")); + plist->insert((*tag_index)["TAG_"_u + name_attr], + attrib("lemma"_u), attrib("tags"_u)); } } - else if(name == L"sequence") + else if(name == "sequence"_u) { break; } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } @@ -249,15 +248,15 @@ TSXReader::procDefMult() plist->endSequence(); } } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == L"def-mult") + else if(name == "def-mult"_u) { // do nothing } @@ -271,41 +270,41 @@ TSXReader::procDefMult() void TSXReader::procTagset() { - while(type == XML_READER_TYPE_END_ELEMENT || name != L"tagset") + while(type == XML_READER_TYPE_END_ELEMENT || name != "tagset"_u) { step(); - if(name != L"#text" && name != L"tagger" && name != L"tagset") + if(name != "#text"_u && name != "tagger"_u && name != "tagset"_u) { unexpectedTag(); } } - while(type != XML_READER_TYPE_END_ELEMENT || name != L"tagset") + while(type != XML_READER_TYPE_END_ELEMENT || name != "tagset"_u) { step(); - if(name == L"def-label") + if(name == "def-label"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { procDefLabel(); } } - else if(name == L"def-mult") + else if(name == "def-mult"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { procDefMult(); } } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == L"tagset") + else if(name == "tagset"_u) { // do nothing } @@ -323,27 +322,27 @@ TSXReader::procLabelSequence() TForbidRule forbid_rule; step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } - if(name != L"label-item") + if(name != "label-item"_u) { - parseError(L" tag expected"); + parseError(" tag expected"_u); } - forbid_rule.tagi = (*tag_index)[L"TAG_" + attrib(L"label")]; + forbid_rule.tagi = (*tag_index)["TAG_"_u + attrib("label"_u)]; step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } - if(name != L"label-item") + if(name != "label-item"_u) { - parseError(L" tag expected"); + parseError(" tag expected"_u); } - forbid_rule.tagj = (*tag_index)[L"TAG_" + attrib(L"label")]; + forbid_rule.tagj = (*tag_index)["TAG_"_u + attrib("label"_u)]; forbid_rules->push_back(forbid_rule); } @@ -351,25 +350,25 @@ TSXReader::procLabelSequence() void TSXReader::procForbid() { - while(type != XML_READER_TYPE_END_ELEMENT || name != L"forbid") + while(type != XML_READER_TYPE_END_ELEMENT || name != "forbid"_u) { step(); - if(name == L"label-sequence") + if(name == "label-sequence"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { procLabelSequence(); } } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == L"forbid") + else if(name == "forbid"_u) { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -377,12 +376,12 @@ TSXReader::procForbid() } else { - parseError(L"Unexpected '" + name + L"' open tag"); + parseError("Unexpected '"_u + name + "' open tag"_u); } } else { - parseError(L"Unexpected '" + name + L"' tag"); + parseError("Unexpected '"_u + name + "' tag"_u); } } } @@ -391,14 +390,14 @@ void TSXReader::procEnforce() { TEnforceAfterRule aux; - while(type != XML_READER_TYPE_END_ELEMENT || name != L"enforce-rules") + while(type != XML_READER_TYPE_END_ELEMENT || name != "enforce-rules"_u) { step(); - if(name == L"enforce-after") + if(name == "enforce-after"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - aux.tagi = (*tag_index)[L"TAG_" + attrib(L"label")]; + aux.tagi = (*tag_index)["TAG_"_u + attrib("label"_u)]; } else { @@ -406,26 +405,26 @@ TSXReader::procEnforce() aux.tagsj.clear(); } } - else if(name == L"label-set") + else if(name == "label-set"_u) { // do nothing } - else if(name == L"label-item") + else if(name == "label-item"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - aux.tagsj.push_back((*tag_index)[L"TAG_" + attrib(L"label")]); + aux.tagsj.push_back((*tag_index)["TAG_"_u + attrib("label"_u)]); } } - else if(name == L"#text") + else if(name == "#text"_u) { // do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == L"enforce-rules") + else if(name == "enforce-rules"_u) { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -433,12 +432,12 @@ TSXReader::procEnforce() } else { - parseError(L"Unexpected 'enforce-rules' open tag"); + parseError("Unexpected 'enforce-rules' open tag"_u); } } else { - parseError(L"Unexpected '" + name + L"' tag"); + parseError("Unexpected '"_u + name + "' tag"_u); } } } @@ -446,26 +445,26 @@ TSXReader::procEnforce() void TSXReader::procPreferences() { - while(type != XML_READER_TYPE_END_ELEMENT || name != L"preferences") + while(type != XML_READER_TYPE_END_ELEMENT || name != "preferences"_u) { step(); - if(name == L"prefer") + if(name == "prefer"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { - wstring const tags = L"<" + StringUtils::substitute(attrib(L"tags"), L".", L"><") + L">"; + UString const tags = "<"_u + StringUtils::substitute(attrib("tags"_u), "."_u, "><"_u) + ">"_u; prefer_rules->push_back(tags); } } - else if(name == L"#text") + else if(name == "#text"_u) { //do nothing } - else if(name == L"#comment") + else if(name == "#comment"_u) { // do nothing } - else if(name == L"preferences") + else if(name == "preferences"_u) { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -473,12 +472,12 @@ TSXReader::procPreferences() } else { - parseError(L"Unexpected 'preferences' open tag"); + parseError("Unexpected 'preferences' open tag"_u); } } else { - parseError(L"Unexpected '" + name + L"' tag"); + parseError("Unexpected '"_u + name + "' tag"_u); } } } @@ -494,38 +493,38 @@ TSXReader::parse() procTagset(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } - if(name == L"forbid") + if(name == "forbid"_u) { procForbid(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == L"enforce-rules") + if(name == "enforce-rules"_u) { procEnforce(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == L"preferences") + if(name == "preferences"_u) { procPreferences(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { step(); } } - if(name == L"discard-on-ambiguity") + if(name == "discard-on-ambiguity"_u) { if(type != XML_READER_TYPE_END_ELEMENT) { @@ -533,20 +532,20 @@ TSXReader::parse() } } - newConstant(L"kMOT"); - newConstant(L"kDOLLAR"); - newConstant(L"kBARRA"); - newConstant(L"kMAS"); - newConstant(L"kIGNORAR"); - newConstant(L"kBEGIN"); - newConstant(L"kUNKNOWN"); - - plist->insert((*tag_index)[L"TAG_LPAR"], L"", L"lpar"); - plist->insert((*tag_index)[L"TAG_RPAR"], L"", L"rpar"); - plist->insert((*tag_index)[L"TAG_LQUEST"], L"", L"lquest"); - plist->insert((*tag_index)[L"TAG_CM"], L"", L"cm"); - plist->insert((*tag_index)[L"TAG_SENT"], L"", L"sent"); -// plist->insert((*tag_index)[L"TAG_kMAS"], L"+", L""); + newConstant("kMOT"_u); + newConstant("kDOLLAR"_u); + newConstant("kBARRA"_u); + newConstant("kMAS"_u); + newConstant("kIGNORAR"_u); + newConstant("kBEGIN"_u); + newConstant("kUNKNOWN"_u); + + plist->insert((*tag_index)["TAG_LPAR"_u], ""_u, "lpar"_u); + plist->insert((*tag_index)["TAG_RPAR"_u], ""_u, "rpar"_u); + plist->insert((*tag_index)["TAG_LQUEST"_u], ""_u, "lquest"_u); + plist->insert((*tag_index)["TAG_CM"_u], ""_u, "cm"_u); + plist->insert((*tag_index)["TAG_SENT"_u], ""_u, "sent"_u); +// plist->insert((*tag_index)["TAG_kMAS"_u], "+"_u, ""_u); plist->buildTransducer(); } diff --git a/apertium/tsx_reader.h b/apertium/tsx_reader.h index 8cc4829..9ef82a2 100644 --- a/apertium/tsx_reader.h +++ b/apertium/tsx_reader.h @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -37,17 +36,17 @@ class TSXReader : public XMLReader private: set *open_class; vector *forbid_rules; - map *tag_index; - vector *array_tags; + map *tag_index; + vector *array_tags; vector *enforce_rules; - vector *prefer_rules; + vector *prefer_rules; PatternList *plist; ConstantManager *constants; TaggerData tdata; - void newTagIndex(wstring const &tag); - void newDefTag(wstring const &tag); - void newConstant(wstring const &constant); + void newTagIndex(UString const &tag); + void newDefTag(UString const &tag); + void newConstant(UString const &constant); void procDefLabel(); void procDefMult(); void procDiscardOnAmbiguity(); diff --git a/apertium/unigram_tagger.cc b/apertium/unigram_tagger.cc index 5738816..8fb543b 100644 --- a/apertium/unigram_tagger.cc +++ b/apertium/unigram_tagger.cc @@ -134,9 +134,9 @@ UnigramTagger::score(const Analysis& Analysis_) { } if(TheFlags.getDebug()) { - score_DEBUG << L"(" << tokenCount_r_a << L" * " - << tokenCount_a << L") /\n (" - << tokenCount_a << L" + " << typeCount_a << L")"; + score_DEBUG << "(" << tokenCount_r_a << " * " + << tokenCount_a << ") /\n (" + << tokenCount_a << " + " << typeCount_a << ")"; } return (tokenCount_r_a * tokenCount_a) / (tokenCount_a + typeCount_a); } @@ -159,7 +159,7 @@ UnigramTagger::model3_score(const Analysis &Analysis_) i i_(Analysis_); Lemma l_(Analysis_); - std::wstringstream score_DEBUG_div; + std::stringstream score_DEBUG_div; if(Model3_l_t.find(i_) != Model3_l_t.end()) { if(Model3_l_t[i_].find(l_) != Model3_l_t[i_].end()) @@ -174,9 +174,9 @@ UnigramTagger::model3_score(const Analysis &Analysis_) } if(TheFlags.getDebug()) { - score_DEBUG << L"(" << tokenCount_r_i << L" * " << tokenCount_i; - std::wstringstream score_DEBUG_div; - score_DEBUG_div << L"(" << tokenCount_i << L" + " << typeCount_i << L")"; + score_DEBUG << "(" << tokenCount_r_i << " * " << tokenCount_i; + std::stringstream score_DEBUG_div; + score_DEBUG_div << "(" << tokenCount_i << " + " << typeCount_i << ")"; } long double score = tokenCount_r_i * tokenCount_i; @@ -223,9 +223,9 @@ UnigramTagger::model3_score(const Analysis &Analysis_) } if(TheFlags.getDebug()) { - score_DEBUG << L" * " << tokenCount_d_i << L" * " << tokenCount_i_d; - score_DEBUG_div << L" * (" << tokenCount_i << L" + " << typeCount_i - << L") * (" << tokenCount_d << L" + " << typeCount_d << L")"; + score_DEBUG << " * " << tokenCount_d_i << " * " << tokenCount_i_d; + score_DEBUG_div << " * (" << tokenCount_i << " + " << typeCount_i + << ") * (" << tokenCount_d << " + " << typeCount_d << ")"; } score *= (tokenCount_d_i * tokenCount_i_d); @@ -233,14 +233,14 @@ UnigramTagger::model3_score(const Analysis &Analysis_) } if(TheFlags.getDebug()) { - score_DEBUG << L") /\n [" << score_DEBUG_div.str() << L"]"; + score_DEBUG << ") /\n [" << score_DEBUG_div.str() << "]"; } return score / score_Divisor; } void -UnigramTagger::tag(Stream &Input, std::wostream &Output) +UnigramTagger::tag(Stream &Input, std::ostream &Output) { while (true) { StreamedType StreamedType_ = Input.get(); @@ -255,7 +255,7 @@ UnigramTagger::tag(Stream &Input, std::wostream &Output) } if(TheFlags.getDebug()) { - std::wcerr << L"\n\n"; + std::cerr << "\n\n"; } tag(*StreamedType_.TheLexicalUnit, Output); @@ -266,7 +266,7 @@ UnigramTagger::tag(Stream &Input, std::wostream &Output) } void -UnigramTagger::tag(const LexicalUnit &LexicalUnit_, std::wostream &Output) +UnigramTagger::tag(const LexicalUnit &LexicalUnit_, std::ostream &Output) { Optional TheAnalysis; long double max_score = 0; @@ -275,7 +275,7 @@ UnigramTagger::tag(const LexicalUnit &LexicalUnit_, std::wostream &Output) { if(TheFlags.getDebug()) { - score_DEBUG.str(L""); + score_DEBUG.str(""); } const Analysis& a_ = LexicalUnit_.TheAnalyses[n]; long double s = score(a_); @@ -286,10 +286,10 @@ UnigramTagger::tag(const LexicalUnit &LexicalUnit_, std::wostream &Output) } if(TheFlags.getDebug()) { - std::wcerr << L"score(\"" << a_ << L"\") ==\n " - << score_DEBUG.str() << L" ==\n " << std::fixed + std::cerr << "score(\"" << a_ << "\") ==\n " + << score_DEBUG.str() << " ==\n " << std::fixed << std::setprecision(std::numeric_limits::digits10) - << s << L"\n"; + << s << "\n"; } } diff --git a/apertium/unigram_tagger.h b/apertium/unigram_tagger.h index 58af2e9..70737bc 100644 --- a/apertium/unigram_tagger.h +++ b/apertium/unigram_tagger.h @@ -45,8 +45,8 @@ enum UnigramTaggerModel { class UnigramTagger : public StreamTagger { private: long double model3_score(const Analysis &Analysis_); - void tag(const LexicalUnit &LexicalUnit_, std::wostream &Output); - std::wstringstream score_DEBUG; + void tag(const LexicalUnit &LexicalUnit_, std::ostream &Output); + std::stringstream score_DEBUG; protected: UnigramTaggerModel model; @@ -94,7 +94,7 @@ public: UnigramTaggerModel getModel(); void serialise(std::ostream& o) const; void deserialise(std::istream& s); - void tag(Stream& Input, std::wostream& Output); + void tag(Stream& Input, std::ostream& Output); void train(Stream& TaggedCorpus); }; } diff --git a/apertium/unlocked_cstdio.h b/apertium/unlocked_cstdio.h index 5fe402c..2d043c5 100644 --- a/apertium/unlocked_cstdio.h +++ b/apertium/unlocked_cstdio.h @@ -40,24 +40,6 @@ #define fread_unlocked fread #endif -#if !HAVE_DECL_FGETWC_UNLOCKED -#define fgetwc_unlocked fgetwc -#endif - -#if !HAVE_DECL_FPUTWC_UNLOCKED -#define fputwc_unlocked fputwc -#endif - -#if !HAVE_DECL_FPUTWS_UNLOCKED -#define fputws_unlocked fputws -#endif - -#if !HAVE_MBTOWC -#include -inline int wctomb(char *s, wchar_t wc) { return wcrtomb(s,wc,NULL); } -inline int mbtowc(wchar_t *pwc, const char *s, size_t n) { return mbrtowc(pwc, s, n, NULL); } -#endif - #ifdef _WIN32 #include #endif diff --git a/apertium/utf_converter.cc b/apertium/utf_converter.cc deleted file mode 100644 index b79e834..0000000 --- a/apertium/utf_converter.cc +++ /dev/null @@ -1,613 +0,0 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ -#include -#include -#include -#include - -using namespace Apertium; - -#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD -#define UNI_MAX_BMP (UTF32)0x0000FFFF -#define UNI_MAX_UTF16 (UTF32)0x0010FFFF -#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF -#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF -#define UNI_SUR_HIGH_START (UTF32)0xD800 -#define UNI_SUR_HIGH_END (UTF32)0xDBFF -#define UNI_SUR_LOW_START (UTF32)0xDC00 -#define UNI_SUR_LOW_END (UTF32)0xDFFF - -using namespace std; - -namespace UtfConverter -{ - - typedef unsigned int UTF32; /* at least 32 bits */ - typedef unsigned short UTF16; /* at least 16 bits */ - typedef unsigned char UTF8; /* typically 8 bits */ - - /* Some fundamental constants */ - - typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ - } ConversionResult; - - typedef enum { - strictConversion = 0, - lenientConversion - } ConversionFlags; - - static const int halfShift = 10; /* used for shifting by 10 bits */ - - static const UTF32 halfBase = 0x0010000UL; - static const UTF32 halfMask = 0x3FFUL; - - - void conversionError() - { - wcerr << L"Error: conversion error" << endl; - exit(EXIT_FAILURE); - } - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF32toUTF16 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - if (target >= targetEnd) { - result = targetExhausted; break; - } - ch = *source++; - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_LEGAL_UTF32) { - if (flags == strictConversion) { - result = sourceIllegal; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - --source; /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } - } - *sourceStart = source; - *targetStart = target; - return result; - } - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF16toUTF32 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF32* target = *targetStart; - UTF32 ch, ch2; - while (source < sourceEnd) { - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - if (target >= targetEnd) { - source = oldSource; /* Back up source pointer! */ - result = targetExhausted; break; - } - *target++ = ch; - } - *sourceStart = source; - *targetStart = target; - - return result; - } - - /* --------------------------------------------------------------------- */ - - /* - * Index into the table below with the first byte of a UTF-8 sequence to - * get the number of trailing bytes that are supposed to follow it. - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is - * left as-is for anyone who may want to do such conversion, which was - * allowed in earlier algorithms. - */ - static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 - }; - - /* - * Magic values subtracted from a buffer value during UTF8 conversion. - * This table contains as many values as there might be trailing bytes - * in a UTF-8 sequence. - */ - static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; - - /* - * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed - * into the first byte, depending on how many bytes follow. There are - * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... etc.). Remember that sequencs - * for *legal* UTF-8 will be 4 or fewer bytes total. - */ - static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - - /* --------------------------------------------------------------------- */ - - /* The interface converts a whole buffer to avoid function-call overhead. - * Constants have been gathered. Loops & conditionals have been removed as - * much as possible for efficiency, in favor of drop-through switches. - * (See "Note A" at the bottom of the file for equivalent code.) - * If your compiler supports it, the "isLegalUTF8" call can be turned - * into an inline function. - */ - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF16toUTF8 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - UTF32 ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - } - - target += bytesToWrite; - if (target > targetEnd) { - source = oldSource; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; - } - - /* --------------------------------------------------------------------- */ - - /* - * Utility routine to tell whether a sequence of bytes is legal UTF-8. - * This must be called with the length pre-determined by the first byte. - * If not calling this from ConvertUTF8to*, then the length can be set by: - * length = trailingBytesForUTF8[*source]+1; - * and the sequence is illegal right away if there aren't that many bytes - * available. - * If presented with a length > 4, this returns false. The Unicode - * definition of UTF-8 goes up to 4-byte sequences. - */ - - static bool isLegalUTF8(const UTF8 *source, int length) { - UTF8 a; - const UTF8 *srcptr = source+length; - switch (length) { - default: return false; - /* Everything else falls through when "true"... */ - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) > 0xBF) return false; - - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - } - if (*source > 0xF4) return false; - return true; - } - - /* --------------------------------------------------------------------- */ - - /* - * Exported function to return whether a UTF-8 sequence is legal or not. - * This is not used here; it's just exported. - */ - bool isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { - int length = trailingBytesForUTF8[*source]+1; - if (source+length > sourceEnd) { - return false; - } - return isLegalUTF8(source, length); - } - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF8toUTF16 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_UTF16) { - if (flags == strictConversion) { - result = sourceIllegal; - source -= (extraBytesToRead+1); /* return to the start */ - break; /* Bail out; shouldn't continue */ - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } - } - *sourceStart = source; - *targetStart = target; - return result; - } - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *source++; - if (flags == strictConversion ) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* - * Figure out how many bytes the result will require. Turn any - * illegally large UTF32 things (> Plane 17) into replacement chars. - */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - result = sourceIllegal; - } - - target += bytesToWrite; - if (target > targetEnd) { - --source; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; - } - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF8toUTF32 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up the source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_LEGAL_UTF32) { - /* - * UTF-16 surrogate values are illegal in UTF-32, and anything - * over Plane 17 (> 0x10FFFF) is illegal. - */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = ch; - } - } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ - result = sourceIllegal; - *target++ = UNI_REPLACEMENT_CHAR; - } - } - *sourceStart = source; - *targetStart = target; - return result; - } - - wstring fromUtf8(string const & utf8string) - { - size_t widesize = utf8string.length(); - if (sizeof(wchar_t) == 2) - { - wstring resultstring; - resultstring.resize(widesize+1, L'\0'); - const UTF8* sourcestart = reinterpret_cast(utf8string.c_str()); - const UTF8* sourceend = sourcestart + widesize; - UTF16* targetstart = reinterpret_cast(&resultstring[0]); - UTF16* targetend = targetstart + widesize; - ConversionResult res = ConvertUTF8toUTF16(&sourcestart, sourceend, &targetstart, targetend, strictConversion); - if (res != conversionOK) - { - conversionError(); - } - *targetstart = 0; - return resultstring.substr(0, wcslen(resultstring.c_str())); - } - else if (sizeof(wchar_t) == 4) - { - wstring resultstring; - resultstring.resize(widesize+1, L'\0'); - const UTF8* sourcestart = reinterpret_cast(utf8string.c_str()); - const UTF8* sourceend = sourcestart + widesize; - UTF32* targetstart = reinterpret_cast(&resultstring[0]); - UTF32* targetend = targetstart + widesize; - ConversionResult res = ConvertUTF8toUTF32(&sourcestart, sourceend, &targetstart, targetend, strictConversion); - if (res != conversionOK) - { - conversionError(); - } - *targetstart = 0; - return resultstring.substr(0,wcslen(resultstring.c_str())); - } - else - { - conversionError(); - } - return L""; - } - - string toUtf8(wstring const &widestring) - { - size_t widesize = widestring.length(); - - if (sizeof(wchar_t) == 2) - { - size_t utf8size = 3 * widesize + 1; - string resultstring; - resultstring.resize(utf8size, '\0'); - const UTF16* sourcestart = reinterpret_cast(widestring.c_str()); - const UTF16* sourceend = sourcestart + widesize; - UTF8* targetstart = reinterpret_cast(&resultstring[0]); - UTF8* targetend = targetstart + utf8size; - ConversionResult res = ConvertUTF16toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion); - if (res != conversionOK) - { - conversionError(); - } - *targetstart = 0; - return resultstring.substr(0, strlen(resultstring.c_str())); - } - else if (sizeof(wchar_t) == 4) - { - size_t utf8size = 4 * widesize + 1; - string resultstring; - resultstring.resize(utf8size, '\0'); - const UTF32* sourcestart = reinterpret_cast(widestring.c_str()); - const UTF32* sourceend = reinterpret_cast(widestring.c_str() + widesize); - UTF8* targetstart = reinterpret_cast(&resultstring[0]); - UTF8* targetend = targetstart + utf8size; - ConversionResult res = ConvertUTF32toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion); - if (res != conversionOK) - { - conversionError(); - } - *targetstart = 0; - return resultstring.substr(0, strlen(resultstring.c_str())); - } - else - { - conversionError(); - } - return ""; - } -} diff --git a/apertium/utf_converter.h b/apertium/utf_converter.h deleted file mode 100644 index 5e1f5b3..0000000 --- a/apertium/utf_converter.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ -#ifndef _UTFCONVERTER_ -#define _UTFCONVERTER_ - -#include - -using namespace std; - -namespace UtfConverter -{ - wstring fromUtf8(string const &utf8string); - string toUtf8(wstring const &widestring); -} - -#endif diff --git a/apertium/xml_reader.cc b/apertium/xml_reader.cc index b16a3f5..875a484 100644 --- a/apertium/xml_reader.cc +++ b/apertium/xml_reader.cc @@ -7,7 +7,7 @@ XMLReader::XmlTextReaderResource::XmlTextReaderResource( { reader = xmlReaderForFile(filename.c_str(), NULL, 0); if (reader == NULL) { - wcerr << L"Error: Cannot open '" << filename << L"'." << endl; + cerr << "Error: Cannot open '" << filename << "'." << endl; exit(EXIT_FAILURE); } } @@ -25,7 +25,7 @@ XMLReader::XMLReader() : reader(0), type(0) {} void XMLReader::stepToTag() { - while (name == L"#text" || name == L"#comment") { + while (name == "#text"_u || name == "#comment"_u) { step(); } } @@ -36,15 +36,15 @@ XMLReader::step() int retval = xmlTextReaderRead(reader); if (retval != 1) { - parseError(L"unexpected EOF"); + parseError("unexpected EOF"_u); } - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); type = xmlTextReaderNodeType(reader); - //std::wcerr << name << L": " << type << "\n"; + //std::cerr << name << ": " << type << "\n"; } void -XMLReader::stepPastSelfClosingTag(wstring const &tag) +XMLReader::stepPastSelfClosingTag(UString const &tag) { // libxml2 expands to inside entities. // This method exists to work around this difference. @@ -63,31 +63,40 @@ XMLReader::stepToNextTag() stepToTag(); } -wstring -XMLReader::attrib(wstring const &name) +UString +XMLReader::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } -string -XMLReader::attrib(string const &name) +std::string +XMLReader::attrib_str(const UString& name) { - return UtfConverter::toUtf8(attrib(UtfConverter::fromUtf8(name))); + return XMLParseUtil::attrib_str(reader, name); } void -XMLReader::parseError(wstring const &message) +XMLReader::parseError(UString const &message) { - wcerr << L"Error at line " << xmlTextReaderGetParserLineNumber(reader) - << L", column " << xmlTextReaderGetParserColumnNumber(reader) - << L": " << message << L"." << endl; + cerr << "Error at line " << xmlTextReaderGetParserLineNumber(reader) + << ", column " << xmlTextReaderGetParserColumnNumber(reader) + << ": " << message << "." << endl; + exit(EXIT_FAILURE); +} + +void +XMLReader::parseError(const std::string& message) +{ + cerr << "Error at line " << xmlTextReaderGetParserLineNumber(reader) + << ", column " << xmlTextReaderGetParserColumnNumber(reader) + << ": " << message << "." << endl; exit(EXIT_FAILURE); } void XMLReader::unexpectedTag() { - parseError(L"unexpected '<" + name + L">' tag"); + parseError("unexpected '<"_u + name + ">' tag"_u); } void diff --git a/apertium/xml_reader.h b/apertium/xml_reader.h index 252e2c4..60d94c7 100644 --- a/apertium/xml_reader.h +++ b/apertium/xml_reader.h @@ -2,12 +2,10 @@ #define _XMLREADER_ #include -#include +#include #include #include -#include #include -#include #include #include @@ -34,13 +32,14 @@ protected: XMLReader(); xmlTextReaderPtr reader; int type; - wstring name; - wstring attrib(wstring const &name); - string attrib(string const &name); - void parseError(wstring const &message); + UString name; + UString attrib(UString const &name); + string attrib_str(const UString& name); + void parseError(UString const &message); + void parseError(const string& message); void unexpectedTag(); void stepToTag(); - void stepPastSelfClosingTag(wstring const &tag); + void stepPastSelfClosingTag(UString const &tag); void stepToNextTag(); void step(); virtual void parse() = 0; diff --git a/configure.ac b/configure.ac index 6c992da..b6c1d2c 100644 --- a/configure.ac +++ b/configure.ac @@ -1,8 +1,8 @@ AC_PREREQ(2.52) m4_define([PKG_VERSION_MAJOR], [3]) -m4_define([PKG_VERSION_MINOR], [7]) -m4_define([PKG_VERSION_PATCH], [1]) +m4_define([PKG_VERSION_MINOR], [8]) +m4_define([PKG_VERSION_PATCH], [0]) AC_INIT([apertium], [PKG_VERSION_MAJOR.PKG_VERSION_MINOR.PKG_VERSION_PATCH], [apertium-stuff@lists.sourceforge.net], [apertium], [https://wiki.apertium.org/]) AC_CONFIG_HEADER([apertium/apertium_config.h]) @@ -69,62 +69,21 @@ fi AC_CHECK_FUNCS(strcasecmp) -if test x$(uname) != xDarwin; -then -AC_CHECK_HEADER(pcreposix.h, - AC_CHECK_LIB(pcre, pcre_fullinfo,[ - LIBS="$LIBS -lpcreposix -lpcre" - no_comp_check=yes], - AC_MSG_ERROR([*** unable to locate pcre library ***])), - AC_MSG_ERROR([*** unable to locate pcreposix.h include file ***])) - -AC_CHECK_HEADER(pcrecpp.h, - AC_CHECK_LIB(pcrecpp,pcre_compile,[ - LIBS="$LIBS -lpcrecpp" - no_comp_check=yes], - AC_MSG_ERROR([*** unable to locate pcrecpp library ***])), - AC_MSG_ERROR([*** unable to locate pcrecpp.h include file ***])) -fi - - -PKG_CHECK_MODULES(LTTOOLBOX, [lttoolbox >= 3.5.3], CPPFLAGS="$CPPFLAGS $LTTOOLBOX_CFLAGS"; LIBS="$LIBS $LTTOOLBOX_LIBS") +PKG_CHECK_MODULES(LTTOOLBOX, [lttoolbox >= 3.6.0], CPPFLAGS="$CPPFLAGS $LTTOOLBOX_CFLAGS"; LIBS="$LIBS $LTTOOLBOX_LIBS") PKG_CHECK_MODULES(LIBXML2, [libxml-2.0 >= 2.6.17], CPPFLAGS="$CPPFLAGS $LIBXML2_CFLAGS"; LIBS="$LIBS $LIBXML2_LIBS") -PKG_CHECK_MODULES(PCRE, [libpcre >= 6.4], CPPFLAGS="$CPPFLAGS $PCRE_CFLAGS"; LIBS="$LIBS $PCRE_LIBS") - -# Check for wide strings -AC_DEFUN([AC_CXX_WSTRING],[ - AC_CACHE_CHECK(whether the compiler supports wide strings, - ac_cv_cxx_wstring, - [AC_LANG_SAVE - AC_LANG_CPLUSPLUS - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]],[[ -std::wstring test = L"test"; - ]])], - [ac_cv_cxx_wstring=yes], [ac_cv_cxx_wstring=no]) - AC_LANG_RESTORE - ]) -]) +PKG_CHECK_MODULES(ICU, [icu-i18n, icu-io, icu-uc], CPPFLAGS="$CPPFLAGS $ICU_CFLAGS"; LIBS="$LIBS $ICU_LIBS") -AC_CXX_WSTRING AC_C_BIGENDIAN -if test "$ac_cv_cxx_wstring" = no -then - AC_MSG_ERROR([Missing wide string support]) -fi - - # Checks for header files. AC_LANG(C++) AC_HEADER_STDC AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h filesystem string_view]) AC_CHECK_LIB([stdc++fs], [_ZNSt12experimental10filesystem2v112current_pathEv]) +AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) -# Checks for ICU -AC_CHECK_ICU(50) - -AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, getopt, getopt_long, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked]) -AC_CHECK_FUNCS([setlocale strdup getopt snprintf mbtowc]) +AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, getopt, getopt_long]) +AC_CHECK_FUNCS([setlocale strdup getopt snprintf]) AC_REPLACE_FUNCS(getopt_long) AM_CONDITIONAL([WINDOWS], [test x$version_type = xwindows]) diff --git a/m4/ax_check_icu.m4 b/m4/ax_check_icu.m4 deleted file mode 100644 index 0137afd..0000000 --- a/m4/ax_check_icu.m4 +++ /dev/null @@ -1,117 +0,0 @@ -# =========================================================================== -# https://www.gnu.org/software/autoconf-archive/ax_check_icu.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_CHECK_ICU(version, action-if, action-if-not) -# -# DESCRIPTION -# -# Defines ICU_LIBS, ICU_CFLAGS, ICU_CXXFLAGS. See icu-config(1) man page. -# -# LICENSE -# -# Copyright (c) 2008 Akos Maroy -# -# Copying and distribution of this file, with or without modification, are -# permitted in any medium without royalty provided the copyright notice -# and this notice are preserved. This file is offered as-is, without any -# warranty. - -#serial 7 - -AU_ALIAS([AC_CHECK_ICU], [AX_CHECK_ICU]) -AC_DEFUN([AX_CHECK_ICU], [ - succeeded=no - - if test -z "$ICU_CONFIG"; then - AC_PATH_PROG(ICU_CONFIG, icu-config, no) - fi - - if test -z "$PKG_CONFIG"; then - AC_PATH_PROG(PKG_CONFIG, pkg-config, no) - fi - - if test "$ICU_CONFIG" = "no" && test "$PKG_CONFIG" = "no" ; then - echo "*** Neither icu-config nor pkg-config could not be found. Make sure either is" - echo "*** in your path, and that taglib is properly installed." - echo "*** Or see https://ibm.com/software/globalization/icu/" - fi - - if test "$ICU_CONFIG" != "no" ; then - ICU_VERSION=`$ICU_CONFIG --version` - AC_MSG_CHECKING(for ICU >= $1 via icu-config) - VERSION_CHECK=`expr $ICU_VERSION \>\= $1` - if test "$VERSION_CHECK" = "1" ; then - AC_MSG_RESULT(yes) - succeeded=yes - - AC_MSG_CHECKING(ICU_CPPFLAGS) - ICU_CPPFLAGS=`$ICU_CONFIG --cppflags` - AC_MSG_RESULT($ICU_CPPFLAGS) - - AC_MSG_CHECKING(ICU_CFLAGS) - ICU_CFLAGS=`$ICU_CONFIG --cflags` - AC_MSG_RESULT($ICU_CFLAGS) - - AC_MSG_CHECKING(ICU_CXXFLAGS) - ICU_CXXFLAGS=`$ICU_CONFIG --cxxflags` - AC_MSG_RESULT($ICU_CXXFLAGS) - - AC_MSG_CHECKING(ICU_LIBS) - ICU_LIBS=`$ICU_CONFIG --ldflags` - AC_MSG_RESULT($ICU_LIBS) - else - ICU_CPPFLAGS="" - ICU_CFLAGS="" - ICU_CXXFLAGS="" - ICU_LIBS="" - ## If we have a custom action on failure, don't print errors, but - ## do set a variable so people can do so. - ifelse([$3], ,echo "can't find ICU >= $1 via icu-config",) - fi - fi - - if test "$succeeded" != "yes" && test "$PKG_CONFIG" != "no" ; then - AC_MSG_CHECKING(for ICU >= $1 via pkg-config) - if $PKG_CONFIG --atleast-version=$1 icu-i18n ; then - AC_MSG_RESULT(yes) - succeeded=yes - - AC_MSG_CHECKING(ICU_CPPFLAGS) - ICU_CPPFLAGS=`$PKG_CONFIG --variable=CPPFLAGS icu-i18n` - AC_MSG_RESULT($ICU_CPPFLAGS) - - AC_MSG_CHECKING(ICU_CFLAGS) - ICU_CFLAGS=`$PKG_CONFIG --variable=CFLAGS icu-i18n` - AC_MSG_RESULT($ICU_CFLAGS) - - AC_MSG_CHECKING(ICU_CXXFLAGS) - ICU_CXXFLAGS=`$PKG_CONFIG --variable=CXXFLAGS icu-i18n` - AC_MSG_RESULT($ICU_CXXFLAGS) - - AC_MSG_CHECKING(ICU_LIBS) - ICU_LIBS=`$PKG_CONFIG --libs icu-i18n` - AC_MSG_RESULT($ICU_LIBS) - else - ICU_CPPFLAGS="" - ICU_CFLAGS="" - ICU_CXXFLAGS="" - ICU_LIBS="" - ## If we have a custom action on failure, don't print errors, but - ## do set a variable so people can do so. - ifelse([$3], ,echo "can't find ICU >= $1 via pkg-config",) - fi - fi - - if test "$succeeded" = "yes"; then - AC_SUBST(ICU_CPPFLAGS) - AC_SUBST(ICU_CFLAGS) - AC_SUBST(ICU_CXXFLAGS) - AC_SUBST(ICU_LIBS) - ifelse([$2], , :, [$2]) - else - ifelse([$3], , AC_MSG_ERROR([Library requirements (ICU) not met.]), [$3]) - fi -]) diff --git a/python/apertium_core.i b/python/apertium_core.i index 738f383..e9b5fc3 100644 --- a/python/apertium_core.i +++ b/python/apertium_core.i @@ -53,8 +53,9 @@ void pretransfer(int argc, char **argv, char *input_path, char *output_path) { - FILE* input = fopen(input_path, "r"); - FILE* output = fopen(output_path, "w"); + InputFile input; + input.open(input_path); + UFILE* output = u_fopen(output_path, "w", NULL, NULL); bool compound_sep = false; bool null_flush = false; bool surface_forms = false; @@ -85,8 +86,7 @@ void pretransfer(int argc, char **argv, char *input_path, char *output_path) } } processStream(input, output, null_flush, surface_forms, compound_sep); - fclose(input); - fclose(output); + u_fclose(output); } class ApertiumTransfer: public Transfer @@ -100,8 +100,9 @@ class ApertiumTransfer: public Transfer void transfer_text(int argc, char **argv, char *input_path, char *output_path) { - FILE* input = fopen(input_path, "r"); - FILE* output = fopen(output_path, "w"); + InputFile input; + input.open(input_path); + UFILE* output = u_fopen(output_path, "w", NULL, NULL); optind = 1; while (true) { @@ -147,8 +148,7 @@ class ApertiumTransfer: public Transfer } } transfer(input, output); - fclose(input); - fclose(output); + u_fclose(output); } }; @@ -173,8 +173,9 @@ class ApertiumInterchunk: public Interchunk void interchunk_text(int argc, char **argv, char *input_path, char *output_path) { - FILE* input = fopen(input_path, "r"); - FILE* output = fopen(output_path, "w"); + InputFile input; + input.open(input_path); + UFILE* output = u_fopen(output_path, "w", NULL, NULL); optind = 1; while (true) { @@ -197,8 +198,7 @@ class ApertiumInterchunk: public Interchunk } } interchunk(input, output); - fclose(input); - fclose(output); + u_fclose(output); } }; @@ -213,8 +213,9 @@ class ApertiumPostchunk: public Postchunk void postchunk_text(int argc, char **argv, char *input_path, char *output_path) { - FILE* input = fopen(input_path, "r"); - FILE* output = fopen(output_path, "w"); + InputFile input; + input.open(input_path); + UFILE* output = u_fopen(output_path, "w", NULL, NULL); optind = 1; while (true) { @@ -237,8 +238,7 @@ class ApertiumPostchunk: public Postchunk } } postchunk(input, output); - fclose(input); - fclose(output); + u_fclose(output); } }; diff --git a/python/setup.py.in b/python/setup.py.in index a18adcd..7bcd70d 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -21,9 +21,9 @@ def get_sources(): sources = ['apertium_core.i'] cc_sources = [ # interchunk.cc postchunk.cc transfer.cc - 'apertium_re.cc', 'interchunk.cc', 'interchunk_word.cc', 'postchunk.cc', 'string_utils.cc', 'transfer.cc', + 'apertium_re.cc', 'interchunk.cc', 'interchunk_word.cc', 'postchunk.cc', 'transfer.cc', 'transfer_data.cc', 'transfer_instr.cc', 'transfer_mult.cc', 'transfer_token.cc', 'transfer_word.cc', - 'transfer_word_list.cc', 'trx_reader.cc', 'utf_converter.cc', 'xml_reader.cc', + 'trx_reader.cc', 'xml_reader.cc', # 'pretransfer.cc' 'pretransfer.cc', # tagger.cc diff --git a/tests/data/nno-nob.t2x.bin b/tests/data/nno-nob.t2x.bin index c03e145..9b8e124 100644 Binary files a/tests/data/nno-nob.t2x.bin and b/tests/data/nno-nob.t2x.bin differ diff --git a/tests/tagger/test_find_similar_ambiguity_classes.cc b/tests/tagger/test_find_similar_ambiguity_classes.cc index a6e299e..938ac3c 100644 --- a/tests/tagger/test_find_similar_ambiguity_classes.cc +++ b/tests/tagger/test_find_similar_ambiguity_classes.cc @@ -1,4 +1,4 @@ -#include "apertium/utf_converter.h" +#include #include "apertium/tagger_utils.h" #include "apertium/tagger_data_hmm.h" #include "apertium/tagger_data.h" @@ -6,35 +6,44 @@ #include #include -void print_ambiguity_class(const vector &array_tags, const set &abgset) +void print_ambiguity_class(const vector &array_tags, const set &abgset) { unsigned int j; set::const_iterator abgseti; for (abgseti=abgset.begin(), j=0; abgseti!=abgset.end(); abgseti++, j++) { - wcout << array_tags[*abgseti]; + cout << array_tags[*abgseti]; if (j < abgset.size() - 1) { - wcout << " "; + cout << " "; } } } void find_similar_ambiguity_class_io(TaggerData &td) { - vector &array_tags = td.getArrayTags(); - wstring line = L""; - getline(wcin, line, L'\n'); - - wstringstream line_stream(line); + vector &array_tags = td.getArrayTags(); + UFILE* in = u_finit(stdin, NULL, NULL); set ambiguity_class; - wstring tag_name; - while (line_stream >> tag_name) { - vector::iterator it; + while (true) { + UString tag_name; + UChar32 c; + while (true) { + c = u_fgetcx(in); + if (u_isspace(c)) { + break; + } else { + tag_name += c; + } + } + vector::iterator it; it = find(array_tags.begin(), array_tags.end(), tag_name); if (it == array_tags.end()) { - wcerr << L"Tag not in model: " << tag_name << L'\n'; + cerr << "Tag not in model: " << tag_name << '\n'; exit(-3); } ambiguity_class.insert(it - array_tags.begin()); + if (c == '\n') { + break; + } } set similar_ambiguity_class = tagger_utils::find_similar_ambiguity_class(td, ambiguity_class); print_ambiguity_class(array_tags, similar_ambiguity_class); diff --git a/utf8/utf8.h b/utf8/utf8.h deleted file mode 100644 index 82b13f5..0000000 --- a/utf8/utf8.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "utf8/checked.h" -#include "utf8/unchecked.h" - -#endif // header guard diff --git a/utf8/utf8/checked.h b/utf8/utf8/checked.h deleted file mode 100644 index 1331155..0000000 --- a/utf8/utf8/checked.h +++ /dev/null @@ -1,327 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" -#include - -namespace utf8 -{ - // Base for the exceptions that may be thrown from the library - class exception : public ::std::exception { - }; - - // Exceptions that may be thrown from the library functions. - class invalid_code_point : public exception { - uint32_t cp; - public: - invalid_code_point(uint32_t cp) : cp(cp) {} - virtual const char* what() const throw() { return "Invalid code point"; } - uint32_t code_point() const {return cp;} - }; - - class invalid_utf8 : public exception { - uint8_t u8; - public: - invalid_utf8 (uint8_t u) : u8(u) {} - virtual const char* what() const throw() { return "Invalid UTF-8"; } - uint8_t utf8_octet() const {return u8;} - }; - - class invalid_utf16 : public exception { - uint16_t u16; - public: - invalid_utf16 (uint16_t u) : u16(u) {} - virtual const char* what() const throw() { return "Invalid UTF-16"; } - uint16_t utf16_word() const {return u16;} - }; - - class not_enough_room : public exception { - public: - virtual const char* what() const throw() { return "Not enough space"; } - }; - - /// The library API - functions intended to be called by the users - - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (!utf8::internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); - - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - throw not_enough_room(); - case internal::INVALID_LEAD: - out = utf8::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); - return utf8::replace_invalid(start, end, out, replacement_marker); - } - - template - uint32_t next(octet_iterator& it, octet_iterator end) - { - uint32_t cp = 0; - internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); - switch (err_code) { - case internal::UTF8_OK : - break; - case internal::NOT_ENOUGH_ROOM : - throw not_enough_room(); - case internal::INVALID_LEAD : - case internal::INCOMPLETE_SEQUENCE : - case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(*it); - case internal::INVALID_CODE_POINT : - throw invalid_code_point(cp); - } - return cp; - } - - template - uint32_t peek_next(octet_iterator it, octet_iterator end) - { - return utf8::next(it, end); - } - - template - uint32_t prior(octet_iterator& it, octet_iterator start) - { - // can't do much if it == start - if (it == start) - throw not_enough_room(); - - octet_iterator end = it; - // Go back until we hit either a lead octet or start - while (utf8::internal::is_trail(*(--it))) - if (it == start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - return utf8::peek_next(it, end); - } - - /// Deprecated in versions that include "prior" - template - uint32_t previous(octet_iterator& it, octet_iterator pass_start) - { - octet_iterator end = it; - while (utf8::internal::is_trail(*(--it))) - if (it == pass_start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - octet_iterator temp = it; - return utf8::next(temp, end); - } - - template - void advance (octet_iterator& it, distance_type n, octet_iterator end) - { - for (distance_type i = 0; i < n; ++i) - utf8::next(it, end); - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::next(first, last); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - if (start != end) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - if (utf8::internal::is_trail_surrogate(trail_surrogate)) - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - else - throw invalid_utf16(static_cast(trail_surrogate)); - } - else - throw invalid_utf16(static_cast(cp)); - - } - // Lone trail surrogate - else if (utf8::internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast(cp)); - - result = utf8::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start != end) { - uint32_t cp = utf8::next(start, end); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start != end) - (*result++) = utf8::next(start, end); - - return result; - } - - // The iterator class - template - class iterator : public std::iterator { - octet_iterator it; - octet_iterator range_start; - octet_iterator range_end; - public: - iterator () {} - explicit iterator (const octet_iterator& octet_it, - const octet_iterator& range_start, - const octet_iterator& range_end) : - it(octet_it), range_start(range_start), range_end(range_end) - { - if (it < range_start || it > range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); - } - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::next(temp, range_end); - } - bool operator == (const iterator& rhs) const - { - if (range_start != rhs.range_start || range_end != rhs.range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - utf8::next(it, range_end); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - utf8::next(it, range_end); - return temp; - } - iterator& operator -- () - { - utf8::prior(it, range_start); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::prior(it, range_start); - return temp; - } - }; // class iterator - -} // namespace utf8 - -#endif //header guard - - diff --git a/utf8/utf8/core.h b/utf8/utf8/core.h deleted file mode 100644 index f85081f..0000000 --- a/utf8/utf8/core.h +++ /dev/null @@ -1,329 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include - -namespace utf8 -{ - // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers - // You may need to change them to match your system. - // These typedefs have the same names as ones from cstdint, or boost/cstdint - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - -// Helper code - not intended to be directly called by the library users. May be changed at any time -namespace internal -{ - // Unicode constants - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - const uint16_t LEAD_SURROGATE_MIN = 0xd800u; - const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; - const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); - const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; - - // Maximum valid value for a Unicode code point - const uint32_t CODE_POINT_MAX = 0x0010ffffu; - - template - inline uint8_t mask8(octet_type oc) - { - return static_cast(0xff & oc); - } - template - inline uint16_t mask16(u16_type oc) - { - return static_cast(0xffff & oc); - } - template - inline bool is_trail(octet_type oc) - { - return ((utf8::internal::mask8(oc) >> 6) == 0x2); - } - - template - inline bool is_lead_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); - } - - template - inline bool is_trail_surrogate(u16 cp) - { - return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_code_point_valid(u32 cp) - { - return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); - } - - template - inline typename std::iterator_traits::difference_type - sequence_length(octet_iterator lead_it) - { - uint8_t lead = utf8::internal::mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; - } - - template - inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) - { - if (cp < 0x80) { - if (length != 1) - return true; - } - else if (cp < 0x800) { - if (length != 2) - return true; - } - else if (cp < 0x10000) { - if (length != 3) - return true; - } - - return false; - } - - enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; - - /// Helper for get_sequence_x - template - utf_error increase_safely(octet_iterator& it, octet_iterator end) - { - if (++it == end) - return NOT_ENOUGH_ROOM; - - if (!utf8::internal::is_trail(*it)) - return INCOMPLETE_SEQUENCE; - - return UTF8_OK; - } - - #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} - - /// get_sequence_x functions decode utf-8 sequences of the length x - template - utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - return UTF8_OK; - } - - template - utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); - - return UTF8_OK; - } - - template - utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - template - utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR - - template - utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - octet_iterator original_it = it; - - uint32_t cp = 0; - // Determine the sequence length based on the lead octet - typedef typename std::iterator_traits::difference_type octet_difference_type; - const octet_difference_type length = utf8::internal::sequence_length(it); - - // Get trail octets and calculate the code point - utf_error err = UTF8_OK; - switch (length) { - case 0: - return INVALID_LEAD; - case 1: - err = utf8::internal::get_sequence_1(it, end, cp); - break; - case 2: - err = utf8::internal::get_sequence_2(it, end, cp); - break; - case 3: - err = utf8::internal::get_sequence_3(it, end, cp); - break; - case 4: - err = utf8::internal::get_sequence_4(it, end, cp); - break; - } - - if (err == UTF8_OK) { - // Decoding succeeded. Now, security checks... - if (utf8::internal::is_code_point_valid(cp)) { - if (!utf8::internal::is_overlong_sequence(cp, length)){ - // Passed! Return here. - code_point = cp; - ++it; - return UTF8_OK; - } - else - err = OVERLONG_SEQUENCE; - } - else - err = INVALID_CODE_POINT; - } - - // Failure branch - restore the original value of the iterator - it = original_it; - return err; - } - - template - inline utf_error validate_next(octet_iterator& it, octet_iterator end) { - uint32_t ignored; - return utf8::internal::validate_next(it, end, ignored); - } - -} // namespace internal - - /// The library API - functions intended to be called by the users - - // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; - - template - octet_iterator find_invalid(octet_iterator start, octet_iterator end) - { - octet_iterator result = start; - while (result != end) { - utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); - if (err_code != internal::UTF8_OK) - return result; - } - return result; - } - - template - inline bool is_valid(octet_iterator start, octet_iterator end) - { - return (utf8::find_invalid(start, end) == end); - } - - template - inline bool starts_with_bom (octet_iterator it, octet_iterator end) - { - return ( - ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && - ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && - ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) - ); - } - - //Deprecated in release 2.3 - template - inline bool is_bom (octet_iterator it) - { - return ( - (utf8::internal::mask8(*it++)) == bom[0] && - (utf8::internal::mask8(*it++)) == bom[1] && - (utf8::internal::mask8(*it)) == bom[2] - ); - } -} // namespace utf8 - -#endif // header guard - - diff --git a/utf8/utf8/unchecked.h b/utf8/utf8/unchecked.h deleted file mode 100644 index 989ccef..0000000 --- a/utf8/utf8/unchecked.h +++ /dev/null @@ -1,228 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" - -namespace utf8 -{ - namespace unchecked - { - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - template - uint32_t next(octet_iterator& it) - { - uint32_t cp = utf8::internal::mask8(*it); - typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); - switch (length) { - case 1: - break; - case 2: - it++; - cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); - break; - case 3: - ++it; - cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - ++it; - cp += (*it) & 0x3f; - break; - case 4: - ++it; - cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - ++it; - cp += (utf8::internal::mask8(*it) << 6) & 0xfff; - ++it; - cp += (*it) & 0x3f; - break; - } - ++it; - return cp; - } - - template - uint32_t peek_next(octet_iterator it) - { - return utf8::unchecked::next(it); - } - - template - uint32_t prior(octet_iterator& it) - { - while (utf8::internal::is_trail(*(--it))) ; - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - - // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) - template - inline uint32_t previous(octet_iterator& it) - { - return utf8::unchecked::prior(it); - } - - template - void advance (octet_iterator& it, distance_type n) - { - for (distance_type i = 0; i < n; ++i) - utf8::unchecked::next(it); - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::unchecked::next(first); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - } - result = utf8::unchecked::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - uint32_t cp = utf8::unchecked::next(start); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::unchecked::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::unchecked::next(start); - - return result; - } - - // The iterator class - template - class iterator : public std::iterator { - octet_iterator it; - public: - iterator () {} - explicit iterator (const octet_iterator& octet_it): it(octet_it) {} - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - bool operator == (const iterator& rhs) const - { - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - ::std::advance(it, utf8::internal::sequence_length(it)); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - ::std::advance(it, utf8::internal::sequence_length(it)); - return temp; - } - iterator& operator -- () - { - utf8::unchecked::prior(it); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::unchecked::prior(it); - return temp; - } - }; // class iterator - - } // namespace utf8::unchecked -} // namespace utf8 - - -#endif // header guard - diff --git a/utf8/utf8_fwrap.h b/utf8/utf8_fwrap.h deleted file mode 100644 index 5d41b6b..0000000 --- a/utf8/utf8_fwrap.h +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef _UTF8_FWRAP_HPP -#define _UTF8_FWRAP_HPP - -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 - #define utf32to8 utf16to8 -#endif - -inline wint_t fgetwc_u8(FILE *in) { -#ifdef _WIN32 - struct _cps { - FILE *f = 0; - wchar_t c = 0; - }; - static _cps cps[4]; - - for (auto& cp : cps) { - if (cp.f == in) { - cp.f = 0; - return cp.c; - } - } -#endif - - int32_t rv = 0; - int c = 0, i = 0; - char buf[4]; - if ((c = fgetc_unlocked(in)) != EOF) { - buf[i++] = static_cast(c); - if ((c & 0xF0) == 0xF0) { - if (fread_unlocked(buf+i, 1, 3, in) != 3) { - throw std::runtime_error("Could not read 3 expected bytes from stream"); - } - i += 3; - } - else if ((c & 0xE0) == 0xE0) { - if (fread_unlocked(buf+i, 1, 2, in) != 2) { - throw std::runtime_error("Could not read 2 expected bytes from stream"); - } - i += 2; - } - else if ((c & 0xC0) == 0xC0) { - if (fread_unlocked(buf+i, 1, 1, in) != 1) { - throw std::runtime_error("Could not read 1 expected byte from stream"); - } - i += 1; - } - } - if (i == 0 && c == EOF) { - rv = WEOF; - } - else { -#ifdef _WIN32 - wchar_t u16[2] = {}; - utf8::unchecked::utf8to16(buf, buf+i, u16); - - if (u16[1]) { - for (auto& cp : cps) { - if (cp.f == 0) { - cp.f = in; - cp.c = u16[1]; - return u16[0]; - } - } - throw std::runtime_error("Not enough space to store UTF-16 high surrogate"); - } - rv = u16[0]; -#else - utf8::unchecked::utf8to32(buf, buf+i, &rv); -#endif - } - return static_cast(rv); -} - -inline wint_t fputwc_u8(wint_t wc, FILE *out) { - char buf[4] = {}; - char *e = utf8::unchecked::utf32to8(&wc, &wc+1, buf); - if (fwrite_unlocked(buf, 1, e-buf, out) != static_cast(e-buf)) { - return WEOF; - } - - return wc; -} - -inline int fputws_u8(const wchar_t* str, FILE *out) { - static std::string buf; - buf.clear(); - size_t len = wcslen(str); - utf8::unchecked::utf32to8(str, str+len, std::back_inserter(buf)); - if (fwrite_unlocked(&buf[0], 1, buf.size(), out) != buf.size()) { - return WEOF; - } - - return 1; -} - -inline wint_t ungetwc_u8(wint_t wc, FILE *out) { - char buf[4] = {}; - char *e = utf8::unchecked::utf32to8(&wc, &wc+1, buf); - for (char *b = buf ; b != e ; ++b) { - if (ungetc(*b, out) == EOF) { - return WEOF; - } - } - - return wc; -} - -#ifdef fgetwc_unlocked - #undef fgetwc_unlocked -#endif -#define fgetwc_unlocked fgetwc_u8 - -#ifdef fputwc_unlocked - #undef fputwc_unlocked -#endif -#define fputwc_unlocked fputwc_u8 - -#ifdef fputws_unlocked - #undef fputws_unlocked -#endif -#define fputws_unlocked fputws_u8 - -#ifdef ungetwc - #undef ungetwc -#endif -#define ungetwc ungetwc_u8 - -#ifdef _WIN32 - #undef utf32to8 -#endif - -#endif