commit 99a71019e872ae0b5d3a1d27229891354c6341f9 Author: Daniel Swanson Date: Sat Jun 5 19:17:13 2021 -0500 grep -rI + M-% diff --git a/apertium.m4 b/apertium.m4 index 46c8a6a..6b8012e 100644 --- a/apertium.m4 +++ b/apertium.m4 @@ -109,19 +109,25 @@ AC_DEFUN([AP_MKINCLUDE], cat >$srcdir/ap_include.am <@/@name' modes.xml | sed 's/ *name="\(@<:@^"@:>@*\)"/\1.mode /g'\`; \\ + modes=\`xmllint --xpath '//mode@<:@@install="yes"@:>@/@name' \@S|@< | sed 's/ *name="\(@<:@^"@:>@*\)"/\1.mode /g'\`; \\ if test -n "\$\$modes"; then \\ \$(INSTALL_DATA) \$\$modes \$(DESTDIR)\$(apertium_modesdir); \\ rm \$\$modes; \\ fi +uninstall-modes: modes.xml + files=\`xmllint --xpath '//mode@<:@@install="yes"@:>@/@name' \@S|@< | sed 's/ *name="\(@<:@^"@:>@*\)"/\1.mode /g'\`; \\ + if test -n "\$\$files"; then \\ + dir=\$(DESTDIR)\$(apertium_modesdir); \$(am__uninstall_files_from_dir) + fi + .deps/.d: \$(MKDIR_P) .deps touch \$[]@ diff --git a/apertium/align.cc b/apertium/align.cc index 4b814a5..4d7bdb4 100644 --- a/apertium/align.cc +++ b/apertium/align.cc @@ -33,7 +33,7 @@ void align::align_( for (std::vector >::const_iterator i_ = string_.begin(); i_ != string_.end(); ++i_) { - std::wcerr << " " << std::setw(width_) << std::left << i_->first + std::cerr << " " << std::setw(width_) << std::left << i_->first << std::setw(0) << linebreak::linebreak_(i_->second, width_ + 2, width_ + 4) << '\n'; diff --git a/apertium/analysis.cc b/apertium/analysis.cc index 5e4e241..58776ed 100644 --- a/apertium/analysis.cc +++ b/apertium/analysis.cc @@ -22,8 +22,10 @@ #include namespace Apertium { -std::wostream &operator<<(std::wostream &Stream_, const Analysis &Analysis_) { - Stream_ << static_cast(Analysis_); +std::ostream &operator<<(std::ostream &Stream_, const Analysis &Analysis_) { + //Stream_ << static_cast(Analysis_); + // this line is giving a type error that I can't make sense of + // wstring wcerr TODO L"" return Stream_; } @@ -35,21 +37,22 @@ bool operator<(const Analysis &a, const Analysis &b) { return a.TheMorphemes < b.TheMorphemes; } -Analysis::operator std::wstring() const { +Analysis::operator UString() const { if (TheMorphemes.empty()) throw Exception::Analysis::TheMorphemes_empty( "can't convert Analysis comprising empty Morpheme std::vector to " - "std::wstring"); + "UString"); std::vector::const_iterator Morpheme_ = TheMorphemes.begin(); - std::wstring wstring_ = *Morpheme_; + UString UString_ = *Morpheme_; ++Morpheme_; // Call .end() each iteration to save memory. for (; Morpheme_ != TheMorphemes.end(); ++Morpheme_) { - wstring_ += L"+" + static_cast(*Morpheme_); + UString_ += '+'; + UString_ += static_cast(*Morpheme_); } - return wstring_; + return UString_; } } diff --git a/apertium/analysis.h b/apertium/analysis.h index f70a966..194bc60 100644 --- a/apertium/analysis.h +++ b/apertium/analysis.h @@ -21,15 +21,16 @@ #include #include #include +#include namespace Apertium { class Analysis { public: - friend std::wostream &operator<<(std::wostream &Stream_, - const Analysis &Analysis_); + friend std::ostream &operator<<(std::ostream &Stream_, + const Analysis &Analysis_); friend bool operator==(const Analysis &a, const Analysis &b); friend bool operator<(const Analysis &a, const Analysis &b); - operator std::wstring() const; + operator UString() const; std::vector TheMorphemes; }; } diff --git a/apertium/apertium-multiple-translations.cc b/apertium/apertium-multiple-translations.cc index ecdebab..f7a6443 100644 --- a/apertium/apertium-multiple-translations.cc +++ b/apertium/apertium-multiple-translations.cc @@ -61,30 +61,25 @@ int main(int argc, char *argv[]) } } - FILE *input = stdin, *output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); if(argc >= 4) { - input = fopen(argv[3], "r"); - if(!input) - { + if (!input.open(argv[3])) { cerr << "Error: can't open input file '" << argv[3] << "'." << endl; exit(EXIT_FAILURE); } if(argc == 5) { - output = fopen(argv[4], "w"); + output = u_fopen(argv[4], "w", NULL, NULL); if(!output) { - cerr << "Error: can't open output file '"; - cerr << argv[4] << "'." << endl; - exit(EXIT_FAILURE); + cerr << "Error: can't open output file '"; + cerr << argv[4] << "'." << endl; + exit(EXIT_FAILURE); } } } -#ifdef _MSC_VER - _setmode(_fileno(input), _O_U8TEXT); - _setmode(_fileno(output), _O_U8TEXT); -#endif TransferMult t; t.read(argv[1], argv[2]); diff --git a/apertium/apertium_cleanstream.cc b/apertium/apertium_cleanstream.cc index bd43a0b..0792947 100644 --- a/apertium/apertium_cleanstream.cc +++ b/apertium/apertium_cleanstream.cc @@ -22,6 +22,9 @@ #include #include #include +#include +#include +#include #ifdef __MINGW32__ #include @@ -29,67 +32,28 @@ using namespace std; -#ifndef fputwc_unlocked -#define fputwc_unlocked fputwc -#endif - -#ifndef fputws_unlocked -#define fputws_unlocked fputws -#endif - -#ifndef fgetwc_unlocked -#define fgetwc_unlocked getwc -#endif - - -void -tryToSetLocale() +UString +readFullBlock(InputFile& input, UChar32 const delim1, UChar32 const delim2) { -#if !defined(__CYGWIN__) && !defined (__MINGW32__) - if(setlocale(LC_CTYPE, "") != NULL) - { - return; - } - - wcerr << "Warning: unsupported locale, fallback to \"C\"" << endl; - - setlocale(LC_ALL, "C"); -#endif -#ifdef __CYGWIN__ - setlocale(LC_ALL, "C.UTF-8"); -#endif -#ifdef __MINGW32__ - //SetConsoleInputCP(65001); - SetConsoleOutputCP(65001); -#endif -} - -wstring -readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) -{ - wstring result = L""; + UString result; result += delim1; - wchar_t c = delim1; + UChar32 c = delim1; - while(!feof(input) && c != delim2) - { - c = static_cast(fgetwc_unlocked(input)); + while(!input.eof() && c != delim2) { + c = input.get(); result += c; - if(c != L'\\') - { + if(c != '\\') { continue; - } - else - { - result += L'\\'; - c = static_cast(fgetwc(input)); + } else { + result += '\\'; + c = input.get(); result += c; } } if(c != delim2) { - wcerr << "Error: expected: " << delim2 << ", saw: " << c << endl; + cerr << "Error: expected: " << delim2 << ", saw: " << c << endl; } return result; @@ -98,85 +62,70 @@ readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) int main (int argc, char** argv) { - wstring buf = L""; - wstring blanktmp = L""; + UString buf; + UString blanktmp; bool keepblank = false; bool spaced = true; bool intoken = false; - wchar_t ws = L' '; + UChar32 ws = ' '; for(int i=1; i #include #include +#include #include #include @@ -36,49 +37,29 @@ using namespace Apertium; using namespace std; -FILE * open_file(char const *filename, char const *mode) -{ - FILE *retval; - - struct stat var; - if(stat(filename, &var)) - { - wcerr << "Can't stat '" << filename << "'" << endl; - exit(EXIT_FAILURE); - } - - retval = fopen(filename, mode); - - if(!retval) - { - wcerr << "Can't open '" << filename << "'" << endl; - exit(EXIT_FAILURE); - } -#ifdef _MSC_VER - _setmode(_fileno(retval), _O_U8TEXT); -#endif - - return retval; -} - int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); if(argc < 2 || argc > 4) { - wcerr << "USAGE: " << basename(argv[0]) << " tsx_file [input [output]" << endl; + cerr << "USAGE: " << basename(argv[0]) << " tsx_file [input [output]" << endl; exit(EXIT_FAILURE); } - FILE *input = stdin, *output = stdout; + char* input = NULL; + UFILE* output = u_finit(stdout, NULL, NULL); switch(argc) { case 4: - output = open_file(argv[3], "w"); + output = u_fopen(argv[3], "w", NULL, NULL); + if (!output) { + cerr << "Error: Unable to open '" << argv[3] << "' for writing." << endl; + exit(EXIT_FAILURE); + } // no break case 3: - input = open_file(argv[2], "r"); + input = argv[2]; // no break case 2: default: diff --git a/apertium/apertium_interchunk.cc b/apertium/apertium_interchunk.cc index 73bc555..63dbf09 100644 --- a/apertium/apertium_interchunk.cc +++ b/apertium/apertium_interchunk.cc @@ -36,14 +36,14 @@ using namespace std; void message(char *progname) { - wcerr << "USAGE: " << basename(progname) << " [-tz] t2x preproc [input [output]]" << endl; - wcerr << " t2x t2x rules file" << endl; - wcerr << " preproc result of preprocess trules file" << endl; - wcerr << " input input file, standard input by default" << endl; - wcerr << " output output file, standard output by default" << endl; - wcerr << "OPTIONS" < saved_token = tagged_sent[token_idx]; tagged_sent[token_idx] = lu.TheAnalyses[analy_idx]; - std::wcout << L"LU:" << tagged_sent[token_idx] << std::endl ; + std::cout << "LU:" << tagged_sent[token_idx] << std::endl ; std::vector &wordoids = lu.TheAnalyses[analy_idx].TheMorphemes; for (wrd_idx=0; wrd_idx\n"; - std::wcout << "Output features and weights from a model file.\n"; - std::wcout << argv[0] << " mtx \n"; - std::wcout << "Output macros and features from an mtx file.\n"; - std::wcout << argv[0] << " path \n"; - std::wcout << "Trace a particular path through giving which features fire " + std::cout << "Run with one of:\n"; + std::cout << argv[0] << " model \n"; + std::cout << "Output features and weights from a model file.\n"; + std::cout << argv[0] << " mtx \n"; + std::cout << "Output macros and features from an mtx file.\n"; + std::cout << argv[0] << " path \n"; + std::cout << "Trace a particular path through giving which features fire " << "and the resulting score. Useful for interactively " << "designing feature sets.\n"; } diff --git a/apertium/apertium_postchunk.cc b/apertium/apertium_postchunk.cc index bde462e..ae8ea12 100644 --- a/apertium/apertium_postchunk.cc +++ b/apertium/apertium_postchunk.cc @@ -35,14 +35,14 @@ using namespace std; void message(char *progname) { - wcerr << "USAGE: " << basename(progname) << " [-z] t3x preproc [input [output]]" << endl; - wcerr << " t3x t3x rules file" << endl; - wcerr << " preproc result of preprocess trules file" << endl; - wcerr << " input input file, standard input by default" << endl; - wcerr << " output output file, standard output by default" << endl; - wcerr << "OPTIONS" < #include +#include using namespace Apertium; @@ -34,16 +35,16 @@ using namespace std; void usage(char *progname) { - wcerr << L"USAGE: " << basename(progname) << L" [input_file [output_file]]" << endl; - wcerr << L" -z null-flushing output on '\0'" << endl; - wcerr << L" -h shows this message" << endl; + cerr << "USAGE: " << basename(progname) << " [input_file [output_file]]" << endl; + cerr << " -z null-flushing output on '\0'" << endl; + cerr << " -h shows this message" << endl; exit(EXIT_FAILURE); } -void processStream(FILE *in, FILE *out, bool null_flush) +void processStream(InputFile& in, UFILE* out, bool null_flush) { int prev = -1; - int c = fgetc(in); + UChar32 c = in.get(); while (c != EOF) { if (!((c == ' ') && (prev == ' '))) @@ -98,44 +99,33 @@ int main(int argc, char *argv[]) usage(argv[0]); } - FILE *input, *output; + InputFile input; + UFILE* output; if((argc-optind+1) == 1) { - input = stdin; - output = stdout; + output = u_finit(stdout, NULL, NULL); } else if ((argc-optind+1) == 2) { - input = fopen(argv[argc-1], "r"); - if(!input) - { + if (!input.open(argv[argc-1])) { usage(argv[0]); } - output = stdout; + output = u_finit(stdout, NULL, NULL); } else { - input = fopen(argv[argc-2], "r"); - output = fopen(argv[argc-1], "w"); - - if(!input || !output) - { + output = u_fopen(argv[argc-1], "w", NULL, NULL); + if (!output || !input.open(argv[argc-2])) { usage(argv[0]); } } - if(feof(input)) + if(input.eof()) { - wcerr << L"ERROR: Can't read file '" << argv[1] << L"'" << endl; + cerr << "ERROR: Can't read file '" << argv[1] << "'" << endl; exit(EXIT_FAILURE); } processStream(input, output, null_flush); - -#ifdef _MSC_VER - _setmode(_fileno(input), _O_U8TEXT); - _setmode(_fileno(output), _O_U8TEXT); -#endif - } diff --git a/apertium/apertium_pretransfer.cc b/apertium/apertium_pretransfer.cc index 95c2c4c..a073afc 100644 --- a/apertium/apertium_pretransfer.cc +++ b/apertium/apertium_pretransfer.cc @@ -33,11 +33,11 @@ using namespace std; void usage(char *progname) { - wcerr << L"USAGE: " << basename(progname) << L" [input_file [output_file]]" << endl; - wcerr << L" -n assume no surface forms" << endl; - wcerr << L" -e treat ~ as compound separator" << endl; - wcerr << L" -z null-flushing output on '\0'" << endl; - wcerr << L" -h shows this message" << endl; + cerr << "USAGE: " << basename(progname) << " [input_file [output_file]]" << endl; + cerr << " -n assume no surface forms" << endl; + cerr << " -e treat ~ as compound separator" << endl; + cerr << " -z null-flushing output on '\\0'" << endl; + cerr << " -h shows this message" << endl; exit(EXIT_FAILURE); } @@ -90,43 +90,35 @@ int main(int argc, char *argv[]) usage(argv[0]); } - FILE *input, *output; + InputFile input; + UFILE* output; if((argc-optind+1) == 1) { - input = stdin; - output = stdout; + output = u_finit(stdout, NULL, NULL); } else if ((argc-optind+1) == 2) { - input = fopen(argv[argc-1], "r"); - if(!input) - { + if(!input.open(argv[argc-1])) { usage(argv[0]); } - output = stdout; + u_finit(stdout, NULL, NULL); } else { - input = fopen(argv[argc-2], "r"); - output = fopen(argv[argc-1], "w"); + output = u_fopen(argv[argc-1], "w", NULL, NULL); - if(!input || !output) + if(!output || !input.open(argv[argc-2])) { usage(argv[0]); } } - if(feof(input)) + if(input.eof()) { - wcerr << L"ERROR: Can't read file '" << argv[1] << L"'" << endl; + cerr << "ERROR: Can't read file '" << argv[1] << "'" << endl; exit(EXIT_FAILURE); } -#ifdef _MSC_VER - _setmode(_fileno(input), _O_U8TEXT); - _setmode(_fileno(output), _O_U8TEXT); -#endif - processStream(input, output, null_flush, surface_forms, compound_sep); } diff --git a/apertium/apertium_re.cc b/apertium/apertium_re.cc index 7182614..b12dc3d 100644 --- a/apertium/apertium_re.cc +++ b/apertium/apertium_re.cc @@ -58,7 +58,7 @@ ApertiumRE::read(FILE *input) re = static_cast(pcre_malloc(size)); if(size != fread(re, 1, size, input)) { - wcerr << L"Error reading regexp" << endl; + cerr << "Error reading regexp" << endl; exit(EXIT_FAILURE); } @@ -74,8 +74,8 @@ ApertiumRE::compile(string const &str) &error, &erroroffset, NULL); if(re == NULL) { - wcerr << L"Error: pcre_compile "; - wcerr << error << endl; + cerr << "Error: pcre_compile "; + cerr << error << endl; exit(EXIT_FAILURE); } @@ -87,7 +87,7 @@ ApertiumRE::write(FILE *output) const { if(empty) { - wcerr << L"Error, cannot write empty regexp" << endl; + cerr << "Error, cannot write empty regexp" << endl; exit(EXIT_FAILURE); } @@ -95,7 +95,7 @@ ApertiumRE::write(FILE *output) const int rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size); if(rc < 0) { - wcerr << L"Error calling pcre_fullinfo()\n" << endl; + cerr << "Error calling pcre_fullinfo()\n" << endl; exit(EXIT_FAILURE); } @@ -104,7 +104,7 @@ ApertiumRE::write(FILE *output) const size_t rc2 = fwrite(re, 1, size, output); if(rc2 != size) { - wcerr << L"Error writing precompiled regex\n" << endl; + cerr << "Error writing precompiled regex\n" << endl; exit(EXIT_FAILURE); } } @@ -130,7 +130,7 @@ ApertiumRE::match(string const &str) const return ""; default: - wcerr << L"Error: Unknown error matching regexp (code " << rc << L")" << endl; + cerr << "Error: Unknown error matching regexp (code " << rc << ")" << endl; exit(EXIT_FAILURE); } } @@ -159,7 +159,7 @@ ApertiumRE::replace(string &str, string const &value) const return false; default: - wcerr << L"Error: Unknown error matching regexp (code " << rc << L")" << endl; + cerr << "Error: Unknown error matching regexp (code " << rc << ")" << endl; exit(EXIT_FAILURE); } } diff --git a/apertium/apertium_tagger.cc b/apertium/apertium_tagger.cc index 8f6bb74..a384330 100644 --- a/apertium/apertium_tagger.cc +++ b/apertium/apertium_tagger.cc @@ -33,7 +33,7 @@ int main(int argc, char **argv) { try { Apertium::apertium_tagger(argc, argv); } catch (const Apertium::Exception::apertium_tagger::err_Exception &err_Exception_) { - std::wcerr << "Try 'apertium-tagger --help' for more information." << std::endl; + std::cerr << "Try 'apertium-tagger --help' for more information." << std::endl; return 1; } catch (...) { throw; diff --git a/apertium/apertium_tagger_apply_new_rules.cc b/apertium/apertium_tagger_apply_new_rules.cc index a8238e6..375c769 100644 --- a/apertium/apertium_tagger_apply_new_rules.cc +++ b/apertium/apertium_tagger_apply_new_rules.cc @@ -38,17 +38,17 @@ TTag eos; //End-of-sentence tag void check_file(FILE *f, const string& path) { if (!f) { - wcerr<<"Error: cannot open file '"<get_tags()]; if ((k>=tagger_data_hmm.getM())||(k<0)) { - wcerr<<"Error: Ambiguity class number out of range: "<get_superficial_form())<<"\n"; - wcerr<<"Ambiguity class: "<get_string_tags())<<"\n"; + cerr<<"Error: Ambiguity class number out of range: "<get_superficial_form())<<"\n"; + cerr<<"Ambiguity class: "<get_string_tags())<<"\n"; } } @@ -69,15 +69,15 @@ void readwords (FILE *is, int corpus_length) { word=lexmorfo.get_next_word(); } - wcerr<] < file.crp \n\n"; + cerr<<"USAGE:\n"; + cerr<] < file.crp \n\n"; - wcerr<<"ARGUMENTS: \n" + cerr<<"ARGUMENTS: \n" <<" --tsxfile|-x: Specify a tagger specification file\n" <<" --probfile|-p: Specify a tagger parameter file\n" <<" --clength|-l: Specify the length of the corpus to process\n"; @@ -92,12 +92,12 @@ int main(int argc, char* argv[]) { int c; int option_index=0; - wcerr<<"LOCALE: "<::const_iterator it = constants.begin(), limit = constants.end(); + for(map::const_iterator it = constants.begin(), limit = constants.end(); it != limit; it++) { - Compression::wstring_write(it->first, output); + Compression::string_write(it->first, output); Compression::multibyte_write(it->second, output); } } @@ -88,7 +88,7 @@ ConstantManager::read(FILE *input) int size = Compression::multibyte_read(input); for(int i = 0; i != size; i++) { - wstring mystr = Compression::wstring_read(input); + UString mystr = Compression::string_read(input); constants[mystr] = Compression::multibyte_read(input); } } @@ -96,11 +96,11 @@ ConstantManager::read(FILE *input) void ConstantManager::serialise(std::ostream &serialised) const { - Serialiser >::serialise(constants, serialised); + Serialiser >::serialise(constants, serialised); } void ConstantManager::deserialise(std::istream &serialised) { - constants = Deserialiser >::deserialise(serialised); + constants = Deserialiser >::deserialise(serialised); } diff --git a/apertium/constant_manager.h b/apertium/constant_manager.h index d8ed3f7..395edae 100644 --- a/apertium/constant_manager.h +++ b/apertium/constant_manager.h @@ -20,13 +20,14 @@ #include #include #include +#include using namespace std; class ConstantManager { private: - map constants; + map constants; void copy(ConstantManager const &o); void destroy(); @@ -36,8 +37,8 @@ public: ConstantManager(ConstantManager const &o); ConstantManager & operator =(ConstantManager const &o); - void setConstant(wstring const &constant, int const value); - int getConstant(wstring const &constant); + void setConstant(UString const &constant, int const value); + int getConstant(UString const &constant); void write(FILE *output); void read(FILE *input); void serialise(std::ostream &serialised) const; diff --git a/apertium/deserialiser.h b/apertium/deserialiser.h index ae40972..2f90ea2 100644 --- a/apertium/deserialiser.h +++ b/apertium/deserialiser.h @@ -90,13 +90,13 @@ i Deserialiser::deserialise(std::istream &Stream_) { Lemma Deserialiser::deserialise(std::istream &Stream_) { Lemma StreamedType_; - StreamedType_.TheLemma = Deserialiser::deserialise(Stream_); + StreamedType_.TheLemma = Deserialiser::deserialise(Stream_); return StreamedType_; } Morpheme Deserialiser::deserialise(std::istream &Stream_) { Morpheme SerialisedType_; - SerialisedType_.TheLemma = Deserialiser::deserialise(Stream_); + SerialisedType_.TheLemma = Deserialiser::deserialise(Stream_); SerialisedType_.TheTags = Deserialiser >::deserialise(Stream_); return SerialisedType_; @@ -104,7 +104,7 @@ Morpheme Deserialiser::deserialise(std::istream &Stream_) { Tag Deserialiser::deserialise(std::istream &Stream_) { Tag SerialisedType_; - SerialisedType_.TheTag = Deserialiser::deserialise(Stream_); + SerialisedType_.TheTag = Deserialiser::deserialise(Stream_); return SerialisedType_; } diff --git a/apertium/exception.h b/apertium/exception.h index 2bda473..3b97a76 100644 --- a/apertium/exception.h +++ b/apertium/exception.h @@ -27,9 +27,8 @@ namespace Exception { EXCEPTION_TYPE(const char *const what_) : ExceptionType(what_) {} \ EXCEPTION_TYPE(const std::string &what_) : ExceptionType(what_) {} \ EXCEPTION_TYPE(const std::stringstream &what_) : ExceptionType(what_) {} \ - EXCEPTION_TYPE(const wchar_t *const what_) : ExceptionType(what_) {} \ - EXCEPTION_TYPE(const std::wstring &what_) : ExceptionType(what_) {} \ - EXCEPTION_TYPE(const std::wstringstream &what_) : ExceptionType(what_) {} \ + EXCEPTION_TYPE(const UChar *const what_) : ExceptionType(what_) {} \ + EXCEPTION_TYPE(const UString &what_) : ExceptionType(what_) {} \ ~EXCEPTION_TYPE() throw() {} \ }; diff --git a/apertium/exception_type.cc b/apertium/exception_type.cc index 0f32b45..c83dc3f 100644 --- a/apertium/exception_type.cc +++ b/apertium/exception_type.cc @@ -20,29 +20,27 @@ #include namespace Apertium { -ExceptionType::ExceptionType(const char *const what_) : what_(what_) {} +ExceptionType::ExceptionType(const char *const what_) + : what_(to_ustring(what_)) {} -ExceptionType::ExceptionType(const std::string &what_) : what_(what_) {} +ExceptionType::ExceptionType(const std::string &what_) + : what_(to_ustring(what_.c_str())) {} ExceptionType::ExceptionType(const std::stringstream &what_) - : what_(what_.str()) {} + : what_(to_ustring(what_.str().c_str())) {} -ExceptionType::ExceptionType(const wchar_t *const what_) -{ - this->what_ = UtfConverter::toUtf8(what_); -} +ExceptionType::ExceptionType(const UChar *const what_) + : what_(what_) {} -ExceptionType::ExceptionType(const std::wstring &what_) -{ - this->what_ = UtfConverter::toUtf8(what_); -} - -ExceptionType::ExceptionType(const std::wstringstream &what_) -{ - this->what_ = UtfConverter::toUtf8(what_.str()); -} +ExceptionType::ExceptionType(const UString &what_) + : what_(what_) {} ExceptionType::~ExceptionType() throw() {} -const char *ExceptionType::what() const throw() { return what_.c_str(); } +const char *ExceptionType::what() const throw() +{ + std::string res; + utf8::utf16to8(what_.begin(), what_.end(), std::back_inserter(res)); + return res.c_str(); +} } diff --git a/apertium/exception_type.h b/apertium/exception_type.h index a780b75..9ee46ac 100644 --- a/apertium/exception_type.h +++ b/apertium/exception_type.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace Apertium { class ExceptionType : public std::exception { @@ -26,14 +27,13 @@ public: ExceptionType(const char *const what_); ExceptionType(const std::string &what_); ExceptionType(const std::stringstream &what_); - ExceptionType(const wchar_t *wchar_t_what_); - ExceptionType(const std::wstring &wchar_t_what_); - ExceptionType(const std::wstringstream &wchar_t_what_); + ExceptionType(const UChar *wchar_t_what_); + ExceptionType(const UString &wchar_t_what_); virtual ~ExceptionType() throw() = 0; const char *what() const throw(); protected: - std::string what_; + UString what_; }; } diff --git a/apertium/file_morpho_stream.cc b/apertium/file_morpho_stream.cc index 5040216..46a398a 100644 --- a/apertium/file_morpho_stream.cc +++ b/apertium/file_morpho_stream.cc @@ -26,29 +26,29 @@ #include using namespace Apertium; -FileMorphoStream::FileMorphoStream(FILE *ftxt, bool d, TaggerData *t) : +FileMorphoStream::FileMorphoStream(const char* ftxt, bool d, TaggerData *t) : ms() { foundEOF = false; debug=d; td = t; me = td->getPatternList().newMatchExe(); alphabet = td->getPatternList().getAlphabet(); - input = ftxt; + input.open(ftxt); ca_any_char = alphabet(PatternList::ANY_CHAR); ca_any_tag = alphabet(PatternList::ANY_TAG); ConstantManager &constants = td->getConstants(); - ca_kignorar = constants.getConstant(L"kIGNORAR"); - ca_kbarra = constants.getConstant(L"kBARRA"); - ca_kdollar = constants.getConstant(L"kDOLLAR"); - ca_kbegin = constants.getConstant(L"kBEGIN"); - ca_kmot = constants.getConstant(L"kMOT"); - ca_kmas = constants.getConstant(L"kMAS"); - ca_kunknown = constants.getConstant(L"kUNKNOWN"); + ca_kignorar = constants.getConstant("kIGNORAR"); + ca_kbarra = constants.getConstant("kBARRA"); + ca_kdollar = constants.getConstant("kDOLLAR"); + ca_kbegin = constants.getConstant("kBEGIN"); + ca_kmot = constants.getConstant("kMOT"); + ca_kmas = constants.getConstant("kMAS"); + ca_kunknown = constants.getConstant("kUNKNOWN"); - map &tag_index = td->getTagIndex(); - ca_tag_keof = tag_index[L"TAG_kEOF"]; - ca_tag_kundef = tag_index[L"TAG_kUNDEF"]; + map &tag_index = td->getTagIndex(); + ca_tag_keof = tag_index["TAG_kEOF"]; + ca_tag_kundef = tag_index["TAG_kUNDEF"]; end_of_file = false; null_flush = false; @@ -69,7 +69,7 @@ FileMorphoStream::get_next_word() if(word->isAmbiguous()) { - vector &ref = td->getDiscardRules(); + vector &ref = td->getDiscardRules(); for(unsigned int i = 0; i < ref.size(); i++) { word->discardOnAmbiguity(ref[i]); @@ -93,7 +93,7 @@ FileMorphoStream::get_next_word() if(feof(input) || (null_flush && symbol == L'\0')) { end_of_file = true; - vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, "", td->getPreferRules()); return get_next_word(); } if(symbol == L'^') @@ -103,7 +103,7 @@ FileMorphoStream::get_next_word() } else { - wstring str = L""; + UString str = ""; if(symbol == L'\\') { symbol = fgetwc_unlocked(input); @@ -123,7 +123,7 @@ FileMorphoStream::get_next_word() { end_of_file = true; vwords[ivwords]->add_ignored_string(str); - vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, "", td->getPreferRules()); return get_next_word(); } else if(symbol == L'\\') @@ -134,7 +134,7 @@ FileMorphoStream::get_next_word() { end_of_file = true; vwords[ivwords]->add_ignored_string(str); - vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, "", td->getPreferRules()); return get_next_word(); } str += static_cast(symbol); @@ -159,7 +159,7 @@ FileMorphoStream::get_next_word() } void -FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) +FileMorphoStream::lrlmClassify(UString const &str, int &ivwords) { int floor = 0; int last_type = -1; @@ -183,7 +183,7 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) } else { - wstring tag = L""; + UString tag = ""; for(int j = i+1; j != limit; j++) { if(str[j] == L'\\') @@ -232,8 +232,8 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) { if (debug) { - wcerr<add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules()); return; @@ -264,8 +264,8 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) { if (debug) { - wcerr<add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules()); return; @@ -280,8 +280,8 @@ FileMorphoStream::lrlmClassify(wstring const &str, int &ivwords) val = ca_tag_kundef; if (debug) { - wcerr< 0) { vwords[ivwords]->add_ignored_string(str); - wcerr<get_superficial_form()<get_superficial_form()<<"\n"; + cerr<<"Debug: "<< str <<"\n"; } - vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, "", td->getPreferRules()); return; } else if(symbol == L'\\') @@ -319,13 +319,13 @@ FileMorphoStream::readRestOfWord(int &ivwords) else if(symbol == L'/') { vwords[ivwords]->set_superficial_form(str); - str = L""; + str = ""; break; } else if(symbol == L'$') { vwords[ivwords]->set_superficial_form(str); - vwords[ivwords]->add_ignored_string(L"$"); + vwords[ivwords]->add_ignored_string("$"); break; } else @@ -345,11 +345,11 @@ FileMorphoStream::readRestOfWord(int &ivwords) if(str.size() > 0) { vwords[ivwords]->add_ignored_string(str); - wcerr<get_superficial_form()<get_superficial_form()<<"\n"; + cerr<<"Debug: "<< str <<"\n"; } - vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, "", td->getPreferRules()); return; } else if(symbol == L'\\') @@ -362,7 +362,7 @@ FileMorphoStream::readRestOfWord(int &ivwords) else if(symbol == L'/') { lrlmClassify(str, ivwords); - str = L""; + str = ""; ivwords = 0; continue; } diff --git a/apertium/file_morpho_stream.h b/apertium/file_morpho_stream.h index 3d40802..6a6ecf6 100644 --- a/apertium/file_morpho_stream.h +++ b/apertium/file_morpho_stream.h @@ -47,9 +47,9 @@ using namespace std; class FileMorphoStream : public MorphoStream { private: bool foundEOF; - wstring last_string_tag; + UString last_string_tag; bool debug; - FILE *input; + InputFile input; int ca_any_char; int ca_any_tag; int ca_kignorar; @@ -74,13 +74,13 @@ private: bool end_of_file; void readRestOfWord(int &ivwords); - void lrlmClassify(wstring const &str, int &ivwords); + void lrlmClassify(UString const &str, int &ivwords); public: /** Constructor * @param is the input stream. */ - FileMorphoStream(FILE *ftxt, bool d, TaggerData *t); + FileMorphoStream(const char* ftxt, bool d, TaggerData *t); /** * Destructor diff --git a/apertium/file_tagger.cc b/apertium/file_tagger.cc index cdce82c..774a616 100644 --- a/apertium/file_tagger.cc +++ b/apertium/file_tagger.cc @@ -40,8 +40,8 @@ void FILE_Tagger::setNullFlush(const bool &NullFlush) { TheFlags.setNullFlush(NullFlush); } -void FILE_Tagger::tagger(FILE *Input, FILE *Output) { - FileMorphoStream morpho_stream(Input, TheFlags.getDebug(), &get_tagger_data()); +void FILE_Tagger::tagger(const char* input_file, UFILE *Output) { + FileMorphoStream morpho_stream(input_file, TheFlags.getDebug(), &get_tagger_data()); tagger(morpho_stream, Output); } @@ -51,13 +51,13 @@ void FILE_Tagger::init_and_train(MorphoStream &lexmorfo, unsigned long count) { train(lexmorfo, count); } -void FILE_Tagger::init_and_train(FILE *corpus, unsigned long count) { - init_probabilities_kupiec_(corpus); - train(corpus, count); +void FILE_Tagger::init_and_train(const char* corpus_file, unsigned long count) { + init_probabilities_kupiec_(corpus_file); + train(corpus_file, count); } -void FILE_Tagger::train(FILE *corpus, unsigned long count) { - FileMorphoStream lexmorfo(corpus, true, &get_tagger_data()); +void FILE_Tagger::train(const char* corpus_file, unsigned long count) { + FileMorphoStream lexmorfo(corpus_file, true, &get_tagger_data()); train(lexmorfo, count); } @@ -67,15 +67,16 @@ void FILE_Tagger::deserialise(string const &TaggerSpecificationFilename) { deserialise(TaggerSpecificationReader_.getTaggerData()); } -void FILE_Tagger::init_probabilities_from_tagged_text_(FILE *TaggedCorpus, - FILE *Corpus) { - FileMorphoStream stream_tagged(TaggedCorpus, true, &get_tagger_data()); - FileMorphoStream stream_untagged(Corpus, true, &get_tagger_data()); +void FILE_Tagger::init_probabilities_from_tagged_text_( + const char* tagged_file, const char* untagged_file) +{ + FileMorphoStream stream_tagged(tagged_file, true, &get_tagger_data()); + FileMorphoStream stream_untagged(untagged_file, true, &get_tagger_data()); init_probabilities_from_tagged_text_(stream_tagged, stream_untagged); } -void FILE_Tagger::init_probabilities_kupiec_(FILE *Corpus) { - FileMorphoStream lexmorfo(Corpus, true, &get_tagger_data()); +void FILE_Tagger::init_probabilities_kupiec_(const char* corpus_file) { + FileMorphoStream lexmorfo(corpus_file, true, &get_tagger_data()); init_probabilities_kupiec_(lexmorfo); } diff --git a/apertium/file_tagger.h b/apertium/file_tagger.h index dc9a543..c7bceb0 100644 --- a/apertium/file_tagger.h +++ b/apertium/file_tagger.h @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -34,22 +36,22 @@ public: void set_debug(const bool &Debug); void set_show_sf(const bool &ShowSuperficial); void setNullFlush(const bool &NullFlush); - virtual void tagger(FILE *Input, FILE *Output); - virtual void tagger(MorphoStream &morpho_stream, FILE *Output) = 0; - virtual std::vector &getArrayTags() = 0; + virtual void tagger(const char* input_file, UFILE* Output); + virtual void tagger(MorphoStream &morpho_stream, UFILE* Output) = 0; + virtual std::vector &getArrayTags() = 0; void init_and_train(MorphoStream &lexmorfo, unsigned long Count); - void init_and_train(FILE *Corpus, unsigned long Count); - virtual void train(FILE *Corpus, unsigned long Count); + void init_and_train(const char* corpus_file, unsigned long Count); + virtual void train(const char* corpus_file, unsigned long Count); virtual void train(MorphoStream &lexmorpho, unsigned long count) = 0; virtual void train(MorphoStream &lexmorpho) = 0; virtual void serialise(FILE *Stream_) = 0; void deserialise(string const &TaggerSpecificationFilename); virtual void init_probabilities_from_tagged_text_( - FILE *TaggedCorpus, FILE *Corpus); + const char* tagged_file, const char* untagged_file); virtual void init_probabilities_from_tagged_text_( MorphoStream &stream_tagged, MorphoStream &stream_untagged) = 0; - virtual void init_probabilities_kupiec_(FILE *Corpus); + virtual void init_probabilities_kupiec_(const char* corpus_file); virtual void init_probabilities_kupiec_(MorphoStream &lexmorfo) = 0; /** It reads the expanded dictionary received as a parameter and calculates diff --git a/apertium/hmm.cc b/apertium/hmm.cc index 9285217..2e7aec9 100644 --- a/apertium/hmm.cc +++ b/apertium/hmm.cc @@ -58,10 +58,10 @@ TaggerData& HMM::get_tagger_data() { void HMM::deserialise(FILE *Serialised_FILE_Tagger) { tdhmm.read(Serialised_FILE_Tagger); - eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; + eos = (tdhmm.getTagIndex())["TAG_SENT"_u]; } -std::vector &HMM::getArrayTags() { +std::vector &HMM::getArrayTags() { return tdhmm.getArrayTags(); } @@ -69,7 +69,7 @@ void HMM::serialise(FILE *Stream_) { tdhmm.write(Stream_); } void HMM::deserialise(const TaggerData &Deserialised_FILE_Tagger) { tdhmm = TaggerDataHMM(Deserialised_FILE_Tagger); - eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; + eos = (tdhmm.getTagIndex())["TAG_SENT"_u]; } void HMM::init_probabilities_from_tagged_text_(MorphoStream &stream_tagged, @@ -99,7 +99,7 @@ HMM::HMM(TaggerFlags& Flags_) : FILE_Tagger(Flags_) {} HMM::HMM(TaggerDataHMM _tdhmm) : tdhmm(_tdhmm) { - eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; + eos = (tdhmm.getTagIndex())["TAG_SENT"_u]; } HMM::HMM(TaggerDataHMM *tdhmm) : tdhmm(*tdhmm) {} @@ -193,7 +193,7 @@ HMM::init_probabilities_kupiec(MorphoStream &lexmorfo) //We count for each ambiguity class the number of ocurrences word = lexmorfo.get_next_word(); while((word)) { - if (++nw%10000==0) wcerr<get_tags(); @@ -265,7 +265,7 @@ HMM::init_probabilities_kupiec(MorphoStream &lexmorfo) } } } - wcerr<get_superficial_form()!=word_untagged->get_superficial_form()) { - wcerr<get_tags().size()==0) // Unknown word tag1 = -1; else if (word_tagged->get_tags().size()>1) // Ambiguous word - wcerr<get_superficial_form()<get_superficial_form()<<"\n"; else tag1 = *(word_tagged->get_tags()).begin(); @@ -368,7 +368,7 @@ HMM::init_probabilities_from_tagged_text(MorphoStream &stream_tagged, } } - wcerr< > ambiguity_classes; - FileMorphoStream morpho_stream(in, true, &tdhmm); + FileMorphoStream morpho_stream(input_file, true, &tdhmm); TaggerWord *word = morpho_stream.get_next_word(); @@ -434,7 +434,7 @@ HMM::filter_ambiguity_classes(FILE *in, FILE *out) { if(ambiguity_classes.find(tags) == ambiguity_classes.end()) { ambiguity_classes.insert(tags); word->outputOriginal(out); - //wcerr<get_string_tags()<get_string_tags()<<"\n"; } } delete word; @@ -474,12 +474,12 @@ HMM::train(MorphoStream &morpho_stream) { while (word) { - //wcerr<1) { @@ -521,8 +521,8 @@ HMM::train(MorphoStream &morpho_stream) { prob = alpha[len][tag]; - //wcerr<<"prob="<1) || ((tag!=eos)&&(tag != (tdhmm.getTagIndex())[L"TAG_kEOF"]))) { - wcerr << L"Warning: The last tag is not the end-of-sentence-tag " - << L"but rather " << tdhmm.getArrayTags()[tag] << L". Line: " << nw - << L". Pending: " << pending.size() << ". Tags: "; - wcerr << "\n"; + if ((pending.size()>1) || ((tag!=eos)&&(tag != (tdhmm.getTagIndex())["TAG_kEOF"_u]))) { + cerr << "Warning: The last tag is not the end-of-sentence-tag " + << "but rather " << tdhmm.getArrayTags()[tag] << ". Line: " << nw + << ". Pending: " << pending.size() << ". Tags: "; + cerr << "\n"; } int N = tdhmm.getN(); @@ -597,24 +597,24 @@ HMM::train(MorphoStream &morpho_stream) { j = jt->first; if (xsi[i][j]>0) { if (gamma[i]==0) { - wcerr<get_superficial_form()<get_string_tags()<get_superficial_form()<<"' "<get_string_tags()<<"\n"; } for (unsigned t=0; t1)&&(TheFlags.getDebug())) { - wstring errors; - errors = L"The text to disambiguate has finished, but there are ambiguous words that has not been disambiguated.\n"; - errors+= L"This message should never appears. If you are reading this ..... these are very bad news.\n"; - wcerr< &getArrayTags(); + std::vector &getArrayTags(); void serialise(FILE *Stream_); void deserialise(const TaggerData &Deserialised_FILE_Tagger); void init_probabilities_from_tagged_text_(MorphoStream &stream_tagged, @@ -143,7 +143,7 @@ public: * @param in the input stream with the untagged text to tag * @param out the output stream with the tagged text */ - void tagger(MorphoStream &morpho_stream, FILE *Output); + void tagger(MorphoStream &morpho_stream, UFILE* Output); /** Prints the A matrix. */ @@ -157,7 +157,7 @@ public: */ void print_ambiguity_classes(); - void filter_ambiguity_classes(FILE *in, FILE *out); + void filter_ambiguity_classes(const char* input_file, UFILE* output); }; #endif diff --git a/apertium/interchunk.cc b/apertium/interchunk.cc index 019577d..3afb9d3 100644 --- a/apertium/interchunk.cc +++ b/apertium/interchunk.cc @@ -91,55 +91,58 @@ Interchunk::readData(FILE *in) me = new MatchExe(t, finals); // attr_items - bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); + //bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); + Compression::string_read(in); // version + bool recompile_attrs = true; for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + UString const cad_k = Compression::string_read(in); attr_items[cad_k].read(in); - wstring fallback = Compression::wstring_read(in); + UString fallback = Compression::string_read(in); if(recompile_attrs) { - attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); + //attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); + // TODO regexs } } // variables for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); - variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); + UString const cad_k = Compression::string_read(in); + variables[cad_k] = Compression::string_read(in); } // macros for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + UString const cad_k = Compression::string_read(in); macros[cad_k] = Compression::multibyte_read(in); } // lists for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + UString const cad_k = Compression::string_read(in); for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) { - wstring const cad_v = Compression::wstring_read(in); - lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); - listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + UString const cad_v = Compression::string_read(in); + lists[cad_k].insert(cad_v); + listslow[cad_k].insert(StringUtils::tolower(cad_v)); } } } void -Interchunk::read(string const &transferfile, string const &datafile) +Interchunk::read(const char* transferfile, const char* datafile) { readInterchunk(transferfile); // datafile - FILE *in = fopen(datafile.c_str(), "rb"); + FILE *in = fopen(datafile, "rb"); if(!in) { - wcerr << "Error: Could not open file '" << datafile << "'." << endl; + cerr << "Error: Could not open file '" << datafile << "'." << endl; exit(EXIT_FAILURE); } readData(in); @@ -148,13 +151,13 @@ Interchunk::read(string const &transferfile, string const &datafile) } void -Interchunk::readInterchunk(string const &in) +Interchunk::readInterchunk(const char* in) { - doc = xmlReadFile(in.c_str(), NULL, 0); + doc = xmlReadFile(in, NULL, 0); if(doc == NULL) { - wcerr << "Error: Could not parse file '" << in << "'." << endl; + cerr << "Error: Could not parse file '" << in << "'." << endl; exit(EXIT_FAILURE); } @@ -215,23 +218,23 @@ Interchunk::checkIndex(xmlNode *element, int index, int limit) { if(index >= limit) { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": index >= limit" << endl; + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index >= limit" << endl; return false; } if(index < 0) { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": index < 0" << endl; + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index < 0" << endl; return false; } if(word[index] == 0) { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": Null access at word[index]" << endl; + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": Null access at word[index]" << endl; return false; } return true; } -string +UString Interchunk::evalString(xmlNode *element) { if (element == 0) @@ -249,9 +252,9 @@ Interchunk::evalString(xmlNode *element) case ti_clip_tl: if(checkIndex(element, ti.getPos(), lword)) { - if(ti.getContent() == "content") // jacob's new 'part' + if(ti.getContent() == "content"_u) // jacob's new 'part' { - string wf = word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + UString wf = word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); return wf.substr(1, wf.length()-2); // trim away the { and } } else @@ -271,7 +274,7 @@ Interchunk::evalString(xmlNode *element) case ti_b: if(!blank_queue.empty()) { - string retblank = blank_queue.front(); + UString retblank = blank_queue.front(); if(in_out) { @@ -282,7 +285,7 @@ Interchunk::evalString(xmlNode *element) } else { - return " "; + return " "_u; } break; @@ -302,21 +305,21 @@ Interchunk::evalString(xmlNode *element) break; default: - return ""; + return ""_u; } - return ""; + return ""_u; } if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL; + UString part; for(xmlAttr *i = element->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { @@ -324,27 +327,27 @@ Interchunk::evalString(xmlNode *element) } } - evalStringCache[element] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL); + evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL); } else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) { evalStringCache[element] = TransferInstr(ti_lit_tag, - tags((const char *) element->properties->children->content), 0); + tags(to_ustring((const char *) element->properties->children->content)), 0); } else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) { - evalStringCache[element] = TransferInstr(ti_lit, ((const char *) element->properties->children->content), 0); + evalStringCache[element] = TransferInstr(ti_lit, to_ustring((const char *) element->properties->children->content), 0); } else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) { if(element->properties == NULL) { - evalStringCache[element] = TransferInstr(ti_b, " ", -1); + evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); } else { int pos = atoi((const char *) element->properties->children->content) - 1; - evalStringCache[element] = TransferInstr(ti_b, "", pos); + evalStringCache[element] = TransferInstr(ti_b, ""_u, pos); } } else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) @@ -360,34 +363,34 @@ Interchunk::evalString(xmlNode *element) } } - evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param); + evalStringCache[element] = TransferInstr(ti_get_case_from, "lem"_u, pos, param); } else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) { - evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0); + evalStringCache[element] = TransferInstr(ti_var, to_ustring((const char *) element->properties->children->content), 0); } else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) { int pos = 0; - xmlChar *part = NULL; + UString part; for(xmlAttr *i = element->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { - pos = atoi((const char *) i->children->content) - 1; + pos = atoi((const char *) i->children->content) - 1; } } - evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos); + evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); } else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) { - string value; + UString value; for(xmlNode *i = element->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) @@ -403,7 +406,7 @@ Interchunk::evalString(xmlNode *element) } else { - wcerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; + cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; exit(EXIT_FAILURE); } @@ -421,11 +424,11 @@ Interchunk::processOut(xmlNode *localroot) { if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) { - fputws_unlocked(UtfConverter::fromUtf8(processChunk(i)).c_str(), output); + write(processChunk(i), output); } else // 'b' { - fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output); + write(evalString(i), output); } } } @@ -433,11 +436,11 @@ Interchunk::processOut(xmlNode *localroot) in_out = false; } -string +UString Interchunk::processChunk(xmlNode *localroot) { - string result; - result.append("^"); + UString result; + result.append("^"_u); for(xmlNode *i = localroot->children; i != NULL; i = i->next) { @@ -447,7 +450,7 @@ Interchunk::processChunk(xmlNode *localroot) } } - result.append("$"); + result.append("$"_u); return result; } @@ -491,12 +494,12 @@ Interchunk::processLet(xmlNode *localroot) { if(leftSide == NULL) { - leftSide = i; + leftSide = i; } else { - rightSide = i; - break; + rightSide = i; + break; } } } @@ -516,7 +519,7 @@ Interchunk::processLet(xmlNode *localroot) bool match = word[ti.getPos()]->setChunkPart(attr_items[ti.getContent()], evalString(rightSide)); if(!match && trace) { - wcerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; } } return; @@ -527,36 +530,36 @@ Interchunk::processLet(xmlNode *localroot) } if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { - string const val = (const char *) leftSide->properties->children->content; + UString const val = to_ustring((const char *) leftSide->properties->children->content); variables[val] = evalString(rightSide); evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); } else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL; + UString part; for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { - pos = atoi((const char *) i->children->content) - 1; + pos = atoi((const char *) i->children->content) - 1; } } - bool match = word[pos]->setChunkPart(attr_items[(const char *) part], + bool match = word[pos]->setChunkPart(attr_items[part], evalString(rightSide)); if(!match && trace) { - wcerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; } evalStringCache[leftSide] = TransferInstr(ti_clip_tl, - (const char *) part, + part, pos, NULL); } } @@ -564,12 +567,12 @@ Interchunk::processLet(xmlNode *localroot) void Interchunk::processAppend(xmlNode *localroot) { - string name; + UString name; for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "n")) { - name = (char *) i->children->content; + name = to_ustring((char *) i->children->content); break; } } @@ -607,31 +610,31 @@ Interchunk::processModifyCase(xmlNode *localroot) if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL; + UString part; for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { - pos = atoi((const char *) i->children->content) - 1; + pos = atoi((const char *) i->children->content) - 1; } } - string const result = copycase(evalString(rightSide), - word[pos]->chunkPart(attr_items[(const char *) part])); - bool match = word[pos]->setChunkPart(attr_items[(const char *) part], result); + UString const result = copycase(evalString(rightSide), + word[pos]->chunkPart(attr_items[part])); + bool match = word[pos]->setChunkPart(attr_items[part], result); if(!match && trace) { - wcerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-interchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; } } else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { - string const val = (const char *) leftSide->properties->children->content; + UString const val = to_ustring((const char *) leftSide->properties->children->content); variables[val] = copycase(evalString(rightSide), variables[val]); } } @@ -639,7 +642,7 @@ Interchunk::processModifyCase(xmlNode *localroot) void Interchunk::processCallMacro(xmlNode *localroot) { - const char *n = (const char *) localroot->properties->children->content; + UString n = to_ustring((const char *) localroot->properties->children->content); int npar = 0; xmlNode *macro = macro_map[macros[n]]; @@ -791,7 +794,7 @@ bool Interchunk::processIn(xmlNode *localroot) { xmlNode *value = NULL; - xmlChar *idlist = NULL; + UString idlist; for(xmlNode *i = localroot->children; i != NULL; i = i->next) { @@ -799,44 +802,30 @@ Interchunk::processIn(xmlNode *localroot) { if(value == NULL) { - value = i; + value = i; } else { - idlist = i->properties->children->content; - break; + idlist = to_ustring((char*)i->properties->children->content); + break; } } } - string sval = evalString(value); + UString sval = evalString(value); if(localroot->properties != NULL) { if(!xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) { - set &myset = listslow[(const char *) idlist]; - if(myset.find(tolower(sval)) != myset.end()) - { - return true; - } - else - { - return false; - } + set &myset = listslow[idlist]; + return (myset.find(tolower(sval)) != myset.end()); } } - set &myset = lists[(const char *) idlist]; - if(myset.find(sval) != myset.end()) - { - return true; - } - else - { - return false; - } + set &myset = lists[idlist]; + return (myset.find(sval) != myset.end()); } bool @@ -935,7 +924,7 @@ Interchunk::processEqual(xmlNode *localroot) } bool -Interchunk::beginsWith(string const &s1, string const &s2) const +Interchunk::beginsWith(UString const &s1, UString const &s2) const { int const limit = s2.size(), constraint = s1.size(); @@ -955,7 +944,7 @@ Interchunk::beginsWith(string const &s1, string const &s2) const } bool -Interchunk::endsWith(string const &s1, string const &s2) const +Interchunk::endsWith(UString const &s1, UString const &s2) const { int const limit = s2.size(), constraint = s1.size(); @@ -1074,21 +1063,21 @@ Interchunk::processBeginsWithList(xmlNode *localroot) } } - xmlChar *idlist = second->properties->children->content; - string needle = evalString(first); - set::iterator it, limit; + UString idlist = to_ustring((char*)second->properties->children->content); + UString needle = evalString(first); + set::iterator it, limit; if(localroot->properties == NULL || xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) { - it = lists[(const char *) idlist].begin(); - limit = lists[(const char *) idlist].end(); + it = lists[idlist].begin(); + limit = lists[idlist].end(); } else { needle = tolower(needle); - it = listslow[(const char *) idlist].begin(); - limit = listslow[(const char *) idlist].end(); + it = listslow[idlist].begin(); + limit = listslow[idlist].end(); } for(; it != limit; it++) @@ -1122,21 +1111,21 @@ Interchunk::processEndsWithList(xmlNode *localroot) } } - xmlChar *idlist = second->properties->children->content; - string needle = evalString(first); - set::iterator it, limit; + UString idlist = to_ustring((char*)second->properties->children->content); + UString needle = evalString(first); + set::iterator it, limit; if(localroot->properties == NULL || xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) { - it = lists[(const char *) idlist].begin(); - limit = lists[(const char *) idlist].end(); + it = lists[idlist].begin(); + limit = lists[idlist].end(); } else { needle = tolower(needle); - it = listslow[(const char *) idlist].begin(); - limit = listslow[(const char *) idlist].end(); + it = listslow[idlist].begin(); + limit = listslow[idlist].end(); } for(; it != limit; it++) @@ -1172,28 +1161,28 @@ Interchunk::processContainsSubstring(xmlNode *localroot) if(localroot->properties == NULL) { - return evalString(first).find(evalString(second)) != string::npos; + return evalString(first).find(evalString(second)) != UString::npos; } else { if(!xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) { - return tolower(evalString(first)).find(tolower(evalString(second))) != string::npos; + return tolower(evalString(first)).find(tolower(evalString(second))) != UString::npos; } else { - return evalString(first).find(evalString(second)) != string::npos; + return evalString(first).find(evalString(second)) != UString::npos; } } } -string -Interchunk::copycase(string const &source_word, string const &target_word) +UString +Interchunk::copycase(UString const &source_word, UString const &target_word) { - wstring result; - wstring const s_word = UtfConverter::fromUtf8(source_word); - wstring const t_word = UtfConverter::fromUtf8(target_word); + UString result; + UString const s_word = source_word; + UString const t_word = target_word; bool firstupper = iswupper(s_word[0]); bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); @@ -1213,62 +1202,60 @@ Interchunk::copycase(string const &source_word, string const &target_word) result[0] = towupper(result[0]); } - return UtfConverter::toUtf8(result); + return result; } -string -Interchunk::caseOf(string const &str) +UString +Interchunk::caseOf(UString const &s) { - wstring const s = UtfConverter::fromUtf8(str); - if(s.size() > 1) { if(!iswupper(s[0])) { - return "aa"; + return "aa"_u; } else if(!iswupper(s[s.size()-1])) { - return "Aa"; + return "Aa"_u; } else { - return "AA"; + return "AA"_u; } } else if(s.size() == 1) { if(!iswupper(s[0])) { - return "aa"; + return "aa"_u; } else { - return "Aa"; + return "Aa"_u; } } else { - return "aa"; + return "aa"_u; } } -string -Interchunk::tolower(string const &str) const +UString +Interchunk::tolower(UString const &str) const { - return UtfConverter::toUtf8(StringUtils::tolower(UtfConverter::fromUtf8(str))); + return StringUtils::tolower(str); } -string -Interchunk::tags(string const &str) const +UString +Interchunk::tags(UString const &str) const { - string result = "<"; + UString result = "<"_u; for(unsigned int i = 0, limit = str.size(); i != limit; i++) { if(str[i] == '.') { - result.append("><"); + result.append("><"_u); } else { @@ -1295,98 +1282,83 @@ Interchunk::processRule(xmlNode *localroot) while(!blank_queue.empty()) //flush remaining blanks that are not spaces { - if(blank_queue.front().compare(" ") != 0) - { - fputws_unlocked(UtfConverter::fromUtf8(blank_queue.front()).c_str(), output); + if(blank_queue.front().compare(" "_u) != 0) { + write(blank_queue.front(), output); } blank_queue.pop(); } } TransferToken & -Interchunk::readToken(FILE *in) +Interchunk::readToken(InputFile& in) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wstring content; + UString content; while(true) { - int val = fgetwc_unlocked(in); - if(feof(in) || (internal_null_flush && val == 0)) + int val = in.get(); + if(in.eof() || (internal_null_flush && val == 0)) { return input_buffer.add(TransferToken(content, tt_eof)); } - if(val == L'\\') + if(val == '\\') { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += '\\'; + content += in.get(); } - else if(val == L'[') + else if(val == '[') { - content += L'['; + content += '['; while(true) { - int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') - { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); - } - else if(val2 == L']') - { - content += L']'; - break; - } - else - { - content += wchar_t(val2); - } + UChar32 val2 = in.get(); + if(val2 == '\\') { + content += '\\'; + content += in.get(); + } else if(val2 == ']') { + content += ']'; + break; + } else { + content += val2; + } } } - else if(inword && val == L'{') - { - content += L'{'; - while(true) - { - int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') - { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); - } - else if(val2 == L'}') - { - wint_t val3 = wchar_t(fgetwc_unlocked(in)); - ungetwc(val3, in); - - content += L'}'; - if(val3 == L'$') - { - break; - } - } - else - { - content += wchar_t(val2); - } + else if(inword && val == '{') { + content += '{'; + while(true) { + UChar32 val2 = in.get(); + if(val2 == '\\') { + content += '\\'; + content += in.get(); + } else if(val2 == '}') { + UChar32 val3 = in.peek(); + + content += '}'; + if(val3 == '$') { + break; + } + } else { + content += val2; + } } } - else if(inword && val == L'$') + else if(inword && val == '$') { inword = false; return input_buffer.add(TransferToken(content, tt_word)); } - else if(val == L'^') + else if(val == '^') { inword = true; return input_buffer.add(TransferToken(content, tt_blank)); } else { - content += wchar_t(val); + content += val; } } } @@ -1410,20 +1382,15 @@ Interchunk::setTrace(bool trace) } void -Interchunk::interchunk_wrapper_null_flush(FILE *in, FILE *out) +Interchunk::interchunk_wrapper_null_flush(InputFile& in, UFILE* out) { null_flush = false; internal_null_flush = true; - while(!feof(in)) - { + while(!in.eof()) { interchunk(in, out); - fputwc_unlocked(L'\0', out); - int code = fflush(out); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', out); + u_fflush(out); } internal_null_flush = false; null_flush = true; @@ -1431,7 +1398,7 @@ Interchunk::interchunk_wrapper_null_flush(FILE *in, FILE *out) void -Interchunk::interchunk(FILE *in, FILE *out) +Interchunk::interchunk(InputFile& in, UFILE* out) { if(getNullFlush()) { @@ -1456,9 +1423,7 @@ Interchunk::interchunk(FILE *in, FILE *out) { if(tmpword.size() != 0) { - fputwc_unlocked(L'^', output); - fputws_unlocked(tmpword[0]->c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "^%S$", tmpword[0]->c_str()); tmpword.clear(); input_buffer.setPos(last); input_buffer.next(); @@ -1467,7 +1432,7 @@ Interchunk::interchunk(FILE *in, FILE *out) } else if(tmpblank.size() != 0) { - fputws_unlocked(tmpblank[0]->c_str(), output); + write(*tmpblank[0], output); tmpblank.clear(); last = input_buffer.getPos(); ms.init(me->getInitial()); @@ -1485,16 +1450,11 @@ Interchunk::interchunk(FILE *in, FILE *out) if(trace) { - wcerr << endl << L"apertium-interchunk: Rule " << val << L" line " << lastrule_line << L" "; - for (unsigned int ind = 0; ind < tmpword.size(); ind++) - { - if (ind != 0) - { - wcerr << L" "; - } - fputws_unlocked(tmpword[ind]->c_str(), stderr); + cerr << endl << "apertium-interchunk: Rule " << val << " line " << lastrule_line; + for (auto& it : tmpword) { + cerr << " " << *it; } - wcerr << endl; + cerr << endl; } } @@ -1508,7 +1468,7 @@ Interchunk::interchunk(FILE *in, FILE *out) break; case tt_blank: - ms.step(L' '); + ms.step(' '); tmpblank.push_back(¤t.getContent()); break; @@ -1520,14 +1480,14 @@ Interchunk::interchunk(FILE *in, FILE *out) } else { - fputws_unlocked(current.getContent().c_str(), output); + write(current.getContent(), output); tmpblank.clear(); return; } break; default: - wcerr << "Error: Unknown input token." << endl; + cerr << "Error: Unknown input token." << endl; return; } } @@ -1549,12 +1509,12 @@ Interchunk::applyRule() { if(int(blank_queue.size()) < last_lword - 1) { - string blank_to_add = string(UtfConverter::toUtf8(*tmpblank[i-1])); + UString blank_to_add = UString(*tmpblank[i-1]); blank_queue.push(blank_to_add); } } - word[i] = new InterchunkWord(UtfConverter::toUtf8(*tmpword[i])); + word[i] = new InterchunkWord(*tmpword[i]); } processRule(lastrule); @@ -1576,22 +1536,22 @@ Interchunk::applyRule() } void -Interchunk::applyWord(wstring const &word_str) +Interchunk::applyWord(UString const &word_str) { - ms.step(L'^'); + ms.step('^'); for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) { switch(word_str[i]) { - case L'\\': + case '\\': i++; ms.step(towlower(word_str[i]), any_char); break; - case L'<': + case '<': for(unsigned int j = i+1; j != limit; j++) { - if(word_str[j] == L'>') + if(word_str[j] == '>') { int symbol = alphabet(word_str.substr(i, j-i+1)); if(symbol) @@ -1608,8 +1568,8 @@ Interchunk::applyWord(wstring const &word_str) } break; - case L'{': // ignore the unmodifiable part of the chunk - ms.step(L'$'); + case '{': // ignore the unmodifiable part of the chunk + ms.step('$'); return; default: @@ -1617,5 +1577,5 @@ Interchunk::applyWord(wstring const &word_str) break; } } - ms.step(L'$'); + ms.step('$'); } diff --git a/apertium/interchunk.h b/apertium/interchunk.h index 6efbf45..dc940f8 100644 --- a/apertium/interchunk.h +++ b/apertium/interchunk.h @@ -23,9 +23,10 @@ #include #include #include -#include +#include #include #include +#include #include #include @@ -45,25 +46,25 @@ private: Alphabet alphabet; MatchExe *me; MatchState ms; - map attr_items; - map variables; - map macros; - map, Ltstr> lists; - map, Ltstr> listslow; + map attr_items; + map variables; + map macros; + map> lists; + map> listslow; vector macro_map; vector rule_map; vector rule_lines; xmlDoc *doc; xmlNode *root_element; InterchunkWord **word; - queue blank_queue; + queue blank_queue; int lword; int last_lword; Buffer input_buffer; - vector tmpword; - vector tmpblank; + vector tmpword; + vector tmpblank; - FILE *output; + UFILE* output; int any_char; int any_tag; @@ -79,11 +80,11 @@ private: void destroy(); void readData(FILE *input); - void readInterchunk(string const &input); + void readInterchunk(const char* input); void collectMacros(xmlNode *localroot); void collectRules(xmlNode *localroot); - string caseOf(string const &str); - string copycase(string const &source_word, string const &target_word); + UString caseOf(UString const &str); + UString copycase(UString const &source_word, UString const &target_word); void processLet(xmlNode *localroot); void processAppend(xmlNode *localroot); @@ -103,30 +104,30 @@ private: bool processNot(xmlNode *localroot); bool processIn(xmlNode *localroot); void processRule(xmlNode *localroot); - string evalString(xmlNode *localroot); + UString evalString(xmlNode *localroot); void processInstruction(xmlNode *localroot); void processChoose(xmlNode *localroot); - string processChunk(xmlNode *localroot); + UString processChunk(xmlNode *localroot); - bool beginsWith(string const &str1, string const &str2) const; - bool endsWith(string const &str1, string const &str2) const; - string tolower(string const &str) const; - string tags(string const &str) const; - string readWord(FILE *in); - string readBlank(FILE *in); - string readUntil(FILE *in, int const symbol) const; - void applyWord(wstring const &word_str); + bool beginsWith(UString const &str1, UString const &str2) const; + bool endsWith(UString const &str1, UString const &str2) const; + UString tolower(UString const &str) const; + UString tags(UString const &str) const; + UString readWord(InputFile& in); + UString readBlank(InputFile& in); + UString readUntil(InputFile& in, int const symbol) const; + void applyWord(UString const &word_str); void applyRule(); - TransferToken & readToken(FILE *in); + TransferToken & readToken(InputFile& in); bool checkIndex(xmlNode *element, int index, int limit); - void interchunk_wrapper_null_flush(FILE *in, FILE *out); + void interchunk_wrapper_null_flush(InputFile& in, UFILE* out); public: Interchunk(); ~Interchunk(); - void read(string const &transferfile, string const &datafile); - void interchunk(FILE *in, FILE *out); + void read(const char* transferfile, const char* datafile); + void interchunk(InputFile& in, UFILE* out); bool getNullFlush(void); void setNullFlush(bool null_flush); void setTrace(bool trace); diff --git a/apertium/interchunk_word.cc b/apertium/interchunk_word.cc index 28df1c5..50c948e 100644 --- a/apertium/interchunk_word.cc +++ b/apertium/interchunk_word.cc @@ -36,7 +36,7 @@ InterchunkWord::InterchunkWord() { } -InterchunkWord::InterchunkWord(string const &chunk) +InterchunkWord::InterchunkWord(UString const &chunk) { init(chunk); } @@ -63,7 +63,7 @@ InterchunkWord::operator =(InterchunkWord const &o) } void -InterchunkWord::init(string const &chunk) +InterchunkWord::init(UString const &chunk) { size_t b_end = 0; for(size_t i = 0; i < chunk.size(); i++) @@ -96,13 +96,13 @@ InterchunkWord::init(string const &chunk) { this->chunk = chunk; } - this->queue = ""; + this->queue.clear(); } -string +UString InterchunkWord::chunkPart(ApertiumRE const &part) { - string result = part.match(chunk); + UString result = part.match(chunk); if(result.size() == 0) { result = part.match(queue); @@ -125,14 +125,14 @@ InterchunkWord::chunkPart(ApertiumRE const &part) } } -string +UString InterchunkWord::getWblank() { return wblank; } bool -InterchunkWord::setChunkPart(ApertiumRE const &part, string const &value) +InterchunkWord::setChunkPart(ApertiumRE const &part, UString const &value) { return part.replace(chunk, value); } diff --git a/apertium/interchunk_word.h b/apertium/interchunk_word.h index 670bcca..bd0c5ed 100644 --- a/apertium/interchunk_word.h +++ b/apertium/interchunk_word.h @@ -21,6 +21,7 @@ #include #include #include +#include using namespace std; @@ -33,17 +34,17 @@ private: /** * Target language chunk name and tags */ - string chunk; + UString chunk; /** * Target language chunk content */ - string queue; + UString queue; /** * Wordbound blank (for postchunk) */ - string wblank; + UString wblank; /** * Copy method @@ -76,7 +77,7 @@ public: * Parametric constructor calling init() * @param chunk the chunk */ - InterchunkWord(string const &chunk); + InterchunkWord(UString const &chunk); /** * Assignment operator @@ -89,20 +90,20 @@ public: * Sets a chunk * @param chunk the chunk */ - void init(string const &chunk); + void init(UString const &chunk); /** * Reference a chunk part * @param part regular expression to match * @returns reference to the part of string matched */ - string chunkPart(ApertiumRE const &part); + UString chunkPart(ApertiumRE const &part); /** * Reference the wordbound blank (for postchunk) * @returns reference to the wblank string */ - string getWblank(); + UString getWblank(); /** * Sets a value for a chunk part @@ -110,7 +111,7 @@ public: * @param value the new value for the given part * @returns whether part matched */ - bool setChunkPart(ApertiumRE const &part, string const &value); + bool setChunkPart(ApertiumRE const &part, UString const &value); }; diff --git a/apertium/latex_accentsmap.cc b/apertium/latex_accentsmap.cc index 4e8dd9a..95a876b 100644 --- a/apertium/latex_accentsmap.cc +++ b/apertium/latex_accentsmap.cc @@ -100,7 +100,7 @@ void AccentsMap::init_camap() { } -wstring AccentsMap::get(wstring input){ +UString AccentsMap::get(UString input){ it = map.find(input); if(it == map.end()) return L""; diff --git a/apertium/latex_accentsmap.h b/apertium/latex_accentsmap.h index e93b6de..54214ec 100644 --- a/apertium/latex_accentsmap.h +++ b/apertium/latex_accentsmap.h @@ -27,7 +27,7 @@ using namespace std; /*struct Ltstr // Already in lttoolbox/ltstr.h { - bool operator()(wstring const &s1, wstring const &s2) const + bool operator()(UString const &s1, UString const &s2) const { return wcscmp(s1.c_str(), s2.c_str()) < 0; } @@ -35,7 +35,7 @@ using namespace std; */ class AccentsMap { - typedef std::map acmap; + typedef std::map acmap; private: acmap map; // Accent to character acmap::iterator it; // Iterator for searching @@ -50,6 +50,6 @@ class AccentsMap { void init_locale(); // The getter for both directions depending on init. - wstring get(wstring input); + UString get(UString input); }; diff --git a/apertium/lemma.cc b/apertium/lemma.cc index 9bb55f8..629870c 100644 --- a/apertium/lemma.cc +++ b/apertium/lemma.cc @@ -37,7 +37,7 @@ Lemma::Lemma(const Analysis &Analysis_) : TheLemma() { if (Analysis_.TheMorphemes.front().TheLemma.empty()) throw Exception::Morpheme::TheLemma_empty( "can't convert const Analysis & comprising Morpheme comprising empty " - "Lemma std::wstring to Lemma"); + "Lemma UString to Lemma"); TheLemma = Analysis_.TheMorphemes.front().TheLemma; } @@ -46,7 +46,7 @@ Lemma::Lemma(const Morpheme &Morpheme_) : TheLemma() { if (Morpheme_.TheLemma.empty()) throw Exception::Morpheme::TheLemma_empty("can't convert const Morpheme & " "comprising empty Lemma " - "std::wstring to Lemma"); + "UString to Lemma"); TheLemma = Morpheme_.TheLemma; } diff --git a/apertium/lemma.h b/apertium/lemma.h index 9655633..e056d74 100644 --- a/apertium/lemma.h +++ b/apertium/lemma.h @@ -29,7 +29,7 @@ public: Lemma(); Lemma(const Analysis &Analysis_); Lemma(const Morpheme &Morpheme_); - std::wstring TheLemma; + UString TheLemma; }; } diff --git a/apertium/lexical_unit.h b/apertium/lexical_unit.h index 8b7bf04..7599caa 100644 --- a/apertium/lexical_unit.h +++ b/apertium/lexical_unit.h @@ -24,7 +24,7 @@ namespace Apertium { class LexicalUnit { public: - std::wstring TheSurfaceForm; + UString TheSurfaceForm; std::vector TheAnalyses; }; } diff --git a/apertium/lswpost.cc b/apertium/lswpost.cc index c84870a..16f53ac 100644 --- a/apertium/lswpost.cc +++ b/apertium/lswpost.cc @@ -53,10 +53,10 @@ TaggerData& LSWPoST::get_tagger_data() { void LSWPoST::deserialise(FILE *Serialised_FILE_Tagger) { tdlsw.read(Serialised_FILE_Tagger); - eos = (tdlsw.getTagIndex())[L"TAG_SENT"]; + eos = (tdlsw.getTagIndex())["TAG_SENT"]; } -std::vector &LSWPoST::getArrayTags() { +std::vector &LSWPoST::getArrayTags() { return tdlsw.getArrayTags(); } @@ -64,7 +64,7 @@ void LSWPoST::serialise(FILE *Stream_) { tdlsw.write(Stream_); } void LSWPoST::deserialise(const TaggerData &Deserialised_FILE_Tagger) { tdlsw = TaggerDataLSW(Deserialised_FILE_Tagger); - eos = (tdlsw.getTagIndex())[L"TAG_SENT"]; + eos = (tdlsw.getTagIndex())["TAG_SENT"]; } void LSWPoST::init_probabilities_from_tagged_text_(MorphoStream &, MorphoStream &) { @@ -88,7 +88,7 @@ LSWPoST::LSWPoST(TaggerFlags& Flags_) : FILE_Tagger(Flags_) {} LSWPoST::LSWPoST(TaggerDataLSW t) { tdlsw = t; - eos = (tdlsw.getTagIndex())[L"TAG_SENT"]; + eos = (tdlsw.getTagIndex())["TAG_SENT"]; } LSWPoST::~LSWPoST() {} @@ -112,7 +112,7 @@ LSWPoST::init_probabilities(MorphoStream &morpho_stream) { int num_valid_seq = 0; word = new TaggerWord(); // word for tags left - word->add_tag(eos, L"sent", tdlsw.getPreferRules()); + word->add_tag(eos, "sent", tdlsw.getPreferRules()); tags_left = word->get_tags(); // tags left if (tags_left.size()==0) { //This is an unknown word tags_left = tdlsw.getOpenClass(); @@ -138,7 +138,7 @@ LSWPoST::init_probabilities(MorphoStream &morpho_stream) { // count each element of the para matrix while (word != NULL) { if (++nw % 10000 == 0) { - wcerr << L'.' << flush; + cerr << L'.' << flush; } tags_right = word->get_tags(); // tags right @@ -184,7 +184,7 @@ LSWPoST::init_probabilities(MorphoStream &morpho_stream) { } } - wcerr << L"\n"; + cerr << "\n"; } bool LSWPoST::is_valid_seq(TTag left, TTag mid, TTag right) { @@ -232,7 +232,7 @@ void LSWPoST::post_ambg_class_scan() { int N = (tdlsw.getTagIndex()).size(); int M = (tdlsw.getOutput()).size(); - wcerr << N << L" states and " << M < > > para_matrix_new(N, vector >(N, vector(N, 0))); word = new TaggerWord(); // word for tags left - word->add_tag(eos, L"sent", tdlsw.getPreferRules()); + word->add_tag(eos, "sent", tdlsw.getPreferRules()); tags_left = word->get_tags(); // tags left if (tags_left.size()==0) { //This is an unknown word tags_left = tdlsw.getOpenClass(); @@ -273,7 +273,7 @@ LSWPoST::train(MorphoStream &morpho_stream) { while (word) { if (++nw % 10000 == 0) { - wcerr << L'.' << flush; + cerr << L'.' << flush; } tags_right = word->get_tags(); // tags right @@ -320,11 +320,11 @@ LSWPoST::train(MorphoStream &morpho_stream) { void LSWPoST::print_para_matrix() { - wcout << L"para matrix D\n----------------------------\n"; + cout << "para matrix D\n----------------------------\n"; for (int i = 0; i < tdlsw.getN(); ++i) { for (int j = 0; j < tdlsw.getN(); ++j) { for (int k = 0; k < tdlsw.getN(); ++k) { - wcout << L"D[" << i << L"][" << j << L"][" << k << L"] = " + cout << "D[" << i << "][" << j << "][" << k << "] = " << tdlsw.getD()[i][j][k] << "\n"; } } @@ -332,14 +332,14 @@ LSWPoST::print_para_matrix() { } void -LSWPoST::tagger(MorphoStream &morpho_stream, FILE *Output) { +LSWPoST::tagger(MorphoStream &morpho_stream, UFILE* Output) { TaggerWord *word_left = NULL, *word_mid = NULL, *word_right = NULL; set tags_left, tags_mid, tags_right; set::iterator iter_left, iter_mid, iter_right; morpho_stream.setNullFlush(TheFlags.getNullFlush()); word_left = new TaggerWord(); // word left - word_left->add_tag(eos, L"sent", tdlsw.getPreferRules()); + word_left->add_tag(eos, "sent", tdlsw.getPreferRules()); word_left->set_show_sf(TheFlags.getShowSuperficial()); tags_left = word_left->get_tags(); // tags left @@ -357,7 +357,7 @@ LSWPoST::tagger(MorphoStream &morpho_stream, FILE *Output) { word_right = morpho_stream.get_next_word(); // word_right word_right->set_show_sf(TheFlags.getShowSuperficial()); - wstring micad; + UString micad; while (word_right) { tags_right = word_right->get_tags(); @@ -380,11 +380,11 @@ LSWPoST::tagger(MorphoStream &morpho_stream, FILE *Output) { } } - micad = word_mid->get_lexical_form(tag_max, (tdlsw.getTagIndex())[L"TAG_kEOF"]); - fputws_unlocked(micad.c_str(), Output); + micad = word_mid->get_lexical_form(tag_max, (tdlsw.getTagIndex())["TAG_kEOF"]); + write(micad, Output); if (morpho_stream.getEndOfFile()) { if (TheFlags.getNullFlush()) { - fputwc_unlocked(L'\0', Output); + u_fputc('\0', Output); } fflush(Output); morpho_stream.setEndOfFile(false); diff --git a/apertium/lswpost.h b/apertium/lswpost.h index 65d485e..7065cb1 100644 --- a/apertium/lswpost.h +++ b/apertium/lswpost.h @@ -57,7 +57,7 @@ protected: public: TaggerData& get_tagger_data(); void deserialise(FILE *Serialised_FILE_Tagger); - std::vector &getArrayTags(); + std::vector &getArrayTags(); void serialise(FILE *Stream_); void deserialise(const TaggerData &Deserialised_FILE_Tagger); void init_probabilities_from_tagged_text_(MorphoStream &, MorphoStream &); @@ -102,6 +102,6 @@ public: /** Do the tagging */ - void tagger(MorphoStream &morpho_stream, FILE *Output); + void tagger(MorphoStream &morpho_stream, UFILE *Output); }; #endif diff --git a/apertium/morpheme.cc b/apertium/morpheme.cc index c86c397..68e6b9c 100644 --- a/apertium/morpheme.cc +++ b/apertium/morpheme.cc @@ -29,35 +29,31 @@ bool operator<(const Morpheme &a, const Morpheme &b) { return a.TheTags < b.TheTags; } -std::wostream& operator<<(std::wostream& out, const Morpheme &morph) { +std::ostream& operator<<(std::ostream& out, const Morpheme &morph) { out << morph.TheLemma; - const std::vector &tags = morph.TheTags; - std::vector::const_iterator it = tags.begin(); - for (; it != tags.end(); it++) { - out << L"<" << it->TheTag << L">"; + for (auto& it : morph.TheTags) { + out << "<" << it.TheTag << ">"; } return out; } -Morpheme::operator std::wstring() const { +Morpheme::operator UString() const { if (TheTags.empty()) throw Exception::Morpheme::TheTags_empty("can't convert Morpheme " "comprising empty Tag std::vector " - "to std::wstring"); + "to UString"); if (TheLemma.empty()) throw Exception::Morpheme::TheLemma_empty("can't convert Morpheme " "comprising empty TheLemma " - "std::wstring to std::wstring"); + "UString to UString"); - std::wstring wstring_ = TheLemma; + UString ustring_ = TheLemma; - for (std::vector::const_iterator Tag_ = TheTags.begin(); - // Call .end() each iteration to save memory. - Tag_ != TheTags.end(); ++Tag_) { - wstring_ += static_cast(*Tag_); + for (auto& Tag_ : TheTags) { + ustring_ += static_cast(Tag_); } - return wstring_; + return ustring_; } } diff --git a/apertium/morpheme.h b/apertium/morpheme.h index 88fdece..eb2c3d2 100644 --- a/apertium/morpheme.h +++ b/apertium/morpheme.h @@ -27,9 +27,9 @@ class Morpheme { public: friend bool operator==(const Morpheme &a, const Morpheme &b); friend bool operator<(const Morpheme &a, const Morpheme &b); - friend std::wostream& operator<<(std::wostream& out, const Morpheme &morph); - operator std::wstring() const; - std::wstring TheLemma; + friend std::ostream& operator<<(std::ostream& out, const Morpheme &morph); + operator UString() const; + UString TheLemma; std::vector TheTags; }; } diff --git a/apertium/mtx_reader.cc b/apertium/mtx_reader.cc index 076ac2f..994986a 100644 --- a/apertium/mtx_reader.cc +++ b/apertium/mtx_reader.cc @@ -96,38 +96,38 @@ void MTXReader::procCoarseTags() tsx_reader.read(tsx_fn); spec.coarse_tags = Optional( tsx_reader.getTaggerData()); - stepPastSelfClosingTag(L"coarse-tags"); + stepPastSelfClosingTag("coarse-tags"); } void MTXReader::procSetDef() { - std::wstring name = attrib(L"name"); + UString name = attrib("name"); stepToNextTag(); size_t set_idx = spec.set_consts.size(); spec.set_consts.push_back(VMSet()); VMSet &vm_set = spec.set_consts.back(); while (type != XML_READER_TYPE_END_ELEMENT) { - if (name == L"set-member") { + if (name == "set-member") { std::string tag = attrib("tag"); std::string str = attrib("str"); vm_set.insert(tag != "" ? tag : str); } else { - parseError(L"Expected set-member"); + parseError("Expected set-member"); } stepToNextTag(); } set_names[name] = set_idx; - assert(name == L"def-set"); + assert(name == "def-set"); stepToNextTag(); } void MTXReader::procStrDef() { - std::wstring name = attrib(L"name"); + UString name = attrib("name"); std::string tag = attrib("tag"); std::string str = attrib("str"); str_names[name] = pushStrConst(tag != "" ? tag : str); - stepPastSelfClosingTag(L"def-str"); + stepPastSelfClosingTag("def-str"); } void @@ -135,19 +135,19 @@ MTXReader::procDefns() { stepToNextTag(); while (type != XML_READER_TYPE_END_ELEMENT) { - if (name == L"def-set") { + if (name == "def-set") { procSetDef(); - } else if (name == L"def-str") { + } else if (name == "def-str") { procStrDef(); - } else if (name == L"def-macro") { + } else if (name == "def-macro") { procDefMacro(); - } else if (name == L"#text" || name == L"#comment") { + } else if (name == "#text" || name == "#comment") { // skip } else { unexpectedTag(); } } - assert(name == L"defns"); + assert(name == "defns"); stepToNextTag(); } @@ -157,7 +157,7 @@ MTXReader::procGlobalPred() cur_feat = &spec.global_pred; stepToNextTag(); procBoolExpr(); - assert(name == L"global-pred" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "global-pred" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); } @@ -202,50 +202,50 @@ MTXReader::procIntExpr(bool allow_fail) /* Self-closing tags */ if (!tryProcArg(INTEXPR, true) && !tryProcVar(VM::INTVAL)) { - if (name == L"sentlen") { + if (name == "sentlen") { emitOpcode(VM::SENTLENTOK); - stepPastSelfClosingTag(L"sentlen"); - } else if (name == L"pathlen") { + stepPastSelfClosingTag("sentlen"); + } else if (name == "pathlen") { emitOpcode(VM::SENTLENWRD); - stepPastSelfClosingTag(L"pathlen"); - } else if (name == L"tokaddr") { + stepPastSelfClosingTag("pathlen"); + } else if (name == "tokaddr") { emitOpcode(VM::PUSHTOKADDR); - stepPastSelfClosingTag(L"tokaddr"); - } else if (name == L"wrdidx") { + stepPastSelfClosingTag("tokaddr"); + } else if (name == "wrdidx") { emitOpcode(VM::PUSHWRDADDR); - stepPastSelfClosingTag(L"wrdidx"); - } else if (name == L"int") { + stepPastSelfClosingTag("wrdidx"); + } else if (name == "int") { emitOpcode(VM::PUSHINT); getAndEmitInt(); - stepPastSelfClosingTag(L"int"); + stepPastSelfClosingTag("int"); /* Other tags */ - } else if (name == L"add") { + } else if (name == "add") { stepToNextTag(); procIntExpr(); procIntExpr(); - assert(name == L"add" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "add" && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::ADD); stepToNextTag(); - } else if (name == L"toklen") { + } else if (name == "toklen") { procIntExpr(); - assert(name == L"toklen" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "toklen" && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::TOKLENWRD); stepToNextTag(); - } else if (name == L"strlen") { + } else if (name == "strlen") { procStrExpr(); - assert(name == L"strlen" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "strlen" && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::STRLEN); stepToNextTag(); - } else if (name == L"arrlen") { + } else if (name == "arrlen") { procStrArrExpr(); - assert(name == L"arrlen" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "arrlen" && type == XML_READER_TYPE_END_ELEMENT); procBinCompareOp(VM::ARRLEN); stepToNextTag(); } else { if (allow_fail) { return false; } - parseError(L"Expected an integer expression."); + parseError("Expected an integer expression."); } } return true; @@ -258,22 +258,22 @@ MTXReader::procStrArrExpr(bool allow_fail) if (!tryProcArg(STRARREXPR, true) && !tryProcVar(VM::STRARRVAL) && !tryProcSlice(&MTXReader::procStrArrExpr)) { - if (name == L"ex-tags") { + if (name == "ex-tags") { stepToNextTag(); procWordoidExpr(); assert(type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::EXTAGS); - } else if (name == L"ex-ambgset") { + } else if (name == "ex-ambgset") { stepToNextTag(); procIntExpr(); emitOpcode(VM::EXAMBGSET); - } else if (name == L"for-each") { + } else if (name == "for-each") { procForEach(STREXPR); } else { if (allow_fail) { return false; } - parseError(L"Expected a string list expression."); + parseError("Expected a string list expression."); } stepToNextTag(); } @@ -282,13 +282,13 @@ MTXReader::procStrArrExpr(bool allow_fail) bool MTXReader::tryProcSubscript(bool (MTXReader::*proc_inner)(bool)) { - if (name == L"subscript") { + if (name == "subscript") { int idx = getInt("idx"); stepToNextTag(); (this->*proc_inner)(false); emitOpcode(VM::SUBSCRIPT); emitUInt(idx); - assert(name == L"subscript" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "subscript" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); return true; } @@ -297,7 +297,7 @@ bool MTXReader::tryProcSubscript(bool (MTXReader::*proc_inner)(bool)) bool MTXReader::tryProcSlice(bool (MTXReader::*proc_inner)(bool)) { - if (name == L"slice") { + if (name == "slice") { stepToNextTag(); (this->*proc_inner)(false); bool exists; @@ -314,7 +314,7 @@ bool MTXReader::tryProcSlice(bool (MTXReader::*proc_inner)(bool)) } else { emitInt(0); } - assert(name == L"slice" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "slice" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); return true; } @@ -323,17 +323,17 @@ bool MTXReader::tryProcSlice(bool (MTXReader::*proc_inner)(bool)) bool MTXReader::tryProcArg(ExprType expr_type, bool allow_fail) { - if (name == L"var") { - std::wstring var_name = attrib(L"name"); + if (name == "var") { + UString var_name = attrib("name"); if (in_global_defn) { VarNVMap::const_iterator arg_name_it = template_arg_names.find(var_name); if (arg_name_it != template_arg_names.end()) { cur_replacements->push_back(make_pair(arg_name_it->second, expr_type)); - stepPastSelfClosingTag(L"var"); + stepPastSelfClosingTag("var"); return true; } if (!allow_fail) { - parseError(L"No such argument " + var_name); + parseError("No such argument " + var_name); } } } @@ -342,31 +342,31 @@ bool MTXReader::tryProcArg(ExprType expr_type, bool allow_fail) bool MTXReader::tryProcVar(VM::StackValueType svt) { - if (name == L"var") { - std::wstring var_name = attrib(L"name"); + if (name == "var") { + UString var_name = attrib("name"); VarNVMap::const_iterator slot_names_it = slot_names.find(var_name); if (slot_names_it != slot_names.end()) { if (slot_types[slot_names_it->second] != svt) { - parseError(L"Variable " + var_name + L" has the wrong type"); + parseError("Variable " + var_name + " has the wrong type"); } emitOpcode(VM::GETVAR); emitUInt(slot_names_it->second); - stepPastSelfClosingTag(L"var"); + stepPastSelfClosingTag("var"); return true; } - parseError(L"Variable " + var_name + L" has not been set."); - } else if (!in_global_defn && name == L"macro") { + parseError("Variable " + var_name + " has not been set."); + } else if (!in_global_defn && name == "macro") { // Get template data - std::wstring var_name = attrib(L"name"); + UString var_name = attrib("name"); VarNVMap::const_iterator template_name_it = template_slot_names.find(var_name); if (template_name_it == template_slot_names.end()) { - parseError(L"No such macro " + var_name); + parseError("No such macro " + var_name); } size_t templ_idx = template_name_it->second; if (template_slot_types[templ_idx] != svt) { - parseError(L"Macro " + var_name + L" returns the wrong type"); + parseError("Macro " + var_name + " returns the wrong type"); } std::pair &templ_defn = template_defns[templ_idx]; // Get arg values @@ -417,7 +417,7 @@ bool MTXReader::tryProcVar(VM::StackValueType svt) emitOpcode(VM::GETGVAR); emitUInt(templ_instcia_it->second); // Step past end - assert(name == L"macro" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "macro" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); return true; } @@ -431,19 +431,19 @@ MTXReader::procStrExpr(bool allow_fail) && !tryProcVar(VM::STRVAL) && !tryProcSlice(&MTXReader::procStrExpr) && !tryProcSubscript(&MTXReader::procStrArrExpr)) { - if (name == L"ex-surf") { + if (name == "ex-surf") { stepToNextTag(); procIntExpr(); emitOpcode(VM::EXTOKSURF); - } else if (name == L"ex-lemma") { + } else if (name == "ex-lemma") { stepToNextTag(); procWordoidExpr(); emitOpcode(VM::EXWRDLEMMA); - } else if (name == L"ex-coarse") { + } else if (name == "ex-coarse") { stepToNextTag(); procWordoidExpr(); emitOpcode(VM::EXWRDCOARSETAG); - } else if (name == L"join") { + } else if (name == "join") { bool has_attr; size_t str_idx = getStrRef(has_attr); if (!has_attr) { @@ -457,7 +457,7 @@ MTXReader::procStrExpr(bool allow_fail) if (allow_fail) { return false; } - parseError(L"Expected a string expression."); + parseError("Expected a string expression."); } assert(type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); @@ -470,95 +470,95 @@ MTXReader::procBoolExpr(bool allow_fail) { if (!tryProcArg(BEXPR, true) && !tryProcVar(VM::BVAL)) { - if (name == L"and") { + if (name == "and") { stepToNextTag(); procCommBoolOp(VM::AND); - assert(name == L"and" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "and" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"or") { + } else if (name == "or") { stepToNextTag(); procCommBoolOp(VM::OR); - assert(name == L"or" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "or" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"not") { + } else if (name == "not") { stepToNextTag(); procBoolExpr(); emitOpcode(VM::NOT); - assert(name == L"not" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "not" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"eq") { + } else if (name == "eq") { stepToNextTag(); procBinCompareOp(VM::EQ); - assert(name == L"eq" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "eq" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"neq") { + } else if (name == "neq") { stepToNextTag(); procBinCompareOp(VM::NEQ); - assert(name == L"neq" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "neq" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"lt") { + } else if (name == "lt") { stepToNextTag(); procBinCompareOp(VM::LT); - assert(name == L"lt" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "lt" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"lte") { + } else if (name == "lte") { stepToNextTag(); procBinCompareOp(VM::LTE); - assert(name == L"lte" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "lte" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"gt") { + } else if (name == "gt") { stepToNextTag(); procBinCompareOp(VM::GT); - assert(name == L"gt" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "gt" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"gte") { + } else if (name == "gte") { stepToNextTag(); procBinCompareOp(VM::GTE); - assert(name == L"gte" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "gte" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"streq") { + } else if (name == "streq") { size_t str_ref = getStrRef(); stepToNextTag(); procStrExpr(); emitOpcode(VM::STREQ); emitUInt(str_ref); - assert(name == L"streq" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "streq" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"strin") { + } else if (name == "strin") { size_t set_ref = getSetRef(); stepToNextTag(); procStrExpr(); emitOpcode(VM::STRIN); emitUInt(set_ref); - assert(name == L"strin" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "strin" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); /* Identical to strin? - } else if (name == L"sethas") { + } else if (name == "sethas") { stepToNextTag(); procStrExpr(); emitSetImmOp(VM::SETHAS); */ - } else if (name == L"sethasany") { + } else if (name == "sethasany") { size_t set_ref = getSetRef(); stepToNextTag(); procStrArrExpr(); emitOpcode(VM::SETHASANY); emitUInt(set_ref); - assert(name == L"sethasany" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "sethasany" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"sethasall") { + } else if (name == "sethasall") { size_t set_ref = getSetRef(); stepToNextTag(); procStrArrExpr(); emitOpcode(VM::SETHASALL); emitUInt(set_ref); - assert(name == L"sethasall" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "sethasall" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); } else { if (allow_fail) { return false; } - parseError(L"Expected a boolean expression."); + parseError("Expected a boolean expression."); } } return true; @@ -570,37 +570,37 @@ MTXReader::procAddrExpr() stepToTag(); /* Self-closing tags */ if (!tryProcArg(ADDREXPR)) { - if (name == L"wrdaddr") { + if (name == "wrdaddr") { emitOpcode(VM::PUSHADDR); - stepPastSelfClosingTag(L"wrdaddr"); + stepPastSelfClosingTag("wrdaddr"); /* Others */ - } else if (name == L"addr-of-ints") { + } else if (name == "addr-of-ints") { stepToNextTag(); procIntExpr(); procIntExpr(); - assert(name == L"addr-of-ints" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "addr-of-ints" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == L"add") { + } else if (name == "add") { stepToNextTag(); procAddrExpr(); procAddrExpr(); - assert(name == L"add" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "add" && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::ADD2); stepToNextTag(); - } else if (name == L"adjust") { + } else if (name == "adjust") { stepToNextTag(); procAddrExpr(); - assert(name == L"adjust" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "adjust" && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::ADJADDR); stepToNextTag(); - } else if (name == L"clamp") { + } else if (name == "clamp") { stepToNextTag(); procAddrExpr(); - assert(name == L"clamp" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "clamp" && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::CLAMPADDR); stepToNextTag(); } else { - parseError(L"Expected an address expression."); + parseError("Expected an address expression."); } } } @@ -611,18 +611,18 @@ MTXReader::procWordoidArrExpr(bool allow_fail) if (!tryProcArg(WRDARREXPR, true) && !tryProcVar(VM::WRDARRVAL) && !tryProcSlice(&MTXReader::procWordoidArrExpr)) { - if (name == L"ex-wordoids") { + if (name == "ex-wordoids") { stepToNextTag(); procIntExpr(); emitOpcode(VM::EXWRDARR); - assert(name == L"ex-wordoids" && type == XML_READER_TYPE_END_ELEMENT); - } else if (name == L"for-each") { + assert(name == "ex-wordoids" && type == XML_READER_TYPE_END_ELEMENT); + } else if (name == "for-each") { procForEach(WRDEXPR); } else { if (allow_fail) { return false; } - parseError(L"Expected a wordoid array expression."); + parseError("Expected a wordoid array expression."); } stepToNextTag(); } @@ -636,7 +636,7 @@ MTXReader::procWordoidExpr(bool allow_fail) if (!tryProcArg(WRDEXPR, true) && !tryProcVar(VM::WRDVAL) && !tryProcSubscript(&MTXReader::procWordoidArrExpr)) { - if (name == L"ex-wordoid") { + if (name == "ex-wordoid") { stepToNextTag(); procAddrExpr(); emitOpcode(VM::GETWRD); @@ -644,7 +644,7 @@ MTXReader::procWordoidExpr(bool allow_fail) if (allow_fail) { return false; } - parseError(L"Expected a wordoid expression."); + parseError("Expected a wordoid expression."); } assert(type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); @@ -657,26 +657,26 @@ MTXReader::procPred() { stepToNextTag(); procBoolExpr(); - assert(name == L"pred" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "pred" && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::DIEIFFALSE); stepToNextTag(); } size_t MTXReader::getConstRef( - const std::wstring &ref_attr, + const UString &ref_attr, const std::string &lit_attr, - const std::wstring &what, + const UString &what, VarNVMap &const_map, size_t (MTXReader::*push_new)(std::string&), bool& exists) { - std::wstring const_name = attrib(ref_attr); + UString const_name = attrib(ref_attr); if (!const_name.empty()) { exists = true; VarNVMap::iterator sit = const_map.find(const_name); if (sit == const_map.end()) { - parseError(L"No " + what + L" named " + const_name); + parseError("No " + what + " named " + const_name); } return sit->second; } @@ -692,7 +692,7 @@ MTXReader::getConstRef( size_t MTXReader::getSetRef(bool& exists) { - return getConstRef(L"name", "val", L"set", set_names, &MTXReader::pushSetConst, exists); + return getConstRef("name", "val", "set", set_names, &MTXReader::pushSetConst, exists); } size_t @@ -701,7 +701,7 @@ MTXReader::getSetRef() bool has_attr; size_t set_ref = getSetRef(has_attr); if (!has_attr) { - parseError(L"Set required"); + parseError("Set required"); } return set_ref; } @@ -709,7 +709,7 @@ MTXReader::getSetRef() size_t MTXReader::getStrRef(bool& exists) { - return getConstRef(L"name", "val", L"string", str_names, &MTXReader::pushStrConst, exists); + return getConstRef("name", "val", "string", str_names, &MTXReader::pushStrConst, exists); } size_t @@ -718,7 +718,7 @@ MTXReader::getStrRef() bool has_attr; size_t str_ref = getStrRef(has_attr); if (!has_attr) { - parseError(L"String required"); + parseError("String required"); } return str_ref; } @@ -750,7 +750,7 @@ MTXReader::getInt(std::string attr_name) bool has_attr; int i = getInt(attr_name, has_attr); if (!has_attr) { - parseError(L"String required"); + parseError("String required"); } return i; } @@ -764,12 +764,12 @@ MTXReader::getInt() template void MTXReader::emitAttr( - std::wstring what, GetT (MTXReader::*getter)(bool&), void (MTXReader::*emitter)(EmitT)) + UString what, GetT (MTXReader::*getter)(bool&), void (MTXReader::*emitter)(EmitT)) { bool has_attr = false; GetT val = (this->*getter)(has_attr); if (!has_attr) { - parseError(what + L" required"); + parseError(what + " required"); } (this->*emitter)(val); } @@ -777,19 +777,19 @@ MTXReader::emitAttr( void MTXReader::getAndEmitStrRef() { - emitAttr(L"String", &MTXReader::getStrRef, &MTXReader::emitUInt); + emitAttr("String", &MTXReader::getStrRef, &MTXReader::emitUInt); } void MTXReader::getAndEmitSetRef() { - emitAttr(L"Set", &MTXReader::getSetRef, &MTXReader::emitUInt); + emitAttr("Set", &MTXReader::getSetRef, &MTXReader::emitUInt); } void MTXReader::getAndEmitInt() { - emitAttr(L"Integer", &MTXReader::getInt, &MTXReader::emitInt); + emitAttr("Integer", &MTXReader::getInt, &MTXReader::emitInt); } void @@ -809,7 +809,7 @@ MTXReader::procInst() val = getInt(has_int_lit); int num_operands = has_set_ref + has_str_ref + has_int_lit; if (num_operands > 1) { - parseError(L"Opcodes can have at most one operand."); + parseError("Opcodes can have at most one operand."); } else if (num_operands == 1) { if (has_int_lit) { emitInt(val); @@ -837,10 +837,10 @@ MTXReader::procOut() has_expr = true; } if (!has_expr) { - parseError(L"Expected a string, bool or int expression."); + parseError("Expected a string, bool or int expression."); } stepToTag(); - assert(name == L"out" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "out" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); } @@ -850,21 +850,21 @@ MTXReader::procOutMany() stepToNextTag(); procStrArrExpr(); emitOpcode(VM::FCATSTRARR); - assert(name == L"out-many" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "out-many" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); } void MTXReader::printTmplDefn(const TemplateDefn &tmpl_defn) { - PerceptronSpec::printFeature(std::wcerr, tmpl_defn.first); + PerceptronSpec::printFeature(std::cerr, tmpl_defn.first); if (tmpl_defn.second.size() > 0) { - std::wcerr << "Replacements:\n"; + std::cerr << "Replacements:\n"; TemplateReplacements::const_iterator it = tmpl_defn.second.begin(); for (; it != tmpl_defn.second.end(); it++) { - std::wcerr << "Index: " << it->first << " "; + std::cerr << "Index: " << it->first << " "; printTypeExpr(it->second); - std::wcerr << "\n"; + std::cerr << "\n"; } } } @@ -874,22 +874,22 @@ MTXReader::printStackValueType(VM::StackValueType svt) { switch (svt) { case VM::INTVAL: - std::wcerr << "INT"; + std::cerr << "INT"; break; case VM::BVAL: - std::wcerr << "BOOL"; + std::cerr << "BOOL"; break; case VM::STRVAL: - std::wcerr << "STR"; + std::cerr << "STR"; break; case VM::STRARRVAL: - std::wcerr << "STRARR"; + std::cerr << "STRARR"; break; case VM::WRDVAL: - std::wcerr << "WRD"; + std::cerr << "WRD"; break; case VM::WRDARRVAL: - std::wcerr << "WRDARR"; + std::cerr << "WRDARR"; break; default: throw 1; @@ -901,29 +901,29 @@ MTXReader::printTypeExpr(ExprType expr_type) { switch (expr_type) { case VOIDEXPR: - std::wcerr << "VOID"; + std::cerr << "VOID"; break; case INTEXPR: - std::wcerr << "INT"; + std::cerr << "INT"; break; case BEXPR: - std::wcerr << "BOOL"; + std::cerr << "BOOL"; break; case STREXPR: - std::wcerr << "STR"; + std::cerr << "STR"; procStrExpr(); break; case STRARREXPR: - std::wcerr << "STRARR"; + std::cerr << "STRARR"; break; case WRDEXPR: - std::wcerr << "WRD"; + std::cerr << "WRD"; break; case WRDARREXPR: - std::wcerr << "WRDARR"; + std::cerr << "WRDARR"; break; case ADDREXPR: - std::wcerr << "ADDR"; + std::cerr << "ADDR"; break; default: throw 1; @@ -966,9 +966,9 @@ MTXReader::procTypeExpr(ExprType expr_type) void MTXReader::procForEach(ExprType expr_type) { - std::wstring var_name = attrib(L"as"); - if (var_name == L"") { - parseError(L"'as' attribute required for for-each."); + UString var_name = attrib("as"); + if (var_name == "") { + parseError("'as' attribute required for for-each."); } size_t slot_idx = slot_counter++; slot_names[var_name] = slot_idx; @@ -983,7 +983,7 @@ MTXReader::procForEach(ExprType expr_type) has_expr = true; } if (!has_expr) { - parseError(L"Expected a string array or wordoid array expression."); + parseError("Expected a string array or wordoid array expression."); } emitOpcode(VM::FOREACHINIT); @@ -1021,21 +1021,21 @@ bool MTXReader::procVoidExpr(bool allow_fail) { stepToTag(); - if (name == L"pred") { + if (name == "pred") { procPred(); - } else if (name == L"out") { + } else if (name == "out") { procOut(); - } else if (name == L"out-many") { + } else if (name == "out-many") { procOutMany(); - } else if (name == L"for-each") { + } else if (name == "for-each") { procForEach(VOIDEXPR); - } else if (name == L"inst") { + } else if (name == "inst") { procInst(); } else { if (allow_fail) { return false; } - parseError(L"Expected a void expression."); + parseError("Expected a void expression."); } return true; } @@ -1049,20 +1049,20 @@ MTXReader::procDefMacro() cur_feat = &template_defns.back().first; cur_replacements = &template_defns.back().second; - std::wstring var_name = attrib(L"as"); - if (var_name == L"") { - parseError(L"'as' attribute required for def-macro."); + UString var_name = attrib("as"); + if (var_name == "") { + parseError("'as' attribute required for def-macro."); } template_slot_names[var_name] = template_slot_counter; template_arg_names.clear(); - std::wstring args = attrib(L"args"); + UString args = attrib("args"); std::wistringstream args_ss(args); size_t arg_i = 0; for (; !args_ss.eof(); arg_i++) { - wstring arg_name; + UString arg_name; args_ss >> arg_name; - if (arg_name == L"") { + if (arg_name == "") { break; } template_arg_names[arg_name] = arg_i; @@ -1095,9 +1095,9 @@ MTXReader::procDefMacro() has_expr = true; } if (!has_expr) { - parseError(L"Expected a non-void expression."); + parseError("Expected a non-void expression."); } - assert(name == L"def-macro" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "def-macro" && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); template_slot_counter++; @@ -1114,7 +1114,7 @@ MTXReader::procFeat() while (type != XML_READER_TYPE_END_ELEMENT) { procVoidExpr(); } - assert(name == L"feat"); + assert(name == "feat"); stepToNextTag(); } @@ -1123,13 +1123,13 @@ MTXReader::procFeats() { stepToNextTag(); while (type != XML_READER_TYPE_END_ELEMENT) { - if (name == L"feat") { + if (name == "feat") { procFeat(); } else { unexpectedTag(); } } - assert(name == L"feats"); + assert(name == "feats"); stepToNextTag(); } @@ -1138,7 +1138,7 @@ MTXReader::printTmplDefns() { std::vector::const_iterator it = template_defns.begin(); for (; it != template_defns.end(); it++) { - std::wcerr << " Macro " << it - template_defns.begin() << "\n"; + std::cerr << " Macro " << it - template_defns.begin() << "\n"; printTmplDefn(*it); } } @@ -1151,14 +1151,14 @@ MTXReader::parse() if (type == XML_READER_TYPE_DOCUMENT_TYPE) { stepToNextTag(); } - if (name != L"metatag") { - parseError(L"expected tag"); + if (name != "metatag") { + parseError("expected tag"); } stepToNextTag(); - if (name == L"coarse-tags") { + if (name == "coarse-tags") { procCoarseTags(); } - if (name == L"beam-width") { + if (name == "beam-width") { size_t val; std::istringstream val_ss(attrib("val")); val_ss >> val; @@ -1166,15 +1166,15 @@ MTXReader::parse() } else { spec.beam_width = 4; } - if (name == L"defns") { + if (name == "defns") { procDefns(); } - if (name == L"global-pred") { + if (name == "global-pred") { procGlobalPred(); } - if (name == L"feats") { + if (name == "feats") { procFeats(); } - assert(name == L"metatag" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "metatag" && type == XML_READER_TYPE_END_ELEMENT); } } diff --git a/apertium/mtx_reader.h b/apertium/mtx_reader.h index 6a6d1af..3474697 100644 --- a/apertium/mtx_reader.h +++ b/apertium/mtx_reader.h @@ -44,7 +44,7 @@ class MTXReader : public XMLReader }; typedef PerceptronSpec VM; - typedef std::map VarNVMap; + typedef std::map VarNVMap; typedef std::vector > TemplateReplacements; typedef std::map >, size_t> InstanciationMap; typedef std::pair TemplateDefn; @@ -59,8 +59,8 @@ protected: private: size_t pushSetConst(std::string &val); size_t pushStrConst(std::string &val); - size_t getConstRef(const std::wstring &ref_attr, const std::string &lit_attr, - const std::wstring &what, VarNVMap &const_map, + size_t getConstRef(const UString &ref_attr, const std::string &lit_attr, + const UString &what, VarNVMap &const_map, size_t (MTXReader::*push_new)(std::string&), bool& exists); size_t getSetRef(bool& exists); size_t getSetRef(); @@ -109,7 +109,7 @@ private: void procForEach(ExprType type); void procPred(); template void emitAttr( - std::wstring what, GetT (MTXReader::*getter)(bool&), + UString what, GetT (MTXReader::*getter)(bool&), void (MTXReader::*emitter)(EmitT)); void getAndEmitStrRef(); void getAndEmitSetRef(); diff --git a/apertium/perceptron_spec.cc b/apertium/perceptron_spec.cc index 89a9d73..54e8268 100644 --- a/apertium/perceptron_spec.cc +++ b/apertium/perceptron_spec.cc @@ -55,8 +55,8 @@ const std::string PerceptronSpec::type_names[] = { }; static Morpheme make_sentinel_wordoid( - const std::wstring &lemma_str, - const std::wstring &tag_str) { + const UString &lemma_str, + const UString &tag_str) { Morpheme morpheme; morpheme.TheLemma = lemma_str; Tag tag; @@ -66,17 +66,17 @@ static Morpheme make_sentinel_wordoid( } static std::vector make_sentinel_wordoids( - const std::wstring &lemma_str, - const std::wstring &tag_str) { + const UString &lemma_str, + const UString &tag_str) { std::vector morphemes; morphemes.push_back(make_sentinel_wordoid(lemma_str, tag_str)); return morphemes; } static LexicalUnit make_sentinel_token( - const std::wstring &surf, - const std::wstring &lemma_str, - const std::wstring &tag_str) { + const UString &surf, + const UString &lemma_str, + const UString &tag_str) { Analysis analy; analy.TheMorphemes = make_sentinel_wordoids(lemma_str, tag_str); LexicalUnit lu; @@ -92,9 +92,9 @@ PerceptronSpec::PerceptronSpec() { opcode_values[opcode_names[i]] = (Opcode)i; } - untagged_sentinel = make_sentinel_wordoids(L"!UNTAGGED!", L"!UT!"); - token_wordoids_underflow = make_sentinel_token(L"!SURF_UNDERFLOW!", L"!TOK_UNDERFLOW!", L"!TUF!"); - token_wordoids_overflow = make_sentinel_token(L"!SURF_OVERFLOW!", L"!TOK_OVERFLOW!", L"!TOF!"); + untagged_sentinel = make_sentinel_wordoids("!UNTAGGED!", "!UT!"); + token_wordoids_underflow = make_sentinel_token("!SURF_UNDERFLOW!", "!TOK_UNDERFLOW!", "!TUF!"); + token_wordoids_overflow = make_sentinel_token("!SURF_OVERFLOW!", "!TOK_OVERFLOW!", "!TOF!"); static_constructed = true; } @@ -289,14 +289,14 @@ subscript(std::vector vec, int idx) { void PerceptronSpec::Machine::traceMachineState() { - std::wcerr << "pc: " << bytecode_iter - feat.begin() << "\n"; - std::wcerr << "peek: "; - std::wcerr << *bytecode_iter; + std::cerr << "pc: " << bytecode_iter - feat.begin() << "\n"; + std::cerr << "peek: "; + std::cerr << *bytecode_iter; if (*bytecode_iter < num_opcodes) { - std::wcerr << " (" << opcode_names[*bytecode_iter].c_str() << ")"; + std::cerr << " (" << opcode_names[*bytecode_iter].c_str() << ")"; } - std::wcerr << "\n"; - std::wcerr << "stack: " << stack << "\n"; + std::cerr << "\n"; + std::cerr << "stack: " << stack << "\n"; } bool @@ -367,12 +367,12 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) .accumulator=StackValue(0)}); } break; case FOREACH: { - //std::wcerr << "size: " << loop_stack.back().iterable.size() + //std::cerr << "size: " << loop_stack.back().iterable.size() //<< " iteration: " << loop_stack.back().iteration << "\n"; - //std::wcerr << "foreach pc: " << bytecode_iter - feat.begin() << "\n"; + //std::cerr << "foreach pc: " << bytecode_iter - feat.begin() << "\n"; size_t slot = get_uint_operand(); size_t end_offset = get_uint_operand(); - //std::wcerr << "after foreach pc: " << bytecode_iter - feat.begin() << "\n"; + //std::cerr << "after foreach pc: " << bytecode_iter - feat.begin() << "\n"; if (loop_stack.back().iteration == loop_stack.back().iterable.size()) { stack.push(loop_stack.back().accumulator); loop_stack.pop_back(); @@ -392,10 +392,10 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) if (loop_state.iteration == 0) { if (stack.top().type == WRDVAL) { loop_state.accumulator = StackValue(std::vector()); - //std::wcerr << "Wordoid array size " << loop_state.iterable.size() << "\n"; + //std::cerr << "Wordoid array size " << loop_state.iterable.size() << "\n"; } else if (stack.top().type == STRVAL) { loop_state.accumulator = StackValue(std::vector()); - //std::wcerr << "String array size " << loop_state.iterable.size() << "\n"; + //std::cerr << "String array size " << loop_state.iterable.size() << "\n"; } else { throw 1; } @@ -403,7 +403,7 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) if (stack.top().type == WRDVAL) { loop_state.accumulator.wrdArr().push_back(stack.top().wrd()); } else if (stack.top().type == STRVAL) { - //std::wcerr << "String array size " << loop_state.accumulator.size() << "\n"; + //std::cerr << "String array size " << loop_state.accumulator.size() << "\n"; loop_state.accumulator.strArr().push_back(stack.top().str()); } else { throw 1; @@ -416,7 +416,7 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) } break; case GETGVAR: { int slot = get_uint_operand(); - //std::wcerr << "GETGVAR " << slot << " " << spec.global_results[slot] << "\n"; + //std::cerr << "GETGVAR " << slot << " " << spec.global_results[slot] << "\n"; stack.push(spec.global_results[slot]); } break; case GETVAR: { @@ -476,16 +476,16 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) stack.push(clamp(0, (int)untagged.size() - 1, stack.pop_off().intVal())); break; case GETWRD: { - //std::wcerr << "GETWRD start\n"; + //std::cerr << "GETWRD start\n"; stack.push(get_wordoid(tagged)); - //std::wcerr << "GETWRD done\n"; + //std::cerr << "GETWRD done\n"; } break; case EXTOKSURF: { - std::wstring surf = get_token(untagged).TheSurfaceForm; + UString surf = get_token(untagged).TheSurfaceForm; stack.push(new std::string(UtfConverter::toUtf8(surf))); } break; case EXWRDLEMMA: { - std::wstring lemma = stack.pop_off().wrd().TheLemma; + UString lemma = stack.pop_off().wrd().TheLemma; stack.push(new std::string(UtfConverter::toUtf8(lemma))); } break; case EXWRDCOARSETAG: { @@ -519,11 +519,11 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) case EXTAGS: { const std::vector &tags = stack.top().wrd().TheTags; /*std::vector::const_iterator it = tags.begin(); - std::wcerr << "tags: "; + std::cerr << "tags: "; for (;it != tags.end(); it++) { - std::wcerr << &(*it) << " " << it->TheTag << ", "; + std::cerr << &(*it) << " " << it->TheTag << ", "; } - std::wcerr << "\n";*/ + std::cerr << "\n";*/ std::vector *tags_str = new std::vector; tags_str->resize(tags.size()); transform(tags.begin(), tags.end(), tags_str->begin(), get_tag); diff --git a/apertium/perceptron_spec.h b/apertium/perceptron_spec.h index c577e39..092dd01 100644 --- a/apertium/perceptron_spec.h +++ b/apertium/perceptron_spec.h @@ -226,7 +226,7 @@ public: StackValue(const StackValue &other) { // C++11: Probably reference counting with shared_ptr would be better // than all this copying if it were available - //std::wcerr << "StackValue init\n"; + //std::cerr << "StackValue init\n"; type = other.type; switch (type) { case STRVAL: @@ -248,7 +248,7 @@ public: } } StackValue& operator=(StackValue other) { - //std::wcerr << "StackValue assign\n"; + //std::cerr << "StackValue assign\n"; swap(*this, other); return *this; } @@ -269,21 +269,21 @@ public: type = STRARRVAL; } StackValue(const Morpheme &wordoid) { - /*std::wcerr << L"Before "; + /*std::cerr << "Before "; std::vector::const_iterator it = wordoid.TheTags.begin(); for (;it != wordoid.TheTags.end(); it++) { - std::wcerr << &(*it) << " "; + std::cerr << &(*it) << " "; } - std::wcerr << L"\n"; - std::wcerr << L"Copy morpheme " << &wordoid;*/ + std::cerr << "\n"; + std::cerr << "Copy morpheme " << &wordoid;*/ payload.wrdval = new Morpheme(wordoid); - /*std::wcerr << L" to " << payload.wrdval << "\n"; - std::wcerr << L"After "; + /*std::cerr << " to " << payload.wrdval << "\n"; + std::cerr << "After "; it = payload.wrdval->TheTags.begin(); for (;it != payload.wrdval->TheTags.end(); it++) { - std::wcerr << &(*it) << " "; + std::cerr << &(*it) << " "; } - std::wcerr << L"\n";*/ + std::cerr << "\n";*/ type = WRDVAL; } StackValue(const std::vector &wordoids) { @@ -410,20 +410,20 @@ private: data.pop_back(); } /*void push(StackValue val) { - std::wcerr << "before copy push\n"; + std::cerr << "before copy push\n"; data.push_back(val); - std::wcerr << "after copy push\n"; + std::cerr << "after copy push\n"; }*/ void push(const StackValue &val) { - //std::wcerr << "before push\n"; + //std::cerr << "before push\n"; data.push_back(val); - //std::wcerr << "after push\n"; + //std::cerr << "after push\n"; } StackValue& top() { return data.back(); } StackValue pop_off() { - //std::wcerr << L"Top value: " << top().payload.intval << "\n"; + //std::cerr << "Top value: " << top().payload.intval << "\n"; StackValue ret = top(); pop(); return ret; diff --git a/apertium/perceptron_tagger.cc b/apertium/perceptron_tagger.cc index e121c9b..a086cfc 100644 --- a/apertium/perceptron_tagger.cc +++ b/apertium/perceptron_tagger.cc @@ -73,9 +73,9 @@ PerceptronTagger::tagSentence(const Sentence &untagged_sent) { token_idx, wordoid_idx, feat_vec_delta); if (TheFlags.getDebug()) { FeatureVec fv(feat_vec_delta); - std::wcerr << "Token " << token_idx << "\t\tWordoid " << wordoid_idx << "\n"; - std::wcerr << fv; - std::wcerr << "Score: " << weights * feat_vec_delta << "\n"; + std::cerr << "Token " << token_idx << "\t\tWordoid " << wordoid_idx << "\n"; + std::cerr << fv; + std::cerr << "Score: " << weights * feat_vec_delta << "\n"; } new_agenda_item.score += weights * feat_vec_delta; } @@ -83,14 +83,14 @@ PerceptronTagger::tagSentence(const Sentence &untagged_sent) { } // Apply the beam if (TheFlags.getDebug()) { - std::wcerr << "-- Before beam: --\n" << new_agenda; + std::cerr << "-- Before beam: --\n" << new_agenda; } size_t new_agenda_size = std::min((size_t)spec.beam_width, new_agenda.size()); agenda.resize(new_agenda_size); std::partial_sort_copy(new_agenda.begin(), new_agenda.end(), agenda.begin(), agenda.end()); if (TheFlags.getDebug()) { - std::wcerr << "-- After beam: --\n" << agenda; + std::cerr << "-- After beam: --\n" << agenda; } } @@ -129,7 +129,7 @@ bool PerceptronTagger::trainSentence( std::vector::const_iterator wordoid_it; for (size_t token_idx = 0; token_idx < sent_len; token_idx++) { - //std::wcerr << "Token idx: " << token_idx << "\n"; + //std::cerr << "Token idx: " << token_idx << "\n"; const TaggedToken &tagged_tok(tagged_sent[token_idx]); const StreamedType &untagged_tok(untagged_sent[token_idx]); correct_sentence.tagged.push_back(tagged_tok); @@ -156,7 +156,7 @@ bool PerceptronTagger::trainSentence( bool correct_available = false; for (agenda_it = agenda.begin(); agenda_it != agenda.end(); agenda_it++) { - //std::wcerr << *agenda_it; + //std::cerr << *agenda_it; for (analys_it = analyses.begin(); analys_it != analyses.end(); analys_it++) { const std::vector &wordoids = analys_it->TheMorphemes; @@ -182,24 +182,24 @@ bool PerceptronTagger::trainSentence( if (TheFlags.getSkipErrors()) { return true; } else { - std::wstringstream what_; - what_ << L"Tagged analysis unavailable in untagged/ambigous input.\n"; - what_ << L"Available:\n"; + std::stringstream what_; + what_ << "Tagged analysis unavailable in untagged/ambigous input.\n"; + what_ << "Available:\n"; for (analys_it = analyses.begin(); analys_it != analyses.end(); analys_it++) { - what_ << *analys_it << L"\n"; + what_ << *analys_it << "\n"; } - what_ << L"Required: " << *tagged_tok << L"\n"; - what_ << L"Rerun with --skip-on-error to skip this sentence."; + what_ << "Required: " << *tagged_tok << "\n"; + what_ << "Rerun with --skip-on-error to skip this sentence."; throw Apertium::Exception::PerceptronTagger::CorrectAnalysisUnavailable(what_); } } // Apply the beam - //std::wcerr << "-- Before beam: --\n" << new_agenda; + //std::cerr << "-- Before beam: --\n" << new_agenda; size_t new_agenda_size = std::min((size_t)spec.beam_width, new_agenda.size()); agenda.resize(new_agenda_size); std::partial_sort_copy(new_agenda.begin(), new_agenda.end(), agenda.begin(), agenda.end()); - //std::wcerr << "-- After beam: --\n" << agenda; + //std::cerr << "-- After beam: --\n" << agenda; // Early update "fallen off the beam" bool any_match = false; @@ -211,29 +211,29 @@ bool PerceptronTagger::trainSentence( } } if (!any_match) { - /*std::wcerr << "Early update time!\n"; - std::wcerr << "Before:\n" << weights << "\n"; - std::wcerr << "Incorrect:\n" << agenda.front().vec << "\n"; - std::wcerr << "Correct:\n" << correct_sentence.vec << "\n";*/ + /*std::cerr << "Early update time!\n"; + std::cerr << "Before:\n" << weights << "\n"; + std::cerr << "Incorrect:\n" << agenda.front().vec << "\n"; + std::cerr << "Correct:\n" << correct_sentence.vec << "\n";*/ avg_weights -= agenda.front().vec; avg_weights += correct_sentence.vec; avg_weights.incIteration(); - //std::wcerr << "After:\n" << weights << "\n"; + //std::cerr << "After:\n" << weights << "\n"; return false; } } // Normal update - /*std::wcerr << "Best match:\n" << agenda.front().tagged << "\n\n"; - std::wcerr << "Correct:\n" << correct_sentence.tagged << "\n\n";*/ + /*std::cerr << "Best match:\n" << agenda.front().tagged << "\n\n"; + std::cerr << "Correct:\n" << correct_sentence.tagged << "\n\n";*/ if (agenda.front().tagged != correct_sentence.tagged) { - /*std::wcerr << "Normal update time!\n"; - std::wcerr << "Before:\n" << weights << "\n"; - std::wcerr << "Incorrect:\n" << agenda.front().vec << "\n"; - std::wcerr << "Correct:\n" << correct_sentence.vec << "\n";*/ + /*std::cerr << "Normal update time!\n"; + std::cerr << "Before:\n" << weights << "\n"; + std::cerr << "Incorrect:\n" << agenda.front().vec << "\n"; + std::cerr << "Correct:\n" << correct_sentence.vec << "\n";*/ avg_weights -= agenda.front().vec; avg_weights += correct_sentence.vec; avg_weights.incIteration(); - //std::wcerr << "After:\n" << weights << "\n"; + //std::cerr << "After:\n" << weights << "\n"; } return false; } @@ -248,7 +248,7 @@ void PerceptronTagger::train( TrainingCorpus tc(tagged, untagged, TheFlags.getSkipErrors(), TheFlags.getSentSeg()); size_t avail_skipped; for (int i = 0; i < iterations; i++) { - std::wcerr << "Iteration " << i + 1 << " of " << iterations << "\n"; + std::cerr << "Iteration " << i + 1 << " of " << iterations << "\n"; avail_skipped = 0; tc.shuffle(); std::vector::const_iterator si; @@ -259,12 +259,12 @@ void PerceptronTagger::train( } avg_weights.average(); if (avail_skipped) { - std::wcerr << "Skipped " << tc.skipped << " sentences due to token " + std::cerr << "Skipped " << tc.skipped << " sentences due to token " << "misalignment and " << avail_skipped << " sentences due to " << "tagged token being unavailable in untagged file out of " << tc.sentences.size() << " total sentences.\n"; } - //std::wcerr << *this; + //std::cerr << *this; } void PerceptronTagger::serialise(std::ostream &serialised) const diff --git a/apertium/postchunk.cc b/apertium/postchunk.cc index 91f55ad..4f05848 100644 --- a/apertium/postchunk.cc +++ b/apertium/postchunk.cc @@ -98,9 +98,9 @@ Postchunk::readData(FILE *in) bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); attr_items[cad_k].read(in); - wstring fallback = Compression::wstring_read(in); + UString fallback = Compression::string_read(in); if(recompile_attrs) { attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); } @@ -109,25 +109,25 @@ Postchunk::readData(FILE *in) // variables for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); - variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); + string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); + variables[cad_k] = UtfConverter::toUtf8(Compression::string_read(in)); } // macros for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); macros[cad_k] = Compression::multibyte_read(in); } // lists for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) { - wstring const cad_v = Compression::wstring_read(in); + UString const cad_v = Compression::string_read(in); lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); } @@ -143,7 +143,7 @@ Postchunk::read(string const &transferfile, string const &datafile) FILE *in = fopen(datafile.c_str(), "rb"); if(!in) { - wcerr << "Error: Could not open file '" << datafile << "'." << endl; + cerr << "Error: Could not open file '" << datafile << "'." << endl; exit(EXIT_FAILURE); } readData(in); @@ -158,7 +158,7 @@ Postchunk::readPostchunk(string const &in) if(doc == NULL) { - wcerr << "Error: Could not parse file '" << in << "'." << endl; + cerr << "Error: Could not parse file '" << in << "'." << endl; exit(EXIT_FAILURE); } @@ -219,16 +219,16 @@ Postchunk::checkIndex(xmlNode *element, int index, int limit) { if(index > limit) // Note: Unlike transfer/interchunk, we allow index==limit! { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": index > limit" << endl; + cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": index > limit" << endl; return false; } if(index < 0) { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": index < 0" << endl; + cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": index < 0" << endl; return false; } if(word[index] == 0) { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": Null access at word[index]" << endl; + cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": Null access at word[index]" << endl; return false; } return true; @@ -571,7 +571,7 @@ Postchunk::evalString(xmlNode *element) else { - wcerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; + cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; exit(EXIT_FAILURE); } @@ -607,13 +607,9 @@ Postchunk::processOut(xmlNode *localroot) { out_wblank = word[1]->getWblank(); } - - if(myword != "") - { - fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); - fputwc_unlocked(L'^', output); - fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); - fputwc_unlocked(L'$', output); + + if (!myword.empty()) { + u_fprintf(output, "%S^%S$", out_wblank.c_str(), myword.c_str()); } } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) @@ -663,14 +659,11 @@ Postchunk::processOut(xmlNode *localroot) out_wblank = word[1]->getWblank(); } - fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); - fputwc_unlocked('^', output); - fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "%S^%S$", out_wblank.c_str(), myword.c_str()); } else // 'b' { - fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output); + write(evalString(i), output); } } } @@ -691,7 +684,7 @@ Postchunk::processTags(xmlNode *localroot) { if(j->type == XML_ELEMENT_NODE) { - fputws_unlocked(UtfConverter::fromUtf8(evalString(j)).c_str(), output); + write(evalString(j), output); } } } @@ -770,7 +763,7 @@ Postchunk::processLet(xmlNode *localroot) bool match = word[ti.getPos()]->setChunkPart(attr_items[ti.getContent()], evalString(rightSide)); if(!match && trace) { - wcerr << "apertium-postchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-postchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; } } return; @@ -815,7 +808,7 @@ Postchunk::processLet(xmlNode *localroot) evalString(rightSide)); if(!match && trace) { - wcerr << "apertium-postchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-postchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; } evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL); @@ -891,7 +884,7 @@ Postchunk::processModifyCase(xmlNode *localroot) if(!match && trace) { - wcerr << "apertium-postchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-postchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; } } else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) @@ -960,7 +953,7 @@ Postchunk::processCallMacro(xmlNode *localroot) } } else { - wcerr << "Warning: Not calling macro \"" << n << "\" from line " << localroot->line << " (empty word?)" << endl; + cerr << "Warning: Not calling macro \"" << n << "\" from line " << localroot->line << " (empty word?)" << endl; } swap(myword, word); @@ -1096,7 +1089,7 @@ Postchunk::processIn(xmlNode *localroot) if(!xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) { - set &myset = listslow[(const char *) idlist]; + set &myset = listslow[(const char *) idlist]; if(myset.find(tolower(sval)) != myset.end()) { return true; @@ -1108,7 +1101,7 @@ Postchunk::processIn(xmlNode *localroot) } } - set &myset = lists[(const char *) idlist]; + set &myset = lists[(const char *) idlist]; if(myset.find(sval) != myset.end()) { return true; @@ -1356,7 +1349,7 @@ Postchunk::processBeginsWithList(xmlNode *localroot) xmlChar *idlist = second->properties->children->content; string needle = evalString(first); - set::iterator it, limit; + set::iterator it, limit; if(localroot->properties == NULL || xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) @@ -1404,7 +1397,7 @@ Postchunk::processEndsWithList(xmlNode *localroot) xmlChar *idlist = second->properties->children->content; string needle = evalString(first); - set::iterator it, limit; + set::iterator it, limit; if(localroot->properties == NULL || xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) @@ -1472,9 +1465,9 @@ Postchunk::processContainsSubstring(xmlNode *localroot) string Postchunk::copycase(string const &source_word, string const &target_word) { - wstring result; - wstring const s_word = UtfConverter::fromUtf8(source_word); - wstring const t_word = UtfConverter::fromUtf8(target_word); + UString result; + UString const s_word = UtfConverter::fromUtf8(source_word); + UString const t_word = UtfConverter::fromUtf8(target_word); bool firstupper = iswupper(s_word[0]); bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); @@ -1500,7 +1493,7 @@ Postchunk::copycase(string const &source_word, string const &target_word) string Postchunk::caseOf(string const &str) { - wstring const s = UtfConverter::fromUtf8(str); + UString const s = UtfConverter::fromUtf8(str); if(s.size() > 1) { @@ -1534,38 +1527,38 @@ Postchunk::caseOf(string const &str) } } -wstring -Postchunk::caseOf(wstring const &str) +UString +Postchunk::caseOf(UString const &str) { if(str.size() > 1) { if(!iswupper(str[0])) { - return L"aa"; + return "aa"; } else if(!iswupper(str[str.size()-1])) { - return L"Aa"; + return "Aa"; } else { - return L"AA"; + return "AA"; } } else if(str.size() == 1) { if(!iswupper(str[0])) { - return L"aa"; + return "aa"; } else { - return L"Aa"; + return "Aa"; } } else { - return L"aa"; + return "aa"; } } @@ -1613,21 +1606,21 @@ Postchunk::processRule(xmlNode *localroot) { if(blank_queue.front().compare(" ") != 0) { - fputws_unlocked(UtfConverter::fromUtf8(blank_queue.front()).c_str(), output); + write(blank_queue.front(), output); } blank_queue.pop(); } } TransferToken & -Postchunk::readToken(FILE *in) +Postchunk::readToken(InputFile& in) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wstring content; + UString content; while(true) { int val = fgetwc_unlocked(in); @@ -1635,25 +1628,25 @@ Postchunk::readToken(FILE *in) { return input_buffer.add(TransferToken(content, tt_eof)); } - if(val == L'\\') + if(val == '\\') { - content += L'\\'; + content += '\\'; content += wchar_t(fgetwc_unlocked(in)); } - else if(val == L'[') + else if(val == '[') { - content += L'['; + content += '['; while(true) { int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') + if(val2 == '\\') { - content += L'\\'; + content += '\\'; content += wchar_t(fgetwc_unlocked(in)); } - else if(val2 == L']') + else if(val2 == ']') { - content += L']'; + content += ']'; break; } else @@ -1662,24 +1655,24 @@ Postchunk::readToken(FILE *in) } } } - else if(inword && val == L'{') + else if(inword && val == '{') { - content += L'{'; + content += '{'; while(true) { int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') + if(val2 == '\\') { - content += L'\\'; + content += '\\'; content += wchar_t(fgetwc_unlocked(in)); } - else if(val2 == L'}') + else if(val2 == '}') { int val3 = wchar_t(fgetwc_unlocked(in)); ungetwc(val3, in); - content += L'}'; - if(val3 == L'$') + content += '}'; + if(val3 == '$') { break; } @@ -1690,12 +1683,12 @@ Postchunk::readToken(FILE *in) } } } - else if(inword && val == L'$') + else if(inword && val == '$') { inword = false; return input_buffer.add(TransferToken(content, tt_word)); } - else if(val == L'^') + else if(val == '^') { inword = true; return input_buffer.add(TransferToken(content, tt_blank)); @@ -1726,7 +1719,7 @@ Postchunk::setTrace(bool trace) } void -Postchunk::postchunk_wrapper_null_flush(FILE *in, FILE *out) +Postchunk::postchunk_wrapper_null_flush(InputFile& in, UFILE* out) { null_flush = false; internal_null_flush = true; @@ -1734,11 +1727,11 @@ Postchunk::postchunk_wrapper_null_flush(FILE *in, FILE *out) while(!feof(in)) { postchunk(in, out); - fputwc_unlocked(L'\0', out); + u_fputc('\0', out); int code = fflush(out); if(code != 0) { - wcerr << L"Could not flush output " << errno << endl; + cerr << "Could not flush output " << errno << endl; } } @@ -1747,7 +1740,7 @@ Postchunk::postchunk_wrapper_null_flush(FILE *in, FILE *out) } void -Postchunk::postchunk(FILE *in, FILE *out) +Postchunk::postchunk(InputFile& in, UFILE* out) { if(getNullFlush()) { @@ -1781,7 +1774,7 @@ Postchunk::postchunk(FILE *in, FILE *out) } else if(tmpblank.size() != 0) { - fputws_unlocked(tmpblank[0]->c_str(), output); + write(*tmpblank[0], output); tmpblank.clear(); last = input_buffer.getPos(); ms.init(me->getInitial()); @@ -1797,16 +1790,11 @@ Postchunk::postchunk(FILE *in, FILE *out) if(trace) { - wcerr << endl << L"apertium-postchunk: Rule " << val << L" line " << lastrule_line << L" "; - for (unsigned int ind = 0; ind < tmpword.size(); ind++) - { - if (ind != 0) - { - wcerr << L" "; - } - fputws_unlocked(tmpword[ind]->c_str(), stderr); + cerr << endl << "apertium-postchunk: Rule " << val << " line " << lastrule_line; + for (auto& it : tmpword) { + cerr << " " << *it; } - wcerr << endl; + cerr << endl; } } @@ -1820,7 +1808,7 @@ Postchunk::postchunk(FILE *in, FILE *out) break; case tt_blank: - ms.step(L' '); + ms.step(' '); tmpblank.push_back(¤t.getContent()); break; @@ -1832,13 +1820,13 @@ Postchunk::postchunk(FILE *in, FILE *out) } else { - fputws_unlocked(current.getContent().c_str(), output); + write(current.getContent(), output); return; } break; default: - wcerr << "Error: Unknown input token." << endl; + cerr << "Error: Unknown input token." << endl; return; } } @@ -1847,7 +1835,7 @@ Postchunk::postchunk(FILE *in, FILE *out) void Postchunk::applyRule() { - wstring const chunk = *tmpword[0]; + UString const chunk = *tmpword[0]; tmpword.clear(); splitWordsAndBlanks(chunk, tmpword, tmpblank); @@ -1893,19 +1881,19 @@ Postchunk::applyRule() } void -Postchunk::applyWord(wstring const &word_str) +Postchunk::applyWord(UString const &word_str) { - ms.step(L'^'); + ms.step('^'); for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) { switch(word_str[i]) { - case L'\\': + case '\\': i++; ms.step(towlower(word_str[i]), any_char); break; - case L'<': + case '<': /* for(unsigned int j = i+1; j != limit; j++) { if(word_str[j] == '>') @@ -1925,8 +1913,8 @@ Postchunk::applyWord(wstring const &word_str) } break;*/ - case L'{': // ignore the unmodifiable part of the chunk - ms.step(L'$'); + case '{': // ignore the unmodifiable part of the chunk + ms.step('$'); return; default: @@ -1934,31 +1922,31 @@ Postchunk::applyWord(wstring const &word_str) break; } } - ms.step(L'$'); + ms.step('$'); } -vector -Postchunk::getVecTags(wstring const &chunk) +vector +Postchunk::getVecTags(UString const &chunk) { - vector vectags; + vector vectags; for(int i = 0, limit = chunk.size(); i != limit; i++) { - if(chunk[i] == L'\\') + if(chunk[i] == '\\') { i++; } - else if(chunk[i] == L'<') + else if(chunk[i] == '<') { - wstring mytag; + UString mytag; do { mytag += chunk[i++]; } - while(chunk[i] != L'>'); - vectags.push_back(mytag + L'>'); + while(chunk[i] != '>'); + vectags.push_back(mytag + '>'); } - else if(chunk[i] == L'{') + else if(chunk[i] == '{') { break; } @@ -1967,15 +1955,15 @@ Postchunk::getVecTags(wstring const &chunk) } int -Postchunk::beginChunk(wstring const &chunk) +Postchunk::beginChunk(UString const &chunk) { for(int i = 0, limit = chunk.size(); i != limit; i++) { - if(chunk[i] == L'\\') + if(chunk[i] == '\\') { i++; } - else if(chunk[i] == L'{') + else if(chunk[i] == '{') { return i + 1; } @@ -1984,82 +1972,79 @@ Postchunk::beginChunk(wstring const &chunk) } int -Postchunk::endChunk(wstring const &chunk) +Postchunk::endChunk(UString const &chunk) { return chunk.size()-2; } -wstring -Postchunk::wordzero(wstring const &chunk) +UString +Postchunk::wordzero(UString const &chunk) { for(unsigned int i = 0, limit = chunk.size(); i != limit ;i++) { - if(chunk[i] == L'\\') + if(chunk[i] == '\\') { i++; } - else if(chunk[i] == L'{') + else if(chunk[i] == '{') { return chunk.substr(0, i); } } - return L""; + return ""; } -wstring -Postchunk::pseudolemma(wstring const &chunk) +UString +Postchunk::pseudolemma(UString const &chunk) { for(unsigned int i = 0, limit = chunk.size(); i != limit ;i++) { - if(chunk[i] == L'\\') + if(chunk[i] == '\\') { i++; } - else if(chunk[i] == L'<' || chunk[i] == L'{') + else if(chunk[i] == '<' || chunk[i] == '{') { return chunk.substr(0, i); } } - return L""; + return ""; } void -Postchunk::unchunk(wstring const &chunk, FILE *output) +Postchunk::unchunk(UString const &chunk, UFILE* output) { - vector vectags = getVecTags(chunk); - wstring case_info = caseOf(pseudolemma(chunk)); + vector vectags = getVecTags(chunk); + UString case_info = caseOf(pseudolemma(chunk)); bool uppercase_all = false; bool uppercase_first = false; - if(case_info == L"AA") + if(case_info == "AA") { uppercase_all = true; } - else if(case_info == L"Aa") + else if(case_info == "Aa") { uppercase_first = true; } for(int i = beginChunk(chunk), limit = endChunk(chunk); i < limit; i++) { - if(chunk[i] == L'\\') - { - fputwc_unlocked(L'\\', output); - fputwc_unlocked(chunk[++i], output); - } - else if(chunk[i] == L'^') - { - fputwc_unlocked(L'^', output); - while(chunk[++i] != L'$') + if(chunk[i] == '\\') { + u_fputc('\\', output); + u_fputc(chunk[++i], output); + } else if(chunk[i] == '^') { + u_fputc('^', output); + while(chunk[++i] != '$') { - if(chunk[i] == L'\\') + if(chunk[i] == '\\') { - fputwc_unlocked(L'\\', output); - fputwc_unlocked(chunk[++i], output); + u_fputc('\\', output); + u_fputc(chunk[++i], output); } - else if(chunk[i] == L'<') + else if(chunk[i] == '<') { if(iswdigit(chunk[i+1])) { @@ -2069,107 +2054,107 @@ Postchunk::unchunk(wstring const &chunk, FILE *output) //atoi(chunk.c_str()+i+1)-1; if(vectags.size() > value) { - fputws_unlocked(vectags[value].c_str(), output); + write(vectags[value], output); } - while(chunk[++i] != L'>'); + while(chunk[++i] != '>'); } else { - fputwc_unlocked(L'<', output); - while(chunk[++i] != L'>') fputwc_unlocked(chunk[i], output); - fputwc_unlocked(L'>', output); + u_fputc('<', output); + while(chunk[++i] != '>') u_fputc(chunk[i], output); + u_fputc('>', output); } } else { if(uppercase_all) { - fputwc_unlocked(towupper(chunk[i]), output); + u_fputc(towupper(chunk[i]), output); } else if(uppercase_first) { if(iswalnum(chunk[i])) { - fputwc_unlocked(towupper(chunk[i]), output); + u_fputc(towupper(chunk[i]), output); uppercase_first = false; } else { - fputwc_unlocked(chunk[i], output); + u_fputc(chunk[i], output); } } else { - fputwc_unlocked(chunk[i], output); + u_fputc(chunk[i], output); } } } - fputwc_unlocked(L'$', output); + u_fputc('$', output); } - else if(chunk[i] == L'[') + else if(chunk[i] == '[') { - fputwc_unlocked(L'[', output); - while(chunk[++i] != L']') + u_fputc('[', output); + while(chunk[++i] != ']') { - if(chunk[i] == L'\\') + if(chunk[i] == '\\') { - fputwc_unlocked(L'\\', output); - fputwc_unlocked(chunk[++i], output); + u_fputc('\\', output); + u_fputc(chunk[++i], output); } else { - fputwc_unlocked(chunk[i], output); + u_fputc(chunk[i], output); } } - fputwc_unlocked(L']', output); + u_fputc(']', output); } else { - fputwc_unlocked(chunk[i], output); + u_fputc(chunk[i], output); } } } void -Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, - vector &blanks) +Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, + vector &blanks) { - vector vectags = getVecTags(chunk); - wstring case_info = caseOf(pseudolemma(chunk)); + vector vectags = getVecTags(chunk); + UString case_info = caseOf(pseudolemma(chunk)); bool uppercase_all = false; bool uppercase_first = false; bool lastblank = true; - if(case_info == L"AA") + if(case_info == "AA") { uppercase_all = true; } - else if(case_info == L"Aa") + else if(case_info == "Aa") { uppercase_first = true; } for(int i = beginChunk(chunk), limit = endChunk(chunk); i < limit; i++) { - if(chunk[i] == L'^') + if(chunk[i] == '^') { if(!lastblank) { - blanks.push_back(new wstring(L"")); + blanks.push_back(new UString("")); } lastblank = false; - wstring *myword = new wstring(); - wstring &ref = *myword; + UString *myword = new UString(); + UString &ref = *myword; - while(chunk[++i] != L'$') + while(chunk[++i] != '$') { - if(chunk[i] == L'\\') + if(chunk[i] == '\\') { - ref += L'\\'; + ref += '\\'; ref += chunk[++i]; } - else if(chunk[i] == L'<') + else if(chunk[i] == '<') { if(iswdigit(chunk[i+1])) { @@ -2180,13 +2165,13 @@ Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, { ref.append(vectags[value]); } - while(chunk[++i] != L'>'); + while(chunk[++i] != '>'); } else { - ref += L'<'; - while(chunk[++i] != L'>') ref += chunk[i]; - ref += L'>'; + ref += '<'; + while(chunk[++i] != '>') ref += chunk[i]; + ref += '>'; } } else @@ -2216,26 +2201,26 @@ Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, words.push_back(myword); } - else if(chunk[i] == L'[') + else if(chunk[i] == '[') { - if(chunk[i+1] == L'[') //wordbound blank + if(chunk[i+1] == '[') //wordbound blank { if(!lastblank) { - blanks.push_back(new wstring(L"")); + blanks.push_back(new UString("")); } lastblank = false; - wstring *myword = new wstring(); - wstring &ref = *myword; + UString *myword = new UString(); + UString &ref = *myword; while(true) { - if(chunk[i] == L'\\') + if(chunk[i] == '\\') { - ref += L'\\'; + ref += '\\'; ref += chunk[++i]; } - else if(chunk[i] == L']' && chunk[i-1] == L']') + else if(chunk[i] == ']' && chunk[i-1] == ']') { ref += chunk[i]; i++; //i->"^" @@ -2249,14 +2234,14 @@ Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, i++; } - while(chunk[++i] != L'$') + while(chunk[++i] != '$') { - if(chunk[i] == L'\\') + if(chunk[i] == '\\') { - ref += L'\\'; + ref += '\\'; ref += chunk[++i]; } - else if(chunk[i] == L'<') + else if(chunk[i] == '<') { if(iswdigit(chunk[i+1])) { @@ -2267,13 +2252,13 @@ Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, { ref.append(vectags[value]); } - while(chunk[++i] != L'>'); + while(chunk[++i] != '>'); } else { - ref += L'<'; - while(chunk[++i] != L'>') ref += chunk[i]; - ref += L'>'; + ref += '<'; + while(chunk[++i] != '>') ref += chunk[i]; + ref += '>'; } } else @@ -2307,15 +2292,15 @@ Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, { if (!(lastblank && blanks.back())) { - blanks.push_back(new wstring()); + blanks.push_back(new UString()); } - wstring &ref = *(blanks.back()); - ref += L'['; - while(chunk[++i] != L']') + UString &ref = *(blanks.back()); + ref += '['; + while(chunk[++i] != ']') { - if(chunk[i] == L'\\') + if(chunk[i] == '\\') { - ref += L'\\'; + ref += '\\'; ref += chunk[++i]; } else @@ -2332,13 +2317,13 @@ Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, { if (!lastblank) { - wstring *myblank = new wstring(L""); + UString *myblank = new UString(""); blanks.push_back(myblank); } - wstring &ref = *(blanks.back()); - if(chunk[i] == L'\\') + UString &ref = *(blanks.back()); + if(chunk[i] == '\\') { - ref += L'\\'; + ref += '\\'; ref += chunk[++i]; } else diff --git a/apertium/postchunk.h b/apertium/postchunk.h index 9f3a254..e704274 100644 --- a/apertium/postchunk.h +++ b/apertium/postchunk.h @@ -44,11 +44,11 @@ private: Alphabet alphabet; MatchExe *me; MatchState ms; - map attr_items; - map variables; - map macros; - map, Ltstr> lists; - map, Ltstr> listslow; + map attr_items; + map variables; + map macros; + map> lists; + map> listslow; vector macro_map; vector rule_map; vector rule_lines; @@ -58,8 +58,8 @@ private: queue blank_queue; int lword; Buffer input_buffer; - vector tmpword; - vector tmpblank; + vector tmpword; + vector tmpblank; bool in_out; bool in_lu; @@ -69,7 +69,7 @@ private: string out_wblank; map var_out_wblank; - FILE *output; + UFILE *output; int any_char; int any_tag; @@ -89,7 +89,7 @@ private: void collectMacros(xmlNode *localroot); void collectRules(xmlNode *localroot); static string caseOf(string const &str); - static wstring caseOf(wstring const &str); + static UString caseOf(UString const &str); string copycase(string const &source_word, string const &target_word); void processLet(xmlNode *localroot); @@ -118,23 +118,23 @@ private: bool endsWith(string const &str1, string const &str2) const; string tolower(string const &str) const; string tags(string const &str) const; - string readWord(FILE *in); - string readBlank(FILE *in); - string readUntil(FILE *in, int const symbol) const; - void applyWord(wstring const &word_str); + string readWord(InputFile& in); + string readBlank(InputFile& in); + string readUntil(InputFile& in, int const symbol) const; + void applyWord(UString const &word_str); void applyRule(); - TransferToken & readToken(FILE *in); - static void unchunk(wstring const &chunk, FILE *output); - static vector getVecTags(wstring const &chunk); - static int beginChunk(wstring const &chunk); - static int endChunk(wstring const &chunk); - static void splitWordsAndBlanks(wstring const &chunk, - vector &words, - vector &blanks); - static wstring pseudolemma(wstring const &chunk); - static wstring wordzero(wstring const &chunk); + TransferToken & readToken(InputFile& in); + static void unchunk(UString const &chunk, UFILE *output); + static vector getVecTags(UString const &chunk); + static int beginChunk(UString const &chunk); + static int endChunk(UString const &chunk); + static void splitWordsAndBlanks(UString const &chunk, + vector &words, + vector &blanks); + static UString pseudolemma(UString const &chunk); + static UString wordzero(UString const &chunk); bool checkIndex(xmlNode *element, int index, int limit); - void postchunk_wrapper_null_flush(FILE *in, FILE *out); + void postchunk_wrapper_null_flush(InputFile& in, UFILE* out); bool gettingLemmaFromWord(string attr); string combineWblanks(string wblank_current, string wblank_to_add); @@ -143,7 +143,7 @@ public: ~Postchunk(); void read(string const &transferfile, string const &datafile); - void postchunk(FILE *in, FILE *out); + void postchunk(InputFile& in, UFILE* out); bool getNullFlush(void); void setNullFlush(bool null_flush); void setTrace(bool trace); diff --git a/apertium/pretransfer.cc b/apertium/pretransfer.cc index 764b3cf..1b42529 100644 --- a/apertium/pretransfer.cc +++ b/apertium/pretransfer.cc @@ -8,37 +8,37 @@ #include #include -wstring storeAndWriteWblank(FILE *input, FILE *output) +UString storeAndWriteWblank(InputFile& input, UFILE* output) { int mychar; - wstring content = L"[["; + UString content = "[["; while(true) { mychar = fgetwc_unlocked(input); if(feof(input)) { - wcerr << L"ERROR: Unexpected EOF" << endl; + cerr << "ERROR: Unexpected EOF" << endl; exit(EXIT_FAILURE); } content += mychar; - fputwc_unlocked(mychar, output); + u_fputc(mychar, output); - if(mychar == L'\\') + if(mychar == '\\') { mychar = fgetwc(input); content += mychar; - fputwc(mychar, output); + u_fputc(mychar, output); } - else if(mychar == L']') + else if(mychar == ']') { mychar = fgetwc(input); - if(mychar == L']') + if(mychar == ']') { content += mychar; - fputwc(mychar, output); + u_fputc(mychar, output); break; } } @@ -47,7 +47,7 @@ wstring storeAndWriteWblank(FILE *input, FILE *output) return content; } -void readAndWriteUntil(FILE *input, FILE *output, int const charcode) +void readAndWriteUntil(InputFile& input, UFILE* output, int const charcode) { int mychar; @@ -55,22 +55,22 @@ void readAndWriteUntil(FILE *input, FILE *output, int const charcode) { if(feof(input)) { - wcerr << L"ERROR: Unexpected EOF" << endl; + cerr << "ERROR: Unexpected EOF" << endl; exit(EXIT_FAILURE); } - fputwc_unlocked(mychar, output); - if(mychar == L'\\') + u_fputc(mychar, output); + if(mychar == '\\') { mychar = fgetwc(input); - fputwc(mychar, output); + u_fputc(mychar, output); } } } -void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, wstring wblank = L"") +void procWord(InputFile& input, UFILE* output, bool surface_forms, bool compound_sep, UString wblank = "") { int mychar; - wstring buffer = L""; + UString buffer = ""; bool buffer_mode = false; bool in_tag = false; @@ -78,20 +78,20 @@ void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, if(surface_forms) { - while((mychar = fgetwc_unlocked(input)) != L'/') ; + while((mychar = fgetwc_unlocked(input)) != '/') ; } - while((mychar = fgetwc_unlocked(input)) != L'$') + while((mychar = fgetwc_unlocked(input)) != '$') { if(feof(input)) { - wcerr << L"ERROR: Unexpected EOF" << endl; + cerr << "ERROR: Unexpected EOF" << endl; exit(EXIT_FAILURE); } switch(mychar) { - case L'<': + case '<': in_tag = true; if(!buffer_mode) { @@ -99,11 +99,11 @@ void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, } break; - case L'>': + case '>': in_tag = false; break; - case L'#': + case '#': if(buffer_mode) { buffer_mode = false; @@ -114,44 +114,44 @@ void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, if(buffer_mode) { - if((mychar != L'+' || (mychar == L'+' && in_tag == true)) && - (mychar != L'~' || (mychar == L'~' && in_tag == true))) + if((mychar != '+' || (mychar == '+' && in_tag == true)) && + (mychar != '~' || (mychar == '~' && in_tag == true))) { buffer += static_cast(mychar); } - else if(in_tag == false && mychar == L'+') + else if(in_tag == false && mychar == '+') { - buffer.append(L"$ "); + buffer.append("$ "); buffer.append(wblank); - buffer.append(L"^"); + buffer.append("^"); } - else if(in_tag == false && mychar == L'~' and compound_sep == true) + else if(in_tag == false && mychar == '~' and compound_sep == true) { - buffer.append(L"$"); + buffer.append("$"); buffer.append(wblank); - buffer.append(L"^"); + buffer.append("^"); } } else { - if(mychar == L'+' && queuing == true) + if(mychar == '+' && queuing == true) { - buffer.append(L"$ "); + buffer.append("$ "); buffer.append(wblank); - buffer.append(L"^"); + buffer.append("^"); buffer_mode = true; } else { - fputwc_unlocked(mychar, output); + u_fputc(mychar, output); } } } - fputws_unlocked(buffer.c_str(), output); + write(buffer, output); } -void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms, bool compound_sep) +void processStream(InputFile& input, UFILE* output, bool null_flush, bool surface_forms, bool compound_sep) { while(true) { @@ -162,49 +162,49 @@ void processStream(FILE *input, FILE *output, bool null_flush, bool surface_form } switch(mychar) { - case L'[': - fputwc_unlocked(L'[', output); + case '[': + u_fputc('[', output); mychar = fgetwc_unlocked(input); - if(mychar == L'[') + if(mychar == '[') { - fputwc_unlocked(L'[', output); - wstring wblank = storeAndWriteWblank(input, output); + u_fputc('[', output); + UString wblank = storeAndWriteWblank(input, output); mychar = fgetwc_unlocked(input); - if(mychar == L'^') + if(mychar == '^') { - fputwc_unlocked(mychar, output); + u_fputc(mychar, output); procWord(input, output, surface_forms, compound_sep, wblank); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } else { - wcerr << L"ERROR: Wordbound blank isn't immediately followed by the Lexical Unit." << endl; + cerr << "ERROR: Wordbound blank isn't immediately followed by the Lexical Unit." << endl; exit(EXIT_FAILURE); } } else { ungetwc(mychar, input); - readAndWriteUntil(input, output, L']'); - fputwc_unlocked(L']', output); + readAndWriteUntil(input, output, ']'); + u_fputc(']', output); } break; - case L'\\': - fputwc_unlocked(mychar, output); - fputwc_unlocked(fgetwc_unlocked(input), output); + case '\\': + u_fputc(mychar, output); + u_fputc(fgetwc_unlocked(input), output); break; - case L'^': - fputwc_unlocked(mychar, output); + case '^': + u_fputc(mychar, output); procWord(input, output, surface_forms, compound_sep); - fputwc_unlocked(L'$', output); + u_fputc('$', output); break; - case L'\0': - fputwc_unlocked(mychar, output); + case '\0': + u_fputc(mychar, output); if(null_flush) { @@ -213,7 +213,7 @@ void processStream(FILE *input, FILE *output, bool null_flush, bool surface_form break; default: - fputwc_unlocked(mychar, output); + u_fputc(mychar, output); break; } } diff --git a/apertium/pretransfer.h b/apertium/pretransfer.h index d664b13..d749318 100644 --- a/apertium/pretransfer.h +++ b/apertium/pretransfer.h @@ -18,10 +18,11 @@ #include #include +#include -wstring storeAndWriteWblank(FILE *input, FILE *output); -void readAndWriteUntil(FILE *input, FILE *output, int const charcode); -void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, wstring wblank); -void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms, bool compound_sep); +UString storeAndWriteWblank(InputFile& input, UFILE *output); +void readAndWriteUntil(InputFile& input, UFILE *output, int const charcode); +void procWord(InputFile& input, UFILE *output, bool surface_forms, bool compound_sep, UString wblank); +void processStream(InputFile& input, UFILE *output, bool null_flush, bool surface_forms, bool compound_sep); #endif diff --git a/apertium/sentence_stream.cc b/apertium/sentence_stream.cc index c83450b..d65e698 100644 --- a/apertium/sentence_stream.cc +++ b/apertium/sentence_stream.cc @@ -21,7 +21,7 @@ bool isSentenceEnd(StreamedType &token) { return false; } Tag &tag = *tags.begin(); - if (tag.TheTag != L"sent") { + if (tag.TheTag != "sent") { return false; } return true; @@ -103,17 +103,17 @@ TrainingCorpus::TrainingCorpus(Stream &tagged, Stream &untagged, untagged_line++; if (!tagged_token.TheLexicalUnit || !untagged_token.TheLexicalUnit) { if (tagged_token.TheLexicalUnit || untagged_token.TheLexicalUnit) { - std::wcerr << "Normal perm\n"; - std::wcerr << "tagged: " << tagged_line << " " << (!!tagged_token.TheLexicalUnit) << "\n"; - std::wcerr << "untagged: " << untagged_line << " " << (!!untagged_token.TheLexicalUnit) << "\n"; + std::cerr << "Normal perm\n"; + std::cerr << "tagged: " << tagged_line << " " << (!!tagged_token.TheLexicalUnit) << "\n"; + std::cerr << "untagged: " << untagged_line << " " << (!!untagged_token.TheLexicalUnit) << "\n"; prematureEnd(); } break; } - //std::wcerr << tagged_token.TheLexicalUnit->TheSurfaceForm << " || " << untagged_token.TheLexicalUnit->TheSurfaceForm << "\n"; + //std::cerr << tagged_token.TheLexicalUnit->TheSurfaceForm << " || " << untagged_token.TheLexicalUnit->TheSurfaceForm << "\n"; if (untagged_token.TheLexicalUnit->TheSurfaceForm != tagged_token.TheLexicalUnit->TheSurfaceForm) { if (!skip_on_error) { - std::wstringstream what_; + std::stringstream what_; what_ << "Streams diverged at line " << tagged_line << "\n"; what_ << "Untagged token: " << untagged_token.TheLexicalUnit->TheSurfaceForm << "\n"; @@ -127,18 +127,18 @@ TrainingCorpus::TrainingCorpus(Stream &tagged, Stream &untagged, training_sentence->first.clear(); training_sentence->second.clear(); - std::wcerr << "fast forward\n"; + std::cerr << "fast forward\n"; bool tagged_ended = contToEndOfSent(tagged, tagged_token, tagged_line); bool untagged_ended = contToEndOfSent(untagged, untagged_token, untagged_line); if (tagged_ended || untagged_ended) { if (!tagged_ended || !untagged_ended) { - std::wcerr << "fast forward prem\n"; + std::cerr << "fast forward prem\n"; prematureEnd(); } - std::wcerr << "fast forward finish\n"; + std::cerr << "fast forward finish\n"; break; } - std::wcerr << "fast forwarded\n"; + std::cerr << "fast forwarded\n"; continue; } if (was_sentence_end) { @@ -169,7 +169,7 @@ bool TrainingCorpus::contToEndOfSent(Stream &stream, StreamedType token, if (isSentenceEnd(token, stream, sent_seg)) { return false; } - std::wcerr << "Skip " << token.TheLexicalUnit->TheSurfaceForm << "\n"; + std::cerr << "Skip " << token.TheLexicalUnit->TheSurfaceForm << "\n"; token = stream.get(); line++; } diff --git a/apertium/stream.cc b/apertium/stream.cc index 4d83a3e..7d68af3 100644 --- a/apertium/stream.cc +++ b/apertium/stream.cc @@ -49,7 +49,7 @@ Stream::Stream(TaggerFlags &Flags_, StreamedType Stream::get() { StreamedType TheStreamedType; - std::wstring Lemma; + UString Lemma; private_flush_ = false; //TheCharacterStream.clear(); @@ -75,10 +75,10 @@ StreamedType Stream::get() { case L'$': break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '[' expected to follow '[', ']' or '$'"; + << "', '[' expected to follow '[', ']' or '$'"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } } @@ -88,9 +88,9 @@ StreamedType Stream::get() { continue; case L']': if (!ThePreviousCase) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"', ']' expected to follow '['"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "', ']' expected to follow '['"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -101,10 +101,10 @@ StreamedType Stream::get() { ThePreviousCase = PreviousCaseType(Character_); continue; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', ']' expected to follow '[' or ']'"; + << "', ']' expected to follow '[' or ']'"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -119,10 +119,10 @@ StreamedType Stream::get() { case L'$': break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '^' expected to follow '[', ']', or '$'"; + << "', '^' expected to follow '[', ']', or '$'"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } } @@ -132,10 +132,10 @@ StreamedType Stream::get() { continue; case L'/': if (!ThePreviousCase) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"', '/' expected to follow '[', to follow '>' " - L"immediately, or to follow '^' or '#' not immediately"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "', '/' expected to follow '[', to follow '>' " + "immediately, or to follow '^' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -145,12 +145,12 @@ StreamedType Stream::get() { continue; case L'^': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '/' expected to follow '[', to follow '>' " - L"immediately, or to follow '^' or '#' not immediately"; + << "', '/' expected to follow '[', to follow '>' " + "immediately, or to follow '^' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -161,8 +161,8 @@ StreamedType Stream::get() { if (is_eof_throw_if_not_TheCharacterStream_good( TheStreamedType, Lemma, Character_)) { - std::wstringstream Message; - Message << L"unexpected end-of-file following '" + std::stringstream Message; + Message << "unexpected end-of-file following '" << ThePreviousCase->ThePreviousCase << "', end-of-file expected to follow ']' or '$'"; throw Exception::Stream::UnexpectedEndOfFile( @@ -182,8 +182,8 @@ StreamedType Stream::get() { ThePreviousCase = PreviousCaseType(Character_); continue; case L'\n': { - std::wstringstream Message; - Message << L"unexpected newline following '" + std::stringstream Message; + Message << "unexpected newline following '" << ThePreviousCase->ThePreviousCase << "', newline expected to follow '[', ']', or '$'"; throw Exception::Stream::UnexpectedCharacter( @@ -206,10 +206,10 @@ StreamedType Stream::get() { case L'>': case L'+': case L'$': { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" - << ThePreviousCase->ThePreviousCase << L"', expected '*'"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" + << ThePreviousCase->ThePreviousCase << "', expected '*'"; throw Exception::Stream::UnexpectedPreviousCase( Message_what(Message)); } @@ -225,12 +225,12 @@ StreamedType Stream::get() { continue; case L'>': if (!ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' not immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' not immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '/' expected to follow '[', to follow '>' " - L"immediately, or to follow '^' or '#' not immediately"; + << "', '/' expected to follow '[', to follow '>' " + "immediately, or to follow '^' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -238,22 +238,22 @@ StreamedType Stream::get() { case L'#': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '/' expected to follow '[', to follow '>' " - L"immediately, or to follow '^' or '#' not immediately"; + << "', '/' expected to follow '[', to follow '>' " + "immediately, or to follow '^' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '/' expected to follow '[', to follow '>' " - L"immediately, or to follow '^' or '#' not immediately"; + << "', '/' expected to follow '[', to follow '>' " + "immediately, or to follow '^' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -270,11 +270,11 @@ StreamedType Stream::get() { case L'$': break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '*' expected to follow '[', ']', or '$' or to " - L"follow '/' immediately"; + << "', '*' expected to follow '[', ']', or '$' or to " + "follow '/' immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } } @@ -283,11 +283,11 @@ StreamedType Stream::get() { continue; case L'<': if (!ThePreviousCase) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"', '<' expected to follow '[', to follow '>' " - L"immediately, or to follow '#', '/' or '+' not " - L"immediately"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "', '<' expected to follow '[', to follow '>' " + "immediately, or to follow '#', '/' or '+' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -298,16 +298,16 @@ StreamedType Stream::get() { case L'/': break; case L'#': - //std::wcerr << L"[306] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ; + //std::cerr << "[306] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; case L'+': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '<' expected to follow '[', '/', '>'" - L"immediately, or to follow '#' or '+' not " - L"immediately"; + << "', '<' expected to follow '[', '/', '>'" + "immediately, or to follow '#' or '+' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -315,12 +315,12 @@ StreamedType Stream::get() { case L'>': break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '<' expected to follow '[', to follow '>' " - L"immediately, or to follow '#', '/' or '+' not " - L"immediately"; + << "', '<' expected to follow '[', to follow '>' " + "immediately, or to follow '#', '/' or '+' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -331,10 +331,10 @@ StreamedType Stream::get() { continue; case L'>': if (!ThePreviousCase) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"', '>' expected to follow '[' or to follow '<' not " - L"immediately"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "', '>' expected to follow '[' or to follow '<' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -344,29 +344,29 @@ StreamedType Stream::get() { continue; case L'<': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '>' expected to follow '[' or to follow '<' not " - L"immediately"; + << "', '>' expected to follow '[' or to follow '<' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } ThePreviousCase = PreviousCaseType(Character_); continue; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '>' expected to follow '[' or to follow '<' not " - L"immediately"; + << "', '>' expected to follow '[' or to follow '<' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } std::abort(); case L'#': - //std::wcerr << L"[391] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ; + //std::cerr << "[391] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { case L'[': @@ -377,42 +377,42 @@ StreamedType Stream::get() { continue; case L'/': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '#' expected to follow '[', ']', or '$', to " - L"follow '>' immediately, or to follow '/' not " - L"immediately"; + << "', '#' expected to follow '[', ']', or '$', to " + "follow '>' immediately, or to follow '/' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; case L'>': if (!ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' not immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' not immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '#' expected to follow '[', ']', or '$', to " - L"follow '>' immediately, or to follow '/' not " - L"immediately"; + << "', '#' expected to follow '[', ']', or '$', to " + "follow '>' immediately, or to follow '/' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '#' expected to follow '[', ']', or '$', to follow " - L"'>' immediately, or to follow '/' not immediately"; + << "', '#' expected to follow '[', ']', or '$', to follow " + "'>' immediately, or to follow '/' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } ThePreviousCase = PreviousCaseType(Character_); push_back_Character(TheStreamedType, Lemma, Character_); - //std::wcerr << L"[440] Character: " << Character_ << L"||| Lemma: " << Lemma << std::endl ; + //std::cerr << "[440] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; continue; } @@ -430,36 +430,36 @@ StreamedType Stream::get() { continue; case L'>': if (!ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' not immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' not immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '+' expected to follow '[', ']', '^', '/' or " - L"'$', to follow '>' immediately, or to follow '#' " - L"not immediately"; + << "', '+' expected to follow '[', ']', '^', '/' or " + "'$', to follow '>' immediately, or to follow '#' " + "not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; case L'#': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '+' expected to follow '[', ']', or '$', to " - L"follow '>' immediately, or to follow '#' not " - L"immediately"; + << "', '+' expected to follow '[', ']', or '$', to " + "follow '>' immediately, or to follow '#' not " + "immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; default: { - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '+' expected to follow '[', ']', or '$', to follow " - L"'>' immediately, or to follow '#' not immediately"; + << "', '+' expected to follow '[', ']', or '$', to follow " + "'>' immediately, or to follow '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } } @@ -474,10 +474,10 @@ StreamedType Stream::get() { continue; case L'$': if (!ThePreviousCase) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"', '$' expected to follow '[', to follow '>' " - L"immediately, or to follow '*' or '#' not immediately"; + std::stringstream Message; + Message << "unexpected '" << Character_ + << "', '$' expected to follow '[', to follow '>' " + "immediately, or to follow '*' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -487,55 +487,55 @@ StreamedType Stream::get() { continue; case L'*': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '$' expected to follow '[', to follow '>' " - L"immediately, or to follow '*' or '#' not immediately"; + << "', '$' expected to follow '[', to follow '>' " + "immediately, or to follow '*' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } if (TheFlags.getDebug()) { if (Lemma != TheStreamedType.TheLexicalUnit->TheSurfaceForm) - std::wcerr << L"unexpected lemma \"" << Lemma - << L"\", expected \"" + std::cerr << "unexpected lemma \"" << Lemma + << "\", expected \"" << TheStreamedType.TheLexicalUnit->TheSurfaceForm - << L"\"\n"; + << "\"\n"; } ThePreviousCase = PreviousCaseType(Character_); return TheStreamedType; case L'>': if (!ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' not immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' not immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '$' expected to follow '[', to follow '>' " - L"immediately, or to follow '*' or '#' not immediately"; + << "', '$' expected to follow '[', to follow '>' " + "immediately, or to follow '*' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; case L'#': if (ThePreviousCase->isPreviousCharacter) { - std::wstringstream Message; - Message << L"unexpected '" << Character_ - << L"' immediately following '" + std::stringstream Message; + Message << "unexpected '" << Character_ + << "' immediately following '" << ThePreviousCase->ThePreviousCase - << L"', '$' expected to follow '[', to follow '>' " - L"immediately, or to follow '*' or '#' not immediately"; + << "', '$' expected to follow '[', to follow '>' " + "immediately, or to follow '*' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } break; default: - std::wstringstream Message; - Message << L"unexpected '" << Character_ << L"' following '" + std::stringstream Message; + Message << "unexpected '" << Character_ << "' following '" << ThePreviousCase->ThePreviousCase - << L"', '$' expected to follow '[', to follow '>' " - L"immediately, or to follow '*' or '#' not immediately"; + << "', '$' expected to follow '[', to follow '>' " + "immediately, or to follow '*' or '#' not immediately"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } @@ -549,10 +549,10 @@ StreamedType Stream::get() { case L'$': break; default: - std::wstringstream Message; - Message << L"unexpected newline following '" + std::stringstream Message; + Message << "unexpected newline following '" << ThePreviousCase->ThePreviousCase - << L"', newline expected to follow '[', ']', or '$'"; + << "', newline expected to follow '[', ']', or '$'"; throw Exception::Stream::UnexpectedCase(Message_what(Message)); } } @@ -576,10 +576,10 @@ StreamedType Stream::get() { case L'$': break; default: - std::wstringstream Message; - Message << L"unexpected end-of-file following '" + std::stringstream Message; + Message << "unexpected end-of-file following '" << ThePreviousCase->ThePreviousCase - << L"', end-of-file expected to follow ']' or '$'"; + << "', end-of-file expected to follow ']' or '$'"; throw Exception::Stream::UnexpectedEndOfFile(Message_what(Message)); } } @@ -619,23 +619,23 @@ void Stream::outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, std::wostream &output, TaggerFlags &flags) { using namespace std::rel_ops; - output << L"^"; + output << "^"; if (lexical_unit.TheAnalyses.empty() || !analysis) { if (flags.getShowSuperficial()) - output << lexical_unit.TheSurfaceForm << L"/"; + output << lexical_unit.TheSurfaceForm << "/"; - output << L"*" << lexical_unit.TheSurfaceForm << L"$"; + output << "*" << lexical_unit.TheSurfaceForm << "$"; return; } if (flags.getMark()) { if (lexical_unit.TheAnalyses.size() != 1) - output << L"="; + output << "="; } if (flags.getShowSuperficial()) - output << lexical_unit.TheSurfaceForm << L"/"; + output << lexical_unit.TheSurfaceForm << "/"; output << *analysis; @@ -645,11 +645,11 @@ void Stream::outputLexicalUnit( // Call .end() each iteration to save memory. other_analysis != lexical_unit.TheAnalyses.end(); ++other_analysis) { if (*other_analysis != *analysis) - output << L"/" << *other_analysis; + output << "/" << *other_analysis; } } - output << L"$"; + output << "$"; } Stream::PreviousCaseType::PreviousCaseType(const wchar_t &PreviousCase_) @@ -660,12 +660,12 @@ bool Stream::is_eof_throw_if_not_TheCharacterStream_good() const { return true; if (!TheCharacterStream) { - std::wcerr << L"State bad " << TheCharacterStream.good() << " " + std::cerr << "State bad " << TheCharacterStream.good() << " " << TheCharacterStream.eof() << " " << TheCharacterStream.fail() << " " << TheCharacterStream.bad() << "\n"; - std::wstringstream Message; - Message << L"can't get const wchar_t: TheCharacterStream not good"; + std::stringstream Message; + Message << "can't get const wchar_t: TheCharacterStream not good"; throw Exception::Stream::TheCharacterStream_not_good( Message_what(Message)); } @@ -673,28 +673,28 @@ bool Stream::is_eof_throw_if_not_TheCharacterStream_good() const { return false; } -std::wstring Stream::Message_what(const std::wstringstream &Message) const { - std::wstringstream what_; +UString Stream::Message_what(const std::stringstream &Message) const { + std::stringstream what_; if (TheFilename) - what_ << std::wstring(TheFilename->begin(), TheFilename->end()) << L": "; + what_ << UString(TheFilename->begin(), TheFilename->end()) << ": "; - what_ << TheLineNumber << L":" << TheLine.size() << L": " << Message.str() - << L'\n' << TheLine << L'\n' << std::wstring(TheLine.size() - 1, L' ') + what_ << TheLineNumber << ":" << TheLine.size() << ": " << Message.str() + << L'\n' << TheLine << L'\n' << UString(TheLine.size() - 1, L' ') << L'^'; return what_.str(); } bool Stream::is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, - std::wstring &Lemma, + UString &Lemma, const wchar_t &Character_) { if (isTheCharacterStream_eof(StreamedType_, Lemma, Character_)) return true; if (!TheCharacterStream) { - std::wstringstream Message; - Message << L"can't get const wchar_t: TheCharacterStream not good"; + std::stringstream Message; + Message << "can't get const wchar_t: TheCharacterStream not good"; throw Exception::Stream::TheCharacterStream_not_good( Message_what(Message)); } @@ -703,7 +703,7 @@ Stream::is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, } bool Stream::isTheCharacterStream_eof(StreamedType &StreamedType_, - std::wstring &Lemma, + UString &Lemma, const wchar_t &Character_) { if (TheCharacterStream.eof()) return true; @@ -720,7 +720,7 @@ bool Stream::isTheCharacterStream_eof(StreamedType &StreamedType_, } void Stream::push_back_Character(StreamedType &StreamedType_, - std::wstring &Lemma, + UString &Lemma, const wchar_t &Character_) { if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { @@ -766,9 +766,9 @@ void Stream::push_back_Character(StreamedType &StreamedType_, StreamedType_.TheString += Character_; break; default: - std::wstringstream Message; - Message << L"unexpected previous reserved or special character '" - << ThePreviousCase->ThePreviousCase << L"'"; + std::stringstream Message; + Message << "unexpected previous reserved or special character '" + << ThePreviousCase->ThePreviousCase << "'"; throw Exception::Stream::UnexpectedPreviousCase(Message_what(Message)); } @@ -779,7 +779,7 @@ void Stream::push_back_Character(StreamedType &StreamedType_, StreamedType_.TheString += Character_; } -void Stream::case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma, +void Stream::case_0x5c(StreamedType &StreamedType_, UString &Lemma, const wchar_t &Character_) { push_back_Character(StreamedType_, Lemma, Character_); @@ -788,9 +788,9 @@ void Stream::case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma, if (is_eof_throw_if_not_TheCharacterStream_good(StreamedType_, Lemma, Character_)) { - std::wstringstream Message; - Message << L"unexpected end-of-file following '\\', end-of-file " - L"expected to follow ']' or '$'"; + std::stringstream Message; + Message << "unexpected end-of-file following '\\', end-of-file " + "expected to follow ']' or '$'"; throw Exception::Stream::UnexpectedEndOfFile(Message_what(Message)); } diff --git a/apertium/stream.h b/apertium/stream.h index e1603ea..2836810 100644 --- a/apertium/stream.h +++ b/apertium/stream.h @@ -54,19 +54,19 @@ private: bool isPreviousCharacter : 1; }; bool is_eof_throw_if_not_TheCharacterStream_good() const; - std::wstring Message_what(const std::wstringstream &Message) const; + UString Message_what(const std::stringstream &Message) const; bool is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, - std::wstring &Lemma, + UString &Lemma, const wchar_t &Character_); bool isTheCharacterStream_eof(StreamedType &StreamedType_, - std::wstring &Lemma, const wchar_t &Character_); - void push_back_Character(StreamedType &StreamedType_, std::wstring &Lemma, + UString &Lemma, const wchar_t &Character_); + void push_back_Character(StreamedType &StreamedType_, UString &Lemma, const wchar_t &Character_); - void case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma, + void case_0x5c(StreamedType &StreamedType_, UString &Lemma, const wchar_t &Character_); std::wistream &TheCharacterStream; Optional TheFilename; - std::wstring TheLine; + UString TheLine; TaggerFlags &TheFlags; bool private_flush_ : 1; Optional ThePreviousCase; diff --git a/apertium/streamed_type.h b/apertium/streamed_type.h index f73ec21..8a70b44 100644 --- a/apertium/streamed_type.h +++ b/apertium/streamed_type.h @@ -24,7 +24,7 @@ namespace Apertium { class StreamedType { public: - std::wstring TheString; + UString TheString; Optional TheLexicalUnit; }; } diff --git a/apertium/string_utils.cc b/apertium/string_utils.cc index 8ae064b..f6bae45 100644 --- a/apertium/string_utils.cc +++ b/apertium/string_utils.cc @@ -26,12 +26,12 @@ #endif //Delete white spaces from the end and the begining of the string -wstring -StringUtils::trim(wstring const &str) +UString +StringUtils::trim(UString const &str) { - if(str == L"") + if(str.empty()) { - return L""; + return ""_u; } int begin = 0, end = str.size() - 1; @@ -54,13 +54,13 @@ StringUtils::trim(wstring const &str) return str.substr(begin, end-begin); } -vector -StringUtils::split_wstring(wstring const &input, wstring const &delimiter) +vector +StringUtils::split_UString(UString const &input, UString const &delimiter) { unsigned pos; int new_pos; - vector result; - wstring s = L""; + vector result; + UString s = ""; pos=0; while(pos const &v) +UString +StringUtils::vector2UString(vector const &v) { - wstring s = L""; + UString s = ""; for(unsigned i=0; i0) @@ -93,12 +93,12 @@ StringUtils::vector2wstring(vector const &v) return s; } -wstring -StringUtils::substitute(wstring const &source, wstring const &olds, wstring const &news) { - wstring s = source; +UString +StringUtils::substitute(UString const &source, UString const &olds, UString const &news) { + UString s = source; unsigned int p=s.find(olds , 0); - while (p!=static_cast(wstring::npos)) + while (p!=static_cast(UString::npos)) { s.replace(p, olds.length(), news); p+=news.length(); @@ -108,7 +108,7 @@ StringUtils::substitute(wstring const &source, wstring const &olds, wstring cons return s; } -wstring +UString StringUtils::itoa(int n) { return XMLParseUtil::stows(itoa_string(n)); @@ -122,7 +122,7 @@ StringUtils::itoa_string(int n) return str; } -wstring +UString StringUtils::ftoa(double f) { char str[256]; @@ -130,10 +130,10 @@ StringUtils::ftoa(double f) return XMLParseUtil::stows(str); } -wstring -StringUtils::tolower(wstring const &s) +UString +StringUtils::tolower(UString const &s) { - wstring l=s; + UString l=s; for(unsigned i=0; i #include #include +#include using namespace std; @@ -37,24 +38,24 @@ namespace Apertium class StringUtils { public: - static wstring trim(wstring const &str); + static UString trim(UString const &str); - static vector split_wstring(wstring const &input, wstring const &delimiter); + static vector split_UString(UString const &input, UString const &delimiter); - static wstring vector2wstring(vector const &v); + static UString vector2UString(vector const &v); //Replace each ocurrence of the string 'olds' by the string 'news' in string 'source' - static wstring substitute(const wstring &source, const wstring &olds, const wstring &news); + static UString substitute(const UString &source, const UString &olds, const UString &news); - static wstring itoa(int n); + static UString itoa(int n); static string itoa_string(int n); - static wstring ftoa(double f); + static UString ftoa(double f); - static wstring tolower(wstring const &s); + static UString tolower(UString const &s); - static wstring toupper(wstring const &s); + static UString toupper(UString const &s); }; std::wostream & operator<< (std::wostream & ostr, std::string const & str); diff --git a/apertium/tag.cc b/apertium/tag.cc index 0aba018..33ad0a3 100644 --- a/apertium/tag.cc +++ b/apertium/tag.cc @@ -24,11 +24,11 @@ bool operator==(const Tag &a, const Tag &b) { return a.TheTag == b.TheTag; } bool operator<(const Tag &a, const Tag &b) { return a.TheTag < b.TheTag; } -Tag::operator std::wstring() const { +Tag::operator UString() const { if (TheTag.empty()) throw Exception::Tag::TheTags_empty("can't convert Tag comprising empty " - "TheTag std::wstring to std::wstring"); + "TheTag UString to UString"); - return L"<" + TheTag + L">"; + return "<" + TheTag + ">"; } } diff --git a/apertium/tag.h b/apertium/tag.h index 147ea0d..62698e8 100644 --- a/apertium/tag.h +++ b/apertium/tag.h @@ -16,15 +16,15 @@ #ifndef TAG_H #define TAG_H -#include +#include namespace Apertium { class Tag { public: friend bool operator==(const Tag &a, const Tag &b); friend bool operator<(const Tag &a, const Tag &b); - operator std::wstring() const; - std::wstring TheTag; + operator UString() const; + UString TheTag; }; } diff --git a/apertium/tagger.cc b/apertium/tagger.cc index a7f3cc0..bb78420 100644 --- a/apertium/tagger.cc +++ b/apertium/tagger.cc @@ -285,7 +285,7 @@ apertium_tagger::apertium_tagger(int &argc, char **&argv) std::abort(); } } catch (const ExceptionType &ExceptionType_) { - std::wcerr << "apertium-tagger: " << ExceptionType_.what() << std::endl; + std::cerr << "apertium-tagger: " << ExceptionType_.what() << std::endl; throw Exception::apertium_tagger::err_Exception(""); } } @@ -294,7 +294,7 @@ apertium_tagger::~apertium_tagger() {} void apertium_tagger::help() { - std::wcerr << + std::cerr << "Usage: apertium-tagger [OPTION]... -g SERIALISED_TAGGER \\\n" " [INPUT \\\n" " [OUTPUT]]\n" @@ -339,27 +339,27 @@ void apertium_tagger::help() { options_description_.push_back(std::make_pair("-p, --show-superficial", "with -g, output each lexical unit's surface form")); options_description_.push_back(std::make_pair("-z, --null-flush", "with -g, flush the output after getting each null character")); align::align_(options_description_); - std::wcerr << '\n'; + std::cerr << '\n'; options_description_.clear(); options_description_.push_back(std::make_pair("-u, --unigram=MODEL", "use unigram algorithm MODEL from ")); align::align_(options_description_); - std::wcerr << '\n'; + std::cerr << '\n'; options_description_.clear(); options_description_.push_back(std::make_pair("-w, --sliding-window", "use the Light Sliding Window algorithm")); options_description_.push_back(std::make_pair("-x, --perceptron", "use the averaged perceptron algorithm")); options_description_.push_back(std::make_pair("-e, --skip-on-error", "with -xs, ignore certain types of errors with the training corpus")); align::align_(options_description_); - std::wcerr << '\n'; + std::cerr << '\n'; options_description_.clear(); options_description_.push_back(std::make_pair("-g, --tagger", "disambiguate the input")); align::align_(options_description_); - std::wcerr << '\n'; + std::cerr << '\n'; options_description_.clear(); options_description_.push_back(std::make_pair("-r, --retrain=ITERATIONS", "with -u: exit;\notherwise: retrain the tagger with ITERATIONS unsupervised iterations")); options_description_.push_back(std::make_pair("-s, --supervised=ITERATIONS", "with -u: train the tagger with a hand-tagged corpus;\nwith -w: exit;\notherwise: initialise the tagger with a hand-tagged corpus and retrain it with ITERATIONS unsupervised iterations")); options_description_.push_back(std::make_pair("-t, --train=ITERATIONS", "with -u: exit;\notherwise: train the tagger with ITERATIONS unsupervised iterations")); align::align_(options_description_); - std::wcerr << '\n'; + std::cerr << '\n'; options_description_.clear(); options_description_.push_back(std::make_pair("-h, --help", "display this help and exit")); align::align_(options_description_); @@ -550,7 +550,7 @@ void apertium_tagger::init_FILE_Tagger(FILE_Tagger &FILE_Tagger_, string const & MorphoStream* apertium_tagger::setup_untagged_morpho_stream( FILE_Tagger &FILE_Tagger_, char *DicFn, char *UntaggedFn, - FILE **Dictionary, FILE **UntaggedCorpus) { + FILE **Dictionary, UFILE* *UntaggedCorpus) { if (*TheFunctionType != Retrain) { *Dictionary = try_open_file_utf8("DICTIONARY", DicFn, "r"); } @@ -558,12 +558,12 @@ MorphoStream* apertium_tagger::setup_untagged_morpho_stream( FILE_Tagger_.read_dictionary(*Dictionary); - return new FileMorphoStream(*UntaggedCorpus, true, &FILE_Tagger_.get_tagger_data()); + return new FileMorphoStream(UntaggedFn, true, &FILE_Tagger_.get_tagger_data()); } void apertium_tagger::close_untagged_files( char *DicFn, char *UntaggedFn, - FILE *Dictionary, FILE *UntaggedCorpus) { + FILE *Dictionary, UFILE* UntaggedCorpus) { if (*TheFunctionType == Supervised || *TheFunctionType == Train) { try_close_file("DICTIONARY", DicFn, Dictionary); } @@ -591,7 +591,7 @@ void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { } if (nonoptarg < 2) { Stream Input(TheFlags); - StreamTagger_.tag(Input, std::wcout); + StreamTagger_.tag(Input, std::cout); return; } @@ -600,7 +600,7 @@ void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { if (nonoptarg < 3) { Stream Input(TheFlags, Input_stream, argv[optind + 1]); - StreamTagger_.tag(Input, std::wcout); + StreamTagger_.tag(Input, std::cout); return; } @@ -664,12 +664,12 @@ void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { if (nonoptarg < 2) FILE_Tagger_.tagger(stdin, stdout); else { - FILE *Input = try_open_file("INPUT", argv[optind + 1], "r"); + UFILE* Input = try_open_file("INPUT", argv[optind + 1], "r"); if (nonoptarg < 3) FILE_Tagger_.tagger(Input, stdout); else { - FILE *Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); + UFILE* Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); FILE_Tagger_.tagger(Input, Output); try_close_file("OUTPUT", argv[optind + 2], Output); } @@ -697,7 +697,7 @@ void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); - FILE *UntaggedCorpus; + UFILE* UntaggedCorpus; MorphoStream* ms = setup_untagged_morpho_stream( FILE_Tagger_, NULL, UntaggedFn, @@ -732,12 +732,13 @@ void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { &TsxFn, &ProbFn); init_FILE_Tagger(FILE_Tagger_, TsxFn); - FILE *Dictionary, *UntaggedCorpus; + FILE *Dictionary; + UFILE* UntaggedCorpus; MorphoStream* ms = setup_untagged_morpho_stream( FILE_Tagger_, DicFn, UntaggedFn, &Dictionary, &UntaggedCorpus); - FILE *TaggedCorpus = try_open_file("TAGGED_CORPUS", TaggedFn, "r"); + UFILE* TaggedCorpus = try_open_file("TAGGED_CORPUS", TaggedFn, "r"); FileMorphoStream tms(TaggedCorpus, true, &FILE_Tagger_.get_tagger_data()); FILE_Tagger_.init_probabilities_from_tagged_text_(tms, *ms); @@ -748,7 +749,7 @@ void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { Dictionary, UntaggedCorpus); if (do_unsup) { - FILE *Corpus = try_open_file_utf8("CORPUS", CrpFn, "r"); + UFILE* Corpus = try_open_file_utf8("CORPUS", CrpFn, "r"); FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); try_close_file("CORPUS", CrpFn, Corpus); } @@ -773,7 +774,8 @@ void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { &TsxFn, &ProbFn); init_FILE_Tagger(FILE_Tagger_, TsxFn); - FILE *Dictionary, *UntaggedCorpus; + FILE *Dictionary; + UFILE* UntaggedCorpus; MorphoStream* ms = setup_untagged_morpho_stream( FILE_Tagger_, DicFn, UntaggedFn, diff --git a/apertium/tagger.h b/apertium/tagger.h index 1d00620..18be1e0 100644 --- a/apertium/tagger.h +++ b/apertium/tagger.h @@ -62,10 +62,10 @@ private: MorphoStream* setup_untagged_morpho_stream( FILE_Tagger &FILE_Tagger_, char *DicFn, char *UntaggedFn, - FILE **Dictionary, FILE **UntaggedCorpus); + FILE **Dictionary, UFILE **UntaggedCorpus); void close_untagged_files( char *DicFn, char *UntaggedFn, - FILE *Dictionary, FILE *UntaggedCorpus); + FILE *Dictionary, UFILE *UntaggedCorpus); void g_StreamTagger(StreamTagger &StreamTagger_); void s_StreamTaggerTrainer(StreamTagger &StreamTaggerTrainer_); diff --git a/apertium/tagger_data.cc b/apertium/tagger_data.cc index 759642d..a333e13 100644 --- a/apertium/tagger_data.cc +++ b/apertium/tagger_data.cc @@ -94,38 +94,38 @@ TaggerData::setForbidRules(vector &fr) forbid_rules = fr; } -map & +map & TaggerData::getTagIndex() { return tag_index; } -const map & +const map & TaggerData::getTagIndex() const { return tag_index; } void -TaggerData::setTagIndex(map const &ti) +TaggerData::setTagIndex(map const &ti) { tag_index = ti; } -vector & +vector & TaggerData::getArrayTags() { return array_tags; } -const vector & +const vector & TaggerData::getArrayTags() const { return array_tags; } void -TaggerData::setArrayTags(vector const &at) +TaggerData::setArrayTags(vector const &at) { array_tags = at; } @@ -148,38 +148,38 @@ TaggerData::setEnforceRules(vector const &tear) enforce_rules = tear; } -vector & +vector & TaggerData::getPreferRules() { return prefer_rules; } -const vector & +const vector & TaggerData::getPreferRules() const { return prefer_rules; } void -TaggerData::setPreferRules(vector const &pr) +TaggerData::setPreferRules(vector const &pr) { prefer_rules = pr; } -vector & +vector & TaggerData::getDiscardRules() { return discard; } -const vector & +const vector & TaggerData::getDiscardRules() const { return discard; } void -TaggerData::setDiscardRules(vector const &v) +TaggerData::setDiscardRules(vector const &v) { discard = v; } @@ -233,7 +233,7 @@ TaggerData::setPatternList(PatternList const &pl) } void -TaggerData::addDiscard(wstring const &tags) +TaggerData::addDiscard(UString const &tags) { discard.push_back(tags); } diff --git a/apertium/tagger_data.h b/apertium/tagger_data.h index 2190c8d..beba330 100644 --- a/apertium/tagger_data.h +++ b/apertium/tagger_data.h @@ -35,14 +35,14 @@ class TaggerData protected: set open_class; vector forbid_rules; - map tag_index; - vector array_tags; + map tag_index; + vector array_tags; vector enforce_rules; - vector prefer_rules; + vector prefer_rules; ConstantManager constants; Collection output; PatternList plist; - vector discard; + vector discard; void copy(TaggerData const &o); public: @@ -59,25 +59,25 @@ public: const vector & getForbidRules() const; void setForbidRules(vector &fr); - map & getTagIndex(); - const map & getTagIndex() const; - void setTagIndex(map const &ti); + map & getTagIndex(); + const map & getTagIndex() const; + void setTagIndex(map const &ti); - vector & getArrayTags(); - const vector & getArrayTags() const; - void setArrayTags(vector const &at); + vector & getArrayTags(); + const vector & getArrayTags() const; + void setArrayTags(vector const &at); vector & getEnforceRules(); const vector & getEnforceRules() const; void setEnforceRules(vector const &tear); - vector & getPreferRules(); - const vector & getPreferRules() const; - void setPreferRules(vector const &pr); + vector & getPreferRules(); + const vector & getPreferRules() const; + void setPreferRules(vector const &pr); - vector & getDiscardRules(); - const vector & getDiscardRules() const; - void setDiscardRules(vector const &dr); + vector & getDiscardRules(); + const vector & getDiscardRules() const; + void setDiscardRules(vector const &dr); ConstantManager & getConstants(); const ConstantManager & getConstants() const; @@ -91,7 +91,7 @@ public: PatternList & getPatternList(); const PatternList & getPatternList() const; - void addDiscard(wstring const &tags); + void addDiscard(UString const &tags); }; #endif diff --git a/apertium/tagger_data_hmm.cc b/apertium/tagger_data_hmm.cc index 0b60776..f81d88f 100644 --- a/apertium/tagger_data_hmm.cc +++ b/apertium/tagger_data_hmm.cc @@ -191,13 +191,13 @@ TaggerDataHMM::read(FILE *in) // array_tags for(int i = Compression::multibyte_read(in); i != 0; i--) { - array_tags.push_back(Compression::wstring_read(in)); + array_tags.push_back(Compression::string_read(in)); } // tag_index for(int i = Compression::multibyte_read(in); i != 0; i--) { - wstring tmp = Compression::wstring_read(in); + UString tmp = Compression::string_read(in); tag_index[tmp] = Compression::multibyte_read(in); } @@ -216,7 +216,7 @@ TaggerDataHMM::read(FILE *in) // prefer_rules for(int i = Compression::multibyte_read(in); i != 0; i--) { - prefer_rules.push_back(Compression::wstring_read(in)); + prefer_rules.push_back(Compression::string_read(in)); } // constants @@ -280,7 +280,7 @@ TaggerDataHMM::read(FILE *in) for(unsigned int i = 0; i < limit; i++) { - discard.push_back(Compression::wstring_read(in)); + discard.push_back(Compression::string_read(in)); } } @@ -310,16 +310,14 @@ TaggerDataHMM::write(FILE *out) Compression::multibyte_write(array_tags.size(), out); for(unsigned int i = 0, limit = array_tags.size(); i != limit; i++) { - Compression::wstring_write(array_tags[i], out); + Compression::string_write(array_tags[i], out); } // tag_index Compression::multibyte_write(tag_index.size(), out); - for(map::iterator it = tag_index.begin(), limit = tag_index.end(); - it != limit; it++) - { - Compression::wstring_write(it->first, out); - Compression::multibyte_write(it->second, out); + for (auto& it : tag_index) { + Compression::string_write(it.first, out); + Compression::multibyte_write(it.second, out); } // enforce_rules @@ -338,7 +336,7 @@ TaggerDataHMM::write(FILE *out) Compression::multibyte_write(prefer_rules.size(), out); for(unsigned int i = 0, limit = prefer_rules.size(); i != limit; i++) { - Compression::wstring_write(prefer_rules[i], out); + Compression::string_write(prefer_rules[i], out); } // constants @@ -396,7 +394,7 @@ TaggerDataHMM::write(FILE *out) Compression::multibyte_write(discard.size(), out); for(unsigned int i = 0, limit = discard.size(); i != limit; i++) { - Compression::wstring_write(discard[i], out); + Compression::string_write(discard[i], out); } } } diff --git a/apertium/tagger_data_lsw.cc b/apertium/tagger_data_lsw.cc index 3a79aca..ccd068a 100644 --- a/apertium/tagger_data_lsw.cc +++ b/apertium/tagger_data_lsw.cc @@ -136,13 +136,13 @@ TaggerDataLSW::read(FILE *in) // array_tags for(int i = Compression::multibyte_read(in); i != 0; i--) { - array_tags.push_back(Compression::wstring_read(in)); + array_tags.push_back(Compression::string_read(in)); } // tag_index for(int i = Compression::multibyte_read(in); i != 0; i--) { - wstring tmp = Compression::wstring_read(in); + UString tmp = Compression::string_read(in); tag_index[tmp] = Compression::multibyte_read(in); } @@ -161,7 +161,7 @@ TaggerDataLSW::read(FILE *in) // prefer_rules for(int i = Compression::multibyte_read(in); i != 0; i--) { - prefer_rules.push_back(Compression::wstring_read(in)); + prefer_rules.push_back(Compression::string_read(in)); } // constants @@ -212,7 +212,7 @@ TaggerDataLSW::read(FILE *in) for(unsigned int i = 0; i < limit; i++) { - discard.push_back(Compression::wstring_read(in)); + discard.push_back(Compression::string_read(in)); } } @@ -242,16 +242,14 @@ TaggerDataLSW::write(FILE *out) Compression::multibyte_write(array_tags.size(), out); for(unsigned int i = 0, limit = array_tags.size(); i != limit; i++) { - Compression::wstring_write(array_tags[i], out); + Compression::string_write(array_tags[i], out); } // tag_index Compression::multibyte_write(tag_index.size(), out); - for(map::iterator it = tag_index.begin(), limit = tag_index.end(); - it != limit; it++) - { - Compression::wstring_write(it->first, out); - Compression::multibyte_write(it->second, out); + for(auto& it : tag_index) { + Compression::string_write(it.first, out); + Compression::multibyte_write(it.second, out); } // enforce_rules @@ -270,7 +268,7 @@ TaggerDataLSW::write(FILE *out) Compression::multibyte_write(prefer_rules.size(), out); for(unsigned int i = 0, limit = prefer_rules.size(); i != limit; i++) { - Compression::wstring_write(prefer_rules[i], out); + Compression::string_write(prefer_rules[i], out); } // constants @@ -317,7 +315,7 @@ TaggerDataLSW::write(FILE *out) Compression::multibyte_write(discard.size(), out); for(unsigned int i = 0, limit = discard.size(); i != limit; i++) { - Compression::wstring_write(discard[i], out); + Compression::string_write(discard[i], out); } } } diff --git a/apertium/tagger_data_percep_coarse_tags.cc b/apertium/tagger_data_percep_coarse_tags.cc index aa3080d..3e5496f 100644 --- a/apertium/tagger_data_percep_coarse_tags.cc +++ b/apertium/tagger_data_percep_coarse_tags.cc @@ -27,8 +27,8 @@ TaggerDataPercepCoarseTags::~TaggerDataPercepCoarseTags() {} void TaggerDataPercepCoarseTags::serialise(std::ostream &serialised) const { Serialiser >::serialise(open_class, serialised); - Serialiser >::serialise(array_tags, serialised); - Serialiser >::serialise(tag_index, serialised); + Serialiser >::serialise(array_tags, serialised); + Serialiser >::serialise(tag_index, serialised); constants.serialise(serialised); output.serialise(serialised); plist.serialise(serialised); @@ -37,14 +37,14 @@ void TaggerDataPercepCoarseTags::serialise(std::ostream &serialised) const void TaggerDataPercepCoarseTags::deserialise(std::istream &serialised) { open_class = Deserialiser >::deserialise(serialised); - array_tags = Deserialiser >::deserialise(serialised); - tag_index = Deserialiser >::deserialise(serialised); + array_tags = Deserialiser >::deserialise(serialised); + tag_index = Deserialiser >::deserialise(serialised); constants.deserialise(serialised); output.deserialise(serialised); plist.deserialise(serialised); } -const wstring& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd) const +const UString& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd) const { // Init fine -> coarse tags matching machinary MatchState ms; @@ -52,7 +52,7 @@ const wstring& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd const Alphabet alphabet = plist.getAlphabet(); int ca_any_char = alphabet(PatternList::ANY_CHAR); int ca_any_tag = alphabet(PatternList::ANY_TAG); - map::const_iterator undef_it = tag_index.find(L"TAG_kUNDEF"); + map::const_iterator undef_it = tag_index.find("TAG_kUNDEF"); int ca_tag_kundef = undef_it->second; // Input lemma ms.init(me->getInitial()); @@ -61,7 +61,7 @@ const wstring& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd } // Input fine tags for (size_t i = 0; i < wrd.TheTags.size(); i++) { - int symbol = alphabet(L"<" + wrd.TheTags[i].TheTag + L">"); + int symbol = alphabet("<" + wrd.TheTags[i].TheTag + ">"); if (symbol) { ms.step(symbol, ca_any_tag); } diff --git a/apertium/tagger_data_percep_coarse_tags.h b/apertium/tagger_data_percep_coarse_tags.h index 08317bf..6a44b0a 100644 --- a/apertium/tagger_data_percep_coarse_tags.h +++ b/apertium/tagger_data_percep_coarse_tags.h @@ -14,7 +14,7 @@ public: virtual ~TaggerDataPercepCoarseTags(); void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); - const wstring& coarsen(const Apertium::Morpheme &wrd) const; + const UString& coarsen(const Apertium::Morpheme &wrd) const; }; #endif diff --git a/apertium/tagger_utils.cc b/apertium/tagger_utils.cc index 9892866..09e5fc2 100644 --- a/apertium/tagger_utils.cc +++ b/apertium/tagger_utils.cc @@ -39,13 +39,13 @@ wchar_t *_wcstok(wchar_t *wcs, const wchar_t *delim, wchar_t **ptr) { using namespace Apertium; -void tagger_utils::fatal_error (wstring const &s) { - wcerr< v[], int l) { v[i].clear(); } -int tagger_utils::ntokens_multiword(wstring const &s) +int tagger_utils::ntokens_multiword(UString const &s) { wchar_t *news = new wchar_t[s.size()+1]; wcscpy(news, s.c_str()); news[s.size()] = 0; - wcerr << news << endl; + cerr << news << endl; - wchar_t const *delim = L"_"; + wchar_t const *delim = "_"; wchar_t *ptr; int n=0; @@ -86,12 +86,12 @@ int tagger_utils::ntokens_multiword(wstring const &s) return n; } -int tagger_utils::nguiones_fs(wstring const & s) { +int tagger_utils::nguiones_fs(UString const & s) { wchar_t *news = new wchar_t[s.size()+1]; wcscpy(news, s.c_str()); news[s.size()] = 0; - wcerr << news << endl; - wchar_t const *delim = L"-"; + cerr << news << endl; + wchar_t const *delim = "-"; wchar_t *ptr; int n=0; @@ -105,10 +105,10 @@ int tagger_utils::nguiones_fs(wstring const & s) { return n; } -wstring tagger_utils::trim(wstring s) +UString tagger_utils::trim(UString s) { if (s.length()==0) - return L""; + return ""; for (unsigned int i=0; i<(s.length()-1); i++) { if ((s.at(i)==L' ')&&(s.at(i+1)==L' ')) { @@ -142,7 +142,7 @@ void tagger_utils::scan_for_ambg_classes(Collection &output, MorphoStream &morph while (word) { if (++nw % 10000 == 0) - wcerr << L'.' << flush; + cerr << L'.' << flush; tags = word->get_tags(); @@ -152,7 +152,7 @@ void tagger_utils::scan_for_ambg_classes(Collection &output, MorphoStream &morph delete word; word = morpho_stream.get_next_word(); } - wcerr << L"\n"; + cerr << "\n"; } void @@ -198,27 +198,27 @@ tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { void tagger_utils::require_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, int nw) { if (td.getOutput().has_not(tags)) { - wstring errors; - errors = L"A new ambiguity class was found. I cannot continue.\n"; - errors+= L"Word '" + word.get_superficial_form() + L"' not found in the dictionary.\n"; - errors+= L"New ambiguity class: " + word.get_string_tags() + L"\n"; + UString errors; + errors = "A new ambiguity class was found. I cannot continue.\n"; + errors+= "Word '" + word.get_superficial_form() + "' not found in the dictionary.\n"; + errors+= "New ambiguity class: " + word.get_string_tags() + "\n"; if (nw >= 0) { std::wostringstream ws; ws << (nw + 1); - errors+= L"Line number: " + ws.str() + L"\n"; + errors+= "Line number: " + ws.str() + "\n"; } - errors+= L"Take a look at the dictionary, then retrain."; + errors+= "Take a look at the dictionary, then retrain."; fatal_error(errors); } } static void _warn_absent_ambiguity_class(TaggerWord &word) { - wstring errors; - errors = L"A new ambiguity class was found. \n"; - errors += L"Retraining the tagger is necessary so as to take it into account.\n"; - errors += L"Word '" + word.get_superficial_form() + L"'.\n"; - errors += L"New ambiguity class: " + word.get_string_tags() + L"\n"; - wcerr << L"Error: " << errors; + UString errors; + errors = "A new ambiguity class was found. \n"; + errors += "Retraining the tagger is necessary so as to take it into account.\n"; + errors += "Word '" + word.get_superficial_form() + "'.\n"; + errors += "New ambiguity class: " + word.get_string_tags() + "\n"; + cerr << "Error: " << errors; } set & @@ -265,7 +265,7 @@ istream& operator>> (istream& is, map & f) { is>>i; // warning: does not work if both is>>f[i]; // lines merged in a single one } - if (is.bad()) tagger_utils::fatal_error(L"reading map"); + if (is.bad()) tagger_utils::fatal_error("reading map"); return is; } diff --git a/apertium/tagger_utils.h b/apertium/tagger_utils.h index f895735..abb0ef6 100644 --- a/apertium/tagger_utils.h +++ b/apertium/tagger_utils.h @@ -36,7 +36,7 @@ namespace tagger_utils /** Print a fatal error message * @param s the error message to print */ -void fatal_error (wstring const &s); +void fatal_error (UString const &s); /** Print a fatal error message related to a file * @param s the file name to be printted in the error message @@ -63,11 +63,11 @@ void clear_array_vector(vector v[], int l); /** Return the number of tokens in the multiword unit */ - int ntokens_multiword(wstring const &s); + int ntokens_multiword(UString const &s); /** Devuelve el nº de guiones que contiene la cadena pasada como argumento */ -int nguiones_fs(wstring const &cadena); +int nguiones_fs(UString const &cadena); /** Reads the expanded dictionary received as a parameter puts the resulting * ambiguity classes that the tagger will manage. @@ -105,7 +105,7 @@ set & require_similar_ambiguity_class(TaggerData &td, set &tags); /** Just prints a warning if warn */ void warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool warn); -wstring trim(wstring s); +UString trim(UString s); }; diff --git a/apertium/tagger_word.cc b/apertium/tagger_word.cc index e7982ec..add71f3 100644 --- a/apertium/tagger_word.cc +++ b/apertium/tagger_word.cc @@ -24,16 +24,16 @@ using namespace Apertium; bool TaggerWord::generate_marks=false; -vector TaggerWord::array_tags; +vector TaggerWord::array_tags; bool TaggerWord::show_ignored_string=true; -map TaggerWord::patterns; +map TaggerWord::patterns; TaggerWord::TaggerWord(bool prev_plus_cut) : show_sf(false) { - ignored_string = L""; + ignored_string = ""; plus_cut=false; previous_plus_cut=prev_plus_cut; } @@ -62,19 +62,19 @@ TaggerWord::get_show_sf(){ } void -TaggerWord::set_superficial_form(const wstring &sf){ +TaggerWord::set_superficial_form(const UString &sf){ superficial_form = sf; } -wstring& +UString& TaggerWord::get_superficial_form() { return superficial_form; } bool -TaggerWord::match(wstring const &s, wstring const &pattern) +TaggerWord::match(UString const &s, UString const &pattern) { - map::iterator it = patterns.find(pattern); + map::iterator it = patterns.find(pattern); string const utfs = UtfConverter::toUtf8(s); if(it == patterns.end()) @@ -101,7 +101,7 @@ TaggerWord::match(wstring const &s, wstring const &pattern) } void -TaggerWord::add_tag(TTag &t, const wstring &lf, vector const &prefer_rules){ +TaggerWord::add_tag(TTag &t, const UString &lf, vector const &prefer_rules){ //Tag is added only is it is not present yet //Sometime one word can have more than one lexical form assigned to the same tag @@ -132,12 +132,12 @@ TaggerWord::isAmbiguous() const return tags.size() > 1; } -wstring +UString TaggerWord::get_string_tags() { - wstring st; + UString st; set::iterator itag = tags.begin(); - st=L"{"; + st="{"; for(itag=tags.begin(); itag!=tags.end(); itag++) { if (itag!=tags.begin()) st+=L','; @@ -148,9 +148,9 @@ TaggerWord::get_string_tags() { return st; } -wstring +UString TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { - wstring ret= L""; + UString ret= ""; if (show_ignored_string) ret.append(ignored_string); @@ -161,7 +161,7 @@ TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { if (!previous_plus_cut){ if(TaggerWord::generate_marks && isAmbiguous()) { - ret.append(L"^="); + ret.append("^="); } else { @@ -207,9 +207,9 @@ TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { return ret; } -wstring +UString TaggerWord::get_all_chosen_tag_first(TTag &t, int const TAG_kEOF) { - wstring ret=L""; + UString ret=""; if (show_ignored_string) ret.append(ignored_string); @@ -221,7 +221,7 @@ TaggerWord::get_all_chosen_tag_first(TTag &t, int const TAG_kEOF) { { if(TaggerWord::generate_marks && isAmbiguous()) { - ret.append(L"^="); + ret.append("^="); } else { @@ -232,16 +232,16 @@ TaggerWord::get_all_chosen_tag_first(TTag &t, int const TAG_kEOF) { ret.append(superficial_form); if (lexical_forms.size()==0) { // This is an UNKNOWN WORD - ret+=L"/*"; + ret+="/*"; ret.append(superficial_form); } else { - ret+=L"/"; + ret+="/"; ret.append(lexical_forms[t]); if (lexical_forms.size()>1) { set::iterator it; for (it=tags.begin(); it!=tags.end(); it++) { if (*it != t) { - ret+=L"/"; + ret+="/"; ret.append(lexical_forms[*it]); } } @@ -250,9 +250,9 @@ TaggerWord::get_all_chosen_tag_first(TTag &t, int const TAG_kEOF) { if (ret != ignored_string) { if (plus_cut) - ret+=L"+"; + ret+="+"; else { - ret+=L"$"; + ret+="$"; } } @@ -260,18 +260,18 @@ TaggerWord::get_all_chosen_tag_first(TTag &t, int const TAG_kEOF) { } //OBSOLETE -wstring +UString TaggerWord::get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF) { - wstring ret; + UString ret; if(t==TAG_kEOF) return ret; if (lexical_forms.size()==0) { //This is an unknown word - ret.append(L"*^"); + ret.append("*^"); ret.append(superficial_form); } else if ((*lexical_forms.begin()).second[0]=='*') { //This is an unknown word that has been guessed - ret.append(L"*^"); + ret.append("*^"); ret.append(superficial_form); } else { ret += L'^'; @@ -290,7 +290,7 @@ TaggerWord::get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF) } void -TaggerWord::add_ignored_string(wstring const &s) { +TaggerWord::add_ignored_string(UString const &s) { ignored_string.append(s); } @@ -304,14 +304,14 @@ TaggerWord::get_plus_cut() { return plus_cut; } -wostream& -operator<< (wostream& os, TaggerWord &w) { - os< const &at) +TaggerWord::setArrayTags(vector const &at) { array_tags = at; } @@ -319,42 +319,37 @@ TaggerWord::setArrayTags(vector const &at) void TaggerWord::print() { - wcout << L"[#" << superficial_form << L"# "; + cout << "[#" << superficial_form << "# "; for(set::iterator it=tags.begin(), limit = tags.end(); it != limit; it++) { - wcout << L"(" << *it << L" " << lexical_forms[*it] << L") "; + cout << "(" << *it << " " << lexical_forms[*it] << ") "; } - wcout << L"\b]\n"; + cout << "\b]\n"; } void TaggerWord::outputOriginal(FILE *output) { - wstring s=superficial_form; + UString s=superficial_form; - map::iterator it; - for(it=lexical_forms.begin(); it!=lexical_forms.end(); it++) { - if (it->second.length()>0) - { - s+=L'/'; - s.append(it->second); + for (auto& it : lexical_forms) { + if (!it.second.empty()) { + s += '/'; + s.append(it.second); } } - if (s.length()>0) - { - s=L"^"+s+L"$\n"; + if (!s.empty()) { + u_fprintf(output, "^%S$\n", s.c_str()); } - - fputws_unlocked(s.c_str(), output); } void -TaggerWord::discardOnAmbiguity(wstring const &tags) +TaggerWord::discardOnAmbiguity(UString const &tags) { if(isAmbiguous()) { - map::iterator it = lexical_forms.begin(), + map::iterator it = lexical_forms.begin(), limit = lexical_forms.end(); set newsettag; while(it != limit) diff --git a/apertium/tagger_word.h b/apertium/tagger_word.h index 601481e..d817dcb 100644 --- a/apertium/tagger_word.h +++ b/apertium/tagger_word.h @@ -36,12 +36,12 @@ using namespace std; */ class TaggerWord{ private: - wstring superficial_form; + UString superficial_form; set tags; //Set of all possible tags - map lexical_forms; //For a given coarse tag it stores the fine tag + map lexical_forms; //For a given coarse tag it stores the fine tag //delevered by the morphological analyzer - wstring ignored_string; + UString ignored_string; bool plus_cut; //Flag to distinguish the way in which the word was ended. //If it was done by '$' its value should be false @@ -50,12 +50,12 @@ private: //previous word was ended. It has the same //plus_cut meaning bool show_sf; // Show the superficial form in the output - static map patterns; + static map patterns; - bool match(wstring const &s, wstring const &pattern); + bool match(UString const &s, UString const &pattern); public: static bool generate_marks; - static vector array_tags; + static vector array_tags; static bool show_ignored_string; @@ -77,47 +77,47 @@ public: /** Set the superficial form of the word. * @param s the superficial form */ - void set_superficial_form(const wstring &s); + void set_superficial_form(const UString &s); /** Get the superficial form of the word * */ - wstring& get_superficial_form(); + UString& get_superficial_form(); /** Add a new tag to the set of all possible tags of the word. * @param t the coarse tag * @param lf the lexical form (fine tag) */ - virtual void add_tag(TTag &t, const wstring &lf, vector const &prefer_rules); + virtual void add_tag(TTag &t, const UString &lf, vector const &prefer_rules); /** Get the set of tags of this word. * @return set of tags. */ virtual set& get_tags(); - /** Get a wstring with the set of tags + /** Get a UString with the set of tags */ - virtual wstring get_string_tags(); + virtual UString get_string_tags(); /** Get the lexical form (fine tag) for a given tag (coarse one) * @param t the tag * @return the lexical form of tag t */ - virtual wstring get_lexical_form(TTag &t, int const TAG_kEOF); + virtual UString get_lexical_form(TTag &t, int const TAG_kEOF); - wstring get_all_chosen_tag_first(TTag &t, int const TAG_kEOF); + UString get_all_chosen_tag_first(TTag &t, int const TAG_kEOF); /** Get the lexical form (fine tag) for a given tag (coarse one) * @param t the tag * @return the lexical form of tag t without other text that * is ignored. */ - wstring get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF); + UString get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF); /** Add text to the ignored string * */ - void add_ignored_string(wstring const &s); + void add_ignored_string(UString const &s); /** Set the flag plus_cut to a certain value. If this flag is set to true means * that there were a '+' between this word and the next one @@ -135,18 +135,18 @@ public: /** Output operator */ - friend wostream& operator<< (wostream& os, TaggerWord &w); + friend ostream& operator<< (ostream& os, TaggerWord &w); - static void setArrayTags(vector const &at); + static void setArrayTags(vector const &at); void print(); - void outputOriginal(FILE *output); + void outputOriginal(UFILE *output); bool isAmbiguous() const; // CAUTION: unknown words are not considered to // be ambiguous by this method - void discardOnAmbiguity(wstring const &tags); + void discardOnAmbiguity(UString const &tags); }; #endif diff --git a/apertium/tmx_aligner_tool.cc b/apertium/tmx_aligner_tool.cc index c595b6e..90b353a 100644 --- a/apertium/tmx_aligner_tool.cc +++ b/apertium/tmx_aligner_tool.cc @@ -28,7 +28,7 @@ void readTrailOrBisentenceList( std::istream& is, Trail& trail ) is >> huPos; if (is.peek()!=' ') { - std::wcerr << "no space in line" << std::endl; + std::cerr << "no space in line" << std::endl; throw "data error"; } is.ignore(); @@ -36,7 +36,7 @@ void readTrailOrBisentenceList( std::istream& is, Trail& trail ) is >> enPos; if (is.peek()!='\n') { - std::wcerr << "too much data in line" << std::endl; + std::cerr << "too much data in line" << std::endl; throw "data error"; } is.ignore(); @@ -99,7 +99,7 @@ void collectBisentences( const Trail& bestTrail, const AlignMatrix& dynMatrix, enBisentences.push_back( enSentenceListPretty[ bisentenceList[i].second ] ); } -// std::wcerr << huBisentences.size() << " bisentences collected." << std::endl; +// std::cerr << huBisentences.size() << " bisentences collected." << std::endl; } @@ -152,11 +152,11 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, setSentenceValues( enSentenceList, enLength, alignParameters.utfCharCountingMode ); bool quasiglobal_stopwordRemoval = false; -// std::wcerr << "quasiglobal_stopwordRemoval is set to " << quasiglobal_stopwordRemoval << std::endl; +// std::cerr << "quasiglobal_stopwordRemoval is set to " << quasiglobal_stopwordRemoval << std::endl; if (quasiglobal_stopwordRemoval) { removeStopwords( huSentenceListPretty, enSentenceList ); -// std::wcerr << "Stopwords removed." << std::endl; +// std::cerr << "Stopwords removed." << std::endl; } SentenceList huSentenceListGarbled, enSentenceListGarbled; @@ -186,9 +186,9 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, if (thickness>maximalThickness) { -// std::wcerr << "WARNING: Downgrading planned thickness " << thickness << " to " << maximalThickness ; -// std::wcerr << " to obey memory constraint of " << quasiglobal_maximalSizeInMegabytes << " megabytes " << std::endl; -// std::wcerr << "You should recompile if you have much more physical RAM than that. People of the near-future, forgive me for the inconvenience." << std::endl; +// std::cerr << "WARNING: Downgrading planned thickness " << thickness << " to " << maximalThickness ; +// std::cerr << " to obey memory constraint of " << quasiglobal_maximalSizeInMegabytes << " megabytes " << std::endl; +// std::cerr << "You should recompile if you have much more physical RAM than that. People of the near-future, forgive me for the inconvenience." << std::endl; thickness = maximalThickness; } @@ -196,20 +196,20 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, AlignMatrix similarityMatrix( huBookSize, enBookSize, thickness, outsideOfRadiusValue ); sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix ); -// std::wcerr << std::endl; -// std::wcerr << "Rough translation-based similarity matrix ready." << std::endl; +// std::cerr << std::endl; +// std::cerr << "Rough translation-based similarity matrix ready." << std::endl; Trail bestTrail; AlignMatrix dynMatrix( huBookSize+1, enBookSize+1, thickness, 1e30 ); align( similarityMatrix, huLength, enLength, bestTrail, dynMatrix ); -// std::wcerr << "Align ready." << std::endl; +// std::cerr << "Align ready." << std::endl; double globalQuality; globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); - // std::wcerr << "Global quality of unfiltered align " << globalQuality << std::endl; + // std::cerr << "Global quality of unfiltered align " << globalQuality << std::endl; if (alignParameters.realignType==AlignParameters::NoRealign) { @@ -222,11 +222,11 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, if (!success) { -// std::wcerr << "Realign zone too close to quasidiagonal border. Abandoning realign. The align itself is suspicious." << std::endl; +// std::cerr << "Realign zone too close to quasidiagonal border. Abandoning realign. The align itself is suspicious." << std::endl; } else { -// std::wcerr << "Border of realign zone determined." << std::endl; +// std::cerr << "Border of realign zone determined." << std::endl; switch (alignParameters.realignType) { @@ -237,24 +237,24 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, SentenceList huBisentences,enBisentences; throw "unimplemented"; -// std::wcerr << "Plausible bisentences filtered." << std::endl; +// std::cerr << "Plausible bisentences filtered." << std::endl; modelOne.build(huBisentences,enBisentences); -// std::wcerr << "IBM Model I ready." << std::endl; +// std::cerr << "IBM Model I ready." << std::endl; sentenceListsToAlignMatrixIBMModelOne( huSentenceListPretty, enSentenceList, modelOne, similarityMatrixDetailed ); -// std::wcerr << "IBM Model I based similarity matrix ready." << std::endl; +// std::cerr << "IBM Model I based similarity matrix ready." << std::endl; break; } case AlignParameters::FineTranslationRealign: { TransLex transLex; transLex.build(dictionary); -// std::wcerr << "Hashtable for dictionary ready." << std::endl; +// std::cerr << "Hashtable for dictionary ready." << std::endl; sentenceListsToAlignMatrixTranslation( huSentenceListPretty, enSentenceList, transLex, similarityMatrixDetailed ); -// std::wcerr << "Fine translation-based similarity matrix ready." << std::endl; +// std::cerr << "Fine translation-based similarity matrix ready." << std::endl; break; } @@ -268,7 +268,7 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, Trail bestTrailDetailed; AlignMatrix dynMatrixDetailed( huBookSize+1, enBookSize+1, thickness, 1e30 ); align( similarityMatrixDetailed, huLength, enLength, bestTrailDetailed, dynMatrixDetailed ); -// std::wcerr << "Detail realign ready." << std::endl; +// std::cerr << "Detail realign ready." << std::endl; bestTrail = bestTrailDetailed; dynMatrix = dynMatrixDetailed; @@ -276,7 +276,7 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); - // std::wcerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; + // std::cerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; } } @@ -285,27 +285,27 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, if ( alignParameters.postprocessTrailQualityThreshold != -1 ) { postprocessTrail( bestTrail, trailScoresInterval, alignParameters.postprocessTrailQualityThreshold ); -// std::wcerr << "Trail start and end postprocessed by score." << std::endl; +// std::cerr << "Trail start and end postprocessed by score." << std::endl; } if ( alignParameters.postprocessTrailStartAndEndQualityThreshold != -1 ) { postprocessTrailStartAndEnd( bestTrail, trailScoresInterval, alignParameters.postprocessTrailStartAndEndQualityThreshold ); -// std::wcerr << "Trail start and end postprocessed by score." << std::endl; +// std::cerr << "Trail start and end postprocessed by score." << std::endl; } if ( alignParameters.postprocessTrailByTopologyQualityThreshold != -1 ) { postprocessTrailByTopology( bestTrail, alignParameters.postprocessTrailByTopologyQualityThreshold ); -// std::wcerr << "Trail postprocessed by topology." << std::endl; +// std::cerr << "Trail postprocessed by topology." << std::endl; } bool quasiglobal_spaceOutBySentenceLength = true; -// std::wcerr << "quasiglobal_spaceOutBySentenceLength is set to " << quasiglobal_spaceOutBySentenceLength << std::endl; +// std::cerr << "quasiglobal_spaceOutBySentenceLength is set to " << quasiglobal_spaceOutBySentenceLength << std::endl; if (quasiglobal_spaceOutBySentenceLength) { spaceOutBySentenceLength( bestTrail, huSentenceListPretty, enSentenceList, alignParameters.utfCharCountingMode ); -// std::wcerr << "Trail spaced out by sentence length." << std::endl; +// std::cerr << "Trail spaced out by sentence length." << std::endl; } // In cautious mode, auto-aligned rundles are thrown away if @@ -313,13 +313,13 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, if (alignParameters.cautiousMode) { cautiouslyFilterTrail( bestTrail ); -// std::wcerr << "Trail filtered by topology." << std::endl; +// std::cerr << "Trail filtered by topology." << std::endl; } globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); - // std::wcerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; + // std::cerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; bool textual = ! alignParameters.justSentenceIds ; @@ -429,17 +429,17 @@ void alignerToolWithFilenames( const DictionaryItems& dictionary, std::ifstream hus(huFilename.c_str()); SentenceList huSentenceListPretty; huSentenceListPretty.readNoIds( hus ); -// std::wcerr << huSentenceListPretty.size() << " hungarian sentences read." << std::endl; +// std::cerr << huSentenceListPretty.size() << " hungarian sentences read." << std::endl; std::ifstream ens(enFilename.c_str()); SentenceList enSentenceList; enSentenceList.readNoIds( ens ); -// std::wcerr << enSentenceList.size() << " english sentences read." << std::endl; +// std::cerr << enSentenceList.size() << " english sentences read." << std::endl; if ( (enSentenceList. size() < huSentenceListPretty.size()/5) || (huSentenceListPretty.size() < enSentenceList. size()/5) ) { -// std::wcerr << "Sizes differing too much. Ignoring files to avoid a rare loop bug." << std::endl; +// std::cerr << "Sizes differing too much. Ignoring files to avoid a rare loop bug." << std::endl; return; } @@ -448,7 +448,7 @@ void alignerToolWithFilenames( const DictionaryItems& dictionary, /* double globalQuality = */alignerToolWithObjects ( dictionary, huSentenceListPretty, enSentenceList, alignParameters, std::cout ); -// std::wcerr << "Quality " << globalQuality << std::endl ; +// std::cerr << "Quality " << globalQuality << std::endl ; } else @@ -458,7 +458,7 @@ void alignerToolWithFilenames( const DictionaryItems& dictionary, ( dictionary, huSentenceListPretty, enSentenceList, alignParameters, os ); // If you want to collect global quality information in batch mode, grep "^Quality" of stderr must do. -// std::wcerr << "Quality\t" << outputFilename << "\t" << globalQuality << std::endl ; +// std::cerr << "Quality\t" << outputFilename << "\t" << globalQuality << std::endl ; } } @@ -474,7 +474,7 @@ void fillPercentParameter( Arguments& args, const std::string& argName, double& void main_alignerToolUsage() { - std::wcerr << "Usage (either):\n\ + std::cerr << "Usage (either):\n\ alignerTool [ common_arguments ] [ -hand=hand_align_file ] dictionary_file source_text target_text\n\ \n\ or:\n\ @@ -586,8 +586,8 @@ int main_alignerTool(int argC, char* argV[]) if (batchMode && (remains.size()!=2) ) { - std::wcerr << "Batch mode requires exactly two file arguments." << std::endl; - std::wcerr << std::endl; + std::cerr << "Batch mode requires exactly two file arguments." << std::endl; + std::cerr << std::endl; main_alignerToolUsage(); throw "argument error"; @@ -598,7 +598,7 @@ int main_alignerTool(int argC, char* argV[]) { if (batchMode) { - std::wcerr << "-batch and -" << handArgumentname << " are incompatible switches." << std::endl; + std::cerr << "-batch and -" << handArgumentname << " are incompatible switches." << std::endl; throw "argument error"; } else @@ -608,7 +608,7 @@ int main_alignerTool(int argC, char* argV[]) if (alignParameters.handAlignFilename.empty()) { - std::wcerr << "-" << handArgumentname << " switch requires a filename value." << std::endl; + std::cerr << "-" << handArgumentname << " switch requires a filename value." << std::endl; throw "argument error"; } } @@ -619,7 +619,7 @@ int main_alignerTool(int argC, char* argV[]) { if (batchMode) { - std::wcerr << "-batch and -" << autoDictDumpArgumentname << " are incompatible switches." << std::endl; + std::cerr << "-batch and -" << autoDictDumpArgumentname << " are incompatible switches." << std::endl; throw "argument error"; } else @@ -629,7 +629,7 @@ int main_alignerTool(int argC, char* argV[]) if (alignParameters.autoDictionaryDumpFilename.empty()) { - std::wcerr << "-" << autoDictDumpArgumentname << " switch requires a filename value." << std::endl; + std::cerr << "-" << autoDictDumpArgumentname << " switch requires a filename value." << std::endl; throw "argument error"; } } @@ -637,8 +637,8 @@ int main_alignerTool(int argC, char* argV[]) if (!batchMode && (remains.size()!=3) ) { - std::wcerr << "Nonbatch mode requires exactly three file arguments." << std::endl; - std::wcerr << std::endl; + std::cerr << "Nonbatch mode requires exactly three file arguments." << std::endl; + std::cerr << std::endl; main_alignerToolUsage(); throw "argument error"; @@ -650,13 +650,13 @@ int main_alignerTool(int argC, char* argV[]) } catch (...) { - std::wcerr << std::endl; + std::cerr << std::endl; main_alignerToolUsage(); throw "argument error"; } -// std::wcerr << "Reading dictionary..." << std::endl; +// std::cerr << "Reading dictionary..." << std::endl; const char* dicFilename = remains[0] ; DictionaryItems dictionary; std::ifstream dis(dicFilename); @@ -677,7 +677,7 @@ int main_alignerTool(int argC, char* argV[]) if (words.size()!=3) { - std::wcerr << "Batch file has incorrect format." << std::endl; + std::cerr << "Batch file has incorrect format." << std::endl; throw "data error"; } @@ -686,7 +686,7 @@ int main_alignerTool(int argC, char* argV[]) enFilename = words[1]; outFilename = words[2]; -// std::wcerr << "Processing " << outFilename << std::endl; +// std::cerr << "Processing " << outFilename << std::endl; bool failed = false; try { @@ -694,23 +694,23 @@ int main_alignerTool(int argC, char* argV[]) } catch ( const char* errorType ) { - std::wcerr << errorType << std::endl; + std::cerr << errorType << std::endl; failed = true; } catch ( std::exception& e ) { - std::wcerr << "some failed assertion:" << e.what() << std::endl; + std::cerr << "some failed assertion:" << e.what() << std::endl; failed = true; } catch ( ... ) { - std::wcerr << "some unknown failed assertion..." << std::endl; + std::cerr << "some unknown failed assertion..." << std::endl; failed = true; } if (failed) { - std::wcerr << "Align failed for " << outFilename << std::endl; + std::cerr << "Align failed for " << outFilename << std::endl; } } } @@ -725,17 +725,17 @@ int main_alignerTool(int argC, char* argV[]) #ifndef _DEBUG catch ( const char* errorType ) { - std::wcerr << errorType << std::endl; + std::cerr << errorType << std::endl; return -1; } catch ( std::exception& e ) { - std::wcerr << "some failed assertion:" << e.what() << std::endl; + std::cerr << "some failed assertion:" << e.what() << std::endl; return -1; } catch ( ... ) { - std::wcerr << "some unknown failed assertion..." << std::endl; + std::cerr << "some unknown failed assertion..." << std::endl; return -1; } #endif diff --git a/apertium/tmx_alignment.cc b/apertium/tmx_alignment.cc index 8b556fc..4b891b7 100644 --- a/apertium/tmx_alignment.cc +++ b/apertium/tmx_alignment.cc @@ -21,7 +21,7 @@ #include // Copypaste-elve. TODO Elhelyezni. -#define massert(e) if (!(e)) { std::wcerr << #e << " failed" << std::endl; throw "assert"; } +#define massert(e) if (!(e)) { std::cerr << #e << " failed" << std::endl; throw "assert"; } std::ostream& operator<<( std::ostream& os, std::pair p ) { @@ -241,7 +241,7 @@ void trelliToLadder( const TrelliMatrix& trellis, Trail& bestTrail ) bool logging = false; - if (logging) std::wcerr << std::endl; + if (logging) std::cerr << std::endl; bool over = false; bool hopelesslyBadTrail = false; @@ -304,7 +304,7 @@ void trelliToLadder( const TrelliMatrix& trellis, Trail& bestTrail ) if (logging) { - std::wcerr << huPos << " \t" << enPos << std::endl; + std::cerr << huPos << " \t" << enPos << std::endl; } } @@ -314,7 +314,7 @@ void trelliToLadder( const TrelliMatrix& trellis, Trail& bestTrail ) bestTrail.clear(); bestTrail.push_back(std::make_pair(huBookSize,enBookSize)); bestTrail.push_back(std::make_pair(0,0)); - std::wcerr << "Error: hopelessly bad trail." << std::endl; + std::cerr << "Error: hopelessly bad trail." << std::endl; } std::reverse(bestTrail.begin(), bestTrail.end() ); @@ -335,11 +335,11 @@ void align( const AlignMatrix& w, const SentenceValues& huLength, const Sentence buildDynProgMatrix( w, huLength, enLength, v, trellis ); -// std::wcerr << "Matrix built." << std::endl; +// std::cerr << "Matrix built." << std::endl; trelliToLadder( trellis, bestTrail ); -// std::wcerr << "Trail found." << std::endl; +// std::cerr << "Trail found." << std::endl; } @@ -383,10 +383,10 @@ double scoreTrailOrBisentenceList( const Trail& trailAuto, const Trail& trailHan { int score = countIntersectionOfTrails( trailAuto, trailHand ); - std::wcerr << trailAuto.size()-score << " misaligned out of " << trailHand.size() << " correct items, " + std::cerr << trailAuto.size()-score << " misaligned out of " << trailHand.size() << " correct items, " << trailAuto.size() << " bets." << std::endl; - std::wcerr << "Precision: " << 1.0*score/trailAuto.size() + std::cerr << "Precision: " << 1.0*score/trailAuto.size() << ", Recall: " << 1.0*score/trailHand.size() << std::endl; double ratio = 1.0*(trailAuto.size()-score)/trailAuto.size(); @@ -494,7 +494,7 @@ bool borderDetailedAlignMatrix( AlignMatrix& alignMatrix, const Trail& trail, in } } - std::wcerr << numberOfEvaluatedItems << " items inside the border." << std::endl; + std::cerr << numberOfEvaluatedItems << " items inside the border." << std::endl; } return true; diff --git a/apertium/tmx_arguments_parser.cc b/apertium/tmx_arguments_parser.cc index 5498a7d..97fe31c 100644 --- a/apertium/tmx_arguments_parser.cc +++ b/apertium/tmx_arguments_parser.cc @@ -27,7 +27,7 @@ bool Arguments::read( int argc, char **argv ) std::string p = argv[i]; if (p.empty() || p[0]!='-') { - std::wcerr << p << ": unable to parse argument\n"; + std::cerr << p << ": unable to parse argument\n"; throw "argument error"; return false; } @@ -35,7 +35,7 @@ bool Arguments::read( int argc, char **argv ) if (p.empty()) { - std::wcerr << "Empty argument\n"; + std::cerr << "Empty argument\n"; throw "argument error"; return false; } @@ -86,7 +86,7 @@ bool Arguments::read( int argc, char **argv, std::vector& remains ) if (p.empty()) { - std::wcerr << "Empty argument\n"; + std::cerr << "Empty argument\n"; throw "argument error"; return false; } @@ -124,13 +124,13 @@ bool Arguments::getNumericParam( const std::string& name, int& num ) const_iterator it=find(name); if (it==end()) { - // std::wcerr << "Argument -" << name << " missing.\n"; + // std::cerr << "Argument -" << name << " missing.\n"; return false; } if (it->second.kind != AnyData::Int) { - std::wcerr << "Argument -" << name << ": integer expected.\n"; + std::cerr << "Argument -" << name << ": integer expected.\n"; throw "argument error"; } @@ -149,7 +149,7 @@ bool Arguments::getSwitchConst( const ArgName& name, bool& sw ) const } else if (! it->second.dString.empty()) { - std::wcerr << "Argument -" << name << ": value is not allowed.\n"; + std::cerr << "Argument -" << name << ": value is not allowed.\n"; return false; } else @@ -179,7 +179,7 @@ bool Arguments::getSwitchCompact( const ArgName& name ) } else { - std::wcerr << "No value is allowed for argument -" << name << ".\n"; + std::cerr << "No value is allowed for argument -" << name << ".\n"; throw "argument error"; } } @@ -188,16 +188,16 @@ void Arguments::checkEmptyArgs() const { if (!empty()) { - std::wcerr << "Invalid argument: "; + std::cerr << "Invalid argument: "; for ( Arguments::const_iterator it=begin(); it!=end(); ++it ) { - std::wcerr << "-" << it->first; + std::cerr << "-" << it->first; if (!it->second.dString.empty()) - std::wcerr << "=" << it->second.dString; - std::wcerr << " "; + std::cerr << "=" << it->second.dString; + std::cerr << " "; } - std::wcerr << std::endl; + std::cerr << std::endl; throw "argument error"; } diff --git a/apertium/tmx_book_to_matrix.cc b/apertium/tmx_book_to_matrix.cc index fb37b79..c115afb 100644 --- a/apertium/tmx_book_to_matrix.cc +++ b/apertium/tmx_book_to_matrix.cc @@ -185,7 +185,7 @@ void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, con if (!rarelyLogging || (huPos%100==0)) { - // std::wcerr << huPos << " "; + // std::cerr << huPos << " "; } } } @@ -255,7 +255,7 @@ void sentenceListsToAlignMatrixTranslation( if (!rarelyLogging || (huPos%100==0)) { - // std::wcerr << huPos << " (" << numberOfEvaluatedItems << ") "; + // std::cerr << huPos << " (" << numberOfEvaluatedItems << ") "; } } } @@ -301,7 +301,7 @@ void sentenceListsToAlignMatrixIBMModelOne( if (!rarelyLogging || (huPos%100==0)) { - // std::wcerr << huPos << " "; + // std::cerr << huPos << " "; } } } diff --git a/apertium/tmx_builder.cc b/apertium/tmx_builder.cc index 15d84fa..84e8e5a 100644 --- a/apertium/tmx_builder.cc +++ b/apertium/tmx_builder.cc @@ -40,7 +40,7 @@ using namespace Apertium; using namespace std; -TMXBuilder::TMXBuilder(wstring const &l1, wstring const &l2): +TMXBuilder::TMXBuilder(UString const &l1, UString const &l2): low_limit(0) { lang1 = l1; @@ -62,17 +62,17 @@ TMXBuilder::~TMXBuilder() { } -wstring -TMXBuilder::restOfBlank(FILE *input) +UString +TMXBuilder::restOfBlank(InputFile& input) { - wstring result = L"["; + UString result = "["; while(true) { wint_t val = fgetwc(input); if(feof(input)) { - return L""; + return ""; } switch(val) { @@ -81,7 +81,7 @@ TMXBuilder::restOfBlank(FILE *input) val = fgetwc(input); if(feof(input)) { - return L""; + return ""; } result += static_cast(val); break; @@ -96,20 +96,20 @@ TMXBuilder::restOfBlank(FILE *input) } } - return L""; + return ""; } -wstring -TMXBuilder::nextBlank(FILE *input) +UString +TMXBuilder::nextBlank(InputFile& input) { - wstring result = L""; + UString result = ""; while(true) { wint_t val = fgetwc(input); if(feof(input)) { - return L""; + return ""; } switch(val) { @@ -125,9 +125,9 @@ TMXBuilder::nextBlank(FILE *input) } bool -TMXBuilder::compatible(FILE *f1, FILE *f2, bool lazy) +TMXBuilder::compatible(InputFile& f1, InputFile& f2, bool lazy) { - wstring s1 = nextBlank(f1), s2 = nextBlank(f2); + UString s1 = nextBlank(f1), s2 = nextBlank(f2); if(!lazy) { while(!feof(f1) && !feof(f2)) @@ -158,52 +158,45 @@ TMXBuilder::compatible(FILE *f1, FILE *f2, bool lazy) bool TMXBuilder::check(string const &file1, string const &file2, bool lazy) { - FILE *f1 = fopen(file1.c_str(), "rb"); - FILE *f2 = fopen(file2.c_str(), "rb"); - if(!f1 && !f2) + InputFile f1, f2; + bool bf1 = f1.open(file1.c_str()); + bool bf2 = f2.open(file2.c_str()); + if(!bf1 && !bf2) { - wcerr << L"Error: Cannot access files '" << UtfConverter::fromUtf8(file1); - wcerr << L"' and '" << UtfConverter::fromUtf8(file2) << "'" << endl; + cerr << "Error: Cannot access files '" << file1; + cerr << "' and '" << file2 << "'" << endl; return false; } - else if(!f1) + else if(!bf1) { - wcerr << L"Error: Cannot access file '"; - wcerr << UtfConverter::fromUtf8(file2); - wcerr << "'" << endl; - fclose(f2); + cerr << "Error: Cannot access file '" << file1 << "'" << endl; return false; } - else if(!f2) + else if(!bf2) { - wcerr << L"Error: Cannot access file '"; - wcerr << UtfConverter::fromUtf8(file2); - wcerr << "'" << endl; - fclose(f1); + cerr << "Error: Cannot access file '" << file2 << "'" << endl; return false; } bool retval = compatible(f1, f2, lazy); - fclose(f1); - fclose(f2); return retval; } -wstring -TMXBuilder::nextTU(FILE *input) +UString +TMXBuilder::nextTU(InputFile& input) { - wstring current_tu = L""; - wstring tmp; + UString current_tu = ""; + UString tmp; while(true) { wint_t symbol = fgetwc_unlocked(input); if(feof(input)) { - if(current_tu == L"") + if(current_tu == "") { - return L""; + return ""; } else { @@ -216,9 +209,9 @@ TMXBuilder::nextTU(FILE *input) symbol = fgetwc_unlocked(input); if(feof(input)) { - if(current_tu == L"") + if(current_tu == "") { - return L""; + return ""; } else { @@ -232,14 +225,14 @@ TMXBuilder::nextTU(FILE *input) case L'[': tmp = restOfBlank(input); - if(tmp.substr(0,2) == L"[ ") + if(tmp.substr(0,2) == "[ ") { - current_tu.append(L" "); + current_tu.append(" "); } - current_tu.append(L""); - if(tmp.substr(tmp.size()-2, 2) == L" ]") + current_tu.append(""); + if(tmp.substr(tmp.size()-2, 2) == " ]") { - current_tu.append(L" "); + current_tu.append(" "); } break; @@ -281,34 +274,34 @@ TMXBuilder::nextTU(FILE *input) return current_tu; } -wstring -TMXBuilder::xmlize(wstring const &str) +UString +TMXBuilder::xmlize(UString const &str) { - wstring result = L""; + UString result = ""; for(size_t i = 0, limit = str.size(); i < limit; i++) { switch(str[i]) { case L'<': - if(i + 5 <= limit && str.substr(i,5)==L"") + if(i + 5 <= limit && str.substr(i,5)=="") { - result.append(L""); + result.append(""); i += 4; break; } else { - result.append(L"<"); + result.append("<"); } break; case L'>': - result.append(L">"); + result.append(">"); break; case L'&': - result.append(L"&"); + result.append("&"); break; default: @@ -323,7 +316,7 @@ TMXBuilder::xmlize(wstring const &str) while(cambio == true) { cambio = false; - while(result.size() >= 5 && result.substr(0,5) == L"") + while(result.size() >= 5 && result.substr(0,5) == "") { result = result.substr(5); cambio = true; @@ -340,7 +333,7 @@ TMXBuilder::xmlize(wstring const &str) while(cambio == true) { cambio = false; - while(result.size() > 5 && result.substr(result.size()-5) == L"") + while(result.size() > 5 && result.substr(result.size()-5) == "") { result = result.substr(0, result.size()-5); cambio = true; @@ -376,50 +369,40 @@ void TMXBuilder::generate(string const &file1, string const &file2, string const &outfile) { - FILE *output = stdout; + UFILE* output = u_finit(stdout, NULL, NULL); - if(outfile != "") + if(!outfile.empty()) { - output = fopen(outfile.c_str(), "w"); + output = u_fopen(outfile.c_str(), "w", NULL, NULL); if(!output) { - wcerr << L"Error: file '" << UtfConverter::fromUtf8(outfile); - wcerr << L"' cannot be opened for writing" << endl; + cerr << "Error: file '" << outfile; + cerr << "' cannot be opened for writing" << endl; exit(EXIT_FAILURE); } } -#ifdef _MSC_VER - _setmode(_fileno(output), _O_U8TEXT); -#endif - FILE *f1 = fopen(file1.c_str(), "r"); - if(!f1) - { - wcerr << L"Error: file '" << UtfConverter::fromUtf8(file1); - wcerr << L"' cannot be opened for reading" << endl; + InputFile f1; + if (!f1.open(file1.c_str())) { + cerr << "Error: file '" << file1; + cerr << "' cannot be opened for reading" << endl; exit(EXIT_FAILURE); } - FILE *f2 = fopen(file2.c_str(), "r"); - if(!f2) - { - wcerr << L"Error: file '" << UtfConverter::fromUtf8(file2); - wcerr << L"' cannot be opened for reading" << endl; + InputFile f2; + if (!f2.open(file2.c_str())) { + cerr << "Error: file '" << file2; + cerr << "' cannot be opened for reading" << endl; exit(EXIT_FAILURE); } -#ifdef _MSC_VER - _setmode(_fileno(f1), _O_U8TEXT); - _setmode(_fileno(f2), _O_U8TEXT); -#endif - generateTMX(f1, f2, output); } -vector -TMXBuilder::reverseList(vector const &v) +vector +TMXBuilder::reverseList(vector const &v) { - vector retval(v.size()); + vector retval(v.size()); for(int j = v.size() - 1, i = 0; j >=0; j--, i++) { @@ -429,14 +412,14 @@ TMXBuilder::reverseList(vector const &v) return retval; } -vector -TMXBuilder::sentenceList(FILE *file) +vector +TMXBuilder::sentenceList(InputFile& file) { - vector retval; + vector retval; while(true) { - wstring f = nextTU(file); + UString f = nextTU(file); if(feof(file)) { break; @@ -447,10 +430,10 @@ TMXBuilder::sentenceList(FILE *file) return retval; } -vector -TMXBuilder::extractFragment(vector const &text, unsigned int base, unsigned int width) +vector +TMXBuilder::extractFragment(vector const &text, unsigned int base, unsigned int width) { - vector result; + vector result; for(unsigned int i = base; i < (base + width) && i < text.size(); i++) { @@ -485,7 +468,7 @@ TMXBuilder::argmin(int nw, int n, int w) } void -TMXBuilder::generateTMX(FILE *f1, FILE *f2, FILE *output) +TMXBuilder::generateTMX(InputFile& f1, InputFile& f2, UFILE* output) { fprintf(output, "\n"); fprintf(output, "\n"); @@ -512,17 +495,17 @@ TMXBuilder::printTable(int *table, unsigned int nrows, unsigned int ncols) { if(j != 0) { - wcerr << L" "; + cerr << " "; } - wcerr << setw(10) << table[i*ncols + j]; + cerr << setw(10) << table[i*ncols + j]; } - wcerr << endl; + cerr << endl; } } void -TMXBuilder::printTUCond(FILE *output, wstring const &tu1, wstring const &tu2, bool secure_zone) +TMXBuilder::printTUCond(UFILE *output, UString const &tu1, UString const &tu2, bool secure_zone) { if(secure_zone && similar(tu1, tu2)) { @@ -531,20 +514,18 @@ TMXBuilder::printTUCond(FILE *output, wstring const &tu1, wstring const &tu2, bo } void -TMXBuilder::splitAndMove(FILE *f1, string const &filename) +TMXBuilder::splitAndMove(InputFile& f1, string const &filename) { - FILE *stream = fopen(filename.c_str(), "w"); - vector fichero_por_cadenas = sentenceList(f1); - for(size_t i = 0; i < fichero_por_cadenas.size(); i++) - { - fputws_unlocked(fichero_por_cadenas[i].c_str(), stream); - fputws_unlocked(L"\n", stream); + UFILE* stream = u_fopen(file.c_str(), "w", NULL, NULL); + vector fichero_por_cadenas = sentenceList(f1); + for (auto& it : fichero_por_cadenas) { + u_fprintf(stream, "%S\n", it.c_str()); } - fclose(stream); + u_fclose(stream); } void -TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) +TMXBuilder::outputTU(InputFile& f1, InputFile& f2, UFILE* output) { string left = tmpnam(NULL); string right = tmpnam(NULL); @@ -565,29 +546,25 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) TMXAligner::alignerToolWithFilenames(dict, left, right, ap, out); - FILE *stream = fopen(out.c_str(), "r"); + InputFile stream; + stream.open(out.c_str()); int conta = 0; - wstring partes[2]; - while(true) + UString partes[2]; + while(!stream.eof()) { - wchar_t val = fgetwc(stream); - if(feof(stream)) - { - break; - } + UChar32 val = stream.get(); - if(val == L'\t') + if(val == '\t') { conta++; } - else if(val == L'\n') + else if(val == '\n') { - if(partes[0] != L"" && partes[1] != L"") - { + if (!partes[0].empty() && !partes[1].empty()) { printTU(output, partes[0], partes[1]); } - partes[0] = L""; - partes[1] = L""; + partes[0].clear(); + partes[1].clear(); conta = 0; } if(conta < 2) @@ -605,7 +582,7 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) int base_i = 0, base_j = 0; - vector lista1 = reverseList(sentenceList(f1)), + vector lista1 = reverseList(sentenceList(f1)), lista2 = reverseList(sentenceList(f2)), lista3; if(freference != NULL) @@ -615,8 +592,8 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) while(true) { - vector l1 = extractFragment(lista1, base_i, window_size); - vector l2 = extractFragment(lista2, base_j, window_size) , l3; + vector l1 = extractFragment(lista1, base_i, window_size); + vector l2 = extractFragment(lista2, base_j, window_size) , l3; if(lista3.size() != 0) { @@ -696,7 +673,7 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) } } - // wcerr << L"[" << i << L" " << j << L"]" << endl; + // cerr << "[" << i << " " << j << "]" << endl; break; case 3: @@ -755,13 +732,13 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) } int -TMXBuilder::weight(wstring const &s) +TMXBuilder::weight(UString const &s) { return s.size()*2; // just the size of the string } int * -TMXBuilder::levenshteinTable(vector &l1, vector &l2, +TMXBuilder::levenshteinTable(vector &l1, vector &l2, unsigned int diagonal_width, unsigned int max_edit) { unsigned int const nrows = l1.size() + 1; @@ -809,8 +786,8 @@ TMXBuilder::levenshteinTable(vector &l1, vector &l2, return table; } -wstring -TMXBuilder::filter(wstring const &tu) +UString +TMXBuilder::filter(UString const &tu) { bool has_text = false; unsigned int count_blank = 0; @@ -829,19 +806,19 @@ TMXBuilder::filter(wstring const &tu) if(!has_text || count_blank <= 2 || tu.size() == 0) { - return L""; + return ""; } return xmlize(tu); } void -TMXBuilder::printTU(FILE *output, wstring const &tu1, wstring const &tu2) const +TMXBuilder::printTU(UFILE* output, UString const &tu1, UString const &tu2) const { - wstring tu1_filtered = filter(tu1); - wstring tu2_filtered = filter(tu2); + UString tu1_filtered = filter(tu1); + UString tu2_filtered = filter(tu2); - if(tu1_filtered != L"" && tu2_filtered != L"") + if(tu1_filtered != "" && tu2_filtered != "") { fprintf(output, "\n %s\n", @@ -892,7 +869,7 @@ TMXBuilder::min2(int i1, int i2) } int -TMXBuilder::editDistance(wstring const &s1, wstring const &s2, unsigned int max_edit) +TMXBuilder::editDistance(UString const &s1, UString const &s2, unsigned int max_edit) { int const nrows = min2(s1.size() + 1, max_edit); int const ncols = min2(s2.size() + 1, max_edit); @@ -980,7 +957,7 @@ TMXBuilder::isRemovablePunct(wchar_t const &c) } bool -TMXBuilder::similar(wstring const &s1, wstring const &s2) +TMXBuilder::similar(UString const &s1, UString const &s2) { unsigned int l1 = s1.size(); unsigned int l2 = s2.size(); @@ -1012,8 +989,8 @@ TMXBuilder::setTranslation(string const &filename) freference = fopen(filename.c_str(), "r"); if(!freference) { - wcerr << L"Error: file '" << UtfConverter::fromUtf8(filename); - wcerr << L"' cannot be opened for reading" << endl; + cerr << "Error: file '" << UtfConverter::fromUtf8(filename); + cerr << "' cannot be opened for reading" << endl; freference = NULL; } diff --git a/apertium/tmx_builder.h b/apertium/tmx_builder.h index 7e92d40..cc5d432 100644 --- a/apertium/tmx_builder.h +++ b/apertium/tmx_builder.h @@ -26,8 +26,8 @@ using namespace std; class TMXBuilder { private: - wstring lang1; - wstring lang2; + UString lang1; + UString lang2; unsigned int max_edit; unsigned int diagonal_width; unsigned int window_size; @@ -37,35 +37,35 @@ private: unsigned int low_limit; FILE *freference; - static wstring nextTU(FILE *input); - static wstring restOfBlank(FILE *input); - static wstring nextBlank(FILE *input); - static wstring xmlize(wstring const &str); - static bool compatible(FILE *input, FILE *output, bool lazy = false); - void generateTMX(FILE *f1, FILE *f2, FILE *output); - void outputTU(FILE *f1, FILE *f2, FILE *output); - static vector reverseList(vector const &v); - static vector sentenceList(FILE *file); + static UString nextTU(InputFile& input); + static UString restOfBlank(InputFile& input); + static UString nextBlank(InputFile& input); + static UString xmlize(UString const &str); + static bool compatible(InputFile& input, UFILE* output, bool lazy = false); + void generateTMX(InputFile& f1, InputFile& f2, UFILE* output); + void outputTU(InputFile& f1, InputFile& f2, UFILE* output); + static vector reverseList(vector const &v); + static vector sentenceList(InputFile& file); static int argmin(int nw, int n, int w); - static int * levenshteinTable(vector &l1, vector &l2, + static int * levenshteinTable(vector &l1, vector &l2, unsigned int diagonal_width, unsigned int max_edit); - void printTU(FILE *output, wstring const &tu1, wstring const &tu2) const; - static wstring filter(wstring const &s); - static int weight(wstring const &s); + void printTU(UFILE* output, UString const &tu1, UString const &tu2) const; + static UString filter(UString const &s); + static int weight(UString const &s); static void printTable(int *table, unsigned int nrows, unsigned int ncols); - static int editDistance(wstring const &s1, wstring const &s2, unsigned int max_edit); + static int editDistance(UString const &s1, UString const &s2, unsigned int max_edit); static int min3(int i1, int i2, int i3); static int min2(int i1, int i2); - void printTUCond(FILE *output, wstring const &s1, wstring const &s2, bool secure_zone); - static vector extractFragment(vector const &text, unsigned int base, + void printTUCond(UFILE* output, UString const &s1, UString const &s2, bool secure_zone); + static vector extractFragment(vector const &text, unsigned int base, unsigned int width); static bool isRemovablePunct(wchar_t const &c); - bool similar(wstring const &s1, wstring const &s2); + bool similar(UString const &s1, UString const &s2); - void splitAndMove(FILE *file, string const &filename); + void splitAndMove(InputFile& file, string const &filename); public: - TMXBuilder(wstring const &l1, wstring const &l2); + TMXBuilder(UString const &l1, UString const &l2); ~TMXBuilder(); static bool check(string const &file1, string const &file2, bool lazy = false); void generate(string const &file1, string const &file2, diff --git a/apertium/tmx_dic_tree.h b/apertium/tmx_dic_tree.h index 9a0545b..957de53 100644 --- a/apertium/tmx_dic_tree.h +++ b/apertium/tmx_dic_tree.h @@ -106,7 +106,7 @@ DicTree& DicTree::add( const Atom& word, con if ( ( v->id != 0 ) && ( id != 0 ) ) { if (WarnOnConflict) - std::wcerr << "warning: conflict in tree" << std::endl; + std::cerr << "warning: conflict in tree" << std::endl; } if ( id != 0 ) { @@ -165,7 +165,7 @@ void SubsetLookup::add( const Atoms& words, const Identifier& else { if (DicTree::WarnOnConflict) - std::wcerr << "warning: conflict in tree" << std::endl; + std::cerr << "warning: conflict in tree" << std::endl; } } diff --git a/apertium/tmx_dictionary.cc b/apertium/tmx_dictionary.cc index f36c65a..70944ea 100644 --- a/apertium/tmx_dictionary.cc +++ b/apertium/tmx_dictionary.cc @@ -22,7 +22,7 @@ #include -#define massert(e) if (!(e)) { std::wcerr << #e << " failed" << std::endl; throw "assert"; } +#define massert(e) if (!(e)) { std::cerr << #e << " failed" << std::endl; throw "assert"; } namespace TMXAligner { @@ -151,7 +151,7 @@ void readBicorpus( std::istream& is, SentenceList& huSentenceList, SentenceList& split( line, halfs ); if (halfs.size()!=2) { - std::wcerr << "Incorrect bicorpus file: " << halfs.size() << " records in line " << huSentenceList.size() << std::endl; + std::cerr << "Incorrect bicorpus file: " << halfs.size() << " records in line " << huSentenceList.size() << std::endl; throw "data error"; } @@ -565,7 +565,7 @@ void TransLex::build( const DictionaryItems& dictionaryItems ) ++ignored; } } - std::wcerr << added << " items added to TransLex, " << ignored << " multiword items ignored." << std::endl; + std::cerr << added << " items added to TransLex, " << ignored << " multiword items ignored." << std::endl; } TransLex::DictInterval TransLex::lookupLeftWord ( const Word& huWord ) const diff --git a/apertium/tmx_trail_postprocessors.cc b/apertium/tmx_trail_postprocessors.cc index 4bde3ed..d0b7312 100644 --- a/apertium/tmx_trail_postprocessors.cc +++ b/apertium/tmx_trail_postprocessors.cc @@ -280,7 +280,7 @@ void postprocessTrailStart( Trail& bestTrail, { if (global_postprocessLogging) { - std::wcerr << "Thrown away at position " << pos + std::cerr << "Thrown away at position " << pos << ", avarage " << avg << ", threshold " << qualityThreshold << std::endl; } @@ -319,7 +319,7 @@ void postprocessTrailEnd( Trail& bestTrail, { if (global_postprocessLogging) { - std::wcerr << "Thrown away at position " << pos + std::cerr << "Thrown away at position " << pos << ", avarage " << avg << ", threshold " << qualityThreshold << std::endl; } @@ -365,7 +365,7 @@ void postprocessTrail( Trail& bestTrail, const TrailScoresInterval& trailScoresI { if (global_postprocessLogging) { - std::wcerr << "Thrown away at position " << pos + std::cerr << "Thrown away at position " << pos << ", avarage " << avg << ", threshold " << qualityThreshold << std::endl; } @@ -404,7 +404,7 @@ void postprocessTrailByTopology( Trail& bestTrail, double qualityThreshold ) { if (global_postprocessLogging) { - std::wcerr << "Thrown away at position " << pos + std::cerr << "Thrown away at position " << pos << ", avarage " << avg << std::endl; } diff --git a/apertium/tmx_translate.cc b/apertium/tmx_translate.cc index 06db477..9761553 100644 --- a/apertium/tmx_translate.cc +++ b/apertium/tmx_translate.cc @@ -34,7 +34,7 @@ void buildDumbDictionary( const DictionaryItems& dictionary, DumbDictionary& dum if (hu.size()==1) { dumbDictionary[ hu[0] ] = en ; - // std::wcerr << hu[0] << "\t" << en << std::endl; + // std::cerr << hu[0] << "\t" << en << std::endl; } } } @@ -98,7 +98,7 @@ void buildDumbDictionary( TMXAligner::DumbDictionary& dumbDictionary, { std::ifstream is( dictionaryFilename.c_str() ); dictionary.read( is ); - std::wcerr << dictionary.size() << " dictionary items read." << std::endl; + std::cerr << dictionary.size() << " dictionary items read." << std::endl; } if (!enSentenceList.empty()) @@ -267,7 +267,7 @@ void naiveTranslate( { subsetLookup.add( dictionary[i].second, i+1 ); // !!! i+1 } - std::wcerr << "Index tree built." << std::endl; + std::cerr << "Index tree built." << std::endl; } for ( size_t i=0; i= limit) { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": index >= limit" << endl; + cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": index >= limit" << endl; return false; } if(index < 0) { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": index < 0" << endl; + cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": index < 0" << endl; return false; } if(word[index] == 0) { - wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) << L": line " << element->line << L": Null access at word[index]" << endl; + cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": Null access at word[index]" << endl; return false; } return true; @@ -762,7 +762,7 @@ Transfer::evalString(xmlNode *element) } else { - wcerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; + cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; exit(EXIT_FAILURE); } @@ -801,16 +801,13 @@ Transfer::processOut(xmlNode *localroot) out_wblank = word[0]->getWblank(); } - if(myword != "") + if(!myword.empty()) { if(myword[0] != L'[' || myword[1] != L'[') { - fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); - fputwc_unlocked(L'^', output); + u_fprintf(output, "%S^", out_blank.c_str()); } - - fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "%S$", myword.c_str()); } } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) @@ -860,29 +857,23 @@ Transfer::processOut(xmlNode *localroot) out_wblank = word[0]->getWblank(); } - if(myword != "") - { - fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); - fputwc_unlocked('^', output); - fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); - fputwc_unlocked(L'$', output); + if(!myword.empty()) { + u_fprintf(output, "%S^%S$", out_blank.c_str(), myword.c_str()); } } - else // 'b' - { - fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), - output); + else { // 'b' + write(evalString(i), output); } } else { if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) { - fputws_unlocked(UtfConverter::fromUtf8(processChunk(i)).c_str(), output); + write(processChunk(i), output); } else // 'b' { - fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output); + write(evalString(i), output); } } } @@ -927,7 +918,7 @@ Transfer::processChunk(xmlNode *localroot) } else { - wcerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; + cerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; exit(EXIT_FAILURE); } } @@ -943,7 +934,7 @@ Transfer::processChunk(xmlNode *localroot) } else { - wcerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; + cerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; exit(EXIT_FAILURE); } } @@ -1172,7 +1163,7 @@ Transfer::processLet(xmlNode *localroot) bool match = word[ti.getPos()]->setSource(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); if (!match && trace) { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } } return; @@ -1182,7 +1173,7 @@ Transfer::processLet(xmlNode *localroot) bool match = word[ti.getPos()]->setTarget(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); if (!match && trace) { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } } return; @@ -1192,7 +1183,7 @@ Transfer::processLet(xmlNode *localroot) bool match = word[ti.getPos()]->setReference(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); if (!match && trace) { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } } return; @@ -1249,11 +1240,11 @@ Transfer::processLet(xmlNode *localroot) } if (pos >= lword) { - wcerr << L"Error: Transfer::processLet() bad access on pos >= lword" << endl; + cerr << "Error: Transfer::processLet() bad access on pos >= lword" << endl; return; } if (word[pos] == 0) { - wcerr << L"Error: Transfer::processLet() null access on word[pos]" << endl; + cerr << "Error: Transfer::processLet() null access on word[pos]" << endl; return; } @@ -1262,7 +1253,7 @@ Transfer::processLet(xmlNode *localroot) bool match = word[pos]->setTarget(attr_items[(const char *) part], evalString(rightSide), queue); if(!match && trace) { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL, queue); } @@ -1271,7 +1262,7 @@ Transfer::processLet(xmlNode *localroot) bool match = word[pos]->setReference(attr_items[(const char *) part], evalString(rightSide), queue); if(!match && trace) { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } evalStringCache[leftSide] = TransferInstr(ti_clip_ref, (const char *) part, pos, NULL, queue); } @@ -1280,7 +1271,7 @@ Transfer::processLet(xmlNode *localroot) bool match = word[pos]->setSource(attr_items[(const char *) part], evalString(rightSide), queue); if(!match && trace) { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } evalStringCache[leftSide] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue); } @@ -1373,7 +1364,7 @@ Transfer::processModifyCase(xmlNode *localroot) bool match = word[pos]->setSource(attr_items[(const char *) part], result); if(!match && trace) { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) @@ -1383,7 +1374,7 @@ Transfer::processModifyCase(xmlNode *localroot) bool match = word[pos]->setReference(attr_items[(const char *) part], result); if(!match && trace) { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } } else @@ -1393,7 +1384,7 @@ Transfer::processModifyCase(xmlNode *localroot) bool match = word[pos]->setTarget(attr_items[(const char *) part], result); if(!match && trace) { - wcerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; + cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } } } @@ -1436,7 +1427,7 @@ Transfer::processCallMacro(xmlNode *localroot) if(i->type == XML_ELEMENT_NODE) { if (idx >= npar) { - wcerr << L"Error: processCallMacro() number of arguments >= npar at line " << i->line << endl; + cerr << "Error: processCallMacro() number of arguments >= npar at line " << i->line << endl; return; } int pos = atoi((const char *) i->properties->children->content)-1; @@ -1600,7 +1591,7 @@ Transfer::processIn(xmlNode *localroot) if(!xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) { - set &myset = listslow[(const char *) idlist]; + set &myset = listslow[(const char *) idlist]; if(myset.find(tolower(sval)) != myset.end()) { return true; @@ -1612,7 +1603,7 @@ Transfer::processIn(xmlNode *localroot) } } - set &myset = lists[(const char *) idlist]; + set &myset = lists[(const char *) idlist]; if(myset.find(sval) != myset.end()) { return true; @@ -1860,7 +1851,7 @@ Transfer::processBeginsWithList(xmlNode *localroot) xmlChar *idlist = second->properties->children->content; string needle = evalString(first); - set::iterator it, limit; + set::iterator it, limit; if(localroot->properties == NULL || xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) @@ -1909,7 +1900,7 @@ Transfer::processEndsWithList(xmlNode *localroot) xmlChar *idlist = second->properties->children->content; string needle = evalString(first); - set::iterator it, limit; + set::iterator it, limit; if(localroot->properties == NULL || xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) @@ -1976,9 +1967,9 @@ Transfer::processContainsSubstring(xmlNode *localroot) string Transfer::copycase(string const &source_word, string const &target_word) { - wstring result; - wstring const s_word = UtfConverter::fromUtf8(source_word); - wstring const t_word = UtfConverter::fromUtf8(target_word); + UString result; + UString const s_word = UtfConverter::fromUtf8(source_word); + UString const t_word = UtfConverter::fromUtf8(target_word); bool firstupper = iswupper(s_word[0]); bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); @@ -2006,7 +1997,7 @@ Transfer::copycase(string const &source_word, string const &target_word) string Transfer::caseOf(string const &str) { - wstring const s = UtfConverter::fromUtf8(str); + UString const s = UtfConverter::fromUtf8(str); if(s.size() > 1) { @@ -2090,9 +2081,8 @@ Transfer::processRule(xmlNode *localroot) while(!blank_queue.empty()) //flush remaining blanks that are not spaces { - if(blank_queue.front().compare(" ") != 0) - { - fputws_unlocked(UtfConverter::fromUtf8(blank_queue.front()).c_str(), output); + if(blank_queue.front().compare(" ") != 0) { + write(blank_queue.front(), output); } blank_queue.pop(); } @@ -2101,14 +2091,14 @@ Transfer::processRule(xmlNode *localroot) } TransferToken & -Transfer::readToken(FILE *in) +Transfer::readToken(InputFile& in) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wstring content; + UString content; while(true) { int val = fgetwc_unlocked(in); @@ -2119,7 +2109,7 @@ Transfer::readToken(FILE *in) } if(in_wblank) { - content = L"[["; + content = "[["; content+= wchar_t(val); while(true) @@ -2232,7 +2222,7 @@ Transfer::tmp_clear() } void -Transfer::transfer_wrapper_null_flush(FILE *in, FILE *out) +Transfer::transfer_wrapper_null_flush(InputFile& in, UFILE* out) { null_flush = false; internal_null_flush = true; @@ -2241,11 +2231,11 @@ Transfer::transfer_wrapper_null_flush(FILE *in, FILE *out) { tmp_clear(); transfer(in, out); - fputwc_unlocked(L'\0', out); + u_fputc('\0', out); int code = fflush(out); if(code != 0) { - wcerr << L"Could not flush output " << errno << endl; + cerr << "Could not flush output " << errno << endl; } } @@ -2254,7 +2244,7 @@ Transfer::transfer_wrapper_null_flush(FILE *in, FILE *out) } void -Transfer::transfer(FILE *in, FILE *out) +Transfer::transfer(InputFile& in, UFILE* out) { if(getNullFlush()) { @@ -2274,31 +2264,31 @@ Transfer::transfer(FILE *in, FILE *out) { if(trace_att) { - wcerr << "Loop start " << endl; - wcerr << "ms.size: " << ms.size() << endl; + cerr << "Loop start " << endl; + cerr << "ms.size: " << ms.size() << endl; - wcerr << "tmpword.size(): " << tmpword.size() << endl; + cerr << "tmpword.size(): " << tmpword.size() << endl; for (unsigned int ind = 0; ind < tmpword.size(); ind++) { if(ind != 0) { - wcerr << L" "; + cerr << " "; } - wcerr << *tmpword[ind]; + cerr << *tmpword[ind]; } - wcerr << endl; + cerr << endl; - wcerr << "tmpblank.size(): " << tmpblank.size() << endl; + cerr << "tmpblank.size(): " << tmpblank.size() << endl; for (unsigned int ind = 0; ind < tmpblank.size(); ind++) { - wcerr << L"'"; - wcerr << *tmpblank[ind]; - wcerr << L"' "; + cerr << "'"; + cerr << *tmpblank[ind]; + cerr << "' "; } - wcerr << endl; + cerr << endl; - wcerr << "last: " << last << endl; - wcerr << "prev_last: " << prev_last << endl << endl; + cerr << "last: " << last << endl; + cerr << "prev_last: " << prev_last << endl << endl; } if(ms.size() == 0) @@ -2309,7 +2299,7 @@ Transfer::transfer(FILE *in, FILE *out) if(trace_att) { - wcerr << "num_words_to_consume: " << num_words_to_consume << endl; + cerr << "num_words_to_consume: " << num_words_to_consume << endl; } //Consume all the words from the input which matched the rule. @@ -2356,11 +2346,11 @@ Transfer::transfer(FILE *in, FILE *out) { if(trace_att) { - wcerr << "printing tmpword[0]" < tr; - wstring tr_wblank; + pair tr; + UString tr_wblank; if(useBilingual && preBilingual == false) { if(isExtended && (*tmpword[0])[0] == L'*') @@ -2372,7 +2362,7 @@ Transfer::transfer(FILE *in, FILE *out) } else { - tr.first = L"%" + tr.first; + tr.first = "%" + tr.first; } } else @@ -2382,13 +2372,13 @@ Transfer::transfer(FILE *in, FILE *out) } else if(preBilingual) { - wstring sl; - wstring tl; - wstring ref; - wstring wblank; + UString sl; + UString tl; + UString ref; + UString wblank; int seenSlash = 0; - for(wstring::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++) + for(UString::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++) { if(*it == L'\\') { @@ -2474,44 +2464,29 @@ Transfer::transfer(FILE *in, FILE *out) } } //tmpword[0]->assign(sl); - tr = pair(tl, false); + tr = pair(tl, false); tr_wblank = wblank; - //wcerr << L"pb: " << *tmpword[0] << L" :: " << sl << L" >> " << tl << endl ; + //cerr << "pb: " << *tmpword[0] << " :: " << sl << " >> " << tl << endl ; } else { - tr = pair(*tmpword[0], 0); + tr = pair(*tmpword[0], 0); } - if(tr.first.size() != 0) - { - if(defaultAttrs == lu) - { - if(tr.first[0] != L'[' || tr.first[1] != L'[') - { - fputws_unlocked(tr_wblank.c_str(), output); - fputwc_unlocked(L'^', output); - } - fputws_unlocked(tr.first.c_str(), output); - fputwc_unlocked(L'$', output); - } - else - { - if(tr.first[0] == '*') - { - fputws_unlocked(L"^unknown{", output); - fputws_unlocked(tr_wblank.c_str(), output); - fputwc_unlocked(L'^', output); - } - else - { - fputws_unlocked(L"^default{", output); - fputws_unlocked(tr_wblank.c_str(), output); - fputwc_unlocked(L'^', output); + if(tr.first.size() != 0) { + if(defaultAttrs == lu) { + if(tr.first[0] != '[' || tr.first[1] != '[') { + u_fprintf(output, "%S^", tr_wblank.c_str()); + } + u_fprintf(output, "%S$", tr.first.c_str()); + } else { + if(tr.first[0] == '*') { + u_fprintf(output, "^unknown{%S^", tr_wblank.c_str()); + } else { + u_fprintf(output, "^default{%S^", tr_wblank.c_str()); + } + u_fprintf(output, "%S$}$", tr.first.c_str()); } - fputws_unlocked(tr.first.c_str(), output); - fputws_unlocked(L"$}$", output); - } } banned_rules.clear(); tmpword.clear(); @@ -2523,11 +2498,10 @@ Transfer::transfer(FILE *in, FILE *out) } else if(tmpblank.size() != 0) { - if(trace_att) - { - wcerr << "printing tmpblank[0]" <c_str(), output); + write(*tmpblank[0], output); tmpblank.clear(); prev_last = last; last = input_buffer.getPos(); @@ -2544,18 +2518,12 @@ Transfer::transfer(FILE *in, FILE *out) last = input_buffer.getPos(); last_lword = tmpword.size(); - if(trace) - { - wcerr << endl << L"apertium-transfer: Rule " << val << L" line " << lastrule_line << L" "; - for (unsigned int ind = 0; ind < tmpword.size(); ind++) - { - if (ind != 0) - { - wcerr << L" "; - } - fputws_unlocked(tmpword[ind]->c_str(), stderr); + if(trace) { + cerr << endl << "apertium-transfer: Rule " << val << " line " << lastrule_line; + for (auto& it : tmpword) { + cerr << " " << *it; } - wcerr << endl; + cerr << endl; } } @@ -2581,13 +2549,13 @@ Transfer::transfer(FILE *in, FILE *out) } else { - fputws_unlocked(current.getContent().c_str(), output); + write(current.getContent(), output); return; } break; default: - wcerr << "Error: Unknown input token." << endl; + cerr << "Error: Unknown input token." << endl; return; } } @@ -2598,7 +2566,7 @@ Transfer::applyRule() { int words_to_consume; unsigned int limit = tmpword.size(); - //wcerr << L"applyRule: " << tmpword.size() << endl; + //cerr << "applyRule: " << tmpword.size() << endl; for(unsigned int i = 0; i != limit; i++) { @@ -2617,11 +2585,11 @@ Transfer::applyRule() } } - pair tr; + pair tr; if(useBilingual && preBilingual == false) { tr = fstp.biltransWithQueue(*tmpword[i], false); - wstring refx,wblankx; + UString refx,wblankx; word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), UtfConverter::toUtf8(tr.first), UtfConverter::toUtf8(refx), @@ -2630,13 +2598,13 @@ Transfer::applyRule() } else if(preBilingual) { - wstring sl; - wstring tl; - wstring ref; - wstring wblank; + UString sl; + UString tl; + UString ref; + UString wblank; int seenSlash = 0; - for(wstring::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++) + for(UString::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++) { if(*it == L'\\') { @@ -2722,7 +2690,7 @@ Transfer::applyRule() ref.push_back(*it); } } - tr = pair(tl, false); + tr = pair(tl, false); word[i] = new TransferWord(UtfConverter::toUtf8(sl), UtfConverter::toUtf8(tr.first), UtfConverter::toUtf8(ref), @@ -2731,8 +2699,8 @@ Transfer::applyRule() } else // neither useBilingual nor preBilingual (sl==tl) { - tr = pair(*tmpword[i], false); - wstring refx,wblankx; + tr = pair(*tmpword[i], false); + UString refx,wblankx; word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), UtfConverter::toUtf8(tr.first), UtfConverter::toUtf8(refx), @@ -2762,7 +2730,7 @@ Transfer::applyRule() /* HERE */ void -Transfer::applyWord(wstring const &word_str) +Transfer::applyWord(UString const &word_str) { ms.step(L'^'); diff --git a/apertium/transfer.h b/apertium/transfer.h index 80ff1bc..c6f9edd 100644 --- a/apertium/transfer.h +++ b/apertium/transfer.h @@ -45,11 +45,11 @@ private: Alphabet alphabet; MatchExe *me; MatchState ms; - map attr_items; - map variables; - map macros; - map, Ltstr> lists; - map, Ltstr> listslow; + map attr_items; + map variables; + map macros; + map> lists; + map> listslow; vector macro_map; vector rule_map; vector rule_lines; @@ -60,8 +60,8 @@ private: int lword; int last_lword; Buffer input_buffer; - vector tmpword; - vector tmpblank; + vector tmpword; + vector tmpblank; bool in_out; bool in_lu; @@ -78,7 +78,7 @@ private: FSTProcessor fstp; FSTProcessor extended; bool isExtended; - FILE *output; + UFILE *output; int any_char; int any_tag; @@ -136,14 +136,14 @@ private: bool endsWith(string const &str1, string const &str2) const; string tolower(string const &str) const; string tags(string const &str) const; - wstring readWord(FILE *in); - wstring readBlank(FILE *in); - wstring readUntil(FILE *in, int const symbol) const; - void applyWord(wstring const &word_str); + UString readWord(InputFile& in); + UString readBlank(InputFile& in); + UString readUntil(InputFile& in, int const symbol) const; + void applyWord(UString const &word_str); int applyRule(); - TransferToken & readToken(FILE *in); + TransferToken & readToken(InputFile& in); bool checkIndex(xmlNode *element, int index, int limit); - void transfer_wrapper_null_flush(FILE *in, FILE *out); + void transfer_wrapper_null_flush(InputFile& in, UFILE* out); void tmp_clear(); public: Transfer(); @@ -151,7 +151,7 @@ public: void read(string const &transferfile, string const &datafile, string const &fstfile = ""); - void transfer(FILE *in, FILE *out); + void transfer(InputFile& in, UFILE* out); void setUseBilingual(bool value); bool getUseBilingual(void) const; void setPreBilingual(bool value); diff --git a/apertium/transfer_data.cc b/apertium/transfer_data.cc index fc08552..2de9701 100644 --- a/apertium/transfer_data.cc +++ b/apertium/transfer_data.cc @@ -46,14 +46,14 @@ TransferData::destroy() TransferData::TransferData() { // adding fixed attr_items - attr_items[L"lem"] = L"^(([^<]|\"\\<\")+)"; - attr_items[L"lemq"] = L"\\#[- _][^<]+"; - attr_items[L"lemh"] = L"^(([^<#]|\"\\<\"|\"\\#\")+)"; - attr_items[L"whole"] = L"(.+)"; - attr_items[L"tags"] = L"((<[^>]+>)+)"; - attr_items[L"chname"] = L"({([^/]+)\\/)"; // includes delimiters { and / !!! - attr_items[L"chcontent"] = L"(\\{.+)"; - attr_items[L"content"] = L"(\\{.+)"; + attr_items["lem"] = "^(([^<]|\"\\<\")+)"; + attr_items["lemq"] = "\\#[- _][^<]+"; + attr_items["lemh"] = "^(([^<#]|\"\\<\"|\"\\#\")+)"; + attr_items["whole"] = "(.+)"; + attr_items["tags"] = "((<[^>]+>)+)"; + attr_items["chname"] = "({([^/]+)\\/)"; // includes delimiters { and / !!! + attr_items["chcontent"] = "(\\{.+)"; + attr_items["content"] = "(\\{.+)"; } TransferData::~TransferData() @@ -89,25 +89,25 @@ TransferData::getTransducer() return transducer; } -map & +map & TransferData::getAttrItems() { return attr_items; } -map & +map & TransferData::getMacros() { return macros; } -map, Ltstr> & +map> & TransferData::getLists() { return lists; } -map & +map & TransferData::getVariables() { return variables; @@ -115,7 +115,7 @@ TransferData::getVariables() int TransferData::countToFinalSymbol(const int count) { - const wstring count_sym = L""; + const UString count_sym = ""; alphabet.includeSymbol(count_sym); const int symbol = alphabet(count_sym); final_symbols.insert(symbol); @@ -134,7 +134,7 @@ TransferData::write(FILE *output) // Find all arcs with "final_symbols" in the transitions, let their source node instead be final, // and extract the rule number from the arc. Record relation between source node and rule number // in finals_rules. It is now no longer safe to minimize -- but we already did that. - const wstring rule_sym_pre = L" > >::const_iterator it = transitions.begin(), limit = transitions.end(); it != limit; ++it) { @@ -152,7 +152,7 @@ TransferData::write(FILE *output) continue; } // Extract the rule number encoded by countToFinalSymbol(): - wstring s; + UString s; alphabet.getSymbol(s, symbol); if(s.compare(0, rule_sym_pre.size(), rule_sym_pre) != 0) { continue; @@ -188,34 +188,34 @@ TransferData::write(FILE *output) // variables Compression::multibyte_write(variables.size(), output); - for(map::const_iterator it = variables.begin(), limit = variables.end(); + for(map::const_iterator it = variables.begin(), limit = variables.end(); it != limit; it++) { - Compression::wstring_write(it->first, output); - Compression::wstring_write(it->second, output); + Compression::string_write(it->first, output); + Compression::string_write(it->second, output); } // macros Compression::multibyte_write(macros.size(), output); - for(map::const_iterator it = macros.begin(), limit = macros.end(); + for(map::const_iterator it = macros.begin(), limit = macros.end(); it != limit; it++) { - Compression::wstring_write(it->first, output); + Compression::string_write(it->first, output); Compression::multibyte_write(it->second, output); } // lists Compression::multibyte_write(lists.size(), output); - for(map, Ltstr>::const_iterator it = lists.begin(), limit = lists.end(); + for(map>::const_iterator it = lists.begin(), limit = lists.end(); it != limit; it++) { - Compression::wstring_write(it->first, output); + Compression::string_write(it->first, output); Compression::multibyte_write(it->second.size(), output); - for(set::const_iterator it2 = it->second.begin(), limit2 = it->second.end(); + for(set::const_iterator it2 = it->second.begin(), limit2 = it->second.end(); it2 != limit2; it2++) { - Compression::wstring_write(*it2, output); + Compression::string_write(*it2, output); } } @@ -227,13 +227,11 @@ TransferData::writeRegexps(FILE *output) Compression::string_write(pcre_version_endian(), output); Compression::multibyte_write(attr_items.size(), output); - map::iterator it, limit; - for(it = attr_items.begin(), limit = attr_items.end(); it != limit; it++) - { - Compression::wstring_write(it->first, output); + for (auto& it : attr_items) { + Compression::string_write(it.first, output); ApertiumRE my_re; - my_re.compile(UtfConverter::toUtf8(it->second)); + my_re.compile(UtfConverter::toUtf8(it.second)); my_re.write(output); - Compression::wstring_write(it->second, output); + Compression::string_write(it.second, output); } } diff --git a/apertium/transfer_data.h b/apertium/transfer_data.h index 49b5755..97253e9 100644 --- a/apertium/transfer_data.h +++ b/apertium/transfer_data.h @@ -32,10 +32,10 @@ private: void copy(TransferData const &o); void destroy(); - map attr_items; - map macros; - map, Ltstr> lists; - map variables; + map attr_items; + map macros; + map> lists; + map variables; set final_symbols; Alphabet alphabet; @@ -50,13 +50,13 @@ private: Alphabet & getAlphabet(); Transducer & getTransducer(); - map & getAttrItems(); + map & getAttrItems(); map seen_rules; - map & getMacros(); - map, Ltstr> & getLists(); - map & getVariables(); + map & getMacros(); + map> & getLists(); + map & getVariables(); /** * Encode the rule count in an arc label/symbol (later extracted by diff --git a/apertium/transfer_instr.cc b/apertium/transfer_instr.cc index 7efee52..40d557b 100644 --- a/apertium/transfer_instr.cc +++ b/apertium/transfer_instr.cc @@ -33,7 +33,7 @@ TransferInstr::destroy() { } -TransferInstr::TransferInstr(TransferInstrType t, string const &c, +TransferInstr::TransferInstr(TransferInstrType t, UString const &c, int const p, void *ptr, bool cond) { type = t; @@ -70,7 +70,7 @@ TransferInstr::getType() return type; } -string const & +UString const & TransferInstr::getContent() { return content; diff --git a/apertium/transfer_instr.h b/apertium/transfer_instr.h index 977b112..92ab858 100644 --- a/apertium/transfer_instr.h +++ b/apertium/transfer_instr.h @@ -17,7 +17,8 @@ #ifndef _TRANSFERINSTR_ #define _TRANSFERINSTR_ -#include +#include +#include using namespace std; @@ -44,7 +45,7 @@ class TransferInstr { private: TransferInstrType type; - string content; + UString content; int pos; void *pointer; bool condition; @@ -58,7 +59,7 @@ public: pointer(0), condition(false) {} - TransferInstr(TransferInstrType t, string const &c, int const p, + TransferInstr(TransferInstrType t, UString const &c, int const p, void *ptr=NULL, bool cond = true); ~TransferInstr(); TransferInstr(TransferInstr const &o); @@ -66,7 +67,7 @@ public: TransferInstrType getType(); - string const & getContent(); + UString const & getContent(); int getPos(); void * getPointer(); bool getCondition(); diff --git a/apertium/transfer_mult.cc b/apertium/transfer_mult.cc index 6491c53..1938a6f 100644 --- a/apertium/transfer_mult.cc +++ b/apertium/transfer_mult.cc @@ -97,9 +97,9 @@ TransferMult::readData(FILE *in) bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); attr_items[cad_k].read(in); - wstring fallback = Compression::wstring_read(in); + UString fallback = Compression::string_read(in); if(recompile_attrs) { attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); } @@ -108,25 +108,25 @@ TransferMult::readData(FILE *in) // variables for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); - variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); + string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); + variables[cad_k] = UtfConverter::toUtf8(Compression::string_read(in)); } // macros for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); macros[cad_k] = Compression::multibyte_read(in); } // lists for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) { - wstring const cad_v = Compression::wstring_read(in); + UString const cad_v = Compression::string_read(in); lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); } @@ -139,7 +139,7 @@ TransferMult::readBil(string const &fstfile) FILE *in = fopen(fstfile.c_str(), "r"); if(!in) { - wcerr << "Error: Could not open file '" << fstfile << "'." << endl; + cerr << "Error: Could not open file '" << fstfile << "'." << endl; exit(EXIT_FAILURE); } fstp.load(in); @@ -154,7 +154,7 @@ TransferMult::read(string const &datafile, string const &fstfile) FILE *in = fopen(datafile.c_str(), "r"); if(!in) { - wcerr << "Error: Could not open file '" << datafile << "'." << endl; + cerr << "Error: Could not open file '" << datafile << "'." << endl; exit(EXIT_FAILURE); } readData(in); @@ -164,14 +164,14 @@ TransferMult::read(string const &datafile, string const &fstfile) } TransferToken & -TransferMult::readToken(FILE *in) +TransferMult::readToken(InputFile& in) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wstring content = L""; + UString content = ""; while(true) { int val = fgetwc_unlocked(in); @@ -222,7 +222,7 @@ TransferMult::readToken(FILE *in) } void -TransferMult::transfer(FILE *in, FILE *out) +TransferMult::transfer(InputFile& in, UFILE* out) { int last = 0; @@ -243,28 +243,25 @@ TransferMult::transfer(FILE *in, FILE *out) { if(tmpword.size() != 0) { - pair tr = fstp.biltransWithQueue(*tmpword[0], false); + pair tr = fstp.biltransWithQueue(*tmpword[0], false); if(tr.first.size() != 0) { - vector multiword = acceptions(tr.first); - if(multiword.size() > 1) - { - fputws_unlocked(L"[{]", output); + vector multiword = acceptions(tr.first); + if(multiword.size() > 1) { + write("[{]"_u, output); } for(unsigned int i = 0, limit = multiword.size(); i != limit; i++) { if(i > 0) { - fputws_unlocked(L"[|]", output); + write("[|]"_u, output); } - fputwc_unlocked(L'^', output); - fputws_unlocked(multiword[i].c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "^%S$", multiwords[i].c_str()); } if(multiword.size() > 1) { - fputws_unlocked(L".[][}]", output); - } + write(".[][}]"_u, output); + } } tmpword.clear(); isRule = false; @@ -275,7 +272,7 @@ TransferMult::transfer(FILE *in, FILE *out) } else if(tmpblank.size() != 0) { - fputws_unlocked(tmpblank[0]->c_str(), output); + write(*tmpblank[0], output); tmpblank.clear(); last = input_buffer.getPos(); ms.init(me->getInitial()); @@ -312,28 +309,28 @@ TransferMult::transfer(FILE *in, FILE *out) } else { - fputws_unlocked(current.getContent().c_str(), output); + write(current.getContent(), output); return; } break; default: - wcerr << L"Error: Unknown input token." << endl; + cerr << "Error: Unknown input token." << endl; return; } } } bool -TransferMult::isDefaultWord(wstring const &str) +TransferMult::isDefaultWord(UString const &str) { - return str.find(L" D<"); + return str.find(" D<"); } -vector -TransferMult::acceptions(wstring str) +vector +TransferMult::acceptions(UString str) { - vector result; + vector result; int low = 0; // removing '@' @@ -350,7 +347,7 @@ TransferMult::acceptions(wstring str) } else if(str[i] == L'/') { - wstring new_word = str.substr(low, i-low); + UString new_word = str.substr(low, i-low); if(result.size() > 1 && isDefaultWord(new_word)) { @@ -365,7 +362,7 @@ TransferMult::acceptions(wstring str) } } - wstring otherword = str.substr(low); + UString otherword = str.substr(low); if(result.size() > 0 && isDefaultWord(otherword)) { result.push_back(result[0]); @@ -379,10 +376,10 @@ TransferMult::acceptions(wstring str) // eliminar las acepciones sin sentido marcado if(result.size() >= 2) { - vector result2; + vector result2; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { - if(result[i].find(L"__") != wstring::npos) + if(result[i].find("__") != UString::npos) { result2.push_back(result[i]); } @@ -397,22 +394,22 @@ TransferMult::acceptions(wstring str) } void -TransferMult::writeMultiple(list >::iterator itwords, - list::iterator itblanks, - list >::const_iterator limitwords, - wstring acum , bool multiple) +TransferMult::writeMultiple(list >::iterator itwords, + list::iterator itblanks, + list >::const_iterator limitwords, + UString acum , bool multiple) { if(itwords == limitwords) { if(multiple) { - output_string.append(L"[|]"); + output_string.append("[|]"); } output_string.append(acum); } else { - vector &refword = *itwords; + vector &refword = *itwords; itwords++; @@ -421,18 +418,18 @@ TransferMult::writeMultiple(list >::iterator itwords, for(unsigned int i = 0, limit = refword.size(); i != limit; i++) { writeMultiple(itwords, itblanks, limitwords, - acum + L"^" + refword[i] + L"$", multiple || (i > 0)); + acum + "^" + refword[i] + "$", multiple || (i > 0)); } } else { - wstring &refblank = *itblanks; + UString &refblank = *itblanks; itblanks++; for(unsigned int i = 0, limit = refword.size(); i != limit; i++) { writeMultiple(itwords, itblanks, limitwords, - acum + L"^" + refword[i] + L"$" + refblank, + acum + "^" + refword[i] + "$" + refblank, multiple || (i > 0)); } } @@ -442,31 +439,26 @@ TransferMult::writeMultiple(list >::iterator itwords, void TransferMult::applyRule() { - list blanks; - list > words; + list blanks; + list > words; - pair tr = fstp.biltransWithQueue(*tmpword[0], false); + pair tr = fstp.biltransWithQueue(*tmpword[0], false); words.push_back(acceptions(tr.first)); for(unsigned int i = 1; i != numwords; i++) { blanks.push_back(*tmpblank[i-1]); - pair tr = fstp.biltransWithQueue(*tmpword[i], false); + pair tr = fstp.biltransWithQueue(*tmpword[i], false); words.push_back(acceptions(tr.first)); } - output_string = L""; + output_string = ""; writeMultiple(words.begin(), blanks.begin(), words.end()); - if(output_string.find(L"[|]") != wstring::npos) - { - fputws_unlocked(L"[{]", output); - fputws_unlocked(output_string.c_str(), output); - fputws_unlocked(L".[][}]", output); - } - else - { - fputws_unlocked(output_string.c_str(), output); + if(output_string.find("[|]"_u) != UString::npos) { + u_fprintf(output, "[{]%S.[][}]", output_string.c_str()); + } else { + write(output_string, output); } ms.init(me->getInitial()); @@ -477,7 +469,7 @@ TransferMult::applyRule() } void -TransferMult::applyWord(wstring const &word_str) +TransferMult::applyWord(UString const &word_str) { ms.step(L'^'); for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) diff --git a/apertium/transfer_mult.h b/apertium/transfer_mult.h index c6c8920..c3ec5be 100644 --- a/apertium/transfer_mult.h +++ b/apertium/transfer_mult.h @@ -40,20 +40,20 @@ private: Alphabet alphabet; MatchExe *me; MatchState ms; - map attr_items; - map variables; - map macros; - map, Ltstr> lists; - map, Ltstr> listslow; + map attr_items; + map variables; + map macros; + map> lists; + map> listslow; TransferWord **word; string **blank; Buffer input_buffer; - vector tmpword; - vector tmpblank; - wstring output_string; + vector tmpword; + vector tmpblank; + UString output_string; FSTProcessor fstp; - FILE *output; + UFILE* output; int any_char; int any_tag; bool isRule; @@ -66,7 +66,7 @@ private: OutputType defaultAttrs; void destroy(); - void readData(FILE *input); + void readData(InputFile& input); void readBil(string const &filename); string caseOf(string const &str); string copycase(string const &source_word, string const &target_word); @@ -75,24 +75,24 @@ private: bool endsWith(string const &str1, string const &str2) const; string tolower(string const &str) const; string tags(string const &str) const; - wstring readWord(FILE *in); - wstring readBlank(FILE *in); - wstring readUntil(FILE *in, int const symbol) const; - void applyWord(wstring const &word_str); + UString readWord(InputFile& in); + UString readBlank(InputFile& in); + UString readUntil(InputFile& in, int const symbol) const; + void applyWord(UString const &word_str); void applyRule(); - TransferToken & readToken(FILE *in); - void writeMultiple(list >::iterator itwords, - list::iterator itblanks, - list >::const_iterator limitwords, - wstring acum = L"", bool multiple = false); - vector acceptions(wstring str); - bool isDefaultWord(wstring const &str); + TransferToken & readToken(InputFile& in); + void writeMultiple(list >::iterator itwords, + list::iterator itblanks, + list >::const_iterator limitwords, + UString acum = "", bool multiple = false); + vector acceptions(UString str); + bool isDefaultWord(UString const &str); public: TransferMult(); ~TransferMult(); void read(string const &datafile, string const &fstfile); - void transfer(FILE *in, FILE *out); + void transfer(InputFile& in, UFILE* out); }; #endif diff --git a/apertium/transfer_token.cc b/apertium/transfer_token.cc index d5b4858..b3c0be1 100644 --- a/apertium/transfer_token.cc +++ b/apertium/transfer_token.cc @@ -36,7 +36,7 @@ type(tt_eof) { } -TransferToken::TransferToken(wstring const &content, +TransferToken::TransferToken(UString const &content, TransferTokenType type) { this->content = content; @@ -70,7 +70,7 @@ TransferToken::getType() return type; } -wstring & +UString & TransferToken::getContent() { return content; @@ -83,7 +83,7 @@ TransferToken::setType(TransferTokenType type) } void -TransferToken::setContent(wstring const &content) +TransferToken::setContent(UString const &content) { this->content = content; } diff --git a/apertium/transfer_token.h b/apertium/transfer_token.h index 039e7d6..a0ca3fc 100644 --- a/apertium/transfer_token.h +++ b/apertium/transfer_token.h @@ -18,6 +18,7 @@ #define _TRANSFERTOKEN_ #include +#include using namespace std; @@ -33,20 +34,20 @@ class TransferToken { private: TransferTokenType type; - wstring content; + UString content; void copy(TransferToken const &o); void destroy(); public: TransferToken(); - TransferToken(wstring const &content, TransferTokenType type); + TransferToken(UString const &content, TransferTokenType type); ~TransferToken(); TransferToken(TransferToken const &o); TransferToken & operator =(TransferToken const &o); TransferTokenType getType(); - wstring & getContent(); + UString & getContent(); void setType(TransferTokenType type); - void setContent(wstring const &content); + void setContent(UString const &content); }; #endif diff --git a/apertium/transferpp.cc b/apertium/transferpp.cc index 62cf712..a947959 100644 --- a/apertium/transferpp.cc +++ b/apertium/transferpp.cc @@ -30,7 +30,7 @@ int main(int argc, char *argv[]) if(argc != 3) { - wcerr << "USAGE: " << basename(argv[0]) << " rules_file transfer_file" << endl; + cerr << "USAGE: " << basename(argv[0]) << " rules_file transfer_file" << endl; exit(EXIT_FAILURE); } diff --git a/apertium/trx_reader.cc b/apertium/trx_reader.cc index 8cc0e2d..be0d4ac 100644 --- a/apertium/trx_reader.cc +++ b/apertium/trx_reader.cc @@ -23,11 +23,11 @@ #include using namespace Apertium; -wstring const -TRXReader::ANY_TAG = L""; +UString const +TRXReader::ANY_TAG = ""; -wstring const -TRXReader::ANY_CHAR = L""; +UString const +TRXReader::ANY_CHAR = ""; TRXReader::TRXReader() { @@ -36,11 +36,11 @@ TRXReader::TRXReader() } int -TRXReader::insertLemma(int const base, wstring const &lemma) +TRXReader::insertLemma(int const base, UString const &lemma) { int retval = base; static int const any_char = td.getAlphabet()(ANY_CHAR); - if(lemma == L"") + if(lemma == "") { retval = td.getTransducer().insertSingleTransduction(any_char, retval); td.getTransducer().linkStates(retval, retval, any_char); @@ -75,7 +75,7 @@ TRXReader::insertLemma(int const base, wstring const &lemma) } int -TRXReader::insertTags(int const base, wstring const &tags) +TRXReader::insertTags(int const base, UString const &tags) { int retval = base; static int const any_tag = td.getAlphabet()(ANY_TAG); @@ -91,7 +91,7 @@ TRXReader::insertTags(int const base, wstring const &tags) } else { - wstring symbol = L"<"; + UString symbol = "<"; for(unsigned int j = i; j != limit; j++) { if(tags[j] == L'.') @@ -102,7 +102,7 @@ TRXReader::insertTags(int const base, wstring const &tags) } } - if(symbol == L"<") + if(symbol == "<") { symbol.append(tags.substr(i)); i = limit; @@ -126,56 +126,56 @@ TRXReader::parse() { procDefCats(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } - if(name == L"section-def-attrs") + if(name == "section-def-attrs") { procDefAttrs(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } } - if(name == L"section-def-vars") + if(name == "section-def-vars") { procDefVars(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } } - if(name == L"section-def-lists") + if(name == "section-def-lists") { procDefLists(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } } - if(name == L"section-def-macros") + if(name == "section-def-macros") { procDefMacros(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } } - if(name == L"section-rules") + if(name == "section-rules") { procRules(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } @@ -189,17 +189,17 @@ TRXReader::procRules() set alive_states; while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-rules") + name != "section-rules") { step(); - if(name == L"rule") + if(name == "rule") { if(type != XML_READER_TYPE_END_ELEMENT) { count++; } } - else if(name == L"pattern") + else if(name == "pattern") { if(type != XML_READER_TYPE_END_ELEMENT) { @@ -220,27 +220,27 @@ TRXReader::procRules() } else { - wcerr << L"Warning (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): " - << L"Paths to rule " << count << " blocked by rule " << td.seen_rules[*it] - << L"." << endl; + cerr << "Warning (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): " + << "Paths to rule " << count << " blocked by rule " << td.seen_rules[*it] + << "." << endl; } } } } - else if(name == L"pattern-item") + else if(name == "pattern-item") { if(type != XML_READER_TYPE_END_ELEMENT) { - pair::iterator, - multimap::iterator> range; + pair::iterator, + multimap::iterator> range; - range = cat_items.equal_range(attrib(L"n")); + range = cat_items.equal_range(attrib("n")); if(range.first == range.second) { - parseError(L"Undefined cat-item '" + attrib(L"n")); + parseError("Undefined cat-item '" + attrib("n")); } // new code @@ -277,21 +277,21 @@ TRXReader::procRules() alive_states = alive_states_new; } } - else if(name == L"let") + else if(name == "let") { int count = 0; int lineno = xmlTextReaderGetParserLineNumber(reader); - while(name != L"let" || type != XML_READER_TYPE_END_ELEMENT) + while(name != "let" || type != XML_READER_TYPE_END_ELEMENT) { step(); if(type == XML_ELEMENT_NODE) { count++; - if(name == L"clip" && attrib(L"side") == L"sl") + if(name == "clip" && attrib("side") == "sl") { - wcerr << L"Warning (" << lineno; - wcerr << L"): assignment to 'sl' side has no effect." << endl; + cerr << "Warning (" << lineno; + cerr << "): assignment to 'sl' side has no effect." << endl; } } @@ -311,8 +311,8 @@ TRXReader::write(string const &filename) FILE *out = fopen(filename.c_str(), "wb"); if(!out) { - wcerr << "Error: cannot open '" << filename; - wcerr << "' for writing" << endl; + cerr << "Error: cannot open '" << filename; + cerr << "' for writing" << endl; exit(EXIT_FAILURE); } @@ -324,41 +324,41 @@ TRXReader::write(string const &filename) void TRXReader::procDefAttrs() { - wstring attrname; + UString attrname; while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-def-attrs") + name != "section-def-attrs") { step(); - if(name == L"attr-item") + if(name == "attr-item") { if(type != XML_READER_TYPE_END_ELEMENT) { - insertAttrItem(attrname, attrib(L"tags")); + insertAttrItem(attrname, attrib("tags")); } } - else if(name == L"def-attr") + else if(name == "def-attr") { if(type != XML_READER_TYPE_END_ELEMENT) { - attrname = attrib(L"n"); + attrname = attrib("n"); } else { - wstring all = td.getAttrItems()[attrname]; - td.getAttrItems()[attrname] = L"(" + all + L")"; - attrname = L""; + UString all = td.getAttrItems()[attrname]; + td.getAttrItems()[attrname] = "(" + all + ")"; + attrname = ""; } } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } - else if(name == L"section-def-attrs") + else if(name == "section-def-attrs") { // do nothing } @@ -372,56 +372,56 @@ TRXReader::procDefAttrs() void TRXReader::procDefCats() { - while(type == XML_READER_TYPE_END_ELEMENT || !(name == L"transfer" || name == L"interchunk" || name == L"postchunk")) + while(type == XML_READER_TYPE_END_ELEMENT || !(name == "transfer" || name == "interchunk" || name == "postchunk")) { step(); - if(name != L"#text" && name != L"transfer" && name != L"interchunk" && - name != L"postchunk" && name != L"section-def-cats" && name != L"#comment") + if(name != "#text" && name != "transfer" && name != "interchunk" && + name != "postchunk" && name != "section-def-cats" && name != "#comment") { unexpectedTag(); } } - wstring catname; + UString catname; while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-def-cats") + name != "section-def-cats") { step(); - if(name == L"cat-item") + if(name == "cat-item") { if(type != XML_READER_TYPE_END_ELEMENT) { - if(attrib(L"tags") != L"") + if(attrib("tags") != "") { - insertCatItem(catname, attrib(L"lemma"), attrib(L"tags")); + insertCatItem(catname, attrib("lemma"), attrib("tags")); } else { - insertCatItem(catname, attrib(L"name"), L""); + insertCatItem(catname, attrib("name"), ""); } } } - else if(name == L"def-cat") + else if(name == "def-cat") { if(type != XML_READER_TYPE_END_ELEMENT) { - catname = attrib(L"n"); + catname = attrib("n"); } else { - catname = L""; + catname = ""; } } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } - else if(name == L"section-def-cats") + else if(name == "section-def-cats") { // do nothing } @@ -436,25 +436,25 @@ void TRXReader::procDefVars() { while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-def-vars") + name != "section-def-vars") { step(); - if(name == L"def-var") + if(name == "def-var") { if(type != XML_READER_TYPE_END_ELEMENT) { - createVar(attrib(L"n"), attrib(L"v")); + createVar(attrib("n"), attrib("v")); } } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } - else if(name == L"section-def-vars") + else if(name == "section-def-vars") { // do nothing } @@ -468,39 +468,39 @@ TRXReader::procDefVars() void TRXReader::procDefLists() { - wstring listname; + UString listname; while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-def-lists") + name != "section-def-lists") { step(); - if(name == L"list-item") + if(name == "list-item") { if(type != XML_READER_TYPE_END_ELEMENT) { - insertListItem(listname, attrib(L"v")); + insertListItem(listname, attrib("v")); } } - else if(name == L"def-list") + else if(name == "def-list") { if(type != XML_READER_TYPE_END_ELEMENT) { - listname = attrib(L"n"); + listname = attrib("n"); } else { - listname = L""; + listname = ""; } } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } - else if(name == L"section-def-lists") + else if(name == "section-def-lists") { // do nothing } @@ -516,53 +516,53 @@ TRXReader::procDefMacros() { int count = 0; while(type != XML_READER_TYPE_END_ELEMENT || - name != L"section-def-macros") + name != "section-def-macros") { step(); - if(name == L"def-macro") + if(name == "def-macro") { if(type != XML_READER_TYPE_END_ELEMENT) { - createMacro(attrib(L"n"), count++); + createMacro(attrib("n"), count++); } } } } void -TRXReader::createMacro(wstring const &name, int const value) +TRXReader::createMacro(UString const &name, int const value) { if(td.getMacros().find(name) != td.getMacros().end()) { - parseError(L"Macro '" + name + L"' defined at least twice"); + parseError("Macro '" + name + "' defined at least twice"); } td.getMacros()[name] = value; } void -TRXReader::insertListItem(wstring const &name, wstring const &value) +TRXReader::insertListItem(UString const &name, UString const &value) { td.getLists()[name].insert(value); } void -TRXReader::createVar(wstring const &name, wstring const &initial_value) +TRXReader::createVar(UString const &name, UString const &initial_value) { td.getVariables()[name] = initial_value; } void -TRXReader::insertCatItem(wstring const &name, wstring const &lemma, - wstring const &tags) +TRXReader::insertCatItem(UString const &name, UString const &lemma, + UString const &tags) { LemmaTags lt; lt.lemma = lemma; lt.tags = tags; - cat_items.insert(pair(name, lt)); + cat_items.insert(pair(name, lt)); } void -TRXReader::insertAttrItem(wstring const &name, wstring const &tags) +TRXReader::insertAttrItem(UString const &name, UString const &tags) { if(td.getAttrItems()[name].size() != 0) { @@ -575,7 +575,7 @@ TRXReader::insertAttrItem(wstring const &name, wstring const &tags) { if(tags[i] == L'.') { - td.getAttrItems()[name].append(L"><"); + td.getAttrItems()[name].append("><"); } else { diff --git a/apertium/trx_reader.h b/apertium/trx_reader.h index 18ebef2..ddb6a6c 100644 --- a/apertium/trx_reader.h +++ b/apertium/trx_reader.h @@ -32,11 +32,11 @@ class TRXReader : public XMLReader private: struct LemmaTags { - wstring lemma; - wstring tags; + UString lemma; + UString tags; }; - multimap cat_items; + multimap cat_items; TransferData td; void destroy(); @@ -50,22 +50,22 @@ private: void procDefMacros(); void procRules(); - void insertCatItem(wstring const &name, wstring const &lemma, - wstring const &tags); - void insertAttrItem(wstring const &name, wstring const &tags); - void createVar(wstring const &name, wstring const &initial_value); - void insertListItem(wstring const &name, wstring const &value); - void createMacro(wstring const &name, int const val); + void insertCatItem(UString const &name, UString const &lemma, + UString const &tags); + void insertAttrItem(UString const &name, UString const &tags); + void createVar(UString const &name, UString const &initial_value); + void insertListItem(UString const &name, UString const &value); + void createMacro(UString const &name, int const val); - int insertLemma(int const base, wstring const &lemma); - int insertTags(int const base, wstring const &tags); + int insertLemma(int const base, UString const &lemma); + int insertTags(int const base, UString const &tags); protected: virtual void parse(); public: - static wstring const ANY_TAG; - static wstring const ANY_CHAR; + static UString const ANY_TAG; + static UString const ANY_CHAR; TRXReader(); diff --git a/apertium/tsx_reader.cc b/apertium/tsx_reader.cc index 750bbf4..92f0d5a 100644 --- a/apertium/tsx_reader.cc +++ b/apertium/tsx_reader.cc @@ -61,13 +61,13 @@ TSXReader::clearTagIndex() { tag_index->clear(); array_tags->clear(); - newTagIndex(L"LPAR"); - newTagIndex(L"RPAR"); - newTagIndex(L"LQUEST"); - newTagIndex(L"CM"); - newTagIndex(L"SENT"); - newTagIndex(L"kEOF"); - newTagIndex(L"kUNDEF"); + newTagIndex("LPAR"); + newTagIndex("RPAR"); + newTagIndex("LQUEST"); + newTagIndex("CM"); + newTagIndex("SENT"); + newTagIndex("kEOF"); + newTagIndex("kUNDEF"); } TSXReader & @@ -82,31 +82,31 @@ TSXReader::operator =(TSXReader const &o) } void -TSXReader::newTagIndex(wstring const &tag) +TSXReader::newTagIndex(UString const &tag) { - if(tag_index->find(L"TAG_" + tag) != tag_index->end()) + if(tag_index->find("TAG_" + tag) != tag_index->end()) { - parseError(L"'" + tag + L"' already defined"); + parseError("'" + tag + "' already defined"); } - array_tags->push_back(L"TAG_" + tag); - (*tag_index)[L"TAG_" + tag] = array_tags->size() - 1; + array_tags->push_back("TAG_" + tag); + (*tag_index)["TAG_" + tag] = array_tags->size() - 1; } void -TSXReader::newDefTag(wstring const &tag) +TSXReader::newDefTag(UString const &tag) { - if(tag_index->find(L"TAG_" + tag) != tag_index->end()) + if(tag_index->find("TAG_" + tag) != tag_index->end()) { - parseError(L"'" + tag + L"' already defined"); + parseError("'" + tag + "' already defined"); } array_tags->push_back(tag); - (*tag_index)[L"TAG_" + tag] = array_tags->size() - 1; + (*tag_index)["TAG_" + tag] = array_tags->size() - 1; } void -TSXReader::newConstant(wstring const &constant) +TSXReader::newConstant(UString const &constant) { constants->setConstant(constant, array_tags->size()); array_tags->push_back(constant); @@ -115,26 +115,26 @@ TSXReader::newConstant(wstring const &constant) void TSXReader::procDiscardOnAmbiguity() { - while(type != XML_READER_TYPE_END_ELEMENT || name != L"discard-on-ambiguity") + while(type != XML_READER_TYPE_END_ELEMENT || name != "discard-on-ambiguity") { step(); - if(name == L"discard") + if(name == "discard") { if(type != XML_READER_TYPE_END_ELEMENT) { - tdata.addDiscard(L"<" + StringUtils::substitute(attrib(L"tags"), L".", L"><") + L">"); + tdata.addDiscard("<" + StringUtils::substitute(attrib("tags"), ".", "><") + ">"); } } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } - else if(name == L"discard-on-ambiguity") + else if(name == "discard-on-ambiguity") { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -142,7 +142,7 @@ TSXReader::procDiscardOnAmbiguity() } else { - parseError(L"Unexpected 'discard-on-ambiguity' open tag"); + parseError("Unexpected 'discard-on-ambiguity' open tag"); } } else @@ -155,36 +155,36 @@ TSXReader::procDiscardOnAmbiguity() void TSXReader::procDefLabel() { - wstring name_attr = attrib(L"name"); - wstring closed_attr = attrib(L"closed"); + UString name_attr = attrib("name"); + UString closed_attr = attrib("closed"); newDefTag(name_attr); - if(closed_attr != L"true") + if(closed_attr != "true") { - open_class->insert((*tag_index)[L"TAG_"+name_attr]); + open_class->insert((*tag_index)["TAG_"+name_attr]); } - while(type != XML_READER_TYPE_END_ELEMENT || name != L"def-label") + while(type != XML_READER_TYPE_END_ELEMENT || name != "def-label") { step(); - if(name == L"tags-item") + if(name == "tags-item") { if(type != XML_READER_TYPE_END_ELEMENT) { - plist->insert((*tag_index)[L"TAG_"+name_attr], attrib(L"lemma"), - attrib(L"tags")); + plist->insert((*tag_index)["TAG_"+name_attr], attrib("lemma"), + attrib("tags")); } } - else if(name == L"def-label") + else if(name == "def-label") { return; } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } @@ -198,50 +198,50 @@ TSXReader::procDefLabel() void TSXReader::procDefMult() { - wstring name_attr = attrib(L"name"); - wstring closed_attr = attrib(L"closed"); + UString name_attr = attrib("name"); + UString closed_attr = attrib("closed"); newDefTag(name_attr); - if(closed_attr != L"true") + if(closed_attr != "true") { - open_class->insert((*tag_index)[L"TAG_"+name_attr]); + open_class->insert((*tag_index)["TAG_"+name_attr]); } - while(type != XML_READER_TYPE_END_ELEMENT || name != L"def-mult") + while(type != XML_READER_TYPE_END_ELEMENT || name != "def-mult") { step(); - if(name == L"sequence") + if(name == "sequence") { if(type != XML_READER_TYPE_END_ELEMENT) { plist->beginSequence(); - while(type != XML_READER_TYPE_END_ELEMENT || name != L"sequence") + while(type != XML_READER_TYPE_END_ELEMENT || name != "sequence") { step(); - if(name == L"label-item") + if(name == "label-item") { if(type != XML_READER_TYPE_END_ELEMENT) { - plist->insert((*tag_index)[L"TAG_"+name_attr], - (*tag_index)[L"TAG_"+attrib(L"label")]); + plist->insert((*tag_index)["TAG_"+name_attr], + (*tag_index)["TAG_"+attrib("label")]); } } - else if(name == L"tags-item") + else if(name == "tags-item") { if(type != XML_READER_TYPE_END_ELEMENT) { - plist->insert((*tag_index)[L"TAG_"+name_attr], - attrib(L"lemma"), attrib(L"tags")); + plist->insert((*tag_index)["TAG_"+name_attr], + attrib("lemma"), attrib("tags")); } } - else if(name == L"sequence") + else if(name == "sequence") { break; } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } @@ -249,15 +249,15 @@ TSXReader::procDefMult() plist->endSequence(); } } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } - else if(name == L"def-mult") + else if(name == "def-mult") { // do nothing } @@ -271,41 +271,41 @@ TSXReader::procDefMult() void TSXReader::procTagset() { - while(type == XML_READER_TYPE_END_ELEMENT || name != L"tagset") + while(type == XML_READER_TYPE_END_ELEMENT || name != "tagset") { step(); - if(name != L"#text" && name != L"tagger" && name != L"tagset") + if(name != "#text" && name != "tagger" && name != "tagset") { unexpectedTag(); } } - while(type != XML_READER_TYPE_END_ELEMENT || name != L"tagset") + while(type != XML_READER_TYPE_END_ELEMENT || name != "tagset") { step(); - if(name == L"def-label") + if(name == "def-label") { if(type != XML_READER_TYPE_END_ELEMENT) { procDefLabel(); } } - else if(name == L"def-mult") + else if(name == "def-mult") { if(type != XML_READER_TYPE_END_ELEMENT) { procDefMult(); } } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } - else if(name == L"tagset") + else if(name == "tagset") { // do nothing } @@ -323,27 +323,27 @@ TSXReader::procLabelSequence() TForbidRule forbid_rule; step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } - if(name != L"label-item") + if(name != "label-item") { - parseError(L" tag expected"); + parseError(" tag expected"); } - forbid_rule.tagi = (*tag_index)[L"TAG_" + attrib(L"label")]; + forbid_rule.tagi = (*tag_index)["TAG_" + attrib("label")]; step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } - if(name != L"label-item") + if(name != "label-item") { - parseError(L" tag expected"); + parseError(" tag expected"); } - forbid_rule.tagj = (*tag_index)[L"TAG_" + attrib(L"label")]; + forbid_rule.tagj = (*tag_index)["TAG_" + attrib("label")]; forbid_rules->push_back(forbid_rule); } @@ -351,25 +351,25 @@ TSXReader::procLabelSequence() void TSXReader::procForbid() { - while(type != XML_READER_TYPE_END_ELEMENT || name != L"forbid") + while(type != XML_READER_TYPE_END_ELEMENT || name != "forbid") { step(); - if(name == L"label-sequence") + if(name == "label-sequence") { if(type != XML_READER_TYPE_END_ELEMENT) { procLabelSequence(); } } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } - else if(name == L"forbid") + else if(name == "forbid") { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -377,12 +377,12 @@ TSXReader::procForbid() } else { - parseError(L"Unexpected '" + name + L"' open tag"); + parseError("Unexpected '" + name + "' open tag"); } } else { - parseError(L"Unexpected '" + name + L"' tag"); + parseError("Unexpected '" + name + "' tag"); } } } @@ -391,14 +391,14 @@ void TSXReader::procEnforce() { TEnforceAfterRule aux; - while(type != XML_READER_TYPE_END_ELEMENT || name != L"enforce-rules") + while(type != XML_READER_TYPE_END_ELEMENT || name != "enforce-rules") { step(); - if(name == L"enforce-after") + if(name == "enforce-after") { if(type != XML_READER_TYPE_END_ELEMENT) { - aux.tagi = (*tag_index)[L"TAG_" + attrib(L"label")]; + aux.tagi = (*tag_index)["TAG_" + attrib("label")]; } else { @@ -406,26 +406,26 @@ TSXReader::procEnforce() aux.tagsj.clear(); } } - else if(name == L"label-set") + else if(name == "label-set") { // do nothing } - else if(name == L"label-item") + else if(name == "label-item") { if(type != XML_READER_TYPE_END_ELEMENT) { - aux.tagsj.push_back((*tag_index)[L"TAG_" + attrib(L"label")]); + aux.tagsj.push_back((*tag_index)["TAG_" + attrib("label")]); } } - else if(name == L"#text") + else if(name == "#text") { // do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } - else if(name == L"enforce-rules") + else if(name == "enforce-rules") { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -433,12 +433,12 @@ TSXReader::procEnforce() } else { - parseError(L"Unexpected 'enforce-rules' open tag"); + parseError("Unexpected 'enforce-rules' open tag"); } } else { - parseError(L"Unexpected '" + name + L"' tag"); + parseError("Unexpected '" + name + "' tag"); } } } @@ -446,26 +446,26 @@ TSXReader::procEnforce() void TSXReader::procPreferences() { - while(type != XML_READER_TYPE_END_ELEMENT || name != L"preferences") + while(type != XML_READER_TYPE_END_ELEMENT || name != "preferences") { step(); - if(name == L"prefer") + if(name == "prefer") { if(type != XML_READER_TYPE_END_ELEMENT) { - wstring const tags = L"<" + StringUtils::substitute(attrib(L"tags"), L".", L"><") + L">"; + UString const tags = "<" + StringUtils::substitute(attrib("tags"), ".", "><") + ">"; prefer_rules->push_back(tags); } } - else if(name == L"#text") + else if(name == "#text") { //do nothing } - else if(name == L"#comment") + else if(name == "#comment") { // do nothing } - else if(name == L"preferences") + else if(name == "preferences") { if(type == XML_READER_TYPE_END_ELEMENT) { @@ -473,12 +473,12 @@ TSXReader::procPreferences() } else { - parseError(L"Unexpected 'preferences' open tag"); + parseError("Unexpected 'preferences' open tag"); } } else { - parseError(L"Unexpected '" + name + L"' tag"); + parseError("Unexpected '" + name + "' tag"); } } } @@ -494,38 +494,38 @@ TSXReader::parse() procTagset(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } - if(name == L"forbid") + if(name == "forbid") { procForbid(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } } - if(name == L"enforce-rules") + if(name == "enforce-rules") { procEnforce(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } } - if(name == L"preferences") + if(name == "preferences") { procPreferences(); step(); - while(name == L"#text" || name == L"#comment") + while(name == "#text" || name == "#comment") { step(); } } - if(name == L"discard-on-ambiguity") + if(name == "discard-on-ambiguity") { if(type != XML_READER_TYPE_END_ELEMENT) { @@ -533,20 +533,20 @@ TSXReader::parse() } } - newConstant(L"kMOT"); - newConstant(L"kDOLLAR"); - newConstant(L"kBARRA"); - newConstant(L"kMAS"); - newConstant(L"kIGNORAR"); - newConstant(L"kBEGIN"); - newConstant(L"kUNKNOWN"); - - plist->insert((*tag_index)[L"TAG_LPAR"], L"", L"lpar"); - plist->insert((*tag_index)[L"TAG_RPAR"], L"", L"rpar"); - plist->insert((*tag_index)[L"TAG_LQUEST"], L"", L"lquest"); - plist->insert((*tag_index)[L"TAG_CM"], L"", L"cm"); - plist->insert((*tag_index)[L"TAG_SENT"], L"", L"sent"); -// plist->insert((*tag_index)[L"TAG_kMAS"], L"+", L""); + newConstant("kMOT"); + newConstant("kDOLLAR"); + newConstant("kBARRA"); + newConstant("kMAS"); + newConstant("kIGNORAR"); + newConstant("kBEGIN"); + newConstant("kUNKNOWN"); + + plist->insert((*tag_index)["TAG_LPAR"], "", "lpar"); + plist->insert((*tag_index)["TAG_RPAR"], "", "rpar"); + plist->insert((*tag_index)["TAG_LQUEST"], "", "lquest"); + plist->insert((*tag_index)["TAG_CM"], "", "cm"); + plist->insert((*tag_index)["TAG_SENT"], "", "sent"); +// plist->insert((*tag_index)["TAG_kMAS"], "+", ""); plist->buildTransducer(); } diff --git a/apertium/tsx_reader.h b/apertium/tsx_reader.h index 8cc4829..03ca193 100644 --- a/apertium/tsx_reader.h +++ b/apertium/tsx_reader.h @@ -37,17 +37,17 @@ class TSXReader : public XMLReader private: set *open_class; vector *forbid_rules; - map *tag_index; - vector *array_tags; + map *tag_index; + vector *array_tags; vector *enforce_rules; - vector *prefer_rules; + vector *prefer_rules; PatternList *plist; ConstantManager *constants; TaggerData tdata; - void newTagIndex(wstring const &tag); - void newDefTag(wstring const &tag); - void newConstant(wstring const &constant); + void newTagIndex(UString const &tag); + void newDefTag(UString const &tag); + void newConstant(UString const &constant); void procDefLabel(); void procDefMult(); void procDiscardOnAmbiguity(); diff --git a/apertium/unigram_tagger.cc b/apertium/unigram_tagger.cc index 5738816..a9ff362 100644 --- a/apertium/unigram_tagger.cc +++ b/apertium/unigram_tagger.cc @@ -134,9 +134,9 @@ UnigramTagger::score(const Analysis& Analysis_) { } if(TheFlags.getDebug()) { - score_DEBUG << L"(" << tokenCount_r_a << L" * " - << tokenCount_a << L") /\n (" - << tokenCount_a << L" + " << typeCount_a << L")"; + score_DEBUG << "(" << tokenCount_r_a << " * " + << tokenCount_a << ") /\n (" + << tokenCount_a << " + " << typeCount_a << ")"; } return (tokenCount_r_a * tokenCount_a) / (tokenCount_a + typeCount_a); } @@ -159,7 +159,7 @@ UnigramTagger::model3_score(const Analysis &Analysis_) i i_(Analysis_); Lemma l_(Analysis_); - std::wstringstream score_DEBUG_div; + std::stringstream score_DEBUG_div; if(Model3_l_t.find(i_) != Model3_l_t.end()) { if(Model3_l_t[i_].find(l_) != Model3_l_t[i_].end()) @@ -174,9 +174,9 @@ UnigramTagger::model3_score(const Analysis &Analysis_) } if(TheFlags.getDebug()) { - score_DEBUG << L"(" << tokenCount_r_i << L" * " << tokenCount_i; - std::wstringstream score_DEBUG_div; - score_DEBUG_div << L"(" << tokenCount_i << L" + " << typeCount_i << L")"; + score_DEBUG << "(" << tokenCount_r_i << " * " << tokenCount_i; + std::stringstream score_DEBUG_div; + score_DEBUG_div << "(" << tokenCount_i << " + " << typeCount_i << ")"; } long double score = tokenCount_r_i * tokenCount_i; @@ -223,9 +223,9 @@ UnigramTagger::model3_score(const Analysis &Analysis_) } if(TheFlags.getDebug()) { - score_DEBUG << L" * " << tokenCount_d_i << L" * " << tokenCount_i_d; - score_DEBUG_div << L" * (" << tokenCount_i << L" + " << typeCount_i - << L") * (" << tokenCount_d << L" + " << typeCount_d << L")"; + score_DEBUG << " * " << tokenCount_d_i << " * " << tokenCount_i_d; + score_DEBUG_div << " * (" << tokenCount_i << " + " << typeCount_i + << ") * (" << tokenCount_d << " + " << typeCount_d << ")"; } score *= (tokenCount_d_i * tokenCount_i_d); @@ -233,7 +233,7 @@ UnigramTagger::model3_score(const Analysis &Analysis_) } if(TheFlags.getDebug()) { - score_DEBUG << L") /\n [" << score_DEBUG_div.str() << L"]"; + score_DEBUG << ") /\n [" << score_DEBUG_div.str() << "]"; } return score / score_Divisor; @@ -255,7 +255,7 @@ UnigramTagger::tag(Stream &Input, std::wostream &Output) } if(TheFlags.getDebug()) { - std::wcerr << L"\n\n"; + std::cerr << "\n\n"; } tag(*StreamedType_.TheLexicalUnit, Output); @@ -275,7 +275,7 @@ UnigramTagger::tag(const LexicalUnit &LexicalUnit_, std::wostream &Output) { if(TheFlags.getDebug()) { - score_DEBUG.str(L""); + score_DEBUG.str(""); } const Analysis& a_ = LexicalUnit_.TheAnalyses[n]; long double s = score(a_); @@ -286,10 +286,10 @@ UnigramTagger::tag(const LexicalUnit &LexicalUnit_, std::wostream &Output) } if(TheFlags.getDebug()) { - std::wcerr << L"score(\"" << a_ << L"\") ==\n " - << score_DEBUG.str() << L" ==\n " << std::fixed + std::cerr << "score(\"" << a_ << "\") ==\n " + << score_DEBUG.str() << " ==\n " << std::fixed << std::setprecision(std::numeric_limits::digits10) - << s << L"\n"; + << s << "\n"; } } diff --git a/apertium/unigram_tagger.h b/apertium/unigram_tagger.h index 58af2e9..1101f94 100644 --- a/apertium/unigram_tagger.h +++ b/apertium/unigram_tagger.h @@ -46,7 +46,7 @@ class UnigramTagger : public StreamTagger { private: long double model3_score(const Analysis &Analysis_); void tag(const LexicalUnit &LexicalUnit_, std::wostream &Output); - std::wstringstream score_DEBUG; + std::stringstream score_DEBUG; protected: UnigramTaggerModel model; diff --git a/apertium/unlocked_cstdio.h b/apertium/unlocked_cstdio.h index 5fe402c..2d043c5 100644 --- a/apertium/unlocked_cstdio.h +++ b/apertium/unlocked_cstdio.h @@ -40,24 +40,6 @@ #define fread_unlocked fread #endif -#if !HAVE_DECL_FGETWC_UNLOCKED -#define fgetwc_unlocked fgetwc -#endif - -#if !HAVE_DECL_FPUTWC_UNLOCKED -#define fputwc_unlocked fputwc -#endif - -#if !HAVE_DECL_FPUTWS_UNLOCKED -#define fputws_unlocked fputws -#endif - -#if !HAVE_MBTOWC -#include -inline int wctomb(char *s, wchar_t wc) { return wcrtomb(s,wc,NULL); } -inline int mbtowc(wchar_t *pwc, const char *s, size_t n) { return mbrtowc(pwc, s, n, NULL); } -#endif - #ifdef _WIN32 #include #endif diff --git a/apertium/utf_converter.cc b/apertium/utf_converter.cc index b79e834..e721152 100644 --- a/apertium/utf_converter.cc +++ b/apertium/utf_converter.cc @@ -62,7 +62,7 @@ namespace UtfConverter void conversionError() { - wcerr << L"Error: conversion error" << endl; + cerr << "Error: conversion error" << endl; exit(EXIT_FAILURE); } diff --git a/apertium/xml_reader.cc b/apertium/xml_reader.cc index b16a3f5..ccd3c3c 100644 --- a/apertium/xml_reader.cc +++ b/apertium/xml_reader.cc @@ -7,7 +7,7 @@ XMLReader::XmlTextReaderResource::XmlTextReaderResource( { reader = xmlReaderForFile(filename.c_str(), NULL, 0); if (reader == NULL) { - wcerr << L"Error: Cannot open '" << filename << L"'." << endl; + cerr << "Error: Cannot open '" << filename << "'." << endl; exit(EXIT_FAILURE); } } @@ -25,7 +25,7 @@ XMLReader::XMLReader() : reader(0), type(0) {} void XMLReader::stepToTag() { - while (name == L"#text" || name == L"#comment") { + while (name == "#text" || name == "#comment") { step(); } } @@ -36,15 +36,15 @@ XMLReader::step() int retval = xmlTextReaderRead(reader); if (retval != 1) { - parseError(L"unexpected EOF"); + parseError("unexpected EOF"); } - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); type = xmlTextReaderNodeType(reader); - //std::wcerr << name << L": " << type << "\n"; + //std::cerr << name << ": " << type << "\n"; } void -XMLReader::stepPastSelfClosingTag(wstring const &tag) +XMLReader::stepPastSelfClosingTag(UString const &tag) { // libxml2 expands to inside entities. // This method exists to work around this difference. @@ -63,8 +63,8 @@ XMLReader::stepToNextTag() stepToTag(); } -wstring -XMLReader::attrib(wstring const &name) +UString +XMLReader::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } @@ -76,18 +76,18 @@ XMLReader::attrib(string const &name) } void -XMLReader::parseError(wstring const &message) +XMLReader::parseError(UString const &message) { - wcerr << L"Error at line " << xmlTextReaderGetParserLineNumber(reader) - << L", column " << xmlTextReaderGetParserColumnNumber(reader) - << L": " << message << L"." << endl; + cerr << "Error at line " << xmlTextReaderGetParserLineNumber(reader) + << ", column " << xmlTextReaderGetParserColumnNumber(reader) + << ": " << message << "." << endl; exit(EXIT_FAILURE); } void XMLReader::unexpectedTag() { - parseError(L"unexpected '<" + name + L">' tag"); + parseError("unexpected '<" + name + ">' tag"); } void diff --git a/apertium/xml_reader.h b/apertium/xml_reader.h index 252e2c4..46dfa7b 100644 --- a/apertium/xml_reader.h +++ b/apertium/xml_reader.h @@ -34,13 +34,13 @@ protected: XMLReader(); xmlTextReaderPtr reader; int type; - wstring name; - wstring attrib(wstring const &name); + UString name; + UString attrib(UString const &name); string attrib(string const &name); - void parseError(wstring const &message); + void parseError(UString const &message); void unexpectedTag(); void stepToTag(); - void stepPastSelfClosingTag(wstring const &tag); + void stepPastSelfClosingTag(UString const &tag); void stepToNextTag(); void step(); virtual void parse() = 0; diff --git a/configure.ac b/configure.ac index 6c992da..1cc4491 100644 --- a/configure.ac +++ b/configure.ac @@ -90,41 +90,18 @@ fi PKG_CHECK_MODULES(LTTOOLBOX, [lttoolbox >= 3.5.3], CPPFLAGS="$CPPFLAGS $LTTOOLBOX_CFLAGS"; LIBS="$LIBS $LTTOOLBOX_LIBS") PKG_CHECK_MODULES(LIBXML2, [libxml-2.0 >= 2.6.17], CPPFLAGS="$CPPFLAGS $LIBXML2_CFLAGS"; LIBS="$LIBS $LIBXML2_LIBS") PKG_CHECK_MODULES(PCRE, [libpcre >= 6.4], CPPFLAGS="$CPPFLAGS $PCRE_CFLAGS"; LIBS="$LIBS $PCRE_LIBS") +PKG_CHECK_MODULES(ICU, [icu-i18n, icu-io, icu-uc], CPPFLAGS="$CPPFLAGS $ICU_CFLAGS"; LIBS="$LIBS $ICU_LIBS") -# Check for wide strings -AC_DEFUN([AC_CXX_WSTRING],[ - AC_CACHE_CHECK(whether the compiler supports wide strings, - ac_cv_cxx_wstring, - [AC_LANG_SAVE - AC_LANG_CPLUSPLUS - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]],[[ -std::wstring test = L"test"; - ]])], - [ac_cv_cxx_wstring=yes], [ac_cv_cxx_wstring=no]) - AC_LANG_RESTORE - ]) -]) - -AC_CXX_WSTRING AC_C_BIGENDIAN -if test "$ac_cv_cxx_wstring" = no -then - AC_MSG_ERROR([Missing wide string support]) -fi - - # Checks for header files. AC_LANG(C++) AC_HEADER_STDC AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h filesystem string_view]) AC_CHECK_LIB([stdc++fs], [_ZNSt12experimental10filesystem2v112current_pathEv]) -# Checks for ICU -AC_CHECK_ICU(50) - -AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, getopt, getopt_long, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked]) -AC_CHECK_FUNCS([setlocale strdup getopt snprintf mbtowc]) +AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, getopt, getopt_long]) +AC_CHECK_FUNCS([setlocale strdup getopt snprintf]) AC_REPLACE_FUNCS(getopt_long) AM_CONDITIONAL([WINDOWS], [test x$version_type = xwindows])