commit 0ff8bf4c83fd518692fab97c01808cbe322d8f83 Author: Daniel Swanson Date: Mon Jun 7 18:39:49 2021 -0500 the long march has taken a detour into deduplicating transfer code diff --git a/apertium/Makefile.am b/apertium/Makefile.am index 48223c4..3db38ce 100644 --- a/apertium/Makefile.am +++ b/apertium/Makefile.am @@ -63,6 +63,7 @@ h_sources = a.h \ tmx_trail_postprocessors.h \ tmx_translate.h \ tmx_words.h \ + transfer_base.h \ transfer_data.h \ transfer.h \ transfer_instr.h \ @@ -77,7 +78,8 @@ h_sources = a.h \ unlocked_cstdio.h \ utf_converter.h \ utils.h \ - xml_reader.h + xml_reader.h \ + xml_walk_util.h cc_sources = a.cc \ align.cc \ @@ -130,6 +132,7 @@ cc_sources = a.cc \ tmx_trail_postprocessors.cc \ tmx_translate.cc \ transfer.cc \ + transfer_base.cc \ transfer_data.cc \ transfer_instr.cc \ transfer_mult.cc \ @@ -140,7 +143,8 @@ cc_sources = a.cc \ tsx_reader.cc \ unigram_tagger.cc \ utf_converter.cc \ - xml_reader.cc + xml_reader.cc \ + xml_walk_util.cc library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) library_include_HEADERS = $(h_sources) diff --git a/apertium/apertium_re.cc b/apertium/apertium_re.cc index b12dc3d..66426c7 100644 --- a/apertium/apertium_re.cc +++ b/apertium/apertium_re.cc @@ -22,19 +22,7 @@ using namespace Apertium; using namespace std; - -std::string& pcre_version_endian() { - static std::string pve; - if (pve.empty()) { - pve = pcre_version(); -#ifdef WORDS_BIGENDIAN - pve += "-be"; -#else - pve += "-le"; -#endif - } - return pve; -} +using namespace icu; ApertiumRE::ApertiumRE() : re(0) @@ -44,9 +32,8 @@ re(0) ApertiumRE::~ApertiumRE() { - if(!empty) - { - pcre_free(re); + if(!empty) { + delete re; } empty = true; } @@ -55,27 +42,22 @@ void ApertiumRE::read(FILE *input) { unsigned int size = Compression::multibyte_read(input); - re = static_cast(pcre_malloc(size)); - if(size != fread(re, 1, size, input)) - { + if (fseek(input, size, SEEK_CUR) != 0) { cerr << "Error reading regexp" << endl; exit(EXIT_FAILURE); } - empty = false; + empty = true; } void -ApertiumRE::compile(string const &str) +ApertiumRE::compile(UString const &str) { - const char *error; - int erroroffset; - re = pcre_compile(str.c_str(), PCRE_DOTALL|PCRE_CASELESS|PCRE_EXTENDED|PCRE_UTF8, - &error, &erroroffset, NULL); - if(re == NULL) - { - cerr << "Error: pcre_compile "; - cerr << error << endl; + UnicodeString s = str.c_str(); + UErrorCode err = U_ZERO_ERROR; + re = RegexPattern::compile(s, UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, err); + if(err != U_ZERO_ERROR) { + cerr << "Error: unable to compile regular expression '" << str << "'." << endl; exit(EXIT_FAILURE); } @@ -85,88 +67,68 @@ ApertiumRE::compile(string const &str) void ApertiumRE::write(FILE *output) const { - if(empty) - { + if(empty) { cerr << "Error, cannot write empty regexp" << endl; exit(EXIT_FAILURE); } + // for backwards compatibility, write empty binary form + Compression::multibyte_write(0, output); +} - size_t size; - int rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size); - if(rc < 0) - { - cerr << "Error calling pcre_fullinfo()\n" << endl; - exit(EXIT_FAILURE); +UString +ApertiumRE::match(UString const &str) const +{ + if(empty) { + return ""_u; } - Compression::multibyte_write(size, output); + UnicodeString s = str.c_str(); + UErrorCode err = U_ZERO_ERROR; + RegexMatcher* m = re->matcher(s, err); - size_t rc2 = fwrite(re, 1, size, output); - if(rc2 != size) - { - cerr << "Error writing precompiled regex\n" << endl; + if (err != U_ZERO_ERROR) { + cerr << "Error: Unable to apply regexp" << endl; exit(EXIT_FAILURE); } -} -string -ApertiumRE::match(string const &str) const -{ - if(empty) - { - return ""; + if (!m->find()) { + return ""_u; } - int result[3]; - int workspace[4096]; -// int rc = pcre_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3); - int rc = pcre_dfa_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3, workspace, 4096); - - if(rc < 0) - { - switch(rc) - { - case PCRE_ERROR_NOMATCH: - return ""; - - default: - cerr << "Error: Unknown error matching regexp (code " << rc << ")" << endl; - exit(EXIT_FAILURE); - } + UString ret = m->group(err).getTerminatedBuffer(); + if (err != U_ZERO_ERROR) { + cerr << "Error: Unable to extract substring from regexp match" << endl; + exit(EXIT_FAILURE); } - return str.substr(result[0], result[1]-result[0]); + return ret; } // Return true if something was replaced and false otherwise bool -ApertiumRE::replace(string &str, string const &value) const +ApertiumRE::replace(UString &str, UString const &value) const { - if(empty) - { + if(empty) { return false; } - int result[3]; - int workspace[4096]; - // int rc = pcre_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3); - int rc = pcre_dfa_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3, workspace, 4096); - if(rc < 0) - { - switch(rc) - { - case PCRE_ERROR_NOMATCH: - return false; - - default: - cerr << "Error: Unknown error matching regexp (code " << rc << ")" << endl; - exit(EXIT_FAILURE); - } + UnicodeString s = str.c_str(); + UErrorCode err = U_ZERO_ERROR; + RegexMatcher* m = re->matcher(s, err); + + if (err != U_ZERO_ERROR) { + cerr << "Error: Unable to apply regexp" << endl; + exit(EXIT_FAILURE); } - string res = str.substr(0, result[0]); + // do this manually rather than call m->replaceFirst() + // because we want to know that a match happened + if (!m->find()) { + return false; + } + UString res = str.substr(0, m->start(err)); res.append(value); - res.append(str.substr(result[1])); - str = res; + res.append(str.substr(m->end(err))); + res.swap(str); return true; } diff --git a/apertium/apertium_re.h b/apertium/apertium_re.h index c9cb8c0..8344d95 100644 --- a/apertium/apertium_re.h +++ b/apertium/apertium_re.h @@ -18,27 +18,25 @@ #ifndef _APERTIUM_RE_ #define _APERTIUM_RE_ -#include #include -#include +#include +#include using namespace std; -std::string& pcre_version_endian(); - class ApertiumRE { private: bool empty; - pcre *re; + icu::RegexPattern* re; public: ApertiumRE(); ~ApertiumRE(); void read(FILE *); void write(FILE *) const; - string match(string const &str) const; - bool replace(string &str, string const &value) const; - void compile(string const &str); + UString match(UString const &str) const; + bool replace(UString &str, UString const &value) const; + void compile(UString const &str); }; #endif diff --git a/apertium/feature_vec.cc b/apertium/feature_vec.cc index fc95d10..4ba2ee2 100644 --- a/apertium/feature_vec.cc +++ b/apertium/feature_vec.cc @@ -79,9 +79,6 @@ operator<<(OStream & out, FeatureVec const &fv) return out; } -template std::wostream& -operator<<(std::wostream& out, FeatureVec const &fv); - template std::ostream& operator<<(std::ostream& out, FeatureVec const &fv); diff --git a/apertium/feature_vec.h b/apertium/feature_vec.h index a4dcd6a..18e848a 100644 --- a/apertium/feature_vec.h +++ b/apertium/feature_vec.h @@ -6,10 +6,11 @@ #include #include #include +#include namespace Apertium { -typedef std::vector FeatureKey; +typedef std::vector FeatureKey; struct CompareFeatureKey { bool operator() (FeatureKey const& lhs, FeatureKey const& rhs) const; }; diff --git a/apertium/file_morpho_stream.cc b/apertium/file_morpho_stream.cc index 46a398a..fa09a3c 100644 --- a/apertium/file_morpho_stream.cc +++ b/apertium/file_morpho_stream.cc @@ -38,17 +38,17 @@ FileMorphoStream::FileMorphoStream(const char* ftxt, bool d, TaggerData *t) : ca_any_tag = alphabet(PatternList::ANY_TAG); ConstantManager &constants = td->getConstants(); - ca_kignorar = constants.getConstant("kIGNORAR"); - ca_kbarra = constants.getConstant("kBARRA"); - ca_kdollar = constants.getConstant("kDOLLAR"); - ca_kbegin = constants.getConstant("kBEGIN"); - ca_kmot = constants.getConstant("kMOT"); - ca_kmas = constants.getConstant("kMAS"); - ca_kunknown = constants.getConstant("kUNKNOWN"); + ca_kignorar = constants.getConstant("kIGNORAR"_u); + ca_kbarra = constants.getConstant("kBARRA"_u); + ca_kdollar = constants.getConstant("kDOLLAR"_u); + ca_kbegin = constants.getConstant("kBEGIN"_u); + ca_kmot = constants.getConstant("kMOT"_u); + ca_kmas = constants.getConstant("kMAS"_u); + ca_kunknown = constants.getConstant("kUNKNOWN"_u); map &tag_index = td->getTagIndex(); - ca_tag_keof = tag_index["TAG_kEOF"]; - ca_tag_kundef = tag_index["TAG_kUNDEF"]; + ca_tag_keof = tag_index["TAG_kEOF"_u]; + ca_tag_kundef = tag_index["TAG_kUNDEF"_u]; end_of_file = false; null_flush = false; @@ -79,7 +79,7 @@ FileMorphoStream::get_next_word() return word; } - if(feof(input)) + if(input.eof()) { return NULL; } @@ -89,70 +89,61 @@ FileMorphoStream::get_next_word() while(true) { - int symbol = fgetwc_unlocked(input); - if(feof(input) || (null_flush && symbol == L'\0')) + UChar32 symbol = input.get(); + if(input.eof() || (null_flush && symbol == '\0')) { end_of_file = true; - vwords[ivwords]->add_tag(ca_tag_keof, "", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); return get_next_word(); } - if(symbol == L'^') + if(symbol == '^') { readRestOfWord(ivwords); return get_next_word(); } else { - UString str = ""; - if(symbol == L'\\') + UString str = ""_u; + if(symbol == '\\') { - symbol = fgetwc_unlocked(input); - str += L'\\'; - str += static_cast(symbol); - symbol = L'\\'; + symbol = input.get(); + str += '\\'; + str += symbol; + symbol = '\\'; } else { - str += static_cast(symbol); + str += symbol; } - while(symbol != L'^') + while(symbol != '^') { - symbol = fgetwc_unlocked(input); - if(feof(input) || (null_flush && symbol == L'\0')) - { - end_of_file = true; - vwords[ivwords]->add_ignored_string(str); - vwords[ivwords]->add_tag(ca_tag_keof, "", td->getPreferRules()); - return get_next_word(); - } - else if(symbol == L'\\') - { - str += L'\\'; - symbol = fgetwc_unlocked(input); - if(feof(input) || (null_flush && symbol == L'\0')) - { - end_of_file = true; - vwords[ivwords]->add_ignored_string(str); - vwords[ivwords]->add_tag(ca_tag_keof, "", td->getPreferRules()); - return get_next_word(); - } - str += static_cast(symbol); - symbol = L'\\'; - } - else if(symbol == L'^') - { - if(str.size() > 0) - { - vwords[ivwords]->add_ignored_string(str); + symbol = input.get(); + if(input.eof() || (null_flush && symbol == '\0')) { + end_of_file = true; + vwords[ivwords]->add_ignored_string(str); + vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); + return get_next_word(); + } else if(symbol == '\\') { + str += '\\'; + symbol = input.get(); + if(input.eof() || (null_flush && symbol == '\0')) { + end_of_file = true; + vwords[ivwords]->add_ignored_string(str); + vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); + return get_next_word(); } - readRestOfWord(ivwords); - return get_next_word(); - } - else - { - str += static_cast(symbol); - } + str += static_cast(symbol); + symbol = '\\'; + } else if(symbol == '^') { + if(str.size() > 0) { + vwords[ivwords]->add_ignored_string(str); + } + readRestOfWord(ivwords); + return get_next_word(); + } else { + str += static_cast(symbol); + } } } } @@ -168,9 +159,9 @@ FileMorphoStream::lrlmClassify(UString const &str, int &ivwords) ms.init(me->getInitial()); for(int i = 0, limit = str.size(); i != limit; i++) { - if(str[i] != L'<') + if(str[i] != '<') { - if(str[i] == L'+') + if(str[i] == '+') { int val = ms.classifyFinals(me->getFinals()); if(val != -1) @@ -183,14 +174,14 @@ FileMorphoStream::lrlmClassify(UString const &str, int &ivwords) } else { - UString tag = ""; + UString tag; for(int j = i+1; j != limit; j++) { - if(str[j] == L'\\') + if(str[j] == '\\') { j++; } - else if(str[j] == L'>') + else if(str[j] == '>') { tag = str.substr(i, j-i+1); i = j; @@ -216,7 +207,7 @@ FileMorphoStream::lrlmClassify(UString const &str, int &ivwords) vwords[ivwords]->add_tag(last_type, str.substr(floor, last_pos - floor + 1), td->getPreferRules()); - if(str[last_pos+1] == L'+' && last_pos+1 < limit ) + if(str[last_pos+1] == '+' && last_pos+1 < limit ) { floor = last_pos + 1; last_pos = floor + 1; @@ -248,7 +239,7 @@ FileMorphoStream::lrlmClassify(UString const &str, int &ivwords) vwords[ivwords]->add_tag(last_type, str.substr(floor, last_pos - floor + 1), td->getPreferRules()); - if(str[last_pos+1] == L'+' && last_pos+1 < limit ) + if(str[last_pos+1] == '+' && last_pos+1 < limit ) { floor = last_pos + 1; last_pos = floor; @@ -292,12 +283,12 @@ void FileMorphoStream::readRestOfWord(int &ivwords) { // first we have the superficial form - UString str = ""; + UString str; while(true) { - int symbol = fgetwc_unlocked(input); - if(feof(input) || (null_flush && symbol == L'\0')) + UChar32 symbol = input.get(); + if(input.eof() || (null_flush && symbol == '\0')) { end_of_file = true; if(str.size() > 0) @@ -307,25 +298,25 @@ FileMorphoStream::readRestOfWord(int &ivwords) cerr<<"Word being read: "<get_superficial_form()<<"\n"; cerr<<"Debug: "<< str <<"\n"; } - vwords[ivwords]->add_tag(ca_tag_keof, "", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); return; } - else if(symbol == L'\\') + else if(symbol == '\\') { - symbol = fgetwc_unlocked(input); - str += L'\\'; - str += static_cast(symbol); + symbol = input.get(); + str += '\\'; + str += symbol; } - else if(symbol == L'/') + else if(symbol == '/') { vwords[ivwords]->set_superficial_form(str); - str = ""; + str.clear(); break; } - else if(symbol == L'$') + else if(symbol == '$') { vwords[ivwords]->set_superficial_form(str); - vwords[ivwords]->add_ignored_string("$"); + vwords[ivwords]->add_ignored_string("$"_u); break; } else @@ -338,8 +329,8 @@ FileMorphoStream::readRestOfWord(int &ivwords) while(true) { - int symbol = fgetwc_unlocked(input); - if(feof(input) || (null_flush && symbol == L'\0')) + UChar32 symbol = input.get(); + if(input.eof() || (null_flush && symbol == '\0')) { end_of_file = true; if(str.size() > 0) @@ -349,26 +340,26 @@ FileMorphoStream::readRestOfWord(int &ivwords) cerr<<"Word being read: "<get_superficial_form()<<"\n"; cerr<<"Debug: "<< str <<"\n"; } - vwords[ivwords]->add_tag(ca_tag_keof, "", td->getPreferRules()); + vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); return; } - else if(symbol == L'\\') + else if(symbol == '\\') { - symbol = fgetwc_unlocked(input); - str += L'\\'; - str += static_cast(symbol); - symbol = L'\\'; // to prevent exiting with '\$' + symbol = input.get(); + str += '\\'; + str += symbol; + symbol = '\\'; // to prevent exiting with '\$' } - else if(symbol == L'/') + else if(symbol == '/') { lrlmClassify(str, ivwords); - str = ""; + str.clear(); ivwords = 0; continue; } - else if(symbol == L'$') + else if(symbol == '$') { - if(str[0] != L'*')// do nothing with unknown words + if(str[0] != '*')// do nothing with unknown words { lrlmClassify(str, ivwords); } @@ -402,6 +393,6 @@ FileMorphoStream::setEndOfFile(bool eof) void FileMorphoStream::rewind() { - std::fseek(input, 0, SEEK_SET); + input.rewind(); end_of_file = false; } diff --git a/apertium/file_morpho_stream.h b/apertium/file_morpho_stream.h index 6a6ecf6..fdf8871 100644 --- a/apertium/file_morpho_stream.h +++ b/apertium/file_morpho_stream.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/apertium/file_tagger.cc b/apertium/file_tagger.cc index 774a616..f272a72 100644 --- a/apertium/file_tagger.cc +++ b/apertium/file_tagger.cc @@ -80,7 +80,7 @@ void FILE_Tagger::init_probabilities_kupiec_(const char* corpus_file) { init_probabilities_kupiec_(lexmorfo); } -void FILE_Tagger::read_dictionary(FILE *fdic) { +void FILE_Tagger::read_dictionary(const char* fdic) { tagger_utils::scan_for_ambg_classes(fdic, get_tagger_data()); tagger_utils::add_neccesary_ambg_classes(get_tagger_data()); post_ambg_class_scan(); diff --git a/apertium/file_tagger.h b/apertium/file_tagger.h index c7bceb0..00496e9 100644 --- a/apertium/file_tagger.h +++ b/apertium/file_tagger.h @@ -56,9 +56,9 @@ public: /** It reads the expanded dictionary received as a parameter and calculates * the set of ambiguity classes that the tagger will manage. - * @param is the input stream with the expanded dictionary to read + * @param is the filename of expanded dictionary to read (or NULL for stdin) */ - void read_dictionary(FILE *is); + void read_dictionary(const char* is); virtual TaggerData& get_tagger_data() = 0; diff --git a/apertium/hmm.cc b/apertium/hmm.cc index 2e7aec9..8c5f3c5 100644 --- a/apertium/hmm.cc +++ b/apertium/hmm.cc @@ -193,7 +193,7 @@ HMM::init_probabilities_kupiec(MorphoStream &lexmorfo) //We count for each ambiguity class the number of ocurrences word = lexmorfo.get_next_word(); while((word)) { - if (++nw%10000==0) cerr<get_tags(); @@ -302,7 +302,7 @@ HMM::init_probabilities_from_tagged_text(MorphoStream &stream_tagged, exit(1); } - if (++nw%100==0) cerr< #include #include -#include +#include using namespace std; @@ -35,7 +35,7 @@ using namespace std; */ class AccentsMap { - typedef std::map acmap; + typedef std::map acmap; private: acmap map; // Accent to character acmap::iterator it; // Iterator for searching diff --git a/apertium/lswpost.cc b/apertium/lswpost.cc index 16f53ac..eb10e91 100644 --- a/apertium/lswpost.cc +++ b/apertium/lswpost.cc @@ -53,7 +53,7 @@ TaggerData& LSWPoST::get_tagger_data() { void LSWPoST::deserialise(FILE *Serialised_FILE_Tagger) { tdlsw.read(Serialised_FILE_Tagger); - eos = (tdlsw.getTagIndex())["TAG_SENT"]; + eos = (tdlsw.getTagIndex())["TAG_SENT"_u]; } std::vector &LSWPoST::getArrayTags() { @@ -64,7 +64,7 @@ void LSWPoST::serialise(FILE *Stream_) { tdlsw.write(Stream_); } void LSWPoST::deserialise(const TaggerData &Deserialised_FILE_Tagger) { tdlsw = TaggerDataLSW(Deserialised_FILE_Tagger); - eos = (tdlsw.getTagIndex())["TAG_SENT"]; + eos = (tdlsw.getTagIndex())["TAG_SENT"_u]; } void LSWPoST::init_probabilities_from_tagged_text_(MorphoStream &, MorphoStream &) { @@ -88,7 +88,7 @@ LSWPoST::LSWPoST(TaggerFlags& Flags_) : FILE_Tagger(Flags_) {} LSWPoST::LSWPoST(TaggerDataLSW t) { tdlsw = t; - eos = (tdlsw.getTagIndex())["TAG_SENT"]; + eos = (tdlsw.getTagIndex())["TAG_SENT"_u]; } LSWPoST::~LSWPoST() {} @@ -112,7 +112,7 @@ LSWPoST::init_probabilities(MorphoStream &morpho_stream) { int num_valid_seq = 0; word = new TaggerWord(); // word for tags left - word->add_tag(eos, "sent", tdlsw.getPreferRules()); + word->add_tag(eos, "sent"_u, tdlsw.getPreferRules()); tags_left = word->get_tags(); // tags left if (tags_left.size()==0) { //This is an unknown word tags_left = tdlsw.getOpenClass(); @@ -138,7 +138,7 @@ LSWPoST::init_probabilities(MorphoStream &morpho_stream) { // count each element of the para matrix while (word != NULL) { if (++nw % 10000 == 0) { - cerr << L'.' << flush; + cerr << '.' << flush; } tags_right = word->get_tags(); // tags right @@ -249,7 +249,7 @@ LSWPoST::train(MorphoStream &morpho_stream) { vector > > para_matrix_new(N, vector >(N, vector(N, 0))); word = new TaggerWord(); // word for tags left - word->add_tag(eos, "sent", tdlsw.getPreferRules()); + word->add_tag(eos, "sent"_u, tdlsw.getPreferRules()); tags_left = word->get_tags(); // tags left if (tags_left.size()==0) { //This is an unknown word tags_left = tdlsw.getOpenClass(); @@ -273,7 +273,7 @@ LSWPoST::train(MorphoStream &morpho_stream) { while (word) { if (++nw % 10000 == 0) { - cerr << L'.' << flush; + cerr << '.' << flush; } tags_right = word->get_tags(); // tags right @@ -339,7 +339,7 @@ LSWPoST::tagger(MorphoStream &morpho_stream, UFILE* Output) { morpho_stream.setNullFlush(TheFlags.getNullFlush()); word_left = new TaggerWord(); // word left - word_left->add_tag(eos, "sent", tdlsw.getPreferRules()); + word_left->add_tag(eos, "sent"_u, tdlsw.getPreferRules()); word_left->set_show_sf(TheFlags.getShowSuperficial()); tags_left = word_left->get_tags(); // tags left @@ -380,13 +380,13 @@ LSWPoST::tagger(MorphoStream &morpho_stream, UFILE* Output) { } } - micad = word_mid->get_lexical_form(tag_max, (tdlsw.getTagIndex())["TAG_kEOF"]); + micad = word_mid->get_lexical_form(tag_max, (tdlsw.getTagIndex())["TAG_kEOF"_u]); write(micad, Output); if (morpho_stream.getEndOfFile()) { if (TheFlags.getNullFlush()) { u_fputc('\0', Output); } - fflush(Output); + u_fflush(Output); morpho_stream.setEndOfFile(false); } diff --git a/apertium/morpheme.cc b/apertium/morpheme.cc index 68e6b9c..0aa0640 100644 --- a/apertium/morpheme.cc +++ b/apertium/morpheme.cc @@ -30,10 +30,11 @@ bool operator<(const Morpheme &a, const Morpheme &b) { } std::ostream& operator<<(std::ostream& out, const Morpheme &morph) { - out << morph.TheLemma; - for (auto& it : morph.TheTags) { - out << "<" << it.TheTag << ">"; - } + // TODO! this isn't working for some reason + //out << morph.TheLemma; + //for (auto& it : morph.TheTags) { + // out << "<" << it.TheTag << ">"; + //} return out; } diff --git a/apertium/mtx_reader.cc b/apertium/mtx_reader.cc index 994986a..deb05fb 100644 --- a/apertium/mtx_reader.cc +++ b/apertium/mtx_reader.cc @@ -22,11 +22,16 @@ #include #include +#include // TODO + #include #include #include #include +typedef basic_istringstream uistringstream; +typedef basic_stringstream ustringstream; + // XML parsing function guideline // When control is pass to you, you might need to stepToTag // When delegating or returning control, step beyond yourself @@ -36,18 +41,21 @@ MTXReader::MTXReader(VM &spec) : spec(spec), in_global_defn(false), template_slot_counter(0), cur_feat(NULL) {} -size_t MTXReader::pushSetConst(std::string &val) +size_t MTXReader::pushSetConst(UString &val) { size_t set_idx = spec.set_consts.size(); - stringstream val_ss(val); - spec.set_consts.push_back(set( - istream_iterator(val_ss), - istream_iterator() - )); + set s; + ustringstream val_ss(val); + while (!val_ss.eof()) { + UString temp; + val_ss >> temp; + s.insert(temp); + } + spec.set_consts.push_back(s); return set_idx; } -size_t MTXReader::pushStrConst(std::string &val) +size_t MTXReader::pushStrConst(UString &val) { size_t str_idx = spec.str_consts.size(); spec.str_consts.push_back(val); @@ -83,7 +91,10 @@ void MTXReader::emitUInt(int val) void MTXReader::procCoarseTags() { - std::string tsx_fn = attrib("tag"); + UString tsx_fn_attr = attrib("tag"_u); + std::string tsx_fn; + utf8::utf16to8(tsx_fn_attr.begin(), tsx_fn_attr.end(), std::back_inserter(tsx_fn)); + // TODO TODO TODO bool is_abs = ((tsx_fn.size() >= 1 && tsx_fn[0] == '/') || (tsx_fn.size() >= 2 && tsx_fn[1] == ':')); if (!is_abs) { @@ -96,38 +107,38 @@ void MTXReader::procCoarseTags() tsx_reader.read(tsx_fn); spec.coarse_tags = Optional( tsx_reader.getTaggerData()); - stepPastSelfClosingTag("coarse-tags"); + stepPastSelfClosingTag("coarse-tags"_u); } void MTXReader::procSetDef() { - UString name = attrib("name"); + UString name = attrib("name"_u); stepToNextTag(); size_t set_idx = spec.set_consts.size(); spec.set_consts.push_back(VMSet()); VMSet &vm_set = spec.set_consts.back(); while (type != XML_READER_TYPE_END_ELEMENT) { - if (name == "set-member") { - std::string tag = attrib("tag"); - std::string str = attrib("str"); - vm_set.insert(tag != "" ? tag : str); + if (name == "set-member"_u) { + UString tag = attrib("tag"_u); + UString str = attrib("str"_u); + vm_set.insert(tag.empty() ? str : tag); } else { - parseError("Expected set-member"); + parseError("Expected set-member"_u); } stepToNextTag(); } set_names[name] = set_idx; - assert(name == "def-set"); + assert(name == "def-set"_u); stepToNextTag(); } void MTXReader::procStrDef() { - UString name = attrib("name"); - std::string tag = attrib("tag"); - std::string str = attrib("str"); - str_names[name] = pushStrConst(tag != "" ? tag : str); - stepPastSelfClosingTag("def-str"); + UString name = attrib("name"_u); + UString tag = attrib("tag"_u); + UString str = attrib("str"_u); + str_names[name] = pushStrConst(tag.empty() ? str : tag); + stepPastSelfClosingTag("def-str"_u); } void @@ -135,19 +146,19 @@ MTXReader::procDefns() { stepToNextTag(); while (type != XML_READER_TYPE_END_ELEMENT) { - if (name == "def-set") { + if (name == "def-set"_u) { procSetDef(); - } else if (name == "def-str") { + } else if (name == "def-str"_u) { procStrDef(); - } else if (name == "def-macro") { + } else if (name == "def-macro"_u) { procDefMacro(); - } else if (name == "#text" || name == "#comment") { + } else if (name == "#text"_u || name == "#comment"_u) { // skip } else { unexpectedTag(); } } - assert(name == "defns"); + assert(name == "defns"_u); stepToNextTag(); } @@ -157,7 +168,7 @@ MTXReader::procGlobalPred() cur_feat = &spec.global_pred; stepToNextTag(); procBoolExpr(); - assert(name == "global-pred" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "global-pred"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); } @@ -202,50 +213,50 @@ MTXReader::procIntExpr(bool allow_fail) /* Self-closing tags */ if (!tryProcArg(INTEXPR, true) && !tryProcVar(VM::INTVAL)) { - if (name == "sentlen") { + if (name == "sentlen"_u) { emitOpcode(VM::SENTLENTOK); - stepPastSelfClosingTag("sentlen"); - } else if (name == "pathlen") { + stepPastSelfClosingTag("sentlen"_u); + } else if (name == "pathlen"_u) { emitOpcode(VM::SENTLENWRD); - stepPastSelfClosingTag("pathlen"); - } else if (name == "tokaddr") { + stepPastSelfClosingTag("pathlen"_u); + } else if (name == "tokaddr"_u) { emitOpcode(VM::PUSHTOKADDR); - stepPastSelfClosingTag("tokaddr"); - } else if (name == "wrdidx") { + stepPastSelfClosingTag("tokaddr"_u); + } else if (name == "wrdidx"_u) { emitOpcode(VM::PUSHWRDADDR); - stepPastSelfClosingTag("wrdidx"); - } else if (name == "int") { + stepPastSelfClosingTag("wrdidx"_u); + } else if (name == "int"_u) { emitOpcode(VM::PUSHINT); getAndEmitInt(); - stepPastSelfClosingTag("int"); + stepPastSelfClosingTag("int"_u); /* Other tags */ - } else if (name == "add") { + } else if (name == "add"_u) { stepToNextTag(); procIntExpr(); procIntExpr(); - assert(name == "add" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "add"_u && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::ADD); stepToNextTag(); - } else if (name == "toklen") { + } else if (name == "toklen"_u) { procIntExpr(); - assert(name == "toklen" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "toklen"_u && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::TOKLENWRD); stepToNextTag(); - } else if (name == "strlen") { + } else if (name == "strlen"_u) { procStrExpr(); - assert(name == "strlen" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "strlen"_u && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::STRLEN); stepToNextTag(); - } else if (name == "arrlen") { + } else if (name == "arrlen"_u) { procStrArrExpr(); - assert(name == "arrlen" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "arrlen"_u && type == XML_READER_TYPE_END_ELEMENT); procBinCompareOp(VM::ARRLEN); stepToNextTag(); } else { if (allow_fail) { return false; } - parseError("Expected an integer expression."); + parseError("Expected an integer expression."_u); } } return true; @@ -258,22 +269,22 @@ MTXReader::procStrArrExpr(bool allow_fail) if (!tryProcArg(STRARREXPR, true) && !tryProcVar(VM::STRARRVAL) && !tryProcSlice(&MTXReader::procStrArrExpr)) { - if (name == "ex-tags") { + if (name == "ex-tags"_u) { stepToNextTag(); procWordoidExpr(); assert(type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::EXTAGS); - } else if (name == "ex-ambgset") { + } else if (name == "ex-ambgset"_u) { stepToNextTag(); procIntExpr(); emitOpcode(VM::EXAMBGSET); - } else if (name == "for-each") { + } else if (name == "for-each"_u) { procForEach(STREXPR); } else { if (allow_fail) { return false; } - parseError("Expected a string list expression."); + parseError("Expected a string list expression."_u); } stepToNextTag(); } @@ -282,13 +293,13 @@ MTXReader::procStrArrExpr(bool allow_fail) bool MTXReader::tryProcSubscript(bool (MTXReader::*proc_inner)(bool)) { - if (name == "subscript") { - int idx = getInt("idx"); + if (name == "subscript"_u) { + int idx = getInt("idx"_u); stepToNextTag(); (this->*proc_inner)(false); emitOpcode(VM::SUBSCRIPT); emitUInt(idx); - assert(name == "subscript" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "subscript"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); return true; } @@ -297,24 +308,24 @@ bool MTXReader::tryProcSubscript(bool (MTXReader::*proc_inner)(bool)) bool MTXReader::tryProcSlice(bool (MTXReader::*proc_inner)(bool)) { - if (name == "slice") { + if (name == "slice"_u) { stepToNextTag(); (this->*proc_inner)(false); bool exists; emitOpcode(VM::SLICE); - int start_lit = getInt("start", exists); + int start_lit = getInt("start"_u, exists); if (exists) { emitInt(start_lit); } else { emitInt(0); } - int end_lit = getInt("end", exists); + int end_lit = getInt("end"_u, exists); if (exists) { emitInt(end_lit); } else { emitInt(0); } - assert(name == "slice" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "slice"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); return true; } @@ -323,17 +334,17 @@ bool MTXReader::tryProcSlice(bool (MTXReader::*proc_inner)(bool)) bool MTXReader::tryProcArg(ExprType expr_type, bool allow_fail) { - if (name == "var") { - UString var_name = attrib("name"); + if (name == "var"_u) { + UString var_name = attrib("name"_u); if (in_global_defn) { VarNVMap::const_iterator arg_name_it = template_arg_names.find(var_name); if (arg_name_it != template_arg_names.end()) { cur_replacements->push_back(make_pair(arg_name_it->second, expr_type)); - stepPastSelfClosingTag("var"); + stepPastSelfClosingTag("var"_u); return true; } if (!allow_fail) { - parseError("No such argument " + var_name); + parseError("No such argument "_u + var_name); } } } @@ -342,31 +353,31 @@ bool MTXReader::tryProcArg(ExprType expr_type, bool allow_fail) bool MTXReader::tryProcVar(VM::StackValueType svt) { - if (name == "var") { - UString var_name = attrib("name"); + if (name == "var"_u) { + UString var_name = attrib("name"_u); VarNVMap::const_iterator slot_names_it = slot_names.find(var_name); if (slot_names_it != slot_names.end()) { if (slot_types[slot_names_it->second] != svt) { - parseError("Variable " + var_name + " has the wrong type"); + parseError("Variable "_u + var_name + " has the wrong type"_u); } emitOpcode(VM::GETVAR); emitUInt(slot_names_it->second); - stepPastSelfClosingTag("var"); + stepPastSelfClosingTag("var"_u); return true; } - parseError("Variable " + var_name + " has not been set."); - } else if (!in_global_defn && name == "macro") { + parseError("Variable "_u + var_name + " has not been set."_u); + } else if (!in_global_defn && name == "macro"_u) { // Get template data - UString var_name = attrib("name"); + UString var_name = attrib("name"_u); VarNVMap::const_iterator template_name_it = template_slot_names.find(var_name); if (template_name_it == template_slot_names.end()) { - parseError("No such macro " + var_name); + parseError("No such macro "_u + var_name); } size_t templ_idx = template_name_it->second; if (template_slot_types[templ_idx] != svt) { - parseError("Macro " + var_name + " returns the wrong type"); + parseError("Macro "_u + var_name + " returns the wrong type"_u); } std::pair &templ_defn = template_defns[templ_idx]; // Get arg values @@ -417,7 +428,7 @@ bool MTXReader::tryProcVar(VM::StackValueType svt) emitOpcode(VM::GETGVAR); emitUInt(templ_instcia_it->second); // Step past end - assert(name == "macro" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "macro"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); return true; } @@ -431,19 +442,19 @@ MTXReader::procStrExpr(bool allow_fail) && !tryProcVar(VM::STRVAL) && !tryProcSlice(&MTXReader::procStrExpr) && !tryProcSubscript(&MTXReader::procStrArrExpr)) { - if (name == "ex-surf") { + if (name == "ex-surf"_u) { stepToNextTag(); procIntExpr(); emitOpcode(VM::EXTOKSURF); - } else if (name == "ex-lemma") { + } else if (name == "ex-lemma"_u) { stepToNextTag(); procWordoidExpr(); emitOpcode(VM::EXWRDLEMMA); - } else if (name == "ex-coarse") { + } else if (name == "ex-coarse"_u) { stepToNextTag(); procWordoidExpr(); emitOpcode(VM::EXWRDCOARSETAG); - } else if (name == "join") { + } else if (name == "join"_u) { bool has_attr; size_t str_idx = getStrRef(has_attr); if (!has_attr) { @@ -457,7 +468,7 @@ MTXReader::procStrExpr(bool allow_fail) if (allow_fail) { return false; } - parseError("Expected a string expression."); + parseError("Expected a string expression."_u); } assert(type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); @@ -470,95 +481,95 @@ MTXReader::procBoolExpr(bool allow_fail) { if (!tryProcArg(BEXPR, true) && !tryProcVar(VM::BVAL)) { - if (name == "and") { + if (name == "and"_u) { stepToNextTag(); procCommBoolOp(VM::AND); - assert(name == "and" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "and"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "or") { + } else if (name == "or"_u) { stepToNextTag(); procCommBoolOp(VM::OR); - assert(name == "or" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "or"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "not") { + } else if (name == "not"_u) { stepToNextTag(); procBoolExpr(); emitOpcode(VM::NOT); - assert(name == "not" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "not"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "eq") { + } else if (name == "eq"_u) { stepToNextTag(); procBinCompareOp(VM::EQ); - assert(name == "eq" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "eq"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "neq") { + } else if (name == "neq"_u) { stepToNextTag(); procBinCompareOp(VM::NEQ); - assert(name == "neq" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "neq"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "lt") { + } else if (name == "lt"_u) { stepToNextTag(); procBinCompareOp(VM::LT); - assert(name == "lt" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "lt"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "lte") { + } else if (name == "lte"_u) { stepToNextTag(); procBinCompareOp(VM::LTE); - assert(name == "lte" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "lte"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "gt") { + } else if (name == "gt"_u) { stepToNextTag(); procBinCompareOp(VM::GT); - assert(name == "gt" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "gt"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "gte") { + } else if (name == "gte"_u) { stepToNextTag(); procBinCompareOp(VM::GTE); - assert(name == "gte" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "gte"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "streq") { + } else if (name == "streq"_u) { size_t str_ref = getStrRef(); stepToNextTag(); procStrExpr(); emitOpcode(VM::STREQ); emitUInt(str_ref); - assert(name == "streq" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "streq"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "strin") { + } else if (name == "strin"_u) { size_t set_ref = getSetRef(); stepToNextTag(); procStrExpr(); emitOpcode(VM::STRIN); emitUInt(set_ref); - assert(name == "strin" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "strin"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); /* Identical to strin? - } else if (name == "sethas") { + } else if (name == "sethas"_u) { stepToNextTag(); procStrExpr(); emitSetImmOp(VM::SETHAS); */ - } else if (name == "sethasany") { + } else if (name == "sethasany"_u) { size_t set_ref = getSetRef(); stepToNextTag(); procStrArrExpr(); emitOpcode(VM::SETHASANY); emitUInt(set_ref); - assert(name == "sethasany" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "sethasany"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "sethasall") { + } else if (name == "sethasall"_u) { size_t set_ref = getSetRef(); stepToNextTag(); procStrArrExpr(); emitOpcode(VM::SETHASALL); emitUInt(set_ref); - assert(name == "sethasall" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "sethasall"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); } else { if (allow_fail) { return false; } - parseError("Expected a boolean expression."); + parseError("Expected a boolean expression."_u); } } return true; @@ -570,37 +581,37 @@ MTXReader::procAddrExpr() stepToTag(); /* Self-closing tags */ if (!tryProcArg(ADDREXPR)) { - if (name == "wrdaddr") { + if (name == "wrdaddr"_u) { emitOpcode(VM::PUSHADDR); - stepPastSelfClosingTag("wrdaddr"); + stepPastSelfClosingTag("wrdaddr"_u); /* Others */ - } else if (name == "addr-of-ints") { + } else if (name == "addr-of-ints"_u) { stepToNextTag(); procIntExpr(); procIntExpr(); - assert(name == "addr-of-ints" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "addr-of-ints"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); - } else if (name == "add") { + } else if (name == "add"_u) { stepToNextTag(); procAddrExpr(); procAddrExpr(); - assert(name == "add" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "add"_u && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::ADD2); stepToNextTag(); - } else if (name == "adjust") { + } else if (name == "adjust"_u) { stepToNextTag(); procAddrExpr(); - assert(name == "adjust" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "adjust"_u && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::ADJADDR); stepToNextTag(); - } else if (name == "clamp") { + } else if (name == "clamp"_u) { stepToNextTag(); procAddrExpr(); - assert(name == "clamp" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "clamp"_u && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::CLAMPADDR); stepToNextTag(); } else { - parseError("Expected an address expression."); + parseError("Expected an address expression."_u); } } } @@ -611,18 +622,18 @@ MTXReader::procWordoidArrExpr(bool allow_fail) if (!tryProcArg(WRDARREXPR, true) && !tryProcVar(VM::WRDARRVAL) && !tryProcSlice(&MTXReader::procWordoidArrExpr)) { - if (name == "ex-wordoids") { + if (name == "ex-wordoids"_u) { stepToNextTag(); procIntExpr(); emitOpcode(VM::EXWRDARR); - assert(name == "ex-wordoids" && type == XML_READER_TYPE_END_ELEMENT); - } else if (name == "for-each") { + assert(name == "ex-wordoids"_u && type == XML_READER_TYPE_END_ELEMENT); + } else if (name == "for-each"_u) { procForEach(WRDEXPR); } else { if (allow_fail) { return false; } - parseError("Expected a wordoid array expression."); + parseError("Expected a wordoid array expression."_u); } stepToNextTag(); } @@ -636,7 +647,7 @@ MTXReader::procWordoidExpr(bool allow_fail) if (!tryProcArg(WRDEXPR, true) && !tryProcVar(VM::WRDVAL) && !tryProcSubscript(&MTXReader::procWordoidArrExpr)) { - if (name == "ex-wordoid") { + if (name == "ex-wordoid"_u) { stepToNextTag(); procAddrExpr(); emitOpcode(VM::GETWRD); @@ -644,7 +655,7 @@ MTXReader::procWordoidExpr(bool allow_fail) if (allow_fail) { return false; } - parseError("Expected a wordoid expression."); + parseError("Expected a wordoid expression."_u); } assert(type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); @@ -657,7 +668,7 @@ MTXReader::procPred() { stepToNextTag(); procBoolExpr(); - assert(name == "pred" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "pred"_u && type == XML_READER_TYPE_END_ELEMENT); emitOpcode(VM::DIEIFFALSE); stepToNextTag(); } @@ -665,10 +676,10 @@ MTXReader::procPred() size_t MTXReader::getConstRef( const UString &ref_attr, - const std::string &lit_attr, + const UString &lit_attr, const UString &what, VarNVMap &const_map, - size_t (MTXReader::*push_new)(std::string&), + size_t (MTXReader::*push_new)(UString&), bool& exists) { UString const_name = attrib(ref_attr); @@ -676,11 +687,11 @@ MTXReader::getConstRef( exists = true; VarNVMap::iterator sit = const_map.find(const_name); if (sit == const_map.end()) { - parseError("No " + what + " named " + const_name); + parseError("No "_u + what + " named "_u + const_name); } return sit->second; } - std::string const_lit = attrib(lit_attr); + UString const_lit = attrib(lit_attr); if (!const_lit.empty()) { exists = true; return (this->*push_new)(const_lit); @@ -692,7 +703,7 @@ MTXReader::getConstRef( size_t MTXReader::getSetRef(bool& exists) { - return getConstRef("name", "val", "set", set_names, &MTXReader::pushSetConst, exists); + return getConstRef("name"_u, "val"_u, "set"_u, set_names, &MTXReader::pushSetConst, exists); } size_t @@ -701,7 +712,7 @@ MTXReader::getSetRef() bool has_attr; size_t set_ref = getSetRef(has_attr); if (!has_attr) { - parseError("Set required"); + parseError("Set required"_u); } return set_ref; } @@ -709,7 +720,7 @@ MTXReader::getSetRef() size_t MTXReader::getStrRef(bool& exists) { - return getConstRef("name", "val", "string", str_names, &MTXReader::pushStrConst, exists); + return getConstRef("name"_u, "val"_u, "string"_u, str_names, &MTXReader::pushStrConst, exists); } size_t @@ -718,19 +729,19 @@ MTXReader::getStrRef() bool has_attr; size_t str_ref = getStrRef(has_attr); if (!has_attr) { - parseError("String required"); + parseError("String required"_u); } return str_ref; } int -MTXReader::getInt(std::string attr_name, bool& exists) +MTXReader::getInt(UString attr_name, bool& exists) { - std::string int_lit = attrib(attr_name); + UString int_lit = attrib(attr_name); if (!int_lit.empty()) { exists = true; int int_out; - stringstream int_ss(int_lit); + ustringstream int_ss(int_lit); int_ss >> int_out; return int_out; } @@ -741,16 +752,16 @@ MTXReader::getInt(std::string attr_name, bool& exists) int MTXReader::getInt(bool& exists) { - return getInt("val", exists); + return getInt("val"_u, exists); } int -MTXReader::getInt(std::string attr_name) +MTXReader::getInt(UString attr_name) { bool has_attr; int i = getInt(attr_name, has_attr); if (!has_attr) { - parseError("String required"); + parseError("String required"_u); } return i; } @@ -758,7 +769,7 @@ MTXReader::getInt(std::string attr_name) int MTXReader::getInt() { - return getInt("val"); + return getInt("val"_u); } template @@ -769,7 +780,7 @@ MTXReader::emitAttr( bool has_attr = false; GetT val = (this->*getter)(has_attr); if (!has_attr) { - parseError(what + " required"); + parseError(what + " required"_u); } (this->*emitter)(val); } @@ -777,19 +788,19 @@ MTXReader::emitAttr( void MTXReader::getAndEmitStrRef() { - emitAttr("String", &MTXReader::getStrRef, &MTXReader::emitUInt); + emitAttr("String"_u, &MTXReader::getStrRef, &MTXReader::emitUInt); } void MTXReader::getAndEmitSetRef() { - emitAttr("Set", &MTXReader::getSetRef, &MTXReader::emitUInt); + emitAttr("Set"_u, &MTXReader::getSetRef, &MTXReader::emitUInt); } void MTXReader::getAndEmitInt() { - emitAttr("Integer", &MTXReader::getInt, &MTXReader::emitInt); + emitAttr("Integer"_u, &MTXReader::getInt, &MTXReader::emitInt); } void @@ -797,7 +808,7 @@ MTXReader::procInst() { // XXX: There's no way to tell the difference between an empty and absent // attribute with the current lttoolbox xml code - std::string op = attrib("opcode"); + UString op = attrib("opcode"_u); std::transform(op.begin(), op.end(), op.begin(), ::toupper); emitOpcode(VM::opcode_values[op]); int val; @@ -809,7 +820,7 @@ MTXReader::procInst() val = getInt(has_int_lit); int num_operands = has_set_ref + has_str_ref + has_int_lit; if (num_operands > 1) { - parseError("Opcodes can have at most one operand."); + parseError("Opcodes can have at most one operand."_u); } else if (num_operands == 1) { if (has_int_lit) { emitInt(val); @@ -837,10 +848,10 @@ MTXReader::procOut() has_expr = true; } if (!has_expr) { - parseError("Expected a string, bool or int expression."); + parseError("Expected a string, bool or int expression."_u); } stepToTag(); - assert(name == "out" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "out"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); } @@ -850,7 +861,7 @@ MTXReader::procOutMany() stepToNextTag(); procStrArrExpr(); emitOpcode(VM::FCATSTRARR); - assert(name == "out-many" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "out-many"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); } @@ -859,12 +870,12 @@ MTXReader::printTmplDefn(const TemplateDefn &tmpl_defn) { PerceptronSpec::printFeature(std::cerr, tmpl_defn.first); if (tmpl_defn.second.size() > 0) { - std::cerr << "Replacements:\n"; + std::cerr << "Replacements:\n"_u; TemplateReplacements::const_iterator it = tmpl_defn.second.begin(); for (; it != tmpl_defn.second.end(); it++) { - std::cerr << "Index: " << it->first << " "; + std::cerr << "Index: "_u << it->first << " "_u; printTypeExpr(it->second); - std::cerr << "\n"; + std::cerr << "\n"_u; } } } @@ -874,22 +885,22 @@ MTXReader::printStackValueType(VM::StackValueType svt) { switch (svt) { case VM::INTVAL: - std::cerr << "INT"; + std::cerr << "INT"_u; break; case VM::BVAL: - std::cerr << "BOOL"; + std::cerr << "BOOL"_u; break; case VM::STRVAL: - std::cerr << "STR"; + std::cerr << "STR"_u; break; case VM::STRARRVAL: - std::cerr << "STRARR"; + std::cerr << "STRARR"_u; break; case VM::WRDVAL: - std::cerr << "WRD"; + std::cerr << "WRD"_u; break; case VM::WRDARRVAL: - std::cerr << "WRDARR"; + std::cerr << "WRDARR"_u; break; default: throw 1; @@ -901,29 +912,29 @@ MTXReader::printTypeExpr(ExprType expr_type) { switch (expr_type) { case VOIDEXPR: - std::cerr << "VOID"; + std::cerr << "VOID"_u; break; case INTEXPR: - std::cerr << "INT"; + std::cerr << "INT"_u; break; case BEXPR: - std::cerr << "BOOL"; + std::cerr << "BOOL"_u; break; case STREXPR: - std::cerr << "STR"; + std::cerr << "STR"_u; procStrExpr(); break; case STRARREXPR: - std::cerr << "STRARR"; + std::cerr << "STRARR"_u; break; case WRDEXPR: - std::cerr << "WRD"; + std::cerr << "WRD"_u; break; case WRDARREXPR: - std::cerr << "WRDARR"; + std::cerr << "WRDARR"_u; break; case ADDREXPR: - std::cerr << "ADDR"; + std::cerr << "ADDR"_u; break; default: throw 1; @@ -966,9 +977,9 @@ MTXReader::procTypeExpr(ExprType expr_type) void MTXReader::procForEach(ExprType expr_type) { - UString var_name = attrib("as"); - if (var_name == "") { - parseError("'as' attribute required for for-each."); + UString var_name = attrib("as"_u); + if (var_name == ""_u) { + parseError("'as' attribute required for for-each."_u); } size_t slot_idx = slot_counter++; slot_names[var_name] = slot_idx; @@ -983,7 +994,7 @@ MTXReader::procForEach(ExprType expr_type) has_expr = true; } if (!has_expr) { - parseError("Expected a string array or wordoid array expression."); + parseError("Expected a string array or wordoid array expression."_u); } emitOpcode(VM::FOREACHINIT); @@ -1021,21 +1032,21 @@ bool MTXReader::procVoidExpr(bool allow_fail) { stepToTag(); - if (name == "pred") { + if (name == "pred"_u) { procPred(); - } else if (name == "out") { + } else if (name == "out"_u) { procOut(); - } else if (name == "out-many") { + } else if (name == "out-many"_u) { procOutMany(); - } else if (name == "for-each") { + } else if (name == "for-each"_u) { procForEach(VOIDEXPR); - } else if (name == "inst") { + } else if (name == "inst"_u) { procInst(); } else { if (allow_fail) { return false; } - parseError("Expected a void expression."); + parseError("Expected a void expression."_u); } return true; } @@ -1049,20 +1060,20 @@ MTXReader::procDefMacro() cur_feat = &template_defns.back().first; cur_replacements = &template_defns.back().second; - UString var_name = attrib("as"); - if (var_name == "") { - parseError("'as' attribute required for def-macro."); + UString var_name = attrib("as"_u); + if (var_name == ""_u) { + parseError("'as' attribute required for def-macro."_u); } template_slot_names[var_name] = template_slot_counter; template_arg_names.clear(); - UString args = attrib("args"); - std::wistringstream args_ss(args); + UString args = attrib("args"_u); + uistringstream args_ss(args); size_t arg_i = 0; for (; !args_ss.eof(); arg_i++) { UString arg_name; args_ss >> arg_name; - if (arg_name == "") { + if (arg_name == ""_u) { break; } template_arg_names[arg_name] = arg_i; @@ -1095,9 +1106,9 @@ MTXReader::procDefMacro() has_expr = true; } if (!has_expr) { - parseError("Expected a non-void expression."); + parseError("Expected a non-void expression."_u); } - assert(name == "def-macro" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "def-macro"_u && type == XML_READER_TYPE_END_ELEMENT); stepToNextTag(); template_slot_counter++; @@ -1114,7 +1125,7 @@ MTXReader::procFeat() while (type != XML_READER_TYPE_END_ELEMENT) { procVoidExpr(); } - assert(name == "feat"); + assert(name == "feat"_u); stepToNextTag(); } @@ -1123,13 +1134,13 @@ MTXReader::procFeats() { stepToNextTag(); while (type != XML_READER_TYPE_END_ELEMENT) { - if (name == "feat") { + if (name == "feat"_u) { procFeat(); } else { unexpectedTag(); } } - assert(name == "feats"); + assert(name == "feats"_u); stepToNextTag(); } @@ -1138,7 +1149,7 @@ MTXReader::printTmplDefns() { std::vector::const_iterator it = template_defns.begin(); for (; it != template_defns.end(); it++) { - std::cerr << " Macro " << it - template_defns.begin() << "\n"; + std::cerr << " Macro "_u << it - template_defns.begin() << "\n"_u; printTmplDefn(*it); } } @@ -1151,30 +1162,30 @@ MTXReader::parse() if (type == XML_READER_TYPE_DOCUMENT_TYPE) { stepToNextTag(); } - if (name != "metatag") { - parseError("expected tag"); + if (name != "metatag"_u) { + parseError("expected tag"_u); } stepToNextTag(); - if (name == "coarse-tags") { + if (name == "coarse-tags"_u) { procCoarseTags(); } - if (name == "beam-width") { + if (name == "beam-width"_u) { size_t val; - std::istringstream val_ss(attrib("val")); + uistringstream val_ss(attrib("val"_u)); val_ss >> val; spec.beam_width = val; } else { spec.beam_width = 4; } - if (name == "defns") { + if (name == "defns"_u) { procDefns(); } - if (name == "global-pred") { + if (name == "global-pred"_u) { procGlobalPred(); } - if (name == "feats") { + if (name == "feats"_u) { procFeats(); } - assert(name == "metatag" && type == XML_READER_TYPE_END_ELEMENT); + assert(name == "metatag"_u && type == XML_READER_TYPE_END_ELEMENT); } } diff --git a/apertium/mtx_reader.h b/apertium/mtx_reader.h index 3474697..676e680 100644 --- a/apertium/mtx_reader.h +++ b/apertium/mtx_reader.h @@ -57,11 +57,11 @@ protected: virtual void parse(); private: - size_t pushSetConst(std::string &val); - size_t pushStrConst(std::string &val); - size_t getConstRef(const UString &ref_attr, const std::string &lit_attr, + size_t pushSetConst(UString &val); + size_t pushStrConst(UString &val); + size_t getConstRef(const UString &ref_attr, const UString &lit_attr, const UString &what, VarNVMap &const_map, - size_t (MTXReader::*push_new)(std::string&), bool& exists); + size_t (MTXReader::*push_new)(UString&), bool& exists); size_t getSetRef(bool& exists); size_t getSetRef(); size_t getStrRef(bool& exists); @@ -71,9 +71,9 @@ private: void pokeBytecode(size_t addr, VM::Bytecode bc); void emitInt(int val); void emitUInt(int val); - int getInt(std::string attr_name, bool& exists); + int getInt(UString attr_name, bool& exists); int getInt(bool& exists); - int getInt(std::string attr_name); + int getInt(UString attr_name); int getInt(); void procCoarseTags(); diff --git a/apertium/perceptron_spec.cc b/apertium/perceptron_spec.cc index 54e8268..b834493 100644 --- a/apertium/perceptron_spec.cc +++ b/apertium/perceptron_spec.cc @@ -1,5 +1,4 @@ #include -#include #include #include #include @@ -8,10 +7,10 @@ namespace Apertium { -void PerceptronSpec::printFeature(std::wostream &out, const PerceptronSpec::FeatureDefn &feat_defn) +void PerceptronSpec::printFeature(std::ostream &out, const PerceptronSpec::FeatureDefn &feat_defn) { ios::fmtflags orig_flags(out.flags()); - out << std::hex << std::setw(2) << std::setfill(L'0'); + out << std::hex << std::setw(2) << std::setfill('0'); for (size_t j = 0; j < feat_defn.size(); j++) { out << +feat_defn[j] << " "; } @@ -27,8 +26,8 @@ void PerceptronSpec::printFeature(std::wostream &out, const PerceptronSpec::Feat out << "\n"; } -std::wostream & -operator<<(std::wostream &out, PerceptronSpec const &ps) { +std::ostream & +operator<<(std::ostream &out, PerceptronSpec const &ps) { out << "= Global predicate =\n"; PerceptronSpec::printFeature(out, ps.global_pred); out << "= Globals (" << ps.global_defns.size() << ") =\n"; @@ -44,14 +43,15 @@ operator<<(std::wostream &out, PerceptronSpec const &ps) { return out; } -#define X(a) #a, -const std::string PerceptronSpec::opcode_names[] = { +#define X(a) to_ustring(#a), +const UString PerceptronSpec::opcode_names[] = { OPCODES }; #undef X -const std::string PerceptronSpec::type_names[] = { - "integer", "boolean", "string", "string array", "wordoid", "wordoid array" +const UString PerceptronSpec::type_names[] = { + "integer"_u, "boolean"_u, "string"_u, "string array"_u, + "wordoid"_u, "wordoid array"_u }; static Morpheme make_sentinel_wordoid( @@ -92,9 +92,9 @@ PerceptronSpec::PerceptronSpec() { opcode_values[opcode_names[i]] = (Opcode)i; } - untagged_sentinel = make_sentinel_wordoids("!UNTAGGED!", "!UT!"); - token_wordoids_underflow = make_sentinel_token("!SURF_UNDERFLOW!", "!TOK_UNDERFLOW!", "!TUF!"); - token_wordoids_overflow = make_sentinel_token("!SURF_OVERFLOW!", "!TOK_OVERFLOW!", "!TOF!"); + untagged_sentinel = make_sentinel_wordoids("!UNTAGGED!"_u, "!UT!"_u); + token_wordoids_underflow = make_sentinel_token("!SURF_UNDERFLOW!"_u, "!TOK_UNDERFLOW!"_u, "!TUF!"_u); + token_wordoids_overflow = make_sentinel_token("!SURF_OVERFLOW!"_u, "!TOK_OVERFLOW!"_u, "!TOF!"_u); static_constructed = true; } @@ -102,7 +102,7 @@ PerceptronSpec::PerceptronSpec() { unsigned char PerceptronSpec::num_opcodes; bool PerceptronSpec::static_constructed = false; -std::map +std::map PerceptronSpec::opcode_values; std::vector PerceptronSpec::untagged_sentinel; LexicalUnit PerceptronSpec::token_wordoids_underflow; @@ -141,7 +141,7 @@ PerceptronSpec::get_features( feat_vec_delta.clear(); feat_vec_delta.push_back(FeatureKey()); FeatureKey &fk = feat_vec_delta.back(); - std::string prg_id; + UString prg_id; prg_id = i; fk.push_back(prg_id); // Each feature is tagged with the which created it to avoid collisions Machine machine( @@ -153,12 +153,12 @@ PerceptronSpec::get_features( } } -std::string +UString PerceptronSpec::coarsen(const Morpheme &wrd) const { - std::map::const_iterator it = coarsen_cache.find(wrd); + std::map::const_iterator it = coarsen_cache.find(wrd); if (it == coarsen_cache.end()) { - std::string coarse_tag = UtfConverter::toUtf8(coarse_tags->coarsen(wrd)); + UString coarse_tag = coarse_tags->coarsen(wrd); coarsen_cache[wrd] = coarse_tag; return coarse_tag; } @@ -170,9 +170,9 @@ void PerceptronSpec::clearCache() const coarsen_cache.clear(); } -std::string PerceptronSpec::dot = "."; +UString PerceptronSpec::dot = "."_u; -const std::string& +const UString& PerceptronSpec::Machine::get_str_operand() { size_t idx = *(++bytecode_iter); if (idx == 255) { @@ -254,11 +254,6 @@ PerceptronSpec::Machine::Machine( token_idx(token_idx), wordoid_idx(wordoid_idx) {} -static bool -inRange(int lower, int upper, int x) { - return lower <= x && x < upper; -} - static int clamp(int lower, int upper, int x) { return std::min(std::max(x, lower), upper); @@ -394,7 +389,7 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) loop_state.accumulator = StackValue(std::vector()); //std::cerr << "Wordoid array size " << loop_state.iterable.size() << "\n"; } else if (stack.top().type == STRVAL) { - loop_state.accumulator = StackValue(std::vector()); + loop_state.accumulator = StackValue(std::vector()); //std::cerr << "String array size " << loop_state.iterable.size() << "\n"; } else { throw 1; @@ -482,26 +477,26 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) } break; case EXTOKSURF: { UString surf = get_token(untagged).TheSurfaceForm; - stack.push(new std::string(UtfConverter::toUtf8(surf))); + stack.push(surf); } break; case EXWRDLEMMA: { UString lemma = stack.pop_off().wrd().TheLemma; - stack.push(new std::string(UtfConverter::toUtf8(lemma))); + stack.push(lemma); } break; case EXWRDCOARSETAG: { assert(spec.coarse_tags); Morpheme &wrd = stack.top().wrd(); - std::string coarse_tag = spec.coarsen(wrd); + UString coarse_tag = spec.coarsen(wrd); stack.pop(); stack.push(coarse_tag); } break; case EXAMBGSET: { assert(spec.coarse_tags); - std::vector ambgset; + std::vector ambgset; const std::vector &analyses = get_token(untagged).TheAnalyses; std::vector::const_iterator analy_it; for (analy_it = analyses.begin(); analy_it != analyses.end(); analy_it++) { - ambgset.push_back(std::string()); + ambgset.push_back(UString()); const std::vector &wrds = analy_it->TheMorphemes; std::vector::const_iterator wrd_it = wrds.begin(); while (true) { @@ -510,7 +505,7 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) if (wrd_it == wrds.end()) { break; } else { - ambgset.back() += "+"; + ambgset.back() += '+'; } } } @@ -524,7 +519,7 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) std::cerr << &(*it) << " " << it->TheTag << ", "; } std::cerr << "\n";*/ - std::vector *tags_str = new std::vector; + std::vector *tags_str = new std::vector; tags_str->resize(tags.size()); transform(tags.begin(), tags.end(), tags_str->begin(), get_tag); stack.pop(); @@ -536,7 +531,7 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) case SENTLENTAGGEDTOK: stack.push((int)tagged.size()); break; - case SENTLENWRD: unimplemented_opcode("SENTLENWRD"); break; // How can we know? + case SENTLENWRD: unimplemented_opcode("SENTLENWRD"_u); break; // How can we know? case TOKLENWRD: { int target_token_idx = stack.pop_off().intVal(); assert(0 <= target_token_idx && (size_t)target_token_idx < tagged.size()); @@ -573,20 +568,20 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) } break; case FILTERIN: { const VMSet& set_op = get_set_operand(); - std::vector &str_arr = stack.top().strArr(); + std::vector &str_arr = stack.top().strArr(); str_arr.erase(std::remove_if( str_arr.begin(), str_arr.end(), std::not1(In(set_op)))); } break; /* case SETHAS: { const VMSet& set_op = get_set_operand(); - std::string str = stack.pop_off().str(); + UString str = stack.pop_off().str(); stack.push(set_op.find(str) != set_op.end()); } break; */ case SETHASANY: { const VMSet& set_op = get_set_operand(); - std::vector str_arr = stack.pop_off().strArr(); + std::vector str_arr = stack.pop_off().strArr(); stack.push( std::find_if(str_arr.begin(), str_arr.end(), In(set_op)) != str_arr.end() @@ -594,22 +589,22 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) } break; case SETHASALL: { const VMSet& set_op = get_set_operand(); - std::vector str_arr = stack.pop_off().strArr(); + std::vector str_arr = stack.pop_off().strArr(); stack.push( std::find_if(str_arr.begin(), str_arr.end(), std::not1(In(set_op))) == str_arr.end() ); } break; case HASSUBSTR: { - std::string haystack = stack.pop_off().str(); - std::string needle = get_str_operand(); - stack.push(haystack.find(needle) != std::string::npos); + UString haystack = stack.pop_off().str(); + UString needle = get_str_operand(); + stack.push(haystack.find(needle) != UString::npos); } break; - case HASANYSUBSTR: unimplemented_opcode("HASANYSUBSTR"); break; - case CPYSTR: unimplemented_opcode("CPYSTR"); break; + case HASANYSUBSTR: unimplemented_opcode("HASANYSUBSTR"_u); break; + case CPYSTR: unimplemented_opcode("CPYSTR"_u); break; case LOWER: { // XXX: Eek! Bad! No Unicode. ICU please. - std::string &str = stack.top().str(); + UString &str = stack.top().str(); std::transform(str.begin(), str.end(), str.begin(), ::tolower); } break; case SLICE: { @@ -632,7 +627,7 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) } } break; case STRLEN: { - std::string str = stack.pop_off().str(); + UString str = stack.pop_off().str(); stack.push((int)str.length()); } break; case ARRLEN: { @@ -640,17 +635,16 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) stack.push(str_arr_len); } break; case JOIN: { - const std::string &sep = get_str_operand(); - std::stringstream ss; - std::vector str_arr = stack.pop_off().strArr(); - std::vector::const_iterator it; - for (it = str_arr.begin(); it != str_arr.end(); it++) { - ss << *it; - if (it + 1 != str_arr.end()) { - ss << sep; + const UString &sep = get_str_operand(); + std::vector str_arr = stack.pop_off().strArr(); + UString ss; + for (auto& it : str_arr) { + if (!ss.empty()) { + ss.append(sep); } + ss.append(it); } - stack.push(StackValue(ss.str())); + stack.push(StackValue(ss)); } break; default: return false; @@ -675,14 +669,14 @@ PerceptronSpec::Machine::getFeature( } break; case FCATSTRARR: { - std::vector &str_arr = stack.top().strArr(); + std::vector &str_arr = stack.top().strArr(); if (str_arr.size() == 0) { feat_vec_out.clear(); return; } else { UnaryFeatureVec new_feat_vec; new_feat_vec.reserve(feat_vec_out.size() * str_arr.size()); - std::vector::const_iterator str_arr_it; + std::vector::const_iterator str_arr_it; for (str_arr_it = str_arr.begin(); str_arr_it != str_arr.end(); str_arr_it++) { UnaryFeatureVec::iterator append_begin_it = new_feat_vec.end(); std::copy(feat_vec_out.begin(), feat_vec_out.end(), @@ -695,20 +689,20 @@ PerceptronSpec::Machine::getFeature( stack.pop(); } break; case FCATSTR: { - std::string &str = stack.top().str(); + UString &str = stack.top().str(); appendStr(feat_vec_out, str); stack.pop(); } break; case FCATBOOL: { bool b = stack.top().boolVal(); - appendStr(feat_vec_out, b ? "t" : "f"); + appendStr(feat_vec_out, b ? "t"_u : "f"_u); stack.pop(); } break; case FCATINT: { int i = stack.top().intVal(); stringstream ss; ss << i; - appendStr(feat_vec_out, ss.str()); + appendStr(feat_vec_out, to_ustring(ss.str().c_str())); stack.pop(); } break; default: @@ -737,10 +731,10 @@ PerceptronSpec::Machine::getValue() } void -PerceptronSpec::Machine::unimplemented_opcode(std::string opstr) { +PerceptronSpec::Machine::unimplemented_opcode(UString opstr) { int bytecode_idx = bytecode_iter - feat.begin(); std::stringstream msg; - msg << "Unimplemented opcode: " << opstr + msg << "Unimplemented opcode: " //<< opstr // TODO << " at " << (is_feature ? "feature" : "global") << " #" << feat_idx << " address #" << bytecode_idx; throw Apertium::Exception::apertium_tagger::UnimplementedOpcode(msg); } @@ -748,40 +742,40 @@ PerceptronSpec::Machine::unimplemented_opcode(std::string opstr) { PerceptronSpec::In::In(const VMSet &haystack) : haystack(haystack) {}; bool -PerceptronSpec::In::operator() (const std::string &needle) const { +PerceptronSpec::In::operator() (const UString &needle) const { return haystack.find(needle) != haystack.end(); }; void PerceptronSpec::appendStr(UnaryFeatureVec &feat_vec, - const std::string &tail_str) { + const UString &tail_str) { appendStr(feat_vec.begin(), feat_vec.end(), tail_str); } void PerceptronSpec::appendStr(UnaryFeatureVec::iterator begin, UnaryFeatureVec::iterator end, - const std::string &tail_str) { + const UString &tail_str) { for (;begin != end; begin++) { begin->push_back(tail_str); } } -std::string +UString PerceptronSpec::Machine::get_tag(const Tag &in) { - return UtfConverter::toUtf8(in.TheTag); + return in.TheTag; } void PerceptronSpec::serialiseFeatDefn( std::ostream &serialised, const FeatureDefn &defn) const { - Serialiser::serialise( - std::string((char*)&(defn.front()), defn.size()), + Serialiser::serialise( + UString((UChar*)&(defn.front()), defn.size()), serialised); } void PerceptronSpec::deserialiseFeatDefn( std::istream &serialised, FeatureDefn &feat) { - std::string feat_str = Deserialiser::deserialise(serialised); + UString feat_str = Deserialiser::deserialise(serialised); feat.reserve(feat_str.size()); - std::string::iterator feat_str_it; + UString::iterator feat_str_it; for (feat_str_it = feat_str.begin(); feat_str_it != feat_str.end(); feat_str_it++) { feat.push_back(*feat_str_it); } @@ -808,7 +802,7 @@ void PerceptronSpec::deserialiseFeatDefnVec( void PerceptronSpec::serialise(std::ostream &serialised) const { Serialiser::serialise(beam_width, serialised); - Serialiser >::serialise(str_consts, serialised); + Serialiser >::serialise(str_consts, serialised); Serialiser >::serialise(set_consts, serialised); serialiseFeatDefnVec(serialised, features); serialiseFeatDefnVec(serialised, global_defns); @@ -823,7 +817,7 @@ void PerceptronSpec::serialise(std::ostream &serialised) const { void PerceptronSpec::deserialise(std::istream &serialised) { beam_width = Deserialiser::deserialise(serialised); - str_consts = Deserialiser >::deserialise(serialised); + str_consts = Deserialiser >::deserialise(serialised); set_consts = Deserialiser >::deserialise(serialised); deserialiseFeatDefnVec(serialised, features); deserialiseFeatDefnVec(serialised, global_defns); diff --git a/apertium/perceptron_spec.h b/apertium/perceptron_spec.h index 092dd01..def1e2e 100644 --- a/apertium/perceptron_spec.h +++ b/apertium/perceptron_spec.h @@ -27,13 +27,13 @@ using namespace Apertium::SentenceStream; namespace Apertium { -typedef std::set VMSet; +typedef std::set VMSet; class PerceptronSpec { public: - typedef std::vector FeatureDefn; - static void printFeature(std::wostream &out, const PerceptronSpec::FeatureDefn &feat_defn); - friend std::wostream& operator<<(std::wostream &out, PerceptronSpec const &pt); + typedef std::vector FeatureDefn; + static void printFeature(std::ostream &out, const PerceptronSpec::FeatureDefn &feat_defn); + friend std::ostream& operator<<(std::ostream &out, PerceptronSpec const &pt); PerceptronSpec(); #define OPCODES \ /** Boolean and arithmetic */\ @@ -168,9 +168,9 @@ public: #undef X static bool static_constructed; static unsigned char num_opcodes; - static const std::string opcode_names[]; - static const std::string type_names[]; - static std::map opcode_values; + static const UString opcode_names[]; + static const UString type_names[]; + static std::map opcode_values; static std::vector untagged_sentinel; static LexicalUnit token_wordoids_underflow; static LexicalUnit token_wordoids_overflow; @@ -179,7 +179,7 @@ public: }; class StackValue { public: - friend std::wostream& operator<<(std::wostream& out, StackValue const &val) { + friend std::ostream& operator<<(std::ostream& out, StackValue const &val) { switch (val.type) { case INTVAL: out << val.intVal(); @@ -192,8 +192,8 @@ public: break; case STRARRVAL: { out << "["; - std::vector &str_arr = val.strArr(); - std::vector::const_iterator it = str_arr.begin(); + std::vector &str_arr = val.strArr(); + std::vector::const_iterator it = str_arr.begin(); for (; it != str_arr.end(); it++) { out << it->c_str(); } @@ -230,11 +230,11 @@ public: type = other.type; switch (type) { case STRVAL: - payload.strval = new std::string(*other.payload.strval); + payload.strval = new UString(*other.payload.strval); break; case STRARRVAL: payload.strarrval = - new std::vector(*other.payload.strarrval); + new std::vector(*other.payload.strarrval); break; case WRDVAL: payload.wrdval = new Morpheme(*other.payload.wrdval); @@ -260,12 +260,12 @@ public: payload.bval = bval; type = BVAL; } - StackValue(const std::string &strval) { - payload.strval = new std::string(strval); + StackValue(const UString &strval) { + payload.strval = new UString(strval); type = STRVAL; } - StackValue(const std::vector &strarrval) { - payload.strarrval = new std::vector(strarrval); + StackValue(const std::vector &strarrval) { + payload.strarrval = new std::vector(strarrval); type = STRARRVAL; } StackValue(const Morpheme &wordoid) { @@ -290,11 +290,11 @@ public: payload.wrdarrval = new std::vector(wordoids); type = WRDARRVAL; } - StackValue(std::string *strval) { + StackValue(UString *strval) { payload.strval = strval; type = STRVAL; } - StackValue(std::vector *strarrval) { + StackValue(std::vector *strarrval) { payload.strarrval = strarrval; type = STRARRVAL; } @@ -331,11 +331,11 @@ public: assert(type == BVAL); return payload.bval; } - std::string& str() const { + UString& str() const { assert(type == STRVAL); return *payload.strval; } - std::vector& strArr() const { + std::vector& strArr() const { assert(type == STRARRVAL); return *payload.strarrval; } @@ -366,8 +366,8 @@ public: union StackValueUnion { int intval; bool bval; - std::string* strval; - std::vector* strarrval; + UString* strval; + std::vector* strarrval; Morpheme* wrdval; std::vector* wrdarrval; } payload; @@ -379,8 +379,8 @@ public: signed char intbyte : 8; }; Optional coarse_tags; - static std::string dot; - std::vector str_consts; + static UString dot; + std::vector str_consts; std::vector set_consts; mutable std::vector global_results; std::vector global_defns; @@ -390,10 +390,10 @@ public: const TaggedSentence &tagged, const Sentence &untagged, int token_idx, int wordoid_idx, UnaryFeatureVec &feat_vec_out) const; - std::string coarsen(const Morpheme &wrd) const; + UString coarsen(const Morpheme &wrd) const; void clearCache() const; int beam_width; - mutable std::map coarsen_cache; + mutable std::map coarsen_cache; private: class MachineStack { std::deque data; @@ -440,7 +440,7 @@ private: bool is_feature; const FeatureDefn &feat; const size_t &feat_idx; - std::vector::const_iterator bytecode_iter; + std::vector::const_iterator bytecode_iter; const TaggedSentence &tagged; const Sentence &untagged; int token_idx; @@ -454,15 +454,15 @@ private: }; std::deque loop_stack; std::vector slots; - void unimplemented_opcode(std::string opstr); + void unimplemented_opcode(UString opstr); const LexicalUnit& get_token(const Sentence &untagged); const std::vector& tagged_to_wordoids(const TaggedToken &tt); const Morpheme& get_wordoid(const TaggedSentence &tagged); const VMSet& get_set_operand(); int get_int_operand(); unsigned int get_uint_operand(); - const std::string& get_str_operand(); - static std::string get_tag(const Tag &in); + const UString& get_str_operand(); + static UString get_tag(const Tag &in); bool execCommonOp(Opcode op); public: void traceMachineState(); @@ -478,16 +478,16 @@ private: int token_idx, int wordoid_idx); }; - struct In : public std::unary_function { + struct In : public std::unary_function { const VMSet& haystack; In(const VMSet &haystack); - bool operator() (const std::string &needle) const; + bool operator() (const UString &needle) const; }; static void appendStr(UnaryFeatureVec &feat_vec, - const std::string &tail_str); + const UString &tail_str); static void appendStr(UnaryFeatureVec::iterator begin, UnaryFeatureVec::iterator end, - const std::string &tail_str); + const UString &tail_str); void serialiseFeatDefn( std::ostream &serialised, const FeatureDefn &defn) const; void deserialiseFeatDefn( diff --git a/apertium/perceptron_tagger.cc b/apertium/perceptron_tagger.cc index a086cfc..01f800a 100644 --- a/apertium/perceptron_tagger.cc +++ b/apertium/perceptron_tagger.cc @@ -12,7 +12,7 @@ PerceptronTagger::PerceptronTagger(TaggerFlags flags) : StreamTagger(flags) {}; PerceptronTagger::~PerceptronTagger() {}; -void PerceptronTagger::tag(Stream &in, std::wostream &out) { +void PerceptronTagger::tag(Stream &in, std::ostream &out) { SentenceStream::SentenceTagger::tag(in, out, TheFlags.getSentSeg()); } @@ -20,8 +20,8 @@ void PerceptronTagger::read_spec(const std::string &filename) { MTXReader(spec).read(filename); } -std::wostream & -operator<<(std::wostream &out, PerceptronTagger const &pt) { +std::ostream & +operator<<(std::ostream &out, PerceptronTagger const &pt) { out << "== Spec ==\n"; out << pt.spec; out << "== Weights " << pt.weights.size() << " ==\n"; @@ -100,7 +100,7 @@ PerceptronTagger::tagSentence(const Sentence &untagged_sent) { void PerceptronTagger::outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output) { + std::ostream &output) { StreamTagger::outputLexicalUnit(lexical_unit, analysis, output); } @@ -289,8 +289,8 @@ PerceptronTagger::extendAgendaAll( } } -std::wostream& -operator<<(std::wostream &out, const TaggedSentence &tagged) { +std::ostream& +operator<<(std::ostream &out, const TaggedSentence &tagged) { TaggedSentence::const_iterator tsi; for (tsi = tagged.begin(); tsi != tagged.end(); tsi++) { if (*tsi) { @@ -303,8 +303,8 @@ operator<<(std::wostream &out, const TaggedSentence &tagged) { return out; } -std::wostream& -operator<<(std::wostream &out, const PerceptronTagger::TrainingAgendaItem &tai) { +std::ostream& +operator<<(std::ostream &out, const PerceptronTagger::TrainingAgendaItem &tai) { out << "Score: " << tai.score << "\n"; out << "Sentence: " << tai.tagged << "\n"; out << "\n"; @@ -312,8 +312,8 @@ operator<<(std::wostream &out, const PerceptronTagger::TrainingAgendaItem &tai) return out; } -std::wostream& -operator<<(std::wostream &out, const std::vector &agenda) { +std::ostream& +operator<<(std::ostream &out, const std::vector &agenda) { std::vector::const_iterator agenda_it; for (agenda_it = agenda.begin(); agenda_it != agenda.end(); agenda_it++) { out << *agenda_it; @@ -322,15 +322,15 @@ operator<<(std::wostream &out, const std::vector &agenda) { +std::ostream& +operator<<(std::ostream &out, const std::vector &agenda) { std::vector::const_iterator agenda_it; for (agenda_it = agenda.begin(); agenda_it != agenda.end(); agenda_it++) { out << *agenda_it; diff --git a/apertium/perceptron_tagger.h b/apertium/perceptron_tagger.h index 9c7ba0f..73986d2 100644 --- a/apertium/perceptron_tagger.h +++ b/apertium/perceptron_tagger.h @@ -21,16 +21,16 @@ public: virtual void train(Stream &tagged, Stream &untagged, int iterations); // tagger virtual void deserialise(std::istream &serialised); - virtual void tag(Stream &input, std::wostream &output); + virtual void tag(Stream &input, std::ostream &output); void read_spec(const std::string &filename); - friend std::wostream& operator<<(std::wostream &out, PerceptronTagger const &pt); + friend std::ostream& operator<<(std::ostream &out, PerceptronTagger const &pt); protected: virtual TaggedSentence tagSentence(const Sentence &untagged); virtual void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output); + std::ostream &output); private: bool trainSentence( const TrainingSentence &sentence, @@ -52,18 +52,18 @@ private: }; template static void extendAgendaAll( std::vector &agenda, Optional analy); - friend std::wostream& operator<<(std::wostream &out, + friend std::ostream& operator<<(std::ostream &out, const TrainingAgendaItem &tai); - friend std::wostream& operator<<( - std::wostream &out, const std::vector &agenda); + friend std::ostream& operator<<( + std::ostream &out, const std::vector &agenda); friend bool operator<(const AgendaItem &a, const AgendaItem &b); - friend std::wostream& operator<<( - std::wostream &out, const PerceptronTagger::AgendaItem &ai); - friend std::wostream& operator<<( - std::wostream &out, const std::vector &agenda); + friend std::ostream& operator<<( + std::ostream &out, const PerceptronTagger::AgendaItem &ai); + friend std::ostream& operator<<( + std::ostream &out, const std::vector &agenda); }; -std::wostream& operator<<(std::wostream &out, const TaggedSentence &tagged); +std::ostream& operator<<(std::ostream &out, const TaggedSentence &tagged); } #endif diff --git a/apertium/postchunk.cc b/apertium/postchunk.cc index 4f05848..e066642 100644 --- a/apertium/postchunk.cc +++ b/apertium/postchunk.cc @@ -31,280 +31,40 @@ using namespace Apertium; using namespace std; -void -Postchunk::destroy() -{ - if(me) - { - delete me; - me = NULL; - } - if(doc) - { - xmlFreeDoc(doc); - doc = NULL; - } -} - Postchunk::Postchunk() : word(0), lword(0), output(0), -any_char(0), -any_tag(0), nwords(0) { - me = NULL; - doc = NULL; - root_element = NULL; lastrule = NULL; inword = false; - null_flush = false; - internal_null_flush = false; - trace = false; - in_lu = false; in_out = false; in_let_var = false; in_wblank = false; } -Postchunk::~Postchunk() -{ - destroy(); -} - -void -Postchunk::readData(FILE *in) -{ - alphabet.read(in); - any_char = alphabet(TRXReader::ANY_CHAR); - any_tag = alphabet(TRXReader::ANY_TAG); - - Transducer t; - t.read(in, alphabet.size()); - - map finals; - - // finals - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - int key = Compression::multibyte_read(in); - finals[key] = Compression::multibyte_read(in); - } - - me = new MatchExe(t, finals); - - // attr_items - bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); - attr_items[cad_k].read(in); - UString fallback = Compression::string_read(in); - if(recompile_attrs) { - attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); - } - } - - // variables - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); - variables[cad_k] = UtfConverter::toUtf8(Compression::string_read(in)); - } - - // macros - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); - macros[cad_k] = Compression::multibyte_read(in); - } - - // lists - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); - - for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) - { - UString const cad_v = Compression::string_read(in); - lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); - listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); - } - } -} - -void -Postchunk::read(string const &transferfile, string const &datafile) -{ - readPostchunk(transferfile); - - // datafile - FILE *in = fopen(datafile.c_str(), "rb"); - if(!in) - { - cerr << "Error: Could not open file '" << datafile << "'." << endl; - exit(EXIT_FAILURE); - } - readData(in); - fclose(in); - -} - -void -Postchunk::readPostchunk(string const &in) -{ - doc = xmlReadFile(in.c_str(), NULL, 0); - - if(doc == NULL) - { - cerr << "Error: Could not parse file '" << in << "'." << endl; - exit(EXIT_FAILURE); - } - - root_element = xmlDocGetRootElement(doc); - - // search for macros & rules - for(xmlNode *i = root_element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) - { - collectMacros(i); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) - { - collectRules(i); - } - } - } -} - -void -Postchunk::collectRules(xmlNode *localroot) -{ - for(xmlNode *rule = localroot->children; rule != NULL; rule = rule->next) - { - if(rule->type == XML_ELEMENT_NODE) - { - size_t line = rule->line; - for(xmlNode *rulechild = rule->children; ; rulechild = rulechild->next) - { - if(rulechild->type == XML_ELEMENT_NODE && !xmlStrcmp(rulechild->name, (const xmlChar *) "action")) - { - rule_map.push_back(rulechild); - rule_lines.push_back(line); - break; - } - } - } - } -} - -void -Postchunk::collectMacros(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - macro_map.push_back(i); - } - } -} - bool Postchunk::checkIndex(xmlNode *element, int index, int limit) { if(index > limit) // Note: Unlike transfer/interchunk, we allow index==limit! { - cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": index > limit" << endl; + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index > limit" << endl; return false; } if(index < 0) { - cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": index < 0" << endl; + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index < 0" << endl; return false; } if(word[index] == 0) { - cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": Null access at word[index]" << endl; + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": Null access at word[index]" << endl; return false; } return true; } -bool -Postchunk::gettingLemmaFromWord(string attr) -{ - return (attr.compare("lem") == 0 || attr.compare("lemh") == 0 || attr.compare("whole") == 0); -} - -string -Postchunk::combineWblanks(string wblank_current, string wblank_to_add) -{ - if(wblank_current.empty() && wblank_to_add.empty()) - { - return wblank_current; - } - else if(wblank_current.empty()) - { - return wblank_to_add; - } - else if(wblank_to_add.empty()) - { - return wblank_current; - } - - string new_out_wblank; - for(string::const_iterator it = wblank_current.begin(); it != wblank_current.end(); it++) - { - if(*it == '\\') - { - new_out_wblank += *it; - it++; - new_out_wblank += *it; - } - else if(*it == ']') - { - if(*(it+1) == ']') - { - new_out_wblank += ';'; - break; - } - } - else - { - new_out_wblank += *it; - } - } - - for(string::const_iterator it = wblank_to_add.begin(); it != wblank_to_add.end(); it++) - { - if(*it == '\\') - { - new_out_wblank += *it; - it++; - new_out_wblank += *it; - } - else if(*it == '[') - { - if(*(it+1) == '[') - { - new_out_wblank += ' '; - it++; - } - } - else - { - new_out_wblank += *it; - } - } - - return new_out_wblank; -} - -string +UString Postchunk::evalString(xmlNode *element) { map::iterator it; @@ -334,7 +94,7 @@ Postchunk::evalString(xmlNode *element) break; case ti_lu_count: - return StringUtils::itoa_string(tmpword.size()); + return StringUtils::itoa(tmpword.size()); case ti_var: if(lword > 1) @@ -351,7 +111,7 @@ Postchunk::evalString(xmlNode *element) case ti_b: if(!blank_queue.empty()) { - string retblank = blank_queue.front(); + UString retblank = blank_queue.front(); if(in_out) { blank_queue.pop(); @@ -361,7 +121,7 @@ Postchunk::evalString(xmlNode *element) } else { - return " "; + return " "_u; } break; @@ -381,21 +141,21 @@ Postchunk::evalString(xmlNode *element) break; default: - return ""; + return ""_u; } - return ""; + return ""_u; } if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL; + UString part; for(xmlAttr *i = element->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((const char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { @@ -403,27 +163,27 @@ Postchunk::evalString(xmlNode *element) } } - evalStringCache[element] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL); + evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL); } else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) { evalStringCache[element] = TransferInstr(ti_lit_tag, - tags((const char *) element->properties->children->content), 0); + tags(to_ustring((const char *) element->properties->children->content)), 0); } else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) { - evalStringCache[element] = TransferInstr(ti_lit, string((char *) element->properties->children->content), 0); + evalStringCache[element] = TransferInstr(ti_lit, to_ustring((const char *) element->properties->children->content), 0); } else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) { if(element->properties == NULL) { - evalStringCache[element] = TransferInstr(ti_b, " ", -1); + evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); } else { int pos = atoi((const char *) element->properties->children->content) - 1; - evalStringCache[element] = TransferInstr(ti_b, "", pos); + evalStringCache[element] = TransferInstr(ti_b, ""_u, pos); } } else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) @@ -439,26 +199,26 @@ Postchunk::evalString(xmlNode *element) } } - evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param); + evalStringCache[element] = TransferInstr(ti_get_case_from, "lem"_u, pos, param); } else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) { - evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0); + evalStringCache[element] = TransferInstr(ti_var, to_ustring((const char *) element->properties->children->content), 0); } else if(!xmlStrcmp(element->name, (const xmlChar *) "lu-count")) { - evalStringCache[element] = TransferInstr(ti_lu_count, "", 0); + evalStringCache[element] = TransferInstr(ti_lu_count, ""_u, 0); } else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) { int pos = 0; - xmlChar *part = NULL; + UString part; for(xmlAttr *i = element->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((const char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { @@ -466,11 +226,11 @@ Postchunk::evalString(xmlNode *element) } } - evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos); + evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); } else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) { - string value; + UString value; for(xmlNode *i = element->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) @@ -485,7 +245,7 @@ Postchunk::evalString(xmlNode *element) in_lu = true; out_wblank.clear(); - string myword; + UString myword; for(xmlNode *i = element->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) @@ -501,18 +261,15 @@ Postchunk::evalString(xmlNode *element) out_wblank = word[1]->getWblank(); } - if(myword != "") - { - return out_wblank+"^"+myword+"$"; - } - else - { - return ""; + if(myword.empty()) { + return ""_u; + } else { + return out_wblank+"^"_u+myword+"$"_u; } } else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu")) { - string value; + UString value; bool first_time = true; out_wblank.clear(); @@ -523,7 +280,7 @@ Postchunk::evalString(xmlNode *element) { in_lu = true; - string myword; + UString myword; for(xmlNode *j = i->children; j != NULL; j = j->next) { @@ -537,17 +294,16 @@ Postchunk::evalString(xmlNode *element) if(!first_time) { - if(myword != "" && myword[0] != '#') //'+#' problem + if(!myword.empty() && myword[0] != '#') //'+#' problem { - value.append("+"); - } + value += '+'; + } } else { - if(myword != "") - { + if (!myword.empty()) { first_time = false; - } + } } value.append(myword); @@ -559,13 +315,10 @@ Postchunk::evalString(xmlNode *element) out_wblank = word[1]->getWblank(); } - if(value != "") - { - return out_wblank+"^"+value+"$"; - } - else - { - return ""; + if (value.empty()) { + return ""_u; + } else { + return out_wblank+"^"_u+value+"$"_u; } } @@ -592,7 +345,7 @@ Postchunk::processOut(xmlNode *localroot) in_lu = true; out_wblank.clear(); - string myword; + UString myword; for(xmlNode *j = i->children; j != NULL; j = j->next) { if(j->type == XML_ELEMENT_NODE) @@ -614,7 +367,7 @@ Postchunk::processOut(xmlNode *localroot) } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { - string myword; + UString myword; bool first_time = true; out_wblank.clear(); @@ -624,7 +377,7 @@ Postchunk::processOut(xmlNode *localroot) { in_lu = true; - string mylocalword; + UString mylocalword; for(xmlNode *k = j->children; k != NULL; k = k->next) { if(k->type == XML_ELEMENT_NODE) @@ -637,14 +390,14 @@ Postchunk::processOut(xmlNode *localroot) if(!first_time) { - if(mylocalword != "") + if(!mylocalword.empty()) { myword += '+'; } } else { - if(mylocalword != "") + if(!mylocalword.empty()) { first_time = false; } @@ -692,35 +445,6 @@ Postchunk::processTags(xmlNode *localroot) } } -void -Postchunk::processInstruction(xmlNode *localroot) -{ - if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) - { - processChoose(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) - { - processLet(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) - { - processAppend(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) - { - processOut(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) - { - processCallMacro(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) - { - processModifyCase(localroot); - } -} - void Postchunk::processLet(xmlNode *localroot) { @@ -776,7 +500,7 @@ Postchunk::processLet(xmlNode *localroot) { in_let_var = true; - string const val = (const char *) leftSide->properties->children->content; + UString const val = to_ustring((const char *) leftSide->properties->children->content); var_val = val; var_out_wblank[var_val].clear(); @@ -789,13 +513,13 @@ Postchunk::processLet(xmlNode *localroot) else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL; + UString part; for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((const char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { @@ -804,39 +528,13 @@ Postchunk::processLet(xmlNode *localroot) } - bool match = word[pos]->setChunkPart(attr_items[(const char *) part], + bool match = word[pos]->setChunkPart(attr_items[part], evalString(rightSide)); if(!match && trace) { cerr << "apertium-postchunk warning: on line " << localroot->line << " sometimes discards its value." << endl; } - evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part, - pos, NULL); - } -} - -void -Postchunk::processAppend(xmlNode *localroot) -{ - string name; - for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "n")) - { - name = (char *) i->children->content; - break; - } - } - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - in_let_var = true; - var_val = name; - variables[name].append(evalString(i)); - in_let_var = false; - } + evalStringCache[leftSide] = TransferInstr(ti_clip_tl, part, pos, NULL); } } @@ -864,23 +562,23 @@ Postchunk::processModifyCase(xmlNode *localroot) if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL; + UString part; for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((const char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { - pos = atoi((const char *) i->children->content); + pos = atoi((const char *) i->children->content); } } - string const result = copycase(evalString(rightSide), - word[pos]->chunkPart(attr_items[(const char *) part])); - bool match = word[pos]->setChunkPart(attr_items[(const char *) part], result); + UString const result = copycase(evalString(rightSide), + word[pos]->chunkPart(attr_items[part])); + bool match = word[pos]->setChunkPart(attr_items[part], result); if(!match && trace) { @@ -889,7 +587,7 @@ Postchunk::processModifyCase(xmlNode *localroot) } else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { - string const val = (const char *) leftSide->properties->children->content; + UString const val = to_ustring((const char *) leftSide->properties->children->content); variables[val] = copycase(evalString(rightSide), variables[val]); } } @@ -897,7 +595,7 @@ Postchunk::processModifyCase(xmlNode *localroot) void Postchunk::processCallMacro(xmlNode *localroot) { - const char *n = (const char *) localroot->properties->children->content; + UString n = to_ustring((const char *) localroot->properties->children->content); int npar = 0; xmlNode *macro = macro_map[macros[n]]; @@ -962,622 +660,84 @@ Postchunk::processCallMacro(xmlNode *localroot) delete[] myword; } -void -Postchunk::processChoose(xmlNode *localroot) +UString +Postchunk::copycase(UString const &source_word, UString const &target_word) { - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "when")) - { - bool picked_option = false; + UString result; - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(j->name, (const xmlChar *) "test")) - { - if(!processTest(j)) - { - break; - } - else - { - picked_option = true; - } - } - else - { - processInstruction(j); - } - } - } - if(picked_option) - { - return; - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) - { - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - processInstruction(j); - } - } - } - } - } -} + bool firstupper = iswupper(source_word[0]); + bool uppercase = firstupper && iswupper(source_word[source_word.size()-1]); + bool sizeone = source_word.size() == 1; -bool -Postchunk::processLogical(xmlNode *localroot) -{ - if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) - { - return processEqual(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) - { - return processBeginsWith(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) - { - return processBeginsWithList(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) - { - return processEndsWith(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) + if(!uppercase || (sizeone && uppercase)) { - return processEndsWithList(localroot); + result = StringUtils::tolower(target_word); } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) + else { - return processContainsSubstring(localroot); + result = StringUtils::toupper(target_word); } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) + + if(firstupper) { - return processOr(localroot); + // TODO: 32 + result[0] = u_toupper(result[0]); } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) + + return result; +} + +UString +Postchunk::caseOf(UString const &s) +{ + if(s.size() > 1) { - return processAnd(localroot); + if(!iswupper(s[0])) + { + return "aa"_u; + } + else if(!iswupper(s[s.size()-1])) + { + return "Aa"_u; + } + else + { + return "AA"_u; + } } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) + else if(s.size() == 1) { - return processNot(localroot); + if(!iswupper(s[0])) + { + return "aa"_u; + } + else + { + return "Aa"_u; + } } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) + else { - return processIn(localroot); + return "aa"_u; } +} - return false; +UString +Postchunk::tolower(UString const &str) const +{ + return StringUtils::tolower(str); } -bool -Postchunk::processIn(xmlNode *localroot) +UString +Postchunk::tags(UString const &str) const { - xmlNode *value = NULL; - xmlChar *idlist = NULL; + UString result = "<"_u; - for(xmlNode *i = localroot->children; i != NULL; i = i->next) + for(unsigned int i = 0, limit = str.size(); i != limit; i++) { - if(i->type == XML_ELEMENT_NODE) + if(str[i] == '.') { - if(value == NULL) - { - value = i; - } - else - { - idlist = i->properties->children->content; - break; - } - } - } - - string sval = evalString(value); - - if(localroot->properties != NULL) - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - set &myset = listslow[(const char *) idlist]; - if(myset.find(tolower(sval)) != myset.end()) - { - return true; - } - else - { - return false; - } - } - } - - set &myset = lists[(const char *) idlist]; - if(myset.find(sval) != myset.end()) - { - return true; - } - else - { - return false; - } -} - -bool -Postchunk::processTest(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - return processLogical(i); - } - } - return false; -} - -bool -Postchunk::processAnd(xmlNode *localroot) -{ - bool val = true; - for(xmlNode *i = localroot->children; val && i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - val = val && processLogical(i); - } - } - - return val; -} - -bool -Postchunk::processOr(xmlNode *localroot) -{ - bool val = false; - for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - val = val || processLogical(i); - } - } - - return val; -} - -bool -Postchunk::processNot(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - return !processLogical(i); - } - } - return false; -} - -bool -Postchunk::processEqual(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return evalString(first) == evalString(second); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return tolower(evalString(first)) == tolower(evalString(second)); - } - else - { - return evalString(first) == evalString(second); - } - } -} - -bool -Postchunk::beginsWith(string const &s1, string const &s2) const -{ - int const limit = s2.size(), constraint = s1.size(); - - if(constraint < limit) - { - return false; - } - for(int i = 0; i != limit; i++) - { - if(s1[i] != s2[i]) - { - return false; - } - } - - return true; -} - -bool -Postchunk::endsWith(string const &s1, string const &s2) const -{ - int const limit = s2.size(), constraint = s1.size(); - - if(constraint < limit) - { - return false; - } - for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) - { - if(s1[j] != s2[i]) - { - return false; - } - } - - return true; -} - - -bool -Postchunk::processBeginsWith(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return beginsWith(evalString(first), evalString(second)); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return beginsWith(tolower(evalString(first)), tolower(evalString(second))); - } - else - { - return beginsWith(evalString(first), evalString(second)); - } - } -} - -bool -Postchunk::processEndsWith(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return endsWith(evalString(first), evalString(second)); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return endsWith(tolower(evalString(first)), tolower(evalString(second))); - } - else - { - return endsWith(evalString(first), evalString(second)); - } - } -} - -bool -Postchunk::processBeginsWithList(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - xmlChar *idlist = second->properties->children->content; - string needle = evalString(first); - set::iterator it, limit; - - if(localroot->properties == NULL || - xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) - { - it = lists[(const char *) idlist].begin(); - limit = lists[(const char *) idlist].end(); - } - else - { - needle = tolower(needle); - it = listslow[(const char *) idlist].begin(); - limit = listslow[(const char *) idlist].end(); - } - - for(; it != limit; it++) - { - if(beginsWith(needle, *it)) - { - return true; - } - } - return false; -} - -bool -Postchunk::processEndsWithList(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - xmlChar *idlist = second->properties->children->content; - string needle = evalString(first); - set::iterator it, limit; - - if(localroot->properties == NULL || - xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) - { - it = lists[(const char *) idlist].begin(); - limit = lists[(const char *) idlist].end(); - } - else - { - needle = tolower(needle); - it = listslow[(const char *) idlist].begin(); - limit = listslow[(const char *) idlist].end(); - } - - for(; it != limit; it++) - { - if(endsWith(needle, *it)) - { - return true; - } - } - return false; -} - - -bool -Postchunk::processContainsSubstring(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return evalString(first).find(evalString(second)) != string::npos; - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return tolower(evalString(first)).find(tolower(evalString(second))) != string::npos; - } - else - { - return evalString(first).find(evalString(second)) != string::npos; - } - } -} - -string -Postchunk::copycase(string const &source_word, string const &target_word) -{ - UString result; - UString const s_word = UtfConverter::fromUtf8(source_word); - UString const t_word = UtfConverter::fromUtf8(target_word); - - bool firstupper = iswupper(s_word[0]); - bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); - bool sizeone = s_word.size() == 1; - - if(!uppercase || (sizeone && uppercase)) - { - result = StringUtils::tolower(t_word); - } - else - { - result = StringUtils::toupper(t_word); - } - - if(firstupper) - { - result[0] = towupper(result[0]); - } - - return UtfConverter::toUtf8(result); -} - -string -Postchunk::caseOf(string const &str) -{ - UString const s = UtfConverter::fromUtf8(str); - - if(s.size() > 1) - { - if(!iswupper(s[0])) - { - return "aa"; - } - else if(!iswupper(s[s.size()-1])) - { - return "Aa"; - } - else - { - return "AA"; - } - } - else if(s.size() == 1) - { - if(!iswupper(s[0])) - { - return "aa"; - } - else - { - return "Aa"; - } - } - else - { - return "aa"; - } -} - -UString -Postchunk::caseOf(UString const &str) -{ - if(str.size() > 1) - { - if(!iswupper(str[0])) - { - return "aa"; - } - else if(!iswupper(str[str.size()-1])) - { - return "Aa"; - } - else - { - return "AA"; - } - } - else if(str.size() == 1) - { - if(!iswupper(str[0])) - { - return "aa"; - } - else - { - return "Aa"; - } - } - else - { - return "aa"; - } -} - -string -Postchunk::tolower(string const &str) const -{ - return UtfConverter::toUtf8(StringUtils::tolower(UtfConverter::fromUtf8(str))); -} - -string -Postchunk::tags(string const &str) const -{ - string result = "<"; - - for(unsigned int i = 0, limit = str.size(); i != limit; i++) - { - if(str[i] == '.') - { - result.append("><"); + result.append("><"_u); } else { @@ -1590,7 +750,7 @@ Postchunk::tags(string const &str) const return result; } -void +int Postchunk::processRule(xmlNode *localroot) { // localroot is suposed to be an 'action' tag @@ -1604,12 +764,13 @@ Postchunk::processRule(xmlNode *localroot) while(!blank_queue.empty()) //flush remaining blanks that are not spaces { - if(blank_queue.front().compare(" ") != 0) + if(blank_queue.front().compare(" "_u) != 0) { write(blank_queue.front(), output); } blank_queue.pop(); } + return -1; } TransferToken & @@ -1623,64 +784,50 @@ Postchunk::readToken(InputFile& in) UString content; while(true) { - int val = fgetwc_unlocked(in); - if(feof(in) || (internal_null_flush && val == 0)) + UChar32 val = in.get(); + if(in.eof() || (internal_null_flush && val == 0)) { return input_buffer.add(TransferToken(content, tt_eof)); } if(val == '\\') { content += '\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += in.get(); } else if(val == '[') { content += '['; while(true) { - int val2 = fgetwc_unlocked(in); - if(val2 == '\\') - { - content += '\\'; - content += wchar_t(fgetwc_unlocked(in)); - } - else if(val2 == ']') - { - content += ']'; - break; - } - else - { - content += wchar_t(val2); - } + UChar32 val2 = in.get(); + if(val2 == '\\') { + content += '\\'; + content += in.get(); + } else if(val2 == ']') { + content += ']'; + break; + } else { + content += val2; + } } } else if(inword && val == '{') { content += '{'; - while(true) - { - int val2 = fgetwc_unlocked(in); - if(val2 == '\\') - { - content += '\\'; - content += wchar_t(fgetwc_unlocked(in)); - } - else if(val2 == '}') - { - int val3 = wchar_t(fgetwc_unlocked(in)); - ungetwc(val3, in); - - content += '}'; - if(val3 == '$') - { - break; - } - } - else - { - content += wchar_t(val2); - } + while(true) { + UChar32 val2 = in.get(); + if(val2 == '\\') { + content += '\\'; + content += in.get(); + } else if(val2 == '}') { + UChar32 val3 = in.peek(); + content += '}'; + if(val3 == '$') { + break; + } + } else { + content += val2; + } } } else if(inword && val == '$') @@ -1724,15 +871,11 @@ Postchunk::postchunk_wrapper_null_flush(InputFile& in, UFILE* out) null_flush = false; internal_null_flush = true; - while(!feof(in)) + while(!in.eof()) { postchunk(in, out); u_fputc('\0', out); - int code = fflush(out); - if(code != 0) - { - cerr << "Could not flush output " << errno << endl; - } + u_fflush(out); } internal_null_flush = false; @@ -1841,17 +984,15 @@ Postchunk::applyRule() word = new InterchunkWord *[tmpword.size()+1]; lword = tmpword.size(); - word[0] = new InterchunkWord(UtfConverter::toUtf8(wordzero(chunk))); + word[0] = new InterchunkWord(wordzero(chunk)); for(unsigned int i = 1, limit = tmpword.size()+1; i != limit; i++) { - if(i != 1) - { - string blank_to_add = string(UtfConverter::toUtf8(*tmpblank[i-1])); - blank_queue.push(blank_to_add); + if(i != 1) { + blank_queue.push(*tmpblank[i-1]); } - word[i] = new InterchunkWord(UtfConverter::toUtf8(*tmpword[i-1])); + word[i] = new InterchunkWord(*tmpword[i-1]); } processRule(lastrule); @@ -1944,7 +1085,8 @@ Postchunk::getVecTags(UString const &chunk) mytag += chunk[i++]; } while(chunk[i] != '>'); - vectags.push_back(mytag + '>'); + mytag += '>'; + vectags.push_back(mytag); } else if(chunk[i] == '{') { @@ -1992,7 +1134,7 @@ Postchunk::wordzero(UString const &chunk) } } - return ""; + return ""_u; } UString @@ -2010,7 +1152,7 @@ Postchunk::pseudolemma(UString const &chunk) } } - return ""; + return ""_u; } void @@ -2021,11 +1163,11 @@ Postchunk::unchunk(UString const &chunk, UFILE* output) bool uppercase_all = false; bool uppercase_first = false; - if(case_info == "AA") + if(case_info == "AA"_u) { uppercase_all = true; } - else if(case_info == "Aa") + else if(case_info == "Aa"_u) { uppercase_first = true; } @@ -2049,8 +1191,10 @@ Postchunk::unchunk(UString const &chunk, UFILE* output) if(iswdigit(chunk[i+1])) { // replace tag - unsigned long value = wcstoul(chunk.c_str()+i+1, - NULL, 0) - 1; + // TODO + unsigned long value = stoi(chunk.c_str()+i+1) - 1; + //unsigned long value = wcstoul(chunk.c_str()+i+1, + // NULL, 0) - 1; //atoi(chunk.c_str()+i+1)-1; if(vectags.size() > value) { @@ -2069,19 +1213,18 @@ Postchunk::unchunk(UString const &chunk, UFILE* output) { if(uppercase_all) { - u_fputc(towupper(chunk[i]), output); + // TODO + u_fputc(u_toupper(chunk[i]), output); } else if(uppercase_first) { - if(iswalnum(chunk[i])) - { - u_fputc(towupper(chunk[i]), output); - uppercase_first = false; - } - else - { - u_fputc(chunk[i], output); - } + if(iswalnum(chunk[i])) { + // TODO + u_fputc(u_toupper(chunk[i]), output); + uppercase_first = false; + } else { + u_fputc(chunk[i], output); + } } else { @@ -2126,11 +1269,11 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, bool uppercase_first = false; bool lastblank = true; - if(case_info == "AA") + if(case_info == "AA"_u) { uppercase_all = true; } - else if(case_info == "Aa") + else if(case_info == "Aa"_u) { uppercase_first = true; } @@ -2141,7 +1284,7 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, { if(!lastblank) { - blanks.push_back(new UString("")); + blanks.push_back(new UString(""_u)); } lastblank = false; UString *myword = new UString(); @@ -2159,8 +1302,10 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, if(iswdigit(chunk[i+1])) { // replace tag - unsigned long value = wcstoul(chunk.c_str()+i+1, - NULL, 0) - 1; + unsigned long value = stoi(chunk.c_str()+i+1) - 1; + // TODO + //unsigned long value = wcstoul(chunk.c_str()+i+1, + // NULL, 0) - 1; if(vectags.size() > value) { ref.append(vectags[value]); @@ -2178,13 +1323,15 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, { if(uppercase_all) { - ref += towupper(chunk[i]); + // TODO + ref += u_toupper(chunk[i]); } else if(uppercase_first) { if(iswalnum(chunk[i])) { - ref += towupper(chunk[i]); + // TODO + ref += u_toupper(chunk[i]); uppercase_first = false; } else @@ -2207,7 +1354,7 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, { if(!lastblank) { - blanks.push_back(new UString("")); + blanks.push_back(new UString(""_u)); } lastblank = false; UString *myword = new UString(); @@ -2223,7 +1370,7 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, else if(chunk[i] == ']' && chunk[i-1] == ']') { ref += chunk[i]; - i++; //i->"^" + i++; //i->"^"_u break; } else @@ -2246,8 +1393,10 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, if(iswdigit(chunk[i+1])) { // replace tag - unsigned long value = wcstoul(chunk.c_str()+i+1, - NULL, 0) - 1; + unsigned long value = stoi(chunk.c_str()+i+1) - 1; + //unsigned long value = wcstoul(chunk.c_str()+i+1, + // NULL, 0) - 1; + // TODO: make sure this is equivalent if(vectags.size() > value) { ref.append(vectags[value]); @@ -2265,13 +1414,14 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, { if(uppercase_all) { - ref += towupper(chunk[i]); + // TODO + ref += u_toupper(chunk[i]); } else if(uppercase_first) { - if(iswalnum(chunk[i])) + if(u_isalnum(chunk[i])) // TODO { - ref += towupper(chunk[i]); + ref += u_toupper(chunk[i]); // TODO uppercase_first = false; } else @@ -2317,7 +1467,7 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, { if (!lastblank) { - UString *myblank = new UString(""); + UString *myblank = new UString(""_u); blanks.push_back(myblank); } UString &ref = *(blanks.back()); diff --git a/apertium/postchunk.h b/apertium/postchunk.h index e704274..7a8ba51 100644 --- a/apertium/postchunk.h +++ b/apertium/postchunk.h @@ -17,45 +17,25 @@ #ifndef _POSTCHUNK_ #define _POSTCHUNK_ -#include +#include + #include #include -#include -#include #include -#include -#include -#include +#include #include -#include -#include -#include -#include #include #include using namespace std; -class Postchunk +class Postchunk : TransferBase { private: - Alphabet alphabet; - MatchExe *me; - MatchState ms; - map attr_items; - map variables; - map macros; - map> lists; - map> listslow; - vector macro_map; - vector rule_map; - vector rule_lines; - xmlDoc *doc; - xmlNode *root_element; InterchunkWord **word; - queue blank_queue; + queue blank_queue; int lword; Buffer input_buffer; vector tmpword; @@ -63,64 +43,36 @@ private: bool in_out; bool in_lu; - bool in_let_var; - string var_val; bool in_wblank; - string out_wblank; - map var_out_wblank; + UString out_wblank; + map var_out_wblank; UFILE *output; - int any_char; - int any_tag; xmlNode *lastrule; unsigned int nwords; - map evalStringCache; - bool inword; - bool null_flush; - bool internal_null_flush; - bool trace; - void destroy(); - void readData(FILE *input); - void readPostchunk(string const &input); - void collectMacros(xmlNode *localroot); - void collectRules(xmlNode *localroot); - static string caseOf(string const &str); static UString caseOf(UString const &str); - string copycase(string const &source_word, string const &target_word); + UString copycase(UString const &source_word, UString const &target_word); void processLet(xmlNode *localroot); - void processAppend(xmlNode *localroot); void processOut(xmlNode *localroot); void processCallMacro(xmlNode *localroot); void processModifyCase(xmlNode *localroot); - bool processLogical(xmlNode *localroot); - bool processTest(xmlNode *localroot); - bool processAnd(xmlNode *localroot); - bool processOr(xmlNode *localroot); - bool processEqual(xmlNode *localroot); - bool processBeginsWith(xmlNode *localroot); - bool processBeginsWithList(xmlNode *localroot); - bool processEndsWith(xmlNode *localroot); - bool processEndsWithList(xmlNode *localroot); - bool processContainsSubstring(xmlNode *localroot); - bool processNot(xmlNode *localroot); - bool processIn(xmlNode *localroot); - void processRule(xmlNode *localroot); - string evalString(xmlNode *localroot); + UString evalString(xmlNode *localroot); + int processRule(xmlNode* localroot); void processInstruction(xmlNode *localroot); void processChoose(xmlNode *localroot); void processTags(xmlNode *localroot); - bool beginsWith(string const &str1, string const &str2) const; - bool endsWith(string const &str1, string const &str2) const; - string tolower(string const &str) const; - string tags(string const &str) const; - string readWord(InputFile& in); - string readBlank(InputFile& in); - string readUntil(InputFile& in, int const symbol) const; + bool beginsWith(UString const &str1, UString const &str2) const; + bool endsWith(UString const &str1, UString const &str2) const; + UString tolower(UString const &str) const; + UString tags(UString const &str) const; + UString readWord(InputFile& in); + UString readBlank(InputFile& in); + UString readUntil(InputFile& in, int const symbol) const; void applyWord(UString const &word_str); void applyRule(); TransferToken & readToken(InputFile& in); @@ -135,8 +87,8 @@ private: static UString wordzero(UString const &chunk); bool checkIndex(xmlNode *element, int index, int limit); void postchunk_wrapper_null_flush(InputFile& in, UFILE* out); - bool gettingLemmaFromWord(string attr); - string combineWblanks(string wblank_current, string wblank_to_add); + bool gettingLemmaFromWord(UString attr); + UString combineWblanks(UString wblank_current, UString wblank_to_add); public: Postchunk(); diff --git a/apertium/pretransfer.cc b/apertium/pretransfer.cc index 1b42529..8f7778b 100644 --- a/apertium/pretransfer.cc +++ b/apertium/pretransfer.cc @@ -11,12 +11,12 @@ UString storeAndWriteWblank(InputFile& input, UFILE* output) { int mychar; - UString content = "[["; + UString content = "[["_u; while(true) { - mychar = fgetwc_unlocked(input); - if(feof(input)) + mychar = input.get(); + if(input.eof()) { cerr << "ERROR: Unexpected EOF" << endl; exit(EXIT_FAILURE); @@ -27,13 +27,13 @@ UString storeAndWriteWblank(InputFile& input, UFILE* output) if(mychar == '\\') { - mychar = fgetwc(input); + mychar = input.get(); content += mychar; u_fputc(mychar, output); } else if(mychar == ']') { - mychar = fgetwc(input); + mychar = input.get(); if(mychar == ']') { @@ -51,9 +51,9 @@ void readAndWriteUntil(InputFile& input, UFILE* output, int const charcode) { int mychar; - while((mychar = fgetwc_unlocked(input)) != charcode) + while((mychar = input.get()) != charcode) { - if(feof(input)) + if(input.eof()) { cerr << "ERROR: Unexpected EOF" << endl; exit(EXIT_FAILURE); @@ -61,16 +61,16 @@ void readAndWriteUntil(InputFile& input, UFILE* output, int const charcode) u_fputc(mychar, output); if(mychar == '\\') { - mychar = fgetwc(input); + mychar = input.get(); u_fputc(mychar, output); } } } -void procWord(InputFile& input, UFILE* output, bool surface_forms, bool compound_sep, UString wblank = "") +void procWord(InputFile& input, UFILE* output, bool surface_forms, bool compound_sep, UString wblank = ""_u) { int mychar; - UString buffer = ""; + UString buffer; bool buffer_mode = false; bool in_tag = false; @@ -78,12 +78,12 @@ void procWord(InputFile& input, UFILE* output, bool surface_forms, bool compound if(surface_forms) { - while((mychar = fgetwc_unlocked(input)) != '/') ; + while((mychar = input.get()) != '/') ; } - while((mychar = fgetwc_unlocked(input)) != '$') + while((mychar = input.get()) != '$') { - if(feof(input)) + if(input.eof()) { cerr << "ERROR: Unexpected EOF" << endl; exit(EXIT_FAILURE); @@ -121,24 +121,24 @@ void procWord(InputFile& input, UFILE* output, bool surface_forms, bool compound } else if(in_tag == false && mychar == '+') { - buffer.append("$ "); + buffer.append("$ "_u); buffer.append(wblank); - buffer.append("^"); + buffer.append("^"_u); } else if(in_tag == false && mychar == '~' and compound_sep == true) { - buffer.append("$"); + buffer.append("$"_u); buffer.append(wblank); - buffer.append("^"); + buffer.append("^"_u); } } else { if(mychar == '+' && queuing == true) { - buffer.append("$ "); + buffer.append("$ "_u); buffer.append(wblank); - buffer.append("^"); + buffer.append("^"_u); buffer_mode = true; } else @@ -155,8 +155,8 @@ void processStream(InputFile& input, UFILE* output, bool null_flush, bool surfac { while(true) { - int mychar = fgetwc_unlocked(input); - if(feof(input)) + int mychar = input.get(); + if(input.eof()) { break; } @@ -164,13 +164,13 @@ void processStream(InputFile& input, UFILE* output, bool null_flush, bool surfac { case '[': u_fputc('[', output); - mychar = fgetwc_unlocked(input); + mychar = input.get(); if(mychar == '[') { u_fputc('[', output); UString wblank = storeAndWriteWblank(input, output); - mychar = fgetwc_unlocked(input); + mychar = input.get(); if(mychar == '^') { @@ -186,7 +186,7 @@ void processStream(InputFile& input, UFILE* output, bool null_flush, bool surfac } else { - ungetwc(mychar, input); + input.unget(mychar); readAndWriteUntil(input, output, ']'); u_fputc(']', output); } @@ -194,7 +194,7 @@ void processStream(InputFile& input, UFILE* output, bool null_flush, bool surfac case '\\': u_fputc(mychar, output); - u_fputc(fgetwc_unlocked(input), output); + u_fputc(input.get(), output); break; case '^': @@ -208,7 +208,7 @@ void processStream(InputFile& input, UFILE* output, bool null_flush, bool surfac if(null_flush) { - fflush(output); + u_fflush(output); } break; diff --git a/apertium/pretransfer.h b/apertium/pretransfer.h index d749318..ffc9737 100644 --- a/apertium/pretransfer.h +++ b/apertium/pretransfer.h @@ -16,9 +16,9 @@ #ifndef PRETRANSFER_H #define PRETRANSFER_H -#include -#include +#include #include +#include UString storeAndWriteWblank(InputFile& input, UFILE *output); void readAndWriteUntil(InputFile& input, UFILE *output, int const charcode); diff --git a/apertium/sentence_stream.cc b/apertium/sentence_stream.cc index d65e698..a90e56e 100644 --- a/apertium/sentence_stream.cc +++ b/apertium/sentence_stream.cc @@ -21,7 +21,7 @@ bool isSentenceEnd(StreamedType &token) { return false; } Tag &tag = *tags.begin(); - if (tag.TheTag != "sent") { + if (tag.TheTag != "sent"_u) { return false; } return true; @@ -37,7 +37,7 @@ bool isSentenceEnd(StreamedType tok, Stream &in, bool sent_seg) { SentenceTagger::SentenceTagger() {} -void SentenceTagger::tag(Stream &in, std::wostream &out, bool sent_seg) { +void SentenceTagger::tag(Stream &in, std::ostream &out, bool sent_seg) { clearBuffers(); while (true) { @@ -67,7 +67,7 @@ void SentenceTagger::clearBuffers() const { flushes.clear(); } -void SentenceTagger::tagAndPutSentence(std::wostream &out) { +void SentenceTagger::tagAndPutSentence(std::ostream &out) { TaggedSentence tagged_sent = tagSentence(lexical_sent); TaggedSentence::const_iterator ts_it = tagged_sent.begin(); diff --git a/apertium/sentence_stream.h b/apertium/sentence_stream.h index aea298e..133f6ba 100644 --- a/apertium/sentence_stream.h +++ b/apertium/sentence_stream.h @@ -20,18 +20,18 @@ namespace SentenceStream { bool isSentenceEnd(Stream &in, bool sent_seg = false); class SentenceTagger { public: - void tag(Stream &in, std::wostream &out, bool sent_seg); + void tag(Stream &in, std::ostream &out, bool sent_seg); SentenceTagger(); protected: virtual TaggedSentence tagSentence(const Sentence &untagged) = 0; virtual void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output) = 0; + std::ostream &output) = 0; private: void clearBuffers() const; - void tagAndPutSentence(std::wostream &out); + void tagAndPutSentence(std::ostream &out); void putTaggedSent( - std::wostream &out, TaggedSentence &tagged_sent, Sentence &full_sent, + std::ostream &out, TaggedSentence &tagged_sent, Sentence &full_sent, std::vector &flushes) const; mutable Sentence full_sent; mutable Sentence lexical_sent; diff --git a/apertium/shell_utils.cc b/apertium/shell_utils.cc index cccfa5b..e9d040c 100644 --- a/apertium/shell_utils.cc +++ b/apertium/shell_utils.cc @@ -73,12 +73,14 @@ FILE *try_open_file(const char *metavar, const char *filename, return f; } -FILE *try_open_file_utf8(const char *metavar, const char *filename, +UFILE* try_open_file_utf8(const char *metavar, const char *filename, const char *flags) { - FILE *f = try_open_file(metavar, filename, flags); -#ifdef _MSC_VER - _setmode(_fileno(f), _O_U8TEXT); -#endif // _MSC_VER + UFILE* f = u_fopen(filename, flags, NULL, NULL); + if (f == NULL) { + std::stringstream what_; + what_ << "can't open " << metavar << " file \"" << filename << "\""; + throw Exception::Shell::FopenError(what_); + } return f; } diff --git a/apertium/shell_utils.h b/apertium/shell_utils.h index 11b7a36..f2ca314 100644 --- a/apertium/shell_utils.h +++ b/apertium/shell_utils.h @@ -3,6 +3,7 @@ #include #include +#include namespace Apertium { namespace ShellUtils { @@ -22,7 +23,7 @@ FILE* try_open_file(const char *metavar, const char *filename, const char *flags); -FILE* +UFILE* try_open_file_utf8(const char *metavar, const char *filename, const char *flags); diff --git a/apertium/stream.cc b/apertium/stream.cc index 7d68af3..43d99fe 100644 --- a/apertium/stream.cc +++ b/apertium/stream.cc @@ -25,23 +25,23 @@ namespace Apertium { Stream::Stream(TaggerFlags &Flags_) - : TheLineNumber(1), TheCharacterStream(std::wcin), TheFilename(), TheLine(), + : TheLineNumber(1), TheCharacterStream(std::cin), TheFilename(), TheLine(), TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} Stream::Stream(TaggerFlags &Flags_, - std::wifstream &CharacterStream_, const char *const Filename_) + std::ifstream &CharacterStream_, const char *const Filename_) : TheLineNumber(1), TheCharacterStream(CharacterStream_), TheFilename(Filename_), TheLine(), TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} Stream::Stream(TaggerFlags &Flags_, - std::wifstream &CharacterStream_, const std::string &Filename_) + std::ifstream &CharacterStream_, const std::string &Filename_) : TheLineNumber(1), TheCharacterStream(CharacterStream_), TheFilename(Filename_), TheLine(), TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} Stream::Stream(TaggerFlags &Flags_, - std::wifstream &CharacterStream_, + std::ifstream &CharacterStream_, const std::stringstream &Filename_) : TheLineNumber(1), TheCharacterStream(CharacterStream_), TheFilename(Filename_.str()), TheLine(), TheFlags(Flags_), private_flush_(false), @@ -64,15 +64,15 @@ StreamedType Stream::get() { TheLine.push_back(Character_); switch (Character_) { - case L'\\': // <\> 92, Hex 5c, Octal 134 + case '\\': // <\> 92, Hex 5c, Octal 134 case_0x5c(TheStreamedType, Lemma, Character_); continue; - case L'[': + case '[': if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': - case L'$': + case '[': + case ']': + case '$': break; default: std::stringstream Message; @@ -86,7 +86,7 @@ StreamedType Stream::get() { push_back_Character(TheStreamedType, Lemma, Character_); ThePreviousCase = PreviousCaseType(Character_); continue; - case L']': + case ']': if (!ThePreviousCase) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -95,8 +95,8 @@ StreamedType Stream::get() { } switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': + case '[': + case ']': push_back_Character(TheStreamedType, Lemma, Character_); ThePreviousCase = PreviousCaseType(Character_); continue; @@ -109,14 +109,14 @@ StreamedType Stream::get() { } std::abort(); - case L'^': + case '^': if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L']': - case L'$': + case ']': + case '$': break; default: std::stringstream Message; @@ -130,7 +130,7 @@ StreamedType Stream::get() { TheStreamedType.TheLexicalUnit = LexicalUnit(); ThePreviousCase = PreviousCaseType(Character_); continue; - case L'/': + case '/': if (!ThePreviousCase) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -140,10 +140,10 @@ StreamedType Stream::get() { } switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'^': + case '^': if (ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -172,16 +172,16 @@ StreamedType Stream::get() { TheLine.push_back(Character_); switch (Character_) { - case L'\\': + case '\\': TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); TheStreamedType.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.push_back(Morpheme()); case_0x5c(TheStreamedType, Lemma, Character_); continue; - case L'*': + case '*': ThePreviousCase = PreviousCaseType(Character_); continue; - case L'\n': { + case '\n': { std::stringstream Message; Message << "unexpected newline following '" << ThePreviousCase->ThePreviousCase @@ -189,7 +189,7 @@ StreamedType Stream::get() { throw Exception::Stream::UnexpectedCharacter( Message_what(Message)); }; - case L'<': + case '<': TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); TheStreamedType.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.push_back(Morpheme()); @@ -199,13 +199,13 @@ StreamedType Stream::get() { ThePreviousCase = PreviousCaseType(Character_); continue; - case L'[': - case L']': - case L'^': - case L'#': - case L'>': - case L'+': - case L'$': { + case '[': + case ']': + case '^': + case '#': + case '>': + case '+': + case '$': { std::stringstream Message; Message << "unexpected '" << Character_ << "' immediately following '" @@ -223,7 +223,7 @@ StreamedType Stream::get() { } continue; - case L'>': + case '>': if (!ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -235,7 +235,7 @@ StreamedType Stream::get() { } break; - case L'#': + case '#': if (ThePreviousCase->isPreviousCharacter) { std::stringstream Message; @@ -262,12 +262,12 @@ StreamedType Stream::get() { .TheMorphemes.push_back(Morpheme()); ThePreviousCase = PreviousCaseType(Character_); continue; - case L'*': + case '*': if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': - case L'$': + case '[': + case ']': + case '$': break; default: std::stringstream Message; @@ -281,7 +281,7 @@ StreamedType Stream::get() { push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'<': + case '<': if (!ThePreviousCase) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -292,14 +292,14 @@ StreamedType Stream::get() { } switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'/': + case '/': break; - case L'#': + case '#': //std::cerr << "[306] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; - case L'+': + case '+': if (ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -312,7 +312,7 @@ StreamedType Stream::get() { } break; - case L'>': + case '>': break; default: std::stringstream Message; @@ -329,7 +329,7 @@ StreamedType Stream::get() { .TheTags.push_back(Tag()); ThePreviousCase = PreviousCaseType(Character_); continue; - case L'>': + case '>': if (!ThePreviousCase) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -339,10 +339,10 @@ StreamedType Stream::get() { } switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'<': + case '<': if (ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -365,17 +365,17 @@ StreamedType Stream::get() { } std::abort(); - case L'#': + case '#': //std::cerr << "[391] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': - case L'^': - case L'$': + case '[': + case ']': + case '^': + case '$': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'/': + case '/': if (ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -388,7 +388,7 @@ StreamedType Stream::get() { } break; - case L'>': + case '>': if (!ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -418,17 +418,17 @@ StreamedType Stream::get() { push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'+': + case '+': if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': - case L'^': - case L'/': - case L'$': + case '[': + case ']': + case '^': + case '/': + case '$': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'>': + case '>': if (!ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -441,7 +441,7 @@ StreamedType Stream::get() { } break; - case L'#': + case '#': if (ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -472,7 +472,7 @@ StreamedType Stream::get() { push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'$': + case '$': if (!ThePreviousCase) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -482,10 +482,10 @@ StreamedType Stream::get() { } switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': push_back_Character(TheStreamedType, Lemma, Character_); continue; - case L'*': + case '*': if (ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -506,7 +506,7 @@ StreamedType Stream::get() { ThePreviousCase = PreviousCaseType(Character_); return TheStreamedType; - case L'>': + case '>': if (!ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -518,7 +518,7 @@ StreamedType Stream::get() { } break; - case L'#': + case '#': if (ThePreviousCase->isPreviousCharacter) { std::stringstream Message; Message << "unexpected '" << Character_ @@ -541,12 +541,12 @@ StreamedType Stream::get() { ThePreviousCase = PreviousCaseType(Character_); return TheStreamedType; - case L'\n': + case '\n': if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': - case L']': - case L'$': + case '[': + case ']': + case '$': break; default: std::stringstream Message; @@ -572,8 +572,8 @@ StreamedType Stream::get() { if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L']': - case L'$': + case ']': + case '$': break; default: std::stringstream Message; @@ -610,14 +610,14 @@ bool Stream::peekIsBlank() { TheCharacterStream.clear(state); TheCharacterStream.seekg(pos); - return newline1 == L'\n' && newline2 == L'\n'; + return newline1 == '\n' && newline2 == '\n'; } bool Stream::flush_() const { return private_flush_; } void Stream::outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output, TaggerFlags &flags) { + std::ostream &output, TaggerFlags &flags) { using namespace std::rel_ops; output << "^"; @@ -680,9 +680,9 @@ UString Stream::Message_what(const std::stringstream &Message) const { what_ << UString(TheFilename->begin(), TheFilename->end()) << ": "; what_ << TheLineNumber << ":" << TheLine.size() << ": " << Message.str() - << L'\n' << TheLine << L'\n' << UString(TheLine.size() - 1, L' ') - << L'^'; - return what_.str(); + << '\n' << TheLine << '\n' << UString(TheLine.size() - 1, ' ') + << '^'; + return to_ustring(what_.str().c_str()); } bool @@ -709,7 +709,7 @@ bool Stream::isTheCharacterStream_eof(StreamedType &StreamedType_, return true; if (TheFlags.getNullFlush()) { - if (Character_ == L'\0') { + if (Character_ == '\0') { push_back_Character(StreamedType_, Lemma, Character_); private_flush_ = true; return true; @@ -724,45 +724,45 @@ void Stream::push_back_Character(StreamedType &StreamedType_, const wchar_t &Character_) { if (ThePreviousCase) { switch (ThePreviousCase->ThePreviousCase) { - case L'[': + case '[': StreamedType_.TheString += Character_; break; - case L']': + case ']': StreamedType_.TheString += Character_; break; - case L'^': + case '^': StreamedType_.TheLexicalUnit->TheSurfaceForm += Character_; break; - case L'/': + case '/': StreamedType_.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.back() .TheLemma.push_back(Character_); break; - case L'*': + case '*': Lemma += Character_; break; - case L'<': + case '<': StreamedType_.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.back() .TheTags.back() .TheTag += Character_; break; - case L'>': + case '>': StreamedType_.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.back() .TheLemma.push_back(Character_); break; - case L'#': + case '#': StreamedType_.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.back() .TheLemma.push_back(Character_); break; - case L'+': + case '+': StreamedType_.TheLexicalUnit->TheAnalyses.back() .TheMorphemes.back() .TheLemma.push_back(Character_); break; - case L'$': + case '$': StreamedType_.TheString += Character_; break; default: diff --git a/apertium/stream.h b/apertium/stream.h index 2836810..c0ed176 100644 --- a/apertium/stream.h +++ b/apertium/stream.h @@ -30,11 +30,11 @@ namespace Apertium { class Stream { public: Stream(TaggerFlags &Flags_); - Stream(TaggerFlags &Flags_, std::wifstream &CharacterStream_, + Stream(TaggerFlags &Flags_, std::ifstream &CharacterStream_, const char *const Filename_); - Stream(TaggerFlags &Flags_, std::wifstream &CharacterStream_, + Stream(TaggerFlags &Flags_, std::ifstream &CharacterStream_, const std::string &Filename_); - Stream(TaggerFlags &Flags_, std::wifstream &CharacterStream_, + Stream(TaggerFlags &Flags_, std::ifstream &CharacterStream_, const std::stringstream &Filename_); StreamedType get(); StreamedType peek(); @@ -43,7 +43,7 @@ public: static void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output, TaggerFlags &flags); + std::ostream &output, TaggerFlags &flags); std::size_t TheLineNumber; private: @@ -64,7 +64,7 @@ private: const wchar_t &Character_); void case_0x5c(StreamedType &StreamedType_, UString &Lemma, const wchar_t &Character_); - std::wistream &TheCharacterStream; + std::istream &TheCharacterStream; Optional TheFilename; UString TheLine; TaggerFlags &TheFlags; diff --git a/apertium/stream_tagger.cc b/apertium/stream_tagger.cc index 617588e..a00735c 100644 --- a/apertium/stream_tagger.cc +++ b/apertium/stream_tagger.cc @@ -9,7 +9,7 @@ StreamTagger::~StreamTagger() {} void StreamTagger::outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output) { + std::ostream &output) { Stream::outputLexicalUnit(lexical_unit, analysis, output, TheFlags); } } diff --git a/apertium/stream_tagger.h b/apertium/stream_tagger.h index 11e93ca..2d0e123 100644 --- a/apertium/stream_tagger.h +++ b/apertium/stream_tagger.h @@ -15,11 +15,11 @@ public: virtual ~StreamTagger(); virtual void serialise(std::ostream &Serialised_basic_Tagger) const = 0; virtual void deserialise(std::istream &Serialised_basic_Tagger) = 0; - virtual void tag(Stream &Input, std::wostream &Output) = 0; + virtual void tag(Stream &Input, std::ostream &Output) = 0; virtual void train(Stream &TaggedCorpus) = 0; void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output); + std::ostream &output); }; } diff --git a/apertium/string_utils.cc b/apertium/string_utils.cc index f6bae45..2df808d 100644 --- a/apertium/string_utils.cc +++ b/apertium/string_utils.cc @@ -20,6 +20,8 @@ #include #include #include +#include +#include #ifdef _MSC_VER #define snprintf _snprintf @@ -60,7 +62,7 @@ StringUtils::split_UString(UString const &input, UString const &delimiter) unsigned pos; int new_pos; vector result; - UString s = ""; + UString s; pos=0; while(pos const &v) { - UString s = ""; + UString s; for(unsigned i=0; i0) - s+=L' '; + s+=' '; s.append(v[i]); } return s; @@ -111,7 +113,9 @@ StringUtils::substitute(UString const &source, UString const &olds, UString cons UString StringUtils::itoa(int n) { - return XMLParseUtil::stows(itoa_string(n)); + UChar str[256]; + u_snprintf(str, 256, "%d", n); + return str; } string @@ -125,33 +129,104 @@ StringUtils::itoa_string(int n) UString StringUtils::ftoa(double f) { - char str[256]; - sprintf(str, "%f",f); - return XMLParseUtil::stows(str); + UChar str[256]; + u_snprintf(str, 256, "%f", f); + return str; } UString StringUtils::tolower(UString const &s) { - UString l=s; - for(unsigned i=0; i"; + UString ret; + ret.reserve(TheTag.size() + 2); + ret += '<'; + ret.append(TheTag); + ret += '>'; + return ret; } } diff --git a/apertium/tagger.cc b/apertium/tagger.cc index bb78420..6e8372c 100644 --- a/apertium/tagger.cc +++ b/apertium/tagger.cc @@ -550,26 +550,14 @@ void apertium_tagger::init_FILE_Tagger(FILE_Tagger &FILE_Tagger_, string const & MorphoStream* apertium_tagger::setup_untagged_morpho_stream( FILE_Tagger &FILE_Tagger_, char *DicFn, char *UntaggedFn, - FILE **Dictionary, UFILE* *UntaggedCorpus) { - if (*TheFunctionType != Retrain) { - *Dictionary = try_open_file_utf8("DICTIONARY", DicFn, "r"); - } + UFILE* *UntaggedCorpus) { *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", UntaggedFn, "r"); - FILE_Tagger_.read_dictionary(*Dictionary); + FILE_Tagger_.read_dictionary(DicFn); return new FileMorphoStream(UntaggedFn, true, &FILE_Tagger_.get_tagger_data()); } -void apertium_tagger::close_untagged_files( - char *DicFn, char *UntaggedFn, - FILE *Dictionary, UFILE* UntaggedCorpus) { - if (*TheFunctionType == Supervised || *TheFunctionType == Train) { - try_close_file("DICTIONARY", DicFn, Dictionary); - } - try_close_file("UNTAGGED_CORPUS", UntaggedFn, UntaggedCorpus); -} - /** Implementation of flags/subcommands */ void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { @@ -595,7 +583,7 @@ void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { return; } - std::wifstream Input_stream; + std::ifstream Input_stream; try_open_fstream("INPUT", argv[optind + 1], Input_stream); if (nonoptarg < 3) { @@ -604,7 +592,7 @@ void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { return; } - std::wofstream Output_stream; + std::ofstream Output_stream; try_open_fstream("OUTPUT", argv[optind + 2], Input_stream); Stream Input(TheFlags, Input_stream, argv[optind + 1]); @@ -628,12 +616,12 @@ void apertium_tagger::s_StreamTaggerTrainer( expect_file_arguments(nonoptarg, 2); } - std::wifstream TaggedCorpus_stream; + std::ifstream TaggedCorpus_stream; try_open_fstream("TAGGED_CORPUS", argv[optind + 1], TaggedCorpus_stream); Stream TaggedCorpus(TheFlags, TaggedCorpus_stream, argv[optind + 1]); if (*TheFunctionTypeType == Perceptron) { - std::wifstream UntaggedCorpus_stream; + std::ifstream UntaggedCorpus_stream; try_open_fstream("UNTAGGED_CORPUS", argv[optind + 2], UntaggedCorpus_stream); Stream UntaggedCorpus(TheFlags, UntaggedCorpus_stream, argv[optind + 2]); @@ -661,21 +649,16 @@ void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger); TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); TaggerWord::generate_marks = TheFlags.getMark(); - if (nonoptarg < 2) - FILE_Tagger_.tagger(stdin, stdout); - else { - UFILE* Input = try_open_file("INPUT", argv[optind + 1], "r"); - - if (nonoptarg < 3) - FILE_Tagger_.tagger(Input, stdout); - else { - UFILE* Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); - FILE_Tagger_.tagger(Input, Output); - try_close_file("OUTPUT", argv[optind + 2], Output); + const char* infile = NULL; + UFILE* Output = u_finit(stdout, NULL, NULL); + if (nonoptarg >= 2) { + infile = argv[optind + 1]; + if (nonoptarg >= 3) { + Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); } - - try_close_file("INPUT", argv[optind + 1], Input); } + FILE_Tagger_.tagger(infile, Output); + u_fclose(Output); } void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { @@ -701,13 +684,11 @@ void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { MorphoStream* ms = setup_untagged_morpho_stream( FILE_Tagger_, NULL, UntaggedFn, - NULL, &UntaggedCorpus); + &UntaggedCorpus); FILE_Tagger_.train(*ms, TheFunctionTypeOptionArgument); delete ms; - close_untagged_files( - NULL, UntaggedFn, - NULL, UntaggedCorpus); + u_fclose(UntaggedCorpus); Serialised_FILE_Tagger = try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); @@ -732,27 +713,20 @@ void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { &TsxFn, &ProbFn); init_FILE_Tagger(FILE_Tagger_, TsxFn); - FILE *Dictionary; UFILE* UntaggedCorpus; MorphoStream* ms = setup_untagged_morpho_stream( FILE_Tagger_, DicFn, UntaggedFn, - &Dictionary, &UntaggedCorpus); - UFILE* TaggedCorpus = try_open_file("TAGGED_CORPUS", TaggedFn, "r"); - FileMorphoStream tms(TaggedCorpus, true, &FILE_Tagger_.get_tagger_data()); + &UntaggedCorpus); + FileMorphoStream tms(TaggedFn, true, &FILE_Tagger_.get_tagger_data()); FILE_Tagger_.init_probabilities_from_tagged_text_(tms, *ms); - try_close_file("TAGGED_CORPUS", TaggedFn, TaggedCorpus); delete ms; - close_untagged_files( - DicFn, UntaggedFn, - Dictionary, UntaggedCorpus); + u_fclose(UntaggedCorpus); if (do_unsup) { - UFILE* Corpus = try_open_file_utf8("CORPUS", CrpFn, "r"); - FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); - try_close_file("CORPUS", CrpFn, Corpus); - } + FILE_Tagger_.train(CrpFn, TheFunctionTypeOptionArgument); + } FILE *Serialised_FILE_Tagger = try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); @@ -774,18 +748,15 @@ void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { &TsxFn, &ProbFn); init_FILE_Tagger(FILE_Tagger_, TsxFn); - FILE *Dictionary; UFILE* UntaggedCorpus; MorphoStream* ms = setup_untagged_morpho_stream( FILE_Tagger_, DicFn, UntaggedFn, - &Dictionary, &UntaggedCorpus); + &UntaggedCorpus); FILE_Tagger_.init_and_train(*ms, TheFunctionTypeOptionArgument); delete ms; - close_untagged_files( - DicFn, UntaggedFn, - Dictionary, UntaggedCorpus); + u_fclose(UntaggedCorpus); FILE *Serialised_FILE_Tagger = try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); diff --git a/apertium/tagger.h b/apertium/tagger.h index 18be1e0..cf3017b 100644 --- a/apertium/tagger.h +++ b/apertium/tagger.h @@ -62,10 +62,7 @@ private: MorphoStream* setup_untagged_morpho_stream( FILE_Tagger &FILE_Tagger_, char *DicFn, char *UntaggedFn, - FILE **Dictionary, UFILE **UntaggedCorpus); - void close_untagged_files( - char *DicFn, char *UntaggedFn, - FILE *Dictionary, UFILE *UntaggedCorpus); + UFILE **UntaggedCorpus); void g_StreamTagger(StreamTagger &StreamTagger_); void s_StreamTaggerTrainer(StreamTagger &StreamTaggerTrainer_); diff --git a/apertium/tagger_data_percep_coarse_tags.cc b/apertium/tagger_data_percep_coarse_tags.cc index 3e5496f..ae7ce9f 100644 --- a/apertium/tagger_data_percep_coarse_tags.cc +++ b/apertium/tagger_data_percep_coarse_tags.cc @@ -52,7 +52,7 @@ const UString& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd const Alphabet alphabet = plist.getAlphabet(); int ca_any_char = alphabet(PatternList::ANY_CHAR); int ca_any_tag = alphabet(PatternList::ANY_TAG); - map::const_iterator undef_it = tag_index.find("TAG_kUNDEF"); + map::const_iterator undef_it = tag_index.find("TAG_kUNDEF"_u); int ca_tag_kundef = undef_it->second; // Input lemma ms.init(me->getInitial()); @@ -61,7 +61,11 @@ const UString& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd } // Input fine tags for (size_t i = 0; i < wrd.TheTags.size(); i++) { - int symbol = alphabet("<" + wrd.TheTags[i].TheTag + ">"); + UString tag; + tag += '<'; + tag.append(wrd.TheTags[i].TheTag); + tag += '>'; + int symbol = alphabet(tag); if (symbol) { ms.step(symbol, ca_any_tag); } diff --git a/apertium/tagger_utils.cc b/apertium/tagger_utils.cc index 09e5fc2..0aa093f 100644 --- a/apertium/tagger_utils.cc +++ b/apertium/tagger_utils.cc @@ -67,6 +67,15 @@ void tagger_utils::clear_array_vector(vector v[], int l) { int tagger_utils::ntokens_multiword(UString const &s) { + vector tmp = StringUtils::split_UString(s, "_"_u); + int n = 0; + for (auto& it : tmp) { + if (!it.empty()) { + n++; + } + } + return n; + /* wchar_t *news = new wchar_t[s.size()+1]; wcscpy(news, s.c_str()); news[s.size()] = 0; @@ -84,10 +93,20 @@ int tagger_utils::ntokens_multiword(UString const &s) delete[] news; return n; + */ } int tagger_utils::nguiones_fs(UString const & s) { - wchar_t *news = new wchar_t[s.size()+1]; + vector tmp = StringUtils::split_UString(s, "-"_u); + int n = 0; + for (auto& it : tmp) { + if (!it.empty()) { + n++; + } + } + return n; + /* + UChar *news = new UChar[s.size()+1]; wcscpy(news, s.c_str()); news[s.size()] = 0; cerr << news << endl; @@ -103,12 +122,14 @@ int tagger_utils::nguiones_fs(UString const & s) { delete[] news; return n; + */ } UString tagger_utils::trim(UString s) { - if (s.length()==0) - return ""; + if (s.empty()) { + return ""_u; + } for (unsigned int i=0; i<(s.length()-1); i++) { if ((s.at(i)==L' ')&&(s.at(i+1)==L' ')) { @@ -125,7 +146,7 @@ UString tagger_utils::trim(UString s) return s; } -void tagger_utils::scan_for_ambg_classes(FILE *fdic, TaggerData &td) { +void tagger_utils::scan_for_ambg_classes(const char* fdic, TaggerData &td) { Collection &output = td.getOutput(); FileMorphoStream morpho_stream(fdic, true, &td); tagger_utils::scan_for_ambg_classes(output, morpho_stream); @@ -199,26 +220,29 @@ void tagger_utils::require_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, int nw) { if (td.getOutput().has_not(tags)) { UString errors; - errors = "A new ambiguity class was found. I cannot continue.\n"; - errors+= "Word '" + word.get_superficial_form() + "' not found in the dictionary.\n"; - errors+= "New ambiguity class: " + word.get_string_tags() + "\n"; + errors = "A new ambiguity class was found. I cannot continue.\nWord '"_u; + errors += word.get_superficial_form(); + errors += "' not found in the dictionary.\n"_u; + errors += "New ambiguity class: "_u; + errors += word.get_string_tags(); + errors += '\n'; if (nw >= 0) { - std::wostringstream ws; + std::ostringstream ws; ws << (nw + 1); - errors+= "Line number: " + ws.str() + "\n"; + errors += "Line number: "_u; + errors += to_ustring(ws.str().c_str()); + errors += '\n'; } - errors+= "Take a look at the dictionary, then retrain."; + errors += "Take a look at the dictionary, then retrain."_u; fatal_error(errors); } } static void _warn_absent_ambiguity_class(TaggerWord &word) { - UString errors; - errors = "A new ambiguity class was found. \n"; - errors += "Retraining the tagger is necessary so as to take it into account.\n"; - errors += "Word '" + word.get_superficial_form() + "'.\n"; - errors += "New ambiguity class: " + word.get_string_tags() + "\n"; - cerr << "Error: " << errors; + cerr << "Error: A new ambiguity class was found. \n"; + cerr << "Retraining the tagger is necessary so as to take it into account.\n"; + cerr << "Word '" << word.get_superficial_form() << "'.\n"; + cerr << "New ambiguity class: " << word.get_string_tags() << "\n"; } set & @@ -265,7 +289,7 @@ istream& operator>> (istream& is, map & f) { is>>i; // warning: does not work if both is>>f[i]; // lines merged in a single one } - if (is.bad()) tagger_utils::fatal_error("reading map"); + if (is.bad()) tagger_utils::fatal_error("reading map"_u); return is; } diff --git a/apertium/tagger_utils.h b/apertium/tagger_utils.h index abb0ef6..31b6458 100644 --- a/apertium/tagger_utils.h +++ b/apertium/tagger_utils.h @@ -74,7 +74,7 @@ int nguiones_fs(UString const &cadena); * @param fdic the input stream with the expanded dictionary to read * @param td the tagger data instance to mutate */ -void scan_for_ambg_classes(FILE *fdic, TaggerData &td); +void scan_for_ambg_classes(const char* fdic, TaggerData &td); void scan_for_ambg_classes(Collection &output, MorphoStream &morpho_stream); void add_neccesary_ambg_classes(TaggerData &td); diff --git a/apertium/tagger_word.cc b/apertium/tagger_word.cc index add71f3..cb593b5 100644 --- a/apertium/tagger_word.cc +++ b/apertium/tagger_word.cc @@ -33,7 +33,6 @@ map TaggerWord::patterns; TaggerWord::TaggerWord(bool prev_plus_cut) : show_sf(false) { - ignored_string = ""; plus_cut=false; previous_plus_cut=prev_plus_cut; } @@ -75,28 +74,27 @@ bool TaggerWord::match(UString const &s, UString const &pattern) { map::iterator it = patterns.find(pattern); - string const utfs = UtfConverter::toUtf8(s); if(it == patterns.end()) { - string utfpattern = UtfConverter::toUtf8(pattern); - string regexp = ""; + UString utfpattern = pattern; + UString regexp; while(true) { - size_t pos = utfpattern.find("<*>"); - if(pos == string::npos) + size_t pos = utfpattern.find("<*>"_u); + if(pos == UString::npos) { break; } - utfpattern.replace(pos, 3, "(<[^>]+>)+"); + utfpattern.replace(pos, 3, "(<[^>]+>)+"_u); } patterns[pattern].compile(utfpattern); - return patterns[pattern].match(utfs) != ""; + return !patterns[pattern].match(s).empty(); } else { - return it->second.match(utfs) != ""; + return !it->second.match(s).empty(); } } @@ -137,20 +135,20 @@ TaggerWord::get_string_tags() { UString st; set::iterator itag = tags.begin(); - st="{"; + st += '{'; for(itag=tags.begin(); itag!=tags.end(); itag++) { if (itag!=tags.begin()) - st+=L','; + st+=','; st+=array_tags[*itag]; } - st += L'}'; + st += '}'; return st; } UString TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { - UString ret= ""; + UString ret; if (show_ignored_string) ret.append(ignored_string); @@ -158,30 +156,27 @@ TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { if(t==TAG_kEOF) return ret; - if (!previous_plus_cut){ - if(TaggerWord::generate_marks && isAmbiguous()) - { - ret.append("^="); - } - else - { - ret += L'^'; + if (!previous_plus_cut) { + if(TaggerWord::generate_marks && isAmbiguous()) { + ret.append("^="_u); + } else { + ret += '^'; } - if(get_show_sf()){ // append the superficial form + if(get_show_sf()) { // append the superficial form ret.append(superficial_form); - ret+=L'/'; + ret += '/'; } } if (lexical_forms.size()==0) { // This is an UNKNOWN WORD - ret +=L'*'; + ret += '*'; ret.append(superficial_form); - } else if ((*lexical_forms.begin()).second[0]==L'*') { //This is an + } else if ((*lexical_forms.begin()).second[0]=='*') { //This is an //unknown word //that has //been guessed - ret += L'*'; + ret += '*'; ret.append(superficial_form); } else if (lexical_forms.size()>1) { //This is an ambiguous word ret.append(lexical_forms[t]); @@ -191,9 +186,9 @@ TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { if (ret != ignored_string) { if (plus_cut) - ret+=L'+'; + ret += '+'; else { - ret += L'$'; + ret += '$'; } } @@ -209,50 +204,47 @@ TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { UString TaggerWord::get_all_chosen_tag_first(TTag &t, int const TAG_kEOF) { - UString ret=""; + UString ret; - if (show_ignored_string) + if (show_ignored_string) { ret.append(ignored_string); + } - if(t==TAG_kEOF) + if(t==TAG_kEOF) { return ret; + } - if (!previous_plus_cut) - { - if(TaggerWord::generate_marks && isAmbiguous()) - { - ret.append("^="); - } - else - { - ret += L'^'; + if (!previous_plus_cut) { + if(TaggerWord::generate_marks && isAmbiguous()) { + ret.append("^="_u); + } else { + ret += '^'; } } ret.append(superficial_form); if (lexical_forms.size()==0) { // This is an UNKNOWN WORD - ret+="/*"; + ret += "/*"_u; ret.append(superficial_form); } else { - ret+="/"; + ret+="/"_u; ret.append(lexical_forms[t]); if (lexical_forms.size()>1) { - set::iterator it; - for (it=tags.begin(); it!=tags.end(); it++) { - if (*it != t) { - ret+="/"; - ret.append(lexical_forms[*it]); - } + for (auto& it : tags) { + if (it != t) { + ret += '/'; + ret.append(lexical_forms[it]); + } } } } if (ret != ignored_string) { - if (plus_cut) - ret+="+"; - else { - ret+="$"; + if (plus_cut) { + ret += '+'; + } else { + ret += '$'; } } @@ -264,25 +256,26 @@ UString TaggerWord::get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF) { UString ret; - if(t==TAG_kEOF) + if(t==TAG_kEOF) { return ret; + } if (lexical_forms.size()==0) { //This is an unknown word - ret.append("*^"); - ret.append(superficial_form); + ret.append("*^"_u); + ret.append(superficial_form); } else if ((*lexical_forms.begin()).second[0]=='*') { //This is an unknown word that has been guessed - ret.append("*^"); + ret.append("*^"_u); ret.append(superficial_form); } else { - ret += L'^'; + ret += '^'; ret.append(lexical_forms[t]); } if (ret.length() != 0) { if (plus_cut) - ret+=L'+'; + ret += '+'; else { - ret +=L'$'; + ret += '$'; } } @@ -328,7 +321,7 @@ TaggerWord::print() } void -TaggerWord::outputOriginal(FILE *output) { +TaggerWord::outputOriginal(UFILE *output) { UString s=superficial_form; diff --git a/apertium/tmx_builder.cc b/apertium/tmx_builder.cc index 84e8e5a..1de47d5 100644 --- a/apertium/tmx_builder.cc +++ b/apertium/tmx_builder.cc @@ -65,29 +65,29 @@ TMXBuilder::~TMXBuilder() UString TMXBuilder::restOfBlank(InputFile& input) { - UString result = "["; + UString result = "["_u; while(true) { - wint_t val = fgetwc(input); - if(feof(input)) + UChar32 val = input.get(); + if(input.eof()) { - return ""; + return ""_u; } switch(val) { - case L'\\': - result += L'\\'; - val = fgetwc(input); - if(feof(input)) + case '\\': + result += '\\'; + val = input.get(); + if(input.eof()) { - return ""; + return ""_u; } result += static_cast(val); break; - case L']': - result += L']'; + case ']': + result += ']'; return result; default: @@ -96,27 +96,26 @@ TMXBuilder::restOfBlank(InputFile& input) } } - return ""; + return ""_u; } UString TMXBuilder::nextBlank(InputFile& input) { - UString result = ""; + UString result; while(true) { - wint_t val = fgetwc(input); - if(feof(input)) - { - return ""; + UChar32 val = input.get(); + if(input.eof()) { + return ""_u; } switch(val) { - case L'\\': - fgetwc(input); + case '\\': + input.get(); break; - case L'[': + case '[': result = restOfBlank(input); return result; @@ -130,7 +129,7 @@ TMXBuilder::compatible(InputFile& f1, InputFile& f2, bool lazy) UString s1 = nextBlank(f1), s2 = nextBlank(f2); if(!lazy) { - while(!feof(f1) && !feof(f2)) + while(!f1.eof() && !f2.eof()) { if(s1 != s2) { @@ -142,7 +141,7 @@ TMXBuilder::compatible(InputFile& f1, InputFile& f2, bool lazy) } else { - while(!feof(f1) && !feof(f2)) + while(!f1.eof() && !f2.eof()) { if(s1.size() < s2.size()*(1-0.05) || s1.size() > s2.size()*(1+0.05)) { @@ -186,77 +185,59 @@ TMXBuilder::check(string const &file1, string const &file2, bool lazy) UString TMXBuilder::nextTU(InputFile& input) { - UString current_tu = ""; + UString current_tu; UString tmp; while(true) { - wint_t symbol = fgetwc_unlocked(input); - if(feof(input)) - { - if(current_tu == "") - { - return ""; - } - else - { - return current_tu; - } + UChar32 symbol = input.get(); + if(input.eof()) { + return current_tu; } switch(symbol) { - case L'\\': - symbol = fgetwc_unlocked(input); - if(feof(input)) - { - if(current_tu == "") - { - return ""; - } - else - { - return current_tu; - } + case '\\': + symbol = input.get(); + if(input.eof()) { + return current_tu; } // continued down default: - current_tu += static_cast(symbol); + current_tu += symbol; break; - case L'[': + case '[': tmp = restOfBlank(input); - if(tmp.substr(0,2) == "[ ") + if(tmp.substr(0,2) == "[ "_u) { - current_tu.append(" "); + current_tu += ' '; } - current_tu.append(""); - if(tmp.substr(tmp.size()-2, 2) == " ]") + current_tu.append(""_u); + if(tmp.substr(tmp.size()-2, 2) == " ]"_u) { - current_tu.append(" "); + current_tu += ' '; } break; - case L'.': - current_tu += L'.'; - symbol = fgetwc_unlocked(input); + case '.': + current_tu += '.'; + symbol = input.get(); - if(symbol != L'[' && !iswspace(symbol)) + if(symbol != '[' && !iswspace(symbol)) { - if(!feof(input)) - { - ungetwc(symbol, input); + if (!input.eof()) { + input.unget(symbol); } } else { - if(!feof(input)) - { - ungetwc(symbol, input); + if (!input.eof()) { + input.unget(symbol); } return current_tu; /* size_t idx = current_tu.size()-1; - while(current_tu[idx] == L'.') + while(current_tu[idx] == '.') { idx--; } @@ -264,8 +245,8 @@ TMXBuilder::nextTU(InputFile& input) } break; - case L'?': - case L'!': + case '?': + case '!': current_tu += static_cast(symbol); return current_tu; } @@ -277,31 +258,31 @@ TMXBuilder::nextTU(InputFile& input) UString TMXBuilder::xmlize(UString const &str) { - UString result = ""; + UString result; for(size_t i = 0, limit = str.size(); i < limit; i++) { switch(str[i]) { - case L'<': - if(i + 5 <= limit && str.substr(i,5)=="") + case '<': + if(i + 5 <= limit && str.substr(i,5)==""_u) { - result.append(""); + result.append(""_u); i += 4; break; } else { - result.append("<"); + result.append("<"_u); } break; - case L'>': - result.append(">"); + case '>': + result.append(">"_u); break; - case L'&': - result.append("&"); + case '&': + result.append("&"_u); break; default: @@ -316,7 +297,7 @@ TMXBuilder::xmlize(UString const &str) while(cambio == true) { cambio = false; - while(result.size() >= 5 && result.substr(0,5) == "") + while(result.size() >= 5 && result.substr(0,5) == ""_u) { result = result.substr(5); cambio = true; @@ -333,7 +314,7 @@ TMXBuilder::xmlize(UString const &str) while(cambio == true) { cambio = false; - while(result.size() > 5 && result.substr(result.size()-5) == "") + while(result.size() > 5 && result.substr(result.size()-5) == ""_u) { result = result.substr(0, result.size()-5); cambio = true; @@ -383,18 +364,10 @@ TMXBuilder::generate(string const &file1, string const &file2, } InputFile f1; - if (!f1.open(file1.c_str())) { - cerr << "Error: file '" << file1; - cerr << "' cannot be opened for reading" << endl; - exit(EXIT_FAILURE); - } + f1.open_or_exit(file1.c_str()); InputFile f2; - if (!f2.open(file2.c_str())) { - cerr << "Error: file '" << file2; - cerr << "' cannot be opened for reading" << endl; - exit(EXIT_FAILURE); - } + f2.open_or_exit(file2.c_str()); generateTMX(f1, f2, output); } @@ -420,8 +393,7 @@ TMXBuilder::sentenceList(InputFile& file) while(true) { UString f = nextTU(file); - if(feof(file)) - { + if(file.eof()) { break; } retval.push_back(f); @@ -470,19 +442,19 @@ TMXBuilder::argmin(int nw, int n, int w) void TMXBuilder::generateTMX(InputFile& f1, InputFile& f2, UFILE* output) { - fprintf(output, "\n"); - fprintf(output, "\n"); - fprintf(output, "
\n"); - fprintf(output, "
\n"); - fprintf(output, "\n"); + u_fprintf(output, "\n"); + u_fprintf(output, "\n"); + u_fprintf(output, "
\n"); + u_fprintf(output, "
\n"); + u_fprintf(output, "\n"); outputTU(f1, f2, output); - fprintf(output, "\n
\n"); + u_fprintf(output, "\n
\n"); } @@ -516,7 +488,7 @@ TMXBuilder::printTUCond(UFILE *output, UString const &tu1, UString const &tu2, b void TMXBuilder::splitAndMove(InputFile& f1, string const &filename) { - UFILE* stream = u_fopen(file.c_str(), "w", NULL, NULL); + UFILE* stream = u_fopen(filename.c_str(), "w", NULL, NULL); vector fichero_por_cadenas = sentenceList(f1); for (auto& it : fichero_por_cadenas) { u_fprintf(stream, "%S\n", it.c_str()); @@ -532,10 +504,8 @@ TMXBuilder::outputTU(InputFile& f1, InputFile& f2, UFILE* output) string out = tmpnam(NULL); splitAndMove(f1, left); - fclose(f1); splitAndMove(f2, right); - fclose(f2); TMXAligner::DictionaryItems dict; AlignParameters ap; @@ -806,7 +776,7 @@ TMXBuilder::filter(UString const &tu) if(!has_text || count_blank <= 2 || tu.size() == 0) { - return ""; + return ""_u; } return xmlize(tu); @@ -818,16 +788,12 @@ TMXBuilder::printTU(UFILE* output, UString const &tu1, UString const &tu2) const UString tu1_filtered = filter(tu1); UString tu2_filtered = filter(tu2); - if(tu1_filtered != "" && tu2_filtered != "") - { - - fprintf(output, "\n %s\n", - UtfConverter::toUtf8(lang1).c_str(), - UtfConverter::toUtf8(tu1_filtered).c_str()); + if (tu1_filtered.empty() && !tu2_filtered.empty()) { + u_fprintf(output, "\n %S\n", + lang1.c_str(), tu1_filtered.c_str()); - fprintf(output, " %s\n\n", - UtfConverter::toUtf8(lang2).c_str(), - UtfConverter::toUtf8(tu2_filtered).c_str()); + u_fprintf(output, " %S\n\n", + lang2.c_str(), tu2_filtered.c_str()); } } @@ -953,7 +919,7 @@ TMXBuilder::setEditDistancePercent(double e) bool TMXBuilder::isRemovablePunct(wchar_t const &c) { - return c == L'.'; + return c == '.'; } bool @@ -989,7 +955,7 @@ TMXBuilder::setTranslation(string const &filename) freference = fopen(filename.c_str(), "r"); if(!freference) { - cerr << "Error: file '" << UtfConverter::fromUtf8(filename); + cerr << "Error: file '" << filename; cerr << "' cannot be opened for reading" << endl; freference = NULL; } diff --git a/apertium/tmx_builder.h b/apertium/tmx_builder.h index cc5d432..6fea3b8 100644 --- a/apertium/tmx_builder.h +++ b/apertium/tmx_builder.h @@ -20,6 +20,7 @@ #include #include #include +#include using namespace std; @@ -41,7 +42,7 @@ private: static UString restOfBlank(InputFile& input); static UString nextBlank(InputFile& input); static UString xmlize(UString const &str); - static bool compatible(InputFile& input, UFILE* output, bool lazy = false); + static bool compatible(InputFile& input, InputFile& output, bool lazy = false); void generateTMX(InputFile& f1, InputFile& f2, UFILE* output); void outputTU(InputFile& f1, InputFile& f2, UFILE* output); static vector reverseList(vector const &v); diff --git a/apertium/transfer.cc b/apertium/transfer.cc index 3bf588d..b2fcdbb 100644 --- a/apertium/transfer.cc +++ b/apertium/transfer.cc @@ -20,128 +20,34 @@ #include #include #include -#include +#include #include #include #include #include -#ifdef _WIN32 -#include -#endif - using namespace Apertium; using namespace std; -void -Transfer::destroy() -{ - if(me) - { - delete me; - me = NULL; - } - if(doc) - { - xmlFreeDoc(doc); - doc = NULL; - } -} - Transfer::Transfer() : word(0), lword(0), last_lword(0), output(0), -any_char(0), -any_tag(0), nwords(0) { - me = NULL; - doc = NULL; - root_element = NULL; lastrule = NULL; defaultAttrs = lu; useBilingual = true; preBilingual = false; isExtended = false; - null_flush = false; - internal_null_flush = false; - trace = false; trace_att = false; in_lu = false; - in_let_var = false; in_out = false; in_wblank = false; } -Transfer::~Transfer() -{ - destroy(); -} - -void -Transfer::readData(FILE *in) -{ - alphabet.read(in); - any_char = alphabet(TRXReader::ANY_CHAR); - any_tag = alphabet(TRXReader::ANY_TAG); - - Transducer t; - t.read(in, alphabet.size()); - - map finals; - - // finals - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - int key = Compression::multibyte_read(in); - finals[key] = Compression::multibyte_read(in); - } - - me = new MatchExe(t, finals); - - // attr_items - bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); - attr_items[cad_k].read(in); - UString fallback = Compression::string_read(in); - if(recompile_attrs) { - attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); - } - } - - // variables - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); - variables[cad_k] = UtfConverter::toUtf8(Compression::string_read(in)); - } - - // macros - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); - macros[cad_k] = Compression::multibyte_read(in); - } - - // lists - for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) - { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); - - for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) - { - UString const cad_v = Compression::string_read(in); - lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); - listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); - } - } -} - void Transfer::readBil(string const &fstfile) { @@ -175,194 +81,38 @@ void Transfer::read(string const &transferfile, string const &datafile, string const &fstfile) { - readTransfer(transferfile); - - // datafile - FILE *in = fopen(datafile.c_str(), "rb"); - if(!in) - { - cerr << "Error: Could not open file '" << datafile << "'." << endl; - exit(EXIT_FAILURE); + TransferBase::read(transferfile.c_str(), datafile.c_str()); + if (getattr(root_element, "default") == "chunk"_u) { + defaultAttrs = chunk; + } else { + defaultAttrs = lu; } - readData(in); - fclose(in); - - if(fstfile != "") - { + if (!fstfile.empty()) { readBil(fstfile); } } -void -Transfer::readTransfer(string const &in) -{ - doc = xmlReadFile(in.c_str(), NULL, 0); - - if(doc == NULL) - { - cerr << "Error: Could not parse file '" << in << "'." << endl; - exit(EXIT_FAILURE); - } - - root_element = xmlDocGetRootElement(doc); - - // search for root element attributes - for(xmlAttr *i = root_element->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "default")) - { - if(!xmlStrcmp(i->children->content, (const xmlChar *) "chunk")) - { - defaultAttrs = chunk; - } - else - { - defaultAttrs = lu; // default value for 'default' - } - } - } - - // search for macros & rules - for(xmlNode *i = root_element->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) - { - collectMacros(i); - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) - { - collectRules(i); - } - } - } -} - -void -Transfer::collectRules(xmlNode *localroot) -{ - for(xmlNode *rule = localroot->children; rule != NULL; rule = rule->next) - { - if(rule->type == XML_ELEMENT_NODE) - { - size_t line = rule->line; - for(xmlNode *rulechild = rule->children; ; rulechild = rulechild->next) - { - if(rulechild->type == XML_ELEMENT_NODE && !xmlStrcmp(rulechild->name, (const xmlChar *) "action")) - { - rule_map.push_back(rulechild); - rule_lines.push_back(line); - break; - } - } - } - } -} - -void -Transfer::collectMacros(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - macro_map.push_back(i); - } - } -} - bool Transfer::checkIndex(xmlNode *element, int index, int limit) { if(index >= limit) { - cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": index >= limit" << endl; + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index >= limit" << endl; return false; } if(index < 0) { - cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": index < 0" << endl; + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index < 0" << endl; return false; } if(word[index] == 0) { - cerr << "Error in " << UtfConverter::fromUtf8((char *) doc->URL) << ": line " << element->line << ": Null access at word[index]" << endl; + cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": Null access at word[index]" << endl; return false; } return true; } -bool -Transfer::gettingLemmaFromWord(string attr) -{ - return (attr.compare("lem") == 0 || attr.compare("lemh") == 0 || attr.compare("whole") == 0); -} - -string -Transfer::combineWblanks(string wblank_current, string wblank_to_add) -{ - if(wblank_current.empty() && wblank_to_add.empty()) - { - return wblank_current; - } - else if(wblank_current.empty()) - { - return wblank_to_add; - } - else if(wblank_to_add.empty()) - { - return wblank_current; - } - - string new_out_wblank; - for(string::const_iterator it = wblank_current.begin(); it != wblank_current.end(); it++) - { - if(*it == '\\') - { - new_out_wblank += *it; - it++; - new_out_wblank += *it; - } - else if(*it == ']') - { - if(*(it+1) == ']') - { - new_out_wblank += ';'; - break; - } - } - else - { - new_out_wblank += *it; - } - } - - for(string::const_iterator it = wblank_to_add.begin(); it != wblank_to_add.end(); it++) - { - if(*it == '\\') - { - new_out_wblank += *it; - it++; - new_out_wblank += *it; - } - else if(*it == '[') - { - if(*(it+1) == '[') - { - new_out_wblank += ' '; - it++; - } - } - else - { - new_out_wblank += *it; - } - } - - return new_out_wblank; -} - -string +UString Transfer::evalString(xmlNode *element) { map::iterator it; @@ -420,13 +170,17 @@ Transfer::evalString(xmlNode *element) case ti_linkto_sl: if(checkIndex(element, ti.getPos(), lword)) { - if(word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()) != "") + if(!word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()).empty()) { - return "<" + string((char *) ti.getPointer()) + ">"; + UString ret; + ret += '<'; + ret += UString((UChar*) ti.getPointer()); + ret += '>'; + return ret; } else { - return ""; + return ""_u; } } break; @@ -434,13 +188,17 @@ Transfer::evalString(xmlNode *element) case ti_linkto_tl: if(checkIndex(element, ti.getPos(), lword)) { - if(word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()) != "") + if(!word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()).empty()) { - return "<" + string((char *) ti.getPointer()) + ">"; + UString ret; + ret += '<'; + ret += UString((UChar*) ti.getPointer()); + ret += '>'; + return ret; } else { - return ""; + return ""_u; } } break; @@ -448,13 +206,17 @@ Transfer::evalString(xmlNode *element) case ti_linkto_ref: if(checkIndex(element, ti.getPos(), lword)) { - if(word[ti.getPos()]->reference(attr_items[ti.getContent()], ti.getCondition()) != "") + if(!word[ti.getPos()]->reference(attr_items[ti.getContent()], ti.getCondition()).empty()) { - return "<" + string((char *) ti.getPointer()) + ">"; + UString ret; + ret += '<'; + ret += UString((UChar*) ti.getPointer()); + ret += '>'; + return ret; } else { - return ""; + return ""_u; } } break; @@ -473,7 +235,7 @@ Transfer::evalString(xmlNode *element) case ti_b: if(!blank_queue.empty()) { - string retblank = blank_queue.front(); + UString retblank = blank_queue.front(); if(in_out) { blank_queue.pop(); @@ -483,14 +245,14 @@ Transfer::evalString(xmlNode *element) } else { - return " "; + return " "_u; } break; case ti_get_case_from: if(checkIndex(element, ti.getPos(), lword)) { - return copycase(word[ti.getPos()]->source(attr_items[ti.getContent()]), + return StringUtils::copycase(word[ti.getPos()]->source(attr_items[ti.getContent()]), evalString((xmlNode *) ti.getPointer())); } break; @@ -498,34 +260,35 @@ Transfer::evalString(xmlNode *element) case ti_case_of_sl: if(checkIndex(element, ti.getPos(), lword)) { - return caseOf(word[ti.getPos()]->source(attr_items[ti.getContent()])); + return StringUtils::getcase(word[ti.getPos()]->source(attr_items[ti.getContent()])); } break; case ti_case_of_tl: if(checkIndex(element, ti.getPos(), lword)) { - return caseOf(word[ti.getPos()]->target(attr_items[ti.getContent()])); + return StringUtils::getcase(word[ti.getPos()]->target(attr_items[ti.getContent()])); } break; case ti_case_of_ref: if(checkIndex(element, ti.getPos(), lword)) { - return caseOf(word[ti.getPos()]->reference(attr_items[ti.getContent()])); + return StringUtils::getcase(word[ti.getPos()]->reference(attr_items[ti.getContent()])); } break; default: - return ""; + return ""_u; } - return ""; + return ""_u; } if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL, *side = NULL, *as = NULL; + xmlChar *side = NULL, *as = NULL; + UString part; bool queue = true; for(xmlAttr *i = element->properties; i != NULL; i = i->next) @@ -536,7 +299,7 @@ Transfer::evalString(xmlNode *element) } else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((const char*) i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { @@ -559,42 +322,42 @@ Transfer::evalString(xmlNode *element) { if(!xmlStrcmp(side, (const xmlChar *) "sl")) { - evalStringCache[element] = TransferInstr(ti_linkto_sl, (const char *) part, pos, (void *) as, queue); + evalStringCache[element] = TransferInstr(ti_linkto_sl, part, pos, (void *) as, queue); } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { - evalStringCache[element] = TransferInstr(ti_linkto_ref, (const char *) part, pos, (void *) as, queue); + evalStringCache[element] = TransferInstr(ti_linkto_ref, part, pos, (void *) as, queue); } else { - evalStringCache[element] = TransferInstr(ti_linkto_tl, (const char *) part, pos, (void *) as, queue); + evalStringCache[element] = TransferInstr(ti_linkto_tl, part, pos, (void *) as, queue); } } else if(!xmlStrcmp(side, (const xmlChar *) "sl")) { - evalStringCache[element] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue); + evalStringCache[element] = TransferInstr(ti_clip_sl, part, pos, NULL, queue); } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { - evalStringCache[element] = TransferInstr(ti_clip_ref, (const char *) part, pos, NULL, queue); + evalStringCache[element] = TransferInstr(ti_clip_ref, part, pos, NULL, queue); } else { - evalStringCache[element] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL, queue); + evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL, queue); } } else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) { evalStringCache[element] = TransferInstr(ti_lit_tag, - tags((const char *) element->properties->children->content), 0); + tags(to_ustring((const char *) element->properties->children->content)), 0); } else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) { - evalStringCache[element] = TransferInstr(ti_lit, string((char *) element->properties->children->content), 0); + evalStringCache[element] = TransferInstr(ti_lit, to_ustring((const char *) element->properties->children->content), 0); } else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) { - evalStringCache[element] = TransferInstr(ti_b, " ", -1); + evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); } else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) { @@ -609,16 +372,17 @@ Transfer::evalString(xmlNode *element) } } - evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param); + evalStringCache[element] = TransferInstr(ti_get_case_from, "lem"_u, pos, param); } else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) { - evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0); + evalStringCache[element] = TransferInstr(ti_var, getattr(element, "v"), 0); } else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) { int pos = 0; - xmlChar *part = NULL, *side = NULL; + xmlChar *side = NULL; + UString part; for(xmlAttr *i = element->properties; i != NULL; i = i->next) { @@ -628,7 +392,7 @@ Transfer::evalString(xmlNode *element) } else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((const char*) i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { @@ -638,20 +402,20 @@ Transfer::evalString(xmlNode *element) if(!xmlStrcmp(side, (const xmlChar *) "sl")) { - evalStringCache[element] = TransferInstr(ti_case_of_sl, (const char *) part, pos); + evalStringCache[element] = TransferInstr(ti_case_of_sl, part, pos); } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { - evalStringCache[element] = TransferInstr(ti_case_of_ref, (const char *) part, pos); + evalStringCache[element] = TransferInstr(ti_case_of_ref, part, pos); } else { - evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos); + evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); } } else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) { - string value; + UString value; for(xmlNode *i = element->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) @@ -666,7 +430,7 @@ Transfer::evalString(xmlNode *element) in_lu = true; out_wblank.clear(); - string myword; + UString myword; for(xmlNode *i = element->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) @@ -682,25 +446,30 @@ Transfer::evalString(xmlNode *element) out_wblank = word[0]->getWblank(); } - if(myword != "") + if(!myword.empty()) { - if(myword[0] != L'[' || myword[1] != L'[') + if(myword[0] != '[' || myword[1] != '[') { - return out_wblank+"^"+myword+"$"; + UString ret = out_wblank; + ret += '^'; + ret += myword; + ret += '$'; + return ret; } else { - return myword+"$"; + myword += '$'; + return myword; } } else { - return ""; + return ""_u; } } else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu")) { - string value; + UString value; bool first_time = true; out_wblank.clear(); @@ -711,7 +480,7 @@ Transfer::evalString(xmlNode *element) { in_lu = true; - string myword; + UString myword; for(xmlNode *j = i->children; j != NULL; j = j->next) { @@ -725,17 +494,16 @@ Transfer::evalString(xmlNode *element) if(!first_time) { - if(myword != "" && myword[0] != '#') //'+#' problem + if(!myword.empty() && myword[0] != '#') //'+#' problem { - value.append("+"); - } + value += '+'; + } } else { - if(myword != "") - { + if (!myword.empty()) { first_time = false; - } + } } value.append(myword); @@ -747,13 +515,17 @@ Transfer::evalString(xmlNode *element) out_wblank = word[0]->getWblank(); } - if(value != "") + if(!value.empty()) { - return out_wblank+"^"+value+"$"; + UString ret = out_wblank; + ret += '^'; + ret += value; + ret += '$'; + return ret; } else { - return ""; + return ""_u; } } else if(!xmlStrcmp(element->name, (const xmlChar *) "chunk")) @@ -785,7 +557,7 @@ Transfer::processOut(xmlNode *localroot) in_lu = true; out_wblank.clear(); - string myword; + UString myword; for(xmlNode *j = i->children; j != NULL; j = j->next) { if(j->type == XML_ELEMENT_NODE) @@ -803,16 +575,16 @@ Transfer::processOut(xmlNode *localroot) if(!myword.empty()) { - if(myword[0] != L'[' || myword[1] != L'[') + if(myword[0] != '[' || myword[1] != '[') { - u_fprintf(output, "%S^", out_blank.c_str()); + u_fprintf(output, "%S^", out_wblank.c_str()); } u_fprintf(output, "%S$", myword.c_str()); } } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { - string myword; + UString myword; bool first_time = true; out_wblank.clear(); @@ -822,7 +594,7 @@ Transfer::processOut(xmlNode *localroot) { in_lu = true; - string mylocalword; + UString mylocalword; for(xmlNode *k = j->children; k != NULL; k = k->next) { if(k->type == XML_ELEMENT_NODE) @@ -835,14 +607,14 @@ Transfer::processOut(xmlNode *localroot) if(!first_time) { - if(mylocalword != "" && mylocalword[0] != '#') //'+#' problem + if(!mylocalword.empty() && mylocalword[0] != '#') //'+#' problem { myword += '+'; } } else { - if(mylocalword != "") + if(!mylocalword.empty()) { first_time = false; } @@ -858,7 +630,7 @@ Transfer::processOut(xmlNode *localroot) } if(!myword.empty()) { - u_fprintf(output, "%S^%S$", out_blank.c_str(), myword.c_str()); + u_fprintf(output, "%S^%S$", out_wblank.c_str(), myword.c_str()); } } else { // 'b' @@ -882,39 +654,39 @@ Transfer::processOut(xmlNode *localroot) in_out = false; } -string +UString Transfer::processChunk(xmlNode *localroot) { - string name, namefrom; - string caseofchunk = "aa"; - string result; + UString name, namefrom; + UString caseofchunk = "aa"_u; + UString result; for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) { if(!xmlStrcmp(i->name, (const xmlChar *) "name")) { - name = (const char *) i->children->content; + name = to_ustring((const char *) i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "namefrom")) { - namefrom = (const char *) i->children->content; + namefrom = to_ustring((const char *) i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "case")) { - caseofchunk = (const char *) i->children->content; + caseofchunk = to_ustring((const char *) i->children->content); } } - result.append("^"); - if(caseofchunk != "") + result += '^'; + if(!caseofchunk.empty()) { - if(name != "") + if(!name.empty()) { - result.append(copycase(variables[caseofchunk], name)); + result.append(StringUtils::copycase(variables[caseofchunk], name)); } - else if(namefrom != "") + else if(!namefrom.empty()) { - result.append(copycase(variables[caseofchunk], variables[namefrom])); + result.append(StringUtils::copycase(variables[caseofchunk], variables[namefrom])); } else { @@ -924,11 +696,11 @@ Transfer::processChunk(xmlNode *localroot) } else { - if(name != "") + if(!name.empty()) { result.append(name); } - else if(namefrom != "") + else if(!namefrom.empty()) { result.append(variables[namefrom]); } @@ -946,14 +718,14 @@ Transfer::processChunk(xmlNode *localroot) if(!xmlStrcmp(i->name, (const xmlChar *) "tags")) { result.append(processTags(i)); - result.append("{"); + result += '{'; } else if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { in_lu = true; out_wblank.clear(); - string myword; + UString myword; for(xmlNode *j = i->children; j != NULL; j = j->next) { if(j->type == XML_ELEMENT_NODE) @@ -969,24 +741,24 @@ Transfer::processChunk(xmlNode *localroot) out_wblank = word[0]->getWblank(); } - if(myword != "") + if(!myword.empty()) { result.append(out_wblank); - result.append("^"); + result += '^'; result.append(myword); - result.append("$"); + result += '$'; } } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { bool first_time = true; - string myword; + UString myword; out_wblank.clear(); for(xmlNode *j = i->children; j != NULL; j = j->next) { - string mylocalword; + UString mylocalword; if(j->type == XML_ELEMENT_NODE) { in_lu = true; @@ -1003,7 +775,7 @@ Transfer::processChunk(xmlNode *localroot) if(!first_time) { - if(mylocalword != "" && mylocalword[0] != '#') // '+#' problem + if(!mylocalword.empty() && mylocalword[0] != '#') // '+#' problem { myword += '+'; } @@ -1021,12 +793,12 @@ Transfer::processChunk(xmlNode *localroot) out_wblank = word[0]->getWblank(); } - if(myword != "") + if(!myword.empty()) { result.append(out_wblank); - result.append("^"); + result += '^'; result.append(myword); - result.append("$"); + result += '$'; } } else // 'b' @@ -1035,14 +807,15 @@ Transfer::processChunk(xmlNode *localroot) } } } - result.append("}$"); + result += '}'; + result += '$'; return result; } -string +UString Transfer::processTags(xmlNode *localroot) { - string result; + UString result; for(xmlNode *i = localroot->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) @@ -1062,63 +835,6 @@ Transfer::processTags(xmlNode *localroot) return result; } -int -Transfer::processInstruction(xmlNode *localroot) -{ - int words_to_consume = -1; - if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) - { - words_to_consume = processChoose(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) - { - processLet(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) - { - processAppend(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) - { - processOut(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) - { - processCallMacro(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) - { - processModifyCase(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "reject-current-rule")) - { - words_to_consume = processRejectCurrentRule(localroot); - } - return words_to_consume; -} - -int -Transfer::processRejectCurrentRule(xmlNode *localroot) -{ - bool shifting = true; - string value; - for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "shifting")) - { - value = (char *) i->children->content; - break; - } - } - - if(value == "no") - { - shifting = false; - } - - return shifting ? 1 : 0; -} - void Transfer::processLet(xmlNode *localroot) { @@ -1196,7 +912,7 @@ Transfer::processLet(xmlNode *localroot) { in_let_var = true; - string const val = (const char *) leftSide->properties->children->content; + UString const val = to_ustring((const char *) leftSide->properties->children->content); var_val = val; var_out_wblank[var_val].clear(); @@ -1209,7 +925,8 @@ Transfer::processLet(xmlNode *localroot) else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL, *side = NULL, *as = NULL; + xmlChar *side = NULL, *as = NULL; + UString part; bool queue = true; for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) @@ -1220,7 +937,7 @@ Transfer::processLet(xmlNode *localroot) } else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((const char*) i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { @@ -1250,55 +967,30 @@ Transfer::processLet(xmlNode *localroot) if(!xmlStrcmp(side, (const xmlChar *) "tl")) { - bool match = word[pos]->setTarget(attr_items[(const char *) part], evalString(rightSide), queue); + bool match = word[pos]->setTarget(attr_items[part], evalString(rightSide), queue); if(!match && trace) { cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } - evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL, queue); + evalStringCache[leftSide] = TransferInstr(ti_clip_tl, part, pos, NULL, queue); } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { - bool match = word[pos]->setReference(attr_items[(const char *) part], evalString(rightSide), queue); + bool match = word[pos]->setReference(attr_items[part], evalString(rightSide), queue); if(!match && trace) { cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } - evalStringCache[leftSide] = TransferInstr(ti_clip_ref, (const char *) part, pos, NULL, queue); + evalStringCache[leftSide] = TransferInstr(ti_clip_ref, part, pos, NULL, queue); } else { - bool match = word[pos]->setSource(attr_items[(const char *) part], evalString(rightSide), queue); + bool match = word[pos]->setSource(attr_items[part], evalString(rightSide), queue); if(!match && trace) { cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; } - evalStringCache[leftSide] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue); - } - } -} - -void -Transfer::processAppend(xmlNode *localroot) -{ - string name; - for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) - { - if(!xmlStrcmp(i->name, (const xmlChar *) "n")) - { - name = (char *) i->children->content; - break; - } - } - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - in_let_var = true; - var_val = name; - variables[name].append(evalString(i)); - in_let_var = false; + evalStringCache[leftSide] = TransferInstr(ti_clip_sl, part, pos, NULL, queue); } } } @@ -1327,7 +1019,8 @@ Transfer::processModifyCase(xmlNode *localroot) if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) { int pos = 0; - xmlChar *part = NULL, *side = NULL, *as = NULL; + xmlChar *side = NULL, *as = NULL; + UString part; bool queue = true; for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) @@ -1338,7 +1031,7 @@ Transfer::processModifyCase(xmlNode *localroot) } else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) { - part = i->children->content; + part = to_ustring((const char*)i->children->content); } else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) { @@ -1359,9 +1052,9 @@ Transfer::processModifyCase(xmlNode *localroot) } if(!xmlStrcmp(side, (const xmlChar *) "sl")) { - string const result = copycase(evalString(rightSide), - word[pos]->source(attr_items[(const char *) part], queue)); - bool match = word[pos]->setSource(attr_items[(const char *) part], result); + UString const result = StringUtils::copycase(evalString(rightSide), + word[pos]->source(attr_items[part], queue)); + bool match = word[pos]->setSource(attr_items[part], result); if(!match && trace) { cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; @@ -1369,9 +1062,9 @@ Transfer::processModifyCase(xmlNode *localroot) } else if(!xmlStrcmp(side, (const xmlChar *) "ref")) { - string const result = copycase(evalString(rightSide), - word[pos]->reference(attr_items[(const char *) part], queue)); - bool match = word[pos]->setReference(attr_items[(const char *) part], result); + UString const result = StringUtils::copycase(evalString(rightSide), + word[pos]->reference(attr_items[part], queue)); + bool match = word[pos]->setReference(attr_items[part], result); if(!match && trace) { cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; @@ -1379,9 +1072,9 @@ Transfer::processModifyCase(xmlNode *localroot) } else { - string const result = copycase(evalString(rightSide), - word[pos]->target(attr_items[(const char *) part], queue)); - bool match = word[pos]->setTarget(attr_items[(const char *) part], result); + UString const result = StringUtils::copycase(evalString(rightSide), + word[pos]->target(attr_items[part], queue)); + bool match = word[pos]->setTarget(attr_items[part], result); if(!match && trace) { cerr << "apertium-transfer warning: on line " << localroot->line << " sometimes discards its value." << endl; @@ -1390,15 +1083,15 @@ Transfer::processModifyCase(xmlNode *localroot) } else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { - string const val = (const char *) leftSide->properties->children->content; - variables[val] = copycase(evalString(rightSide), variables[val]); + UString const val = to_ustring((const char *) leftSide->properties->children->content); + variables[val] = StringUtils::copycase(evalString(rightSide), variables[val]); } } void Transfer::processCallMacro(xmlNode *localroot) { - string const n = (const char *) localroot->properties->children->content; + UString const n = to_ustring((const char *) localroot->properties->children->content); int npar = 0; xmlNode *macro = macro_map[macros[n]]; @@ -1455,680 +1148,75 @@ Transfer::processCallMacro(xmlNode *localroot) } int -Transfer::processChoose(xmlNode *localroot) +Transfer::processRule(xmlNode *localroot) { - int words_to_consume = -1; + int instruction_return, words_to_consume = -1; + // localroot is suposed to be an 'action' tag for(xmlNode *i = localroot->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) { - if(!xmlStrcmp(i->name, (const xmlChar *) "when")) - { - bool picked_option = false; - - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - if(!xmlStrcmp(j->name, (const xmlChar *) "test")) - { - if(!processTest(j)) - { - break; - } - else - { - picked_option = true; - } - } - else - { - words_to_consume = processInstruction(j); - if(words_to_consume != -1) - { - return words_to_consume; - } - } - } - } - if(picked_option) - { - return words_to_consume; - } - } - else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) + instruction_return = processInstruction(i); + // When an instruction which modifies the number of words to be consumed + // from the input is found, execution of the rule is stopped + if(instruction_return != -1) { - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - words_to_consume = processInstruction(j); - if(words_to_consume != -1) - { - return words_to_consume; - } - } - } + words_to_consume = instruction_return; + break; } } } + + while(!blank_queue.empty()) //flush remaining blanks that are not spaces + { + if(blank_queue.front().compare(" "_u) != 0) { + write(blank_queue.front(), output); + } + blank_queue.pop(); + } + return words_to_consume; } -bool -Transfer::processLogical(xmlNode *localroot) +TransferToken & +Transfer::readToken(InputFile& in) { - if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) - { - return processEqual(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) - { - return processBeginsWith(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) - { - return processBeginsWithList(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) - { - return processEndsWith(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) - { - return processEndsWithList(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) - { - return processContainsSubstring(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) + if(!input_buffer.isEmpty()) { - return processOr(localroot); + return input_buffer.next(); } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) + + UString content; + while(true) { - return processAnd(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) - { - return processNot(localroot); - } - else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) - { - return processIn(localroot); - } - - return false; -} - -bool -Transfer::processIn(xmlNode *localroot) -{ - xmlNode *value = NULL; - xmlChar *idlist = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(value == NULL) - { - value = i; - } - else - { - idlist = i->properties->children->content; - break; - } - } - } - - string sval = evalString(value); - - if(localroot->properties != NULL) - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - set &myset = listslow[(const char *) idlist]; - if(myset.find(tolower(sval)) != myset.end()) - { - return true; - } - else - { - return false; - } - } - } - - set &myset = lists[(const char *) idlist]; - if(myset.find(sval) != myset.end()) - { - return true; - } - else - { - return false; - } -} - -bool -Transfer::processTest(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - return processLogical(i); - } - } - return false; -} - -bool -Transfer::processAnd(xmlNode *localroot) -{ - bool val = true; - for(xmlNode *i = localroot->children; val && i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - val = val && processLogical(i); - } - } - - return val; -} - -bool -Transfer::processOr(xmlNode *localroot) -{ - bool val = false; - for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - val = val || processLogical(i); - } - } - - return val; -} - -bool -Transfer::processNot(xmlNode *localroot) -{ - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - return !processLogical(i); - } - } - return false; -} - -bool -Transfer::processEqual(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return evalString(first) == evalString(second); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return tolower(evalString(first)) == tolower(evalString(second)); - } - else - { - return evalString(first) == evalString(second); - } - } -} - -bool -Transfer::beginsWith(string const &s1, string const &s2) const -{ - int const limit = s2.size(), constraint = s1.size(); - - if(constraint < limit) - { - return false; - } - for(int i = 0; i != limit; i++) - { - if(s1[i] != s2[i]) - { - return false; - } - } - - return true; -} - -bool -Transfer::endsWith(string const &s1, string const &s2) const -{ - int const limit = s2.size(), constraint = s1.size(); - - if(constraint < limit) - { - return false; - } - for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) - { - if(s1[j] != s2[i]) - { - return false; - } - } - - return true; -} - - -bool -Transfer::processBeginsWith(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return beginsWith(evalString(first), evalString(second)); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return beginsWith(tolower(evalString(first)), tolower(evalString(second))); - } - else - { - return beginsWith(evalString(first), evalString(second)); - } - } -} - -bool -Transfer::processEndsWith(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return endsWith(evalString(first), evalString(second)); - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return endsWith(tolower(evalString(first)), tolower(evalString(second))); - } - else - { - return endsWith(evalString(first), evalString(second)); - } - } -} - -bool -Transfer::processBeginsWithList(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - xmlChar *idlist = second->properties->children->content; - string needle = evalString(first); - set::iterator it, limit; - - if(localroot->properties == NULL || - xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) - { - it = lists[(const char *) idlist].begin(); - limit = lists[(const char *) idlist].end(); - } - else - { - needle = tolower(needle); - it = listslow[(const char *) idlist].begin(); - limit = listslow[(const char *) idlist].end(); - } - - for(; it != limit; it++) - { - if(beginsWith(needle, *it)) - { - return true; - } - } - return false; -} - - -bool -Transfer::processEndsWithList(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - xmlChar *idlist = second->properties->children->content; - string needle = evalString(first); - set::iterator it, limit; - - if(localroot->properties == NULL || - xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) - { - it = lists[(const char *) idlist].begin(); - limit = lists[(const char *) idlist].end(); - } - else - { - needle = tolower(needle); - it = listslow[(const char *) idlist].begin(); - limit = listslow[(const char *) idlist].end(); - } - - for(; it != limit; it++) - { - if(endsWith(needle, *it)) - { - return true; - } - } - return false; -} - -bool -Transfer::processContainsSubstring(xmlNode *localroot) -{ - xmlNode *first = NULL, *second = NULL; - - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - if(first == NULL) - { - first = i; - } - else - { - second = i; - break; - } - } - } - - if(localroot->properties == NULL) - { - return evalString(first).find(evalString(second)) != string::npos; - } - else - { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) - { - return tolower(evalString(first)).find(tolower(evalString(second))) != string::npos; - } - else - { - return evalString(first).find(evalString(second)) != string::npos; - } - } -} - -string -Transfer::copycase(string const &source_word, string const &target_word) -{ - UString result; - UString const s_word = UtfConverter::fromUtf8(source_word); - UString const t_word = UtfConverter::fromUtf8(target_word); - - bool firstupper = iswupper(s_word[0]); - bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); - bool sizeone = s_word.size() == 1; - - if(!uppercase || (sizeone && uppercase)) - { - result = t_word; - result[0] = towlower(result[0]); - //result = StringUtils::tolower(t_word); - } - else - { - result = StringUtils::toupper(t_word); - } - - if(firstupper) - { - result[0] = towupper(result[0]); - } - - return UtfConverter::toUtf8(result); -} - -string -Transfer::caseOf(string const &str) -{ - UString const s = UtfConverter::fromUtf8(str); - - if(s.size() > 1) - { - if(!iswupper(s[0])) - { - return "aa"; - } - else if(!iswupper(s[s.size()-1])) - { - return "Aa"; - } - else - { - return "AA"; - } - } - else if(s.size() == 1) - { - if(!iswupper(s[0])) - { - return "aa"; - } - else - { - return "Aa"; - } - } - else - { - return "aa"; - } -} - -string -Transfer::tolower(string const &str) const -{ - return UtfConverter::toUtf8(StringUtils::tolower(UtfConverter::fromUtf8(str))); -} - -string -Transfer::tags(string const &str) const -{ - string result = "<"; - - for(unsigned int i = 0, limit = str.size(); i != limit; i++) - { - if(str[i] == '.') - { - result.append("><"); - } - else - { - result += str[i]; - } - } - - result += '>'; - - return result; -} - -int -Transfer::processRule(xmlNode *localroot) -{ - int instruction_return, words_to_consume = -1; - // localroot is suposed to be an 'action' tag - for(xmlNode *i = localroot->children; i != NULL; i = i->next) - { - if(i->type == XML_ELEMENT_NODE) - { - instruction_return = processInstruction(i); - // When an instruction which modifies the number of words to be consumed - // from the input is found, execution of the rule is stopped - if(instruction_return != -1) - { - words_to_consume = instruction_return; - break; - } - } - } - - while(!blank_queue.empty()) //flush remaining blanks that are not spaces - { - if(blank_queue.front().compare(" ") != 0) { - write(blank_queue.front(), output); - } - blank_queue.pop(); - } - - return words_to_consume; -} - -TransferToken & -Transfer::readToken(InputFile& in) -{ - if(!input_buffer.isEmpty()) - { - return input_buffer.next(); - } - - UString content; - while(true) - { - int val = fgetwc_unlocked(in); - if(feof(in) || (val == 0 && internal_null_flush)) + UChar32 val = in.get(); + if(in.eof() || (val == 0 && internal_null_flush)) { in_wblank = false; return input_buffer.add(TransferToken(content, tt_eof)); } if(in_wblank) { - content = "[["; - content+= wchar_t(val); + content = "[["_u; + content += val; while(true) { - int val3 = fgetwc_unlocked(in); - if(val3 == L'\\') + UChar32 val3 = in.get(); + if(val3 == '\\') { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += '\\'; + content += in.get(); } - else if(val3 == L'$') //[[..]]^..$ is the LU + else if(val3 == '$') //[[..]]^..$ is the LU { in_wblank = false; return input_buffer.add(TransferToken(content, tt_word)); } - else if(val3 == L'\0' && null_flush) + else if(val3 == '\0' && null_flush) { in_wblank = false; - fflush(output); + u_fflush(output); } else { @@ -2138,30 +1226,30 @@ Transfer::readToken(InputFile& in) } if(val == '\\') { - content += L'\\'; - content += (wchar_t) fgetwc_unlocked(in); + content += '\\'; + content += in.get(); } - else if(val == L'[') + else if(val == '[') { - content += L'['; + content += '['; while(true) { - int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') + UChar32 val2 = in.get(); + if(val2 == '\\') { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += '\\'; + content += in.get(); } - else if(val2 == L'[') + else if(val2 == '[') { //wordbound blank in_wblank = true; content.pop_back(); return input_buffer.add(TransferToken(content, tt_blank)); } - else if(val2 == L']') + else if(val2 == ']') { - content += L']'; + content += ']'; break; } else @@ -2170,18 +1258,18 @@ Transfer::readToken(InputFile& in) } } } - else if(val == L'$') + else if(val == '$') { return input_buffer.add(TransferToken(content, tt_word)); } - else if(val == L'^') + else if(val == '^') { return input_buffer.add(TransferToken(content, tt_blank)); } - else if(val == L'\0' && null_flush) + else if(val == '\0' && null_flush) { in_wblank = false; - fflush(output); + u_fflush(output); } else { @@ -2190,24 +1278,6 @@ Transfer::readToken(InputFile& in) } } -bool -Transfer::getNullFlush(void) -{ - return null_flush; -} - -void -Transfer::setNullFlush(bool null_flush) -{ - this->null_flush = null_flush; -} - -void -Transfer::setTrace(bool trace) -{ - this->trace = trace; -} - void Transfer::setTraceATT(bool trace) { @@ -2227,16 +1297,12 @@ Transfer::transfer_wrapper_null_flush(InputFile& in, UFILE* out) null_flush = false; internal_null_flush = true; - while(!feof(in)) + while(!in.eof()) { tmp_clear(); transfer(in, out); u_fputc('\0', out); - int code = fflush(out); - if(code != 0) - { - cerr << "Could not flush output " << errno << endl; - } + u_fflush(out); } internal_null_flush = false; @@ -2353,21 +1419,18 @@ Transfer::transfer(InputFile& in, UFILE* out) UString tr_wblank; if(useBilingual && preBilingual == false) { - if(isExtended && (*tmpword[0])[0] == L'*') - { - tr = extended.biltransWithQueue((*tmpword[0]).substr(1), false); - if(tr.first[0] == L'@') - { - tr.first[0] = L'*'; + if(isExtended && (*tmpword[0])[0] == '*') { + tr = extended.biltransWithQueue((*tmpword[0]).substr(1), false); + if(tr.first[0] == '@') { + tr.first[0] = '*'; + } else { + UString temp; + temp += '%'; + temp.append(tr.first); + temp.swap(tr.first); } - else - { - tr.first = "%" + tr.first; - } - } - else - { - tr = fstp.biltransWithQueue(*tmpword[0], false); + } else { + tr = fstp.biltransWithQueue(*tmpword[0], false); } } else if(preBilingual) @@ -2380,7 +1443,7 @@ Transfer::transfer(InputFile& in, UFILE* out) int seenSlash = 0; for(UString::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++) { - if(*it == L'\\') + if(*it == '\\') { if(seenSlash == 0) { @@ -2402,19 +1465,19 @@ Transfer::transfer(InputFile& in, UFILE* out) } continue; } - else if(*it == L'[') + else if(*it == '[') { - if(*(it+1) == L'[') //wordbound blank + if(*(it+1) == '[') //wordbound blank { while(true) { - if(*it == L'\\') + if(*it == '\\') { wblank.push_back(*it); it++; wblank.push_back(*it); } - else if(*it == L'^' && *(it-1) == L']' && *(it-2) == L']') + else if(*it == '^' && *(it-1) == ']' && *(it-2) == ']') { break; } @@ -2443,7 +1506,7 @@ Transfer::transfer(InputFile& in, UFILE* out) } continue; } - else if(*it == L'/') + else if(*it == '/') { seenSlash++; @@ -2537,7 +1600,7 @@ Transfer::transfer(InputFile& in, UFILE* out) break; case tt_blank: - ms.step(L' '); + ms.step(' '); tmpblank.push_back(¤t.getContent()); break; @@ -2580,8 +1643,7 @@ Transfer::applyRule() { if(int(blank_queue.size()) < last_lword - 1) { - string blank_to_add = string(UtfConverter::toUtf8(*tmpblank[i-1])); - blank_queue.push(blank_to_add); + blank_queue.push(*tmpblank[i-1]); } } @@ -2589,12 +1651,7 @@ Transfer::applyRule() if(useBilingual && preBilingual == false) { tr = fstp.biltransWithQueue(*tmpword[i], false); - UString refx,wblankx; - word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), - UtfConverter::toUtf8(tr.first), - UtfConverter::toUtf8(refx), - UtfConverter::toUtf8(wblankx), - tr.second); + word[i] = new TransferWord(*tmpword[i], tr.first, ""_u, ""_u, tr.second); } else if(preBilingual) { @@ -2606,7 +1663,7 @@ Transfer::applyRule() int seenSlash = 0; for(UString::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++) { - if(*it == L'\\') + if(*it == '\\') { if(seenSlash == 0) { @@ -2628,19 +1685,19 @@ Transfer::applyRule() } continue; } - else if(*it == L'[') + else if(*it == '[') { - if(*(it+1) == L'[') //wordbound blank + if(*(it+1) == '[') //wordbound blank { while(true) { - if(*it == L'\\') + if(*it == '\\') { wblank.push_back(*it); it++; wblank.push_back(*it); } - else if(*it == L'^' && *(it-1) == L']' && *(it-2) == L']') + else if(*it == '^' && *(it-1) == ']' && *(it-2) == ']') { break; } @@ -2670,7 +1727,7 @@ Transfer::applyRule() continue; } - if(*it == L'/') + if(*it == '/') { seenSlash++; @@ -2691,21 +1748,12 @@ Transfer::applyRule() } } tr = pair(tl, false); - word[i] = new TransferWord(UtfConverter::toUtf8(sl), - UtfConverter::toUtf8(tr.first), - UtfConverter::toUtf8(ref), - UtfConverter::toUtf8(wblank), - tr.second); + word[i] = new TransferWord(sl, tr.first, ref, wblank, tr.second); } else // neither useBilingual nor preBilingual (sl==tl) { tr = pair(*tmpword[i], false); - UString refx,wblankx; - word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), - UtfConverter::toUtf8(tr.first), - UtfConverter::toUtf8(refx), - UtfConverter::toUtf8(wblankx), - tr.second); + word[i] = new TransferWord(*tmpword[i], tr.first, ""_u, ""_u, tr.second); } } @@ -2732,29 +1780,29 @@ Transfer::applyRule() void Transfer::applyWord(UString const &word_str) { - ms.step(L'^'); + ms.step('^'); for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) { switch(word_str[i]) { - case L'\\': + case '\\': i++; ms.step(towlower(word_str[i]), any_char); break; - case L'[': - if(word_str[i+1] == L'[') + case '[': + if(word_str[i+1] == '[') { while(true) { - if(word_str[i] == L'\\') + if(word_str[i] == '\\') { i++; } else if(i >= 4) { - if(word_str[i] == L'^' && word_str[i-1] == L']' && word_str[i-2] == L']') + if(word_str[i] == '^' && word_str[i-1] == ']' && word_str[i-2] == ']') { break; } @@ -2769,14 +1817,14 @@ Transfer::applyWord(UString const &word_str) } break; - case L'/': + case '/': i = limit; break; - case L'<': + case '<': for(unsigned int j = i+1; j != limit; j++) { - if(word_str[j] == L'>') + if(word_str[j] == '>') { int symbol = alphabet(word_str.substr(i, j-i+1)); if(symbol) @@ -2798,7 +1846,7 @@ Transfer::applyWord(UString const &word_str) break; } } - ms.step(L'$'); + ms.step('$'); } void diff --git a/apertium/transfer.h b/apertium/transfer.h index c6f9edd..5955daa 100644 --- a/apertium/transfer.h +++ b/apertium/transfer.h @@ -17,46 +17,25 @@ #ifndef _TRANSFER_ #define _TRANSFER_ -#include +#include + #include #include -#include -#include #include #include -#include -#include -#include #include -#include -#include -#include -#include #include #include using namespace std; -class Transfer +class Transfer : TransferBase { private: - Alphabet alphabet; - MatchExe *me; - MatchState ms; - map attr_items; - map variables; - map macros; - map> lists; - map> listslow; - vector macro_map; - vector rule_map; - vector rule_lines; - xmlDoc *doc; - xmlNode *root_element; TransferWord **word; - queue blank_queue; + queue blank_queue; int lword; int last_lword; Buffer input_buffer; @@ -65,51 +44,31 @@ private: bool in_out; bool in_lu; - bool in_let_var; - string var_val; //stores the name of the variable being processed (in let or append) bool in_wblank; - string out_wblank; - map var_out_wblank; + UString out_wblank; + map var_out_wblank; - bool gettingLemmaFromWord(string attr); - string combineWblanks(string wblank_current, string wblank_to_add); - FSTProcessor fstp; FSTProcessor extended; bool isExtended; UFILE *output; - int any_char; - int any_tag; xmlNode *lastrule; unsigned int nwords; - map evalStringCache; - enum OutputType{lu,chunk}; OutputType defaultAttrs; bool preBilingual; bool useBilingual; - bool null_flush; - bool internal_null_flush; - bool trace; bool trace_att; - string emptyblank; + UString emptyblank; - void destroy(); - void readData(FILE *input); void readBil(string const &filename); - void readTransfer(string const &input); - void collectMacros(xmlNode *localroot); - void collectRules(xmlNode *localroot); - string caseOf(string const &str); - string copycase(string const &source_word, string const &target_word); void processLet(xmlNode *localroot); void processAppend(xmlNode *localroot); - int processRejectCurrentRule(xmlNode *localroot); void processOut(xmlNode *localroot); void processCallMacro(xmlNode *localroot); void processModifyCase(xmlNode *localroot); @@ -126,16 +85,12 @@ private: bool processNot(xmlNode *localroot); bool processIn(xmlNode *localroot); int processRule(xmlNode *localroot); - string evalString(xmlNode *localroot); + UString evalString(xmlNode *localroot); int processInstruction(xmlNode *localroot); int processChoose(xmlNode *localroot); - string processChunk(xmlNode *localroot); - string processTags(xmlNode *localroot); + UString processChunk(xmlNode *localroot); + UString processTags(xmlNode *localroot); - bool beginsWith(string const &str1, string const &str2) const; - bool endsWith(string const &str1, string const &str2) const; - string tolower(string const &str) const; - string tags(string const &str) const; UString readWord(InputFile& in); UString readBlank(InputFile& in); UString readUntil(InputFile& in, int const symbol) const; @@ -158,9 +113,6 @@ public: bool getPreBilingual(void) const; void setExtendedDictionary(string const &fstfile); void setCaseSensitiveness(bool value); - bool getNullFlush(void); - void setNullFlush(bool null_flush); - void setTrace(bool trace); void setTraceATT(bool trace); }; diff --git a/apertium/transfer_base.cc b/apertium/transfer_base.cc new file mode 100644 index 0000000..6b96e04 --- /dev/null +++ b/apertium/transfer_base.cc @@ -0,0 +1,582 @@ +#include +#include +#include +#include +#include + +using namespace Apertium; +using namespace std; + +TransferBase::TransferBase() + : me(nullptr), doc(nullptr), root_element(nullptr), + any_char(0), any_tag(0), in_let_var(false), + null_flush(false), internal_null_flush(false), trace(false) +{} + +TransferBase::~TransferBase() +{ + if (me) { + delete me; + me = nullptr; + } + if (doc) { + xmlFreeDoc(doc); + doc = nullptr; + } +} + +void +TransferBase::read(const char* transferfile, const char* datafile) +{ + doc = xmlReadFile(transferfile, NULL, 0); + if (doc == NULL) { + cerr << "Error: Could not parse file '" << transferfile << "'." << endl; + exit(EXIT_FAILURE); + } + root_element = xmlDocGetRootElement(doc); + + for (auto i : children(root_element)) { + if (!xmlStrcmp(i->name, (const xmlChar*) "section-def-macros")) { + collectMacros(i); + } else if (!xmlStrcmp(i->name, (const xmlChar*) "section-rules")) { + collectRules(i); + } + } + + + FILE* in = fopen(datafile, "rb"); + if (!in) { + cerr << "Error: Could not open file '" << datafile << "' for reading." << endl; + exit(EXIT_FAILURE); + } + + alphabet.read(in); + any_char = alphabet(TRXReader::ANY_CHAR); + any_tag = alphabet(TRXReader::ANY_TAG); + + Transducer t; + t.read(in, alphabet.size()); + + map finals; + + // finals + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + int key = Compression::multibyte_read(in); + finals[key] = Compression::multibyte_read(in); + } + + me = new MatchExe(t, finals); + + // attr_items + Compression::string_read(in); // formerly PCRE version, now blank + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + UString const cad_k = Compression::string_read(in); + attr_items[cad_k].read(in); + UString fallback = Compression::string_read(in); + attr_items[cad_k].compile(fallback); + } + + // variables + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + UString const cad_k = Compression::string_read(in); + variables[cad_k] = Compression::string_read(in); + } + + // macros + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + UString const cad_k = Compression::string_read(in); + macros[cad_k] = Compression::multibyte_read(in); + } + + // lists + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + UString const cad_k = Compression::string_read(in); + + for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) + { + UString const cad_v = Compression::string_read(in); + lists[cad_k].insert(cad_v); + listslow[cad_k].insert(StringUtils::tolower(cad_v)); + } + } +} + +void +TransferBase::collectRules(xmlNode* localroot) +{ + for (auto rule : children(localroot)) { + size_t line = rule->line; + for (auto rulechild : children(rule)) { + if(!xmlStrcmp(rulechild->name, (const xmlChar *) "action")) { + rule_map.push_back(rulechild); + rule_lines.push_back(line); + break; + } + } + } +} + +void +TransferBase::collectMacros(xmlNode* localroot) +{ + for (auto i : children(localroot)) { + macro_map.push_back(i); + } +} + +bool +TransferBase::gettingLemmaFromWord(const UString& attr) +{ + return attr == "lem"_u || attr == "lemh"_u || attr == "whole"_u; +} + +UString +TransferBase::combineWblanks(const UString& first, const UString& second) +{ + if (first.empty()) { + return second; + } else if (second.empty()) { + return first; + } + UString ret; + ret.reserve(first.size() + second.size()); + if (endsWith(first, "]]"_u)) { + if (first.size() > 2) { + size_t i = first.size() - 3; + bool esc = false; + while (first[i] == '\\') { + i--; + esc = !esc; + } + if (esc) { + ret.append(first); + } else { + ret.append(first.substr(0, first.size()-2)); + } + } else { + ret.append(first.substr(0, first.size()-2)); + } + } else { + ret.append(first); + } + ret += ';'; + ret += ' '; + if (beginsWith(second, "[["_u)) { + ret.append(second.substr(2)); + } else { + ret.append(second); + } + return ret; +} + +UString +TransferBase::evalString(xmlNode* element) +{ + if (!element) { + throw "evalString() was called on a NULL element"; + } + if (evalStringCache.find(element) != evalStringCache.end()) { + return evalCachedString(element); + } + if (!xmlStrcmp(element->name, (const xmlChar*) "clip")) { + processClip(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "lit-tag")) { + evalStringCache[element] = TransferInstr(ti_lit_tag, tags(getattr(element, "v")), 0); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "lit")) { + evalStringCache[element] = TransferInstr(ti_lit, getattr(element, "v"), 0); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "b")) { + processBlank(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "get-case-from")) { + int pos = atoi((const char*) element->properties->children->content); + xmlNode* param; + for (auto it : children(element)) { + param = it; + break; + } + evalStringCache[element] = TransferInstr(ti_get_case_from, "lem"_u, pos, param); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "var")) { + evalStringCache[element] = TransferInstr(ti_var, getattr(element, "n"), 0); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "lu-count")) { + evalLuCount(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "case-of")) { + evalCaseOf(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "concat")) { + UString value; + for (auto it : children(element)) { + value.append(evalString(it)); + } + return value; + } else if (!xmlStrcmp(element->name, (const xmlChar*) "lu")) { + evalLu(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "mlu")) { + evalMlu(element); + } else if (!xmlStrcmp(element->name, (const xmlChar*) "chunk")) { + evalChunk(element); + } else { + cerr << "Error: unexpected expression: '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + } + return evalCachedString(element); +} + +int +TransferBase::processInstruction(xmlNode* localroot) +{ + int words_to_consume = -1; + if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) + { + words_to_consume = processChoose(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) + { + processLet(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) + { + processAppend(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) + { + processOut(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) + { + processCallMacro(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) + { + processModifyCase(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "reject-current-rule")) + { + words_to_consume = processRejectCurrentRule(localroot); + } + return words_to_consume; +} + +int +TransferBase::processRejectCurrentRule(xmlNode* localroot) +{ + bool shifting = (getattr(localroot, "shifting") == "yes"_u); + return shifting ? 1 : 0; +} + +int +TransferBase::processChoose(xmlNode* localroot) +{ + int words_to_consume = -1; + for (auto option : children(localroot)) { + if (!xmlStrcmp(option->name, (const xmlChar*) "when")) { + bool picked = false; + for (auto it : children(option)) { + if (!xmlStrcmp(it->name, (const xmlChar*) "test")) { + if (!processTest(it)) { + break; + } else { + picked = true; + } + } else { + words_to_consume = processInstruction(it); + if (words_to_consume != -1) { + return words_to_consume; + } + } + } + if (picked) { + return words_to_consume; + } + } else if (!xmlStrcmp(option->name, (const xmlChar*) "otherwise")) { + for (auto it : children(option)) { + words_to_consume = processInstruction(it); + if (words_to_consume != -1) { + return words_to_consume; + } + } + } + } + return words_to_consume; +} + +void +TransferBase::processAppend(xmlNode* localroot) +{ + UString name = getattr(localroot, "n"); + for (auto i : children(localroot)) { + in_let_var = true; + var_val = name; + variables[name].append(evalString(i)); + in_let_var = false; + } +} + +bool +TransferBase::processLogical(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) + { + return processEqual(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) + { + return processBeginsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) + { + return processBeginsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) + { + return processEndsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) + { + return processEndsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) + { + return processContainsSubstring(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) + { + return processOr(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) + { + return processAnd(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) + { + return processNot(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) + { + return processIn(localroot); + } + + return false; +} + +bool +TransferBase::processTest(xmlNode* localroot) +{ + for (auto i : children(localroot)) { + return processLogical(i); + } + return false; +} + +bool +TransferBase::processAnd(xmlNode* localroot) +{ + for (auto i : children(localroot)) { + if (!processLogical(i)) { + return false; + } + } + return true; +} + +bool +TransferBase::processOr(xmlNode* localroot) +{ + for (auto i : children(localroot)) { + if (processLogical(i)) { + return true; + } + } + return false; +} + +bool +TransferBase::processNot(xmlNode* localroot) +{ + for (auto i : children(localroot)) { + return !processLogical(i); + } + return false; +} + +bool +TransferBase::beginsWith(const UString& haystack, const UString& needle) +{ + const size_t hlen = haystack.size(); + const size_t nlen = needle.size(); + if (hlen < nlen) { + return false; + } + for (size_t i = 0; i < nlen; i++) { + if (haystack[i] != needle[i]) { + return false; + } + } + return true; +} + +bool +TransferBase::endsWith(const UString& haystack, const UString& needle) +{ + if (needle.size() > haystack.size()) { + return false; + } + for (int h = haystack.size()-1, n = needle.size()-1; n >= 0; h--, n--) { + if (haystack[h] != needle[n]) { + return false; + } + } + return true; +} + +pair +TransferBase::twoChildren(xmlNode* localroot) +{ + xmlNode* first = nullptr; + xmlNode* second = nullptr; + for (auto i : children(localroot)) { + if (!first) { + first = i; + } else { + second = i; + break; + } + } + return make_pair(first, second); +} + +bool +TransferBase::processBeginsWith(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + if (getattr(localroot, "caseless") == "yes"_u) { + return beginsWith(StringUtils::tolower(evalString(ch.first)), + StringUtils::tolower(evalString(ch.second))); + } else { + return beginsWith(evalString(ch.first), evalString(ch.second)); + } +} + +bool +TransferBase::processBeginsWithList(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + UString needle = evalString(ch.first); + UString idlist = getattr(ch.second, "n"); + bool caseless = (getattr(localroot, "caseless") == "yes"_u); + if (caseless) { + needle = StringUtils::tolower(needle); + } + for (auto it : (caseless ? listslow[idlist] : lists[idlist])) { + if (beginsWith(needle, it)) { + return true; + } + } + return false; +} + +bool +TransferBase::processEndsWith(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + if (getattr(localroot, "caseless") == "yes"_u) { + return endsWith(StringUtils::tolower(evalString(ch.first)), + StringUtils::tolower(evalString(ch.second))); + } else { + return endsWith(evalString(ch.first), evalString(ch.second)); + } +} + +bool +TransferBase::processEndsWithList(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + UString needle = evalString(ch.first); + UString idlist = getattr(ch.second, "n"); + bool caseless = (getattr(localroot, "caseless") == "yes"_u); + if (caseless) { + needle = StringUtils::tolower(needle); + } + for (auto it : (caseless ? listslow[idlist] : lists[idlist])) { + if (endsWith(needle, it)) { + return true; + } + } + return false; +} + +bool +TransferBase::processContainsSubstring(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + if (getattr(localroot, "caseless") == "yes"_u) { + return StringUtils::tolower(evalString(ch.first)).find(StringUtils::tolower(evalString(ch.second))) != UString::npos; + } else { + return evalString(ch.first).find(evalString(ch.second)) != UString::npos; + } +} + +bool +TransferBase::processEqual(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + if (getattr(localroot, "caseless") == "yes"_u) { + return StringUtils::tolower(evalString(ch.first)) == StringUtils::tolower(evalString(ch.second)); + } else { + return evalString(ch.first) == evalString(ch.second); + } +} + +bool +TransferBase::processIn(xmlNode* localroot) +{ + auto ch = twoChildren(localroot); + UString sval = evalString(ch.first); + UString idlist = getattr(ch.second, "n"); + if (getattr(localroot, "caseless") == "yes"_u) { + set& myset = listslow[idlist]; + return (myset.find(StringUtils::tolower(sval)) != myset.end()); + } else { + set& myset = lists[idlist]; + return (myset.find(sval) != myset.end()); + } +} + +UString +TransferBase::tags(const UString& str) const +{ + UString ret; + ret.reserve(str.size()+2); + ret += '<'; + for (auto c : u16iter(str)) { + if (c == '.') { + ret += '>'; + ret += '<'; + } else { + ret += c; + } + } + ret += '>'; + return ret; +} + +bool +TransferBase::getNullFlush(void) +{ + return null_flush; +} + +void +TransferBase::setNullFlush(bool val) +{ + null_flush = val; +} + +void +TransferBase::setTrace(bool val) +{ + trace = val; +} + diff --git a/apertium/transfer_base.h b/apertium/transfer_base.h new file mode 100644 index 0000000..d5adfde --- /dev/null +++ b/apertium/transfer_base.h @@ -0,0 +1,108 @@ +#ifndef _APERTIUM_TRANSFER_BASE_ +#define _APERTIUM_TRANSFER_BASE_ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +using namespace std; + +class TransferBase +{ +protected: + Alphabet alphabet; + MatchExe* me; + MatchState ms; + map attr_items; + map variables; + map macros; + map> lists; + map> listslow; + vector macro_map; + vector rule_map; + vector rule_lines; + xmlDoc* doc; + xmlNode* root_element; + + int32_t any_char; + int32_t any_tag; + + bool in_let_var; + UString var_val; + map evalStringCache; + + bool null_flush; + bool internal_null_flush; + bool trace; + + void collectMacros(xmlNode *localroot); + void collectRules(xmlNode *localroot); + + bool gettingLemmaFromWord(const UString& attr); + UString combineWblanks(const UString& first, const UString& second); + + UString evalString(xmlNode* element); + virtual UString evalCachedString(xmlNode* element) = 0; + + virtual void processClip(xmlNode* element) = 0; + virtual void processBlank(xmlNode* element) = 0; + virtual void evalLuCount(xmlNode* element) = 0; + virtual void evalCaseOf(xmlNode* element) = 0; + virtual void evalLu(xmlNode* element) = 0; + virtual void evalMlu(xmlNode* element) = 0; + virtual void evalChunk(xmlNode* element) = 0; + + int processInstruction(xmlNode* localroot); + int processRejectCurrentRule(xmlNode* localroot); + int processChoose(xmlNode* localroot); + void processAppend(xmlNode* localroot); + + virtual void processLet(xmlNode* localroot) = 0; + virtual void processOut(xmlNode* localroot) = 0; + virtual void processCallMacro(xmlNode* localroot) = 0; + virtual void processModifyCase(xmlNode* localroot) = 0; + + bool processLogical(xmlNode *localroot); + bool processTest(xmlNode *localroot); + bool processAnd(xmlNode *localroot); + bool processOr(xmlNode *localroot); + bool processNot(xmlNode *localroot); + + bool beginsWith(const UString& haystack, const UString& needle); + bool endsWith(const UString& haystack, const UString& needle); + + pair twoChildren(xmlNode* localroot); + + bool processBeginsWith(xmlNode *localroot); + bool processBeginsWithList(xmlNode *localroot); + bool processEndsWith(xmlNode *localroot); + bool processEndsWithList(xmlNode *localroot); + bool processContainsSubstring(xmlNode *localroot); + bool processEqual(xmlNode *localroot); + bool processIn(xmlNode *localroot); + + virtual int processRule(xmlNode *localroot) = 0; + + UString tags(const UString& s) const; + +public: + TransferBase(); + ~TransferBase(); + + void read(const char* transferfile, const char* datafile); + bool getNullFlush(void); + void setNullFlush(bool null_flush); + void setTrace(bool trace); +}; + +#endif diff --git a/apertium/transfer_data.cc b/apertium/transfer_data.cc index 2de9701..ac1d0c4 100644 --- a/apertium/transfer_data.cc +++ b/apertium/transfer_data.cc @@ -46,14 +46,14 @@ TransferData::destroy() TransferData::TransferData() { // adding fixed attr_items - attr_items["lem"] = "^(([^<]|\"\\<\")+)"; - attr_items["lemq"] = "\\#[- _][^<]+"; - attr_items["lemh"] = "^(([^<#]|\"\\<\"|\"\\#\")+)"; - attr_items["whole"] = "(.+)"; - attr_items["tags"] = "((<[^>]+>)+)"; - attr_items["chname"] = "({([^/]+)\\/)"; // includes delimiters { and / !!! - attr_items["chcontent"] = "(\\{.+)"; - attr_items["content"] = "(\\{.+)"; + attr_items["lem"_u] = "^(([^<]|\"\\<\")+)"_u; + attr_items["lemq"_u] = "\\#[- _][^<]+"_u; + attr_items["lemh"_u] = "^(([^<#]|\"\\<\"|\"\\#\")+)"_u; + attr_items["whole"_u] = "(.+)"_u; + attr_items["tags"_u] = "((<[^>]+>)+)"_u; + attr_items["chname"_u] = "({([^/]+)\\/)"_u; // includes delimiters { and / !!! + attr_items["chcontent"_u] = "(\\{.+)"_u; + attr_items["content"_u] = "(\\{.+)"_u; } TransferData::~TransferData() @@ -115,7 +115,9 @@ TransferData::getVariables() int TransferData::countToFinalSymbol(const int count) { - const UString count_sym = ""; + UChar buf[64]; + u_snprintf(buf, 64, "", count); + UString count_sym = buf; alphabet.includeSymbol(count_sym); const int symbol = alphabet(count_sym); final_symbols.insert(symbol); @@ -134,7 +136,7 @@ TransferData::write(FILE *output) // Find all arcs with "final_symbols" in the transitions, let their source node instead be final, // and extract the rule number from the arc. Record relation between source node and rule number // in finals_rules. It is now no longer safe to minimize -- but we already did that. - const UString rule_sym_pre = " > >::const_iterator it = transitions.begin(), limit = transitions.end(); it != limit; ++it) { @@ -224,14 +226,16 @@ TransferData::write(FILE *output) void TransferData::writeRegexps(FILE *output) { - Compression::string_write(pcre_version_endian(), output); + // since ICU doesn't have a binary form, it doesn't matter + // what the version is, so leave it blank + Compression::string_write(""_u, output); Compression::multibyte_write(attr_items.size(), output); for (auto& it : attr_items) { Compression::string_write(it.first, output); - ApertiumRE my_re; - my_re.compile(UtfConverter::toUtf8(it.second)); - my_re.write(output); + // empty binary form, since ICU doesn't have a dump function + // like PCRE did + Compression::multibyte_write(0, output); Compression::string_write(it.second, output); } } diff --git a/apertium/transfer_mult.cc b/apertium/transfer_mult.cc index 1938a6f..a163430 100644 --- a/apertium/transfer_mult.cc +++ b/apertium/transfer_mult.cc @@ -60,18 +60,6 @@ TransferMult::~TransferMult() destroy(); } -string -TransferMult::tolower(string const &str) const -{ - string result = str; - for(unsigned int i = 0, limit = str.size(); i != limit; i++) - { - result[i] = ::tolower(result[i]); - } - - return result; -} - void TransferMult::readData(FILE *in) { @@ -94,41 +82,39 @@ TransferMult::readData(FILE *in) me = new MatchExe(t, finals); // attr_items - bool recompile_attrs = Compression::string_read(in) != pcre_version_endian(); + Compression::string_read(in); // PCRE version placeholder for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); + UString const cad_k = Compression::string_read(in); attr_items[cad_k].read(in); UString fallback = Compression::string_read(in); - if(recompile_attrs) { - attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); - } + attr_items[cad_k].compile(fallback); } // variables for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); - variables[cad_k] = UtfConverter::toUtf8(Compression::string_read(in)); + UString const cad_k = Compression::string_read(in); + variables[cad_k] = Compression::string_read(in); } // macros for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); + UString const cad_k = Compression::string_read(in); macros[cad_k] = Compression::multibyte_read(in); } // lists for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { - string const cad_k = UtfConverter::toUtf8(Compression::string_read(in)); + UString const cad_k = Compression::string_read(in); for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) { UString const cad_v = Compression::string_read(in); - lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); - listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + lists[cad_k].insert(cad_v); + listslow[cad_k].insert(StringUtils::tolower(cad_v)); } } } @@ -171,33 +157,33 @@ TransferMult::readToken(InputFile& in) return input_buffer.next(); } - UString content = ""; + UString content; while(true) { - int val = fgetwc_unlocked(in); - if(feof(in)) + UChar32 val = in.get(); + if(in.eof()) { return input_buffer.add(TransferToken(content, tt_eof)); } - if(val == L'\\') + if(val == '\\') { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += '\\'; + content += in.get(); } - else if(val == L'[') + else if(val == '[') { - content += L'['; + content += '['; while(true) { - int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') + UChar32 val2 = in.get(); + if(val2 == '\\') { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); + content += '\\'; + content += in.get(); } - else if(val2 == L']') + else if(val2 == ']') { - content += L']'; + content += ']'; break; } else @@ -206,11 +192,11 @@ TransferMult::readToken(InputFile& in) } } } - else if(val == L'$') + else if(val == '$') { return input_buffer.add(TransferToken(content, tt_word)); } - else if(val == L'^') + else if(val == '^') { return input_buffer.add(TransferToken(content, tt_blank)); } @@ -256,7 +242,7 @@ TransferMult::transfer(InputFile& in, UFILE* out) { write("[|]"_u, output); } - u_fprintf(output, "^%S$", multiwords[i].c_str()); + u_fprintf(output, "^%S$", multiword[i].c_str()); } if(multiword.size() > 1) { @@ -297,7 +283,7 @@ TransferMult::transfer(InputFile& in, UFILE* out) break; case tt_blank: - ms.step(L' '); + ms.step(' '); tmpblank.push_back(¤t.getContent()); break; @@ -324,7 +310,7 @@ TransferMult::transfer(InputFile& in, UFILE* out) bool TransferMult::isDefaultWord(UString const &str) { - return str.find(" D<"); + return str.find(" D<"_u) != UString::npos; } vector @@ -334,18 +320,18 @@ TransferMult::acceptions(UString str) int low = 0; // removing '@' - if(str[0] == L'@') + if(str[0] == '@') { str = str.substr(1); } for(unsigned int i = 0, limit = str.size(); i != limit; i++) { - if(str[i] == L'\\') + if(str[i] == '\\') { i++; } - else if(str[i] == L'/') + else if(str[i] == '/') { UString new_word = str.substr(low, i-low); @@ -379,7 +365,7 @@ TransferMult::acceptions(UString str) vector result2; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { - if(result[i].find("__") != UString::npos) + if(result[i].find("__"_u) != UString::npos) { result2.push_back(result[i]); } @@ -403,7 +389,7 @@ TransferMult::writeMultiple(list >::iterator itwords, { if(multiple) { - output_string.append("[|]"); + output_string.append("[|]"_u); } output_string.append(acum); } @@ -417,8 +403,11 @@ TransferMult::writeMultiple(list >::iterator itwords, { for(unsigned int i = 0, limit = refword.size(); i != limit; i++) { - writeMultiple(itwords, itblanks, limitwords, - acum + "^" + refword[i] + "$", multiple || (i > 0)); + UString temp = acum; + temp += '^'; + temp += refword[i]; + temp += '$'; + writeMultiple(itwords, itblanks, limitwords, temp, multiple || (i > 0)); } } else @@ -428,8 +417,13 @@ TransferMult::writeMultiple(list >::iterator itwords, for(unsigned int i = 0, limit = refword.size(); i != limit; i++) { + UString temp = acum; + temp += '^'; + temp += refword[i]; + temp += '$'; + temp += refblank; writeMultiple(itwords, itblanks, limitwords, - acum + "^" + refword[i] + "$" + refblank, + temp, multiple || (i > 0)); } } @@ -452,7 +446,7 @@ TransferMult::applyRule() words.push_back(acceptions(tr.first)); } - output_string = ""; + output_string.clear(); writeMultiple(words.begin(), blanks.begin(), words.end()); if(output_string.find("[|]"_u) != UString::npos) { @@ -471,20 +465,20 @@ TransferMult::applyRule() void TransferMult::applyWord(UString const &word_str) { - ms.step(L'^'); + ms.step('^'); for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) { switch(word_str[i]) { - case L'\\': + case '\\': i++; ms.step(towlower(word_str[i]), any_char); break; - case L'<': + case '<': for(unsigned int j = i+1; j != limit; j++) { - if(word_str[j] == L'>') + if(word_str[j] == '>') { int symbol = alphabet(word_str.substr(i, j-i+1)); if(symbol) @@ -506,5 +500,5 @@ TransferMult::applyWord(UString const &word_str) break; } } - ms.step(L'$'); + ms.step('$'); } diff --git a/apertium/transfer_mult.h b/apertium/transfer_mult.h index c3ec5be..02d2963 100644 --- a/apertium/transfer_mult.h +++ b/apertium/transfer_mult.h @@ -40,13 +40,13 @@ private: Alphabet alphabet; MatchExe *me; MatchState ms; - map attr_items; - map variables; - map macros; - map> lists; - map> listslow; + map attr_items; + map variables; + map macros; + map> lists; + map> listslow; TransferWord **word; - string **blank; + UString **blank; Buffer input_buffer; vector tmpword; vector tmpblank; @@ -66,15 +66,15 @@ private: OutputType defaultAttrs; void destroy(); - void readData(InputFile& input); + void readData(FILE* input); void readBil(string const &filename); - string caseOf(string const &str); - string copycase(string const &source_word, string const &target_word); + UString caseOf(UString const &str); + UString copycase(UString const &source_word, UString const &target_word); - bool beginsWith(string const &str1, string const &str2) const; - bool endsWith(string const &str1, string const &str2) const; - string tolower(string const &str) const; - string tags(string const &str) const; + bool beginsWith(UString const &str1, UString const &str2) const; + bool endsWith(UString const &str1, UString const &str2) const; + UString tolower(UString const &str) const; + UString tags(UString const &str) const; UString readWord(InputFile& in); UString readBlank(InputFile& in); UString readUntil(InputFile& in, int const symbol) const; @@ -84,7 +84,7 @@ private: void writeMultiple(list >::iterator itwords, list::iterator itblanks, list >::const_iterator limitwords, - UString acum = "", bool multiple = false); + UString acum = ""_u, bool multiple = false); vector acceptions(UString str); bool isDefaultWord(UString const &str); public: diff --git a/apertium/transfer_word.cc b/apertium/transfer_word.cc index bca3232..fe87da7 100644 --- a/apertium/transfer_word.cc +++ b/apertium/transfer_word.cc @@ -40,7 +40,7 @@ queue_length(0) { } -TransferWord::TransferWord(string const &src, string const &tgt, string const &ref, string const &wblank, int queue) +TransferWord::TransferWord(UString const &src, UString const &tgt, UString const &ref, UString const &wblank, int queue) { init(src, tgt, ref, wblank); queue_length = queue; @@ -68,7 +68,7 @@ TransferWord::operator =(TransferWord const &o) } void -TransferWord::init(string const &src, string const &tgt, string const &ref, string const &wblank) +TransferWord::init(UString const &src, UString const &tgt, UString const &ref, UString const &wblank) { s_str = src; t_str = tgt; @@ -76,7 +76,7 @@ TransferWord::init(string const &src, string const &tgt, string const &ref, stri wb_str = wblank; } -string +UString TransferWord::source(ApertiumRE const &part, bool with_queue) { if(with_queue) @@ -89,7 +89,7 @@ TransferWord::source(ApertiumRE const &part, bool with_queue) } } -string +UString TransferWord::target(ApertiumRE const &part, bool with_queue) { if(with_queue) @@ -102,7 +102,7 @@ TransferWord::target(ApertiumRE const &part, bool with_queue) } } -string +UString TransferWord::reference(ApertiumRE const &part, bool with_queue) { if(with_queue) @@ -115,14 +115,14 @@ TransferWord::reference(ApertiumRE const &part, bool with_queue) } } -string +UString TransferWord::getWblank() { return wb_str; } bool -TransferWord::setSource(ApertiumRE const &part, string const &value, +TransferWord::setSource(ApertiumRE const &part, UString const &value, bool with_queue) { if(with_queue) @@ -131,7 +131,7 @@ TransferWord::setSource(ApertiumRE const &part, string const &value, } else { - string mystring = s_str.substr(0, s_str.size() - queue_length); + UString mystring = s_str.substr(0, s_str.size() - queue_length); bool ret = part.replace(mystring, value); s_str = mystring + s_str.substr(s_str.size() - queue_length); return ret; @@ -139,7 +139,7 @@ TransferWord::setSource(ApertiumRE const &part, string const &value, } bool -TransferWord::setTarget(ApertiumRE const &part, string const &value, +TransferWord::setTarget(ApertiumRE const &part, UString const &value, bool with_queue) { if(with_queue) @@ -148,7 +148,7 @@ TransferWord::setTarget(ApertiumRE const &part, string const &value, } else { - string mystring = t_str.substr(0, t_str.size() - queue_length); + UString mystring = t_str.substr(0, t_str.size() - queue_length); bool ret = part.replace(mystring, value); t_str = mystring + t_str.substr(t_str.size() - queue_length); return ret; @@ -156,7 +156,7 @@ TransferWord::setTarget(ApertiumRE const &part, string const &value, } bool -TransferWord::setReference(ApertiumRE const &part, string const &value, +TransferWord::setReference(ApertiumRE const &part, UString const &value, bool with_queue) { if(with_queue) @@ -165,7 +165,7 @@ TransferWord::setReference(ApertiumRE const &part, string const &value, } else { - string mystring = r_str.substr(0, r_str.size() - queue_length); + UString mystring = r_str.substr(0, r_str.size() - queue_length); bool ret = part.replace(mystring, value); r_str = mystring + r_str.substr(r_str.size() - queue_length); return ret; diff --git a/apertium/transfer_word.h b/apertium/transfer_word.h index 5e63133..c286edf 100644 --- a/apertium/transfer_word.h +++ b/apertium/transfer_word.h @@ -22,6 +22,7 @@ #include #include #include +#include using namespace std; @@ -34,22 +35,22 @@ private: /** * Source language word */ - string s_str; + UString s_str; /** * Target language word */ - string t_str; + UString t_str; /** * Reference word */ - string r_str; + UString r_str; /** * Wordbound blank */ - string wb_str; + UString wb_str; /** * Queue length @@ -73,7 +74,7 @@ private: * @param part regular expression to match/access * @return reference to matched/accessed string */ - string access(string const &str, ApertiumRE const &part); + UString access(UString const &str, ApertiumRE const &part); /** * Assings a value to the source/target/reference side of a word using the @@ -82,7 +83,7 @@ private: * @param part regular expression to match/access * @param value the string to be assigned */ - void assign(string &str, ApertiumRE const &part, string const &value); + void assign(UString &str, ApertiumRE const &part, UString const &value); public: /** @@ -108,7 +109,7 @@ public: * @param wblank wordbound blank * @param queue queue lenght */ - TransferWord(string const &src, string const &tgt, string const &ref, string const &wblank, int queue = 0); + TransferWord(UString const &src, UString const &tgt, UString const &ref, UString const &wblank, int queue = 0); /** * Assignment operator @@ -125,7 +126,7 @@ public: * @param ref reference word * @param wblank wordbound blank */ - void init(string const &src, string const &tgt, string const &ref, string const &wblank); + void init(UString const &src, UString const &tgt, UString const &ref, UString const &wblank); /** * Reference a source language word part @@ -133,7 +134,7 @@ public: * @param with_queue access taking into account the queue * @returns reference to the part of string matched */ - string source(ApertiumRE const &part, bool with_queue = true); + UString source(ApertiumRE const &part, bool with_queue = true); /** * Reference a target language word part @@ -141,7 +142,7 @@ public: * @param with_queue access taking into account the queue * @returns reference to the part of string matched */ - string target(ApertiumRE const &part, bool with_queue = true); + UString target(ApertiumRE const &part, bool with_queue = true); /** * Reference the reference word part @@ -149,13 +150,13 @@ public: * @param with_queue access taking into account the queue * @returns reference to the part of string matched */ - string reference(ApertiumRE const &part, bool with_queue = true); + UString reference(ApertiumRE const &part, bool with_queue = true); /** * Reference the wordbound blank part * @returns reference to the wordbound blank */ - string getWblank(); + UString getWblank(); /** * Sets a value for a source language word part @@ -164,7 +165,7 @@ public: * @param with_queue access taking or not into account the queue * @returns whether part matched */ - bool setSource(ApertiumRE const &part, string const &value, + bool setSource(ApertiumRE const &part, UString const &value, bool with_queue = true); /** @@ -174,7 +175,7 @@ public: * @param with_queue access taking or not into account the queue * @returns whether part matched */ - bool setTarget(ApertiumRE const &part, string const &value, + bool setTarget(ApertiumRE const &part, UString const &value, bool with_queue = true); /** @@ -184,7 +185,7 @@ public: * @param with_queue access taking or not into account the queue * @returns whether part matched */ - bool setReference(ApertiumRE const &part, string const &value, + bool setReference(ApertiumRE const &part, UString const &value, bool with_queue = true); }; diff --git a/apertium/trx_reader.cc b/apertium/trx_reader.cc index be0d4ac..6d811de 100644 --- a/apertium/trx_reader.cc +++ b/apertium/trx_reader.cc @@ -44,21 +44,21 @@ TRXReader::insertLemma(int const base, UString const &lemma) { retval = td.getTransducer().insertSingleTransduction(any_char, retval); td.getTransducer().linkStates(retval, retval, any_char); - int another = td.getTransducer().insertSingleTransduction(L'\\', retval); + int another = td.getTransducer().insertSingleTransduction('\\', retval); td.getTransducer().linkStates(another, retval, any_char); } else { for(unsigned int i = 0, limit = lemma.size(); i != limit; i++) { - if(lemma[i] == L'\\') + if(lemma[i] == '\\') { - retval = td.getTransducer().insertSingleTransduction(L'\\', retval); + retval = td.getTransducer().insertSingleTransduction('\\', retval); i++; retval = td.getTransducer().insertSingleTransduction(int(lemma[i]), retval); } - else if(lemma[i] == L'*') + else if(lemma[i] == '*') { retval = td.getTransducer().insertSingleTransduction(any_char, retval); td.getTransducer().linkStates(retval, retval, any_char); @@ -83,7 +83,7 @@ TRXReader::insertTags(int const base, UString const &tags) { for(unsigned int i = 0, limit = tags.size(); i < limit; i++) { - if(tags[i] == L'*') + if(tags[i] == '*') { retval = td.getTransducer().insertSingleTransduction(any_tag, retval); td.getTransducer().linkStates(retval, retval, any_tag); @@ -94,7 +94,7 @@ TRXReader::insertTags(int const base, UString const &tags) UString symbol = "<"; for(unsigned int j = i; j != limit; j++) { - if(tags[j] == L'.') + if(tags[j] == '.') { symbol.append(tags.substr(i, j-i)); i = j; @@ -107,7 +107,7 @@ TRXReader::insertTags(int const base, UString const &tags) symbol.append(tags.substr(i)); i = limit; } - symbol += L'>'; + symbol += '>'; td.getAlphabet().includeSymbol(symbol); retval = td.getTransducer().insertSingleTransduction(td.getAlphabet()(symbol), retval); } @@ -253,12 +253,12 @@ TRXReader::procRules() it != limit; it++) { // mark of begin of word - int tmp = td.getTransducer().insertSingleTransduction(L'^', *it); + int tmp = td.getTransducer().insertSingleTransduction('^', *it); if(*it != td.getTransducer().getInitial()) { // insert optional blank between two words - int alt = td.getTransducer().insertSingleTransduction(L' ', *it); - td.getTransducer().linkStates(alt, tmp, L'^'); + int alt = td.getTransducer().insertSingleTransduction(' ', *it); + td.getTransducer().linkStates(alt, tmp, '^'); } // insert word @@ -266,7 +266,7 @@ TRXReader::procRules() tmp = insertTags(tmp, range.first->second.tags); // insert mark of end of word - tmp = td.getTransducer().insertSingleTransduction(L'$', tmp); + tmp = td.getTransducer().insertSingleTransduction('$', tmp); // set as alive_state alive_states_new.insert(tmp); @@ -566,14 +566,14 @@ TRXReader::insertAttrItem(UString const &name, UString const &tags) { if(td.getAttrItems()[name].size() != 0) { - td.getAttrItems()[name] += L'|'; + td.getAttrItems()[name] += '|'; } td.getAttrItems()[name] += '<'; for(unsigned int i = 0, limit = tags.size(); i != limit; i++) { - if(tags[i] == L'.') + if(tags[i] == '.') { td.getAttrItems()[name].append("><"); } @@ -582,6 +582,6 @@ TRXReader::insertAttrItem(UString const &name, UString const &tags) td.getAttrItems()[name] += tags[i]; } } - td.getAttrItems()[name] += L'>'; + td.getAttrItems()[name] += '>'; } diff --git a/apertium/unigram_tagger.cc b/apertium/unigram_tagger.cc index a9ff362..8fb543b 100644 --- a/apertium/unigram_tagger.cc +++ b/apertium/unigram_tagger.cc @@ -240,7 +240,7 @@ UnigramTagger::model3_score(const Analysis &Analysis_) } void -UnigramTagger::tag(Stream &Input, std::wostream &Output) +UnigramTagger::tag(Stream &Input, std::ostream &Output) { while (true) { StreamedType StreamedType_ = Input.get(); @@ -266,7 +266,7 @@ UnigramTagger::tag(Stream &Input, std::wostream &Output) } void -UnigramTagger::tag(const LexicalUnit &LexicalUnit_, std::wostream &Output) +UnigramTagger::tag(const LexicalUnit &LexicalUnit_, std::ostream &Output) { Optional TheAnalysis; long double max_score = 0; diff --git a/apertium/unigram_tagger.h b/apertium/unigram_tagger.h index 1101f94..70737bc 100644 --- a/apertium/unigram_tagger.h +++ b/apertium/unigram_tagger.h @@ -45,7 +45,7 @@ enum UnigramTaggerModel { class UnigramTagger : public StreamTagger { private: long double model3_score(const Analysis &Analysis_); - void tag(const LexicalUnit &LexicalUnit_, std::wostream &Output); + void tag(const LexicalUnit &LexicalUnit_, std::ostream &Output); std::stringstream score_DEBUG; protected: @@ -94,7 +94,7 @@ public: UnigramTaggerModel getModel(); void serialise(std::ostream& o) const; void deserialise(std::istream& s); - void tag(Stream& Input, std::wostream& Output); + void tag(Stream& Input, std::ostream& Output); void train(Stream& TaggedCorpus); }; } diff --git a/apertium/xml_walk_util.cc b/apertium/xml_walk_util.cc new file mode 100644 index 0000000..9a122ec --- /dev/null +++ b/apertium/xml_walk_util.cc @@ -0,0 +1,60 @@ +#include + +children::children(const xmlNode* node_) + : node(node_), cur(node->children) +{} + +children::children(const children& it) + : node(it.node), cur(it.cur) +{} + +children::~children() +{} // we don't own the pointers, so we don't delete them + +children& +children::operator++() +{ + if (node && cur) { + cur = cur->next; + while (cur && cur->type != XML_ELEMENT_NODE) { + cur = cur->next; + } + } +} + +children +children::begin() +{ + return children(node); +} + +children +children::end() +{ + children ret(node); + ret.cur = nullptr; + return ret; +} + +bool +children::operator!=(const children& other) const +{ + return node != other.node || cur != other.cur; +} + +bool +children::operator==(const children& other) const +{ + return node == other.node && cur == other.cur; +} + +UString +getattr(xmlNode* node, const char* attr) +{ + for (xmlAttr* i = node->properties; i != NULL; i = i->next) { + if (!xmlStrcmp(i->name, (const xmlChar*) attr)) { + return to_ustring((const char*) i->children->content); + } + } + return ""_u; +} diff --git a/apertium/xml_walk_util.h b/apertium/xml_walk_util.h new file mode 100644 index 0000000..d715ebb --- /dev/null +++ b/apertium/xml_walk_util.h @@ -0,0 +1,28 @@ +#ifndef _XML_WALK_UTIL_ +#define _XML_WALK_UTIL_ + +#include + +#include +#include + +class children +{ +private: + xmlNode* node; + xmlNode* cur; +public: + children(const xmlNode* node); + children(const children& it); + + children& operator++(); + children begin(); + children end(); + inline xmlNode* operator*() const { return cur; } + bool operator!=(const children& other) const; + bool operator==(const children& other) const; +}; + +UString getattr(xmlNode* node, const char* attr); + +#endif