commit f135c78091747f768b0f18681529f20586d73bf9 Author: Daniel Swanson Date: Tue Jun 8 18:08:16 2021 -0500 transfer tests pass diff --git a/apertium/apertium_re.cc b/apertium/apertium_re.cc index 66426c7..1ba71a3 100644 --- a/apertium/apertium_re.cc +++ b/apertium/apertium_re.cc @@ -56,7 +56,7 @@ ApertiumRE::compile(UString const &str) UnicodeString s = str.c_str(); UErrorCode err = U_ZERO_ERROR; re = RegexPattern::compile(s, UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, err); - if(err != U_ZERO_ERROR) { + if(!U_SUCCESS(err)) { cerr << "Error: unable to compile regular expression '" << str << "'." << endl; exit(EXIT_FAILURE); } @@ -86,7 +86,7 @@ ApertiumRE::match(UString const &str) const UErrorCode err = U_ZERO_ERROR; RegexMatcher* m = re->matcher(s, err); - if (err != U_ZERO_ERROR) { + if (!U_SUCCESS(err)) { cerr << "Error: Unable to apply regexp" << endl; exit(EXIT_FAILURE); } @@ -96,8 +96,9 @@ ApertiumRE::match(UString const &str) const } UString ret = m->group(err).getTerminatedBuffer(); - if (err != U_ZERO_ERROR) { + if (!U_SUCCESS(err)) { cerr << "Error: Unable to extract substring from regexp match" << endl; + cerr << "error code: " << u_errorName(err) << endl; exit(EXIT_FAILURE); } @@ -116,7 +117,7 @@ ApertiumRE::replace(UString &str, UString const &value) const UErrorCode err = U_ZERO_ERROR; RegexMatcher* m = re->matcher(s, err); - if (err != U_ZERO_ERROR) { + if (!U_SUCCESS(err)) { cerr << "Error: Unable to apply regexp" << endl; exit(EXIT_FAILURE); } diff --git a/apertium/interchunk.cc b/apertium/interchunk.cc index 249a235..5d7a052 100644 --- a/apertium/interchunk.cc +++ b/apertium/interchunk.cc @@ -63,10 +63,10 @@ Interchunk::evalCachedString(xmlNode* element) } } break; - + case ti_var: return variables[ti.getContent()]; - + case ti_lit_tag: case ti_lit: return ti.getContent(); @@ -178,7 +178,7 @@ Interchunk::processOut(xmlNode *localroot) write(evalString(i), output); } } - + in_out = false; } @@ -407,7 +407,7 @@ Interchunk::readToken(InputFile& in) content += in.get(); } else if(val2 == '}') { UChar32 val3 = in.peek(); - + content += '}'; if(val3 == '$') { break; @@ -469,27 +469,25 @@ Interchunk::interchunk(InputFile& in, UFILE* out) { if(lastrule != NULL) { - applyRule(); - input_buffer.setPos(last); + applyRule(); + input_buffer.setPos(last); } else { - if(tmpword.size() != 0) - { - u_fprintf(output, "^%S$", tmpword[0]->c_str()); - tmpword.clear(); - input_buffer.setPos(last); - input_buffer.next(); - last = input_buffer.getPos(); - ms.init(me->getInitial()); - } - else if(tmpblank.size() != 0) - { - write(*tmpblank[0], output); - tmpblank.clear(); - last = input_buffer.getPos(); - ms.init(me->getInitial()); - } + if(tmpword.size() != 0) { + u_fprintf(output, "^%S$", tmpword[0]->c_str()); + tmpword.clear(); + input_buffer.setPos(last); + input_buffer.next(); + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + else if(tmpblank.size() != 0) { + write(*tmpblank[0], output); + tmpblank.clear(); + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } } } int val = ms.classifyFinals(me->getFinals()); @@ -498,7 +496,7 @@ Interchunk::interchunk(InputFile& in, UFILE* out) size_t lastrule_line = rule_lines[val-1]; lastrule = rule_map[val-1]; last = input_buffer.getPos(); - + last_lword = tmpword.size(); if(trace) diff --git a/apertium/perceptron_spec.h b/apertium/perceptron_spec.h index def1e2e..a52d8b3 100644 --- a/apertium/perceptron_spec.h +++ b/apertium/perceptron_spec.h @@ -31,7 +31,7 @@ typedef std::set VMSet; class PerceptronSpec { public: - typedef std::vector FeatureDefn; + typedef std::vector FeatureDefn; static void printFeature(std::ostream &out, const PerceptronSpec::FeatureDefn &feat_defn); friend std::ostream& operator<<(std::ostream &out, PerceptronSpec const &pt); PerceptronSpec(); @@ -440,7 +440,7 @@ private: bool is_feature; const FeatureDefn &feat; const size_t &feat_idx; - std::vector::const_iterator bytecode_iter; + std::vector::const_iterator bytecode_iter; const TaggedSentence &tagged; const Sentence &untagged; int token_idx; diff --git a/apertium/transfer_base.cc b/apertium/transfer_base.cc index f48511f..0ee783e 100644 --- a/apertium/transfer_base.cc +++ b/apertium/transfer_base.cc @@ -9,7 +9,7 @@ using namespace std; TransferBase::TransferBase() : me(nullptr), doc(nullptr), root_element(nullptr), - lword(0), nwords(0), output(nullptr), + lword(0), lastrule(nullptr), nwords(0), output(nullptr), any_char(0), any_tag(0), in_let_var(false), in_out(false), null_flush(false), internal_null_flush(false), trace(false) {} @@ -70,12 +70,17 @@ TransferBase::read(const char* transferfile, const char* datafile) me = new MatchExe(t, finals); // attr_items - Compression::string_read(in); // formerly PCRE version, now blank + bool icu = Compression::string_read(in).empty(); for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) { UString const cad_k = Compression::string_read(in); attr_items[cad_k].read(in); UString fallback = Compression::string_read(in); + if (!icu && cad_k == "chname"_u) { + // chname was previously "({([^/]+)\\/)" + // which is fine for PCRE, but ICU chokes on the unmatched bracket + fallback = "(\\{([^/]+)\\/)"_u; + } attr_items[cad_k].compile(fallback); } diff --git a/apertium/transfer_data.cc b/apertium/transfer_data.cc index ac1d0c4..f350891 100644 --- a/apertium/transfer_data.cc +++ b/apertium/transfer_data.cc @@ -51,7 +51,7 @@ TransferData::TransferData() attr_items["lemh"_u] = "^(([^<#]|\"\\<\"|\"\\#\")+)"_u; attr_items["whole"_u] = "(.+)"_u; attr_items["tags"_u] = "((<[^>]+>)+)"_u; - attr_items["chname"_u] = "({([^/]+)\\/)"_u; // includes delimiters { and / !!! + attr_items["chname"_u] = "(\\{([^/]+)\\/)"_u; // includes delimiters { and / !!! attr_items["chcontent"_u] = "(\\{.+)"_u; attr_items["content"_u] = "(\\{.+)"_u; } diff --git a/apertium/xml_walk_util.cc b/apertium/xml_walk_util.cc index be2b10e..b9e6dde 100644 --- a/apertium/xml_walk_util.cc +++ b/apertium/xml_walk_util.cc @@ -2,7 +2,11 @@ children::children(xmlNode* node_) : node(node_), cur(node->children) -{} +{ + while (cur && cur->type != XML_ELEMENT_NODE) { + cur = cur->next; + } +} children::children(const children& it) : node(it.node), cur(it.cur) diff --git a/tests/data/nno-nob.t2x.bin b/tests/data/nno-nob.t2x.bin index c03e145..6ea1978 100644 Binary files a/tests/data/nno-nob.t2x.bin and b/tests/data/nno-nob.t2x.bin differ diff --git a/tests/tagger/test_find_similar_ambiguity_classes.cc b/tests/tagger/test_find_similar_ambiguity_classes.cc index a6e299e..0554883 100644 --- a/tests/tagger/test_find_similar_ambiguity_classes.cc +++ b/tests/tagger/test_find_similar_ambiguity_classes.cc @@ -1,3 +1,4 @@ +#include #include "apertium/utf_converter.h" #include "apertium/tagger_utils.h" #include "apertium/tagger_data_hmm.h" @@ -6,32 +7,33 @@ #include #include -void print_ambiguity_class(const vector &array_tags, const set &abgset) +void print_ambiguity_class(const vector &array_tags, const set &abgset) { unsigned int j; set::const_iterator abgseti; for (abgseti=abgset.begin(), j=0; abgseti!=abgset.end(); abgseti++, j++) { - wcout << array_tags[*abgseti]; + cout << array_tags[*abgseti]; if (j < abgset.size() - 1) { - wcout << " "; + cout << " "; } } } void find_similar_ambiguity_class_io(TaggerData &td) { - vector &array_tags = td.getArrayTags(); - wstring line = L""; - getline(wcin, line, L'\n'); + vector &array_tags = td.getArrayTags(); + string line_ = ""; + getline(cin, line_, '\n'); + UString line = to_ustring(line_.c_str()); - wstringstream line_stream(line); + basic_istringstream line_stream(line); set ambiguity_class; - wstring tag_name; + UString tag_name; while (line_stream >> tag_name) { - vector::iterator it; + vector::iterator it; it = find(array_tags.begin(), array_tags.end(), tag_name); if (it == array_tags.end()) { - wcerr << L"Tag not in model: " << tag_name << L'\n'; + cerr << "Tag not in model: " << tag_name << '\n'; exit(-3); } ambiguity_class.insert(it - array_tags.begin());