commit 11e4b47c53c7a7c2194e61c76de6619f5b8ef93c Author: Daniel Swanson Date: Fri Jun 11 16:34:18 2021 -0500 use ICU diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index dadee86..35cf6df 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -12,7 +12,7 @@ jobs: sudo apt-get -qy update sudo apt-get -qfy install wget ca-certificates wget -q https://apertium.projectjj.com/apt/install-nightly.sh -O - | sudo bash - sudo apt-get -qfy install --no-install-recommends build-essential autoconf autotools-dev lttoolbox-dev pkg-config libxml2-dev + sudo apt-get -qfy install --no-install-recommends build-essential autoconf autotools-dev lttoolbox-dev pkg-config libxml2-dev libutfcpp-dev - name: autoreconf run: autoreconf -fvi - name: configure diff --git a/.travis.yml b/.travis.yml index 201ecc3..0a34dcb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ compiler: before_install: - wget https://apertium.projectjj.com/apt/install-nightly.sh -O - | sudo bash - - sudo apt-get install -qfy build-essential autoconf autotools-dev lttoolbox-dev pkg-config libxml2-dev + - sudo apt-get install -qfy build-essential autoconf autotools-dev lttoolbox-dev pkg-config libxml2-dev libutfcpp-dev script: - $CXX --version diff --git a/configure.ac b/configure.ac index 4aea431..1a929eb 100644 --- a/configure.ac +++ b/configure.ac @@ -19,8 +19,12 @@ PKG_CHECK_MODULES([LIBXML], [libxml-2.0]) AC_SUBST(LIBXML_CFLAGS) AC_SUBST(LIBXML_LIBS) -CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS" -LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS" +PKG_CHECK_MODULES([ICU], [icu-i18n icu-io icu-uc]) +AC_SUBST(ICU_CFLAGS) +AC_SUBST(ICU_LIBS) + +CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS" +LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS $ICU_LIBS" # Checks for highest supported C++ standard AC_LANG(C++) diff --git a/src/anaphora.cc b/src/anaphora.cc index 66366d0..86fb976 100644 --- a/src/anaphora.cc +++ b/src/anaphora.cc @@ -23,7 +23,7 @@ #include "pattern_arx.h" #include - +#include #include #include @@ -36,22 +36,9 @@ using namespace std; -FILE * open_input(string const &filename) -{ - FILE *input = fopen(filename.c_str(), "r"); - if(!input) - { - wcerr << "Error: can't open input file '"; - wcerr << filename.c_str() << "'." << endl; - exit(EXIT_FAILURE); - } - - return input; -} - -FILE * open_output(string const &filename) +UFILE * open_output(string const &filename) { - FILE *output = fopen(filename.c_str(), "w"); + UFILE *output = u_fopen(filename.c_str(), "w", NULL, NULL); if(!output) { wcerr << "Error: can't open output file '"; @@ -132,22 +119,23 @@ int main(int argc, char **argv) } } - FILE *input = stdin, *output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); switch(argc - optind) { case 1: // if only one argument left, it has to be the arx_file - arxFileName = argv[argc - 1]; + arxFileName = argv[argc - 1]; break; case 2: // if two arguments, it has to be arx_file and input_file arxFileName = argv[argc - 2]; - input = open_input(argv[argc - 1]); + input.open_or_exit(argv[argc - 1]); break; case 3: // if three arguments, it has to be arx_file, input file and output file arxFileName = argv[argc - 3]; - input = open_input(argv[argc - 2]); + input.open_or_exit(argv[argc - 2]); output = open_output(argv[argc - 1]); break; @@ -156,20 +144,20 @@ int main(int argc, char **argv) break; } - wchar_t input_char; + UChar32 input_char; - wstring input_stream; + UString input_stream; - wstring final_ref; + UString final_ref; Scoring score_module; unsigned int gen_id = 0; - wstring sl_form; - wstring tl_form; - vector sl_tags; - vector tl_tags; - wstring sl_lemma; - wstring tl_lemma; + UString sl_form; + UString tl_form; + vector sl_tags; + vector tl_tags; + UString sl_lemma; + UString tl_lemma; ParseArx arx_file; int parse_arx_retval = arx_file.parseDoc(arxFileName); @@ -179,22 +167,22 @@ int main(int argc, char **argv) int flag_LU = 0; - input_char = fgetwc(input); + input_char = input.get(); - while(input_char!=EOF) + while(input_char!=U_EOF) { - if(nullFlush && input_char == L'\0') + if(nullFlush && input_char == '\0') { - fputwc(input_char, output); - fflush(output); + u_fputc(input_char, output); + u_fflush(output); input_stream.clear(); sl_form.clear(); tl_form.clear(); sl_tags.clear(); tl_tags.clear(); - sl_lemma.clear(); - tl_lemma.clear(); + sl_lemma.clear(); + tl_lemma.clear(); gen_id = 0; score_module.clear(); @@ -203,24 +191,24 @@ int main(int argc, char **argv) flag_LU = 0; } - else if(input_char == L'\\') + else if(input_char == '\\') { if(flag_LU == 0) { - fputwc(input_char, output); + u_fputc(input_char, output); - input_char = fgetwc(input); + input_char = input.get(); - fputwc(input_char, output); + u_fputc(input_char, output); } else { input_stream.push_back(input_char); - fputwc(input_char, output); + u_fputc(input_char, output); - input_char = fgetwc(input); + input_char = input.get(); - fputwc(input_char, output); + u_fputc(input_char, output); input_stream.push_back(input_char); } } @@ -228,19 +216,19 @@ int main(int argc, char **argv) { if(flag_LU == 0) { - fputwc(input_char, output); + u_fputc(input_char, output); - if(input_char == L'^') + if(input_char == '^') flag_LU = 1; } else if(flag_LU == 1) { - if(input_char == L'$') + if(input_char == '$') { gen_id++; - fputwc(L'/', output); //for adding ref + u_fputc('/', output); //for adding ref flag_LU = 0; @@ -250,8 +238,8 @@ int main(int argc, char **argv) tl_tags = LU.get_tl_tags(); sl_form = LU.get_sl_form(); sl_tags = LU.get_sl_tags(); - sl_lemma = LU.get_sl_lemma(); - tl_lemma = LU.get_tl_lemma(); + sl_lemma = LU.get_sl_lemma(); + tl_lemma = LU.get_tl_lemma(); if(!tl_form.empty()) { @@ -265,7 +253,7 @@ int main(int argc, char **argv) { final_ref = score_module.get_antecedent(debug_flag); - fputws(final_ref.c_str(), output); + write(final_ref, output); } } @@ -276,15 +264,15 @@ int main(int argc, char **argv) input_stream.push_back(input_char); } - fputwc(input_char, output); + u_fputc(input_char, output); } } - input_char = fgetwc(input); + input_char = input.get(); } - //fclose(fin); + u_fclose(output); return 0; } diff --git a/src/parse_arx.cc b/src/parse_arx.cc index 3d96172..a2ab266 100644 --- a/src/parse_arx.cc +++ b/src/parse_arx.cc @@ -28,22 +28,20 @@ #include #include -void print_tags(vector input) +void print_tags(vector input) { - for (size_t i = 0; i < input.size(); ++i) - { - wcerr << input[i]; - wcerr << " "; + for (auto& it : input) { + cerr << it << " "; } } -vector ParseArx::parseTags (wstring tags) +vector ParseArx::parseTags (UString tags) { - vector temp_tags_list; + vector temp_tags_list; - wstring temptag; + UString temptag; - for (std::wstring::iterator i = tags.begin(); i != tags.end(); ++i) + for (UString::iterator i = tags.begin(); i != tags.end(); ++i) { if(*i == '\\') { @@ -68,12 +66,12 @@ vector ParseArx::parseTags (wstring tags) return temp_tags_list; } -void ParseArx::parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, wstring parameter_type, wstring parameter_name) -//parameter_name: detpos, verbal, etc., parameter_type: anaphor, antecedent, etc. +void ParseArx::parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, UString parameter_type, UString parameter_name) +//parameter_name: detpos, verbal, etc., parameter_type: anaphor, antecedent, etc. { xmlChar *Attr; cur = cur->xmlChildrenNode; - + item temp_item; while (cur != NULL) @@ -81,27 +79,28 @@ void ParseArx::parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, wstring parame temp_item.has_tags.clear(); temp_item.exclude_tags.clear(); temp_item.lemma.clear(); - + if ((!xmlStrcmp(cur->name, (const xmlChar *)"parameter-item"))) { + Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); if (Attr) { - temp_item.has_tags = parseTags(XMLParseUtil::towstring(Attr)); + temp_item.has_tags = parseTags(to_ustring((const char*)Attr)); } - + Attr = xmlGetProp(cur, (const xmlChar *)"exclude-tags"); if (Attr) { - temp_item.exclude_tags = parseTags(XMLParseUtil::towstring(Attr)); + temp_item.exclude_tags = parseTags(to_ustring((const char*)Attr)); } - + Attr = xmlGetProp(cur, (const xmlChar *)"lemma"); if (Attr) { - temp_item.lemma = XMLParseUtil::towstring(Attr); + temp_item.lemma = to_ustring((const char*)Attr); } - + parameters[parameter_type][parameter_name].push_back(temp_item); xmlFree(Attr); @@ -112,9 +111,9 @@ void ParseArx::parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, wstring parame return; } -void ParseArx::parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, wstring parameter_name) +void ParseArx::parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, UString parameter_name) { - wstring parameter_type; + UString parameter_type; cur = cur->xmlChildrenNode; @@ -122,7 +121,7 @@ void ParseArx::parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, wstring param { if(cur->type == XML_ELEMENT_NODE) { - parameter_type = XMLParseUtil::towstring(cur->name); + parameter_type = to_ustring((const char*)cur->name); parseParameterItem(doc, cur, parameter_type, parameter_name); } @@ -135,7 +134,7 @@ void ParseArx::parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, wstring param void ParseArx::parseParameters (xmlDocPtr doc, xmlNodePtr cur) { xmlChar *parameter_name; - wstring parameter_type; + UString parameter_type; cur = cur->xmlChildrenNode; while (cur != NULL) @@ -144,14 +143,14 @@ void ParseArx::parseParameters (xmlDocPtr doc, xmlNodePtr cur) { parameter_name = xmlGetProp(cur, (const xmlChar *)"n"); - parseParameterTypes(doc,cur, XMLParseUtil::towstring(parameter_name)); + parseParameterTypes(doc,cur, to_ustring((const char*)parameter_name)); xmlFree(parameter_name); } else if ((!xmlStrcmp(cur->name, (const xmlChar *)"delimiter"))) { - parameter_type = XMLParseUtil::towstring(cur->name); + parameter_type = to_ustring((const char*)cur->name); - parseParameterItem(doc, cur, parameter_type, L"default"); + parseParameterItem(doc, cur, parameter_type, "default"_u); } cur = cur->next; @@ -159,7 +158,7 @@ void ParseArx::parseParameters (xmlDocPtr doc, xmlNodePtr cur) return; } -void ParseArx::parseCatItem (xmlDocPtr doc, xmlNodePtr cur, wstring cat_name) +void ParseArx::parseCatItem (xmlDocPtr doc, xmlNodePtr cur, UString cat_name) { xmlChar *Attr; cur = cur->xmlChildrenNode; @@ -171,27 +170,27 @@ void ParseArx::parseCatItem (xmlDocPtr doc, xmlNodePtr cur, wstring cat_name) temp_item.has_tags.clear(); temp_item.exclude_tags.clear(); temp_item.lemma.clear(); - + if ((!xmlStrcmp(cur->name, (const xmlChar *)"cat-item"))) { Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); if (Attr) { - temp_item.has_tags = parseTags(XMLParseUtil::towstring(Attr)); + temp_item.has_tags = parseTags(to_ustring((const char*)Attr)); } - + Attr = xmlGetProp(cur, (const xmlChar *)"exclude-tags"); if (Attr) { - temp_item.exclude_tags = parseTags(XMLParseUtil::towstring(Attr)); + temp_item.exclude_tags = parseTags(to_ustring((const char*)Attr)); } - + Attr = xmlGetProp(cur, (const xmlChar *)"lemma"); if (Attr) { - temp_item.lemma = XMLParseUtil::towstring(Attr); + temp_item.lemma = to_ustring((const char*)Attr); } - + cats[cat_name].push_back(temp_item); xmlFree(Attr); @@ -213,7 +212,7 @@ void ParseArx::parseCats (xmlDocPtr doc, xmlNodePtr cur) { Attr = xmlGetProp(cur, (const xmlChar *)"n"); - parseCatItem(doc,cur, XMLParseUtil::towstring(Attr)); + parseCatItem(doc,cur, to_ustring((const char*)Attr)); xmlFree(Attr); } @@ -236,7 +235,7 @@ vector ParseArx::parsePatternItem (xmlDocPtr doc, xmlNodePtr c markable_pattern temp; Attr = xmlGetProp(cur, (const xmlChar *)"n"); - temp.name = XMLParseUtil::towstring(Attr); + temp.name = to_ustring((const char*)Attr); xmlFree(Attr); @@ -260,7 +259,7 @@ vector ParseArx::parsePatternItem (xmlDocPtr doc, xmlNodePtr c return temp_pattern; } -void ParseArx::parsePatterns (xmlDocPtr doc, xmlNodePtr cur, wstring markable_name) +void ParseArx::parsePatterns (xmlDocPtr doc, xmlNodePtr cur, UString markable_name) { xmlChar *Attr; @@ -279,14 +278,14 @@ void ParseArx::parsePatterns (xmlDocPtr doc, xmlNodePtr cur, wstring markable_na { Attr = xmlGetProp(cur, (const xmlChar *)"n"); - wstring score_ws = XMLParseUtil::towstring(Attr); - int score_int = std::stoi(score_ws); + UString score_ws = to_ustring((const char*)Attr); + int score_int = stoi(score_ws); xmlChar *parameter_name = xmlGetProp(cur, (const xmlChar *)"parameter"); if (parameter_name) { - wstring parameter_name_ws = XMLParseUtil::towstring(parameter_name); + UString parameter_name_ws = to_ustring((const char*)parameter_name); parameter_markables_score[parameter_name_ws][markable_name] = score_int; } else @@ -311,7 +310,7 @@ void ParseArx::parseMarkables (xmlDocPtr doc, xmlNodePtr cur) { Attr = xmlGetProp(cur, (const xmlChar *)"n"); - parsePatterns(doc,cur, XMLParseUtil::towstring(Attr)); + parsePatterns(doc,cur, to_ustring((const char*)Attr)); xmlFree(Attr); } @@ -380,22 +379,22 @@ parameters_datatype ParseArx::get_parameters() return parameters; } -unordered_map ParseArx::get_cats() +unordered_map ParseArx::get_cats() { return cats; } -unordered_map ParseArx::get_markables() +unordered_map ParseArx::get_markables() { return markables; } -unordered_map ParseArx::get_all_markables_score() +unordered_map ParseArx::get_all_markables_score() { return all_markables_score; } -unordered_map ParseArx::get_parameter_markables_score(wstring parameter_name) +unordered_map ParseArx::get_parameter_markables_score(UString parameter_name) { return parameter_markables_score[parameter_name]; } diff --git a/src/parse_arx.h b/src/parse_arx.h index eefe6da..4362400 100644 --- a/src/parse_arx.h +++ b/src/parse_arx.h @@ -31,57 +31,57 @@ using namespace std; struct item { //for cat-item and parameter-item - vector has_tags; - vector exclude_tags; - wstring lemma; + vector has_tags; + vector exclude_tags; + UString lemma; }; typedef vector acceptable_tags; struct markable_pattern { - wstring name; + UString name; int head; }; typedef vector< vector > acceptable_patterns; -typedef unordered_map< wstring, unordered_map > parameters_datatype; +typedef unordered_map< UString, unordered_map > parameters_datatype; -void print_tags(vector< wstring > input); +void print_tags(vector< UString > input); class ParseArx { private: parameters_datatype parameters; //parameter type mapped to its parameter types, i.e. anaphor/antecedent mapped to a map which contains n="detpos" and n="verbal", etc. - unordered_map cats; //cat name mapped to acceptable tag lists + unordered_map cats; //cat name mapped to acceptable tag lists - unordered_map markables; //markable name mapped to acceptable pattern lists. Also each pattern has a head == 1 - unordered_map all_markables_score; //markable name mapped to score of markable, will be applied on all anaphors - unordered_map > parameter_markables_score; //parameter name mapped to a mapping of markable and score (when parameter name is explicitly mentioned in arx) + unordered_map markables; //markable name mapped to acceptable pattern lists. Also each pattern has a head == 1 + unordered_map all_markables_score; //markable name mapped to score of markable, will be applied on all anaphors + unordered_map > parameter_markables_score; //parameter name mapped to a mapping of markable and score (when parameter name is explicitly mentioned in arx) public: int parseDoc(char *docname); - void parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, wstring parameter_name); - void parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, wstring parameter_name, wstring parameter_type); + void parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, UString parameter_name); + void parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, UString parameter_name, UString parameter_type); void parseParameters (xmlDocPtr doc, xmlNodePtr cur); void parseCats (xmlDocPtr doc, xmlNodePtr cur); - void parseCatItem (xmlDocPtr doc, xmlNodePtr cur, wstring cat_name); + void parseCatItem (xmlDocPtr doc, xmlNodePtr cur, UString cat_name); void parseMarkables (xmlDocPtr doc, xmlNodePtr cur); - void parsePatterns (xmlDocPtr doc, xmlNodePtr cur, wstring markable_name); + void parsePatterns (xmlDocPtr doc, xmlNodePtr cur, UString markable_name); vector parsePatternItem (xmlDocPtr doc, xmlNodePtr cur); - vector parseTags (wstring tags); + vector parseTags (UString tags); parameters_datatype get_parameters(); - unordered_map get_cats(); + unordered_map get_cats(); - unordered_map get_markables(); + unordered_map get_markables(); - unordered_map get_all_markables_score(); - unordered_map get_parameter_markables_score(wstring parameter_name); + unordered_map get_all_markables_score(); + unordered_map get_parameter_markables_score(UString parameter_name); }; #endif diff --git a/src/parse_biltrans.cc b/src/parse_biltrans.cc index bea0462..567658e 100644 --- a/src/parse_biltrans.cc +++ b/src/parse_biltrans.cc @@ -24,14 +24,14 @@ using namespace std; -ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) +ParseLexicalUnit::ParseLexicalUnit(UString input_LU) { int seenSlash = 0; int seenTag = 0; - wstring temptag; + UString temptag; - for (std::wstring::iterator i = input_LU.begin(); i != input_LU.end(); ++i) + for (UString::iterator i = input_LU.begin(); i != input_LU.end(); ++i) { if(*i == L'\\') { @@ -103,7 +103,7 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) temptag.push_back(*i); } } - + else { sl_lemma.push_back(*i); @@ -113,7 +113,7 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) else if(seenSlash == 1) { tl_form.push_back(*i); - + if(*i == L'<') seenTag++; @@ -131,7 +131,7 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) temptag.push_back(*i); } } - + else { tl_lemma.push_back(*i); @@ -145,32 +145,32 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) } } -wstring ParseLexicalUnit::get_sl_form() +UString ParseLexicalUnit::get_sl_form() { return sl_form; } -wstring ParseLexicalUnit::get_tl_form() +UString ParseLexicalUnit::get_tl_form() { return tl_form; } -vector< wstring > ParseLexicalUnit::get_sl_tags() +vector< UString > ParseLexicalUnit::get_sl_tags() { return sl_tags; } -vector< wstring > ParseLexicalUnit::get_tl_tags() +vector< UString > ParseLexicalUnit::get_tl_tags() { return tl_tags; } -wstring ParseLexicalUnit::get_sl_lemma() +UString ParseLexicalUnit::get_sl_lemma() { return sl_lemma; } -wstring ParseLexicalUnit::get_tl_lemma() +UString ParseLexicalUnit::get_tl_lemma() { return tl_lemma; } diff --git a/src/parse_biltrans.h b/src/parse_biltrans.h index 4b0eff1..b17e280 100644 --- a/src/parse_biltrans.h +++ b/src/parse_biltrans.h @@ -21,6 +21,7 @@ #include #include +#include using namespace std; @@ -34,69 +35,69 @@ private: /** * Source language word and tags */ - wstring sl_form; + UString sl_form; /** * Target language word and tags */ - wstring tl_form; + UString tl_form; /** * Source language tags */ - vector< wstring > sl_tags; + vector< UString > sl_tags; /** * Target language tags */ - vector< wstring > tl_tags; - + vector< UString > tl_tags; + /** * Source language lemma */ - wstring sl_lemma; - + UString sl_lemma; + /** * Target language lemma */ - wstring tl_lemma; + UString tl_lemma; public: /** * Constructor to fill all variables * @param input_LU one lexical unit between ^ and $ (excluded) */ - ParseLexicalUnit(wstring input_LU); + ParseLexicalUnit(UString input_LU); /** * Return the Source Language Form */ - wstring get_sl_form(); + UString get_sl_form(); /** * Return the Target Language Form */ - wstring get_tl_form(); + UString get_tl_form(); /** * Return the Source Language Tags */ - vector< wstring > get_sl_tags(); + vector< UString > get_sl_tags(); /** * Return the Target Language Form */ - vector< wstring > get_tl_tags(); - + vector< UString > get_tl_tags(); + /** * Return the Source Language Lemma */ - wstring get_sl_lemma(); - + UString get_sl_lemma(); + /** * Return the Target Language Lemma */ - wstring get_tl_lemma(); + UString get_tl_lemma(); }; diff --git a/src/pattern_arx.cc b/src/pattern_arx.cc index 64edb64..e18fe55 100644 --- a/src/pattern_arx.cc +++ b/src/pattern_arx.cc @@ -25,6 +25,8 @@ #include #include #include +#include +#include using namespace std; @@ -36,13 +38,13 @@ void print_markable(acceptable_patterns inp) for(vector::iterator j = (*i).begin(); j != (*i).end(); j++) { - wcerr << (*j).name; + cerr << (*j).name; cerr << "\n"; } } } -int contains(vector tags, wstring tag) +int contains(vector tags, UString tag) { if(std::find(tags.begin(), tags.end(), tag) != tags.end()) return 1; @@ -50,9 +52,9 @@ int contains(vector tags, wstring tag) return 0; } -int contains_any(vector tags, vector candidates) +int contains_any(vector tags, vector candidates) { - for(vector::iterator it=candidates.begin();it!=candidates.end();++it) + for(vector::iterator it=candidates.begin();it!=candidates.end();++it) { if(std::find(tags.begin(), tags.end(), *it) != tags.end()) return 1; @@ -61,42 +63,46 @@ int contains_any(vector tags, vector candidates) return 0; } -void toLower(basic_string& s) +void toLower(UString& s) { - for (basic_string::iterator p = s.begin(); p != s.end(); ++p) - { - *p = towlower(*p); - } + UString temp; + size_t i = 0; + size_t len = s.size(); + UChar32 c; + while (i < len) { + U16_NEXT(s, i, len, c); + temp += u_tolower(c); + } + s.swap(temp); } -int check_acceptable_tags(vector input_tags, wstring input_sl_lemma, acceptable_tags check_tags) //check has-tags, exclude-tags, lemma +int check_acceptable_tags(vector input_tags, UString input_sl_lemma, acceptable_tags check_tags) //check has-tags, exclude-tags, lemma { for (acceptable_tags::iterator i = check_tags.begin(); i != check_tags.end(); ++i) { int flag_contains_all = 1; - vector temp_tags = i->has_tags; - vector temp_exclude_tags = i->exclude_tags; - - for(std::vector::iterator j = temp_tags.begin(); j != temp_tags.end(); ++j) - { - if(*j == L"*") //ignore * in the tags list - continue; + vector temp_tags = i->has_tags; + vector temp_exclude_tags = i->exclude_tags; - if(!contains(input_tags, *j)) //if the has-tag is NOT in the input LU tags - { - flag_contains_all = 0; - break; - } + for (auto& j : temp_tags) { + if(j == "*"_u) //ignore * in the tags list + continue; + + if(!contains(input_tags, j)) { + //if the has-tag is NOT in the input LU tags + flag_contains_all = 0; + break; } - + } + if(flag_contains_all == 0) { continue; } - - for(std::vector::iterator j = temp_exclude_tags.begin(); j != temp_exclude_tags.end(); ++j) + + for(std::vector::iterator j = temp_exclude_tags.begin(); j != temp_exclude_tags.end(); ++j) { if(contains(input_tags, *j)) { @@ -104,23 +110,23 @@ int check_acceptable_tags(vector input_tags, wstring input_sl_lemma, ac break; } } - + if(flag_contains_all == 0) { continue; } - + if(!(i->lemma).empty()) { - wstring temp_lemma = i->lemma; - + UString temp_lemma = i->lemma; + if(input_sl_lemma.length() == temp_lemma.length()) { if(input_sl_lemma.compare(temp_lemma) != 0) { toLower(input_sl_lemma); toLower(temp_lemma); - + if(input_sl_lemma.compare(temp_lemma) != 0) { flag_contains_all = 0; @@ -132,7 +138,7 @@ int check_acceptable_tags(vector input_tags, wstring input_sl_lemma, ac flag_contains_all = 0; } } - + if(flag_contains_all == 0) { continue; @@ -146,14 +152,14 @@ int check_acceptable_tags(vector input_tags, wstring input_sl_lemma, ac return 0; } -parameter_return check_pattern_name(vector input_tags, wstring input_sl_lemma, unordered_map parameter_names) +parameter_return check_pattern_name(vector input_tags, UString input_sl_lemma, unordered_map parameter_names) { parameter_return retval; retval.found = 0; - for (unordered_map::iterator it = parameter_names.begin(); it != parameter_names.end(); it++) + for (unordered_map::iterator it = parameter_names.begin(); it != parameter_names.end(); it++) { - wstring parameter_name = it->first; + UString parameter_name = it->first; acceptable_tags parameter_tags= it->second; if(check_acceptable_tags(input_tags, input_sl_lemma, parameter_tags)) @@ -164,20 +170,20 @@ parameter_return check_pattern_name(vector input_tags, wstring input_sl return retval; } } - + return retval; } deque< vector > add_properties(deque< vector > context, ParseArx arx_file) { - unordered_map arx_markables = arx_file.get_markables(); - unordered_map arx_cats = arx_file.get_cats(); + unordered_map arx_markables = arx_file.get_markables(); + unordered_map arx_cats = arx_file.get_cats(); - for (unordered_map::iterator it = arx_markables.begin(); it != arx_markables.end(); it++ ) + for (unordered_map::iterator it = arx_markables.begin(); it != arx_markables.end(); it++ ) { //for each markable - wstring markable_name = it->first; + UString markable_name = it->first; acceptable_patterns patterns_list = it->second; for(acceptable_patterns::iterator i = patterns_list.begin(); i!=patterns_list.end(); ++i) //go through patterns in the markable @@ -221,7 +227,7 @@ deque< vector > add_properties(deque< vector > context, Pa if(current_pattern[x].head == 1) { - ((*(n+x)).properties).push_back(L"head"); // + ((*(n+x)).properties).push_back("head"_u); // } } diff --git a/src/pattern_arx.h b/src/pattern_arx.h index 0a2ba2e..5c67fd6 100644 --- a/src/pattern_arx.h +++ b/src/pattern_arx.h @@ -30,12 +30,12 @@ using namespace std; struct unique_LU { int id; - wstring wordform; - wstring tl_wordform; - wstring sl_lemma; - wstring tl_lemma; - vector pos_tags; - vector properties; + UString wordform; + UString tl_wordform; + UString sl_lemma; + UString tl_lemma; + vector pos_tags; + vector properties; }; struct antecedent @@ -47,15 +47,14 @@ struct antecedent struct parameter_return { int found; - wstring parameter_name; + UString parameter_name; }; -int contains(vector tags, wstring tag); -int contains_any(vector tags, vector candidates); -void toLower(basic_string& s); +int contains(vector tags, UString tag); +int contains_any(vector tags, vector candidates); -int check_acceptable_tags(vector input_tags, wstring input_sl_lemma, acceptable_tags check_tags); -parameter_return check_pattern_name(vector input_tags, wstring input_sl_lemma, unordered_map parameter_names); +int check_acceptable_tags(vector input_tags, UString input_sl_lemma, acceptable_tags check_tags); +parameter_return check_pattern_name(vector input_tags, UString input_sl_lemma, unordered_map parameter_names); deque< vector > add_properties(deque< vector > context, ParseArx arx_file); diff --git a/src/score.cc b/src/score.cc index bd307c1..522a7cb 100644 --- a/src/score.cc +++ b/src/score.cc @@ -37,22 +37,17 @@ void showq(deque < vector > gq) cerr << "\n"; for (std::vector::iterator i = temp_sentence.begin(); i != temp_sentence.end(); ++i) { - wcerr << (*i).tl_wordform; + cerr << (*i).tl_wordform; - for (std::vector::iterator k = (*i).pos_tags.begin(); k != (*i).pos_tags.end(); ++k) - { - cerr << "<"; - wcerr << (*k); - cerr << ">"; - } + for (auto& k : (*i).pos_tags) { + cerr << "<" << k << ">"; + } cerr << ":"; - for (std::vector::iterator l = (*i).properties.begin(); l != (*i).properties.end(); ++l) - { - cerr << " "; - wcerr << (*l); - } + for (auto& l : (*i).properties) { + cerr << " " << l; + } cerr << "\t"; } @@ -62,9 +57,9 @@ void showq(deque < vector > gq) cerr << '\n'; } -int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > input_pos_tags, wstring input_tl_wordform, wstring input_sl_lemma, wstring input_tl_lemma, ParseArx arx_file, int debug_flag) +int Scoring::add_word(int input_id, UString input_wordform, vector< UString > input_pos_tags, UString input_tl_wordform, UString input_sl_lemma, UString input_tl_lemma, ParseArx arx_file, int debug_flag) { - vector temp_prop; + vector temp_prop; parameters_datatype arx_parameters = arx_file.get_parameters(); unique_LU input_LU = {input_id, input_wordform, input_tl_wordform, input_sl_lemma, input_tl_lemma, input_pos_tags, temp_prop}; @@ -76,7 +71,7 @@ int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > in context.push_back(sentence); - if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters[L"delimiter"][L"default"]) ) + if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["delimiter"_u]["default"_u]) ) { vector new_sentence; @@ -85,7 +80,7 @@ int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > in } else { - if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters[L"delimiter"][L"default"])) + if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["delimiter"_u]["default"_u])) { context.back().push_back(input_LU); @@ -97,26 +92,26 @@ int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > in context.pop_front(); } - else + else { - parameter_return retval = check_pattern_name(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters[L"anaphor"]); + parameter_return retval = check_pattern_name(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["anaphor"_u]); if(retval.found == 1) //check if tags,lemma of current word match with anaphor in arx file { unique_LU anaphor_LU = input_LU; - vector temp_pos_tags = anaphor_LU.pos_tags; - temp_pos_tags.push_back(L"anaphor"); + vector temp_pos_tags = anaphor_LU.pos_tags; + temp_pos_tags.push_back("anaphor"_u); anaphor_LU.pos_tags = temp_pos_tags; - + context.back().push_back(anaphor_LU); apply_indicators(anaphor_LU, arx_file, retval.parameter_name, debug_flag); context.back().pop_back(); context.back().push_back(input_LU); - + return 1; //To show that something will be added in side ref } else @@ -124,13 +119,13 @@ int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > in context.back().push_back(input_LU); //add word to the latest added sentence in the queue } } - + } return 0; } -void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring parameter_name, int debug_flag) +void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, UString parameter_name, int debug_flag) { int distance_marker = 2; //starts from 2 for current sentence and reduces till -1 as we go to previous sentences int temp_score; @@ -142,15 +137,15 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par distance_marker = distance_marker - context_with_prop.size() + 1; //set distance to earliest sentence based on number of sentences in context - unordered_map all_markables_score = arx_file.get_all_markables_score(); - unordered_map parameter_markables_score = arx_file.get_parameter_markables_score(parameter_name); + unordered_map all_markables_score = arx_file.get_all_markables_score(); + unordered_map parameter_markables_score = arx_file.get_parameter_markables_score(parameter_name); if(debug_flag) { cerr << "\n** For anaphor: "; - fputws(anaphor.wordform.c_str(), stderr); + cerr << anaphor.wordform; cerr << "/"; - fputws(anaphor.tl_wordform.c_str(), stderr); + cerr << anaphor.tl_wordform; cerr << ", Context - with markables **\n"; } @@ -164,13 +159,13 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par if(debug_flag) { cerr << "\n"; - wcerr << (*j).wordform; + cerr << (*j).wordform; cerr << ": "; print_tags((*j).properties); cerr << "\n"; } - if(check_acceptable_tags((*j).pos_tags, (*j).sl_lemma, arx_file.get_parameters()[L"antecedent"][parameter_name])) + if(check_acceptable_tags((*j).pos_tags, (*j).sl_lemma, arx_file.get_parameters()["antecedent"_u][parameter_name])) { temp_score = 0; @@ -191,7 +186,7 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par //Impeding Indicators //Indicators from XML file (iterate through all markables that provided a score without mentioning parameter_name) - for(unordered_map::iterator x = all_markables_score.begin(); x != all_markables_score.end(); ++x) + for(unordered_map::iterator x = all_markables_score.begin(); x != all_markables_score.end(); ++x) { if(contains(antecedent_LU.properties, x->first)) { @@ -200,7 +195,7 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par } //Now get the scores from the markables that mentioned this specific parameter name - for(unordered_map::iterator x = parameter_markables_score.begin(); x != parameter_markables_score.end(); ++x) + for(unordered_map::iterator x = parameter_markables_score.begin(); x != parameter_markables_score.end(); ++x) { if(contains(antecedent_LU.properties, x->first)) { @@ -214,7 +209,7 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par else { //cerr << "\nAgreement Failed for:"; - //wcerr << antecedent_LU.wordform; + //cerr << antecedent_LU.wordform; //cerr << "\n"; } } @@ -225,13 +220,13 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par } } -int Scoring::check_agreement(vector antecedent_tags, vector anaphor_tags) +int Scoring::check_agreement(vector antecedent_tags, vector anaphor_tags) { /* - if(contains(anaphor_tags, L"f") && contains(antecedent_tags, L"m")) + if(contains(anaphor_tags, "f") && contains(antecedent_tags, "m")) return 0; - if(contains(anaphor_tags, L"m") && contains(antecedent_tags, L"f")) + if(contains(anaphor_tags, "m") && contains(antecedent_tags, "f")) return 0; */ @@ -239,7 +234,7 @@ int Scoring::check_agreement(vector antecedent_tags, vector an } -wstring Scoring::get_antecedent(int debug_flag) +UString Scoring::get_antecedent(int debug_flag) { unique_LU final_antecedent_LU; antecedent final_antecedent = {final_antecedent_LU, -5}; @@ -254,10 +249,10 @@ wstring Scoring::get_antecedent(int debug_flag) if(debug_flag) { cerr << "\n" << (*it).LU.id << ": "; - fputws((*it).LU.wordform.c_str(), stderr); + cerr << (*it).LU.wordform; cerr << " : " << (*it).score << "\n"; } - + if((*it).score >= final_antecedent.score) final_antecedent = (*it); } @@ -267,9 +262,9 @@ wstring Scoring::get_antecedent(int debug_flag) if(debug_flag) { cerr << "\n" << "** Final Antecedent: "; - fputws(final_antecedent.LU.wordform.c_str(), stderr); + cerr << final_antecedent.LU.wordform; cerr << "/"; - fputws(final_antecedent.LU.tl_wordform.c_str(), stderr); + cerr << final_antecedent.LU.tl_wordform; cerr << " **\n"; } diff --git a/src/score.h b/src/score.h index 3f70621..33809c0 100644 --- a/src/score.h +++ b/src/score.h @@ -38,10 +38,10 @@ private: vector antecedent_list; public: - int add_word(int input_id, wstring input_wordform, vector< wstring > input_pos_tags, wstring input_tl_wordform, wstring input_sl_lemma, wstring input_tl_lemma, ParseArx arx_file, int debug_flag); - void apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring parameter_name, int debug_flag); - int check_agreement(vector antecedent_tags, vector anaphor_tags); - wstring get_antecedent(int debug_flag); + int add_word(int input_id, UString input_wordform, vector< UString > input_pos_tags, UString input_tl_wordform, UString input_sl_lemma, UString input_tl_lemma, ParseArx arx_file, int debug_flag); + void apply_indicators(unique_LU anaphor, ParseArx arx_file, UString parameter_name, int debug_flag); + int check_agreement(vector antecedent_tags, vector anaphor_tags); + UString get_antecedent(int debug_flag); void clear(); };