commit 152691ae22222f29f29ea6a1c462dca5c6c75c59 Author: Daniel Swanson Date: Wed Jun 30 08:51:15 2021 -0500 use ICU (#45) ICU changes - replace all uses of `std::wstring` and associated types with `UString` - use `lttoolbox/input_file.h` to read UTF-8 stream with nulls, since ICU can't do that directly - use lttoolbox helper functions for manipulating string case efficiency/readability changes - pass containers by `const` reference whenever possible - in `parse_arx` move the pointer to the current document from parameter to class attribute, since most of the functions don't use it anyway (i.e. to silence compiler warnings) - make most of the internal parsing functions in `parse_arx` private - convert some loops to range-for helper function and dependency changes - use `lttoolbox/xml_walk_util.h` for iterating over a node's children and converting attribute values to `UString` diff --git a/.travis.yml b/.travis.yml index 201ecc3..0a34dcb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ compiler: before_install: - wget https://apertium.projectjj.com/apt/install-nightly.sh -O - | sudo bash - - sudo apt-get install -qfy build-essential autoconf autotools-dev lttoolbox-dev pkg-config libxml2-dev + - sudo apt-get install -qfy build-essential autoconf autotools-dev lttoolbox-dev pkg-config libxml2-dev libutfcpp-dev script: - $CXX --version diff --git a/configure.ac b/configure.ac index 4aea431..c75537d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ AC_PREREQ(2.61) -AC_INIT([apertium-anaphora], [1.0.2], [khanna.tanmai@gmail.com]) +AC_INIT([apertium-anaphora], [1.1.0], [khanna.tanmai@gmail.com]) AM_INIT_AUTOMAKE AC_CONFIG_MACRO_DIR([m4]) @@ -11,7 +11,7 @@ AC_LANG_CPLUSPLUS CFLAGS="-Wall -Wextra $CFLAGS" CXXFLAGS="-Wall -Wextra $CXXFLAGS" -PKG_CHECK_MODULES([LTTOOLBOX], [lttoolbox >= 3.5.3]) +PKG_CHECK_MODULES([LTTOOLBOX], [lttoolbox >= 3.6.0]) AC_SUBST(LTTOOLBOX_CFLAGS) AC_SUBST(LTTOOLBOX_LIBS) @@ -19,8 +19,12 @@ PKG_CHECK_MODULES([LIBXML], [libxml-2.0]) AC_SUBST(LIBXML_CFLAGS) AC_SUBST(LIBXML_LIBS) -CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS" -LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS" +PKG_CHECK_MODULES([ICU], [icu-i18n icu-io icu-uc]) +AC_SUBST(ICU_CFLAGS) +AC_SUBST(ICU_LIBS) + +CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS" +LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS $ICU_LIBS" # Checks for highest supported C++ standard AC_LANG(C++) diff --git a/src/anaphora.cc b/src/anaphora.cc index 66366d0..6c01b21 100644 --- a/src/anaphora.cc +++ b/src/anaphora.cc @@ -23,7 +23,7 @@ #include "pattern_arx.h" #include - +#include #include #include @@ -36,26 +36,13 @@ using namespace std; -FILE * open_input(string const &filename) -{ - FILE *input = fopen(filename.c_str(), "r"); - if(!input) - { - wcerr << "Error: can't open input file '"; - wcerr << filename.c_str() << "'." << endl; - exit(EXIT_FAILURE); - } - - return input; -} - -FILE * open_output(string const &filename) +UFILE * open_output(string const &filename) { - FILE *output = fopen(filename.c_str(), "w"); + UFILE *output = u_fopen(filename.c_str(), "w", NULL, NULL); if(!output) { - wcerr << "Error: can't open output file '"; - wcerr << filename.c_str() << "'." << endl; + cerr << "Error: can't open output file '"; + cerr << filename.c_str() << "'." << endl; exit(EXIT_FAILURE); } return output; @@ -63,11 +50,11 @@ FILE * open_output(string const &filename) void help_message(char *progname) { - wcerr << "USAGE: " << basename(progname) << " arx_file [input [output]]" << endl; - wcerr << " " << basename(progname) << " -z arx_file [input [output]]" << endl; - wcerr << " arx_file Anaphora Resolution rules file (apertium-xxx-yyy.xxx-yyy.arx)" << endl; - wcerr << " -z null-flushing output on \\0" << endl; - wcerr << " -h shows this message" << endl; + cerr << "USAGE: " << basename(progname) << " arx_file [input [output]]" << endl; + cerr << " " << basename(progname) << " -z arx_file [input [output]]" << endl; + cerr << " arx_file Anaphora Resolution rules file (apertium-xxx-yyy.xxx-yyy.arx)" << endl; + cerr << " -z null-flushing output on \\0" << endl; + cerr << " -h shows this message" << endl; exit(EXIT_FAILURE); } @@ -132,22 +119,23 @@ int main(int argc, char **argv) } } - FILE *input = stdin, *output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); switch(argc - optind) { case 1: // if only one argument left, it has to be the arx_file - arxFileName = argv[argc - 1]; + arxFileName = argv[argc - 1]; break; case 2: // if two arguments, it has to be arx_file and input_file arxFileName = argv[argc - 2]; - input = open_input(argv[argc - 1]); + input.open_or_exit(argv[argc - 1]); break; case 3: // if three arguments, it has to be arx_file, input file and output file arxFileName = argv[argc - 3]; - input = open_input(argv[argc - 2]); + input.open_or_exit(argv[argc - 2]); output = open_output(argv[argc - 1]); break; @@ -156,20 +144,20 @@ int main(int argc, char **argv) break; } - wchar_t input_char; + UChar32 input_char; - wstring input_stream; + UString input_stream; - wstring final_ref; + UString final_ref; Scoring score_module; unsigned int gen_id = 0; - wstring sl_form; - wstring tl_form; - vector sl_tags; - vector tl_tags; - wstring sl_lemma; - wstring tl_lemma; + UString sl_form; + UString tl_form; + vector sl_tags; + vector tl_tags; + UString sl_lemma; + UString tl_lemma; ParseArx arx_file; int parse_arx_retval = arx_file.parseDoc(arxFileName); @@ -179,22 +167,22 @@ int main(int argc, char **argv) int flag_LU = 0; - input_char = fgetwc(input); + input_char = input.get(); - while(input_char!=EOF) + while(input_char!=U_EOF) { - if(nullFlush && input_char == L'\0') + if(nullFlush && input_char == '\0') { - fputwc(input_char, output); - fflush(output); + u_fputc(input_char, output); + u_fflush(output); input_stream.clear(); sl_form.clear(); tl_form.clear(); sl_tags.clear(); tl_tags.clear(); - sl_lemma.clear(); - tl_lemma.clear(); + sl_lemma.clear(); + tl_lemma.clear(); gen_id = 0; score_module.clear(); @@ -203,24 +191,24 @@ int main(int argc, char **argv) flag_LU = 0; } - else if(input_char == L'\\') + else if(input_char == '\\') { if(flag_LU == 0) { - fputwc(input_char, output); + u_fputc(input_char, output); - input_char = fgetwc(input); + input_char = input.get(); - fputwc(input_char, output); + u_fputc(input_char, output); } else { input_stream.push_back(input_char); - fputwc(input_char, output); + u_fputc(input_char, output); - input_char = fgetwc(input); + input_char = input.get(); - fputwc(input_char, output); + u_fputc(input_char, output); input_stream.push_back(input_char); } } @@ -228,19 +216,19 @@ int main(int argc, char **argv) { if(flag_LU == 0) { - fputwc(input_char, output); + u_fputc(input_char, output); - if(input_char == L'^') + if(input_char == '^') flag_LU = 1; } else if(flag_LU == 1) { - if(input_char == L'$') + if(input_char == '$') { gen_id++; - fputwc(L'/', output); //for adding ref + u_fputc('/', output); //for adding ref flag_LU = 0; @@ -250,8 +238,8 @@ int main(int argc, char **argv) tl_tags = LU.get_tl_tags(); sl_form = LU.get_sl_form(); sl_tags = LU.get_sl_tags(); - sl_lemma = LU.get_sl_lemma(); - tl_lemma = LU.get_tl_lemma(); + sl_lemma = LU.get_sl_lemma(); + tl_lemma = LU.get_tl_lemma(); if(!tl_form.empty()) { @@ -265,7 +253,7 @@ int main(int argc, char **argv) { final_ref = score_module.get_antecedent(debug_flag); - fputws(final_ref.c_str(), output); + write(final_ref, output); } } @@ -276,15 +264,15 @@ int main(int argc, char **argv) input_stream.push_back(input_char); } - fputwc(input_char, output); + u_fputc(input_char, output); } } - input_char = fgetwc(input); + input_char = input.get(); } - //fclose(fin); + u_fclose(output); return 0; } diff --git a/src/parse_arx.cc b/src/parse_arx.cc index 3d96172..5636eda 100644 --- a/src/parse_arx.cc +++ b/src/parse_arx.cc @@ -19,31 +19,24 @@ #include "parse_arx.h" #include -#include -#include -#include -#include -#include -#include #include -#include +#include +#include -void print_tags(vector input) +void print_tags(const vector& input) { - for (size_t i = 0; i < input.size(); ++i) - { - wcerr << input[i]; - wcerr << " "; + for (auto& it : input) { + cerr << it << " "; } } -vector ParseArx::parseTags (wstring tags) +vector ParseArx::parseTags (const UString& tags) { - vector temp_tags_list; + vector temp_tags_list; - wstring temptag; + UString temptag; - for (std::wstring::iterator i = tags.begin(); i != tags.end(); ++i) + for (UString::const_iterator i = tags.begin(); i != tags.end(); ++i) { if(*i == '\\') { @@ -68,310 +61,163 @@ vector ParseArx::parseTags (wstring tags) return temp_tags_list; } -void ParseArx::parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, wstring parameter_type, wstring parameter_name) -//parameter_name: detpos, verbal, etc., parameter_type: anaphor, antecedent, etc. +void ParseArx::parseParameterItem (xmlNodePtr cur, UString parameter_type, UString parameter_name) +//parameter_name: detpos, verbal, etc., parameter_type: anaphor, antecedent, etc. { - xmlChar *Attr; - cur = cur->xmlChildrenNode; - - item temp_item; - - while (cur != NULL) - { - temp_item.has_tags.clear(); - temp_item.exclude_tags.clear(); - temp_item.lemma.clear(); - - if ((!xmlStrcmp(cur->name, (const xmlChar *)"parameter-item"))) - { - Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); - if (Attr) - { - temp_item.has_tags = parseTags(XMLParseUtil::towstring(Attr)); - } - - Attr = xmlGetProp(cur, (const xmlChar *)"exclude-tags"); - if (Attr) - { - temp_item.exclude_tags = parseTags(XMLParseUtil::towstring(Attr)); - } - - Attr = xmlGetProp(cur, (const xmlChar *)"lemma"); - if (Attr) - { - temp_item.lemma = XMLParseUtil::towstring(Attr); - } - - parameters[parameter_type][parameter_name].push_back(temp_item); - - xmlFree(Attr); - } - - cur = cur->next; + for (auto pi : children(cur)) { + if ((!xmlStrcmp(pi->name, (const xmlChar *)"parameter-item"))) { + item temp_item; + temp_item.has_tags = parseTags(getattr(pi, "has-tags")); + temp_item.exclude_tags = parseTags(getattr(pi, "exclude-tags")); + temp_item.lemma = getattr(pi, "lemma"); + parameters[parameter_type][parameter_name].push_back(temp_item); + } } - return; } -void ParseArx::parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, wstring parameter_name) +void ParseArx::parseParameterTypes (xmlNodePtr cur, UString parameter_name) { - wstring parameter_type; - - cur = cur->xmlChildrenNode; - - while (cur != NULL) - { - if(cur->type == XML_ELEMENT_NODE) - { - parameter_type = XMLParseUtil::towstring(cur->name); - - parseParameterItem(doc, cur, parameter_type, parameter_name); - } - - cur = cur->next; + for (auto param : children(cur)) { + parseParameterItem(param, to_ustring((const char*) param->name), + parameter_name); } - return; } -void ParseArx::parseParameters (xmlDocPtr doc, xmlNodePtr cur) +void ParseArx::parseParameters (xmlNodePtr cur) { - xmlChar *parameter_name; - wstring parameter_type; - cur = cur->xmlChildrenNode; - - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"def-parameter"))) - { - parameter_name = xmlGetProp(cur, (const xmlChar *)"n"); - - parseParameterTypes(doc,cur, XMLParseUtil::towstring(parameter_name)); - xmlFree(parameter_name); - } - else if ((!xmlStrcmp(cur->name, (const xmlChar *)"delimiter"))) - { - parameter_type = XMLParseUtil::towstring(cur->name); - - parseParameterItem(doc, cur, parameter_type, L"default"); - } - - cur = cur->next; + for (auto param : children(cur)) { + if (!xmlStrcmp(param->name, (const xmlChar*)"def-parameter")) { + parseParameterTypes(param, getattr(param, "n")); + } else if (!xmlStrcmp(param->name, (const xmlChar*)"delimiter")) { + parseParameterItem(param, "delimiter"_u, "default"_u); + } } - return; } -void ParseArx::parseCatItem (xmlDocPtr doc, xmlNodePtr cur, wstring cat_name) +void ParseArx::parseCatItem (xmlNodePtr cur, UString cat_name) { - xmlChar *Attr; - cur = cur->xmlChildrenNode; - - item temp_item; - - while (cur != NULL) - { - temp_item.has_tags.clear(); - temp_item.exclude_tags.clear(); - temp_item.lemma.clear(); - - if ((!xmlStrcmp(cur->name, (const xmlChar *)"cat-item"))) - { - Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); - if (Attr) - { - temp_item.has_tags = parseTags(XMLParseUtil::towstring(Attr)); - } - - Attr = xmlGetProp(cur, (const xmlChar *)"exclude-tags"); - if (Attr) - { - temp_item.exclude_tags = parseTags(XMLParseUtil::towstring(Attr)); - } - - Attr = xmlGetProp(cur, (const xmlChar *)"lemma"); - if (Attr) - { - temp_item.lemma = XMLParseUtil::towstring(Attr); - } - - cats[cat_name].push_back(temp_item); - - xmlFree(Attr); - } - - cur = cur->next; + for (auto ci : children(cur)) { + if (!xmlStrcmp(ci->name, (const xmlChar*)"cat-item")) { + item temp_item; + temp_item.has_tags = parseTags(getattr(ci, "has-tags")); + temp_item.exclude_tags = parseTags(getattr(ci, "exclude-tags")); + temp_item.lemma = getattr(ci, "lemma"); + cats[cat_name].push_back(temp_item); + } } - return; } -void ParseArx::parseCats (xmlDocPtr doc, xmlNodePtr cur) +void ParseArx::parseCats (xmlNodePtr cur) { - xmlChar *Attr; - cur = cur->xmlChildrenNode; - - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"def-cat"))) - { - Attr = xmlGetProp(cur, (const xmlChar *)"n"); - - parseCatItem(doc,cur, XMLParseUtil::towstring(Attr)); - xmlFree(Attr); - } - - cur = cur->next; + for (auto cat : children(cur)) { + if (!xmlStrcmp(cat->name, (const xmlChar*)"def-cat")) { + parseCatItem(cat, getattr(cat, "n")); + } } - return; } -vector ParseArx::parsePatternItem (xmlDocPtr doc, xmlNodePtr cur) +vector ParseArx::parsePatternItem (xmlNodePtr cur) { xmlChar *Attr; - cur = cur->xmlChildrenNode; vector temp_pattern; - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"pattern-item"))) - { - markable_pattern temp; + for (auto pi : children(cur)) { + if ((!xmlStrcmp(pi->name, (const xmlChar *)"pattern-item"))) { + markable_pattern temp; - Attr = xmlGetProp(cur, (const xmlChar *)"n"); - temp.name = XMLParseUtil::towstring(Attr); + Attr = xmlGetProp(pi, (const xmlChar *)"n"); + temp.name = to_ustring((const char*)Attr); - xmlFree(Attr); + xmlFree(Attr); - Attr = xmlGetProp(cur, (const xmlChar *)"head"); + Attr = xmlGetProp(pi, (const xmlChar *)"head"); - if(Attr != NULL) - { - temp.head = 1; - } - else - temp.head = 0; + if(Attr != NULL) { + temp.head = 1; + } else { + temp.head = 0; + } - xmlFree(Attr); + xmlFree(Attr); - temp_pattern.push_back(temp); - } - - cur = cur->next; + temp_pattern.push_back(temp); + } } - return temp_pattern; } -void ParseArx::parsePatterns (xmlDocPtr doc, xmlNodePtr cur, wstring markable_name) +void ParseArx::parsePatterns (xmlNodePtr cur, UString markable_name) { - xmlChar *Attr; + for (auto pat : children(cur)) { + if ((!xmlStrcmp(pat->name, (const xmlChar *)"pattern"))) { + vector temp_pattern = parsePatternItem(pat); - cur = cur->xmlChildrenNode; + markables[markable_name].push_back(temp_pattern); + } else if ((!xmlStrcmp(pat->name, (const xmlChar *)"score"))) { + int score_int = StringUtils::stoi(getattr(pat, "n")); - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"pattern"))) - { - vector temp_pattern = parsePatternItem(doc,cur); - - markables[markable_name].push_back(temp_pattern); - } - - else if ((!xmlStrcmp(cur->name, (const xmlChar *)"score"))) - { - Attr = xmlGetProp(cur, (const xmlChar *)"n"); - - wstring score_ws = XMLParseUtil::towstring(Attr); - int score_int = std::stoi(score_ws); - - xmlChar *parameter_name = xmlGetProp(cur, (const xmlChar *)"parameter"); - - if (parameter_name) - { - wstring parameter_name_ws = XMLParseUtil::towstring(parameter_name); - parameter_markables_score[parameter_name_ws][markable_name] = score_int; - } - else - { - all_markables_score[markable_name] = score_int; - } - } - - cur = cur->next; + xmlChar *param_name = xmlGetProp(cur, (const xmlChar*)"parameter"); + + if (param_name) { + UString name = to_ustring((const char*)param_name); + parameter_markables_score[name][markable_name] = score_int; + } else { + all_markables_score[markable_name] = score_int; + } + } } - return; } -void ParseArx::parseMarkables (xmlDocPtr doc, xmlNodePtr cur) +void ParseArx::parseMarkables (xmlNodePtr cur) { - xmlChar *Attr; - cur = cur->xmlChildrenNode; - - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"markable"))) - { - Attr = xmlGetProp(cur, (const xmlChar *)"n"); - - parsePatterns(doc,cur, XMLParseUtil::towstring(Attr)); - - xmlFree(Attr); - } - - cur = cur->next; + for (auto m : children(cur)) { + if ((!xmlStrcmp(m->name, (const xmlChar *)"markable"))) { + parsePatterns(m, getattr(m, "n")); + } } - return; } int ParseArx::parseDoc(char *docname) { - xmlDocPtr doc; xmlNodePtr cur; - doc = xmlParseFile(docname); + curDoc = xmlParseFile(docname); - if (doc == NULL ) + if (curDoc == nullptr ) { fprintf(stderr,"Document not parsed successfully. \n"); return -1; } - cur = xmlDocGetRootElement(doc); + cur = xmlDocGetRootElement(curDoc); if (cur == NULL) { fprintf(stderr,"Empty Document!\n"); - xmlFreeDoc(doc); + xmlFreeDoc(curDoc); return 1; } if (xmlStrcmp(cur->name, (const xmlChar *) "ref")) { fprintf(stderr,"Document of the wrong type! Root node should be ref.\n"); - xmlFreeDoc(doc); + xmlFreeDoc(curDoc); return 2; } - cur = cur->xmlChildrenNode; - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"section-parameters"))) - { - parseParameters (doc, cur); + for (auto ch : children(cur)) { + if ((!xmlStrcmp(ch->name, (const xmlChar*)"section-parameters"))) { + parseParameters(ch); + } else if ((!xmlStrcmp(ch->name, (const xmlChar*)"section-def-cats"))) { + parseCats(ch); + } else if ((!xmlStrcmp(ch->name, (const xmlChar*)"section-markables"))) { + parseMarkables(ch); } - - else if ((!xmlStrcmp(cur->name, (const xmlChar *)"section-def-cats"))) - { - parseCats (doc, cur); - } - - else if ((!xmlStrcmp(cur->name, (const xmlChar *)"section-markables"))) - { - parseMarkables (doc, cur); - } - - cur = cur->next; } - xmlFreeDoc(doc); + xmlFreeDoc(curDoc); + curDoc = nullptr; return 0; } @@ -380,22 +226,22 @@ parameters_datatype ParseArx::get_parameters() return parameters; } -unordered_map ParseArx::get_cats() +unordered_map ParseArx::get_cats() { return cats; } -unordered_map ParseArx::get_markables() +unordered_map ParseArx::get_markables() { return markables; } -unordered_map ParseArx::get_all_markables_score() +unordered_map ParseArx::get_all_markables_score() { return all_markables_score; } -unordered_map ParseArx::get_parameter_markables_score(wstring parameter_name) +unordered_map ParseArx::get_parameter_markables_score(UString parameter_name) { return parameter_markables_score[parameter_name]; } diff --git a/src/parse_arx.h b/src/parse_arx.h index eefe6da..2a5c98a 100644 --- a/src/parse_arx.h +++ b/src/parse_arx.h @@ -19,69 +19,66 @@ #ifndef _PARSEARX_ #define _PARSEARX_ -#include -#include -#include #include -#include #include #include -#include +#include using namespace std; struct item { //for cat-item and parameter-item - vector has_tags; - vector exclude_tags; - wstring lemma; + vector has_tags; + vector exclude_tags; + UString lemma; }; typedef vector acceptable_tags; struct markable_pattern { - wstring name; + UString name; int head; }; typedef vector< vector > acceptable_patterns; -typedef unordered_map< wstring, unordered_map > parameters_datatype; +typedef unordered_map< UString, unordered_map > parameters_datatype; -void print_tags(vector< wstring > input); +void print_tags(const vector< UString >& input); class ParseArx { private: parameters_datatype parameters; //parameter type mapped to its parameter types, i.e. anaphor/antecedent mapped to a map which contains n="detpos" and n="verbal", etc. - unordered_map cats; //cat name mapped to acceptable tag lists + unordered_map cats; //cat name mapped to acceptable tag lists - unordered_map markables; //markable name mapped to acceptable pattern lists. Also each pattern has a head == 1 - unordered_map all_markables_score; //markable name mapped to score of markable, will be applied on all anaphors - unordered_map > parameter_markables_score; //parameter name mapped to a mapping of markable and score (when parameter name is explicitly mentioned in arx) + unordered_map markables; //markable name mapped to acceptable pattern lists. Also each pattern has a head == 1 + unordered_map all_markables_score; //markable name mapped to score of markable, will be applied on all anaphors + unordered_map > parameter_markables_score; //parameter name mapped to a mapping of markable and score (when parameter name is explicitly mentioned in arx) -public: - int parseDoc(char *docname); - void parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, wstring parameter_name); - void parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, wstring parameter_name, wstring parameter_type); - void parseParameters (xmlDocPtr doc, xmlNodePtr cur); + xmlDocPtr curDoc = nullptr; + vector parseTags (const UString& tags); + void parseParameterTypes (xmlNodePtr cur, UString parameter_name); + void parseParameterItem (xmlNodePtr cur, UString parameter_name, UString parameter_type); + void parseParameters (xmlNodePtr cur); - void parseCats (xmlDocPtr doc, xmlNodePtr cur); - void parseCatItem (xmlDocPtr doc, xmlNodePtr cur, wstring cat_name); + void parseCats (xmlNodePtr cur); + void parseCatItem (xmlNodePtr cur, UString cat_name); - void parseMarkables (xmlDocPtr doc, xmlNodePtr cur); - void parsePatterns (xmlDocPtr doc, xmlNodePtr cur, wstring markable_name); - vector parsePatternItem (xmlDocPtr doc, xmlNodePtr cur); + void parseMarkables (xmlNodePtr cur); + void parsePatterns (xmlNodePtr cur, UString markable_name); + vector parsePatternItem (xmlNodePtr cur); - vector parseTags (wstring tags); +public: + int parseDoc(char *docname); parameters_datatype get_parameters(); - unordered_map get_cats(); + unordered_map get_cats(); - unordered_map get_markables(); + unordered_map get_markables(); - unordered_map get_all_markables_score(); - unordered_map get_parameter_markables_score(wstring parameter_name); + unordered_map get_all_markables_score(); + unordered_map get_parameter_markables_score(UString parameter_name); }; #endif diff --git a/src/parse_biltrans.cc b/src/parse_biltrans.cc index bea0462..41f54ed 100644 --- a/src/parse_biltrans.cc +++ b/src/parse_biltrans.cc @@ -24,16 +24,16 @@ using namespace std; -ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) +ParseLexicalUnit::ParseLexicalUnit(UString input_LU) { int seenSlash = 0; int seenTag = 0; - wstring temptag; + UString temptag; - for (std::wstring::iterator i = input_LU.begin(); i != input_LU.end(); ++i) + for (UString::iterator i = input_LU.begin(); i != input_LU.end(); ++i) { - if(*i == L'\\') + if(*i == '\\') { if(seenSlash == 0) { @@ -79,19 +79,19 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) } } - else if(*i == L'/') + else if(*i == '/') seenSlash++; else if(seenSlash == 0) { sl_form.push_back(*i); - if(*i == L'<') + if(*i == '<') seenTag++; else if(seenTag == 1) { - if(*i == L'>') + if(*i == '>') { seenTag--; sl_tags.push_back(temptag); @@ -103,7 +103,7 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) temptag.push_back(*i); } } - + else { sl_lemma.push_back(*i); @@ -113,13 +113,13 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) else if(seenSlash == 1) { tl_form.push_back(*i); - - if(*i == L'<') + + if(*i == '<') seenTag++; else if(seenTag == 1) { - if(*i == L'>') + if(*i == '>') { seenTag--; tl_tags.push_back(temptag); @@ -131,7 +131,7 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) temptag.push_back(*i); } } - + else { tl_lemma.push_back(*i); @@ -145,32 +145,32 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) } } -wstring ParseLexicalUnit::get_sl_form() +UString ParseLexicalUnit::get_sl_form() { return sl_form; } -wstring ParseLexicalUnit::get_tl_form() +UString ParseLexicalUnit::get_tl_form() { return tl_form; } -vector< wstring > ParseLexicalUnit::get_sl_tags() +vector< UString > ParseLexicalUnit::get_sl_tags() { return sl_tags; } -vector< wstring > ParseLexicalUnit::get_tl_tags() +vector< UString > ParseLexicalUnit::get_tl_tags() { return tl_tags; } -wstring ParseLexicalUnit::get_sl_lemma() +UString ParseLexicalUnit::get_sl_lemma() { return sl_lemma; } -wstring ParseLexicalUnit::get_tl_lemma() +UString ParseLexicalUnit::get_tl_lemma() { return tl_lemma; } diff --git a/src/parse_biltrans.h b/src/parse_biltrans.h index 4b0eff1..b17e280 100644 --- a/src/parse_biltrans.h +++ b/src/parse_biltrans.h @@ -21,6 +21,7 @@ #include #include +#include using namespace std; @@ -34,69 +35,69 @@ private: /** * Source language word and tags */ - wstring sl_form; + UString sl_form; /** * Target language word and tags */ - wstring tl_form; + UString tl_form; /** * Source language tags */ - vector< wstring > sl_tags; + vector< UString > sl_tags; /** * Target language tags */ - vector< wstring > tl_tags; - + vector< UString > tl_tags; + /** * Source language lemma */ - wstring sl_lemma; - + UString sl_lemma; + /** * Target language lemma */ - wstring tl_lemma; + UString tl_lemma; public: /** * Constructor to fill all variables * @param input_LU one lexical unit between ^ and $ (excluded) */ - ParseLexicalUnit(wstring input_LU); + ParseLexicalUnit(UString input_LU); /** * Return the Source Language Form */ - wstring get_sl_form(); + UString get_sl_form(); /** * Return the Target Language Form */ - wstring get_tl_form(); + UString get_tl_form(); /** * Return the Source Language Tags */ - vector< wstring > get_sl_tags(); + vector< UString > get_sl_tags(); /** * Return the Target Language Form */ - vector< wstring > get_tl_tags(); - + vector< UString > get_tl_tags(); + /** * Return the Source Language Lemma */ - wstring get_sl_lemma(); - + UString get_sl_lemma(); + /** * Return the Target Language Lemma */ - wstring get_tl_lemma(); + UString get_tl_lemma(); }; diff --git a/src/pattern_arx.cc b/src/pattern_arx.cc index 64edb64..324ce8e 100644 --- a/src/pattern_arx.cc +++ b/src/pattern_arx.cc @@ -24,160 +24,100 @@ #include #include #include -#include +#include using namespace std; void print_markable(acceptable_patterns inp) { - for(acceptable_patterns::iterator i = inp.begin(); i != inp.end(); i++) - { + for (auto& i : inp) { cerr <<"Pattern:\n"; - for(vector::iterator j = (*i).begin(); j != (*i).end(); j++) - { - wcerr << (*j).name; - cerr << "\n"; + for (auto& j : i) { + cerr << j.name << endl; } } } -int contains(vector tags, wstring tag) +bool +contains(const vector& tags, const UString& tag) { - if(std::find(tags.begin(), tags.end(), tag) != tags.end()) - return 1; - else - return 0; + return (std::find(tags.begin(), tags.end(), tag) != tags.end()); } -int contains_any(vector tags, vector candidates) +bool +contains_any(const vector& tags, const vector& candidates) { - for(vector::iterator it=candidates.begin();it!=candidates.end();++it) - { - if(std::find(tags.begin(), tags.end(), *it) != tags.end()) - return 1; + for (auto& it : candidates) { + if(std::find(tags.begin(), tags.end(), it) != tags.end()) + return true; } - return 0; -} - -void toLower(basic_string& s) -{ - for (basic_string::iterator p = s.begin(); p != s.end(); ++p) - { - *p = towlower(*p); - } + return false; } -int check_acceptable_tags(vector input_tags, wstring input_sl_lemma, acceptable_tags check_tags) //check has-tags, exclude-tags, lemma +bool +check_acceptable_tags(const vector& input_tags, const UString& input_sl_lemma, const acceptable_tags& check_tags) //check has-tags, exclude-tags, lemma { - for (acceptable_tags::iterator i = check_tags.begin(); i != check_tags.end(); ++i) - { - - int flag_contains_all = 1; + for (auto& i : check_tags) { + bool flag_contains_all = true; - vector temp_tags = i->has_tags; - vector temp_exclude_tags = i->exclude_tags; - - for(std::vector::iterator j = temp_tags.begin(); j != temp_tags.end(); ++j) - { - if(*j == L"*") //ignore * in the tags list + for (auto& j : i.has_tags) { + if(j == "*"_u) //ignore * in the tags list continue; - if(!contains(input_tags, *j)) //if the has-tag is NOT in the input LU tags - { - flag_contains_all = 0; + if(!contains(input_tags, j)) { + //if the has-tag is NOT in the input LU tags + flag_contains_all = false; break; } } - - if(flag_contains_all == 0) - { - continue; - } - - for(std::vector::iterator j = temp_exclude_tags.begin(); j != temp_exclude_tags.end(); ++j) - { - if(contains(input_tags, *j)) - { - flag_contains_all = 0; - break; - } - } - - if(flag_contains_all == 0) - { - continue; - } - - if(!(i->lemma).empty()) - { - wstring temp_lemma = i->lemma; - - if(input_sl_lemma.length() == temp_lemma.length()) - { - if(input_sl_lemma.compare(temp_lemma) != 0) - { - toLower(input_sl_lemma); - toLower(temp_lemma); - - if(input_sl_lemma.compare(temp_lemma) != 0) - { - flag_contains_all = 0; - } - } - } - else - { - flag_contains_all = 0; - } - } - - if(flag_contains_all == 0) - { - continue; - } - else - { - return 1; - } + + if (!flag_contains_all) continue; + + if (contains_any(input_tags, i.exclude_tags)) continue; + + if (!i.lemma.empty()) { + if (!StringUtils::caseequal(input_sl_lemma, i.lemma)) { + flag_contains_all = false; + continue; + } + } + + if (flag_contains_all) return true; } - return 0; + return false; } -parameter_return check_pattern_name(vector input_tags, wstring input_sl_lemma, unordered_map parameter_names) +parameter_return check_pattern_name(const vector& input_tags, const UString& input_sl_lemma, const unordered_map& parameter_names) { parameter_return retval; retval.found = 0; - for (unordered_map::iterator it = parameter_names.begin(); it != parameter_names.end(); it++) - { - wstring parameter_name = it->first; - acceptable_tags parameter_tags= it->second; - - if(check_acceptable_tags(input_tags, input_sl_lemma, parameter_tags)) + for (auto& it : parameter_names) { + if(check_acceptable_tags(input_tags, input_sl_lemma, it.second)) { retval.found = 1; - retval.parameter_name = parameter_name; + retval.parameter_name = it.first; return retval; } } - + return retval; } deque< vector > add_properties(deque< vector > context, ParseArx arx_file) { - unordered_map arx_markables = arx_file.get_markables(); - unordered_map arx_cats = arx_file.get_cats(); + unordered_map arx_markables = arx_file.get_markables(); + unordered_map arx_cats = arx_file.get_cats(); - for (unordered_map::iterator it = arx_markables.begin(); it != arx_markables.end(); it++ ) + for (unordered_map::iterator it = arx_markables.begin(); it != arx_markables.end(); it++ ) { //for each markable - wstring markable_name = it->first; + UString markable_name = it->first; acceptable_patterns patterns_list = it->second; for(acceptable_patterns::iterator i = patterns_list.begin(); i!=patterns_list.end(); ++i) //go through patterns in the markable @@ -221,7 +161,7 @@ deque< vector > add_properties(deque< vector > context, Pa if(current_pattern[x].head == 1) { - ((*(n+x)).properties).push_back(L"head"); // + ((*(n+x)).properties).push_back("head"_u); // } } diff --git a/src/pattern_arx.h b/src/pattern_arx.h index 0a2ba2e..103d727 100644 --- a/src/pattern_arx.h +++ b/src/pattern_arx.h @@ -30,12 +30,12 @@ using namespace std; struct unique_LU { int id; - wstring wordform; - wstring tl_wordform; - wstring sl_lemma; - wstring tl_lemma; - vector pos_tags; - vector properties; + UString wordform; + UString tl_wordform; + UString sl_lemma; + UString tl_lemma; + vector pos_tags; + vector properties; }; struct antecedent @@ -47,15 +47,14 @@ struct antecedent struct parameter_return { int found; - wstring parameter_name; + UString parameter_name; }; -int contains(vector tags, wstring tag); -int contains_any(vector tags, vector candidates); -void toLower(basic_string& s); +bool contains(const vector& tags, const UString& tag); +bool contains_any(const vector& tags, const vector& candidates); -int check_acceptable_tags(vector input_tags, wstring input_sl_lemma, acceptable_tags check_tags); -parameter_return check_pattern_name(vector input_tags, wstring input_sl_lemma, unordered_map parameter_names); +bool check_acceptable_tags(const vector& input_tags, const UString& input_sl_lemma, const acceptable_tags& check_tags); +parameter_return check_pattern_name(const vector& input_tags, const UString& input_sl_lemma, const unordered_map& parameter_names); deque< vector > add_properties(deque< vector > context, ParseArx arx_file); diff --git a/src/score.cc b/src/score.cc index bd307c1..c045205 100644 --- a/src/score.cc +++ b/src/score.cc @@ -28,43 +28,34 @@ using namespace std; -void showq(deque < vector > gq) +void showq(const deque < vector >& gq) { - for(std::deque < vector >::iterator j = gq.begin(); j != gq.end(); ++j) - { - vector temp_sentence = *j; - - cerr << "\n"; - for (std::vector::iterator i = temp_sentence.begin(); i != temp_sentence.end(); ++i) - { - wcerr << (*i).tl_wordform; - - for (std::vector::iterator k = (*i).pos_tags.begin(); k != (*i).pos_tags.end(); ++k) - { - cerr << "<"; - wcerr << (*k); - cerr << ">"; - } - - cerr << ":"; - - for (std::vector::iterator l = (*i).properties.begin(); l != (*i).properties.end(); ++l) - { - cerr << " "; - wcerr << (*l); - } - - cerr << "\t"; - } - - cerr << "\n"; - } - cerr << '\n'; + for (auto& temp_sentence : gq) { + cerr << "\n"; + for (auto& i : temp_sentence) { + cerr << i.tl_wordform; + + for (auto& k : i.pos_tags) { + cerr << "<" << k << ">"; + } + + cerr << ":"; + + for (auto& l : i.properties) { + cerr << " " << l; + } + + cerr << "\t"; + } + + cerr << "\n"; + } + cerr << '\n'; } -int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > input_pos_tags, wstring input_tl_wordform, wstring input_sl_lemma, wstring input_tl_lemma, ParseArx arx_file, int debug_flag) +int Scoring::add_word(int input_id, UString input_wordform, vector< UString > input_pos_tags, UString input_tl_wordform, UString input_sl_lemma, UString input_tl_lemma, ParseArx arx_file, int debug_flag) { - vector temp_prop; + vector temp_prop; parameters_datatype arx_parameters = arx_file.get_parameters(); unique_LU input_LU = {input_id, input_wordform, input_tl_wordform, input_sl_lemma, input_tl_lemma, input_pos_tags, temp_prop}; @@ -76,7 +67,7 @@ int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > in context.push_back(sentence); - if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters[L"delimiter"][L"default"]) ) + if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["delimiter"_u]["default"_u]) ) { vector new_sentence; @@ -85,7 +76,7 @@ int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > in } else { - if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters[L"delimiter"][L"default"])) + if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["delimiter"_u]["default"_u])) { context.back().push_back(input_LU); @@ -97,26 +88,26 @@ int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > in context.pop_front(); } - else + else { - parameter_return retval = check_pattern_name(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters[L"anaphor"]); + parameter_return retval = check_pattern_name(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["anaphor"_u]); if(retval.found == 1) //check if tags,lemma of current word match with anaphor in arx file { unique_LU anaphor_LU = input_LU; - vector temp_pos_tags = anaphor_LU.pos_tags; - temp_pos_tags.push_back(L"anaphor"); + vector temp_pos_tags = anaphor_LU.pos_tags; + temp_pos_tags.push_back("anaphor"_u); anaphor_LU.pos_tags = temp_pos_tags; - + context.back().push_back(anaphor_LU); apply_indicators(anaphor_LU, arx_file, retval.parameter_name, debug_flag); context.back().pop_back(); context.back().push_back(input_LU); - + return 1; //To show that something will be added in side ref } else @@ -124,13 +115,13 @@ int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > in context.back().push_back(input_LU); //add word to the latest added sentence in the queue } } - + } return 0; } -void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring parameter_name, int debug_flag) +void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, UString parameter_name, int debug_flag) { int distance_marker = 2; //starts from 2 for current sentence and reduces till -1 as we go to previous sentences int temp_score; @@ -142,40 +133,36 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par distance_marker = distance_marker - context_with_prop.size() + 1; //set distance to earliest sentence based on number of sentences in context - unordered_map all_markables_score = arx_file.get_all_markables_score(); - unordered_map parameter_markables_score = arx_file.get_parameter_markables_score(parameter_name); + unordered_map all_markables_score = arx_file.get_all_markables_score(); + unordered_map parameter_markables_score = arx_file.get_parameter_markables_score(parameter_name); if(debug_flag) { cerr << "\n** For anaphor: "; - fputws(anaphor.wordform.c_str(), stderr); + cerr << anaphor.wordform; cerr << "/"; - fputws(anaphor.tl_wordform.c_str(), stderr); + cerr << anaphor.tl_wordform; cerr << ", Context - with markables **\n"; } //Start going through sentences(earliest to current) and apply all indicators to modify scores of the NPs - for(deque< vector >::iterator i = context_with_prop.begin(); i!=context_with_prop.end(); ++i) - { + for (auto& i : context_with_prop) { firstNP = 1; - for (vector::iterator j = (*i).begin(); j!=(*i).end(); ++j) - { + for (auto& antecedent_LU : i) { if(debug_flag) { cerr << "\n"; - wcerr << (*j).wordform; + cerr << antecedent_LU.wordform; cerr << ": "; - print_tags((*j).properties); + print_tags(antecedent_LU.properties); cerr << "\n"; } - if(check_acceptable_tags((*j).pos_tags, (*j).sl_lemma, arx_file.get_parameters()[L"antecedent"][parameter_name])) + if(check_acceptable_tags(antecedent_LU.pos_tags, antecedent_LU.sl_lemma, arx_file.get_parameters()["antecedent"_u][parameter_name])) { temp_score = 0; - unique_LU antecedent_LU = *j; - if(check_agreement(antecedent_LU.pos_tags, anaphor.pos_tags)) { //Add or Remove Indicators Here @@ -191,20 +178,18 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par //Impeding Indicators //Indicators from XML file (iterate through all markables that provided a score without mentioning parameter_name) - for(unordered_map::iterator x = all_markables_score.begin(); x != all_markables_score.end(); ++x) - { - if(contains(antecedent_LU.properties, x->first)) + for (auto& x : all_markables_score) { + if(contains(antecedent_LU.properties, x.first)) { - temp_score += x->second; + temp_score += x.second; } } //Now get the scores from the markables that mentioned this specific parameter name - for(unordered_map::iterator x = parameter_markables_score.begin(); x != parameter_markables_score.end(); ++x) - { - if(contains(antecedent_LU.properties, x->first)) + for (auto& x : parameter_markables_score) { + if(contains(antecedent_LU.properties, x.first)) { - temp_score += x->second; + temp_score += x.second; } } @@ -214,7 +199,7 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par else { //cerr << "\nAgreement Failed for:"; - //wcerr << antecedent_LU.wordform; + //cerr << antecedent_LU.wordform; //cerr << "\n"; } } @@ -225,13 +210,13 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par } } -int Scoring::check_agreement(vector antecedent_tags, vector anaphor_tags) +int Scoring::check_agreement(const vector& antecedent_tags, const vector& anaphor_tags) { /* - if(contains(anaphor_tags, L"f") && contains(antecedent_tags, L"m")) + if(contains(anaphor_tags, "f") && contains(antecedent_tags, "m")) return 0; - if(contains(anaphor_tags, L"m") && contains(antecedent_tags, L"f")) + if(contains(anaphor_tags, "m") && contains(antecedent_tags, "f")) return 0; */ @@ -239,7 +224,7 @@ int Scoring::check_agreement(vector antecedent_tags, vector an } -wstring Scoring::get_antecedent(int debug_flag) +UString Scoring::get_antecedent(int debug_flag) { unique_LU final_antecedent_LU; antecedent final_antecedent = {final_antecedent_LU, -5}; @@ -249,17 +234,16 @@ wstring Scoring::get_antecedent(int debug_flag) cerr << "\n** Final Scores **\n"; } - for(vector::iterator it=antecedent_list.begin();it!=antecedent_list.end();++it) //read from furthest to nearest - { + for (auto& it : antecedent_list) { //read from furthest to nearest if(debug_flag) { - cerr << "\n" << (*it).LU.id << ": "; - fputws((*it).LU.wordform.c_str(), stderr); - cerr << " : " << (*it).score << "\n"; + cerr << "\n" << it.LU.id << ": "; + cerr << it.LU.wordform; + cerr << " : " << it.score << "\n"; } - - if((*it).score >= final_antecedent.score) - final_antecedent = (*it); + + if(it.score >= final_antecedent.score) + final_antecedent = it; } antecedent_list.clear(); @@ -267,9 +251,9 @@ wstring Scoring::get_antecedent(int debug_flag) if(debug_flag) { cerr << "\n" << "** Final Antecedent: "; - fputws(final_antecedent.LU.wordform.c_str(), stderr); + cerr << final_antecedent.LU.wordform; cerr << "/"; - fputws(final_antecedent.LU.tl_wordform.c_str(), stderr); + cerr << final_antecedent.LU.tl_wordform; cerr << " **\n"; } diff --git a/src/score.h b/src/score.h index 3f70621..26bb4c4 100644 --- a/src/score.h +++ b/src/score.h @@ -29,7 +29,7 @@ using namespace std; -void showq(deque < vector > gq); +void showq(const deque < vector >& gq); class Scoring { @@ -38,10 +38,10 @@ private: vector antecedent_list; public: - int add_word(int input_id, wstring input_wordform, vector< wstring > input_pos_tags, wstring input_tl_wordform, wstring input_sl_lemma, wstring input_tl_lemma, ParseArx arx_file, int debug_flag); - void apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring parameter_name, int debug_flag); - int check_agreement(vector antecedent_tags, vector anaphor_tags); - wstring get_antecedent(int debug_flag); + int add_word(int input_id, UString input_wordform, vector< UString > input_pos_tags, UString input_tl_wordform, UString input_sl_lemma, UString input_tl_lemma, ParseArx arx_file, int debug_flag); + void apply_indicators(unique_LU anaphor, ParseArx arx_file, UString parameter_name, int debug_flag); + int check_agreement(const vector& antecedent_tags, const vector& anaphor_tags); + UString get_antecedent(int debug_flag); void clear(); };