commit 670eff18153713b7ace827b10aa8a75c36259719 Author: Daniel Swanson Date: Mon Jun 14 12:44:08 2021 -0500 use xml iterators diff --git a/src/parse_arx.cc b/src/parse_arx.cc index 0bf4eda..5636eda 100644 --- a/src/parse_arx.cc +++ b/src/parse_arx.cc @@ -19,30 +19,24 @@ #include "parse_arx.h" #include -#include -#include -#include -#include -#include -#include #include -#include #include +#include -void print_tags(vector input) +void print_tags(const vector& input) { for (auto& it : input) { cerr << it << " "; } } -vector ParseArx::parseTags (UString tags) +vector ParseArx::parseTags (const UString& tags) { vector temp_tags_list; UString temptag; - for (UString::iterator i = tags.begin(); i != tags.end(); ++i) + for (UString::const_iterator i = tags.begin(); i != tags.end(); ++i) { if(*i == '\\') { @@ -67,311 +61,163 @@ vector ParseArx::parseTags (UString tags) return temp_tags_list; } -void ParseArx::parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, UString parameter_type, UString parameter_name) +void ParseArx::parseParameterItem (xmlNodePtr cur, UString parameter_type, UString parameter_name) //parameter_name: detpos, verbal, etc., parameter_type: anaphor, antecedent, etc. { - xmlChar *Attr; - cur = cur->xmlChildrenNode; - - item temp_item; - - while (cur != NULL) - { - temp_item.has_tags.clear(); - temp_item.exclude_tags.clear(); - temp_item.lemma.clear(); - - if ((!xmlStrcmp(cur->name, (const xmlChar *)"parameter-item"))) - { - - Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); - if (Attr) - { - temp_item.has_tags = parseTags(to_ustring((const char*)Attr)); - } - - Attr = xmlGetProp(cur, (const xmlChar *)"exclude-tags"); - if (Attr) - { - temp_item.exclude_tags = parseTags(to_ustring((const char*)Attr)); - } - - Attr = xmlGetProp(cur, (const xmlChar *)"lemma"); - if (Attr) - { - temp_item.lemma = to_ustring((const char*)Attr); - } - - parameters[parameter_type][parameter_name].push_back(temp_item); - - xmlFree(Attr); - } - - cur = cur->next; + for (auto pi : children(cur)) { + if ((!xmlStrcmp(pi->name, (const xmlChar *)"parameter-item"))) { + item temp_item; + temp_item.has_tags = parseTags(getattr(pi, "has-tags")); + temp_item.exclude_tags = parseTags(getattr(pi, "exclude-tags")); + temp_item.lemma = getattr(pi, "lemma"); + parameters[parameter_type][parameter_name].push_back(temp_item); + } } - return; } -void ParseArx::parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, UString parameter_name) +void ParseArx::parseParameterTypes (xmlNodePtr cur, UString parameter_name) { - UString parameter_type; - - cur = cur->xmlChildrenNode; - - while (cur != NULL) - { - if(cur->type == XML_ELEMENT_NODE) - { - parameter_type = to_ustring((const char*)cur->name); - - parseParameterItem(doc, cur, parameter_type, parameter_name); - } - - cur = cur->next; + for (auto param : children(cur)) { + parseParameterItem(param, to_ustring((const char*) param->name), + parameter_name); } - return; } -void ParseArx::parseParameters (xmlDocPtr doc, xmlNodePtr cur) +void ParseArx::parseParameters (xmlNodePtr cur) { - xmlChar *parameter_name; - UString parameter_type; - cur = cur->xmlChildrenNode; - - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"def-parameter"))) - { - parameter_name = xmlGetProp(cur, (const xmlChar *)"n"); - - parseParameterTypes(doc,cur, to_ustring((const char*)parameter_name)); - xmlFree(parameter_name); - } - else if ((!xmlStrcmp(cur->name, (const xmlChar *)"delimiter"))) - { - parameter_type = to_ustring((const char*)cur->name); - - parseParameterItem(doc, cur, parameter_type, "default"_u); - } - - cur = cur->next; + for (auto param : children(cur)) { + if (!xmlStrcmp(param->name, (const xmlChar*)"def-parameter")) { + parseParameterTypes(param, getattr(param, "n")); + } else if (!xmlStrcmp(param->name, (const xmlChar*)"delimiter")) { + parseParameterItem(param, "delimiter"_u, "default"_u); + } } - return; } -void ParseArx::parseCatItem (xmlDocPtr doc, xmlNodePtr cur, UString cat_name) +void ParseArx::parseCatItem (xmlNodePtr cur, UString cat_name) { - xmlChar *Attr; - cur = cur->xmlChildrenNode; - - item temp_item; - - while (cur != NULL) - { - temp_item.has_tags.clear(); - temp_item.exclude_tags.clear(); - temp_item.lemma.clear(); - - if ((!xmlStrcmp(cur->name, (const xmlChar *)"cat-item"))) - { - Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); - if (Attr) - { - temp_item.has_tags = parseTags(to_ustring((const char*)Attr)); - } - - Attr = xmlGetProp(cur, (const xmlChar *)"exclude-tags"); - if (Attr) - { - temp_item.exclude_tags = parseTags(to_ustring((const char*)Attr)); - } - - Attr = xmlGetProp(cur, (const xmlChar *)"lemma"); - if (Attr) - { - temp_item.lemma = to_ustring((const char*)Attr); - } - - cats[cat_name].push_back(temp_item); - - xmlFree(Attr); - } - - cur = cur->next; + for (auto ci : children(cur)) { + if (!xmlStrcmp(ci->name, (const xmlChar*)"cat-item")) { + item temp_item; + temp_item.has_tags = parseTags(getattr(ci, "has-tags")); + temp_item.exclude_tags = parseTags(getattr(ci, "exclude-tags")); + temp_item.lemma = getattr(ci, "lemma"); + cats[cat_name].push_back(temp_item); + } } - return; } -void ParseArx::parseCats (xmlDocPtr doc, xmlNodePtr cur) +void ParseArx::parseCats (xmlNodePtr cur) { - xmlChar *Attr; - cur = cur->xmlChildrenNode; - - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"def-cat"))) - { - Attr = xmlGetProp(cur, (const xmlChar *)"n"); - - parseCatItem(doc,cur, to_ustring((const char*)Attr)); - xmlFree(Attr); - } - - cur = cur->next; + for (auto cat : children(cur)) { + if (!xmlStrcmp(cat->name, (const xmlChar*)"def-cat")) { + parseCatItem(cat, getattr(cat, "n")); + } } - return; } -vector ParseArx::parsePatternItem (xmlDocPtr doc, xmlNodePtr cur) +vector ParseArx::parsePatternItem (xmlNodePtr cur) { xmlChar *Attr; - cur = cur->xmlChildrenNode; vector temp_pattern; - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"pattern-item"))) - { - markable_pattern temp; + for (auto pi : children(cur)) { + if ((!xmlStrcmp(pi->name, (const xmlChar *)"pattern-item"))) { + markable_pattern temp; - Attr = xmlGetProp(cur, (const xmlChar *)"n"); - temp.name = to_ustring((const char*)Attr); + Attr = xmlGetProp(pi, (const xmlChar *)"n"); + temp.name = to_ustring((const char*)Attr); - xmlFree(Attr); + xmlFree(Attr); - Attr = xmlGetProp(cur, (const xmlChar *)"head"); + Attr = xmlGetProp(pi, (const xmlChar *)"head"); - if(Attr != NULL) - { - temp.head = 1; - } - else - temp.head = 0; + if(Attr != NULL) { + temp.head = 1; + } else { + temp.head = 0; + } - xmlFree(Attr); + xmlFree(Attr); - temp_pattern.push_back(temp); - } - - cur = cur->next; + temp_pattern.push_back(temp); + } } - return temp_pattern; } -void ParseArx::parsePatterns (xmlDocPtr doc, xmlNodePtr cur, UString markable_name) +void ParseArx::parsePatterns (xmlNodePtr cur, UString markable_name) { - xmlChar *Attr; + for (auto pat : children(cur)) { + if ((!xmlStrcmp(pat->name, (const xmlChar *)"pattern"))) { + vector temp_pattern = parsePatternItem(pat); - cur = cur->xmlChildrenNode; + markables[markable_name].push_back(temp_pattern); + } else if ((!xmlStrcmp(pat->name, (const xmlChar *)"score"))) { + int score_int = StringUtils::stoi(getattr(pat, "n")); - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"pattern"))) - { - vector temp_pattern = parsePatternItem(doc,cur); - - markables[markable_name].push_back(temp_pattern); - } - - else if ((!xmlStrcmp(cur->name, (const xmlChar *)"score"))) - { - Attr = xmlGetProp(cur, (const xmlChar *)"n"); - - UString score_ws = to_ustring((const char*)Attr); - int score_int = StringUtils::stoi(score_ws); - - xmlChar *parameter_name = xmlGetProp(cur, (const xmlChar *)"parameter"); - - if (parameter_name) - { - UString parameter_name_ws = to_ustring((const char*)parameter_name); - parameter_markables_score[parameter_name_ws][markable_name] = score_int; - } - else - { - all_markables_score[markable_name] = score_int; - } - } - - cur = cur->next; + xmlChar *param_name = xmlGetProp(cur, (const xmlChar*)"parameter"); + + if (param_name) { + UString name = to_ustring((const char*)param_name); + parameter_markables_score[name][markable_name] = score_int; + } else { + all_markables_score[markable_name] = score_int; + } + } } - return; } -void ParseArx::parseMarkables (xmlDocPtr doc, xmlNodePtr cur) +void ParseArx::parseMarkables (xmlNodePtr cur) { - xmlChar *Attr; - cur = cur->xmlChildrenNode; - - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"markable"))) - { - Attr = xmlGetProp(cur, (const xmlChar *)"n"); - - parsePatterns(doc,cur, to_ustring((const char*)Attr)); - - xmlFree(Attr); - } - - cur = cur->next; + for (auto m : children(cur)) { + if ((!xmlStrcmp(m->name, (const xmlChar *)"markable"))) { + parsePatterns(m, getattr(m, "n")); + } } - return; } int ParseArx::parseDoc(char *docname) { - xmlDocPtr doc; xmlNodePtr cur; - doc = xmlParseFile(docname); + curDoc = xmlParseFile(docname); - if (doc == NULL ) + if (curDoc == nullptr ) { fprintf(stderr,"Document not parsed successfully. \n"); return -1; } - cur = xmlDocGetRootElement(doc); + cur = xmlDocGetRootElement(curDoc); if (cur == NULL) { fprintf(stderr,"Empty Document!\n"); - xmlFreeDoc(doc); + xmlFreeDoc(curDoc); return 1; } if (xmlStrcmp(cur->name, (const xmlChar *) "ref")) { fprintf(stderr,"Document of the wrong type! Root node should be ref.\n"); - xmlFreeDoc(doc); + xmlFreeDoc(curDoc); return 2; } - cur = cur->xmlChildrenNode; - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"section-parameters"))) - { - parseParameters (doc, cur); + for (auto ch : children(cur)) { + if ((!xmlStrcmp(ch->name, (const xmlChar*)"section-parameters"))) { + parseParameters(ch); + } else if ((!xmlStrcmp(ch->name, (const xmlChar*)"section-def-cats"))) { + parseCats(ch); + } else if ((!xmlStrcmp(ch->name, (const xmlChar*)"section-markables"))) { + parseMarkables(ch); } - - else if ((!xmlStrcmp(cur->name, (const xmlChar *)"section-def-cats"))) - { - parseCats (doc, cur); - } - - else if ((!xmlStrcmp(cur->name, (const xmlChar *)"section-markables"))) - { - parseMarkables (doc, cur); - } - - cur = cur->next; } - xmlFreeDoc(doc); + xmlFreeDoc(curDoc); + curDoc = nullptr; return 0; } diff --git a/src/parse_arx.h b/src/parse_arx.h index 4362400..2a5c98a 100644 --- a/src/parse_arx.h +++ b/src/parse_arx.h @@ -19,14 +19,10 @@ #ifndef _PARSEARX_ #define _PARSEARX_ -#include -#include -#include #include -#include #include #include -#include +#include using namespace std; @@ -48,7 +44,7 @@ typedef vector< vector > acceptable_patterns; typedef unordered_map< UString, unordered_map > parameters_datatype; -void print_tags(vector< UString > input); +void print_tags(const vector< UString >& input); class ParseArx { @@ -60,20 +56,21 @@ private: unordered_map all_markables_score; //markable name mapped to score of markable, will be applied on all anaphors unordered_map > parameter_markables_score; //parameter name mapped to a mapping of markable and score (when parameter name is explicitly mentioned in arx) -public: - int parseDoc(char *docname); - void parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, UString parameter_name); - void parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, UString parameter_name, UString parameter_type); - void parseParameters (xmlDocPtr doc, xmlNodePtr cur); + xmlDocPtr curDoc = nullptr; + vector parseTags (const UString& tags); + void parseParameterTypes (xmlNodePtr cur, UString parameter_name); + void parseParameterItem (xmlNodePtr cur, UString parameter_name, UString parameter_type); + void parseParameters (xmlNodePtr cur); - void parseCats (xmlDocPtr doc, xmlNodePtr cur); - void parseCatItem (xmlDocPtr doc, xmlNodePtr cur, UString cat_name); + void parseCats (xmlNodePtr cur); + void parseCatItem (xmlNodePtr cur, UString cat_name); - void parseMarkables (xmlDocPtr doc, xmlNodePtr cur); - void parsePatterns (xmlDocPtr doc, xmlNodePtr cur, UString markable_name); - vector parsePatternItem (xmlDocPtr doc, xmlNodePtr cur); + void parseMarkables (xmlNodePtr cur); + void parsePatterns (xmlNodePtr cur, UString markable_name); + vector parsePatternItem (xmlNodePtr cur); - vector parseTags (UString tags); +public: + int parseDoc(char *docname); parameters_datatype get_parameters(); unordered_map get_cats(); diff --git a/src/score.cc b/src/score.cc index 8055395..c045205 100644 --- a/src/score.cc +++ b/src/score.cc @@ -28,33 +28,29 @@ using namespace std; -void showq(deque < vector > gq) +void showq(const deque < vector >& gq) { - for(std::deque < vector >::iterator j = gq.begin(); j != gq.end(); ++j) - { - vector temp_sentence = *j; + for (auto& temp_sentence : gq) { + cerr << "\n"; + for (auto& i : temp_sentence) { + cerr << i.tl_wordform; - cerr << "\n"; - for (std::vector::iterator i = temp_sentence.begin(); i != temp_sentence.end(); ++i) - { - cerr << (*i).tl_wordform; - - for (auto& k : (*i).pos_tags) { - cerr << "<" << k << ">"; - } + for (auto& k : i.pos_tags) { + cerr << "<" << k << ">"; + } - cerr << ":"; + cerr << ":"; - for (auto& l : (*i).properties) { - cerr << " " << l; - } + for (auto& l : i.properties) { + cerr << " " << l; + } - cerr << "\t"; - } + cerr << "\t"; + } - cerr << "\n"; - } - cerr << '\n'; + cerr << "\n"; + } + cerr << '\n'; } int Scoring::add_word(int input_id, UString input_wordform, vector< UString > input_pos_tags, UString input_tl_wordform, UString input_sl_lemma, UString input_tl_lemma, ParseArx arx_file, int debug_flag) @@ -71,7 +67,7 @@ int Scoring::add_word(int input_id, UString input_wordform, vector< UString > in context.push_back(sentence); - if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["delimiter"_u]["default"_u]) ) + if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["delimiter"_u]["default"_u]) ) { vector new_sentence; @@ -80,7 +76,7 @@ int Scoring::add_word(int input_id, UString input_wordform, vector< UString > in } else { - if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["delimiter"_u]["default"_u])) + if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["delimiter"_u]["default"_u])) { context.back().push_back(input_LU); @@ -94,7 +90,7 @@ int Scoring::add_word(int input_id, UString input_wordform, vector< UString > in else { - parameter_return retval = check_pattern_name(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["anaphor"_u]); + parameter_return retval = check_pattern_name(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters["anaphor"_u]); if(retval.found == 1) //check if tags,lemma of current word match with anaphor in arx file { @@ -150,27 +146,23 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, UString par } //Start going through sentences(earliest to current) and apply all indicators to modify scores of the NPs - for(deque< vector >::iterator i = context_with_prop.begin(); i!=context_with_prop.end(); ++i) - { + for (auto& i : context_with_prop) { firstNP = 1; - for (vector::iterator j = (*i).begin(); j!=(*i).end(); ++j) - { + for (auto& antecedent_LU : i) { if(debug_flag) { cerr << "\n"; - cerr << (*j).wordform; + cerr << antecedent_LU.wordform; cerr << ": "; - print_tags((*j).properties); + print_tags(antecedent_LU.properties); cerr << "\n"; } - if(check_acceptable_tags((*j).pos_tags, (*j).sl_lemma, arx_file.get_parameters()["antecedent"_u][parameter_name])) + if(check_acceptable_tags(antecedent_LU.pos_tags, antecedent_LU.sl_lemma, arx_file.get_parameters()["antecedent"_u][parameter_name])) { temp_score = 0; - unique_LU antecedent_LU = *j; - if(check_agreement(antecedent_LU.pos_tags, anaphor.pos_tags)) { //Add or Remove Indicators Here @@ -218,7 +210,7 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, UString par } } -int Scoring::check_agreement(vector antecedent_tags, vector anaphor_tags) +int Scoring::check_agreement(const vector& antecedent_tags, const vector& anaphor_tags) { /* if(contains(anaphor_tags, "f") && contains(antecedent_tags, "m")) @@ -242,17 +234,16 @@ UString Scoring::get_antecedent(int debug_flag) cerr << "\n** Final Scores **\n"; } - for(vector::iterator it=antecedent_list.begin();it!=antecedent_list.end();++it) //read from furthest to nearest - { + for (auto& it : antecedent_list) { //read from furthest to nearest if(debug_flag) { - cerr << "\n" << (*it).LU.id << ": "; - cerr << (*it).LU.wordform; - cerr << " : " << (*it).score << "\n"; + cerr << "\n" << it.LU.id << ": "; + cerr << it.LU.wordform; + cerr << " : " << it.score << "\n"; } - if((*it).score >= final_antecedent.score) - final_antecedent = (*it); + if(it.score >= final_antecedent.score) + final_antecedent = it; } antecedent_list.clear(); diff --git a/src/score.h b/src/score.h index 33809c0..26bb4c4 100644 --- a/src/score.h +++ b/src/score.h @@ -29,7 +29,7 @@ using namespace std; -void showq(deque < vector > gq); +void showq(const deque < vector >& gq); class Scoring { @@ -40,7 +40,7 @@ private: public: int add_word(int input_id, UString input_wordform, vector< UString > input_pos_tags, UString input_tl_wordform, UString input_sl_lemma, UString input_tl_lemma, ParseArx arx_file, int debug_flag); void apply_indicators(unique_LU anaphor, ParseArx arx_file, UString parameter_name, int debug_flag); - int check_agreement(vector antecedent_tags, vector anaphor_tags); + int check_agreement(const vector& antecedent_tags, const vector& anaphor_tags); UString get_antecedent(int debug_flag); void clear(); };