commit 391ebde6b54dae40646bfcf5ab52c50e4be99242 Author: Tanmai Khanna Date: Wed Jun 17 16:24:21 2020 +0530 Can add lemmas in cat-items/parameter-items now! | New test added diff --git a/src/anaphora.cc b/src/anaphora.cc index 2a54f62..d690191 100644 --- a/src/anaphora.cc +++ b/src/anaphora.cc @@ -173,6 +173,8 @@ int main(int argc, char **argv) wstring tl_form; vector sl_tags; vector tl_tags; + wstring sl_lemma; + wstring tl_lemma; ParseArx arx_file; int parse_arx_retval = arx_file.parseDoc(arxFileName); @@ -196,6 +198,8 @@ int main(int argc, char **argv) tl_form.clear(); sl_tags.clear(); tl_tags.clear(); + sl_lemma.clear(); + tl_lemma.clear(); gen_id = 0; score_module.clear(); @@ -251,12 +255,14 @@ int main(int argc, char **argv) tl_tags = LU.get_tl_tags(); sl_form = LU.get_sl_form(); sl_tags = LU.get_sl_tags(); + sl_lemma = LU.get_sl_lemma(); + tl_lemma = LU.get_tl_lemma(); if(!tl_form.empty()) //if TL exists { int retval; - retval = score_module.add_word(gen_id, sl_form, sl_tags, tl_form, arx_file, debug_flag); //Give word to Scoring Module + retval = score_module.add_word(gen_id, sl_form, sl_tags, tl_form, sl_lemma, tl_lemma, arx_file, debug_flag); //Give word to Scoring Module //If retval is 0, nothing will be added in side ref //If retval is 1, we call get_antecedent() and add it to ref diff --git a/src/anaphora.dtd b/src/anaphora.dtd index 5998af8..0a2d811 100644 --- a/src/anaphora.dtd +++ b/src/anaphora.dtd @@ -16,8 +16,9 @@ --> - + - + diff --git a/src/parse_arx.cc b/src/parse_arx.cc index 943512c..174e228 100644 --- a/src/parse_arx.cc +++ b/src/parse_arx.cc @@ -65,10 +65,6 @@ vector ParseArx::parseTags (wstring tags) if(!temptag.empty()) //if any tag remaining temp_tags_list.push_back(temptag); - //print_tags(temp_tags_list); - - //cerr << "\n"; - return temp_tags_list; } @@ -77,29 +73,39 @@ void ParseArx::parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, wstring parame { xmlChar *Attr; cur = cur->xmlChildrenNode; - - pair< vector , vector > temp_tags_list; - - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"parameter-item"))) - { - Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); - temp_tags_list.first = parseTags(XMLParseUtil::towstring(Attr)); + + item temp_item; + + while (cur != NULL) + { + temp_item.has_tags.clear(); + temp_item.exclude_tags.clear(); + temp_item.lemma.clear(); + + if ((!xmlStrcmp(cur->name, (const xmlChar *)"parameter-item"))) + { + Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); + if (Attr) + { + temp_item.has_tags = parseTags(XMLParseUtil::towstring(Attr)); + } Attr = xmlGetProp(cur, (const xmlChar *)"exclude-tags"); if (Attr) { - temp_tags_list.second = parseTags(XMLParseUtil::towstring(Attr)); + temp_item.exclude_tags = parseTags(XMLParseUtil::towstring(Attr)); } - parameters[parameter_type][parameter_name].push_back(temp_tags_list); - - temp_tags_list.first.clear(); - temp_tags_list.second.clear(); + Attr = xmlGetProp(cur, (const xmlChar *)"lemma"); + if (Attr) + { + temp_item.lemma = XMLParseUtil::towstring(Attr); + } + + parameters[parameter_type][parameter_name].push_back(temp_item); - xmlFree(Attr); - } + xmlFree(Attr); + } cur = cur->next; } @@ -117,16 +123,9 @@ void ParseArx::parseParameterTypes (xmlDocPtr doc, xmlNodePtr cur, wstring param if(cur->type == XML_ELEMENT_NODE) { parameter_type = XMLParseUtil::towstring(cur->name); - /* - cerr << "\nname: "; - wcerr << parameter_name; - cerr << "\ntype: "; - wcerr << parameter_type; - cerr << "\n"; - */ - - parseParameterItem(doc, cur, parameter_type, parameter_name); - } + + parseParameterItem(doc, cur, parameter_type, parameter_name); + } cur = cur->next; } @@ -166,28 +165,38 @@ void ParseArx::parseCatItem (xmlDocPtr doc, xmlNodePtr cur, wstring cat_name) xmlChar *Attr; cur = cur->xmlChildrenNode; - pair< vector , vector > temp_tags_list; + item temp_item; - while (cur != NULL) - { - if ((!xmlStrcmp(cur->name, (const xmlChar *)"cat-item"))) - { + while (cur != NULL) + { + temp_item.has_tags.clear(); + temp_item.exclude_tags.clear(); + temp_item.lemma.clear(); + + if ((!xmlStrcmp(cur->name, (const xmlChar *)"cat-item"))) + { Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); - temp_tags_list.first = parseTags(XMLParseUtil::towstring(Attr)); + if (Attr) + { + temp_item.has_tags = parseTags(XMLParseUtil::towstring(Attr)); + } Attr = xmlGetProp(cur, (const xmlChar *)"exclude-tags"); if (Attr) { - temp_tags_list.second = parseTags(XMLParseUtil::towstring(Attr)); + temp_item.exclude_tags = parseTags(XMLParseUtil::towstring(Attr)); } - cats[cat_name].push_back(temp_tags_list); - - temp_tags_list.first.clear(); - temp_tags_list.second.clear(); - - xmlFree(Attr); + + Attr = xmlGetProp(cur, (const xmlChar *)"lemma"); + if (Attr) + { + temp_item.lemma = XMLParseUtil::towstring(Attr); + } + + cats[cat_name].push_back(temp_item); - } + xmlFree(Attr); + } cur = cur->next; } diff --git a/src/parse_arx.h b/src/parse_arx.h index 90a39ab..eefe6da 100644 --- a/src/parse_arx.h +++ b/src/parse_arx.h @@ -30,7 +30,13 @@ using namespace std; -typedef vector< pair < vector, vector > > acceptable_tags; //a vector of pairs of tags to match and exclude +struct item { //for cat-item and parameter-item + vector has_tags; + vector exclude_tags; + wstring lemma; +}; + +typedef vector acceptable_tags; struct markable_pattern { diff --git a/src/parse_biltrans.cc b/src/parse_biltrans.cc index 6c6d822..4865ec8 100644 --- a/src/parse_biltrans.cc +++ b/src/parse_biltrans.cc @@ -48,8 +48,10 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) else //not in a tag { sl_form.push_back(*i); + sl_lemma.push_back(*i); ++i; sl_form.push_back(*i); + sl_lemma.push_back(*i); } } else if(seenSlash == 1) //tl (only first entry) @@ -65,8 +67,10 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) else //not in a tag { tl_form.push_back(*i); + tl_lemma.push_back(*i); ++i; tl_form.push_back(*i); + tl_lemma.push_back(*i); } } else @@ -99,6 +103,11 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) temptag.push_back(*i); //add char to current tag } } + + else + { + sl_lemma.push_back(*i); + } } else if(seenSlash == 1) //tl (only first entry in tl) @@ -122,6 +131,11 @@ ParseLexicalUnit::ParseLexicalUnit(wstring input_LU) temptag.push_back(*i); //add char to current tag } } + + else + { + tl_lemma.push_back(*i); + } } else //if tl has more than one entry @@ -151,6 +165,16 @@ vector< wstring > ParseLexicalUnit::get_tl_tags() return tl_tags; } +wstring ParseLexicalUnit::get_sl_lemma() +{ + return sl_lemma; +} + +wstring ParseLexicalUnit::get_tl_lemma() +{ + return tl_lemma; +} + /* //Uncomment to test this code void print_tags(vector< wstring > input) @@ -191,5 +215,3 @@ int main() return 0; } */ - - diff --git a/src/parse_biltrans.h b/src/parse_biltrans.h index 6b4120a..4b0eff1 100644 --- a/src/parse_biltrans.h +++ b/src/parse_biltrans.h @@ -50,6 +50,16 @@ private: * Target language tags */ vector< wstring > tl_tags; + + /** + * Source language lemma + */ + wstring sl_lemma; + + /** + * Target language lemma + */ + wstring tl_lemma; public: /** @@ -77,7 +87,17 @@ public: * Return the Target Language Form */ vector< wstring > get_tl_tags(); + + /** + * Return the Source Language Lemma + */ + wstring get_sl_lemma(); + + /** + * Return the Target Language Lemma + */ + wstring get_tl_lemma(); }; -#endif \ No newline at end of file +#endif diff --git a/src/pattern_arx.cc b/src/pattern_arx.cc index a1d8a72..8b89b9d 100644 --- a/src/pattern_arx.cc +++ b/src/pattern_arx.cc @@ -24,6 +24,7 @@ #include #include #include +#include using namespace std; @@ -60,15 +61,23 @@ int contains_any(vector tags, vector candidates) return 0; //if no matches } -int check_acceptable_tags(vector input_tags, acceptable_tags check_tags) //all tags in any tag list in check_tags must exist in input_tags +void toLower(basic_string& s) +{ + for (basic_string::iterator p = s.begin(); p != s.end(); ++p) + { + *p = towlower(*p); + } +} + +int check_acceptable_tags(vector input_tags, wstring input_sl_lemma, acceptable_tags check_tags) //check has-tags, exclude-tags, lemma { for (acceptable_tags::iterator i = check_tags.begin(); i != check_tags.end(); ++i) { int flag_contains_all = 1; - vector temp_tags = i->first; - vector temp_exclude_tags = i->second; + vector temp_tags = i->has_tags; + vector temp_exclude_tags = i->exclude_tags; for(std::vector::iterator j = temp_tags.begin(); j != temp_tags.end(); ++j) //check for the tags in has-tags { @@ -82,6 +91,11 @@ int check_acceptable_tags(vector input_tags, acceptable_tags check_tags } } + if(flag_contains_all == 0) + { + continue; + } + for(std::vector::iterator j = temp_exclude_tags.begin(); j != temp_exclude_tags.end(); ++j) //check for the tags in exclude-tags { if(contains(input_tags, *j)) //if the exclude-tag IS in the input LU tags @@ -90,16 +104,49 @@ int check_acceptable_tags(vector input_tags, acceptable_tags check_tags break; } } - - if(flag_contains_all == 1) //if any tag list fully matched (i.e. has-tags present, exclude-tags absent) - return 1; - //else continue to next tag list + + if(flag_contains_all == 0) + { + continue; + } + + if(!(i->lemma).empty()) + { + wstring temp_lemma = i->lemma; + + if(input_sl_lemma.length() == temp_lemma.length()) + { + if(input_sl_lemma.compare(temp_lemma) != 0) + { + toLower(input_sl_lemma); + toLower(temp_lemma); + + if(input_sl_lemma.compare(temp_lemma) != 0) + { + flag_contains_all = 0; + } + } + } + else + { + flag_contains_all = 0; + } + } + + if(flag_contains_all == 0) + { + continue; + } + else //if any tag list fully matched (i.e. has-tags present, exclude-tags absent) + { + return 1; + } } return 0; //if it didn't return 1 then no tag list was fully matched } -parameter_return check_pattern_name(vector input_tags, unordered_map parameter_names) +parameter_return check_pattern_name(vector input_tags, wstring input_sl_lemma, unordered_map parameter_names) //find out if any of the anaphors match wrt tags, and if yes, return the unique name { parameter_return retval; @@ -110,7 +157,7 @@ parameter_return check_pattern_name(vector input_tags, unordered_mapfirst; acceptable_tags parameter_tags= it->second; - if(check_acceptable_tags(input_tags, parameter_tags)) + if(check_acceptable_tags(input_tags, input_sl_lemma, parameter_tags)) { retval.found = 1; retval.parameter_name = parameter_name; @@ -157,7 +204,7 @@ deque< vector > add_properties(deque< vector > context, Pa acceptable_tags pattern_item_tags = arx_cats[current_pattern[x].name]; //get pattern item tags from def-cats - if(check_acceptable_tags((*(n+x)).pos_tags, pattern_item_tags)) //comparing current LU tags to pattern tags + if(check_acceptable_tags((*(n+x)).pos_tags, (*(n+x)).sl_lemma, pattern_item_tags)) //comparing current LU tags to pattern tags and lemma { match_flag = 1; diff --git a/src/pattern_arx.h b/src/pattern_arx.h index 6422a5a..0a2ba2e 100644 --- a/src/pattern_arx.h +++ b/src/pattern_arx.h @@ -29,9 +29,11 @@ using namespace std; struct unique_LU { - unsigned int id; + int id; wstring wordform; wstring tl_wordform; + wstring sl_lemma; + wstring tl_lemma; vector pos_tags; vector properties; }; @@ -50,10 +52,11 @@ struct parameter_return int contains(vector tags, wstring tag); int contains_any(vector tags, vector candidates); +void toLower(basic_string& s); -int check_acceptable_tags(vector input_tags, acceptable_tags check_tags); -parameter_return check_pattern_name(vector input_tags, unordered_map parameter_names); +int check_acceptable_tags(vector input_tags, wstring input_sl_lemma, acceptable_tags check_tags); +parameter_return check_pattern_name(vector input_tags, wstring input_sl_lemma, unordered_map parameter_names); deque< vector > add_properties(deque< vector > context, ParseArx arx_file); -#endif \ No newline at end of file +#endif diff --git a/src/score.cc b/src/score.cc index 50f17cb..2724383 100644 --- a/src/score.cc +++ b/src/score.cc @@ -62,12 +62,12 @@ void showq(deque < vector > gq) //to display context if needed (testi cerr << '\n'; } -int Scoring::add_word(unsigned int input_id, wstring input_wordform, vector< wstring > input_pos_tags, wstring input_tl_wordform, ParseArx arx_file, int debug_flag) +int Scoring::add_word(int input_id, wstring input_wordform, vector< wstring > input_pos_tags, wstring input_tl_wordform, wstring input_sl_lemma, wstring input_tl_lemma, ParseArx arx_file, int debug_flag) { vector temp_prop; parameters_datatype arx_parameters = arx_file.get_parameters(); - unique_LU input_LU = {input_id, input_wordform, input_tl_wordform, input_pos_tags, temp_prop}; //initialise LU + unique_LU input_LU = {input_id, input_wordform, input_tl_wordform, input_sl_lemma, input_tl_lemma, input_pos_tags, temp_prop}; //initialise LU if(context.empty()) //if queue is empty { @@ -76,7 +76,7 @@ int Scoring::add_word(unsigned int input_id, wstring input_wordform, vector< wst context.push_back(sentence); - if(check_acceptable_tags(input_LU.pos_tags, arx_parameters[L"delimiter"][L"default"]) ) //if sentence end (somehow the first LU is a sentence end) + if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters[L"delimiter"][L"default"]) ) //if sentence end (somehow the first LU is a sentence end) { vector new_sentence; @@ -85,7 +85,7 @@ int Scoring::add_word(unsigned int input_id, wstring input_wordform, vector< wst } else //if queue is not empty { - if(check_acceptable_tags(input_LU.pos_tags, arx_parameters[L"delimiter"][L"default"])) + if(check_acceptable_tags(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters[L"delimiter"][L"default"])) { context.back().push_back(input_LU); //add to context so that it can also be matched in a pattern @@ -99,9 +99,9 @@ int Scoring::add_word(unsigned int input_id, wstring input_wordform, vector< wst else { - parameter_return retval = check_pattern_name(input_LU.pos_tags, arx_parameters[L"anaphor"]); + parameter_return retval = check_pattern_name(input_LU.pos_tags, input_LU.sl_lemma, arx_parameters[L"anaphor"]); - if(retval.found == 1) //check if tags of current word match with anaphor tags in arx file + if(retval.found == 1) //check if tags,lemma of current word match with anaphor tags in arx file { unique_LU anaphor_LU = input_LU; @@ -173,7 +173,7 @@ void Scoring::apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring par cerr << "\n"; } - if(check_acceptable_tags((*j).pos_tags, arx_file.get_parameters()[L"antecedent"][parameter_name]) ) // if it is antecedent (based on external xml file) + if(check_acceptable_tags((*j).pos_tags, (*j).sl_lemma, arx_file.get_parameters()[L"antecedent"][parameter_name]) ) // if it is antecedent (based on external xml file) { temp_score = 0; @@ -293,4 +293,4 @@ void Scoring::clear() //use a destructor? { context.clear(); //empty queue antecedent_list.clear(); //empty antecedent list -} \ No newline at end of file +} diff --git a/src/score.h b/src/score.h index 95e3326..f08d189 100644 --- a/src/score.h +++ b/src/score.h @@ -38,7 +38,7 @@ private: vector antecedent_list; //A list of antecedents public: - int add_word(unsigned int input_id, wstring input_wordform, vector< wstring > pos_tags, wstring input_tl_wordform, ParseArx arx_file, int debug_flag); + int add_word(int input_id, wstring input_wordform, vector< wstring > input_pos_tags, wstring input_tl_wordform, wstring input_sl_lemma, wstring input_tl_lemma, ParseArx arx_file, int debug_flag); void apply_indicators(unique_LU anaphor, ParseArx arx_file, wstring parameter_name, int debug_flag); int check_agreement(vector antecedent_tags, vector anaphor_tags); wstring get_antecedent(int debug_flag); @@ -46,4 +46,4 @@ public: }; -#endif \ No newline at end of file +#endif diff --git a/tests/apertium-eng-spa.spa-eng.arx b/tests/apertium-eng-spa.spa-eng.arx index 82c3edb..e02cb8e 100644 --- a/tests/apertium-eng-spa.spa-eng.arx +++ b/tests/apertium-eng-spa.spa-eng.arx @@ -9,7 +9,7 @@ - + @@ -28,6 +28,16 @@ + + + + + + + + + + @@ -71,6 +81,10 @@ + + + + @@ -212,6 +226,16 @@ + + + + + + + + + + diff --git a/tests/test_lemma.in b/tests/test_lemma.in new file mode 100644 index 0000000..a9c57a7 --- /dev/null +++ b/tests/test_lemma.in @@ -0,0 +1,10 @@ +^El/The$ ^grup/group$ ^test/grouptest$ ^de/of/from$ +^el/the$ ^Parlament/Parliament$ +^haver/have$ +^mostrar/show/display$ +^aquest/this$ ^dimarts/Tuesday$ +^el seu/his$ ^suport/support$ +^a/at/in/to$ ^test/his$ +^*batle/*batle$ ^de/of/from$ ^*Alaró/*Alaró$^./.$ +^Parlament/Parliament$ ^es/is$ ^test/his$ +^Parlament/Parliament$ ^esta/is$ ^test/his$ diff --git a/tests/test_lemma.out b/tests/test_lemma.out new file mode 100644 index 0000000..7cb1bcb --- /dev/null +++ b/tests/test_lemma.out @@ -0,0 +1,10 @@ +^El/The/$ ^grup/group/$ ^test/grouptest/$ ^de/of/from/$ +^el/the/$ ^Parlament/Parliament/$ +^haver/have/$ +^mostrar/show/display/$ +^aquest/this/$ ^dimarts/Tuesday/$ +^el seu/his/$ ^suport/support/$ +^a/at/in/to/$ ^test/his/Parliament$ +^*batle/*batle/$ ^de/of/from/$ ^*Alaró/*Alaró/$^././$ +^Parlament/Parliament/$ ^es/is/$ ^test/his/Parliament$ +^Parlament/Parliament/$ ^esta/is/$ ^test/his/Parliament$