commit 54c63b93403cef7c75a246340e55443879e7634b Author: Tanmai Khanna Date: Sun Jun 14 16:36:15 2020 +0530 New feature:exclude-tags in pattern matching diff --git a/samples/apertium-eng-spa.spa-eng.arx b/samples/apertium-eng-spa.spa-eng.arx index 76023a1..38f4671 100644 --- a/samples/apertium-eng-spa.spa-eng.arx +++ b/samples/apertium-eng-spa.spa-eng.arx @@ -13,7 +13,7 @@ - + diff --git a/samples/test_exclude.txt b/samples/test_exclude.txt new file mode 100644 index 0000000..2ea24fb --- /dev/null +++ b/samples/test_exclude.txt @@ -0,0 +1,9 @@ +^El/The$ ^grup/group$ ^de/of/from$ +^el/the$ ^Parlament/Parliament$ +^haver/have$ +^mostrar/show/display$ +^aquest/this$ ^dimarts/Tuesday$ +^./.$ ^Tanmai/Tanmai$ ^es/is$ +^el seu/his$ ^suport/support$ +^a/at/in/to$ ^el/the$ +^*batle/*batle$ ^de/of/from$ ^el seu/his$ ^*AlarĂ³/*AlarĂ³$^./.$ diff --git a/src/parse_arx.cc b/src/parse_arx.cc index 4cc03b0..943512c 100644 --- a/src/parse_arx.cc +++ b/src/parse_arx.cc @@ -78,24 +78,27 @@ void ParseArx::parseParameterItem (xmlDocPtr doc, xmlNodePtr cur, wstring parame xmlChar *Attr; cur = cur->xmlChildrenNode; - vector temp_tags_list; + pair< vector , vector > temp_tags_list; while (cur != NULL) { if ((!xmlStrcmp(cur->name, (const xmlChar *)"parameter-item"))) { Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); - - //fprintf(stderr, "ParameterItem: "); - - temp_tags_list = parseTags(XMLParseUtil::towstring(Attr)); + temp_tags_list.first = parseTags(XMLParseUtil::towstring(Attr)); + + Attr = xmlGetProp(cur, (const xmlChar *)"exclude-tags"); + if (Attr) + { + temp_tags_list.second = parseTags(XMLParseUtil::towstring(Attr)); + } + parameters[parameter_type][parameter_name].push_back(temp_tags_list); - temp_tags_list.clear(); + temp_tags_list.first.clear(); + temp_tags_list.second.clear(); xmlFree(Attr); - - //key = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1); } cur = cur->next; @@ -163,23 +166,27 @@ void ParseArx::parseCatItem (xmlDocPtr doc, xmlNodePtr cur, wstring cat_name) xmlChar *Attr; cur = cur->xmlChildrenNode; - vector temp_tags_list; + pair< vector , vector > temp_tags_list; while (cur != NULL) { if ((!xmlStrcmp(cur->name, (const xmlChar *)"cat-item"))) { - Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); - //fprintf(stderr, "catItem: "); - - temp_tags_list = parseTags(XMLParseUtil::towstring(Attr)); + Attr = xmlGetProp(cur, (const xmlChar *)"has-tags"); + temp_tags_list.first = parseTags(XMLParseUtil::towstring(Attr)); + + Attr = xmlGetProp(cur, (const xmlChar *)"exclude-tags"); + if (Attr) + { + temp_tags_list.second = parseTags(XMLParseUtil::towstring(Attr)); + } cats[cat_name].push_back(temp_tags_list); - temp_tags_list.clear(); + temp_tags_list.first.clear(); + temp_tags_list.second.clear(); xmlFree(Attr); - //key = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1); } cur = cur->next; @@ -279,13 +286,11 @@ void ParseArx::parsePatterns (xmlDocPtr doc, xmlNodePtr cur, wstring markable_na if (parameter_name) { - //cerr << "HELLO!"; wstring parameter_name_ws = XMLParseUtil::towstring(parameter_name); parameter_markables_score[parameter_name_ws][markable_name] = score_int; } else { - //cerr << "FUCK!"; all_markables_score[markable_name] = score_int; } } diff --git a/src/parse_arx.h b/src/parse_arx.h index a43190d..90a39ab 100644 --- a/src/parse_arx.h +++ b/src/parse_arx.h @@ -30,7 +30,7 @@ using namespace std; -typedef vector< vector > acceptable_tags; +typedef vector< pair < vector, vector > > acceptable_tags; //a vector of pairs of tags to match and exclude struct markable_pattern { @@ -78,4 +78,4 @@ public: unordered_map get_parameter_markables_score(wstring parameter_name); }; -#endif \ No newline at end of file +#endif diff --git a/src/pattern_arx.cc b/src/pattern_arx.cc index 5c86f8f..a1d8a72 100644 --- a/src/pattern_arx.cc +++ b/src/pattern_arx.cc @@ -67,31 +67,33 @@ int check_acceptable_tags(vector input_tags, acceptable_tags check_tags int flag_contains_all = 1; - vector temp_tags = *i; - - for(std::vector::iterator j = temp_tags.begin(); j != temp_tags.end(); ++j) + vector temp_tags = i->first; + vector temp_exclude_tags = i->second; + + for(std::vector::iterator j = temp_tags.begin(); j != temp_tags.end(); ++j) //check for the tags in has-tags { - if(*j == L"*") //ignore * in the tags list continue; - if(!contains(input_tags, *j)) //if the required tag is NOT in the input LU tags + if(!contains(input_tags, *j)) //if the has-tag is NOT in the input LU tags { flag_contains_all = 0; break; } - /* - else - { - cerr << "FoundTag:"; - wcerr << *j; - cerr <<"\n"; - } - */ } - - if(flag_contains_all == 1) //if any tag list fully matched - return 1; //else continue to next tag list + + for(std::vector::iterator j = temp_exclude_tags.begin(); j != temp_exclude_tags.end(); ++j) //check for the tags in exclude-tags + { + if(contains(input_tags, *j)) //if the exclude-tag IS in the input LU tags + { + flag_contains_all = 0; + break; + } + } + + if(flag_contains_all == 1) //if any tag list fully matched (i.e. has-tags present, exclude-tags absent) + return 1; + //else continue to next tag list } return 0; //if it didn't return 1 then no tag list was fully matched