Index: branches/weighted-transfer/apertium/apertium/transfer.cc =================================================================== --- branches/weighted-transfer/apertium/apertium/transfer.cc (revision 72278) +++ branches/weighted-transfer/apertium/apertium/transfer.cc (revision 72279) @@ -224,6 +224,7 @@ string lemma = ""; string tags = ""; string rule_id = ""; + int current_pattern_length = 0; string current_pattern = ""; map > > weighted_pattern_rule_group; // to track all rules in one group //tc @@ -262,6 +263,7 @@ { if(k->type == XML_ELEMENT_NODE && !xmlStrcmp(k->name, (const xmlChar *) "pattern")) { + current_pattern_length = 0; weight = atof(getNodeAttr(k, "weight").c_str()); for(xmlNode *patit = k->children; patit != NULL; patit = patit->next) { @@ -269,6 +271,7 @@ { lemma = getNodeAttr(patit, "lemma"); current_pattern = current_pattern + lemma; + current_pattern_length++; tags = getNodeAttr(patit, "tags"); if (tags != "") @@ -303,7 +306,7 @@ weighted_patterns.push_back(weighted_pattern_rule_group); weighted_pattern_rule_group.clear(); cerr << endl; // di - rule_groups.push_back(current_rule_group); + rule_groups.push_back(make_pair(current_pattern_length, current_rule_group)); current_rule_group.clear(); rule_group_index++; } @@ -313,12 +316,10 @@ cerr << "These are the rule groups you collected:" << endl; // di for (unsigned int k1 = 0; k1 < rule_groups.size(); k1++) // di { // di - cerr << "rule_groups[" << k1 << "]:" << endl; // di - for (unsigned int k2 = 0; k2 < rule_groups[k1].size(); k2++) // di + cerr << "rule_groups[" << k1 << "]: " << rule_groups[k1].first << endl; // di + for (unsigned int k2 = 0; k2 < rule_groups[k1].second.size(); k2++) // di { // di - cerr << " " << rule_groups[k1][k2] << endl; // di - //cerr << " rule_group_map[" << rule_groups[k1][k2] << "]: "; // di - //cerr << rule_group_map[rule_groups[k1][k2]] << endl; // di + cerr << " " << rule_groups[k1].second[k2] << endl; // di } // di cerr << endl; // di } // di @@ -2460,6 +2461,7 @@ wstring wtmpchunk = L""; string tmpchunk = ""; + string lookup_chunk = ""; string fallback_tmpchunk = ""; for(unsigned int i = 0; i != limit; i++) @@ -2569,11 +2571,11 @@ { /* //tc If there are other rules in the rule group, - that means chosen rule is ambiguous. + that means the chosen rule is ambiguous. */ rule_group_num = rule_group_map[chosen_rule_id]; - if (rule_groups[rule_group_num].size() > 1) + if (rule_groups[rule_group_num].second.size() > 1) { map > >::iterator lookup_pattern; tmpchunk = UtfConverter::toUtf8(wtmpchunk); @@ -2581,36 +2583,24 @@ cerr << "Rule # " << lastrule_num << " with id '" << chosen_rule_id; // di cerr << "' is ambiguous on input:" << endl; // di cerr << tmpchunk << endl << endl; // di - - // go through patterns //tc - bool found = false; - lookup_pattern = weighted_patterns[rule_group_num].find(tmpchunk); - if (lookup_pattern != weighted_patterns[rule_group_num].end()) + + lookup_chunk = ""; + unsigned int lookup_chunk_len = 0; + for (unsigned int i = 0; i <= tmpchunk.size() && lookup_chunk_len < rule_groups[rule_group_num].first; i++) { - found = true; - } - else + lookup_chunk = lookup_chunk + tmpchunk[i]; + if (tmpchunk[i] == ' ') { - cerr << "Pattern not found" << endl; // di - unsigned int chunk_end; - for (chunk_end = tmpchunk.size() - 2; tmpchunk[chunk_end] != ' ' && chunk_end > 0; chunk_end--); - fallback_tmpchunk = ""; - for (unsigned int i = 0; i <= chunk_end; i++) - { - fallback_tmpchunk = fallback_tmpchunk + tmpchunk[i]; + lookup_chunk_len++; } - lookup_pattern = weighted_patterns[rule_group_num].find(fallback_tmpchunk); - if (lookup_pattern == weighted_patterns[rule_group_num].end()) - { - cerr << "Pattern still not found" << endl; // di } - else - { - found = true; - } - } - if (found) + cerr << "Going to check: " << endl; // di + cerr << lookup_chunk << endl; + + // go through patterns //tc + lookup_pattern = weighted_patterns[rule_group_num].find(lookup_chunk); + if (lookup_pattern != weighted_patterns[rule_group_num].end()) { // let's check the weights for each rule in the group //tc chosen_weight = 0.; @@ -2628,6 +2618,10 @@ } } } + else // di + { // di + cerr << "Pattern not found" << endl; // di + } // di cerr << endl; // di // substitute lastrule with the chosen one //tc lastrule_num = rule_id_map[chosen_rule_id]; Index: branches/weighted-transfer/apertium/apertium/transfer.h =================================================================== --- branches/weighted-transfer/apertium/apertium/transfer.h (revision 72278) +++ branches/weighted-transfer/apertium/apertium/transfer.h (revision 72279) @@ -51,27 +51,43 @@ map, Ltstr> lists; map, Ltstr> listslow; - // all macros in xml in order of appearance in transfer file + /** + all macros in xml in order of their appearance in transfer file + */ vector macro_map; - // all rule actions in xml in order of appearance in transfer file + /** + all rule actions in xml in order of their appearance in transfer file + */ vector rule_map; - // rule number -> rule id, first meaningful rule at position 1 + /** + rule ids in order of their appearance in transfer file + first rule at position 1 + */ vector rule_ids; - // rule id : rule number + /** + rule id : rule number + */ map rule_id_map; - // rule group number : rule ids - vector > rule_groups; + /** + information about rule groups + each element of outer vector is pair of pattern length and vector of rule ids + */ + vector > > rule_groups; - // id : rule group number + /** + rule id : rule group number + */ map rule_group_map; - // all weighted patterns, grouped by rule group number - // index of outer vector corresponds to rule group numbers - // map is pattern string : vector of pairs + /** + all weighted patterns, grouped by rule group number + index of outer vector corresponds to rule group numbers + map is pattern string : vector of pairs of rule id and weight + */ vector > > > weighted_patterns; xmlDoc *doc; @@ -160,8 +176,8 @@ string copycase(string const &source_word, string const &target_word); /** - Apply subelements of 'out' subelement of rule action, one subelement - at a time, depending on subelement type. + Apply subelements of 'out' subelement of rule action, + one subelement at a time, depending on subelement type. */ void processOut(xmlNode *localroot); @@ -235,7 +251,7 @@ ~Transfer(); /** - Read all data needed for transfer + Read all data needed for transfer. */ void read(string const &transferfile, string const &datafile, string const &weightsfile = "", string const &fstfile = "");