Index: branches/weighted-transfer/apertium/apertium/transfer.cc =================================================================== --- branches/weighted-transfer/apertium/apertium/transfer.cc (revision 70271) +++ branches/weighted-transfer/apertium/apertium/transfer.cc (revision 70274) @@ -26,6 +26,7 @@ #include #include #include +#include using namespace Apertium; using namespace std; @@ -79,10 +80,10 @@ Transfer::readData(FILE *in) { // Read transfer rules data from .t*x.bin file - cerr << "readData" << endl; // di + //cerr << "readData" << endl; // di alphabet.read(in); - cerr << "Alphabet size: " << alphabet.size() << endl; // di + //cerr << "Alphabet size: " << alphabet.size() << endl; // di any_char = alphabet(TRXReader::ANY_CHAR); any_tag = alphabet(TRXReader::ANY_TAG); @@ -174,13 +175,13 @@ void Transfer::read(string const &transferfile, string const &datafile, - string const &fstfile) + string const &weightsfile, string const &fstfile) { // read and parse .t*x transfer file readTransfer(transferfile); // open precompiled .t*x.bin file and read data from it - cerr << "Reading data from " << datafile.c_str() << endl; + cerr << "Reading data from " << datafile.c_str() << endl << endl; FILE *in = fopen(datafile.c_str(), "rb"); if(!in) { @@ -190,7 +191,14 @@ readData(in); fclose(in); - // read data from fstfile if specified + // read data from transfer weights file if specified + if(weightsfile != "") + { + //cerr << "Reading weights from " << weightsfile << endl; // di + readTransferWeights(weightsfile); + } + + // read data from bilingual letter transducer file if specified if(fstfile != "") { cerr << "Reading fst data from " << fstfile << endl; // di @@ -250,6 +258,17 @@ } } } + + if (useWeights) // di + { // di + // double-check rule ids in rule_id_map and rule_ids // di + cerr << endl << "Those are the ids you wanted: " << endl; //di + for (int k = 1; k < rule_ids.size(); k++) // di + { // di + cerr << "rule_ids[" << k << "]: " << rule_ids[k] << endl; // 
di + cerr << "rule_id_map[" << rule_ids[k] << "]: " << rule_id_map[rule_ids[k]] << endl << endl; // di + } // di + } // di } void @@ -256,24 +275,36 @@ Transfer::collectRules(xmlNode *localroot) { // go through subelements of 'section-rules' + int rule_index = 0; + string rule_id = ""; + rule_ids.push_back(""); // fictive zero position element to not bother with i-1 thing + rule_id_map[""] = 0; // a uniformed answer to all empty string ids since we're not interested + for(xmlNode *i = localroot->children; i != NULL; i = i->next) { - if(i->type == XML_ELEMENT_NODE) - { - // normally looking at a 'rule' node now - //cerr << "Looking at " << i->name << endl; // di - for(xmlAttr *j = i->properties; j != NULL; j = j->next) // di - { // di - if(!xmlStrcmp(j->name, (const xmlChar *) "comment")) // di - { // di - cerr << "Collecting rule " << xmlNodeListGetString(i->doc, j->children, 1) << endl; // di - } // di - } // di - // di - // go through subelements of this 'rule' node + if(i->type == XML_ELEMENT_NODE && !xmlStrcmp(i->name, (const xmlChar *) "rule")) + { + // 'rule' element + rule_index++; + cerr << "Collecting rule # " << rule_index << endl; //di + + if (useWeights) // only need ids if weights are used + { + // get rule id and add it to rule_ids + rule_id = getRuleId(i); + if (rule_id != "") + { + rule_id_map[rule_id] = rule_index; + } + rule_ids.push_back(rule_id); + rule_id = ""; + cerr << endl; // di + } + + // go through subelements of current 'rule' element looking for some action for(xmlNode *j = i->children; ; j = j->next) { - // check if subelement is an 'action' node + // check if subelement is 'action' element if(j->type == XML_ELEMENT_NODE && !xmlStrcmp(j->name, (const xmlChar *) "action")) { // if so, add it at the end of the rule map @@ -300,6 +331,231 @@ } } +void +Transfer::readTransferWeights(string const &in) +{ + // Read transfer rule weights from a .w*x file. + // For every 'rule' inside a 'rule-group' element, record its id in + // rule_groups / rule_group_map and compile each of its 'pattern' + // elements into a regex stored with its weight in weighted_patterns. + // Exits if the file cannot be parsed or has no root element. 
+ int rule_group_index = 0; + double weight = 0.; + string lemma = "", tags = "*"; + string rule_id = ""; + string regex = ""; + vector current_rule_group; // to track all rules in one group + rule_group_map[""] = -1; // a uniform answer for all empty rule ids + vector > current_pattern_group; + weighted_patterns[""] = current_pattern_group; + + pcre *reCompiled; + const char *pcreErrorStr; + int pcreErrorOffset; + + cerr << "Reading transfer weights from " << in.c_str() << endl << endl; // di + // di + // parse into a local document instead of overwriting the 'doc' and + // 'root_element' members, which readTransfer() presumably still + // needs for the transfer rules file (TODO confirm) + xmlDoc *weights_doc = xmlReadFile(in.c_str(), NULL, 0); + if(weights_doc == NULL) + { + cerr << "Error: Could not parse file '" << in << "'." << endl; + exit(EXIT_FAILURE); + } + + xmlNode *weights_root = xmlDocGetRootElement(weights_doc); + if(weights_root == NULL) + { + cerr << "Error: File '" << in << "' has no root element." << endl; + xmlFreeDoc(weights_doc); + exit(EXIT_FAILURE); + } + + // search through root's children nodes for 'rule-group' elements + for(xmlNode *i = weights_root->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE && !xmlStrcmp(i->name, (const xmlChar *) "rule-group")) + { + cerr << "Collecting rule-group # " << rule_group_index << endl; // di + // get ids of all rules in rule group + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE && !xmlStrcmp(j->name, (const xmlChar *) "rule")) + { + // get id + rule_id = getRuleId(j); + current_rule_group.push_back(rule_id); + rule_group_map[rule_id] = rule_group_index; + cerr << endl; // di + + // get patterns + for(xmlNode *k = j->children; k != NULL; k = k->next) + { + if(k->type == XML_ELEMENT_NODE && !xmlStrcmp(k->name, (const xmlChar *) "pattern")) + { + weight = atof(getNodeAttr(k, "weight").c_str()); + cerr << weight << endl; + for(xmlNode *patit = k->children; patit != NULL; patit = patit->next) + { + if(patit->type == XML_ELEMENT_NODE && !xmlStrcmp(patit->name, (const xmlChar *) "pattern-item")) + { + lemma = getNodeAttr(patit, "lemma"); + if (lemma == "") + { + regex = regex + "[^<>]*?"; + } + else + { + // NOTE(review): lemma is appended unescaped, so regex metacharacters in it are interpreted by PCRE - confirm this is intended + regex = regex + lemma; + } + + tags = 
getNodeAttr(patit, "tags"); + unsigned int tags_len = tags.size(); + if (tags_len > 0 && tags != "*") + { + regex = regex + "<"; + } + char curr_char; + for(unsigned int t = 0; t < tags_len; t++) + { + curr_char = tags[t]; + switch (curr_char) + { + case '.': + regex = regex + "><"; + break; + + case '*': + regex = regex + ".*?"; + break; + + default: + regex = regex + curr_char; + } + } + if (tags_len > 0 && tags != "*") + { + regex = regex + ">"; + } + cerr << lemma << " " << tags << endl; + regex = regex + "\\S*? "; + cerr << regex << endl; + } + } + reCompiled = pcre_compile(regex.c_str(), 0, &pcreErrorStr, &pcreErrorOffset, NULL); + if(reCompiled == NULL) + { + // skip a pattern PCRE rejects instead of storing a NULL regex + cerr << "Warning: Could not compile pattern '" << regex << "': " << pcreErrorStr << " (offset " << pcreErrorOffset << ")." << endl; + } + else + { + current_pattern_group.push_back(make_pair(reCompiled, weight)); + } + regex = ""; + } + } + weighted_patterns[rule_id] = current_pattern_group; + current_pattern_group.clear(); + } + } + // push newly acquired current_rule_group into rule_groups + rule_groups.push_back(current_rule_group); + current_rule_group.clear(); + cerr << endl; // di + rule_group_index++; + } + } + + // ids, weights and compiled regexes were copied out of the tree above + xmlFreeDoc(weights_doc); + + // print out what was collected // di + cerr << "These are the rule groups you collected:" << endl; // di + unsigned int k1, k2; // di + for (k1 = 0; k1 < rule_groups.size(); k1++) // di + { // di + cerr << "rule_groups[" << k1 << "]:" << endl; // di + for (k2 = 0; k2 < rule_groups[k1].size(); k2++) // di + { // di + cerr << " " << rule_groups[k1][k2] << endl; // di + cerr << " rule_group_map[" << rule_groups[k1][k2] << "]: "; // di + cerr << rule_group_map[rule_groups[k1][k2]] << endl; // di + } // di + cerr << endl; // di + } // di + + cerr << "And these are the patterns:" << endl; // di + for (k1 = 1; k1 < rule_ids.size(); k1++) // di + { // di + if (rule_ids[k1] != "") // di + { // di + cerr << "Patterns for rule " << rule_ids[k1] << endl; // di + for (k2 = 0; k2 < weighted_patterns[rule_ids[k1]].size(); k2++) // di + { // di + cerr << " " << weighted_patterns[rule_ids[k1]][k2].first << " "; // di + cerr 
<< weighted_patterns[rule_ids[k1]][k2].second << endl; // di + } // di + } // di + } // di + +} + +string +Transfer::getRuleId(xmlNode *localroot) +{ + // Return the value of the 'id' attribute of localroot (normally a + // 'rule' element), or an empty string if there is none. + string rule_id = ""; + for(xmlAttr *j = localroot->properties; j != NULL; j = j->next) + { + if(!xmlStrcmp(j->name, (const xmlChar *) "comment")) // di + { // di + cerr << "Rule comment: " << getNodeAttr(localroot, "comment") << endl; // di + } // di + else //di + { //di + if(!xmlStrcmp(j->name, (const xmlChar *) "id")) + { + // xmlNodeListGetString returns an allocated copy, or NULL for an empty value + xmlChar *value = xmlNodeListGetString(localroot->doc, j->children, 1); + if(value != NULL) + { + rule_id = (const char *) value; + xmlFree(value); + } + cerr << "Rule id: " << rule_id << endl; // di + } + } // di + } // di + return rule_id; +} + +string +Transfer::getNodeAttr(xmlNode *localroot, const char* attr_name) +{ + // Return the value of attribute attr_name of localroot, + // or an empty string if the attribute is missing or empty. + string attr_val = ""; + for(xmlAttr *j = localroot->properties; j != NULL; j = j->next) + { + if(!xmlStrcmp(j->name, (const xmlChar *) attr_name)) + { + // xmlNodeListGetString returns an allocated copy, or NULL for an empty value + xmlChar *value = xmlNodeListGetString(localroot->doc, j->children, 1); + if(value != NULL) + { + attr_val = (const char *) value; + xmlFree(value); + } + } + } + return attr_val; +} + bool Transfer::checkIndex(xmlNode *element, int index, int limit) { @@ -340,27 +560,27 @@ string Transfer::evalString(xmlNode *element) { - // Contrary to its name, this function basically evaluates - // an xml element and executes appropriate instruction. + // This function evaluates an xml element + // and executes appropriate instruction. - // I believe it is used to evaluate lowest-level action elements, + // I believe it is used for lowest-level action elements, // such as 'clip' or 'lit-tag'. // If TransferInstr object corresponding to the element is already // in evalStringCache, execute that instruction, // if not, first add the instruction to evalStringCache, - // then call evalString again, and execute that instruction. + // then call evalString again and execute that instruction. // First, let's see what we've got. 
// di - if (element->type == XML_ELEMENT_NODE) // di - { // di - cerr << "Evaluating " << element->name << " "; // di - for(xmlAttr *prop = element->properties; prop != NULL; prop = prop->next) // di - { // di - cerr << prop->name << "='" << xmlNodeListGetString(element->doc, prop->children, 1) << "' "; // di - } // di - cerr << endl; // di - } // di + //if (element->type == XML_ELEMENT_NODE) // di + //{ // di + //cerr << "Evaluating " << element->name << " "; // di + //for(xmlAttr *prop = element->properties; prop != NULL; prop = prop->next) // di + //{ // di + //cerr << prop->name << "='" << xmlNodeListGetString(element->doc, prop->children, 1) << "' "; // di + //} // di + //cerr << endl; // di + //} // di map::iterator it; it = evalStringCache.find(element); @@ -465,8 +685,8 @@ // The following code is executed if TransferInstr object // corresponding to the element is not in evalStringCache yet. - // It parses lowest-level element, makes TransferInstr object out of it, - // and pushes it into evalStringCache. + // It parses lowest-level element, creates TransferInstr object + // out of it, and pushes it into evalStringCache. 
if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) { int pos = 0; @@ -688,7 +908,7 @@ { // apply 'out' subelement of a rule, one subelement at a time, // depending on subelement type - cerr << "Applying 'out' element" << endl; // di + //cerr << "Applying 'out' element" << endl; // di for(xmlNode *i = localroot->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) @@ -779,7 +999,7 @@ // apply 'chunk' subelement of 'out' element of a rule, // one subelement at a time, depending on subelement type - cerr << "Applying 'chunk' element" << endl; // di + //cerr << "Applying 'chunk' element" << endl; // di string name, namefrom; string caseofchunk = "aa"; string result; @@ -807,7 +1027,7 @@ // starting to build the chunk result.append("^"); - cerr << result << endl; // di + //cerr << result << endl; // di // adding chunk name if(caseofchunk != "") @@ -815,12 +1035,12 @@ if(name != "") { result.append(copycase(variables[caseofchunk], name)); - cerr << result << endl; // di + //cerr << result << endl; // di } else if(namefrom != "") { result.append(copycase(variables[caseofchunk], variables[namefrom])); - cerr << result << endl; // di + //cerr << result << endl; // di } else { @@ -833,12 +1053,12 @@ if(name != "") { result.append(name); - cerr << result << endl; // di + //cerr << result << endl; // di } else if(namefrom != "") { result.append(variables[namefrom]); - cerr << result << endl; // di + //cerr << result << endl; // di } else { @@ -860,7 +1080,7 @@ // add chunk tags result.append(processTags(i)); result.append("{"); - cerr << result << endl; // di + //cerr << result << endl; // di } else if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { @@ -873,7 +1093,7 @@ { if(j->type == XML_ELEMENT_NODE) { - cerr << "Executing " << j->name << endl; // di + //cerr << "Executing " << j->name << endl; // di myword.append(evalString(j)); evalStringClip(j, untouched, untouched_pos); // black magic @@ -926,17 +1146,17 @@ } if(myword != "") { - cerr << myword << 
endl; // di + //cerr << myword << endl; // di result.append("^"); result.append(myword); result.append("$"); - cerr << result << endl; // di + //cerr << result << endl; // di } } else // 'b' { result.append(evalString(i)); - cerr << result << endl; // di + //cerr << result << endl; // di } } } @@ -972,7 +1192,7 @@ string Transfer::processTags(xmlNode *localroot) { - cerr << "processTags" << endl; // di + //cerr << "processTags" << endl; // di string result; for(xmlNode *i = localroot->children; i != NULL; i = i->next) { @@ -997,7 +1217,7 @@ Transfer::processInstruction(xmlNode *localroot) { // process instruction specified in rule action based on its name - cerr << "Processing instruction '" << localroot->name << "'" << endl; // di + //cerr << "Processing instruction '" << localroot->name << "'" << endl; // di int words_to_consume = -1; if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) @@ -1454,8 +1674,7 @@ if(localroot->properties != NULL) { - if(!xmlStrcmp(localroot->properties->children->content, - (const xmlChar *) "yes")) + if(!xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) { set &myset = listslow[(const char *) idlist]; if(myset.find(tolower(sval)) != myset.end()) @@ -1845,7 +2064,7 @@ string Transfer::copycase(string const &source_word, string const &target_word) { - cerr << "copycase" << endl; // di + //cerr << "copycase" << endl; // di wstring result; wstring const s_word = UtfConverter::fromUtf8(source_word); wstring const t_word = UtfConverter::fromUtf8(target_word); @@ -1967,7 +2186,7 @@ TransferToken & Transfer::readToken(FILE *in) { - cerr << "readToken" << endl; // di + //cerr << "readToken" << endl; // di if(!input_buffer.isEmpty()) { return input_buffer.next(); @@ -2039,7 +2258,7 @@ bool Transfer::getNullFlush(void) { - cerr << "getNullFlush" << endl; // di + //cerr << "getNullFlush" << endl; // di return null_flush; } @@ -2089,7 +2308,7 @@ void Transfer::transfer(FILE *in, FILE *out) { - cerr << endl << 
"transfer starts" << endl << endl; // di + cerr << "Transfer starts here" << endl << endl; // di if(getNullFlush()) { @@ -2098,7 +2317,7 @@ int last = 0; int prev_last = 0; - int lastrule_id = -1; + lastrule_num = -1; set banned_rules; output = out; @@ -2107,14 +2326,15 @@ int counter = 0; // di while(true) { - cerr << endl << "Transfer iteration # " << counter << endl; // di + cerr << "Transfer iteration # " << counter << endl; // di cerr << "last: " << last << endl; // di cerr << "prev_last: " << prev_last << endl; // di - cerr << "lastrule_id: " << lastrule_id << endl; // di + cerr << "lastrule_num: " << lastrule_num << endl; // di cerr << "ms.size(): " << ms.size() << endl; // di + // Let's look at input_buffer contents // di - int initbuffpos = input_buffer.getPos(); // di - cerr << "input_buffer position: " << initbuffpos << endl << endl; // di + /*int initbuffpos = input_buffer.getPos(); // di + //cerr << "input_buffer position: " << initbuffpos << endl << endl; // di input_buffer.setPos(0); // di int currbuffpos, prevbuffpos = input_buffer.getPos(); // di TransferToken currbufftok, prevbufftok = input_buffer.next(); // di @@ -2122,7 +2342,7 @@ while (run) { // di currbuffpos = input_buffer.getPos(); // di currbufftok = input_buffer.next(); // di - cerr << "input_buffer.buf[" << prevbuffpos << "]: " << UtfConverter::toUtf8(prevbufftok.getContent()) << endl; // di + //cerr << "input_buffer.buf[" << prevbuffpos << "]: " << UtfConverter::toUtf8(prevbufftok.getContent()) << endl; // di if (currbuffpos == prevbuffpos) { // di run = false; // di } else { // di @@ -2138,7 +2358,7 @@ //for(set::iterator iter=banned_rules.begin(); iter != banned_rules.end(); iter++) { // di // cerr << *iter << ", "; // di //} // di - //cerr << endl; // di + //cerr << endl; // di*/ if(trace_att) { @@ -2172,11 +2392,12 @@ if (ms.size() == 0) { - cerr << "(ms.size() == 0)" << endl; // di + //cerr << "(ms.size() == 0)" << endl; // di if(lastrule != NULL) { - // this is the branch where a 
rule specified by lastrule_id is applied - cerr << "lastrule != NULL" << endl; // di + // this is the branch where a rule specified by lastrule_num is applied + + //cerr << "lastrule != NULL" << endl; // di int num_words_to_consume = applyRule(); if(trace_att) @@ -2218,12 +2439,12 @@ { cerr << "num_words_to_consume == 0" << endl; // di //Add rule to banned rules - banned_rules.insert(lastrule_id); + banned_rules.insert(lastrule_num); input_buffer.setPos(prev_last); input_buffer.next(); last = input_buffer.getPos(); } // thy words consumed - lastrule_id = -1; + lastrule_num = -1; } else // lastrule == NULL { @@ -2347,7 +2568,7 @@ last = input_buffer.getPos(); ms.init(me->getInitial()); } - } + } // lastrule == NULL ends here } // if(ms.size() == 0) ends here int val = ms.classifyFinals(me->getFinals(), banned_rules); @@ -2354,7 +2575,7 @@ if(val != -1) { lastrule = rule_map[val-1]; - lastrule_id = val; + lastrule_num = val; last = input_buffer.getPos(); if(trace) @@ -2404,6 +2625,7 @@ return; } counter++; + cerr << endl; } } // end of transfer @@ -2410,13 +2632,21 @@ int Transfer::applyRule() { - cerr << "applyRule" << endl; // di - cerr << "limit " << tmpword.size() << endl; // di + //cerr << "applyRule" << endl; // di + //cerr << "limit " << tmpword.size() << endl; // di //wcerr << UtfConverter::toUtf8(*tmpword[0]) << endl; // di - + int words_to_consume; unsigned int limit = tmpword.size(); + wstring wtmpchunk; + string tmpchunk; + + if (useWeights) + { + wtmpchunk = L""; + } + for(unsigned int i = 0; i != limit; i++) { cerr << "applyRule iteration # " << i << endl; // di @@ -2450,8 +2680,8 @@ cerr << "useBilingual && preBilingual == false" << endl; // di tr = fstp.biltransWithQueue(*tmpword[i], false); cerr << i << " "; - wcerr << tr.first << " "; - cerr << tr.second << endl; + wcerr << tr.first << " "; // di + cerr << tr.second << endl; // di } else if(preBilingual) { @@ -2460,6 +2690,7 @@ // then // sl = word_in_lang1 // tl = word_in_lang2 + cerr << 
"preBilingual" << endl; // di wstring sl; wstring tl; @@ -2506,7 +2737,11 @@ wcerr << tl << endl; // di //tmpword[i]->assign(sl); tr = pair(tl, false); + if (useWeights) + { + wtmpchunk = wtmpchunk + sl + L" "; } + } else { // here we don't need to split anything @@ -2514,13 +2749,98 @@ tr = pair(*tmpword[i], false); } - //wcerr << tr.first << endl; // di + //wcerr << L"tr.first: " << tr.first << endl; // di word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), UtfConverter::toUtf8(tr.first), tr.second); //cerr << i << " "; // di - //wcerr << UtfConverter::fromUtf8(word[i]) << endl; // di + //wcerr << L"word[" << i << L"]: " << UtfConverter::fromUtf8(tr.first) << endl; // di } + // check if we use weights + if (useWeights) + { + tmpchunk = UtfConverter::toUtf8(wtmpchunk); + cerr << "Got an lchunk: " << tmpchunk << endl << endl; // di + + int pcreExecRet; + int subStrVec[30]; + double chosen_weight = 0., current_weight = 0.; + string chosen_rule_id = rule_ids[lastrule_num]; + string current_rule_id; + unsigned int rule_group_num; + + // check if rule id is not empty + if (chosen_rule_id != "") + { + // check if there are other rules in its group; an id that is + // missing from the weights file belongs to no group (operator[] + // would silently insert it with group number 0) + if (rule_group_map.count(chosen_rule_id) != 0) + { + rule_group_num = rule_group_map[chosen_rule_id]; + } + else + { + rule_group_num = rule_groups.size(); + } + if (rule_group_num < rule_groups.size() && rule_groups[rule_group_num].size() > 1) + { + cerr << "Rule # " << lastrule_num << " is ambiguous" << endl; // di + cerr << "Rule id: " << chosen_rule_id << endl; // di + cerr << "Rules in the group: " << endl; // di + for (unsigned int ind = 0; ind < rule_groups[rule_group_num].size(); ind++) // di + { // di + cerr << " " << rule_groups[rule_group_num][ind] << endl; // di + } // di + cerr << endl; // di + + // let's check the weights for each rule in the group + chosen_weight = 0.; + for (unsigned int ind = 0; ind < rule_groups[rule_group_num].size(); ind++) + { + current_weight = 0.; + current_rule_id = rule_groups[rule_group_num][ind]; + + cerr << "Checking " << current_rule_id << endl; // di + // go through patterns + for (unsigned int k = 0; k < 
weighted_patterns[current_rule_id].size(); k++) + { + pcreExecRet = pcre_exec(weighted_patterns[current_rule_id][k].first, NULL, + tmpchunk.c_str(), tmpchunk.length(), + 0, 0, subStrVec, 30); + if(pcreExecRet >= 0) // bingo! + { + cerr << "Pattern matched " << weighted_patterns[current_rule_id][k].first; // di + current_weight = weighted_patterns[current_rule_id][k].second; + cerr << " with weight " << current_weight << endl; // di + if (current_weight > chosen_weight) // heavier rule + { + chosen_weight = current_weight; + chosen_rule_id = current_rule_id; + } + } + } + } + cerr << endl; // di + // substitute lastrule with the chosen one, unless its id is + // unknown to the transfer file (operator[] would then yield 0 + // and rule_map would be indexed at -1) + int chosen_rule_num = 0; + if (rule_id_map.count(chosen_rule_id) != 0) + { + chosen_rule_num = rule_id_map[chosen_rule_id]; + } + if (chosen_rule_num > 0 && (unsigned int) chosen_rule_num <= rule_map.size()) + { + lastrule_num = chosen_rule_num; + lastrule = rule_map[lastrule_num-1]; + } + cerr << "Chose rule # " << lastrule_num << " id: " << chosen_rule_id; + cerr << " with weight " << chosen_weight << endl; // di + } + } + } + words_to_consume = processRule(lastrule); // some cleanup ? @@ -2558,7 +2859,7 @@ // Here, the token contained in word_str is fed // to the fst by stepping with ms - cerr << "applyWord: applying to " << UtfConverter::toUtf8(word_str) << endl; // di + //cerr << "applyWord: applying to " << UtfConverter::toUtf8(word_str) << endl; // di ms.step(L'^'); for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) { @@ -2632,3 +2933,15 @@ { fstp.setCaseSensitiveMode(value); } + +void +Transfer::setUseWeights(bool value) +{ + useWeights = value; +} + +bool +Transfer::getUseWeights(void) const +{ + return useWeights; +} Index: branches/weighted-transfer/apertium/apertium/apertium_transfer.cc =================================================================== --- branches/weighted-transfer/apertium/apertium/apertium_transfer.cc (revision 70271) +++ branches/weighted-transfer/apertium/apertium/apertium_transfer.cc (revision 70274) @@ -46,6 +46,7 @@ wcerr << " biltrans bilingual letter transducer file" << endl; wcerr << " input input file, standard input by default" << endl; wcerr << " output output file, 
standard output by default" << endl; + wcerr << " -w tweights transfer rule weights file" << endl; wcerr << " -b input from lexical transfer" << endl; wcerr << " -n don't use bilingual dictionary" << endl; wcerr << " -x bindix extended mode with user dictionary" << endl; @@ -102,6 +103,7 @@ Transfer t; int option_index=0; + string weights = ""; while (true) { static struct option long_options[] = @@ -109,6 +111,7 @@ {"from-bilingual", no_argument, 0, 'b'}, {"no-bilingual", no_argument, 0, 'n'}, {"extended", required_argument, 0, 'x'}, + {"transfer-weights", required_argument, 0, 'w'}, {"case-sensitive", no_argument, 0, 'c'}, {"null-flush", no_argument, 0, 'z'}, {"trace", no_argument, 0, 't'}, @@ -117,7 +120,7 @@ {0, 0, 0, 0} }; - int c=getopt_long(argc, argv, "nbx:cztTh", long_options, &option_index); + int c=getopt_long(argc, argv, "nbx:w:cztTh", long_options, &option_index); if (c==-1) break; @@ -135,6 +138,12 @@ case 'x': t.setExtendedDictionary(optarg); break; + + case 'w': // transfer rule weights file is specified + weights = optarg; + testfile(weights); + t.setUseWeights(true); + break; case 'c': t.setCaseSensitiveness(true); @@ -170,7 +179,7 @@ testfile(argv[argc-3]); testfile(argv[argc-4]); testfile(argv[argc-5]); - t.read(argv[argc-5], argv[argc-4], argv[argc-3]); + t.read(argv[argc-5], argv[argc-4], weights, argv[argc-3]); break; case 5: @@ -180,7 +189,7 @@ input = open_input(argv[argc-2]); testfile(argv[argc-3]); testfile(argv[argc-4]); - t.read(argv[argc-4], argv[argc-3]); + t.read(argv[argc-4], argv[argc-3], weights); } else { @@ -188,7 +197,7 @@ testfile(argv[argc-2]); testfile(argv[argc-3]); testfile(argv[argc-4]); - t.read(argv[argc-4], argv[argc-3], argv[argc-2]); + t.read(argv[argc-4], argv[argc-3], weights, argv[argc-2]); } break; @@ -198,7 +207,7 @@ input = open_input(argv[argc-1]); testfile(argv[argc-2]); testfile(argv[argc-3]); - t.read(argv[argc-3], argv[argc-2]); + t.read(argv[argc-3], argv[argc-2], weights); } else { @@ -205,7 +214,7 @@ 
testfile(argv[argc-1]); testfile(argv[argc-2]); testfile(argv[argc-3]); - t.read(argv[argc-3], argv[argc-2], argv[argc-1]); + t.read(argv[argc-3], argv[argc-2], weights, argv[argc-1]); } break; case 3: @@ -213,7 +222,7 @@ { testfile(argv[argc-1]); testfile(argv[argc-2]); - t.read(argv[argc-2], argv[argc-1]); + t.read(argv[argc-2], argv[argc-1], weights); } else { Index: branches/weighted-transfer/apertium/apertium/transfer.h =================================================================== --- branches/weighted-transfer/apertium/apertium/transfer.h (revision 70271) +++ branches/weighted-transfer/apertium/apertium/transfer.h (revision 70274) @@ -34,6 +34,7 @@ #include #include #include +#include using namespace std; @@ -51,6 +52,11 @@ map, Ltstr> listslow; vector macro_map; vector rule_map; + vector rule_ids; // rule number -> rule id, first meaningful rule at position 1 + map rule_id_map; // rule id -> rule number + vector > rule_groups; // rule group number -> rule ids + map rule_group_map; // id -> rule group number + map > > weighted_patterns; // all weighted patterns, grouped by rule id xmlDoc *doc; xmlNode *root_element; TransferWord **word; @@ -68,6 +74,7 @@ int any_tag; xmlNode *lastrule; + int lastrule_num; unsigned int nwords; map evalStringCache; @@ -77,6 +84,7 @@ OutputType defaultAttrs; bool preBilingual; bool useBilingual; + bool useWeights; bool null_flush; bool internal_null_flush; bool trace; @@ -87,8 +95,11 @@ void readData(FILE *input); void readBil(string const &filename); void readTransfer(string const &input); + void readTransferWeights(string const &in); // read transfer weights file void collectMacros(xmlNode *localroot); void collectRules(xmlNode *localroot); + string getRuleId(xmlNode *localroot); // get value of 'id' property of 'rule' element + string getNodeAttr(xmlNode *localroot, const char* attr_name); string caseOf(string const &str); string copycase(string const &source_word, string const &target_word); @@ -134,8 +145,8 @@ 
Transfer(); ~Transfer(); - void read(string const &transferfile, string const &datafile, - string const &fstfile = ""); + void read(string const &transferfile, string const &datafile, + string const &weightsfile = "", string const &fstfile = ""); void transfer(FILE *in, FILE *out); void setUseBilingual(bool value); bool getUseBilingual(void) const; @@ -147,6 +158,8 @@ void setNullFlush(bool null_flush); void setTrace(bool trace); void setTraceATT(bool trace); + void setUseWeights(bool weighted); + bool getUseWeights(void) const; }; #endif Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-eng/apertium-eng.eng.dix =================================================================== --- branches/weighted-transfer/apertium-toy-ru-en/apertium-eng/apertium-eng.eng.dix (revision 70271) +++ branches/weighted-transfer/apertium-toy-ru-en/apertium-eng/apertium-eng.eng.dix (revision 70274) @@ -136,7 +136,7 @@ boot chest lock - stock + sock pair day of Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-eng/eng-rus.automorf.bin =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-eng/rus-eng.autogen.bin =================================================================== Cannot display: file marked as a binary type. 
svn:mime-type = application/octet-stream Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.t1x =================================================================== --- branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.t1x (revision 70271) +++ branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.t1x (revision 70274) @@ -10,6 +10,8 @@ + @@ -120,33 +122,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -169,11 +144,11 @@ - + @@ -251,7 +226,7 @@ - + @@ -259,60 +234,31 @@ - + - - + + + - - - + + - - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.w1x =================================================================== --- branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.w1x (revision 70271) +++ branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.w1x (revision 70274) @@ -1,7 +1,8 @@ + - + @@ -15,19 +16,25 @@ - - - + + + - - - - - + + + + + + + + + + + - + @@ -40,6 +47,8 @@ + + @@ -59,18 +68,6 @@ - - - - - - - - - - - - @@ -83,4 +80,5 @@ + Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/rus-eng.t1x.bin =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: branches/weighted-transfer/apertium-toy-ru-en/process.sh =================================================================== --- branches/weighted-transfer/apertium-toy-ru-en/process.sh (revision 70271) +++ branches/weighted-transfer/apertium-toy-ru-en/process.sh (revision 70274) @@ -5,7 +5,7 @@ lt-proc apertium-rus/rus-eng.automorf.bin | gawk 'BEGIN{RS="$"; FS="/";}{nf=split($1,COMPONENTS,"^"); for(i = 1; i