Index: branches/weighted-transfer/apertium/apertium/transfer.cc
===================================================================
--- branches/weighted-transfer/apertium/apertium/transfer.cc	(revision 70271)
+++ branches/weighted-transfer/apertium/apertium/transfer.cc	(revision 70274)
@@ -26,6 +26,7 @@
 #include <iostream>
 #include <stack>
 #include <cerrno>
+#include <cstdlib>
 
 using namespace Apertium;
 using namespace std;
@@ -79,10 +80,10 @@
 Transfer::readData(FILE *in)
 {
   // Read transfer rules data from .t*x.bin file
-  cerr << "readData" << endl; // di
+  //cerr << "readData" << endl; // di
 
   alphabet.read(in);
-  cerr << "Alphabet size: " << alphabet.size() << endl; // di
+  //cerr << "Alphabet size: " << alphabet.size() << endl; // di
 
   any_char = alphabet(TRXReader::ANY_CHAR);
   any_tag = alphabet(TRXReader::ANY_TAG);
@@ -174,13 +175,13 @@
 
 void
 Transfer::read(string const &transferfile, string const &datafile,
-	       string const &fstfile)
+               string const &weightsfile, string const &fstfile)
 { 
   // read and parse .t*x transfer file
   readTransfer(transferfile);
 
   // open precompiled .t*x.bin file and read data from it
-  cerr << "Reading data from " << datafile.c_str() << endl;
+  cerr << "Reading data from " << datafile.c_str() << endl << endl;
   FILE *in = fopen(datafile.c_str(), "rb");
   if(!in)
   {
@@ -190,7 +191,14 @@
   readData(in);
   fclose(in);
 
-  // read data from fstfile if specified
+  // read data from transfer weights file if specified
+  if(weightsfile != "")
+  {
+    //cerr << "Reading weights from " << weightsfile << endl; // di
+    readTransferWeights(weightsfile);
+  }
+
+  // read data from bilingual letter transducer file if specified
   if(fstfile != "")
   {
     cerr << "Reading fst data from " << fstfile << endl; // di
@@ -250,6 +258,17 @@
       }
     }
   }
+
+  if (useWeights) // di
+  { // di
+    // double-check rule ids in rule_id_map and rule_ids // di
+    cerr << endl << "Those are the ids you wanted: " << endl; //di
+    for (int k = 1; k < rule_ids.size(); k++) // di
+    { // di
+      cerr << "rule_ids[" << k << "]: " << rule_ids[k] << endl; // di
+      cerr << "rule_id_map[" << rule_ids[k] << "]: " << rule_id_map[rule_ids[k]] << endl << endl; // di
+    } // di
+  } // di
 }
 
 void
@@ -256,24 +275,36 @@
 Transfer::collectRules(xmlNode *localroot)
 { 
   // go through subelements of 'section-rules'
+  int rule_index = 0;
+  string rule_id = "";
+  rule_ids.push_back(""); // fictive zero position element to not bother with i-1 thing
+  rule_id_map[""] = 0; // a uniformed answer to all empty string ids since we're not interested
+
   for(xmlNode *i = localroot->children; i != NULL; i = i->next)
   {
-    if(i->type == XML_ELEMENT_NODE)
-    { 
-      // normally looking at a 'rule' node now
-      //cerr << "Looking at " << i->name << endl; // di
-      for(xmlAttr *j = i->properties; j != NULL; j = j->next) // di
-      { // di
-        if(!xmlStrcmp(j->name, (const xmlChar *) "comment")) // di
-        { // di
-          cerr << "Collecting rule " << xmlNodeListGetString(i->doc, j->children, 1) << endl;   // di           
-        } // di
-      } // di
-      // di
-      // go through subelements of this 'rule' node
+    if(i->type == XML_ELEMENT_NODE && !xmlStrcmp(i->name, (const xmlChar *) "rule"))
+    {
+      // 'rule' element
+      rule_index++;
+      cerr << "Collecting rule # " << rule_index << endl; //di
+
+      if (useWeights) // only need ids if weights are used
+      {
+        // get rule id and add it to rule_ids
+        rule_id = getRuleId(i);
+        if (rule_id != "")
+        {
+          rule_id_map[rule_id] = rule_index;
+        }
+        rule_ids.push_back(rule_id);
+        rule_id = "";
+        cerr << endl; // di
+      }
+
+      // go through subelements of current 'rule' element looking for some action
       for(xmlNode *j = i->children; ; j = j->next)
       {
-        // check if subelement is an 'action' node
+        // check if subelement is 'action' element
         if(j->type == XML_ELEMENT_NODE && !xmlStrcmp(j->name, (const xmlChar *) "action"))
         {
           // if so, add it at the end of the rule map
@@ -300,6 +331,195 @@
   }
 }
 
+void
+Transfer::readTransferWeights(string const &in)
+{
+  // Read rule groups and their weighted lemma/tags patterns from a .w*x file; exits if it cannot be parsed.
+  int rule_group_index = 0;
+  double weight = 0.;
+  string lemma = "", tags = "*";
+  string rule_id = "";
+  string regex = "";
+  vector<string> current_rule_group; // to track all rules in one group
+  rule_group_map[""] = -1; // a uniform answer for all empty rule_group ids
+  vector<pair<pcre*, double > > current_pattern_group;
+  weighted_patterns[""] = current_pattern_group;
+
+  pcre *reCompiled;
+  xmlDoc *weights_doc;
+  const char *pcreErrorStr;
+  int pcreErrorOffset;
+
+  cerr << "Reading transfer weights from " << in.c_str() << endl << endl; // di
+  // parse into a local document so that the 'doc' and 'root_element' members are left untouched
+  weights_doc = xmlReadFile(in.c_str(), NULL, 0);
+  if(weights_doc == NULL)
+  {
+    cerr << "Error: Could not parse file '" << in << "'." << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  xmlNode *weights_root = xmlDocGetRootElement(weights_doc);
+  // xmlDocGetRootElement() may return NULL; in that case there is nothing to collect
+
+  // search through root's children nodes for 'rule-group' elements
+  for(xmlNode *i = (weights_root != NULL) ? weights_root->children : NULL; i != NULL; i = i->next)
+  {
+    if(i->type == XML_ELEMENT_NODE && !xmlStrcmp(i->name, (const xmlChar *) "rule-group"))
+    {
+      cerr << "Collecting rule-group # " << rule_group_index << endl; // di
+      // get ids of all rules in rule group
+      for(xmlNode *j = i->children; j != NULL; j = j->next)
+      {
+        if(j->type == XML_ELEMENT_NODE && !xmlStrcmp(j->name, (const xmlChar *) "rule"))
+        {
+          // get id
+          rule_id = getRuleId(j);
+          current_rule_group.push_back(rule_id);
+          rule_group_map[rule_id] = rule_group_index;
+          cerr << endl; // di
+
+          // get patterns
+          for(xmlNode *k = j->children; k != NULL; k = k->next)
+          {
+            if(k->type == XML_ELEMENT_NODE && !xmlStrcmp(k->name, (const xmlChar *) "pattern"))
+            {
+               weight = atof(getNodeAttr(k, "weight").c_str());
+               cerr << weight << endl;
+               for(xmlNode *patit = k->children; patit != NULL; patit = patit->next)
+               {
+                 if(patit->type == XML_ELEMENT_NODE && !xmlStrcmp(patit->name, (const xmlChar *) "pattern-item"))
+                 {
+                   lemma = getNodeAttr(patit, "lemma");
+                   if (lemma == "")
+                   {
+                     regex = regex + "[^<>]*?";
+                   }
+                   else
+                   {
+                     regex = regex + lemma;
+                   }
+
+                   tags = getNodeAttr(patit, "tags");
+                   unsigned int tags_len = tags.size();
+                   if (tags_len > 0 && tags != "*")
+                   {
+                     regex = regex + "<";
+                   } 
+                   // translate the tags spec: '.' separates tags ("><"), '*' matches anything (".*?")
+                   for(unsigned int pos = 0; pos < tags_len; pos++)
+                   {
+                     const char curr_char = tags[pos];
+                     switch (curr_char)
+                     {
+                       case '.':
+                         regex = regex + "><";
+                         break;
+
+                       case '*':
+                         regex = regex + ".*?";
+                         break;
+
+                       default:
+                         regex = regex + curr_char;
+                     }
+                   }
+                   if (tags_len > 0 && tags != "*")
+                   {
+                     regex = regex + ">";
+                   } 
+                   cerr << lemma << " " << tags << endl;
+                   regex = regex + "\\S*? ";
+                   cerr << regex << endl;
+                 }
+               }
+               reCompiled = pcre_compile(regex.c_str(), 0, &pcreErrorStr, &pcreErrorOffset, NULL);
+               if (reCompiled == NULL) cerr << "Warning: skipping pattern '" << regex << "' in '" << in << "': " << pcreErrorStr << " (offset " << pcreErrorOffset << ")" << endl;
+               else current_pattern_group.push_back(make_pair(reCompiled, weight)); // only usable patterns are kept
+               regex = "";
+            }
+          }
+          weighted_patterns[rule_id] = current_pattern_group;
+          current_pattern_group.clear();
+        }
+      }
+      // push newly acquired current_rule_group into rule_groups
+      rule_groups.push_back(current_rule_group);
+      current_rule_group.clear();
+      cerr << endl; // di
+      rule_group_index++;
+    }
+  }
+
+  // print out what was collected // di
+  cerr << "These are the rule groups you collected:" << endl; // di
+  unsigned int k1, k2; // di
+  for (k1 = 0; k1 < rule_groups.size(); k1++) // di
+  { // di
+    cerr << "rule_groups[" << k1 << "]:" << endl; // di
+    for (k2 = 0; k2 < rule_groups[k1].size(); k2++) // di
+    { // di
+      cerr << "  " << rule_groups[k1][k2] << endl; // di
+      cerr << "    rule_group_map[" << rule_groups[k1][k2] << "]: "; // di
+      cerr << rule_group_map[rule_groups[k1][k2]] << endl; // di
+    } // di
+    cerr << endl; // di
+  } // di
+
+  cerr << "And these are the patterns:" << endl; // di
+  for (k1 = 1; k1 < rule_ids.size(); k1++) // di
+  { // di
+    if (rule_ids[k1] != "") // di
+    { // di
+      cerr << "Patterns for rule " << rule_ids[k1] << endl; // di
+      for (k2 = 0; k2 < weighted_patterns[rule_ids[k1]].size(); k2++) // di
+      { // di
+        cerr << "  " << weighted_patterns[rule_ids[k1]][k2].first << " "; // di
+        cerr << weighted_patterns[rule_ids[k1]][k2].second << endl; // di
+      } // di
+    } // di
+  } // di
+  xmlFreeDoc(weights_doc); // safe: only string copies and compiled patterns were taken from the document
+}
+
+string
+Transfer::getRuleId(xmlNode *localroot)
+{
+  // Return the 'id' attribute of a 'rule' element ("" when absent or empty); logs 'comment' and 'id' values.
+  string rule_id = "";
+  for(xmlAttr *j = localroot->properties; j != NULL; j = j->next)
+  {
+    const bool is_comment = !xmlStrcmp(j->name, (const xmlChar *) "comment"); // di
+    if(!is_comment && xmlStrcmp(j->name, (const xmlChar *) "id") != 0)
+    {
+      continue; // neither 'comment' nor 'id': nothing to read
+    }
+    xmlChar *raw = xmlNodeListGetString(localroot->doc, j->children, 1); // NULL when the value is empty
+    const string value = (raw != NULL) ? (const char *) raw : "";
+    if(raw != NULL) xmlFree(raw); // the buffer is allocated by libxml2 and owned by the caller
+    cerr << (is_comment ? "Rule comment: " : "Rule id: ") << value << endl; // di
+    if(!is_comment)
+    {
+      rule_id = value;
+    }
+  }
+  return rule_id;
+}
+
+string
+Transfer::getNodeAttr(xmlNode *localroot, const char* attr_name)
+{ // Return the value of attribute attr_name of localroot, or "" if it is absent or empty.
+  string attr_val = "";
+  for(xmlAttr *j = localroot->properties; j != NULL; j = j->next)
+  {
+    if(xmlStrcmp(j->name, (const xmlChar *) attr_name) != 0) continue;
+    xmlChar *raw = xmlNodeListGetString(localroot->doc, j->children, 1); // NULL when the value is empty
+    attr_val = (raw != NULL) ? (const char *) raw : "";
+    if(raw != NULL) xmlFree(raw); // the buffer is allocated by libxml2 and owned by the caller
+  }
+  return attr_val;
+}
+
 bool
 Transfer::checkIndex(xmlNode *element, int index, int limit)
 { 
@@ -340,27 +560,27 @@
 string
 Transfer::evalString(xmlNode *element)
 {
-  // Contrary to its name, this function basically evaluates
-  // an xml element and executes appropriate instruction.
+  // This function evaluates an xml element
+  // and executes appropriate instruction.
 
-  // I believe it is used to evaluate lowest-level action elements, 
+  // I believe it is used for lowest-level action elements, 
   // such as 'clip' or 'lit-tag'.
 
   // If TransferInstr object corresponding to the element is already
   // in evalStringCache, execute that instruction,
   // if not, first add the instruction to evalStringCache,
-  // then call evalString again, and execute that instruction.
+  // then call evalString again and execute that instruction.
   
   // First, let's see what we've got. // di
-  if (element->type == XML_ELEMENT_NODE)  // di
-  {  // di
-    cerr << "Evaluating " << element->name << " "; // di
-    for(xmlAttr *prop = element->properties; prop != NULL; prop = prop->next) // di
-    {  // di
-      cerr << prop->name << "='" << xmlNodeListGetString(element->doc, prop->children, 1) << "' "; // di
-    } // di
-    cerr << endl; // di
-  } // di
+  //if (element->type == XML_ELEMENT_NODE)  // di
+  //{  // di
+    //cerr << "Evaluating " << element->name << " "; // di
+    //for(xmlAttr *prop = element->properties; prop != NULL; prop = prop->next) // di
+    //{  // di
+      //cerr << prop->name << "='" << xmlNodeListGetString(element->doc, prop->children, 1) << "' "; // di
+    //} // di
+    //cerr << endl; // di
+  //} // di
 
   map<xmlNode *, TransferInstr>::iterator it;
   it = evalStringCache.find(element); 
@@ -465,8 +685,8 @@
 
   // The following code is executed if TransferInstr object 
   // corresponding to the element is not in evalStringCache yet.
-  // It parses lowest-level element, makes TransferInstr object out of it,
-  // and pushes it into evalStringCache.
+  // It parses lowest-level element, creates TransferInstr object
+  // out of it, and pushes it into evalStringCache.
   if(!xmlStrcmp(element->name, (const xmlChar *) "clip"))
   {
     int pos = 0;
@@ -688,7 +908,7 @@
 { 
   // apply 'out' subelement of a rule, one subelement at a time,
   // depending on subelement type
-  cerr << "Applying 'out' element" << endl; // di
+  //cerr << "Applying 'out' element" << endl; // di
   for(xmlNode *i = localroot->children; i != NULL; i = i->next)
   {
     if(i->type == XML_ELEMENT_NODE)
@@ -779,7 +999,7 @@
   // apply 'chunk' subelement of 'out' element of a rule,
   // one subelement at a time, depending on subelement type
 
-  cerr << "Applying 'chunk' element" << endl; // di
+  //cerr << "Applying 'chunk' element" << endl; // di
   string name, namefrom;
   string caseofchunk = "aa";
   string result;
@@ -807,7 +1027,7 @@
 
   // starting to build the chunk
   result.append("^");
-  cerr << result << endl; // di
+  //cerr << result << endl; // di
 
   // adding chunk name
   if(caseofchunk != "")
@@ -815,12 +1035,12 @@
     if(name != "")
     {
       result.append(copycase(variables[caseofchunk], name));
-      cerr << result << endl; // di
+      //cerr << result << endl; // di
     }
     else if(namefrom != "")
     {
       result.append(copycase(variables[caseofchunk], variables[namefrom]));
-      cerr << result << endl; // di
+      //cerr << result << endl; // di
     }
     else
     {
@@ -833,12 +1053,12 @@
     if(name != "")
     {
       result.append(name);
-      cerr << result << endl; // di
+      //cerr << result << endl; // di
     }
     else if(namefrom != "")
     {
       result.append(variables[namefrom]);
-      cerr << result << endl; // di
+      //cerr << result << endl; // di
     }
     else
     {
@@ -860,7 +1080,7 @@
         // add chunk tags
         result.append(processTags(i));
         result.append("{");
-        cerr << result << endl; // di
+        //cerr << result << endl; // di
       }
       else if(!xmlStrcmp(i->name, (const xmlChar *) "lu"))
       {
@@ -873,7 +1093,7 @@
         {
           if(j->type == XML_ELEMENT_NODE)
           {
-            cerr << "Executing " << j->name << endl; // di
+            //cerr << "Executing " << j->name << endl; // di
             myword.append(evalString(j));
 
             evalStringClip(j, untouched, untouched_pos); // black magic
@@ -926,17 +1146,17 @@
         }
         if(myword != "")
         {
-          cerr << myword << endl; // di
+          //cerr << myword << endl; // di
           result.append("^");
           result.append(myword);
           result.append("$");
-          cerr << result << endl; // di
+          //cerr << result << endl; // di
         }
       }
       else // 'b'
       {
         result.append(evalString(i));
-        cerr << result << endl; // di
+        //cerr << result << endl; // di
       }
     }
   }
@@ -972,7 +1192,7 @@
 string
 Transfer::processTags(xmlNode *localroot)
 { 
-  cerr << "processTags" << endl; // di
+  //cerr << "processTags" << endl; // di
   string result;
   for(xmlNode *i = localroot->children; i != NULL; i = i->next)
   {
@@ -997,7 +1217,7 @@
 Transfer::processInstruction(xmlNode *localroot)
 { 
   // process instruction specified in rule action based on its name
-  cerr << "Processing instruction '" << localroot->name << "'" << endl; // di
+  //cerr << "Processing instruction '" << localroot->name << "'" << endl; // di
 
   int words_to_consume = -1;
   if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose"))
@@ -1454,8 +1674,7 @@
 
   if(localroot->properties != NULL)
   {
-    if(!xmlStrcmp(localroot->properties->children->content,
-		  (const xmlChar *) "yes"))
+    if(!xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes"))
     {
       set<string, Ltstr> &myset = listslow[(const char *) idlist];
       if(myset.find(tolower(sval)) != myset.end())
@@ -1845,7 +2064,7 @@
 string
 Transfer::copycase(string const &source_word, string const &target_word)
 { 
-  cerr << "copycase" << endl; // di
+  //cerr << "copycase" << endl; // di
   wstring result;
   wstring const s_word = UtfConverter::fromUtf8(source_word);
   wstring const t_word = UtfConverter::fromUtf8(target_word);
@@ -1967,7 +2186,7 @@
 TransferToken &
 Transfer::readToken(FILE *in)
 { 
-  cerr << "readToken" << endl; // di
+  //cerr << "readToken" << endl; // di
   if(!input_buffer.isEmpty())
   {
     return input_buffer.next();
@@ -2039,7 +2258,7 @@
 bool
 Transfer::getNullFlush(void)
 { 
-  cerr << "getNullFlush" << endl; // di
+  //cerr << "getNullFlush" << endl; // di
   return null_flush;
 }
 
@@ -2089,7 +2308,7 @@
 void
 Transfer::transfer(FILE *in, FILE *out)
 { 
-  cerr << endl << "transfer starts" << endl << endl; // di
+  cerr << "Transfer starts here" << endl << endl; // di
 
   if(getNullFlush())
   {
@@ -2098,7 +2317,7 @@
 
   int last = 0;
   int prev_last = 0;
-  int lastrule_id = -1;
+  lastrule_num = -1;
   set<int> banned_rules;
 
   output = out;
@@ -2107,14 +2326,15 @@
   int counter = 0; // di
   while(true)
   { 
-    cerr << endl << "Transfer iteration # " << counter << endl; // di
+    cerr << "Transfer iteration # " << counter << endl; // di
     cerr << "last: " << last << endl; // di
     cerr << "prev_last: " << prev_last << endl; // di
-    cerr << "lastrule_id: " << lastrule_id << endl; // di
+    cerr << "lastrule_num: " << lastrule_num << endl; // di
     cerr << "ms.size(): " << ms.size() << endl; // di
+
     // Let's look at input_buffer contents // di
-    int initbuffpos = input_buffer.getPos(); // di
-    cerr << "input_buffer position: " << initbuffpos << endl << endl; // di
+    /*int initbuffpos = input_buffer.getPos(); // di
+    //cerr << "input_buffer position: " << initbuffpos << endl << endl; // di
     input_buffer.setPos(0); // di
     int currbuffpos, prevbuffpos = input_buffer.getPos(); // di
     TransferToken currbufftok, prevbufftok = input_buffer.next(); // di
@@ -2122,7 +2342,7 @@
     while (run) { // di
       currbuffpos = input_buffer.getPos(); // di
       currbufftok = input_buffer.next(); // di
-      cerr << "input_buffer.buf[" << prevbuffpos << "]: " << UtfConverter::toUtf8(prevbufftok.getContent()) << endl; // di
+      //cerr << "input_buffer.buf[" << prevbuffpos << "]: " << UtfConverter::toUtf8(prevbufftok.getContent()) << endl; // di
       if (currbuffpos == prevbuffpos) { // di
         run = false; // di
       } else { // di
@@ -2138,7 +2358,7 @@
     //for(set<int>::iterator iter=banned_rules.begin(); iter != banned_rules.end(); iter++) { // di
     //    cerr << *iter << ", "; // di
     //} // di
-    //cerr << endl; // di
+    //cerr << endl; // di*/
 
     if(trace_att)
     {
@@ -2172,11 +2392,12 @@
 
     if (ms.size() == 0)
     { 
-      cerr << "(ms.size() == 0)" << endl; // di
+      //cerr << "(ms.size() == 0)" << endl; // di
       if(lastrule != NULL)
       {
-        // this is the branch where a rule specified by lastrule_id is applied
-        cerr << "lastrule != NULL" << endl; // di
+        // this is the branch where a rule specified by lastrule_num is applied
+
+        //cerr << "lastrule != NULL" << endl; // di
         int num_words_to_consume = applyRule();
 
         if(trace_att)
@@ -2218,12 +2439,12 @@
         {
           cerr << "num_words_to_consume == 0" << endl; // di
           //Add rule to banned rules
-          banned_rules.insert(lastrule_id);
+          banned_rules.insert(lastrule_num);
           input_buffer.setPos(prev_last);
           input_buffer.next();
           last = input_buffer.getPos();
         } // thy words consumed
-        lastrule_id = -1;
+        lastrule_num = -1;
       }
       else // lastrule == NULL
       {
@@ -2347,7 +2568,7 @@
           last = input_buffer.getPos();
           ms.init(me->getInitial());
 	}
-      }
+      } // lastrule == NULL ends here
     } // if(ms.size() == 0) ends here
 
     int val = ms.classifyFinals(me->getFinals(), banned_rules);
@@ -2354,7 +2575,7 @@
     if(val != -1)
     {
       lastrule = rule_map[val-1];
-      lastrule_id = val;
+      lastrule_num = val;
       last = input_buffer.getPos();
 
       if(trace)
@@ -2404,6 +2625,7 @@
 	return;
     }
     counter++;
+    cerr << endl;
   }
 } // end of transfer
 
@@ -2410,13 +2632,21 @@
 int
 Transfer::applyRule()
 { 
-  cerr << "applyRule" << endl; // di
-  cerr << "limit " << tmpword.size() << endl; // di
+  //cerr << "applyRule" << endl; // di
+  //cerr << "limit " << tmpword.size() << endl; // di
   //wcerr << UtfConverter::toUtf8(*tmpword[0]) << endl; // di
-  
+
   int words_to_consume;
   unsigned int limit = tmpword.size();
 
+  wstring wtmpchunk;
+  string tmpchunk;
+
+  if (useWeights)
+  {
+    wtmpchunk = L"";
+  }
+
   for(unsigned int i = 0; i != limit; i++)
   { 
     cerr << "applyRule iteration # " << i << endl; // di
@@ -2450,8 +2680,8 @@
       cerr << "useBilingual && preBilingual == false" << endl; // di
       tr = fstp.biltransWithQueue(*tmpword[i], false);
       cerr << i << " ";
-      wcerr << tr.first << " ";
-      cerr << tr.second << endl;
+      wcerr << tr.first << " "; // di
+      cerr << tr.second << endl; // di
     }
     else if(preBilingual)
     { 
@@ -2460,6 +2690,7 @@
       // then
       // sl = word_in_lang1<its><tags>
       // tl = word_in_lang2<its><tags>
+
       cerr << "preBilingual" << endl; // di
       wstring sl;
       wstring tl;
@@ -2506,7 +2737,11 @@
       wcerr << tl << endl; // di
       //tmpword[i]->assign(sl);
       tr = pair<wstring, int>(tl, false);
+      if (useWeights)
+      {
+        wtmpchunk = wtmpchunk + sl + L" ";
     }
+    }
     else
     { 
       // here we don't need to split anything
@@ -2514,13 +2749,79 @@
       tr = pair<wstring, int>(*tmpword[i], false);
     }
 
-    //wcerr << tr.first << endl; // di
+    //wcerr << L"tr.first: " << tr.first << endl; // di
     word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]),
 			       UtfConverter::toUtf8(tr.first), tr.second);
     //cerr << i << " "; // di
-    //wcerr << UtfConverter::fromUtf8(word[i]) << endl; // di
+    //wcerr << L"word[" << i << L"]: " << UtfConverter::fromUtf8(tr.first) << endl; // di
   }
 
+  // when weights are used, the heaviest matching rule of the matched rule's group replaces lastrule
+  if (useWeights)
+  {
+    tmpchunk = UtfConverter::toUtf8(wtmpchunk);
+    cerr << "Got an lchunk: " << tmpchunk << endl << endl; // di
+
+    int pcreExecRet;
+    int subStrVec[30];
+    double chosen_weight = 0., current_weight = 0.;
+    string chosen_rule_id = rule_ids[lastrule_num];
+    string current_rule_id;
+    map<string, int>::const_iterator group_it = rule_group_map.find(chosen_rule_id);
+
+    // a rule with no id, or with an id the weights file never mentions, belongs to no group
+    if (chosen_rule_id != "" && group_it != rule_group_map.end())
+    {
+      // check if there are other rules in its group
+      const unsigned int rule_group_num = group_it->second;
+      if (rule_groups[rule_group_num].size() > 1)
+      {
+        cerr << "Rule # " << lastrule_num << " is ambiguous" << endl; // di
+        cerr << "Rule id: " << chosen_rule_id << endl; // di
+        cerr << "Rules in the group: " << endl; // di
+        for (unsigned int ind = 0; ind < rule_groups[rule_group_num].size(); ind++) // di
+        {  // di
+          cerr << "  " << rule_groups[rule_group_num][ind] << endl; // di
+        } // di
+        cerr << endl; // di
+
+        // let's check the weights for each rule in the group
+        chosen_weight = 0.;
+        for (unsigned int ind = 0; ind < rule_groups[rule_group_num].size(); ind++)
+        {
+          current_weight = 0.;
+          current_rule_id = rule_groups[rule_group_num][ind];
+
+          cerr << "Checking " << current_rule_id << endl; // di
+          // go through patterns
+          for (unsigned int k = 0; k < weighted_patterns[current_rule_id].size(); k++)
+          { 
+            pcreExecRet = pcre_exec(weighted_patterns[current_rule_id][k].first, NULL, 
+                                    tmpchunk.c_str(), tmpchunk.length(), 
+                                    0, 0, subStrVec, 30);
+            if(pcreExecRet >= 0) // bingo!
+            {
+              cerr << "Pattern matched " << weighted_patterns[current_rule_id][k].first; // di
+              current_weight = weighted_patterns[current_rule_id][k].second;
+              cerr << " with weight " << current_weight << endl; // di
+              if (current_weight > chosen_weight && current_rule_id != "" && rule_id_map.count(current_rule_id) > 0) // heavier rule that the transfer file defines
+              {
+                chosen_weight = current_weight;
+                chosen_rule_id = current_rule_id;
+              }
+            }
+          }
+        }
+        cerr << endl; // di
+        // substitute lastrule with the chosen one (its id is known to rule_id_map, so the number is >= 1)
+        lastrule_num = rule_id_map[chosen_rule_id];
+        lastrule = rule_map[lastrule_num-1];
+        cerr << "Chose rule # " << lastrule_num << " id: " << chosen_rule_id;
+        cerr << " with weight " << chosen_weight << endl; // di
+      }
+    }
+  }
+
   words_to_consume = processRule(lastrule);
 
   // some cleanup ?
@@ -2558,7 +2859,7 @@
   // Here, the token contained in word_str is fed 
   // to the fst by stepping with ms
 
-  cerr << "applyWord: applying to " << UtfConverter::toUtf8(word_str) << endl; // di
+  //cerr << "applyWord: applying to " << UtfConverter::toUtf8(word_str) << endl; // di
   ms.step(L'^');
   for(unsigned int i = 0, limit = word_str.size(); i < limit; i++)
   {
@@ -2632,3 +2933,15 @@
 { 
   fstp.setCaseSensitiveMode(value);
 }
+
+void
+Transfer::setUseWeights(bool value)
+{ // switch the use of transfer rule weights on or off
+  this->useWeights = value;
+}
+
+bool
+Transfer::getUseWeights(void) const
+{ // report whether transfer rule weights are in use
+  return this->useWeights;
+}
Index: branches/weighted-transfer/apertium/apertium/apertium_transfer.cc
===================================================================
--- branches/weighted-transfer/apertium/apertium/apertium_transfer.cc	(revision 70271)
+++ branches/weighted-transfer/apertium/apertium/apertium_transfer.cc	(revision 70274)
@@ -46,6 +46,7 @@
   wcerr << "  biltrans   bilingual letter transducer file" << endl;
   wcerr << "  input      input file, standard input by default" << endl;
   wcerr << "  output     output file, standard output by default" << endl;
+  wcerr << "  -w tweights  transfer rule weights file" << endl;
   wcerr << "  -b         input from lexical transfer" << endl;
   wcerr << "  -n         don't use bilingual dictionary" << endl;
   wcerr << "  -x bindix  extended mode with user dictionary" << endl;
@@ -102,6 +103,7 @@
   Transfer t;
 
   int option_index=0;
+  string weights = "";
 
   while (true) {
     static struct option long_options[] =
@@ -109,6 +111,7 @@
       {"from-bilingual",      no_argument, 0, 'b'},
       {"no-bilingual",      no_argument, 0, 'n'},
       {"extended",      required_argument, 0, 'x'},
+      {"transfer-weights",      required_argument, 0, 'w'}, // long form of -w (was misspelled "transfer-weihts")
       {"case-sensitive", no_argument, 0, 'c'},
       {"null-flush", no_argument, 0, 'z'},
       {"trace", no_argument, 0, 't'},
@@ -117,7 +120,7 @@
       {0, 0, 0, 0}
     };
 
-    int c=getopt_long(argc, argv, "nbx:cztTh", long_options, &option_index);
+    int c=getopt_long(argc, argv, "nbx:w:cztTh", long_options, &option_index);
     if (c==-1)
       break;
       
@@ -135,6 +138,12 @@
       case 'x':
         t.setExtendedDictionary(optarg);
         break;
+
+      case 'w': // transfer rule weights file is specified
+        weights = optarg;
+        testfile(weights);
+        t.setUseWeights(true);
+        break;
         
       case 'c':
         t.setCaseSensitiveness(true);
@@ -170,7 +179,7 @@
       testfile(argv[argc-3]);
       testfile(argv[argc-4]);
       testfile(argv[argc-5]);
-      t.read(argv[argc-5], argv[argc-4], argv[argc-3]);
+      t.read(argv[argc-5], argv[argc-4], weights, argv[argc-3]);
       break;
       
     case 5:
@@ -180,7 +189,7 @@
         input = open_input(argv[argc-2]);
         testfile(argv[argc-3]);
         testfile(argv[argc-4]);
-        t.read(argv[argc-4], argv[argc-3]);
+        t.read(argv[argc-4], argv[argc-3], weights);
       }
       else
       {
@@ -188,7 +197,7 @@
         testfile(argv[argc-2]);
         testfile(argv[argc-3]);
         testfile(argv[argc-4]);
-        t.read(argv[argc-4], argv[argc-3], argv[argc-2]);
+        t.read(argv[argc-4], argv[argc-3], weights, argv[argc-2]);
       }
       break;
       
@@ -198,7 +207,7 @@
         input = open_input(argv[argc-1]);
         testfile(argv[argc-2]);
         testfile(argv[argc-3]);
-        t.read(argv[argc-3], argv[argc-2]);
+        t.read(argv[argc-3], argv[argc-2], weights);
       }
       else
       {
@@ -205,7 +214,7 @@
         testfile(argv[argc-1]);
         testfile(argv[argc-2]);
         testfile(argv[argc-3]);
-        t.read(argv[argc-3], argv[argc-2], argv[argc-1]);
+        t.read(argv[argc-3], argv[argc-2], weights, argv[argc-1]);
       }
       break;
     case 3:
@@ -213,7 +222,7 @@
       {
         testfile(argv[argc-1]);
         testfile(argv[argc-2]);
-        t.read(argv[argc-2], argv[argc-1]);
+        t.read(argv[argc-2], argv[argc-1], weights);
       }
       else
       {
Index: branches/weighted-transfer/apertium/apertium/transfer.h
===================================================================
--- branches/weighted-transfer/apertium/apertium/transfer.h	(revision 70271)
+++ branches/weighted-transfer/apertium/apertium/transfer.h	(revision 70274)
@@ -34,6 +34,7 @@
 #include <map>
 #include <set>
 #include <vector>
+#include <pcre.h>
 
 using namespace std;
 
@@ -51,6 +52,11 @@
   map<string, set<string, Ltstr>, Ltstr> listslow;
   vector<xmlNode *> macro_map;
   vector<xmlNode *> rule_map;
+  vector<string> rule_ids; // rule number -> rule id, first meaningful rule at position 1
+  map<string, int> rule_id_map; // rule id -> rule number
+  vector<vector<string> > rule_groups; // rule group number -> rule ids
+  map<string, int> rule_group_map; // id -> rule group number
+  map<string, vector<pair<pcre*, double> > > weighted_patterns; // all weighted patterns, grouped by rule id
   xmlDoc *doc;
   xmlNode *root_element;
   TransferWord **word;
@@ -68,6 +74,7 @@
   int any_tag;
 
   xmlNode *lastrule;
+  int lastrule_num;
   unsigned int nwords;
 
   map<xmlNode *, TransferInstr> evalStringCache;
@@ -77,6 +84,7 @@
   OutputType defaultAttrs;
   bool preBilingual;
   bool useBilingual;
+  bool useWeights;
   bool null_flush;
   bool internal_null_flush;
   bool trace;
@@ -87,8 +95,11 @@
   void readData(FILE *input);
   void readBil(string const &filename);
   void readTransfer(string const &input);
+  void readTransferWeights(string const &in); // read transfer weights file
   void collectMacros(xmlNode *localroot);
   void collectRules(xmlNode *localroot);
+  string getRuleId(xmlNode *localroot); // get value of 'id' property of 'rule' element
+  string getNodeAttr(xmlNode *localroot, const char* attr_name);
   string caseOf(string const &str);
   string copycase(string const &source_word, string const &target_word);
 
@@ -134,8 +145,8 @@
   Transfer();
   ~Transfer();
   
-  void read(string const &transferfile, string const &datafile,
-	    string const &fstfile = "");
+  void read(string const &transferfile, string const &datafile, 
+            string const &weightsfile = "", string const &fstfile = "");
   void transfer(FILE *in, FILE *out);
   void setUseBilingual(bool value);
   bool getUseBilingual(void) const;
@@ -147,6 +158,8 @@
   void setNullFlush(bool null_flush);
   void setTrace(bool trace);
   void setTraceATT(bool trace);
+  void setUseWeights(bool weighted);
+  bool getUseWeights(void) const;
 };
 
 #endif
Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-eng/apertium-eng.eng.dix
===================================================================
--- branches/weighted-transfer/apertium-toy-ru-en/apertium-eng/apertium-eng.eng.dix	(revision 70271)
+++ branches/weighted-transfer/apertium-toy-ru-en/apertium-eng/apertium-eng.eng.dix	(revision 70274)
@@ -136,7 +136,7 @@
     <e lm="boot"><i>boot</i><par n="regular__n"/></e>
     <e lm="chest"><i>chest</i><par n="regular__n"/></e>
     <e lm="lock"><i>lock</i><par n="regular__n"/></e>
-    <e lm="stock"><i>stock</i><par n="regular__n"/></e>
+    <e lm="sock"><i>sock</i><par n="regular__n"/></e>
     <e lm="pair"><i>pair</i><par n="regular__n"/></e>
     <e lm="day"><i>day</i><par n="regular__n"/></e>
     <e lm="of"><i>of</i><par n="__pr"/></e>
Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-eng/eng-rus.automorf.bin
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-eng/rus-eng.autogen.bin
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.t1x
===================================================================
--- branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.t1x	(revision 70271)
+++ branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.t1x	(revision 70274)
@@ -10,6 +10,8 @@
     <def-cat n="adj_gen">
       <cat-item tags="adj.*.gen"/>
       <cat-item tags="num.*.gen"/>
+      <!--cat-item tags="num.*.acc"/>
+      <cat-item tags="adj.*.acc"/-->
     </def-cat>
     <def-cat n="sent">
       <cat-item tags="sent"/>
@@ -120,33 +122,6 @@
       </action>
     </rule>
 
-    <rule id="two_pt_attr_sg" comment="attributive noun construction in sg">
-      <pattern>
-        <pattern-item n="nom"/>
-        <pattern-item n="nom_gen"/>
-      </pattern>
-      <action>
-        <out>
-          <chunk name="nom_pos_nom" case="caseFirstWord">
-            <tags>
-              <tag><lit-tag v="SN"/></tag>
-            </tags>
-            <lu>
-              <clip pos="2" side="tl" part="lem"/>
-              <clip pos="2" side="tl" part="a_nom"/>
-              <lit-tag v="sg"/>
-            </lu>
-            <b/>
-            <lu>
-              <clip pos="1" side="tl" part="lem"/>
-              <clip pos="1" side="tl" part="a_nom"/>
-              <clip pos="1" side="tl" part="nbr"/>
-            </lu>
-          </chunk>
-        </out>
-      </action>
-    </rule>
-
     <rule id="two_pt_of" comment="of-construction">
       <pattern>
         <pattern-item n="nom"/>
@@ -169,11 +144,11 @@
               <lit-tag v="pr"/>
             </lu>
             <b/>
-            <lu>
+            <!--lu>
               <lit v="a"/>
               <lit-tag v="det.ind"/>
             </lu>
-            <b/>
+            <b/-->
             <lu>
               <clip pos="2" side="tl" part="lem"/>
               <clip pos="2" side="tl" part="a_nom"/>
@@ -251,7 +226,7 @@
       </action>
     </rule>
 
-    <rule id="three_pt_attr_sg" comment="attributive construction with adj in sg">
+    <rule id="three_pt_of" comment="of-construction with adj">
       <pattern>
         <pattern-item n="nom"/>
         <pattern-item n="adj_gen"/>
@@ -259,60 +234,31 @@
       </pattern>
       <action>
         <out>
-          <chunk name="nom_pos_adj_nom" case="caseFirstWord">
+          <chunk name="nom_of_adj_nom" case="caseFirstWord">
             <tags>
               <tag><lit-tag v="SN"/></tag>
             </tags>
             <lu>
-              <clip pos="2" side="tl" part="lem"/>
-              <clip pos="2" side="tl" part="a_adj"/>
+              <clip pos="1" side="tl" part="lem"/>
+              <clip pos="1" side="tl" part="a_nom"/>
+              <clip pos="1" side="tl" part="nbr"/>
             </lu>
             <b/>
             <lu>
-              <clip pos="3" side="tl" part="lem"/>
-              <clip pos="3" side="tl" part="a_nom"/>
-              <lit-tag v="sg"/>
+              <lit v="of"/>
+              <lit-tag v="pr"/>
             </lu>
             <b/>
             <lu>
-              <clip pos="1" side="tl" part="lem"/>
-              <clip pos="1" side="tl" part="a_nom"/>
-              <clip pos="1" side="tl" part="nbr"/>
+              <clip pos="2" side="tl" part="lem"/>
+              <clip pos="2" side="tl" part="a_adj"/>
             </lu>
-          </chunk>
-        </out>
-      </action>
-    </rule>
-
-    <rule id="three_pt_of" comment="of-construction with adj">
-      <pattern>
-        <pattern-item n="nom"/>
-        <pattern-item n="adj_gen"/>
-        <pattern-item n="nom_gen"/>
-      </pattern>
-      <action>
-        <out>
-          <chunk name="nom_of_adj_nom" case="caseFirstWord">
-            <tags>
-              <tag><lit-tag v="SN"/></tag>
-            </tags>
-            <lu>
-              <clip pos="3" side="tl" part="lem"/>
-              <clip pos="3" side="tl" part="a_adj"/>
-            </lu>
             <b/>
             <lu>
               <clip pos="3" side="tl" part="lem"/>
               <clip pos="3" side="tl" part="a_nom"/>
               <clip pos="3" side="tl" part="nbr"/>
-              <lit-tag v="attr"/>
             </lu>
-            <b/>
-            <lu>
-              <clip pos="1" side="tl" part="lem"/>
-              <clip pos="1" side="tl" part="a_nom"/>
-              <clip pos="1" side="tl" part="nbr"/>
-            </lu>
           </chunk>
         </out>
       </action>
Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.w1x
===================================================================
--- branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.w1x	(revision 70271)
+++ branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/apertium-rus-eng.rus-eng.w1x	(revision 70274)
@@ -1,7 +1,8 @@
 <?xml version='1.0' encoding='UTF-8'?>
 <transfer-weights>
+  <rule-group>
   <rule id="two_pt_poss" comment="possessive case" md5="c28391b3328137d15b61ba833a5e69a5">
-    <pattern weight="0.8">
+      <pattern weight="0.9">
       <pattern-item tags="n.*"/>
       <pattern-item lemma="мертвец" tags="n.*.gen"/>
     </pattern>
@@ -15,19 +16,25 @@
     </pattern>
   </rule>
   <rule id="two_pt_attr" comment="attributive noun construction retaining sg/pl" md5="673433d932d4b5d7f7b094cd847ab80e">
-    <pattern weight="0.2">
-      <pattern-item lemma="пара" tags="n.*"/>
-      <pattern-item tags="n.pl.gen"/>
+      <pattern weight="0.9">
+        <pattern-item tags="n.*"/>
+        <pattern-item lemma="замок" tags="n.*.gen"/>
     </pattern>
-  </rule>
-  <rule id="two_pt_attr_sg" comment="attributive noun construction in sg" md5="78c1a1f1a9656dfec85c7d4f9009e660">
-    <pattern weight="0.8">
-      <pattern-item lemma="пара" tags="n.*"/>
-      <pattern-item tags="n.pl.gen"/>
+      <pattern weight="0.9">
+        <pattern-item tags="n.*"/>
+        <pattern-item lemma="сундук" tags="n.*.gen"/>
     </pattern>
+      <pattern weight="0.9">
+        <pattern-item tags="n.*"/>
+        <pattern-item lemma="сапог" tags="n.*.gen"/>
+      </pattern>
+      <pattern weight="0.9">
+        <pattern-item tags="n.*"/>
+        <pattern-item lemma="носок" tags="n.*.gen"/>
+      </pattern>
   </rule>
   <rule id="two_pt_of" comment="of-construction" md5="b6c1b7a4af3f540f13e25e12c8f07393">
-    <pattern weight="0.2">
+    <pattern weight="0.1">
       <pattern-item tags="n.*"/>
       <pattern-item lemma="мертвец" tags="n.*.gen"/>
     </pattern>
@@ -40,6 +47,8 @@
       <pattern-item lemma="мертвец" tags="n.pl.gen"/>
     </pattern>
   </rule>
+  </rule-group>
+  <rule-group>
   <rule id="three_pt_poss" comment="possessive case with adj" md5="bf48a7ee556d3003276d66826b01c246">
     <pattern weight="0.25">
       <pattern-item tags="n.*"/>
@@ -59,18 +68,6 @@
       <pattern-item tags="n.*.gen"/>
     </pattern>
   </rule>
-  <rule id="three_pt_attr_sg" comment="attributive construction with adj in sg" md5="8a4a44c2138f1eba02361f4356d7f8a3">
-    <pattern weight="0.25">
-      <pattern-item tags="n.*"/>
-      <pattern-item tags="adj.*.gen"/>
-      <pattern-item tags="n.*.gen"/>
-    </pattern>
-    <pattern weight="0.0">
-      <pattern-item tags="n.*"/>
-      <pattern-item tags="num.*.gen"/>
-      <pattern-item tags="n.*.gen"/>
-    </pattern>
-  </rule>
   <rule id="three_pt_of" comment="of-construction with adj" md5="7f5a99c6b5c6e9be29148e2405674bfa">
     <pattern weight="0.25">
       <pattern-item tags="n.*"/>
@@ -83,4 +80,5 @@
       <pattern-item tags="n.*.gen"/>
     </pattern>
   </rule>
+  </rule-group>
 </transfer-weights>
Index: branches/weighted-transfer/apertium-toy-ru-en/apertium-rus-eng/rus-eng.t1x.bin
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: branches/weighted-transfer/apertium-toy-ru-en/process.sh
===================================================================
--- branches/weighted-transfer/apertium-toy-ru-en/process.sh	(revision 70271)
+++ branches/weighted-transfer/apertium-toy-ru-en/process.sh	(revision 70274)
@@ -5,7 +5,7 @@
 lt-proc apertium-rus/rus-eng.automorf.bin | 
 gawk 'BEGIN{RS="$"; FS="/";}{nf=split($1,COMPONENTS,"^"); for(i = 1; i<nf; i++) printf COMPONENTS[i]; if($2 != "") printf("^%s$",$2);}' | 
 lt-proc -b apertium-rus-eng/rus-eng.autobil.bin | # bilingual output 
-apertium-transfer -b apertium-rus-eng/apertium-rus-eng.rus-eng.t1x apertium-rus-eng/rus-eng.t1x.bin | 
+apertium-transfer -bw apertium-rus-eng/apertium-rus-eng.rus-eng.w1x apertium-rus-eng/apertium-rus-eng.rus-eng.t1x apertium-rus-eng/rus-eng.t1x.bin | 
 apertium-interchunk apertium-rus-eng/apertium-rus-eng.rus-eng.t2x apertium-rus-eng/rus-eng.t2x.bin | 
 apertium-postchunk apertium-rus-eng/apertium-rus-eng.rus-eng.t3x apertium-rus-eng/rus-eng.t3x.bin | 
 lt-proc -g apertium-eng/rus-eng.autogen.bin |