commit 38b1060d382e6693598709f92369b46d946164a1
Author: Tanmai Khanna <khanna.tanmai@gmail.com>
Date:   Mon Aug 10 18:55:00 2020 +0530

    wblanks appear in the output now

diff --git a/src/chunk.h b/src/chunk.h
index 4a1f2b4..16b605a 100644
--- a/src/chunk.h
+++ b/src/chunk.h
@@ -256,12 +256,14 @@ public:
       }
       else if(out == NULL)
       {
+        cout << UtfConverter::toUtf8(wblank);
         cout << "^";
         cout << UtfConverter::toUtf8(target);
         cout << "$";
       }
       else
       {
+        fputs_unlocked(UtfConverter::toUtf8(wblank).c_str(), out);
         fputc_unlocked('^', out);
         fputs_unlocked(UtfConverter::toUtf8(target).c_str(), out);
         fputc_unlocked('$', out);
diff --git a/src/rtx_processor.cc b/src/rtx_processor.cc
index d784453..75a5fb2 100644
--- a/src/rtx_processor.cc
+++ b/src/rtx_processor.cc
@@ -296,15 +296,19 @@ RTXProcessor::stackCopy(int src, int dest)
   {
     case 0:
       theStack[dest].b = theStack[src].b;
+      theWblankStack[dest] = theWblankStack[src];
       break;
     case 1:
       theStack[dest].i = theStack[src].i;
+      theWblankStack[dest] = theWblankStack[src];
       break;
     case 2:
       theStack[dest].s = theStack[src].s;
+      theWblankStack[dest] = theWblankStack[src];
       break;
     case 3:
       theStack[dest].c = theStack[src].c;
+      theWblankStack[dest] = theWblankStack[src];
       break;
     default:
       wcerr << "Unknown StackElement mode " << theStack[src].mode;
@@ -312,6 +316,76 @@ RTXProcessor::stackCopy(int src, int dest)
   }
 }
 
+bool
+RTXProcessor::gettingLemmaFromWord(wstring attr)
+{
+    return (attr.compare(L"lem") == 0 || attr.compare(L"lemh") == 0 || attr.compare(L"whole") == 0);
+}
+
+wstring
+RTXProcessor::combineWblanks(wstring wblank_current, wstring wblank_to_add)
+{
+  if(wblank_current.empty() && wblank_to_add.empty())
+  {
+    return wblank_current;
+  }
+  else if(wblank_current.empty())
+  {
+    return wblank_to_add;
+  }
+  else if(wblank_to_add.empty())
+  {
+    return wblank_current;
+  }
+  
+  wstring new_out_wblank;
+  for(wstring::const_iterator it = wblank_current.begin(); it != wblank_current.end(); it++)
+  {
+    if(*it == '\\')
+    {
+      new_out_wblank += *it;
+      it++;
+      new_out_wblank += *it;
+    }
+    else if(*it == ']')
+    {
+      if(*(it+1) == ']')
+      {
+        new_out_wblank += ';';
+        break;
+      }
+    }
+    else
+    {
+      new_out_wblank += *it;
+    }
+  }
+  
+  for(wstring::const_iterator it = wblank_to_add.begin(); it != wblank_to_add.end(); it++)
+  {
+    if(*it == '\\')
+    {
+      new_out_wblank += *it;
+      it++;
+      new_out_wblank += *it;
+    }
+    else if(*it == '[')
+    {
+      if(*(it+1) == '[')
+      {
+        new_out_wblank += ' ';
+        it++;
+      }
+    }
+    else
+    {
+      new_out_wblank += *it;
+    }
+  }
+  
+  return new_out_wblank;
+}
+
 bool
 RTXProcessor::applyRule(const wstring& rule)
 {
@@ -320,6 +394,13 @@ RTXProcessor::applyRule(const wstring& rule)
   const wchar_t* rule_data = rule.data();
   for(unsigned int i = 0, rule_size = rule.size(); i < rule_size; i++)
   {
+    /*
+    if(!theWblankStack[stackIdx].empty())
+    {
+      wcerr << "\n%%wblstack%%" << theWblankStack[stackIdx] << "%%\n";
+    }
+     */
+    
     switch(rule_data[i])
     {
       case DROP:
@@ -600,6 +681,8 @@ RTXProcessor::applyRule(const wstring& rule)
               Chunk* temp = chunkPool.next();
               temp->isBlank = false;
               temp->target = ch->target.substr(last, c-last);
+              temp->wblank = out_wblank;
+              out_wblank.clear();
               if(chunk) currentOutput.back()->contents.push_back(temp);
               else currentOutput.push_back(temp);
               last = c+1;
@@ -634,7 +717,9 @@ RTXProcessor::applyRule(const wstring& rule)
         }
         else
         {
+          ch->wblank = out_wblank;
           currentOutput.push_back(ch);
+          out_wblank.clear();
         }
       }
         break;
@@ -680,7 +765,18 @@ RTXProcessor::applyRule(const wstring& rule)
         popString(part);
         Chunk* ch = popChunk();
         if(ch == NULL) pushStack(L"");
-        else pushStack(ch->chunkPart(attr_items[part], SourceClip));
+        else
+        {
+          if(gettingLemmaFromWord(part))
+          {
+            pushStack(ch->chunkPart(attr_items[part], SourceClip), ch->wblank);
+            //wcerr << "\n\n#" << part << "#s#" << theWblankStack[stackIdx] << "###\n\n";
+          }
+          else
+          {
+            pushStack(ch->chunkPart(attr_items[part], SourceClip));
+          }
+        }
         if(printingSteps) { wcerr << " -> " << theStack[stackIdx].s << endl; }
       }
         break;
@@ -691,7 +787,18 @@ RTXProcessor::applyRule(const wstring& rule)
         popString(part);
         Chunk* ch = popChunk();
         if(ch == NULL) pushStack(L"");
-        else pushStack(ch->chunkPart(attr_items[part], TargetClip));
+        else
+        {
+          if(gettingLemmaFromWord(part))
+          {
+            pushStack(ch->chunkPart(attr_items[part], TargetClip), ch->wblank);
+            //wcerr << "\n\n#" << part << "#t#" << theWblankStack[stackIdx] << "###\n\n";
+          }
+          else
+          {
+             pushStack(ch->chunkPart(attr_items[part], TargetClip));
+          }
+        }
         if(printingSteps) { wcerr << " -> " << theStack[stackIdx].s << endl; }
       }
         break;
@@ -825,10 +932,12 @@ RTXProcessor::applyRule(const wstring& rule)
         if(theStack[stackIdx+1].mode == 2)
         {
           theStack[stackIdx].c->target += theStack[stackIdx+1].s;
+          out_wblank = combineWblanks(out_wblank, theWblankStack[stackIdx+1]);
         }
         else
         {
           theStack[stackIdx].c->target += theStack[stackIdx+1].c->target;
+          theStack[stackIdx].c->wblank += theStack[stackIdx+1].c->wblank;
         }
         if(printingSteps) { wcerr << " -> " << theStack[stackIdx+1].s << endl; }
       }
@@ -850,10 +959,12 @@ RTXProcessor::applyRule(const wstring& rule)
         if(theStack[stackIdx+1].mode == 2)
         {
           theStack[stackIdx].c->source += theStack[stackIdx+1].s;
+          out_wblank = combineWblanks(out_wblank, theWblankStack[stackIdx+1]);
         }
         else
         {
           theStack[stackIdx].c->source += theStack[stackIdx+1].c->source;
+          theStack[stackIdx].c->wblank += theStack[stackIdx+1].c->wblank;
         }
         if(printingSteps) { wcerr << " -> " << theStack[stackIdx+1].s << endl; }
       }
@@ -1069,6 +1180,7 @@ RTXProcessor::readToken(FILE *in)
           else if(val == L'^')
           {
             inwblank = false;
+            cur = L"[[" + cur;
             wbl.swap(cur);
             inword = true;
           }
@@ -1747,6 +1859,7 @@ RTXProcessor::processGLR(FILE *in, FILE *out)
       outputAll(out);
       variables = currentBranch->stringVars;
       fflush(out);
+      vector<wstring> wblanks;
       vector<wstring> sources;
       vector<wstring> targets;
       vector<wstring> corefs;
@@ -1765,6 +1878,7 @@ RTXProcessor::processGLR(FILE *in, FILE *out)
         {
           unknowns.push_back(false);
         }
+        wblanks.push_back(temp->wblank);
         sources.push_back(temp->source);
         targets.push_back(temp->target);
         corefs.push_back(temp->coref);
@@ -1784,6 +1898,7 @@ RTXProcessor::processGLR(FILE *in, FILE *out)
       for(int i = 0; i < N; i++)
       {
         Chunk* c = chunkPool.next();
+        c->wblank = wblanks[i];
         c->source = sources[i];
         c->target = targets[i];
         c->coref = corefs[i];
diff --git a/src/rtx_processor.h b/src/rtx_processor.h
index 8c68783..2c92607 100644
--- a/src/rtx_processor.h
+++ b/src/rtx_processor.h
@@ -122,6 +122,23 @@ private:
    * Index of the top element on theStack
    */
   int stackIdx;
+  
+  /**
+   * A parallel stack to store wordbound blanks that mimics the operations
+   * of the main stack. wblanks are added everytime lemmas are clipped
+   */
+  wstring theWblankStack[32];
+  
+  /**
+   * wordbound blank to be output
+   */
+  wstring out_wblank;
+  
+  /**
+   * A map of variable name to the wblank of the lemma in the variable
+   * if one was clipped and added to the variable
+  */
+  map <wstring, wstring> var_out_wblank;
 
   /**
    * Input to the virtual machine
@@ -356,21 +373,25 @@ private:
   {
     theStack[++stackIdx].mode = 0;
     theStack[stackIdx].b = b;
+    theWblankStack[stackIdx].clear();
   }
   inline void pushStack(int i)
   {
     theStack[++stackIdx].mode = 1;
     theStack[stackIdx].i = i;
+    theWblankStack[stackIdx].clear();
   }
-  inline void pushStack(const wstring& s)
+  inline void pushStack(const wstring& s, wstring wbl = L"")
   {
     theStack[++stackIdx].mode = 2;
     theStack[stackIdx].s.assign(s);
+    theWblankStack[stackIdx] = wbl;
   }
   inline void pushStack(Chunk* c)
   {
     theStack[++stackIdx].mode = 3;
     theStack[stackIdx].c = c;
+    theWblankStack[stackIdx].clear();
   }
 
   /**
@@ -430,6 +451,16 @@ private:
    */
   void processTRX(FILE* in, FILE* out);
   
+  /**
+   * True if clipping lem/lemh/whole
+  */
+  bool gettingLemmaFromWord(wstring attr);
+  
+  /**
+   * Combines two wordbound blanks and returns it
+  */
+  wstring combineWblanks(wstring wblank_current, wstring wblank_to_add);
+  
 public:
   RTXProcessor();
   ~RTXProcessor();