commit 5086e3d82d275c5c87ec44e9347c6e9ea38992db Author: Tanmai Khanna Date: Sun Jul 5 21:12:04 2020 +0530 all wblank features in postchunk | add blank in interchunk (only for usage in postchunk) diff --git a/apertium/interchunk_word.cc b/apertium/interchunk_word.cc index 690988d..fc7b000 100644 --- a/apertium/interchunk_word.cc +++ b/apertium/interchunk_word.cc @@ -65,6 +65,7 @@ InterchunkWord::operator =(InterchunkWord const &o) void InterchunkWord::init(string const &chunk) { + size_t b_end = 0; for(size_t i = 0; i < chunk.size(); i++) { if(chunk[i] == '\\') @@ -77,8 +78,24 @@ InterchunkWord::init(string const &chunk) this->queue = chunk.substr(i); return; } + else if(chunk[i] == ']') + { + if(chunk[i-1] == ']') + { + b_end = i+1; + } + } + } + + if(b_end > 0) + { + this->blank = chunk.substr(0, b_end); + this->chunk = chunk.substr(b_end); + } + else + { + this->chunk = chunk; } - this->chunk = chunk; this->queue = ""; } @@ -108,6 +125,12 @@ InterchunkWord::chunkPart(ApertiumRE const &part) } } +string +InterchunkWord::getBlank() +{ + return blank; +} + bool InterchunkWord::setChunkPart(ApertiumRE const &part, string const &value) { diff --git a/apertium/interchunk_word.h b/apertium/interchunk_word.h index 317263e..8608b7d 100644 --- a/apertium/interchunk_word.h +++ b/apertium/interchunk_word.h @@ -39,6 +39,11 @@ private: * Target language chunk content */ string queue; + + /** + * Wordbound blank (for postchunk) + */ + string blank; /** * Copy method @@ -92,6 +97,12 @@ public: * @returns reference to the part of string matched */ string chunkPart(ApertiumRE const &part); + + /** + * Reference the wordbound blank (for postchunk) + * @returns reference to the part of string matched + */ + string getBlank(); /** * Sets a value for a chunk part diff --git a/apertium/postchunk.cc b/apertium/postchunk.cc index aa5333f..7ef012f 100644 --- a/apertium/postchunk.cc +++ b/apertium/postchunk.cc @@ -232,6 +232,67 @@ Postchunk::checkIndex(xmlNode *element, int index, int limit) return true; } +bool +Postchunk::gettingLemmaFromWord(string attr) +{ + return (attr.compare("lem") == 0 || attr.compare("lemh") == 0 || attr.compare("whole") == 0); +} + +string +Postchunk::combineWblanks(string wblank_current, string wblank_to_add) +{ + if(wblank_current.empty()) + { + return wblank_to_add; + } + + string new_out_wblank; + for(string::const_iterator it = wblank_current.begin(); it != wblank_current.end(); it++) + { + if(*it == '\\') + { + new_out_wblank += *it; + it++; + new_out_wblank += *it; + } + else if(*it == ']') + { + if(*(it+1) == ']') + { + new_out_wblank += ';'; + break; + } + } + else + { + new_out_wblank += *it; + } + } + + for(string::const_iterator it = wblank_to_add.begin(); it != wblank_to_add.end(); it++) + { + if(*it == '\\') + { + new_out_wblank += *it; + it++; + new_out_wblank += *it; + } + else if(*it == '[') + { + if(*(it+1) == '[') + { + new_out_wblank += ' '; + it++; + } + } + else + { + new_out_wblank += *it; + } + } + + return new_out_wblank; +} string Postchunk::evalString(xmlNode *element) @@ -246,6 +307,18 @@ Postchunk::evalString(xmlNode *element) case ti_clip_tl: if(checkIndex(element, ti.getPos(), lword)) { + if(gettingLemmaFromWord(ti.getContent())) + { + if(in_lu) + { + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getBlank()); + } + else if(in_let_var) + { + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getBlank()); + } + } + return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); } break; @@ -254,6 +327,8 @@ Postchunk::evalString(xmlNode *element) return StringUtils::itoa_string(tmpword.size()); case ti_var: + out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); + return variables[ti.getContent()]; case ti_lit_tag: @@ -387,6 +462,9 @@ Postchunk::evalString(xmlNode *element) } else if(!xmlStrcmp(element->name, (const xmlChar *) "lu")) { + in_lu = true; + out_wblank.clear(); + string myword; for(xmlNode *i = element->children; i != NULL; i = i->next) { @@ -395,10 +473,12 @@ Postchunk::evalString(xmlNode *element) myword.append(evalString(i)); } } + + in_lu = false; if(myword != "") { - return "^"+myword+"$"; + return out_wblank+"^"+myword+"$"; } else { @@ -410,20 +490,25 @@ Postchunk::evalString(xmlNode *element) string value; bool first_time = true; + out_wblank.clear(); for(xmlNode *i = element->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) { + in_lu = true; + string myword; for(xmlNode *j = i->children; j != NULL; j = j->next) { if(j->type == XML_ELEMENT_NODE) - { + { myword.append(evalString(j)); - } + } } + + in_lu = false; if(!first_time) { @@ -446,7 +531,7 @@ Postchunk::evalString(xmlNode *element) if(value != "") { - return "^"+value+"$"; + return out_wblank+"^"+value+"$"; } else { @@ -472,6 +557,9 @@ Postchunk::processOut(xmlNode *localroot) { if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { + in_lu = true; + out_wblank.clear(); + string myword; for(xmlNode *j = i->children; j != NULL; j = j->next) { @@ -480,8 +568,12 @@ Postchunk::processOut(xmlNode *localroot) myword.append(evalString(j)); } } + + in_lu = false; + if(myword != "") { + fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); fputwc_unlocked(L'^', output); fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); fputwc_unlocked(L'$', output); @@ -489,38 +581,49 @@ Postchunk::processOut(xmlNode *localroot) } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { - fputwc_unlocked(L'^', output); + string myword; bool first_time = true; + out_wblank.clear(); + for(xmlNode *j = i->children; j != NULL; j = j->next) { if(j->type == XML_ELEMENT_NODE) { - string myword; + in_lu = true; + + string mylocalword; for(xmlNode *k = j->children; k != NULL; k = k->next) { if(k->type == XML_ELEMENT_NODE) { - myword.append(evalString(k)); + mylocalword.append(evalString(k)); } } + + in_lu = false; if(!first_time) { - if(myword != "") + if(mylocalword != "") { - fputwc_unlocked('+', output); + myword += '+'; } } - else - { - if(myword != "") - { - first_time = false; + else + { + if(mylocalword != "") + { + first_time = false; } - } - fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); - } + } + + myword.append(mylocalword); + } } + + fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); + fputwc_unlocked('^', output); + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); fputwc_unlocked(L'$', output); } else // 'b' @@ -609,7 +712,13 @@ Postchunk::processLet(xmlNode *localroot) switch(ti.getType()) { case ti_var: + in_let_var = true; + var_val = ti.getContent(); + var_out_wblank[var_val].clear(); + variables[ti.getContent()] = evalString(rightSide); + + in_let_var = false; return; case ti_clip_tl: @@ -628,8 +737,16 @@ Postchunk::processLet(xmlNode *localroot) } if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { + in_let_var = true; + string const val = (const char *) leftSide->properties->children->content; + + var_val = val; + var_out_wblank[var_val].clear(); + variables[val] = evalString(rightSide); + + in_let_var = false; evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); } else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) @@ -678,7 +795,10 @@ Postchunk::processAppend(xmlNode *localroot) { if(i->type == XML_ELEMENT_NODE) { + in_let_var = true; + var_val = name; variables[name].append(evalString(i)); + in_let_var = false; } } } @@ -2081,27 +2201,115 @@ Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, } else if(chunk[i] == L'[') { - if (!(lastblank && blanks.back())) + if(chunk[i+1] == L'[') //wordbound blank { - blanks.push_back(new wstring()); + if(!lastblank) + { + blanks.push_back(new wstring(L"")); + } + lastblank = false; + wstring *myword = new wstring(); + wstring &ref = *myword; + + while(true) + { + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else if(chunk[i] == L']' && chunk[i-1] == L']') + { + ref += chunk[i]; + i++; //i->"^" + break; + } + else + { + ref += chunk[i]; + } + + i++; + } + + while(chunk[++i] != L'$') + { + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else if(chunk[i] == L'<') + { + if(iswdigit(chunk[i+1])) + { + // replace tag + unsigned long value = wcstoul(chunk.c_str()+i+1, + NULL, 0) - 1; + if(vectags.size() > value) + { + ref.append(vectags[value]); + } + while(chunk[++i] != L'>'); + } + else + { + ref += L'<'; + while(chunk[++i] != L'>') ref += chunk[i]; + ref += L'>'; + } + } + else + { + if(uppercase_all) + { + ref += towupper(chunk[i]); + } + else if(uppercase_first) + { + if(iswalnum(chunk[i])) + { + ref += towupper(chunk[i]); + uppercase_first = false; + } + else + { + ref += chunk[i]; + } + } + else + { + ref += chunk[i]; + } + } + } + + words.push_back(myword); } - wstring &ref = *(blanks.back()); - ref += L'['; - while(chunk[++i] != L']') + else { - if(chunk[i] == L'\\') + if (!(lastblank && blanks.back())) { - ref += L'\\'; - ref += chunk[++i]; + blanks.push_back(new wstring()); } - else + wstring &ref = *(blanks.back()); + ref += L'['; + while(chunk[++i] != L']') { - ref += chunk[i]; + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else + { + ref += chunk[i]; + } } - } - ref += chunk[i]; + ref += chunk[i]; - lastblank = true; + lastblank = true; + } } else { diff --git a/apertium/postchunk.h b/apertium/postchunk.h index 820393c..5a92234 100644 --- a/apertium/postchunk.h +++ b/apertium/postchunk.h @@ -59,6 +59,13 @@ private: Buffer input_buffer; vector tmpword; vector tmpblank; + + bool in_lu; + bool in_let_var; + string var_val; + bool in_wblank; + string out_wblank; + map var_out_wblank; FILE *output; int any_char; @@ -126,6 +133,8 @@ private: static wstring wordzero(wstring const &chunk); bool checkIndex(xmlNode *element, int index, int limit); void postchunk_wrapper_null_flush(FILE *in, FILE *out); + bool gettingLemmaFromWord(string attr); + string combineWblanks(string wblank_current, string wblank_to_add); public: Postchunk();