commit 8663e98ce4b32a5d4e5ff37bda54e5854abd23b3 Author: Tanmai Khanna Date: Tue Jul 7 16:17:57 2020 +0530 Wordbound blanks in transfer (#90) * wordbound blanks in chunker and postchunk, based on where lem/lemh/whole comes from * blanks merge when LUs merge * tests added diff --git a/apertium/interchunk_word.cc b/apertium/interchunk_word.cc index 690988d..fc7b000 100644 --- a/apertium/interchunk_word.cc +++ b/apertium/interchunk_word.cc @@ -65,6 +65,7 @@ InterchunkWord::operator =(InterchunkWord const &o) void InterchunkWord::init(string const &chunk) { + size_t b_end = 0; for(size_t i = 0; i < chunk.size(); i++) { if(chunk[i] == '\\') @@ -77,8 +78,24 @@ InterchunkWord::init(string const &chunk) this->queue = chunk.substr(i); return; } + else if(chunk[i] == ']') + { + if(chunk[i-1] == ']') + { + b_end = i+1; + } + } + } + + if(b_end > 0) + { + this->blank = chunk.substr(0, b_end); + this->chunk = chunk.substr(b_end); + } + else + { + this->chunk = chunk; } - this->chunk = chunk; this->queue = ""; } @@ -108,6 +125,12 @@ InterchunkWord::chunkPart(ApertiumRE const &part) } } +string +InterchunkWord::getBlank() +{ + return blank; +} + bool InterchunkWord::setChunkPart(ApertiumRE const &part, string const &value) { diff --git a/apertium/interchunk_word.h b/apertium/interchunk_word.h index 317263e..8608b7d 100644 --- a/apertium/interchunk_word.h +++ b/apertium/interchunk_word.h @@ -39,6 +39,11 @@ private: * Target language chunk content */ string queue; + + /** + * Wordbound blank (for postchunk) + */ + string blank; /** * Copy method @@ -92,6 +97,12 @@ public: * @returns reference to the part of string matched */ string chunkPart(ApertiumRE const &part); + + /** + * Reference the wordbound blank (for postchunk) + * @returns reference to the part of string matched + */ + string getBlank(); /** * Sets a value for a chunk part diff --git a/apertium/postchunk.cc b/apertium/postchunk.cc index aa5333f..5a27a4a 100644 --- a/apertium/postchunk.cc +++ b/apertium/postchunk.cc @@ -232,6 +232,75 @@ Postchunk::checkIndex(xmlNode *element, int index, int limit) return true; } +bool +Postchunk::gettingLemmaFromWord(string attr) +{ + return (attr.compare("lem") == 0 || attr.compare("lemh") == 0 || attr.compare("whole") == 0); +} + +string +Postchunk::combineWblanks(string wblank_current, string wblank_to_add) +{ + if(wblank_current.empty() && wblank_to_add.empty()) + { + return wblank_current; + } + else if(wblank_current.empty()) + { + return wblank_to_add; + } + else if(wblank_to_add.empty()) + { + return wblank_current; + } + + string new_out_wblank; + for(string::const_iterator it = wblank_current.begin(); it != wblank_current.end(); it++) + { + if(*it == '\\') + { + new_out_wblank += *it; + it++; + new_out_wblank += *it; + } + else if(*it == ']') + { + if(*(it+1) == ']') + { + new_out_wblank += ';'; + break; + } + } + else + { + new_out_wblank += *it; + } + } + + for(string::const_iterator it = wblank_to_add.begin(); it != wblank_to_add.end(); it++) + { + if(*it == '\\') + { + new_out_wblank += *it; + it++; + new_out_wblank += *it; + } + else if(*it == '[') + { + if(*(it+1) == '[') + { + new_out_wblank += ' '; + it++; + } + } + else + { + new_out_wblank += *it; + } + } + + return new_out_wblank; +} string Postchunk::evalString(xmlNode *element) @@ -246,6 +315,18 @@ Postchunk::evalString(xmlNode *element) case ti_clip_tl: if(checkIndex(element, ti.getPos(), lword)) { + if(gettingLemmaFromWord(ti.getContent())) + { + if(in_lu) + { + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getBlank()); + } + else if(in_let_var) + { + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getBlank()); + } + } + return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); } break; @@ -254,6 +335,8 @@ Postchunk::evalString(xmlNode *element) return StringUtils::itoa_string(tmpword.size()); case ti_var: + out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); + return variables[ti.getContent()]; case ti_lit_tag: @@ -387,6 +470,9 @@ Postchunk::evalString(xmlNode *element) } else if(!xmlStrcmp(element->name, (const xmlChar *) "lu")) { + in_lu = true; + out_wblank.clear(); + string myword; for(xmlNode *i = element->children; i != NULL; i = i->next) { @@ -395,10 +481,12 @@ Postchunk::evalString(xmlNode *element) myword.append(evalString(i)); } } + + in_lu = false; if(myword != "") { - return "^"+myword+"$"; + return out_wblank+"^"+myword+"$"; } else { @@ -410,20 +498,25 @@ Postchunk::evalString(xmlNode *element) string value; bool first_time = true; + out_wblank.clear(); for(xmlNode *i = element->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) { + in_lu = true; + string myword; for(xmlNode *j = i->children; j != NULL; j = j->next) { if(j->type == XML_ELEMENT_NODE) - { + { myword.append(evalString(j)); - } + } } + + in_lu = false; if(!first_time) { @@ -446,7 +539,7 @@ Postchunk::evalString(xmlNode *element) if(value != "") { - return "^"+value+"$"; + return out_wblank+"^"+value+"$"; } else { @@ -472,6 +565,9 @@ Postchunk::processOut(xmlNode *localroot) { if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { + in_lu = true; + out_wblank.clear(); + string myword; for(xmlNode *j = i->children; j != NULL; j = j->next) { @@ -480,8 +576,12 @@ Postchunk::processOut(xmlNode *localroot) myword.append(evalString(j)); } } + + in_lu = false; + if(myword != "") { + fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); fputwc_unlocked(L'^', output); fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); fputwc_unlocked(L'$', output); @@ -489,38 +589,49 @@ Postchunk::processOut(xmlNode *localroot) } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { - fputwc_unlocked(L'^', output); + string myword; bool first_time = true; + out_wblank.clear(); + for(xmlNode *j = i->children; j != NULL; j = j->next) { if(j->type == XML_ELEMENT_NODE) { - string myword; + in_lu = true; + + string mylocalword; for(xmlNode *k = j->children; k != NULL; k = k->next) { if(k->type == XML_ELEMENT_NODE) { - myword.append(evalString(k)); + mylocalword.append(evalString(k)); } } + + in_lu = false; if(!first_time) { - if(myword != "") + if(mylocalword != "") { - fputwc_unlocked('+', output); + myword += '+'; } } - else - { - if(myword != "") - { - first_time = false; + else + { + if(mylocalword != "") + { + first_time = false; } - } - fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); - } + } + + myword.append(mylocalword); + } } + + fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); + fputwc_unlocked('^', output); + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); fputwc_unlocked(L'$', output); } else // 'b' @@ -609,7 +720,13 @@ Postchunk::processLet(xmlNode *localroot) switch(ti.getType()) { case ti_var: + in_let_var = true; + var_val = ti.getContent(); + var_out_wblank[var_val].clear(); + variables[ti.getContent()] = evalString(rightSide); + + in_let_var = false; return; case ti_clip_tl: @@ -628,8 +745,16 @@ Postchunk::processLet(xmlNode *localroot) } if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) { + in_let_var = true; + string const val = (const char *) leftSide->properties->children->content; + + var_val = val; + var_out_wblank[var_val].clear(); + variables[val] = evalString(rightSide); + + in_let_var = false; evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); } else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) @@ -678,7 +803,10 @@ Postchunk::processAppend(xmlNode *localroot) { if(i->type == XML_ELEMENT_NODE) { + in_let_var = true; + var_val = name; variables[name].append(evalString(i)); + in_let_var = false; } } } @@ -2081,27 +2209,115 @@ Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, } else if(chunk[i] == L'[') { - if (!(lastblank && blanks.back())) + if(chunk[i+1] == L'[') //wordbound blank { - blanks.push_back(new wstring()); + if(!lastblank) + { + blanks.push_back(new wstring(L"")); + } + lastblank = false; + wstring *myword = new wstring(); + wstring &ref = *myword; + + while(true) + { + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else if(chunk[i] == L']' && chunk[i-1] == L']') + { + ref += chunk[i]; + i++; //i->"^" + break; + } + else + { + ref += chunk[i]; + } + + i++; + } + + while(chunk[++i] != L'$') + { + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else if(chunk[i] == L'<') + { + if(iswdigit(chunk[i+1])) + { + // replace tag + unsigned long value = wcstoul(chunk.c_str()+i+1, + NULL, 0) - 1; + if(vectags.size() > value) + { + ref.append(vectags[value]); + } + while(chunk[++i] != L'>'); + } + else + { + ref += L'<'; + while(chunk[++i] != L'>') ref += chunk[i]; + ref += L'>'; + } + } + else + { + if(uppercase_all) + { + ref += towupper(chunk[i]); + } + else if(uppercase_first) + { + if(iswalnum(chunk[i])) + { + ref += towupper(chunk[i]); + uppercase_first = false; + } + else + { + ref += chunk[i]; + } + } + else + { + ref += chunk[i]; + } + } + } + + words.push_back(myword); } - wstring &ref = *(blanks.back()); - ref += L'['; - while(chunk[++i] != L']') + else { - if(chunk[i] == L'\\') + if (!(lastblank && blanks.back())) { - ref += L'\\'; - ref += chunk[++i]; + blanks.push_back(new wstring()); } - else + wstring &ref = *(blanks.back()); + ref += L'['; + while(chunk[++i] != L']') { - ref += chunk[i]; + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else + { + ref += chunk[i]; + } } - } - ref += chunk[i]; + ref += chunk[i]; - lastblank = true; + lastblank = true; + } } else { diff --git a/apertium/postchunk.h b/apertium/postchunk.h index 820393c..5a92234 100644 --- a/apertium/postchunk.h +++ b/apertium/postchunk.h @@ -59,6 +59,13 @@ private: Buffer input_buffer; vector tmpword; vector tmpblank; + + bool in_lu; + bool in_let_var; + string var_val; + bool in_wblank; + string out_wblank; + map var_out_wblank; FILE *output; int any_char; @@ -126,6 +133,8 @@ private: static wstring wordzero(wstring const &chunk); bool checkIndex(xmlNode *element, int index, int limit); void postchunk_wrapper_null_flush(FILE *in, FILE *out); + bool gettingLemmaFromWord(string attr); + string combineWblanks(string wblank_current, string wblank_to_add); public: Postchunk(); diff --git a/apertium/transfer.cc b/apertium/transfer.cc index 9f60372..0bc0433 100644 --- a/apertium/transfer.cc +++ b/apertium/transfer.cc @@ -295,7 +295,71 @@ Transfer::checkIndex(xmlNode *element, int index, int limit) bool Transfer::gettingLemmaFromWord(string attr) { - return (attr.compare("lem") == 0 || attr.compare("lemh") == 0); + return (attr.compare("lem") == 0 || attr.compare("lemh") == 0 || attr.compare("whole") == 0); +} + +string +Transfer::combineWblanks(string wblank_current, string wblank_to_add) +{ + if(wblank_current.empty() && wblank_to_add.empty()) + { + return wblank_current; + } + else if(wblank_current.empty()) + { + return wblank_to_add; + } + else if(wblank_to_add.empty()) + { + return wblank_current; + } + + string new_out_wblank; + for(string::const_iterator it = wblank_current.begin(); it != wblank_current.end(); it++) + { + if(*it == '\\') + { + new_out_wblank += *it; + it++; + new_out_wblank += *it; + } + else if(*it == ']') + { + if(*(it+1) == ']') + { + new_out_wblank += ';'; + break; + } + } + else + { + new_out_wblank += *it; + } + } + + for(string::const_iterator it = wblank_to_add.begin(); it != wblank_to_add.end(); it++) + { + if(*it == '\\') + { + new_out_wblank += *it; + it++; + new_out_wblank += *it; + } + else if(*it == '[') + { + if(*(it+1) == '[') + { + new_out_wblank += ' '; + it++; + } + } + else + { + new_out_wblank += *it; + } + } + + return new_out_wblank; } string @@ -315,29 +379,13 @@ Transfer::evalString(xmlNode *element) { if(in_lu) { - secondary_tags.append(word[ti.getPos()]->source(attr_items["sectags"], ti.getCondition())); + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->blank()); } else if(in_let_var) { - string temp_sl_secondary_tags = word[ti.getPos()]->source(attr_items["sectags"], ti.getCondition()); - var_secondary_tags[var_val].append(temp_sl_secondary_tags); + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->blank()); } } - else if(ti.getContent().compare("lemq") == 0) - { - if(in_lu) - { - string sectags_lemq = secondary_tags; - secondary_tags.clear(); - - sectags_lemq.append(word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition())); - return sectags_lemq; - } - else if(in_let_var) - { - var_has_lemq[var_val] = true; - } - } return word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()); } @@ -350,27 +398,11 @@ Transfer::evalString(xmlNode *element) { if(in_lu) { - secondary_tags.append(word[ti.getPos()]->target(attr_items["sectags"], ti.getCondition())); + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->blank()); } else if(in_let_var) { - string temp_tl_secondary_tags = word[ti.getPos()]->target(attr_items["sectags"], ti.getCondition()); - var_secondary_tags[var_val].append(temp_tl_secondary_tags); - } - } - else if(ti.getContent().compare("lemq") == 0) - { - if(in_lu) - { - string sectags_lemq = secondary_tags; - secondary_tags.clear(); - - sectags_lemq.append(word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition())); - return sectags_lemq; - } - else if(in_let_var) - { - var_has_lemq[var_val] = true; + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->blank()); } } @@ -428,35 +460,7 @@ Transfer::evalString(xmlNode *element) break; case ti_var: - secondary_tags.append(var_secondary_tags[ti.getContent()]); - - if(var_has_lemq[ti.getContent()] && !secondary_tags.empty()) - { - string var_content = variables[ti.getContent()]; - string var_content_with_sectags = ""; - int lemq_position = -1; - - for(size_t index = 0; index < var_content.size(); index++) - { - if(var_content[index] == '#') - { - lemq_position = index; - } - else if(var_content[index] == '\\') - { - index++; - continue; - } - } - - var_content_with_sectags = var_content.substr(0,lemq_position).append(secondary_tags).append(var_content.substr(lemq_position, string::npos)); - - secondary_tags.clear(); - - return var_content_with_sectags; - } - - + out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); return variables[ti.getContent()]; case ti_lit_tag: @@ -658,7 +662,7 @@ Transfer::evalString(xmlNode *element) else if(!xmlStrcmp(element->name, (const xmlChar *) "lu")) { in_lu = true; - secondary_tags.clear(); + out_wblank.clear(); string myword; for(xmlNode *i = element->children; i != NULL; i = i->next) @@ -670,11 +674,10 @@ Transfer::evalString(xmlNode *element) } in_lu = false; - myword.append(secondary_tags); if(myword != "") { - return "^"+myword+"$"; + return out_wblank+"^"+myword+"$"; } else { @@ -686,13 +689,13 @@ Transfer::evalString(xmlNode *element) string value; bool first_time = true; + out_wblank.clear(); for(xmlNode *i = element->children; i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) { in_lu = true; - secondary_tags.clear(); string myword; @@ -705,7 +708,6 @@ Transfer::evalString(xmlNode *element) } in_lu = false; - myword.append(secondary_tags); if(!first_time) { @@ -728,7 +730,7 @@ Transfer::evalString(xmlNode *element) if(value != "") { - return "^"+value+"$"; + return out_wblank+"^"+value+"$"; } else { @@ -760,7 +762,7 @@ Transfer::processOut(xmlNode *localroot) if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { in_lu = true; - secondary_tags.clear(); + out_wblank.clear(); string myword; for(xmlNode *j = i->children; j != NULL; j = j->next) @@ -773,9 +775,9 @@ Transfer::processOut(xmlNode *localroot) in_lu = false; - myword.append(secondary_tags); if(myword != "") { + fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); fputwc_unlocked(L'^', output); fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); fputwc_unlocked(L'$', output); @@ -783,46 +785,53 @@ Transfer::processOut(xmlNode *localroot) } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) { - fputwc_unlocked('^', output); - bool first_time = true; - for(xmlNode *j = i->children; j != NULL; j = j->next) - { - if(j->type == XML_ELEMENT_NODE) - { - in_lu = true; - secondary_tags.clear(); - - string myword; - for(xmlNode *k = j->children; k != NULL; k = k->next) - { - if(k->type == XML_ELEMENT_NODE) - { - myword.append(evalString(k)); - } - } - - in_lu = false; - myword.append(secondary_tags); + string myword; + bool first_time = true; + out_wblank.clear(); + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + in_lu = true; + + string mylocalword; + for(xmlNode *k = j->children; k != NULL; k = k->next) + { + if(k->type == XML_ELEMENT_NODE) + { + mylocalword.append(evalString(k)); + } + } + + in_lu = false; - if(!first_time) - { - if(myword != "" && myword[0] != '#') //'+#' problem - { - fputwc_unlocked(L'+', output); + if(!first_time) + { + if(mylocalword != "" && mylocalword[0] != '#') //'+#' problem + { + myword += '+'; } - } - else - { - if(myword != "") - { - first_time = false; + } + else + { + if(mylocalword != "") + { + first_time = false; } - } - - fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); - } - } - fputwc_unlocked(L'$', output); + } + + myword.append(mylocalword); + } + } + + if(myword != "") + { + fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); + fputwc_unlocked('^', output); + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); + fputwc_unlocked(L'$', output); + } } else // 'b' { @@ -914,7 +923,7 @@ Transfer::processChunk(xmlNode *localroot) else if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) { in_lu = true; - secondary_tags.clear(); + out_wblank.clear(); string myword; for(xmlNode *j = i->children; j != NULL; j = j->next) @@ -925,11 +934,10 @@ Transfer::processChunk(xmlNode *localroot) } } in_lu = false; - - myword.append(secondary_tags); if(myword != "") { + result.append(out_wblank); result.append("^"); result.append(myword); result.append("$"); @@ -939,13 +947,15 @@ Transfer::processChunk(xmlNode *localroot) { bool first_time = true; string myword; + + out_wblank.clear(); + for(xmlNode *j = i->children; j != NULL; j = j->next) { string mylocalword; if(j->type == XML_ELEMENT_NODE) { in_lu = true; - secondary_tags.clear(); for(xmlNode *k = j->children; k != NULL; k = k->next) { @@ -956,7 +966,6 @@ Transfer::processChunk(xmlNode *localroot) } in_lu = false; - mylocalword.append(secondary_tags); if(!first_time) { @@ -974,6 +983,7 @@ Transfer::processChunk(xmlNode *localroot) } if(myword != "") { + result.append(out_wblank); result.append("^"); result.append(myword); result.append("$"); @@ -1100,8 +1110,7 @@ Transfer::processLet(xmlNode *localroot) in_let_var = true; var_val = ti.getContent(); - var_secondary_tags[var_val].clear(); - var_has_lemq[var_val] = false; + var_out_wblank[var_val].clear(); variables[ti.getContent()] = evalString(rightSide); @@ -1150,8 +1159,7 @@ Transfer::processLet(xmlNode *localroot) string const val = (const char *) leftSide->properties->children->content; var_val = val; - var_secondary_tags[var_val].clear(); - var_has_lemq[var_val] = false; + var_out_wblank[var_val].clear(); variables[val] = evalString(rightSide); @@ -2067,8 +2075,38 @@ Transfer::readToken(FILE *in) int val = fgetwc_unlocked(in); if(feof(in) || (val == 0 && internal_null_flush)) { + in_wblank = false; return input_buffer.add(TransferToken(content, tt_eof)); } + if(in_wblank) + { + content = L"[["; + content+= wchar_t(val); + + while(true) + { + int val3 = fgetwc_unlocked(in); + if(val3 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val3 == L'$') //[[..]]^..$ is the LU + { + in_wblank = false; + return input_buffer.add(TransferToken(content, tt_word)); + } + else if(val3 == L'\0' && null_flush) + { + in_wblank = false; + fflush(output); + } + else + { + content += wchar_t(val3); + } + } + } if(val == '\\') { content += L'\\'; @@ -2079,21 +2117,28 @@ Transfer::readToken(FILE *in) content += L'['; while(true) { - int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') - { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); - } - else if(val2 == L']') - { - content += L']'; - break; - } - else - { - content += wchar_t(val2); - } + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L'[') + { //wordbound blank + in_wblank = true; + content.pop_back(); + + return input_buffer.add(TransferToken(content, tt_blank)); + } + else if(val2 == L']') + { + content += L']'; + break; + } + else + { + content += wchar_t(val2); + } } } else if(val == L'$') @@ -2106,6 +2151,7 @@ Transfer::readToken(FILE *in) } else if(val == L'\0' && null_flush) { + in_wblank = false; fflush(output); } else @@ -2172,6 +2218,7 @@ Transfer::transfer(FILE *in, FILE *out) unsigned int prev_last = last; int lastrule_id = -1; set banned_rules; + in_wblank = false; output = out; ms.init(me->getInitial()); @@ -2266,6 +2313,7 @@ Transfer::transfer(FILE *in, FILE *out) } pair tr; + wstring tr_wblank; if(useBilingual && preBilingual == false) { if(isExtended && (*tmpword[0])[0] == L'*') @@ -2290,6 +2338,7 @@ Transfer::transfer(FILE *in, FILE *out) wstring sl; wstring tl; wstring ref; + wstring wblank; int seenSlash = 0; for(wstring::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++) @@ -2316,6 +2365,47 @@ Transfer::transfer(FILE *in, FILE *out) } continue; } + else if(*it == L'[') + { + if(*(it+1) == L'[') //wordbound blank + { + while(true) + { + if(*it == L'\\') + { + wblank.push_back(*it); + it++; + wblank.push_back(*it); + } + else if(*it == L'^' && *(it-1) == L']' && *(it-2) == L']') + { + break; + } + else + { + wblank.push_back(*it); + } + + it++; + } + } + else + { + if(seenSlash == 0) + { + sl.push_back(*it); + } + else if(seenSlash == 1) + { + tl.push_back(*it); + } + else + { + ref.push_back(*it); + } + } + continue; + } else if(*it == L'/') { seenSlash++; @@ -2338,6 +2428,7 @@ Transfer::transfer(FILE *in, FILE *out) } //tmpword[0]->assign(sl); tr = pair(tl, false); + tr_wblank = wblank; //wcerr << L"pb: " << *tmpword[0] << L" :: " << sl << L" >> " << tl << endl ; } else @@ -2349,23 +2440,30 @@ Transfer::transfer(FILE *in, FILE *out) { if(defaultAttrs == lu) { - fputwc_unlocked(L'^', output); + if(tr.first[0] != L'[' || tr.first[1] != L'[') + { + fputwc_unlocked(L'^', output); + } fputws_unlocked(tr.first.c_str(), output); fputwc_unlocked(L'$', output); - } - else - { - if(tr.first[0] == '*') - { - fputws_unlocked(L"^unknown{^", output); - } - else - { - fputws_unlocked(L"^default{^", output); - } - fputws_unlocked(tr.first.c_str(), output); - fputws_unlocked(L"$}$", output); - } + } + else + { + if(tr.first[0] == '*') + { + fputws_unlocked(L"^unknown{", output); + fputws_unlocked(tr_wblank.c_str(), output); + fputwc_unlocked(L'^', output); + } + else + { + fputws_unlocked(L"^default{", output); + fputws_unlocked(tr_wblank.c_str(), output); + fputwc_unlocked(L'^', output); + } + fputws_unlocked(tr.first.c_str(), output); + fputws_unlocked(L"$}$", output); + } } banned_rules.clear(); tmpword.clear(); @@ -2473,17 +2571,18 @@ Transfer::applyRule() } else { - blank[i-1] = new string(UtfConverter::toUtf8(*tmpblank[i-1])); + blank[i-1] = new string(UtfConverter::toUtf8(*tmpblank[i-1])); } pair tr; if(useBilingual && preBilingual == false) { tr = fstp.biltransWithQueue(*tmpword[i], false); - wstring refx; + wstring refx,wblankx; word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), UtfConverter::toUtf8(tr.first), UtfConverter::toUtf8(refx), + UtfConverter::toUtf8(wblankx), tr.second); } else if(preBilingual) @@ -2491,6 +2590,7 @@ Transfer::applyRule() wstring sl; wstring tl; wstring ref; + wstring wblank; int seenSlash = 0; for(wstring::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++) @@ -2517,6 +2617,47 @@ Transfer::applyRule() } continue; } + else if(*it == L'[') + { + if(*(it+1) == L'[') //wordbound blank + { + while(true) + { + if(*it == L'\\') + { + wblank.push_back(*it); + it++; + wblank.push_back(*it); + } + else if(*it == L'^' && *(it-1) == L']' && *(it-2) == L']') + { + break; + } + else + { + wblank.push_back(*it); + } + + it++; + } + } + else + { + if(seenSlash == 0) + { + sl.push_back(*it); + } + else if(seenSlash == 1) + { + tl.push_back(*it); + } + else + { + ref.push_back(*it); + } + } + continue; + } if(*it == L'/') { @@ -2542,15 +2683,17 @@ Transfer::applyRule() word[i] = new TransferWord(UtfConverter::toUtf8(sl), UtfConverter::toUtf8(tr.first), UtfConverter::toUtf8(ref), + UtfConverter::toUtf8(wblank), tr.second); } else // neither useBilingual nor preBilingual (sl==tl) { tr = pair(*tmpword[i], false); - wstring refx; + wstring refx,wblankx; word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), UtfConverter::toUtf8(tr.first), - UtfConverter::toUtf8(refx), + UtfConverter::toUtf8(refx), + UtfConverter::toUtf8(wblankx), tr.second); } } @@ -2598,7 +2741,33 @@ Transfer::applyWord(wstring const &word_str) i++; ms.step(towlower(word_str[i]), any_char); break; - + + case L'[': + if(word_str[i+1] == L'[') + { + while(true) + { + if(word_str[i] == L'\\') + { + i++; + } + else if(i >= 4) + { + if(word_str[i] == L'^' && word_str[i-1] == L']' && word_str[i-2] == L']') + { + break; + } + } + + i++; + } + } + else + { + ms.step(towlower(word_str[i]), any_char); + } + break; + case L'/': i = limit; break; @@ -2606,12 +2775,6 @@ Transfer::applyWord(wstring const &word_str) case L'<': for(unsigned int j = i+1; j != limit; j++) { - if(word_str[j] == L':') //if secondary tags reached, discard current tag and stop processing word - { - i = limit; - break; - } - if(word_str[j] == L'>') { int symbol = alphabet(word_str.substr(i, j-i+1)); diff --git a/apertium/transfer.h b/apertium/transfer.h index 79ae6fd..ca3bfad 100644 --- a/apertium/transfer.h +++ b/apertium/transfer.h @@ -62,15 +62,16 @@ private: vector tmpword; vector tmpblank; - //for secondary tags - bool in_lu; //flag to denote that lu is being processed - string secondary_tags; //stores secondary tags of the LU that is being processed - bool in_let_var; //flag to denote that a var in let is being processed (or in append) + bool in_lu; + bool in_let_var; string var_val; //stores the name of the variable being processed (in let or append) - map var_secondary_tags; //map variable name to secondary tags of the word it takes lem/lemh from - map var_has_lemq; //map variable name to bool->true if variable clips lemq + + bool in_wblank; + string out_wblank; + map var_out_wblank; bool gettingLemmaFromWord(string attr); + string combineWblanks(string wblank_current, string wblank_to_add); FSTProcessor fstp; FSTProcessor extended; diff --git a/apertium/transfer_data.cc b/apertium/transfer_data.cc index c2a81e9..78015f3 100644 --- a/apertium/transfer_data.cc +++ b/apertium/transfer_data.cc @@ -50,8 +50,7 @@ TransferData::TransferData() attr_items[L"lemq"] = L"\\#[- _][^<]+"; attr_items[L"lemh"] = L"^(([^<#]|\"\\<\"|\"\\#\")+)"; attr_items[L"whole"] = L"(.+)"; - attr_items[L"tags"] = L"((<[^:>]+>)+)"; //match all tags excluding secondary tags - attr_items[L"sectags"] = L"((<[^>]+:[^>]+>)+)"; //match all secondary tags + attr_items[L"tags"] = L"((<[^>]+>)+)"; attr_items[L"chname"] = L"({([^/]+)\\/)"; // includes delimiters { and / !!! attr_items[L"chcontent"] = L"(\\{.+)"; attr_items[L"content"] = L"(\\{.+)"; diff --git a/apertium/transfer_word.cc b/apertium/transfer_word.cc index 33d2f4b..86862fe 100644 --- a/apertium/transfer_word.cc +++ b/apertium/transfer_word.cc @@ -26,6 +26,7 @@ TransferWord::copy(TransferWord const &o) s_str = o.s_str; t_str = o.t_str; r_str = o.r_str; + b_str = o.b_str; queue_length = o.queue_length; } @@ -39,9 +40,9 @@ queue_length(0) { } -TransferWord::TransferWord(string const &src, string const &tgt, string const &ref, int queue) +TransferWord::TransferWord(string const &src, string const &tgt, string const &ref, string const &blank, int queue) { - init(src, tgt, ref); + init(src, tgt, ref, blank); queue_length = queue; } @@ -67,11 +68,12 @@ TransferWord::operator =(TransferWord const &o) } void -TransferWord::init(string const &src, string const &tgt, string const &ref) +TransferWord::init(string const &src, string const &tgt, string const &ref, string const &blank) { s_str = src; t_str = tgt; r_str = ref; + b_str = blank; } string @@ -113,6 +115,12 @@ TransferWord::reference(ApertiumRE const &part, bool with_queue) } } +string +TransferWord::blank() +{ + return b_str; +} + bool TransferWord::setSource(ApertiumRE const &part, string const &value, bool with_queue) diff --git a/apertium/transfer_word.h b/apertium/transfer_word.h index d14ed16..3ff35d8 100644 --- a/apertium/transfer_word.h +++ b/apertium/transfer_word.h @@ -45,6 +45,11 @@ private: * Reference word */ string r_str; + + /** + * Wordbound blank + */ + string b_str; /** * Queue length @@ -100,9 +105,10 @@ public: * @param src source word * @param tgt target word * @param ref reference word + * @param blank wordbound blank * @param queue queue lenght */ - TransferWord(string const &src, string const &tgt, string const &ref, int queue = 0); + TransferWord(string const &src, string const &tgt, string const &ref, string const &blank, int queue = 0); /** * Assignment operator @@ -117,8 +123,9 @@ public: * @param src source word * @param tgt target word * @param ref reference word + * @param blank wordbound blank */ - void init(string const &src, string const &tgt, string const &ref); + void init(string const &src, string const &tgt, string const &ref, string const &blank); /** * Reference a source language word part @@ -143,6 +150,12 @@ public: * @returns reference to the part of string matched */ string reference(ApertiumRE const &part, bool with_queue = true); + + /** + * Reference the wordbound blank part + * @returns reference to the wordbound blank + */ + string blank(); /** * Sets a value for a source language word part diff --git a/tests/data/apertium-nno-nob.nno-nob.t3x b/tests/data/apertium-nno-nob.nno-nob.t3x index 519c465..a52713d 100644 --- a/tests/data/apertium-nno-nob.nno-nob.t3x +++ b/tests/data/apertium-nno-nob.nno-nob.t3x @@ -14,6 +14,9 @@ + + + @@ -133,5 +136,24 @@ + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/apertium-nno-nob.nob-nno.t1x b/tests/data/apertium-nno-nob.nob-nno.t1x index e516d6e..ba6ab3d 100644 --- a/tests/data/apertium-nno-nob.nob-nno.t1x +++ b/tests/data/apertium-nno-nob.nob-nno.t1x @@ -9,6 +9,9 @@ + + + @@ -84,5 +87,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/postchunk/__init__.py b/tests/postchunk/__init__.py index 0de8c3e..52c9ef3 100644 --- a/tests/postchunk/__init__.py +++ b/tests/postchunk/__init__.py @@ -107,6 +107,9 @@ class UseMacroPostchunkTest(PostchunkTest): inputs = ["^thing{^thing$}$"] expectedOutputs = ["^thing$"] +class WordboundBlankTest(PostchunkTest): + inputs = ["^n_n{[[t:b:123456]]^worda$ ;[testblank] [[t:s:xyzab12]]^wordb# xyz$}$"] + expectedOutputs = ["[[t:s:xyzab12]]^wordb# xyz$ ;[testblank] [[t:b:123456]]^worda$ [[t:b:123456; t:s:xyzab12]]^worda+wordb# xyz$"] class BincompatTest(SimplePostchunkTest): bindata = "data/bincompat.t3x.bin" diff --git a/tests/transfer/__init__.py b/tests/transfer/__init__.py index 94144be..4eda9c1 100644 --- a/tests/transfer/__init__.py +++ b/tests/transfer/__init__.py @@ -95,6 +95,9 @@ class SlLemqTest(TransferTest): inputs = ["^skyldes/komme# av$"] expectedOutputs = ["sl-lemq:'' tl-lemq:'# av'"] +class WordboundBlankTest(TransferTest): + inputs = ["[blank1] [[t:s:123456]]^worda/wordta$ ;[blank2]; [[t:b:xyz123; t:l:xyz347]]^wordb/wordtb$ [blank3]; [[t:i:abc123; t:s:abc123]]^hun/ho$"] + expectedOutputs = ["[blank1] ^prn{[[t:i:abc123; t:s:abc123]]^ho$ [[t:b:xyz123; t:l:xyz347]]^wordtb$}$ ;[blank2]; ^det{[[t:s:123456; t:i:abc123; t:s:abc123]]^wordta+ho$}$ [blank3]; "] class BincompatTest(BasicTransferTest): bindata = "data/bincompat.t1x.bin"