commit 22f903ad46688f98820a112bc83678b6beb292f9 Author: Tanmai Khanna Date: Mon Jul 20 13:42:30 2020 +0530 Output wordbound blank automatically if there's only one LU in the matching pattern (#94) Output wordbound blank when no rule matches in single stage transfer; If matched pattern has one LU, blank goes on all output LUs automatically; If input chunk in postchunk rule has one LU, blank outputs on all LUs in rule output; Add tests diff --git a/apertium/interchunk_word.cc b/apertium/interchunk_word.cc index fc7b000..d5c84ff 100644 --- a/apertium/interchunk_word.cc +++ b/apertium/interchunk_word.cc @@ -89,7 +89,7 @@ InterchunkWord::init(string const &chunk) if(b_end > 0) { - this->blank = chunk.substr(0, b_end); + this->wblank = chunk.substr(0, b_end); this->chunk = chunk.substr(b_end); } else @@ -126,9 +126,9 @@ InterchunkWord::chunkPart(ApertiumRE const &part) } string -InterchunkWord::getBlank() +InterchunkWord::getWblank() { - return blank; + return wblank; } bool diff --git a/apertium/interchunk_word.h b/apertium/interchunk_word.h index 8608b7d..db0d56b 100644 --- a/apertium/interchunk_word.h +++ b/apertium/interchunk_word.h @@ -43,7 +43,7 @@ private: /** * Wordbound blank (for postchunk) */ - string blank; + string wblank; /** * Copy method @@ -100,9 +100,9 @@ public: /** * Reference the wordbound blank (for postchunk) - * @returns reference to the part of string matched + * @returns reference to the wblank string */ - string getBlank(); + string getWblank(); /** * Sets a value for a chunk part diff --git a/apertium/postchunk.cc b/apertium/postchunk.cc index 5a27a4a..1ac8110 100644 --- a/apertium/postchunk.cc +++ b/apertium/postchunk.cc @@ -315,15 +315,15 @@ Postchunk::evalString(xmlNode *element) case ti_clip_tl: if(checkIndex(element, ti.getPos(), lword)) { - if(gettingLemmaFromWord(ti.getContent())) + if(gettingLemmaFromWord(ti.getContent()) && lword > 1) { if(in_lu) { - out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getBlank()); + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); } else if(in_let_var) { - var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getBlank()); + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); } } @@ -335,7 +335,10 @@ Postchunk::evalString(xmlNode *element) return StringUtils::itoa_string(tmpword.size()); case ti_var: + if(lword > 1) + { out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); + } return variables[ti.getContent()]; @@ -483,6 +486,11 @@ Postchunk::evalString(xmlNode *element) } in_lu = false; + + if(lword == 1) + { + out_wblank = word[1]->getWblank(); + } if(myword != "") { @@ -536,6 +544,11 @@ Postchunk::evalString(xmlNode *element) value.append(myword); } } + + if(lword == 1) + { + out_wblank = word[1]->getWblank(); + } if(value != "") { @@ -579,6 +592,11 @@ Postchunk::processOut(xmlNode *localroot) in_lu = false; + if(lword == 1) + { + out_wblank = word[1]->getWblank(); + } + if(myword != "") { fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); @@ -628,6 +646,11 @@ Postchunk::processOut(xmlNode *localroot) myword.append(mylocalword); } } + + if(lword == 1) + { + out_wblank = word[1]->getWblank(); + } fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); fputwc_unlocked('^', output); diff --git a/apertium/transfer.cc b/apertium/transfer.cc index 0bc0433..49a14ad 100644 --- a/apertium/transfer.cc +++ b/apertium/transfer.cc @@ -54,6 +54,7 @@ word(0), blank(0), lword(0), lblank(0), +last_lword(0), output(0), any_char(0), any_tag(0), @@ -375,15 +376,15 @@ Transfer::evalString(xmlNode *element) case ti_clip_sl: if(checkIndex(element, ti.getPos(), lword)) { - if(gettingLemmaFromWord(ti.getContent())) + if(gettingLemmaFromWord(ti.getContent()) && last_lword > 1) { if(in_lu) { - out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->blank()); + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); } else if(in_let_var) { - var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->blank()); + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); } } @@ -394,15 +395,15 @@ Transfer::evalString(xmlNode *element) case ti_clip_tl: if(checkIndex(element, ti.getPos(), lword)) { - if(gettingLemmaFromWord(ti.getContent())) + if(gettingLemmaFromWord(ti.getContent()) && last_lword > 1) { if(in_lu) { - out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->blank()); + out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank()); } else if(in_let_var) { - var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->blank()); + var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val], word[ti.getPos()]->getWblank()); } } @@ -460,7 +461,10 @@ Transfer::evalString(xmlNode *element) break; case ti_var: - out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); + if(last_lword > 1) + { + out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]); + } return variables[ti.getContent()]; case ti_lit_tag: @@ -674,6 +678,11 @@ Transfer::evalString(xmlNode *element) } in_lu = false; + + if(last_lword == 1) + { + out_wblank = word[0]->getWblank(); + } if(myword != "") { @@ -728,6 +737,11 @@ Transfer::evalString(xmlNode *element) } } + if(last_lword == 1) + { + out_wblank = word[0]->getWblank(); + } + if(value != "") { return out_wblank+"^"+value+"$"; @@ -774,6 +788,11 @@ Transfer::processOut(xmlNode *localroot) } in_lu = false; + + if(last_lword == 1) + { + out_wblank = word[0]->getWblank(); + } if(myword != "") { @@ -825,6 +844,11 @@ Transfer::processOut(xmlNode *localroot) } } + if(last_lword == 1) + { + out_wblank = word[0]->getWblank(); + } + if(myword != "") { fputws_unlocked(UtfConverter::fromUtf8(out_wblank).c_str(), output); @@ -933,7 +957,13 @@ Transfer::processChunk(xmlNode *localroot) myword.append(evalString(j)); } } - in_lu = false; + + in_lu = false; + + if(last_lword == 1) + { + out_wblank = word[0]->getWblank(); + } if(myword != "") { @@ -981,6 +1011,12 @@ Transfer::processChunk(xmlNode *localroot) } myword.append(mylocalword); } + + if(last_lword == 1) + { + out_wblank = word[0]->getWblank(); + } + if(myword != "") { result.append(out_wblank); @@ -2440,6 +2476,7 @@ Transfer::transfer(FILE *in, FILE *out) { if(defaultAttrs == lu) { + fputws_unlocked(tr_wblank.c_str(), output); if(tr.first[0] != L'[' || tr.first[1] != L'[') { fputwc_unlocked(L'^', output); @@ -2494,6 +2531,7 @@ Transfer::transfer(FILE *in, FILE *out) lastrule = rule_map[val-1]; lastrule_id = val; last = input_buffer.getPos(); + last_lword = tmpword.size(); if(trace) { diff --git a/apertium/transfer.h b/apertium/transfer.h index ca3bfad..15b7400 100644 --- a/apertium/transfer.h +++ b/apertium/transfer.h @@ -57,6 +57,7 @@ private: TransferWord **word; string **blank; int lword, lblank; + int last_lword; string noblank = ""; Buffer input_buffer; vector tmpword; diff --git a/apertium/transfer_word.cc b/apertium/transfer_word.cc index 86862fe..51f15eb 100644 --- a/apertium/transfer_word.cc +++ b/apertium/transfer_word.cc @@ -26,7 +26,7 @@ TransferWord::copy(TransferWord const &o) s_str = o.s_str; t_str = o.t_str; r_str = o.r_str; - b_str = o.b_str; + wb_str = o.wb_str; queue_length = o.queue_length; } @@ -40,9 +40,9 @@ queue_length(0) { } -TransferWord::TransferWord(string const &src, string const &tgt, string const &ref, string const &blank, int queue) +TransferWord::TransferWord(string const &src, string const &tgt, string const &ref, string const &wblank, int queue) { - init(src, tgt, ref, blank); + init(src, tgt, ref, wblank); queue_length = queue; } @@ -68,12 +68,12 @@ TransferWord::operator =(TransferWord const &o) } void -TransferWord::init(string const &src, string const &tgt, string const &ref, string const &blank) +TransferWord::init(string const &src, string const &tgt, string const &ref, string const &wblank) { s_str = src; t_str = tgt; r_str = ref; - b_str = blank; + wb_str = wblank; } string @@ -116,9 +116,9 @@ TransferWord::reference(ApertiumRE const &part, bool with_queue) } string -TransferWord::blank() +TransferWord::getWblank() { - return b_str; + return wb_str; } bool diff --git a/apertium/transfer_word.h b/apertium/transfer_word.h index 3ff35d8..03802d8 100644 --- a/apertium/transfer_word.h +++ b/apertium/transfer_word.h @@ -49,7 +49,7 @@ private: /** * Wordbound blank */ - string b_str; + string wb_str; /** * Queue length @@ -105,10 +105,10 @@ public: * @param src source word * @param tgt target word * @param ref reference word - * @param blank wordbound blank + * @param wblank wordbound blank * @param queue queue lenght */ - TransferWord(string const &src, string const &tgt, string const &ref, string const &blank, int queue = 0); + TransferWord(string const &src, string const &tgt, string const &ref, string const &wblank, int queue = 0); /** * Assignment operator @@ -123,9 +123,9 @@ public: * @param src source word * @param tgt target word * @param ref reference word - * @param blank wordbound blank + * @param wblank wordbound blank */ - void init(string const &src, string const &tgt, string const &ref, string const &blank); + void init(string const &src, string const &tgt, string const &ref, string const &wblank); /** * Reference a source language word part @@ -155,7 +155,7 @@ public: * Reference the wordbound blank part * @returns reference to the wordbound blank */ - string blank(); + string getWblank(); /** * Sets a value for a source language word part diff --git a/tests/data/apertium-nno-nob.nno-nob.t3x b/tests/data/apertium-nno-nob.nno-nob.t3x index a52713d..df91750 100644 --- a/tests/data/apertium-nno-nob.nno-nob.t3x +++ b/tests/data/apertium-nno-nob.nno-nob.t3x @@ -17,6 +17,9 @@ + + + @@ -155,5 +158,24 @@ + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/apertium-nno-nob.nob-nno.t1x b/tests/data/apertium-nno-nob.nob-nno.t1x index ba6ab3d..1de6011 100644 --- a/tests/data/apertium-nno-nob.nob-nno.t1x +++ b/tests/data/apertium-nno-nob.nob-nno.t1x @@ -110,7 +110,6 @@ - @@ -118,5 +117,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/postchunk/__init__.py b/tests/postchunk/__init__.py index 52c9ef3..e83de7f 100644 --- a/tests/postchunk/__init__.py +++ b/tests/postchunk/__init__.py @@ -111,6 +111,9 @@ class WordboundBlankTest(PostchunkTest): inputs = ["^n_n{[[t:b:123456]]^worda$ ;[testblank] [[t:s:xyzab12]]^wordb# xyz$}$"] expectedOutputs = ["[[t:s:xyzab12]]^wordb# xyz$ ;[testblank] [[t:b:123456]]^worda$ [[t:b:123456; t:s:xyzab12]]^worda+wordb# xyz$"] +class SingleLUWordboundBlankTest(PostchunkTest): + inputs = ["^thing_wb{^[[t:i:xyzabc]]thing$}$ ^n_n{[[t:b:123456]]^worda$ ;[testblank] [[t:s:xyzab12]]^wordb# xyz$}$ [blanks] ^thing_wb{^[[t:i:xyzabc]]thing$}$ [blankx] ^vblex{[[t:b:123zbc]]^gå$}$^default{^.$}$ [blanks3] ^thing{^[[t:i:xyzabc]]thing$}$"] + expectedOutputs = ["[[t:i:xyzabc]]^newthing$ [[t:i:xyzabc]]^thing$ [[t:i:xyzabc]]^thing+newpr$ [[t:s:xyzab12]]^wordb# xyz$ ;[testblank] [[t:b:123456]]^worda$ [[t:b:123456; t:s:xyzab12]]^worda+wordb# xyz$ [blanks] [[t:i:xyzabc]]^newthing$ [[t:i:xyzabc]]^thing$ [[t:i:xyzabc]]^thing+newpr$ [blankx] [[t:b:123zbc]]^gå$^.$ [blanks3] [[t:i:xyzabc]]^thing$"] class BincompatTest(SimplePostchunkTest): bindata = "data/bincompat.t3x.bin" diff --git a/tests/transfer/__init__.py b/tests/transfer/__init__.py index 4eda9c1..805b0f1 100644 --- a/tests/transfer/__init__.py +++ b/tests/transfer/__init__.py @@ -99,6 +99,11 @@ class WordboundBlankTest(TransferTest): inputs = ["[blank1] [[t:s:123456]]^worda/wordta$ ;[blank2]; [[t:b:xyz123; t:l:xyz347]]^wordb/wordtb$ [blank3]; [[t:i:abc123; t:s:abc123]]^hun/ho$"] expectedOutputs = ["[blank1] ^prn{[[t:i:abc123; t:s:abc123]]^ho$ [[t:b:xyz123; t:l:xyz347]]^wordtb$}$ ;[blank2]; ^det{[[t:s:123456; t:i:abc123; t:s:abc123]]^wordta+ho$}$ [blank3]; "] +class SingleLUWordboundBlankTest(TransferTest): + inputs = ["[blank1] [[t:s:123456]]^worda/wordta$ ;[blank2]; [[t:b:xyz123; t:l:xyz347]]^wordb/wordtb$ [blank3]; "] + expectedOutputs = ["[blank1] ^nacr{[[t:s:123456]]^test$ [[t:s:123456]]^wordta$}$ ^nacr2{[[t:s:123456]]^testlem$ [[t:s:123456]]^wordta+postp$}$ ;[blank2]; ^nacr{[[t:b:xyz123; t:l:xyz347]]^test$ [[t:b:xyz123; t:l:xyz347]]^wordtb$}$ ^nacr2{[[t:b:xyz123; t:l:xyz347]]^testlem$ [[t:b:xyz123; t:l:xyz347]]^wordtb+postp$}$ [blank3]; "] + + class BincompatTest(BasicTransferTest): bindata = "data/bincompat.t1x.bin"