commit b164be07371be852949941484b127f99ac864c8b Author: Tanmai Khanna Date: Sat Aug 29 13:34:41 2020 +0530 blank handling in Interchunk | tests added | dont flush blanks if they're spaces diff --git a/apertium/interchunk.cc b/apertium/interchunk.cc index f03b550..d1df32d 100644 --- a/apertium/interchunk.cc +++ b/apertium/interchunk.cc @@ -46,9 +46,8 @@ Interchunk::destroy() Interchunk::Interchunk() : word(0), -blank(0), lword(0), -lblank(0), +last_lword(0), output(0), any_char(0), any_tag(0), @@ -62,7 +61,6 @@ nwords(0) null_flush = false; internal_null_flush = false; trace = false; - emptyblank = ""; } Interchunk::~Interchunk() @@ -270,11 +268,15 @@ Interchunk::evalString(xmlNode *element) return ti.getContent(); case ti_b: - if(ti.getPos() >= 0 && checkIndex(element, ti.getPos(), lblank)) + if(!blank_queue.empty()) { - return !blank?"":*(blank[ti.getPos()]); + string retblank = blank_queue.front(); + blank_queue.pop(); + + return retblank; } - else { + else + { return " "; } break; @@ -420,6 +422,15 @@ Interchunk::processOut(xmlNode *localroot) } } } + + while(!blank_queue.empty()) //flush remaining blanks that are not spaces + { + if(blank_queue.front().compare(" ") != 0) + { + fputws_unlocked(UtfConverter::fromUtf8(blank_queue.front()).c_str(), output); + } + blank_queue.pop(); + } } string @@ -649,38 +660,19 @@ Interchunk::processCallMacro(xmlNode *localroot) { myword = new InterchunkWord *[npar]; } - string **myblank = NULL; - if(npar > 0) - { - myblank = new string *[npar]; - myblank[npar-1] = &emptyblank; - } int idx = 0; - int lastpos = 0; for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next) { if(i->type == XML_ELEMENT_NODE) { int pos = atoi((const char *) i->properties->children->content)-1; myword[idx] = word[pos]; - if(idx-1 >= 0) - { - if(lastpos + 1 > lblank) { // if a 1-pattern rule calls macro with same - noblank = ""; // param twice the blank array will be empty - myblank[idx-1] = &noblank; - } - else { - myblank[idx-1] = blank[lastpos]; - } - } idx++; - lastpos = pos; } } swap(myword, word); - swap(myblank, blank); swap(npar, lword); for(xmlNode *i = macro->children; i != NULL; i = i->next) @@ -692,11 +684,9 @@ Interchunk::processCallMacro(xmlNode *localroot) } swap(myword, word); - swap(myblank, blank); swap(npar, lword); delete[] myword; - delete[] myblank; } void @@ -1481,6 +1471,8 @@ Interchunk::interchunk(FILE *in, FILE *out) size_t lastrule_line = rule_lines[val-1]; lastrule = rule_map[val-1]; last = input_buffer.getPos(); + + last_lword = tmpword.size(); if(trace) { @@ -1543,20 +1535,13 @@ Interchunk::applyRule() { word = new InterchunkWord *[limit]; lword = limit; - if(limit != 1) - { - blank = new string *[limit - 1]; - lblank = limit - 1; - } - else - { - blank = NULL; - lblank = 0; - } } else { - blank[i-1] = new string(UtfConverter::toUtf8(*tmpblank[i-1])); + if(int(blank_queue.size()) < last_lword - 1) + { + blank_queue.push(string(UtfConverter::toUtf8(*tmpblank[i-1]))); + } } word[i] = new InterchunkWord(UtfConverter::toUtf8(*tmpword[i])); @@ -1573,16 +1558,8 @@ Interchunk::applyRule() } delete[] word; } - if(blank) - { - for(unsigned int i = 0; i != limit - 1; i++) - { - delete blank[i]; - } - delete[] blank; - } + word = NULL; - blank = NULL; tmpword.clear(); tmpblank.clear(); ms.init(me->getInitial()); diff --git a/apertium/interchunk.h b/apertium/interchunk.h index 02fea7c..49ee2a0 100644 --- a/apertium/interchunk.h +++ b/apertium/interchunk.h @@ -34,6 +34,7 @@ #include #include #include +#include using namespace std; @@ -55,9 +56,9 @@ private: xmlDoc *doc; xmlNode *root_element; InterchunkWord **word; - string **blank; - int lword, lblank; - string noblank = ""; + queue blank_queue; + int lword; + int last_lword; Buffer input_buffer; vector tmpword; vector tmpblank; diff --git a/apertium/postchunk.cc b/apertium/postchunk.cc index ee18d08..d0b07d4 100644 --- a/apertium/postchunk.cc +++ b/apertium/postchunk.cc @@ -666,9 +666,12 @@ Postchunk::processOut(xmlNode *localroot) } } - while(!blank_queue.empty()) //flush remaining blanks + while(!blank_queue.empty()) //flush remaining blanks that are not spaces { - fputws_unlocked(UtfConverter::fromUtf8(blank_queue.front()).c_str(), output); + if(blank_queue.front().compare(" ") != 0) + { + fputws_unlocked(UtfConverter::fromUtf8(blank_queue.front()).c_str(), output); + } blank_queue.pop(); } } diff --git a/apertium/transfer.cc b/apertium/transfer.cc index a391f5f..b25acb6 100644 --- a/apertium/transfer.cc +++ b/apertium/transfer.cc @@ -881,9 +881,12 @@ Transfer::processOut(xmlNode *localroot) } } - while(!blank_queue.empty()) //flush remaining blanks + while(!blank_queue.empty()) //flush remaining blanks that are not spaces { - fputws_unlocked(UtfConverter::fromUtf8(blank_queue.front()).c_str(), output); + if(blank_queue.front().compare(" ") != 0) + { + fputws_unlocked(UtfConverter::fromUtf8(blank_queue.front()).c_str(), output); + } blank_queue.pop(); } } @@ -2597,7 +2600,7 @@ Transfer::applyRule() } else { - if(int(blank_queue.size()) + 1 < last_lword) + if(int(blank_queue.size()) < last_lword - 1) { blank_queue.push(string(UtfConverter::toUtf8(*tmpblank[i-1]))); } diff --git a/tests/data/apertium-nno-nob.nno-nob.t2x b/tests/data/apertium-nno-nob.nno-nob.t2x index 1778747..c97adf2 100644 --- a/tests/data/apertium-nno-nob.nno-nob.t2x +++ b/tests/data/apertium-nno-nob.nno-nob.t2x @@ -219,6 +219,28 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/nno-nob.t2x.bin b/tests/data/nno-nob.t2x.bin index 1411e9a..a437354 100644 Binary files a/tests/data/nno-nob.t2x.bin and b/tests/data/nno-nob.t2x.bin differ diff --git a/tests/interchunk/__init__.py b/tests/interchunk/__init__.py index a498147..14ff9c0 100644 --- a/tests/interchunk/__init__.py +++ b/tests/interchunk/__init__.py @@ -104,14 +104,16 @@ class SuperblankTest(InterchunkTest): "[blank1];; ^test2{^worda$}$ ;[blank2] ^test2{^wordb# xyz$}$ ;[blank3]; ^test3{^wordc# xyz$}$ [blank4];;", #superblankrule2 -> When output rule has more than input blanks, print all then spaces "[blank1];; ^test3{^worda$}$ ;[blank2] ^test2{^wordb# xyz$}$ ;[blank3]; ^test1{^wordc# xyz$}$ [blank4];;", #superblankrule3 -> Output rule has no , flush all blanks after rule output "[blank1];; ^test1{^worda$}$ ;[blank2] ^test3{^wordb# xyz$}$ ;[blank3]; ^test2{^wordc# xyz$}$ [blank4];;", #superblankrule4 -> Output rule has one , print one blank, then flush all after rule output - "[blank1];; ^test1{^worda$}$ ;[blank2] ^test2{^wordb# xyz$}$ ;[blank3]; ^test3{^wordc# xyz$}$ [blank4];; ^test1{^worda$}$ ;[blank5] ^test3{^wordb# xyz$}$ ;[blank6]; ^test2{^wordc# xyz$}$ [blank7];;"] #Multiple matching rules -> superblankrule1 & superblankrule4 + "[blank1];; ^test1{^worda$}$ ;[blank2] ^test2{^wordb# xyz$}$ ;[blank3]; ^test3{^wordc# xyz$}$ [blank4];; ^test1{^worda$}$ ;[blank5] ^test3{^wordb# xyz$}$ ;[blank6]; ^test2{^wordc# xyz$}$ [blank7];;", #Multiple matching rules -> superblankrule1 & superblankrule4 + "[blank1];; ^test1{^worda$}$ ;[blank2] ^test2{^wordb# xyz$}$ ;[blank3]; ^test2x{^wordc# xyz$}$ [blank4];; ^test2{^wordb# xyz$}$ ;[blank5];"] #Rule followed by unknown expectedOutputs = [ "[blank1];; ^test2{^wordb# xyz$}$ ;[blank2] ^test1{^worda$}$ ;[blank3]; ^test3{^wordc# xyz$}$ [blank4];;", "[blank1];; ^test1{^worda$}$ ;[blank2] ^test2{^wordb# xyz$}$ ;[blank3]; ^test3{^wordc# xyz$}$ [blank4];;", "[blank1];; ^test2{^wordb# xyz$}$ ;[blank2] ^test2{^worda$}$ ;[blank3]; ^test2{^wordb# xyz$}$ ^test2{^worda$}$ ^test3{^wordc# xyz$}$ [blank4];;", "[blank1];; ^test2{^wordb# xyz$}$^test3{^worda$}$^test1{^wordc# xyz$}$ ;[blank2] ;[blank3]; [blank4];;", "[blank1];; ^test3{^wordb# xyz$}$^test1{^worda$}$ ;[blank2] ^test2{^wordc# xyz$}$ ;[blank3]; [blank4];;", - "[blank1];; ^test2{^wordb# xyz$}$ ;[blank2] ^test1{^worda$}$ ;[blank3]; ^test3{^wordc# xyz$}$ [blank4];; ^test3{^wordb# xyz$}$^test1{^worda$}$ ;[blank5] ^test2{^wordc# xyz$}$ ;[blank6]; [blank7];;"] + "[blank1];; ^test2{^wordb# xyz$}$ ;[blank2] ^test1{^worda$}$ ;[blank3]; ^test3{^wordc# xyz$}$ [blank4];; ^test3{^wordb# xyz$}$^test1{^worda$}$ ;[blank5] ^test2{^wordc# xyz$}$ ;[blank6]; [blank7];;", + "[blank1];; ^test2{^wordb# xyz$}$ ;[blank2] ^test1{^worda$}$ ;[blank3]; ^test2x{^wordc# xyz$}$ [blank4];; ^test2{^wordb# xyz$}$ ;[blank5];"] class BincompatTest(SimpleInterchunkTest):