commit 8b02e2259666a198556208106d7f9817c6efd59f Author: Tanmai Khanna Date: Fri Jun 26 03:00:34 2020 +0530 parsing wordbound blanks as part of word | ignoring them when doing pattern matching etc. | adding it as a side in transferword diff --git a/apertium/transfer.cc b/apertium/transfer.cc index 9f60372..e450c76 100644 --- a/apertium/transfer.cc +++ b/apertium/transfer.cc @@ -2079,21 +2079,47 @@ Transfer::readToken(FILE *in) content += L'['; while(true) { - int val2 = fgetwc_unlocked(in); - if(val2 == L'\\') - { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); - } - else if(val2 == L']') - { - content += L']'; - break; - } - else - { - content += wchar_t(val2); - } + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L'[') + { //wordbound blank + content += L'['; + + while(true) + { + int val3 = fgetwc_unlocked(in); + if(val3 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val3 == L'$') //[[..]]^..$ is the LU + { + return input_buffer.add(TransferToken(content, tt_word)); + } + else if(val3 == L'\0' && null_flush) + { + fflush(output); + } + else + { + content += wchar_t(val3); + } + } + } + else if(val2 == L']') + { + content += L']'; + break; + } + else + { + content += wchar_t(val2); + } } } else if(val == L'$') @@ -2266,6 +2292,7 @@ Transfer::transfer(FILE *in, FILE *out) } pair tr; + wstring tr_wblank; if(useBilingual && preBilingual == false) { if(isExtended && (*tmpword[0])[0] == L'*') @@ -2290,6 +2317,7 @@ Transfer::transfer(FILE *in, FILE *out) wstring sl; wstring tl; wstring ref; + wstring wblank; int seenSlash = 0; for(wstring::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++) @@ -2316,6 +2344,30 @@ Transfer::transfer(FILE *in, FILE *out) } continue; } + else if(*it == L'[' && *(it+1) == L'[') + { + while(true) + { + if(*it == L'\\') + { + wblank.push_back(*it); + it++; + wblank.push_back(*it); + } + else if(*it == L'^' && *(it-1) == L']' && *(it-2) == L']') + { + break; + } + else + { + wblank.push_back(*it); + } + + it++; + } + + continue; + } else if(*it == L'/') { seenSlash++; @@ -2338,6 +2390,7 @@ Transfer::transfer(FILE *in, FILE *out) } //tmpword[0]->assign(sl); tr = pair(tl, false); + tr_wblank = wblank; //wcerr << L"pb: " << *tmpword[0] << L" :: " << sl << L" >> " << tl << endl ; } else @@ -2347,6 +2400,7 @@ Transfer::transfer(FILE *in, FILE *out) if(tr.first.size() != 0) { + fputws_unlocked(tr_wblank.c_str(), output); if(defaultAttrs == lu) { fputwc_unlocked(L'^', output); @@ -2480,10 +2534,11 @@ Transfer::applyRule() if(useBilingual && preBilingual == false) { tr = fstp.biltransWithQueue(*tmpword[i], false); - wstring refx; + wstring refx,wblankx; word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), UtfConverter::toUtf8(tr.first), UtfConverter::toUtf8(refx), + UtfConverter::toUtf8(wblankx), tr.second); } else if(preBilingual) @@ -2491,6 +2546,7 @@ Transfer::applyRule() wstring sl; wstring tl; wstring ref; + wstring wblank; int seenSlash = 0; for(wstring::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++) @@ -2517,6 +2573,30 @@ Transfer::applyRule() } continue; } + else if(*it == L'[' && *(it+1) == L'[') + { + while(true) + { + if(*it == L'\\') + { + wblank.push_back(*it); + it++; + wblank.push_back(*it); + } + else if(*it == L'^' && *(it-1) == L']' && *(it-2) == L']') + { + break; + } + else + { + wblank.push_back(*it); + } + + it++; + } + + continue; + } if(*it == L'/') { @@ -2542,15 +2622,17 @@ Transfer::applyRule() word[i] = new TransferWord(UtfConverter::toUtf8(sl), UtfConverter::toUtf8(tr.first), UtfConverter::toUtf8(ref), + UtfConverter::toUtf8(wblank), tr.second); } else // neither useBilingual nor preBilingual (sl==tl) { tr = pair(*tmpword[i], false); - wstring refx; + wstring refx,wblankx; word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), UtfConverter::toUtf8(tr.first), - UtfConverter::toUtf8(refx), + UtfConverter::toUtf8(refx), + UtfConverter::toUtf8(wblankx), tr.second); } } @@ -2598,7 +2680,30 @@ Transfer::applyWord(wstring const &word_str) i++; ms.step(towlower(word_str[i]), any_char); break; - + + case L'[': + if(word_str[i+1] == L'[') + { + while(true) + { + if(word_str[i] == L'\\') + { + i++; + } + else if(word_str[i] == L'^' && word_str[i-1] == L']' && word_str[i-2] == L']') + { + break; + } + + i++; + } + } + else + { + ms.step(towlower(word_str[i]), any_char); + } + break; + case L'/': i = limit; break; diff --git a/apertium/transfer_word.cc b/apertium/transfer_word.cc index 33d2f4b..0bed7c3 100644 --- a/apertium/transfer_word.cc +++ b/apertium/transfer_word.cc @@ -26,6 +26,7 @@ TransferWord::copy(TransferWord const &o) s_str = o.s_str; t_str = o.t_str; r_str = o.r_str; + b_str = o.b_str; queue_length = o.queue_length; } @@ -39,9 +40,9 @@ queue_length(0) { } -TransferWord::TransferWord(string const &src, string const &tgt, string const &ref, int queue) +TransferWord::TransferWord(string const &src, string const &tgt, string const &ref, string const &blank, int queue) { - init(src, tgt, ref); + init(src, tgt, ref, blank); queue_length = queue; } @@ -67,11 +68,12 @@ TransferWord::operator =(TransferWord const &o) } void -TransferWord::init(string const &src, string const &tgt, string const &ref) +TransferWord::init(string const &src, string const &tgt, string const &ref, string const &blank) { s_str = src; t_str = tgt; r_str = ref; + b_str = blank; } string @@ -113,6 +115,19 @@ TransferWord::reference(ApertiumRE const &part, bool with_queue) } } +string +TransferWord::blank(ApertiumRE const &part, bool with_queue) +{ + if(with_queue) //TODO test removing + { + return part.match(b_str); + } + else + { + return part.match(b_str.substr(0, b_str.size() - queue_length)); + } +} + bool TransferWord::setSource(ApertiumRE const &part, string const &value, bool with_queue) diff --git a/apertium/transfer_word.h b/apertium/transfer_word.h index d14ed16..8755856 100644 --- a/apertium/transfer_word.h +++ b/apertium/transfer_word.h @@ -45,6 +45,11 @@ private: * Reference word */ string r_str; + + /** + * Wordbound blank + */ + string b_str; /** * Queue length @@ -100,9 +105,10 @@ public: * @param src source word * @param tgt target word * @param ref reference word + * @param blank wordbound blank * @param queue queue lenght */ - TransferWord(string const &src, string const &tgt, string const &ref, int queue = 0); + TransferWord(string const &src, string const &tgt, string const &ref, string const &blank, int queue = 0); /** * Assignment operator @@ -117,8 +123,9 @@ public: * @param src source word * @param tgt target word * @param ref reference word + * @param blank wordbound blank */ - void init(string const &src, string const &tgt, string const &ref); + void init(string const &src, string const &tgt, string const &ref, string const &blank); /** * Reference a source language word part @@ -143,6 +150,14 @@ public: * @returns reference to the part of string matched */ string reference(ApertiumRE const &part, bool with_queue = true); + + /** + * Reference the wordbound blank part + * @param part regular expression to match + * @param with_queue access taking into account the queue + * @returns reference to the part of string matched + */ + string blank(ApertiumRE const &part, bool with_queue = true); /** * Sets a value for a source language word part