commit ab7eb070490ad178df083abdd431ca2635a499c8 Author: Tanmai Khanna Date: Fri Jun 26 18:08:37 2020 +0530 normal blanks now parse properly diff --git a/apertium/transfer.cc b/apertium/transfer.cc index dc42781..3edb69b 100644 --- a/apertium/transfer.cc +++ b/apertium/transfer.cc @@ -2067,8 +2067,37 @@ Transfer::readToken(FILE *in) int val = fgetwc_unlocked(in); if(feof(in) || (val == 0 && internal_null_flush)) { + in_wblank = false; return input_buffer.add(TransferToken(content, tt_eof)); } + if(in_wblank) + { + content += L"[["; + + while(true) + { + int val3 = fgetwc_unlocked(in); + if(val3 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val3 == L'$') //[[..]]^..$ is the LU + { + in_wblank = false; + return input_buffer.add(TransferToken(content, tt_word)); + } + else if(val3 == L'\0' && null_flush) + { + in_wblank = false; + fflush(output); + } + else + { + content += wchar_t(val3); + } + } + } if(val == '\\') { content += L'\\'; @@ -2087,29 +2116,10 @@ Transfer::readToken(FILE *in) } else if(val2 == L'[') { //wordbound blank - content += L'['; + in_wblank = true; + content.pop_back(); - while(true) - { - int val3 = fgetwc_unlocked(in); - if(val3 == L'\\') - { - content += L'\\'; - content += wchar_t(fgetwc_unlocked(in)); - } - else if(val3 == L'$') //[[..]]^..$ is the LU - { - return input_buffer.add(TransferToken(content, tt_word)); - } - else if(val3 == L'\0' && null_flush) - { - fflush(output); - } - else - { - content += wchar_t(val3); - } - } + return input_buffer.add(TransferToken(content, tt_blank)); } else if(val2 == L']') { @@ -2132,6 +2142,7 @@ Transfer::readToken(FILE *in) } else if(val == L'\0' && null_flush) { + in_wblank = false; fflush(output); } else @@ -2344,28 +2355,45 @@ Transfer::transfer(FILE *in, FILE *out) } continue; } - else if(*it == L'[' && *(it+1) == L'[') + else if(*it == L'[') { - while(true) + if(*(it+1) == L'[') //wordbound blank { - if(*it == L'\\') + while(true) { - wblank.push_back(*it); + if(*it == L'\\') + { + wblank.push_back(*it); + it++; + wblank.push_back(*it); + } + else if(*it == L'^' && *(it-1) == L']' && *(it-2) == L']') + { + break; + } + else + { + wblank.push_back(*it); + } + it++; - wblank.push_back(*it); } - else if(*it == L'^' && *(it-1) == L']' && *(it-2) == L']') + } + else + { + if(seenSlash == 0) + { + sl.push_back(*it); + } + else if(seenSlash == 1) { - break; + tl.push_back(*it); } else { - wblank.push_back(*it); + ref.push_back(*it); } - - it++; } - continue; } else if(*it == L'/') @@ -2527,14 +2555,7 @@ Transfer::applyRule() } else { - if(tmpblank.size() < i-1) - { blank[i-1] = new string(UtfConverter::toUtf8(*tmpblank[i-1])); - } - else - { - blank[i-1] = new string(UtfConverter::toUtf8(L"")); - } } pair tr; diff --git a/apertium/transfer.h b/apertium/transfer.h index 79ae6fd..98a8881 100644 --- a/apertium/transfer.h +++ b/apertium/transfer.h @@ -70,6 +70,8 @@ private: map var_secondary_tags; //map variable name to secondary tags of the word it takes lem/lemh from map var_has_lemq; //map variable name to bool->true if variable clips lemq + bool in_wblank; + bool gettingLemmaFromWord(string attr); FSTProcessor fstp;