commit 776e439fb1d2a940ff3f75a84019de610e4969d2 Author: Tanmai Khanna Date: Thu Jul 16 02:37:04 2020 +0530 Wordbound blanks parsed as normal blanks (#101) Parsing wordbound blanks as normal blanks for analysis, generation, biltrans, postgeneration; Removed unused code for secondary tags; Added a test for wordbound blank analysis diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 84d9f79..103c339 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -238,6 +238,42 @@ FSTProcessor::readFullBlock(FILE *input, wchar_t const delim1, wchar_t const del return result; } +wstring +FSTProcessor::readWblank(FILE *input) +{ + wstring result = L""; + result += L"[["; + wchar_t c; + + while(!feof(input)) + { + c = static_cast(fgetwc_unlocked(input)); + result += c; + + if(c == L'\\') + { + result += static_cast(readEscaped(input)); + } + else if(c == L']') + { + c = static_cast(fgetwc_unlocked(input)); + result += c; + + if(c == L']') + { + break; + } + } + } + + if(c != L']') + { + streamError(); + } + + return result; +} + int FSTProcessor::readAnalysis(FILE *input) { @@ -269,7 +305,18 @@ FSTProcessor::readAnalysis(FILE *input) return altval; case L'[': - blankqueue.push(readFullBlock(input, L'[', L']')); + val = static_cast(fgetwc_unlocked(input)); + + if(val == L'[') + { + blankqueue.push(readWblank(input)); + } + else + { + ungetc(val, input); + blankqueue.push(readFullBlock(input, L'[', L']')); + } + input_buffer.add(static_cast(L' ')); return static_cast(L' '); @@ -320,7 +367,18 @@ FSTProcessor::readTMAnalysis(FILE *input) return altval; case L'[': - blankqueue.push(readFullBlock(input, L'[', L']')); + val = static_cast(fgetwc_unlocked(input)); + + if(val == L'[') + { + blankqueue.push(readWblank(input)); + } + else + { + ungetc(val, input); + blankqueue.push(readFullBlock(input, L'[', L']')); + } + input_buffer.add(static_cast(L' ')); isLastBlankTM = true; return static_cast(L' '); @@ -389,7 +447,18 @@ FSTProcessor::readPostgeneration(FILE *input) return altval; case L'[': - blankqueue.push(readFullBlock(input, L'[', L']')); + val = static_cast(fgetwc_unlocked(input)); + + if(val == L'[') + { + blankqueue.push(readWblank(input)); + } + else + { + ungetc(val, input); + blankqueue.push(readFullBlock(input, L'[', L']')); + } + input_buffer.add(static_cast(L' ')); return static_cast(L' '); @@ -517,76 +586,31 @@ FSTProcessor::readGeneration(FILE *input, FILE *output) wstring cad = L""; cad += static_cast(val); - bool isSecondaryTag = false; - while((val = fgetwc_unlocked(input)) != L'>') { if(feof(input)) { streamError(); } - if(val == L':') - { - isSecondaryTag = true; - break; - } cad += static_cast(val); } cad += static_cast(val); - if(isSecondaryTag) + return alphabet(cad); + } + else if(val == L'[') + { + val = fgetwc_unlocked(input); + if(val == L'[') { - while(true) - { - val = fgetwc_unlocked(input); - - if(feof(input)) - { - streamError(); - } - - if(val == L'\\') - { - val = fgetwc_unlocked(input); - continue; - } - - if(isSecondaryTag) - { - if(val == L'>') - { - isSecondaryTag = false; - } - } - else - { - if(val == L'<') - { - isSecondaryTag = true; - } - else if(val == L'$') - { - break; - } - else - { - return static_cast(val); - } - } - } - - outOfWord = true; - return static_cast(L'$'); + fputws_unlocked(readWblank(input).c_str(), output); } else { - return alphabet(cad); + ungetc(val, input); + fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); } - } - else if(val == L'[') - { - fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); return readGeneration(input, output); } else @@ -681,7 +705,17 @@ FSTProcessor::readBilingual(FILE *input, FILE *output) } else if(val == L'[') { - fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); + val = fgetwc_unlocked(input); + if(val == L'[') + { + fputws_unlocked(readWblank(input).c_str(), output); + } + else + { + ungetc(val, input); + fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); + } + return readBilingual(input, output); } diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index ec95de8..3276151 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -251,6 +251,12 @@ private: * @param delim1 the delimiter of the end of the sequence */ wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); + + /** + * Reads a wordbound blank from the stream input + * @param input the stream being read + */ + wstring readWblank(FILE *input); /** * Returns true if the character code is identified as alphabetic diff --git a/tests/data/wordbound-blank.dix b/tests/data/wordbound-blank.dix new file mode 100644 index 0000000..9337606 --- /dev/null +++ b/tests/data/wordbound-blank.dix @@ -0,0 +1,33 @@ + + + ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÄÅÆÇÈÉÊËÍÑÒÓÔÕÖØÙÚÜČĐŊŠŦŽabcdefghijklmnopqrstuvwxyzàáâäåæçèéêëíñòóôõöøùúüčđŋšŧž­- + + + + + + + + + + + +
+

legge legge

+

leggeopptil leggeopptil

+

opp opp

+ +

leggesegopptil leggesegopptil

+

seg seg

+ +

St.Petersburg St.Petersburg

+

Xy Xy

+

F F

+

G G

+
+ +
+

. .

+
+ +
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index ba77a21..5455c48 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -130,5 +130,14 @@ class CatMultipleFstsTransducer(unittest.TestCase, ProcTest): inputs = ["cat", "cats"] expectedOutputs = ["^cat/cat+n/cat+v$", "^cats/cat+n+$"] +class WordboundBlankAnalysisTest(unittest.TestCase, ProcTest): + procdix = "data/wordbound-blank.dix" + inputs = ["x [[t:i:123456]]opp.", + "[[t:b:456123; t:i:90hfbn]]legge [[t:s:xyz789]]opp opp [[t:b:abc124]]x opp.", + ] + expectedOutputs = ["^x/*x$ [[t:i:123456]]^opp/opp$^./.$", + "[[t:b:456123; t:i:90hfbn]]^legge/legge$ [[t:s:xyz789]]^opp/opp$ ^opp/opp$ [[t:b:abc124]]^x/*x$ ^opp/opp$^./.$", + ] + # These fail on some systems: #from null_flush_invalid_stream_format import *