commit 2c8bab827d92f8f4755ca867b65c608907f9295d
Author: Tanmai Khanna
Date:   Wed Jul 8 01:38:48 2020 +0530

    Wordbound blanks now distribute across parts | tests added

diff --git a/apertium/pretransfer.cc b/apertium/pretransfer.cc
index d684767..764b3cf 100644
--- a/apertium/pretransfer.cc
+++ b/apertium/pretransfer.cc
@@ -8,6 +8,59 @@
 #include
 #include
 
+/**
+ * Copy a wordbound blank from input to output and return its text.
+ *
+ * The caller has already consumed and echoed the opening "[["; this reads
+ * up to and including the closing "]]". Backslash-escaped characters are
+ * copied verbatim and never count as part of the terminator. Hitting EOF
+ * before the closing "]]" is reported on wcerr and terminates the process.
+ *
+ * @return the full blank, including the surrounding "[[" and "]]"
+ */
+wstring storeAndWriteWblank(FILE *input, FILE *output)
+{
+  int mychar;
+  wstring content = L"[[";
+
+  while(true)
+  {
+    mychar = fgetwc_unlocked(input);
+    if(feof(input))
+    {
+      wcerr << L"ERROR: Unexpected EOF" << endl;
+      exit(EXIT_FAILURE);
+    }
+
+    content += mychar;
+    fputwc_unlocked(mychar, output);
+
+    if(mychar == L'\\')
+    {
+      mychar = fgetwc(input);
+      content += mychar;
+      fputwc(mychar, output);
+    }
+    else if(mychar == L']')
+    {
+      mychar = fgetwc(input);
+
+      if(mychar == L']')
+      {
+        content += mychar;
+        fputwc(mychar, output);
+        break;
+      }
+
+      // A lone ']' does not end the blank: push the lookahead back so the
+      // next iteration copies it instead of silently dropping it.
+      ungetwc(mychar, input);
+    }
+  }
+
+  return content;
+}
+
 void readAndWriteUntil(FILE *input, FILE *output, int const charcode)
 {
   int mychar;
@@ -28,7 +81,7 @@ void readAndWriteUntil(FILE *input, FILE *output, int const charcode)
   }
 }
 
-void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep)
+void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, wstring wblank = L"")
 {
   int mychar;
   wstring buffer = L"";
@@ -82,18 +135,24 @@ void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep)
       }
       else if(in_tag == false && mychar == L'+')
       {
-        buffer.append(L"$ ^");
+        buffer.append(L"$ ");
+        buffer.append(wblank);
+        buffer.append(L"^");
       }
       else if(in_tag == false && mychar == L'~' and compound_sep == true)
       {
-        buffer.append(L"$^");
+        buffer.append(L"$");
+        buffer.append(wblank);
+        buffer.append(L"^");
       }
     }
     else
     {
       if(mychar == L'+' && queuing == true)
       {
-        buffer.append(L"$ ^");
+        buffer.append(L"$ ");
+        buffer.append(wblank);
+        buffer.append(L"^");
         buffer_mode = true;
       }
       else
@@ -119,8 +178,32 @@ void processStream(FILE *input, FILE *output, bool null_flush, bool surface_form
     {
       case L'[':
         fputwc_unlocked(L'[', output);
-        readAndWriteUntil(input, output, L']');
-        fputwc_unlocked(L']', output);
+        mychar = fgetwc_unlocked(input);
+
+        if(mychar == L'[')
+        {
+          fputwc_unlocked(L'[', output);
+          wstring wblank = storeAndWriteWblank(input, output);
+          mychar = fgetwc_unlocked(input);
+
+          if(mychar == L'^')
+          {
+            fputwc_unlocked(mychar, output);
+            procWord(input, output, surface_forms, compound_sep, wblank);
+            fputwc_unlocked(L'$', output);
+          }
+          else
+          {
+            wcerr << L"ERROR: Wordbound blank isn't immediately followed by the Lexical Unit." << endl;
+            exit(EXIT_FAILURE);
+          }
+        }
+        else
+        {
+          ungetwc(mychar, input);
+          readAndWriteUntil(input, output, L']');
+          fputwc_unlocked(L']', output);
+        }
         break;
 
       case L'\\':
diff --git a/apertium/pretransfer.h b/apertium/pretransfer.h
index 4c0ce57..1e53c4f 100644
--- a/apertium/pretransfer.h
+++ b/apertium/pretransfer.h
@@ -19,8 +19,9 @@
 #include
 #include
 
+wstring storeAndWriteWblank(FILE *input, FILE *output);
 void readAndWriteUntil(FILE *input, FILE *output, int const charcode);
-void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep);
+void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep, wstring wblank);
 void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms, bool compound_sep);
 
 #endif
diff --git a/tests/pretransfer/__init__.py b/tests/pretransfer/__init__.py
index a88423b..7e8c6ce 100644
--- a/tests/pretransfer/__init__.py
+++ b/tests/pretransfer/__init__.py
@@ -84,11 +84,6 @@ class JoinGroupPretransferTest(PretransferTest):
     inputs = ["[
]^a+c# b$", "[
]^a+c+d# b$"]
     expectedOutputs = ["[
]^a# b$ ^c$", "[
]^a# b$ ^c$ ^d$"]
 
-
-# Proposed inline blank format:
-class InlineBlankPretransferTest(PretransferTest):
-    inputs = ["[{}]^a+c# b$", "[{}]^a+c+d# b$"]
-    expectedOutputs = ["[{}]^a# b$ [{}]^c$", "[{}]^a# b$ [{}]^c$ [{}]^d$"]
-    @unittest.expectedFailure
-    def runTest(self):
-        super().runTest(self)
+class WordboundBlankTestPretransferTest(PretransferTest):
+    inputs = ["[[t:i:abc123]]^a+c# b$", "[[t:i:xyz456]]^a+c+d# b$"]
+    expectedOutputs = ["[[t:i:abc123]]^a# b$ [[t:i:abc123]]^c$", "[[t:i:xyz456]]^a# b$ [[t:i:xyz456]]^c$ [[t:i:xyz456]]^d$"]