commit b86d375e177266890c6461308a5e7629f2ecfbed Merge: b342dcb f2414b9 Author: Daniel Swanson Date: Mon Jun 14 21:14:07 2021 -0500 Merge branch 'master' into icu diff --combined README index a27e04f,32fe0dc..54dbd6c --- a/README +++ b/README @@@ -12,7 -12,7 +12,7 @@@ The three programs main programs are lt the processor, and lt-expand, which generates all possible mappings between surface forms and lexical forms in the dictionary. - Executables built by this pacage: + Executables built by this package: * `lt-comp`: compiler, execute without parameters to show usage instructions. @@@ -51,12 -51,10 +51,12 @@@ Requirements * g++ >= 2.95 * GNU make * libxml2 >= 2.6.17 +* ICU +* utfcpp Building & installing: - * ./configure + * ./autogen.sh * make * make install diff --combined lttoolbox/fst_processor.cc index 1d21b76,f4ec2a1..d986776 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@@ -22,40 -22,32 +22,40 @@@ #include #include #include -#include -#if defined(_WIN32) && !defined(_MSC_VER) -#include -#endif using namespace std; +UString const FSTProcessor::XML_TEXT_NODE = "#text"_u; +UString const FSTProcessor::XML_COMMENT_NODE = "#comment"_u; +UString const FSTProcessor::XML_IGNORED_CHARS_ELEM = "ignored-chars"_u; +UString const FSTProcessor::XML_RESTORE_CHAR_ELEM = "restore-char"_u; +UString const FSTProcessor::XML_RESTORE_CHARS_ELEM = "restore-chars"_u; +UString const FSTProcessor::XML_VALUE_ATTR = "value"_u; +UString const FSTProcessor::XML_CHAR_ELEM = "char"_u; +UString const FSTProcessor::WBLANK_START = "[["_u; +UString const FSTProcessor::WBLANK_END = "]]"_u; +UString const FSTProcessor::WBLANK_FINAL = "[[/]]"_u; + + FSTProcessor::FSTProcessor() : default_weight(0.0000), outOfWord(false), isLastBlankTM(false) { // escaped_chars chars - escaped_chars.insert(L'['); - escaped_chars.insert(L']'); - escaped_chars.insert(L'{'); - escaped_chars.insert(L'}'); - escaped_chars.insert(L'^'); - escaped_chars.insert(L'$'); - escaped_chars.insert(L'/'); - escaped_chars.insert(L'\\'); - escaped_chars.insert(L'@'); - escaped_chars.insert(L'<'); - escaped_chars.insert(L'>'); + escaped_chars.insert('['); + escaped_chars.insert(']'); + escaped_chars.insert('{'); + escaped_chars.insert('}'); + escaped_chars.insert('^'); + escaped_chars.insert('$'); + escaped_chars.insert('/'); + escaped_chars.insert('\\'); + escaped_chars.insert('@'); + escaped_chars.insert('<'); + escaped_chars.insert('>'); caseSensitive = false; dictionaryCase = false; @@@ -134,27 -126,28 +134,27 @@@ FSTProcessor::parseRCX(string const &fi void FSTProcessor::procNodeICX() { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); - if(name == L"#text") + UString name = XMLParseUtil::readName(reader); + if(name == XML_TEXT_NODE) { /* ignore */ } - else if(name == L"ignored-chars") + else if(name == XML_IGNORED_CHARS_ELEM) { /* ignore */ } - else if(name == L"char") + else if(name == XML_CHAR_ELEM) { - ignored_chars.insert(static_cast(XMLParseUtil::attrib(reader, L"value")[0])); + ignored_chars.insert(static_cast(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0])); } - else if(name == L"#comment") + else if(name == XML_COMMENT_NODE) { /* ignore */ } else { - wcerr << L"Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } @@@ -168,46 -161,47 +168,46 @@@ FSTProcessor::initDefaultIgnoredCharact void FSTProcessor::procNodeRCX() { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); - if(name == L"#text") + UString name = XMLParseUtil::readName(reader); + if(name == XML_TEXT_NODE) { /* ignore */ } - else if(name == L"restore-chars") + else if(name == XML_RESTORE_CHARS_ELEM) { /* ignore */ } - else if(name == L"char") + else if(name == XML_CHAR_ELEM) { - rcx_current_char = static_cast(XMLParseUtil::attrib(reader, L"value")[0]); + rcx_current_char = static_cast(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0]); } - else if(name == L"restore-char") + else if(name == XML_RESTORE_CHAR_ELEM) { - rcx_map[rcx_current_char].insert(static_cast(XMLParseUtil::attrib(reader, L"value")[0])); + rcx_map[rcx_current_char].insert(static_cast(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0])); } - else if(name == L"#comment") + else if(name == XML_COMMENT_NODE) { /* ignore */ } else { - wcerr << L"Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } -wchar_t -FSTProcessor::readEscaped(FILE *input) +UChar32 +FSTProcessor::readEscaped(InputFile& input) { - if(feof(input)) + if(input.eof()) { streamError(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); + UChar32 val = input.get(); - if(feof(input)) + if(input.eof()) { streamError(); } @@@ -215,24 -209,24 +215,24 @@@ return val; } -wstring -FSTProcessor::readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) +UString +FSTProcessor::readFullBlock(InputFile& input, UChar32 const delim1, UChar32 const delim2) { - wstring result = L""; + UString result; result += delim1; - wchar_t c = delim1; + UChar32 c = delim1; - while(!feof(input) && c != delim2) + while(!input.eof() && c != delim2) { - c = static_cast(fgetwc_unlocked(input)); + c = input.get(); result += c; - if(c != L'\\') + if(c != '\\') { continue; } else { - result += static_cast(readEscaped(input)); + result += readEscaped(input); } } @@@ -244,34 -238,35 +244,34 @@@ return result; } -wstring -FSTProcessor::readWblank(FILE *input) +UString +FSTProcessor::readWblank(InputFile& input) { - wstring result = L""; - result += L"[["; - wchar_t c = 0; + UString result = WBLANK_START; + UChar32 c = 0; - while(!feof(input)) + while(!input.eof()) { - c = static_cast(fgetwc_unlocked(input)); + c = input.get(); result += c; - if(c == L'\\') + if(c == '\\') { - result += static_cast(readEscaped(input)); + result += readEscaped(input); } - else if(c == L']') + else if(c == ']') { - c = static_cast(fgetwc_unlocked(input)); + c = input.get(); result += c; - if(c == L']') + if(c == ']') { break; } } } - if(c != L']') + if(c != ']') { streamError(); } @@@ -280,51 -275,60 +280,58 @@@ } bool -FSTProcessor::wblankPostGen(FILE *input, FILE *output) +FSTProcessor::wblankPostGen(InputFile& input, UFILE *output) { - wstring result = L""; - result += L"[["; - wchar_t c = 0; + UString result = WBLANK_START; + UChar32 c = 0; + bool in_content = false; - while(!feof(input)) + while(!input.eof()) { - c = static_cast(fgetwc_unlocked(input)); - + c = input.get(); - result += c; + if(in_content && c == L'~') + { - if(result[result.size()-1] == L']') { ++ if(result[result.size()-1] == ']') { + // We just saw the end of a wblank, may want to merge + wblankqueue.push(result); + } + else { + // wake-up-mark happened some characters into the wblanked word - fputws(result.c_str(), output); ++ write(result, output); + } + return true; + } + else + { + result += c; + } - if(c == L'\\') + if(c == '\\') { - result += static_cast(readEscaped(input)); + result += readEscaped(input); } - else if(c == L']') + else if(c == ']') { - c = static_cast(fgetwc_unlocked(input)); + c = input.get(); result += c; - if(c == L']') + if(c == ']') { int resultlen = result.size(); if(result[resultlen-5] == '[' && result[resultlen-4] == '[' && result[resultlen-3] == '/') //ending blank [[/]] { - fputws(result.c_str(), output); + write(result, output); break; } else { - c = input.get(); - if(c == '~') - { - wblankqueue.push(result); - return true; - } - else - { - result += c; - } + in_content = true; // Assumption: No nested wblanks, always balanced } } } } - if(c != L']') + if(c != ']') { streamError(); } @@@ -333,65 -337,63 +340,65 @@@ } int -FSTProcessor::readAnalysis(FILE *input) +FSTProcessor::readAnalysis(InputFile& input) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); - int altval = 0; - if(feof(input)) + UChar32 val = input.get(); + int32_t altval = 0; + if(input.eof()) { input_buffer.add(0); // so it's treated like the NUL byte return 0; + } else if(val == U_EOF) { + val = 0; } if((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end()) { input_buffer.add(val); - val = static_cast(fgetwc_unlocked(input)); + val = input.get(); } if(escaped_chars.find(val) != escaped_chars.end()) { switch(val) { - case L'<': - altval = static_cast(alphabet(readFullBlock(input, L'<', L'>'))); + case '<': + altval = alphabet(readFullBlock(input, '<', '>')); input_buffer.add(altval); return altval; - case L'[': - val = static_cast(fgetwc_unlocked(input)); + case '[': + val = input.get(); - if(val == L'[') + if(val == '[') { blankqueue.push(readWblank(input)); } else { - ungetwc_unlocked(val, input); - blankqueue.push(readFullBlock(input, L'[', L']')); + input.unget(val); + blankqueue.push(readFullBlock(input, '[', ']')); } - input_buffer.add(static_cast(L' ')); - return static_cast(L' '); + input_buffer.add(static_cast(' ')); + return static_cast(' '); - case L'\\': - val = static_cast(fgetwc_unlocked(input)); - input_buffer.add(static_cast(val)); + case '\\': + val = input.get(); + input_buffer.add(static_cast(val)); return val; default: streamError(); } } - if(val == L' ') { - blankqueue.push(L" "); + if(val == ' ') { + blankqueue.push(" "_u); } input_buffer.add(val); @@@ -399,7 -401,7 +406,7 @@@ } int -FSTProcessor::readTMAnalysis(FILE *input) +FSTProcessor::readTMAnalysis(InputFile& input) { isLastBlankTM = false; if(!input_buffer.isEmpty()) @@@ -407,64 -409,64 +414,64 @@@ return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); - int altval = 0; - if(feof(input)) + UChar32 val = input.get(); + int32_t altval = 0; + if(input.eof()) { return 0; } - if(escaped_chars.find(val) != escaped_chars.end() || iswdigit(val)) + if(escaped_chars.find(val) != escaped_chars.end() || u_isdigit(val)) { switch(val) { - case L'<': - altval = static_cast(alphabet(readFullBlock(input, L'<', L'>'))); + case '<': + altval = alphabet(readFullBlock(input, '<', '>')); input_buffer.add(altval); return altval; - case L'[': - val = static_cast(fgetwc_unlocked(input)); + case '[': + val = input.get(); - if(val == L'[') + if(val == '[') { blankqueue.push(readWblank(input)); } else { - ungetwc_unlocked(val, input); - blankqueue.push(readFullBlock(input, L'[', L']')); + input.unget(val); + blankqueue.push(readFullBlock(input, '[', ']')); } - input_buffer.add(static_cast(L' ')); + input_buffer.add(static_cast(' ')); isLastBlankTM = true; - return static_cast(L' '); + return static_cast(' '); - case L'\\': - val = static_cast(fgetwc_unlocked(input)); - input_buffer.add(static_cast(val)); + case '\\': + val = input.get(); + input_buffer.add(static_cast(val)); return val; - case L'0': - case L'1': - case L'2': - case L'3': - case L'4': - case L'5': - case L'6': - case L'7': - case L'8': - case L'9': - { - wstring ws = L""; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + UString ws; do { ws += val; - val = static_cast(fgetwc_unlocked(input)); - } while(iswdigit(val)); - ungetwc_unlocked(val, input); - input_buffer.add(alphabet(L"")); + val = input.get(); + } while(u_isdigit(val)); + input.unget(val); + input_buffer.add(alphabet(""_u)); numbers.push_back(ws); - return alphabet(L""); + return alphabet(""_u); } break; @@@ -478,61 -480,61 +485,61 @@@ } int -FSTProcessor::readPostgeneration(FILE *input, FILE *output) +FSTProcessor::readPostgeneration(InputFile& input, UFILE *output) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); - int altval = 0; + UChar32 val = input.get(); + int32_t altval = 0; is_wblank = false; - if(feof(input)) + if(input.eof()) { return 0; } switch(val) { - case L'<': - altval = static_cast(alphabet(readFullBlock(input, L'<', L'>'))); + case '<': + altval = alphabet(readFullBlock(input, '<', '>')); input_buffer.add(altval); return altval; - case L'[': - val = static_cast(fgetwc_unlocked(input)); + case '[': + val = input.get(); - if(val == L'[') + if(val == '[') { if(collect_wblanks) { wblankqueue.push(readWblank(input)); is_wblank = true; - return static_cast(L' '); + return static_cast(' '); } else if(wblankPostGen(input, output)) { - return static_cast(L'~'); + return static_cast('~'); } else { is_wblank = true; - return static_cast(L' '); + return static_cast(' '); } } else { - ungetwc_unlocked(val, input); - blankqueue.push(readFullBlock(input, L'[', L']')); + input.unget(val); + blankqueue.push(readFullBlock(input, '[', ']')); - input_buffer.add(static_cast(L' ')); - return static_cast(L' '); + input_buffer.add(static_cast(' ')); + return static_cast(' '); } - case L'\\': - val = static_cast(fgetwc_unlocked(input)); - input_buffer.add(static_cast(val)); + case '\\': + val = input.get(); + input_buffer.add(static_cast(val)); return val; default: @@@ -542,33 -544,33 +549,33 @@@ } void -FSTProcessor::skipUntil(FILE *input, FILE *output, wint_t const character) +FSTProcessor::skipUntil(InputFile& input, UFILE *output, UChar32 const character) { while(true) { - wint_t val = fgetwc_unlocked(input); - if(feof(input)) + UChar32 val = input.get(); + if(input.eof()) { return; } switch(val) { - case L'\\': - val = fgetwc_unlocked(input); - if(feof(input)) + case '\\': + val = input.get(); + if(input.eof()) { return; } - fputwc_unlocked(L'\\', output); - fputwc_unlocked(val, output); + u_fputc('\\', output); + u_fputc(val, output); break; - case L'\0': - fputwc_unlocked(val, output); + case '\0': + u_fputc(val, output); if(nullFlushGeneration) { - fflush(output); + u_fflush(output); } break; @@@ -579,7 -581,7 +586,7 @@@ } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } break; } @@@ -587,47 -589,47 +594,47 @@@ } int -FSTProcessor::readGeneration(FILE *input, FILE *output) +FSTProcessor::readGeneration(InputFile& input, UFILE *output) { - wint_t val = fgetwc_unlocked(input); + UChar32 val = input.get(); - if(feof(input)) + if(input.eof()) { return 0x7fffffff; } if(outOfWord) { - if(val == L'^') + if(val == '^') { - val = fgetwc_unlocked(input); - if(feof(input)) + val = input.get(); + if(input.eof()) { return 0x7fffffff; } } - else if(val == L'\\') + else if(val == '\\') { - fputwc_unlocked(val, output); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + val = input.get(); + if(input.eof()) { return 0x7fffffff; } - fputwc_unlocked(val,output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val,output); + skipUntil(input, output, '^'); + val = input.get(); + if(input.eof()) { return 0x7fffffff; } } else { - fputwc_unlocked(val, output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + skipUntil(input, output, '^'); + val = input.get(); + if(input.eof()) { return 0x7fffffff; } @@@ -635,106 -637,129 +642,106 @@@ outOfWord = false; } - if(val == L'\\') + if(val == '\\') { - val = fgetwc_unlocked(input); - return static_cast(val); + val = input.get(); + return static_cast(val); } - else if(val == L'$') + else if(val == '$') { outOfWord = true; - return static_cast(L'$'); + return static_cast('$'); } - else if(val == L'<') + else if(val == '<') { - wstring cad = L""; - cad += static_cast(val); - - while((val = fgetwc_unlocked(input)) != L'>') - { - if(feof(input)) - { - streamError(); - } - cad += static_cast(val); - } - cad += static_cast(val); - - return alphabet(cad); + return alphabet(readFullBlock(input, '<', '>')); } - else if(val == L'[') + else if(val == '[') { - val = fgetwc_unlocked(input); - if(val == L'[') + val = input.get(); + if(val == '[') { - fputws_unlocked(readWblank(input).c_str(), output); + write(readWblank(input), output); } else { - ungetwc_unlocked(val, input); - fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); + input.unget(val); + write(readFullBlock(input, '[', ']'), output); } return readGeneration(input, output); } else { - return static_cast(val); + return static_cast(val); } return 0x7fffffff; } -pair -FSTProcessor::readBilingual(FILE *input, FILE *output) +pair +FSTProcessor::readBilingual(InputFile& input, UFILE *output) { - wint_t val = fgetwc_unlocked(input); - wstring symbol = L""; + UChar32 val = input.get(); + UString symbol; - if(feof(input)) + if(input.eof()) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } if(outOfWord) { - if(val == L'^') + if(val == '^') { - val = fgetwc_unlocked(input); - if(feof(input)) + val = input.get(); + if(input.eof()) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } } - else if(val == L'\\') + else if(val == '\\') { - fputwc_unlocked(val, output); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + val = input.get(); + if(input.eof()) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } - fputwc_unlocked(val,output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val,output); + skipUntil(input, output, '^'); + val = input.get(); + if(input.eof()) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } } else { - fputwc_unlocked(val, output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + skipUntil(input, output, '^'); + val = input.get(); + if(input.eof()) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } } outOfWord = false; } - if(val == L'\\') + if(val == '\\') { - val = fgetwc_unlocked(input); - return pair(symbol, val); + val = input.get(); + return pair(symbol, val); } - else if(val == L'$') + else if(val == '$') { outOfWord = true; - return pair(symbol, static_cast(L'$')); + return pair(symbol, static_cast('$')); } - else if(val == L'<') + else if(val == '<') { - wstring cad = L""; - cad += static_cast(val); - while((val = fgetwc_unlocked(input)) != L'>') - { - if(feof(input)) - { - streamError(); - } - cad += static_cast(val); - } - cad += static_cast(val); + UString cad = readFullBlock(input, '<', '>'); int res = alphabet(cad); @@@ -742,71 -767,78 +749,78 @@@ { symbol = cad; } - return pair(symbol, res); + return pair(symbol, res); } - else if(val == L'[') + else if(val == '[') { - val = fgetwc_unlocked(input); - if(val == L'[') + val = input.get(); + if(val == '[') { - fputws_unlocked(readWblank(input).c_str(), output); + write(readWblank(input), output); } else { - ungetwc_unlocked(val, input); - fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); + input.unget(val); + write(readFullBlock(input, '[', ']'), output); } return readBilingual(input, output); } - return pair(symbol, val); + return pair(symbol, val); } void -FSTProcessor::flushBlanks(FILE *output) +FSTProcessor::flushBlanks(UFILE *output) { for(size_t i = blankqueue.size(); i > 0; i--) { - fputws_unlocked(blankqueue.front().c_str(), output); + write(blankqueue.front(), output); blankqueue.pop(); } } void -FSTProcessor::flushWblanks(FILE *output) +FSTProcessor::flushWblanks(UFILE *output) { while(wblankqueue.size() > 0) { - fputws_unlocked(wblankqueue.front().c_str(), output); + write(wblankqueue.front(), output); wblankqueue.pop(); } } -wstring +UString FSTProcessor::combineWblanks() { - wstring final_wblank; - wstring last_wblank = L""; + UString final_wblank; + UString last_wblank; + bool seen_wblank = false; while(wblankqueue.size() > 0) { - if(wblankqueue.front().compare(L"[[/]]") == 0) + if(wblankqueue.front().compare(WBLANK_FINAL) == 0) { - if(final_wblank.empty()) - { - final_wblank += WBLANK_START; + if(seen_wblank) { + if(final_wblank.empty()) + { - final_wblank += L"[["; ++ final_wblank += WBLANK_START; + } + else if(final_wblank.size() > 2) + { - final_wblank += L"; "; ++ final_wblank += L"; "_u; + } + + final_wblank += last_wblank.substr(2,last_wblank.size()-4); //add wblank without brackets [[..]] } - else if(final_wblank.size() > 2) - { - final_wblank += "; "_u; + else { + need_end_wblank = true; } - - final_wblank.append(last_wblank, 2, last_wblank.size()-4); //add wblank without brackets [[..]] last_wblank.clear(); } else { + seen_wblank = true; last_wblank = wblankqueue.front(); } wblankqueue.pop(); @@@ -819,25 -851,27 +833,24 @@@ if(!final_wblank.empty()) { - final_wblank += L"]]"; + final_wblank += WBLANK_END; need_end_wblank = true; } - return final_wblank; } void FSTProcessor::calcInitial() { - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - root.addTransition(0, 0, it->second.getInitial(), default_weight); + for(auto& it : transducers) { + root.addTransition(0, 0, it.second.getInitial(), default_weight); } initial_state.init(&root); } bool -FSTProcessor::endsWith(wstring const &str, wstring const &suffix) +FSTProcessor::endsWith(UString const &str, UString const &suffix) { if(str.size() < suffix.size()) { @@@ -852,61 -886,64 +865,61 @@@ void FSTProcessor::classifyFinals() { - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - if(endsWith(it->first, L"@inconditional")) + for(auto& it : transducers) { + if(endsWith(it.first, "@inconditional"_u)) { - inconditional.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + inconditional.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } - else if(endsWith(it->first, L"@standard")) + else if(endsWith(it.first, "@standard"_u)) { - standard.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + standard.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } - else if(endsWith(it->first, L"@postblank")) + else if(endsWith(it.first, "@postblank"_u)) { - postblank.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + postblank.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } - else if(endsWith(it->first, L"@preblank")) + else if(endsWith(it.first, "@preblank"_u)) { - preblank.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + preblank.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } else { - wcerr << L"Error: Unsupported transducer type for '"; - wcerr << it->first << L"'." << endl; + cerr << "Error: Unsupported transducer type for '"; + cerr << it.first << "'." << endl; exit(EXIT_FAILURE); } } } void -FSTProcessor::writeEscaped(wstring const &str, FILE *output) +FSTProcessor::writeEscaped(UString const &str, UFILE *output) { for(unsigned int i = 0, limit = str.size(); i < limit; i++) { if(escaped_chars.find(str[i]) != escaped_chars.end()) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(str[i], output); + u_fputc(str[i], output); } } size_t -FSTProcessor::writeEscapedPopBlanks(wstring const &str, FILE *output) +FSTProcessor::writeEscapedPopBlanks(UString const &str, UFILE *output) { size_t postpop = 0; for (unsigned int i = 0, limit = str.size(); i < limit; i++) { if (escaped_chars.find(str[i]) != escaped_chars.end()) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(str[i], output); - if (str[i] == L' ') { - if (blankqueue.front() == L" ") { + u_fputc(str[i], output); + if (str[i] == ' ') { + if (blankqueue.front() == " "_u) { blankqueue.pop(); } else { postpop++; @@@ -917,67 -954,71 +930,67 @@@ } void -FSTProcessor::writeEscapedWithTags(wstring const &str, FILE *output) +FSTProcessor::writeEscapedWithTags(UString const &str, UFILE *output) { for(unsigned int i = 0, limit = str.size(); i < limit; i++) { - if(str[i] == L'<' && i >=1 && str[i-1] != L'\\') + if(str[i] == '<' && i >=1 && str[i-1] != '\\') { - fputws_unlocked(str.substr(i).c_str(), output); + write(str.substr(i), output); return; } if(escaped_chars.find(str[i]) != escaped_chars.end()) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(str[i], output); + u_fputc(str[i], output); } } void -FSTProcessor::printWord(wstring const &sf, wstring const &lf, FILE *output) +FSTProcessor::printWord(UString const &sf, UString const &lf, UFILE *output) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(sf, output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L'$', output); + write(lf, output); + u_fputc('$', output); } void -FSTProcessor::printWordPopBlank(wstring const &sf, wstring const &lf, FILE *output) +FSTProcessor::printWordPopBlank(UString const &sf, UString const &lf, UFILE *output) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); size_t postpop = writeEscapedPopBlanks(sf, output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "%S$", lf.c_str()); while (postpop-- && blankqueue.size() > 0) { - fputws(blankqueue.front().c_str(), output); + write(blankqueue.front(), output); blankqueue.pop(); } } void -FSTProcessor::printWordBilingual(wstring const &sf, wstring const &lf, FILE *output) +FSTProcessor::printWordBilingual(UString const &sf, UString const &lf, UFILE *output) { - fputwc_unlocked(L'^', output); - fputws_unlocked(sf.c_str(), output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "^%S%S$", sf.c_str(), lf.c_str()); } void -FSTProcessor::printUnknownWord(wstring const &sf, FILE *output) +FSTProcessor::printUnknownWord(UString const &sf, UFILE *output) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(sf, output); - fputwc_unlocked(L'/', output); - fputwc_unlocked(L'*', output); + u_fputc('/', output); + u_fputc('*', output); writeEscaped(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } unsigned int -FSTProcessor::lastBlank(wstring const &str) +FSTProcessor::lastBlank(UString const &str) { for(int i = static_cast(str.size())-1; i >= 0; i--) { @@@ -991,7 -1032,7 +1004,7 @@@ } void -FSTProcessor::printSpace(wchar_t const val, FILE *output) +FSTProcessor::printSpace(UChar const val, UFILE *output) { if(blankqueue.size() > 0) { @@@ -999,20 -1040,20 +1012,20 @@@ } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } } bool -FSTProcessor::isEscaped(wchar_t const c) const +FSTProcessor::isEscaped(UChar32 const c) const { return escaped_chars.find(c) != escaped_chars.end(); } bool -FSTProcessor::isAlphabetic(wchar_t const c) const +FSTProcessor::isAlphabetic(UChar32 const c) const { - return (bool)std::iswalnum(c) || alphabetic_chars.find(c) != alphabetic_chars.end(); + return u_isalnum(c) || alphabetic_chars.find(c) != alphabetic_chars.end(); } void @@@ -1021,7 -1062,7 +1034,7 @@@ FSTProcessor::load(FILE *input fpos_t pos; if (fgetpos(input, &pos) == 0) { char header[4]{}; - fread(header, 1, 4, input); + fread_unlocked(header, 1, 4, input); if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { auto features = read_le(input); if (features >= LTF_UNKNOWN) { @@@ -1038,7 -1079,7 +1051,7 @@@ int len = Compression::multibyte_read(input); while(len > 0) { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); len--; } @@@ -1049,12 -1090,278 +1062,12 @@@ while(len > 0) { - int len2 = Compression::multibyte_read(input); - wstring name = L""; - while(len2 > 0) - { - name += static_cast(Compression::multibyte_read(input)); - len2--; - } + UString name = Compression::string_read(input); transducers[name].read(input, alphabet); len--; } } -void -FSTProcessor::lsx_wrapper_null_flush(FILE *input, FILE *output) -{ - setNullFlush(false); - //nullFlushGeneration = true; - - while(!feof(input)) - { - lsx(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } - } -} - -void -FSTProcessor::lsx(FILE *input, FILE *output) -{ - if(getNullFlush()) - { - lsx_wrapper_null_flush(input, output); - } - - vector new_states, alive_states; - wstring blank, out, in, alt_out, alt_in; - bool outOfWord = true; - bool finalFound = false; - bool plus_thing = false; - - alive_states.push_back(initial_state); - - int val = -1; - - while(!feof(input) && val != 0) - { - val = fgetwc_unlocked(input); - - if(val == L'+' && isEscaped(val) && !outOfWord) - { - val = L'$'; - plus_thing = true; - } - - if((val == L'^' && isEscaped(val) && outOfWord) || feof(input) || val == 0) - { - blankqueue.push(blank); - - if(alive_states.size() == 0) - { - if(blankqueue.size() > 0) - { - fputws(blankqueue.front().c_str(), output); - fflush(output); - blankqueue.pop(); - } - - alive_states.push_back(initial_state); - - alt_in = L""; - for(int i=0; i < (int) in.size(); i++) // FIXME indexing - { - alt_in += in[i]; - if(in[i] == L'$' && in[i+1] == L'^' && blankqueue.size() > 0) - { - // in.insert(i+1, blankqueue.front().c_str()); - alt_in += blankqueue.front().c_str(); - blankqueue.pop(); - } - } - in = alt_in; - fputws(in.c_str(), output); - fflush(output); - in = L""; - finalFound = false; - } - else if(finalFound && alive_states.size() == 1) - { - finalFound = false; - } - - blank = L""; - in += val; - outOfWord = false; - continue; - } - - // wcerr << L"\n[!] " << (wchar_t)val << L" ||| " << outOfWord << endl; - - if(outOfWord) - { - blank += val; - continue; - } - - if((val == 0 || feof(input) || val == L'$') && !outOfWord) // && isEscaped(val) - { - new_states.clear(); - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - State s = *it; - //wcerr << endl << L"[0] FEOF | $ | " << s.size() << L" | " << s.isFinal(all_finals) << endl; - s.step(alphabet(L"<$>")); - //wcerr << endl << L"[1] FEOF | $ | " << s.size() << L" | " << s.isFinal(all_finals) << endl; - if(s.size() > 0) - { - new_states.push_back(s); - } - - /*if(s.isFinal(all_finals)) - { - out += s.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses); - new_states.push_back(*initial_state); - }*/ - - if(s.isFinal(all_finals)) - { - new_states.clear(); - new_states.push_back(initial_state); - out = s.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses); - - alt_out = L""; - for (int i=0; i < (int) out.size(); i++) - { - wchar_t c = out.at(i); - if(c == L'/') - { - alt_out += L'^'; - } - else if(out[i-1] == L'<' && c == L'$' && out[i+1] == L'>') // indexing - { - alt_out += c; - alt_out += L'^'; - } - else if(!(c == L'<' && out[i+1] == L'$' && out[i+2] == L'>') && !(out[i-2] == L'<' && out[i-1] == L'$' && c == L'>')) - { - alt_out += c; - } - } - out = alt_out; - - - if(out[out.length()-1] == L'^') - { - out = out.substr(0, out.length()-1); // extra ^ at the end - if(plus_thing) - { - out[out.size()-1] = L'+'; - plus_thing = false; - } - } - else // take# out ... of - { - for(int i=out.length()-1; i>=0; i--) // indexing - { - if(out.at(i) == L'$') - { - out.insert(i+1, L" "); - break; - } - } - out += L'$'; - } - - if(blankqueue.size() > 0) - { - fputws(blankqueue.front().c_str(), output); - blankqueue.pop(); - } - - alt_out = L""; - for(int i=0; i < (int) out.size(); i++) // indexing - { - if((out.at(i) == L'$') && blankqueue.size() > 0) - { - alt_out += out.at(i); - alt_out += blankqueue.front().c_str(); - blankqueue.pop(); - } - else if((out.at(i) == L'$') && blankqueue.size() == 0 && i != (int) out.size()-1) - { - alt_out += out.at(i); - alt_out += L' '; - } - else if(out.at(i) == L' ' && blankqueue.size() > 0) - { - alt_out += blankqueue.front().c_str(); - blankqueue.pop(); - } - else - { - alt_out += out.at(i); - } - } - out = alt_out; - - fputws(out.c_str(), output); - flushBlanks(output); - finalFound = true; - out = L""; - in = L""; - } - } - - alive_states.swap(new_states); - outOfWord = true; - - if(!finalFound) - { - in += val; //do not remove - } - continue; - } - - if(!outOfWord) // && (!(feof(input) || val == L'$'))) - { - if(val == L'<') // tag - { - wstring tag = readFullBlock(input, L'<', L'>'); - in += tag; - if(!alphabet.isSymbolDefined(tag)) - { - alphabet.includeSymbol(tag); - } - val = static_cast(alphabet(tag)); - } - else - { - in += (wchar_t) val; - } - - new_states.clear(); - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - State s = *it; - if(val < 0) - { - s.step_override(val, alphabet(L""), val); - } - else if(val > 0) - { - int val_lowercase = towlower(val); - s.step_override(val_lowercase, alphabet(L""), val); // FIXME deal with cases! in step_override - } - - if(s.size() > 0) - { - new_states.push_back(s); - } - - } - alive_states.swap(new_states); - } - } - - flushBlanks(output); -} - void FSTProcessor::initAnalysis() { @@@ -1071,9 -1378,12 +1084,9 @@@ FSTProcessor::initTMAnalysis( { calcInitial(); - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - all_finals.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + for(auto& it : transducers) { + all_finals.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } } @@@ -1082,9 -1392,12 +1095,9 @@@ FSTProcessor::initGeneration( { setIgnoredChars(false); calcInitial(); - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - all_finals.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + for(auto& it : transducers) { + all_finals.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } } @@@ -1101,8 -1414,8 +1114,8 @@@ FSTProcessor::initBiltrans( } -wstring -FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupper) +UString +FSTProcessor::compoundAnalysis(UString input_word, bool uppercase, bool firstupper) { const int MAX_COMBINATIONS = 32767; @@@ -1110,16 -1423,16 +1123,16 @@@ for(unsigned int i=0; i MAX_COMBINATIONS) { - wcerr << L"Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << L"'" << endl; - wcerr << L" gave up at char " << i << L" '" << val << L"'." << endl; + cerr << "Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << "'" << endl; + cerr << " gave up at char " << i << " '" << val << "'." << endl; - wstring nullString = L""; + UString nullString; return nullString; } @@@ -1130,13 -1443,13 +1143,13 @@@ if(current_state.size()==0) { - wstring nullString = L""; + UString nullString; return nullString; } } current_state.pruneCompounds(compoundRSymbol, '+', compound_max_elements); - wstring result = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper); + UString result = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper); return result; } @@@ -1146,30 -1459,30 +1159,30 @@@ void FSTProcessor::initDecompositionSymbols() { - if((compoundOnlyLSymbol=alphabet(L"<:co:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"<:compound:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"<@co:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"<@compound:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"")) == 0) + if((compoundOnlyLSymbol=alphabet("<:co:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet("<:compound:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet("<@co:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet("<@compound:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet(""_u)) == 0) { - wcerr << L"Warning: Decomposition symbol <:compound:only-L> not found" << endl; + cerr << "Warning: Decomposition symbol <:compound:only-L> not found" << endl; } else if(!showControlSymbols) { - alphabet.setSymbol(compoundOnlyLSymbol, L""); + alphabet.setSymbol(compoundOnlyLSymbol, ""_u); } - if((compoundRSymbol=alphabet(L"<:co:R>")) == 0 - && (compoundRSymbol=alphabet(L"<:compound:R>")) == 0 - && (compoundRSymbol=alphabet(L"<@co:R>")) == 0 - && (compoundRSymbol=alphabet(L"<@compound:R>")) == 0 - && (compoundRSymbol=alphabet(L"")) == 0) + if((compoundRSymbol=alphabet("<:co:R>"_u)) == 0 + && (compoundRSymbol=alphabet("<:compound:R>"_u)) == 0 + && (compoundRSymbol=alphabet("<@co:R>"_u)) == 0 + && (compoundRSymbol=alphabet("<@compound:R>"_u)) == 0 + && (compoundRSymbol=alphabet(""_u)) == 0) { - wcerr << L"Warning: Decomposition symbol <:compound:R> not found" << endl; + cerr << "Warning: Decomposition symbol <:compound:R> not found" << endl; } else if(!showControlSymbols) { - alphabet.setSymbol(compoundRSymbol, L""); + alphabet.setSymbol(compoundRSymbol, ""_u); } } @@@ -1183,7 -1496,7 +1196,7 @@@ FSTProcessor::initDecomposition( } void -FSTProcessor::analysis(FILE *input, FILE *output) +FSTProcessor::analysis(InputFile& input, UFILE *output) { if(getNullFlush()) { @@@ -1194,13 -1507,13 +1207,13 @@@ bool last_postblank = false; bool last_preblank = false; State current_state = initial_state; - wstring lf = L""; //lexical form - wstring sf = L""; //surface form + UString lf; //lexical form + UString sf; //surface form int last = 0; bool firstupper = false, uppercase = false; map >::iterator rcx_map_ptr; - wchar_t val; + UChar32 val; do { val = readAnalysis(input); @@@ -1211,8 -1524,8 +1224,8 @@@ { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } if(do_decomposition && compoundOnlyLSymbol != 0) @@@ -1230,8 -1543,8 +1243,8 @@@ { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } if(do_decomposition && compoundOnlyLSymbol != 0) @@@ -1249,8 -1562,8 +1262,8 @@@ { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } if(do_decomposition && compoundOnlyLSymbol != 0) @@@ -1268,8 -1581,8 +1281,8 @@@ { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } if(do_decomposition && compoundOnlyLSymbol != 0) @@@ -1286,9 -1599,9 +1299,9 @@@ last = input_buffer.getPos(); } } - else if(sf == L"" && iswspace(val)) + else if(sf.empty() && u_isspace(val)) { - lf = L"/*"; + lf = "/*"_u; lf.append(sf); last_postblank = false; last_preblank = false; @@@ -1300,11 -1613,11 +1313,11 @@@ { rcx_map_ptr = rcx_map.find(val); set tmpset = rcx_map_ptr->second; - if(!iswupper(val) || caseSensitive) + if(!u_isupper(val) || caseSensitive) { current_state.step(val, tmpset); } - else if(rcx_map.find(towlower(val)) != rcx_map.end()) + else if(rcx_map.find(u_tolower(val)) != rcx_map.end()) { rcx_map_ptr = rcx_map.find(tolower(val)); tmpset.insert(tolower(val)); @@@ -1319,7 -1632,14 +1332,7 @@@ } else { - if(!iswupper(val) || caseSensitive) - { - current_state.step(val); - } - else - { - current_state.step(val, towlower(val)); - } + current_state.step_case(val, caseSensitive); } if(current_state.size() != 0) @@@ -1331,29 -1651,29 +1344,29 @@@ } else { - if(!isAlphabetic(val) && sf == L"") + if(!isAlphabetic(val) && sf.empty()) { - if(iswspace(val)) + if(u_isspace(val)) { if (blankqueue.size() > 0) { - fputws_unlocked(blankqueue.front().c_str(), output); + write(blankqueue.front(), output); blankqueue.pop(); } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } } else { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } if(val) { - fputwc_unlocked(val, output); + u_fputc(val, output); } } } @@@ -1361,13 -1681,13 +1374,13 @@@ { printWordPopBlank(sf.substr(0, sf.size()-input_buffer.diffPrevPos(last)), lf, output); - fputwc_unlocked(L' ', output); + u_fputc(' ', output); input_buffer.setPos(last); input_buffer.back(1); } else if(last_preblank) { - fputwc_unlocked(L' ', output); + u_fputc(' ', output); printWordPopBlank(sf.substr(0, sf.size()-input_buffer.diffPrevPos(last)), lf, output); input_buffer.setPos(last); @@@ -1382,7 -1702,7 +1395,7 @@@ } else if(isAlphabetic(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || - lf == L"")) + lf.empty())) { do { @@@ -1392,7 -1712,7 +1405,7 @@@ unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); if(limit == 0) { input_buffer.back(sf.size()); @@@ -1401,18 -1721,18 +1414,18 @@@ else { input_buffer.back(1+(size-limit)); - wstring unknown_word = sf.substr(0, limit); + UString unknown_word = sf.substr(0, limit); if(do_decomposition) { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } - wstring compound = L""; + UString compound; compound = compoundAnalysis(unknown_word, uppercase, firstupper); - if(compound != L"") + if(!compound.empty()) { printWord(unknown_word, compound, output); } @@@ -1427,11 -1747,11 +1440,11 @@@ } } } - else if(lf == L"") + else if(lf.empty()) { unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); if(limit == 0) { input_buffer.back(sf.size()); @@@ -1440,18 -1760,18 +1453,18 @@@ else { input_buffer.back(1+(size-limit)); - wstring unknown_word = sf.substr(0, limit); + UString unknown_word = sf.substr(0, limit); if(do_decomposition) { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } - wstring compound = L""; + UString compound; compound = compoundAnalysis(unknown_word, uppercase, firstupper); - if(compound != L"") + if(!compound.empty()) { printWord(unknown_word, compound, output); } @@@ -1481,8 -1801,8 +1494,8 @@@ } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); last_incond = false; last_postblank = false; last_preblank = false; @@@ -1495,82 -1815,102 +1508,82 @@@ } void -FSTProcessor::analysis_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::analysis_wrapper_null_flush(InputFile& input, UFILE *output) { setNullFlush(false); - while(!feof(input)) + while(!input.eof()) { analysis(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::generation_wrapper_null_flush(FILE *input, FILE *output, +FSTProcessor::generation_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode) { setNullFlush(false); nullFlushGeneration = true; - while(!feof(input)) + while(!input.eof()) { generation(input, output, mode); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::postgeneration_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::postgeneration_wrapper_null_flush(InputFile& input, UFILE *output) { setNullFlush(false); - while(!feof(input)) + while(!input.eof()) { postgeneration(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::intergeneration_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::intergeneration_wrapper_null_flush(InputFile& input, UFILE *output) { setNullFlush(false); - while (!feof(input)) + while (!input.eof()) { intergeneration(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if (code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::transliteration_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::transliteration_wrapper_null_flush(InputFile& input, UFILE *output) { setNullFlush(false); - while(!feof(input)) + while(!input.eof()) { transliteration(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::tm_analysis(FILE *input, FILE *output) +FSTProcessor::tm_analysis(InputFile& input, UFILE *output) { State current_state = initial_state; - wstring lf = L""; //lexical form - wstring sf = L""; //surface form + UString lf; //lexical form + UString sf; //surface form int last = 0; - while(wchar_t val = readTMAnalysis(input)) + while(int32_t val = readTMAnalysis(input)) { // test for final states if(current_state.isFinal(all_finals)) { - if(iswpunct(val)) + if(u_ispunct(val)) { lf = current_state.filterFinalsTM(all_finals, alphabet, escaped_chars, @@@ -1579,13 -1919,20 +1592,13 @@@ numbers.clear(); } } - else if(sf == L"" && iswspace(val)) + else if(sf.empty() && u_isspace(val)) { lf.append(sf); last = input_buffer.getPos(); } - if(!iswupper(val)) - { - current_state.step(val); - } - else - { - current_state.step(val, towlower(val)); - } + current_state.step_case(val, false); if(current_state.size() != 0) { @@@ -1593,7 -1940,7 +1606,7 @@@ { sf.append(numbers[numbers.size()-1]); } - else if(isLastBlankTM && val == L' ') + else if(isLastBlankTM && val == ' ') { sf.append(blankqueue.back()); } @@@ -1604,9 -1951,9 +1617,9 @@@ } else { - if((iswspace(val) || iswpunct(val)) && sf == L"") + if((u_isspace(val) || u_ispunct(val)) && sf.empty()) { - if(iswspace(val)) + if(u_isspace(val)) { printSpace(val, output); } @@@ -1614,14 -1961,14 +1627,14 @@@ { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } - else if(!iswspace(val) && !iswpunct(val) && + else if(!u_isspace(val) && !u_ispunct(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || - lf == L"")) + lf.empty())) { do @@@ -1630,7 -1977,7 +1643,7 @@@ { sf.append(numbers[numbers.size()-1]); } - else if(isLastBlankTM && val == L' ') + else if(isLastBlankTM && val == ' ') { sf.append(blankqueue.back()); } @@@ -1639,16 -1986,16 +1652,16 @@@ alphabet.getSymbol(sf, val); } } - while((val = readTMAnalysis(input)) && !iswspace(val) && !iswpunct(val)); + while((val = readTMAnalysis(input)) && !u_isspace(val) && !u_ispunct(val)); if(val == 0) { - fputws_unlocked(sf.c_str(), output); + write(sf, output); return; } input_buffer.back(1); - fputws_unlocked(sf.c_str(), output); + write(sf, output); while(blankqueue.size() > 0) { @@@ -1660,22 -2007,22 +1673,22 @@@ } /* - unsigned int limit = sf.find(L' '); + unsigned int limit = sf.find(' '); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(sf.substr(0, limit).c_str(), output); + write(sf.substr(0, limit), output); */ } - else if(lf == L"") + else if(lf.empty()) { -/* unsigned int limit = sf.find(L' '); +/* unsigned int limit = sf.find(' '); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(sf.substr(0, limit).c_str(), output); + write(sf.substr(0, limit), output); */ input_buffer.back(1); - fputws_unlocked(sf.c_str(), output); + write(sf, output); while(blankqueue.size() > 0) { @@@ -1689,14 -2036,16 +1702,14 @@@ } else { - fputwc_unlocked(L'[', output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L']', output); + u_fprintf(output, "[%S]", lf.c_str()); input_buffer.setPos(last); input_buffer.back(1); } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); } } @@@ -1706,7 -2055,7 +1719,7 @@@ void -FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) +FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode) { if(getNullFlush()) { @@@ -1714,24 -2063,24 +1727,24 @@@ } State current_state = initial_state; - wstring sf = L""; + UString sf; outOfWord = false; - skipUntil(input, output, L'^'); + skipUntil(input, output, '^'); int val; while((val = readGeneration(input, output)) != 0x7fffffff) { - if(sf == L"" && val == L'=') + if(sf.empty() && val == '=') { - fputwc(L'=', output); + u_fputc('=', output); val = readGeneration(input, output); } - if(val == L'$' && outOfWord) + if(val == '$' && outOfWord) { - if(sf[0] == L'*' || sf[0] == L'%') + if(sf[0] == '*' || sf[0] == '%') { if(mode != gm_clean && mode != gm_tagged_nm) { @@@ -1743,14 -2092,14 +1756,14 @@@ } else if(mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(removeTags(sf.substr(1)), output); - fputwc_unlocked(L'/', output); + u_fputc('/', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } - else if(sf[0] == L'@') + else if(sf[0] == '@') { if(mode == gm_all) { @@@ -1770,11 -2119,11 +1783,11 @@@ } else if(mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(removeTags(sf.substr(1)), output); - fputwc_unlocked(L'/', output); + u_fputc('/', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } else if(current_state.isFinal(all_finals)) @@@ -1782,24 -2131,24 +1795,24 @@@ bool firstupper = false, uppercase = false; if(!dictionaryCase) { - uppercase = sf.size() > 1 && iswupper(sf[1]); - firstupper= iswupper(sf[0]); + uppercase = sf.size() > 1 && u_isupper(sf[1]); + firstupper= u_isupper(sf[0]); } if(mode == gm_tagged || mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); } - fputws_unlocked(current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper).substr(1).c_str(), output); + write(current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper).substr(1), output); if(mode == gm_tagged || mode == gm_tagged_nm) { - fputwc_unlocked(L'/', output); + u_fputc('/', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } @@@ -1807,7 -2156,7 +1820,7 @@@ { if(mode == gm_all) { - fputwc_unlocked(L'#', output); + u_fputc('#', output); writeEscaped(sf, output); } else if(mode == gm_clean) @@@ -1816,36 -2165,36 +1829,36 @@@ } else if(mode == gm_unknown) { - if(sf != L"") + if(!sf.empty()) { - fputwc_unlocked(L'#', output); + u_fputc('#', output); writeEscaped(removeTags(sf), output); } } else if(mode == gm_tagged) { - fputwc_unlocked(L'#', output); + u_fputc('#', output); writeEscaped(removeTags(sf), output); } else if(mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(removeTags(sf), output); - fputwc_unlocked(L'/', output); - fputwc_unlocked(L'#', output); + u_fputc('/', output); + u_fputc('#', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } current_state = initial_state; - sf = L""; + sf.clear(); } - else if(iswspace(val) && sf.size() == 0) + else if(u_isspace(val) && sf.size() == 0) { // do nothing } - else if(sf.size() > 0 && (sf[0] == L'*' || sf[0] == L'%' )) + else if(sf.size() > 0 && (sf[0] == '*' || sf[0] == '%' )) { alphabet.getSymbol(sf, val); } @@@ -1854,15 -2203,15 +1867,15 @@@ alphabet.getSymbol(sf,val); if(current_state.size() > 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { if(mode == gm_carefulcase) { - current_state.step_careful(val, towlower(val)); + current_state.step_careful(val, u_tolower(val)); } else { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } } else @@@ -1875,7 -2224,7 +1888,7 @@@ } void -FSTProcessor::postgeneration(FILE *input, FILE *output) +FSTProcessor::postgeneration(InputFile& input, UFILE *output) { if(getNullFlush()) { @@@ -1886,14 -2235,14 +1899,14 @@@ collect_wblanks = false; need_end_wblank = false; State current_state = initial_state; - wstring lf = L""; - wstring sf = L""; + UString lf; + UString sf; int last = 0; - set empty_escaped_chars; + set empty_escaped_chars; - while(wchar_t val = readPostgeneration(input, output)) + while(UChar val = readPostgeneration(input, output)) { - if(val == L'~') + if(val == '~') { skip_mode = false; collect_wblanks = true; @@@ -1905,11 -2254,11 +1918,11 @@@ } else if(skip_mode) { - if(iswspace(val)) + if(u_isspace(val)) { if(need_end_wblank) { - fputws_unlocked(L"[[/]]", output); + write(WBLANK_FINAL, output); need_end_wblank = false; } @@@ -1924,13 -2273,13 +1937,13 @@@ if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); if(need_end_wblank) { - fputws_unlocked(L"[[/]]", output); + write(WBLANK_FINAL, output); need_end_wblank = false; } } @@@ -1945,8 -2294,8 +1958,8 @@@ // test for final states if(current_state.isFinal(all_finals)) { - bool firstupper = iswupper(sf[1]); - bool uppercase = sf.size() > 1 && firstupper && iswupper(sf[2]); + bool firstupper = u_isupper(sf[1]); + bool uppercase = sf.size() > 1 && firstupper && u_isupper(sf[2]); lf = current_state.filterFinals(all_finals, alphabet, empty_escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, @@@ -1954,7 -2303,7 +1967,7 @@@ // case of the beggining of the next word - wstring mybuf = L""; + UString mybuf; for(size_t i = sf.size(); i > 0; --i) { if(!isalpha(sf[i-1])) @@@ -1969,8 -2318,8 +1982,8 @@@ if(mybuf.size() > 0) { - bool myfirstupper = iswupper(mybuf[0]); - bool myuppercase = mybuf.size() > 1 && iswupper(mybuf[1]); + bool myfirstupper = u_isupper(mybuf[0]); + bool myuppercase = mybuf.size() > 1 && u_isupper(mybuf[1]); for(size_t i = lf.size(); i > 0; --i) { @@@ -1978,11 -2327,11 +1991,11 @@@ { if(myfirstupper && i != lf.size()) { - lf[i] = towupper(lf[i]); + lf[i] = u_toupper(lf[i]); } else { - lf[i] = towlower(lf[i]); + lf[i] = u_tolower(lf[i]); } break; } @@@ -1990,11 -2339,11 +2003,11 @@@ { if(myuppercase) { - lf[i-1] = towupper(lf[i-1]); + lf[i-1] = u_toupper(lf[i-1]); } else { - lf[i-1] = towlower(lf[i-1]); + lf[i-1] = u_tolower(lf[i-1]); } } } @@@ -2003,7 -2352,14 +2016,7 @@@ last = input_buffer.getPos(); } - if(!iswupper(val) || caseSensitive) - { - current_state.step(val); - } - else - { - current_state.step(val, towlower(val)); - } + current_state.step_case(val, caseSensitive); if(current_state.size() != 0) { @@@ -2011,51 -2367,51 +2024,51 @@@ } else { - wstring final_wblank = combineWblanks(); - fputws_unlocked(final_wblank.c_str(), output); + UString final_wblank = combineWblanks(); + write(final_wblank, output); - if(lf == L"") + if(lf.empty()) { unsigned int mark = sf.size(); unsigned int space_index = sf.size(); - + for(unsigned int i = 1, limit = sf.size(); i < limit; i++) { - if(sf[i] == L'~') + if(sf[i] == '~') { mark = i; break; } - else if(sf[i] == L' ') + else if(sf[i] == ' ') { space_index = i; } } - + if(space_index != sf.size()) { - fputws_unlocked(sf.substr(1, space_index-1).c_str(), output); - + write(sf.substr(1, space_index-1), output); + if(need_end_wblank) { - fputws_unlocked(L"[[/]]", output); + write(WBLANK_FINAL, output); need_end_wblank = false; - fputwc_unlocked(sf[space_index], output); + u_fputc(sf[space_index], output); flushWblanks(output); } else { - fputwc_unlocked(sf[space_index], output); + u_fputc(sf[space_index], output); } - - fputws_unlocked(sf.substr(space_index+1, mark-space_index-1).c_str(), output); + + write(sf.substr(space_index+1, mark-space_index-1), output); } else { flushWblanks(output); - fputws_unlocked(sf.substr(1, mark-1).c_str(), output); + write(sf.substr(1, mark-1), output); } - + if(mark == sf.size()) { input_buffer.back(1); @@@ -2067,11 -2423,11 +2080,11 @@@ } else { - fputws_unlocked(lf.substr(1,lf.size()-3).c_str(), output); + write(lf.substr(1,lf.size()-3), output); input_buffer.setPos(last); input_buffer.back(2); val = lf[lf.size()-2]; - if(iswspace(val)) + if(u_isspace(val)) { printSpace(val, output); } @@@ -2079,15 -2435,15 +2092,15 @@@ { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); skip_mode = true; collect_wblanks = false; } @@@ -2099,7 -2455,7 +2112,7 @@@ } void -FSTProcessor::intergeneration(FILE *input, FILE *output) +FSTProcessor::intergeneration(InputFile& input, UFILE *output) { if (getNullFlush()) { @@@ -2108,35 -2464,35 +2121,35 @@@ bool skip_mode = true; State current_state = initial_state; - wstring target = L""; - wstring source = L""; + UString target; + UString source; int last = 0; - set empty_escaped_chars; + set empty_escaped_chars; while (true) { - wchar_t val = readPostgeneration(input, output); + UChar val = readPostgeneration(input, output); - if (val == L'~') + if (val == '~') { skip_mode = false; } if (skip_mode) { - if (iswspace(val)) + if (u_isspace(val)) { printSpace(val, output); } else { - if(val != L'\0') + if(val != '\0') { if (isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } } @@@ -2145,8 -2501,8 +2158,8 @@@ // test for final states if (current_state.isFinal(all_finals)) { - bool firstupper = iswupper(source[1]); - bool uppercase = source.size() > 1 && firstupper && iswupper(source[2]); + bool firstupper = u_isupper(source[1]); + bool uppercase = source.size() > 1 && firstupper && u_isupper(source[2]); target = current_state.filterFinals(all_finals, alphabet, empty_escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, @@@ -2155,32 -2511,39 +2168,32 @@@ last = input_buffer.getPos(); } - if (val != L'\0') + if (val != '\0') { - if (!iswupper(val) || caseSensitive) - { - current_state.step(val); - } - else - { - current_state.step(val, towlower(val)); - } + current_state.step_case(val, caseSensitive); } - if (val != L'\0' && current_state.size() != 0) + if (val != '\0' && current_state.size() != 0) { alphabet.getSymbol(source, val); } else { - if (target == L"") // no match + if (target.empty()) // no match { - if (val == L'\0') + if (val == '\0') { // flush source - fputws_unlocked(source.c_str(), output); + write(source, output); } else { - fputwc_unlocked(source[0], output); + u_fputc(source[0], output); unsigned int mark, limit; - for (mark = 1, limit = source.size(); mark < limit && source[mark] != L'~' ; mark++) + for (mark = 1, limit = source.size(); mark < limit && source[mark] != '~' ; mark++) { - fputwc_unlocked(source[mark], output); + u_fputc(source[mark], output); } if (mark != source.size()) @@@ -2189,20 -2552,20 +2202,20 @@@ input_buffer.back(back); } - if (val == L'~') + if (val == '~') { input_buffer.back(1); } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } } } else { for(unsigned int i=1; i 1 && firstupper && iswupper(sf[2]); + bool firstupper = u_isupper(sf[1]); + bool uppercase = sf.size() > 1 && firstupper && u_isupper(sf[2]); lf = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper, 0); if(!lf.empty()) { - fputws_unlocked(lf.substr(1).c_str(), output); + write(lf.substr(1), output); current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); } - if(iswspace(val)) + if(u_isspace(val)) { printSpace(val, output); } @@@ -2277,17 -2640,17 +2290,17 @@@ { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } else { if(current_state.isFinal(all_finals)) { - bool firstupper = iswupper(sf[1]); - bool uppercase = sf.size() > 1 && firstupper && iswupper(sf[2]); + bool firstupper = u_isupper(sf[1]); + bool uppercase = sf.size() > 1 && firstupper && u_isupper(sf[2]); lf = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper, 0); @@@ -2303,14 -2666,14 +2316,14 @@@ { if(!lf.empty()) { - fputws_unlocked(lf.substr(1).c_str(), output); + write(lf.substr(1), output); input_buffer.setPos(last); input_buffer.back(1); val = lf[lf.size()-1]; } else { - if(iswspace(val)) + if(u_isspace(val)) { printSpace(val, output); } @@@ -2318,14 -2681,14 +2331,14 @@@ { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); } } } @@@ -2333,14 -2696,14 +2346,14 @@@ flushBlanks(output); } -wstring -FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) +UString +FSTProcessor::biltransfull(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; - wstring queue = L""; + UString queue; bool mark = false; if(with_delim == false) @@@ -2349,37 -2712,37 +2362,37 @@@ end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { return input_word; } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; } - bool firstupper = iswupper(input_word[start_point]); - bool uppercase = firstupper && iswupper(input_word[start_point+1]); + bool firstupper = u_isupper(input_word[start_point]); + bool uppercase = firstupper && u_isupper(input_word[start_point+1]); for(unsigned int i = start_point; i <= end_point; i++) { int val; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@@ -2389,13 -2752,13 +2402,13 @@@ } else { - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } else { @@@ -2404,22 -2767,37 +2417,22 @@@ } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = L"^="+result.substr(1); - } - else - { - result[0] = L'^'; - } + result.clear(); + if(with_delim) { + result += '^'; } - else - { - if(mark) - { - result = L"=" + result.substr(1); - } - else - { - result = result.substr(1); - } + if(mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) { - if(symbol != L"" && result != L"") + if(!symbol.empty() && !result.empty()) { queue.append(symbol); } @@@ -2428,11 -2806,11 +2441,11 @@@ // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } return result; } @@@ -2441,23 -2819,23 +2454,23 @@@ if(start_point < (end_point - 3)) { - return L"^$"; + return "^$"_u; } // attach unmatched queue automatically - if(queue != L"") + if(!queue.empty()) { - wstring result_with_queue = L""; + UString result_with_queue; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { switch(result[i]) { - case L'\\': - result_with_queue += L'\\'; + case '\\': + result_with_queue += '\\'; i++; break; - case L'/': + case '/': result_with_queue.append(queue); break; @@@ -2470,7 -2848,7 +2483,7 @@@ if(with_delim) { - result_with_queue += L'$'; + result_with_queue += '$'; } return result_with_queue; } @@@ -2478,7 -2856,7 +2491,7 @@@ { if(with_delim) { - result += L'$'; + result += '$'; } return result; } @@@ -2486,14 -2864,14 +2499,14 @@@ -wstring -FSTProcessor::biltrans(wstring const &input_word, bool with_delim) +UString +FSTProcessor::biltrans(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; - wstring queue = L""; + UString queue; bool mark = false; if(with_delim == false) @@@ -2502,37 -2880,37 +2515,37 @@@ end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { return input_word; } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; } - bool firstupper = iswupper(input_word[start_point]); - bool uppercase = firstupper && iswupper(input_word[start_point+1]); + bool firstupper = u_isupper(input_word[start_point]); + bool uppercase = firstupper && u_isupper(input_word[start_point+1]); for(unsigned int i = start_point; i <= end_point; i++) { int val; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@@ -2542,13 -2920,13 +2555,13 @@@ } else { - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } else { @@@ -2557,22 -2935,37 +2570,22 @@@ } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = L"^="+result.substr(1); - } - else - { - result[0] = L'^'; - } + result.clear(); + if (with_delim) { + result += '^'; } - else - { - if(mark) - { - result = L"=" + result.substr(1); - } - else - { - result = result.substr(1); - } + if (mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) { - if(symbol != L"" && result != L"") + if(!symbol.empty() && !result.empty()) { queue.append(symbol); } @@@ -2581,11 -2974,11 +2594,11 @@@ // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } return result; } @@@ -2594,19 -2987,19 +2607,19 @@@ // attach unmatched queue automatically - if(queue != L"") + if(!queue.empty()) { - wstring result_with_queue = L""; + UString result_with_queue; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { switch(result[i]) { - case L'\\': - result_with_queue += L'\\'; + case '\\': + result_with_queue += '\\'; i++; break; - case L'/': + case '/': result_with_queue.append(queue); break; @@@ -2619,7 -3012,7 +2632,7 @@@ if(with_delim) { - result_with_queue += L'$'; + result_with_queue += '$'; } return result_with_queue; } @@@ -2627,53 -3020,54 +2640,53 @@@ { if(with_delim) { - result += L'$'; + result += '$'; } return result; } } void -FSTProcessor::bilingual_wrapper_null_flush(FILE *input, FILE *output, GenerationMode mode) +FSTProcessor::bilingual_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode) { setNullFlush(false); nullFlushGeneration = true; - while(!feof(input)) + while(!input.eof()) { bilingual(input, output, mode); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } -wstring -FSTProcessor::compose(wstring const &lexforms, wstring const &queue) const +UString +FSTProcessor::compose(UString const &lexforms, UString const &queue) const { - wstring result = L""; + UString result; + result.reserve(lexforms.size() + 2 * queue.size()); + result += '/'; for(unsigned int i = 1; i< lexforms.size(); i++) { - if(lexforms[i] == L'\\') + if(lexforms[i] == '\\') { - result += L'\\'; + result += '\\'; i++; } - else if(lexforms[i] == L'/') + else if(lexforms[i] == '/') { result.append(queue); } result += lexforms[i]; } - return L"/" + result + queue; + result += queue; + return result; } void -FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) +FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode) { if(getNullFlush()) { @@@ -2681,20 -3075,20 +2694,20 @@@ } State current_state = initial_state; - wstring sf = L""; // source language analysis - wstring queue = L""; // symbols to be added to each target - wstring result = L""; // result of looking up analysis in bidix + UString sf; // source language analysis + UString queue; // symbols to be added to each target + UString result; // result of looking up analysis in bidix outOfWord = false; - skipUntil(input, output, L'^'); - pair tr; // readBilingual return value, containing: + skipUntil(input, output, '^'); + pair tr; // readBilingual return value, containing: int val; // the alphabet value of current symbol, and - wstring symbol = L""; // the current symbol as a string + UString symbol; // the current symbol as a string bool seentags = false; // have we seen any tags at all in the analysis? bool seensurface = false; - wstring surface = L""; + UString surface; while(true) // ie. while(val != 0x7fffffff) { @@@ -2702,17 -3096,17 +2715,17 @@@ symbol = tr.first; val = tr.second; - //fwprintf(stderr, L"> %ls : %lc : %d\n", tr.first.c_str(), tr.second, tr.second); + //fwprintf(stderr, "> %ls : %lc : %d\n", tr.first.c_str(), tr.second, tr.second); if(biltransSurfaceForms && !seensurface && !outOfWord) { - while(val != L'/' && val != 0x7fffffff) + while(val != '/' && val != 0x7fffffff) { surface = surface + symbol; alphabet.getSymbol(surface, val); tr = readBilingual(input, output); symbol = tr.first; val = tr.second; - //fwprintf(stderr, L" == %ls : %lc : %d => %ls\n", symbol.c_str(), val, val, surface.c_str()); + //fwprintf(stderr, " == %ls : %lc : %d => %ls\n", symbol.c_str(), val, val, surface.c_str()); } seensurface = true; tr = readBilingual(input, output); @@@ -2725,12 -3119,12 +2738,12 @@@ break; } - if(val == L'$' && outOfWord) + if(val == '$' && outOfWord) { if(!seentags) // if no tags: only return complete matches { - bool uppercase = sf.size() > 1 && iswupper(sf[1]); - bool firstupper= iswupper(sf[0]); + bool uppercase = sf.size() > 1 && u_isupper(sf[1]); + bool firstupper= u_isupper(sf[0]); result = current_state.filterFinals(all_finals, alphabet, escaped_chars, @@@ -2738,16 -3132,16 +2751,16 @@@ uppercase, firstupper, 0); } - if(sf[0] == L'*') + if(sf[0] == '*') { if (mode == gm_clean) { - printWordBilingual(sf, L"/" + sf.substr(1), output); + printWordBilingual(sf, "/"_u + sf.substr(1), output); } else { - printWordBilingual(sf, L"/" + sf, output); + printWordBilingual(sf, "/"_u + sf, output); } } - else if(result != L"") + else if(!result.empty()) { printWordBilingual(sf, compose(result, queue), output); } @@@ -2755,30 -3149,30 +2768,30 @@@ { //xxx if(biltransSurfaceForms) { - printWordBilingual(surface, L"/@"+surface, output); + printWordBilingual(surface, "/@"_u + surface, output); } else { - printWordBilingual(sf, L"/@"+sf, output); + printWordBilingual(sf, "/@"_u + sf, output); } } seensurface = false; - surface = L""; - queue = L""; - result = L""; + surface.clear(); + queue.clear(); + result.clear(); current_state = initial_state; - sf = L""; + sf.clear(); seentags = false; } - else if(iswspace(val) && sf.size() == 0) + else if(u_isspace(val) && sf.size() == 0) { // do nothing } - else if(sf.size() > 0 && sf[0] == L'*') + else if(sf.size() > 0 && sf[0] == '*') { if(escaped_chars.find(val) != escaped_chars.end()) { - sf += L'\\'; + sf += '\\'; } alphabet.getSymbol(sf, val); // add symbol to sf iff alphabetic if(val == 0) // non-alphabetic, possibly unknown tag; add to sf @@@ -2790,7 -3184,7 +2803,7 @@@ { if(escaped_chars.find(val) != escaped_chars.end()) { - sf += L'\\'; + sf += '\\'; } alphabet.getSymbol(sf, val); // add symbol to sf iff alphabetic if(val == 0) // non-alphabetic, possibly unknown tag; add to sf @@@ -2803,9 -3197,9 +2816,9 @@@ } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } else { @@@ -2814,16 -3208,16 +2827,16 @@@ } if(current_state.isFinal(all_finals)) { - bool uppercase = sf.size() > 1 && iswupper(sf[1]); - bool firstupper= iswupper(sf[0]); + bool uppercase = sf.size() > 1 && u_isupper(sf[1]); + bool firstupper= u_isupper(sf[0]); - queue = L""; // the intervening tags were matched + queue.clear(); // the intervening tags were matched result = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper, 0); } - else if(result != L"") + else if(!result.empty()) { // We already have a result, but there is still more to read // of the analysis; following tags are not consumed, but @@@ -2840,21 -3234,21 +2853,21 @@@ else if(current_state.size() == 0) { // There are no more alive transductions and the current symbol is not a tag -- unknown word! - result = L""; + result.clear(); } } } } } -pair -FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) +pair +FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; - wstring queue = L""; + UString queue; bool mark = false; bool seentags = false; // have we seen any tags at all in the analysis? @@@ -2864,38 -3258,38 +2877,38 @@@ end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { - return pair(input_word, 0); + return pair(input_word, 0); } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; } - bool firstupper = iswupper(input_word[start_point]); - bool uppercase = firstupper && iswupper(input_word[start_point+1]); + bool firstupper = u_isupper(input_word[start_point]); + bool uppercase = firstupper && u_isupper(input_word[start_point+1]); for(unsigned int i = start_point; i <= end_point; i++) { int val = 0; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; val = input_word[i]; } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { seentags = true; - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@@ -2909,9 -3303,9 +2922,9 @@@ } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } else { @@@ -2920,22 -3314,37 +2933,22 @@@ } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = L"^=" + result.substr(1); - } - else - { - result[0] = L'^'; - } + result.clear(); + if (with_delim) { + result += '^'; } - else - { - if(mark) - { - result = L"=" + result.substr(1); - } - else - { - result = result.substr(1); - } + if (mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) { - if(symbol != L"" && result != L"") + if(!symbol.empty() && !result.empty()) { queue.append(symbol); } @@@ -2944,51 -3353,52 +2957,51 @@@ // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } - return pair(result, 0); + return pair(result, 0); } } } if (!seentags - && L"" == current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0)) + && current_state.filterFinals(all_finals, alphabet, escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).empty()) { // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } - return pair(result, 0); + return pair(result, 0); } // attach unmatched queue automatically - if(queue != L"") + if(!queue.empty()) { - wstring result_with_queue = L""; + UString result_with_queue; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { switch(result[i]) { - case L'\\': - result_with_queue += L'\\'; + case '\\': + result_with_queue += '\\'; i++; break; - case L'/': + case '/': result_with_queue.append(queue); break; @@@ -3001,25 -3411,25 +3014,25 @@@ if(with_delim) { - result_with_queue += L'$'; + result_with_queue += '$'; } - return pair(result_with_queue, queue.size()); + return pair(result_with_queue, queue.size()); } else { if(with_delim) { - result += L'$'; + result += '$'; } - return pair(result, 0); + return pair(result, 0); } } -wstring -FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) +UString +FSTProcessor::biltransWithoutQueue(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; bool mark = false; @@@ -3030,37 -3440,37 +3043,37 @@@ end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { return input_word; } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; } - bool firstupper = iswupper(input_word[start_point]); - bool uppercase = firstupper && iswupper(input_word[start_point+1]); + bool firstupper = u_isupper(input_word[start_point]); + bool uppercase = firstupper && u_isupper(input_word[start_point+1]); for(unsigned int i = start_point; i <= end_point; i++) { int val; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@@ -3070,13 -3480,13 +3083,13 @@@ } else { - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } else { @@@ -3085,31 -3495,46 +3098,31 @@@ } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = L"^=" + result.substr(1); - } - else - { - result[0] = L'^'; - } + result.clear(); + if (with_delim) { + result += '^'; } - else - { - if(mark) - { - result = L"=" + result.substr(1); - } - else - { - result = result.substr(1); - } + if (mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) { - if(symbol == L"") + if(symbol.empty()) { // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } return result; } @@@ -3118,7 -3543,7 +3131,7 @@@ if(with_delim) { - result += L'$'; + result += '$'; } return result; } @@@ -3129,16 -3554,16 +3142,16 @@@ FSTProcessor::valid() cons { if(initial_state.isFinal(all_finals)) { - wcerr << L"Error: Invalid dictionary (hint: the left side of an entry is empty)" << endl; + cerr << "Error: Invalid dictionary (hint: the left side of an entry is empty)" << endl; return false; } else { State s = initial_state; - s.step(L' '); + s.step(' '); if(s.size() != 0) { - wcerr << L"Error: Invalid dictionary (hint: entry beginning with whitespace)" << endl; + cerr << "Error: Invalid dictionary (hint: entry beginning with whitespace)" << endl; return false; } } @@@ -3147,45 -3572,45 +3160,45 @@@ } int -FSTProcessor::readSAO(FILE *input) +FSTProcessor::readSAO(InputFile& input) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); - if(feof(input)) + UChar32 val = input.get(); + if(input.eof()) { return 0; } if(escaped_chars.find(val) != escaped_chars.end()) { - if(val == L'<') + if(val == '<') { - wstring str = readFullBlock(input, L'<', L'>'); - if(str.substr(0, 9) == L"'); + if(str.substr(0, 9) == "") + while(str.substr(str.size()-3) != "]]>"_u) { - str.append(readFullBlock(input, L'<', L'>').substr(1)); + str.append(readFullBlock(input, '<', '>').substr(1)); } blankqueue.push(str); - input_buffer.add(static_cast(L' ')); - return static_cast(L' '); + input_buffer.add(static_cast(' ')); + return static_cast(' '); } else { streamError(); } } - else if (val == L'\\') { - val = static_cast(fgetwc_unlocked(input)); + else if (val == '\\') { + val = input.get(); if(isEscaped(val)) { input_buffer.add(val); - return static_cast(val); + return static_cast(val); } else streamError(); @@@ -3196,47 -3621,47 +3209,47 @@@ } } - input_buffer.add(val); - return static_cast(val); + input_buffer.add(static_cast(val)); + return static_cast(val); } void -FSTProcessor::printSAOWord(wstring const &lf, FILE *output) +FSTProcessor::printSAOWord(UString const &lf, UFILE *output) { for(unsigned int i = 1, limit = lf.size(); i != limit; i++) { - if(lf[i] == L'/') + if(lf[i] == '/') { break; } - fputwc_unlocked(lf[i], output); + u_fputc(lf[i], output); } } void -FSTProcessor::SAO(FILE *input, FILE *output) +FSTProcessor::SAO(InputFile& input, UFILE *output) { bool last_incond = false; bool last_postblank = false; State current_state = initial_state; - wstring lf = L""; - wstring sf = L""; + UString lf; + UString sf; int last = 0; escaped_chars.clear(); - escaped_chars.insert(static_cast(L'\\')); - escaped_chars.insert(static_cast(L'<')); - escaped_chars.insert(static_cast(L'>')); + escaped_chars.insert('\\'); + escaped_chars.insert('<'); + escaped_chars.insert('>'); - while(wchar_t val = readSAO(input)) + while(UChar32 val = readSAO(input)) { // test for final states if(current_state.isFinal(all_finals)) { if(current_state.isFinal(inconditional)) { - bool firstupper = iswupper(sf[0]); - bool uppercase = firstupper && iswupper(sf[sf.size()-1]); + bool firstupper = u_isupper(sf[0]); + bool uppercase = firstupper && u_isupper(sf[sf.size()-1]); lf = current_state.filterFinalsSAO(all_finals, alphabet, escaped_chars, @@@ -3246,8 -3671,8 +3259,8 @@@ } else if(current_state.isFinal(postblank)) { - bool firstupper = iswupper(sf[0]); - bool uppercase = firstupper && iswupper(sf[sf.size()-1]); + bool firstupper = u_isupper(sf[0]); + bool uppercase = firstupper && u_isupper(sf[sf.size()-1]); lf = current_state.filterFinalsSAO(all_finals, alphabet, escaped_chars, @@@ -3257,8 -3682,8 +3270,8 @@@ } else if(!isAlphabetic(val)) { - bool firstupper = iswupper(sf[0]); - bool uppercase = firstupper && iswupper(sf[sf.size()-1]); + bool firstupper = u_isupper(sf[0]); + bool uppercase = firstupper && u_isupper(sf[sf.size()-1]); lf = current_state.filterFinalsSAO(all_finals, alphabet, escaped_chars, @@@ -3268,16 -3693,23 +3281,16 @@@ last = input_buffer.getPos(); } } - else if(sf == L"" && iswspace(val)) + else if(sf.empty() && u_isspace(val)) { - lf = L"/*"; + lf = "/*"_u; lf.append(sf); last_postblank = false; last_incond = false; last = input_buffer.getPos(); } - if(!iswupper(val) || caseSensitive) - { - current_state.step(val); - } - else - { - current_state.step(val, towlower(val)); - } + current_state.step_case(val, caseSensitive); if(current_state.size() != 0) { @@@ -3285,9 -3717,9 +3298,9 @@@ } else { - if(!isAlphabetic(val) && sf == L"") + if(!isAlphabetic(val) && sf.empty()) { - if(iswspace(val)) + if(u_isspace(val)) { printSpace(val, output); } @@@ -3295,9 -3727,9 +3308,9 @@@ { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } else if(last_incond) @@@ -3309,13 -3741,13 +3322,13 @@@ else if(last_postblank) { printSAOWord(lf, output); - fputwc_unlocked(L' ', output); + u_fputc(' ', output); input_buffer.setPos(last); input_buffer.back(1); } else if(isAlphabetic(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || - lf == L"")) + lf.empty())) { do { @@@ -3325,17 -3757,21 +3338,17 @@@ unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(L"", output); - fputws_unlocked(sf.c_str(), output); - fputws_unlocked(L"", output); + u_fprintf(output, "%S", sf.c_str()); } - else if(lf == L"") + else if(lf.empty()) { unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(L"", output); - fputws_unlocked(sf.c_str(), output); - fputws_unlocked(L"", output); + u_fprintf(output, "%S", sf.c_str()); } else { @@@ -3345,8 -3781,8 +3358,8 @@@ } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); last_incond = false; last_postblank = false; } @@@ -3356,12 -3792,12 +3369,12 @@@ flushBlanks(output); } -wstring -FSTProcessor::removeTags(wstring const &str) +UString +FSTProcessor::removeTags(UString const &str) { for(unsigned int i = 0; i < str.size(); i++) { - if(str[i] == L'<' && i >=1 && str[i-1] != L'\\') + if(str[i] == '<' && i >=1 && str[i-1] != '\\') { return str.substr(0, i); } @@@ -3444,7 -3880,7 +3457,7 @@@ FSTProcessor::getNullFlush( } size_t -FSTProcessor::firstNotAlpha(wstring const &sf) +FSTProcessor::firstNotAlpha(UString const &sf) { for(size_t i = 0, limit = sf.size(); i < limit; i++) { @@@ -3454,5 -3890,5 +3467,5 @@@ } } - return wstring::npos; + return UString::npos; } diff --combined lttoolbox/fst_processor.h index 6e5c218,628356d..76d3783 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@@ -18,20 -18,19 +18,20 @@@ #ifndef _FSTPROCESSOR_ #define _FSTPROCESSOR_ +#include #include #include -#include #include #include #include +#include #include -#include #include #include #include #include +#include using namespace std; @@@ -57,7 -56,7 +57,7 @@@ private /** * Transducers in FSTP */ - map transducers; + map transducers; /** * Current state of lexical analysis @@@ -102,27 -101,27 +102,27 @@@ /** * Queue of blanks, used in reading methods */ - queue blankqueue; + queue blankqueue; /** * Queue of wordbound blanks, used in reading methods */ - queue wblankqueue; + queue wblankqueue; /** * Set of characters being considered alphabetics */ - set alphabetic_chars; + set alphabetic_chars; /** * Set of characters to escape with a backslash */ - set escaped_chars; + set escaped_chars; /** * Set of characters to ignore */ - set ignored_chars; + set ignored_chars; /** * Mapping of characters for simplistic diacritic restoration specified in RCX files @@@ -142,7 -141,7 +142,7 @@@ /** * Input buffer */ - Buffer input_buffer; + Buffer input_buffer; /** * Begin of the transducer @@@ -220,7 -219,7 +220,7 @@@ /** * Show or not the controls symbols (as compoundRSymbol) */ - bool showControlSymbols; + bool showControlSymbols; /** * Max compound elements @@@ -263,7 -262,7 +263,7 @@@ * @param input the stream to read from * @return code of the character */ - wchar_t readEscaped(FILE *input); + UChar32 readEscaped(InputFile& input); /** * Reads a block from the stream input, enclosed by delim1 and delim2 @@@ -271,13 -270,13 +271,13 @@@ * @param delim1 the delimiter of the beginning of the sequence * @param delim1 the delimiter of the end of the sequence */ - wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); + UString readFullBlock(InputFile& input, UChar32 const delim1, UChar32 const delim2); /** * Reads a wordbound blank from the stream input * @param input the stream being read */ - wstring readWblank(FILE *input); + UString readWblank(InputFile& input); /** * Reads a wordbound blank (opening blank to closing blank) from the stream input -> [[...]]xyz[[/]] @@@ -285,28 -284,28 +285,28 @@@ * @param output the stream to write on * @return true if the word enclosed by the wordbound blank has a ~ for postgeneration activation */ - bool wblankPostGen(FILE *input, FILE *output); + bool wblankPostGen(InputFile& input, UFILE *output); /** * Returns true if the character code is identified as alphabetic * @param c the code provided by the user * @return true if it's alphabetic */ - bool isAlphabetic(wchar_t const c) const; + bool isAlphabetic(UChar32 const c) const; /** * Tests if a character is in the set of escaped_chars * @param c the character code provided by the user * @return true if it is in the set */ - bool isEscaped(wchar_t const c) const; + bool isEscaped(UChar32 const c) const; /** * Read text from stream (analysis version) * @param input the stream to read * @return the next symbol in the stream */ - int readAnalysis(FILE *input); + int readAnalysis(InputFile& input); /** * Read text from stream (decomposition version) @@@ -314,7 -313,7 +314,7 @@@ * @param output the stream to write on * @return the next symbol in the stream */ - int readDecomposition(FILE *input, FILE *output); + int readDecomposition(InputFile& input, UFILE *output); /** * Read text from stream (postgeneration version) @@@ -322,7 -321,7 +322,7 @@@ * @param output the stream to write on * @return the next symbol in the stream */ - int readPostgeneration(FILE *input, FILE *output); + int readPostgeneration(InputFile& input, UFILE *output); /** * Read text from stream (generation version) @@@ -330,7 -329,7 +330,7 @@@ * @param output the stream being written to * @return the next symbol in the stream */ - int readGeneration(FILE *input, FILE *output); + int readGeneration(InputFile& input, UFILE *output); /** * Read text from stream (biltrans version) @@@ -338,32 -337,40 +338,40 @@@ * @param output the stream to write on * @return the queue of 0-symbols, and the next symbol in the stream */ - pair readBilingual(FILE *input, FILE *output); + pair readBilingual(InputFile& input, UFILE *output); /** * Read text from stream (SAO version) * @param input the stream to read * @return the next symbol in the stream */ - int readSAO(FILE *input); + int readSAO(InputFile& input); /** * Flush all the blanks remaining in the current process * @param output stream to write blanks */ - void flushBlanks(FILE *output); + void flushBlanks(UFILE *output); /** * Flush all the wordbound blanks remaining in the current process * @param output stream to write blanks */ - void flushWblanks(FILE *output); + void flushWblanks(UFILE *output); /** - * Combine wordbound blanks in the queue and return them + * Combine wordbound blanks in the queue and return them. + * + * May pop from 'wblankqueue' and set 'need_end_wblank' to true. + * + * If 'wblankqueue' (see which) is empty, we get an empty string, + * otherwise we return a semicolon-separated combination of opening + * wblanks in the queue. If there is only a closing wblank, we just + * set need_end_wblank. + * * @return final wblank string */ - wstring combineWblanks(); + UString combineWblanks(); /** * Calculate the initial state of parsing @@@ -380,7 -387,7 +388,7 @@@ * @param str the string to write, escaping characters * @param output the stream to write in */ - void writeEscaped(wstring const &str, FILE *output); + void writeEscaped(UString const &str, UFILE *output); /** * Write a string to an output stream. @@@ -391,7 -398,7 +399,7 @@@ * @param output the stream to write in * @return how many blanks to pop and print after printing lu */ - size_t writeEscapedPopBlanks(wstring const &str, FILE *output); + size_t writeEscapedPopBlanks(UString const &str, UFILE *output); /** * Write a string to an output stream, escaping all escapable characters @@@ -399,7 -406,7 +407,7 @@@ * @param str the string to write, escaping characters * @param output the stream to write in */ - void writeEscapedWithTags(wstring const &str, FILE *output); + void writeEscapedWithTags(UString const &str, UFILE *output); /** @@@ -408,7 -415,7 +416,7 @@@ * @param the searched suffix * @returns true if 'str' has the suffix 'suffix' */ - static bool endsWith(wstring const &str, wstring const &suffix); + static bool endsWith(UString const &str, UString const &suffix); /** * Prints a word @@@ -416,7 -423,7 +424,7 @@@ * @param lf lexical form of the word * @param output stream where the word is written */ - void printWord(wstring const &sf, wstring const &lf, FILE *output); + void printWord(UString const &sf, UString const &lf, UFILE *output); /** * Prints a word. @@@ -426,7 -433,7 +434,7 @@@ * @param lf lexical form of the word * @param output stream where the word is written */ - void printWordPopBlank(wstring const &sf, wstring const &lf, FILE *output); + void printWordPopBlank(UString const &sf, UString const &lf, UFILE *output); /** * Prints a word (Bilingual version) @@@ -434,7 -441,7 +442,7 @@@ * @param lf lexical form of the word * @param output stream where the word is written */ - void printWordBilingual(wstring const &sf, wstring const &lf, FILE *output); + void printWordBilingual(UString const &sf, UString const &lf, UFILE *output); /** @@@ -442,21 -449,21 +450,21 @@@ * @param lf lexical form * @param output stream where the word is written */ - void printSAOWord(wstring const &lf, FILE *output); + void printSAOWord(UString const &lf, UFILE *output); /** * Prints an unknown word * @param sf surface form of the word * @param output stream where the word is written */ - void printUnknownWord(wstring const &sf, FILE *output); + void printUnknownWord(UString const &sf, UFILE *output); void initDecompositionSymbols(); - vector numbers; - int readTMAnalysis(FILE *input); + vector numbers; + int readTMAnalysis(InputFile& input); - unsigned int lastBlank(wstring const &str); + unsigned int lastBlank(UString const &str); /** * Print one blankqueue item if there is one, or a given "space" value. @@@ -464,22 -471,23 +472,22 @@@ * @param val the space character to use if no blank queue * @param output stream where the word is written */ - void printSpace(wchar_t const val, FILE *output); + void printSpace(UChar const val, UFILE *output); - void skipUntil(FILE *input, FILE *output, wint_t const character); - static wstring removeTags(wstring const &str); - wstring compoundAnalysis(wstring str, bool uppercase, bool firstupper); - size_t firstNotAlpha(wstring const &sf); + void skipUntil(InputFile& input, UFILE *output, UChar32 const character); + static UString removeTags(UString const &str); + UString compoundAnalysis(UString str, bool uppercase, bool firstupper); + size_t firstNotAlpha(UString const &sf); - void analysis_wrapper_null_flush(FILE *input, FILE *output); - void lsx_wrapper_null_flush(FILE *input, FILE *output); - void bilingual_wrapper_null_flush(FILE *input, FILE *output, GenerationMode mode = gm_unknown); - void generation_wrapper_null_flush(FILE *input, FILE *output, + void analysis_wrapper_null_flush(InputFile& input, UFILE *output); + void bilingual_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); + void generation_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode); - void postgeneration_wrapper_null_flush(FILE *input, FILE *output); - void intergeneration_wrapper_null_flush(FILE *input, FILE *output); - void transliteration_wrapper_null_flush(FILE *input, FILE *output); + void postgeneration_wrapper_null_flush(InputFile& input, UFILE *output); + void intergeneration_wrapper_null_flush(InputFile& input, UFILE *output); + void transliteration_wrapper_null_flush(InputFile& input, UFILE *output); - wstring compose(wstring const &lexforms, wstring const &queue) const; + UString compose(UString const &lexforms, UString const &queue) const; void procNodeICX(); void procNodeRCX(); @@@ -489,21 -497,6 +497,21 @@@ xmlTextReaderPtr reader; public: + + /* + * String constants + */ + static UString const XML_TEXT_NODE; + static UString const XML_COMMENT_NODE; + static UString const XML_IGNORED_CHARS_ELEM; + static UString const XML_RESTORE_CHAR_ELEM; + static UString const XML_RESTORE_CHARS_ELEM; + static UString const XML_VALUE_ATTR; + static UString const XML_CHAR_ELEM; + static UString const WBLANK_START; + static UString const WBLANK_END; + static UString const WBLANK_FINAL; + FSTProcessor(); void initAnalysis(); @@@ -514,23 -507,25 +522,23 @@@ void initBiltrans(); void initDecomposition(); - void analysis(FILE *input = stdin, FILE *output = stdout); - void tm_analysis(FILE *input = stdin, FILE *output = stdout); - void generation(FILE *input = stdin, FILE *output = stdout, GenerationMode mode = gm_unknown); - void postgeneration(FILE *input = stdin, FILE *output = stdout); - void intergeneration(FILE *input = stdin, FILE *output = stdout); - void transliteration(FILE *input = stdin, FILE *output = stdout); - wstring biltrans(wstring const &input_word, bool with_delim = true); - wstring biltransfull(wstring const &input_word, bool with_delim = true); - void bilingual(FILE *input = stdin, FILE *output = stdout, GenerationMode mode = gm_unknown); - pair biltransWithQueue(wstring const &input_word, bool with_delim = true); - wstring biltransWithoutQueue(wstring const &input_word, bool with_delim = true); - void SAO(FILE *input = stdin, FILE *output = stdout); + void analysis(InputFile& input, UFILE *output); + void tm_analysis(InputFile& input, UFILE *output); + void generation(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); + void postgeneration(InputFile& input, UFILE *output); + void intergeneration(InputFile& input, UFILE *output); + void transliteration(InputFile& input, UFILE *output); + UString biltrans(UString const &input_word, bool with_delim = true); + UString biltransfull(UString const &input_word, bool with_delim = true); + void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); + pair biltransWithQueue(UString const &input_word, bool with_delim = true); + UString biltransWithoutQueue(UString const &input_word, bool with_delim = true); + void SAO(InputFile& input, UFILE *output); void parseICX(string const &file); void parseRCX(string const &file); void load(FILE *input); - void lsx(FILE *input, FILE *output); - bool valid() const; void setCaseSensitiveMode(bool const value); diff --combined tests/lt_proc/__init__.py index f975387,fca9df9..2de472a --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@@ -148,8 -148,8 +148,8 @@@ class PostgenerationBasicTest(ProcTest) "El perro ~de el amigo.", "abc ~les testword"] expectedOutputs = [ "xyz ejemplo u ho nombre.", - "xyz se la pelota.", - "El perro del amigo.", + "xyz se la pelota.", + "El perro del amigo.", "abc le pe test testword"] class PostgenerationWordboundBlankTest(ProcTest): @@@ -173,7 -173,11 +173,11 @@@ "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]~les[[/]] [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]"] + "[[t:b:Z9eiLA]]abc[[/]] ~les [[t:b:12bsa23]]pes[[/]] [[t:i:4_tPUA]]~de[[/]] [[t:b:Z9eiLA]]el[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[[t:text:NaNaNa]]pla~ss[[/]]", + "[[t:text:NaNaNa]]pla~sss[[/]]", + "[[t:text:NaNaNa]]pla~ssar[[/]]", + "[[t:text:NaNaNa]]pla~sssar[[/]]"] expectedOutputs = [ "xyz ejemplo [[t:i:123456; t:b:abc123; t:i:123456]]u ho[[/]] [[t:b:iopmnb]]nombre[[/]].", "xyz ejemplo [[t:b:poim230]]u ho[[/]] [[t:i:mnbj203]]nombre[[/]].", @@@ -193,7 -197,11 +197,11 @@@ "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]le pe test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", "[[t:b:Z9eiLA]]abc[[/]] [[t:i:123456; t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", - "[[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]"] + "[[t:b:Z9eiLA]]abc[[/]] [[t:b:12bsa23]]les pes test[[/]] [[t:i:4_tPUA; t:b:Z9eiLA]]del[[/]] [[t:i:wSM6RQ]]testword[[/]]", + "[[t:text:NaNaNa]]plass[[/]]", + "[[t:text:NaNaNa]]plass[[/]]", + "[[t:text:NaNaNa]]plassar[[/]]", + "[[t:text:NaNaNa]]plassar[[/]]"] class PostgenerationWordboundBlankEscapingTest(ProcTest): @@@ -220,24 -228,5 +228,24 @@@ class SpaceAtEOF(ProcTest) flushing = False +class NonBMPDixTest(ProcTest): + procdix = "data/non-bmp.dix" + inputs = ['𐅁𐅃𐅅', '𐅂𐅄𐅆'] + expectedOutputs = ['^𐅁𐅃𐅅/𐅁𐅃𐅅$', '^𐅂𐅄𐅆/𐅂𐅄𐅆$'] + + +class NonBMPATTTest(ProcTest): + procdix = "data/non-bmp.att" + inputs = ['𐅁𐅃𐅅', '𐅂𐅄𐅆'] + expectedOutputs = ['^𐅁𐅃𐅅/𐅁𐅃𐅅$', '^𐅂𐅄𐅆/𐅂𐅄𐅆$'] + + +class NonBMPGeneratorTest(ProcTest): + procdix = "data/non-bmp.att" + inputs = ['^𐅁𐅃𐅅$', '^𐅂𐅄𐅆$'] + expectedOutputs = ['𐅁𐅃𐅅', '𐅂𐅄𐅆'] + procflags = ['-z', '-g'] + procdir = "rl" + # These fail on some systems: #from null_flush_invalid_stream_format import *