commit b99672a0d97cb7458445d05cc90a5854a11c2f6c Author: Daniel Swanson Date: Tue Jun 1 17:14:21 2021 -0500 the long march part 3 (compiles, but tests fail) diff --git a/configure.ac b/configure.ac index bbd588a..64d84e9 100644 --- a/configure.ac +++ b/configure.ac @@ -41,28 +41,6 @@ AC_ARG_ENABLE(profile, PKG_CHECK_MODULES(LIBXML, [libxml-2.0 >= 2.6.17]) PKG_CHECK_MODULES(ICU, [icu-i18n, icu-io, icu-uc]) -# Check for wide strings -AC_DEFUN([AC_CXX_WSTRING],[ - AC_CACHE_CHECK(whether the compiler supports wide strings, - ac_cv_cxx_wstring, - [AC_LANG_SAVE - AC_LANG_CPLUSPLUS - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]],[[ -std::wstring test = L"test"; - ]])], - [ac_cv_cxx_wstring=yes], [ac_cv_cxx_wstring=no]) - AC_LANG_RESTORE - ]) -]) - -AC_CXX_WSTRING - -if test "$ac_cv_cxx_wstring" = no -then - AC_MSG_ERROR([Missing wide string support]) -fi - - # Checks for libraries. AC_CHECK_LIB(xml2, xmlReaderForFile) diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index cba2bc6..7874da3 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,7 +1,7 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h entry_token.h expander.h fst_processor.h lt_locale.h \ - ltstr.h match_exe.h match_node.h match_state.h my_stdio.h node.h \ + match_exe.h match_node.h match_state.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h \ transducer.h trans_exe.h xml_parse_util.h exception.h tmx_compiler.h \ string_to_wostream.h ustring.h @@ -50,10 +50,7 @@ lt_tmxproc_SOURCES = lt_tmxproc.cc man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 -INCLUDES = -I$(top_srcdir) $(LIBXML_CFLAGS) $(ICU_CFLAGS) -if WINDOWS - INCLUDES += -I$(top_srcdir)/utf8 -endif +INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/utf8 $(LIBXML_CFLAGS) $(ICU_CFLAGS) CLEANFILES = *~ EXTRA_DIST = dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd $(man_MANS) diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index 52fe76c..807b656 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -171,7 +171,7 @@ public: /** * Note: both the symbol int and int-pair are specific to this alphabet instance. - * @see operator() to go from general wstrings to alphabet-specific ints. + * @see operator() to go from general strings to alphabet-specific ints. * @param code a symbol * @return the pair which code represents in this alphabet */ diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 9408191..b498c5d 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -28,47 +28,47 @@ using namespace std; -UString const Compiler::COMPILER_DICTIONARY_ELEM = (const UChar*)"dictionary"; -UString const Compiler::COMPILER_ALPHABET_ELEM = (const UChar*)"alphabet"; -UString const Compiler::COMPILER_SDEFS_ELEM = (const UChar*)"sdefs"; -UString const Compiler::COMPILER_SDEF_ELEM = (const UChar*)"sdef"; -UString const Compiler::COMPILER_N_ATTR = (const UChar*)"n"; -UString const Compiler::COMPILER_PARDEFS_ELEM = (const UChar*)"pardefs"; -UString const Compiler::COMPILER_PARDEF_ELEM = (const UChar*)"pardef"; -UString const Compiler::COMPILER_PAR_ELEM = (const UChar*)"par"; -UString const Compiler::COMPILER_ENTRY_ELEM = (const UChar*)"e"; -UString const Compiler::COMPILER_RESTRICTION_ATTR = (const UChar*)"r"; -UString const Compiler::COMPILER_RESTRICTION_LR_VAL = (const UChar*)"LR"; -UString const Compiler::COMPILER_RESTRICTION_RL_VAL = (const UChar*)"RL"; -UString const Compiler::COMPILER_PAIR_ELEM = (const UChar*)"p"; -UString const Compiler::COMPILER_LEFT_ELEM = (const UChar*)"l"; -UString const Compiler::COMPILER_RIGHT_ELEM = (const UChar*)"r"; -UString const Compiler::COMPILER_S_ELEM = (const UChar*)"s"; -UString const Compiler::COMPILER_M_ELEM = (const UChar*)"m"; -UString const Compiler::COMPILER_REGEXP_ELEM = (const UChar*)"re"; -UString const Compiler::COMPILER_SECTION_ELEM = (const UChar*)"section"; -UString const Compiler::COMPILER_ID_ATTR = (const UChar*)"id"; -UString const Compiler::COMPILER_TYPE_ATTR = (const UChar*)"type"; -UString const Compiler::COMPILER_IDENTITY_ELEM = (const UChar*)"i"; -UString const Compiler::COMPILER_IDENTITYGROUP_ELEM = (const UChar*)"ig"; -UString const Compiler::COMPILER_JOIN_ELEM = (const UChar*)"j"; -UString const Compiler::COMPILER_BLANK_ELEM = (const UChar*)"b"; -UString const Compiler::COMPILER_POSTGENERATOR_ELEM = (const UChar*)"a"; -UString const Compiler::COMPILER_GROUP_ELEM = (const UChar*)"g"; -UString const Compiler::COMPILER_LEMMA_ATTR = (const UChar*)"lm"; -UString const Compiler::COMPILER_IGNORE_ATTR = (const UChar*)"i"; -UString const Compiler::COMPILER_IGNORE_YES_VAL = (const UChar*)"yes"; -UString const Compiler::COMPILER_ALT_ATTR = (const UChar*)"alt"; -UString const Compiler::COMPILER_V_ATTR = (const UChar*)"v"; -UString const Compiler::COMPILER_VL_ATTR = (const UChar*)"vl"; -UString const Compiler::COMPILER_VR_ATTR = (const UChar*)"vr"; -UString const Compiler::COMPILER_WEIGHT_ATTR = (const UChar*)"w"; -UString const Compiler::COMPILER_TEXT_NODE = (const UChar*)"#text"; -UString const Compiler::COMPILER_COMMENT_NODE = (const UChar*)"#comment"; -UString const Compiler::COMPILER_ACX_ANALYSIS_ELEM = (const UChar*)"analysis-chars"; -UString const Compiler::COMPILER_ACX_CHAR_ELEM = (const UChar*)"char"; -UString const Compiler::COMPILER_ACX_EQUIV_CHAR_ELEM= (const UChar*)"equiv-char"; -UString const Compiler::COMPILER_ACX_VALUE_ATTR = (const UChar*)"value"; +UString const Compiler::COMPILER_DICTIONARY_ELEM = "dictionary"_u; +UString const Compiler::COMPILER_ALPHABET_ELEM = "alphabet"_u; +UString const Compiler::COMPILER_SDEFS_ELEM = "sdefs"_u; +UString const Compiler::COMPILER_SDEF_ELEM = "sdef"_u; +UString const Compiler::COMPILER_N_ATTR = "n"_u; +UString const Compiler::COMPILER_PARDEFS_ELEM = "pardefs"_u; +UString const Compiler::COMPILER_PARDEF_ELEM = "pardef"_u; +UString const Compiler::COMPILER_PAR_ELEM = "par"_u; +UString const Compiler::COMPILER_ENTRY_ELEM = "e"_u; +UString const Compiler::COMPILER_RESTRICTION_ATTR = "r"_u; +UString const Compiler::COMPILER_RESTRICTION_LR_VAL = "LR"_u; +UString const Compiler::COMPILER_RESTRICTION_RL_VAL = "RL"_u; +UString const Compiler::COMPILER_PAIR_ELEM = "p"_u; +UString const Compiler::COMPILER_LEFT_ELEM = "l"_u; +UString const Compiler::COMPILER_RIGHT_ELEM = "r"_u; +UString const Compiler::COMPILER_S_ELEM = "s"_u; +UString const Compiler::COMPILER_M_ELEM = "m"_u; +UString const Compiler::COMPILER_REGEXP_ELEM = "re"_u; +UString const Compiler::COMPILER_SECTION_ELEM = "section"_u; +UString const Compiler::COMPILER_ID_ATTR = "id"_u; +UString const Compiler::COMPILER_TYPE_ATTR = "type"_u; +UString const Compiler::COMPILER_IDENTITY_ELEM = "i"_u; +UString const Compiler::COMPILER_IDENTITYGROUP_ELEM = "ig"_u; +UString const Compiler::COMPILER_JOIN_ELEM = "j"_u; +UString const Compiler::COMPILER_BLANK_ELEM = "b"_u; +UString const Compiler::COMPILER_POSTGENERATOR_ELEM = "a"_u; +UString const Compiler::COMPILER_GROUP_ELEM = "g"_u; +UString const Compiler::COMPILER_LEMMA_ATTR = "lm"_u; +UString const Compiler::COMPILER_IGNORE_ATTR = "i"_u; +UString const Compiler::COMPILER_IGNORE_YES_VAL = "yes"_u; +UString const Compiler::COMPILER_ALT_ATTR = "alt"_u; +UString const Compiler::COMPILER_V_ATTR = "v"_u; +UString const Compiler::COMPILER_VL_ATTR = "vl"_u; +UString const Compiler::COMPILER_VR_ATTR = "vr"_u; +UString const Compiler::COMPILER_WEIGHT_ATTR = "w"_u; +UString const Compiler::COMPILER_TEXT_NODE = "#text"_u; +UString const Compiler::COMPILER_COMMENT_NODE = "#comment"_u; +UString const Compiler::COMPILER_ACX_ANALYSIS_ELEM = "analysis-chars"_u; +UString const Compiler::COMPILER_ACX_CHAR_ELEM = "char"_u; +UString const Compiler::COMPILER_ACX_EQUIV_CHAR_ELEM= "equiv-char"_u; +UString const Compiler::COMPILER_ACX_VALUE_ATTR = "value"_u; Compiler::Compiler() : reader(0), @@ -175,8 +175,7 @@ Compiler::procAlphabet() int ret = xmlTextReaderRead(reader); if(ret == 1) { - xmlChar const *value = xmlTextReaderConstValue(reader); - letters = XMLParseUtil::toUString(value); + letters = XMLParseUtil::readValue(reader); bool space = true; for(unsigned int i = 0; i < letters.length(); i++) { @@ -203,7 +202,7 @@ Compiler::procAlphabet() void Compiler::procSDef() { - alphabet.includeSymbol((const UChar*)"<"+attrib(COMPILER_N_ATTR)+(const UChar*)">"); + alphabet.includeSymbol("<"_u+attrib(COMPILER_N_ATTR)+">"_u); } void @@ -333,7 +332,7 @@ bool Compiler::allBlanks() { bool flag = true; - UString text = XMLParseUtil::toUString(xmlTextReaderConstValue(reader)); + UString text = XMLParseUtil::readValue(reader); for(auto c : text) { @@ -348,7 +347,7 @@ Compiler::readString(list &result, UString const &name) { if(name == COMPILER_TEXT_NODE) { - UString value = XMLParseUtil::toUString(xmlTextReaderConstValue(reader)); + UString value = XMLParseUtil::readValue(reader); for(unsigned int i = 0, limit = value.size(); i < limit; i++) { result.push_back(static_cast(value[i])); @@ -388,7 +387,7 @@ Compiler::readString(list &result, UString const &name) else if(name == COMPILER_S_ELEM) { requireEmptyError(name); - UString symbol = (const UChar*)"<" + attrib(COMPILER_N_ATTR) + (const UChar*)">"; + UString symbol = "<"_u + attrib(COMPILER_N_ATTR) + ">"_u; if(!alphabet.isSymbolDefined(symbol)) { @@ -424,7 +423,7 @@ Compiler::skipBlanks(UString &name) } xmlTextReaderRead(reader); - name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } } @@ -438,12 +437,12 @@ void Compiler::skip(UString &name, UString const &elem, bool open) { xmlTextReaderRead(reader); - name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); UString slash; if(!open) { - slash = (const UChar*)"/"; + slash = "/"_u; } while(name == COMPILER_TEXT_NODE || name == COMPILER_COMMENT_NODE) @@ -458,7 +457,7 @@ Compiler::skip(UString &name, UString const &elem, bool open) } } xmlTextReaderRead(reader); - name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } if(name != elem) @@ -482,7 +481,7 @@ Compiler::procIdentity(UString const &wsweight, bool ig) while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_IDENTITY_ELEM || name == COMPILER_IDENTITYGROUP_ELEM) { break; @@ -527,7 +526,7 @@ Compiler::procTransduction(UString const &wsweight) while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_LEFT_ELEM) { break; @@ -551,7 +550,7 @@ Compiler::procTransduction(UString const &wsweight) while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_RIGHT_ELEM) { break; @@ -724,7 +723,7 @@ Compiler::procSection() requireAttribute(type, COMPILER_TYPE_ATTR, COMPILER_SECTION_ELEM); current_section = id; - current_section += (const UChar*)"@"; + current_section += "@"_u; current_section.append(type); } else @@ -758,7 +757,7 @@ Compiler::procEntry() while(name != COMPILER_ENTRY_ELEM) { xmlTextReaderRead(reader); - name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } return; @@ -766,7 +765,7 @@ Compiler::procEntry() if(wsweight.empty()) { - wsweight = (const UChar*)"0.0000"; + wsweight = "0.0000"_u; } vector elements; @@ -780,7 +779,7 @@ Compiler::procEntry() cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - UString name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); if(current_paradigm.empty() && verbose) @@ -825,7 +824,7 @@ Compiler::procEntry() while(name != COMPILER_ENTRY_ELEM || type != XML_READER_TYPE_END_ELEMENT) { xmlTextReaderRead(reader); - name = XMLParseUtil::toUString(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); type = xmlTextReaderNodeType(reader); } return; @@ -853,8 +852,7 @@ Compiler::procEntry() void Compiler::procNodeACX() { - xmlChar const *xname = xmlTextReaderConstName(reader); - UString name = XMLParseUtil::toUString(xname); + UString name = XMLParseUtil::readName(reader); if(name == COMPILER_TEXT_NODE) { /* ignore */ @@ -886,8 +884,7 @@ Compiler::procNodeACX() void Compiler::procNode() { - xmlChar const *xname = xmlTextReaderConstName(reader); - UString name = XMLParseUtil::toUString(xname); + UString name = XMLParseUtil::readName(reader); // TODO: optimize the execution order of the string "ifs" @@ -944,7 +941,7 @@ Compiler::procRegexp() { EntryToken et; xmlTextReaderRead(reader); - UString re = XMLParseUtil::toUString(xmlTextReaderConstValue(reader)); + UString re = XMLParseUtil::readValue(reader); et.setRegexp(re); xmlTextReaderRead(reader); return et; diff --git a/lttoolbox/expander.cc b/lttoolbox/expander.cc index 2e4de84..bc471eb 100644 --- a/lttoolbox/expander.cc +++ b/lttoolbox/expander.cc @@ -97,7 +97,7 @@ bool Expander::allBlanks() { bool flag = true; - UString text = to_ustring((char*)xmlTextReaderConstValue(reader)); + UString text = XMLParseUtil::readValue(reader); for(auto c : text) { @@ -112,7 +112,7 @@ Expander::readString(UString &result, UString const &name) { if(name == Compiler::COMPILER_TEXT_NODE) { - UString value = to_ustring((char*)xmlTextReaderConstValue(reader)); + UString value = XMLParseUtil::readValue(reader); UString escaped = (const UChar*)"^$/<>{}\\*@#+~:"; for(size_t i = value.size()-1; i > 0; i--) { @@ -181,7 +181,7 @@ Expander::skipBlanks(UString &name) exit(EXIT_FAILURE); } xmlTextReaderRead(reader); - name = to_ustring((char*)xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } } @@ -189,7 +189,7 @@ void Expander::skip(UString &name, UString const &elem) { xmlTextReaderRead(reader); - name = to_ustring((char*)xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == Compiler::COMPILER_TEXT_NODE) { @@ -200,7 +200,7 @@ Expander::skip(UString &name, UString const &elem) exit(EXIT_FAILURE); } xmlTextReaderRead(reader); - name = to_ustring((char*)xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } if(name != elem) @@ -223,7 +223,7 @@ Expander::procIdentity() while(true) { xmlTextReaderRead(reader); - name = to_ustring((char*)xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == Compiler::COMPILER_IDENTITY_ELEM) { break; @@ -248,7 +248,7 @@ Expander::procIdentityGroup() while(true) { xmlTextReaderRead(reader); - name = to_ustring((char*)xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == Compiler::COMPILER_IDENTITYGROUP_ELEM) { break; @@ -277,7 +277,7 @@ Expander::procTransduction() while(true) { xmlTextReaderRead(reader); - name = to_ustring((char*)xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == Compiler::COMPILER_LEFT_ELEM) { break; @@ -294,7 +294,7 @@ Expander::procTransduction() while(true) { xmlTextReaderRead(reader); - name = to_ustring((char*)xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == Compiler::COMPILER_RIGHT_ELEM) { break; @@ -365,7 +365,7 @@ Expander::procEntry(UFILE* output) cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - myname = to_ustring((char*)xmlTextReaderConstName(reader)); + myname = XMLParseUtil::readName(reader); } while(myname != Compiler::COMPILER_ENTRY_ELEM); return; @@ -397,7 +397,7 @@ Expander::procEntry(UFILE* output) cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - UString name = to_ustring((char*)xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); int type = xmlTextReaderNodeType(reader); @@ -495,28 +495,15 @@ Expander::procEntry(UFILE* output) { for(auto& it : items) { - u_fputs(it.first, output); - u_fputc(':', output); - u_fputs(it.second, output); - u_fputc('\n', output); + u_fprintf(output, "%S:%S\n", it.first.c_str(), it.second.c_str()); } for(auto& it : items_lr) { - u_fputs(it.first, output); - u_fputc(':', output); - u_fputc('>', output); - u_fputc(':', output); - u_fputs(it.second, output); - u_fputc('\n', output); + u_fprintf(output, "%S:>:%S\n", it.first.c_str(), it.second.c_str()); } for(auto& it : items_rl) { - u_fputs(it.first, output); - u_fputc(':', output); - u_fputc('<', output); - u_fputc(':', output); - u_fputs(it.second, output); - u_fputc('\n', output); + u_fprintf(output, "%S:<:%S\n", it.first.c_str(), it.second.c_str()); } } else @@ -550,7 +537,7 @@ Expander::procEntry(UFILE* output) void Expander::procNode(UFILE *output) { - UString name = to_ustring((char*)xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); // DO: optimize the execution order of this string "ifs" @@ -606,7 +593,7 @@ UString Expander::procRegexp() { xmlTextReaderRead(reader); - UString re = to_ustring((char*)xmlTextReaderConstValue(reader)); + UString re = XMLParseUtil::readValue(reader); xmlTextReaderRead(reader); return re; } diff --git a/lttoolbox/expander.h b/lttoolbox/expander.h index 127929f..3d2c6df 100644 --- a/lttoolbox/expander.h +++ b/lttoolbox/expander.h @@ -17,8 +17,7 @@ #ifndef _EXPANDER_ #define _EXPANDER_ -#include -#include +#include #include #include diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index a676a6d..30c76a0 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -37,17 +37,17 @@ outOfWord(false), isLastBlankTM(false) { // escaped_chars chars - escaped_chars.insert(L'['); - escaped_chars.insert(L']'); - escaped_chars.insert(L'{'); - escaped_chars.insert(L'}'); - escaped_chars.insert(L'^'); - escaped_chars.insert(L'$'); - escaped_chars.insert(L'/'); - escaped_chars.insert(L'\\'); - escaped_chars.insert(L'@'); - escaped_chars.insert(L'<'); - escaped_chars.insert(L'>'); + escaped_chars.insert('['); + escaped_chars.insert(']'); + escaped_chars.insert('{'); + escaped_chars.insert('}'); + escaped_chars.insert('^'); + escaped_chars.insert('$'); + escaped_chars.insert('/'); + escaped_chars.insert('\\'); + escaped_chars.insert('@'); + escaped_chars.insert('<'); + escaped_chars.insert('>'); caseSensitive = false; dictionaryCase = false; @@ -126,28 +126,27 @@ FSTProcessor::parseRCX(string const &file) void FSTProcessor::procNodeICX() { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); - if(name == L"#text") + UString name = XMLParseUtil::readName(reader); + if(name == "#text"_u) { /* ignore */ } - else if(name == L"ignored-chars") + else if(name == "ignored-chars"_u) { /* ignore */ } - else if(name == L"char") + else if(name == "char"_u) { - ignored_chars.insert(static_cast(XMLParseUtil::attrib(reader, L"value")[0])); + ignored_chars.insert(static_cast(XMLParseUtil::attrib(reader, "value"_u)[0])); } - else if(name == L"#comment") + else if(name == "#comment"_u) { /* ignore */ } else { - wcerr << L"Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error in ICX UFILE (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } @@ -161,47 +160,46 @@ FSTProcessor::initDefaultIgnoredCharacters() void FSTProcessor::procNodeRCX() { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); - if(name == L"#text") + UString name = XMLParseUtil::readName(reader); + if(name == "#text"_u) { /* ignore */ } - else if(name == L"restore-chars") + else if(name == "restore-chars"_u) { /* ignore */ } - else if(name == L"char") + else if(name == "char"_u) { - rcx_current_char = static_cast(XMLParseUtil::attrib(reader, L"value")[0]); + rcx_current_char = static_cast(XMLParseUtil::attrib(reader, "value"_u)[0]); } - else if(name == L"restore-char") + else if(name == "restore-char"_u) { - rcx_map[rcx_current_char].insert(static_cast(XMLParseUtil::attrib(reader, L"value")[0])); + rcx_map[rcx_current_char].insert(static_cast(XMLParseUtil::attrib(reader, "value"_u)[0])); } - else if(name == L"#comment") + else if(name == "#comment"_u) { /* ignore */ } else { - wcerr << L"Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error in RCX UFILE (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } -wchar_t -FSTProcessor::readEscaped(FILE *input) +UChar +FSTProcessor::readEscaped(UFILE *input) { - if(feof(input)) + if(u_feof(input)) { streamError(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); + UChar val = static_cast(u_fgetc(input)); - if(feof(input)) + if(u_feof(input)) { streamError(); } @@ -209,24 +207,24 @@ FSTProcessor::readEscaped(FILE *input) return val; } -wstring -FSTProcessor::readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) +UString +FSTProcessor::readFullBlock(UFILE *input, UChar const delim1, UChar const delim2) { - wstring result = L""; + UString result; result += delim1; - wchar_t c = delim1; + UChar c = delim1; - while(!feof(input) && c != delim2) + while(!u_feof(input) && c != delim2) { - c = static_cast(fgetwc_unlocked(input)); + c = static_cast(u_fgetc(input)); result += c; - if(c != L'\\') + if(c != '\\') { continue; } else { - result += static_cast(readEscaped(input)); + result += static_cast(readEscaped(input)); } } @@ -238,35 +236,35 @@ FSTProcessor::readFullBlock(FILE *input, wchar_t const delim1, wchar_t const del return result; } -wstring -FSTProcessor::readWblank(FILE *input) +UString +FSTProcessor::readWblank(UFILE *input) { - wstring result = L""; - result += L"[["; - wchar_t c = 0; + UString result; + result += "[["_u; + UChar c = 0; - while(!feof(input)) + while(!u_feof(input)) { - c = static_cast(fgetwc_unlocked(input)); + c = static_cast(u_fgetc(input)); result += c; - if(c == L'\\') + if(c == '\\') { - result += static_cast(readEscaped(input)); + result += static_cast(readEscaped(input)); } - else if(c == L']') + else if(c == ']') { - c = static_cast(fgetwc_unlocked(input)); + c = static_cast(u_fgetc(input)); result += c; - if(c == L']') + if(c == ']') { break; } } } - if(c != L']') + if(c != ']') { streamError(); } @@ -275,38 +273,38 @@ FSTProcessor::readWblank(FILE *input) } bool -FSTProcessor::wblankPostGen(FILE *input, FILE *output) +FSTProcessor::wblankPostGen(UFILE *input, UFILE *output) { - wstring result = L""; - result += L"[["; - wchar_t c = 0; + UString result; + result += "[["_u; + UChar c = 0; - while(!feof(input)) + while(!u_feof(input)) { - c = static_cast(fgetwc_unlocked(input)); + c = static_cast(u_fgetc(input)); result += c; - if(c == L'\\') + if(c == '\\') { - result += static_cast(readEscaped(input)); + result += static_cast(readEscaped(input)); } - else if(c == L']') + else if(c == ']') { - c = static_cast(fgetwc_unlocked(input)); + c = static_cast(u_fgetc(input)); result += c; - if(c == L']') + if(c == ']') { int resultlen = result.size(); if(result[resultlen-5] == '[' && result[resultlen-4] == '[' && result[resultlen-3] == '/') //ending blank [[/]] { - fputws(result.c_str(), output); + u_fputs(result.c_str(), output); break; } else { - c = static_cast(fgetwc_unlocked(input)); - if(c == L'~') + c = static_cast(u_fgetc(input)); + if(c == '~') { wblankqueue.push(result); return true; @@ -320,7 +318,7 @@ FSTProcessor::wblankPostGen(FILE *input, FILE *output) } } - if(c != L']') + if(c != ']') { streamError(); } @@ -329,16 +327,16 @@ FSTProcessor::wblankPostGen(FILE *input, FILE *output) } int -FSTProcessor::readAnalysis(FILE *input) +FSTProcessor::readAnalysis(UFILE *input) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); + UChar val = static_cast(u_fgetc(input)); int altval = 0; - if(feof(input)) + if(u_feof(input)) { input_buffer.add(0); // so it's treated like the NUL byte return 0; @@ -347,36 +345,36 @@ FSTProcessor::readAnalysis(FILE *input) if((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end()) { input_buffer.add(val); - val = static_cast(fgetwc_unlocked(input)); + val = static_cast(u_fgetc(input)); } if(escaped_chars.find(val) != escaped_chars.end()) { switch(val) { - case L'<': - altval = static_cast(alphabet(readFullBlock(input, L'<', L'>'))); + case '<': + altval = static_cast(alphabet(readFullBlock(input, '<', '>'))); input_buffer.add(altval); return altval; - case L'[': - val = static_cast(fgetwc_unlocked(input)); + case '[': + val = static_cast(u_fgetc(input)); - if(val == L'[') + if(val == '[') { blankqueue.push(readWblank(input)); } else { - ungetwc_unlocked(val, input); - blankqueue.push(readFullBlock(input, L'[', L']')); + u_fungetc(val, input); + blankqueue.push(readFullBlock(input, '[', ']')); } - input_buffer.add(static_cast(L' ')); - return static_cast(L' '); + input_buffer.add(static_cast(' ')); + return static_cast(' '); - case L'\\': - val = static_cast(fgetwc_unlocked(input)); + case '\\': + val = static_cast(u_fgetc(input)); input_buffer.add(static_cast(val)); return val; @@ -384,8 +382,8 @@ FSTProcessor::readAnalysis(FILE *input) streamError(); } } - if(val == L' ') { - blankqueue.push(L" "); + if(val == ' ') { + blankqueue.push(" "_u); } input_buffer.add(val); @@ -393,7 +391,7 @@ FSTProcessor::readAnalysis(FILE *input) } int -FSTProcessor::readTMAnalysis(FILE *input) +FSTProcessor::readTMAnalysis(UFILE *input) { isLastBlankTM = false; if(!input_buffer.isEmpty()) @@ -401,9 +399,9 @@ FSTProcessor::readTMAnalysis(FILE *input) return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); + UChar val = static_cast(u_fgetc(input)); int altval = 0; - if(feof(input)) + if(u_feof(input)) { return 0; } @@ -412,53 +410,53 @@ FSTProcessor::readTMAnalysis(FILE *input) { switch(val) { - case L'<': - altval = static_cast(alphabet(readFullBlock(input, L'<', L'>'))); + case '<': + altval = static_cast(alphabet(readFullBlock(input, '<', '>'))); input_buffer.add(altval); return altval; - case L'[': - val = static_cast(fgetwc_unlocked(input)); + case '[': + val = static_cast(u_fgetc(input)); - if(val == L'[') + if(val == '[') { blankqueue.push(readWblank(input)); } else { - ungetwc_unlocked(val, input); - blankqueue.push(readFullBlock(input, L'[', L']')); + u_fungetc(val, input); + blankqueue.push(readFullBlock(input, '[', ']')); } - input_buffer.add(static_cast(L' ')); + input_buffer.add(static_cast(' ')); isLastBlankTM = true; - return static_cast(L' '); + return static_cast(' '); - case L'\\': - val = static_cast(fgetwc_unlocked(input)); + case '\\': + val = static_cast(u_fgetc(input)); input_buffer.add(static_cast(val)); return val; - case L'0': - case L'1': - case L'2': - case L'3': - case L'4': - case L'5': - case L'6': - case L'7': - case L'8': - case L'9': - { - wstring ws = L""; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + UString ws; do { ws += val; - val = static_cast(fgetwc_unlocked(input)); + val = static_cast(u_fgetc(input)); } while(iswdigit(val)); - ungetwc_unlocked(val, input); - input_buffer.add(alphabet(L"")); + u_fungetc(val, input); + input_buffer.add(alphabet(""_u)); numbers.push_back(ws); - return alphabet(L""); + return alphabet(""_u); } break; @@ -472,60 +470,60 @@ FSTProcessor::readTMAnalysis(FILE *input) } int -FSTProcessor::readPostgeneration(FILE *input, FILE *output) +FSTProcessor::readPostgeneration(UFILE *input, UFILE *output) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); + UChar val = static_cast(u_fgetc(input)); int altval = 0; is_wblank = false; - if(feof(input)) + if(u_feof(input)) { return 0; } switch(val) { - case L'<': - altval = static_cast(alphabet(readFullBlock(input, L'<', L'>'))); + case '<': + altval = static_cast(alphabet(readFullBlock(input, '<', '>'))); input_buffer.add(altval); return altval; - case L'[': - val = static_cast(fgetwc_unlocked(input)); + case '[': + val = static_cast(u_fgetc(input)); - if(val == L'[') + if(val == '[') { if(collect_wblanks) { wblankqueue.push(readWblank(input)); is_wblank = true; - return static_cast(L' '); + return static_cast(' '); } else if(wblankPostGen(input, output)) { - return static_cast(L'~'); + return static_cast('~'); } else { is_wblank = true; - return static_cast(L' '); + return static_cast(' '); } } else { - ungetwc_unlocked(val, input); - blankqueue.push(readFullBlock(input, L'[', L']')); + u_fungetc(val, input); + blankqueue.push(readFullBlock(input, '[', ']')); - input_buffer.add(static_cast(L' ')); - return static_cast(L' '); + input_buffer.add(static_cast(' ')); + return static_cast(' '); } - case L'\\': - val = static_cast(fgetwc_unlocked(input)); + case '\\': + val = static_cast(u_fgetc(input)); input_buffer.add(static_cast(val)); return val; @@ -536,33 +534,33 @@ FSTProcessor::readPostgeneration(FILE *input, FILE *output) } void -FSTProcessor::skipUntil(FILE *input, FILE *output, wint_t const character) +FSTProcessor::skipUntil(UFILE *input, UFILE *output, wint_t const character) { while(true) { - wint_t val = fgetwc_unlocked(input); - if(feof(input)) + wint_t val = u_fgetc(input); + if(u_feof(input)) { return; } switch(val) { - case L'\\': - val = fgetwc_unlocked(input); - if(feof(input)) + case '\\': + val = u_fgetc(input); + if(u_feof(input)) { return; } - fputwc_unlocked(L'\\', output); - fputwc_unlocked(val, output); + u_fputc('\\', output); + u_fputc(val, output); break; - case L'\0': - fputwc_unlocked(val, output); + case '\0': + u_fputc(val, output); if(nullFlushGeneration) { - fflush(output); + u_fflush(output); } break; @@ -573,7 +571,7 @@ FSTProcessor::skipUntil(FILE *input, FILE *output, wint_t const character) } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } break; } @@ -581,47 +579,47 @@ FSTProcessor::skipUntil(FILE *input, FILE *output, wint_t const character) } int -FSTProcessor::readGeneration(FILE *input, FILE *output) +FSTProcessor::readGeneration(UFILE *input, UFILE *output) { - wint_t val = fgetwc_unlocked(input); + wint_t val = u_fgetc(input); - if(feof(input)) + if(u_feof(input)) { return 0x7fffffff; } if(outOfWord) { - if(val == L'^') + if(val == '^') { - val = fgetwc_unlocked(input); - if(feof(input)) + val = u_fgetc(input); + if(u_feof(input)) { return 0x7fffffff; } } - else if(val == L'\\') + else if(val == '\\') { - fputwc_unlocked(val, output); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + val = u_fgetc(input); + if(u_feof(input)) { return 0x7fffffff; } - fputwc_unlocked(val,output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val,output); + skipUntil(input, output, '^'); + val = u_fgetc(input); + if(u_feof(input)) { return 0x7fffffff; } } else { - fputwc_unlocked(val, output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + skipUntil(input, output, '^'); + val = u_fgetc(input); + if(u_feof(input)) { return 0x7fffffff; } @@ -629,44 +627,44 @@ FSTProcessor::readGeneration(FILE *input, FILE *output) outOfWord = false; } - if(val == L'\\') + if(val == '\\') { - val = fgetwc_unlocked(input); + val = u_fgetc(input); return static_cast(val); } - else if(val == L'$') + else if(val == '$') { outOfWord = true; - return static_cast(L'$'); + return static_cast('$'); } - else if(val == L'<') + else if(val == '<') { - wstring cad = L""; - cad += static_cast(val); + UString cad; + cad += static_cast(val); - while((val = fgetwc_unlocked(input)) != L'>') + while((val = u_fgetc(input)) != '>') { - if(feof(input)) + if(u_feof(input)) { streamError(); } - cad += static_cast(val); + cad += static_cast(val); } - cad += static_cast(val); + cad += static_cast(val); return alphabet(cad); } - else if(val == L'[') + else if(val == '[') { - val = fgetwc_unlocked(input); - if(val == L'[') + val = u_fgetc(input); + if(val == '[') { - fputws_unlocked(readWblank(input).c_str(), output); + u_fputs(readWblank(input).c_str(), output); } else { - ungetwc_unlocked(val, input); - fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); + u_fungetc(val, input); + u_fputs(readFullBlock(input, '[', ']').c_str(), output); } return readGeneration(input, output); @@ -679,79 +677,79 @@ FSTProcessor::readGeneration(FILE *input, FILE *output) return 0x7fffffff; } -pair -FSTProcessor::readBilingual(FILE *input, FILE *output) +pair +FSTProcessor::readBilingual(UFILE *input, UFILE *output) { - wint_t val = fgetwc_unlocked(input); - wstring symbol = L""; + wint_t val = u_fgetc(input); + UString symbol; - if(feof(input)) + if(u_feof(input)) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } if(outOfWord) { - if(val == L'^') + if(val == '^') { - val = fgetwc_unlocked(input); - if(feof(input)) + val = u_fgetc(input); + if(u_feof(input)) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } } - else if(val == L'\\') + else if(val == '\\') { - fputwc_unlocked(val, output); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + val = u_fgetc(input); + if(u_feof(input)) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } - fputwc_unlocked(val,output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val,output); + skipUntil(input, output, '^'); + val = u_fgetc(input); + if(u_feof(input)) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } } else { - fputwc_unlocked(val, output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + skipUntil(input, output, '^'); + val = u_fgetc(input); + if(u_feof(input)) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } } outOfWord = false; } - if(val == L'\\') + if(val == '\\') { - val = fgetwc_unlocked(input); - return pair(symbol, val); + val = u_fgetc(input); + return pair(symbol, val); } - else if(val == L'$') + else if(val == '$') { outOfWord = true; - return pair(symbol, static_cast(L'$')); + return pair(symbol, static_cast('$')); } - else if(val == L'<') + else if(val == '<') { - wstring cad = L""; - cad += static_cast(val); - while((val = fgetwc_unlocked(input)) != L'>') + UString cad; + cad += static_cast(val); + while((val = u_fgetc(input)) != '>') { - if(feof(input)) + if(u_feof(input)) { streamError(); } - cad += static_cast(val); + cad += static_cast(val); } - cad += static_cast(val); + cad += static_cast(val); int res = alphabet(cad); @@ -759,64 +757,64 @@ FSTProcessor::readBilingual(FILE *input, FILE *output) { symbol = cad; } - return pair(symbol, res); + return pair(symbol, res); } - else if(val == L'[') + else if(val == '[') { - val = fgetwc_unlocked(input); - if(val == L'[') + val = u_fgetc(input); + if(val == '[') { - fputws_unlocked(readWblank(input).c_str(), output); + u_fputs(readWblank(input).c_str(), output); } else { - ungetwc_unlocked(val, input); - fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); + u_fungetc(val, input); + u_fputs(readFullBlock(input, '[', ']').c_str(), output); } return readBilingual(input, output); } - return pair(symbol, val); + return pair(symbol, val); } void -FSTProcessor::flushBlanks(FILE *output) +FSTProcessor::flushBlanks(UFILE *output) { for(size_t i = blankqueue.size(); i > 0; i--) { - fputws_unlocked(blankqueue.front().c_str(), output); + u_fputs(blankqueue.front().c_str(), output); blankqueue.pop(); } } void -FSTProcessor::flushWblanks(FILE *output) +FSTProcessor::flushWblanks(UFILE *output) { while(wblankqueue.size() > 0) { - fputws_unlocked(wblankqueue.front().c_str(), output); + u_fputs(wblankqueue.front().c_str(), output); wblankqueue.pop(); } } -wstring +UString FSTProcessor::combineWblanks() { - wstring final_wblank; - wstring last_wblank = L""; + UString final_wblank; + UString last_wblank; while(wblankqueue.size() > 0) { - if(wblankqueue.front().compare(L"[[/]]") == 0) + if(wblankqueue.front().compare("[[/]]"_u) == 0) { if(final_wblank.empty()) { - final_wblank += L"[["; + final_wblank += "[["_u; } else if(final_wblank.size() > 2) { - final_wblank += L"; "; + final_wblank += "; "_u; } final_wblank += last_wblank.substr(2,last_wblank.size()-4); //add wblank without brackets [[..]] @@ -836,7 +834,7 @@ FSTProcessor::combineWblanks() if(!final_wblank.empty()) { - final_wblank += L"]]"; + final_wblank += "]]"_u; need_end_wblank = true; } @@ -846,18 +844,15 @@ FSTProcessor::combineWblanks() void FSTProcessor::calcInitial() { - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - root.addTransition(0, 0, it->second.getInitial(), default_weight); + for(auto& it : transducers) { + root.addTransition(0, 0, it.second.getInitial(), default_weight); } initial_state.init(&root); } bool -FSTProcessor::endsWith(wstring const &str, wstring const &suffix) +FSTProcessor::endsWith(UString const &str, UString const &suffix) { if(str.size() < suffix.size()) { @@ -872,64 +867,61 @@ FSTProcessor::endsWith(wstring const &str, wstring const &suffix) void FSTProcessor::classifyFinals() { - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - if(endsWith(it->first, L"@inconditional")) + for(auto& it : transducers) { + if(endsWith(it.first, "@inconditional"_u)) { - inconditional.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + inconditional.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } - else if(endsWith(it->first, L"@standard")) + else if(endsWith(it.first, "@standard"_u)) { - standard.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + standard.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } - else if(endsWith(it->first, L"@postblank")) + else if(endsWith(it.first, "@postblank"_u)) { - postblank.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + postblank.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } - else if(endsWith(it->first, L"@preblank")) + else if(endsWith(it.first, "@preblank"_u)) { - preblank.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + preblank.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } else { - wcerr << L"Error: Unsupported transducer type for '"; - wcerr << it->first << L"'." << endl; + cerr << "Error: Unsupported transducer type for '"; + cerr << it.first << "'." << endl; exit(EXIT_FAILURE); } } } void -FSTProcessor::writeEscaped(wstring const &str, FILE *output) +FSTProcessor::writeEscaped(UString const &str, UFILE *output) { for(unsigned int i = 0, limit = str.size(); i < limit; i++) { if(escaped_chars.find(str[i]) != escaped_chars.end()) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(str[i], output); + u_fputc(str[i], output); } } size_t -FSTProcessor::writeEscapedPopBlanks(wstring const &str, FILE *output) +FSTProcessor::writeEscapedPopBlanks(UString const &str, UFILE *output) { size_t postpop = 0; for (unsigned int i = 0, limit = str.size(); i < limit; i++) { if (escaped_chars.find(str[i]) != escaped_chars.end()) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(str[i], output); - if (str[i] == L' ') { - if (blankqueue.front() == L" ") { + u_fputc(str[i], output); + if (str[i] == ' ') { + if (blankqueue.front() == " "_u) { blankqueue.pop(); } else { postpop++; @@ -940,71 +932,71 @@ FSTProcessor::writeEscapedPopBlanks(wstring const &str, FILE *output) } void -FSTProcessor::writeEscapedWithTags(wstring const &str, FILE *output) +FSTProcessor::writeEscapedWithTags(UString const &str, UFILE *output) { for(unsigned int i = 0, limit = str.size(); i < limit; i++) { - if(str[i] == L'<' && i >=1 && str[i-1] != L'\\') + if(str[i] == '<' && i >=1 && str[i-1] != '\\') { - fputws_unlocked(str.substr(i).c_str(), output); + u_fputs(str.substr(i).c_str(), output); return; } if(escaped_chars.find(str[i]) != escaped_chars.end()) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(str[i], output); + u_fputc(str[i], output); } } void -FSTProcessor::printWord(wstring const &sf, wstring const &lf, FILE *output) +FSTProcessor::printWord(UString const &sf, UString const &lf, UFILE *output) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(sf, output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L'$', output); + u_fputs(lf.c_str(), output); + u_fputc('$', output); } void -FSTProcessor::printWordPopBlank(wstring const &sf, wstring const &lf, FILE *output) +FSTProcessor::printWordPopBlank(UString const &sf, UString const &lf, UFILE *output) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); size_t postpop = writeEscapedPopBlanks(sf, output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L'$', output); + u_fputs(lf.c_str(), output); + u_fputc('$', output); while (postpop-- && blankqueue.size() > 0) { - fputws(blankqueue.front().c_str(), output); + u_fputs(blankqueue.front().c_str(), output); blankqueue.pop(); } } void -FSTProcessor::printWordBilingual(wstring const &sf, wstring const &lf, FILE *output) +FSTProcessor::printWordBilingual(UString const &sf, UString const &lf, UFILE *output) { - fputwc_unlocked(L'^', output); - fputws_unlocked(sf.c_str(), output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L'$', output); + u_fputc('^', output); + u_fputs(sf.c_str(), output); + u_fputs(lf.c_str(), output); + u_fputc('$', output); } void -FSTProcessor::printUnknownWord(wstring const &sf, FILE *output) +FSTProcessor::printUnknownWord(UString const &sf, UFILE *output) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(sf, output); - fputwc_unlocked(L'/', output); - fputwc_unlocked(L'*', output); + u_fputc('/', output); + u_fputc('*', output); writeEscaped(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } unsigned int -FSTProcessor::lastBlank(wstring const &str) +FSTProcessor::lastBlank(UString const &str) { for(int i = static_cast(str.size())-1; i >= 0; i--) { @@ -1018,7 +1010,7 @@ FSTProcessor::lastBlank(wstring const &str) } void -FSTProcessor::printSpace(wchar_t const val, FILE *output) +FSTProcessor::printSpace(UChar const val, UFILE *output) { if(blankqueue.size() > 0) { @@ -1026,18 +1018,18 @@ FSTProcessor::printSpace(wchar_t const val, FILE *output) } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } } bool -FSTProcessor::isEscaped(wchar_t const c) const +FSTProcessor::isEscaped(UChar const c) const { return escaped_chars.find(c) != escaped_chars.end(); } bool -FSTProcessor::isAlphabetic(wchar_t const c) const +FSTProcessor::isAlphabetic(UChar const c) const { return (bool)std::iswalnum(c) || alphabetic_chars.find(c) != alphabetic_chars.end(); } @@ -1065,7 +1057,7 @@ FSTProcessor::load(FILE *input) int len = Compression::multibyte_read(input); while(len > 0) { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); len--; } @@ -1077,10 +1069,10 @@ FSTProcessor::load(FILE *input) while(len > 0) { int len2 = Compression::multibyte_read(input); - wstring name = L""; + UString name; while(len2 > 0) { - name += static_cast(Compression::multibyte_read(input)); + name += static_cast(Compression::multibyte_read(input)); len2--; } transducers[name].read(input, alphabet); @@ -1088,266 +1080,6 @@ FSTProcessor::load(FILE *input) } } -void -FSTProcessor::lsx_wrapper_null_flush(FILE *input, FILE *output) -{ - setNullFlush(false); - //nullFlushGeneration = true; - - while(!feof(input)) - { - lsx(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } - } -} - -void -FSTProcessor::lsx(FILE *input, FILE *output) -{ - if(getNullFlush()) - { - lsx_wrapper_null_flush(input, output); - } - - vector new_states, alive_states; - wstring blank, out, in, alt_out, alt_in; - bool outOfWord = true; - bool finalFound = false; - bool plus_thing = false; - - alive_states.push_back(initial_state); - - int val = -1; - - while(!feof(input) && val != 0) - { - val = fgetwc_unlocked(input); - - if(val == L'+' && isEscaped(val) && !outOfWord) - { - val = L'$'; - plus_thing = true; - } - - if((val == L'^' && isEscaped(val) && outOfWord) || feof(input) || val == 0) - { - blankqueue.push(blank); - - if(alive_states.size() == 0) - { - if(blankqueue.size() > 0) - { - fputws(blankqueue.front().c_str(), output); - fflush(output); - blankqueue.pop(); - } - - alive_states.push_back(initial_state); - - alt_in = L""; - for(int i=0; i < (int) in.size(); i++) // FIXME indexing - { - alt_in += in[i]; - if(in[i] == L'$' && in[i+1] == L'^' && blankqueue.size() > 0) - { - // in.insert(i+1, blankqueue.front().c_str()); - alt_in += blankqueue.front().c_str(); - blankqueue.pop(); - } - } - in = alt_in; - fputws(in.c_str(), output); - fflush(output); - in = L""; - finalFound = false; - } - else if(finalFound && alive_states.size() == 1) - { - finalFound = false; - } - - blank = L""; - in += val; - outOfWord = false; - continue; - } - - // wcerr << L"\n[!] " << (wchar_t)val << L" ||| " << outOfWord << endl; - - if(outOfWord) - { - blank += val; - continue; - } - - if((val == 0 || feof(input) || val == L'$') && !outOfWord) // && isEscaped(val) - { - new_states.clear(); - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - State s = *it; - //wcerr << endl << L"[0] FEOF | $ | " << s.size() << L" | " << s.isFinal(all_finals) << endl; - s.step(alphabet(L"<$>")); - //wcerr << endl << L"[1] FEOF | $ | " << s.size() << L" | " << s.isFinal(all_finals) << endl; - if(s.size() > 0) - { - new_states.push_back(s); - } - - /*if(s.isFinal(all_finals)) - { - out += s.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses); - new_states.push_back(*initial_state); - }*/ - - if(s.isFinal(all_finals)) - { - new_states.clear(); - new_states.push_back(initial_state); - out = s.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses); - - alt_out = L""; - for (int i=0; i < (int) out.size(); i++) - { - wchar_t c = out.at(i); - if(c == L'/') - { - alt_out += L'^'; - } - else if(out[i-1] == L'<' && c == L'$' && out[i+1] == L'>') // indexing - { - alt_out += c; - alt_out += L'^'; - } - else if(!(c == L'<' && out[i+1] == L'$' && out[i+2] == L'>') && !(out[i-2] == L'<' && out[i-1] == L'$' && c == L'>')) - { - alt_out += c; - } - } - out = alt_out; - - - if(out[out.length()-1] == L'^') - { - out = out.substr(0, out.length()-1); // extra ^ at the end - if(plus_thing) - { - out[out.size()-1] = L'+'; - plus_thing = false; - } - } - else // take# out ... of - { - for(int i=out.length()-1; i>=0; i--) // indexing - { - if(out.at(i) == L'$') - { - out.insert(i+1, L" "); - break; - } - } - out += L'$'; - } - - if(blankqueue.size() > 0) - { - fputws(blankqueue.front().c_str(), output); - blankqueue.pop(); - } - - alt_out = L""; - for(int i=0; i < (int) out.size(); i++) // indexing - { - if((out.at(i) == L'$') && blankqueue.size() > 0) - { - alt_out += out.at(i); - alt_out += blankqueue.front().c_str(); - blankqueue.pop(); - } - else if((out.at(i) == L'$') && blankqueue.size() == 0 && i != (int) out.size()-1) - { - alt_out += out.at(i); - alt_out += L' '; - } - else if(out.at(i) == L' ' && blankqueue.size() > 0) - { - alt_out += blankqueue.front().c_str(); - blankqueue.pop(); - } - else - { - alt_out += out.at(i); - } - } - out = alt_out; - - fputws(out.c_str(), output); - flushBlanks(output); - finalFound = true; - out = L""; - in = L""; - } - } - - alive_states.swap(new_states); - outOfWord = true; - - if(!finalFound) - { - in += val; //do not remove - } - continue; - } - - if(!outOfWord) // && (!(feof(input) || val == L'$'))) - { - if(val == L'<') // tag - { - wstring tag = readFullBlock(input, L'<', L'>'); - in += tag; - if(!alphabet.isSymbolDefined(tag)) - { - alphabet.includeSymbol(tag); - } - val = static_cast(alphabet(tag)); - } - else - { - in += (wchar_t) val; - } - - new_states.clear(); - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - State s = *it; - if(val < 0) - { - s.step_override(val, alphabet(L""), val); - } - else if(val > 0) - { - int val_lowercase = towlower(val); - s.step_override(val_lowercase, alphabet(L""), val); // FIXME deal with cases! in step_override - } - - if(s.size() > 0) - { - new_states.push_back(s); - } - - } - alive_states.swap(new_states); - } - } - - flushBlanks(output); -} - void FSTProcessor::initAnalysis() { @@ -1364,12 +1096,9 @@ FSTProcessor::initTMAnalysis() { calcInitial(); - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - all_finals.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + for(auto& it : transducers) { + all_finals.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } } @@ -1378,12 +1107,9 @@ FSTProcessor::initGeneration() { setIgnoredChars(false); calcInitial(); - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - all_finals.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + for(auto& it : transducers) { + all_finals.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } } @@ -1400,8 +1126,8 @@ FSTProcessor::initBiltrans() } -wstring -FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupper) +UString +FSTProcessor::compoundAnalysis(UString input_word, bool uppercase, bool firstupper) { const int MAX_COMBINATIONS = 32767; @@ -1409,16 +1135,16 @@ FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupp for(unsigned int i=0; i MAX_COMBINATIONS) { - wcerr << L"Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << L"'" << endl; - wcerr << L" gave up at char " << i << L" '" << val << L"'." << endl; + cerr << "Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << "'" << endl; + cerr << " gave up at char " << i << " '" << val << "'." << endl; - wstring nullString = L""; + UString nullString; return nullString; } @@ -1429,13 +1155,13 @@ FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupp if(current_state.size()==0) { - wstring nullString = L""; + UString nullString; return nullString; } } current_state.pruneCompounds(compoundRSymbol, '+', compound_max_elements); - wstring result = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper); + UString result = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper); return result; } @@ -1445,30 +1171,30 @@ FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupp void FSTProcessor::initDecompositionSymbols() { - if((compoundOnlyLSymbol=alphabet(L"<:co:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"<:compound:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"<@co:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"<@compound:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"")) == 0) + if((compoundOnlyLSymbol=alphabet("<:co:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet("<:compound:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet("<@co:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet("<@compound:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet(""_u)) == 0) { - wcerr << L"Warning: Decomposition symbol <:compound:only-L> not found" << endl; + cerr << "Warning: Decomposition symbol <:compound:only-L> not found" << endl; } else if(!showControlSymbols) { - alphabet.setSymbol(compoundOnlyLSymbol, L""); + alphabet.setSymbol(compoundOnlyLSymbol, ""_u); } - if((compoundRSymbol=alphabet(L"<:co:R>")) == 0 - && (compoundRSymbol=alphabet(L"<:compound:R>")) == 0 - && (compoundRSymbol=alphabet(L"<@co:R>")) == 0 - && (compoundRSymbol=alphabet(L"<@compound:R>")) == 0 - && (compoundRSymbol=alphabet(L"")) == 0) + if((compoundRSymbol=alphabet("<:co:R>"_u)) == 0 + && (compoundRSymbol=alphabet("<:compound:R>"_u)) == 0 + && (compoundRSymbol=alphabet("<@co:R>"_u)) == 0 + && (compoundRSymbol=alphabet("<@compound:R>"_u)) == 0 + && (compoundRSymbol=alphabet(""_u)) == 0) { - wcerr << L"Warning: Decomposition symbol <:compound:R> not found" << endl; + cerr << "Warning: Decomposition symbol <:compound:R> not found" << endl; } else if(!showControlSymbols) { - alphabet.setSymbol(compoundRSymbol, L""); + alphabet.setSymbol(compoundRSymbol, ""_u); } } @@ -1482,7 +1208,7 @@ FSTProcessor::initDecomposition() } void -FSTProcessor::analysis(FILE *input, FILE *output) +FSTProcessor::analysis(UFILE *input, UFILE *output) { if(getNullFlush()) { @@ -1493,13 +1219,13 @@ FSTProcessor::analysis(FILE *input, FILE *output) bool last_postblank = false; bool last_preblank = false; State current_state = initial_state; - wstring lf = L""; //lexical form - wstring sf = L""; //surface form + UString lf; //lexical form + UString sf; //surface form int last = 0; bool firstupper = false, uppercase = false; map >::iterator rcx_map_ptr; - wchar_t val; + UChar val; do { val = readAnalysis(input); @@ -1585,9 +1311,9 @@ FSTProcessor::analysis(FILE *input, FILE *output) last = input_buffer.getPos(); } } - else if(sf == L"" && iswspace(val)) + else if(sf.empty() && iswspace(val)) { - lf = L"/*"; + lf = "/*"_u; lf.append(sf); last_postblank = false; last_preblank = false; @@ -1637,29 +1363,29 @@ FSTProcessor::analysis(FILE *input, FILE *output) } else { - if(!isAlphabetic(val) && sf == L"") + if(!isAlphabetic(val) && sf.empty()) { if(iswspace(val)) { if (blankqueue.size() > 0) { - fputws_unlocked(blankqueue.front().c_str(), output); + u_fputs(blankqueue.front().c_str(), output); blankqueue.pop(); } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } } else { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } if(val) { - fputwc_unlocked(val, output); + u_fputc(val, output); } } } @@ -1667,13 +1393,13 @@ FSTProcessor::analysis(FILE *input, FILE *output) { printWordPopBlank(sf.substr(0, sf.size()-input_buffer.diffPrevPos(last)), lf, output); - fputwc_unlocked(L' ', output); + u_fputc(' ', output); input_buffer.setPos(last); input_buffer.back(1); } else if(last_preblank) { - fputwc_unlocked(L' ', output); + u_fputc(' ', output); printWordPopBlank(sf.substr(0, sf.size()-input_buffer.diffPrevPos(last)), lf, output); input_buffer.setPos(last); @@ -1688,7 +1414,7 @@ FSTProcessor::analysis(FILE *input, FILE *output) } else if(isAlphabetic(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || - lf == L"")) + lf.empty())) { do { @@ -1698,7 +1424,7 @@ FSTProcessor::analysis(FILE *input, FILE *output) unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); if(limit == 0) { input_buffer.back(sf.size()); @@ -1707,7 +1433,7 @@ FSTProcessor::analysis(FILE *input, FILE *output) else { input_buffer.back(1+(size-limit)); - wstring unknown_word = sf.substr(0, limit); + UString unknown_word = sf.substr(0, limit); if(do_decomposition) { if(!dictionaryCase) @@ -1716,9 +1442,9 @@ FSTProcessor::analysis(FILE *input, FILE *output) uppercase = firstupper && iswupper(sf[sf.size()-1]); } - wstring compound = L""; + UString compound; compound = compoundAnalysis(unknown_word, uppercase, firstupper); - if(compound != L"") + if(!compound.empty()) { printWord(unknown_word, compound, output); } @@ -1733,11 +1459,11 @@ FSTProcessor::analysis(FILE *input, FILE *output) } } } - else if(lf == L"") + else if(lf.empty()) { unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); if(limit == 0) { input_buffer.back(sf.size()); @@ -1746,7 +1472,7 @@ FSTProcessor::analysis(FILE *input, FILE *output) else { input_buffer.back(1+(size-limit)); - wstring unknown_word = sf.substr(0, limit); + UString unknown_word = sf.substr(0, limit); if(do_decomposition) { if(!dictionaryCase) @@ -1755,9 +1481,9 @@ FSTProcessor::analysis(FILE *input, FILE *output) uppercase = firstupper && iswupper(sf[sf.size()-1]); } - wstring compound = L""; + UString compound; compound = compoundAnalysis(unknown_word, uppercase, firstupper); - if(compound != L"") + if(!compound.empty()) { printWord(unknown_word, compound, output); } @@ -1787,8 +1513,8 @@ FSTProcessor::analysis(FILE *input, FILE *output) } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); last_incond = false; last_postblank = false; last_preblank = false; @@ -1801,97 +1527,77 @@ FSTProcessor::analysis(FILE *input, FILE *output) } void -FSTProcessor::analysis_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::analysis_wrapper_null_flush(UFILE *input, UFILE *output) { setNullFlush(false); - while(!feof(input)) + while(!u_feof(input)) { analysis(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::generation_wrapper_null_flush(FILE *input, FILE *output, +FSTProcessor::generation_wrapper_null_flush(UFILE *input, UFILE *output, GenerationMode mode) { setNullFlush(false); nullFlushGeneration = true; - while(!feof(input)) + while(!u_feof(input)) { generation(input, output, mode); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::postgeneration_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::postgeneration_wrapper_null_flush(UFILE *input, UFILE *output) { setNullFlush(false); - while(!feof(input)) + while(!u_feof(input)) { postgeneration(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::intergeneration_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::intergeneration_wrapper_null_flush(UFILE *input, UFILE *output) { setNullFlush(false); - while (!feof(input)) + while (!u_feof(input)) { intergeneration(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if (code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::transliteration_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::transliteration_wrapper_null_flush(UFILE *input, UFILE *output) { setNullFlush(false); - while(!feof(input)) + while(!u_feof(input)) { transliteration(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::tm_analysis(FILE *input, FILE *output) +FSTProcessor::tm_analysis(UFILE *input, UFILE *output) { State current_state = initial_state; - wstring lf = L""; //lexical form - wstring sf = L""; //surface form + UString lf; //lexical form + UString sf; //surface form int last = 0; - while(wchar_t val = readTMAnalysis(input)) + while(UChar val = readTMAnalysis(input)) { // test for final states if(current_state.isFinal(all_finals)) @@ -1905,7 +1611,7 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) numbers.clear(); } } - else if(sf == L"" && iswspace(val)) + else if(sf.empty() && iswspace(val)) { lf.append(sf); last = input_buffer.getPos(); @@ -1926,7 +1632,7 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) { sf.append(numbers[numbers.size()-1]); } - else if(isLastBlankTM && val == L' ') + else if(isLastBlankTM && val == ' ') { sf.append(blankqueue.back()); } @@ -1937,7 +1643,7 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) } else { - if((iswspace(val) || iswpunct(val)) && sf == L"") + if((iswspace(val) || iswpunct(val)) && sf.empty()) { if(iswspace(val)) { @@ -1947,14 +1653,14 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } else if(!iswspace(val) && !iswpunct(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || - lf == L"")) + lf.empty())) { do @@ -1963,7 +1669,7 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) { sf.append(numbers[numbers.size()-1]); } - else if(isLastBlankTM && val == L' ') + else if(isLastBlankTM && val == ' ') { sf.append(blankqueue.back()); } @@ -1976,12 +1682,12 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) if(val == 0) { - fputws_unlocked(sf.c_str(), output); + u_fputs(sf.c_str(), output); return; } input_buffer.back(1); - fputws_unlocked(sf.c_str(), output); + u_fputs(sf.c_str(), output); while(blankqueue.size() > 0) { @@ -1993,22 +1699,22 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) } /* - unsigned int limit = sf.find(L' '); + unsigned int limit = sf.find(' '); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(sf.substr(0, limit).c_str(), output); + u_fputs(sf.substr(0, limit).c_str(), output); */ } - else if(lf == L"") + else if(lf.empty()) { -/* unsigned int limit = sf.find(L' '); +/* unsigned int limit = sf.find(' '); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(sf.substr(0, limit).c_str(), output); + u_fputs(sf.substr(0, limit).c_str(), output); */ input_buffer.back(1); - fputws_unlocked(sf.c_str(), output); + u_fputs(sf.c_str(), output); while(blankqueue.size() > 0) { @@ -2022,16 +1728,16 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) } else { - fputwc_unlocked(L'[', output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L']', output); + u_fputc('[', output); + u_fputs(lf.c_str(), output); + u_fputc(']', output); input_buffer.setPos(last); input_buffer.back(1); } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); } } @@ -2041,7 +1747,7 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) void -FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) +FSTProcessor::generation(UFILE *input, UFILE *output, GenerationMode mode) { if(getNullFlush()) { @@ -2049,24 +1755,24 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) } State current_state = initial_state; - wstring sf = L""; + UString sf; outOfWord = false; - skipUntil(input, output, L'^'); + skipUntil(input, output, '^'); int val; while((val = readGeneration(input, output)) != 0x7fffffff) { - if(sf == L"" && val == L'=') + if(sf.empty() && val == '=') { - fputwc(L'=', output); + u_fputc('=', output); val = readGeneration(input, output); } - if(val == L'$' && outOfWord) + if(val == '$' && outOfWord) { - if(sf[0] == L'*' || sf[0] == L'%') + if(sf[0] == '*' || sf[0] == '%') { if(mode != gm_clean && mode != gm_tagged_nm) { @@ -2078,14 +1784,14 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) } else if(mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(removeTags(sf.substr(1)), output); - fputwc_unlocked(L'/', output); + u_fputc('/', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } - else if(sf[0] == L'@') + else if(sf[0] == '@') { if(mode == gm_all) { @@ -2105,11 +1811,11 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) } else if(mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(removeTags(sf.substr(1)), output); - fputwc_unlocked(L'/', output); + u_fputc('/', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } else if(current_state.isFinal(all_finals)) @@ -2123,18 +1829,18 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) if(mode == gm_tagged || mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); } - fputws_unlocked(current_state.filterFinals(all_finals, alphabet, + u_fputs(current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper).substr(1).c_str(), output); if(mode == gm_tagged || mode == gm_tagged_nm) { - fputwc_unlocked(L'/', output); + u_fputc('/', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } @@ -2142,7 +1848,7 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) { if(mode == gm_all) { - fputwc_unlocked(L'#', output); + u_fputc('#', output); writeEscaped(sf, output); } else if(mode == gm_clean) @@ -2151,36 +1857,36 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) } else if(mode == gm_unknown) { - if(sf != L"") + if(!sf.empty()) { - fputwc_unlocked(L'#', output); + u_fputc('#', output); writeEscaped(removeTags(sf), output); } } else if(mode == gm_tagged) { - fputwc_unlocked(L'#', output); + u_fputc('#', output); writeEscaped(removeTags(sf), output); } else if(mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(removeTags(sf), output); - fputwc_unlocked(L'/', output); - fputwc_unlocked(L'#', output); + u_fputc('/', output); + u_fputc('#', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } current_state = initial_state; - sf = L""; + sf.clear(); } else if(iswspace(val) && sf.size() == 0) { // do nothing } - else if(sf.size() > 0 && (sf[0] == L'*' || sf[0] == L'%' )) + else if(sf.size() > 0 && (sf[0] == '*' || sf[0] == '%' )) { alphabet.getSymbol(sf, val); } @@ -2210,7 +1916,7 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) } void -FSTProcessor::postgeneration(FILE *input, FILE *output) +FSTProcessor::postgeneration(UFILE *input, UFILE *output) { if(getNullFlush()) { @@ -2221,14 +1927,14 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) collect_wblanks = false; need_end_wblank = false; State current_state = initial_state; - wstring lf = L""; - wstring sf = L""; + UString lf; + UString sf; int last = 0; - set empty_escaped_chars; + set empty_escaped_chars; - while(wchar_t val = readPostgeneration(input, output)) + while(UChar val = readPostgeneration(input, output)) { - if(val == L'~') + if(val == '~') { skip_mode = false; collect_wblanks = true; @@ -2244,7 +1950,7 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) { if(need_end_wblank) { - fputws_unlocked(L"[[/]]", output); + u_fputs("[[/]]"_u, output); need_end_wblank = false; } @@ -2259,13 +1965,13 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); if(need_end_wblank) { - fputws_unlocked(L"[[/]]", output); + u_fputs("[[/]]"_u, output); need_end_wblank = false; } } @@ -2289,7 +1995,7 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) // case of the beggining of the next word - wstring mybuf = L""; + UString mybuf; for(size_t i = sf.size(); i > 0; --i) { if(!isalpha(sf[i-1])) @@ -2353,51 +2059,51 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) } else { - wstring final_wblank = combineWblanks(); - fputws_unlocked(final_wblank.c_str(), output); + UString final_wblank = combineWblanks(); + u_fputs(final_wblank.c_str(), output); - if(lf == L"") + if(lf.empty()) { unsigned int mark = sf.size(); unsigned int space_index = sf.size(); - + for(unsigned int i = 1, limit = sf.size(); i < limit; i++) { - if(sf[i] == L'~') + if(sf[i] == '~') { mark = i; break; } - else if(sf[i] == L' ') + else if(sf[i] == ' ') { space_index = i; } } - + if(space_index != sf.size()) { - fputws_unlocked(sf.substr(1, space_index-1).c_str(), output); - + u_fputs(sf.substr(1, space_index-1).c_str(), output); + if(need_end_wblank) { - fputws_unlocked(L"[[/]]", output); + u_fputs("[[/]]"_u, output); need_end_wblank = false; - fputwc_unlocked(sf[space_index], output); + u_fputc(sf[space_index], output); flushWblanks(output); } else { - fputwc_unlocked(sf[space_index], output); + u_fputc(sf[space_index], output); } - - fputws_unlocked(sf.substr(space_index+1, mark-space_index-1).c_str(), output); + + u_fputs(sf.substr(space_index+1, mark-space_index-1).c_str(), output); } else { flushWblanks(output); - fputws_unlocked(sf.substr(1, mark-1).c_str(), output); + u_fputs(sf.substr(1, mark-1).c_str(), output); } - + if(mark == sf.size()) { input_buffer.back(1); @@ -2409,7 +2115,7 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) } else { - fputws_unlocked(lf.substr(1,lf.size()-3).c_str(), output); + u_fputs(lf.substr(1,lf.size()-3).c_str(), output); input_buffer.setPos(last); input_buffer.back(2); val = lf[lf.size()-2]; @@ -2421,15 +2127,15 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); skip_mode = true; collect_wblanks = false; } @@ -2441,7 +2147,7 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) } void -FSTProcessor::intergeneration(FILE *input, FILE *output) +FSTProcessor::intergeneration(UFILE *input, UFILE *output) { if (getNullFlush()) { @@ -2450,16 +2156,16 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) bool skip_mode = true; State current_state = initial_state; - wstring target = L""; - wstring source = L""; + UString target; + UString source; int last = 0; - set empty_escaped_chars; + set empty_escaped_chars; while (true) { - wchar_t val = readPostgeneration(input, output); + UChar val = readPostgeneration(input, output); - if (val == L'~') + if (val == '~') { skip_mode = false; } @@ -2472,13 +2178,13 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) } else { - if(val != L'\0') + if(val != '\0') { if (isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } } @@ -2497,7 +2203,7 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) last = input_buffer.getPos(); } - if (val != L'\0') + if (val != '\0') { if (!iswupper(val) || caseSensitive) { @@ -2509,27 +2215,27 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) } } - if (val != L'\0' && current_state.size() != 0) + if (val != '\0' && current_state.size() != 0) { alphabet.getSymbol(source, val); } else { - if (target == L"") // no match + if (target.empty()) // no match { - if (val == L'\0') + if (val == '\0') { // flush source - fputws_unlocked(source.c_str(), output); + u_fputs(source.c_str(), output); } else { - fputwc_unlocked(source[0], output); + u_fputc(source[0], output); unsigned int mark, limit; - for (mark = 1, limit = source.size(); mark < limit && source[mark] != L'~' ; mark++) + for (mark = 1, limit = source.size(); mark < limit && source[mark] != '~' ; mark++) { - fputwc_unlocked(source[mark], output); + u_fputc(source[mark], output); } if (mark != source.size()) @@ -2538,18 +2244,18 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) input_buffer.back(back); } - if (val == L'~') + if (val == '~') { input_buffer.back(1); } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } } } else { for(unsigned int i=1; i(input_word[i]); } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@ -2761,18 +2467,18 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) { if(mark) { - result = L"^="+result.substr(1); + result = "^="_u + result.substr(1); } else { - result[0] = L'^'; + result[0] = '^'; } } else { if(mark) { - result = L"=" + result.substr(1); + result = "="_u + result.substr(1); } else { @@ -2783,7 +2489,7 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) if(current_state.size() == 0) { - if(symbol != L"" && result != L"") + if(!symbol.empty() && !result.empty()) { queue.append(symbol); } @@ -2792,11 +2498,11 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } return result; } @@ -2805,23 +2511,23 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) if(start_point < (end_point - 3)) { - return L"^$"; + return "^$"_u; } // attach unmatched queue automatically - if(queue != L"") + if(!queue.empty()) { - wstring result_with_queue = L""; + UString result_with_queue; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { switch(result[i]) { - case L'\\': - result_with_queue += L'\\'; + case '\\': + result_with_queue += '\\'; i++; break; - case L'/': + case '/': result_with_queue.append(queue); break; @@ -2834,7 +2540,7 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) if(with_delim) { - result_with_queue += L'$'; + result_with_queue += '$'; } return result_with_queue; } @@ -2842,7 +2548,7 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) { if(with_delim) { - result += L'$'; + result += '$'; } return result; } @@ -2850,14 +2556,14 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) -wstring -FSTProcessor::biltrans(wstring const &input_word, bool with_delim) +UString +FSTProcessor::biltrans(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; - wstring queue = L""; + UString queue; bool mark = false; if(with_delim == false) @@ -2866,12 +2572,12 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { return input_word; } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; @@ -2883,20 +2589,20 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) for(unsigned int i = start_point; i <= end_point; i++) { int val; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; val = static_cast(input_word[i]); } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@ -2929,18 +2635,18 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) { if(mark) { - result = L"^="+result.substr(1); + result = "^="_u + result.substr(1); } else { - result[0] = L'^'; + result[0] = '^'; } } else { if(mark) { - result = L"=" + result.substr(1); + result = "="_u + result.substr(1); } else { @@ -2951,7 +2657,7 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) if(current_state.size() == 0) { - if(symbol != L"" && result != L"") + if(!symbol.empty() && !result.empty()) { queue.append(symbol); } @@ -2960,11 +2666,11 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } return result; } @@ -2973,19 +2679,19 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) // attach unmatched queue automatically - if(queue != L"") + if(!queue.empty()) { - wstring result_with_queue = L""; + UString result_with_queue; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { switch(result[i]) { - case L'\\': - result_with_queue += L'\\'; + case '\\': + result_with_queue += '\\'; i++; break; - case L'/': + case '/': result_with_queue.append(queue); break; @@ -2998,7 +2704,7 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) if(with_delim) { - result_with_queue += L'$'; + result_with_queue += '$'; } return result_with_queue; } @@ -3006,54 +2712,50 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) { if(with_delim) { - result += L'$'; + result += '$'; } return result; } } void -FSTProcessor::bilingual_wrapper_null_flush(FILE *input, FILE *output, GenerationMode mode) +FSTProcessor::bilingual_wrapper_null_flush(UFILE *input, UFILE *output, GenerationMode mode) { setNullFlush(false); nullFlushGeneration = true; - while(!feof(input)) + while(!u_feof(input)) { bilingual(input, output, mode); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } -wstring -FSTProcessor::compose(wstring const &lexforms, wstring const &queue) const +UString +FSTProcessor::compose(UString const &lexforms, UString const &queue) const { - wstring result = L""; + UString result; for(unsigned int i = 1; i< lexforms.size(); i++) { - if(lexforms[i] == L'\\') + if(lexforms[i] == '\\') { - result += L'\\'; + result += '\\'; i++; } - else if(lexforms[i] == L'/') + else if(lexforms[i] == '/') { result.append(queue); } result += lexforms[i]; } - return L"/" + result + queue; + return "/"_u + result + queue; } void -FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) +FSTProcessor::bilingual(UFILE *input, UFILE *output, GenerationMode mode) { if(getNullFlush()) { @@ -3061,20 +2763,20 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) } State current_state = initial_state; - wstring sf = L""; // source language analysis - wstring queue = L""; // symbols to be added to each target - wstring result = L""; // result of looking up analysis in bidix + UString sf; // source language analysis + UString queue; // symbols to be added to each target + UString result; // result of looking up analysis in bidix outOfWord = false; - skipUntil(input, output, L'^'); - pair tr; // readBilingual return value, containing: + skipUntil(input, output, '^'); + pair tr; // readBilingual return value, containing: int val; // the alphabet value of current symbol, and - wstring symbol = L""; // the current symbol as a string + UString symbol; // the current symbol as a string bool seentags = false; // have we seen any tags at all in the analysis? bool seensurface = false; - wstring surface = L""; + UString surface; while(true) // ie. while(val != 0x7fffffff) { @@ -3082,17 +2784,17 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) symbol = tr.first; val = tr.second; - //fwprintf(stderr, L"> %ls : %lc : %d\n", tr.first.c_str(), tr.second, tr.second); + //fwprintf(stderr, "> %ls : %lc : %d\n", tr.first.c_str(), tr.second, tr.second); if(biltransSurfaceForms && !seensurface && !outOfWord) { - while(val != L'/' && val != 0x7fffffff) + while(val != '/' && val != 0x7fffffff) { surface = surface + symbol; alphabet.getSymbol(surface, val); tr = readBilingual(input, output); symbol = tr.first; val = tr.second; - //fwprintf(stderr, L" == %ls : %lc : %d => %ls\n", symbol.c_str(), val, val, surface.c_str()); + //fwprintf(stderr, " == %ls : %lc : %d => %ls\n", symbol.c_str(), val, val, surface.c_str()); } seensurface = true; tr = readBilingual(input, output); @@ -3105,7 +2807,7 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) break; } - if(val == L'$' && outOfWord) + if(val == '$' && outOfWord) { if(!seentags) // if no tags: only return complete matches { @@ -3118,16 +2820,16 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) uppercase, firstupper, 0); } - if(sf[0] == L'*') + if(sf[0] == '*') { if (mode == gm_clean) { - printWordBilingual(sf, L"/" + sf.substr(1), output); + printWordBilingual(sf, "/"_u + sf.substr(1), output); } else { - printWordBilingual(sf, L"/" + sf, output); + printWordBilingual(sf, "/"_u + sf, output); } } - else if(result != L"") + else if(!result.empty()) { printWordBilingual(sf, compose(result, queue), output); } @@ -3135,30 +2837,30 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) { //xxx if(biltransSurfaceForms) { - printWordBilingual(surface, L"/@"+surface, output); + printWordBilingual(surface, "/@"_u + surface, output); } else { - printWordBilingual(sf, L"/@"+sf, output); + printWordBilingual(sf, "/@"_u + sf, output); } } seensurface = false; - surface = L""; - queue = L""; - result = L""; + surface.clear(); + queue.clear(); + result.clear(); current_state = initial_state; - sf = L""; + sf.clear(); seentags = false; } else if(iswspace(val) && sf.size() == 0) { // do nothing } - else if(sf.size() > 0 && sf[0] == L'*') + else if(sf.size() > 0 && sf[0] == '*') { if(escaped_chars.find(val) != escaped_chars.end()) { - sf += L'\\'; + sf += '\\'; } alphabet.getSymbol(sf, val); // add symbol to sf iff alphabetic if(val == 0) // non-alphabetic, possibly unknown tag; add to sf @@ -3170,7 +2872,7 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) { if(escaped_chars.find(val) != escaped_chars.end()) { - sf += L'\\'; + sf += '\\'; } alphabet.getSymbol(sf, val); // add symbol to sf iff alphabetic if(val == 0) // non-alphabetic, possibly unknown tag; add to sf @@ -3197,13 +2899,13 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) bool uppercase = sf.size() > 1 && iswupper(sf[1]); bool firstupper= iswupper(sf[0]); - queue = L""; // the intervening tags were matched + queue.clear(); // the intervening tags were matched result = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper, 0); } - else if(result != L"") + else if(!result.empty()) { // We already have a result, but there is still more to read // of the analysis; following tags are not consumed, but @@ -3220,21 +2922,21 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) else if(current_state.size() == 0) { // There are no more alive transductions and the current symbol is not a tag -- unknown word! - result = L""; + result.clear(); } } } } } -pair -FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) +pair +FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; - wstring queue = L""; + UString queue; bool mark = false; bool seentags = false; // have we seen any tags at all in the analysis? @@ -3244,12 +2946,12 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { - return pair(input_word, 0); + return pair(input_word, 0); } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; @@ -3261,21 +2963,21 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) for(unsigned int i = start_point; i <= end_point; i++) { int val = 0; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; val = input_word[i]; } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { seentags = true; - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@ -3308,18 +3010,18 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) { if(mark) { - result = L"^=" + result.substr(1); + result = "^="_u + result.substr(1); } else { - result[0] = L'^'; + result[0] = '^'; } } else { if(mark) { - result = L"=" + result.substr(1); + result = "="_u + result.substr(1); } else { @@ -3330,7 +3032,7 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) if(current_state.size() == 0) { - if(symbol != L"" && result != L"") + if(!symbol.empty() && !result.empty()) { queue.append(symbol); } @@ -3339,19 +3041,19 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } - return pair(result, 0); + return pair(result, 0); } } } if (!seentags - && L"" == current_state.filterFinals(all_finals, alphabet, + && ""_u == current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper, 0)) @@ -3359,32 +3061,32 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } - return pair(result, 0); + return pair(result, 0); } // attach unmatched queue automatically - if(queue != L"") + if(!queue.empty()) { - wstring result_with_queue = L""; + UString result_with_queue; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { switch(result[i]) { - case L'\\': - result_with_queue += L'\\'; + case '\\': + result_with_queue += '\\'; i++; break; - case L'/': + case '/': result_with_queue.append(queue); break; @@ -3397,25 +3099,25 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) if(with_delim) { - result_with_queue += L'$'; + result_with_queue += '$'; } - return pair(result_with_queue, queue.size()); + return pair(result_with_queue, queue.size()); } else { if(with_delim) { - result += L'$'; + result += '$'; } - return pair(result, 0); + return pair(result, 0); } } -wstring -FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) +UString +FSTProcessor::biltransWithoutQueue(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; bool mark = false; @@ -3426,12 +3128,12 @@ FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { return input_word; } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; @@ -3443,20 +3145,20 @@ FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) for(unsigned int i = start_point; i <= end_point; i++) { int val; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; val = static_cast(input_word[i]); } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@ -3489,18 +3191,18 @@ FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) { if(mark) { - result = L"^=" + result.substr(1); + result = "^="_u + result.substr(1); } else { - result[0] = L'^'; + result[0] = '^'; } } else { if(mark) { - result = L"=" + result.substr(1); + result = "="_u + result.substr(1); } else { @@ -3511,16 +3213,16 @@ FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) if(current_state.size() == 0) { - if(symbol == L"") + if(symbol.empty()) { // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } return result; } @@ -3529,7 +3231,7 @@ FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) if(with_delim) { - result += L'$'; + result += '$'; } return result; } @@ -3540,16 +3242,16 @@ FSTProcessor::valid() const { if(initial_state.isFinal(all_finals)) { - wcerr << L"Error: Invalid dictionary (hint: the left side of an entry is empty)" << endl; + cerr << "Error: Invalid dictionary (hint: the left side of an entry is empty)" << endl; return false; } else { State s = initial_state; - s.step(L' '); + s.step(' '); if(s.size() != 0) { - wcerr << L"Error: Invalid dictionary (hint: entry beginning with whitespace)" << endl; + cerr << "Error: Invalid dictionary (hint: entry beginning with whitespace)" << endl; return false; } } @@ -3558,41 +3260,41 @@ FSTProcessor::valid() const } int -FSTProcessor::readSAO(FILE *input) +FSTProcessor::readSAO(UFILE *input) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); - if(feof(input)) + UChar val = static_cast(u_fgetc(input)); + if(u_feof(input)) { return 0; } if(escaped_chars.find(val) != escaped_chars.end()) { - if(val == L'<') + if(val == '<') { - wstring str = readFullBlock(input, L'<', L'>'); - if(str.substr(0, 9) == L"'); + if(str.substr(0, 9) == "") + while(str.substr(str.size()-3) != "]]>"_u) { - str.append(readFullBlock(input, L'<', L'>').substr(1)); + str.append(readFullBlock(input, '<', '>').substr(1)); } blankqueue.push(str); - input_buffer.add(static_cast(L' ')); - return static_cast(L' '); + input_buffer.add(static_cast(' ')); + return static_cast(' '); } else { streamError(); } } - else if (val == L'\\') { - val = static_cast(fgetwc_unlocked(input)); + else if (val == '\\') { + val = static_cast(u_fgetc(input)); if(isEscaped(val)) { input_buffer.add(val); @@ -3612,34 +3314,34 @@ FSTProcessor::readSAO(FILE *input) } void -FSTProcessor::printSAOWord(wstring const &lf, FILE *output) +FSTProcessor::printSAOWord(UString const &lf, UFILE *output) { for(unsigned int i = 1, limit = lf.size(); i != limit; i++) { - if(lf[i] == L'/') + if(lf[i] == '/') { break; } - fputwc_unlocked(lf[i], output); + u_fputc(lf[i], output); } } void -FSTProcessor::SAO(FILE *input, FILE *output) +FSTProcessor::SAO(UFILE *input, UFILE *output) { bool last_incond = false; bool last_postblank = false; State current_state = initial_state; - wstring lf = L""; - wstring sf = L""; + UString lf; + UString sf; int last = 0; escaped_chars.clear(); - escaped_chars.insert(static_cast(L'\\')); - escaped_chars.insert(static_cast(L'<')); - escaped_chars.insert(static_cast(L'>')); + escaped_chars.insert(static_cast('\\')); + escaped_chars.insert(static_cast('<')); + escaped_chars.insert(static_cast('>')); - while(wchar_t val = readSAO(input)) + while(UChar val = readSAO(input)) { // test for final states if(current_state.isFinal(all_finals)) @@ -3679,9 +3381,9 @@ FSTProcessor::SAO(FILE *input, FILE *output) last = input_buffer.getPos(); } } - else if(sf == L"" && iswspace(val)) + else if(sf.empty() && iswspace(val)) { - lf = L"/*"; + lf = "/*"_u; lf.append(sf); last_postblank = false; last_incond = false; @@ -3703,7 +3405,7 @@ FSTProcessor::SAO(FILE *input, FILE *output) } else { - if(!isAlphabetic(val) && sf == L"") + if(!isAlphabetic(val) && sf.empty()) { if(iswspace(val)) { @@ -3713,9 +3415,9 @@ FSTProcessor::SAO(FILE *input, FILE *output) { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } else if(last_incond) @@ -3727,13 +3429,13 @@ FSTProcessor::SAO(FILE *input, FILE *output) else if(last_postblank) { printSAOWord(lf, output); - fputwc_unlocked(L' ', output); + u_fputc(' ', output); input_buffer.setPos(last); input_buffer.back(1); } else if(isAlphabetic(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || - lf == L"")) + lf.empty())) { do { @@ -3743,21 +3445,17 @@ FSTProcessor::SAO(FILE *input, FILE *output) unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(L"", output); - fputws_unlocked(sf.c_str(), output); - fputws_unlocked(L"", output); + u_fprintf(output, "%S", sf.c_str()); } - else if(lf == L"") + else if(lf.empty()) { unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(L"", output); - fputws_unlocked(sf.c_str(), output); - fputws_unlocked(L"", output); + u_fprintf(output, "%S", sf.c_str()); } else { @@ -3767,8 +3465,8 @@ FSTProcessor::SAO(FILE *input, FILE *output) } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); last_incond = false; last_postblank = false; } @@ -3778,12 +3476,12 @@ FSTProcessor::SAO(FILE *input, FILE *output) flushBlanks(output); } -wstring -FSTProcessor::removeTags(wstring const &str) +UString +FSTProcessor::removeTags(UString const &str) { for(unsigned int i = 0; i < str.size(); i++) { - if(str[i] == L'<' && i >=1 && str[i-1] != L'\\') + if(str[i] == '<' && i >=1 && str[i-1] != '\\') { return str.substr(0, i); } @@ -3866,7 +3564,7 @@ FSTProcessor::getNullFlush() } size_t -FSTProcessor::firstNotAlpha(wstring const &sf) +FSTProcessor::firstNotAlpha(UString const &sf) { for(size_t i = 0, limit = sf.size(); i < limit; i++) { @@ -3876,5 +3574,5 @@ FSTProcessor::firstNotAlpha(wstring const &sf) } } - return wstring::npos; + return UString::npos; } diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 13d3f2c..5580b61 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -18,9 +18,9 @@ #ifndef _FSTPROCESSOR_ #define _FSTPROCESSOR_ +#include #include #include -#include #include #include #include @@ -56,7 +56,7 @@ private: /** * Transducers in FSTP */ - map transducers; + map transducers; /** * Current state of lexical analysis @@ -101,27 +101,27 @@ private: /** * Queue of blanks, used in reading methods */ - queue blankqueue; + queue blankqueue; /** * Queue of wordbound blanks, used in reading methods */ - queue wblankqueue; + queue wblankqueue; /** * Set of characters being considered alphabetics */ - set alphabetic_chars; + set alphabetic_chars; /** * Set of characters to escape with a backslash */ - set escaped_chars; + set escaped_chars; /** * Set of characters to ignore */ - set ignored_chars; + set ignored_chars; /** * Mapping of characters for simplistic diacritic restoration specified in RCX files @@ -262,7 +262,7 @@ private: * @param input the stream to read from * @return code of the character */ - wchar_t readEscaped(FILE *input); + UChar readEscaped(UFILE *input); /** * Reads a block from the stream input, enclosed by delim1 and delim2 @@ -270,13 +270,13 @@ private: * @param delim1 the delimiter of the beginning of the sequence * @param delim1 the delimiter of the end of the sequence */ - wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); + UString readFullBlock(UFILE *input, UChar const delim1, UChar const delim2); /** * Reads a wordbound blank from the stream input * @param input the stream being read */ - wstring readWblank(FILE *input); + UString readWblank(UFILE *input); /** * Reads a wordbound blank (opening blank to closing blank) from the stream input -> [[...]]xyz[[/]] @@ -284,28 +284,28 @@ private: * @param output the stream to write on * @return true if the word enclosed by the wordbound blank has a ~ for postgeneration activation */ - bool wblankPostGen(FILE *input, FILE *output); + bool wblankPostGen(UFILE *input, UFILE *output); /** * Returns true if the character code is identified as alphabetic * @param c the code provided by the user * @return true if it's alphabetic */ - bool isAlphabetic(wchar_t const c) const; + bool isAlphabetic(UChar const c) const; /** * Tests if a character is in the set of escaped_chars * @param c the character code provided by the user * @return true if it is in the set */ - bool isEscaped(wchar_t const c) const; + bool isEscaped(UChar const c) const; /** * Read text from stream (analysis version) * @param input the stream to read * @return the next symbol in the stream */ - int readAnalysis(FILE *input); + int readAnalysis(UFILE *input); /** * Read text from stream (decomposition version) @@ -313,7 +313,7 @@ private: * @param output the stream to write on * @return the next symbol in the stream */ - int readDecomposition(FILE *input, FILE *output); + int readDecomposition(UFILE *input, UFILE *output); /** * Read text from stream (postgeneration version) @@ -321,7 +321,7 @@ private: * @param output the stream to write on * @return the next symbol in the stream */ - int readPostgeneration(FILE *input, FILE *output); + int readPostgeneration(UFILE *input, UFILE *output); /** * Read text from stream (generation version) @@ -329,7 +329,7 @@ private: * @param output the stream being written to * @return the next symbol in the stream */ - int readGeneration(FILE *input, FILE *output); + int readGeneration(UFILE *input, UFILE *output); /** * Read text from stream (biltrans version) @@ -337,32 +337,32 @@ private: * @param output the stream to write on * @return the queue of 0-symbols, and the next symbol in the stream */ - pair readBilingual(FILE *input, FILE *output); + pair readBilingual(UFILE *input, UFILE *output); /** * Read text from stream (SAO version) * @param input the stream to read * @return the next symbol in the stream */ - int readSAO(FILE *input); + int readSAO(UFILE *input); /** * Flush all the blanks remaining in the current process * @param output stream to write blanks */ - void flushBlanks(FILE *output); + void flushBlanks(UFILE *output); /** * Flush all the wordbound blanks remaining in the current process * @param output stream to write blanks */ - void flushWblanks(FILE *output); + void flushWblanks(UFILE *output); /** * Combine wordbound blanks in the queue and return them * @return final wblank string */ - wstring combineWblanks(); + UString combineWblanks(); /** * Calculate the initial state of parsing @@ -379,7 +379,7 @@ private: * @param str the string to write, escaping characters * @param output the stream to write in */ - void writeEscaped(wstring const &str, FILE *output); + void writeEscaped(UString const &str, UFILE *output); /** * Write a string to an output stream. @@ -390,7 +390,7 @@ private: * @param output the stream to write in * @return how many blanks to pop and print after printing lu */ - size_t writeEscapedPopBlanks(wstring const &str, FILE *output); + size_t writeEscapedPopBlanks(UString const &str, UFILE *output); /** * Write a string to an output stream, escaping all escapable characters @@ -398,7 +398,7 @@ private: * @param str the string to write, escaping characters * @param output the stream to write in */ - void writeEscapedWithTags(wstring const &str, FILE *output); + void writeEscapedWithTags(UString const &str, UFILE *output); /** @@ -407,7 +407,7 @@ private: * @param the searched suffix * @returns true if 'str' has the suffix 'suffix' */ - static bool endsWith(wstring const &str, wstring const &suffix); + static bool endsWith(UString const &str, UString const &suffix); /** * Prints a word @@ -415,7 +415,7 @@ private: * @param lf lexical form of the word * @param output stream where the word is written */ - void printWord(wstring const &sf, wstring const &lf, FILE *output); + void printWord(UString const &sf, UString const &lf, UFILE *output); /** * Prints a word. @@ -425,7 +425,7 @@ private: * @param lf lexical form of the word * @param output stream where the word is written */ - void printWordPopBlank(wstring const &sf, wstring const &lf, FILE *output); + void printWordPopBlank(UString const &sf, UString const &lf, UFILE *output); /** * Prints a word (Bilingual version) @@ -433,7 +433,7 @@ private: * @param lf lexical form of the word * @param output stream where the word is written */ - void printWordBilingual(wstring const &sf, wstring const &lf, FILE *output); + void printWordBilingual(UString const &sf, UString const &lf, UFILE *output); /** @@ -441,21 +441,21 @@ private: * @param lf lexical form * @param output stream where the word is written */ - void printSAOWord(wstring const &lf, FILE *output); + void printSAOWord(UString const &lf, UFILE *output); /** * Prints an unknown word * @param sf surface form of the word * @param output stream where the word is written */ - void printUnknownWord(wstring const &sf, FILE *output); + void printUnknownWord(UString const &sf, UFILE *output); void initDecompositionSymbols(); - vector numbers; - int readTMAnalysis(FILE *input); + vector numbers; + int readTMAnalysis(UFILE *input); - unsigned int lastBlank(wstring const &str); + unsigned int lastBlank(UString const &str); /** * Print one blankqueue item if there is one, or a given "space" value. @@ -463,23 +463,22 @@ private: * @param val the space character to use if no blank queue * @param output stream where the word is written */ - void printSpace(wchar_t const val, FILE *output); + void printSpace(UChar const val, UFILE *output); - void skipUntil(FILE *input, FILE *output, wint_t const character); - static wstring removeTags(wstring const &str); - wstring compoundAnalysis(wstring str, bool uppercase, bool firstupper); - size_t firstNotAlpha(wstring const &sf); + void skipUntil(UFILE *input, UFILE *output, wint_t const character); + static UString removeTags(UString const &str); + UString compoundAnalysis(UString str, bool uppercase, bool firstupper); + size_t firstNotAlpha(UString const &sf); - void analysis_wrapper_null_flush(FILE *input, FILE *output); - void lsx_wrapper_null_flush(FILE *input, FILE *output); - void bilingual_wrapper_null_flush(FILE *input, FILE *output, GenerationMode mode = gm_unknown); - void generation_wrapper_null_flush(FILE *input, FILE *output, + void analysis_wrapper_null_flush(UFILE *input, UFILE *output); + void bilingual_wrapper_null_flush(UFILE *input, UFILE *output, GenerationMode mode = gm_unknown); + void generation_wrapper_null_flush(UFILE *input, UFILE *output, GenerationMode mode); - void postgeneration_wrapper_null_flush(FILE *input, FILE *output); - void intergeneration_wrapper_null_flush(FILE *input, FILE *output); - void transliteration_wrapper_null_flush(FILE *input, FILE *output); + void postgeneration_wrapper_null_flush(UFILE *input, UFILE *output); + void intergeneration_wrapper_null_flush(UFILE *input, UFILE *output); + void transliteration_wrapper_null_flush(UFILE *input, UFILE *output); - wstring compose(wstring const &lexforms, wstring const &queue) const; + UString compose(UString const &lexforms, UString const &queue) const; void procNodeICX(); void procNodeRCX(); @@ -499,25 +498,23 @@ public: void initBiltrans(); void initDecomposition(); - void analysis(FILE *input = stdin, FILE *output = stdout); - void tm_analysis(FILE *input = stdin, FILE *output = stdout); - void generation(FILE *input = stdin, FILE *output = stdout, GenerationMode mode = gm_unknown); - void postgeneration(FILE *input = stdin, FILE *output = stdout); - void intergeneration(FILE *input = stdin, FILE *output = stdout); - void transliteration(FILE *input = stdin, FILE *output = stdout); - wstring biltrans(wstring const &input_word, bool with_delim = true); - wstring biltransfull(wstring const &input_word, bool with_delim = true); - void bilingual(FILE *input = stdin, FILE *output = stdout, GenerationMode mode = gm_unknown); - pair biltransWithQueue(wstring const &input_word, bool with_delim = true); - wstring biltransWithoutQueue(wstring const &input_word, bool with_delim = true); - void SAO(FILE *input = stdin, FILE *output = stdout); + void analysis(UFILE *input, UFILE *output); + void tm_analysis(UFILE *input, UFILE *output); + void generation(UFILE *input, UFILE *output, GenerationMode mode = gm_unknown); + void postgeneration(UFILE *input, UFILE *output); + void intergeneration(UFILE *input, UFILE *output); + void transliteration(UFILE *input, UFILE *output); + UString biltrans(UString const &input_word, bool with_delim = true); + UString biltransfull(UString const &input_word, bool with_delim = true); + void bilingual(UFILE *input, UFILE *output, GenerationMode mode = gm_unknown); + pair biltransWithQueue(UString const &input_word, bool with_delim = true); + UString biltransWithoutQueue(UString const &input_word, bool with_delim = true); + void SAO(UFILE *input, UFILE *output); void parseICX(string const &file); void parseRCX(string const &file); void load(FILE *input); - void lsx(FILE *input, FILE *output); - bool valid() const; void setCaseSensitiveMode(bool const value); diff --git a/lttoolbox/lt_expand.cc b/lttoolbox/lt_expand.cc index 283f209..3d9facc 100644 --- a/lttoolbox/lt_expand.cc +++ b/lttoolbox/lt_expand.cc @@ -55,7 +55,8 @@ void endProgram(char *name) int main(int argc, char *argv[]) { - FILE *input = NULL, *output = NULL; + FILE* input = NULL; + UFILE* output = NULL; Expander e; e.setKeepBoundaries(false); @@ -86,15 +87,15 @@ int main(int argc, char *argv[]) switch (cnt) { case 'a': - e.setAltValue(optarg); + e.setAltValue(to_ustring(optarg)); break; case 'v': - e.setVariantValue(optarg); + e.setVariantValue(to_ustring(optarg)); break; case 'l': - e.setVariantLeftValue(optarg); + e.setVariantLeftValue(to_ustring(optarg)); break; case 'm': @@ -102,7 +103,7 @@ int main(int argc, char *argv[]) break; case 'r': - e.setVariantRightValue(optarg); + e.setVariantRightValue(to_ustring(optarg)); break; case 'h': @@ -122,11 +123,11 @@ int main(int argc, char *argv[]) input = fopen(infile.c_str(), "rb"); if(input == NULL) { - wcerr << "Error: Cannot open file '" << infile << "'." << endl; + cerr << "Error: Cannot open file '" << infile << "'." << endl; exit(EXIT_FAILURE); } fclose(input); - output = stdout; + output = u_finit(stdout, NULL, NULL); break; case 3: @@ -134,16 +135,16 @@ int main(int argc, char *argv[]) input = fopen(infile.c_str(), "rb"); if(input == NULL) { - wcerr << "Error: Cannot open file '" << infile << "'." << endl; + cerr << "Error: Cannot open file '" << infile << "'." << endl; exit(EXIT_FAILURE); } fclose(input); outfile = argv[argc-1]; - output = fopen(argv[argc-1], "wb"); + output = u_fopen(argv[argc-1], "wb", NULL, NULL); if(output == NULL) { - wcerr << "Error: Cannot open file '" << outfile << "'." << endl; + cerr << "Error: Cannot open file '" << outfile << "'." << endl; exit(EXIT_FAILURE); } break; @@ -158,7 +159,7 @@ int main(int argc, char *argv[]) #endif e.expand(infile, output); - fclose(output); + u_fclose(output); return EXIT_SUCCESS; } diff --git a/lttoolbox/lt_print.cc b/lttoolbox/lt_print.cc index c138d56..241ee10 100644 --- a/lttoolbox/lt_print.cc +++ b/lttoolbox/lt_print.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #ifdef _MSC_VER @@ -50,7 +51,7 @@ int main(int argc, char *argv[]) { bool hfst = false; FILE* input = NULL; - FILE* output = stdout; + UFILE* output = u_finit(stdout, NULL, NULL); LtLocale::tryToSetLocale(); @@ -118,7 +119,7 @@ int main(int argc, char *argv[]) if(outfile != "") { - output = fopen(outfile.c_str(), "wb"); + output = u_fopen(outfile.c_str(), "wb", NULL, NULL); if(!output) { cerr << "Error: Cannot open file '" << outfile << "' for writing." << endl; @@ -127,9 +128,9 @@ int main(int argc, char *argv[]) } Alphabet alphabet; - set alphabetic_chars; + set alphabetic_chars; - map transducers; + map transducers; fpos_t pos; if (fgetpos(input, &pos) == 0) { @@ -162,13 +163,7 @@ int main(int argc, char *argv[]) while(len > 0) { - int len2 = Compression::multibyte_read(input); - wstring name = L""; - while(len2 > 0) - { - name += static_cast(Compression::multibyte_read(input)); - len2--; - } + UString name = Compression::string_read(input); transducers[name].read(input); len--; @@ -176,23 +171,21 @@ int main(int argc, char *argv[]) ///////////////////// - map::iterator penum = transducers.end(); + map::iterator penum = transducers.end(); penum--; - for(map::iterator it = transducers.begin(); it != transducers.end(); it++) + for(map::iterator it = transducers.begin(); it != transducers.end(); it++) { it->second.joinFinals(); it->second.show(alphabet, output, 0, hfst); if(it != penum) { - fwprintf(output, L"--\n", it->first.c_str()); // ToDo: Was %ls meant to go somewhere here? + u_fputs("--\n"_u, output); + //fwprintf(output, L"--\n", it->first.c_str()); // ToDo: Was %ls meant to go somewhere here? } } fclose(input); - if(output != stdout) - { - fclose(output); - } + u_fclose(output); return 0; } diff --git a/lttoolbox/lt_proc.cc b/lttoolbox/lt_proc.cc index 7ff4c8b..d722416 100644 --- a/lttoolbox/lt_proc.cc +++ b/lttoolbox/lt_proc.cc @@ -252,7 +252,8 @@ int main(int argc, char *argv[]) } } - FILE *input = stdin, *output = stdout; + UFILE* input = u_finit(stdin, NULL, NULL); + UFILE* output = u_finit(stdout, NULL, NULL); LtLocale::tryToSetLocale(); if(optind == (argc - 3)) @@ -264,15 +265,15 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } - input = fopen(argv[optind+1], "rb"); - if(input == NULL || ferror(input)) + input = u_fopen(argv[optind+1], "rb", NULL, NULL); + if(input == NULL) { wcerr << "Error: Cannot open file '" << argv[optind+1] << "'." << endl << endl; exit(EXIT_FAILURE); } - output= fopen(argv[optind+2], "wb"); - if(output == NULL || ferror(output)) + output = u_fopen(argv[optind+2], "wb", NULL, NULL); + if(output == NULL) { wcerr << "Error: Cannot open file '" << argv[optind+2] << "'." << endl << endl; exit(EXIT_FAILURE); @@ -290,8 +291,8 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } - input = fopen(argv[optind+1], "rb"); - if(input == NULL || ferror(input)) + input = u_fopen(argv[optind+1], "rb", NULL, NULL); + if(input == NULL) { wcerr << "Error: Cannot open file '" << argv[optind+1] << "'." << endl << endl; exit(EXIT_FAILURE); @@ -416,13 +417,13 @@ int main(int argc, char *argv[]) { wcerr << e.what(); if (fstp.getNullFlush()) { - fputwc_unlocked(L'\0', output); + u_fputc('\0', output); } exit(1); } - fclose(input); - fclose(output); + u_fclose(input); + u_fclose(output); return EXIT_SUCCESS; } diff --git a/lttoolbox/lt_tmxcomp.cc b/lttoolbox/lt_tmxcomp.cc index ab7df4b..b506b0c 100644 --- a/lttoolbox/lt_tmxcomp.cc +++ b/lttoolbox/lt_tmxcomp.cc @@ -82,25 +82,11 @@ int main(int argc, char *argv[]) switch(c_t) { case 'o': - { - wchar_t *param = new wchar_t[strlen(optarg)+1]; - if((size_t) -1 != mbstowcs(param, optarg, strlen(optarg))) - { - c.setOriginLanguageCode(param); - } - delete[] param; - } + c.setOriginLanguageCode(to_ustring(optarg)); break; case 'm': - { - wchar_t *param = new wchar_t[strlen(optarg)+1]; - if((size_t) -1 != mbstowcs(param, optarg, strlen(optarg))) - { - c.setMetaLanguageCode(param); - } - delete[] param; - } + c.setMetaLanguageCode(to_ustring(optarg)); break; default: @@ -109,22 +95,15 @@ int main(int argc, char *argv[]) } } - string opc = argv[argc-3]; - wchar_t* lo = new wchar_t[opc.size()+1]; - wchar_t* lm = new wchar_t[opc.size()+1]; + UString opc = to_ustring(argv[argc-3]); + UString lo = opc.substr(0, opc.find('-')); + UString lm = opc.substr(opc.find('-')+1); - if(((size_t) -1 == mbstowcs(lo, opc.substr(0, opc.find('-')).c_str(), opc.size()))|| - ((size_t) -1 == mbstowcs(lm, opc.substr(opc.find('-')+1).c_str(), opc.size()))) - { - delete[] lo; - delete[] lm; + if(lo.empty() || lm.empty()) { endProgram(argv[0]); } - c.parse(argv[argc-2], lo, lm); - delete[] lo; - delete[] lm; FILE *output = fopen(argv[argc-1], "wb"); if(!output) diff --git a/lttoolbox/lt_tmxproc.cc b/lttoolbox/lt_tmxproc.cc index c90aca9..580b988 100644 --- a/lttoolbox/lt_tmxproc.cc +++ b/lttoolbox/lt_tmxproc.cc @@ -43,7 +43,8 @@ void checkValidity(FSTProcessor const &fstp) int main(int argc, char *argv[]) { - FILE *input = stdin, *output = stdout; + UFILE* input = u_finit(stdin, NULL, NULL); + UFILE* output = u_finit(stdout, NULL, NULL); LtLocale::tryToSetLocale(); FSTProcessor fstp; FILE *aux; @@ -51,14 +52,14 @@ int main(int argc, char *argv[]) switch(argc) { case 4: - output = fopen(argv[3], "wb"); + output = u_fopen(argv[3], "wb", NULL, NULL); if(!output) { endProgram(argv[0]); } // follow case 3: - input = fopen(argv[2], "rb"); + input = u_fopen(argv[2], "rb", NULL, NULL); if(!input) { endProgram(argv[0]); @@ -82,7 +83,7 @@ int main(int argc, char *argv[]) checkValidity(fstp); fstp.tm_analysis(input, output); - fclose(input); - fclose(output); + u_fclose(input); + u_fclose(output); return EXIT_SUCCESS; } diff --git a/lttoolbox/lt_trim.cc b/lttoolbox/lt_trim.cc index 837794f..eabff7c 100644 --- a/lttoolbox/lt_trim.cc +++ b/lttoolbox/lt_trim.cc @@ -24,6 +24,7 @@ #include #include #include +#include void endProgram(char *name) { @@ -35,13 +36,12 @@ void endProgram(char *name) exit(EXIT_FAILURE); } -std::pair, std::map > +std::pair, std::map > read_fst(FILE *bin_file) { Alphabet new_alphabet; - wstring letters = L""; - std::map transducers; + std::map transducers; fpos_t pos; if (fgetpos(bin_file, &pos) == 0) { @@ -60,47 +60,36 @@ read_fst(FILE *bin_file) } // letters - int len = Compression::multibyte_read(bin_file); - while(len > 0) - { - letters.push_back(static_cast(Compression::multibyte_read(bin_file))); - len--; - } + UString letters = Compression::string_read(bin_file); // symbols new_alphabet.read(bin_file); - len = Compression::multibyte_read(bin_file); + int len = Compression::multibyte_read(bin_file); while(len > 0) { - int len2 = Compression::multibyte_read(bin_file); - wstring name = L""; - while(len2 > 0) - { - name += static_cast(Compression::multibyte_read(bin_file)); - len2--; - } + UString name = Compression::string_read(bin_file); transducers[name].read(bin_file); len--; } - std::pair alph_letters; + std::pair alph_letters; alph_letters.first = new_alphabet; alph_letters.second = letters; - return std::pair, std::map > (alph_letters, transducers); + return std::pair, std::map > (alph_letters, transducers); } -std::pair, std::map > +std::pair, std::map > trim(FILE *file_mono, FILE *file_bi) { - std::pair, std::map > alph_trans_mono = read_fst(file_mono); + std::pair, std::map > alph_trans_mono = read_fst(file_mono); Alphabet alph_mono = alph_trans_mono.first.first; - std::map trans_mono = alph_trans_mono.second; - std::pair, std::map > alph_trans_bi = read_fst(file_bi); + std::map trans_mono = alph_trans_mono.second; + std::pair, std::map > alph_trans_bi = read_fst(file_bi); Alphabet alph_bi = alph_trans_bi.first.first; - std::map trans_bi = alph_trans_bi.second; + std::map trans_bi = alph_trans_bi.second; // The prefix transducer is the union of all transducers from bidix, // with a ".*" appended @@ -111,7 +100,7 @@ trim(FILE *file_mono, FILE *file_bi) set loopback_symbols; // ints refer to alph_prefix alph_prefix.createLoopbackSymbols(loopback_symbols, alph_mono, Alphabet::right); - for(std::map::iterator it = trans_bi.begin(); it != trans_bi.end(); it++) + for(std::map::iterator it = trans_bi.begin(); it != trans_bi.end(); it++) { Transducer union_tmp = it->second; if(union_transducer.isEmpty()) @@ -130,21 +119,21 @@ trim(FILE *file_mono, FILE *file_bi) Transducer moved_transducer = prefix_transducer.moveLemqsLast(alph_prefix); - for(std::map::iterator it = trans_mono.begin(); it != trans_mono.end(); it++) + for(std::map::iterator it = trans_mono.begin(); it != trans_mono.end(); it++) { Transducer trimmed = it->second.intersect(moved_transducer, alph_mono, alph_prefix); - wcout << it->first << " " << it->second.size(); - wcout << " " << it->second.numberOfTransitions() << endl; + cout << it->first << " " << it->second.size(); + cout << " " << it->second.numberOfTransitions() << endl; if(it->second.numberOfTransitions() == 0) { - wcerr << L"Warning: empty section! Skipping it ..."<first].clear(); } else if(trimmed.hasNoFinals()) { - wcerr << L"Warning: section had no final state after trimming! Skipping it ..."<first].clear(); } else { @@ -170,25 +159,24 @@ int main(int argc, char *argv[]) FILE *analyser = fopen(argv[1], "rb"); if(!analyser) { - wcerr << "Error: Cannot open file '" << argv[1] << "'." << endl << endl; + cerr << "Error: Cannot open file '" << argv[1] << "'." << endl << endl; exit(EXIT_FAILURE); } FILE *bidix = fopen(argv[2], "rb"); if(!bidix) { - wcerr << "Error: Cannot open file '" << argv[2] << "'." << endl << endl; + cerr << "Error: Cannot open file '" << argv[2] << "'." << endl << endl; exit(EXIT_FAILURE); } - std::pair, std::map > trimmed = trim(analyser, bidix); + std::pair, std::map > trimmed = trim(analyser, bidix); Alphabet alph_t = trimmed.first.first; - wstring letters = trimmed.first.second; - std::map trans_t = trimmed.second; + UString letters = trimmed.first.second; + std::map trans_t = trimmed.second; int n_transducers = 0; - for(std::map::iterator it = trans_t.begin(); it != trans_t.end(); it++) - { - if(!(it->second.isEmpty())) + for(auto& it : trans_t) { + if(!(it.second.isEmpty())) { n_transducers++; } @@ -196,9 +184,9 @@ int main(int argc, char *argv[]) if(n_transducers == 0) { - wcerr << L"Error: Trimming gave empty transducer!" << endl; - wcerr << L"Hint: There are no words in bilingual dictionary that match " - L"words in both monolingual dictionaries?" << endl; + cerr << "Error: Trimming gave empty transducer!" << endl; + cerr << "Hint: There are no words in bilingual dictionary that match " + "words in both monolingual dictionaries?" << endl; exit(EXIT_FAILURE); } @@ -206,24 +194,23 @@ int main(int argc, char *argv[]) FILE *output = fopen(argv[3], "wb"); if(!output) { - wcerr << "Error: Cannot open file '" << argv[3] << "'." << endl << endl; + cerr << "Error: Cannot open file '" << argv[3] << "'." << endl << endl; exit(EXIT_FAILURE); } // letters - Compression::wstring_write(letters, output); + Compression::string_write(letters, output); // symbols alph_t.write(output); // transducers Compression::multibyte_write(n_transducers, output); - for(std::map::iterator it = trans_t.begin(); it != trans_t.end(); it++) - { - if(!(it->second.isEmpty())) + for(auto& it : trans_t) { + if(!(it.second.isEmpty())) { - Compression::wstring_write(it->first, output); - it->second.write(output); + Compression::string_write(it.first, output); + it.second.write(output); } } diff --git a/lttoolbox/ltstr.h b/lttoolbox/ltstr.h deleted file mode 100644 index 9e5abb6..0000000 --- a/lttoolbox/ltstr.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ -#ifndef _Ltstr_ -#define _Ltstr_ - -#include -#include -#include - -using namespace std; - -struct Ltstr -{ - bool operator()(string const &s1, string const &s2) const - { - return strcmp(s1.c_str(), s2.c_str()) < 0; - } - - bool operator()(wchar_t const *s1, wchar_t const *s2) const - { - return wcscmp(s1, s2) < 0; - } - - bool operator()(char const *s1, char const *s2) const - { - return strcmp(s1, s2) < 0; - } - - bool operator()(wstring const &s1, wstring const &s2) const - { - return wcscmp(s1.c_str(), s2.c_str()) < 0; - } -}; - -#endif diff --git a/lttoolbox/pattern_list.cc b/lttoolbox/pattern_list.cc index ed1f056..e5dd5fc 100644 --- a/lttoolbox/pattern_list.cc +++ b/lttoolbox/pattern_list.cc @@ -22,9 +22,9 @@ #include #include -wstring const PatternList::ANY_CHAR = L""; -wstring const PatternList::ANY_TAG = L""; -wstring const PatternList::QUEUE = L""; +UString const PatternList::ANY_CHAR = ""_u; +UString const PatternList::ANY_TAG = ""_u; +UString const PatternList::QUEUE = ""_u; void PatternList::copy(PatternList const &o) @@ -80,7 +80,7 @@ PatternList::beginSequence() { if(sequence) { - wcerr << L"Error: opening an unended sequence" << endl; + cerr << "Error: opening an unended sequence" << endl; exit(EXIT_FAILURE); } sequence = true; @@ -92,7 +92,7 @@ PatternList::endSequence() { if(!sequence) { - wcerr << L"Error: ending an unopened sequence" << endl; + cerr << "Error: ending an unopened sequence" << endl; exit(EXIT_FAILURE); } sequence = false; @@ -107,10 +107,10 @@ PatternList::endSequence() } void -PatternList::insertOutOfSequence(wstring const &lemma, wstring const &tags, +PatternList::insertOutOfSequence(UString const &lemma, UString const &tags, vector &result) { - if(lemma == L"") + if(lemma.empty()) { result.push_back(alphabet(ANY_CHAR)); } @@ -128,7 +128,7 @@ PatternList::insertOutOfSequence(wstring const &lemma, wstring const &tags, } } } - if(tags == L"") + if(tags.empty()) { result.push_back(alphabet(ANY_TAG)); } @@ -136,9 +136,9 @@ PatternList::insertOutOfSequence(wstring const &lemma, wstring const &tags, { for(unsigned int i = 0, limit = tagCount(tags); i < limit; i++) { - wstring tag = L"<" + tagAt(tags, i) + L">"; + UString tag = "<"_u + tagAt(tags, i) + ">"_u; - if(tag == L"<*>") + if(tag == "<*>"_u) { result.push_back(alphabet(ANY_TAG)); } @@ -152,8 +152,8 @@ PatternList::insertOutOfSequence(wstring const &lemma, wstring const &tags, } void -PatternList::insertIntoSequence(int const id, wstring const &lemma, - wstring const &tags) +PatternList::insertIntoSequence(int const id, UString const &lemma, + UString const &tags) { sequence_id = id; @@ -176,7 +176,7 @@ PatternList::insertIntoSequence(int const id, wstring const &lemma, } void -PatternList::insert(int const id, wstring const &lemma, wstring const &tags) +PatternList::insert(int const id, UString const &lemma, UString const &tags) { if(!sequence) { @@ -196,7 +196,7 @@ PatternList::insert(int const id, int const otherid) { if(!sequence) { - wcerr << L"Error: using labels outside of a sequence" << endl; + cerr << "Error: using labels outside of a sequence" << endl; exit(EXIT_FAILURE); } @@ -233,7 +233,7 @@ PatternList::insert(int const id, int const otherid) } int -PatternList::tagCount(wstring const &tags) +PatternList::tagCount(UString const &tags) { int count = 0; @@ -252,8 +252,8 @@ PatternList::tagCount(wstring const &tags) return count; } -wstring -PatternList::tagAt(wstring const &tags, int const index) +UString +PatternList::tagAt(UString const &tags, int const index) { int start = 0; int end = 0; @@ -282,7 +282,7 @@ PatternList::tagAt(wstring const &tags, int const index) if(index > count) { - return L""; + return ""_u; } if(end != 0) { @@ -366,10 +366,10 @@ void PatternList::write(FILE *output) { alphabet.write(output); - wstring const tagger_name = L"tagger"; + UString const tagger_name = "tagger"_u; Compression::multibyte_write(1, output); - Compression::wstring_write(tagger_name, output); + Compression::string_write(tagger_name, output); transducer.write(output, alphabet.size()); Compression::multibyte_write(final_type.size(), output); @@ -391,7 +391,7 @@ PatternList::read(FILE *input) alphabet.read(input); if(Compression::multibyte_read(input) == 1) { - wstring mystr = Compression::wstring_read(input); + UString mystr = Compression::string_read(input); transducer.read(input, alphabet.size()); int finalsize = Compression::multibyte_read(input); diff --git a/lttoolbox/pattern_list.h b/lttoolbox/pattern_list.h index 5dde942..1b88403 100644 --- a/lttoolbox/pattern_list.h +++ b/lttoolbox/pattern_list.h @@ -45,29 +45,29 @@ private: void copy(PatternList const &o); void destroy(); - void insertOutOfSequence(wstring const &lemma, wstring const &tags, + void insertOutOfSequence(UString const &lemma, UString const &tags, vector &result); - void insertIntoSequence(int const id, wstring const &lemma, - wstring const &tags); + void insertIntoSequence(int const id, UString const &lemma, + UString const &tags); - static int tagCount(wstring const &tags); - static wstring tagAt(wstring const &tags, int const index); + static int tagCount(UString const &tags); + static UString tagAt(UString const &tags, int const index); public: /** * This symbol stands for any char */ - static wstring const ANY_CHAR; + static UString const ANY_CHAR; /** * This symbol stands for any tag */ - static wstring const ANY_TAG; + static UString const ANY_TAG; /** * This symbol marks a word queue */ - static wstring const QUEUE; + static UString const QUEUE; /** * Constructor @@ -106,7 +106,7 @@ public: * @param lemma * @param tags */ - void insert(int const id, wstring const &lemma, wstring const &tags); + void insert(int const id, UString const &lemma, UString const &tags); /** * Insertion method diff --git a/lttoolbox/regexp_compiler.cc b/lttoolbox/regexp_compiler.cc index 4e04c0d..340f953 100644 --- a/lttoolbox/regexp_compiler.cc +++ b/lttoolbox/regexp_compiler.cc @@ -96,14 +96,14 @@ RegexpCompiler::isReserved(int const t) void RegexpCompiler::error() { - wcerr << L"Error parsing regexp" < const alts) } void -State::step_case(wchar_t val, wchar_t val2, bool caseSensitive) +State::step_case(UChar val, UChar val2, bool caseSensitive) { if (!iswupper(val) || caseSensitive) { step(val, val2); @@ -416,7 +416,7 @@ State::step_case(wchar_t val, wchar_t val2, bool caseSensitive) void -State::step_case(wchar_t val, bool caseSensitive) +State::step_case(UChar val, bool caseSensitive) { if (!iswupper(val) || caseSensitive) { step(val); @@ -441,14 +441,14 @@ State::isFinal(map const &finals) const } -vector> -State::NFinals(vector> lf, int maxAnalyses, int maxWeightClasses) const +vector> +State::NFinals(vector> lf, int maxAnalyses, int maxWeightClasses) const { - vector> result; + vector> result; - sort(lf.begin(), lf.end(), sort_weights()); + sort(lf.begin(), lf.end(), sort_weights()); - for(vector >::iterator it = lf.begin(); it != lf.end(); it++) + for(vector >::iterator it = lf.begin(); it != lf.end(); it++) { double last_weight = 0.0000; if(maxAnalyses > 0 && maxWeightClasses > 0) @@ -466,16 +466,16 @@ State::NFinals(vector> lf, int maxAnalyses, int maxWeightC } -wstring +UString State::filterFinals(map const &finals, Alphabet const &alphabet, - set const &escaped_chars, + set const &escaped_chars, bool display_weights, int max_analyses, int max_weight_classes, bool uppercase, bool firstupper, int firstchar) const { - vector> response; + vector> response; - wstring result = L""; + UString result; double cost = 0.0000; for(size_t i = 0, limit = state.size(); i != limit; i++) @@ -491,14 +491,14 @@ State::filterFinals(map const &finals, { if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) { - result += L'\\'; + result += '\\'; } alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first, uppercase); cost += ((*(state[i].sequence))[j]).second; } if(firstupper) { - if(result[first_char] == L'~') + if(result[first_char] == '~') { // skip post-generation mark result[first_char+1] = towupper(result[first_char+1]); @@ -517,7 +517,7 @@ State::filterFinals(map const &finals, { if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) { - result += L'\\'; + result += '\\'; } alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first); cost += ((*(state[i].sequence))[j]).second; @@ -532,16 +532,16 @@ State::filterFinals(map const &finals, response = NFinals(response, max_analyses, max_weight_classes); - result = L""; - for(vector>::iterator it = response.begin(); it != response.end(); it++) + result.clear(); + for(vector>::iterator it = response.begin(); it != response.end(); it++) { - result += L'/'; + result += '/'; result += it->first; if(display_weights) { - result += L"second); - result += L">"; + UChar* temp; + u_sprintf(temp, "", it->second); + result += temp; } } @@ -549,39 +549,39 @@ State::filterFinals(map const &finals, } -set > > +set > > State::filterFinalsLRX(map const &finals, Alphabet const &alphabet, - set const &escaped_chars, + set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { - set > > results; + set > > results; - vector current_result; - wstring rule_id = L""; + vector current_result; + UString rule_id; for(size_t i = 0, limit = state.size(); i != limit; i++) { if(finals.find(state[i].where) != finals.end()) { current_result.clear(); - rule_id = L""; - wstring current_word = L""; + rule_id.clear(); + UString current_word; for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) { if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) { - current_word += L'\\'; + current_word += '\\'; } - wstring sym = L""; + UString sym; alphabet.getSymbol(sym, ((*(state[i].sequence))[j]).first, uppercase); - if(sym == L"<$>") + if(sym == "<$>"_u) { - if(current_word != L"") + if(!current_word.empty()) { current_result.push_back(current_word); } - current_word = L""; + current_word.clear(); } else { @@ -597,32 +597,34 @@ State::filterFinalsLRX(map const &finals, } -wstring +UString State::filterFinalsSAO(map const &finals, Alphabet const &alphabet, - set const &escaped_chars, + set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { - wstring result = L""; - wstring annot = L""; + UString result; + UString annot; for(size_t i = 0, limit = state.size(); i != limit; i++) { if(finals.find(state[i].where) != finals.end()) { - result += L'/'; + result += '/'; unsigned int const first_char = result.size() + firstchar; for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) { if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) { - result += L'\\'; + result += '\\'; } if(alphabet.isTag(((*(state[i].sequence))[j]).first)) { - annot = L""; + annot.clear(); alphabet.getSymbol(annot, ((*(state[i].sequence))[j]).first); - result += L'&'+annot.substr(1,annot.length()-2)+L';'; + result += '&'; + result += annot.substr(1,annot.length()-2); + result += ';'; } else { @@ -631,7 +633,7 @@ State::filterFinalsSAO(map const &finals, } if(firstupper) { - if(result[first_char] == L'~') + if(result[first_char] == '~') { // skip post-generation mark result[first_char+1] = towupper(result[first_char+1]); @@ -647,24 +649,24 @@ State::filterFinalsSAO(map const &finals, return result; } -wstring +UString State::filterFinalsTM(map const &finals, Alphabet const &alphabet, - set const &escaped_chars, - queue &blankqueue, vector &numbers) const + set const &escaped_chars, + queue &blankqueue, vector &numbers) const { - wstring result = L""; + UString result; for(size_t i = 0, limit = state.size(); i != limit; i++) { if(finals.find(state[i].where) != finals.end()) { - result += L'/'; + result += '/'; for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) { if(escaped_chars.find((*(state[i].sequence))[j].first) != escaped_chars.end()) { - result += L'\\'; + result += '\\'; } alphabet.getSymbol(result, (*(state[i].sequence))[j].first); } @@ -672,15 +674,15 @@ State::filterFinalsTM(map const &finals, } - wstring result2 = L""; - vector fragment; - fragment.push_back(L""); + UString result2; + vector fragment; + fragment.push_back(""_u); for(unsigned int i = 0, limit = result.size(); i != limit ; i++) { - if(result[i] == L')') + if(result[i] == ')') { - fragment.push_back(L""); + fragment.push_back(""_u); } else { @@ -692,9 +694,9 @@ State::filterFinalsTM(map const &finals, { if(i != limit -1) { - if(fragment[i].size() >=2 && fragment[i].substr(fragment[i].size()-2) == L"(#") + if(fragment[i].size() >=2 && fragment[i].substr(fragment[i].size()-2) == "(#"_u) { - wstring whitespace = L" "; + UString whitespace = " "_u; if(blankqueue.size() != 0) { whitespace = blankqueue.front().substr(1); @@ -709,8 +711,8 @@ State::filterFinalsTM(map const &finals, bool substitute = false; for(int j = fragment[i].size() - 1; j >= 0; j--) { - if(fragment[i].size()-j > 3 && fragment[i][j] == L'\\' && - fragment[i][j+1] == L'@' && fragment[i][j+2] == L'(') + if(fragment[i].size()-j > 3 && fragment[i][j] == '\\' && + fragment[i][j+1] == '@' && fragment[i][j+2] == '(') { int num = 0; bool correct = true; @@ -738,13 +740,13 @@ State::filterFinalsTM(map const &finals, } if(substitute == false) { - fragment[i] += L')'; + fragment[i] += ')'; } } } } - result = L""; + result.clear(); for(unsigned int i = 0, limit = fragment.size(); i != limit; i++) { @@ -888,26 +890,26 @@ State::restartFinals(const map &finals, int requiredSymbol, Stat -wstring +UString State::getReadableString(const Alphabet &a) { - wstring retval = L"["; + UString retval = "["_u; for(unsigned int i=0; i>* seq = state.at(i).sequence; if(seq != NULL) for (unsigned int j=0; jsize(); j++) { - wstring ws = L""; + UString ws; a.getSymbol(ws, (seq->at(j)).first); retval.append(ws); } if(i+1 < state.size()) { - retval.append(L", "); + retval.append(", "_u); } } - retval.append(L"]"); + retval.append("]"_u); return retval; } diff --git a/lttoolbox/state.h b/lttoolbox/state.h index a7840c7..7f67142 100644 --- a/lttoolbox/state.h +++ b/lttoolbox/state.h @@ -30,6 +30,8 @@ #include #include +#include + using namespace std; /** @@ -188,9 +190,9 @@ public: */ void step(int const input, set const alts); - void step_case(wchar_t val, bool caseSensitive); + void step_case(UChar val, bool caseSensitive); - void step_case(wchar_t val, wchar_t val2, bool caseSensitive); + void step_case(UChar val, UChar val2, bool caseSensitive); void step_careful(int const input, int const alt); @@ -236,7 +238,7 @@ public: } }; - vector> NFinals(vector> lf, + vector> NFinals(vector> lf, int maxAnalyses, int maxWeightClasses) const; @@ -252,9 +254,9 @@ public: * @param firstchar first character of the word * @return the result of the transduction */ - wstring filterFinals(map const &finals, + UString filterFinals(map const &finals, Alphabet const &a, - set const &escaped_chars, + set const &escaped_chars, bool display_weights = false, int max_analyses = INT_MAX, int max_weight_classes = INT_MAX, @@ -273,9 +275,9 @@ public: * @param firstchar first character of the word * @return the result of the transduction */ - wstring filterFinalsSAO(map const &finals, + UString filterFinalsSAO(map const &finals, Alphabet const &a, - set const &escaped_chars, + set const &escaped_chars, bool uppercase = false, bool firstupper = false, int firstchar = 0) const; @@ -293,9 +295,9 @@ public: * @return the result of the transduction */ - set > > filterFinalsLRX(map const &finals, + set > > filterFinalsLRX(map const &finals, Alphabet const &a, - set const &escaped_chars, + set const &escaped_chars, bool uppercase = false, bool firstupper = false, int firstchar = 0) const; @@ -326,13 +328,13 @@ public: /** * Return the full states string (to allow debuging...) using a Java ArrayList.toString style */ - wstring getReadableString(const Alphabet &a); + UString getReadableString(const Alphabet &a); - wstring filterFinalsTM(map const &finals, + UString filterFinalsTM(map const &finals, Alphabet const &alphabet, - set const &escaped_chars, - queue &blanks, - vector &numbers) const; + set const &escaped_chars, + queue &blanks, + vector &numbers) const; }; diff --git a/lttoolbox/tmx_compiler.cc b/lttoolbox/tmx_compiler.cc index 39113ee..3afe14a 100644 --- a/lttoolbox/tmx_compiler.cc +++ b/lttoolbox/tmx_compiler.cc @@ -19,37 +19,32 @@ #include #include #include -#include #include #include #include -#ifdef _WIN32 -#define swprintf _snwprintf -#endif - using namespace std; -wstring const TMXCompiler::TMX_COMPILER_TMX_ELEM = L"tmx"; -wstring const TMXCompiler::TMX_COMPILER_HEADER_ELEM = L"header"; -wstring const TMXCompiler::TMX_COMPILER_BODY_ELEM = L"body"; -wstring const TMXCompiler::TMX_COMPILER_TU_ELEM = L"tu"; -wstring const TMXCompiler::TMX_COMPILER_TUV_ELEM = L"tuv"; -wstring const TMXCompiler::TMX_COMPILER_HI_ELEM = L"hi"; -wstring const TMXCompiler::TMX_COMPILER_PH_ELEM = L"ph"; -wstring const TMXCompiler::TMX_COMPILER_XMLLANG_ATTR = L"xml:lang"; -wstring const TMXCompiler::TMX_COMPILER_LANG_ATTR = L"lang"; -wstring const TMXCompiler::TMX_COMPILER_SEG_ELEM = L"seg"; -wstring const TMXCompiler::TMX_COMPILER_PROP_ELEM = L"prop"; +UString const TMXCompiler::TMX_COMPILER_TMX_ELEM = "tmx"_u; +UString const TMXCompiler::TMX_COMPILER_HEADER_ELEM = "header"_u; +UString const TMXCompiler::TMX_COMPILER_BODY_ELEM = "body"_u; +UString const TMXCompiler::TMX_COMPILER_TU_ELEM = "tu"_u; +UString const TMXCompiler::TMX_COMPILER_TUV_ELEM = "tuv"_u; +UString const TMXCompiler::TMX_COMPILER_HI_ELEM = "hi"_u; +UString const TMXCompiler::TMX_COMPILER_PH_ELEM = "ph"_u; +UString const TMXCompiler::TMX_COMPILER_XMLLANG_ATTR = "xml:lang"_u; +UString const TMXCompiler::TMX_COMPILER_LANG_ATTR = "lang"_u; +UString const TMXCompiler::TMX_COMPILER_SEG_ELEM = "seg"_u; +UString const TMXCompiler::TMX_COMPILER_PROP_ELEM = "prop"_u; TMXCompiler::TMXCompiler() : reader(0), default_weight(0.0000) { LtLocale::tryToSetLocale(); - alphabet.includeSymbol(L""); // -1 -> numbers - alphabet.includeSymbol(L""); // -2 -> blanks + alphabet.includeSymbol(""_u); // -1 -> numbers + alphabet.includeSymbol(""_u); // -2 -> blanks } TMXCompiler::~TMXCompiler() @@ -57,14 +52,14 @@ TMXCompiler::~TMXCompiler() } void -TMXCompiler::parse(string const &file, wstring const &lo, wstring const &lm) +TMXCompiler::parse(string const &file, UString const &lo, UString const &lm) { origin_language = lo; meta_language = lm; reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - wcerr << "Error: Cannot open '" << file << "'." << endl; + cerr << "Error: Cannot open '" << file << "'." << endl; exit(EXIT_FAILURE); } @@ -77,7 +72,7 @@ TMXCompiler::parse(string const &file, wstring const &lo, wstring const &lm) if(ret != 0) { - wcerr << L"Error: Parse error at the end of input." << endl; + cerr << "Error: Parse error at the end of input." << endl; } xmlFreeTextReader(reader); @@ -88,12 +83,12 @@ TMXCompiler::parse(string const &file, wstring const &lo, wstring const &lm) } void -TMXCompiler::requireEmptyError(wstring const &name) +TMXCompiler::requireEmptyError(UString const &name) { if(!xmlTextReaderIsEmptyElement(reader)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; exit(EXIT_FAILURE); } } @@ -102,7 +97,7 @@ bool TMXCompiler::allBlanks() { bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString text = XMLParseUtil::readValue(reader); for(auto c : text) { @@ -113,79 +108,79 @@ TMXCompiler::allBlanks() } void -TMXCompiler::skipBlanks(wstring &name) +TMXCompiler::skipBlanks(UString &name) { - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { - if(name != L"#comment") + if(name != "#comment"_u) { if(!allBlanks()) { - wcerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << "): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } } void -TMXCompiler::skip(wstring &name, wstring const &elem) +TMXCompiler::skip(UString &name, UString const &elem) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); - while(name == L"#text" || name == L"#comment") + while(name == "#text"_u || name == "#comment"_u) { - if(name != L"#comment") + if(name != "#comment"_u) { if(!allBlanks()) { - wcerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << "): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } if(name != elem) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Expected '<" << elem << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Expected '<" << elem << ">'." << endl; exit(EXIT_FAILURE); } } -wstring -TMXCompiler::attrib(wstring const &name) +UString +TMXCompiler::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } void -TMXCompiler::requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname) +TMXCompiler::requireAttribute(UString const &value, UString const &attrname, + UString const &elemname) { - if(value == L"") + if(value.empty()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): '<" << elemname; - wcerr << L"' element must specify non-void '"; - wcerr << attrname << L"' attribute." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): '<" << elemname; + cerr << "' element must specify non-void '"; + cerr << attrname << "' attribute." << endl; exit(EXIT_FAILURE); } } -wstring +UString TMXCompiler::getTag(size_t const &val) const { - wchar_t cad[32]; - swprintf(cad, 32, L"<%d>", val); + UChar cad[32]; + u_snprintf(cad, 32, "<%d>", val); return cad; } @@ -197,7 +192,7 @@ TMXCompiler::insertTU(vector const &origin, vector const &meta) return; } - if(origin[0] == alphabet(L"") || meta[0] == alphabet(L"")) + if(origin[0] == alphabet(""_u) || meta[0] == alphabet(""_u)) { return; } @@ -273,7 +268,7 @@ TMXCompiler::align_blanks(vector &o, vector &m) vector puntos; vector resultado_o, resultado_m; - int const symbol = alphabet(L""); + int const symbol = alphabet(""_u); vector > so, sm; @@ -288,8 +283,8 @@ TMXCompiler::align_blanks(vector &o, vector &m) trim(sm[i]); if(sm.size() - 1 != i) { - sm[i].push_back(L'('); - sm[i].push_back(L'#'); + sm[i].push_back('('); + sm[i].push_back('#'); } /* while(so[i].size() < sm[i].size()) @@ -301,8 +296,8 @@ TMXCompiler::align_blanks(vector &o, vector &m) sm[i].push_back(0); }*/ } - o = join(so, L' '); - m = join(sm, L')'); + o = join(so, ' '); + m = join(sm, ')'); } else { @@ -315,19 +310,19 @@ TMXCompiler::align_blanks(vector &o, vector &m) trim(sm[i]); if(sm.size() - 1 != i) { - sm[i].push_back(L'('); - sm[i].push_back(L'#'); + sm[i].push_back('('); + sm[i].push_back('#'); } } - o = join(so, L' '); - m = join(sm, L')'); + o = join(so, ' '); + m = join(sm, ')'); } } void TMXCompiler::procTU() { - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); int type = xmlTextReaderNodeType(reader); vector origin; vector meta; @@ -337,9 +332,8 @@ TMXCompiler::procTU() { if(name == TMX_COMPILER_TUV_ELEM && type != XML_READER_TYPE_END_ELEMENT) { - wstring l = attrib(TMX_COMPILER_XMLLANG_ATTR); - if(l == L"") - { + UString l = attrib(TMX_COMPILER_XMLLANG_ATTR); + if(l.empty()) { l = attrib(TMX_COMPILER_LANG_ATTR); } @@ -360,12 +354,12 @@ TMXCompiler::procTU() while(name != TMX_COMPILER_TUV_ELEM || type != XML_READER_TYPE_END_ELEMENT) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); type = xmlTextReaderNodeType(reader); - if(name == L"#text") + if(name == "#text"_u) { - wstring l = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString l = XMLParseUtil::readValue(reader); for(size_t i = 0, limit = l.size(); i != limit; i++) { ref->push_back(l[i]); @@ -375,28 +369,28 @@ TMXCompiler::procTU() { if(type != XML_READER_TYPE_END_ELEMENT) { - ref->push_back(alphabet(L"")); + ref->push_back(alphabet(""_u)); } } } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); type = xmlTextReaderNodeType(reader); } trim(origin); trim(meta); -// wcout << L"DESPUES DE TRIM\n"; +// cout << "DESPUES DE TRIM\n"; // printvector(origin); // printvector(meta); align(origin, meta); -// wcout << L"DESPUES DE ALIGN\n"; +// cout << "DESPUES DE ALIGN\n"; // printvector(origin); // printvector(meta); align_blanks(origin, meta); -// wcout << L"DESPUES DE ALIGNBLANKS\n"; +// cout << "DESPUES DE ALIGNBLANKS\n"; // printvector(origin); // printvector(meta); insertTU(origin, meta); @@ -405,12 +399,11 @@ TMXCompiler::procTU() void TMXCompiler::procNode() { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); + UString name = XMLParseUtil::readName(reader); // HACER: optimizar el orden de ejecución de esta ristra de "ifs" - if(name == L"#text") + if(name == "#text"_u) { /* ignorar */ } @@ -434,14 +427,14 @@ TMXCompiler::procNode() { procTU(); } - else if(name== L"#comment") + else if(name== "#comment"_u) { /* ignorar */ } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } @@ -454,19 +447,19 @@ TMXCompiler::write(FILE *output) write_le(output, features); // letters (empty to keep the file format) - Compression::wstring_write(L"", output); + Compression::string_write(""_u, output); // symbols alphabet.write(output); // transducers Compression::multibyte_write(1, output); // keeping file format - Compression::wstring_write(L"", output); // keeping file format + Compression::string_write(""_u, output); // keeping file format transducer.write(output); - wcout << origin_language << L"->" << meta_language << L" "; - wcout << transducer.size() << L" " << transducer.numberOfTransitions(); - wcout << endl; + cout << origin_language << "->" << meta_language << " "; + cout << transducer.size() << " " << transducer.numberOfTransitions(); + cout << endl; } void @@ -514,7 +507,7 @@ TMXCompiler::align(vector &origin, vector &meta) numbers_origin_start.push_back(i); numbers_origin_length.push_back(nl); i += nl-1; - modified_origin.push_back(alphabet(L"")); + modified_origin.push_back(alphabet(""_u)); } else { @@ -536,16 +529,16 @@ TMXCompiler::align(vector &origin, vector &meta) if(vectorcmp(origin, numbers_origin_start[j], meta, i, nl)) { - modified_meta.push_back(L'@'); - modified_meta.push_back(L'('); - wchar_t *valor = new wchar_t[8]; - swprintf(valor, 8, L"%d", j+1); - for(int k = 0, limit3 = wcslen(valor); k != limit3; k++) + modified_meta.push_back('@'); + modified_meta.push_back('('); + UChar* valor = new UChar[8]; + u_snprintf(valor, 8, "%d", j+1); + for(int k = 0, limit3 = u_strlen(valor); k != limit3; k++) { modified_meta.push_back(valor[k]); } delete[] valor; - modified_meta.push_back(L')'); + modified_meta.push_back(')'); i += nl-1; tocado = true; break; @@ -582,7 +575,7 @@ TMXCompiler::numberLength(vector &v, unsigned int const position) const { for(unsigned int i = position, limit = v.size(); i < limit; i++) { - if(!iswdigit(v[i]) && (v[i] != L'.' || i == position) && (v[i] != L',' || i == position)) + if(!iswdigit(v[i]) && (v[i] != '.' || i == position) && (v[i] != ',' || i == position)) { if(i == position) { @@ -640,11 +633,11 @@ TMXCompiler::printvector(vector const &v, wostream &os) { if(i != 0) { - os << L" "; + os << " "; } if(v[i] > 31) { - os << v[i] << L" ('" << wchar_t(v[i]) << L"')"; + os << v[i] << " ('" << UChar(v[i]) << "')"; } else { @@ -655,13 +648,13 @@ TMXCompiler::printvector(vector const &v, wostream &os) } void -TMXCompiler::setOriginLanguageCode(wstring const &code) +TMXCompiler::setOriginLanguageCode(UString const &code) { // nada } void -TMXCompiler::setMetaLanguageCode(wstring const &code) +TMXCompiler::setMetaLanguageCode(UString const &code) { // nada } diff --git a/lttoolbox/tmx_compiler.h b/lttoolbox/tmx_compiler.h index 53bb4b5..0fa179b 100644 --- a/lttoolbox/tmx_compiler.h +++ b/lttoolbox/tmx_compiler.h @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -60,22 +59,22 @@ private: /** * Origin language */ - wstring origin_language; + UString origin_language; /** * Meta language */ - wstring meta_language; + UString meta_language; /** * Origin language code in the TMX */ - wstring origin_language_inner_code; + UString origin_language_inner_code; /** * Origin language code in the TMX */ - wstring meta_language_inner_code; + UString meta_language_inner_code; /** @@ -100,26 +99,26 @@ private: * @param name the name of the attribute * @return the value of the attribute */ - wstring attrib(wstring const &name); + UString attrib(UString const &name); /** * Skip all document #text nodes before "elem" * @param name the name of the node * @param elem the name of the expected node */ - void skip(wstring &name, wstring const &elem); + void skip(UString &name, UString const &elem); /** * Skip all blank #text nodes before "name" * @param name the name of the node */ - void skipBlanks(wstring &name); + void skipBlanks(UString &name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(wstring const &name); + void requireEmptyError(UString const &name); /** * Force an attribute to be specified, amd check for it @@ -127,8 +126,8 @@ private: * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname); + void requireAttribute(UString const &value, UString const &attrname, + UString const &elemname); /** * True if all the elements in the current node are blanks @@ -136,7 +135,7 @@ private: */ bool allBlanks(); - wstring getTag(size_t const &val) const; + UString getTag(size_t const &val) const; void trim(vector &v) const; void align(vector &origin, vector &meta); unsigned int numberLength(vector &v, unsigned int const position) const; @@ -155,17 +154,17 @@ public: * Constants to represent the element and the attributes of * translation memories in TMX format */ - static wstring const TMX_COMPILER_TMX_ELEM; - static wstring const TMX_COMPILER_HEADER_ELEM; - static wstring const TMX_COMPILER_BODY_ELEM; - static wstring const TMX_COMPILER_TU_ELEM; - static wstring const TMX_COMPILER_TUV_ELEM; - static wstring const TMX_COMPILER_HI_ELEM; - static wstring const TMX_COMPILER_PH_ELEM; - static wstring const TMX_COMPILER_XMLLANG_ATTR; - static wstring const TMX_COMPILER_LANG_ATTR; - static wstring const TMX_COMPILER_SEG_ELEM; - static wstring const TMX_COMPILER_PROP_ELEM; + static UString const TMX_COMPILER_TMX_ELEM; + static UString const TMX_COMPILER_HEADER_ELEM; + static UString const TMX_COMPILER_BODY_ELEM; + static UString const TMX_COMPILER_TU_ELEM; + static UString const TMX_COMPILER_TUV_ELEM; + static UString const TMX_COMPILER_HI_ELEM; + static UString const TMX_COMPILER_PH_ELEM; + static UString const TMX_COMPILER_XMLLANG_ATTR; + static UString const TMX_COMPILER_LANG_ATTR; + static UString const TMX_COMPILER_SEG_ELEM; + static UString const TMX_COMPILER_PROP_ELEM; /** @@ -181,7 +180,7 @@ public: /** * Compile dictionary to letter transducers */ - void parse(string const &file, wstring const &lo, wstring const &lm); + void parse(string const &file, UString const &lo, UString const &lm); /** * Write the result of compilation @@ -193,13 +192,13 @@ public: * Set origin language inner code * @param code the code of the origin language into the TMX file being compiled */ - void setOriginLanguageCode(wstring const &code); + void setOriginLanguageCode(UString const &code); /** * Set meta language inner code * @param code the code of the meta language into the TMX file being compiled */ - void setMetaLanguageCode(wstring const &code); + void setMetaLanguageCode(UString const &code); }; diff --git a/lttoolbox/trans_exe.cc b/lttoolbox/trans_exe.cc index ce39ff6..75ae212 100644 --- a/lttoolbox/trans_exe.cc +++ b/lttoolbox/trans_exe.cc @@ -18,6 +18,7 @@ #include #include #include +#include TransExe::TransExe(): initial_id(0), diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index b37ae96..605d45f 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -24,6 +24,7 @@ #include #include #include +#include int Transducer::newState() @@ -170,8 +171,8 @@ Transducer::linkStates(int const source, int const target, } else { - wcerr << L"Error: Trying to link nonexistent states (" << source; - wcerr << L", " << target << L", " << tag << L")" << endl; + cerr << "Error: Trying to link nonexistent states (" << source; + cerr << ", " << target << ", " << tag << ")" << endl; exit(EXIT_FAILURE); } } @@ -189,7 +190,7 @@ Transducer::setFinal(int const state, double const weight, bool value) int initial_copy = getInitial(); if(state == initial_copy) { - wcerr << L"Setting initial state to final" << endl; + cerr << "Setting initial state to final" << endl; } */ if(value) @@ -261,7 +262,7 @@ Transducer::joinFinals(int const epsilon_tag) } else if(finals.size() == 0) { - wcerr << L"Error: empty set of final states" < states; @@ -801,7 +799,7 @@ Transducer::recognise(wstring pattern, Alphabet &a, FILE *err) { set new_state; //Transducer::closure(int const state, int const epsilon_tag) // For each of the current alive states - //fwprintf(err, L"step: %ls %lc (%d)\n", pattern.c_str(), *it, sym); + //fwprintf(err, "step: %ls %lc (%d)\n", pattern.c_str(), *it, sym); for(auto& it2 : states) { auto& p = transitions[it2]; @@ -811,19 +809,19 @@ Transducer::recognise(wstring pattern, Alphabet &a, FILE *err) { auto t = a.decode(it3.first); - wstring l = L""; + UString l; a.getSymbol(l, t.first); - //wstring r = L""; + //UString r; //a.getSymbol(r, t.second); - //fwprintf(err, L" -> state: %d, trans: %ls:%ls, targ: %d\n", *it2, (l == L"") ? L"ε" : l.c_str(), (r == L"") ? L"ε" : r.c_str(), it3->second); - //if(l.find(*it) != wstring::npos || l == L"" ) - if(l.find(it) != wstring::npos) + //fwprintf(err, " -> state: %d, trans: %ls:%ls, targ: %d\n", *it2, (l.empty()) ? "ε" : l.c_str(), (r.empty()) ? "ε" : r.c_str(), it3->second); + //if(l.find(*it) != UString::npos || l.empty() ) + if(l.find(it) != UString::npos) { auto myclosure = closure(it3.second.first, 0); - //wcerr << L"Before closure alives: " <"; - wstring compoundRSymbol = L""; - wstring COMPILER_JOIN_ELEM = L"+"; - wstring COMPILER_GROUP_ELEM = L"#"; - wstring COMPILER_ANY_TAG = L""; - wstring COMPILER_ANY_CHAR = L""; - wstring COMPILER_SEPARABLE_BOUNDARY = L"<$>"; + UString compoundOnlyLSymbol = ""_u; + UString compoundRSymbol = ""_u; + UString COMPILER_JOIN_ELEM = "+"_u; + UString COMPILER_GROUP_ELEM = "#"_u; + UString COMPILER_ANY_TAG = ""_u; + UString COMPILER_ANY_CHAR = ""_u; + UString COMPILER_SEPARABLE_BOUNDARY = "<$>"_u; // When searching, we need to record (this, (trimmer, trimmer_pre_plus)) typedef std::pair > SearchState; @@ -1095,7 +1093,7 @@ Transducer::intersect(Transducer &trimmer, trimmer_preplus_next = trimmer_preplus; if(states_this_trimmed.find(current) == states_this_trimmed.end()) { - wcerr < #include +#include +#include using namespace icu; @@ -17,7 +19,7 @@ stoi(const UString& str) int ret; int c = u_sscanf(str.c_str(), "%d", &ret); if (c != 1) { - throw std::invalid_argument(); + throw std::invalid_argument("unable to parse int"); } return ret; } @@ -28,7 +30,7 @@ stod(const UString& str) double ret; int c = u_sscanf(str.c_str(), "%f", &ret); if (c != 1) { - throw std::invalid_argument(); + throw std::invalid_argument("unable to parse float"); } return ret; } @@ -36,12 +38,14 @@ stod(const UString& str) UString to_ustring(const char* s) { - UnicodeString temp = UnicodeString::fromUTF8(s); - UString ret = temp.getTerminatedBuffer(); + auto sz = strlen(s); + UString ret; + ret.reserve(sz); + utf8::utf8to16(s, s+sz, std::back_inserter(ret)); return ret; } -char* +const char* to_char(const UString& str) { std::string stemp; @@ -49,13 +53,3 @@ to_char(const UString& str) utemp.toUTF8String(stemp); return stemp.c_str(); } - -static std::ostream& -operator<<(std::ostream& ostr, const UString& str) -{ - std::string res; - UnicodeString temp = str.c_str(); - temp.toUTF8String(res); - ostr << res; - return ostr; -} diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h index 34c4b0c..52cbadb 100644 --- a/lttoolbox/ustring.h +++ b/lttoolbox/ustring.h @@ -19,8 +19,23 @@ double stod(const UString& str); UString to_ustring(const char* str); // for interfacing with e.g. XML library -char* to_char(const UString& str); - -static std::ostream& operator<<(std::ostream& ostr, const UString& str); +const char* to_char(const UString& str); + +static std::ostream& operator<<(std::ostream& ostr, const UString& str) +{ + std::string res; + icu::UnicodeString temp = str.c_str(); + temp.toUTF8String(res); + ostr << res; + return ostr; +} + +inline UString operator "" _u(const char* str, std::size_t len) { + UString us(len, 0); + for (size_t i = 0; i < len; ++i) { + us[i] = str[i]; + } + return us; +} #endif diff --git a/lttoolbox/xml_parse_util.cc b/lttoolbox/xml_parse_util.cc index 2a82701..7c8161a 100644 --- a/lttoolbox/xml_parse_util.cc +++ b/lttoolbox/xml_parse_util.cc @@ -26,7 +26,7 @@ XMLParseUtil::attrib(xmlTextReaderPtr reader, UString const &name) { xmlChar *attrname = xmlCharStrdup(to_char(name)); xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); - UString result = to_ustring(myattr); + UString result = to_ustring(reinterpret_cast(myattr)); xmlFree(myattr); xmlFree(attrname); return result; @@ -37,7 +37,7 @@ XMLParseUtil::attrib(xmlTextReaderPtr reader, UString const &name, const UString { xmlChar *attrname = xmlCharStrdup(to_char(name)); xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); - UString result = to_ustring(myattr); + UString result = to_ustring(reinterpret_cast(myattr)); xmlFree(myattr); xmlFree(attrname); if(myattr == NULL) { @@ -47,3 +47,17 @@ XMLParseUtil::attrib(xmlTextReaderPtr reader, UString const &name, const UString return result; } } + +UString +XMLParseUtil::readName(xmlTextReaderPtr reader) +{ + const xmlChar* name = xmlTextReaderConstName(reader); + return to_ustring(reinterpret_cast(name)); +} + +UString +XMLParseUtil::readValue(xmlTextReaderPtr reader) +{ + const xmlChar* val = xmlTextReaderConstValue(reader); + return to_ustring(reinterpret_cast(val)); +} diff --git a/lttoolbox/xml_parse_util.h b/lttoolbox/xml_parse_util.h index eaf34a0..c96af30 100644 --- a/lttoolbox/xml_parse_util.h +++ b/lttoolbox/xml_parse_util.h @@ -33,7 +33,8 @@ public: /* If attrib does not exist (or other error), returns fallback: */ static UString attrib(xmlTextReaderPtr reader, UString const &name, const UString fallback); - static UString toUString(xmlChar const * input); + static UString readName(xmlTextReaderPtr reader); + static UString readValue(xmlTextReaderPtr reader); }; #endif