commit 81de69848b1b7f0278fa72e78c9d0ac9e3239f1f Author: Daniel Swanson Date: Wed Jun 30 08:49:36 2021 -0500 ICU stuff (#115) ICU changes (closes #81) - replace all instances of `std::wstring` with `UString` (= `std::basic_string`) - create `InputFile` wrapper to handle UTF-8 streams with nulls efficiency, readability, and code style changes - eliminate `Ltstr` and `string_to_wostream` - simplify Makefile - make transducer symbols `int32_t` rather than `int` - make common symbols static attributes of `Transducer` - extract some other string constants - prefer `std::vector` to `std::list` - prefer `.clear()` and `.empty()` to `= ""` and `== ""` - prefer range-for loops - remove old lsx code - have `regex_compiler` iterate over the input string rather than modifying it - lift a static computation out of a loop in `Transducer::determinize()` - move constant initializers to class header helper function and dependency changes - move `StringUtils` here from apertium - depend on external utfcpp rather than bundling it - make `XMLParseUtil` functions more specific to their typical usecases - add `xml_walk_util.h` for cleanly iterating over children of `xmlNode*` diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index b38525d..f2716f4 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -10,7 +10,7 @@ jobs: - name: dependencies run: | sudo apt-get -qy update - sudo apt-get -qfy install --no-install-recommends build-essential automake autotools-dev pkg-config libxml2-dev libxml2-utils python3-dev python3-setuptools swig + sudo apt-get -qfy install --no-install-recommends build-essential automake autotools-dev pkg-config libutfcpp-dev libxml2-dev libxml2-utils python3-dev python3-setuptools swig - name: autoreconf run: autoreconf -fvi - name: configure diff --git a/.gitignore b/.gitignore index 6972eaf..7f5b72f 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,7 @@ /lttoolbox/lt-expand /python/Makefile /python/Makefile.in +/python/lttoolbox.i /python/lttoolbox_wrap.cpp /python/lttoolbox.py /python/setup.py @@ -80,3 +81,4 @@ *.egg-info/ *.egg **/.mypy_cache/ +*~ diff --git a/.travis.yml b/.travis.yml index 312faa8..894f00e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,8 +6,14 @@ compiler: - clang - gcc +addons: + homebrew: + packages: + - icu4c + - utf8cpp + before_install: - - if [ $TRAVIS_OS_NAME = linux ]; then sudo apt-get install -y swig; else brew install swig; fi + - if [ $TRAVIS_OS_NAME = linux ]; then sudo apt-get install -y swig libutfcpp-dev; else brew install swig utf8cpp; fi script: - $CXX --version - autoreconf -fvi diff --git a/CMakeLists.txt b/CMakeLists.txt index 09755dd..3f42928 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,7 +102,6 @@ if(WIN32) add_definitions(-D_SECURE_SCL=0 -D_ITERATOR_DEBUG_LEVEL=0 -D_CRT_SECURE_NO_DEPRECATE -DWIN32_LEAN_AND_MEAN -DVC_EXTRALEAN -DNOMINMAX) add_definitions(-DSTDC_HEADERS -DREGEX_MALLOC) include_directories("lttoolbox/win32") - include_directories("utf8") else() add_definitions(-D_POSIX_C_SOURCE=200112 -D_GNU_SOURCE) endif() @@ -110,7 +109,7 @@ endif() # Unlocked I/O functions include(CheckSymbolExists) set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112 -D_GNU_SOURCE) -foreach(func fread_unlocked fwrite_unlocked fgetc_unlocked fputc_unlocked fputs_unlocked fgetwc_unlocked fputwc_unlocked fputws_unlocked) +foreach(func fread_unlocked fwrite_unlocked fgetc_unlocked fputc_unlocked fputs_unlocked) string(TOUPPER ${func} _uc) CHECK_SYMBOL_EXISTS(${func} "stdio.h" HAVE_DECL_${_uc}) if(HAVE_DECL_${_uc}) diff --git a/Makefile.am b/Makefile.am index e07e620..13a7779 100644 --- a/Makefile.am +++ b/Makefile.am @@ -10,7 +10,7 @@ endif pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = lttoolbox.pc -EXTRA_DIST=autogen.sh utf8 tests +EXTRA_DIST=autogen.sh tests # TODO: the below will use python3 if you run it on Arch Linux with no python2 installed test: tests/run_tests.py diff --git a/README b/README index 32fe0dc..54dbd6c 100644 --- a/README +++ b/README @@ -51,6 +51,8 @@ Requirements: * g++ >= 2.95 * GNU make * libxml2 >= 2.6.17 +* ICU +* utfcpp Building & installing: diff --git a/configure.ac b/configure.ac index 8e5dee4..05fce75 100644 --- a/configure.ac +++ b/configure.ac @@ -1,8 +1,8 @@ AC_PREREQ(2.52) m4_define([PKG_VERSION_MAJOR], [3]) -m4_define([PKG_VERSION_MINOR], [5]) -m4_define([PKG_VERSION_PATCH], [3]) +m4_define([PKG_VERSION_MINOR], [6]) +m4_define([PKG_VERSION_PATCH], [0]) AC_INIT([lttoolbox], [PKG_VERSION_MAJOR.PKG_VERSION_MINOR.PKG_VERSION_PATCH], [apertium-stuff@lists.sourceforge.net], [lttoolbox], [https://wiki.apertium.org/wiki/Lttoolbox]) @@ -38,29 +38,8 @@ AC_ARG_ENABLE(profile, [CXXFLAGS="-pg -g -Wall"; CFLAGS="-pg -g -Wall"; LDFLAGS="-pg"]) -PKG_CHECK_MODULES(LTTOOLBOX, [libxml-2.0 >= 2.6.17]) - -# Check for wide strings -AC_DEFUN([AC_CXX_WSTRING],[ - AC_CACHE_CHECK(whether the compiler supports wide strings, - ac_cv_cxx_wstring, - [AC_LANG_SAVE - AC_LANG_CPLUSPLUS - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]],[[ -std::wstring test = L"test"; - ]])], - [ac_cv_cxx_wstring=yes], [ac_cv_cxx_wstring=no]) - AC_LANG_RESTORE - ]) -]) - -AC_CXX_WSTRING - -if test "$ac_cv_cxx_wstring" = no -then - AC_MSG_ERROR([Missing wide string support]) -fi - +PKG_CHECK_MODULES(LIBXML, [libxml-2.0 >= 2.6.17]) +PKG_CHECK_MODULES(ICU, [icu-i18n, icu-io, icu-uc]) # Checks for libraries. AC_CHECK_LIB(xml2, xmlReaderForFile) @@ -68,6 +47,7 @@ AC_CHECK_LIB(xml2, xmlReaderForFile) # Checks for header files. AC_HEADER_STDC AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h]) +AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) # Checks for typedefs, structures, and compiler characteristics. AC_HEADER_STDBOOL @@ -78,7 +58,7 @@ AC_TYPE_SIZE_T AC_FUNC_ERROR_AT_LINE AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, \ -fputc_unlocked, fputs_unlocked, fgetwc_unlocked, fputwc_unlocked, fputws_unlocked, ungetwc_unlocked]) +fputc_unlocked, fputs_unlocked]) AC_CHECK_FUNCS([setlocale strdup getopt_long]) diff --git a/lttoolbox/CMakeLists.txt b/lttoolbox/CMakeLists.txt index 8a8aa0e..8b25032 100644 --- a/lttoolbox/CMakeLists.txt +++ b/lttoolbox/CMakeLists.txt @@ -57,7 +57,6 @@ if(WIN32) win32/regex.c win32/regex.h win32/unistd.h - ${CMAKE_SOURCE_DIR}/utf8/utf8_fwrap.h ${LIBLTTOOLBOX_SOURCES} ) if(NOT VCPKG_TOOLCHAIN) diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index fb44eeb..2fd56b0 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,15 +1,15 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ - deserialiser.h entry_token.h expander.h fst_processor.h lt_locale.h \ - ltstr.h match_exe.h match_node.h match_state.h my_stdio.h node.h \ - pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h \ - transducer.h trans_exe.h xml_parse_util.h exception.h tmx_compiler.h \ - string_to_wostream.h + deserialiser.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ + match_exe.h match_node.h match_state.h my_stdio.h node.h \ + pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \ + transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ + ustring.h cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ - expander.cc fst_processor.cc lt_locale.cc match_exe.cc \ + expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc node.cc pattern_list.cc \ - regexp_compiler.cc sorted_vector.cc state.cc transducer.cc \ - trans_exe.cc xml_parse_util.cc tmx_compiler.cc + regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \ + trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) library_include_HEADERS = $(h_sources) @@ -27,33 +27,16 @@ lttoolboxlib = $(prefix)/lib lttoolbox_DATA = dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd -lt_print_SOURCES = lt_print.cc -lt_print_LDADD = liblttoolbox$(VERSION_MAJOR).la -lt_print_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) +LDADD = liblttoolbox$(VERSION_MAJOR).la +AM_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LIBXML_LIBS) $(ICU_LIBS) +lt_print_SOURCES = lt_print.cc lt_trim_SOURCES = lt_trim.cc -lt_trim_LDADD = liblttoolbox$(VERSION_MAJOR).la -lt_trim_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) - lt_comp_SOURCES = lt_comp.cc -lt_comp_LDADD = liblttoolbox$(VERSION_MAJOR).la -lt_comp_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) - lt_proc_SOURCES = lt_proc.cc -lt_proc_LDADD = liblttoolbox$(VERSION_MAJOR).la -lt_proc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) - lt_expand_SOURCES = lt_expand.cc -lt_expand_LDADD = liblttoolbox$(VERSION_MAJOR).la -lt_expand_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) - lt_tmxcomp_SOURCES = lt_tmxcomp.cc -lt_tmxcomp_LDADD = liblttoolbox$(VERSION_MAJOR).la -lt_tmxcomp_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) - lt_tmxproc_SOURCES = lt_tmxproc.cc -lt_tmxproc_LDADD = liblttoolbox$(VERSION_MAJOR).la -lt_tmxproc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) #lt-validate-dictionary: Makefile.am validate-header.sh # @echo "Creating lt-validate-dictionary script" @@ -67,10 +50,7 @@ lt_tmxproc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 -INCLUDES = -I$(top_srcdir) $(LTTOOLBOX_CFLAGS) -if WINDOWS - INCLUDES += -I$(top_srcdir)/utf8 -endif +INCLUDES = -I$(top_srcdir) $(LIBXML_CFLAGS) $(ICU_CFLAGS) CLEANFILES = *~ EXTRA_DIST = dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd $(man_MANS) diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc index 6a47095..a313814 100644 --- a/lttoolbox/alphabet.cc +++ b/lttoolbox/alphabet.cc @@ -23,19 +23,16 @@ #include #include #include -#include -#include -#if defined(_WIN32) && !defined(_MSC_VER) -#include -#endif +#include using namespace std; +using namespace icu; Alphabet::Alphabet() { - spair[pair(0,0)] = 0; - spairinv.push_back(pair(0,0)); + spair[pair(0,0)] = 0; + spairinv.push_back(pair(0,0)); } Alphabet::~Alphabet() @@ -74,23 +71,23 @@ Alphabet::copy(Alphabet const &a) } void -Alphabet::includeSymbol(wstring const &s) +Alphabet::includeSymbol(UString const &s) { if(slexic.find(s) == slexic.end()) { - int slexic_size = slexic.size(); + int32_t slexic_size = slexic.size(); slexic[s] = -(slexic_size+1); slexicinv.push_back(s); } } -int -Alphabet::operator()(int const c1, int const c2) +int32_t +Alphabet::operator()(int32_t const c1, int32_t const c2) { auto tmp = make_pair(c1, c2); if(spair.find(tmp) == spair.end()) { - int spair_size = spair.size(); + int32_t spair_size = spair.size(); spair[tmp] = spair_size; spairinv.push_back(tmp); } @@ -98,14 +95,14 @@ Alphabet::operator()(int const c1, int const c2) return spair[tmp]; } -int -Alphabet::operator()(wstring const &s) +int32_t +Alphabet::operator()(UString const &s) { return slexic[s]; } -int -Alphabet::operator()(wstring const &s) const +int32_t +Alphabet::operator()(UString const &s) const { auto it = slexic.find(s); if (it == slexic.end()) { @@ -115,12 +112,12 @@ Alphabet::operator()(wstring const &s) const } bool -Alphabet::isSymbolDefined(wstring const &s) +Alphabet::isSymbolDefined(UString const &s) { return slexic.find(s) != slexic.end(); } -int +int32_t Alphabet::size() const { return slexic.size(); @@ -131,16 +128,16 @@ Alphabet::write(FILE *output) { // First, we write the taglist Compression::multibyte_write(slexicinv.size(), output); // taglist size - for(unsigned int i = 0, limit = slexicinv.size(); i < limit; i++) + for(size_t i = 0, limit = slexicinv.size(); i < limit; i++) { - Compression::wstring_write(slexicinv[i].substr(1, slexicinv[i].size()-2), output); + Compression::string_write(slexicinv[i].substr(1, slexicinv[i].size()-2), output); } // Then we write the list of pairs // All numbers are biased + slexicinv.size() to be positive or zero - unsigned int bias = slexicinv.size(); + size_t bias = slexicinv.size(); Compression::multibyte_write(spairinv.size(), output); - for(unsigned int i = 0, limit = spairinv.size(); i != limit; i++) + for(size_t i = 0, limit = spairinv.size(); i != limit; i++) { Compression::multibyte_write(spairinv[i].first + bias, output); Compression::multibyte_write(spairinv[i].second + bias, output); @@ -155,26 +152,28 @@ Alphabet::read(FILE *input) a_new.spair.clear(); // Reading of taglist - int tam = Compression::multibyte_read(input); - map tmp; + int32_t tam = Compression::multibyte_read(input); + map tmp; while(tam > 0) { tam--; - wstring mytag = L"<" + Compression::wstring_read(input) + L">"; + UString mytag = "<"_u; + mytag += Compression::string_read(input); + mytag += ">"_u; a_new.slexicinv.push_back(mytag); a_new.slexic[mytag]= -a_new.slexicinv.size(); // ToDo: This does not turn the result negative due to unsigned semantics } // Reading of pairlist - unsigned int bias = a_new.slexicinv.size(); + size_t bias = a_new.slexicinv.size(); tam = Compression::multibyte_read(input); while(tam > 0) { tam--; - int first = Compression::multibyte_read(input); - int second = Compression::multibyte_read(input); - pair tmp(first - bias, second - bias); - int spair_size = a_new.spair.size(); + int32_t first = Compression::multibyte_read(input); + int32_t second = Compression::multibyte_read(input); + pair tmp(first - bias, second - bias); + int32_t spair_size = a_new.spair.size(); a_new.spair[tmp] = spair_size; a_new.spairinv.push_back(tmp); } @@ -185,8 +184,8 @@ Alphabet::read(FILE *input) void Alphabet::serialise(std::ostream &serialised) const { - Serialiser >::serialise(slexicinv, serialised); - Serialiser > >::serialise(spairinv, serialised); + Serialiser >::serialise(slexicinv, serialised); + Serialiser > >::serialise(spairinv, serialised); } void @@ -196,31 +195,32 @@ Alphabet::deserialise(std::istream &serialised) slexic.clear(); spairinv.clear(); spair.clear(); - slexicinv = Deserialiser >::deserialise(serialised); + slexicinv = Deserialiser >::deserialise(serialised); for (size_t i = 0; i < slexicinv.size(); i++) { slexic[slexicinv[i]] = -i - 1; // ToDo: This does not turn the result negative due to unsigned semantics } - spairinv = Deserialiser > >::deserialise(serialised); + spairinv = Deserialiser > >::deserialise(serialised); for (size_t i = 0; i < slexicinv.size(); i++) { spair[spairinv[i]] = i; } } void -Alphabet::writeSymbol(int const symbol, FILE *output) const +Alphabet::writeSymbol(int32_t const symbol, UFILE *output) const { if(symbol < 0) { - fputws_unlocked(slexicinv[-symbol-1].c_str(), output); + // write() has a name conflict + u_fprintf(output, "%S", slexicinv[-symbol-1].c_str()); } else { - fputwc_unlocked(static_cast(symbol), output); + u_fputc(static_cast(symbol), output); } } void -Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const +Alphabet::getSymbol(UString &result, int32_t const symbol, bool uppercase) const { if(symbol == 0) { @@ -231,7 +231,7 @@ Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const { if(symbol >= 0) { - result += static_cast(symbol); + result += static_cast(symbol); } else { @@ -240,7 +240,7 @@ Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const } else if(symbol >= 0) { - result += static_cast(towupper(static_cast(symbol))); + result += u_toupper(static_cast(symbol)); } else { @@ -249,20 +249,20 @@ Alphabet::getSymbol(wstring &result, int const symbol, bool uppercase) const } bool -Alphabet::isTag(int const symbol) const +Alphabet::isTag(int32_t const symbol) const { return symbol < 0; } -pair const & -Alphabet::decode(int const code) const +pair const & +Alphabet::decode(int32_t const code) const { return spairinv[code]; } -set -Alphabet::symbolsWhereLeftIs(wchar_t l) const { - set eps; +set +Alphabet::symbolsWhereLeftIs(UChar32 l) const { + set eps; for(const auto& sp: spair) { // [(l, r) : tag] if(sp.first.first == l) { eps.insert(sp.second); @@ -271,17 +271,17 @@ Alphabet::symbolsWhereLeftIs(wchar_t l) const { return eps; } -void Alphabet::setSymbol(int symbol, wstring newSymbolString) { +void Alphabet::setSymbol(int32_t symbol, UString newSymbolString) { //Should be a special character! if (symbol < 0) slexicinv[-symbol-1] = newSymbolString; } void -Alphabet::createLoopbackSymbols(set &symbols, Alphabet &basis, Side s, bool nonTagsToo) +Alphabet::createLoopbackSymbols(set &symbols, Alphabet &basis, Side s, bool nonTagsToo) { - // Non-tag letters get the same int in spairinv across alphabets, + // Non-tag letters get the same int32_t in spairinv across alphabets, // but tags may differ, so do those separately afterwards. - set tags; + set tags; for(auto& it : basis.spairinv) { if(s == left) { diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index 3218334..8c6dec2 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -22,10 +22,11 @@ #include #include #include - -#include +#include +#include "ustring.h" using namespace std; +using namespace icu; /** * Alphabet class. @@ -38,27 +39,27 @@ private: * Symbol-identifier relationship. Only contains . * @see slexicinv */ - map slexic; + map slexic; /** * Identifier-symbol relationship. Only contains . * @see slexic */ - vector slexicinv; + vector slexicinv; /** * Map from symbol-pairs to symbols; tags get negative numbers, - * other characters are wchar_t's casted to ints. + * other characters are UChar32's casted to ints. * @see spairinv */ - map, int> spair; + map, int32_t> spair; /** * All symbol-pairs (both and letters). * @see spair */ - vector > spairinv; + vector > spairinv; void copy(Alphabet const &a); @@ -89,7 +90,7 @@ public: /** * Include a symbol into the alphabet. */ - void includeSymbol(wstring const &s); + void includeSymbol(UString const &s); /** * Get an unique code for every symbol pair. This flavour is for @@ -98,8 +99,8 @@ public: * @param c2 right symbol. * @return code for (c1, c2). */ - int operator()(int const c1, int const c2); - int operator()(wstring const &s) const; + int32_t operator()(int32_t const c1, int32_t const c2); + int32_t operator()(UString const &s) const; /** * Gets the individual symbol identifier. Assumes it already exists! @@ -107,20 +108,20 @@ public: * @param s symbol to be identified. * @return symbol identifier. */ - int operator()(wstring const &s); + int32_t operator()(UString const &s); /** * Check wether the symbol is defined in the alphabet. * @param s symbol * @return true if defined */ - bool isSymbolDefined(wstring const &s); + bool isSymbolDefined(UString const &s); /** * Returns the size of the alphabet (number of symbols). * @return number of symbols. */ - int size() const; + int32_t size() const; /** * Write method. @@ -142,7 +143,7 @@ public: * @param symbol symbol code. * @param output output stream. */ - void writeSymbol(int const symbol, FILE *output) const; + void writeSymbol(int32_t const symbol, UFILE *output) const; /** * Concat a symbol in the string that is passed by reference. @@ -150,7 +151,7 @@ public: * @param symbol code of the symbol * @param uppercase true if we want an uppercase symbol */ - void getSymbol(wstring &result, int const symbol, + void getSymbol(UString &result, int32_t const symbol, bool uppercase = false) const; /** @@ -158,27 +159,27 @@ public: * @param symbol the code of the symbol * @return true if the symbol is a tag */ - bool isTag(int const symbol) const; + bool isTag(int32_t const symbol) const; /** * Sets an already existing symbol to represent a new value. * @param symbol the code of the symbol to set * @param newSymbolString the new string for this symbol */ - void setSymbol(int symbol, wstring newSymbolString); + void setSymbol(int32_t symbol, UString newSymbolString); /** * Note: both the symbol int and int-pair are specific to this alphabet instance. - * @see operator() to go from general wstrings to alphabet-specific ints. + * @see operator() to go from general strings to alphabet-specific ints. * @param code a symbol * @return the pair which code represents in this alphabet */ - pair const & decode(int const code) const; + pair const & decode(int32_t const code) const; /** * Get all symbols where the left-hand side of the symbol-pair is l. */ - set symbolsWhereLeftIs(wchar_t l) const; + set symbolsWhereLeftIs(UChar32 l) const; enum Side { @@ -195,7 +196,7 @@ public: * @param s whether to loopback on the left or right side of the symbol-pair * @param nonTagsToo by default only tags are included, but if this is true we include all symbols */ - void createLoopbackSymbols(set &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = false); + void createLoopbackSymbols(set &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = false); }; #endif diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index a511f5a..eaa0dd8 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -19,21 +19,24 @@ #include #include #include -#include +#include #include #include +#include +#include +#include +#include using namespace std; +using namespace icu; AttCompiler::AttCompiler() : starting_state(0), default_weight(0.0000) -{ -} +{} AttCompiler::~AttCompiler() -{ -} +{} void AttCompiler::clear() @@ -52,21 +55,24 @@ AttCompiler::clear() * for conversion? */ void -AttCompiler::convert_hfst(wstring& symbol) +AttCompiler::convert_hfst(UString& symbol) { - if (symbol == L"@0@" || symbol == L"ε") - { - symbol = L""; - } - else if (symbol == L"@_SPACE_@") - { - symbol = L" "; + if (symbol == Transducer::HFST_EPSILON_SYMBOL_SHORT || + symbol == Transducer::HFST_EPSILON_SYMBOL_LONG || + symbol == Transducer::LTTB_EPSILON_SYMBOL) { + symbol.clear(); + } else if (symbol == Transducer::HFST_SPACE_SYMBOL) { + symbol = " "_u; + } else if (symbol == Transducer::HFST_TAB_SYMBOL) { + symbol = "\t"_u; } } bool -AttCompiler::is_word_punct(wchar_t symbol) +AttCompiler::is_word_punct(UChar32 symbol) { + // this version isn't quite right, but something like it should be possible + //return u_charType(symbol) & (U_NON_SPACING_MARK | U_ENCLOSING_MARK | U_COMBINING_SPACING_MARK); // https://en.wikipedia.org/wiki/Combining_character#Unicode_ranges if((symbol >= 0x0300 && symbol <= 0x036F) // Combining Diacritics || (symbol >= 0x1AB0 && symbol <= 0x1AFF) // ... Extended @@ -90,115 +96,108 @@ AttCompiler::is_word_punct(wchar_t symbol) * only) character otherwise. */ int -AttCompiler::symbol_code(const wstring& symbol) +AttCompiler::symbol_code(const UString& symbol) { - if (symbol.length() > 1) { + if (u_strHasMoreChar32Than(symbol.c_str(), -1, 1)) { alphabet.includeSymbol(symbol); return alphabet(symbol); - } else if (symbol == L"") { + } else if (symbol.empty()) { return 0; - } else if ((iswpunct(symbol[0]) || iswspace(symbol[0])) && !is_word_punct(symbol[0])) { - return symbol[0]; } else { - letters.insert(symbol[0]); - if(iswlower(symbol[0])) - { - letters.insert(towupper(symbol[0])); - } - else if(iswupper(symbol[0])) - { - letters.insert(towlower(symbol[0])); + UChar32 c; + U16_GET(symbol, 0, 0, symbol.size(), c); + if ((u_ispunct(c) || u_isspace(c)) && !is_word_punct(c)) { + return c; + } else { + letters.insert(c); + if(u_islower(c)) { + letters.insert(u_toupper(c)); + } else if(u_isupper(c)) { + letters.insert(u_tolower(c)); + } + return c; } - return symbol[0]; } } -bool -AttCompiler::has_multiple_fsts(string const &file_name) -{ - wifstream infile(file_name.c_str()); // TODO: error checking - wstring line; - - while(getline(infile, line)){ - if (line.find('-') == 0) - return true; - } - - return false; -} - void -AttCompiler::parse(string const &file_name, wstring const &dir) +AttCompiler::parse(string const &file_name, bool read_rl) { clear(); - wifstream infile(file_name.c_str()); // TODO: error checking - vector tokens; - wstring line; + UFILE* infile = u_fopen(file_name.c_str(), "r", NULL, NULL); + if (infile == NULL) { + cerr << "Error: unable to open '" << file_name << "' for reading." << endl; + } + vector tokens; bool first_line_in_fst = true; // First line -- see below - int state_id_offset = 0; + bool multiple_transducers = false; + int state_id_offset = 1; int largest_seen_state_id = 0; + int line_number = 0; - if (has_multiple_fsts(file_name)){ - wcerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << endl; - - // Set the starting state to 0 (Epsilon transtions will be added later) - starting_state = 0; - state_id_offset = 1; - } - - while (getline(infile, line)) + while (!u_feof(infile)) { + line_number++; tokens.clear(); + tokens.push_back(""_u); + do { + UChar c = u_fgetc(infile); + if (c == '\n') { + break; + } else if (c == '\t') { + tokens.push_back(""_u); + } else { + tokens.back() += c; + } + } while (!u_feof(infile)); + int from, to; - wstring upper, lower; + UString upper, lower; double weight; - if (line.length() == 0 && first_line_in_fst) + if (tokens[0].length() == 0 && first_line_in_fst) { - wcerr << "Error: empty file '" << file_name << "'." << endl; + cerr << "Error: empty file '" << file_name << "'." << endl; exit(EXIT_FAILURE); } - if (first_line_in_fst && line.find(L"\t") == wstring::npos) + if (first_line_in_fst && tokens.size() == 1) { - wcerr << "Error: invalid format '" << file_name << "'." << endl; + cerr << "Error: invalid format in file '" << file_name << "' on line " << line_number << "." << endl; exit(EXIT_FAILURE); } /* Empty line. */ - if (line.length() == 0) + if (tokens.size() == 1 && tokens[0].length() == 0) { continue; } - split(line, L'\t', tokens); if (tokens[0].find('-') == 0) { + if (state_id_offset == 1) { + // this is the first split we've seen + cerr << "Warning: Multiple fsts in '" << file_name << "' will be disjuncted." << endl; + multiple_transducers = true; + } // Update the offset for the new FST state_id_offset = largest_seen_state_id + 1; first_line_in_fst = true; continue; } - from = stoi(tokens[0]) + state_id_offset; + from = StringUtils::stoi(tokens[0]) + state_id_offset; largest_seen_state_id = max(largest_seen_state_id, from); AttNode* source = get_node(from); /* First line: the initial state is of both types. */ if (first_line_in_fst) { - // If the file has a single FST - No need for state id mapping - if (state_id_offset == 0) - starting_state = from; - else{ - AttNode * starting_node = get_node(starting_state); - - // Add an Epsilon transition from the new starting state - starting_node->transductions.push_back( - Transduction(from, L"", L"", - alphabet(symbol_code(L""), symbol_code(L"")), - default_weight)); - } + AttNode * starting_node = get_node(starting_state); + + // Add an Epsilon transition from the new starting state + starting_node->transductions.push_back( + Transduction(from, ""_u, ""_u, 0, default_weight)); first_line_in_fst = false; } @@ -207,7 +206,7 @@ AttCompiler::parse(string const &file_name, wstring const &dir) { if (tokens.size() > 1) { - weight = stod(tokens[1]); + weight = StringUtils::stod(tokens[1]); } else { @@ -217,9 +216,9 @@ AttCompiler::parse(string const &file_name, wstring const &dir) } else { - to = stoi(tokens[1]) + state_id_offset; + to = StringUtils::stoi(tokens[1]) + state_id_offset; largest_seen_state_id = max(largest_seen_state_id, to); - if(dir == L"RL") + if(read_rl) { upper = tokens[3]; lower = tokens[2]; @@ -234,7 +233,7 @@ AttCompiler::parse(string const &file_name, wstring const &dir) int tag = alphabet(symbol_code(upper), symbol_code(lower)); if(tokens.size() > 4) { - weight = stod(tokens[4]); + weight = StringUtils::stod(tokens[4]); } else { @@ -247,12 +246,19 @@ AttCompiler::parse(string const &file_name, wstring const &dir) } } + if (!multiple_transducers) { + starting_state = 1; + // if we aren't disjuncting multiple transducers + // then we have an extra epsilon transduction at the beginning + // so skip it + } + /* Classify the nodes of the graph. */ classify_forwards(); set path; classify_backwards(starting_state, path); - infile.close(); + u_fclose(infile); } /** Extracts the sub-transducer made of states of type @p type. */ @@ -268,27 +274,27 @@ AttCompiler::extract_transducer(TransducerType type) _extract_transducer(type, starting_state, transducer, corr, visited); /* The final states. */ - bool noFinals = true; + //bool noFinals = true; for (auto& f : finals) { if (corr.find(f.first) != corr.end()) { transducer.setFinal(corr[f.first], f.second); - noFinals = false; + //noFinals = false; } } /* if(noFinals) { - wcerr << L"No final states (" << type << ")" << endl; - wcerr << L" were:" << endl; - wcerr << L"\t" ; + cerr << "No final states (" << type << ")" << endl; + cerr << " were:" << endl; + cerr << "\t" ; for (auto& f : finals) { - wcerr << f.first << L" "; + cerr << f.first << " "; } - wcerr << endl; + cerr << endl; } */ return transducer; @@ -353,11 +359,12 @@ AttCompiler::_extract_transducer(TransducerType type, int from, void AttCompiler::classify_single_transition(Transduction& t) { - if (t.upper.length() == 1) { - if (letters.find(t.upper[0]) != letters.end()) { + int32_t sym = alphabet.decode(t.tag).first; + if (sym > 0) { + if (letters.find(sym) != letters.end()) { t.type |= WORD; } - if (iswpunct(t.upper[0])) { + if (u_ispunct(sym)) { t.type |= PUNCT; } } @@ -380,10 +387,10 @@ AttCompiler::classify_forwards() for(auto& t1 : n1->transductions) { AttNode* n2 = get_node(t1.to); for(auto& t2 : n2->transductions) { - t2.type |= t1.type; + t2.type |= t1.type; } if(done.find(t1.to) == done.end()) { - todo.push(t1.to); + todo.push(t1.to); } } done.insert(next); @@ -400,7 +407,7 @@ TransducerType AttCompiler::classify_backwards(int state, set& path) { if(finals.find(state) != finals.end()) { - wcerr << L"ERROR: Transducer contains epsilon transition to a final state. Aborting." << endl; + cerr << "ERROR: Transducer contains epsilon transition to a final state. Aborting." << endl; exit(EXIT_FAILURE); } AttNode* node = get_node(state); @@ -409,7 +416,7 @@ AttCompiler::classify_backwards(int state, set& path) if(t1.type != UNDECIDED) { type |= t1.type; } else if(path.find(t1.to) != path.end()) { - wcerr << L"ERROR: Transducer contains initial epsilon loop. Aborting." << endl; + cerr << "ERROR: Transducer contains initial epsilon loop. Aborting." << endl; exit(EXIT_FAILURE); } else { path.insert(t1.to); @@ -429,14 +436,14 @@ void AttCompiler::write(FILE *output) { // FILE* output = fopen(file_name, "wb"); - fwrite(HEADER_LTTOOLBOX, 1, 4, output); + fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); uint64_t features = 0; write_le(output, features); Transducer punct_fst = extract_transducer(PUNCT); /* Non-multichar symbols. */ - Compression::wstring_write(wstring(letters.begin(), letters.end()), output); + Compression::string_write(UString(letters.begin(), letters.end()), output); /* Multichar symbols. */ alphabet.write(output); /* And now the FST. */ @@ -448,17 +455,17 @@ AttCompiler::write(FILE *output) { Compression::multibyte_write(2, output); } - Compression::wstring_write(L"main@standard", output); + Compression::string_write("main@standard"_u, output); Transducer word_fst = extract_transducer(WORD); word_fst.write(output); - wcout << L"main@standard" << " " << word_fst.size(); - wcout << " " << word_fst.numberOfTransitions() << endl; - Compression::wstring_write(L"final@inconditional", output); + cout << "main@standard" << " " << word_fst.size(); + cout << " " << word_fst.numberOfTransitions() << endl; + Compression::string_write("final@inconditional"_u, output); if(punct_fst.numberOfTransitions() != 0) { punct_fst.write(output); - wcout << L"final@inconditional" << " " << punct_fst.size(); - wcout << " " << punct_fst.numberOfTransitions() << endl; + cout << "final@inconditional" << " " << punct_fst.size(); + cout << " " << punct_fst.numberOfTransitions() << endl; } // fclose(output); } diff --git a/lttoolbox/att_compiler.h b/lttoolbox/att_compiler.h index 126ca56..557eb55 100644 --- a/lttoolbox/att_compiler.h +++ b/lttoolbox/att_compiler.h @@ -19,11 +19,11 @@ #include #include -#include #include #include #include +#include #include #include #include @@ -36,25 +36,11 @@ #define BOTH 3 using namespace std; +using namespace icu; /** Bitmask; 1 = WORD, 2 = PUNCT, 3 = BOTH. */ typedef unsigned int TransducerType; -namespace -{ - /** Splits a string into fields. */ - inline vector& split(const wstring& s, wchar_t delim, vector &out) - { - wistringstream ss(s); - wstring item; - while (getline(ss, item, delim)) - { - out.push_back(item); - } - return out; - } -}; - /** * Converts transducers from AT&T text format to lt binary format. * @@ -90,8 +76,9 @@ public: /** * Reads the AT&T format file @p file_name. The transducer and the alphabet * are both cleared before reading the new file. + * If read_rl = true then the second tape is used as the input */ - void parse(string const &file_name, wstring const &dir); + void parse(string const &file_name, bool read_rl); /** Writes the transducer to @p file_name in lt binary format. */ @@ -113,20 +100,20 @@ private: Alphabet alphabet; /** All non-multicharacter symbols. */ - set letters; + set letters; /** Used in AttNode. */ struct Transduction { int to; - wstring upper; - wstring lower; + UString upper; + UString lower; int tag; double weight; TransducerType type; - Transduction(int to, wstring upper, wstring lower, int tag, double weight, - TransducerType type=UNDECIDED) : + Transduction(int to, UString upper, UString lower, int tag, + double weight, TransducerType type=UNDECIDED) : to(to), upper(upper), lower(lower), tag(tag), weight(weight), type(type) {} }; @@ -170,7 +157,7 @@ private: * Returns true for combining diacritics and modifier letters * */ - bool is_word_punct(wchar_t symbol); + bool is_word_punct(UChar32 symbol); /** * Determines initial type of single transition @@ -186,7 +173,7 @@ private: * @todo Are there other special symbols? If so, add them, and maybe use a map * for conversion? */ - void convert_hfst(wstring& symbol); + void convert_hfst(UString& symbol); /** * Returns the code of the symbol in the alphabet. Run after convert_hfst has @@ -197,12 +184,7 @@ private: * @return the code of the symbol, if @p symbol is multichar; its first (and * only) character otherwise. */ - int symbol_code(const wstring& symbol); - - /** - * Finds whether an at&t file contains multiple FSTs or not - */ - bool has_multiple_fsts(string const &file_name); + int symbol_code(const UString& symbol); }; #endif /* _MYATT_COMPILER_ */ diff --git a/lttoolbox/buffer.h b/lttoolbox/buffer.h index 9a1397f..5d19417 100644 --- a/lttoolbox/buffer.h +++ b/lttoolbox/buffer.h @@ -75,8 +75,8 @@ public: { if(buf_size == 0) { - wcerr << "Error: Cannot create empty buffer." << endl; - exit(EXIT_FAILURE); + cerr << "Error: Cannot create empty buffer." << endl; + exit(EXIT_FAILURE); } buf = new T[buf_size]; size = buf_size; @@ -115,8 +115,8 @@ public: { if(&b != this) { - destroy(); - copy(b); + destroy(); + copy(b); } return *this; } @@ -130,7 +130,7 @@ public: { if(lastpos == size) { - lastpos = 0; + lastpos = 0; } buf[lastpos++] = value; currentpos = lastpos; @@ -147,7 +147,7 @@ public: { if(lastpos == size) { - lastpos = 0; + lastpos = 0; } currentpos = lastpos; return buf[lastpos -1]; @@ -162,15 +162,15 @@ public: { if(currentpos != lastpos) { - if(currentpos == size) - { - currentpos = 0; - } - return buf[currentpos++]; + if(currentpos == size) + { + currentpos = 0; + } + return buf[currentpos++]; } else { - return last(); + return last(); } } @@ -182,11 +182,11 @@ public: { if(lastpos != 0) { - return buf[lastpos-1]; + return buf[lastpos-1]; } else { - return buf[size-1]; + return buf[size-1]; } } @@ -218,11 +218,11 @@ public: { if(prevpos <= currentpos) { - return currentpos - prevpos; + return currentpos - prevpos; } else { - return currentpos + size - prevpos; + return currentpos + size - prevpos; } } @@ -236,11 +236,11 @@ public: { if(postpos >= currentpos) { - return postpos - currentpos; + return postpos - currentpos; } else { - return postpos + size - currentpos; + return postpos + size - currentpos; } } diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 00a6287..d2ab234 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -28,41 +28,47 @@ using namespace std; -wstring const Compiler::COMPILER_DICTIONARY_ELEM = L"dictionary"; -wstring const Compiler::COMPILER_ALPHABET_ELEM = L"alphabet"; -wstring const Compiler::COMPILER_SDEFS_ELEM = L"sdefs"; -wstring const Compiler::COMPILER_SDEF_ELEM = L"sdef"; -wstring const Compiler::COMPILER_N_ATTR = L"n"; -wstring const Compiler::COMPILER_PARDEFS_ELEM = L"pardefs"; -wstring const Compiler::COMPILER_PARDEF_ELEM = L"pardef"; -wstring const Compiler::COMPILER_PAR_ELEM = L"par"; -wstring const Compiler::COMPILER_ENTRY_ELEM = L"e"; -wstring const Compiler::COMPILER_RESTRICTION_ATTR = L"r"; -wstring const Compiler::COMPILER_RESTRICTION_LR_VAL = L"LR"; -wstring const Compiler::COMPILER_RESTRICTION_RL_VAL = L"RL"; -wstring const Compiler::COMPILER_PAIR_ELEM = L"p"; -wstring const Compiler::COMPILER_LEFT_ELEM = L"l"; -wstring const Compiler::COMPILER_RIGHT_ELEM = L"r"; -wstring const Compiler::COMPILER_S_ELEM = L"s"; -wstring const Compiler::COMPILER_M_ELEM = L"m"; -wstring const Compiler::COMPILER_REGEXP_ELEM = L"re"; -wstring const Compiler::COMPILER_SECTION_ELEM = L"section"; -wstring const Compiler::COMPILER_ID_ATTR = L"id"; -wstring const Compiler::COMPILER_TYPE_ATTR = L"type"; -wstring const Compiler::COMPILER_IDENTITY_ELEM = L"i"; -wstring const Compiler::COMPILER_IDENTITYGROUP_ELEM = L"ig"; -wstring const Compiler::COMPILER_JOIN_ELEM = L"j"; -wstring const Compiler::COMPILER_BLANK_ELEM = L"b"; -wstring const Compiler::COMPILER_POSTGENERATOR_ELEM = L"a"; -wstring const Compiler::COMPILER_GROUP_ELEM = L"g"; -wstring const Compiler::COMPILER_LEMMA_ATTR = L"lm"; -wstring const Compiler::COMPILER_IGNORE_ATTR = L"i"; -wstring const Compiler::COMPILER_IGNORE_YES_VAL = L"yes"; -wstring const Compiler::COMPILER_ALT_ATTR = L"alt"; -wstring const Compiler::COMPILER_V_ATTR = L"v"; -wstring const Compiler::COMPILER_VL_ATTR = L"vl"; -wstring const Compiler::COMPILER_VR_ATTR = L"vr"; -wstring const Compiler::COMPILER_WEIGHT_ATTR = L"w"; +UString const Compiler::COMPILER_DICTIONARY_ELEM = "dictionary"_u; +UString const Compiler::COMPILER_ALPHABET_ELEM = "alphabet"_u; +UString const Compiler::COMPILER_SDEFS_ELEM = "sdefs"_u; +UString const Compiler::COMPILER_SDEF_ELEM = "sdef"_u; +UString const Compiler::COMPILER_N_ATTR = "n"_u; +UString const Compiler::COMPILER_PARDEFS_ELEM = "pardefs"_u; +UString const Compiler::COMPILER_PARDEF_ELEM = "pardef"_u; +UString const Compiler::COMPILER_PAR_ELEM = "par"_u; +UString const Compiler::COMPILER_ENTRY_ELEM = "e"_u; +UString const Compiler::COMPILER_RESTRICTION_ATTR = "r"_u; +UString const Compiler::COMPILER_RESTRICTION_LR_VAL = "LR"_u; +UString const Compiler::COMPILER_RESTRICTION_RL_VAL = "RL"_u; +UString const Compiler::COMPILER_PAIR_ELEM = "p"_u; +UString const Compiler::COMPILER_LEFT_ELEM = "l"_u; +UString const Compiler::COMPILER_RIGHT_ELEM = "r"_u; +UString const Compiler::COMPILER_S_ELEM = "s"_u; +UString const Compiler::COMPILER_M_ELEM = "m"_u; +UString const Compiler::COMPILER_REGEXP_ELEM = "re"_u; +UString const Compiler::COMPILER_SECTION_ELEM = "section"_u; +UString const Compiler::COMPILER_ID_ATTR = "id"_u; +UString const Compiler::COMPILER_TYPE_ATTR = "type"_u; +UString const Compiler::COMPILER_IDENTITY_ELEM = "i"_u; +UString const Compiler::COMPILER_IDENTITYGROUP_ELEM = "ig"_u; +UString const Compiler::COMPILER_JOIN_ELEM = "j"_u; +UString const Compiler::COMPILER_BLANK_ELEM = "b"_u; +UString const Compiler::COMPILER_POSTGENERATOR_ELEM = "a"_u; +UString const Compiler::COMPILER_GROUP_ELEM = "g"_u; +UString const Compiler::COMPILER_LEMMA_ATTR = "lm"_u; +UString const Compiler::COMPILER_IGNORE_ATTR = "i"_u; +UString const Compiler::COMPILER_IGNORE_YES_VAL = "yes"_u; +UString const Compiler::COMPILER_ALT_ATTR = "alt"_u; +UString const Compiler::COMPILER_V_ATTR = "v"_u; +UString const Compiler::COMPILER_VL_ATTR = "vl"_u; +UString const Compiler::COMPILER_VR_ATTR = "vr"_u; +UString const Compiler::COMPILER_WEIGHT_ATTR = "w"_u; +UString const Compiler::COMPILER_TEXT_NODE = "#text"_u; +UString const Compiler::COMPILER_COMMENT_NODE = "#comment"_u; +UString const Compiler::COMPILER_ACX_ANALYSIS_ELEM = "analysis-chars"_u; +UString const Compiler::COMPILER_ACX_CHAR_ELEM = "char"_u; +UString const Compiler::COMPILER_ACX_EQUIV_CHAR_ELEM= "equiv-char"_u; +UString const Compiler::COMPILER_ACX_VALUE_ATTR = "value"_u; Compiler::Compiler() : reader(0), @@ -78,14 +84,14 @@ Compiler::~Compiler() } void -Compiler::parseACX(string const &file, wstring const &dir) +Compiler::parseACX(string const &file, UString const &dir) { if(dir == COMPILER_RESTRICTION_LR_VAL) { reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - wcerr << "Error: cannot open '" << file << "'." << endl; + cerr << "Error: cannot open '" << file << "'." << endl; exit(EXIT_FAILURE); } int ret = xmlTextReaderRead(reader); @@ -98,13 +104,13 @@ Compiler::parseACX(string const &file, wstring const &dir) } void -Compiler::parse(string const &file, wstring const &dir) +Compiler::parse(string const &file, UString const &dir) { direction = dir; reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - wcerr << "Error: Cannot open '" << file << "'." << endl; + cerr << "Error: Cannot open '" << file << "'." << endl; exit(EXIT_FAILURE); } @@ -117,7 +123,7 @@ Compiler::parse(string const &file, wstring const &dir) if(ret != 0) { - wcerr << L"Error: Parse error at the end of input." << endl; + cerr << "Error: Parse error at the end of input." << endl; } xmlFreeTextReader(reader); @@ -136,22 +142,22 @@ Compiler::parse(string const &file, wstring const &dir) } bool -Compiler::valid(wstring const& dir) const +Compiler::valid(UString const& dir) const { - const wstring side = dir == COMPILER_RESTRICTION_RL_VAL ? L"right" : L"left"; + const char* side = dir == COMPILER_RESTRICTION_RL_VAL ? "right" : "left"; const set epsilonSymbols = alphabet.symbolsWhereLeftIs(0); - const set spaceSymbols = alphabet.symbolsWhereLeftIs(L' '); + const set spaceSymbols = alphabet.symbolsWhereLeftIs(' '); for (auto §ion : sections) { auto &fst = section.second; auto finals = fst.getFinals(); auto initial = fst.getInitial(); for(const auto i : fst.closure(initial, epsilonSymbols)) { if (finals.count(i)) { - wcerr << L"Error: Invalid dictionary (hint: the " << side << " side of an entry is empty)" << endl; + cerr << "Error: Invalid dictionary (hint: the " << side << " side of an entry is empty)" << endl; return false; } if(fst.closure(i, spaceSymbols).size() > 1) { // >1 since closure always includes self - wcerr << L"Error: Invalid dictionary (hint: entry on the " << side << " beginning with whitespace)" << endl; + cerr << "Error: Invalid dictionary (hint: entry on the " << side << " beginning with whitespace)" << endl; return false; } } @@ -169,12 +175,11 @@ Compiler::procAlphabet() int ret = xmlTextReaderRead(reader); if(ret == 1) { - xmlChar const *value = xmlTextReaderConstValue(reader); - letters = XMLParseUtil::towstring(value); + letters = XMLParseUtil::readValue(reader); bool space = true; for(unsigned int i = 0; i < letters.length(); i++) { - if(!isspace(letters.at(i))) + if(!u_isspace(letters.at(i))) { space = false; break; @@ -182,13 +187,13 @@ Compiler::procAlphabet() } if(space == true) // libxml2 returns '\n' for , should be empty { - letters = L""; + letters.clear(); } } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Missing alphabet symbols." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Missing alphabet symbols." << endl; exit(EXIT_FAILURE); } } @@ -197,7 +202,7 @@ Compiler::procAlphabet() void Compiler::procSDef() { - alphabet.includeSymbol(L"<"+attrib(COMPILER_N_ATTR)+L">"); + alphabet.includeSymbol("<"_u + attrib(COMPILER_N_ATTR) + ">"_u); } void @@ -215,18 +220,18 @@ Compiler::procParDef() { paradigms[current_paradigm].minimize(); paradigms[current_paradigm].joinFinals(); - current_paradigm = L""; + current_paradigm.clear(); } } } int -Compiler::matchTransduction(list const &pi, - list const &pd, +Compiler::matchTransduction(vector const &pi, + vector const &pd, int state, Transducer &t, double const &entry_weight) { - list::const_iterator left, right, limleft, limright; + vector::const_iterator left, right, limleft, limright; if(direction == COMPILER_RESTRICTION_LR_VAL) { @@ -313,12 +318,12 @@ Compiler::matchTransduction(list const &pi, void -Compiler::requireEmptyError(wstring const &name) +Compiler::requireEmptyError(UString const &name) { if(!xmlTextReaderIsEmptyElement(reader)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; exit(EXIT_FAILURE); } } @@ -327,67 +332,63 @@ bool Compiler::allBlanks() { bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString text = XMLParseUtil::readValue(reader); for(auto c : text) { - flag = flag && iswspace(c); + flag = flag && u_isspace(c); } return flag; } void -Compiler::readString(list &result, wstring const &name) +Compiler::readString(vector &result, UString const &name) { - if(name == L"#text") + if(name == COMPILER_TEXT_NODE) { - wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - for(unsigned int i = 0, limit = value.size(); i < limit; i++) - { - result.push_back(static_cast(value[i])); - } + XMLParseUtil::readValueInto32(reader, result); } else if(name == COMPILER_M_ELEM) { requireEmptyError(name); if(keep_boundaries) { - result.push_back(static_cast(L'>')); + result.push_back(static_cast('>')); } } else if(name == COMPILER_BLANK_ELEM) { requireEmptyError(name); - result.push_back(static_cast(L' ')); + result.push_back(static_cast(' ')); } else if(name == COMPILER_JOIN_ELEM) { requireEmptyError(name); - result.push_back(static_cast(L'+')); + result.push_back(static_cast('+')); } else if(name == COMPILER_POSTGENERATOR_ELEM) { requireEmptyError(name); - result.push_back(static_cast(L'~')); + result.push_back(static_cast('~')); } else if(name == COMPILER_GROUP_ELEM) { int type=xmlTextReaderNodeType(reader); if(type != XML_READER_TYPE_END_ELEMENT) { - result.push_back(static_cast(L'#')); + result.push_back(static_cast('#')); } } else if(name == COMPILER_S_ELEM) { requireEmptyError(name); - wstring symbol = L"<" + attrib(COMPILER_N_ATTR) + L">"; + UString symbol = "<"_u + attrib(COMPILER_N_ATTR) + ">"_u; if(!alphabet.isSymbolDefined(symbol)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined symbol '" << symbol << L"'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Undefined symbol '" << symbol << "'." << endl; exit(EXIT_FAILURE); } @@ -395,88 +396,87 @@ Compiler::readString(list &result, wstring const &name) } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid specification of element '<" << name; - wcerr << L">' in this context." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid specification of element '<" << name; + cerr << ">' in this context." << endl; exit(EXIT_FAILURE); } } void -Compiler::skipBlanks(wstring &name) +Compiler::skipBlanks(UString &name) { - while(name == L"#text" || name == L"#comment") + while(name == COMPILER_TEXT_NODE || name == COMPILER_COMMENT_NODE) { - if(name != L"#comment") + if(name != COMPILER_COMMENT_NODE) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } } void -Compiler::skip(wstring &name, wstring const &elem) +Compiler::skip(UString &name, UString const &elem) { skip(name, elem, true); } void -Compiler::skip(wstring &name, wstring const &elem, bool open) +Compiler::skip(UString &name, UString const &elem, bool open) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - wstring slash; + name = XMLParseUtil::readName(reader); + UString slash; if(!open) { - slash = L"/"; + slash = "/"_u; } - while(name == L"#text" || name == L"#comment") + while(name == COMPILER_TEXT_NODE || name == COMPILER_COMMENT_NODE) { - if(name != L"#comment") + if(name != COMPILER_COMMENT_NODE) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } if(name != elem) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Expected '<" << slash << elem << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Expected '<" << slash << elem << ">'." << endl; exit(EXIT_FAILURE); } } EntryToken -Compiler::procIdentity(wstring const &wsweight, bool ig) +Compiler::procIdentity(double const entry_weight, bool ig) { - list both_sides; - double entry_weight = stod(wsweight); + vector both_sides; if(!xmlTextReaderIsEmptyElement(reader)) { - wstring name = L""; + UString name; while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_IDENTITY_ELEM || name == COMPILER_IDENTITYGROUP_ELEM) { break; @@ -485,17 +485,17 @@ Compiler::procIdentity(wstring const &wsweight, bool ig) } } - if(verbose && first_element && (both_sides.front() == (int)L' ')) + if(verbose && first_element && (both_sides.front() == (int)' ')) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Entry begins with space." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Entry begins with space." << endl; } first_element = false; EntryToken e; if(ig) { - list right; - right.push_back(static_cast(L'#')); + vector right; + right.push_back(static_cast('#')); right.insert(right.end(), both_sides.begin(), both_sides.end()); e.setSingleTransduction(both_sides, right, entry_weight); } @@ -507,21 +507,20 @@ Compiler::procIdentity(wstring const &wsweight, bool ig) } EntryToken -Compiler::procTransduction(wstring const &wsweight) +Compiler::procTransduction(double const entry_weight) { - list lhs, rhs; - double entry_weight = stod(wsweight); - wstring name; + vector lhs, rhs; + UString name; skip(name, COMPILER_LEFT_ELEM); if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_LEFT_ELEM) { break; @@ -530,10 +529,10 @@ Compiler::procTransduction(wstring const &wsweight) } } - if(verbose && first_element && (lhs.front() == (int)L' ')) + if(verbose && first_element && (lhs.front() == (int)' ')) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Entry begins with space." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Entry begins with space." << endl; } first_element = false; @@ -541,11 +540,11 @@ Compiler::procTransduction(wstring const &wsweight) if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == COMPILER_RIGHT_ELEM) { break; @@ -561,8 +560,8 @@ Compiler::procTransduction(wstring const &wsweight) return e; } -wstring -Compiler::attrib(wstring const &name) +UString +Compiler::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } @@ -571,20 +570,20 @@ EntryToken Compiler::procPar() { EntryToken e; - wstring paradigm_name = attrib(COMPILER_N_ATTR); + UString paradigm_name = attrib(COMPILER_N_ATTR); first_element = false; - if(current_paradigm != L"" && paradigm_name == current_paradigm) + if(!current_paradigm.empty() && paradigm_name == current_paradigm) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Paradigm refers to itself '" << paradigm_name << L"'." < const &elements) { - if(current_paradigm != L"") + if(!current_paradigm.empty()) { // compilation of paradigms Transducer &t = paradigms[current_paradigm]; @@ -620,8 +619,8 @@ Compiler::insertEntryTokens(vector const &elements) } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid entry token." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid entry token." << endl; exit(EXIT_FAILURE); } } @@ -691,15 +690,15 @@ Compiler::insertEntryTokens(vector const &elements) void -Compiler::requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname) +Compiler::requireAttribute(UString const &value, UString const &attrname, + UString const &elemname) { - if(value == L"") + if(value.empty()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): '<" << elemname; - wcerr << L"' element must specify non-void '"; - wcerr << attrname << L"' attribute." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): '<" << elemname; + cerr << "' element must specify non-void '"; + cerr << attrname << "' attribute." << endl; exit(EXIT_FAILURE); } } @@ -712,55 +711,56 @@ Compiler::procSection() if(type != XML_READER_TYPE_END_ELEMENT) { - wstring const &id = attrib(COMPILER_ID_ATTR); - wstring const &type = attrib(COMPILER_TYPE_ATTR); + UString const &id = attrib(COMPILER_ID_ATTR); + UString const &type = attrib(COMPILER_TYPE_ATTR); requireAttribute(id, COMPILER_ID_ATTR, COMPILER_SECTION_ELEM); requireAttribute(type, COMPILER_TYPE_ATTR, COMPILER_SECTION_ELEM); current_section = id; - current_section += L"@"; + current_section += '@'; current_section.append(type); } else { - current_section = L""; + current_section.clear(); } } void Compiler::procEntry() { - wstring attribute = this->attrib(COMPILER_RESTRICTION_ATTR); - wstring ignore = this->attrib(COMPILER_IGNORE_ATTR); - wstring altval = this->attrib(COMPILER_ALT_ATTR); - wstring varval = this->attrib(COMPILER_V_ATTR); - wstring varl = this->attrib(COMPILER_VL_ATTR); - wstring varr = this->attrib(COMPILER_VR_ATTR); - wstring wsweight = this->attrib(COMPILER_WEIGHT_ATTR); + UString attribute = this->attrib(COMPILER_RESTRICTION_ATTR); + UString ignore = this->attrib(COMPILER_IGNORE_ATTR); + UString altval = this->attrib(COMPILER_ALT_ATTR); + UString varval = this->attrib(COMPILER_V_ATTR); + UString varl = this->attrib(COMPILER_VL_ATTR); + UString varr = this->attrib(COMPILER_VR_ATTR); + UString wsweight = this->attrib(COMPILER_WEIGHT_ATTR); // if entry is masked by a restriction of direction or an ignore mark - if((attribute != L"" && attribute != direction) + if((!attribute.empty() && attribute != direction) || ignore == COMPILER_IGNORE_YES_VAL - || (altval != L"" && altval != alt) - || (direction == COMPILER_RESTRICTION_RL_VAL && varval != L"" && varval != variant) - || (direction == COMPILER_RESTRICTION_RL_VAL && varl != L"" && varl != variant_left) - || (direction == COMPILER_RESTRICTION_LR_VAL && varr != L"" && varr != variant_right)) + || (!altval.empty() && altval != alt) + || (direction == COMPILER_RESTRICTION_RL_VAL && !varval.empty() && varval != variant) + || (direction == COMPILER_RESTRICTION_RL_VAL && !varl.empty() && varl != variant_left) + || (direction == COMPILER_RESTRICTION_LR_VAL && !varr.empty() && varr != variant_right)) { // parse to the end of the entry - wstring name = L""; + UString name; while(name != COMPILER_ENTRY_ELEM) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } return; } - if(wsweight == L"") + double weight = 0.0; + if(!wsweight.empty()) { - wsweight = L"0.0000"; + weight = StringUtils::stod(wsweight); } vector elements; @@ -770,14 +770,14 @@ Compiler::procEntry() int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); - if(current_paradigm == L"" && verbose) + if(current_paradigm.empty() && verbose) { first_element = true; } @@ -785,15 +785,15 @@ Compiler::procEntry() int type = xmlTextReaderNodeType(reader); if(name == COMPILER_PAIR_ELEM) { - elements.push_back(procTransduction(wsweight)); + elements.push_back(procTransduction(weight)); } else if(name == COMPILER_IDENTITY_ELEM) { - elements.push_back(procIdentity(wsweight, false)); + elements.push_back(procIdentity(weight, false)); } else if(name == COMPILER_IDENTITYGROUP_ELEM) { - elements.push_back(procIdentity(wsweight, true)); + elements.push_back(procIdentity(weight, true)); } else if(name == COMPILER_REGEXP_ELEM) { @@ -805,12 +805,12 @@ Compiler::procEntry() // detection of the use of undefined paradigms - wstring const &p = elements.rbegin()->paradigmName(); + UString const &p = elements.rbegin()->paradigmName(); if(paradigms.find(p) == paradigms.end()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined paradigm '" << p << L"'." <' into '<" << COMPILER_ENTRY_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << COMPILER_ENTRY_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } } @@ -847,32 +847,31 @@ Compiler::procEntry() void Compiler::procNodeACX() { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); - if(name == L"#text") + UString name = XMLParseUtil::readName(reader); + if(name == COMPILER_TEXT_NODE) { /* ignore */ } - else if(name == L"analysis-chars") + else if(name == COMPILER_ACX_ANALYSIS_ELEM) { /* ignore */ } - else if(name == L"char") + else if(name == COMPILER_ACX_CHAR_ELEM) { - acx_current_char = static_cast(attrib(L"value")[0]); + acx_current_char = static_cast(attrib(COMPILER_ACX_VALUE_ATTR)[0]); } - else if(name == L"equiv-char") + else if(name == COMPILER_ACX_EQUIV_CHAR_ELEM) { - acx_map[acx_current_char].insert(static_cast(attrib(L"value")[0])); + acx_map[acx_current_char].insert(static_cast(attrib(COMPILER_ACX_VALUE_ATTR)[0])); } - else if(name == L"#comment") + else if(name == COMPILER_COMMENT_NODE) { /* ignore */ } else { - wcerr << L"Error in ACX file (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error in ACX file (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } @@ -880,12 +879,11 @@ Compiler::procNodeACX() void Compiler::procNode() { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); + UString name = XMLParseUtil::readName(reader); // TODO: optimize the execution order of the string "ifs" - if(name == L"#text") + if(name == COMPILER_TEXT_NODE) { /* ignore */ } @@ -921,14 +919,14 @@ Compiler::procNode() { procSection(); } - else if(name== L"#comment") + else if(name== COMPILER_COMMENT_NODE) { /* ignore */ } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } @@ -938,8 +936,7 @@ Compiler::procRegexp() { EntryToken et; xmlTextReaderRead(reader); - wstring re = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - et.setRegexp(re); + et.readRegexp(reader); xmlTextReaderRead(reader); return et; } @@ -947,12 +944,12 @@ Compiler::procRegexp() void Compiler::write(FILE *output) { - fwrite(HEADER_LTTOOLBOX, 1, 4, output); + fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); uint64_t features = 0; write_le(output, features); // letters - Compression::wstring_write(letters, output); + Compression::string_write(letters, output); // symbols alphabet.write(output); @@ -964,35 +961,35 @@ Compiler::write(FILE *output) for(auto& it : sections) { count++; - wcout << it.first << " " << it.second.size(); - wcout << " " << it.second.numberOfTransitions() << endl; - Compression::wstring_write(it.first, output); + cout << it.first << " " << it.second.size(); + cout << " " << it.second.numberOfTransitions() << endl; + Compression::string_write(it.first, output); it.second.write(output); } } void -Compiler::setAltValue(string const &a) +Compiler::setAltValue(UString const &a) { - alt = XMLParseUtil::stows(a); + alt = a; } void -Compiler::setVariantValue(string const &v) +Compiler::setVariantValue(UString const &v) { - variant = XMLParseUtil::stows(v); + variant = v; } void -Compiler::setVariantLeftValue(string const &v) +Compiler::setVariantLeftValue(UString const &v) { - variant_left = XMLParseUtil::stows(v); + variant_left = v; } void -Compiler::setVariantRightValue(string const &v) +Compiler::setVariantRightValue(UString const &v) { - variant_right = XMLParseUtil::stows(v); + variant_right = v; } void diff --git a/lttoolbox/compiler.h b/lttoolbox/compiler.h index acd8b7a..ad18f69 100644 --- a/lttoolbox/compiler.h +++ b/lttoolbox/compiler.h @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include #include @@ -54,43 +54,43 @@ private: /** * The alt value */ - wstring alt; + UString alt; /** * The variant value (monodix) */ - wstring variant; + UString variant; /** * The variant value (left side of bidix) */ - wstring variant_left; + UString variant_left; /** * The variant value (right side of bidix) */ - wstring variant_right; + UString variant_right; /** * The paradigm being compiled */ - wstring current_paradigm; + UString current_paradigm; /** * The dictionary section being compiled */ - wstring current_section; + UString current_section; /** * The direction of the compilation, 'lr' (left-to-right) or 'rl' * (right-to-left) */ - wstring direction; + UString direction; /** * List of characters to be considered alphabetic */ - wstring letters; + UString letters; /** * Set verbose mode: warnings which may or may not be correct @@ -121,27 +121,27 @@ private: /** * List of named transducers-paradigms */ - map paradigms; + map paradigms; /** * List of named dictionary sections */ - map sections; + map sections; /** * List of named prefix copy of a paradigm */ - map, Ltstr> prefix_paradigms; + map > prefix_paradigms; /** * List of named suffix copy of a paradigm */ - map, Ltstr> suffix_paradigms; + map > suffix_paradigms; /** * List of named endings of a suffix copy of a paradgim */ - map, Ltstr> postsuffix_paradigms; + map > postsuffix_paradigms; /** * Mapping of aliases of characters specified in ACX files @@ -205,7 +205,7 @@ private: * @param name the name of the attribute * @return the value of the attribute */ - wstring attrib(wstring const &name); + UString attrib(UString const &name); /** * Construct symbol pairs by align left side of both parts and insert @@ -216,19 +216,19 @@ private: * @param t the transducer * @return the last state of the inserted transduction */ - int matchTransduction(list const &lp, list const &rp, + int matchTransduction(vector const &lp, vector const &rp, int state, Transducer &t, double const &entry_weight); /** * Parse the <p> element * @return a list of tokens from the dictionary's entry */ - EntryToken procTransduction(wstring const &wsweight); + EntryToken procTransduction(double const entry_weight); /** * Parse the <i> element * @return a list of tokens from the dictionary's entry */ - EntryToken procIdentity(wstring const &wsweight, bool ig = false); + EntryToken procIdentity(double const entry_weight, bool ig = false); /** * Parse the <par> element @@ -247,7 +247,7 @@ private: * @param name the name of the node * @param elem the name of the expected node */ - void skip(wstring &name, wstring const &elem); + void skip(UString &name, UString const &elem); /** * Skip all document #text nodes before "elem" @@ -255,22 +255,22 @@ private: * @param elem the name of the expected node * @param open true for open element, false for closed */ - void skip(wstring &name, wstring const &elem, bool open); + void skip(UString &name, UString const &elem, bool open); /** * Skip all blank #text nodes before "name" * @param name the name of the node */ - void skipBlanks(wstring &name); + void skipBlanks(UString &name); - void readString(list &result, wstring const &name); + void readString(vector &result, UString const &name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(wstring const &name); + void requireEmptyError(UString const &name); /** * Force an attribute to be specified, amd check for it @@ -278,8 +278,8 @@ private: * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname); + void requireAttribute(UString const &value, UString const &attrname, + UString const &elemname); /** * True if all the elements in the current node are blanks @@ -287,7 +287,7 @@ private: */ bool allBlanks(); - bool valid(wstring const& dir) const; + bool valid(UString const& dir) const; public: @@ -295,41 +295,47 @@ public: * Constants to represent the element and the attributes of * dictionaries */ - LTTOOLBOX_IMPORTS static wstring const COMPILER_DICTIONARY_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_ALPHABET_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_SDEFS_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_SDEF_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_N_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_PARDEFS_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_PARDEF_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_PAR_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_ENTRY_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_RESTRICTION_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_RESTRICTION_LR_VAL; - LTTOOLBOX_IMPORTS static wstring const COMPILER_RESTRICTION_RL_VAL; - LTTOOLBOX_IMPORTS static wstring const COMPILER_PAIR_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_LEFT_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_RIGHT_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_S_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_M_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_REGEXP_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_SECTION_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_ID_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_TYPE_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_IDENTITY_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_IDENTITYGROUP_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_JOIN_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_BLANK_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_POSTGENERATOR_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_GROUP_ELEM; - LTTOOLBOX_IMPORTS static wstring const COMPILER_LEMMA_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_IGNORE_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_IGNORE_YES_VAL; - LTTOOLBOX_IMPORTS static wstring const COMPILER_ALT_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_V_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_VL_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_VR_ATTR; - LTTOOLBOX_IMPORTS static wstring const COMPILER_WEIGHT_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_DICTIONARY_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ALPHABET_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_SDEFS_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_SDEF_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_N_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_PARDEFS_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_PARDEF_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_PAR_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ENTRY_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_LR_VAL; + LTTOOLBOX_IMPORTS static UString const COMPILER_RESTRICTION_RL_VAL; + LTTOOLBOX_IMPORTS static UString const COMPILER_PAIR_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_LEFT_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_RIGHT_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_S_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_M_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_REGEXP_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_SECTION_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ID_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_TYPE_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_IDENTITY_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_IDENTITYGROUP_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_JOIN_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_BLANK_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_POSTGENERATOR_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_GROUP_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_LEMMA_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_IGNORE_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_IGNORE_YES_VAL; + LTTOOLBOX_IMPORTS static UString const COMPILER_ALT_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_V_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_VL_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_VR_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_WEIGHT_ATTR; + LTTOOLBOX_IMPORTS static UString const COMPILER_TEXT_NODE; + LTTOOLBOX_IMPORTS static UString const COMPILER_COMMENT_NODE; + LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_ANALYSIS_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_CHAR_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_EQUIV_CHAR_ELEM; + LTTOOLBOX_IMPORTS static UString const COMPILER_ACX_VALUE_ATTR; /** * Constructor @@ -344,12 +350,12 @@ public: /** * Compile dictionary to letter transducers */ - void parse(string const &file, wstring const &dir); + void parse(string const &file, UString const &dir); /** * Read ACX file */ - void parseACX(string const &file, wstring const &dir); + void parseACX(string const &file, UString const &dir); /** @@ -372,25 +378,25 @@ public: * Set the alt value to use in compilation * @param a the value */ - void setAltValue(string const &a); + void setAltValue(UString const &a); /** * Set the variant value to use in compilation * @param v the value */ - void setVariantValue(string const &v); + void setVariantValue(UString const &v); /** * Set the variant_left value to use in compilation * @param v the value */ - void setVariantLeftValue(string const &v); + void setVariantLeftValue(UString const &v); /** * Set the variant_right value to use in compilation * @param v the value */ - void setVariantRightValue(string const &v); + void setVariantRightValue(UString const &v); }; diff --git a/lttoolbox/compression.cc b/lttoolbox/compression.cc index 0ba78b5..42c4d2b 100644 --- a/lttoolbox/compression.cc +++ b/lttoolbox/compression.cc @@ -21,13 +21,15 @@ #include #include #include +#include +#include void Compression::writeByte(unsigned char byte, FILE *output) { if(fwrite_unlocked(&byte, 1, 1, output) != 1) { - wcerr << L"I/O Error writing" << endl; + cerr << "I/O Error writing" << endl; exit(EXIT_FAILURE); } } @@ -39,7 +41,7 @@ Compression::readByte(FILE *input) if(fread_unlocked(&value, 1, 1, input) != 1) { // Not uncomment this code since -// wcerr << L"I/O Error reading" << endl; +// cerr << "I/O Error reading" << endl; // exit(EXIT_FAILURE); } @@ -86,7 +88,7 @@ Compression::multibyte_write(unsigned int value, FILE *output) } else { - wcerr << L"Out of range: " << value << endl; + cerr << "Out of range: " << value << endl; exit(EXIT_FAILURE); } } @@ -133,7 +135,7 @@ Compression::multibyte_write(unsigned int value, ostream &output) } else { - wcerr << "Out of range: " << value << endl; + cerr << "Out of range: " << value << endl; exit(EXIT_FAILURE); } } @@ -254,48 +256,26 @@ Compression::multibyte_read(istream &input) void -Compression::wstring_write(wstring const &str, FILE *output) +Compression::string_write(UString const &str, FILE *output) { - Compression::multibyte_write(str.size(), output); - for(auto c : str) + vector vec; + ustring_to_vec32(str, vec); + Compression::multibyte_write(vec.size(), output); + for(auto c : vec) { - Compression::multibyte_write(static_cast(c), output); + Compression::multibyte_write(c, output); } } -wstring -Compression::wstring_read(FILE *input) -{ - wstring retval = L""; - - for(unsigned int i = 0, limit = Compression::multibyte_read(input); - i != limit; i++) - { - retval += static_cast(Compression::multibyte_read(input)); - } - - return retval; -} - -void -Compression::string_write(string const &str, FILE *output) -{ - Compression::multibyte_write(str.size(), output); - for(auto c : str) - { - Compression::multibyte_write(static_cast(c), output); - } -} - -string +UString Compression::string_read(FILE *input) { - string retval = ""; + UString retval; + unsigned int limit = Compression::multibyte_read(input); + retval.reserve(limit); - for(unsigned int i = 0, limit = Compression::multibyte_read(input); - i != limit; i++) - { - retval += static_cast(Compression::multibyte_read(input)); + for(unsigned int i = 0; i != limit; i++) { + retval += static_cast(Compression::multibyte_read(input)); } return retval; diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h index 8af6cf9..8b5a2b1 100644 --- a/lttoolbox/compression.h +++ b/lttoolbox/compression.h @@ -19,9 +19,9 @@ #include #include -#include #include #include +#include using namespace std; @@ -42,7 +42,7 @@ enum TD_FEATURES : uint64_t { inline auto write_u64(FILE *out, uint64_t value) { - auto rv = fwrite(reinterpret_cast(&value), 1, sizeof(value), out); + auto rv = fwrite_unlocked(reinterpret_cast(&value), 1, sizeof(value), out); if (rv != sizeof(value)) { throw std::runtime_error("Failed to write uint64_t"); } @@ -77,7 +77,7 @@ inline auto write_le(Stream& out, uint64_t value) { inline auto read_u64(FILE *in) { uint64_t value = 0; - if (fread(reinterpret_cast(&value), 1, sizeof(value), in) != sizeof(value)) { + if (fread_unlocked(reinterpret_cast(&value), 1, sizeof(value), in) != sizeof(value)) { throw std::runtime_error("Failed to read uint64_t"); } return value; @@ -174,23 +174,6 @@ public: */ static unsigned int multibyte_read(istream &is); - /** - * This method allows to write a wide string to an output stream - * using its UCSencoding as integer. - * @see wstring_read() - * @param str the string to write. - * @param output the output stream. - */ - static void wstring_write(wstring const &str, FILE *output); - - /** - * This method reads a wide string from the input stream. - * @see wstring_write() - * @param input the input stream. - * @return the wide string read. - */ - static wstring wstring_read(FILE *input); - /** * This method allows to write a plain string to an output stream * using its UCSencoding as integer. @@ -198,7 +181,7 @@ public: * @param str the string to write. * @param output the output stream. */ - static void string_write(string const &str, FILE *output); + static void string_write(UString const &str, FILE *output); /** * This method reads a plain string from the input stream. @@ -206,7 +189,7 @@ public: * @param input the input stream. * @return the string read. */ - static string string_read(FILE *input); + static UString string_read(FILE *input); /** * Encodes a double value and writes it into the output stream diff --git a/lttoolbox/deserialiser.h b/lttoolbox/deserialiser.h index 4697640..b5ae2f2 100644 --- a/lttoolbox/deserialiser.h +++ b/lttoolbox/deserialiser.h @@ -33,6 +33,8 @@ #include #include +#include + template class Deserialiser; template @@ -76,12 +78,12 @@ public: inline static uint32_t deserialise(std::istream &Stream_); }; -template <> class Deserialiser { +template <> class Deserialiser { public: - inline static wchar_t deserialise(std::istream &Stream_); + inline static char deserialise(std::istream &Stream_); }; -template <> class Deserialiser { +template <> class Deserialiser { public: inline static char deserialise(std::istream &Stream_); }; @@ -168,14 +170,14 @@ uint32_t Deserialiser::deserialise(std::istream &Stream_) { return int_deserialise(Stream_); } -wchar_t Deserialiser::deserialise(std::istream &Stream_) { - return int_deserialise(Stream_); -} - char Deserialiser::deserialise(std::istream &Stream_) { return int_deserialise(Stream_); } +char Deserialiser::deserialise(std::istream &Stream_) { + return int_deserialise(Stream_); +} + double Deserialiser::deserialise(std::istream &Stream_) { union { uint64_t i; diff --git a/lttoolbox/entry_token.cc b/lttoolbox/entry_token.cc index f03bca5..f9401ab 100644 --- a/lttoolbox/entry_token.cc +++ b/lttoolbox/entry_token.cc @@ -61,14 +61,14 @@ EntryToken::destroy() } void -EntryToken::setParadigm(wstring const &np) +EntryToken::setParadigm(UString const &np) { parName = np; type = paradigm; } void -EntryToken::setSingleTransduction(list const &pi, list const &pd, double const ew) +EntryToken::setSingleTransduction(vector const &pi, vector const &pd, double const ew) { weight = ew; leftSide = pi; @@ -77,9 +77,17 @@ EntryToken::setSingleTransduction(list const &pi, list const &pd, doub } void -EntryToken::setRegexp(wstring const &r) +EntryToken::setRegexp(UString const &r) { - myregexp = r; + myregexp.clear(); + ustring_to_vec32(r, myregexp); + type = regexp; +} + +void +EntryToken::readRegexp(xmlTextReaderPtr reader) +{ + XMLParseUtil::readValueInto32(reader, myregexp); type = regexp; } @@ -101,25 +109,25 @@ EntryToken::isRegexp() const return type == regexp; } -wstring const & +UString const & EntryToken::paradigmName() const { return parName; } -list const & +vector const & EntryToken::left() const { return leftSide; } -list const & +vector const & EntryToken::right() const { return rightSide; } -wstring const & +vector const & EntryToken::regExp() const { return myregexp; diff --git a/lttoolbox/entry_token.h b/lttoolbox/entry_token.h index 6b2886c..0b4b43a 100644 --- a/lttoolbox/entry_token.h +++ b/lttoolbox/entry_token.h @@ -18,8 +18,10 @@ #define _ENTRYTOKEN_ -#include -#include +#include +#include +#include +#include using namespace std; @@ -42,7 +44,7 @@ private: /** * Name of the paradigm (if it is of 'paradigm' 'type') */ - wstring parName; + UString parName; /** * Weight value for the entry (default_weight if unspecified) @@ -52,17 +54,17 @@ private: /** * Left side of transduction (if 'single_transduction') */ - list leftSide; + vector leftSide; /** * Right side of transduction (if 'single_transduction') */ - list rightSide; + vector rightSide; /** * Regular expression (if 'regexp') */ - wstring myregexp; + vector myregexp; /** * copy method @@ -99,7 +101,7 @@ public: * Sets the name of the paradigm. * @param np the paradigm name */ - void setParadigm(wstring const &np); + void setParadigm(UString const &np); /** * Set both parts of a single transduction. @@ -107,13 +109,19 @@ public: * @param pd right part * @param ew entry weight */ - void setSingleTransduction(list const &pi, list const &pd, double const ew = 0); + void setSingleTransduction(vector const &pi, vector const &pd, double const ew = 0); /** * Set regular expression. * @param r the regular expression specification. */ - void setRegexp(wstring const &r); + void setRegexp(UString const &r); + + /** + * More efficient version of setRegexp() + * @param reader the current xml parser state + */ + void readRegexp(xmlTextReaderPtr reader); /** * eTest EntryToken to detect if is a paradigm. @@ -137,25 +145,25 @@ public: * Retrieve the name of the paradigm. * @return the name of the paradigm. */ - wstring const & paradigmName() const; + UString const & paradigmName() const; /** * Retrieve the left part of the paradigm. * @return the left part of the paradigm. */ - list const & left() const; + vector const & left() const; /** * Retrieve the right part of the paradigm. * @return the right part of the paradigm. */ - list const & right() const; + vector const & right() const; /** * Retrieve the regular expression specification. * @return the regular expression specification. */ - wstring const & regExp() const; + vector const & regExp() const; /** * Retrieve the weight value of the entry. diff --git a/lttoolbox/expander.cc b/lttoolbox/expander.cc index 8592331..1baf12a 100644 --- a/lttoolbox/expander.cc +++ b/lttoolbox/expander.cc @@ -25,9 +25,6 @@ #include #include -#if defined(_WIN32) && !defined(_MSC_VER) -#include -#endif using namespace std; @@ -42,12 +39,12 @@ Expander::~Expander() } void -Expander::expand(string const &file, FILE *output) +Expander::expand(string const &file, UFILE* output) { reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - wcerr << "Error: Cannot open '" << file << "'." << endl; + cerr << "Error: Cannot open '" << file << "'." << endl; exit(EXIT_FAILURE); } @@ -60,7 +57,7 @@ Expander::expand(string const &file, FILE *output) if(ret != 0) { - wcerr << L"Error: Parse error at the end of input." << endl; + cerr << "Error: Parse error at the end of input." << endl; } xmlFreeTextReader(reader); @@ -78,17 +75,17 @@ Expander::procParDef() } else { - current_paradigm = L""; + current_paradigm.clear(); } } void -Expander::requireEmptyError(wstring const &name) +Expander::requireEmptyError(UString const &name) { if(!xmlTextReaderIsEmptyElement(reader)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; exit(EXIT_FAILURE); } } @@ -97,7 +94,7 @@ bool Expander::allBlanks() { bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString text = XMLParseUtil::readValue(reader); for(auto c : text) { @@ -108,16 +105,16 @@ Expander::allBlanks() } void -Expander::readString(wstring &result, wstring const &name) +Expander::readString(UString &result, UString const &name) { - if(name == L"#text") + if(name == Compiler::COMPILER_TEXT_NODE) { - wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - wstring escaped = L"^$/<>{}\\*@#+~:"; + UString value = XMLParseUtil::readValue(reader); + UString escaped = "^$/<>{}\\*@#+~:"_u; for(size_t i = value.size()-1; i > 0; i--) { - if(escaped.find(value[i]) != wstring::npos) { - value.insert(value.begin()+i, L'\\'); + if(escaped.find(value[i]) != UString::npos) { + value.insert(value.begin()+i, '\\'); } } result.append(value); @@ -125,105 +122,105 @@ Expander::readString(wstring &result, wstring const &name) else if(name == Compiler::COMPILER_BLANK_ELEM) { requireEmptyError(name); - result += L' '; + result += ' '; } else if(name == Compiler::COMPILER_M_ELEM) { requireEmptyError(name); if(keep_boundaries) { - result += L'>'; + result += '>'; } } else if(name == Compiler::COMPILER_JOIN_ELEM) { requireEmptyError(name); - result += L'+'; + result += '+'; } else if(name == Compiler::COMPILER_POSTGENERATOR_ELEM) { requireEmptyError(name); - result += L'~'; + result += '~'; } else if(name == Compiler::COMPILER_GROUP_ELEM) { int type=xmlTextReaderNodeType(reader); if(type != XML_READER_TYPE_END_ELEMENT) { - result += L'#'; + result += '#'; } } else if(name == Compiler::COMPILER_S_ELEM) { requireEmptyError(name); - result += L'<'; + result += '<'; result.append(attrib(Compiler::COMPILER_N_ATTR)); - result += L'>'; + result += '>'; } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid specification of element '<" << name; - wcerr << L">' in this context." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid specification of element '<" << name; + cerr << ">' in this context." << endl; exit(EXIT_FAILURE); } } void -Expander::skipBlanks(wstring &name) +Expander::skipBlanks(UString &name) { - if(name == L"#text") + if(name == Compiler::COMPILER_TEXT_NODE) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } } void -Expander::skip(wstring &name, wstring const &elem) +Expander::skip(UString &name, UString const &elem) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); - if(name == L"#text") + if(name == Compiler::COMPILER_TEXT_NODE) { if(!allBlanks()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } if(name != elem) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Expected '<" << elem << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Expected '<" << elem << ">'." << endl; exit(EXIT_FAILURE); } } -wstring +UString Expander::procIdentity() { - wstring both_sides = L""; + UString both_sides; if(!xmlTextReaderIsEmptyElement(reader)) { - wstring name = L""; + UString name; while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == Compiler::COMPILER_IDENTITY_ELEM) { break; @@ -234,21 +231,21 @@ Expander::procIdentity() return both_sides; } -pair +pair Expander::procIdentityGroup() { - wstring lhs = L""; - wstring rhs = L"#"; - wstring both_sides = L""; + UString lhs; + UString rhs = "#"_u; + UString both_sides; if(!xmlTextReaderIsEmptyElement(reader)) { - wstring name = L""; + UString name; while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == Compiler::COMPILER_IDENTITYGROUP_ELEM) { break; @@ -259,25 +256,25 @@ Expander::procIdentityGroup() lhs += both_sides; rhs += both_sides; - pair e(lhs, rhs); + pair e(lhs, rhs); return e; } -pair +pair Expander::procTransduction() { - wstring lhs = L"", rhs = L""; - wstring name = L""; + UString lhs, rhs; + UString name; skip(name, Compiler::COMPILER_LEFT_ELEM); if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == Compiler::COMPILER_LEFT_ELEM) { break; @@ -290,11 +287,11 @@ Expander::procTransduction() if(!xmlTextReaderIsEmptyElement(reader)) { - name = L""; + name.clear(); while(true) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); if(name == Compiler::COMPILER_RIGHT_ELEM) { break; @@ -305,67 +302,67 @@ Expander::procTransduction() skip(name, Compiler::COMPILER_PAIR_ELEM); - pair e(lhs, rhs); + pair e(lhs, rhs); return e; } -wstring -Expander::attrib(wstring const &name) +UString +Expander::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } -wstring +UString Expander::procPar() { EntryToken e; - wstring paradigm_name = attrib(Compiler::COMPILER_N_ATTR); + UString paradigm_name = attrib(Compiler::COMPILER_N_ATTR); return paradigm_name; } void -Expander::requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname) +Expander::requireAttribute(UString const &value, UString const &attrname, + UString const &elemname) { - if(value == L"") + if(value.empty()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): '<" << elemname; - wcerr << L"' element must specify non-void '"; - wcerr<< attrname << L"' attribute." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): '<" << elemname; + cerr << "' element must specify non-void '"; + cerr<< attrname << "' attribute." << endl; exit(EXIT_FAILURE); } } void -Expander::procEntry(FILE *output) +Expander::procEntry(UFILE* output) { - wstring attribute = this->attrib(Compiler::COMPILER_RESTRICTION_ATTR); - wstring entrname = this->attrib(Compiler::COMPILER_LEMMA_ATTR); - wstring altval = this->attrib(Compiler::COMPILER_ALT_ATTR); - wstring varval = this->attrib(Compiler::COMPILER_V_ATTR); - wstring varl = this->attrib(Compiler::COMPILER_VL_ATTR); - wstring varr = this->attrib(Compiler::COMPILER_VR_ATTR); - wstring wsweight = this->attrib(Compiler::COMPILER_WEIGHT_ATTR); - - wstring myname = L""; - if(this->attrib(Compiler::COMPILER_IGNORE_ATTR) == L"yes" - || (altval != L"" && altval != alt) - || (varval != L"" && varval != variant && attribute == Compiler::COMPILER_RESTRICTION_RL_VAL) - || ((varl != L"" && varl != variant_left) && (varr != L"" && varr != variant_right)) - || (varl != L"" && varl != variant_left && attribute == Compiler::COMPILER_RESTRICTION_RL_VAL) - || (varr != L"" && varr != variant_right && attribute == Compiler::COMPILER_RESTRICTION_LR_VAL)) + UString attribute = this->attrib(Compiler::COMPILER_RESTRICTION_ATTR); + UString entrname = this->attrib(Compiler::COMPILER_LEMMA_ATTR); + UString altval = this->attrib(Compiler::COMPILER_ALT_ATTR); + UString varval = this->attrib(Compiler::COMPILER_V_ATTR); + UString varl = this->attrib(Compiler::COMPILER_VL_ATTR); + UString varr = this->attrib(Compiler::COMPILER_VR_ATTR); + UString wsweight = this->attrib(Compiler::COMPILER_WEIGHT_ATTR); + + UString myname; + if(this->attrib(Compiler::COMPILER_IGNORE_ATTR) == Compiler::COMPILER_IGNORE_YES_VAL + || (!altval.empty() && altval != alt) + || (!varval.empty() && varval != variant && attribute == Compiler::COMPILER_RESTRICTION_RL_VAL) + || ((!varl.empty() && varl != variant_left) && (!varr.empty() && varr != variant_right)) + || (!varl.empty() && varl != variant_left && attribute == Compiler::COMPILER_RESTRICTION_RL_VAL) + || (!varr.empty() && varr != variant_right && attribute == Compiler::COMPILER_RESTRICTION_LR_VAL)) { do { int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - myname = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + myname = XMLParseUtil::readName(reader); } while(myname != Compiler::COMPILER_ENTRY_ELEM); return; @@ -373,19 +370,19 @@ Expander::procEntry(FILE *output) EntList items, items_lr, items_rl; if(attribute == Compiler::COMPILER_RESTRICTION_LR_VAL - || (varval != L"" && varval != variant && attribute != Compiler::COMPILER_RESTRICTION_RL_VAL) - || (varl != L"" && varl != variant_left)) + || (!varval.empty() && varval != variant && attribute != Compiler::COMPILER_RESTRICTION_RL_VAL) + || (!varl.empty() && varl != variant_left)) { - items_lr.push_back(make_pair(L"", L"")); + items_lr.push_back(make_pair(""_u, ""_u)); } else if(attribute == Compiler::COMPILER_RESTRICTION_RL_VAL - || (varr != L"" && varr != variant_right)) + || (!varr.empty() && varr != variant_right)) { - items_rl.push_back(make_pair(L"", L"")); + items_rl.push_back(make_pair(""_u, ""_u)); } else { - items.push_back(make_pair(L"", L"")); + items.push_back(make_pair(""_u, ""_u)); } while(true) @@ -393,53 +390,53 @@ Expander::procEntry(FILE *output) int ret = xmlTextReaderRead(reader); if(ret != 1) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Parse error." << endl; exit(EXIT_FAILURE); } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); skipBlanks(name); int type = xmlTextReaderNodeType(reader); if(name == Compiler::COMPILER_PAIR_ELEM) { - pair p = procTransduction(); + pair p = procTransduction(); append(items, p); append(items_lr, p); append(items_rl, p); } else if(name == Compiler::COMPILER_IDENTITY_ELEM) { - wstring val = procIdentity(); + UString val = procIdentity(); append(items, val); append(items_lr, val); append(items_rl, val); } else if(name == Compiler::COMPILER_IDENTITYGROUP_ELEM) { - pair p = procIdentityGroup(); + pair p = procIdentityGroup(); append(items, p); append(items_lr, p); append(items_rl, p); } else if(name == Compiler::COMPILER_REGEXP_ELEM) { - wstring val = L"__REGEXP__" + procRegexp(); + UString val = "__REGEXP__"_u + procRegexp(); append(items, val); append(items_lr, val); append(items_rl, val); } else if(name == Compiler::COMPILER_PAR_ELEM) { - wstring p = procPar(); + UString p = procPar(); // detection of the use of undefined paradigms if(paradigm.find(p) == paradigm.end() && paradigm_lr.find(p) == paradigm_lr.end() && paradigm_rl.find(p) == paradigm_rl.end()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined paradigm '" << p << L"'." <', output); - fputwc_unlocked(L':', output); - fputws_unlocked(it.second.c_str(), output); - fputwc_unlocked(L'\n', output); + u_fprintf(output, "%S:>:%S\n", it.first.c_str(), it.second.c_str()); } for(auto& it : items_rl) { - fputws_unlocked(it.first.c_str(), output); - fputwc_unlocked(L':', output); - fputwc_unlocked(L'<', output); - fputwc_unlocked(L':', output); - fputws_unlocked(it.second.c_str(), output); - fputwc_unlocked(L'\n', output); + u_fprintf(output, "%S:<:%S\n", it.first.c_str(), it.second.c_str()); } } else @@ -531,31 +515,30 @@ Expander::procEntry(FILE *output) return; } - else if(name == L"#text" && allBlanks()) + else if(name == Compiler::COMPILER_TEXT_NODE && allBlanks()) { } - else if(name == L"#comment") + else if(name == Compiler::COMPILER_COMMENT_NODE) { } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid inclusion of '<" << name << L">' into '<" << Compiler::COMPILER_ENTRY_ELEM; - wcerr << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid inclusion of '<" << name << ">' into '<" << Compiler::COMPILER_ENTRY_ELEM; + cerr << ">'." << endl; exit(EXIT_FAILURE); } } } void -Expander::procNode(FILE *output) +Expander::procNode(UFILE *output) { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); + UString name = XMLParseUtil::readName(reader); // DO: optimize the execution order of this string "ifs" - if(name == L"#text") + if(name == Compiler::COMPILER_TEXT_NODE) { /* ignorar */ } @@ -591,23 +574,23 @@ Expander::procNode(FILE *output) { /* ignorar */ } - else if(name == L"#comment") + else if(name == Compiler::COMPILER_COMMENT_NODE) { /* ignorar */ } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } -wstring +UString Expander::procRegexp() { xmlTextReaderRead(reader); - wstring re = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString re = XMLParseUtil::readValue(reader); xmlTextReaderRead(reader); return re; } @@ -622,7 +605,7 @@ Expander::append(EntList &result, { for(auto& it2 : endings) { - temp.push_back(pair(it.first + it2.first, + temp.push_back(pair(it.first + it2.first, it.second + it2.second)); } } @@ -631,7 +614,7 @@ Expander::append(EntList &result, } void -Expander::append(EntList &result, wstring const &endings) +Expander::append(EntList &result, UString const &endings) { for(auto& it : result) { @@ -642,7 +625,7 @@ Expander::append(EntList &result, wstring const &endings) void Expander::append(EntList &result, - pair const &endings) + pair const &endings) { for(auto& it : result) { @@ -652,27 +635,27 @@ Expander::append(EntList &result, } void -Expander::setAltValue(string const &a) +Expander::setAltValue(UString const &a) { - alt = XMLParseUtil::stows(a); + alt = a; } void -Expander::setVariantValue(string const &v) +Expander::setVariantValue(UString const &v) { - variant = XMLParseUtil::stows(v); + variant = v; } void -Expander::setVariantLeftValue(string const &v) +Expander::setVariantLeftValue(UString const &v) { - variant_left = XMLParseUtil::stows(v); + variant_left = v; } void -Expander::setVariantRightValue(string const &v) +Expander::setVariantRightValue(UString const &v) { - variant_right = XMLParseUtil::stows(v); + variant_right = v; } void @@ -680,4 +663,3 @@ Expander::setKeepBoundaries(bool keep) { keep_boundaries = keep; } - diff --git a/lttoolbox/expander.h b/lttoolbox/expander.h index 74da7e2..3d2c6df 100644 --- a/lttoolbox/expander.h +++ b/lttoolbox/expander.h @@ -17,8 +17,7 @@ #ifndef _EXPANDER_ #define _EXPANDER_ -#include -#include +#include #include #include @@ -27,7 +26,7 @@ using namespace std; -typedef list > EntList; +typedef list > EntList; /** * An expander of dictionaries @@ -43,33 +42,33 @@ private: /** * The alt value */ - wstring alt; + UString alt; /** * The variant value (monodix) */ - wstring variant; + UString variant; /** * The variant value (left side of bidix) */ - wstring variant_left; + UString variant_left; /** * The variant value (right side of bidix) */ - wstring variant_right; + UString variant_right; /** * The paradigm being compiled */ - wstring current_paradigm; + UString current_paradigm; /** * The direction of the compilation, 'lr' (left-to-right) or 'rl' * (right-to-left) */ - wstring direction; + UString direction; /** * Do we print boundaries or not? @@ -79,16 +78,16 @@ private: /** * Paradigms */ - map paradigm; + map paradigm; - map paradigm_lr; + map paradigm_lr; - map paradigm_rl; + map paradigm_rl; /** * Method to parse an XML Node */ - void procNode(FILE *output); + void procNode(UFILE* output); /** * Parse the <pardef> element @@ -98,67 +97,67 @@ private: /** * Parse the <e> element */ - void procEntry(FILE *output); + void procEntry(UFILE* output); /** * Parse the <re> element * @return the string representing the regular expression */ - wstring procRegexp(); + UString procRegexp(); /** * Gets an attribute value with their name and the current context * @param name the name of the attribute * @return the value of the attribute */ - wstring attrib(wstring const &name); + UString attrib(UString const &name); /** * Parse the <p> element * @return a pair of strings, left part and right part of a transduction */ - pair procTransduction(); + pair procTransduction(); /** * Parse the <i> element * @return a string from the dictionary's entry */ - wstring procIdentity(); + UString procIdentity(); /** * Parse the <ig> element * @return a pair of strings, whose right part begins with '#' * but are otherwise identical */ - pair procIdentityGroup(); + pair procIdentityGroup(); /** * Parse the <par> element * @return the name of the paradigm */ - wstring procPar(); + UString procPar(); /** * Skip all document #text nodes before "elem" * @param name the name of the node * @param elem the name of the expected node */ - void skip(wstring &name, wstring const &elem); + void skip(UString &name, UString const &elem); /** * Skip all blank #text nodes before "name" * @param name the name of the node */ - void skipBlanks(wstring &name); + void skipBlanks(UString &name); - void readString(wstring &result, wstring const &name); + void readString(UString &result, UString const &name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(wstring const &name); + void requireEmptyError(UString const &name); /** * Force an attribute to be specified, amd check for it @@ -166,8 +165,8 @@ private: * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname); + void requireAttribute(UString const &value, UString const &attrname, + UString const &elemname); /** * True if all the elements in the current node are blanks @@ -181,8 +180,8 @@ private: * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(list > &result, - list > const &endings); + static void append(list > &result, + list > const &endings); /** * Append a list of endings to a list of current transductions. @@ -190,8 +189,8 @@ private: * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(list > &result, - wstring const &endings); + static void append(list > &result, + UString const &endings); /** * Append a list of endings to a list of current transductions. @@ -199,8 +198,8 @@ private: * this method, the result of concatenations. * @param endings the endings to be appended. */ - static void append(list > &result, - pair const &endings); + static void append(list > &result, + pair const &endings); public: /** @@ -216,31 +215,31 @@ public: /** * Compile dictionary to letter transducers */ - void expand(string const &file, FILE *output); + void expand(string const &file, UFILE* output); /** * Set the alt value to use in compilation * @param a the value */ - void setAltValue(string const &a); + void setAltValue(UString const &a); /** * Set the variant value to use in expansion * @param v the value */ - void setVariantValue(string const &v); + void setVariantValue(UString const &v); /** * Set the variant_left value to use in expansion * @param v the value */ - void setVariantLeftValue(string const &v); + void setVariantLeftValue(UString const &v); /** * Set the variant_right value to use in expansion * @param v the value */ - void setVariantRightValue(string const &v); + void setVariantRightValue(UString const &v); /** * Set if we are going to keep morpheme boundaries diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index f4ec2a1..732acc8 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -22,49 +22,37 @@ #include #include #include -#include -#if defined(_WIN32) && !defined(_MSC_VER) -#include -#endif using namespace std; -FSTProcessor::FSTProcessor() : -default_weight(0.0000), -outOfWord(false), -isLastBlankTM(false) +UString const FSTProcessor::XML_TEXT_NODE = "#text"_u; +UString const FSTProcessor::XML_COMMENT_NODE = "#comment"_u; +UString const FSTProcessor::XML_IGNORED_CHARS_ELEM = "ignored-chars"_u; +UString const FSTProcessor::XML_RESTORE_CHAR_ELEM = "restore-char"_u; +UString const FSTProcessor::XML_RESTORE_CHARS_ELEM = "restore-chars"_u; +UString const FSTProcessor::XML_VALUE_ATTR = "value"_u; +UString const FSTProcessor::XML_CHAR_ELEM = "char"_u; +UString const FSTProcessor::WBLANK_START = "[["_u; +UString const FSTProcessor::WBLANK_END = "]]"_u; +UString const FSTProcessor::WBLANK_FINAL = "[[/]]"_u; + + +FSTProcessor::FSTProcessor() { // escaped_chars chars - escaped_chars.insert(L'['); - escaped_chars.insert(L']'); - escaped_chars.insert(L'{'); - escaped_chars.insert(L'}'); - escaped_chars.insert(L'^'); - escaped_chars.insert(L'$'); - escaped_chars.insert(L'/'); - escaped_chars.insert(L'\\'); - escaped_chars.insert(L'@'); - escaped_chars.insert(L'<'); - escaped_chars.insert(L'>'); - - caseSensitive = false; - dictionaryCase = false; - do_decomposition = false; - nullFlush = false; - nullFlushGeneration = false; - useIgnoredChars = false; - useDefaultIgnoredChars = true; - useRestoreChars = false; - displayWeightsMode = false; - showControlSymbols = false; - biltransSurfaceForms = false; - maxAnalyses = INT_MAX; - maxWeightClasses = INT_MAX; - compoundOnlyLSymbol = 0; - compoundRSymbol = 0; - compound_max_elements = 4; + escaped_chars.insert('['); + escaped_chars.insert(']'); + escaped_chars.insert('{'); + escaped_chars.insert('}'); + escaped_chars.insert('^'); + escaped_chars.insert('$'); + escaped_chars.insert('/'); + escaped_chars.insert('\\'); + escaped_chars.insert('@'); + escaped_chars.insert('<'); + escaped_chars.insert('>'); if(useDefaultIgnoredChars) { @@ -126,28 +114,27 @@ FSTProcessor::parseRCX(string const &file) void FSTProcessor::procNodeICX() { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); - if(name == L"#text") + UString name = XMLParseUtil::readName(reader); + if(name == XML_TEXT_NODE) { /* ignore */ } - else if(name == L"ignored-chars") + else if(name == XML_IGNORED_CHARS_ELEM) { /* ignore */ } - else if(name == L"char") + else if(name == XML_CHAR_ELEM) { - ignored_chars.insert(static_cast(XMLParseUtil::attrib(reader, L"value")[0])); + ignored_chars.insert(static_cast(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0])); } - else if(name == L"#comment") + else if(name == XML_COMMENT_NODE) { /* ignore */ } else { - wcerr << L"Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } @@ -161,140 +148,54 @@ FSTProcessor::initDefaultIgnoredCharacters() void FSTProcessor::procNodeRCX() { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); - if(name == L"#text") + UString name = XMLParseUtil::readName(reader); + if(name == XML_TEXT_NODE) { /* ignore */ } - else if(name == L"restore-chars") + else if(name == XML_RESTORE_CHARS_ELEM) { /* ignore */ } - else if(name == L"char") + else if(name == XML_CHAR_ELEM) { - rcx_current_char = static_cast(XMLParseUtil::attrib(reader, L"value")[0]); + rcx_current_char = static_cast(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0]); } - else if(name == L"restore-char") + else if(name == XML_RESTORE_CHAR_ELEM) { - rcx_map[rcx_current_char].insert(static_cast(XMLParseUtil::attrib(reader, L"value")[0])); + rcx_map[rcx_current_char].insert(static_cast(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0])); } - else if(name == L"#comment") + else if(name == XML_COMMENT_NODE) { /* ignore */ } else { - wcerr << L"Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } -wchar_t -FSTProcessor::readEscaped(FILE *input) -{ - if(feof(input)) - { - streamError(); - } - - wchar_t val = static_cast(fgetwc_unlocked(input)); - - if(feof(input)) - { - streamError(); - } - - return val; -} - -wstring -FSTProcessor::readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) -{ - wstring result = L""; - result += delim1; - wchar_t c = delim1; - - while(!feof(input) && c != delim2) - { - c = static_cast(fgetwc_unlocked(input)); - result += c; - if(c != L'\\') - { - continue; - } - else - { - result += static_cast(readEscaped(input)); - } - } - - if(c != delim2) - { - streamError(); - } - - return result; -} - -wstring -FSTProcessor::readWblank(FILE *input) -{ - wstring result = L""; - result += L"[["; - wchar_t c = 0; - - while(!feof(input)) - { - c = static_cast(fgetwc_unlocked(input)); - result += c; - - if(c == L'\\') - { - result += static_cast(readEscaped(input)); - } - else if(c == L']') - { - c = static_cast(fgetwc_unlocked(input)); - result += c; - - if(c == L']') - { - break; - } - } - } - - if(c != L']') - { - streamError(); - } - - return result; -} - bool -FSTProcessor::wblankPostGen(FILE *input, FILE *output) +FSTProcessor::wblankPostGen(InputFile& input, UFILE *output) { - wstring result = L""; - result += L"[["; - wchar_t c = 0; + UString result = WBLANK_START; + UChar32 c = 0; bool in_content = false; - while(!feof(input)) + while(!input.eof()) { - c = static_cast(fgetwc_unlocked(input)); - - if(in_content && c == L'~') + c = input.get(); + if(in_content && c == '~') { - if(result[result.size()-1] == L']') { + if(result[result.size()-1] == ']') { // We just saw the end of a wblank, may want to merge wblankqueue.push(result); } else { // wake-up-mark happened some characters into the wblanked word - fputws(result.c_str(), output); + write(result, output); } return true; } @@ -303,21 +204,22 @@ FSTProcessor::wblankPostGen(FILE *input, FILE *output) result += c; } - if(c == L'\\') + if(c == '\\') { - result += static_cast(readEscaped(input)); + if (input.eof()) streamError(); + result += input.get(); } - else if(c == L']') + else if(c == ']') { - c = static_cast(fgetwc_unlocked(input)); + c = input.get(); result += c; - if(c == L']') + if(c == ']') { int resultlen = result.size(); if(result[resultlen-5] == '[' && result[resultlen-4] == '[' && result[resultlen-3] == '/') //ending blank [[/]] { - fputws(result.c_str(), output); + write(result, output); break; } else @@ -328,7 +230,7 @@ FSTProcessor::wblankPostGen(FILE *input, FILE *output) } } - if(c != L']') + if(c != ']') { streamError(); } @@ -337,63 +239,65 @@ FSTProcessor::wblankPostGen(FILE *input, FILE *output) } int -FSTProcessor::readAnalysis(FILE *input) +FSTProcessor::readAnalysis(InputFile& input) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); - int altval = 0; - if(feof(input)) + UChar32 val = input.get(); + int32_t altval = 0; + if(input.eof()) { input_buffer.add(0); // so it's treated like the NUL byte return 0; + } else if(val == U_EOF) { + val = 0; } if((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end()) { input_buffer.add(val); - val = static_cast(fgetwc_unlocked(input)); + val = input.get(); } if(escaped_chars.find(val) != escaped_chars.end()) { switch(val) { - case L'<': - altval = static_cast(alphabet(readFullBlock(input, L'<', L'>'))); + case '<': + altval = alphabet(input.readBlock('<', '>')); input_buffer.add(altval); return altval; - case L'[': - val = static_cast(fgetwc_unlocked(input)); + case '[': + val = input.get(); - if(val == L'[') + if(val == '[') { - blankqueue.push(readWblank(input)); + blankqueue.push(input.finishWBlank()); } else { - ungetwc_unlocked(val, input); - blankqueue.push(readFullBlock(input, L'[', L']')); + input.unget(val); + blankqueue.push(input.readBlock('[', ']')); } - input_buffer.add(static_cast(L' ')); - return static_cast(L' '); + input_buffer.add(static_cast(' ')); + return static_cast(' '); - case L'\\': - val = static_cast(fgetwc_unlocked(input)); - input_buffer.add(static_cast(val)); + case '\\': + val = input.get(); + input_buffer.add(static_cast(val)); return val; default: streamError(); } } - if(val == L' ') { - blankqueue.push(L" "); + if(val == ' ') { + blankqueue.push(" "_u); } input_buffer.add(val); @@ -401,7 +305,7 @@ FSTProcessor::readAnalysis(FILE *input) } int -FSTProcessor::readTMAnalysis(FILE *input) +FSTProcessor::readTMAnalysis(InputFile& input) { isLastBlankTM = false; if(!input_buffer.isEmpty()) @@ -409,64 +313,64 @@ FSTProcessor::readTMAnalysis(FILE *input) return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); - int altval = 0; - if(feof(input)) + UChar32 val = input.get(); + int32_t altval = 0; + if(input.eof()) { return 0; } - if(escaped_chars.find(val) != escaped_chars.end() || iswdigit(val)) + if(escaped_chars.find(val) != escaped_chars.end() || u_isdigit(val)) { switch(val) { - case L'<': - altval = static_cast(alphabet(readFullBlock(input, L'<', L'>'))); + case '<': + altval = alphabet(input.readBlock('<', '>')); input_buffer.add(altval); return altval; - case L'[': - val = static_cast(fgetwc_unlocked(input)); + case '[': + val = input.get(); - if(val == L'[') + if(val == '[') { - blankqueue.push(readWblank(input)); + blankqueue.push(input.finishWBlank()); } else { - ungetwc_unlocked(val, input); - blankqueue.push(readFullBlock(input, L'[', L']')); + input.unget(val); + blankqueue.push(input.readBlock('[', ']')); } - input_buffer.add(static_cast(L' ')); + input_buffer.add(static_cast(' ')); isLastBlankTM = true; - return static_cast(L' '); + return static_cast(' '); - case L'\\': - val = static_cast(fgetwc_unlocked(input)); - input_buffer.add(static_cast(val)); + case '\\': + val = input.get(); + input_buffer.add(static_cast(val)); return val; - case L'0': - case L'1': - case L'2': - case L'3': - case L'4': - case L'5': - case L'6': - case L'7': - case L'8': - case L'9': - { - wstring ws = L""; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + UString ws; do { ws += val; - val = static_cast(fgetwc_unlocked(input)); - } while(iswdigit(val)); - ungetwc_unlocked(val, input); - input_buffer.add(alphabet(L"")); + val = input.get(); + } while(u_isdigit(val)); + input.unget(val); + input_buffer.add(alphabet(""_u)); numbers.push_back(ws); - return alphabet(L""); + return alphabet(""_u); } break; @@ -480,61 +384,61 @@ FSTProcessor::readTMAnalysis(FILE *input) } int -FSTProcessor::readPostgeneration(FILE *input, FILE *output) +FSTProcessor::readPostgeneration(InputFile& input, UFILE *output) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); - int altval = 0; + UChar32 val = input.get(); + int32_t altval = 0; is_wblank = false; - if(feof(input)) + if(input.eof()) { return 0; } switch(val) { - case L'<': - altval = static_cast(alphabet(readFullBlock(input, L'<', L'>'))); + case '<': + altval = alphabet(input.readBlock('<', '>')); input_buffer.add(altval); return altval; - case L'[': - val = static_cast(fgetwc_unlocked(input)); + case '[': + val = input.get(); - if(val == L'[') + if(val == '[') { if(collect_wblanks) { - wblankqueue.push(readWblank(input)); + wblankqueue.push(input.finishWBlank()); is_wblank = true; - return static_cast(L' '); + return static_cast(' '); } else if(wblankPostGen(input, output)) { - return static_cast(L'~'); + return static_cast('~'); } else { is_wblank = true; - return static_cast(L' '); + return static_cast(' '); } } else { - ungetwc_unlocked(val, input); - blankqueue.push(readFullBlock(input, L'[', L']')); + input.unget(val); + blankqueue.push(input.readBlock('[', ']')); - input_buffer.add(static_cast(L' ')); - return static_cast(L' '); + input_buffer.add(static_cast(' ')); + return static_cast(' '); } - case L'\\': - val = static_cast(fgetwc_unlocked(input)); - input_buffer.add(static_cast(val)); + case '\\': + val = input.get(); + input_buffer.add(static_cast(val)); return val; default: @@ -544,33 +448,33 @@ FSTProcessor::readPostgeneration(FILE *input, FILE *output) } void -FSTProcessor::skipUntil(FILE *input, FILE *output, wint_t const character) +FSTProcessor::skipUntil(InputFile& input, UFILE *output, UChar32 const character) { while(true) { - wint_t val = fgetwc_unlocked(input); - if(feof(input)) + UChar32 val = input.get(); + if(input.eof()) { return; } switch(val) { - case L'\\': - val = fgetwc_unlocked(input); - if(feof(input)) + case '\\': + val = input.get(); + if(input.eof()) { return; } - fputwc_unlocked(L'\\', output); - fputwc_unlocked(val, output); + u_fputc('\\', output); + u_fputc(val, output); break; - case L'\0': - fputwc_unlocked(val, output); + case '\0': + u_fputc(val, output); if(nullFlushGeneration) { - fflush(output); + u_fflush(output); } break; @@ -581,7 +485,7 @@ FSTProcessor::skipUntil(FILE *input, FILE *output, wint_t const character) } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } break; } @@ -589,47 +493,47 @@ FSTProcessor::skipUntil(FILE *input, FILE *output, wint_t const character) } int -FSTProcessor::readGeneration(FILE *input, FILE *output) +FSTProcessor::readGeneration(InputFile& input, UFILE *output) { - wint_t val = fgetwc_unlocked(input); + UChar32 val = input.get(); - if(feof(input)) + if(input.eof()) { return 0x7fffffff; } if(outOfWord) { - if(val == L'^') + if(val == '^') { - val = fgetwc_unlocked(input); - if(feof(input)) + val = input.get(); + if(input.eof()) { return 0x7fffffff; } } - else if(val == L'\\') + else if(val == '\\') { - fputwc_unlocked(val, output); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + val = input.get(); + if(input.eof()) { return 0x7fffffff; } - fputwc_unlocked(val,output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val,output); + skipUntil(input, output, '^'); + val = input.get(); + if(input.eof()) { return 0x7fffffff; } } else { - fputwc_unlocked(val, output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + skipUntil(input, output, '^'); + val = input.get(); + if(input.eof()) { return 0x7fffffff; } @@ -637,129 +541,106 @@ FSTProcessor::readGeneration(FILE *input, FILE *output) outOfWord = false; } - if(val == L'\\') + if(val == '\\') { - val = fgetwc_unlocked(input); - return static_cast(val); + val = input.get(); + return static_cast(val); } - else if(val == L'$') + else if(val == '$') { outOfWord = true; - return static_cast(L'$'); + return static_cast('$'); } - else if(val == L'<') + else if(val == '<') { - wstring cad = L""; - cad += static_cast(val); - - while((val = fgetwc_unlocked(input)) != L'>') - { - if(feof(input)) - { - streamError(); - } - cad += static_cast(val); - } - cad += static_cast(val); - - return alphabet(cad); + return alphabet(input.readBlock('<', '>')); } - else if(val == L'[') + else if(val == '[') { - val = fgetwc_unlocked(input); - if(val == L'[') + val = input.get(); + if(val == '[') { - fputws_unlocked(readWblank(input).c_str(), output); + write(input.finishWBlank(), output); } else { - ungetwc_unlocked(val, input); - fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); + input.unget(val); + write(input.readBlock('[', ']'), output); } return readGeneration(input, output); } else { - return static_cast(val); + return static_cast(val); } return 0x7fffffff; } -pair -FSTProcessor::readBilingual(FILE *input, FILE *output) +pair +FSTProcessor::readBilingual(InputFile& input, UFILE *output) { - wint_t val = fgetwc_unlocked(input); - wstring symbol = L""; + UChar32 val = input.get(); + UString symbol; - if(feof(input)) + if(input.eof()) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } if(outOfWord) { - if(val == L'^') + if(val == '^') { - val = fgetwc_unlocked(input); - if(feof(input)) + val = input.get(); + if(input.eof()) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } } - else if(val == L'\\') + else if(val == '\\') { - fputwc_unlocked(val, output); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + val = input.get(); + if(input.eof()) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } - fputwc_unlocked(val,output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val,output); + skipUntil(input, output, '^'); + val = input.get(); + if(input.eof()) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } } else { - fputwc_unlocked(val, output); - skipUntil(input, output, L'^'); - val = fgetwc_unlocked(input); - if(feof(input)) + u_fputc(val, output); + skipUntil(input, output, '^'); + val = input.get(); + if(input.eof()) { - return pair(symbol, 0x7fffffff); + return pair(symbol, 0x7fffffff); } } outOfWord = false; } - if(val == L'\\') + if(val == '\\') { - val = fgetwc_unlocked(input); - return pair(symbol, val); + val = input.get(); + return pair(symbol, val); } - else if(val == L'$') + else if(val == '$') { outOfWord = true; - return pair(symbol, static_cast(L'$')); + return pair(symbol, static_cast('$')); } - else if(val == L'<') + else if(val == '<') { - wstring cad = L""; - cad += static_cast(val); - while((val = fgetwc_unlocked(input)) != L'>') - { - if(feof(input)) - { - streamError(); - } - cad += static_cast(val); - } - cad += static_cast(val); + UString cad = input.readBlock('<', '>'); int res = alphabet(cad); @@ -767,66 +648,66 @@ FSTProcessor::readBilingual(FILE *input, FILE *output) { symbol = cad; } - return pair(symbol, res); + return pair(symbol, res); } - else if(val == L'[') + else if(val == '[') { - val = fgetwc_unlocked(input); - if(val == L'[') + val = input.get(); + if(val == '[') { - fputws_unlocked(readWblank(input).c_str(), output); + write(input.finishWBlank(), output); } else { - ungetwc_unlocked(val, input); - fputws_unlocked(readFullBlock(input, L'[', L']').c_str(), output); + input.unget(val); + write(input.readBlock('[', ']'), output); } return readBilingual(input, output); } - return pair(symbol, val); + return pair(symbol, val); } void -FSTProcessor::flushBlanks(FILE *output) +FSTProcessor::flushBlanks(UFILE *output) { for(size_t i = blankqueue.size(); i > 0; i--) { - fputws_unlocked(blankqueue.front().c_str(), output); + write(blankqueue.front(), output); blankqueue.pop(); } } void -FSTProcessor::flushWblanks(FILE *output) +FSTProcessor::flushWblanks(UFILE *output) { while(wblankqueue.size() > 0) { - fputws_unlocked(wblankqueue.front().c_str(), output); + write(wblankqueue.front(), output); wblankqueue.pop(); } } -wstring +UString FSTProcessor::combineWblanks() { - wstring final_wblank; - wstring last_wblank = L""; + UString final_wblank; + UString last_wblank; bool seen_wblank = false; while(wblankqueue.size() > 0) { - if(wblankqueue.front().compare(L"[[/]]") == 0) + if(wblankqueue.front().compare(WBLANK_FINAL) == 0) { if(seen_wblank) { if(final_wblank.empty()) { - final_wblank += L"[["; + final_wblank += WBLANK_START; } else if(final_wblank.size() > 2) { - final_wblank += L"; "; + final_wblank += "; "_u; } final_wblank += last_wblank.substr(2,last_wblank.size()-4); //add wblank without brackets [[..]] @@ -851,7 +732,7 @@ FSTProcessor::combineWblanks() if(!final_wblank.empty()) { - final_wblank += L"]]"; + final_wblank += WBLANK_END; need_end_wblank = true; } return final_wblank; @@ -860,18 +741,15 @@ FSTProcessor::combineWblanks() void FSTProcessor::calcInitial() { - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - root.addTransition(0, 0, it->second.getInitial(), default_weight); + for(auto& it : transducers) { + root.addTransition(0, 0, it.second.getInitial(), default_weight); } initial_state.init(&root); } bool -FSTProcessor::endsWith(wstring const &str, wstring const &suffix) +FSTProcessor::endsWith(UString const &str, UString const &suffix) { if(str.size() < suffix.size()) { @@ -886,64 +764,61 @@ FSTProcessor::endsWith(wstring const &str, wstring const &suffix) void FSTProcessor::classifyFinals() { - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - if(endsWith(it->first, L"@inconditional")) + for(auto& it : transducers) { + if(endsWith(it.first, "@inconditional"_u)) { - inconditional.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + inconditional.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } - else if(endsWith(it->first, L"@standard")) + else if(endsWith(it.first, "@standard"_u)) { - standard.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + standard.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } - else if(endsWith(it->first, L"@postblank")) + else if(endsWith(it.first, "@postblank"_u)) { - postblank.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + postblank.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } - else if(endsWith(it->first, L"@preblank")) + else if(endsWith(it.first, "@preblank"_u)) { - preblank.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + preblank.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } else { - wcerr << L"Error: Unsupported transducer type for '"; - wcerr << it->first << L"'." << endl; + cerr << "Error: Unsupported transducer type for '"; + cerr << it.first << "'." << endl; exit(EXIT_FAILURE); } } } void -FSTProcessor::writeEscaped(wstring const &str, FILE *output) +FSTProcessor::writeEscaped(UString const &str, UFILE *output) { for(unsigned int i = 0, limit = str.size(); i < limit; i++) { if(escaped_chars.find(str[i]) != escaped_chars.end()) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(str[i], output); + u_fputc(str[i], output); } } size_t -FSTProcessor::writeEscapedPopBlanks(wstring const &str, FILE *output) +FSTProcessor::writeEscapedPopBlanks(UString const &str, UFILE *output) { size_t postpop = 0; for (unsigned int i = 0, limit = str.size(); i < limit; i++) { if (escaped_chars.find(str[i]) != escaped_chars.end()) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(str[i], output); - if (str[i] == L' ') { - if (blankqueue.front() == L" ") { + u_fputc(str[i], output); + if (str[i] == ' ') { + if (blankqueue.front() == " "_u) { blankqueue.pop(); } else { postpop++; @@ -954,71 +829,67 @@ FSTProcessor::writeEscapedPopBlanks(wstring const &str, FILE *output) } void -FSTProcessor::writeEscapedWithTags(wstring const &str, FILE *output) +FSTProcessor::writeEscapedWithTags(UString const &str, UFILE *output) { for(unsigned int i = 0, limit = str.size(); i < limit; i++) { - if(str[i] == L'<' && i >=1 && str[i-1] != L'\\') + if(str[i] == '<' && i >=1 && str[i-1] != '\\') { - fputws_unlocked(str.substr(i).c_str(), output); + write(str.substr(i), output); return; } if(escaped_chars.find(str[i]) != escaped_chars.end()) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(str[i], output); + u_fputc(str[i], output); } } void -FSTProcessor::printWord(wstring const &sf, wstring const &lf, FILE *output) +FSTProcessor::printWord(UString const &sf, UString const &lf, UFILE *output) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(sf, output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L'$', output); + write(lf, output); + u_fputc('$', output); } void -FSTProcessor::printWordPopBlank(wstring const &sf, wstring const &lf, FILE *output) +FSTProcessor::printWordPopBlank(UString const &sf, UString const &lf, UFILE *output) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); size_t postpop = writeEscapedPopBlanks(sf, output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "%S$", lf.c_str()); while (postpop-- && blankqueue.size() > 0) { - fputws(blankqueue.front().c_str(), output); + write(blankqueue.front(), output); blankqueue.pop(); } } void -FSTProcessor::printWordBilingual(wstring const &sf, wstring const &lf, FILE *output) +FSTProcessor::printWordBilingual(UString const &sf, UString const &lf, UFILE *output) { - fputwc_unlocked(L'^', output); - fputws_unlocked(sf.c_str(), output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L'$', output); + u_fprintf(output, "^%S%S$", sf.c_str(), lf.c_str()); } void -FSTProcessor::printUnknownWord(wstring const &sf, FILE *output) +FSTProcessor::printUnknownWord(UString const &sf, UFILE *output) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(sf, output); - fputwc_unlocked(L'/', output); - fputwc_unlocked(L'*', output); + u_fputc('/', output); + u_fputc('*', output); writeEscaped(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } unsigned int -FSTProcessor::lastBlank(wstring const &str) +FSTProcessor::lastBlank(UString const &str) { for(int i = static_cast(str.size())-1; i >= 0; i--) { @@ -1032,7 +903,7 @@ FSTProcessor::lastBlank(wstring const &str) } void -FSTProcessor::printSpace(wchar_t const val, FILE *output) +FSTProcessor::printSpace(UChar const val, UFILE *output) { if(blankqueue.size() > 0) { @@ -1040,20 +911,20 @@ FSTProcessor::printSpace(wchar_t const val, FILE *output) } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } } bool -FSTProcessor::isEscaped(wchar_t const c) const +FSTProcessor::isEscaped(UChar32 const c) const { return escaped_chars.find(c) != escaped_chars.end(); } bool -FSTProcessor::isAlphabetic(wchar_t const c) const +FSTProcessor::isAlphabetic(UChar32 const c) const { - return (bool)std::iswalnum(c) || alphabetic_chars.find(c) != alphabetic_chars.end(); + return u_isalnum(c) || alphabetic_chars.find(c) != alphabetic_chars.end(); } void @@ -1062,7 +933,7 @@ FSTProcessor::load(FILE *input) fpos_t pos; if (fgetpos(input, &pos) == 0) { char header[4]{}; - fread(header, 1, 4, input); + fread_unlocked(header, 1, 4, input); if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { auto features = read_le(input); if (features >= LTF_UNKNOWN) { @@ -1079,7 +950,7 @@ FSTProcessor::load(FILE *input) int len = Compression::multibyte_read(input); while(len > 0) { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); len--; } @@ -1090,278 +961,12 @@ FSTProcessor::load(FILE *input) while(len > 0) { - int len2 = Compression::multibyte_read(input); - wstring name = L""; - while(len2 > 0) - { - name += static_cast(Compression::multibyte_read(input)); - len2--; - } + UString name = Compression::string_read(input); transducers[name].read(input, alphabet); len--; } } -void -FSTProcessor::lsx_wrapper_null_flush(FILE *input, FILE *output) -{ - setNullFlush(false); - //nullFlushGeneration = true; - - while(!feof(input)) - { - lsx(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } - } -} - -void -FSTProcessor::lsx(FILE *input, FILE *output) -{ - if(getNullFlush()) - { - lsx_wrapper_null_flush(input, output); - } - - vector new_states, alive_states; - wstring blank, out, in, alt_out, alt_in; - bool outOfWord = true; - bool finalFound = false; - bool plus_thing = false; - - alive_states.push_back(initial_state); - - int val = -1; - - while(!feof(input) && val != 0) - { - val = fgetwc_unlocked(input); - - if(val == L'+' && isEscaped(val) && !outOfWord) - { - val = L'$'; - plus_thing = true; - } - - if((val == L'^' && isEscaped(val) && outOfWord) || feof(input) || val == 0) - { - blankqueue.push(blank); - - if(alive_states.size() == 0) - { - if(blankqueue.size() > 0) - { - fputws(blankqueue.front().c_str(), output); - fflush(output); - blankqueue.pop(); - } - - alive_states.push_back(initial_state); - - alt_in = L""; - for(int i=0; i < (int) in.size(); i++) // FIXME indexing - { - alt_in += in[i]; - if(in[i] == L'$' && in[i+1] == L'^' && blankqueue.size() > 0) - { - // in.insert(i+1, blankqueue.front().c_str()); - alt_in += blankqueue.front().c_str(); - blankqueue.pop(); - } - } - in = alt_in; - fputws(in.c_str(), output); - fflush(output); - in = L""; - finalFound = false; - } - else if(finalFound && alive_states.size() == 1) - { - finalFound = false; - } - - blank = L""; - in += val; - outOfWord = false; - continue; - } - - // wcerr << L"\n[!] " << (wchar_t)val << L" ||| " << outOfWord << endl; - - if(outOfWord) - { - blank += val; - continue; - } - - if((val == 0 || feof(input) || val == L'$') && !outOfWord) // && isEscaped(val) - { - new_states.clear(); - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - State s = *it; - //wcerr << endl << L"[0] FEOF | $ | " << s.size() << L" | " << s.isFinal(all_finals) << endl; - s.step(alphabet(L"<$>")); - //wcerr << endl << L"[1] FEOF | $ | " << s.size() << L" | " << s.isFinal(all_finals) << endl; - if(s.size() > 0) - { - new_states.push_back(s); - } - - /*if(s.isFinal(all_finals)) - { - out += s.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses); - new_states.push_back(*initial_state); - }*/ - - if(s.isFinal(all_finals)) - { - new_states.clear(); - new_states.push_back(initial_state); - out = s.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses); - - alt_out = L""; - for (int i=0; i < (int) out.size(); i++) - { - wchar_t c = out.at(i); - if(c == L'/') - { - alt_out += L'^'; - } - else if(out[i-1] == L'<' && c == L'$' && out[i+1] == L'>') // indexing - { - alt_out += c; - alt_out += L'^'; - } - else if(!(c == L'<' && out[i+1] == L'$' && out[i+2] == L'>') && !(out[i-2] == L'<' && out[i-1] == L'$' && c == L'>')) - { - alt_out += c; - } - } - out = alt_out; - - - if(out[out.length()-1] == L'^') - { - out = out.substr(0, out.length()-1); // extra ^ at the end - if(plus_thing) - { - out[out.size()-1] = L'+'; - plus_thing = false; - } - } - else // take# out ... of - { - for(int i=out.length()-1; i>=0; i--) // indexing - { - if(out.at(i) == L'$') - { - out.insert(i+1, L" "); - break; - } - } - out += L'$'; - } - - if(blankqueue.size() > 0) - { - fputws(blankqueue.front().c_str(), output); - blankqueue.pop(); - } - - alt_out = L""; - for(int i=0; i < (int) out.size(); i++) // indexing - { - if((out.at(i) == L'$') && blankqueue.size() > 0) - { - alt_out += out.at(i); - alt_out += blankqueue.front().c_str(); - blankqueue.pop(); - } - else if((out.at(i) == L'$') && blankqueue.size() == 0 && i != (int) out.size()-1) - { - alt_out += out.at(i); - alt_out += L' '; - } - else if(out.at(i) == L' ' && blankqueue.size() > 0) - { - alt_out += blankqueue.front().c_str(); - blankqueue.pop(); - } - else - { - alt_out += out.at(i); - } - } - out = alt_out; - - fputws(out.c_str(), output); - flushBlanks(output); - finalFound = true; - out = L""; - in = L""; - } - } - - alive_states.swap(new_states); - outOfWord = true; - - if(!finalFound) - { - in += val; //do not remove - } - continue; - } - - if(!outOfWord) // && (!(feof(input) || val == L'$'))) - { - if(val == L'<') // tag - { - wstring tag = readFullBlock(input, L'<', L'>'); - in += tag; - if(!alphabet.isSymbolDefined(tag)) - { - alphabet.includeSymbol(tag); - } - val = static_cast(alphabet(tag)); - } - else - { - in += (wchar_t) val; - } - - new_states.clear(); - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - State s = *it; - if(val < 0) - { - s.step_override(val, alphabet(L""), val); - } - else if(val > 0) - { - int val_lowercase = towlower(val); - s.step_override(val_lowercase, alphabet(L""), val); // FIXME deal with cases! in step_override - } - - if(s.size() > 0) - { - new_states.push_back(s); - } - - } - alive_states.swap(new_states); - } - } - - flushBlanks(output); -} - void FSTProcessor::initAnalysis() { @@ -1378,12 +983,9 @@ FSTProcessor::initTMAnalysis() { calcInitial(); - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - all_finals.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + for(auto& it : transducers) { + all_finals.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } } @@ -1392,12 +994,9 @@ FSTProcessor::initGeneration() { setIgnoredChars(false); calcInitial(); - for(map::iterator it = transducers.begin(), - limit = transducers.end(); - it != limit; it++) - { - all_finals.insert(it->second.getFinals().begin(), - it->second.getFinals().end()); + for(auto& it : transducers) { + all_finals.insert(it.second.getFinals().begin(), + it.second.getFinals().end()); } } @@ -1414,8 +1013,8 @@ FSTProcessor::initBiltrans() } -wstring -FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupper) +UString +FSTProcessor::compoundAnalysis(UString input_word, bool uppercase, bool firstupper) { const int MAX_COMBINATIONS = 32767; @@ -1423,16 +1022,16 @@ FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupp for(unsigned int i=0; i MAX_COMBINATIONS) { - wcerr << L"Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << L"'" << endl; - wcerr << L" gave up at char " << i << L" '" << val << L"'." << endl; + cerr << "Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << "'" << endl; + cerr << " gave up at char " << i << " '" << val << "'." << endl; - wstring nullString = L""; + UString nullString; return nullString; } @@ -1443,13 +1042,13 @@ FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupp if(current_state.size()==0) { - wstring nullString = L""; + UString nullString; return nullString; } } current_state.pruneCompounds(compoundRSymbol, '+', compound_max_elements); - wstring result = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper); + UString result = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper); return result; } @@ -1459,30 +1058,30 @@ FSTProcessor::compoundAnalysis(wstring input_word, bool uppercase, bool firstupp void FSTProcessor::initDecompositionSymbols() { - if((compoundOnlyLSymbol=alphabet(L"<:co:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"<:compound:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"<@co:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"<@compound:only-L>")) == 0 - && (compoundOnlyLSymbol=alphabet(L"")) == 0) + if((compoundOnlyLSymbol=alphabet("<:co:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet("<:compound:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet("<@co:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet("<@compound:only-L>"_u)) == 0 + && (compoundOnlyLSymbol=alphabet(""_u)) == 0) { - wcerr << L"Warning: Decomposition symbol <:compound:only-L> not found" << endl; + cerr << "Warning: Decomposition symbol <:compound:only-L> not found" << endl; } else if(!showControlSymbols) { - alphabet.setSymbol(compoundOnlyLSymbol, L""); + alphabet.setSymbol(compoundOnlyLSymbol, ""_u); } - if((compoundRSymbol=alphabet(L"<:co:R>")) == 0 - && (compoundRSymbol=alphabet(L"<:compound:R>")) == 0 - && (compoundRSymbol=alphabet(L"<@co:R>")) == 0 - && (compoundRSymbol=alphabet(L"<@compound:R>")) == 0 - && (compoundRSymbol=alphabet(L"")) == 0) + if((compoundRSymbol=alphabet("<:co:R>"_u)) == 0 + && (compoundRSymbol=alphabet("<:compound:R>"_u)) == 0 + && (compoundRSymbol=alphabet("<@co:R>"_u)) == 0 + && (compoundRSymbol=alphabet("<@compound:R>"_u)) == 0 + && (compoundRSymbol=alphabet(""_u)) == 0) { - wcerr << L"Warning: Decomposition symbol <:compound:R> not found" << endl; + cerr << "Warning: Decomposition symbol <:compound:R> not found" << endl; } else if(!showControlSymbols) { - alphabet.setSymbol(compoundRSymbol, L""); + alphabet.setSymbol(compoundRSymbol, ""_u); } } @@ -1496,7 +1095,7 @@ FSTProcessor::initDecomposition() } void -FSTProcessor::analysis(FILE *input, FILE *output) +FSTProcessor::analysis(InputFile& input, UFILE *output) { if(getNullFlush()) { @@ -1507,13 +1106,13 @@ FSTProcessor::analysis(FILE *input, FILE *output) bool last_postblank = false; bool last_preblank = false; State current_state = initial_state; - wstring lf = L""; //lexical form - wstring sf = L""; //surface form + UString lf; //lexical form + UString sf; //surface form int last = 0; bool firstupper = false, uppercase = false; map >::iterator rcx_map_ptr; - wchar_t val; + UChar32 val; do { val = readAnalysis(input); @@ -1524,8 +1123,8 @@ FSTProcessor::analysis(FILE *input, FILE *output) { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } if(do_decomposition && compoundOnlyLSymbol != 0) @@ -1543,8 +1142,8 @@ FSTProcessor::analysis(FILE *input, FILE *output) { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } if(do_decomposition && compoundOnlyLSymbol != 0) @@ -1562,8 +1161,8 @@ FSTProcessor::analysis(FILE *input, FILE *output) { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } if(do_decomposition && compoundOnlyLSymbol != 0) @@ -1581,8 +1180,8 @@ FSTProcessor::analysis(FILE *input, FILE *output) { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } if(do_decomposition && compoundOnlyLSymbol != 0) @@ -1599,9 +1198,9 @@ FSTProcessor::analysis(FILE *input, FILE *output) last = input_buffer.getPos(); } } - else if(sf == L"" && iswspace(val)) + else if(sf.empty() && u_isspace(val)) { - lf = L"/*"; + lf = "/*"_u; lf.append(sf); last_postblank = false; last_preblank = false; @@ -1613,11 +1212,11 @@ FSTProcessor::analysis(FILE *input, FILE *output) { rcx_map_ptr = rcx_map.find(val); set tmpset = rcx_map_ptr->second; - if(!iswupper(val) || caseSensitive) + if(!u_isupper(val) || caseSensitive) { current_state.step(val, tmpset); } - else if(rcx_map.find(towlower(val)) != rcx_map.end()) + else if(rcx_map.find(u_tolower(val)) != rcx_map.end()) { rcx_map_ptr = rcx_map.find(tolower(val)); tmpset.insert(tolower(val)); @@ -1632,14 +1231,7 @@ FSTProcessor::analysis(FILE *input, FILE *output) } else { - if(!iswupper(val) || caseSensitive) - { - current_state.step(val); - } - else - { - current_state.step(val, towlower(val)); - } + current_state.step_case(val, caseSensitive); } if(current_state.size() != 0) @@ -1651,29 +1243,29 @@ FSTProcessor::analysis(FILE *input, FILE *output) } else { - if(!isAlphabetic(val) && sf == L"") + if(!isAlphabetic(val) && sf.empty()) { - if(iswspace(val)) + if(u_isspace(val)) { if (blankqueue.size() > 0) { - fputws_unlocked(blankqueue.front().c_str(), output); + write(blankqueue.front(), output); blankqueue.pop(); } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } } else { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } if(val) { - fputwc_unlocked(val, output); + u_fputc(val, output); } } } @@ -1681,13 +1273,13 @@ FSTProcessor::analysis(FILE *input, FILE *output) { printWordPopBlank(sf.substr(0, sf.size()-input_buffer.diffPrevPos(last)), lf, output); - fputwc_unlocked(L' ', output); + u_fputc(' ', output); input_buffer.setPos(last); input_buffer.back(1); } else if(last_preblank) { - fputwc_unlocked(L' ', output); + u_fputc(' ', output); printWordPopBlank(sf.substr(0, sf.size()-input_buffer.diffPrevPos(last)), lf, output); input_buffer.setPos(last); @@ -1702,7 +1294,7 @@ FSTProcessor::analysis(FILE *input, FILE *output) } else if(isAlphabetic(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || - lf == L"")) + lf.empty())) { do { @@ -1712,7 +1304,7 @@ FSTProcessor::analysis(FILE *input, FILE *output) unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); if(limit == 0) { input_buffer.back(sf.size()); @@ -1721,18 +1313,18 @@ FSTProcessor::analysis(FILE *input, FILE *output) else { input_buffer.back(1+(size-limit)); - wstring unknown_word = sf.substr(0, limit); + UString unknown_word = sf.substr(0, limit); if(do_decomposition) { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } - wstring compound = L""; + UString compound; compound = compoundAnalysis(unknown_word, uppercase, firstupper); - if(compound != L"") + if(!compound.empty()) { printWord(unknown_word, compound, output); } @@ -1747,11 +1339,11 @@ FSTProcessor::analysis(FILE *input, FILE *output) } } } - else if(lf == L"") + else if(lf.empty()) { unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); if(limit == 0) { input_buffer.back(sf.size()); @@ -1760,18 +1352,18 @@ FSTProcessor::analysis(FILE *input, FILE *output) else { input_buffer.back(1+(size-limit)); - wstring unknown_word = sf.substr(0, limit); + UString unknown_word = sf.substr(0, limit); if(do_decomposition) { if(!dictionaryCase) { - firstupper = iswupper(sf[0]); - uppercase = firstupper && iswupper(sf[sf.size()-1]); + firstupper = u_isupper(sf[0]); + uppercase = firstupper && u_isupper(sf[sf.size()-1]); } - wstring compound = L""; + UString compound; compound = compoundAnalysis(unknown_word, uppercase, firstupper); - if(compound != L"") + if(!compound.empty()) { printWord(unknown_word, compound, output); } @@ -1801,8 +1393,8 @@ FSTProcessor::analysis(FILE *input, FILE *output) } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); last_incond = false; last_postblank = false; last_preblank = false; @@ -1815,102 +1407,82 @@ FSTProcessor::analysis(FILE *input, FILE *output) } void -FSTProcessor::analysis_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::analysis_wrapper_null_flush(InputFile& input, UFILE *output) { setNullFlush(false); - while(!feof(input)) + while(!input.eof()) { analysis(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::generation_wrapper_null_flush(FILE *input, FILE *output, +FSTProcessor::generation_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode) { setNullFlush(false); nullFlushGeneration = true; - while(!feof(input)) + while(!input.eof()) { generation(input, output, mode); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::postgeneration_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::postgeneration_wrapper_null_flush(InputFile& input, UFILE *output) { setNullFlush(false); - while(!feof(input)) + while(!input.eof()) { postgeneration(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::intergeneration_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::intergeneration_wrapper_null_flush(InputFile& input, UFILE *output) { setNullFlush(false); - while (!feof(input)) + while (!input.eof()) { intergeneration(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if (code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::transliteration_wrapper_null_flush(FILE *input, FILE *output) +FSTProcessor::transliteration_wrapper_null_flush(InputFile& input, UFILE *output) { setNullFlush(false); - while(!feof(input)) + while(!input.eof()) { transliteration(input, output); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } void -FSTProcessor::tm_analysis(FILE *input, FILE *output) +FSTProcessor::tm_analysis(InputFile& input, UFILE *output) { State current_state = initial_state; - wstring lf = L""; //lexical form - wstring sf = L""; //surface form + UString lf; //lexical form + UString sf; //surface form int last = 0; - while(wchar_t val = readTMAnalysis(input)) + while(int32_t val = readTMAnalysis(input)) { // test for final states if(current_state.isFinal(all_finals)) { - if(iswpunct(val)) + if(u_ispunct(val)) { lf = current_state.filterFinalsTM(all_finals, alphabet, escaped_chars, @@ -1919,20 +1491,13 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) numbers.clear(); } } - else if(sf == L"" && iswspace(val)) + else if(sf.empty() && u_isspace(val)) { lf.append(sf); last = input_buffer.getPos(); } - if(!iswupper(val)) - { - current_state.step(val); - } - else - { - current_state.step(val, towlower(val)); - } + current_state.step_case(val, false); if(current_state.size() != 0) { @@ -1940,7 +1505,7 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) { sf.append(numbers[numbers.size()-1]); } - else if(isLastBlankTM && val == L' ') + else if(isLastBlankTM && val == ' ') { sf.append(blankqueue.back()); } @@ -1951,9 +1516,9 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) } else { - if((iswspace(val) || iswpunct(val)) && sf == L"") + if((u_isspace(val) || u_ispunct(val)) && sf.empty()) { - if(iswspace(val)) + if(u_isspace(val)) { printSpace(val, output); } @@ -1961,14 +1526,14 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } - else if(!iswspace(val) && !iswpunct(val) && + else if(!u_isspace(val) && !u_ispunct(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || - lf == L"")) + lf.empty())) { do @@ -1977,7 +1542,7 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) { sf.append(numbers[numbers.size()-1]); } - else if(isLastBlankTM && val == L' ') + else if(isLastBlankTM && val == ' ') { sf.append(blankqueue.back()); } @@ -1986,16 +1551,16 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) alphabet.getSymbol(sf, val); } } - while((val = readTMAnalysis(input)) && !iswspace(val) && !iswpunct(val)); + while((val = readTMAnalysis(input)) && !u_isspace(val) && !u_ispunct(val)); if(val == 0) { - fputws_unlocked(sf.c_str(), output); + write(sf, output); return; } input_buffer.back(1); - fputws_unlocked(sf.c_str(), output); + write(sf, output); while(blankqueue.size() > 0) { @@ -2007,22 +1572,22 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) } /* - unsigned int limit = sf.find(L' '); + unsigned int limit = sf.find(' '); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(sf.substr(0, limit).c_str(), output); + write(sf.substr(0, limit), output); */ } - else if(lf == L"") + else if(lf.empty()) { -/* unsigned int limit = sf.find(L' '); +/* unsigned int limit = sf.find(' '); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(sf.substr(0, limit).c_str(), output); + write(sf.substr(0, limit), output); */ input_buffer.back(1); - fputws_unlocked(sf.c_str(), output); + write(sf, output); while(blankqueue.size() > 0) { @@ -2036,16 +1601,14 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) } else { - fputwc_unlocked(L'[', output); - fputws_unlocked(lf.c_str(), output); - fputwc_unlocked(L']', output); + u_fprintf(output, "[%S]", lf.c_str()); input_buffer.setPos(last); input_buffer.back(1); } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); } } @@ -2055,7 +1618,7 @@ FSTProcessor::tm_analysis(FILE *input, FILE *output) void -FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) +FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode) { if(getNullFlush()) { @@ -2063,24 +1626,24 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) } State current_state = initial_state; - wstring sf = L""; + UString sf; outOfWord = false; - skipUntil(input, output, L'^'); + skipUntil(input, output, '^'); int val; while((val = readGeneration(input, output)) != 0x7fffffff) { - if(sf == L"" && val == L'=') + if(sf.empty() && val == '=') { - fputwc(L'=', output); + u_fputc('=', output); val = readGeneration(input, output); } - if(val == L'$' && outOfWord) + if(val == '$' && outOfWord) { - if(sf[0] == L'*' || sf[0] == L'%') + if(sf[0] == '*' || sf[0] == '%') { if(mode != gm_clean && mode != gm_tagged_nm) { @@ -2092,14 +1655,14 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) } else if(mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(removeTags(sf.substr(1)), output); - fputwc_unlocked(L'/', output); + u_fputc('/', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } - else if(sf[0] == L'@') + else if(sf[0] == '@') { if(mode == gm_all) { @@ -2119,11 +1682,11 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) } else if(mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(removeTags(sf.substr(1)), output); - fputwc_unlocked(L'/', output); + u_fputc('/', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } else if(current_state.isFinal(all_finals)) @@ -2131,24 +1694,24 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) bool firstupper = false, uppercase = false; if(!dictionaryCase) { - uppercase = sf.size() > 1 && iswupper(sf[1]); - firstupper= iswupper(sf[0]); + uppercase = sf.size() > 1 && u_isupper(sf[1]); + firstupper= u_isupper(sf[0]); } if(mode == gm_tagged || mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); } - fputws_unlocked(current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper).substr(1).c_str(), output); + write(current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper).substr(1), output); if(mode == gm_tagged || mode == gm_tagged_nm) { - fputwc_unlocked(L'/', output); + u_fputc('/', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } @@ -2156,7 +1719,7 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) { if(mode == gm_all) { - fputwc_unlocked(L'#', output); + u_fputc('#', output); writeEscaped(sf, output); } else if(mode == gm_clean) @@ -2165,36 +1728,36 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) } else if(mode == gm_unknown) { - if(sf != L"") + if(!sf.empty()) { - fputwc_unlocked(L'#', output); + u_fputc('#', output); writeEscaped(removeTags(sf), output); } } else if(mode == gm_tagged) { - fputwc_unlocked(L'#', output); + u_fputc('#', output); writeEscaped(removeTags(sf), output); } else if(mode == gm_tagged_nm) { - fputwc_unlocked(L'^', output); + u_fputc('^', output); writeEscaped(removeTags(sf), output); - fputwc_unlocked(L'/', output); - fputwc_unlocked(L'#', output); + u_fputc('/', output); + u_fputc('#', output); writeEscapedWithTags(sf, output); - fputwc_unlocked(L'$', output); + u_fputc('$', output); } } current_state = initial_state; - sf = L""; + sf.clear(); } - else if(iswspace(val) && sf.size() == 0) + else if(u_isspace(val) && sf.size() == 0) { // do nothing } - else if(sf.size() > 0 && (sf[0] == L'*' || sf[0] == L'%' )) + else if(sf.size() > 0 && (sf[0] == '*' || sf[0] == '%' )) { alphabet.getSymbol(sf, val); } @@ -2203,15 +1766,15 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) alphabet.getSymbol(sf,val); if(current_state.size() > 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { if(mode == gm_carefulcase) { - current_state.step_careful(val, towlower(val)); + current_state.step_careful(val, u_tolower(val)); } else { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } } else @@ -2224,7 +1787,7 @@ FSTProcessor::generation(FILE *input, FILE *output, GenerationMode mode) } void -FSTProcessor::postgeneration(FILE *input, FILE *output) +FSTProcessor::postgeneration(InputFile& input, UFILE *output) { if(getNullFlush()) { @@ -2235,14 +1798,14 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) collect_wblanks = false; need_end_wblank = false; State current_state = initial_state; - wstring lf = L""; - wstring sf = L""; + UString lf; + UString sf; int last = 0; - set empty_escaped_chars; + set empty_escaped_chars; - while(wchar_t val = readPostgeneration(input, output)) + while(UChar val = readPostgeneration(input, output)) { - if(val == L'~') + if(val == '~') { skip_mode = false; collect_wblanks = true; @@ -2254,11 +1817,11 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) } else if(skip_mode) { - if(iswspace(val)) + if(u_isspace(val)) { if(need_end_wblank) { - fputws_unlocked(L"[[/]]", output); + write(WBLANK_FINAL, output); need_end_wblank = false; } @@ -2273,13 +1836,13 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); if(need_end_wblank) { - fputws_unlocked(L"[[/]]", output); + write(WBLANK_FINAL, output); need_end_wblank = false; } } @@ -2294,8 +1857,8 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) // test for final states if(current_state.isFinal(all_finals)) { - bool firstupper = iswupper(sf[1]); - bool uppercase = sf.size() > 1 && firstupper && iswupper(sf[2]); + bool firstupper = u_isupper(sf[1]); + bool uppercase = sf.size() > 1 && firstupper && u_isupper(sf[2]); lf = current_state.filterFinals(all_finals, alphabet, empty_escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, @@ -2303,7 +1866,7 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) // case of the beggining of the next word - wstring mybuf = L""; + UString mybuf; for(size_t i = sf.size(); i > 0; --i) { if(!isalpha(sf[i-1])) @@ -2318,8 +1881,8 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) if(mybuf.size() > 0) { - bool myfirstupper = iswupper(mybuf[0]); - bool myuppercase = mybuf.size() > 1 && iswupper(mybuf[1]); + bool myfirstupper = u_isupper(mybuf[0]); + bool myuppercase = mybuf.size() > 1 && u_isupper(mybuf[1]); for(size_t i = lf.size(); i > 0; --i) { @@ -2327,11 +1890,11 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) { if(myfirstupper && i != lf.size()) { - lf[i] = towupper(lf[i]); + lf[i] = u_toupper(lf[i]); } else { - lf[i] = towlower(lf[i]); + lf[i] = u_tolower(lf[i]); } break; } @@ -2339,11 +1902,11 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) { if(myuppercase) { - lf[i-1] = towupper(lf[i-1]); + lf[i-1] = u_toupper(lf[i-1]); } else { - lf[i-1] = towlower(lf[i-1]); + lf[i-1] = u_tolower(lf[i-1]); } } } @@ -2352,14 +1915,7 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) last = input_buffer.getPos(); } - if(!iswupper(val) || caseSensitive) - { - current_state.step(val); - } - else - { - current_state.step(val, towlower(val)); - } + current_state.step_case(val, caseSensitive); if(current_state.size() != 0) { @@ -2367,51 +1923,51 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) } else { - wstring final_wblank = combineWblanks(); - fputws_unlocked(final_wblank.c_str(), output); + UString final_wblank = combineWblanks(); + write(final_wblank, output); - if(lf == L"") + if(lf.empty()) { unsigned int mark = sf.size(); unsigned int space_index = sf.size(); - + for(unsigned int i = 1, limit = sf.size(); i < limit; i++) { - if(sf[i] == L'~') + if(sf[i] == '~') { mark = i; break; } - else if(sf[i] == L' ') + else if(sf[i] == ' ') { space_index = i; } } - + if(space_index != sf.size()) { - fputws_unlocked(sf.substr(1, space_index-1).c_str(), output); - + write(sf.substr(1, space_index-1), output); + if(need_end_wblank) { - fputws_unlocked(L"[[/]]", output); + write(WBLANK_FINAL, output); need_end_wblank = false; - fputwc_unlocked(sf[space_index], output); + u_fputc(sf[space_index], output); flushWblanks(output); } else { - fputwc_unlocked(sf[space_index], output); + u_fputc(sf[space_index], output); } - - fputws_unlocked(sf.substr(space_index+1, mark-space_index-1).c_str(), output); + + write(sf.substr(space_index+1, mark-space_index-1), output); } else { flushWblanks(output); - fputws_unlocked(sf.substr(1, mark-1).c_str(), output); + write(sf.substr(1, mark-1), output); } - + if(mark == sf.size()) { input_buffer.back(1); @@ -2423,11 +1979,11 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) } else { - fputws_unlocked(lf.substr(1,lf.size()-3).c_str(), output); + write(lf.substr(1,lf.size()-3), output); input_buffer.setPos(last); input_buffer.back(2); val = lf[lf.size()-2]; - if(iswspace(val)) + if(u_isspace(val)) { printSpace(val, output); } @@ -2435,15 +1991,15 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); skip_mode = true; collect_wblanks = false; } @@ -2455,7 +2011,7 @@ FSTProcessor::postgeneration(FILE *input, FILE *output) } void -FSTProcessor::intergeneration(FILE *input, FILE *output) +FSTProcessor::intergeneration(InputFile& input, UFILE *output) { if (getNullFlush()) { @@ -2464,35 +2020,35 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) bool skip_mode = true; State current_state = initial_state; - wstring target = L""; - wstring source = L""; + UString target; + UString source; int last = 0; - set empty_escaped_chars; + set empty_escaped_chars; while (true) { - wchar_t val = readPostgeneration(input, output); + UChar val = readPostgeneration(input, output); - if (val == L'~') + if (val == '~') { skip_mode = false; } if (skip_mode) { - if (iswspace(val)) + if (u_isspace(val)) { printSpace(val, output); } else { - if(val != L'\0') + if(val != '\0') { if (isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } } @@ -2501,8 +2057,8 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) // test for final states if (current_state.isFinal(all_finals)) { - bool firstupper = iswupper(source[1]); - bool uppercase = source.size() > 1 && firstupper && iswupper(source[2]); + bool firstupper = u_isupper(source[1]); + bool uppercase = source.size() > 1 && firstupper && u_isupper(source[2]); target = current_state.filterFinals(all_finals, alphabet, empty_escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, @@ -2511,39 +2067,32 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) last = input_buffer.getPos(); } - if (val != L'\0') + if (val != '\0') { - if (!iswupper(val) || caseSensitive) - { - current_state.step(val); - } - else - { - current_state.step(val, towlower(val)); - } + current_state.step_case(val, caseSensitive); } - if (val != L'\0' && current_state.size() != 0) + if (val != '\0' && current_state.size() != 0) { alphabet.getSymbol(source, val); } else { - if (target == L"") // no match + if (target.empty()) // no match { - if (val == L'\0') + if (val == '\0') { // flush source - fputws_unlocked(source.c_str(), output); + write(source, output); } else { - fputwc_unlocked(source[0], output); + u_fputc(source[0], output); unsigned int mark, limit; - for (mark = 1, limit = source.size(); mark < limit && source[mark] != L'~' ; mark++) + for (mark = 1, limit = source.size(); mark < limit && source[mark] != '~' ; mark++) { - fputwc_unlocked(source[mark], output); + u_fputc(source[mark], output); } if (mark != source.size()) @@ -2552,20 +2101,20 @@ FSTProcessor::intergeneration(FILE *input, FILE *output) input_buffer.back(back); } - if (val == L'~') + if (val == '~') { input_buffer.back(1); } else { - fputwc_unlocked(val, output); + u_fputc(val, output); } } } else { for(unsigned int i=1; i 1 && firstupper && iswupper(sf[2]); + bool firstupper = u_isupper(sf[1]); + bool uppercase = sf.size() > 1 && firstupper && u_isupper(sf[2]); lf = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper, 0); if(!lf.empty()) { - fputws_unlocked(lf.substr(1).c_str(), output); + write(lf.substr(1), output); current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); } - if(iswspace(val)) + if(u_isspace(val)) { printSpace(val, output); } @@ -2640,17 +2189,17 @@ FSTProcessor::transliteration(FILE *input, FILE *output) { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } else { if(current_state.isFinal(all_finals)) { - bool firstupper = iswupper(sf[1]); - bool uppercase = sf.size() > 1 && firstupper && iswupper(sf[2]); + bool firstupper = u_isupper(sf[1]); + bool uppercase = sf.size() > 1 && firstupper && u_isupper(sf[2]); lf = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper, 0); @@ -2666,14 +2215,14 @@ FSTProcessor::transliteration(FILE *input, FILE *output) { if(!lf.empty()) { - fputws_unlocked(lf.substr(1).c_str(), output); + write(lf.substr(1), output); input_buffer.setPos(last); input_buffer.back(1); val = lf[lf.size()-1]; } else { - if(iswspace(val)) + if(u_isspace(val)) { printSpace(val, output); } @@ -2681,14 +2230,14 @@ FSTProcessor::transliteration(FILE *input, FILE *output) { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); } } } @@ -2696,14 +2245,14 @@ FSTProcessor::transliteration(FILE *input, FILE *output) flushBlanks(output); } -wstring -FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) +UString +FSTProcessor::biltransfull(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; - wstring queue = L""; + UString queue; bool mark = false; if(with_delim == false) @@ -2712,37 +2261,37 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { return input_word; } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; } - bool firstupper = iswupper(input_word[start_point]); - bool uppercase = firstupper && iswupper(input_word[start_point+1]); + bool firstupper = u_isupper(input_word[start_point]); + bool uppercase = firstupper && u_isupper(input_word[start_point+1]); for(unsigned int i = start_point; i <= end_point; i++) { int val; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@ -2752,13 +2301,13 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) } else { - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } else { @@ -2767,37 +2316,22 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = L"^="+result.substr(1); - } - else - { - result[0] = L'^'; - } + result.clear(); + if(with_delim) { + result += '^'; } - else - { - if(mark) - { - result = L"=" + result.substr(1); - } - else - { - result = result.substr(1); - } + if(mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) { - if(symbol != L"" && result != L"") + if(!symbol.empty() && !result.empty()) { queue.append(symbol); } @@ -2806,11 +2340,11 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } return result; } @@ -2819,23 +2353,23 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) if(start_point < (end_point - 3)) { - return L"^$"; + return "^$"_u; } // attach unmatched queue automatically - if(queue != L"") + if(!queue.empty()) { - wstring result_with_queue = L""; + UString result_with_queue; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { switch(result[i]) { - case L'\\': - result_with_queue += L'\\'; + case '\\': + result_with_queue += '\\'; i++; break; - case L'/': + case '/': result_with_queue.append(queue); break; @@ -2848,7 +2382,7 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) if(with_delim) { - result_with_queue += L'$'; + result_with_queue += '$'; } return result_with_queue; } @@ -2856,7 +2390,7 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) { if(with_delim) { - result += L'$'; + result += '$'; } return result; } @@ -2864,14 +2398,14 @@ FSTProcessor::biltransfull(wstring const &input_word, bool with_delim) -wstring -FSTProcessor::biltrans(wstring const &input_word, bool with_delim) +UString +FSTProcessor::biltrans(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; - wstring queue = L""; + UString queue; bool mark = false; if(with_delim == false) @@ -2880,37 +2414,37 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { return input_word; } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; } - bool firstupper = iswupper(input_word[start_point]); - bool uppercase = firstupper && iswupper(input_word[start_point+1]); + bool firstupper = u_isupper(input_word[start_point]); + bool uppercase = firstupper && u_isupper(input_word[start_point+1]); for(unsigned int i = start_point; i <= end_point; i++) { int val; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@ -2920,13 +2454,13 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) } else { - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } else { @@ -2935,37 +2469,22 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = L"^="+result.substr(1); - } - else - { - result[0] = L'^'; - } + result.clear(); + if (with_delim) { + result += '^'; } - else - { - if(mark) - { - result = L"=" + result.substr(1); - } - else - { - result = result.substr(1); - } + if (mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) { - if(symbol != L"" && result != L"") + if(!symbol.empty() && !result.empty()) { queue.append(symbol); } @@ -2974,11 +2493,11 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } return result; } @@ -2987,19 +2506,19 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) // attach unmatched queue automatically - if(queue != L"") + if(!queue.empty()) { - wstring result_with_queue = L""; + UString result_with_queue; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { switch(result[i]) { - case L'\\': - result_with_queue += L'\\'; + case '\\': + result_with_queue += '\\'; i++; break; - case L'/': + case '/': result_with_queue.append(queue); break; @@ -3012,7 +2531,7 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) if(with_delim) { - result_with_queue += L'$'; + result_with_queue += '$'; } return result_with_queue; } @@ -3020,54 +2539,53 @@ FSTProcessor::biltrans(wstring const &input_word, bool with_delim) { if(with_delim) { - result += L'$'; + result += '$'; } return result; } } void -FSTProcessor::bilingual_wrapper_null_flush(FILE *input, FILE *output, GenerationMode mode) +FSTProcessor::bilingual_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode) { setNullFlush(false); nullFlushGeneration = true; - while(!feof(input)) + while(!input.eof()) { bilingual(input, output, mode); - fputwc_unlocked(L'\0', output); - int code = fflush(output); - if(code != 0) - { - wcerr << L"Could not flush output " << errno << endl; - } + u_fputc('\0', output); + u_fflush(output); } } -wstring -FSTProcessor::compose(wstring const &lexforms, wstring const &queue) const +UString +FSTProcessor::compose(UString const &lexforms, UString const &queue) const { - wstring result = L""; + UString result; + result.reserve(lexforms.size() + 2 * queue.size()); + result += '/'; for(unsigned int i = 1; i< lexforms.size(); i++) { - if(lexforms[i] == L'\\') + if(lexforms[i] == '\\') { - result += L'\\'; + result += '\\'; i++; } - else if(lexforms[i] == L'/') + else if(lexforms[i] == '/') { result.append(queue); } result += lexforms[i]; } - return L"/" + result + queue; + result += queue; + return result; } void -FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) +FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode) { if(getNullFlush()) { @@ -3075,20 +2593,20 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) } State current_state = initial_state; - wstring sf = L""; // source language analysis - wstring queue = L""; // symbols to be added to each target - wstring result = L""; // result of looking up analysis in bidix + UString sf; // source language analysis + UString queue; // symbols to be added to each target + UString result; // result of looking up analysis in bidix outOfWord = false; - skipUntil(input, output, L'^'); - pair tr; // readBilingual return value, containing: + skipUntil(input, output, '^'); + pair tr; // readBilingual return value, containing: int val; // the alphabet value of current symbol, and - wstring symbol = L""; // the current symbol as a string + UString symbol; // the current symbol as a string bool seentags = false; // have we seen any tags at all in the analysis? bool seensurface = false; - wstring surface = L""; + UString surface; while(true) // ie. while(val != 0x7fffffff) { @@ -3096,17 +2614,17 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) symbol = tr.first; val = tr.second; - //fwprintf(stderr, L"> %ls : %lc : %d\n", tr.first.c_str(), tr.second, tr.second); + //fprintf(stderr, "> %ls : %lc : %d\n", tr.first.c_str(), tr.second, tr.second); if(biltransSurfaceForms && !seensurface && !outOfWord) { - while(val != L'/' && val != 0x7fffffff) + while(val != '/' && val != 0x7fffffff) { surface = surface + symbol; alphabet.getSymbol(surface, val); tr = readBilingual(input, output); symbol = tr.first; val = tr.second; - //fwprintf(stderr, L" == %ls : %lc : %d => %ls\n", symbol.c_str(), val, val, surface.c_str()); + //fprintf(stderr, " == %ls : %lc : %d => %ls\n", symbol.c_str(), val, val, surface.c_str()); } seensurface = true; tr = readBilingual(input, output); @@ -3119,12 +2637,12 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) break; } - if(val == L'$' && outOfWord) + if(val == '$' && outOfWord) { if(!seentags) // if no tags: only return complete matches { - bool uppercase = sf.size() > 1 && iswupper(sf[1]); - bool firstupper= iswupper(sf[0]); + bool uppercase = sf.size() > 1 && u_isupper(sf[1]); + bool firstupper= u_isupper(sf[0]); result = current_state.filterFinals(all_finals, alphabet, escaped_chars, @@ -3132,16 +2650,16 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) uppercase, firstupper, 0); } - if(sf[0] == L'*') + if(sf[0] == '*') { if (mode == gm_clean) { - printWordBilingual(sf, L"/" + sf.substr(1), output); + printWordBilingual(sf, "/"_u + sf.substr(1), output); } else { - printWordBilingual(sf, L"/" + sf, output); + printWordBilingual(sf, "/"_u + sf, output); } } - else if(result != L"") + else if(!result.empty()) { printWordBilingual(sf, compose(result, queue), output); } @@ -3149,30 +2667,30 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) { //xxx if(biltransSurfaceForms) { - printWordBilingual(surface, L"/@"+surface, output); + printWordBilingual(surface, "/@"_u + surface, output); } else { - printWordBilingual(sf, L"/@"+sf, output); + printWordBilingual(sf, "/@"_u + sf, output); } } seensurface = false; - surface = L""; - queue = L""; - result = L""; + surface.clear(); + queue.clear(); + result.clear(); current_state = initial_state; - sf = L""; + sf.clear(); seentags = false; } - else if(iswspace(val) && sf.size() == 0) + else if(u_isspace(val) && sf.size() == 0) { // do nothing } - else if(sf.size() > 0 && sf[0] == L'*') + else if(sf.size() > 0 && sf[0] == '*') { if(escaped_chars.find(val) != escaped_chars.end()) { - sf += L'\\'; + sf += '\\'; } alphabet.getSymbol(sf, val); // add symbol to sf iff alphabetic if(val == 0) // non-alphabetic, possibly unknown tag; add to sf @@ -3184,7 +2702,7 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) { if(escaped_chars.find(val) != escaped_chars.end()) { - sf += L'\\'; + sf += '\\'; } alphabet.getSymbol(sf, val); // add symbol to sf iff alphabetic if(val == 0) // non-alphabetic, possibly unknown tag; add to sf @@ -3197,9 +2715,9 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } else { @@ -3208,16 +2726,16 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) } if(current_state.isFinal(all_finals)) { - bool uppercase = sf.size() > 1 && iswupper(sf[1]); - bool firstupper= iswupper(sf[0]); + bool uppercase = sf.size() > 1 && u_isupper(sf[1]); + bool firstupper= u_isupper(sf[0]); - queue = L""; // the intervening tags were matched + queue.clear(); // the intervening tags were matched result = current_state.filterFinals(all_finals, alphabet, escaped_chars, displayWeightsMode, maxAnalyses, maxWeightClasses, uppercase, firstupper, 0); } - else if(result != L"") + else if(!result.empty()) { // We already have a result, but there is still more to read // of the analysis; following tags are not consumed, but @@ -3234,21 +2752,21 @@ FSTProcessor::bilingual(FILE *input, FILE *output, GenerationMode mode) else if(current_state.size() == 0) { // There are no more alive transductions and the current symbol is not a tag -- unknown word! - result = L""; + result.clear(); } } } } } -pair -FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) +pair +FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; - wstring queue = L""; + UString queue; bool mark = false; bool seentags = false; // have we seen any tags at all in the analysis? @@ -3258,38 +2776,38 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { - return pair(input_word, 0); + return pair(input_word, 0); } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; } - bool firstupper = iswupper(input_word[start_point]); - bool uppercase = firstupper && iswupper(input_word[start_point+1]); + bool firstupper = u_isupper(input_word[start_point]); + bool uppercase = firstupper && u_isupper(input_word[start_point+1]); for(unsigned int i = start_point; i <= end_point; i++) { int val = 0; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; val = input_word[i]; } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { seentags = true; - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@ -3303,9 +2821,9 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } else { @@ -3314,37 +2832,22 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = L"^=" + result.substr(1); - } - else - { - result[0] = L'^'; - } + result.clear(); + if (with_delim) { + result += '^'; } - else - { - if(mark) - { - result = L"=" + result.substr(1); - } - else - { - result = result.substr(1); - } + if (mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) { - if(symbol != L"" && result != L"") + if(!symbol.empty() && !result.empty()) { queue.append(symbol); } @@ -3353,52 +2856,51 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } - return pair(result, 0); + return pair(result, 0); } } } if (!seentags - && L"" == current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0)) + && current_state.filterFinals(all_finals, alphabet, escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).empty()) { // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } - return pair(result, 0); + return pair(result, 0); } // attach unmatched queue automatically - if(queue != L"") + if(!queue.empty()) { - wstring result_with_queue = L""; + UString result_with_queue; for(unsigned int i = 0, limit = result.size(); i != limit; i++) { switch(result[i]) { - case L'\\': - result_with_queue += L'\\'; + case '\\': + result_with_queue += '\\'; i++; break; - case L'/': + case '/': result_with_queue.append(queue); break; @@ -3411,25 +2913,25 @@ FSTProcessor::biltransWithQueue(wstring const &input_word, bool with_delim) if(with_delim) { - result_with_queue += L'$'; + result_with_queue += '$'; } - return pair(result_with_queue, queue.size()); + return pair(result_with_queue, queue.size()); } else { if(with_delim) { - result += L'$'; + result += '$'; } - return pair(result, 0); + return pair(result, 0); } } -wstring -FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) +UString +FSTProcessor::biltransWithoutQueue(UString const &input_word, bool with_delim) { State current_state = initial_state; - wstring result = L""; + UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; bool mark = false; @@ -3440,37 +2942,37 @@ FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) end_point = input_word.size()-1; } - if(input_word[start_point] == L'*') + if(input_word[start_point] == '*') { return input_word; } - if(input_word[start_point] == L'=') + if(input_word[start_point] == '=') { start_point++; mark = true; } - bool firstupper = iswupper(input_word[start_point]); - bool uppercase = firstupper && iswupper(input_word[start_point+1]); + bool firstupper = u_isupper(input_word[start_point]); + bool uppercase = firstupper && u_isupper(input_word[start_point+1]); for(unsigned int i = start_point; i <= end_point; i++) { int val; - wstring symbol = L""; + UString symbol; - if(input_word[i] == L'\\') + if(input_word[i] == '\\') { i++; - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } - else if(input_word[i] == L'<') + else if(input_word[i] == '<') { - symbol = L'<'; + symbol = '<'; for(unsigned int j = i + 1; j <= end_point; j++) { symbol += input_word[j]; - if(input_word[j] == L'>') + if(input_word[j] == '>') { i = j; break; @@ -3480,13 +2982,13 @@ FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) } else { - val = static_cast(input_word[i]); + val = static_cast(input_word[i]); } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && iswupper(val) && !caseSensitive) + if(!alphabet.isTag(val) && u_isupper(val) && !caseSensitive) { - current_state.step(val, towlower(val)); + current_state.step(val, u_tolower(val)); } else { @@ -3495,46 +2997,31 @@ FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = L"^=" + result.substr(1); - } - else - { - result[0] = L'^'; - } + result.clear(); + if (with_delim) { + result += '^'; } - else - { - if(mark) - { - result = L"=" + result.substr(1); - } - else - { - result = result.substr(1); - } + if (mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) { - if(symbol == L"") + if(symbol.empty()) { // word is not present if(with_delim) { - result = L"^@" + input_word.substr(1); + result = "^@"_u + input_word.substr(1); } else { - result = L"@" + input_word; + result = "@"_u + input_word; } return result; } @@ -3543,7 +3030,7 @@ FSTProcessor::biltransWithoutQueue(wstring const &input_word, bool with_delim) if(with_delim) { - result += L'$'; + result += '$'; } return result; } @@ -3554,16 +3041,16 @@ FSTProcessor::valid() const { if(initial_state.isFinal(all_finals)) { - wcerr << L"Error: Invalid dictionary (hint: the left side of an entry is empty)" << endl; + cerr << "Error: Invalid dictionary (hint: the left side of an entry is empty)" << endl; return false; } else { State s = initial_state; - s.step(L' '); + s.step(' '); if(s.size() != 0) { - wcerr << L"Error: Invalid dictionary (hint: entry beginning with whitespace)" << endl; + cerr << "Error: Invalid dictionary (hint: entry beginning with whitespace)" << endl; return false; } } @@ -3572,45 +3059,45 @@ FSTProcessor::valid() const } int -FSTProcessor::readSAO(FILE *input) +FSTProcessor::readSAO(InputFile& input) { if(!input_buffer.isEmpty()) { return input_buffer.next(); } - wchar_t val = static_cast(fgetwc_unlocked(input)); - if(feof(input)) + UChar32 val = input.get(); + if(input.eof()) { return 0; } if(escaped_chars.find(val) != escaped_chars.end()) { - if(val == L'<') + if(val == '<') { - wstring str = readFullBlock(input, L'<', L'>'); - if(str.substr(0, 9) == L"'); + if(str.substr(0, 9) == "") + while(str.substr(str.size()-3) != "]]>"_u) { - str.append(readFullBlock(input, L'<', L'>').substr(1)); + str.append(input.readBlock('<', '>').substr(1)); } blankqueue.push(str); - input_buffer.add(static_cast(L' ')); - return static_cast(L' '); + input_buffer.add(static_cast(' ')); + return static_cast(' '); } else { streamError(); } } - else if (val == L'\\') { - val = static_cast(fgetwc_unlocked(input)); + else if (val == '\\') { + val = input.get(); if(isEscaped(val)) { input_buffer.add(val); - return static_cast(val); + return static_cast(val); } else streamError(); @@ -3621,47 +3108,47 @@ FSTProcessor::readSAO(FILE *input) } } - input_buffer.add(val); - return static_cast(val); + input_buffer.add(static_cast(val)); + return static_cast(val); } void -FSTProcessor::printSAOWord(wstring const &lf, FILE *output) +FSTProcessor::printSAOWord(UString const &lf, UFILE *output) { for(unsigned int i = 1, limit = lf.size(); i != limit; i++) { - if(lf[i] == L'/') + if(lf[i] == '/') { break; } - fputwc_unlocked(lf[i], output); + u_fputc(lf[i], output); } } void -FSTProcessor::SAO(FILE *input, FILE *output) +FSTProcessor::SAO(InputFile& input, UFILE *output) { bool last_incond = false; bool last_postblank = false; State current_state = initial_state; - wstring lf = L""; - wstring sf = L""; + UString lf; + UString sf; int last = 0; escaped_chars.clear(); - escaped_chars.insert(static_cast(L'\\')); - escaped_chars.insert(static_cast(L'<')); - escaped_chars.insert(static_cast(L'>')); + escaped_chars.insert('\\'); + escaped_chars.insert('<'); + escaped_chars.insert('>'); - while(wchar_t val = readSAO(input)) + while(UChar32 val = readSAO(input)) { // test for final states if(current_state.isFinal(all_finals)) { if(current_state.isFinal(inconditional)) { - bool firstupper = iswupper(sf[0]); - bool uppercase = firstupper && iswupper(sf[sf.size()-1]); + bool firstupper = u_isupper(sf[0]); + bool uppercase = firstupper && u_isupper(sf[sf.size()-1]); lf = current_state.filterFinalsSAO(all_finals, alphabet, escaped_chars, @@ -3671,8 +3158,8 @@ FSTProcessor::SAO(FILE *input, FILE *output) } else if(current_state.isFinal(postblank)) { - bool firstupper = iswupper(sf[0]); - bool uppercase = firstupper && iswupper(sf[sf.size()-1]); + bool firstupper = u_isupper(sf[0]); + bool uppercase = firstupper && u_isupper(sf[sf.size()-1]); lf = current_state.filterFinalsSAO(all_finals, alphabet, escaped_chars, @@ -3682,8 +3169,8 @@ FSTProcessor::SAO(FILE *input, FILE *output) } else if(!isAlphabetic(val)) { - bool firstupper = iswupper(sf[0]); - bool uppercase = firstupper && iswupper(sf[sf.size()-1]); + bool firstupper = u_isupper(sf[0]); + bool uppercase = firstupper && u_isupper(sf[sf.size()-1]); lf = current_state.filterFinalsSAO(all_finals, alphabet, escaped_chars, @@ -3693,23 +3180,16 @@ FSTProcessor::SAO(FILE *input, FILE *output) last = input_buffer.getPos(); } } - else if(sf == L"" && iswspace(val)) + else if(sf.empty() && u_isspace(val)) { - lf = L"/*"; + lf = "/*"_u; lf.append(sf); last_postblank = false; last_incond = false; last = input_buffer.getPos(); } - if(!iswupper(val) || caseSensitive) - { - current_state.step(val); - } - else - { - current_state.step(val, towlower(val)); - } + current_state.step_case(val, caseSensitive); if(current_state.size() != 0) { @@ -3717,9 +3197,9 @@ FSTProcessor::SAO(FILE *input, FILE *output) } else { - if(!isAlphabetic(val) && sf == L"") + if(!isAlphabetic(val) && sf.empty()) { - if(iswspace(val)) + if(u_isspace(val)) { printSpace(val, output); } @@ -3727,9 +3207,9 @@ FSTProcessor::SAO(FILE *input, FILE *output) { if(isEscaped(val)) { - fputwc_unlocked(L'\\', output); + u_fputc('\\', output); } - fputwc_unlocked(val, output); + u_fputc(val, output); } } else if(last_incond) @@ -3741,13 +3221,13 @@ FSTProcessor::SAO(FILE *input, FILE *output) else if(last_postblank) { printSAOWord(lf, output); - fputwc_unlocked(L' ', output); + u_fputc(' ', output); input_buffer.setPos(last); input_buffer.back(1); } else if(isAlphabetic(val) && ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || - lf == L"")) + lf.empty())) { do { @@ -3757,21 +3237,17 @@ FSTProcessor::SAO(FILE *input, FILE *output) unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(L"", output); - fputws_unlocked(sf.c_str(), output); - fputws_unlocked(L"", output); + u_fprintf(output, "%S", sf.c_str()); } - else if(lf == L"") + else if(lf.empty()) { unsigned int limit = firstNotAlpha(sf); unsigned int size = sf.size(); - limit = (limit == static_cast(wstring::npos)?size:limit); + limit = (limit == static_cast(UString::npos)?size:limit); input_buffer.back(1+(size-limit)); - fputws_unlocked(L"", output); - fputws_unlocked(sf.c_str(), output); - fputws_unlocked(L"", output); + u_fprintf(output, "%S", sf.c_str()); } else { @@ -3781,8 +3257,8 @@ FSTProcessor::SAO(FILE *input, FILE *output) } current_state = initial_state; - lf = L""; - sf = L""; + lf.clear(); + sf.clear(); last_incond = false; last_postblank = false; } @@ -3792,12 +3268,12 @@ FSTProcessor::SAO(FILE *input, FILE *output) flushBlanks(output); } -wstring -FSTProcessor::removeTags(wstring const &str) +UString +FSTProcessor::removeTags(UString const &str) { for(unsigned int i = 0; i < str.size(); i++) { - if(str[i] == L'<' && i >=1 && str[i-1] != L'\\') + if(str[i] == '<' && i >=1 && str[i-1] != '\\') { return str.substr(0, i); } @@ -3880,7 +3356,7 @@ FSTProcessor::getNullFlush() } size_t -FSTProcessor::firstNotAlpha(wstring const &sf) +FSTProcessor::firstNotAlpha(UString const &sf) { for(size_t i = 0, limit = sf.size(); i < limit; i++) { @@ -3890,5 +3366,5 @@ FSTProcessor::firstNotAlpha(wstring const &sf) } } - return wstring::npos; + return UString::npos; } diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 628356d..32263ac 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -18,19 +18,20 @@ #ifndef _FSTPROCESSOR_ #define _FSTPROCESSOR_ +#include #include #include -#include #include #include #include +#include #include -#include #include #include #include #include +#include using namespace std; @@ -56,7 +57,7 @@ private: /** * Transducers in FSTP */ - map transducers; + map transducers; /** * Current state of lexical analysis @@ -71,7 +72,7 @@ private: /** * Default value of weight unless specified */ - double default_weight; + double default_weight = 0.0000; /** * The final states of inconditional sections in the dictionaries @@ -101,27 +102,27 @@ private: /** * Queue of blanks, used in reading methods */ - queue blankqueue; + queue blankqueue; /** * Queue of wordbound blanks, used in reading methods */ - queue wblankqueue; + queue wblankqueue; /** * Set of characters being considered alphabetics */ - set alphabetic_chars; + set alphabetic_chars; /** * Set of characters to escape with a backslash */ - set escaped_chars; + set escaped_chars; /** * Set of characters to ignore */ - set ignored_chars; + set ignored_chars; /** * Mapping of characters for simplistic diacritic restoration specified in RCX files @@ -141,7 +142,7 @@ private: /** * Input buffer */ - Buffer input_buffer; + Buffer input_buffer; /** * Begin of the transducer @@ -151,86 +152,86 @@ private: /** * true if the position of input stream is out of a word */ - bool outOfWord; + bool outOfWord = false; /** * true if we're automatically removing surface forms. */ - bool biltransSurfaceForms; + bool biltransSurfaceForms = false; /** * if true, makes always difference between uppercase and lowercase * characters */ - bool caseSensitive; + bool caseSensitive = false; /** * if true, uses the dictionary case, discarding surface case * information */ - bool dictionaryCase; + bool dictionaryCase = false; /** * if true, flush the output when the null character is found */ - bool nullFlush; + bool nullFlush = false; /** * nullFlush property for the skipUntil function */ - bool nullFlushGeneration; + bool nullFlushGeneration = false; /** * if true, ignore the provided set of characters */ - bool useIgnoredChars; + bool useIgnoredChars = false; /** * if true, attempt simplistic diacritic restoration */ - bool useRestoreChars; + bool useRestoreChars = false; /** * if true, skips loading the default set of ignored characters */ - bool useDefaultIgnoredChars; + bool useDefaultIgnoredChars = true; /** * if true, displays the final weights (if any) */ - bool displayWeightsMode; + bool displayWeightsMode = false; /** * try analysing unknown words as compounds */ - bool do_decomposition; + bool do_decomposition = false; /** * Symbol of CompoundOnlyL */ - int compoundOnlyLSymbol; + int compoundOnlyLSymbol = 0; /** * Symbol of CompoundR */ - int compoundRSymbol; + int compoundRSymbol = 0; /** * Show or not the controls symbols (as compoundRSymbol) */ - bool showControlSymbols; + bool showControlSymbols = false; /** * Max compound elements * Hard coded for now, but there might come a switch one day */ - int compound_max_elements; + int compound_max_elements = 4; /** * Output no more than 'N' number of weighted analyses */ - int maxAnalyses; + int maxAnalyses = INT_MAX; /** * True if a wblank block ([[..]]xyz[[/]]) was just read @@ -250,62 +251,41 @@ private: /** * Output no more than 'N' best weight classes */ - int maxWeightClasses; + int maxWeightClasses = INT_MAX; /** * Prints an error of input stream and exits */ void streamError(); - /** - * Reads a character that is defined in the set of escaped_chars - * @param input the stream to read from - * @return code of the character - */ - wchar_t readEscaped(FILE *input); - - /** - * Reads a block from the stream input, enclosed by delim1 and delim2 - * @param input the stream being read - * @param delim1 the delimiter of the beginning of the sequence - * @param delim1 the delimiter of the end of the sequence - */ - wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); - - /** - * Reads a wordbound blank from the stream input - * @param input the stream being read - */ - wstring readWblank(FILE *input); - /** * Reads a wordbound blank (opening blank to closing blank) from the stream input -> [[...]]xyz[[/]] * @param input the stream being read * @param output the stream to write on * @return true if the word enclosed by the wordbound blank has a ~ for postgeneration activation */ - bool wblankPostGen(FILE *input, FILE *output); + bool wblankPostGen(InputFile& input, UFILE *output); /** * Returns true if the character code is identified as alphabetic * @param c the code provided by the user * @return true if it's alphabetic */ - bool isAlphabetic(wchar_t const c) const; + bool isAlphabetic(UChar32 const c) const; /** * Tests if a character is in the set of escaped_chars * @param c the character code provided by the user * @return true if it is in the set */ - bool isEscaped(wchar_t const c) const; + bool isEscaped(UChar32 const c) const; /** * Read text from stream (analysis version) * @param input the stream to read * @return the next symbol in the stream */ - int readAnalysis(FILE *input); + int readAnalysis(InputFile& input); /** * Read text from stream (decomposition version) @@ -313,7 +293,7 @@ private: * @param output the stream to write on * @return the next symbol in the stream */ - int readDecomposition(FILE *input, FILE *output); + int readDecomposition(InputFile& input, UFILE *output); /** * Read text from stream (postgeneration version) @@ -321,7 +301,7 @@ private: * @param output the stream to write on * @return the next symbol in the stream */ - int readPostgeneration(FILE *input, FILE *output); + int readPostgeneration(InputFile& input, UFILE *output); /** * Read text from stream (generation version) @@ -329,7 +309,7 @@ private: * @param output the stream being written to * @return the next symbol in the stream */ - int readGeneration(FILE *input, FILE *output); + int readGeneration(InputFile& input, UFILE *output); /** * Read text from stream (biltrans version) @@ -337,26 +317,26 @@ private: * @param output the stream to write on * @return the queue of 0-symbols, and the next symbol in the stream */ - pair readBilingual(FILE *input, FILE *output); + pair readBilingual(InputFile& input, UFILE *output); /** * Read text from stream (SAO version) * @param input the stream to read * @return the next symbol in the stream */ - int readSAO(FILE *input); + int readSAO(InputFile& input); /** * Flush all the blanks remaining in the current process * @param output stream to write blanks */ - void flushBlanks(FILE *output); + void flushBlanks(UFILE *output); /** * Flush all the wordbound blanks remaining in the current process * @param output stream to write blanks */ - void flushWblanks(FILE *output); + void flushWblanks(UFILE *output); /** * Combine wordbound blanks in the queue and return them. @@ -370,7 +350,7 @@ private: * * @return final wblank string */ - wstring combineWblanks(); + UString combineWblanks(); /** * Calculate the initial state of parsing @@ -387,7 +367,7 @@ private: * @param str the string to write, escaping characters * @param output the stream to write in */ - void writeEscaped(wstring const &str, FILE *output); + void writeEscaped(UString const &str, UFILE *output); /** * Write a string to an output stream. @@ -398,7 +378,7 @@ private: * @param output the stream to write in * @return how many blanks to pop and print after printing lu */ - size_t writeEscapedPopBlanks(wstring const &str, FILE *output); + size_t writeEscapedPopBlanks(UString const &str, UFILE *output); /** * Write a string to an output stream, escaping all escapable characters @@ -406,7 +386,7 @@ private: * @param str the string to write, escaping characters * @param output the stream to write in */ - void writeEscapedWithTags(wstring const &str, FILE *output); + void writeEscapedWithTags(UString const &str, UFILE *output); /** @@ -415,7 +395,7 @@ private: * @param the searched suffix * @returns true if 'str' has the suffix 'suffix' */ - static bool endsWith(wstring const &str, wstring const &suffix); + static bool endsWith(UString const &str, UString const &suffix); /** * Prints a word @@ -423,7 +403,7 @@ private: * @param lf lexical form of the word * @param output stream where the word is written */ - void printWord(wstring const &sf, wstring const &lf, FILE *output); + void printWord(UString const &sf, UString const &lf, UFILE *output); /** * Prints a word. @@ -433,7 +413,7 @@ private: * @param lf lexical form of the word * @param output stream where the word is written */ - void printWordPopBlank(wstring const &sf, wstring const &lf, FILE *output); + void printWordPopBlank(UString const &sf, UString const &lf, UFILE *output); /** * Prints a word (Bilingual version) @@ -441,7 +421,7 @@ private: * @param lf lexical form of the word * @param output stream where the word is written */ - void printWordBilingual(wstring const &sf, wstring const &lf, FILE *output); + void printWordBilingual(UString const &sf, UString const &lf, UFILE *output); /** @@ -449,21 +429,21 @@ private: * @param lf lexical form * @param output stream where the word is written */ - void printSAOWord(wstring const &lf, FILE *output); + void printSAOWord(UString const &lf, UFILE *output); /** * Prints an unknown word * @param sf surface form of the word * @param output stream where the word is written */ - void printUnknownWord(wstring const &sf, FILE *output); + void printUnknownWord(UString const &sf, UFILE *output); void initDecompositionSymbols(); - vector numbers; - int readTMAnalysis(FILE *input); + vector numbers; + int readTMAnalysis(InputFile& input); - unsigned int lastBlank(wstring const &str); + unsigned int lastBlank(UString const &str); /** * Print one blankqueue item if there is one, or a given "space" value. @@ -471,32 +451,46 @@ private: * @param val the space character to use if no blank queue * @param output stream where the word is written */ - void printSpace(wchar_t const val, FILE *output); + void printSpace(UChar const val, UFILE *output); - void skipUntil(FILE *input, FILE *output, wint_t const character); - static wstring removeTags(wstring const &str); - wstring compoundAnalysis(wstring str, bool uppercase, bool firstupper); - size_t firstNotAlpha(wstring const &sf); + void skipUntil(InputFile& input, UFILE *output, UChar32 const character); + static UString removeTags(UString const &str); + UString compoundAnalysis(UString str, bool uppercase, bool firstupper); + size_t firstNotAlpha(UString const &sf); - void analysis_wrapper_null_flush(FILE *input, FILE *output); - void lsx_wrapper_null_flush(FILE *input, FILE *output); - void bilingual_wrapper_null_flush(FILE *input, FILE *output, GenerationMode mode = gm_unknown); - void generation_wrapper_null_flush(FILE *input, FILE *output, + void analysis_wrapper_null_flush(InputFile& input, UFILE *output); + void bilingual_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); + void generation_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode); - void postgeneration_wrapper_null_flush(FILE *input, FILE *output); - void intergeneration_wrapper_null_flush(FILE *input, FILE *output); - void transliteration_wrapper_null_flush(FILE *input, FILE *output); + void postgeneration_wrapper_null_flush(InputFile& input, UFILE *output); + void intergeneration_wrapper_null_flush(InputFile& input, UFILE *output); + void transliteration_wrapper_null_flush(InputFile& input, UFILE *output); - wstring compose(wstring const &lexforms, wstring const &queue) const; + UString compose(UString const &lexforms, UString const &queue) const; void procNodeICX(); void procNodeRCX(); void initDefaultIgnoredCharacters(); - bool isLastBlankTM; + bool isLastBlankTM = false; xmlTextReaderPtr reader; public: + + /* + * String constants + */ + static UString const XML_TEXT_NODE; + static UString const XML_COMMENT_NODE; + static UString const XML_IGNORED_CHARS_ELEM; + static UString const XML_RESTORE_CHAR_ELEM; + static UString const XML_RESTORE_CHARS_ELEM; + static UString const XML_VALUE_ATTR; + static UString const XML_CHAR_ELEM; + static UString const WBLANK_START; + static UString const WBLANK_END; + static UString const WBLANK_FINAL; + FSTProcessor(); void initAnalysis(); @@ -507,25 +501,23 @@ public: void initBiltrans(); void initDecomposition(); - void analysis(FILE *input = stdin, FILE *output = stdout); - void tm_analysis(FILE *input = stdin, FILE *output = stdout); - void generation(FILE *input = stdin, FILE *output = stdout, GenerationMode mode = gm_unknown); - void postgeneration(FILE *input = stdin, FILE *output = stdout); - void intergeneration(FILE *input = stdin, FILE *output = stdout); - void transliteration(FILE *input = stdin, FILE *output = stdout); - wstring biltrans(wstring const &input_word, bool with_delim = true); - wstring biltransfull(wstring const &input_word, bool with_delim = true); - void bilingual(FILE *input = stdin, FILE *output = stdout, GenerationMode mode = gm_unknown); - pair biltransWithQueue(wstring const &input_word, bool with_delim = true); - wstring biltransWithoutQueue(wstring const &input_word, bool with_delim = true); - void SAO(FILE *input = stdin, FILE *output = stdout); + void analysis(InputFile& input, UFILE *output); + void tm_analysis(InputFile& input, UFILE *output); + void generation(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); + void postgeneration(InputFile& input, UFILE *output); + void intergeneration(InputFile& input, UFILE *output); + void transliteration(InputFile& input, UFILE *output); + UString biltrans(UString const &input_word, bool with_delim = true); + UString biltransfull(UString const &input_word, bool with_delim = true); + void bilingual(InputFile& input, UFILE *output, GenerationMode mode = gm_unknown); + pair biltransWithQueue(UString const &input_word, bool with_delim = true); + UString biltransWithoutQueue(UString const &input_word, bool with_delim = true); + void SAO(InputFile& input, UFILE *output); void parseICX(string const &file); void parseRCX(string const &file); void load(FILE *input); - void lsx(FILE *input, FILE *output); - bool valid() const; void setCaseSensitiveMode(bool const value); diff --git a/lttoolbox/input_file.cc b/lttoolbox/input_file.cc new file mode 100644 index 0000000..307c8c9 --- /dev/null +++ b/lttoolbox/input_file.cc @@ -0,0 +1,195 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +InputFile::InputFile() + : infile(stdin), buffer_size(0) +{} + +InputFile::~InputFile() +{ + close(); +} + +bool +InputFile::open(const char* fname) +{ + close(); + if (fname == nullptr) { + infile = stdin; + } else { + infile = fopen(fname, "rb"); + } + return (infile != nullptr); +} + +void +InputFile::open_or_exit(const char* fname) +{ + if (!open(fname)) { + std::cerr << "Error: Unable to open '" << fname << "' for reading." << std::endl; + exit(EXIT_FAILURE); + } +} + +void +InputFile::close() +{ + if (infile != nullptr) { + if (infile != stdin) { + fclose(infile); + } + infile = nullptr; + } +} + +void +InputFile::wrap(FILE* newinfile) +{ + close(); + infile = newinfile; +} + +void +InputFile::internal_read() +{ + if (buffer_size) { + return; + } + if (feof(infile)) { + ubuffer[buffer_size++] = U_EOF; + return; + } + int i = 1; + cbuffer[0] = fgetc_unlocked(infile); + if (cbuffer[0] == EOF) { + ubuffer[buffer_size++] = U_EOF; + return; + } else if (cbuffer[0] == '\0') { + ubuffer[buffer_size++] = '\0'; + return; + } + if ((cbuffer[0] & 0xF0) == 0xF0) { + i += 3; + if (fread_unlocked(cbuffer+1, 1, 3, infile) != 3) { + throw std::runtime_error("Could not read 3 expected bytes from stream"); + } + } else if ((cbuffer[0] & 0xE0) == 0xE0) { + i += 2; + if (fread_unlocked(cbuffer+1, 1, 2, infile) != 2) { + throw std::runtime_error("Could not read 2 expected bytes from stream"); + } + } else if ((cbuffer[0] & 0xC0) == 0xC0) { + i += 1; + if (fread_unlocked(cbuffer+1, 1, 1, infile) != 1) { + throw std::runtime_error("Could not read 1 expected byte from stream"); + } + } + memset(ubuffer, 0, 3*sizeof(UChar)); + utf8::utf8to32(cbuffer, cbuffer+i, ubuffer); + buffer_size = 1; +} + +UChar32 +InputFile::get() +{ + if (!buffer_size) { + internal_read(); + } + return ubuffer[--buffer_size]; +} + +UChar32 +InputFile::peek() +{ + if (!buffer_size) { + internal_read(); + } + return ubuffer[buffer_size-1]; +} + +void +InputFile::unget(UChar32 c) +{ + // this will probably segfault if called multiple times + ubuffer[buffer_size++] = c; +} + +bool +InputFile::eof() +{ + return (infile == nullptr) || feof(infile); +} + +void +InputFile::rewind() +{ + if (infile != nullptr) { + if (std::fseek(infile, 0, SEEK_SET) != 0) { + std::cerr << "Error: Unable to rewind file" << std::endl; + exit(EXIT_FAILURE); + } + } +} + +UString +InputFile::readBlock(const UChar32 start, const UChar32 end) +{ + UString ret; + ret += start; + UChar32 c = 0; + while (c != end && !eof()) { + c = get(); + if (c == '\0') { + break; + } + ret += c; + if (c == '\\') { + ret += get(); + } + } + return ret; +} + +UString +InputFile::finishWBlank() +{ + UString ret; + ret += '['; + ret += '['; + UChar32 c = 0; + while (!eof()) { + c = get(); + if (c == '\0') { + break; + } + ret += c; + if (c == '\\') { + ret += get(); + } else if (c == ']' && peek() == ']') { + ret += get(); + break; + } + } + return ret; +} diff --git a/lttoolbox/input_file.h b/lttoolbox/input_file.h new file mode 100644 index 0000000..de031c8 --- /dev/null +++ b/lttoolbox/input_file.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LT_INPUT_FILE_H_ +#define _LT_INPUT_FILE_H_ + +#include +#include +#include + +class InputFile +{ +private: + FILE* infile; + UChar32 ubuffer[3]; + char cbuffer[4]; + int buffer_size; + void internal_read(); +public: + InputFile(); + ~InputFile(); + bool open(const char* fname = nullptr); + void open_or_exit(const char* fname = nullptr); + void close(); + void wrap(FILE* newinfile); + UChar32 get(); + UChar32 peek(); + void unget(UChar32 c); + bool eof(); + void rewind(); + // assumes that start has already been read + // returns string from start to end inclusive + // respects backslash escapes + UString readBlock(const UChar32 start, const UChar32 end); + // assumes [[ has already been read, reads to ]] + // returns entire string, including brackets + UString finishWBlank(); +}; + +#endif diff --git a/lttoolbox/lt_comp.cc b/lttoolbox/lt_comp.cc index 9d05c21..0202343 100644 --- a/lttoolbox/lt_comp.cc +++ b/lttoolbox/lt_comp.cc @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -103,21 +102,21 @@ int main(int argc, char *argv[]) switch (cnt) { case 'a': - c.setAltValue(optarg); + c.setAltValue(to_ustring(optarg)); break; case 'v': - c.setVariantValue(optarg); + c.setVariantValue(to_ustring(optarg)); break; case 'l': vl = optarg; - c.setVariantLeftValue(vl); + c.setVariantLeftValue(to_ustring(optarg)); break; case 'r': vr = optarg; - c.setVariantRightValue(vr); + c.setVariantRightValue(to_ustring(optarg)); break; case 'm': @@ -176,7 +175,7 @@ int main(int argc, char *argv[]) } else { - wcerr << "Error: Cannot not open file '" << infile << "'." << endl << endl; + cerr << "Error: Cannot not open file '" << infile << "'." << endl << endl; exit(EXIT_FAILURE); } initGenericErrorDefaultFunc(NULL); @@ -192,7 +191,7 @@ int main(int argc, char *argv[]) if(ttype == 'a') { LtLocale::tryToSetLocale(); - a.parse(infile, Compiler::COMPILER_RESTRICTION_LR_VAL); + a.parse(infile, false); } else { @@ -214,7 +213,7 @@ int main(int argc, char *argv[]) if(ttype == 'a') { LtLocale::tryToSetLocale(); - a.parse(infile, Compiler::COMPILER_RESTRICTION_RL_VAL); + a.parse(infile, true); } else { @@ -230,7 +229,7 @@ int main(int argc, char *argv[]) FILE *output = fopen(outfile.c_str(), "wb"); if(!output) { - wcerr << "Error: Cannot open file '" << outfile << "'." << endl; + cerr << "Error: Cannot open file '" << outfile << "'." << endl; exit(EXIT_FAILURE); } if(ttype == 'a') diff --git a/lttoolbox/lt_expand.cc b/lttoolbox/lt_expand.cc index 283f209..3d9facc 100644 --- a/lttoolbox/lt_expand.cc +++ b/lttoolbox/lt_expand.cc @@ -55,7 +55,8 @@ void endProgram(char *name) int main(int argc, char *argv[]) { - FILE *input = NULL, *output = NULL; + FILE* input = NULL; + UFILE* output = NULL; Expander e; e.setKeepBoundaries(false); @@ -86,15 +87,15 @@ int main(int argc, char *argv[]) switch (cnt) { case 'a': - e.setAltValue(optarg); + e.setAltValue(to_ustring(optarg)); break; case 'v': - e.setVariantValue(optarg); + e.setVariantValue(to_ustring(optarg)); break; case 'l': - e.setVariantLeftValue(optarg); + e.setVariantLeftValue(to_ustring(optarg)); break; case 'm': @@ -102,7 +103,7 @@ int main(int argc, char *argv[]) break; case 'r': - e.setVariantRightValue(optarg); + e.setVariantRightValue(to_ustring(optarg)); break; case 'h': @@ -122,11 +123,11 @@ int main(int argc, char *argv[]) input = fopen(infile.c_str(), "rb"); if(input == NULL) { - wcerr << "Error: Cannot open file '" << infile << "'." << endl; + cerr << "Error: Cannot open file '" << infile << "'." << endl; exit(EXIT_FAILURE); } fclose(input); - output = stdout; + output = u_finit(stdout, NULL, NULL); break; case 3: @@ -134,16 +135,16 @@ int main(int argc, char *argv[]) input = fopen(infile.c_str(), "rb"); if(input == NULL) { - wcerr << "Error: Cannot open file '" << infile << "'." << endl; + cerr << "Error: Cannot open file '" << infile << "'." << endl; exit(EXIT_FAILURE); } fclose(input); outfile = argv[argc-1]; - output = fopen(argv[argc-1], "wb"); + output = u_fopen(argv[argc-1], "wb", NULL, NULL); if(output == NULL) { - wcerr << "Error: Cannot open file '" << outfile << "'." << endl; + cerr << "Error: Cannot open file '" << outfile << "'." << endl; exit(EXIT_FAILURE); } break; @@ -158,7 +159,7 @@ int main(int argc, char *argv[]) #endif e.expand(infile, output); - fclose(output); + u_fclose(output); return EXIT_SUCCESS; } diff --git a/lttoolbox/lt_locale.cc b/lttoolbox/lt_locale.cc index 64cb71e..10378b3 100644 --- a/lttoolbox/lt_locale.cc +++ b/lttoolbox/lt_locale.cc @@ -41,7 +41,7 @@ LtLocale::tryToSetLocale() return; } - wcerr << "Warning: unsupported locale, fallback to \"C\"" << endl; + cerr << "Warning: unsupported locale, fallback to \"C\"" << endl; setlocale(LC_ALL, "C"); #endif diff --git a/lttoolbox/lt_print.cc b/lttoolbox/lt_print.cc index c138d56..8139e02 100644 --- a/lttoolbox/lt_print.cc +++ b/lttoolbox/lt_print.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #ifdef _MSC_VER @@ -50,7 +51,7 @@ int main(int argc, char *argv[]) { bool hfst = false; FILE* input = NULL; - FILE* output = stdout; + UFILE* output = u_finit(stdout, NULL, NULL); LtLocale::tryToSetLocale(); @@ -118,7 +119,7 @@ int main(int argc, char *argv[]) if(outfile != "") { - output = fopen(outfile.c_str(), "wb"); + output = u_fopen(outfile.c_str(), "wb", NULL, NULL); if(!output) { cerr << "Error: Cannot open file '" << outfile << "' for writing." << endl; @@ -127,14 +128,14 @@ int main(int argc, char *argv[]) } Alphabet alphabet; - set alphabetic_chars; + set alphabetic_chars; - map transducers; + map transducers; fpos_t pos; if (fgetpos(input, &pos) == 0) { char header[4]{}; - fread(header, 1, 4, input); + fread_unlocked(header, 1, 4, input); if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { auto features = read_le(input); if (features >= LTF_UNKNOWN) { @@ -151,7 +152,7 @@ int main(int argc, char *argv[]) int len = Compression::multibyte_read(input); while(len > 0) { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); len--; } @@ -162,13 +163,7 @@ int main(int argc, char *argv[]) while(len > 0) { - int len2 = Compression::multibyte_read(input); - wstring name = L""; - while(len2 > 0) - { - name += static_cast(Compression::multibyte_read(input)); - len2--; - } + UString name = Compression::string_read(input); transducers[name].read(input); len--; @@ -176,23 +171,20 @@ int main(int argc, char *argv[]) ///////////////////// - map::iterator penum = transducers.end(); + map::iterator penum = transducers.end(); penum--; - for(map::iterator it = transducers.begin(); it != transducers.end(); it++) + for(map::iterator it = transducers.begin(); it != transducers.end(); it++) { it->second.joinFinals(); it->second.show(alphabet, output, 0, hfst); if(it != penum) { - fwprintf(output, L"--\n", it->first.c_str()); // ToDo: Was %ls meant to go somewhere here? + u_fprintf(output, "--\n"); } } fclose(input); - if(output != stdout) - { - fclose(output); - } + u_fclose(output); return 0; } diff --git a/lttoolbox/lt_proc.cc b/lttoolbox/lt_proc.cc index 7ff4c8b..9be941d 100644 --- a/lttoolbox/lt_proc.cc +++ b/lttoolbox/lt_proc.cc @@ -28,9 +28,6 @@ #include #endif -#if defined(_WIN32) && !defined(_MSC_VER) -#include -#endif using namespace std; @@ -183,7 +180,7 @@ int main(int argc, char *argv[]) maxAnalyses = atoi(optarg); if (maxAnalyses < 1) { - wcerr << "Invalid or no argument for analyses count" << endl; + cerr << "Invalid or no argument for analyses count" << endl; exit(EXIT_FAILURE); } fstp.setMaxAnalysesValue(maxAnalyses); @@ -193,7 +190,7 @@ int main(int argc, char *argv[]) maxWeightClasses = atoi(optarg); if (maxWeightClasses < 1) { - wcerr << "Invalid or no argument for weight class count" << endl; + cerr << "Invalid or no argument for weight class count" << endl; exit(EXIT_FAILURE); } fstp.setMaxWeightClassesValue(maxWeightClasses); @@ -252,7 +249,8 @@ int main(int argc, char *argv[]) } } - FILE *input = stdin, *output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); LtLocale::tryToSetLocale(); if(optind == (argc - 3)) @@ -260,21 +258,19 @@ int main(int argc, char *argv[]) FILE *in = fopen(argv[optind], "rb"); if(in == NULL || ferror(in)) { - wcerr << "Error: Cannot open file '" << argv[optind] << "'." << endl << endl; + cerr << "Error: Cannot open file '" << argv[optind] << "'." << endl << endl; exit(EXIT_FAILURE); } - input = fopen(argv[optind+1], "rb"); - if(input == NULL || ferror(input)) - { - wcerr << "Error: Cannot open file '" << argv[optind+1] << "'." << endl << endl; + if (!input.open(argv[optind+1])) { + cerr << "Error: Cannot open file '" << argv[optind+1] << "'." << endl << endl; exit(EXIT_FAILURE); } - output= fopen(argv[optind+2], "wb"); - if(output == NULL || ferror(output)) + output = u_fopen(argv[optind+2], "wb", NULL, NULL); + if(output == NULL) { - wcerr << "Error: Cannot open file '" << argv[optind+2] << "'." << endl << endl; + cerr << "Error: Cannot open file '" << argv[optind+2] << "'." << endl << endl; exit(EXIT_FAILURE); } @@ -286,14 +282,12 @@ int main(int argc, char *argv[]) FILE *in = fopen(argv[optind], "rb"); if(in == NULL || ferror(in)) { - wcerr << "Error: Cannot open file '" << argv[optind] << "'." << endl << endl; + cerr << "Error: Cannot open file '" << argv[optind] << "'." << endl << endl; exit(EXIT_FAILURE); } - input = fopen(argv[optind+1], "rb"); - if(input == NULL || ferror(input)) - { - wcerr << "Error: Cannot open file '" << argv[optind+1] << "'." << endl << endl; + if (!input.open(argv[optind+1])) { + cerr << "Error: Cannot open file '" << argv[optind+1] << "'." << endl << endl; exit(EXIT_FAILURE); } @@ -305,7 +299,7 @@ int main(int argc, char *argv[]) FILE *in = fopen(argv[optind], "rb"); if(in == NULL || ferror(in)) { - wcerr << "Error: Cannot open file '" << argv[optind] << "'." << endl << endl; + cerr << "Error: Cannot open file '" << argv[optind] << "'." << endl << endl; exit(EXIT_FAILURE); } fstp.load(in); @@ -414,15 +408,14 @@ int main(int argc, char *argv[]) } catch (exception& e) { - wcerr << e.what(); + cerr << e.what(); if (fstp.getNullFlush()) { - fputwc_unlocked(L'\0', output); + u_fputc('\0', output); } exit(1); } - fclose(input); - fclose(output); + u_fclose(output); return EXIT_SUCCESS; } diff --git a/lttoolbox/lt_tmxcomp.cc b/lttoolbox/lt_tmxcomp.cc index ab7df4b..32ab99f 100644 --- a/lttoolbox/lt_tmxcomp.cc +++ b/lttoolbox/lt_tmxcomp.cc @@ -82,25 +82,11 @@ int main(int argc, char *argv[]) switch(c_t) { case 'o': - { - wchar_t *param = new wchar_t[strlen(optarg)+1]; - if((size_t) -1 != mbstowcs(param, optarg, strlen(optarg))) - { - c.setOriginLanguageCode(param); - } - delete[] param; - } + c.setOriginLanguageCode(to_ustring(optarg)); break; case 'm': - { - wchar_t *param = new wchar_t[strlen(optarg)+1]; - if((size_t) -1 != mbstowcs(param, optarg, strlen(optarg))) - { - c.setMetaLanguageCode(param); - } - delete[] param; - } + c.setMetaLanguageCode(to_ustring(optarg)); break; default: @@ -109,27 +95,20 @@ int main(int argc, char *argv[]) } } - string opc = argv[argc-3]; - wchar_t* lo = new wchar_t[opc.size()+1]; - wchar_t* lm = new wchar_t[opc.size()+1]; + UString opc = to_ustring(argv[argc-3]); + UString lo = opc.substr(0, opc.find('-')); + UString lm = opc.substr(opc.find('-')+1); - if(((size_t) -1 == mbstowcs(lo, opc.substr(0, opc.find('-')).c_str(), opc.size()))|| - ((size_t) -1 == mbstowcs(lm, opc.substr(opc.find('-')+1).c_str(), opc.size()))) - { - delete[] lo; - delete[] lm; + if(lo.empty() || lm.empty()) { endProgram(argv[0]); } - c.parse(argv[argc-2], lo, lm); - delete[] lo; - delete[] lm; FILE *output = fopen(argv[argc-1], "wb"); if(!output) { - wcerr << "Error: Cannot open file '" << argv[2] << "'." << endl; + cerr << "Error: Cannot open file '" << argv[2] << "'." << endl; exit(EXIT_FAILURE); } c.write(output); diff --git a/lttoolbox/lt_tmxproc.cc b/lttoolbox/lt_tmxproc.cc index c90aca9..0abee7f 100644 --- a/lttoolbox/lt_tmxproc.cc +++ b/lttoolbox/lt_tmxproc.cc @@ -43,7 +43,8 @@ void checkValidity(FSTProcessor const &fstp) int main(int argc, char *argv[]) { - FILE *input = stdin, *output = stdout; + InputFile input; + UFILE* output = u_finit(stdout, NULL, NULL); LtLocale::tryToSetLocale(); FSTProcessor fstp; FILE *aux; @@ -51,16 +52,14 @@ int main(int argc, char *argv[]) switch(argc) { case 4: - output = fopen(argv[3], "wb"); + output = u_fopen(argv[3], "wb", NULL, NULL); if(!output) { endProgram(argv[0]); } // follow case 3: - input = fopen(argv[2], "rb"); - if(!input) - { + if (!input.open(argv[2])) { endProgram(argv[0]); } // follow @@ -82,7 +81,6 @@ int main(int argc, char *argv[]) checkValidity(fstp); fstp.tm_analysis(input, output); - fclose(input); - fclose(output); + u_fclose(output); return EXIT_SUCCESS; } diff --git a/lttoolbox/lt_trim.cc b/lttoolbox/lt_trim.cc index 837794f..f685752 100644 --- a/lttoolbox/lt_trim.cc +++ b/lttoolbox/lt_trim.cc @@ -24,6 +24,7 @@ #include #include #include +#include void endProgram(char *name) { @@ -35,18 +36,17 @@ void endProgram(char *name) exit(EXIT_FAILURE); } -std::pair, std::map > +std::pair, std::map > read_fst(FILE *bin_file) { Alphabet new_alphabet; - wstring letters = L""; - std::map transducers; + std::map transducers; fpos_t pos; if (fgetpos(bin_file, &pos) == 0) { char header[4]{}; - fread(header, 1, 4, bin_file); + fread_unlocked(header, 1, 4, bin_file); if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { auto features = read_le(bin_file); if (features >= LTF_UNKNOWN) { @@ -60,47 +60,36 @@ read_fst(FILE *bin_file) } // letters - int len = Compression::multibyte_read(bin_file); - while(len > 0) - { - letters.push_back(static_cast(Compression::multibyte_read(bin_file))); - len--; - } + UString letters = Compression::string_read(bin_file); // symbols new_alphabet.read(bin_file); - len = Compression::multibyte_read(bin_file); + int len = Compression::multibyte_read(bin_file); while(len > 0) { - int len2 = Compression::multibyte_read(bin_file); - wstring name = L""; - while(len2 > 0) - { - name += static_cast(Compression::multibyte_read(bin_file)); - len2--; - } + UString name = Compression::string_read(bin_file); transducers[name].read(bin_file); len--; } - std::pair alph_letters; + std::pair alph_letters; alph_letters.first = new_alphabet; alph_letters.second = letters; - return std::pair, std::map > (alph_letters, transducers); + return std::pair, std::map > (alph_letters, transducers); } -std::pair, std::map > +std::pair, std::map > trim(FILE *file_mono, FILE *file_bi) { - std::pair, std::map > alph_trans_mono = read_fst(file_mono); + std::pair, std::map > alph_trans_mono = read_fst(file_mono); Alphabet alph_mono = alph_trans_mono.first.first; - std::map trans_mono = alph_trans_mono.second; - std::pair, std::map > alph_trans_bi = read_fst(file_bi); + std::map trans_mono = alph_trans_mono.second; + std::pair, std::map > alph_trans_bi = read_fst(file_bi); Alphabet alph_bi = alph_trans_bi.first.first; - std::map trans_bi = alph_trans_bi.second; + std::map trans_bi = alph_trans_bi.second; // The prefix transducer is the union of all transducers from bidix, // with a ".*" appended @@ -111,7 +100,7 @@ trim(FILE *file_mono, FILE *file_bi) set loopback_symbols; // ints refer to alph_prefix alph_prefix.createLoopbackSymbols(loopback_symbols, alph_mono, Alphabet::right); - for(std::map::iterator it = trans_bi.begin(); it != trans_bi.end(); it++) + for(std::map::iterator it = trans_bi.begin(); it != trans_bi.end(); it++) { Transducer union_tmp = it->second; if(union_transducer.isEmpty()) @@ -130,21 +119,21 @@ trim(FILE *file_mono, FILE *file_bi) Transducer moved_transducer = prefix_transducer.moveLemqsLast(alph_prefix); - for(std::map::iterator it = trans_mono.begin(); it != trans_mono.end(); it++) + for(std::map::iterator it = trans_mono.begin(); it != trans_mono.end(); it++) { Transducer trimmed = it->second.intersect(moved_transducer, alph_mono, alph_prefix); - wcout << it->first << " " << it->second.size(); - wcout << " " << it->second.numberOfTransitions() << endl; + cout << it->first << " " << it->second.size(); + cout << " " << it->second.numberOfTransitions() << endl; if(it->second.numberOfTransitions() == 0) { - wcerr << L"Warning: empty section! Skipping it ..."<first].clear(); } else if(trimmed.hasNoFinals()) { - wcerr << L"Warning: section had no final state after trimming! Skipping it ..."<first].clear(); } else { @@ -170,25 +159,24 @@ int main(int argc, char *argv[]) FILE *analyser = fopen(argv[1], "rb"); if(!analyser) { - wcerr << "Error: Cannot open file '" << argv[1] << "'." << endl << endl; + cerr << "Error: Cannot open file '" << argv[1] << "'." << endl << endl; exit(EXIT_FAILURE); } FILE *bidix = fopen(argv[2], "rb"); if(!bidix) { - wcerr << "Error: Cannot open file '" << argv[2] << "'." << endl << endl; + cerr << "Error: Cannot open file '" << argv[2] << "'." << endl << endl; exit(EXIT_FAILURE); } - std::pair, std::map > trimmed = trim(analyser, bidix); + std::pair, std::map > trimmed = trim(analyser, bidix); Alphabet alph_t = trimmed.first.first; - wstring letters = trimmed.first.second; - std::map trans_t = trimmed.second; + UString letters = trimmed.first.second; + std::map trans_t = trimmed.second; int n_transducers = 0; - for(std::map::iterator it = trans_t.begin(); it != trans_t.end(); it++) - { - if(!(it->second.isEmpty())) + for(auto& it : trans_t) { + if(!(it.second.isEmpty())) { n_transducers++; } @@ -196,9 +184,9 @@ int main(int argc, char *argv[]) if(n_transducers == 0) { - wcerr << L"Error: Trimming gave empty transducer!" << endl; - wcerr << L"Hint: There are no words in bilingual dictionary that match " - L"words in both monolingual dictionaries?" << endl; + cerr << "Error: Trimming gave empty transducer!" << endl; + cerr << "Hint: There are no words in bilingual dictionary that match " + "words in both monolingual dictionaries?" << endl; exit(EXIT_FAILURE); } @@ -206,24 +194,23 @@ int main(int argc, char *argv[]) FILE *output = fopen(argv[3], "wb"); if(!output) { - wcerr << "Error: Cannot open file '" << argv[3] << "'." << endl << endl; + cerr << "Error: Cannot open file '" << argv[3] << "'." << endl << endl; exit(EXIT_FAILURE); } // letters - Compression::wstring_write(letters, output); + Compression::string_write(letters, output); // symbols alph_t.write(output); // transducers Compression::multibyte_write(n_transducers, output); - for(std::map::iterator it = trans_t.begin(); it != trans_t.end(); it++) - { - if(!(it->second.isEmpty())) + for(auto& it : trans_t) { + if(!(it.second.isEmpty())) { - Compression::wstring_write(it->first, output); - it->second.write(output); + Compression::string_write(it.first, output); + it.second.write(output); } } diff --git a/lttoolbox/ltstr.h b/lttoolbox/ltstr.h deleted file mode 100644 index 9e5abb6..0000000 --- a/lttoolbox/ltstr.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ -#ifndef _Ltstr_ -#define _Ltstr_ - -#include -#include -#include - -using namespace std; - -struct Ltstr -{ - bool operator()(string const &s1, string const &s2) const - { - return strcmp(s1.c_str(), s2.c_str()) < 0; - } - - bool operator()(wchar_t const *s1, wchar_t const *s2) const - { - return wcscmp(s1, s2) < 0; - } - - bool operator()(char const *s1, char const *s2) const - { - return strcmp(s1, s2) < 0; - } - - bool operator()(wstring const &s1, wstring const &s2) const - { - return wcscmp(s1.c_str(), s2.c_str()) < 0; - } -}; - -#endif diff --git a/lttoolbox/my_stdio.h b/lttoolbox/my_stdio.h index 6cf2083..a446278 100644 --- a/lttoolbox/my_stdio.h +++ b/lttoolbox/my_stdio.h @@ -46,20 +46,4 @@ #define fread_unlocked fread #endif -#if !HAVE_DECL_FGETWC_UNLOCKED -#define fgetwc_unlocked fgetwc -#endif - -#if !HAVE_DECL_FPUTWC_UNLOCKED -#define fputwc_unlocked fputwc -#endif - -#if !HAVE_DECL_FPUTWS_UNLOCKED -#define fputws_unlocked fputws -#endif - -#if !HAVE_DECL_UNGETWC_UNLOCKED -#define ungetwc_unlocked ungetwc -#endif - #endif diff --git a/lttoolbox/pattern_list.cc b/lttoolbox/pattern_list.cc index ed1f056..810ecff 100644 --- a/lttoolbox/pattern_list.cc +++ b/lttoolbox/pattern_list.cc @@ -22,9 +22,9 @@ #include #include -wstring const PatternList::ANY_CHAR = L""; -wstring const PatternList::ANY_TAG = L""; -wstring const PatternList::QUEUE = L""; +UString const PatternList::ANY_CHAR = ""_u; +UString const PatternList::ANY_TAG = ""_u; +UString const PatternList::QUEUE = ""_u; void PatternList::copy(PatternList const &o) @@ -80,7 +80,7 @@ PatternList::beginSequence() { if(sequence) { - wcerr << L"Error: opening an unended sequence" << endl; + cerr << "Error: opening an unended sequence" << endl; exit(EXIT_FAILURE); } sequence = true; @@ -92,7 +92,7 @@ PatternList::endSequence() { if(!sequence) { - wcerr << L"Error: ending an unopened sequence" << endl; + cerr << "Error: ending an unopened sequence" << endl; exit(EXIT_FAILURE); } sequence = false; @@ -107,10 +107,10 @@ PatternList::endSequence() } void -PatternList::insertOutOfSequence(wstring const &lemma, wstring const &tags, +PatternList::insertOutOfSequence(UString const &lemma, UString const &tags, vector &result) { - if(lemma == L"") + if(lemma.empty()) { result.push_back(alphabet(ANY_CHAR)); } @@ -118,17 +118,17 @@ PatternList::insertOutOfSequence(wstring const &lemma, wstring const &tags, { for(unsigned int i = 0, limit = lemma.size(); i < limit; i++) { - if(lemma[i] == L'*') + if(lemma[i] == '*') { result.push_back(alphabet(ANY_CHAR)); } else { - result.push_back(int((wchar_t) lemma[i])); + result.push_back(static_cast(lemma[i])); } } } - if(tags == L"") + if(tags.empty()) { result.push_back(alphabet(ANY_TAG)); } @@ -136,9 +136,9 @@ PatternList::insertOutOfSequence(wstring const &lemma, wstring const &tags, { for(unsigned int i = 0, limit = tagCount(tags); i < limit; i++) { - wstring tag = L"<" + tagAt(tags, i) + L">"; + UString tag = "<"_u + tagAt(tags, i) + ">"_u; - if(tag == L"<*>") + if(tag == "<*>"_u) { result.push_back(alphabet(ANY_TAG)); } @@ -152,8 +152,8 @@ PatternList::insertOutOfSequence(wstring const &lemma, wstring const &tags, } void -PatternList::insertIntoSequence(int const id, wstring const &lemma, - wstring const &tags) +PatternList::insertIntoSequence(int const id, UString const &lemma, + UString const &tags) { sequence_id = id; @@ -169,14 +169,14 @@ PatternList::insertIntoSequence(int const id, wstring const &lemma, list >::iterator limit = sequence_data.end(); for(; it != limit; it++) { - it->push_back(L'+'); + it->push_back('+'); insertOutOfSequence(lemma, tags, *it); } } } void -PatternList::insert(int const id, wstring const &lemma, wstring const &tags) +PatternList::insert(int const id, UString const &lemma, UString const &tags) { if(!sequence) { @@ -196,7 +196,7 @@ PatternList::insert(int const id, int const otherid) { if(!sequence) { - wcerr << L"Error: using labels outside of a sequence" << endl; + cerr << "Error: using labels outside of a sequence" << endl; exit(EXIT_FAILURE); } @@ -221,7 +221,7 @@ PatternList::insert(int const id, int const otherid) p.first != p.second; p.first++) { vector temp = *it; - temp.push_back(L'+'); + temp.push_back('+'); temp.insert(temp.end(), (p.first->second).begin(), (p.first->second).end()); new_sequence_data.push_back(temp); @@ -233,7 +233,7 @@ PatternList::insert(int const id, int const otherid) } int -PatternList::tagCount(wstring const &tags) +PatternList::tagCount(UString const &tags) { int count = 0; @@ -243,7 +243,7 @@ PatternList::tagCount(wstring const &tags) { count++; } - else if(tags[i] == L'.') + else if(tags[i] == '.') { count++; } @@ -252,8 +252,8 @@ PatternList::tagCount(wstring const &tags) return count; } -wstring -PatternList::tagAt(wstring const &tags, int const index) +UString +PatternList::tagAt(UString const &tags, int const index) { int start = 0; int end = 0; @@ -261,7 +261,7 @@ PatternList::tagAt(wstring const &tags, int const index) for(unsigned int i = 0, limit = tags.size(); i < limit; i++) { - if(tags[i] == L'.') + if(tags[i] == '.') { count++; if(end == 0) @@ -282,7 +282,7 @@ PatternList::tagAt(wstring const &tags, int const index) if(index > count) { - return L""; + return ""_u; } if(end != 0) { @@ -331,9 +331,9 @@ PatternList::buildTransducer() // optional queue prevstate = state; - state = transducer.insertSingleTransduction(static_cast(L'_'), state, default_weight); - transducer.linkStates(prevstate, state, static_cast(L' '), default_weight); - transducer.linkStates(prevstate, state, static_cast(L'#'), default_weight); + state = transducer.insertSingleTransduction(static_cast('_'), state, default_weight); + transducer.linkStates(prevstate, state, static_cast(' '), default_weight); + transducer.linkStates(prevstate, state, static_cast('#'), default_weight); transducer.linkStates(state, state, alphabet(ANY_CHAR), default_weight); } else @@ -366,10 +366,10 @@ void PatternList::write(FILE *output) { alphabet.write(output); - wstring const tagger_name = L"tagger"; + UString const tagger_name = "tagger"_u; Compression::multibyte_write(1, output); - Compression::wstring_write(tagger_name, output); + Compression::string_write(tagger_name, output); transducer.write(output, alphabet.size()); Compression::multibyte_write(final_type.size(), output); @@ -391,7 +391,7 @@ PatternList::read(FILE *input) alphabet.read(input); if(Compression::multibyte_read(input) == 1) { - wstring mystr = Compression::wstring_read(input); + UString mystr = Compression::string_read(input); transducer.read(input, alphabet.size()); int finalsize = Compression::multibyte_read(input); diff --git a/lttoolbox/pattern_list.h b/lttoolbox/pattern_list.h index 5dde942..1b88403 100644 --- a/lttoolbox/pattern_list.h +++ b/lttoolbox/pattern_list.h @@ -45,29 +45,29 @@ private: void copy(PatternList const &o); void destroy(); - void insertOutOfSequence(wstring const &lemma, wstring const &tags, + void insertOutOfSequence(UString const &lemma, UString const &tags, vector &result); - void insertIntoSequence(int const id, wstring const &lemma, - wstring const &tags); + void insertIntoSequence(int const id, UString const &lemma, + UString const &tags); - static int tagCount(wstring const &tags); - static wstring tagAt(wstring const &tags, int const index); + static int tagCount(UString const &tags); + static UString tagAt(UString const &tags, int const index); public: /** * This symbol stands for any char */ - static wstring const ANY_CHAR; + static UString const ANY_CHAR; /** * This symbol stands for any tag */ - static wstring const ANY_TAG; + static UString const ANY_TAG; /** * This symbol marks a word queue */ - static wstring const QUEUE; + static UString const QUEUE; /** * Constructor @@ -106,7 +106,7 @@ public: * @param lemma * @param tags */ - void insert(int const id, wstring const &lemma, wstring const &tags); + void insert(int const id, UString const &lemma, UString const &tags); /** * Insertion method diff --git a/lttoolbox/regexp_compiler.cc b/lttoolbox/regexp_compiler.cc index e94ee9f..96d98c8 100644 --- a/lttoolbox/regexp_compiler.cc +++ b/lttoolbox/regexp_compiler.cc @@ -21,9 +21,11 @@ RegexpCompiler::RegexpCompiler() : token(0), +index(0), alphabet(0), state(0), letter(0), +postop(0), default_weight(0.0000) { } @@ -74,17 +76,17 @@ RegexpCompiler::isReserved(int const t) { switch(t) { - case L'(': - case L')': - case L'[': - case L']': - case L'*': - case L'?': - case L'+': - case L'-': - case L'^': - case L'\\': - case L'|': + case '(': + case ')': + case '[': + case ']': + case '*': + case '?': + case '+': + case '-': + case '^': + case '\\': + case '|': case FIN_FICHERO: return true; @@ -96,14 +98,14 @@ RegexpCompiler::isReserved(int const t) void RegexpCompiler::error() { - wcerr << L"Error parsing regexp" < const &er) { input = er; - token = static_cast(input[0]); + token = input[0]; + index = 0; state = transducer.getInitial(); S(); transducer.setFinal(state, default_weight); @@ -141,7 +144,7 @@ RegexpCompiler::compile(wstring const &er) void RegexpCompiler::S() { - if(token == L'(' || token == L'[' || !isReserved(token) || token == L'\\') + if(token == '(' || token == '[' || !isReserved(token) || token == '\\') { RExpr(); Cola(); @@ -155,7 +158,7 @@ RegexpCompiler::S() void RegexpCompiler::RExpr() { - if(token == L'(' || token == L'[' || !isReserved(token) || token == L'\\') + if(token == '(' || token == '[' || !isReserved(token) || token == '\\') { Term(); RExprp(); @@ -169,14 +172,14 @@ RegexpCompiler::RExpr() void RegexpCompiler::Cola() { - if(token == FIN_FICHERO || token == L')') + if(token == FIN_FICHERO || token == ')') { } - else if(token == L'|') + else if(token == '|') { int e = state; state = transducer.getInitial(); - consume(L'|'); + consume('|'); RExpr(); Cola(); @@ -192,7 +195,7 @@ RegexpCompiler::Cola() void RegexpCompiler::Term() { - if(!isReserved(token) || token == L'\\') + if(!isReserved(token) || token == '\\') { Transducer t; int e = t.getInitial(); @@ -200,53 +203,53 @@ RegexpCompiler::Term() e = t.insertNewSingleTransduction((*alphabet)(letter, letter), e, default_weight); t.setFinal(e, default_weight); Postop(); - if(postop == L"*") + if(postop == '*') { t.zeroOrMore((*alphabet)(0, 0)); } - else if(postop == L"+") + else if(postop == '+') { t.oneOrMore((*alphabet)(0, 0)); } - else if(postop == L"?") + else if(postop == '?') { t.optional((*alphabet)(0, 0)); } - postop = L""; + postop = 0; state = transducer.insertTransducer(state, t, (*alphabet)(0, 0)); } - else if(token == L'(') + else if(token == '(') { Transducer t = transducer; int e = state; transducer.clear(); state = transducer.getInitial(); - consume(L'('); + consume('('); S(); - consume(L')'); + consume(')'); transducer.setFinal(state, default_weight); Postop(); - if(postop == L"*") + if(postop == '*') { transducer.zeroOrMore((*alphabet)(0, 0)); } - else if(postop == L"+") + else if(postop == '+') { transducer.oneOrMore((*alphabet)(0, 0)); } - else if(postop == L"?") + else if(postop == '?') { transducer.optional((*alphabet)(0, 0)); } - postop = L""; + postop = 0; state = t.insertTransducer(e, transducer, (*alphabet)(0, 0)); transducer = t; } - else if(token == L'[') + else if(token == '[') { - consume(L'['); + consume('['); Esp(); } else @@ -258,12 +261,12 @@ RegexpCompiler::Term() void RegexpCompiler::RExprp() { - if(token == L'(' || token == L'[' || !isReserved(token) || token == L'\\') + if(token == '(' || token == '[' || !isReserved(token) || token == '\\') { Term(); RExprp(); } - else if(token == L'|' || token == FIN_FICHERO || token == L')') + else if(token == '|' || token == FIN_FICHERO || token == ')') { } else @@ -280,9 +283,9 @@ RegexpCompiler::Letra() letter = token; consume(token); } - else if(token == L'\\') + else if(token == '\\') { - consume(L'\\'); + consume('\\'); letter = token; Reservado(); } @@ -295,24 +298,24 @@ RegexpCompiler::Letra() void RegexpCompiler::Postop() { - if(token == L'*') + if(token == '*') { - consume(L'*'); - postop = L"*"; + consume('*'); + postop = '*'; } - else if(token == L'?') + else if(token == '?') { - consume(L'?'); - postop = L"?"; + consume('?'); + postop = '?'; } - else if(token == L'+') + else if(token == '+') { - consume(L'+'); - postop = L"+"; + consume('+'); + postop = '+'; } - else if(token == L'(' || token == L'[' || !isReserved(token) || - token == L'\\' || token == L'|' || token == FIN_FICHERO || - token == L')') + else if(token == '(' || token == '[' || !isReserved(token) || + token == '\\' || token == '|' || token == FIN_FICHERO || + token == ')') { } else @@ -325,10 +328,10 @@ void RegexpCompiler::Esp() { Transducer t; - if(!isReserved(token) || token == L'\\' || token == L']') + if(!isReserved(token) || token == '\\' || token == ']') { Lista(); - consume(L']'); + consume(']'); Postop(); for(set::iterator it = brackets.begin(); @@ -342,11 +345,11 @@ RegexpCompiler::Esp() t.joinFinals((*alphabet)(0, 0)); } - else if(token == L'^') + else if(token == '^') { - consume(L'^'); + consume('^'); Lista(); - consume(L']'); + consume(']'); Postop(); for(int i = 0; i < 256 ;i++) @@ -367,20 +370,20 @@ RegexpCompiler::Esp() error(); } - if(postop == L"+") + if(postop == '+') { t.oneOrMore((*alphabet)(0, 0)); } - else if(postop == L"*") + else if(postop == '*') { t.zeroOrMore((*alphabet)(0, 0)); } - else if(postop == L"?") + else if(postop == '?') { t.optional((*alphabet)(0, 0)); } brackets.clear(); - postop = L""; + postop = 0; state = transducer.insertTransducer(state, t, (*alphabet)(0, 0)); } @@ -388,12 +391,12 @@ RegexpCompiler::Esp() void RegexpCompiler::Lista() { - if(!isReserved(token) || token == L'\\') + if(!isReserved(token) || token == '\\') { Elem(); Lista(); } - else if(token == L']') + else if(token == ']') { } else @@ -418,7 +421,7 @@ RegexpCompiler::Reservado() void RegexpCompiler::Elem() { - if(!isReserved(token) || token == L'\\') + if(!isReserved(token) || token == '\\') { Letra(); int rango1 = letter; @@ -446,12 +449,12 @@ RegexpCompiler::Elem() void RegexpCompiler::ColaLetra() { - if(token == L'-') + if(token == '-') { - consume(L'-'); + consume('-'); Letra(); } - else if(!isReserved(token) || token == L'\\' || token == L']') + else if(!isReserved(token) || token == '\\' || token == ']') { } else @@ -478,5 +481,5 @@ RegexpCompiler::initialize(Alphabet *a) setAlphabet(a); transducer.clear(); brackets.clear(); - postop = L""; + postop = 0; } diff --git a/lttoolbox/regexp_compiler.h b/lttoolbox/regexp_compiler.h index dd11ca9..e9bdb30 100644 --- a/lttoolbox/regexp_compiler.h +++ b/lttoolbox/regexp_compiler.h @@ -17,10 +17,13 @@ #ifndef _REGEXP_COMPILER_ #define _REGEXP_COMPILER_ +#include #include #include #include +#include +#include using namespace std; @@ -41,7 +44,12 @@ private: /** * Input string */ - wstring input; + vector input; + + /** + * Location in the input string + */ + size_t index; /** * Alphabet to encode symbols @@ -66,7 +74,7 @@ private: /** * Post-operator: '+', '?', '*' */ - wstring postop; + UChar32 postop; /** * Default value of weight @@ -200,7 +208,7 @@ public: * Function that parses a regular expression and produces a transducer * @param er the regular expression */ - void compile(wstring const &er); + void compile(vector const &er); /** * Set the decoder of symbols diff --git a/lttoolbox/serialiser.h b/lttoolbox/serialiser.h index 01abb3e..bc04f2c 100644 --- a/lttoolbox/serialiser.h +++ b/lttoolbox/serialiser.h @@ -29,6 +29,7 @@ #include #include #include +#include namespace { template @@ -96,15 +97,15 @@ public: std::ostream &Output); }; -template <> class Serialiser { +template <> class Serialiser { public: - inline static void serialise(const wchar_t &SerialisedType_, + inline static void serialise(const char &SerialisedType_, std::ostream &Output); }; -template <> class Serialiser { +template <> class Serialiser { public: - inline static void serialise(const char &SerialisedType_, + inline static void serialise(const UChar &SerialisedType_, std::ostream &Output); }; @@ -213,16 +214,16 @@ void Serialiser::serialise(const uint32_t &SerialisedType_, int_serialise((uint64_t)SerialisedType_, Output); } -void Serialiser::serialise(const wchar_t &SerialisedType_, - std::ostream &Output) { - int_serialise((uint32_t)SerialisedType_, Output); -} - void Serialiser::serialise(const char &SerialisedType_, std::ostream &Output) { int_serialise((uint8_t)SerialisedType_, Output); } +void Serialiser::serialise(const UChar &SerialisedType_, + std::ostream &Output) { + int_serialise((uint16_t)SerialisedType_, Output); +} + void Serialiser::serialise(const double &SerialisedType_, std::ostream &Output) { union { diff --git a/lttoolbox/state.cc b/lttoolbox/state.cc index 03abae9..facd537 100644 --- a/lttoolbox/state.cc +++ b/lttoolbox/state.cc @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -403,12 +402,12 @@ State::step(int const input, set const alts) } void -State::step_case(wchar_t val, wchar_t val2, bool caseSensitive) +State::step_case(UChar32 val, UChar32 val2, bool caseSensitive) { - if (!iswupper(val) || caseSensitive) { + if (!u_isupper(val) || caseSensitive) { step(val, val2); - } else if(val != towlower(val)) { - step(val, towlower(val), val2); + } else if(val != u_tolower(val)) { + step(val, u_tolower(val), val2); } else { step(val, val2); } @@ -416,12 +415,12 @@ State::step_case(wchar_t val, wchar_t val2, bool caseSensitive) void -State::step_case(wchar_t val, bool caseSensitive) +State::step_case(UChar32 val, bool caseSensitive) { - if (!iswupper(val) || caseSensitive) { + if (!u_isupper(val) || caseSensitive) { step(val); } else { - step(val, towlower(val)); + step(val, u_tolower(val)); } } @@ -441,14 +440,14 @@ State::isFinal(map const &finals) const } -vector> -State::NFinals(vector> lf, int maxAnalyses, int maxWeightClasses) const +vector> +State::NFinals(vector> lf, int maxAnalyses, int maxWeightClasses) const { - vector> result; + vector> result; - sort(lf.begin(), lf.end(), sort_weights()); + sort(lf.begin(), lf.end(), sort_weights()); - for(vector >::iterator it = lf.begin(); it != lf.end(); it++) + for(vector >::iterator it = lf.begin(); it != lf.end(); it++) { double last_weight = 0.0000; if(maxAnalyses > 0 && maxWeightClasses > 0) @@ -466,16 +465,16 @@ State::NFinals(vector> lf, int maxAnalyses, int maxWeightC } -wstring +UString State::filterFinals(map const &finals, Alphabet const &alphabet, - set const &escaped_chars, + set const &escaped_chars, bool display_weights, int max_analyses, int max_weight_classes, bool uppercase, bool firstupper, int firstchar) const { - vector> response; + vector> response; - wstring result = L""; + UString result; double cost = 0.0000; for(size_t i = 0, limit = state.size(); i != limit; i++) @@ -491,21 +490,21 @@ State::filterFinals(map const &finals, { if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) { - result += L'\\'; + result += '\\'; } alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first, uppercase); cost += ((*(state[i].sequence))[j]).second; } if(firstupper) { - if(result[first_char] == L'~') + if(result[first_char] == '~') { // skip post-generation mark - result[first_char+1] = towupper(result[first_char+1]); + result[first_char+1] = u_toupper(result[first_char+1]); } else { - result[first_char] = towupper(result[first_char]); + result[first_char] = u_toupper(result[first_char]); } } } @@ -517,7 +516,7 @@ State::filterFinals(map const &finals, { if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) { - result += L'\\'; + result += '\\'; } alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first); cost += ((*(state[i].sequence))[j]).second; @@ -532,16 +531,17 @@ State::filterFinals(map const &finals, response = NFinals(response, max_analyses, max_weight_classes); - result = L""; - for(vector>::iterator it = response.begin(); it != response.end(); it++) + result.clear(); + for(vector>::iterator it = response.begin(); it != response.end(); it++) { - result += L'/'; + result += '/'; result += it->first; if(display_weights) { - result += L"second); - result += L">"; + UChar temp[16]{}; + // if anyone wants a weight of 10000, this will not be enough + u_sprintf(temp, "", it->second); + result += temp; } } @@ -549,39 +549,39 @@ State::filterFinals(map const &finals, } -set > > +set > > State::filterFinalsLRX(map const &finals, Alphabet const &alphabet, - set const &escaped_chars, + set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { - set > > results; + set > > results; - vector current_result; - wstring rule_id = L""; + vector current_result; + UString rule_id; for(size_t i = 0, limit = state.size(); i != limit; i++) { if(finals.find(state[i].where) != finals.end()) { current_result.clear(); - rule_id = L""; - wstring current_word = L""; + rule_id.clear(); + UString current_word; for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) { if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) { - current_word += L'\\'; + current_word += '\\'; } - wstring sym = L""; + UString sym; alphabet.getSymbol(sym, ((*(state[i].sequence))[j]).first, uppercase); - if(sym == L"<$>") + if(sym == "<$>"_u) { - if(current_word != L"") + if(!current_word.empty()) { current_result.push_back(current_word); } - current_word = L""; + current_word.clear(); } else { @@ -597,32 +597,34 @@ State::filterFinalsLRX(map const &finals, } -wstring +UString State::filterFinalsSAO(map const &finals, Alphabet const &alphabet, - set const &escaped_chars, + set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { - wstring result = L""; - wstring annot = L""; + UString result; + UString annot; for(size_t i = 0, limit = state.size(); i != limit; i++) { if(finals.find(state[i].where) != finals.end()) { - result += L'/'; + result += '/'; unsigned int const first_char = result.size() + firstchar; for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) { if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) { - result += L'\\'; + result += '\\'; } if(alphabet.isTag(((*(state[i].sequence))[j]).first)) { - annot = L""; + annot.clear(); alphabet.getSymbol(annot, ((*(state[i].sequence))[j]).first); - result += L'&'+annot.substr(1,annot.length()-2)+L';'; + result += '&'; + result += annot.substr(1,annot.length()-2); + result += ';'; } else { @@ -631,14 +633,14 @@ State::filterFinalsSAO(map const &finals, } if(firstupper) { - if(result[first_char] == L'~') + if(result[first_char] == '~') { // skip post-generation mark - result[first_char+1] = towupper(result[first_char+1]); + result[first_char+1] = u_toupper(result[first_char+1]); } else { - result[first_char] = towupper(result[first_char]); + result[first_char] = u_toupper(result[first_char]); } } } @@ -647,24 +649,24 @@ State::filterFinalsSAO(map const &finals, return result; } -wstring +UString State::filterFinalsTM(map const &finals, Alphabet const &alphabet, - set const &escaped_chars, - queue &blankqueue, vector &numbers) const + set const &escaped_chars, + queue &blankqueue, vector &numbers) const { - wstring result = L""; + UString result; for(size_t i = 0, limit = state.size(); i != limit; i++) { if(finals.find(state[i].where) != finals.end()) { - result += L'/'; + result += '/'; for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) { if(escaped_chars.find((*(state[i].sequence))[j].first) != escaped_chars.end()) { - result += L'\\'; + result += '\\'; } alphabet.getSymbol(result, (*(state[i].sequence))[j].first); } @@ -672,15 +674,15 @@ State::filterFinalsTM(map const &finals, } - wstring result2 = L""; - vector fragment; - fragment.push_back(L""); + UString result2; + vector fragment; + fragment.push_back(""_u); for(unsigned int i = 0, limit = result.size(); i != limit ; i++) { - if(result[i] == L')') + if(result[i] == ')') { - fragment.push_back(L""); + fragment.push_back(""_u); } else { @@ -692,9 +694,9 @@ State::filterFinalsTM(map const &finals, { if(i != limit -1) { - if(fragment[i].size() >=2 && fragment[i].substr(fragment[i].size()-2) == L"(#") + if(fragment[i].size() >=2 && fragment[i].substr(fragment[i].size()-2) == "(#"_u) { - wstring whitespace = L" "; + UString whitespace = " "_u; if(blankqueue.size() != 0) { whitespace = blankqueue.front().substr(1); @@ -709,15 +711,15 @@ State::filterFinalsTM(map const &finals, bool substitute = false; for(int j = fragment[i].size() - 1; j >= 0; j--) { - if(fragment[i].size()-j > 3 && fragment[i][j] == L'\\' && - fragment[i][j+1] == L'@' && fragment[i][j+2] == L'(') + if(fragment[i].size()-j > 3 && fragment[i][j] == '\\' && + fragment[i][j+1] == '@' && fragment[i][j+2] == '(') { int num = 0; bool correct = true; for(unsigned int k = (unsigned int) j+3, limit2 = fragment[i].size(); k != limit2; k++) { - if(iswdigit(fragment[i][k])) + if(u_isdigit(fragment[i][k])) { num = num * 10; num += (int) fragment[i][k] - 48; @@ -738,13 +740,13 @@ State::filterFinalsTM(map const &finals, } if(substitute == false) { - fragment[i] += L')'; + fragment[i] += ')'; } } } } - result = L""; + result.clear(); for(unsigned int i = 0, limit = fragment.size(); i != limit; i++) { @@ -888,26 +890,28 @@ State::restartFinals(const map &finals, int requiredSymbol, Stat -wstring +UString State::getReadableString(const Alphabet &a) { - wstring retval = L"["; + UString retval; + retval += '['; for(unsigned int i=0; i>* seq = state.at(i).sequence; if(seq != NULL) for (unsigned int j=0; jsize(); j++) { - wstring ws = L""; + UString ws; a.getSymbol(ws, (seq->at(j)).first); retval.append(ws); } if(i+1 < state.size()) { - retval.append(L", "); + retval += ','; + retval += ' '; } } - retval.append(L"]"); + retval += ']'; return retval; } diff --git a/lttoolbox/state.h b/lttoolbox/state.h index a7840c7..31f0e42 100644 --- a/lttoolbox/state.h +++ b/lttoolbox/state.h @@ -30,6 +30,8 @@ #include #include +#include + using namespace std; /** @@ -188,9 +190,9 @@ public: */ void step(int const input, set const alts); - void step_case(wchar_t val, bool caseSensitive); + void step_case(UChar32 val, bool caseSensitive); - void step_case(wchar_t val, wchar_t val2, bool caseSensitive); + void step_case(UChar32 val, UChar32 val2, bool caseSensitive); void step_careful(int const input, int const alt); @@ -236,7 +238,7 @@ public: } }; - vector> NFinals(vector> lf, + vector> NFinals(vector> lf, int maxAnalyses, int maxWeightClasses) const; @@ -252,9 +254,9 @@ public: * @param firstchar first character of the word * @return the result of the transduction */ - wstring filterFinals(map const &finals, + UString filterFinals(map const &finals, Alphabet const &a, - set const &escaped_chars, + set const &escaped_chars, bool display_weights = false, int max_analyses = INT_MAX, int max_weight_classes = INT_MAX, @@ -273,9 +275,9 @@ public: * @param firstchar first character of the word * @return the result of the transduction */ - wstring filterFinalsSAO(map const &finals, + UString filterFinalsSAO(map const &finals, Alphabet const &a, - set const &escaped_chars, + set const &escaped_chars, bool uppercase = false, bool firstupper = false, int firstchar = 0) const; @@ -293,9 +295,9 @@ public: * @return the result of the transduction */ - set > > filterFinalsLRX(map const &finals, + set > > filterFinalsLRX(map const &finals, Alphabet const &a, - set const &escaped_chars, + set const &escaped_chars, bool uppercase = false, bool firstupper = false, int firstchar = 0) const; @@ -326,13 +328,13 @@ public: /** * Return the full states string (to allow debuging...) using a Java ArrayList.toString style */ - wstring getReadableString(const Alphabet &a); + UString getReadableString(const Alphabet &a); - wstring filterFinalsTM(map const &finals, + UString filterFinalsTM(map const &finals, Alphabet const &alphabet, - set const &escaped_chars, - queue &blanks, - vector &numbers) const; + set const &escaped_chars, + queue &blanks, + vector &numbers) const; }; diff --git a/lttoolbox/string_to_wostream.h b/lttoolbox/string_to_wostream.h deleted file mode 100644 index 4ffbb4b..0000000 --- a/lttoolbox/string_to_wostream.h +++ /dev/null @@ -1,13 +0,0 @@ -// Include string_utils.h instead if you're linking against apertium - -#ifndef __STRING_TO_WOSTREAM_H_ -#define __STRING_TO_WOSTREAM_H_ - -#include - -static std::wostream & operator<<(std::wostream & ostr, std::string const & str) { - ostr << str.c_str(); - return ostr; -} - -#endif diff --git a/lttoolbox/string_utils.cc b/lttoolbox/string_utils.cc new file mode 100644 index 0000000..411380d --- /dev/null +++ b/lttoolbox/string_utils.cc @@ -0,0 +1,249 @@ +#include + +#include +#include +#include +#include + +UString +StringUtils::trim(const UString& str) +{ + if (str.empty()) { + return str; + } + size_t begin = 0; + size_t end = str.size(); + size_t i = 0; + UChar32 c; + while (begin < end) { + U16_GET(str.c_str(), begin, i, end, c); + if (!u_isspace(c)) { + begin = i; + break; + } else { + U16_FWD_1(str.c_str(), i, end); + } + } + i = str.size(); + U16_BACK_1(str.c_str(), 0, i); + U16_GET(str.c_str(), 0, i, end, c); + if (!u_isspace(c)) { + if (begin == 0) { + return str; + } else { + return str.substr(begin); + } + } + while (end > begin) { + end = i; + U16_BACK_1(str.c_str(), 0, i); + U16_GET(str.c_str(), 0, i, str.size(), c); + if (!u_isspace(c)) { + break; + } + } + return str.substr(begin, end-begin); +} + +std::vector +StringUtils::split(const UString& str, const UString& delim) +{ + size_t pos = 0; + size_t new_pos; + std::vector result; + while (pos < str.size()) { + new_pos = str.find(delim, pos); + if (new_pos == UString::npos) { + new_pos = str.size(); + } + if (new_pos > pos) { + // if we have a non-empty substring between this delimiter + // and the last one + result.push_back(str.substr(pos, new_pos-pos)); + } + pos = new_pos + delim.size(); + } + return result; +} + +UString +StringUtils::join(const std::vector& vec, const UString& delim) +{ + UString s; + for (auto& piece : vec) { + if (!s.empty()) { + s.append(delim); + } + s.append(piece); + } + return s; +} + +UString +StringUtils::substitute(const UString& str, const UString& olds, const UString& news) +{ + UString s = str; + size_t p = s.find(olds, 0); + while (p != UString::npos) { + s.replace(p, olds.length(), news); + p += news.length(); + p = s.find(olds, p); + } + return s; +} + +UString +StringUtils::itoa(int n) +{ + UChar str[256]; + u_snprintf(str, 256, "%d", n); + return str; +} + +std::string +StringUtils::itoa_string(int n) +{ + char str[256]; + snprintf(str, 256, "%d", n); + return str; +} + +UString +StringUtils::ftoa(double f) +{ + UChar str[256]; + u_snprintf(str, 256, "%f", f); + return str; +} + +int +StringUtils::stoi(const UString& str) +{ + int ret; + int c = u_sscanf(str.c_str(), "%d", &ret); + if (c != 1) { + throw std::invalid_argument("unable to parse int"); + } + return ret; +} + +double +StringUtils::stod(const UString& str) +{ + double ret; + int c = u_sscanf(str.c_str(), "%lf", &ret); + if (c != 1) { + throw std::invalid_argument("unable to parse float"); + } + return ret; +} + +UString +StringUtils::tolower(const UString& str) +{ + UChar buf[str.size()*2]; + UErrorCode err = U_ZERO_ERROR; + u_strToLower(buf, str.size()*2, str.c_str(), str.size(), NULL, &err); + if (U_FAILURE(err)) { + std::cerr << "Error: unable to lowercase string '" << str << "'.\n"; + std::cerr << "error code: " << u_errorName(err) << std::endl; + exit(EXIT_FAILURE); + } + return buf; +} + +UString +StringUtils::toupper(const UString& str) +{ + UChar buf[str.size()*2]; + UErrorCode err = U_ZERO_ERROR; + u_strToUpper(buf, str.size()*2, str.c_str(), str.size(), NULL, &err); + if (U_FAILURE(err)) { + std::cerr << "Error: unable to uppercase string '" << str << "'.\n"; + std::cerr << "error code: " << u_errorName(err) << std::endl; + exit(EXIT_FAILURE); + } + return buf; +} + +UString +StringUtils::totitle(const UString& str) +{ + UChar buf[str.size()*2]; + UErrorCode err = U_ZERO_ERROR; + u_strToTitle(buf, str.size()*2, str.c_str(), str.size(), NULL, NULL, &err); + if (U_FAILURE(err)) { + std::cerr << "Error: unable to titlecase string '" << str << "'.\n"; + std::cerr << "error code: " << u_errorName(err) << std::endl; + exit(EXIT_FAILURE); + } + return buf; +} + +UString +StringUtils::getcase(const UString& str) +{ + UString ret = "aa"_u; + if (str.empty()) { + return ret; + } + size_t i = 0; + size_t l = str.size(); + UChar32 c; + U16_NEXT(str.c_str(), i, l, c); + if (u_isupper(c)) { + ret[0] = 'A'; + if (i < l) { + U16_BACK_1(str.c_str(), i, l); // decrements l + U16_GET(str.c_str(), 0, l, str.size(), c); + if (u_isupper(c)) { + ret[1] = 'A'; + } + } + } + return ret; +} + +UString +StringUtils::copycase(const UString& source, const UString& target) +{ + if (source.empty() || target.empty()) { + return target; + } + size_t i = 0; + size_t l = source.size(); + UChar32 c; + U16_NEXT(source.c_str(), i, l, c); + bool firstupper = u_isupper(c); + bool uppercase = false; + if (firstupper) { + if (i != l) { + U16_BACK_1(source.c_str(), i, l); // decrements l + U16_GET(source.c_str(), 0, l, source.size(), c); + uppercase = u_isupper(c); + } + } + if (firstupper) { + if (uppercase) { + return toupper(target); + } else { + return totitle(target); + } + } else { + return tolower(target); + } +} + +bool +StringUtils::caseequal(const UString& a, const UString& b) +{ + UErrorCode err = U_ZERO_ERROR; + int cmp = u_strCaseCompare(a.c_str(), -1, b.c_str(), -1, 0, &err); + if (U_FAILURE(err)) { + std::cerr << "Error: caseless string comparison failed on '"; + std::cerr << a << "' and '" << b << "'" << std::endl; + std::cerr << "error code: " << u_errorName(err) << std::endl; + exit(EXIT_FAILURE); + } + return (cmp == 0); +} diff --git a/lttoolbox/string_utils.h b/lttoolbox/string_utils.h new file mode 100644 index 0000000..79aeadf --- /dev/null +++ b/lttoolbox/string_utils.h @@ -0,0 +1,38 @@ +#ifndef __LT_STRING_UTILS_H__ +#define __LT_STRING_UTILS_H__ + +#include +#include + +class StringUtils { +public: + // delete leading and trailing whitespace + static UString trim(const UString& str); + + // split string on delimiter + static std::vector split(const UString& str, const UString& delim); + + // inverse of split + static UString join(const std::vector& vec, const UString& delim); + + // replace each occurrence of olds with news + static UString substitute(const UString& str, const UString& olds, const UString& news); + + static UString itoa(int n); + static std::string itoa_string(int n); + static UString ftoa(double f); + // these throw std::invalid_argument if parsing fails + static int stoi(const UString& str); + static double stod(const UString& str); + + static UString tolower(const UString& str); + static UString toupper(const UString& str); + static UString totitle(const UString& str); + + static UString getcase(const UString& str); + static UString copycase(const UString& source, const UString& target); + + static bool caseequal(const UString& a, const UString& b); +}; + +#endif // __LT_STRING_UTILS_H__ diff --git a/lttoolbox/tmx_compiler.cc b/lttoolbox/tmx_compiler.cc index 39113ee..db0394b 100644 --- a/lttoolbox/tmx_compiler.cc +++ b/lttoolbox/tmx_compiler.cc @@ -19,37 +19,38 @@ #include #include #include -#include #include #include #include -#ifdef _WIN32 -#define swprintf _snwprintf -#endif - using namespace std; -wstring const TMXCompiler::TMX_COMPILER_TMX_ELEM = L"tmx"; -wstring const TMXCompiler::TMX_COMPILER_HEADER_ELEM = L"header"; -wstring const TMXCompiler::TMX_COMPILER_BODY_ELEM = L"body"; -wstring const TMXCompiler::TMX_COMPILER_TU_ELEM = L"tu"; -wstring const TMXCompiler::TMX_COMPILER_TUV_ELEM = L"tuv"; -wstring const TMXCompiler::TMX_COMPILER_HI_ELEM = L"hi"; -wstring const TMXCompiler::TMX_COMPILER_PH_ELEM = L"ph"; -wstring const TMXCompiler::TMX_COMPILER_XMLLANG_ATTR = L"xml:lang"; -wstring const TMXCompiler::TMX_COMPILER_LANG_ATTR = L"lang"; -wstring const TMXCompiler::TMX_COMPILER_SEG_ELEM = L"seg"; -wstring const TMXCompiler::TMX_COMPILER_PROP_ELEM = L"prop"; +UString const TMXCompiler::TMX_COMPILER_TMX_ELEM = "tmx"_u; +UString const TMXCompiler::TMX_COMPILER_HEADER_ELEM = "header"_u; +UString const TMXCompiler::TMX_COMPILER_BODY_ELEM = "body"_u; +UString const TMXCompiler::TMX_COMPILER_TU_ELEM = "tu"_u; +UString const TMXCompiler::TMX_COMPILER_TUV_ELEM = "tuv"_u; +UString const TMXCompiler::TMX_COMPILER_HI_ELEM = "hi"_u; +UString const TMXCompiler::TMX_COMPILER_PH_ELEM = "ph"_u; +UString const TMXCompiler::TMX_COMPILER_XMLLANG_ATTR = "xml:lang"_u; +UString const TMXCompiler::TMX_COMPILER_LANG_ATTR = "lang"_u; +UString const TMXCompiler::TMX_COMPILER_SEG_ELEM = "seg"_u; +UString const TMXCompiler::TMX_COMPILER_PROP_ELEM = "prop"_u; +UString const TMXCompiler::TMX_COMPILER_TEXT_NODE = "#text"_u; +UString const TMXCompiler::TMX_COMPILER_COMMENT_NODE = "#comment"_u; +UString const TMXCompiler::TMX_COMPILER_NUMBER_TAG = ""_u; +UString const TMXCompiler::TMX_COMPILER_BLANK_TAG = ""_u; TMXCompiler::TMXCompiler() : reader(0), default_weight(0.0000) { LtLocale::tryToSetLocale(); - alphabet.includeSymbol(L""); // -1 -> numbers - alphabet.includeSymbol(L""); // -2 -> blanks + alphabet.includeSymbol(TMX_COMPILER_NUMBER_TAG); // -1 -> numbers + alphabet.includeSymbol(TMX_COMPILER_BLANK_TAG); // -2 -> blanks + number_tag = alphabet(TMX_COMPILER_NUMBER_TAG); + blank_tag = alphabet(TMX_COMPILER_BLANK_TAG); } TMXCompiler::~TMXCompiler() @@ -57,14 +58,14 @@ TMXCompiler::~TMXCompiler() } void -TMXCompiler::parse(string const &file, wstring const &lo, wstring const &lm) +TMXCompiler::parse(string const &file, UString const &lo, UString const &lm) { origin_language = lo; meta_language = lm; reader = xmlReaderForFile(file.c_str(), NULL, 0); if(reader == NULL) { - wcerr << "Error: Cannot open '" << file << "'." << endl; + cerr << "Error: Cannot open '" << file << "'." << endl; exit(EXIT_FAILURE); } @@ -77,7 +78,7 @@ TMXCompiler::parse(string const &file, wstring const &lo, wstring const &lm) if(ret != 0) { - wcerr << L"Error: Parse error at the end of input." << endl; + cerr << "Error: Parse error at the end of input." << endl; } xmlFreeTextReader(reader); @@ -88,12 +89,12 @@ TMXCompiler::parse(string const &file, wstring const &lo, wstring const &lm) } void -TMXCompiler::requireEmptyError(wstring const &name) +TMXCompiler::requireEmptyError(UString const &name) { if(!xmlTextReaderIsEmptyElement(reader)) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; exit(EXIT_FAILURE); } } @@ -101,91 +102,91 @@ TMXCompiler::requireEmptyError(wstring const &name) bool TMXCompiler::allBlanks() { - bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + UString text = XMLParseUtil::readValue(reader); for(auto c : text) { - flag = flag && iswspace(c); + if (!u_isspace(c)) { + return false; + } } - - return flag; + return true; } void -TMXCompiler::skipBlanks(wstring &name) +TMXCompiler::skipBlanks(UString &name) { - while(name == L"#text" || name == L"#comment") + while(name == TMX_COMPILER_TEXT_NODE || name == TMX_COMPILER_COMMENT_NODE) { - if(name != L"#comment") + if(name != TMX_COMPILER_COMMENT_NODE) { if(!allBlanks()) { - wcerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << "): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } } void -TMXCompiler::skip(wstring &name, wstring const &elem) +TMXCompiler::skip(UString &name, UString const &elem) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); - while(name == L"#text" || name == L"#comment") + while(name == TMX_COMPILER_TEXT_NODE || name == TMX_COMPILER_COMMENT_NODE) { - if(name != L"#comment") + if(name != TMX_COMPILER_COMMENT_NODE) { if(!allBlanks()) { - wcerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << "): Invalid construction." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid construction." << endl; exit(EXIT_FAILURE); } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); } if(name != elem) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Expected '<" << elem << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Expected '<" << elem << ">'." << endl; exit(EXIT_FAILURE); } } -wstring -TMXCompiler::attrib(wstring const &name) +UString +TMXCompiler::attrib(UString const &name) { return XMLParseUtil::attrib(reader, name); } void -TMXCompiler::requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname) +TMXCompiler::requireAttribute(UString const &value, UString const &attrname, + UString const &elemname) { - if(value == L"") + if(value.empty()) { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): '<" << elemname; - wcerr << L"' element must specify non-void '"; - wcerr << attrname << L"' attribute." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): '<" << elemname; + cerr << "' element must specify non-void '"; + cerr << attrname << "' attribute." << endl; exit(EXIT_FAILURE); } } -wstring +UString TMXCompiler::getTag(size_t const &val) const { - wchar_t cad[32]; - swprintf(cad, 32, L"<%d>", val); + UChar cad[32]; + u_snprintf(cad, 32, "<%d>", val); return cad; } @@ -197,7 +198,7 @@ TMXCompiler::insertTU(vector const &origin, vector const &meta) return; } - if(origin[0] == alphabet(L"") || meta[0] == alphabet(L"")) + if(origin[0] == blank_tag || meta[0] == blank_tag) { return; } @@ -273,12 +274,10 @@ TMXCompiler::align_blanks(vector &o, vector &m) vector puntos; vector resultado_o, resultado_m; - int const symbol = alphabet(L""); - vector > so, sm; - split(o, so, symbol); - split(m, sm, symbol); + split(o, so, blank_tag); + split(m, sm, blank_tag); if(so.size() == sm.size()) { @@ -288,8 +287,8 @@ TMXCompiler::align_blanks(vector &o, vector &m) trim(sm[i]); if(sm.size() - 1 != i) { - sm[i].push_back(L'('); - sm[i].push_back(L'#'); + sm[i].push_back('('); + sm[i].push_back('#'); } /* while(so[i].size() < sm[i].size()) @@ -301,8 +300,8 @@ TMXCompiler::align_blanks(vector &o, vector &m) sm[i].push_back(0); }*/ } - o = join(so, L' '); - m = join(sm, L')'); + o = join(so, ' '); + m = join(sm, ')'); } else { @@ -315,19 +314,19 @@ TMXCompiler::align_blanks(vector &o, vector &m) trim(sm[i]); if(sm.size() - 1 != i) { - sm[i].push_back(L'('); - sm[i].push_back(L'#'); + sm[i].push_back('('); + sm[i].push_back('#'); } } - o = join(so, L' '); - m = join(sm, L')'); + o = join(so, ' '); + m = join(sm, ')'); } } void TMXCompiler::procTU() { - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + UString name = XMLParseUtil::readName(reader); int type = xmlTextReaderNodeType(reader); vector origin; vector meta; @@ -337,9 +336,8 @@ TMXCompiler::procTU() { if(name == TMX_COMPILER_TUV_ELEM && type != XML_READER_TYPE_END_ELEMENT) { - wstring l = attrib(TMX_COMPILER_XMLLANG_ATTR); - if(l == L"") - { + UString l = attrib(TMX_COMPILER_XMLLANG_ATTR); + if(l.empty()) { l = attrib(TMX_COMPILER_LANG_ATTR); } @@ -360,57 +358,43 @@ TMXCompiler::procTU() while(name != TMX_COMPILER_TUV_ELEM || type != XML_READER_TYPE_END_ELEMENT) { xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); type = xmlTextReaderNodeType(reader); - if(name == L"#text") + if(name == TMX_COMPILER_TEXT_NODE) { - wstring l = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - for(size_t i = 0, limit = l.size(); i != limit; i++) - { - ref->push_back(l[i]); - } + XMLParseUtil::readValueInto32(reader, *ref); } else if(name == TMX_COMPILER_HI_ELEM || name == TMX_COMPILER_PH_ELEM) { if(type != XML_READER_TYPE_END_ELEMENT) { - ref->push_back(alphabet(L"")); + ref->push_back(blank_tag); } } } } xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + name = XMLParseUtil::readName(reader); type = xmlTextReaderNodeType(reader); } trim(origin); trim(meta); -// wcout << L"DESPUES DE TRIM\n"; -// printvector(origin); -// printvector(meta); align(origin, meta); -// wcout << L"DESPUES DE ALIGN\n"; -// printvector(origin); -// printvector(meta); align_blanks(origin, meta); -// wcout << L"DESPUES DE ALIGNBLANKS\n"; -// printvector(origin); -// printvector(meta); insertTU(origin, meta); } void TMXCompiler::procNode() { - xmlChar const *xname = xmlTextReaderConstName(reader); - wstring name = XMLParseUtil::towstring(xname); + UString name = XMLParseUtil::readName(reader); // HACER: optimizar el orden de ejecución de esta ristra de "ifs" - if(name == L"#text") + if(name == TMX_COMPILER_TEXT_NODE) { /* ignorar */ } @@ -434,14 +418,14 @@ TMXCompiler::procNode() { procTU(); } - else if(name== L"#comment") + else if(name== TMX_COMPILER_COMMENT_NODE) { /* ignorar */ } else { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << name << L">'." << endl; + cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); + cerr << "): Invalid node '<" << name << ">'." << endl; exit(EXIT_FAILURE); } } @@ -449,24 +433,24 @@ TMXCompiler::procNode() void TMXCompiler::write(FILE *output) { - fwrite(HEADER_LTTOOLBOX, 1, 4, output); + fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); uint64_t features = 0; write_le(output, features); // letters (empty to keep the file format) - Compression::wstring_write(L"", output); + Compression::multibyte_write(0, output); // symbols alphabet.write(output); - // transducers + // transducers (1, with empty name) Compression::multibyte_write(1, output); // keeping file format - Compression::wstring_write(L"", output); // keeping file format + Compression::multibyte_write(0, output); // keeping file format transducer.write(output); - wcout << origin_language << L"->" << meta_language << L" "; - wcout << transducer.size() << L" " << transducer.numberOfTransitions(); - wcout << endl; + cout << origin_language << "->" << meta_language << " "; + cout << transducer.size() << " " << transducer.numberOfTransitions(); + cout << endl; } void @@ -474,7 +458,7 @@ TMXCompiler::trim(vector &v) const { while(v.size() > 0) { - if(iswspace(v[v.size()-1])) + if(u_isspace(v[v.size()-1])) { v.pop_back(); } @@ -488,7 +472,7 @@ TMXCompiler::trim(vector &v) const vector aux; for(auto c : v) { - if(!iswspace(c) || !principio) + if(!u_isspace(c) || !principio) { principio = false; aux.push_back(c); @@ -514,7 +498,7 @@ TMXCompiler::align(vector &origin, vector &meta) numbers_origin_start.push_back(i); numbers_origin_length.push_back(nl); i += nl-1; - modified_origin.push_back(alphabet(L"")); + modified_origin.push_back(number_tag); } else { @@ -536,16 +520,15 @@ TMXCompiler::align(vector &origin, vector &meta) if(vectorcmp(origin, numbers_origin_start[j], meta, i, nl)) { - modified_meta.push_back(L'@'); - modified_meta.push_back(L'('); - wchar_t *valor = new wchar_t[8]; - swprintf(valor, 8, L"%d", j+1); - for(int k = 0, limit3 = wcslen(valor); k != limit3; k++) + modified_meta.push_back('@'); + modified_meta.push_back('('); + UChar valor[8]{}; + int limit3 = u_snprintf(valor, 8, "%d", j+1); + for(int k = 0; k != limit3; k++) { modified_meta.push_back(valor[k]); } - delete[] valor; - modified_meta.push_back(L')'); + modified_meta.push_back(')'); i += nl-1; tocado = true; break; @@ -582,7 +565,7 @@ TMXCompiler::numberLength(vector &v, unsigned int const position) const { for(unsigned int i = position, limit = v.size(); i < limit; i++) { - if(!iswdigit(v[i]) && (v[i] != L'.' || i == position) && (v[i] != L',' || i == position)) + if(!u_isdigit(v[i]) && (v[i] != '.' || i == position) && (v[i] != ',' || i == position)) { if(i == position) { @@ -593,7 +576,7 @@ TMXCompiler::numberLength(vector &v, unsigned int const position) const while(i != position) { i--; - if(iswdigit(v[i])) + if(u_isdigit(v[i])) { return i - position + 1; } @@ -607,7 +590,7 @@ TMXCompiler::numberLength(vector &v, unsigned int const position) const while(i != position) { i--; - if(iswdigit(v[i])) + if(u_isdigit(v[i])) { return i - position + 1; } @@ -634,34 +617,13 @@ TMXCompiler::vectorcmp(vector const &orig, unsigned int const begin_orig, } void -TMXCompiler::printvector(vector const &v, wostream &os) -{ - for(unsigned int i = 0, limit = v.size(); i != limit; i++) - { - if(i != 0) - { - os << L" "; - } - if(v[i] > 31) - { - os << v[i] << L" ('" << wchar_t(v[i]) << L"')"; - } - else - { - os << v[i]; - } - } - os << endl; -} - -void -TMXCompiler::setOriginLanguageCode(wstring const &code) +TMXCompiler::setOriginLanguageCode(UString const &code) { // nada } void -TMXCompiler::setMetaLanguageCode(wstring const &code) +TMXCompiler::setMetaLanguageCode(UString const &code) { // nada } diff --git a/lttoolbox/tmx_compiler.h b/lttoolbox/tmx_compiler.h index 53bb4b5..9cf9595 100644 --- a/lttoolbox/tmx_compiler.h +++ b/lttoolbox/tmx_compiler.h @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -60,22 +59,25 @@ private: /** * Origin language */ - wstring origin_language; + UString origin_language; /** * Meta language */ - wstring meta_language; + UString meta_language; /** * Origin language code in the TMX */ - wstring origin_language_inner_code; + UString origin_language_inner_code; /** * Origin language code in the TMX */ - wstring meta_language_inner_code; + UString meta_language_inner_code; + + int32_t number_tag; + int32_t blank_tag; /** @@ -100,26 +102,26 @@ private: * @param name the name of the attribute * @return the value of the attribute */ - wstring attrib(wstring const &name); + UString attrib(UString const &name); /** * Skip all document #text nodes before "elem" * @param name the name of the node * @param elem the name of the expected node */ - void skip(wstring &name, wstring const &elem); + void skip(UString &name, UString const &elem); /** * Skip all blank #text nodes before "name" * @param name the name of the node */ - void skipBlanks(wstring &name); + void skipBlanks(UString &name); /** * Force an element to be empty, and check for it * @param name the element */ - void requireEmptyError(wstring const &name); + void requireEmptyError(UString const &name); /** * Force an attribute to be specified, amd check for it @@ -127,8 +129,8 @@ private: * @param attrname the name of the attribute * @param elemname the parent of the attribute */ - void requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname); + void requireAttribute(UString const &value, UString const &attrname, + UString const &elemname); /** * True if all the elements in the current node are blanks @@ -136,7 +138,7 @@ private: */ bool allBlanks(); - wstring getTag(size_t const &val) const; + UString getTag(size_t const &val) const; void trim(vector &v) const; void align(vector &origin, vector &meta); unsigned int numberLength(vector &v, unsigned int const position) const; @@ -147,25 +149,27 @@ private: void align_blanks(vector &o, vector &m); vector join(vector > const &v, int const s) const; - static void printvector(vector const &v, wostream &wos = std::wcout); //eliminar este método - public: /* * Constants to represent the element and the attributes of * translation memories in TMX format */ - static wstring const TMX_COMPILER_TMX_ELEM; - static wstring const TMX_COMPILER_HEADER_ELEM; - static wstring const TMX_COMPILER_BODY_ELEM; - static wstring const TMX_COMPILER_TU_ELEM; - static wstring const TMX_COMPILER_TUV_ELEM; - static wstring const TMX_COMPILER_HI_ELEM; - static wstring const TMX_COMPILER_PH_ELEM; - static wstring const TMX_COMPILER_XMLLANG_ATTR; - static wstring const TMX_COMPILER_LANG_ATTR; - static wstring const TMX_COMPILER_SEG_ELEM; - static wstring const TMX_COMPILER_PROP_ELEM; + static UString const TMX_COMPILER_TMX_ELEM; + static UString const TMX_COMPILER_HEADER_ELEM; + static UString const TMX_COMPILER_BODY_ELEM; + static UString const TMX_COMPILER_TU_ELEM; + static UString const TMX_COMPILER_TUV_ELEM; + static UString const TMX_COMPILER_HI_ELEM; + static UString const TMX_COMPILER_PH_ELEM; + static UString const TMX_COMPILER_XMLLANG_ATTR; + static UString const TMX_COMPILER_LANG_ATTR; + static UString const TMX_COMPILER_SEG_ELEM; + static UString const TMX_COMPILER_PROP_ELEM; + static UString const TMX_COMPILER_TEXT_NODE; + static UString const TMX_COMPILER_COMMENT_NODE; + static UString const TMX_COMPILER_NUMBER_TAG; + static UString const TMX_COMPILER_BLANK_TAG; /** @@ -181,7 +185,7 @@ public: /** * Compile dictionary to letter transducers */ - void parse(string const &file, wstring const &lo, wstring const &lm); + void parse(string const &file, UString const &lo, UString const &lm); /** * Write the result of compilation @@ -193,13 +197,13 @@ public: * Set origin language inner code * @param code the code of the origin language into the TMX file being compiled */ - void setOriginLanguageCode(wstring const &code); + void setOriginLanguageCode(UString const &code); /** * Set meta language inner code * @param code the code of the meta language into the TMX file being compiled */ - void setMetaLanguageCode(wstring const &code); + void setMetaLanguageCode(UString const &code); }; diff --git a/lttoolbox/trans_exe.cc b/lttoolbox/trans_exe.cc index ce39ff6..4dcc5aa 100644 --- a/lttoolbox/trans_exe.cc +++ b/lttoolbox/trans_exe.cc @@ -18,6 +18,7 @@ #include #include #include +#include TransExe::TransExe(): initial_id(0), @@ -70,7 +71,7 @@ TransExe::read(FILE *input, Alphabet const &alphabet) fpos_t pos; if (fgetpos(input, &pos) == 0) { char header[4]{}; - fread(header, 1, 4, input); + fread_unlocked(header, 1, 4, input); if (strncmp(header, HEADER_TRANSDUCER, 4) == 0) { auto features = read_le(input); if (features >= TDF_UNKNOWN) { diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index b37ae96..e9e6454 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -24,6 +24,24 @@ #include #include #include +#include + +UString const Transducer::HFST_EPSILON_SYMBOL_SHORT = "@0@"_u; +UString const Transducer::HFST_EPSILON_SYMBOL_LONG = "@_EPSILON_SYMBOL_@"_u; +// could extend the ""_u helper to include u""_u +// this is the only place that needs it +UString const Transducer::LTTB_EPSILON_SYMBOL = UString(1, (UChar)0x3B5); + // = "ε"_u; +UString const Transducer::HFST_SPACE_SYMBOL = "@_SPACE_@"_u; +UString const Transducer::HFST_TAB_SYMBOL = "@_TAB_@"_u; +UString const Transducer::GROUP_SYMBOL = "#"_u; +UString const Transducer::JOIN_SYMBOL = "+"_u; +UString const Transducer::ANY_TAG_SYMBOL = ""_u; +UString const Transducer::ANY_CHAR_SYMBOL = ""_u; +UString const Transducer::LSX_BOUNDARY_SYMBOL = "<$>"_u; +UString const Transducer::COMPOUND_ONLY_L_SYMBOL = ""_u; +UString const Transducer::COMPOUND_R_SYMBOL = ""_u; + int Transducer::newState() @@ -170,8 +188,8 @@ Transducer::linkStates(int const source, int const target, } else { - wcerr << L"Error: Trying to link nonexistent states (" << source; - wcerr << L", " << target << L", " << tag << L")" << endl; + cerr << "Error: Trying to link nonexistent states (" << source; + cerr << ", " << target << ", " << tag << ")" << endl; exit(EXIT_FAILURE); } } @@ -189,7 +207,7 @@ Transducer::setFinal(int const state, double const weight, bool value) int initial_copy = getInitial(); if(state == initial_copy) { - wcerr << L"Setting initial state to final" << endl; + cerr << "Setting initial state to final" << endl; } */ if(value) @@ -261,7 +279,7 @@ Transducer::joinFinals(int const epsilon_tag) } else if(finals.size() == 0) { - wcerr << L"Error: empty set of final states" < finals_state; + for(auto& it : finals) { + finals_state.insert(it.first); + } + while(size_Q_prime != Q_prime.size()) { size_Q_prime = Q_prime.size(); @@ -326,11 +349,6 @@ Transducer::determinize(int const epsilon_tag) for(auto& it : R[t]) { - set finals_state; - for(auto& it2 : finals) - { - finals_state.insert(it2.first); - } if(!isEmptyIntersection(Q_prime[it], finals_state)) { double w = default_weight; @@ -378,8 +396,8 @@ Transducer::determinize(int const epsilon_tag) t = (t+1)%2; } - transitions = transitions_prime; - finals = finals_prime; + transitions.swap(transitions_prime); + finals.swap(finals_prime); initial = initial_prime; } @@ -517,7 +535,7 @@ bool Transducer::weighted() { void Transducer::write(FILE *output, int const decalage) { - fwrite(HEADER_TRANSDUCER, 1, 4, output); + fwrite_unlocked(HEADER_TRANSDUCER, 1, 4, output); bool write_weights = weighted(); @@ -578,7 +596,7 @@ Transducer::read(FILE *input, int const decalage) fpos_t pos; if (fgetpos(input, &pos) == 0) { char header[4]{}; - fread(header, 1, 4, input); + fread_unlocked(header, 1, 4, input); if (strncmp(header, HEADER_TRANSDUCER, 4) == 0) { auto features = read_le(input); if (features >= TDF_UNKNOWN) { @@ -713,61 +731,58 @@ Transducer::reverse(int const epsilon_tag) } void -Transducer::escapeSymbol(wstring& symbol, bool hfst) const +Transducer::escapeSymbol(UString& symbol, bool hfst) const { - if(symbol == L"") // If it's an epsilon + if(symbol.empty()) // If it's an epsilon { if(hfst) { - symbol = L"@0@"; + symbol = HFST_EPSILON_SYMBOL_SHORT; } else { - symbol = L"ε"; + symbol = LTTB_EPSILON_SYMBOL; } } - else if(hfst && symbol == L" ") + else if(hfst && symbol == " "_u) { - symbol = L"@_SPACE_@"; + symbol = HFST_SPACE_SYMBOL; } - else if(hfst && symbol == L"\t") + else if(hfst && symbol == "\t"_u) { - symbol = L"@_TAB_@"; + symbol = HFST_TAB_SYMBOL; } } void -Transducer::show(Alphabet const &alphabet, FILE *output, int const epsilon_tag, bool hfst) const +Transducer::show(Alphabet const &alphabet, UFILE *output, int const epsilon_tag, bool hfst) const { for(auto& it : transitions) { for(auto& it2 : it.second) { auto t = alphabet.decode(it2.first); - fwprintf(output, L"%d\t", it.first); - fwprintf(output, L"%d\t", it2.second.first); - wstring l = L""; + u_fprintf(output, "%d\t%d\t", it.first, it2.second.first); + UString l; alphabet.getSymbol(l, t.first); escapeSymbol(l, hfst); - fwprintf(output, L"%ls\t", l.c_str()); - wstring r = L""; + u_fprintf(output, "%S\t", l.c_str()); + UString r; alphabet.getSymbol(r, t.second); escapeSymbol(r, hfst); - fwprintf(output, L"%ls\t", r.c_str()); - fwprintf(output, L"%f\t", it2.second.second); - fwprintf(output, L"\n"); + u_fprintf(output, "%S\t", r.c_str()); + u_fprintf(output, "%f\t\n", it2.second.second); } } for(auto& it3 : finals) { - fwprintf(output, L"%d\t", it3.first); - fwprintf(output, L"%f\n", it3.second); + u_fprintf(output, "%d\t%f\n", it3.first, it3.second); } } void -Transducer::show(Alphabet const &alphabet, FILE *output, int const epsilon_tag) const +Transducer::show(Alphabet const &alphabet, UFILE *output, int const epsilon_tag) const { return show(alphabet, output, epsilon_tag, false); } @@ -789,7 +804,7 @@ Transducer::getStateSize(int const state) } bool -Transducer::recognise(wstring pattern, Alphabet &a, FILE *err) +Transducer::recognise(UString pattern, Alphabet &a, FILE *err) { bool accepted = false; set states; @@ -801,7 +816,7 @@ Transducer::recognise(wstring pattern, Alphabet &a, FILE *err) { set new_state; //Transducer::closure(int const state, int const epsilon_tag) // For each of the current alive states - //fwprintf(err, L"step: %ls %lc (%d)\n", pattern.c_str(), *it, sym); + //fprintf(err, "step: %ls %lc (%d)\n", pattern.c_str(), *it, sym); for(auto& it2 : states) { auto& p = transitions[it2]; @@ -811,19 +826,19 @@ Transducer::recognise(wstring pattern, Alphabet &a, FILE *err) { auto t = a.decode(it3.first); - wstring l = L""; + UString l; a.getSymbol(l, t.first); - //wstring r = L""; + //UString r; //a.getSymbol(r, t.second); - //fwprintf(err, L" -> state: %d, trans: %ls:%ls, targ: %d\n", *it2, (l == L"") ? L"ε" : l.c_str(), (r == L"") ? L"ε" : r.c_str(), it3->second); - //if(l.find(*it) != wstring::npos || l == L"" ) - if(l.find(it) != wstring::npos) + //fprintf(err, " -> state: %d, trans: %ls:%ls, targ: %d\n", *it2, (l.empty()) ? "ε" : l.c_str(), (r.empty()) ? "ε" : r.c_str(), it3->second); + //if(l.find(*it) != UString::npos || l.empty() ) + if(l.find(it) != UString::npos) { auto myclosure = closure(it3.second.first, 0); - //wcerr << L"Before closure alives: " < seen; @@ -1006,11 +1017,11 @@ Transducer::moveLemqsLast(Alphabet const &alphabet, { int label = trans_it.first, this_trg = trans_it.second.first; - wstring left = L""; + UString left; alphabet.getSymbol(left, alphabet.decode(label).first); int new_src = states_this_new[this_src]; - if(left == COMPILER_GROUP_ELEM) + if(left == GROUP_SYMBOL) { Transducer tagsFirst = copyWithTagsFirst(this_trg, label, alphabet, epsilon_tag); new_t.finals.insert(make_pair( @@ -1055,16 +1066,6 @@ Transducer::intersect(Transducer &trimmer, * The trimmer is typically a bidix passed through appendDotStar. */ - // TODO: These should be in file which is included by both - // fst_processor.cc and compiler.cc: - wstring compoundOnlyLSymbol = L""; - wstring compoundRSymbol = L""; - wstring COMPILER_JOIN_ELEM = L"+"; - wstring COMPILER_GROUP_ELEM = L"#"; - wstring COMPILER_ANY_TAG = L""; - wstring COMPILER_ANY_CHAR = L""; - wstring COMPILER_SEPARABLE_BOUNDARY = L"<$>"; - // When searching, we need to record (this, (trimmer, trimmer_pre_plus)) typedef std::pair > SearchState; // first: currently searched state in this; @@ -1095,7 +1096,7 @@ Transducer::intersect(Transducer &trimmer, trimmer_preplus_next = trimmer_preplus; if(states_this_trimmed.find(current) == states_this_trimmed.end()) { - wcerr <. + */ + +#include "ustring.h" + +#include +#include +#include +#include + +using namespace icu; + +void +write(const UString& str, UFILE* output) +{ + // u_fputs() inserts a newline + u_fprintf(output, "%S", str.c_str()); +} + +UString +to_ustring(const char* s) +{ + return to_ustring(reinterpret_cast(s)); +} + +UString +to_ustring(const uint8_t* s) +{ + auto sz = strlen(reinterpret_cast(s)); + UString ret; + ret.reserve(sz); + utf8::utf8to16(s, s+sz, std::back_inserter(ret)); + return ret; +} + +void +ustring_to_vec32(const UString& str, std::vector& vec) +{ + if (str.empty()) { + return; + } + + size_t i = 0; + size_t len = str.size(); + vec.reserve(vec.size() + str.size()); + int32_t c; + while (i < str.size()) { + U16_NEXT(str, i, len, c); + vec.push_back(c); + } +} diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h new file mode 100644 index 0000000..7068e28 --- /dev/null +++ b/lttoolbox/ustring.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LT_USTRING_H_ +#define _LT_USTRING_H_ + +#include +#include +#include +#include +#include + +typedef std::basic_string UString; + +void write(const UString& str, UFILE* output); + +UString to_ustring(const char* str); +UString to_ustring(const uint8_t* str); + +// append UTF-16 string to UTF-32 vector of symbols +void ustring_to_vec32(const UString& str, std::vector& vec); + +inline std::ostream& +operator<<(std::ostream& ostr, UChar c) +{ + ostr << std::hex << static_cast(c); + return ostr; +} + +inline std::ostream& +operator<<(std::ostream& ostr, const UString& str) +{ + std::string res; + utf8::utf16to8(str.begin(), str.end(), std::back_inserter(res)); + ostr << res; + return ostr; +} + +inline UString operator "" _u(const char* str, std::size_t len) { + UString us(len, 0); + for (size_t i = 0; i < len; ++i) { + us[i] = str[i]; + } + return us; +} + +inline void operator+=(UString& str, UChar32 c) +{ + if (c <= 0xFFFF) { + str += static_cast(c); + } else { + str += static_cast(0xD800 + ((c - 0x10000) >> 10)); + str += static_cast(0xDC00 + (c & 0x3FF)); + } +} + +#endif diff --git a/lttoolbox/xml_parse_util.cc b/lttoolbox/xml_parse_util.cc index 3149900..0f5e3c5 100644 --- a/lttoolbox/xml_parse_util.cc +++ b/lttoolbox/xml_parse_util.cc @@ -18,128 +18,74 @@ #include #include +#include using namespace std; -wstring -XMLParseUtil::attrib(xmlTextReaderPtr reader, wstring const &name) +UString +XMLParseUtil::attrib(xmlTextReaderPtr reader, UString const &name) { - string mystr = ""; - for(int i = 0, limit = name.size(); i != limit; i++) - { - mystr += static_cast(name[i]); - } - - xmlChar *attrname = xmlCharStrdup(mystr.c_str()); - xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); - wstring result = towstring(myattr); - xmlFree(myattr); - xmlFree(attrname); - return result; + return attrib(reader, name, ""_u); } -wstring -XMLParseUtil::attrib(xmlTextReaderPtr reader, wstring const &name, const wstring fallback) +UString +XMLParseUtil::attrib(xmlTextReaderPtr reader, UString const& name, const UString& fallback) { - string mystr = ""; - for (int i = 0, limit = name.size(); i != limit; i++) { - mystr += static_cast(name[i]); - } - - xmlChar *attrname = xmlCharStrdup(mystr.c_str()); + std::string temp; + temp.reserve(name.size()); + utf8::utf16to8(name.begin(), name.end(), std::back_inserter(temp)); + const xmlChar *attrname = reinterpret_cast(temp.c_str()); xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); - wstring result = XMLParseUtil::towstring(myattr); - xmlFree(myattr); - xmlFree(attrname); if(myattr == NULL) { + xmlFree(myattr); return fallback; - } - else { + } else { + UString result = to_ustring(reinterpret_cast(myattr)); + xmlFree(myattr); return result; } } - -string -XMLParseUtil::latin1(xmlChar const *input) +std::string +XMLParseUtil::attrib_str(xmlTextReaderPtr reader, const UString& name) { - if(input == NULL) - { + std::string temp; + temp.reserve(name.size()); + utf8::utf16to8(name.begin(), name.end(), std::back_inserter(temp)); + const xmlChar *attrname = reinterpret_cast(temp.c_str()); + xmlChar *myattr = xmlTextReaderGetAttribute(reader, attrname); + if(myattr == NULL) { + xmlFree(myattr); return ""; + } else { + std::string result = reinterpret_cast(myattr); + xmlFree(myattr); + return result; } - - int outputlen = xmlStrlen(input) + 1; - int inputlen = xmlStrlen(input); - - unsigned char* output = new unsigned char[outputlen]; - - if(UTF8Toisolat1(output, &outputlen, input, &inputlen) != 0) - { - } - - output[outputlen] = 0; - string result = reinterpret_cast(output); - delete[] output; - return result; } -wstring -XMLParseUtil::towstring(xmlChar const * input) +UString +XMLParseUtil::readName(xmlTextReaderPtr reader) { - wstring result = L""; - - for(int i = 0, limit = xmlStrlen(input); i != limit; i++) - { - int val = 0; - if(((unsigned char) input[i] & 0x80) == 0x0) - { - val = static_cast(input[i]); - } - else if(((unsigned char) input[i] & 0xE0) == 0xC0) - { - val = (input[i] & 0x1F) << 6; - i++; - val += input[i] & 0x7F; - } - else if(((unsigned char) input[i] & 0xF0) == 0xE0) - { - val = (input[i] & 0x0F) << 6; - i++; - val += input[i] & 0x7F; - val = val << 6; - i++; - val += input[i] & 0x7F; - } - else if(((unsigned char) input[i] & 0xF8) == 0xF0) - { - val = (input[i] & 0x07) << 6; - i++; - val += input[i] & 0x7F; - val = val << 6; - i++; - val += input[i] & 0x7F; - val = val << 6; - i++; - val += input[i] & 0x7F; - } - else - { - wcerr << L"UTF-8 invalid string" << endl; - exit(EXIT_FAILURE); - } + const xmlChar* name = xmlTextReaderConstName(reader); + if (name == NULL) return ""_u; + return to_ustring(reinterpret_cast(name)); +} - result += static_cast(val); - } - return result; +UString +XMLParseUtil::readValue(xmlTextReaderPtr reader) +{ + const xmlChar* val = xmlTextReaderConstValue(reader); + if (val == NULL) return ""_u; + return to_ustring(reinterpret_cast(val)); } -wstring -XMLParseUtil::stows(string const &str) +void +XMLParseUtil::readValueInto32(xmlTextReaderPtr reader, vector& vec) { - wchar_t* result = new wchar_t[str.size()+1]; - size_t retval = mbstowcs(result, str.c_str(), str.size()); - result[retval] = L'\0'; - wstring result2 = result; - delete[] result; - return result2; + const xmlChar* val = xmlTextReaderConstValue(reader); + if (val == NULL) return; + auto sz = xmlStrlen(val); + vec.reserve(vec.size() + sz); + utf8::utf8to32(val, val+sz, std::back_inserter(vec)); } diff --git a/lttoolbox/xml_parse_util.h b/lttoolbox/xml_parse_util.h index beca741..9409cdd 100644 --- a/lttoolbox/xml_parse_util.h +++ b/lttoolbox/xml_parse_util.h @@ -19,8 +19,10 @@ #include #include +#include +#include +#include #include -#include using namespace std; @@ -29,14 +31,16 @@ class XMLParseUtil public: /* If attrib does not exist (or other error), returns an empty string: */ - static wstring attrib(xmlTextReaderPtr reader, wstring const &name); + static UString attrib(xmlTextReaderPtr reader, UString const &name); /* If attrib does not exist (or other error), returns fallback: */ - static wstring attrib(xmlTextReaderPtr reader, wstring const &name, const wstring fallback); + static UString attrib(xmlTextReaderPtr reader, UString const &name, const UString& fallback); - static string latin1(xmlChar const * input); // mark for deletion - static wstring towstring(xmlChar const * input); - static wstring stows(string const &str); + static string attrib_str(xmlTextReaderPtr reader, const UString& name); + + static UString readName(xmlTextReaderPtr reader); + static UString readValue(xmlTextReaderPtr reader); + static void readValueInto32(xmlTextReaderPtr reader, vector& vec); }; #endif diff --git a/lttoolbox/xml_walk_util.cc b/lttoolbox/xml_walk_util.cc new file mode 100644 index 0000000..8611556 --- /dev/null +++ b/lttoolbox/xml_walk_util.cc @@ -0,0 +1,65 @@ +#include + +children::children(xmlNode* node_) + : node(node_), cur(node->children) +{ + while (cur && cur->type != XML_ELEMENT_NODE) { + cur = cur->next; + } +} + +children::children(const children& it) + : node(it.node), cur(it.cur) +{} + +children::~children() +{} // we don't own the pointers, so we don't delete them + +children& +children::operator++() +{ + if (node && cur) { + cur = cur->next; + while (cur && cur->type != XML_ELEMENT_NODE) { + cur = cur->next; + } + } + return *this; +} + +children +children::begin() +{ + return children(node); +} + +children +children::end() +{ + children ret(node); + ret.cur = nullptr; + return ret; +} + +bool +children::operator!=(const children& other) const +{ + return node != other.node || cur != other.cur; +} + +bool +children::operator==(const children& other) const +{ + return node == other.node && cur == other.cur; +} + +UString +getattr(xmlNode* node, const char* attr) +{ + for (xmlAttr* i = node->properties; i != NULL; i = i->next) { + if (!xmlStrcmp(i->name, (const xmlChar*) attr)) { + return to_ustring((const char*) i->children->content); + } + } + return ""_u; +} diff --git a/lttoolbox/xml_walk_util.h b/lttoolbox/xml_walk_util.h new file mode 100644 index 0000000..13ca6a4 --- /dev/null +++ b/lttoolbox/xml_walk_util.h @@ -0,0 +1,29 @@ +#ifndef _XML_WALK_UTIL_ +#define _XML_WALK_UTIL_ + +#include + +#include +#include + +class children +{ +private: + xmlNode* node; + xmlNode* cur; +public: + children(xmlNode* node); + children(const children& it); + ~children(); + + children& operator++(); + children begin(); + children end(); + inline xmlNode* operator*() const { return cur; } + bool operator!=(const children& other) const; + bool operator==(const children& other) const; +}; + +UString getattr(xmlNode* node, const char* attr); + +#endif diff --git a/python/lttoolbox.i.in b/python/lttoolbox.i.in index d7362e1..f57fe92 100644 --- a/python/lttoolbox.i.in +++ b/python/lttoolbox.i.in @@ -54,8 +54,9 @@ public: void lt_proc(int argc, char **argv, char *input_path, char *output_path) { - FILE* input = fopen(input_path, "r"); - FILE* output = fopen(output_path, "w"); + InputFile input; + input.open(input_path); + UFILE* output = u_fopen(output_path, "w", NULL, NULL); int cmd = 0; int c = 0; optind = 1; @@ -103,8 +104,7 @@ public: break; } - fclose(input); - fclose(output); + u_fclose(output); } }; diff --git a/tests/data/arabic-punct.att b/tests/data/arabic-punct.att new file mode 100644 index 0000000..db8c154 --- /dev/null +++ b/tests/data/arabic-punct.att @@ -0,0 +1,9 @@ +0 1 ، ، 0.000 +0 1 ؛ ؛ 0.000 +0 1 ؟ ؟ 0.000 +0 2 a a 0.000 +0 2 b b 0.000 +1 3 @0@ 0.000 +2 4 @0@ 0.000 +3 0.000 +4 0.000 diff --git a/tests/data/non-bmp.att b/tests/data/non-bmp.att new file mode 100644 index 0000000..1a1f661 --- /dev/null +++ b/tests/data/non-bmp.att @@ -0,0 +1,34 @@ +0 1 𐅀 𐅀 0.000 +0 1 𐅁 𐅁 0.000 +0 1 𐅂 𐅂 0.000 +0 1 𐅃 𐅃 0.000 +0 1 𐅄 𐅄 0.000 +0 1 𐅅 𐅅 0.000 +0 1 𐅆 𐅆 0.000 +0 1 𐅇 𐅇 0.000 +0 1 𐅈 𐅈 0.000 +0 1 𐅉 𐅉 0.000 +0 1 𐅊 𐅊 0.000 +0 1 𐅋 𐅋 0.000 +0 1 𐅌 𐅌 0.000 +0 1 𐅍 𐅍 0.000 +0 1 𐅎 𐅎 0.000 +0 1 𐅏 𐅏 0.000 +1 1 𐅀 𐅀 0.000 +1 1 𐅁 𐅁 0.000 +1 1 𐅂 𐅂 0.000 +1 1 𐅃 𐅃 0.000 +1 1 𐅄 𐅄 0.000 +1 1 𐅅 𐅅 0.000 +1 1 𐅆 𐅆 0.000 +1 1 𐅇 𐅇 0.000 +1 1 𐅈 𐅈 0.000 +1 1 𐅉 𐅉 0.000 +1 1 𐅊 𐅊 0.000 +1 1 𐅋 𐅋 0.000 +1 1 𐅌 𐅌 0.000 +1 1 𐅍 𐅍 0.000 +1 1 𐅎 𐅎 0.000 +1 1 𐅏 𐅏 0.000 +1 2 @0@ 0.000 +2 0.000 diff --git a/tests/data/non-bmp.dix b/tests/data/non-bmp.dix new file mode 100644 index 0000000..161a161 --- /dev/null +++ b/tests/data/non-bmp.dix @@ -0,0 +1,15 @@ + + + 𐅀𐅁𐅂𐅃𐅄𐅅𐅆𐅇𐅈𐅉𐅊𐅋𐅌𐅍𐅎𐅏 + + + + + +

+
+
+
+ [𐅀𐅁𐅂𐅃𐅄𐅅𐅆𐅇𐅈𐅉𐅊𐅋𐅌𐅍𐅎𐅏]+ +
+
diff --git a/tests/lt_proc/__init__.py b/tests/lt_proc/__init__.py index fca9df9..2de472a 100644 --- a/tests/lt_proc/__init__.py +++ b/tests/lt_proc/__init__.py @@ -148,8 +148,8 @@ class PostgenerationBasicTest(ProcTest): "El perro ~de el amigo.", "abc ~les testword"] expectedOutputs = [ "xyz ejemplo u ho nombre.", - "xyz se la pelota.", - "El perro del amigo.", + "xyz se la pelota.", + "El perro del amigo.", "abc le pe test testword"] class PostgenerationWordboundBlankTest(ProcTest): @@ -228,5 +228,24 @@ class SpaceAtEOF(ProcTest): flushing = False +class NonBMPDixTest(ProcTest): + procdix = "data/non-bmp.dix" + inputs = ['𐅁𐅃𐅅', '𐅂𐅄𐅆'] + expectedOutputs = ['^𐅁𐅃𐅅/𐅁𐅃𐅅$', '^𐅂𐅄𐅆/𐅂𐅄𐅆$'] + + +class NonBMPATTTest(ProcTest): + procdix = "data/non-bmp.att" + inputs = ['𐅁𐅃𐅅', '𐅂𐅄𐅆'] + expectedOutputs = ['^𐅁𐅃𐅅/𐅁𐅃𐅅$', '^𐅂𐅄𐅆/𐅂𐅄𐅆$'] + + +class NonBMPGeneratorTest(ProcTest): + procdix = "data/non-bmp.att" + inputs = ['^𐅁𐅃𐅅$', '^𐅂𐅄𐅆$'] + expectedOutputs = ['𐅁𐅃𐅅', '𐅂𐅄𐅆'] + procflags = ['-z', '-g'] + procdir = "rl" + # These fail on some systems: #from null_flush_invalid_stream_format import * diff --git a/utf8/utf8.h b/utf8/utf8.h deleted file mode 100644 index c2c85d6..0000000 --- a/utf8/utf8.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "utf8/checked.h" -#include "utf8/unchecked.h" - -#if __cplusplus >= 201103L // C++ 11 or later -#include "utf8/cpp11.h" -#endif // C++ 11 or later - -#endif // header guard diff --git a/utf8/utf8/checked.h b/utf8/utf8/checked.h deleted file mode 100644 index c31861e..0000000 --- a/utf8/utf8/checked.h +++ /dev/null @@ -1,324 +0,0 @@ -// Copyright 2006-2016 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" -#include - -namespace utf8 -{ - // Base for the exceptions that may be thrown from the library - class exception : public ::std::exception { - }; - - // Exceptions that may be thrown from the library functions. - class invalid_code_point : public exception { - uint32_t cp; - public: - invalid_code_point(uint32_t codepoint) : cp(codepoint) {} - virtual const char* what() const throw() { return "Invalid code point"; } - uint32_t code_point() const {return cp;} - }; - - class invalid_utf8 : public exception { - uint8_t u8; - public: - invalid_utf8 (uint8_t u) : u8(u) {} - virtual const char* what() const throw() { return "Invalid UTF-8"; } - uint8_t utf8_octet() const {return u8;} - }; - - class invalid_utf16 : public exception { - uint16_t u16; - public: - invalid_utf16 (uint16_t u) : u16(u) {} - virtual const char* what() const throw() { return "Invalid UTF-16"; } - uint16_t utf16_word() const {return u16;} - }; - - class not_enough_room : public exception { - public: - virtual const char* what() const throw() { return "Not enough space"; } - }; - - /// The library API - functions intended to be called by the users - - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (!utf8::internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); - - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - out = utf8::append (replacement, out); - start = end; - break; - case internal::INVALID_LEAD: - out = utf8::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); - return utf8::replace_invalid(start, end, out, replacement_marker); - } - - template - uint32_t next(octet_iterator& it, octet_iterator end) - { - uint32_t cp = 0; - internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); - switch (err_code) { - case internal::UTF8_OK : - break; - case internal::NOT_ENOUGH_ROOM : - throw not_enough_room(); - case internal::INVALID_LEAD : - case internal::INCOMPLETE_SEQUENCE : - case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(*it); - case internal::INVALID_CODE_POINT : - throw invalid_code_point(cp); - } - return cp; - } - - template - uint32_t peek_next(octet_iterator it, octet_iterator end) - { - return utf8::next(it, end); - } - - template - uint32_t prior(octet_iterator& it, octet_iterator start) - { - // can't do much if it == start - if (it == start) - throw not_enough_room(); - - octet_iterator end = it; - // Go back until we hit either a lead octet or start - while (utf8::internal::is_trail(*(--it))) - if (it == start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - return utf8::peek_next(it, end); - } - - template - void advance (octet_iterator& it, distance_type n, octet_iterator end) - { - const distance_type zero(0); - if (n < zero) { - // backward - for (distance_type i = n; i < zero; ++i) - utf8::prior(it, end); - } else { - // forward - for (distance_type i = zero; i < n; ++i) - utf8::next(it, end); - } - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::next(first, last); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - if (start != end) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - if (utf8::internal::is_trail_surrogate(trail_surrogate)) - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - else - throw invalid_utf16(static_cast(trail_surrogate)); - } - else - throw invalid_utf16(static_cast(cp)); - - } - // Lone trail surrogate - else if (utf8::internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast(cp)); - - result = utf8::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - uint32_t cp = utf8::next(start, end); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::next(start, end); - - return result; - } - - // The iterator class - template - class iterator : public std::iterator { - octet_iterator it; - octet_iterator range_start; - octet_iterator range_end; - public: - iterator () {} - explicit iterator (const octet_iterator& octet_it, - const octet_iterator& rangestart, - const octet_iterator& rangeend) : - it(octet_it), range_start(rangestart), range_end(rangeend) - { - if (it < range_start || it > range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); - } - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::next(temp, range_end); - } - bool operator == (const iterator& rhs) const - { - if (range_start != rhs.range_start || range_end != rhs.range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - utf8::next(it, range_end); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - utf8::next(it, range_end); - return temp; - } - iterator& operator -- () - { - utf8::prior(it, range_start); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::prior(it, range_start); - return temp; - } - }; // class iterator - -} // namespace utf8 - -#endif //header guard - diff --git a/utf8/utf8/core.h b/utf8/utf8/core.h deleted file mode 100644 index b1f1eff..0000000 --- a/utf8/utf8/core.h +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include - -namespace utf8 -{ - // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers - // You may need to change them to match your system. - // These typedefs have the same names as ones from cstdint, or boost/cstdint - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - -// Helper code - not intended to be directly called by the library users. May be changed at any time -namespace internal -{ - // Unicode constants - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - const uint16_t LEAD_SURROGATE_MIN = 0xd800u; - const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; - const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) - const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN - - // Maximum valid value for a Unicode code point - const uint32_t CODE_POINT_MAX = 0x0010ffffu; - - template - inline uint8_t mask8(octet_type oc) - { - return static_cast(0xff & oc); - } - template - inline uint16_t mask16(u16_type oc) - { - return static_cast(0xffff & oc); - } - template - inline bool is_trail(octet_type oc) - { - return ((utf8::internal::mask8(oc) >> 6) == 0x2); - } - - template - inline bool is_lead_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); - } - - template - inline bool is_trail_surrogate(u16 cp) - { - return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_code_point_valid(u32 cp) - { - return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); - } - - template - inline typename std::iterator_traits::difference_type - sequence_length(octet_iterator lead_it) - { - uint8_t lead = utf8::internal::mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; - } - - template - inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) - { - if (cp < 0x80) { - if (length != 1) - return true; - } - else if (cp < 0x800) { - if (length != 2) - return true; - } - else if (cp < 0x10000) { - if (length != 3) - return true; - } - - return false; - } - - enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; - - /// Helper for get_sequence_x - template - utf_error increase_safely(octet_iterator& it, octet_iterator end) - { - if (++it == end) - return NOT_ENOUGH_ROOM; - - if (!utf8::internal::is_trail(*it)) - return INCOMPLETE_SEQUENCE; - - return UTF8_OK; - } - - #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} - - /// get_sequence_x functions decode utf-8 sequences of the length x - template - utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - return UTF8_OK; - } - - template - utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); - - return UTF8_OK; - } - - template - utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - template - utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR - - template - utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - octet_iterator original_it = it; - - uint32_t cp = 0; - // Determine the sequence length based on the lead octet - typedef typename std::iterator_traits::difference_type octet_difference_type; - const octet_difference_type length = utf8::internal::sequence_length(it); - - // Get trail octets and calculate the code point - utf_error err = UTF8_OK; - switch (length) { - case 0: - return INVALID_LEAD; - case 1: - err = utf8::internal::get_sequence_1(it, end, cp); - break; - case 2: - err = utf8::internal::get_sequence_2(it, end, cp); - break; - case 3: - err = utf8::internal::get_sequence_3(it, end, cp); - break; - case 4: - err = utf8::internal::get_sequence_4(it, end, cp); - break; - } - - if (err == UTF8_OK) { - // Decoding succeeded. Now, security checks... - if (utf8::internal::is_code_point_valid(cp)) { - if (!utf8::internal::is_overlong_sequence(cp, length)){ - // Passed! Return here. - code_point = cp; - ++it; - return UTF8_OK; - } - else - err = OVERLONG_SEQUENCE; - } - else - err = INVALID_CODE_POINT; - } - - // Failure branch - restore the original value of the iterator - it = original_it; - return err; - } - - template - inline utf_error validate_next(octet_iterator& it, octet_iterator end) { - uint32_t ignored; - return utf8::internal::validate_next(it, end, ignored); - } - -} // namespace internal - - /// The library API - functions intended to be called by the users - - // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; - - template - octet_iterator find_invalid(octet_iterator start, octet_iterator end) - { - octet_iterator result = start; - while (result != end) { - utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); - if (err_code != internal::UTF8_OK) - return result; - } - return result; - } - - template - inline bool is_valid(octet_iterator start, octet_iterator end) - { - return (utf8::find_invalid(start, end) == end); - } - - template - inline bool starts_with_bom (octet_iterator it, octet_iterator end) - { - return ( - ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && - ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && - ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) - ); - } -} // namespace utf8 - -#endif // header guard - - diff --git a/utf8/utf8/cpp11.h b/utf8/utf8/cpp11.h deleted file mode 100644 index 77771ff..0000000 --- a/utf8/utf8/cpp11.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2018 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 -#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 - -#include "checked.h" -#include - -namespace utf8 -{ - - inline void append(char32_t cp, std::string& s) - { - append(uint32_t(cp), std::back_inserter(s)); - } - - inline std::string utf16to8(const std::u16string& s) - { - std::string result; - utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u16string utf8to16(const std::string& s) - { - std::u16string result; - utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::string utf32to8(const std::u32string& s) - { - std::string result; - utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u32string utf8to32(const std::string& s) - { - std::u32string result; - utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::size_t find_invalid(const std::string& s) - { - std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? std::string::npos : (invalid - s.begin()); - } - - inline bool is_valid(const std::string& s) - { - return is_valid(s.begin(), s.end()); - } - - inline std::string replace_invalid(const std::string& s, char32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline std::string replace_invalid(const std::string& s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline bool starts_with_bom(const std::string& s) - { - return starts_with_bom(s.begin(), s.end()); - } - -} // namespace utf8 - -#endif // header guard - diff --git a/utf8/utf8/unchecked.h b/utf8/utf8/unchecked.h deleted file mode 100644 index 5ca6eb7..0000000 --- a/utf8/utf8/unchecked.h +++ /dev/null @@ -1,269 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" - -namespace utf8 -{ - namespace unchecked - { - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - out = utf8::unchecked::append (replacement, out); - start = end; - break; - case internal::INVALID_LEAD: - out = utf8::unchecked::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::unchecked::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); - return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); - } - - template - uint32_t next(octet_iterator& it) - { - uint32_t cp = utf8::internal::mask8(*it); - typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); - switch (length) { - case 1: - break; - case 2: - it++; - cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); - break; - case 3: - ++it; - cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - ++it; - cp += (*it) & 0x3f; - break; - case 4: - ++it; - cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - ++it; - cp += (utf8::internal::mask8(*it) << 6) & 0xfff; - ++it; - cp += (*it) & 0x3f; - break; - } - ++it; - return cp; - } - - template - uint32_t peek_next(octet_iterator it) - { - return utf8::unchecked::next(it); - } - - template - uint32_t prior(octet_iterator& it) - { - while (utf8::internal::is_trail(*(--it))) ; - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - - template - void advance (octet_iterator& it, distance_type n) - { - const distance_type zero(0); - if (n < zero) { - // backward - for (distance_type i = n; i < zero; ++i) - utf8::unchecked::prior(it); - } else { - // forward - for (distance_type i = zero; i < n; ++i) - utf8::unchecked::next(it); - } - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::unchecked::next(first); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - } - result = utf8::unchecked::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - uint32_t cp = utf8::unchecked::next(start); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::unchecked::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::unchecked::next(start); - - return result; - } - - // The iterator class - template - class iterator : public std::iterator { - octet_iterator it; - public: - iterator () {} - explicit iterator (const octet_iterator& octet_it): it(octet_it) {} - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - bool operator == (const iterator& rhs) const - { - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - ::std::advance(it, utf8::internal::sequence_length(it)); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - ::std::advance(it, utf8::internal::sequence_length(it)); - return temp; - } - iterator& operator -- () - { - utf8::unchecked::prior(it); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::unchecked::prior(it); - return temp; - } - }; // class iterator - - } // namespace utf8::unchecked -} // namespace utf8 - - -#endif // header guard - diff --git a/utf8/utf8_fwrap.h b/utf8/utf8_fwrap.h deleted file mode 100644 index 5d41b6b..0000000 --- a/utf8/utf8_fwrap.h +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef _UTF8_FWRAP_HPP -#define _UTF8_FWRAP_HPP - -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 - #define utf32to8 utf16to8 -#endif - -inline wint_t fgetwc_u8(FILE *in) { -#ifdef _WIN32 - struct _cps { - FILE *f = 0; - wchar_t c = 0; - }; - static _cps cps[4]; - - for (auto& cp : cps) { - if (cp.f == in) { - cp.f = 0; - return cp.c; - } - } -#endif - - int32_t rv = 0; - int c = 0, i = 0; - char buf[4]; - if ((c = fgetc_unlocked(in)) != EOF) { - buf[i++] = static_cast(c); - if ((c & 0xF0) == 0xF0) { - if (fread_unlocked(buf+i, 1, 3, in) != 3) { - throw std::runtime_error("Could not read 3 expected bytes from stream"); - } - i += 3; - } - else if ((c & 0xE0) == 0xE0) { - if (fread_unlocked(buf+i, 1, 2, in) != 2) { - throw std::runtime_error("Could not read 2 expected bytes from stream"); - } - i += 2; - } - else if ((c & 0xC0) == 0xC0) { - if (fread_unlocked(buf+i, 1, 1, in) != 1) { - throw std::runtime_error("Could not read 1 expected byte from stream"); - } - i += 1; - } - } - if (i == 0 && c == EOF) { - rv = WEOF; - } - else { -#ifdef _WIN32 - wchar_t u16[2] = {}; - utf8::unchecked::utf8to16(buf, buf+i, u16); - - if (u16[1]) { - for (auto& cp : cps) { - if (cp.f == 0) { - cp.f = in; - cp.c = u16[1]; - return u16[0]; - } - } - throw std::runtime_error("Not enough space to store UTF-16 high surrogate"); - } - rv = u16[0]; -#else - utf8::unchecked::utf8to32(buf, buf+i, &rv); -#endif - } - return static_cast(rv); -} - -inline wint_t fputwc_u8(wint_t wc, FILE *out) { - char buf[4] = {}; - char *e = utf8::unchecked::utf32to8(&wc, &wc+1, buf); - if (fwrite_unlocked(buf, 1, e-buf, out) != static_cast(e-buf)) { - return WEOF; - } - - return wc; -} - -inline int fputws_u8(const wchar_t* str, FILE *out) { - static std::string buf; - buf.clear(); - size_t len = wcslen(str); - utf8::unchecked::utf32to8(str, str+len, std::back_inserter(buf)); - if (fwrite_unlocked(&buf[0], 1, buf.size(), out) != buf.size()) { - return WEOF; - } - - return 1; -} - -inline wint_t ungetwc_u8(wint_t wc, FILE *out) { - char buf[4] = {}; - char *e = utf8::unchecked::utf32to8(&wc, &wc+1, buf); - for (char *b = buf ; b != e ; ++b) { - if (ungetc(*b, out) == EOF) { - return WEOF; - } - } - - return wc; -} - -#ifdef fgetwc_unlocked - #undef fgetwc_unlocked -#endif -#define fgetwc_unlocked fgetwc_u8 - -#ifdef fputwc_unlocked - #undef fputwc_unlocked -#endif -#define fputwc_unlocked fputwc_u8 - -#ifdef fputws_unlocked - #undef fputws_unlocked -#endif -#define fputws_unlocked fputws_u8 - -#ifdef ungetwc - #undef ungetwc -#endif -#define ungetwc ungetwc_u8 - -#ifdef _WIN32 - #undef utf32to8 -#endif - -#endif