commit a37648a0733d54832d6b4eaa870f0586d2d16d17 Author: Daniel Swanson Date: Wed Jun 16 15:08:03 2021 -0500 move deformatters off of wstring - reformatter having issues diff --git a/apertium/deformat.xsl b/apertium/deformat.xsl index 6e71285..b3da49b 100644 --- a/apertium/deformat.xsl +++ b/apertium/deformat.xsl @@ -90,15 +90,12 @@ - + - - - - + @@ -164,49 +161,45 @@ extern "C" { } #include <lttoolbox/lt_locale.h> -#include <apertium/string_to_wostream.h> +#include <lttoolbox/ustring.h> #ifndef GENFORMAT #include "apertium_config.h" #endif #include <utf8.h> #include <apertium/unlocked_cstdio.h> -#ifdef _WIN32 -#include <io.h> -#include <fcntl.h> -#define utf8to32 utf8to16 -#define utf32to8 utf16to8 -#endif using namespace std; -wstring buffer; +UString buffer; string symbuf; bool isDot, isEoh, hasWrite_dot, hasWrite_white; bool eosIncond; bool noDot; bool markEoh; -FILE *formatfile; +UFILE* formatfile; string last; int current; long int offset; vector<long int> offsets; -vector<wstring> tags; +vector<UString> tags; vector<int> orders; regex_t escape_chars; regex_t names_regexp; -void bufferAppend(wstring &buf, string const &str) +void bufferAppend(UString &buf, string const &str) { - symbuf.append(str); - if (utf8::is_valid(symbuf.begin(), symbuf.end())) { - utf8::utf8to32(symbuf.begin(), symbuf.end(), std::back_inserter(buf)); - symbuf.clear(); - } + buf += to_ustring(str.c_str()); } +void put(const UString& str, FILE* f) +{ + string temp; + utf8::utf16to8(str.begin(), str.end(), std::back_inserter(temp)); + fputs_unlocked(temp.c_str(), f); +} void init_escape() { @@ -217,7 +210,7 @@ void init_escape() ", REG_EXTENDED)) { - wcerr << "ERROR: Illegal regular expression for escape characters" << endl; + cerr << "ERROR: Illegal regular expression for escape characters" << endl; exit(EXIT_FAILURE); } } @@ -231,7 +224,7 @@ void init_tagNames() ", REG_EXTENDED)) { - wcerr << "ERROR: Illegal regular expression for tag-names" << endl; + cerr << "ERROR: Illegal regular expression for tag-names" << endl; exit(EXIT_FAILURE); } } @@ -253,20 +246,20 @@ string backslash(string const &str) } -wstring escape(string const &str) +UString escape(string const &str) { regmatch_t pmatch; char const *mystring = str.c_str(); int base = 0; - wstring result; + UString result; while(!regexec(&escape_chars, mystring + base, 1, &pmatch, 0)) { bufferAppend(result, str.substr(base, pmatch.rm_so)); - result += L'\\'; + result += '\\'; const char *mb = str.c_str() + base + pmatch.rm_so; - wchar_t micaracter = utf8::next(mb, mb+4); + UChar32 micaracter = utf8::next(mb, mb+4); result += micaracter; base += pmatch.rm_eo; @@ -276,10 +269,10 @@ wstring escape(string const &str) return result; } -wstring escape(wstring const &str) +UString escape(UString const &str) { string dest; - utf8::utf32to8(str.begin(), str.end(), std::back_inserter(dest)); + utf8::utf16to8(str.begin(), str.end(), std::back_inserter(dest)); return escape(dest); } @@ -301,7 +294,7 @@ string get_tagName(string tag){ - + @@ -312,9 +305,9 @@ string get_tagName(string tag){ - + - + @@ -327,7 +320,7 @@ int get_index(string end_tag){ for (int i=tags.size()-1; i >= 0; i--) { new_end_tag.clear(); - utf8::utf32to8(tags[i].begin(), tags[i].end(), std::back_inserter(new_end_tag)); + utf8::utf16to8(tags[i].begin(), tags[i].end(), std::back_inserter(new_end_tag)); if (get_tagName(end_tag) == get_tagName(new_end_tag)) return i; @@ -337,15 +330,8 @@ int get_index(string end_tag){ } void print_emptyTags() { - wchar_t tag[250]; - for (size_t i=0; i < tags.size(); i++) { - swprintf(tag, 250, L"<format-tag offset=\"%d\" order= \"%d\"><![CDATA[", offsets[i], orders[i]); - fputws(tag, formatfile); - fputws(tags[i].c_str(), formatfile); - fputwc(L']', formatfile); - swprintf(tag, 250, L"]></format-tag>\n"); - fputws(tag, formatfile); + u_fprintf(formatfile, "<format-tag offset=\"%d\" order= \"%d\"><![CDATA[%S]></format-tag>\n", offsets[i], orders[i], tags[i].c_str()); } } @@ -354,14 +340,11 @@ void print_emptyTags() { void printBuffer(int ind=-1, string end_tag="") { - wchar_t tag[250]; - wstring etiketa; - wstring wend_tag; + UString etiketa; + UString wend_tag = to_ustring(end_tag.c_str()); size_t pos; int num; - utf8::utf8to32(end_tag.begin(), end_tag.end(), std::back_inserter(wend_tag)); - if (ind != -1 && ind == tags.size()-1 && offsets[ind] == offset && orders[ind] == current) { @@ -371,7 +354,7 @@ void printBuffer(int ind=-1, string end_tag="") offsets.pop_back(); orders.pop_back(); } - else if (ind == -1 && wend_tag != L"") + else if (ind == -1 && !wend_tag.empty()) { last = "buffer"; buffer = buffer + wend_tag; @@ -381,10 +364,9 @@ void printBuffer(int ind=-1, string end_tag="") // isEoh handling TODO matxin format if (hasWrite_dot && isDot) { - swprintf(tag, 250, L"<empty-tag offset=\"%d\"/>\n", offset+1); - fputws(tag, formatfile); + u_fprintf(formatfile, "<empty-tag offset=\"%d\"/>\n", offset+1); - fputws(L" .\n", yyout); + fputs(" .\n", yyout); offset += 2; hasWrite_dot = false; } @@ -395,24 +377,22 @@ void printBuffer(int ind=-1, string end_tag="") { if (hasWrite_white) { - fputws(L" ", yyout); + fputs(" ", yyout); offset++; hasWrite_white = false; } current++; - swprintf(tag, 250, L"<format-tag offset=\"%d\" order=\"%d\"><![CDATA[", offset, current); - fputws(tag, formatfile); - while ((pos = buffer.find(L"]]>")) != wstring::npos) - buffer.replace(pos, 3, L"\\]\\]\\>"); - fputws(buffer.c_str(), formatfile); - swprintf(tag, 250, L"]]></format-tag>\n"); - fputws(tag, formatfile); + u_fprintf(formatfile, "<format-tag offset=\"%d\" order=\"%d\"><![CDATA[", offset, current); + while ((pos = buffer.find("]]>")) != UString::npos) + buffer.replace(pos, 3, "\\]\\]\\>"_u); + write(buffer, formatfile); + u_fprintf(formatfile, "]]></format-tag>\n"); } else { - fputws(buffer.c_str(), yyout); + put(buffer, yyout); offset += buffer.size(); } @@ -421,30 +401,27 @@ void printBuffer(int ind=-1, string end_tag="") { if (hasWrite_white) { - fputws(L" ", yyout); + fputc(' ', yyout); offset++; hasWrite_white = false; } - num = swprintf(tag, 250, L"<open-close-tag>\n"); - swprintf(tag + num, 250 - num, L"<open-tag offset=\"%d\" order=\"%d\"><![CDATA[", offsets[ind], orders[ind]); - fputws(tag, formatfile); + u_fprintf(formatfile, "<open-close-tag>\n"); + u_fprintf(formatfile, "<open-tag offset=\"%d\"order=\"%d\"><![CDATA[", offsets[ind], orders[ind]); etiketa = tags[ind]; - while ((pos = etiketa.find(L"]]>")) != wstring::npos) - etiketa.replace(pos, 3, L"\\]\\]\\>"); - fputws(etiketa.c_str(), formatfile); + while ((pos = etiketa.find("]]>"_u)) != UString::npos) + etiketa.replace(pos, 3, "\\]\\]\\>"_u); + write(etiketa, formatfile); current++; - num = swprintf(tag, 250, L"]]></open-tag>\n"); - swprintf(tag + num, 250 - num, L"<close-tag offset=\"%d\" order=\"%d\"><![CDATA[", offset, current); - fputws(tag, formatfile); - while ((pos = wend_tag.find(L"]]>")) != wstring::npos) - wend_tag.replace(pos, 3, L"\\]\\]\\>"); - fputws(wend_tag.c_str(), formatfile); - num = swprintf(tag, 250, L"]]></close-tag>\n"); - swprintf(tag + num, 250 - num, L"</open-close-tag>\n"); - fputws(tag, formatfile); + u_fprintf(formatfile, "]]></open-tag>\n"); + u_fprintf(formatfile, "<close-tag offset=\"%d\" order=\"%d\"><![CDATA[", offset, current); + while ((pos = wend_tag.find("]]>"_u)) != UString::npos) + wend_tag.replace(pos, 3, "\\]\\]\\>"_u); + write(wend_tag, formatfile); + u_fprintf(formatfile, "]]></close-tag>\n"); + u_fprintf(formatfile, "</open-close-tag>\n"); tags.erase(tags.begin() + ind); offsets.erase(offsets.begin() + ind); @@ -453,7 +430,7 @@ void printBuffer(int ind=-1, string end_tag="") last = "buffer"; - buffer = L""; + buffer.clear(); } } @@ -466,11 +443,11 @@ void preDot() { if(noDot) { - fputws_unlocked(L"[]", yyout); + fputs_unlocked("[]", yyout); } else { - fputws_unlocked(L".[]", yyout); + fputs_unlocked(".[]", yyout); } } } @@ -479,66 +456,64 @@ void printBuffer() { if(isEoh && markEoh) { - fputws_unlocked(L"[]\x2761", yyout); + put(u"[]\u2761", yyout); isEoh = false; } if(isDot && !eosIncond) { if(noDot) { - fputws_unlocked(L"[]", yyout); + fputs_unlocked("[]", yyout); } else { - fputws_unlocked(L".[]", yyout); + fputs_unlocked(".[]", yyout); } isDot = false; } if(buffer.size() > ) { string filename = tmpnam(NULL); - FILE *largeblock = fopen(filename.c_str(), "wb"); - fputws_unlocked(buffer.c_str(), largeblock); - fclose(largeblock); + UFILE *largeblock = u_fopen(filename.c_str(), "wb", NULL, NULL); + write(buffer, largeblock); + u_fclose(largeblock); preDot(); - fputwc_unlocked(L'[', yyout); - fputwc_unlocked(L'@', yyout); - std::wstring cad; - utf8::utf8to32(filename.begin(), filename.end(), std::back_inserter(cad)); - fputws_unlocked(cad.c_str(), yyout); - fputwc_unlocked(L']', yyout); + fputc_unlocked('[', yyout); + fputc_unlocked('@', yyout); + fputs_unlocked(filename.c_str(), yyout); + fputc_unlocked(']', yyout); } else if(buffer.size() > 1) { preDot(); - fputwc_unlocked(L'[', yyout); - wstring const tmp = escape(buffer); - if(tmp[0] == L'@') + fputc_unlocked('[', yyout); + UString const tmp = escape(buffer); + if(tmp[0] == '@') { - fputwc_unlocked(L'\\', yyout); + fputc_unlocked('\\', yyout); } - fputws_unlocked(tmp.c_str(), yyout); - fputwc_unlocked(L']', yyout); + put(tmp, yyout); + fputc_unlocked(']', yyout); } - else if(buffer.size() == 1 && buffer[0] != L' ') + else if(buffer.size() == 1 && buffer[0] != ' ') { preDot(); - fputwc_unlocked(L'[', yyout); - wstring const tmp = escape(buffer); - if(tmp[0] == L'@') + fputc_unlocked('[', yyout); + UString const tmp = escape(buffer); + if(tmp[0] == '@') { - fputwc_unlocked(L'\\', yyout); + fputc_unlocked('\\', yyout); } - fputws_unlocked(tmp.c_str(), yyout); + put(tmp, yyout); - fputwc_unlocked(L']', yyout); + fputc_unlocked(']', yyout); } else { - fputws_unlocked(buffer.c_str(), yyout); + put(buffer, yyout); } - buffer = L""; + buffer.clear(); } @@ -617,9 +592,9 @@ void printBuffer() - + - + @@ -636,12 +611,11 @@ void printBuffer() { printBuffer(); - fputwc_unlocked(L'\\', yyout); + fputc_unlocked('\\', yyout); offset++; const char *mb = yytext; - wchar_t symbol = utf8::next(mb, mb+4); - - fputwc_unlocked(symbol, yyout); + UChar32 symbol = utf8::next(mb, mb+4); + put(UString(1, symbol), yyout); offset++; hasWrite_dot = hasWrite_white = true; @@ -653,9 +627,9 @@ void printBuffer() if (utf8::is_valid(symbuf.begin(), symbuf.end())) { const char *mb = symbuf.c_str(); - wchar_t symbol = utf8::next(mb, mb+4); + UChar32 symbol = utf8::next(mb, mb+4); symbuf.clear(); - fputwc_unlocked(symbol, yyout); + put(UString(1, symbol), yyout); offset++; hasWrite_dot = hasWrite_white = true; } @@ -676,20 +650,20 @@ void usage(string const &progname) { - wcerr << "USAGE: " << progname << " format_file [input_file [output_file]" << ']' << endl; + cerr << "USAGE: " << progname << " format_file [input_file [output_file]" << ']' << endl; - wcerr << "USAGE: " << progname << " [ -h | -o | -i | -n ] [input_file [output_file]" << ']' << endl; + cerr << "USAGE: " << progname << " [ -h | -o | -i | -n ] [input_file [output_file]" << ']' << endl; - wcerr << " format processor " << endl; + cerr << " format processor " << endl; exit(EXIT_SUCCESS); } int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); - size_t base = 0; + int base = 0; eosIncond = false; if(argc >= 2) @@ -739,7 +713,7 @@ int main(int argc, char *argv[]) usage(argv[0]); } case 2: - formatfile = fopen(argv[1+base], "wb"); + formatfile = u_fopen(argv[1+base], "wb", NULL, NULL); if(!formatfile) { usage(argv[0]); @@ -750,35 +724,23 @@ int main(int argc, char *argv[]) } - if((argc-base) > 4) - { + if((argc-base) > 4) { usage(argv[0]); } - - switch(argc-base) - { - case 3: - yyout = fopen(argv[2+base], "wb"); - if(!yyout) - { - usage(argv[0]); - } - case 2: - yyin = fopen(argv[1+base], "rb"); - if(!yyin) - { - usage(argv[0]); - } - break; - default: - break; + if ((argc - base) == 3) { + yyout = fopen(argv[2 + base], "wb"); + if (!yyout) { + usage(argv[0]); + } + } + if ((argc - base) >= 2) { + yyin = fopen(argv[1 + base], "rb"); + if (!yyin) { + usage(argv[0]); + } } -#ifdef _MSC_VER - _setmode(_fileno(yyin), _O_U8TEXT); - _setmode(_fileno(yyout), _O_U8TEXT); -#endif // prevent warning message yy_push_state(1); yy_top_state(); @@ -791,8 +753,8 @@ int main(int argc, char *argv[]) - fputws(L"<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n", formatfile); - fputws(L"<format>\n", formatfile); + write("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"_u, formatfile); + write("<format>\n"_u, formatfile); last.clear(); @@ -806,7 +768,7 @@ int main(int argc, char *argv[]) print_emptyTags(); - fputws(L"</format>", formatfile); + write("</format>"_u, formatfile); fclose(formatfile); fclose(yyin); diff --git a/apertium/file_morpho_stream.cc b/apertium/file_morpho_stream.cc index fc1be66..e76c29b 100644 --- a/apertium/file_morpho_stream.cc +++ b/apertium/file_morpho_stream.cc @@ -169,7 +169,7 @@ FileMorphoStream::lrlmClassify(UString const &str, int &ivwords) last_type = val; } } - ms.step(towlower(str[i]), ca_any_char); + ms.step(u_tolower(str[i]), ca_any_char); } else { diff --git a/apertium/interchunk.cc b/apertium/interchunk.cc index 8c722da..e52c500 100644 --- a/apertium/interchunk.cc +++ b/apertium/interchunk.cc @@ -624,7 +624,7 @@ Interchunk::applyWord(UString const &word_str) { case '\\': i++; - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; case '<': @@ -652,7 +652,7 @@ Interchunk::applyWord(UString const &word_str) return; default: - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; } } diff --git a/apertium/mtx_reader.cc b/apertium/mtx_reader.cc index efb743a..4a03312 100644 --- a/apertium/mtx_reader.cc +++ b/apertium/mtx_reader.cc @@ -39,7 +39,6 @@ MTXReader::MTXReader(VM &spec) : size_t MTXReader::pushSetConst(std::string &val) { size_t set_idx = spec.set_consts.size(); - set s; std::stringstream val_ss(val); spec.set_consts.push_back(set( istream_iterator(val_ss), diff --git a/apertium/postchunk.cc b/apertium/postchunk.cc index ab2429e..42240da 100644 --- a/apertium/postchunk.cc +++ b/apertium/postchunk.cc @@ -720,7 +720,7 @@ Postchunk::applyWord(UString const &word_str) { case '\\': i++; - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; case '<': @@ -748,7 +748,7 @@ Postchunk::applyWord(UString const &word_str) return; default: - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; } } @@ -984,7 +984,7 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, } else if(chunk[i] == '<') { - if(iswdigit(chunk[i+1])) + if(u_isdigit(chunk[i+1])) { // replace tag unsigned long value = StringUtils::stoi(chunk.c_str()+i+1) - 1; @@ -1013,7 +1013,7 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, } else if(uppercase_first) { - if(iswalnum(chunk[i])) + if(u_isalnum(chunk[i])) { // TODO ref += u_toupper(chunk[i]); @@ -1075,7 +1075,7 @@ Postchunk::splitWordsAndBlanks(UString const &chunk, vector &words, } else if(chunk[i] == '<') { - if(iswdigit(chunk[i+1])) + if(u_isdigit(chunk[i+1])) { // replace tag unsigned long value = StringUtils::stoi(chunk.c_str()+i+1) - 1; diff --git a/apertium/tagger_data_percep_coarse_tags.cc b/apertium/tagger_data_percep_coarse_tags.cc index ae7ce9f..89ce084 100644 --- a/apertium/tagger_data_percep_coarse_tags.cc +++ b/apertium/tagger_data_percep_coarse_tags.cc @@ -57,7 +57,7 @@ const UString& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd // Input lemma ms.init(me->getInitial()); for (size_t i = 0; i < wrd.TheLemma.size(); i++) { - ms.step(std::towlower(wrd.TheLemma[i]), ca_any_char); + ms.step(u_tolower(wrd.TheLemma[i]), ca_any_char); } // Input fine tags for (size_t i = 0; i < wrd.TheTags.size(); i++) { diff --git a/apertium/tmx_builder.cc b/apertium/tmx_builder.cc index 54e0ed8..9e94ffe 100644 --- a/apertium/tmx_builder.cc +++ b/apertium/tmx_builder.cc @@ -220,7 +220,7 @@ TMXBuilder::nextTU(InputFile& input) current_tu += '.'; symbol = input.get(); - if(symbol != '[' && !iswspace(symbol)) + if(symbol != '[' && !u_isspace(symbol)) { if (!input.eof()) { input.unget(symbol); @@ -299,7 +299,7 @@ TMXBuilder::xmlize(UString const &str) result = result.substr(5); cambio = true; } - while(result.size() > 0 && !iswalnum(result[0]) && !iswpunct(result[0])) + while(result.size() > 0 && !u_isalnum(result[0]) && !u_ispunct(result[0])) { result = result.substr(1); cambio = true; @@ -316,7 +316,7 @@ TMXBuilder::xmlize(UString const &str) result = result.substr(0, result.size()-5); cambio = true; } - while(result.size() > 0 && !iswalnum(result[result.size()-1]) && !iswpunct(result[result.size()-1])) + while(result.size() > 0 && !u_isalnum(result[result.size()-1]) && !u_ispunct(result[result.size()-1])) { result = result.substr(0, result.size()-1); cambio = true; @@ -761,11 +761,11 @@ TMXBuilder::filter(UString const &tu) for(unsigned int i = 0, limit = tu.size(); i != limit; i++) { - if(iswalpha(tu[i])) + if(u_isalpha(tu[i])) { has_text = true; } - else if(has_text && iswspace(tu[i])) + else if(has_text && u_isspace(tu[i])) { count_blank++; } diff --git a/apertium/transfer.cc b/apertium/transfer.cc index f439f1c..572ec56 100644 --- a/apertium/transfer.cc +++ b/apertium/transfer.cc @@ -1396,7 +1396,7 @@ Transfer::applyWord(UString const &word_str) { case '\\': i++; - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; case '[': @@ -1421,7 +1421,7 @@ Transfer::applyWord(UString const &word_str) } else { - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); } break; @@ -1450,7 +1450,7 @@ Transfer::applyWord(UString const &word_str) break; default: - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; } } diff --git a/apertium/transfer_mult.cc b/apertium/transfer_mult.cc index 35a69fa..e577e73 100644 --- a/apertium/transfer_mult.cc +++ b/apertium/transfer_mult.cc @@ -467,7 +467,7 @@ TransferMult::applyWord(UString const &word_str) { case '\\': i++; - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; case '<': @@ -491,7 +491,7 @@ TransferMult::applyWord(UString const &word_str) break; default: - ms.step(towlower(word_str[i]), any_char); + ms.step(u_tolower(word_str[i]), any_char); break; } }