commit 62bb1df8ee76660f89ff4ec2edfa995a8de397f9 Author: Daniel Swanson Date: Mon Jun 14 10:21:40 2021 -0500 move string_utils into lttoolbox and casehandle better diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index fdb906d..44c38f9 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -2,13 +2,13 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h my_stdio.h node.h \ - pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h \ + pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \ transducer.h trans_exe.h xml_parse_util.h exception.h tmx_compiler.h \ ustring.h cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc node.cc pattern_list.cc \ - regexp_compiler.cc sorted_vector.cc state.cc transducer.cc \ + regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \ trans_exe.cc xml_parse_util.cc tmx_compiler.cc ustring.cc library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index 2f16c76..eaa0dd8 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -185,7 +186,7 @@ AttCompiler::parse(string const &file_name, bool read_rl) continue; } - from = stoi(tokens[0]) + state_id_offset; + from = StringUtils::stoi(tokens[0]) + state_id_offset; largest_seen_state_id = max(largest_seen_state_id, from); AttNode* source = get_node(from); @@ -205,7 +206,7 @@ AttCompiler::parse(string const &file_name, bool read_rl) { if (tokens.size() > 1) { - weight = stod(tokens[1]); + weight = StringUtils::stod(tokens[1]); } else { @@ -215,7 +216,7 @@ AttCompiler::parse(string const &file_name, bool read_rl) } else { - to = stoi(tokens[1]) + state_id_offset; + to = StringUtils::stoi(tokens[1]) + state_id_offset; largest_seen_state_id = max(largest_seen_state_id, to); if(read_rl) { @@ -232,7 +233,7 @@ AttCompiler::parse(string const &file_name, bool read_rl) int tag = alphabet(symbol_code(upper), symbol_code(lower)); if(tokens.size() > 4) { - weight = stod(tokens[4]); + weight = StringUtils::stod(tokens[4]); } else { diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index f7fa14b..8390820 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -759,7 +760,7 @@ Compiler::procEntry() double weight = 0.0; if(!wsweight.empty()) { - weight = stod(wsweight); + weight = StringUtils::stod(wsweight); } vector elements; diff --git a/lttoolbox/string_utils.cc b/lttoolbox/string_utils.cc new file mode 100644 index 0000000..fe2d2bc --- /dev/null +++ b/lttoolbox/string_utils.cc @@ -0,0 +1,235 @@ +#include + +#include +#include +#include +#include + +UString +StringUtils::trim(const UString& str) +{ + if (str.empty()) { + return str; + } + size_t begin = 0; + size_t end = str.size(); + size_t i = 0; + UChar32 c; + while (begin < end) { + U16_GET(str.c_str(), begin, i, end, c); + if (!u_isspace(c)) { + begin = i; + break; + } else { + U16_FWD_1(str.c_str(), i, end); + } + } + i = str.size(); + U16_BACK_1(str.c_str(), 0, i); + U16_GET(str.c_str(), 0, i, end, c); + if (!u_isspace(c)) { + if (begin == 0) { + return str; + } else { + return str.substr(begin); + } + } + while (end > begin) { + end = i; + U16_BACK_1(str.c_str(), 0, i); + U16_GET(str.c_str(), 0, i, str.size(), c); + if (!u_isspace(c)) { + break; + } + } + return str.substr(begin, end-begin); +} + +std::vector +StringUtils::split(const UString& str, const UString& delim) +{ + size_t pos = 0; + size_t new_pos; + std::vector result; + while (pos < str.size()) { + new_pos = str.find(delim, pos); + if (new_pos == UString::npos) { + new_pos = str.size(); + } + if (new_pos > pos) { + // if we have a non-empty substring between this delimiter + // and the last one + result.push_back(str.substr(pos, new_pos-pos)); + } + pos = new_pos + delim.size(); + } + return result; +} + +UString +StringUtils::join(const std::vector& vec, const UString& delim) +{ + UString s; + for (auto& piece : vec) { + if (!s.empty()) { + s.append(delim); + } + s.append(piece); + } + return s; +} + +UString +StringUtils::substitute(const UString& str, const UString& olds, const UString& news) +{ + UString s = str; + size_t p = s.find(olds, 0); + while (p != UString::npos) { + s.replace(p, olds.length(), news); + p += news.length(); + p = s.find(olds, p); + } + return s; +} + +UString +StringUtils::itoa(int n) +{ + UChar str[256]; + u_snprintf(str, 256, "%d", n); + return str; +} + +std::string +StringUtils::itoa_string(int n) +{ + char str[256]; + snprintf(str, 256, "%d", n); + return str; +} + +UString +StringUtils::ftoa(double f) +{ + UChar str[256]; + u_snprintf(str, 256, "%f", f); + return str; +} + +int +StringUtils::stoi(const UString& str) +{ + int ret; + int c = u_sscanf(str.c_str(), "%d", &ret); + if (c != 1) { + throw std::invalid_argument("unable to parse int"); + } + return ret; +} + +double +StringUtils::stod(const UString& str) +{ + double ret; + int c = u_sscanf(str.c_str(), "%lf", &ret); + if (c != 1) { + throw std::invalid_argument("unable to parse float"); + } + return ret; +} + +UString +StringUtils::tolower(const UString& str) +{ + UChar buf[str.size()*2]; + UErrorCode err = U_ZERO_ERROR; + u_strToLower(buf, str.size()*2, str.c_str(), str.size(), NULL, &err); + if (U_FAILURE(err)) { + std::cerr << "Error: unable to lowercase string '" << str << "'.\n"; + std::cerr << "error code: " << u_errorName(err) << std::endl; + exit(EXIT_FAILURE); + } + return buf; +} + +UString +StringUtils::toupper(const UString& str) +{ + UChar buf[str.size()*2]; + UErrorCode err = U_ZERO_ERROR; + u_strToUpper(buf, str.size()*2, str.c_str(), str.size(), NULL, &err); + if (U_FAILURE(err)) { + std::cerr << "Error: unable to uppercase string '" << str << "'.\n"; + std::cerr << "error code: " << u_errorName(err) << std::endl; + exit(EXIT_FAILURE); + } + return buf; +} + +UString +StringUtils::totitle(const UString& str) +{ + UChar buf[str.size()*2]; + UErrorCode err = U_ZERO_ERROR; + u_strToTitle(buf, str.size()*2, str.c_str(), str.size(), NULL, NULL, &err); + if (U_FAILURE(err)) { + std::cerr << "Error: unable to titlecase string '" << str << "'.\n"; + std::cerr << "error code: " << u_errorName(err) << std::endl; + exit(EXIT_FAILURE); + } + return buf; +} + +UString +StringUtils::getcase(const UString& str) +{ + UString ret = "aa"_u; + if (str.empty()) { + return ret; + } + size_t i = 0; + size_t l = str.size(); + UChar32 c; + U16_NEXT(str.c_str(), i, l, c); + if (u_isupper(c)) { + ret[0] = 'A'; + if (i < l) { + U16_BACK_1(str.c_str(), i, l); // decrements l + U16_GET(str.c_str(), 0, l, str.size(), c); + if (u_isupper(c)) { + ret[1] = 'A'; + } + } + } + return ret; +} + +UString +StringUtils::copycase(const UString& source, const UString& target) +{ + if (source.empty() || target.empty()) { + return target; + } + size_t i = 0; + size_t l = source.size(); + UChar32 c; + U16_NEXT(source.c_str(), i, l, c); + bool firstupper = u_isupper(c); + bool uppercase = false; + if (firstupper) { + if (i != l) { + U16_BACK_1(source.c_str(), i, l); // decrements l + U16_GET(source.c_str(), 0, l, source.size(), c); + uppercase = u_isupper(c); + } + } + if (firstupper) { + if (uppercase) { + return toupper(target); + } else { + return totitle(target); + } + } else { + return tolower(target); + } +} diff --git a/lttoolbox/string_utils.h b/lttoolbox/string_utils.h new file mode 100644 index 0000000..df664ab --- /dev/null +++ b/lttoolbox/string_utils.h @@ -0,0 +1,36 @@ +#ifndef __LT_STRING_UTILS_H__ +#define __LT_STRING_UTILS_H__ + +#include +#include + +class StringUtils { +public: + // delete leading and trailing whitespace + static UString trim(const UString& str); + + // split string on delimiter + static std::vector split(const UString& str, const UString& delim); + + // inverse of split + static UString join(const std::vector& vec, const UString& delim); + + // replace each occurrence of olds with news + static UString substitute(const UString& str, const UString& olds, const UString& news); + + static UString itoa(int n); + static std::string itoa_string(int n); + static UString ftoa(double f); + // these throw std::invalid_argument if parsing fails + static int stoi(const UString& str); + static double stod(const UString& str); + + static UString tolower(const UString& str); + static UString toupper(const UString& str); + static UString totitle(const UString& str); + + static UString getcase(const UString& str); + static UString copycase(const UString& source, const UString& target); +}; + +#endif // __LT_STRING_UTILS_H__ diff --git a/lttoolbox/ustring.cc b/lttoolbox/ustring.cc index b537a78..e8d4777 100644 --- a/lttoolbox/ustring.cc +++ b/lttoolbox/ustring.cc @@ -31,28 +31,6 @@ write(const UString& str, UFILE* output) u_fprintf(output, "%S", str.c_str()); } -int -stoi(const UString& str) -{ - int ret; - int c = u_sscanf(str.c_str(), "%d", &ret); - if (c != 1) { - throw std::invalid_argument("unable to parse int"); - } - return ret; -} - -double -stod(const UString& str) -{ - double ret; - int c = u_sscanf(str.c_str(), "%lf", &ret); - if (c != 1) { - throw std::invalid_argument("unable to parse float"); - } - return ret; -} - UString to_ustring(const char* s) { diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h index 323726f..548fc51 100644 --- a/lttoolbox/ustring.h +++ b/lttoolbox/ustring.h @@ -28,12 +28,6 @@ typedef std::basic_string UString; void write(const UString& str, UFILE* output); -// like std::stoi, throws invalid_argument if unable to parse -int stoi(const UString& str); - -// like std::stoi, throws invalid_argument if unable to parse -double stod(const UString& str); - // for command-line arguments UString to_ustring(const char* str);