commit 3ffdbefa697d1e9745f3f588430374a2753f683d
Author: Daniel Swanson
Date:   Mon Jul 26 12:46:18 2021 -0500

    manager for string constants

diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am
index 2fd56b0..e943b39 100644
--- a/lttoolbox/Makefile.am
+++ b/lttoolbox/Makefile.am
@@ -2,13 +2,13 @@
 h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \
 	deserialiser.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \
 	match_exe.h match_node.h match_state.h my_stdio.h node.h \
-	pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \
+	pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_writer.h \
 	transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \
 	ustring.h
 cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \
 	expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \
 	match_node.cc match_state.cc node.cc pattern_list.cc \
-	regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \
+	regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc \
 	trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc
 
 library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME)
diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h
index 21ca48f..5783f77 100644
--- a/lttoolbox/compression.h
+++ b/lttoolbox/compression.h
@@ -29,7 +29,8 @@ using namespace std;
 // Global lttoolbox features
 constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'};
 enum LT_FEATURES : uint64_t {
-	LTF_UNKNOWN = (1ull << 0), // Features >= this are unknown, so throw an error; Inc this if more features are added
+	LTF_MMAP = (1ull << 0), // using mmap-compatible format rather than compressed format
+	LTF_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added
 	LTF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits
 };
 
@@ -37,7 +38,8 @@ enum LT_FEATURES : uint64_t {
 constexpr char HEADER_TRANSDUCER[4]{'L', 'T', 'T', 'D'};
 enum TD_FEATURES : uint64_t {
 	TDF_WEIGHTS = (1ull << 0),
-	TDF_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added
+	TDF_MMAP = (1ull << 1),
+	TDF_UNKNOWN = (1ull << 2), // Features >= this are unknown, so throw an error; Inc this if more features are added
 	TDF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits
 };
 
diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc
new file mode 100644
index 0000000..292431f
--- /dev/null
+++ b/lttoolbox/string_writer.cc
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2021 Apertium
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <lttoolbox/string_writer.h>
+
+#include <stdexcept>
+#include <string>
+
+/**
+ * Store `s` in the shared buffer (reusing an existing occurrence when the
+ * buffer already contains it as a substring) and return a view of it.
+ *
+ * The view points into `buffer`; any later call that grows or replaces
+ * `buffer` may invalidate it.
+ */
+UString_view
+StringWriter::add(const UString& s)
+{
+  auto start = buffer.find(s);
+  if (start == UString::npos) {
+    // Not present yet: append and remember where the new copy begins.
+    start = buffer.size();
+    buffer += s;
+  }
+  return UString_view(buffer).substr(start, s.size());
+}
+
+/**
+ * Return a view of `count` code units of the buffer beginning at `start`.
+ *
+ * @throws std::out_of_range (from substr) if `start` exceeds the buffer size.
+ */
+UString_view
+StringWriter::get(const uint32_t start, const uint32_t count)
+{
+  return UString_view(buffer).substr(start, count);
+}
+
+/**
+ * Replace the buffer with data read from `in`: a length in code units
+ * (via read_u64_le) followed by that many 16-bit units, low byte first.
+ *
+ * @throws std::runtime_error if the payload cannot be read in full.
+ */
+void
+StringWriter::read(FILE* in)
+{
+  const uint64_t len = read_u64_le(in);
+  // The length comes from the file, so stage the bytes on the heap (not in a
+  // stack array) and refuse sizes that cannot be allocated.
+  if (len > std::string().max_size() / 2) {
+    throw std::runtime_error("Failed to read strings");
+  }
+  std::string raw(static_cast<size_t>(len) * 2, '\0');
+  // Element size is 1, so the return value is a byte count, not a unit count.
+  if (fread_unlocked(&raw[0], 1, raw.size(), in) != raw.size()) {
+    throw std::runtime_error("Failed to read strings");
+  }
+  buffer.clear();
+  buffer.reserve(raw.size() / 2);
+  for (size_t i = 0; i < raw.size(); i += 2) {
+    const unsigned lo = static_cast<unsigned char>(raw[i]);
+    const unsigned hi = static_cast<unsigned char>(raw[i + 1]);
+    buffer += static_cast<UString::value_type>(lo | (hi << 8));
+  }
+}
+
+/**
+ * Write the buffer to `out` in the layout read() expects: the length in code
+ * units (via write_u64_le) followed by each unit as two bytes, low byte first.
+ *
+ * @throws std::runtime_error if the payload cannot be written in full.
+ */
+void
+StringWriter::write(FILE* out)
+{
+  write_u64_le(out, buffer.size());
+  std::string raw;
+  raw.reserve(buffer.size() * 2);
+  for (const auto unit : buffer) {
+    raw += static_cast<char>(unit & 0xFF);
+    raw += static_cast<char>((unit >> 8) & 0xFF);
+  }
+  if (fwrite_unlocked(raw.data(), 1, raw.size(), out) != raw.size()) {
+    throw std::runtime_error("Failed to write strings");
+  }
+}
diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h
new file mode 100644
index 0000000..d7b2334
--- /dev/null
+++ b/lttoolbox/string_writer.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2021 Apertium
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef _LT_STRING_WRITER_
+#define _LT_STRING_WRITER_
+
+// TODO: merge compression.h write_u64_le() and friends to here
+// when we drop compressed formats
+#include <lttoolbox/compression.h>
+#include <lttoolbox/ustring.h>
+#include <cstdint>
+#include <cstdio>
+
+/**
+ * Keeps string constants in one contiguous buffer and hands out views
+ * into it.
+ */
+class StringWriter {
+public:
+  // Backing storage; every view returned by add()/get() points into it.
+  UString buffer;
+  // Store s (or reuse an existing occurrence) and return a view of it.
+  UString_view add(const UString& s);
+  // View of `count` code units starting at offset `start`.
+  UString_view get(const uint32_t start, const uint32_t count);
+  // Replace the buffer with the contents serialised in `in`.
+  void read(FILE* in);
+  // Serialise the buffer to `out`.
+  void write(FILE* out);
+};
+
+#endif
diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h
index fa01e31..3642cbe 100644
--- a/lttoolbox/ustring.h
+++ b/lttoolbox/ustring.h
@@ -24,8 +24,10 @@
 #include
 #include
 #include
+#include <string_view>
 
 typedef std::basic_string<UChar> UString;
+typedef std::basic_string_view<UChar> UString_view;
 
 void write(const UString& str, UFILE* output);
 
@@ -43,7 +45,7 @@ operator<<(std::ostream& ostr, char16_t c)
 }
 
 inline std::ostream&
-operator<<(std::ostream& ostr, const UString& str)
+operator<<(std::ostream& ostr, UString_view str)
 {
   std::string res;
   utf8::utf16to8(str.begin(), str.end(), std::back_inserter(res));