commit 55d00ac739129c6db5da40a748f8853bcb7c162b Author: Daniel Swanson Date: Sat Aug 21 16:13:47 2021 -0400 start dropping compression.h diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index f3737b4..fde3379 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,13 +1,13 @@ -h_sources = alphabet.h alphabet_exe.h att_compiler.h buffer.h compiler.h compression.h \ +h_sources = alphabet.h alphabet_exe.h att_compiler.h binary_headers.h buffer.h compiler.h compression.h \ deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ - match_exe.h match_node.h match_state.h match_state2.h mmap.h my_stdio.h node.h \ + match_exe.h match_node.h match_state.h match_state2.h mmap.h my_stdio.h node.h old_binary.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h cc_sources = alphabet.cc alphabet_exe.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ - match_node.cc match_state.cc match_state2.cc node.cc pattern_list.cc \ + match_node.cc match_state.cc match_state2.cc node.cc old_binary.cc pattern_list.cc \ regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc transducer_exe.cc \ trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index 40d27e5..81d2644 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -17,8 +17,8 @@ #include -#include #include +#include AlphabetExe::AlphabetExe(StringWriter* sw_) : sw(sw_), tag_count(0), tags(nullptr) @@ -43,12 +43,12 @@ AlphabetExe::read(FILE* input, bool mmap) symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; } } else { - tag_count = Compression::multibyte_read(input); + 
tag_count = OldBinary::read_int(input); tags = new StringRef[tag_count]; for (uint32_t i = 0; i < tag_count; i++) { UString tg; tg += '<'; - tg += Compression::string_read(input); + OldBinary::read_ustr(input, tg); tg += '>'; tags[i] = sw->add(tg); } @@ -57,10 +57,10 @@ AlphabetExe::read(FILE* input, bool mmap) for (uint32_t i = 0; i < tag_count; i++) { symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; } - int pairs = Compression::multibyte_read(input); + int pairs = OldBinary::read_int(input); for (int i = 0; i < pairs; i++) { - Compression::multibyte_read(input); - Compression::multibyte_read(input); + OldBinary::read_int(input); + OldBinary::read_int(input); } } } diff --git a/lttoolbox/binary_headers.h b/lttoolbox/binary_headers.h new file mode 100644 index 0000000..878c7bc --- /dev/null +++ b/lttoolbox/binary_headers.h @@ -0,0 +1,23 @@ +#ifndef _LT_BINARY_HEADERS_ +#define _LT_BINARY_HEADERS_ + +#include + +// Global lttoolbox features +constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; +enum LT_FEATURES : uint64_t { + LTF_MMAP = (1ull << 0), // using mmap-compatible format rather than compressed format + LTF_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added + LTF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits +}; + +// Individual transducer features +constexpr char HEADER_TRANSDUCER[4]{'L', 'T', 'T', 'D'}; +enum TD_FEATURES : uint64_t { + TDF_WEIGHTS = (1ull << 0), + TDF_MMAP = (1ull << 1), + TDF_UNKNOWN = (1ull << 2), // Features >= this are unknown, so throw an error; Inc this if more features are added + TDF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits +}; + +#endif \ No newline at end of file diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h index 5783f77..739290d 100644 --- a/lttoolbox/compression.h +++ 
b/lttoolbox/compression.h @@ -23,27 +23,10 @@ #include #include #include +#include using namespace std; -// Global lttoolbox features -constexpr char HEADER_LTTOOLBOX[4]{'L', 'T', 'T', 'B'}; -enum LT_FEATURES : uint64_t { - LTF_MMAP = (1ull << 0), // using mmap-compatible format rather than compressed format - LTF_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added - LTF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits -}; - -// Invididual transducer features -constexpr char HEADER_TRANSDUCER[4]{'L', 'T', 'T', 'D'}; -enum TD_FEATURES : uint64_t { - TDF_WEIGHTS = (1ull << 0), - TDF_MMAP = (1ull << 1), - TDF_UNKNOWN = (1ull << 2), // Features >= this are unknown, so throw an error; Inc this if more features are added - TDF_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits -}; - - inline auto write_u64(FILE *out, uint64_t value) { auto rv = fwrite_unlocked(reinterpret_cast(&value), 1, sizeof(value), out); if (rv != sizeof(value)) { diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index cb8c7ce..267aeec 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -15,10 +15,11 @@ * along with this program; if not, see . 
*/ #include -#include +#include #include #include #include +#include #include #include @@ -1003,9 +1004,9 @@ FSTProcessor::load(FILE *input) } else { // letters - int len = Compression::multibyte_read(input); + uint64_t len = OldBinary::read_int(input); while(len > 0) { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + alphabetic_chars.insert(static_cast(OldBinary::read_int(input))); len--; } @@ -1016,10 +1017,11 @@ FSTProcessor::load(FILE *input) Alphabet temp; temp.read(input); - len = Compression::multibyte_read(input); + len = OldBinary::read_int(input); while(len > 0) { - UString name = Compression::string_read(input); + UString name; + OldBinary::read_ustr(input, name); transducers[name].read_compressed(input, temp); len--; } diff --git a/lttoolbox/old_binary.cc b/lttoolbox/old_binary.cc new file mode 100644 index 0000000..a2d5e55 --- /dev/null +++ b/lttoolbox/old_binary.cc @@ -0,0 +1,129 @@ +#include +#include +#include + +using namespace OldBinary; + +uint64_t +OldBinary::read_u64(FILE* in) +{ + uint64_t v = 0; + if (fread_unlocked(reinterpret_cast(&v), 1, sizeof(v), in) != sizeof(v)) { + throw std::runtime_error("Failed to read uint64_t"); + } + // these are unconditional byte-swaps, so on little-endian platforms + // this reads big-endian data + // this is very bad, but it's the way all the old data was written, + // so we have this here for backwards compatibility until we drop + // support for lttoolbox/apertium <= 3 + // -DGS 2021-08-21 + return (((v & 0xFF00000000000000) >> 56) | + ((v & 0xFF000000000000) >> 40) | + ((v & 0xFF0000000000) >> 24) | + ((v & 0xFF00000000) >> 8) | + ((v & 0xFF000000) << 8) | + ((v & 0xFF0000) << 24) | + ((v & 0xFF00) << 40) | + ((v & 0xFF) << 56)); +} + +uint64_t read_byte(FILE* in) +{ + unsigned char ret = 0; + if (fread_unlocked(&ret, 1, 1, in) != 1) { + throw std::runtime_error("Failed to read byte"); + } + return ret; +} + +uint64_t +OldBinary::read_int(FILE* in, bool compression) +{ + if 
(compression) { + uint64_t up = read_byte(in); + if (up < 0x40) { + return up; + } else if (up < 0x80) { + return ((up & 0x3f) << 8) | read_byte(in); + } else if (up < 0xc0) { + uint64_t ret = (up & 0x3f) << 8; + ret |= read_byte(in); + return (ret << 8) | read_byte(in); + } else { + uint64_t ret = ((up & 0x3f) << 8) | read_byte(in); + ret = (ret << 8) | read_byte(in); + ret = (ret << 8) | read_byte(in); + return ret; + } + } else { + uint64_t ret = 0; + uint64_t size = read_byte(in); + if (size > 8) { + throw std::runtime_error("can't deserialise int"); + } + uint8_t buffer[8]; + if (fread_unlocked(buffer, 1, size, in) != size) { + throw std::runtime_error("can't deserialise int"); + } + for (uint8_t i = 0; i < size; i++) { + ret += static_cast(buffer[i]) << (8 * (size - i - 1)); + } + return ret; + } +} + +void +OldBinary::read_ustr(FILE* in, UString& s, bool compression) +{ + uint64_t count = read_int(in, compression); + for (uint64_t i = 0; i < count; i++) { + s += static_cast(read_int(in, compression)); + } +} + +void +OldBinary::read_str(FILE* in, std::string& s, bool compression) +{ + uint64_t count = read_int(in, compression); + for (uint64_t i = 0; i < count; i++) { + s += static_cast(read_int(in, compression)); + } +} + +double +OldBinary::read_double(FILE* in, bool compression, bool endian_util) +{ + if (compression) { + if (endian_util) { + double retval; +#ifdef WORDS_BIGENDIAN + fread_unlocked(&retval, sizeof(double), 1, in); +#else + char *s = reinterpret_cast(&retval); + + for(int i = sizeof(double)-1; i != -1; i--) { + if(fread_unlocked(&(s[i]), 1, 1, in)==0) { + return 0; + } + } +#endif + return retval; + } else { + uint64_t mantissa = read_int(in, true); + if (mantissa >= 0x04000000) { + mantissa = ((mantissa & 0x03ffffff) << 26) | read_int(in, true); + } + + uint64_t exponent = read_int(in, true); + if (exponent >= 0x04000000) { + exponent = ((exponent & 0x03ffffff) << 26) | read_int(in, true); + } + + double v = 
static_cast(static_cast(mantissa)) / 0x40000000; + return ldexp(v, static_cast(exponent)); + } + } else { + uint64_t d = read_int(in, false); + return *reinterpret_cast(&d); + } +} diff --git a/lttoolbox/old_binary.h b/lttoolbox/old_binary.h new file mode 100644 index 0000000..b798691 --- /dev/null +++ b/lttoolbox/old_binary.h @@ -0,0 +1,16 @@ +#ifndef _LT_OLD_BINARY_ +#define _LT_OLD_BINARY_ + +#include +#include +#include + +namespace OldBinary { + uint64_t read_u64(FILE* in); + uint64_t read_int(FILE* in, bool compression=true); + void read_ustr(FILE* in, UString& s, bool compression=true); + void read_str(FILE* in, std::string& s, bool compression=true); + double read_double(FILE* in, bool compression=true, bool endian_util=false); +}; + +#endif