commit aa0fac9c936440d3ee2db75e194cd21a3151cc09 Author: Daniel Swanson Date: Fri Jul 30 10:27:19 2021 -0500 move endian helpers for mmap to their own header diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index fa5f3f4..7904c99 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,6 +1,6 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ - deserialiser.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ + deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h index 339b949..5783f77 100644 --- a/lttoolbox/compression.h +++ b/lttoolbox/compression.h @@ -77,11 +77,6 @@ inline auto write_le(Stream& out, uint64_t value) { return write_u64_le(out, value); } -template -inline auto write_double_le(Stream& out, double value) { - return write_u64_le(out, *reinterpret_cast(&value)); -} - inline auto read_u64(FILE *in) { uint64_t value = 0; @@ -128,11 +123,6 @@ inline auto read_le(std::istream& in) { return read_le(in, Value{}); } -inline double read_double_le(FILE* in) { - uint64_t val = read_le(in); - return *reinterpret_cast(&val); -} - /** * Clase "Compression". * Class methods to access compressed data by the byte-aligned method diff --git a/lttoolbox/endian_util.h b/lttoolbox/endian_util.h new file mode 100644 index 0000000..069bd74 --- /dev/null +++ b/lttoolbox/endian_util.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LT_ENDIAN_UTIL_ +#define _LT_ENDIAN_UTIL_ + +#include +#include +#include +#include + +inline uint32_t to_le_32(uint32_t v) { + return (((v & 0xFF) << 24) | + ((v & 0xFF00) << 8) | + ((v & 0xFF0000) >> 8) | + ((v & 0xFF000000) >> 24)); +} + +inline uint32_t from_le_32(uint32_t v) { + return (((v & 0xFF000000) >> 24) | + ((v & 0xFF0000) >> 8) | + ((v & 0xFF00) << 8) | + ((v & 0xFF) << 24)); +} + +inline uint64_t to_le_64(uint64_t v) { + return (((v & 0xFF) << 56) | + ((v & 0xFF00) << 40) | + ((v & 0xFF0000) << 24) | + ((v & 0xFF000000) << 8) | + ((v & 0xFF00000000) >> 8) | + ((v & 0xFF0000000000) >> 24) | + ((v & 0xFF000000000000) >> 40) | + ((v & 0xFF00000000000000) >> 56)); +} + +inline uint64_t from_le_64(uint64_t v) { + return (((v & 0xFF00000000000000) >> 56) | + ((v & 0xFF000000000000) >> 40) | + ((v & 0xFF0000000000) >> 24) | + ((v & 0xFF00000000) >> 8) | + ((v & 0xFF000000) << 8) | + ((v & 0xFF0000) << 24) | + ((v & 0xFF00) << 40) | + ((v & 0xFF) << 56)); +} + +inline auto write_le_32(FILE* out, uint32_t value) { + uint32_t v = to_le_32(value); + auto rv = fwrite_unlocked(reinterpret_cast(&v), 1, sizeof(value), out); + if (rv != sizeof(value)) { + throw std::runtime_error("Failed to write uint32_t"); + } + return rv; +} + +inline auto write_le_64(FILE* out, uint64_t value) { + uint64_t v = to_le_64(value); + auto rv = fwrite_unlocked(reinterpret_cast(&v), 1, sizeof(value), out); + if (rv != sizeof(value)) { + throw std::runtime_error("Failed to write uint64_t"); + } + return rv; +} + +inline auto read_le_32(FILE* in) { + uint32_t value = 0; + if (fread_unlocked(reinterpret_cast(&value), 1, sizeof(value), in) != sizeof(value)) { + throw std::runtime_error("Failed to read uint64_t"); + } + return from_le_32(value); +} + +inline auto read_le_64(FILE* in) { + uint64_t value = 0; + if (fread_unlocked(reinterpret_cast(&value), 1, sizeof(value), in) != sizeof(value)) { + throw std::runtime_error("Failed to read uint64_t"); + } + return from_le_64(value); +} + +inline auto write_le_s32(FILE* out, int32_t value) { + return write_le_32(out, *reinterpret_cast(&value)); +} + +inline auto read_le_s32(FILE* in) { + uint32_t val = read_le_32(in); + return *reinterpret_cast(&val); +} + +inline auto write_le_double(FILE* out, double value) { + return write_le_64(out, *reinterpret_cast(&value)); +} + +inline auto read_le_double(FILE* in) { + uint64_t val = read_le_64(in); + return *reinterpret_cast(&val); +} + +#endif diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index 292431f..64c7120 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -17,6 +17,7 @@ #include +#include #include UString_view @@ -41,7 +42,7 @@ StringWriter::get(const uint32_t start, const uint32_t count) void StringWriter::read(FILE* in) { - uint64_t len = read_u64_le(in); + uint64_t len = read_le_64(in); buffer.clear(); buffer.reserve(len); uint8_t temp[len*2]{}; @@ -57,7 +58,7 @@ StringWriter::read(FILE* in) void StringWriter::write(FILE* out) { - write_u64_le(out, buffer.size()); + write_le_64(out, buffer.size()); uint8_t temp[buffer.size()*2]{}; for (uint64_t i = 0; i < buffer.size(); i++) { temp[2*i] = buffer[i] & 0xFF; diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h index d7b2334..2785c42 100644 --- a/lttoolbox/string_writer.h +++ b/lttoolbox/string_writer.h @@ -18,11 +18,9 @@ #ifndef _LT_STRING_WRITER_ #define _LT_STRING_WRITER_ -// TODO: merge compression.h write_u64_le() and friends to here -// when we drop compressed formats -#include #include #include +#include class StringWriter { public: diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index eecce47..fdd4166 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -659,11 +660,11 @@ Transducer::read(FILE *input, int const decalage) void Transducer::read_mmap(FILE* in, Alphabet& alpha) { - read_le(in); // total size - initial = read_le(in); - uint64_t state_count = read_le(in); - uint64_t final_count = read_le(in); - uint64_t trans_count = read_le(in); + read_le_64(in); // total size + initial = read_le_64(in); + uint64_t state_count = read_le_64(in); + uint64_t final_count = read_le_64(in); + uint64_t trans_count = read_le_64(in); if (transitions.size() > state_count) { transitions.clear(); @@ -673,8 +674,8 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) finals.clear(); for (uint64_t i = 0; i < final_count; i++) { - uint64_t s = read_le(in); - double w = read_double_le(in); + uint64_t s = read_le_64(in); + double w = read_le_double(in); finals.insert(make_pair(s, w)); } @@ -682,7 +683,7 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) offsets.reserve(state_count); for (uint64_t i = 0; i < state_count; i++) { transitions[i].clear(); - offsets.push_back(read_le(in)); + offsets.push_back(read_le_64(in)); } offsets.push_back(0); @@ -691,11 +692,11 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) if (i == offsets[state+1]) { state++; } - uint64_t isym = read_le(in); - uint64_t osym = read_le(in); - int32_t sym = alpha((int32_t)isym, (int32_t)osym); - uint64_t dest = read_le(in); - double wght = read_double_le(in); + int32_t isym = read_le_s32(in); + int32_t osym = read_le_s32(in); + int32_t sym = alpha(isym, osym); + uint64_t dest = read_le_64(in); + double wght = read_le_double(in); transitions[state].insert(make_pair(sym, make_pair(dest, wght))); } } @@ -707,7 +708,7 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha) uint64_t features = 0; features |= TDF_WEIGHTS; features |= TDF_MMAP; - write_le(out, features); + write_le_64(out, features); uint64_t tr_count = 0; vector offsets; @@ -725,19 +726,19 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha) (finals.size() * 2) + // final states 4 ); // initial state + length of each section - write_le(out, total_size*8); // number of bytes after this - write_le(out, initial); // initial state - write_le(out, transitions.size()); // number of states - write_le(out, finals.size()); // number of finals - write_le(out, tr_count); // number of transitions + write_le_64(out, total_size*8); // number of bytes after this + write_le_64(out, initial); // initial state + write_le_64(out, transitions.size()); // number of states + write_le_64(out, finals.size()); // number of finals + write_le_64(out, tr_count); // number of transitions for (auto& it : finals) { - write_le(out, it.first); - write_le(out, *reinterpret_cast(&it.second)); + write_le_64(out, it.first); + write_le_double(out, it.second); } for (auto& it : offsets) { - write_le(out, it); + write_le_64(out, it); } for (auto& it : transitions) { @@ -751,10 +752,10 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha) auto range = it.second.equal_range(s); for (auto tr = range.first; tr != range.second; ++tr) { auto sym = alpha.decode(tr->first); - write_le(out, sym.first); // input symbol - write_le(out, sym.second); // output symbol - write_le(out, tr->second.first); // destination - write_double_le(out, tr->second.second); // weight + write_le_s32(out, sym.first); // input symbol + write_le_s32(out, sym.second); // output symbol + write_le_64(out, tr->second.first); // destination + write_le_double(out, tr->second.second); // weight } } } diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index 47f08d6..5efdb40 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -18,6 +18,7 @@ #include #include +#include // includes needed for reading non-mmap files #include @@ -58,30 +59,30 @@ TransducerExe::read(FILE* input, Alphabet& alphabet) } if (mmap) { - read_le(input); // total size - initial = read_le(input); - state_count = read_le(input); - final_count = read_le(input); - transition_count = read_le(input); + read_le_64(input); // total size + initial = read_le_64(input); + state_count = read_le_64(input); + final_count = read_le_64(input); + transition_count = read_le_64(input); finals = new Final[final_count]; for (uint64_t i = 0; i < final_count; i++) { - finals[i].state = read_le(input); - finals[i].weight = read_double_le(input); + finals[i].state = read_le_64(input); + finals[i].weight = read_le_double(input); } offsets = new uint64_t[state_count+1]; for (uint64_t i = 0; i < state_count; i++) { - offsets[i] = read_le(input); + offsets[i] = read_le_64(input); } offsets[state_count] = transition_count; transitions = new Transition[transition_count]; for (uint64_t i = 0; i < transition_count; i++) { - transitions[i].isym = read_le(input); - transitions[i].osym = read_le(input); - transitions[i].dest = read_le(input); - transitions[i].weight = read_double_le(input); + transitions[i].isym = read_le_s32(input); + transitions[i].osym = read_le_s32(input); + transitions[i].dest = read_le_64(input); + transitions[i].weight = read_le_double(input); } } else { initial = Compression::multibyte_read(input); diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h index 8741296..02297b1 100644 --- a/lttoolbox/transducer_exe.h +++ b/lttoolbox/transducer_exe.h @@ -22,8 +22,8 @@ #include struct Transition { - uint64_t isym; // TODO: should be int32_t - uint64_t osym; + int32_t isym; + int32_t osym; uint64_t dest; double weight; };