commit 644f9b1f86ea0c3f0694d6d77a369422718ad306
Author: Daniel Swanson
Date:   Thu Jul 29 19:19:35 2021 -0500

    class for executing transducers

diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am
index b2875a7..fa5f3f4 100644
--- a/lttoolbox/Makefile.am
+++ b/lttoolbox/Makefile.am
@@ -3,12 +3,12 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \
     deserialiser.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \
     match_exe.h match_node.h match_state.h my_stdio.h node.h \
     pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \
-    transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \
+    transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \
     ustring.h
 cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \
     expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \
     match_node.cc match_state.cc node.cc pattern_list.cc \
-    regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc \
+    regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc transducer_exe.cc \
     trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc
 
 library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME)
diff --git a/lttoolbox/compression.h b/lttoolbox/compression.h
index 5783f77..339b949 100644
--- a/lttoolbox/compression.h
+++ b/lttoolbox/compression.h
@@ -77,6 +77,11 @@ inline auto write_le(Stream& out, uint64_t value) {
   return write_u64_le(out, value);
 }
 
+template // NOTE(review): no parameter list is visible after `template` in this copy of the patch (nor a target type in the cast below) - confirm against the repository
+inline auto write_double_le(Stream& out, double value) { // writes `value` to `out` by passing a reinterpreted view of its storage to write_u64_le and returns that call's result
+  return write_u64_le(out, *reinterpret_cast(&value)); // NOTE(review): punning through a pointer cast; copying the bytes with std::memcpy would not depend on aliasing behaviour
+}
+
 inline auto read_u64(FILE *in)
 {
   uint64_t value = 0;
@@ -123,6 +128,11 @@ inline auto read_le(std::istream& in) {
   return read_le(in, Value{});
 }
 
+inline double read_double_le(FILE* in) { // counterpart of write_double_le: reads one value with read_le and returns it reinterpreted as a double
+
uint64_t val = read_le(in);
+  return *reinterpret_cast(&val);
+}
+
 /**
  * Clase "Compression".
  * Class methods to access compressed data by the byte-aligned method
diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc
index 7391318..eecce47 100644
--- a/lttoolbox/transducer.cc
+++ b/lttoolbox/transducer.cc
@@ -674,8 +674,8 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha)
 
   for (uint64_t i = 0; i < final_count; i++) {
     uint64_t s = read_le(in);
-    uint64_t w = read_le(in);
-    finals.insert(make_pair(s, *reinterpret_cast(&w)));
+    double w = read_double_le(in);
+    finals.insert(make_pair(s, w));
   }
 
   vector offsets;
@@ -695,7 +695,7 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha)
       uint64_t osym = read_le(in);
       int32_t sym = alpha((int32_t)isym, (int32_t)osym);
       uint64_t dest = read_le(in);
-      uint64_t wght = read_le(in);
+      double wght = read_double_le(in);
       transitions[state].insert(make_pair(sym, make_pair(dest, wght)));
     }
   }
@@ -754,8 +754,7 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha)
       write_le(out, sym.first); // input symbol
       write_le(out, sym.second); // output symbol
       write_le(out, tr->second.first); // destination
-      uint64_t w = *reinterpret_cast(&tr->second.second);
-      write_le(out, w); // weight
+      write_double_le(out, tr->second.second); // weight
     }
   }
 }
diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc
new file mode 100644
index 0000000..47f08d6
--- /dev/null
+++ b/lttoolbox/transducer_exe.cc
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) 2021 Apertium
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <lttoolbox/transducer_exe.h>
+
+#include <lttoolbox/compression.h>
+
+#include <cstring>
+#include <stdexcept>
+
+// includes needed for reading non-mmap files
+#include <map>
+#include <vector>
+
+/**
+ * Create an empty transducer: all counts are zero and no arrays are
+ * allocated until read() is called.
+ */
+TransducerExe::TransducerExe() :
+  initial(0), state_count(0), final_count(0), transition_count(0),
+  finals(nullptr), offsets(nullptr), transitions(nullptr)
+{}
+
+/**
+ * Free the arrays allocated by read(), if any.
+ */
+TransducerExe::~TransducerExe()
+{
+  delete[] finals;
+  delete[] offsets;
+  delete[] transitions;
+}
+
+/**
+ * Load a transducer from `input`, replacing whatever was loaded before.
+ *
+ * If the stream starts with HEADER_TRANSDUCER, the feature flags that
+ * follow it select the layout: with TDF_MMAP the fixed-width
+ * little-endian layout is read, otherwise the multibyte-compressed one
+ * (with weights if TDF_WEIGHTS is set).  Without the header the stream
+ * is rewound and read as multibyte-compressed data without weights.
+ *
+ * @param input    stream positioned at the start of the transducer
+ * @param alphabet splits the symbol-pair codes of the multibyte-compressed
+ *                 layout into input and output symbols
+ * @throws std::runtime_error if the feature flags are TDF_UNKNOWN or above
+ */
+void
+TransducerExe::read(FILE* input, Alphabet& alphabet)
+{
+  // Let go of anything an earlier read() loaded, so that reading twice
+  // neither leaks the old arrays nor leaves stale counts behind if this
+  // call throws.
+  delete[] finals;
+  delete[] offsets;
+  delete[] transitions;
+  finals = nullptr;
+  offsets = nullptr;
+  transitions = nullptr;
+  initial = 0;
+  state_count = 0;
+  final_count = 0;
+  transition_count = 0;
+
+  bool read_weights = false; // only matters for pre-mmap
+  bool mmap = false;
+  fpos_t pos;
+  fgetpos(input, &pos);
+  // zero-filled, so a short read just fails the comparison below
+  char header[4]{};
+  fread_unlocked(header, 1, 4, input);
+  if (strncmp(header, HEADER_TRANSDUCER, 4) == 0) {
+    auto features = read_le<uint64_t>(input);
+    if (features >= TDF_UNKNOWN) {
+      throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!");
+    }
+    read_weights = (features & TDF_WEIGHTS);
+    mmap = (features & TDF_MMAP);
+  } else {
+    // no header
+    fsetpos(input, &pos);
+  }
+
+  if (mmap) {
+    read_le<uint64_t>(input); // total size
+    initial = read_le<uint64_t>(input);
+    state_count = read_le<uint64_t>(input);
+    final_count = read_le<uint64_t>(input);
+    transition_count = read_le<uint64_t>(input);
+
+    finals = new Final[final_count];
+    for (uint64_t i = 0; i < final_count; i++) {
+      finals[i].state = read_le<uint64_t>(input);
+      finals[i].weight = read_double_le(input);
+    }
+
+    // one slot more than there are states: the last one holds the
+    // total number of transitions
+    offsets = new uint64_t[state_count+1];
+    for (uint64_t i = 0; i < state_count; i++) {
+      offsets[i] = read_le<uint64_t>(input);
+    }
+    offsets[state_count] = transition_count;
+
+    transitions = new Transition[transition_count];
+    for (uint64_t i = 0; i < transition_count; i++) {
+      transitions[i].isym = read_le<uint64_t>(input);
+      transitions[i].osym = read_le<uint64_t>(input);
+      transitions[i].dest = read_le<uint64_t>(input);
+      transitions[i].weight = read_double_le(input);
+    }
+  } else {
+    initial = Compression::multibyte_read(input);
+    final_count = Compression::multibyte_read(input);
+
+    // final states are stored as increments over the previous one
+    uint64_t base_state = 0;
+    double base_weight = 0.0;
+    finals = new Final[final_count];
+    for (uint64_t i = 0; i < final_count; i++) {
+      base_state += Compression::multibyte_read(input);
+      if (read_weights) {
+        // taken as read rather than summed up, the same way the
+        // transition weights below are
+        base_weight = Compression::long_multibyte_read(input);
+      }
+      finals[i].state = base_state;
+      finals[i].weight = base_weight;
+    }
+
+    state_count = Compression::multibyte_read(input);
+    offsets = new uint64_t[state_count+1];
+    transition_count = 0;
+    std::vector<Transition> loaded;
+    for (uint64_t state = 0; state < state_count; state++) {
+      offsets[state] = transition_count;
+      // this state's transitions grouped by input symbol, in file order
+      // within each group
+      std::map<uint64_t, std::vector<Transition>> by_input;
+      uint64_t count = Compression::multibyte_read(input);
+      transition_count += count;
+      int32_t tag_base = 0;
+      for (uint64_t t = 0; t < count; t++) {
+        tag_base += Compression::multibyte_read(input);
+        // the stored value is added to the number of the source state,
+        // so the loop over transitions needs a counter of its own
+        uint64_t dest = (state + Compression::multibyte_read(input)) % state_count;
+        if (read_weights) {
+          // same reader as for the weights of the final states
+          base_weight = Compression::long_multibyte_read(input);
+        }
+        auto sym = alphabet.decode(tag_base);
+        Transition tr;
+        tr.isym = sym.first;
+        tr.osym = sym.second;
+        tr.dest = dest;
+        tr.weight = base_weight;
+        by_input[tr.isym].push_back(tr);
+      }
+      for (auto& group : by_input) {
+        loaded.insert(loaded.end(), group.second.begin(), group.second.end());
+      }
+    }
+    offsets[state_count] = transition_count;
+    transitions = new Transition[transition_count];
+    for (uint64_t i = 0; i < transition_count; i++) {
+      transitions[i] = loaded[i];
+    }
+  }
+}
diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h
new file mode 100644
index 0000000..8741296
--- /dev/null
+++ b/lttoolbox/transducer_exe.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2021 Apertium
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef LT_TRANSDUCER_EXE_H
+#define LT_TRANSDUCER_EXE_H
+
+#include <cstdint>
+#include <cstdio>
+
+// only needed for reading non-mmap files
+#include <lttoolbox/alphabet.h>
+
+/**
+ * One transition: input symbol, output symbol, destination state and
+ * weight.
+ */
+struct Transition {
+  uint64_t isym; // TODO: should be int32_t
+  uint64_t osym;
+  uint64_t dest;
+  double weight;
+};
+
+/**
+ * A final state and its weight.
+ */
+struct Final {
+  uint64_t state;
+  double weight;
+};
+
+/**
+ * A transducer loaded into flat arrays by read().
+ *
+ * The arrays are owned by the object and freed in the destructor, so
+ * objects of this class cannot be copied.
+ */
+class TransducerExe {
+private:
+  uint64_t initial;
+  uint64_t state_count;
+  uint64_t final_count;
+  uint64_t transition_count;
+  Final* finals;           // final_count entries
+  uint64_t* offsets;       // state_count+1 entries, the last one being transition_count
+  Transition* transitions; // transition_count entries
+public:
+  TransducerExe();
+  ~TransducerExe();
+  // copying would leave two objects freeing the same arrays
+  TransducerExe(const TransducerExe&) = delete;
+  TransducerExe& operator=(const TransducerExe&) = delete;
+  void read(FILE* input, Alphabet& alphabet);
+};
+
+#endif