commit 77a4a1ca6063c557f13d33ef7922797f246cebfa Author: Daniel Swanson Date: Mon Aug 2 13:49:59 2021 -0500 lt-print and lt-trim accepting new format diff --git a/lttoolbox/lt_print.cc b/lttoolbox/lt_print.cc index 8139e02..c2c40db 100644 --- a/lttoolbox/lt_print.cc +++ b/lttoolbox/lt_print.cc @@ -16,6 +16,8 @@ */ #include #include +#include +#include #include #include @@ -128,10 +130,11 @@ int main(int argc, char *argv[]) } Alphabet alphabet; - set alphabetic_chars; + set alphabetic_chars; map transducers; + bool mmap = false; fpos_t pos; if (fgetpos(input, &pos) == 0) { char header[4]{}; @@ -141,6 +144,7 @@ int main(int argc, char *argv[]) if (features >= LTF_UNKNOWN) { throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); } + mmap = features & LTF_MMAP; } else { // Old binary format @@ -148,25 +152,44 @@ int main(int argc, char *argv[]) } } - // letters - int len = Compression::multibyte_read(input); - while(len > 0) - { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); - len--; - } + if (mmap) { + StringWriter sw; + sw.read(input); - // symbols - alphabet.read(input); + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + vector vec; + ustring_to_vec32(sw.get(s, c), vec); + alphabetic_chars.insert(vec.begin(), vec.end()); - len = Compression::multibyte_read(input); + alphabet.read_mmap(input, sw); - while(len > 0) - { - UString name = Compression::string_read(input); - transducers[name].read(input); + uint64_t tr_count = read_le_64(input); + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{sw.get(s, c)}; + transducers[name].read_mmap(input, alphabet); + } + } else { + // letters + int len = Compression::multibyte_read(input); + while(len > 0) { + alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + len--; + } - len--; + // symbols + alphabet.read(input); + + len = Compression::multibyte_read(input); + + while(len > 0) { + UString name = Compression::string_read(input); + transducers[name].read(input); + + len--; + } } ///////////////////// diff --git a/lttoolbox/lt_trim.cc b/lttoolbox/lt_trim.cc index f685752..e1e3dc3 100644 --- a/lttoolbox/lt_trim.cc +++ b/lttoolbox/lt_trim.cc @@ -16,6 +16,8 @@ */ #include #include +#include +#include #include #include @@ -44,6 +46,7 @@ read_fst(FILE *bin_file) std::map transducers; fpos_t pos; + bool mmap = false; if (fgetpos(bin_file, &pos) == 0) { char header[4]{}; fread_unlocked(header, 1, 4, bin_file); @@ -52,6 +55,7 @@ read_fst(FILE *bin_file) if (features >= LTF_UNKNOWN) { throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); } + mmap = features & LTF_MMAP; } else { // Old binary format @@ -59,26 +63,43 @@ read_fst(FILE *bin_file) } } - // letters - UString letters = Compression::string_read(bin_file); + UString letters; - // symbols - new_alphabet.read(bin_file); + if (mmap) { + StringWriter sw; + sw.read(bin_file); - int len = Compression::multibyte_read(bin_file); + uint32_t s = read_le_32(bin_file); + uint32_t c = read_le_32(bin_file); + letters = UString{sw.get(s, c)}; - while(len > 0) - { - UString name = Compression::string_read(bin_file); - transducers[name].read(bin_file); + new_alphabet.read_mmap(bin_file, sw); + + uint64_t tr_count = read_le_64(bin_file); + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(bin_file); + uint32_t c = read_le_32(bin_file); + UString name = UString{sw.get(s, c)}; + transducers[name].read_mmap(bin_file, new_alphabet); + } + } else { + // letters + letters = Compression::string_read(bin_file); - len--; + // symbols + new_alphabet.read(bin_file); + + int len = Compression::multibyte_read(bin_file); + + while(len > 0) { + UString name = Compression::string_read(bin_file); + transducers[name].read(bin_file); + + len--; + } } - std::pair alph_letters; - alph_letters.first = new_alphabet; - alph_letters.second = letters; - return std::pair, std::map > (alph_letters, transducers); + return make_pair(make_pair(new_alphabet, letters), transducers); } std::pair, std::map > diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index fdd4166..d783c28 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -660,6 +660,17 @@ Transducer::read(FILE *input, int const decalage) void Transducer::read_mmap(FILE* in, Alphabet& alpha) { + char header[4]{}; + auto r = fread_unlocked(header, 1, 4, in); + if (r == 4 && strncmp(header, HEADER_TRANSDUCER, 4) == 0) { + auto features = read_le_64(in); + if (features >= TDF_UNKNOWN) { + throw std::runtime_error("Transducer has features that are unknown to this version of lttoolbox - upgrade!"); + } + } else { + throw std::runtime_error("Unable to read transducer header!"); + } + read_le_64(in); // total size initial = read_le_64(in); uint64_t state_count = read_le_64(in); @@ -689,7 +700,7 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) uint64_t state = 0; for (uint64_t i = 0; i < trans_count; i++) { - if (i == offsets[state+1]) { + while (i == offsets[state+1]) { state++; } int32_t isym = read_le_s32(in);