commit a1952107775bf1176efa6328e5d5b0f87b70b543 Author: Daniel Swanson Date: Mon Aug 2 12:52:39 2021 -0500 lt-comp working with new format diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc index 284e8b9..d6ef296 100644 --- a/lttoolbox/alphabet.cc +++ b/lttoolbox/alphabet.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -181,6 +182,30 @@ Alphabet::read(FILE *input) *this = a_new; } +void +Alphabet::write_mmap(FILE* output, StringWriter& sw) +{ + write_le_64(output, slexicinv.size()); + for (auto& it : slexicinv) { + StringRef r = sw.add(it); + write_le_32(output, r.start); + write_le_32(output, r.count); + } +} + +void +Alphabet::read_mmap(FILE* input, StringWriter& sw) +{ + int64_t count = read_le_64(input); + for (int64_t i = 0; i < count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString t = UString{sw.get(s, c)}; + slexicinv.push_back(t); + slexic[t] = -i-1; + } +} + void Alphabet::serialise(std::ostream &serialised) const { diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index 72a3b36..f314075 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -23,7 +23,8 @@ #include #include #include -#include "ustring.h" +#include +#include using namespace std; using namespace icu; @@ -135,6 +136,9 @@ public: */ void read(FILE *input); + void write_mmap(FILE* output, StringWriter& sw); + void read_mmap(FILE* input, StringWriter& sw); + void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index b55e642..5f43865 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -18,6 +18,7 @@ #include #include +#include AlphabetExe::AlphabetExe(StringWriter* sw_) : sw(sw_), tag_count(0), tags(nullptr) @@ -32,6 +33,13 @@ void AlphabetExe::read(FILE* input, bool mmap) { if (mmap) { + tag_count = read_le_64(input); + tags = new StringRef[tag_count]; + for (uint64_t i = 0; i < tag_count; i++) { + tags[i].start = read_le_32(input); + tags[i].count = read_le_32(input); + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } } else { tag_count = Compression::multibyte_read(input); tags = new StringRef[tag_count]; diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 128fac6..761caf3 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -16,10 +16,12 @@ */ #include #include +#include #include #include #include #include +#include #include #include @@ -946,25 +948,37 @@ Compiler::write(FILE *output) { fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); uint64_t features = 0; - write_le(output, features); + features |= LTF_MMAP; + write_le_64(output, features); + + StringWriter sw; + StringRef letter_loc = sw.add(letters); + for (auto& it : alphabet.getTags()) { + sw.add(it); + } + for (auto& it : sections) { + sw.add(it.first); + } + + sw.write(output); // letters - Compression::string_write(letters, output); + write_le_32(output, letter_loc.start); + write_le_32(output, letter_loc.count); // symbols - alphabet.write(output); + alphabet.write_mmap(output, sw); // transducers - Compression::multibyte_write(sections.size(), output); + write_le_64(output, sections.size()); - int count=0; - for(auto& it : sections) - { - count++; + for(auto& it : sections) { cout << it.first << " " << it.second.size(); cout << " " << it.second.numberOfTransitions() << endl; - Compression::string_write(it.first, output); - it.second.write(output); + StringRef loc = sw.add(it.first); + write_le_32(output, loc.start); + write_le_32(output, loc.count); + it.second.write_mmap(output, alphabet); } } diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 01e9822..1bb7a92 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -16,6 +16,7 @@ */ #include #include +#include #include #include @@ -947,6 +948,25 @@ FSTProcessor::load(FILE *input) } if (mmap) { + str_write.read(input); + + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + vector vec; + ustring_to_vec32(str_write.get(s, c), vec); + alphabetic_chars.insert(vec.begin(), vec.end()); + // alphabetic_chars + + alphabet.read(input, true); + + uint64_t tr_count = read_le_64(input); + Alphabet temp; + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{str_write.get(s, c)}; + transducers[name].read(input, temp); + } } else { // letters diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index f818f07..4bc3d64 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -55,7 +55,7 @@ StringWriter::read(FILE* in) buffer.clear(); buffer.reserve(len); uint8_t temp[len*2]{}; - if (fread_unlocked(&temp, 1, len*2, in) != len) { + if (fread_unlocked(&temp, 1, len*2, in) != len*2) { throw std::runtime_error("Failed to read strings"); } uint16_t c; diff --git a/lttoolbox/ustring.cc b/lttoolbox/ustring.cc index 87056c2..daac05e 100644 --- a/lttoolbox/ustring.cc +++ b/lttoolbox/ustring.cc @@ -48,7 +48,7 @@ to_ustring(const uint8_t* s) } void -ustring_to_vec32(const UString& str, std::vector& vec) +ustring_to_vec32(UString_view str, std::vector& vec) { if (str.empty()) { return; diff --git a/lttoolbox/ustring.h b/lttoolbox/ustring.h index 5ffa878..0073fc4 100644 --- a/lttoolbox/ustring.h +++ b/lttoolbox/ustring.h @@ -35,7 +35,7 @@ UString to_ustring(const char* str); UString to_ustring(const uint8_t* str); // append UTF-16 string to UTF-32 vector of symbols -void ustring_to_vec32(const UString& str, std::vector& vec); +void ustring_to_vec32(UString_view str, std::vector& vec); inline std::ostream& operator<<(std::ostream& ostr, char16_t c)