commit dad61a585e49519942a8111888beb11283cebf76 Author: Daniel Swanson Date: Mon Aug 2 16:46:56 2021 -0500 actually mmap diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index ec07213..f3737b4 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,7 +1,7 @@ h_sources = alphabet.h alphabet_exe.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ - match_exe.h match_node.h match_state.h match_state2.h my_stdio.h node.h \ + match_exe.h match_node.h match_state.h match_state2.h mmap.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc index 5f43865..0c682f0 100644 --- a/lttoolbox/alphabet_exe.cc +++ b/lttoolbox/alphabet_exe.cc @@ -26,7 +26,9 @@ AlphabetExe::AlphabetExe(StringWriter* sw_) AlphabetExe::~AlphabetExe() { - delete[] tags; + if (!mmapping) { + delete[] tags; + } } void @@ -63,6 +65,18 @@ AlphabetExe::read(FILE* input, bool mmap) } } +void* +AlphabetExe::init(void* ptr) +{ + mmapping = true; + tag_count = from_le_64(reinterpret_cast(ptr)[0]); + tags = reinterpret_cast(ptr + sizeof(uint64_t)); + for (uint64_t i = 0; i < tag_count; i++) { + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } + return ptr + sizeof(uint64_t) + tag_count*sizeof(StringRef); +} + int32_t AlphabetExe::operator()(UString_view sv) { diff --git a/lttoolbox/alphabet_exe.h b/lttoolbox/alphabet_exe.h index af579bb..29dcdbe 100644 --- a/lttoolbox/alphabet_exe.h +++ b/lttoolbox/alphabet_exe.h @@ -27,10 +27,12 @@ private: uint64_t tag_count; StringRef* tags; std::map symbol_map; + bool mmapping = false; public: AlphabetExe(StringWriter* sw_); ~AlphabetExe(); void read(FILE* in, bool mmap); + void* init(void* ptr); int32_t operator()(UString_view sv); void getSymbol(UString& result, int32_t symbol, bool uppercase = false) const; bool isTag(const int32_t symbol) const; diff --git a/lttoolbox/endian_util.h b/lttoolbox/endian_util.h index 069bd74..fadbbac 100644 --- a/lttoolbox/endian_util.h +++ b/lttoolbox/endian_util.h @@ -23,40 +23,48 @@ #include #include -inline uint32_t to_le_32(uint32_t v) { - return (((v & 0xFF) << 24) | - ((v & 0xFF00) << 8) | - ((v & 0xFF0000) >> 8) | - ((v & 0xFF000000) >> 24)); +inline uint32_t to_le_32(uint32_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + bytes[3] = (v >> 24) & 0xFF; + bytes[2] = (v >> 16) & 0xFF; + bytes[1] = (v >> 8) & 0xFF; + bytes[0] = v & 0xFF; + return v; } -inline uint32_t from_le_32(uint32_t v) { - return (((v & 0xFF000000) >> 24) | - ((v & 0xFF0000) >> 8) | - ((v & 0xFF00) << 8) | - ((v & 0xFF) << 24)); +inline uint32_t from_le_32(uint32_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + v = ((bytes[3] << 24) | + (bytes[2] << 16) | + (bytes[1] << 8) | + bytes[0]); + return v; } -inline uint64_t to_le_64(uint64_t v) { - return (((v & 0xFF) << 56) | - ((v & 0xFF00) << 40) | - ((v & 0xFF0000) << 24) | - ((v & 0xFF000000) << 8) | - ((v & 0xFF00000000) >> 8) | - ((v & 0xFF0000000000) >> 24) | - ((v & 0xFF000000000000) >> 40) | - ((v & 0xFF00000000000000) >> 56)); +inline uint64_t to_le_64(uint64_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + bytes[7] = (v >> 56) & 0xFF; + bytes[6] = (v >> 48) & 0xFF; + bytes[5] = (v >> 40) & 0xFF; + bytes[4] = (v >> 32) & 0xFF; + bytes[3] = (v >> 24) & 0xFF; + bytes[2] = (v >> 16) & 0xFF; + bytes[1] = (v >> 8) & 0xFF; + bytes[0] = v & 0xFF; + return v; } -inline uint64_t from_le_64(uint64_t v) { - return (((v & 0xFF00000000000000) >> 56) | - ((v & 0xFF000000000000) >> 40) | - ((v & 0xFF0000000000) >> 24) | - ((v & 0xFF00000000) >> 8) | - ((v & 0xFF000000) << 8) | - ((v & 0xFF0000) << 24) | - ((v & 0xFF00) << 40) | - ((v & 0xFF) << 56)); +inline uint64_t from_le_64(uint64_t& v) { + uint8_t* bytes = reinterpret_cast(&v); + v = ((static_cast(bytes[7]) << 56ull) | + (static_cast(bytes[6]) << 48ull) | + (static_cast(bytes[5]) << 40ull) | + (static_cast(bytes[4]) << 32ull) | + (static_cast(bytes[3]) << 24ull) | + (static_cast(bytes[2]) << 16ull) | + (static_cast(bytes[1]) << 8ull) | + (static_cast(bytes[0]))); + return v; } inline auto write_le_32(FILE* out, uint32_t value) { diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 1bb7a92..3b675c8 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,13 @@ FSTProcessor::FSTProcessor() } } +FSTProcessor::~FSTProcessor() +{ + if (mmapping) { + munmap(mmap_pointer, mmap_len); + } +} + void FSTProcessor::streamError() { @@ -935,7 +943,7 @@ FSTProcessor::load(FILE *input) char header[4]{}; fread_unlocked(header, 1, 4, input); if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { - auto features = read_le(input); + auto features = read_le_64(input); if (features >= LTF_UNKNOWN) { throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); } @@ -948,24 +956,50 @@ FSTProcessor::load(FILE *input) } if (mmap) { - str_write.read(input); + fgetpos(input, &pos); + rewind(input); + mmapping = mmap_file(input, mmap_pointer, mmap_len); + if (mmapping) { + void* ptr = mmap_pointer + 12; + ptr = str_write.init(ptr); - uint32_t s = read_le_32(input); - uint32_t c = read_le_32(input); - vector vec; - ustring_to_vec32(str_write.get(s, c), vec); - alphabetic_chars.insert(vec.begin(), vec.end()); - // alphabetic_chars + StringRef let_loc = reinterpret_cast(ptr)[0]; + vector vec; + ustring_to_vec32(str_write.get(let_loc), vec); + alphabetic_chars.insert(vec.begin(), vec.end()); + ptr += sizeof(StringRef); - alphabet.read(input, true); + ptr = alphabet.init(ptr); + + uint64_t tr_count = reinterpret_cast(ptr)[0]; + ptr += sizeof(uint64_t); + for (uint64_t i = 0; i < tr_count; i++) { + StringRef tn = reinterpret_cast(ptr)[0]; + ptr += sizeof(StringRef); + UString name = UString{str_write.get(tn)}; + ptr = transducers[name].init(ptr); + } + } else { + fsetpos(input, &pos); + + str_write.read(input); - uint64_t tr_count = read_le_64(input); - Alphabet temp; - for (uint64_t i = 0; i < tr_count; i++) { uint32_t s = read_le_32(input); uint32_t c = read_le_32(input); - UString name = UString{str_write.get(s, c)}; - transducers[name].read(input, temp); + vector vec; + ustring_to_vec32(str_write.get(s, c), vec); + alphabetic_chars.insert(vec.begin(), vec.end()); + + alphabet.read(input, true); + + uint64_t tr_count = read_le_64(input); + Alphabet temp; + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{str_write.get(s, c)}; + transducers[name].read(input, temp); + } } } else { diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 412bba2..7951b76 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -259,6 +259,10 @@ private: */ int maxWeightClasses = INT_MAX; + bool mmapping = false; + void* mmap_pointer = nullptr; + int mmap_len = 0; + /** * Prints an error of input stream and exits */ @@ -498,6 +502,7 @@ public: static UString const WBLANK_FINAL; FSTProcessor(); + ~FSTProcessor(); void initAnalysis(); void initTMAnalysis(); diff --git a/lttoolbox/match_state.cc b/lttoolbox/match_state.cc index 84f4aef..75e3ad7 100644 --- a/lttoolbox/match_state.cc +++ b/lttoolbox/match_state.cc @@ -15,7 +15,6 @@ * along with this program; if not, see . */ #include -#include #include #include diff --git a/lttoolbox/mmap.h b/lttoolbox/mmap.h new file mode 100644 index 0000000..221f57b --- /dev/null +++ b/lttoolbox/mmap.h @@ -0,0 +1,34 @@ +#ifndef _LT_MMAP_ +#define _LT_MMAP_ + +#include +#include +#include +#include +#include +#include + +#include + +#include + +inline bool mmap_file(FILE* fd, void*& ptr, int& len) +{ + std::cerr << "mmap_file()\n"; + struct stat sb; + if (fstat(fileno(fd), &sb) == -1) { + std::cerr << "fstat failed\n"; + return false; + } + len = sb.st_size; + std::cerr << "file length is " << len << "\n"; + ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fileno(fd), 0); + if (ptr == MAP_FAILED) { + std::cerr << "mmap failed\nerrno = " << errno << "\n"; + return false; + } + std::cerr << "got pointer\n"; + return true; +} + +#endif diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index 15d81f7..bb4d6eb 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -23,10 +23,10 @@ StringRef StringWriter::add(UString_view s) { - auto start = buffer.find(s); + auto start = edit_buffer.find(s); if (start == UString::npos) { - start = buffer.size(); - buffer += s; + start = edit_buffer.size(); + edit_buffer += s; } StringRef ret; ret.start = start; @@ -37,43 +37,58 @@ StringWriter::add(UString_view s) UString_view StringWriter::get(const uint32_t start, const uint32_t count) { - UString_view ret(buffer); - return ret.substr(start, count); + if (mmapping) { + UString_view ret(mmap_buffer, mmap_size); + return ret.substr(start, count); + } else { + UString_view ret(edit_buffer); + return ret.substr(start, count); + } } UString_view StringWriter::get(const StringRef& ref) { - UString_view ret(buffer); - return ret.substr(ref.start, ref.count); + return get(ref.start, ref.count); } void StringWriter::read(FILE* in) { uint64_t len = read_le_64(in); - buffer.clear(); - buffer.reserve(len); + edit_buffer.clear(); + edit_buffer.reserve(len); uint8_t temp[len*2]{}; if (fread_unlocked(&temp, 1, len*2, in) != len*2) { throw std::runtime_error("Failed to read strings"); } uint16_t c; for (uint64_t i = 0; i < len*2; i += 2) { - buffer += static_cast(temp[i] | (temp[i+1] << 8)); + edit_buffer += static_cast(temp[i] | (temp[i+1] << 8)); } } void StringWriter::write(FILE* out) { - write_le_64(out, buffer.size()); - uint8_t temp[buffer.size()*2]{}; - for (uint64_t i = 0; i < buffer.size(); i++) { - temp[2*i] = buffer[i] & 0xFF; - temp[2*i+1] = (buffer[i] >> 8) & 0xFF; + write_le_64(out, edit_buffer.size()); + uint8_t temp[edit_buffer.size()*2]{}; + for (uint64_t i = 0; i < edit_buffer.size(); i++) { + temp[2*i] = edit_buffer[i] & 0xFF; + temp[2*i+1] = (edit_buffer[i] >> 8) & 0xFF; } - if (fwrite_unlocked(&temp, 1, buffer.size()*2, out) != buffer.size()*2) { + if (fwrite_unlocked(&temp, 1, edit_buffer.size()*2, out) != edit_buffer.size()*2) { throw std::runtime_error("Failed to write strings"); } } + +void* +StringWriter::init(void* ptr) +{ + mmapping = true; + mmap_size = reinterpret_cast(ptr)[0]; + ptr += sizeof(uint64_t); + mmap_buffer = reinterpret_cast(ptr); + get(0, mmap_size); + return ptr + sizeof(UChar)*mmap_size; +} diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h index 12000a4..182180f 100644 --- a/lttoolbox/string_writer.h +++ b/lttoolbox/string_writer.h @@ -28,13 +28,18 @@ struct StringRef { }; class StringWriter { +private: + bool mmapping = false; + UString edit_buffer; + uint64_t mmap_size; + UChar* mmap_buffer; public: - UString buffer; StringRef add(UString_view s); UString_view get(const uint32_t start, const uint32_t count); UString_view get(const StringRef& ref); void read(FILE* in); void write(FILE* out); + void* init(void* ptr); }; #endif diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index 980c024..4b52ccf 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -691,12 +691,12 @@ Transducer::read_mmap(FILE* in, Alphabet& alpha) } vector offsets; - offsets.reserve(state_count); + offsets.reserve(state_count+1); for (uint64_t i = 0; i < state_count; i++) { transitions[i].clear(); offsets.push_back(read_le_64(in)); } - offsets.push_back(0); + offsets.push_back(read_le_64(in)); uint64_t state = 0; for (uint64_t i = 0; i < trans_count; i++) { @@ -723,19 +723,20 @@ Transducer::write_mmap(FILE* out, const Alphabet& alpha) uint64_t tr_count = 0; vector offsets; - offsets.reserve(transitions.size()); + offsets.reserve(transitions.size()+1); for (auto& it : transitions) { offsets.push_back(tr_count); tr_count += it.second.size(); } + offsets.push_back(tr_count); // TODO: which things should be smaller than u64? uint64_t total_size = - ( transitions.size() + // offset of each state - (tr_count * 4) + // each transition - (finals.size() * 2) + // final states - 4 ); // initial state + length of each section + ( transitions.size() + 1 + // offset of each state + (tr_count * 3) + // each transition + (finals.size() * 2) + // final states + 4 ); // initial state + length of each section write_le_64(out, total_size*8); // number of bytes after this write_le_64(out, initial); // initial state diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index fd69922..3b6fe60 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -32,9 +32,11 @@ TransducerExe::TransducerExe() : TransducerExe::~TransducerExe() { - delete[] finals; - delete[] offsets; - delete[] transitions; + if (!mmapping) { + delete[] finals; + delete[] offsets; + delete[] transitions; + } } void @@ -72,10 +74,9 @@ TransducerExe::read(FILE* input, Alphabet& alphabet) } offsets = new uint64_t[state_count+1]; - for (uint64_t i = 0; i < state_count; i++) { + for (uint64_t i = 0; i < state_count+1; i++) { offsets[i] = read_le_64(input); } - offsets[state_count] = transition_count; transitions = new Transition[transition_count]; for (uint64_t i = 0; i < transition_count; i++) { @@ -144,6 +145,32 @@ TransducerExe::read(FILE* input, Alphabet& alphabet) } } +void* +TransducerExe::init(void* ptr) +{ + mmapping = true; + + ptr += 4 + sizeof(uint64_t); // skip header + uint64_t* arr = reinterpret_cast(ptr); + uint64_t total_size = arr[0]; + initial = arr[1]; + state_count = arr[2]; + final_count = arr[3]; + transition_count = arr[4]; + ptr += sizeof(uint64_t)*5; + + finals = reinterpret_cast(ptr); + ptr += sizeof(Final)*final_count; + + offsets = reinterpret_cast(ptr); + ptr += sizeof(uint64_t)*(state_count+1); + + transitions = reinterpret_cast(ptr); + ptr += sizeof(Transition)*transition_count; + + return ptr; +} + void TransducerExe::get_range(const uint64_t state, const int32_t symbol, uint64_t& start, uint64_t& end) diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h index eadf894..cef9a30 100644 --- a/lttoolbox/transducer_exe.h +++ b/lttoolbox/transducer_exe.h @@ -50,6 +50,7 @@ private: Final* finals; uint64_t* offsets; Transition* transitions; + bool mmapping = false; void get_range(const uint64_t state, const int32_t sym, uint64_t& start, uint64_t& end); @@ -59,6 +60,7 @@ public: TransducerExe(); ~TransducerExe(); void read(FILE* input, Alphabet& alphabet); + void* init(void* ptr); }; #endif