commit cbc3272feaf35055fb3449ff9fce2262a0bdc0fd Author: Daniel Swanson Date: Fri Jul 30 16:28:29 2021 -0500 use new TransducerExe in lt-proc diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 732acc8..2e4ac97 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -741,11 +741,12 @@ FSTProcessor::combineWblanks() void FSTProcessor::calcInitial() { + set temp; for(auto& it : transducers) { - root.addTransition(0, 0, it.second.getInitial(), default_weight); + temp.insert(&it.second); } - initial_state.init(&root); + initial_state.init(temp); } bool @@ -767,23 +768,19 @@ FSTProcessor::classifyFinals() for(auto& it : transducers) { if(endsWith(it.first, "@inconditional"_u)) { - inconditional.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + inconditional.insert(&it.second); } else if(endsWith(it.first, "@standard"_u)) { - standard.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + standard.insert(&it.second); } else if(endsWith(it.first, "@postblank"_u)) { - postblank.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + postblank.insert(&it.second); } else if(endsWith(it.first, "@preblank"_u)) { - preblank.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + preblank.insert(&it.second); } else { @@ -930,6 +927,7 @@ FSTProcessor::isAlphabetic(UChar32 const c) const void FSTProcessor::load(FILE *input) { + bool mmap = false; fpos_t pos; if (fgetpos(input, &pos) == 0) { char header[4]{}; @@ -939,6 +937,7 @@ FSTProcessor::load(FILE *input) if (features >= LTF_UNKNOWN) { throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); } + mmap = features & LTF_MMAP; } else { // Old binary format @@ -946,24 +945,26 @@ FSTProcessor::load(FILE *input) } } - // letters - int len = Compression::multibyte_read(input); - while(len > 0) - { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); - len--; - } + if (mmap) { + } else { + + // letters + int len = Compression::multibyte_read(input); + while(len > 0) { + alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); + len--; + } - // symbols - alphabet.read(input); + // symbols + alphabet.read(input); - len = Compression::multibyte_read(input); + len = Compression::multibyte_read(input); - while(len > 0) - { - UString name = Compression::string_read(input); - transducers[name].read(input, alphabet); - len--; + while(len > 0) { + UString name = Compression::string_read(input); + transducers[name].read(input, alphabet); + len--; + } } } @@ -984,8 +985,7 @@ FSTProcessor::initTMAnalysis() calcInitial(); for(auto& it : transducers) { - all_finals.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + all_finals.insert(&it.second); } } @@ -995,8 +995,7 @@ FSTProcessor::initGeneration() setIgnoredChars(false); calcInitial(); for(auto& it : transducers) { - all_finals.insert(it.second.getFinals().begin(), - it.second.getFinals().end()); + all_finals.insert(&it.second); } } diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 32263ac..9bfe0bc 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include @@ -57,7 +57,7 @@ private: /** * Transducers in FSTP */ - map transducers; + map transducers; /** * Current state of lexical analysis @@ -77,27 +77,27 @@ private: /** * The final states of inconditional sections in the dictionaries */ - map inconditional; + set inconditional; /** * The final states of standard sections in the dictionaries */ - map standard; + set standard; /** * The final states of postblank sections in the dictionaries */ - map postblank; + set postblank; /** * The final states of preblank sections in the dictionaries */ - map preblank; + set preblank; /** * Merge of 'inconditional', 'standard', 'postblank' and 'preblank' sets */ - map all_finals; + set all_finals; /** * Queue of blanks, used in reading methods diff --git a/lttoolbox/state.cc b/lttoolbox/state.cc index facd537..be492bc 100644 --- a/lttoolbox/state.cc +++ b/lttoolbox/state.cc @@ -17,7 +17,6 @@ #include #include -#include #include //debug// @@ -26,8 +25,7 @@ //debug// State::State() -{ -} +{} State::~State() { @@ -42,21 +40,15 @@ State::State(State const &s) State & State::operator =(State const &s) { - if(this != &s) - { - destroy(); - copy(s); - } - + copy(s); return *this; } void State::destroy() { - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - delete state[i].sequence; + for (auto& it : state) { + delete it.sequence; } state.clear(); @@ -65,19 +57,17 @@ State::destroy() void State::copy(State const &s) { - // release references - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - delete state[i].sequence; + if (this == &s) { + return; } + destroy(); state = s.state; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - vector> *tmp = new vector>(); - *tmp = *(state[i].sequence); - state[i].sequence = tmp; + for (auto& it : state) { + TPath* tmp = new TPath(); + *tmp = *(it.sequence); + it.sequence = tmp; } } @@ -88,63 +78,62 @@ State::size() const } void -State::init(Node *initial) +State::init(const set& exes) { - state.clear(); - state.push_back(TNodeState(initial, new vector>(), false)); - state[0].sequence->clear(); + destroy(); + for (auto& it : exes) { + state.push_back(TNodeState(it, it->initial, new TPath(), false)); + } epsilonClosure(); } bool -State::apply_into(vector* new_state, int const input, int index, bool dirty) -{ - map::const_iterator it; - it = state[index].where->transitions.find(input); - if(it != state[index].where->transitions.end()) - { - for(int j = 0; j != it->second.size; j++) - { - vector> *new_v = new vector>(); - *new_v = *(state[index].sequence); - if(it->first != 0) - { - new_v->push_back(make_pair(it->second.out_tag[j], it->second.out_weight[j])); - } - new_state->push_back(TNodeState(it->second.dest[j], new_v, state[index].dirty||dirty)); +State::apply_into(std::vector* new_state, const int32_t input, + int index, bool dirty) +{ + uint64_t start, end; + bool any = false; + TransducerExe* trans = state[index].where; + trans->get_range(state[index].state, input, start, end); + for (uint64_t i = start; i < end; i++) { + TPath* new_v = new TPath(); + *new_v = *(state[index].sequence); + if (input != 0) { + new_v->push_back(make_pair(trans->transitions[i].osym, + trans->transitions[i].weight)); } - return true; + new_state->push_back(TNodeState(trans, trans->transitions[i].dest, new_v, + state[index].dirty || dirty)); + any = true; } - return false; + return any; } bool -State::apply_into_override(vector* new_state, int const input, int const old_sym, int const new_sym, int index, bool dirty) -{ - map::const_iterator it; - it = state[index].where->transitions.find(input); - if(it != state[index].where->transitions.end()) - { - for(int j = 0; j != it->second.size; j++) - { - vector> *new_v = new vector>(); - *new_v = *(state[index].sequence); - if(it->first != 0) - { - if(it->second.out_tag[j] == old_sym) - { - new_v->push_back(make_pair(new_sym, it->second.out_weight[j])); - } - else - { - new_v->push_back(make_pair(it->second.out_tag[j], it->second.out_weight[j])); - } +State::apply_into_override(std::vector* new_state, + const int32_t input, + const int32_t old_sym, const int32_t new_sym, + int index, bool dirty) +{ + uint64_t start, end; + bool any = false; + TransducerExe* trans = state[index].where; + trans->get_range(state[index].state, input, start, end); + for (uint64_t i = start; i < end; i++) { + TPath* new_v = new TPath(); + *new_v = *(state[index].sequence); + if (input != 0) { + int32_t s = trans->transitions[i].osym; + if (s == old_sym) { + s = new_sym; } - new_state->push_back(TNodeState(it->second.dest[j], new_v, state[index].dirty||dirty)); + new_v->push_back(make_pair(s, trans->transitions[i].weight)); } - return true; + new_state->push_back(TNodeState(trans, trans->transitions[i].dest, new_v, + state[index].dirty || dirty)); + any = true; } - return false; + return any; } void @@ -269,20 +258,18 @@ State::epsilonClosure() { for(size_t i = 0; i != state.size(); i++) { - map::iterator it2; - it2 = state[i].where->transitions.find(0); - if(it2 != state[i].where->transitions.end()) - { - for(int j = 0 ; j != it2->second.size; j++) - { - vector> *tmp = new vector>(); - *tmp = *(state[i].sequence); - if(it2->second.out_tag[j] != 0) - { - tmp->push_back(make_pair(it2->second.out_tag[j], it2->second.out_weight[j])); - } - state.push_back(TNodeState(it2->second.dest[j], tmp, state[i].dirty)); + TransducerExe* trans = state[i].where; + uint64_t start, end; + trans->get_range(state[i].state, 0, start, end); + for (uint64_t j = start; j < end; j++) { + TPath* tmp = new TPath(); + *tmp = *(state[i].sequence); + if (trans->transitions[j].osym != 0) { + tmp->push_back(make_pair(trans->transitions[j].osym, + trans->transitions[j].weight)); } + state.push_back(TNodeState(trans, trans->transitions[j].dest, tmp, + state[i].dirty)); } } } @@ -426,12 +413,10 @@ State::step_case(UChar32 val, bool caseSensitive) bool -State::isFinal(map const &finals) const +State::isFinal(const set& finals) const { - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) - { + for (auto& it : state) { + if(finals.find(it.where) != finals.end() && it.where->is_final(it.state)) { return true; } } @@ -466,7 +451,7 @@ State::NFinals(vector> lf, int maxAnalyses, int maxWeightC UString -State::filterFinals(map const &finals, +State::filterFinals(const set& finals, Alphabet const &alphabet, set const &escaped_chars, bool display_weights, int max_analyses, int max_weight_classes, @@ -477,23 +462,22 @@ State::filterFinals(map const &finals, UString result; double cost = 0.0000; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { - if(state[i].dirty) + if(st.dirty) { result.clear(); cost = 0.0000; unsigned int const first_char = result.size() + firstchar; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { result += '\\'; } - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first, uppercase); - cost += ((*(state[i].sequence))[j]).second; + alphabet.getSymbol(result, ((*(st.sequence))[j]).first, uppercase); + cost += ((*(st.sequence))[j]).second; } if(firstupper) { @@ -512,19 +496,21 @@ State::filterFinals(map const &finals, { result.clear(); cost = 0.0000; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { result += '\\'; } - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first); - cost += ((*(state[i].sequence))[j]).second; + alphabet.getSymbol(result, ((*(st.sequence))[j]).first); + cost += ((*(st.sequence))[j]).second; } } // Add the weight of the final state - cost += (*(finals.find(state[i].where))).second; + double temp; + st.where->find_final(st.state, temp); + cost += temp; response.push_back(make_pair(result, cost)); } } @@ -550,7 +536,7 @@ State::filterFinals(map const &finals, set > > -State::filterFinalsLRX(map const &finals, +State::filterFinalsLRX(const set& finals, Alphabet const &alphabet, set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const @@ -560,21 +546,20 @@ State::filterFinalsLRX(map const &finals, vector current_result; UString rule_id; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { current_result.clear(); rule_id.clear(); UString current_word; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { current_word += '\\'; } UString sym; - alphabet.getSymbol(sym, ((*(state[i].sequence))[j]).first, uppercase); + alphabet.getSymbol(sym, ((*(st.sequence))[j]).first, uppercase); if(sym == "<$>"_u) { if(!current_word.empty()) @@ -598,7 +583,7 @@ State::filterFinalsLRX(map const &finals, UString -State::filterFinalsSAO(map const &finals, +State::filterFinalsSAO(const set& finals, Alphabet const &alphabet, set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const @@ -606,29 +591,28 @@ State::filterFinalsSAO(map const &finals, UString result; UString annot; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { result += '/'; unsigned int const first_char = result.size() + firstchar; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find(((*(state[i].sequence))[j]).first) != escaped_chars.end()) + if(escaped_chars.find(((*(st.sequence))[j]).first) != escaped_chars.end()) { result += '\\'; } - if(alphabet.isTag(((*(state[i].sequence))[j]).first)) + if(alphabet.isTag(((*(st.sequence))[j]).first)) { annot.clear(); - alphabet.getSymbol(annot, ((*(state[i].sequence))[j]).first); + alphabet.getSymbol(annot, ((*(st.sequence))[j]).first); result += '&'; result += annot.substr(1,annot.length()-2); result += ';'; } else { - alphabet.getSymbol(result, ((*(state[i].sequence))[j]).first, uppercase); + alphabet.getSymbol(result, ((*(st.sequence))[j]).first, uppercase); } } if(firstupper) @@ -650,25 +634,24 @@ State::filterFinalsSAO(map const &finals, } UString -State::filterFinalsTM(map const &finals, +State::filterFinalsTM(const set& finals, Alphabet const &alphabet, set const &escaped_chars, queue &blankqueue, vector &numbers) const { UString result; - for(size_t i = 0, limit = state.size(); i != limit; i++) - { - if(finals.find(state[i].where) != finals.end()) + for (auto& st : state) { + if(finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { result += '/'; - for(size_t j = 0, limit2 = state[i].sequence->size(); j != limit2; j++) + for(size_t j = 0, limit2 = st.sequence->size(); j != limit2; j++) { - if(escaped_chars.find((*(state[i].sequence))[j].first) != escaped_chars.end()) + if(escaped_chars.find((*(st.sequence))[j].first) != escaped_chars.end()) { result += '\\'; } - alphabet.getSymbol(result, (*(state[i].sequence))[j].first); + alphabet.getSymbol(result, (*(st.sequence))[j].first); } } } @@ -854,34 +837,22 @@ State::lastPartHasRequiredSymbol(const vector> &seq, int requi void -State::restartFinals(const map &finals, int requiredSymbol, State *restart_state, int separationSymbol) +State::restartFinals(const set& finals, int requiredSymbol, State *restart_state, int separationSymbol) { - - for(unsigned int i=0; i 0) - { - bool restart = lastPartHasRequiredSymbol(*(state_i.sequence), requiredSymbol, separationSymbol); - if(restart) - { - if(restart_state != NULL) - { - for(unsigned int j=0; jstate.size(); j++) - { - TNodeState initst = restart_state->state.at(j); - vector> *tnvec = new vector>; - - for(unsigned int k=0; k < state_i.sequence->size(); k++) - { - tnvec->push_back(state_i.sequence->at(k)); - } - TNodeState tn(initst.where, tnvec, state_i.dirty); - tn.sequence->push_back(make_pair(separationSymbol, 0.0000)); - state.push_back(tn); + if (finals.find(st.where) != finals.end() && st.where->is_final(st.state)) { + bool restart = lastPartHasRequiredSymbol(*(st.sequence), requiredSymbol, separationSymbol); + if(restart && restart_state != NULL) { + for (auto& initst : restart_state->state) { + TPath* tnvec = new TPath(); + for (auto& it : *(st.sequence)) { + tnvec->push_back(it); } + TNodeState tn(initst.where, initst.state, tnvec, st.dirty); + tn.sequence->push_back(make_pair(separationSymbol, 0.0000)); + state.push_back(tn); } } } diff --git a/lttoolbox/state.h b/lttoolbox/state.h index 31f0e42..7d8c973 100644 --- a/lttoolbox/state.h +++ b/lttoolbox/state.h @@ -29,11 +29,13 @@ #include #include #include - +#include #include using namespace std; +typedef vector> TPath; + /** * Class to represent the current state of transducer processing */ @@ -45,22 +47,24 @@ private: */ struct TNodeState { - Node *where; - vector> *sequence; + TransducerExe* where; + uint64_t state; + TPath* sequence; // a state is "dirty" if it was introduced at runtime (case variants, etc.) bool dirty; - TNodeState(Node * const &w, vector> * const &s, bool const &d): where(w), sequence(s), dirty(d){} + TNodeState(TransducerExe* w, uint64_t i, TPath* s, bool d) + : where(w), state(i), sequence(s), dirty(d){} TNodeState(const TNodeState& other) - : where(other.where) - , sequence(other.sequence) - , dirty(other.dirty) + : where(other.where), state(other.state), + sequence(other.sequence), dirty(other.dirty) {} TNodeState & operator=(TNodeState const &other) { where = other.where; + state = other.state; sequence = other.sequence; dirty = other.dirty; return *this; @@ -204,7 +208,7 @@ public: * Init the state with the initial node and empty output * @param initial the initial node of the transducer */ - void init(Node *initial); + void init(const set& exes); /** * Remove states not containing a specific symbol in their last 'part', and states @@ -254,7 +258,7 @@ public: * @param firstchar first character of the word * @return the result of the transduction */ - UString filterFinals(map const &finals, + UString filterFinals(const set& finals, Alphabet const &a, set const &escaped_chars, bool display_weights = false, @@ -275,7 +279,7 @@ public: * @param firstchar first character of the word * @return the result of the transduction */ - UString filterFinalsSAO(map const &finals, + UString filterFinalsSAO(const set& finals, Alphabet const &a, set const &escaped_chars, bool uppercase = false, @@ -295,7 +299,7 @@ public: * @return the result of the transduction */ - set > > filterFinalsLRX(map const &finals, + set > > filterFinalsLRX(const set& finals, Alphabet const &a, set const &escaped_chars, bool uppercase = false, @@ -314,7 +318,7 @@ public: * @param restart_state * @param separationSymbol */ - void restartFinals(const map &finals, int requiredSymbol, State *restart_state, int separationSymbol); + void restartFinals(const set& finals, int requiredSymbol, State *restart_state, int separationSymbol); /** @@ -323,14 +327,14 @@ public: * @param finals set of final nodes @return * @true if the state is final */ - bool isFinal(map const &finals) const; + bool isFinal(const set& finals) const; /** * Return the full states string (to allow debuging...) using a Java ArrayList.toString style */ UString getReadableString(const Alphabet &a); - UString filterFinalsTM(map const &finals, + UString filterFinalsTM(const set& finals, Alphabet const &alphabet, set const &escaped_chars, queue &blanks, diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc index fbd6dad..fd69922 100644 --- a/lttoolbox/transducer_exe.cc +++ b/lttoolbox/transducer_exe.cc @@ -103,21 +103,22 @@ TransducerExe::read(FILE* input, Alphabet& alphabet) state_count = Compression::multibyte_read(input); offsets = new uint64_t[state_count+1]; transition_count = 0; - std::vector isyms, osyms, dests; + std::vector isyms, osyms; + std::vector dests; std::vector weights; for (uint64_t i = 0; i < state_count; i++) { offsets[i] = transition_count; - std::map>>> temp; uint64_t count = Compression::multibyte_read(input); transition_count += count; int32_t tag_base = 0; - for (uint64_t i = 0; i < count; i++) { + for (uint64_t t = 0; t < count; t++) { tag_base += Compression::multibyte_read(input); uint64_t dest = (i + Compression::multibyte_read(input)) % state_count; if (read_weights) { - base_weight = Compression::multibyte_read(input); + base_weight = Compression::long_multibyte_read(input); } auto sym = alphabet.decode(tag_base); temp[sym.first].push_back(make_pair(sym.second, @@ -174,7 +175,7 @@ TransducerExe::get_range(const uint64_t state, const int32_t symbol, r = offsets[state+1]; while (l < r) { m = (l + r) / 2; - if (transitions[m].isym < symbol) { + if (transitions[m].isym > symbol) { r = m; } else { l = m + 1; @@ -182,3 +183,30 @@ TransducerExe::get_range(const uint64_t state, const int32_t symbol, } end = l; } + +bool +TransducerExe::find_final(const uint64_t state, double& weight) +{ + int64_t l = 0; + int64_t r = final_count - 1; + int64_t m; + while (l <= r) { + m = (l + r) / 2; + if (finals[m].state == state) { + weight = finals[m].weight; + return true; + } else if (finals[m].state < state) { + l = m + 1; + } else { + r = m - 1; + } + } + return false; +} + +bool +TransducerExe::is_final(const uint64_t state) +{ + double x; + return find_final(state, x); +} diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h index be4f3ae..eadf894 100644 --- a/lttoolbox/transducer_exe.h +++ b/lttoolbox/transducer_exe.h @@ -15,6 +15,9 @@ * along with this program; if not, see . */ +#ifndef _LT_TRANSDUCER_EXE_ +#define _LT_TRANSDUCER_EXE_ + #include #include @@ -34,11 +37,11 @@ struct Final { }; class MatchState2; -class TransState; +class State; class TransducerExe { friend MatchState2; - friend TransState; + friend State; private: uint64_t initial; uint64_t state_count; @@ -50,8 +53,12 @@ private: void get_range(const uint64_t state, const int32_t sym, uint64_t& start, uint64_t& end); + bool find_final(const uint64_t state, double& weight); + bool is_final(const uint64_t state); public: TransducerExe(); ~TransducerExe(); void read(FILE* input, Alphabet& alphabet); }; + +#endif