commit aeb0fe921d269135d530ec911af33ba5f1c612eb Author: Daniel Swanson Date: Fri Jul 30 12:06:11 2021 -0500 run TransducerExe for matching diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 7904c99..75aa96c 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,13 +1,13 @@ h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ - match_exe.h match_node.h match_state.h my_stdio.h node.h \ + match_exe.h match_node.h match_state.h match_state2.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ - match_node.cc match_state.cc node.cc pattern_list.cc \ + match_node.cc match_state.cc match_state2.cc node.cc pattern_list.cc \ regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc transducer_exe.cc \ trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc diff --git a/lttoolbox/match_state2.cc b/lttoolbox/match_state2.cc new file mode 100644 index 0000000..3738fd3 --- /dev/null +++ b/lttoolbox/match_state2.cc @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <lttoolbox/match_state2.h>
+// NOTE(review): include targets inferred (own header; <climits> for INT_MAX) - confirm.
+#include <climits>
+// Starts with the transducer's initial state as the only live state.
+MatchState2::MatchState2(TransducerExe* t) :
+  trans(t)
+{
+  buffer[0] = trans->initial;
+  last = 1;
+}
+// Nothing to release; this class never frees `trans`.
+MatchState2::~MatchState2()
+{}
+// Copies the transducer pointer, both ring indices and the live slots [first, last) of o.
+void
+MatchState2::copy(const MatchState2& o)
+{
+  trans = o.trans;
+  first = o.first;
+  last = o.last;
+  for (uint16_t i = first; i != last; i = (i + 1) % BUF_LIMIT) {
+    buffer[i] = o.buffer[i];
+  }
+}
+// Copy constructor: delegates to copy().
+MatchState2::MatchState2(const MatchState2& o)
+{
+  copy(o);
+}
+// Copy assignment: delegates to copy(); self-assignment is a no-op.
+MatchState2&
+MatchState2::operator=(const MatchState2& o)
+{
+  if (this != &o) {
+    copy(o);
+  }
+  return *this;
+}
+// Number of live states, i.e. the distance from first to last around the ring.
+uint16_t
+MatchState2::size() const
+{
+  return (last + BUF_LIMIT - first) % BUF_LIMIT;
+}
+// True when there are no live states.
+bool
+MatchState2::empty() const
+{
+  return last == first;
+}
+// Appends the destination of each transition in the range get_range() reports for (state, symbol).
+void
+MatchState2::applySymbol(const uint64_t state, const int32_t symbol)
+{
+  uint64_t start = 0;
+  uint64_t end = 0;
+  trans->get_range(state, symbol, start, end);
+  for (uint64_t i = start; i < end; i++) {
+    buffer[last] = trans->transitions[i].dest;
+    last = (last + 1) % BUF_LIMIT;  // NOTE(review): no overflow check; `last` can wrap onto `first`
+  }
+}
+// Replaces the live states with the states reached from them on `input`.
+void
+MatchState2::step(const int32_t input)
+{
+  uint16_t temp_last = last;  // successors are appended past this point while we iterate
+  for (uint16_t i = first; i != temp_last; i = (i+1)%BUF_LIMIT) {
+    applySymbol(buffer[i], input);
+  }
+  first = temp_last;
+}
+// Same as step(input), but follows transitions on `input` and on `alt`.
+void
+MatchState2::step(const int32_t input, const int32_t alt)
+{
+  uint16_t temp_last = last;
+  for (uint16_t i = first; i != temp_last; i = (i+1)%BUF_LIMIT) {
+    applySymbol(buffer[i], input);
+    applySymbol(buffer[i], alt);
+  }
+  first = temp_last;
+}
+// Not implemented yet: currently does nothing.
+void
+MatchState2::step(UString_view input, const Alphabet& alpha, bool foldcase)
+{
+  // TODO
+}
+// Smallest value `finals` assigns to a live state that is not in banned_rules, or -1 if none.
+int
+MatchState2::classifyFinals(const std::map<uint64_t, int>& finals,
+                            const std::set<int>& banned_rules) const
+{
+  int ret = INT_MAX;
+  for (uint16_t i = first; i != last; i = (i+1)%BUF_LIMIT) {
+    auto it = finals.find(buffer[i]);
+    if (it != finals.end()) {
+      if (it->second < ret &&
+          banned_rules.find(it->second) == banned_rules.end()) {
+        ret = it->second;
+      }
+    }
+  }
+  return (ret < INT_MAX) ? ret : -1;
+}
+// Same as above with an empty set of banned rules.
+int
+MatchState2::classifyFinals(const std::map<uint64_t, int>& finals) const
+{
+  const std::set<int> no_banned_rules;
+  return classifyFinals(finals, no_banned_rules);
+}
+// Resets to just the transducer's initial state.
+void
+MatchState2::clear()
+{
+  first = 0;
+  last = 1;
+  buffer[0] = trans->initial;
+}
diff --git a/lttoolbox/match_state2.h b/lttoolbox/match_state2.h
new file mode 100644
index 0000000..1542c05
--- /dev/null
+++ b/lttoolbox/match_state2.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2021 Apertium
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef _LT_MATCH_STATE_
+#define _LT_MATCH_STATE_
+// NOTE(review): the three include targets are inferred from the names used below - confirm.
+#include <lttoolbox/alphabet.h>
+#include <lttoolbox/transducer_exe.h>
+#include <lttoolbox/ustring.h>
+// Set of live TransducerExe states kept in a fixed-size circular buffer.
+// rename upon deleting old MatchState
+class MatchState2
+{
+private:
+  static int const BUF_LIMIT = 1024;  // capacity of `buffer`
+  TransducerExe* trans;               // transducer being stepped through; not freed here
+  uint64_t buffer[BUF_LIMIT];         // live states occupy slots [first, last), wrapping around
+  uint16_t first = 0;
+  uint16_t last = 0;
+
+  void copy(const MatchState2& o);
+  void applySymbol(const uint64_t state, const int32_t symbol);
+public:
+  MatchState2(TransducerExe* t);
+  ~MatchState2();
+  MatchState2(const MatchState2& o);
+  MatchState2& operator=(const MatchState2& o);
+
+  uint16_t size() const;  // number of live states
+  bool empty() const;
+  void step(const int32_t input);  // advance every live state on `input`
+  void step(const int32_t input, const int32_t alt);  // advance on `input` and on `alt`
+  void step(UString_view input, const Alphabet& alpha, bool foldcase = true);  // not implemented yet
+  int classifyFinals(const std::map<uint64_t, int>& finals,
+                     const std::set<int>& banned_rules) const;  // lowest non-banned value, or -1
+  int classifyFinals(const std::map<uint64_t, int>& finals) const;
+  void clear();  // back to just the initial state
+};
+
+#endif
diff --git a/lttoolbox/transducer_exe.cc b/lttoolbox/transducer_exe.cc
index 5efdb40..fbd6dad 100644
--- a/lttoolbox/transducer_exe.cc
+++ b/lttoolbox/transducer_exe.cc
@@ -142,3 +142,50 @@ TransducerExe::read(FILE* input, Alphabet& alphabet)
     }
   }
 }
+
+// Finds the transitions leaving `state` whose input symbol equals `symbol`.
+//
+// Binary search over transitions[offsets[state]] .. transitions[offsets[state+1] - 1];
+// this assumes those entries are ordered by ascending isym.
+// On return [start, end) is the index range of the matches, or start == end == 0
+// when there are none.
+void
+TransducerExe::get_range(const uint64_t state, const int32_t symbol,
+                         uint64_t& start, uint64_t& end)
+{
+  const uint64_t limit = offsets[state+1];
+  uint64_t l = offsets[state];
+  uint64_t r = limit;
+  start = end = 0;
+
+  // lower bound: first transition whose isym is not less than symbol
+  while (l < r) {
+    const uint64_t m = l + (r - l) / 2;
+    if (transitions[m].isym < symbol) {
+      l = m + 1;
+    } else {
+      r = m;
+    }
+  }
+  // l == limit (also the case for a state without transitions) means nothing
+  // in range is >= symbol; transitions[limit] is outside this state's range
+  // and must not be inspected
+  if (l == limit || transitions[l].isym != symbol) {
+    return;
+  }
+  start = l;
+
+  // there's probably a way to do this with 1 loop
+  // but I'd have to be very sure of what I was doing to write that loop -DGS
+  // upper bound: first transition past start whose isym is greater than symbol
+  r = limit;
+  while (l < r) {
+    const uint64_t m = l + (r - l) / 2;
+    if (transitions[m].isym > symbol) {
+      r = m;
+    } else {
+      l = m + 1;
+    }
+  }
+  end = l;
+}
diff --git a/lttoolbox/transducer_exe.h b/lttoolbox/transducer_exe.h
index
02297b1..be4f3ae 100644
--- a/lttoolbox/transducer_exe.h
+++ b/lttoolbox/transducer_exe.h
@@ -33,7 +33,12 @@ struct Final {
   double weight;
 };
 
+class MatchState2;  // forward declarations so both can be named as friends below
+class TransState;
+
 class TransducerExe {
+  friend MatchState2;  // MatchState2 reads initial and transitions and calls get_range()
+  friend TransState;
 private:
   uint64_t initial;
   uint64_t state_count;
@@ -42,6 +47,9 @@ private:
   Final* finals;
   uint64_t* offsets;
   Transition* transitions;
+  // Binary-searches the transitions of `state` for input symbol `sym`; start/end return an index range, both 0 on a miss.
+  void get_range(const uint64_t state, const int32_t sym,
+                 uint64_t& start, uint64_t& end);
 public:
   TransducerExe();
   ~TransducerExe();