commit a8b0acea7488fe0c94d5c9e5b50ea076bda7d35b Author: Daniel Swanson Date: Mon Aug 2 12:02:22 2021 -0500 AlphabetExe diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index 75aa96c..ec07213 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -1,11 +1,11 @@ -h_sources = alphabet.h att_compiler.h buffer.h compiler.h compression.h \ +h_sources = alphabet.h alphabet_exe.h att_compiler.h buffer.h compiler.h compression.h \ deserialiser.h endian_util.h entry_token.h expander.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h match_state2.h my_stdio.h node.h \ pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h string_view.h string_writer.h \ transducer.h transducer_exe.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h -cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ +cc_sources = alphabet.cc alphabet_exe.cc att_compiler.cc compiler.cc compression.cc entry_token.cc \ expander.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc match_state2.cc node.cc pattern_list.cc \ regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc string_writer.cc transducer.cc transducer_exe.cc \ diff --git a/lttoolbox/alphabet.cc b/lttoolbox/alphabet.cc index a313814..284e8b9 100644 --- a/lttoolbox/alphabet.cc +++ b/lttoolbox/alphabet.cc @@ -316,3 +316,9 @@ Alphabet::createLoopbackSymbols(set &symbols, Alphabet &basis, Side s, } } } + +vector& +Alphabet::getTags() +{ + return slexicinv; +} diff --git a/lttoolbox/alphabet.h b/lttoolbox/alphabet.h index 8c6dec2..72a3b36 100644 --- a/lttoolbox/alphabet.h +++ b/lttoolbox/alphabet.h @@ -197,6 +197,11 @@ public: * @param nonTagsToo by default only tags are included, but if this is true we include all symbols */ void createLoopbackSymbols(set &symbols, Alphabet &basis, Side s = right, bool nonTagsToo = false); + + /** + * Return a reference to the array of tags + */ + vector& getTags(); }; #endif diff --git a/lttoolbox/alphabet_exe.cc b/lttoolbox/alphabet_exe.cc new file mode 100644 index 0000000..b55e642 --- /dev/null +++ b/lttoolbox/alphabet_exe.cc @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include + +#include + +AlphabetExe::AlphabetExe(StringWriter* sw_) + : sw(sw_), tag_count(0), tags(nullptr) +{} + +AlphabetExe::~AlphabetExe() +{ + delete[] tags; +} + +void +AlphabetExe::read(FILE* input, bool mmap) +{ + if (mmap) { + } else { + tag_count = Compression::multibyte_read(input); + tags = new StringRef[tag_count]; + for (uint32_t i = 0; i < tag_count; i++) { + UString tg; + tg += '<'; + tg += Compression::string_read(input); + tg += '>'; + tags[i] = sw->add(tg); + } + // has to be a separate loop, otherwise the string_views get + // invalidated when the StringWriter buffer expands + for (uint32_t i = 0; i < tag_count; i++) { + symbol_map[sw->get(tags[i])] = -static_cast(i) - 1; + } + int pairs = Compression::multibyte_read(input); + for (int i = 0; i < pairs; i++) { + Compression::multibyte_read(input); + Compression::multibyte_read(input); + } + } +} + +int32_t +AlphabetExe::operator()(UString_view sv) +{ + auto it = symbol_map.find(sv); + if (it != symbol_map.end()) { + return it->second; + } else { + return 0; + } +} + +void +AlphabetExe::getSymbol(UString& result, int32_t symbol, bool uppercase) const +{ + if (symbol == 0) { + return; + } else if (symbol < 0) { + result.append(sw->get(tags[-symbol-1])); + } else if (uppercase) { + result += u_toupper(static_cast(symbol)); + } else { + result += static_cast(symbol); + } +} + +bool +AlphabetExe::isTag(const int32_t symbol) const +{ + return symbol < 0; +} + +void +AlphabetExe::clearSymbol(const int32_t symbol) +{ + if (symbol < 0) { + tags[-symbol-1].start = 0; + tags[-symbol-1].count = 0; + } +} diff --git a/lttoolbox/alphabet_exe.h b/lttoolbox/alphabet_exe.h new file mode 100644 index 0000000..af579bb --- /dev/null +++ b/lttoolbox/alphabet_exe.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LT_ALPHABET_EXE_ +#define _LT_ALPHABET_EXE_ + +#include +#include + +class AlphabetExe { +private: + StringWriter* sw; + uint64_t tag_count; + StringRef* tags; + std::map symbol_map; +public: + AlphabetExe(StringWriter* sw_); + ~AlphabetExe(); + void read(FILE* in, bool mmap); + int32_t operator()(UString_view sv); + void getSymbol(UString& result, int32_t symbol, bool uppercase = false) const; + bool isTag(const int32_t symbol) const; + void clearSymbol(const int32_t symbol); +}; + +#endif diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index d2ab234..128fac6 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -179,7 +179,7 @@ Compiler::procAlphabet() bool space = true; for(unsigned int i = 0; i < letters.length(); i++) { - if(!u_isspace(letters.at(i))) + if(!u_isspace(letters[i])) { space = false; break; diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 2e4ac97..01e9822 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -40,6 +40,7 @@ UString const FSTProcessor::WBLANK_FINAL = "[[/]]"_u; FSTProcessor::FSTProcessor() + : alphabet(AlphabetExe(&str_write)) { // escaped_chars chars escaped_chars.insert('['); @@ -956,13 +957,17 @@ FSTProcessor::load(FILE *input) } // symbols - alphabet.read(input); + fgetpos(input, &pos); + alphabet.read(input, false); + fsetpos(input, &pos); + Alphabet temp; + temp.read(input); len = Compression::multibyte_read(input); while(len > 0) { UString name = Compression::string_read(input); - transducers[name].read(input, alphabet); + transducers[name].read(input, temp); len--; } } @@ -1067,7 +1072,7 @@ FSTProcessor::initDecompositionSymbols() } else if(!showControlSymbols) { - alphabet.setSymbol(compoundOnlyLSymbol, ""_u); + alphabet.clearSymbol(compoundOnlyLSymbol); } if((compoundRSymbol=alphabet("<:co:R>"_u)) == 0 @@ -1080,7 +1085,7 @@ FSTProcessor::initDecompositionSymbols() } else if(!showControlSymbols) { - alphabet.setSymbol(compoundRSymbol, ""_u); + alphabet.clearSymbol(compoundRSymbol); } } diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 9bfe0bc..412bba2 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -19,10 +19,11 @@ #define _FSTPROCESSOR_ #include -#include +#include #include #include #include +#include #include #include #include @@ -134,10 +135,15 @@ private: */ int rcx_current_char; + /** + * String manager + */ + StringWriter str_write; + /** * Alphabet */ - Alphabet alphabet; + AlphabetExe alphabet; /** * Input buffer diff --git a/lttoolbox/state.cc b/lttoolbox/state.cc index be492bc..909fc6a 100644 --- a/lttoolbox/state.cc +++ b/lttoolbox/state.cc @@ -452,7 +452,7 @@ State::NFinals(vector> lf, int maxAnalyses, int maxWeightC UString State::filterFinals(const set& finals, - Alphabet const &alphabet, + AlphabetExe const &alphabet, set const &escaped_chars, bool display_weights, int max_analyses, int max_weight_classes, bool uppercase, bool firstupper, int firstchar) const @@ -537,7 +537,7 @@ State::filterFinals(const set& finals, set > > State::filterFinalsLRX(const set& finals, - Alphabet const &alphabet, + AlphabetExe const &alphabet, set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { @@ -584,7 +584,7 @@ State::filterFinalsLRX(const set& finals, UString State::filterFinalsSAO(const set& finals, - Alphabet const &alphabet, + AlphabetExe const &alphabet, set const &escaped_chars, bool uppercase, bool firstupper, int firstchar) const { @@ -635,7 +635,7 @@ State::filterFinalsSAO(const set& finals, UString State::filterFinalsTM(const set& finals, - Alphabet const &alphabet, + AlphabetExe const &alphabet, set const &escaped_chars, queue &blankqueue, vector &numbers) const { @@ -749,12 +749,12 @@ State::pruneCompounds(int requiredSymbol, int separationSymbol, int compound_max for(unsigned int i = 0; i> seq = *state.at(i).sequence; + vector> seq = *state[i].sequence; if(lastPartHasRequiredSymbol(seq, requiredSymbol, separationSymbol)) { int this_noOfCompoundElements = 0; - for (int j = seq.size()-2; j>0; j--) if ((seq.at(j)).first==separationSymbol) this_noOfCompoundElements++; + for (int j = seq.size()-2; j>0; j--) if ((seq[j]).first==separationSymbol) this_noOfCompoundElements++; noOfCompoundElements[i] = this_noOfCompoundElements; minNoOfCompoundElements = (minNoOfCompoundElements < this_noOfCompoundElements) ? minNoOfCompoundElements : this_noOfCompoundElements; @@ -862,7 +862,7 @@ State::restartFinals(const set& finals, int requiredSymbol, Stat UString -State::getReadableString(const Alphabet &a) +State::getReadableString(const AlphabetExe &a) { UString retval; retval += '['; diff --git a/lttoolbox/state.h b/lttoolbox/state.h index 7d8c973..676f7eb 100644 --- a/lttoolbox/state.h +++ b/lttoolbox/state.h @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -259,7 +259,7 @@ public: * @return the result of the transduction */ UString filterFinals(const set& finals, - Alphabet const &a, + AlphabetExe const &a, set const &escaped_chars, bool display_weights = false, int max_analyses = INT_MAX, @@ -280,7 +280,7 @@ public: * @return the result of the transduction */ UString filterFinalsSAO(const set& finals, - Alphabet const &a, + AlphabetExe const &a, set const &escaped_chars, bool uppercase = false, bool firstupper = false, @@ -300,7 +300,7 @@ public: */ set > > filterFinalsLRX(const set& finals, - Alphabet const &a, + AlphabetExe const &a, set const &escaped_chars, bool uppercase = false, bool firstupper = false, @@ -332,10 +332,10 @@ public: /** * Return the full states string (to allow debuging...) using a Java ArrayList.toString style */ - UString getReadableString(const Alphabet &a); + UString getReadableString(const AlphabetExe &a); UString filterFinalsTM(const set& finals, - Alphabet const &alphabet, + AlphabetExe const &alphabet, set const &escaped_chars, queue &blanks, vector &numbers) const; diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index 64c7120..f818f07 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -20,7 +20,7 @@ #include #include -UString_view +StringRef StringWriter::add(const UString& s) { auto start = buffer.find(s); @@ -28,8 +28,10 @@ StringWriter::add(const UString& s) start = buffer.size(); buffer += s; } - UString_view ret(buffer); - return ret.substr(start, s.size()); + StringRef ret; + ret.start = start; + ret.count = s.size(); + return ret; } UString_view @@ -39,6 +41,13 @@ StringWriter::get(const uint32_t start, const uint32_t count) return ret.substr(start, count); } +UString_view +StringWriter::get(const StringRef& ref) +{ + UString_view ret(buffer); + return ret.substr(ref.start, ref.count); +} + void StringWriter::read(FILE* in) { diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h index 2785c42..15fcaf3 100644 --- a/lttoolbox/string_writer.h +++ b/lttoolbox/string_writer.h @@ -22,11 +22,17 @@ #include #include +struct StringRef { + uint32_t start; + uint32_t count; +}; + class StringWriter { public: UString buffer; - UString_view add(const UString& s); + StringRef add(const UString& s); UString_view get(const uint32_t start, const uint32_t count); + UString_view get(const StringRef& ref); void read(FILE* in); void write(FILE* out); };