commit 290368b522584bcc9ffe740f03fceaf1ba389037 Author: Daniel Swanson Date: Mon Jun 14 19:09:58 2021 -0500 move regex optimization code to apertium to share with t*x diff --git a/src/pattern.cc b/src/pattern.cc index ea4406b..1e3acd7 100644 --- a/src/pattern.cc +++ b/src/pattern.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -25,8 +26,6 @@ PatternBuilder::PatternBuilder() attr_items["chcontent"_u] = "(\\{.+)"_u; attr_items["content"_u] = "(\\{.+)"_u; attr_items["pos_tag"_u] = "(<[^>]+>)"_u; - starCanBeEmpty = false; - chunkVarCount = 0; } int @@ -105,99 +104,8 @@ PatternBuilder::countToFinalSymbol(const int count) return symbol; } -vector -PatternBuilder::buildTrie(vector parts) -{ - vector ret; - vector> p2; - for(auto p : parts) - { - if(p.size() == 0) continue; - bool found = false; - for(unsigned int t = 0; t < p2.size(); t++) - { - if(ret[t]->self == p[0]) - { - p2[t].push_back(p.substr(1)); - found = true; - break; - } - } - if(!found) - { - TrieNode* t = new TrieNode; - t->self = p[0]; - ret.push_back(t); - p2.push_back(vector(1, p.substr(1))); - } - } - for(unsigned int i = 0; i < ret.size(); i++) - { - ret[i]->next = buildTrie(p2[i]); - } - return ret; -} - -UString -PatternBuilder::unbuildTrie(PatternBuilder::TrieNode* t) -{ - if(t->self == '\0') return ""_u; - UString single; - bool end = false; - vector groups; - int ct = t->next.size(); - for(auto it : t->next) - { - UString blob = unbuildTrie(it); - if(blob.size() == 0) - { - end = true; - ct--; - } - else if(blob.size() == 1) - { - if(single.size() > 0) ct--; - single += blob; - } - else groups.push_back(blob); - } - UString ret; - if(t->self == '#') ret += '\\'; - ret += t->self; - if(single.size() == 0 && groups.size() == 0) return ret; - if(single.size() > 1) single = "["_u + single + "]"_u; - if(ct > 1 || (groups.size() == 1 && end)) ret += "(?:"_u; - for(unsigned int i = 0; i < groups.size(); i++) - { - if(i > 0) ret += '|'; - ret += groups[i]; - } - if(single.size() > 0) - { - if(groups.size() > 0) ret += '|'; - ret += single; - } - if(ct > 1 || (groups.size() == 1 && end)) ret += ')'; - if(end) ret += '?'; - return ret; -} - -UString -PatternBuilder::trie(vector parts) -{ - if(parts.size() == 0) return ""_u; - for(unsigned int i = 0; i < parts.size(); i++) - { - parts[i] = "<"_u + parts[i]; - parts[i] += '\0'; - } - vector l = buildTrie(parts); - // they all start with '<', so there will only be 1. - return "("_u + unbuildTrie(l[0]) + ">)"_u; -} - void -PatternBuilder::addPattern(vector> pat, int rule, double weight, bool isLex) +PatternBuilder::addPattern(const vector>& pat, int rule, double weight, bool isLex) { int state = transducer.getInitial(); for(unsigned int p = 0; p < pat.size(); p++) @@ -233,7 +141,7 @@ PatternBuilder::addPattern(vector> pat, int rule, double } void -PatternBuilder::addRule(int rule, double weight, vector> pattern, vector firstChunk, UString name) +PatternBuilder::addRule(int rule, double weight, const vector>& pattern, const vector& firstChunk, const UString& name) { rules[rule] = make_pair(firstChunk, pattern); addPattern(pattern, rule, weight, false); @@ -254,33 +162,27 @@ PatternBuilder::addRule(int rule, double weight, vector> } void -PatternBuilder::addList(UString name, set vals) +PatternBuilder::addList(const UString& name, const set& vals) { lists[name] = vals; } void -PatternBuilder::addAttr(UString name, set vals) +PatternBuilder::addAttr(const UString& name, const set& vals) { vector pat; - for(auto it : vals) - { - UString p = StringUtils::substitute(it, "\\."_u, "<>"_u); - p = StringUtils::substitute(p, "."_u, "><"_u); - pat.push_back(StringUtils::substitute(p, "<>"_u, "\\."_u)); - } - UString pt = trie(pat); - attr_items[name] = pt; + pat.assign(vals.begin(), vals.end()); + attr_items[name] = optimize_regex(pat); } bool -PatternBuilder::isAttrDefined(UString name) +PatternBuilder::isAttrDefined(const UString& name) { return attr_items.find(name) != attr_items.end(); } void -PatternBuilder::addVar(UString name, UString val) +PatternBuilder::addVar(const UString& name, const UString& val) { variables[name] = val; } diff --git a/src/pattern.h b/src/pattern.h index 395bd1d..ed48cc6 100644 --- a/src/pattern.h +++ b/src/pattern.h @@ -95,7 +95,7 @@ private: /** * Build complete path */ - void addPattern(vector> pat, int rule, double weight, bool isLex); + void addPattern(const vector>& pat, int rule, double weight, bool isLex); void buildLookahead(); @@ -103,31 +103,6 @@ private: void buildFallback(); - ////////// - // ATTRIBUTE COMPRESSION - ////////// - - struct TrieNode - { - wchar_t self; - vector next; - }; - - /** - * Construct tries for a set of inputs, return one for each initial character - */ - vector buildTrie(vector parts); - - /** - * Convert trie to regex - */ - UString unbuildTrie(TrieNode* t); - - /** - * Wrapper around buildTrie() and unbuildTrie() - */ - UString trie(vector parts); - public: ////////// @@ -137,14 +112,13 @@ public: // false: * = 1 or more tags, true: * = 0 or more tags /** * If false, L"*" must match at least one tag, otherwise it can match 0 - * Default: false */ - bool starCanBeEmpty; + bool starCanBeEmpty = false; /** * Number of global Chunk* variables to allocate space for */ - unsigned int chunkVarCount; + unsigned int chunkVarCount = 0; /** * Debug names for input-time rules @@ -158,11 +132,11 @@ public: PatternBuilder(); - void addRule(int rule, double weight, vector> pattern, vector firstChunk, UString name); - void addList(UString name, set vals); - void addAttr(UString name, set vals); - bool isAttrDefined(UString name); - void addVar(UString name, UString val); + void addRule(int rule, double weight, const vector>& pattern, const vector& firstChunk, const UString& name); + void addList(const UString& name, const set& vals); + void addAttr(const UString& name, const set& vals); + bool isAttrDefined(const UString& name); + void addVar(const UString& name, const UString& val); void loadLexFile(const string& fname); void write(FILE* output, int longest, vector> inputBytecode, vector outputBytecode); diff --git a/src/rtx_processor.cc b/src/rtx_processor.cc index 3a71000..d1bfeae 100644 --- a/src/rtx_processor.cc +++ b/src/rtx_processor.cc @@ -78,7 +78,7 @@ RTXProcessor::read(string const &filename) UString fallback = Compression::string_read(in); if (recompile_attrs && cad_k == "chname"_u) { // chname was previously "({([^/]+)\\/)" - // which is fine for PCRE, but ICU chokes on the unmatched bracket + // which was fine for PCRE, but ICU chokes on the unmatched bracket fallback = "(\\{([^/]+)\\/)"_u; } attr_items[cad_k].compile(fallback);