commit a4ed4e3e719c798aa13ce4369e80c599553d4806 Author: Daniel Swanson Date: Mon Jun 14 19:11:31 2021 -0500 regex optimization from apertium-recursive diff --git a/apertium/Makefile.am b/apertium/Makefile.am index ef15388..b731e60 100644 --- a/apertium/Makefile.am +++ b/apertium/Makefile.am @@ -67,6 +67,7 @@ h_sources = a.h \ transfer.h \ transfer_instr.h \ transfer_mult.h \ + transfer_regex.h \ transfer_token.h \ transfer_word.h \ trx_reader.h \ @@ -131,6 +132,7 @@ cc_sources = a.cc \ transfer_data.cc \ transfer_instr.cc \ transfer_mult.cc \ + transfer_regex.cc \ transfer_token.cc \ transfer_word.cc \ trx_reader.cc \ diff --git a/apertium/transfer_regex.cc b/apertium/transfer_regex.cc new file mode 100644 index 0000000..f2cf207 --- /dev/null +++ b/apertium/transfer_regex.cc @@ -0,0 +1,109 @@ +#include + +struct TrieNode { + UChar32 c; + std::vector next; +}; + +TrieNode* +add_char(TrieNode* root, UChar32 c) +{ + for (auto node : root->next) { + if (node->c == c) { + return node; + } + } + TrieNode* t = new TrieNode; + t->c = c; + root->next.push_back(t); + return t; +} + +void +add_entry(TrieNode* root, const std::vector& vec) +{ + bool escape = false; + TrieNode* cur = root; + for (auto c : vec) { + if (!escape) { + if (c == '\\') { + escape = true; + continue; + } else if (c == '.') { + cur = add_char(cur, '>'); + cur = add_char(cur, '<'); + continue; + } + } + escape = false; + cur = add_char(cur, c); + } + add_char(cur, '\0'); +} + +UString +unbuildTrie(TrieNode* root) +{ + UString single; + single += '['; + std::vector groups; + bool end = false; + int single_count = 0; + for (auto it : root->next) { + if (it->next.empty()) { + end = true; + } else if (it->next.size() == 1 && it->next[0]->c == '\0') { + single += it->c; + single_count++; + } else { + groups.push_back(unbuildTrie(it)); + } + } + if (single_count > 0) { + if (single_count == 1) { + groups.push_back(single.substr(1)); + } else { + single += ']'; + groups.push_back(single); + } + } + UString ret; + ret += root->c; + if (groups.empty()) { + return ret; + } else if (groups.size() == 1) { + ret += groups[0]; + } else { + ret += '('; ret += '?'; ret += ':'; + for (size_t i = 0; i < groups.size(); i++) { + if (i > 0) { + ret += '|'; + } + ret += groups[i]; + } + ret += ')'; + } + if (end) { + ret += '?'; + } + return ret; +} + +UString +optimize_regex(const std::vector& options) +{ + TrieNode* root = new TrieNode; + root->c = '<'; + std::vector v; + for (auto& s : options) { + v.clear(); + ustring_to_vec32(s, v); + add_entry(root, v); + } + UString ret; + ret += '('; + ret.append(unbuildTrie(root)); + ret += '>'; + ret += ')'; + return ret; +} diff --git a/apertium/transfer_regex.h b/apertium/transfer_regex.h new file mode 100644 index 0000000..63543a1 --- /dev/null +++ b/apertium/transfer_regex.h @@ -0,0 +1,9 @@ +#ifndef __TRANSFER_REGEX_OPTIMIZER__ +#define __TRANSFER_REGEX_OPTIMIZER__ + +#include +#include + +UString optimize_regex(const std::vector& options); + +#endif // __TRANSFER_REGEX_OPTIMIZER__ diff --git a/apertium/trx_reader.cc b/apertium/trx_reader.cc index 24fcc1d..e400cc0 100644 --- a/apertium/trx_reader.cc +++ b/apertium/trx_reader.cc @@ -20,7 +20,7 @@ #include #include -#include +#include UString const TRXReader::ANY_TAG = ""_u; UString const TRXReader::ANY_CHAR = ""_u; @@ -321,6 +321,7 @@ void TRXReader::procDefAttrs() { UString attrname; + vector items; while(type != XML_READER_TYPE_END_ELEMENT || name != "section-def-attrs"_u) @@ -330,7 +331,7 @@ TRXReader::procDefAttrs() { if(type != XML_READER_TYPE_END_ELEMENT) { - insertAttrItem(attrname, attrib("tags"_u)); + items.push_back(attrib("tags"_u)); } } else if(name == "def-attr"_u) @@ -341,8 +342,8 @@ TRXReader::procDefAttrs() } else { - UString all = td.getAttrItems()[attrname]; - td.getAttrItems()[attrname] = "("_u + all + ")"_u; + td.getAttrItems()[attrname] = optimize_regex(items); + items.clear(); attrname.clear(); } } @@ -556,28 +557,3 @@ TRXReader::insertCatItem(UString const &name, UString const &lemma, lt.tags = tags; cat_items.insert(pair(name, lt)); } - -void -TRXReader::insertAttrItem(UString const &name, UString const &tags) -{ - if(td.getAttrItems()[name].size() != 0) - { - td.getAttrItems()[name] += '|'; - } - - td.getAttrItems()[name] += '<'; - - for(unsigned int i = 0, limit = tags.size(); i != limit; i++) - { - if(tags[i] == '.') - { - td.getAttrItems()[name].append("><"_u); - } - else - { - td.getAttrItems()[name] += tags[i]; - } - } - td.getAttrItems()[name] += '>'; - -} diff --git a/apertium/trx_reader.h b/apertium/trx_reader.h index cbea2e8..7766123 100644 --- a/apertium/trx_reader.h +++ b/apertium/trx_reader.h @@ -51,7 +51,6 @@ private: void insertCatItem(UString const &name, UString const &lemma, UString const &tags); - void insertAttrItem(UString const &name, UString const &tags); void createVar(UString const &name, UString const &initial_value); void insertListItem(UString const &name, UString const &value); void createMacro(UString const &name, int const val); diff --git a/tests/data/nno-nob.t2x.bin b/tests/data/nno-nob.t2x.bin index 6ea1978..9b8e124 100644 Binary files a/tests/data/nno-nob.t2x.bin and b/tests/data/nno-nob.t2x.bin differ