File:    lexdcompiler.cc
Warning: line 1540, column 9: Potential leak of memory pointed to by 'trans'
#include "lexdcompiler.h"
#include <unicode/unistr.h>
#include <memory>
#include <chrono>
#include <lttoolbox/string_utils.h>

using namespace icu;
using namespace std;

bool tag_filter_t::combinable(const tag_filter_t &other) const
{
  // TODO: make ops combinable even with non-empty filters?
  if(empty() || other.empty())
    return true;
  return intersectset(pos(), other.neg()).empty() && intersectset(other.pos(), neg()).empty() && ops().empty() && other.ops().empty();
}
bool tag_filter_t::combine(const tag_filter_t &other)
{
  if(!combinable(other))
    return false;
  unionset_inplace(_pos, other._pos);
  unionset_inplace(_neg, other._neg);
  for(const auto &op: other._ops)
    _ops.push_back(op);
  return true;
}
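// Illustration (assuming the [a,-b] bracket notation used elsewhere in lexd):
// combining [a,-b] with [c] yields [a,c,-b]; combining [a] with [-a] is
// rejected because the positive set of one intersects the negative set of
// the other.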

void expand_alternation(vector<pattern_t> &pats, const vector<pattern_element_t> &alternation);
vector<pattern_element_t> distribute_tag_expressions(const pattern_element_t &token)
{
  vector<pattern_element_t> result;
  for(const auto &f: token.tag_filter.distribute())
  {
    pattern_element_t new_token = token;
    new_token.tag_filter = f;
    result.push_back(new_token);
  }
  return result;
}
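// Sketch of the intent (assumed behavior of tag_filter_t::distribute()):
// a token whose filter contains distributable operations such as |[...]
// is split into one copy of the token per distributed plain filter, so
// later stages only see simple pos/neg sets.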

bool tag_filter_t::compatible(const tags_t &tags) const
{
  return subset(pos(), tags) && intersectset(neg(), tags).empty() && ops().empty();
}
bool tag_filter_t::applicable(const tags_t &tags) const
{
  return subset(neg(), tags) && ops().empty();
}
bool tag_filter_t::try_apply(tags_t &tags) const
{
  if(!applicable(tags))
    return false;
  subtractset_inplace(tags, neg());
  unionset_inplace(tags, pos());
  return true;
}
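// Example (assumed semantics): try_apply on {x,b} with the filter [a,-b]
// first checks that every negated tag (here b) is present, then removes
// them and adds the positive tags, leaving {x,a}.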
bool pattern_element_t::compatible(const lex_seg_t &tok) const
{
  return left.name.empty() || right.name.empty() || tag_filter.compatible(tok.tags);
}
const UnicodeString &LexdCompiler::name(string_ref r) const
{
  return id_to_name[(unsigned int)r];
}

UnicodeString LexdCompiler::printPattern(const pattern_element_t& pat)
{
  UnicodeString ret = name(pat.left.name);
  auto& pos = pat.tag_filter.pos();
  auto& neg = pat.tag_filter.neg();
  if (!pos.empty() || !neg.empty()) {
    ret += '[';
    int ln = ret.length();
    for (auto& it : pos) {
      if (ret.length() > ln) ret += ',';
      ret += name(it);
    }
    for (auto& it : neg) {
      if (ret.length() > ln) ret += ',';
      ret += '-';
      ret += name(it);
    }
    ret += ']';
  }
  switch (pat.mode) {
  case Question:
    ret += '?'; break;
  case Plus:
    ret += '+'; break;
  case Star:
    ret += '*'; break;
  default: break;
  }
  return ret;
}

UnicodeString LexdCompiler::printFilter(const tag_filter_t& filter)
{
  UnicodeString ret;
  ret += '[';
  for (auto& it : filter.pos()) {
    if (ret.length() > 1) ret += ',';
    ret += name(it);
  }
  for (auto& it : filter.neg()) {
    if (ret.length() > 1) ret += ',';
    ret += '-';
    ret += name(it);
  }
  for (auto& op : filter.ops()) {
    if (ret.length() > 1) ret += ',';
    ret += op->sigil;
    ret += '[';
    int ln = ret.length();
    for (auto& it : *op) {
      if (ret.length() > ln) ret += ',';
      ret += name(it);
    }
    ret += ']';
  }
  ret += ']';
  return ret;
}

trans_sym_t LexdCompiler::alphabet_lookup(const UnicodeString &symbol)
{
  if (!symbol.hasMoreChar32Than(0, symbol.length(), 1)) {
    return trans_sym_t((int)symbol.char32At(0));
  } else {
    UString temp;
    temp.append(symbol.getBuffer(), (unsigned int)symbol.length());
    alphabet.includeSymbol(temp);
    return trans_sym_t(alphabet(temp));
  }
}
trans_sym_t LexdCompiler::alphabet_lookup(trans_sym_t l, trans_sym_t r)
{
  return trans_sym_t(alphabet((int)l, (int)r));
}
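// Single code points map directly to their numeric value; anything longer
// (a multichar symbol such as "<n>") is interned in the lttoolbox alphabet,
// which hands back a symbol id (negative in lttoolbox's convention, which
// appears to be why the regex range code below rejects ids <= 0).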

LexdCompiler::LexdCompiler()
{
  id_to_name.push_back("");
  name_to_id[""] = string_ref(0);
  lexicons[string_ref(0)] = vector<entry_t>();

  left_sieve_name = internName("<");
  token_t lsieve_tok = {.name=left_sieve_name, .part=1, .optional=false};
  pattern_element_t lsieve_elem = {.left=lsieve_tok, .right=lsieve_tok,
                                   .tag_filter=tag_filter_t(),
                                   .mode=Normal};
  left_sieve_tok = vector<pattern_element_t>(1, lsieve_elem);

  right_sieve_name = internName(">");
  token_t rsieve_tok = {.name=right_sieve_name, .part=1, .optional=false};
  pattern_element_t rsieve_elem = {.left=rsieve_tok, .right=rsieve_tok,
                                   .tag_filter=tag_filter_t(),
                                   .mode=Normal};
  right_sieve_tok = vector<pattern_element_t>(1, rsieve_elem);
}

LexdCompiler::~LexdCompiler()
{}

// u_*printf only accept const UChar*
// so here's a wrapper so we don't have to write all this out every time
// and make it a macro so we don't have issues with it deallocating
#define err(s) (to_ustring(s).c_str())
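// Typical use, as seen throughout this file:
//   die("Unexpected %S", err(*iter));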

void
LexdCompiler::die(const char* msg, ...)
{
  UFILE* err_out = u_finit(stderr, NULL, NULL);
  u_fprintf(err_out, "Error on line %d: ", lineNumber);
  va_list argptr;
  va_start(argptr, msg);
  u_vfprintf(err_out, msg, argptr);
  va_end(argptr);
  u_fputc('\n', err_out);
  exit(EXIT_FAILURE);
}

void LexdCompiler::appendLexicon(string_ref lexicon_id, const vector<entry_t> &to_append)
{
  if(lexicons.find(lexicon_id) == lexicons.end())
    lexicons[lexicon_id] = to_append;
  else
    lexicons[lexicon_id].insert(lexicons[lexicon_id].begin(), to_append.begin(), to_append.end());
}

void
LexdCompiler::finishLexicon()
{
  if(inLex)
  {
    if (currentLexicon.size() == 0) {
      die("Lexicon '%S' is empty.", err(name(currentLexiconId)));
    }
    appendLexicon(currentLexiconId, currentLexicon);

    currentLexicon.clear();
    currentLexicon_tags.clear();
  }
  inLex = false;
}

string_ref
LexdCompiler::internName(const UnicodeString& name)
{
  if(name_to_id.find(name) == name_to_id.end())
  {
    name_to_id[name] = string_ref(id_to_name.size());
    id_to_name.push_back(name);
  }
  return name_to_id[name];
}

string_ref
LexdCompiler::checkName(UnicodeString& name)
{
  const static UString forbidden = u" :?|()<>[]*+";
  name.trim();
  int l = name.length();
  if(l == 0) die("Unnamed pattern or lexicon.");

  for(const auto &c: forbidden) {
    if(name.indexOf(c) != -1) {
      die("Lexicon/pattern names cannot contain character '%C'", c);
    }
  }
  return internName(name);
}

tags_t
LexdCompiler::readTags(char_iter &iter, UnicodeString &line)
{
  tag_filter_t filter = readTagFilter(iter, line);
  if(filter.neg().empty() && filter.ops().empty())
    return tags_t((set<string_ref>)filter.pos());
  else
    die("Cannot declare negative tag in lexicon");
  return tags_t();
}

tag_filter_t
LexdCompiler::readTagFilter(char_iter& iter, UnicodeString& line)
{
  tag_filter_t tag_filter;
  auto tag_start = (++iter).span();
  bool tag_nonempty = false;
  bool negative = false;
  vector<shared_ptr<op_tag_filter_t>> ops;
  for(; !iter.at_end() && (*iter).length() > 0; ++iter)
  {
    if(*iter == "]" || *iter == "," || *iter == " ")
    {
      if(!tag_nonempty)
        die("Empty tag at char %d", iter.span().first);
      UnicodeString s = line.tempSubStringBetween(tag_start.first, iter.span().first);
      if(!tag_filter.combine(
        negative ? tag_filter_t(neg_tag_filter_t {checkName(s)})
                 : tag_filter_t(pos_tag_filter_t {checkName(s)})
      ))
        die("Illegal tag filter.");
      tag_nonempty = false;
      negative = false;
      if(*iter == "]")
      {
        iter++;
        return tag_filter_t(tag_filter.pos(), tag_filter.neg(), ops);
      }
    }
    else if(!tag_nonempty && *iter == "-")
    {
      negative = true;
    }
    else if(!tag_nonempty && (*iter == "|" || *iter == "^"))
    {
      const UnicodeString s = *iter;
      if(negative)
        die("Illegal negated operation.");
      *iter++;
      if (*iter == "[")
      {
        shared_ptr<op_tag_filter_t> op;
        tags_t operands = readTags(iter, line);
        if (s == "|")
          op = make_shared<or_tag_filter_t>(operands);
        else if (s == "^")
          op = make_shared<xor_tag_filter_t>(operands);
        ops.push_back(op);
      }
      else
        die("Expected list of operands.");
      if(*iter == "]")
      {
        iter++;
        return tag_filter_t(tag_filter.pos(), tag_filter.neg(), ops);
      }
    }
    else if(!tag_nonempty)
    {
      tag_nonempty = true;
      tag_start = iter.span();
    }
  }
  die("End of line in tag list, expected ']'");
  return tag_filter_t();
}
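// Grammar parsed here, informally: a filter is "[" followed by items
// separated by "," or " ", where an item is TAG, -TAG, or an operation
// |[TAG ...] / ^[TAG ...], and the filter ends at "]".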

void
LexdCompiler::appendSymbol(const UnicodeString& s, lex_token_t& tok)
{
  if (shouldCombine) {
    tok.symbols.push_back(alphabet_lookup(s));
  } else {
    for (int c = 0; c < s.length(); c++) {
      tok.symbols.push_back(alphabet_lookup(s[c]));
    }
  }
}

void
LexdCompiler::readSymbol(char_iter& iter, UnicodeString& line, lex_token_t& tok)
{
  if ((*iter).startsWith("\\")) {
    if ((*iter).length() == 1) {
      appendSymbol(*++iter, tok);
    } else {
      appendSymbol((*iter).tempSubString(1), tok);
    }
  } else if ((*iter).startsWith(":")) {
    appendSymbol((*iter).tempSubString(1), tok);
  } else if (*iter == "{" || *iter == "<") {
    UChar end = (*iter == "{") ? '}' : '>';
    int i = iter.span().first;
    for (; !iter.at_end() && *iter != end; ++iter) ;

    if (*iter == end) {
      tok.symbols.push_back(alphabet_lookup(line.tempSubStringBetween(i, iter.span().second)));
    } else {
      die("Multichar symbol didn't end; searching for %S", err(end));
    }
  } else {
    appendSymbol(*iter, tok);
  }
}
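// Reads one symbol: "\" escapes the next character, and "{...}" or "<...>"
// (e.g. "<n>") is interned whole as a single multichar symbol rather than
// character by character.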

int
LexdCompiler::processRegexTokenSeq(char_iter& iter, UnicodeString& line, Transducer* trans, int start_state)
{
  bool inleft = true;
  vector<vector<lex_token_t>> left, right;
  for (; !iter.at_end(); ++iter) {
    if (*iter == "(" || *iter == ")" || *iter == "|" || *iter == "/") break;
    else if (*iter == "?" || *iter == "*" || *iter == "+")
      die("Quantifier %S may only be applied to parenthesized groups", err(*iter));
    else if (*iter == "]") die("Regex contains mismatched ]");
    else if (*iter == ":" && inleft) inleft = false;
    else if (*iter == ":") die("Regex contains multiple colons");
    else if (*iter == "[") {
      ++iter;
      vector<lex_token_t> sym;
      for (; !iter.at_end(); ++iter) {
        if (*iter == "]") break;
        else if (*iter == "-" && !sym.empty()) {
          ++iter;
          if (*iter == "]" || iter.at_end()) {
            --iter;
            lex_token_t temp;
            readSymbol(iter, line, temp);
            sym.push_back(temp);
          } else {
            lex_token_t start = sym.back();
            lex_token_t end;
            readSymbol(iter, line, end);
            // This will fail on diacritics even with -U
            // on the principle that command-line args should not
            // change the validity of the code -DGS 2022-05-17
            if (start.symbols.size() != 1 || end.symbols.size() != 1 ||
                (int)start.symbols[0] <= 0 || (int)end.symbols[0] <= 0)
              die("Cannot process symbol range between multichar symbols");
            int i_start = (int)start.symbols[0];
            int i_end = (int)end.symbols[0];
            if (i_start > i_end)
              die("First character in symbol range does not precede last");
            for (int i = 1 + i_start; i <= i_end; i++) {
              lex_token_t mid;
              mid.symbols.push_back((trans_sym_t)i);
              sym.push_back(mid);
            }
          }
        } else {
          lex_token_t temp;
          readSymbol(iter, line, temp);
          sym.push_back(temp);
        }
      }
      (inleft ? left : right).push_back(sym);
    } else {
      vector<lex_token_t> v_temp;
      lex_token_t t_temp;
      readSymbol(iter, line, t_temp);
      v_temp.push_back(t_temp);
      (inleft ? left : right).push_back(v_temp);
    }
  }
  int state = start_state;
  vector<lex_token_t> empty_vec;
  lex_token_t empty_tok;
  empty_tok.symbols.push_back(trans_sym_t());
  empty_vec.push_back(empty_tok);
  for (unsigned int i = 0; i < left.size() || i < right.size(); i++) {
    vector<lex_token_t>& lv = (i < left.size() && !left[i].empty() ?
                               left[i] : empty_vec);
    vector<lex_token_t>& rv = (i < right.size() && !right[i].empty() ?
                               right[i] : empty_vec);
    bool first = true;
    int dest_state = 0;
    if (inleft) {
      for (auto& s : lv) {
        if (first) {
          dest_state = state;
          for (auto& it : s.symbols)
            dest_state = trans->insertNewSingleTransduction(alphabet((int)it, (int)it), dest_state);
          if (dest_state == state)
            dest_state = trans->insertNewSingleTransduction(0, dest_state);
          first = false;
        } else if (s.symbols.empty()) {
          trans->linkStates(state, dest_state, 0);
        } else {
          int cur_state = state;
          for (unsigned int k = 0; k < s.symbols.size(); k++) {
            if (k+1 == s.symbols.size())
              trans->linkStates(cur_state, dest_state, alphabet((int)s.symbols[k], (int)s.symbols[k]));
            else
              cur_state = trans->insertNewSingleTransduction(alphabet((int)s.symbols[k], (int)s.symbols[k]), cur_state);
          }
        }
      }
    } else {
      for (auto& l : lv) {
        for (auto& r : rv) {
          vector<int> paired;
          for (unsigned int j = 0; j < l.symbols.size() || j < r.symbols.size(); j++) {
            trans_sym_t ls = (j < l.symbols.size() ? l.symbols[j] : trans_sym_t());
            trans_sym_t rs = (j < r.symbols.size() ? r.symbols[j] : trans_sym_t());
            paired.push_back(alphabet((int)ls, (int)rs));
          }
          if (first) {
            dest_state = state;
            for (auto& it : paired) {
              dest_state = trans->insertNewSingleTransduction(it, dest_state);
            }
            first = false;
          } else {
            int cur_state = state;
            for (unsigned int k = 0; k < paired.size(); k++) {
              if (k+1 == paired.size())
                trans->linkStates(cur_state, dest_state, paired[k]);
              else
                cur_state = trans->insertNewSingleTransduction(paired[k], cur_state);
            }
          }
        }
      }
    }
    state = dest_state;
  }
  return state;
}
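// Sketch (assuming lexd's /.../ regex notation): within a token sequence,
// [b-d] expands to the alternatives b, c, d via the range loop above, ":"
// separates the left and right sides, and the two sides are paired
// positionally, with epsilon filling in whichever side is shorter.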

int
LexdCompiler::processRegexGroup(char_iter& iter, UnicodeString& line, Transducer* trans, int start_state, unsigned int depth)
{
  ++iter; // initial slash or paren
  int state = start_state;
  vector<int> option_ends;
  for (; !iter.at_end(); ++iter) {
    if (*iter == "(") {
      state = trans->insertNewSingleTransduction(0, state);
      state = processRegexGroup(iter, line, trans, state, depth+1);
      --iter;
      // this function ends on character after close paren or quantifier
      // so step back so loop increment doesn't skip a character
    }
    else if (*iter == ")" || *iter == "/") break;
    else if (*iter == "|") {
      if (state == start_state)
        state = trans->insertNewSingleTransduction(0, state);
      option_ends.push_back(state);
      state = start_state;
    }
    else {
      state = processRegexTokenSeq(iter, line, trans, state);
      --iter;
    }
  }
  if (state == start_state)
    state = trans->insertNewSingleTransduction(0, state);
  for (auto& it : option_ends)
    trans->linkStates(it, state, 0);
  if ((depth > 0 && *iter == "/") || (depth == 0 && *iter == ")"))
    die("Mismatched parentheses in regex");
  if (iter.at_end())
    die("Unterminated regex");
  ++iter;
  if (depth > 0) {
    if (*iter == "?") {
      trans->linkStates(start_state, state, 0);
      ++iter;
    } else if (*iter == "*") {
      trans->linkStates(start_state, state, 0);
      trans->linkStates(state, start_state, 0);
      ++iter;
    } else if (*iter == "+") {
      trans->linkStates(state, start_state, 0);
      ++iter;
    }
  }
  return state;
}
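// Groups nest as /.../ at depth 0 with (...) inside; "|" joins branch ends
// through option_ends, and a trailing ?, *, or + on a parenthesized group
// adds the epsilon links above (skip, skip plus loop, and loop only,
// respectively).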

lex_seg_t
LexdCompiler::processLexiconSegment(char_iter& iter, UnicodeString& line, unsigned int part_count)
{
  lex_seg_t seg;
  bool inleft = true;
  bool left_tags_applied = false, right_tags_applied = false;
  tag_filter_t tags;
  if((*iter).startsWith(" "))
  {
    if((*iter).length() > 1)
    {
      // if it's a space with a combining diacritic after it,
      // then we want the diacritic
      UnicodeString cur = *iter;
      cur.retainBetween(1, cur.length());
      seg.left.symbols.push_back(alphabet_lookup(cur));
    }
    ++iter;
  }
  if((*iter).startsWith("/") && seg.left.symbols.size() == 0)
  {
    seg.regex = new Transducer();
    int state = processRegexGroup(iter, line, seg.regex, 0, 0);
    seg.regex->setFinal(state);
  }
  if(iter.at_end() && seg.regex == nullptr && seg.left.symbols.size() == 0)
    die("Expected %d parts, found %d", currentLexiconPartCount, part_count);
  for(; !iter.at_end(); ++iter)
  {
    if((*iter).startsWith(" ") || *iter == ']')
      break;
    else if(*iter == "[")
    {
      auto &tags_applied = inleft ? left_tags_applied : right_tags_applied;
      if(tags_applied)
        die("Already provided tag list for this side.");
      tags = readTagFilter(iter, line);
      --iter;
      tags_applied = true;
    }
    else if((*iter).startsWith(":"))
    {
      if(inleft)
        inleft = false;
      else
        die("Lexicon entry contains multiple colons");
      if ((*iter).length() > 1) readSymbol(iter, line, seg.right);
    }
    else readSymbol(iter, line, (inleft ? seg.left : seg.right));
  }
  if(inleft)
  {
    seg.right = seg.left;
  }

  if (seg.regex != nullptr &&
      !(seg.left.symbols.empty() && seg.right.symbols.empty()))
    die("Lexicon entry contains both regex and text");

  seg.tags = currentLexicon_tags;

  if(!tags.try_apply(seg.tags))
  {
    tags_t diff = subtractset(tags.neg(), seg.tags);
    for(string_ref t: diff)
      cerr << "Bad tag '-" << to_ustring(name(t)) << "'" << endl;
    die("Negative tag has no default to unset.");
  }

  return seg;
}
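// So one lexicon segment (assuming lexd entry syntax) is plain text like
// abc, two-sided text like abc:xyz, or a regex /.../, and either side may
// carry a [tag,...] filter that is applied on top of the lexicon's
// default tags.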

token_t
LexdCompiler::readToken(char_iter& iter, UnicodeString& line)
{
  auto begin_charspan = iter.span();

  const UnicodeString boundary = " :()[]+*?|<>";

  for(; !iter.at_end() && boundary.indexOf(*iter) == -1; ++iter);
  UnicodeString name;
  line.extract(begin_charspan.first, (iter.at_end() ? line.length() : iter.span().first) - begin_charspan.first, name);

  if(name.length() == 0)
    die("Symbol '%S' without lexicon name at u16 %d-%d", err(*iter), iter.span().first, iter.span().second-1);

  bool optional = false;
  if(*iter == "?") {
    iter++;
    if(*iter == "(") {
      optional = true;
    } else {
      iter--;
    }
  }

  unsigned int part = 1;
  if(*iter == "(")
  {
    iter++;
    begin_charspan = iter.span();
    for(; !iter.at_end() && (*iter).length() > 0 && *iter != ")"; iter++)
    {
      if((*iter).length() != 1 || !u_isdigit((*iter).charAt(0)))
        die("Syntax error - non-numeric index in parentheses: %S", err(*iter));
    }
    if(*iter != ")")
      die("Syntax error - unmatched parenthesis");
    if(iter.span().first == begin_charspan.first)
      die("Syntax error - missing index in parenthesis");
    part = (unsigned int)StringUtils::stoi(to_ustring(line.tempSubStringBetween(begin_charspan.first, iter.span().first)));
    if (part == 0) die("Invalid column number (0)");
    ++iter;
  }

  return token_t {.name = internName(name), .part = part, .optional = optional};
}
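// Accepts references like Stem, Stem(2) (column 2 of a multi-part
// lexicon), and Stem?(2), where a "?" directly before "(" marks the
// reference optional.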

RepeatMode
LexdCompiler::readModifier(char_iter& iter)
{
  if(*iter == "?")
  {
    ++iter;
    return Question;
  }
  else if(*iter == "*")
  {
    ++iter;
    return Star;
  }
  else if(*iter == "+")
  {
    ++iter;
    return Plus;
  }
  return Normal;
}

pattern_element_t
LexdCompiler::readPatternElement(char_iter& iter, UnicodeString& line)
{
  const UnicodeString boundary = " :()[]+*?|<>";
  pattern_element_t tok;
  if(*iter == ":")
  {
    iter++;
    if(boundary.indexOf(*iter) != -1)
    {
      if(*iter == ":")
        die("Syntax error - double colon");
      else
        die("Colon without lexicon or pattern name");
    }
    tok.right = readToken(iter, line);
  }
  else if(boundary.indexOf(*iter) != -1)
  {
    die("Unexpected symbol '%S' at column %d", err(*iter), iter.span().first);
  }
  else
  {
    tok.left = readToken(iter, line);
    if(*iter == "[")
    {
      tok.tag_filter.combine(readTagFilter(iter, line));
    }
    if(*iter == ":")
    {
      iter++;
      if(!iter.at_end() && (*iter).length() > 0)
      {
        if(boundary.indexOf(*iter) == -1)
        {
          tok.right = readToken(iter, line);
        }
      }
    }
    else
    {
      tok.right = tok.left;
    }
  }
  if(*iter == "[")
  {
    tok.tag_filter.combine(readTagFilter(iter, line));
  }
  tok.mode = readModifier(iter);

  return tok;
}
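// Covers the one- and two-sided forms, e.g. ":Suffix" (right side only),
// "Prefix:" (left side only), and "Root[v]:Root(2)" (collated sides),
// each optionally followed by ?, *, or +.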

void
LexdCompiler::processPattern(char_iter& iter, UnicodeString& line)
{
  vector<pattern_t> pats_cur(1);
  vector<pattern_element_t> alternation;
  bool final_alternative = true;
  bool sieve_forward = false;
  bool just_sieved = false;
  const UnicodeString boundary = " :()[]+*?|<>";
  const UnicodeString token_boundary = " )|<>";
  const UnicodeString token_side_boundary = token_boundary + ":+*?";
  const UnicodeString token_side_name_boundary = token_side_boundary + "([]";
  const UnicodeString modifier = "+*?";
  const UnicodeString decrement_after_token = token_boundary + "([]";

  for(; !iter.at_end() && *iter != ')' && (*iter).length() > 0; ++iter)
  {
    if(*iter == " ") ;
    else if(*iter == "|")
    {
      if(alternation.empty())
        die("Syntax error - initial |");
      if(!final_alternative)
        die("Syntax error - multiple consecutive |");
      if(just_sieved)
        die("Syntax error - sieve and alternation operators without intervening token");
      final_alternative = false;
    }
    else if(*iter == "<")
    {
      if(sieve_forward)
        die("Syntax error - cannot sieve backwards after forwards.");
      if(alternation.empty())
        die("Backward sieve without token?");
      if(just_sieved)
        die("Syntax error - multiple consecutive sieve operators");
      if(!final_alternative)
        die("Syntax error - alternation and sieve operators without intervening token");
      expand_alternation(pats_cur, alternation);
      expand_alternation(pats_cur, left_sieve_tok);
      alternation.clear();
      just_sieved = true;
    }
    else if(*iter == ">")
    {
      sieve_forward = true;
      if(alternation.empty())
        die("Forward sieve without token?");
      if(just_sieved)
        die("Syntax error - multiple consecutive sieve operators");
      if(!final_alternative)
        die("Syntax error - alternation and sieve operators without intervening token");
      expand_alternation(pats_cur, alternation);
      expand_alternation(pats_cur, right_sieve_tok);
      alternation.clear();
      just_sieved = true;
    }
    else if(*iter == "[")
    {
      UnicodeString name = UnicodeString::fromUTF8(" " + to_string(anonymousCount++));
      currentLexiconId = internName(name);
      currentLexiconPartCount = 1;
      inLex = true;
      entry_t entry;
      entry.push_back(processLexiconSegment(++iter, line, 0));
      if(*iter == " ") iter++;
      if(*iter != "]")
        die("Missing closing ] for anonymous lexicon");
      currentLexicon.push_back(entry);
      finishLexicon();
      if(final_alternative && !alternation.empty())
      {
        expand_alternation(pats_cur, alternation);
        alternation.clear();
      }
      ++iter;
      pattern_element_t anon;
      anon.left = {.name=currentLexiconId, .part=1, .optional=false};
      anon.right = anon.left;
      anon.mode = readModifier(iter);
      alternation.push_back(anon);
      --iter;
      final_alternative = true;
      just_sieved = false;
    }
    else if(*iter == "(")
    {
      string_ref temp = currentPatternId;
      UnicodeString name = UnicodeString::fromUTF8(" " + to_string(anonymousCount++));
      currentPatternId = internName(name);
      ++iter;
      processPattern(iter, line);
      if(*iter == " ")
        *iter++;
      if(iter.at_end() || *iter != ")")
        die("Missing closing ) for anonymous pattern");
      ++iter;
      tag_filter_t filter;
      if(*iter == "[")
        filter = readTagFilter(iter, line);
      if(final_alternative && !alternation.empty())
      {
        expand_alternation(pats_cur, alternation);
        alternation.clear();
      }
      pattern_element_t anon;
      anon.left = {.name=currentPatternId, .part=1, .optional=false};
      anon.right = anon.left;
      anon.mode = readModifier(iter);
      anon.tag_filter = filter;
      for(const auto &tok : distribute_tag_expressions(anon))
        alternation.push_back(tok);
      --iter;
      currentPatternId = temp;
      final_alternative = true;
      just_sieved = false;
    }
    else if(*iter == "?" || *iter == "*" || *iter == "+")
    {
      die("Syntax error - unexpected modifier at u16 %d-%d", iter.span().first, iter.span().second);
    }
    else
    {
      if(final_alternative && !alternation.empty())
      {
        expand_alternation(pats_cur, alternation);
        alternation.clear();
      }
      for(const auto &tok : distribute_tag_expressions(readPatternElement(iter, line)))
        alternation.push_back(tok);
      iter--;
      final_alternative = true;
      just_sieved = false;
    }
  }
  if(!final_alternative)
    die("Syntax error - trailing |");
  if(just_sieved)
    die("Syntax error - trailing sieve (< or >)");
  expand_alternation(pats_cur, alternation);
  for(const auto &pat : pats_cur)
  {
    patterns[currentPatternId].push_back(make_pair(lineNumber, pat));
  }
}
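// A line such as "Prefix? Root[v] > Suffix | Suffix2" (hypothetical
// example) ends up as one pattern_t per alternation branch under
// currentPatternId, with the sieve stored as the reserved ">" token
// created in the constructor.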

void
LexdCompiler::processNextLine()
{
  UnicodeString line;
  UChar c;
  bool escape = false;
  bool comment = false;
  bool lastWasSpace = false;
  while((c = u_fgetc(input)) != '\n')
  {
    bool space = false;
    if(c == U_EOF)
    {
      doneReading = true;
      break;
    }
    if(comment) continue;
    if(escape)
    {
      line += c;
      escape = false;
    }
    else if(c == '\\')
    {
      escape = true;
      line += c;
    }
    else if(c == '#')
    {
      comment = true;
    }
    else if(u_isWhitespace(c))
    {
      if(line.length() > 0 && !lastWasSpace)
      {
        line += ' ';
      }
      space = (line.length() > 0);
    }
    else line += c;
    lastWasSpace = space;
  }
  lineNumber++;
  if(escape) die("Trailing backslash");
  if(line.length() == 0) return;

  if(line == "PATTERNS" || line == "PATTERNS ")
  {
    finishLexicon();
    UnicodeString name = " ";
    currentPatternId = internName(name);
    inPat = true;
  }
  else if(line.length() > 7 && line.startsWith("PATTERN "))
  {
    UnicodeString name = line.tempSubString(8);
    finishLexicon();
    currentPatternId = checkName(name);
    if (lexicons.find(currentPatternId) != lexicons.end()) {
      die("The name '%S' cannot be used for both LEXICONs and PATTERNs.",
          err(name));
    }
    inPat = true;
  }
  else if(line.length() > 7 && line.startsWith("LEXICON "))
  {
    UnicodeString name = line.tempSubString(8);
    name.trim();
    finishLexicon();
    if(name.length() > 1 && name.indexOf('[') != -1)
    {
      UnicodeString tags = name.tempSubString(name.indexOf('['));
      auto c = char_iter(tags);
      currentLexicon_tags = readTags(c, tags);
      if(c != c.end() && *c == ":")
      {
        cerr << "WARNING: One-sided tags are deprecated and will soon be removed (line " << lineNumber << ")" << endl;
        ++c;
        if(*c == "[")
          unionset_inplace(currentLexicon_tags, readTags(c, tags));
        else
          die("Expected start of default right tags '[' after ':'.");
      }
      if(c != c.end())
        die("Unexpected character '%C' after default tags.", (*c)[0]);
      name.retainBetween(0, name.indexOf('['));
    }
    currentLexiconPartCount = 1;
    if(name.length() > 1 && name.endsWith(')'))
    {
      UnicodeString num;
      for(int i = name.length()-2; i > 0; i--)
      {
        if(u_isdigit(name[i])) num = name[i] + num;
        else if(name[i] == '(' && num.length() > 0)
        {
          currentLexiconPartCount = (unsigned int)StringUtils::stoi(to_ustring(num));
          name = name.retainBetween(0, i);
        }
        else break;
      }
      if(name.length() == 0) die("Unnamed lexicon");
    }
    currentLexiconId = checkName(name);
    if(lexicons.find(currentLexiconId) != lexicons.end()) {
      if(lexicons[currentLexiconId][0].size() != currentLexiconPartCount) {
        die("Multiple incompatible definitions for lexicon '%S'.", err(name));
      }
    }
    if (patterns.find(currentLexiconId) != patterns.end()) {
      die("The name '%S' cannot be used for both LEXICONs and PATTERNs.",
          err(name));
    }
    inLex = true;
    inPat = false;
  }
  else if(line.length() >= 9 && line.startsWith("ALIAS "))
  {
    finishLexicon();
    if(line.endsWith(' ')) line.retainBetween(0, line.length()-1);
    int loc = line.indexOf(" ", 6);
    if(loc == -1) die("Expected 'ALIAS lexicon alt_name'");
    UnicodeString name = line.tempSubString(6, loc-6);
    UnicodeString alt = line.tempSubString(loc+1);
    string_ref altid = checkName(alt);
    string_ref lexid = checkName(name);
    if(lexicons.find(lexid) == lexicons.end()) die("Attempt to alias undefined lexicon '%S'.", err(name));
    lexicons[altid] = lexicons[lexid];
    inLex = false;
    inPat = false;
  }
  else if(inPat)
  {
    char_iter iter = char_iter(line);
    processPattern(iter, line);
    if(!iter.at_end() && (*iter).length() > 0)
      die("Unexpected %S", err(*iter));
  }
  else if(inLex)
  {
    char_iter iter = char_iter(line);
    entry_t entry;
    for(unsigned int i = 0; i < currentLexiconPartCount; i++)
    {
      entry.push_back(processLexiconSegment(iter, line, i));
      if (*iter == "]") die("Unexpected closing bracket.");
    }
    if(*iter == ' ') ++iter;
    if(!iter.at_end())
      die("Lexicon entry has '%S' (found at u16 %d), more than %d components", err(*iter), iter.span().first, currentLexiconPartCount);
    currentLexicon.push_back(entry);
  }
  else die("Expected 'PATTERNS' or 'LEXICON'");
}

bool
LexdCompiler::isLexiconToken(const pattern_element_t& tok)
{
  const bool llex = (tok.left.name.empty() || (lexicons.find(tok.left.name) != lexicons.end()));
  const bool rlex = (tok.right.name.empty() || (lexicons.find(tok.right.name) != lexicons.end()));
  if(llex && rlex)
  {
    return true;
  }
  const bool lpat = (patterns.find(tok.left.name) != patterns.end());
  const bool rpat = (patterns.find(tok.right.name) != patterns.end());
  if(tok.left.name == tok.right.name && lpat && rpat)
  {
    if(tok.left.part != 1 || tok.right.part != 1)
    {
      die("Cannot select part of pattern %S", err(name(tok.right.name)));
    }
    return false;
  }
  // Any other scenario is an error, so we need to die()
  if(lpat && rpat)
  {
    die("Cannot collate pattern %S with %S", err(name(tok.left.name)), err(name(tok.right.name)));
  }
  else if((lpat && tok.right.name.empty()) || (rpat && tok.left.name.empty()))
  {
    die("Cannot select side of pattern %S", err(name(tok.left.name.valid() ? tok.left.name : tok.right.name)));
  }
  else if(llex && rpat)
  {
    die("Cannot collate lexicon %S with pattern %S", err(name(tok.left.name)), err(name(tok.right.name)));
  }
  else if(lpat && rlex)
  {
    die("Cannot collate pattern %S with lexicon %S", err(name(tok.left.name)), err(name(tok.right.name)));
  }
  else
  {
    cerr << "Patterns: ";
    for(auto pat: patterns)
      cerr << to_ustring(name(pat.first)) << " ";
    cerr << endl;
    cerr << "Lexicons: ";
    for(auto l: lexicons)
      cerr << to_ustring(name(l.first)) << " ";
    cerr << endl;
    die("Lexicon or pattern '%S' is not defined.", err(name((llex || lpat) ? tok.right.name : tok.left.name)));
  }
  // we never reach this point, but the compiler doesn't understand die()
  // so we put a fake return value to keep it happy
  return false;
}

void
LexdCompiler::buildPattern(int state, Transducer* t, const pattern_t& pat, const vector<int> is_free, unsigned int pos)
{
  if(pos == pat.size())
  {
    t->setFinal(state);
    return;
  }
  const pattern_element_t& tok = pat[pos];
  if(tok.left.name == left_sieve_name)
  {
    t->linkStates(t->getInitial(), state, 0);
    buildPattern(state, t, pat, is_free, pos+1);
  }
  else if(tok.left.name == right_sieve_name)
  {
    t->setFinal(state);
    buildPattern(state, t, pat, is_free, pos+1);
  }
  else if(isLexiconToken(tok))
  {
    if(is_free[pos] == 1)
    {
      Transducer *lex = getLexiconTransducer(pat[pos], 0, true);
      if(lex)
      {
        int new_state = t->insertTransducer(state, *lex);
        buildPattern(new_state, t, pat, is_free, pos+1);
      }
      return;
    }
    else if(matchedParts.find(tok.left.name) == matchedParts.end() &&
            matchedParts.find(tok.right.name) == matchedParts.end())
    {
      unsigned int max = lexicons[tok.left.name || tok.right.name].size();
      if (tok.optional()) max++;
      for(unsigned int index = 0; index < max; index++)
      {
        Transducer *lex = getLexiconTransducer(pat[pos], index, false);
        if(lex)
        {
          int new_state = t->insertTransducer(state, *lex);
          if(new_state == state)
          {
            new_state = t->insertNewSingleTransduction(0, state);
          }
          if(tok.left.name.valid()) matchedParts[tok.left.name] = index;
          if(tok.right.name.valid()) matchedParts[tok.right.name] = index;
          buildPattern(new_state, t, pat, is_free, pos+1);
        }
      }
      if(tok.left.name.valid()) matchedParts.erase(tok.left.name);
      if(tok.right.name.valid()) matchedParts.erase(tok.right.name);
      return;
    }
    if(tok.left.name.valid() && matchedParts.find(tok.left.name) == matchedParts.end())
      matchedParts[tok.left.name] = matchedParts[tok.right.name];
    if(tok.right.name.valid() && matchedParts.find(tok.right.name) == matchedParts.end())
      matchedParts[tok.right.name] = matchedParts[tok.left.name];
    if(tok.left.name.valid() && tok.right.name.valid() && matchedParts[tok.left.name] != matchedParts[tok.right.name])
      die("Cannot collate %S with %S - both appear in free variation earlier in the pattern.", err(name(tok.left.name)), err(name(tok.right.name)));
    Transducer* lex = getLexiconTransducer(pat[pos], matchedParts[tok.left.name || tok.right.name], false);
    if(lex)
    {
      int new_state = t->insertTransducer(state, *lex);
      buildPattern(new_state, t, pat, is_free, pos+1);
    }
    return;
  }
  else
  {
    Transducer *p = buildPattern(tok);
    if(!p->hasNoFinals())
    {
      int new_state = t->insertTransducer(state, *p);
      if(tok.mode & Optional)
        t->linkStates(state, new_state, 0);
      if(tok.mode & Repeated)
        t->linkStates(new_state, state, 0);
      buildPattern(new_state, t, pat, is_free, pos+1);
    }
  }
}

vector<int>
LexdCompiler::determineFreedom(pattern_t& pat)
{
  vector<int> is_free = vector<int>(pat.size(), 0);
  map<string_ref, bool> is_optional;
  for(unsigned int i = 0; i < pat.size(); i++)
  {
    const pattern_element_t& t1 = pat[i];
    if (is_optional.find(t1.left.name) != is_optional.end() && is_optional[t1.left.name] != t1.optional()) {
      die("Lexicon %S cannot be both optional and non-optional in a single pattern.", err(name(t1.left.name)));
    }
    if (is_optional.find(t1.right.name) != is_optional.end() && is_optional[t1.right.name] != t1.optional()) {
      die("Lexicon %S cannot be both optional and non-optional in a single pattern.", err(name(t1.right.name)));
    }
    if (t1.left.name.valid()) {
      is_optional[t1.left.name] = t1.optional();
    }
    if (t1.right.name.valid()) {
      is_optional[t1.right.name] = t1.optional();
    }
    if(is_free[i] != 0)
      continue;
    for(unsigned int j = i+1; j < pat.size(); j++)
    {
      const pattern_element_t& t2 = pat[j];
      if((t1.left.name.valid() && (t1.left.name == t2.left.name || t1.left.name == t2.right.name)) ||
         (t1.right.name.valid() && (t1.right.name == t2.left.name || t1.right.name == t2.right.name)))
      {
        is_free[i] = -1;
        is_free[j] = -1;
      }
    }
    is_free[i] = (is_free[i] == 0 ? 1 : -1);
  }
  return is_free;
}
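// Example (hypothetical pattern): in "Root Theme Root(2)", the two Root
// tokens share a name, so both get is_free == -1 and buildPattern walks
// their entries in lockstep via matchedParts; Theme appears once and is
// free (is_free == 1), so its whole transducer is inserted at once.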

Transducer*
LexdCompiler::buildPattern(const pattern_element_t &tok)
{
  if(tok.left.part != 1 || tok.right.part != 1)
    die("Cannot build collated pattern %S", err(name(tok.left.name)));
  if(patternTransducers.find(tok) == patternTransducers.end())
  {
    if (verbose) cerr << "Compiling " << to_ustring(printPattern(tok)) << endl;
    auto start_time = chrono::steady_clock::now();
    Transducer* t = new Transducer();
    patternTransducers[tok] = NULL;
    map<string_ref, unsigned int> tempMatch;
    tempMatch.swap(matchedParts);
    for(auto &pat_untagged : patterns[tok.left.name])
    {
      for(unsigned int i = 0; i < pat_untagged.second.size(); i++)
      {
        auto pat = pat_untagged;
        bool taggable = true;
        for (unsigned int j = 0; j < pat.second.size(); j++) {
          auto& pair = pat.second[j];
          if(!pair.tag_filter.combine(tok.tag_filter.neg())) {
            taggable = false;
            if (verbose) {
              cerr << "Warning: The tags of " << to_ustring(printPattern(tok));
              cerr << " conflict with " << to_ustring(printPattern(pat_untagged.second[j]));
              cerr << " on line " << pat.first << "." << endl;
            }
          }
        }
        if(!pat.second[i].tag_filter.combine(tok.tag_filter.pos())) {
          taggable = false;
          if (verbose) {
            cerr << "Warning: The tags of " << to_ustring(printPattern(tok));
            cerr << " conflict with " << to_ustring(printPattern(pat_untagged.second[i]));
            cerr << " on line " << pat.first << "." << endl;
          }
        }
        if (!taggable) continue;

        matchedParts.clear();
        lineNumber = pat.first;
        vector<int> is_free = determineFreedom(pat.second);
        buildPattern(t->getInitial(), t, pat.second, is_free, 0);
      }
    }
    tempMatch.swap(matchedParts);
    if(!t->hasNoFinals())
    {
      if (verbose)
        cerr << "Minimizing " << to_ustring(printPattern(tok)) << endl;
      t->minimize();
    }
    else if (verbose) {
      cerr << "Warning: " << to_ustring(printPattern(tok));
      cerr << " is empty." << endl;
    }
    patternTransducers[tok] = t;
    if (verbose) {
      auto end_time = chrono::steady_clock::now();
      chrono::duration<double> diff = end_time - start_time;
      cerr << "Done compiling " << to_ustring(printPattern(tok));
      cerr << " in " << diff.count() << " seconds." << endl;
    }
  }
  else if(patternTransducers[tok] == NULL)
  {
    die("Cannot compile self-recursive %S", err(printPattern(tok)));
  }
  return patternTransducers[tok];
}

int
LexdCompiler::insertPreTags(Transducer* t, int state, tag_filter_t &tags)
{
  int end = state;
  for(auto tag : tags.pos())
  {
    trans_sym_t flag = getFlag(Positive, tag, 1);
    end = t->insertSingleTransduction((int)alphabet_lookup(flag, flag), end);
  }
  for(auto tag : tags.neg())
  {
    trans_sym_t flag = getFlag(Positive, tag, 2);
    end = t->insertSingleTransduction((int)alphabet_lookup(flag, flag), end);
  }
  return end;
}

int
LexdCompiler::insertPostTags(Transducer* t, int state, tag_filter_t &tags)
{
  int end = 0;
  int flag_dest = 0;
  for(auto tag : tags.pos())
  {
    trans_sym_t flag = getFlag(Disallow, tag, 1);
    trans_sym_t clear = getFlag(Clear, tag, 0);
    if(flag_dest == 0)
    {
      flag_dest = t->insertSingleTransduction((int)alphabet_lookup(flag, flag), state);
      end = flag_dest;
    }
    else
    {
      t->linkStates(state, flag_dest, (int)alphabet_lookup(flag, flag));
    }
    end = t->insertSingleTransduction((int)alphabet_lookup(clear, clear), end);
  }
  if(end == 0)
  {
    end = state;
  }
  for(auto tag : tags.neg())
  {
    trans_sym_t clear = getFlag(Clear, tag, 0);
    end = t->insertSingleTransduction((int)alphabet_lookup(clear, clear), end);
  }
  return end;
}
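// When tagsAsFlags is set, tag filters become flag diacritics: insertPreTags
// emits one positive flag per tag before the token (value 1 for required
// tags, 2 for negated ones), and insertPostTags emits the matching Disallow
// and Clear flags after it, so the filter appears to be enforced by flag
// matching at runtime rather than during compilation.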

Transducer*
LexdCompiler::buildPatternWithFlags(const pattern_element_t &tok, int pattern_start_state = 0)
{
  if(patternTransducers.find(tok) == patternTransducers.end())
  {
    if (verbose) cerr << "Compiling " << to_ustring(printPattern(tok)) << endl;
    auto start_time = chrono::steady_clock::now();
    Transducer* trans = (shouldHypermin ? hyperminTrans : new Transducer());
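    // Note: when shouldHypermin is false this is an owning raw pointer; the
    // analyzer's "potential leak of 'trans'" warning (line 1540) most likely
    // flags a path through this function where the freshly allocated
    // Transducer is neither stored nor deleted.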
| 1320 | patternTransducers[tok] = NULL__null; | |||
| 1321 | unsigned int transition_index = 0; | |||
| 1322 | vector<int> pattern_finals; | |||
| 1323 | bool did_anything = false; | |||
| 1324 | for(auto& pat : patterns[tok.left.name]) | |||
| 1325 | { | |||
| 1326 | lineNumber = pat.first; | |||
| 1327 | vector<int> is_free = determineFreedom(pat.second); | |||
| 1328 | bool got_non_null = false; | |||
| 1329 | unsigned int count = (tok.tag_filter.pos().size() > 0 ? pat.second.size() : 1); | |||
| 1330 | if(tagsAsFlags) count = 1; | |||
| 1331 | for(unsigned int idx = 0; idx < count; idx++) | |||
| 1332 | { | |||
| 1333 | int state = pattern_start_state; | |||
| 1334 | vector<int> finals; | |||
| 1335 | set<string_ref> to_clear; | |||
| 1336 | bool got_null = false; | |||
| 1337 | for(unsigned int i = 0; i < pat.second.size(); i++) | |||
| 1338 | { | |||
| 1339 | pattern_element_t cur = pat.second[i]; | |||
| 1340 | ||||
| 1341 | if(cur.left.name == left_sieve_name) | |||
| 1342 | { | |||
| 1343 | trans->linkStates(pattern_start_state, state, 0); | |||
| 1344 | continue; | |||
| 1345 | } | |||
| 1346 | else if(cur.left.name == right_sieve_name) | |||
| 1347 | { | |||
| 1348 | finals.push_back(state); | |||
| 1349 | continue; | |||
| 1350 | } | |||
| 1351 | ||||
| 1352 | bool isLex = isLexiconToken(cur); | |||
| 1353 | ||||
| 1354 | transition_index++; | |||
| 1355 | ||||
| 1356 | int mode_start = state; | |||
| 1357 | cur.mode = Normal; | |||
| 1358 | ||||
| 1359 | tag_filter_t current_tags; | |||
| 1360 | if(tagsAsFlags) | |||
| 1361 | { | |||
| 1362 | state = insertPreTags(trans, state, cur.tag_filter); | |||
| 1363 | current_tags = cur.tag_filter; | |||
| 1364 | cur.tag_filter = tag_filter_t(); | |||
| 1365 | } | |||
| 1366 | else | |||
| 1367 | { | |||
| 1368 | if (i == idx && !cur.tag_filter.combine(tok.tag_filter.pos())) { | |||
| 1369 | if (verbose) { | |||
| 1370 | cerr << "Warning: The tags of " << to_ustring(printPattern(tok)); | |||
| 1371 | cerr << " conflict with " << to_ustring(printPattern(pat.second[i])); | |||
| 1372 | cerr << " on line " << pat.first << "." << endl; | |||
| 1373 | } | |||
| 1374 | } | |||
| 1375 | if (!cur.tag_filter.combine(tok.tag_filter.neg())) { | |||
| 1376 | if (verbose) { | |||
| 1377 | cerr << "Warning: The tags of " << to_ustring(printPattern(tok)); | |||
| 1378 | cerr << " conflict with " << to_ustring(printPattern(pat.second[i])); | |||
| 1379 | cerr << " on line " << pat.first << "." << endl; | |||
| 1380 | } | |||
| 1381 | } | |||
| 1382 | } | |||
| 1383 | ||||
| 1384 | Transducer* t; | |||
| 1385 | if(shouldHypermin) | |||
| 1386 | { | |||
| 1387 | trans_sym_t inflag = getFlag(Positive, tok.left.name, transition_index); | |||
| 1388 | trans_sym_t outflag = getFlag(Require, tok.left.name, transition_index); | |||
| 1389 | int in_tr = (int)alphabet_lookup(inflag, inflag); | |||
| 1390 | int out_tr = (int)alphabet_lookup(outflag, outflag); | |||
| 1391 | if(is_free[i] == -1 && isLex) | |||
| 1392 | { | |||
| 1393 | to_clear.insert(cur.left.name); | |||
| 1394 | to_clear.insert(cur.right.name); | |||
| 1395 | } | |||
| 1396 | if(transducerLocs.find(cur) != transducerLocs.end()) | |||
| 1397 | { | |||
| 1398 | auto loc = transducerLocs[cur]; | |||
| 1399 | if(loc.first == loc.second) | |||
| 1400 | { | |||
| 1401 | t = NULL; | |||
| 1402 | } | |||
| 1403 | else | |||
| 1404 | { | |||
| 1405 | t = trans; | |||
| 1406 | trans->linkStates(state, loc.first, in_tr); | |||
| 1407 | state = trans->insertSingleTransduction(out_tr, loc.second); | |||
| 1408 | } | |||
| 1409 | } | |||
| 1410 | else | |||
| 1411 | { | |||
| 1412 | int start = trans->insertSingleTransduction(in_tr, state); | |||
| 1413 | int end = start; | |||
| 1414 | if(isLex) | |||
| 1415 | { | |||
| 1416 | t = getLexiconTransducerWithFlags(cur, false); | |||
| 1417 | if(t == NULL) | |||
| 1418 | { | |||
| 1419 | transducerLocs[cur] = make_pair(start, start); | |||
| 1420 | } | |||
| 1421 | else | |||
| 1422 | { | |||
| 1423 | end = trans->insertTransducer(start, *t); | |||
| 1424 | transducerLocs[cur] = make_pair(start, end); | |||
| 1425 | } | |||
| 1426 | } | |||
| 1427 | else | |||
| 1428 | { | |||
| 1429 | t = buildPatternWithFlags(cur, start); | |||
| 1430 | end = transducerLocs[cur].second; | |||
| 1431 | } | |||
| 1432 | state = trans->insertSingleTransduction(out_tr, end); | |||
| 1433 | } | |||
| 1434 | } | |||
| 1435 | else if(isLex) | |||
| 1436 | { | |||
| 1437 | t = getLexiconTransducerWithFlags(cur, (is_free[i] == 1)); | |||
| 1438 | if(is_free[i] == -1) | |||
| 1439 | { | |||
| 1440 | to_clear.insert(cur.left.name); | |||
| 1441 | to_clear.insert(cur.right.name); | |||
| 1442 | } | |||
| 1443 | } | |||
| 1444 | else | |||
| 1445 | { | |||
| 1446 | t = buildPatternWithFlags(cur); | |||
| 1447 | } | |||
| 1448 | if(t == NULL || (!shouldHypermin && t->hasNoFinals())) | |||
| 1449 | { | |||
| 1450 | got_null = true; | |||
| 1451 | break; | |||
| 1452 | } | |||
| 1453 | got_non_null = true; | |||
| 1454 | if(!shouldHypermin) | |||
| 1455 | { | |||
| 1456 | state = trans->insertTransducer(state, *t); | |||
| 1457 | } | |||
| 1458 | if(tagsAsFlags) | |||
| 1459 | { | |||
| 1460 | state = insertPostTags(trans, state, current_tags); | |||
| 1461 | } | |||
| 1462 | if(pat.second[i].mode & Optional) | |||
| 1463 | { | |||
| 1464 | trans->linkStates(mode_start, state, 0); | |||
| 1465 | } | |||
| 1466 | if(pat.second[i].mode & Repeated) | |||
| 1467 | { | |||
| 1468 | trans->linkStates(state, mode_start, 0); | |||
| 1469 | } | |||
| 1470 | } | |||
| 1471 | if(!got_null || finals.size() > 0) | |||
| 1472 | { | |||
| 1473 | for(auto fin : finals) | |||
| 1474 | { | |||
| 1475 | trans->linkStates(fin, state, 0); | |||
| 1476 | } | |||
| 1477 | for(auto lex : to_clear) | |||
| 1478 | { | |||
| 1479 | if(lex.empty()) | |||
| 1480 | { | |||
| 1481 | continue; | |||
| 1482 | } | |||
| 1483 | UnicodeString flag = "@C."; | |||
| 1484 | encodeFlag(flag, (int)lex.i); | |||
| 1485 | flag += "@"; | |||
| 1486 | trans_sym_t f = alphabet_lookup(flag); | |||
| 1487 | state = trans->insertSingleTransduction((int)alphabet_lookup(f, f), state); | |||
| 1488 | } | |||
| 1489 | trans->setFinal(state); | |||
| 1490 | pattern_finals.push_back(state); | |||
| 1491 | } | |||
| 1492 | } | |||
| 1493 | if(!got_non_null) | |||
| 1494 | { | |||
| 1495 | continue; | |||
| 1496 | } | |||
| 1497 | did_anything = true; | |||
| 1498 | } | |||
| 1499 | if(did_anything) | |||
| 1500 | { | |||
| 1501 | if(shouldHypermin) | |||
| 1502 | { | |||
| 1503 | if(pattern_finals.size() > 0) | |||
| 1504 | { | |||
| 1505 | int end = pattern_finals[0]; | |||
| 1506 | for(auto fin : pattern_finals) | |||
| 1507 | { | |||
| 1508 | if(fin != end) | |||
| 1509 | { | |||
| 1510 | trans->linkStates(fin, end, 0); | |||
| 1511 | } | |||
| 1512 | if(pattern_start_state != 0) | |||
| 1513 | { | |||
| 1514 | trans->setFinal(fin, 0, false); | |||
| 1515 | } | |||
| 1516 | } | |||
| 1517 | pattern_element_t key = tok; | |||
| 1518 | key.mode = Normal; | |||
| 1519 | transducerLocs[key] = make_pair(pattern_start_state, end); | |||
| 1520 | } | |||
| 1521 | } | |||
| 1522 | else | |||
| 1523 | { | |||
| 1524 | if(!trans->hasNoFinals()) { | |||
| 1525 | if (verbose) | |||
| 1526 | cerr << "Minimizing " << to_ustring(printPattern(tok)) << endl; | |||
| 1527 | trans->minimize(); | |||
| 1528 | } | |||
| 1529 | } | |||
| 1530 | } | |||
| 1531 | else | |||
| 1532 | { | |||
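| | // No pattern line compiled to anything usable. When shouldHypermin is false, | |||
| | // the Transducer allocated at line 1319 is dropped here without delete - the | |||
| | // potential leak of 'trans' flagged by the analyzer. | |||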
| 1533 | if(!shouldHypermin) | |||
| 1534 | trans = NULL; | |||
| 1535 | else | |||
| 1536 | { | |||
| 1537 | cerr << "FIXME" << endl; | |||
| 1538 | } | |||
| 1539 | } | |||
| 1540 | if (verbose) { | |||
| ||||
| 1541 | auto end_time = chrono::steady_clock::now(); | |||
| 1542 | chrono::duration<double> diff = end_time - start_time; | |||
| 1543 | cerr << "Done compiling " << to_ustring(printPattern(tok)); | |||
| 1544 | cerr << " in " << diff.count() << " seconds." << endl; | |||
| 1545 | } | |||
| 1546 | patternTransducers[tok] = trans; | |||
| 1547 | } | |||
| 1548 | else if(patternTransducers[tok] == NULL) | |||
| 1549 | { | |||
| 1550 | die("Cannot compile self-recursive pattern '%S'", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str())); | |||
| 1551 | } | |||
| 1552 | return patternTransducers[tok]; | |||
| 1553 | } | |||
| 1554 | ||||
| 1555 | void | |||
| 1556 | LexdCompiler::buildAllLexicons() | |||
| 1557 | { | |||
| 1558 | // find out if there are any lexicons that we can build without flags | |||
| 1559 | vector<pattern_element_t> lexicons_to_build; | |||
| 1560 | for(auto pattern : patterns) | |||
| 1561 | { | |||
| 1562 | for(auto pat : pattern.second) | |||
| 1563 | { | |||
| 1564 | lineNumber = pat.first; | |||
| 1565 | vector<int> free = determineFreedom(pat.second); | |||
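| | // free[i] == -1 appears to mark tokens whose entry choice must stay | |||
| | // correlated with other tokens in the pattern; such lexicons are recorded | |||
| | // as non-free below and will be compiled with unification flags. | |||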
| 1566 | for(size_t i = 0; i < pat.second.size(); i++) | |||
| 1567 | { | |||
| 1568 | if(pat.second[i].left.name == left_sieve_name || | |||
| 1569 | pat.second[i].left.name == right_sieve_name) | |||
| 1570 | { | |||
| 1571 | continue; | |||
| 1572 | } | |||
| 1573 | if(isLexiconToken(pat.second[i])) | |||
| 1574 | { | |||
| 1575 | pattern_element_t& tok = pat.second[i]; | |||
| 1576 | if(free[i] == -1) | |||
| 1577 | { | |||
| 1578 | lexiconFreedom[tok.left.name] = false; | |||
| 1579 | lexiconFreedom[tok.right.name] = false; | |||
| 1580 | } | |||
| 1581 | else | |||
| 1582 | { | |||
| 1583 | if(lexiconFreedom.find(tok.left.name) == lexiconFreedom.end()) | |||
| 1584 | { | |||
| 1585 | lexiconFreedom[tok.left.name] = true; | |||
| 1586 | } | |||
| 1587 | if(lexiconFreedom.find(tok.right.name) == lexiconFreedom.end()) | |||
| 1588 | { | |||
| 1589 | lexiconFreedom[tok.right.name] = true; | |||
| 1590 | } | |||
| 1591 | } | |||
| 1592 | lexicons_to_build.push_back(tok); | |||
| 1593 | } | |||
| 1594 | } | |||
| 1595 | } | |||
| 1596 | } | |||
| 1597 | lexiconFreedom[string_ref(0)] = true; | |||
| 1598 | for(auto tok : lexicons_to_build) | |||
| 1599 | { | |||
| 1600 | tok.tag_filter = tag_filter_t(); | |||
| 1601 | tok.mode = Normal; | |||
| 1602 | bool free = ((tok.left.name.empty() || lexiconFreedom[tok.left.name]) && | |||
| 1603 | (tok.right.name.empty() || lexiconFreedom[tok.right.name])); | |||
| 1604 | getLexiconTransducerWithFlags(tok, free); | |||
| 1605 | } | |||
| 1606 | } | |||
| 1607 | ||||
| 1608 | int | |||
| 1609 | LexdCompiler::buildPatternSingleLexicon(pattern_element_t tok, int start_state) | |||
| 1610 | { | |||
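| | // Hyperminimized construction: patterns and lexicons are all built into the | |||
| | // shared hyperminTrans. Entry to and exit from each shared lexicon block are | |||
| | // guarded by a paired Positive/Require flag on the reserved " " feature, and | |||
| | // Clear flags reset unification state afterwards. Returns the end state of | |||
| | // the pattern, or -1 if no pattern line could be completed. | |||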
| 1611 | if(patternTransducers.find(tok) == patternTransducers.end() || patternTransducers[tok] != NULL) | |||
| 1612 | { | |||
| 1613 | patternTransducers[tok] = NULL; | |||
| 1614 | int end = -1; | |||
| 1615 | string_ref transition_flag = internName(" "); | |||
| 1616 | for(auto pattern : patterns[tok.left.name]) | |||
| 1617 | { | |||
| 1618 | int next_start_state = start_state; | |||
| 1619 | size_t next_start_idx = 0; | |||
| 1620 | lineNumber = pattern.first; | |||
| 1621 | set<string_ref> to_clear; | |||
| 1622 | size_t count = (tok.tag_filter.pos().empty() ? 1 : pattern.second.size()); | |||
| 1623 | for(size_t tag_idx = 0; tag_idx < count; tag_idx++) | |||
| 1624 | { | |||
| 1625 | int state = next_start_state; | |||
| 1626 | bool finished = true; | |||
| 1627 | for(size_t i = next_start_idx; i < pattern.second.size(); i++) | |||
| 1628 | { | |||
| 1629 | pattern_element_t cur = pattern.second[i]; | |||
| 1630 | ||||
| 1631 | if(cur.left.name == left_sieve_name) | |||
| 1632 | { | |||
| 1633 | hyperminTrans->linkStates(start_state, state, 0); | |||
| 1634 | continue; | |||
| 1635 | } | |||
| 1636 | else if(cur.left.name == right_sieve_name) | |||
| 1637 | { | |||
| 1638 | if(end == -1) | |||
| 1639 | { | |||
| 1640 | end = hyperminTrans->insertNewSingleTransduction(0, state); | |||
| 1641 | } | |||
| 1642 | else | |||
| 1643 | { | |||
| 1644 | hyperminTrans->linkStates(state, end, 0); | |||
| 1645 | } | |||
| 1646 | continue; | |||
| 1647 | } | |||
| 1648 | ||||
| 1649 | if(i == tag_idx) | |||
| 1650 | { | |||
| 1651 | next_start_state = state; | |||
| 1652 | next_start_idx = tag_idx; | |||
| 1653 | cur.tag_filter.combine(tok.tag_filter.pos()); | |||
| 1654 | } | |||
| 1655 | cur.tag_filter.combine(tok.tag_filter.neg()); | |||
| 1656 | ||||
| 1657 | int mode_state = state; | |||
| 1658 | ||||
| 1659 | if(isLexiconToken(cur)) | |||
| 1660 | { | |||
| 1661 | tags_t tags = cur.tag_filter.tags(); | |||
| 1662 | for(auto tag : tags) | |||
| 1663 | { | |||
| 1664 | trans_sym_t flag = getFlag(Clear, tag, 0); | |||
| 1665 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
| 1666 | } | |||
| 1667 | pattern_element_t untagged = cur; | |||
| 1668 | untagged.tag_filter = tag_filter_t(); | |||
| 1669 | untagged.mode = Normal; | |||
| 1670 | bool free = (lexiconFreedom[cur.left.name] && lexiconFreedom[cur.right.name]); | |||
| 1671 | if(!free) | |||
| 1672 | { | |||
| 1673 | to_clear.insert(cur.left.name); | |||
| 1674 | to_clear.insert(cur.right.name); | |||
| 1675 | } | |||
| 1676 | trans_sym_t inflag = getFlag(Positive, transition_flag, transitionCount); | |||
| 1677 | trans_sym_t outflag = getFlag(Require, transition_flag, transitionCount); | |||
| 1678 | transitionCount++; | |||
| 1679 | if(transducerLocs.find(untagged) == transducerLocs.end()) | |||
| 1680 | { | |||
| 1681 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(inflag, inflag), state); | |||
| 1682 | Transducer* lex = getLexiconTransducerWithFlags(untagged, free); | |||
| 1683 | int start = state; | |||
| 1684 | state = hyperminTrans->insertTransducer(state, *lex); | |||
| 1685 | transducerLocs[untagged] = make_pair(start, state); | |||
| 1686 | } | |||
| 1687 | else | |||
| 1688 | { | |||
| 1689 | auto loc = transducerLocs[untagged]; | |||
| 1690 | hyperminTrans->linkStates(state, loc.first, (int)alphabet_lookup(inflag, inflag)); | |||
| 1691 | state = loc.second; | |||
| 1692 | } | |||
| 1693 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(outflag, outflag), state); | |||
| 1694 | for(auto tag : cur.tag_filter.pos()) | |||
| 1695 | { | |||
| 1696 | trans_sym_t flag = getFlag(Require, tag, 1); | |||
| 1697 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
| 1698 | } | |||
| 1699 | for(auto tag : cur.tag_filter.neg()) | |||
| 1700 | { | |||
| 1701 | trans_sym_t flag = getFlag(Disallow, tag, 1); | |||
| 1702 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
| 1703 | } | |||
| 1704 | } | |||
| 1705 | else | |||
| 1706 | { | |||
| 1707 | state = buildPatternSingleLexicon(cur, state); | |||
| 1708 | if(state == -1) | |||
| 1709 | { | |||
| 1710 | finished = false; | |||
| 1711 | break; | |||
| 1712 | } | |||
| 1713 | } | |||
| 1714 | ||||
| 1715 | if(cur.mode & Optional) | |||
| 1716 | { | |||
| 1717 | hyperminTrans->linkStates(mode_state, state, 0); | |||
| 1718 | } | |||
| 1719 | if(cur.mode & Repeated) | |||
| 1720 | { | |||
| 1721 | hyperminTrans->linkStates(state, mode_state, 0); | |||
| 1722 | } | |||
| 1723 | } | |||
| 1724 | if(finished) | |||
| 1725 | { | |||
| 1726 | for(auto lex : to_clear) | |||
| 1727 | { | |||
| 1728 | if(lex.empty()) | |||
| 1729 | { | |||
| 1730 | continue; | |||
| 1731 | } | |||
| 1732 | trans_sym_t flag = getFlag(Clear, lex, 0); | |||
| 1733 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
| 1734 | } | |||
| 1735 | if(end == -1) | |||
| 1736 | { | |||
| 1737 | end = state; | |||
| 1738 | } | |||
| 1739 | else | |||
| 1740 | { | |||
| 1741 | hyperminTrans->linkStates(state, end, 0); | |||
| 1742 | } | |||
| 1743 | } | |||
| 1744 | } | |||
| 1745 | } | |||
| 1746 | patternTransducers.erase(tok); | |||
| 1747 | return end; | |||
| 1748 | } | |||
| 1749 | else | |||
| 1750 | { | |||
| 1751 | die("Cannot compile self-recursive pattern '%S'", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str())); | |||
| 1752 | return 0; | |||
| 1753 | } | |||
| 1754 | } | |||
| 1755 | ||||
| 1756 | void | |||
| 1757 | LexdCompiler::readFile(UFILE* infile) | |||
| 1758 | { | |||
| 1759 | input = infile; | |||
| 1760 | doneReading = false; | |||
| 1761 | while(!u_feof(input)) | |||
| 1762 | { | |||
| 1763 | processNextLine(); | |||
| 1764 | if(doneReading) break; | |||
| 1765 | } | |||
| 1766 | finishLexicon(); | |||
| 1767 | } | |||
| 1768 | ||||
| 1769 | Transducer* | |||
| 1770 | LexdCompiler::buildTransducer(bool usingFlags) | |||
| 1771 | { | |||
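| | // Compilation starts from the internal top-level pattern, whose name is a | |||
| | // single space; space-initial names are reserved for anonymous/internal | |||
| | // entries (cf. printStatistics below). | |||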
| 1772 | token_t start_tok = {.name = internName(" "), .part = 1, .optional = false}; | |||
| 1773 | pattern_element_t start_pat = {.left=start_tok, .right=start_tok, | |||
| 1774 | .tag_filter=tag_filter_t(), | |||
| 1775 | .mode=Normal}; | |||
| 1776 | if(usingFlags) | |||
| 1777 | { | |||
| 1778 | if(shouldHypermin) | |||
| 1779 | { | |||
| 1780 | hyperminTrans = new Transducer(); | |||
| 1781 | } | |||
| 1782 | Transducer *t = buildPatternWithFlags(start_pat); | |||
| 1783 | if(shouldHypermin) | |||
| 1784 | t->minimize(); | |||
| 1785 | return t; | |||
| 1786 | } | |||
| 1787 | else return buildPattern(start_pat); | |||
| 1788 | } | |||
| 1789 | ||||
| 1790 | Transducer* | |||
| 1791 | LexdCompiler::buildTransducerSingleLexicon() | |||
| 1792 | { | |||
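| | // Single-lexicon (hyperminimization) mode: build every lexicon eagerly, then | |||
| | // thread all patterns through the one shared hyperminTrans and minimize it. | |||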
| 1793 | tagsAsMinFlags = true; | |||
| 1794 | token_t start_tok = {.name = internName(" "), .part = 1, .optional = false}; | |||
| 1795 | pattern_element_t start_pat = {.left=start_tok, .right=start_tok, | |||
| 1796 | .tag_filter=tag_filter_t(), | |||
| 1797 | .mode=Normal}; | |||
| 1798 | hyperminTrans = new Transducer(); | |||
| 1799 | buildAllLexicons(); | |||
| 1800 | int end = buildPatternSingleLexicon(start_pat, 0); | |||
| 1801 | if(end == -1) | |||
| 1802 | { | |||
| 1803 | cerr << "WARNING: No non-empty patterns found." << endl; | |||
| 1804 | } | |||
| 1805 | else { | |||
| 1806 | hyperminTrans->setFinal(end); | |||
| 1807 | hyperminTrans->minimize(); | |||
| 1808 | } | |||
| 1809 | return hyperminTrans; | |||
| 1810 | } | |||
| 1811 | ||||
| 1812 | void expand_alternation(vector<pattern_t> &pats, const vector<pattern_element_t> &alternation) | |||
| 1813 | { | |||
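| | // Cartesian extension: appends each token of the alternation to a copy of | |||
| | // every existing partial pattern, growing pats by a factor of | |||
| | // alternation.size(). | |||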
| 1814 | if(alternation.empty()) | |||
| 1815 | return; | |||
| 1816 | if(pats.empty()) | |||
| 1817 | pats.push_back(pattern_t()); | |||
| 1818 | vector<pattern_t> new_pats; | |||
| 1819 | for(const auto &pat: pats) | |||
| 1820 | { | |||
| 1821 | for(const auto &tok: alternation) | |||
| 1822 | { | |||
| 1823 | auto pat1 = pat; | |||
| 1824 | pat1.push_back(tok); | |||
| 1825 | new_pats.push_back(pat1); | |||
| 1826 | } | |||
| 1827 | } | |||
| 1828 | pats = new_pats; | |||
| 1829 | } | |||
| 1830 | ||||
| 1831 | void | |||
| 1832 | LexdCompiler::insertEntry(Transducer* trans, const lex_seg_t &seg) | |||
| 1833 | { | |||
| 1834 | int state = trans->getInitial(); | |||
| 1835 | if(tagsAsFlags) | |||
| 1836 | { | |||
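| | // Per tag, two alternative paths: consume @R.NAME.2@ then @C.NAME@, or skip | |||
| | // both via the epsilon link; every path then passes @D.NAME.3@, which fails | |||
| | // if the tag's flag is still set to that value by the surrounding pattern. | |||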
| 1837 | for(string_ref tag : seg.tags) | |||
| 1838 | { | |||
| 1839 | trans_sym_t check1 = getFlag(Require, tag, 1); | |||
| 1840 | trans_sym_t check2 = getFlag(Disallow, tag, 2); | |||
| 1841 | trans_sym_t clear = getFlag(Clear, tag, 0); | |||
| 1842 | int state2 = trans->insertSingleTransduction((int)alphabet_lookup(check1, check1), state); | |||
| 1843 | int state3 = trans->insertSingleTransduction((int)alphabet_lookup(clear, clear), state2); | |||
| 1844 | trans->linkStates(state, state3, 0); | |||
| 1845 | state = trans->insertSingleTransduction((int)alphabet_lookup(check2, check2), state3); | |||
| 1846 | } | |||
| 1847 | } | |||
| 1848 | else if(tagsAsMinFlags) | |||
| 1849 | { | |||
| 1850 | for(string_ref tag : seg.tags) | |||
| 1851 | { | |||
| 1852 | trans_sym_t flag = getFlag(Positive, tag, 1); | |||
| 1853 | state = trans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
| 1854 | } | |||
| 1855 | } | |||
| 1856 | if (seg.regex != nullptr) { | |||
| 1857 | state = trans->insertTransducer(state, *seg.regex); | |||
| 1858 | } | |||
| 1859 | if(!shouldAlign) | |||
| 1860 | { | |||
| 1861 | for(unsigned int i = 0; i < seg.left.symbols.size() || i < seg.right.symbols.size(); i++) | |||
| 1862 | { | |||
| 1863 | trans_sym_t l = (i < seg.left.symbols.size()) ? seg.left.symbols[i] : trans_sym_t(); | |||
| 1864 | trans_sym_t r = (i < seg.right.symbols.size()) ? seg.right.symbols[i] : trans_sym_t(); | |||
| 1865 | state = trans->insertSingleTransduction(alphabet((int)l, (int)r), state); | |||
| 1866 | } | |||
| 1867 | } | |||
| 1868 | else | |||
| 1869 | { | |||
| 1870 | /* | |||
| 1871 | This code is adapted from hfst/libhfst/src/parsers/lexc-utils.cc | |||
| 1872 | It uses the Levenshtein distance algorithm to determine the optimal | |||
| 1873 | alignment of two strings. | |||
| 1874 | In hfst-lexc, the priority order for ties is SUB > DEL > INS | |||
| 1875 | which ensures that 000abc:xyz000 is preferred over abc000:000xyz | |||
| 1876 | However, we're traversing the strings backwards to simplify extracting | |||
| 1877 | the final alignment, so we need to switch INS and DEL. | |||
| 1878 | If shouldCompress is true, we set the cost of SUB to 1 in order to prefer | |||
| 1879 | a:b over 0:b a:0 without changing the alignment of actual correspondences. | |||
| 1880 | */ | |||
| 1881 | const unsigned int INS = 0; | |||
| 1882 | const unsigned int DEL = 1; | |||
| 1883 | const unsigned int SUB = 2; | |||
| 1884 | const unsigned int ins_cost = 1; | |||
| 1885 | const unsigned int del_cost = 1; | |||
| 1886 | const unsigned int sub_cost = (shouldCompress ? 1 : 100); | |||
| 1887 | ||||
| 1888 | const unsigned int len1 = seg.left.symbols.size(); | |||
| 1889 | const unsigned int len2 = seg.right.symbols.size(); | |||
| 1890 | unsigned int cost[len1+1][len2+1]; | |||
| 1891 | unsigned int path[len1+1][len2+1]; | |||
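| | // cost and path are (len1+1) x (len2+1) dynamic-programming tables, filled | |||
| | // from the ends of the strings inward. Note that these are variable-length | |||
| | // arrays, a GCC/Clang extension rather than standard C++. | |||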
| 1892 | cost[0][0] = 0; | |||
| 1893 | path[0][0] = 0; | |||
| 1894 | for(unsigned int i = 1; i <= len1; i++) | |||
| 1895 | { | |||
| 1896 | cost[i][0] = del_cost * i; | |||
| 1897 | path[i][0] = DEL; | |||
| 1898 | } | |||
| 1899 | for(unsigned int i = 1; i <= len2; i++) | |||
| 1900 | { | |||
| 1901 | cost[0][i] = ins_cost * i; | |||
| 1902 | path[0][i] = INS; | |||
| 1903 | } | |||
| 1904 | ||||
| 1905 | for(unsigned int i = 1; i <= len1; i++) | |||
| 1906 | { | |||
| 1907 | for(unsigned int j = 1; j <= len2; j++) | |||
| 1908 | { | |||
| 1909 | unsigned int sub = cost[i-1][j-1] + (seg.left.symbols[len1-i] == seg.right.symbols[len2-j] ? 0 : sub_cost); | |||
| 1910 | unsigned int ins = cost[i][j-1] + ins_cost; | |||
| 1911 | unsigned int del = cost[i-1][j] + del_cost; | |||
| 1912 | ||||
| 1913 | if(sub <= ins && sub <= del) | |||
| 1914 | { | |||
| 1915 | cost[i][j] = sub; | |||
| 1916 | path[i][j] = SUB; | |||
| 1917 | } | |||
| 1918 | else if(ins <= del) | |||
| 1919 | { | |||
| 1920 | cost[i][j] = ins; | |||
| 1921 | path[i][j] = INS; | |||
| 1922 | } | |||
| 1923 | else | |||
| 1924 | { | |||
| 1925 | cost[i][j] = del; | |||
| 1926 | path[i][j] = DEL; | |||
| 1927 | } | |||
| 1928 | } | |||
| 1929 | } | |||
| 1930 | ||||
| 1931 | for(unsigned int x = len1, y = len2; (x > 0) || (y > 0);) | |||
| 1932 | { | |||
| 1933 | trans_sym_t symbol; | |||
| 1934 | switch(path[x][y]) | |||
| 1935 | { | |||
| 1936 | case SUB: | |||
| 1937 | symbol = alphabet_lookup(seg.left.symbols[len1-x], seg.right.symbols[len2-y]); | |||
| 1938 | x--; | |||
| 1939 | y--; | |||
| 1940 | break; | |||
| 1941 | case INS: | |||
| 1942 | symbol = alphabet_lookup(trans_sym_t(), seg.right.symbols[len2-y]); | |||
| 1943 | y--; | |||
| 1944 | break; | |||
| 1945 | default: // DEL | |||
| 1946 | symbol = alphabet_lookup(seg.left.symbols[len1-x], trans_sym_t()); | |||
| 1947 | x--; | |||
| 1948 | } | |||
| 1949 | state = trans->insertSingleTransduction((int)symbol, state); | |||
| 1950 | } | |||
| 1951 | } | |||
| 1952 | trans->setFinal(state); | |||
| 1953 | } | |||
| 1954 | ||||
| 1955 | void | |||
| 1956 | LexdCompiler::applyMode(Transducer* trans, RepeatMode mode) | |||
| 1957 | { | |||
| 1958 | if(mode == Question) | |||
| 1959 | trans->optional(); | |||
| 1960 | else if(mode == Star) | |||
| 1961 | trans->zeroOrMore(); | |||
| 1962 | else if(mode == Plus) | |||
| 1963 | trans->oneOrMore(); | |||
| 1964 | } | |||
| 1965 | ||||
| 1966 | Transducer* | |||
| 1967 | LexdCompiler::getLexiconTransducer(pattern_element_t tok, unsigned int entry_index, bool free) | |||
| 1968 | { | |||
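| | // Flagless construction: one transducer per lexicon entry, cached in | |||
| | // entryTransducers, unless the lexicon is free, in which case all entries | |||
| | // are merged into a single transducer cached in lexiconTransducers. | |||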
| 1969 | if(!free && entryTransducers.find(tok) != entryTransducers.end()) | |||
| 1970 | return entryTransducers[tok][entry_index]; | |||
| 1971 | if(free && lexiconTransducers.find(tok) != lexiconTransducers.end()) | |||
| 1972 | return lexiconTransducers[tok]; | |||
| 1973 | ||||
| 1974 | vector<entry_t>& lents = lexicons[tok.left.name]; | |||
| 1975 | if(tok.left.name.valid() && tok.left.part > lents[0].size()) | |||
| 1976 | die("%S(%d) - part is out of range", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), tok.left.part); | |||
| 1977 | vector<entry_t>& rents = lexicons[tok.right.name]; | |||
| 1978 | if(tok.right.name.valid() && tok.right.part > rents[0].size()) | |||
| 1979 | die("%S(%d) - part is out of range", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), tok.right.part); | |||
| 1980 | if(tok.left.name.valid() && tok.right.name.valid() && lents.size() != rents.size()) | |||
| 1981 | die("Cannot collate %S with %S - differing numbers of entries", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
| 1982 | unsigned int count = (tok.left.name.valid() ? lents.size() : rents.size()); | |||
| 1983 | vector<Transducer*> trans; | |||
| 1984 | if(free) | |||
| 1985 | trans.push_back(new Transducer()); | |||
| 1986 | else | |||
| 1987 | trans.reserve(count); | |||
| 1988 | lex_seg_t empty; | |||
| 1989 | bool did_anything = false; | |||
| 1990 | for(unsigned int i = 0; i < count; i++) | |||
| 1991 | { | |||
| 1992 | lex_seg_t& le = (tok.left.name.valid() ? lents[i][tok.left.part-1] : empty); | |||
| 1993 | lex_seg_t& re = (tok.right.name.valid() ? rents[i][tok.right.part-1] : empty); | |||
| 1994 | tags_t tags = unionset(le.tags, re.tags); | |||
| 1995 | if(!tok.tag_filter.compatible(tags)) | |||
| 1996 | { | |||
| 1997 | if(!free) | |||
| 1998 | trans.push_back(NULL); | |||
| 1999 | continue; | |||
| 2000 | } | |||
| 2001 | Transducer* t = free ? trans[0] : new Transducer(); | |||
| 2002 | if (le.regex != nullptr || re.regex != nullptr) { | |||
| 2003 | if (tok.left.name.empty()) | |||
| 2004 | die("Cannot use %S one-sided - it contains a regex", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
| 2005 | if (tok.right.name.empty()) | |||
| 2006 | die("Cannot use %S one-sided - it contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str())); | |||
| 2007 | if (tok.left.name != tok.right.name) | |||
| 2008 | die("Cannot collate %S with %S - %S contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), err(name((le.regex != nullptr ? tok.left.name : tok.right.name)))(to_ustring(name((le.regex != nullptr ? tok.left.name : tok.right .name))).c_str())); | |||
| 2009 | } | |||
| 2010 | insertEntry(t, {.left=le.left, .right=re.right, .regex=le.regex, .tags=tags}); | |||
| 2011 | did_anything = true; | |||
| 2012 | if(!free) | |||
| 2013 | { | |||
| 2014 | applyMode(t, tok.mode); | |||
| 2015 | trans.push_back(t); | |||
| 2016 | } | |||
| 2017 | } | |||
| 2018 | if(tok.optional()) { | |||
| 2019 | Transducer* t = free ? trans[0] : new Transducer(); | |||
| 2020 | tags_t empty_tags; | |||
| 2021 | insertEntry(t, {.left=empty.left, .right=empty.right, .regex=nullptr, .tags=empty_tags}); | |||
| 2022 | did_anything = true; | |||
| 2023 | if (!free) { | |||
| 2024 | applyMode(t, tok.mode); | |||
| 2025 | trans.push_back(t); | |||
| 2026 | } | |||
| 2028 | } | |||
| 2029 | if(free) | |||
| 2030 | { | |||
| 2031 | if(!did_anything) | |||
| 2032 | { | |||
| 2033 | trans[0] = NULL; | |||
| 2034 | } | |||
| 2035 | if(trans[0]) | |||
| 2036 | { | |||
| 2037 | trans[0]->minimize(); | |||
| 2038 | applyMode(trans[0], tok.mode); | |||
| 2039 | } | |||
| 2040 | lexiconTransducers[tok] = trans[0]; | |||
| 2041 | return trans[0]; | |||
| 2042 | } | |||
| 2043 | else | |||
| 2044 | { | |||
| 2045 | entryTransducers[tok] = trans; | |||
| 2046 | return trans[entry_index]; | |||
| 2047 | } | |||
| 2048 | } | |||
| 2049 | ||||
| 2050 | void | |||
| 2051 | LexdCompiler::encodeFlag(UnicodeString& str, int flag) | |||
| 2052 | { | |||
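| | // Appends flag as a least-significant-digit-first base-26 string: 1 -> "B", | |||
| | // 26 -> "AB", 27 -> "BB". A value of 0 appends nothing, which is why | |||
| | // getFlag() encodes value + 1 below. | |||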
| 2053 | UnicodeString letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; | |||
| 2054 | int num = flag; | |||
| 2055 | while(num > 0) | |||
| 2056 | { | |||
| 2057 | str += letters[num % 26]; | |||
| 2058 | num /= 26; | |||
| 2059 | } | |||
| 2060 | } | |||
| 2061 | ||||
| 2062 | trans_sym_t | |||
| 2063 | LexdCompiler::getFlag(FlagDiacriticType type, string_ref flag, unsigned int value) | |||
| 2064 | { | |||
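| | // Builds a flag-diacritic symbol of the form @OP.NAME.VALUE@ (@C.NAME@ for | |||
| | // Clear), with NAME and VALUE base-26-encoded; e.g. Require on the tag with | |||
| | // id 2 and value 1 yields "@R.C.C@". | |||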
| 2065 | //cerr << "getFlag(" << type << ", " << to_ustring(name(flag)) << ", " << value << ")" << endl; | |||
| 2066 | UnicodeString flagstr = "@"; | |||
| 2067 | switch(type) | |||
| 2068 | { | |||
| 2069 | case Unification: | |||
| 2070 | //cerr << " Unification" << endl; | |||
| 2071 | flagstr += "U."; break; | |||
| 2072 | case Positive: | |||
| 2073 | //cerr << " Positive" << endl; | |||
| 2074 | flagstr += "P."; break; | |||
| 2075 | case Negative: | |||
| 2076 | //cerr << " Negative" << endl; | |||
| 2077 | flagstr += "N."; break; | |||
| 2078 | case Require: | |||
| 2079 | //cerr << " Require" << endl; | |||
| 2080 | flagstr += "R."; break; | |||
| 2081 | case Disallow: | |||
| 2082 | //cerr << " Disallow" << endl; | |||
| 2083 | flagstr += "D."; break; | |||
| 2084 | case Clear: | |||
| 2085 | //cerr << " Clear" << endl; | |||
| 2086 | flagstr += "C."; break; | |||
| 2087 | } | |||
| 2088 | encodeFlag(flagstr, (int)flag.i); | |||
| 2089 | if(type != Clear) | |||
| 2090 | { | |||
| 2091 | flagstr += "."; | |||
| 2092 | encodeFlag(flagstr, (int)(value + 1)); | |||
| 2093 | } | |||
| 2094 | flagstr += "@"; | |||
| 2095 | return alphabet_lookup(flagstr); | |||
| 2096 | } | |||
| 2097 | ||||
| 2098 | Transducer* | |||
| 2099 | LexdCompiler::getLexiconTransducerWithFlags(pattern_element_t& tok, bool free) | |||
| 2100 | { | |||
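| | // Flag-based construction: all entries share one transducer, with a paired | |||
| | // @U.NAME.INDEX@ unification flag emitted on both sides of each non-free | |||
| | // entry so that separate occurrences of a lexicon select the same entry. | |||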
| 2101 | if(!free && entryTransducers.find(tok) != entryTransducers.end()) | |||
| 2102 | return entryTransducers[tok][0]; | |||
| 2103 | if(free && lexiconTransducers.find(tok) != lexiconTransducers.end()) | |||
| 2104 | return lexiconTransducers[tok]; | |||
| 2105 | ||||
| 2106 | // TODO: can this be abstracted from here and getLexiconTransducer()? | |||
| 2107 | vector<entry_t>& lents = lexicons[tok.left.name]; | |||
| 2108 | if(tok.left.name.valid() && tok.left.part > lents[0].size()) | |||
| 2109 | die("%S(%d) - part is out of range", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), tok.left.part); | |||
| 2110 | vector<entry_t>& rents = lexicons[tok.right.name]; | |||
| 2111 | if(tok.right.name.valid() && tok.right.part > rents[0].size()) | |||
| 2112 | die("%S(%d) - part is out of range", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), tok.right.part); | |||
| 2113 | if(tok.left.name.valid() && tok.right.name.valid() && lents.size() != rents.size()) | |||
| 2114 | die("Cannot collate %S with %S - differing numbers of entries", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
| 2115 | unsigned int count = (tok.left.name.valid() ? lents.size() : rents.size()); | |||
| 2116 | Transducer* trans = new Transducer(); | |||
| 2117 | lex_seg_t empty; | |||
| 2118 | bool did_anything = false; | |||
| 2119 | for(unsigned int i = 0; i < count; i++) | |||
| 2120 | { | |||
| 2121 | lex_seg_t& le = (tok.left.name.valid() ? lents[i][tok.left.part-1] : empty); | |||
| 2122 | lex_seg_t& re = (tok.right.name.valid() ? rents[i][tok.right.part-1] : empty); | |||
| 2123 | tags_t tags = unionset(le.tags, re.tags); | |||
| 2124 | if(!tok.tag_filter.compatible(tags)) | |||
| 2125 | { | |||
| 2126 | continue; | |||
| 2127 | } | |||
| 2128 | did_anything = true; | |||
| 2129 | lex_seg_t seg; | |||
| 2130 | if (le.regex != nullptr || re.regex != nullptr) { | |||
| 2131 | if (tok.left.name.empty()) | |||
| 2132 | die("Cannot use %S one-sided - it contains a regex", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
| 2133 | if (tok.right.name.empty()) | |||
| 2134 | die("Cannot use %S one-sided - it contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str())); | |||
| 2135 | if (tok.left.name != tok.right.name) | |||
| 2136 | die("Cannot collate %S with %S - %S contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), err(name((le.regex != nullptr ? tok.left.name : tok.right.name)))(to_ustring(name((le.regex != nullptr ? tok.left.name : tok.right .name))).c_str())); | |||
| 2137 | seg.regex = le.regex; | |||
| 2138 | } | |||
| 2139 | if(!free && tok.left.name.valid()) | |||
| 2140 | { | |||
| 2141 | trans_sym_t flag = getFlag(Unification, tok.left.name, i); | |||
| 2142 | seg.left.symbols.push_back(flag); | |||
| 2143 | seg.right.symbols.push_back(flag); | |||
| 2144 | } | |||
| 2145 | if(!free && tok.right.name.valid() && tok.right.name != tok.left.name) | |||
| 2146 | { | |||
| 2147 | trans_sym_t flag = getFlag(Unification, tok.right.name, i); | |||
| 2148 | seg.left.symbols.push_back(flag); | |||
| 2149 | seg.right.symbols.push_back(flag); | |||
| 2150 | } | |||
| 2151 | if(tok.left.name.valid()) | |||
| 2152 | { | |||
| 2153 | seg.left.symbols.insert(seg.left.symbols.end(), le.left.symbols.begin(), le.left.symbols.end()); | |||
| 2154 | } | |||
| 2155 | if(tok.right.name.valid()) | |||
| 2156 | { | |||
| 2157 | seg.right.symbols.insert(seg.right.symbols.end(), re.right.symbols.begin(), re.right.symbols.end()); | |||
| 2158 | } | |||
| 2159 | seg.tags.insert(tags.begin(), tags.end()); | |||
| 2160 | insertEntry(trans, seg); | |||
| 2161 | } | |||
| 2162 | if(tok.optional()) { | |||
| 2163 | lex_seg_t seg; | |||
| 2164 | if (!free && tok.left.name.valid()) { | |||
| 2165 | trans_sym_t flag = getFlag(Unification, tok.left.name, count); | |||
| 2166 | seg.left.symbols.push_back(flag); | |||
| 2167 | seg.right.symbols.push_back(flag); | |||
| 2168 | } | |||
| 2169 | if (!free && tok.right.name.valid() && tok.right.name != tok.left.name) { | |||
| 2170 | trans_sym_t flag = getFlag(Unification, tok.right.name, count); | |||
| 2171 | seg.left.symbols.push_back(flag); | |||
| 2172 | seg.right.symbols.push_back(flag); | |||
| 2173 | } | |||
| 2174 | insertEntry(trans, seg); | |||
| 2175 | } | |||
| 2176 | if(did_anything) | |||
| 2177 | { | |||
| 2178 | trans->minimize(); | |||
| 2179 | applyMode(trans, tok.mode); | |||
| 2180 | } | |||
| 2181 | else | |||
| 2182 | { | |||
| 2183 | trans = NULL__null; | |||
| 2184 | } | |||
| 2185 | if(free) | |||
| 2186 | { | |||
| 2187 | lexiconTransducers[tok] = trans; | |||
| 2188 | } | |||
| 2189 | else | |||
| 2190 | { | |||
| 2191 | entryTransducers[tok] = vector<Transducer*>(1, trans); | |||
| 2192 | } | |||
| 2193 | return trans; | |||
| 2194 | } | |||
| 2195 | ||||
| 2196 | void | |||
| 2197 | LexdCompiler::printStatistics() const | |||
| 2198 | { | |||
| 2199 | cerr << "Lexicons: " << lexicons.size() << endl; | |||
| 2200 | cerr << "Lexicon entries: "; | |||
| 2201 | unsigned int x = 0; | |||
| 2202 | for(const auto &lex: lexicons) | |||
| 2203 | x += lex.second.size(); | |||
| 2204 | cerr << x << endl; | |||
| 2205 | x = 0; | |||
| 2206 | cerr << "Patterns: " << patterns.size() << endl; | |||
| 2207 | cerr << "Pattern entries: "; | |||
| 2208 | for(const auto &pair: patterns) | |||
| 2209 | x += pair.second.size(); | |||
| 2210 | cerr << x << endl; | |||
| 2211 | cerr << endl; | |||
| 2212 | cerr << "Counts for individual lexicons:" << endl; | |||
| 2213 | unsigned int anon = 0; | |||
| 2214 | for(const auto &lex: lexicons) | |||
| 2215 | { | |||
| 2216 | if(empty(lex.first)) continue; | |||
| 2217 | UString n = to_ustring(name(lex.first)); | |||
| 2218 | if(n[0] == ' ') anon += lex.second.size(); | |||
| 2219 | else cerr << n << ": " << lex.second.size() << endl; | |||
| 2220 | } | |||
| 2221 | cerr << "All anonymous lexicons: " << anon << endl; | |||
| 2222 | } |