File: | lexdcompiler.cc |
Warning: | line 1540, column 9 Potential leak of memory pointed to by 'trans' |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | #include "lexdcompiler.h" | |||
2 | #include <unicode/unistr.h> | |||
3 | #include <memory> | |||
4 | #include <chrono> | |||
5 | #include <lttoolbox/string_utils.h> | |||
6 | ||||
7 | using namespace icu; | |||
8 | using namespace std; | |||
9 | ||||
10 | bool tag_filter_t::combinable(const tag_filter_t &other) const | |||
11 | { | |||
12 | // TODO: make ops combinable even with non-empty filters? | |||
13 | if(empty() || other.empty()) | |||
14 | return true; | |||
15 | return intersectset(pos(), other.neg()).empty() && intersectset(other.pos(), neg()).empty() && ops().empty() && other.ops().empty(); | |||
16 | } | |||
17 | bool tag_filter_t::combine(const tag_filter_t &other) | |||
18 | { | |||
19 | if(!combinable(other)) | |||
20 | return false; | |||
21 | unionset_inplace(_pos, other._pos); | |||
22 | unionset_inplace(_neg, other._neg); | |||
23 | for(const auto &op: other._ops) | |||
24 | _ops.push_back(op); | |||
25 | return true; | |||
26 | } | |||
27 | ||||
28 | void expand_alternation(vector<pattern_t> &pats, const vector<pattern_element_t> &alternation); | |||
29 | vector<pattern_element_t> distribute_tag_expressions(const pattern_element_t &token) | |||
30 | { | |||
31 | vector<pattern_element_t> result; | |||
32 | for(const auto &f: token.tag_filter.distribute()) | |||
33 | { | |||
34 | pattern_element_t new_token = token; | |||
35 | new_token.tag_filter = f; | |||
36 | result.push_back(new_token); | |||
37 | } | |||
38 | return result; | |||
39 | } | |||
40 | ||||
41 | bool tag_filter_t::compatible(const tags_t &tags) const | |||
42 | { | |||
43 | return subset(pos(), tags) && intersectset(neg(), tags).empty() && ops().empty(); | |||
44 | } | |||
45 | bool tag_filter_t::applicable(const tags_t &tags) const | |||
46 | { | |||
47 | return subset(neg(), tags) && ops().empty(); | |||
48 | } | |||
49 | bool tag_filter_t::try_apply(tags_t &tags) const | |||
50 | { | |||
51 | if(!applicable(tags)) | |||
52 | return false; | |||
53 | subtractset_inplace(tags, neg()); | |||
54 | unionset_inplace(tags, pos()); | |||
55 | return true; | |||
56 | } | |||
57 | bool pattern_element_t::compatible(const lex_seg_t &tok) const | |||
58 | { | |||
59 | return left.name.empty() || right.name.empty() || tag_filter.compatible(tok.tags); | |||
60 | } | |||
61 | const UnicodeString &LexdCompiler::name(string_ref r) const | |||
62 | { | |||
63 | return id_to_name[(unsigned int)r]; | |||
64 | } | |||
65 | ||||
66 | UnicodeString LexdCompiler::printPattern(const pattern_element_t& pat) | |||
67 | { | |||
68 | UnicodeString ret = name(pat.left.name); | |||
69 | auto& pos = pat.tag_filter.pos(); | |||
70 | auto& neg = pat.tag_filter.neg(); | |||
71 | if (!pos.empty() || !neg.empty()) { | |||
72 | ret += '['; | |||
73 | int ln = ret.length(); | |||
74 | for (auto& it : pos) { | |||
75 | if (ret.length() > ln) ret += ','; | |||
76 | ret += name(it); | |||
77 | } | |||
78 | for (auto& it : neg) { | |||
79 | if (ret.length() > ln) ret += ','; | |||
80 | ret += '-'; | |||
81 | ret += name(it); | |||
82 | } | |||
83 | ret += ']'; | |||
84 | } | |||
85 | switch (pat.mode) { | |||
86 | case Question: | |||
87 | ret += '?'; break; | |||
88 | case Plus: | |||
89 | ret += '+'; break; | |||
90 | case Star: | |||
91 | ret += '*'; break; | |||
92 | default: break; | |||
93 | } | |||
94 | return ret; | |||
95 | } | |||
96 | ||||
97 | UnicodeString LexdCompiler::printFilter(const tag_filter_t& filter) | |||
98 | { | |||
99 | UnicodeString ret; | |||
100 | ret += '['; | |||
101 | for (auto& it : filter.pos()) { | |||
102 | if (ret.length() > 1) ret += ','; | |||
103 | ret += name(it); | |||
104 | } | |||
105 | for (auto& it : filter.neg()) { | |||
106 | if (ret.length() > 1) ret += ','; | |||
107 | ret += '-'; | |||
108 | ret += name(it); | |||
109 | } | |||
110 | for (auto& op : filter.ops()) { | |||
111 | if (ret.length() > 1) ret += ','; | |||
112 | ret += op->sigil; | |||
113 | ret += '['; | |||
114 | int ln = ret.length(); | |||
115 | for (auto& it : *op) { | |||
116 | if (ret.length() > ln) ret += ','; | |||
117 | ret += name(it); | |||
118 | } | |||
119 | ret += ']'; | |||
120 | } | |||
121 | ret += ']'; | |||
122 | return ret; | |||
123 | } | |||
124 | ||||
125 | trans_sym_t LexdCompiler::alphabet_lookup(const UnicodeString &symbol) | |||
126 | { | |||
127 | if (!symbol.hasMoreChar32Than(0, symbol.length(), 1)) { | |||
128 | return trans_sym_t((int)symbol.char32At(0)); | |||
129 | } else { | |||
130 | UString temp; | |||
131 | temp.append(symbol.getBuffer(), (unsigned int)symbol.length()); | |||
132 | alphabet.includeSymbol(temp); | |||
133 | return trans_sym_t(alphabet(temp)); | |||
134 | } | |||
135 | } | |||
136 | trans_sym_t LexdCompiler::alphabet_lookup(trans_sym_t l, trans_sym_t r) | |||
137 | { | |||
138 | return trans_sym_t(alphabet((int)l, (int)r)); | |||
139 | } | |||
140 | ||||
141 | LexdCompiler::LexdCompiler() | |||
142 | { | |||
143 | id_to_name.push_back(""); | |||
144 | name_to_id[""] = string_ref(0); | |||
145 | lexicons[string_ref(0)] = vector<entry_t>(); | |||
146 | ||||
147 | left_sieve_name = internName("<"); | |||
148 | token_t lsieve_tok = {.name=left_sieve_name, .part=1, .optional=false}; | |||
149 | pattern_element_t lsieve_elem = {.left=lsieve_tok, .right=lsieve_tok, | |||
150 | .tag_filter=tag_filter_t(), | |||
151 | .mode=Normal}; | |||
152 | left_sieve_tok = vector<pattern_element_t>(1, lsieve_elem); | |||
153 | ||||
154 | right_sieve_name = internName(">"); | |||
155 | token_t rsieve_tok = {.name=right_sieve_name, .part=1, .optional=false}; | |||
156 | pattern_element_t rsieve_elem = {.left=rsieve_tok, .right=rsieve_tok, | |||
157 | .tag_filter=tag_filter_t(), | |||
158 | .mode=Normal}; | |||
159 | right_sieve_tok = vector<pattern_element_t>(1, rsieve_elem); | |||
160 | } | |||
161 | ||||
162 | LexdCompiler::~LexdCompiler() | |||
163 | {} | |||
164 | ||||
// u_*printf only accept const UChar*
// so here's a wrapper so we don't have to write all this out every time
// and make it a macro so we don't have issues with it deallocating
#define err(s) (to_ustring(s).c_str())
169 | ||||
170 | void | |||
171 | LexdCompiler::die(const char* msg, ...) | |||
172 | { | |||
173 | UFILE* err_out = u_finitu_finit_72(stderrstderr, NULL__null, NULL__null); | |||
174 | u_fprintfu_fprintf_72(err_out, "Error on line %d: ", lineNumber); | |||
175 | va_list argptr; | |||
176 | va_start(argptr, msg)__builtin_va_start(argptr, msg); | |||
177 | u_vfprintfu_vfprintf_72(err_out, msg, argptr); | |||
178 | va_end(argptr)__builtin_va_end(argptr); | |||
179 | u_fputcu_fputc_72('\n', err_out); | |||
180 | exit(EXIT_FAILURE1); | |||
181 | } | |||
182 | ||||
183 | void LexdCompiler::appendLexicon(string_ref lexicon_id, const vector<entry_t> &to_append) | |||
184 | { | |||
185 | if(lexicons.find(lexicon_id) == lexicons.end()) | |||
186 | lexicons[lexicon_id] = to_append; | |||
187 | else | |||
188 | lexicons[lexicon_id].insert(lexicons[lexicon_id].begin(), to_append.begin(), to_append.end()); | |||
189 | } | |||
190 | ||||
191 | void | |||
192 | LexdCompiler::finishLexicon() | |||
193 | { | |||
194 | if(inLex) | |||
195 | { | |||
196 | if (currentLexicon.size() == 0) { | |||
197 | die("Lexicon '%S' is empty.", err(name(currentLexiconId))(to_ustring(name(currentLexiconId)).c_str())); | |||
198 | } | |||
199 | appendLexicon(currentLexiconId, currentLexicon); | |||
200 | ||||
201 | currentLexicon.clear(); | |||
202 | currentLexicon_tags.clear(); | |||
203 | } | |||
204 | inLex = false; | |||
205 | } | |||
206 | ||||
207 | string_ref | |||
208 | LexdCompiler::internName(const UnicodeString& name) | |||
209 | { | |||
210 | if(name_to_id.find(name) == name_to_id.end()) | |||
211 | { | |||
212 | name_to_id[name] = string_ref(id_to_name.size()); | |||
213 | id_to_name.push_back(name); | |||
214 | } | |||
215 | return name_to_id[name]; | |||
216 | } | |||
217 | ||||
218 | string_ref | |||
219 | LexdCompiler::checkName(UnicodeString& name) | |||
220 | { | |||
221 | const static UString forbidden = u" :?|()<>[]*+"; | |||
222 | name.trim(); | |||
223 | int l = name.length(); | |||
224 | if(l == 0) die("Unnamed pattern or lexicon."); | |||
225 | ||||
226 | for(const auto &c: forbidden) { | |||
227 | if(name.indexOf(c) != -1) { | |||
228 | die("Lexicon/pattern names cannot contain character '%C'", c); | |||
229 | } | |||
230 | } | |||
231 | return internName(name); | |||
232 | } | |||
233 | ||||
234 | tags_t | |||
235 | LexdCompiler::readTags(char_iter &iter, UnicodeString &line) | |||
236 | { | |||
237 | tag_filter_t filter = readTagFilter(iter, line); | |||
238 | if(filter.neg().empty() && filter.ops().empty()) | |||
239 | return tags_t((set<string_ref>)filter.pos()); | |||
240 | else | |||
241 | die("Cannot declare negative tag in lexicon"); | |||
242 | return tags_t(); | |||
243 | } | |||
244 | ||||
245 | tag_filter_t | |||
246 | LexdCompiler::readTagFilter(char_iter& iter, UnicodeString& line) | |||
247 | { | |||
248 | tag_filter_t tag_filter; | |||
249 | auto tag_start = (++iter).span(); | |||
250 | bool tag_nonempty = false; | |||
251 | bool negative = false; | |||
252 | vector<shared_ptr<op_tag_filter_t>> ops; | |||
253 | for(; !iter.at_end() && (*iter).length() > 0; ++iter) | |||
254 | { | |||
255 | if(*iter == "]" || *iter == "," || *iter == " ") | |||
256 | { | |||
257 | if(!tag_nonempty) | |||
258 | die("Empty tag at char %d", + iter.span().first); | |||
259 | UnicodeString s = line.tempSubStringBetween(tag_start.first, iter.span().first); | |||
260 | if(!tag_filter.combine( | |||
261 | negative ? tag_filter_t(neg_tag_filter_t {checkName(s)}) | |||
262 | : tag_filter_t(pos_tag_filter_t {checkName(s)}) | |||
263 | )) | |||
264 | die("Illegal tag filter."); | |||
265 | tag_nonempty = false; | |||
266 | negative = false; | |||
267 | if(*iter == "]") | |||
268 | { | |||
269 | iter++; | |||
270 | return tag_filter_t(tag_filter.pos(), tag_filter.neg(), ops); | |||
271 | } | |||
272 | } | |||
273 | else if(!tag_nonempty && *iter == "-") | |||
274 | { | |||
275 | negative = true; | |||
276 | } | |||
277 | else if(!tag_nonempty && (*iter == "|" || *iter == "^")) | |||
278 | { | |||
279 | const UnicodeString s = *iter; | |||
280 | if(negative) | |||
281 | die("Illegal negated operation."); | |||
282 | *iter++; | |||
283 | if (*iter == "[") | |||
284 | { | |||
285 | shared_ptr<op_tag_filter_t> op; | |||
286 | tags_t operands = readTags(iter, line); | |||
287 | if (s == "|") | |||
288 | op = make_shared<or_tag_filter_t>(operands); | |||
289 | else if (s == "^") | |||
290 | op = make_shared<xor_tag_filter_t>(operands); | |||
291 | ops.push_back(op); | |||
292 | } | |||
293 | else | |||
294 | die("Expected list of operands."); | |||
295 | if(*iter == "]") | |||
296 | { | |||
297 | iter++; | |||
298 | return tag_filter_t(tag_filter.pos(), tag_filter.neg(), ops); | |||
299 | } | |||
300 | } | |||
301 | else if(!tag_nonempty) | |||
302 | { | |||
303 | tag_nonempty = true; | |||
304 | tag_start = iter.span(); | |||
305 | } | |||
306 | } | |||
307 | die("End of line in tag list, expected ']'"); | |||
308 | return tag_filter_t(); | |||
309 | } | |||
310 | ||||
311 | void | |||
312 | LexdCompiler::appendSymbol(const UnicodeString& s, lex_token_t& tok) | |||
313 | { | |||
314 | if (shouldCombine) { | |||
315 | tok.symbols.push_back(alphabet_lookup(s)); | |||
316 | } else { | |||
317 | for (int c = 0; c < s.length(); c++) { | |||
318 | tok.symbols.push_back(alphabet_lookup(s[c])); | |||
319 | } | |||
320 | } | |||
321 | } | |||
322 | ||||
323 | void | |||
324 | LexdCompiler::readSymbol(char_iter& iter, UnicodeString& line, lex_token_t& tok) | |||
325 | { | |||
326 | if ((*iter).startsWith("\\")) { | |||
327 | if ((*iter).length() == 1) { | |||
328 | appendSymbol(*++iter, tok); | |||
329 | } else { | |||
330 | appendSymbol((*iter).tempSubString(1), tok); | |||
331 | } | |||
332 | } else if ((*iter).startsWith(":")) { | |||
333 | appendSymbol((*iter).tempSubString(1), tok); | |||
334 | } else if (*iter == "{" || *iter == "<") { | |||
335 | UChar end = (*iter == "{") ? '}' : '>'; | |||
336 | int i = iter.span().first; | |||
337 | for (; !iter.at_end() && *iter != end; ++iter) ; | |||
338 | ||||
339 | if (*iter == end) { | |||
340 | tok.symbols.push_back(alphabet_lookup(line.tempSubStringBetween(i, iter.span().second))); | |||
341 | } else { | |||
342 | die("Multichar symbol didn't end; searching for %S", err(end)(to_ustring(end).c_str())); | |||
343 | } | |||
344 | } else { | |||
345 | appendSymbol(*iter, tok); | |||
346 | } | |||
347 | } | |||
348 | ||||
349 | int | |||
350 | LexdCompiler::processRegexTokenSeq(char_iter& iter, UnicodeString& line, Transducer* trans, int start_state) | |||
351 | { | |||
352 | bool inleft = true; | |||
353 | vector<vector<lex_token_t>> left, right; | |||
354 | for (; !iter.at_end(); ++iter) { | |||
355 | if (*iter == "(" || *iter == ")" || *iter == "|" || *iter == "/") break; | |||
356 | else if (*iter == "?" || *iter == "*" || *iter == "+") | |||
357 | die("Quantifier %S may only be applied to parenthesized groups", err(*iter)(to_ustring(*iter).c_str())); | |||
358 | else if (*iter == "]") die("Regex contains mismatched ]"); | |||
359 | else if (*iter == ":" && inleft) inleft = false; | |||
360 | else if (*iter == ":") die("Regex contains multiple colons"); | |||
361 | else if (*iter == "[") { | |||
362 | ++iter; | |||
363 | vector<lex_token_t> sym; | |||
364 | for (; !iter.at_end(); ++iter) { | |||
365 | if (*iter == "]") break; | |||
366 | else if (*iter == "-" && !sym.empty()) { | |||
367 | ++iter; | |||
368 | if (*iter == "]" || iter.at_end()) { | |||
369 | --iter; | |||
370 | lex_token_t temp; | |||
371 | readSymbol(iter, line, temp); | |||
372 | sym.push_back(temp); | |||
373 | } else { | |||
374 | lex_token_t start = sym.back(); | |||
375 | lex_token_t end; | |||
376 | readSymbol(iter, line, end); | |||
377 | // This will fail on diacritics even with -U | |||
378 | // on the principle that command-line args should not | |||
379 | // change the validity of the code -DGS 2022-05-17 | |||
380 | if (start.symbols.size() != 1 || end.symbols.size() != 1 || | |||
381 | (int)start.symbols[0] <= 0 || (int)end.symbols[0] <= 0) | |||
382 | die("Cannot process symbol range between multichar symbols"); | |||
383 | int i_start = (int)start.symbols[0]; | |||
384 | int i_end = (int)end.symbols[0]; | |||
385 | if (i_start > i_end) | |||
386 | die("First character in symbol range does not preceed last"); | |||
387 | for (int i = 1 + i_start; i <= i_end; i++) { | |||
388 | lex_token_t mid; | |||
389 | mid.symbols.push_back((trans_sym_t)i); | |||
390 | sym.push_back(mid); | |||
391 | } | |||
392 | } | |||
393 | } else { | |||
394 | lex_token_t temp; | |||
395 | readSymbol(iter, line, temp); | |||
396 | sym.push_back(temp); | |||
397 | } | |||
398 | } | |||
399 | (inleft ? left : right).push_back(sym); | |||
400 | } else { | |||
401 | vector<lex_token_t> v_temp; | |||
402 | lex_token_t t_temp; | |||
403 | readSymbol(iter, line, t_temp); | |||
404 | v_temp.push_back(t_temp); | |||
405 | (inleft ? left : right).push_back(v_temp); | |||
406 | } | |||
407 | } | |||
408 | int state = start_state; | |||
409 | vector<lex_token_t> empty_vec; | |||
410 | lex_token_t empty_tok; | |||
411 | empty_tok.symbols.push_back(trans_sym_t()); | |||
412 | empty_vec.push_back(empty_tok); | |||
413 | for (unsigned int i = 0; i < left.size() || i < right.size(); i++) { | |||
414 | vector<lex_token_t>& lv = (i < left.size() && !left[i].empty() ? | |||
415 | left[i] : empty_vec); | |||
416 | vector<lex_token_t>& rv = (i < right.size() && !right[i].empty() ? | |||
417 | right[i] : empty_vec); | |||
418 | bool first = true; | |||
419 | int dest_state = 0; | |||
420 | if (inleft) { | |||
421 | for (auto& s : lv) { | |||
422 | if (first) { | |||
423 | dest_state = state; | |||
424 | for (auto& it : s.symbols) | |||
425 | dest_state = trans->insertNewSingleTransduction(alphabet((int)it, (int)it), dest_state); | |||
426 | if (dest_state == state) | |||
427 | dest_state = trans->insertNewSingleTransduction(0, dest_state); | |||
428 | first = false; | |||
429 | } else if (s.symbols.empty()) { | |||
430 | trans->linkStates(state, dest_state, 0); | |||
431 | } else { | |||
432 | int cur_state = state; | |||
433 | for (unsigned int k = 0; k < s.symbols.size(); k++) { | |||
434 | if (k+1 == s.symbols.size()) | |||
435 | trans->linkStates(cur_state, dest_state, alphabet((int)s.symbols[k], (int)s.symbols[k])); | |||
436 | else | |||
437 | cur_state = trans->insertNewSingleTransduction(alphabet((int)s.symbols[k], (int)s.symbols[k]), cur_state); | |||
438 | } | |||
439 | } | |||
440 | } | |||
441 | } else { | |||
442 | for (auto& l : lv) { | |||
443 | for (auto& r : rv) { | |||
444 | vector<int> paired; | |||
445 | for (unsigned int j = 0; j < l.symbols.size() || j < r.symbols.size(); j++) { | |||
446 | trans_sym_t ls = (j < l.symbols.size() ? l.symbols[j] : trans_sym_t()); | |||
447 | trans_sym_t rs = (j < r.symbols.size() ? r.symbols[j] : trans_sym_t()); | |||
448 | paired.push_back(alphabet((int)ls, (int)rs)); | |||
449 | } | |||
450 | if (first) { | |||
451 | dest_state = state; | |||
452 | for (auto& it : paired) { | |||
453 | dest_state = trans->insertNewSingleTransduction(it, dest_state); | |||
454 | } | |||
455 | first = false; | |||
456 | } else { | |||
457 | int cur_state = state; | |||
458 | for (unsigned int k = 0; k < paired.size(); k++) { | |||
459 | if (k+1 == paired.size()) | |||
460 | trans->linkStates(cur_state, dest_state, paired[k]); | |||
461 | else | |||
462 | cur_state = trans->insertNewSingleTransduction(paired[k], cur_state); | |||
463 | } | |||
464 | } | |||
465 | } | |||
466 | } | |||
467 | } | |||
468 | state = dest_state; | |||
469 | } | |||
470 | return state; | |||
471 | } | |||
472 | ||||
473 | int | |||
474 | LexdCompiler::processRegexGroup(char_iter& iter, UnicodeString& line, Transducer* trans, int start_state, unsigned int depth) | |||
475 | { | |||
476 | ++iter; // initial slash or paren | |||
477 | int state = start_state; | |||
478 | vector<int> option_ends; | |||
479 | for (; !iter.at_end(); ++iter) { | |||
480 | if (*iter == "(") { | |||
481 | state = trans->insertNewSingleTransduction(0, state); | |||
482 | state = processRegexGroup(iter, line, trans, state, depth+1); | |||
483 | --iter; | |||
484 | // this function ends on character after close paren or quantifier | |||
485 | // so step back so loop increment doesn't skip a character | |||
486 | } | |||
487 | else if (*iter == ")" || *iter == "/") break; | |||
488 | else if (*iter == "|") { | |||
489 | if (state == start_state) | |||
490 | state = trans->insertNewSingleTransduction(0, state); | |||
491 | option_ends.push_back(state); | |||
492 | state = start_state; | |||
493 | } | |||
494 | else { | |||
495 | state = processRegexTokenSeq(iter, line, trans, state); | |||
496 | --iter; | |||
497 | } | |||
498 | } | |||
499 | if (state == start_state) | |||
500 | state = trans->insertNewSingleTransduction(0, state); | |||
501 | for (auto& it : option_ends) | |||
502 | trans->linkStates(it, state, 0); | |||
503 | if ((depth > 0 && *iter == "/") || (depth == 0 && *iter == ")")) | |||
504 | die("Mismatched parentheses in regex"); | |||
505 | if (iter.at_end()) | |||
506 | die("Unterminated regex"); | |||
507 | ++iter; | |||
508 | if (depth > 0) { | |||
509 | if (*iter == "?") { | |||
510 | trans->linkStates(start_state, state, 0); | |||
511 | ++iter; | |||
512 | } else if (*iter == "*") { | |||
513 | trans->linkStates(start_state, state, 0); | |||
514 | trans->linkStates(state, start_state, 0); | |||
515 | ++iter; | |||
516 | } else if (*iter == "+") { | |||
517 | trans->linkStates(state, start_state, 0); | |||
518 | ++iter; | |||
519 | } | |||
520 | } | |||
521 | return state; | |||
522 | } | |||
523 | ||||
524 | lex_seg_t | |||
525 | LexdCompiler::processLexiconSegment(char_iter& iter, UnicodeString& line, unsigned int part_count) | |||
526 | { | |||
527 | lex_seg_t seg; | |||
528 | bool inleft = true; | |||
529 | bool left_tags_applied = false, right_tags_applied = false; | |||
530 | tag_filter_t tags; | |||
531 | if((*iter).startsWith(" ")) | |||
532 | { | |||
533 | if((*iter).length() > 1) | |||
534 | { | |||
535 | // if it's a space with a combining diacritic after it, | |||
536 | // then we want the diacritic | |||
537 | UnicodeString cur = *iter; | |||
538 | cur.retainBetween(1, cur.length()); | |||
539 | seg.left.symbols.push_back(alphabet_lookup(cur)); | |||
540 | } | |||
541 | ++iter; | |||
542 | } | |||
543 | if((*iter).startsWith("/") && seg.left.symbols.size() == 0) | |||
544 | { | |||
545 | seg.regex = new Transducer(); | |||
546 | int state = processRegexGroup(iter, line, seg.regex, 0, 0); | |||
547 | seg.regex->setFinal(state); | |||
548 | } | |||
549 | if(iter.at_end() && seg.regex == nullptr && seg.left.symbols.size() == 0) | |||
550 | die("Expected %d parts, found %d", currentLexiconPartCount, part_count); | |||
551 | for(; !iter.at_end(); ++iter) | |||
552 | { | |||
553 | if((*iter).startsWith(" ") || *iter == ']') | |||
554 | break; | |||
555 | else if(*iter == "[") | |||
556 | { | |||
557 | auto &tags_applied = inleft ? left_tags_applied : right_tags_applied; | |||
558 | if(tags_applied) | |||
559 | die("Already provided tag list for this side."); | |||
560 | tags = readTagFilter(iter, line); | |||
561 | --iter; | |||
562 | tags_applied = true; | |||
563 | } | |||
564 | else if((*iter).startsWith(":")) | |||
565 | { | |||
566 | if(inleft) | |||
567 | inleft = false; | |||
568 | else | |||
569 | die("Lexicon entry contains multiple colons"); | |||
570 | if ((*iter).length() > 1) readSymbol(iter, line, seg.right); | |||
571 | } | |||
572 | else readSymbol(iter, line, (inleft ? seg.left : seg.right)); | |||
573 | } | |||
574 | if(inleft) | |||
575 | { | |||
576 | seg.right = seg.left; | |||
577 | } | |||
578 | ||||
579 | if (seg.regex != nullptr && | |||
580 | !(seg.left.symbols.empty() && seg.right.symbols.empty())) | |||
581 | die("Lexicon entry contains both regex and text"); | |||
582 | ||||
583 | seg.tags = currentLexicon_tags; | |||
584 | ||||
585 | if(!tags.try_apply(seg.tags)) | |||
586 | { | |||
587 | tags_t diff = subtractset(tags.neg(), seg.tags); | |||
588 | for(string_ref t: diff) | |||
589 | cerr << "Bad tag '-" << to_ustring(name(t)) << "'" << endl; | |||
590 | die("Negative tag has no default to unset."); | |||
591 | } | |||
592 | ||||
593 | return seg; | |||
594 | } | |||
595 | ||||
596 | token_t | |||
597 | LexdCompiler::readToken(char_iter& iter, UnicodeString& line) | |||
598 | { | |||
599 | auto begin_charspan = iter.span(); | |||
600 | ||||
601 | const UnicodeString boundary = " :()[]+*?|<>"; | |||
602 | ||||
603 | for(; !iter.at_end() && boundary.indexOf(*iter) == -1; ++iter); | |||
604 | UnicodeString name; | |||
605 | line.extract(begin_charspan.first, (iter.at_end() ? line.length() : iter.span().first) - begin_charspan.first, name); | |||
606 | ||||
607 | if(name.length() == 0) | |||
608 | die("Symbol '%S' without lexicon name at u16 %d-%d", err(*iter)(to_ustring(*iter).c_str()), iter.span().first, iter.span().second-1); | |||
609 | ||||
610 | bool optional = false; | |||
611 | if(*iter == "?") { | |||
612 | iter++; | |||
613 | if(*iter == "(") { | |||
614 | optional = true; | |||
615 | } else { | |||
616 | iter--; | |||
617 | } | |||
618 | } | |||
619 | ||||
620 | unsigned int part = 1; | |||
621 | if(*iter == "(") | |||
622 | { | |||
623 | iter++; | |||
624 | begin_charspan = iter.span(); | |||
625 | for(; !iter.at_end() && (*iter).length() > 0 && *iter != ")"; iter++) | |||
626 | { | |||
627 | if((*iter).length() != 1 || !u_isdigitu_isdigit_72((*iter).charAt(0))) | |||
628 | die("Syntax error - non-numeric index in parentheses: %S", err(*iter)(to_ustring(*iter).c_str())); | |||
629 | } | |||
630 | if(*iter != ")") | |||
631 | die("Syntax error - unmatched parenthesis"); | |||
632 | if(iter.span().first == begin_charspan.first) | |||
633 | die("Syntax error - missing index in parenthesis"); | |||
634 | part = (unsigned int)StringUtils::stoi(to_ustring(line.tempSubStringBetween(begin_charspan.first, iter.span().first))); | |||
635 | if (part == 0) die("Invalid column number (0)"); | |||
636 | ++iter; | |||
637 | } | |||
638 | ||||
639 | return token_t {.name = internName(name), .part = part, .optional = optional}; | |||
640 | } | |||
641 | ||||
642 | RepeatMode | |||
643 | LexdCompiler::readModifier(char_iter& iter) | |||
644 | { | |||
645 | if(*iter == "?") | |||
646 | { | |||
647 | ++iter; | |||
648 | return Question; | |||
649 | } | |||
650 | else if(*iter == "*") | |||
651 | { | |||
652 | ++iter; | |||
653 | return Star; | |||
654 | } | |||
655 | else if(*iter == "+") | |||
656 | { | |||
657 | ++iter; | |||
658 | return Plus; | |||
659 | } | |||
660 | return Normal; | |||
661 | } | |||
662 | ||||
663 | pattern_element_t | |||
664 | LexdCompiler::readPatternElement(char_iter& iter, UnicodeString& line) | |||
665 | { | |||
666 | const UnicodeString boundary = " :()[]+*?|<>"; | |||
667 | pattern_element_t tok; | |||
668 | if(*iter == ":") | |||
669 | { | |||
670 | iter++; | |||
671 | if(boundary.indexOf(*iter) != -1) | |||
672 | { | |||
673 | if(*iter == ":") | |||
674 | die("Syntax error - double colon"); | |||
675 | else | |||
676 | die("Colon without lexicon or pattern name"); | |||
677 | } | |||
678 | tok.right = readToken(iter, line); | |||
679 | } | |||
680 | else if(boundary.indexOf(*iter) != -1) | |||
681 | { | |||
682 | die("Unexpected symbol '%S' at column %d", err(*iter)(to_ustring(*iter).c_str()), iter.span().first); | |||
683 | } | |||
684 | else | |||
685 | { | |||
686 | tok.left = readToken(iter, line); | |||
687 | if(*iter == "[") | |||
688 | { | |||
689 | tok.tag_filter.combine(readTagFilter(iter, line)); | |||
690 | } | |||
691 | if(*iter == ":") | |||
692 | { | |||
693 | iter++; | |||
694 | if(!iter.at_end() && (*iter).length() > 0) | |||
695 | { | |||
696 | if(boundary.indexOf(*iter) == -1) | |||
697 | { | |||
698 | tok.right = readToken(iter, line); | |||
699 | } | |||
700 | } | |||
701 | } | |||
702 | else | |||
703 | { | |||
704 | tok.right = tok.left; | |||
705 | } | |||
706 | } | |||
707 | if(*iter == "[") | |||
708 | { | |||
709 | tok.tag_filter.combine(readTagFilter(iter, line)); | |||
710 | } | |||
711 | tok.mode = readModifier(iter); | |||
712 | ||||
713 | return tok; | |||
714 | } | |||
715 | ||||
716 | void | |||
717 | LexdCompiler::processPattern(char_iter& iter, UnicodeString& line) | |||
718 | { | |||
719 | vector<pattern_t> pats_cur(1); | |||
720 | vector<pattern_element_t> alternation; | |||
721 | bool final_alternative = true; | |||
722 | bool sieve_forward = false; | |||
723 | bool just_sieved = false; | |||
724 | const UnicodeString boundary = " :()[]+*?|<>"; | |||
725 | const UnicodeString token_boundary = " )|<>"; | |||
726 | const UnicodeString token_side_boundary = token_boundary + ":+*?"; | |||
727 | const UnicodeString token_side_name_boundary = token_side_boundary + "([]"; | |||
728 | const UnicodeString modifier = "+*?"; | |||
729 | const UnicodeString decrement_after_token = token_boundary + "([]"; | |||
730 | ||||
731 | for(; !iter.at_end() && *iter != ')' && (*iter).length() > 0; ++iter) | |||
732 | { | |||
733 | if(*iter == " ") ; | |||
734 | else if(*iter == "|") | |||
735 | { | |||
736 | if(alternation.empty()) | |||
737 | die("Syntax error - initial |"); | |||
738 | if(!final_alternative) | |||
739 | die("Syntax error - multiple consecutive |"); | |||
740 | if(just_sieved) | |||
741 | die("Syntax error - sieve and alternation operators without intervening token"); | |||
742 | final_alternative = false; | |||
743 | } | |||
744 | else if(*iter == "<") | |||
745 | { | |||
746 | if(sieve_forward) | |||
747 | die("Syntax error - cannot sieve backwards after forwards."); | |||
748 | if(alternation.empty()) | |||
749 | die("Backward sieve without token?"); | |||
750 | if(just_sieved) | |||
751 | die("Syntax error - multiple consecutive sieve operators"); | |||
752 | if(!final_alternative) | |||
753 | die("Syntax error - alternation and sieve operators without intervening token"); | |||
754 | expand_alternation(pats_cur, alternation); | |||
755 | expand_alternation(pats_cur, left_sieve_tok); | |||
756 | alternation.clear(); | |||
757 | just_sieved = true; | |||
758 | } | |||
759 | else if(*iter == ">") | |||
760 | { | |||
761 | sieve_forward = true; | |||
762 | if(alternation.empty()) | |||
763 | die("Forward sieve without token?"); | |||
764 | if(just_sieved) | |||
765 | die("Syntax error - multiple consecutive sieve operators"); | |||
766 | if(!final_alternative) | |||
767 | die("Syntax error - alternation and sieve operators without intervening token"); | |||
768 | expand_alternation(pats_cur, alternation); | |||
769 | expand_alternation(pats_cur, right_sieve_tok); | |||
770 | alternation.clear(); | |||
771 | just_sieved = true; | |||
772 | } | |||
773 | else if(*iter == "[") | |||
774 | { | |||
775 | UnicodeString name = UnicodeString::fromUTF8(" " + to_string(anonymousCount++)); | |||
776 | currentLexiconId = internName(name); | |||
777 | currentLexiconPartCount = 1; | |||
778 | inLex = true; | |||
779 | entry_t entry; | |||
780 | entry.push_back(processLexiconSegment(++iter, line, 0)); | |||
781 | if(*iter == " ") iter++; | |||
782 | if(*iter != "]") | |||
783 | die("Missing closing ] for anonymous lexicon"); | |||
784 | currentLexicon.push_back(entry); | |||
785 | finishLexicon(); | |||
786 | if(final_alternative && !alternation.empty()) | |||
787 | { | |||
788 | expand_alternation(pats_cur, alternation); | |||
789 | alternation.clear(); | |||
790 | } | |||
791 | ++iter; | |||
792 | pattern_element_t anon; | |||
793 | anon.left = {.name=currentLexiconId, .part=1, .optional=false}; | |||
794 | anon.right = anon.left; | |||
795 | anon.mode = readModifier(iter); | |||
796 | alternation.push_back(anon); | |||
797 | --iter; | |||
798 | final_alternative = true; | |||
799 | just_sieved = false; | |||
800 | } | |||
801 | else if(*iter == "(") | |||
802 | { | |||
803 | string_ref temp = currentPatternId; | |||
804 | UnicodeString name = UnicodeString::fromUTF8(" " + to_string(anonymousCount++)); | |||
805 | currentPatternId = internName(name); | |||
806 | ++iter; | |||
807 | processPattern(iter, line); | |||
808 | if(*iter == " ") | |||
809 | *iter++; | |||
810 | if(iter.at_end() || *iter != ")") | |||
811 | die("Missing closing ) for anonymous pattern"); | |||
812 | ++iter; | |||
813 | tag_filter_t filter; | |||
814 | if(*iter == "[") | |||
815 | filter = readTagFilter(iter, line); | |||
816 | if(final_alternative && !alternation.empty()) | |||
817 | { | |||
818 | expand_alternation(pats_cur, alternation); | |||
819 | alternation.clear(); | |||
820 | } | |||
821 | pattern_element_t anon; | |||
822 | anon.left = {.name=currentPatternId, .part=1, .optional=false}; | |||
823 | anon.right = anon.left; | |||
824 | anon.mode = readModifier(iter); | |||
825 | anon.tag_filter = filter; | |||
826 | for(const auto &tok : distribute_tag_expressions(anon)) | |||
827 | alternation.push_back(tok); | |||
828 | --iter; | |||
829 | currentPatternId = temp; | |||
830 | final_alternative = true; | |||
831 | just_sieved = false; | |||
832 | } | |||
833 | else if(*iter == "?" || *iter == "*" || *iter == "+") | |||
834 | { | |||
835 | die("Syntax error - unexpected modifier at u16 %d-%d", iter.span().first, iter.span().second); | |||
836 | } | |||
837 | else | |||
838 | { | |||
839 | if(final_alternative && !alternation.empty()) | |||
840 | { | |||
841 | expand_alternation(pats_cur, alternation); | |||
842 | alternation.clear(); | |||
843 | } | |||
844 | for(const auto &tok : distribute_tag_expressions(readPatternElement(iter, line))) | |||
845 | alternation.push_back(tok); | |||
846 | iter--; | |||
847 | final_alternative = true; | |||
848 | just_sieved = false; | |||
849 | } | |||
850 | } | |||
851 | if(!final_alternative) | |||
852 | die("Syntax error - trailing |"); | |||
853 | if(just_sieved) | |||
854 | die("Syntax error - trailing sieve (< or >)"); | |||
855 | expand_alternation(pats_cur, alternation); | |||
856 | for(const auto &pat : pats_cur) | |||
857 | { | |||
858 | patterns[currentPatternId].push_back(make_pair(lineNumber, pat)); | |||
859 | } | |||
860 | } | |||
861 | ||||
void
LexdCompiler::processNextLine()
{
  // Read one logical line from `input`, collapsing whitespace runs to a
  // single space and honoring backslash escapes and '#' comments, then
  // dispatch on the section keyword (PATTERNS / PATTERN / LEXICON / ALIAS)
  // or hand the content to the current section's parser.
  UnicodeString line;
  UChar c;
  bool escape = false;
  bool comment = false;
  bool lastWasSpace = false;
  while((c = u_fgetc(input)) != '\n')
  {
    bool space = false;
    if(c == U_EOF)
    {
      doneReading = true;
      break;
    }
    if(comment) continue;
    if(escape)
    {
      // previous char was a backslash: keep this char verbatim
      line += c;
      escape = false;
    }
    else if(c == '\\')
    {
      escape = true;
      line += c;
    }
    else if(c == '#')
    {
      // rest of the physical line is a comment
      comment = true;
    }
    else if(u_isWhitespace(c))
    {
      // collapse whitespace runs; never emit a leading space
      if(line.length() > 0 && !lastWasSpace)
      {
        line += ' ';
      }
      space = (line.length() > 0);
    }
    else line += c;
    lastWasSpace = space;
  }
  lineNumber++;
  if(escape) die("Trailing backslash");
  if(line.length() == 0) return;

  if(line == "PATTERNS" || line == "PATTERNS ")
  {
    // top-level pattern section; " " is the reserved master pattern name
    finishLexicon();
    UnicodeString name = " ";
    currentPatternId = internName(name);
    inPat = true;
  }
  else if(line.length() > 7 && line.startsWith("PATTERN "))
  {
    UnicodeString name = line.tempSubString(8);
    finishLexicon();
    currentPatternId = checkName(name);
    if (lexicons.find(currentPatternId) != lexicons.end()) {
      die("The name '%S' cannot be used for both LEXICONs and PATTERNs.",
          err(name));
    }
    inPat = true;
  }
  else if(line.length() > 7 && line.startsWith("LEXICON "))
  {
    UnicodeString name = line.tempSubString(8);
    name.trim();
    finishLexicon();
    // optional default tag block: LEXICON Name[tag,...]
    if(name.length() > 1 && name.indexOf('[') != -1)
    {
      UnicodeString tags = name.tempSubString(name.indexOf('['));
      auto c = char_iter(tags);
      currentLexicon_tags = readTags(c, tags);
      if(c != c.end() && *c == ":")
      {
        // deprecated one-sided tag syntax: [left]:[right]
        cerr << "WARNING: One-sided tags are deprecated and will soon be removed (line " << lineNumber << ")" << endl;
        ++c;
        if(*c == "[")
          unionset_inplace(currentLexicon_tags, readTags(c, tags));
        else
          die("Expected start of default right tags '[' after ':'.");
      }
      if(c != c.end())
        die("Unexpected character '%C' after default tags.", (*c)[0]);
      name.retainBetween(0, name.indexOf('['));
    }
    currentLexiconPartCount = 1;
    // optional multi-part count suffix: LEXICON Name(N)
    if(name.length() > 1 && name.endsWith(')'))
    {
      UnicodeString num;
      for(int i = name.length()-2; i > 0; i--)
      {
        if(u_isdigit(name[i])) num = name[i] + num;
        else if(name[i] == '(' && num.length() > 0)
        {
          currentLexiconPartCount = (unsigned int)StringUtils::stoi(to_ustring(num));
          name = name.retainBetween(0, i);
        }
        else break;
      }
      if(name.length() == 0) die("Unnamed lexicon");
    }
    currentLexiconId = checkName(name);
    if(lexicons.find(currentLexiconId) != lexicons.end()) {
      // re-opening an existing lexicon is allowed only with a matching part count
      if(lexicons[currentLexiconId][0].size() != currentLexiconPartCount) {
        die("Multiple incompatible definitions for lexicon '%S'.", err(name));
      }
    }
    if (patterns.find(currentLexiconId) != patterns.end()) {
      die("The name '%S' cannot be used for both LEXICONs and PATTERNs.",
          err(name));
    }
    inLex = true;
    inPat = false;
  }
  else if(line.length() >= 9 && line.startsWith("ALIAS "))
  {
    finishLexicon();
    if(line.endsWith(' ')) line.retainBetween(0, line.length()-1);
    int loc = line.indexOf(" ", 6);
    if(loc == -1) die("Expected 'ALIAS lexicon alt_name'");
    UnicodeString name = line.tempSubString(6, loc-6);
    UnicodeString alt = line.tempSubString(loc+1);
    string_ref altid = checkName(alt);
    string_ref lexid = checkName(name);
    if(lexicons.find(lexid) == lexicons.end()) die("Attempt to alias undefined lexicon '%S'.", err(name));
    // alias shares the entry list by copy
    lexicons[altid] = lexicons[lexid];
    inLex = false;
    inPat = false;
  }
  else if(inPat)
  {
    char_iter iter = char_iter(line);
    processPattern(iter, line);
    if(!iter.at_end() && (*iter).length() > 0)
      die("Unexpected %S", err(*iter));
  }
  else if(inLex)
  {
    // one entry per line; each entry has currentLexiconPartCount segments
    char_iter iter = char_iter(line);
    entry_t entry;
    for(unsigned int i = 0; i < currentLexiconPartCount; i++)
    {
      entry.push_back(processLexiconSegment(iter, line, i));
      if (*iter == "]") die("Unexpected closing bracket.");
    }
    if(*iter == ' ') ++iter;
    if(!iter.at_end())
      die("Lexicon entry has '%S' (found at u16 %d), more than %d components", err(*iter), iter.span().first, currentLexiconPartCount);
    currentLexicon.push_back(entry);
  }
  else die("Expected 'PATTERNS' or 'LEXICON'");
}
1016 | ||||
1017 | bool | |||
1018 | LexdCompiler::isLexiconToken(const pattern_element_t& tok) | |||
1019 | { | |||
1020 | const bool llex = (tok.left.name.empty() || (lexicons.find(tok.left.name) != lexicons.end())); | |||
1021 | const bool rlex = (tok.right.name.empty() || (lexicons.find(tok.right.name) != lexicons.end())); | |||
1022 | if(llex && rlex) | |||
1023 | { | |||
1024 | return true; | |||
1025 | } | |||
1026 | const bool lpat = (patterns.find(tok.left.name) != patterns.end()); | |||
1027 | const bool rpat = (patterns.find(tok.right.name) != patterns.end()); | |||
1028 | if(tok.left.name == tok.right.name && lpat && rpat) | |||
1029 | { | |||
1030 | if(tok.left.part != 1 || tok.right.part != 1) | |||
1031 | { | |||
1032 | die("Cannote select part of pattern %S", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
1033 | } | |||
1034 | return false; | |||
1035 | } | |||
1036 | // Any other scenario is an error, so we need to die() | |||
1037 | if(lpat && rpat) | |||
1038 | { | |||
1039 | die("Cannot collate pattern %S with %S", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
1040 | } | |||
1041 | else if((lpat && tok.right.name.empty()) || (rpat && tok.left.name.empty())) | |||
1042 | { | |||
1043 | die("Cannot select side of pattern %S", err(name(tok.left.name.valid() ? tok.left.name : tok.right.name))(to_ustring(name(tok.left.name.valid() ? tok.left.name : tok. right.name)).c_str())); | |||
1044 | } | |||
1045 | else if(llex && rpat) | |||
1046 | { | |||
1047 | die("Cannot collate lexicon %S with pattern %S", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
1048 | } | |||
1049 | else if(lpat && rlex) | |||
1050 | { | |||
1051 | die("Cannot collate pattern %S with lexicon %S", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
1052 | } | |||
1053 | else | |||
1054 | { | |||
1055 | cerr << "Patterns: "; | |||
1056 | for(auto pat: patterns) | |||
1057 | cerr << to_ustring(name(pat.first)) << " "; | |||
1058 | cerr << endl; | |||
1059 | cerr << "Lexicons: "; | |||
1060 | for(auto l: lexicons) | |||
1061 | cerr << to_ustring(name(l.first)) << " "; | |||
1062 | cerr << endl; | |||
1063 | die("Lexicon or pattern '%S' is not defined.", err(name((llex || lpat) ? tok.right.name : tok.left.name))(to_ustring(name((llex || lpat) ? tok.right.name : tok.left.name )).c_str())); | |||
1064 | } | |||
1065 | // we never reach this point, but the compiler doesn't understand die() | |||
1066 | // so we put a fake return value to keep it happy | |||
1067 | return false; | |||
1068 | } | |||
1069 | ||||
void
LexdCompiler::buildPattern(int state, Transducer* t, const pattern_t& pat, const vector<int> is_free, unsigned int pos)
{
  // Recursively append pattern element `pos` of `pat` to transducer `t`,
  // continuing from `state`. `is_free[pos]` is 1 when the lexicon in that
  // slot appears only once in the pattern (its entries combine freely) and
  // -1 when it is collated with another slot, in which case entries must be
  // matched up by index via the `matchedParts` map.
  if(pos == pat.size())
  {
    t->setFinal(state);
    return;
  }
  const pattern_element_t& tok = pat[pos];
  if(tok.left.name == left_sieve_name)
  {
    // left sieve: also allow entering here straight from the initial state
    t->linkStates(t->getInitial(), state, 0);
    buildPattern(state, t, pat, is_free, pos+1);
  }
  else if(tok.left.name == right_sieve_name)
  {
    // right sieve: everything matched so far is already a valid word
    t->setFinal(state);
    buildPattern(state, t, pat, is_free, pos+1);
  }
  else if(isLexiconToken(tok))
  {
    if(is_free[pos] == 1)
    {
      // unconstrained slot: splice in the whole lexicon at once
      Transducer *lex = getLexiconTransducer(pat[pos], 0, true);
      if(lex)
      {
        int new_state = t->insertTransducer(state, *lex);
        buildPattern(new_state, t, pat, is_free, pos+1);
      }
      return;
    }
    else if(matchedParts.find(tok.left.name) == matchedParts.end() &&
            matchedParts.find(tok.right.name) == matchedParts.end())
    {
      // first occurrence of a collated lexicon: branch on every entry index
      unsigned int max = lexicons[tok.left.name || tok.right.name].size();
      if (tok.optional()) max++;  // one extra branch for the skip case
      for(unsigned int index = 0; index < max; index++)
      {
        Transducer *lex = getLexiconTransducer(pat[pos], index, false);
        if(lex)
        {
          int new_state = t->insertTransducer(state, *lex);
          if(new_state == state)
          {
            // keep branches distinct even when the insertion was empty
            new_state = t->insertNewSingleTransduction(0, state);
          }
          if(tok.left.name.valid()) matchedParts[tok.left.name] = index;
          if(tok.right.name.valid()) matchedParts[tok.right.name] = index;
          buildPattern(new_state, t, pat, is_free, pos+1);
        }
      }
      // undo the bindings before returning to the caller's branch
      if(tok.left.name.valid()) matchedParts.erase(tok.left.name);
      if(tok.right.name.valid()) matchedParts.erase(tok.right.name);
      return;
    }
    // later occurrence: reuse the entry index bound earlier in this branch,
    // propagating it across sides where only one side was bound
    if(tok.left.name.valid() && matchedParts.find(tok.left.name) == matchedParts.end())
      matchedParts[tok.left.name] = matchedParts[tok.right.name];
    if(tok.right.name.valid() && matchedParts.find(tok.right.name) == matchedParts.end())
      matchedParts[tok.right.name] = matchedParts[tok.left.name];
    if(tok.left.name.valid() && tok.right.name.valid() && matchedParts[tok.left.name] != matchedParts[tok.right.name])
      die("Cannot collate %S with %S - both appear in free variation earlier in the pattern.", err(name(tok.left.name)), err(name(tok.right.name)));
    Transducer* lex = getLexiconTransducer(pat[pos], matchedParts[tok.left.name || tok.right.name], false);
    if(lex)
    {
      int new_state = t->insertTransducer(state, *lex);
      buildPattern(new_state, t, pat, is_free, pos+1);
    }
    return;
  }
  else
  {
    // sub-pattern reference: compile it (cached) and splice it in
    Transducer *p = buildPattern(tok);
    if(!p->hasNoFinals())
    {
      int new_state = t->insertTransducer(state, *p);
      if(tok.mode & Optional)
        t->linkStates(state, new_state, 0);
      if(tok.mode & Repeated)
        t->linkStates(new_state, state, 0);
      buildPattern(new_state, t, pat, is_free, pos+1);
    }
  }
}
1153 | ||||
1154 | vector<int> | |||
1155 | LexdCompiler::determineFreedom(pattern_t& pat) | |||
1156 | { | |||
1157 | vector<int> is_free = vector<int>(pat.size(), 0); | |||
1158 | map<string_ref, bool> is_optional; | |||
1159 | for(unsigned int i = 0; i < pat.size(); i++) | |||
1160 | { | |||
1161 | const pattern_element_t& t1 = pat[i]; | |||
1162 | if (is_optional.find(t1.left.name) != is_optional.end() && is_optional[t1.left.name] != t1.optional()) { | |||
1163 | die("Lexicon %S cannot be both optional and non-optional in a single pattern.", err(name(t1.left.name))(to_ustring(name(t1.left.name)).c_str())); | |||
1164 | } | |||
1165 | if (is_optional.find(t1.right.name) != is_optional.end() && is_optional[t1.right.name] != t1.optional()) { | |||
1166 | die("Lexicon %S cannot be both optional and non-optional in a single pattern.", err(name(t1.right.name))(to_ustring(name(t1.right.name)).c_str())); | |||
1167 | } | |||
1168 | if (t1.left.name.valid()) { | |||
1169 | is_optional[t1.left.name] = t1.optional(); | |||
1170 | } | |||
1171 | if (t1.right.name.valid()) { | |||
1172 | is_optional[t1.right.name] = t1.optional(); | |||
1173 | } | |||
1174 | if(is_free[i] != 0) | |||
1175 | continue; | |||
1176 | for(unsigned int j = i+1; j < pat.size(); j++) | |||
1177 | { | |||
1178 | const pattern_element_t& t2 = pat[j]; | |||
1179 | if((t1.left.name.valid() && (t1.left.name == t2.left.name || t1.left.name == t2.right.name)) || | |||
1180 | (t1.right.name.valid() && (t1.right.name == t2.left.name || t1.right.name == t2.right.name))) | |||
1181 | { | |||
1182 | is_free[i] = -1; | |||
1183 | is_free[j] = -1; | |||
1184 | } | |||
1185 | } | |||
1186 | is_free[i] = (is_free[i] == 0 ? 1 : -1); | |||
1187 | } | |||
1188 | return is_free; | |||
1189 | } | |||
1190 | ||||
Transducer*
LexdCompiler::buildPattern(const pattern_element_t &tok)
{
  // Compile (and cache in patternTransducers) the transducer for pattern
  // `tok`, applying its tag filter to each line of the pattern definition.
  // A NULL cache entry marks a pattern currently being compiled, which is
  // how direct self-recursion is detected.
  if(tok.left.part != 1 || tok.right.part != 1)
    die("Cannot build collated pattern %S", err(name(tok.left.name)));
  if(patternTransducers.find(tok) == patternTransducers.end())
  {
    if (verbose) cerr << "Compiling " << to_ustring(printPattern(tok)) << endl;
    auto start_time = chrono::steady_clock::now();
    Transducer* t = new Transducer();
    patternTransducers[tok] = NULL;  // recursion sentinel
    // save the caller's collation bindings; each line starts fresh
    map<string_ref, unsigned int> tempMatch;
    tempMatch.swap(matchedParts);
    for(auto &pat_untagged : patterns[tok.left.name])
    {
      // try attaching the positive tags to each slot in turn
      for(unsigned int i = 0; i < pat_untagged.second.size(); i++)
      {
        auto pat = pat_untagged;  // copy: tag filters are modified below
        bool taggable = true;
        // negative tags must be compatible with every slot
        for (unsigned int j = 0; j < pat.second.size(); j++) {
          auto& pair = pat.second[j];
          if(!pair.tag_filter.combine(tok.tag_filter.neg())) {
            taggable = false;
            if (verbose) {
              cerr << "Warning: The tags of " << to_ustring(printPattern(tok));
              cerr << " conflict with " << to_ustring(printPattern(pat_untagged.second[j]));
              cerr << " on line " << pat.first << "." << endl;
            }
          }
        }
        // positive tags go on slot i only
        if(!pat.second[i].tag_filter.combine(tok.tag_filter.pos())) {
          taggable = false;
          if (verbose) {
            cerr << "Warning: The tags of " << to_ustring(printPattern(tok));
            cerr << " conflict with " << to_ustring(printPattern(pat_untagged.second[i]));
            cerr << " on line " << pat.first << "." << endl;
          }
        }
        if (!taggable) continue;

        matchedParts.clear();
        lineNumber = pat.first;
        vector<int> is_free = determineFreedom(pat.second);
        buildPattern(t->getInitial(), t, pat.second, is_free, 0);
      }
    }
    tempMatch.swap(matchedParts);  // restore the caller's bindings
    if(!t->hasNoFinals())
    {
      if (verbose)
        cerr << "Minimizing " << to_ustring(printPattern(tok)) << endl;
      t->minimize();
    }
    else if (verbose) {
      cerr << "Warning: " << to_ustring(printPattern(tok));
      cerr << " is empty." << endl;
    }
    patternTransducers[tok] = t;
    if (verbose) {
      auto end_time = chrono::steady_clock::now();
      chrono::duration<double> diff = end_time - start_time;
      cerr << "Done compiling " << to_ustring(printPattern(tok));
      cerr << " in " << diff.count() << " seconds." << endl;
    }
  }
  else if(patternTransducers[tok] == NULL)
  {
    die("Cannot compile self-recursive %S", err(printPattern(tok)));
  }
  return patternTransducers[tok];
}
1262 | ||||
1263 | int | |||
1264 | LexdCompiler::insertPreTags(Transducer* t, int state, tag_filter_t &tags) | |||
1265 | { | |||
1266 | int end = state; | |||
1267 | for(auto tag : tags.pos()) | |||
1268 | { | |||
1269 | trans_sym_t flag = getFlag(Positive, tag, 1); | |||
1270 | end = t->insertSingleTransduction((int)alphabet_lookup(flag, flag), end); | |||
1271 | } | |||
1272 | for(auto tag : tags.neg()) | |||
1273 | { | |||
1274 | trans_sym_t flag = getFlag(Positive, tag, 2); | |||
1275 | end = t->insertSingleTransduction((int)alphabet_lookup(flag, flag), end); | |||
1276 | } | |||
1277 | return end; | |||
1278 | } | |||
1279 | ||||
1280 | int | |||
1281 | LexdCompiler::insertPostTags(Transducer* t, int state, tag_filter_t &tags) | |||
1282 | { | |||
1283 | int end = 0; | |||
1284 | int flag_dest = 0; | |||
1285 | for(auto tag : tags.pos()) | |||
1286 | { | |||
1287 | trans_sym_t flag = getFlag(Disallow, tag, 1); | |||
1288 | trans_sym_t clear = getFlag(Clear, tag, 0); | |||
1289 | if(flag_dest == 0) | |||
1290 | { | |||
1291 | flag_dest = t->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
1292 | end = flag_dest; | |||
1293 | } | |||
1294 | else | |||
1295 | { | |||
1296 | t->linkStates(state, flag_dest, (int)alphabet_lookup(flag, flag)); | |||
1297 | } | |||
1298 | end = t->insertSingleTransduction((int)alphabet_lookup(clear, clear), end); | |||
1299 | } | |||
1300 | if(end == 0) | |||
1301 | { | |||
1302 | end = state; | |||
1303 | } | |||
1304 | for(auto tag : tags.neg()) | |||
1305 | { | |||
1306 | trans_sym_t clear = getFlag(Clear, tag, 0); | |||
1307 | end = t->insertSingleTransduction((int)alphabet_lookup(clear, clear), end); | |||
1308 | } | |||
1309 | return end; | |||
1310 | } | |||
1311 | ||||
1312 | Transducer* | |||
1313 | LexdCompiler::buildPatternWithFlags(const pattern_element_t &tok, int pattern_start_state = 0) | |||
1314 | { | |||
1315 | if(patternTransducers.find(tok) == patternTransducers.end()) | |||
| ||||
1316 | { | |||
1317 | if (verbose) cerr << "Compiling " << to_ustring(printPattern(tok)) << endl; | |||
1318 | auto start_time = chrono::steady_clock::now(); | |||
1319 | Transducer* trans = (shouldHypermin ? hyperminTrans : new Transducer()); | |||
1320 | patternTransducers[tok] = NULL__null; | |||
1321 | unsigned int transition_index = 0; | |||
1322 | vector<int> pattern_finals; | |||
1323 | bool did_anything = false; | |||
1324 | for(auto& pat : patterns[tok.left.name]) | |||
1325 | { | |||
1326 | lineNumber = pat.first; | |||
1327 | vector<int> is_free = determineFreedom(pat.second); | |||
1328 | bool got_non_null = false; | |||
1329 | unsigned int count = (tok.tag_filter.pos().size() > 0 ? pat.second.size() : 1); | |||
1330 | if(tagsAsFlags) count = 1; | |||
1331 | for(unsigned int idx = 0; idx < count; idx++) | |||
1332 | { | |||
1333 | int state = pattern_start_state; | |||
1334 | vector<int> finals; | |||
1335 | set<string_ref> to_clear; | |||
1336 | bool got_null = false; | |||
1337 | for(unsigned int i = 0; i < pat.second.size(); i++) | |||
1338 | { | |||
1339 | pattern_element_t cur = pat.second[i]; | |||
1340 | ||||
1341 | if(cur.left.name == left_sieve_name) | |||
1342 | { | |||
1343 | trans->linkStates(pattern_start_state, state, 0); | |||
1344 | continue; | |||
1345 | } | |||
1346 | else if(cur.left.name == right_sieve_name) | |||
1347 | { | |||
1348 | finals.push_back(state); | |||
1349 | continue; | |||
1350 | } | |||
1351 | ||||
1352 | bool isLex = isLexiconToken(cur); | |||
1353 | ||||
1354 | transition_index++; | |||
1355 | ||||
1356 | int mode_start = state; | |||
1357 | cur.mode = Normal; | |||
1358 | ||||
1359 | tag_filter_t current_tags; | |||
1360 | if(tagsAsFlags) | |||
1361 | { | |||
1362 | state = insertPreTags(trans, state, cur.tag_filter); | |||
1363 | current_tags = cur.tag_filter; | |||
1364 | cur.tag_filter = tag_filter_t(); | |||
1365 | } | |||
1366 | else | |||
1367 | { | |||
1368 | if (i == idx && !cur.tag_filter.combine(tok.tag_filter.pos())) { | |||
1369 | if (verbose) { | |||
1370 | cerr << "Warning: The tags of " << to_ustring(printPattern(tok)); | |||
1371 | cerr << " conflict with " << to_ustring(printPattern(pat.second[i])); | |||
1372 | cerr << " on line " << pat.first << "." << endl; | |||
1373 | } | |||
1374 | } | |||
1375 | if (!cur.tag_filter.combine(tok.tag_filter.neg())) { | |||
1376 | if (verbose) { | |||
1377 | cerr << "Warning: The tags of " << to_ustring(printPattern(tok)); | |||
1378 | cerr << " conflict with " << to_ustring(printPattern(pat.second[i])); | |||
1379 | cerr << " on line " << pat.first << "." << endl; | |||
1380 | } | |||
1381 | } | |||
1382 | } | |||
1383 | ||||
1384 | Transducer* t; | |||
1385 | if(shouldHypermin) | |||
1386 | { | |||
1387 | trans_sym_t inflag = getFlag(Positive, tok.left.name, transition_index); | |||
1388 | trans_sym_t outflag = getFlag(Require, tok.left.name, transition_index); | |||
1389 | int in_tr = (int)alphabet_lookup(inflag, inflag); | |||
1390 | int out_tr = (int)alphabet_lookup(outflag, outflag); | |||
1391 | if(is_free[i] == -1 && isLex) | |||
1392 | { | |||
1393 | to_clear.insert(cur.left.name); | |||
1394 | to_clear.insert(cur.right.name); | |||
1395 | } | |||
1396 | if(transducerLocs.find(cur) != transducerLocs.end()) | |||
1397 | { | |||
1398 | auto loc = transducerLocs[cur]; | |||
1399 | if(loc.first == loc.second) | |||
1400 | { | |||
1401 | t = NULL__null; | |||
1402 | } | |||
1403 | else | |||
1404 | { | |||
1405 | t = trans; | |||
1406 | trans->linkStates(state, loc.first, in_tr); | |||
1407 | state = trans->insertSingleTransduction(out_tr, loc.second); | |||
1408 | } | |||
1409 | } | |||
1410 | else | |||
1411 | { | |||
1412 | int start = trans->insertSingleTransduction(in_tr, state); | |||
1413 | int end = start; | |||
1414 | if(isLex) | |||
1415 | { | |||
1416 | t = getLexiconTransducerWithFlags(cur, false); | |||
1417 | if(t == NULL__null) | |||
1418 | { | |||
1419 | transducerLocs[cur] = make_pair(start, start); | |||
1420 | } | |||
1421 | else | |||
1422 | { | |||
1423 | end = trans->insertTransducer(start, *t); | |||
1424 | transducerLocs[cur] = make_pair(start, end); | |||
1425 | } | |||
1426 | } | |||
1427 | else | |||
1428 | { | |||
1429 | t = buildPatternWithFlags(cur, start); | |||
1430 | end = transducerLocs[cur].second; | |||
1431 | } | |||
1432 | state = trans->insertSingleTransduction(out_tr, end); | |||
1433 | } | |||
1434 | } | |||
1435 | else if(isLex) | |||
1436 | { | |||
1437 | t = getLexiconTransducerWithFlags(cur, (is_free[i] == 1)); | |||
1438 | if(is_free[i] == -1) | |||
1439 | { | |||
1440 | to_clear.insert(cur.left.name); | |||
1441 | to_clear.insert(cur.right.name); | |||
1442 | } | |||
1443 | } | |||
1444 | else | |||
1445 | { | |||
1446 | t = buildPatternWithFlags(cur); | |||
1447 | } | |||
1448 | if(t == NULL__null || (!shouldHypermin && t->hasNoFinals())) | |||
1449 | { | |||
1450 | got_null = true; | |||
1451 | break; | |||
1452 | } | |||
1453 | got_non_null = true; | |||
1454 | if(!shouldHypermin) | |||
1455 | { | |||
1456 | state = trans->insertTransducer(state, *t); | |||
1457 | } | |||
1458 | if(tagsAsFlags) | |||
1459 | { | |||
1460 | state = insertPostTags(trans, state, current_tags); | |||
1461 | } | |||
1462 | if(pat.second[i].mode & Optional) | |||
1463 | { | |||
1464 | trans->linkStates(mode_start, state, 0); | |||
1465 | } | |||
1466 | if(pat.second[i].mode & Repeated) | |||
1467 | { | |||
1468 | trans->linkStates(state, mode_start, 0); | |||
1469 | } | |||
1470 | } | |||
1471 | if(!got_null || finals.size() > 0) | |||
1472 | { | |||
1473 | for(auto fin : finals) | |||
1474 | { | |||
1475 | trans->linkStates(fin, state, 0); | |||
1476 | } | |||
1477 | for(auto lex : to_clear) | |||
1478 | { | |||
1479 | if(lex.empty()) | |||
1480 | { | |||
1481 | continue; | |||
1482 | } | |||
1483 | UnicodeString flag = "@C."; | |||
1484 | encodeFlag(flag, (int)lex.i); | |||
1485 | flag += "@"; | |||
1486 | trans_sym_t f = alphabet_lookup(flag); | |||
1487 | state = trans->insertSingleTransduction((int)alphabet_lookup(f, f), state); | |||
1488 | } | |||
1489 | trans->setFinal(state); | |||
1490 | pattern_finals.push_back(state); | |||
1491 | } | |||
1492 | } | |||
1493 | if(!got_non_null) | |||
1494 | { | |||
1495 | continue; | |||
1496 | } | |||
1497 | did_anything = true; | |||
1498 | } | |||
1499 | if(did_anything
| |||
1500 | { | |||
1501 | if(shouldHypermin) | |||
1502 | { | |||
1503 | if(pattern_finals.size() > 0) | |||
1504 | { | |||
1505 | int end = pattern_finals[0]; | |||
1506 | for(auto fin : pattern_finals) | |||
1507 | { | |||
1508 | if(fin != end) | |||
1509 | { | |||
1510 | trans->linkStates(fin, end, 0); | |||
1511 | } | |||
1512 | if(pattern_start_state != 0) | |||
1513 | { | |||
1514 | trans->setFinal(fin, 0, false); | |||
1515 | } | |||
1516 | } | |||
1517 | pattern_element_t key = tok; | |||
1518 | key.mode = Normal; | |||
1519 | transducerLocs[key] = make_pair(pattern_start_state, end); | |||
1520 | } | |||
1521 | } | |||
1522 | else | |||
1523 | { | |||
1524 | if(!trans->hasNoFinals()) { | |||
1525 | if (verbose) | |||
1526 | cerr << "Minimizing " << to_ustring(printPattern(tok)) << endl; | |||
1527 | trans->minimize(); | |||
1528 | } | |||
1529 | } | |||
1530 | } | |||
1531 | else | |||
1532 | { | |||
1533 | if(!shouldHypermin) | |||
1534 | trans = NULL__null; | |||
1535 | else | |||
1536 | { | |||
1537 | cerr << "FIXME" << endl; | |||
1538 | } | |||
1539 | } | |||
1540 | if (verbose) { | |||
| ||||
1541 | auto end_time = chrono::steady_clock::now(); | |||
1542 | chrono::duration<double> diff = end_time - start_time; | |||
1543 | cerr << "Done compiling " << to_ustring(printPattern(tok)); | |||
1544 | cerr << " in " << diff.count() << " seconds." << endl; | |||
1545 | } | |||
1546 | patternTransducers[tok] = trans; | |||
1547 | } | |||
1548 | else if(patternTransducers[tok] == NULL__null) | |||
1549 | { | |||
1550 | die("Cannot compile self-recursive pattern '%S'", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str())); | |||
1551 | } | |||
1552 | return patternTransducers[tok]; | |||
1553 | } | |||
1554 | ||||
1555 | void | |||
1556 | LexdCompiler::buildAllLexicons() | |||
1557 | { | |||
1558 | // find out if there are any lexicons that we can build without flags | |||
1559 | vector<pattern_element_t> lexicons_to_build; | |||
1560 | for(auto pattern : patterns) | |||
1561 | { | |||
1562 | for(auto pat : pattern.second) | |||
1563 | { | |||
1564 | lineNumber = pat.first; | |||
1565 | vector<int> free = determineFreedom(pat.second); | |||
1566 | for(size_t i = 0; i < pat.second.size(); i++) | |||
1567 | { | |||
1568 | if(pat.second[i].left.name == left_sieve_name || | |||
1569 | pat.second[i].left.name == right_sieve_name) | |||
1570 | { | |||
1571 | continue; | |||
1572 | } | |||
1573 | if(isLexiconToken(pat.second[i])) | |||
1574 | { | |||
1575 | pattern_element_t& tok = pat.second[i]; | |||
1576 | if(free[i] == -1) | |||
1577 | { | |||
1578 | lexiconFreedom[tok.left.name] = false; | |||
1579 | lexiconFreedom[tok.right.name] = false; | |||
1580 | } | |||
1581 | else | |||
1582 | { | |||
1583 | if(lexiconFreedom.find(tok.left.name) == lexiconFreedom.end()) | |||
1584 | { | |||
1585 | lexiconFreedom[tok.left.name] = true; | |||
1586 | } | |||
1587 | if(lexiconFreedom.find(tok.right.name) == lexiconFreedom.end()) | |||
1588 | { | |||
1589 | lexiconFreedom[tok.right.name] = true; | |||
1590 | } | |||
1591 | } | |||
1592 | lexicons_to_build.push_back(tok); | |||
1593 | } | |||
1594 | } | |||
1595 | } | |||
1596 | } | |||
1597 | lexiconFreedom[string_ref(0)] = true; | |||
1598 | for(auto tok : lexicons_to_build) | |||
1599 | { | |||
1600 | tok.tag_filter = tag_filter_t(); | |||
1601 | tok.mode = Normal; | |||
1602 | bool free = ((tok.left.name.empty() || lexiconFreedom[tok.left.name]) && | |||
1603 | (tok.right.name.empty() || lexiconFreedom[tok.right.name])); | |||
1604 | getLexiconTransducerWithFlags(tok, free); | |||
1605 | } | |||
1606 | } | |||
1607 | ||||
1608 | int | |||
1609 | LexdCompiler::buildPatternSingleLexicon(pattern_element_t tok, int start_state) | |||
1610 | { | |||
1611 | if(patternTransducers.find(tok) == patternTransducers.end() || patternTransducers[tok] != NULL__null) | |||
1612 | { | |||
1613 | patternTransducers[tok] = NULL__null; | |||
1614 | int end = -1; | |||
1615 | string_ref transition_flag = internName(" "); | |||
1616 | for(auto pattern : patterns[tok.left.name]) | |||
1617 | { | |||
1618 | int next_start_state = start_state; | |||
1619 | size_t next_start_idx = 0; | |||
1620 | lineNumber = pattern.first; | |||
1621 | set<string_ref> to_clear; | |||
1622 | size_t count = (tok.tag_filter.pos().empty() ? 1 : pattern.second.size()); | |||
1623 | for(size_t tag_idx = 0; tag_idx < count; tag_idx++) | |||
1624 | { | |||
1625 | int state = next_start_state; | |||
1626 | bool finished = true; | |||
1627 | for(size_t i = next_start_idx; i < pattern.second.size(); i++) | |||
1628 | { | |||
1629 | pattern_element_t cur = pattern.second[i]; | |||
1630 | ||||
1631 | if(cur.left.name == left_sieve_name) | |||
1632 | { | |||
1633 | hyperminTrans->linkStates(start_state, state, 0); | |||
1634 | continue; | |||
1635 | } | |||
1636 | else if(cur.left.name == right_sieve_name) | |||
1637 | { | |||
1638 | if(end == -1) | |||
1639 | { | |||
1640 | end = hyperminTrans->insertNewSingleTransduction(0, state); | |||
1641 | } | |||
1642 | else | |||
1643 | { | |||
1644 | hyperminTrans->linkStates(state, end, 0); | |||
1645 | } | |||
1646 | continue; | |||
1647 | } | |||
1648 | ||||
1649 | if(i == tag_idx) | |||
1650 | { | |||
1651 | next_start_state = state; | |||
1652 | next_start_idx = tag_idx; | |||
1653 | cur.tag_filter.combine(tok.tag_filter.pos()); | |||
1654 | } | |||
1655 | cur.tag_filter.combine(tok.tag_filter.neg()); | |||
1656 | ||||
1657 | int mode_state = state; | |||
1658 | ||||
1659 | if(isLexiconToken(cur)) | |||
1660 | { | |||
1661 | tags_t tags = cur.tag_filter.tags(); | |||
1662 | for(auto tag : tags) | |||
1663 | { | |||
1664 | trans_sym_t flag = getFlag(Clear, tag, 0); | |||
1665 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
1666 | } | |||
1667 | pattern_element_t untagged = cur; | |||
1668 | untagged.tag_filter = tag_filter_t(); | |||
1669 | untagged.mode = Normal; | |||
1670 | bool free = (lexiconFreedom[cur.left.name] && lexiconFreedom[cur.right.name]); | |||
1671 | if(!free) | |||
1672 | { | |||
1673 | to_clear.insert(cur.left.name); | |||
1674 | to_clear.insert(cur.right.name); | |||
1675 | } | |||
1676 | trans_sym_t inflag = getFlag(Positive, transition_flag, transitionCount); | |||
1677 | trans_sym_t outflag = getFlag(Require, transition_flag, transitionCount); | |||
1678 | transitionCount++; | |||
1679 | if(transducerLocs.find(untagged) == transducerLocs.end()) | |||
1680 | { | |||
1681 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(inflag, inflag), state); | |||
1682 | Transducer* lex = getLexiconTransducerWithFlags(untagged, free); | |||
1683 | int start = state; | |||
1684 | state = hyperminTrans->insertTransducer(state, *lex); | |||
1685 | transducerLocs[untagged] = make_pair(start, state); | |||
1686 | } | |||
1687 | else | |||
1688 | { | |||
1689 | auto loc = transducerLocs[untagged]; | |||
1690 | hyperminTrans->linkStates(state, loc.first, (int)alphabet_lookup(inflag, inflag)); | |||
1691 | state = loc.second; | |||
1692 | } | |||
1693 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(outflag, outflag), state); | |||
1694 | for(auto tag : cur.tag_filter.pos()) | |||
1695 | { | |||
1696 | trans_sym_t flag = getFlag(Require, tag, 1); | |||
1697 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
1698 | } | |||
1699 | for(auto tag : cur.tag_filter.neg()) | |||
1700 | { | |||
1701 | trans_sym_t flag = getFlag(Disallow, tag, 1); | |||
1702 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
1703 | } | |||
1704 | } | |||
1705 | else | |||
1706 | { | |||
1707 | state = buildPatternSingleLexicon(cur, state); | |||
1708 | if(state == -1) | |||
1709 | { | |||
1710 | finished = false; | |||
1711 | break; | |||
1712 | } | |||
1713 | } | |||
1714 | ||||
1715 | if(cur.mode & Optional) | |||
1716 | { | |||
1717 | hyperminTrans->linkStates(mode_state, state, 0); | |||
1718 | } | |||
1719 | if(cur.mode & Repeated) | |||
1720 | { | |||
1721 | hyperminTrans->linkStates(state, mode_state, 0); | |||
1722 | } | |||
1723 | } | |||
1724 | if(finished) | |||
1725 | { | |||
1726 | for(auto lex : to_clear) | |||
1727 | { | |||
1728 | if(lex.empty()) | |||
1729 | { | |||
1730 | continue; | |||
1731 | } | |||
1732 | trans_sym_t flag = getFlag(Clear, lex, 0); | |||
1733 | state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
1734 | } | |||
1735 | if(end == -1) | |||
1736 | { | |||
1737 | end = state; | |||
1738 | } | |||
1739 | else | |||
1740 | { | |||
1741 | hyperminTrans->linkStates(state, end, 0); | |||
1742 | } | |||
1743 | } | |||
1744 | } | |||
1745 | } | |||
1746 | patternTransducers.erase(tok); | |||
1747 | return end; | |||
1748 | } | |||
1749 | else | |||
1750 | { | |||
1751 | die("Cannot compile self-recursive pattern '%S'", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str())); | |||
1752 | return 0; | |||
1753 | } | |||
1754 | } | |||
1755 | ||||
1756 | void | |||
1757 | LexdCompiler::readFile(UFILE* infile) | |||
1758 | { | |||
1759 | input = infile; | |||
1760 | doneReading = false; | |||
1761 | while(!u_feofu_feof_72(input)) | |||
1762 | { | |||
1763 | processNextLine(); | |||
1764 | if(doneReading) break; | |||
1765 | } | |||
1766 | finishLexicon(); | |||
1767 | } | |||
1768 | ||||
1769 | Transducer* | |||
1770 | LexdCompiler::buildTransducer(bool usingFlags) | |||
1771 | { | |||
1772 | token_t start_tok = {.name = internName(" "), .part = 1, .optional = false}; | |||
1773 | pattern_element_t start_pat = {.left=start_tok, .right=start_tok, | |||
1774 | .tag_filter=tag_filter_t(), | |||
1775 | .mode=Normal}; | |||
1776 | if(usingFlags) | |||
1777 | { | |||
1778 | if(shouldHypermin) | |||
1779 | { | |||
1780 | hyperminTrans = new Transducer(); | |||
1781 | } | |||
1782 | Transducer *t = buildPatternWithFlags(start_pat); | |||
1783 | if(shouldHypermin) | |||
1784 | t->minimize(); | |||
1785 | return t; | |||
1786 | } | |||
1787 | else return buildPattern(start_pat); | |||
1788 | } | |||
1789 | ||||
1790 | Transducer* | |||
1791 | LexdCompiler::buildTransducerSingleLexicon() | |||
1792 | { | |||
1793 | tagsAsMinFlags = true; | |||
1794 | token_t start_tok = {.name = internName(" "), .part = 1, .optional = false}; | |||
1795 | pattern_element_t start_pat = {.left=start_tok, .right=start_tok, | |||
1796 | .tag_filter=tag_filter_t(), | |||
1797 | .mode=Normal}; | |||
1798 | hyperminTrans = new Transducer(); | |||
1799 | buildAllLexicons(); | |||
1800 | int end = buildPatternSingleLexicon(start_pat, 0); | |||
1801 | if(end == -1) | |||
1802 | { | |||
1803 | cerr << "WARNING: No non-empty patterns found." << endl; | |||
1804 | } | |||
1805 | else { | |||
1806 | hyperminTrans->setFinal(end); | |||
1807 | hyperminTrans->minimize(); | |||
1808 | } | |||
1809 | return hyperminTrans; | |||
1810 | } | |||
1811 | ||||
1812 | void expand_alternation(vector<pattern_t> &pats, const vector<pattern_element_t> &alternation) | |||
1813 | { | |||
1814 | if(alternation.empty()) | |||
1815 | return; | |||
1816 | if(pats.empty()) | |||
1817 | pats.push_back(pattern_t()); | |||
1818 | vector<pattern_t> new_pats; | |||
1819 | for(const auto &pat: pats) | |||
1820 | { | |||
1821 | for(const auto &tok: alternation) | |||
1822 | { | |||
1823 | auto pat1 = pat; | |||
1824 | pat1.push_back(tok); | |||
1825 | new_pats.push_back(pat1); | |||
1826 | } | |||
1827 | } | |||
1828 | pats = new_pats; | |||
1829 | } | |||
1830 | ||||
1831 | void | |||
1832 | LexdCompiler::insertEntry(Transducer* trans, const lex_seg_t &seg) | |||
1833 | { | |||
1834 | int state = trans->getInitial(); | |||
1835 | if(tagsAsFlags) | |||
1836 | { | |||
1837 | for(string_ref tag : seg.tags) | |||
1838 | { | |||
1839 | trans_sym_t check1 = getFlag(Require, tag, 1); | |||
1840 | trans_sym_t check2 = getFlag(Disallow, tag, 2); | |||
1841 | trans_sym_t clear = getFlag(Clear, tag, 0); | |||
1842 | int state2 = trans->insertSingleTransduction((int)alphabet_lookup(check1, check1), state); | |||
1843 | int state3 = trans->insertSingleTransduction((int)alphabet_lookup(clear, clear), state2); | |||
1844 | trans->linkStates(state, state3, 0); | |||
1845 | state = trans->insertSingleTransduction((int)alphabet_lookup(check2, check2), state3); | |||
1846 | } | |||
1847 | } | |||
1848 | else if(tagsAsMinFlags) | |||
1849 | { | |||
1850 | for(string_ref tag : seg.tags) | |||
1851 | { | |||
1852 | trans_sym_t flag = getFlag(Positive, tag, 1); | |||
1853 | state = trans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state); | |||
1854 | } | |||
1855 | } | |||
1856 | if (seg.regex != nullptr) { | |||
1857 | state = trans->insertTransducer(state, *seg.regex); | |||
1858 | } | |||
1859 | if(!shouldAlign) | |||
1860 | { | |||
1861 | for(unsigned int i = 0; i < seg.left.symbols.size() || i < seg.right.symbols.size(); i++) | |||
1862 | { | |||
1863 | trans_sym_t l = (i < seg.left.symbols.size()) ? seg.left.symbols[i] : trans_sym_t(); | |||
1864 | trans_sym_t r = (i < seg.right.symbols.size()) ? seg.right.symbols[i] : trans_sym_t(); | |||
1865 | state = trans->insertSingleTransduction(alphabet((int)l, (int)r), state); | |||
1866 | } | |||
1867 | } | |||
1868 | else | |||
1869 | { | |||
1870 | /* | |||
1871 | This code is adapted from hfst/libhfst/src/parsers/lexc-utils.cc | |||
1872 | It uses the Levenshtein distance algorithm to determine the optimal | |||
1873 | alignment of two strings. | |||
1874 | In hfst-lexc, the priority order for ties is SUB > DEL > INS | |||
1875 | which ensures that 000abc:xyz000 is preferred over abc000:000xyz | |||
1876 | However, we're traversing the strings backwards to simplify extracting | |||
1877 | the final alignment, so we need to switch INS and DEL. | |||
1878 | If shouldCompress is true, we set the cost of SUB to 1 in order to prefer | |||
1879 | a:b over 0:b a:0 without changing the alignment of actual correspondences. | |||
1880 | */ | |||
1881 | const unsigned int INS = 0; | |||
1882 | const unsigned int DEL = 1; | |||
1883 | const unsigned int SUB = 2; | |||
1884 | const unsigned int ins_cost = 1; | |||
1885 | const unsigned int del_cost = 1; | |||
1886 | const unsigned int sub_cost = (shouldCompress ? 1 : 100); | |||
1887 | ||||
1888 | const unsigned int len1 = seg.left.symbols.size(); | |||
1889 | const unsigned int len2 = seg.right.symbols.size(); | |||
1890 | unsigned int cost[len1+1][len2+1]; | |||
1891 | unsigned int path[len1+1][len2+1]; | |||
1892 | cost[0][0] = 0; | |||
1893 | path[0][0] = 0; | |||
1894 | for(unsigned int i = 1; i <= len1; i++) | |||
1895 | { | |||
1896 | cost[i][0] = del_cost * i; | |||
1897 | path[i][0] = DEL; | |||
1898 | } | |||
1899 | for(unsigned int i = 1; i <= len2; i++) | |||
1900 | { | |||
1901 | cost[0][i] = ins_cost * i; | |||
1902 | path[0][i] = INS; | |||
1903 | } | |||
1904 | ||||
1905 | for(unsigned int i = 1; i <= len1; i++) | |||
1906 | { | |||
1907 | for(unsigned int j = 1; j <= len2; j++) | |||
1908 | { | |||
1909 | unsigned int sub = cost[i-1][j-1] + (seg.left.symbols[len1-i] == seg.right.symbols[len2-j] ? 0 : sub_cost); | |||
1910 | unsigned int ins = cost[i][j-1] + ins_cost; | |||
1911 | unsigned int del = cost[i-1][j] + del_cost; | |||
1912 | ||||
1913 | if(sub <= ins && sub <= del) | |||
1914 | { | |||
1915 | cost[i][j] = sub; | |||
1916 | path[i][j] = SUB; | |||
1917 | } | |||
1918 | else if(ins <= del) | |||
1919 | { | |||
1920 | cost[i][j] = ins; | |||
1921 | path[i][j] = INS; | |||
1922 | } | |||
1923 | else | |||
1924 | { | |||
1925 | cost[i][j] = del; | |||
1926 | path[i][j] = DEL; | |||
1927 | } | |||
1928 | } | |||
1929 | } | |||
1930 | ||||
1931 | for(unsigned int x = len1, y = len2; (x > 0) || (y > 0);) | |||
1932 | { | |||
1933 | trans_sym_t symbol; | |||
1934 | switch(path[x][y]) | |||
1935 | { | |||
1936 | case SUB: | |||
1937 | symbol = alphabet_lookup(seg.left.symbols[len1-x], seg.right.symbols[len2-y]); | |||
1938 | x--; | |||
1939 | y--; | |||
1940 | break; | |||
1941 | case INS: | |||
1942 | symbol = alphabet_lookup(trans_sym_t(), seg.right.symbols[len2-y]); | |||
1943 | y--; | |||
1944 | break; | |||
1945 | default: // DEL | |||
1946 | symbol = alphabet_lookup(seg.left.symbols[len1-x], trans_sym_t()); | |||
1947 | x--; | |||
1948 | } | |||
1949 | state = trans->insertSingleTransduction((int)symbol, state); | |||
1950 | } | |||
1951 | } | |||
1952 | trans->setFinal(state); | |||
1953 | } | |||
1954 | ||||
1955 | void | |||
1956 | LexdCompiler::applyMode(Transducer* trans, RepeatMode mode) | |||
1957 | { | |||
1958 | if(mode == Question) | |||
1959 | trans->optional(); | |||
1960 | else if(mode == Star) | |||
1961 | trans->zeroOrMore(); | |||
1962 | else if(mode == Plus) | |||
1963 | trans->oneOrMore(); | |||
1964 | } | |||
1965 | ||||
1966 | Transducer* | |||
1967 | LexdCompiler::getLexiconTransducer(pattern_element_t tok, unsigned int entry_index, bool free) | |||
1968 | { | |||
1969 | if(!free && entryTransducers.find(tok) != entryTransducers.end()) | |||
1970 | return entryTransducers[tok][entry_index]; | |||
1971 | if(free && lexiconTransducers.find(tok) != lexiconTransducers.end()) | |||
1972 | return lexiconTransducers[tok]; | |||
1973 | ||||
1974 | vector<entry_t>& lents = lexicons[tok.left.name]; | |||
1975 | if(tok.left.name.valid() && tok.left.part > lents[0].size()) | |||
1976 | die("%S(%d) - part is out of range", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), tok.left.part); | |||
1977 | vector<entry_t>& rents = lexicons[tok.right.name]; | |||
1978 | if(tok.right.name.valid() && tok.right.part > rents[0].size()) | |||
1979 | die("%S(%d) - part is out of range", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), tok.right.part); | |||
1980 | if(tok.left.name.valid() && tok.right.name.valid() && lents.size() != rents.size()) | |||
1981 | die("Cannot collate %S with %S - differing numbers of entries", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
1982 | unsigned int count = (tok.left.name.valid() ? lents.size() : rents.size()); | |||
1983 | vector<Transducer*> trans; | |||
1984 | if(free) | |||
1985 | trans.push_back(new Transducer()); | |||
1986 | else | |||
1987 | trans.reserve(count); | |||
1988 | lex_seg_t empty; | |||
1989 | bool did_anything = false; | |||
1990 | for(unsigned int i = 0; i < count; i++) | |||
1991 | { | |||
1992 | lex_seg_t& le = (tok.left.name.valid() ? lents[i][tok.left.part-1] : empty); | |||
1993 | lex_seg_t& re = (tok.right.name.valid() ? rents[i][tok.right.part-1] : empty); | |||
1994 | tags_t tags = unionset(le.tags, re.tags); | |||
1995 | if(!tok.tag_filter.compatible(tags)) | |||
1996 | { | |||
1997 | if(!free) | |||
1998 | trans.push_back(NULL__null); | |||
1999 | continue; | |||
2000 | } | |||
2001 | Transducer* t = free ? trans[0] : new Transducer(); | |||
2002 | if (le.regex != nullptr || re.regex != nullptr) { | |||
2003 | if (tok.left.name.empty()) | |||
2004 | die("Cannot use %S one-sided - it contains a regex", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
2005 | if (tok.right.name.empty()) | |||
2006 | die("Cannot use %S one-sided - it contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str())); | |||
2007 | if (tok.left.name != tok.right.name) | |||
2008 | die("Cannot collate %S with %S - %S contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), err(name((le.regex != nullptr ? tok.left.name : tok.right.name)))(to_ustring(name((le.regex != nullptr ? tok.left.name : tok.right .name))).c_str())); | |||
2009 | } | |||
2010 | insertEntry(t, {.left=le.left, .right=re.right, .regex=le.regex, .tags=tags}); | |||
2011 | did_anything = true; | |||
2012 | if(!free) | |||
2013 | { | |||
2014 | applyMode(t, tok.mode); | |||
2015 | trans.push_back(t); | |||
2016 | } | |||
2017 | } | |||
2018 | if(tok.optional()) { | |||
2019 | Transducer* t = free ? trans[0] : new Transducer(); | |||
2020 | tags_t empty_tags; | |||
2021 | insertEntry(t, {.left=empty.left, .right=empty.right, .regex=nullptr, .tags=empty_tags}); | |||
2022 | did_anything = true; | |||
2023 | if (!free) { | |||
2024 | applyMode(t, tok.mode); | |||
2025 | trans.push_back(t); | |||
2026 | } | |||
2027 | did_anything = true; | |||
2028 | } | |||
2029 | if(free) | |||
2030 | { | |||
2031 | if(!did_anything) | |||
2032 | { | |||
2033 | trans[0] = NULL__null; | |||
2034 | } | |||
2035 | if(trans[0]) | |||
2036 | { | |||
2037 | trans[0]->minimize(); | |||
2038 | applyMode(trans[0], tok.mode); | |||
2039 | } | |||
2040 | lexiconTransducers[tok] = trans[0]; | |||
2041 | return trans[0]; | |||
2042 | } | |||
2043 | else | |||
2044 | { | |||
2045 | entryTransducers[tok] = trans; | |||
2046 | return trans[entry_index]; | |||
2047 | } | |||
2048 | } | |||
2049 | ||||
2050 | void | |||
2051 | LexdCompiler::encodeFlag(UnicodeString& str, int flag) | |||
2052 | { | |||
2053 | UnicodeString letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; | |||
2054 | int num = flag; | |||
2055 | while(num > 0) | |||
2056 | { | |||
2057 | str += letters[num % 26]; | |||
2058 | num /= 26; | |||
2059 | } | |||
2060 | } | |||
2061 | ||||
2062 | trans_sym_t | |||
2063 | LexdCompiler::getFlag(FlagDiacriticType type, string_ref flag, unsigned int value) | |||
2064 | { | |||
2065 | //cerr << "getFlag(" << type << ", " << to_ustring(name(flag)) << ", " << value << ")" << endl; | |||
2066 | UnicodeString flagstr = "@"; | |||
2067 | switch(type) | |||
2068 | { | |||
2069 | case Unification: | |||
2070 | //cerr << " Unification" << endl; | |||
2071 | flagstr += "U."; break; | |||
2072 | case Positive: | |||
2073 | //cerr << " Positive" << endl; | |||
2074 | flagstr += "P."; break; | |||
2075 | case Negative: | |||
2076 | //cerr << " Negative" << endl; | |||
2077 | flagstr += "N."; break; | |||
2078 | case Require: | |||
2079 | //cerr << " Require" << endl; | |||
2080 | flagstr += "R."; break; | |||
2081 | case Disallow: | |||
2082 | //cerr << " Disallow" << endl; | |||
2083 | flagstr += "D."; break; | |||
2084 | case Clear: | |||
2085 | //cerr << " Clear" << endl; | |||
2086 | flagstr += "C."; break; | |||
2087 | } | |||
2088 | encodeFlag(flagstr, (int)flag.i); | |||
2089 | if(type != Clear) | |||
2090 | { | |||
2091 | flagstr += "."; | |||
2092 | encodeFlag(flagstr, (int)(value + 1)); | |||
2093 | } | |||
2094 | flagstr += "@"; | |||
2095 | return alphabet_lookup(flagstr); | |||
2096 | } | |||
2097 | ||||
2098 | Transducer* | |||
2099 | LexdCompiler::getLexiconTransducerWithFlags(pattern_element_t& tok, bool free) | |||
2100 | { | |||
2101 | if(!free && entryTransducers.find(tok) != entryTransducers.end()) | |||
2102 | return entryTransducers[tok][0]; | |||
2103 | if(free && lexiconTransducers.find(tok) != lexiconTransducers.end()) | |||
2104 | return lexiconTransducers[tok]; | |||
2105 | ||||
2106 | // TODO: can this be abstracted from here and getLexiconTransducer()? | |||
2107 | vector<entry_t>& lents = lexicons[tok.left.name]; | |||
2108 | if(tok.left.name.valid() && tok.left.part > lents[0].size()) | |||
2109 | die("%S(%d) - part is out of range", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), tok.left.part); | |||
2110 | vector<entry_t>& rents = lexicons[tok.right.name]; | |||
2111 | if(tok.right.name.valid() && tok.right.part > rents[0].size()) | |||
2112 | die("%S(%d) - part is out of range", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), tok.right.part); | |||
2113 | if(tok.left.name.valid() && tok.right.name.valid() && lents.size() != rents.size()) | |||
2114 | die("Cannot collate %S with %S - differing numbers of entries", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
2115 | unsigned int count = (tok.left.name.valid() ? lents.size() : rents.size()); | |||
2116 | Transducer* trans = new Transducer(); | |||
2117 | lex_seg_t empty; | |||
2118 | bool did_anything = false; | |||
2119 | for(unsigned int i = 0; i < count; i++) | |||
2120 | { | |||
2121 | lex_seg_t& le = (tok.left.name.valid() ? lents[i][tok.left.part-1] : empty); | |||
2122 | lex_seg_t& re = (tok.right.name.valid() ? rents[i][tok.right.part-1] : empty); | |||
2123 | tags_t tags = unionset(le.tags, re.tags); | |||
2124 | if(!tok.tag_filter.compatible(tags)) | |||
2125 | { | |||
2126 | continue; | |||
2127 | } | |||
2128 | did_anything = true; | |||
2129 | lex_seg_t seg; | |||
2130 | if (le.regex != nullptr || re.regex != nullptr) { | |||
2131 | if (tok.left.name.empty()) | |||
2132 | die("Cannot use %S one-sided - it contains a regex", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str())); | |||
2133 | if (tok.right.name.empty()) | |||
2134 | die("Cannot use %S one-sided - it contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str())); | |||
2135 | if (tok.left.name != tok.right.name) | |||
2136 | die("Cannot collate %S with %S - %S contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), err(name((le.regex != nullptr ? tok.left.name : tok.right.name)))(to_ustring(name((le.regex != nullptr ? tok.left.name : tok.right .name))).c_str())); | |||
2137 | seg.regex = le.regex; | |||
2138 | } | |||
2139 | if(!free && tok.left.name.valid()) | |||
2140 | { | |||
2141 | trans_sym_t flag = getFlag(Unification, tok.left.name, i); | |||
2142 | seg.left.symbols.push_back(flag); | |||
2143 | seg.right.symbols.push_back(flag); | |||
2144 | } | |||
2145 | if(!free && tok.right.name.valid() && tok.right.name != tok.left.name) | |||
2146 | { | |||
2147 | trans_sym_t flag = getFlag(Unification, tok.right.name, i); | |||
2148 | seg.left.symbols.push_back(flag); | |||
2149 | seg.right.symbols.push_back(flag); | |||
2150 | } | |||
2151 | if(tok.left.name.valid()) | |||
2152 | { | |||
2153 | seg.left.symbols.insert(seg.left.symbols.end(), le.left.symbols.begin(), le.left.symbols.end()); | |||
2154 | } | |||
2155 | if(tok.right.name.valid()) | |||
2156 | { | |||
2157 | seg.right.symbols.insert(seg.right.symbols.end(), re.right.symbols.begin(), re.right.symbols.end()); | |||
2158 | } | |||
2159 | seg.tags.insert(tags.begin(), tags.end()); | |||
2160 | insertEntry(trans, seg); | |||
2161 | } | |||
2162 | if(tok.optional()) { | |||
2163 | lex_seg_t seg; | |||
2164 | if (!free && tok.left.name.valid()) { | |||
2165 | trans_sym_t flag = getFlag(Unification, tok.left.name, count); | |||
2166 | seg.left.symbols.push_back(flag); | |||
2167 | seg.right.symbols.push_back(flag); | |||
2168 | } | |||
2169 | if (!free && tok.right.name.valid() && tok.right.name != tok.left.name) { | |||
2170 | trans_sym_t flag = getFlag(Unification, tok.right.name, count); | |||
2171 | seg.left.symbols.push_back(flag); | |||
2172 | seg.right.symbols.push_back(flag); | |||
2173 | } | |||
2174 | insertEntry(trans, seg); | |||
2175 | } | |||
2176 | if(did_anything) | |||
2177 | { | |||
2178 | trans->minimize(); | |||
2179 | applyMode(trans, tok.mode); | |||
2180 | } | |||
2181 | else | |||
2182 | { | |||
2183 | trans = NULL__null; | |||
2184 | } | |||
2185 | if(free) | |||
2186 | { | |||
2187 | lexiconTransducers[tok] = trans; | |||
2188 | } | |||
2189 | else | |||
2190 | { | |||
2191 | entryTransducers[tok] = vector<Transducer*>(1, trans); | |||
2192 | } | |||
2193 | return trans; | |||
2194 | } | |||
2195 | ||||
2196 | void | |||
2197 | LexdCompiler::printStatistics() const | |||
2198 | { | |||
2199 | cerr << "Lexicons: " << lexicons.size() << endl; | |||
2200 | cerr << "Lexicon entries: "; | |||
2201 | unsigned int x = 0; | |||
2202 | for(const auto &lex: lexicons) | |||
2203 | x += lex.second.size(); | |||
2204 | cerr << x << endl; | |||
2205 | x = 0; | |||
2206 | cerr << "Patterns: " << patterns.size() << endl; | |||
2207 | cerr << "Pattern entries: "; | |||
2208 | for(const auto &pair: patterns) | |||
2209 | x += pair.second.size(); | |||
2210 | cerr << x << endl; | |||
2211 | cerr << endl; | |||
2212 | cerr << "Counts for individual lexicons:" << endl; | |||
2213 | unsigned int anon = 0; | |||
2214 | for(const auto &lex: lexicons) | |||
2215 | { | |||
2216 | if(empty(lex.first)) continue; | |||
2217 | UString n = to_ustring(name(lex.first)); | |||
2218 | if(n[0] == ' ') anon += lex.second.size(); | |||
2219 | else cerr << n << ": " << lex.second.size() << endl; | |||
2220 | } | |||
2221 | cerr << "All anonymous lexicons: " << anon << endl; | |||
2222 | } |