Bug Summary

File:lexdcompiler.cc
Warning:line 2022, column 5
Value stored to 'did_anything' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name lexdcompiler.cc -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/lexd/lexd-1.3.5+g175~98d02cec/src -resource-dir /usr/lib/llvm-16/lib/clang/16 -D PACKAGE_NAME="lexd" -D PACKAGE_TARNAME="lexd" -D PACKAGE_VERSION="1.3.5" -D PACKAGE_STRING="lexd 1.3.5" -D PACKAGE_BUGREPORT="awesomeevildudes@gmail.com" -D PACKAGE_URL="" -D PACKAGE="lexd" -D VERSION="1.3.5" -D HAVE_GETOPT_LONG=1 -I . -I /usr/local/include -I /usr/local/include -internal-isystem /usr/lib/llvm-16/bin/../include/c++/v1 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -std=c++2b -fdeprecated-macro -fdebug-compilation-dir=/tmp/build/lexd/lexd-1.3.5+g175~98d02cec/src -ferror-limit 19 -fgnuc-version=4.2.1 -fno-implicit-modules -fcxx-exceptions -fexceptions -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/lexd/scan-build/2024-09-11-161239-16607-1 -x c++ lexdcompiler.cc
1#include "lexdcompiler.h"
2#include <unicode/unistr.h>
3#include <memory>
4#include <chrono>
5#include <lttoolbox/string_utils.h>
6
7using namespace icu;
8using namespace std;
9
10bool tag_filter_t::combinable(const tag_filter_t &other) const
11{
12 // TODO: make ops combinable even with non-empty filters?
13 if(empty() || other.empty())
14 return true;
15 return intersectset(pos(), other.neg()).empty() && intersectset(other.pos(), neg()).empty() && ops().empty() && other.ops().empty();
16}
17bool tag_filter_t::combine(const tag_filter_t &other)
18{
19 if(!combinable(other))
20 return false;
21 unionset_inplace(_pos, other._pos);
22 unionset_inplace(_neg, other._neg);
23 for(const auto &op: other._ops)
24 _ops.push_back(op);
25 return true;
26}
27
28void expand_alternation(vector<pattern_t> &pats, const vector<pattern_element_t> &alternation);
29vector<pattern_element_t> distribute_tag_expressions(const pattern_element_t &token)
30{
31 vector<pattern_element_t> result;
32 for(const auto &f: token.tag_filter.distribute())
33 {
34 pattern_element_t new_token = token;
35 new_token.tag_filter = f;
36 result.push_back(new_token);
37 }
38 return result;
39}
40
41bool tag_filter_t::compatible(const tags_t &tags) const
42{
43 return subset(pos(), tags) && intersectset(neg(), tags).empty() && ops().empty();
44}
45bool tag_filter_t::applicable(const tags_t &tags) const
46{
47 return subset(neg(), tags) && ops().empty();
48}
49bool tag_filter_t::try_apply(tags_t &tags) const
50{
51 if(!applicable(tags))
52 return false;
53 subtractset_inplace(tags, neg());
54 unionset_inplace(tags, pos());
55 return true;
56}
57bool pattern_element_t::compatible(const lex_seg_t &tok) const
58{
59 return left.name.empty() || right.name.empty() || tag_filter.compatible(tok.tags);
60}
61const UnicodeString &LexdCompiler::name(string_ref r) const
62{
63 return id_to_name[(unsigned int)r];
64}
65
66UnicodeString LexdCompiler::printPattern(const pattern_element_t& pat)
67{
68 UnicodeString ret = name(pat.left.name);
69 auto& pos = pat.tag_filter.pos();
70 auto& neg = pat.tag_filter.neg();
71 if (!pos.empty() || !neg.empty()) {
72 ret += '[';
73 int ln = ret.length();
74 for (auto& it : pos) {
75 if (ret.length() > ln) ret += ',';
76 ret += name(it);
77 }
78 for (auto& it : neg) {
79 if (ret.length() > ln) ret += ',';
80 ret += '-';
81 ret += name(it);
82 }
83 ret += ']';
84 }
85 switch (pat.mode) {
86 case Question:
87 ret += '?'; break;
88 case Plus:
89 ret += '+'; break;
90 case Star:
91 ret += '*'; break;
92 default: break;
93 }
94 return ret;
95}
96
97UnicodeString LexdCompiler::printFilter(const tag_filter_t& filter)
98{
99 UnicodeString ret;
100 ret += '[';
101 for (auto& it : filter.pos()) {
102 if (ret.length() > 1) ret += ',';
103 ret += name(it);
104 }
105 for (auto& it : filter.neg()) {
106 if (ret.length() > 1) ret += ',';
107 ret += '-';
108 ret += name(it);
109 }
110 for (auto& op : filter.ops()) {
111 if (ret.length() > 1) ret += ',';
112 ret += op->sigil;
113 ret += '[';
114 int ln = ret.length();
115 for (auto& it : *op) {
116 if (ret.length() > ln) ret += ',';
117 ret += name(it);
118 }
119 ret += ']';
120 }
121 ret += ']';
122 return ret;
123}
124
125trans_sym_t LexdCompiler::alphabet_lookup(const UnicodeString &symbol)
126{
127 if (!symbol.hasMoreChar32Than(0, symbol.length(), 1)) {
128 return trans_sym_t((int)symbol.char32At(0));
129 } else {
130 UString temp;
131 temp.append(symbol.getBuffer(), (unsigned int)symbol.length());
132 alphabet.includeSymbol(temp);
133 return trans_sym_t(alphabet(temp));
134 }
135}
136trans_sym_t LexdCompiler::alphabet_lookup(trans_sym_t l, trans_sym_t r)
137{
138 return trans_sym_t(alphabet((int)l, (int)r));
139}
140
141LexdCompiler::LexdCompiler()
142{
143 id_to_name.push_back("");
144 name_to_id[""] = string_ref(0);
145 lexicons[string_ref(0)] = vector<entry_t>();
146
147 left_sieve_name = internName("<");
148 token_t lsieve_tok = {.name=left_sieve_name, .part=1, .optional=false};
149 pattern_element_t lsieve_elem = {.left=lsieve_tok, .right=lsieve_tok,
150 .tag_filter=tag_filter_t(),
151 .mode=Normal};
152 left_sieve_tok = vector<pattern_element_t>(1, lsieve_elem);
153
154 right_sieve_name = internName(">");
155 token_t rsieve_tok = {.name=right_sieve_name, .part=1, .optional=false};
156 pattern_element_t rsieve_elem = {.left=rsieve_tok, .right=rsieve_tok,
157 .tag_filter=tag_filter_t(),
158 .mode=Normal};
159 right_sieve_tok = vector<pattern_element_t>(1, rsieve_elem);
160}
161
162LexdCompiler::~LexdCompiler()
163{}
164
165// u_*printf only accept const UChar*
166// so here's a wrapper so we don't have to write all this out every time
167// and make it a macro so we don't have issues with it deallocating
168#define err(s)(to_ustring(s).c_str()) (to_ustring(s).c_str())
169
170void
171LexdCompiler::die(const char* msg, ...)
172{
173 UFILE* err_out = u_finitu_finit_72(stderrstderr, NULL__null, NULL__null);
174 u_fprintfu_fprintf_72(err_out, "Error on line %d: ", lineNumber);
175 va_list argptr;
176 va_start(argptr, msg)__builtin_va_start(argptr, msg);
177 u_vfprintfu_vfprintf_72(err_out, msg, argptr);
178 va_end(argptr)__builtin_va_end(argptr);
179 u_fputcu_fputc_72('\n', err_out);
180 exit(EXIT_FAILURE1);
181}
182
183void LexdCompiler::appendLexicon(string_ref lexicon_id, const vector<entry_t> &to_append)
184{
185 if(lexicons.find(lexicon_id) == lexicons.end())
186 lexicons[lexicon_id] = to_append;
187 else
188 lexicons[lexicon_id].insert(lexicons[lexicon_id].begin(), to_append.begin(), to_append.end());
189}
190
191void
192LexdCompiler::finishLexicon()
193{
194 if(inLex)
195 {
196 if (currentLexicon.size() == 0) {
197 die("Lexicon '%S' is empty.", err(name(currentLexiconId))(to_ustring(name(currentLexiconId)).c_str()));
198 }
199 appendLexicon(currentLexiconId, currentLexicon);
200
201 currentLexicon.clear();
202 currentLexicon_tags.clear();
203 }
204 inLex = false;
205}
206
207string_ref
208LexdCompiler::internName(const UnicodeString& name)
209{
210 if(name_to_id.find(name) == name_to_id.end())
211 {
212 name_to_id[name] = string_ref(id_to_name.size());
213 id_to_name.push_back(name);
214 }
215 return name_to_id[name];
216}
217
218string_ref
219LexdCompiler::checkName(UnicodeString& name)
220{
221 const static UString forbidden = u" :?|()<>[]*+";
222 name.trim();
223 int l = name.length();
224 if(l == 0) die("Unnamed pattern or lexicon.");
225
226 for(const auto &c: forbidden) {
227 if(name.indexOf(c) != -1) {
228 die("Lexicon/pattern names cannot contain character '%C'", c);
229 }
230 }
231 return internName(name);
232}
233
234tags_t
235LexdCompiler::readTags(char_iter &iter, UnicodeString &line)
236{
237 tag_filter_t filter = readTagFilter(iter, line);
238 if(filter.neg().empty() && filter.ops().empty())
239 return tags_t((set<string_ref>)filter.pos());
240 else
241 die("Cannot declare negative tag in lexicon");
242 return tags_t();
243}
244
245tag_filter_t
246LexdCompiler::readTagFilter(char_iter& iter, UnicodeString& line)
247{
248 tag_filter_t tag_filter;
249 auto tag_start = (++iter).span();
250 bool tag_nonempty = false;
251 bool negative = false;
252 vector<shared_ptr<op_tag_filter_t>> ops;
253 for(; !iter.at_end() && (*iter).length() > 0; ++iter)
254 {
255 if(*iter == "]" || *iter == "," || *iter == " ")
256 {
257 if(!tag_nonempty)
258 die("Empty tag at char %d", + iter.span().first);
259 UnicodeString s = line.tempSubStringBetween(tag_start.first, iter.span().first);
260 if(!tag_filter.combine(
261 negative ? tag_filter_t(neg_tag_filter_t {checkName(s)})
262 : tag_filter_t(pos_tag_filter_t {checkName(s)})
263 ))
264 die("Illegal tag filter.");
265 tag_nonempty = false;
266 negative = false;
267 if(*iter == "]")
268 {
269 iter++;
270 return tag_filter_t(tag_filter.pos(), tag_filter.neg(), ops);
271 }
272 }
273 else if(!tag_nonempty && *iter == "-")
274 {
275 negative = true;
276 }
277 else if(!tag_nonempty && (*iter == "|" || *iter == "^"))
278 {
279 const UnicodeString s = *iter;
280 if(negative)
281 die("Illegal negated operation.");
282 *iter++;
283 if (*iter == "[")
284 {
285 shared_ptr<op_tag_filter_t> op;
286 tags_t operands = readTags(iter, line);
287 if (s == "|")
288 op = make_shared<or_tag_filter_t>(operands);
289 else if (s == "^")
290 op = make_shared<xor_tag_filter_t>(operands);
291 ops.push_back(op);
292 }
293 else
294 die("Expected list of operands.");
295 if(*iter == "]")
296 {
297 iter++;
298 return tag_filter_t(tag_filter.pos(), tag_filter.neg(), ops);
299 }
300 }
301 else if(!tag_nonempty)
302 {
303 tag_nonempty = true;
304 tag_start = iter.span();
305 }
306 }
307 die("End of line in tag list, expected ']'");
308 return tag_filter_t();
309}
310
311void
312LexdCompiler::appendSymbol(const UnicodeString& s, lex_token_t& tok)
313{
314 if (shouldCombine) {
315 tok.symbols.push_back(alphabet_lookup(s));
316 } else {
317 for (int c = 0; c < s.length(); c++) {
318 tok.symbols.push_back(alphabet_lookup(s[c]));
319 }
320 }
321}
322
323void
324LexdCompiler::readSymbol(char_iter& iter, UnicodeString& line, lex_token_t& tok)
325{
326 if ((*iter).startsWith("\\")) {
327 if ((*iter).length() == 1) {
328 appendSymbol(*++iter, tok);
329 } else {
330 appendSymbol((*iter).tempSubString(1), tok);
331 }
332 } else if ((*iter).startsWith(":")) {
333 appendSymbol((*iter).tempSubString(1), tok);
334 } else if (*iter == "{" || *iter == "<") {
335 UChar end = (*iter == "{") ? '}' : '>';
336 int i = iter.span().first;
337 for (; !iter.at_end() && *iter != end; ++iter) ;
338
339 if (*iter == end) {
340 tok.symbols.push_back(alphabet_lookup(line.tempSubStringBetween(i, iter.span().second)));
341 } else {
342 die("Multichar symbol didn't end; searching for %S", err(end)(to_ustring(end).c_str()));
343 }
344 } else {
345 appendSymbol(*iter, tok);
346 }
347}
348
349int
350LexdCompiler::processRegexTokenSeq(char_iter& iter, UnicodeString& line, Transducer* trans, int start_state)
351{
352 bool inleft = true;
353 vector<vector<lex_token_t>> left, right;
354 for (; !iter.at_end(); ++iter) {
355 if (*iter == "(" || *iter == ")" || *iter == "|" || *iter == "/") break;
356 else if (*iter == "?" || *iter == "*" || *iter == "+")
357 die("Quantifier %S may only be applied to parenthesized groups", err(*iter)(to_ustring(*iter).c_str()));
358 else if (*iter == "]") die("Regex contains mismatched ]");
359 else if (*iter == ":" && inleft) inleft = false;
360 else if (*iter == ":") die("Regex contains multiple colons");
361 else if (*iter == "[") {
362 ++iter;
363 vector<lex_token_t> sym;
364 for (; !iter.at_end(); ++iter) {
365 if (*iter == "]") break;
366 else if (*iter == "-" && !sym.empty()) {
367 ++iter;
368 if (*iter == "]" || iter.at_end()) {
369 --iter;
370 lex_token_t temp;
371 readSymbol(iter, line, temp);
372 sym.push_back(temp);
373 } else {
374 lex_token_t start = sym.back();
375 lex_token_t end;
376 readSymbol(iter, line, end);
377 // This will fail on diacritics even with -U
378 // on the principle that command-line args should not
379 // change the validity of the code -DGS 2022-05-17
380 if (start.symbols.size() != 1 || end.symbols.size() != 1 ||
381 (int)start.symbols[0] <= 0 || (int)end.symbols[0] <= 0)
382 die("Cannot process symbol range between multichar symbols");
383 int i_start = (int)start.symbols[0];
384 int i_end = (int)end.symbols[0];
385 if (i_start > i_end)
386 die("First character in symbol range does not preceed last");
387 for (int i = 1 + i_start; i <= i_end; i++) {
388 lex_token_t mid;
389 mid.symbols.push_back((trans_sym_t)i);
390 sym.push_back(mid);
391 }
392 }
393 } else {
394 lex_token_t temp;
395 readSymbol(iter, line, temp);
396 sym.push_back(temp);
397 }
398 }
399 (inleft ? left : right).push_back(sym);
400 } else {
401 vector<lex_token_t> v_temp;
402 lex_token_t t_temp;
403 readSymbol(iter, line, t_temp);
404 v_temp.push_back(t_temp);
405 (inleft ? left : right).push_back(v_temp);
406 }
407 }
408 int state = start_state;
409 vector<lex_token_t> empty_vec;
410 lex_token_t empty_tok;
411 empty_tok.symbols.push_back(trans_sym_t());
412 empty_vec.push_back(empty_tok);
413 for (unsigned int i = 0; i < left.size() || i < right.size(); i++) {
414 vector<lex_token_t>& lv = (i < left.size() && !left[i].empty() ?
415 left[i] : empty_vec);
416 vector<lex_token_t>& rv = (i < right.size() && !right[i].empty() ?
417 right[i] : empty_vec);
418 bool first = true;
419 int dest_state = 0;
420 if (inleft) {
421 for (auto& s : lv) {
422 if (first) {
423 dest_state = state;
424 for (auto& it : s.symbols)
425 dest_state = trans->insertNewSingleTransduction(alphabet((int)it, (int)it), dest_state);
426 if (dest_state == state)
427 dest_state = trans->insertNewSingleTransduction(0, dest_state);
428 first = false;
429 } else if (s.symbols.empty()) {
430 trans->linkStates(state, dest_state, 0);
431 } else {
432 int cur_state = state;
433 for (unsigned int k = 0; k < s.symbols.size(); k++) {
434 if (k+1 == s.symbols.size())
435 trans->linkStates(cur_state, dest_state, alphabet((int)s.symbols[k], (int)s.symbols[k]));
436 else
437 cur_state = trans->insertNewSingleTransduction(alphabet((int)s.symbols[k], (int)s.symbols[k]), cur_state);
438 }
439 }
440 }
441 } else {
442 for (auto& l : lv) {
443 for (auto& r : rv) {
444 vector<int> paired;
445 for (unsigned int j = 0; j < l.symbols.size() || j < r.symbols.size(); j++) {
446 trans_sym_t ls = (j < l.symbols.size() ? l.symbols[j] : trans_sym_t());
447 trans_sym_t rs = (j < r.symbols.size() ? r.symbols[j] : trans_sym_t());
448 paired.push_back(alphabet((int)ls, (int)rs));
449 }
450 if (first) {
451 dest_state = state;
452 for (auto& it : paired) {
453 dest_state = trans->insertNewSingleTransduction(it, dest_state);
454 }
455 first = false;
456 } else {
457 int cur_state = state;
458 for (unsigned int k = 0; k < paired.size(); k++) {
459 if (k+1 == paired.size())
460 trans->linkStates(cur_state, dest_state, paired[k]);
461 else
462 cur_state = trans->insertNewSingleTransduction(paired[k], cur_state);
463 }
464 }
465 }
466 }
467 }
468 state = dest_state;
469 }
470 return state;
471}
472
473int
474LexdCompiler::processRegexGroup(char_iter& iter, UnicodeString& line, Transducer* trans, int start_state, unsigned int depth)
475{
476 ++iter; // initial slash or paren
477 int state = start_state;
478 vector<int> option_ends;
479 for (; !iter.at_end(); ++iter) {
480 if (*iter == "(") {
481 state = trans->insertNewSingleTransduction(0, state);
482 state = processRegexGroup(iter, line, trans, state, depth+1);
483 --iter;
484 // this function ends on character after close paren or quantifier
485 // so step back so loop increment doesn't skip a character
486 }
487 else if (*iter == ")" || *iter == "/") break;
488 else if (*iter == "|") {
489 if (state == start_state)
490 state = trans->insertNewSingleTransduction(0, state);
491 option_ends.push_back(state);
492 state = start_state;
493 }
494 else {
495 state = processRegexTokenSeq(iter, line, trans, state);
496 --iter;
497 }
498 }
499 if (state == start_state)
500 state = trans->insertNewSingleTransduction(0, state);
501 for (auto& it : option_ends)
502 trans->linkStates(it, state, 0);
503 if ((depth > 0 && *iter == "/") || (depth == 0 && *iter == ")"))
504 die("Mismatched parentheses in regex");
505 if (iter.at_end())
506 die("Unterminated regex");
507 ++iter;
508 if (depth > 0) {
509 if (*iter == "?") {
510 trans->linkStates(start_state, state, 0);
511 ++iter;
512 } else if (*iter == "*") {
513 trans->linkStates(start_state, state, 0);
514 trans->linkStates(state, start_state, 0);
515 ++iter;
516 } else if (*iter == "+") {
517 trans->linkStates(state, start_state, 0);
518 ++iter;
519 }
520 }
521 return state;
522}
523
524lex_seg_t
525LexdCompiler::processLexiconSegment(char_iter& iter, UnicodeString& line, unsigned int part_count)
526{
527 lex_seg_t seg;
528 bool inleft = true;
529 bool left_tags_applied = false, right_tags_applied = false;
530 tag_filter_t tags;
531 if((*iter).startsWith(" "))
532 {
533 if((*iter).length() > 1)
534 {
535 // if it's a space with a combining diacritic after it,
536 // then we want the diacritic
537 UnicodeString cur = *iter;
538 cur.retainBetween(1, cur.length());
539 seg.left.symbols.push_back(alphabet_lookup(cur));
540 }
541 ++iter;
542 }
543 if((*iter).startsWith("/") && seg.left.symbols.size() == 0)
544 {
545 seg.regex = new Transducer();
546 int state = processRegexGroup(iter, line, seg.regex, 0, 0);
547 seg.regex->setFinal(state);
548 }
549 if(iter.at_end() && seg.regex == nullptr && seg.left.symbols.size() == 0)
550 die("Expected %d parts, found %d", currentLexiconPartCount, part_count);
551 for(; !iter.at_end(); ++iter)
552 {
553 if((*iter).startsWith(" ") || *iter == ']')
554 break;
555 else if(*iter == "[")
556 {
557 auto &tags_applied = inleft ? left_tags_applied : right_tags_applied;
558 if(tags_applied)
559 die("Already provided tag list for this side.");
560 tags = readTagFilter(iter, line);
561 --iter;
562 tags_applied = true;
563 }
564 else if((*iter).startsWith(":"))
565 {
566 if(inleft)
567 inleft = false;
568 else
569 die("Lexicon entry contains multiple colons");
570 if ((*iter).length() > 1) readSymbol(iter, line, seg.right);
571 }
572 else readSymbol(iter, line, (inleft ? seg.left : seg.right));
573 }
574 if(inleft)
575 {
576 seg.right = seg.left;
577 }
578
579 if (seg.regex != nullptr &&
580 !(seg.left.symbols.empty() && seg.right.symbols.empty()))
581 die("Lexicon entry contains both regex and text");
582
583 seg.tags = currentLexicon_tags;
584
585 if(!tags.try_apply(seg.tags))
586 {
587 tags_t diff = subtractset(tags.neg(), seg.tags);
588 for(string_ref t: diff)
589 cerr << "Bad tag '-" << to_ustring(name(t)) << "'" << endl;
590 die("Negative tag has no default to unset.");
591 }
592
593 return seg;
594}
595
596token_t
597LexdCompiler::readToken(char_iter& iter, UnicodeString& line)
598{
599 auto begin_charspan = iter.span();
600
601 const UnicodeString boundary = " :()[]+*?|<>";
602
603 for(; !iter.at_end() && boundary.indexOf(*iter) == -1; ++iter);
604 UnicodeString name;
605 line.extract(begin_charspan.first, (iter.at_end() ? line.length() : iter.span().first) - begin_charspan.first, name);
606
607 if(name.length() == 0)
608 die("Symbol '%S' without lexicon name at u16 %d-%d", err(*iter)(to_ustring(*iter).c_str()), iter.span().first, iter.span().second-1);
609
610 bool optional = false;
611 if(*iter == "?") {
612 iter++;
613 if(*iter == "(") {
614 optional = true;
615 } else {
616 iter--;
617 }
618 }
619
620 unsigned int part = 1;
621 if(*iter == "(")
622 {
623 iter++;
624 begin_charspan = iter.span();
625 for(; !iter.at_end() && (*iter).length() > 0 && *iter != ")"; iter++)
626 {
627 if((*iter).length() != 1 || !u_isdigitu_isdigit_72((*iter).charAt(0)))
628 die("Syntax error - non-numeric index in parentheses: %S", err(*iter)(to_ustring(*iter).c_str()));
629 }
630 if(*iter != ")")
631 die("Syntax error - unmatched parenthesis");
632 if(iter.span().first == begin_charspan.first)
633 die("Syntax error - missing index in parenthesis");
634 part = (unsigned int)StringUtils::stoi(to_ustring(line.tempSubStringBetween(begin_charspan.first, iter.span().first)));
635 if (part == 0) die("Invalid column number (0)");
636 ++iter;
637 }
638
639 return token_t {.name = internName(name), .part = part, .optional = optional};
640}
641
642RepeatMode
643LexdCompiler::readModifier(char_iter& iter)
644{
645 if(*iter == "?")
646 {
647 ++iter;
648 return Question;
649 }
650 else if(*iter == "*")
651 {
652 ++iter;
653 return Star;
654 }
655 else if(*iter == "+")
656 {
657 ++iter;
658 return Plus;
659 }
660 return Normal;
661}
662
663pattern_element_t
664LexdCompiler::readPatternElement(char_iter& iter, UnicodeString& line)
665{
666 const UnicodeString boundary = " :()[]+*?|<>";
667 pattern_element_t tok;
668 if(*iter == ":")
669 {
670 iter++;
671 if(boundary.indexOf(*iter) != -1)
672 {
673 if(*iter == ":")
674 die("Syntax error - double colon");
675 else
676 die("Colon without lexicon or pattern name");
677 }
678 tok.right = readToken(iter, line);
679 }
680 else if(boundary.indexOf(*iter) != -1)
681 {
682 die("Unexpected symbol '%S' at column %d", err(*iter)(to_ustring(*iter).c_str()), iter.span().first);
683 }
684 else
685 {
686 tok.left = readToken(iter, line);
687 if(*iter == "[")
688 {
689 tok.tag_filter.combine(readTagFilter(iter, line));
690 }
691 if(*iter == ":")
692 {
693 iter++;
694 if(!iter.at_end() && (*iter).length() > 0)
695 {
696 if(boundary.indexOf(*iter) == -1)
697 {
698 tok.right = readToken(iter, line);
699 }
700 }
701 }
702 else
703 {
704 tok.right = tok.left;
705 }
706 }
707 if(*iter == "[")
708 {
709 tok.tag_filter.combine(readTagFilter(iter, line));
710 }
711 tok.mode = readModifier(iter);
712
713 return tok;
714}
715
716void
717LexdCompiler::processPattern(char_iter& iter, UnicodeString& line)
718{
719 vector<pattern_t> pats_cur(1);
720 vector<pattern_element_t> alternation;
721 bool final_alternative = true;
722 bool sieve_forward = false;
723 bool just_sieved = false;
724 const UnicodeString boundary = " :()[]+*?|<>";
725 const UnicodeString token_boundary = " )|<>";
726 const UnicodeString token_side_boundary = token_boundary + ":+*?";
727 const UnicodeString token_side_name_boundary = token_side_boundary + "([]";
728 const UnicodeString modifier = "+*?";
729 const UnicodeString decrement_after_token = token_boundary + "([]";
730
731 for(; !iter.at_end() && *iter != ')' && (*iter).length() > 0; ++iter)
732 {
733 if(*iter == " ") ;
734 else if(*iter == "|")
735 {
736 if(alternation.empty())
737 die("Syntax error - initial |");
738 if(!final_alternative)
739 die("Syntax error - multiple consecutive |");
740 if(just_sieved)
741 die("Syntax error - sieve and alternation operators without intervening token");
742 final_alternative = false;
743 }
744 else if(*iter == "<")
745 {
746 if(sieve_forward)
747 die("Syntax error - cannot sieve backwards after forwards.");
748 if(alternation.empty())
749 die("Backward sieve without token?");
750 if(just_sieved)
751 die("Syntax error - multiple consecutive sieve operators");
752 if(!final_alternative)
753 die("Syntax error - alternation and sieve operators without intervening token");
754 expand_alternation(pats_cur, alternation);
755 expand_alternation(pats_cur, left_sieve_tok);
756 alternation.clear();
757 just_sieved = true;
758 }
759 else if(*iter == ">")
760 {
761 sieve_forward = true;
762 if(alternation.empty())
763 die("Forward sieve without token?");
764 if(just_sieved)
765 die("Syntax error - multiple consecutive sieve operators");
766 if(!final_alternative)
767 die("Syntax error - alternation and sieve operators without intervening token");
768 expand_alternation(pats_cur, alternation);
769 expand_alternation(pats_cur, right_sieve_tok);
770 alternation.clear();
771 just_sieved = true;
772 }
773 else if(*iter == "[")
774 {
775 UnicodeString name = UnicodeString::fromUTF8(" " + to_string(anonymousCount++));
776 currentLexiconId = internName(name);
777 currentLexiconPartCount = 1;
778 inLex = true;
779 entry_t entry;
780 entry.push_back(processLexiconSegment(++iter, line, 0));
781 if(*iter == " ") iter++;
782 if(*iter != "]")
783 die("Missing closing ] for anonymous lexicon");
784 currentLexicon.push_back(entry);
785 finishLexicon();
786 if(final_alternative && !alternation.empty())
787 {
788 expand_alternation(pats_cur, alternation);
789 alternation.clear();
790 }
791 ++iter;
792 pattern_element_t anon;
793 anon.left = {.name=currentLexiconId, .part=1, .optional=false};
794 anon.right = anon.left;
795 anon.mode = readModifier(iter);
796 alternation.push_back(anon);
797 --iter;
798 final_alternative = true;
799 just_sieved = false;
800 }
801 else if(*iter == "(")
802 {
803 string_ref temp = currentPatternId;
804 UnicodeString name = UnicodeString::fromUTF8(" " + to_string(anonymousCount++));
805 currentPatternId = internName(name);
806 ++iter;
807 processPattern(iter, line);
808 if(*iter == " ")
809 *iter++;
810 if(iter.at_end() || *iter != ")")
811 die("Missing closing ) for anonymous pattern");
812 ++iter;
813 tag_filter_t filter;
814 if(*iter == "[")
815 filter = readTagFilter(iter, line);
816 if(final_alternative && !alternation.empty())
817 {
818 expand_alternation(pats_cur, alternation);
819 alternation.clear();
820 }
821 pattern_element_t anon;
822 anon.left = {.name=currentPatternId, .part=1, .optional=false};
823 anon.right = anon.left;
824 anon.mode = readModifier(iter);
825 anon.tag_filter = filter;
826 for(const auto &tok : distribute_tag_expressions(anon))
827 alternation.push_back(tok);
828 --iter;
829 currentPatternId = temp;
830 final_alternative = true;
831 just_sieved = false;
832 }
833 else if(*iter == "?" || *iter == "*" || *iter == "+")
834 {
835 die("Syntax error - unexpected modifier at u16 %d-%d", iter.span().first, iter.span().second);
836 }
837 else
838 {
839 if(final_alternative && !alternation.empty())
840 {
841 expand_alternation(pats_cur, alternation);
842 alternation.clear();
843 }
844 for(const auto &tok : distribute_tag_expressions(readPatternElement(iter, line)))
845 alternation.push_back(tok);
846 iter--;
847 final_alternative = true;
848 just_sieved = false;
849 }
850 }
851 if(!final_alternative)
852 die("Syntax error - trailing |");
853 if(just_sieved)
854 die("Syntax error - trailing sieve (< or >)");
855 expand_alternation(pats_cur, alternation);
856 for(const auto &pat : pats_cur)
857 {
858 patterns[currentPatternId].push_back(make_pair(lineNumber, pat));
859 }
860}
861
862void
863LexdCompiler::processNextLine()
864{
865 UnicodeString line;
866 UChar c;
867 bool escape = false;
868 bool comment = false;
869 bool lastWasSpace = false;
870 while((c = u_fgetcu_fgetc_72(input)) != '\n')
871 {
872 bool space = false;
873 if(c == U_EOF0xFFFF)
874 {
875 doneReading = true;
876 break;
877 }
878 if(comment) continue;
879 if(escape)
880 {
881 line += c;
882 escape = false;
883 }
884 else if(c == '\\')
885 {
886 escape = true;
887 line += c;
888 }
889 else if(c == '#')
890 {
891 comment = true;
892 }
893 else if(u_isWhitespaceu_isWhitespace_72(c))
894 {
895 if(line.length() > 0 && !lastWasSpace)
896 {
897 line += ' ';
898 }
899 space = (line.length() > 0);
900 }
901 else line += c;
902 lastWasSpace = space;
903 }
904 lineNumber++;
905 if(escape) die("Trailing backslash");
906 if(line.length() == 0) return;
907
908 if(line == "PATTERNS" || line == "PATTERNS ")
909 {
910 finishLexicon();
911 UnicodeString name = " ";
912 currentPatternId = internName(name);
913 inPat = true;
914 }
915 else if(line.length() > 7 && line.startsWith("PATTERN "))
916 {
917 UnicodeString name = line.tempSubString(8);
918 finishLexicon();
919 currentPatternId = checkName(name);
920 if (lexicons.find(currentPatternId) != lexicons.end()) {
921 die("The name '%S' cannot be used for both LEXICONs and PATTERNs.",
922 err(name)(to_ustring(name).c_str()));
923 }
924 inPat = true;
925 }
926 else if(line.length() > 7 && line.startsWith("LEXICON "))
927 {
928 UnicodeString name = line.tempSubString(8);
929 name.trim();
930 finishLexicon();
931 if(name.length() > 1 && name.indexOf('[') != -1)
932 {
933 UnicodeString tags = name.tempSubString(name.indexOf('['));
934 auto c = char_iter(tags);
935 currentLexicon_tags = readTags(c, tags);
936 if(c != c.end() && *c == ":")
937 {
938 cerr << "WARNING: One-sided tags are deprecated and will soon be removed (line " << lineNumber << ")" << endl;
939 ++c;
940 if(*c == "[")
941 unionset_inplace(currentLexicon_tags, readTags(c, tags));
942 else
943 die("Expected start of default right tags '[' after ':'.");
944 }
945 if(c != c.end())
946 die("Unexpected character '%C' after default tags.", (*c)[0]);
947 name.retainBetween(0, name.indexOf('['));
948 }
949 currentLexiconPartCount = 1;
950 if(name.length() > 1 && name.endsWith(')'))
951 {
952 UnicodeString num;
953 for(int i = name.length()-2; i > 0; i--)
954 {
955 if(u_isdigitu_isdigit_72(name[i])) num = name[i] + num;
956 else if(name[i] == '(' && num.length() > 0)
957 {
958 currentLexiconPartCount = (unsigned int)StringUtils::stoi(to_ustring(num));
959 name = name.retainBetween(0, i);
960 }
961 else break;
962 }
963 if(name.length() == 0) die("Unnamed lexicon");
964 }
965 currentLexiconId = checkName(name);
966 if(lexicons.find(currentLexiconId) != lexicons.end()) {
967 if(lexicons[currentLexiconId][0].size() != currentLexiconPartCount) {
968 die("Multiple incompatible definitions for lexicon '%S'.", err(name)(to_ustring(name).c_str()));
969 }
970 }
971 if (patterns.find(currentLexiconId) != patterns.end()) {
972 die("The name '%S' cannot be used for both LEXICONs and PATTERNs.",
973 err(name)(to_ustring(name).c_str()));
974 }
975 inLex = true;
976 inPat = false;
977 }
978 else if(line.length() >= 9 && line.startsWith("ALIAS "))
979 {
980 finishLexicon();
981 if(line.endsWith(' ')) line.retainBetween(0, line.length()-1);
982 int loc = line.indexOf(" ", 6);
983 if(loc == -1) die("Expected 'ALIAS lexicon alt_name'");
984 UnicodeString name = line.tempSubString(6, loc-6);
985 UnicodeString alt = line.tempSubString(loc+1);
986 string_ref altid = checkName(alt);
987 string_ref lexid = checkName(name);
988 if(lexicons.find(lexid) == lexicons.end()) die("Attempt to alias undefined lexicon '%S'.", err(name)(to_ustring(name).c_str()));
989 lexicons[altid] = lexicons[lexid];
990 inLex = false;
991 inPat = false;
992 }
993 else if(inPat)
994 {
995 char_iter iter = char_iter(line);
996 processPattern(iter, line);
997 if(!iter.at_end() && (*iter).length() > 0)
998 die("Unexpected %S", err(*iter)(to_ustring(*iter).c_str()));
999 }
1000 else if(inLex)
1001 {
1002 char_iter iter = char_iter(line);
1003 entry_t entry;
1004 for(unsigned int i = 0; i < currentLexiconPartCount; i++)
1005 {
1006 entry.push_back(processLexiconSegment(iter, line, i));
1007 if (*iter == "]") die("Unexpected closing bracket.");
1008 }
1009 if(*iter == ' ') ++iter;
1010 if(!iter.at_end())
1011 die("Lexicon entry has '%S' (found at u16 %d), more than %d components", err(*iter)(to_ustring(*iter).c_str()), iter.span().first, currentLexiconPartCount);
1012 currentLexicon.push_back(entry);
1013 }
1014 else die("Expected 'PATTERNS' or 'LEXICON'");
1015}
1016
1017bool
1018LexdCompiler::isLexiconToken(const pattern_element_t& tok)
1019{
1020 const bool llex = (tok.left.name.empty() || (lexicons.find(tok.left.name) != lexicons.end()));
1021 const bool rlex = (tok.right.name.empty() || (lexicons.find(tok.right.name) != lexicons.end()));
1022 if(llex && rlex)
1023 {
1024 return true;
1025 }
1026 const bool lpat = (patterns.find(tok.left.name) != patterns.end());
1027 const bool rpat = (patterns.find(tok.right.name) != patterns.end());
1028 if(tok.left.name == tok.right.name && lpat && rpat)
1029 {
1030 if(tok.left.part != 1 || tok.right.part != 1)
1031 {
1032 die("Cannote select part of pattern %S", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()));
1033 }
1034 return false;
1035 }
1036 // Any other scenario is an error, so we need to die()
1037 if(lpat && rpat)
1038 {
1039 die("Cannot collate pattern %S with %S", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()));
1040 }
1041 else if((lpat && tok.right.name.empty()) || (rpat && tok.left.name.empty()))
1042 {
1043 die("Cannot select side of pattern %S", err(name(tok.left.name.valid() ? tok.left.name : tok.right.name))(to_ustring(name(tok.left.name.valid() ? tok.left.name : tok.
right.name)).c_str())
);
1044 }
1045 else if(llex && rpat)
1046 {
1047 die("Cannot collate lexicon %S with pattern %S", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()));
1048 }
1049 else if(lpat && rlex)
1050 {
1051 die("Cannot collate pattern %S with lexicon %S", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()));
1052 }
1053 else
1054 {
1055 cerr << "Patterns: ";
1056 for(auto pat: patterns)
1057 cerr << to_ustring(name(pat.first)) << " ";
1058 cerr << endl;
1059 cerr << "Lexicons: ";
1060 for(auto l: lexicons)
1061 cerr << to_ustring(name(l.first)) << " ";
1062 cerr << endl;
1063 die("Lexicon or pattern '%S' is not defined.", err(name((llex || lpat) ? tok.right.name : tok.left.name))(to_ustring(name((llex || lpat) ? tok.right.name : tok.left.name
)).c_str())
);
1064 }
1065 // we never reach this point, but the compiler doesn't understand die()
1066 // so we put a fake return value to keep it happy
1067 return false;
1068}
1069
1070void
1071LexdCompiler::buildPattern(int state, Transducer* t, const pattern_t& pat, const vector<int> is_free, unsigned int pos)
1072{
1073 if(pos == pat.size())
1074 {
1075 t->setFinal(state);
1076 return;
1077 }
1078 const pattern_element_t& tok = pat[pos];
1079 if(tok.left.name == left_sieve_name)
1080 {
1081 t->linkStates(t->getInitial(), state, 0);
1082 buildPattern(state, t, pat, is_free, pos+1);
1083 }
1084 else if(tok.left.name == right_sieve_name)
1085 {
1086 t->setFinal(state);
1087 buildPattern(state, t, pat, is_free, pos+1);
1088 }
1089 else if(isLexiconToken(tok))
1090 {
1091 if(is_free[pos] == 1)
1092 {
1093 Transducer *lex = getLexiconTransducer(pat[pos], 0, true);
1094 if(lex)
1095 {
1096 int new_state = t->insertTransducer(state, *lex);
1097 buildPattern(new_state, t, pat, is_free, pos+1);
1098 }
1099 return;
1100 }
1101 else if(matchedParts.find(tok.left.name) == matchedParts.end() &&
1102 matchedParts.find(tok.right.name) == matchedParts.end())
1103 {
1104 unsigned int max = lexicons[tok.left.name || tok.right.name].size();
1105 if (tok.optional()) max++;
1106 for(unsigned int index = 0; index < max; index++)
1107 {
1108 Transducer *lex = getLexiconTransducer(pat[pos], index, false);
1109 if(lex)
1110 {
1111 int new_state = t->insertTransducer(state, *lex);
1112 if(new_state == state)
1113 {
1114 new_state = t->insertNewSingleTransduction(0, state);
1115 }
1116 if(tok.left.name.valid()) matchedParts[tok.left.name] = index;
1117 if(tok.right.name.valid()) matchedParts[tok.right.name] = index;
1118 buildPattern(new_state, t, pat, is_free, pos+1);
1119 }
1120 }
1121 if(tok.left.name.valid()) matchedParts.erase(tok.left.name);
1122 if(tok.right.name.valid()) matchedParts.erase(tok.right.name);
1123 return;
1124 }
1125 if(tok.left.name.valid() && matchedParts.find(tok.left.name) == matchedParts.end())
1126 matchedParts[tok.left.name] = matchedParts[tok.right.name];
1127 if(tok.right.name.valid() && matchedParts.find(tok.right.name) == matchedParts.end())
1128 matchedParts[tok.right.name] = matchedParts[tok.left.name];
1129 if(tok.left.name.valid() && tok.right.name.valid() && matchedParts[tok.left.name] != matchedParts[tok.right.name])
1130 die("Cannot collate %S with %S - both appear in free variation earlier in the pattern.", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()));
1131 Transducer* lex = getLexiconTransducer(pat[pos], matchedParts[tok.left.name || tok.right.name], false);
1132 if(lex)
1133 {
1134 int new_state = t->insertTransducer(state, *lex);
1135 buildPattern(new_state, t, pat, is_free, pos+1);
1136 }
1137 return;
1138 }
1139 else
1140 {
1141 Transducer *p = buildPattern(tok);
1142 if(!p->hasNoFinals())
1143 {
1144 int new_state = t->insertTransducer(state, *p);
1145 if(tok.mode & Optional)
1146 t->linkStates(state, new_state, 0);
1147 if(tok.mode & Repeated)
1148 t->linkStates(new_state, state, 0);
1149 buildPattern(new_state, t, pat, is_free, pos+1);
1150 }
1151 }
1152}
1153
1154vector<int>
1155LexdCompiler::determineFreedom(pattern_t& pat)
1156{
1157 vector<int> is_free = vector<int>(pat.size(), 0);
1158 map<string_ref, bool> is_optional;
1159 for(unsigned int i = 0; i < pat.size(); i++)
1160 {
1161 const pattern_element_t& t1 = pat[i];
1162 if (is_optional.find(t1.left.name) != is_optional.end() && is_optional[t1.left.name] != t1.optional()) {
1163 die("Lexicon %S cannot be both optional and non-optional in a single pattern.", err(name(t1.left.name))(to_ustring(name(t1.left.name)).c_str()));
1164 }
1165 if (is_optional.find(t1.right.name) != is_optional.end() && is_optional[t1.right.name] != t1.optional()) {
1166 die("Lexicon %S cannot be both optional and non-optional in a single pattern.", err(name(t1.right.name))(to_ustring(name(t1.right.name)).c_str()));
1167 }
1168 if (t1.left.name.valid()) {
1169 is_optional[t1.left.name] = t1.optional();
1170 }
1171 if (t1.right.name.valid()) {
1172 is_optional[t1.right.name] = t1.optional();
1173 }
1174 if(is_free[i] != 0)
1175 continue;
1176 for(unsigned int j = i+1; j < pat.size(); j++)
1177 {
1178 const pattern_element_t& t2 = pat[j];
1179 if((t1.left.name.valid() && (t1.left.name == t2.left.name || t1.left.name == t2.right.name)) ||
1180 (t1.right.name.valid() && (t1.right.name == t2.left.name || t1.right.name == t2.right.name)))
1181 {
1182 is_free[i] = -1;
1183 is_free[j] = -1;
1184 }
1185 }
1186 is_free[i] = (is_free[i] == 0 ? 1 : -1);
1187 }
1188 return is_free;
1189}
1190
1191Transducer*
1192LexdCompiler::buildPattern(const pattern_element_t &tok)
1193{
1194 if(tok.left.part != 1 || tok.right.part != 1)
1195 die("Cannot build collated pattern %S", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()));
1196 if(patternTransducers.find(tok) == patternTransducers.end())
1197 {
1198 if (verbose) cerr << "Compiling " << to_ustring(printPattern(tok)) << endl;
1199 auto start_time = chrono::steady_clock::now();
1200 Transducer* t = new Transducer();
1201 patternTransducers[tok] = NULL__null;
1202 map<string_ref, unsigned int> tempMatch;
1203 tempMatch.swap(matchedParts);
1204 for(auto &pat_untagged : patterns[tok.left.name])
1205 {
1206 for(unsigned int i = 0; i < pat_untagged.second.size(); i++)
1207 {
1208 auto pat = pat_untagged;
1209 bool taggable = true;
1210 for (unsigned int j = 0; j < pat.second.size(); j++) {
1211 auto& pair = pat.second[j];
1212 if(!pair.tag_filter.combine(tok.tag_filter.neg())) {
1213 taggable = false;
1214 if (verbose) {
1215 cerr << "Warning: The tags of " << to_ustring(printPattern(tok));
1216 cerr << " conflict with " << to_ustring(printPattern(pat_untagged.second[j]));
1217 cerr << " on line " << pat.first << "." << endl;
1218 }
1219 }
1220 }
1221 if(!pat.second[i].tag_filter.combine(tok.tag_filter.pos())) {
1222 taggable = false;
1223 if (verbose) {
1224 cerr << "Warning: The tags of " << to_ustring(printPattern(tok));
1225 cerr << " conflict with " << to_ustring(printPattern(pat_untagged.second[i]));
1226 cerr << " on line " << pat.first << "." << endl;
1227 }
1228 }
1229 if (!taggable) continue;
1230
1231 matchedParts.clear();
1232 lineNumber = pat.first;
1233 vector<int> is_free = determineFreedom(pat.second);
1234 buildPattern(t->getInitial(), t, pat.second, is_free, 0);
1235 }
1236 }
1237 tempMatch.swap(matchedParts);
1238 if(!t->hasNoFinals())
1239 {
1240 if (verbose)
1241 cerr << "Minimizing " << to_ustring(printPattern(tok)) << endl;
1242 t->minimize();
1243 }
1244 else if (verbose) {
1245 cerr << "Warning: " << to_ustring(printPattern(tok));
1246 cerr << " is empty." << endl;
1247 }
1248 patternTransducers[tok] = t;
1249 if (verbose) {
1250 auto end_time = chrono::steady_clock::now();
1251 chrono::duration<double> diff = end_time - start_time;
1252 cerr << "Done compiling " << to_ustring(printPattern(tok));
1253 cerr << " in " << diff.count() << " seconds." << endl;
1254 }
1255 }
1256 else if(patternTransducers[tok] == NULL__null)
1257 {
1258 die("Cannot compile self-recursive %S", err(printPattern(tok))(to_ustring(printPattern(tok)).c_str()));
1259 }
1260 return patternTransducers[tok];
1261}
1262
1263int
1264LexdCompiler::insertPreTags(Transducer* t, int state, tag_filter_t &tags)
1265{
1266 int end = state;
1267 for(auto tag : tags.pos())
1268 {
1269 trans_sym_t flag = getFlag(Positive, tag, 1);
1270 end = t->insertSingleTransduction((int)alphabet_lookup(flag, flag), end);
1271 }
1272 for(auto tag : tags.neg())
1273 {
1274 trans_sym_t flag = getFlag(Positive, tag, 2);
1275 end = t->insertSingleTransduction((int)alphabet_lookup(flag, flag), end);
1276 }
1277 return end;
1278}
1279
1280int
1281LexdCompiler::insertPostTags(Transducer* t, int state, tag_filter_t &tags)
1282{
1283 int end = 0;
1284 int flag_dest = 0;
1285 for(auto tag : tags.pos())
1286 {
1287 trans_sym_t flag = getFlag(Disallow, tag, 1);
1288 trans_sym_t clear = getFlag(Clear, tag, 0);
1289 if(flag_dest == 0)
1290 {
1291 flag_dest = t->insertSingleTransduction((int)alphabet_lookup(flag, flag), state);
1292 end = flag_dest;
1293 }
1294 else
1295 {
1296 t->linkStates(state, flag_dest, (int)alphabet_lookup(flag, flag));
1297 }
1298 end = t->insertSingleTransduction((int)alphabet_lookup(clear, clear), end);
1299 }
1300 if(end == 0)
1301 {
1302 end = state;
1303 }
1304 for(auto tag : tags.neg())
1305 {
1306 trans_sym_t clear = getFlag(Clear, tag, 0);
1307 end = t->insertSingleTransduction((int)alphabet_lookup(clear, clear), end);
1308 }
1309 return end;
1310}
1311
1312Transducer*
1313LexdCompiler::buildPatternWithFlags(const pattern_element_t &tok, int pattern_start_state = 0)
1314{
1315 if(patternTransducers.find(tok) == patternTransducers.end())
1316 {
1317 if (verbose) cerr << "Compiling " << to_ustring(printPattern(tok)) << endl;
1318 auto start_time = chrono::steady_clock::now();
1319 Transducer* trans = (shouldHypermin ? hyperminTrans : new Transducer());
1320 patternTransducers[tok] = NULL__null;
1321 unsigned int transition_index = 0;
1322 vector<int> pattern_finals;
1323 bool did_anything = false;
1324 for(auto& pat : patterns[tok.left.name])
1325 {
1326 lineNumber = pat.first;
1327 vector<int> is_free = determineFreedom(pat.second);
1328 bool got_non_null = false;
1329 unsigned int count = (tok.tag_filter.pos().size() > 0 ? pat.second.size() : 1);
1330 if(tagsAsFlags) count = 1;
1331 for(unsigned int idx = 0; idx < count; idx++)
1332 {
1333 int state = pattern_start_state;
1334 vector<int> finals;
1335 set<string_ref> to_clear;
1336 bool got_null = false;
1337 for(unsigned int i = 0; i < pat.second.size(); i++)
1338 {
1339 pattern_element_t cur = pat.second[i];
1340
1341 if(cur.left.name == left_sieve_name)
1342 {
1343 trans->linkStates(pattern_start_state, state, 0);
1344 continue;
1345 }
1346 else if(cur.left.name == right_sieve_name)
1347 {
1348 finals.push_back(state);
1349 continue;
1350 }
1351
1352 bool isLex = isLexiconToken(cur);
1353
1354 transition_index++;
1355
1356 int mode_start = state;
1357 cur.mode = Normal;
1358
1359 tag_filter_t current_tags;
1360 if(tagsAsFlags)
1361 {
1362 state = insertPreTags(trans, state, cur.tag_filter);
1363 current_tags = cur.tag_filter;
1364 cur.tag_filter = tag_filter_t();
1365 }
1366 else
1367 {
1368 if (i == idx && !cur.tag_filter.combine(tok.tag_filter.pos())) {
1369 if (verbose) {
1370 cerr << "Warning: The tags of " << to_ustring(printPattern(tok));
1371 cerr << " conflict with " << to_ustring(printPattern(pat.second[i]));
1372 cerr << " on line " << pat.first << "." << endl;
1373 }
1374 }
1375 if (!cur.tag_filter.combine(tok.tag_filter.neg())) {
1376 if (verbose) {
1377 cerr << "Warning: The tags of " << to_ustring(printPattern(tok));
1378 cerr << " conflict with " << to_ustring(printPattern(pat.second[i]));
1379 cerr << " on line " << pat.first << "." << endl;
1380 }
1381 }
1382 }
1383
1384 Transducer* t;
1385 if(shouldHypermin)
1386 {
1387 trans_sym_t inflag = getFlag(Positive, tok.left.name, transition_index);
1388 trans_sym_t outflag = getFlag(Require, tok.left.name, transition_index);
1389 int in_tr = (int)alphabet_lookup(inflag, inflag);
1390 int out_tr = (int)alphabet_lookup(outflag, outflag);
1391 if(is_free[i] == -1 && isLex)
1392 {
1393 to_clear.insert(cur.left.name);
1394 to_clear.insert(cur.right.name);
1395 }
1396 if(transducerLocs.find(cur) != transducerLocs.end())
1397 {
1398 auto loc = transducerLocs[cur];
1399 if(loc.first == loc.second)
1400 {
1401 t = NULL__null;
1402 }
1403 else
1404 {
1405 t = trans;
1406 trans->linkStates(state, loc.first, in_tr);
1407 state = trans->insertSingleTransduction(out_tr, loc.second);
1408 }
1409 }
1410 else
1411 {
1412 int start = trans->insertSingleTransduction(in_tr, state);
1413 int end = start;
1414 if(isLex)
1415 {
1416 t = getLexiconTransducerWithFlags(cur, false);
1417 if(t == NULL__null)
1418 {
1419 transducerLocs[cur] = make_pair(start, start);
1420 }
1421 else
1422 {
1423 end = trans->insertTransducer(start, *t);
1424 transducerLocs[cur] = make_pair(start, end);
1425 }
1426 }
1427 else
1428 {
1429 t = buildPatternWithFlags(cur, start);
1430 end = transducerLocs[cur].second;
1431 }
1432 state = trans->insertSingleTransduction(out_tr, end);
1433 }
1434 }
1435 else if(isLex)
1436 {
1437 t = getLexiconTransducerWithFlags(cur, (is_free[i] == 1));
1438 if(is_free[i] == -1)
1439 {
1440 to_clear.insert(cur.left.name);
1441 to_clear.insert(cur.right.name);
1442 }
1443 }
1444 else
1445 {
1446 t = buildPatternWithFlags(cur);
1447 }
1448 if(t == NULL__null || (!shouldHypermin && t->hasNoFinals()))
1449 {
1450 got_null = true;
1451 break;
1452 }
1453 got_non_null = true;
1454 if(!shouldHypermin)
1455 {
1456 state = trans->insertTransducer(state, *t);
1457 }
1458 if(tagsAsFlags)
1459 {
1460 state = insertPostTags(trans, state, current_tags);
1461 }
1462 if(pat.second[i].mode & Optional)
1463 {
1464 trans->linkStates(mode_start, state, 0);
1465 }
1466 if(pat.second[i].mode & Repeated)
1467 {
1468 trans->linkStates(state, mode_start, 0);
1469 }
1470 }
1471 if(!got_null || finals.size() > 0)
1472 {
1473 for(auto fin : finals)
1474 {
1475 trans->linkStates(fin, state, 0);
1476 }
1477 for(auto lex : to_clear)
1478 {
1479 if(lex.empty())
1480 {
1481 continue;
1482 }
1483 UnicodeString flag = "@C.";
1484 encodeFlag(flag, (int)lex.i);
1485 flag += "@";
1486 trans_sym_t f = alphabet_lookup(flag);
1487 state = trans->insertSingleTransduction((int)alphabet_lookup(f, f), state);
1488 }
1489 trans->setFinal(state);
1490 pattern_finals.push_back(state);
1491 }
1492 }
1493 if(!got_non_null)
1494 {
1495 continue;
1496 }
1497 did_anything = true;
1498 }
1499 if(did_anything)
1500 {
1501 if(shouldHypermin)
1502 {
1503 if(pattern_finals.size() > 0)
1504 {
1505 int end = pattern_finals[0];
1506 for(auto fin : pattern_finals)
1507 {
1508 if(fin != end)
1509 {
1510 trans->linkStates(fin, end, 0);
1511 }
1512 if(pattern_start_state != 0)
1513 {
1514 trans->setFinal(fin, 0, false);
1515 }
1516 }
1517 pattern_element_t key = tok;
1518 key.mode = Normal;
1519 transducerLocs[key] = make_pair(pattern_start_state, end);
1520 }
1521 }
1522 else
1523 {
1524 if(!trans->hasNoFinals()) {
1525 if (verbose)
1526 cerr << "Minimizing " << to_ustring(printPattern(tok)) << endl;
1527 trans->minimize();
1528 }
1529 }
1530 }
1531 else
1532 {
1533 if(!shouldHypermin)
1534 trans = NULL__null;
1535 else
1536 {
1537 cerr << "FIXME" << endl;
1538 }
1539 }
1540 if (verbose) {
1541 auto end_time = chrono::steady_clock::now();
1542 chrono::duration<double> diff = end_time - start_time;
1543 cerr << "Done compiling " << to_ustring(printPattern(tok));
1544 cerr << " in " << diff.count() << " seconds." << endl;
1545 }
1546 patternTransducers[tok] = trans;
1547 }
1548 else if(patternTransducers[tok] == NULL__null)
1549 {
1550 die("Cannot compile self-recursive pattern '%S'", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()));
1551 }
1552 return patternTransducers[tok];
1553}
1554
1555void
1556LexdCompiler::buildAllLexicons()
1557{
1558 // find out if there are any lexicons that we can build without flags
1559 vector<pattern_element_t> lexicons_to_build;
1560 for(auto pattern : patterns)
1561 {
1562 for(auto pat : pattern.second)
1563 {
1564 lineNumber = pat.first;
1565 vector<int> free = determineFreedom(pat.second);
1566 for(size_t i = 0; i < pat.second.size(); i++)
1567 {
1568 if(pat.second[i].left.name == left_sieve_name ||
1569 pat.second[i].left.name == right_sieve_name)
1570 {
1571 continue;
1572 }
1573 if(isLexiconToken(pat.second[i]))
1574 {
1575 pattern_element_t& tok = pat.second[i];
1576 if(free[i] == -1)
1577 {
1578 lexiconFreedom[tok.left.name] = false;
1579 lexiconFreedom[tok.right.name] = false;
1580 }
1581 else
1582 {
1583 if(lexiconFreedom.find(tok.left.name) == lexiconFreedom.end())
1584 {
1585 lexiconFreedom[tok.left.name] = true;
1586 }
1587 if(lexiconFreedom.find(tok.right.name) == lexiconFreedom.end())
1588 {
1589 lexiconFreedom[tok.right.name] = true;
1590 }
1591 }
1592 lexicons_to_build.push_back(tok);
1593 }
1594 }
1595 }
1596 }
1597 lexiconFreedom[string_ref(0)] = true;
1598 for(auto tok : lexicons_to_build)
1599 {
1600 tok.tag_filter = tag_filter_t();
1601 tok.mode = Normal;
1602 bool free = ((tok.left.name.empty() || lexiconFreedom[tok.left.name]) &&
1603 (tok.right.name.empty() || lexiconFreedom[tok.right.name]));
1604 getLexiconTransducerWithFlags(tok, free);
1605 }
1606}
1607
1608int
1609LexdCompiler::buildPatternSingleLexicon(pattern_element_t tok, int start_state)
1610{
1611 if(patternTransducers.find(tok) == patternTransducers.end() || patternTransducers[tok] != NULL__null)
1612 {
1613 patternTransducers[tok] = NULL__null;
1614 int end = -1;
1615 string_ref transition_flag = internName(" ");
1616 for(auto pattern : patterns[tok.left.name])
1617 {
1618 int next_start_state = start_state;
1619 size_t next_start_idx = 0;
1620 lineNumber = pattern.first;
1621 set<string_ref> to_clear;
1622 size_t count = (tok.tag_filter.pos().empty() ? 1 : pattern.second.size());
1623 for(size_t tag_idx = 0; tag_idx < count; tag_idx++)
1624 {
1625 int state = next_start_state;
1626 bool finished = true;
1627 for(size_t i = next_start_idx; i < pattern.second.size(); i++)
1628 {
1629 pattern_element_t cur = pattern.second[i];
1630
1631 if(cur.left.name == left_sieve_name)
1632 {
1633 hyperminTrans->linkStates(start_state, state, 0);
1634 continue;
1635 }
1636 else if(cur.left.name == right_sieve_name)
1637 {
1638 if(end == -1)
1639 {
1640 end = hyperminTrans->insertNewSingleTransduction(0, state);
1641 }
1642 else
1643 {
1644 hyperminTrans->linkStates(state, end, 0);
1645 }
1646 continue;
1647 }
1648
1649 if(i == tag_idx)
1650 {
1651 next_start_state = state;
1652 next_start_idx = tag_idx;
1653 cur.tag_filter.combine(tok.tag_filter.pos());
1654 }
1655 cur.tag_filter.combine(tok.tag_filter.neg());
1656
1657 int mode_state = state;
1658
1659 if(isLexiconToken(cur))
1660 {
1661 tags_t tags = cur.tag_filter.tags();
1662 for(auto tag : tags)
1663 {
1664 trans_sym_t flag = getFlag(Clear, tag, 0);
1665 state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state);
1666 }
1667 pattern_element_t untagged = cur;
1668 untagged.tag_filter = tag_filter_t();
1669 untagged.mode = Normal;
1670 bool free = (lexiconFreedom[cur.left.name] && lexiconFreedom[cur.right.name]);
1671 if(!free)
1672 {
1673 to_clear.insert(cur.left.name);
1674 to_clear.insert(cur.right.name);
1675 }
1676 trans_sym_t inflag = getFlag(Positive, transition_flag, transitionCount);
1677 trans_sym_t outflag = getFlag(Require, transition_flag, transitionCount);
1678 transitionCount++;
1679 if(transducerLocs.find(untagged) == transducerLocs.end())
1680 {
1681 state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(inflag, inflag), state);
1682 Transducer* lex = getLexiconTransducerWithFlags(untagged, free);
1683 int start = state;
1684 state = hyperminTrans->insertTransducer(state, *lex);
1685 transducerLocs[untagged] = make_pair(start, state);
1686 }
1687 else
1688 {
1689 auto loc = transducerLocs[untagged];
1690 hyperminTrans->linkStates(state, loc.first, (int)alphabet_lookup(inflag, inflag));
1691 state = loc.second;
1692 }
1693 state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(outflag, outflag), state);
1694 for(auto tag : cur.tag_filter.pos())
1695 {
1696 trans_sym_t flag = getFlag(Require, tag, 1);
1697 state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state);
1698 }
1699 for(auto tag : cur.tag_filter.neg())
1700 {
1701 trans_sym_t flag = getFlag(Disallow, tag, 1);
1702 state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state);
1703 }
1704 }
1705 else
1706 {
1707 state = buildPatternSingleLexicon(cur, state);
1708 if(state == -1)
1709 {
1710 finished = false;
1711 break;
1712 }
1713 }
1714
1715 if(cur.mode & Optional)
1716 {
1717 hyperminTrans->linkStates(mode_state, state, 0);
1718 }
1719 if(cur.mode & Repeated)
1720 {
1721 hyperminTrans->linkStates(state, mode_state, 0);
1722 }
1723 }
1724 if(finished)
1725 {
1726 for(auto lex : to_clear)
1727 {
1728 if(lex.empty())
1729 {
1730 continue;
1731 }
1732 trans_sym_t flag = getFlag(Clear, lex, 0);
1733 state = hyperminTrans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state);
1734 }
1735 if(end == -1)
1736 {
1737 end = state;
1738 }
1739 else
1740 {
1741 hyperminTrans->linkStates(state, end, 0);
1742 }
1743 }
1744 }
1745 }
1746 patternTransducers.erase(tok);
1747 return end;
1748 }
1749 else
1750 {
1751 die("Cannot compile self-recursive pattern '%S'", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()));
1752 return 0;
1753 }
1754}
1755
1756void
1757LexdCompiler::readFile(UFILE* infile)
1758{
1759 input = infile;
1760 doneReading = false;
1761 while(!u_feofu_feof_72(input))
1762 {
1763 processNextLine();
1764 if(doneReading) break;
1765 }
1766 finishLexicon();
1767}
1768
1769Transducer*
1770LexdCompiler::buildTransducer(bool usingFlags)
1771{
1772 token_t start_tok = {.name = internName(" "), .part = 1, .optional = false};
1773 pattern_element_t start_pat = {.left=start_tok, .right=start_tok,
1774 .tag_filter=tag_filter_t(),
1775 .mode=Normal};
1776 if(usingFlags)
1777 {
1778 if(shouldHypermin)
1779 {
1780 hyperminTrans = new Transducer();
1781 }
1782 Transducer *t = buildPatternWithFlags(start_pat);
1783 if(shouldHypermin)
1784 t->minimize();
1785 return t;
1786 }
1787 else return buildPattern(start_pat);
1788}
1789
1790Transducer*
1791LexdCompiler::buildTransducerSingleLexicon()
1792{
1793 tagsAsMinFlags = true;
1794 token_t start_tok = {.name = internName(" "), .part = 1, .optional = false};
1795 pattern_element_t start_pat = {.left=start_tok, .right=start_tok,
1796 .tag_filter=tag_filter_t(),
1797 .mode=Normal};
1798 hyperminTrans = new Transducer();
1799 buildAllLexicons();
1800 int end = buildPatternSingleLexicon(start_pat, 0);
1801 if(end == -1)
1802 {
1803 cerr << "WARNING: No non-empty patterns found." << endl;
1804 }
1805 else {
1806 hyperminTrans->setFinal(end);
1807 hyperminTrans->minimize();
1808 }
1809 return hyperminTrans;
1810}
1811
1812void expand_alternation(vector<pattern_t> &pats, const vector<pattern_element_t> &alternation)
1813{
1814 if(alternation.empty())
1815 return;
1816 if(pats.empty())
1817 pats.push_back(pattern_t());
1818 vector<pattern_t> new_pats;
1819 for(const auto &pat: pats)
1820 {
1821 for(const auto &tok: alternation)
1822 {
1823 auto pat1 = pat;
1824 pat1.push_back(tok);
1825 new_pats.push_back(pat1);
1826 }
1827 }
1828 pats = new_pats;
1829}
1830
1831void
1832LexdCompiler::insertEntry(Transducer* trans, const lex_seg_t &seg)
1833{
1834 int state = trans->getInitial();
1835 if(tagsAsFlags)
1836 {
1837 for(string_ref tag : seg.tags)
1838 {
1839 trans_sym_t check1 = getFlag(Require, tag, 1);
1840 trans_sym_t check2 = getFlag(Disallow, tag, 2);
1841 trans_sym_t clear = getFlag(Clear, tag, 0);
1842 int state2 = trans->insertSingleTransduction((int)alphabet_lookup(check1, check1), state);
1843 int state3 = trans->insertSingleTransduction((int)alphabet_lookup(clear, clear), state2);
1844 trans->linkStates(state, state3, 0);
1845 state = trans->insertSingleTransduction((int)alphabet_lookup(check2, check2), state3);
1846 }
1847 }
1848 else if(tagsAsMinFlags)
1849 {
1850 for(string_ref tag : seg.tags)
1851 {
1852 trans_sym_t flag = getFlag(Positive, tag, 1);
1853 state = trans->insertSingleTransduction((int)alphabet_lookup(flag, flag), state);
1854 }
1855 }
1856 if (seg.regex != nullptr) {
1857 state = trans->insertTransducer(state, *seg.regex);
1858 }
1859 if(!shouldAlign)
1860 {
1861 for(unsigned int i = 0; i < seg.left.symbols.size() || i < seg.right.symbols.size(); i++)
1862 {
1863 trans_sym_t l = (i < seg.left.symbols.size()) ? seg.left.symbols[i] : trans_sym_t();
1864 trans_sym_t r = (i < seg.right.symbols.size()) ? seg.right.symbols[i] : trans_sym_t();
1865 state = trans->insertSingleTransduction(alphabet((int)l, (int)r), state);
1866 }
1867 }
1868 else
1869 {
1870 /*
1871 This code is adapted from hfst/libhfst/src/parsers/lexc-utils.cc
1872 It uses the Levenshtein distance algorithm to determine the optimal
1873 alignment of two strings.
1874 In hfst-lexc, the priority order for ties is SUB > DEL > INS
1875 which ensures that 000abc:xyz000 is preferred over abc000:000xyz
1876 However, we're traversing the strings backwards to simplify extracting
1877 the final alignment, so we need to switch INS and DEL.
1878 If shouldCompress is true, we set the cost of SUB to 1 in order to prefer
1879 a:b over 0:b a:0 without changing the alignment of actual correspondences.
1880 */
1881 const unsigned int INS = 0;
1882 const unsigned int DEL = 1;
1883 const unsigned int SUB = 2;
1884 const unsigned int ins_cost = 1;
1885 const unsigned int del_cost = 1;
1886 const unsigned int sub_cost = (shouldCompress ? 1 : 100);
1887
1888 const unsigned int len1 = seg.left.symbols.size();
1889 const unsigned int len2 = seg.right.symbols.size();
1890 unsigned int cost[len1+1][len2+1];
1891 unsigned int path[len1+1][len2+1];
1892 cost[0][0] = 0;
1893 path[0][0] = 0;
1894 for(unsigned int i = 1; i <= len1; i++)
1895 {
1896 cost[i][0] = del_cost * i;
1897 path[i][0] = DEL;
1898 }
1899 for(unsigned int i = 1; i <= len2; i++)
1900 {
1901 cost[0][i] = ins_cost * i;
1902 path[0][i] = INS;
1903 }
1904
1905 for(unsigned int i = 1; i <= len1; i++)
1906 {
1907 for(unsigned int j = 1; j <= len2; j++)
1908 {
1909 unsigned int sub = cost[i-1][j-1] + (seg.left.symbols[len1-i] == seg.right.symbols[len2-j] ? 0 : sub_cost);
1910 unsigned int ins = cost[i][j-1] + ins_cost;
1911 unsigned int del = cost[i-1][j] + del_cost;
1912
1913 if(sub <= ins && sub <= del)
1914 {
1915 cost[i][j] = sub;
1916 path[i][j] = SUB;
1917 }
1918 else if(ins <= del)
1919 {
1920 cost[i][j] = ins;
1921 path[i][j] = INS;
1922 }
1923 else
1924 {
1925 cost[i][j] = del;
1926 path[i][j] = DEL;
1927 }
1928 }
1929 }
1930
1931 for(unsigned int x = len1, y = len2; (x > 0) || (y > 0);)
1932 {
1933 trans_sym_t symbol;
1934 switch(path[x][y])
1935 {
1936 case SUB:
1937 symbol = alphabet_lookup(seg.left.symbols[len1-x], seg.right.symbols[len2-y]);
1938 x--;
1939 y--;
1940 break;
1941 case INS:
1942 symbol = alphabet_lookup(trans_sym_t(), seg.right.symbols[len2-y]);
1943 y--;
1944 break;
1945 default: // DEL
1946 symbol = alphabet_lookup(seg.left.symbols[len1-x], trans_sym_t());
1947 x--;
1948 }
1949 state = trans->insertSingleTransduction((int)symbol, state);
1950 }
1951 }
1952 trans->setFinal(state);
1953}
1954
1955void
1956LexdCompiler::applyMode(Transducer* trans, RepeatMode mode)
1957{
1958 if(mode == Question)
1959 trans->optional();
1960 else if(mode == Star)
1961 trans->zeroOrMore();
1962 else if(mode == Plus)
1963 trans->oneOrMore();
1964}
1965
1966Transducer*
1967LexdCompiler::getLexiconTransducer(pattern_element_t tok, unsigned int entry_index, bool free)
1968{
1969 if(!free && entryTransducers.find(tok) != entryTransducers.end())
1970 return entryTransducers[tok][entry_index];
1971 if(free && lexiconTransducers.find(tok) != lexiconTransducers.end())
1972 return lexiconTransducers[tok];
1973
1974 vector<entry_t>& lents = lexicons[tok.left.name];
1975 if(tok.left.name.valid() && tok.left.part > lents[0].size())
1976 die("%S(%d) - part is out of range", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), tok.left.part);
1977 vector<entry_t>& rents = lexicons[tok.right.name];
1978 if(tok.right.name.valid() && tok.right.part > rents[0].size())
1979 die("%S(%d) - part is out of range", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), tok.right.part);
1980 if(tok.left.name.valid() && tok.right.name.valid() && lents.size() != rents.size())
1981 die("Cannot collate %S with %S - differing numbers of entries", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()));
1982 unsigned int count = (tok.left.name.valid() ? lents.size() : rents.size());
1983 vector<Transducer*> trans;
1984 if(free)
1985 trans.push_back(new Transducer());
1986 else
1987 trans.reserve(count);
1988 lex_seg_t empty;
1989 bool did_anything = false;
1990 for(unsigned int i = 0; i < count; i++)
1991 {
1992 lex_seg_t& le = (tok.left.name.valid() ? lents[i][tok.left.part-1] : empty);
1993 lex_seg_t& re = (tok.right.name.valid() ? rents[i][tok.right.part-1] : empty);
1994 tags_t tags = unionset(le.tags, re.tags);
1995 if(!tok.tag_filter.compatible(tags))
1996 {
1997 if(!free)
1998 trans.push_back(NULL__null);
1999 continue;
2000 }
2001 Transducer* t = free ? trans[0] : new Transducer();
2002 if (le.regex != nullptr || re.regex != nullptr) {
2003 if (tok.left.name.empty())
2004 die("Cannot use %S one-sided - it contains a regex", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()));
2005 if (tok.right.name.empty())
2006 die("Cannot use %S one-sided - it contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()));
2007 if (tok.left.name != tok.right.name)
2008 die("Cannot collate %S with %S - %S contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), err(name((le.regex != nullptr ? tok.left.name : tok.right.name)))(to_ustring(name((le.regex != nullptr ? tok.left.name : tok.right
.name))).c_str())
);
2009 }
2010 insertEntry(t, {.left=le.left, .right=re.right, .regex=le.regex, .tags=tags});
2011 did_anything = true;
2012 if(!free)
2013 {
2014 applyMode(t, tok.mode);
2015 trans.push_back(t);
2016 }
2017 }
2018 if(tok.optional()) {
2019 Transducer* t = free ? trans[0] : new Transducer();
2020 tags_t empty_tags;
2021 insertEntry(t, {.left=empty.left, .right=empty.right, .regex=nullptr, .tags=empty_tags});
2022 did_anything = true;
Value stored to 'did_anything' is never read
2023 if (!free) {
2024 applyMode(t, tok.mode);
2025 trans.push_back(t);
2026 }
2027 did_anything = true;
2028 }
2029 if(free)
2030 {
2031 if(!did_anything)
2032 {
2033 trans[0] = NULL__null;
2034 }
2035 if(trans[0])
2036 {
2037 trans[0]->minimize();
2038 applyMode(trans[0], tok.mode);
2039 }
2040 lexiconTransducers[tok] = trans[0];
2041 return trans[0];
2042 }
2043 else
2044 {
2045 entryTransducers[tok] = trans;
2046 return trans[entry_index];
2047 }
2048}
2049
2050void
2051LexdCompiler::encodeFlag(UnicodeString& str, int flag)
2052{
2053 UnicodeString letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
2054 int num = flag;
2055 while(num > 0)
2056 {
2057 str += letters[num % 26];
2058 num /= 26;
2059 }
2060}
2061
2062trans_sym_t
2063LexdCompiler::getFlag(FlagDiacriticType type, string_ref flag, unsigned int value)
2064{
2065 //cerr << "getFlag(" << type << ", " << to_ustring(name(flag)) << ", " << value << ")" << endl;
2066 UnicodeString flagstr = "@";
2067 switch(type)
2068 {
2069 case Unification:
2070 //cerr << " Unification" << endl;
2071 flagstr += "U."; break;
2072 case Positive:
2073 //cerr << " Positive" << endl;
2074 flagstr += "P."; break;
2075 case Negative:
2076 //cerr << " Negative" << endl;
2077 flagstr += "N."; break;
2078 case Require:
2079 //cerr << " Require" << endl;
2080 flagstr += "R."; break;
2081 case Disallow:
2082 //cerr << " Disallow" << endl;
2083 flagstr += "D."; break;
2084 case Clear:
2085 //cerr << " Clear" << endl;
2086 flagstr += "C."; break;
2087 }
2088 encodeFlag(flagstr, (int)flag.i);
2089 if(type != Clear)
2090 {
2091 flagstr += ".";
2092 encodeFlag(flagstr, (int)(value + 1));
2093 }
2094 flagstr += "@";
2095 return alphabet_lookup(flagstr);
2096}
2097
2098Transducer*
2099LexdCompiler::getLexiconTransducerWithFlags(pattern_element_t& tok, bool free)
2100{
2101 if(!free && entryTransducers.find(tok) != entryTransducers.end())
2102 return entryTransducers[tok][0];
2103 if(free && lexiconTransducers.find(tok) != lexiconTransducers.end())
2104 return lexiconTransducers[tok];
2105
2106 // TODO: can this be abstracted from here and getLexiconTransducer()?
2107 vector<entry_t>& lents = lexicons[tok.left.name];
2108 if(tok.left.name.valid() && tok.left.part > lents[0].size())
2109 die("%S(%d) - part is out of range", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), tok.left.part);
2110 vector<entry_t>& rents = lexicons[tok.right.name];
2111 if(tok.right.name.valid() && tok.right.part > rents[0].size())
2112 die("%S(%d) - part is out of range", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), tok.right.part);
2113 if(tok.left.name.valid() && tok.right.name.valid() && lents.size() != rents.size())
2114 die("Cannot collate %S with %S - differing numbers of entries", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()));
2115 unsigned int count = (tok.left.name.valid() ? lents.size() : rents.size());
2116 Transducer* trans = new Transducer();
2117 lex_seg_t empty;
2118 bool did_anything = false;
2119 for(unsigned int i = 0; i < count; i++)
2120 {
2121 lex_seg_t& le = (tok.left.name.valid() ? lents[i][tok.left.part-1] : empty);
2122 lex_seg_t& re = (tok.right.name.valid() ? rents[i][tok.right.part-1] : empty);
2123 tags_t tags = unionset(le.tags, re.tags);
2124 if(!tok.tag_filter.compatible(tags))
2125 {
2126 continue;
2127 }
2128 did_anything = true;
2129 lex_seg_t seg;
2130 if (le.regex != nullptr || re.regex != nullptr) {
2131 if (tok.left.name.empty())
2132 die("Cannot use %S one-sided - it contains a regex", err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()));
2133 if (tok.right.name.empty())
2134 die("Cannot use %S one-sided - it contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()));
2135 if (tok.left.name != tok.right.name)
2136 die("Cannot collate %S with %S - %S contains a regex", err(name(tok.left.name))(to_ustring(name(tok.left.name)).c_str()), err(name(tok.right.name))(to_ustring(name(tok.right.name)).c_str()), err(name((le.regex != nullptr ? tok.left.name : tok.right.name)))(to_ustring(name((le.regex != nullptr ? tok.left.name : tok.right
.name))).c_str())
);
2137 seg.regex = le.regex;
2138 }
2139 if(!free && tok.left.name.valid())
2140 {
2141 trans_sym_t flag = getFlag(Unification, tok.left.name, i);
2142 seg.left.symbols.push_back(flag);
2143 seg.right.symbols.push_back(flag);
2144 }
2145 if(!free && tok.right.name.valid() && tok.right.name != tok.left.name)
2146 {
2147 trans_sym_t flag = getFlag(Unification, tok.right.name, i);
2148 seg.left.symbols.push_back(flag);
2149 seg.right.symbols.push_back(flag);
2150 }
2151 if(tok.left.name.valid())
2152 {
2153 seg.left.symbols.insert(seg.left.symbols.end(), le.left.symbols.begin(), le.left.symbols.end());
2154 }
2155 if(tok.right.name.valid())
2156 {
2157 seg.right.symbols.insert(seg.right.symbols.end(), re.right.symbols.begin(), re.right.symbols.end());
2158 }
2159 seg.tags.insert(tags.begin(), tags.end());
2160 insertEntry(trans, seg);
2161 }
2162 if(tok.optional()) {
2163 lex_seg_t seg;
2164 if (!free && tok.left.name.valid()) {
2165 trans_sym_t flag = getFlag(Unification, tok.left.name, count);
2166 seg.left.symbols.push_back(flag);
2167 seg.right.symbols.push_back(flag);
2168 }
2169 if (!free && tok.right.name.valid() && tok.right.name != tok.left.name) {
2170 trans_sym_t flag = getFlag(Unification, tok.right.name, count);
2171 seg.left.symbols.push_back(flag);
2172 seg.right.symbols.push_back(flag);
2173 }
2174 insertEntry(trans, seg);
2175 }
2176 if(did_anything)
2177 {
2178 trans->minimize();
2179 applyMode(trans, tok.mode);
2180 }
2181 else
2182 {
2183 trans = NULL__null;
2184 }
2185 if(free)
2186 {
2187 lexiconTransducers[tok] = trans;
2188 }
2189 else
2190 {
2191 entryTransducers[tok] = vector<Transducer*>(1, trans);
2192 }
2193 return trans;
2194}
2195
2196void
2197LexdCompiler::printStatistics() const
2198{
2199 cerr << "Lexicons: " << lexicons.size() << endl;
2200 cerr << "Lexicon entries: ";
2201 unsigned int x = 0;
2202 for(const auto &lex: lexicons)
2203 x += lex.second.size();
2204 cerr << x << endl;
2205 x = 0;
2206 cerr << "Patterns: " << patterns.size() << endl;
2207 cerr << "Pattern entries: ";
2208 for(const auto &pair: patterns)
2209 x += pair.second.size();
2210 cerr << x << endl;
2211 cerr << endl;
2212 cerr << "Counts for individual lexicons:" << endl;
2213 unsigned int anon = 0;
2214 for(const auto &lex: lexicons)
2215 {
2216 if(empty(lex.first)) continue;
2217 UString n = to_ustring(name(lex.first));
2218 if(n[0] == ' ') anon += lex.second.size();
2219 else cerr << n << ": " << lex.second.size() << endl;
2220 }
2221 cerr << "All anonymous lexicons: " << anon << endl;
2222}