File: | GrammarApplicator_runRules.cpp |
Warning: | line 702, column 7 Value stored to 'did_test' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /* |
2 | * Copyright (C) 2007-2024, GrammarSoft ApS |
3 | * Developed by Tino Didriksen <mail@tinodidriksen.com> |
4 | * Design by Eckhard Bick <eckhard.bick@mail.dk>, Tino Didriksen <mail@tinodidriksen.com> |
5 | * |
6 | * This program is free software: you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by |
8 | * the Free Software Foundation, either version 3 of the License, or |
9 | * (at your option) any later version. |
10 | * |
11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU General Public License |
17 | * along with this program. If not, see <https://www.gnu.org/licenses/>. |
18 | */ |
19 | |
20 | #include "GrammarApplicator.hpp" |
21 | #include "Strings.hpp" |
22 | #include "Tag.hpp" |
23 | #include "Grammar.hpp" |
24 | #include "Window.hpp" |
25 | #include "SingleWindow.hpp" |
26 | #include "Reading.hpp" |
27 | #include "ContextualTest.hpp" |
28 | #include "version.hpp" |
29 | #include "process.hpp" |
30 | |
31 | namespace CG3 { |
32 | |
// Result flags for rule application; each value occupies its own bit.
// runRulesOnSingleWindow() starts its return value out as RV_NOTHING.
enum {
	RV_NOTHING   = 1 << 0,
	RV_SOMETHING = 1 << 1,
	RV_DELIMITED = 1 << 2,
	RV_TRACERULE = 1 << 3,
};
39 | |
40 | bool GrammarApplicator::doesWordformsMatch(const Tag* cword, const Tag* rword) { |
41 | if (rword && rword != cword) { |
42 | if (rword->type & T_REGEXP) { |
43 | if (!doesTagMatchRegexp(cword->hash, *rword)) { |
44 | return false; |
45 | } |
46 | } |
47 | else if (rword->type & T_CASE_INSENSITIVE) { |
48 | if (!doesTagMatchIcase(cword->hash, *rword)) { |
49 | return false; |
50 | } |
51 | } |
52 | else { |
53 | return false; |
54 | } |
55 | } |
56 | return true; |
57 | } |
58 | |
59 | bool GrammarApplicator::updateRuleToCohorts(Cohort& c, const uint32_t& rsit) { |
60 | // Check whether this rule is in the allowed rule list from cmdline flag --rule(s) |
61 | if (!valid_rules.empty() && !valid_rules.contains(rsit)) { |
62 | return false; |
63 | } |
64 | SingleWindow* current = c.parent; |
65 | const Rule* r = grammar->rule_by_number[rsit]; |
66 | if (!doesWordformsMatch(c.wordform, r->wordform)) { |
67 | return false; |
68 | } |
69 | if (current->rule_to_cohorts.size() < rsit+1) { |
70 | indexSingleWindow(*current); |
71 | } |
72 | CohortSet& cohortset = current->rule_to_cohorts[rsit]; |
73 | std::vector<size_t> csi; |
74 | for (size_t i = 0; i < cohortsets.size(); ++i) { |
75 | if (cohortsets[i] != &cohortset) { |
76 | continue; |
77 | } |
78 | csi.push_back(i); |
79 | } |
80 | if (!csi.empty()) { |
81 | auto cap = cohortset.capacity(); |
82 | std::vector<CohortSet::const_iterator*> ends; |
83 | std::vector<std::pair<CohortSet::const_iterator*,Cohort*>> chs; |
84 | for (size_t i = 0; i < csi.size(); ++i) { |
85 | if (*rocits[csi[i]] == cohortset.end()) { |
86 | ends.push_back(rocits[csi[i]]); |
87 | } |
88 | else { |
89 | chs.push_back(std::pair(rocits[csi[i]], **rocits[csi[i]])); |
90 | } |
91 | } |
92 | cohortset.insert(&c); |
93 | for (auto it : ends) { |
94 | *it = cohortset.end(); |
95 | } |
96 | if (cap != cohortset.capacity()) { |
97 | for (auto& it : chs) { |
98 | *it.first = cohortset.find(it.second); |
99 | } |
100 | } |
101 | } |
102 | else { |
103 | cohortset.insert(&c); |
104 | } |
105 | return current->valid_rules.insert(rsit); |
106 | } |
107 | |
108 | bool GrammarApplicator::updateValidRules(const uint32IntervalVector& rules, uint32IntervalVector& intersects, const uint32_t& hash, Reading& reading) { |
109 | size_t os = intersects.size(); |
110 | auto it = grammar->rules_by_tag.find(hash); |
111 | if (it != grammar->rules_by_tag.end()) { |
112 | Cohort& c = *(reading.parent); |
113 | for (auto rsit : (it->second)) { |
114 | if (updateRuleToCohorts(c, rsit) && rules.contains(rsit)) { |
115 | intersects.insert(rsit); |
116 | } |
117 | } |
118 | } |
119 | return (os != intersects.size()); |
120 | } |
121 | |
122 | void GrammarApplicator::indexSingleWindow(SingleWindow& current) { |
123 | current.valid_rules.clear(); |
124 | current.rule_to_cohorts.resize(grammar->rule_by_number.size()); |
125 | for (auto& cs : current.rule_to_cohorts) { |
126 | cs.clear(); |
127 | } |
128 | |
129 | for (auto c : current.cohorts) { |
130 | for (uint32_t psit = 0; psit < c->possible_sets.size(); ++psit) { |
131 | if (c->possible_sets.test(psit) == false) { |
132 | continue; |
133 | } |
134 | auto rules_it = grammar->rules_by_set.find(psit); |
135 | if (rules_it == grammar->rules_by_set.end()) { |
136 | continue; |
137 | } |
138 | for (auto rsit : rules_it->second) { |
139 | updateRuleToCohorts(*c, rsit); |
140 | } |
141 | } |
142 | } |
143 | } |
144 | |
145 | TagList GrammarApplicator::getTagList(const Set& theSet, bool unif_mode) const { |
146 | TagList theTags; |
147 | getTagList(theSet, theTags, unif_mode); |
148 | return theTags; |
149 | } |
150 | |
151 | void GrammarApplicator::getTagList(const Set& theSet, TagList& theTags, bool unif_mode) const { |
152 | if (theSet.type & ST_SET_UNIFY) { |
153 | const auto& usets = (*context_stack.back().unif_sets)[theSet.number]; |
154 | const Set& pSet = *(grammar->sets_list[theSet.sets[0]]); |
155 | for (auto iter : pSet.sets) { |
156 | if (usets.count(iter)) { |
157 | getTagList(*(grammar->sets_list[iter]), theTags); |
158 | } |
159 | } |
160 | } |
161 | else if (theSet.type & ST_TAG_UNIFY) { |
162 | for (auto iter : theSet.sets) { |
163 | getTagList(*(grammar->sets_list[iter]), theTags, true); |
164 | } |
165 | } |
166 | else if (!theSet.sets.empty()) { |
167 | for (auto iter : theSet.sets) { |
168 | getTagList(*(grammar->sets_list[iter]), theTags, unif_mode); |
169 | } |
170 | } |
171 | else if (unif_mode) { |
172 | auto unif_tags = context_stack.back().unif_tags; |
173 | auto iter = unif_tags->find(theSet.number); |
174 | if (iter != unif_tags->end()) { |
175 | trie_getTagList(theSet.trie, theTags, iter->second); |
176 | trie_getTagList(theSet.trie_special, theTags, iter->second); |
177 | } |
178 | } |
179 | else { |
180 | trie_getTagList(theSet.trie, theTags); |
181 | trie_getTagList(theSet.trie_special, theTags); |
182 | } |
183 | // Eliminate consecutive duplicates. Not all duplicates, since AddCohort and Append may have multiple readings with repeated tags |
184 | for (auto ot = theTags.begin(); theTags.size() > 1 && ot != theTags.end(); ++ot) { |
185 | auto it = ot; |
186 | ++it; |
187 | for (; it != theTags.end() && std::distance(ot, it) == 1;) { |
188 | if (*ot == *it) { |
189 | it = theTags.erase(it); |
190 | } |
191 | else { |
192 | ++it; |
193 | } |
194 | } |
195 | } |
196 | } |
197 | |
198 | Reading* GrammarApplicator::get_sub_reading(Reading* tr, int sub_reading) { |
199 | if (sub_reading == 0) { |
200 | return tr; |
201 | } |
202 | |
203 | if (sub_reading == GSR_ANY) { |
204 | // If there aren't any sub-readings, the primary reading is the same as the amalgamation of all readings |
205 | if (tr->next == nullptr) { |
206 | return tr; |
207 | } |
208 | |
209 | subs_any.emplace_back(Reading()); |
210 | Reading* reading = &subs_any.back(); |
211 | *reading = *tr; |
212 | reading->next = nullptr; |
213 | while (tr->next) { |
214 | tr = tr->next; |
215 | reading->tags_list.push_back(0); |
216 | reading->tags_list.insert(reading->tags_list.end(), tr->tags_list.begin(), tr->tags_list.end()); |
217 | for (auto tag : tr->tags) { |
218 | reading->tags.insert(tag); |
219 | reading->tags_bloom.insert(tag); |
220 | } |
221 | for (auto tag : tr->tags_plain) { |
222 | reading->tags_plain.insert(tag); |
223 | reading->tags_plain_bloom.insert(tag); |
224 | } |
225 | for (auto tag : tr->tags_textual) { |
226 | reading->tags_textual.insert(tag); |
227 | reading->tags_textual_bloom.insert(tag); |
228 | } |
229 | reading->tags_numerical.insert(tr->tags_numerical.begin(), tr->tags_numerical.end()); |
230 | if (tr->mapped) { |
231 | reading->mapped = true; |
232 | } |
233 | if (tr->mapping) { |
234 | reading->mapping = tr->mapping; |
235 | } |
236 | if (tr->matched_target) { |
237 | reading->matched_target = true; |
238 | } |
239 | if (tr->matched_tests) { |
240 | reading->matched_tests = true; |
241 | } |
242 | } |
243 | reading->rehash(); |
244 | return reading; |
245 | } |
246 | |
247 | if (sub_reading > 0) { |
248 | for (int i = 0; i < sub_reading && tr; ++i) { |
249 | tr = tr->next; |
250 | } |
251 | } |
252 | else if (sub_reading < 0) { |
253 | int ntr = 0; |
254 | Reading* ttr = tr; |
255 | while (ttr) { |
256 | ttr = ttr->next; |
257 | --ntr; |
258 | } |
259 | if (!tr->next) { |
260 | tr = nullptr; |
261 | } |
262 | for (auto i = ntr; i < sub_reading && tr; ++i) { |
263 | tr = tr->next; |
264 | } |
265 | } |
266 | return tr; |
267 | } |
268 | |
// TRACE: records the current rule's number in hit_by of the sub-reading being applied to,
// and also in hit_by of the enclosing reading when rule->sub_reading is 32767.
// Expects `rule` and get_apply_to() to be available at the expansion site.
#define TRACE \
	do { \
		auto&& trace_target_ = get_apply_to(); \
		trace_target_.subreading->hit_by.push_back(rule->number); \
		if (rule->sub_reading == 32767) { \
			trace_target_.reading->hit_by.push_back(rule->number); \
		} \
	} while (0)
276 | |
// FILL_TAG_LIST(taglist): filters *taglist against the sub-reading currently being applied to.
// Tags whose hash is not in reading.tags are erased; an erased T_SPECIAL tag is replaced by the
// concrete tag that doesTagMatchReading() resolves it to, if any.
// Expects get_apply_to(), context_stack, regexgrps_store, used_regex and grammar at the expansion site.
// NOTE(review): `it` is used again after (taglist)->insert(it, ...); confirm that this is valid
// for the container type behind TagList.
#define FILL_TAG_LIST(taglist) \
	do { \
		Reading& reading = *get_apply_to().subreading; \
		for (auto it = (taglist)->begin(); it != (taglist)->end();) { \
			if (reading.tags.find((*it)->hash) == reading.tags.end()) { \
				auto tt = *it; \
				it = (taglist)->erase(it); \
				if (tt->type & T_SPECIAL) { \
					if (context_stack.back().regexgrps == nullptr) { \
						context_stack.back().regexgrps = &regexgrps_store[used_regex]; \
					} \
					auto stag = doesTagMatchReading(reading, *tt, false, true); \
					if (stag) { \
						(taglist)->insert(it, grammar->single_tags.find(stag)->second); \
					} \
				} \
				continue; \
			} \
			++it; \
		} \
	} while (0)
298 | |
// FILL_TAG_LIST_RAW(taglist): like FILL_TAG_LIST but never removes entries. Each T_SPECIAL tag in
// *taglist is overwritten in place with the concrete tag that doesTagMatchReading() resolves it to
// for the sub-reading currently being applied to; all other entries are left untouched.
// Expects get_apply_to(), context_stack, regexgrps_store, used_regex and grammar at the expansion site.
#define FILL_TAG_LIST_RAW(taglist) \
	do { \
		Reading& reading = *get_apply_to().subreading; \
		for (auto& tt : *(taglist)) { \
			if (tt->type & T_SPECIAL) { \
				if (context_stack.back().regexgrps == nullptr) { \
					context_stack.back().regexgrps = &regexgrps_store[used_regex]; \
				} \
				auto stag = doesTagMatchReading(reading, *tt, false, true); \
				if (stag) { \
					tt = grammar->single_tags.find(stag)->second; \
				} \
			} \
		} \
	} while (0)
314 | |
// APPEND_TAGLIST_TO_READING(taglist, reading): adds every tag of taglist to reading.
// Varstring tags are expanded first. Mapping tags (T_MAPPING, or tags starting with the grammar's
// mapping prefix) are queued in *mappings instead of being added directly. After each tag the
// valid-rule index is updated, and the rule iterators are refreshed if the rule set grew.
// Expects mappings, rules, intersects, rule, iter_rules, iter_rules_end and grammar at the expansion site.
#define APPEND_TAGLIST_TO_READING(taglist, reading) \
	do { \
		for (auto tter : (taglist)) { \
			while (tter->type & T_VARSTRING) { \
				tter = generateVarstringTag(tter); \
			} \
			auto hash = tter->hash; \
			if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { \
				mappings->push_back(tter); \
			} \
			else { \
				hash = addTagToReading((reading), tter); \
			} \
			if (updateValidRules(rules, intersects, hash, reading)) { \
				iter_rules = intersects.find(rule->number); \
				iter_rules_end = intersects.end(); \
			} \
		} \
	} while (0)
334 | |
// VARSTRINGIFY(tag): replaces tag with generateVarstringTag(tag) until the result
// no longer has the T_VARSTRING type bit set. `tag` must be an assignable lvalue.
#define VARSTRINGIFY(tag) \
	do { \
		while ((tag)->type & T_VARSTRING) { \
			(tag) = generateVarstringTag((tag)); \
		} \
	} while (0)
342 | |
343 | |
344 | bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, RuleCallback reading_cb, RuleCallback cohort_cb) { |
345 | finish_cohort_loop = true; |
346 | bool anything_changed = false; |
347 | KEYWORDS type = rule.type; |
348 | const Set& set = *(grammar->sets_list[rule.target]); |
349 | CohortSet* cohortset = ¤t.rule_to_cohorts[rule.number]; |
350 | |
351 | auto override_cohortset = [&]() { |
352 | if (in_nested) { |
353 | if (!current.nested_rule_to_cohorts) { |
354 | current.nested_rule_to_cohorts.reset(new CohortSet()); |
355 | } |
356 | cohortset = current.nested_rule_to_cohorts.get(); |
357 | cohortset->clear(); |
358 | cohortset->insert(get_apply_to().cohort); |
359 | for (auto& t : set.trie_special) { |
360 | if (t.first->type & T_CONTEXT && t.first->context_ref_pos <= context_stack.back().context.size()) { |
361 | cohortset->insert(context_stack.back().context[t.first->context_ref_pos - 1]); |
362 | } |
363 | } |
364 | } |
365 | }; |
366 | override_cohortset(); |
367 | cohortsets.push_back(cohortset); |
368 | rocits.push_back(nullptr); |
369 | |
370 | scope_guard popper([&]() { |
371 | cohortsets.pop_back(); |
372 | rocits.pop_back(); |
373 | }); |
374 | |
375 | if (debug_level > 1) { |
376 | std::cerr << "DEBUG: " << cohortset->size() << "/" << current.cohorts.size() << " = " << double(cohortset->size()) / double(current.cohorts.size()) << std::endl; |
377 | } |
378 | for (auto rocit = cohortset->cbegin(); (!cohortset->empty()) && (rocit != cohortset->cend());) { |
379 | rocits.back() = &rocit; |
380 | Cohort* cohort = *rocit; |
381 | ++rocit; |
382 | |
383 | finish_reading_loop = true; |
384 | |
385 | if (debug_level > 1) { |
386 | std::cerr << "DEBUG: Trying cohort " << cohort->global_number << ":" << cohort->local_number << std::endl; |
387 | } |
388 | |
389 | // If the current cohort is the initial >>> one, skip it. |
390 | if (cohort->local_number == 0) { |
391 | continue; |
392 | } |
393 | // If the cohort is removed, skip it... |
394 | // Removed cohorts are still in the precalculated rule_to_cohorts map, |
395 | // and it would take time to go through the whole map searching for the cohort. |
396 | // Haven't tested whether it is worth it... |
397 | if (cohort->type & CT_REMOVED) { |
398 | continue; |
399 | } |
400 | |
401 | uint32_t c = cohort->local_number; |
402 | // If the cohort is temporarily unavailable due to parentheses, skip it. |
403 | if ((cohort->type & CT_ENCLOSED) || cohort->parent != ¤t) { |
404 | continue; |
405 | } |
406 | // If there are no readings, skip it. |
407 | // This is unlikely to happen as all cohorts will get a magic reading during input, |
408 | // and not many use the unsafe Remove rules. |
409 | if (cohort->readings.empty()) { |
410 | continue; |
411 | } |
412 | // If there's no reason to even attempt to restore, just skip it. |
413 | if (rule.type == K_RESTORE) { |
414 | if ((rule.flags & RF_DELAYED) && cohort->delayed.empty()) { |
415 | continue; |
416 | } |
417 | else if ((rule.flags & RF_IGNORED) && cohort->ignored.empty()) { |
418 | continue; |
419 | } |
420 | else if (!(rule.flags & (RF_DELAYED|RF_IGNORED)) && cohort->deleted.empty()) { |
421 | continue; |
422 | } |
423 | } |
424 | // If there is not even a remote chance the target set might match this cohort, skip it. |
425 | if (rule.sub_reading == 0 && (rule.target >= cohort->possible_sets.size() || !cohort->possible_sets.test(rule.target))) { |
426 | continue; |
427 | } |
428 | |
429 | // If there is only 1 reading left and it is a Select or safe Remove rule, skip it. |
430 | if (cohort->readings.size() == 1) { |
431 | if (type == K_SELECT) { |
432 | continue; |
433 | } |
434 | if (type == K_REMOVE || type == K_IFF) { |
435 | if (cohort->readings.front()->noprint) { |
436 | continue; |
437 | } |
438 | if ((!unsafe || (rule.flags & RF_SAFE)) && !(rule.flags & RF_UNSAFE)) { |
439 | continue; |
440 | } |
441 | } |
442 | } |
443 | else if (type == K_UNMAP && rule.flags & RF_SAFE) { |
444 | continue; |
445 | } |
446 | // If it's a Delimit rule and we're at the final cohort, skip it. |
447 | if (type == K_DELIMIT && c == current.cohorts.size() - 1) { |
448 | continue; |
449 | } |
450 | |
451 | // If the rule is only supposed to run inside a parentheses, check if cohort is. |
452 | if (rule.flags & RF_ENCL_INNER) { |
453 | if (!par_left_pos) { |
454 | continue; |
455 | } |
456 | if (cohort->local_number < par_left_pos || cohort->local_number > par_right_pos) { |
457 | continue; |
458 | } |
459 | } |
460 | // ...and if the rule should only run outside parentheses, check if cohort is. |
461 | else if (rule.flags & RF_ENCL_OUTER) { |
462 | if (par_left_pos && cohort->local_number >= par_left_pos && cohort->local_number <= par_right_pos) { |
463 | continue; |
464 | } |
465 | } |
466 | |
467 | // If this is SETPARENT SAFE and there's already a parent, skip it. |
468 | if (type == K_SETPARENT && (rule.flags & RF_SAFE) && cohort->dep_parent != DEP_NO_PARENT) { |
469 | continue; |
470 | } |
471 | if ((rule.flags & RF_NOPARENT) && cohort->dep_parent != DEP_NO_PARENT) { |
472 | continue; |
473 | } |
474 | |
475 | // Check if on previous runs the rule did not match this cohort, and skip if that is the case. |
476 | // This cache is cleared if any rule causes any state change in the window. |
477 | uint32_t ih = hash_value(rule.number, cohort->global_number); |
478 | if (index_ruleCohort_no.contains(ih)) { |
479 | continue; |
480 | } |
481 | index_ruleCohort_no.insert(ih); |
482 | |
483 | size_t num_active = 0; |
484 | size_t num_iff = 0; |
485 | |
486 | std::vector<Rule_Context> reading_contexts; |
487 | reading_contexts.reserve(cohort->readings.size()); |
488 | |
489 | // Assume that Iff rules are really Remove rules, until proven otherwise. |
490 | if (rule.type == K_IFF) { |
491 | type = K_REMOVE; |
492 | } |
493 | |
494 | bool did_test = false; |
495 | bool test_good = false; |
496 | bool matched_target = false; |
497 | |
498 | clear(readings_plain); |
499 | clear(subs_any); |
500 | |
501 | // Varstring capture groups exist on a per-cohort basis, since we may need them for mapping later. |
502 | clear(regexgrps_z); |
503 | clear(regexgrps_c); |
504 | clear(unif_tags_rs); |
505 | clear(unif_sets_rs); |
506 | |
507 | used_regex = 0; |
508 | regexgrps_store.resize(std::max(regexgrps_store.size(), cohort->readings.size())); |
509 | regexgrps_z.reserve(std::max(regexgrps_z.size(), cohort->readings.size())); |
510 | regexgrps_c.reserve(std::max(regexgrps_c.size(), cohort->readings.size())); |
511 | |
512 | size_t used_unif = 0; |
513 | unif_tags_store.resize(std::max(unif_tags_store.size(), cohort->readings.size() + 1)); |
514 | unif_sets_store.resize(std::max(unif_sets_store.size(), cohort->readings.size() + 1)); |
515 | |
516 | { |
517 | Rule_Context context; |
518 | context.target.cohort = cohort; |
519 | context_stack.push_back(std::move(context)); |
520 | } |
521 | |
522 | auto reset_cohorts = [&]() { |
523 | cohortset = ¤t.rule_to_cohorts[rule.number]; |
524 | override_cohortset(); |
525 | cohortsets.back() = cohortset; |
526 | if (get_apply_to().cohort->type & CT_REMOVED) { |
527 | rocit = cohortset->lower_bound(current.cohorts[get_apply_to().cohort->local_number]); |
528 | } |
529 | else { |
530 | rocit = cohortset->find(current.cohorts[get_apply_to().cohort->local_number]); |
531 | if (rocit != cohortset->end()) { |
532 | ++rocit; |
533 | } |
534 | } |
535 | }; |
536 | |
537 | // Remember the current state so we can compare later to see if anything has changed |
538 | const size_t state_num_readings = cohort->readings.size(); |
539 | const size_t state_num_removed = cohort->deleted.size(); |
540 | const size_t state_num_delayed = cohort->delayed.size(); |
541 | const size_t state_num_ignored = cohort->ignored.size(); |
542 | |
543 | // This loop figures out which readings, if any, that are valid targets for the current rule |
544 | // Criteria for valid is that the reading must match both target and all contextual tests |
545 | for (size_t i = 0; i < cohort->readings.size(); ++i) { |
546 | // ToDo: Switch sub-readings so that they build up a passed in vector<Reading*> |
547 | Reading* reading = get_sub_reading(cohort->readings[i], rule.sub_reading); |
548 | if (!reading) { |
549 | cohort->readings[i]->matched_target = false; |
550 | cohort->readings[i]->matched_tests = false; |
551 | continue; |
552 | } |
553 | context_stack.back().target.reading = cohort->readings[i]; |
554 | context_stack.back().target.subreading = reading; |
555 | |
556 | // The state is stored in the readings themselves, so clear the old states |
557 | reading->matched_target = false; |
558 | reading->matched_tests = false; |
559 | |
560 | if (reading->mapped && (rule.type == K_MAP || rule.type == K_ADD || rule.type == K_REPLACE)) { |
561 | continue; |
562 | } |
563 | if (reading->mapped && (rule.flags & RF_NOMAPPED)) { |
564 | continue; |
565 | } |
566 | if (reading->noprint && !allow_magic_readings) { |
567 | continue; |
568 | } |
569 | if (reading->immutable && rule.type != K_UNPROTECT) { |
570 | if (type == K_SELECT) { |
571 | reading->matched_target = true; |
572 | reading->matched_tests = true; |
573 | reading_contexts.push_back(context_stack.back()); |
574 | } |
575 | ++num_active; |
576 | ++num_iff; |
577 | continue; |
578 | } |
579 | |
580 | // Check if any previous reading of this cohort had the same plain signature, and if so just copy their results |
581 | // This cache is cleared on a per-cohort basis |
582 | did_test = false; |
583 | if (!(set.type & (ST_SPECIAL | ST_MAPPING | ST_CHILD_UNIFY)) && !readings_plain.empty()) { |
584 | auto rpit = readings_plain.find(reading->hash_plain); |
585 | if (rpit != readings_plain.end()) { |
586 | reading->matched_target = rpit->second->matched_target; |
587 | reading->matched_tests = rpit->second->matched_tests; |
588 | if (reading->matched_tests) { |
589 | ++num_active; |
590 | } |
591 | if (regexgrps_c.count(rpit->second->number)) { |
592 | regexgrps_c[reading->number]; |
593 | regexgrps_c[reading->number] = regexgrps_c[rpit->second->number]; |
594 | regexgrps_z[reading->number]; |
595 | regexgrps_z[reading->number] = regexgrps_z[rpit->second->number]; |
596 | |
597 | context_stack.back().regexgrp_ct = regexgrps_z[reading->number]; |
598 | context_stack.back().regexgrps = regexgrps_c[reading->number]; |
599 | } |
600 | context_stack.back().unif_tags = unif_tags_rs[reading->hash_plain]; |
601 | context_stack.back().unif_sets = unif_sets_rs[reading->hash_plain]; |
602 | did_test = true; |
603 | test_good = rpit->second->matched_tests; |
604 | reading_contexts.push_back(context_stack.back()); |
605 | continue; |
606 | } |
607 | } |
608 | |
609 | // Regex capture is done on a per-reading basis, so clear all captured state. |
610 | context_stack.back().regexgrp_ct = 0; |
611 | context_stack.back().regexgrps = ®exgrps_store[used_regex]; |
612 | |
613 | // Unification is done on a per-reading basis, so clear all unification state. |
614 | context_stack.back().unif_tags = &unif_tags_store[used_unif]; |
615 | context_stack.back().unif_sets = &unif_sets_store[used_unif]; |
616 | unif_tags_rs[reading->hash_plain] = context_stack.back().unif_tags; |
617 | unif_sets_rs[reading->hash_plain] = context_stack.back().unif_sets; |
618 | unif_tags_rs[reading->hash] = context_stack.back().unif_tags; |
619 | unif_sets_rs[reading->hash] = context_stack.back().unif_sets; |
620 | ++used_unif; |
621 | |
622 | context_stack.back().unif_tags->clear(); |
623 | context_stack.back().unif_sets->clear(); |
624 | |
625 | unif_last_wordform = 0; |
626 | unif_last_baseform = 0; |
627 | unif_last_textual = 0; |
628 | |
629 | same_basic = reading->hash_plain; |
630 | rule_target = context_target = nullptr; |
631 | if (context_stack.size() > 1) { |
632 | Cohort* m = context_stack[context_stack.size()-2].mark; |
633 | if (m) set_mark(m); |
634 | else set_mark(cohort); |
635 | } |
636 | else { |
637 | set_mark(cohort); |
638 | } |
639 | uint8_t orz = context_stack.back().regexgrp_ct; |
640 | for (auto r = cohort->readings[i]; r; r = r->next) { |
641 | r->active = true; |
642 | } |
643 | if (rule.line == 2746) { |
644 | cohort = cohort; |
645 | } |
646 | rule_target = cohort; |
647 | // Actually check if the reading is a valid target. First check if rule target matches... |
648 | if (rule.target && doesSetMatchReading(*reading, rule.target, (set.type & (ST_CHILD_UNIFY | ST_SPECIAL)) != 0)) { |
649 | if (rule.line == 2746) { |
650 | cohort = cohort; |
651 | } |
652 | bool regex_prop = true; |
653 | if (orz != context_stack.back().regexgrp_ct) { |
654 | did_test = false; |
655 | regex_prop = false; |
656 | } |
657 | rule_target = context_target = cohort; |
658 | reading->matched_target = true; |
659 | matched_target = true; |
660 | bool good = true; |
661 | // If we didn't already run the contextual tests, run them now. |
662 | if (!did_test) { |
663 | context_stack.back().context.clear(); |
664 | foreach (it, rule.tests)if (!(rule.tests).empty()) for (auto it = (rule.tests).begin( ), it_end = (rule.tests).end(); it != it_end; ++it) { |
665 | ContextualTest* test = *it; |
666 | if (rule.flags & RF_RESETX || !(rule.flags & RF_REMEMBERX)) { |
667 | set_mark(cohort); |
668 | } |
669 | seen_barrier = false; |
670 | // Keeps track of where we have been, to prevent infinite recursion in trees with loops |
671 | dep_deep_seen.clear(); |
672 | // Reset the counters for which types of CohortIterator we have in play |
673 | std::fill(ci_depths.begin(), ci_depths.end(), UI32(0)); |
674 | tmpl_cntx.clear(); |
675 | // Run the contextual test... |
676 | Cohort* next_test = nullptr; |
677 | Cohort* result = nullptr; |
678 | Cohort** deep = nullptr; |
679 | if (rule.type == K_WITH) { |
680 | deep = &result; |
681 | merge_with = nullptr; |
682 | } |
683 | if (!(test->pos & POS_PASS_ORIGIN) && (no_pass_origin || (test->pos & POS_NO_PASS_ORIGIN))) { |
684 | next_test = runContextualTest(¤t, c, test, deep, cohort); |
685 | } |
686 | else { |
687 | next_test = runContextualTest(¤t, c, test, deep); |
688 | } |
689 | context_stack.back().context.push_back(merge_with ? merge_with : result); |
690 | test_good = (next_test != nullptr); |
691 | |
692 | profileRuleContext(test_good, &rule, test); |
693 | |
694 | if (!test_good) { |
695 | good = test_good; |
696 | if (it != rule.tests.begin() && !(rule.flags & RF_KEEPORDER)) { |
697 | rule.tests.erase(it); |
698 | rule.tests.push_front(test); |
699 | } |
700 | break; |
701 | } |
702 | did_test = ((set.type & (ST_CHILD_UNIFY | ST_SPECIAL)) == 0 && context_stack.back().unif_tags->empty() && context_stack.back().unif_sets->empty()); |
Value stored to 'did_test' is never read | |
703 | } |
704 | } |
705 | else { |
706 | good = test_good; |
707 | } |
708 | if (good) { |
709 | // We've found a match, so Iff should be treated as Select instead of Remove |
710 | if (rule.type == K_IFF && type != K_SELECT) { |
711 | type = K_SELECT; |
712 | if (grammar->has_protect) { |
713 | for (size_t j = 0; j < i; ++j) { |
714 | Reading* reading = get_sub_reading(cohort->readings[j], rule.sub_reading); |
715 | if (reading && reading->immutable) { |
716 | reading->matched_target = true; |
717 | reading->matched_tests = true; |
718 | ++num_active; |
719 | ++num_iff; |
720 | } |
721 | } |
722 | } |
723 | } |
724 | reading->matched_tests = true; |
725 | ++num_active; |
726 | if (profiler) { |
727 | Profiler::Key k{ET_RULE, rule.number + 1 }; |
728 | auto& r = profiler->entries[k]; |
729 | ++r.num_match; |
730 | if (!r.example_window) { |
731 | addProfilingExample(r); |
732 | } |
733 | } |
734 | if (!debug_rules.empty() && debug_rules.contains(rule.line)) { |
735 | printDebugRule(rule); |
736 | } |
737 | |
738 | if (regex_prop && i && !regexgrps_c.empty()) { |
739 | for (auto z = i; z > 0; --z) { |
740 | auto it = regexgrps_c.find(cohort->readings[z - 1]->number); |
741 | if (it != regexgrps_c.end()) { |
742 | regexgrps_c.insert(std::make_pair(reading->number, it->second)); |
743 | regexgrps_z.insert(std::make_pair(reading->number, regexgrps_z.find(cohort->readings[z - 1]->number)->second)); |
744 | break; |
745 | } |
746 | } |
747 | } |
748 | } |
749 | else { |
750 | context_stack.back().regexgrp_ct = orz; |
751 | if (!debug_rules.empty() && debug_rules.contains(rule.line)) { |
752 | printDebugRule(rule, true, false); |
753 | } |
754 | } |
755 | ++num_iff; |
756 | } |
757 | else { |
758 | context_stack.back().regexgrp_ct = orz; |
759 | if (profiler) { |
760 | Profiler::Key k{ ET_RULE, rule.number + 1 }; |
761 | ++profiler->entries[k].num_fail; |
762 | } |
763 | if (!debug_rules.empty() && debug_rules.contains(rule.line)) { |
764 | printDebugRule(rule, false, false); |
765 | } |
766 | } |
767 | readings_plain.insert(std::make_pair(reading->hash_plain, reading)); |
768 | for (auto r = cohort->readings[i]; r; r = r->next) { |
769 | r->active = false; |
770 | } |
771 | |
772 | if (reading != cohort->readings[i]) { |
773 | cohort->readings[i]->matched_target = reading->matched_target; |
774 | cohort->readings[i]->matched_tests = reading->matched_tests; |
775 | } |
776 | if (context_stack.back().regexgrp_ct) { |
777 | regexgrps_c[reading->number] = context_stack.back().regexgrps; |
778 | regexgrps_z[reading->number] = context_stack.back().regexgrp_ct; |
779 | ++used_regex; |
780 | } |
781 | reading_contexts.push_back(context_stack.back()); |
782 | } |
783 | |
784 | if (state_num_readings != cohort->readings.size() || state_num_removed != cohort->deleted.size() || state_num_delayed != cohort->delayed.size() || state_num_ignored != cohort->ignored.size()) { |
785 | anything_changed = true; |
786 | cohort->type &= ~CT_NUM_CURRENT; |
787 | } |
788 | |
789 | // If none of the readings were valid targets, remove this cohort from the rule's possible cohorts. |
790 | if (num_active == 0 && (num_iff == 0 || rule.type != K_IFF)) { |
791 | if (!matched_target) { |
792 | --rocit; // We have already incremented rocit earlier, so take one step back... |
793 | rocit = cohortset->erase(rocit); // ...and one step forward again |
794 | } |
795 | context_stack.pop_back(); |
796 | continue; |
797 | } |
798 | |
799 | // All readings were valid targets, which means there is nothing to do for Select or safe Remove rules. |
800 | if (num_active == cohort->readings.size()) { |
801 | if (type == K_SELECT) { |
802 | context_stack.pop_back(); |
803 | continue; |
804 | } |
805 | if (type == K_REMOVE && (!unsafe || (rule.flags & RF_SAFE)) && !(rule.flags & RF_UNSAFE)) { |
806 | context_stack.pop_back(); |
807 | continue; |
808 | } |
809 | } |
810 | |
811 | for (auto& ctx : reading_contexts) { |
812 | if (!ctx.target.subreading->matched_target) { |
813 | continue; |
814 | } |
815 | if (!ctx.target.subreading->matched_tests && rule.type != K_IFF) { |
816 | continue; |
817 | } |
818 | context_stack.back() = ctx; |
819 | reset_cohorts_for_loop = false; |
820 | reading_cb(); |
821 | if (!finish_cohort_loop) { |
822 | context_stack.pop_back(); |
823 | return anything_changed; |
824 | } |
825 | if (reset_cohorts_for_loop) { |
826 | reset_cohorts(); |
827 | break; |
828 | } |
829 | if (!finish_reading_loop) { |
830 | break; |
831 | } |
832 | } |
833 | |
834 | reset_cohorts_for_loop = false; |
835 | cohort_cb(); |
836 | if (!finish_cohort_loop) { |
837 | context_stack.pop_back(); |
838 | return anything_changed; |
839 | } |
840 | if (reset_cohorts_for_loop) { |
841 | reset_cohorts(); |
842 | } |
843 | context_stack.pop_back(); |
844 | } |
845 | return anything_changed; |
846 | } |
847 | |
848 | /** |
849 | * Applies the passed rules to the passed SingleWindow. |
850 | * |
851 | * This function is called at least N*M times where N is number of sections in the grammar and M is the number of windows in the input. |
852 | * Possibly many more times, since if a section changes the state of the window the section is run again. |
853 | * Only when no further changes are caused at a level does it progress to next level. |
854 | * |
855 | * The loops in this function are increasingly explosive, despite efforts to contain them. |
856 | * In the https://visl.sdu.dk/cg3_performance.html test data, this function is called 1015 times. |
857 | * The first loop (rules) is executed 3101728 times. |
858 | * The second loop (cohorts) is executed 11087278 times. |
859 | * The third loop (finding readings) is executed 11738927 times; of these, 1164585 (10%) match the rule target. |
860 | * The fourth loop (contextual test) is executed 1184009 times; of those, 1156322 (97%) fail their contexts. |
861 | * The fifth loop (acting on readings) is executed 41540 times. |
862 | * |
863 | * @param[in,out] current The window to apply rules on |
864 | * @param[in] rules The rules to apply |
865 | */ |
866 | uint32_t GrammarApplicator::runRulesOnSingleWindow(SingleWindow& current, const uint32IntervalVector& rules) { |
867 | uint32_t retval = RV_NOTHING; |
868 | bool section_did_something = false; |
869 | bool delimited = false; |
870 | |
871 | // ToDo: Now that numbering is used, can't this be made a normal max? Hm, maybe not since --sections can still force another order...but if we're smart, then we re-enumerate rules based on --sections |
872 | uint32IntervalVector intersects = current.valid_rules.intersect(rules); |
873 | ReadingList removed; |
874 | ReadingList selected; |
875 | |
876 | if (debug_level > 1) { |
877 | std::cerr << "DEBUG: Trying window " << current.number << std::endl; |
878 | } |
879 | |
880 | current.parent->cohort_map[0] = current.cohorts.front(); |
881 | |
882 | foreach (iter_rules, intersects)if (!(intersects).empty()) for (auto iter_rules = (intersects ).begin(), iter_rules_end = (intersects).end(); iter_rules != iter_rules_end; ++iter_rules) { |
883 | // Conditionally re-sort the rule-to-cohort mapping when the current rule is finished, regardless of how it finishes |
884 | struct Sorter { |
885 | SingleWindow& current; |
886 | bool do_sort = false; |
887 | |
888 | Sorter(SingleWindow& current) |
889 | : current(current) |
890 | {} |
891 | |
892 | ~Sorter() { |
893 | if (do_sort) { |
894 | for (auto& cs : current.rule_to_cohorts) { |
895 | cs.sort(); |
896 | } |
897 | } |
898 | } |
899 | } sorter(current); |
900 | |
901 | repeat_rule: |
902 | bool rule_did_something = false; |
903 | uint32_t j = (*iter_rules); |
904 | |
905 | // Check whether this rule is in the allowed rule list from cmdline flag --rule(s) |
906 | if (!valid_rules.empty() && !valid_rules.contains(j)) { |
907 | continue; |
908 | } |
909 | |
910 | current_rule = grammar->rule_by_number[j]; |
911 | Rule* rule = grammar->rule_by_number[j]; |
912 | if (rule->type == K_IGNORE) { |
913 | continue; |
914 | } |
915 | if (debug_level > 1) { |
916 | std::cerr << "DEBUG: Trying rule " << rule->line << std::endl; |
917 | } |
918 | |
919 | if (!apply_mappings && (rule->type == K_MAP || rule->type == K_ADD || rule->type == K_REPLACE)) { |
920 | continue; |
921 | } |
922 | if (!apply_corrections && (rule->type == K_SUBSTITUTE || rule->type == K_APPEND)) { |
923 | continue; |
924 | } |
925 | // If there are parentheses and the rule is marked as only run on the final pass, skip if this is not it. |
926 | if (current.has_enclosures) { |
927 | if ((rule->flags & RF_ENCL_FINAL) && !did_final_enclosure) { |
928 | continue; |
929 | } |
930 | if (did_final_enclosure && !(rule->flags & RF_ENCL_FINAL)) { |
931 | continue; |
932 | } |
933 | } |
934 | |
935 | bool readings_changed = false; |
936 | bool should_repeat = false; |
937 | bool should_bail = false; |
938 | |
939 | auto reindex = [&](SingleWindow* which = nullptr) { |
940 | if (!which) { |
941 | which = ¤t; |
942 | } |
943 | foreach (iter, which->cohorts)if (!(which->cohorts).empty()) for (auto iter = (which-> cohorts).begin(), iter_end = (which->cohorts).end(); iter != iter_end; ++iter) { |
944 | (*iter)->local_number = UI32(std::distance(which->cohorts.begin(), iter)); |
945 | } |
946 | gWindow->rebuildCohortLinks(); |
947 | }; |
948 | |
949 | auto collect_subtree = [&](CohortSet& cs, Cohort* head, uint32_t cset) { |
950 | if (cset) { |
951 | for (auto iter : current.cohorts) { |
952 | // Always consider the initial cohort a match |
953 | if (iter->global_number == head->global_number) { |
954 | cs.insert(iter); |
955 | } |
956 | else if (iter->dep_parent == head->global_number && doesSetMatchCohortNormal(*iter, cset)) { |
957 | cs.insert(iter); |
958 | } |
959 | } |
960 | CohortSet more; |
961 | for (auto iter : current.cohorts) { |
962 | for (auto cht : cs) { |
963 | // Do not grab the whole tree from the root, in case WithChild is not (*) |
964 | if (cht->global_number == head->global_number) { |
965 | continue; |
966 | } |
967 | if (isChildOf(iter, cht)) { |
968 | more.insert(iter); |
969 | } |
970 | } |
971 | } |
972 | cs.insert(more.begin(), more.end()); |
973 | } |
974 | else { |
975 | cs.insert(head); |
976 | } |
977 | }; |
978 | |
// add_cohort: builds a new cohort from the tags in rule->maplist and inserts it
// into the current window before or after `cohort` (or the subtree collected
// around it via rule->childset1), depending on the rule type.
//   cohort          - the cohort the new one is positioned relative to; its first
//                     reading also supplies the tags substituted for a `tag_any` entry.
//   spacesInAddedWf - out: number of ' ' characters in the wordform tag's text.
//   returns         - the newly created cohort.
// Quits with an error if a non-wordform tag precedes the wordform, or if a tag
// other than a baseform follows the wordform before any baseform was seen.
// NOTE(review): "¤t" below looks like a mis-encoded "&current", and the text
// fused after VARSTRINGIFY(...)/foreach(...)/u_fprintf looks like macro-expansion
// residue from the report this listing was extracted from — confirm against the
// original source.
979 | auto add_cohort = [&](Cohort* cohort, size_t& spacesInAddedWf) { |
980 | Cohort* cCohort = alloc_cohort(¤t); |
981 | cCohort->global_number = gWindow->cohort_counter++; |
982 | |
983 | Tag* wf = nullptr; |
984 | std::vector<TagList> readings; |
985 | auto theTags = ss_taglist.get(); |
986 | getTagList(*rule->maplist, theTags); |
987 | |
// Expand varstring tags in place (loop variable is a reference) for tags flagged T_VSTR.
988 | for (auto& tter : *theTags) { |
989 | if (tter->type & T_VSTR) { |
990 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
991 | } |
992 | } |
993 | |
// Split the flat tag list into readings: the wordform is remembered, each
// baseform opens a new reading that starts with the wordform, and every other
// tag is appended to the most recently opened reading.
994 | for (auto tter : *theTags) { |
995 | if(tter->type & T_WORDFORM) { |
996 | spacesInAddedWf = std::count_if(tter->tag.begin(), tter->tag.end(), [](UChar c){ return c == ' '; }); |
997 | } |
998 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
999 | if (tter->type & T_WORDFORM) { |
1000 | cCohort->wordform = tter; |
1001 | wf = tter; |
1002 | continue; |
1003 | } |
1004 | if (!wf) { |
1005 | u_fprintfu_fprintf_72(ux_stderr, "Error: There must be a wordform before any other tags in ADDCOHORT/MERGECOHORTS on line %u before input line %u.\n", rule->line, numLines); |
1006 | CG3Quit(1); |
1007 | } |
1008 | if (tter->type & T_BASEFORM) { |
1009 | readings.resize(readings.size() + 1); |
1010 | readings.back().push_back(wf); |
1011 | } |
1012 | if (readings.empty()) { |
1013 | u_fprintfu_fprintf_72(ux_stderr, "Error: There must be a baseform after the wordform in ADDCOHORT/MERGECOHORTS on line %u before input line %u.\n", rule->line, numLines); |
1014 | CG3Quit(1); |
1015 | } |
1016 | readings.back().push_back(tter); |
1017 | } |
1018 | |
// Replace each `tag_any` placeholder with the tags of the source cohort's first
// reading from index 2 onwards; dependency tags at index 3 and later are skipped.
// NOTE(review): presumably indices 0 and 1 are the wordform and baseform — confirm.
1019 | for (auto& tags : readings) { |
1020 | for (size_t i = 0; i < tags.size(); ++i) { |
1021 | if (tags[i]->hash == grammar->tag_any) { |
1022 | auto& nt = cohort->readings.front()->tags_list; |
1023 | if (nt.size() <= 2) { |
1024 | continue; |
1025 | } |
1026 | tags.reserve(tags.size() + nt.size() - 2); |
1027 | tags[i] = grammar->single_tags[nt[2]]; |
1028 | for (size_t j = 3, k = 1; j < nt.size(); ++j) { |
1029 | if (grammar->single_tags[nt[j]]->type & T_DEPENDENCY) { |
1030 | continue; |
1031 | } |
1032 | tags.insert(tags.begin() + i + k, grammar->single_tags[nt[j]]); |
1033 | ++k; |
1034 | } |
1035 | } |
1036 | } |
1037 | } |
1038 | |
// Materialise one Reading per collected tag list. Mapping tags are set aside
// and handed to splitMappings(); all other tags are added to the reading directly.
1039 | for (auto& rit : readings) { |
1040 | Reading* cReading = alloc_reading(cCohort); |
1041 | ++numReadings; |
1042 | insert_if_exists(cReading->parent->possible_sets, grammar->sets_any); |
1043 | cReading->hit_by.push_back(rule->number); |
1044 | cReading->noprint = false; |
1045 | TagList mappings; |
1046 | for (auto tter : rit) { |
1047 | uint32_t hash = tter->hash; |
1048 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
1049 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { |
1050 | mappings.push_back(tter); |
1051 | } |
1052 | else { |
1053 | hash = addTagToReading(*cReading, hash); |
1054 | } |
// When updateValidRules() reports a change, the enclosing rule loop's iterators
// are re-seated on the current rule.
1055 | if (updateValidRules(rules, intersects, hash, *cReading)) { |
1056 | iter_rules = intersects.find(rule->number); |
1057 | iter_rules_end = intersects.end(); |
1058 | } |
1059 | } |
1060 | if (!mappings.empty()) { |
1061 | splitMappings(mappings, *cCohort, *cReading); |
1062 | } |
1063 | cCohort->appendReading(cReading); |
1064 | } |
1065 | |
// Register the new cohort in the parent window's lookup tables and, if the
// grammar asks for it, attach it to the target cohort.
1066 | current.parent->cohort_map[cCohort->global_number] = cCohort; |
1067 | current.parent->dep_window[cCohort->global_number] = cCohort; |
1068 | if (grammar->addcohort_attach && (rule->type == K_ADDCOHORT_BEFORE || rule->type == K_ADDCOHORT_AFTER)) { |
1069 | attachParentChild(*cohort, *cCohort); |
1070 | } |
1071 | |
// A cohort that ended up without readings gets initEmptyCohort(); under trace
// its first reading is marked as hit by this rule and made printable.
1072 | if (cCohort->readings.empty()) { |
1073 | initEmptyCohort(*cCohort); |
1074 | if (trace) { |
1075 | auto r = cCohort->readings.front(); |
1076 | r->hit_by.push_back(rule->number); |
1077 | r->noprint = false; |
1078 | } |
1079 | } |
1080 | |
// Insert before the first, or after the last, cohort of the collected subtree.
1081 | CohortSet cohorts; |
1082 | collect_subtree(cohorts, cohort, rule->childset1); |
1083 | |
1084 | if (rule->type == K_ADDCOHORT_BEFORE) { |
1085 | current.cohorts.insert(current.cohorts.begin() + cohorts.front()->local_number, cCohort); |
1086 | current.all_cohorts.insert(std::find(current.all_cohorts.begin() + cohorts.front()->local_number, current.all_cohorts.end(), cohorts.front()), cCohort); |
1087 | } |
1088 | else { |
1089 | current.cohorts.insert(current.cohorts.begin() + cohorts.back()->local_number + 1, cCohort); |
1090 | current.all_cohorts.insert(std::find(current.all_cohorts.begin() + cohorts.back()->local_number, current.all_cohorts.end(), cohorts.back()) + 1, cCohort); |
1091 | } |
1092 | |
// Renumber local positions after the insertion and rebuild the cohort links.
1093 | foreach (iter, current.cohorts)if (!(current.cohorts).empty()) for (auto iter = (current.cohorts ).begin(), iter_end = (current.cohorts).end(); iter != iter_end ; ++iter) { |
1094 | (*iter)->local_number = UI32(std::distance(current.cohorts.begin(), iter)); |
1095 | } |
1096 | gWindow->rebuildCohortLinks(); |
1097 | |
1098 | return cCohort; |
1099 | }; |
1100 | |
1101 | auto rem_cohort = [&](Cohort* cohort) { |
1102 | auto& current = *cohort->parent; |
1103 | for (auto iter : cohort->readings) { |
1104 | iter->hit_by.push_back(rule->number); |
1105 | iter->deleted = true; |
1106 | if (trace) { |
1107 | iter->noprint = false; |
1108 | } |
1109 | } |
1110 | // Remove the cohort from all rules |
1111 | for (auto& cs : current.rule_to_cohorts) { |
1112 | cs.erase(cohort); |
1113 | } |
1114 | // Forward all children of this cohort to the parent of this cohort |
1115 | // ToDo: Named relations must be erased |
1116 | while (!cohort->dep_children.empty()) { |
1117 | uint32_t ch = cohort->dep_children.back(); |
1118 | if (cohort->dep_parent == DEP_NO_PARENT) { |
1119 | attachParentChild(*gWindow->cohort_map[0], *gWindow->cohort_map[ch], true, true); |
1120 | } |
1121 | else { |
1122 | attachParentChild(*gWindow->cohort_map[cohort->dep_parent], *gWindow->cohort_map[ch], true, true); |
1123 | } |
1124 | cohort->dep_children.erase(ch); |
1125 | } |
1126 | cohort->type |= CT_REMOVED; |
1127 | cohort->detach(); |
1128 | for (auto& cm : gWindow->cohort_map) { |
1129 | cm.second->dep_children.erase(cohort->dep_self); |
1130 | } |
1131 | gWindow->cohort_map.erase(cohort->global_number); |
1132 | current.cohorts.erase(current.cohorts.begin() + cohort->local_number); |
1133 | foreach (iter, current.cohorts)if (!(current.cohorts).empty()) for (auto iter = (current.cohorts ).begin(), iter_end = (current.cohorts).end(); iter != iter_end ; ++iter) { |
1134 | (*iter)->local_number = UI32(std::distance(current.cohorts.begin(), iter)); |
1135 | } |
1136 | |
1137 | if (current.cohorts.size() == 1 && ¤t != gWindow->current) { |
1138 | // This window is now empty, so remove it entirely from consideration so rules can look past it |
1139 | cohort = current.cohorts[0]; |
1140 | |
1141 | // Remove the cohort from all rules |
1142 | for (auto& cs : current.rule_to_cohorts) { |
1143 | cs.erase(cohort); |
1144 | } |
1145 | cohort->detach(); |
1146 | for (auto& cm : gWindow->cohort_map) { |
1147 | cm.second->dep_children.erase(cohort->dep_self); |
1148 | } |
1149 | gWindow->cohort_map.erase(cohort->global_number); |
1150 | free_cohort(cohort); |
1151 | |
1152 | if (current.previous) { |
1153 | current.previous->text += current.text + current.text_post; |
1154 | current.previous->all_cohorts.insert(current.previous->all_cohorts.end(), current.all_cohorts.begin() + 1, current.all_cohorts.end()); |
1155 | } |
1156 | else if (current.next) { |
1157 | current.next->text = current.text_post + current.next->text; |
1158 | current.next->all_cohorts.insert(current.previous->all_cohorts.begin() + 1, current.all_cohorts.begin() + 1, current.all_cohorts.end()); |
1159 | } |
1160 | current.all_cohorts.clear(); |
1161 | |
1162 | for (size_t i = 0; i < gWindow->previous.size(); ++i) { |
1163 | if (gWindow->previous[i] == ¤t) { |
1164 | free_swindow(gWindow->previous[i]); |
1165 | gWindow->previous.erase(gWindow->previous.begin() + i); |
1166 | break; |
1167 | } |
1168 | } |
1169 | for (size_t i = 0; i < gWindow->next.size(); ++i) { |
1170 | if (gWindow->next[i] == ¤t) { |
1171 | free_swindow(gWindow->next[i]); |
1172 | gWindow->next.erase(gWindow->next.begin() + i); |
1173 | break; |
1174 | } |
1175 | } |
1176 | |
1177 | gWindow->rebuildSingleWindowLinks(); |
1178 | } |
1179 | |
1180 | gWindow->rebuildCohortLinks(); |
1181 | }; |
1182 | |
1183 | auto ignore_cohort = [&](Cohort* cohort) { |
1184 | auto& current = *cohort->parent; |
1185 | for (auto iter : cohort->readings) { |
1186 | iter->hit_by.push_back(rule->number); |
1187 | } |
1188 | for (auto& cs : current.rule_to_cohorts) { |
1189 | cs.erase(cohort); |
1190 | } |
1191 | cohort->type |= CT_IGNORED; |
1192 | cohort->detach(); |
1193 | gWindow->cohort_map.erase(cohort->global_number); |
1194 | current.cohorts.erase(current.cohorts.begin() + cohort->local_number); |
1195 | }; |
1196 | |
1197 | auto make_relation_rtag = [&](Tag* tag, uint32_t id) { |
1198 | UChar tmp[256] = { 0 }; |
1199 | u_sprintfu_sprintf_72(tmp, "R:%S:%u", tag->tag.data(), id); |
1200 | auto nt = addTag(tmp); |
1201 | return nt; |
1202 | }; |
1203 | |
1204 | auto add_relation_rtag = [&](Cohort* cohort, Tag* tag, uint32_t id) { |
1205 | auto nt = make_relation_rtag(tag, id); |
1206 | for (auto& r : cohort->readings) { |
1207 | addTagToReading(*r, nt); |
1208 | } |
1209 | }; |
1210 | |
1211 | auto set_relation_rtag = [&](Cohort* cohort, Tag* tag, uint32_t id) { |
1212 | auto nt = make_relation_rtag(tag, id); |
1213 | for (auto& r : cohort->readings) { |
1214 | for (auto it = r->tags_list.begin(); it != r->tags_list.end();) { |
1215 | const auto& utag = grammar->single_tags[*it]->tag; |
1216 | if (utag[0] == 'R' && utag[1] == ':' && utag.size() > 2 + tag->tag.size() && utag[2 + tag->tag.size()] == ':' && utag.compare(2, tag->tag.size(), tag->tag) == 0) { |
1217 | r->tags.erase(*it); |
1218 | r->tags_textual.erase(*it); |
1219 | r->tags_numerical.erase(*it); |
1220 | r->tags_plain.erase(*it); |
1221 | it = r->tags_list.erase(it); |
1222 | } |
1223 | else { |
1224 | ++it; |
1225 | } |
1226 | } |
1227 | addTagToReading(*r, nt); |
1228 | } |
1229 | }; |
1230 | |
1231 | auto rem_relation_rtag = [&](Cohort* cohort, Tag* tag, uint32_t id) { |
1232 | auto nt = make_relation_rtag(tag, id); |
1233 | for (auto& r : cohort->readings) { |
1234 | delTagFromReading(*r, nt); |
1235 | } |
1236 | }; |
1237 | |
1238 | auto insert_taglist_to_reading = [&](auto& iter, auto& taglist, auto& reading, auto& mappings) { |
1239 | for (auto tag : taglist) { |
1240 | if (tag->type & T_VARSTRING) { |
1241 | tag = generateVarstringTag(tag); |
1242 | } |
1243 | if (tag->hash == grammar->tag_any) { |
1244 | break; |
1245 | } |
1246 | if (tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix) { |
1247 | mappings->push_back(tag); |
1248 | } |
1249 | else { |
1250 | iter = reading.tags_list.insert(iter, tag->hash); |
1251 | ++iter; |
1252 | } |
1253 | if (updateValidRules(rules, intersects, tag->hash, reading)) { |
1254 | iter_rules = intersects.find(rule->number); |
1255 | iter_rules_end = intersects.end(); |
1256 | } |
1257 | } |
1258 | reflowReading(reading); |
1259 | }; |
1260 | |
// cohort_cb: per-cohort callback; dispatches on rule->type.
//   SELECT / IFF with selections - keeps only the readings collected in `selected`.
//   REMOVE / IFF                 - moves the readings collected in `removed` out of the cohort.
//   JUMP                         - re-seats the rule iterator at a named anchor.
//   REMVARIABLE / SETVARIABLE    - edits the `variables` map.
//   DELIMIT                      - calls delimitAt() on the target cohort.
//   EXTERNAL_ONCE / _ALWAYS      - pipes the window through an external process.
//   REMCOHORT                    - removes or ignores the target cohort.
// Results are reported through captured state: readings_changed, should_repeat,
// delimited, finish_cohort_loop and reset_cohorts_for_loop.
// NOTE(review): the text fused after VARSTRINGIFY(...)/u_fprintf/u_strToUTF8 looks
// like macro-expansion residue from the report this listing was extracted from —
// confirm against the original source.
1261 | auto cohort_cb = [&]() { |
1262 | if (rule->type == K_SELECT || (rule->type == K_IFF && !selected.empty())) { |
1263 | Cohort* target = get_apply_to().cohort; |
// Only act when the selection is a non-empty proper subset of the readings.
1264 | if (selected.size() < target->readings.size() && !selected.empty()) { |
1265 | ReadingList drop; |
1266 | size_t si = 0; |
// Walk readings and `selected` in step; the comparison below relies on
// `selected` listing readings in the same relative order as target->readings.
1267 | for (size_t ri = 0; ri < target->readings.size(); ri++) { |
1268 | // Manually trace, since reading_cb doesn't get called on non-matching readings |
1269 | Reading* rd = target->readings[ri]; |
// NOTE(review): 32767 presumably means "no sub-reading specified" — confirm.
1270 | if (rule->sub_reading != 32767) { |
1271 | rd = get_sub_reading(rd, rule->sub_reading); |
1272 | } |
1273 | if (rd) { |
1274 | rd->hit_by.push_back(rule->number); |
1275 | } |
1276 | if (si < selected.size() && target->readings[ri] == selected[si]) { |
1277 | si++; |
1278 | } |
1279 | else { |
1280 | target->readings[ri]->deleted = true; |
1281 | drop.push_back(target->readings[ri]); |
1282 | } |
1283 | } |
// The cohort now holds exactly the selected readings; the dropped ones go to
// the delayed, ignored or deleted list depending on the rule flags.
1284 | target->readings.swap(selected); |
1285 | if (rule->flags & RF_DELAYED) { |
1286 | target->delayed.insert(target->delayed.end(), drop.begin(), drop.end()); |
1287 | } |
1288 | else if (rule->flags & RF_IGNORED) { |
1289 | target->ignored.insert(target->ignored.end(), drop.begin(), drop.end()); |
1290 | } |
1291 | else { |
1292 | target->deleted.insert(target->deleted.end(), drop.begin(), drop.end()); |
1293 | } |
1294 | readings_changed = true; |
1295 | } |
1296 | selected.clear(); |
1297 | } |
1298 | else if (rule->type == K_REMOVE || rule->type == K_IFF) { |
// Remove only if something would be left, unless unsafe removal is allowed
// (global `unsafe` without RF_SAFE on the rule, or RF_UNSAFE on the rule).
1299 | if (!removed.empty() && (removed.size() < get_apply_to().cohort->readings.size() || (unsafe && !(rule->flags & RF_SAFE)) || (rule->flags & RF_UNSAFE))) { |
1300 | if (rule->flags & RF_DELAYED) { |
1301 | get_apply_to().cohort->delayed.insert(get_apply_to().cohort->delayed.end(), removed.begin(), removed.end()); |
1302 | } |
1303 | else if (rule->flags & RF_IGNORED) { |
1304 | get_apply_to().cohort->ignored.insert(get_apply_to().cohort->ignored.end(), removed.begin(), removed.end()); |
1305 | } |
1306 | else { |
1307 | get_apply_to().cohort->deleted.insert(get_apply_to().cohort->deleted.end(), removed.begin(), removed.end()); |
1308 | } |
// Swap each removed reading to the tail of the live range [0, oz), shrinking
// the range, then cut the tail off with resize(). Reading order is not preserved.
1309 | size_t oz = get_apply_to().cohort->readings.size(); |
1310 | while (!removed.empty()) { |
1311 | removed.back()->deleted = true; |
1312 | for (size_t i = 0; i < oz; ++i) { |
1313 | if (get_apply_to().cohort->readings[i] == removed.back()) { |
1314 | --oz; |
1315 | std::swap(get_apply_to().cohort->readings[i], get_apply_to().cohort->readings[oz]); |
1316 | } |
1317 | } |
1318 | removed.pop_back(); |
1319 | } |
1320 | get_apply_to().cohort->readings.resize(oz); |
1321 | if (debug_level > 0) { |
1322 | std::cerr << "DEBUG: Rule " << rule->line << " hit cohort " << get_apply_to().cohort->local_number << std::endl; |
1323 | } |
1324 | readings_changed = true; |
1325 | } |
// A cohort left without readings is re-initialised via initEmptyCohort().
1326 | if (get_apply_to().cohort->readings.empty()) { |
1327 | initEmptyCohort(*get_apply_to().cohort); |
1328 | } |
1329 | selected.clear(); |
1330 | } |
1331 | else if (rule->type == K_JUMP) { |
// The first tag of the rule's maplist names the anchor to jump to.
1332 | auto to = getTagList(*rule->maplist).front(); |
1333 | VARSTRINGIFY(to)do { while ((to)->type & T_VARSTRING) { (to) = generateVarstringTag ((to)); } } while (0); |
1334 | auto it = grammar->anchors.find(to->hash); |
1335 | if (it == grammar->anchors.end()) { |
1336 | u_fprintfu_fprintf_72(ux_stderr, "Warning: JUMP on line %u could not find anchor '%S'.\n", rule->line, to->tag.data()); |
1337 | } |
1338 | else { |
// Move the rule iterator to the first rule at or after the anchor, stop the
// cohort loop, and flag a repeat.
1339 | iter_rules = intersects.lower_bound(it->second); |
1340 | finish_cohort_loop = false; |
1341 | should_repeat = true; |
1342 | } |
1343 | } |
1344 | else if (rule->type == K_REMVARIABLE) { |
// For each named tag, erase the first matching variable (regex, case-insensitive,
// or exact-hash match depending on the tag type).
1345 | auto names = getTagList(*rule->maplist); |
1346 | for (auto tag : names) { |
1347 | VARSTRINGIFY(tag)do { while ((tag)->type & T_VARSTRING) { (tag) = generateVarstringTag ((tag)); } } while (0); |
1348 | auto it = variables.begin(); |
1349 | if (tag->type & T_REGEXP) { |
1350 | it = std::find_if(it, variables.end(), [&](auto& kv) { return doesTagMatchRegexp(kv.first, *tag); }); |
1351 | } |
1352 | else if (tag->type & T_CASE_INSENSITIVE) { |
1353 | it = std::find_if(it, variables.end(), [&](auto& kv) { return doesTagMatchIcase(kv.first, *tag); }); |
1354 | } |
1355 | else { |
1356 | it = variables.find(tag->hash); |
1357 | } |
1358 | if (it != variables.end()) { |
1359 | if (rule->flags & RF_OUTPUT) { |
1360 | current.variables_output.insert(it->first); |
1361 | } |
1362 | variables.erase(it); |
1363 | //u_fprintf(ux_stderr, "Info: RemVariable fired for %S.\n", tag->tag.data()); |
1364 | } |
1365 | } |
1366 | } |
1367 | else if (rule->type == K_SETVARIABLE) { |
// Only the first tag of maplist (name) and of sublist (value) is used.
1368 | auto names = getTagList(*rule->maplist); |
1369 | auto values = getTagList(*rule->sublist); |
1370 | VARSTRINGIFY(names.front())do { while ((names.front())->type & T_VARSTRING) { (names .front()) = generateVarstringTag((names.front())); } } while ( 0); |
1371 | VARSTRINGIFY(values.front())do { while ((values.front())->type & T_VARSTRING) { (values .front()) = generateVarstringTag((values.front())); } } while (0); |
1372 | variables[names.front()->hash] = values.front()->hash; |
1373 | if (rule->flags & RF_OUTPUT) { |
1374 | current.variables_output.insert(names.front()->hash); |
1375 | } |
1376 | //u_fprintf(ux_stderr, "Info: SetVariable fired for %S.\n", names.front()->tag.data()); |
1377 | } |
1378 | else if (rule->type == K_DELIMIT) { |
// Delimit only when the target is not already the last cohort of its window.
1379 | auto cohort = get_apply_to().cohort; |
1380 | if (cohort->parent->cohorts.size() > cohort->local_number + 1) { |
1381 | delimitAt(current, cohort); |
1382 | delimited = true; |
1383 | readings_changed = true; |
1384 | } |
1385 | } |
1386 | else if (rule->type == K_EXTERNAL_ONCE || rule->type == K_EXTERNAL_ALWAYS) { |
// EXTERNAL_ONCE runs at most once per window: a failed insert means this rule
// line was already recorded for the window.
1387 | if (rule->type == K_EXTERNAL_ONCE && !current.hit_external.insert(rule->line).second) { |
1388 | return; |
1389 | } |
1390 | |
// Start the external process on first use; the command line is the UTF-8
// conversion of the tag looked up by rule->varname.
1391 | auto ei = externals.find(rule->varname); |
1392 | if (ei == externals.end()) { |
1393 | Tag* ext = grammar->single_tags.find(rule->varname)->second; |
1394 | UErrorCode err = U_ZERO_ERROR; |
1395 | u_strToUTF8u_strToUTF8_72(&cbuffers[0][0], SI32(CG3_BUFFER_SIZE - 1), nullptr, ext->tag.data(), SI32(ext->tag.size()), &err); |
1396 | |
1397 | Process& es = externals[rule->varname]; |
1398 | try { |
1399 | es.start(&cbuffers[0][0]); |
1400 | writeRaw(es, CG3_EXTERNAL_PROTOCOL); |
1401 | } |
1402 | catch (std::exception& e) { |
1403 | u_fprintfu_fprintf_72(ux_stderr, "Error: External on line %u resulted in error: %s\n", rule->line, e.what()); |
1404 | CG3Quit(1); |
1405 | } |
1406 | ei = externals.find(rule->varname); |
1407 | } |
1408 | |
// Send the window out and read it back, then re-index it, recompute the
// applicable rule set, and re-seat the rule iterators on the current rule.
1409 | pipeOutSingleWindow(current, ei->second); |
1410 | pipeInSingleWindow(current, ei->second); |
1411 | |
1412 | indexSingleWindow(current); |
1413 | readings_changed = true; |
1414 | index_ruleCohort_no.clear(); |
1415 | intersects = current.valid_rules.intersect(rules); |
1416 | iter_rules = intersects.find(rule->number); |
1417 | iter_rules_end = intersects.end(); |
1418 | reset_cohorts_for_loop = true; |
1419 | } |
1420 | else if (rule->type == K_REMCOHORT) { |
1421 | // REMCOHORT-IGNORED |
1422 | if (rule->flags & RF_IGNORED) { |
// Ignore the target and its collected subtree, iterating the set in reverse;
// renumbering is done once afterwards by reindex().
1423 | CohortSet cohorts; |
1424 | collect_subtree(cohorts, get_apply_to().cohort, rule->childset1); |
1425 | for (auto c : reversed(cohorts)) { |
1426 | ignore_cohort(c); |
1427 | } |
1428 | reindex(); |
1429 | reflowDependencyWindow(); |
1430 | } |
1431 | else { |
1432 | rem_cohort(get_apply_to().cohort); |
1433 | } |
1434 | |
1435 | // If we just removed the last cohort, add <<< to the new last cohort |
1436 | if (get_apply_to().cohort->readings.front()->tags.count(endtag)) { |
1437 | for (auto r : current.cohorts.back()->readings) { |
1438 | addTagToReading(*r, endtag); |
1439 | if (updateValidRules(rules, intersects, endtag, *r)) { |
1440 | iter_rules = intersects.find(rule->number); |
1441 | iter_rules_end = intersects.end(); |
1442 | } |
1443 | } |
1444 | index_ruleCohort_no.clear(); |
1445 | } |
1446 | readings_changed = true; |
1447 | reset_cohorts_for_loop = true; |
1448 | } |
1449 | }; |
1450 | |
1451 | RuleCallback reading_cb = [&]() { |
1452 | if (rule->type == K_SELECT || (rule->type == K_IFF && get_apply_to().subreading->matched_tests)) { |
1453 | selected.push_back(get_apply_to().reading); |
1454 | index_ruleCohort_no.clear(); |
1455 | } |
1456 | else if (rule->type == K_REMOVE || rule->type == K_IFF) { |
1457 | if (rule->type == K_REMOVE && (rule->flags & RF_UNMAPLAST) && removed.size() == get_apply_to().cohort->readings.size() - 1) { |
1458 | if (unmapReading(*get_apply_to().subreading, rule->number)) { |
1459 | readings_changed = true; |
1460 | } |
1461 | } |
1462 | else { |
1463 | TRACE; |
1464 | removed.push_back(get_apply_to().reading); |
1465 | } |
1466 | index_ruleCohort_no.clear(); |
1467 | } |
1468 | else if (rule->type == K_PROTECT) { |
1469 | TRACE; |
1470 | get_apply_to().subreading->immutable = true; |
1471 | } |
1472 | else if (rule->type == K_UNPROTECT) { |
1473 | TRACE; |
1474 | get_apply_to().subreading->immutable = false; |
1475 | } |
1476 | else if (rule->type == K_UNMAP) { |
1477 | if (unmapReading(*get_apply_to().subreading, rule->number)) { |
1478 | index_ruleCohort_no.clear(); |
1479 | readings_changed = true; |
1480 | } |
1481 | } |
1482 | else if (rule->type == K_ADDCOHORT_AFTER || rule->type == K_ADDCOHORT_BEFORE) { |
1483 | index_ruleCohort_no.clear(); |
1484 | TRACE; |
1485 | |
1486 | size_t spacesInAddedWf = 0; // not used here |
1487 | auto cCohort = add_cohort(get_apply_to().cohort, spacesInAddedWf); |
1488 | |
1489 | // If the new cohort is now the last cohort, add <<< to it and remove <<< from previous last cohort |
1490 | if (current.cohorts.back() == cCohort) { |
1491 | for (auto r : current.cohorts[current.cohorts.size() - 2]->readings) { |
1492 | delTagFromReading(*r, endtag); |
1493 | } |
1494 | for (auto r : current.cohorts.back()->readings) { |
1495 | addTagToReading(*r, endtag); |
1496 | if (updateValidRules(rules, intersects, endtag, *r)) { |
1497 | iter_rules = intersects.find(rule->number); |
1498 | iter_rules_end = intersects.end(); |
1499 | } |
1500 | } |
1501 | } |
1502 | indexSingleWindow(current); |
1503 | readings_changed = true; |
1504 | |
1505 | reset_cohorts_for_loop = true; |
1506 | } |
1507 | else if (rule->type == K_SPLITCOHORT) { |
1508 | index_ruleCohort_no.clear(); |
1509 | |
1510 | std::vector<std::pair<Cohort*, std::vector<TagList>>> cohorts; |
1511 | |
1512 | auto theTags = ss_taglist.get(); |
1513 | getTagList(*rule->maplist, theTags); |
1514 | |
1515 | for (auto& tter : *theTags) { |
1516 | if (tter->type & T_VSTR) { |
1517 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
1518 | } |
1519 | } |
1520 | |
1521 | Tag* wf = nullptr; |
1522 | for (auto tter : *theTags) { |
1523 | if (tter->type & T_WORDFORM) { |
1524 | cohorts.resize(cohorts.size() + 1); |
1525 | cohorts.back().first = alloc_cohort(¤t); |
1526 | cohorts.back().first->global_number = gWindow->cohort_counter++; |
1527 | wf = tter; |
1528 | VARSTRINGIFY(wf)do { while ((wf)->type & T_VARSTRING) { (wf) = generateVarstringTag ((wf)); } } while (0); |
1529 | cohorts.back().first->wordform = wf; |
1530 | continue; |
1531 | } |
1532 | if (!wf) { |
1533 | u_fprintfu_fprintf_72(ux_stderr, "Error: There must be a wordform before any other tags in SPLITCOHORT on line %u before input line %u.\n", rule->line, numLines); |
1534 | CG3Quit(1); |
1535 | } |
1536 | } |
1537 | |
1538 | uint32_t rel_trg = DEP_NO_PARENT; |
1539 | std::vector<std::pair<uint32_t, uint32_t>> cohort_dep(cohorts.size()); |
1540 | cohort_dep.front().second = DEP_NO_PARENT; |
1541 | cohort_dep.back().first = DEP_NO_PARENT; |
1542 | cohort_dep.back().second = UI32(cohort_dep.size() - 1); |
1543 | for (size_t i = 1; i < cohort_dep.size() - 1; ++i) { |
1544 | cohort_dep[i].second = UI32(i); |
1545 | } |
1546 | |
1547 | size_t i = 0; |
1548 | std::vector<TagList>* readings = &cohorts.front().second; |
1549 | Tag* bf = nullptr; |
1550 | for (auto tter : *theTags) { |
1551 | if (tter->type & T_WORDFORM) { |
1552 | ++i; |
1553 | bf = nullptr; |
1554 | continue; |
1555 | } |
1556 | if (tter->type & T_BASEFORM) { |
1557 | readings = &cohorts[i - 1].second; |
1558 | readings->resize(readings->size() + 1); |
1559 | readings->back().push_back(cohorts[i - 1].first->wordform); |
1560 | bf = tter; |
1561 | } |
1562 | if (!bf) { |
1563 | u_fprintfu_fprintf_72(ux_stderr, "Error: There must be a baseform after the wordform in SPLITCOHORT on line %u before input line %u.\n", rule->line, numLines); |
1564 | CG3Quit(1); |
1565 | } |
1566 | |
1567 | UChar dep_self[12] = {}; |
1568 | UChar dep_parent[12] = {}; |
1569 | if (u_sscanfu_sscanf_72(tter->tag.data(), "%[0-9cd]->%[0-9pm]", &dep_self, &dep_parent) == 2) { |
1570 | if (dep_self[0] == 'c' || dep_self[0] == 'd') { |
1571 | cohort_dep[i - 1].first = DEP_NO_PARENT; |
1572 | if (rel_trg == DEP_NO_PARENT) { |
1573 | rel_trg = UI32(i - 1); |
1574 | } |
1575 | } |
1576 | else if (u_sscanfu_sscanf_72(dep_self, "%i", &cohort_dep[i - 1].first) != 1) { |
1577 | u_fprintfu_fprintf_72(ux_stderr, "Error: SPLITCOHORT dependency mapping dep_self was not valid on line %u before input line %u.\n", rule->line, numLines); |
1578 | CG3Quit(1); |
1579 | } |
1580 | if (dep_parent[0] == 'p' || dep_parent[0] == 'm') { |
1581 | cohort_dep[i - 1].second = DEP_NO_PARENT; |
1582 | } |
1583 | else if (u_sscanfu_sscanf_72(dep_parent, "%i", &cohort_dep[i - 1].second) != 1) { |
1584 | u_fprintfu_fprintf_72(ux_stderr, "Error: SPLITCOHORT dependency mapping dep_parent was not valid on line %u before input line %u.\n", rule->line, numLines); |
1585 | CG3Quit(1); |
1586 | } |
1587 | continue; |
1588 | } |
1589 | if (tter->tag.size() == 3 && tter->tag[0] == 'R' && tter->tag[1] == ':' && tter->tag[2] == '*') { |
1590 | rel_trg = UI32(i - 1); |
1591 | continue; |
1592 | } |
1593 | readings->back().push_back(tter); |
1594 | } |
1595 | |
1596 | if (rel_trg == DEP_NO_PARENT) { |
1597 | rel_trg = UI32(cohorts.size() - 1); |
1598 | } |
1599 | |
1600 | for (size_t i = 0; i < cohorts.size(); ++i) { |
1601 | Cohort* cCohort = cohorts[i].first; |
1602 | readings = &cohorts[i].second; |
1603 | |
1604 | for (auto tags : *readings) { |
1605 | Reading* cReading = alloc_reading(cCohort); |
1606 | ++numReadings; |
1607 | insert_if_exists(cReading->parent->possible_sets, grammar->sets_any); |
1608 | cReading->hit_by.push_back(rule->number); |
1609 | cReading->noprint = false; |
1610 | TagList mappings; |
1611 | |
1612 | for (size_t i = 0; i < tags.size(); ++i) { |
1613 | if (tags[i]->hash == grammar->tag_any) { |
1614 | uint32Vector& nt = get_apply_to().cohort->readings.front()->tags_list; |
1615 | if (nt.size() <= 2) { |
1616 | continue; |
1617 | } |
1618 | tags.reserve(tags.size() + nt.size() - 2); |
1619 | tags[i] = grammar->single_tags[nt[2]]; |
1620 | for (size_t j = 3, k = 1; j < nt.size(); ++j) { |
1621 | if (grammar->single_tags[nt[j]]->type & T_DEPENDENCY) { |
1622 | continue; |
1623 | } |
1624 | tags.insert(tags.begin() + i + k, grammar->single_tags[nt[j]]); |
1625 | ++k; |
1626 | } |
1627 | } |
1628 | } |
1629 | |
1630 | for (auto tter : tags) { |
1631 | uint32_t hash = tter->hash; |
1632 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
1633 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { |
1634 | mappings.push_back(tter); |
1635 | } |
1636 | else { |
1637 | hash = addTagToReading(*cReading, hash); |
1638 | } |
1639 | if (updateValidRules(rules, intersects, hash, *cReading)) { |
1640 | iter_rules = intersects.find(rule->number); |
1641 | iter_rules_end = intersects.end(); |
1642 | } |
1643 | } |
1644 | if (!mappings.empty()) { |
1645 | splitMappings(mappings, *cCohort, *cReading); |
1646 | } |
1647 | cCohort->appendReading(cReading); |
1648 | } |
1649 | |
1650 | if (cCohort->readings.empty()) { |
1651 | initEmptyCohort(*cCohort); |
1652 | } |
1653 | |
1654 | current.parent->dep_window[cCohort->global_number] = cCohort; |
1655 | current.parent->cohort_map[cCohort->global_number] = cCohort; |
1656 | |
1657 | current.cohorts.insert(current.cohorts.begin() + get_apply_to().cohort->local_number + i + 1, cCohort); |
1658 | current.all_cohorts.insert(std::find(current.all_cohorts.begin() + get_apply_to().cohort->local_number, current.all_cohorts.end(), get_apply_to().cohort) + i + 1, cCohort); |
1659 | } |
1660 | |
1661 | // Move text from the to-be-deleted cohort to the last new cohort |
1662 | std::swap(cohorts.back().first->text, get_apply_to().cohort->text); |
1663 | |
1664 | for (size_t i = 0; i < cohorts.size(); ++i) { |
1665 | Cohort* cCohort = cohorts[i].first; |
1666 | |
1667 | if (cohort_dep[i].first == DEP_NO_PARENT) { |
1668 | while (!get_apply_to().cohort->dep_children.empty()) { |
1669 | uint32_t ch = get_apply_to().cohort->dep_children.back(); |
1670 | attachParentChild(*cCohort, *current.parent->cohort_map[ch], true, true); |
1671 | get_apply_to().cohort->dep_children.erase(ch); // Just in case the attachment can't be made for some reason |
1672 | } |
1673 | } |
1674 | |
1675 | if (cohort_dep[i].second == DEP_NO_PARENT) { |
1676 | if (current.parent->cohort_map.count(get_apply_to().cohort->dep_parent)) { |
1677 | attachParentChild(*current.parent->cohort_map[get_apply_to().cohort->dep_parent], *cCohort, true, true); |
1678 | } |
1679 | } |
1680 | else { |
1681 | attachParentChild(*current.parent->cohort_map[cohorts.front().first->global_number + cohort_dep[i].second - 1], *cCohort, true, true); |
1682 | } |
1683 | |
1684 | // Re-attach all named relations to the dependency tail or R:* cohort |
1685 | if (rel_trg == i && (get_apply_to().cohort->type & CT_RELATED)) { |
1686 | cCohort->setRelated(); |
1687 | cCohort->relations.swap(get_apply_to().cohort->relations); |
1688 | |
1689 | std::pair<SingleWindow**, size_t> swss[3] = { |
1690 | std::make_pair(&gWindow->previous[0], gWindow->previous.size()), |
1691 | std::make_pair(&gWindow->current, static_cast<size_t>(1)), |
1692 | std::make_pair(&gWindow->next[0], gWindow->next.size()), |
1693 | }; |
1694 | for (auto sws : swss) { |
1695 | for (size_t sw = 0; sw < sws.second; ++sw) { |
1696 | for (auto ch : sws.first[sw]->cohorts) { |
1697 | for (auto& rel : ch->relations) { |
1698 | if (rel.second.count(get_apply_to().cohort->global_number)) { |
1699 | rel.second.erase(get_apply_to().cohort->global_number); |
1700 | rel.second.insert(cCohort->global_number); |
1701 | } |
1702 | } |
1703 | } |
1704 | } |
1705 | } |
1706 | } |
1707 | } |
1708 | |
1709 | // Remove the source cohort |
1710 | for (auto iter : get_apply_to().cohort->readings) { |
1711 | iter->hit_by.push_back(rule->number); |
1712 | iter->deleted = true; |
1713 | } |
1714 | get_apply_to().cohort->type |= CT_REMOVED; |
1715 | get_apply_to().cohort->detach(); |
1716 | for (auto& cm : current.parent->cohort_map) { |
1717 | cm.second->dep_children.erase(get_apply_to().cohort->dep_self); |
1718 | } |
1719 | current.parent->cohort_map.erase(get_apply_to().cohort->global_number); |
1720 | current.cohorts.erase(current.cohorts.begin() + get_apply_to().cohort->local_number); |
1721 | |
1722 | reindex(); |
1723 | indexSingleWindow(current); |
1724 | readings_changed = true; |
1725 | |
1726 | reset_cohorts_for_loop = true; |
1727 | } |
1728 | else if (rule->type == K_ADD || rule->type == K_MAP) { |
1729 | TRACE; |
1730 | auto state_hash = get_apply_to().subreading->hash; |
1731 | index_ruleCohort_no.clear(); |
1732 | auto& reading = *(get_apply_to().subreading); |
1733 | reading.noprint = false; |
1734 | auto mappings = ss_taglist.get(); |
1735 | auto theTags = ss_taglist.get(); |
1736 | getTagList(*rule->maplist, theTags); |
1737 | |
1738 | bool did_insert = false; |
1739 | if (rule->childset1) { |
1740 | bool found_spot = false; |
1741 | auto spot_tags = ss_taglist.get(); |
1742 | getTagList(*grammar->sets_list[rule->childset1], spot_tags); |
1743 | FILL_TAG_LIST(spot_tags)do { Reading& reading = *get_apply_to().subreading; for ( auto it = (spot_tags)->begin(); it != (spot_tags)->end( );) { if (reading.tags.find((*it)->hash) == reading.tags.end ()) { auto tt = *it; it = (spot_tags)->erase(it); if (tt-> type & T_SPECIAL) { if (context_stack.back().regexgrps == nullptr) { context_stack.back().regexgrps = ®exgrps_store [used_regex]; } auto stag = doesTagMatchReading(reading, *tt, false, true); if (stag) { (spot_tags)->insert(it, grammar ->single_tags.find(stag)->second); } } continue; } ++it ; } } while (0); |
1744 | auto it = reading.tags_list.begin(); |
1745 | for (; it != reading.tags_list.end(); ++it) { |
1746 | bool found = true; |
1747 | auto tmp = it; |
1748 | for (auto tag : *spot_tags) { |
1749 | if (*tmp != tag->hash) { |
1750 | found = false; |
1751 | break; |
1752 | } |
1753 | ++tmp; |
1754 | } |
1755 | if (found) { |
1756 | found_spot = true; |
1757 | break; |
1758 | } |
1759 | } |
1760 | if (found_spot) { |
1761 | if (rule->flags & RF_AFTER) { |
1762 | std::advance(it, spot_tags->size()); |
1763 | } |
1764 | if (it != reading.tags_list.end()) { |
1765 | insert_taglist_to_reading(it, *theTags, reading, mappings); |
1766 | did_insert = true; |
1767 | } |
1768 | } |
1769 | } |
1770 | |
1771 | if (!did_insert) { |
1772 | APPEND_TAGLIST_TO_READING(*theTags, reading)do { for (auto tter : (*theTags)) { while (tter->type & T_VARSTRING) { tter = generateVarstringTag(tter); } auto hash = tter->hash; if (tter->type & T_MAPPING || tter-> tag[0] == grammar->mapping_prefix) { mappings->push_back (tter); } else { hash = addTagToReading((reading), tter); } if (updateValidRules(rules, intersects, hash, reading)) { iter_rules = intersects.find(rule->number); iter_rules_end = intersects .end(); } } } while (0); |
1773 | } |
1774 | if (!mappings->empty()) { |
1775 | splitMappings(mappings, *get_apply_to().cohort, reading, rule->type == K_MAP); |
1776 | } |
1777 | if (rule->type == K_MAP) { |
1778 | reading.mapped = true; |
1779 | } |
1780 | if (reading.hash != state_hash) { |
1781 | readings_changed = true; |
1782 | } |
1783 | } |
1784 | else if (rule->type == K_RESTORE) { |
1785 | bool did_restore = false; |
1786 | auto move_rs = [&](ReadingList& rl) { |
1787 | for (size_t i = 0; i < rl.size();) { |
1788 | if (doesSetMatchReading(*rl[i], rule->maplist->number)) { |
1789 | rl[i]->deleted = false; |
1790 | rl[i]->hit_by.push_back(rule->number); |
1791 | get_apply_to().cohort->readings.push_back(rl[i]); |
1792 | rl.erase(rl.begin() + i); |
1793 | did_restore = true; |
1794 | } |
1795 | else { |
1796 | ++i; |
1797 | } |
1798 | } |
1799 | }; |
1800 | |
1801 | if (rule->flags & RF_DELAYED) { |
1802 | move_rs(get_apply_to().cohort->delayed); |
1803 | } |
1804 | else if (rule->flags & RF_IGNORED) { |
1805 | move_rs(get_apply_to().cohort->ignored); |
1806 | } |
1807 | else { |
1808 | move_rs(get_apply_to().cohort->deleted); |
1809 | } |
1810 | |
1811 | if (did_restore) { |
1812 | TRACE; |
1813 | } |
1814 | finish_reading_loop = false; |
1815 | } |
1816 | else if (rule->type == K_REPLACE) { |
1817 | auto state_hash = get_apply_to().subreading->hash; |
1818 | index_ruleCohort_no.clear(); |
1819 | TRACE; |
1820 | get_apply_to().subreading->noprint = false; |
1821 | get_apply_to().subreading->tags_list.clear(); |
1822 | get_apply_to().subreading->tags_list.push_back(get_apply_to().cohort->wordform->hash); |
1823 | get_apply_to().subreading->tags_list.push_back(get_apply_to().subreading->baseform); |
1824 | reflowReading(*get_apply_to().subreading); |
1825 | auto mappings = ss_taglist.get(); |
1826 | auto theTags = ss_taglist.get(); |
1827 | getTagList(*rule->maplist, theTags); |
1828 | |
1829 | APPEND_TAGLIST_TO_READING(*theTags, *get_apply_to().subreading)do { for (auto tter : (*theTags)) { while (tter->type & T_VARSTRING) { tter = generateVarstringTag(tter); } auto hash = tter->hash; if (tter->type & T_MAPPING || tter-> tag[0] == grammar->mapping_prefix) { mappings->push_back (tter); } else { hash = addTagToReading((*get_apply_to().subreading ), tter); } if (updateValidRules(rules, intersects, hash, *get_apply_to ().subreading)) { iter_rules = intersects.find(rule->number ); iter_rules_end = intersects.end(); } } } while (0); |
1830 | |
1831 | if (!mappings->empty()) { |
1832 | splitMappings(mappings, *get_apply_to().cohort, *get_apply_to().subreading, true); |
1833 | } |
1834 | if (get_apply_to().subreading->hash != state_hash) { |
1835 | readings_changed = true; |
1836 | } |
1837 | } |
// SUBSTITUTE: remove the tags of rule->sublist that are present in the
// targeted (sub)reading and insert the tags of rule->maplist where the
// removed tags were. Nothing is inserted if no tag was removed.
1838 | else if (rule->type == K_SUBSTITUTE) { |
1839 | // ToDo: Check whether this substitution will do nothing at all to the end result |
1840 | // ToDo: Not actually...instead, test whether any reading in the cohort already is the end result |
1841 | |
1842 | auto state_hash = get_apply_to().subreading->hash; |
1843 | auto theTags = ss_taglist.get(); |
1844 | getTagList(*rule->sublist, theTags); |
1845 | |
1846 | // Modify the list of tags to remove to be the actual list of tags present, including matching regex and icase tags |
1847 | FILL_TAG_LIST(theTags)do { Reading& reading = *get_apply_to().subreading; for ( auto it = (theTags)->begin(); it != (theTags)->end();) { if (reading.tags.find((*it)->hash) == reading.tags.end()) { auto tt = *it; it = (theTags)->erase(it); if (tt->type & T_SPECIAL) { if (context_stack.back().regexgrps == nullptr ) { context_stack.back().regexgrps = ®exgrps_store[used_regex ]; } auto stag = doesTagMatchReading(reading, *tt, false, true ); if (stag) { (theTags)->insert(it, grammar->single_tags .find(stag)->second); } } continue; } ++it; } } while (0); |
1848 | |
1849 | // Perform the tag removal, remembering the position of the final removed tag for use as insertion spot |
// Removed tags are not erased right away: their slot in tags_list is
// overwritten with the placeholder `substtag`, which is resolved further down.
// `plain` stays true while the tags to remove are matched as one run.
1850 | size_t tpos = std::numeric_limits<size_t>::max(); |
1851 | bool plain = true; |
1852 | for (size_t i = 0; i < get_apply_to().subreading->tags_list.size();) { |
1853 | auto& remter = get_apply_to().subreading->tags_list[i]; |
1854 | |
// NOTE(review): theTags->begin() is dereferenced without an emptiness check;
// confirm FILL_TAG_LIST cannot leave theTags empty at this point.
1855 | if (plain && remter == (*theTags->begin())->hash) { |
1856 | if (get_apply_to().subreading->baseform == remter) { |
1857 | get_apply_to().subreading->baseform = 0; |
1858 | } |
1859 | remter = substtag; |
1860 | tpos = i; |
// Try to consume the remaining tags to remove; on the first mismatch
// fall back to the per-tag removal below (plain = false).
1861 | for (size_t j = 1; j < theTags->size() && i < get_apply_to().subreading->tags_list.size(); ++j, ++i) { |
1862 | auto& remter = get_apply_to().subreading->tags_list[i]; |
1863 | auto tter = (*theTags)[j]->hash; |
1864 | if (remter != tter) { |
1865 | plain = false; |
1866 | break; |
1867 | } |
1868 | get_apply_to().subreading->tags_list.erase(get_apply_to().subreading->tags_list.begin() + i); |
1869 | get_apply_to().subreading->tags.erase(tter); |
1870 | if (get_apply_to().subreading->baseform == tter) { |
1871 | get_apply_to().subreading->baseform = 0; |
1872 | } |
1873 | } |
1874 | continue; |
1875 | } |
1876 | |
// Per-tag removal: any slot holding one of the tags to remove becomes a placeholder
1877 | for (auto tter : *theTags) { |
1878 | if (remter != tter->hash) { |
1879 | continue; |
1880 | } |
1881 | tpos = i; |
1882 | remter = substtag; |
1883 | get_apply_to().subreading->tags.erase(tter->hash); |
1884 | if (get_apply_to().subreading->baseform == tter->hash) { |
1885 | get_apply_to().subreading->baseform = 0; |
1886 | } |
1887 | } |
1888 | |
1889 | ++i; |
1890 | } |
1891 | |
1892 | // Should Substitute really do nothing if no tags were removed? 2013-10-21, Eckhard says this is expected behavior. |
1893 | if (tpos != std::numeric_limits<size_t>::max()) { |
// Non-contiguous removal: drop every placeholder located before tpos,
// shifting tpos down for each one erased.
1894 | if (!plain) { |
1895 | for (size_t i = 0; i < get_apply_to().subreading->tags_list.size() && i < tpos;) { |
1896 | if (get_apply_to().subreading->tags_list[i] == substtag) { |
1897 | get_apply_to().subreading->tags_list.erase(get_apply_to().subreading->tags_list.begin() + i); |
1898 | --tpos; |
1899 | } |
1900 | else { |
1901 | ++i; |
1902 | } |
1903 | } |
1904 | } |
1905 | |
// wf records an inserted wordform tag, if any, so the cohort can be updated below
1906 | Tag* wf = nullptr; |
1907 | index_ruleCohort_no.clear(); |
1908 | TRACE; |
1909 | get_apply_to().subreading->noprint = false; |
1910 | if (tpos >= get_apply_to().subreading->tags_list.size()) { |
1911 | tpos = get_apply_to().subreading->tags_list.size() - 1; |
1912 | } |
1913 | ++tpos; |
1914 | auto mappings = ss_taglist.get(); |
// From here on theTags is the list of tags to insert (rule->maplist); it shadows the removal list above
1915 | auto theTags = ss_taglist.get(); |
1916 | getTagList(*rule->maplist, theTags); |
1917 | |
// Replace each remaining placeholder with the tags to insert; mapping tags
// are set aside in `mappings` instead of being inserted into tags_list.
1918 | for (size_t i = 0; i < get_apply_to().subreading->tags_list.size();) { |
1919 | if (get_apply_to().subreading->tags_list[i] == substtag) { |
1920 | get_apply_to().subreading->tags_list.erase(get_apply_to().subreading->tags_list.begin() + i); |
1921 | tpos = i; |
1922 | |
1923 | for (auto tag : *theTags) { |
1924 | if (tag->type & T_VARSTRING) { |
1925 | tag = generateVarstringTag(tag); |
1926 | } |
1927 | if (tag->hash == grammar->tag_any) { |
1928 | break; |
1929 | } |
1930 | if (tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix) { |
1931 | mappings->push_back(tag); |
1932 | } |
1933 | else { |
1934 | if (tag->type & T_WORDFORM) { |
1935 | wf = tag; |
1936 | } |
1937 | get_apply_to().subreading->tags_list.insert(get_apply_to().subreading->tags_list.begin() + tpos, tag->hash); |
1938 | ++tpos; |
1939 | } |
1940 | if (updateValidRules(rules, intersects, tag->hash, *get_apply_to().subreading)) { |
1941 | iter_rules = intersects.find(rule->number); |
1942 | iter_rules_end = intersects.end(); |
1943 | } |
1944 | } |
1945 | } |
1946 | else { |
1947 | ++i; |
1948 | } |
1949 | } |
1950 | reflowReading(*get_apply_to().subreading); |
1951 | |
1952 | if (!mappings->empty()) { |
1953 | splitMappings(mappings, *get_apply_to().cohort, *get_apply_to().subreading, true); |
1954 | } |
// A new wordform was substituted in: swap the wordform tag on the cohort's
// readings, deleted and delayed lists, then re-evaluate the wordform rules.
1955 | if (wf && wf != get_apply_to().subreading->parent->wordform) { |
1956 | for (auto r : get_apply_to().subreading->parent->readings) { |
1957 | delTagFromReading(*r, get_apply_to().subreading->parent->wordform); |
1958 | addTagToReading(*r, wf); |
1959 | } |
1960 | for (auto r : get_apply_to().subreading->parent->deleted) { |
1961 | delTagFromReading(*r, get_apply_to().subreading->parent->wordform); |
1962 | addTagToReading(*r, wf); |
1963 | } |
1964 | for (auto r : get_apply_to().subreading->parent->delayed) { |
1965 | delTagFromReading(*r, get_apply_to().subreading->parent->wordform); |
1966 | addTagToReading(*r, wf); |
1967 | } |
1968 | get_apply_to().subreading->parent->wordform = wf; |
1969 | for (auto r : grammar->wf_rules) { |
1970 | if (doesWordformsMatch(wf, r->wordform)) { |
1971 | current.rule_to_cohorts[r->number].insert(get_apply_to().cohort); |
1972 | intersects.insert(r->number); |
1973 | } |
1974 | else { |
1975 | current.rule_to_cohorts[r->number].erase(get_apply_to().cohort); |
1976 | } |
1977 | } |
1978 | updateValidRules(rules, intersects, wf->hash, *get_apply_to().subreading); |
1979 | iter_rules = intersects.find(rule->number); |
1980 | iter_rules_end = intersects.end(); |
1981 | } |
1982 | } |
// Only flag a change if the reading's hash differs from the one saved on entry
1983 | if (get_apply_to().subreading->hash != state_hash) { |
1984 | readings_changed = true; |
1985 | } |
1986 | } |
1987 | else if (rule->type == K_APPEND) { |
1988 | index_ruleCohort_no.clear(); |
1989 | TRACE; |
1990 | |
1991 | Tag* bf = nullptr; |
1992 | std::vector<TagList> readings; |
1993 | auto theTags = ss_taglist.get(); |
1994 | getTagList(*rule->maplist, theTags); |
1995 | |
1996 | for (auto& tter : *theTags) { |
1997 | if (tter->type & T_VSTR) { |
1998 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
1999 | } |
2000 | } |
2001 | |
2002 | for (auto tter : *theTags) { |
2003 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
2004 | if (tter->type & T_BASEFORM) { |
2005 | bf = tter; |
2006 | readings.resize(readings.size() + 1); |
2007 | } |
2008 | if (bf == nullptr) { |
2009 | u_fprintfu_fprintf_72(ux_stderr, "Error: There must be a baseform before any other tags in APPEND on line %u.\n", rule->line); |
2010 | CG3Quit(1); |
2011 | } |
2012 | readings.back().push_back(tter); |
2013 | } |
2014 | |
2015 | for (const auto& rit : readings) { |
2016 | Reading* cReading = alloc_reading(get_apply_to().cohort); |
2017 | ++numReadings; |
2018 | insert_if_exists(cReading->parent->possible_sets, grammar->sets_any); |
2019 | addTagToReading(*cReading, get_apply_to().cohort->wordform); |
2020 | cReading->hit_by.push_back(rule->number); |
2021 | cReading->noprint = false; |
2022 | TagList mappings; |
2023 | for (auto tter : rit) { |
2024 | uint32_t hash = tter->hash; |
2025 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
2026 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { |
2027 | mappings.push_back(tter); |
2028 | } |
2029 | else { |
2030 | hash = addTagToReading(*cReading, tter); |
2031 | } |
2032 | if (updateValidRules(rules, intersects, hash, *cReading)) { |
2033 | iter_rules = intersects.find(rule->number); |
2034 | iter_rules_end = intersects.end(); |
2035 | } |
2036 | } |
2037 | if (!mappings.empty()) { |
2038 | splitMappings(mappings, *get_apply_to().cohort, *cReading); |
2039 | } |
2040 | get_apply_to().cohort->appendReading(cReading); |
2041 | } |
2042 | |
2043 | if (get_apply_to().cohort->readings.size() > 1) { |
2044 | foreach (rit, get_apply_to().cohort->readings)if (!(get_apply_to().cohort->readings).empty()) for (auto rit = (get_apply_to().cohort->readings).begin(), rit_end = (get_apply_to ().cohort->readings).end(); rit != rit_end; ++rit) { |
2045 | if ((*rit)->noprint) { |
2046 | free_reading(*rit); |
2047 | rit = get_apply_to().cohort->readings.erase(rit); |
2048 | rit_end = get_apply_to().cohort->readings.end(); |
2049 | } |
2050 | } |
2051 | } |
2052 | |
2053 | readings_changed = true; |
2054 | finish_reading_loop = false; |
2055 | } |
2056 | else if (rule->type == K_COPY) { |
2057 | // ToDo: Maybe just goto Substitute directly? |
2058 | Reading* cReading = get_apply_to().cohort->allocateAppendReading(*get_apply_to().reading); |
2059 | ++numReadings; |
2060 | index_ruleCohort_no.clear(); |
2061 | TRACE; |
2062 | cReading->hit_by.push_back(rule->number); |
2063 | cReading->noprint = false; |
2064 | |
2065 | if (rule->sublist) { |
2066 | auto excepts = ss_taglist.get(); |
2067 | getTagList(*rule->sublist, excepts); |
2068 | FILL_TAG_LIST_RAW(excepts)do { Reading& reading = *get_apply_to().subreading; for ( auto& tt : *(excepts)) { if (tt->type & T_SPECIAL) { if (context_stack.back().regexgrps == nullptr) { context_stack .back().regexgrps = ®exgrps_store[used_regex]; } auto stag = doesTagMatchReading(reading, *tt, false, true); if (stag) { tt = grammar->single_tags.find(stag)->second; } } } } while (0); |
2069 | for (auto r = cReading; r; r = r->next) { |
2070 | for (auto tter : *excepts) { |
2071 | delTagFromReading(*r, tter); |
2072 | } |
2073 | } |
2074 | } |
2075 | |
2076 | auto mappings = ss_taglist.get(); |
2077 | auto theTags = ss_taglist.get(); |
2078 | getTagList(*rule->maplist, theTags); |
2079 | |
2080 | bool did_insert = false; |
2081 | if (rule->childset1) { |
2082 | auto spot_tags = ss_taglist.get(); |
2083 | getTagList(*grammar->sets_list[rule->childset1], spot_tags); |
2084 | FILL_TAG_LIST(spot_tags)do { Reading& reading = *get_apply_to().subreading; for ( auto it = (spot_tags)->begin(); it != (spot_tags)->end( );) { if (reading.tags.find((*it)->hash) == reading.tags.end ()) { auto tt = *it; it = (spot_tags)->erase(it); if (tt-> type & T_SPECIAL) { if (context_stack.back().regexgrps == nullptr) { context_stack.back().regexgrps = ®exgrps_store [used_regex]; } auto stag = doesTagMatchReading(reading, *tt, false, true); if (stag) { (spot_tags)->insert(it, grammar ->single_tags.find(stag)->second); } } continue; } ++it ; } } while (0); |
2085 | auto it = cReading->tags_list.begin(); |
2086 | for (; it != cReading->tags_list.end(); ++it) { |
2087 | bool found = true; |
2088 | auto tmp = it; |
2089 | for (auto tag : *spot_tags) { |
2090 | if (*tmp != tag->hash) { |
2091 | found = false; |
2092 | break; |
2093 | } |
2094 | ++tmp; |
2095 | } |
2096 | if (found) { |
2097 | break; |
2098 | } |
2099 | } |
2100 | if (rule->flags & RF_AFTER) { |
2101 | std::advance(it, spot_tags->size()); |
2102 | } |
2103 | if (it != cReading->tags_list.end()) { |
2104 | insert_taglist_to_reading(it, *theTags, *cReading, mappings); |
2105 | did_insert = true; |
2106 | } |
2107 | } |
2108 | |
2109 | if (!did_insert) { |
2110 | APPEND_TAGLIST_TO_READING(*theTags, *cReading)do { for (auto tter : (*theTags)) { while (tter->type & T_VARSTRING) { tter = generateVarstringTag(tter); } auto hash = tter->hash; if (tter->type & T_MAPPING || tter-> tag[0] == grammar->mapping_prefix) { mappings->push_back (tter); } else { hash = addTagToReading((*cReading), tter); } if (updateValidRules(rules, intersects, hash, *cReading)) { iter_rules = intersects.find(rule->number); iter_rules_end = intersects .end(); } } } while (0); |
2111 | } |
2112 | if (!mappings->empty()) { |
2113 | splitMappings(mappings, *get_apply_to().cohort, *cReading, true); |
2114 | } |
2115 | readings_changed = true; |
2116 | reflowReading(*cReading); |
2117 | } |
// MERGECOHORTS: run every context in rule->dep_tests from the target cohort,
// collecting the cohorts to merge in `withs`; if any context fails the rule is
// abandoned. Otherwise a new cohort is added at `merge_at` via add_cohort()
// and every collected cohort is removed via rem_cohort().
2118 | else if (rule->type == K_MERGECOHORTS) { |
2119 | index_ruleCohort_no.clear(); |
2120 | |
2121 | CohortSet withs; |
2122 | Cohort* target = get_apply_to().cohort; |
2123 | withs.insert(target); |
2124 | Cohort* merge_at = target; |
2125 | for (auto it : rule->dep_tests) { |
// Reset the attach_to slot and merge_with before each test, so that
// whatever they hold afterwards was set during this test
2126 | auto& at = context_stack.back().attach_to; |
2127 | at.cohort = nullptr; |
2128 | at.reading = nullptr; |
2129 | at.subreading = nullptr; |
2130 | merge_with = nullptr; |
2131 | set_mark(target); |
2132 | dep_deep_seen.clear(); |
2133 | tmpl_cntx.clear(); |
2134 | Cohort* attach = nullptr; |
2135 | bool test_good = (runContextualTest(target->parent, target->local_number, it, &attach) && attach); |
2136 | |
2137 | profileRuleContext(test_good, rule, it); |
2138 | |
2139 | if (!test_good) { |
2140 | finish_reading_loop = false; |
2141 | return; |
2142 | } |
// Pick what to merge: an attach_to cohort moves the merge position (and
// merge_with, if set, is merged too); otherwise merge_with, else the matched cohort
2143 | if (get_attach_to().cohort) { |
2144 | merge_at = get_attach_to().cohort; |
2145 | if (merge_with) { |
2146 | withs.insert(merge_with); |
2147 | } |
2148 | } |
2149 | else if (merge_with) { |
2150 | withs.insert(merge_with); |
2151 | } |
2152 | else { |
2153 | withs.insert(attach); |
2154 | } |
2155 | } |
2156 | |
// add_cohort() receives spacesInAddedWf by name and the value is consumed
// below - presumably an out-parameter counting spaces in the new wordform; confirm.
2157 | size_t spacesInAddedWf = 0; |
2158 | context_stack.back().target.cohort = add_cohort(merge_at, spacesInAddedWf); |
2159 | |
// Before removing each merged cohort, erase spaces from its text, at most
// spacesInAddedWf in total across all merged cohorts
2160 | for (auto c : withs) { |
2161 | size_t foundSpace = c->text.find_first_of(' '); |
2162 | while(spacesInAddedWf && foundSpace != std::string::npos) { |
2163 | c->text.erase(foundSpace, 1); |
2164 | foundSpace = c->text.find_first_of(' '); |
2165 | spacesInAddedWf--; |
2166 | } |
2167 | rem_cohort(c); |
2168 | } |
2169 | |
2170 | // If the last cohort was removed or inserted after, add <<< to the new end |
// NOTE(review): indexes cohorts[size() - 2] and readings.front() without
// checks - assumes at least two cohorts and a non-empty reading list; confirm.
2171 | if (current.cohorts.back()->readings.front()->tags.count(endtag) == 0) { |
2172 | for (auto r : current.cohorts[current.cohorts.size() - 2]->readings) { |
2173 | delTagFromReading(*r, endtag); |
2174 | } |
2175 | for (auto r : current.cohorts.back()->readings) { |
2176 | addTagToReading(*r, endtag); |
2177 | if (updateValidRules(rules, intersects, endtag, *r)) { |
2178 | iter_rules = intersects.find(rule->number); |
2179 | iter_rules_end = intersects.end(); |
2180 | } |
2181 | } |
2182 | } |
2183 | indexSingleWindow(current); |
2184 | readings_changed = true; |
2185 | |
2186 | reset_cohorts_for_loop = true; |
2187 | } |
// COPYCOHORT: copy a cohort (its readings, sub-readings and wread) into a
// newly allocated cohort and insert that next to the subtree of the cohort
// found by rule->dep_target. The copied readings additionally receive the
// tags of rule->maplist, and the tags of rule->sublist are removed afterwards.
2188 | else if (rule->type == K_COPYCOHORT) { |
2189 | Cohort* attach = nullptr; |
2190 | Cohort* cohort = context_stack.back().target.cohort; |
2191 | uint32_t c = cohort->local_number; |
2192 | dep_deep_seen.clear(); |
2193 | tmpl_cntx.clear(); |
2194 | context_stack.back().attach_to.cohort = nullptr; |
2195 | context_stack.back().attach_to.reading = nullptr; |
2196 | context_stack.back().attach_to.subreading = nullptr; |
2197 | if (runContextualTest(¤t, c, rule->dep_target, &attach) && attach) { |
2198 | profileRuleContext(true, rule, rule->dep_target); |
2199 | |
// An attach_to cohort set during the test overrides the matched cohort
2200 | if (get_attach_to().cohort) { |
2201 | attach = get_attach_to().cohort; |
2202 | } |
2203 | context_target = attach; |
// All further contexts in rule->dep_tests are evaluated from the attach cohort
2204 | bool good = true; |
2205 | for (auto it : rule->dep_tests) { |
2206 | context_stack.back().mark = attach; |
2207 | dep_deep_seen.clear(); |
2208 | tmpl_cntx.clear(); |
2209 | bool test_good = (runContextualTest(attach->parent, attach->local_number, it) != nullptr); |
2210 | |
2211 | profileRuleContext(test_good, rule, it); |
2212 | |
2213 | if (!test_good) { |
2214 | good = test_good; |
2215 | break; |
2216 | } |
2217 | } |
2218 | |
// Give up if a context failed, source and destination are the same cohort,
// or the source cohort sits at local position 0
2219 | if (!good || cohort == attach || cohort->local_number == 0) { |
2220 | return; |
2221 | } |
2222 | |
// RF_REVERSE swaps the roles of the two cohorts and uses childset1 instead of childset2
2223 | auto childset = rule->childset2; |
2224 | if (rule->flags & RF_REVERSE) { |
2225 | std::swap(cohort, attach); |
2226 | childset = rule->childset1; |
2227 | } |
2228 | |
2229 | Cohort* cCohort = alloc_cohort(attach->parent); |
2230 | cCohort->global_number = gWindow->cohort_counter++; |
2231 | cCohort->wordform = cohort->wordform; |
2232 | insert_if_exists(cCohort->possible_sets, grammar->sets_any); |
2233 | |
2234 | auto theTags = ss_taglist.get(); |
2235 | getTagList(*rule->maplist, theTags); |
2236 | |
// Resolve varstring tags in place in the tag list
2237 | for (auto& tter : *theTags) { |
2238 | if (tter->type & T_VSTR) { |
2239 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
2240 | } |
2241 | } |
2242 | |
// excepts stays empty when the rule has no sublist
2243 | auto excepts = ss_taglist.get(); |
2244 | if (rule->sublist) { |
2245 | getTagList(*rule->sublist, excepts); |
2246 | FILL_TAG_LIST_RAW(excepts)do { Reading& reading = *get_apply_to().subreading; for ( auto& tt : *(excepts)) { if (tt->type & T_SPECIAL) { if (context_stack.back().regexgrps == nullptr) { context_stack .back().regexgrps = ®exgrps_store[used_regex]; } auto stag = doesTagMatchReading(reading, *tt, false, true); if (stag) { tt = grammar->single_tags.find(stag)->second; } } } } while (0); |
2247 | } |
2248 | |
// Copy every reading together with its chain of sub-readings (r->next);
// rs collects the copies of one chain so they can be re-linked below
2249 | std::vector<Reading*> rs; |
2250 | for (auto r : cohort->readings) { |
2251 | rs.clear(); |
2252 | for (; r; r = r->next) { |
2253 | auto cReading = alloc_reading(cCohort); |
2254 | ++numReadings; |
2255 | cReading->hit_by.push_back(rule->number); |
2256 | cReading->noprint = false; |
2257 | TagList mappings; |
// First the tags of the source reading ...
2258 | for (auto hash : r->tags_list) { |
2259 | auto tter = grammar->single_tags[hash]; |
2260 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { |
2261 | mappings.push_back(tter); |
2262 | } |
2263 | else { |
2264 | hash = addTagToReading(*cReading, hash); |
2265 | } |
2266 | if (updateValidRules(rules, intersects, hash, *cReading)) { |
2267 | iter_rules = intersects.find(rule->number); |
2268 | iter_rules_end = intersects.end(); |
2269 | } |
2270 | } |
// ... then the rule's own tags, skipping grammar->tag_any
2271 | for (auto tter : *theTags) { |
2272 | auto hash = tter->hash; |
2273 | if (hash == grammar->tag_any) { |
2274 | continue; |
2275 | } |
2276 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { |
2277 | mappings.push_back(tter); |
2278 | } |
2279 | else { |
2280 | hash = addTagToReading(*cReading, hash); |
2281 | } |
2282 | if (updateValidRules(rules, intersects, hash, *cReading)) { |
2283 | iter_rules = intersects.find(rule->number); |
2284 | iter_rules_end = intersects.end(); |
2285 | } |
2286 | } |
2287 | if (!mappings.empty()) { |
2288 | splitMappings(mappings, *cCohort, *cReading); |
2289 | } |
2290 | rs.push_back(cReading); |
2291 | } |
// Re-link the copies through `next` and append only the head to the new cohort
2292 | auto rn = rs.front(); |
2293 | for (size_t j = 1; j < rs.size(); ++j) { |
2294 | rn->next = rs[j]; |
2295 | rn = rn->next; |
2296 | } |
2297 | cCohort->appendReading(rs.front()); |
2298 | } |
2299 | |
// Nothing was copied: let initEmptyCohort() set the cohort up, and when
// tracing mark its first reading as hit by this rule
2300 | if (cCohort->readings.empty()) { |
2301 | initEmptyCohort(*cCohort); |
2302 | if (trace) { |
2303 | auto r = cCohort->readings.front(); |
2304 | r->hit_by.push_back(rule->number); |
2305 | r->noprint = false; |
2306 | } |
2307 | } |
2308 | |
// Remove the excepted tags from every copied reading and sub-reading
2309 | for (auto r : cCohort->readings) { |
2310 | for (; r; r = r->next) { |
2311 | for (auto tter : *excepts) { |
2312 | delTagFromReading(*r, tter); |
2313 | } |
2314 | } |
2315 | } |
2316 | |
// Copy the source cohort's wread, if it has one, tag by tag
2317 | if (cohort->wread) { |
2318 | cCohort->wread = alloc_reading(cCohort); |
2319 | for (auto hash : cohort->wread->tags_list) { |
2320 | hash = addTagToReading(*cCohort->wread, hash); |
2321 | if (updateValidRules(rules, intersects, hash, *cCohort->wread)) { |
2322 | iter_rules = intersects.find(rule->number); |
2323 | iter_rules_end = intersects.end(); |
2324 | } |
2325 | } |
2326 | } |
2327 | |
2328 | current.parent->cohort_map[cCohort->global_number] = cCohort; |
2329 | current.parent->dep_window[cCohort->global_number] = cCohort; |
2330 | |
// Insert the copy before the first or after the last cohort of the set
// gathered by collect_subtree(), and attach it to that same edge cohort
2331 | CohortSet edges; |
2332 | collect_subtree(edges, attach, childset); |
2333 | |
2334 | if (rule->flags & RF_BEFORE) { |
2335 | attach->parent->cohorts.insert(attach->parent->cohorts.begin() + edges.front()->local_number, cCohort); |
2336 | attach->parent->all_cohorts.insert(std::find(attach->parent->all_cohorts.begin() + edges.front()->local_number, attach->parent->all_cohorts.end(), edges.front()), cCohort); |
2337 | attachParentChild(*edges.front(), *cCohort); |
2338 | } |
2339 | else { |
2340 | attach->parent->cohorts.insert(attach->parent->cohorts.begin() + edges.back()->local_number + 1, cCohort); |
2341 | attach->parent->all_cohorts.insert(std::find(attach->parent->all_cohorts.begin() + edges.back()->local_number, attach->parent->all_cohorts.end(), edges.back()) + 1, cCohort); |
2342 | attachParentChild(*edges.back(), *cCohort); |
2343 | } |
2344 | |
2345 | reindex(attach->parent); |
2346 | indexSingleWindow(*attach->parent); |
2347 | readings_changed = true; |
2348 | reset_cohorts_for_loop = true; |
2349 | } |
2350 | } |
2351 | else if (rule->type == K_SETPARENT || rule->type == K_SETCHILD || rule->type == K_ADDRELATION || rule->type == K_SETRELATION || rule->type == K_REMRELATION || rule->type == K_ADDRELATIONS || rule->type == K_SETRELATIONS || rule->type == K_REMRELATIONS) { |
2352 | auto dep_target_cb = [&]() -> bool { |
2353 | Cohort* target = context_stack.back().target.cohort; |
2354 | Cohort* attach = context_stack.back().attach_to.cohort; |
2355 | swapper<Cohort*> sw((rule->flags & RF_REVERSE) != 0, target, attach); |
2356 | if (rule->type == K_SETPARENT || rule->type == K_SETCHILD) { |
2357 | bool attached = false; |
2358 | if (rule->type == K_SETPARENT) { |
2359 | attached = attachParentChild(*attach, *target, (rule->flags & RF_ALLOWLOOP) != 0, (rule->flags & RF_ALLOWCROSS) != 0); |
2360 | } |
2361 | else { |
2362 | attached = attachParentChild(*target, *attach, (rule->flags & RF_ALLOWLOOP) != 0, (rule->flags & RF_ALLOWCROSS) != 0); |
2363 | } |
2364 | if (attached) { |
2365 | index_ruleCohort_no.clear(); |
2366 | // force TRACE to use target |
2367 | Cohort* at_was = context_stack.back().attach_to.cohort; |
2368 | context_stack.back().attach_to.cohort = nullptr; |
2369 | TRACE; |
2370 | context_stack.back().attach_to.cohort = at_was; |
2371 | context_stack.back().target.subreading->noprint = false; |
2372 | has_dep = true; |
2373 | readings_changed = true; |
2374 | } |
2375 | return attached; |
2376 | } |
2377 | else if (rule->type == K_ADDRELATION || rule->type == K_SETRELATION || rule->type == K_REMRELATION) { |
2378 | bool rel_did_anything = false; |
2379 | auto theTags = ss_taglist.get(); |
2380 | getTagList(*rule->maplist, theTags); |
2381 | for (auto tter : *theTags) { |
2382 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
2383 | if (rule->type == K_ADDRELATION) { |
2384 | attach->setRelated(); |
2385 | target->setRelated(); |
2386 | rel_did_anything |= target->addRelation(tter->hash, attach->global_number); |
2387 | add_relation_rtag(target, tter, attach->global_number); |
2388 | } |
2389 | else if (rule->type == K_SETRELATION) { |
2390 | attach->setRelated(); |
2391 | target->setRelated(); |
2392 | rel_did_anything |= target->setRelation(tter->hash, attach->global_number); |
2393 | set_relation_rtag(target, tter, attach->global_number); |
2394 | } |
2395 | else { |
2396 | rel_did_anything |= target->remRelation(tter->hash, attach->global_number); |
2397 | rem_relation_rtag(target, tter, attach->global_number); |
2398 | } |
2399 | } |
2400 | if (rel_did_anything) { |
2401 | index_ruleCohort_no.clear(); |
2402 | // force TRACE to use target |
2403 | Cohort* at_was = context_stack.back().attach_to.cohort; |
2404 | context_stack.back().attach_to.cohort = nullptr; |
2405 | TRACE; |
2406 | context_stack.back().attach_to.cohort = at_was; |
2407 | context_stack.back().target.subreading->noprint = false; |
2408 | readings_changed = true; |
2409 | } |
2410 | // don't scan onwards if failed |
2411 | return true; |
2412 | } |
2413 | else if (rule->type == K_ADDRELATIONS || rule->type == K_SETRELATIONS || rule->type == K_REMRELATIONS) { |
2414 | bool rel_did_anything = false; |
2415 | |
2416 | auto sublist = ss_taglist.get(); |
2417 | getTagList(*rule->sublist, sublist); |
2418 | |
2419 | auto maplist = ss_taglist.get(); |
2420 | getTagList(*rule->maplist, maplist); |
2421 | |
2422 | for (auto tter : *maplist) { |
2423 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
2424 | if (rule->type == K_ADDRELATIONS) { |
2425 | target->setRelated(); |
2426 | rel_did_anything |= target->addRelation(tter->hash, attach->global_number); |
2427 | add_relation_rtag(target, tter, attach->global_number); |
2428 | } |
2429 | else if (rule->type == K_SETRELATIONS) { |
2430 | target->setRelated(); |
2431 | rel_did_anything |= target->setRelation(tter->hash, attach->global_number); |
2432 | set_relation_rtag(target, tter, attach->global_number); |
2433 | } |
2434 | else { |
2435 | rel_did_anything |= target->remRelation(tter->hash, attach->global_number); |
2436 | rem_relation_rtag(target, tter, attach->global_number); |
2437 | } |
2438 | } |
2439 | for (auto tter : *sublist) { |
2440 | VARSTRINGIFY(tter)do { while ((tter)->type & T_VARSTRING) { (tter) = generateVarstringTag ((tter)); } } while (0); |
2441 | if (rule->type == K_ADDRELATIONS) { |
2442 | attach->setRelated(); |
2443 | rel_did_anything |= attach->addRelation(tter->hash, target->global_number); |
2444 | add_relation_rtag(attach, tter, target->global_number); |
2445 | } |
2446 | else if (rule->type == K_SETRELATIONS) { |
2447 | attach->setRelated(); |
2448 | rel_did_anything |= attach->setRelation(tter->hash, target->global_number); |
2449 | set_relation_rtag(attach, tter, target->global_number); |
2450 | } |
2451 | else { |
2452 | rel_did_anything |= attach->remRelation(tter->hash, target->global_number); |
2453 | rem_relation_rtag(attach, tter, target->global_number); |
2454 | } |
2455 | } |
2456 | if (rel_did_anything) { |
2457 | index_ruleCohort_no.clear(); |
2458 | // force TRACE to use target |
2459 | Cohort* at_was = context_stack.back().attach_to.cohort; |
2460 | context_stack.back().attach_to.cohort = nullptr; |
2461 | TRACE; |
2462 | context_stack.back().attach_to.cohort = at_was; |
2463 | context_stack.back().target.subreading->noprint = false; |
2464 | readings_changed = true; |
2465 | } |
2466 | // don't scan onwards if failed |
2467 | return true; |
2468 | } |
2469 | return true; |
2470 | }; |
2471 | int32_t orgoffset = rule->dep_target->offset; |
2472 | auto seen_targets = ss_u32sv.get(); |
2473 | |
2474 | ReadingSpec orgtarget = context_stack.back().target; |
2475 | while (true) { |
2476 | auto utags = ss_utags.get(); |
2477 | auto usets = ss_usets.get(); |
2478 | *utags = *context_stack.back().unif_tags; |
2479 | *usets = *context_stack.back().unif_sets; |
2480 | |
2481 | Cohort* attach = nullptr; |
2482 | Cohort* target = context_stack.back().target.cohort; |
2483 | seen_targets->insert(target->global_number); |
2484 | dep_deep_seen.clear(); |
2485 | tmpl_cntx.clear(); |
2486 | context_stack.back().attach_to.cohort = nullptr; |
2487 | context_stack.back().attach_to.reading = nullptr; |
2488 | context_stack.back().attach_to.subreading = nullptr; |
2489 | seen_barrier = false; |
2490 | if (runContextualTest(target->parent, target->local_number, rule->dep_target, &attach) && attach) { |
2491 | profileRuleContext(true, rule, rule->dep_target); |
2492 | |
2493 | bool break_after = seen_barrier || (rule->flags & RF_NEAREST); |
2494 | if (get_attach_to().cohort) { |
2495 | attach = get_attach_to().cohort; |
2496 | } |
2497 | context_target = attach; |
2498 | bool good = true; |
2499 | for (auto it : rule->dep_tests) { |
2500 | context_stack.back().mark = attach; |
2501 | dep_deep_seen.clear(); |
2502 | tmpl_cntx.clear(); |
2503 | bool test_good = (runContextualTest(attach->parent, attach->local_number, it) != nullptr); |
2504 | |
2505 | profileRuleContext(test_good, rule, it); |
2506 | |
2507 | if (!test_good) { |
2508 | good = test_good; |
2509 | break; |
2510 | } |
2511 | } |
2512 | if (!get_attach_to().cohort) { |
2513 | context_stack.back().attach_to.cohort = attach; |
2514 | } |
2515 | if (good) { |
2516 | ReadingSpec temp = context_stack.back().target; |
2517 | context_stack.back().target = orgtarget; |
2518 | bool attached = dep_target_cb(); |
2519 | if (attached) { |
2520 | break; |
2521 | } |
2522 | else { |
2523 | context_stack.back().target = temp; |
2524 | } |
2525 | } |
2526 | if (break_after) { |
2527 | break; |
2528 | } |
2529 | if (seen_targets->count(attach->global_number)) { |
2530 | // We've found a cohort we have seen before... |
2531 | // We assume running the test again would result in the same, so don't bother. |
2532 | break; |
2533 | } |
2534 | // Did not successfully attach due to loop restrictions; look onwards from here |
2535 | context_stack.back().target = context_stack.back().attach_to; |
2536 | context_stack.back().unif_tags->swap(utags); |
2537 | context_stack.back().unif_sets->swap(usets); |
2538 | if (rule->dep_target->offset != 0) { |
2539 | // Temporarily set offset to +/- 1 |
2540 | rule->dep_target->offset = ((rule->dep_target->offset < 0) ? -1 : 1); |
2541 | } |
2542 | } |
2543 | else { |
2544 | break; |
2545 | } |
2546 | } |
2547 | rule->dep_target->offset = orgoffset; |
2548 | finish_reading_loop = false; |
2549 | } |
2550 | else if (rule->type == K_MOVE_AFTER || rule->type == K_MOVE_BEFORE || rule->type == K_SWITCH) { |
2551 | // this is a per-cohort rule |
2552 | finish_reading_loop = false; |
2553 | // Calculate hash of current state to later compare whether this move/switch actually did anything |
2554 | uint32_t phash = 0; |
2555 | uint32_t chash = 0; |
2556 | for (const auto& c : current.cohorts) { |
2557 | phash = hash_value(c->global_number, phash); |
2558 | chash = hash_value(c->readings[0]->hash, chash); |
2559 | } |
2560 | |
2561 | // ToDo: ** tests will not correctly work for MOVE/SWITCH; cannot move cohorts between windows |
2562 | Cohort* attach = nullptr; |
2563 | Cohort* cohort = context_stack.back().target.cohort; |
2564 | uint32_t c = cohort->local_number; |
2565 | dep_deep_seen.clear(); |
2566 | tmpl_cntx.clear(); |
2567 | context_stack.back().attach_to.cohort = nullptr; |
2568 | context_stack.back().attach_to.reading = nullptr; |
2569 | context_stack.back().attach_to.subreading = nullptr; |
2570 | if (runContextualTest(¤t, c, rule->dep_target, &attach) && attach && cohort->parent == attach->parent) { |
2571 | profileRuleContext(true, rule, rule->dep_target); |
2572 | |
2573 | if (get_attach_to().cohort) { |
2574 | attach = get_attach_to().cohort; |
2575 | } |
2576 | context_target = attach; |
2577 | bool good = true; |
2578 | for (auto it : rule->dep_tests) { |
2579 | context_stack.back().mark = attach; |
2580 | dep_deep_seen.clear(); |
2581 | tmpl_cntx.clear(); |
2582 | bool test_good = (runContextualTest(attach->parent, attach->local_number, it) != nullptr); |
2583 | |
2584 | profileRuleContext(test_good, rule, it); |
2585 | |
2586 | if (!test_good) { |
2587 | good = test_good; |
2588 | break; |
2589 | } |
2590 | } |
2591 | |
2592 | if (!good || cohort == attach || cohort->local_number == 0) { |
2593 | return; |
2594 | } |
2595 | |
2596 | swapper<Cohort*> sw((rule->flags & RF_REVERSE) != 0, attach, cohort); |
2597 | CohortSet cohorts; |
2598 | |
2599 | if (rule->type == K_SWITCH) { |
2600 | if (attach->local_number == 0) { |
2601 | return; |
2602 | } |
2603 | current.cohorts[cohort->local_number] = attach; |
2604 | current.cohorts[attach->local_number] = cohort; |
2605 | cohorts.insert(attach); |
2606 | cohorts.insert(cohort); |
2607 | auto ac_c = std::find(current.all_cohorts.begin() + cohort->local_number, current.all_cohorts.end(), cohort); |
2608 | auto ac_a = std::find(current.all_cohorts.begin() + attach->local_number, current.all_cohorts.end(), attach); |
2609 | *ac_c = attach; |
2610 | *ac_a = cohort; |
2611 | } |
2612 | else { |
2613 | CohortSet edges; |
2614 | collect_subtree(edges, attach, rule->childset2); |
2615 | collect_subtree(cohorts, cohort, rule->childset1); |
2616 | |
2617 | bool need_clean = false; |
2618 | for (auto iter : cohorts) { |
2619 | if (edges.count(iter)) { |
2620 | need_clean = true; |
2621 | break; |
2622 | } |
2623 | } |
2624 | |
2625 | if (need_clean) { |
2626 | if (isChildOf(cohort, attach)) { |
2627 | edges.erase(cohorts.rbegin(), cohorts.rend()); |
2628 | } |
2629 | else /* if (isChildOf(attach, cohort)) */ { |
2630 | cohorts.erase(edges.rbegin(), edges.rend()); |
2631 | } |
2632 | } |
2633 | if (cohorts.empty() || edges.empty()) { |
2634 | finish_reading_loop = false; |
2635 | return; |
2636 | } |
2637 | |
2638 | for (auto c : reversed(cohorts)) { |
2639 | current.cohorts.erase(current.cohorts.begin() + c->local_number); |
2640 | current.all_cohorts.erase(std::find(current.all_cohorts.begin() + c->local_number, current.all_cohorts.end(), c)); |
2641 | } |
2642 | |
2643 | foreach (iter, current.cohorts)if (!(current.cohorts).empty()) for (auto iter = (current.cohorts ).begin(), iter_end = (current.cohorts).end(); iter != iter_end ; ++iter) { |
2644 | (*iter)->local_number = UI32(std::distance(current.cohorts.begin(), iter)); |
2645 | } |
2646 | |
2647 | for (auto iter : edges) { |
2648 | if (iter->parent != get_apply_to().cohort->parent) { |
2649 | u_fprintfu_fprintf_72(ux_stderr, "Error: Move/Switch on line %u tried to move across window boundaries.\n", rule->line); |
2650 | CG3Quit(1); |
2651 | } |
2652 | for (auto cohort : cohorts) { |
2653 | if (iter == cohort) { |
2654 | u_fprintfu_fprintf_72(ux_stderr, "Error: Move/Switch on line %u tried to move to a removed position.\n", rule->line); |
2655 | CG3Quit(1); |
2656 | } |
2657 | } |
2658 | } |
2659 | |
2660 | uint32_t spot = 0; |
2661 | auto ac_spot = current.all_cohorts.begin(); |
2662 | if (rule->type == K_MOVE_BEFORE) { |
2663 | spot = edges.front()->local_number; |
2664 | if (spot == 0) { |
2665 | spot = 1; |
2666 | } |
2667 | ac_spot = std::find(current.all_cohorts.begin() + edges.front()->local_number, current.all_cohorts.end(), edges.front()); |
2668 | if ((*ac_spot)->local_number == 0) { |
2669 | ++ac_spot; |
2670 | } |
2671 | } |
2672 | else if (rule->type == K_MOVE_AFTER) { |
2673 | spot = edges.back()->local_number + 1; |
2674 | ac_spot = std::find(current.all_cohorts.begin() + edges.front()->local_number, current.all_cohorts.end(), edges.back()); |
2675 | ++ac_spot; |
2676 | } |
2677 | |
2678 | if (spot > current.cohorts.size()) { |
2679 | u_fprintfu_fprintf_72(ux_stderr, "Error: Move/Switch on line %u tried to move out of bounds.\n", rule->line); |
2680 | CG3Quit(1); |
2681 | } |
2682 | |
2683 | for (auto c : reversed(cohorts)) { |
2684 | current.cohorts.insert(current.cohorts.begin() + spot, c); |
2685 | current.all_cohorts.insert(ac_spot, c); |
2686 | } |
2687 | } |
2688 | reindex(); |
2689 | |
2690 | // Compare whether this move/switch actually did anything |
2691 | uint32_t phash_n = 0; |
2692 | uint32_t chash_n = 0; |
2693 | for (const auto& c : current.cohorts) { |
2694 | phash_n = hash_value(c->global_number, phash_n); |
2695 | chash_n = hash_value(c->readings[0]->hash, chash_n); |
2696 | } |
2697 | |
2698 | if (phash != phash_n || chash != chash_n) { |
2699 | if (++rule_hits[rule->number] > current.cohorts.size() * 100) { |
2700 | u_fprintfu_fprintf_72(ux_stderr, "Warning: Move/Switch endless loop detected for rule on line %u around input line %u - bailing out!\n", rule->line, get_apply_to().cohort->line_number); |
2701 | should_bail = true; |
2702 | finish_cohort_loop = false; |
2703 | return; |
2704 | } |
2705 | |
2706 | for (auto c : cohorts) { |
2707 | for (auto iter : c->readings) { |
2708 | iter->hit_by.push_back(rule->number); |
2709 | } |
2710 | } |
2711 | readings_changed = true; |
2712 | sorter.do_sort = true; |
2713 | } |
2714 | } |
2715 | } |
2716 | else if (rule->type == K_WITH) { |
2717 | TRACE; |
2718 | bool any_readings_changed = false; |
2719 | readings_changed = false; |
2720 | in_nested = true; |
2721 | for (auto& sr : rule->sub_rules) { |
2722 | Rule* cur_was = current_rule; |
2723 | Rule* rule_was = rule; |
2724 | current_rule = sr; |
2725 | rule = sr; |
2726 | bool result = false; |
2727 | do { |
2728 | readings_changed = false; |
2729 | result = runSingleRule(current, *rule, reading_cb, cohort_cb); |
2730 | any_readings_changed = any_readings_changed || result || readings_changed; |
2731 | } while ((result || readings_changed) && (rule->flags & RF_REPEAT) != 0) ; |
2732 | current_rule = cur_was; |
2733 | rule = rule_was; |
2734 | } |
2735 | in_nested = false; |
2736 | readings_changed = any_readings_changed; |
2737 | finish_reading_loop = false; |
2738 | } |
2739 | else if (rule->type != K_REMCOHORT) { |
2740 | TRACE; |
2741 | } |
2742 | }; |
2743 | |
2744 | removed.resize(0); |
2745 | selected.resize(0); |
2746 | bool rv = runSingleRule(current, *rule, reading_cb, cohort_cb); |
2747 | if (rv || readings_changed) { |
2748 | if (!(rule->flags & RF_NOITERATE) && section_max_count != 1) { |
2749 | section_did_something = true; |
2750 | } |
2751 | rule_did_something = true; |
2752 | } |
2753 | if (should_bail) { |
2754 | goto bailout; |
2755 | } |
2756 | if (should_repeat) { |
2757 | goto repeat_rule; |
2758 | } |
2759 | |
2760 | if (rule_did_something) { |
2761 | if (trace_rules.contains(rule->line)) { |
2762 | retval |= RV_TRACERULE; |
2763 | } |
2764 | } |
2765 | if (delimited) { |
2766 | break; |
2767 | } |
2768 | if (rule_did_something && (rule->flags & RF_REPEAT)) { |
2769 | index_ruleCohort_no.clear(); |
2770 | goto repeat_rule; |
2771 | } |
2772 | |
2773 | if (false) { |
2774 | bailout: |
2775 | rule_hits[rule->number] = 0; |
2776 | index_ruleCohort_no.clear(); |
2777 | } |
2778 | |
2779 | if (retval & RV_TRACERULE) { |
2780 | break; |
2781 | } |
2782 | } |
2783 | |
2784 | if (section_did_something) { |
2785 | retval |= RV_SOMETHING; |
2786 | } |
2787 | if (delimited) { |
2788 | retval |= RV_DELIMITED; |
2789 | } |
2790 | return retval; |
2791 | } |
2792 | |
2793 | uint32_t GrammarApplicator::runGrammarOnSingleWindow(SingleWindow& current) { |
2794 | if (!grammar->before_sections.empty() && !no_before_sections) { |
2795 | uint32_t rv = runRulesOnSingleWindow(current, runsections[-1]); |
2796 | if (rv & (RV_DELIMITED | RV_TRACERULE)) { |
2797 | return rv; |
2798 | } |
2799 | } |
2800 | |
2801 | if (!grammar->rules.empty() && !no_sections) { |
2802 | std::map<uint32_t, uint32_t> counter; |
2803 | // Caveat: This may look as if it is not recursing previous sections, but those rules are preprocessed into the successive sections so they are actually run. |
2804 | auto iter = runsections.begin(); |
2805 | auto iter_end = runsections.end(); |
2806 | for (size_t pass = 0; iter != iter_end; ++pass) { |
2807 | if (iter->first < 0 || (section_max_count && counter[iter->first] >= section_max_count)) { |
2808 | ++iter; |
2809 | continue; |
2810 | } |
2811 | uint32_t rv = 0; |
2812 | if (debug_level > 0) { |
2813 | std::cerr << "Running section " << iter->first << " (rules " << *(iter->second.begin()) << " through " << *(--(iter->second.end())) << ") on window " << current.number << std::endl; |
2814 | } |
2815 | rv = runRulesOnSingleWindow(current, iter->second); |
2816 | ++counter[iter->first]; |
2817 | if (rv & (RV_DELIMITED | RV_TRACERULE)) { |
2818 | return rv; |
2819 | } |
2820 | if (!(rv & RV_SOMETHING)) { |
2821 | ++iter; |
2822 | pass = 0; |
2823 | } |
2824 | if (pass >= 1000) { |
2825 | u_fprintfu_fprintf_72(ux_stderr, "Warning: Endless loop detected before input line %u. Window contents was:", numLines); |
2826 | UString tag; |
2827 | for (size_t i = 1; i < current.cohorts.size(); ++i) { |
2828 | Tag* t = current.cohorts[i]->wordform; |
2829 | tag.assign(t->tag.begin() + 2, t->tag.begin() + t->tag.size() - 2); |
2830 | u_fprintfu_fprintf_72(ux_stderr, " %S", tag.data()); |
2831 | } |
2832 | u_fprintfu_fprintf_72(ux_stderr, "\n"); |
2833 | u_fflushu_fflush_72(ux_stderr); |
2834 | break; |
2835 | } |
2836 | } |
2837 | } |
2838 | |
2839 | if (!grammar->after_sections.empty() && !no_after_sections) { |
2840 | uint32_t rv = runRulesOnSingleWindow(current, runsections[-2]); |
2841 | if (rv & (RV_DELIMITED | RV_TRACERULE)) { |
2842 | return rv; |
2843 | } |
2844 | } |
2845 | |
2846 | return 0; |
2847 | } |
2848 | |
// Runs the grammar on gWindow->current, including parenthesis-enclosure handling:
//  1. applies the window's pending variable sets/removals to `variables`,
//  2. reflows dependencies/relations when has_dep/has_relations are set,
//  3. hides cohorts between matching parentheses (CT_ENCLOSED) from `cohorts`,
//  4. runs the grammar, then re-exposes one enclosure level at a time and reruns,
//  5. finally puts CT_IGNORED cohorts back into `cohorts`.
// Control flow is goto-based; the labels mark the restart points.
2849 | void GrammarApplicator::runGrammarOnWindow() { |
2850 | SingleWindow* current = gWindow->current; |
2851 | did_final_enclosure = false; |
2852 | |
// Apply variable changes recorded on this window; removals are applied after sets.
2853 | for (const auto& vit : current->variables_set) { |
2854 | variables[vit.first] = vit.second; |
2855 | } |
2856 | for (auto vit : current->variables_rem) { |
2857 | variables.erase(vit); |
2858 | } |
2859 | variables[mprefix_key] = mprefix_value; |
2860 | |
2861 | if (has_dep) { |
2862 | reflowDependencyWindow(); |
// Also register the cohorts of the last look-ahead window in dep_window, by global number.
2863 | if (!input_eof && !gWindow->next.empty() && gWindow->next.back()->cohorts.size() > 1) { |
2864 | for (auto cohort : gWindow->next.back()->cohorts) { |
2865 | gWindow->dep_window[cohort->global_number] = cohort; |
2866 | } |
2867 | } |
2868 | } |
2869 | if (has_relations) { |
2870 | reflowRelationWindow(); |
2871 | } |
2872 | |
// Enclosure pass: scan cohorts from the end for a left parenthesis, find its matching
// right parenthesis, and remove the whole span from `cohorts`. The scan restarts after
// every successful hide.
2873 | if (!grammar->parentheses.empty()) { |
2874 | label_scanParentheses: |
2875 | reverse_foreach (iter, current->cohorts)if (!(current->cohorts).empty()) for (auto iter = (current ->cohorts).rbegin(), iter_end = (current->cohorts).rend (); iter != iter_end; ++iter) { |
2876 | Cohort* c = *iter; |
2877 | if (c->is_pleft == 0) { |
2878 | continue; |
2879 | } |
2880 | auto p = grammar->parentheses.find(c->is_pleft); |
2881 | if (p != grammar->parentheses.end()) { |
// Net effect of the next five statements: `right` ends up pointing at the
// left-parenthesis cohort itself. The value stored in `c` is not read again below.
2882 | auto right = iter.base(); |
2883 | --right; |
2884 | --right; |
2885 | c = *right; |
2886 | ++right; |
2887 | bool found = false; |
// Collect every cohort from the left parenthesis up to and including the matching right one.
2888 | CohortVector encs; |
2889 | for (; right != current->cohorts.end(); ++right) { |
2890 | Cohort* s = *right; |
2891 | encs.push_back(s); |
2892 | if (s->is_pright == p->second) { |
2893 | found = true; |
2894 | break; |
2895 | } |
2896 | } |
2897 | if (found) { |
// Close the gap: shift the cohorts after the right parenthesis down over the
// enclosed span, renumbering local_number as they move.
2898 | auto left = iter.base(); |
2899 | --left; |
2900 | uint32_t lc = (*left)->local_number; |
2901 | ++right; |
2902 | for (; right != current->cohorts.end(); ++right) { |
2903 | *left = *right; |
2904 | (*left)->local_number = lc; |
2905 | ++lc; |
2906 | ++left; |
2907 | } |
2908 | current->cohorts.resize(current->cohorts.size() - encs.size()); |
// Flag everything from encs.front() through encs.back() in all_cohorts as enclosed and
// bump its nesting counter. This walks all_cohorts, so cohorts that are in all_cohorts
// but not in `cohorts` are included if they lie inside that span.
2909 | auto ec = std::find(current->all_cohorts.begin() + encs.front()->local_number, current->all_cohorts.end(), encs.front()); |
2910 | --ec; |
2911 | do { |
2912 | ++ec; |
2913 | (*ec)->type |= CT_ENCLOSED; |
2914 | ++((*ec)->enclosed); |
2915 | } while (*ec != encs.back()); |
2916 | current->has_enclosures = true; |
// `cohorts` was modified while iterating, so restart the scan from scratch.
2917 | goto label_scanParentheses; |
2918 | } |
2919 | } |
2920 | } |
2921 | } |
2922 | |
2923 | par_left_tag = 0; |
2924 | par_right_tag = 0; |
2925 | par_left_pos = 0; |
2926 | par_right_pos = 0; |
2927 | uint32_t pass = 0; |
2928 | |
// Restart point: reached initially, after a delimit, and after each enclosure level is unpacked.
2929 | label_runGrammarOnWindow_begin: |
// Print and free the oldest previous windows until at most num_windows remain.
2930 | while (!gWindow->previous.empty() && gWindow->previous.size() > num_windows) { |
2931 | SingleWindow* tmp = gWindow->previous.front(); |
2932 | printSingleWindow(tmp, *ux_stdout); |
2933 | free_swindow(tmp); |
2934 | gWindow->previous.erase(gWindow->previous.begin()); |
2935 | } |
2936 | |
// Reset per-run state and re-read gWindow->current before indexing it again.
2937 | rule_hits.clear(); |
2938 | index_ruleCohort_no.clear(); |
2939 | current = gWindow->current; |
2940 | indexSingleWindow(*current); |
2941 | current->hit_external.clear(); |
2942 | gWindow->rebuildCohortLinks(); // ToDo: Hack. This can be done better... |
2943 | |
// Guard against endless restarts: after 1000 passes, warn with the window's wordforms
// and return. Note this returns without running the unpack/reinsert steps below.
2944 | ++pass; |
2945 | if (pass > 1000) { |
2946 | u_fprintfu_fprintf_72(ux_stderr, "Warning: Endless loop detected before input line %u. Window contents was:", numLines); |
2947 | UString tag; |
2948 | for (size_t i = 1; i < current->cohorts.size(); ++i) { |
2949 | Tag* t = current->cohorts[i]->wordform; |
2950 | tag.assign(t->tag.begin() + 2, t->tag.begin() + t->tag.size() - 2); |
2951 | u_fprintfu_fprintf_72(ux_stderr, " %S", tag.data()); |
2952 | } |
2953 | u_fprintfu_fprintf_72(ux_stderr, "\n"); |
2954 | u_fflushu_fflush_72(ux_stderr); |
2955 | return; |
2956 | } |
2957 | |
// With trace_encl set, record a per-pass marker (uint32 max minus pass number) in
// every reading's hit_by list.
2958 | if (trace_encl) { |
2959 | uint32_t hitpass = std::numeric_limits<uint32_t>::max() - pass; |
2960 | for (auto& c : current->cohorts) { |
2961 | for (auto rit : c->readings) { |
2962 | rit->hit_by.push_back(hitpass); |
2963 | } |
2964 | } |
2965 | } |
2966 | |
2967 | uint32_t rv = runGrammarOnSingleWindow(*current); |
// If the run reported RV_DELIMITED, start over from the restart point.
2968 | if (rv & RV_DELIMITED) { |
2969 | goto label_runGrammarOnWindow_begin; |
2970 | } |
2971 | |
// Unpack one enclosure level: find the first cohort whose nesting counter is exactly 1,
// re-expose its span in `cohorts`, then rerun the grammar (or, when RV_TRACERULE is set,
// keep unpacking without rerunning).
2972 | label_unpackEnclosures: |
2973 | if (current->has_enclosures) { |
2974 | size_t nc = current->all_cohorts.size(); |
2975 | for (size_t i = 0; i < nc; ++i) { |
2976 | Cohort* c = current->all_cohorts[i]; |
2977 | if (c->enclosed == 1) { |
// la: index in all_cohorts of the nearest preceding cohort that is not
// enclosed/removed/ignored (stays 0 if none is found); ni: its position in `cohorts`.
2978 | size_t la = i; |
2979 | for (; la > 0; --la) { |
2980 | if (!(current->all_cohorts[la - 1]->type & (CT_ENCLOSED | CT_REMOVED | CT_IGNORED))) { |
2981 | --la; |
2982 | break; |
2983 | } |
2984 | } |
2985 | size_t ni = current->all_cohorts[la]->local_number; |
2986 | |
// Walk right over the run of enclosed/removed/ignored cohorts, decrementing each
// nesting counter; ne counts those whose counter reaches 0 and so become visible.
2987 | size_t ra = i; |
2988 | size_t ne = 0; |
2989 | for (; ra < nc; ++ra) { |
2990 | if (!(current->all_cohorts[ra]->type & (CT_ENCLOSED | CT_REMOVED | CT_IGNORED))) { |
2991 | break; |
2992 | } |
2993 | --(current->all_cohorts[ra]->enclosed); |
2994 | if (current->all_cohorts[ra]->enclosed == 0) { |
2995 | current->all_cohorts[ra]->type &= ~CT_ENCLOSED; |
2996 | ++ne; |
2997 | } |
2998 | } |
2999 | |
// Grow `cohorts` by ne and shift the tail right to open a gap after position ni.
3000 | current->cohorts.resize(current->cohorts.size() + ne, nullptr); |
3001 | for (size_t j = current->cohorts.size() - 1; j > ni + ne; --j) { |
3002 | current->cohorts[j] = current->cohorts[j - ne]; |
3003 | current->cohorts[j]->local_number = UI32(j); |
3004 | current->cohorts[j - ne] = nullptr; |
3005 | } |
// Fill the gap with the cohorts whose counter is now 0. This loop advances the OUTER
// index i up to ra; j only counts the slots filled.
3006 | for (size_t j = 0; i < ra; ++i) { |
3007 | if (current->all_cohorts[i]->enclosed == 0) { |
3008 | current->cohorts[ni + j + 1] = current->all_cohorts[i]; |
3009 | current->cohorts[ni + j + 1]->local_number = UI32(ni + j + 1); |
3010 | current->cohorts[ni + j + 1]->parent = current; |
3011 | ++j; |
3012 | } |
3013 | } |
// Record the tags and positions of the span just unpacked in the par_* members.
3014 | par_left_tag = current->all_cohorts[la + 1]->is_pleft; |
3015 | par_right_tag = current->all_cohorts[ra - 1]->is_pright; |
3016 | par_left_pos = UI32(ni + 1); |
3017 | par_right_pos = UI32(ni + ne); |
3018 | if (rv & RV_TRACERULE) { |
3019 | goto label_unpackEnclosures; |
3020 | } |
3021 | goto label_runGrammarOnWindow_begin; |
3022 | } |
3023 | } |
// No cohort with counter 1 remains: clear the par_* state and do one final run
// (did_final_enclosure ensures this happens only once).
3024 | if (!did_final_enclosure) { |
3025 | par_left_tag = 0; |
3026 | par_right_tag = 0; |
3027 | par_left_pos = 0; |
3028 | par_right_pos = 0; |
3029 | did_final_enclosure = true; |
3030 | if (rv & RV_TRACERULE) { |
3031 | goto label_unpackEnclosures; |
3032 | } |
3033 | goto label_runGrammarOnWindow_begin; |
3034 | } |
3035 | } |
3036 | |
// Reinsert CT_IGNORED cohorts: walking all_cohorts backwards, put each ignored cohort
// into `cohorts` right after the nearest preceding cohort that is not
// removed/enclosed/ignored, clear its flag, and register it in the parent's cohort_map.
3037 | bool should_reflow = false; |
3038 | for (size_t i = current->all_cohorts.size(); i > 0; --i) { |
3039 | auto cohort = current->all_cohorts[i - 1]; |
3040 | if (cohort->type & CT_IGNORED) { |
3041 | for (auto ins = i; ins > 0; --ins) { |
3042 | if (!(current->all_cohorts[ins - 1]->type & (CT_REMOVED | CT_ENCLOSED | CT_IGNORED))) { |
3043 | current->cohorts.insert(current->cohorts.begin() + current->all_cohorts[ins - 1]->local_number + 1, cohort); |
3044 | cohort->type &= ~CT_IGNORED; |
3045 | current->parent->cohort_map.insert(std::make_pair(cohort->global_number, cohort)); |
3046 | should_reflow = true; |
3047 | break; |
3048 | } |
3049 | } |
3050 | } |
3051 | } |
// local_number is only renumbered here, after all insertions, followed by a dependency reflow.
3052 | if (should_reflow) { |
3053 | for (size_t i = 0; i < current->cohorts.size(); ++i) { |
3054 | current->cohorts[i]->local_number = UI32(i); |
3055 | } |
3056 | reflowDependencyWindow(); |
3057 | } |
3058 | } |
3059 | } |
3060 | |
3061 | // This helps the all_vislcg3.cpp profiling builds |
3062 | #undef TRACE |