File: GrammarApplicator_runRules.cpp
Warning: line 259, column 8: Access to field 'next' results in a dereference of a null pointer (loaded from variable 'tr')
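For context, the flagged path can be reproduced in isolation: in the negative sub_reading branch of get_sub_reading(), ttr starts as a copy of tr, so the while (ttr) loop condition tells the analyzer that tr itself may be null on entry, and the later read of tr->next at line 259 is then a dereference of that possibly-null pointer. The sketch below is a minimal standalone illustration of that pattern, not code from the project; the names Node and nth_from_end are invented for the example.

#include <cstdio>

struct Node {
	Node* next = nullptr;
};

// Mirrors the shape of the sub_reading < 0 branch in get_sub_reading().
Node* nth_from_end(Node* tr, int sub) {
	int ntr = 0;
	Node* ttr = tr;
	while (ttr) { // branching on ttr (a copy of tr) lets the analyzer assume tr may be null
		ttr = ttr->next;
		--ntr;
	}
	if (!tr->next) { // if tr was null above, this is the flagged null dereference
		tr = nullptr;
	}
	for (int i = ntr; i < sub && tr; ++i) {
		tr = tr->next;
	}
	return tr;
}

int main() {
	Node a, b;
	a.next = &b;
	std::printf("%p\n", static_cast<void*>(nth_from_end(&a, -1))); // fine
	// nth_from_end(nullptr, -1); // would crash at the line the analyzer warns about
	return 0;
}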
1 | /* | |||
2 | * Copyright (C) 2007-2024, GrammarSoft ApS | |||
3 | * Developed by Tino Didriksen <mail@tinodidriksen.com> | |||
4 | * Design by Eckhard Bick <eckhard.bick@mail.dk>, Tino Didriksen <mail@tinodidriksen.com> | |||
5 | * | |||
6 | * This program is free software: you can redistribute it and/or modify | |||
7 | * it under the terms of the GNU General Public License as published by | |||
8 | * the Free Software Foundation, either version 3 of the License, or | |||
9 | * (at your option) any later version. | |||
10 | * | |||
11 | * This program is distributed in the hope that it will be useful, | |||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
14 | * GNU General Public License for more details. | |||
15 | * | |||
16 | * You should have received a copy of the GNU General Public License | |||
17 | * along with this program. If not, see <https://www.gnu.org/licenses/>. | |||
18 | */ | |||
19 | ||||
20 | #include "GrammarApplicator.hpp" | |||
21 | #include "Strings.hpp" | |||
22 | #include "Tag.hpp" | |||
23 | #include "Grammar.hpp" | |||
24 | #include "Window.hpp" | |||
25 | #include "SingleWindow.hpp" | |||
26 | #include "Reading.hpp" | |||
27 | #include "ContextualTest.hpp" | |||
28 | #include "version.hpp" | |||
29 | #include "process.hpp" | |||
30 | ||||
31 | namespace CG3 { | |||
32 | ||||
33 | enum { | |||
34 | RV_NOTHING = 1, | |||
35 | RV_SOMETHING = 2, | |||
36 | RV_DELIMITED = 4, | |||
37 | RV_TRACERULE = 8, | |||
38 | }; | |||
39 | ||||
40 | bool GrammarApplicator::doesWordformsMatch(const Tag* cword, const Tag* rword) { | |||
41 | if (rword && rword != cword) { | |||
42 | if (rword->type & T_REGEXP) { | |||
43 | if (!doesTagMatchRegexp(cword->hash, *rword)) { | |||
44 | return false; | |||
45 | } | |||
46 | } | |||
47 | else if (rword->type & T_CASE_INSENSITIVE) { | |||
48 | if (!doesTagMatchIcase(cword->hash, *rword)) { | |||
49 | return false; | |||
50 | } | |||
51 | } | |||
52 | else { | |||
53 | return false; | |||
54 | } | |||
55 | } | |||
56 | return true; | |||
57 | } | |||
58 | ||||
59 | bool GrammarApplicator::updateRuleToCohorts(Cohort& c, const uint32_t& rsit) { | |||
60 | // Check whether this rule is in the allowed rule list from cmdline flag --rule(s) | |||
61 | if (!valid_rules.empty() && !valid_rules.contains(rsit)) { | |||
62 | return false; | |||
63 | } | |||
64 | SingleWindow* current = c.parent; | |||
65 | const Rule* r = grammar->rule_by_number[rsit]; | |||
66 | if (!doesWordformsMatch(c.wordform, r->wordform)) { | |||
67 | return false; | |||
68 | } | |||
69 | if (current->rule_to_cohorts.size() < rsit+1) { | |||
70 | indexSingleWindow(*current); | |||
71 | } | |||
72 | CohortSet& cohortset = current->rule_to_cohorts[rsit]; | |||
73 | std::vector<size_t> csi; | |||
74 | for (size_t i = 0; i < cohortsets.size(); ++i) { | |||
75 | if (cohortsets[i] != &cohortset) { | |||
76 | continue; | |||
77 | } | |||
78 | csi.push_back(i); | |||
79 | } | |||
80 | if (!csi.empty()) { | |||
81 | auto cap = cohortset.capacity(); | |||
82 | std::vector<CohortSet::const_iterator*> ends; | |||
83 | std::vector<std::pair<CohortSet::const_iterator*,Cohort*>> chs; | |||
84 | for (size_t i = 0; i < csi.size(); ++i) { | |||
85 | if (*rocits[csi[i]] == cohortset.end()) { | |||
86 | ends.push_back(rocits[csi[i]]); | |||
87 | } | |||
88 | else { | |||
89 | chs.push_back(std::pair(rocits[csi[i]], **rocits[csi[i]])); | |||
90 | } | |||
91 | } | |||
92 | cohortset.insert(&c); | |||
93 | for (auto it : ends) { | |||
94 | *it = cohortset.end(); | |||
95 | } | |||
96 | if (cap != cohortset.capacity()) { | |||
97 | for (auto& it : chs) { | |||
98 | *it.first = cohortset.find(it.second); | |||
99 | } | |||
100 | } | |||
101 | } | |||
102 | else { | |||
103 | cohortset.insert(&c); | |||
104 | } | |||
105 | return current->valid_rules.insert(rsit); | |||
106 | } | |||
107 | ||||
108 | bool GrammarApplicator::updateValidRules(const uint32IntervalVector& rules, uint32IntervalVector& intersects, const uint32_t& hash, Reading& reading) { | |||
109 | size_t os = intersects.size(); | |||
110 | auto it = grammar->rules_by_tag.find(hash); | |||
111 | if (it != grammar->rules_by_tag.end()) { | |||
112 | Cohort& c = *(reading.parent); | |||
113 | for (auto rsit : (it->second)) { | |||
114 | if (updateRuleToCohorts(c, rsit) && rules.contains(rsit)) { | |||
115 | intersects.insert(rsit); | |||
116 | } | |||
117 | } | |||
118 | } | |||
119 | return (os != intersects.size()); | |||
120 | } | |||
121 | ||||
122 | void GrammarApplicator::indexSingleWindow(SingleWindow& current) { | |||
123 | current.valid_rules.clear(); | |||
124 | current.rule_to_cohorts.resize(grammar->rule_by_number.size()); | |||
125 | for (auto& cs : current.rule_to_cohorts) { | |||
126 | cs.clear(); | |||
127 | } | |||
128 | ||||
129 | for (auto c : current.cohorts) { | |||
130 | for (uint32_t psit = 0; psit < c->possible_sets.size(); ++psit) { | |||
131 | if (c->possible_sets.test(psit) == false) { | |||
132 | continue; | |||
133 | } | |||
134 | auto rules_it = grammar->rules_by_set.find(psit); | |||
135 | if (rules_it == grammar->rules_by_set.end()) { | |||
136 | continue; | |||
137 | } | |||
138 | for (auto rsit : rules_it->second) { | |||
139 | updateRuleToCohorts(*c, rsit); | |||
140 | } | |||
141 | } | |||
142 | } | |||
143 | } | |||
144 | ||||
145 | TagList GrammarApplicator::getTagList(const Set& theSet, bool unif_mode) const { | |||
146 | TagList theTags; | |||
147 | getTagList(theSet, theTags, unif_mode); | |||
148 | return theTags; | |||
149 | } | |||
150 | ||||
151 | void GrammarApplicator::getTagList(const Set& theSet, TagList& theTags, bool unif_mode) const { | |||
152 | if (theSet.type & ST_SET_UNIFY) { | |||
153 | const auto& usets = (*context_stack.back().unif_sets)[theSet.number]; | |||
154 | const Set& pSet = *(grammar->sets_list[theSet.sets[0]]); | |||
155 | for (auto iter : pSet.sets) { | |||
156 | if (usets.count(iter)) { | |||
157 | getTagList(*(grammar->sets_list[iter]), theTags); | |||
158 | } | |||
159 | } | |||
160 | } | |||
161 | else if (theSet.type & ST_TAG_UNIFY) { | |||
162 | for (auto iter : theSet.sets) { | |||
163 | getTagList(*(grammar->sets_list[iter]), theTags, true); | |||
164 | } | |||
165 | } | |||
166 | else if (!theSet.sets.empty()) { | |||
167 | for (auto iter : theSet.sets) { | |||
168 | getTagList(*(grammar->sets_list[iter]), theTags, unif_mode); | |||
169 | } | |||
170 | } | |||
171 | else if (unif_mode) { | |||
172 | auto unif_tags = context_stack.back().unif_tags; | |||
173 | auto iter = unif_tags->find(theSet.number); | |||
174 | if (iter != unif_tags->end()) { | |||
175 | trie_getTagList(theSet.trie, theTags, iter->second); | |||
176 | trie_getTagList(theSet.trie_special, theTags, iter->second); | |||
177 | } | |||
178 | } | |||
179 | else { | |||
180 | trie_getTagList(theSet.trie, theTags); | |||
181 | trie_getTagList(theSet.trie_special, theTags); | |||
182 | } | |||
183 | // Eliminate consecutive duplicates. Not all duplicates, since AddCohort and Append may have multiple readings with repeated tags | |||
184 | for (auto ot = theTags.begin(); theTags.size() > 1 && ot != theTags.end(); ++ot) { | |||
185 | auto it = ot; | |||
186 | ++it; | |||
187 | for (; it != theTags.end() && std::distance(ot, it) == 1;) { | |||
188 | if (*ot == *it) { | |||
189 | it = theTags.erase(it); | |||
190 | } | |||
191 | else { | |||
192 | ++it; | |||
193 | } | |||
194 | } | |||
195 | } | |||
196 | } | |||
197 | ||||
198 | Reading* GrammarApplicator::get_sub_reading(Reading* tr, int sub_reading) { | |||
199 | if (sub_reading == 0) { | |||
200 | return tr; | |||
201 | } | |||
202 | ||||
203 | if (sub_reading == 32767) { | |||
204 | // If there aren't any sub-readings, the primary reading is the same as the amalgamation of all readings | |||
205 | if (tr->next == nullptr) { | |||
206 | return tr; | |||
207 | } | |||
208 | ||||
209 | subs_any.emplace_back(Reading()); | |||
210 | Reading* reading = &subs_any.back(); | |||
211 | *reading = *tr; | |||
212 | reading->next = nullptr; | |||
213 | while (tr->next) { | |||
214 | tr = tr->next; | |||
215 | reading->tags_list.push_back(0); | |||
216 | reading->tags_list.insert(reading->tags_list.end(), tr->tags_list.begin(), tr->tags_list.end()); | |||
217 | for (auto tag : tr->tags) { | |||
218 | reading->tags.insert(tag); | |||
219 | reading->tags_bloom.insert(tag); | |||
220 | } | |||
221 | for (auto tag : tr->tags_plain) { | |||
222 | reading->tags_plain.insert(tag); | |||
223 | reading->tags_plain_bloom.insert(tag); | |||
224 | } | |||
225 | for (auto tag : tr->tags_textual) { | |||
226 | reading->tags_textual.insert(tag); | |||
227 | reading->tags_textual_bloom.insert(tag); | |||
228 | } | |||
229 | reading->tags_numerical.insert(tr->tags_numerical.begin(), tr->tags_numerical.end()); | |||
230 | if (tr->mapped) { | |||
231 | reading->mapped = true; | |||
232 | } | |||
233 | if (tr->mapping) { | |||
234 | reading->mapping = tr->mapping; | |||
235 | } | |||
236 | if (tr->matched_target) { | |||
237 | reading->matched_target = true; | |||
238 | } | |||
239 | if (tr->matched_tests) { | |||
240 | reading->matched_tests = true; | |||
241 | } | |||
242 | } | |||
243 | reading->rehash(); | |||
244 | return reading; | |||
245 | } | |||
246 | ||||
247 | if (sub_reading > 0) { | |||
248 | for (int i = 0; i < sub_reading && tr; ++i) { | |||
249 | tr = tr->next; | |||
250 | } | |||
251 | } | |||
252 | else if (sub_reading < 0) { | |||
253 | int ntr = 0; | |||
254 | Reading* ttr = tr; | |||
255 | while (ttr) { | |||
256 | ttr = ttr->next; | |||
257 | --ntr; | |||
258 | } | |||
259 | if (!tr->next) { | |||
260 | tr = nullptr; | |||
261 | } | |||
262 | for (auto i = ntr; i < sub_reading && tr; ++i) { | |||
263 | tr = tr->next; | |||
264 | } | |||
265 | } | |||
266 | return tr; | |||
267 | } | |||
268 | ||||
269 | #define TRACE \ | |||
270 | do { \ | |||
271 | get_apply_to().subreading->hit_by.push_back(rule->number); \ | |||
272 | if (rule->sub_reading == 32767) { \ | |||
273 | get_apply_to().reading->hit_by.push_back(rule->number); \ | |||
274 | } \ | |||
275 | } while (0) | |||
276 | ||||
277 | #define FILL_TAG_LIST(taglist) \ | |||
278 | do { \ | |||
279 | Reading& reading = *get_apply_to().subreading; \ | |||
280 | for (auto it = (taglist)->begin(); it != (taglist)->end();) { \ | |||
281 | if (reading.tags.find((*it)->hash) == reading.tags.end()) { \ | |||
282 | auto tt = *it; \ | |||
283 | it = (taglist)->erase(it); \ | |||
284 | if (tt->type & T_SPECIAL) { \ | |||
285 | if (context_stack.back().regexgrps == nullptr) { \ | |||
286 | context_stack.back().regexgrps = ®exgrps_store[used_regex]; \ | |||
287 | } \ | |||
288 | auto stag = doesTagMatchReading(reading, *tt, false, true); \ | |||
289 | if (stag) { \ | |||
290 | (taglist)->insert(it, grammar->single_tags.find(stag)->second); \ | |||
291 | } \ | |||
292 | } \ | |||
293 | continue; \ | |||
294 | } \ | |||
295 | ++it; \ | |||
296 | } \ | |||
297 | } while (0) | |||
298 | ||||
299 | #define FILL_TAG_LIST_RAW(taglist) \ | |||
300 | do { \ | |||
301 | Reading& reading = *get_apply_to().subreading; \ | |||
302 | for (auto& tt : *(taglist)) { \ | |||
303 | if (tt->type & T_SPECIAL) { \ | |||
304 | if (context_stack.back().regexgrps == nullptr) { \ | |||
305 | context_stack.back().regexgrps = ®exgrps_store[used_regex]; \ | |||
306 | } \ | |||
307 | auto stag = doesTagMatchReading(reading, *tt, false, true); \ | |||
308 | if (stag) { \ | |||
309 | tt = grammar->single_tags.find(stag)->second; \ | |||
310 | } \ | |||
311 | } \ | |||
312 | } \ | |||
313 | } while (0) | |||
314 | ||||
315 | #define APPEND_TAGLIST_TO_READING(taglist, reading) \ | |||
316 | do { \ | |||
317 | for (auto tter : (taglist)) { \ | |||
318 | while (tter->type & T_VARSTRING) { \ | |||
319 | tter = generateVarstringTag(tter); \ | |||
320 | } \ | |||
321 | auto hash = tter->hash; \ | |||
322 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { \ | |||
323 | mappings->push_back(tter); \ | |||
324 | } \ | |||
325 | else { \ | |||
326 | hash = addTagToReading((reading), tter); \ | |||
327 | } \ | |||
328 | if (updateValidRules(rules, intersects, hash, reading)) { \ | |||
329 | iter_rules = intersects.find(rule->number); \ | |||
330 | iter_rules_end = intersects.end(); \ | |||
331 | } \ | |||
332 | } \ | |||
333 | } while (0) | |||
334 | ||||
335 | #define VARSTRINGIFY(tag) \ | |||
336 | do { \ | |||
337 | while ((tag)->type & T_VARSTRING) { \ | |||
338 | (tag) = generateVarstringTag((tag)); \ | |||
339 | } \ | |||
340 | } \ | |||
341 | while (0) | |||
342 | ||||
343 | ||||
344 | bool GrammarApplicator::runSingleRule(SingleWindow& current, const Rule& rule, RuleCallback reading_cb, RuleCallback cohort_cb) { | |||
345 | finish_cohort_loop = true; | |||
346 | bool anything_changed = false; | |||
347 | KEYWORDS type = rule.type; | |||
348 | const Set& set = *(grammar->sets_list[rule.target]); | |||
349 | CohortSet* cohortset = ¤t.rule_to_cohorts[rule.number]; | |||
350 | ||||
351 | auto override_cohortset = [&]() { | |||
352 | if (in_nested) { | |||
353 | if (!current.nested_rule_to_cohorts) { | |||
354 | current.nested_rule_to_cohorts.reset(new CohortSet()); | |||
355 | } | |||
356 | cohortset = current.nested_rule_to_cohorts.get(); | |||
357 | cohortset->clear(); | |||
358 | cohortset->insert(get_apply_to().cohort); | |||
359 | for (auto& t : set.trie_special) { | |||
360 | if (t.first->type & T_CONTEXT && t.first->context_ref_pos <= context_stack.back().context.size()) { | |||
361 | cohortset->insert(context_stack.back().context[t.first->context_ref_pos - 1]); | |||
362 | } | |||
363 | } | |||
364 | } | |||
365 | }; | |||
366 | override_cohortset(); | |||
367 | cohortsets.push_back(cohortset); | |||
368 | rocits.push_back(nullptr); | |||
369 | ||||
370 | scope_guard popper([&]() { | |||
371 | cohortsets.pop_back(); | |||
372 | rocits.pop_back(); | |||
373 | }); | |||
374 | ||||
375 | if (debug_level > 1) { | |||
376 | std::cerr << "DEBUG: " << cohortset->size() << "/" << current.cohorts.size() << " = " << double(cohortset->size()) / double(current.cohorts.size()) << std::endl; | |||
377 | } | |||
378 | for (auto rocit = cohortset->cbegin(); (!cohortset->empty()) && (rocit != cohortset->cend());) { | |||
379 | rocits.back() = &rocit; | |||
380 | Cohort* cohort = *rocit; | |||
381 | ++rocit; | |||
382 | ||||
383 | finish_reading_loop = true; | |||
384 | ||||
385 | if (debug_level > 1) { | |||
386 | std::cerr << "DEBUG: Trying cohort " << cohort->global_number << ":" << cohort->local_number << std::endl; | |||
387 | } | |||
388 | ||||
389 | // If the current cohort is the initial >>> one, skip it. | |||
390 | if (cohort->local_number == 0) { | |||
391 | continue; | |||
392 | } | |||
393 | // If the cohort is removed, skip it... | |||
394 | // Removed cohorts are still in the precalculated rule_to_cohorts map, | |||
395 | // and it would take time to go through the whole map searching for the cohort. | |||
396 | // Haven't tested whether it is worth it... | |||
397 | if (cohort->type & CT_REMOVED) { | |||
398 | continue; | |||
399 | } | |||
400 | ||||
401 | uint32_t c = cohort->local_number; | |||
402 | // If the cohort is temporarily unavailable due to parentheses, skip it. | |||
403 | if ((cohort->type & CT_ENCLOSED) || cohort->parent != ¤t) { | |||
404 | continue; | |||
405 | } | |||
406 | // If there are no readings, skip it. | |||
407 | // This is unlikely to happen as all cohorts will get a magic reading during input, | |||
408 | // and not many use the unsafe Remove rules. | |||
409 | if (cohort->readings.empty()) { | |||
410 | continue; | |||
411 | } | |||
412 | // If there's no reason to even attempt to restore, just skip it. | |||
413 | if (rule.type == K_RESTORE) { | |||
414 | if ((rule.flags & RF_DELAYED) && cohort->delayed.empty()) { | |||
415 | continue; | |||
416 | } | |||
417 | else if ((rule.flags & RF_IGNORED) && cohort->ignored.empty()) { | |||
418 | continue; | |||
419 | } | |||
420 | else if (!(rule.flags & (RF_DELAYED|RF_IGNORED)) && cohort->deleted.empty()) { | |||
421 | continue; | |||
422 | } | |||
423 | } | |||
424 | // If there is not even a remote chance the target set might match this cohort, skip it. | |||
425 | if (rule.sub_reading == 0 && (rule.target >= cohort->possible_sets.size() || !cohort->possible_sets.test(rule.target))) { | |||
426 | continue; | |||
427 | } | |||
428 | ||||
429 | // If there is only 1 reading left and it is a Select or safe Remove rule, skip it. | |||
430 | if (cohort->readings.size() == 1) { | |||
431 | if (type == K_SELECT) { | |||
432 | continue; | |||
433 | } | |||
434 | if (type == K_REMOVE || type == K_IFF) { | |||
435 | if (cohort->readings.front()->noprint) { | |||
436 | continue; | |||
437 | } | |||
438 | if ((!unsafe || (rule.flags & RF_SAFE)) && !(rule.flags & RF_UNSAFE)) { | |||
439 | continue; | |||
440 | } | |||
441 | } | |||
442 | } | |||
443 | else if (type == K_UNMAP && rule.flags & RF_SAFE) { | |||
444 | continue; | |||
445 | } | |||
446 | // If it's a Delimit rule and we're at the final cohort, skip it. | |||
447 | if (type == K_DELIMIT && c == current.cohorts.size() - 1) { | |||
448 | continue; | |||
449 | } | |||
450 | ||||
451 | // If the rule is only supposed to run inside parentheses, check whether the cohort is. | |||
452 | if (rule.flags & RF_ENCL_INNER) { | |||
453 | if (!par_left_pos) { | |||
454 | continue; | |||
455 | } | |||
456 | if (cohort->local_number < par_left_pos || cohort->local_number > par_right_pos) { | |||
457 | continue; | |||
458 | } | |||
459 | } | |||
460 | // ...and if the rule should only run outside parentheses, check whether the cohort is. | |||
461 | else if (rule.flags & RF_ENCL_OUTER) { | |||
462 | if (par_left_pos && cohort->local_number >= par_left_pos && cohort->local_number <= par_right_pos) { | |||
463 | continue; | |||
464 | } | |||
465 | } | |||
466 | ||||
467 | // If this is SETPARENT SAFE and there's already a parent, skip it. | |||
468 | if (type == K_SETPARENT && (rule.flags & RF_SAFE) && cohort->dep_parent != DEP_NO_PARENT) { | |||
469 | continue; | |||
470 | } | |||
471 | if ((rule.flags & RF_NOPARENT) && cohort->dep_parent != DEP_NO_PARENT) { | |||
472 | continue; | |||
473 | } | |||
474 | ||||
475 | // Check if on previous runs the rule did not match this cohort, and skip if that is the case. | |||
476 | // This cache is cleared if any rule causes any state change in the window. | |||
477 | uint32_t ih = hash_value(rule.number, cohort->global_number); | |||
478 | if (index_ruleCohort_no.contains(ih)) { | |||
479 | continue; | |||
480 | } | |||
481 | index_ruleCohort_no.insert(ih); | |||
482 | ||||
483 | size_t num_active = 0; | |||
484 | size_t num_iff = 0; | |||
485 | ||||
486 | std::vector<Rule_Context> reading_contexts; | |||
487 | reading_contexts.reserve(cohort->readings.size()); | |||
488 | ||||
489 | // Assume that Iff rules are really Remove rules, until proven otherwise. | |||
490 | if (rule.type == K_IFF) { | |||
491 | type = K_REMOVE; | |||
492 | } | |||
493 | ||||
494 | bool did_test = false; | |||
495 | bool test_good = false; | |||
496 | bool matched_target = false; | |||
497 | ||||
498 | clear(readings_plain); | |||
499 | clear(subs_any); | |||
500 | ||||
501 | // Varstring capture groups exist on a per-cohort basis, since we may need them for mapping later. | |||
502 | clear(regexgrps_z); | |||
503 | clear(regexgrps_c); | |||
504 | clear(unif_tags_rs); | |||
505 | clear(unif_sets_rs); | |||
506 | ||||
507 | used_regex = 0; | |||
508 | regexgrps_store.resize(std::max(regexgrps_store.size(), cohort->readings.size())); | |||
509 | regexgrps_z.reserve(std::max(regexgrps_z.size(), cohort->readings.size())); | |||
510 | regexgrps_c.reserve(std::max(regexgrps_c.size(), cohort->readings.size())); | |||
511 | ||||
512 | size_t used_unif = 0; | |||
513 | unif_tags_store.resize(std::max(unif_tags_store.size(), cohort->readings.size() + 1)); | |||
514 | unif_sets_store.resize(std::max(unif_sets_store.size(), cohort->readings.size() + 1)); | |||
515 | ||||
516 | { | |||
517 | Rule_Context context; | |||
518 | context.target.cohort = cohort; | |||
519 | context_stack.push_back(std::move(context)); | |||
520 | } | |||
521 | ||||
522 | auto reset_cohorts = [&]() { | |||
523 | cohortset = ¤t.rule_to_cohorts[rule.number]; | |||
524 | override_cohortset(); | |||
525 | cohortsets.back() = cohortset; | |||
526 | if (get_apply_to().cohort->type & CT_REMOVED) { | |||
527 | rocit = cohortset->lower_bound(current.cohorts[get_apply_to().cohort->local_number]); | |||
528 | } | |||
529 | else { | |||
530 | rocit = cohortset->find(current.cohorts[get_apply_to().cohort->local_number]); | |||
531 | if (rocit != cohortset->end()) { | |||
532 | ++rocit; | |||
533 | } | |||
534 | } | |||
535 | }; | |||
536 | ||||
537 | // Remember the current state so we can compare later to see if anything has changed | |||
538 | const size_t state_num_readings = cohort->readings.size(); | |||
539 | const size_t state_num_removed = cohort->deleted.size(); | |||
540 | const size_t state_num_delayed = cohort->delayed.size(); | |||
541 | const size_t state_num_ignored = cohort->ignored.size(); | |||
542 | ||||
543 | // This loop figures out which readings, if any, are valid targets for the current rule | |||
544 | // To be valid, a reading must match both the target and all contextual tests | |||
545 | for (size_t i = 0; i < cohort->readings.size(); ++i) { | |||
546 | // ToDo: Switch sub-readings so that they build up a passed in vector<Reading*> | |||
547 | Reading* reading = get_sub_reading(cohort->readings[i], rule.sub_reading); | |||
548 | if (!reading) { | |||
549 | cohort->readings[i]->matched_target = false; | |||
550 | cohort->readings[i]->matched_tests = false; | |||
551 | continue; | |||
552 | } | |||
553 | context_stack.back().target.reading = cohort->readings[i]; | |||
554 | context_stack.back().target.subreading = reading; | |||
555 | ||||
556 | // The state is stored in the readings themselves, so clear the old states | |||
557 | reading->matched_target = false; | |||
558 | reading->matched_tests = false; | |||
559 | ||||
560 | if (reading->mapped && (rule.type == K_MAP || rule.type == K_ADD || rule.type == K_REPLACE)) { | |||
561 | continue; | |||
562 | } | |||
563 | if (reading->mapped && (rule.flags & RF_NOMAPPED)) { | |||
564 | continue; | |||
565 | } | |||
566 | if (reading->noprint && !allow_magic_readings) { | |||
567 | continue; | |||
568 | } | |||
569 | if (reading->immutable && rule.type != K_UNPROTECT) { | |||
570 | if (type == K_SELECT) { | |||
571 | reading->matched_target = true; | |||
572 | reading->matched_tests = true; | |||
573 | reading_contexts.push_back(context_stack.back()); | |||
574 | } | |||
575 | ++num_active; | |||
576 | ++num_iff; | |||
577 | continue; | |||
578 | } | |||
579 | ||||
580 | // Check if any previous reading of this cohort had the same plain signature, and if so just copy their results | |||
581 | // This cache is cleared on a per-cohort basis | |||
582 | did_test = false; | |||
583 | if (!(set.type & (ST_SPECIAL | ST_MAPPING | ST_CHILD_UNIFY)) && !readings_plain.empty()) { | |||
584 | auto rpit = readings_plain.find(reading->hash_plain); | |||
585 | if (rpit != readings_plain.end()) { | |||
586 | reading->matched_target = rpit->second->matched_target; | |||
587 | reading->matched_tests = rpit->second->matched_tests; | |||
588 | if (reading->matched_tests) { | |||
589 | ++num_active; | |||
590 | } | |||
591 | if (regexgrps_c.count(rpit->second->number)) { | |||
592 | regexgrps_c[reading->number]; | |||
593 | regexgrps_c[reading->number] = regexgrps_c[rpit->second->number]; | |||
594 | regexgrps_z[reading->number]; | |||
595 | regexgrps_z[reading->number] = regexgrps_z[rpit->second->number]; | |||
596 | ||||
597 | context_stack.back().regexgrp_ct = regexgrps_z[reading->number]; | |||
598 | context_stack.back().regexgrps = regexgrps_c[reading->number]; | |||
599 | } | |||
600 | context_stack.back().unif_tags = unif_tags_rs[reading->hash_plain]; | |||
601 | context_stack.back().unif_sets = unif_sets_rs[reading->hash_plain]; | |||
602 | did_test = true; | |||
603 | test_good = rpit->second->matched_tests; | |||
604 | reading_contexts.push_back(context_stack.back()); | |||
605 | continue; | |||
606 | } | |||
607 | } | |||
608 | ||||
609 | // Regex capture is done on a per-reading basis, so clear all captured state. | |||
610 | context_stack.back().regexgrp_ct = 0; | |||
611 | context_stack.back().regexgrps = ®exgrps_store[used_regex]; | |||
612 | ||||
613 | // Unification is done on a per-reading basis, so clear all unification state. | |||
614 | context_stack.back().unif_tags = &unif_tags_store[used_unif]; | |||
615 | context_stack.back().unif_sets = &unif_sets_store[used_unif]; | |||
616 | unif_tags_rs[reading->hash_plain] = context_stack.back().unif_tags; | |||
617 | unif_sets_rs[reading->hash_plain] = context_stack.back().unif_sets; | |||
618 | unif_tags_rs[reading->hash] = context_stack.back().unif_tags; | |||
619 | unif_sets_rs[reading->hash] = context_stack.back().unif_sets; | |||
620 | ++used_unif; | |||
621 | ||||
622 | context_stack.back().unif_tags->clear(); | |||
623 | context_stack.back().unif_sets->clear(); | |||
624 | ||||
625 | unif_last_wordform = 0; | |||
626 | unif_last_baseform = 0; | |||
627 | unif_last_textual = 0; | |||
628 | ||||
629 | same_basic = reading->hash_plain; | |||
630 | rule_target = context_target = nullptr; | |||
631 | if (context_stack.size() > 1) { | |||
632 | Cohort* m = context_stack[context_stack.size()-2].mark; | |||
633 | if (m) set_mark(m); | |||
634 | else set_mark(cohort); | |||
635 | } | |||
636 | else { | |||
637 | set_mark(cohort); | |||
638 | } | |||
639 | uint8_t orz = context_stack.back().regexgrp_ct; | |||
640 | for (auto r = cohort->readings[i]; r; r = r->next) { | |||
641 | r->active = true; | |||
642 | } | |||
643 | if (rule.line == 2746) { | |||
644 | cohort = cohort; | |||
645 | } | |||
646 | rule_target = cohort; | |||
647 | // Actually check if the reading is a valid target. First check if rule target matches... | |||
648 | if (rule.target && doesSetMatchReading(*reading, rule.target, (set.type & (ST_CHILD_UNIFY | ST_SPECIAL)) != 0)) { | |||
649 | if (rule.line == 2746) { | |||
650 | cohort = cohort; | |||
651 | } | |||
652 | bool regex_prop = true; | |||
653 | if (orz != context_stack.back().regexgrp_ct) { | |||
654 | did_test = false; | |||
655 | regex_prop = false; | |||
656 | } | |||
657 | rule_target = context_target = cohort; | |||
658 | reading->matched_target = true; | |||
659 | matched_target = true; | |||
660 | bool good = true; | |||
661 | // If we didn't already run the contextual tests, run them now. | |||
662 | if (!did_test) { | |||
663 | context_stack.back().context.clear(); | |||
664 | foreach (it, rule.tests) { | |||
665 | ContextualTest* test = *it; | |||
666 | if (rule.flags & RF_RESETX || !(rule.flags & RF_REMEMBERX)) { | |||
667 | set_mark(cohort); | |||
668 | } | |||
669 | seen_barrier = false; | |||
670 | // Keeps track of where we have been, to prevent infinite recursion in trees with loops | |||
671 | dep_deep_seen.clear(); | |||
672 | // Reset the counters for which types of CohortIterator we have in play | |||
673 | std::fill(ci_depths.begin(), ci_depths.end(), UI32(0)); | |||
674 | tmpl_cntx.clear(); | |||
675 | // Run the contextual test... | |||
676 | Cohort* next_test = nullptr; | |||
677 | Cohort* result = nullptr; | |||
678 | Cohort** deep = nullptr; | |||
679 | if (rule.type == K_WITH) { | |||
680 | deep = &result; | |||
681 | merge_with = nullptr; | |||
682 | } | |||
683 | if (!(test->pos & POS_PASS_ORIGIN) && (no_pass_origin || (test->pos & POS_NO_PASS_ORIGIN))) { | |||
684 | next_test = runContextualTest(¤t, c, test, deep, cohort); | |||
685 | } | |||
686 | else { | |||
687 | next_test = runContextualTest(¤t, c, test, deep); | |||
688 | } | |||
689 | context_stack.back().context.push_back(merge_with ? merge_with : result); | |||
690 | test_good = (next_test != nullptr); | |||
691 | ||||
692 | profileRuleContext(test_good, &rule, test); | |||
693 | ||||
694 | if (!test_good) { | |||
695 | good = test_good; | |||
696 | if (it != rule.tests.begin() && !(rule.flags & RF_KEEPORDER)) { | |||
697 | rule.tests.erase(it); | |||
698 | rule.tests.push_front(test); | |||
699 | } | |||
700 | break; | |||
701 | } | |||
702 | did_test = ((set.type & (ST_CHILD_UNIFY | ST_SPECIAL)) == 0 && context_stack.back().unif_tags->empty() && context_stack.back().unif_sets->empty()); | |||
703 | } | |||
704 | } | |||
705 | else { | |||
706 | good = test_good; | |||
707 | } | |||
708 | if (good) { | |||
709 | // We've found a match, so Iff should be treated as Select instead of Remove | |||
710 | if (rule.type == K_IFF && type != K_SELECT) { | |||
711 | type = K_SELECT; | |||
712 | if (grammar->has_protect) { | |||
713 | for (size_t j = 0; j < i; ++j) { | |||
714 | Reading* reading = get_sub_reading(cohort->readings[j], rule.sub_reading); | |||
715 | if (reading && reading->immutable) { | |||
716 | reading->matched_target = true; | |||
717 | reading->matched_tests = true; | |||
718 | ++num_active; | |||
719 | ++num_iff; | |||
720 | } | |||
721 | } | |||
722 | } | |||
723 | } | |||
724 | reading->matched_tests = true; | |||
725 | ++num_active; | |||
726 | if (profiler) { | |||
727 | Profiler::Key k{ET_RULE, rule.number + 1 }; | |||
728 | auto& r = profiler->entries[k]; | |||
729 | ++r.num_match; | |||
730 | if (!r.example_window) { | |||
731 | addProfilingExample(r); | |||
732 | } | |||
733 | } | |||
734 | if (!debug_rules.empty() && debug_rules.contains(rule.line)) { | |||
735 | printDebugRule(rule); | |||
736 | } | |||
737 | ||||
738 | if (regex_prop && i && !regexgrps_c.empty()) { | |||
739 | for (auto z = i; z > 0; --z) { | |||
740 | auto it = regexgrps_c.find(cohort->readings[z - 1]->number); | |||
741 | if (it != regexgrps_c.end()) { | |||
742 | regexgrps_c.insert(std::make_pair(reading->number, it->second)); | |||
743 | regexgrps_z.insert(std::make_pair(reading->number, regexgrps_z.find(cohort->readings[z - 1]->number)->second)); | |||
744 | break; | |||
745 | } | |||
746 | } | |||
747 | } | |||
748 | } | |||
749 | else { | |||
750 | context_stack.back().regexgrp_ct = orz; | |||
751 | if (!debug_rules.empty() && debug_rules.contains(rule.line)) { | |||
752 | printDebugRule(rule, true, false); | |||
753 | } | |||
754 | } | |||
755 | ++num_iff; | |||
756 | } | |||
757 | else { | |||
758 | context_stack.back().regexgrp_ct = orz; | |||
759 | if (profiler) { | |||
760 | Profiler::Key k{ ET_RULE, rule.number + 1 }; | |||
761 | ++profiler->entries[k].num_fail; | |||
762 | } | |||
763 | if (!debug_rules.empty() && debug_rules.contains(rule.line)) { | |||
764 | printDebugRule(rule, false, false); | |||
765 | } | |||
766 | } | |||
767 | readings_plain.insert(std::make_pair(reading->hash_plain, reading)); | |||
768 | for (auto r = cohort->readings[i]; r; r = r->next) { | |||
769 | r->active = false; | |||
770 | } | |||
771 | ||||
772 | if (reading != cohort->readings[i]) { | |||
773 | cohort->readings[i]->matched_target = reading->matched_target; | |||
774 | cohort->readings[i]->matched_tests = reading->matched_tests; | |||
775 | } | |||
776 | if (context_stack.back().regexgrp_ct) { | |||
777 | regexgrps_c[reading->number] = context_stack.back().regexgrps; | |||
778 | regexgrps_z[reading->number] = context_stack.back().regexgrp_ct; | |||
779 | ++used_regex; | |||
780 | } | |||
781 | reading_contexts.push_back(context_stack.back()); | |||
782 | } | |||
783 | ||||
784 | if (state_num_readings != cohort->readings.size() || state_num_removed != cohort->deleted.size() || state_num_delayed != cohort->delayed.size() || state_num_ignored != cohort->ignored.size()) { | |||
785 | anything_changed = true; | |||
786 | cohort->type &= ~CT_NUM_CURRENT; | |||
787 | } | |||
788 | ||||
789 | // If none of the readings were valid targets, remove this cohort from the rule's possible cohorts. | |||
790 | if (num_active == 0 && (num_iff == 0 || rule.type != K_IFF)) { | |||
791 | if (!matched_target) { | |||
792 | --rocit; // We have already incremented rocit earlier, so take one step back... | |||
793 | rocit = cohortset->erase(rocit); // ...and one step forward again | |||
794 | } | |||
795 | context_stack.pop_back(); | |||
796 | continue; | |||
797 | } | |||
798 | ||||
799 | // All readings were valid targets, which means there is nothing to do for Select or safe Remove rules. | |||
800 | if (num_active == cohort->readings.size()) { | |||
801 | if (type == K_SELECT) { | |||
802 | context_stack.pop_back(); | |||
803 | continue; | |||
804 | } | |||
805 | if (type == K_REMOVE && (!unsafe || (rule.flags & RF_SAFE)) && !(rule.flags & RF_UNSAFE)) { | |||
806 | context_stack.pop_back(); | |||
807 | continue; | |||
808 | } | |||
809 | } | |||
810 | ||||
811 | for (auto& ctx : reading_contexts) { | |||
812 | if (!ctx.target.subreading->matched_target) { | |||
813 | continue; | |||
814 | } | |||
815 | if (!ctx.target.subreading->matched_tests && rule.type != K_IFF) { | |||
816 | continue; | |||
817 | } | |||
818 | context_stack.back() = ctx; | |||
819 | reset_cohorts_for_loop = false; | |||
820 | reading_cb(); | |||
821 | if (!finish_cohort_loop) { | |||
822 | context_stack.pop_back(); | |||
823 | return anything_changed; | |||
824 | } | |||
825 | if (reset_cohorts_for_loop) { | |||
826 | reset_cohorts(); | |||
827 | break; | |||
828 | } | |||
829 | if (!finish_reading_loop) { | |||
830 | break; | |||
831 | } | |||
832 | } | |||
833 | ||||
834 | reset_cohorts_for_loop = false; | |||
835 | cohort_cb(); | |||
836 | if (!finish_cohort_loop) { | |||
837 | context_stack.pop_back(); | |||
838 | return anything_changed; | |||
839 | } | |||
840 | if (reset_cohorts_for_loop) { | |||
841 | reset_cohorts(); | |||
842 | } | |||
843 | context_stack.pop_back(); | |||
844 | } | |||
845 | return anything_changed; | |||
846 | } | |||
847 | ||||
848 | /** | |||
849 | * Applies the passed rules to the passed SingleWindow. | |||
850 | * | |||
851 | * This function is called at least N*M times, where N is the number of sections in the grammar and M is the number of windows in the input. | |||
852 | * Possibly many more times, since if a section changes the state of the window, the section is run again. | |||
853 | * Only when no further changes are caused at a level does it progress to the next level. | |||
854 | * | |||
855 | * The loops in this function are increasingly explosive, despite efforts to contain them. | |||
856 | * In the https://visl.sdu.dk/cg3_performance.html test data, this function is called 1015 times. | |||
857 | * The first loop (rules) is executed 3101728 times. | |||
858 | * The second loop (cohorts) is executed 11087278 times. | |||
859 | * The third loop (finding readings) is executed 11738927 times; of these, 1164585 (10%) match the rule target. | |||
860 | * The fourth loop (contextual test) is executed 1184009 times; of those, 1156322 (97%) fail their contexts. | |||
861 | * The fifth loop (acting on readings) is executed 41540 times. | |||
862 | * | |||
863 | * @param[in,out] current The window to apply rules on | |||
864 | * @param[in] rules The rules to apply | |||
865 | */ | |||
866 | uint32_t GrammarApplicator::runRulesOnSingleWindow(SingleWindow& current, const uint32IntervalVector& rules) { | |||
867 | uint32_t retval = RV_NOTHING; | |||
868 | bool section_did_something = false; | |||
869 | bool delimited = false; | |||
870 | ||||
871 | // ToDo: Now that numbering is used, can't this be made a normal max? Hm, maybe not since --sections can still force another order...but if we're smart, then we re-enumerate rules based on --sections | |||
872 | uint32IntervalVector intersects = current.valid_rules.intersect(rules); | |||
873 | ReadingList removed; | |||
874 | ReadingList selected; | |||
875 | ||||
876 | if (debug_level > 1) { | |||
877 | std::cerr << "DEBUG: Trying window " << current.number << std::endl; | |||
878 | } | |||
879 | ||||
880 | current.parent->cohort_map[0] = current.cohorts.front(); | |||
881 | ||||
882 | foreach (iter_rules, intersects) { | |||
883 | // Conditionally re-sort the rule-to-cohort mapping when the current rule is finished, regardless of how it finishes | |||
884 | struct Sorter { | |||
885 | SingleWindow& current; | |||
886 | bool do_sort = false; | |||
887 | ||||
888 | Sorter(SingleWindow& current) | |||
889 | : current(current) | |||
890 | {} | |||
891 | ||||
892 | ~Sorter() { | |||
893 | if (do_sort) { | |||
894 | for (auto& cs : current.rule_to_cohorts) { | |||
895 | cs.sort(); | |||
896 | } | |||
897 | } | |||
898 | } | |||
899 | } sorter(current); | |||
900 | ||||
901 | repeat_rule: | |||
902 | bool rule_did_something = false; | |||
903 | uint32_t j = (*iter_rules); | |||
904 | ||||
905 | // Check whether this rule is in the allowed rule list from cmdline flag --rule(s) | |||
906 | if (!valid_rules.empty() && !valid_rules.contains(j)) { | |||
907 | continue; | |||
908 | } | |||
909 | ||||
910 | current_rule = grammar->rule_by_number[j]; | |||
911 | Rule* rule = grammar->rule_by_number[j]; | |||
912 | if (rule->type == K_IGNORE) { | |||
913 | continue; | |||
914 | } | |||
915 | if (debug_level > 1) { | |||
916 | std::cerr << "DEBUG: Trying rule " << rule->line << std::endl; | |||
917 | } | |||
918 | ||||
919 | if (!apply_mappings && (rule->type == K_MAP || rule->type == K_ADD || rule->type == K_REPLACE)) { | |||
920 | continue; | |||
921 | } | |||
922 | if (!apply_corrections && (rule->type == K_SUBSTITUTE || rule->type == K_APPEND)) { | |||
923 | continue; | |||
924 | } | |||
925 | // If there are parentheses and the rule is marked to only run on the final pass, skip it unless this is that pass. | |||
926 | if (current.has_enclosures) { | |||
927 | if ((rule->flags & RF_ENCL_FINAL) && !did_final_enclosure) { | |||
928 | continue; | |||
929 | } | |||
930 | if (did_final_enclosure && !(rule->flags & RF_ENCL_FINAL)) { | |||
931 | continue; | |||
932 | } | |||
933 | } | |||
934 | ||||
935 | bool readings_changed = false; | |||
936 | bool should_repeat = false; | |||
937 | bool should_bail = false; | |||
938 | ||||
939 | auto reindex = [&](SingleWindow* which = nullptr) { | |||
940 | if (!which) { | |||
941 | which = ¤t; | |||
942 | } | |||
943 | foreach (iter, which->cohorts) { | |||
944 | (*iter)->local_number = UI32(std::distance(which->cohorts.begin(), iter)); | |||
945 | } | |||
946 | gWindow->rebuildCohortLinks(); | |||
947 | }; | |||
948 | ||||
949 | auto collect_subtree = [&](CohortSet& cs, Cohort* head, uint32_t cset) { | |||
950 | if (cset) { | |||
951 | for (auto iter : current.cohorts) { | |||
952 | // Always consider the initial cohort a match | |||
953 | if (iter->global_number == head->global_number) { | |||
954 | cs.insert(iter); | |||
955 | } | |||
956 | else if (iter->dep_parent == head->global_number && doesSetMatchCohortNormal(*iter, cset)) { | |||
957 | cs.insert(iter); | |||
958 | } | |||
959 | } | |||
960 | CohortSet more; | |||
961 | for (auto iter : current.cohorts) { | |||
962 | for (auto cht : cs) { | |||
963 | // Do not grab the whole tree from the root, in case WithChild is not (*) | |||
964 | if (cht->global_number == head->global_number) { | |||
965 | continue; | |||
966 | } | |||
967 | if (isChildOf(iter, cht)) { | |||
968 | more.insert(iter); | |||
969 | } | |||
970 | } | |||
971 | } | |||
972 | cs.insert(more.begin(), more.end()); | |||
973 | } | |||
974 | else { | |||
975 | cs.insert(head); | |||
976 | } | |||
977 | }; | |||
978 | ||||
979 | auto add_cohort = [&](Cohort* cohort, size_t& spacesInAddedWf) { | |||
980 | Cohort* cCohort = alloc_cohort(¤t); | |||
981 | cCohort->global_number = gWindow->cohort_counter++; | |||
982 | ||||
983 | Tag* wf = nullptr; | |||
984 | std::vector<TagList> readings; | |||
985 | auto theTags = ss_taglist.get(); | |||
986 | getTagList(*rule->maplist, theTags); | |||
987 | ||||
988 | for (auto& tter : *theTags) { | |||
989 | if (tter->type & T_VSTR) { | |||
990 | VARSTRINGIFY(tter); | |||
991 | } | |||
992 | } | |||
993 | ||||
994 | for (auto tter : *theTags) { | |||
995 | if(tter->type & T_WORDFORM) { | |||
996 | spacesInAddedWf = std::count_if(tter->tag.begin(), tter->tag.end(), [](UChar c){ return c == ' '; }); | |||
997 | } | |||
998 | VARSTRINGIFY(tter); | |||
999 | if (tter->type & T_WORDFORM) { | |||
1000 | cCohort->wordform = tter; | |||
1001 | wf = tter; | |||
1002 | continue; | |||
1003 | } | |||
1004 | if (!wf) { | |||
1005 | u_fprintf(ux_stderr, "Error: There must be a wordform before any other tags in ADDCOHORT/MERGECOHORTS on line %u before input line %u.\n", rule->line, numLines); | |||
1006 | CG3Quit(1); | |||
1007 | } | |||
1008 | if (tter->type & T_BASEFORM) { | |||
1009 | readings.resize(readings.size() + 1); | |||
1010 | readings.back().push_back(wf); | |||
1011 | } | |||
1012 | if (readings.empty()) { | |||
1013 | u_fprintf(ux_stderr, "Error: There must be a baseform after the wordform in ADDCOHORT/MERGECOHORTS on line %u before input line %u.\n", rule->line, numLines); | |||
1014 | CG3Quit(1); | |||
1015 | } | |||
1016 | readings.back().push_back(tter); | |||
1017 | } | |||
1018 | ||||
1019 | for (auto& tags : readings) { | |||
1020 | for (size_t i = 0; i < tags.size(); ++i) { | |||
1021 | if (tags[i]->hash == grammar->tag_any) { | |||
1022 | auto& nt = cohort->readings.front()->tags_list; | |||
1023 | if (nt.size() <= 2) { | |||
1024 | continue; | |||
1025 | } | |||
1026 | tags.reserve(tags.size() + nt.size() - 2); | |||
1027 | tags[i] = grammar->single_tags[nt[2]]; | |||
1028 | for (size_t j = 3, k = 1; j < nt.size(); ++j) { | |||
1029 | if (grammar->single_tags[nt[j]]->type & T_DEPENDENCY) { | |||
1030 | continue; | |||
1031 | } | |||
1032 | tags.insert(tags.begin() + i + k, grammar->single_tags[nt[j]]); | |||
1033 | ++k; | |||
1034 | } | |||
1035 | } | |||
1036 | } | |||
1037 | } | |||
1038 | ||||
1039 | for (auto& rit : readings) { | |||
1040 | Reading* cReading = alloc_reading(cCohort); | |||
1041 | ++numReadings; | |||
1042 | insert_if_exists(cReading->parent->possible_sets, grammar->sets_any); | |||
1043 | cReading->hit_by.push_back(rule->number); | |||
1044 | cReading->noprint = false; | |||
1045 | TagList mappings; | |||
1046 | for (auto tter : rit) { | |||
1047 | uint32_t hash = tter->hash; | |||
1048 | VARSTRINGIFY(tter); | |||
1049 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { | |||
1050 | mappings.push_back(tter); | |||
1051 | } | |||
1052 | else { | |||
1053 | hash = addTagToReading(*cReading, hash); | |||
1054 | } | |||
1055 | if (updateValidRules(rules, intersects, hash, *cReading)) { | |||
1056 | iter_rules = intersects.find(rule->number); | |||
1057 | iter_rules_end = intersects.end(); | |||
1058 | } | |||
1059 | } | |||
1060 | if (!mappings.empty()) { | |||
1061 | splitMappings(mappings, *cCohort, *cReading); | |||
1062 | } | |||
1063 | cCohort->appendReading(cReading); | |||
1064 | } | |||
1065 | ||||
1066 | current.parent->cohort_map[cCohort->global_number] = cCohort; | |||
1067 | current.parent->dep_window[cCohort->global_number] = cCohort; | |||
1068 | if (grammar->addcohort_attach && (rule->type == K_ADDCOHORT_BEFORE || rule->type == K_ADDCOHORT_AFTER)) { | |||
1069 | attachParentChild(*cohort, *cCohort); | |||
1070 | } | |||
1071 | ||||
1072 | if (cCohort->readings.empty()) { | |||
1073 | initEmptyCohort(*cCohort); | |||
1074 | if (trace) { | |||
1075 | auto r = cCohort->readings.front(); | |||
1076 | r->hit_by.push_back(rule->number); | |||
1077 | r->noprint = false; | |||
1078 | } | |||
1079 | } | |||
1080 | ||||
1081 | CohortSet cohorts; | |||
1082 | collect_subtree(cohorts, cohort, rule->childset1); | |||
1083 | ||||
1084 | if (rule->type == K_ADDCOHORT_BEFORE) { | |||
1085 | current.cohorts.insert(current.cohorts.begin() + cohorts.front()->local_number, cCohort); | |||
1086 | current.all_cohorts.insert(std::find(current.all_cohorts.begin() + cohorts.front()->local_number, current.all_cohorts.end(), cohorts.front()), cCohort); | |||
1087 | } | |||
1088 | else { | |||
1089 | current.cohorts.insert(current.cohorts.begin() + cohorts.back()->local_number + 1, cCohort); | |||
1090 | current.all_cohorts.insert(std::find(current.all_cohorts.begin() + cohorts.back()->local_number, current.all_cohorts.end(), cohorts.back()) + 1, cCohort); | |||
1091 | } | |||
1092 | ||||
1093 | foreach (iter, current.cohorts) { | |||
1094 | (*iter)->local_number = UI32(std::distance(current.cohorts.begin(), iter)); | |||
1095 | } | |||
1096 | gWindow->rebuildCohortLinks(); | |||
1097 | ||||
1098 | return cCohort; | |||
1099 | }; | |||
1100 | ||||
1101 | auto rem_cohort = [&](Cohort* cohort) { | |||
1102 | auto& current = *cohort->parent; | |||
1103 | for (auto iter : cohort->readings) { | |||
1104 | iter->hit_by.push_back(rule->number); | |||
1105 | iter->deleted = true; | |||
1106 | if (trace) { | |||
1107 | iter->noprint = false; | |||
1108 | } | |||
1109 | } | |||
1110 | // Remove the cohort from all rules | |||
1111 | for (auto& cs : current.rule_to_cohorts) { | |||
1112 | cs.erase(cohort); | |||
1113 | } | |||
1114 | // Forward all children of this cohort to the parent of this cohort | |||
1115 | // ToDo: Named relations must be erased | |||
1116 | while (!cohort->dep_children.empty()) { | |||
1117 | uint32_t ch = cohort->dep_children.back(); | |||
1118 | if (cohort->dep_parent == DEP_NO_PARENT) { | |||
1119 | attachParentChild(*gWindow->cohort_map[0], *gWindow->cohort_map[ch], true, true); | |||
1120 | } | |||
1121 | else { | |||
1122 | attachParentChild(*gWindow->cohort_map[cohort->dep_parent], *gWindow->cohort_map[ch], true, true); | |||
1123 | } | |||
1124 | cohort->dep_children.erase(ch); | |||
1125 | } | |||
1126 | cohort->type |= CT_REMOVED; | |||
1127 | cohort->detach(); | |||
1128 | for (auto& cm : gWindow->cohort_map) { | |||
1129 | cm.second->dep_children.erase(cohort->dep_self); | |||
1130 | } | |||
1131 | gWindow->cohort_map.erase(cohort->global_number); | |||
1132 | current.cohorts.erase(current.cohorts.begin() + cohort->local_number); | |||
1133 | foreach (iter, current.cohorts) { | |||
1134 | (*iter)->local_number = UI32(std::distance(current.cohorts.begin(), iter)); | |||
1135 | } | |||
1136 | ||||
1137 | if (current.cohorts.size() == 1 && ¤t != gWindow->current) { | |||
1138 | // This window is now empty, so remove it entirely from consideration so rules can look past it | |||
1139 | cohort = current.cohorts[0]; | |||
1140 | ||||
1141 | // Remove the cohort from all rules | |||
1142 | for (auto& cs : current.rule_to_cohorts) { | |||
1143 | cs.erase(cohort); | |||
1144 | } | |||
1145 | cohort->detach(); | |||
1146 | for (auto& cm : gWindow->cohort_map) { | |||
1147 | cm.second->dep_children.erase(cohort->dep_self); | |||
1148 | } | |||
1149 | gWindow->cohort_map.erase(cohort->global_number); | |||
1150 | free_cohort(cohort); | |||
1151 | ||||
1152 | if (current.previous) { | |||
1153 | current.previous->text += current.text + current.text_post; | |||
1154 | current.previous->all_cohorts.insert(current.previous->all_cohorts.end(), current.all_cohorts.begin() + 1, current.all_cohorts.end()); | |||
1155 | } | |||
1156 | else if (current.next) { | |||
1157 | current.next->text = current.text_post + current.next->text; | |||
1158 | current.next->all_cohorts.insert(current.next->all_cohorts.begin() + 1, current.all_cohorts.begin() + 1, current.all_cohorts.end()); | |||
1159 | } | |||
1160 | current.all_cohorts.clear(); | |||
1161 | ||||
1162 | for (size_t i = 0; i < gWindow->previous.size(); ++i) { | |||
1163 | if (gWindow->previous[i] == ¤t) { | |||
1164 | free_swindow(gWindow->previous[i]); | |||
1165 | gWindow->previous.erase(gWindow->previous.begin() + i); | |||
1166 | break; | |||
1167 | } | |||
1168 | } | |||
1169 | for (size_t i = 0; i < gWindow->next.size(); ++i) { | |||
1170 | if (gWindow->next[i] == ¤t) { | |||
1171 | free_swindow(gWindow->next[i]); | |||
1172 | gWindow->next.erase(gWindow->next.begin() + i); | |||
1173 | break; | |||
1174 | } | |||
1175 | } | |||
1176 | ||||
1177 | gWindow->rebuildSingleWindowLinks(); | |||
1178 | } | |||
1179 | ||||
1180 | gWindow->rebuildCohortLinks(); | |||
1181 | }; | |||
1182 | ||||
1183 | auto ignore_cohort = [&](Cohort* cohort) { | |||
1184 | auto& current = *cohort->parent; | |||
1185 | for (auto iter : cohort->readings) { | |||
1186 | iter->hit_by.push_back(rule->number); | |||
1187 | } | |||
1188 | for (auto& cs : current.rule_to_cohorts) { | |||
1189 | cs.erase(cohort); | |||
1190 | } | |||
1191 | cohort->type |= CT_IGNORED; | |||
1192 | cohort->detach(); | |||
1193 | gWindow->cohort_map.erase(cohort->global_number); | |||
1194 | current.cohorts.erase(current.cohorts.begin() + cohort->local_number); | |||
1195 | }; | |||
1196 | ||||
1197 | auto make_relation_rtag = [&](Tag* tag, uint32_t id) { | |||
1198 | UChar tmp[256] = { 0 }; | |||
1199 | u_sprintf(tmp, "R:%S:%u", tag->tag.data(), id); | |||
1200 | auto nt = addTag(tmp); | |||
1201 | return nt; | |||
1202 | }; | |||
1203 | ||||
1204 | auto add_relation_rtag = [&](Cohort* cohort, Tag* tag, uint32_t id) { | |||
1205 | auto nt = make_relation_rtag(tag, id); | |||
1206 | for (auto& r : cohort->readings) { | |||
1207 | addTagToReading(*r, nt); | |||
1208 | } | |||
1209 | }; | |||
1210 | ||||
1211 | auto set_relation_rtag = [&](Cohort* cohort, Tag* tag, uint32_t id) { | |||
1212 | auto nt = make_relation_rtag(tag, id); | |||
1213 | for (auto& r : cohort->readings) { | |||
1214 | for (auto it = r->tags_list.begin(); it != r->tags_list.end();) { | |||
1215 | const auto& utag = grammar->single_tags[*it]->tag; | |||
1216 | if (utag[0] == 'R' && utag[1] == ':' && utag.size() > 2 + tag->tag.size() && utag[2 + tag->tag.size()] == ':' && utag.compare(2, tag->tag.size(), tag->tag) == 0) { | |||
1217 | r->tags.erase(*it); | |||
1218 | r->tags_textual.erase(*it); | |||
1219 | r->tags_numerical.erase(*it); | |||
1220 | r->tags_plain.erase(*it); | |||
1221 | it = r->tags_list.erase(it); | |||
1222 | } | |||
1223 | else { | |||
1224 | ++it; | |||
1225 | } | |||
1226 | } | |||
1227 | addTagToReading(*r, nt); | |||
1228 | } | |||
1229 | }; | |||
1230 | ||||
1231 | auto rem_relation_rtag = [&](Cohort* cohort, Tag* tag, uint32_t id) { | |||
1232 | auto nt = make_relation_rtag(tag, id); | |||
1233 | for (auto& r : cohort->readings) { | |||
1234 | delTagFromReading(*r, nt); | |||
1235 | } | |||
1236 | }; | |||
1237 | ||||
1238 | auto insert_taglist_to_reading = [&](auto& iter, auto& taglist, auto& reading, auto& mappings) { | |||
1239 | for (auto tag : taglist) { | |||
1240 | if (tag->type & T_VARSTRING) { | |||
1241 | tag = generateVarstringTag(tag); | |||
1242 | } | |||
1243 | if (tag->hash == grammar->tag_any) { | |||
1244 | break; | |||
1245 | } | |||
1246 | if (tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix) { | |||
1247 | mappings->push_back(tag); | |||
1248 | } | |||
1249 | else { | |||
1250 | iter = reading.tags_list.insert(iter, tag->hash); | |||
1251 | ++iter; | |||
1252 | } | |||
1253 | if (updateValidRules(rules, intersects, tag->hash, reading)) { | |||
1254 | iter_rules = intersects.find(rule->number); | |||
1255 | iter_rules_end = intersects.end(); | |||
1256 | } | |||
1257 | } | |||
1258 | reflowReading(reading); | |||
1259 | }; | |||
1260 | ||||
1261 | auto cohort_cb = [&]() { | |||
1262 | if (rule->type == K_SELECT || (rule->type == K_IFF && !selected.empty())) { | |||
1263 | Cohort* target = get_apply_to().cohort; | |||
1264 | if (selected.size() < target->readings.size() && !selected.empty()) { | |||
1265 | ReadingList drop; | |||
1266 | size_t si = 0; | |||
1267 | for (size_t ri = 0; ri < target->readings.size(); ri++) { | |||
1268 | // Manually trace, since reading_cb doesn't get called on non-matching readings | |||
1269 | Reading* rd = target->readings[ri]; | |||
1270 | if (rule->sub_reading != 32767) { | |||
1271 | rd = get_sub_reading(rd, rule->sub_reading); | |||
1272 | } | |||
1273 | if (rd) { | |||
1274 | rd->hit_by.push_back(rule->number); | |||
1275 | } | |||
1276 | if (si < selected.size() && target->readings[ri] == selected[si]) { | |||
1277 | si++; | |||
1278 | } | |||
1279 | else { | |||
1280 | target->readings[ri]->deleted = true; | |||
1281 | drop.push_back(target->readings[ri]); | |||
1282 | } | |||
1283 | } | |||
1284 | target->readings.swap(selected); | |||
1285 | if (rule->flags & RF_DELAYED) { | |||
1286 | target->delayed.insert(target->delayed.end(), drop.begin(), drop.end()); | |||
1287 | } | |||
1288 | else if (rule->flags & RF_IGNORED) { | |||
1289 | target->ignored.insert(target->ignored.end(), drop.begin(), drop.end()); | |||
1290 | } | |||
1291 | else { | |||
1292 | target->deleted.insert(target->deleted.end(), drop.begin(), drop.end()); | |||
1293 | } | |||
1294 | readings_changed = true; | |||
1295 | } | |||
1296 | selected.clear(); | |||
1297 | } | |||
1298 | else if (rule->type == K_REMOVE || rule->type == K_IFF) { | |||
1299 | if (!removed.empty() && (removed.size() < get_apply_to().cohort->readings.size() || (unsafe && !(rule->flags & RF_SAFE)) || (rule->flags & RF_UNSAFE))) { | |||
1300 | if (rule->flags & RF_DELAYED) { | |||
1301 | get_apply_to().cohort->delayed.insert(get_apply_to().cohort->delayed.end(), removed.begin(), removed.end()); | |||
1302 | } | |||
1303 | else if (rule->flags & RF_IGNORED) { | |||
1304 | get_apply_to().cohort->ignored.insert(get_apply_to().cohort->ignored.end(), removed.begin(), removed.end()); | |||
1305 | } | |||
1306 | else { | |||
1307 | get_apply_to().cohort->deleted.insert(get_apply_to().cohort->deleted.end(), removed.begin(), removed.end()); | |||
1308 | } | |||
1309 | size_t oz = get_apply_to().cohort->readings.size(); | |||
1310 | while (!removed.empty()) { | |||
1311 | removed.back()->deleted = true; | |||
1312 | for (size_t i = 0; i < oz; ++i) { | |||
1313 | if (get_apply_to().cohort->readings[i] == removed.back()) { | |||
1314 | --oz; | |||
1315 | std::swap(get_apply_to().cohort->readings[i], get_apply_to().cohort->readings[oz]); | |||
1316 | } | |||
1317 | } | |||
1318 | removed.pop_back(); | |||
1319 | } | |||
1320 | get_apply_to().cohort->readings.resize(oz); | |||
1321 | if (debug_level > 0) { | |||
1322 | std::cerr << "DEBUG: Rule " << rule->line << " hit cohort " << get_apply_to().cohort->local_number << std::endl; | |||
1323 | } | |||
1324 | readings_changed = true; | |||
1325 | } | |||
1326 | if (get_apply_to().cohort->readings.empty()) { | |||
1327 | initEmptyCohort(*get_apply_to().cohort); | |||
1328 | } | |||
1329 | selected.clear(); | |||
1330 | } | |||
1331 | else if (rule->type == K_JUMP) { | |||
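| // JUMP resolves the first maplist tag to a named ANCHOR and repositions the rule | |||
| // iterator there. Approximate grammar usage (illustrative, not taken from this file): | |||
| //   JUMP (start) IF (...) ;  ...  ANCHOR (start) ; | |||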
1332 | auto to = getTagList(*rule->maplist).front(); | |||
1333 | VARSTRINGIFY(to); | |||
1334 | auto it = grammar->anchors.find(to->hash); | |||
1335 | if (it == grammar->anchors.end()) { | |||
1336 | u_fprintf(ux_stderr, "Warning: JUMP on line %u could not find anchor '%S'.\n", rule->line, to->tag.data()); | |||
1337 | } | |||
1338 | else { | |||
1339 | iter_rules = intersects.lower_bound(it->second); | |||
1340 | finish_cohort_loop = false; | |||
1341 | should_repeat = true; | |||
1342 | } | |||
1343 | } | |||
1344 | else if (rule->type == K_REMVARIABLE) { | |||
1345 | auto names = getTagList(*rule->maplist); | |||
1346 | for (auto tag : names) { | |||
1347 | VARSTRINGIFY(tag); | |||
1348 | auto it = variables.begin(); | |||
1349 | if (tag->type & T_REGEXP) { | |||
1350 | it = std::find_if(it, variables.end(), [&](auto& kv) { return doesTagMatchRegexp(kv.first, *tag); }); | |||
1351 | } | |||
1352 | else if (tag->type & T_CASE_INSENSITIVE) { | |||
1353 | it = std::find_if(it, variables.end(), [&](auto& kv) { return doesTagMatchIcase(kv.first, *tag); }); | |||
1354 | } | |||
1355 | else { | |||
1356 | it = variables.find(tag->hash); | |||
1357 | } | |||
1358 | if (it != variables.end()) { | |||
1359 | if (rule->flags & RF_OUTPUT) { | |||
1360 | current.variables_output.insert(it->first); | |||
1361 | } | |||
1362 | variables.erase(it); | |||
1363 | //u_fprintf(ux_stderr, "Info: RemVariable fired for %S.\n", tag->tag.data()); | |||
1364 | } | |||
1365 | } | |||
1366 | } | |||
1367 | else if (rule->type == K_SETVARIABLE) { | |||
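| // SETVARIABLE stores the first maplist tag's hash as the variable name and the first | |||
| // sublist tag's hash as its value; with RF_OUTPUT the variable is also recorded in | |||
| // variables_output so it is emitted with the window. | |||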
1368 | auto names = getTagList(*rule->maplist); | |||
1369 | auto values = getTagList(*rule->sublist); | |||
1370 | VARSTRINGIFY(names.front()); | |||
1371 | VARSTRINGIFY(values.front()); | |||
1372 | variables[names.front()->hash] = values.front()->hash; | |||
1373 | if (rule->flags & RF_OUTPUT) { | |||
1374 | current.variables_output.insert(names.front()->hash); | |||
1375 | } | |||
1376 | //u_fprintf(ux_stderr, "Info: SetVariable fired for %S.\n", names.front()->tag.data()); | |||
1377 | } | |||
1378 | else if (rule->type == K_DELIMIT) { | |||
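| // DELIMIT only fires when at least one cohort follows the target, since splitting | |||
| // the window after its final cohort would be a no-op. | |||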
1379 | auto cohort = get_apply_to().cohort; | |||
1380 | if (cohort->parent->cohorts.size() > cohort->local_number + 1) { | |||
1381 | delimitAt(current, cohort); | |||
1382 | delimited = true; | |||
1383 | readings_changed = true; | |||
1384 | } | |||
1385 | } | |||
1386 | else if (rule->type == K_EXTERNAL_ONCE || rule->type == K_EXTERNAL_ALWAYS) { | |||
1387 | if (rule->type == K_EXTERNAL_ONCE && !current.hit_external.insert(rule->line).second) { | |||
1388 | return; | |||
1389 | } | |||
1390 | ||||
1391 | auto ei = externals.find(rule->varname); | |||
1392 | if (ei == externals.end()) { | |||
1393 | Tag* ext = grammar->single_tags.find(rule->varname)->second; | |||
1394 | UErrorCode err = U_ZERO_ERROR; | |||
1395 | u_strToUTF8(&cbuffers[0][0], SI32(CG3_BUFFER_SIZE - 1), nullptr, ext->tag.data(), SI32(ext->tag.size()), &err); | |||
1396 | ||||
1397 | Process& es = externals[rule->varname]; | |||
1398 | try { | |||
1399 | es.start(&cbuffers[0][0]); | |||
1400 | writeRaw(es, CG3_EXTERNAL_PROTOCOL); | |||
1401 | } | |||
1402 | catch (std::exception& e) { | |||
1403 | u_fprintf(ux_stderr, "Error: External on line %u resulted in error: %s\n", rule->line, e.what()); | |||
1404 | CG3Quit(1); | |||
1405 | } | |||
1406 | ei = externals.find(rule->varname); | |||
1407 | } | |||
1408 | ||||
1409 | pipeOutSingleWindow(current, ei->second); | |||
1410 | pipeInSingleWindow(current, ei->second); | |||
1411 | ||||
1412 | indexSingleWindow(current); | |||
1413 | readings_changed = true; | |||
1414 | index_ruleCohort_no.clear(); | |||
1415 | intersects = current.valid_rules.intersect(rules); | |||
1416 | iter_rules = intersects.find(rule->number); | |||
1417 | iter_rules_end = intersects.end(); | |||
1418 | reset_cohorts_for_loop = true; | |||
1419 | } | |||
1420 | else if (rule->type == K_REMCOHORT) { | |||
1421 | // REMCOHORT-IGNORED | |||
1422 | if (rule->flags & RF_IGNORED) { | |||
1423 | CohortSet cohorts; | |||
1424 | collect_subtree(cohorts, get_apply_to().cohort, rule->childset1); | |||
1425 | for (auto c : reversed(cohorts)) { | |||
1426 | ignore_cohort(c); | |||
1427 | } | |||
1428 | reindex(); | |||
1429 | reflowDependencyWindow(); | |||
1430 | } | |||
1431 | else { | |||
1432 | rem_cohort(get_apply_to().cohort); | |||
1433 | } | |||
1434 | ||||
1435 | // If we just removed the last cohort, add <<< to the new last cohort | |||
1436 | if (get_apply_to().cohort->readings.front()->tags.count(endtag)) { | |||
1437 | for (auto r : current.cohorts.back()->readings) { | |||
1438 | addTagToReading(*r, endtag); | |||
1439 | if (updateValidRules(rules, intersects, endtag, *r)) { | |||
1440 | iter_rules = intersects.find(rule->number); | |||
1441 | iter_rules_end = intersects.end(); | |||
1442 | } | |||
1443 | } | |||
1444 | index_ruleCohort_no.clear(); | |||
1445 | } | |||
1446 | readings_changed = true; | |||
1447 | reset_cohorts_for_loop = true; | |||
1448 | } | |||
1449 | }; | |||
1450 | ||||
1451 | RuleCallback reading_cb = [&]() { | |||
1452 | if (rule->type == K_SELECT || (rule->type == K_IFF && get_apply_to().subreading->matched_tests)) { | |||
1453 | selected.push_back(get_apply_to().reading); | |||
1454 | index_ruleCohort_no.clear(); | |||
1455 | } | |||
1456 | else if (rule->type == K_REMOVE || rule->type == K_IFF) { | |||
1457 | if (rule->type == K_REMOVE && (rule->flags & RF_UNMAPLAST) && removed.size() == get_apply_to().cohort->readings.size() - 1) { | |||
1458 | if (unmapReading(*get_apply_to().subreading, rule->number)) { | |||
1459 | readings_changed = true; | |||
1460 | } | |||
1461 | } | |||
1462 | else { | |||
1463 | TRACE; | |||
1464 | removed.push_back(get_apply_to().reading); | |||
1465 | } | |||
1466 | index_ruleCohort_no.clear(); | |||
1467 | } | |||
1468 | else if (rule->type == K_PROTECT) { | |||
1469 | TRACE; | |||
1470 | get_apply_to().subreading->immutable = true; | |||
1471 | } | |||
1472 | else if (rule->type == K_UNPROTECT) { | |||
1473 | TRACE; | |||
1474 | get_apply_to().subreading->immutable = false; | |||
1475 | } | |||
1476 | else if (rule->type == K_UNMAP) { | |||
1477 | if (unmapReading(*get_apply_to().subreading, rule->number)) { | |||
1478 | index_ruleCohort_no.clear(); | |||
1479 | readings_changed = true; | |||
1480 | } | |||
1481 | } | |||
1482 | else if (rule->type == K_ADDCOHORT_AFTER || rule->type == K_ADDCOHORT_BEFORE) { | |||
1483 | index_ruleCohort_no.clear(); | |||
1484 | TRACE; | |||
1485 | ||||
1486 | size_t spacesInAddedWf = 0; // not used here | |||
1487 | auto cCohort = add_cohort(get_apply_to().cohort, spacesInAddedWf); | |||
1488 | ||||
1489 | // If the new cohort is now the last cohort, add <<< to it and remove <<< from previous last cohort | |||
1490 | if (current.cohorts.back() == cCohort) { | |||
1491 | for (auto r : current.cohorts[current.cohorts.size() - 2]->readings) { | |||
1492 | delTagFromReading(*r, endtag); | |||
1493 | } | |||
1494 | for (auto r : current.cohorts.back()->readings) { | |||
1495 | addTagToReading(*r, endtag); | |||
1496 | if (updateValidRules(rules, intersects, endtag, *r)) { | |||
1497 | iter_rules = intersects.find(rule->number); | |||
1498 | iter_rules_end = intersects.end(); | |||
1499 | } | |||
1500 | } | |||
1501 | } | |||
1502 | indexSingleWindow(current); | |||
1503 | readings_changed = true; | |||
1504 | ||||
1505 | reset_cohorts_for_loop = true; | |||
1506 | } | |||
1507 | else if (rule->type == K_SPLITCOHORT) { | |||
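| // SPLITCOHORT: each wordform tag in the maplist starts a new cohort; the tags that | |||
| // follow it (baseform, reading tags, optional "self->parent" dependency specs) fill | |||
| // that cohort, and the original cohort is deleted once the new ones are in place. | |||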
1508 | index_ruleCohort_no.clear(); | |||
1509 | ||||
1510 | std::vector<std::pair<Cohort*, std::vector<TagList>>> cohorts; | |||
1511 | ||||
1512 | auto theTags = ss_taglist.get(); | |||
1513 | getTagList(*rule->maplist, theTags); | |||
1514 | ||||
1515 | for (auto& tter : *theTags) { | |||
1516 | if (tter->type & T_VSTR) { | |||
1517 | VARSTRINGIFY(tter); | |||
1518 | } | |||
1519 | } | |||
1520 | ||||
1521 | Tag* wf = nullptr; | |||
1522 | for (auto tter : *theTags) { | |||
1523 | if (tter->type & T_WORDFORM) { | |||
1524 | cohorts.resize(cohorts.size() + 1); | |||
1525 | cohorts.back().first = alloc_cohort(¤t); | |||
1526 | cohorts.back().first->global_number = gWindow->cohort_counter++; | |||
1527 | wf = tter; | |||
1528 | VARSTRINGIFY(wf); | |||
1529 | cohorts.back().first->wordform = wf; | |||
1530 | continue; | |||
1531 | } | |||
1532 | if (!wf) { | |||
1533 | u_fprintf(ux_stderr, "Error: There must be a wordform before any other tags in SPLITCOHORT on line %u before input line %u.\n", rule->line, numLines); | |||
1534 | CG3Quit(1); | |||
1535 | } | |||
1536 | } | |||
1537 | ||||
1538 | uint32_t rel_trg = DEP_NO_PARENT; | |||
1539 | std::vector<std::pair<uint32_t, uint32_t>> cohort_dep(cohorts.size()); | |||
1540 | cohort_dep.front().second = DEP_NO_PARENT; | |||
1541 | cohort_dep.back().first = DEP_NO_PARENT; | |||
1542 | cohort_dep.back().second = UI32(cohort_dep.size() - 1); | |||
1543 | for (size_t i = 1; i < cohort_dep.size() - 1; ++i) { | |||
1544 | cohort_dep[i].second = UI32(i); | |||
1545 | } | |||
1546 | ||||
1547 | size_t i = 0; | |||
1548 | std::vector<TagList>* readings = &cohorts.front().second; | |||
1549 | Tag* bf = nullptr; | |||
1550 | for (auto tter : *theTags) { | |||
1551 | if (tter->type & T_WORDFORM) { | |||
1552 | ++i; | |||
1553 | bf = nullptr; | |||
1554 | continue; | |||
1555 | } | |||
1556 | if (tter->type & T_BASEFORM) { | |||
1557 | readings = &cohorts[i - 1].second; | |||
1558 | readings->resize(readings->size() + 1); | |||
1559 | readings->back().push_back(cohorts[i - 1].first->wordform); | |||
1560 | bf = tter; | |||
1561 | } | |||
1562 | if (!bf) { | |||
1563 | u_fprintf(ux_stderr, "Error: There must be a baseform after the wordform in SPLITCOHORT on line %u before input line %u.\n", rule->line, numLines); | |||
1564 | CG3Quit(1); | |||
1565 | } | |||
1566 | ||||
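| // Dependency specs are tags of the form "self->parent" (e.g. "1->0"). As read from | |||
| // the checks below: a 'c'/'d' self inherits the original cohort's children, and a | |||
| // 'p'/'m' parent attaches to the original cohort's parent (hedged interpretation). | |||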
1567 | UChar dep_self[12] = {}; | |||
1568 | UChar dep_parent[12] = {}; | |||
1569 | if (u_sscanf(tter->tag.data(), "%[0-9cd]->%[0-9pm]", &dep_self, &dep_parent) == 2) { | |||
1570 | if (dep_self[0] == 'c' || dep_self[0] == 'd') { | |||
1571 | cohort_dep[i - 1].first = DEP_NO_PARENT; | |||
1572 | if (rel_trg == DEP_NO_PARENT) { | |||
1573 | rel_trg = UI32(i - 1); | |||
1574 | } | |||
1575 | } | |||
1576 | else if (u_sscanf(dep_self, "%i", &cohort_dep[i - 1].first) != 1) { | |||
1577 | u_fprintf(ux_stderr, "Error: SPLITCOHORT dependency mapping dep_self was not valid on line %u before input line %u.\n", rule->line, numLines); | |||
1578 | CG3Quit(1); | |||
1579 | } | |||
1580 | if (dep_parent[0] == 'p' || dep_parent[0] == 'm') { | |||
1581 | cohort_dep[i - 1].second = DEP_NO_PARENT; | |||
1582 | } | |||
1583 | else if (u_sscanf(dep_parent, "%i", &cohort_dep[i - 1].second) != 1) { | |||
1584 | u_fprintf(ux_stderr, "Error: SPLITCOHORT dependency mapping dep_parent was not valid on line %u before input line %u.\n", rule->line, numLines); | |||
1585 | CG3Quit(1); | |||
1586 | } | |||
1587 | continue; | |||
1588 | } | |||
1589 | if (tter->tag.size() == 3 && tter->tag[0] == 'R' && tter->tag[1] == ':' && tter->tag[2] == '*') { | |||
1590 | rel_trg = UI32(i - 1); | |||
1591 | continue; | |||
1592 | } | |||
1593 | readings->back().push_back(tter); | |||
1594 | } | |||
1595 | ||||
1596 | if (rel_trg == DEP_NO_PARENT) { | |||
1597 | rel_trg = UI32(cohorts.size() - 1); | |||
1598 | } | |||
1599 | ||||
1600 | for (size_t i = 0; i < cohorts.size(); ++i) { | |||
1601 | Cohort* cCohort = cohorts[i].first; | |||
1602 | readings = &cohorts[i].second; | |||
1603 | ||||
1604 | for (auto tags : *readings) { | |||
1605 | Reading* cReading = alloc_reading(cCohort); | |||
1606 | ++numReadings; | |||
1607 | insert_if_exists(cReading->parent->possible_sets, grammar->sets_any); | |||
1608 | cReading->hit_by.push_back(rule->number); | |||
1609 | cReading->noprint = false; | |||
1610 | TagList mappings; | |||
1611 | ||||
1612 | for (size_t i = 0; i < tags.size(); ++i) { | |||
1613 | if (tags[i]->hash == grammar->tag_any) { | |||
1614 | uint32Vector& nt = get_apply_to().cohort->readings.front()->tags_list; | |||
1615 | if (nt.size() <= 2) { | |||
1616 | continue; | |||
1617 | } | |||
1618 | tags.reserve(tags.size() + nt.size() - 2); | |||
1619 | tags[i] = grammar->single_tags[nt[2]]; | |||
1620 | for (size_t j = 3, k = 1; j < nt.size(); ++j) { | |||
1621 | if (grammar->single_tags[nt[j]]->type & T_DEPENDENCY) { | |||
1622 | continue; | |||
1623 | } | |||
1624 | tags.insert(tags.begin() + i + k, grammar->single_tags[nt[j]]); | |||
1625 | ++k; | |||
1626 | } | |||
1627 | } | |||
1628 | } | |||
1629 | ||||
1630 | for (auto tter : tags) { | |||
1631 | uint32_t hash = tter->hash; | |||
1632 | VARSTRINGIFY(tter); | |||
1633 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { | |||
1634 | mappings.push_back(tter); | |||
1635 | } | |||
1636 | else { | |||
1637 | hash = addTagToReading(*cReading, hash); | |||
1638 | } | |||
1639 | if (updateValidRules(rules, intersects, hash, *cReading)) { | |||
1640 | iter_rules = intersects.find(rule->number); | |||
1641 | iter_rules_end = intersects.end(); | |||
1642 | } | |||
1643 | } | |||
1644 | if (!mappings.empty()) { | |||
1645 | splitMappings(mappings, *cCohort, *cReading); | |||
1646 | } | |||
1647 | cCohort->appendReading(cReading); | |||
1648 | } | |||
1649 | ||||
1650 | if (cCohort->readings.empty()) { | |||
1651 | initEmptyCohort(*cCohort); | |||
1652 | } | |||
1653 | ||||
1654 | current.parent->dep_window[cCohort->global_number] = cCohort; | |||
1655 | current.parent->cohort_map[cCohort->global_number] = cCohort; | |||
1656 | ||||
1657 | current.cohorts.insert(current.cohorts.begin() + get_apply_to().cohort->local_number + i + 1, cCohort); | |||
1658 | current.all_cohorts.insert(std::find(current.all_cohorts.begin() + get_apply_to().cohort->local_number, current.all_cohorts.end(), get_apply_to().cohort) + i + 1, cCohort); | |||
1659 | } | |||
1660 | ||||
1661 | // Move text from the to-be-deleted cohort to the last new cohort | |||
1662 | std::swap(cohorts.back().first->text, get_apply_to().cohort->text); | |||
1663 | ||||
1664 | for (size_t i = 0; i < cohorts.size(); ++i) { | |||
1665 | Cohort* cCohort = cohorts[i].first; | |||
1666 | ||||
1667 | if (cohort_dep[i].first == DEP_NO_PARENT) { | |||
1668 | while (!get_apply_to().cohort->dep_children.empty()) { | |||
1669 | uint32_t ch = get_apply_to().cohort->dep_children.back(); | |||
1670 | attachParentChild(*cCohort, *current.parent->cohort_map[ch], true, true); | |||
1671 | get_apply_to().cohort->dep_children.erase(ch); // Just in case the attachment can't be made for some reason | |||
1672 | } | |||
1673 | } | |||
1674 | ||||
1675 | if (cohort_dep[i].second == DEP_NO_PARENT) { | |||
1676 | if (current.parent->cohort_map.count(get_apply_to().cohort->dep_parent)) { | |||
1677 | attachParentChild(*current.parent->cohort_map[get_apply_to().cohort->dep_parent], *cCohort, true, true); | |||
1678 | } | |||
1679 | } | |||
1680 | else { | |||
1681 | attachParentChild(*current.parent->cohort_map[cohorts.front().first->global_number + cohort_dep[i].second - 1], *cCohort, true, true); | |||
1682 | } | |||
1683 | ||||
1684 | // Re-attach all named relations to the dependency tail or R:* cohort | |||
1685 | if (rel_trg == i && (get_apply_to().cohort->type & CT_RELATED)) { | |||
1686 | cCohort->setRelated(); | |||
1687 | cCohort->relations.swap(get_apply_to().cohort->relations); | |||
1688 | ||||
1689 | std::pair<SingleWindow**, size_t> swss[3] = { | |||
1690 | std::make_pair(&gWindow->previous[0], gWindow->previous.size()), | |||
1691 | std::make_pair(&gWindow->current, static_cast<size_t>(1)), | |||
1692 | std::make_pair(&gWindow->next[0], gWindow->next.size()), | |||
1693 | }; | |||
1694 | for (auto sws : swss) { | |||
1695 | for (size_t sw = 0; sw < sws.second; ++sw) { | |||
1696 | for (auto ch : sws.first[sw]->cohorts) { | |||
1697 | for (auto& rel : ch->relations) { | |||
1698 | if (rel.second.count(get_apply_to().cohort->global_number)) { | |||
1699 | rel.second.erase(get_apply_to().cohort->global_number); | |||
1700 | rel.second.insert(cCohort->global_number); | |||
1701 | } | |||
1702 | } | |||
1703 | } | |||
1704 | } | |||
1705 | } | |||
1706 | } | |||
1707 | } | |||
1708 | ||||
1709 | // Remove the source cohort | |||
1710 | for (auto iter : get_apply_to().cohort->readings) { | |||
1711 | iter->hit_by.push_back(rule->number); | |||
1712 | iter->deleted = true; | |||
1713 | } | |||
1714 | get_apply_to().cohort->type |= CT_REMOVED; | |||
1715 | get_apply_to().cohort->detach(); | |||
1716 | for (auto& cm : current.parent->cohort_map) { | |||
1717 | cm.second->dep_children.erase(get_apply_to().cohort->dep_self); | |||
1718 | } | |||
1719 | current.parent->cohort_map.erase(get_apply_to().cohort->global_number); | |||
1720 | current.cohorts.erase(current.cohorts.begin() + get_apply_to().cohort->local_number); | |||
1721 | ||||
1722 | reindex(); | |||
1723 | indexSingleWindow(current); | |||
1724 | readings_changed = true; | |||
1725 | ||||
1726 | reset_cohorts_for_loop = true; | |||
1727 | } | |||
1728 | else if (rule->type == K_ADD || rule->type == K_MAP) { | |||
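| // ADD/MAP: when childset1 names a "spot" tag sequence, the new tags are inserted at | |||
| // (or, with RF_AFTER, just past) the first occurrence of that sequence; otherwise | |||
| // they are appended. MAP additionally marks the reading as mapped. | |||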
1729 | TRACE; | |||
1730 | auto state_hash = get_apply_to().subreading->hash; | |||
1731 | index_ruleCohort_no.clear(); | |||
1732 | auto& reading = *(get_apply_to().subreading); | |||
1733 | reading.noprint = false; | |||
1734 | auto mappings = ss_taglist.get(); | |||
1735 | auto theTags = ss_taglist.get(); | |||
1736 | getTagList(*rule->maplist, theTags); | |||
1737 | ||||
1738 | bool did_insert = false; | |||
1739 | if (rule->childset1) { | |||
1740 | bool found_spot = false; | |||
1741 | auto spot_tags = ss_taglist.get(); | |||
1742 | getTagList(*grammar->sets_list[rule->childset1], spot_tags); | |||
1743 | FILL_TAG_LIST(spot_tags); | |||
1744 | auto it = reading.tags_list.begin(); | |||
1745 | for (; it != reading.tags_list.end(); ++it) { | |||
1746 | bool found = true; | |||
1747 | auto tmp = it; | |||
1748 | for (auto tag : *spot_tags) { | |||
1749 | if (*tmp != tag->hash) { | |||
1750 | found = false; | |||
1751 | break; | |||
1752 | } | |||
1753 | ++tmp; | |||
1754 | } | |||
1755 | if (found) { | |||
1756 | found_spot = true; | |||
1757 | break; | |||
1758 | } | |||
1759 | } | |||
1760 | if (found_spot) { | |||
1761 | if (rule->flags & RF_AFTER) { | |||
1762 | std::advance(it, spot_tags->size()); | |||
1763 | } | |||
1764 | if (it != reading.tags_list.end()) { | |||
1765 | insert_taglist_to_reading(it, *theTags, reading, mappings); | |||
1766 | did_insert = true; | |||
1767 | } | |||
1768 | } | |||
1769 | } | |||
1770 | ||||
1771 | if (!did_insert) { | |||
1772 | APPEND_TAGLIST_TO_READING(*theTags, reading); | |||
1773 | } | |||
1774 | if (!mappings->empty()) { | |||
1775 | splitMappings(mappings, *get_apply_to().cohort, reading, rule->type == K_MAP); | |||
1776 | } | |||
1777 | if (rule->type == K_MAP) { | |||
1778 | reading.mapped = true; | |||
1779 | } | |||
1780 | if (reading.hash != state_hash) { | |||
1781 | readings_changed = true; | |||
1782 | } | |||
1783 | } | |||
1784 | else if (rule->type == K_RESTORE) { | |||
1785 | bool did_restore = false; | |||
1786 | auto move_rs = [&](ReadingList& rl) { | |||
1787 | for (size_t i = 0; i < rl.size();) { | |||
1788 | if (doesSetMatchReading(*rl[i], rule->maplist->number)) { | |||
1789 | rl[i]->deleted = false; | |||
1790 | rl[i]->hit_by.push_back(rule->number); | |||
1791 | get_apply_to().cohort->readings.push_back(rl[i]); | |||
1792 | rl.erase(rl.begin() + i); | |||
1793 | did_restore = true; | |||
1794 | } | |||
1795 | else { | |||
1796 | ++i; | |||
1797 | } | |||
1798 | } | |||
1799 | }; | |||
1800 | ||||
1801 | if (rule->flags & RF_DELAYED) { | |||
1802 | move_rs(get_apply_to().cohort->delayed); | |||
1803 | } | |||
1804 | else if (rule->flags & RF_IGNORED) { | |||
1805 | move_rs(get_apply_to().cohort->ignored); | |||
1806 | } | |||
1807 | else { | |||
1808 | move_rs(get_apply_to().cohort->deleted); | |||
1809 | } | |||
1810 | ||||
1811 | if (did_restore) { | |||
1812 | TRACE; | |||
1813 | } | |||
1814 | finish_reading_loop = false; | |||
1815 | } | |||
1816 | else if (rule->type == K_REPLACE) { | |||
1817 | auto state_hash = get_apply_to().subreading->hash; | |||
1818 | index_ruleCohort_no.clear(); | |||
1819 | TRACE; | |||
1820 | get_apply_to().subreading->noprint = false; | |||
1821 | get_apply_to().subreading->tags_list.clear(); | |||
1822 | get_apply_to().subreading->tags_list.push_back(get_apply_to().cohort->wordform->hash); | |||
1823 | get_apply_to().subreading->tags_list.push_back(get_apply_to().subreading->baseform); | |||
1824 | reflowReading(*get_apply_to().subreading); | |||
1825 | auto mappings = ss_taglist.get(); | |||
1826 | auto theTags = ss_taglist.get(); | |||
1827 | getTagList(*rule->maplist, theTags); | |||
1828 | ||||
1829 | APPEND_TAGLIST_TO_READING(*theTags, *get_apply_to().subreading); | |||
1830 | ||||
1831 | if (!mappings->empty()) { | |||
1832 | splitMappings(mappings, *get_apply_to().cohort, *get_apply_to().subreading, true); | |||
1833 | } | |||
1834 | if (get_apply_to().subreading->hash != state_hash) { | |||
1835 | readings_changed = true; | |||
1836 | } | |||
1837 | } | |||
1838 | else if (rule->type == K_SUBSTITUTE) { | |||
1839 | // ToDo: Check whether this substitution will do nothing at all to the end result | |||
1840 | // ToDo: Not actually...instead, test whether any reading in the cohort already is the end result | |||
1841 | ||||
1842 | auto state_hash = get_apply_to().subreading->hash; | |||
1843 | auto theTags = ss_taglist.get(); | |||
1844 | getTagList(*rule->sublist, theTags); | |||
1845 | ||||
1846 | // Modify the list of tags to remove to be the actual list of tags present, including matching regex and icase tags | |||
1847 | FILL_TAG_LIST(theTags); | |||
1848 | ||||
1849 | // Perform the tag removal, remembering the position of the final removed tag for use as insertion spot | |||
1850 | size_t tpos = std::numeric_limits<size_t>::max(); | |||
1851 | bool plain = true; | |||
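| // "plain" stays true while the tags to remove form one contiguous run; each removed | |||
| // position is overwritten with the placeholder substtag so the maplist tags can later | |||
| // be inserted at the same spot(s). | |||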
1852 | for (size_t i = 0; i < get_apply_to().subreading->tags_list.size();) { | |||
1853 | auto& remter = get_apply_to().subreading->tags_list[i]; | |||
1854 | ||||
1855 | if (plain && remter == (*theTags->begin())->hash) { | |||
1856 | if (get_apply_to().subreading->baseform == remter) { | |||
1857 | get_apply_to().subreading->baseform = 0; | |||
1858 | } | |||
1859 | remter = substtag; | |||
1860 | tpos = i; | |||
1861 | for (size_t j = 1; j < theTags->size() && i < get_apply_to().subreading->tags_list.size(); ++j, ++i) { | |||
1862 | auto& remter = get_apply_to().subreading->tags_list[i]; | |||
1863 | auto tter = (*theTags)[j]->hash; | |||
1864 | if (remter != tter) { | |||
1865 | plain = false; | |||
1866 | break; | |||
1867 | } | |||
1868 | get_apply_to().subreading->tags_list.erase(get_apply_to().subreading->tags_list.begin() + i); | |||
1869 | get_apply_to().subreading->tags.erase(tter); | |||
1870 | if (get_apply_to().subreading->baseform == tter) { | |||
1871 | get_apply_to().subreading->baseform = 0; | |||
1872 | } | |||
1873 | } | |||
1874 | continue; | |||
1875 | } | |||
1876 | ||||
1877 | for (auto tter : *theTags) { | |||
1878 | if (remter != tter->hash) { | |||
1879 | continue; | |||
1880 | } | |||
1881 | tpos = i; | |||
1882 | remter = substtag; | |||
1883 | get_apply_to().subreading->tags.erase(tter->hash); | |||
1884 | if (get_apply_to().subreading->baseform == tter->hash) { | |||
1885 | get_apply_to().subreading->baseform = 0; | |||
1886 | } | |||
1887 | } | |||
1888 | ||||
1889 | ++i; | |||
1890 | } | |||
1891 | ||||
1892 | // Should Substitute really do nothing if no tags were removed? 2013-10-21, Eckhard says this is expected behavior. | |||
1893 | if (tpos != std::numeric_limits<size_t>::max()) { | |||
1894 | if (!plain) { | |||
1895 | for (size_t i = 0; i < get_apply_to().subreading->tags_list.size() && i < tpos;) { | |||
1896 | if (get_apply_to().subreading->tags_list[i] == substtag) { | |||
1897 | get_apply_to().subreading->tags_list.erase(get_apply_to().subreading->tags_list.begin() + i); | |||
1898 | --tpos; | |||
1899 | } | |||
1900 | else { | |||
1901 | ++i; | |||
1902 | } | |||
1903 | } | |||
1904 | } | |||
1905 | ||||
1906 | Tag* wf = nullptr; | |||
1907 | index_ruleCohort_no.clear(); | |||
1908 | TRACE; | |||
1909 | get_apply_to().subreading->noprint = false; | |||
1910 | if (tpos >= get_apply_to().subreading->tags_list.size()) { | |||
1911 | tpos = get_apply_to().subreading->tags_list.size() - 1; | |||
1912 | } | |||
1913 | ++tpos; | |||
1914 | auto mappings = ss_taglist.get(); | |||
1915 | auto theTags = ss_taglist.get(); | |||
1916 | getTagList(*rule->maplist, theTags); | |||
1917 | ||||
1918 | for (size_t i = 0; i < get_apply_to().subreading->tags_list.size();) { | |||
1919 | if (get_apply_to().subreading->tags_list[i] == substtag) { | |||
1920 | get_apply_to().subreading->tags_list.erase(get_apply_to().subreading->tags_list.begin() + i); | |||
1921 | tpos = i; | |||
1922 | ||||
1923 | for (auto tag : *theTags) { | |||
1924 | if (tag->type & T_VARSTRING) { | |||
1925 | tag = generateVarstringTag(tag); | |||
1926 | } | |||
1927 | if (tag->hash == grammar->tag_any) { | |||
1928 | break; | |||
1929 | } | |||
1930 | if (tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix) { | |||
1931 | mappings->push_back(tag); | |||
1932 | } | |||
1933 | else { | |||
1934 | if (tag->type & T_WORDFORM) { | |||
1935 | wf = tag; | |||
1936 | } | |||
1937 | get_apply_to().subreading->tags_list.insert(get_apply_to().subreading->tags_list.begin() + tpos, tag->hash); | |||
1938 | ++tpos; | |||
1939 | } | |||
1940 | if (updateValidRules(rules, intersects, tag->hash, *get_apply_to().subreading)) { | |||
1941 | iter_rules = intersects.find(rule->number); | |||
1942 | iter_rules_end = intersects.end(); | |||
1943 | } | |||
1944 | } | |||
1945 | } | |||
1946 | else { | |||
1947 | ++i; | |||
1948 | } | |||
1949 | } | |||
1950 | reflowReading(*get_apply_to().subreading); | |||
1951 | ||||
1952 | if (!mappings->empty()) { | |||
1953 | splitMappings(mappings, *get_apply_to().cohort, *get_apply_to().subreading, true); | |||
1954 | } | |||
1955 | if (wf && wf != get_apply_to().subreading->parent->wordform) { | |||
1956 | for (auto r : get_apply_to().subreading->parent->readings) { | |||
1957 | delTagFromReading(*r, get_apply_to().subreading->parent->wordform); | |||
1958 | addTagToReading(*r, wf); | |||
1959 | } | |||
1960 | for (auto r : get_apply_to().subreading->parent->deleted) { | |||
1961 | delTagFromReading(*r, get_apply_to().subreading->parent->wordform); | |||
1962 | addTagToReading(*r, wf); | |||
1963 | } | |||
1964 | for (auto r : get_apply_to().subreading->parent->delayed) { | |||
1965 | delTagFromReading(*r, get_apply_to().subreading->parent->wordform); | |||
1966 | addTagToReading(*r, wf); | |||
1967 | } | |||
1968 | get_apply_to().subreading->parent->wordform = wf; | |||
1969 | for (auto r : grammar->wf_rules) { | |||
1970 | if (doesWordformsMatch(wf, r->wordform)) { | |||
1971 | current.rule_to_cohorts[r->number].insert(get_apply_to().cohort); | |||
1972 | intersects.insert(r->number); | |||
1973 | } | |||
1974 | else { | |||
1975 | current.rule_to_cohorts[r->number].erase(get_apply_to().cohort); | |||
1976 | } | |||
1977 | } | |||
1978 | updateValidRules(rules, intersects, wf->hash, *get_apply_to().subreading); | |||
1979 | iter_rules = intersects.find(rule->number); | |||
1980 | iter_rules_end = intersects.end(); | |||
1981 | } | |||
1982 | } | |||
1983 | if (get_apply_to().subreading->hash != state_hash) { | |||
1984 | readings_changed = true; | |||
1985 | } | |||
1986 | } | |||
1987 | else if (rule->type == K_APPEND) { | |||
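| // APPEND: every baseform tag in the maplist starts a new reading, and the tags up to | |||
| // the next baseform become that reading. Approximate grammar usage (illustrative): | |||
| //   APPEND ("base" tag1 tag2) TARGET (set) ; | |||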
1988 | index_ruleCohort_no.clear(); | |||
1989 | TRACE; | |||
1990 | ||||
1991 | Tag* bf = nullptr; | |||
1992 | std::vector<TagList> readings; | |||
1993 | auto theTags = ss_taglist.get(); | |||
1994 | getTagList(*rule->maplist, theTags); | |||
1995 | ||||
1996 | for (auto& tter : *theTags) { | |||
1997 | if (tter->type & T_VSTR) { | |||
1998 | VARSTRINGIFY(tter); | |||
1999 | } | |||
2000 | } | |||
2001 | ||||
2002 | for (auto tter : *theTags) { | |||
2003 | VARSTRINGIFY(tter); | |||
2004 | if (tter->type & T_BASEFORM) { | |||
2005 | bf = tter; | |||
2006 | readings.resize(readings.size() + 1); | |||
2007 | } | |||
2008 | if (bf == nullptr) { | |||
2009 | u_fprintf(ux_stderr, "Error: There must be a baseform before any other tags in APPEND on line %u.\n", rule->line); | |||
2010 | CG3Quit(1); | |||
2011 | } | |||
2012 | readings.back().push_back(tter); | |||
2013 | } | |||
2014 | ||||
2015 | for (const auto& rit : readings) { | |||
2016 | Reading* cReading = alloc_reading(get_apply_to().cohort); | |||
2017 | ++numReadings; | |||
2018 | insert_if_exists(cReading->parent->possible_sets, grammar->sets_any); | |||
2019 | addTagToReading(*cReading, get_apply_to().cohort->wordform); | |||
2020 | cReading->hit_by.push_back(rule->number); | |||
2021 | cReading->noprint = false; | |||
2022 | TagList mappings; | |||
2023 | for (auto tter : rit) { | |||
2024 | uint32_t hash = tter->hash; | |||
2025 | VARSTRINGIFY(tter); | |||
2026 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { | |||
2027 | mappings.push_back(tter); | |||
2028 | } | |||
2029 | else { | |||
2030 | hash = addTagToReading(*cReading, tter); | |||
2031 | } | |||
2032 | if (updateValidRules(rules, intersects, hash, *cReading)) { | |||
2033 | iter_rules = intersects.find(rule->number); | |||
2034 | iter_rules_end = intersects.end(); | |||
2035 | } | |||
2036 | } | |||
2037 | if (!mappings.empty()) { | |||
2038 | splitMappings(mappings, *get_apply_to().cohort, *cReading); | |||
2039 | } | |||
2040 | get_apply_to().cohort->appendReading(cReading); | |||
2041 | } | |||
2042 | ||||
2043 | if (get_apply_to().cohort->readings.size() > 1) { | |||
2044 | foreach (rit, get_apply_to().cohort->readings) { | |||
2045 | if ((*rit)->noprint) { | |||
2046 | free_reading(*rit); | |||
2047 | rit = get_apply_to().cohort->readings.erase(rit); | |||
2048 | rit_end = get_apply_to().cohort->readings.end(); | |||
2049 | } | |||
2050 | } | |||
2051 | } | |||
2052 | ||||
2053 | readings_changed = true; | |||
2054 | finish_reading_loop = false; | |||
2055 | } | |||
2056 | else if (rule->type == K_COPY) { | |||
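| // COPY duplicates the matched reading, strips any sublist tags from the copy, then | |||
| // inserts the maplist tags either at the childset1 spot or at the end of the reading. | |||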
2057 | // ToDo: Maybe just goto Substitute directly? | |||
2058 | Reading* cReading = get_apply_to().cohort->allocateAppendReading(*get_apply_to().reading); | |||
2059 | ++numReadings; | |||
2060 | index_ruleCohort_no.clear(); | |||
2061 | TRACE; | |||
2062 | cReading->hit_by.push_back(rule->number); | |||
2063 | cReading->noprint = false; | |||
2064 | ||||
2065 | if (rule->sublist) { | |||
2066 | auto excepts = ss_taglist.get(); | |||
2067 | getTagList(*rule->sublist, excepts); | |||
2068 | FILL_TAG_LIST_RAW(excepts); | |||
2069 | for (auto r = cReading; r; r = r->next) { | |||
2070 | for (auto tter : *excepts) { | |||
2071 | delTagFromReading(*r, tter); | |||
2072 | } | |||
2073 | } | |||
2074 | } | |||
2075 | ||||
2076 | auto mappings = ss_taglist.get(); | |||
2077 | auto theTags = ss_taglist.get(); | |||
2078 | getTagList(*rule->maplist, theTags); | |||
2079 | ||||
2080 | bool did_insert = false; | |||
2081 | if (rule->childset1) { | |||
2082 | auto spot_tags = ss_taglist.get(); | |||
2083 | getTagList(*grammar->sets_list[rule->childset1], spot_tags); | |||
2084 | FILL_TAG_LIST(spot_tags); | |||
2085 | auto it = cReading->tags_list.begin(); | |||
2086 | for (; it != cReading->tags_list.end(); ++it) { | |||
2087 | bool found = true; | |||
2088 | auto tmp = it; | |||
2089 | for (auto tag : *spot_tags) { | |||
2090 | if (*tmp != tag->hash) { | |||
2091 | found = false; | |||
2092 | break; | |||
2093 | } | |||
2094 | ++tmp; | |||
2095 | } | |||
2096 | if (found) { | |||
2097 | break; | |||
2098 | } | |||
2099 | } | |||
2100 | if (rule->flags & RF_AFTER) { | |||
2101 | std::advance(it, spot_tags->size()); | |||
2102 | } | |||
2103 | if (it != cReading->tags_list.end()) { | |||
2104 | insert_taglist_to_reading(it, *theTags, *cReading, mappings); | |||
2105 | did_insert = true; | |||
2106 | } | |||
2107 | } | |||
2108 | ||||
2109 | if (!did_insert) { | |||
2110 | APPEND_TAGLIST_TO_READING(*theTags, *cReading); | |||
2111 | } | |||
2112 | if (!mappings->empty()) { | |||
2113 | splitMappings(mappings, *get_apply_to().cohort, *cReading, true); | |||
2114 | } | |||
2115 | readings_changed = true; | |||
2116 | reflowReading(*cReading); | |||
2117 | } | |||
2118 | else if (rule->type == K_MERGECOHORTS) { | |||
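| // MERGECOHORTS: gather the target plus every cohort matched by the rule's contextual | |||
| // tests into withs, insert one replacement cohort at merge_at, remove the originals, | |||
| // and repair the window-final <<< marker if it was lost. | |||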
2119 | index_ruleCohort_no.clear(); | |||
2120 | ||||
2121 | CohortSet withs; | |||
2122 | Cohort* target = get_apply_to().cohort; | |||
2123 | withs.insert(target); | |||
2124 | Cohort* merge_at = target; | |||
2125 | for (auto it : rule->dep_tests) { | |||
2126 | auto& at = context_stack.back().attach_to; | |||
2127 | at.cohort = nullptr; | |||
2128 | at.reading = nullptr; | |||
2129 | at.subreading = nullptr; | |||
2130 | merge_with = nullptr; | |||
2131 | set_mark(target); | |||
2132 | dep_deep_seen.clear(); | |||
2133 | tmpl_cntx.clear(); | |||
2134 | Cohort* attach = nullptr; | |||
2135 | bool test_good = (runContextualTest(target->parent, target->local_number, it, &attach) && attach); | |||
2136 | ||||
2137 | profileRuleContext(test_good, rule, it); | |||
2138 | ||||
2139 | if (!test_good) { | |||
2140 | finish_reading_loop = false; | |||
2141 | return; | |||
2142 | } | |||
2143 | if (get_attach_to().cohort) { | |||
2144 | merge_at = get_attach_to().cohort; | |||
2145 | if (merge_with) { | |||
2146 | withs.insert(merge_with); | |||
2147 | } | |||
2148 | } | |||
2149 | else if (merge_with) { | |||
2150 | withs.insert(merge_with); | |||
2151 | } | |||
2152 | else { | |||
2153 | withs.insert(attach); | |||
2154 | } | |||
2155 | } | |||
2156 | ||||
2157 | size_t spacesInAddedWf = 0; | |||
2158 | context_stack.back().target.cohort = add_cohort(merge_at, spacesInAddedWf); | |||
2159 | ||||
2160 | for (auto c : withs) { | |||
2161 | size_t foundSpace = c->text.find_first_of(' '); | |||
2162 | while(spacesInAddedWf && foundSpace != std::string::npos) { | |||
2163 | c->text.erase(foundSpace, 1); | |||
2164 | foundSpace = c->text.find_first_of(' '); | |||
2165 | spacesInAddedWf--; | |||
2166 | } | |||
2167 | rem_cohort(c); | |||
2168 | } | |||
2169 | ||||
2170 | // If the last cohort was removed or inserted after, add <<< to the new end | |||
2171 | if (current.cohorts.back()->readings.front()->tags.count(endtag) == 0) { | |||
2172 | for (auto r : current.cohorts[current.cohorts.size() - 2]->readings) { | |||
2173 | delTagFromReading(*r, endtag); | |||
2174 | } | |||
2175 | for (auto r : current.cohorts.back()->readings) { | |||
2176 | addTagToReading(*r, endtag); | |||
2177 | if (updateValidRules(rules, intersects, endtag, *r)) { | |||
2178 | iter_rules = intersects.find(rule->number); | |||
2179 | iter_rules_end = intersects.end(); | |||
2180 | } | |||
2181 | } | |||
2182 | } | |||
2183 | indexSingleWindow(current); | |||
2184 | readings_changed = true; | |||
2185 | ||||
2186 | reset_cohorts_for_loop = true; | |||
2187 | } | |||
2188 | else if (rule->type == K_COPYCOHORT) { | |||
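| // COPYCOHORT clones the target cohort (readings, optional wread, minus any sublist | |||
| // tags) and attaches the copy before or after the subtree found via dep_target, | |||
| // depending on RF_BEFORE; RF_REVERSE swaps which cohort is source and anchor. | |||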
2189 | Cohort* attach = nullptr; | |||
2190 | Cohort* cohort = context_stack.back().target.cohort; | |||
2191 | uint32_t c = cohort->local_number; | |||
2192 | dep_deep_seen.clear(); | |||
2193 | tmpl_cntx.clear(); | |||
2194 | context_stack.back().attach_to.cohort = nullptr; | |||
2195 | context_stack.back().attach_to.reading = nullptr; | |||
2196 | context_stack.back().attach_to.subreading = nullptr; | |||
2197 | if (runContextualTest(¤t, c, rule->dep_target, &attach) && attach) { | |||
2198 | profileRuleContext(true, rule, rule->dep_target); | |||
2199 | ||||
2200 | if (get_attach_to().cohort) { | |||
2201 | attach = get_attach_to().cohort; | |||
2202 | } | |||
2203 | context_target = attach; | |||
2204 | bool good = true; | |||
2205 | for (auto it : rule->dep_tests) { | |||
2206 | context_stack.back().mark = attach; | |||
2207 | dep_deep_seen.clear(); | |||
2208 | tmpl_cntx.clear(); | |||
2209 | bool test_good = (runContextualTest(attach->parent, attach->local_number, it) != nullptr); | |||
2210 | ||||
2211 | profileRuleContext(test_good, rule, it); | |||
2212 | ||||
2213 | if (!test_good) { | |||
2214 | good = test_good; | |||
2215 | break; | |||
2216 | } | |||
2217 | } | |||
2218 | ||||
2219 | if (!good || cohort == attach || cohort->local_number == 0) { | |||
2220 | return; | |||
2221 | } | |||
2222 | ||||
2223 | auto childset = rule->childset2; | |||
2224 | if (rule->flags & RF_REVERSE) { | |||
2225 | std::swap(cohort, attach); | |||
2226 | childset = rule->childset1; | |||
2227 | } | |||
2228 | ||||
2229 | Cohort* cCohort = alloc_cohort(attach->parent); | |||
2230 | cCohort->global_number = gWindow->cohort_counter++; | |||
2231 | cCohort->wordform = cohort->wordform; | |||
2232 | insert_if_exists(cCohort->possible_sets, grammar->sets_any); | |||
2233 | ||||
2234 | auto theTags = ss_taglist.get(); | |||
2235 | getTagList(*rule->maplist, theTags); | |||
2236 | ||||
2237 | for (auto& tter : *theTags) { | |||
2238 | if (tter->type & T_VSTR) { | |||
2239 | VARSTRINGIFY(tter); | |||
2240 | } | |||
2241 | } | |||
2242 | ||||
2243 | auto excepts = ss_taglist.get(); | |||
2244 | if (rule->sublist) { | |||
2245 | getTagList(*rule->sublist, excepts); | |||
2246 | FILL_TAG_LIST_RAW(excepts); | |||
2247 | } | |||
2248 | ||||
2249 | std::vector<Reading*> rs; | |||
2250 | for (auto r : cohort->readings) { | |||
2251 | rs.clear(); | |||
2252 | for (; r; r = r->next) { | |||
2253 | auto cReading = alloc_reading(cCohort); | |||
2254 | ++numReadings; | |||
2255 | cReading->hit_by.push_back(rule->number); | |||
2256 | cReading->noprint = false; | |||
2257 | TagList mappings; | |||
2258 | for (auto hash : r->tags_list) { | |||
2259 | auto tter = grammar->single_tags[hash]; | |||
2260 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { | |||
2261 | mappings.push_back(tter); | |||
2262 | } | |||
2263 | else { | |||
2264 | hash = addTagToReading(*cReading, hash); | |||
2265 | } | |||
2266 | if (updateValidRules(rules, intersects, hash, *cReading)) { | |||
2267 | iter_rules = intersects.find(rule->number); | |||
2268 | iter_rules_end = intersects.end(); | |||
2269 | } | |||
2270 | } | |||
2271 | for (auto tter : *theTags) { | |||
2272 | auto hash = tter->hash; | |||
2273 | if (hash == grammar->tag_any) { | |||
2274 | continue; | |||
2275 | } | |||
2276 | if (tter->type & T_MAPPING || tter->tag[0] == grammar->mapping_prefix) { | |||
2277 | mappings.push_back(tter); | |||
2278 | } | |||
2279 | else { | |||
2280 | hash = addTagToReading(*cReading, hash); | |||
2281 | } | |||
2282 | if (updateValidRules(rules, intersects, hash, *cReading)) { | |||
2283 | iter_rules = intersects.find(rule->number); | |||
2284 | iter_rules_end = intersects.end(); | |||
2285 | } | |||
2286 | } | |||
2287 | if (!mappings.empty()) { | |||
2288 | splitMappings(mappings, *cCohort, *cReading); | |||
2289 | } | |||
2290 | rs.push_back(cReading); | |||
2291 | } | |||
2292 | auto rn = rs.front(); | |||
2293 | for (size_t j = 1; j < rs.size(); ++j) { | |||
2294 | rn->next = rs[j]; | |||
2295 | rn = rn->next; | |||
2296 | } | |||
2297 | cCohort->appendReading(rs.front()); | |||
2298 | } | |||
2299 | ||||
2300 | if (cCohort->readings.empty()) { | |||
2301 | initEmptyCohort(*cCohort); | |||
2302 | if (trace) { | |||
2303 | auto r = cCohort->readings.front(); | |||
2304 | r->hit_by.push_back(rule->number); | |||
2305 | r->noprint = false; | |||
2306 | } | |||
2307 | } | |||
2308 | ||||
2309 | for (auto r : cCohort->readings) { | |||
2310 | for (; r; r = r->next) { | |||
2311 | for (auto tter : *excepts) { | |||
2312 | delTagFromReading(*r, tter); | |||
2313 | } | |||
2314 | } | |||
2315 | } | |||
2316 | ||||
2317 | if (cohort->wread) { | |||
2318 | cCohort->wread = alloc_reading(cCohort); | |||
2319 | for (auto hash : cohort->wread->tags_list) { | |||
2320 | hash = addTagToReading(*cCohort->wread, hash); | |||
2321 | if (updateValidRules(rules, intersects, hash, *cCohort->wread)) { | |||
2322 | iter_rules = intersects.find(rule->number); | |||
2323 | iter_rules_end = intersects.end(); | |||
2324 | } | |||
2325 | } | |||
2326 | } | |||
2327 | ||||
2328 | current.parent->cohort_map[cCohort->global_number] = cCohort; | |||
2329 | current.parent->dep_window[cCohort->global_number] = cCohort; | |||
2330 | ||||
2331 | CohortSet edges; | |||
2332 | collect_subtree(edges, attach, childset); | |||
2333 | ||||
2334 | if (rule->flags & RF_BEFORE) { | |||
2335 | attach->parent->cohorts.insert(attach->parent->cohorts.begin() + edges.front()->local_number, cCohort); | |||
2336 | attach->parent->all_cohorts.insert(std::find(attach->parent->all_cohorts.begin() + edges.front()->local_number, attach->parent->all_cohorts.end(), edges.front()), cCohort); | |||
2337 | attachParentChild(*edges.front(), *cCohort); | |||
2338 | } | |||
2339 | else { | |||
2340 | attach->parent->cohorts.insert(attach->parent->cohorts.begin() + edges.back()->local_number + 1, cCohort); | |||
2341 | attach->parent->all_cohorts.insert(std::find(attach->parent->all_cohorts.begin() + edges.back()->local_number, attach->parent->all_cohorts.end(), edges.back()) + 1, cCohort); | |||
2342 | attachParentChild(*edges.back(), *cCohort); | |||
2343 | } | |||
2344 | ||||
2345 | reindex(attach->parent); | |||
2346 | indexSingleWindow(*attach->parent); | |||
2347 | readings_changed = true; | |||
2348 | reset_cohorts_for_loop = true; | |||
2349 | } | |||
2350 | } | |||
2351 | else if (rule->type == K_SETPARENT || rule->type == K_SETCHILD || rule->type == K_ADDRELATION || rule->type == K_SETRELATION || rule->type == K_REMRELATION || rule->type == K_ADDRELATIONS || rule->type == K_SETRELATIONS || rule->type == K_REMRELATIONS) { | |||
2352 | auto dep_target_cb = [&]() -> bool { | |||
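| // Shared callback for SETPARENT/SETCHILD and the relation rules: returns true when | |||
| // this target is done (attachment made, or relations handled), false when the outer | |||
| // loop should keep scanning for another attachment point. | |||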
2353 | Cohort* target = context_stack.back().target.cohort; | |||
2354 | Cohort* attach = context_stack.back().attach_to.cohort; | |||
2355 | swapper<Cohort*> sw((rule->flags & RF_REVERSE) != 0, target, attach); | |||
2356 | if (rule->type == K_SETPARENT || rule->type == K_SETCHILD) { | |||
2357 | bool attached = false; | |||
2358 | if (rule->type == K_SETPARENT) { | |||
2359 | attached = attachParentChild(*attach, *target, (rule->flags & RF_ALLOWLOOP) != 0, (rule->flags & RF_ALLOWCROSS) != 0); | |||
2360 | } | |||
2361 | else { | |||
2362 | attached = attachParentChild(*target, *attach, (rule->flags & RF_ALLOWLOOP) != 0, (rule->flags & RF_ALLOWCROSS) != 0); | |||
2363 | } | |||
2364 | if (attached) { | |||
2365 | index_ruleCohort_no.clear(); | |||
2366 | // force TRACE to use target | |||
2367 | Cohort* at_was = context_stack.back().attach_to.cohort; | |||
2368 | context_stack.back().attach_to.cohort = nullptr; | |||
2369 | TRACE; | |||
2370 | context_stack.back().attach_to.cohort = at_was; | |||
2371 | context_stack.back().target.subreading->noprint = false; | |||
2372 | has_dep = true; | |||
2373 | readings_changed = true; | |||
2374 | } | |||
2375 | return attached; | |||
2376 | } | |||
2377 | else if (rule->type == K_ADDRELATION || rule->type == K_SETRELATION || rule->type == K_REMRELATION) { | |||
2378 | bool rel_did_anything = false; | |||
2379 | auto theTags = ss_taglist.get(); | |||
2380 | getTagList(*rule->maplist, theTags); | |||
2381 | for (auto tter : *theTags) { | |||
2382 | VARSTRINGIFY(tter); | |||
2383 | if (rule->type == K_ADDRELATION) { | |||
2384 | attach->setRelated(); | |||
2385 | target->setRelated(); | |||
2386 | rel_did_anything |= target->addRelation(tter->hash, attach->global_number); | |||
2387 | add_relation_rtag(target, tter, attach->global_number); | |||
2388 | } | |||
2389 | else if (rule->type == K_SETRELATION) { | |||
2390 | attach->setRelated(); | |||
2391 | target->setRelated(); | |||
2392 | rel_did_anything |= target->setRelation(tter->hash, attach->global_number); | |||
2393 | set_relation_rtag(target, tter, attach->global_number); | |||
2394 | } | |||
2395 | else { | |||
2396 | rel_did_anything |= target->remRelation(tter->hash, attach->global_number); | |||
2397 | rem_relation_rtag(target, tter, attach->global_number); | |||
2398 | } | |||
2399 | } | |||
2400 | if (rel_did_anything) { | |||
2401 | index_ruleCohort_no.clear(); | |||
2402 | // force TRACE to use target | |||
2403 | Cohort* at_was = context_stack.back().attach_to.cohort; | |||
2404 | context_stack.back().attach_to.cohort = nullptr; | |||
2405 | TRACE; | |||
2406 | context_stack.back().attach_to.cohort = at_was; | |||
2407 | context_stack.back().target.subreading->noprint = false; | |||
2408 | readings_changed = true; | |||
2409 | } | |||
2410 | // don't scan onwards if failed | |||
2411 | return true; | |||
2412 | } | |||
2413 | else if (rule->type == K_ADDRELATIONS || rule->type == K_SETRELATIONS || rule->type == K_REMRELATIONS) { | |||
2414 | bool rel_did_anything = false; | |||
2415 | ||||
2416 | auto sublist = ss_taglist.get(); | |||
2417 | getTagList(*rule->sublist, sublist); | |||
2418 | ||||
2419 | auto maplist = ss_taglist.get(); | |||
2420 | getTagList(*rule->maplist, maplist); | |||
2421 | ||||
2422 | for (auto tter : *maplist) { | |||
2423 | VARSTRINGIFY(tter); | |||
2424 | if (rule->type == K_ADDRELATIONS) { | |||
2425 | target->setRelated(); | |||
2426 | rel_did_anything |= target->addRelation(tter->hash, attach->global_number); | |||
2427 | add_relation_rtag(target, tter, attach->global_number); | |||
2428 | } | |||
2429 | else if (rule->type == K_SETRELATIONS) { | |||
2430 | target->setRelated(); | |||
2431 | rel_did_anything |= target->setRelation(tter->hash, attach->global_number); | |||
2432 | set_relation_rtag(target, tter, attach->global_number); | |||
2433 | } | |||
2434 | else { | |||
2435 | rel_did_anything |= target->remRelation(tter->hash, attach->global_number); | |||
2436 | rem_relation_rtag(target, tter, attach->global_number); | |||
2437 | } | |||
2438 | } | |||
2439 | for (auto tter : *sublist) { | |||
2440 | VARSTRINGIFY(tter); | |||
2441 | if (rule->type == K_ADDRELATIONS) { | |||
2442 | attach->setRelated(); | |||
2443 | rel_did_anything |= attach->addRelation(tter->hash, target->global_number); | |||
2444 | add_relation_rtag(attach, tter, target->global_number); | |||
2445 | } | |||
2446 | else if (rule->type == K_SETRELATIONS) { | |||
2447 | attach->setRelated(); | |||
2448 | rel_did_anything |= attach->setRelation(tter->hash, target->global_number); | |||
2449 | set_relation_rtag(attach, tter, target->global_number); | |||
2450 | } | |||
2451 | else { | |||
2452 | rel_did_anything |= attach->remRelation(tter->hash, target->global_number); | |||
2453 | rem_relation_rtag(attach, tter, target->global_number); | |||
2454 | } | |||
2455 | } | |||
2456 | if (rel_did_anything) { | |||
2457 | index_ruleCohort_no.clear(); | |||
2458 | // force TRACE to use target | |||
2459 | Cohort* at_was = context_stack.back().attach_to.cohort; | |||
2460 | context_stack.back().attach_to.cohort = nullptr; | |||
2461 | TRACE; | |||
2462 | context_stack.back().attach_to.cohort = at_was; | |||
2463 | context_stack.back().target.subreading->noprint = false; | |||
2464 | readings_changed = true; | |||
2465 | } | |||
2466 | // don't scan onwards if failed | |||
2467 | return true; | |||
2468 | } | |||
2469 | return true; | |||
2470 | }; | |||
2471 | int32_t orgoffset = rule->dep_target->offset; | |||
2472 | auto seen_targets = ss_u32sv.get(); | |||
2473 | ||||
2474 | ReadingSpec orgtarget = context_stack.back().target; | |||
2475 | while (true) { | |||
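| // Retry dep_target from each failed attachment point until something attaches, a | |||
| // barrier or the NEAREST flag stops the scan, or a previously tried cohort recurs. | |||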
2476 | auto utags = ss_utags.get(); | |||
2477 | auto usets = ss_usets.get(); | |||
2478 | *utags = *context_stack.back().unif_tags; | |||
2479 | *usets = *context_stack.back().unif_sets; | |||
2480 | ||||
2481 | Cohort* attach = nullptr; | |||
2482 | Cohort* target = context_stack.back().target.cohort; | |||
2483 | seen_targets->insert(target->global_number); | |||
2484 | dep_deep_seen.clear(); | |||
2485 | tmpl_cntx.clear(); | |||
2486 | context_stack.back().attach_to.cohort = nullptr; | |||
2487 | context_stack.back().attach_to.reading = nullptr; | |||
2488 | context_stack.back().attach_to.subreading = nullptr; | |||
2489 | seen_barrier = false; | |||
2490 | if (runContextualTest(target->parent, target->local_number, rule->dep_target, &attach) && attach) { | |||
2491 | profileRuleContext(true, rule, rule->dep_target); | |||
2492 | ||||
2493 | bool break_after = seen_barrier || (rule->flags & RF_NEAREST); | |||
2494 | if (get_attach_to().cohort) { | |||
2495 | attach = get_attach_to().cohort; | |||
2496 | } | |||
2497 | context_target = attach; | |||
2498 | bool good = true; | |||
2499 | for (auto it : rule->dep_tests) { | |||
2500 | context_stack.back().mark = attach; | |||
2501 | dep_deep_seen.clear(); | |||
2502 | tmpl_cntx.clear(); | |||
2503 | bool test_good = (runContextualTest(attach->parent, attach->local_number, it) != nullptr); | |||
2504 | ||||
2505 | profileRuleContext(test_good, rule, it); | |||
2506 | ||||
2507 | if (!test_good) { | |||
2508 | good = test_good; | |||
2509 | break; | |||
2510 | } | |||
2511 | } | |||
2512 | if (!get_attach_to().cohort) { | |||
2513 | context_stack.back().attach_to.cohort = attach; | |||
2514 | } | |||
2515 | if (good) { | |||
2516 | ReadingSpec temp = context_stack.back().target; | |||
2517 | context_stack.back().target = orgtarget; | |||
2518 | bool attached = dep_target_cb(); | |||
2519 | if (attached) { | |||
2520 | break; | |||
2521 | } | |||
2522 | else { | |||
2523 | context_stack.back().target = temp; | |||
2524 | } | |||
2525 | } | |||
2526 | if (break_after) { | |||
2527 | break; | |||
2528 | } | |||
2529 | if (seen_targets->count(attach->global_number)) { | |||
2530 | // We've found a cohort we have seen before... | |||
2531 | // We assume running the test again would result in the same, so don't bother. | |||
2532 | break; | |||
2533 | } | |||
2534 | // Did not successfully attach due to loop restrictions; look onwards from here | |||
2535 | context_stack.back().target = context_stack.back().attach_to; | |||
2536 | context_stack.back().unif_tags->swap(utags); | |||
2537 | context_stack.back().unif_sets->swap(usets); | |||
2538 | if (rule->dep_target->offset != 0) { | |||
2539 | // Temporarily set offset to +/- 1 | |||
2540 | rule->dep_target->offset = ((rule->dep_target->offset < 0) ? -1 : 1); | |||
2541 | } | |||
2542 | } | |||
2543 | else { | |||
2544 | break; | |||
2545 | } | |||
2546 | } | |||
2547 | rule->dep_target->offset = orgoffset; | |||
2548 | finish_reading_loop = false; | |||
2549 | } | |||
2550 | else if (rule->type == K_MOVE_AFTER || rule->type == K_MOVE_BEFORE || rule->type == K_SWITCH) { | |||
2551 | // this is a per-cohort rule | |||
2552 | finish_reading_loop = false; | |||
2553 | // Calculate hash of current state to later compare whether this move/switch actually did anything | |||
2554 | uint32_t phash = 0; | |||
2555 | uint32_t chash = 0; | |||
2556 | for (const auto& c : current.cohorts) { | |||
2557 | phash = hash_value(c->global_number, phash); | |||
2558 | chash = hash_value(c->readings[0]->hash, chash); | |||
2559 | } | |||
2560 | ||||
2561 | // ToDo: ** tests will not correctly work for MOVE/SWITCH; cannot move cohorts between windows | |||
2562 | Cohort* attach = nullptr; | |||
2563 | Cohort* cohort = context_stack.back().target.cohort; | |||
2564 | uint32_t c = cohort->local_number; | |||
2565 | dep_deep_seen.clear(); | |||
2566 | tmpl_cntx.clear(); | |||
2567 | context_stack.back().attach_to.cohort = nullptr; | |||
2568 | context_stack.back().attach_to.reading = nullptr; | |||
2569 | context_stack.back().attach_to.subreading = nullptr; | |||
2570 | if (runContextualTest(&current, c, rule->dep_target, &attach) && attach && cohort->parent == attach->parent) { | |||
2571 | profileRuleContext(true, rule, rule->dep_target); | |||
2572 | ||||
2573 | if (get_attach_to().cohort) { | |||
2574 | attach = get_attach_to().cohort; | |||
2575 | } | |||
2576 | context_target = attach; | |||
2577 | bool good = true; | |||
2578 | for (auto it : rule->dep_tests) { | |||
2579 | context_stack.back().mark = attach; | |||
2580 | dep_deep_seen.clear(); | |||
2581 | tmpl_cntx.clear(); | |||
2582 | bool test_good = (runContextualTest(attach->parent, attach->local_number, it) != nullptr); | |||
2583 | ||||
2584 | profileRuleContext(test_good, rule, it); | |||
2585 | ||||
2586 | if (!test_good) { | |||
2587 | good = test_good; | |||
2588 | break; | |||
2589 | } | |||
2590 | } | |||
2591 | ||||
2592 | if (!good || cohort == attach || cohort->local_number == 0) { | |||
2593 | return; | |||
2594 | } | |||
2595 | ||||
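// With RF_REVERSE the roles are flipped; swapper presumably exchanges the two cohort pointers
// for the duration of this scope.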
2596 | swapper<Cohort*> sw((rule->flags & RF_REVERSE) != 0, attach, cohort); | |||
2597 | CohortSet cohorts; | |||
2598 | ||||
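// SWITCH exchanges the two cohorts in place; MOVE excises the moving subtree and re-inserts it
// before/after the attachment edge. Hypothetical example: MOVE ("x") AFTER ("y") on "a x b y c"
// should yield "a b y x c".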
2599 | if (rule->type == K_SWITCH) { | |||
2600 | if (attach->local_number == 0) { | |||
2601 | return; | |||
2602 | } | |||
2603 | current.cohorts[cohort->local_number] = attach; | |||
2604 | current.cohorts[attach->local_number] = cohort; | |||
2605 | cohorts.insert(attach); | |||
2606 | cohorts.insert(cohort); | |||
2607 | auto ac_c = std::find(current.all_cohorts.begin() + cohort->local_number, current.all_cohorts.end(), cohort); | |||
2608 | auto ac_a = std::find(current.all_cohorts.begin() + attach->local_number, current.all_cohorts.end(), attach); | |||
2609 | *ac_c = attach; | |||
2610 | *ac_a = cohort; | |||
2611 | } | |||
2612 | else { | |||
2613 | CohortSet edges; | |||
2614 | collect_subtree(edges, attach, rule->childset2); | |||
2615 | collect_subtree(cohorts, cohort, rule->childset1); | |||
2616 | ||||
2617 | bool need_clean = false; | |||
2618 | for (auto iter : cohorts) { | |||
2619 | if (edges.count(iter)) { | |||
2620 | need_clean = true; | |||
2621 | break; | |||
2622 | } | |||
2623 | } | |||
2624 | ||||
2625 | if (need_clean) { | |||
2626 | if (isChildOf(cohort, attach)) { | |||
2627 | edges.erase(cohorts.rbegin(), cohorts.rend()); | |||
2628 | } | |||
2629 | else /* if (isChildOf(attach, cohort)) */ { | |||
2630 | cohorts.erase(edges.rbegin(), edges.rend()); | |||
2631 | } | |||
2632 | } | |||
2633 | if (cohorts.empty() || edges.empty()) { | |||
2634 | finish_reading_loop = false; | |||
2635 | return; | |||
2636 | } | |||
2637 | ||||
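// Detach the moving cohorts in reverse order so earlier local_number indices stay valid,
// then renumber the cohorts that remain.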
2638 | for (auto c : reversed(cohorts)) { | |||
2639 | current.cohorts.erase(current.cohorts.begin() + c->local_number); | |||
2640 | current.all_cohorts.erase(std::find(current.all_cohorts.begin() + c->local_number, current.all_cohorts.end(), c)); | |||
2641 | } | |||
2642 | ||||
2643 | foreach (iter, current.cohorts) { | |||
2644 | (*iter)->local_number = UI32(std::distance(current.cohorts.begin(), iter)); | |||
2645 | } | |||
2646 | ||||
2647 | for (auto iter : edges) { | |||
2648 | if (iter->parent != get_apply_to().cohort->parent) { | |||
2649 | u_fprintf(ux_stderr, "Error: Move/Switch on line %u tried to move across window boundaries.\n", rule->line); | |||
2650 | CG3Quit(1); | |||
2651 | } | |||
2652 | for (auto cohort : cohorts) { | |||
2653 | if (iter == cohort) { | |||
2654 | u_fprintf(ux_stderr, "Error: Move/Switch on line %u tried to move to a removed position.\n", rule->line); | |||
2655 | CG3Quit(1); | |||
2656 | } | |||
2657 | } | |||
2658 | } | |||
2659 | ||||
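// Pick the insertion point: just before the first edge cohort for MOVE BEFORE (but never before
// the window's 0th cohort), or just after the last edge cohort for MOVE AFTER.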
2660 | uint32_t spot = 0; | |||
2661 | auto ac_spot = current.all_cohorts.begin(); | |||
2662 | if (rule->type == K_MOVE_BEFORE) { | |||
2663 | spot = edges.front()->local_number; | |||
2664 | if (spot == 0) { | |||
2665 | spot = 1; | |||
2666 | } | |||
2667 | ac_spot = std::find(current.all_cohorts.begin() + edges.front()->local_number, current.all_cohorts.end(), edges.front()); | |||
2668 | if ((*ac_spot)->local_number == 0) { | |||
2669 | ++ac_spot; | |||
2670 | } | |||
2671 | } | |||
2672 | else if (rule->type == K_MOVE_AFTER) { | |||
2673 | spot = edges.back()->local_number + 1; | |||
2674 | ac_spot = std::find(current.all_cohorts.begin() + edges.front()->local_number, current.all_cohorts.end(), edges.back()); | |||
2675 | ++ac_spot; | |||
2676 | } | |||
2677 | ||||
2678 | if (spot > current.cohorts.size()) { | |||
2679 | u_fprintf(ux_stderr, "Error: Move/Switch on line %u tried to move out of bounds.\n", rule->line); | |||
2680 | CG3Quit(1); | |||
2681 | } | |||
2682 | ||||
2683 | for (auto c : reversed(cohorts)) { | |||
2684 | current.cohorts.insert(current.cohorts.begin() + spot, c); | |||
2685 | current.all_cohorts.insert(ac_spot, c); | |||
2686 | } | |||
2687 | } | |||
2688 | reindex(); | |||
2689 | ||||
2690 | // Compare whether this move/switch actually did anything | |||
2691 | uint32_t phash_n = 0; | |||
2692 | uint32_t chash_n = 0; | |||
2693 | for (const auto& c : current.cohorts) { | |||
2694 | phash_n = hash_value(c->global_number, phash_n); | |||
2695 | chash_n = hash_value(c->readings[0]->hash, chash_n); | |||
2696 | } | |||
2697 | ||||
2698 | if (phash != phash_n || chash != chash_n) { | |||
2699 | if (++rule_hits[rule->number] > current.cohorts.size() * 100) { | |||
2700 | u_fprintf(ux_stderr, "Warning: Move/Switch endless loop detected for rule on line %u around input line %u - bailing out!\n", rule->line, get_apply_to().cohort->line_number); | |||
2701 | should_bail = true; | |||
2702 | finish_cohort_loop = false; | |||
2703 | return; | |||
2704 | } | |||
2705 | ||||
2706 | for (auto c : cohorts) { | |||
2707 | for (auto iter : c->readings) { | |||
2708 | iter->hit_by.push_back(rule->number); | |||
2709 | } | |||
2710 | } | |||
2711 | readings_changed = true; | |||
2712 | sorter.do_sort = true; | |||
2713 | } | |||
2714 | } | |||
2715 | } | |||
2716 | else if (rule->type == K_WITH) { | |||
2717 | TRACE; | |||
2718 | bool any_readings_changed = false; | |||
2719 | readings_changed = false; | |||
2720 | in_nested = true; | |||
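// WITH: run each sub-rule against the already-matched context; a sub-rule flagged REPEAT is
// re-run until it stops changing readings.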
2721 | for (auto& sr : rule->sub_rules) { | |||
2722 | Rule* cur_was = current_rule; | |||
2723 | Rule* rule_was = rule; | |||
2724 | current_rule = sr; | |||
2725 | rule = sr; | |||
2726 | bool result = false; | |||
2727 | do { | |||
2728 | readings_changed = false; | |||
2729 | result = runSingleRule(current, *rule, reading_cb, cohort_cb); | |||
2730 | any_readings_changed = any_readings_changed || result || readings_changed; | |||
2731 | } while ((result || readings_changed) && (rule->flags & RF_REPEAT) != 0); | |||
2732 | current_rule = cur_was; | |||
2733 | rule = rule_was; | |||
2734 | } | |||
2735 | in_nested = false; | |||
2736 | readings_changed = any_readings_changed; | |||
2737 | finish_reading_loop = false; | |||
2738 | } | |||
2739 | else if (rule->type != K_REMCOHORT) { | |||
2740 | TRACE; | |||
2741 | } | |||
2742 | }; | |||
2743 | ||||
2744 | removed.resize(0); | |||
2745 | selected.resize(0); | |||
2746 | bool rv = runSingleRule(current, *rule, reading_cb, cohort_cb); | |||
2747 | if (rv || readings_changed) { | |||
2748 | if (!(rule->flags & RF_NOITERATE) && section_max_count != 1) { | |||
2749 | section_did_something = true; | |||
2750 | } | |||
2751 | rule_did_something = true; | |||
2752 | } | |||
2753 | if (should_bail) { | |||
2754 | goto bailout; | |||
2755 | } | |||
2756 | if (should_repeat) { | |||
2757 | goto repeat_rule; | |||
2758 | } | |||
2759 | ||||
2760 | if (rule_did_something) { | |||
2761 | if (trace_rules.contains(rule->line)) { | |||
2762 | retval |= RV_TRACERULE; | |||
2763 | } | |||
2764 | } | |||
2765 | if (delimited) { | |||
2766 | break; | |||
2767 | } | |||
2768 | if (rule_did_something && (rule->flags & RF_REPEAT)) { | |||
2769 | index_ruleCohort_no.clear(); | |||
2770 | goto repeat_rule; | |||
2771 | } | |||
2772 | ||||
2773 | if (false) { | |||
2774 | bailout: | |||
2775 | rule_hits[rule->number] = 0; | |||
2776 | index_ruleCohort_no.clear(); | |||
2777 | } | |||
2778 | ||||
2779 | if (retval & RV_TRACERULE) { | |||
2780 | break; | |||
2781 | } | |||
2782 | } | |||
2783 | ||||
2784 | if (section_did_something) { | |||
2785 | retval |= RV_SOMETHING; | |||
2786 | } | |||
2787 | if (delimited) { | |||
2788 | retval |= RV_DELIMITED; | |||
2789 | } | |||
2790 | return retval; | |||
2791 | } | |||
2792 | ||||
2793 | uint32_t GrammarApplicator::runGrammarOnSingleWindow(SingleWindow& current) { | |||
2794 | if (!grammar->before_sections.empty() && !no_before_sections) { | |||
2795 | uint32_t rv = runRulesOnSingleWindow(current, runsections[-1]); | |||
2796 | if (rv & (RV_DELIMITED | RV_TRACERULE)) { | |||
2797 | return rv; | |||
2798 | } | |||
2799 | } | |||
2800 | ||||
2801 | if (!grammar->rules.empty() && !no_sections) { | |||
2802 | std::map<uint32_t, uint32_t> counter; | |||
2803 | // Caveat: This may look as if it is not recursing previous sections, but those rules are preprocessed into the successive sections so they are actually run. | |||
2804 | auto iter = runsections.begin(); | |||
2805 | auto iter_end = runsections.end(); | |||
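// A section is re-run (tracked by 'pass') as long as it keeps doing something, bounded by
// section_max_count per section and a 1000-pass ceiling as an endless-loop guard.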
2806 | for (size_t pass = 0; iter != iter_end; ++pass) { | |||
2807 | if (iter->first < 0 || (section_max_count && counter[iter->first] >= section_max_count)) { | |||
2808 | ++iter; | |||
2809 | continue; | |||
2810 | } | |||
2811 | uint32_t rv = 0; | |||
2812 | if (debug_level > 0) { | |||
2813 | std::cerr << "Running section " << iter->first << " (rules " << *(iter->second.begin()) << " through " << *(--(iter->second.end())) << ") on window " << current.number << std::endl; | |||
2814 | } | |||
2815 | rv = runRulesOnSingleWindow(current, iter->second); | |||
2816 | ++counter[iter->first]; | |||
2817 | if (rv & (RV_DELIMITED | RV_TRACERULE)) { | |||
2818 | return rv; | |||
2819 | } | |||
2820 | if (!(rv & RV_SOMETHING)) { | |||
2821 | ++iter; | |||
2822 | pass = 0; | |||
2823 | } | |||
2824 | if (pass >= 1000) { | |||
2825 | u_fprintf(ux_stderr, "Warning: Endless loop detected before input line %u. Window contents was:", numLines); | |||
2826 | UString tag; | |||
2827 | for (size_t i = 1; i < current.cohorts.size(); ++i) { | |||
2828 | Tag* t = current.cohorts[i]->wordform; | |||
2829 | tag.assign(t->tag.begin() + 2, t->tag.begin() + t->tag.size() - 2); | |||
2830 | u_fprintf(ux_stderr, " %S", tag.data()); | |||
2831 | } | |||
2832 | u_fprintf(ux_stderr, "\n"); | |||
2833 | u_fflush(ux_stderr); | |||
2834 | break; | |||
2835 | } | |||
2836 | } | |||
2837 | } | |||
2838 | ||||
2839 | if (!grammar->after_sections.empty() && !no_after_sections) { | |||
2840 | uint32_t rv = runRulesOnSingleWindow(current, runsections[-2]); | |||
2841 | if (rv & (RV_DELIMITED | RV_TRACERULE)) { | |||
2842 | return rv; | |||
2843 | } | |||
2844 | } | |||
2845 | ||||
2846 | return 0; | |||
2847 | } | |||
2848 | ||||
2849 | void GrammarApplicator::runGrammarOnWindow() { | |||
2850 | SingleWindow* current = gWindow->current; | |||
2851 | did_final_enclosure = false; | |||
2852 | ||||
2853 | for (const auto& vit : current->variables_set) { | |||
2854 | variables[vit.first] = vit.second; | |||
2855 | } | |||
2856 | for (auto vit : current->variables_rem) { | |||
2857 | variables.erase(vit); | |||
2858 | } | |||
2859 | variables[mprefix_key] = mprefix_value; | |||
2860 | ||||
2861 | if (has_dep) { | |||
2862 | reflowDependencyWindow(); | |||
2863 | if (!input_eof && !gWindow->next.empty() && gWindow->next.back()->cohorts.size() > 1) { | |||
2864 | for (auto cohort : gWindow->next.back()->cohorts) { | |||
2865 | gWindow->dep_window[cohort->global_number] = cohort; | |||
2866 | } | |||
2867 | } | |||
2868 | } | |||
2869 | if (has_relations) { | |||
2870 | reflowRelationWindow(); | |||
2871 | } | |||
2872 | ||||
2873 | if (!grammar->parentheses.empty()) { | |||
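// Scan right-to-left for a cohort opening a parenthesis pair, find its matching closer, and
// hide the parenthesised cohorts from the window as an enclosure (CT_ENCLOSED) until unpacked below.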
2874 | label_scanParentheses: | |||
2875 | reverse_foreach (iter, current->cohorts) { | |||
2876 | Cohort* c = *iter; | |||
2877 | if (c->is_pleft == 0) { | |||
2878 | continue; | |||
2879 | } | |||
2880 | auto p = grammar->parentheses.find(c->is_pleft); | |||
2881 | if (p != grammar->parentheses.end()) { | |||
2882 | auto right = iter.base(); | |||
2883 | --right; | |||
2884 | --right; | |||
2885 | c = *right; | |||
2886 | ++right; | |||
2887 | bool found = false; | |||
2888 | CohortVector encs; | |||
2889 | for (; right != current->cohorts.end(); ++right) { | |||
2890 | Cohort* s = *right; | |||
2891 | encs.push_back(s); | |||
2892 | if (s->is_pright == p->second) { | |||
2893 | found = true; | |||
2894 | break; | |||
2895 | } | |||
2896 | } | |||
2897 | if (found) { | |||
2898 | auto left = iter.base(); | |||
2899 | --left; | |||
2900 | uint32_t lc = (*left)->local_number; | |||
2901 | ++right; | |||
2902 | for (; right != current->cohorts.end(); ++right) { | |||
2903 | *left = *right; | |||
2904 | (*left)->local_number = lc; | |||
2905 | ++lc; | |||
2906 | ++left; | |||
2907 | } | |||
2908 | current->cohorts.resize(current->cohorts.size() - encs.size()); | |||
2909 | auto ec = std::find(current->all_cohorts.begin() + encs.front()->local_number, current->all_cohorts.end(), encs.front()); | |||
2910 | --ec; | |||
2911 | do { | |||
2912 | ++ec; | |||
2913 | (*ec)->type |= CT_ENCLOSED; | |||
2914 | ++((*ec)->enclosed); | |||
2915 | } while (*ec != encs.back()); | |||
2916 | current->has_enclosures = true; | |||
2917 | goto label_scanParentheses; | |||
2918 | } | |||
2919 | } | |||
2920 | } | |||
2921 | } | |||
2922 | ||||
2923 | par_left_tag = 0; | |||
2924 | par_right_tag = 0; | |||
2925 | par_left_pos = 0; | |||
2926 | par_right_pos = 0; | |||
2927 | uint32_t pass = 0; | |||
2928 | ||||
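// Windows that have slid outside the configured number of kept windows (num_windows) are
// printed and freed before this pass.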
2929 | label_runGrammarOnWindow_begin: | |||
2930 | while (!gWindow->previous.empty() && gWindow->previous.size() > num_windows) { | |||
2931 | SingleWindow* tmp = gWindow->previous.front(); | |||
2932 | printSingleWindow(tmp, *ux_stdout); | |||
2933 | free_swindow(tmp); | |||
2934 | gWindow->previous.erase(gWindow->previous.begin()); | |||
2935 | } | |||
2936 | ||||
2937 | rule_hits.clear(); | |||
2938 | index_ruleCohort_no.clear(); | |||
2939 | current = gWindow->current; | |||
2940 | indexSingleWindow(*current); | |||
2941 | current->hit_external.clear(); | |||
2942 | gWindow->rebuildCohortLinks(); // ToDo: Hack. This can be done better... | |||
2943 | ||||
2944 | ++pass; | |||
2945 | if (pass > 1000) { | |||
2946 | u_fprintf(ux_stderr, "Warning: Endless loop detected before input line %u. Window contents was:", numLines); | |||
2947 | UString tag; | |||
2948 | for (size_t i = 1; i < current->cohorts.size(); ++i) { | |||
2949 | Tag* t = current->cohorts[i]->wordform; | |||
2950 | tag.assign(t->tag.begin() + 2, t->tag.begin() + t->tag.size() - 2); | |||
2951 | u_fprintf(ux_stderr, " %S", tag.data()); | |||
2952 | } | |||
2953 | u_fprintf(ux_stderr, "\n"); | |||
2954 | u_fflush(ux_stderr); | |||
2955 | return; | |||
2956 | } | |||
2957 | ||||
2958 | if (trace_encl) { | |||
2959 | uint32_t hitpass = std::numeric_limits<uint32_t>::max() - pass; | |||
2960 | for (auto& c : current->cohorts) { | |||
2961 | for (auto rit : c->readings) { | |||
2962 | rit->hit_by.push_back(hitpass); | |||
2963 | } | |||
2964 | } | |||
2965 | } | |||
2966 | ||||
2967 | uint32_t rv = runGrammarOnSingleWindow(*current); | |||
2968 | if (rv & RV_DELIMITED) { | |||
2969 | goto label_runGrammarOnWindow_begin; | |||
2970 | } | |||
2971 | ||||
2972 | label_unpackEnclosures: | |||
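// Unpack one enclosure level per pass: cohorts whose enclosure depth reaches 0 are spliced back
// into current->cohorts and the grammar is re-run on the widened window.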
2973 | if (current->has_enclosures) { | |||
2974 | size_t nc = current->all_cohorts.size(); | |||
2975 | for (size_t i = 0; i < nc; ++i) { | |||
2976 | Cohort* c = current->all_cohorts[i]; | |||
2977 | if (c->enclosed == 1) { | |||
2978 | size_t la = i; | |||
2979 | for (; la > 0; --la) { | |||
2980 | if (!(current->all_cohorts[la - 1]->type & (CT_ENCLOSED | CT_REMOVED | CT_IGNORED))) { | |||
2981 | --la; | |||
2982 | break; | |||
2983 | } | |||
2984 | } | |||
2985 | size_t ni = current->all_cohorts[la]->local_number; | |||
2986 | ||||
2987 | size_t ra = i; | |||
2988 | size_t ne = 0; | |||
2989 | for (; ra < nc; ++ra) { | |||
2990 | if (!(current->all_cohorts[ra]->type & (CT_ENCLOSED | CT_REMOVED | CT_IGNORED))) { | |||
2991 | break; | |||
2992 | } | |||
2993 | --(current->all_cohorts[ra]->enclosed); | |||
2994 | if (current->all_cohorts[ra]->enclosed == 0) { | |||
2995 | current->all_cohorts[ra]->type &= ~CT_ENCLOSED; | |||
2996 | ++ne; | |||
2997 | } | |||
2998 | } | |||
2999 | ||||
3000 | current->cohorts.resize(current->cohorts.size() + ne, nullptr); | |||
3001 | for (size_t j = current->cohorts.size() - 1; j > ni + ne; --j) { | |||
3002 | current->cohorts[j] = current->cohorts[j - ne]; | |||
3003 | current->cohorts[j]->local_number = UI32(j); | |||
3004 | current->cohorts[j - ne] = nullptr; | |||
3005 | } | |||
3006 | for (size_t j = 0; i < ra; ++i) { | |||
3007 | if (current->all_cohorts[i]->enclosed == 0) { | |||
3008 | current->cohorts[ni + j + 1] = current->all_cohorts[i]; | |||
3009 | current->cohorts[ni + j + 1]->local_number = UI32(ni + j + 1); | |||
3010 | current->cohorts[ni + j + 1]->parent = current; | |||
3011 | ++j; | |||
3012 | } | |||
3013 | } | |||
3014 | par_left_tag = current->all_cohorts[la + 1]->is_pleft; | |||
3015 | par_right_tag = current->all_cohorts[ra - 1]->is_pright; | |||
3016 | par_left_pos = UI32(ni + 1); | |||
3017 | par_right_pos = UI32(ni + ne); | |||
3018 | if (rv & RV_TRACERULE) { | |||
3019 | goto label_unpackEnclosures; | |||
3020 | } | |||
3021 | goto label_runGrammarOnWindow_begin; | |||
3022 | } | |||
3023 | } | |||
3024 | if (!did_final_enclosure) { | |||
3025 | par_left_tag = 0; | |||
3026 | par_right_tag = 0; | |||
3027 | par_left_pos = 0; | |||
3028 | par_right_pos = 0; | |||
3029 | did_final_enclosure = true; | |||
3030 | if (rv & RV_TRACERULE) { | |||
3031 | goto label_unpackEnclosures; | |||
3032 | } | |||
3033 | goto label_runGrammarOnWindow_begin; | |||
3034 | } | |||
3035 | } | |||
3036 | ||||
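// Re-insert cohorts that rules marked CT_IGNORED, each after the nearest preceding visible cohort,
// and reflow the dependency window if anything was restored.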
3037 | bool should_reflow = false; | |||
3038 | for (size_t i = current->all_cohorts.size(); i > 0; --i) { | |||
3039 | auto cohort = current->all_cohorts[i - 1]; | |||
3040 | if (cohort->type & CT_IGNORED) { | |||
3041 | for (auto ins = i; ins > 0; --ins) { | |||
3042 | if (!(current->all_cohorts[ins - 1]->type & (CT_REMOVED | CT_ENCLOSED | CT_IGNORED))) { | |||
3043 | current->cohorts.insert(current->cohorts.begin() + current->all_cohorts[ins - 1]->local_number + 1, cohort); | |||
3044 | cohort->type &= ~CT_IGNORED; | |||
3045 | current->parent->cohort_map.insert(std::make_pair(cohort->global_number, cohort)); | |||
3046 | should_reflow = true; | |||
3047 | break; | |||
3048 | } | |||
3049 | } | |||
3050 | } | |||
3051 | } | |||
3052 | if (should_reflow) { | |||
3053 | for (size_t i = 0; i < current->cohorts.size(); ++i) { | |||
3054 | current->cohorts[i]->local_number = UI32(i); | |||
3055 | } | |||
3056 | reflowDependencyWindow(); | |||
3057 | } | |||
3058 | } | |||
3059 | } | |||
3060 | ||||
3061 | // This helps the all_vislcg3.cpp profiling builds | |||
3062 | #undef TRACE |