File: | GrammarApplicator_reflow.cpp |
Warning: | line 801, column 2 Forming reference to null pointer |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /* | |||
2 | * Copyright (C) 2007-2024, GrammarSoft ApS | |||
3 | * Developed by Tino Didriksen <mail@tinodidriksen.com> | |||
4 | * Design by Eckhard Bick <eckhard.bick@mail.dk>, Tino Didriksen <mail@tinodidriksen.com> | |||
5 | * | |||
6 | * This program is free software: you can redistribute it and/or modify | |||
7 | * it under the terms of the GNU General Public License as published by | |||
8 | * the Free Software Foundation, either version 3 of the License, or | |||
9 | * (at your option) any later version. | |||
10 | * | |||
11 | * This program is distributed in the hope that it will be useful, | |||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
14 | * GNU General Public License for more details. | |||
15 | * | |||
16 | * You should have received a copy of the GNU General Public License | |||
17 | * along with this program. If not, see <https://www.gnu.org/licenses/>. | |||
18 | */ | |||
19 | ||||
20 | #include "GrammarApplicator.hpp" | |||
21 | #include "Strings.hpp" | |||
22 | #include "Tag.hpp" | |||
23 | #include "Grammar.hpp" | |||
24 | #include "Window.hpp" | |||
25 | #include "SingleWindow.hpp" | |||
26 | #include "Reading.hpp" | |||
27 | ||||
28 | namespace CG3 { | |||
29 | ||||
30 | Tag* GrammarApplicator::makeBaseFromWord(uint32_t tag) { | |||
31 | return makeBaseFromWord(grammar->single_tags.find(tag)->second); | |||
32 | } | |||
33 | ||||
34 | Tag* GrammarApplicator::makeBaseFromWord(Tag* tag) { | |||
35 | const size_t len = tag->tag.size(); | |||
36 | if (len < 5) { | |||
37 | return tag; | |||
38 | } | |||
39 | static thread_local UString n; | |||
40 | n.clear(); | |||
41 | n.resize(len - 2); | |||
42 | n[0] = n[len - 3] = '"'; | |||
43 | u_strncpyu_strncpy_72(&n[1], tag->tag.data() + 2, SI32(len - 4)); | |||
44 | Tag* nt = addTag(n); | |||
45 | return nt; | |||
46 | } | |||
47 | ||||
48 | bool GrammarApplicator::isChildOf(const Cohort* child, const Cohort* parent) { | |||
49 | bool retval = false; | |||
50 | ||||
51 | if (parent->global_number == child->global_number) { | |||
52 | retval = true; | |||
53 | } | |||
54 | else if (parent->global_number == child->dep_parent) { | |||
55 | retval = true; | |||
56 | } | |||
57 | else { | |||
58 | size_t i = 0; | |||
59 | for (const Cohort* inner = child; i < 1000; ++i) { | |||
60 | if (inner->dep_parent == 0 || inner->dep_parent == DEP_NO_PARENT) { | |||
61 | retval = false; | |||
62 | break; | |||
63 | } | |||
64 | auto it = gWindow->cohort_map.find(inner->dep_parent); | |||
65 | if (it != gWindow->cohort_map.end()) { | |||
66 | inner = it->second; | |||
67 | } | |||
68 | else { | |||
69 | break; | |||
70 | } | |||
71 | if (inner->dep_parent == parent->global_number) { | |||
72 | retval = true; | |||
73 | break; | |||
74 | } | |||
75 | } | |||
76 | if (i == 1000) { | |||
77 | if (verbosity_level > 0) { | |||
78 | u_fprintfu_fprintf_72( | |||
79 | ux_stderr, | |||
80 | "Warning: While testing whether %u is a child of %u the counter exceeded 1000 indicating a loop higher up in the tree.\n", | |||
81 | child->global_number, parent->global_number); | |||
82 | } | |||
83 | } | |||
84 | } | |||
85 | return retval; | |||
86 | } | |||
87 | ||||
88 | bool GrammarApplicator::wouldParentChildLoop(const Cohort* parent, const Cohort* child) { | |||
89 | bool retval = false; | |||
90 | ||||
91 | if (parent->global_number == child->global_number) { | |||
92 | retval = true; | |||
93 | } | |||
94 | else if (parent->global_number == child->dep_parent) { | |||
95 | retval = false; | |||
96 | } | |||
97 | else if (parent->global_number == parent->dep_parent) { | |||
98 | retval = false; | |||
99 | } | |||
100 | else if (parent->dep_parent == child->global_number) { | |||
101 | retval = true; | |||
102 | } | |||
103 | else { | |||
104 | size_t i = 0; | |||
105 | for (const Cohort* inner = parent; i < 1000; ++i) { | |||
106 | if (inner->dep_parent == 0 || inner->dep_parent == DEP_NO_PARENT) { | |||
107 | retval = false; | |||
108 | break; | |||
109 | } | |||
110 | auto it = gWindow->cohort_map.find(inner->dep_parent); | |||
111 | if (it != gWindow->cohort_map.end()) { | |||
112 | inner = it->second; | |||
113 | } | |||
114 | else { | |||
115 | break; | |||
116 | } | |||
117 | if (inner->dep_parent == child->global_number) { | |||
118 | retval = true; | |||
119 | break; | |||
120 | } | |||
121 | } | |||
122 | if (i == 1000) { | |||
123 | if (verbosity_level > 0) { | |||
124 | u_fprintfu_fprintf_72( | |||
125 | ux_stderr, | |||
126 | "Warning: While testing whether %u and %u would loop the counter exceeded 1000 indicating a loop higher up in the tree.\n", | |||
127 | child->global_number, parent->global_number); | |||
128 | } | |||
129 | } | |||
130 | } | |||
131 | return retval; | |||
132 | } | |||
133 | ||||
134 | bool GrammarApplicator::wouldParentChildCross(const Cohort* parent, const Cohort* child) { | |||
135 | uint32_t mn = std::min(parent->global_number, child->global_number); | |||
136 | uint32_t mx = std::max(parent->global_number, child->global_number); | |||
137 | ||||
138 | for (uint32_t i = mn + 1; i < mx; ++i) { | |||
139 | auto it = gWindow->cohort_map.find(parent->dep_parent); | |||
140 | if (it != gWindow->cohort_map.end() && it->second->dep_parent != DEP_NO_PARENT) { | |||
141 | if (it->second->dep_parent < mn || it->second->dep_parent > mx) { | |||
142 | return true; | |||
143 | } | |||
144 | } | |||
145 | } | |||
146 | ||||
147 | return false; | |||
148 | } | |||
149 | ||||
150 | bool GrammarApplicator::attachParentChild(Cohort& parent, Cohort& child, bool allowloop, bool allowcrossing) { | |||
151 | parent.dep_self = parent.global_number; | |||
152 | child.dep_self = child.global_number; | |||
153 | ||||
154 | if (!allowloop && dep_block_loops && wouldParentChildLoop(&parent, &child)) { | |||
155 | if (verbosity_level > 0) { | |||
156 | u_fprintfu_fprintf_72( | |||
157 | ux_stderr, | |||
158 | "Warning: Dependency between %u and %u would cause a loop. Will not attach them.\n", | |||
159 | child.global_number, parent.global_number); | |||
160 | } | |||
161 | return false; | |||
162 | } | |||
163 | ||||
164 | if (!allowcrossing && dep_block_crossing && wouldParentChildCross(&parent, &child)) { | |||
165 | if (verbosity_level > 0) { | |||
166 | u_fprintfu_fprintf_72( | |||
167 | ux_stderr, | |||
168 | "Warning: Dependency between %u and %u would cause crossing branches. Will not attach them.\n", | |||
169 | child.global_number, parent.global_number); | |||
170 | } | |||
171 | return false; | |||
172 | } | |||
173 | ||||
174 | if (child.dep_parent == DEP_NO_PARENT) { | |||
175 | child.dep_parent = child.dep_self; | |||
176 | } | |||
177 | auto it = gWindow->cohort_map.find(child.dep_parent); | |||
178 | if (it != gWindow->cohort_map.end()) { | |||
179 | it->second->remChild(child.dep_self); | |||
180 | } | |||
181 | ||||
182 | child.dep_parent = parent.global_number; | |||
183 | parent.addChild(child.global_number); | |||
184 | ||||
185 | parent.type |= CT_DEP_DONE; | |||
186 | child.type |= CT_DEP_DONE; | |||
187 | ||||
188 | if (!dep_has_spanned && child.parent != parent.parent) { | |||
189 | u_fprintfu_fprintf_72( | |||
190 | ux_stderr, | |||
191 | "Info: Dependency between %u and %u spans the window boundaries. Enumeration will be global from here on.\n", | |||
192 | child.global_number, parent.global_number); | |||
193 | dep_has_spanned = true; | |||
194 | } | |||
195 | return true; | |||
196 | } | |||
197 | ||||
198 | void GrammarApplicator::reflowDependencyWindow(uint32_t max) { | |||
199 | if (dep_delimit && !max && !input_eof && !gWindow->next.empty() && gWindow->next.back()->cohorts.size() > 1) { | |||
200 | max = gWindow->next.back()->cohorts[1]->global_number; | |||
201 | } | |||
202 | ||||
203 | if (gWindow->dep_window.empty() || gWindow->dep_window.begin()->second->parent == 0) { | |||
204 | gWindow->dep_window[0] = gWindow->current->cohorts[0]; | |||
205 | } | |||
206 | else if (gWindow->dep_window.find(0) == gWindow->dep_window.end()) { | |||
207 | // This has to be done in 2 steps or it will segfault on Linux for some reason... | |||
208 | // Turns out g++ evaluates left side of = first, and MSVC++ does right side first, so g++ accessed its own newly created [0] at .begin() | |||
209 | Cohort* tmp = gWindow->dep_window.begin()->second->parent->cohorts[0]; | |||
210 | gWindow->dep_window[0] = tmp; | |||
211 | } | |||
212 | if (gWindow->cohort_map.empty()) { | |||
213 | gWindow->cohort_map[0] = gWindow->current->cohorts[0]; | |||
214 | } | |||
215 | else if (gWindow->cohort_map.find(0) == gWindow->cohort_map.end()) { | |||
216 | Cohort* tmp = gWindow->current->cohorts[0]; | |||
217 | Cohort* c = gWindow->cohort_map.begin()->second; | |||
218 | if (c->parent) { | |||
219 | tmp = c->parent->cohorts[0]; | |||
220 | } | |||
221 | gWindow->cohort_map[0] = tmp; | |||
222 | } | |||
223 | ||||
224 | for (auto begin = gWindow->dep_window.begin(); begin != gWindow->dep_window.end();) { | |||
225 | while (begin != gWindow->dep_window.end() && (begin->second->type & CT_DEP_DONE || !begin->second->dep_self)) { | |||
226 | ++begin; | |||
227 | } | |||
228 | gWindow->dep_map.clear(); | |||
229 | ||||
230 | auto end = begin; | |||
231 | for (; end != gWindow->dep_window.end(); ++end) { | |||
232 | Cohort* cohort = end->second; | |||
233 | if (cohort->type & CT_DEP_DONE) { | |||
234 | continue; | |||
235 | } | |||
236 | if (!cohort->dep_self) { | |||
237 | continue; | |||
238 | } | |||
239 | if (max && cohort->global_number >= max) { | |||
240 | break; | |||
241 | } | |||
242 | if (gWindow->dep_map.find(cohort->dep_self) != gWindow->dep_map.end()) { | |||
243 | break; | |||
244 | } | |||
245 | gWindow->dep_map[cohort->dep_self] = cohort->global_number; | |||
246 | cohort->dep_self = cohort->global_number; | |||
247 | } | |||
248 | ||||
249 | if (gWindow->dep_map.empty()) { | |||
250 | break; | |||
251 | } | |||
252 | ||||
253 | gWindow->dep_map[0] = 0; | |||
254 | for (; begin != end; ++begin) { | |||
255 | Cohort* cohort = begin->second; | |||
256 | if (max && cohort->global_number >= max) { | |||
257 | break; | |||
258 | } | |||
259 | if (cohort->dep_parent == DEP_NO_PARENT) { | |||
260 | continue; | |||
261 | } | |||
262 | if (cohort->dep_self == cohort->global_number) { | |||
263 | if (!(cohort->type & CT_DEP_DONE) && gWindow->dep_map.find(cohort->dep_parent) == gWindow->dep_map.end()) { | |||
264 | if (verbosity_level > 0) { | |||
265 | u_fprintfu_fprintf_72( | |||
266 | ux_stderr, | |||
267 | "Warning: Parent %u of dep %u in cohort %u of window %u does not exist - ignoring.\n", | |||
268 | cohort->dep_parent, cohort->dep_self, cohort->local_number, cohort->parent->number); | |||
269 | u_fflushu_fflush_72(ux_stderr); | |||
270 | } | |||
271 | cohort->dep_parent = DEP_NO_PARENT; | |||
272 | } | |||
273 | else { | |||
274 | if (!(cohort->type & CT_DEP_DONE)) { | |||
275 | auto dep_real = gWindow->dep_map.find(cohort->dep_parent)->second; | |||
276 | cohort->dep_parent = dep_real; | |||
277 | } | |||
278 | gWindow->cohort_map[0] = cohort->parent->cohorts[0]; | |||
279 | auto tmp = gWindow->cohort_map.find(cohort->dep_parent); | |||
280 | if (tmp != gWindow->cohort_map.end()) { | |||
281 | tmp->second->addChild(cohort->dep_self); | |||
282 | } | |||
283 | cohort->type |= CT_DEP_DONE; | |||
284 | } | |||
285 | } | |||
286 | } | |||
287 | } | |||
288 | ||||
289 | gWindow->dep_map.clear(); | |||
290 | gWindow->dep_window.clear(); | |||
291 | } | |||
292 | ||||
293 | void GrammarApplicator::reflowRelationWindow(uint32_t max) { | |||
294 | if (!max && !input_eof && !gWindow->next.empty() && gWindow->next.back()->cohorts.size() > 1) { | |||
295 | max = gWindow->next.back()->cohorts[0]->global_number; | |||
296 | } | |||
297 | ||||
298 | Cohort* cohort = gWindow->current->cohorts[1]; | |||
299 | while (cohort->prev) { | |||
300 | cohort = cohort->prev; | |||
301 | } | |||
302 | ||||
303 | for (; cohort; cohort = cohort->next) { | |||
304 | if (max && cohort->global_number >= max) { | |||
305 | break; | |||
306 | } | |||
307 | ||||
308 | for (auto rel = cohort->relations_input.begin(); rel != cohort->relations_input.end();) { | |||
309 | auto newrel = ss_u32sv.get(); | |||
310 | ||||
311 | for (auto target : rel->second) { | |||
312 | auto it = gWindow->relation_map.find(target); | |||
313 | if (it != gWindow->relation_map.end()) { | |||
314 | cohort->relations[rel->first].insert(it->second); | |||
315 | } | |||
316 | else { | |||
317 | newrel->insert(target); | |||
318 | } | |||
319 | } | |||
320 | ||||
321 | // Defer missing relations for later | |||
322 | if (newrel->empty()) { | |||
323 | rel = cohort->relations_input.erase(rel); | |||
324 | } | |||
325 | else { | |||
326 | rel->second = newrel; | |||
327 | ++rel; | |||
328 | } | |||
329 | } | |||
330 | } | |||
331 | } | |||
332 | ||||
333 | void GrammarApplicator::reflowReading(Reading& reading) { | |||
334 | reading.tags.clear(); | |||
335 | reading.tags_plain.clear(); | |||
336 | reading.tags_textual.clear(); | |||
337 | reading.tags_numerical.clear(); | |||
338 | reading.tags_bloom.clear(); | |||
339 | reading.tags_textual_bloom.clear(); | |||
340 | reading.tags_plain_bloom.clear(); | |||
341 | reading.mapping = nullptr; | |||
342 | reading.tags_string.clear(); | |||
343 | ||||
344 | insert_if_exists(reading.parent->possible_sets, grammar->sets_any); | |||
345 | ||||
346 | Reading::tags_list_t tlist; | |||
347 | tlist.swap(reading.tags_list); | |||
348 | ||||
349 | for (auto tter : tlist) { | |||
350 | addTagToReading(reading, tter, false); | |||
351 | } | |||
352 | ||||
353 | reading.rehash(); | |||
354 | } | |||
355 | ||||
356 | Tag* GrammarApplicator::generateVarstringTag(const Tag* tag) { | |||
357 | static thread_local UnicodeString tmp; | |||
358 | tmp.remove(); | |||
359 | tmp.append(tag->tag.data(), SI32(tag->tag.size())); | |||
360 | bool did_something = false; | |||
361 | ||||
362 | // Convert %[UuLl] markers to control codes to avoid having combined %$1 accidentally match %L | |||
363 | constexpr UStringView raw[] = { STR_VSu_raw, STR_VSU_raw, STR_VSl_raw, STR_VSL_raw }; | |||
364 | constexpr UStringView x01[] = { STR_VSu, STR_VSU, STR_VSl, STR_VSL }; | |||
365 | for (size_t i = 0; i < 4; ++i) { | |||
366 | findAndReplace(tmp, raw[i].data(), x01[i].data()); | |||
367 | } | |||
368 | ||||
369 | // Replace unified sets with their matching tags | |||
370 | if (tag->vs_sets) { | |||
371 | for (size_t i = 0; i < tag->vs_sets->size(); ++i) { | |||
372 | auto tags = ss_taglist.get(); | |||
373 | getTagList(*(*tag->vs_sets)[i], tags); | |||
374 | static thread_local UString rpl; | |||
375 | rpl.clear(); | |||
376 | // If there are multiple tags, such as from CompositeTags, put _ between them | |||
377 | foreach (iter, *tags)if (!(*tags).empty()) for (auto iter = (*tags).begin(), iter_end = (*tags).end(); iter != iter_end; ++iter) { | |||
378 | rpl += (*iter)->tag; | |||
379 | if (std::distance(iter, iter_end) > 1) { | |||
380 | rpl += '_'; | |||
381 | } | |||
382 | } | |||
383 | findAndReplace(tmp, (*tag->vs_names)[i].data(), rpl.data()); | |||
384 | did_something = true; | |||
385 | } | |||
386 | } | |||
387 | ||||
388 | // Replace $1-$9 with their respective match groups | |||
389 | constexpr UStringView grp[] = { STR_VS1, STR_VS2, STR_VS3, STR_VS4, STR_VS5, STR_VS6, STR_VS7, STR_VS8, STR_VS9 }; | |||
390 | for (size_t i = 0; i < context_stack.back().regexgrp_ct && i < 9; ++i) { | |||
391 | findAndReplace(tmp, grp[i].data(), USV((*context_stack.back().regexgrps)[i])); | |||
392 | did_something = true; | |||
393 | } | |||
394 | ||||
395 | // Handle %U %u %L %l markers. | |||
396 | bool found; | |||
397 | do { | |||
398 | found = false; | |||
399 | int32_t pos = -1, mpos = -1; | |||
400 | if ((pos = tmp.lastIndexOf(STR_VSu.data(), SI32(STR_VSu.size()), 0)) != -1) { | |||
401 | found = true; | |||
402 | mpos = std::max(mpos, pos); | |||
403 | } | |||
404 | if ((pos = tmp.lastIndexOf(STR_VSU.data(), SI32(STR_VSU.size()), mpos)) != -1) { | |||
405 | found = true; | |||
406 | mpos = std::max(mpos, pos); | |||
407 | } | |||
408 | if ((pos = tmp.lastIndexOf(STR_VSl.data(), SI32(STR_VSl.size()), mpos)) != -1) { | |||
409 | found = true; | |||
410 | mpos = std::max(mpos, pos); | |||
411 | } | |||
412 | if ((pos = tmp.lastIndexOf(STR_VSL.data(), SI32(STR_VSL.size()), mpos)) != -1) { | |||
413 | found = true; | |||
414 | mpos = std::max(mpos, pos); | |||
415 | } | |||
416 | if (found && mpos != -1) { | |||
417 | UChar mode = tmp[mpos + 1]; | |||
418 | tmp.remove(mpos, 2); | |||
419 | if (mode == 'u') { | |||
420 | UnicodeString range(tmp, mpos, 1); | |||
421 | range.toUpper(); | |||
422 | tmp.setCharAt(mpos, range[0]); | |||
423 | } | |||
424 | else if (mode == 'U') { | |||
425 | UnicodeString range(tmp, mpos); | |||
426 | range.toUpper(); | |||
427 | tmp.truncate(mpos); | |||
428 | tmp.append(range); | |||
429 | } | |||
430 | else if (mode == 'l') { | |||
431 | UnicodeString range(tmp, mpos, 1); | |||
432 | range.toLower(); | |||
433 | tmp.setCharAt(mpos, range[0]); | |||
434 | } | |||
435 | else if (mode == 'L') { | |||
436 | UnicodeString range(tmp, mpos); | |||
437 | range.toLower(); | |||
438 | tmp.truncate(mpos); | |||
439 | tmp.append(range); | |||
440 | } | |||
441 | did_something = true; | |||
442 | } | |||
443 | } while (found); | |||
444 | ||||
445 | if (tag->type & T_CASE_INSENSITIVE) { | |||
446 | tmp += 'i'; | |||
447 | } | |||
448 | if (tag->type & T_REGEXP) { | |||
449 | tmp += 'r'; | |||
450 | } | |||
451 | ||||
452 | const UChar* nt = tmp.getTerminatedBuffer(); | |||
453 | if (!did_something && nt == tag->tag) { | |||
454 | u_fprintfu_fprintf_72(ux_stderr, "Warning: Unable to generate from tag '%S'! Possibly missing KEEPORDER and/or capturing regex from grammar on line %u before input line %u.\n", tag->tag.data(), grammar->lines, numLines); | |||
455 | u_fflushu_fflush_72(ux_stderr); | |||
456 | } | |||
457 | return addTag(nt, true); | |||
458 | } | |||
459 | ||||
460 | uint32_t GrammarApplicator::addTagToReading(Reading& reading, uint32_t utag, bool rehash) { | |||
461 | Tag* tag = grammar->single_tags.find(utag)->second; | |||
462 | return addTagToReading(reading, tag, rehash); | |||
463 | } | |||
464 | ||||
465 | uint32_t GrammarApplicator::addTagToReading(Reading& reading, Tag* tag, bool rehash) { | |||
466 | if (tag->type & T_VARSTRING) { | |||
467 | tag = generateVarstringTag(tag); | |||
468 | } | |||
469 | ||||
470 | auto it = grammar->sets_by_tag.find(tag->hash); | |||
471 | if (it != grammar->sets_by_tag.end()) { | |||
472 | reading.parent->possible_sets.resize(std::max(reading.parent->possible_sets.size(), it->second.size())); | |||
473 | reading.parent->possible_sets |= it->second; | |||
474 | } | |||
475 | reading.tags.insert(tag->hash); | |||
476 | reading.tags_list.push_back(tag->hash); | |||
477 | reading.tags_bloom.insert(tag->hash); | |||
478 | // ToDo: Remove for real ordered mode | |||
479 | if (ordered) { | |||
480 | if (!reading.tags_string.empty()) { | |||
481 | reading.tags_string += ' '; | |||
482 | } | |||
483 | reading.tags_string += tag->tag; | |||
484 | reading.tags_string_hash = hash_value(reading.tags_string); | |||
485 | } | |||
486 | if (grammar->parentheses.find(tag->hash) != grammar->parentheses.end()) { | |||
487 | reading.parent->is_pleft = tag->hash; | |||
488 | } | |||
489 | if (grammar->parentheses_reverse.find(tag->hash) != grammar->parentheses_reverse.end()) { | |||
490 | reading.parent->is_pright = tag->hash; | |||
491 | } | |||
492 | ||||
493 | if (tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix) { | |||
494 | if (reading.mapping && reading.mapping != tag) { | |||
495 | u_fprintfu_fprintf_72(ux_stderr, "Error: addTagToReading() cannot add a mapping tag to a reading which already is mapped!\n"); | |||
496 | CG3Quit(1); | |||
497 | } | |||
498 | reading.mapping = tag; | |||
499 | } | |||
500 | if (tag->type & (T_TEXTUAL | T_WORDFORM | T_BASEFORM)) { | |||
501 | reading.tags_textual.insert(tag->hash); | |||
502 | reading.tags_textual_bloom.insert(tag->hash); | |||
503 | } | |||
504 | if (tag->type & T_NUMERICAL) { | |||
505 | reading.tags_numerical[tag->hash] = tag; | |||
506 | reading.parent->type &= ~CT_NUM_CURRENT; | |||
507 | } | |||
508 | if (!reading.baseform && (tag->type & T_BASEFORM)) { | |||
509 | reading.baseform = tag->hash; | |||
510 | } | |||
511 | if (parse_dep && (tag->type & T_DEPENDENCY) && !(reading.parent->type & CT_DEP_DONE)) { | |||
512 | reading.parent->dep_self = tag->dep_self; | |||
513 | reading.parent->dep_parent = tag->dep_parent; | |||
514 | if (tag->dep_parent == tag->dep_self) { | |||
515 | reading.parent->dep_parent = DEP_NO_PARENT; | |||
516 | } | |||
517 | has_dep = true; | |||
518 | } | |||
519 | if (grammar->has_relations && (tag->type & T_RELATION)) { | |||
520 | if (tag->dep_parent && tag->comparison_hash) { | |||
521 | reading.parent->relations_input[tag->comparison_hash].insert(tag->dep_parent); | |||
522 | } | |||
523 | if (tag->dep_self) { | |||
524 | gWindow->relation_map[tag->dep_self] = reading.parent->global_number; | |||
525 | } | |||
526 | has_relations = true; | |||
527 | reading.parent->setRelated(); | |||
528 | } | |||
529 | if (!(tag->type & T_SPECIAL)) { | |||
530 | reading.tags_plain.insert(tag->hash); | |||
531 | reading.tags_plain_bloom.insert(tag->hash); | |||
532 | } | |||
533 | if (rehash) { | |||
534 | reading.rehash(); | |||
535 | } | |||
536 | ||||
537 | if (grammar->has_bag_of_tags) { | |||
538 | Reading& bot = reading.parent->parent->bag_of_tags; | |||
539 | bot.tags.insert(tag->hash); | |||
540 | bot.tags_list.push_back(tag->hash); | |||
541 | bot.tags_bloom.insert(tag->hash); | |||
542 | ||||
543 | if (tag->type & (T_TEXTUAL | T_WORDFORM | T_BASEFORM)) { | |||
544 | bot.tags_textual.insert(tag->hash); | |||
545 | bot.tags_textual_bloom.insert(tag->hash); | |||
546 | } | |||
547 | if (tag->type & T_NUMERICAL) { | |||
548 | bot.tags_numerical[tag->hash] = tag; | |||
549 | } | |||
550 | if (!reading.baseform && (tag->type & T_BASEFORM)) { | |||
551 | bot.baseform = tag->hash; | |||
552 | } | |||
553 | if (!(tag->type & T_SPECIAL)) { | |||
554 | bot.tags_plain.insert(tag->hash); | |||
555 | bot.tags_plain_bloom.insert(tag->hash); | |||
556 | } | |||
557 | if (rehash) { | |||
558 | bot.rehash(); | |||
559 | } | |||
560 | } | |||
561 | ||||
562 | return tag->hash; | |||
563 | } | |||
564 | ||||
565 | void GrammarApplicator::delTagFromReading(Reading& reading, uint32_t utag) { | |||
566 | erase(reading.tags_list, utag); | |||
567 | reading.tags.erase(utag); | |||
568 | reading.tags_textual.erase(utag); | |||
569 | reading.tags_numerical.erase(utag); | |||
570 | reading.tags_plain.erase(utag); | |||
571 | if (reading.mapping && utag == reading.mapping->hash) { | |||
572 | reading.mapping = nullptr; | |||
573 | } | |||
574 | if (utag == reading.baseform) { | |||
575 | reading.baseform = 0; | |||
576 | } | |||
577 | reading.rehash(); | |||
578 | reading.parent->type &= ~CT_NUM_CURRENT; | |||
579 | } | |||
580 | ||||
581 | void GrammarApplicator::delTagFromReading(Reading& reading, Tag* tag) { | |||
582 | return delTagFromReading(reading, tag->hash); | |||
583 | } | |||
584 | ||||
585 | bool GrammarApplicator::unmapReading(Reading& reading, const uint32_t rule) { | |||
586 | bool readings_changed = false; | |||
587 | if (reading.mapping) { | |||
588 | reading.noprint = false; | |||
589 | delTagFromReading(reading, reading.mapping->hash); | |||
590 | readings_changed = true; | |||
591 | } | |||
592 | if (reading.mapped) { | |||
593 | reading.mapped = false; | |||
594 | readings_changed = true; | |||
595 | } | |||
596 | if (readings_changed) { | |||
597 | reading.hit_by.push_back(rule); | |||
598 | } | |||
599 | return readings_changed; | |||
600 | } | |||
601 | ||||
602 | void GrammarApplicator::splitMappings(TagList& mappings, Cohort& cohort, Reading& reading, bool mapped) { | |||
603 | for (auto it = mappings.begin(); it != mappings.end();) { | |||
604 | Tag*& tag = *it; | |||
605 | while (tag->type & T_VARSTRING) { | |||
606 | tag = generateVarstringTag(tag); | |||
607 | } | |||
608 | if (!(tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix)) { | |||
609 | addTagToReading(reading, tag); | |||
610 | it = mappings.erase(it); | |||
611 | } | |||
612 | else { | |||
613 | ++it; | |||
614 | } | |||
615 | } | |||
616 | ||||
617 | if (reading.mapping) { | |||
618 | mappings.push_back(reading.mapping); | |||
619 | delTagFromReading(reading, reading.mapping->hash); | |||
620 | } | |||
621 | ||||
622 | Tag* tag = mappings.back(); | |||
623 | mappings.pop_back(); | |||
624 | size_t i = mappings.size(); | |||
625 | for (auto ttag : mappings) { | |||
626 | // To avoid duplicating needlessly many times, check for a similar reading in the cohort that's already got this mapping | |||
627 | bool found = false; | |||
628 | for (auto itr : cohort.readings) { | |||
629 | if (itr->hash_plain == reading.hash_plain && itr->mapping && itr->mapping->hash == ttag->hash) { | |||
630 | found = true; | |||
631 | break; | |||
632 | } | |||
633 | } | |||
634 | if (found) { | |||
635 | continue; | |||
636 | } | |||
637 | Reading* nr = alloc_reading(reading); | |||
638 | nr->mapped = mapped; | |||
639 | nr->number = UI32(reading.number - i--); | |||
640 | uint32_t mp = addTagToReading(*nr, ttag); | |||
641 | if (mp != ttag->hash) { | |||
642 | nr->mapping = grammar->single_tags.find(mp)->second; | |||
643 | } | |||
644 | else { | |||
645 | nr->mapping = ttag; | |||
646 | } | |||
647 | cohort.appendReading(nr); | |||
648 | numReadings++; | |||
649 | } | |||
650 | ||||
651 | reading.mapped = mapped; | |||
652 | uint32_t mp = addTagToReading(reading, tag); | |||
653 | if (mp != tag->hash) { | |||
654 | reading.mapping = grammar->single_tags.find(mp)->second; | |||
655 | } | |||
656 | else { | |||
657 | reading.mapping = tag; | |||
658 | } | |||
659 | } | |||
660 | ||||
661 | void GrammarApplicator::splitAllMappings(all_mappings_t& all_mappings, Cohort& cohort, bool mapped) { | |||
662 | if (all_mappings.empty()) { | |||
663 | return; | |||
664 | } | |||
665 | static thread_local ReadingList readings; | |||
666 | readings = cohort.readings; | |||
667 | for (auto reading : readings) { | |||
668 | auto iter = all_mappings.find(reading); | |||
669 | if (iter == all_mappings.end()) { | |||
670 | continue; | |||
671 | } | |||
672 | splitMappings(iter->second, cohort, *reading, mapped); | |||
673 | } | |||
674 | std::sort(cohort.readings.begin(), cohort.readings.end(), Reading::cmp_number); | |||
675 | if (!grammar->reopen_mappings.empty()) { | |||
676 | for (auto reading : cohort.readings) { | |||
677 | if (reading->mapping && grammar->reopen_mappings.count(reading->mapping->hash)) { | |||
678 | reading->mapped = false; | |||
679 | } | |||
680 | } | |||
681 | } | |||
682 | all_mappings.clear(); | |||
683 | } | |||
684 | ||||
685 | void GrammarApplicator::mergeReadings(ReadingList& readings) { | |||
686 | static thread_local bc::flat_map<uint32_t, std::pair<uint32_t, Reading*>> mapped; | |||
687 | mapped.clear(); | |||
688 | mapped.reserve(readings.size()); | |||
689 | static thread_local bc::flat_map<uint32_t, ReadingList> mlist; | |||
690 | mlist.clear(); | |||
691 | mlist.reserve(readings.size()); | |||
692 | ||||
693 | for (auto r : readings) { | |||
694 | uint32_t hp = r->hash_plain, hplain = r->hash_plain; | |||
695 | if (ordered) { | |||
696 | hp = hplain = r->tags_string_hash; | |||
697 | } | |||
698 | uint32_t nm = 0; | |||
699 | if (trace) { | |||
700 | for (auto iter_hb : r->hit_by) { | |||
701 | hp = hash_value(iter_hb, hp); | |||
702 | } | |||
703 | } | |||
704 | if (r->mapping) { | |||
705 | ++nm; | |||
706 | } | |||
707 | Reading* sub = r->next; | |||
708 | while (sub) { | |||
709 | if (ordered) { | |||
710 | hp = hash_value(sub->tags_string_hash, hp); | |||
711 | hplain = hash_value(sub->tags_string_hash, hplain); | |||
712 | } | |||
713 | else { | |||
714 | hp = hash_value(sub->hash_plain, hp); | |||
715 | hplain = hash_value(sub->hash_plain, hplain); | |||
716 | } | |||
717 | if (trace) { | |||
718 | for (auto iter_hb : sub->hit_by) { | |||
719 | hp = hash_value(iter_hb, hp); | |||
720 | } | |||
721 | } | |||
722 | if (sub->mapping) { | |||
723 | ++nm; | |||
724 | } | |||
725 | sub = sub->next; | |||
726 | } | |||
727 | if (mapped.count(hplain)) { | |||
728 | if (mapped[hplain].first != 0 && nm == 0) { | |||
729 | r->deleted = true; | |||
730 | } | |||
731 | else if (mapped[hplain].first != nm && mapped[hplain].first == 0) { | |||
732 | mapped[hplain].second->deleted = true; | |||
733 | } | |||
734 | } | |||
735 | mapped[hplain] = std::make_pair(nm, r); | |||
736 | mlist[hp + nm].push_back(r); | |||
737 | } | |||
738 | ||||
739 | if (mlist.size() == readings.size()) { | |||
740 | return; | |||
741 | } | |||
742 | ||||
743 | readings.clear(); | |||
744 | static thread_local std::vector<Reading*> order; | |||
745 | order.clear(); | |||
746 | ||||
747 | for (auto& miter : mlist) { | |||
748 | const ReadingList& clist = miter.second; | |||
749 | Reading* nr = alloc_reading(*(clist.front())); | |||
750 | if (nr->mapping) { | |||
751 | erase(nr->tags_list, nr->mapping->hash); | |||
752 | } | |||
753 | for (auto iter1 : clist) { | |||
754 | if (iter1->mapping && std::find(nr->tags_list.begin(), nr->tags_list.end(), iter1->mapping->hash) == nr->tags_list.end()) { | |||
755 | nr->tags_list.push_back(iter1->mapping->hash); | |||
756 | } | |||
757 | free_reading(iter1); | |||
758 | } | |||
759 | order.push_back(nr); | |||
760 | } | |||
761 | ||||
762 | std::sort(order.begin(), order.end(), Reading::cmp_number); | |||
763 | readings.insert(readings.begin(), order.begin(), order.end()); | |||
764 | } | |||
765 | ||||
766 | void GrammarApplicator::mergeMappings(Cohort& cohort) { | |||
767 | mergeReadings(cohort.readings); | |||
768 | if (trace) { | |||
769 | mergeReadings(cohort.deleted); | |||
770 | mergeReadings(cohort.delayed); | |||
771 | } | |||
772 | } | |||
773 | ||||
774 | Cohort* GrammarApplicator::delimitAt(SingleWindow& current, Cohort* cohort) { | |||
775 | SingleWindow* nwin = nullptr; | |||
| ||||
776 | if (current.parent->current == ¤t) { | |||
777 | nwin = current.parent->allocPushSingleWindow(); | |||
778 | } | |||
779 | else { | |||
780 | foreach (iter, current.parent->next)if (!(current.parent->next).empty()) for (auto iter = (current .parent->next).begin(), iter_end = (current.parent->next ).end(); iter != iter_end; ++iter) { | |||
781 | if (*iter == ¤t) { | |||
782 | nwin = current.parent->allocSingleWindow(); | |||
783 | current.parent->next.insert(++iter, nwin); | |||
784 | break; | |||
785 | } | |||
786 | } | |||
787 | if (!nwin
| |||
788 | foreach (iter, current.parent->previous)if (!(current.parent->previous).empty()) for (auto iter = ( current.parent->previous).begin(), iter_end = (current.parent ->previous).end(); iter != iter_end; ++iter) { | |||
789 | if (*iter == ¤t) { | |||
790 | nwin = current.parent->allocSingleWindow(); | |||
791 | current.parent->previous.insert(iter, nwin); | |||
792 | break; | |||
793 | } | |||
794 | } | |||
795 | } | |||
796 | gWindow->rebuildSingleWindowLinks(); | |||
797 | } | |||
798 | ||||
799 | assert(nwin != 0)(static_cast<void> (0)); | |||
800 | ||||
801 | std::swap(current.flush_after, nwin->flush_after); | |||
| ||||
802 | std::swap(current.text_post, nwin->text_post); | |||
803 | nwin->has_enclosures = current.has_enclosures; | |||
804 | ||||
805 | Cohort* cCohort = alloc_cohort(nwin); | |||
806 | cCohort->global_number = current.parent->cohort_counter++; | |||
807 | cCohort->wordform = tag_begin; | |||
808 | ||||
809 | Reading* cReading = alloc_reading(cCohort); | |||
810 | cReading->baseform = begintag; | |||
811 | insert_if_exists(cReading->parent->possible_sets, grammar->sets_any); | |||
812 | addTagToReading(*cReading, begintag); | |||
813 | ||||
814 | cCohort->appendReading(cReading); | |||
815 | nwin->appendCohort(cCohort); | |||
816 | ||||
817 | auto lc = cohort->local_number; | |||
818 | auto nc = std::find(current.all_cohorts.begin() + lc, current.all_cohorts.end(), cohort); | |||
819 | ++nc; | |||
820 | auto from = nc; | |||
821 | for (; nc != current.all_cohorts.end(); ++nc) { | |||
822 | (*nc)->parent = nwin; | |||
823 | if ((*nc)->type & (CT_ENCLOSED | CT_REMOVED | CT_IGNORED)) { | |||
824 | nwin->all_cohorts.push_back(*nc); | |||
825 | } | |||
826 | else { | |||
827 | nwin->appendCohort(*nc); | |||
828 | } | |||
829 | } | |||
830 | current.cohorts.erase(current.cohorts.begin() + lc + 1, current.cohorts.end()); | |||
831 | current.all_cohorts.erase(from, current.all_cohorts.end()); | |||
832 | ||||
833 | cohort = current.cohorts.back(); | |||
834 | for (auto reading : cohort->readings) { | |||
835 | addTagToReading(*reading, endtag); | |||
836 | } | |||
837 | gWindow->rebuildCohortLinks(); | |||
838 | ||||
839 | return cohort; | |||
840 | } | |||
841 | ||||
842 | void GrammarApplicator::reflowTextuals_Reading(Reading& r) { | |||
843 | if (r.next) { | |||
844 | reflowTextuals_Reading(*r.next); | |||
845 | } | |||
846 | for (auto it : r.tags) { | |||
847 | Tag* tag = grammar->single_tags.find(it)->second; | |||
848 | if (tag->type & T_TEXTUAL) { | |||
849 | r.tags_textual.insert(it); | |||
850 | r.tags_textual_bloom.insert(it); | |||
851 | } | |||
852 | } | |||
853 | } | |||
854 | ||||
855 | void GrammarApplicator::reflowTextuals_Cohort(Cohort& c) { | |||
856 | for (auto it : c.readings) { | |||
857 | reflowTextuals_Reading(*it); | |||
858 | } | |||
859 | for (auto it : c.deleted) { | |||
860 | reflowTextuals_Reading(*it); | |||
861 | } | |||
862 | for (auto it : c.ignored) { | |||
863 | reflowTextuals_Reading(*it); | |||
864 | } | |||
865 | for (auto it : c.delayed) { | |||
866 | reflowTextuals_Reading(*it); | |||
867 | } | |||
868 | } | |||
869 | ||||
870 | void GrammarApplicator::reflowTextuals_SingleWindow(SingleWindow& sw) { | |||
871 | for (auto it : sw.all_cohorts) { | |||
872 | reflowTextuals_Cohort(*it); | |||
873 | } | |||
874 | } | |||
875 | ||||
876 | void GrammarApplicator::reflowTextuals() { | |||
877 | for (auto swit : gWindow->previous) { | |||
878 | reflowTextuals_SingleWindow(*swit); | |||
879 | } | |||
880 | reflowTextuals_SingleWindow(*gWindow->current); | |||
881 | for (auto swit : gWindow->next) { | |||
882 | reflowTextuals_SingleWindow(*swit); | |||
883 | } | |||
884 | } | |||
885 | } |