| File: | GrammarApplicator_reflow.cpp |
| Warning: | line 801, column 2 Forming reference to null pointer |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | /* | |||
| 2 | * Copyright (C) 2007-2024, GrammarSoft ApS | |||
| 3 | * Developed by Tino Didriksen <mail@tinodidriksen.com> | |||
| 4 | * Design by Eckhard Bick <eckhard.bick@mail.dk>, Tino Didriksen <mail@tinodidriksen.com> | |||
| 5 | * | |||
| 6 | * This program is free software: you can redistribute it and/or modify | |||
| 7 | * it under the terms of the GNU General Public License as published by | |||
| 8 | * the Free Software Foundation, either version 3 of the License, or | |||
| 9 | * (at your option) any later version. | |||
| 10 | * | |||
| 11 | * This program is distributed in the hope that it will be useful, | |||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
| 14 | * GNU General Public License for more details. | |||
| 15 | * | |||
| 16 | * You should have received a copy of the GNU General Public License | |||
| 17 | * along with this program. If not, see <https://www.gnu.org/licenses/>. | |||
| 18 | */ | |||
| 19 | ||||
| 20 | #include "GrammarApplicator.hpp" | |||
| 21 | #include "Strings.hpp" | |||
| 22 | #include "Tag.hpp" | |||
| 23 | #include "Grammar.hpp" | |||
| 24 | #include "Window.hpp" | |||
| 25 | #include "SingleWindow.hpp" | |||
| 26 | #include "Reading.hpp" | |||
| 27 | ||||
| 28 | namespace CG3 { | |||
| 29 | ||||
| 30 | Tag* GrammarApplicator::makeBaseFromWord(uint32_t tag) { | |||
| 31 | return makeBaseFromWord(grammar->single_tags.find(tag)->second); | |||
| 32 | } | |||
| 33 | ||||
| 34 | Tag* GrammarApplicator::makeBaseFromWord(Tag* tag) { | |||
| 35 | const size_t len = tag->tag.size(); | |||
| 36 | if (len < 5) { | |||
| 37 | return tag; | |||
| 38 | } | |||
| 39 | static thread_local UString n; | |||
| 40 | n.clear(); | |||
| 41 | n.resize(len - 2); | |||
| 42 | n[0] = n[len - 3] = '"'; | |||
| 43 | u_strncpyu_strncpy_72(&n[1], tag->tag.data() + 2, SI32(len - 4)); | |||
| 44 | Tag* nt = addTag(n); | |||
| 45 | return nt; | |||
| 46 | } | |||
| 47 | ||||
| 48 | bool GrammarApplicator::isChildOf(const Cohort* child, const Cohort* parent) { | |||
| 49 | bool retval = false; | |||
| 50 | ||||
| 51 | if (parent->global_number == child->global_number) { | |||
| 52 | retval = true; | |||
| 53 | } | |||
| 54 | else if (parent->global_number == child->dep_parent) { | |||
| 55 | retval = true; | |||
| 56 | } | |||
| 57 | else { | |||
| 58 | size_t i = 0; | |||
| 59 | for (const Cohort* inner = child; i < 1000; ++i) { | |||
| 60 | if (inner->dep_parent == 0 || inner->dep_parent == DEP_NO_PARENT) { | |||
| 61 | retval = false; | |||
| 62 | break; | |||
| 63 | } | |||
| 64 | auto it = gWindow->cohort_map.find(inner->dep_parent); | |||
| 65 | if (it != gWindow->cohort_map.end()) { | |||
| 66 | inner = it->second; | |||
| 67 | } | |||
| 68 | else { | |||
| 69 | break; | |||
| 70 | } | |||
| 71 | if (inner->dep_parent == parent->global_number) { | |||
| 72 | retval = true; | |||
| 73 | break; | |||
| 74 | } | |||
| 75 | } | |||
| 76 | if (i == 1000) { | |||
| 77 | if (verbosity_level > 0) { | |||
| 78 | u_fprintfu_fprintf_72( | |||
| 79 | ux_stderr, | |||
| 80 | "Warning: While testing whether %u is a child of %u the counter exceeded 1000 indicating a loop higher up in the tree.\n", | |||
| 81 | child->global_number, parent->global_number); | |||
| 82 | } | |||
| 83 | } | |||
| 84 | } | |||
| 85 | return retval; | |||
| 86 | } | |||
| 87 | ||||
| 88 | bool GrammarApplicator::wouldParentChildLoop(const Cohort* parent, const Cohort* child) { | |||
| 89 | bool retval = false; | |||
| 90 | ||||
| 91 | if (parent->global_number == child->global_number) { | |||
| 92 | retval = true; | |||
| 93 | } | |||
| 94 | else if (parent->global_number == child->dep_parent) { | |||
| 95 | retval = false; | |||
| 96 | } | |||
| 97 | else if (parent->global_number == parent->dep_parent) { | |||
| 98 | retval = false; | |||
| 99 | } | |||
| 100 | else if (parent->dep_parent == child->global_number) { | |||
| 101 | retval = true; | |||
| 102 | } | |||
| 103 | else { | |||
| 104 | size_t i = 0; | |||
| 105 | for (const Cohort* inner = parent; i < 1000; ++i) { | |||
| 106 | if (inner->dep_parent == 0 || inner->dep_parent == DEP_NO_PARENT) { | |||
| 107 | retval = false; | |||
| 108 | break; | |||
| 109 | } | |||
| 110 | auto it = gWindow->cohort_map.find(inner->dep_parent); | |||
| 111 | if (it != gWindow->cohort_map.end()) { | |||
| 112 | inner = it->second; | |||
| 113 | } | |||
| 114 | else { | |||
| 115 | break; | |||
| 116 | } | |||
| 117 | if (inner->dep_parent == child->global_number) { | |||
| 118 | retval = true; | |||
| 119 | break; | |||
| 120 | } | |||
| 121 | } | |||
| 122 | if (i == 1000) { | |||
| 123 | if (verbosity_level > 0) { | |||
| 124 | u_fprintfu_fprintf_72( | |||
| 125 | ux_stderr, | |||
| 126 | "Warning: While testing whether %u and %u would loop the counter exceeded 1000 indicating a loop higher up in the tree.\n", | |||
| 127 | child->global_number, parent->global_number); | |||
| 128 | } | |||
| 129 | } | |||
| 130 | } | |||
| 131 | return retval; | |||
| 132 | } | |||
| 133 | ||||
| 134 | bool GrammarApplicator::wouldParentChildCross(const Cohort* parent, const Cohort* child) { | |||
| 135 | uint32_t mn = std::min(parent->global_number, child->global_number); | |||
| 136 | uint32_t mx = std::max(parent->global_number, child->global_number); | |||
| 137 | ||||
| 138 | for (uint32_t i = mn + 1; i < mx; ++i) { | |||
| 139 | auto it = gWindow->cohort_map.find(parent->dep_parent); | |||
| 140 | if (it != gWindow->cohort_map.end() && it->second->dep_parent != DEP_NO_PARENT) { | |||
| 141 | if (it->second->dep_parent < mn || it->second->dep_parent > mx) { | |||
| 142 | return true; | |||
| 143 | } | |||
| 144 | } | |||
| 145 | } | |||
| 146 | ||||
| 147 | return false; | |||
| 148 | } | |||
| 149 | ||||
| 150 | bool GrammarApplicator::attachParentChild(Cohort& parent, Cohort& child, bool allowloop, bool allowcrossing) { | |||
| 151 | parent.dep_self = parent.global_number; | |||
| 152 | child.dep_self = child.global_number; | |||
| 153 | ||||
| 154 | if (!allowloop && dep_block_loops && wouldParentChildLoop(&parent, &child)) { | |||
| 155 | if (verbosity_level > 0) { | |||
| 156 | u_fprintfu_fprintf_72( | |||
| 157 | ux_stderr, | |||
| 158 | "Warning: Dependency between %u and %u would cause a loop. Will not attach them.\n", | |||
| 159 | child.global_number, parent.global_number); | |||
| 160 | } | |||
| 161 | return false; | |||
| 162 | } | |||
| 163 | ||||
| 164 | if (!allowcrossing && dep_block_crossing && wouldParentChildCross(&parent, &child)) { | |||
| 165 | if (verbosity_level > 0) { | |||
| 166 | u_fprintfu_fprintf_72( | |||
| 167 | ux_stderr, | |||
| 168 | "Warning: Dependency between %u and %u would cause crossing branches. Will not attach them.\n", | |||
| 169 | child.global_number, parent.global_number); | |||
| 170 | } | |||
| 171 | return false; | |||
| 172 | } | |||
| 173 | ||||
| 174 | if (child.dep_parent == DEP_NO_PARENT) { | |||
| 175 | child.dep_parent = child.dep_self; | |||
| 176 | } | |||
| 177 | auto it = gWindow->cohort_map.find(child.dep_parent); | |||
| 178 | if (it != gWindow->cohort_map.end()) { | |||
| 179 | it->second->remChild(child.dep_self); | |||
| 180 | } | |||
| 181 | ||||
| 182 | child.dep_parent = parent.global_number; | |||
| 183 | parent.addChild(child.global_number); | |||
| 184 | ||||
| 185 | parent.type |= CT_DEP_DONE; | |||
| 186 | child.type |= CT_DEP_DONE; | |||
| 187 | ||||
| 188 | if (!dep_has_spanned && child.parent != parent.parent) { | |||
| 189 | u_fprintfu_fprintf_72( | |||
| 190 | ux_stderr, | |||
| 191 | "Info: Dependency between %u and %u spans the window boundaries. Enumeration will be global from here on.\n", | |||
| 192 | child.global_number, parent.global_number); | |||
| 193 | dep_has_spanned = true; | |||
| 194 | } | |||
| 195 | return true; | |||
| 196 | } | |||
| 197 | ||||
| 198 | void GrammarApplicator::reflowDependencyWindow(uint32_t max) { | |||
| 199 | if (dep_delimit && !max && !input_eof && !gWindow->next.empty() && gWindow->next.back()->cohorts.size() > 1) { | |||
| 200 | max = gWindow->next.back()->cohorts[1]->global_number; | |||
| 201 | } | |||
| 202 | ||||
| 203 | if (gWindow->dep_window.empty() || gWindow->dep_window.begin()->second->parent == 0) { | |||
| 204 | gWindow->dep_window[0] = gWindow->current->cohorts[0]; | |||
| 205 | } | |||
| 206 | else if (gWindow->dep_window.find(0) == gWindow->dep_window.end()) { | |||
| 207 | // This has to be done in 2 steps or it will segfault on Linux for some reason... | |||
| 208 | // Turns out g++ evaluates left side of = first, and MSVC++ does right side first, so g++ accessed its own newly created [0] at .begin() | |||
| 209 | Cohort* tmp = gWindow->dep_window.begin()->second->parent->cohorts[0]; | |||
| 210 | gWindow->dep_window[0] = tmp; | |||
| 211 | } | |||
| 212 | if (gWindow->cohort_map.empty()) { | |||
| 213 | gWindow->cohort_map[0] = gWindow->current->cohorts[0]; | |||
| 214 | } | |||
| 215 | else if (gWindow->cohort_map.find(0) == gWindow->cohort_map.end()) { | |||
| 216 | Cohort* tmp = gWindow->current->cohorts[0]; | |||
| 217 | Cohort* c = gWindow->cohort_map.begin()->second; | |||
| 218 | if (c->parent) { | |||
| 219 | tmp = c->parent->cohorts[0]; | |||
| 220 | } | |||
| 221 | gWindow->cohort_map[0] = tmp; | |||
| 222 | } | |||
| 223 | ||||
| 224 | for (auto begin = gWindow->dep_window.begin(); begin != gWindow->dep_window.end();) { | |||
| 225 | while (begin != gWindow->dep_window.end() && (begin->second->type & CT_DEP_DONE || !begin->second->dep_self)) { | |||
| 226 | ++begin; | |||
| 227 | } | |||
| 228 | gWindow->dep_map.clear(); | |||
| 229 | ||||
| 230 | auto end = begin; | |||
| 231 | for (; end != gWindow->dep_window.end(); ++end) { | |||
| 232 | Cohort* cohort = end->second; | |||
| 233 | if (cohort->type & CT_DEP_DONE) { | |||
| 234 | continue; | |||
| 235 | } | |||
| 236 | if (!cohort->dep_self) { | |||
| 237 | continue; | |||
| 238 | } | |||
| 239 | if (max && cohort->global_number >= max) { | |||
| 240 | break; | |||
| 241 | } | |||
| 242 | if (gWindow->dep_map.find(cohort->dep_self) != gWindow->dep_map.end()) { | |||
| 243 | break; | |||
| 244 | } | |||
| 245 | gWindow->dep_map[cohort->dep_self] = cohort->global_number; | |||
| 246 | cohort->dep_self = cohort->global_number; | |||
| 247 | } | |||
| 248 | ||||
| 249 | if (gWindow->dep_map.empty()) { | |||
| 250 | break; | |||
| 251 | } | |||
| 252 | ||||
| 253 | gWindow->dep_map[0] = 0; | |||
| 254 | for (; begin != end; ++begin) { | |||
| 255 | Cohort* cohort = begin->second; | |||
| 256 | if (max && cohort->global_number >= max) { | |||
| 257 | break; | |||
| 258 | } | |||
| 259 | if (cohort->dep_parent == DEP_NO_PARENT) { | |||
| 260 | continue; | |||
| 261 | } | |||
| 262 | if (cohort->dep_self == cohort->global_number) { | |||
| 263 | if (!(cohort->type & CT_DEP_DONE) && gWindow->dep_map.find(cohort->dep_parent) == gWindow->dep_map.end()) { | |||
| 264 | if (verbosity_level > 0) { | |||
| 265 | u_fprintfu_fprintf_72( | |||
| 266 | ux_stderr, | |||
| 267 | "Warning: Parent %u of dep %u in cohort %u of window %u does not exist - ignoring.\n", | |||
| 268 | cohort->dep_parent, cohort->dep_self, cohort->local_number, cohort->parent->number); | |||
| 269 | u_fflushu_fflush_72(ux_stderr); | |||
| 270 | } | |||
| 271 | cohort->dep_parent = DEP_NO_PARENT; | |||
| 272 | } | |||
| 273 | else { | |||
| 274 | if (!(cohort->type & CT_DEP_DONE)) { | |||
| 275 | auto dep_real = gWindow->dep_map.find(cohort->dep_parent)->second; | |||
| 276 | cohort->dep_parent = dep_real; | |||
| 277 | } | |||
| 278 | gWindow->cohort_map[0] = cohort->parent->cohorts[0]; | |||
| 279 | auto tmp = gWindow->cohort_map.find(cohort->dep_parent); | |||
| 280 | if (tmp != gWindow->cohort_map.end()) { | |||
| 281 | tmp->second->addChild(cohort->dep_self); | |||
| 282 | } | |||
| 283 | cohort->type |= CT_DEP_DONE; | |||
| 284 | } | |||
| 285 | } | |||
| 286 | } | |||
| 287 | } | |||
| 288 | ||||
| 289 | gWindow->dep_map.clear(); | |||
| 290 | gWindow->dep_window.clear(); | |||
| 291 | } | |||
| 292 | ||||
| 293 | void GrammarApplicator::reflowRelationWindow(uint32_t max) { | |||
| 294 | if (!max && !input_eof && !gWindow->next.empty() && gWindow->next.back()->cohorts.size() > 1) { | |||
| 295 | max = gWindow->next.back()->cohorts[0]->global_number; | |||
| 296 | } | |||
| 297 | ||||
| 298 | Cohort* cohort = gWindow->current->cohorts[1]; | |||
| 299 | while (cohort->prev) { | |||
| 300 | cohort = cohort->prev; | |||
| 301 | } | |||
| 302 | ||||
| 303 | for (; cohort; cohort = cohort->next) { | |||
| 304 | if (max && cohort->global_number >= max) { | |||
| 305 | break; | |||
| 306 | } | |||
| 307 | ||||
| 308 | for (auto rel = cohort->relations_input.begin(); rel != cohort->relations_input.end();) { | |||
| 309 | auto newrel = ss_u32sv.get(); | |||
| 310 | ||||
| 311 | for (auto target : rel->second) { | |||
| 312 | auto it = gWindow->relation_map.find(target); | |||
| 313 | if (it != gWindow->relation_map.end()) { | |||
| 314 | cohort->relations[rel->first].insert(it->second); | |||
| 315 | } | |||
| 316 | else { | |||
| 317 | newrel->insert(target); | |||
| 318 | } | |||
| 319 | } | |||
| 320 | ||||
| 321 | // Defer missing relations for later | |||
| 322 | if (newrel->empty()) { | |||
| 323 | rel = cohort->relations_input.erase(rel); | |||
| 324 | } | |||
| 325 | else { | |||
| 326 | rel->second = newrel; | |||
| 327 | ++rel; | |||
| 328 | } | |||
| 329 | } | |||
| 330 | } | |||
| 331 | } | |||
| 332 | ||||
| 333 | void GrammarApplicator::reflowReading(Reading& reading) { | |||
| 334 | reading.tags.clear(); | |||
| 335 | reading.tags_plain.clear(); | |||
| 336 | reading.tags_textual.clear(); | |||
| 337 | reading.tags_numerical.clear(); | |||
| 338 | reading.tags_bloom.clear(); | |||
| 339 | reading.tags_textual_bloom.clear(); | |||
| 340 | reading.tags_plain_bloom.clear(); | |||
| 341 | reading.mapping = nullptr; | |||
| 342 | reading.tags_string.clear(); | |||
| 343 | ||||
| 344 | insert_if_exists(reading.parent->possible_sets, grammar->sets_any); | |||
| 345 | ||||
| 346 | Reading::tags_list_t tlist; | |||
| 347 | tlist.swap(reading.tags_list); | |||
| 348 | ||||
| 349 | for (auto tter : tlist) { | |||
| 350 | addTagToReading(reading, tter, false); | |||
| 351 | } | |||
| 352 | ||||
| 353 | reading.rehash(); | |||
| 354 | } | |||
| 355 | ||||
| 356 | Tag* GrammarApplicator::generateVarstringTag(const Tag* tag) { | |||
| 357 | static thread_local UnicodeString tmp; | |||
| 358 | tmp.remove(); | |||
| 359 | tmp.append(tag->tag.data(), SI32(tag->tag.size())); | |||
| 360 | bool did_something = false; | |||
| 361 | ||||
| 362 | // Convert %[UuLl] markers to control codes to avoid having combined %$1 accidentally match %L | |||
| 363 | constexpr UStringView raw[] = { STR_VSu_raw, STR_VSU_raw, STR_VSl_raw, STR_VSL_raw }; | |||
| 364 | constexpr UStringView x01[] = { STR_VSu, STR_VSU, STR_VSl, STR_VSL }; | |||
| 365 | for (size_t i = 0; i < 4; ++i) { | |||
| 366 | findAndReplace(tmp, raw[i].data(), x01[i].data()); | |||
| 367 | } | |||
| 368 | ||||
| 369 | // Replace unified sets with their matching tags | |||
| 370 | if (tag->vs_sets) { | |||
| 371 | for (size_t i = 0; i < tag->vs_sets->size(); ++i) { | |||
| 372 | auto tags = ss_taglist.get(); | |||
| 373 | getTagList(*(*tag->vs_sets)[i], tags); | |||
| 374 | static thread_local UString rpl; | |||
| 375 | rpl.clear(); | |||
| 376 | // If there are multiple tags, such as from CompositeTags, put _ between them | |||
| 377 | foreach (iter, *tags)if (!(*tags).empty()) for (auto iter = (*tags).begin(), iter_end = (*tags).end(); iter != iter_end; ++iter) { | |||
| 378 | rpl += (*iter)->tag; | |||
| 379 | if (std::distance(iter, iter_end) > 1) { | |||
| 380 | rpl += '_'; | |||
| 381 | } | |||
| 382 | } | |||
| 383 | findAndReplace(tmp, (*tag->vs_names)[i].data(), rpl.data()); | |||
| 384 | did_something = true; | |||
| 385 | } | |||
| 386 | } | |||
| 387 | ||||
| 388 | // Replace $1-$9 with their respective match groups | |||
| 389 | constexpr UStringView grp[] = { STR_VS1, STR_VS2, STR_VS3, STR_VS4, STR_VS5, STR_VS6, STR_VS7, STR_VS8, STR_VS9 }; | |||
| 390 | for (size_t i = 0; i < context_stack.back().regexgrp_ct && i < 9; ++i) { | |||
| 391 | findAndReplace(tmp, grp[i].data(), USV((*context_stack.back().regexgrps)[i])); | |||
| 392 | did_something = true; | |||
| 393 | } | |||
| 394 | ||||
| 395 | // Handle %U %u %L %l markers. | |||
| 396 | bool found; | |||
| 397 | do { | |||
| 398 | found = false; | |||
| 399 | int32_t pos = -1, mpos = -1; | |||
| 400 | if ((pos = tmp.lastIndexOf(STR_VSu.data(), SI32(STR_VSu.size()), 0)) != -1) { | |||
| 401 | found = true; | |||
| 402 | mpos = std::max(mpos, pos); | |||
| 403 | } | |||
| 404 | if ((pos = tmp.lastIndexOf(STR_VSU.data(), SI32(STR_VSU.size()), mpos)) != -1) { | |||
| 405 | found = true; | |||
| 406 | mpos = std::max(mpos, pos); | |||
| 407 | } | |||
| 408 | if ((pos = tmp.lastIndexOf(STR_VSl.data(), SI32(STR_VSl.size()), mpos)) != -1) { | |||
| 409 | found = true; | |||
| 410 | mpos = std::max(mpos, pos); | |||
| 411 | } | |||
| 412 | if ((pos = tmp.lastIndexOf(STR_VSL.data(), SI32(STR_VSL.size()), mpos)) != -1) { | |||
| 413 | found = true; | |||
| 414 | mpos = std::max(mpos, pos); | |||
| 415 | } | |||
| 416 | if (found && mpos != -1) { | |||
| 417 | UChar mode = tmp[mpos + 1]; | |||
| 418 | tmp.remove(mpos, 2); | |||
| 419 | if (mode == 'u') { | |||
| 420 | UnicodeString range(tmp, mpos, 1); | |||
| 421 | range.toUpper(); | |||
| 422 | tmp.setCharAt(mpos, range[0]); | |||
| 423 | } | |||
| 424 | else if (mode == 'U') { | |||
| 425 | UnicodeString range(tmp, mpos); | |||
| 426 | range.toUpper(); | |||
| 427 | tmp.truncate(mpos); | |||
| 428 | tmp.append(range); | |||
| 429 | } | |||
| 430 | else if (mode == 'l') { | |||
| 431 | UnicodeString range(tmp, mpos, 1); | |||
| 432 | range.toLower(); | |||
| 433 | tmp.setCharAt(mpos, range[0]); | |||
| 434 | } | |||
| 435 | else if (mode == 'L') { | |||
| 436 | UnicodeString range(tmp, mpos); | |||
| 437 | range.toLower(); | |||
| 438 | tmp.truncate(mpos); | |||
| 439 | tmp.append(range); | |||
| 440 | } | |||
| 441 | did_something = true; | |||
| 442 | } | |||
| 443 | } while (found); | |||
| 444 | ||||
| 445 | if (tag->type & T_CASE_INSENSITIVE) { | |||
| 446 | tmp += 'i'; | |||
| 447 | } | |||
| 448 | if (tag->type & T_REGEXP) { | |||
| 449 | tmp += 'r'; | |||
| 450 | } | |||
| 451 | ||||
| 452 | const UChar* nt = tmp.getTerminatedBuffer(); | |||
| 453 | if (!did_something && nt == tag->tag) { | |||
| 454 | u_fprintfu_fprintf_72(ux_stderr, "Warning: Unable to generate from tag '%S'! Possibly missing KEEPORDER and/or capturing regex from grammar on line %u before input line %u.\n", tag->tag.data(), grammar->lines, numLines); | |||
| 455 | u_fflushu_fflush_72(ux_stderr); | |||
| 456 | } | |||
| 457 | return addTag(nt, true); | |||
| 458 | } | |||
| 459 | ||||
| 460 | uint32_t GrammarApplicator::addTagToReading(Reading& reading, uint32_t utag, bool rehash) { | |||
| 461 | Tag* tag = grammar->single_tags.find(utag)->second; | |||
| 462 | return addTagToReading(reading, tag, rehash); | |||
| 463 | } | |||
| 464 | ||||
| 465 | uint32_t GrammarApplicator::addTagToReading(Reading& reading, Tag* tag, bool rehash) { | |||
| 466 | if (tag->type & T_VARSTRING) { | |||
| 467 | tag = generateVarstringTag(tag); | |||
| 468 | } | |||
| 469 | ||||
| 470 | auto it = grammar->sets_by_tag.find(tag->hash); | |||
| 471 | if (it != grammar->sets_by_tag.end()) { | |||
| 472 | reading.parent->possible_sets.resize(std::max(reading.parent->possible_sets.size(), it->second.size())); | |||
| 473 | reading.parent->possible_sets |= it->second; | |||
| 474 | } | |||
| 475 | reading.tags.insert(tag->hash); | |||
| 476 | reading.tags_list.push_back(tag->hash); | |||
| 477 | reading.tags_bloom.insert(tag->hash); | |||
| 478 | // ToDo: Remove for real ordered mode | |||
| 479 | if (ordered) { | |||
| 480 | if (!reading.tags_string.empty()) { | |||
| 481 | reading.tags_string += ' '; | |||
| 482 | } | |||
| 483 | reading.tags_string += tag->tag; | |||
| 484 | reading.tags_string_hash = hash_value(reading.tags_string); | |||
| 485 | } | |||
| 486 | if (grammar->parentheses.find(tag->hash) != grammar->parentheses.end()) { | |||
| 487 | reading.parent->is_pleft = tag->hash; | |||
| 488 | } | |||
| 489 | if (grammar->parentheses_reverse.find(tag->hash) != grammar->parentheses_reverse.end()) { | |||
| 490 | reading.parent->is_pright = tag->hash; | |||
| 491 | } | |||
| 492 | ||||
| 493 | if (tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix) { | |||
| 494 | if (reading.mapping && reading.mapping != tag) { | |||
| 495 | u_fprintfu_fprintf_72(ux_stderr, "Error: addTagToReading() cannot add a mapping tag to a reading which already is mapped!\n"); | |||
| 496 | CG3Quit(1); | |||
| 497 | } | |||
| 498 | reading.mapping = tag; | |||
| 499 | } | |||
| 500 | if (tag->type & (T_TEXTUAL | T_WORDFORM | T_BASEFORM)) { | |||
| 501 | reading.tags_textual.insert(tag->hash); | |||
| 502 | reading.tags_textual_bloom.insert(tag->hash); | |||
| 503 | } | |||
| 504 | if (tag->type & T_NUMERICAL) { | |||
| 505 | reading.tags_numerical[tag->hash] = tag; | |||
| 506 | reading.parent->type &= ~CT_NUM_CURRENT; | |||
| 507 | } | |||
| 508 | if (!reading.baseform && (tag->type & T_BASEFORM)) { | |||
| 509 | reading.baseform = tag->hash; | |||
| 510 | } | |||
| 511 | if (parse_dep && (tag->type & T_DEPENDENCY) && !(reading.parent->type & CT_DEP_DONE)) { | |||
| 512 | reading.parent->dep_self = tag->dep_self; | |||
| 513 | reading.parent->dep_parent = tag->dep_parent; | |||
| 514 | if (tag->dep_parent == tag->dep_self) { | |||
| 515 | reading.parent->dep_parent = DEP_NO_PARENT; | |||
| 516 | } | |||
| 517 | has_dep = true; | |||
| 518 | } | |||
| 519 | if (grammar->has_relations && (tag->type & T_RELATION)) { | |||
| 520 | if (tag->dep_parent && tag->comparison_hash) { | |||
| 521 | reading.parent->relations_input[tag->comparison_hash].insert(tag->dep_parent); | |||
| 522 | } | |||
| 523 | if (tag->dep_self) { | |||
| 524 | gWindow->relation_map[tag->dep_self] = reading.parent->global_number; | |||
| 525 | } | |||
| 526 | has_relations = true; | |||
| 527 | reading.parent->setRelated(); | |||
| 528 | } | |||
| 529 | if (!(tag->type & T_SPECIAL)) { | |||
| 530 | reading.tags_plain.insert(tag->hash); | |||
| 531 | reading.tags_plain_bloom.insert(tag->hash); | |||
| 532 | } | |||
| 533 | if (rehash) { | |||
| 534 | reading.rehash(); | |||
| 535 | } | |||
| 536 | ||||
| 537 | if (grammar->has_bag_of_tags) { | |||
| 538 | Reading& bot = reading.parent->parent->bag_of_tags; | |||
| 539 | bot.tags.insert(tag->hash); | |||
| 540 | bot.tags_list.push_back(tag->hash); | |||
| 541 | bot.tags_bloom.insert(tag->hash); | |||
| 542 | ||||
| 543 | if (tag->type & (T_TEXTUAL | T_WORDFORM | T_BASEFORM)) { | |||
| 544 | bot.tags_textual.insert(tag->hash); | |||
| 545 | bot.tags_textual_bloom.insert(tag->hash); | |||
| 546 | } | |||
| 547 | if (tag->type & T_NUMERICAL) { | |||
| 548 | bot.tags_numerical[tag->hash] = tag; | |||
| 549 | } | |||
| 550 | if (!reading.baseform && (tag->type & T_BASEFORM)) { | |||
| 551 | bot.baseform = tag->hash; | |||
| 552 | } | |||
| 553 | if (!(tag->type & T_SPECIAL)) { | |||
| 554 | bot.tags_plain.insert(tag->hash); | |||
| 555 | bot.tags_plain_bloom.insert(tag->hash); | |||
| 556 | } | |||
| 557 | if (rehash) { | |||
| 558 | bot.rehash(); | |||
| 559 | } | |||
| 560 | } | |||
| 561 | ||||
| 562 | return tag->hash; | |||
| 563 | } | |||
| 564 | ||||
| 565 | void GrammarApplicator::delTagFromReading(Reading& reading, uint32_t utag) { | |||
| 566 | erase(reading.tags_list, utag); | |||
| 567 | reading.tags.erase(utag); | |||
| 568 | reading.tags_textual.erase(utag); | |||
| 569 | reading.tags_numerical.erase(utag); | |||
| 570 | reading.tags_plain.erase(utag); | |||
| 571 | if (reading.mapping && utag == reading.mapping->hash) { | |||
| 572 | reading.mapping = nullptr; | |||
| 573 | } | |||
| 574 | if (utag == reading.baseform) { | |||
| 575 | reading.baseform = 0; | |||
| 576 | } | |||
| 577 | reading.rehash(); | |||
| 578 | reading.parent->type &= ~CT_NUM_CURRENT; | |||
| 579 | } | |||
| 580 | ||||
| 581 | void GrammarApplicator::delTagFromReading(Reading& reading, Tag* tag) { | |||
| 582 | return delTagFromReading(reading, tag->hash); | |||
| 583 | } | |||
| 584 | ||||
| 585 | bool GrammarApplicator::unmapReading(Reading& reading, const uint32_t rule) { | |||
| 586 | bool readings_changed = false; | |||
| 587 | if (reading.mapping) { | |||
| 588 | reading.noprint = false; | |||
| 589 | delTagFromReading(reading, reading.mapping->hash); | |||
| 590 | readings_changed = true; | |||
| 591 | } | |||
| 592 | if (reading.mapped) { | |||
| 593 | reading.mapped = false; | |||
| 594 | readings_changed = true; | |||
| 595 | } | |||
| 596 | if (readings_changed) { | |||
| 597 | reading.hit_by.push_back(rule); | |||
| 598 | } | |||
| 599 | return readings_changed; | |||
| 600 | } | |||
| 601 | ||||
| 602 | void GrammarApplicator::splitMappings(TagList& mappings, Cohort& cohort, Reading& reading, bool mapped) { | |||
| 603 | for (auto it = mappings.begin(); it != mappings.end();) { | |||
| 604 | Tag*& tag = *it; | |||
| 605 | while (tag->type & T_VARSTRING) { | |||
| 606 | tag = generateVarstringTag(tag); | |||
| 607 | } | |||
| 608 | if (!(tag->type & T_MAPPING || tag->tag[0] == grammar->mapping_prefix)) { | |||
| 609 | addTagToReading(reading, tag); | |||
| 610 | it = mappings.erase(it); | |||
| 611 | } | |||
| 612 | else { | |||
| 613 | ++it; | |||
| 614 | } | |||
| 615 | } | |||
| 616 | ||||
| 617 | if (reading.mapping) { | |||
| 618 | mappings.push_back(reading.mapping); | |||
| 619 | delTagFromReading(reading, reading.mapping->hash); | |||
| 620 | } | |||
| 621 | ||||
| 622 | Tag* tag = mappings.back(); | |||
| 623 | mappings.pop_back(); | |||
| 624 | size_t i = mappings.size(); | |||
| 625 | for (auto ttag : mappings) { | |||
| 626 | // To avoid duplicating needlessly many times, check for a similar reading in the cohort that's already got this mapping | |||
| 627 | bool found = false; | |||
| 628 | for (auto itr : cohort.readings) { | |||
| 629 | if (itr->hash_plain == reading.hash_plain && itr->mapping && itr->mapping->hash == ttag->hash) { | |||
| 630 | found = true; | |||
| 631 | break; | |||
| 632 | } | |||
| 633 | } | |||
| 634 | if (found) { | |||
| 635 | continue; | |||
| 636 | } | |||
| 637 | Reading* nr = alloc_reading(reading); | |||
| 638 | nr->mapped = mapped; | |||
| 639 | nr->number = UI32(reading.number - i--); | |||
| 640 | uint32_t mp = addTagToReading(*nr, ttag); | |||
| 641 | if (mp != ttag->hash) { | |||
| 642 | nr->mapping = grammar->single_tags.find(mp)->second; | |||
| 643 | } | |||
| 644 | else { | |||
| 645 | nr->mapping = ttag; | |||
| 646 | } | |||
| 647 | cohort.appendReading(nr); | |||
| 648 | numReadings++; | |||
| 649 | } | |||
| 650 | ||||
| 651 | reading.mapped = mapped; | |||
| 652 | uint32_t mp = addTagToReading(reading, tag); | |||
| 653 | if (mp != tag->hash) { | |||
| 654 | reading.mapping = grammar->single_tags.find(mp)->second; | |||
| 655 | } | |||
| 656 | else { | |||
| 657 | reading.mapping = tag; | |||
| 658 | } | |||
| 659 | } | |||
| 660 | ||||
| 661 | void GrammarApplicator::splitAllMappings(all_mappings_t& all_mappings, Cohort& cohort, bool mapped) { | |||
| 662 | if (all_mappings.empty()) { | |||
| 663 | return; | |||
| 664 | } | |||
| 665 | static thread_local ReadingList readings; | |||
| 666 | readings = cohort.readings; | |||
| 667 | for (auto reading : readings) { | |||
| 668 | auto iter = all_mappings.find(reading); | |||
| 669 | if (iter == all_mappings.end()) { | |||
| 670 | continue; | |||
| 671 | } | |||
| 672 | splitMappings(iter->second, cohort, *reading, mapped); | |||
| 673 | } | |||
| 674 | std::sort(cohort.readings.begin(), cohort.readings.end(), Reading::cmp_number); | |||
| 675 | if (!grammar->reopen_mappings.empty()) { | |||
| 676 | for (auto reading : cohort.readings) { | |||
| 677 | if (reading->mapping && grammar->reopen_mappings.count(reading->mapping->hash)) { | |||
| 678 | reading->mapped = false; | |||
| 679 | } | |||
| 680 | } | |||
| 681 | } | |||
| 682 | all_mappings.clear(); | |||
| 683 | } | |||
| 684 | ||||
| 685 | void GrammarApplicator::mergeReadings(ReadingList& readings) { | |||
| 686 | static thread_local bc::flat_map<uint32_t, std::pair<uint32_t, Reading*>> mapped; | |||
| 687 | mapped.clear(); | |||
| 688 | mapped.reserve(readings.size()); | |||
| 689 | static thread_local bc::flat_map<uint32_t, ReadingList> mlist; | |||
| 690 | mlist.clear(); | |||
| 691 | mlist.reserve(readings.size()); | |||
| 692 | ||||
| 693 | for (auto r : readings) { | |||
| 694 | uint32_t hp = r->hash_plain, hplain = r->hash_plain; | |||
| 695 | if (ordered) { | |||
| 696 | hp = hplain = r->tags_string_hash; | |||
| 697 | } | |||
| 698 | uint32_t nm = 0; | |||
| 699 | if (trace) { | |||
| 700 | for (auto iter_hb : r->hit_by) { | |||
| 701 | hp = hash_value(iter_hb, hp); | |||
| 702 | } | |||
| 703 | } | |||
| 704 | if (r->mapping) { | |||
| 705 | ++nm; | |||
| 706 | } | |||
| 707 | Reading* sub = r->next; | |||
| 708 | while (sub) { | |||
| 709 | if (ordered) { | |||
| 710 | hp = hash_value(sub->tags_string_hash, hp); | |||
| 711 | hplain = hash_value(sub->tags_string_hash, hplain); | |||
| 712 | } | |||
| 713 | else { | |||
| 714 | hp = hash_value(sub->hash_plain, hp); | |||
| 715 | hplain = hash_value(sub->hash_plain, hplain); | |||
| 716 | } | |||
| 717 | if (trace) { | |||
| 718 | for (auto iter_hb : sub->hit_by) { | |||
| 719 | hp = hash_value(iter_hb, hp); | |||
| 720 | } | |||
| 721 | } | |||
| 722 | if (sub->mapping) { | |||
| 723 | ++nm; | |||
| 724 | } | |||
| 725 | sub = sub->next; | |||
| 726 | } | |||
| 727 | if (mapped.count(hplain)) { | |||
| 728 | if (mapped[hplain].first != 0 && nm == 0) { | |||
| 729 | r->deleted = true; | |||
| 730 | } | |||
| 731 | else if (mapped[hplain].first != nm && mapped[hplain].first == 0) { | |||
| 732 | mapped[hplain].second->deleted = true; | |||
| 733 | } | |||
| 734 | } | |||
| 735 | mapped[hplain] = std::make_pair(nm, r); | |||
| 736 | mlist[hp + nm].push_back(r); | |||
| 737 | } | |||
| 738 | ||||
| 739 | if (mlist.size() == readings.size()) { | |||
| 740 | return; | |||
| 741 | } | |||
| 742 | ||||
| 743 | readings.clear(); | |||
| 744 | static thread_local std::vector<Reading*> order; | |||
| 745 | order.clear(); | |||
| 746 | ||||
| 747 | for (auto& miter : mlist) { | |||
| 748 | const ReadingList& clist = miter.second; | |||
| 749 | Reading* nr = alloc_reading(*(clist.front())); | |||
| 750 | if (nr->mapping) { | |||
| 751 | erase(nr->tags_list, nr->mapping->hash); | |||
| 752 | } | |||
| 753 | for (auto iter1 : clist) { | |||
| 754 | if (iter1->mapping && std::find(nr->tags_list.begin(), nr->tags_list.end(), iter1->mapping->hash) == nr->tags_list.end()) { | |||
| 755 | nr->tags_list.push_back(iter1->mapping->hash); | |||
| 756 | } | |||
| 757 | free_reading(iter1); | |||
| 758 | } | |||
| 759 | order.push_back(nr); | |||
| 760 | } | |||
| 761 | ||||
| 762 | std::sort(order.begin(), order.end(), Reading::cmp_number); | |||
| 763 | readings.insert(readings.begin(), order.begin(), order.end()); | |||
| 764 | } | |||
| 765 | ||||
| 766 | void GrammarApplicator::mergeMappings(Cohort& cohort) { | |||
| 767 | mergeReadings(cohort.readings); | |||
| 768 | if (trace) { | |||
| 769 | mergeReadings(cohort.deleted); | |||
| 770 | mergeReadings(cohort.delayed); | |||
| 771 | } | |||
| 772 | } | |||
| 773 | ||||
| 774 | Cohort* GrammarApplicator::delimitAt(SingleWindow& current, Cohort* cohort) { | |||
| 775 | SingleWindow* nwin = nullptr; | |||
| ||||
| 776 | if (current.parent->current == ¤t) { | |||
| 777 | nwin = current.parent->allocPushSingleWindow(); | |||
| 778 | } | |||
| 779 | else { | |||
| 780 | foreach (iter, current.parent->next)if (!(current.parent->next).empty()) for (auto iter = (current .parent->next).begin(), iter_end = (current.parent->next ).end(); iter != iter_end; ++iter) { | |||
| 781 | if (*iter == ¤t) { | |||
| 782 | nwin = current.parent->allocSingleWindow(); | |||
| 783 | current.parent->next.insert(++iter, nwin); | |||
| 784 | break; | |||
| 785 | } | |||
| 786 | } | |||
| 787 | if (!nwin
| |||
| 788 | foreach (iter, current.parent->previous)if (!(current.parent->previous).empty()) for (auto iter = ( current.parent->previous).begin(), iter_end = (current.parent ->previous).end(); iter != iter_end; ++iter) { | |||
| 789 | if (*iter == ¤t) { | |||
| 790 | nwin = current.parent->allocSingleWindow(); | |||
| 791 | current.parent->previous.insert(iter, nwin); | |||
| 792 | break; | |||
| 793 | } | |||
| 794 | } | |||
| 795 | } | |||
| 796 | gWindow->rebuildSingleWindowLinks(); | |||
| 797 | } | |||
| 798 | ||||
| 799 | assert(nwin != 0)(static_cast<void> (0)); | |||
| 800 | ||||
| 801 | std::swap(current.flush_after, nwin->flush_after); | |||
| ||||
| 802 | std::swap(current.text_post, nwin->text_post); | |||
| 803 | nwin->has_enclosures = current.has_enclosures; | |||
| 804 | ||||
| 805 | Cohort* cCohort = alloc_cohort(nwin); | |||
| 806 | cCohort->global_number = current.parent->cohort_counter++; | |||
| 807 | cCohort->wordform = tag_begin; | |||
| 808 | ||||
| 809 | Reading* cReading = alloc_reading(cCohort); | |||
| 810 | cReading->baseform = begintag; | |||
| 811 | insert_if_exists(cReading->parent->possible_sets, grammar->sets_any); | |||
| 812 | addTagToReading(*cReading, begintag); | |||
| 813 | ||||
| 814 | cCohort->appendReading(cReading); | |||
| 815 | nwin->appendCohort(cCohort); | |||
| 816 | ||||
| 817 | auto lc = cohort->local_number; | |||
| 818 | auto nc = std::find(current.all_cohorts.begin() + lc, current.all_cohorts.end(), cohort); | |||
| 819 | ++nc; | |||
| 820 | auto from = nc; | |||
| 821 | for (; nc != current.all_cohorts.end(); ++nc) { | |||
| 822 | (*nc)->parent = nwin; | |||
| 823 | if ((*nc)->type & (CT_ENCLOSED | CT_REMOVED | CT_IGNORED)) { | |||
| 824 | nwin->all_cohorts.push_back(*nc); | |||
| 825 | } | |||
| 826 | else { | |||
| 827 | nwin->appendCohort(*nc); | |||
| 828 | } | |||
| 829 | } | |||
| 830 | current.cohorts.erase(current.cohorts.begin() + lc + 1, current.cohorts.end()); | |||
| 831 | current.all_cohorts.erase(from, current.all_cohorts.end()); | |||
| 832 | ||||
| 833 | cohort = current.cohorts.back(); | |||
| 834 | for (auto reading : cohort->readings) { | |||
| 835 | addTagToReading(*reading, endtag); | |||
| 836 | } | |||
| 837 | gWindow->rebuildCohortLinks(); | |||
| 838 | ||||
| 839 | return cohort; | |||
| 840 | } | |||
| 841 | ||||
| 842 | void GrammarApplicator::reflowTextuals_Reading(Reading& r) { | |||
| 843 | if (r.next) { | |||
| 844 | reflowTextuals_Reading(*r.next); | |||
| 845 | } | |||
| 846 | for (auto it : r.tags) { | |||
| 847 | Tag* tag = grammar->single_tags.find(it)->second; | |||
| 848 | if (tag->type & T_TEXTUAL) { | |||
| 849 | r.tags_textual.insert(it); | |||
| 850 | r.tags_textual_bloom.insert(it); | |||
| 851 | } | |||
| 852 | } | |||
| 853 | } | |||
| 854 | ||||
| 855 | void GrammarApplicator::reflowTextuals_Cohort(Cohort& c) { | |||
| 856 | for (auto it : c.readings) { | |||
| 857 | reflowTextuals_Reading(*it); | |||
| 858 | } | |||
| 859 | for (auto it : c.deleted) { | |||
| 860 | reflowTextuals_Reading(*it); | |||
| 861 | } | |||
| 862 | for (auto it : c.ignored) { | |||
| 863 | reflowTextuals_Reading(*it); | |||
| 864 | } | |||
| 865 | for (auto it : c.delayed) { | |||
| 866 | reflowTextuals_Reading(*it); | |||
| 867 | } | |||
| 868 | } | |||
| 869 | ||||
| 870 | void GrammarApplicator::reflowTextuals_SingleWindow(SingleWindow& sw) { | |||
| 871 | for (auto it : sw.all_cohorts) { | |||
| 872 | reflowTextuals_Cohort(*it); | |||
| 873 | } | |||
| 874 | } | |||
| 875 | ||||
| 876 | void GrammarApplicator::reflowTextuals() { | |||
| 877 | for (auto swit : gWindow->previous) { | |||
| 878 | reflowTextuals_SingleWindow(*swit); | |||
| 879 | } | |||
| 880 | reflowTextuals_SingleWindow(*gWindow->current); | |||
| 881 | for (auto swit : gWindow->next) { | |||
| 882 | reflowTextuals_SingleWindow(*swit); | |||
| 883 | } | |||
| 884 | } | |||
| 885 | } |