clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name MweSplitApplicator.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/cg3/cg3-1.4.17+g2285~f7d45cea/src -resource-dir /usr/lib/llvm-16/lib/clang/16 -D BOOST_NO_CXX98_FUNCTION_BASE=1 -D HAS_FS -D UNISTR_FROM_CHAR_EXPLICIT=explicit -D UNISTR_FROM_STRING_EXPLICIT=explicit -D _POSIX_C_SOURCE=200112 -D cg3_EXPORTS -I /tmp/build/cg3/cg3-1.4.17+g2285~f7d45cea/include/posix -I /tmp/build/cg3/cg3-1.4.17+g2285~f7d45cea/include -I /tmp/build/cg3/cg3-1.4.17+g2285~f7d45cea/src -I /usr/local/include -D NDEBUG -internal-isystem /usr/lib/llvm-16/bin/../include/c++/v1 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-missing-field-initializers -Wno-deprecated -Wno-unused-parameter -Wno-unused-result -std=c++2b -fdebug-compilation-dir=/tmp/build/cg3/cg3-1.4.17+g2285~f7d45cea/src -ferror-limit 19 -fvisibility-inlines-hidden -fgnuc-version=4.2.1 -fno-implicit-modules -fcxx-exceptions -fexceptions -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/cg3/scan-build/2024-09-11-161008-13503-1 -x c++ /tmp/build/cg3/cg3-1.4.17+g2285~f7d45cea/src/MweSplitApplicator.cpp
1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | |
18 | |
19 | |
20 | #include "MweSplitApplicator.hpp" |
21 | |
22 | namespace CG3 { |
23 | |
24 | MweSplitApplicator::MweSplitApplicator(std::ostream& ux_err) |
25 | : GrammarApplicator(ux_err) |
26 | { |
27 | Grammar* grammar = new Grammar; |
28 | grammar->ux_stderr = ux_stderr; |
29 | grammar->allocateDummySet(); |
30 | grammar->delimiters = grammar->allocateSet(); |
31 | grammar->addTagToSet(grammar->allocateTag(STR_DUMMY), grammar->delimiters); |
32 | grammar->reindex(); |
33 | setGrammar(grammar); |
34 | owns_grammar = true; |
35 | is_conv = true; |
36 | } |
37 | |
38 | void MweSplitApplicator::runGrammarOnText(std::istream& input, std::ostream& output) { |
39 | GrammarApplicator::runGrammarOnText(input, output); |
40 | } |
41 | |
42 | const Tag* MweSplitApplicator::maybeWfTag(const Reading* r) { |
43 | for (auto tter : r->tags_list) { |
44 | if ((!show_end_tags && tter == endtag) || tter == begintag) { |
45 | continue; |
46 | } |
47 | if (tter == r->baseform || tter == r->parent->wordform->hash) { |
48 | continue; |
49 | } |
50 | const Tag* tag = grammar->single_tags[tter]; |
51 | |
52 | if (tag->type & T_WORDFORM) { |
53 | return tag; |
54 | } |
55 | } |
56 | return nullptr; |
57 | } |
58 | |
59 | std::vector<Cohort*> MweSplitApplicator::splitMwe(Cohort* cohort) { |
60 | constexpr UChar rtrimblank[] = { ' ', '\n', '\r', '\t', 0 }; |
61 | constexpr UChar textprefix[] = { ':', 0 }; |
62 | std::vector<Cohort*> cos; |
63 | size_t n_wftags = 0; |
64 | size_t n_goodreadings = 0; |
65 | for (auto rter1 : cohort->readings) { |
66 | if (maybeWfTag(rter1) != nullptr) { |
67 | ++n_wftags; |
68 | } |
69 | ++n_goodreadings; |
70 | } |
71 | |
72 | if (n_wftags < n_goodreadings) { |
| |
73 | if (n_wftags > 0) { |
74 | u_fprintf(ux_stderr, "WARNING: Line %u: Some but not all main-readings of %S had wordform-tags (not completely mwe-disambiguated?), not splitting.\n", cohort->line_number, cohort->wordform->tag.data()); |
75 | |
76 | } |
77 | cos.push_back(cohort); |
78 | return cos; |
79 | } |
80 | UString pretext; |
81 | for (auto r : cohort->readings) { |
82 | size_t pos = std::numeric_limits<size_t>::max(); |
83 | Reading* prev = nullptr; |
| 8 | | 'prev' initialized to a null pointer value | |
|
84 | for (auto sub = r; sub; sub = sub->next) { |
| 9 | | Loop condition is true. Entering loop body | |
|
85 | const Tag* wfTag = maybeWfTag(sub); |
86 | if (wfTag == nullptr) { |
| 10 | | Assuming the condition is true | |
|
| |
87 | prev = prev->next; |
| 12 | | Access to field 'next' results in a dereference of a null pointer (loaded from variable 'prev') |
|
88 | } |
89 | else { |
90 | ++pos; |
91 | Cohort* c; |
92 | while (cos.size() < pos + 1) { |
93 | c = alloc_cohort(cohort->parent); |
94 | c->global_number = gWindow->cohort_counter++; |
95 | cohort->parent->appendCohort(c); |
96 | if(pretext.size() > 0) { |
97 | c->text = pretext; |
98 | pretext.clear(); |
99 | } |
100 | cos.push_back(c); |
101 | } |
102 | c = cos[pos]; |
103 | |
104 | const size_t wfBeg = 2; |
105 | const size_t spBeg0 = wfTag->tag.find_first_not_of(rtrimblank, wfBeg); |
106 | const size_t spBeg = sub->next ? spBeg0 : wfBeg; |
107 | const size_t wfEnd = wfTag->tag.size() - 3; |
108 | const size_t spEnd = 1 + wfTag->tag.find_last_not_of(rtrimblank, wfEnd); |
109 | const UString& wf = |
110 | wfTag->tag.substr(0, wfBeg) |
111 | + wfTag->tag.substr(spBeg, spEnd - spBeg) |
112 | + wfTag->tag.substr(wfEnd + 1); |
113 | if (c->wordform != 0 && wf != c->wordform->tag) { |
114 | u_fprintf(ux_stderr, "WARNING: Line %u: Ambiguous wordform-tags for same cohort, '%S' vs '%S', not splitting.\n", numLines, wf.data(), c->wordform->tag.data()); |
115 | cos.clear(); |
116 | cos.push_back(cohort); |
117 | return cos; |
118 | } |
119 | c->wordform = addTag(wf); |
120 | if (spBeg > wfBeg) { |
121 | pretext = textprefix + wfTag->tag.substr(wfBeg, spBeg - wfBeg); |
122 | } |
123 | if (spEnd < wfEnd + 1) { |
124 | c->text = textprefix + wfTag->tag.substr(spEnd, wfEnd + 1 - spEnd); |
125 | } |
126 | |
127 | Reading* rNew = alloc_reading(*sub); |
128 | for (size_t i = 0; i < rNew->tags_list.size(); ++i) { |
129 | auto& tter = rNew->tags_list[i]; |
130 | if (tter == wfTag->hash || tter == rNew->parent->wordform->hash) { |
131 | rNew->tags_list.erase(rNew->tags_list.begin() + i); |
132 | rNew->tags.erase(tter); |
133 | } |
134 | } |
135 | cos[pos]->appendReading(rNew); |
136 | rNew->parent = cos[pos]; |
137 | |
138 | if (prev != nullptr) { |
139 | free_reading(prev->next); |
140 | } |
141 | prev = rNew; |
142 | } |
143 | } |
144 | } |
145 | if (cos.size() == 0) { |
146 | u_fprintf(ux_stderr, "WARNING: Line %u: Tried splitting %S, but got no new cohorts; shouldn't happen.", numLines, cohort->wordform->tag.data()); |
147 | cos.push_back(cohort); |
148 | } |
149 | |
150 | cos[0]->text = cohort->text; |
151 | std::reverse(cos.begin(), cos.end()); |
152 | return cos; |
153 | } |
154 | |
155 | void MweSplitApplicator::printSingleWindow(SingleWindow* window, std::ostream& output, bool profiling) { |
156 | for (auto var : window->variables_output) { |
157 | Tag* key = grammar->single_tags[var]; |
158 | auto iter = window->variables_set.find(var); |
159 | if (iter != window->variables_set.end()) { |
160 | if (iter->second != grammar->tag_any) { |
161 | Tag* value = grammar->single_tags[iter->second]; |
162 | u_fprintf(output, "%S%S=%S>\n", STR_CMD_SETVAR.data(), key->tag.data(), value->tag.data()); |
163 | } |
164 | else { |
165 | u_fprintf(output, "%S%S>\n", STR_CMD_SETVAR.data(), key->tag.data()); |
166 | } |
167 | } |
168 | else { |
169 | u_fprintf(output, "%S%S>\n", STR_CMD_REMVAR.data(), key->tag.data()); |
170 | } |
171 | } |
172 | |
173 | if (!window->text.empty()) { |
| 1 | Assuming the condition is true | |
|
| |
174 | u_fprintf(output, "%S", window->text.data()); |
175 | if (!ISNL(window->text.back())) { |
| |
176 | u_fputc('\n', output); |
177 | } |
178 | } |
179 | |
180 | auto cs = UI32(window->cohorts.size()); |
181 | for (uint32_t c = 0; c < cs; c++) { |
| |
| 5 | | Loop condition is true. Entering loop body | |
|
182 | Cohort* cohort = window->cohorts[c]; |
183 | std::vector<Cohort*> cs = splitMwe(cohort); |
| 6 | | Calling 'MweSplitApplicator::splitMwe' | |
|
184 | for (auto& iter : cs) { |
185 | printCohort(iter, output, profiling); |
186 | } |
187 | } |
188 | |
189 | if (!window->text_post.empty()) { |
190 | u_fprintf(output, "%S", window->text_post.data()); |
191 | if (!ISNL(window->text_post.back())) { |
192 | u_fputc('\n', output); |
193 | } |
194 | } |
195 | |
196 | u_fputc('\n', output); |
197 | if (window->flush_after) { |
198 | u_fprintf(output, "%S\n", STR_CMD_FLUSH.data()); |
199 | } |
200 | u_fflush(output); |
201 | } |
202 | } |