clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name interchunk.cc -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -resource-dir /usr/lib/llvm-16/lib/clang/16 -D HAVE_CONFIG_H -I . -I .. -I /usr/include/utf8cpp/ -I /usr/local/include -I /usr/include/libxml2 -I /usr/local/include -D PIC -internal-isystem /usr/lib/llvm-16/bin/../include/c++/v1 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -std=c++2b -fdeprecated-macro -fdebug-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -ferror-limit 19 -fgnuc-version=4.2.1 -fno-implicit-modules -fcxx-exceptions -fexceptions -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/apertium/scan-build/2024-09-11-155328-205384-1 -x c++ interchunk.cc
1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | #include <apertium/interchunk.h> |
18 | |
19 | #include <lttoolbox/xml_walk_util.h> |
20 | #include <lttoolbox/string_utils.h> |
21 | |
22 | #include <iostream> |
23 | |
24 | using namespace std; |
25 | |
26 | Interchunk::Interchunk() |
27 | {} |
28 | |
29 | bool |
30 | Interchunk::checkIndex(xmlNode *element, int index, int limit) |
31 | { |
32 | if(index >= limit) |
33 | { |
34 | cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index >= limit" << endl; |
35 | return false; |
36 | } |
37 | if(index < 0) { |
38 | cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index < 0" << endl; |
39 | return false; |
40 | } |
41 | if(word[index] == 0) |
42 | { |
43 | cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": Null access at word[index]" << endl; |
44 | return false; |
45 | } |
46 | return true; |
47 | } |
48 | |
49 | UString |
50 | Interchunk::evalCachedString(xmlNode* element) |
51 | { |
52 | TransferInstr& ti = evalStringCache[element]; |
53 | switch (ti.getType()) { |
54 | case ti_clip_tl: |
55 | if (checkIndex(element, ti.getPos(), lword)) { |
56 | if (ti.getContent() == "content"_u) { |
57 | UString wf = word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); |
58 | return wf.substr(1, wf.length()-2); |
59 | } else { |
60 | return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); |
61 | } |
62 | } |
63 | break; |
64 | |
65 | case ti_var: |
66 | return variables[ti.getContent()]; |
67 | |
68 | case ti_lit_tag: |
69 | case ti_lit: |
70 | return ti.getContent(); |
71 | |
72 | case ti_b: |
73 | if (!blank_queue.empty()) { |
74 | UString retblank = blank_queue.front(); |
75 | if (in_out) { |
76 | blank_queue.pop(); |
77 | } |
78 | return retblank; |
79 | } else { |
80 | return " "_u; |
81 | } |
82 | break; |
83 | |
84 | case ti_get_case_from: |
85 | if (checkIndex(element, ti.getPos(), lword)) { |
86 | return copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]), |
87 | evalString(ti.getPointer())); |
88 | } |
89 | break; |
90 | |
91 | case ti_case_of_tl: |
92 | if (checkIndex(element, ti.getPos(), lword)) { |
93 | return StringUtils::getcase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()])); |
94 | } |
95 | break; |
96 | |
97 | default: |
98 | return ""_u; |
99 | } |
100 | return ""_u; |
101 | } |
102 | |
103 | void |
104 | Interchunk::processClip(xmlNode* element) |
105 | { |
106 | int pos = 0; |
107 | UString part; |
108 | for (xmlAttr* i = element->properties; i != NULL; i = i->next) { |
109 | if (!xmlStrcmp(i->name, (const xmlChar*) "part")) { |
110 | part = to_ustring((const char*) i->children->content); |
111 | } else if (!xmlStrcmp(i->name, (const xmlChar*) "pos")) { |
112 | pos = atoi((const char*) i->children->content) - 1; |
113 | } |
114 | } |
115 | evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL); |
116 | } |
117 | |
118 | void |
119 | Interchunk::processBlank(xmlNode* element) |
120 | { |
121 | if (element->properties == NULL) { |
122 | evalStringCache[element] = TransferInstr(ti_b, " "_u, -1); |
123 | } else { |
124 | int pos = atoi((const char*) element->properties->children->content) - 1; |
125 | evalStringCache[element] = TransferInstr(ti_b, ""_u, pos); |
126 | } |
127 | } |
128 | |
129 | void |
130 | Interchunk::processLuCount(xmlNode* element) |
131 | { |
132 | cerr << "Error: unexpected expression: '" << element->name << "'" << endl; |
133 | exit(EXIT_FAILURE); |
134 | } |
135 | |
136 | UString |
137 | Interchunk::processLu(xmlNode* element) |
138 | { |
139 | cerr << "Error: unexpected expression: '" << element->name << "'" << endl; |
140 | exit(EXIT_FAILURE); |
141 | return ""_u; |
142 | } |
143 | |
144 | UString |
145 | Interchunk::processMlu(xmlNode* element) |
146 | { |
147 | cerr << "Error: unexpected expression: '" << element->name << "'" << endl; |
148 | exit(EXIT_FAILURE); |
149 | return ""_u; |
150 | } |
151 | |
152 | void |
153 | Interchunk::processCaseOf(xmlNode* element) |
154 | { |
155 | int pos = 0; |
156 | UString part; |
157 | for (xmlAttr* i = element->properties; i != NULL; i = i->next) { |
158 | if (!xmlStrcmp(i->name, (const xmlChar*) "part")) { |
159 | part = to_ustring((char*) i->children->content); |
160 | } else if (!xmlStrcmp(i->name, (const xmlChar*) "pos")) { |
161 | pos = atoi((const char*) i->children->content) - 1; |
162 | } |
163 | } |
164 | evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos); |
165 | } |
166 | |
167 | void |
168 | Interchunk::processOut(xmlNode *localroot) |
169 | { |
170 | in_out = true; |
171 | |
172 | for (auto i : children(localroot)) { |
173 | if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) { |
174 | write(processChunk(i), output); |
175 | } else { |
176 | write(evalString(i), output); |
177 | } |
178 | } |
179 | |
180 | in_out = false; |
181 | } |
182 | |
183 | UString |
184 | Interchunk::processChunk(xmlNode *localroot) |
185 | { |
186 | UString result; |
187 | result.append("^"_u); |
188 | |
189 | for (auto i : children(localroot)) { |
190 | result.append(evalString(i)); |
191 | } |
192 | |
193 | result.append("$"_u); |
194 | return result; |
195 | } |
196 | |
197 | void |
198 | Interchunk::processLet(xmlNode *localroot) |
199 | { |
200 | xmlNode *leftSide = NULL, *rightSide = NULL; |
201 | |
202 | for (auto i : children(localroot)) { |
203 | if(leftSide == NULL) { |
204 | leftSide = i; |
205 | } else { |
206 | rightSide = i; |
207 | break; |
208 | } |
209 | } |
210 | |
211 | map<xmlNode *, TransferInstr>::iterator it = evalStringCache.find(leftSide); |
212 | if(it != evalStringCache.end()) |
213 | { |
214 | TransferInstr &ti = it->second; |
215 | switch(ti.getType()) |
216 | { |
217 | case ti_var: |
218 | variables[ti.getContent()] = evalString(rightSide); |
219 | return; |
220 | |
221 | case ti_clip_tl: |
222 | { |
223 | bool match = word[ti.getPos()]->setChunkPart(attr_items[ti.getContent()], evalString(rightSide)); |
224 | if(!match && trace) |
225 | { |
226 | cerr << "apertium-interchunk warning: <let> on line " << localroot->line << " sometimes discards its value." << endl; |
227 | } |
228 | } |
229 | return; |
230 | |
231 | default: |
232 | return; |
233 | } |
234 | } |
235 | if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) |
236 | { |
237 | UString const val = to_ustring((const char *) leftSide->properties->children->content); |
238 | variables[val] = evalString(rightSide); |
239 | evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); |
240 | } |
241 | else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) |
242 | { |
243 | int pos = 0; |
244 | UString part; |
245 | |
246 | for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) |
247 | { |
248 | if(!xmlStrcmp(i->name, (const xmlChar *) "part")) |
249 | { |
250 | part = to_ustring((char*)i->children->content); |
251 | } |
252 | else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) |
253 | { |
254 | pos = atoi((const char *) i->children->content) - 1; |
255 | } |
256 | } |
257 | |
258 | |
259 | bool match = word[pos]->setChunkPart(attr_items[part], |
260 | evalString(rightSide)); |
261 | if(!match && trace) |
262 | { |
263 | cerr << "apertium-interchunk warning: <let> on line " << localroot->line << " sometimes discards its value." << endl; |
264 | } |
265 | evalStringCache[leftSide] = TransferInstr(ti_clip_tl, |
266 | part, |
267 | pos, NULL); |
268 | } |
269 | } |
270 | |
271 | void |
272 | Interchunk::processModifyCase(xmlNode *localroot) |
273 | { |
274 | if (dictionary_case) return; |
| 1 | Assuming field 'dictionary_case' is false | |
|
| |
275 | xmlNode *leftSide = NULL, *rightSide = NULL; |
| 3 | | 'leftSide' initialized to a null pointer value | |
|
276 | |
277 | for (auto i : children(localroot)) { |
278 | if(leftSide == NULL) { |
279 | leftSide = i; |
280 | } else { |
281 | rightSide = i; |
282 | break; |
283 | } |
284 | } |
285 | |
286 | if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) |
| 4 | | Access to field 'name' results in a dereference of a null pointer (loaded from variable 'leftSide') |
|
287 | { |
288 | int pos = 0; |
289 | UString part; |
290 | |
291 | for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) |
292 | { |
293 | if(!xmlStrcmp(i->name, (const xmlChar *) "part")) |
294 | { |
295 | part = to_ustring((char*)i->children->content); |
296 | } |
297 | else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) |
298 | { |
299 | pos = atoi((const char *) i->children->content) - 1; |
300 | } |
301 | } |
302 | |
303 | UString const result = StringUtils::copycase(evalString(rightSide), |
304 | word[pos]->chunkPart(attr_items[part])); |
305 | bool match = word[pos]->setChunkPart(attr_items[part], result); |
306 | if(!match && trace) |
307 | { |
308 | cerr << "apertium-interchunk warning: <modify-case> on line " << localroot->line << " sometimes discards its value." << endl; |
309 | } |
310 | } |
311 | else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) |
312 | { |
313 | UString const val = to_ustring((const char *) leftSide->properties->children->content); |
314 | variables[val] = StringUtils::copycase(evalString(rightSide), variables[val]); |
315 | } |
316 | } |
317 | |
318 | void |
319 | Interchunk::processCallMacro(xmlNode *localroot) |
320 | { |
321 | UString n = to_ustring((const char *) localroot->properties->children->content); |
322 | int npar = 0; |
323 | |
324 | xmlNode *macro = macro_map[macros[n]]; |
325 | |
326 | for(xmlAttr *i = macro->properties; i != NULL; i = i->next) |
327 | { |
328 | if(!xmlStrcmp(i->name, (const xmlChar *) "npar")) |
329 | { |
330 | npar = atoi((const char *) i->children->content); |
331 | break; |
332 | } |
333 | } |
334 | |
335 | |
336 | |
337 | InterchunkWord **myword = NULL; |
338 | int idx = 0; |
339 | if(npar > 0) |
340 | { |
341 | myword = new InterchunkWord *[npar]; |
342 | for (auto i : children(localroot)) { |
343 | int pos = atoi((const char *) i->properties->children->content)-1; |
344 | myword[idx] = word[pos]; |
345 | idx++; |
346 | } |
347 | } |
348 | |
349 | swap(myword, word); |
350 | swap(npar, lword); |
351 | |
352 | for (auto i : children(macro)) { |
353 | processInstruction(i); |
354 | } |
355 | |
356 | swap(myword, word); |
357 | swap(npar, lword); |
358 | |
359 | delete[] myword; |
360 | } |
361 | |
362 | TransferToken & |
363 | Interchunk::readToken(InputFile& in) |
364 | { |
365 | if(!input_buffer.isEmpty()) |
366 | { |
367 | return input_buffer.next(); |
368 | } |
369 | |
370 | UString content; |
371 | while(true) |
372 | { |
373 | int val = in.get(); |
374 | if(in.eof() || (internal_null_flush && val == 0)) |
375 | { |
376 | return input_buffer.add(TransferToken(content, tt_eof)); |
377 | } |
378 | if(val == '\\') |
379 | { |
380 | content += '\\'; |
381 | content += in.get(); |
382 | } |
383 | else if(val == '[') |
384 | { |
385 | content += '['; |
386 | while(true) |
387 | { |
388 | UChar32 val2 = in.get(); |
389 | if(val2 == '\\') { |
390 | content += '\\'; |
391 | content += in.get(); |
392 | } else if(val2 == ']') { |
393 | content += ']'; |
394 | break; |
395 | } else { |
396 | content += val2; |
397 | } |
398 | } |
399 | } |
400 | else if(inword && val == '{') { |
401 | content += '{'; |
402 | while(true) { |
403 | UChar32 val2 = in.get(); |
404 | if(val2 == '\\') { |
405 | content += '\\'; |
406 | content += in.get(); |
407 | } else if(val2 == '}') { |
408 | UChar32 val3 = in.peek(); |
409 | |
410 | content += '}'; |
411 | if(val3 == '$') { |
412 | break; |
413 | } |
414 | } else { |
415 | content += val2; |
416 | } |
417 | } |
418 | } |
419 | else if(inword && val == '$') |
420 | { |
421 | inword = false; |
422 | return input_buffer.add(TransferToken(content, tt_word)); |
423 | } |
424 | else if(val == '^') |
425 | { |
426 | inword = true; |
427 | return input_buffer.add(TransferToken(content, tt_blank)); |
428 | } |
429 | else |
430 | { |
431 | content += val; |
432 | } |
433 | } |
434 | } |
435 | |
436 | void |
437 | Interchunk::interchunk_wrapper_null_flush(InputFile& in, UFILE* out) |
438 | { |
439 | null_flush = false; |
440 | internal_null_flush = true; |
441 | |
442 | while(!in.eof()) { |
443 | interchunk(in, out); |
444 | u_fputc('\0', out); |
445 | u_fflush(out); |
446 | variables = variable_defaults; |
447 | } |
448 | internal_null_flush = false; |
449 | null_flush = true; |
450 | } |
451 | |
452 | |
453 | void |
454 | Interchunk::interchunk(InputFile& in, UFILE* out) |
455 | { |
456 | if(getNullFlush()) |
457 | { |
458 | interchunk_wrapper_null_flush(in, out); |
459 | } |
460 | |
461 | unsigned int last = input_buffer.getPos(); |
462 | unsigned int prev_last = last; |
463 | int lastrule_id = -1; |
464 | set<int> banned_rules; |
465 | |
466 | output = out; |
467 | ms.init(me->getInitial()); |
468 | |
469 | while(true) |
470 | { |
471 | if(ms.size() == 0) |
472 | { |
473 | if(lastrule != NULL) |
474 | { |
475 | int num_words_to_consume = applyRule(); |
476 | |
477 | |
478 | |
479 | if(num_words_to_consume < 0) |
480 | { |
481 | banned_rules.clear(); |
482 | input_buffer.setPos(last); |
483 | } |
484 | else if(num_words_to_consume > 0) |
485 | { |
486 | banned_rules.clear(); |
487 | if(prev_last >= input_buffer.getSize()) |
488 | { |
489 | input_buffer.setPos(0); |
490 | } |
491 | else |
492 | { |
493 | input_buffer.setPos(prev_last+1); |
494 | } |
495 | int num_consumed_words = 0; |
496 | while(num_consumed_words < num_words_to_consume && !input_buffer.isEmpty()) |
497 | { |
498 | TransferToken& local_tt = input_buffer.next(); |
499 | if (local_tt.getType() == tt_word) |
500 | { |
501 | num_consumed_words++; |
502 | } |
503 | } |
504 | } |
505 | else |
506 | { |
507 | |
508 | banned_rules.insert(lastrule_id); |
509 | input_buffer.setPos(prev_last); |
510 | input_buffer.next(); |
511 | last = input_buffer.getPos(); |
512 | } |
513 | lastrule_id = -1; |
514 | } |
515 | else |
516 | { |
517 | if(tmpword.size() != 0) |
518 | { |
519 | u_fprintf(output, "^%S$", tmpword[0]->c_str()); |
520 | tmpword.clear(); |
521 | input_buffer.setPos(last); |
522 | input_buffer.next(); |
523 | prev_last = last; |
524 | banned_rules.clear(); |
525 | last = input_buffer.getPos(); |
526 | ms.init(me->getInitial()); |
527 | } |
528 | else if(tmpblank.size() != 0) { |
529 | write(*tmpblank[0], output); |
530 | tmpblank.clear(); |
531 | prev_last = last; |
532 | last = input_buffer.getPos(); |
533 | ms.init(me->getInitial()); |
534 | } |
535 | } |
536 | } |
537 | int val = ms.classifyFinals(me->getFinals(), banned_rules); |
538 | if(val != -1) |
539 | { |
540 | size_t lastrule_line = rule_lines[val-1]; |
541 | lastrule = rule_map[val-1]; |
542 | last = input_buffer.getPos(); |
543 | lastrule_id = val; |
544 | |
545 | last_lword = tmpword.size(); |
546 | |
547 | if(trace) |
548 | { |
549 | cerr << endl << "apertium-interchunk: Rule " << val << " line " << lastrule_line; |
550 | for (auto& it : tmpword) { |
551 | cerr << " " << *it; |
552 | } |
553 | cerr << endl; |
554 | } |
555 | } |
556 | |
557 | TransferToken ¤t = readToken(in); |
558 | |
559 | switch(current.getType()) |
560 | { |
561 | case tt_word: |
562 | applyWord(current.getContent()); |
563 | tmpword.push_back(¤t.getContent()); |
564 | break; |
565 | |
566 | case tt_blank: |
567 | ms.step(' '); |
568 | tmpblank.push_back(¤t.getContent()); |
569 | break; |
570 | |
571 | case tt_eof: |
572 | if(tmpword.size() != 0) { |
573 | tmpblank.push_back(¤t.getContent()); |
574 | ms.clear(); |
575 | } |
576 | else { |
577 | write(current.getContent(), output); |
578 | tmpblank.clear(); |
579 | return; |
580 | } |
581 | break; |
582 | |
583 | default: |
584 | cerr << "Error: Unknown input token." << endl; |
585 | return; |
586 | } |
587 | } |
588 | } |
589 | |
590 | int |
591 | Interchunk::applyRule() |
592 | { |
593 | unsigned int limit = tmpword.size(); |
594 | |
595 | for(unsigned int i = 0; i != limit; i++) |
596 | { |
597 | if(i == 0) |
598 | { |
599 | word = new InterchunkWord *[limit]; |
600 | lword = limit; |
601 | } |
602 | else |
603 | { |
604 | if(int(blank_queue.size()) < last_lword - 1) |
605 | { |
606 | UString blank_to_add = UString(*tmpblank[i-1]); |
607 | blank_queue.push(blank_to_add); |
608 | } |
609 | } |
610 | |
611 | word[i] = new InterchunkWord(*tmpword[i]); |
612 | } |
613 | |
614 | int words_to_consume = processRule(lastrule); |
615 | lastrule = NULL; |
616 | |
617 | if(word) |
618 | { |
619 | for(unsigned int i = 0; i != limit; i++) |
620 | { |
621 | delete word[i]; |
622 | } |
623 | delete[] word; |
624 | } |
625 | |
626 | word = NULL; |
627 | tmpword.clear(); |
628 | tmpblank.clear(); |
629 | ms.init(me->getInitial()); |
630 | return words_to_consume; |
631 | } |
632 | |
633 | void |
634 | Interchunk::applyWord(UString const &word_str) |
635 | { |
636 | ms.step('^'); |
637 | for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) |
638 | { |
639 | switch(word_str[i]) |
640 | { |
641 | case '\\': |
642 | i++; |
643 | ms.step(u_tolower(word_str[i]), any_char); |
644 | break; |
645 | |
646 | case '<': |
647 | for(unsigned int j = i+1; j != limit; j++) |
648 | { |
649 | if(word_str[j] == '>') |
650 | { |
651 | int symbol = alphabet(word_str.substr(i, j-i+1)); |
652 | if(symbol) |
653 | { |
654 | ms.step(symbol, any_tag); |
655 | } |
656 | else |
657 | { |
658 | ms.step(any_tag); |
659 | } |
660 | i = j; |
661 | break; |
662 | } |
663 | } |
664 | break; |
665 | |
666 | case '{': |
667 | ms.step('$'); |
668 | return; |
669 | |
670 | default: |
671 | ms.step(u_tolower(word_str[i]), any_char); |
672 | break; |
673 | } |
674 | } |
675 | ms.step('$'); |
676 | } |