Bug Summary

File:postchunk.cc
Warning:line 363, column 17
Access to field 'name' results in a dereference of a null pointer (loaded from variable 'leftSide')

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name postchunk.cc -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -resource-dir /usr/lib/llvm-16/lib/clang/16 -D HAVE_CONFIG_H -I . -I .. -I /usr/include/utf8cpp/ -I /usr/local/include -I /usr/include/libxml2 -I /usr/local/include -D PIC -internal-isystem /usr/lib/llvm-16/bin/../include/c++/v1 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -std=c++2b -fdeprecated-macro -fdebug-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -ferror-limit 19 -fgnuc-version=4.2.1 -fno-implicit-modules -fcxx-exceptions -fexceptions -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/apertium/scan-build/2024-09-11-155328-205384-1 -x c++ postchunk.cc
1/*
2 * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <https://www.gnu.org/licenses/>.
16 */
17#include <apertium/postchunk.h>
18
19#include <lttoolbox/xml_walk_util.h>
20#include <lttoolbox/string_utils.h>
21
22#include <iostream>
23
24using namespace std;
25
26Postchunk::Postchunk()
27{}
28
29bool
30Postchunk::checkIndex(xmlNode *element, int index, int limit)
31{
32 if(index > limit) // Note: Unlike transfer/interchunk, we allow index==limit!
33 {
34 cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index > limit" << endl;
35 return false;
36 }
37 if(index < 0) {
38 cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": index < 0" << endl;
39 return false;
40 }
41 if(word[index] == 0)
42 {
43 cerr << "Error in " << (char *) doc->URL << ": line " << element->line << ": Null access at word[index]" << endl;
44 return false;
45 }
46 return true;
47}
48
49UString
50Postchunk::evalCachedString(xmlNode* element)
51{
52 TransferInstr& ti = evalStringCache[element];
53 switch (ti.getType()) {
54 case ti_clip_tl:
55 if (checkIndex(element, ti.getPos(), lword)) {
56 if (gettingLemmaFromWord(ti.getContent()) && lword > 1) {
57 if (in_lu) {
58 out_wblank = combineWblanks(out_wblank, word[ti.getPos()]->getWblank());
59 } else if (in_let_var) {
60 var_out_wblank[var_val] = combineWblanks(var_out_wblank[var_val],
61 word[ti.getPos()]->getWblank());
62 }
63 }
64 return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]);
65 }
66 break;
67
68 case ti_lu_count:
69 return StringUtils::itoa(tmpword.size());
70
71 case ti_var:
72 if (lword > 1) {
73 out_wblank = combineWblanks(out_wblank, var_out_wblank[ti.getContent()]);
74 }
75 return variables[ti.getContent()];
76
77 case ti_lit_tag:
78 case ti_lit:
79 return ti.getContent();
80
81 case ti_b:
82 if (!blank_queue.empty()) {
83 UString retblank = blank_queue.front();
84 if (in_out) {
85 blank_queue.pop();
86 }
87 return retblank;
88 } else {
89 return " "_u;
90 }
91 break;
92
93 case ti_get_case_from:
94 if (checkIndex(element, ti.getPos()+1, lword)) {
95 return copycase(word[ti.getPos()+1]->chunkPart(attr_items[ti.getContent()]),
96 evalString(ti.getPointer()));
97 }
98 break;
99
100 case ti_case_of_tl:
101 if (checkIndex(element, ti.getPos(), lword)) {
102 return StringUtils::getcase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]));
103 }
104 break;
105
106 default:
107 return ""_u;
108 }
109 return ""_u;
110}
111
112void
113Postchunk::processClip(xmlNode* element)
114{
115 int pos = 0;
116 UString part;
117 for(xmlAttr* i = element->properties; i != NULL__null; i = i->next) {
118 if (!xmlStrcmp(i->name, (const xmlChar*) "part")) {
119 part = to_ustring((const char*) i->children->content);
120 } else if (!xmlStrcmp(i->name, (const xmlChar*) "pos")) {
121 pos = atoi((const char *)i->children->content);
122 }
123 }
124 evalStringCache[element] = TransferInstr(ti_clip_tl, part, pos, NULL__null);
125}
126
127void
128Postchunk::processBlank(xmlNode* element)
129{
130 if (element->properties == NULL__null) {
131 evalStringCache[element] = TransferInstr(ti_b, " "_u, -1);
132 } else {
133 int pos = atoi((const char *) element->properties->children->content) - 1;
134 evalStringCache[element] = TransferInstr(ti_b, ""_u, pos);
135 }
136}
137
138void
139Postchunk::processLuCount(xmlNode* element)
140{
141 evalStringCache[element] = TransferInstr(ti_lu_count, ""_u, 0);
142}
143
144void
145Postchunk::processCaseOf(xmlNode* element)
146{
147 int pos = 0;
148 UString part;
149 for (xmlAttr* i = element->properties; i != NULL__null; i = i->next) {
150 if (!xmlStrcmp(i->name, (const xmlChar*) "part")) {
151 part = to_ustring((const char*) i->children->content);
152 } else if(!xmlStrcmp(i->name, (const xmlChar*) "pos")) {
153 pos = atoi((const char *) i->children->content);
154 }
155 }
156 evalStringCache[element] = TransferInstr(ti_case_of_tl, part, pos);
157}
158
159UString
160Postchunk::processLu(xmlNode* element)
161{
162 in_lu = true;
163 out_wblank.clear();
164
165 UString myword;
166 for (auto i : children(element)) {
167 myword.append(evalString(i));
168 }
169 in_lu = false;
170
171 if (lword == 1) {
172 out_wblank = word[1]->getWblank();
173 }
174
175 if (myword.empty()) {
176 return ""_u;
177 } else {
178 return out_wblank+"^"_u+myword+"$"_u;
179 }
180}
181
182UString
183Postchunk::processMlu(xmlNode* element)
184{
185 UString value;
186
187 bool first_time = true;
188 out_wblank.clear();
189 in_lu = true;
190
191 for (auto i : children(element)) {
192 UString myword;
193
194 for (auto j : children(i)) {
195 myword.append(evalString(j));
196 }
197
198 if (!first_time) {
199 if(!myword.empty() && myword[0] != '#') { //'+#' problem
200 value += '+';
201 }
202 } else {
203 if (!myword.empty()) {
204 first_time = false;
205 }
206 }
207
208 value.append(myword);
209 }
210
211 in_lu = false;
212
213 if (lword == 1) {
214 out_wblank = word[1]->getWblank();
215 }
216
217 if (value.empty()) {
218 return ""_u;
219 } else {
220 return out_wblank+"^"_u+value+"$"_u;
221 }
222}
223
224UString
225Postchunk::processChunk(xmlNode* element)
226{
227 cerr << "Error: unexpected expression: '" << element->name << "'" << endl;
228 exit(EXIT_FAILURE1);
229 return ""_u; // make the type checker happy
230}
231
232void
233Postchunk::processOut(xmlNode *localroot)
234{
235 in_out = true;
236
237 for (auto i : children(localroot)) {
238 if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) {
239 write(processLu(i), output);
240 } else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) {
241 write(processMlu(i), output);
242 } else { // 'b'
243 write(evalString(i), output);
244 }
245 }
246
247 in_out = false;
248}
249
250void
251Postchunk::processTags(xmlNode *localroot)
252{
253 for (auto i : children(localroot)) {
254 if(!xmlStrcmp(i->name, (xmlChar const *) "tag")) {
255 for (auto j : children(i)) {
256 write(evalString(j), output);
257 }
258 }
259 }
260}
261
262void
263Postchunk::processLet(xmlNode *localroot)
264{
265 xmlNode *leftSide = NULL__null, *rightSide = NULL__null;
266
267 for (auto i : children(localroot)) {
268 if(leftSide == NULL__null) {
269 leftSide = i;
270 } else {
271 rightSide = i;
272 break;
273 }
274 }
275
276 map<xmlNode *, TransferInstr>::iterator it = evalStringCache.find(leftSide);
277 if(it != evalStringCache.end())
278 {
279 TransferInstr &ti = it->second;
280 switch(ti.getType())
281 {
282 case ti_var:
283 in_let_var = true;
284 var_val = ti.getContent();
285 var_out_wblank[var_val].clear();
286
287 variables[ti.getContent()] = evalString(rightSide);
288
289 in_let_var = false;
290 return;
291
292 case ti_clip_tl:
293 {
294 bool match = word[ti.getPos()]->setChunkPart(attr_items[ti.getContent()], evalString(rightSide));
295 if(!match && trace)
296 {
297 cerr << "apertium-postchunk warning: <let> on line " << localroot->line << " sometimes discards its value." << endl;
298 }
299 }
300 return;
301
302 default:
303 return;
304 }
305 }
306 if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var"))
307 {
308 in_let_var = true;
309
310 UString const val = to_ustring((const char *) leftSide->properties->children->content);
311
312 var_val = val;
313 var_out_wblank[var_val].clear();
314
315 variables[val] = evalString(rightSide);
316
317 in_let_var = false;
318 evalStringCache[leftSide] = TransferInstr(ti_var, val, 0);
319 }
320 else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip"))
321 {
322 int pos = 0;
323 UString part;
324
325 for(xmlAttr *i = leftSide->properties; i != NULL__null; i = i->next)
326 {
327 if(!xmlStrcmp(i->name, (const xmlChar *) "part"))
328 {
329 part = to_ustring((const char*)i->children->content);
330 }
331 else if(!xmlStrcmp(i->name, (const xmlChar *) "pos"))
332 {
333 pos = atoi((const char *) i->children->content);
334 }
335 }
336
337
338 bool match = word[pos]->setChunkPart(attr_items[part],
339 evalString(rightSide));
340 if(!match && trace)
341 {
342 cerr << "apertium-postchunk warning: <let> on line " << localroot->line << " sometimes discards its value." << endl;
343 }
344 evalStringCache[leftSide] = TransferInstr(ti_clip_tl, part, pos, NULL__null);
345 }
346}
347
348void
349Postchunk::processModifyCase(xmlNode *localroot)
350{
351 if (dictionary_case) return;
1
Assuming field 'dictionary_case' is false
2
Taking false branch
352 xmlNode *leftSide = NULL__null, *rightSide = NULL__null;
3
'leftSide' initialized to a null pointer value
353
354 for (auto i : children(localroot)) {
355 if(leftSide == NULL__null) {
356 leftSide = i;
357 } else {
358 rightSide = i;
359 break;
360 }
361 }
362
363 if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip"))
4
Access to field 'name' results in a dereference of a null pointer (loaded from variable 'leftSide')
364 {
365 int pos = 0;
366 UString part;
367
368 for(xmlAttr *i = leftSide->properties; i != NULL__null; i = i->next)
369 {
370 if(!xmlStrcmp(i->name, (const xmlChar *) "part"))
371 {
372 part = to_ustring((const char*)i->children->content);
373 }
374 else if(!xmlStrcmp(i->name, (const xmlChar *) "pos"))
375 {
376 pos = atoi((const char *) i->children->content);
377 }
378 }
379
380 UString const result = StringUtils::copycase(evalString(rightSide),
381 word[pos]->chunkPart(attr_items[part]));
382 bool match = word[pos]->setChunkPart(attr_items[part], result);
383
384 if(!match && trace)
385 {
386 cerr << "apertium-postchunk warning: <modify-case> on line " << localroot->line << " sometimes discards its value." << endl;
387 }
388 }
389 else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var"))
390 {
391 UString const val = to_ustring((const char *) leftSide->properties->children->content);
392 variables[val] = StringUtils::copycase(evalString(rightSide), variables[val]);
393 }
394}
395
396void
397Postchunk::processCallMacro(xmlNode *localroot)
398{
399 UString n = to_ustring((const char *) localroot->properties->children->content);
400 int npar = 0;
401
402 xmlNode *macro = macro_map[macros[n]];
403
404 for(xmlAttr *i = macro->properties; i != NULL__null; i = i->next)
405 {
406 if(!xmlStrcmp(i->name, (const xmlChar *) "npar"))
407 {
408 npar = atoi((const char *) i->children->content);
409 break;
410 }
411 }
412
413 if (npar <= 0)
414 {
415 throw "Postchunk::processCallMacro() assumes npar > 0, but got npar <= 0";
416 }
417
418 InterchunkWord **myword = NULL__null;
419 if(npar > 0)
420 {
421 myword = new InterchunkWord *[npar+1];
422 }
423
424 myword[0] = word[0];
425
426 bool indexesOK = true;
427 int idx = 1;
428 for (auto i : children(localroot)) {
429 int pos = atoi((const char *) i->properties->children->content);
430 if(!checkIndex(localroot, pos, lword)) {
431 indexesOK = false; // avoid segfaulting on empty chunks, e.g. ^x<x>{}$
432 pos = 1;
433 }
434 myword[idx] = word[pos];
435 idx++;
436 }
437
438 swap(myword, word);
439 swap(npar, lword);
440
441 if(indexesOK) {
442 for (auto i : children(macro)) {
443 processInstruction(i);
444 }
445 }
446 else {
447 cerr << "Warning: Not calling macro \"" << n << "\" from line " << localroot->line << " (empty word?)" << endl;
448 }
449
450 swap(myword, word);
451 swap(npar, lword);
452
453 delete[] myword;
454}
455
456TransferToken &
457Postchunk::readToken(InputFile& in)
458{
459 if(!input_buffer.isEmpty())
460 {
461 return input_buffer.next();
462 }
463
464 UString content;
465 while(true)
466 {
467 UChar32 val = in.get();
468 if(in.eof() || (internal_null_flush && val == 0))
469 {
470 return input_buffer.add(TransferToken(content, tt_eof));
471 }
472 if(val == '\\')
473 {
474 content += '\\';
475 content += in.get();
476 }
477 else if(val == '[')
478 {
479 content += '[';
480 while(true)
481 {
482 UChar32 val2 = in.get();
483 if(val2 == '\\') {
484 content += '\\';
485 content += in.get();
486 } else if(val2 == ']') {
487 content += ']';
488 break;
489 } else {
490 content += val2;
491 }
492 }
493 }
494 else if(inword && val == '{')
495 {
496 content += '{';
497 while(true) {
498 UChar32 val2 = in.get();
499 if(val2 == '\\') {
500 content += '\\';
501 content += in.get();
502 } else if(val2 == '}') {
503 UChar32 val3 = in.peek();
504 content += '}';
505 if(val3 == '$') {
506 break;
507 }
508 } else {
509 content += val2;
510 }
511 }
512 }
513 else if(inword && val == '$')
514 {
515 inword = false;
516 return input_buffer.add(TransferToken(content, tt_word));
517 }
518 else if(val == '^')
519 {
520 inword = true;
521 return input_buffer.add(TransferToken(content, tt_blank));
522 }
523 else
524 {
525 content += val;
526 }
527 }
528}
529
530void
531Postchunk::postchunk_wrapper_null_flush(InputFile& in, UFILE* out)
532{
533 null_flush = false;
534 internal_null_flush = true;
535
536 while(!in.eof())
537 {
538 postchunk(in, out);
539 u_fputcu_fputc_72('\0', out);
540 u_fflushu_fflush_72(out);
541 variables = variable_defaults;
542 }
543
544 internal_null_flush = false;
545 null_flush = true;
546}
547
// Main processing loop. Reads tokens from `in` with readToken(), feeds
// them to the matcher state `ms`, and writes results to `out`.
// Whenever `ms` has no live states left, either the last matched rule
// is applied (applyRule) or the pending chunk / blank is emitted
// unchanged (unchunk / write). Returns when a tt_eof token arrives with
// no pending word, or on an unknown token type.
548void
549Postchunk::postchunk(InputFile& in, UFILE* out)
550{
551 if(getNullFlush())
552 {
    // NOTE(review): there is no return after this call, so the code
    // below still runs once the wrapper has finished.
553 postchunk_wrapper_null_flush(in, out);
554 }
555
  // `last` / `prev_last` are input_buffer positions used to rewind the
  // buffer after a rule has been applied or rejected.
556 unsigned int last = input_buffer.getPos();
557 unsigned int prev_last = last;
558 int lastrule_id = -1;
  // Rule ids passed to classifyFinals() below so they are excluded.
559 set<int> banned_rules;
560
561 output = out;
562 ms.init(me->getInitial());
563
564 while(true)
565 {
566 if(ms.size() == 0)
567 {
568 if(lastrule != NULL__null)
569 {
570 int num_words_to_consume = applyRule();
571
572 //Consume all the words from the input which matched the rule.
573 //This piece of code is executed unless the rule contains a "reject-current-rule" instruction
574 if(num_words_to_consume < 0)
575 {
576 banned_rules.clear();
577 input_buffer.setPos(last);
578 }
579 else if(num_words_to_consume > 0)
580 {
581 banned_rules.clear();
582 if(prev_last >= input_buffer.getSize())
583 {
584 input_buffer.setPos(0);
585 }
586 else
587 {
588 input_buffer.setPos(prev_last+1);
589 }
      // Skip buffered tokens until the requested number of tt_word
      // tokens has been passed (or the buffer runs out).
590 int num_consumed_words = 0;
591 while(num_consumed_words < num_words_to_consume && !input_buffer.isEmpty())
592 {
593 TransferToken& local_tt = input_buffer.next();
594 if (local_tt.getType() == tt_word)
595 {
596 num_consumed_words++;
597 }
598 }
599 }
600 else
601 {
602 //Add rule to banned rules
603 banned_rules.insert(lastrule_id);
604 input_buffer.setPos(prev_last);
605 input_buffer.next();
606 last = input_buffer.getPos();
607 }
608 lastrule_id = -1;
609 }
610 else
611 {
      // No rule matched: emit the first pending chunk, or else the
      // first pending blank, unchanged and restart the matcher.
612 if(tmpword.size() != 0) {
613 unchunk(*tmpword[0], output);
614 tmpword.clear();
615 input_buffer.setPos(last);
616 input_buffer.next();
617 prev_last = last;
618 banned_rules.clear();
619 last = input_buffer.getPos();
620 ms.init(me->getInitial());
621 }
622 else if(tmpblank.size() != 0) {
623 write(*tmpblank[0], output);
624 tmpblank.clear();
625 prev_last = last;
626 last = input_buffer.getPos();
627 ms.init(me->getInitial());
628 }
629 }
630 }
  // -1 means no rule is final in the current state; any other value is
  // used, minus one, as the index into rule_lines and rule_map.
631 int val = ms.classifyFinals(me->getFinals(), banned_rules);
632 if(val != -1)
633 {
634 size_t lastrule_line = rule_lines[val-1];
635 lastrule = rule_map[val-1];
636 last = input_buffer.getPos();
637 lastrule_id = val;
638
639 if(trace)
640 {
641 cerr << endl << "apertium-postchunk: Rule " << val << " line " << lastrule_line;
642 for (auto& it : tmpword) {
643 cerr << " " << *it;
644 }
645 cerr << endl;
646 }
647 }
648
649 TransferToken &current = readToken(in);
650
651 switch(current.getType())
652 {
653 case tt_word:
654 applyWord(current.getContent());
655 tmpword.push_back(&current.getContent());
656 break;
657
658 case tt_blank:
659 ms.step(' ');
660 tmpblank.push_back(&current.getContent());
661 break;
662
663 case tt_eof:
      // With a word still pending, clear the matcher so the next
      // iteration handles that word; otherwise write the trailing text
      // and finish.
664 if(tmpword.size() != 0) {
665 tmpblank.push_back(&current.getContent());
666 ms.clear();
667 }
668 else {
669 write(current.getContent(), output);
670 return;
671 }
672 break;
673
674 default:
675 cerr << "Error: Unknown input token." << endl;
676 return;
677 }
678 }
679}
680
681int
682Postchunk::applyRule()
683{
684 UString const chunk = *tmpword[0];
685 tmpword.clear();
686 splitWordsAndBlanks(chunk, tmpword, tmpblank);
687
688 word = new InterchunkWord *[tmpword.size()+1];
689 lword = tmpword.size();
690 word[0] = new InterchunkWord(wordzero(chunk));
691
692 for(unsigned int i = 1, limit = tmpword.size()+1; i != limit; i++)
693 {
694 if(i != 1) {
695 blank_queue.push(*tmpblank[i-1]);
696 }
697
698 word[i] = new InterchunkWord(*tmpword[i-1]);
699 }
700
701 int words_to_consume = processRule(lastrule);
702 lastrule = NULL__null;
703
704 if(word)
705 {
706 for(unsigned int i = 0, limit = tmpword.size() + 1; i != limit; i++)
707 {
708 delete word[i];
709 }
710 delete[] word;
711 }
712 word = NULL__null;
713
714 for(unsigned int i = 0, limit = tmpword.size(); i != limit; i++)
715 {
716 if(i != 0)
717 {
718 delete tmpblank[i];
719 }
720 delete tmpword[i];
721 }
722 tmpword.clear();
723 tmpblank.clear();
724 ms.init(me->getInitial());
725 return words_to_consume;
726}
727
728void
729Postchunk::applyWord(UString const &word_str)
730{
731 ms.step('^');
732 for(unsigned int i = 0, limit = word_str.size(); i < limit; i++)
733 {
734 switch(word_str[i])
735 {
736 case '\\':
737 i++;
738 ms.step(u_toloweru_tolower_72(word_str[i]), any_char);
739 break;
740
741 case '<':
742/* for(unsigned int j = i+1; j != limit; j++)
743 {
744 if(word_str[j] == '>')
745 {
746 int symbol = alphabet(word_str.substr(i, j-i+1));
747 if(symbol)
748 {
749 ms.step(symbol, any_tag);
750 }
751 else
752 {
753 ms.step(any_tag);
754 }
755 i = j;
756 break;
757 }
758 }
759 break;*/
760
761 case '{': // ignore the unmodifiable part of the chunk
762 ms.step('$');
763 return;
764
765 default:
766 ms.step(u_toloweru_tolower_72(word_str[i]), any_char);
767 break;
768 }
769 }
770 ms.step('$');
771}
772
773vector<UString>
774Postchunk::getVecTags(UString const &chunk)
775{
776 vector<UString> vectags;
777
778 for(int i = 0, limit = chunk.size(); i != limit; i++)
779 {
780 if(chunk[i] == '\\')
781 {
782 i++;
783 }
784 else if(chunk[i] == '<')
785 {
786 UString mytag;
787 do
788 {
789 mytag += chunk[i++];
790 }
791 while(chunk[i] != '>');
792 mytag += '>';
793 vectags.push_back(mytag);
794 }
795 else if(chunk[i] == '{')
796 {
797 break;
798 }
799 }
800 return vectags;
801}
802
803int
804Postchunk::beginChunk(UString const &chunk)
805{
806 for(int i = 0, limit = chunk.size(); i != limit; i++)
807 {
808 if(chunk[i] == '\\')
809 {
810 i++;
811 }
812 else if(chunk[i] == '{')
813 {
814 return i + 1;
815 }
816 }
817 return chunk.size();
818}
819
820int
821Postchunk::endChunk(UString const &chunk)
822{
823 return chunk.size()-2;
824}
825
826UString
827Postchunk::wordzero(UString const &chunk)
828{
829 for(unsigned int i = 0, limit = chunk.size(); i != limit ;i++)
830 {
831 if(chunk[i] == '\\')
832 {
833 i++;
834 }
835 else if(chunk[i] == '{')
836 {
837 return chunk.substr(0, i);
838 }
839 }
840
841 return ""_u;
842}
843
844UString
845Postchunk::pseudolemma(UString const &chunk)
846{
847 for(unsigned int i = 0, limit = chunk.size(); i != limit ;i++)
848 {
849 if(chunk[i] == '\\')
850 {
851 i++;
852 }
853 else if(chunk[i] == '<' || chunk[i] == '{')
854 {
855 return chunk.substr(0, i);
856 }
857 }
858
859 return ""_u;
860}
861
// Write the body of `chunk` (from beginChunk() up to endChunk()) to
// `output`.
//  - Escapes and [..] blocks are copied through unchanged.
//  - Inside ^..$ words, a tag whose first character is a digit is
//    replaced by the tag with that (1-based) number from
//    getVecTags(chunk), or dropped if there is no such tag; other tags
//    are copied.
//  - Unless dictionary_case is set, the case of the chunk's pseudolemma
//    is applied: "AA" upper-cases every plain character in words, "Aa"
//    upper-cases only the first alphanumeric one.
// NOTE(review): the inner while(chunk[++i] != ...) scans have no bound
// against the chunk length; they presumably rely on well-formed input.
862void
863Postchunk::unchunk(UString const &chunk, UFILE* output)
864{
865 vector<UString> vectags = getVecTags(chunk);
866 UString case_info = StringUtils::getcase(pseudolemma(chunk));
867 bool uppercase_all = false;
868 bool uppercase_first = false;
869
870 if(!dictionary_case && case_info == "AA"_u)
871 {
872 uppercase_all = true;
873 }
874 else if(!dictionary_case && case_info == "Aa"_u)
875 {
876 uppercase_first = true;
877 }
878
879 for(int i = beginChunk(chunk), limit = endChunk(chunk); i < limit; i++)
880 {
881 if(chunk[i] == '\\') {
882 u_fputcu_fputc_72('\\', output);
883 u_fputcu_fputc_72(chunk[++i], output);
884 } else if(chunk[i] == '^') {
      // A word: copy up to the closing '$', substituting numbered tags
      // and applying the case flags.
885 u_fputcu_fputc_72('^', output);
886 while(chunk[++i] != '$')
887 {
888 if(chunk[i] == '\\')
889 {
890 u_fputcu_fputc_72('\\', output);
891 u_fputcu_fputc_72(chunk[++i], output);
892 }
893 else if(chunk[i] == '<')
894 {
895 if(u_isdigitu_isdigit_72(chunk[i+1]))
896 {
          // Numbered tag <N>: N-1 indexes vectags; out-of-range numbers
          // produce no output.
897 int j = ++i;
898 while (chunk[++i] != '>');
899 unsigned long value = StringUtils::stoi(chunk.substr(j, i-j)) - 1;
900 if(vectags.size() > value)
901 {
902 write(vectags[value], output);
903 }
904 }
905 else
906 {
907 u_fputcu_fputc_72('<', output);
908 while(chunk[++i] != '>') u_fputcu_fputc_72(chunk[i], output);
909 u_fputcu_fputc_72('>', output);
910 }
911 }
912 else
913 {
914 if(uppercase_all)
915 {
916 // TODO
917 u_fputcu_fputc_72(u_toupperu_toupper_72(chunk[i]), output);
918 }
919 else if(uppercase_first)
920 {
          // Only the first alphanumeric character is upper-cased; the
          // flag is cleared once that has happened.
921 if(u_isalnumu_isalnum_72(chunk[i])) {
922 // TODO
923 u_fputcu_fputc_72(u_toupperu_toupper_72(chunk[i]), output);
924 uppercase_first = false;
925 } else {
926 u_fputcu_fputc_72(chunk[i], output);
927 }
928 }
929 else
930 {
931 u_fputcu_fputc_72(chunk[i], output);
932 }
933 }
934 }
935 u_fputcu_fputc_72('$', output);
936 }
937 else if(chunk[i] == '[')
938 {
      // A [..] block: copied verbatim, including escapes.
939 u_fputcu_fputc_72('[', output);
940 while(chunk[++i] != ']')
941 {
942 if(chunk[i] == '\\')
943 {
944 u_fputcu_fputc_72('\\', output);
945 u_fputcu_fputc_72(chunk[++i], output);
946 }
947 else
948 {
949 u_fputcu_fputc_72(chunk[i], output);
950 }
951 }
952 u_fputcu_fputc_72(']', output);
953 }
954 else
955 {
956 u_fputcu_fputc_72(chunk[i], output);
957 }
958 }
959}
960
961
// Split the body of `chunk` (from beginChunk() up to endChunk()) into
// words and blanks, appending newly allocated UString pointers to
// `words` and `blanks`.
//  - A ^..$ word, optionally preceded by a [[..]] word-bound blank,
//    becomes one entry in `words` (without the '^' and '$').
//  - Everything else (ordinary [..] blocks and loose characters) is
//    accumulated into entries of `blanks`; an empty blank is inserted
//    between two words that have nothing between them.
//  - Numbered tags <N> inside words are replaced by the N-th tag from
//    getVecTags(chunk); the case of the chunk's pseudolemma ("AA" or
//    "Aa") is applied to word characters.
// NOTE(review): the inner while(chunk[++i] != ...) scans have no bound
// against the chunk length; they presumably rely on well-formed input.
962void
963Postchunk::splitWordsAndBlanks(UString const &chunk, vector<UString *> &words,
964 vector<UString *> &blanks)
965{
966 vector<UString> vectags = getVecTags(chunk);
967 UString case_info = StringUtils::getcase(pseudolemma(chunk));
968 bool uppercase_all = false;
969 bool uppercase_first = false;
  // True while the most recently produced item was a blank (or nothing
  // has been produced yet).
970 bool lastblank = true;
971
972 if(case_info == "AA"_u)
973 {
974 uppercase_all = true;
975 }
976 else if(case_info == "Aa"_u)
977 {
978 uppercase_first = true;
979 }
980
981 for(int i = beginChunk(chunk), limit = endChunk(chunk); i < limit; i++)
982 {
983 if(chunk[i] == '^')
984 {
      // Two words in a row: record an empty blank between them.
985 if(!lastblank)
986 {
987 blanks.push_back(new UString(""_u));
988 }
989 lastblank = false;
990 UString *myword = new UString();
991 UString &ref = *myword;
992
993 while(chunk[++i] != '$')
994 {
995 if(chunk[i] == '\\')
996 {
997 ref += '\\';
998 ref += chunk[++i];
999 }
1000 else if(chunk[i] == '<')
1001 {
1002 if(u_isdigitu_isdigit_72(chunk[i+1]))
1003 {
1004 // replace tag
1005 int j = ++i;
1006 while (chunk[++i] != '>');
1007 unsigned long value = StringUtils::stoi(chunk.substr(j, i-j)) - 1;
1008 if(vectags.size() > value)
1009 {
1010 ref.append(vectags[value]);
1011 }
1012 }
1013 else
1014 {
1015 ref += '<';
1016 while(chunk[++i] != '>') ref += chunk[i];
1017 ref += '>';
1018 }
1019 }
1020 else
1021 {
1022 if(uppercase_all)
1023 {
1024 // TODO
1025 ref += u_toupperu_toupper_72(chunk[i]);
1026 }
1027 else if(uppercase_first)
1028 {
1029 if(u_isalnumu_isalnum_72(chunk[i]))
1030 {
1031 // TODO
1032 ref += u_toupperu_toupper_72(chunk[i]);
1033 uppercase_first = false;
1034 }
1035 else
1036 {
1037 ref += chunk[i];
1038 }
1039 }
1040 else
1041 {
1042 ref += chunk[i];
1043 }
1044 }
1045 }
1046
1047 words.push_back(myword);
1048 }
1049 else if(chunk[i] == '[')
1050 {
1051 if(chunk[i+1] == '[') //wordbound blank
1052 {
1053 if(!lastblank)
1054 {
1055 blanks.push_back(new UString(""_u));
1056 }
1057 lastblank = false;
1058 UString *myword = new UString();
1059 UString &ref = *myword;
1060
        // Copy the [[..]] block itself, up to and including "]]"; the
        // extra i++ before the break steps onto the character after it.
1061 while(true)
1062 {
1063 if(chunk[i] == '\\')
1064 {
1065 ref += '\\';
1066 ref += chunk[++i];
1067 }
1068 else if(chunk[i] == ']' && chunk[i-1] == ']')
1069 {
1070 ref += chunk[i];
1071 i++; //i->"^"_u
1072 break;
1073 }
1074 else
1075 {
1076 ref += chunk[i];
1077 }
1078
1079 i++;
1080 }
1081
        // Then the word that follows, handled like a plain ^..$ word and
        // appended to the same entry.
1082 while(chunk[++i] != '$')
1083 {
1084 if(chunk[i] == '\\')
1085 {
1086 ref += '\\';
1087 ref += chunk[++i];
1088 }
1089 else if(chunk[i] == '<')
1090 {
1091 if(u_isdigitu_isdigit_72(chunk[i+1]))
1092 {
1093 // replace tag
1094 int j = ++i;
1095 while (chunk[++i] != '>');
1096 unsigned long value = StringUtils::stoi(chunk.substr(j, i-j)) - 1;
1097 if(vectags.size() > value)
1098 {
1099 ref.append(vectags[value]);
1100 }
1101 }
1102 else
1103 {
1104 ref += '<';
1105 while(chunk[++i] != '>') ref += chunk[i];
1106 ref += '>';
1107 }
1108 }
1109 else
1110 {
1111 if(uppercase_all)
1112 {
1113 // TODO
1114 ref += u_toupperu_toupper_72(chunk[i]);
1115 }
1116 else if(uppercase_first)
1117 {
1118 if(u_isalnumu_isalnum_72(chunk[i])) // TODO
1119 {
1120 ref += u_toupperu_toupper_72(chunk[i]); // TODO
1121 uppercase_first = false;
1122 }
1123 else
1124 {
1125 ref += chunk[i];
1126 }
1127 }
1128 else
1129 {
1130 ref += chunk[i];
1131 }
1132 }
1133 }
1134
1135 words.push_back(myword);
1136 }
1137 else
1138 {
        // Ordinary [..] block: appended to the current blank, or to a
        // new one if the previous item was a word.
        // NOTE(review): blanks.back() is evaluated whenever lastblank is
        // true, which includes the case where `blanks` is still empty --
        // confirm callers guarantee a non-empty vector here.
1139 if (!(lastblank && blanks.back()))
1140 {
1141 blanks.push_back(new UString());
1142 }
1143 UString &ref = *(blanks.back());
1144 ref += '[';
1145 while(chunk[++i] != ']')
1146 {
1147 if(chunk[i] == '\\')
1148 {
1149 ref += '\\';
1150 ref += chunk[++i];
1151 }
1152 else
1153 {
1154 ref += chunk[i];
1155 }
1156 }
1157 ref += chunk[i];
1158
1159 lastblank = true;
1160 }
1161 }
1162 else
1163 {
      // Loose character between words: appended to the current blank.
      // NOTE(review): when lastblank is true nothing is pushed, so this
      // also relies on `blanks` being non-empty.
1164 if (!lastblank)
1165 {
1166 UString *myblank = new UString(""_u);
1167 blanks.push_back(myblank);
1168 }
1169 UString &ref = *(blanks.back());
1170 if(chunk[i] == '\\')
1171 {
1172 ref += '\\';
1173 ref += chunk[++i];
1174 }
1175 else
1176 {
1177 ref += chunk[i];
1178 }
1179 lastblank = true;
1180 }
1181 }
1182}