Bug Summary

File: fst_processor.cc
Warning: line 1652, column 9
Value stored to 'cur_pos' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name fst_processor.cc -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/lttoolbox/lttoolbox-3.7.13+g626~47e4bf9e/lttoolbox -resource-dir /usr/lib/llvm-16/lib/clang/16 -D HAVE_DECL_FGETC_UNLOCKED -D HAVE_DECL_FMEMOPEN -D HAVE_DECL_FPUTC_UNLOCKED -D HAVE_DECL_FPUTS_UNLOCKED -D HAVE_DECL_FREAD_UNLOCKED -D HAVE_DECL_FWRITE_UNLOCKED -D HAVE_GETOPT_LONG -D LTTOOLBOX_EXPORTS -D PACKAGE_VERSION="3.7.13" -D _GNU_SOURCE -D _POSIX_C_SOURCE=200112 -D lttoolbox_EXPORTS -I /usr/include/libxml2 -I /usr/include/utf8cpp -I /tmp/build/lttoolbox/lttoolbox-3.7.13+g626~47e4bf9e -I /tmp/build/lttoolbox/lttoolbox-3.7.13+g626~47e4bf9e/lttoolbox -I /usr/local/include -D NDEBUG -internal-isystem /usr/lib/llvm-16/bin/../include/c++/v1 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 
-Wno-missing-field-initializers -Wno-deprecated -Wno-unused-parameter -Wno-unused-result -std=c++2b -fdebug-compilation-dir=/tmp/build/lttoolbox/lttoolbox-3.7.13+g626~47e4bf9e/lttoolbox -ferror-limit 19 -fvisibility-inlines-hidden -fgnuc-version=4.2.1 -fno-implicit-modules -fcxx-exceptions -fexceptions -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/lttoolbox/scan-build/2024-09-11-154858-201037-1 -x c++ /tmp/build/lttoolbox/lttoolbox-3.7.13+g626~47e4bf9e/lttoolbox/fst_processor.cc
1/*
2 * Copyright (C) 2005-2019 Universitat d'Alacant / Universidad de Alicante
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <https://www.gnu.org/licenses/>.
16 */
17#include <lttoolbox/fst_processor.h>
18#include <lttoolbox/compression.h>
19#include <lttoolbox/exception.h>
20#include <lttoolbox/xml_parse_util.h>
21#include <lttoolbox/file_utils.h>
22#include <lttoolbox/string_utils.h>
23#include <lttoolbox/symbol_iter.h>
24
25#include <iostream>
26#include <cerrno>
27#include <climits>
28
29
30FSTProcessor::FSTProcessor()
31{
32 // escaped_chars chars
33 escaped_chars.insert('[');
34 escaped_chars.insert(']');
35 escaped_chars.insert('{');
36 escaped_chars.insert('}');
37 escaped_chars.insert('^');
38 escaped_chars.insert('$');
39 escaped_chars.insert('/');
40 escaped_chars.insert('\\');
41 escaped_chars.insert('@');
42 escaped_chars.insert('<');
43 escaped_chars.insert('>');
44
45 if(useDefaultIgnoredChars)
46 {
47 initDefaultIgnoredCharacters();
48 }
49}
50
51void
52FSTProcessor::streamError()
53{
54 throw Exception("Error: Malformed input stream.");
55}
56
57void
58FSTProcessor::parseICX(std::string const &file)
59{
60 if(useIgnoredChars)
61 {
62 reader = xmlReaderForFile(file.c_str(), NULL__null, 0);
63 if(reader == NULL__null)
64 {
65 std::cerr << "Error: cannot open '" << file << "'." << std::endl;
66 exit(EXIT_FAILURE1);
67 }
68 int ret = xmlTextReaderRead(reader);
69 while(ret == 1)
70 {
71 procNodeICX();
72 ret = xmlTextReaderRead(reader);
73 }
74 // No point trying to process ignored chars if there are none
75 if(ignored_chars.size() == 0)
76 {
77 useIgnoredChars = false;
78 }
79 }
80}
81
82void
83FSTProcessor::parseRCX(std::string const &file)
84{
85 if(useRestoreChars)
86 {
87 reader = xmlReaderForFile(file.c_str(), NULL__null, 0);
88 if(reader == NULL__null)
89 {
90 std::cerr << "Error: cannot open '" << file << "'." << std::endl;
91 exit(EXIT_FAILURE1);
92 }
93 int ret = xmlTextReaderRead(reader);
94 while(ret == 1)
95 {
96 procNodeRCX();
97 ret = xmlTextReaderRead(reader);
98 }
99 }
100}
101
102void
103FSTProcessor::procNodeICX()
104{
105 UString name = XMLParseUtil::readName(reader);
106 if(name == XML_TEXT_NODE)
107 {
108 /* ignore */
109 }
110 else if(name == XML_IGNORED_CHARS_ELEM)
111 {
112 /* ignore */
113 }
114 else if(name == XML_CHAR_ELEM)
115 {
116 ignored_chars.insert(static_cast<int32_t>(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0]));
117 }
118 else if(name == XML_COMMENT_NODE)
119 {
120 /* ignore */
121 }
122 else
123 {
124 std::cerr << "Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader);
125 std::cerr << "): Invalid node '<" << name << ">'." << std::endl;
126 exit(EXIT_FAILURE1);
127 }
128}
129
130void
131FSTProcessor::initDefaultIgnoredCharacters()
132{
133 ignored_chars.insert(173); // '\u00AD', soft hyphen
134}
135
136void
137FSTProcessor::procNodeRCX()
138{
139 UString name = XMLParseUtil::readName(reader);
140 if(name == XML_TEXT_NODE)
141 {
142 /* ignore */
143 }
144 else if(name == XML_RESTORE_CHARS_ELEM)
145 {
146 /* ignore */
147 }
148 else if(name == XML_CHAR_ELEM)
149 {
150 rcx_current_char = static_cast<int32_t>(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0]);
151 }
152 else if(name == XML_RESTORE_CHAR_ELEM)
153 {
154 rcx_map[rcx_current_char].insert(static_cast<int32_t>(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0]));
155 }
156 else if(name == XML_COMMENT_NODE)
157 {
158 /* ignore */
159 }
160 else
161 {
162 std::cerr << "Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader);
163 std::cerr << "): Invalid node '<" << name << ">'." << std::endl;
164 exit(EXIT_FAILURE1);
165 }
166}
167
168int
169FSTProcessor::readAnalysis(InputFile& input)
170{
171 if (!input_buffer.isEmpty())
172 {
173 UChar32 val = input_buffer.next();
174 return val;
175 }
176
177 UChar32 val = input.get();
178 int32_t altval = 0;
179 if(input.eof())
180 {
181 input_buffer.add(0); // so it's treated like the NUL byte
182 return 0;
183 } else if(val == U_EOF0xFFFF) {
184 val = 0;
185 }
186
187 while ((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end())
188 {
189 val = input.get();
190 }
191
192 if(escaped_chars.find(val) != escaped_chars.end())
193 {
194 switch(val)
195 {
196 case '<':
197 altval = alphabet(input.readBlock('<', '>'));
198 input_buffer.add(altval);
199 return altval;
200
201 case '[':
202 val = input.get();
203
204 if(val == '[')
205 {
206 blankqueue.push(input.finishWBlank());
207 }
208 else
209 {
210 input.unget(val);
211 blankqueue.push(input.readBlock('[', ']'));
212 }
213
214 input_buffer.add(static_cast<int32_t>(' '));
215 return static_cast<int32_t>(' ');
216
217 case '\\':
218 val = input.get();
219 input_buffer.add(static_cast<int32_t>(val));
220 return val;
221
222 default:
223 streamError();
224 }
225 }
226 if(val == ' ') {
227 blankqueue.push(" "_u);
228 }
229
230 input_buffer.add(val);
231 return val;
232}
233
234int
235FSTProcessor::readTMAnalysis(InputFile& input)
236{
237 isLastBlankTM = false;
238 if(!input_buffer.isEmpty())
239 {
240 return input_buffer.next();
241 }
242
243 UChar32 val = input.get();
244 int32_t altval = 0;
245 if(input.eof())
246 {
247 return 0;
248 }
249
250 if(escaped_chars.find(val) != escaped_chars.end() || u_isdigitu_isdigit_72(val))
251 {
252 switch(val)
253 {
254 case '<':
255 altval = alphabet(input.readBlock('<', '>'));
256 input_buffer.add(altval);
257 return altval;
258
259 case '[':
260 val = input.get();
261
262 if(val == '[')
263 {
264 blankqueue.push(input.finishWBlank());
265 }
266 else
267 {
268 input.unget(val);
269 blankqueue.push(input.readBlock('[', ']'));
270 }
271
272 input_buffer.add(static_cast<int32_t>(' '));
273 isLastBlankTM = true;
274 return static_cast<int32_t>(' ');
275
276 case '\\':
277 val = input.get();
278 input_buffer.add(static_cast<int32_t>(val));
279 return val;
280 case '0':
281 case '1':
282 case '2':
283 case '3':
284 case '4':
285 case '5':
286 case '6':
287 case '7':
288 case '8':
289 case '9':
290 {
291 UString ws;
292 do
293 {
294 ws += val;
295 val = input.get();
296 } while(u_isdigitu_isdigit_72(val));
297 input.unget(val);
298 input_buffer.add(alphabet(u"<n>"));
299 numbers.push_back(ws);
300 return alphabet(u"<n>");
301 }
302 break;
303
304 default:
305 streamError();
306 }
307 }
308
309 input_buffer.add(val);
310 return val;
311}
312
313bool
314FSTProcessor::readTransliterationBlank(InputFile& input)
315{
316 UString blank;
317 while (!input.eof()) {
318 UChar32 c = input.get();
319 if (u_isspaceu_isspace_72(c)) {
320 blank += c;
321 } else if (c == '[') {
322 if (input.peek() == '[') {
323 break;
324 }
325 blank += input.readBlock('[', ']');
326 } else {
327 input.unget(c);
328 break;
329 }
330 }
331 if (!blank.empty()) {
332 blankqueue.push(blank);
333 }
334 return !blank.empty();
335}
336
337bool
338FSTProcessor::readTransliterationWord(InputFile& input)
339{
340 if (input.eof() || input.peek() == '\0') {
341 return false;
342 }
343
344 if (!readTransliterationBlank(input)) {
345 blankqueue.push(""_u);
346 }
347
348 UString wblank;
349 std::vector<int32_t> word;
350 if (input.peek() == '[') {
351 input.get();
352 wblank = input.finishWBlank();
353 while (!input.eof()) {
354 if (readTransliterationBlank(input)) {
355 word.push_back(static_cast<int32_t>(' '));
356 if (input.peek() == '[') break;
357 } else {
358 UChar32 c = input.get();
359 if (c == '[') {
360 input.unget(c);
361 break;
362 } else if (c == '\\') {
363 word.push_back(static_cast<int32_t>(input.get()));
364 } else if (c == '<') {
365 word.push_back(alphabet(input.readBlock('<', '>')));
366 } else if (c == '\0') {
367 input.unget(c);
368 break;
369 } else {
370 word.push_back(static_cast<int32_t>(c));
371 }
372 }
373 }
374 if (input.peek() == '[') {
375 input.get();
376 input.finishWBlank();
377 }
378 } else {
379 while (!input.eof()) {
380 UChar32 c = input.get();
381 if (u_isspaceu_isspace_72(c) || c == '[' || c == '\0') {
382 input.unget(c);
383 break;
384 } else if (c == '\\') {
385 word.push_back(static_cast<int32_t>(input.get()));
386 } else if (c == '<') {
387 word.push_back(alphabet(input.readBlock('<', '>')));
388 } else {
389 word.push_back(static_cast<int32_t>(c));
390 }
391 }
392 }
393 if (word.empty()) {
394 return false;
395 }
396 wblankqueue.push_back(wblank);
397 transliteration_queue.push_back(word);
398
399 return true;
400}
401
402void
403FSTProcessor::skipUntil(InputFile& input, UFILE *output, UChar32 const character)
404{
405 while(true)
406 {
407 UChar32 val = input.get();
408 if(input.eof())
409 {
410 return;
411 }
412
413 switch(val)
414 {
415 case '\\':
416 val = input.get();
417 if(input.eof())
418 {
419 return;
420 }
421 u_fputcu_fputc_72('\\', output);
422 u_fputcu_fputc_72(val, output);
423 break;
424
425 case '\0':
426 u_fputcu_fputc_72(val, output);
427 if(nullFlushGeneration)
428 {
429 u_fflushu_fflush_72(output);
430 }
431 break;
432
433 default:
434 if(val == character)
435 {
436 return;
437 }
438 else
439 {
440 u_fputcu_fputc_72(val, output);
441 }
442 break;
443 }
444 }
445}
446
447int
448FSTProcessor::readGeneration(InputFile& input, UFILE *output)
449{
450 UChar32 val = input.get();
451
452 if(input.eof())
453 {
454 return 0x7fffffff;
455 }
456
457 if(outOfWord)
458 {
459 if(val == '^')
460 {
461 val = input.get();
462 if(input.eof())
463 {
464 return 0x7fffffff;
465 }
466 }
467 else if(val == '\\')
468 {
469 u_fputcu_fputc_72(val, output);
470 val = input.get();
471 if(input.eof())
472 {
473 return 0x7fffffff;
474 }
475 u_fputcu_fputc_72(val,output);
476 skipUntil(input, output, '^');
477 val = input.get();
478 if(input.eof())
479 {
480 return 0x7fffffff;
481 }
482 }
483 else
484 {
485 u_fputcu_fputc_72(val, output);
486 skipUntil(input, output, '^');
487 val = input.get();
488 if(input.eof())
489 {
490 return 0x7fffffff;
491 }
492 }
493 outOfWord = false;
494 }
495
496 if(val == '\\')
497 {
498 val = input.get();
499 return static_cast<int32_t>(val);
500 }
501 else if(val == '$')
502 {
503 outOfWord = true;
504 return static_cast<int32_t>('$');
505 }
506 else if(val == '<')
507 {
508 return alphabet(input.readBlock('<', '>'));
509 }
510 else if(val == '[')
511 {
512 val = input.get();
513 if(val == '[')
514 {
515 write(input.finishWBlank(), output);
516 }
517 else
518 {
519 input.unget(val);
520 write(input.readBlock('[', ']'), output);
521 }
522
523 return readGeneration(input, output);
524 }
525 else
526 {
527 return static_cast<int32_t>(val);
528 }
529
530 return 0x7fffffff;
531}
532
533void
534FSTProcessor::flushBlanks(UFILE *output)
535{
536 for(size_t i = blankqueue.size(); i > 0; i--)
537 {
538 write(blankqueue.front(), output);
539 blankqueue.pop();
540 }
541}
542
543void
544FSTProcessor::calcInitial()
545{
546 for(auto& it : transducers) {
547 root.addTransition(0, 0, it.second.getInitial(), default_weight);
548 }
549
550 initial_state.init(&root);
551}
552
553void
554FSTProcessor::classifyFinals()
555{
556 for(auto& it : transducers) {
557 if(StringUtils::endswith(it.first, u"@inconditional"))
558 {
559 inconditional.insert(it.second.getFinals().begin(),
560 it.second.getFinals().end());
561 }
562 else if(StringUtils::endswith(it.first, u"@standard"))
563 {
564 standard.insert(it.second.getFinals().begin(),
565 it.second.getFinals().end());
566 }
567 else if(StringUtils::endswith(it.first, u"@postblank"))
568 {
569 postblank.insert(it.second.getFinals().begin(),
570 it.second.getFinals().end());
571 }
572 else if(StringUtils::endswith(it.first, u"@preblank"))
573 {
574 preblank.insert(it.second.getFinals().begin(),
575 it.second.getFinals().end());
576 }
577 else
578 {
579 std::cerr << "Error: Unsupported transducer type for '";
580 std::cerr << it.first << "'." << std::endl;
581 exit(EXIT_FAILURE1);
582 }
583 }
584}
585
586UString
587FSTProcessor::filterFinals(const State& state, UStringView casefrom)
588{
589 bool firstupper = false, uppercase = false;
590 if (!dictionaryCase) {
591 firstupper = u_isupperu_isupper_72(casefrom[0]);
592 uppercase = (casefrom.size() > 1 &&
593 firstupper && u_isupperu_isupper_72(casefrom[casefrom.size()-1]));
594 }
595 return state.filterFinals(all_finals, alphabet, escaped_chars,
596 displayWeightsMode, maxAnalyses, maxWeightClasses,
597 uppercase, firstupper, 0);
598}
599
600void
601FSTProcessor::writeEscaped(UStringView str, UFILE *output)
602{
603 for(unsigned int i = 0, limit = str.size(); i < limit; i++)
604 {
605 if(escaped_chars.find(str[i]) != escaped_chars.end())
606 {
607 u_fputcu_fputc_72('\\', output);
608 }
609 u_fputcu_fputc_72(str[i], output);
610 }
611}
612
613size_t
614FSTProcessor::writeEscapedPopBlanks(UStringView str, UFILE *output)
615{
616 size_t postpop = 0;
617 for (unsigned int i = 0, limit = str.size(); i < limit; i++)
618 {
619 if (escaped_chars.find(str[i]) != escaped_chars.end()) {
620 u_fputcu_fputc_72('\\', output);
621 }
622 u_fputcu_fputc_72(str[i], output);
623 if (str[i] == ' ') {
624 if (blankqueue.front() == " "_u) {
625 blankqueue.pop();
626 } else {
627 postpop++;
628 }
629 }
630 }
631 return postpop;
632}
633
634void
635FSTProcessor::writeEscapedWithTags(UStringView str, UFILE *output)
636{
637 for(unsigned int i = 0, limit = str.size(); i < limit; i++)
638 {
639 if(str[i] == '<' && i >=1 && str[i-1] != '\\')
640 {
641 write(str.substr(i), output);
642 return;
643 }
644
645 if(escaped_chars.find(str[i]) != escaped_chars.end())
646 {
647 u_fputcu_fputc_72('\\', output);
648 }
649 u_fputcu_fputc_72(str[i], output);
650 }
651}
652
653
654
655void
656FSTProcessor::printWord(UStringView sf, UStringView lf, UFILE *output)
657{
658 u_fputcu_fputc_72('^', output);
659 writeEscaped(sf, output);
660 write(lf, output);
661 u_fputcu_fputc_72('$', output);
662}
663
664void
665FSTProcessor::printWordPopBlank(UStringView sf, UStringView lf, UFILE *output)
666{
667 u_fputcu_fputc_72('^', output);
668 size_t postpop = writeEscapedPopBlanks(sf, output);
669 u_fprintfu_fprintf_72(output, "%.*S$", lf.size(), lf.data());
670 while (postpop-- && blankqueue.size() > 0)
671 {
672 write(blankqueue.front(), output);
673 blankqueue.pop();
674 }
675}
676
677void
678FSTProcessor::printUnknownWord(UStringView sf, UFILE *output)
679{
680 u_fputcu_fputc_72('^', output);
681 writeEscaped(sf, output);
682 u_fputcu_fputc_72('/', output);
683 u_fputcu_fputc_72('*', output);
684 writeEscaped(sf, output);
685 u_fputcu_fputc_72('$', output);
686}
687
688unsigned int
689FSTProcessor::lastBlank(UStringView str)
690{
691 for(int i = static_cast<int>(str.size())-1; i >= 0; i--)
692 {
693 if(alphabetic_chars.find(str[i]) == alphabetic_chars.end())
694 {
695 return static_cast<unsigned int>(i);
696 }
697 }
698
699 return 0;
700}
701
702void
703FSTProcessor::printSpace(UChar32 val, UFILE *output)
704{
705 if(blankqueue.size() > 0)
706 {
707 flushBlanks(output);
708 }
709 else
710 {
711 u_fputcu_fputc_72(val, output);
712 }
713}
714
715void
716FSTProcessor::printChar(UChar32 val, UFILE* output)
717{
718 if (u_isspaceu_isspace_72(val)) {
719 if (blankqueue.size() > 0) {
720 write(blankqueue.front(), output);
721 blankqueue.pop();
722 } else {
723 u_fputcu_fputc_72(val, output);
724 }
725 } else {
726 if (isEscaped(val)) {
727 u_fputcu_fputc_72('\\', output);
728 }
729 if (val) {
730 u_fputcu_fputc_72(val, output);
731 }
732 }
733}
734
735bool
736FSTProcessor::isEscaped(UChar32 c) const
737{
738 return escaped_chars.find(c) != escaped_chars.end();
739}
740
741bool
742FSTProcessor::isAlphabetic(UChar32 c) const
743{
744 return u_isalnumu_isalnum_72(c) || alphabetic_chars.find(c) != alphabetic_chars.end();
745}
746
747void
748FSTProcessor::load(FILE *input)
749{
750 readTransducerSet(input, alphabetic_chars, alphabet, transducers);
751}
752
753void
754FSTProcessor::initAnalysis()
755{
756 calcInitial();
757 classifyFinals();
758 all_finals = standard;
759 all_finals.insert(inconditional.begin(), inconditional.end());
760 all_finals.insert(postblank.begin(), postblank.end());
761 all_finals.insert(preblank.begin(), preblank.end());
762}
763
764void
765FSTProcessor::initTMAnalysis()
766{
767 calcInitial();
768
769 for(auto& it : transducers) {
770 all_finals.insert(it.second.getFinals().begin(),
771 it.second.getFinals().end());
772 }
773}
774
775void
776FSTProcessor::initGeneration()
777{
778 setIgnoredChars(false);
779 calcInitial();
780 for(auto& it : transducers) {
781 all_finals.insert(it.second.getFinals().begin(),
782 it.second.getFinals().end());
783 }
784}
785
786void
787FSTProcessor::initTransliteration()
788{
789 initGeneration();
790}
791
792void
793FSTProcessor::initBiltrans()
794{
795 initGeneration();
796}
797
798
799UString
800FSTProcessor::compoundAnalysis(UString input_word)
801{
802 const int MAX_COMBINATIONS = 32767;
803
804 State current_state = initial_state;
805
806 for(unsigned int i=0; i<input_word.size(); i++)
807 {
808 UChar val=input_word[i];
809
810 current_state.step_case(val, beCaseSensitive(current_state));
811
812 if(current_state.size() > MAX_COMBINATIONS)
813 {
814 std::cerr << "Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << "'" << std::endl;
815 std::cerr << " gave up at char " << i << " '" << val << "'." << std::endl;
816
817 UString nullString;
818 return nullString;
819 }
820
821 if(i < input_word.size()-1)
822 {
823 current_state.restartFinals(all_finals, compoundOnlyLSymbol, &initial_state, '+');
824 }
825
826 if(current_state.size()==0)
827 {
828 UString nullString;
829 return nullString;
830 }
831 }
832
833 current_state.pruneCompounds(compoundRSymbol, '+', compound_max_elements);
834 return filterFinals(current_state, input_word);
835}
836
837
838
839void
840FSTProcessor::initDecompositionSymbols()
841{
842 if((compoundOnlyLSymbol=alphabet(u"<:co:only-L>")) == 0
843 && (compoundOnlyLSymbol=alphabet(u"<:compound:only-L>")) == 0
844 && (compoundOnlyLSymbol=alphabet(u"<@co:only-L>")) == 0
845 && (compoundOnlyLSymbol=alphabet(u"<@compound:only-L>")) == 0
846 && (compoundOnlyLSymbol=alphabet(u"<compound-only-L>")) == 0)
847 {
848 std::cerr << "Warning: Decomposition symbol <:compound:only-L> not found" << std::endl;
849 }
850 else if(!showControlSymbols)
851 {
852 alphabet.setSymbol(compoundOnlyLSymbol, u"");
853 }
854
855 if((compoundRSymbol=alphabet(u"<:co:R>")) == 0
856 && (compoundRSymbol=alphabet(u"<:compound:R>")) == 0
857 && (compoundRSymbol=alphabet(u"<@co:R>")) == 0
858 && (compoundRSymbol=alphabet(u"<@compound:R>")) == 0
859 && (compoundRSymbol=alphabet(u"<compound-R>")) == 0)
860 {
861 std::cerr << "Warning: Decomposition symbol <:compound:R> not found" << std::endl;
862 }
863 else if(!showControlSymbols)
864 {
865 alphabet.setSymbol(compoundRSymbol, u"");
866 }
867}
868
869
870void
871FSTProcessor::initDecomposition()
872{
873 do_decomposition = true;
874 initAnalysis();
875 initDecompositionSymbols();
876}
877
878void
879FSTProcessor::analysis(InputFile& input, UFILE *output)
880{
881 if(getNullFlush())
882 {
883 analysis_wrapper_null_flush(input, output);
884 }
885
886 bool last_incond = false;
887 bool last_postblank = false;
888 bool last_preblank = false;
889 State current_state = initial_state;
890 UString lf; // analysis (lexical form and tags)
891 UString sf; // surface form
892 UString lf_spcmp; // space compound analysis
893 bool seen_cpL = false; // have we seen a <compound-only-L> tag so far
894 size_t last_start = input_buffer.getPos(); // position in input_buffer when sf was last cleared
895 size_t last = 0; // position in input_buffer after last analysis
896 size_t last_size = 0; // size of sf at last analysis
897 std::map<int, std::set<int> >::iterator rcx_map_ptr;
898
899 UChar32 val;
900 do
901 {
902 val = readAnalysis(input);
903 // test for final states
904 if(current_state.isFinal(all_finals))
905 {
906 if(current_state.isFinal(inconditional))
907 {
908 if(do_decomposition && compoundOnlyLSymbol != 0)
909 {
910 current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
911 }
912 lf = filterFinals(current_state, sf);
913 last_incond = true;
914 last = input_buffer.getPos();
915 last_size = sf.size();
916 }
917 else if(current_state.isFinal(postblank))
918 {
919 if(do_decomposition && compoundOnlyLSymbol != 0)
920 {
921 current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
922 }
923 lf = filterFinals(current_state, sf);
924 last_postblank = true;
925 last = input_buffer.getPos();
926 last_size = sf.size();
927 }
928 else if(current_state.isFinal(preblank))
929 {
930 if(do_decomposition && compoundOnlyLSymbol != 0)
931 {
932 current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
933 }
934 lf = filterFinals(current_state, sf);
935 last_preblank = true;
936 last = input_buffer.getPos();
937 last_size = sf.size();
938 }
939 else if(!isAlphabetic(val))
940 {
941 if(do_decomposition && compoundOnlyLSymbol != 0)
942 {
943 current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol);
944 }
945 lf = filterFinals(current_state, sf);
946 last_postblank = false;
947 last_preblank = false;
948 last_incond = false;
949 last = input_buffer.getPos();
950 last_size = sf.size();
951 }
952 else { // isAlphabetic, standard type section
953 // Record if a compound might be possible
954 if (do_decomposition && compoundOnlyLSymbol != 0
955 && current_state.hasSymbol(compoundOnlyLSymbol)) {
956 seen_cpL = true;
957 }
958 }
959 }
960 else if(sf.empty() && u_isspaceu_isspace_72(val))
961 {
962 lf = "/*"_u;
963 lf.append(sf);
964 last_postblank = false;
965 last_preblank = false;
966 last_incond = false;
967 last = input_buffer.getPos();
968 last_size = sf.size();
969 }
970
971 if(useRestoreChars && rcx_map.find(val) != rcx_map.end())
972 {
973 rcx_map_ptr = rcx_map.find(val);
974 std::set<int> tmpset = rcx_map_ptr->second;
975 if(!u_isupperu_isupper_72(val) || beCaseSensitive(current_state))
976 {
977 current_state.step(val, tmpset);
978 }
979 else if(rcx_map.find(u_toloweru_tolower_72(val)) != rcx_map.end())
980 {
981 rcx_map_ptr = rcx_map.find(tolower(val));
982 tmpset.insert(tolower(val));
983 tmpset.insert(rcx_map_ptr->second.begin(), rcx_map_ptr->second.end());
984 current_state.step(val, tmpset);
985 }
986 else
987 {
988 tmpset.insert(tolower(val));
989 current_state.step(val, tmpset);
990 }
991 }
992 else
993 {
994 current_state.step_case(val, beCaseSensitive(current_state));
995 }
996
997 if(current_state.size() != 0)
998 {
999 if(val != 0)
1000 {
1001 alphabet.getSymbol(sf, val);
1002 }
1003 }
1004 else
1005 {
1006 // First try if blank-crossing compound analysis is possible; have
1007 // to fall back on the regular methods if this didn't work:
1008 lf_spcmp.clear();
1009 if (seen_cpL // We've seen both a space and a <compund-only-L>
1010 && isAlphabetic(val)
1011 && !sf.empty()
1012 && last_size <= lastBlank(sf)) {
1013 int oldval = val;
1014 UString oldsf = sf;
1015 do {
1016 alphabet.getSymbol(sf, val);
1017 } while ((val = readAnalysis(input)) && isAlphabetic(val));
1018 lf_spcmp = compoundAnalysis(sf);
1019 if(lf_spcmp.empty()) { // didn't work, rewind!
1020 input_buffer.back(sf.size() - oldsf.size());
1021 val = oldval;
1022 sf.swap(oldsf);
1023 }
1024 else {
1025 input_buffer.back(1);
1026 val = input_buffer.peek();
1027 }
1028 }
1029 seen_cpL = false;
1030
1031 if(!lf_spcmp.empty()) {
1032 printWordPopBlank(sf, lf_spcmp, output);
1033 }
1034 else if(!isAlphabetic(val) && sf.empty())
1035 {
1036 printChar(val, output);
1037 }
1038 else if(last_postblank)
1039 {
1040 printWordPopBlank(sf.substr(0, last_size),
1041 lf, output);
1042 u_fputcu_fputc_72(' ', output);
1043 input_buffer.setPos(last);
1044 input_buffer.back(1);
1045 }
1046 else if(last_preblank)
1047 {
1048 u_fputcu_fputc_72(' ', output);
1049 printWordPopBlank(sf.substr(0, last_size),
1050 lf, output);
1051 input_buffer.setPos(last);
1052 input_buffer.back(1);
1053 }
1054 else if(last_incond)
1055 {
1056 printWordPopBlank(sf.substr(0, last_size),
1057 lf, output);
1058 input_buffer.setPos(last);
1059 input_buffer.back(1);
1060 }
1061 else if(isAlphabetic(val) &&
1062 // we can't skip back a blank:
1063 (last_size > lastBlank(sf) ||
1064 // or we've failed to reach an analysis:
1065 lf.empty()))
1066 {
1067 do
1068 {
1069 alphabet.getSymbol(sf, val);
1070 }
1071 while((val = readAnalysis(input)) && isAlphabetic(val));
1072
1073 auto limit = firstNotAlpha(sf);
1074 if(limit.i_codepoint == 0)
1075 {
1076 input_buffer.setPos(1 + last_start);
1077 writeEscaped(sf.substr(0,1), output);
1078 }
1079 else
1080 {
1081 input_buffer.setPos(last_start + limit.i_codepoint);
1082 UString unknown_word = sf.substr(0, limit.i_utf16);
1083 if(do_decomposition)
1084 {
1085 UString compound = compoundAnalysis(unknown_word);
1086 if(!compound.empty())
1087 {
1088 printWord(unknown_word, compound, output);
1089 }
1090 else
1091 {
1092 printUnknownWord(unknown_word, output);
1093 }
1094 }
1095 else
1096 {
1097 printUnknownWord(unknown_word, output);
1098 }
1099 }
1100 }
1101 else if(lf.empty())
1102 {
1103 auto limit = firstNotAlpha(sf);
1104 if(limit.i_codepoint == 0)
1105 {
1106 input_buffer.setPos(1 + last_start);
1107 writeEscaped(sf.substr(0,1), output);
1108 }
1109 else
1110 {
1111 input_buffer.setPos(last_start + limit.i_codepoint);
1112 UString unknown_word = sf.substr(0, limit.i_utf16);
1113 if(do_decomposition)
1114 {
1115 UString compound = compoundAnalysis(unknown_word);
1116 if(!compound.empty())
1117 {
1118 printWord(unknown_word, compound, output);
1119 }
1120 else
1121 {
1122 printUnknownWord(unknown_word, output);
1123 }
1124 }
1125 else
1126 {
1127 printUnknownWord(unknown_word, output);
1128 }
1129 }
1130 }
1131 else
1132 {
1133 printWordPopBlank(sf.substr(0, last_size),
1134 lf, output);
1135 input_buffer.setPos(last);
1136 input_buffer.back(1);
1137 }
1138 if(val == 0) {
1139 if(!input_buffer.isEmpty()) {
1140 input_buffer.setPos(last+1);
1141 }
1142 }
1143
1144 current_state = initial_state;
1145 lf.clear();
1146 sf.clear();
1147 last_start = input_buffer.getPos();
1148 last_incond = false;
1149 last_postblank = false;
1150 last_preblank = false;
1151 }
1152 }
1153 while(val);
1154
1155 // print remaining blanks
1156 flushBlanks(output);
1157}
1158
1159void
1160FSTProcessor::analysis_wrapper_null_flush(InputFile& input, UFILE *output)
1161{
1162 setNullFlush(false);
1163 while(!input.eof())
1164 {
1165 analysis(input, output);
1166 u_fputcu_fputc_72('\0', output);
1167 u_fflushu_fflush_72(output);
1168 // analysis() doesn't always leave input_buffer empty
1169 // which results in repeatedly analyzing the same string
1170 // so clear it here
1171 while (!input_buffer.isEmpty()) input_buffer.next();
1172 }
1173}
1174
1175void
1176FSTProcessor::generation_wrapper_null_flush(InputFile& input, UFILE *output,
1177 GenerationMode mode)
1178{
1179 setNullFlush(false);
1180 nullFlushGeneration = true;
1181
1182 while(!input.eof())
1183 {
1184 generation(input, output, mode);
1185 u_fputcu_fputc_72('\0', output);
1186 u_fflushu_fflush_72(output);
1187 }
1188}
1189
// Feeds the symbols returned by readTMAnalysis() through the transducer one at
// a time. While the state set stays alive the symbol is appended to the
// surface form `sf`. When it dies, the function either echoes the pending
// text unchanged or prints the last recorded lexical form as "[lf]" and
// rewinds input_buffer to where that form was recorded, then restarts from
// initial_state. Remaining blanks are flushed once the reader returns 0.
1190void
1191FSTProcessor::tm_analysis(InputFile& input, UFILE *output)
1192{
1193 State current_state = initial_state;
1194 UString lf; //lexical form
1195 UString sf; //surface form
 // input_buffer position captured whenever lf is updated; used for rewinding
1196 int last = 0;
1197
 // a zero return from readTMAnalysis() terminates the loop
1198 while(int32_t val = readTMAnalysis(input))
1199 {
1200 // test for final states
1201 if(current_state.isFinal(all_finals))
1202 {
 // a final state only counts when the next symbol is punctuation
1203 if(u_ispunctu_ispunct_72(val))
1204 {
 // substr(1) drops the first character of the filterFinalsTM() result
1205 lf = current_state.filterFinalsTM(all_finals, alphabet,
1206 escaped_chars,
1207 blankqueue, numbers).substr(1);
1208 last = input_buffer.getPos();
1209 numbers.clear();
1210 }
1211 }
1212 else if(sf.empty() && u_isspaceu_isspace_72(val))
1213 {
 // sf is empty in this branch, so the append leaves lf unchanged
1214 lf.append(sf);
1215 last = input_buffer.getPos();
1216 }
1217
1218 current_state.step_case(val, false);
1219
1220 if(current_state.size() != 0)
1221 {
 // still alive: record the symbol in the surface form
 // NOTE(review): -1 presumably marks a number token whose text is the
 // last entry of `numbers` - confirm against readTMAnalysis()
1222 if(val == -1)
1223 {
1224 sf.append(numbers[numbers.size()-1]);
1225 }
1226 else if(isLastBlankTM && val == ' ')
1227 {
1228 sf.append(blankqueue.back());
1229 }
1230 else
1231 {
1232 alphabet.getSymbol(sf, val);
1233 }
1234 }
1235 else
1236 {
 // no live states left: decide what to emit
1237 if((u_isspaceu_isspace_72(val) || u_ispunctu_ispunct_72(val)) && sf.empty())
1238 {
 // lone space/punctuation outside any match is copied to the output
1239 if(u_isspaceu_isspace_72(val))
1240 {
1241 printSpace(val, output);
1242 }
1243 else
1244 {
1245 if(isEscaped(val))
1246 {
1247 u_fputcu_fputc_72('\\', output);
1248 }
1249 u_fputcu_fputc_72(val, output);
1250 }
1251 }
1252 else if(!u_isspaceu_isspace_72(val) && !u_ispunctu_ispunct_72(val) &&
1253 ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) ||
1254 lf.empty()))
1255 {
1256
 // keep consuming until the next space/punctuation (or end of input)
 // and emit the collected text verbatim
1257 do
1258 {
1259 if(val == -1)
1260 {
1261 sf.append(numbers[numbers.size()-1]);
1262 }
1263 else if(isLastBlankTM && val == ' ')
1264 {
1265 sf.append(blankqueue.back());
1266 }
1267 else
1268 {
1269 alphabet.getSymbol(sf, val);
1270 }
1271 }
1272 while((val = readTMAnalysis(input)) && !u_isspaceu_isspace_72(val) && !u_ispunctu_ispunct_72(val));
1273
 // input exhausted: write what we have and leave without flushBlanks()
1274 if(val == 0)
1275 {
1276 write(sf, output);
1277 return;
1278 }
1279
 // step back one position so the delimiter is read again next iteration
1280 input_buffer.back(1);
1281 write(sf, output);
1282
 // drop queued blanks, keeping the last one when isLastBlankTM is set
1283 while(blankqueue.size() > 0)
1284 {
1285 if(blankqueue.size() == 1 && isLastBlankTM)
1286 {
1287 break;
1288 }
1289 blankqueue.pop();
1290 }
1291
1292/*
1293 unsigned int limit = sf.find(' ');
1294 unsigned int size = sf.size();
1295 limit = (limit == static_cast<unsigned int>(UString::npos)?size:limit);
1296 input_buffer.back(1+(size-limit));
1297 write(sf.substr(0, limit), output);
1298*/ }
1299 else if(lf.empty())
1300 {
1301/* unsigned int limit = sf.find(' ');
1302 unsigned int size = sf.size();
1303 limit = (limit == static_cast<unsigned int >(UString::npos)?size:limit);
1304 input_buffer.back(1+(size-limit));
1305 write(sf.substr(0, limit), output);
1306*/
 // no lexical form recorded: emit the surface form unchanged
1307 input_buffer.back(1);
1308 write(sf, output);
1309
1310 while(blankqueue.size() > 0)
1311 {
1312 if(blankqueue.size() == 1 && isLastBlankTM)
1313 {
1314 break;
1315 }
1316 blankqueue.pop();
1317 }
1318
1319 }
1320 else
1321 {
 // print the recorded lexical form in brackets and rewind to just
 // before the position where it was recorded
1322 u_fprintfu_fprintf_72(output, "[%S]", lf.c_str());
1323 input_buffer.setPos(last);
1324 input_buffer.back(1);
1325 }
1326
 // restart matching from scratch
1327 current_state = initial_state;
1328 lf.clear();
1329 sf.clear();
1330 }
1331 }
1332
1333 // print remaining blanks
1334 flushBlanks(output);
1335}
1336
1337
// Generation pass: skips to the first '^', then reads symbols with
// readGeneration() until it returns 0x7fffffff. Symbols are accumulated in
// `sf` while the transducer is stepped. When '$' arrives with outOfWord set,
// the word is written according to its first character ('*', '%', '@' or
// other), whether the current state is final, and `mode`; then the state and
// `sf` are reset.
1338void
1339FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode)
1340{
 // note: there is no return after the wrapper call, so execution continues
 // with the code below once the wrapper comes back
1341 if(getNullFlush())
1342 {
1343 generation_wrapper_null_flush(input, output, mode);
1344 }
1345
1346 State current_state = initial_state;
1347 UString sf;
1348
1349 outOfWord = false;
1350
1351 skipUntil(input, output, '^');
1352 int val;
1353
 // 0x7fffffff is the value readGeneration() uses to stop this loop
1354 while((val = readGeneration(input, output)) != 0x7fffffff)
1355 {
 // a '=' at the very start of a word is echoed and not added to sf
1356 if(sf.empty() && val == '=')
1357 {
1358 u_fputcu_fputc_72('=', output);
1359 val = readGeneration(input, output);
1360 }
1361
1362 if(val == '$' && outOfWord)
1363 {
 // words whose first character is '*' or '%' never consult the transducer
1364 if(sf[0] == '*' || sf[0] == '%')
1365 {
1366 if(mode != gm_clean && mode != gm_tagged_nm)
1367 {
1368 writeEscaped(sf, output);
1369 }
1370 else if (mode == gm_clean)
1371 {
 // clean mode drops the leading marker character
1372 writeEscaped(sf.substr(1), output);
1373 }
1374 else if(mode == gm_tagged_nm)
1375 {
 // ^<form without marker and tags>/<full form>$
1376 u_fputcu_fputc_72('^', output);
1377 writeEscaped(removeTags(sf.substr(1)), output);
1378 u_fputcu_fputc_72('/', output);
1379 writeEscapedWithTags(sf, output);
1380 u_fputcu_fputc_72('$', output);
1381 }
1382 }
 // words whose first character is '@'
1383 else if(sf[0] == '@')
1384 {
1385 if(mode == gm_all)
1386 {
1387 writeEscaped(sf, output);
1388 }
1389 else if(mode == gm_clean)
1390 {
1391 writeEscaped(removeTags(sf.substr(1)), output);
1392 }
1393 else if(mode == gm_unknown)
1394 {
1395 writeEscaped(removeTags(sf), output);
1396 }
1397 else if(mode == gm_tagged)
1398 {
1399 writeEscaped(removeTags(sf), output);
1400 }
1401 else if(mode == gm_tagged_nm)
1402 {
1403 u_fputcu_fputc_72('^', output);
1404 writeEscaped(removeTags(sf.substr(1)), output);
1405 u_fputcu_fputc_72('/', output);
1406 writeEscapedWithTags(sf, output);
1407 u_fputcu_fputc_72('$', output);
1408 }
1409 }
 // the transducer accepted the word: write the result of filterFinals()
1410 else if(current_state.isFinal(all_finals))
1411 {
 // case flags are taken from the input unless dictionaryCase is set
1412 bool firstupper = false, uppercase = false;
1413 if(!dictionaryCase)
1414 {
1415 uppercase = sf.size() > 1 && u_isupperu_isupper_72(sf[1]);
1416 firstupper= u_isupperu_isupper_72(sf[0]);
1417 }
1418
1419 if(mode == gm_tagged || mode == gm_tagged_nm)
1420 {
1421 u_fputcu_fputc_72('^', output);
1422 }
1423
 // substr(1) drops the first character of the filterFinals() result
1424 write(current_state.filterFinals(all_finals, alphabet,
1425 escaped_chars,
1426 displayWeightsMode, maxAnalyses, maxWeightClasses,
1427 uppercase, firstupper).substr(1), output);
1428 if(mode == gm_tagged || mode == gm_tagged_nm)
1429 {
1430 u_fputcu_fputc_72('/', output);
1431 writeEscapedWithTags(sf, output);
1432 u_fputcu_fputc_72('$', output);
1433 }
1434
1435 }
 // not accepted: output depends on the mode, mostly prefixed with '#'
1436 else
1437 {
1438 if(mode == gm_all)
1439 {
1440 u_fputcu_fputc_72('#', output);
1441 writeEscaped(sf, output);
1442 }
1443 else if(mode == gm_clean)
1444 {
1445 writeEscaped(removeTags(sf), output);
1446 }
1447 else if(mode == gm_unknown)
1448 {
1449 if(!sf.empty())
1450 {
1451 u_fputcu_fputc_72('#', output);
1452 writeEscaped(removeTags(sf), output);
1453 }
1454 }
1455 else if(mode == gm_tagged)
1456 {
1457 u_fputcu_fputc_72('#', output);
1458 writeEscaped(removeTags(sf), output);
1459 }
1460 else if(mode == gm_tagged_nm)
1461 {
1462 u_fputcu_fputc_72('^', output);
1463 writeEscaped(removeTags(sf), output);
1464 u_fputcu_fputc_72('/', output);
1465 u_fputcu_fputc_72('#', output);
1466 writeEscapedWithTags(sf, output);
1467 u_fputcu_fputc_72('$', output);
1468 }
1469 }
1470
 // start over for the next word
1471 current_state = initial_state;
1472 sf.clear();
1473 }
1474 else if(u_isspaceu_isspace_72(val) && sf.size() == 0)
1475 {
1476 // do nothing
1477 }
1478 else if(sf.size() > 0 && (sf[0] == '*' || sf[0] == '%' ))
1479 {
 // '*' / '%' words are only accumulated; the transducer is not stepped
1480 alphabet.getSymbol(sf, val);
1481 }
1482 else
1483 {
1484 alphabet.getSymbol(sf,val);
 // only step while some state is still alive
1485 if(current_state.size() > 0)
1486 {
 // uppercase non-tag symbols also try their lowercase form unless
 // beCaseSensitive() says otherwise
1487 if(!alphabet.isTag(val) && u_isupperu_isupper_72(val) && !(beCaseSensitive(current_state)))
1488 {
1489 if(mode == gm_carefulcase)
1490 {
1491 current_state.step_careful(val, u_toloweru_tolower_72(val));
1492 }
1493 else
1494 {
1495 current_state.step(val, u_toloweru_tolower_72(val));
1496 }
1497 }
1498 else
1499 {
1500 current_state.step(val);
1501 }
1502 }
1503 }
1504 }
1505}
1506
1507void
1508FSTProcessor::postgeneration(InputFile& input, UFILE *output)
1509{
1510 transliteration_drop_tilde = true;
1511 transliteration(input, output);
1512}
1513
1514void
1515FSTProcessor::intergeneration(InputFile& input, UFILE *output)
1516{
1517 transliteration_drop_tilde = false;
1518 transliteration(input, output);
1519}
1520
1521void
1522FSTProcessor::transliteration(InputFile& input, UFILE *output)
1523{
1524 size_t start_pos = 0;
1525 size_t cur_word = 0;
1526 size_t cur_pos = 0;
1527 size_t match_pos = 0;
1528 State current_state = initial_state;
1529 UString last_match;
1530 int space_diff = 0;
1531
1532 bool firstupper = false;
1533 bool uppercase = false;
1534 bool have_first = false;
1535 bool have_second = false;
1536
1537 while (true) {
1538 if (transliteration_queue.empty()) {
1539 if (!blankqueue.empty()) {
1540 flushBlanks(output);
1541 }
1542 if (!readTransliterationWord(input)) {
1543 flushBlanks(output);
1544 if (input.eof()) {
1545 break;
1546 } else {
1547 u_fputcu_fputc_72(input.get(), output);
1548 u_fflushu_fflush_72(output);
1549 continue;
1550 }
1551 }
1552 }
1553
1554 if (current_state.isFinal(all_finals)) {
1555 last_match = current_state.filterFinals(all_finals, alphabet,
1556 escaped_chars, displayWeightsMode,
1557 1, maxWeightClasses,
1558 uppercase, firstupper);
1559 while (cur_word > 0) {
1560 if (cur_word == 1) {
1561 if (cur_pos == 0 && last_match[last_match.size()-1] == ' ') {
1562 match_pos = transliteration_queue.front().size();
1563 last_match = last_match.substr(0, last_match.size()-1);
1564 break;
1565 } else {
1566 cur_pos += transliteration_queue.front().size() + 1;
1567 }
1568 }
1569 std::vector<int32_t> word = transliteration_queue.front();
1570 transliteration_queue.pop_front();
1571 word.push_back(static_cast<int32_t>(' '));
1572 word.insert(word.end(), transliteration_queue.front().begin(),
1573 transliteration_queue.front().end());
1574 transliteration_queue.pop_front();
1575 transliteration_queue.push_front(word);
1576 UString wblank = wblankqueue.front();
1577 wblankqueue.pop_front();
1578 wblank = StringUtils::merge_wblanks(wblank, wblankqueue.front());
1579 wblankqueue.pop_front();
1580 wblankqueue.push_front(wblank);
1581 cur_word--;
1582 }
1583 if (cur_word == 0) {
1584 match_pos = cur_pos;
1585 }
1586 }
1587
1588 int32_t sym = 0;
1589 bool is_end = false;
1590 if (cur_pos < transliteration_queue[cur_word].size()) {
1591 sym = transliteration_queue[cur_word][cur_pos];
1592 cur_pos++;
1593 } else {
1594 if (cur_word + 1 == transliteration_queue.size() &&
1595 !readTransliterationWord(input)) {
1596 is_end = true;
1597 } else {
1598 sym = static_cast<int32_t>(' ');
1599 cur_word++;
1600 cur_pos = 0;
1601 }
1602 }
1603
1604 if (isAlphabetic(sym)) {
1605 if (!have_first) {
1606 have_first = true;
1607 if (u_isupperu_isupper_72(sym)) {
1608 firstupper = true;
1609 } else {
1610 firstupper = false;
1611 have_second = true;
1612 }
1613 } else if (!have_second) {
1614 have_second = true;
1615 uppercase = u_isupperu_isupper_72(sym);
1616 }
1617 }
1618
1619 current_state.step_case_override(sym, beCaseSensitive(current_state));
1620
1621 if (current_state.size() == 0 || is_end) {
1622 if (last_match.empty()) {
1623 start_pos++;
1624 } else {
1625 std::vector<int32_t> match = alphabet.tokenize(last_match.substr(1));
1626 last_match.clear();
1627 std::vector<int32_t> word = transliteration_queue.front();
1628 transliteration_queue.pop_front();
1629 size_t i = 0;
1630 for (; i < match.size() && i < match_pos - start_pos; i++) {
1631 if (match[match.size()-i-1] != word[match_pos-i-1]) {
1632 break;
1633 }
1634 }
1635 std::vector<int32_t> new_word;
1636 new_word.insert(new_word.end(), word.begin(), word.begin()+start_pos);
1637 new_word.insert(new_word.end(), match.begin(), match.end());
1638 new_word.insert(new_word.end(), word.begin()+match_pos, word.end());
1639 transliteration_queue.push_front(new_word);
1640 int sf_spaces = 0;
1641 int lf_spaces = 0;
1642 for (auto c : word) {
1643 if (c == static_cast<int32_t>(' ')) sf_spaces++;
1644 }
1645 for (auto c : new_word) {
1646 if (c == static_cast<int32_t>(' ')) lf_spaces++;
1647 }
1648 space_diff += (lf_spaces - sf_spaces);
1649 size_t last_start = start_pos;
1650 start_pos = match_pos - i;
1651 if (start_pos == last_start) start_pos++;
1652 cur_pos = start_pos;
Value stored to 'cur_pos' is never read
1653 cur_word = 0;
1654 }
1655 if (start_pos >= transliteration_queue.front().size()) {
1656 write(blankqueue.front(), output);
1657 blankqueue.pop();
1658 bool has_wblank = !wblankqueue.front().empty();
1659 write(wblankqueue.front(), output);
1660 wblankqueue.pop_front();
1661 auto word = transliteration_queue.front();
1662 transliteration_queue.pop_front();
1663 int space_count = 0;
1664 for (auto c : word) {
1665 if (c == static_cast<int32_t>(' ')) space_count++;
1666 }
1667 int space_out = 0;
1668 UString out;
1669 for (auto c : word) {
1670 if (c == ' ') {
1671 if (space_out + space_diff >= space_count) {
1672 out += ' ';
1673 } else {
1674 out += blankqueue.front();
1675 blankqueue.pop();
1676 }
1677 space_out++;
1678 } else if (transliteration_drop_tilde &&
1679 c == static_cast<int32_t>('~')) {
1680 } else {
1681 if (c > 0 && isEscaped(c)) {
1682 out += '\\';
1683 }
1684 alphabet.getSymbol(out, c);
1685 }
1686 }
1687 write(out, output);
1688 if (has_wblank) {
1689 write(WBLANK_FINAL, output);
1690 }
1691 while (space_diff < 0) {
1692 if (blankqueue.front() != " "_u) {
1693 write(blankqueue.front(), output);
1694 }
1695 blankqueue.pop();
1696 space_diff++;
1697 }
1698 space_diff = 0;
1699 start_pos = 0;
1700 }
1701 match_pos = 0;
1702 cur_pos = start_pos;
1703 cur_word = 0;
1704 uppercase = false;
1705 firstupper = false;
1706 have_first = false;
1707 have_second = false;
1708 current_state = initial_state;
1709 }
1710 }
1711}
1712
1713bool
1714FSTProcessor::step_biltrans(UStringView word, UString& result, UString& queue,
1715 bool delim, bool mark)
1716{
1717 State current_state = initial_state;
1718 bool firstupper = u_isupperu_isupper_72(word[0]);
1719 bool uppercase = firstupper && u_isupperu_isupper_72(word[1]);
1720 for (auto symbol : symbol_iter(word)) {
1721 int32_t val = (symbol.size() == 1 ? symbol[0] : alphabet(symbol));
1722 if (current_state.size() != 0) {
1723 current_state.step(val, beCaseSensitive(current_state));
1724 }
1725 if (current_state.isFinal(all_finals)) {
1726 result.clear();
1727 if (delim) result += '^';
1728 if (mark) result += '=';
1729 result += current_state.filterFinals(all_finals, alphabet,
1730 escaped_chars,
1731 displayWeightsMode, maxAnalyses, maxWeightClasses,
1732 uppercase, firstupper, 0).substr(1);
1733 }
1734 if (current_state.size() == 0) {
1735 if (!result.empty()) queue.append(symbol);
1736 else return false;
1737 }
1738 }
1739 return !result.empty();
1740}
1741
1742UString
1743FSTProcessor::biltransfull(UStringView input_word, bool with_delim)
1744{
1745 UString result;
1746 unsigned int start_point = 1;
1747 unsigned int end_point = input_word.size()-2;
1748 UString queue;
1749 bool mark = false;
1750
1751 if(with_delim == false)
1752 {
1753 start_point = 0;
1754 end_point = input_word.size()-1;
1755 }
1756
1757 if(input_word[start_point] == '*')
1758 {
1759 return US(input_word);
1760 }
1761
1762 if(input_word[start_point] == '=')
1763 {
1764 start_point++;
1765 mark = true;
1766 }
1767
1768 auto word = input_word.substr(start_point, end_point-start_point);
1769 bool exists = step_biltrans(word, result, queue, with_delim, mark);
1770 if (!exists) {
1771 if (with_delim) return "^@"_u + US(input_word.substr(1));
1772 else return "@"_u + US(input_word);
1773 }
1774
1775 if(start_point < (end_point - 3))
1776 {
1777 return "^$"_u;
1778 }
1779 // attach unmatched queue automatically
1780
1781 if(!queue.empty())
1782 {
1783 UString result_with_queue = compose(result, queue);
1784 if(with_delim)
1785 {
1786 result_with_queue += '$';
1787 }
1788 return result_with_queue;
1789 }
1790 else
1791 {
1792 if(with_delim)
1793 {
1794 result += '$';
1795 }
1796 return result;
1797 }
1798}
1799
1800
1801
1802UString
1803FSTProcessor::biltrans(UStringView input_word, bool with_delim)
1804{
1805 State current_state = initial_state;
1806 UString result;
1807 unsigned int start_point = 1;
1808 unsigned int end_point = input_word.size()-2;
1809 UString queue;
1810 bool mark = false;
1811
1812 if(with_delim == false)
1813 {
1814 start_point = 0;
1815 end_point = input_word.size()-1;
1816 }
1817
1818 if(input_word[start_point] == '*')
1819 {
1820 return US(input_word);
1821 }
1822
1823 if(input_word[start_point] == '=')
1824 {
1825 start_point++;
1826 mark = true;
1827 }
1828
1829 UStringView word = input_word.substr(start_point, end_point-start_point);
1830 bool exists = step_biltrans(word, result, queue, with_delim, mark);
1831 if (!exists) {
1832 if (with_delim) return "^@"_u + US(input_word.substr(1));
1833 else return "@"_u + US(input_word);
1834 }
1835
1836 // attach unmatched queue automatically
1837
1838 if(!queue.empty())
1839 {
1840 UString result_with_queue = compose(result, queue);
1841 if(with_delim)
1842 {
1843 result_with_queue += '$';
1844 }
1845 return result_with_queue;
1846 }
1847 else
1848 {
1849 if(with_delim)
1850 {
1851 result += '$';
1852 }
1853 return result;
1854 }
1855}
1856
1857UString
1858FSTProcessor::compose(UStringView lexforms, UStringView queue) const
1859{
1860 UString result;
1861 result.reserve(lexforms.size() + 2 * queue.size());
1862 result += '/';
1863
1864 for(unsigned int i = 1; i< lexforms.size(); i++)
1865 {
1866 if(lexforms[i] == '\\')
1867 {
1868 result += '\\';
1869 i++;
1870 }
1871 else if(lexforms[i] == '/')
1872 {
1873 result.append(queue);
1874 }
1875 result += lexforms[i];
1876 }
1877
1878 result += queue;
1879 return result;
1880}
1881
1882void
1883FSTProcessor::skipToNextWord(InputFile& input, UFILE* output)
1884{
1885 int blank_depth = 0;
1886
1887 while (!input.eof()) {
1888 UChar32 c = input.get();
1889
1890 switch (c) {
1891 case '^':
1892 if (blank_depth == 0) {
1893 input.unget(c);
1894 return;
1895 } else {
1896 u_fputcu_fputc_72(c, output);
1897 }
1898 break;
1899 case '\\':
1900 u_fputcu_fputc_72(c, output);
1901 c = input.get();
1902 u_fputcu_fputc_72(c, output);
1903 break;
1904 case '\0':
1905 u_fputcu_fputc_72(c, output);
1906 u_fflushu_fflush_72(output);
1907 break;
1908 case U_EOF0xFFFF:
1909 break;
1910 case '[':
1911 blank_depth++;
1912 u_fputcu_fputc_72(c, output);
1913 break;
1914 case ']':
1915 if (blank_depth > 0) blank_depth--;
1916 u_fputcu_fputc_72(c, output);
1917 break;
1918 default:
1919 u_fputcu_fputc_72(c, output);
1920 }
1921 }
1922}
1923
1924UChar32
1925FSTProcessor::skipReading(InputFile& input, UFILE* output)
1926{
1927 UChar32 c = U_EOF0xFFFF;
1928 while (!input.eof()) {
1929 c = input.get();
1930 if (output != nullptr) {
1931 switch (c) {
1932 case '\\':
1933 u_fputcu_fputc_72(c, output);
1934 u_fputcu_fputc_72(input.get(), output);
1935 break;
1936 case '<':
1937 write(input.readBlock('<', '>'), output);
1938 break;
1939 case '/':
1940 case '$':
1941 u_fputcu_fputc_72(c, output);
1942 break;
1943 default:
1944 if (isEscaped(c)) u_fputcu_fputc_72('\\', output);
1945 u_fputcu_fputc_72(c, output);
1946 }
1947 } else {
1948 switch (c) {
1949 case '\\':
1950 input.get();
1951 break;
1952 case '<':
1953 input.readBlock('<', '>');
1954 break;
1955 }
1956 }
1957 if (c == '/' || c == '$' || c == '\0') break;
1958 }
1959 return c;
1960}
1961
// Reads the next word to translate into `symbols`.
// Text before the word is copied to `output` by skipToNextWord(), and the
// opening character is echoed. If either surface-form option is set, one
// field is skipped first (echoed only when biltransSurfaceFormsKeep is set).
// The following field is then read symbol by symbol, and any further
// '/'-separated fields are discarded.
// When the word begins with '*' or reading stopped on '\0', the text is
// written out directly and the function calls itself to fetch the next word.
// `symbols` is left empty when the input ends before a word is found.
1962void
1963FSTProcessor::nextBilingualWord(InputFile& input, UFILE* output,
1964 std::vector<int32_t>& symbols,
1965 GenerationMode mode)
1966{
1967 symbols.clear();
1968
1969 skipToNextWord(input, output);
1970
1971 if (input.eof()) return;
1972
1973 u_fputcu_fputc_72(input.get(), output); // ^
1974
 // defaults to '/' so that the check below passes when nothing is skipped
1975 UChar32 c = '/';
1976
1977 if (biltransSurfaceFormsKeep) {
1978 c = skipReading(input, output);
1979 } else if (biltransSurfaceForms) {
1980 c = skipReading(input, nullptr);
1981 }
 // the skipped field did not end in '/': give up on this word, try the next
1982 if (c != '/') {
1983 nextBilingualWord(input, output, symbols, mode);
1984 return;
1985 }
1986
1987 bool unknown = false;
1988
 // a leading '*' is consumed and remembered rather than stored in symbols
1989 if (input.peek() == '*') {
1990 input.get();
1991 unknown = true;
1992 }
1993
 // collect symbols up to the next '\0', '/' or '$'
1994 while (!input.eof()) {
1995 c = input.get();
1996 switch (c) {
1997 case '\\':
 // the character after a backslash is stored as-is
1998 symbols.push_back(input.get());
1999 break;
2000 case '\0':
2001 case '/':
2002 case '$':
2003 break;
2004 case '<':
2005 {
 // a <...> block is registered in the alphabet and stored as its code
2006 UString symbol = input.readBlock('<', '>');
2007 alphabet.includeSymbol(symbol);
2008 symbols.push_back(alphabet(symbol));
2009 }
2010 break;
2011 default:
2012 symbols.push_back(c);
2013 }
2014 if (c == '\0' || c == '/' || c == '$') break;
2015 }
2016
 // discard any remaining '/'-separated fields without echoing them
2017 while (c == '/') c = skipReading(input, nullptr);
2018
2019 if (c == '\0' || unknown) {
 // rebuild the text of the symbols, re-adding escapes
2020 UString in_str;
2021 for (auto& s : symbols) {
2022 if (isEscaped(s)) in_str += '\\';
2023 alphabet.getSymbol(in_str, s);
2024 }
2025 symbols.clear();
2026 if (c == '\0') {
2027 write(in_str, output);
2028 u_fflushu_fflush_72(output);
2029 } else {
 // writes "*text/*text$"; the second '*' is left out in gm_clean mode
2030 u_fputcu_fputc_72('*', output);
2031 write(in_str, output);
2032 u_fputcu_fputc_72('/', output);
2033 if (mode != gm_clean) u_fputcu_fputc_72('*', output);
2034 write(in_str, output);
2035 u_fputcu_fputc_72('$', output);
2036 }
 // this word is already handled: move on to the next one
2037 nextBilingualWord(input, output, symbols, mode);
2038 return;
2039 }
2040}
2041
2042void
2043FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
2044{
2045 std::vector<int32_t> symbols;
2046 while (!input.eof()) {
2047 nextBilingualWord(input, output, symbols, mode);
2048 if (symbols.empty()) continue;
2049
2050 State current_state = initial_state;
2051
2052 bool firstupper = (symbols[0] > 0 && u_isupperu_isupper_72(symbols[0]));
2053 bool uppercase = (firstupper && symbols.size() > 1 &&
2054 symbols[1] > 0 && u_isupperu_isupper_72(symbols[1]));
2055
2056 bool seenTags = false;
2057 size_t queue_start = 0;
2058 UString result;
2059 for (size_t i = 0; i < symbols.size(); i++) {
2060 seenTags = seenTags || alphabet.isTag(symbols[i]);
2061 current_state.step_case(symbols[i], beCaseSensitive(current_state));
2062 if (current_state.isFinal(all_finals)) {
2063 queue_start = i;
2064 result = current_state.filterFinals(all_finals, alphabet, escaped_chars,
2065 displayWeightsMode, maxAnalyses,
2066 maxWeightClasses, uppercase,
2067 firstupper, 0);
2068 }
2069 }
2070 // if there are no tags, we only return complete matches
2071 if (!seenTags && queue_start + 1 < symbols.size()) result.clear();
2072
2073 UString source;
2074 size_t queue_pos = 0;
2075 for (size_t i = 0; i < symbols.size(); i++) {
2076 if (isEscaped(symbols[i]) || (i == 0 && symbols[i] == '*')) source += '\\';
2077 alphabet.getSymbol(source, symbols[i]);
2078 if (i == queue_start) queue_pos = source.size();
2079 }
2080
2081 write(source, output);
2082
2083 if (!result.empty()) {
2084 write(compose(result, source.substr(queue_pos)), output);
2085 } else {
2086 u_fputcu_fputc_72('/', output);
2087 u_fputcu_fputc_72((mode == gm_all ? '#' : '@'), output);
2088 write(source, output);
2089 }
2090 u_fputcu_fputc_72('$', output);
2091 }
2092}
2093
2094std::pair<UString, int>
2095FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim)
2096{
2097 State current_state = initial_state;
2098 UString result;
2099 unsigned int start_point = 1;
2100 unsigned int end_point = input_word.size()-2;
2101 UString queue;
2102 bool mark = false;
2103 bool seentags = false; // have we seen any tags at all in the analysis?
2104
2105 if(with_delim == false)
2106 {
2107 start_point = 0;
2108 end_point = input_word.size()-1;
2109 }
2110
2111 if(input_word[start_point] == '*')
2112 {
2113 return {US(input_word), 0};
2114 }
2115
2116 if(input_word[start_point] == '=')
2117 {
2118 start_point++;
2119 mark = true;
2120 }
2121
2122 bool firstupper = u_isupperu_isupper_72(input_word[start_point]);
2123 bool uppercase = firstupper && u_isupperu_isupper_72(input_word[start_point+1]);
2124
2125 UStringView word = input_word.substr(start_point, end_point-start_point);
2126 for (auto symbol : symbol_iter(word)) {
2127 int32_t val;
2128 if (symbol.size() == 1) {
2129 val = symbol[0];
2130 } else {
2131 val = alphabet(symbol);
2132 seentags = true;
2133 }
2134 if(current_state.size() != 0)
2135 {
2136 current_state.step_case(val, beCaseSensitive(current_state));
2137 }
2138 if(current_state.isFinal(all_finals))
2139 {
2140 result.clear();
2141 if (with_delim) {
2142 result += '^';
2143 }
2144 if (mark) {
2145 result += '=';
2146 }
2147 result += current_state.filterFinals(all_finals, alphabet,
2148 escaped_chars,
2149 displayWeightsMode, maxAnalyses, maxWeightClasses,
2150 uppercase, firstupper, 0).substr(1);
2151 }
2152
2153 if(current_state.size() == 0)
2154 {
2155 if(!symbol.empty() && !result.empty())
2156 {
2157 queue.append(symbol);
2158 }
2159 else
2160 {
2161 // word is not present
2162 if(with_delim)
2163 {
2164 result = "^@"_u + US(input_word.substr(1));
2165 }
2166 else
2167 {
2168 result = "@"_u + US(input_word);
2169 }
2170 return std::pair<UString, int>(result, 0);
2171 }
2172 }
2173 }
2174
2175 if (!seentags
2176 && current_state.filterFinals(all_finals, alphabet, escaped_chars,
2177 displayWeightsMode, maxAnalyses, maxWeightClasses,
2178 uppercase, firstupper, 0).empty())
2179 {
2180 // word is not present
2181 if(with_delim)
2182 {
2183 result = "^@"_u + US(input_word.substr(1));
2184 }
2185 else
2186 {
2187 result = "@"_u + US(input_word);
2188 }
2189 return {result, 0};
2190 }
2191
2192
2193
2194 // attach unmatched queue automatically
2195
2196 if(!queue.empty())
2197 {
2198 UString result_with_queue = compose(result, queue);
2199 if(with_delim)
2200 {
2201 result_with_queue += '$';
2202 }
2203 return {result_with_queue, queue.size()};
2204 }
2205 else
2206 {
2207 if(with_delim)
2208 {
2209 result += '$';
2210 }
2211 return {result, 0};
2212 }
2213}
2214
2215UString
2216FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim)
2217{
2218 State current_state = initial_state;
2219 UString result;
2220 unsigned int start_point = 1;
2221 unsigned int end_point = input_word.size()-2;
2222 bool mark = false;
2223
2224 if(with_delim == false)
2225 {
2226 start_point = 0;
2227 end_point = input_word.size()-1;
2228 }
2229
2230 if(input_word[start_point] == '*')
2231 {
2232 return US(input_word);
2233 }
2234
2235 if(input_word[start_point] == '=')
2236 {
2237 start_point++;
2238 mark = true;
2239 }
2240
2241 auto word = input_word.substr(start_point, end_point-start_point);
2242 UString queue;
2243 bool exists = step_biltrans(word, result, queue, with_delim, mark);
2244 if (!exists || !queue.empty()) {
2245 if (with_delim) return "^@"_u + US(input_word.substr(1));
2246 else return "@"_u + US(input_word);
2247 }
2248
2249 if(with_delim)
2250 {
2251 result += '$';
2252 }
2253 return result;
2254}
2255
2256
2257bool
2258FSTProcessor::valid() const
2259{
2260 if(initial_state.isFinal(all_finals))
2261 {
2262 std::cerr << "Error: Invalid dictionary (hint: the left side of an entry is empty)" << std::endl;
2263 return false;
2264 }
2265 else
2266 {
2267 State s = initial_state;
2268 s.step(' ');
2269 if(s.size() != 0)
2270 {
2271 std::cerr << "Error: Invalid dictionary (hint: entry beginning with whitespace)" << std::endl;
2272 return false;
2273 }
2274 }
2275
2276 return true;
2277}
2278
2279int
2280FSTProcessor::readSAO(InputFile& input)
2281{
2282 if(!input_buffer.isEmpty())
2283 {
2284 return input_buffer.next();
2285 }
2286
2287 UChar32 val = input.get();
2288 if(input.eof())
2289 {
2290 return 0;
2291 }
2292
2293 if(escaped_chars.find(val) != escaped_chars.end())
2294 {
2295 if(val == '<')
2296 {
2297 UString str = input.readBlock('<', '>');
2298 if(StringUtils::startswith(str, u"<![CDATA["))
2299 {
2300 while(!StringUtils::endswith(str, u"]]>"))
2301 {
2302 str.append(input.readBlock('<', '>').substr(1));
2303 }
2304 blankqueue.push(str);
2305 input_buffer.add(static_cast<int32_t>(' '));
2306 return static_cast<int32_t>(' ');
2307 }
2308 else
2309 {
2310 streamError();
2311 }
2312 }
2313 else if (val == '\\') {
2314 val = input.get();
2315 if(isEscaped(val))
2316 {
2317 input_buffer.add(val);
2318 return static_cast<int32_t>(val);
2319 }
2320 else
2321 streamError();
2322 }
2323 else
2324 {
2325 streamError();
2326 }
2327 }
2328
2329 input_buffer.add(static_cast<int32_t>(val));
2330 return static_cast<int32_t>(val);
2331}
2332
2333void
2334FSTProcessor::printSAOWord(UStringView lf, UFILE *output)
2335{
2336 for(unsigned int i = 1, limit = lf.size(); i != limit; i++)
2337 {
2338 if(lf[i] == '/')
2339 {
2340 break;
2341 }
2342 u_fputcu_fputc_72(lf[i], output);
2343 }
2344}
2345
// Processes the symbols returned by readSAO(). While the transducer stays
// alive the symbol is appended to the surface form `sf`; when a final state
// is seen, the result of filterFinalsSAO() is remembered in `lf` together
// with the buffer position. When the transducer dies, the remembered form is
// printed with printSAOWord() and the buffer is rewound, or the unmatched
// text is printed inside <d>...</d>. Remaining blanks are flushed at the end.
2346void
2347FSTProcessor::SAO(InputFile& input, UFILE *output)
2348{
 // which kind of final state produced the current lf
2349 bool last_incond = false;
2350 bool last_postblank = false;
2351 State current_state = initial_state;
2352 UString lf;
2353 UString sf;
 // input_buffer position captured whenever lf is updated; used for rewinding
2354 int last = 0;
2355
 // this mode replaces the escaped character set with just \ < >
2356 escaped_chars.clear();
2357 escaped_chars.insert('\\');
2358 escaped_chars.insert('<');
2359 escaped_chars.insert('>');
2360
 // a zero return from readSAO() terminates the loop
2361 while(UChar32 val = readSAO(input))
2362 {
2363 // test for final states
2364 if(current_state.isFinal(all_finals))
2365 {
 // the three branches compute lf the same way and differ only in the
 // flags they set; the last one additionally requires a non-alphabetic
 // next symbol
2366 if(current_state.isFinal(inconditional))
2367 {
 // case flags come from the first and the last character of sf
2368 bool firstupper = u_isupperu_isupper_72(sf[0]);
2369 bool uppercase = firstupper && u_isupperu_isupper_72(sf[sf.size()-1]);
2370
2371 lf = current_state.filterFinalsSAO(all_finals, alphabet,
2372 escaped_chars,
2373 uppercase, firstupper);
2374 last_incond = true;
2375 last = input_buffer.getPos();
2376 }
2377 else if(current_state.isFinal(postblank))
2378 {
2379 bool firstupper = u_isupperu_isupper_72(sf[0]);
2380 bool uppercase = firstupper && u_isupperu_isupper_72(sf[sf.size()-1]);
2381
2382 lf = current_state.filterFinalsSAO(all_finals, alphabet,
2383 escaped_chars,
2384 uppercase, firstupper);
2385 last_postblank = true;
2386 last = input_buffer.getPos();
2387 }
2388 else if(!isAlphabetic(val))
2389 {
2390 bool firstupper = u_isupperu_isupper_72(sf[0]);
2391 bool uppercase = firstupper && u_isupperu_isupper_72(sf[sf.size()-1]);
2392
2393 lf = current_state.filterFinalsSAO(all_finals, alphabet,
2394 escaped_chars,
2395 uppercase, firstupper);
2396 last_postblank = false;
2397 last_incond = false;
2398 last = input_buffer.getPos();
2399 }
2400 }
2401 else if(sf.empty() && u_isspaceu_isspace_72(val))
2402 {
 // sf is empty in this branch, so lf ends up as just "/*"
2403 lf = "/*"_u;
2404 lf.append(sf);
2405 last_postblank = false;
2406 last_incond = false;
2407 last = input_buffer.getPos();
2408 }
2409
2410 current_state.step_case(val, beCaseSensitive(current_state));
2411
2412 if(current_state.size() != 0)
2413 {
 // still alive: record the symbol in the surface form
2414 alphabet.getSymbol(sf, val);
2415 }
2416 else
2417 {
 // no live states left: decide what to emit
2418 if(!isAlphabetic(val) && sf.empty())
2419 {
 // lone non-alphabetic symbol outside any match is copied to the output
2420 if(u_isspaceu_isspace_72(val))
2421 {
2422 printSpace(val, output);
2423 }
2424 else
2425 {
2426 if(isEscaped(val))
2427 {
2428 u_fputcu_fputc_72('\\', output);
2429 }
2430 u_fputcu_fputc_72(val, output);
2431 }
2432 }
2433 else if(last_incond)
2434 {
 // print the remembered form and rewind to just before where it was
 // recorded
2435 printSAOWord(lf, output);
2436 input_buffer.setPos(last);
2437 input_buffer.back(1);
2438 }
2439 else if(last_postblank)
2440 {
 // same as above, with a space written after the form
2441 printSAOWord(lf, output);
2442 u_fputcu_fputc_72(' ', output);
2443 input_buffer.setPos(last);
2444 input_buffer.back(1);
2445 }
2446 else if(isAlphabetic(val) &&
2447 ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) ||
2448 lf.empty()))
2449 {
 // consume the rest of the alphabetic run, then print sf inside <d>
 // tags and rewind past whatever follows its first non-alphabetic
 // character
2450 do
2451 {
2452 alphabet.getSymbol(sf, val);
2453 }
2454 while((val = readSAO(input)) && isAlphabetic(val));
2455
2456 auto limit = firstNotAlpha(sf);
2457 unsigned int size = sf.size(); // TODO: change these to character counts
2458 input_buffer.back(1+(size-limit.i_utf16));
2459 u_fprintfu_fprintf_72(output, "<d>%S</d>", sf.c_str());
2460 }
2461 else if(lf.empty())
2462 {
 // nothing remembered: print sf inside <d> tags and rewind as above
2463 auto limit = firstNotAlpha(sf);
2464 unsigned int size = sf.size(); // TODO: change these to character counts
2465 input_buffer.back(1+(size-limit.i_utf16));
2466 u_fprintfu_fprintf_72(output, "<d>%S</d>", sf.c_str());
2467 }
2468 else
2469 {
2470 printSAOWord(lf, output);
2471 input_buffer.setPos(last);
2472 input_buffer.back(1);
2473 }
2474
 // restart matching from scratch
2475 current_state = initial_state;
2476 lf.clear();
2477 sf.clear();
2478 last_incond = false;
2479 last_postblank = false;
2480 }
2481 }
2482
2483 // print remaining blanks
2484 flushBlanks(output);
2485}
2486
2487UStringView
2488FSTProcessor::removeTags(UStringView str)
2489{
2490 for(unsigned int i = 0; i < str.size(); i++)
2491 {
2492 if(str[i] == '<' && i >=1 && str[i-1] != '\\')
2493 {
2494 return str.substr(0, i);
2495 }
2496 }
2497
2498 return str;
2499}
2500
2501
2502void
2503FSTProcessor::setBiltransSurfaceForms(bool value)
2504{
2505 biltransSurfaceForms = value;
2506}
2507
2508void
2509FSTProcessor::setBiltransSurfaceFormsKeep(bool value)
2510{
2511 biltransSurfaceFormsKeep = value;
2512}
2513
2514void
2515FSTProcessor::setCaseSensitiveMode(bool value)
2516{
2517 caseSensitive = value;
2518}
2519
2520void
2521FSTProcessor::setDictionaryCaseMode(bool value)
2522{
2523 dictionaryCase = value;
2524}
2525
2526void
2527FSTProcessor::setNullFlush(bool value)
2528{
2529 nullFlush = value;
2530}
2531
2532void
2533FSTProcessor::setIgnoredChars(bool value)
2534{
2535 useIgnoredChars = value;
2536}
2537
2538void
2539FSTProcessor::setRestoreChars(bool value)
2540{
2541 useRestoreChars = value;
2542}
2543
2544void
2545FSTProcessor::setUseDefaultIgnoredChars(bool value)
2546{
2547 useDefaultIgnoredChars = value;
2548}
2549
2550void
2551FSTProcessor::setDisplayWeightsMode(bool value)
2552{
2553 displayWeightsMode = value;
2554}
2555
2556void
2557FSTProcessor::setMaxAnalysesValue(int value)
2558{
2559 maxAnalyses = value;
2560}
2561
2562void
2563FSTProcessor::setMaxWeightClassesValue(int value)
2564{
2565 maxWeightClasses = value;
2566}
2567
2568void
2569FSTProcessor::setCompoundMaxElements(int value)
2570{
2571 compound_max_elements = value;
2572}
2573
2574bool
2575FSTProcessor::getDecompoundingMode()
2576{
2577 return do_decomposition;
2578}
2579
2580bool
2581FSTProcessor::getNullFlush()
2582{
2583 return nullFlush;
2584}
2585
2586FSTProcessor::Indices
2587FSTProcessor::firstNotAlpha(UStringView sf)
2588{
2589 FSTProcessor::Indices ix = { 0, 0 };
2590 UCharCharacterIterator it = UCharCharacterIterator(sf.data(), sf.size());
2591 while (it.hasNext()) {
2592 UChar32 c = it.next32PostInc();
2593 if(!isAlphabetic(c))
2594 {
2595 return ix;
2596 }
2597 ix.i_codepoint++;
2598 ix.i_utf16++;
2599 if(c > UINT16_MAX(65535)) {
2600 ix.i_utf16++;
2601 }
2602 }
2603 return ix;
2604}