File: | fst_processor.cc |
Warning: | line 1653, column 9 Value stored to 'cur_word' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /* |
2 | * Copyright (C) 2005-2019 Universitat d'Alacant / Universidad de Alicante |
3 | * |
4 | * This program is free software; you can redistribute it and/or |
5 | * modify it under the terms of the GNU General Public License as |
6 | * published by the Free Software Foundation; either version 2 of the |
7 | * License, or (at your option) any later version. |
8 | * |
9 | * This program is distributed in the hope that it will be useful, but |
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | * General Public License for more details. |
13 | * |
14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, see <https://www.gnu.org/licenses/>. |
16 | */ |
17 | #include <lttoolbox/fst_processor.h> |
18 | #include <lttoolbox/compression.h> |
19 | #include <lttoolbox/exception.h> |
20 | #include <lttoolbox/xml_parse_util.h> |
21 | #include <lttoolbox/file_utils.h> |
22 | #include <lttoolbox/string_utils.h> |
23 | #include <lttoolbox/symbol_iter.h> |
24 | |
25 | #include <iostream> |
26 | #include <cerrno> |
27 | #include <climits> |
28 | |
29 | |
30 | FSTProcessor::FSTProcessor() |
31 | { |
32 | // escaped_chars chars |
33 | escaped_chars.insert('['); |
34 | escaped_chars.insert(']'); |
35 | escaped_chars.insert('{'); |
36 | escaped_chars.insert('}'); |
37 | escaped_chars.insert('^'); |
38 | escaped_chars.insert('$'); |
39 | escaped_chars.insert('/'); |
40 | escaped_chars.insert('\\'); |
41 | escaped_chars.insert('@'); |
42 | escaped_chars.insert('<'); |
43 | escaped_chars.insert('>'); |
44 | |
45 | if(useDefaultIgnoredChars) |
46 | { |
47 | initDefaultIgnoredCharacters(); |
48 | } |
49 | } |
50 | |
51 | void |
52 | FSTProcessor::streamError() |
53 | { |
54 | throw Exception("Error: Malformed input stream."); |
55 | } |
56 | |
57 | void |
58 | FSTProcessor::parseICX(std::string const &file) |
59 | { |
60 | if(useIgnoredChars) |
61 | { |
62 | reader = xmlReaderForFile(file.c_str(), NULL__null, 0); |
63 | if(reader == NULL__null) |
64 | { |
65 | std::cerr << "Error: cannot open '" << file << "'." << std::endl; |
66 | exit(EXIT_FAILURE1); |
67 | } |
68 | int ret = xmlTextReaderRead(reader); |
69 | while(ret == 1) |
70 | { |
71 | procNodeICX(); |
72 | ret = xmlTextReaderRead(reader); |
73 | } |
74 | // No point trying to process ignored chars if there are none |
75 | if(ignored_chars.size() == 0) |
76 | { |
77 | useIgnoredChars = false; |
78 | } |
79 | } |
80 | } |
81 | |
82 | void |
83 | FSTProcessor::parseRCX(std::string const &file) |
84 | { |
85 | if(useRestoreChars) |
86 | { |
87 | reader = xmlReaderForFile(file.c_str(), NULL__null, 0); |
88 | if(reader == NULL__null) |
89 | { |
90 | std::cerr << "Error: cannot open '" << file << "'." << std::endl; |
91 | exit(EXIT_FAILURE1); |
92 | } |
93 | int ret = xmlTextReaderRead(reader); |
94 | while(ret == 1) |
95 | { |
96 | procNodeRCX(); |
97 | ret = xmlTextReaderRead(reader); |
98 | } |
99 | } |
100 | } |
101 | |
102 | void |
103 | FSTProcessor::procNodeICX() |
104 | { |
105 | UString name = XMLParseUtil::readName(reader); |
106 | if(name == XML_TEXT_NODE) |
107 | { |
108 | /* ignore */ |
109 | } |
110 | else if(name == XML_IGNORED_CHARS_ELEM) |
111 | { |
112 | /* ignore */ |
113 | } |
114 | else if(name == XML_CHAR_ELEM) |
115 | { |
116 | ignored_chars.insert(static_cast<int32_t>(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0])); |
117 | } |
118 | else if(name == XML_COMMENT_NODE) |
119 | { |
120 | /* ignore */ |
121 | } |
122 | else |
123 | { |
124 | std::cerr << "Error in ICX file (" << xmlTextReaderGetParserLineNumber(reader); |
125 | std::cerr << "): Invalid node '<" << name << ">'." << std::endl; |
126 | exit(EXIT_FAILURE1); |
127 | } |
128 | } |
129 | |
130 | void |
131 | FSTProcessor::initDefaultIgnoredCharacters() |
132 | { |
133 | ignored_chars.insert(173); // '\u00AD', soft hyphen |
134 | } |
135 | |
136 | void |
137 | FSTProcessor::procNodeRCX() |
138 | { |
139 | UString name = XMLParseUtil::readName(reader); |
140 | if(name == XML_TEXT_NODE) |
141 | { |
142 | /* ignore */ |
143 | } |
144 | else if(name == XML_RESTORE_CHARS_ELEM) |
145 | { |
146 | /* ignore */ |
147 | } |
148 | else if(name == XML_CHAR_ELEM) |
149 | { |
150 | rcx_current_char = static_cast<int32_t>(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0]); |
151 | } |
152 | else if(name == XML_RESTORE_CHAR_ELEM) |
153 | { |
154 | rcx_map[rcx_current_char].insert(static_cast<int32_t>(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0])); |
155 | } |
156 | else if(name == XML_COMMENT_NODE) |
157 | { |
158 | /* ignore */ |
159 | } |
160 | else |
161 | { |
162 | std::cerr << "Error in RCX file (" << xmlTextReaderGetParserLineNumber(reader); |
163 | std::cerr << "): Invalid node '<" << name << ">'." << std::endl; |
164 | exit(EXIT_FAILURE1); |
165 | } |
166 | } |
167 | |
168 | int |
169 | FSTProcessor::readAnalysis(InputFile& input) |
170 | { |
171 | if (!input_buffer.isEmpty()) |
172 | { |
173 | UChar32 val = input_buffer.next(); |
174 | return val; |
175 | } |
176 | |
177 | UChar32 val = input.get(); |
178 | int32_t altval = 0; |
179 | if(input.eof()) |
180 | { |
181 | input_buffer.add(0); // so it's treated like the NUL byte |
182 | return 0; |
183 | } else if(val == U_EOF0xFFFF) { |
184 | val = 0; |
185 | } |
186 | |
187 | while ((useIgnoredChars || useDefaultIgnoredChars) && ignored_chars.find(val) != ignored_chars.end()) |
188 | { |
189 | val = input.get(); |
190 | } |
191 | |
192 | if(escaped_chars.find(val) != escaped_chars.end()) |
193 | { |
194 | switch(val) |
195 | { |
196 | case '<': |
197 | altval = alphabet(input.readBlock('<', '>')); |
198 | input_buffer.add(altval); |
199 | return altval; |
200 | |
201 | case '[': |
202 | val = input.get(); |
203 | |
204 | if(val == '[') |
205 | { |
206 | blankqueue.push(input.finishWBlank()); |
207 | } |
208 | else |
209 | { |
210 | input.unget(val); |
211 | blankqueue.push(input.readBlock('[', ']')); |
212 | } |
213 | |
214 | input_buffer.add(static_cast<int32_t>(' ')); |
215 | return static_cast<int32_t>(' '); |
216 | |
217 | case '\\': |
218 | val = input.get(); |
219 | input_buffer.add(static_cast<int32_t>(val)); |
220 | return val; |
221 | |
222 | default: |
223 | streamError(); |
224 | } |
225 | } |
226 | if(val == ' ') { |
227 | blankqueue.push(" "_u); |
228 | } |
229 | |
230 | input_buffer.add(val); |
231 | return val; |
232 | } |
233 | |
234 | int |
235 | FSTProcessor::readTMAnalysis(InputFile& input) |
236 | { |
237 | isLastBlankTM = false; |
238 | if(!input_buffer.isEmpty()) |
239 | { |
240 | return input_buffer.next(); |
241 | } |
242 | |
243 | UChar32 val = input.get(); |
244 | int32_t altval = 0; |
245 | if(input.eof()) |
246 | { |
247 | return 0; |
248 | } |
249 | |
250 | if(escaped_chars.find(val) != escaped_chars.end() || u_isdigitu_isdigit_72(val)) |
251 | { |
252 | switch(val) |
253 | { |
254 | case '<': |
255 | altval = alphabet(input.readBlock('<', '>')); |
256 | input_buffer.add(altval); |
257 | return altval; |
258 | |
259 | case '[': |
260 | val = input.get(); |
261 | |
262 | if(val == '[') |
263 | { |
264 | blankqueue.push(input.finishWBlank()); |
265 | } |
266 | else |
267 | { |
268 | input.unget(val); |
269 | blankqueue.push(input.readBlock('[', ']')); |
270 | } |
271 | |
272 | input_buffer.add(static_cast<int32_t>(' ')); |
273 | isLastBlankTM = true; |
274 | return static_cast<int32_t>(' '); |
275 | |
276 | case '\\': |
277 | val = input.get(); |
278 | input_buffer.add(static_cast<int32_t>(val)); |
279 | return val; |
280 | case '0': |
281 | case '1': |
282 | case '2': |
283 | case '3': |
284 | case '4': |
285 | case '5': |
286 | case '6': |
287 | case '7': |
288 | case '8': |
289 | case '9': |
290 | { |
291 | UString ws; |
292 | do |
293 | { |
294 | ws += val; |
295 | val = input.get(); |
296 | } while(u_isdigitu_isdigit_72(val)); |
297 | input.unget(val); |
298 | input_buffer.add(alphabet(u"<n>")); |
299 | numbers.push_back(ws); |
300 | return alphabet(u"<n>"); |
301 | } |
302 | break; |
303 | |
304 | default: |
305 | streamError(); |
306 | } |
307 | } |
308 | |
309 | input_buffer.add(val); |
310 | return val; |
311 | } |
312 | |
313 | bool |
314 | FSTProcessor::readTransliterationBlank(InputFile& input) |
315 | { |
316 | UString blank; |
317 | while (!input.eof()) { |
318 | UChar32 c = input.get(); |
319 | if (u_isspaceu_isspace_72(c)) { |
320 | blank += c; |
321 | } else if (c == '[') { |
322 | if (input.peek() == '[') { |
323 | break; |
324 | } |
325 | blank += input.readBlock('[', ']'); |
326 | } else { |
327 | input.unget(c); |
328 | break; |
329 | } |
330 | } |
331 | if (!blank.empty()) { |
332 | blankqueue.push(blank); |
333 | } |
334 | return !blank.empty(); |
335 | } |
336 | |
337 | bool |
338 | FSTProcessor::readTransliterationWord(InputFile& input) |
339 | { |
340 | if (input.eof() || input.peek() == '\0') { |
341 | return false; |
342 | } |
343 | |
344 | if (!readTransliterationBlank(input)) { |
345 | blankqueue.push(""_u); |
346 | } |
347 | |
348 | UString wblank; |
349 | std::vector<int32_t> word; |
350 | if (input.peek() == '[') { |
351 | input.get(); |
352 | wblank = input.finishWBlank(); |
353 | while (!input.eof()) { |
354 | if (readTransliterationBlank(input)) { |
355 | word.push_back(static_cast<int32_t>(' ')); |
356 | if (input.peek() == '[') break; |
357 | } else { |
358 | UChar32 c = input.get(); |
359 | if (c == '[') { |
360 | input.unget(c); |
361 | break; |
362 | } else if (c == '\\') { |
363 | word.push_back(static_cast<int32_t>(input.get())); |
364 | } else if (c == '<') { |
365 | word.push_back(alphabet(input.readBlock('<', '>'))); |
366 | } else if (c == '\0') { |
367 | input.unget(c); |
368 | break; |
369 | } else { |
370 | word.push_back(static_cast<int32_t>(c)); |
371 | } |
372 | } |
373 | } |
374 | if (input.peek() == '[') { |
375 | input.get(); |
376 | input.finishWBlank(); |
377 | } |
378 | } else { |
379 | while (!input.eof()) { |
380 | UChar32 c = input.get(); |
381 | if (u_isspaceu_isspace_72(c) || c == '[' || c == '\0') { |
382 | input.unget(c); |
383 | break; |
384 | } else if (c == '\\') { |
385 | word.push_back(static_cast<int32_t>(input.get())); |
386 | } else if (c == '<') { |
387 | word.push_back(alphabet(input.readBlock('<', '>'))); |
388 | } else { |
389 | word.push_back(static_cast<int32_t>(c)); |
390 | } |
391 | } |
392 | } |
393 | if (word.empty()) { |
394 | return false; |
395 | } |
396 | wblankqueue.push_back(wblank); |
397 | transliteration_queue.push_back(word); |
398 | |
399 | return true; |
400 | } |
401 | |
402 | void |
403 | FSTProcessor::skipUntil(InputFile& input, UFILE *output, UChar32 const character) |
404 | { |
405 | while(true) |
406 | { |
407 | UChar32 val = input.get(); |
408 | if(input.eof()) |
409 | { |
410 | return; |
411 | } |
412 | |
413 | switch(val) |
414 | { |
415 | case '\\': |
416 | val = input.get(); |
417 | if(input.eof()) |
418 | { |
419 | return; |
420 | } |
421 | u_fputcu_fputc_72('\\', output); |
422 | u_fputcu_fputc_72(val, output); |
423 | break; |
424 | |
425 | case '\0': |
426 | u_fputcu_fputc_72(val, output); |
427 | if(nullFlushGeneration) |
428 | { |
429 | u_fflushu_fflush_72(output); |
430 | } |
431 | break; |
432 | |
433 | default: |
434 | if(val == character) |
435 | { |
436 | return; |
437 | } |
438 | else |
439 | { |
440 | u_fputcu_fputc_72(val, output); |
441 | } |
442 | break; |
443 | } |
444 | } |
445 | } |
446 | |
447 | int |
448 | FSTProcessor::readGeneration(InputFile& input, UFILE *output) |
449 | { |
450 | UChar32 val = input.get(); |
451 | |
452 | if(input.eof()) |
453 | { |
454 | return 0x7fffffff; |
455 | } |
456 | |
457 | if(outOfWord) |
458 | { |
459 | if(val == '^') |
460 | { |
461 | val = input.get(); |
462 | if(input.eof()) |
463 | { |
464 | return 0x7fffffff; |
465 | } |
466 | } |
467 | else if(val == '\\') |
468 | { |
469 | u_fputcu_fputc_72(val, output); |
470 | val = input.get(); |
471 | if(input.eof()) |
472 | { |
473 | return 0x7fffffff; |
474 | } |
475 | u_fputcu_fputc_72(val,output); |
476 | skipUntil(input, output, '^'); |
477 | val = input.get(); |
478 | if(input.eof()) |
479 | { |
480 | return 0x7fffffff; |
481 | } |
482 | } |
483 | else |
484 | { |
485 | u_fputcu_fputc_72(val, output); |
486 | skipUntil(input, output, '^'); |
487 | val = input.get(); |
488 | if(input.eof()) |
489 | { |
490 | return 0x7fffffff; |
491 | } |
492 | } |
493 | outOfWord = false; |
494 | } |
495 | |
496 | if(val == '\\') |
497 | { |
498 | val = input.get(); |
499 | return static_cast<int32_t>(val); |
500 | } |
501 | else if(val == '$') |
502 | { |
503 | outOfWord = true; |
504 | return static_cast<int32_t>('$'); |
505 | } |
506 | else if(val == '<') |
507 | { |
508 | return alphabet(input.readBlock('<', '>')); |
509 | } |
510 | else if(val == '[') |
511 | { |
512 | val = input.get(); |
513 | if(val == '[') |
514 | { |
515 | write(input.finishWBlank(), output); |
516 | } |
517 | else |
518 | { |
519 | input.unget(val); |
520 | write(input.readBlock('[', ']'), output); |
521 | } |
522 | |
523 | return readGeneration(input, output); |
524 | } |
525 | else |
526 | { |
527 | return static_cast<int32_t>(val); |
528 | } |
529 | |
530 | return 0x7fffffff; |
531 | } |
532 | |
533 | void |
534 | FSTProcessor::flushBlanks(UFILE *output) |
535 | { |
536 | for(size_t i = blankqueue.size(); i > 0; i--) |
537 | { |
538 | write(blankqueue.front(), output); |
539 | blankqueue.pop(); |
540 | } |
541 | } |
542 | |
543 | void |
544 | FSTProcessor::calcInitial() |
545 | { |
546 | for(auto& it : transducers) { |
547 | root.addTransition(0, 0, it.second.getInitial(), default_weight); |
548 | } |
549 | |
550 | initial_state.init(&root); |
551 | } |
552 | |
553 | void |
554 | FSTProcessor::classifyFinals() |
555 | { |
556 | for(auto& it : transducers) { |
557 | if(StringUtils::endswith(it.first, u"@inconditional")) |
558 | { |
559 | inconditional.insert(it.second.getFinals().begin(), |
560 | it.second.getFinals().end()); |
561 | } |
562 | else if(StringUtils::endswith(it.first, u"@standard")) |
563 | { |
564 | standard.insert(it.second.getFinals().begin(), |
565 | it.second.getFinals().end()); |
566 | } |
567 | else if(StringUtils::endswith(it.first, u"@postblank")) |
568 | { |
569 | postblank.insert(it.second.getFinals().begin(), |
570 | it.second.getFinals().end()); |
571 | } |
572 | else if(StringUtils::endswith(it.first, u"@preblank")) |
573 | { |
574 | preblank.insert(it.second.getFinals().begin(), |
575 | it.second.getFinals().end()); |
576 | } |
577 | else |
578 | { |
579 | std::cerr << "Error: Unsupported transducer type for '"; |
580 | std::cerr << it.first << "'." << std::endl; |
581 | exit(EXIT_FAILURE1); |
582 | } |
583 | } |
584 | } |
585 | |
586 | UString |
587 | FSTProcessor::filterFinals(const State& state, UStringView casefrom) |
588 | { |
589 | bool firstupper = false, uppercase = false; |
590 | if (!dictionaryCase) { |
591 | firstupper = u_isupperu_isupper_72(casefrom[0]); |
592 | uppercase = (casefrom.size() > 1 && |
593 | firstupper && u_isupperu_isupper_72(casefrom[casefrom.size()-1])); |
594 | } |
595 | return state.filterFinals(all_finals, alphabet, escaped_chars, |
596 | displayWeightsMode, maxAnalyses, maxWeightClasses, |
597 | uppercase, firstupper, 0); |
598 | } |
599 | |
600 | void |
601 | FSTProcessor::writeEscaped(UStringView str, UFILE *output) |
602 | { |
603 | for(unsigned int i = 0, limit = str.size(); i < limit; i++) |
604 | { |
605 | if(escaped_chars.find(str[i]) != escaped_chars.end()) |
606 | { |
607 | u_fputcu_fputc_72('\\', output); |
608 | } |
609 | u_fputcu_fputc_72(str[i], output); |
610 | } |
611 | } |
612 | |
613 | size_t |
614 | FSTProcessor::writeEscapedPopBlanks(UStringView str, UFILE *output) |
615 | { |
616 | size_t postpop = 0; |
617 | for (unsigned int i = 0, limit = str.size(); i < limit; i++) |
618 | { |
619 | if (escaped_chars.find(str[i]) != escaped_chars.end()) { |
620 | u_fputcu_fputc_72('\\', output); |
621 | } |
622 | u_fputcu_fputc_72(str[i], output); |
623 | if (str[i] == ' ') { |
624 | if (blankqueue.front() == " "_u) { |
625 | blankqueue.pop(); |
626 | } else { |
627 | postpop++; |
628 | } |
629 | } |
630 | } |
631 | return postpop; |
632 | } |
633 | |
634 | void |
635 | FSTProcessor::writeEscapedWithTags(UStringView str, UFILE *output) |
636 | { |
637 | for(unsigned int i = 0, limit = str.size(); i < limit; i++) |
638 | { |
639 | if(str[i] == '<' && i >=1 && str[i-1] != '\\') |
640 | { |
641 | write(str.substr(i), output); |
642 | return; |
643 | } |
644 | |
645 | if(escaped_chars.find(str[i]) != escaped_chars.end()) |
646 | { |
647 | u_fputcu_fputc_72('\\', output); |
648 | } |
649 | u_fputcu_fputc_72(str[i], output); |
650 | } |
651 | } |
652 | |
653 | |
654 | |
655 | void |
656 | FSTProcessor::printWord(UStringView sf, UStringView lf, UFILE *output) |
657 | { |
658 | u_fputcu_fputc_72('^', output); |
659 | writeEscaped(sf, output); |
660 | write(lf, output); |
661 | u_fputcu_fputc_72('$', output); |
662 | } |
663 | |
664 | void |
665 | FSTProcessor::printWordPopBlank(UStringView sf, UStringView lf, UFILE *output) |
666 | { |
667 | u_fputcu_fputc_72('^', output); |
668 | size_t postpop = writeEscapedPopBlanks(sf, output); |
669 | u_fprintfu_fprintf_72(output, "%.*S$", lf.size(), lf.data()); |
670 | while (postpop-- && blankqueue.size() > 0) |
671 | { |
672 | write(blankqueue.front(), output); |
673 | blankqueue.pop(); |
674 | } |
675 | } |
676 | |
677 | void |
678 | FSTProcessor::printUnknownWord(UStringView sf, UFILE *output) |
679 | { |
680 | u_fputcu_fputc_72('^', output); |
681 | writeEscaped(sf, output); |
682 | u_fputcu_fputc_72('/', output); |
683 | u_fputcu_fputc_72('*', output); |
684 | writeEscaped(sf, output); |
685 | u_fputcu_fputc_72('$', output); |
686 | } |
687 | |
688 | unsigned int |
689 | FSTProcessor::lastBlank(UStringView str) |
690 | { |
691 | for(int i = static_cast<int>(str.size())-1; i >= 0; i--) |
692 | { |
693 | if(alphabetic_chars.find(str[i]) == alphabetic_chars.end()) |
694 | { |
695 | return static_cast<unsigned int>(i); |
696 | } |
697 | } |
698 | |
699 | return 0; |
700 | } |
701 | |
702 | void |
703 | FSTProcessor::printSpace(UChar32 val, UFILE *output) |
704 | { |
705 | if(blankqueue.size() > 0) |
706 | { |
707 | flushBlanks(output); |
708 | } |
709 | else |
710 | { |
711 | u_fputcu_fputc_72(val, output); |
712 | } |
713 | } |
714 | |
715 | void |
716 | FSTProcessor::printChar(UChar32 val, UFILE* output) |
717 | { |
718 | if (u_isspaceu_isspace_72(val)) { |
719 | if (blankqueue.size() > 0) { |
720 | write(blankqueue.front(), output); |
721 | blankqueue.pop(); |
722 | } else { |
723 | u_fputcu_fputc_72(val, output); |
724 | } |
725 | } else { |
726 | if (isEscaped(val)) { |
727 | u_fputcu_fputc_72('\\', output); |
728 | } |
729 | if (val) { |
730 | u_fputcu_fputc_72(val, output); |
731 | } |
732 | } |
733 | } |
734 | |
735 | bool |
736 | FSTProcessor::isEscaped(UChar32 c) const |
737 | { |
738 | return escaped_chars.find(c) != escaped_chars.end(); |
739 | } |
740 | |
741 | bool |
742 | FSTProcessor::isAlphabetic(UChar32 c) const |
743 | { |
744 | return u_isalnumu_isalnum_72(c) || alphabetic_chars.find(c) != alphabetic_chars.end(); |
745 | } |
746 | |
747 | void |
748 | FSTProcessor::load(FILE *input) |
749 | { |
750 | readTransducerSet(input, alphabetic_chars, alphabet, transducers); |
751 | } |
752 | |
753 | void |
754 | FSTProcessor::initAnalysis() |
755 | { |
756 | calcInitial(); |
757 | classifyFinals(); |
758 | all_finals = standard; |
759 | all_finals.insert(inconditional.begin(), inconditional.end()); |
760 | all_finals.insert(postblank.begin(), postblank.end()); |
761 | all_finals.insert(preblank.begin(), preblank.end()); |
762 | } |
763 | |
764 | void |
765 | FSTProcessor::initTMAnalysis() |
766 | { |
767 | calcInitial(); |
768 | |
769 | for(auto& it : transducers) { |
770 | all_finals.insert(it.second.getFinals().begin(), |
771 | it.second.getFinals().end()); |
772 | } |
773 | } |
774 | |
775 | void |
776 | FSTProcessor::initGeneration() |
777 | { |
778 | setIgnoredChars(false); |
779 | calcInitial(); |
780 | for(auto& it : transducers) { |
781 | all_finals.insert(it.second.getFinals().begin(), |
782 | it.second.getFinals().end()); |
783 | } |
784 | } |
785 | |
786 | void |
787 | FSTProcessor::initTransliteration() |
788 | { |
789 | initGeneration(); |
790 | } |
791 | |
792 | void |
793 | FSTProcessor::initBiltrans() |
794 | { |
795 | initGeneration(); |
796 | } |
797 | |
798 | |
799 | UString |
800 | FSTProcessor::compoundAnalysis(UString input_word) |
801 | { |
802 | const int MAX_COMBINATIONS = 32767; |
803 | |
804 | State current_state = initial_state; |
805 | |
806 | for(unsigned int i=0; i<input_word.size(); i++) |
807 | { |
808 | UChar val=input_word[i]; |
809 | |
810 | current_state.step_case(val, beCaseSensitive(current_state)); |
811 | |
812 | if(current_state.size() > MAX_COMBINATIONS) |
813 | { |
814 | std::cerr << "Warning: compoundAnalysis's MAX_COMBINATIONS exceeded for '" << input_word << "'" << std::endl; |
815 | std::cerr << " gave up at char " << i << " '" << val << "'." << std::endl; |
816 | |
817 | UString nullString; |
818 | return nullString; |
819 | } |
820 | |
821 | if(i < input_word.size()-1) |
822 | { |
823 | current_state.restartFinals(all_finals, compoundOnlyLSymbol, &initial_state, '+'); |
824 | } |
825 | |
826 | if(current_state.size()==0) |
827 | { |
828 | UString nullString; |
829 | return nullString; |
830 | } |
831 | } |
832 | |
833 | current_state.pruneCompounds(compoundRSymbol, '+', compound_max_elements); |
834 | return filterFinals(current_state, input_word); |
835 | } |
836 | |
837 | |
838 | |
839 | void |
840 | FSTProcessor::initDecompositionSymbols() |
841 | { |
842 | if((compoundOnlyLSymbol=alphabet(u"<:co:only-L>")) == 0 |
843 | && (compoundOnlyLSymbol=alphabet(u"<:compound:only-L>")) == 0 |
844 | && (compoundOnlyLSymbol=alphabet(u"<@co:only-L>")) == 0 |
845 | && (compoundOnlyLSymbol=alphabet(u"<@compound:only-L>")) == 0 |
846 | && (compoundOnlyLSymbol=alphabet(u"<compound-only-L>")) == 0) |
847 | { |
848 | std::cerr << "Warning: Decomposition symbol <:compound:only-L> not found" << std::endl; |
849 | } |
850 | else if(!showControlSymbols) |
851 | { |
852 | alphabet.setSymbol(compoundOnlyLSymbol, u""); |
853 | } |
854 | |
855 | if((compoundRSymbol=alphabet(u"<:co:R>")) == 0 |
856 | && (compoundRSymbol=alphabet(u"<:compound:R>")) == 0 |
857 | && (compoundRSymbol=alphabet(u"<@co:R>")) == 0 |
858 | && (compoundRSymbol=alphabet(u"<@compound:R>")) == 0 |
859 | && (compoundRSymbol=alphabet(u"<compound-R>")) == 0) |
860 | { |
861 | std::cerr << "Warning: Decomposition symbol <:compound:R> not found" << std::endl; |
862 | } |
863 | else if(!showControlSymbols) |
864 | { |
865 | alphabet.setSymbol(compoundRSymbol, u""); |
866 | } |
867 | } |
868 | |
869 | |
870 | void |
871 | FSTProcessor::initDecomposition() |
872 | { |
873 | do_decomposition = true; |
874 | initAnalysis(); |
875 | initDecompositionSymbols(); |
876 | } |
877 | |
878 | void |
879 | FSTProcessor::analysis(InputFile& input, UFILE *output) |
880 | { |
881 | if(getNullFlush()) |
882 | { |
883 | analysis_wrapper_null_flush(input, output); |
884 | } |
885 | |
886 | bool last_incond = false; |
887 | bool last_postblank = false; |
888 | bool last_preblank = false; |
889 | State current_state = initial_state; |
890 | UString lf; // analysis (lexical form and tags) |
891 | UString sf; // surface form |
892 | UString lf_spcmp; // space compound analysis |
893 | bool seen_cpL = false; // have we seen a <compound-only-L> tag so far |
894 | size_t last_start = input_buffer.getPos(); // position in input_buffer when sf was last cleared |
895 | size_t last = 0; // position in input_buffer after last analysis |
896 | size_t last_size = 0; // size of sf at last analysis |
897 | std::map<int, std::set<int> >::iterator rcx_map_ptr; |
898 | |
899 | UChar32 val; |
900 | do |
901 | { |
902 | val = readAnalysis(input); |
903 | // test for final states |
904 | if(current_state.isFinal(all_finals)) |
905 | { |
906 | if(current_state.isFinal(inconditional)) |
907 | { |
908 | if(do_decomposition && compoundOnlyLSymbol != 0) |
909 | { |
910 | current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol); |
911 | } |
912 | lf = filterFinals(current_state, sf); |
913 | last_incond = true; |
914 | last = input_buffer.getPos(); |
915 | last_size = sf.size(); |
916 | } |
917 | else if(current_state.isFinal(postblank)) |
918 | { |
919 | if(do_decomposition && compoundOnlyLSymbol != 0) |
920 | { |
921 | current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol); |
922 | } |
923 | lf = filterFinals(current_state, sf); |
924 | last_postblank = true; |
925 | last = input_buffer.getPos(); |
926 | last_size = sf.size(); |
927 | } |
928 | else if(current_state.isFinal(preblank)) |
929 | { |
930 | if(do_decomposition && compoundOnlyLSymbol != 0) |
931 | { |
932 | current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol); |
933 | } |
934 | lf = filterFinals(current_state, sf); |
935 | last_preblank = true; |
936 | last = input_buffer.getPos(); |
937 | last_size = sf.size(); |
938 | } |
939 | else if(!isAlphabetic(val)) |
940 | { |
941 | if(do_decomposition && compoundOnlyLSymbol != 0) |
942 | { |
943 | current_state.pruneStatesWithForbiddenSymbol(compoundOnlyLSymbol); |
944 | } |
945 | lf = filterFinals(current_state, sf); |
946 | last_postblank = false; |
947 | last_preblank = false; |
948 | last_incond = false; |
949 | last = input_buffer.getPos(); |
950 | last_size = sf.size(); |
951 | } |
952 | else { // isAlphabetic, standard type section |
953 | // Record if a compound might be possible |
954 | if (do_decomposition && compoundOnlyLSymbol != 0 |
955 | && current_state.hasSymbol(compoundOnlyLSymbol)) { |
956 | seen_cpL = true; |
957 | } |
958 | } |
959 | } |
960 | else if(sf.empty() && u_isspaceu_isspace_72(val)) |
961 | { |
962 | lf = "/*"_u; |
963 | lf.append(sf); |
964 | last_postblank = false; |
965 | last_preblank = false; |
966 | last_incond = false; |
967 | last = input_buffer.getPos(); |
968 | last_size = sf.size(); |
969 | } |
970 | |
971 | if(useRestoreChars && rcx_map.find(val) != rcx_map.end()) |
972 | { |
973 | rcx_map_ptr = rcx_map.find(val); |
974 | std::set<int> tmpset = rcx_map_ptr->second; |
975 | if(!u_isupperu_isupper_72(val) || beCaseSensitive(current_state)) |
976 | { |
977 | current_state.step(val, tmpset); |
978 | } |
979 | else if(rcx_map.find(u_toloweru_tolower_72(val)) != rcx_map.end()) |
980 | { |
981 | rcx_map_ptr = rcx_map.find(tolower(val)); |
982 | tmpset.insert(tolower(val)); |
983 | tmpset.insert(rcx_map_ptr->second.begin(), rcx_map_ptr->second.end()); |
984 | current_state.step(val, tmpset); |
985 | } |
986 | else |
987 | { |
988 | tmpset.insert(tolower(val)); |
989 | current_state.step(val, tmpset); |
990 | } |
991 | } |
992 | else |
993 | { |
994 | current_state.step_case(val, beCaseSensitive(current_state)); |
995 | } |
996 | |
997 | if(current_state.size() != 0) |
998 | { |
999 | if(val != 0) |
1000 | { |
1001 | alphabet.getSymbol(sf, val); |
1002 | } |
1003 | } |
1004 | else |
1005 | { |
1006 | // First try if blank-crossing compound analysis is possible; have |
1007 | // to fall back on the regular methods if this didn't work: |
1008 | lf_spcmp.clear(); |
1009 | if (seen_cpL // We've seen both a space and a <compund-only-L> |
1010 | && isAlphabetic(val) |
1011 | && !sf.empty() |
1012 | && last_size <= lastBlank(sf)) { |
1013 | int oldval = val; |
1014 | UString oldsf = sf; |
1015 | do { |
1016 | alphabet.getSymbol(sf, val); |
1017 | } while ((val = readAnalysis(input)) && isAlphabetic(val)); |
1018 | lf_spcmp = compoundAnalysis(sf); |
1019 | if(lf_spcmp.empty()) { // didn't work, rewind! |
1020 | input_buffer.back(sf.size() - oldsf.size()); |
1021 | val = oldval; |
1022 | sf.swap(oldsf); |
1023 | } |
1024 | else { |
1025 | input_buffer.back(1); |
1026 | val = input_buffer.peek(); |
1027 | } |
1028 | } |
1029 | seen_cpL = false; |
1030 | |
1031 | if(!lf_spcmp.empty()) { |
1032 | printWordPopBlank(sf, lf_spcmp, output); |
1033 | } |
1034 | else if(!isAlphabetic(val) && sf.empty()) |
1035 | { |
1036 | printChar(val, output); |
1037 | } |
1038 | else if(last_postblank) |
1039 | { |
1040 | printWordPopBlank(sf.substr(0, last_size), |
1041 | lf, output); |
1042 | u_fputcu_fputc_72(' ', output); |
1043 | input_buffer.setPos(last); |
1044 | input_buffer.back(1); |
1045 | } |
1046 | else if(last_preblank) |
1047 | { |
1048 | u_fputcu_fputc_72(' ', output); |
1049 | printWordPopBlank(sf.substr(0, last_size), |
1050 | lf, output); |
1051 | input_buffer.setPos(last); |
1052 | input_buffer.back(1); |
1053 | } |
1054 | else if(last_incond) |
1055 | { |
1056 | printWordPopBlank(sf.substr(0, last_size), |
1057 | lf, output); |
1058 | input_buffer.setPos(last); |
1059 | input_buffer.back(1); |
1060 | } |
1061 | else if(isAlphabetic(val) && |
1062 | // we can't skip back a blank: |
1063 | (last_size > lastBlank(sf) || |
1064 | // or we've failed to reach an analysis: |
1065 | lf.empty())) |
1066 | { |
1067 | do |
1068 | { |
1069 | alphabet.getSymbol(sf, val); |
1070 | } |
1071 | while((val = readAnalysis(input)) && isAlphabetic(val)); |
1072 | |
1073 | auto limit = firstNotAlpha(sf); |
1074 | if(limit.i_codepoint == 0) |
1075 | { |
1076 | input_buffer.setPos(1 + last_start); |
1077 | writeEscaped(sf.substr(0,1), output); |
1078 | } |
1079 | else |
1080 | { |
1081 | input_buffer.setPos(last_start + limit.i_codepoint); |
1082 | UString unknown_word = sf.substr(0, limit.i_utf16); |
1083 | if(do_decomposition) |
1084 | { |
1085 | UString compound = compoundAnalysis(unknown_word); |
1086 | if(!compound.empty()) |
1087 | { |
1088 | printWord(unknown_word, compound, output); |
1089 | } |
1090 | else |
1091 | { |
1092 | printUnknownWord(unknown_word, output); |
1093 | } |
1094 | } |
1095 | else |
1096 | { |
1097 | printUnknownWord(unknown_word, output); |
1098 | } |
1099 | } |
1100 | } |
1101 | else if(lf.empty()) |
1102 | { |
1103 | auto limit = firstNotAlpha(sf); |
1104 | if(limit.i_codepoint == 0) |
1105 | { |
1106 | input_buffer.setPos(1 + last_start); |
1107 | writeEscaped(sf.substr(0,1), output); |
1108 | } |
1109 | else |
1110 | { |
1111 | input_buffer.setPos(last_start + limit.i_codepoint); |
1112 | UString unknown_word = sf.substr(0, limit.i_utf16); |
1113 | if(do_decomposition) |
1114 | { |
1115 | UString compound = compoundAnalysis(unknown_word); |
1116 | if(!compound.empty()) |
1117 | { |
1118 | printWord(unknown_word, compound, output); |
1119 | } |
1120 | else |
1121 | { |
1122 | printUnknownWord(unknown_word, output); |
1123 | } |
1124 | } |
1125 | else |
1126 | { |
1127 | printUnknownWord(unknown_word, output); |
1128 | } |
1129 | } |
1130 | } |
1131 | else |
1132 | { |
1133 | printWordPopBlank(sf.substr(0, last_size), |
1134 | lf, output); |
1135 | input_buffer.setPos(last); |
1136 | input_buffer.back(1); |
1137 | } |
1138 | if(val == 0) { |
1139 | if(!input_buffer.isEmpty()) { |
1140 | input_buffer.setPos(last+1); |
1141 | } |
1142 | } |
1143 | |
1144 | current_state = initial_state; |
1145 | lf.clear(); |
1146 | sf.clear(); |
1147 | last_start = input_buffer.getPos(); |
1148 | last_incond = false; |
1149 | last_postblank = false; |
1150 | last_preblank = false; |
1151 | } |
1152 | } |
1153 | while(val); |
1154 | |
1155 | // print remaining blanks |
1156 | flushBlanks(output); |
1157 | } |
1158 | |
1159 | void |
1160 | FSTProcessor::analysis_wrapper_null_flush(InputFile& input, UFILE *output) |
1161 | { |
1162 | setNullFlush(false); |
1163 | while(!input.eof()) |
1164 | { |
1165 | analysis(input, output); |
1166 | u_fputcu_fputc_72('\0', output); |
1167 | u_fflushu_fflush_72(output); |
1168 | // analysis() doesn't always leave input_buffer empty |
1169 | // which results in repeatedly analyzing the same string |
1170 | // so clear it here |
1171 | while (!input_buffer.isEmpty()) input_buffer.next(); |
1172 | } |
1173 | } |
1174 | |
1175 | void |
1176 | FSTProcessor::generation_wrapper_null_flush(InputFile& input, UFILE *output, |
1177 | GenerationMode mode) |
1178 | { |
1179 | setNullFlush(false); |
1180 | nullFlushGeneration = true; |
1181 | |
1182 | while(!input.eof()) |
1183 | { |
1184 | generation(input, output, mode); |
1185 | u_fputcu_fputc_72('\0', output); |
1186 | u_fflushu_fflush_72(output); |
1187 | } |
1188 | } |
1189 | |
1190 | void |
1191 | FSTProcessor::tm_analysis(InputFile& input, UFILE *output) |
1192 | { |
1193 | State current_state = initial_state; |
1194 | UString lf; //lexical form |
1195 | UString sf; //surface form |
1196 | int last = 0; |
1197 | |
1198 | while(int32_t val = readTMAnalysis(input)) |
1199 | { |
1200 | // test for final states |
1201 | if(current_state.isFinal(all_finals)) |
1202 | { |
1203 | if(u_ispunctu_ispunct_72(val)) |
1204 | { |
1205 | lf = current_state.filterFinalsTM(all_finals, alphabet, |
1206 | escaped_chars, |
1207 | blankqueue, numbers).substr(1); |
1208 | last = input_buffer.getPos(); |
1209 | numbers.clear(); |
1210 | } |
1211 | } |
1212 | else if(sf.empty() && u_isspaceu_isspace_72(val)) |
1213 | { |
1214 | lf.append(sf); |
1215 | last = input_buffer.getPos(); |
1216 | } |
1217 | |
1218 | current_state.step_case(val, false); |
1219 | |
1220 | if(current_state.size() != 0) |
1221 | { |
1222 | if(val == -1) |
1223 | { |
1224 | sf.append(numbers[numbers.size()-1]); |
1225 | } |
1226 | else if(isLastBlankTM && val == ' ') |
1227 | { |
1228 | sf.append(blankqueue.back()); |
1229 | } |
1230 | else |
1231 | { |
1232 | alphabet.getSymbol(sf, val); |
1233 | } |
1234 | } |
1235 | else |
1236 | { |
1237 | if((u_isspaceu_isspace_72(val) || u_ispunctu_ispunct_72(val)) && sf.empty()) |
1238 | { |
1239 | if(u_isspaceu_isspace_72(val)) |
1240 | { |
1241 | printSpace(val, output); |
1242 | } |
1243 | else |
1244 | { |
1245 | if(isEscaped(val)) |
1246 | { |
1247 | u_fputcu_fputc_72('\\', output); |
1248 | } |
1249 | u_fputcu_fputc_72(val, output); |
1250 | } |
1251 | } |
1252 | else if(!u_isspaceu_isspace_72(val) && !u_ispunctu_ispunct_72(val) && |
1253 | ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || |
1254 | lf.empty())) |
1255 | { |
1256 | |
1257 | do |
1258 | { |
1259 | if(val == -1) |
1260 | { |
1261 | sf.append(numbers[numbers.size()-1]); |
1262 | } |
1263 | else if(isLastBlankTM && val == ' ') |
1264 | { |
1265 | sf.append(blankqueue.back()); |
1266 | } |
1267 | else |
1268 | { |
1269 | alphabet.getSymbol(sf, val); |
1270 | } |
1271 | } |
1272 | while((val = readTMAnalysis(input)) && !u_isspaceu_isspace_72(val) && !u_ispunctu_ispunct_72(val)); |
1273 | |
1274 | if(val == 0) |
1275 | { |
1276 | write(sf, output); |
1277 | return; |
1278 | } |
1279 | |
1280 | input_buffer.back(1); |
1281 | write(sf, output); |
1282 | |
1283 | while(blankqueue.size() > 0) |
1284 | { |
1285 | if(blankqueue.size() == 1 && isLastBlankTM) |
1286 | { |
1287 | break; |
1288 | } |
1289 | blankqueue.pop(); |
1290 | } |
1291 | |
1292 | /* |
1293 | unsigned int limit = sf.find(' '); |
1294 | unsigned int size = sf.size(); |
1295 | limit = (limit == static_cast<unsigned int>(UString::npos)?size:limit); |
1296 | input_buffer.back(1+(size-limit)); |
1297 | write(sf.substr(0, limit), output); |
1298 | */ } |
1299 | else if(lf.empty()) |
1300 | { |
1301 | /* unsigned int limit = sf.find(' '); |
1302 | unsigned int size = sf.size(); |
1303 | limit = (limit == static_cast<unsigned int >(UString::npos)?size:limit); |
1304 | input_buffer.back(1+(size-limit)); |
1305 | write(sf.substr(0, limit), output); |
1306 | */ |
1307 | input_buffer.back(1); |
1308 | write(sf, output); |
1309 | |
1310 | while(blankqueue.size() > 0) |
1311 | { |
1312 | if(blankqueue.size() == 1 && isLastBlankTM) |
1313 | { |
1314 | break; |
1315 | } |
1316 | blankqueue.pop(); |
1317 | } |
1318 | |
1319 | } |
1320 | else |
1321 | { |
1322 | u_fprintfu_fprintf_72(output, "[%S]", lf.c_str()); |
1323 | input_buffer.setPos(last); |
1324 | input_buffer.back(1); |
1325 | } |
1326 | |
1327 | current_state = initial_state; |
1328 | lf.clear(); |
1329 | sf.clear(); |
1330 | } |
1331 | } |
1332 | |
1333 | // print remaining blanks |
1334 | flushBlanks(output); |
1335 | } |
1336 | |
1337 | |
1338 | void |
1339 | FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode) |
1340 | { |
1341 | if(getNullFlush()) |
1342 | { |
1343 | generation_wrapper_null_flush(input, output, mode); |
1344 | } |
1345 | |
1346 | State current_state = initial_state; |
1347 | UString sf; |
1348 | |
1349 | outOfWord = false; |
1350 | |
1351 | skipUntil(input, output, '^'); |
1352 | int val; |
1353 | |
1354 | while((val = readGeneration(input, output)) != 0x7fffffff) |
1355 | { |
1356 | if(sf.empty() && val == '=') |
1357 | { |
1358 | u_fputcu_fputc_72('=', output); |
1359 | val = readGeneration(input, output); |
1360 | } |
1361 | |
1362 | if(val == '$' && outOfWord) |
1363 | { |
1364 | if(sf[0] == '*' || sf[0] == '%') |
1365 | { |
1366 | if(mode != gm_clean && mode != gm_tagged_nm) |
1367 | { |
1368 | writeEscaped(sf, output); |
1369 | } |
1370 | else if (mode == gm_clean) |
1371 | { |
1372 | writeEscaped(sf.substr(1), output); |
1373 | } |
1374 | else if(mode == gm_tagged_nm) |
1375 | { |
1376 | u_fputcu_fputc_72('^', output); |
1377 | writeEscaped(removeTags(sf.substr(1)), output); |
1378 | u_fputcu_fputc_72('/', output); |
1379 | writeEscapedWithTags(sf, output); |
1380 | u_fputcu_fputc_72('$', output); |
1381 | } |
1382 | } |
1383 | else if(sf[0] == '@') |
1384 | { |
1385 | if(mode == gm_all) |
1386 | { |
1387 | writeEscaped(sf, output); |
1388 | } |
1389 | else if(mode == gm_clean) |
1390 | { |
1391 | writeEscaped(removeTags(sf.substr(1)), output); |
1392 | } |
1393 | else if(mode == gm_unknown) |
1394 | { |
1395 | writeEscaped(removeTags(sf), output); |
1396 | } |
1397 | else if(mode == gm_tagged) |
1398 | { |
1399 | writeEscaped(removeTags(sf), output); |
1400 | } |
1401 | else if(mode == gm_tagged_nm) |
1402 | { |
1403 | u_fputcu_fputc_72('^', output); |
1404 | writeEscaped(removeTags(sf.substr(1)), output); |
1405 | u_fputcu_fputc_72('/', output); |
1406 | writeEscapedWithTags(sf, output); |
1407 | u_fputcu_fputc_72('$', output); |
1408 | } |
1409 | } |
1410 | else if(current_state.isFinal(all_finals)) |
1411 | { |
1412 | bool firstupper = false, uppercase = false; |
1413 | if(!dictionaryCase) |
1414 | { |
1415 | uppercase = sf.size() > 1 && u_isupperu_isupper_72(sf[1]); |
1416 | firstupper= u_isupperu_isupper_72(sf[0]); |
1417 | } |
1418 | |
1419 | if(mode == gm_tagged || mode == gm_tagged_nm) |
1420 | { |
1421 | u_fputcu_fputc_72('^', output); |
1422 | } |
1423 | |
1424 | write(current_state.filterFinals(all_finals, alphabet, |
1425 | escaped_chars, |
1426 | displayWeightsMode, maxAnalyses, maxWeightClasses, |
1427 | uppercase, firstupper).substr(1), output); |
1428 | if(mode == gm_tagged || mode == gm_tagged_nm) |
1429 | { |
1430 | u_fputcu_fputc_72('/', output); |
1431 | writeEscapedWithTags(sf, output); |
1432 | u_fputcu_fputc_72('$', output); |
1433 | } |
1434 | |
1435 | } |
1436 | else |
1437 | { |
1438 | if(mode == gm_all) |
1439 | { |
1440 | u_fputcu_fputc_72('#', output); |
1441 | writeEscaped(sf, output); |
1442 | } |
1443 | else if(mode == gm_clean) |
1444 | { |
1445 | writeEscaped(removeTags(sf), output); |
1446 | } |
1447 | else if(mode == gm_unknown) |
1448 | { |
1449 | if(!sf.empty()) |
1450 | { |
1451 | u_fputcu_fputc_72('#', output); |
1452 | writeEscaped(removeTags(sf), output); |
1453 | } |
1454 | } |
1455 | else if(mode == gm_tagged) |
1456 | { |
1457 | u_fputcu_fputc_72('#', output); |
1458 | writeEscaped(removeTags(sf), output); |
1459 | } |
1460 | else if(mode == gm_tagged_nm) |
1461 | { |
1462 | u_fputcu_fputc_72('^', output); |
1463 | writeEscaped(removeTags(sf), output); |
1464 | u_fputcu_fputc_72('/', output); |
1465 | u_fputcu_fputc_72('#', output); |
1466 | writeEscapedWithTags(sf, output); |
1467 | u_fputcu_fputc_72('$', output); |
1468 | } |
1469 | } |
1470 | |
1471 | current_state = initial_state; |
1472 | sf.clear(); |
1473 | } |
1474 | else if(u_isspaceu_isspace_72(val) && sf.size() == 0) |
1475 | { |
1476 | // do nothing |
1477 | } |
1478 | else if(sf.size() > 0 && (sf[0] == '*' || sf[0] == '%' )) |
1479 | { |
1480 | alphabet.getSymbol(sf, val); |
1481 | } |
1482 | else |
1483 | { |
1484 | alphabet.getSymbol(sf,val); |
1485 | if(current_state.size() > 0) |
1486 | { |
1487 | if(!alphabet.isTag(val) && u_isupperu_isupper_72(val) && !(beCaseSensitive(current_state))) |
1488 | { |
1489 | if(mode == gm_carefulcase) |
1490 | { |
1491 | current_state.step_careful(val, u_toloweru_tolower_72(val)); |
1492 | } |
1493 | else |
1494 | { |
1495 | current_state.step(val, u_toloweru_tolower_72(val)); |
1496 | } |
1497 | } |
1498 | else |
1499 | { |
1500 | current_state.step(val); |
1501 | } |
1502 | } |
1503 | } |
1504 | } |
1505 | } |
1506 | |
1507 | void |
1508 | FSTProcessor::postgeneration(InputFile& input, UFILE *output) |
1509 | { |
1510 | transliteration_drop_tilde = true; |
1511 | transliteration(input, output); |
1512 | } |
1513 | |
1514 | void |
1515 | FSTProcessor::intergeneration(InputFile& input, UFILE *output) |
1516 | { |
1517 | transliteration_drop_tilde = false; |
1518 | transliteration(input, output); |
1519 | } |
1520 | |
1521 | void |
1522 | FSTProcessor::transliteration(InputFile& input, UFILE *output) |
1523 | { |
1524 | size_t start_pos = 0; |
1525 | size_t cur_word = 0; |
1526 | size_t cur_pos = 0; |
1527 | size_t match_pos = 0; |
1528 | State current_state = initial_state; |
1529 | UString last_match; |
1530 | int space_diff = 0; |
1531 | |
1532 | bool firstupper = false; |
1533 | bool uppercase = false; |
1534 | bool have_first = false; |
1535 | bool have_second = false; |
1536 | |
1537 | while (true) { |
1538 | if (transliteration_queue.empty()) { |
1539 | if (!blankqueue.empty()) { |
1540 | flushBlanks(output); |
1541 | } |
1542 | if (!readTransliterationWord(input)) { |
1543 | flushBlanks(output); |
1544 | if (input.eof()) { |
1545 | break; |
1546 | } else { |
1547 | u_fputcu_fputc_72(input.get(), output); |
1548 | u_fflushu_fflush_72(output); |
1549 | continue; |
1550 | } |
1551 | } |
1552 | } |
1553 | |
1554 | if (current_state.isFinal(all_finals)) { |
1555 | last_match = current_state.filterFinals(all_finals, alphabet, |
1556 | escaped_chars, displayWeightsMode, |
1557 | 1, maxWeightClasses, |
1558 | uppercase, firstupper); |
1559 | while (cur_word > 0) { |
1560 | if (cur_word == 1) { |
1561 | if (cur_pos == 0 && last_match[last_match.size()-1] == ' ') { |
1562 | match_pos = transliteration_queue.front().size(); |
1563 | last_match = last_match.substr(0, last_match.size()-1); |
1564 | break; |
1565 | } else { |
1566 | cur_pos += transliteration_queue.front().size() + 1; |
1567 | } |
1568 | } |
1569 | std::vector<int32_t> word = transliteration_queue.front(); |
1570 | transliteration_queue.pop_front(); |
1571 | word.push_back(static_cast<int32_t>(' ')); |
1572 | word.insert(word.end(), transliteration_queue.front().begin(), |
1573 | transliteration_queue.front().end()); |
1574 | transliteration_queue.pop_front(); |
1575 | transliteration_queue.push_front(word); |
1576 | UString wblank = wblankqueue.front(); |
1577 | wblankqueue.pop_front(); |
1578 | wblank = StringUtils::merge_wblanks(wblank, wblankqueue.front()); |
1579 | wblankqueue.pop_front(); |
1580 | wblankqueue.push_front(wblank); |
1581 | cur_word--; |
1582 | } |
1583 | if (cur_word == 0) { |
1584 | match_pos = cur_pos; |
1585 | } |
1586 | } |
1587 | |
1588 | int32_t sym = 0; |
1589 | bool is_end = false; |
1590 | if (cur_pos < transliteration_queue[cur_word].size()) { |
1591 | sym = transliteration_queue[cur_word][cur_pos]; |
1592 | cur_pos++; |
1593 | } else { |
1594 | if (cur_word + 1 == transliteration_queue.size() && |
1595 | !readTransliterationWord(input)) { |
1596 | is_end = true; |
1597 | } else { |
1598 | sym = static_cast<int32_t>(' '); |
1599 | cur_word++; |
1600 | cur_pos = 0; |
1601 | } |
1602 | } |
1603 | |
1604 | if (isAlphabetic(sym)) { |
1605 | if (!have_first) { |
1606 | have_first = true; |
1607 | if (u_isupperu_isupper_72(sym)) { |
1608 | firstupper = true; |
1609 | } else { |
1610 | firstupper = false; |
1611 | have_second = true; |
1612 | } |
1613 | } else if (!have_second) { |
1614 | have_second = true; |
1615 | uppercase = u_isupperu_isupper_72(sym); |
1616 | } |
1617 | } |
1618 | |
1619 | current_state.step_case_override(sym, beCaseSensitive(current_state)); |
1620 | |
1621 | if (current_state.size() == 0 || is_end) { |
1622 | if (last_match.empty()) { |
1623 | start_pos++; |
1624 | } else { |
1625 | std::vector<int32_t> match = alphabet.tokenize(last_match.substr(1)); |
1626 | last_match.clear(); |
1627 | std::vector<int32_t> word = transliteration_queue.front(); |
1628 | transliteration_queue.pop_front(); |
1629 | size_t i = 0; |
1630 | for (; i < match.size() && i < match_pos - start_pos; i++) { |
1631 | if (match[match.size()-i-1] != word[match_pos-i-1]) { |
1632 | break; |
1633 | } |
1634 | } |
1635 | std::vector<int32_t> new_word; |
1636 | new_word.insert(new_word.end(), word.begin(), word.begin()+start_pos); |
1637 | new_word.insert(new_word.end(), match.begin(), match.end()); |
1638 | new_word.insert(new_word.end(), word.begin()+match_pos, word.end()); |
1639 | transliteration_queue.push_front(new_word); |
1640 | int sf_spaces = 0; |
1641 | int lf_spaces = 0; |
1642 | for (auto c : word) { |
1643 | if (c == static_cast<int32_t>(' ')) sf_spaces++; |
1644 | } |
1645 | for (auto c : new_word) { |
1646 | if (c == static_cast<int32_t>(' ')) lf_spaces++; |
1647 | } |
1648 | space_diff += (lf_spaces - sf_spaces); |
1649 | size_t last_start = start_pos; |
1650 | start_pos = match_pos - i; |
1651 | if (start_pos == last_start) start_pos++; |
1652 | cur_pos = start_pos; |
1653 | cur_word = 0; |
Value stored to 'cur_word' is never read | |
1654 | } |
1655 | if (start_pos >= transliteration_queue.front().size()) { |
1656 | write(blankqueue.front(), output); |
1657 | blankqueue.pop(); |
1658 | bool has_wblank = !wblankqueue.front().empty(); |
1659 | write(wblankqueue.front(), output); |
1660 | wblankqueue.pop_front(); |
1661 | auto word = transliteration_queue.front(); |
1662 | transliteration_queue.pop_front(); |
1663 | int space_count = 0; |
1664 | for (auto c : word) { |
1665 | if (c == static_cast<int32_t>(' ')) space_count++; |
1666 | } |
1667 | int space_out = 0; |
1668 | UString out; |
1669 | for (auto c : word) { |
1670 | if (c == ' ') { |
1671 | if (space_out + space_diff >= space_count) { |
1672 | out += ' '; |
1673 | } else { |
1674 | out += blankqueue.front(); |
1675 | blankqueue.pop(); |
1676 | } |
1677 | space_out++; |
1678 | } else if (transliteration_drop_tilde && |
1679 | c == static_cast<int32_t>('~')) { |
1680 | } else { |
1681 | if (c > 0 && isEscaped(c)) { |
1682 | out += '\\'; |
1683 | } |
1684 | alphabet.getSymbol(out, c); |
1685 | } |
1686 | } |
1687 | write(out, output); |
1688 | if (has_wblank) { |
1689 | write(WBLANK_FINAL, output); |
1690 | } |
1691 | while (space_diff < 0) { |
1692 | if (blankqueue.front() != " "_u) { |
1693 | write(blankqueue.front(), output); |
1694 | } |
1695 | blankqueue.pop(); |
1696 | space_diff++; |
1697 | } |
1698 | space_diff = 0; |
1699 | start_pos = 0; |
1700 | } |
1701 | match_pos = 0; |
1702 | cur_pos = start_pos; |
1703 | cur_word = 0; |
1704 | uppercase = false; |
1705 | firstupper = false; |
1706 | have_first = false; |
1707 | have_second = false; |
1708 | current_state = initial_state; |
1709 | } |
1710 | } |
1711 | } |
1712 | |
1713 | bool |
1714 | FSTProcessor::step_biltrans(UStringView word, UString& result, UString& queue, |
1715 | bool delim, bool mark) |
1716 | { |
1717 | State current_state = initial_state; |
1718 | bool firstupper = u_isupperu_isupper_72(word[0]); |
1719 | bool uppercase = firstupper && u_isupperu_isupper_72(word[1]); |
1720 | for (auto symbol : symbol_iter(word)) { |
1721 | int32_t val = (symbol.size() == 1 ? symbol[0] : alphabet(symbol)); |
1722 | if (current_state.size() != 0) { |
1723 | current_state.step(val, beCaseSensitive(current_state)); |
1724 | } |
1725 | if (current_state.isFinal(all_finals)) { |
1726 | result.clear(); |
1727 | if (delim) result += '^'; |
1728 | if (mark) result += '='; |
1729 | result += current_state.filterFinals(all_finals, alphabet, |
1730 | escaped_chars, |
1731 | displayWeightsMode, maxAnalyses, maxWeightClasses, |
1732 | uppercase, firstupper, 0).substr(1); |
1733 | } |
1734 | if (current_state.size() == 0) { |
1735 | if (!result.empty()) queue.append(symbol); |
1736 | else return false; |
1737 | } |
1738 | } |
1739 | return !result.empty(); |
1740 | } |
1741 | |
1742 | UString |
1743 | FSTProcessor::biltransfull(UStringView input_word, bool with_delim) |
1744 | { |
1745 | UString result; |
1746 | unsigned int start_point = 1; |
1747 | unsigned int end_point = input_word.size()-2; |
1748 | UString queue; |
1749 | bool mark = false; |
1750 | |
1751 | if(with_delim == false) |
1752 | { |
1753 | start_point = 0; |
1754 | end_point = input_word.size()-1; |
1755 | } |
1756 | |
1757 | if(input_word[start_point] == '*') |
1758 | { |
1759 | return US(input_word); |
1760 | } |
1761 | |
1762 | if(input_word[start_point] == '=') |
1763 | { |
1764 | start_point++; |
1765 | mark = true; |
1766 | } |
1767 | |
1768 | auto word = input_word.substr(start_point, end_point-start_point); |
1769 | bool exists = step_biltrans(word, result, queue, with_delim, mark); |
1770 | if (!exists) { |
1771 | if (with_delim) return "^@"_u + US(input_word.substr(1)); |
1772 | else return "@"_u + US(input_word); |
1773 | } |
1774 | |
1775 | if(start_point < (end_point - 3)) |
1776 | { |
1777 | return "^$"_u; |
1778 | } |
1779 | // attach unmatched queue automatically |
1780 | |
1781 | if(!queue.empty()) |
1782 | { |
1783 | UString result_with_queue = compose(result, queue); |
1784 | if(with_delim) |
1785 | { |
1786 | result_with_queue += '$'; |
1787 | } |
1788 | return result_with_queue; |
1789 | } |
1790 | else |
1791 | { |
1792 | if(with_delim) |
1793 | { |
1794 | result += '$'; |
1795 | } |
1796 | return result; |
1797 | } |
1798 | } |
1799 | |
1800 | |
1801 | |
1802 | UString |
1803 | FSTProcessor::biltrans(UStringView input_word, bool with_delim) |
1804 | { |
1805 | State current_state = initial_state; |
1806 | UString result; |
1807 | unsigned int start_point = 1; |
1808 | unsigned int end_point = input_word.size()-2; |
1809 | UString queue; |
1810 | bool mark = false; |
1811 | |
1812 | if(with_delim == false) |
1813 | { |
1814 | start_point = 0; |
1815 | end_point = input_word.size()-1; |
1816 | } |
1817 | |
1818 | if(input_word[start_point] == '*') |
1819 | { |
1820 | return US(input_word); |
1821 | } |
1822 | |
1823 | if(input_word[start_point] == '=') |
1824 | { |
1825 | start_point++; |
1826 | mark = true; |
1827 | } |
1828 | |
1829 | UStringView word = input_word.substr(start_point, end_point-start_point); |
1830 | bool exists = step_biltrans(word, result, queue, with_delim, mark); |
1831 | if (!exists) { |
1832 | if (with_delim) return "^@"_u + US(input_word.substr(1)); |
1833 | else return "@"_u + US(input_word); |
1834 | } |
1835 | |
1836 | // attach unmatched queue automatically |
1837 | |
1838 | if(!queue.empty()) |
1839 | { |
1840 | UString result_with_queue = compose(result, queue); |
1841 | if(with_delim) |
1842 | { |
1843 | result_with_queue += '$'; |
1844 | } |
1845 | return result_with_queue; |
1846 | } |
1847 | else |
1848 | { |
1849 | if(with_delim) |
1850 | { |
1851 | result += '$'; |
1852 | } |
1853 | return result; |
1854 | } |
1855 | } |
1856 | |
1857 | UString |
1858 | FSTProcessor::compose(UStringView lexforms, UStringView queue) const |
1859 | { |
1860 | UString result; |
1861 | result.reserve(lexforms.size() + 2 * queue.size()); |
1862 | result += '/'; |
1863 | |
1864 | for(unsigned int i = 1; i< lexforms.size(); i++) |
1865 | { |
1866 | if(lexforms[i] == '\\') |
1867 | { |
1868 | result += '\\'; |
1869 | i++; |
1870 | } |
1871 | else if(lexforms[i] == '/') |
1872 | { |
1873 | result.append(queue); |
1874 | } |
1875 | result += lexforms[i]; |
1876 | } |
1877 | |
1878 | result += queue; |
1879 | return result; |
1880 | } |
1881 | |
1882 | void |
1883 | FSTProcessor::skipToNextWord(InputFile& input, UFILE* output) |
1884 | { |
1885 | int blank_depth = 0; |
1886 | |
1887 | while (!input.eof()) { |
1888 | UChar32 c = input.get(); |
1889 | |
1890 | switch (c) { |
1891 | case '^': |
1892 | if (blank_depth == 0) { |
1893 | input.unget(c); |
1894 | return; |
1895 | } else { |
1896 | u_fputcu_fputc_72(c, output); |
1897 | } |
1898 | break; |
1899 | case '\\': |
1900 | u_fputcu_fputc_72(c, output); |
1901 | c = input.get(); |
1902 | u_fputcu_fputc_72(c, output); |
1903 | break; |
1904 | case '\0': |
1905 | u_fputcu_fputc_72(c, output); |
1906 | u_fflushu_fflush_72(output); |
1907 | break; |
1908 | case U_EOF0xFFFF: |
1909 | break; |
1910 | case '[': |
1911 | blank_depth++; |
1912 | u_fputcu_fputc_72(c, output); |
1913 | break; |
1914 | case ']': |
1915 | if (blank_depth > 0) blank_depth--; |
1916 | u_fputcu_fputc_72(c, output); |
1917 | break; |
1918 | default: |
1919 | u_fputcu_fputc_72(c, output); |
1920 | } |
1921 | } |
1922 | } |
1923 | |
1924 | UChar32 |
1925 | FSTProcessor::skipReading(InputFile& input, UFILE* output) |
1926 | { |
1927 | UChar32 c = U_EOF0xFFFF; |
1928 | while (!input.eof()) { |
1929 | c = input.get(); |
1930 | if (output != nullptr) { |
1931 | switch (c) { |
1932 | case '\\': |
1933 | u_fputcu_fputc_72(c, output); |
1934 | u_fputcu_fputc_72(input.get(), output); |
1935 | break; |
1936 | case '<': |
1937 | write(input.readBlock('<', '>'), output); |
1938 | break; |
1939 | case '/': |
1940 | case '$': |
1941 | u_fputcu_fputc_72(c, output); |
1942 | break; |
1943 | default: |
1944 | if (isEscaped(c)) u_fputcu_fputc_72('\\', output); |
1945 | u_fputcu_fputc_72(c, output); |
1946 | } |
1947 | } else { |
1948 | switch (c) { |
1949 | case '\\': |
1950 | input.get(); |
1951 | break; |
1952 | case '<': |
1953 | input.readBlock('<', '>'); |
1954 | break; |
1955 | } |
1956 | } |
1957 | if (c == '/' || c == '$' || c == '\0') break; |
1958 | } |
1959 | return c; |
1960 | } |
1961 | |
1962 | void |
1963 | FSTProcessor::nextBilingualWord(InputFile& input, UFILE* output, |
1964 | std::vector<int32_t>& symbols, |
1965 | GenerationMode mode) |
1966 | { |
1967 | symbols.clear(); |
1968 | |
1969 | skipToNextWord(input, output); |
1970 | |
1971 | if (input.eof()) return; |
1972 | |
1973 | u_fputcu_fputc_72(input.get(), output); // ^ |
1974 | |
1975 | UChar32 c = '/'; |
1976 | |
1977 | if (biltransSurfaceFormsKeep) { |
1978 | c = skipReading(input, output); |
1979 | } else if (biltransSurfaceForms) { |
1980 | c = skipReading(input, nullptr); |
1981 | } |
1982 | if (c != '/') { |
1983 | nextBilingualWord(input, output, symbols, mode); |
1984 | return; |
1985 | } |
1986 | |
1987 | bool unknown = false; |
1988 | |
1989 | if (input.peek() == '*') { |
1990 | input.get(); |
1991 | unknown = true; |
1992 | } |
1993 | |
1994 | while (!input.eof()) { |
1995 | c = input.get(); |
1996 | switch (c) { |
1997 | case '\\': |
1998 | symbols.push_back(input.get()); |
1999 | break; |
2000 | case '\0': |
2001 | case '/': |
2002 | case '$': |
2003 | break; |
2004 | case '<': |
2005 | { |
2006 | UString symbol = input.readBlock('<', '>'); |
2007 | alphabet.includeSymbol(symbol); |
2008 | symbols.push_back(alphabet(symbol)); |
2009 | } |
2010 | break; |
2011 | default: |
2012 | symbols.push_back(c); |
2013 | } |
2014 | if (c == '\0' || c == '/' || c == '$') break; |
2015 | } |
2016 | |
2017 | while (c == '/') c = skipReading(input, nullptr); |
2018 | |
2019 | if (c == '\0' || unknown) { |
2020 | UString in_str; |
2021 | for (auto& s : symbols) { |
2022 | if (isEscaped(s)) in_str += '\\'; |
2023 | alphabet.getSymbol(in_str, s); |
2024 | } |
2025 | symbols.clear(); |
2026 | if (c == '\0') { |
2027 | write(in_str, output); |
2028 | u_fflushu_fflush_72(output); |
2029 | } else { |
2030 | u_fputcu_fputc_72('*', output); |
2031 | write(in_str, output); |
2032 | u_fputcu_fputc_72('/', output); |
2033 | if (mode != gm_clean) u_fputcu_fputc_72('*', output); |
2034 | write(in_str, output); |
2035 | u_fputcu_fputc_72('$', output); |
2036 | } |
2037 | nextBilingualWord(input, output, symbols, mode); |
2038 | return; |
2039 | } |
2040 | } |
2041 | |
2042 | void |
2043 | FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode) |
2044 | { |
2045 | std::vector<int32_t> symbols; |
2046 | while (!input.eof()) { |
2047 | nextBilingualWord(input, output, symbols, mode); |
2048 | if (symbols.empty()) continue; |
2049 | |
2050 | State current_state = initial_state; |
2051 | |
2052 | bool firstupper = (symbols[0] > 0 && u_isupperu_isupper_72(symbols[0])); |
2053 | bool uppercase = (firstupper && symbols.size() > 1 && |
2054 | symbols[1] > 0 && u_isupperu_isupper_72(symbols[1])); |
2055 | |
2056 | bool seenTags = false; |
2057 | size_t queue_start = 0; |
2058 | UString result; |
2059 | for (size_t i = 0; i < symbols.size(); i++) { |
2060 | seenTags = seenTags || alphabet.isTag(symbols[i]); |
2061 | current_state.step_case(symbols[i], beCaseSensitive(current_state)); |
2062 | if (current_state.isFinal(all_finals)) { |
2063 | queue_start = i; |
2064 | result = current_state.filterFinals(all_finals, alphabet, escaped_chars, |
2065 | displayWeightsMode, maxAnalyses, |
2066 | maxWeightClasses, uppercase, |
2067 | firstupper, 0); |
2068 | } |
2069 | } |
2070 | // if there are no tags, we only return complete matches |
2071 | if (!seenTags && queue_start + 1 < symbols.size()) result.clear(); |
2072 | |
2073 | UString source; |
2074 | size_t queue_pos = 0; |
2075 | for (size_t i = 0; i < symbols.size(); i++) { |
2076 | if (isEscaped(symbols[i]) || (i == 0 && symbols[i] == '*')) source += '\\'; |
2077 | alphabet.getSymbol(source, symbols[i]); |
2078 | if (i == queue_start) queue_pos = source.size(); |
2079 | } |
2080 | |
2081 | write(source, output); |
2082 | |
2083 | if (!result.empty()) { |
2084 | write(compose(result, source.substr(queue_pos)), output); |
2085 | } else { |
2086 | u_fputcu_fputc_72('/', output); |
2087 | u_fputcu_fputc_72((mode == gm_all ? '#' : '@'), output); |
2088 | write(source, output); |
2089 | } |
2090 | u_fputcu_fputc_72('$', output); |
2091 | } |
2092 | } |
2093 | |
2094 | std::pair<UString, int> |
2095 | FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim) |
2096 | { |
2097 | State current_state = initial_state; |
2098 | UString result; |
2099 | unsigned int start_point = 1; |
2100 | unsigned int end_point = input_word.size()-2; |
2101 | UString queue; |
2102 | bool mark = false; |
2103 | bool seentags = false; // have we seen any tags at all in the analysis? |
2104 | |
2105 | if(with_delim == false) |
2106 | { |
2107 | start_point = 0; |
2108 | end_point = input_word.size()-1; |
2109 | } |
2110 | |
2111 | if(input_word[start_point] == '*') |
2112 | { |
2113 | return {US(input_word), 0}; |
2114 | } |
2115 | |
2116 | if(input_word[start_point] == '=') |
2117 | { |
2118 | start_point++; |
2119 | mark = true; |
2120 | } |
2121 | |
2122 | bool firstupper = u_isupperu_isupper_72(input_word[start_point]); |
2123 | bool uppercase = firstupper && u_isupperu_isupper_72(input_word[start_point+1]); |
2124 | |
2125 | UStringView word = input_word.substr(start_point, end_point-start_point); |
2126 | for (auto symbol : symbol_iter(word)) { |
2127 | int32_t val; |
2128 | if (symbol.size() == 1) { |
2129 | val = symbol[0]; |
2130 | } else { |
2131 | val = alphabet(symbol); |
2132 | seentags = true; |
2133 | } |
2134 | if(current_state.size() != 0) |
2135 | { |
2136 | current_state.step_case(val, beCaseSensitive(current_state)); |
2137 | } |
2138 | if(current_state.isFinal(all_finals)) |
2139 | { |
2140 | result.clear(); |
2141 | if (with_delim) { |
2142 | result += '^'; |
2143 | } |
2144 | if (mark) { |
2145 | result += '='; |
2146 | } |
2147 | result += current_state.filterFinals(all_finals, alphabet, |
2148 | escaped_chars, |
2149 | displayWeightsMode, maxAnalyses, maxWeightClasses, |
2150 | uppercase, firstupper, 0).substr(1); |
2151 | } |
2152 | |
2153 | if(current_state.size() == 0) |
2154 | { |
2155 | if(!symbol.empty() && !result.empty()) |
2156 | { |
2157 | queue.append(symbol); |
2158 | } |
2159 | else |
2160 | { |
2161 | // word is not present |
2162 | if(with_delim) |
2163 | { |
2164 | result = "^@"_u + US(input_word.substr(1)); |
2165 | } |
2166 | else |
2167 | { |
2168 | result = "@"_u + US(input_word); |
2169 | } |
2170 | return std::pair<UString, int>(result, 0); |
2171 | } |
2172 | } |
2173 | } |
2174 | |
2175 | if (!seentags |
2176 | && current_state.filterFinals(all_finals, alphabet, escaped_chars, |
2177 | displayWeightsMode, maxAnalyses, maxWeightClasses, |
2178 | uppercase, firstupper, 0).empty()) |
2179 | { |
2180 | // word is not present |
2181 | if(with_delim) |
2182 | { |
2183 | result = "^@"_u + US(input_word.substr(1)); |
2184 | } |
2185 | else |
2186 | { |
2187 | result = "@"_u + US(input_word); |
2188 | } |
2189 | return {result, 0}; |
2190 | } |
2191 | |
2192 | |
2193 | |
2194 | // attach unmatched queue automatically |
2195 | |
2196 | if(!queue.empty()) |
2197 | { |
2198 | UString result_with_queue = compose(result, queue); |
2199 | if(with_delim) |
2200 | { |
2201 | result_with_queue += '$'; |
2202 | } |
2203 | return {result_with_queue, queue.size()}; |
2204 | } |
2205 | else |
2206 | { |
2207 | if(with_delim) |
2208 | { |
2209 | result += '$'; |
2210 | } |
2211 | return {result, 0}; |
2212 | } |
2213 | } |
2214 | |
2215 | UString |
2216 | FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim) |
2217 | { |
2218 | State current_state = initial_state; |
2219 | UString result; |
2220 | unsigned int start_point = 1; |
2221 | unsigned int end_point = input_word.size()-2; |
2222 | bool mark = false; |
2223 | |
2224 | if(with_delim == false) |
2225 | { |
2226 | start_point = 0; |
2227 | end_point = input_word.size()-1; |
2228 | } |
2229 | |
2230 | if(input_word[start_point] == '*') |
2231 | { |
2232 | return US(input_word); |
2233 | } |
2234 | |
2235 | if(input_word[start_point] == '=') |
2236 | { |
2237 | start_point++; |
2238 | mark = true; |
2239 | } |
2240 | |
2241 | auto word = input_word.substr(start_point, end_point-start_point); |
2242 | UString queue; |
2243 | bool exists = step_biltrans(word, result, queue, with_delim, mark); |
2244 | if (!exists || !queue.empty()) { |
2245 | if (with_delim) return "^@"_u + US(input_word.substr(1)); |
2246 | else return "@"_u + US(input_word); |
2247 | } |
2248 | |
2249 | if(with_delim) |
2250 | { |
2251 | result += '$'; |
2252 | } |
2253 | return result; |
2254 | } |
2255 | |
2256 | |
2257 | bool |
2258 | FSTProcessor::valid() const |
2259 | { |
2260 | if(initial_state.isFinal(all_finals)) |
2261 | { |
2262 | std::cerr << "Error: Invalid dictionary (hint: the left side of an entry is empty)" << std::endl; |
2263 | return false; |
2264 | } |
2265 | else |
2266 | { |
2267 | State s = initial_state; |
2268 | s.step(' '); |
2269 | if(s.size() != 0) |
2270 | { |
2271 | std::cerr << "Error: Invalid dictionary (hint: entry beginning with whitespace)" << std::endl; |
2272 | return false; |
2273 | } |
2274 | } |
2275 | |
2276 | return true; |
2277 | } |
2278 | |
2279 | int |
2280 | FSTProcessor::readSAO(InputFile& input) |
2281 | { |
2282 | if(!input_buffer.isEmpty()) |
2283 | { |
2284 | return input_buffer.next(); |
2285 | } |
2286 | |
2287 | UChar32 val = input.get(); |
2288 | if(input.eof()) |
2289 | { |
2290 | return 0; |
2291 | } |
2292 | |
2293 | if(escaped_chars.find(val) != escaped_chars.end()) |
2294 | { |
2295 | if(val == '<') |
2296 | { |
2297 | UString str = input.readBlock('<', '>'); |
2298 | if(StringUtils::startswith(str, u"<![CDATA[")) |
2299 | { |
2300 | while(!StringUtils::endswith(str, u"]]>")) |
2301 | { |
2302 | str.append(input.readBlock('<', '>').substr(1)); |
2303 | } |
2304 | blankqueue.push(str); |
2305 | input_buffer.add(static_cast<int32_t>(' ')); |
2306 | return static_cast<int32_t>(' '); |
2307 | } |
2308 | else |
2309 | { |
2310 | streamError(); |
2311 | } |
2312 | } |
2313 | else if (val == '\\') { |
2314 | val = input.get(); |
2315 | if(isEscaped(val)) |
2316 | { |
2317 | input_buffer.add(val); |
2318 | return static_cast<int32_t>(val); |
2319 | } |
2320 | else |
2321 | streamError(); |
2322 | } |
2323 | else |
2324 | { |
2325 | streamError(); |
2326 | } |
2327 | } |
2328 | |
2329 | input_buffer.add(static_cast<int32_t>(val)); |
2330 | return static_cast<int32_t>(val); |
2331 | } |
2332 | |
2333 | void |
2334 | FSTProcessor::printSAOWord(UStringView lf, UFILE *output) |
2335 | { |
2336 | for(unsigned int i = 1, limit = lf.size(); i != limit; i++) |
2337 | { |
2338 | if(lf[i] == '/') |
2339 | { |
2340 | break; |
2341 | } |
2342 | u_fputcu_fputc_72(lf[i], output); |
2343 | } |
2344 | } |
2345 | |
2346 | void |
2347 | FSTProcessor::SAO(InputFile& input, UFILE *output) |
2348 | { |
2349 | bool last_incond = false; |
2350 | bool last_postblank = false; |
2351 | State current_state = initial_state; |
2352 | UString lf; |
2353 | UString sf; |
2354 | int last = 0; |
2355 | |
2356 | escaped_chars.clear(); |
2357 | escaped_chars.insert('\\'); |
2358 | escaped_chars.insert('<'); |
2359 | escaped_chars.insert('>'); |
2360 | |
2361 | while(UChar32 val = readSAO(input)) |
2362 | { |
2363 | // test for final states |
2364 | if(current_state.isFinal(all_finals)) |
2365 | { |
2366 | if(current_state.isFinal(inconditional)) |
2367 | { |
2368 | bool firstupper = u_isupperu_isupper_72(sf[0]); |
2369 | bool uppercase = firstupper && u_isupperu_isupper_72(sf[sf.size()-1]); |
2370 | |
2371 | lf = current_state.filterFinalsSAO(all_finals, alphabet, |
2372 | escaped_chars, |
2373 | uppercase, firstupper); |
2374 | last_incond = true; |
2375 | last = input_buffer.getPos(); |
2376 | } |
2377 | else if(current_state.isFinal(postblank)) |
2378 | { |
2379 | bool firstupper = u_isupperu_isupper_72(sf[0]); |
2380 | bool uppercase = firstupper && u_isupperu_isupper_72(sf[sf.size()-1]); |
2381 | |
2382 | lf = current_state.filterFinalsSAO(all_finals, alphabet, |
2383 | escaped_chars, |
2384 | uppercase, firstupper); |
2385 | last_postblank = true; |
2386 | last = input_buffer.getPos(); |
2387 | } |
2388 | else if(!isAlphabetic(val)) |
2389 | { |
2390 | bool firstupper = u_isupperu_isupper_72(sf[0]); |
2391 | bool uppercase = firstupper && u_isupperu_isupper_72(sf[sf.size()-1]); |
2392 | |
2393 | lf = current_state.filterFinalsSAO(all_finals, alphabet, |
2394 | escaped_chars, |
2395 | uppercase, firstupper); |
2396 | last_postblank = false; |
2397 | last_incond = false; |
2398 | last = input_buffer.getPos(); |
2399 | } |
2400 | } |
2401 | else if(sf.empty() && u_isspaceu_isspace_72(val)) |
2402 | { |
2403 | lf = "/*"_u; |
2404 | lf.append(sf); |
2405 | last_postblank = false; |
2406 | last_incond = false; |
2407 | last = input_buffer.getPos(); |
2408 | } |
2409 | |
2410 | current_state.step_case(val, beCaseSensitive(current_state)); |
2411 | |
2412 | if(current_state.size() != 0) |
2413 | { |
2414 | alphabet.getSymbol(sf, val); |
2415 | } |
2416 | else |
2417 | { |
2418 | if(!isAlphabetic(val) && sf.empty()) |
2419 | { |
2420 | if(u_isspaceu_isspace_72(val)) |
2421 | { |
2422 | printSpace(val, output); |
2423 | } |
2424 | else |
2425 | { |
2426 | if(isEscaped(val)) |
2427 | { |
2428 | u_fputcu_fputc_72('\\', output); |
2429 | } |
2430 | u_fputcu_fputc_72(val, output); |
2431 | } |
2432 | } |
2433 | else if(last_incond) |
2434 | { |
2435 | printSAOWord(lf, output); |
2436 | input_buffer.setPos(last); |
2437 | input_buffer.back(1); |
2438 | } |
2439 | else if(last_postblank) |
2440 | { |
2441 | printSAOWord(lf, output); |
2442 | u_fputcu_fputc_72(' ', output); |
2443 | input_buffer.setPos(last); |
2444 | input_buffer.back(1); |
2445 | } |
2446 | else if(isAlphabetic(val) && |
2447 | ((sf.size()-input_buffer.diffPrevPos(last)) > lastBlank(sf) || |
2448 | lf.empty())) |
2449 | { |
2450 | do |
2451 | { |
2452 | alphabet.getSymbol(sf, val); |
2453 | } |
2454 | while((val = readSAO(input)) && isAlphabetic(val)); |
2455 | |
2456 | auto limit = firstNotAlpha(sf); |
2457 | unsigned int size = sf.size(); // TODO: change these to character counts |
2458 | input_buffer.back(1+(size-limit.i_utf16)); |
2459 | u_fprintfu_fprintf_72(output, "<d>%S</d>", sf.c_str()); |
2460 | } |
2461 | else if(lf.empty()) |
2462 | { |
2463 | auto limit = firstNotAlpha(sf); |
2464 | unsigned int size = sf.size(); // TODO: change these to character counts |
2465 | input_buffer.back(1+(size-limit.i_utf16)); |
2466 | u_fprintfu_fprintf_72(output, "<d>%S</d>", sf.c_str()); |
2467 | } |
2468 | else |
2469 | { |
2470 | printSAOWord(lf, output); |
2471 | input_buffer.setPos(last); |
2472 | input_buffer.back(1); |
2473 | } |
2474 | |
2475 | current_state = initial_state; |
2476 | lf.clear(); |
2477 | sf.clear(); |
2478 | last_incond = false; |
2479 | last_postblank = false; |
2480 | } |
2481 | } |
2482 | |
2483 | // print remaining blanks |
2484 | flushBlanks(output); |
2485 | } |
2486 | |
2487 | UStringView |
2488 | FSTProcessor::removeTags(UStringView str) |
2489 | { |
2490 | for(unsigned int i = 0; i < str.size(); i++) |
2491 | { |
2492 | if(str[i] == '<' && i >=1 && str[i-1] != '\\') |
2493 | { |
2494 | return str.substr(0, i); |
2495 | } |
2496 | } |
2497 | |
2498 | return str; |
2499 | } |
2500 | |
2501 | |
2502 | void |
2503 | FSTProcessor::setBiltransSurfaceForms(bool value) |
2504 | { |
2505 | biltransSurfaceForms = value; |
2506 | } |
2507 | |
2508 | void |
2509 | FSTProcessor::setBiltransSurfaceFormsKeep(bool value) |
2510 | { |
2511 | biltransSurfaceFormsKeep = value; |
2512 | } |
2513 | |
2514 | void |
2515 | FSTProcessor::setCaseSensitiveMode(bool value) |
2516 | { |
2517 | caseSensitive = value; |
2518 | } |
2519 | |
2520 | void |
2521 | FSTProcessor::setDictionaryCaseMode(bool value) |
2522 | { |
2523 | dictionaryCase = value; |
2524 | } |
2525 | |
2526 | void |
2527 | FSTProcessor::setNullFlush(bool value) |
2528 | { |
2529 | nullFlush = value; |
2530 | } |
2531 | |
2532 | void |
2533 | FSTProcessor::setIgnoredChars(bool value) |
2534 | { |
2535 | useIgnoredChars = value; |
2536 | } |
2537 | |
2538 | void |
2539 | FSTProcessor::setRestoreChars(bool value) |
2540 | { |
2541 | useRestoreChars = value; |
2542 | } |
2543 | |
2544 | void |
2545 | FSTProcessor::setUseDefaultIgnoredChars(bool value) |
2546 | { |
2547 | useDefaultIgnoredChars = value; |
2548 | } |
2549 | |
2550 | void |
2551 | FSTProcessor::setDisplayWeightsMode(bool value) |
2552 | { |
2553 | displayWeightsMode = value; |
2554 | } |
2555 | |
2556 | void |
2557 | FSTProcessor::setMaxAnalysesValue(int value) |
2558 | { |
2559 | maxAnalyses = value; |
2560 | } |
2561 | |
2562 | void |
2563 | FSTProcessor::setMaxWeightClassesValue(int value) |
2564 | { |
2565 | maxWeightClasses = value; |
2566 | } |
2567 | |
2568 | void |
2569 | FSTProcessor::setCompoundMaxElements(int value) |
2570 | { |
2571 | compound_max_elements = value; |
2572 | } |
2573 | |
2574 | bool |
2575 | FSTProcessor::getDecompoundingMode() |
2576 | { |
2577 | return do_decomposition; |
2578 | } |
2579 | |
2580 | bool |
2581 | FSTProcessor::getNullFlush() |
2582 | { |
2583 | return nullFlush; |
2584 | } |
2585 | |
2586 | FSTProcessor::Indices |
2587 | FSTProcessor::firstNotAlpha(UStringView sf) |
2588 | { |
2589 | FSTProcessor::Indices ix = { 0, 0 }; |
2590 | UCharCharacterIterator it = UCharCharacterIterator(sf.data(), sf.size()); |
2591 | while (it.hasNext()) { |
2592 | UChar32 c = it.next32PostInc(); |
2593 | if(!isAlphabetic(c)) |
2594 | { |
2595 | return ix; |
2596 | } |
2597 | ix.i_codepoint++; |
2598 | ix.i_utf16++; |
2599 | if(c > UINT16_MAX(65535)) { |
2600 | ix.i_utf16++; |
2601 | } |
2602 | } |
2603 | return ix; |
2604 | } |