clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name tagger.cc -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -resource-dir /usr/lib/llvm-16/lib/clang/16 -D HAVE_CONFIG_H -I . -I .. -I /usr/include/utf8cpp/ -I /usr/local/include -I /usr/include/libxml2 -I /usr/local/include -D PIC -internal-isystem /usr/lib/llvm-16/bin/../include/c++/v1 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -std=c++2b -fdeprecated-macro -fdebug-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -ferror-limit 19 -fgnuc-version=4.2.1 -fno-implicit-modules -fcxx-exceptions -fexceptions -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/apertium/scan-build/2024-09-11-155328-205384-1 -x c++ tagger.cc
1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | #include <apertium/tagger.h> |
17 | |
18 | #include "apertium_config.h" |
19 | |
20 | #include "align.h" |
21 | #include <lttoolbox/exception.h> |
22 | #include "exception.h" |
23 | #include "linebreak.h" |
24 | #include "unigram_tagger.h" |
25 | #include <apertium/perceptron_tagger.h> |
26 | #include <apertium/hmm.h> |
27 | #include <apertium/lswpost.h> |
28 | #include <apertium/tagger_word.h> |
29 | #include <apertium/shell_utils.h> |
30 | |
31 | #include <lttoolbox/lt_locale.h> |
32 | |
33 | #include "getopt_long.h" |
34 | #include <cerrno> |
35 | #include <cstdio> |
36 | #include <cstdlib> |
37 | #include <cstring> |
38 | #include <fstream> |
39 | #include <iomanip> |
40 | #include <ios> |
41 | #include <iostream> |
42 | #include <locale> |
43 | #include <sstream> |
44 | #include <string> |
45 | #include <unistd.h> |
46 | |
47 | namespace Apertium { |
48 | using namespace ShellUtils; |
49 | using namespace tagger_utils; |
50 | |
51 | |
52 | |
53 | apertium_tagger::apertium_tagger(int &argc, char **&argv) |
54 | : argc(argc), argv(argv), The_val(), nonoptarg(), |
55 | |
56 | The_indexptr(), FunctionTypeTypeOption_indexptr(), |
57 | FunctionTypeOption_indexptr(), |
58 | |
59 | TheFunctionTypeType(), TheUnigramType(), TheFunctionType(), |
60 | TheFunctionTypeOptionArgument(0), TheFlags() { |
61 | try { |
62 | |
63 | optind = 1; |
64 | while (true) { |
| 1 | Loop condition is true. Entering loop body | |
|
65 | The_val = getopt_long(argc, argv, "bdfegmpr:s:t:u:wxz", longopts, &The_indexptr); |
66 | |
67 | if (The_val == -1) |
| 2 | | Assuming the condition is true | |
|
| |
68 | break; |
69 | |
70 | set_indexptr(); |
71 | |
72 | switch (The_val) { |
73 | case 'b': |
74 | flagOptionCase(&TaggerFlags::getSentSeg, |
75 | &TaggerFlags::setSentSeg); |
76 | break; |
77 | case 'd': |
78 | flagOptionCase(&TaggerFlags::getDebug, |
79 | &TaggerFlags::setDebug); |
80 | break; |
81 | case 'e': |
82 | flagOptionCase(&TaggerFlags::getSkipErrors, |
83 | &TaggerFlags::setSkipErrors); |
84 | break; |
85 | case 'f': |
86 | flagOptionCase(&TaggerFlags::getFirst, |
87 | &TaggerFlags::setFirst); |
88 | break; |
89 | case 'm': |
90 | flagOptionCase(&TaggerFlags::getMark, |
91 | &TaggerFlags::setMark); |
92 | break; |
93 | case 'p': |
94 | flagOptionCase(&TaggerFlags::getShowSuperficial, |
95 | &TaggerFlags::setShowSuperficial); |
96 | break; |
97 | case 'z': |
98 | flagOptionCase(&TaggerFlags::getNullFlush, |
99 | &TaggerFlags::setNullFlush); |
100 | break; |
101 | case 'u': |
102 | functionTypeTypeOptionCase(Unigram); |
103 | |
104 | if (std::strncmp(optarg, "1", sizeof "1" - 1) == 0) { |
105 | TheUnigramType = Stream_5_3_1; |
106 | break; |
107 | } |
108 | |
109 | if (std::strncmp(optarg, "2", sizeof "2" - 1) == 0) { |
110 | TheUnigramType = Stream_5_3_2; |
111 | break; |
112 | } |
113 | |
114 | if (std::strncmp(optarg, "3", sizeof "3" - 1) == 0) { |
115 | TheUnigramType = Stream_5_3_3; |
116 | break; |
117 | } |
118 | |
119 | { |
120 | std::stringstream what_; |
121 | what_ << "invalid argument '" << optarg << "' for '--unigram'\n" |
122 | "Valid arguments are:\n" |
123 | " - '1'\n" |
124 | " - '2'\n" |
125 | " - '3'"; |
126 | throw Exception::apertium_tagger::InvalidArgument(what_); |
127 | } |
128 | break; |
129 | case 'w': |
130 | functionTypeTypeOptionCase(SlidingWindow); |
131 | break; |
132 | case 'x': |
133 | functionTypeTypeOptionCase(Perceptron); |
134 | break; |
135 | case 'g': |
136 | functionTypeOptionCase(Tagger); |
137 | break; |
138 | case 'r': |
139 | functionTypeOptionCase(Retrain); |
140 | getIterationsArgument(); |
141 | break; |
142 | case 's': |
143 | functionTypeOptionCase(Supervised); |
144 | getIterationsArgument(); |
145 | break; |
146 | case 't': |
147 | functionTypeOptionCase(Train); |
148 | getIterationsArgument(); |
149 | break; |
150 | case 'h': |
151 | help(); |
152 | return; |
153 | default: |
154 | throw Exception::apertium_tagger::err_Exception(""); |
155 | } |
156 | } |
157 | |
158 | if (!TheFunctionType) { |
| 4 | | Execution continues on line 158 | |
|
| |
159 | help(); |
160 | return; |
161 | } |
162 | |
163 | nonoptarg = argc - optind; |
164 | switch (*TheFunctionType) { |
| 6 | | Control jumps to 'case Supervised:' at line 229 | |
|
165 | case Tagger: |
166 | if (!TheFunctionTypeType) { |
167 | try { |
168 | PerceptronTagger percep(TheFlags); |
169 | g_StreamTagger(percep); |
170 | } catch (DeserialisationException) { |
171 | HMM HiddenMarkovModelTagger_(TheFlags); |
172 | g_FILE_Tagger(HiddenMarkovModelTagger_); |
173 | } |
174 | break; |
175 | } |
176 | switch (*TheFunctionTypeType) { |
177 | case Unigram: { |
178 | UnigramTagger UnigramTagger_(TheFlags); |
179 | switch (*TheUnigramType) { |
180 | case Stream_5_3_1: |
181 | UnigramTagger_.setModel(UnigramTaggerModel1); |
182 | break; |
183 | case Stream_5_3_2: |
184 | UnigramTagger_.setModel(UnigramTaggerModel2); |
185 | break; |
186 | case Stream_5_3_3: |
187 | UnigramTagger_.setModel(UnigramTaggerModel3); |
188 | break; |
189 | default: |
190 | std::abort(); |
191 | } |
192 | g_StreamTagger(UnigramTagger_); |
193 | } break; |
194 | case SlidingWindow: { |
195 | LSWPoST SlidingWindowTagger_(TheFlags); |
196 | g_FILE_Tagger(SlidingWindowTagger_); |
197 | } break; |
198 | case Perceptron: { |
199 | PerceptronTagger perceptron(TheFlags); |
200 | g_StreamTagger(perceptron); |
201 | } break; |
202 | default: |
203 | std::abort(); |
204 | } |
205 | |
206 | break; |
207 | case Retrain: |
208 | if (!TheFunctionTypeType) { |
209 | HMM HiddenMarkovModelTagger_(TheFlags); |
210 | r_FILE_Tagger(HiddenMarkovModelTagger_); |
211 | break; |
212 | } |
213 | |
214 | switch (*TheFunctionTypeType) { |
215 | case Unigram: { |
216 | std::stringstream what_; |
217 | what_ << "invalid option -- 'u'"; |
218 | throw Exception::apertium_tagger::InvalidOption(what_); |
219 | } |
220 | case SlidingWindow: { |
221 | LSWPoST SlidingWindowTagger_(TheFlags); |
222 | r_FILE_Tagger(SlidingWindowTagger_); |
223 | } break; |
224 | default: |
225 | std::abort(); |
226 | } |
227 | |
228 | break; |
229 | case Supervised: |
230 | if (!TheFunctionTypeType) { |
| |
231 | HMM HiddenMarkovModelTagger_(TheFlags); |
232 | s_FILE_Tagger(HiddenMarkovModelTagger_); |
| 8 | | Calling 'apertium_tagger::s_FILE_Tagger' | |
|
233 | break; |
234 | } |
235 | |
236 | switch (*TheFunctionTypeType) { |
237 | case Unigram: { |
238 | UnigramTagger UnigramTagger_(TheFlags); |
239 | switch (*TheUnigramType) { |
240 | case Stream_5_3_1: |
241 | UnigramTagger_.setModel(UnigramTaggerModel1); |
242 | break; |
243 | case Stream_5_3_2: |
244 | UnigramTagger_.setModel(UnigramTaggerModel2); |
245 | break; |
246 | case Stream_5_3_3: |
247 | UnigramTagger_.setModel(UnigramTaggerModel3); |
248 | break; |
249 | default: |
250 | std::abort(); |
251 | } |
252 | s_StreamTaggerTrainer(UnigramTagger_); |
253 | } break; |
254 | case SlidingWindow: { |
255 | std::stringstream what_; |
256 | what_ << "invalid option -- 'w'"; |
257 | throw Exception::apertium_tagger::InvalidOption(what_); |
258 | } break; |
259 | case Perceptron: { |
260 | PerceptronTagger perceptron(TheFlags); |
261 | s_StreamTaggerTrainer(perceptron); |
262 | } break; |
263 | default: |
264 | std::abort(); |
265 | } |
266 | |
267 | break; |
268 | case Train: |
269 | if (!TheFunctionTypeType) { |
270 | HMM HiddenMarkovModelTagger_(TheFlags); |
271 | t_FILE_Tagger(HiddenMarkovModelTagger_); |
272 | break; |
273 | } |
274 | |
275 | switch (*TheFunctionTypeType) { |
276 | case Unigram: { |
277 | std::stringstream what_; |
278 | what_ << "invalid option -- 'u'"; |
279 | throw Exception::apertium_tagger::InvalidOption(what_); |
280 | } |
281 | case SlidingWindow: { |
282 | LSWPoST SlidingWindowTagger_(TheFlags); |
283 | t_FILE_Tagger(SlidingWindowTagger_); |
284 | } break; |
285 | default: |
286 | std::abort(); |
287 | } |
288 | |
289 | break; |
290 | default: |
291 | std::abort(); |
292 | } |
293 | } catch (const ExceptionType &ExceptionType_) { |
294 | std::cerr << "apertium-tagger: " << ExceptionType_.what() << std::endl; |
295 | throw Exception::apertium_tagger::err_Exception(""); |
296 | } |
297 | } |
298 | |
299 | apertium_tagger::~apertium_tagger() {} |
300 | |
301 | void apertium_tagger::help() { |
302 | |
303 | std::cerr << |
304 | "Usage: apertium-tagger [OPTION]... -g SERIALISED_TAGGER \\\n" |
305 | " [INPUT \\\n" |
306 | " [OUTPUT]]\n" |
307 | "\n" |
308 | " or: apertium-tagger [OPTION]... -r ITERATIONS \\\n" |
309 | " CORPUS \\\n" |
310 | " SERIALISED_TAGGER\n" |
311 | "\n" |
312 | " or: apertium-tagger [OPTION]... -s ITERATIONS \\\n" |
313 | " DICTIONARY \\\n" |
314 | " CORPUS \\\n" |
315 | " TAGGER_SPECIFICATION \\\n" |
316 | " SERIALISED_TAGGER \\\n" |
317 | " TAGGED_CORPUS \\\n" |
318 | " UNTAGGED_CORPUS\n" |
319 | "\n" |
320 | " or: apertium-tagger [OPTION]... -s 0 \\\n" |
321 | " DICTIONARY \\\n" |
322 | " TAGGER_SPECIFICATION \\\n" |
323 | " SERIALISED_TAGGER \\\n" |
324 | " TAGGED_CORPUS \\\n" |
325 | " UNTAGGED_CORPUS\n" |
326 | "\n" |
327 | " or: apertium-tagger [OPTION]... -s 0 \\\n" |
328 | " -u MODEL \\\n" |
329 | " SERIALISED_TAGGER \\\n" |
330 | " TAGGED_CORPUS\n" |
331 | "\n" |
332 | " or: apertium-tagger [OPTION]... -t ITERATIONS \\\n" |
333 | " DICTIONARY \\\n" |
334 | " CORPUS \\\n" |
335 | " TAGGER_SPECIFICATION \\\n" |
336 | " SERIALISED_TAGGER\n" |
337 | "\n" |
338 | "Mandatory arguments to long options are mandatory for short options too.\n" |
339 | "\n"; |
340 | |
341 | std::vector<std::pair<std::string, std::string> > options_description_; |
342 | options_description_.push_back(std::make_pair("-d, --debug", "with -g, print error messages about the input")); |
343 | options_description_.push_back(std::make_pair("-f, --first", "with -g, reorder each lexical unit's analyses so that the chosen one is first")); |
344 | options_description_.push_back(std::make_pair("-m, --mark", "with -g, mark disambiguated lexical units")); |
345 | options_description_.push_back(std::make_pair("-p, --show-superficial", "with -g, output each lexical unit's surface form")); |
346 | options_description_.push_back(std::make_pair("-z, --null-flush", "with -g, flush the output after getting each null character")); |
347 | align::align_(options_description_); |
348 | std::cerr << '\n'; |
349 | options_description_.clear(); |
350 | options_description_.push_back(std::make_pair("-u, --unigram=MODEL", "use unigram algorithm MODEL from <https://coltekin.net/cagri/papers/trmorph-tools.pdf>")); |
351 | align::align_(options_description_); |
352 | std::cerr << '\n'; |
353 | options_description_.clear(); |
354 | options_description_.push_back(std::make_pair("-w, --sliding-window", "use the Light Sliding Window algorithm")); |
355 | options_description_.push_back(std::make_pair("-x, --perceptron", "use the averaged perceptron algorithm")); |
356 | options_description_.push_back(std::make_pair("-e, --skip-on-error", "with -xs, ignore certain types of errors with the training corpus")); |
357 | align::align_(options_description_); |
358 | std::cerr << '\n'; |
359 | options_description_.clear(); |
360 | options_description_.push_back(std::make_pair("-g, --tagger", "disambiguate the input")); |
361 | align::align_(options_description_); |
362 | std::cerr << '\n'; |
363 | options_description_.clear(); |
364 | options_description_.push_back(std::make_pair("-r, --retrain=ITERATIONS", "with -u: exit;\notherwise: retrain the tagger with ITERATIONS unsupervised iterations")); |
365 | options_description_.push_back(std::make_pair("-s, --supervised=ITERATIONS", "with -u: train the tagger with a hand-tagged corpus;\nwith -w: exit;\notherwise: initialise the tagger with a hand-tagged corpus and retrain it with ITERATIONS unsupervised iterations")); |
366 | options_description_.push_back(std::make_pair("-t, --train=ITERATIONS", "with -u: exit;\notherwise: train the tagger with ITERATIONS unsupervised iterations")); |
367 | align::align_(options_description_); |
368 | std::cerr << '\n'; |
369 | options_description_.clear(); |
370 | options_description_.push_back(std::make_pair("-h, --help", "display this help and exit")); |
371 | align::align_(options_description_); |
372 | } |
373 | |
374 | const struct option apertium_tagger::longopts[] = { |
375 | {"help", no_argument, 0, 'h'}, |
376 | {"sent-seg", no_argument, 0, 'b'}, |
377 | {"debug", no_argument, 0, 'd'}, |
378 | {"skip-on-error", no_argument, 0, 'e'}, |
379 | {"first", no_argument, 0, 'f'}, |
380 | {"mark", no_argument, 0, 'm'}, |
381 | {"show-superficial", no_argument, 0, 'p'}, |
382 | {"null-flush", no_argument, 0, 'z'}, |
383 | {"unigram", required_argument, 0, 'u'}, |
384 | {"sliding-window", no_argument, 0, 'w'}, |
385 | {"perceptron", no_argument, 0, 'x'}, |
386 | {"tagger", no_argument, 0, 'g'}, |
387 | {"retrain", required_argument, 0, 'r'}, |
388 | {"supervised", required_argument, 0, 's'}, |
389 | {"train", required_argument, 0, 't'}, |
390 | {0, 0, 0, 0}}; |
391 | |
392 | |
393 | |
394 | std::string apertium_tagger::option_string(const int &indexptr_) { |
395 | return option_string(longopts[indexptr_]); |
396 | } |
397 | |
398 | std::string apertium_tagger::option_string(const struct option &option_) { |
399 | std::stringstream option_string_; |
400 | option_string_ << "--" << option_.name; |
401 | return option_string_.str(); |
402 | } |
403 | |
404 | void apertium_tagger::locale_global_() { |
405 | |
406 | #if defined __clang__ |
407 | |
408 | std::locale::global(std::locale("")); |
409 | |
410 | #else |
411 | #if defined __APPLE__ |
412 | |
413 | LtLocale::tryToSetLocale(); |
414 | |
415 | #else |
416 | |
417 | std::locale::global(std::locale("")); |
418 | |
419 | #endif // defined __APPLE__ |
420 | #endif // defined __clang__ |
421 | } |
422 | |
423 | void apertium_tagger::set_indexptr() { |
424 | if (The_val == longopts[The_indexptr].val) |
425 | return; |
426 | |
427 | for (std::size_t longopts_Index = 0; longopts[longopts_Index].val != 0; |
428 | ++longopts_Index) { |
429 | if (The_val == longopts[longopts_Index].val) { |
430 | The_indexptr = longopts_Index; |
431 | return; |
432 | } |
433 | } |
434 | } |
435 | |
436 | void apertium_tagger::flagOptionCase( |
437 | bool (TaggerFlags::*GetFlag)(), |
438 | void (TaggerFlags::*SetFlag)(const bool &)) { |
439 | if ((TheFlags.*GetFlag)()) { |
440 | std::stringstream what_; |
441 | what_ << "unexpected '" << option_string() << "' following '" |
442 | << option_string() << '\''; |
443 | throw Exception::apertium_tagger::UnexpectedFlagOption(what_); |
444 | } |
445 | |
446 | (TheFlags.*SetFlag)(true); |
447 | } |
448 | |
449 | std::string apertium_tagger::option_string() { |
450 | return option_string(The_indexptr); |
451 | } |
452 | |
453 | void apertium_tagger::functionTypeTypeOptionCase( |
454 | const FunctionTypeType &FunctionTypeType_) { |
455 | if (FunctionTypeTypeOption_indexptr) { |
456 | std::stringstream what_; |
457 | what_ << "unexpected '" << option_string() << "' following '" |
458 | << option_string(*FunctionTypeTypeOption_indexptr) |
459 | << '\''; |
460 | throw Exception::apertium_tagger::UnexpectedFunctionTypeTypeOption(what_); |
461 | } |
462 | |
463 | TheFunctionTypeType = FunctionTypeType_; |
464 | FunctionTypeTypeOption_indexptr = The_indexptr; |
465 | } |
466 | |
467 | void apertium_tagger::functionTypeOptionCase( |
468 | const FunctionType &FunctionType_) { |
469 | if (FunctionTypeOption_indexptr) { |
470 | std::stringstream what_; |
471 | what_ << "unexpected '" << option_string() << "' following '" |
472 | << option_string(*FunctionTypeOption_indexptr) |
473 | << '\''; |
474 | throw Exception::apertium_tagger::UnexpectedFunctionTypeOption(what_); |
475 | } |
476 | |
477 | TheFunctionType = FunctionType_; |
478 | FunctionTypeOption_indexptr = The_indexptr; |
479 | } |
480 | |
481 | void apertium_tagger::getIterationsArgument() { |
482 | try { |
483 | TheFunctionTypeOptionArgument = optarg_unsigned_long("ITERATIONS"); |
484 | } catch (const ExceptionType &ExceptionType_) { |
485 | std::stringstream what_; |
486 | what_ << "invalid argument '" << optarg << "' for '" << option_string() |
487 | << '\''; |
488 | throw Exception::apertium_tagger::InvalidArgument(what_); |
489 | } |
490 | } |
491 | |
492 | static unsigned long parse_unsigned_long(const char *metavar, const char *val) { |
493 | char *str_end; |
494 | errno = 0; |
495 | unsigned long N_0 = std::strtoul(val, &str_end, 10); |
496 | |
497 | if (*str_end != '\0') { |
498 | std::stringstream what_; |
499 | what_ << "can't convert " << metavar << " \"" << val << "\" to unsigned long"; |
500 | throw Exception::apertium_tagger::str_end_not_eq_NULL(what_); |
501 | } |
502 | |
503 | if (*val == '\0') { |
504 | std::stringstream what_; |
505 | what_ << "can't convert " << metavar << " of size 1 \"\" to unsigned long"; |
506 | throw Exception::apertium_tagger::optarg_eq_NULL(what_); |
507 | } |
508 | |
509 | if (errno == ERANGE) { |
510 | std::stringstream what_; |
511 | what_ << "can't convert " << metavar << " \"" << val |
512 | << "\" to unsigned long, not in unsigned long range"; |
513 | throw Exception::apertium_tagger::ERANGE_(what_); |
514 | } |
515 | |
516 | return N_0; |
517 | } |
518 | |
519 | unsigned long apertium_tagger::optarg_unsigned_long(const char *metavar) { |
520 | return parse_unsigned_long(metavar, optarg); |
521 | } |
522 | |
523 | void apertium_tagger::get_file_arguments( |
524 | bool get_crp_fn, |
525 | char **DicFn, char **CrpFn, |
526 | char **TaggedFn, char **UntaggedFn, |
527 | char **TsxFn, char **ProbFn) { |
528 | if (*TheFunctionType != Retrain) { |
| 14 | | Assuming the condition is true | |
|
| |
529 | *DicFn = argv[optind++]; |
530 | } |
531 | if (get_crp_fn) { |
| |
532 | *CrpFn = argv[optind++]; |
533 | } |
534 | if (*TheFunctionType == Supervised) { |
| 17 | | Assuming the condition is false | |
|
| |
535 | *TsxFn = argv[optind++]; |
536 | *ProbFn = argv[optind++]; |
537 | *TaggedFn = argv[optind++]; |
538 | } |
539 | *UntaggedFn = argv[optind++]; |
540 | if (*TheFunctionType == Supervised && !get_crp_fn) { |
541 | *CrpFn = *UntaggedFn; |
542 | } |
543 | if (*TheFunctionType != Supervised) { |
| |
544 | if (*TheFunctionType != Retrain) { |
| |
545 | *TsxFn = argv[optind++]; |
546 | } |
547 | *ProbFn = argv[optind++]; |
548 | } |
549 | } |
| 21 | | Returning without writing to '*TaggedFn' | |
|
550 | |
551 | void apertium_tagger::init_FILE_Tagger(FILE_Tagger &FILE_Tagger_, string const &TsxFn) { |
552 | FILE_Tagger_.deserialise(TsxFn); |
553 | TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); |
554 | } |
555 | |
556 | MorphoStream* apertium_tagger::setup_untagged_morpho_stream( |
557 | FILE_Tagger &FILE_Tagger_, |
558 | char *DicFn, char *UntaggedFn, |
559 | UFILE* *UntaggedCorpus) { |
560 | *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", UntaggedFn, "r"); |
561 | |
562 | FILE_Tagger_.read_dictionary(DicFn); |
563 | |
564 | return new FileMorphoStream(UntaggedFn, true, &FILE_Tagger_.get_tagger_data()); |
565 | } |
566 | |
567 | |
568 | |
569 | void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { |
570 | locale_global_(); |
571 | |
572 | expect_file_arguments(nonoptarg, 1, 4); |
573 | |
574 | std::ifstream SerialisedAnalysisFrequencies; |
575 | try_open_fstream("SERIALISED_TAGGER", argv[optind], |
576 | SerialisedAnalysisFrequencies); |
577 | |
578 | try { |
579 | StreamTagger_.deserialise(SerialisedAnalysisFrequencies); |
580 | } catch (const ExceptionType &ExceptionType_) { |
581 | std::stringstream what_; |
582 | what_ << "can't deserialise SERIALISED_TAGGER file \"" << argv[optind] |
583 | << "\" Reason: " << ExceptionType_.what(); |
584 | throw Exception::apertium_tagger::deserialise(what_); |
585 | } |
586 | |
587 | if (nonoptarg < 2) { |
588 | Stream Input(TheFlags); |
589 | StreamTagger_.tag(Input, std::cout); |
590 | return; |
591 | } |
592 | |
593 | Stream Input(TheFlags, argv[optind + 1]); |
594 | |
595 | if (nonoptarg < 3) { |
596 | StreamTagger_.tag(Input, std::cout); |
597 | return; |
598 | } |
599 | |
600 | std::ofstream Output_stream; |
601 | try_open_fstream("OUTPUT", argv[optind + 2], Output_stream); |
602 | |
603 | StreamTagger_.tag(Input, Output_stream); |
604 | } |
605 | |
606 | void apertium_tagger::s_StreamTaggerTrainer( |
607 | StreamTagger &StreamTaggerTrainer_) { |
608 | locale_global_(); |
609 | |
610 | if (TheFunctionTypeOptionArgument != 0 && *TheFunctionTypeType != Perceptron) { |
611 | std::stringstream what_; |
612 | what_ << "invalid argument '" << TheFunctionTypeOptionArgument |
613 | << "' for '--supervised'"; |
614 | throw Exception::apertium_tagger::InvalidArgument(what_); |
615 | } |
616 | |
617 | if (*TheFunctionTypeType == Perceptron) { |
618 | expect_file_arguments(nonoptarg, 4); |
619 | } else { |
620 | expect_file_arguments(nonoptarg, 2); |
621 | } |
622 | |
623 | Stream TaggedCorpus(TheFlags, argv[optind + 1]); |
624 | |
625 | if (*TheFunctionTypeType == Perceptron) { |
626 | Stream UntaggedCorpus(TheFlags, argv[optind + 2]); |
627 | |
628 | PerceptronTagger &pt = dynamic_cast<PerceptronTagger&>(StreamTaggerTrainer_); |
629 | pt.read_spec(argv[optind + 3]); |
630 | pt.train(TaggedCorpus, UntaggedCorpus, TheFunctionTypeOptionArgument); |
631 | } else { |
632 | StreamTaggerTrainer_.train(TaggedCorpus); |
633 | } |
634 | |
635 | std::ofstream Serialised_basic_Tagger; |
636 | try_open_fstream("SERIALISED_TAGGER", argv[optind], |
637 | Serialised_basic_Tagger); |
638 | |
639 | StreamTaggerTrainer_.serialise(Serialised_basic_Tagger); |
640 | } |
641 | |
642 | void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { |
643 | LtLocale::tryToSetLocale(); |
644 | expect_file_arguments(nonoptarg, 1, 4); |
645 | |
646 | FILE *Serialised_FILE_Tagger = |
647 | try_open_file("SERIALISED_TAGGER", argv[optind], "rb"); |
648 | FILE_Tagger_.deserialise(Serialised_FILE_Tagger); |
649 | try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger); |
650 | TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); |
651 | TaggerWord::generate_marks = TheFlags.getMark(); |
652 | const char* infile = NULL; |
653 | UFILE* Output = u_finit(stdout, NULL, NULL); |
654 | if (nonoptarg >= 2) { |
655 | infile = argv[optind + 1]; |
656 | if (nonoptarg >= 3) { |
657 | Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); |
658 | } |
659 | } |
660 | FILE_Tagger_.tagger(infile, Output); |
661 | u_fclose(Output); |
662 | } |
663 | |
664 | void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { |
665 | LtLocale::tryToSetLocale(); |
666 | |
667 | expect_file_arguments(nonoptarg, 2); |
668 | |
669 | char *ProbFn, *UntaggedFn; |
670 | |
671 | get_file_arguments( |
672 | false, |
673 | NULL, NULL, NULL, &UntaggedFn, |
674 | NULL, &ProbFn); |
675 | |
676 | FILE *Serialised_FILE_Tagger = |
677 | try_open_file("SERIALISED_TAGGER", ProbFn, "rb"); |
678 | FILE_Tagger_.deserialise(Serialised_FILE_Tagger); |
679 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); |
680 | |
681 | TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); |
682 | |
683 | UFILE* UntaggedCorpus; |
684 | MorphoStream* ms = setup_untagged_morpho_stream( |
685 | FILE_Tagger_, |
686 | NULL, UntaggedFn, |
687 | &UntaggedCorpus); |
688 | |
689 | FILE_Tagger_.train(*ms, TheFunctionTypeOptionArgument); |
690 | delete ms; |
691 | u_fclose(UntaggedCorpus); |
692 | |
693 | Serialised_FILE_Tagger = |
694 | try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); |
695 | FILE_Tagger_.serialise(Serialised_FILE_Tagger); |
696 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); |
697 | } |
698 | |
699 | void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { |
700 | LtLocale::tryToSetLocale(); |
701 | |
702 | if (TheFunctionTypeOptionArgument == 0) { |
| 9 | | Assuming field 'TheFunctionTypeOptionArgument' is not equal to 0 | |
|
| |
703 | expect_file_arguments(nonoptarg, 5, 7); |
704 | } else { |
705 | expect_file_arguments(nonoptarg, 6); |
706 | } |
707 | char *DicFn, *CrpFn, *TsxFn, *ProbFn, *TaggedFn, *UntaggedFn; |
| 11 | | 'TaggedFn' declared without an initial value | |
|
708 | bool do_unsup = nonoptarg == 6; |
| 12 | | Assuming field 'nonoptarg' is not equal to 6 | |
|
709 | |
710 | get_file_arguments( |
| 13 | | Calling 'apertium_tagger::get_file_arguments' | |
|
| 22 | | Returning from 'apertium_tagger::get_file_arguments' | |
|
711 | do_unsup, |
712 | &DicFn, &CrpFn, &TaggedFn, &UntaggedFn, |
713 | &TsxFn, &ProbFn); |
714 | init_FILE_Tagger(FILE_Tagger_, TsxFn); |
715 | |
716 | UFILE* UntaggedCorpus; |
717 | MorphoStream* ms = setup_untagged_morpho_stream( |
718 | FILE_Tagger_, |
719 | DicFn, UntaggedFn, |
720 | &UntaggedCorpus); |
721 | FileMorphoStream tms(TaggedFn, true, &FILE_Tagger_.get_tagger_data()); |
| 23 | | 1st function call argument is an uninitialized value |
|
722 | |
723 | FILE_Tagger_.init_probabilities_from_tagged_text_(tms, *ms); |
724 | delete ms; |
725 | u_fclose(UntaggedCorpus); |
726 | |
727 | if (do_unsup) { |
728 | FILE_Tagger_.train(CrpFn, TheFunctionTypeOptionArgument); |
729 | } |
730 | |
731 | FILE *Serialised_FILE_Tagger = |
732 | try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); |
733 | FILE_Tagger_.serialise(Serialised_FILE_Tagger); |
734 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); |
735 | } |
736 | |
737 | void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { |
738 | LtLocale::tryToSetLocale(); |
739 | |
740 | expect_file_arguments(nonoptarg, 4); |
741 | |
742 | char *DicFn, *TsxFn, *ProbFn, *UntaggedFn; |
743 | UntaggedFn = NULL; |
744 | |
745 | get_file_arguments( |
746 | false, |
747 | &DicFn, NULL, NULL, &UntaggedFn, |
748 | &TsxFn, &ProbFn); |
749 | init_FILE_Tagger(FILE_Tagger_, TsxFn); |
750 | |
751 | UFILE* UntaggedCorpus; |
752 | MorphoStream* ms = setup_untagged_morpho_stream( |
753 | FILE_Tagger_, |
754 | DicFn, UntaggedFn, |
755 | &UntaggedCorpus); |
756 | |
757 | FILE_Tagger_.init_and_train(*ms, TheFunctionTypeOptionArgument); |
758 | delete ms; |
759 | u_fclose(UntaggedCorpus); |
760 | |
761 | FILE *Serialised_FILE_Tagger = |
762 | try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); |
763 | FILE_Tagger_.serialise(Serialised_FILE_Tagger); |
764 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); |
765 | |
766 | } |
767 | } |