clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name tagger.cc -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -resource-dir /usr/lib/llvm-16/lib/clang/16 -D HAVE_CONFIG_H -I . -I .. -I /usr/include/utf8cpp/ -I /usr/local/include -I /usr/include/libxml2 -I /usr/local/include -D PIC -internal-isystem /usr/lib/llvm-16/bin/../include/c++/v1 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -std=c++2b -fdeprecated-macro -fdebug-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -ferror-limit 19 -fgnuc-version=4.2.1 -fno-implicit-modules -fcxx-exceptions -fexceptions -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/apertium/scan-build/2024-09-11-155328-205384-1 -x c++ tagger.cc
| 1 | |
| 2 | |
| 3 | |
| 4 | |
| 5 | |
| 6 | |
| 7 | |
| 8 | |
| 9 | |
| 10 | |
| 11 | |
| 12 | |
| 13 | |
| 14 | |
| 15 | |
| 16 | #include <apertium/tagger.h> |
| 17 | |
| 18 | #include "apertium_config.h" |
| 19 | |
| 20 | #include "align.h" |
| 21 | #include <lttoolbox/exception.h> |
| 22 | #include "exception.h" |
| 23 | #include "linebreak.h" |
| 24 | #include "unigram_tagger.h" |
| 25 | #include <apertium/perceptron_tagger.h> |
| 26 | #include <apertium/hmm.h> |
| 27 | #include <apertium/lswpost.h> |
| 28 | #include <apertium/tagger_word.h> |
| 29 | #include <apertium/shell_utils.h> |
| 30 | |
| 31 | #include <lttoolbox/lt_locale.h> |
| 32 | |
| 33 | #include "getopt_long.h" |
| 34 | #include <cerrno> |
| 35 | #include <cstdio> |
| 36 | #include <cstdlib> |
| 37 | #include <cstring> |
| 38 | #include <fstream> |
| 39 | #include <iomanip> |
| 40 | #include <ios> |
| 41 | #include <iostream> |
| 42 | #include <locale> |
| 43 | #include <sstream> |
| 44 | #include <string> |
| 45 | #include <unistd.h> |
| 46 | |
| 47 | namespace Apertium { |
| 48 | using namespace ShellUtils; |
| 49 | using namespace tagger_utils; |
| 50 | |
| 51 | |
| 52 | |
| 53 | apertium_tagger::apertium_tagger(int &argc, char **&argv) |
| 54 | : argc(argc), argv(argv), The_val(), nonoptarg(), |
| 55 | |
| 56 | The_indexptr(), FunctionTypeTypeOption_indexptr(), |
| 57 | FunctionTypeOption_indexptr(), |
| 58 | |
| 59 | TheFunctionTypeType(), TheUnigramType(), TheFunctionType(), |
| 60 | TheFunctionTypeOptionArgument(0), TheFlags() { |
| 61 | try { |
| 62 | |
| 63 | optind = 1; |
| 64 | while (true) { |
| 1 | Loop condition is true. Entering loop body | |
|
| 65 | The_val = getopt_long(argc, argv, "bdfegmpr:s:t:u:wxz", longopts, &The_indexptr); |
| 66 | |
| 67 | if (The_val == -1) |
| 2 | | Assuming the condition is true | |
|
| |
| 68 | break; |
| 69 | |
| 70 | set_indexptr(); |
| 71 | |
| 72 | switch (The_val) { |
| 73 | case 'b': |
| 74 | flagOptionCase(&TaggerFlags::getSentSeg, |
| 75 | &TaggerFlags::setSentSeg); |
| 76 | break; |
| 77 | case 'd': |
| 78 | flagOptionCase(&TaggerFlags::getDebug, |
| 79 | &TaggerFlags::setDebug); |
| 80 | break; |
| 81 | case 'e': |
| 82 | flagOptionCase(&TaggerFlags::getSkipErrors, |
| 83 | &TaggerFlags::setSkipErrors); |
| 84 | break; |
| 85 | case 'f': |
| 86 | flagOptionCase(&TaggerFlags::getFirst, |
| 87 | &TaggerFlags::setFirst); |
| 88 | break; |
| 89 | case 'm': |
| 90 | flagOptionCase(&TaggerFlags::getMark, |
| 91 | &TaggerFlags::setMark); |
| 92 | break; |
| 93 | case 'p': |
| 94 | flagOptionCase(&TaggerFlags::getShowSuperficial, |
| 95 | &TaggerFlags::setShowSuperficial); |
| 96 | break; |
| 97 | case 'z': |
| 98 | flagOptionCase(&TaggerFlags::getNullFlush, |
| 99 | &TaggerFlags::setNullFlush); |
| 100 | break; |
| 101 | case 'u': |
| 102 | functionTypeTypeOptionCase(Unigram); |
| 103 | |
| 104 | if (std::strncmp(optarg, "1", sizeof "1" - 1) == 0) { |
| 105 | TheUnigramType = Stream_5_3_1; |
| 106 | break; |
| 107 | } |
| 108 | |
| 109 | if (std::strncmp(optarg, "2", sizeof "2" - 1) == 0) { |
| 110 | TheUnigramType = Stream_5_3_2; |
| 111 | break; |
| 112 | } |
| 113 | |
| 114 | if (std::strncmp(optarg, "3", sizeof "3" - 1) == 0) { |
| 115 | TheUnigramType = Stream_5_3_3; |
| 116 | break; |
| 117 | } |
| 118 | |
| 119 | { |
| 120 | std::stringstream what_; |
| 121 | what_ << "invalid argument '" << optarg << "' for '--unigram'\n" |
| 122 | "Valid arguments are:\n" |
| 123 | " - '1'\n" |
| 124 | " - '2'\n" |
| 125 | " - '3'"; |
| 126 | throw Exception::apertium_tagger::InvalidArgument(what_); |
| 127 | } |
| 128 | break; |
| 129 | case 'w': |
| 130 | functionTypeTypeOptionCase(SlidingWindow); |
| 131 | break; |
| 132 | case 'x': |
| 133 | functionTypeTypeOptionCase(Perceptron); |
| 134 | break; |
| 135 | case 'g': |
| 136 | functionTypeOptionCase(Tagger); |
| 137 | break; |
| 138 | case 'r': |
| 139 | functionTypeOptionCase(Retrain); |
| 140 | getIterationsArgument(); |
| 141 | break; |
| 142 | case 's': |
| 143 | functionTypeOptionCase(Supervised); |
| 144 | getIterationsArgument(); |
| 145 | break; |
| 146 | case 't': |
| 147 | functionTypeOptionCase(Train); |
| 148 | getIterationsArgument(); |
| 149 | break; |
| 150 | case 'h': |
| 151 | help(); |
| 152 | return; |
| 153 | default: |
| 154 | throw Exception::apertium_tagger::err_Exception(""); |
| 155 | } |
| 156 | } |
| 157 | |
| 158 | if (!TheFunctionType) { |
| 4 | | Execution continues on line 158 | |
|
| |
| 159 | help(); |
| 160 | return; |
| 161 | } |
| 162 | |
| 163 | nonoptarg = argc - optind; |
| 164 | switch (*TheFunctionType) { |
| 6 | | Control jumps to 'case Train:' at line 268 | |
|
| 165 | case Tagger: |
| 166 | if (!TheFunctionTypeType) { |
| 167 | try { |
| 168 | PerceptronTagger percep(TheFlags); |
| 169 | g_StreamTagger(percep); |
| 170 | } catch (DeserialisationException) { |
| 171 | HMM HiddenMarkovModelTagger_(TheFlags); |
| 172 | g_FILE_Tagger(HiddenMarkovModelTagger_); |
| 173 | } |
| 174 | break; |
| 175 | } |
| 176 | switch (*TheFunctionTypeType) { |
| 177 | case Unigram: { |
| 178 | UnigramTagger UnigramTagger_(TheFlags); |
| 179 | switch (*TheUnigramType) { |
| 180 | case Stream_5_3_1: |
| 181 | UnigramTagger_.setModel(UnigramTaggerModel1); |
| 182 | break; |
| 183 | case Stream_5_3_2: |
| 184 | UnigramTagger_.setModel(UnigramTaggerModel2); |
| 185 | break; |
| 186 | case Stream_5_3_3: |
| 187 | UnigramTagger_.setModel(UnigramTaggerModel3); |
| 188 | break; |
| 189 | default: |
| 190 | std::abort(); |
| 191 | } |
| 192 | g_StreamTagger(UnigramTagger_); |
| 193 | } break; |
| 194 | case SlidingWindow: { |
| 195 | LSWPoST SlidingWindowTagger_(TheFlags); |
| 196 | g_FILE_Tagger(SlidingWindowTagger_); |
| 197 | } break; |
| 198 | case Perceptron: { |
| 199 | PerceptronTagger perceptron(TheFlags); |
| 200 | g_StreamTagger(perceptron); |
| 201 | } break; |
| 202 | default: |
| 203 | std::abort(); |
| 204 | } |
| 205 | |
| 206 | break; |
| 207 | case Retrain: |
| 208 | if (!TheFunctionTypeType) { |
| 209 | HMM HiddenMarkovModelTagger_(TheFlags); |
| 210 | r_FILE_Tagger(HiddenMarkovModelTagger_); |
| 211 | break; |
| 212 | } |
| 213 | |
| 214 | switch (*TheFunctionTypeType) { |
| 215 | case Unigram: { |
| 216 | std::stringstream what_; |
| 217 | what_ << "invalid option -- 'u'"; |
| 218 | throw Exception::apertium_tagger::InvalidOption(what_); |
| 219 | } |
| 220 | case SlidingWindow: { |
| 221 | LSWPoST SlidingWindowTagger_(TheFlags); |
| 222 | r_FILE_Tagger(SlidingWindowTagger_); |
| 223 | } break; |
| 224 | default: |
| 225 | std::abort(); |
| 226 | } |
| 227 | |
| 228 | break; |
| 229 | case Supervised: |
| 230 | if (!TheFunctionTypeType) { |
| 231 | HMM HiddenMarkovModelTagger_(TheFlags); |
| 232 | s_FILE_Tagger(HiddenMarkovModelTagger_); |
| 233 | break; |
| 234 | } |
| 235 | |
| 236 | switch (*TheFunctionTypeType) { |
| 237 | case Unigram: { |
| 238 | UnigramTagger UnigramTagger_(TheFlags); |
| 239 | switch (*TheUnigramType) { |
| 240 | case Stream_5_3_1: |
| 241 | UnigramTagger_.setModel(UnigramTaggerModel1); |
| 242 | break; |
| 243 | case Stream_5_3_2: |
| 244 | UnigramTagger_.setModel(UnigramTaggerModel2); |
| 245 | break; |
| 246 | case Stream_5_3_3: |
| 247 | UnigramTagger_.setModel(UnigramTaggerModel3); |
| 248 | break; |
| 249 | default: |
| 250 | std::abort(); |
| 251 | } |
| 252 | s_StreamTaggerTrainer(UnigramTagger_); |
| 253 | } break; |
| 254 | case SlidingWindow: { |
| 255 | std::stringstream what_; |
| 256 | what_ << "invalid option -- 'w'"; |
| 257 | throw Exception::apertium_tagger::InvalidOption(what_); |
| 258 | } break; |
| 259 | case Perceptron: { |
| 260 | PerceptronTagger perceptron(TheFlags); |
| 261 | s_StreamTaggerTrainer(perceptron); |
| 262 | } break; |
| 263 | default: |
| 264 | std::abort(); |
| 265 | } |
| 266 | |
| 267 | break; |
| 268 | case Train: |
| 269 | if (!TheFunctionTypeType) { |
| |
| 270 | HMM HiddenMarkovModelTagger_(TheFlags); |
| 271 | t_FILE_Tagger(HiddenMarkovModelTagger_); |
| 8 | | Calling 'apertium_tagger::t_FILE_Tagger' | |
|
| 272 | break; |
| 273 | } |
| 274 | |
| 275 | switch (*TheFunctionTypeType) { |
| 276 | case Unigram: { |
| 277 | std::stringstream what_; |
| 278 | what_ << "invalid option -- 'u'"; |
| 279 | throw Exception::apertium_tagger::InvalidOption(what_); |
| 280 | } |
| 281 | case SlidingWindow: { |
| 282 | LSWPoST SlidingWindowTagger_(TheFlags); |
| 283 | t_FILE_Tagger(SlidingWindowTagger_); |
| 284 | } break; |
| 285 | default: |
| 286 | std::abort(); |
| 287 | } |
| 288 | |
| 289 | break; |
| 290 | default: |
| 291 | std::abort(); |
| 292 | } |
| 293 | } catch (const ExceptionType &ExceptionType_) { |
| 294 | std::cerr << "apertium-tagger: " << ExceptionType_.what() << std::endl; |
| 295 | throw Exception::apertium_tagger::err_Exception(""); |
| 296 | } |
| 297 | } |
| 298 | |
| 299 | apertium_tagger::~apertium_tagger() {} |
| 300 | |
| 301 | void apertium_tagger::help() { |
| 302 | |
| 303 | std::cerr << |
| 304 | "Usage: apertium-tagger [OPTION]... -g SERIALISED_TAGGER \\\n" |
| 305 | " [INPUT \\\n" |
| 306 | " [OUTPUT]]\n" |
| 307 | "\n" |
| 308 | " or: apertium-tagger [OPTION]... -r ITERATIONS \\\n" |
| 309 | " CORPUS \\\n" |
| 310 | " SERIALISED_TAGGER\n" |
| 311 | "\n" |
| 312 | " or: apertium-tagger [OPTION]... -s ITERATIONS \\\n" |
| 313 | " DICTIONARY \\\n" |
| 314 | " CORPUS \\\n" |
| 315 | " TAGGER_SPECIFICATION \\\n" |
| 316 | " SERIALISED_TAGGER \\\n" |
| 317 | " TAGGED_CORPUS \\\n" |
| 318 | " UNTAGGED_CORPUS\n" |
| 319 | "\n" |
| 320 | " or: apertium-tagger [OPTION]... -s 0 \\\n" |
| 321 | " DICTIONARY \\\n" |
| 322 | " TAGGER_SPECIFICATION \\\n" |
| 323 | " SERIALISED_TAGGER \\\n" |
| 324 | " TAGGED_CORPUS \\\n" |
| 325 | " UNTAGGED_CORPUS\n" |
| 326 | "\n" |
| 327 | " or: apertium-tagger [OPTION]... -s 0 \\\n" |
| 328 | " -u MODEL \\\n" |
| 329 | " SERIALISED_TAGGER \\\n" |
| 330 | " TAGGED_CORPUS\n" |
| 331 | "\n" |
| 332 | " or: apertium-tagger [OPTION]... -t ITERATIONS \\\n" |
| 333 | " DICTIONARY \\\n" |
| 334 | " CORPUS \\\n" |
| 335 | " TAGGER_SPECIFICATION \\\n" |
| 336 | " SERIALISED_TAGGER\n" |
| 337 | "\n" |
| 338 | "Mandatory arguments to long options are mandatory for short options too.\n" |
| 339 | "\n"; |
| 340 | |
| 341 | std::vector<std::pair<std::string, std::string> > options_description_; |
| 342 | options_description_.push_back(std::make_pair("-d, --debug", "with -g, print error messages about the input")); |
| 343 | options_description_.push_back(std::make_pair("-f, --first", "with -g, reorder each lexical unit's analyses so that the chosen one is first")); |
| 344 | options_description_.push_back(std::make_pair("-m, --mark", "with -g, mark disambiguated lexical units")); |
| 345 | options_description_.push_back(std::make_pair("-p, --show-superficial", "with -g, output each lexical unit's surface form")); |
| 346 | options_description_.push_back(std::make_pair("-z, --null-flush", "with -g, flush the output after getting each null character")); |
| 347 | align::align_(options_description_); |
| 348 | std::cerr << '\n'; |
| 349 | options_description_.clear(); |
| 350 | options_description_.push_back(std::make_pair("-u, --unigram=MODEL", "use unigram algorithm MODEL from <https://coltekin.net/cagri/papers/trmorph-tools.pdf>")); |
| 351 | align::align_(options_description_); |
| 352 | std::cerr << '\n'; |
| 353 | options_description_.clear(); |
| 354 | options_description_.push_back(std::make_pair("-w, --sliding-window", "use the Light Sliding Window algorithm")); |
| 355 | options_description_.push_back(std::make_pair("-x, --perceptron", "use the averaged perceptron algorithm")); |
| 356 | options_description_.push_back(std::make_pair("-e, --skip-on-error", "with -xs, ignore certain types of errors with the training corpus")); |
| 357 | align::align_(options_description_); |
| 358 | std::cerr << '\n'; |
| 359 | options_description_.clear(); |
| 360 | options_description_.push_back(std::make_pair("-g, --tagger", "disambiguate the input")); |
| 361 | align::align_(options_description_); |
| 362 | std::cerr << '\n'; |
| 363 | options_description_.clear(); |
| 364 | options_description_.push_back(std::make_pair("-r, --retrain=ITERATIONS", "with -u: exit;\notherwise: retrain the tagger with ITERATIONS unsupervised iterations")); |
| 365 | options_description_.push_back(std::make_pair("-s, --supervised=ITERATIONS", "with -u: train the tagger with a hand-tagged corpus;\nwith -w: exit;\notherwise: initialise the tagger with a hand-tagged corpus and retrain it with ITERATIONS unsupervised iterations")); |
| 366 | options_description_.push_back(std::make_pair("-t, --train=ITERATIONS", "with -u: exit;\notherwise: train the tagger with ITERATIONS unsupervised iterations")); |
| 367 | align::align_(options_description_); |
| 368 | std::cerr << '\n'; |
| 369 | options_description_.clear(); |
| 370 | options_description_.push_back(std::make_pair("-h, --help", "display this help and exit")); |
| 371 | align::align_(options_description_); |
| 372 | } |
| 373 | |
// Long-option table passed to getopt_long by the constructor. Every entry
// leaves `flag` as 0 and stores its short-option character in `val`;
// set_indexptr() searches on `val` to map a returned character back to its
// row. The all-zero entry terminates the table.
const struct option apertium_tagger::longopts[] = {
    {"help", no_argument, 0, 'h'},
    {"sent-seg", no_argument, 0, 'b'},
    {"debug", no_argument, 0, 'd'},
    {"skip-on-error", no_argument, 0, 'e'},
    {"first", no_argument, 0, 'f'},
    {"mark", no_argument, 0, 'm'},
    {"show-superficial", no_argument, 0, 'p'},
    {"null-flush", no_argument, 0, 'z'},
    {"unigram", required_argument, 0, 'u'},
    {"sliding-window", no_argument, 0, 'w'},
    {"perceptron", no_argument, 0, 'x'},
    {"tagger", no_argument, 0, 'g'},
    {"retrain", required_argument, 0, 'r'},
    {"supervised", required_argument, 0, 's'},
    {"train", required_argument, 0, 't'},
    {0, 0, 0, 0}};
| 391 | |
| 392 | |
| 393 | |
| 394 | std::string apertium_tagger::option_string(const int &indexptr_) { |
| 395 | return option_string(longopts[indexptr_]); |
| 396 | } |
| 397 | |
| 398 | std::string apertium_tagger::option_string(const struct option &option_) { |
| 399 | std::stringstream option_string_; |
| 400 | option_string_ << "--" << option_.name; |
| 401 | return option_string_.str(); |
| 402 | } |
| 403 | |
| 404 | void apertium_tagger::locale_global_() { |
| 405 | |
| 406 | #if defined __clang__ |
| 407 | |
| 408 | std::locale::global(std::locale("")); |
| 409 | |
| 410 | #else |
| 411 | #if defined __APPLE__ |
| 412 | |
| 413 | LtLocale::tryToSetLocale(); |
| 414 | |
| 415 | #else |
| 416 | |
| 417 | std::locale::global(std::locale("")); |
| 418 | |
| 419 | #endif // defined __APPLE__ |
| 420 | #endif // defined __clang__ |
| 421 | } |
| 422 | |
| 423 | void apertium_tagger::set_indexptr() { |
| 424 | if (The_val == longopts[The_indexptr].val) |
| 425 | return; |
| 426 | |
| 427 | for (std::size_t longopts_Index = 0; longopts[longopts_Index].val != 0; |
| 428 | ++longopts_Index) { |
| 429 | if (The_val == longopts[longopts_Index].val) { |
| 430 | The_indexptr = longopts_Index; |
| 431 | return; |
| 432 | } |
| 433 | } |
| 434 | } |
| 435 | |
| 436 | void apertium_tagger::flagOptionCase( |
| 437 | bool (TaggerFlags::*GetFlag)(), |
| 438 | void (TaggerFlags::*SetFlag)(const bool &)) { |
| 439 | if ((TheFlags.*GetFlag)()) { |
| 440 | std::stringstream what_; |
| 441 | what_ << "unexpected '" << option_string() << "' following '" |
| 442 | << option_string() << '\''; |
| 443 | throw Exception::apertium_tagger::UnexpectedFlagOption(what_); |
| 444 | } |
| 445 | |
| 446 | (TheFlags.*SetFlag)(true); |
| 447 | } |
| 448 | |
| 449 | std::string apertium_tagger::option_string() { |
| 450 | return option_string(The_indexptr); |
| 451 | } |
| 452 | |
| 453 | void apertium_tagger::functionTypeTypeOptionCase( |
| 454 | const FunctionTypeType &FunctionTypeType_) { |
| 455 | if (FunctionTypeTypeOption_indexptr) { |
| 456 | std::stringstream what_; |
| 457 | what_ << "unexpected '" << option_string() << "' following '" |
| 458 | << option_string(*FunctionTypeTypeOption_indexptr) |
| 459 | << '\''; |
| 460 | throw Exception::apertium_tagger::UnexpectedFunctionTypeTypeOption(what_); |
| 461 | } |
| 462 | |
| 463 | TheFunctionTypeType = FunctionTypeType_; |
| 464 | FunctionTypeTypeOption_indexptr = The_indexptr; |
| 465 | } |
| 466 | |
| 467 | void apertium_tagger::functionTypeOptionCase( |
| 468 | const FunctionType &FunctionType_) { |
| 469 | if (FunctionTypeOption_indexptr) { |
| 470 | std::stringstream what_; |
| 471 | what_ << "unexpected '" << option_string() << "' following '" |
| 472 | << option_string(*FunctionTypeOption_indexptr) |
| 473 | << '\''; |
| 474 | throw Exception::apertium_tagger::UnexpectedFunctionTypeOption(what_); |
| 475 | } |
| 476 | |
| 477 | TheFunctionType = FunctionType_; |
| 478 | FunctionTypeOption_indexptr = The_indexptr; |
| 479 | } |
| 480 | |
| 481 | void apertium_tagger::getIterationsArgument() { |
| 482 | try { |
| 483 | TheFunctionTypeOptionArgument = optarg_unsigned_long("ITERATIONS"); |
| 484 | } catch (const ExceptionType &ExceptionType_) { |
| 485 | std::stringstream what_; |
| 486 | what_ << "invalid argument '" << optarg << "' for '" << option_string() |
| 487 | << '\''; |
| 488 | throw Exception::apertium_tagger::InvalidArgument(what_); |
| 489 | } |
| 490 | } |
| 491 | |
| 492 | static unsigned long parse_unsigned_long(const char *metavar, const char *val) { |
| 493 | char *str_end; |
| 494 | errno = 0; |
| 495 | unsigned long N_0 = std::strtoul(val, &str_end, 10); |
| 496 | |
| 497 | if (*str_end != '\0') { |
| 498 | std::stringstream what_; |
| 499 | what_ << "can't convert " << metavar << " \"" << val << "\" to unsigned long"; |
| 500 | throw Exception::apertium_tagger::str_end_not_eq_NULL(what_); |
| 501 | } |
| 502 | |
| 503 | if (*val == '\0') { |
| 504 | std::stringstream what_; |
| 505 | what_ << "can't convert " << metavar << " of size 1 \"\" to unsigned long"; |
| 506 | throw Exception::apertium_tagger::optarg_eq_NULL(what_); |
| 507 | } |
| 508 | |
| 509 | if (errno == ERANGE) { |
| 510 | std::stringstream what_; |
| 511 | what_ << "can't convert " << metavar << " \"" << val |
| 512 | << "\" to unsigned long, not in unsigned long range"; |
| 513 | throw Exception::apertium_tagger::ERANGE_(what_); |
| 514 | } |
| 515 | |
| 516 | return N_0; |
| 517 | } |
| 518 | |
| 519 | unsigned long apertium_tagger::optarg_unsigned_long(const char *metavar) { |
| 520 | return parse_unsigned_long(metavar, optarg); |
| 521 | } |
| 522 | |
| 523 | void apertium_tagger::get_file_arguments( |
| 524 | bool get_crp_fn, |
| 525 | char **DicFn, char **CrpFn, |
| 526 | char **TaggedFn, char **UntaggedFn, |
| 527 | char **TsxFn, char **ProbFn) { |
| 528 | if (*TheFunctionType != Retrain) { |
| 11 | | Assuming the condition is false | |
|
| |
| 529 | *DicFn = argv[optind++]; |
| 530 | } |
| 531 | if (get_crp_fn) { |
| |
| 532 | *CrpFn = argv[optind++]; |
| 533 | } |
| 534 | if (*TheFunctionType == Supervised) { |
| |
| 535 | *TsxFn = argv[optind++]; |
| 536 | *ProbFn = argv[optind++]; |
| 537 | *TaggedFn = argv[optind++]; |
| 538 | } |
| 539 | *UntaggedFn = argv[optind++]; |
| 540 | if (*TheFunctionType == Supervised && !get_crp_fn) { |
| 541 | *CrpFn = *UntaggedFn; |
| 542 | } |
| 543 | if (*TheFunctionType != Supervised) { |
| |
| 544 | if (*TheFunctionType != Retrain) { |
| |
| 545 | *TsxFn = argv[optind++]; |
| 546 | } |
| 547 | *ProbFn = argv[optind++]; |
| 548 | } |
| 549 | } |
| 17 | | Returning without writing to '*TsxFn' | |
|
| 550 | |
| 551 | void apertium_tagger::init_FILE_Tagger(FILE_Tagger &FILE_Tagger_, string const &TsxFn) { |
| 552 | FILE_Tagger_.deserialise(TsxFn); |
| 553 | TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); |
| 554 | } |
| 555 | |
| 556 | MorphoStream* apertium_tagger::setup_untagged_morpho_stream( |
| 557 | FILE_Tagger &FILE_Tagger_, |
| 558 | char *DicFn, char *UntaggedFn, |
| 559 | UFILE* *UntaggedCorpus) { |
| 560 | *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", UntaggedFn, "r"); |
| 561 | |
| 562 | FILE_Tagger_.read_dictionary(DicFn); |
| 563 | |
| 564 | return new FileMorphoStream(UntaggedFn, true, &FILE_Tagger_.get_tagger_data()); |
| 565 | } |
| 566 | |
| 567 | |
| 568 | |
| 569 | void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { |
| 570 | locale_global_(); |
| 571 | |
| 572 | expect_file_arguments(nonoptarg, 1, 4); |
| 573 | |
| 574 | std::ifstream SerialisedAnalysisFrequencies; |
| 575 | try_open_fstream("SERIALISED_TAGGER", argv[optind], |
| 576 | SerialisedAnalysisFrequencies); |
| 577 | |
| 578 | try { |
| 579 | StreamTagger_.deserialise(SerialisedAnalysisFrequencies); |
| 580 | } catch (const ExceptionType &ExceptionType_) { |
| 581 | std::stringstream what_; |
| 582 | what_ << "can't deserialise SERIALISED_TAGGER file \"" << argv[optind] |
| 583 | << "\" Reason: " << ExceptionType_.what(); |
| 584 | throw Exception::apertium_tagger::deserialise(what_); |
| 585 | } |
| 586 | |
| 587 | if (nonoptarg < 2) { |
| 588 | Stream Input(TheFlags); |
| 589 | StreamTagger_.tag(Input, std::cout); |
| 590 | return; |
| 591 | } |
| 592 | |
| 593 | Stream Input(TheFlags, argv[optind + 1]); |
| 594 | |
| 595 | if (nonoptarg < 3) { |
| 596 | StreamTagger_.tag(Input, std::cout); |
| 597 | return; |
| 598 | } |
| 599 | |
| 600 | std::ofstream Output_stream; |
| 601 | try_open_fstream("OUTPUT", argv[optind + 2], Output_stream); |
| 602 | |
| 603 | StreamTagger_.tag(Input, Output_stream); |
| 604 | } |
| 605 | |
| 606 | void apertium_tagger::s_StreamTaggerTrainer( |
| 607 | StreamTagger &StreamTaggerTrainer_) { |
| 608 | locale_global_(); |
| 609 | |
| 610 | if (TheFunctionTypeOptionArgument != 0 && *TheFunctionTypeType != Perceptron) { |
| 611 | std::stringstream what_; |
| 612 | what_ << "invalid argument '" << TheFunctionTypeOptionArgument |
| 613 | << "' for '--supervised'"; |
| 614 | throw Exception::apertium_tagger::InvalidArgument(what_); |
| 615 | } |
| 616 | |
| 617 | if (*TheFunctionTypeType == Perceptron) { |
| 618 | expect_file_arguments(nonoptarg, 4); |
| 619 | } else { |
| 620 | expect_file_arguments(nonoptarg, 2); |
| 621 | } |
| 622 | |
| 623 | Stream TaggedCorpus(TheFlags, argv[optind + 1]); |
| 624 | |
| 625 | if (*TheFunctionTypeType == Perceptron) { |
| 626 | Stream UntaggedCorpus(TheFlags, argv[optind + 2]); |
| 627 | |
| 628 | PerceptronTagger &pt = dynamic_cast<PerceptronTagger&>(StreamTaggerTrainer_); |
| 629 | pt.read_spec(argv[optind + 3]); |
| 630 | pt.train(TaggedCorpus, UntaggedCorpus, TheFunctionTypeOptionArgument); |
| 631 | } else { |
| 632 | StreamTaggerTrainer_.train(TaggedCorpus); |
| 633 | } |
| 634 | |
| 635 | std::ofstream Serialised_basic_Tagger; |
| 636 | try_open_fstream("SERIALISED_TAGGER", argv[optind], |
| 637 | Serialised_basic_Tagger); |
| 638 | |
| 639 | StreamTaggerTrainer_.serialise(Serialised_basic_Tagger); |
| 640 | } |
| 641 | |
| 642 | void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { |
| 643 | LtLocale::tryToSetLocale(); |
| 644 | expect_file_arguments(nonoptarg, 1, 4); |
| 645 | |
| 646 | FILE *Serialised_FILE_Tagger = |
| 647 | try_open_file("SERIALISED_TAGGER", argv[optind], "rb"); |
| 648 | FILE_Tagger_.deserialise(Serialised_FILE_Tagger); |
| 649 | try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger); |
| 650 | TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); |
| 651 | TaggerWord::generate_marks = TheFlags.getMark(); |
| 652 | const char* infile = NULL; |
| 653 | UFILE* Output = u_finit(stdout, NULL, NULL); |
| 654 | if (nonoptarg >= 2) { |
| 655 | infile = argv[optind + 1]; |
| 656 | if (nonoptarg >= 3) { |
| 657 | Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); |
| 658 | } |
| 659 | } |
| 660 | FILE_Tagger_.tagger(infile, Output); |
| 661 | u_fclose(Output); |
| 662 | } |
| 663 | |
| 664 | void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { |
| 665 | LtLocale::tryToSetLocale(); |
| 666 | |
| 667 | expect_file_arguments(nonoptarg, 2); |
| 668 | |
| 669 | char *ProbFn, *UntaggedFn; |
| 670 | |
| 671 | get_file_arguments( |
| 672 | false, |
| 673 | NULL, NULL, NULL, &UntaggedFn, |
| 674 | NULL, &ProbFn); |
| 675 | |
| 676 | FILE *Serialised_FILE_Tagger = |
| 677 | try_open_file("SERIALISED_TAGGER", ProbFn, "rb"); |
| 678 | FILE_Tagger_.deserialise(Serialised_FILE_Tagger); |
| 679 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); |
| 680 | |
| 681 | TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); |
| 682 | |
| 683 | UFILE* UntaggedCorpus; |
| 684 | MorphoStream* ms = setup_untagged_morpho_stream( |
| 685 | FILE_Tagger_, |
| 686 | NULL, UntaggedFn, |
| 687 | &UntaggedCorpus); |
| 688 | |
| 689 | FILE_Tagger_.train(*ms, TheFunctionTypeOptionArgument); |
| 690 | delete ms; |
| 691 | u_fclose(UntaggedCorpus); |
| 692 | |
| 693 | Serialised_FILE_Tagger = |
| 694 | try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); |
| 695 | FILE_Tagger_.serialise(Serialised_FILE_Tagger); |
| 696 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); |
| 697 | } |
| 698 | |
| 699 | void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { |
| 700 | LtLocale::tryToSetLocale(); |
| 701 | |
| 702 | if (TheFunctionTypeOptionArgument == 0) { |
| 703 | expect_file_arguments(nonoptarg, 5, 7); |
| 704 | } else { |
| 705 | expect_file_arguments(nonoptarg, 6); |
| 706 | } |
| 707 | char *DicFn, *CrpFn, *TsxFn, *ProbFn, *TaggedFn, *UntaggedFn; |
| 708 | bool do_unsup = nonoptarg == 6; |
| 709 | |
| 710 | get_file_arguments( |
| 711 | do_unsup, |
| 712 | &DicFn, &CrpFn, &TaggedFn, &UntaggedFn, |
| 713 | &TsxFn, &ProbFn); |
| 714 | init_FILE_Tagger(FILE_Tagger_, TsxFn); |
| 715 | |
| 716 | UFILE* UntaggedCorpus; |
| 717 | MorphoStream* ms = setup_untagged_morpho_stream( |
| 718 | FILE_Tagger_, |
| 719 | DicFn, UntaggedFn, |
| 720 | &UntaggedCorpus); |
| 721 | FileMorphoStream tms(TaggedFn, true, &FILE_Tagger_.get_tagger_data()); |
| 722 | |
| 723 | FILE_Tagger_.init_probabilities_from_tagged_text_(tms, *ms); |
| 724 | delete ms; |
| 725 | u_fclose(UntaggedCorpus); |
| 726 | |
| 727 | if (do_unsup) { |
| 728 | FILE_Tagger_.train(CrpFn, TheFunctionTypeOptionArgument); |
| 729 | } |
| 730 | |
| 731 | FILE *Serialised_FILE_Tagger = |
| 732 | try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); |
| 733 | FILE_Tagger_.serialise(Serialised_FILE_Tagger); |
| 734 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); |
| 735 | } |
| 736 | |
| 737 | void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { |
| 738 | LtLocale::tryToSetLocale(); |
| 739 | |
| 740 | expect_file_arguments(nonoptarg, 4); |
| 741 | |
| 742 | char *DicFn, *TsxFn, *ProbFn, *UntaggedFn; |
| 9 | | 'TsxFn' declared without an initial value | |
|
| 743 | UntaggedFn = NULL; |
| 744 | |
| 745 | get_file_arguments( |
| 10 | | Calling 'apertium_tagger::get_file_arguments' | |
|
| 18 | | Returning from 'apertium_tagger::get_file_arguments' | |
|
| 746 | false, |
| 747 | &DicFn, NULL, NULL, &UntaggedFn, |
| 748 | &TsxFn, &ProbFn); |
| 749 | init_FILE_Tagger(FILE_Tagger_, TsxFn); |
| 19 | | 1st function call argument is an uninitialized value |
|
| 750 | |
| 751 | UFILE* UntaggedCorpus; |
| 752 | MorphoStream* ms = setup_untagged_morpho_stream( |
| 753 | FILE_Tagger_, |
| 754 | DicFn, UntaggedFn, |
| 755 | &UntaggedCorpus); |
| 756 | |
| 757 | FILE_Tagger_.init_and_train(*ms, TheFunctionTypeOptionArgument); |
| 758 | delete ms; |
| 759 | u_fclose(UntaggedCorpus); |
| 760 | |
| 761 | FILE *Serialised_FILE_Tagger = |
| 762 | try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); |
| 763 | FILE_Tagger_.serialise(Serialised_FILE_Tagger); |
| 764 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); |
| 765 | |
| 766 | } |
| 767 | } |