clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name tagger.cc -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -resource-dir /usr/lib/llvm-16/lib/clang/16 -D HAVE_CONFIG_H -I . -I .. -I /usr/include/utf8cpp/ -I /usr/local/include -I /usr/include/libxml2 -I /usr/local/include -D PIC -internal-isystem /usr/lib/llvm-16/bin/../include/c++/v1 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -std=c++2b -fdeprecated-macro -fdebug-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -ferror-limit 19 -fgnuc-version=4.2.1 -fno-implicit-modules -fcxx-exceptions -fexceptions -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/apertium/scan-build/2024-09-11-155328-205384-1 -x c++ tagger.cc
| 1 |  | 
| 2 |  | 
| 3 |  | 
| 4 |  | 
| 5 |  | 
| 6 |  | 
| 7 |  | 
| 8 |  | 
| 9 |  | 
| 10 |  | 
| 11 |  | 
| 12 |  | 
| 13 |  | 
| 14 |  | 
| 15 |  | 
| 16 | #include <apertium/tagger.h> | 
| 17 |  | 
| 18 | #include "apertium_config.h" | 
| 19 |  | 
| 20 | #include "align.h" | 
| 21 | #include <lttoolbox/exception.h> | 
| 22 | #include "exception.h" | 
| 23 | #include "linebreak.h" | 
| 24 | #include "unigram_tagger.h" | 
| 25 | #include <apertium/perceptron_tagger.h> | 
| 26 | #include <apertium/hmm.h> | 
| 27 | #include <apertium/lswpost.h> | 
| 28 | #include <apertium/tagger_word.h> | 
| 29 | #include <apertium/shell_utils.h> | 
| 30 |  | 
| 31 | #include <lttoolbox/lt_locale.h> | 
| 32 |  | 
| 33 | #include "getopt_long.h" | 
| 34 | #include <cerrno> | 
| 35 | #include <cstdio> | 
| 36 | #include <cstdlib> | 
| 37 | #include <cstring> | 
| 38 | #include <fstream> | 
| 39 | #include <iomanip> | 
| 40 | #include <ios> | 
| 41 | #include <iostream> | 
| 42 | #include <locale> | 
| 43 | #include <sstream> | 
| 44 | #include <string> | 
| 45 | #include <unistd.h> | 
| 46 |  | 
| 47 | namespace Apertium { | 
| 48 | using namespace ShellUtils; | 
| 49 | using namespace tagger_utils; | 
| 50 |  | 
| 51 |  | 
| 52 |  | 
| 53 | apertium_tagger::apertium_tagger(int &argc, char **&argv) | 
| 54 | : argc(argc), argv(argv), The_val(), nonoptarg(), | 
| 55 |  | 
| 56 | The_indexptr(), FunctionTypeTypeOption_indexptr(), | 
| 57 | FunctionTypeOption_indexptr(), | 
| 58 |  | 
| 59 | TheFunctionTypeType(), TheUnigramType(), TheFunctionType(), | 
| 60 | TheFunctionTypeOptionArgument(0), TheFlags() { | 
| 61 | try { | 
| 62 |  | 
| 63 | optind = 1; | 
| 64 | while (true) { | 
|  | | 1 | Loop condition is true.  Entering loop body |  | 
 | 
| 65 | The_val = getopt_long(argc, argv, "bdfegmpr:s:t:u:wxz", longopts, &The_indexptr); | 
| 66 |  | 
| 67 | if (The_val == -1) | 
|  | | 2 |  | Assuming the condition is true |  | 
 | 
|  |  | 
| 68 | break; | 
| 69 |  | 
| 70 | set_indexptr(); | 
| 71 |  | 
| 72 | switch (The_val) { | 
| 73 | case 'b': | 
| 74 | flagOptionCase(&TaggerFlags::getSentSeg, | 
| 75 | &TaggerFlags::setSentSeg); | 
| 76 | break; | 
| 77 | case 'd': | 
| 78 | flagOptionCase(&TaggerFlags::getDebug, | 
| 79 | &TaggerFlags::setDebug); | 
| 80 | break; | 
| 81 | case 'e': | 
| 82 | flagOptionCase(&TaggerFlags::getSkipErrors, | 
| 83 | &TaggerFlags::setSkipErrors); | 
| 84 | break; | 
| 85 | case 'f': | 
| 86 | flagOptionCase(&TaggerFlags::getFirst, | 
| 87 | &TaggerFlags::setFirst); | 
| 88 | break; | 
| 89 | case 'm': | 
| 90 | flagOptionCase(&TaggerFlags::getMark, | 
| 91 | &TaggerFlags::setMark); | 
| 92 | break; | 
| 93 | case 'p': | 
| 94 | flagOptionCase(&TaggerFlags::getShowSuperficial, | 
| 95 | &TaggerFlags::setShowSuperficial); | 
| 96 | break; | 
| 97 | case 'z': | 
| 98 | flagOptionCase(&TaggerFlags::getNullFlush, | 
| 99 | &TaggerFlags::setNullFlush); | 
| 100 | break; | 
| 101 | case 'u': | 
| 102 | functionTypeTypeOptionCase(Unigram); | 
| 103 |  | 
| 104 | if (std::strncmp(optarg, "1", sizeof "1" - 1) == 0) { | 
| 105 | TheUnigramType = Stream_5_3_1; | 
| 106 | break; | 
| 107 | } | 
| 108 |  | 
| 109 | if (std::strncmp(optarg, "2", sizeof "2" - 1) == 0) { | 
| 110 | TheUnigramType = Stream_5_3_2; | 
| 111 | break; | 
| 112 | } | 
| 113 |  | 
| 114 | if (std::strncmp(optarg, "3", sizeof "3" - 1) == 0) { | 
| 115 | TheUnigramType = Stream_5_3_3; | 
| 116 | break; | 
| 117 | } | 
| 118 |  | 
| 119 | { | 
| 120 | std::stringstream what_; | 
| 121 | what_ << "invalid argument '" << optarg << "' for '--unigram'\n" | 
| 122 | "Valid arguments are:\n" | 
| 123 | "  - '1'\n" | 
| 124 | "  - '2'\n" | 
| 125 | "  - '3'"; | 
| 126 | throw Exception::apertium_tagger::InvalidArgument(what_); | 
| 127 | } | 
| 128 | break; | 
| 129 | case 'w': | 
| 130 | functionTypeTypeOptionCase(SlidingWindow); | 
| 131 | break; | 
| 132 | case 'x': | 
| 133 | functionTypeTypeOptionCase(Perceptron); | 
| 134 | break; | 
| 135 | case 'g': | 
| 136 | functionTypeOptionCase(Tagger); | 
| 137 | break; | 
| 138 | case 'r': | 
| 139 | functionTypeOptionCase(Retrain); | 
| 140 | getIterationsArgument(); | 
| 141 | break; | 
| 142 | case 's': | 
| 143 | functionTypeOptionCase(Supervised); | 
| 144 | getIterationsArgument(); | 
| 145 | break; | 
| 146 | case 't': | 
| 147 | functionTypeOptionCase(Train); | 
| 148 | getIterationsArgument(); | 
| 149 | break; | 
| 150 | case 'h': | 
| 151 | help(); | 
| 152 | return; | 
| 153 | default: | 
| 154 | throw Exception::apertium_tagger::err_Exception(""); | 
| 155 | } | 
| 156 | } | 
| 157 |  | 
| 158 | if (!TheFunctionType) { | 
|  | | 4 |  | Execution continues on line 158 |  | 
 | 
|  |  | 
| 159 | help(); | 
| 160 | return; | 
| 161 | } | 
| 162 |  | 
| 163 | nonoptarg = argc - optind; | 
| 164 | switch (*TheFunctionType) { | 
|  | | 6 |  | Control jumps to 'case Supervised:'  at line 229 |  | 
 | 
| 165 | case Tagger: | 
| 166 | if (!TheFunctionTypeType) { | 
| 167 | try { | 
| 168 | PerceptronTagger percep(TheFlags); | 
| 169 | g_StreamTagger(percep); | 
| 170 | } catch (DeserialisationException) { | 
| 171 | HMM HiddenMarkovModelTagger_(TheFlags); | 
| 172 | g_FILE_Tagger(HiddenMarkovModelTagger_); | 
| 173 | } | 
| 174 | break; | 
| 175 | } | 
| 176 | switch (*TheFunctionTypeType) { | 
| 177 | case Unigram: { | 
| 178 | UnigramTagger UnigramTagger_(TheFlags); | 
| 179 | switch (*TheUnigramType) { | 
| 180 | case Stream_5_3_1: | 
| 181 | UnigramTagger_.setModel(UnigramTaggerModel1); | 
| 182 | break; | 
| 183 | case Stream_5_3_2: | 
| 184 | UnigramTagger_.setModel(UnigramTaggerModel2); | 
| 185 | break; | 
| 186 | case Stream_5_3_3: | 
| 187 | UnigramTagger_.setModel(UnigramTaggerModel3); | 
| 188 | break; | 
| 189 | default: | 
| 190 | std::abort(); | 
| 191 | } | 
| 192 | g_StreamTagger(UnigramTagger_); | 
| 193 | } break; | 
| 194 | case SlidingWindow: { | 
| 195 | LSWPoST SlidingWindowTagger_(TheFlags); | 
| 196 | g_FILE_Tagger(SlidingWindowTagger_); | 
| 197 | } break; | 
| 198 | case Perceptron: { | 
| 199 | PerceptronTagger perceptron(TheFlags); | 
| 200 | g_StreamTagger(perceptron); | 
| 201 | } break; | 
| 202 | default: | 
| 203 | std::abort(); | 
| 204 | } | 
| 205 |  | 
| 206 | break; | 
| 207 | case Retrain: | 
| 208 | if (!TheFunctionTypeType) { | 
| 209 | HMM HiddenMarkovModelTagger_(TheFlags); | 
| 210 | r_FILE_Tagger(HiddenMarkovModelTagger_); | 
| 211 | break; | 
| 212 | } | 
| 213 |  | 
| 214 | switch (*TheFunctionTypeType) { | 
| 215 | case Unigram: { | 
| 216 | std::stringstream what_; | 
| 217 | what_ << "invalid option -- 'u'"; | 
| 218 | throw Exception::apertium_tagger::InvalidOption(what_); | 
| 219 | } | 
| 220 | case SlidingWindow: { | 
| 221 | LSWPoST SlidingWindowTagger_(TheFlags); | 
| 222 | r_FILE_Tagger(SlidingWindowTagger_); | 
| 223 | } break; | 
| 224 | default: | 
| 225 | std::abort(); | 
| 226 | } | 
| 227 |  | 
| 228 | break; | 
| 229 | case Supervised: | 
| 230 | if (!TheFunctionTypeType) { | 
|  |  | 
| 231 | HMM HiddenMarkovModelTagger_(TheFlags); | 
| 232 | s_FILE_Tagger(HiddenMarkovModelTagger_); | 
|  | | 8 |  | Calling 'apertium_tagger::s_FILE_Tagger' |  | 
 | 
| 233 | break; | 
| 234 | } | 
| 235 |  | 
| 236 | switch (*TheFunctionTypeType) { | 
| 237 | case Unigram: { | 
| 238 | UnigramTagger UnigramTagger_(TheFlags); | 
| 239 | switch (*TheUnigramType) { | 
| 240 | case Stream_5_3_1: | 
| 241 | UnigramTagger_.setModel(UnigramTaggerModel1); | 
| 242 | break; | 
| 243 | case Stream_5_3_2: | 
| 244 | UnigramTagger_.setModel(UnigramTaggerModel2); | 
| 245 | break; | 
| 246 | case Stream_5_3_3: | 
| 247 | UnigramTagger_.setModel(UnigramTaggerModel3); | 
| 248 | break; | 
| 249 | default: | 
| 250 | std::abort(); | 
| 251 | } | 
| 252 | s_StreamTaggerTrainer(UnigramTagger_); | 
| 253 | } break; | 
| 254 | case SlidingWindow: { | 
| 255 | std::stringstream what_; | 
| 256 | what_ << "invalid option -- 'w'"; | 
| 257 | throw Exception::apertium_tagger::InvalidOption(what_); | 
| 258 | } break; | 
| 259 | case Perceptron: { | 
| 260 | PerceptronTagger perceptron(TheFlags); | 
| 261 | s_StreamTaggerTrainer(perceptron); | 
| 262 | } break; | 
| 263 | default: | 
| 264 | std::abort(); | 
| 265 | } | 
| 266 |  | 
| 267 | break; | 
| 268 | case Train: | 
| 269 | if (!TheFunctionTypeType) { | 
| 270 | HMM HiddenMarkovModelTagger_(TheFlags); | 
| 271 | t_FILE_Tagger(HiddenMarkovModelTagger_); | 
| 272 | break; | 
| 273 | } | 
| 274 |  | 
| 275 | switch (*TheFunctionTypeType) { | 
| 276 | case Unigram: { | 
| 277 | std::stringstream what_; | 
| 278 | what_ << "invalid option -- 'u'"; | 
| 279 | throw Exception::apertium_tagger::InvalidOption(what_); | 
| 280 | } | 
| 281 | case SlidingWindow: { | 
| 282 | LSWPoST SlidingWindowTagger_(TheFlags); | 
| 283 | t_FILE_Tagger(SlidingWindowTagger_); | 
| 284 | } break; | 
| 285 | default: | 
| 286 | std::abort(); | 
| 287 | } | 
| 288 |  | 
| 289 | break; | 
| 290 | default: | 
| 291 | std::abort(); | 
| 292 | } | 
| 293 | } catch (const ExceptionType &ExceptionType_) { | 
| 294 | std::cerr << "apertium-tagger: " << ExceptionType_.what() << std::endl; | 
| 295 | throw Exception::apertium_tagger::err_Exception(""); | 
| 296 | } | 
| 297 | } | 
| 298 |  | 
| 299 | apertium_tagger::~apertium_tagger() {} | 
| 300 |  | 
| 301 | void apertium_tagger::help() { | 
| 302 |  | 
| 303 | std::cerr << | 
| 304 | "Usage: apertium-tagger [OPTION]... -g SERIALISED_TAGGER                        \\\n" | 
| 305 | "                                      [INPUT                                   \\\n" | 
| 306 | "                                      [OUTPUT]]\n" | 
| 307 | "\n" | 
| 308 | "  or:  apertium-tagger [OPTION]... -r ITERATIONS                               \\\n" | 
| 309 | "                                      CORPUS                                   \\\n" | 
| 310 | "                                      SERIALISED_TAGGER\n" | 
| 311 | "\n" | 
| 312 | "  or:  apertium-tagger [OPTION]... -s ITERATIONS                               \\\n" | 
| 313 | "                                      DICTIONARY                               \\\n" | 
| 314 | "                                      CORPUS                                   \\\n" | 
| 315 | "                                      TAGGER_SPECIFICATION                     \\\n" | 
| 316 | "                                      SERIALISED_TAGGER                        \\\n" | 
| 317 | "                                      TAGGED_CORPUS                            \\\n" | 
| 318 | "                                      UNTAGGED_CORPUS\n" | 
| 319 | "\n" | 
| 320 | "  or:  apertium-tagger [OPTION]... -s 0                                        \\\n" | 
| 321 | "                                      DICTIONARY                               \\\n" | 
| 322 | "                                      TAGGER_SPECIFICATION                     \\\n" | 
| 323 | "                                      SERIALISED_TAGGER                        \\\n" | 
| 324 | "                                      TAGGED_CORPUS                            \\\n" | 
| 325 | "                                      UNTAGGED_CORPUS\n" | 
| 326 | "\n" | 
| 327 | "  or:  apertium-tagger [OPTION]... -s 0                                        \\\n" | 
| 328 | "                                   -u MODEL                                    \\\n" | 
| 329 | "                                      SERIALISED_TAGGER                        \\\n" | 
| 330 | "                                      TAGGED_CORPUS\n" | 
| 331 | "\n" | 
| 332 | "  or:  apertium-tagger [OPTION]... -t ITERATIONS                               \\\n" | 
| 333 | "                                      DICTIONARY                               \\\n" | 
| 334 | "                                      CORPUS                                   \\\n" | 
| 335 | "                                      TAGGER_SPECIFICATION                     \\\n" | 
| 336 | "                                      SERIALISED_TAGGER\n" | 
| 337 | "\n" | 
| 338 | "Mandatory arguments to long options are mandatory for short options too.\n" | 
| 339 | "\n"; | 
| 340 |  | 
| 341 | std::vector<std::pair<std::string, std::string> > options_description_; | 
| 342 | options_description_.push_back(std::make_pair("-d, --debug",            "with -g, print error messages about the input")); | 
| 343 | options_description_.push_back(std::make_pair("-f, --first",            "with -g, reorder each lexical unit's analyses so that the chosen one is first")); | 
| 344 | options_description_.push_back(std::make_pair("-m, --mark",             "with -g, mark disambiguated lexical units")); | 
| 345 | options_description_.push_back(std::make_pair("-p, --show-superficial", "with -g, output each lexical unit's surface form")); | 
| 346 | options_description_.push_back(std::make_pair("-z, --null-flush",       "with -g, flush the output after getting each null character")); | 
| 347 | align::align_(options_description_); | 
| 348 | std::cerr << '\n'; | 
| 349 | options_description_.clear(); | 
| 350 | options_description_.push_back(std::make_pair("-u, --unigram=MODEL", "use unigram algorithm MODEL from <https://coltekin.net/cagri/papers/trmorph-tools.pdf>")); | 
| 351 | align::align_(options_description_); | 
| 352 | std::cerr << '\n'; | 
| 353 | options_description_.clear(); | 
| 354 | options_description_.push_back(std::make_pair("-w, --sliding-window", "use the Light Sliding Window algorithm")); | 
| 355 | options_description_.push_back(std::make_pair("-x, --perceptron", "use the averaged perceptron algorithm")); | 
| 356 | options_description_.push_back(std::make_pair("-e, --skip-on-error", "with -xs, ignore certain types of errors with the training corpus")); | 
| 357 | align::align_(options_description_); | 
| 358 | std::cerr << '\n'; | 
| 359 | options_description_.clear(); | 
| 360 | options_description_.push_back(std::make_pair("-g, --tagger", "disambiguate the input")); | 
| 361 | align::align_(options_description_); | 
| 362 | std::cerr << '\n'; | 
| 363 | options_description_.clear(); | 
| 364 | options_description_.push_back(std::make_pair("-r, --retrain=ITERATIONS", "with -u: exit;\notherwise: retrain the tagger with ITERATIONS unsupervised iterations")); | 
| 365 | options_description_.push_back(std::make_pair("-s, --supervised=ITERATIONS", "with -u: train the tagger with a hand-tagged corpus;\nwith -w: exit;\notherwise: initialise the tagger with a hand-tagged corpus and retrain it with ITERATIONS unsupervised iterations")); | 
| 366 | options_description_.push_back(std::make_pair("-t, --train=ITERATIONS", "with -u: exit;\notherwise: train the tagger with ITERATIONS unsupervised iterations")); | 
| 367 | align::align_(options_description_); | 
| 368 | std::cerr << '\n'; | 
| 369 | options_description_.clear(); | 
| 370 | options_description_.push_back(std::make_pair("-h, --help", "display this help and exit")); | 
| 371 | align::align_(options_description_); | 
| 372 | } | 
| 373 |  | 
| 374 | const struct option apertium_tagger::longopts[] = { | 
| 375 | {"help", no_argument, 0, 'h'}, | 
| 376 | {"sent-seg", no_argument, 0, 'b'}, | 
| 377 | {"debug", no_argument, 0, 'd'}, | 
| 378 | {"skip-on-error", no_argument, 0, 'e'}, | 
| 379 | {"first", no_argument, 0, 'f'}, | 
| 380 | {"mark", no_argument, 0, 'm'}, | 
| 381 | {"show-superficial", no_argument, 0, 'p'}, | 
| 382 | {"null-flush", no_argument, 0, 'z'}, | 
| 383 | {"unigram", required_argument, 0, 'u'}, | 
| 384 | {"sliding-window", no_argument, 0, 'w'}, | 
| 385 | {"perceptron", no_argument, 0, 'x'}, | 
| 386 | {"tagger", no_argument, 0, 'g'}, | 
| 387 | {"retrain", required_argument, 0, 'r'}, | 
| 388 | {"supervised", required_argument, 0, 's'}, | 
| 389 | {"train", required_argument, 0, 't'}, | 
| 390 | {0, 0, 0, 0}}; | 
| 391 |  | 
| 392 |  | 
| 393 |  | 
| 394 | std::string apertium_tagger::option_string(const int &indexptr_) { | 
| 395 | return option_string(longopts[indexptr_]); | 
| 396 | } | 
| 397 |  | 
| 398 | std::string apertium_tagger::option_string(const struct option &option_) { | 
| 399 | std::stringstream option_string_; | 
| 400 | option_string_ << "--" << option_.name; | 
| 401 | return option_string_.str(); | 
| 402 | } | 
| 403 |  | 
| 404 | void apertium_tagger::locale_global_() { | 
| 405 |  | 
| 406 | #if defined __clang__ | 
| 407 |  | 
| 408 | std::locale::global(std::locale("")); | 
| 409 |  | 
| 410 | #else | 
| 411 | #if defined __APPLE__ | 
| 412 |  | 
| 413 | LtLocale::tryToSetLocale(); | 
| 414 |  | 
| 415 | #else | 
| 416 |  | 
| 417 | std::locale::global(std::locale("")); | 
| 418 |  | 
| 419 | #endif // defined __APPLE__ | 
| 420 | #endif // defined __clang__ | 
| 421 | } | 
| 422 |  | 
| 423 | void apertium_tagger::set_indexptr() { | 
| 424 | if (The_val == longopts[The_indexptr].val) | 
| 425 | return; | 
| 426 |  | 
| 427 | for (std::size_t longopts_Index = 0; longopts[longopts_Index].val != 0; | 
| 428 | ++longopts_Index) { | 
| 429 | if (The_val == longopts[longopts_Index].val) { | 
| 430 | The_indexptr = longopts_Index; | 
| 431 | return; | 
| 432 | } | 
| 433 | } | 
| 434 | } | 
| 435 |  | 
| 436 | void apertium_tagger::flagOptionCase( | 
| 437 | bool (TaggerFlags::*GetFlag)(), | 
| 438 | void (TaggerFlags::*SetFlag)(const bool &)) { | 
| 439 | if ((TheFlags.*GetFlag)()) { | 
| 440 | std::stringstream what_; | 
| 441 | what_ << "unexpected '" << option_string() << "' following '" | 
| 442 | << option_string() << '\''; | 
| 443 | throw Exception::apertium_tagger::UnexpectedFlagOption(what_); | 
| 444 | } | 
| 445 |  | 
| 446 | (TheFlags.*SetFlag)(true); | 
| 447 | } | 
| 448 |  | 
| 449 | std::string apertium_tagger::option_string() { | 
| 450 | return option_string(The_indexptr); | 
| 451 | } | 
| 452 |  | 
| 453 | void apertium_tagger::functionTypeTypeOptionCase( | 
| 454 | const FunctionTypeType &FunctionTypeType_) { | 
| 455 | if (FunctionTypeTypeOption_indexptr) { | 
| 456 | std::stringstream what_; | 
| 457 | what_ << "unexpected '" << option_string() << "' following '" | 
| 458 | << option_string(*FunctionTypeTypeOption_indexptr) | 
| 459 | << '\''; | 
| 460 | throw Exception::apertium_tagger::UnexpectedFunctionTypeTypeOption(what_); | 
| 461 | } | 
| 462 |  | 
| 463 | TheFunctionTypeType = FunctionTypeType_; | 
| 464 | FunctionTypeTypeOption_indexptr = The_indexptr; | 
| 465 | } | 
| 466 |  | 
| 467 | void apertium_tagger::functionTypeOptionCase( | 
| 468 | const FunctionType &FunctionType_) { | 
| 469 | if (FunctionTypeOption_indexptr) { | 
| 470 | std::stringstream what_; | 
| 471 | what_ << "unexpected '" << option_string() << "' following '" | 
| 472 | << option_string(*FunctionTypeOption_indexptr) | 
| 473 | << '\''; | 
| 474 | throw Exception::apertium_tagger::UnexpectedFunctionTypeOption(what_); | 
| 475 | } | 
| 476 |  | 
| 477 | TheFunctionType = FunctionType_; | 
| 478 | FunctionTypeOption_indexptr = The_indexptr; | 
| 479 | } | 
| 480 |  | 
| 481 | void apertium_tagger::getIterationsArgument() { | 
| 482 | try { | 
| 483 | TheFunctionTypeOptionArgument = optarg_unsigned_long("ITERATIONS"); | 
| 484 | } catch (const ExceptionType &ExceptionType_) { | 
| 485 | std::stringstream what_; | 
| 486 | what_ << "invalid argument '" << optarg << "' for '" << option_string() | 
| 487 | << '\''; | 
| 488 | throw Exception::apertium_tagger::InvalidArgument(what_); | 
| 489 | } | 
| 490 | } | 
| 491 |  | 
| 492 | static unsigned long parse_unsigned_long(const char *metavar, const char *val) { | 
| 493 | char *str_end; | 
| 494 | errno = 0; | 
| 495 | unsigned long N_0 = std::strtoul(val, &str_end, 10); | 
| 496 |  | 
| 497 | if (*str_end != '\0') { | 
| 498 | std::stringstream what_; | 
| 499 | what_ << "can't convert " << metavar << " \"" << val << "\" to unsigned long"; | 
| 500 | throw Exception::apertium_tagger::str_end_not_eq_NULL(what_); | 
| 501 | } | 
| 502 |  | 
| 503 | if (*val == '\0') { | 
| 504 | std::stringstream what_; | 
| 505 | what_ << "can't convert " << metavar << " of size 1 \"\" to unsigned long"; | 
| 506 | throw Exception::apertium_tagger::optarg_eq_NULL(what_); | 
| 507 | } | 
| 508 |  | 
| 509 | if (errno == ERANGE) { | 
| 510 | std::stringstream what_; | 
| 511 | what_ << "can't convert " << metavar << " \"" << val | 
| 512 | << "\" to unsigned long, not in unsigned long range"; | 
| 513 | throw Exception::apertium_tagger::ERANGE_(what_); | 
| 514 | } | 
| 515 |  | 
| 516 | return N_0; | 
| 517 | } | 
| 518 |  | 
| 519 | unsigned long apertium_tagger::optarg_unsigned_long(const char *metavar) { | 
| 520 | return parse_unsigned_long(metavar, optarg); | 
| 521 | } | 
| 522 |  | 
| 523 | void apertium_tagger::get_file_arguments( | 
| 524 | bool get_crp_fn, | 
| 525 | char **DicFn, char **CrpFn, | 
| 526 | char **TaggedFn, char **UntaggedFn, | 
| 527 | char **TsxFn, char **ProbFn) { | 
| 528 | if (*TheFunctionType != Retrain) { | 
|  | | 14 |  | Assuming the condition is false |  | 
 | 
|  |  | 
| 529 | *DicFn = argv[optind++]; | 
| 530 | } | 
| 531 | if (get_crp_fn) { | 
|  |  | 
| 532 | *CrpFn = argv[optind++]; | 
| 533 | } | 
| 534 | if (*TheFunctionType == Supervised) { | 
|  |  | 
| 535 | *TsxFn = argv[optind++]; | 
| 536 | *ProbFn = argv[optind++]; | 
| 537 | *TaggedFn = argv[optind++]; | 
| 538 | } | 
| 539 | *UntaggedFn = argv[optind++]; | 
| 540 | if (*TheFunctionType == Supervised && !get_crp_fn) { | 
| 541 | *CrpFn = *UntaggedFn; | 
| 542 | } | 
| 543 | if (*TheFunctionType != Supervised) { | 
|  |  | 
| 544 | if (*TheFunctionType != Retrain) { | 
|  |  | 
| 545 | *TsxFn = argv[optind++]; | 
| 546 | } | 
| 547 | *ProbFn = argv[optind++]; | 
| 548 | } | 
| 549 | } | 
|  | | 20 |  | Returning without writing to '*TsxFn' |  | 
 | 
| 550 |  | 
| 551 | void apertium_tagger::init_FILE_Tagger(FILE_Tagger &FILE_Tagger_, string const &TsxFn) { | 
| 552 | FILE_Tagger_.deserialise(TsxFn); | 
| 553 | TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); | 
| 554 | } | 
| 555 |  | 
| 556 | MorphoStream* apertium_tagger::setup_untagged_morpho_stream( | 
| 557 | FILE_Tagger &FILE_Tagger_, | 
| 558 | char *DicFn, char *UntaggedFn, | 
| 559 | UFILE* *UntaggedCorpus) { | 
| 560 | *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", UntaggedFn, "r"); | 
| 561 |  | 
| 562 | FILE_Tagger_.read_dictionary(DicFn); | 
| 563 |  | 
| 564 | return new FileMorphoStream(UntaggedFn, true, &FILE_Tagger_.get_tagger_data()); | 
| 565 | } | 
| 566 |  | 
| 567 |  | 
| 568 |  | 
| 569 | void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { | 
| 570 | locale_global_(); | 
| 571 |  | 
| 572 | expect_file_arguments(nonoptarg, 1, 4); | 
| 573 |  | 
| 574 | std::ifstream SerialisedAnalysisFrequencies; | 
| 575 | try_open_fstream("SERIALISED_TAGGER", argv[optind], | 
| 576 | SerialisedAnalysisFrequencies); | 
| 577 |  | 
| 578 | try { | 
| 579 | StreamTagger_.deserialise(SerialisedAnalysisFrequencies); | 
| 580 | } catch (const ExceptionType &ExceptionType_) { | 
| 581 | std::stringstream what_; | 
| 582 | what_ << "can't deserialise SERIALISED_TAGGER file \"" << argv[optind] | 
| 583 | << "\" Reason: " << ExceptionType_.what(); | 
| 584 | throw Exception::apertium_tagger::deserialise(what_); | 
| 585 | } | 
| 586 |  | 
| 587 | if (nonoptarg < 2) { | 
| 588 | Stream Input(TheFlags); | 
| 589 | StreamTagger_.tag(Input, std::cout); | 
| 590 | return; | 
| 591 | } | 
| 592 |  | 
| 593 | Stream Input(TheFlags, argv[optind + 1]); | 
| 594 |  | 
| 595 | if (nonoptarg < 3) { | 
| 596 | StreamTagger_.tag(Input, std::cout); | 
| 597 | return; | 
| 598 | } | 
| 599 |  | 
| 600 | std::ofstream Output_stream; | 
| 601 | try_open_fstream("OUTPUT", argv[optind + 2], Output_stream); | 
| 602 |  | 
| 603 | StreamTagger_.tag(Input, Output_stream); | 
| 604 | } | 
| 605 |  | 
| 606 | void apertium_tagger::s_StreamTaggerTrainer( | 
| 607 | StreamTagger &StreamTaggerTrainer_) { | 
| 608 | locale_global_(); | 
| 609 |  | 
| 610 | if (TheFunctionTypeOptionArgument != 0 && *TheFunctionTypeType != Perceptron) { | 
| 611 | std::stringstream what_; | 
| 612 | what_ << "invalid argument '" << TheFunctionTypeOptionArgument | 
| 613 | << "' for '--supervised'"; | 
| 614 | throw Exception::apertium_tagger::InvalidArgument(what_); | 
| 615 | } | 
| 616 |  | 
| 617 | if (*TheFunctionTypeType == Perceptron) { | 
| 618 | expect_file_arguments(nonoptarg, 4); | 
| 619 | } else { | 
| 620 | expect_file_arguments(nonoptarg, 2); | 
| 621 | } | 
| 622 |  | 
| 623 | Stream TaggedCorpus(TheFlags, argv[optind + 1]); | 
| 624 |  | 
| 625 | if (*TheFunctionTypeType == Perceptron) { | 
| 626 | Stream UntaggedCorpus(TheFlags, argv[optind + 2]); | 
| 627 |  | 
| 628 | PerceptronTagger &pt = dynamic_cast<PerceptronTagger&>(StreamTaggerTrainer_); | 
| 629 | pt.read_spec(argv[optind + 3]); | 
| 630 | pt.train(TaggedCorpus, UntaggedCorpus, TheFunctionTypeOptionArgument); | 
| 631 | } else { | 
| 632 | StreamTaggerTrainer_.train(TaggedCorpus); | 
| 633 | } | 
| 634 |  | 
| 635 | std::ofstream Serialised_basic_Tagger; | 
| 636 | try_open_fstream("SERIALISED_TAGGER", argv[optind], | 
| 637 | Serialised_basic_Tagger); | 
| 638 |  | 
| 639 | StreamTaggerTrainer_.serialise(Serialised_basic_Tagger); | 
| 640 | } | 
| 641 |  | 
| 642 | void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { | 
| 643 | LtLocale::tryToSetLocale(); | 
| 644 | expect_file_arguments(nonoptarg, 1, 4); | 
| 645 |  | 
| 646 | FILE *Serialised_FILE_Tagger = | 
| 647 | try_open_file("SERIALISED_TAGGER", argv[optind], "rb"); | 
| 648 | FILE_Tagger_.deserialise(Serialised_FILE_Tagger); | 
| 649 | try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger); | 
| 650 | TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); | 
| 651 | TaggerWord::generate_marks = TheFlags.getMark(); | 
| 652 | const char* infile = NULL; | 
| 653 | UFILE* Output = u_finit(stdout, NULL, NULL); | 
| 654 | if (nonoptarg >= 2) { | 
| 655 | infile = argv[optind + 1]; | 
| 656 | if (nonoptarg >= 3) { | 
| 657 | Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); | 
| 658 | } | 
| 659 | } | 
| 660 | FILE_Tagger_.tagger(infile, Output); | 
| 661 | u_fclose(Output); | 
| 662 | } | 
| 663 |  | 
| 664 | void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { | 
| 665 | LtLocale::tryToSetLocale(); | 
| 666 |  | 
| 667 | expect_file_arguments(nonoptarg, 2); | 
| 668 |  | 
| 669 | char *ProbFn, *UntaggedFn; | 
| 670 |  | 
| 671 | get_file_arguments( | 
| 672 | false, | 
| 673 | NULL, NULL, NULL, &UntaggedFn, | 
| 674 | NULL, &ProbFn); | 
| 675 |  | 
| 676 | FILE *Serialised_FILE_Tagger = | 
| 677 | try_open_file("SERIALISED_TAGGER", ProbFn, "rb"); | 
| 678 | FILE_Tagger_.deserialise(Serialised_FILE_Tagger); | 
| 679 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); | 
| 680 |  | 
| 681 | TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); | 
| 682 |  | 
| 683 | UFILE* UntaggedCorpus; | 
| 684 | MorphoStream* ms = setup_untagged_morpho_stream( | 
| 685 | FILE_Tagger_, | 
| 686 | NULL, UntaggedFn, | 
| 687 | &UntaggedCorpus); | 
| 688 |  | 
| 689 | FILE_Tagger_.train(*ms, TheFunctionTypeOptionArgument); | 
| 690 | delete ms; | 
| 691 | u_fclose(UntaggedCorpus); | 
| 692 |  | 
| 693 | Serialised_FILE_Tagger = | 
| 694 | try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); | 
| 695 | FILE_Tagger_.serialise(Serialised_FILE_Tagger); | 
| 696 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); | 
| 697 | } | 
| 698 |  | 
| 699 | void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { | 
| 700 | LtLocale::tryToSetLocale(); | 
| 701 |  | 
| 702 | if (TheFunctionTypeOptionArgument == 0) { | 
|  | | 9 |  | Assuming field 'TheFunctionTypeOptionArgument' is not equal to 0 |  | 
 | 
|  |  | 
| 703 | expect_file_arguments(nonoptarg, 5, 7); | 
| 704 | } else { | 
| 705 | expect_file_arguments(nonoptarg, 6); | 
| 706 | } | 
| 707 | char *DicFn, *CrpFn, *TsxFn, *ProbFn, *TaggedFn, *UntaggedFn; | 
|  | | 11 |  | 'TsxFn' declared without an initial value |  | 
 | 
| 708 | bool do_unsup = nonoptarg == 6; | 
|  | | 12 |  | Assuming field 'nonoptarg' is not equal to 6 |  | 
 | 
| 709 |  | 
| 710 | get_file_arguments( | 
|  | | 13 |  | Calling 'apertium_tagger::get_file_arguments' |  | 
 | 
|  | | 21 |  | Returning from 'apertium_tagger::get_file_arguments' |  | 
 | 
| 711 | do_unsup, | 
| 712 | &DicFn, &CrpFn, &TaggedFn, &UntaggedFn, | 
| 713 | &TsxFn, &ProbFn); | 
| 714 | init_FILE_Tagger(FILE_Tagger_, TsxFn); | 
|  | | 22 |  | 1st function call argument is an uninitialized value | 
 | 
| 715 |  | 
| 716 | UFILE* UntaggedCorpus; | 
| 717 | MorphoStream* ms = setup_untagged_morpho_stream( | 
| 718 | FILE_Tagger_, | 
| 719 | DicFn, UntaggedFn, | 
| 720 | &UntaggedCorpus); | 
| 721 | FileMorphoStream tms(TaggedFn, true, &FILE_Tagger_.get_tagger_data()); | 
| 722 |  | 
| 723 | FILE_Tagger_.init_probabilities_from_tagged_text_(tms, *ms); | 
| 724 | delete ms; | 
| 725 | u_fclose(UntaggedCorpus); | 
| 726 |  | 
| 727 | if (do_unsup) { | 
| 728 | FILE_Tagger_.train(CrpFn, TheFunctionTypeOptionArgument); | 
| 729 | } | 
| 730 |  | 
| 731 | FILE *Serialised_FILE_Tagger = | 
| 732 | try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); | 
| 733 | FILE_Tagger_.serialise(Serialised_FILE_Tagger); | 
| 734 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); | 
| 735 | } | 
| 736 |  | 
| 737 | void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { | 
| 738 | LtLocale::tryToSetLocale(); | 
| 739 |  | 
| 740 | expect_file_arguments(nonoptarg, 4); | 
| 741 |  | 
| 742 | char *DicFn, *TsxFn, *ProbFn, *UntaggedFn; | 
| 743 | UntaggedFn = NULL; | 
| 744 |  | 
| 745 | get_file_arguments( | 
| 746 | false, | 
| 747 | &DicFn, NULL, NULL, &UntaggedFn, | 
| 748 | &TsxFn, &ProbFn); | 
| 749 | init_FILE_Tagger(FILE_Tagger_, TsxFn); | 
| 750 |  | 
| 751 | UFILE* UntaggedCorpus; | 
| 752 | MorphoStream* ms = setup_untagged_morpho_stream( | 
| 753 | FILE_Tagger_, | 
| 754 | DicFn, UntaggedFn, | 
| 755 | &UntaggedCorpus); | 
| 756 |  | 
| 757 | FILE_Tagger_.init_and_train(*ms, TheFunctionTypeOptionArgument); | 
| 758 | delete ms; | 
| 759 | u_fclose(UntaggedCorpus); | 
| 760 |  | 
| 761 | FILE *Serialised_FILE_Tagger = | 
| 762 | try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); | 
| 763 | FILE_Tagger_.serialise(Serialised_FILE_Tagger); | 
| 764 | try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); | 
| 765 |  | 
| 766 | } | 
| 767 | } |