| File: | tmx_aligner_tool.cc |
| Warning: | line 276, column 7 Value stored to 'globalQuality' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | /************************************************************************* |
| 2 | * * |
| 3 | * (C) Copyright 2004. Media Research Centre at the * |
| 4 | * Sociology and Communications Department of the * |
| 5 | * Budapest University of Technology and Economics. * |
| 6 | * * |
| 7 | * Developed by Daniel Varga. * |
| 8 | * * |
| 9 | * From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * |
| 10 | * * |
| 11 | *************************************************************************/ |
| 12 | #include <apertium/tmx_aligner_tool.h> |
| 13 | #include <lttoolbox/string_utils.h> |
| 14 | |
| 15 | namespace TMXAligner |
| 16 | { |
| 17 | |
| 18 | extern std::string hunglishDictionaryHome; |
| 19 | extern std::string hunglishExperimentsHome; |
| 20 | |
| 21 | void readTrailOrBisentenceList( std::istream& is, Trail& trail ) |
| 22 | { |
| 23 | trail.clear(); |
| 24 | while ( is.peek() != -1 ) |
| 25 | { |
| 26 | int huPos, enPos; |
| 27 | |
| 28 | is >> huPos; |
| 29 | if (is.peek()!=' ') |
| 30 | { |
| 31 | std::cerr << "no space in line" << std::endl; |
| 32 | throw "data error"; |
| 33 | } |
| 34 | is.ignore(); |
| 35 | |
| 36 | is >> enPos; |
| 37 | if (is.peek()!='\n') |
| 38 | { |
| 39 | std::cerr << "too much data in line" << std::endl; |
| 40 | throw "data error"; |
| 41 | } |
| 42 | is.ignore(); |
| 43 | |
| 44 | trail.push_back(std::make_pair(huPos,enPos)); |
| 45 | } |
| 46 | } |
| 47 | |
| 48 | void scoreBisentenceListByFile( const BisentenceList& bisentenceList, const std::string& handAlignFile ) |
| 49 | { |
| 50 | Trail trailHand; |
| 51 | std::ifstream is( handAlignFile.c_str() ); |
| 52 | readTrailOrBisentenceList( is, trailHand ); |
| 53 | |
| 54 | scoreBisentenceList( bisentenceList, trailHand ); |
| 55 | } |
| 56 | |
| 57 | void scoreTrailByFile( const Trail& bestTrail, const std::string& handAlignFile ) |
| 58 | { |
| 59 | Trail trailHand; |
| 60 | std::ifstream is( handAlignFile.c_str() ); |
| 61 | readTrailOrBisentenceList( is, trailHand ); |
| 62 | |
| 63 | scoreTrail( bestTrail, trailHand ); |
| 64 | } |
| 65 | |
| 66 | // TEMP TEMP |
| 67 | void logLexiconCoverageOfBicorpus( SentenceList& huSentenceList, SentenceList& enSentenceList, |
| 68 | const DictionaryItems& dictionaryItems ); |
| 69 | |
| 70 | |
| 71 | // The <p> scores should not be counted. This causes some complications. |
| 72 | // Otherwise, this is just the average score of segments. |
| 73 | // Currently this does not like segment lengths of more than two. |
| 74 | double globalScoreOfTrail( const Trail& trail, const AlignMatrix& dynMatrix, |
| 75 | const SentenceList& huSentenceListGarbled, const SentenceList& enSentenceListGarbled ) |
| 76 | { |
| 77 | TrailScoresInterval trailScoresInterval( trail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); |
| 78 | |
| 79 | return trailScoresInterval(0,trail.size()-1); |
| 80 | } |
| 81 | |
| 82 | |
| 83 | void collectBisentences( const Trail& bestTrail, const AlignMatrix& dynMatrix, |
| 84 | const SentenceList& huSentenceListPretty, const SentenceList& enSentenceListPretty, |
| 85 | SentenceList& huBisentences, SentenceList& enBisentences, |
| 86 | double qualityThreshold ) |
| 87 | { |
| 88 | huBisentences.clear(); |
| 89 | enBisentences.clear(); |
| 90 | |
| 91 | BisentenceList bisentenceList; |
| 92 | |
| 93 | TrailScores trailScores( bestTrail, dynMatrix ); |
| 94 | trailToBisentenceList( bestTrail, trailScores, qualityThreshold, bisentenceList ); |
| 95 | |
| 96 | for (size_t i=0; i<bisentenceList.size(); ++i ) |
| 97 | { |
| 98 | huBisentences.push_back( huSentenceListPretty[ bisentenceList[i].first ] ); |
| 99 | enBisentences.push_back( enSentenceListPretty[ bisentenceList[i].second ] ); |
| 100 | } |
| 101 | |
| 102 | // std::cerr << huBisentences.size() << " bisentences collected." << std::endl; |
| 103 | |
| 104 | } |
| 105 | |
| 106 | |
| 107 | void temporaryDumpOfAlignMatrix( std::ostream& os, const AlignMatrix& alignMatrix ) |
| 108 | { |
| 109 | for ( int huPos=0; huPos<alignMatrix.size(); ++huPos ) |
| 110 | { |
| 111 | int rowStart = alignMatrix.rowStart(huPos); |
| 112 | int rowEnd = alignMatrix.rowEnd(huPos); |
| 113 | for ( int enPos=rowStart; enPos<rowEnd; ++enPos ) |
| 114 | { |
| 115 | bool numeric = true; |
| 116 | if (numeric) |
| 117 | { |
| 118 | os << alignMatrix[huPos][enPos] << "\t" ; |
| 119 | } |
| 120 | else |
| 121 | { |
| 122 | if (alignMatrix[huPos][enPos]<0) |
| 123 | { |
| 124 | os << ". " ; |
| 125 | } |
| 126 | else if (alignMatrix[huPos][enPos]<10) |
| 127 | { |
| 128 | os << alignMatrix[huPos][enPos] << " " ; |
| 129 | } |
| 130 | else |
| 131 | { |
| 132 | os << "X " ; |
| 133 | } |
| 134 | } |
| 135 | } |
| 136 | os << std::endl; |
| 137 | } |
| 138 | } |
| 139 | |
| 140 | |
| 141 | double alignerToolWithObjects( const DictionaryItems& dictionary, |
| 142 | SentenceList& huSentenceListPretty, |
| 143 | SentenceList& enSentenceList, |
| 144 | const AlignParameters& alignParameters, |
| 145 | std::ostream& os ) |
| 146 | { |
| 147 | int huBookSize = huSentenceListPretty.size(); |
| 148 | int enBookSize = enSentenceList.size(); |
| 149 | |
| 150 | SentenceValues huLength,enLength; |
| 151 | setSentenceValues( huSentenceListPretty, huLength, alignParameters.utfCharCountingMode ); // Here we use the most originalest Hungarian text. |
| 152 | setSentenceValues( enSentenceList, enLength, alignParameters.utfCharCountingMode ); |
| 153 | |
| 154 | bool quasiglobal_stopwordRemoval = false; |
| 155 | // std::cerr << "quasiglobal_stopwordRemoval is set to " << quasiglobal_stopwordRemoval << std::endl; |
| 156 | if (quasiglobal_stopwordRemoval) |
| 157 | { |
| 158 | removeStopwords( huSentenceListPretty, enSentenceList ); |
| 159 | // std::cerr << "Stopwords removed." << std::endl; |
| 160 | } |
| 161 | |
| 162 | SentenceList huSentenceListGarbled, enSentenceListGarbled; |
| 163 | |
| 164 | normalizeTextsForIdentity( dictionary, |
| 165 | huSentenceListPretty, enSentenceList, |
| 166 | huSentenceListGarbled, enSentenceListGarbled ); |
| 167 | |
| 168 | const int minimalThickness = 500; |
| 169 | |
| 170 | const double quasiglobal_maximalSizeInMegabytes = 4000; |
| 171 | |
| 172 | const int maximalThickness = (int) ( |
| 173 | quasiglobal_maximalSizeInMegabytes |
| 174 | * 1024*1024 /*bytes*/ |
| 175 | / ( 2*sizeof(double)+sizeof(char) ) /* for the similarity, dynprog and trelli matrices */ |
| 176 | / (double)( huBookSize ) /* the memory consumption of alignMatrix( huBookSize, enBookSize, thickness ) is huBookSize*thickness. */ |
| 177 | / 2.4 /* unexplained empirically observed factor. linux top is weird. :) */ |
| 178 | ) ; |
| 179 | |
| 180 | // Note that thickness is not a radius, it's a diameter. |
| 181 | const double thicknessRatio = 10.0; |
| 182 | |
| 183 | int thickness = (int) ( (double)( huBookSize>enBookSize ? huBookSize : enBookSize ) / thicknessRatio ) ; |
| 184 | |
| 185 | thickness = ( thickness>minimalThickness ? thickness : minimalThickness ) ; |
| 186 | |
| 187 | if (thickness>maximalThickness) |
| 188 | { |
| 189 | // std::cerr << "WARNING: Downgrading planned thickness " << thickness << " to " << maximalThickness ; |
| 190 | // std::cerr << " to obey memory constraint of " << quasiglobal_maximalSizeInMegabytes << " megabytes " << std::endl; |
| 191 | // std::cerr << "You should recompile if you have much more physical RAM than that. People of the near-future, forgive me for the inconvenience." << std::endl; |
| 192 | |
| 193 | thickness = maximalThickness; |
| 194 | } |
| 195 | |
| 196 | AlignMatrix similarityMatrix( huBookSize, enBookSize, thickness, outsideOfRadiusValue ); |
| 197 | |
| 198 | sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix ); |
| 199 | // std::cerr << std::endl; |
| 200 | // std::cerr << "Rough translation-based similarity matrix ready." << std::endl; |
| 201 | |
| 202 | Trail bestTrail; |
| 203 | AlignMatrix dynMatrix( huBookSize+1, enBookSize+1, thickness, 1e30 ); |
| 204 | |
| 205 | align( similarityMatrix, huLength, enLength, bestTrail, dynMatrix ); |
| 206 | // std::cerr << "Align ready." << std::endl; |
| 207 | |
| 208 | double globalQuality; |
| 209 | globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, |
| 210 | huSentenceListGarbled, enSentenceListGarbled ); |
| 211 | |
| 212 | // std::cerr << "Global quality of unfiltered align " << globalQuality << std::endl; |
| 213 | |
| 214 | if (alignParameters.realignType==AlignParameters::NoRealign) |
| 215 | { |
| 216 | } |
| 217 | else |
| 218 | { |
| 219 | AlignMatrix similarityMatrixDetailed( huBookSize, enBookSize, thickness, outsideOfRadiusValue ); |
| 220 | |
| 221 | bool success = borderDetailedAlignMatrix( similarityMatrixDetailed, bestTrail, 5/*radius*/ ); |
| 222 | |
| 223 | if (!success) |
| 224 | { |
| 225 | // std::cerr << "Realign zone too close to quasidiagonal border. Abandoning realign. The align itself is suspicious." << std::endl; |
| 226 | } |
| 227 | else |
| 228 | { |
| 229 | // std::cerr << "Border of realign zone determined." << std::endl; |
| 230 | |
| 231 | switch (alignParameters.realignType) |
| 232 | { |
| 233 | case AlignParameters::ModelOneRealign: |
| 234 | { |
| 235 | IBMModelOne modelOne; |
| 236 | |
| 237 | SentenceList huBisentences,enBisentences; |
| 238 | |
| 239 | throw "unimplemented"; |
| 240 | // std::cerr << "Plausible bisentences filtered." << std::endl; |
| 241 | |
| 242 | modelOne.build(huBisentences,enBisentences); |
| 243 | // std::cerr << "IBM Model I ready." << std::endl; |
| 244 | |
| 245 | sentenceListsToAlignMatrixIBMModelOne( huSentenceListPretty, enSentenceList, modelOne, similarityMatrixDetailed ); |
| 246 | // std::cerr << "IBM Model I based similarity matrix ready." << std::endl; |
| 247 | break; |
| 248 | } |
| 249 | case AlignParameters::FineTranslationRealign: |
| 250 | { |
| 251 | TransLex transLex; |
| 252 | transLex.build(dictionary); |
| 253 | // std::cerr << "Hashtable for dictionary ready." << std::endl; |
| 254 | |
| 255 | sentenceListsToAlignMatrixTranslation( huSentenceListPretty, enSentenceList, transLex, similarityMatrixDetailed ); |
| 256 | |
| 257 | // std::cerr << "Fine translation-based similarity matrix ready." << std::endl; |
| 258 | break; |
| 259 | } |
| 260 | |
| 261 | case AlignParameters::NoRealign: |
| 262 | default: |
| 263 | { |
| 264 | break; |
| 265 | } |
| 266 | } |
| 267 | |
| 268 | Trail bestTrailDetailed; |
| 269 | AlignMatrix dynMatrixDetailed( huBookSize+1, enBookSize+1, thickness, 1e30 ); |
| 270 | align( similarityMatrixDetailed, huLength, enLength, bestTrailDetailed, dynMatrixDetailed ); |
| 271 | // std::cerr << "Detail realign ready." << std::endl; |
| 272 | |
| 273 | bestTrail = bestTrailDetailed; |
| 274 | dynMatrix = dynMatrixDetailed; |
| 275 | |
| 276 | globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, |
Value stored to 'globalQuality' is never read | |
| 277 | huSentenceListGarbled, enSentenceListGarbled ); |
| 278 | |
| 279 | // std::cerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; |
| 280 | } |
| 281 | } |
| 282 | |
| 283 | TrailScoresInterval trailScoresInterval( bestTrail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); |
| 284 | |
| 285 | if ( alignParameters.postprocessTrailQualityThreshold != -1 ) |
| 286 | { |
| 287 | postprocessTrail( bestTrail, trailScoresInterval, alignParameters.postprocessTrailQualityThreshold ); |
| 288 | // std::cerr << "Trail start and end postprocessed by score." << std::endl; |
| 289 | } |
| 290 | |
| 291 | if ( alignParameters.postprocessTrailStartAndEndQualityThreshold != -1 ) |
| 292 | { |
| 293 | postprocessTrailStartAndEnd( bestTrail, trailScoresInterval, alignParameters.postprocessTrailStartAndEndQualityThreshold ); |
| 294 | // std::cerr << "Trail start and end postprocessed by score." << std::endl; |
| 295 | } |
| 296 | |
| 297 | if ( alignParameters.postprocessTrailByTopologyQualityThreshold != -1 ) |
| 298 | { |
| 299 | postprocessTrailByTopology( bestTrail, alignParameters.postprocessTrailByTopologyQualityThreshold ); |
| 300 | // std::cerr << "Trail postprocessed by topology." << std::endl; |
| 301 | } |
| 302 | |
| 303 | bool quasiglobal_spaceOutBySentenceLength = true; |
| 304 | // std::cerr << "quasiglobal_spaceOutBySentenceLength is set to " << quasiglobal_spaceOutBySentenceLength << std::endl; |
| 305 | if (quasiglobal_spaceOutBySentenceLength) |
| 306 | { |
| 307 | spaceOutBySentenceLength( bestTrail, huSentenceListPretty, enSentenceList, alignParameters.utfCharCountingMode ); |
| 308 | // std::cerr << "Trail spaced out by sentence length." << std::endl; |
| 309 | } |
| 310 | |
| 311 | // In cautious mode, auto-aligned rundles are thrown away if |
| 312 | // their left or right neighbour holes are not one-to-one. |
| 313 | if (alignParameters.cautiousMode) |
| 314 | { |
| 315 | cautiouslyFilterTrail( bestTrail ); |
| 316 | // std::cerr << "Trail filtered by topology." << std::endl; |
| 317 | } |
| 318 | |
| 319 | globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, |
| 320 | huSentenceListGarbled, enSentenceListGarbled ); |
| 321 | |
| 322 | // std::cerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; |
| 323 | |
| 324 | bool textual = ! alignParameters.justSentenceIds ; |
| 325 | |
| 326 | if (alignParameters.justBisentences) |
| 327 | { |
| 328 | BisentenceList bisentenceList; |
| 329 | trailToBisentenceList( bestTrail, bisentenceList ); |
| 330 | |
| 331 | filterBisentenceListByQuality( bisentenceList, dynMatrix, alignParameters.qualityThreshold ); |
| 332 | |
| 333 | BisentenceListScores bisentenceListScores(bisentenceList, dynMatrix); |
| 334 | |
| 335 | for ( size_t i=0; i<bisentenceList.size(); ++i ) |
| 336 | { |
| 337 | int huPos = bisentenceList[i].first; |
| 338 | int enPos = bisentenceList[i].second; |
| 339 | |
| 340 | if (textual) |
| 341 | { |
| 342 | os << huSentenceListPretty[huPos].words; |
| 343 | } |
| 344 | else |
| 345 | { |
| 346 | os << huPos ; |
| 347 | } |
| 348 | |
| 349 | os << "\t" ; |
| 350 | |
| 351 | if (textual) |
| 352 | { |
| 353 | os << enSentenceList[enPos].words; |
| 354 | } |
| 355 | else |
| 356 | { |
| 357 | os << enPos ; |
| 358 | } |
| 359 | |
| 360 | os << "\t" << bisentenceListScores(i); |
| 361 | |
| 362 | os << std::endl; |
| 363 | } |
| 364 | |
| 365 | if (! alignParameters.handAlignFilename.empty()) |
| 366 | { |
| 367 | scoreBisentenceListByFile( bisentenceList, alignParameters.handAlignFilename ); |
| 368 | } |
| 369 | } |
| 370 | else |
| 371 | { |
| 372 | filterTrailByQuality( bestTrail, trailScoresInterval, alignParameters.qualityThreshold ); |
| 373 | |
| 374 | for ( size_t i=0; i<bestTrail.size()-1; ++i ) |
| 375 | { |
| 376 | // The [huPos, nexthuPos) interval corresponds to the [enPos, nextenPos) interval. |
| 377 | int huPos = bestTrail[i].first; |
| 378 | int enPos = bestTrail[i].second; |
| 379 | int nexthuPos = bestTrail[i+1].first; |
| 380 | int nextenPos = bestTrail[i+1].second; |
| 381 | |
| 382 | if (textual) |
| 383 | { |
| 384 | int j; |
| 385 | for ( j=huPos; j<nexthuPos; ++j ) |
| 386 | { |
| 387 | os << huSentenceListPretty[j].words; |
| 388 | |
| 389 | if (j+1<nexthuPos) |
| 390 | os << " "; // os << " ~~~ "; |
| 391 | } |
| 392 | |
| 393 | os << "\t" ; |
| 394 | |
| 395 | for ( j=enPos; j<nextenPos; ++j ) |
| 396 | { |
| 397 | os << enSentenceList[j].words; |
| 398 | if (j+1<nextenPos) |
| 399 | { |
| 400 | os << " "; // os << " ~~~ "; |
| 401 | } |
| 402 | } |
| 403 | } |
| 404 | else // (!textual) |
| 405 | { |
| 406 | os << huPos << "\t" << enPos ; |
| 407 | } |
| 408 | |
| 409 | os << "\t" << trailScoresInterval(i); |
| 410 | |
| 411 | os << std::endl; |
| 412 | } |
| 413 | |
| 414 | if (! alignParameters.handAlignFilename.empty()) |
| 415 | { |
| 416 | scoreTrailByFile( bestTrail, alignParameters.handAlignFilename ); |
| 417 | } |
| 418 | } |
| 419 | |
| 420 | return globalQuality; |
| 421 | } |
| 422 | |
| 423 | |
| 424 | void alignerToolWithFilenames( const DictionaryItems& dictionary, |
| 425 | const std::string& huFilename, const std::string& enFilename, |
| 426 | const AlignParameters& alignParameters, |
| 427 | const std::string& outputFilename) |
| 428 | { |
| 429 | std::ifstream hus(huFilename.c_str()); |
| 430 | SentenceList huSentenceListPretty; |
| 431 | huSentenceListPretty.readNoIds( hus ); |
| 432 | // std::cerr << huSentenceListPretty.size() << " hungarian sentences read." << std::endl; |
| 433 | |
| 434 | std::ifstream ens(enFilename.c_str()); |
| 435 | SentenceList enSentenceList; |
| 436 | enSentenceList.readNoIds( ens ); |
| 437 | // std::cerr << enSentenceList.size() << " english sentences read." << std::endl; |
| 438 | |
| 439 | if ( (enSentenceList. size() < huSentenceListPretty.size()/5) || |
| 440 | (huSentenceListPretty.size() < enSentenceList. size()/5) ) |
| 441 | { |
| 442 | // std::cerr << "Sizes differing too much. Ignoring files to avoid a rare loop bug." << std::endl; |
| 443 | return; |
| 444 | } |
| 445 | |
| 446 | if (outputFilename.empty()) |
| 447 | { |
| 448 | /* double globalQuality = */alignerToolWithObjects |
| 449 | ( dictionary, huSentenceListPretty, enSentenceList, alignParameters, std::cout ); |
| 450 | |
| 451 | // std::cerr << "Quality " << globalQuality << std::endl ; |
| 452 | |
| 453 | } |
| 454 | else |
| 455 | { |
| 456 | std::ofstream os(outputFilename.c_str()); |
| 457 | /*double globalQuality = */ alignerToolWithObjects |
| 458 | ( dictionary, huSentenceListPretty, enSentenceList, alignParameters, os ); |
| 459 | |
| 460 | // If you want to collect global quality information in batch mode, grep "^Quality" of stderr must do. |
| 461 | // std::cerr << "Quality\t" << outputFilename << "\t" << globalQuality << std::endl ; |
| 462 | } |
| 463 | |
| 464 | } |
| 465 | |
| 466 | void fillPercentParameter( Arguments& args, const std::string& argName, double& value ) |
| 467 | { |
| 468 | int valueInt; |
| 469 | if ( args.getNumericParam(argName, valueInt)) |
| 470 | { |
| 471 | value = 1.0 * valueInt / 100 ; |
| 472 | } |
| 473 | } |
| 474 | |
// Prints the command line help of the aligner tool to std::cerr.
// The text is assembled from adjacent string literals, one per output line.
void main_alignerToolUsage()
{
  static const char* const usageText =
    "Usage (either):\n"
    "alignerTool [ common_arguments ] [ -hand=hand_align_file ] dictionary_file source_text target_text\n"
    "\n"
    "or:\n"
    "alignerTool [ common_arguments ] -batch dictionary_file batch_file\n"
    "\n"
    "where\n"
    "common_arguments ::= [ -text ] [ -bisent ] [ -utf ] [ -cautious ] [ -realign [ -autodict=filename ] ]\n"
    "[ -thresh=n ] [ -ppthresh=n ] [ -headerthresh=n ] [ -topothresh=n ]\n"
    "\n"
    "Arguments:\n"
    "\n"
    "-text\n"
    "The output should be in text format, rather than the default (numeric) ladder format.\n"
    "\n"
    "-bisent\n"
    "Only bisentences (one-to-one alignment segments) are printed. In non-text mode, their\n"
    "starting rung is printed.\n"
    "\n"
    "-cautious\n"
    "In -bisent mode, only bisentences for which both the preceding and the following\n"
    "segments are one-to-one are printed. In the default non-bisent mode, only rungs\n"
    "for which both the preceding and the following segments are one-to-one are printed.\n"
    "\n"
    "-hand=file\n"
    "When this argument is given, the precision and recall of the alignment is calculated\n"
    "based on the manually built ladder file. Information like the following is written\n"
    "on the standard error: \n"
    "53 misaligned out of 6446 correct items, 6035 bets.\n"
    "Precision: 0.991218, Recall: 0.928017\n"
    "\n"
    "Note that by default, 'item' means rung. The switch -bisent also changes the semantics\n"
    "of the scoring from rung-based to bisentence-based and in this case 'item' means bisentences.\n"
    "See File formats about the format of this input align file.\n"
    "\n"
    "-autodict=filename\n"
    "The dictionary built during realign is saved to this file. By default, it is not saved.\n"
    "\n"
    "-utf\n"
    "The system uses the character counts of the sentences as information for the\n"
    "pairing of sentences. By default, it assumes one-byte character encoding such\n"
    "as ISO Latin-1 when calculating these counts. If our text is in UTF-8 format,\n"
    "byte counts and character counts are different, and we must use the -utf switch\n"
    "to force the system to properly calculate character counts.\n"
    "Note: UTF-16 input is not supported.\n"
    "\n"
    "Postfiltering options:\n"
    "There are various postprocessors which remove implausible rungs based on various heuristics.\n"
    "\n"
    "-thresh=n\n"
    "Don't print out segments with score lower than n/100.\n"
    "\n"
    "-ppthresh=n\n"
    "Filter rungs with less than n/100 average score in their vicinity.\n"
    "\n"
    "-headerthresh=n\n"
    "Filter all rungs at the start and end of texts until finding a reliably\n"
    "plausible region.\n"
    "\n"
    "-topothresh=n\n"
    "Filter rungs with less than n percent of one-to-one segments in their vicinity.\n"
    "\n";

  std::cerr << usageText;
}
| 541 | |
| 542 | int main_alignerTool(int argC, char* argV[]) |
| 543 | { |
| 544 | #ifndef _DEBUG |
| 545 | try |
| 546 | #endif |
| 547 | { |
| 548 | if (argC<4) |
| 549 | { |
| 550 | main_alignerToolUsage(); |
| 551 | throw ""; |
| 552 | } |
| 553 | |
| 554 | Arguments args; |
| 555 | std::vector<const char*> remains; |
| 556 | args.read( argC, argV, remains ); |
| 557 | |
| 558 | AlignParameters alignParameters; |
| 559 | |
| 560 | if (args.getSwitchCompact("text")) |
| 561 | { |
| 562 | alignParameters.justSentenceIds = false; |
| 563 | } |
| 564 | |
| 565 | if (args.getSwitchCompact("bisent")) |
| 566 | { |
| 567 | alignParameters.justBisentences = true; |
| 568 | } |
| 569 | |
| 570 | if (args.getSwitchCompact("cautious")) |
| 571 | { |
| 572 | alignParameters.cautiousMode = true; |
| 573 | } |
| 574 | |
| 575 | alignParameters.utfCharCountingMode = args.getSwitchCompact("utf"); |
| 576 | |
| 577 | fillPercentParameter( args, "thresh", alignParameters.qualityThreshold ); |
| 578 | |
| 579 | fillPercentParameter( args, "ppthresh", alignParameters.postprocessTrailQualityThreshold ); |
| 580 | |
| 581 | fillPercentParameter( args, "headerthresh", alignParameters.postprocessTrailStartAndEndQualityThreshold ); |
| 582 | |
| 583 | fillPercentParameter( args, "topothresh", alignParameters.postprocessTrailByTopologyQualityThreshold ); |
| 584 | |
| 585 | bool batchMode = args.getSwitchCompact("batch") ; |
| 586 | |
| 587 | if (batchMode && (remains.size()!=2) ) |
| 588 | { |
| 589 | std::cerr << "Batch mode requires exactly two file arguments." << std::endl; |
| 590 | std::cerr << std::endl; |
| 591 | |
| 592 | main_alignerToolUsage(); |
| 593 | throw "argument error"; |
| 594 | } |
| 595 | |
| 596 | std::string handArgumentname = "hand"; |
| 597 | if (args.find(handArgumentname)!=args.end()) |
| 598 | { |
| 599 | if (batchMode) |
| 600 | { |
| 601 | std::cerr << "-batch and -" << handArgumentname << " are incompatible switches." << std::endl; |
| 602 | throw "argument error"; |
| 603 | } |
| 604 | else |
| 605 | { |
| 606 | alignParameters.handAlignFilename = args[handArgumentname].dString ; |
| 607 | args.erase(handArgumentname); |
| 608 | |
| 609 | if (alignParameters.handAlignFilename.empty()) |
| 610 | { |
| 611 | std::cerr << "-" << handArgumentname << " switch requires a filename value." << std::endl; |
| 612 | throw "argument error"; |
| 613 | } |
| 614 | } |
| 615 | } |
| 616 | |
| 617 | std::string autoDictDumpArgumentname = "autodict"; |
| 618 | if (args.find(autoDictDumpArgumentname)!=args.end()) |
| 619 | { |
| 620 | if (batchMode) |
| 621 | { |
| 622 | std::cerr << "-batch and -" << autoDictDumpArgumentname << " are incompatible switches." << std::endl; |
| 623 | throw "argument error"; |
| 624 | } |
| 625 | else |
| 626 | { |
| 627 | alignParameters.autoDictionaryDumpFilename = args[autoDictDumpArgumentname].dString ; |
| 628 | args.erase(autoDictDumpArgumentname); |
| 629 | |
| 630 | if (alignParameters.autoDictionaryDumpFilename.empty()) |
| 631 | { |
| 632 | std::cerr << "-" << autoDictDumpArgumentname << " switch requires a filename value." << std::endl; |
| 633 | throw "argument error"; |
| 634 | } |
| 635 | } |
| 636 | } |
| 637 | |
| 638 | if (!batchMode && (remains.size()!=3) ) |
| 639 | { |
| 640 | std::cerr << "Nonbatch mode requires exactly three file arguments." << std::endl; |
| 641 | std::cerr << std::endl; |
| 642 | |
| 643 | main_alignerToolUsage(); |
| 644 | throw "argument error"; |
| 645 | } |
| 646 | |
| 647 | try |
| 648 | { |
| 649 | args.checkEmptyArgs(); |
| 650 | } |
| 651 | catch (...) |
| 652 | { |
| 653 | std::cerr << std::endl; |
| 654 | |
| 655 | main_alignerToolUsage(); |
| 656 | throw "argument error"; |
| 657 | } |
| 658 | |
| 659 | // std::cerr << "Reading dictionary..." << std::endl; |
| 660 | const char* dicFilename = remains[0] ; |
| 661 | DictionaryItems dictionary; |
| 662 | std::ifstream dis(dicFilename); |
| 663 | dictionary.read(dis); |
| 664 | |
| 665 | if (batchMode) |
| 666 | { |
| 667 | const char* batchFilename = remains[1] ; |
| 668 | std::ifstream bis(batchFilename); |
| 669 | |
| 670 | while (bis.good()&&!bis.eof()) |
| 671 | { |
| 672 | std::string line; |
| 673 | std::getline(bis,line); |
| 674 | |
| 675 | std::vector<std::string> words; |
| 676 | split( line, words, '\t' ); |
| 677 | |
| 678 | if (words.size()!=3) |
| 679 | { |
| 680 | std::cerr << "Batch file has incorrect format." << std::endl; |
| 681 | throw "data error"; |
| 682 | } |
| 683 | |
| 684 | std::string huFilename, enFilename, outFilename; |
| 685 | huFilename = words[0]; |
| 686 | enFilename = words[1]; |
| 687 | outFilename = words[2]; |
| 688 | |
| 689 | // std::cerr << "Processing " << outFilename << std::endl; |
| 690 | bool failed = false; |
| 691 | try |
| 692 | { |
| 693 | alignerToolWithFilenames( dictionary, huFilename, enFilename, alignParameters, outFilename ); |
| 694 | } |
| 695 | catch ( const char* errorType ) |
| 696 | { |
| 697 | std::cerr << errorType << std::endl; |
| 698 | failed = true; |
| 699 | } |
| 700 | catch ( std::exception& e ) |
| 701 | { |
| 702 | std::cerr << "some failed assertion:" << e.what() << std::endl; |
| 703 | failed = true; |
| 704 | } |
| 705 | catch ( ... ) |
| 706 | { |
| 707 | std::cerr << "some unknown failed assertion..." << std::endl; |
| 708 | failed = true; |
| 709 | } |
| 710 | |
| 711 | if (failed) |
| 712 | { |
| 713 | std::cerr << "Align failed for " << outFilename << std::endl; |
| 714 | } |
| 715 | } |
| 716 | } |
| 717 | else |
| 718 | { |
| 719 | const char* huFilename = remains[1] ; |
| 720 | const char* enFilename = remains[2] ; |
| 721 | |
| 722 | alignerToolWithFilenames( dictionary, huFilename, enFilename, alignParameters ); |
| 723 | } |
| 724 | } |
| 725 | #ifndef _DEBUG |
| 726 | catch ( const char* errorType ) |
| 727 | { |
| 728 | std::cerr << errorType << std::endl; |
| 729 | return -1; |
| 730 | } |
| 731 | catch ( std::exception& e ) |
| 732 | { |
| 733 | std::cerr << "some failed assertion:" << e.what() << std::endl; |
| 734 | return -1; |
| 735 | } |
| 736 | catch ( ... ) |
| 737 | { |
| 738 | std::cerr << "some unknown failed assertion..." << std::endl; |
| 739 | return -1; |
| 740 | } |
| 741 | #endif |
| 742 | return 0; |
| 743 | } |
| 744 | |
| 745 | } |