File: | tmx_aligner_tool.cc |
Warning: | line 276, column 7 Value stored to 'globalQuality' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /************************************************************************* |
2 | * * |
3 | * (C) Copyright 2004. Media Research Centre at the * |
4 | * Sociology and Communications Department of the * |
5 | * Budapest University of Technology and Economics. * |
6 | * * |
7 | * Developed by Daniel Varga. * |
8 | * * |
9 | * From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * |
10 | * * |
11 | *************************************************************************/ |
12 | #include <apertium/tmx_aligner_tool.h> |
13 | #include <lttoolbox/string_utils.h> |
14 | |
15 | namespace TMXAligner |
16 | { |
17 | |
18 | extern std::string hunglishDictionaryHome; |
19 | extern std::string hunglishExperimentsHome; |
20 | |
21 | void readTrailOrBisentenceList( std::istream& is, Trail& trail ) |
22 | { |
23 | trail.clear(); |
24 | while ( is.peek() != -1 ) |
25 | { |
26 | int huPos, enPos; |
27 | |
28 | is >> huPos; |
29 | if (is.peek()!=' ') |
30 | { |
31 | std::cerr << "no space in line" << std::endl; |
32 | throw "data error"; |
33 | } |
34 | is.ignore(); |
35 | |
36 | is >> enPos; |
37 | if (is.peek()!='\n') |
38 | { |
39 | std::cerr << "too much data in line" << std::endl; |
40 | throw "data error"; |
41 | } |
42 | is.ignore(); |
43 | |
44 | trail.push_back(std::make_pair(huPos,enPos)); |
45 | } |
46 | } |
47 | |
48 | void scoreBisentenceListByFile( const BisentenceList& bisentenceList, const std::string& handAlignFile ) |
49 | { |
50 | Trail trailHand; |
51 | std::ifstream is( handAlignFile.c_str() ); |
52 | readTrailOrBisentenceList( is, trailHand ); |
53 | |
54 | scoreBisentenceList( bisentenceList, trailHand ); |
55 | } |
56 | |
57 | void scoreTrailByFile( const Trail& bestTrail, const std::string& handAlignFile ) |
58 | { |
59 | Trail trailHand; |
60 | std::ifstream is( handAlignFile.c_str() ); |
61 | readTrailOrBisentenceList( is, trailHand ); |
62 | |
63 | scoreTrail( bestTrail, trailHand ); |
64 | } |
65 | |
66 | // TEMP TEMP |
67 | void logLexiconCoverageOfBicorpus( SentenceList& huSentenceList, SentenceList& enSentenceList, |
68 | const DictionaryItems& dictionaryItems ); |
69 | |
70 | |
71 | // The <p> scores should not be counted. This causes some complications. |
72 | // Otherwise, this is just the average score of segments. |
73 | // Currently this does not like segment lengths of more than two. |
74 | double globalScoreOfTrail( const Trail& trail, const AlignMatrix& dynMatrix, |
75 | const SentenceList& huSentenceListGarbled, const SentenceList& enSentenceListGarbled ) |
76 | { |
77 | TrailScoresInterval trailScoresInterval( trail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); |
78 | |
79 | return trailScoresInterval(0,trail.size()-1); |
80 | } |
81 | |
82 | |
83 | void collectBisentences( const Trail& bestTrail, const AlignMatrix& dynMatrix, |
84 | const SentenceList& huSentenceListPretty, const SentenceList& enSentenceListPretty, |
85 | SentenceList& huBisentences, SentenceList& enBisentences, |
86 | double qualityThreshold ) |
87 | { |
88 | huBisentences.clear(); |
89 | enBisentences.clear(); |
90 | |
91 | BisentenceList bisentenceList; |
92 | |
93 | TrailScores trailScores( bestTrail, dynMatrix ); |
94 | trailToBisentenceList( bestTrail, trailScores, qualityThreshold, bisentenceList ); |
95 | |
96 | for (size_t i=0; i<bisentenceList.size(); ++i ) |
97 | { |
98 | huBisentences.push_back( huSentenceListPretty[ bisentenceList[i].first ] ); |
99 | enBisentences.push_back( enSentenceListPretty[ bisentenceList[i].second ] ); |
100 | } |
101 | |
102 | // std::cerr << huBisentences.size() << " bisentences collected." << std::endl; |
103 | |
104 | } |
105 | |
106 | |
107 | void temporaryDumpOfAlignMatrix( std::ostream& os, const AlignMatrix& alignMatrix ) |
108 | { |
109 | for ( int huPos=0; huPos<alignMatrix.size(); ++huPos ) |
110 | { |
111 | int rowStart = alignMatrix.rowStart(huPos); |
112 | int rowEnd = alignMatrix.rowEnd(huPos); |
113 | for ( int enPos=rowStart; enPos<rowEnd; ++enPos ) |
114 | { |
115 | bool numeric = true; |
116 | if (numeric) |
117 | { |
118 | os << alignMatrix[huPos][enPos] << "\t" ; |
119 | } |
120 | else |
121 | { |
122 | if (alignMatrix[huPos][enPos]<0) |
123 | { |
124 | os << ". " ; |
125 | } |
126 | else if (alignMatrix[huPos][enPos]<10) |
127 | { |
128 | os << alignMatrix[huPos][enPos] << " " ; |
129 | } |
130 | else |
131 | { |
132 | os << "X " ; |
133 | } |
134 | } |
135 | } |
136 | os << std::endl; |
137 | } |
138 | } |
139 | |
140 | |
141 | double alignerToolWithObjects( const DictionaryItems& dictionary, |
142 | SentenceList& huSentenceListPretty, |
143 | SentenceList& enSentenceList, |
144 | const AlignParameters& alignParameters, |
145 | std::ostream& os ) |
146 | { |
147 | int huBookSize = huSentenceListPretty.size(); |
148 | int enBookSize = enSentenceList.size(); |
149 | |
150 | SentenceValues huLength,enLength; |
151 | setSentenceValues( huSentenceListPretty, huLength, alignParameters.utfCharCountingMode ); // Here we use the most originalest Hungarian text. |
152 | setSentenceValues( enSentenceList, enLength, alignParameters.utfCharCountingMode ); |
153 | |
154 | bool quasiglobal_stopwordRemoval = false; |
155 | // std::cerr << "quasiglobal_stopwordRemoval is set to " << quasiglobal_stopwordRemoval << std::endl; |
156 | if (quasiglobal_stopwordRemoval) |
157 | { |
158 | removeStopwords( huSentenceListPretty, enSentenceList ); |
159 | // std::cerr << "Stopwords removed." << std::endl; |
160 | } |
161 | |
162 | SentenceList huSentenceListGarbled, enSentenceListGarbled; |
163 | |
164 | normalizeTextsForIdentity( dictionary, |
165 | huSentenceListPretty, enSentenceList, |
166 | huSentenceListGarbled, enSentenceListGarbled ); |
167 | |
168 | const int minimalThickness = 500; |
169 | |
170 | const double quasiglobal_maximalSizeInMegabytes = 4000; |
171 | |
172 | const int maximalThickness = (int) ( |
173 | quasiglobal_maximalSizeInMegabytes |
174 | * 1024*1024 /*bytes*/ |
175 | / ( 2*sizeof(double)+sizeof(char) ) /* for the similarity, dynprog and trelli matrices */ |
176 | / (double)( huBookSize ) /* the memory consumption of alignMatrix( huBookSize, enBookSize, thickness ) is huBookSize*thickness. */ |
177 | / 2.4 /* unexplained empirically observed factor. linux top is weird. :) */ |
178 | ) ; |
179 | |
180 | // Note that thickness is not a radius, it's a diameter. |
181 | const double thicknessRatio = 10.0; |
182 | |
183 | int thickness = (int) ( (double)( huBookSize>enBookSize ? huBookSize : enBookSize ) / thicknessRatio ) ; |
184 | |
185 | thickness = ( thickness>minimalThickness ? thickness : minimalThickness ) ; |
186 | |
187 | if (thickness>maximalThickness) |
188 | { |
189 | // std::cerr << "WARNING: Downgrading planned thickness " << thickness << " to " << maximalThickness ; |
190 | // std::cerr << " to obey memory constraint of " << quasiglobal_maximalSizeInMegabytes << " megabytes " << std::endl; |
191 | // std::cerr << "You should recompile if you have much more physical RAM than that. People of the near-future, forgive me for the inconvenience." << std::endl; |
192 | |
193 | thickness = maximalThickness; |
194 | } |
195 | |
196 | AlignMatrix similarityMatrix( huBookSize, enBookSize, thickness, outsideOfRadiusValue ); |
197 | |
198 | sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix ); |
199 | // std::cerr << std::endl; |
200 | // std::cerr << "Rough translation-based similarity matrix ready." << std::endl; |
201 | |
202 | Trail bestTrail; |
203 | AlignMatrix dynMatrix( huBookSize+1, enBookSize+1, thickness, 1e30 ); |
204 | |
205 | align( similarityMatrix, huLength, enLength, bestTrail, dynMatrix ); |
206 | // std::cerr << "Align ready." << std::endl; |
207 | |
208 | double globalQuality; |
209 | globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, |
210 | huSentenceListGarbled, enSentenceListGarbled ); |
211 | |
212 | // std::cerr << "Global quality of unfiltered align " << globalQuality << std::endl; |
213 | |
214 | if (alignParameters.realignType==AlignParameters::NoRealign) |
215 | { |
216 | } |
217 | else |
218 | { |
219 | AlignMatrix similarityMatrixDetailed( huBookSize, enBookSize, thickness, outsideOfRadiusValue ); |
220 | |
221 | bool success = borderDetailedAlignMatrix( similarityMatrixDetailed, bestTrail, 5/*radius*/ ); |
222 | |
223 | if (!success) |
224 | { |
225 | // std::cerr << "Realign zone too close to quasidiagonal border. Abandoning realign. The align itself is suspicious." << std::endl; |
226 | } |
227 | else |
228 | { |
229 | // std::cerr << "Border of realign zone determined." << std::endl; |
230 | |
231 | switch (alignParameters.realignType) |
232 | { |
233 | case AlignParameters::ModelOneRealign: |
234 | { |
235 | IBMModelOne modelOne; |
236 | |
237 | SentenceList huBisentences,enBisentences; |
238 | |
239 | throw "unimplemented"; |
240 | // std::cerr << "Plausible bisentences filtered." << std::endl; |
241 | |
242 | modelOne.build(huBisentences,enBisentences); |
243 | // std::cerr << "IBM Model I ready." << std::endl; |
244 | |
245 | sentenceListsToAlignMatrixIBMModelOne( huSentenceListPretty, enSentenceList, modelOne, similarityMatrixDetailed ); |
246 | // std::cerr << "IBM Model I based similarity matrix ready." << std::endl; |
247 | break; |
248 | } |
249 | case AlignParameters::FineTranslationRealign: |
250 | { |
251 | TransLex transLex; |
252 | transLex.build(dictionary); |
253 | // std::cerr << "Hashtable for dictionary ready." << std::endl; |
254 | |
255 | sentenceListsToAlignMatrixTranslation( huSentenceListPretty, enSentenceList, transLex, similarityMatrixDetailed ); |
256 | |
257 | // std::cerr << "Fine translation-based similarity matrix ready." << std::endl; |
258 | break; |
259 | } |
260 | |
261 | case AlignParameters::NoRealign: |
262 | default: |
263 | { |
264 | break; |
265 | } |
266 | } |
267 | |
268 | Trail bestTrailDetailed; |
269 | AlignMatrix dynMatrixDetailed( huBookSize+1, enBookSize+1, thickness, 1e30 ); |
270 | align( similarityMatrixDetailed, huLength, enLength, bestTrailDetailed, dynMatrixDetailed ); |
271 | // std::cerr << "Detail realign ready." << std::endl; |
272 | |
273 | bestTrail = bestTrailDetailed; |
274 | dynMatrix = dynMatrixDetailed; |
275 | |
276 | globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, |
Value stored to 'globalQuality' is never read | |
277 | huSentenceListGarbled, enSentenceListGarbled ); |
278 | |
279 | // std::cerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; |
280 | } |
281 | } |
282 | |
283 | TrailScoresInterval trailScoresInterval( bestTrail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); |
284 | |
285 | if ( alignParameters.postprocessTrailQualityThreshold != -1 ) |
286 | { |
287 | postprocessTrail( bestTrail, trailScoresInterval, alignParameters.postprocessTrailQualityThreshold ); |
288 | // std::cerr << "Trail start and end postprocessed by score." << std::endl; |
289 | } |
290 | |
291 | if ( alignParameters.postprocessTrailStartAndEndQualityThreshold != -1 ) |
292 | { |
293 | postprocessTrailStartAndEnd( bestTrail, trailScoresInterval, alignParameters.postprocessTrailStartAndEndQualityThreshold ); |
294 | // std::cerr << "Trail start and end postprocessed by score." << std::endl; |
295 | } |
296 | |
297 | if ( alignParameters.postprocessTrailByTopologyQualityThreshold != -1 ) |
298 | { |
299 | postprocessTrailByTopology( bestTrail, alignParameters.postprocessTrailByTopologyQualityThreshold ); |
300 | // std::cerr << "Trail postprocessed by topology." << std::endl; |
301 | } |
302 | |
303 | bool quasiglobal_spaceOutBySentenceLength = true; |
304 | // std::cerr << "quasiglobal_spaceOutBySentenceLength is set to " << quasiglobal_spaceOutBySentenceLength << std::endl; |
305 | if (quasiglobal_spaceOutBySentenceLength) |
306 | { |
307 | spaceOutBySentenceLength( bestTrail, huSentenceListPretty, enSentenceList, alignParameters.utfCharCountingMode ); |
308 | // std::cerr << "Trail spaced out by sentence length." << std::endl; |
309 | } |
310 | |
311 | // In cautious mode, auto-aligned rundles are thrown away if |
312 | // their left or right neighbour holes are not one-to-one. |
313 | if (alignParameters.cautiousMode) |
314 | { |
315 | cautiouslyFilterTrail( bestTrail ); |
316 | // std::cerr << "Trail filtered by topology." << std::endl; |
317 | } |
318 | |
319 | globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, |
320 | huSentenceListGarbled, enSentenceListGarbled ); |
321 | |
322 | // std::cerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; |
323 | |
324 | bool textual = ! alignParameters.justSentenceIds ; |
325 | |
326 | if (alignParameters.justBisentences) |
327 | { |
328 | BisentenceList bisentenceList; |
329 | trailToBisentenceList( bestTrail, bisentenceList ); |
330 | |
331 | filterBisentenceListByQuality( bisentenceList, dynMatrix, alignParameters.qualityThreshold ); |
332 | |
333 | BisentenceListScores bisentenceListScores(bisentenceList, dynMatrix); |
334 | |
335 | for ( size_t i=0; i<bisentenceList.size(); ++i ) |
336 | { |
337 | int huPos = bisentenceList[i].first; |
338 | int enPos = bisentenceList[i].second; |
339 | |
340 | if (textual) |
341 | { |
342 | os << huSentenceListPretty[huPos].words; |
343 | } |
344 | else |
345 | { |
346 | os << huPos ; |
347 | } |
348 | |
349 | os << "\t" ; |
350 | |
351 | if (textual) |
352 | { |
353 | os << enSentenceList[enPos].words; |
354 | } |
355 | else |
356 | { |
357 | os << enPos ; |
358 | } |
359 | |
360 | os << "\t" << bisentenceListScores(i); |
361 | |
362 | os << std::endl; |
363 | } |
364 | |
365 | if (! alignParameters.handAlignFilename.empty()) |
366 | { |
367 | scoreBisentenceListByFile( bisentenceList, alignParameters.handAlignFilename ); |
368 | } |
369 | } |
370 | else |
371 | { |
372 | filterTrailByQuality( bestTrail, trailScoresInterval, alignParameters.qualityThreshold ); |
373 | |
374 | for ( size_t i=0; i<bestTrail.size()-1; ++i ) |
375 | { |
376 | // The [huPos, nexthuPos) interval corresponds to the [enPos, nextenPos) interval. |
377 | int huPos = bestTrail[i].first; |
378 | int enPos = bestTrail[i].second; |
379 | int nexthuPos = bestTrail[i+1].first; |
380 | int nextenPos = bestTrail[i+1].second; |
381 | |
382 | if (textual) |
383 | { |
384 | int j; |
385 | for ( j=huPos; j<nexthuPos; ++j ) |
386 | { |
387 | os << huSentenceListPretty[j].words; |
388 | |
389 | if (j+1<nexthuPos) |
390 | os << " "; // os << " ~~~ "; |
391 | } |
392 | |
393 | os << "\t" ; |
394 | |
395 | for ( j=enPos; j<nextenPos; ++j ) |
396 | { |
397 | os << enSentenceList[j].words; |
398 | if (j+1<nextenPos) |
399 | { |
400 | os << " "; // os << " ~~~ "; |
401 | } |
402 | } |
403 | } |
404 | else // (!textual) |
405 | { |
406 | os << huPos << "\t" << enPos ; |
407 | } |
408 | |
409 | os << "\t" << trailScoresInterval(i); |
410 | |
411 | os << std::endl; |
412 | } |
413 | |
414 | if (! alignParameters.handAlignFilename.empty()) |
415 | { |
416 | scoreTrailByFile( bestTrail, alignParameters.handAlignFilename ); |
417 | } |
418 | } |
419 | |
420 | return globalQuality; |
421 | } |
422 | |
423 | |
424 | void alignerToolWithFilenames( const DictionaryItems& dictionary, |
425 | const std::string& huFilename, const std::string& enFilename, |
426 | const AlignParameters& alignParameters, |
427 | const std::string& outputFilename) |
428 | { |
429 | std::ifstream hus(huFilename.c_str()); |
430 | SentenceList huSentenceListPretty; |
431 | huSentenceListPretty.readNoIds( hus ); |
432 | // std::cerr << huSentenceListPretty.size() << " hungarian sentences read." << std::endl; |
433 | |
434 | std::ifstream ens(enFilename.c_str()); |
435 | SentenceList enSentenceList; |
436 | enSentenceList.readNoIds( ens ); |
437 | // std::cerr << enSentenceList.size() << " english sentences read." << std::endl; |
438 | |
439 | if ( (enSentenceList. size() < huSentenceListPretty.size()/5) || |
440 | (huSentenceListPretty.size() < enSentenceList. size()/5) ) |
441 | { |
442 | // std::cerr << "Sizes differing too much. Ignoring files to avoid a rare loop bug." << std::endl; |
443 | return; |
444 | } |
445 | |
446 | if (outputFilename.empty()) |
447 | { |
448 | /* double globalQuality = */alignerToolWithObjects |
449 | ( dictionary, huSentenceListPretty, enSentenceList, alignParameters, std::cout ); |
450 | |
451 | // std::cerr << "Quality " << globalQuality << std::endl ; |
452 | |
453 | } |
454 | else |
455 | { |
456 | std::ofstream os(outputFilename.c_str()); |
457 | /*double globalQuality = */ alignerToolWithObjects |
458 | ( dictionary, huSentenceListPretty, enSentenceList, alignParameters, os ); |
459 | |
460 | // If you want to collect global quality information in batch mode, grep "^Quality" of stderr must do. |
461 | // std::cerr << "Quality\t" << outputFilename << "\t" << globalQuality << std::endl ; |
462 | } |
463 | |
464 | } |
465 | |
466 | void fillPercentParameter( Arguments& args, const std::string& argName, double& value ) |
467 | { |
468 | int valueInt; |
469 | if ( args.getNumericParam(argName, valueInt)) |
470 | { |
471 | value = 1.0 * valueInt / 100 ; |
472 | } |
473 | } |
474 | |
// Prints the command line help of alignerTool to std::cerr.
void main_alignerToolUsage()
{
  // One literal per output line; adjacent literals are concatenated by the
  // compiler into a single string.
  static const char usageText[] =
    "Usage (either):\n"
    "alignerTool [ common_arguments ] [ -hand=hand_align_file ] dictionary_file source_text target_text\n"
    "\n"
    "or:\n"
    "alignerTool [ common_arguments ] -batch dictionary_file batch_file\n"
    "\n"
    "where\n"
    "common_arguments ::= [ -text ] [ -bisent ] [ -utf ] [ -cautious ] [ -realign [ -autodict=filename ] ]\n"
    "[ -thresh=n ] [ -ppthresh=n ] [ -headerthresh=n ] [ -topothresh=n ]\n"
    "\n"
    "Arguments:\n"
    "\n"
    "-text\n"
    "The output should be in text format, rather than the default (numeric) ladder format.\n"
    "\n"
    "-bisent\n"
    "Only bisentences (one-to-one alignment segments) are printed. In non-text mode, their\n"
    "starting rung is printed.\n"
    "\n"
    "-cautious\n"
    "In -bisent mode, only bisentences for which both the preceding and the following\n"
    "segments are one-to-one are printed. In the default non-bisent mode, only rungs\n"
    "for which both the preceding and the following segments are one-to-one are printed.\n"
    "\n"
    "-hand=file\n"
    "When this argument is given, the precision and recall of the alignment is calculated\n"
    "based on the manually built ladder file. Information like the following is written\n"
    "on the standard error: \n"
    "53 misaligned out of 6446 correct items, 6035 bets.\n"
    "Precision: 0.991218, Recall: 0.928017\n"
    "\n"
    "Note that by default, 'item' means rung. The switch -bisent also changes the semantics\n"
    "of the scoring from rung-based to bisentence-based and in this case 'item' means bisentences.\n"
    "See File formats about the format of this input align file.\n"
    "\n"
    "-autodict=filename\n"
    "The dictionary built during realign is saved to this file. By default, it is not saved.\n"
    "\n"
    "-utf\n"
    "The system uses the character counts of the sentences as information for the\n"
    "pairing of sentences. By default, it assumes one-byte character encoding such\n"
    "as ISO Latin-1 when calculating these counts. If our text is in UTF-8 format,\n"
    "byte counts and character counts are different, and we must use the -utf switch\n"
    "to force the system to properly calculate character counts.\n"
    "Note: UTF-16 input is not supported.\n"
    "\n"
    "Postfiltering options:\n"
    "There are various postprocessors which remove implausible rungs based on various heuristics.\n"
    "\n"
    "-thresh=n\n"
    "Don't print out segments with score lower than n/100.\n"
    "\n"
    "-ppthresh=n\n"
    "Filter rungs with less than n/100 average score in their vicinity.\n"
    "\n"
    "-headerthresh=n\n"
    "Filter all rungs at the start and end of texts until finding a reliably\n"
    "plausible region.\n"
    "\n"
    "-topothresh=n\n"
    "Filter rungs with less than n percent of one-to-one segments in their vicinity.\n"
    "\n";

  std::cerr << usageText;
}
541 | |
542 | int main_alignerTool(int argC, char* argV[]) |
543 | { |
544 | #ifndef _DEBUG |
545 | try |
546 | #endif |
547 | { |
548 | if (argC<4) |
549 | { |
550 | main_alignerToolUsage(); |
551 | throw ""; |
552 | } |
553 | |
554 | Arguments args; |
555 | std::vector<const char*> remains; |
556 | args.read( argC, argV, remains ); |
557 | |
558 | AlignParameters alignParameters; |
559 | |
560 | if (args.getSwitchCompact("text")) |
561 | { |
562 | alignParameters.justSentenceIds = false; |
563 | } |
564 | |
565 | if (args.getSwitchCompact("bisent")) |
566 | { |
567 | alignParameters.justBisentences = true; |
568 | } |
569 | |
570 | if (args.getSwitchCompact("cautious")) |
571 | { |
572 | alignParameters.cautiousMode = true; |
573 | } |
574 | |
575 | alignParameters.utfCharCountingMode = args.getSwitchCompact("utf"); |
576 | |
577 | fillPercentParameter( args, "thresh", alignParameters.qualityThreshold ); |
578 | |
579 | fillPercentParameter( args, "ppthresh", alignParameters.postprocessTrailQualityThreshold ); |
580 | |
581 | fillPercentParameter( args, "headerthresh", alignParameters.postprocessTrailStartAndEndQualityThreshold ); |
582 | |
583 | fillPercentParameter( args, "topothresh", alignParameters.postprocessTrailByTopologyQualityThreshold ); |
584 | |
585 | bool batchMode = args.getSwitchCompact("batch") ; |
586 | |
587 | if (batchMode && (remains.size()!=2) ) |
588 | { |
589 | std::cerr << "Batch mode requires exactly two file arguments." << std::endl; |
590 | std::cerr << std::endl; |
591 | |
592 | main_alignerToolUsage(); |
593 | throw "argument error"; |
594 | } |
595 | |
596 | std::string handArgumentname = "hand"; |
597 | if (args.find(handArgumentname)!=args.end()) |
598 | { |
599 | if (batchMode) |
600 | { |
601 | std::cerr << "-batch and -" << handArgumentname << " are incompatible switches." << std::endl; |
602 | throw "argument error"; |
603 | } |
604 | else |
605 | { |
606 | alignParameters.handAlignFilename = args[handArgumentname].dString ; |
607 | args.erase(handArgumentname); |
608 | |
609 | if (alignParameters.handAlignFilename.empty()) |
610 | { |
611 | std::cerr << "-" << handArgumentname << " switch requires a filename value." << std::endl; |
612 | throw "argument error"; |
613 | } |
614 | } |
615 | } |
616 | |
617 | std::string autoDictDumpArgumentname = "autodict"; |
618 | if (args.find(autoDictDumpArgumentname)!=args.end()) |
619 | { |
620 | if (batchMode) |
621 | { |
622 | std::cerr << "-batch and -" << autoDictDumpArgumentname << " are incompatible switches." << std::endl; |
623 | throw "argument error"; |
624 | } |
625 | else |
626 | { |
627 | alignParameters.autoDictionaryDumpFilename = args[autoDictDumpArgumentname].dString ; |
628 | args.erase(autoDictDumpArgumentname); |
629 | |
630 | if (alignParameters.autoDictionaryDumpFilename.empty()) |
631 | { |
632 | std::cerr << "-" << autoDictDumpArgumentname << " switch requires a filename value." << std::endl; |
633 | throw "argument error"; |
634 | } |
635 | } |
636 | } |
637 | |
638 | if (!batchMode && (remains.size()!=3) ) |
639 | { |
640 | std::cerr << "Nonbatch mode requires exactly three file arguments." << std::endl; |
641 | std::cerr << std::endl; |
642 | |
643 | main_alignerToolUsage(); |
644 | throw "argument error"; |
645 | } |
646 | |
647 | try |
648 | { |
649 | args.checkEmptyArgs(); |
650 | } |
651 | catch (...) |
652 | { |
653 | std::cerr << std::endl; |
654 | |
655 | main_alignerToolUsage(); |
656 | throw "argument error"; |
657 | } |
658 | |
659 | // std::cerr << "Reading dictionary..." << std::endl; |
660 | const char* dicFilename = remains[0] ; |
661 | DictionaryItems dictionary; |
662 | std::ifstream dis(dicFilename); |
663 | dictionary.read(dis); |
664 | |
665 | if (batchMode) |
666 | { |
667 | const char* batchFilename = remains[1] ; |
668 | std::ifstream bis(batchFilename); |
669 | |
670 | while (bis.good()&&!bis.eof()) |
671 | { |
672 | std::string line; |
673 | std::getline(bis,line); |
674 | |
675 | std::vector<std::string> words; |
676 | split( line, words, '\t' ); |
677 | |
678 | if (words.size()!=3) |
679 | { |
680 | std::cerr << "Batch file has incorrect format." << std::endl; |
681 | throw "data error"; |
682 | } |
683 | |
684 | std::string huFilename, enFilename, outFilename; |
685 | huFilename = words[0]; |
686 | enFilename = words[1]; |
687 | outFilename = words[2]; |
688 | |
689 | // std::cerr << "Processing " << outFilename << std::endl; |
690 | bool failed = false; |
691 | try |
692 | { |
693 | alignerToolWithFilenames( dictionary, huFilename, enFilename, alignParameters, outFilename ); |
694 | } |
695 | catch ( const char* errorType ) |
696 | { |
697 | std::cerr << errorType << std::endl; |
698 | failed = true; |
699 | } |
700 | catch ( std::exception& e ) |
701 | { |
702 | std::cerr << "some failed assertion:" << e.what() << std::endl; |
703 | failed = true; |
704 | } |
705 | catch ( ... ) |
706 | { |
707 | std::cerr << "some unknown failed assertion..." << std::endl; |
708 | failed = true; |
709 | } |
710 | |
711 | if (failed) |
712 | { |
713 | std::cerr << "Align failed for " << outFilename << std::endl; |
714 | } |
715 | } |
716 | } |
717 | else |
718 | { |
719 | const char* huFilename = remains[1] ; |
720 | const char* enFilename = remains[2] ; |
721 | |
722 | alignerToolWithFilenames( dictionary, huFilename, enFilename, alignParameters ); |
723 | } |
724 | } |
725 | #ifndef _DEBUG |
726 | catch ( const char* errorType ) |
727 | { |
728 | std::cerr << errorType << std::endl; |
729 | return -1; |
730 | } |
731 | catch ( std::exception& e ) |
732 | { |
733 | std::cerr << "some failed assertion:" << e.what() << std::endl; |
734 | return -1; |
735 | } |
736 | catch ( ... ) |
737 | { |
738 | std::cerr << "some unknown failed assertion..." << std::endl; |
739 | return -1; |
740 | } |
741 | #endif |
742 | return 0; |
743 | } |
744 | |
745 | } |