File: | tmx_alignment.cc |
Warning: | line 132, column 13 Value stored to 'lengthFitness' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /************************************************************************* |
2 | * * |
3 | * (C) Copyright 2004. Media Research Centre at the * |
4 | * Sociology and Communications Department of the * |
5 | * Budapest University of Technology and Economics. * |
6 | * * |
7 | * Developed by Daniel Varga. * |
8 | * * |
9 | * From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * |
10 | * * |
11 | *************************************************************************/ |
12 | #include <apertium/tmx_alignment.h> |
13 | |
14 | #include <apertium/tmx_words.h> // For SentenceList |
15 | #include <apertium/tmx_dictionary.h> // For FrequencyMap |
16 | #include <lttoolbox/string_utils.h> |
17 | |
18 | #include <iostream> |
19 | #include <map> |
20 | #include <set> |
21 | #include <algorithm> |
22 | |
23 | // Copypaste-elve. TODO Elhelyezni. |
24 | #define massert(e)if (!(e)) { std::cerr << "e" << " failed" << std::endl; throw "assert"; } if (!(e)) { std::cerr << #e << " failed" << std::endl; throw "assert"; } |
25 | |
26 | std::ostream& operator<<( std::ostream& os, std::pair<int,int> p ) |
27 | { |
28 | os << p.first << "," << p.second; |
29 | return os; |
30 | } |
31 | |
32 | namespace TMXAligner |
33 | { |
34 | |
35 | |
36 | // Attention, the two-sentence length is the first argument. Usually the Hungarian is, but not here. |
37 | // The bigger the better. closeness is always smaller than bestScore. |
38 | double closeness( double twoSentenceLength, double oneSentenceLength ) |
39 | { |
40 | const double bestScore = 0.3; |
41 | const double quasiglobal_closenessMultiplier = 0.3; |
42 | |
43 | double ratio; |
44 | |
45 | if (twoSentenceLength>oneSentenceLength) |
46 | { |
47 | ratio = (twoSentenceLength+1)/(oneSentenceLength+1); |
48 | } |
49 | else |
50 | { |
51 | ratio = (oneSentenceLength+1)/(twoSentenceLength+1); |
52 | } |
53 | |
54 | ratio -= 1.0; |
55 | |
56 | // assert(ratio>=0); |
57 | return bestScore - quasiglobal_closenessMultiplier * ratio; |
58 | } |
59 | |
60 | const unsigned char Diag = 1; |
61 | const unsigned char HuSkip = 2; |
62 | const unsigned char EnSkip = 3; |
63 | const unsigned char HuHuEnSkip = 4; |
64 | const unsigned char HuEnEnSkip = 5; |
65 | const unsigned char Dead = 6; |
66 | |
67 | void buildDynProgMatrix( const AlignMatrix& w, const SentenceValues& huLength, const SentenceValues& enLength, |
68 | QuasiDiagonal<double>& v, TrelliMatrix& trellis ) |
69 | { |
70 | const int huBookSize = w.size(); |
71 | |
72 | |
73 | int huPos,enPos; |
74 | |
75 | // v[huPos][enPos] gives the similarity of the [0,huPos) and [0,enPos) intervals. |
76 | // The smaller value, the better similarity. (Unlike in the original similarity matrix w, where bigger is better.) |
77 | |
78 | double infinity = 1e6; |
79 | |
80 | for ( huPos=0; huPos<=huBookSize; ++huPos ) |
81 | { |
82 | int rowStart = v.rowStart(huPos); |
83 | int rowEnd = v.rowEnd(huPos); |
84 | for ( enPos=rowStart; enPos<rowEnd; ++enPos ) |
85 | { |
86 | double& val = v.cell(huPos,enPos); |
87 | unsigned char& trail = trellis.cell(huPos,enPos); |
88 | |
89 | bool quasiglobal_knightsMoveAllowed = true; |
90 | if (quasiglobal_knightsMoveAllowed) |
91 | { |
92 | double lengthFitness(0); |
93 | |
94 | bool quasiglobal_lengthFitnessApplied = true; |
95 | |
96 | // The array is indexed by the step directions. The smaller value, the better. |
97 | double values[Dead]; |
98 | int i; |
99 | for ( i=1; i<Dead; ++i ) |
100 | values[i] = infinity; |
101 | |
102 | if (huPos>0) |
103 | { |
104 | values[HuSkip] = v[huPos-1][enPos] - skipScore; |
105 | } |
106 | |
107 | if (enPos>0) |
108 | { |
109 | values[EnSkip] = v[huPos][enPos-1] - skipScore; |
110 | } |
111 | |
112 | if ((huPos>0) && (enPos>0)) |
113 | { |
114 | if (quasiglobal_lengthFitnessApplied) |
115 | { |
116 | lengthFitness = closeness(huLength[huPos-1], enLength[enPos-1]); |
117 | } |
118 | else |
119 | { |
120 | lengthFitness = 0; |
121 | } |
122 | |
123 | values[Diag] = v[huPos-1][enPos-1] - w[huPos-1][enPos-1] - lengthFitness ; |
124 | } |
125 | |
126 | const double dotLength = 2.0 ; |
127 | |
128 | if ((huPos>1) && (enPos>0)) |
129 | { |
130 | if (quasiglobal_lengthFitnessApplied) |
131 | { |
132 | lengthFitness = closeness(huLength[huPos-2]+huLength[huPos-1]+dotLength, enLength[enPos-1]); |
Value stored to 'lengthFitness' is never read | |
133 | } |
134 | else |
135 | { |
136 | lengthFitness = 0; |
137 | } |
138 | |
139 | } |
140 | |
141 | if ((huPos>0) && (enPos>1)) |
142 | { |
143 | if (quasiglobal_lengthFitnessApplied) |
144 | { |
145 | // Attention, the two-sentence length is the first argument. Usually the Hungarian is the first argument, but not here. |
146 | lengthFitness = closeness(enLength[enPos-2]+enLength[enPos-1]+dotLength, huLength[huPos-1]); |
147 | } |
148 | else |
149 | { |
150 | lengthFitness = 0; |
151 | } |
152 | |
153 | const double& a = w[huPos-1][enPos-1] ; |
154 | const double& b = w[huPos-1][enPos-2] ; |
155 | values[HuEnEnSkip] = v[huPos-1][enPos-2] - ( a<b ? a : b ) - skipScore - lengthFitness ; // The worse of the two crossed square. |
156 | } |
157 | |
158 | unsigned char direction = Dead; |
159 | double bestValue = infinity; |
160 | for ( i=1; i<Dead; ++i ) |
161 | { |
162 | if (values[i]<bestValue) |
163 | { |
164 | bestValue = values[i]; |
165 | direction = i; |
166 | } |
167 | } |
168 | |
169 | trail = direction; |
170 | if (direction==Dead) |
171 | { |
172 | val = 0; |
173 | } |
174 | else |
175 | { |
176 | val = bestValue; |
177 | } |
178 | } |
179 | else // (!quasiglobal_knightsMoveAllowed) |
180 | { |
181 | int borderCase = ( (huPos==0) ? 0 : 2 ) + ( (enPos==0) ? 0 : 1 ) ; |
182 | |
183 | switch (borderCase) |
184 | { |
185 | case 0: |
186 | { |
187 | val = 0; |
188 | trail = Dead; |
189 | break; |
190 | } |
191 | case 1: // huPos==0 |
192 | { |
193 | val = v[0][enPos-1] - skipScore ; |
194 | trail = EnSkip; |
195 | break; |
196 | } |
197 | case 2: // enPos==0 |
198 | { |
199 | val = v[huPos-1][0] - skipScore ; |
200 | trail = HuSkip; |
201 | break; |
202 | } |
203 | case 3: |
204 | { |
205 | double x = v[huPos-1][enPos] - skipScore ; |
206 | double y = v[huPos] [enPos-1] - skipScore ; |
207 | double xy = v[huPos-1][enPos-1] - w[huPos-1][enPos-1] ; |
208 | |
209 | double best = xy; |
210 | trail = Diag; |
211 | if (x<best) |
212 | { |
213 | best = x; |
214 | trail = HuSkip; |
215 | } |
216 | if (y<best) |
217 | { |
218 | best = y; |
219 | trail = EnSkip; |
220 | } |
221 | val = best; |
222 | break; |
223 | } |
224 | } |
225 | } |
226 | } |
227 | } |
228 | } |
229 | |
230 | void trelliToLadder( const TrelliMatrix& trellis, Trail& bestTrail ) |
231 | { |
232 | bestTrail.clear(); |
233 | |
234 | // The -1 is needed because the trellis matrix is one larger than the similarity matrix. |
235 | // This points to its downmost rightmost element. |
236 | const int huBookSize = trellis.size()-1; |
237 | const int enBookSize = trellis.otherSize()-1; |
238 | |
239 | int huPos=huBookSize; |
240 | int enPos=enBookSize; |
241 | |
242 | bool logging = false; |
243 | |
244 | if (logging) std::cerr << std::endl; |
245 | |
246 | bool over = false; |
247 | bool hopelesslyBadTrail = false; |
248 | bestTrail.push_back(std::make_pair(huPos,enPos)); |
249 | |
250 | while (true) |
251 | { |
252 | unsigned char trelli = trellis[huPos][enPos]; |
253 | |
254 | if ((huPos==0) || (enPos==0)) |
255 | break; |
256 | |
257 | switch (trelli) |
258 | { |
259 | case Diag : |
260 | { |
261 | --huPos; |
262 | --enPos; |
263 | break; |
264 | } |
265 | case HuSkip : |
266 | { |
267 | --huPos; |
268 | break; |
269 | } |
270 | case EnSkip : |
271 | { |
272 | --enPos; |
273 | break; |
274 | } |
275 | case HuHuEnSkip : |
276 | { |
277 | huPos -= 2; |
278 | --enPos; |
279 | break; |
280 | } |
281 | case HuEnEnSkip : |
282 | { |
283 | --huPos; |
284 | enPos -= 2; |
285 | break; |
286 | } |
287 | case Dead : |
288 | { |
289 | over = true; |
290 | break; |
291 | } |
292 | default: |
293 | { |
294 | hopelesslyBadTrail = true; |
295 | over = true; |
296 | break; |
297 | } |
298 | } |
299 | |
300 | if (over) |
301 | break; |
302 | |
303 | bestTrail.push_back(std::make_pair(huPos,enPos)); |
304 | |
305 | if (logging) |
306 | { |
307 | std::cerr << huPos << " \t" << enPos << std::endl; |
308 | } |
309 | |
310 | } |
311 | |
312 | if (hopelesslyBadTrail) |
313 | { |
314 | bestTrail.clear(); |
315 | bestTrail.push_back(std::make_pair(huBookSize,enBookSize)); |
316 | bestTrail.push_back(std::make_pair(0,0)); |
317 | std::cerr << "Error: hopelessly bad trail." << std::endl; |
318 | } |
319 | |
320 | std::reverse(bestTrail.begin(), bestTrail.end() ); |
321 | } |
322 | |
323 | |
324 | void align( const AlignMatrix& w, const SentenceValues& huLength, const SentenceValues& enLength, |
325 | Trail& bestTrail, AlignMatrix& v ) |
326 | { |
327 | const int huBookSize = w.size(); |
328 | const int enBookSize = w.otherSize(); |
329 | const int thickness = w.thickness(); |
330 | |
331 | massert(w.size()+1 == v.size())if (!(w.size()+1 == v.size())) { std::cerr << "w.size()+1 == v.size()" << " failed" << std::endl; throw "assert"; }; |
332 | massert(w.otherSize()+1 == v.otherSize())if (!(w.otherSize()+1 == v.otherSize())) { std::cerr << "w.otherSize()+1 == v.otherSize()" << " failed" << std::endl; throw "assert"; }; |
333 | |
334 | TrelliMatrix trellis( huBookSize+1,enBookSize+1,thickness, Dead ); |
335 | |
336 | buildDynProgMatrix( w, huLength, enLength, v, trellis ); |
337 | |
338 | // std::cerr << "Matrix built." << std::endl; |
339 | |
340 | trelliToLadder( trellis, bestTrail ); |
341 | |
342 | // std::cerr << "Trail found." << std::endl; |
343 | } |
344 | |
345 | |
346 | bool oneToOne( const Trail& bestTrail, int pos ) |
347 | { |
348 | return ( |
349 | ( bestTrail[pos+1].first -bestTrail[pos].first == 1 ) |
350 | && |
351 | ( bestTrail[pos+1].second-bestTrail[pos].second == 1 ) |
352 | ); |
353 | } |
354 | |
355 | |
356 | int countIntersectionOfTrails( const Trail& sx, const Trail& sy ) |
357 | { |
358 | int inter(0); |
359 | |
360 | Trail::const_iterator sxt = sx.begin(); |
361 | Trail::const_iterator syt = sy.begin(); |
362 | Trail::const_iterator sxe = sx.end(); |
363 | Trail::const_iterator sye = sy.end(); |
364 | for ( ; sxt!=sxe && syt!=sye ; ) |
365 | { |
366 | if ( *sxt < *syt ) |
367 | ++sxt; |
368 | else if ( *sxt > *syt ) |
369 | ++syt; |
370 | else |
371 | { |
372 | ++inter; |
373 | ++sxt; |
374 | ++syt; |
375 | } |
376 | } |
377 | return inter; |
378 | } |
379 | |
380 | |
381 | // A bit of an abuse of the fact that Trail and BisentenceList are typedef'd to the same structure. |
382 | double scoreTrailOrBisentenceList( const Trail& trailAuto, const Trail& trailHand ) |
383 | { |
384 | int score = countIntersectionOfTrails( trailAuto, trailHand ); |
385 | |
386 | std::cerr << trailAuto.size()-score << " misaligned out of " << trailHand.size() << " correct items, " |
387 | << trailAuto.size() << " bets." << std::endl; |
388 | |
389 | std::cerr << "Precision: " << 1.0*score/trailAuto.size() |
390 | << ", Recall: " << 1.0*score/trailHand.size() << std::endl; |
391 | |
392 | double ratio = 1.0*(trailAuto.size()-score)/trailAuto.size(); |
393 | return ratio; |
394 | } |
395 | |
396 | |
397 | void trailToBisentenceList( const Trail& bestTrail, |
398 | BisentenceList& bisentenceList ) |
399 | { |
400 | bisentenceList.clear(); |
401 | |
402 | int trailSize = bestTrail.size(); |
403 | |
404 | for ( int pos=0; pos<trailSize-1; ++pos ) |
405 | { |
406 | if ( oneToOne(bestTrail,pos) ) |
407 | { |
408 | bisentenceList.push_back(bestTrail [pos]); |
409 | } |
410 | } |
411 | } |
412 | |
413 | |
414 | double scoreBisentenceList( const BisentenceList& bisentenceListAuto, const Trail& trailHand ) |
415 | { |
416 | BisentenceList bisentenceListHand; |
417 | trailToBisentenceList( trailHand, bisentenceListHand ); |
418 | |
419 | double score = scoreTrailOrBisentenceList( bisentenceListAuto, bisentenceListHand ) ; |
420 | |
421 | return score; |
422 | } |
423 | |
424 | double scoreTrail( const Trail& trailAuto, const Trail& trailHand ) |
425 | { |
426 | return ( scoreTrailOrBisentenceList( trailAuto, trailHand ) ); |
427 | } |
428 | |
429 | |
430 | void setBox( AlignMatrix& m, int huPos, int enPos, int radius, int insideOfRadiusValue ) |
431 | { |
432 | for ( int x=huPos-radius; x<=huPos+radius; ++x ) |
433 | { |
434 | for ( int y=enPos-radius; y<=enPos+radius; ++y ) |
435 | { |
436 | if ( (x>=0) && (x<m.size()) && (y>=0) && (y<m.otherSize()) ) |
437 | { |
438 | m.cell(x,y) = insideOfRadiusValue ; // ToDo: Should this be (y,x) instead? Function has args y,x not x,y. Fix here or function |
439 | } |
440 | } |
441 | } |
442 | } |
443 | |
444 | // Fills the complement of the radius of the trail with minus infties. |
445 | // The return value true means success. Failure means that during the fill, |
446 | // we intersected the outside of the quasidiagonal area. |
447 | // In this case, the operation is not finished. |
448 | bool borderDetailedAlignMatrix( AlignMatrix& alignMatrix, const Trail& trail, int radius ) |
449 | { |
450 | int huBookSize = alignMatrix.size(); |
451 | |
452 | int huPos, enPos; |
453 | for ( huPos=0; huPos<huBookSize; ++huPos ) |
454 | { |
455 | int rowStart = alignMatrix.rowStart(huPos); |
456 | int rowEnd = alignMatrix.rowEnd(huPos); |
457 | for ( enPos=rowStart; enPos<rowEnd; ++enPos ) |
458 | { |
459 | alignMatrix.cell(huPos,enPos) = outsideOfRadiusValue; |
460 | } |
461 | } |
462 | |
463 | // We seriously use the fact that many-to-zero segments are subdivided into one-to-zero segments. |
464 | // Inside setBox, an exception is thrown if we try to write outside the quasidiagonal. |
465 | // If we catch such an exception, it means that the quasidiagonal is not thick enough. |
466 | // In this case, we abandon the whole align, just to be sure. |
467 | try |
468 | { |
469 | for ( size_t i=0; i<trail.size(); ++i ) |
470 | { |
471 | setBox( alignMatrix, trail[i].first, trail[i].second, radius, insideOfRadiusValue ); |
472 | } |
473 | } |
474 | catch ( const char* errorType ) |
475 | { |
476 | massert( std::string(errorType) == "out of quasidiagonal" )if (!(std::string(errorType) == "out of quasidiagonal")) { std ::cerr << "std::string(errorType) == \"out of quasidiagonal\"" << " failed" << std::endl; throw "assert"; } |
477 | return false; |
478 | } |
479 | |
480 | bool verify = true; |
481 | if (verify) |
482 | { |
483 | int numberOfEvaluatedItems(0); |
484 | for ( huPos=0; huPos<huBookSize; ++huPos ) |
485 | { |
486 | int rowStart = alignMatrix.rowStart(huPos); |
487 | int rowEnd = alignMatrix.rowEnd(huPos); |
488 | for ( enPos=rowStart; enPos<rowEnd; ++enPos ) |
489 | { |
490 | if (alignMatrix[huPos][enPos]==insideOfRadiusValue) |
491 | { |
492 | ++numberOfEvaluatedItems; |
493 | } |
494 | } |
495 | } |
496 | |
497 | std::cerr << numberOfEvaluatedItems << " items inside the border." << std::endl; |
498 | } |
499 | |
500 | return true; |
501 | } |
502 | |
503 | template <class T> |
504 | void dumpAlignMatrix( const QuasiDiagonal<T>& alignMatrix ) |
505 | { |
506 | int huPos,enPos; |
507 | |
508 | int huBookSize = alignMatrix.size(); |
509 | int enBookSize = alignMatrix.otherSize(); |
510 | |
511 | for ( huPos=0; huPos<huBookSize; ++huPos ) |
512 | { |
513 | for ( enPos=0; enPos<enBookSize; ++enPos ) |
514 | { |
515 | int start = alignMatrix.rowStart(huPos); |
516 | int end = alignMatrix.rowEnd (huPos); |
517 | |
518 | if ( (enPos<start) || (enPos>=end) ) |
519 | { |
520 | std::cout << "-1\t"; |
521 | continue; |
522 | } |
523 | |
524 | std::cout << alignMatrix[huPos][enPos] << "\t"; |
525 | } |
526 | std::cout << std::endl; |
527 | } |
528 | } |
529 | |
530 | void dumpAlignMatrix( const QuasiDiagonal<int>& alignMatrix, bool graphical ) |
531 | { |
532 | int huPos,enPos; |
533 | |
534 | int huBookSize = alignMatrix.size(); |
535 | int enBookSize = alignMatrix.otherSize(); |
536 | |
537 | for ( huPos=0; huPos<huBookSize; ++huPos ) |
538 | { |
539 | for ( enPos=0; enPos<enBookSize; ++enPos ) |
540 | { |
541 | int start = alignMatrix.rowStart(huPos); |
542 | int end = alignMatrix.rowEnd (huPos); |
543 | |
544 | if ( (enPos<start) || (enPos>=end) ) |
545 | { |
546 | if (graphical) |
547 | { |
548 | std::cout << " "; |
549 | } |
550 | else |
551 | { |
552 | std::cout << "-1\t"; |
553 | } |
554 | continue; |
555 | } |
556 | |
557 | if (graphical) |
558 | { |
559 | char c(' '); |
560 | switch (alignMatrix[huPos][enPos]) |
561 | { |
562 | case 0: c=' '; break; |
563 | case 1: c='.'; break; |
564 | case 2: c=':'; break; |
565 | case 3: c='|'; break; |
566 | case 4: c='+'; break; |
567 | default: c='X'; break; |
568 | } |
569 | std::cout << c << " "; |
570 | } |
571 | else |
572 | { |
573 | std::cout << alignMatrix[huPos][enPos] << "\t"; |
574 | } |
575 | } |
576 | std::cout << std::endl; |
577 | } |
578 | } |
579 | |
580 | void dumpTrelliMatrix( const TrelliMatrix& trellis ) |
581 | { |
582 | std::map<int, std::string> directions; |
583 | |
584 | directions[Diag] = "HuEn"; |
585 | directions[HuSkip] = "Hu"; |
586 | directions[EnSkip] = "En"; |
587 | directions[HuHuEnSkip] = "HuHuEn"; |
588 | directions[HuEnEnSkip] = "HuEnEn"; |
589 | directions[Dead] = "Dead"; |
590 | |
591 | int huPos,enPos; |
592 | |
593 | int huBookSize = trellis.size(); |
594 | int enBookSize = trellis.otherSize(); |
595 | |
596 | for ( huPos=0; huPos<huBookSize; ++huPos ) |
597 | { |
598 | for ( enPos=0; enPos<enBookSize; ++enPos ) |
599 | { |
600 | int start = trellis.rowStart(huPos); |
601 | int end = trellis.rowEnd (huPos); |
602 | |
603 | if ( (enPos<start) || (enPos>=end) ) |
604 | { |
605 | std::cout << "-1\t"; |
606 | continue; |
607 | } |
608 | |
609 | std::cout << directions[trellis[huPos][enPos]] << "\t"; |
610 | } |
611 | std::cout << std::endl; |
612 | } |
613 | } |
614 | |
615 | } // namespace TMXAligner |