Bug Summary

File: hmm.cc
Warning: line 693, column 10
Although the value stored to 'nw' is used in the enclosing expression, the value is never actually read from 'nw'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name hmm.cc -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -resource-dir /usr/lib/llvm-16/lib/clang/16 -D HAVE_CONFIG_H -I . -I .. -I /usr/include/utf8cpp/ -I /usr/local/include -I /usr/include/libxml2 -I /usr/local/include -D PIC -internal-isystem /usr/lib/llvm-16/bin/../include/c++/v1 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -std=c++2b -fdeprecated-macro -fdebug-compilation-dir=/tmp/build/apertium/apertium-3.9.12+g928~04ac90c6/apertium -ferror-limit 19 -fgnuc-version=4.2.1 -fno-implicit-modules -fcxx-exceptions -fexceptions -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/apertium/scan-build/2024-09-11-155328-205384-1 -x c++ hmm.cc
1
2/*
3 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, see <https://www.gnu.org/licenses/>.
17 */
18/*
19 * First order hidden Markov model (HMM) implementation (source)
20 *
21 * @author Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es
22 */
23
24#include <apertium/hmm.h>
25#include "apertium_config.h"
26#include <apertium/unlocked_cstdio.h>
27#include <lttoolbox/compression.h>
28
29#include <stdio.h>
30#include <unistd.h>
31#include <vector>
32#include <algorithm>
33#include <lttoolbox/string_utils.h>
34#include <apertium/file_morpho_stream.h>
35
/**
 * Portable not-a-number test.
 *
 * Picks std::isnan when the translation unit is compiled as C++11 or
 * later and falls back to the global ::isnan otherwise.
 *
 * @param v value to examine
 * @return true when v is NaN, false otherwise
 */
inline bool p_isnan(double v) {
#if __cplusplus >= 201103L
  return std::isnan(v);
#else
  return ::isnan(v);
#endif
}
43
/**
 * Portable infinity test.
 *
 * Picks std::isinf when the translation unit is compiled as C++11 or
 * later and falls back to the global ::isinf otherwise.
 *
 * @param v value to examine
 * @return true when v is positive or negative infinity, false otherwise
 */
inline bool p_isinf(double v) {
#if __cplusplus >= 201103L
  return std::isinf(v);
#else
  return ::isinf(v);
#endif
}
51
52using namespace Apertium;
53using namespace tagger_utils;
54
55TaggerData& HMM::get_tagger_data() {
56 return tdhmm;
57}
58
59void HMM::deserialise(FILE *Serialised_FILE_Tagger) {
60 tdhmm.read(Serialised_FILE_Tagger);
61 eos = (tdhmm.getTagIndex())["TAG_SENT"_u];
62}
63
64std::vector<UString> &HMM::getArrayTags() {
65 return tdhmm.getArrayTags();
66}
67
68void HMM::serialise(FILE *Stream_) { tdhmm.write(Stream_); }
69
70void HMM::deserialise(const TaggerData &Deserialised_FILE_Tagger) {
71 tdhmm = TaggerDataHMM(Deserialised_FILE_Tagger);
72 eos = (tdhmm.getTagIndex())["TAG_SENT"_u];
73}
74
75void HMM::init_probabilities_from_tagged_text_(MorphoStream &stream_tagged,
76 MorphoStream &stream_untagged) {
77 init_probabilities_from_tagged_text(stream_tagged, stream_untagged);
78 apply_rules();
79}
80
81void HMM::init_probabilities_kupiec_(MorphoStream &lexmorfo) {
82 init_probabilities_kupiec(lexmorfo);
83 apply_rules();
84}
85
86void HMM::train(MorphoStream &morpho_stream, unsigned long count) {
87 for (; count > 0; --count) {
88 morpho_stream.rewind();
89 train(morpho_stream);
90 }
91
92 apply_rules();
93}
94
95HMM::HMM() {}
96
97HMM::HMM(TaggerFlags& Flags_) : FILE_Tagger(Flags_) {}
98
99HMM::HMM(TaggerDataHMM _tdhmm)
100 : tdhmm(_tdhmm)
101{
102 eos = (tdhmm.getTagIndex())["TAG_SENT"_u];
103}
104
105HMM::HMM(TaggerDataHMM *tdhmm) : tdhmm(*tdhmm) {}
106
107HMM::~HMM() {}
108
109void
110HMM::init()
111{
112}
113
114void
115HMM::set_eos(TTag t)
116{
117 eos = t;
118}
119
120void
121HMM::read_ambiguity_classes(FILE *in)
122{
123 while(in)
124 {
125 int ntags = Compression::multibyte_read(in);
126
127 if(feof(in))
128 {
129 break;
130 }
131 set<TTag> ambiguity_class;
132
133 for(; ntags != 0; ntags--)
134 {
135 ambiguity_class.insert(Compression::multibyte_read(in));
136 }
137
138 if(ambiguity_class.size() != 0)
139 {
140 tdhmm.getOutput().add(ambiguity_class);
141 }
142 }
143
144 tdhmm.setProbabilities(tdhmm.getTagIndex().size(), tdhmm.getOutput().size());
145}
146
147void
148HMM::write_ambiguity_classes(FILE *out)
149{
150 for(int i=0, limit = tdhmm.getOutput().size(); i != limit; i++)
151 {
152 set<TTag> const &ac = (tdhmm.getOutput())[i];
153 Compression::multibyte_write(ac.size(), out);
154 for(set<TTag>::const_iterator it = ac.begin(), limit2 = ac.end();
155 it != limit2; it++)
156 {
157 Compression::multibyte_write(*it, out);
158 }
159 }
160}
161
162void
163HMM::read_probabilities(FILE *in)
164{
165 tdhmm.read(in);
166}
167
168void
169HMM::write_probabilities(FILE *out)
170{
171 tdhmm.write(out);
172}
173
174void
175HMM::init_probabilities_kupiec(MorphoStream &lexmorfo)
176{
177 int N = tdhmm.getN();
178 int M = tdhmm.getM();
179 int i=0, j=0, k=0, k1=0, k2=0, nw=0;
180 vector <double> classes_ocurrences (M, 1);
181 vector <vector <double> > classes_pair_ocurrences(M, vector<double>(M, 1));
182 vector <double> tags_estimate(N, 0);
183 vector <vector <double> > tags_pair_estimate(N, vector<double>(N, 0));
184
185 Collection &output = tdhmm.getOutput();
186
187 TaggerWord *word=NULL__null;
188
189 set<TTag> tags;
190 tags.insert(eos);
191 k1=output[tags]; //The first tag (ambiguity class) seen is the end-of-sentence
192
193 //We count for each ambiguity class the number of ocurrences
194 word = lexmorfo.get_next_word();
195 while((word)) {
196 if (++nw%10000==0) cerr<<'.'<<flush;
197
198 tags=word->get_tags();
199
200 if (tags.size()==0) { //This is an unknown word
201 tags = tdhmm.getOpenClass();
202 }
203 else {
204 require_ambiguity_class(tdhmm, tags, *word, nw);
205 }
206
207 k2=output[tags];
208
209 classes_ocurrences[k1]++;
210 classes_pair_ocurrences[k1][k2]++; //k1 followed by k2
211 delete word;
212 word=lexmorfo.get_next_word();
213
214 k1=k2;
215
216 }
217
218 //Estimation of the number of time each tags occurs in the training text
219 for(i=0; i<N; i++) {
220 for(k=0; k<M; k++) {
221
222 if(output[k].find(i) != output[k].end())
223 tags_estimate[i] += classes_ocurrences[k]/output[k].size();
224 }
225 }
226
227 set<TTag> tags1, tags2;
228 set<TTag>::iterator itag1, itag2;
229 for(k1=0; k1<M; k1++) {
230 tags1=output[k1];
231 for(k2=0; k2<M; k2++) {
232 tags2=output[k2];
233 double nocurrences=classes_pair_ocurrences[k1][k2]/((double)(tags1.size()*tags2.size()));
234 for (itag1=tags1.begin(); itag1!=tags1.end(); itag1++) {
235 for (itag2=tags2.begin(); itag2!=tags2.end(); itag2++)
236 tags_pair_estimate[*itag1][*itag2]+=nocurrences;
237 }
238 }
239 }
240
241 //a[i][j] estimation.
242 double sum;
243 for(i=0; i<N; i++) {
244 sum=0;
245 for(j=0; j<N; j++)
246 sum+=tags_pair_estimate[i][j];
247
248 for(j=0; j<N; j++) {
249 if (sum>0)
250 (tdhmm.getA())[i][j] = tags_pair_estimate[i][j]/sum;
251 else {
252 (tdhmm.getA())[i][j] = 0;
253 }
254 }
255 }
256
257 //b[i][k] estimation
258 for(i=0; i<N; i++) {
259 for(k=0; k<M; k++) {
260 if (output[k].find(i)!=output[k].end()) {
261 if (tags_estimate[i]>0)
262 (tdhmm.getB())[i][k] = (classes_ocurrences[k]/output[k].size())/tags_estimate[i];
263 else
264 (tdhmm.getB())[i][k] = 0;
265 }
266 }
267 }
268 cerr<<"\n";
269}
270
271
272void
273HMM::init_probabilities_from_tagged_text(MorphoStream &stream_tagged,
274 MorphoStream &stream_untagged) {
275 int i, j, k, nw=0;
276 int N = tdhmm.getN();
277 int M = tdhmm.getM();
278 vector <vector <double> > tags_pair(N, vector<double>(N, 0));
279 vector <vector <double> > emission(N, vector<double>(M, 0));
280
281
282 TaggerWord *word_tagged=NULL__null, *word_untagged=NULL__null;
283 Collection &output = tdhmm.getOutput();
284
285
286 set<TTag> tags;
287
288 TTag tag1, tag2;
289 tag1 = eos; // The first seen tag is the end-of-sentence tag
290
291 word_tagged = stream_tagged.get_next_word();
292 word_untagged = stream_untagged.get_next_word();
293 while(word_tagged) {
294 cerr<<*word_tagged;
295 cerr<<" -- "<<*word_untagged<<"\n";
296
297 if (word_tagged->get_superficial_form()!=word_untagged->get_superficial_form()) {
298 cerr<<"\nTagged text (.tagged) and analyzed text (.untagged) streams are not aligned.\n";
299 cerr<<"Take a look at tagged text (.tagged).\n";
300 cerr<<"Perhaps this is caused by a multiword unit that is not a multiword unit in one of the two files.\n";
301 cerr<<*word_tagged<<" -- "<<*word_untagged<<"\n";
302 exit(1);
303 }
304
305 if (++nw%100==0) cerr<<'.'<<flush;
306
307 tag2 = tag1;
308
309 if (word_untagged==NULL__null) {
310 cerr<<"word_untagged==NULL\n";
311 exit(1);
312 }
313
314 if (word_tagged->get_tags().size()==0) // Unknown word
315 tag1 = -1;
316 else if (word_tagged->get_tags().size()>1) // Ambiguous word
317 cerr<<"Error in tagged text. An ambiguous word was found: "<<word_tagged->get_superficial_form()<<"\n";
318 else
319 tag1 = *(word_tagged->get_tags()).begin();
320
321
322 if ((tag1>=0) && (tag2>=0))
323 tags_pair[tag2][tag1]++;
324
325
326 if (word_untagged->get_tags().size()==0) { // Unknown word
327 tags = tdhmm.getOpenClass();
328 }
329 else {
330 require_ambiguity_class(tdhmm, word_untagged->get_tags(), *word_untagged, nw);
331 tags = word_untagged->get_tags();
332 }
333
334 k=output[tags];
335 if(tag1>=0)
336 emission[tag1][k]++;
337
338 delete word_tagged;
339 word_tagged=stream_tagged.get_next_word();
340 delete word_untagged;
341 word_untagged=stream_untagged.get_next_word();
342 }
343
344
345 //Estimate of a[i][j]
346 for(i=0; i<N; i++) {
347 double sum=0;
348 for(j=0; j<N; j++)
349 sum += tags_pair[i][j]+1.0;
350 for(j=0; j<N; j++)
351 (tdhmm.getA())[i][j] = (tags_pair[i][j]+1.0)/sum;
352 }
353
354
355 //Estimate of b[i][k]
356 for(i=0; i<N; i++) {
357 int nclasses_appear=0;
358 double times_appear=0.0;
359 for(k=0; k<M; k++) {
360 if (output[k].find(i)!=output[k].end()) {
361 nclasses_appear++;
362 times_appear+=emission[i][k];
363 }
364 }
365 for(k=0; k<M; k++) {
366 if (output[k].find(i)!=output[k].end())
367 (tdhmm.getB())[i][k] = (emission[i][k]+(((double)1.0)/((double)nclasses_appear)))/(times_appear+((double)1.0));
368 }
369 }
370
371 cerr<<"\n";
372}
373
374void
375HMM::apply_rules()
376{
377 vector<TForbidRule> &forbid_rules = tdhmm.getForbidRules();
378 vector<TEnforceAfterRule> &enforce_rules = tdhmm.getEnforceRules();
379 int N = tdhmm.getN();
380 int i, j, j2;
381 bool found;
382
383 for(i=0; i<(int) forbid_rules.size(); i++) {
384 (tdhmm.getA())[forbid_rules[i].tagi][forbid_rules[i].tagj] = ZERO1e-10;
385 }
386
387 for(i=0; i<(int) enforce_rules.size(); i++) {
388 for(j=0; j<N; j++) {
389 found = false;
390 for (j2=0; j2<(int) enforce_rules[i].tagsj.size(); j2++) {
391 if (enforce_rules[i].tagsj[j2]==j) {
392 found = true;
393 break;
394 }
395 }
396 if (!found)
397 (tdhmm.getA())[enforce_rules[i].tagi][j] = ZERO1e-10;
398 }
399 }
400
401 // Normalize probabilities
402 for(i=0; i<N; i++) {
403 double sum=0;
404 for(j=0; j<N; j++)
405 sum += (tdhmm.getA())[i][j];
406 for(j=0; j<N; j++) {
407 if (sum>0)
408 (tdhmm.getA())[i][j] = (tdhmm.getA())[i][j]/sum;
409 else
410 (tdhmm.getA())[i][j] = 0;
411 }
412 }
413}
414
415void
416HMM::post_ambg_class_scan() {
417 int N = (tdhmm.getTagIndex()).size();
418 int M = (tdhmm.getOutput()).size();
419 cerr << N << " states and " << M <<" ambiguity classes\n";
420
421 tdhmm.setProbabilities(N, M);
422}
423
424void
425HMM::filter_ambiguity_classes(const char* input_file, UFILE* out) {
426 set<set<TTag> > ambiguity_classes;
427 FileMorphoStream morpho_stream(input_file, true, &tdhmm);
428
429 TaggerWord *word = morpho_stream.get_next_word();
430
431 while(word) {
432 set<TTag> tags = word->get_tags();
433 if(tags.size() > 0) {
434 if(ambiguity_classes.find(tags) == ambiguity_classes.end()) {
435 ambiguity_classes.insert(tags);
436 word->outputOriginal(out);
437 //cerr<<word->get_string_tags()<<"\n";
438 }
439 }
440 delete word;
441 word = morpho_stream.get_next_word();
442 }
443}
444
445void
446HMM::train(MorphoStream &morpho_stream) {
447 int i, j, k, t, len, nw = 0;
448 TaggerWord *word=NULL__null;
449 TTag tag;
450 set<TTag> tags, pretags;
451 set<TTag>::iterator itag, jtag;
452 map <int, double> gamma;
453 map <int, double>::iterator jt, kt;
454 map < int, map <int, double> > alpha, beta, xsi, phi;
455 map < int, map <int, double> >::iterator it;
456 double prob, loli;
457 vector < set<TTag> > pending;
458 Collection &output = tdhmm.getOutput();
459
460 int ndesconocidas=0;
461 // alpha => forward probabilities
462 // beta => backward probabilities
463
464 loli = 0;
465 tag = eos;
466 tags.clear();
467 tags.insert(tag);
468 pending.push_back(tags);
469
470 alpha[0].clear();
471 alpha[0][tag] = 1;
472
473 word = morpho_stream.get_next_word();
474
475 while (word) {
476
477 //cerr<<"Enter para continuar\n";
478 //getchar();
479
480 if (++nw%10000==0) cerr<<'.'<<flush;
481
482 //cerr<<*word<<"\n";
483
484 pretags = pending.back();
485
486 tags = word->get_tags();
487
488 if (tags.size()==0) { // This is an unknown word
489 tags = tdhmm.getOpenClass();
490 ndesconocidas++;
491 }
492
493 require_ambiguity_class(tdhmm, tags, *word, nw);
494
495 k = output[tags];
496 len = pending.size();
497 alpha[len].clear();
498
499 //Forward probabilities
500 for (itag=tags.begin(); itag!=tags.end(); itag++) {
501 i=*itag;
502 for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {
503 j=*jtag;
504 //cerr<<"previous alpha["<<len<<"]["<<i<<"]="<<alpha[len][i]<<"\n";
505 //cerr<<"alpha["<<len-1<<"]["<<j<<"]="<<alpha[len-1][j]<<"\n";
506 //cerr<<"a["<<j<<"]["<<i<<"]="<<a[j][i]<<"\n";
507 //cerr<<"b["<<i<<"]["<<k<<"]="<<b[i][k]<<"\n";
508 alpha[len][i] += alpha[len-1][j]*(tdhmm.getA())[j][i]*(tdhmm.getB())[i][k];
509 }
510 if (alpha[len][i]==0)
511 alpha[len][i]=DBL_MIN2.2250738585072014e-308;
512 //cerr<<"alpha["<<len<<"]["<<i<<"]="<<alpha[len][i]<<"\n--------\n";
513 }
514
515 if (tags.size()>1) {
516 pending.push_back(tags);
517 } else { // word is unambiguous
518 tag = *tags.begin();
519 beta[0].clear();
520 beta[0][tag] = 1;
521
522 prob = alpha[len][tag];
523
524 //cerr<<"prob="<<prob<<"\n";
525 //cerr<<"alpha["<<len<<"]["<<tag<<"]="<<alpha[len][tag]<<"\n";
526 loli -= log(prob);
527
528 for (t=0; t<len; t++) { // loop from T-1 to 0
529 pretags = pending.back();
530 pending.pop_back();
531 k = output[tags];
532 beta[1-t%2].clear();
533 for (itag=tags.begin(); itag!=tags.end(); itag++) {
534 i=*itag;
535 for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {
536 j = *jtag;
537 beta[1-t%2][j] += (tdhmm.getA())[j][i]*(tdhmm.getB())[i][k]*beta[t%2][i];
538 xsi[j][i] += alpha[len-t-1][j]*(tdhmm.getA())[j][i]*(tdhmm.getB())[i][k]*beta[t%2][i]/prob;
539 }
540 double previous_value = gamma[i];
541
542 gamma[i] += alpha[len-t][i]*beta[t%2][i]/prob;
543 if (p_isnan(gamma[i])) {
544 cerr<<"NAN(3) gamma["<<i<<"] = "<<gamma[i]<<" alpha["<<len-t<<"]["<<i<<"]= "<<alpha[len-t][i]
545 <<" beta["<<t%2<<"]["<<i<<"] = "<<beta[t%2][i]<<" prob = "<<prob<<" previous gamma = "<<previous_value<<"\n";
546 exit(1);
547 }
548 if (p_isinf(gamma[i])) {
549 cerr<<"INF(3) gamma["<<i<<"] = "<<gamma[i]<<" alpha["<<len-t<<"]["<<i<<"]= "<<alpha[len-t][i]
550 <<" beta["<<t%2<<"]["<<i<<"] = "<<beta[t%2][i]<<" prob = "<<prob<<" previous gamma = "<<previous_value<<"\n";
551 exit(1);
552 }
553 if (gamma[i]==0) {
554 //cout<<"ZERO(3) gamma["<<i<<"] = "<<gamma[i]<<" alpha["<<len-t<<"]["<<i<<"]= "<<alpha[len-t][i]
555 // <<" beta["<<t%2<<"]["<<i<<"] = "<<beta[t%2][i]<<" prob = "<<prob<<" previous gamma = "<<previous_value<<"\n";
556 gamma[i]=DBL_MIN2.2250738585072014e-308;
557 //exit(1);
558 }
559 phi[i][k] += alpha[len-t][i]*beta[t%2][i]/prob;
560 }
561 tags=pretags;
562 }
563
564 tags.clear();
565 tags.insert(tag);
566 pending.push_back(tags);
567 alpha[0].clear();
568 alpha[0][tag] = 1;
569 }
570
571 delete word;
572 word = morpho_stream.get_next_word();
573 }
574
575 if ((pending.size()>1) || ((tag!=eos)&&(tag != (tdhmm.getTagIndex())["TAG_kEOF"_u]))) {
576 cerr << "Warning: The last tag is not the end-of-sentence-tag "
577 << "but rather " << tdhmm.getArrayTags()[tag] << ". Line: " << nw
578 << ". Pending: " << pending.size() << ". Tags: ";
579 cerr << "\n";
580 }
581
582 int N = tdhmm.getN();
583 int M = tdhmm.getM();
584
585 //Clean previous values
586 for(i=0; i<N; i++) {
587 for(j=0; j<N; j++)
588 (tdhmm.getA())[i][j]=ZERO1e-10;
589 for(k=0; k<M; k++)
590 (tdhmm.getB())[i][k]=ZERO1e-10;
591 }
592
593 // new parameters
594 for (it=xsi.begin(); it!=xsi.end(); it++) {
595 i = it->first;
596 for (jt=xsi[i].begin(); jt!=xsi[i].end(); jt++) {
597 j = jt->first;
598 if (xsi[i][j]>0) {
599 if (gamma[i]==0) {
600 cerr<<"Warning: gamma["<<i<<"]=0\n";
601 gamma[i]=DBL_MIN2.2250738585072014e-308;
602 }
603
604 (tdhmm.getA())[i][j] = xsi[i][j]/gamma[i];
605
606 if (p_isnan((tdhmm.getA())[i][j])) {
607 cerr<<"NAN\n";
608 cerr <<"Error: BW - NAN(1) a["<<i<<"]["<<j<<"]="<<(tdhmm.getA())[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
609 exit(1);
610 }
611 if (p_isinf((tdhmm.getA())[i][j])) {
612 cerr<<"INF\n";
613 cerr <<"Error: BW - INF(1) a["<<i<<"]["<<j<<"]="<<(tdhmm.getA())[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
614 exit(1);
615 }
616 if ((tdhmm.getA())[i][j]==0) {
617 //cerr <<"Error: BW - ZERO(1) a["<<i<<"]["<<j<<"]="<<(tdhmm.getA())[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
618 // exit(1);
619 }
620 }
621 }
622 }
623
624 for (it=phi.begin(); it!=phi.end(); it++) {
625 i = it->first;
626 for (kt=phi[i].begin(); kt!=phi[i].end(); kt++) {
627 k = kt->first;
628 if (phi[i][k]>0) {
629 (tdhmm.getB())[i][k] = phi[i][k]/gamma[i];
630
631 if (p_isnan((tdhmm.getB())[i][k])) {
632 cerr<<"Error: BW - NAN(2) b["<<i<<"]["<<k<<"]="<<(tdhmm.getB())[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
633 exit(1);
634 }
635 if (p_isinf((tdhmm.getB())[i][k])) {
636 cerr<<"Error: BW - INF(2) b["<<i<<"]["<<k<<"]="<<(tdhmm.getB())[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
637 exit(1);
638 }
639 if ((tdhmm.getB())[i][k]==0) {
640 //cerr <<"Error: BW - ZERO(2) b["<<i<<"]["<<k<<"]="<<(tdhmm.getB())[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
641 // exit(1);
642 }
643 }
644 }
645 }
646
647 //It can be possible that a probability is not updated
648 //We normalize the probabilitites
649 for(i=0; i<N; i++) {
650 double sum=0;
651 for(j=0; j<N; j++)
652 sum+=(tdhmm.getA())[i][j];
653 for(j=0; j<N; j++)
654 (tdhmm.getA())[i][j]=(tdhmm.getA())[i][j]/sum;
655 }
656
657 for(i=0; i<N; i++) {
658 double sum=0;
659 for(k=0; k<M; k++) {
660 if(output[k].find(i)!=output[k].end())
661 sum+=(tdhmm.getB())[i][k];
662 }
663 for(k=0; k<M; k++) {
664 if(output[k].find(i)!=output[k].end())
665 (tdhmm.getB())[i][k]=(tdhmm.getB())[i][k]/sum;
666 }
667 }
668
669 cerr<<"Log="<<loli<<"\n";
670}
671
672void
673HMM::tagger(MorphoStream &morpho_stream, UFILE* Output) {
674 int i, j, k, nw;
675 TaggerWord *word = NULL__null;
676 TTag tag;
677
678 set <TTag> ambg_class_tags, tags, pretags;
679 set <TTag>::iterator itag, jtag;
680
681 double prob, loli, x;
682 int N = tdhmm.getN();
683 vector <vector <double> > alpha(2, vector<double>(N));
684 vector <vector <vector<TTag> > > best(2, vector <vector <TTag> >(N));
685
686 vector <TaggerWord> wpend;
687 int nwpend;
688
689 morpho_stream.setNullFlush(TheFlags.getNullFlush());
690
691 Collection &output = tdhmm.getOutput();
692
693 loli = nw = 0;
Although the value stored to 'nw' is used in the enclosing expression, the value is never actually read from 'nw'
694
695 //Initialization
696 tags.insert(eos);
697 alpha[0][eos] = 1;
698
699 word = morpho_stream.get_next_word();
700
701 while (word) {
702 wpend.push_back(*word);
703 nwpend = wpend.size();
704
705 pretags = tags; // Tags from the previous word
706
707 tags = word->get_tags();
708
709 if (tags.size()==0) // This is an unknown word
710 tags = tdhmm.getOpenClass();
711
712 ambg_class_tags = require_similar_ambiguity_class(tdhmm, tags, *word, TheFlags.getDebug());
713
714 k = output[ambg_class_tags]; //Ambiguity class the word belongs to
715
716 clear_array_double(&alpha[nwpend%2][0], N);
717 clear_array_vector(&best[nwpend%2][0], N);
718
719 //Induction
720 for (itag=tags.begin(); itag!=tags.end(); itag++) { //For all tag from the current word
721 i=*itag;
722 for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) { //For all tags from the previous word
723 j=*jtag;
724 x = alpha[1-nwpend%2][j]*(tdhmm.getA())[j][i]*(tdhmm.getB())[i][k];
725 if (alpha[nwpend%2][i]<=x) {
726 if (nwpend>1)
727 best[nwpend%2][i] = best[1-nwpend%2][j];
728 best[nwpend%2][i].push_back(i);
729 alpha[nwpend%2][i] = x;
730 }
731 }
732 }
733
734 //Backtracking
735 if (tags.size() == 1) {
736 tag = *tags.begin();
737 prob = alpha[nwpend%2][tag];
738
739 if (prob>0)
740 loli -= log(prob);
741 else {
742 if (TheFlags.getDebug())
743 cerr<<"Problem with word '"<<word->get_superficial_form()<<"' "<<word->get_string_tags()<<"\n";
744 }
745 for (unsigned t=0; t<best[nwpend%2][tag].size(); t++) {
746 if (TheFlags.getFirst()) {
747 UString const &micad = wpend[t].get_all_chosen_tag_first(best[nwpend%2][tag][t], (tdhmm.getTagIndex())["TAG_kEOF"_u]);
748 write(micad, Output);
749 } else {
750 // print Output
751 wpend[t].set_show_sf(TheFlags.getShowSuperficial());
752 UString const &micad = wpend[t].get_lexical_form(best[nwpend%2][tag][t], (tdhmm.getTagIndex())["TAG_kEOF"_u]);
753 write(micad, Output);
754 }
755 }
756
757 //Return to the initial state
758 wpend.clear();
759 alpha[0][tag] = 1;
760 }
761
762 delete word;
763
764 if(morpho_stream.getEndOfFile())
765 {
766 if(TheFlags.getNullFlush())
767 {
768 u_fputcu_fputc_72('\0', Output);
769 tags.clear();
770 tags.insert(eos);
771 alpha[0][eos] = 1;
772 }
773
774 u_fflushu_fflush_72(Output);
775 morpho_stream.setEndOfFile(false);
776 }
777 word = morpho_stream.get_next_word();
778 }
779
780 if ((tags.size()>1)&&(TheFlags.getDebug())) {
781 cerr << "Error: The text to disambiguate has finished, but there are ambiguous words that has not been disambiguated.\n";
782 cerr << "This message should never appears. If you are reading this ..... these are very bad news.\n";
783 }
784}
785
786
787void
788HMM::print_A() {
789 int i,j;
790
791 cout<<"TRANSITION MATRIX (A)\n------------------------------\n";
792 for(i=0; i != tdhmm.getN(); i++)
793 for(j=0; j != tdhmm.getN(); j++) {
794 cout<<"A["<<i<<"]["<<j<<"] = "<<(tdhmm.getA())[i][j]<<"\n";
795 }
796}
797
798void
799HMM::print_B() {
800 int i,k;
801
802 cout<<"EMISSION MATRIX (B)\n-------------------------------\n";
803 for(i=0; i != tdhmm.getN(); i++)
804 for(k=0; k != tdhmm.getM(); k++) {
805 Collection &output = tdhmm.getOutput();
806 if(output[k].find(i)!=output[k].end())
807 cout<<"B["<<i<<"]["<<k<<"] = "<<(tdhmm.getB())[i][k]<<"\n";
808 }
809}
810
811void HMM::print_ambiguity_classes() {
812 set<TTag> ambiguity_class;
813 set<TTag>::iterator itag;
814 cout<<"AMBIGUITY CLASSES\n-------------------------------\n";
815 for(int i=0; i != tdhmm.getM(); i++) {
816 ambiguity_class = (tdhmm.getOutput())[i];
817 cout <<i<<": ";
818 for (itag=ambiguity_class.begin(); itag!=ambiguity_class.end(); itag++) {
819 cout << *itag <<" ";
820 }
821 cout << "\n";
822 }
823}