File: | perceptron_tagger.cc |
Warning: | line 261, column 7 Branch condition evaluates to a garbage value |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | #include <apertium/perceptron_tagger.h> | |||
2 | ||||
3 | #include <apertium/mtx_reader.h> | |||
4 | #include <apertium/exception.h> | |||
5 | #include <algorithm> | |||
6 | #include <map> | |||
7 | #include <set> | |||
8 | ||||
9 | namespace Apertium { | |||
10 | ||||
// Construct a tagger configured by the given flags; spec and weights are
// populated later via read_spec()/deserialise() or by training.
PerceptronTagger::PerceptronTagger(TaggerFlags flags) : StreamTagger(flags) {};
12 | ||||
// Nothing to release explicitly; members clean up through their own destructors.
PerceptronTagger::~PerceptronTagger() {};
14 | ||||
// Tag an entire stream: delegates to the sentence-level driver, which splits
// the input into sentences (honouring the SentSeg flag) and calls back into
// tagSentence() for each one.
void PerceptronTagger::tag(Stream &in, std::ostream &out) {
  SentenceStream::SentenceTagger::tag(in, out, TheFlags.getSentSeg());
}
18 | ||||
// Load the feature-extraction specification (MTX file) into this->spec.
void PerceptronTagger::read_spec(const std::string &filename) {
  MTXReader(spec).read(filename);
}
22 | ||||
23 | std::ostream & | |||
24 | operator<<(std::ostream &out, PerceptronTagger const &pt) { | |||
25 | out << "== Spec ==\n"; | |||
26 | out << pt.spec; | |||
27 | out << "== Weights " << pt.weights.size() << " ==\n"; | |||
28 | out << pt.weights; | |||
29 | return out; | |||
30 | } | |||
31 | ||||
// Decode one sentence with beam search: at each token, every surviving
// partial tagging on the agenda is extended with every analysis of the
// token, scored with the perceptron weights, and the beam_width best
// extensions are kept. Returns the tagging of the best-scoring item.
TaggedSentence
PerceptronTagger::tagSentence(const Sentence &untagged_sent) {
  const size_t sent_len = untagged_sent.size();

  // The agenda is the current beam of partial taggings, seeded with a
  // single empty hypothesis.
  std::vector<AgendaItem> agenda;
  agenda.reserve(spec.beam_width);
  agenda.push_back(AgendaItem());
  agenda.back().tagged.reserve(sent_len);

  // Scratch feature vector, cleared and refilled per wordoid.
  UnaryFeatureVec feat_vec_delta;
  std::vector<Analysis>::const_iterator analys_it;
  std::vector<AgendaItem>::const_iterator agenda_it;
  std::vector<Morpheme>::const_iterator wordoid_it;

  for (size_t token_idx = 0; token_idx < sent_len; token_idx++) {
    const std::vector<Analysis> &analyses =
      untagged_sent[token_idx].TheLexicalUnit->TheAnalyses;

    std::vector<AgendaItem> new_agenda;
    new_agenda.reserve(spec.beam_width * analyses.size());

    // Unambiguous (1 analysis) or unknown (0 analyses) tokens cannot change
    // the relative ranking of hypotheses, so extend every item in place and
    // skip scoring and beam pruning entirely.
    if (analyses.size() == 1) {
      extendAgendaAll(agenda, analyses.front());
      continue;
    } else if (analyses.size() == 0) {
      extendAgendaAll(agenda, Optional<Analysis>());
      continue;
    }

    // Cross product: every agenda item x every analysis of this token.
    for (agenda_it = agenda.begin(); agenda_it != agenda.end(); agenda_it++) {
      for (analys_it = analyses.begin(); analys_it != analyses.end(); analys_it++) {
        const std::vector<Morpheme> &wordoids = analys_it->TheMorphemes;

        new_agenda.push_back(*agenda_it);
        AgendaItem &new_agenda_item = new_agenda.back();
        new_agenda_item.tagged.push_back(*analys_it);

        // Score each wordoid (sub-token morpheme) of the analysis and
        // accumulate into the hypothesis score.
        for (wordoid_it = wordoids.begin(); wordoid_it != wordoids.end(); wordoid_it++) {
          int wordoid_idx = wordoid_it - wordoids.begin();
          feat_vec_delta.clear();
          spec.get_features(new_agenda_item.tagged, untagged_sent,
                            token_idx, wordoid_idx, feat_vec_delta);
          if (TheFlags.getDebug()) {
            FeatureVec fv(feat_vec_delta);
            std::cerr << "Token " << token_idx << "\t\tWordoid " << wordoid_idx << "\n";
            std::cerr << fv;
            std::cerr << "Score: " << weights * feat_vec_delta << "\n";
          }
          new_agenda_item.score += weights * feat_vec_delta;
        }
      }
    }
    // Apply the beam: keep only the beam_width best hypotheses.
    // operator<(AgendaItem, AgendaItem) is inverted (higher score sorts
    // first), so partial_sort_copy retains the highest-scoring items.
    if (TheFlags.getDebug()) {
      std::cerr << "-- Before beam: --\n" << new_agenda;
    }
    size_t new_agenda_size = std::min((size_t)spec.beam_width, new_agenda.size());
    agenda.resize(new_agenda_size);
    std::partial_sort_copy(new_agenda.begin(), new_agenda.end(),
                           agenda.begin(), agenda.end());
    if (TheFlags.getDebug()) {
      std::cerr << "-- After beam: --\n" << agenda;
    }
  }

  spec.clearCache();
  // Best hypothesis is at the front after the final partial sort.
  return agenda.front().tagged;
}
100 | ||||
// Emit one tagged lexical unit; simply forwards to the base-class formatter.
void PerceptronTagger::outputLexicalUnit(
    const LexicalUnit &lexical_unit, const Optional<Analysis> analysis,
    std::ostream &output) {
  StreamTagger::outputLexicalUnit(lexical_unit, analysis, output);
}
106 | ||||
// One averaged-perceptron training step over a single (tagged, untagged)
// sentence pair, using beam search with early update: if the gold-standard
// prefix falls off the beam mid-sentence, the weights are updated
// immediately and the rest of the sentence is abandoned.
// Returns true when the sentence was skipped because the gold analysis was
// not among the untagged analyses (only with --skip-on-error); false
// otherwise. The caller accumulates this as a skip count.
bool PerceptronTagger::trainSentence(
    const TrainingSentence &sentence,
    FeatureVecAverager &avg_weights)
{
  const TaggedSentence &tagged_sent = sentence.first;
  const Sentence &untagged_sent = sentence.second;
  assert(tagged_sent.size() == untagged_sent.size());
  const size_t sent_len = tagged_sent.size();

  // Beam of partial taggings, seeded with one empty hypothesis.
  std::vector<TrainingAgendaItem> agenda;
  agenda.reserve(spec.beam_width);
  agenda.push_back(TrainingAgendaItem());
  agenda.back().tagged.reserve(sent_len);
  // Points at the agenda item whose tagging matches the gold prefix.
  std::vector<TrainingAgendaItem>::const_iterator correct_agenda_it
    = agenda.begin();

  // Gold-standard hypothesis, rebuilt alongside the beam so its feature
  // vector is available for perceptron updates.
  TrainingAgendaItem correct_sentence;
  correct_sentence.tagged.reserve(sent_len);

  UnaryFeatureVec feat_vec_delta;
  std::vector<Analysis>::const_iterator analys_it;
  std::vector<TrainingAgendaItem>::const_iterator agenda_it;
  std::vector<Morpheme>::const_iterator wordoid_it;

  for (size_t token_idx = 0; token_idx < sent_len; token_idx++) {
    //std::cerr << "Token idx: " << token_idx << "\n";
    const TaggedToken &tagged_tok(tagged_sent[token_idx]);
    const StreamedType &untagged_tok(untagged_sent[token_idx]);
    correct_sentence.tagged.push_back(tagged_tok);

    const std::vector<Analysis> &analyses =
      untagged_tok.TheLexicalUnit->TheAnalyses;

    std::vector<TrainingAgendaItem> new_agenda;
    new_agenda.reserve(spec.beam_width * analyses.size());

    if (analyses.size() <= 1 || !tagged_tok) {
      // Case |analyses| = 0, nothing we can do
      // Case !tagged_tok, |analyses| > 0, no point penalising a guess which
      // can only be incorrect when there's no correct answer
      // Case |analyses| = 1, everything will cancel out anyway
      if (analyses.size() == 1) {
        extendAgendaAll(agenda, analyses.front());
        continue;
      } else {
        extendAgendaAll(agenda, Optional<Analysis>());
        continue;
      }
    }

    // Extend every agenda item with every analysis, scoring as we go, and
    // track which extension (if any) continues the gold prefix.
    bool correct_available = false;
    for (agenda_it = agenda.begin(); agenda_it != agenda.end(); agenda_it++) {
      //std::cerr << *agenda_it;
      for (analys_it = analyses.begin(); analys_it != analyses.end(); analys_it++) {
        const std::vector<Morpheme> &wordoids = analys_it->TheMorphemes;

        new_agenda.push_back(*agenda_it);
        TrainingAgendaItem &new_agenda_item = new_agenda.back();
        new_agenda_item.tagged.push_back(*analys_it);

        for (wordoid_it = wordoids.begin(); wordoid_it != wordoids.end(); wordoid_it++) {
          int wordoid_idx = wordoid_it - wordoids.begin();
          feat_vec_delta.clear();
          spec.get_features(new_agenda_item.tagged, untagged_sent,
                            token_idx, wordoid_idx, feat_vec_delta);
          // Unlike decoding, training keeps the accumulated feature vector
          // (vec) so the perceptron update can be computed later.
          new_agenda_item.vec += feat_vec_delta;
          new_agenda_item.score += weights * feat_vec_delta;
          if (agenda_it == correct_agenda_it && *analys_it == *tagged_tok) {
            correct_sentence = new_agenda_item;
            correct_available = true;
          }
        }
      }
    }
    if (!correct_available) {
      // Gold analysis not present in the untagged input's ambiguity class.
      if (TheFlags.getSkipErrors()) {
        return true;
      } else {
        std::stringstream what_;
        what_ << "Tagged analysis unavailable in untagged/ambigous input.\n";
        what_ << "Available:\n";
        for (analys_it = analyses.begin(); analys_it != analyses.end(); analys_it++) {
          what_ << *analys_it << "\n";
        }
        what_ << "Required: " << *tagged_tok << "\n";
        what_ << "Rerun with --skip-on-error to skip this sentence.";
        throw Apertium::Exception::PerceptronTagger::CorrectAnalysisUnavailable(what_);
      }
    }
    // Apply the beam
    //std::cerr << "-- Before beam: --\n" << new_agenda;
    size_t new_agenda_size = std::min((size_t)spec.beam_width, new_agenda.size());
    agenda.resize(new_agenda_size);
    std::partial_sort_copy(new_agenda.begin(), new_agenda.end(),
                           agenda.begin(), agenda.end());
    //std::cerr << "-- After beam: --\n" << agenda;

    // Early update "fallen off the beam"
    bool any_match = false;
    for (agenda_it = agenda.begin(); agenda_it != agenda.end(); agenda_it++) {
      if (agenda_it->tagged == correct_sentence.tagged) {
        correct_agenda_it = agenda_it;
        any_match = true;
        break;
      }
    }
    if (!any_match) {
      // The gold prefix was pruned: update weights now (penalise the current
      // best hypothesis, reward the gold one) and stop on this sentence.
      /*std::cerr << "Early update time!\n";
      std::cerr << "Before:\n" << weights << "\n";
      std::cerr << "Incorrect:\n" << agenda.front().vec << "\n";
      std::cerr << "Correct:\n" << correct_sentence.vec << "\n";*/
      avg_weights -= agenda.front().vec;
      avg_weights += correct_sentence.vec;
      avg_weights.incIteration();
      //std::cerr << "After:\n" << weights << "\n";
      return false;
    }
  }
  // Normal update
  /*std::cerr << "Best match:\n" << agenda.front().tagged << "\n\n";
  std::cerr << "Correct:\n" << correct_sentence.tagged << "\n\n";*/
  if (agenda.front().tagged != correct_sentence.tagged) {
    // Full-sentence decode disagreed with gold: standard perceptron update.
    /*std::cerr << "Normal update time!\n";
    std::cerr << "Before:\n" << weights << "\n";
    std::cerr << "Incorrect:\n" << agenda.front().vec << "\n";
    std::cerr << "Correct:\n" << correct_sentence.vec << "\n";*/
    avg_weights -= agenda.front().vec;
    avg_weights += correct_sentence.vec;
    avg_weights.incIteration();
    //std::cerr << "After:\n" << weights << "\n";
  }
  return false;
}
240 | ||||
// Single-stream training is not supported for the perceptron tagger; the
// two-stream overload below is the real entry point.
void PerceptronTagger::train(Stream&) {} // dummy
242 | ||||
243 | void PerceptronTagger::train( | |||
244 | Stream &tagged, | |||
245 | Stream &untagged, | |||
246 | int iterations) { | |||
247 | FeatureVecAverager avg_weights(weights); | |||
248 | TrainingCorpus tc(tagged, untagged, TheFlags.getSkipErrors(), TheFlags.getSentSeg()); | |||
249 | size_t avail_skipped; | |||
| ||||
250 | for (int i = 0; i < iterations; i++) { | |||
251 | std::cerr << "Iteration " << i + 1 << " of " << iterations << "\n"; | |||
252 | avail_skipped = 0; | |||
253 | tc.shuffle(); | |||
254 | std::vector<TrainingSentence>::const_iterator si; | |||
255 | for (si = tc.sentences.begin(); si != tc.sentences.end(); si++) { | |||
256 | avail_skipped += trainSentence(*si, avg_weights); | |||
257 | spec.clearCache(); | |||
258 | } | |||
259 | } | |||
260 | avg_weights.average(); | |||
261 | if (avail_skipped) { | |||
| ||||
262 | std::cerr << "Skipped " << tc.skipped << " sentences due to token " | |||
263 | << "misalignment and " << avail_skipped << " sentences due to " | |||
264 | << "tagged token being unavailable in untagged file out of " | |||
265 | << tc.sentences.size() << " total sentences.\n"; | |||
266 | } | |||
267 | //std::cerr << *this; | |||
268 | } | |||
269 | ||||
// Write the model (spec, then weights) to a binary stream; the order must
// match deserialise() below.
void PerceptronTagger::serialise(std::ostream &serialised) const
{
  spec.serialise(serialised);
  weights.serialise(serialised);
};
275 | ||||
// Read the model back in the same order serialise() wrote it: spec first,
// then weights.
void PerceptronTagger::deserialise(std::istream &serialised)
{
  spec.deserialise(serialised);
  weights.deserialise(serialised);
};
281 | ||||
282 | template <typename T> void | |||
283 | PerceptronTagger::extendAgendaAll( | |||
284 | std::vector<T> &agenda, | |||
285 | Optional<Analysis> analy) { | |||
286 | typename std::vector<T>::iterator agenda_it; | |||
287 | for (agenda_it = agenda.begin(); agenda_it != agenda.end(); agenda_it++) { | |||
288 | agenda_it->tagged.push_back(analy); | |||
289 | } | |||
290 | } | |||
291 | ||||
292 | std::ostream& | |||
293 | operator<<(std::ostream &out, const TaggedSentence &tagged) { | |||
294 | TaggedSentence::const_iterator tsi; | |||
295 | for (tsi = tagged.begin(); tsi != tagged.end(); tsi++) { | |||
296 | if (*tsi) { | |||
297 | out << **tsi; | |||
298 | } else { | |||
299 | out << "*"; | |||
300 | } | |||
301 | out << " "; | |||
302 | } | |||
303 | return out; | |||
304 | } | |||
305 | ||||
306 | std::ostream& | |||
307 | operator<<(std::ostream &out, const PerceptronTagger::TrainingAgendaItem &tai) { | |||
308 | out << "Score: " << tai.score << "\n"; | |||
309 | out << "Sentence: " << tai.tagged << "\n"; | |||
310 | out << "\n"; | |||
311 | out << "Vector:\n" << tai.vec; | |||
312 | return out; | |||
313 | } | |||
314 | ||||
315 | std::ostream& | |||
316 | operator<<(std::ostream &out, const std::vector<PerceptronTagger::TrainingAgendaItem> &agenda) { | |||
317 | std::vector<PerceptronTagger::TrainingAgendaItem>::const_iterator agenda_it; | |||
318 | for (agenda_it = agenda.begin(); agenda_it != agenda.end(); agenda_it++) { | |||
319 | out << *agenda_it; | |||
320 | } | |||
321 | out << "\n\n"; | |||
322 | return out; | |||
323 | } | |||
324 | ||||
325 | std::ostream& | |||
326 | operator<<(std::ostream &out, const PerceptronTagger::AgendaItem &ai) { | |||
327 | out << "Score: " << ai.score << "\n"; | |||
328 | out << "Sentence: " << ai.tagged << "\n"; | |||
329 | return out; | |||
330 | } | |||
331 | ||||
332 | std::ostream& | |||
333 | operator<<(std::ostream &out, const std::vector<PerceptronTagger::AgendaItem> &agenda) { | |||
334 | std::vector<PerceptronTagger::AgendaItem>::const_iterator agenda_it; | |||
335 | for (agenda_it = agenda.begin(); agenda_it != agenda.end(); agenda_it++) { | |||
336 | out << *agenda_it; | |||
337 | } | |||
338 | out << "\n\n"; | |||
339 | return out; | |||
340 | } | |||
341 | ||||
342 | bool operator<(const PerceptronTagger::AgendaItem &a, | |||
343 | const PerceptronTagger::AgendaItem &b) { | |||
344 | return a.score > b.score; | |||
345 | }; | |||
346 | } |