commit d73d2bd7f6117ad1619ee409a72c07f2ee294c34 Author: Daniel Swanson Date: Sat Aug 21 16:51:35 2021 -0400 nearly have perceptron loading diff --git a/apertium/feature_vec.h b/apertium/feature_vec.h index 0df6b69..dff8b0e 100644 --- a/apertium/feature_vec.h +++ b/apertium/feature_vec.h @@ -8,6 +8,8 @@ #include #include +class TaggerDataExe; + namespace Apertium { typedef std::vector FeatureKey; @@ -20,6 +22,7 @@ class FeatureVec { friend class FeatureVecAverager; friend class PerceptronTagger; + friend class ::TaggerDataExe; public: typedef std::map Map; typedef std::pair Pair; diff --git a/apertium/perceptron_spec.cc b/apertium/perceptron_spec.cc index 0c0161f..607bee5 100644 --- a/apertium/perceptron_spec.cc +++ b/apertium/perceptron_spec.cc @@ -2,9 +2,12 @@ #include #include #include +#include #include #include #include +#include +#include namespace Apertium { @@ -158,9 +161,22 @@ PerceptronSpec::coarsen(const Morpheme &wrd) const { std::map::const_iterator it = coarsen_cache.find(wrd); if (it == coarsen_cache.end()) { - UString coarse_tag = coarse_tags->coarsen(wrd); std::string result; - utf8::utf16to8(coarse_tag.begin(), coarse_tag.end(), std::back_inserter(result)); + if (tde) { + MatchState2 state(&tde->trans); + state.step(static_cast(wrd), tde->alpha, true); + int val = state.classifyFinals(coarse_word_match_finals); + if (val == -1) { + uint64_t undef; + tde->search(tde->tag_index, tde->tag_index_count, "TAG_kUNDEF"_u, undef); + val = undef; + } + UString_view coarse = tde->str_write.get(tde->array_tags[val]); + utf8::utf16to8(coarse.begin(), coarse.end(), std::back_inserter(result)); + } else { + UString coarse_tag = coarse_tags->coarsen(wrd); + utf8::utf16to8(coarse_tag.begin(), coarse_tag.end(), std::back_inserter(result)); + } coarsen_cache[wrd] = result; return result; } @@ -499,20 +515,13 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) case EXAMBGSET: { assert(spec.coarse_tags); std::vector ambgset; - const std::vector &analyses = get_token(untagged).TheAnalyses; - std::vector::const_iterator analy_it; - for (analy_it = analyses.begin(); analy_it != analyses.end(); analy_it++) { + for (auto& analy_it : get_token(untagged).TheAnalyses) { ambgset.push_back(std::string()); - const std::vector &wrds = analy_it->TheMorphemes; - std::vector::const_iterator wrd_it = wrds.begin(); - while (true) { - ambgset.back() += spec.coarsen(*wrd_it); - wrd_it++; - if (wrd_it == wrds.end()) { - break; - } else { + for (auto& wrd_it : analy_it.TheMorphemes) { + if (!ambgset.back().empty()) { ambgset.back() += '+'; } + ambgset.back() += spec.coarsen(wrd_it); } } stack.push(ambgset); @@ -779,18 +788,16 @@ void PerceptronSpec::deserialiseFeatDefn( std::istream &serialised, FeatureDefn &feat) { std::string feat_str = Deserialiser::deserialise(serialised); feat.reserve(feat_str.size()); - std::string::iterator feat_str_it; - for (feat_str_it = feat_str.begin(); feat_str_it != feat_str.end(); feat_str_it++) { - feat.push_back(*feat_str_it); + for (auto& it : feat_str) { + feat.push_back(it); } } void PerceptronSpec::serialiseFeatDefnVec( std::ostream &serialised, const std::vector &defn_vec) const { Serialiser::serialise(defn_vec.size(), serialised); - std::vector::const_iterator feat_it; - for (feat_it = defn_vec.begin(); feat_it != defn_vec.end(); feat_it++) { - serialiseFeatDefn(serialised, *feat_it); + for (auto& it : defn_vec) { + serialiseFeatDefn(serialised, it); } } @@ -836,4 +843,64 @@ void PerceptronSpec::deserialise(std::istream &serialised) { } } +void PerceptronSpec::read_compressed(FILE* in) +{ + beam_width = OldBinary::read_int(in, false); + uint64_t count = OldBinary::read_int(in, false); + str_consts = std::vector(count); + for (uint64_t i = 0; i < count; i++) { + uint64_t count2 = OldBinary::read_int(in, false); + for (uint64_t j = 0; j < count2; j++) { + str_consts[i] += static_cast(OldBinary::read_int(in, false)); + } + } + set_consts.clear(); + count = OldBinary::read_int(in, false); + for (uint64_t i = 0; i < count; i++) { + VMSet cur; + uint64_t count2 = OldBinary::read_int(in, false); + for (uint64_t j = 0; j < count2; j++) { + std::string s; + uint64_t count3 = OldBinary::read_int(in, false); + for (uint64_t k = 0; k < count3; k++) { + s += static_cast(OldBinary::read_int(in, false)); + } + cur.insert(s); + } + set_consts.push_back(cur); + } + count = OldBinary::read_int(in, false); + features.clear(); + for (uint64_t i = 0; i < count; i++) { + features.push_back(FeatureDefn()); + uint64_t count2 = OldBinary::read_int(in, false); + for (uint64_t j = 0; j < count2; j++) { + features[i].push_back(static_cast(OldBinary::read_int(in, false))); + } + } + count = OldBinary::read_int(in, false); + global_defns.clear(); + for (uint64_t i = 0; i < count; i++) { + global_defns.push_back(FeatureDefn()); + uint64_t count2 = OldBinary::read_int(in, false); + for (uint64_t j = 0; j < count2; j++) { + global_defns[i].push_back(static_cast(OldBinary::read_int(in, false))); + } + } + global_pred.clear(); + count = OldBinary::read_int(in, false); + for (uint64_t i = 0; i < count; i++) { + global_pred.push_back(static_cast(OldBinary::read_int(in, false))); + } +} + +void PerceptronSpec::set_tagger_data_exe(TaggerDataExe* t) +{ + tde = t; + coarse_word_match_finals.clear(); + for (uint64_t i = 0; i < tde->finals_count; i++) { + coarse_word_match_finals[tde->finals[i].i1] = tde->finals[i].i2; + } +} + } diff --git a/apertium/perceptron_spec.h b/apertium/perceptron_spec.h index 5b8e986..8c082af 100644 --- a/apertium/perceptron_spec.h +++ b/apertium/perceptron_spec.h @@ -26,6 +26,8 @@ using namespace Apertium::SentenceStream; +class TaggerDataExe; + namespace Apertium { typedef std::set VMSet; class PerceptronSpec @@ -376,6 +378,8 @@ public: signed char intbyte : 8; }; Optional coarse_tags; + TaggerDataExe* tde = nullptr; + std::map coarse_word_match_finals; static std::string dot; std::vector str_consts; std::vector set_consts; @@ -496,6 +500,8 @@ private: public: void serialise(std::ostream &serialised) const; void deserialise(std::istream &serialised); + void read_compressed(FILE* in); + void set_tagger_data_exe(TaggerDataExe* t); }; } diff --git a/apertium/tagger_data_exe.cc b/apertium/tagger_data_exe.cc index aa63f60..367751d 100644 --- a/apertium/tagger_data_exe.cc +++ b/apertium/tagger_data_exe.cc @@ -17,8 +17,10 @@ #include +#include + #include -#include +#include #include #include @@ -31,84 +33,38 @@ TaggerDataExe::TaggerDataExe() : alpha(AlphabetExe(&str_write)) {} -uint64_t deserialise_int(FILE* in) -{ - uint64_t ret = 0; - uint8_t size = fgetc_unlocked(in); - if (size > 8) { - throw std::runtime_error("can't deserialise int"); - } - uint8_t buffer[8]; - if (fread_unlocked(buffer, 1, size, in) != size) { - throw std::runtime_error("can't deserialise int"); - } - for (uint8_t i = 0; i < size; i++) { - ret += static_cast(buffer[i]) << (8 * (size - i - 1)); - } - return ret; -} - -StringRef deserialise_str(FILE* in, StringWriter& sw) -{ - UString s; - for (uint64_t i = deserialise_int(in); i > 0; i--) { - s += static_cast(deserialise_int(in)); - } - return sw.add(s); -} - -void deserialise_str(FILE* in, UString& s) -{ - for (uint64_t i = deserialise_int(in); i > 0; i--) { - s += static_cast(deserialise_int(in)); - } -} - void deserialise_tags(FILE* in, UString& s) { - for (uint64_t i = deserialise_int(in); i > 0; i--) { + for (uint64_t i = OldBinary::read_int(in, false); i > 0; i--) { s += '<'; - deserialise_str(in, s); + OldBinary::read_ustr(in, s, false); s += '>'; } } -double -read_compressed_double(FILE *input) +StringRef deserialise_str(FILE* in, StringWriter& sw) { - double retval; -#ifdef WORDS_BIGENDIAN - fread_unlocked(&retval, sizeof(double), 1, input); -#else - char *s = reinterpret_cast(&retval); - - for(int i = sizeof(double)-1; i != -1; i--) - { - if(fread_unlocked(&(s[i]), 1, 1, input)==0) - { - return 0; - } - } -#endif - return retval; + UString s; + OldBinary::read_ustr(in, s, false); + return sw.add(s); } void TaggerDataExe::read_compressed_unigram1(FILE* in) { - uni1_count = deserialise_int(in); + uni1_count = OldBinary::read_int(in, false); uni1 = new str_int[uni1_count]; for (uint64_t i = 0; i < uni1_count; i++) { UString s; - for (uint64_t j = deserialise_int(in); j > 0; j--) { + for (uint64_t j = OldBinary::read_int(in, false); j > 0; j--) { if (!s.empty()) { s += '+'; } - deserialise_str(in, s); + OldBinary::read_ustr(in, s, false); deserialise_tags(in, s); } uni1[i].s = str_write.add(s); - uni1[i].i = deserialise_int(in); + uni1[i].i = OldBinary::read_int(in, false); } } @@ -119,19 +75,19 @@ TaggerDataExe::read_compressed_unigram2(FILE* in) std::vector lems; std::vector counts; - for (uint64_t ans = deserialise_int(in); ans > 0; ans--) { + for (uint64_t ans = OldBinary::read_int(in, false); ans > 0; ans--) { UString a; deserialise_tags(in, a); - for (uint64_t c = deserialise_int(in); c > 0; c--) { + for (uint64_t c = OldBinary::read_int(in, false); c > 0; c--) { a += '+'; - deserialise_str(in, a); + OldBinary::read_ustr(in, a, false); deserialise_tags(in, a); } StringRef ar = str_write.add(a); - for (uint64_t i = deserialise_int(in); i > 0; i--) { + for (uint64_t i = OldBinary::read_int(in, false); i > 0; i--) { as.push_back(ar); lems.push_back(deserialise_str(in, str_write)); - counts.push_back(deserialise_int(in)); + counts.push_back(OldBinary::read_int(in, false)); uni2_count++; } } @@ -151,14 +107,14 @@ TaggerDataExe::read_compressed_unigram3(FILE* in) std::vector s2; std::vector counts; - for (uint64_t ans = deserialise_int(in); ans > 0; ans--) { + for (uint64_t ans = OldBinary::read_int(in, false); ans > 0; ans--) { UString tg; deserialise_tags(in, tg); StringRef tgr = str_write.add(tg); - for (uint64_t i = deserialise_int(in); i > 0; i--) { + for (uint64_t i = OldBinary::read_int(in, false); i > 0; i--) { s1.push_back(tgr); s2.push_back(deserialise_str(in, str_write)); - counts.push_back(deserialise_int(in)); + counts.push_back(OldBinary::read_int(in, false)); uni3_l_t_count++; } } @@ -173,14 +129,14 @@ TaggerDataExe::read_compressed_unigram3(FILE* in) s2.clear(); counts.clear(); - for (uint64_t ans = deserialise_int(in); ans > 0; ans--) { + for (uint64_t ans = OldBinary::read_int(in, false); ans > 0; ans--) { UString tg; deserialise_tags(in, tg); StringRef tgr = str_write.add(tg); - for (uint64_t i = deserialise_int(in); i > 0; i--) { + for (uint64_t i = OldBinary::read_int(in, false); i > 0; i--) { s1.push_back(tgr); s2.push_back(deserialise_str(in, str_write)); - counts.push_back(deserialise_int(in)); + counts.push_back(OldBinary::read_int(in, false)); uni3_cl_ct_count++; } } @@ -195,14 +151,14 @@ TaggerDataExe::read_compressed_unigram3(FILE* in) s2.clear(); counts.clear(); - for (uint64_t ans = deserialise_int(in); ans > 0; ans--) { + for (uint64_t ans = OldBinary::read_int(in, false); ans > 0; ans--) { StringRef lm = deserialise_str(in, str_write); - for (uint64_t i = deserialise_int(in); i > 0; i--) { + for (uint64_t i = OldBinary::read_int(in, false); i > 0; i--) { s1.push_back(lm); UString tg; deserialise_tags(in, tg); s2.push_back(str_write.add(tg)); - counts.push_back(deserialise_int(in)); + counts.push_back(OldBinary::read_int(in, false)); uni3_ct_cl_count++; } } @@ -220,43 +176,47 @@ TaggerDataExe::read_compressed_hmm_lsw(FILE* in, bool is_hmm) // open_class std::vector open_class; uint64_t val = 0; - for (uint64_t i = Compression::multibyte_read(in); i > 0; i--) { - val += Compression::multibyte_read(in); + for (uint64_t i = OldBinary::read_int(in); i > 0; i--) { + val += OldBinary::read_int(in); open_class.push_back(val); } // forbid_rules - forbid_rules_count = Compression::multibyte_read(in); + forbid_rules_count = OldBinary::read_int(in); forbid_rules = new int_int[forbid_rules_count]; for (uint64_t i = 0; i < forbid_rules_count; i++) { - forbid_rules[i].i1 = Compression::multibyte_read(in); - forbid_rules[i].i2 = Compression::multibyte_read(in); + forbid_rules[i].i1 = OldBinary::read_int(in); + forbid_rules[i].i2 = OldBinary::read_int(in); } // array_tags - array_tags_count = Compression::multibyte_read(in); + array_tags_count = OldBinary::read_int(in); array_tags = new StringRef[array_tags_count]; for (uint64_t i = 0; i < array_tags_count; i++) { - array_tags[i] = str_write.add(Compression::string_read(in)); + UString temp; + OldBinary::read_ustr(in, temp); + array_tags[i] = str_write.add(temp); } // tag_index - tag_index_count = Compression::multibyte_read(in); + tag_index_count = OldBinary::read_int(in); tag_index = new str_int[tag_index_count]; for (uint64_t i = 0; i < tag_index_count; i++) { - tag_index[i].s = str_write.add(Compression::string_read(in)); - tag_index[i].i = Compression::multibyte_read(in); + UString temp; + OldBinary::read_ustr(in, temp); + tag_index[i].s = str_write.add(temp); + tag_index[i].i = OldBinary::read_int(in); } // enforce_rules - enforce_rules_count = Compression::multibyte_read(in); + enforce_rules_count = OldBinary::read_int(in); enforce_rules_offsets = new uint64_t[enforce_rules_count+1]; std::vector enf; for (uint64_t i = 0; i < enforce_rules_count; i++) { enforce_rules_offsets[i] = enf.size(); - enf.push_back(Compression::multibyte_read(in)); - for (uint64_t j = Compression::multibyte_read(in); j > 0; j--) { - enf.push_back(Compression::multibyte_read(in)); + enf.push_back(OldBinary::read_int(in)); + for (uint64_t j = OldBinary::read_int(in); j > 0; j--) { + enf.push_back(OldBinary::read_int(in)); } } enforce_rules_offsets[enforce_rules_count] = enf.size(); @@ -266,29 +226,33 @@ TaggerDataExe::read_compressed_hmm_lsw(FILE* in, bool is_hmm) } // prefer_rules - prefer_rules_count = Compression::multibyte_read(in); + prefer_rules_count = OldBinary::read_int(in); prefer_rules = new StringRef[prefer_rules_count]; for (uint64_t i = 0; i < prefer_rules_count; i++) { - prefer_rules[i] = str_write.add(Compression::string_read(in)); + UString temp; + OldBinary::read_ustr(in, temp); + prefer_rules[i] = str_write.add(temp); } // constants - constants_count = Compression::multibyte_read(in); + constants_count = OldBinary::read_int(in); constants = new str_int[constants_count]; for (uint64_t i = 0; i < constants_count; i++) { - constants[i].s = str_write.add(Compression::string_read(in)); - constants[i].i = Compression::multibyte_read(in); + UString temp; + OldBinary::read_ustr(in, temp); + constants[i].s = str_write.add(temp); + constants[i].i = OldBinary::read_int(in); } // output - output_count = Compression::multibyte_read(in); + output_count = OldBinary::read_int(in); // +2 in case we need to append open_class output_offsets = new uint64_t[output_count+2]; std::vector out; for (uint64_t i = 0; i < output_count; i++) { output_offsets[i] = out.size(); - for (uint64_t j = Compression::multibyte_read(in); j > 0; j--) { - out.push_back(Compression::multibyte_read(in)); + for (uint64_t j = OldBinary::read_int(in); j > 0; j--) { + out.push_back(OldBinary::read_int(in)); } } output_offsets[output_count] = out.size(); @@ -320,14 +284,14 @@ TaggerDataExe::read_compressed_hmm_lsw(FILE* in, bool is_hmm) if (is_hmm) { // dimensions - N = Compression::multibyte_read(in); - M = Compression::multibyte_read(in); + N = OldBinary::read_int(in); + M = OldBinary::read_int(in); // matrix a hmm_a = new double[N*N]; for (uint64_t i = 0; i < N; i++) { for (uint64_t j = 0; j < N; j++) { - hmm_a[i*N+j] = read_compressed_double(in); + hmm_a[i*N+j] = OldBinary::read_double(in, true, true); } } @@ -336,23 +300,23 @@ TaggerDataExe::read_compressed_hmm_lsw(FILE* in, bool is_hmm) for (uint64_t i = 0; i < N*M; i++) { hmm_b[i] = 1e-10; } - for (uint64_t count = Compression::multibyte_read(in); count > 0; count--) { - uint64_t i = Compression::multibyte_read(in); - uint64_t j = Compression::multibyte_read(in); - hmm_b[i*M+j] = read_compressed_double(in); + for (uint64_t count = OldBinary::read_int(in); count > 0; count--) { + uint64_t i = OldBinary::read_int(in); + uint64_t j = OldBinary::read_int(in); + hmm_b[i*M+j] = OldBinary::read_double(in, true, true); } } else { // dimensions - N = Compression::multibyte_read(in); + N = OldBinary::read_int(in); // matrix d lsw_d = new double[N*N*N]; memset(lsw_d, 0, N*N*N*sizeof(double)); - for (uint64_t count = Compression::multibyte_read(in); count > 0; count--) { - uint64_t i = Compression::multibyte_read(in); - uint64_t j = Compression::multibyte_read(in); - uint64_t k = Compression::multibyte_read(in); - lsw_d[(i*N*N)+(j*N)+k] = read_compressed_double(in); + for (uint64_t count = OldBinary::read_int(in); count > 0; count--) { + uint64_t i = OldBinary::read_int(in); + uint64_t j = OldBinary::read_int(in); + uint64_t k = OldBinary::read_int(in); + lsw_d[(i*N*N)+(j*N)+k] = OldBinary::read_double(in, true, true); } } @@ -363,26 +327,146 @@ TaggerDataExe::read_compressed_hmm_lsw(FILE* in, bool is_hmm) alpha.read(in, false); fsetpos(in, &pos); temp.read(in); - int len = Compression::multibyte_read(in); + int len = OldBinary::read_int(in); if (len == 1) { - Compression::string_read(in); + UString name; + OldBinary::read_ustr(in, name); trans.read_compressed(in, temp, true); - finals_count = Compression::multibyte_read(in); + finals_count = OldBinary::read_int(in); finals = new int_int[finals_count]; for (uint64_t i = 0; i < finals_count; i++) { - finals[i].i1 = Compression::multibyte_read(in); - finals[i].i2 = Compression::multibyte_read(in); + finals[i].i1 = OldBinary::read_int(in); + finals[i].i2 = OldBinary::read_int(in); } } // discard - discard_count = Compression::multibyte_read(in); + discard_count = OldBinary::read_int(in); if (feof(in)) { discard_count = 0; } discard = new StringRef[discard_count]; for (uint64_t i = 0; i < discard_count; i++) { - discard[i] = str_write.add(Compression::string_read(in)); + UString temp; + OldBinary::read_ustr(in, temp); + discard[i] = str_write.add(temp); + } +} + +void +TaggerDataExe::read_compressed_perceptron(FILE* in) +{ + spec = new Apertium::PerceptronSpec(); + spec->read_compressed(in); + if (OldBinary::read_int(in, false) == 1) { + // open_class + std::vector open_class; + uint64_t val = 0; + for (uint64_t i = OldBinary::read_int(in, false); i > 0; i--) { + val += OldBinary::read_int(in, false); + open_class.push_back(val); + } + + // array_tags + array_tags_count = OldBinary::read_int(in, false); + array_tags = new StringRef[array_tags_count]; + for (uint64_t i = 0; i < array_tags_count; i++) { + array_tags[i] = deserialise_str(in, str_write); + } + + // tag_index + tag_index_count = OldBinary::read_int(in, false); + tag_index = new str_int[tag_index_count]; + for (uint64_t i = 0; i < tag_index_count; i++) { + tag_index[i].s = deserialise_str(in, str_write); + tag_index[i].i = OldBinary::read_int(in, false); + } + + // constants + constants_count = OldBinary::read_int(in, false); + constants = new str_int[constants_count]; + for (uint64_t i = 0; i < constants_count; i++) { + constants[i].s = deserialise_str(in, str_write); + constants[i].i = OldBinary::read_int(in, false); + } + + // output + output_count = OldBinary::read_int(in, false); + // +2 in case we need to append open_class + output_offsets = new uint64_t[output_count+2]; + std::vector out; + for (uint64_t i = 0; i < output_count; i++) { + output_offsets[i] = out.size(); + for (uint64_t j = OldBinary::read_int(in, false); j > 0; j--) { + out.push_back(OldBinary::read_int(in, false)); + } + } + output_offsets[output_count] = out.size(); + open_class_index = output_count; + for (uint64_t i = 0; i < output_count; i++) { + if (output_offsets[i+1] - output_offsets[i] == open_class.size()) { + bool match = true; + for (uint64_t j = 0; j < open_class.size(); j++) { + if (open_class[j] != out[output_offsets[i]+j]) { + match = false; + break; + } + } + if (match) { + open_class_index = i; + break; + } + } + } + if (open_class_index == output_count) { + output_count++; + out.insert(out.end(), open_class.begin(), open_class.end()); + output_offsets[output_count] = out.size(); + } + output = new uint64_t[out.size()]; + for (uint64_t i = 0; i < out.size(); i++) { + output[i] = out[i]; + } + + // pattern list + // TODO: tell Alphabet and Transducer to read serialised + // rather than compressed + Alphabet temp; + fpos_t pos; + fgetpos(in, &pos); + alpha.read(in, false); + fsetpos(in, &pos); + temp.read(in); + int len = OldBinary::read_int(in, false); + if (len == 1) { + UString name; // ignored + OldBinary::read_ustr(in, name, false); + trans.read_compressed(in, temp, true); + finals_count = OldBinary::read_int(in, false); + finals = new int_int[finals_count]; + for (uint64_t i = 0; i < finals_count; i++) { + finals[i].i1 = OldBinary::read_int(in, false); + finals[i].i2 = OldBinary::read_int(in, false); + } + } + } + + // weights + //percep_weights; + uint64_t count = OldBinary::read_int(in, false); + for (uint64_t i = 0; i < count; i++) { + std::vector v; + uint64_t count2 = OldBinary::read_int(in, false); + for (uint64_t j = 0; j < count2; j++) { + std::string s; + uint64_t count3 = OldBinary::read_int(in, false); + for (uint64_t k = 0; k < count3; k++) { + s += static_cast(OldBinary::read_int(in, false)); + } + v.push_back(s); + } + uint64_t w = OldBinary::read_int(in, false); + percep_weights.data[v] = *reinterpret_cast(&w); } } @@ -472,4 +556,5 @@ TaggerDataExe::summarize(str_str_int* ptr, uint64_t count) ret[key].first++; ret[key].second += ptr[i].i; } + return ret; } diff --git a/apertium/tagger_data_exe.h b/apertium/tagger_data_exe.h index 211fb93..526a36d 100644 --- a/apertium/tagger_data_exe.h +++ b/apertium/tagger_data_exe.h @@ -24,6 +24,7 @@ #include #include #include +#include struct str_int { StringRef s; @@ -41,6 +42,10 @@ struct int_int { uint64_t i2; }; +namespace Apertium { +class PerceptronSpec; +} + class TaggerDataExe { private: bool mmapping = false; @@ -123,6 +128,12 @@ public: double* hmm_b = nullptr; // NxM double* lsw_d = nullptr; // NxNxN + /** + * Perceptron + */ + Apertium::PerceptronSpec* spec = nullptr; + Apertium::FeatureVec percep_weights; + /* perceptron map, double> weights int beam_width diff --git a/apertium/tagger_exe.cc b/apertium/tagger_exe.cc index 217ae53..801fb6e 100644 --- a/apertium/tagger_exe.cc +++ b/apertium/tagger_exe.cc @@ -2,6 +2,8 @@ #include #include +#include +#include #include