commit 01e5bc85cee187285fb500291656d219b1e97708
Author: Daniel Swanson
Date:   Fri Aug 13 15:12:37 2021 -0500

    unigram 3 and lsw

diff --git a/apertium/perceptron_tagger.cc b/apertium/perceptron_tagger.cc
index 01f800a..8059640 100644
--- a/apertium/perceptron_tagger.cc
+++ b/apertium/perceptron_tagger.cc
@@ -39,9 +39,6 @@ PerceptronTagger::tagSentence(const Sentence &untagged_sent) {
   agenda.back().tagged.reserve(sent_len);
 
   UnaryFeatureVec feat_vec_delta;
-  std::vector::const_iterator analys_it;
-  std::vector::const_iterator agenda_it;
-  std::vector::const_iterator wordoid_it;
 
   for (size_t token_idx = 0; token_idx < sent_len; token_idx++) {
     const std::vector &analyses =
@@ -58,22 +55,20 @@ PerceptronTagger::tagSentence(const Sentence &untagged_sent) {
       continue;
     }
 
-    for (agenda_it = agenda.begin(); agenda_it != agenda.end(); agenda_it++) {
-      for (analys_it = analyses.begin(); analys_it != analyses.end(); analys_it++) {
-        const std::vector &wordoids = analys_it->TheMorphemes;
+    for (auto& agenda_it : agenda) {
+      for (auto& analys_it : analyses) {
 
-        new_agenda.push_back(*agenda_it);
+        new_agenda.push_back(agenda_it);
         AgendaItem &new_agenda_item = new_agenda.back();
-        new_agenda_item.tagged.push_back(*analys_it);
+        new_agenda_item.tagged.push_back(analys_it);
 
-        for (wordoid_it = wordoids.begin(); wordoid_it != wordoids.end(); wordoid_it++) {
-          int wordoid_idx = wordoid_it - wordoids.begin();
+        for (size_t w_idx = 0; w_idx < analys_it.TheMorphemes.size(); w_idx++) {
           feat_vec_delta.clear();
           spec.get_features(new_agenda_item.tagged, untagged_sent,
-                            token_idx, wordoid_idx, feat_vec_delta);
+                            token_idx, w_idx, feat_vec_delta);
           if (TheFlags.getDebug()) {
             FeatureVec fv(feat_vec_delta);
-            std::cerr << "Token " << token_idx << "\t\tWordoid " << wordoid_idx << "\n";
+            std::cerr << "Token " << token_idx << "\t\tWordoid " << w_idx << "\n";
             std::cerr << fv;
             std::cerr << "Score: " << weights * feat_vec_delta << "\n";
           }
diff --git a/apertium/tagger_data_exe.cc b/apertium/tagger_data_exe.cc
index 09ff2fc..aa63f60 100644
--- a/apertium/tagger_data_exe.cc
+++ b/apertium/tagger_data_exe.cc
@@ -462,3 +462,18 @@ TaggerDataExe::search(int_int* ptr, uint64_t count, uint64_t key, uint64_t& val)
   }
   return false;
 }
+
+// Group the `count` entries of `ptr` by the string stored at `s1`.
+// For each key, `.first` is the number of entries seen with that key and
+// `.second` is the sum of their `i` fields.
+std::map>
+TaggerDataExe::summarize(str_str_int* ptr, uint64_t count)
+{
+  std::map> ret;
+  for (uint64_t i = 0; i < count; i++) {
+    UString_view key = str_write.get(ptr[i].s1);
+    ret[key].first++;
+    ret[key].second += ptr[i].i;
+  }
+  return ret;
+}
diff --git a/apertium/tagger_data_exe.h b/apertium/tagger_data_exe.h
index 8506736..211fb93 100644
--- a/apertium/tagger_data_exe.h
+++ b/apertium/tagger_data_exe.h
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include
 
 struct str_int {
@@ -157,6 +158,10 @@ public:
   bool search(str_str_int* ptr, uint64_t count,
               UString_view k1, UString_view k2, uint64_t& val);
   bool search(int_int* ptr, uint64_t count, uint64_t key, uint64_t& val);
+
+  std::map> summarize(str_str_int* ptr,
+                      uint64_t count);
 };
 
 #endif
diff --git a/apertium/tagger_exe.cc b/apertium/tagger_exe.cc
index 035cdc4..217ae53 100644
--- a/apertium/tagger_exe.cc
+++ b/apertium/tagger_exe.cc
@@ -188,16 +188,6 @@ TaggerExe::score_unigram1(UString_view lu)
   return s;
 }
 
-void
-TaggerExe::build_uni2_counts()
-{
-  for (uint64_t i = 0; i < tde.uni2_count; i++) {
-    UString_view key = tde.str_write.get(tde.uni2[i].s1);
-    uni2_counts[key].first++;
-    uni2_counts[key].second += tde.uni2[i].i;
-  }
-}
-
 long double
 TaggerExe::score_unigram2(UString_view lu)
 {
@@ -222,11 +212,85 @@ TaggerExe::score_unigram2(UString_view lu)
   return (tokenCount_r_a * tokenCount_a) / (tokenCount_a + typeCount_a);
 }
 
+// Score the lexical unit `lu` under unigram model 3.
+// `lu` is split on '+' into morphemes and each morpheme is split at its
+// first '<' into a lemma and a tag string. The first morpheme is scored
+// from uni3_l_t; every later morpheme is scored from uni3_cl_ct (keyed by
+// the previous tag string) and uni3_ct_cl (keyed by its own lemma).
+// Every count starts at 1, so unseen keys still give a non-zero score.
+long double
+TaggerExe::score_unigram3(UString_view lu)
+{
+  long double tokenCount_r_i = 1;
+  long double tokenCount_i = 1;
+  long double typeCount_i = 1;
+
+  vector morphemes = StringUtils::split_escape(lu, '+');
+
+  auto loc = morphemes[0].find_first_of('<');
+  UString_view lemma = morphemes[0].substr(0, loc);
+  UString_view tags = morphemes[0].substr(loc);
+  uint64_t n;
+  if (uni3_l_t.find(tags) != uni3_l_t.end()) {
+    if (tde.search(tde.uni3_l_t, tde.uni3_l_t_count, tags, lemma, n)) {
+      tokenCount_r_i += n;
+    } else {
+      typeCount_i += 1;
+    }
+    typeCount_i += uni3_l_t[tags].first;
+    tokenCount_i += uni3_l_t[tags].second;
+  }
+  long double num = tokenCount_r_i * tokenCount_i;
+  long double denom = tokenCount_i + typeCount_i;
+  UString_view l;
+  UString_view t_cur = tags;
+  UString_view t_prev;
+  for (uint64_t i = 1; i < morphemes.size(); i++) {
+    t_prev = t_cur;
+    loc = morphemes[i].find_first_of('<');
+    l = morphemes[i].substr(0, loc);
+    t_cur = morphemes[i].substr(loc);
+
+    long double tokenCount_d_i = 1;
+    long double tokenCount_i_d = 1;
+    long double tokenCount_i = 1;
+    long double typeCount_i = 1;
+    long double tokenCount_d = 1;
+    long double typeCount_d = 1;
+
+    if (uni3_cl_ct.find(t_prev) != uni3_cl_ct.end()) {
+      if (tde.search(tde.uni3_cl_ct, tde.uni3_cl_ct_count, t_prev, l, n)) {
+        tokenCount_d_i += n;
+      } else {
+        typeCount_i += 1;
+      }
+      tokenCount_i += uni3_cl_ct[t_prev].second;
+      typeCount_i += uni3_cl_ct[t_prev].first;
+    }
+    if (uni3_ct_cl.find(l) != uni3_ct_cl.end()) {
+      if (tde.search(tde.uni3_ct_cl, tde.uni3_ct_cl_count, l, t_cur, n)) {
+        tokenCount_i_d += n;
+      } else {
+        typeCount_d += 1;
+      }
+      tokenCount_d += uni3_ct_cl[l].second;
+      typeCount_d += uni3_ct_cl[l].first;
+    }
+    num *= (tokenCount_d_i * tokenCount_i_d);
+    denom *= ((tokenCount_i + typeCount_i) * (tokenCount_d + typeCount_d));
+  }
+  return num / denom;
+}
+
 void
 TaggerExe::tag_unigram(InputFile& input, UFILE* output, int model)
 {
   if (model == 2) {
-    build_uni2_counts();
+    uni2_counts = tde.summarize(tde.uni2, tde.uni2_count);
+  } else if (model == 3) {
+    uni3_l_t = tde.summarize(tde.uni3_l_t, tde.uni3_l_t_count);
+    uni3_cl_ct = tde.summarize(tde.uni3_cl_ct, tde.uni3_cl_ct_count);
+    uni3_ct_cl = tde.summarize(tde.uni3_ct_cl, tde.uni3_ct_cl_count);
   }
   while (!input.eof()) {
     write(input.readBlank(true), output);
@@ -262,6 +326,8 @@ TaggerExe::tag_unigram(InputFile& input, UFILE* output, int model)
           s = score_unigram1(pieces[i]); break;
         case 2:
           s = score_unigram2(pieces[i]); break;
+        case 3:
+          s = score_unigram3(pieces[i]); break;
         default:
           break;
         }
@@ -368,6 +434,76 @@ TaggerExe::tag_hmm(InputFile& input, UFILE* output)
   }
 }
 
+// Tag the stream in `input` using a window of three consecutive words
+// (left, mid, right) and write the selected form of each `mid` to `output`.
+// For every tag m of `mid`, tde.getD(l, m, r) is summed over all tags of
+// `left` and `right`; the tag with the largest sum is chosen.
+void
+TaggerExe::tag_lsw(InputFile& input, UFILE* output)
+{
+  build_match_finals();
+  build_prefer_rules();
+  vector arr_tg;
+  for (uint64_t i = 0; i < tde.array_tags_count; i++) {
+    arr_tg.push_back(UString{tde.str_write.get(tde.array_tags[i])});
+  }
+  TaggerWord::setArrayTags(arr_tg);
+
+  uint64_t eos = 0;  // stays 0 if the lookup below fails
+  tde.search(tde.tag_index, tde.tag_index_count, "TAG_SENT"_u, eos);
+  uint64_t tag_eof = 0;  // stays 0 if the lookup below fails
+  tde.search(tde.tag_index, tde.tag_index_count, "TAG_kEOF"_u, tag_eof);
+
+  TaggerWord* left = nullptr;
+  TaggerWord* mid = nullptr;
+  TaggerWord* right = nullptr;
+
+  left = new TaggerWord();
+  left->add_tag(eos, "sent"_u, prefer_rules);
+
+  mid = read_tagger_word(input);
+
+  if (input.eof()) {
+    delete left;
+    delete mid;
+    return;
+  }
+
+  right = read_tagger_word(input);
+
+  while (right) {
+    double max = -1;
+    uint64_t tag_max = *(mid->get_tags().begin());
+    for (auto& m : mid->get_tags()) {
+      double n = 0;
+      for (auto& l : left->get_tags()) {
+        for (auto& r : right->get_tags()) {
+          n += tde.getD(l, m, r);
+        }
+      }
+      if (n > max) {
+        max = n;
+        tag_max = m;
+      }
+    }
+
+    write(mid->get_lexical_form(tag_max, tag_eof), output);
+    if (input.eof()) {
+      if (null_flush) {
+        u_fputc('\0', output);
+      }
+      u_fflush(output);
+    }
+
+    delete left;
+    left = mid;
+    mid = right;
+    right = read_tagger_word(input);
+  }
+  delete left;
+  delete mid;
+}
+
 void
 TaggerExe::load(FILE* in)
 {
diff --git a/apertium/tagger_exe.h b/apertium/tagger_exe.h
index 28e315d..7ae9cd9 100644
--- a/apertium/tagger_exe.h
+++ b/apertium/tagger_exe.h
@@ -27,6 +27,8 @@
 #include
 #include
 
+typedef std::map> uni_summary;
+
 class TaggerExe {
 private:
   bool null_flush = true;
@@ -40,16 +42,20 @@ private:
   std::vector prefer_rules;
   void build_prefer_rules();
 
-  std::map> uni2_counts;
-  void build_uni2_counts();
+  uni_summary uni2_counts;
+  uni_summary uni3_l_t;
+  uni_summary uni3_cl_ct;
+  uni_summary uni3_ct_cl;
   long double score_unigram1(UString_view lu);
   long double score_unigram2(UString_view lu);
+  long double score_unigram3(UString_view lu);
 public:
   TaggerDataExe tde;
   Apertium::StreamedType read_streamed_type(InputFile& input);
   TaggerWord* read_tagger_word(InputFile& input);
   void tag_unigram(InputFile& input, UFILE* output, int model);
   void tag_hmm(InputFile& input, UFILE* output);
+  void tag_lsw(InputFile& input, UFILE* output);
   void load(FILE* in);
 };
 