commit 690a322cb6cd9ede626932809a758a7eac152719
Author: Daniel Swanson
Date:   Thu Aug 19 11:18:47 2021 -0400

    text extractor for unigram

diff --git a/apertium/Makefile.am b/apertium/Makefile.am
index 3b27891..c785d39 100644
--- a/apertium/Makefile.am
+++ b/apertium/Makefile.am
@@ -198,6 +198,7 @@ bin_PROGRAMS = apertium-cleanstream \
 	apertium-tagger-apply-new-rules \
 	apertium-tagger-new \
 	apertium-tagger-readwords \
+	apertium-tagger2txt \
 	apertium-perceptron-trace \
 	apertium-tmxbuild \
 	apertium-transfer \
@@ -264,6 +265,9 @@ apertium_tagger_LDADD = -lapertium$(VERSION_MAJOR) $(lib_LTLIBRARIES)
 apertium_tagger_new_SOURCES = apertium_tagger_new.cc
 apertium_tagger_new_LDADD = -lapertium$(VERSION_MAJOR) $(lib_LTLIBRARIES)
 
+apertium_tagger2txt_SOURCES = tagger2txt.cc
+apertium_tagger2txt_LDADD = -lapertium$(VERSION_MAJOR) $(lib_LTLIBRARIES)
+
 apertium_perceptron_trace_SOURCES = apertium_perceptron_trace.cc
 apertium_perceptron_trace_LDADD = -lapertium$(VERSION_MAJOR) $(lib_LTLIBRARIES)
 
diff --git a/apertium/tagger2txt.cc b/apertium/tagger2txt.cc
new file mode 100644
index 0000000..07b1692
--- /dev/null
+++ b/apertium/tagger2txt.cc
@@ -0,0 +1,105 @@
+// NOTE(review): the header names on the #include lines were lost when this
+// patch was extracted. The headers below are reconstructed from the symbols
+// this file uses -- confirm them against the original commit.
+#include <apertium/tagger_data_exe.h>
+#include <lttoolbox/lt_locale.h>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+using namespace std;
+
+// Print ct (string, value) records as a block of the form
+//   name {
+//   <string>\t<value>
+//   }
+// Strings are looked up with sw.get(). Prints nothing when ct is 0.
+void out(StringWriter& sw, str_int* arr, uint64_t ct, const char* name)
+{
+  if (!ct) {
+    return;
+  }
+  cout << name << " {\n";
+  for (uint64_t i = 0; i < ct; i++) {
+    // '\n' instead of endl: avoid flushing stdout after every record.
+    cout << sw.get(arr[i].s) << "\t" << arr[i].i << "\n";
+  }
+  cout << "}\n";
+}
+
+// Overload for (string, string, value) records; each line is
+// <string1>\t<string2>\t<value>. Prints nothing when ct is 0.
+void out(StringWriter& sw, str_str_int* arr, uint64_t ct, const char* name)
+{
+  if (!ct) {
+    return;
+  }
+  cout << name << " {\n";
+  for (uint64_t i = 0; i < ct; i++) {
+    cout << sw.get(arr[i].s1) << "\t" << sw.get(arr[i].s2) << "\t";
+    cout << arr[i].i << "\n";
+  }
+  cout << "}\n";
+}
+
+// Read a tagger probability file (prob_file) and print its type plus any
+// non-empty unigram tables as text on stdout. argv[1] selects the reader by
+// its first character and argv[2] is the file to read. Exits with
+// EXIT_FAILURE on bad usage, an unreadable file, or an unhandled type.
+int main(int argc, char* argv[])
+{
+  LtLocale::tryToSetLocale();
+
+  if (argc != 3) {
+    cerr << "USAGE: apertium-tagger2txt [ h | l | 1 | 2 | 3 | x ] prob_file\n";
+    exit(EXIT_FAILURE);
+  }
+
+  TaggerDataExe tde;
+  FILE* fin = fopen(argv[2], "rb");
+  if (!fin) {
+    cerr << "Unable to open '" << argv[2] << "' for reading.\n";
+    exit(EXIT_FAILURE);
+  }
+  switch (argv[1][0]) {
+    case 'h':
+      tde.read_compressed_hmm_lsw(fin, true);
+      cout << "Type: HMM\n";
+      break;
+    case 'l':
+      tde.read_compressed_hmm_lsw(fin, false);
+      cout << "Type: LSW\n";
+      break;
+    case '1':
+      tde.read_compressed_unigram1(fin);
+      cout << "Type: Unigram1\n";
+      break;
+    case '2':
+      tde.read_compressed_unigram2(fin);
+      cout << "Type: Unigram2\n";
+      break;
+    case '3':
+      tde.read_compressed_unigram3(fin);
+      cout << "Type: Unigram3\n";
+      break;
+    case 'x':
+      // NOTE(review): 'x' is listed in the usage string but no reader is
+      // called for it; it is rejected like any unknown type -- confirm intent.
+    default:
+      cerr << "Unrecognized prob type '" << argv[1] << "'\n";
+      fclose(fin);
+      exit(EXIT_FAILURE);
+  }
+
+  // Tables whose count is 0 are skipped by out().
+  out(tde.str_write, tde.uni1, tde.uni1_count, "Data");
+  out(tde.str_write, tde.uni2, tde.uni2_count, "Data");
+  out(tde.str_write, tde.uni3_l_t, tde.uni3_l_t_count, "Data_l_t");
+  out(tde.str_write, tde.uni3_cl_ct, tde.uni3_cl_ct_count, "Data_cl_ct");
+  out(tde.str_write, tde.uni3_ct_cl, tde.uni3_ct_cl_count, "Data_ct_cl");
+
+  // Release the input handle once the dump is complete.
+  fclose(fin);
+  return 0;
+}