commit 690a322cb6cd9ede626932809a758a7eac152719
Author: Daniel Swanson <popcorn.tomato.dude@gmail.com>
Date:   Thu Aug 19 11:18:47 2021 -0400

    text extractor for unigram

diff --git a/apertium/Makefile.am b/apertium/Makefile.am
index 3b27891..c785d39 100644
--- a/apertium/Makefile.am
+++ b/apertium/Makefile.am
@@ -198,6 +198,7 @@ bin_PROGRAMS = apertium-cleanstream \
 	       apertium-tagger-apply-new-rules \
 		   apertium-tagger-new \
 	       apertium-tagger-readwords \
+		   apertium-tagger2txt \
 	       apertium-perceptron-trace \
 	       apertium-tmxbuild \
 	       apertium-transfer \
@@ -264,6 +265,9 @@ apertium_tagger_LDADD = -lapertium$(VERSION_MAJOR) $(lib_LTLIBRARIES)
 apertium_tagger_new_SOURCES = apertium_tagger_new.cc
 apertium_tagger_new_LDADD = -lapertium$(VERSION_MAJOR) $(lib_LTLIBRARIES)
 
+apertium_tagger2txt_SOURCES = tagger2txt.cc
+apertium_tagger2txt_LDADD = -lapertium$(VERSION_MAJOR) $(lib_LTLIBRARIES)
+
 apertium_perceptron_trace_SOURCES = apertium_perceptron_trace.cc
 apertium_perceptron_trace_LDADD = -lapertium$(VERSION_MAJOR) $(lib_LTLIBRARIES)
 
diff --git a/apertium/tagger2txt.cc b/apertium/tagger2txt.cc
new file mode 100644
index 0000000..07b1692
--- /dev/null
+++ b/apertium/tagger2txt.cc
@@ -0,0 +1,127 @@
+#include <lttoolbox/lt_locale.h>
+#include <apertium/tagger_data_exe.h>
+#include <lttoolbox/ustring.h>
+#include <iostream>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+using namespace std;
+
+/**
+ * Print an array of (string, integer) records to stdout as a named section.
+ *
+ * Output is `name {`, one `string<TAB>integer` row per record, then `}`.
+ * Nothing is printed when `ct` is 0.
+ *
+ * @param sw   string table; each record's `s` field is resolved with sw.get()
+ * @param arr  records to print; elements 0 .. ct-1 are read
+ * @param ct   number of records in `arr`
+ * @param name section label printed before the opening brace
+ */
+void out(StringWriter& sw, str_int* arr, uint64_t ct, const char* name)
+{
+  if (!ct) {
+    return;
+  }
+  cout << name << " {\n";
+  for (uint64_t i = 0; i < ct; i++) {
+    // '\n' instead of endl: a flush after every row is not needed here.
+    cout << sw.get(arr[i].s) << "\t" << arr[i].i << '\n';
+  }
+  cout << "}\n";
+}
+
+/**
+ * Print an array of (string, string, integer) records to stdout as a named
+ * section.
+ *
+ * Output is `name {`, one `string<TAB>string<TAB>integer` row per record,
+ * then `}`. Nothing is printed when `ct` is 0.
+ *
+ * @param sw   string table; the `s1` and `s2` fields are resolved with sw.get()
+ * @param arr  records to print; elements 0 .. ct-1 are read
+ * @param ct   number of records in `arr`
+ * @param name section label printed before the opening brace
+ */
+void out(StringWriter& sw, str_str_int* arr, uint64_t ct, const char* name)
+{
+  if (!ct) {
+    return;
+  }
+  cout << name << " {\n";
+  for (uint64_t i = 0; i < ct; i++) {
+    cout << sw.get(arr[i].s1) << "\t" << sw.get(arr[i].s2) << "\t";
+    cout << arr[i].i << '\n';
+  }
+  cout << "}\n";
+}
+
+/**
+ * Dump the unigram tables of a compiled tagger probability file as text.
+ *
+ * Usage: apertium-tagger2txt TYPE prob_file
+ *
+ * Only the first character of TYPE is examined. `h` and `l` load the file
+ * with read_compressed_hmm_lsw(), `1`, `2` and `3` with the matching
+ * read_compressed_unigramN(). A `Type:` line is printed, followed by every
+ * unigram table whose count is non-zero.
+ *
+ * @return 0 on success; exits with EXIT_FAILURE on a wrong argument count,
+ *         an unreadable file or an unrecognized TYPE
+ */
+int main(int argc, char* argv[])
+{
+  LtLocale::tryToSetLocale();
+
+  if (argc != 3) {
+    cerr << "USAGE: apertium-tagger2txt [ h | l | 1 | 2 | 3 | x ] prob_file\n";
+    exit(EXIT_FAILURE);
+  }
+
+  TaggerDataExe tde;
+  FILE* fin = fopen(argv[2], "rb");
+  if (!fin) {
+    cerr << "Unable to open '" << argv[2] << "' for reading.\n";
+    exit(EXIT_FAILURE);
+  }
+  switch (argv[1][0]) {
+  case 'h':
+    tde.read_compressed_hmm_lsw(fin, true);
+    cout << "Type: HMM\n";
+    break;
+  case 'l':
+    tde.read_compressed_hmm_lsw(fin, false);
+    cout << "Type: LSW\n";
+    break;
+  case '1':
+    tde.read_compressed_unigram1(fin);
+    cout << "Type: Unigram1\n";
+    break;
+  case '2':
+    tde.read_compressed_unigram2(fin);
+    cout << "Type: Unigram2\n";
+    break;
+  case '3':
+    tde.read_compressed_unigram3(fin);
+    cout << "Type: Unigram3\n";
+    break;
+  case 'x':
+    // 'x' is listed in the usage text but has no reader in this tool, so it
+    // is rejected exactly like an unknown type.
+  default:
+    cerr << "Unrecognized prob type '" << argv[1] << "'\n";
+    fclose(fin);
+    exit(EXIT_FAILURE);
+  }
+
+  out(tde.str_write, tde.uni1, tde.uni1_count, "Data");
+  out(tde.str_write, tde.uni2, tde.uni2_count, "Data");
+  out(tde.str_write, tde.uni3_l_t, tde.uni3_l_t_count, "Data_l_t");
+  out(tde.str_write, tde.uni3_cl_ct, tde.uni3_cl_ct_count, "Data_cl_ct");
+  out(tde.str_write, tde.uni3_ct_cl, tde.uni3_ct_cl_count, "Data_ct_cl");
+
+  // Close the input only after all output has been written.
+  fclose(fin);
+  return 0;
+}