commit 695c73f46abee1b61a1c689d3ea0a8b836bf4930
Author: Eiji Miyamoto
Date:   Wed Jul 12 06:26:08 2023 +0100

    tokenizer with mecab

diff --git a/buffer_mecab.cpp b/buffer_mecab.cpp
new file mode 100644
index 0000000..42b9672
--- /dev/null
+++ b/buffer_mecab.cpp
@@ -0,0 +1,55 @@
+#include <mecab.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+void process_text(std::istream& sin, std::ostream& sout) {
+    std::string text;
+    std::getline(sin, text);
+
+    MeCab::Tagger* tagger = MeCab::createTagger("-Owakati");
+    std::stringstream buffer;
+    //std::ostringstream buffer;
+    std::string tokenized;
+    bool in_bracket = false;
+    for (char i : text) {
+        if (in_bracket) {
+            // inside [...]: copy the bracketed span through untouched
+            tokenized += i;
+            if (i == ']') {
+                in_bracket = false;
+            }
+        }
+        else if (i == '[') {
+            // segment the text collected so far; the '[' rides along
+            buffer << i;
+            std::string parsed = tagger->parse(buffer.str().c_str());
+            parsed.erase(parsed.find_last_not_of(" \n\r\t") + 1);
+            tokenized += parsed;
+            // buffer = tagger->parse(buffer.str().c_str());
+            // tokenized += buffer;
+            buffer.str("");
+            buffer.clear();
+            in_bracket = true;
+        }
+        else {
+            buffer << i;
+        }
+    }
+
+    // segment whatever is left after the last bracketed span
+    if (!buffer.str().empty()) {
+        std::string parsed = tagger->parse(buffer.str().c_str());
+        parsed.erase(parsed.find_last_not_of(" \n\r\t") + 1);
+        tokenized += parsed;
+    }
+
+    sout << tokenized;
+
+    delete tagger;
+}
+
+int main() {
+    process_text(std::cin, std::cout);
+    return 0;
+}
diff --git a/tokenize.py b/tokenize.py
index de4fe95..f8267b7 100644
--- a/tokenize.py
+++ b/tokenize.py
@@ -1,248 +1,31 @@
-#!/usr/bin/env python3
-
-# to run you'll need to 'sudo apt-get install python3-hfst' or equivalent
-# pip might also work though see https://github.com/hfst/hfst/issues/448
-
-# usage:
-# copy to directory and in modes.xml add as first step:
-#
-#
-#
-#
-# this will insert spaces so that analysis won't get stuck
-
-import hfst
-from itertools import product
-
-def list_options(spans, start, wlen):
-    def ls_op_recurse(i):
-        nonlocal spans, wlen
-        if i in spans:
-            for j in spans[i]:
-                for op in list_options(spans, j, wlen):
-                    yield [(i,j)] + op
-    got_any = False
-    for i in range(start, wlen):
-        if i in spans:
-            yield from ls_op_recurse(i)
-            got_any = True
-    if not got_any:
-        yield []
-
-def weight_options(spans, wlen):
-    for op in list_options(spans, 0, wlen):
-        unk_count = len(op)-1
-        if len(op) == 0:
-            unk_count = 1
-        else:
-            if op[0][0] != 0:
-                unk_count += 1
-            if op[-1][1] != wlen:
-                unk_count += 1
-        n = 0
-        unk_len = 0
-        for i,j in op:
-            if (i - n) > 0:
-                unk_len += (i - n)
-            n = j
-        if n < wlen:
-            unk_len += (wlen - n)
-        yield (op, unk_count, unk_len)
-
-def closure(analyzer, states):
-    if not states:
-        return ([], False)
-    ls = []
-    todo = states
-    any_final = False
-    while todo:
-        s = todo.pop()
-        if s in ls:
-            continue
-        ls.append(s)
-        if analyzer.is_final_state(s):
-            any_final = True
-        tr = analyzer.transitions(s)
-        for t in tr:
-            if t.get_input_symbol() == '@_EPSILON_SYMBOL_@':
-                dest = t.get_target_state()
-                if dest not in ls:
-                    ls.append(dest)
-                    todo.append(dest)
-    return (ls, any_final)
-
-def step(analyzer, states, c):
-    ls = []
-    for s in states:
-        for t in analyzer.transitions(s):
-            if t.get_input_symbol() == c:
-                ls.append(t.get_target_state())
-    return closure(analyzer, ls)
-
-def find_spans(word, analyzer):
-    spans = {}
-    init_state, _ = closure(analyzer, [0])
-    for i in range(len(word)):
-        states = init_state[:]
-        ends = []
-        for j in range(i+1, len(word)+1):
-            states, final = step(analyzer, states, word[j-1])
-            if final:
-                ends.append(j)
-            if len(states) == 0:
-                break
-        if ends:
-            spans[i] = list(reversed(ends))
-            # put the longest option first
-            # so we can approximate LRLM when we randomize
-    return spans
-
-def process_word_tok(word, tokenizer, sout):
-    spans = find_spans(word, tokenizer)
-    if len(spans) == 0:
-        sout.write(word)
-        return
-    mins = []
-    min_count = len(word)
-    min_len = len(word)
-    for op, count, ln in weight_options(spans, len(word)):
-        if ln > min_len:
-            continue
-        elif (ln == min_len) and (count > min_count):
-            continue
-        elif (ln == min_len) and (count == min_count):
-            mins.append(op)
-        else:
-            mins = []
-            mins.append(op)
-            min_count = count
-            min_len = ln
-    sout.write(' ')
-    n = 0
-    for i,j in mins[0]:
-        if n < i:
-            sout.write(word[n:i] + ' ')
-        sout.write(word[i:j] + ' ')
-        n = j
-    if n < len(word):
-        sout.write(word[n:] + ' ')
-
-def lattice(spans):
-    ret = [] # [ chunk, chunk, chunk ]
-    # chunk => (total_span, [ option, option, option ])
-    # option => [ span, span, span ]
-    last_start = max(spans.keys())
-    start = 0
-    end = 0
-    while start <= last_start:
-        unk = start
-        while start not in spans or len(spans[start]) == 0:
-            start += 1
-        if start > unk:
-            ret.append(((unk, start), [[(unk, start)]]))
-        options = [] # [ (option, total_span) ]
-        for e in spans[start]:
-            options.append(([(start, e)], e))
-            end = max(end, e)
-        updated = True
-        while updated:
-            updated = False
-            options2 = []
-            for path, e in options:
-                if e == end:
-                    options2.append((path, e))
-                    continue
-                for i in range(e, end):
-                    if i in spans and len(spans[i]) > 0:
-                        if i > e:
-                            path.append((e,i))
-                        for n in spans[i]:
-                            options2.append((path[:] + [(i,n)], n))
-                            update = True
-                            end = max(n, end)
-                        break
-                else:
-                    options2.append((path + [(e, end)], end))
-            options, options2 = options2, []
-        ret.append(((start, end), [x[0] for x in options]))
-        start = end
-    return ret
-
-def string_op(op):
-    ret = ''
-    last_unk = False
-    for s in op:
-        if s[0] == '*':
-            if last_unk:
-                ret += s[1:]
-            elif ret:
-                ret += '+' + s
-            else:
-                ret += s
-            last_unk = True
-        else:
-            if ret:
-                ret += '+'
-            ret += s
-    return ret
-
-def process_word_morf(word, tokenizer, sout, analyzer):
-    spans = find_spans(word, tokenizer)
-    if len(spans) == 0:
-        sout.write('^' + word + '/*' + word + '$')
-        return
-    lat = lattice(spans)
-    for sp, ops in lat:
-        analyses = []
-        for op in ops:
-            an = []
-            for s in op:
-                w = word[s[0]:s[1]]
-                d = analyzer.lookup(w)
-                if len(d) == 0:
-                    an.append(['*' + w])
-                else:
-                    ls = []
-                    for k in d:
-                        for a, w in d[k]:
-                            ls.append(a.replace('@_EPSILON_SYMBOL_@', ''))
-                    an.append(ls)
-            analyses += list(product(*an))
-        sout.write('^' + word[sp[0]:sp[1]])
-        for op in analyses:
-            sout.write('/' + string_op(op))
-        sout.write('$')
-
-def process_stream(tokenizer, sin, sout, analyzer=None):
-    alpha = tokenizer.get_alphabet()
-    cur_word = ''
-    while True:
-        c = sin.read(1)
-        if c in alpha:
-            cur_word += c
-        else:
-            if cur_word:
-                if analyzer:
-                    process_word_morf(cur_word, tokenizer, sout, analyzer)
-                else:
-                    process_word_tok(cur_word, tokenizer, sout)
-            if c:
-                cur_word = ''
-                sout.write(c)
-            else:
-                break
+import MeCab
+import sys
+
+def process_text(sin, sout):
+    text = sin.read()
+    mecab = MeCab.Tagger("-Owakati")
+    buffer = ""
+    tokenized = ""
+    in_bracket = False
+
+    for i in text:
+        if in_bracket:
+            # inside [...]: copy the bracketed span through untouched
+            tokenized += i
+            if i == "]":
+                in_bracket = False
+        elif i == "[":
+            # segment the text collected so far; the "[" rides along
+            buffer += i
+            tokenized += mecab.parse(buffer.strip()).rstrip()
+            buffer = ""
+            in_bracket = True
+        else:
+            buffer += i
+    # segment whatever is left after the last bracketed span
+    if buffer:
+        tokenized += mecab.parse(buffer.strip()).rstrip()
+    sout.write(tokenized)
 
 if __name__ == '__main__':
-    import argparse
-    prs = argparse.ArgumentParser(description='Segment input stream using HFST')
-    prs.add_argument('transducer')
-    prs.add_argument('analyzer', nargs='?')
-    args = prs.parse_args()
-    stream_tok = hfst.HfstInputStream(args.transducer)
-    tokenizer = hfst.HfstBasicTransducer(stream_tok.read())
-    analyzer = None
-    if args.analyzer:
-        stream_morf = hfst.HfstInputStream(args.analyzer)
-        analyzer = hfst.HfstBasicTransducer(stream_morf.read())
-    import sys
-    process_stream(tokenizer, sys.stdin, sys.stdout, analyzer)
+    process_text(sys.stdin, sys.stdout)
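
Note (not part of the commit): a minimal smoke test for the new tokenize.py, assuming the mecab-python3 bindings and a MeCab dictionary are installed. Because the file name shadows the standard-library tokenize module, the sketch loads it by path; the module label "mecab_tokenize" and the sample sentence are only illustrative.

    # smoke test sketch, not part of the patch above
    import io
    import importlib.util

    # load tokenize.py by path so it does not collide with the stdlib tokenize module
    spec = importlib.util.spec_from_file_location("mecab_tokenize", "tokenize.py")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    # text outside [...] should come back wakati-segmented by MeCab;
    # the bracketed span should pass through untouched
    sin = io.StringIO("これはテストです[そのまま]続きの文章です")
    sout = io.StringIO()
    mod.process_text(sin, sout)
    print(sout.getvalue())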