commit 3d83726e750229a03d686fb66637451d398099dd
Author: Eiji Miyamoto
Date:   Fri Mar 10 15:05:18 2023 +0000

    integration from iii

diff --git a/Makefile.am b/Makefile.am
index 44618f3..9fe04af 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,6 +5,8 @@ RELEASE=0.1
 VERSION=0.1.0
 
 LANG1=jpn
+SCRIPT2=Yi
+LANG1SCRIPT2=$(LANG1)@$(SCRIPT2)
 BASENAME=apertium-$(LANG1)
 
 TARGETS_COMMON = \
@@ -48,6 +50,9 @@ TARGETS_COMMON = \
 $(LANG1).autogen.hfst: .deps/$(LANG1).RL.hfst
 	hfst-fst2fst -O $< -o $@
 
+$(LANG1SCRIPT2).autotok.hfst: .deps/$(LANG1SCRIPT2).automorf.hfst
+	hfst-fst2fst -F $< | hfst-project -p input | hfst-minimise -o $@
+
 $(LANG1).automorf.hfst: .deps/$(LANG1).LR.hfst
 	hfst-invert $< | hfst-fst2fst -O -o $@
diff --git a/modes.xml b/modes.xml
index 376d05d..5bcbfe3 100644
--- a/modes.xml
+++ b/modes.xml
@@ -13,6 +13,10 @@
+
+
+
+
@@ -35,8 +39,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
@@ -48,6 +64,10 @@
+
+
+
+
diff --git a/tokenize.py b/tokenize.py
new file mode 100644
index 0000000..de4fe95
--- /dev/null
+++ b/tokenize.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+
+# to run you'll need to 'sudo apt-get install python3-hfst' or equivalent
+# pip might also work, though see https://github.com/hfst/hfst/issues/448
+
+# usage:
+# copy to this directory and in modes.xml add as the first step:
+#
+#
+#
+#
+# this will insert spaces so that analysis won't get stuck
+
+import hfst
+from itertools import product
+
+def list_options(spans, start, wlen):
+    # Enumerate every way of covering word[start:wlen] with the known spans,
+    # yielding each option as a list of (begin, end) pairs.
+    def ls_op_recurse(i):
+        nonlocal spans, wlen
+        if i in spans:
+            for j in spans[i]:
+                for op in list_options(spans, j, wlen):
+                    yield [(i, j)] + op
+    got_any = False
+    for i in range(start, wlen):
+        if i in spans:
+            yield from ls_op_recurse(i)
+            got_any = True
+    if not got_any:
+        yield []
+
+def weight_options(spans, wlen):
+    # For each option, count the unknown stretches and their total length.
+    for op in list_options(spans, 0, wlen):
+        unk_count = len(op) - 1
+        if len(op) == 0:
+            unk_count = 1
+        else:
+            if op[0][0] != 0:
+                unk_count += 1
+            if op[-1][1] != wlen:
+                unk_count += 1
+        n = 0
+        unk_len = 0
+        for i, j in op:
+            if (i - n) > 0:
+                unk_len += (i - n)
+            n = j
+        if n < wlen:
+            unk_len += (wlen - n)
+        yield (op, unk_count, unk_len)
+
+def closure(analyzer, states):
+    # Epsilon-closure of a set of states; returns (states, any_final).
+    if not states:
+        return ([], False)
+    ls = []
+    todo = states
+    any_final = False
+    while todo:
+        s = todo.pop()
+        if s in ls:
+            continue
+        ls.append(s)
+        if analyzer.is_final_state(s):
+            any_final = True
+        tr = analyzer.transitions(s)
+        for t in tr:
+            if t.get_input_symbol() == '@_EPSILON_SYMBOL_@':
+                dest = t.get_target_state()
+                if dest not in ls:
+                    # queue for later expansion so dest's own transitions
+                    # and finality get examined too
+                    todo.append(dest)
+    return (ls, any_final)
+
+def step(analyzer, states, c):
+    # Advance every state on input symbol c, then take the epsilon-closure.
+    ls = []
+    for s in states:
+        for t in analyzer.transitions(s):
+            if t.get_input_symbol() == c:
+                ls.append(t.get_target_state())
+    return closure(analyzer, ls)
+
+def find_spans(word, analyzer):
+    # Map each start index to the end indices of substrings the tokenizer accepts.
+    spans = {}
+    init_state, _ = closure(analyzer, [0])
+    for i in range(len(word)):
+        states = init_state[:]
+        ends = []
+        for j in range(i+1, len(word)+1):
+            states, final = step(analyzer, states, word[j-1])
+            if final:
+                ends.append(j)
+            if len(states) == 0:
+                break
+        if ends:
+            spans[i] = list(reversed(ends))
+            # put the longest option first
+            # so we can approximate LRLM when we randomize
+    return spans
+
+def process_word_tok(word, tokenizer, sout):
+    # Pick the segmentation with the least unknown material and write it
+    # out with spaces between the pieces.
+    spans = find_spans(word, tokenizer)
+    if len(spans) == 0:
+        sout.write(word)
+        return
+    mins = []
+    min_count = len(word)
+    min_len = len(word)
+    for op, count, ln in weight_options(spans, len(word)):
+        if ln > min_len:
+            continue
+        elif (ln == min_len) and (count > min_count):
+            continue
+        elif (ln == min_len) and (count == min_count):
+            mins.append(op)
+        else:
+            mins = []
+            mins.append(op)
+            min_count = count
+            min_len = ln
+    sout.write(' ')
+    n = 0
+    for i, j in mins[0]:
+        if n < i:
+            sout.write(word[n:i] + ' ')
+        sout.write(word[i:j] + ' ')
+        n = j
+    if n < len(word):
+        sout.write(word[n:] + ' ')
+
+def lattice(spans):
+    # Group the spans into chunks of overlapping segmentation options.
+    ret = [] # [ chunk, chunk, chunk ]
+    # chunk => (total_span, [ option, option, option ])
+    # option => [ span, span, span ]
+    last_start = max(spans.keys())
+    start = 0
+    end = 0
+    while start <= last_start:
+        unk = start
+        while start not in spans or len(spans[start]) == 0:
+            start += 1
+        if start > unk:
+            ret.append(((unk, start), [[(unk, start)]]))
+        options = [] # [ (option, total_span) ]
+        for e in spans[start]:
+            options.append(([(start, e)], e))
+            end = max(end, e)
+        updated = True
+        while updated:
+            updated = False
+            options2 = []
+            for path, e in options:
+                if e == end:
+                    options2.append((path, e))
+                    continue
+                for i in range(e, end):
+                    if i in spans and len(spans[i]) > 0:
+                        if i > e:
+                            path.append((e, i))
+                        for n in spans[i]:
+                            options2.append((path[:] + [(i, n)], n))
+                        updated = True
+                        end = max(n, end)
+                        break
+                else:
+                    options2.append((path + [(e, end)], end))
+            options, options2 = options2, []
+        ret.append(((start, end), [x[0] for x in options]))
+        start = end
+    return ret
+
+def string_op(op):
+    # Join the analyses of one option with '+', merging consecutive
+    # unknown pieces into a single '*...' chunk.
+    ret = ''
+    last_unk = False
+    for s in op:
+        if s[0] == '*':
+            if last_unk:
+                ret += s[1:]
+            elif ret:
+                ret += '+' + s
+            else:
+                ret += s
+            last_unk = True
+        else:
+            if ret:
+                ret += '+'
+            ret += s
+            last_unk = False
+    return ret
+
+def process_word_morf(word, tokenizer, sout, analyzer):
+    # Look up every piece of every option and write the analyses in
+    # Apertium stream format, one ^...$ unit per chunk.
+    spans = find_spans(word, tokenizer)
+    if len(spans) == 0:
+        sout.write('^' + word + '/*' + word + '$')
+        return
+    lat = lattice(spans)
+    for sp, ops in lat:
+        analyses = []
+        for op in ops:
+            an = []
+            for s in op:
+                w = word[s[0]:s[1]]
+                d = analyzer.lookup(w)
+                if len(d) == 0:
+                    an.append(['*' + w])
+                else:
+                    ls = []
+                    for k in d:
+                        for a, wt in d[k]:
+                            ls.append(a.replace('@_EPSILON_SYMBOL_@', ''))
+                    an.append(ls)
+            analyses += list(product(*an))
+        sout.write('^' + word[sp[0]:sp[1]])
+        for op in analyses:
+            sout.write('/' + string_op(op))
+        sout.write('$')
+
+def process_stream(tokenizer, sin, sout, analyzer=None):
+    # Read one character at a time, buffering runs of alphabet characters
+    # and handing each buffered word to the tokenizer (and analyzer, if given).
+    alpha = tokenizer.get_alphabet()
+    cur_word = ''
+    while True:
+        c = sin.read(1)
+        if c in alpha:
+            cur_word += c
+        else:
+            if cur_word:
+                if analyzer:
+                    process_word_morf(cur_word, tokenizer, sout, analyzer)
+                else:
+                    process_word_tok(cur_word, tokenizer, sout)
+            if c:
+                cur_word = ''
+                sout.write(c)
+            else:
+                break
+
+if __name__ == '__main__':
+    import argparse
+    prs = argparse.ArgumentParser(description='Segment input stream using HFST')
+    prs.add_argument('transducer')
+    prs.add_argument('analyzer', nargs='?')
+    args = prs.parse_args()
+    stream_tok = hfst.HfstInputStream(args.transducer)
+    tokenizer = hfst.HfstBasicTransducer(stream_tok.read())
+    analyzer = None
+    if args.analyzer:
+        stream_morf = hfst.HfstInputStream(args.analyzer)
+        analyzer = hfst.HfstBasicTransducer(stream_morf.read())
+    import sys
+    process_stream(tokenizer, sys.stdin, sys.stdout, analyzer)
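
The script can also be exercised on its own; a minimal sketch, assuming the jpn@Yi.autotok.hfst and jpn.automorf.hfst transducers built by the Makefile rules above (input.txt stands in for any text to segment):

    # insert spaces at the segment boundaries the tokenizer finds
    python3 tokenize.py jpn@Yi.autotok.hfst < input.txt

    # additionally look up each segment and emit ^surface/analysis$ units
    python3 tokenize.py jpn@Yi.autotok.hfst jpn.automorf.hfst < input.txt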