commit b084615183ff7a53a4cb4629a0946c146bfc9583 Author: Daniel Swanson Date: Thu May 20 07:45:41 2021 -0500 add editdist script (copied apertium-kaz version) diff --git a/scripts/Makefile.am b/scripts/Makefile.am index bdecf4f..0c857a9 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -11,6 +11,7 @@ apertium-translate-to-default-equivalent: apertium-translate-to-default-equivale $(SED) -e 's|[@]datarootdir@|$(datarootdir)|g' < "$<" > "$@" && chmod +x "$@" dist_bin_SCRIPTS = \ + apertium-editdist \ apertium-filter-dix \ apertium-filter-rules \ apertium-genvdix \ diff --git a/scripts/apertium-editdist b/scripts/apertium-editdist new file mode 100755 index 0000000..f283f6d --- /dev/null +++ b/scripts/apertium-editdist @@ -0,0 +1,357 @@ +#!/usr/bin/python + +# see editdist.py --help for usage + + +import sys +import struct +import codecs +from optparse import OptionParser + +usage_string = "usage: %prog [options] alphabet" + +info_string = """ +Produce an edit distance transducer in ATT format. + +There are three ways to produce an edit distance transducer: + +* giving the alphabet as a command line argument +* giving a file with specialized configuration syntax +* giving a transducer in optimized-lookup format to induce an alphabet + (in this case only symbols with length 1 are considered) + +These ways may be combined freely. + +For the default case, all the desired transitions are generated with weight 1.0. +The alphabet is read from a string which contains all the (utf-8) characters +you want to use. Alternatively, an existing optimized-lookup transducer +can be supplied for reading the alphabet. + +The specification file should be in the following format: +* First, an (optional) list of tokens separated by newlines + All transitions involving these tokens that are otherwise unspecified + are generated with weight 1.0. Symbol weights can be specified by appending + a tab and a weight to the symbol. Transitions involving such a symbol + will have the user-specified weight added to it. +* If you want to exclude symbols that may be induced from a transducer, + add a leading ~ character to that line. +* If you want to specify transitions, insert a line with the content "@@" + (without the quotes) +* In the following lines, specified transitions with the form + FROM TO WEIGHT + where FROM is the source token, TO is the destination token and WEIGHT is + a nonnegative floating point number specifying the weight. By default, + if only one transition involving FROM and TO is specified, the same WEIGHT + will be used to specify the transition TO -> FROM (assuming that both are + listed in the list of tokens). +* If the command line option to generate swaps is set, you can also specify swap + weights with + FROM,TO TO,FROM WEIGHT + Again, unspecified swaps will be generated automatically with weight 1.0. +* Lines starting with ## are comments. + +with d for distance and S for size of alphabet plus one +(for epsilon), expected output is a transducer in ATT format with +* Swapless: +** d + 1 states +** d*(S^2 + S - 1) transitions +* Swapful: +** d*(S^2 - 3S + 3) + 1 states +** d*(3S^2 - 5S + 3) transitions +""" + +# Some utility classes + +class Header: + """Read and provide interface to header""" + + def __init__(self, file): + bytes = file.read(5) # "HFST\0" + if str(struct.unpack_from("<5s", bytes, 0)) == "('HFST\\x00',)": + # just ignore any hfst3 header + remaining = struct.unpack_from(" 1: + print "Too many options!" + sys.exit() + +if options.inputfile != None: + try: + inputfile = open(options.inputfile) + except IOError: + print "Couldn't open " + options.inputfile + sys.exit() + while True: + line = unicode(inputfile.readline(), 'utf-8') + if line in ("@@\n", ""): + break + if line.strip() != "": + if line.startswith(u'##'): + continue + if len(line) > 1 and line.startswith(u'~'): + exclusions.add(line[1:].strip()) + continue + if '\t' in line: + weight = float(line.split('\t')[1]) + symbol = linesplit('\t')[0] + else: + weight = 0.0 + symbol = line.strip("\n") + alphabet[symbol] = weight + +if len(args) == 1: + for c in unicode(args[0], 'utf-8'): + if c not in alphabet.keys() and c not in exclusions: + alphabet[c] = 0.0 +if options.alphabetfile != None: + afile = open(options.alphabetfile, "rb") + ol_header = Header(afile) + ol_alphabet = Alphabet(afile, ol_header.number_of_symbols) + for c in filter(lambda x: x.strip() != '', ol_alphabet.keyTable[:]): + if c not in alphabet.keys() and c not in exclusions: + alphabet[c] = 0.0 +epsilon = unicode(options.epsilon, 'utf-8') +OTHER = u'@_UNKNOWN_SYMBOL_@' + +def p(string): # stupid python, or possibly stupid me + return string.encode('utf-8') + +def maketrans(from_st, to_st, from_sy, to_sy, weight): + return str(from_st) + "\t" + str(to_st) + "\t" + p(from_sy) + "\t" + p(to_sy) + "\t" + str(weight) + +class Transducer: + def __init__(self, alphabet, _other = OTHER, _epsilon = epsilon): + self.alphabet = alphabet + self.substitutions = {} + self.swaps = {} + self.other = _other + self.epsilon = _epsilon + self.swapstate = options.distance + 1 + self.skipstate = self.swapstate + 1 + self.transitions = [] + + def process(self, specification): + parts = specification.split('\t') + if len(parts) != 3: + raise ValueError("Got specification with " + str(len(parts)) +\ + " parts, expected 3:\n" + specification) + weight = float(parts[2]) + if ',' in parts[0]: + frompair = tuple(parts[0].split(',')) + topair = tuple(parts[1].split(',')) + if not (len(frompair) == len(topair) == 2): + raise ValueError("Got swap-specification with incorrect number " + "of comma separators:\n" + specification) + if (frompair, topair) not in self.swaps: + self.swaps[(frompair, topair)] = weight + else: + if not (parts[0], parts[1]) in self.substitutions: + self.substitutions[(parts[0], parts[1])] = weight + + def generate(self): + # for substitutions and swaps that weren't defined by the user, + # generate standard subs and swaps + if (self.other, self.epsilon) not in self.substitutions: + self.substitutions[(self.other, self.epsilon)] = 1.0 + for symbol in self.alphabet.keys(): + if (self.other, symbol) not in self.substitutions: + self.substitutions[(self.other, symbol)] = 1.0 + alphabet[symbol] + if (self.epsilon, symbol) not in self.substitutions: + self.substitutions[(self.epsilon, symbol)] = 1.0 + alphabet[symbol] + if (symbol, self.epsilon) not in self.substitutions: + self.substitutions[(symbol, self.epsilon)] = 1.0 + alphabet[symbol] + for symbol2 in self.alphabet.keys(): + if symbol == symbol2: continue + if ((symbol, symbol2), (symbol2, symbol)) not in self.swaps: + if ((symbol2, symbol), (symbol, symbol2)) in self.swaps: + self.swaps[((symbol, symbol2), (symbol2, symbol))] = self.swaps[((symbol2, symbol), (symbol, symbol2))] + else: + self.swaps[((symbol, symbol2), (symbol2, symbol))] = 1.0 + alphabet[symbol] + alphabet[symbol2] + if (symbol, symbol2) not in self.substitutions: + if (symbol2, symbol) in self.substitutions: + self.substitutions[(symbol, symbol2)] = self.substitutions[(symbol2, symbol)] + else: + self.substitutions[(symbol, symbol2)] = 1.0 + alphabet[symbol] + alphabet[symbol2] + + def next_special(self, state): + if state == "swap": + self.swapstate += 1 + while self.swapstate == self.skipstate: + self.swapstate += 1 + elif state == "skip": + self.skipstate += 1 + while self.skipstate == self.swapstate: + self.skipstate += 1 + else: + raise Exception + + def make_identities(self, state, nextstate = None): + if nextstate is None: + nextstate = state + ret = [] + for symbol in self.alphabet.keys(): + if symbol not in (self.epsilon, self.other): + ret.append(maketrans(state, nextstate, symbol, symbol, 0.0)) + return ret + + def make_swaps(self, state, nextstate = None): + if nextstate is None: + nextstate = state + 1 + ret = [] + if options.swap: + for swap in self.swaps: + ret.append(maketrans(state, self.swapstate, swap[0][0], swap[0][1], self.swaps[swap])) + ret.append(maketrans(self.swapstate, nextstate, swap[1][0], swap[1][1], 0.0)) + self.next_special("swap") + return ret + + # for substitutions, we try to eliminate redundancies by refusing to do + # deletion right after insertion and insertion right after deletion + def make_substitutions(self, state, nextstate = None): + if nextstate is None: + nextstate = state + 1 + ret = [] + for sub in self.substitutions: + if (nextstate + 1 >= options.distance) or options.no_elim: + ret.append(maketrans(state, nextstate, sub[0], sub[1], self.substitutions[sub])) + elif sub[1] is self.epsilon: # deletion + ret.append(maketrans(state, self.skipstate, sub[0], sub[1], self.substitutions[sub])) + ret += self.make_identities(self.skipstate, nextstate) + ret += self.make_swaps(self.skipstate, nextstate + 1) + for sub2 in self.substitutions: + # after deletion, refuse to do insertion + if sub2[0] != self.epsilon: + ret.append(maketrans(self.skipstate, nextstate + 1, sub2[0], sub2[1], self.substitutions[sub2])) + self.next_special("skip") + elif sub[0] is self.epsilon: # insertion + ret.append(maketrans(state, self.skipstate, sub[0], sub[1], self.substitutions[sub])) + ret += self.make_identities(self.skipstate, nextstate) + ret += self.make_swaps(self.skipstate, nextstate + 1) + for sub2 in self.substitutions: + # after insertion, refuse to do deletion + if sub2[1] != self.epsilon: + ret.append(maketrans(self.skipstate, nextstate + 1, sub2[0], sub2[1], self.substitutions[sub2])) + self.next_special("skip") + else: + ret.append(maketrans(state, nextstate, sub[0], sub[1], self.substitutions[sub])) + return ret + + def make_transitions(self): + for state in range(options.distance): + self.transitions.append(str(state + 1) + "\t0.0") # final states + self.transitions += self.make_identities(state) + self.transitions += self.make_substitutions(state) + self.transitions += self.make_swaps(state) + self.transitions += self.make_identities(options.distance) + +transducer = Transducer(alphabet) + +if options.inputfile != None: + while True: + line = inputfile.readline() + if line.startswith('##'): + continue + if line == "\n": + continue + if line == "": + break + transducer.process(unicode(line, "utf-8")) + +transducer.generate() +transducer.make_transitions() +for transition in transducer.transitions: + print transition + +stderr_u8 = codecs.getwriter('utf-8')(sys.stderr) + +if options.verbose: + stderr_u8.write("\n" + str(max(transducer.skipstate, transducer.swapstate)) + " states and " + str(len(transducer.transitions)) + " transitions written for\n"+ + "distance " + str(options.distance) + " and base alphabet size " + str(len(transducer.alphabet)) +"\n\n") + stderr_u8.write("The alphabet was:\n") + for symbol, weight in alphabet.iteritems(): + stderr_u8.write(symbol + "\t" + str(weight) + "\n") + if len(exclusions) != 0: + stderr_u8.write("The exclusions were:\n") + for symbol in exclusions: + stderr_u8.write(symbol + "\n")