commit 726c57a6a9753af6d09905e102010b27aa74e501 Author: Daniel Swanson Date: Thu May 20 12:47:26 2021 -0500 clean up editdist some more diff --git a/scripts/apertium-editdist b/scripts/apertium-editdist index f036622..7dae1c9 100755 --- a/scripts/apertium-editdist +++ b/scripts/apertium-editdist @@ -2,30 +2,14 @@ # see apertium-editdist --help for usage - import sys import struct -import codecs -from optparse import OptionParser - -usage_string = "usage: %prog [options] alphabet" - -info_string = """ -Produce an edit distance transducer in ATT format. - -There are three ways to produce an edit distance transducer: +import argparse -* giving the alphabet as a command line argument -* giving a file with specialized configuration syntax -* giving a transducer in optimized-lookup format to induce an alphabet - (in this case only symbols with length 1 are considered) - -These ways may be combined freely. +description_string = "Produce an edit distance transducer in ATT format." +epilog_string = """ For the default case, all the desired transitions are generated with weight 1.0. -The alphabet is read from a string which contains all the (utf-8) characters -you want to use. Alternatively, an existing optimized-lookup transducer -can be supplied for reading the alphabet. The specification file should be in the following format: * First, an (optional) list of tokens separated by newlines @@ -60,199 +44,123 @@ with d for distance and S for size of alphabet plus one ** d*(3S^2 - 5S + 3) transitions """ -# Some utility classes - -class Header: - """Read and provide interface to header""" - - def __init__(self, file): - bytes = file.read(5) # "HFST\0" - if str(struct.unpack_from("<5s", bytes, 0)) == "('HFST\\x00',)": - # just ignore any hfst3 header - remaining = struct.unpack_from(" 1: - print("Too many options!") - sys.exit() - -if options.inputfile != None: - try: - inputfile = open(options.inputfile) - except IOError: - print("Couldn't open " + options.inputfile) - sys.exit() - while True: - line = str(inputfile.readline(), 'utf-8') - if line in ("@@\n", ""): - break - if line.strip() != "": - if line.startswith('##'): - continue - if len(line) > 1 and line.startswith('~'): - exclusions.add(line[1:].strip()) - continue - if '\t' in line: - weight = float(line.split('\t')[1]) - symbol = linesplit('\t')[0] - else: - weight = 0.0 - symbol = line.strip("\n") - alphabet[symbol] = weight - -if len(args) == 1: - for c in str(args[0], 'utf-8'): - if c not in list(alphabet.keys()) and c not in exclusions: - alphabet[c] = 0.0 -if options.alphabetfile != None: - afile = open(options.alphabetfile, "rb") - ol_header = Header(afile) - ol_alphabet = Alphabet(afile, ol_header.number_of_symbols) - for c in [x for x in ol_alphabet.keyTable[:] if x.strip() != '']: - if c not in list(alphabet.keys()) and c not in exclusions: - alphabet[c] = 0.0 -epsilon = str(options.epsilon, 'utf-8') -OTHER = '@_UNKNOWN_SYMBOL_@' - -def p(string): # stupid python, or possibly stupid me - return string.encode('utf-8') - def maketrans(from_st, to_st, from_sy, to_sy, weight): - return str(from_st) + "\t" + str(to_st) + "\t" + p(from_sy) + "\t" + p(to_sy) + "\t" + str(weight) + return "{0}\t{1}\t{2}\t{3}\t{4}".format(from_st, to_st, from_sy, to_sy, weight) class Transducer: - def __init__(self, alphabet, _other = OTHER, _epsilon = epsilon): - self.alphabet = alphabet + def __init__(self, dist, other, epsilon, swap, elim): + self.distance = dist + self.alphabet = {} + self.exclusions = set() self.substitutions = {} self.swaps = {} - self.other = _other - self.epsilon = _epsilon - self.swapstate = options.distance + 1 + self.should_swap = swap + self.should_elim = elim + self.other = other + self.epsilon = epsilon + self.swapstate = self.distance + 1 self.skipstate = self.swapstate + 1 self.transitions = [] - def process(self, specification): - parts = specification.split('\t') - if len(parts) != 3: - raise ValueError("Got specification with " + str(len(parts)) +\ - " parts, expected 3:\n" + specification) - weight = float(parts[2]) - if ',' in parts[0]: - frompair = tuple(parts[0].split(',')) - topair = tuple(parts[1].split(',')) - if not (len(frompair) == len(topair) == 2): - raise ValueError("Got swap-specification with incorrect number " - "of comma separators:\n" + specification) - if (frompair, topair) not in self.swaps: - self.swaps[(frompair, topair)] = weight - else: - if not (parts[0], parts[1]) in self.substitutions: - self.substitutions[(parts[0], parts[1])] = weight + def read_input_file(self, fname): + def parse_error(err, line): + nonlocal fname + sys.stderr.write('Syntax error on line %s of %s: %s.' % (fname, line, err)) + sys.exit(1) + with open(fname) as fin: + in_alpha = True + for i, line_ in enumerate(fin, 1): + line = line_.strip() + if not line or line.startswith('##'): + continue + if in_alpha: + if line == '@@': + in_alpha = False + elif len(line) > 1 and line[0] == '~': + self.exclusions.add(line[1:].strip()) + continue + elif '\t' in line: + if line.count('\t') > 1: + parse_error('Too many tabs', i) + symbol, weight = line.split('\t') + try: + self.alphabet[symbol] = float(weight) + except: + parse_error('Unable to parse weight', i) + else: + self.alphabet[line] = 0.0 + else: + if line.count('\t') > 2 or line.count('\t') == 1: + parse_error('Wrong number of tabs, expected 3 tab-separated columns', i) + elif line.count('\t') == 0: + parse_error('Substitutions and swaps must be tab-separated', i) + l, r, w = line.split('\t') + weight = 0.0 + try: + weight = float(w) + except: + parse_error('Unable to parse weight', i) + if ',' in line: + frompair = l.split(',') + topair = r.split(',') + if not (len(frompair) == len(topair) == 2): + parse_error('Swap-specification has wrong number of comma separators', i) + self.swaps.setdefault((frompair, topair), weight) + else: + self.substitutions.setdefault((l, r), weight) + + def read_optimized_lookup_alphabet(self, fname): + with open(fname, "rb") as fin: + byt = fin.read(5) + if byt == b"HFST\0": + # just ignore any hfst3 header + header_length = struct.unpack_from("= options.distance) or options.no_elim: + if (nextstate + 1 >= self.distance) or not self.should_elim: ret.append(maketrans(state, nextstate, sub[0], sub[1], self.substitutions[sub])) elif sub[1] is self.epsilon: # deletion ret.append(maketrans(state, self.skipstate, sub[0], sub[1], self.substitutions[sub])) @@ -318,40 +226,77 @@ class Transducer: return ret def make_transitions(self): - for state in range(options.distance): + for state in range(self.distance): self.transitions.append(str(state + 1) + "\t0.0") # final states self.transitions += self.make_identities(state) self.transitions += self.make_substitutions(state) self.transitions += self.make_swaps(state) - self.transitions += self.make_identities(options.distance) - -transducer = Transducer(alphabet) - -if options.inputfile != None: - while True: - line = inputfile.readline() - if line.startswith('##'): - continue - if line == "\n": - continue - if line == "": - break - transducer.process(str(line, "utf-8")) - -transducer.generate() -transducer.make_transitions() -for transition in transducer.transitions: - print(transition) - -stderr_u8 = codecs.getwriter('utf-8')(sys.stderr) - -if options.verbose: - stderr_u8.write("\n" + str(max(transducer.skipstate, transducer.swapstate)) + " states and " + str(len(transducer.transitions)) + " transitions written for\n"+ - "distance " + str(options.distance) + " and base alphabet size " + str(len(transducer.alphabet)) +"\n\n") - stderr_u8.write("The alphabet was:\n") - for symbol, weight in alphabet.items(): - stderr_u8.write(symbol + "\t" + str(weight) + "\n") - if len(exclusions) != 0: - stderr_u8.write("The exclusions were:\n") - for symbol in exclusions: - stderr_u8.write(symbol + "\n") + self.transitions += self.make_identities(self.distance) + + def verbose_report(self): + template = ''' +{states} states and {trans} transitions written for +distance {dist} and base alphabet size {alpha_size} + +The alphabet was: +{alpha} +The exclusions were: +{excl} +''' + alpha = ''.join('%s\t%s\n' % (s, w) for s, w in self.alphabet.items()) + excl = '\n'.join(self.exclusions) + report = template.format(states=max(self.skipstate, self.swapstate), + trans=len(self.transitions), + dist=self.distance, + alpha_size=len(self.alphabet), + alpha=alpha, + excl=excl) + sys.stderr.write(report) + +def main(): + parser = argparse.ArgumentParser(description=description_string, + epilog=epilog_string, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('-d', '--distance', type=int, metavar='DIST', default=1, + help='edit depth, default is 1') + parser.add_argument('-e', '--epsilon', default='@0@', metavar='EPS', + help='epsilon symbol, default is @0@') + parser.add_argument('--no-elim', action='store_true', + help="don't reduce elimination") + parser.add_argument('-s', '--swap', action='store_true', + help='generate swaps in addition to insertions and deletions') + parser.add_argument('-v', '--verbose', action='store_true', + help='print summary to stderr') + alpha = parser.add_argument_group('Alphabet', 'Specify the alphabet of the transducer (these may be combined and repeated)') + alpha.add_argument('-i', '--input', dest='inputfile', metavar='INPUT', + action='append', default=[], + help='specification file in edit-distance syntax') + alpha.add_argument('-a', '--alphabet', dest='alphabetfile', metavar='TRANS', + action='append', default=[], + help='optimized-lookup format transducer to read alphabet from (ignoring multi-character symbols)') + alpha.add_argument('alphabet_string', nargs='*', + help='the alphabet as a string on the command line') + options = parser.parse_args() + + transducer = Transducer(options.distance, '@_UNKNOWN_SYMBOL_@', options.epsilon, options.swap, not options.no_elim) + + if not options.inputfile and not options.alphabet_string and not options.alphabetfile: + parser.error('Must provide an alphabet') + for fl in options.inputfile: + transducer.read_input_file(fl) + for s in options.alphabet_string: + transducer.extend_alphabet(s) + for fl in options.alphabetfile: + transducer.read_optimized_lookup_alphabet(fl) + transducer.clean_alphabet() + + transducer.generate() + transducer.make_transitions() + for transition in transducer.transitions: + print(transition) + + if options.verbose: + transducer.verbose_report() + +if __name__ == '__main__': + main()