commit 726c57a6a9753af6d09905e102010b27aa74e501
Author: Daniel Swanson <popcorn.tomato.dude@gmail.com>
Date:   Thu May 20 12:47:26 2021 -0500

    clean up editdist some more

diff --git a/scripts/apertium-editdist b/scripts/apertium-editdist
index f036622..7dae1c9 100755
--- a/scripts/apertium-editdist
+++ b/scripts/apertium-editdist
@@ -2,30 +2,14 @@
 
 # see apertium-editdist --help for usage
 
-
 import sys
 import struct
-import codecs
-from optparse import OptionParser
-
-usage_string = "usage: %prog [options] alphabet"
-
-info_string = """
-Produce an edit distance transducer in ATT format.
-
-There are three ways to produce an edit distance transducer:
+import argparse
 
-* giving the alphabet as a command line argument
-* giving a file with specialized configuration syntax
-* giving a transducer in optimized-lookup format to induce an alphabet
-    (in this case only symbols with length 1 are considered)
-
-These ways may be combined freely.
+description_string = "Produce an edit distance transducer in ATT format."
 
+epilog_string = """
 For the default case, all the desired transitions are generated with weight 1.0.
-The alphabet is read from a string which contains all the (utf-8) characters
-you want to use. Alternatively, an existing optimized-lookup transducer
-can be supplied for reading the alphabet.
 
 The specification file should be in the following format:
 * First, an (optional) list of tokens separated by newlines
@@ -60,199 +44,123 @@ with d for distance and S for size of alphabet plus one
 ** d*(3S^2 - 5S + 3) transitions
 """
 
-# Some utility classes
-
-class Header:
-    """Read and provide interface to header"""
-
-    def __init__(self, file):
-        bytes = file.read(5) # "HFST\0"
-        if str(struct.unpack_from("<5s", bytes, 0)) == "('HFST\\x00',)":
-            # just ignore any hfst3 header
-            remaining = struct.unpack_from("<H", file.read(3), 0)[0]
-            self.handle_hfst3_header(file, remaining)
-            bytes = file.read(56) # 2 unsigned shorts, 4 unsigned ints and 9 uint-bools
-        else:
-            bytes = bytes + file.read(56 - 5)
-        self.number_of_input_symbols             = struct.unpack_from("<H", bytes, 0)[0]
-        self.number_of_symbols                   = struct.unpack_from("<H", bytes, 2)[0]
-        self.size_of_transition_index_table      = struct.unpack_from("<I", bytes, 4)[0]
-        self.size_of_transition_target_table     = struct.unpack_from("<I", bytes, 8)[0]
-        self.number_of_states                    = struct.unpack_from("<I", bytes, 12)[0]
-        self.number_of_transitions               = struct.unpack_from("<I", bytes, 16)[0]
-        self.weighted                            = struct.unpack_from("<I", bytes, 20)[0] != 0
-        self.deterministic                       = struct.unpack_from("<I", bytes, 24)[0] != 0
-        self.input_deterministic                 = struct.unpack_from("<I", bytes, 28)[0] != 0
-        self.minimized                           = struct.unpack_from("<I", bytes, 32)[0] != 0
-        self.cyclic                              = struct.unpack_from("<I", bytes, 36)[0] != 0
-        self.has_epsilon_epsilon_transitions     = struct.unpack_from("<I", bytes, 40)[0] != 0
-        self.has_input_epsilon_transitions       = struct.unpack_from("<I", bytes, 44)[0] != 0
-        self.has_input_epsilon_cycles            = struct.unpack_from("<I", bytes, 48)[0] != 0
-        self.has_unweighted_input_epsilon_cycles = struct.unpack_from("<I", bytes, 52)[0] != 0
-
-    def handle_hfst3_header(self, file, remaining):
-        chars = struct.unpack_from("<" + str(remaining) + "c",
-                                   file.read(remaining), 0)
-        # assume the h3-header doesn't say anything surprising for now
-
-class Alphabet:
-    """Read and provide interface to alphabet"""
-
-    def __init__(self, file, number_of_symbols):
-        stderr_u8 = codecs.getwriter('utf-8')(sys.stderr)
-        self.keyTable = [] # list of unicode objects, use foo.encode("utf-8") to print
-        for x in range(number_of_symbols):
-            symbol = ""
-            while True:
-                byte = file.read(1)
-                if byte == '\0': # a symbol has ended
-                    symbol = str(symbol, "utf-8")
-                    if len(symbol) != 1:
-                        stderr_u8.write("Ignored symbol " + symbol + "\n")
-                    else:
-                        self.keyTable.append(symbol)
-                    break
-                symbol += byte
-
-class MyOptionParser(OptionParser):
-    # This is needed to override the formatting of the help string
-    def format_epilog(self, formatter):
-        return self.epilog
-
-parser = MyOptionParser(usage=usage_string, epilog=info_string)
-parser.add_option("-e", "--epsilon", dest = "epsilon",
-                  help = "specify symbol to use as epsilon, default is @0@",
-                  metavar = "EPS")
-parser.add_option("-d", "--distance", type = "int", dest = "distance",
-                  help = "specify edit depth, default is 1",
-                  metavar = "DIST")
-parser.add_option("-s", "--swap", action = "store_true", dest="swap",
-                  help = "generate swaps (as well as insertions and deletions)")
-parser.add_option("", "--no-elim", action = "store_true", dest="no_elim",
-                  help = "don't do redundancy elimination")
-parser.add_option("-i", "--input", dest = "inputfile",
-                  help = "optional file with special edit-distance syntax",
-                  metavar = "INPUT")
-parser.add_option("-a", "--alphabet", dest = "alphabetfile",
-                  help = "read the alphabet from an existing optimized-lookup format transducer",
-                  metavar = "ALPHABET")
-parser.add_option("-v", "--verbose", action = "store_true", dest="verbose",
-                  help = "print some diagnostics to standard error")
-parser.set_defaults(epsilon = '@0@')
-parser.set_defaults(distance = 1)
-parser.set_defaults(swap = False)
-parser.set_defaults(no_elim = False)
-parser.set_defaults(verbose = False)
-(options, args) = parser.parse_args()
-
-alphabet = {}
-exclusions = set()
-
-if options.inputfile == None and options.alphabetfile == None \
-        and len(args) == 0:
-    print("Specify at least one of INPUT, ALPHABET or alphabet string")
-    sys.exit()
-if len(args) > 1:
-    print("Too many options!")
-    sys.exit()
-
-if options.inputfile != None:
-    try:
-        inputfile = open(options.inputfile)
-    except IOError:
-        print("Couldn't open " + options.inputfile)
-        sys.exit()
-    while True:
-        line = str(inputfile.readline(), 'utf-8')
-        if line in ("@@\n", ""):
-            break
-        if line.strip() != "":
-            if line.startswith('##'):
-                continue
-            if len(line) > 1 and line.startswith('~'):
-                exclusions.add(line[1:].strip())
-                continue
-            if '\t' in line:
-                weight = float(line.split('\t')[1])
-                symbol = linesplit('\t')[0]
-            else:
-                weight = 0.0
-                symbol = line.strip("\n")
-            alphabet[symbol] = weight
-
-if len(args) == 1:
-    for c in str(args[0], 'utf-8'):
-        if c not in list(alphabet.keys()) and c not in exclusions:
-            alphabet[c] = 0.0
-if options.alphabetfile != None:
-    afile = open(options.alphabetfile, "rb")
-    ol_header = Header(afile)
-    ol_alphabet = Alphabet(afile, ol_header.number_of_symbols)
-    for c in [x for x in ol_alphabet.keyTable[:] if x.strip() != '']:
-        if c not in list(alphabet.keys()) and c not in exclusions:
-            alphabet[c] = 0.0
-epsilon = str(options.epsilon, 'utf-8')
-OTHER = '@_UNKNOWN_SYMBOL_@'
-
-def p(string): # stupid python, or possibly stupid me
-    return string.encode('utf-8')
-
 def maketrans(from_st, to_st, from_sy, to_sy, weight):
-    return str(from_st) + "\t" + str(to_st) + "\t" + p(from_sy) + "\t" + p(to_sy) + "\t" + str(weight)
+    return "{0}\t{1}\t{2}\t{3}\t{4}".format(from_st, to_st, from_sy, to_sy, weight)  # one tab-separated transition line
 
 class Transducer:
-    def __init__(self, alphabet, _other = OTHER, _epsilon = epsilon):
-        self.alphabet = alphabet
+    def __init__(self, dist, other, epsilon, swap, elim):
+        self.distance = dist  # edit depth
+        self.alphabet = {}  # symbol -> weight
+        self.exclusions = set()  # symbols that must not end up in the alphabet
         self.substitutions = {}
         self.swaps = {}
-        self.other = _other
-        self.epsilon = _epsilon
-        self.swapstate = options.distance + 1
+        self.should_swap = swap  # emit swap transitions in make_swaps()
+        self.should_elim = elim  # apply redundancy elimination in make_substitutions()
+        self.other = other  # unknown-symbol marker
+        self.epsilon = epsilon  # epsilon symbol
+        self.swapstate = self.distance + 1  # state number of the intermediate state used by swap transitions
         self.skipstate = self.swapstate + 1
         self.transitions = []
 
-    def process(self, specification):
-        parts = specification.split('\t')
-        if len(parts) != 3:
-            raise ValueError("Got specification with " + str(len(parts)) +\
-                                 " parts, expected 3:\n" + specification)
-        weight = float(parts[2])
-        if ',' in parts[0]:
-            frompair = tuple(parts[0].split(','))
-            topair = tuple(parts[1].split(','))
-            if not (len(frompair) == len(topair) == 2):
-                raise ValueError("Got swap-specification with incorrect number "
-                                 "of comma separators:\n" + specification)
-            if (frompair, topair) not in self.swaps:
-                self.swaps[(frompair, topair)] = weight
-        else:
-            if not (parts[0], parts[1]) in self.substitutions:
-                self.substitutions[(parts[0], parts[1])] = weight
+    def read_input_file(self, fname):
+        """Read alphabet entries (before '@@') and substitution/swap weights (after) from fname."""
+        def parse_error(err, line):
+            sys.stderr.write('Syntax error on line %s of %s: %s.\n' % (line, fname, err))
+            sys.exit(1)
+        with open(fname, encoding='utf-8') as fin:
+            in_alpha = True
+            for i, line_ in enumerate(fin, 1):
+                line = line_.strip()
+                if not line or line.startswith('##'):
+                    continue
+                if in_alpha:
+                    if line == '@@':
+                        in_alpha = False
+                    elif len(line) > 1 and line[0] == '~':
+                        # excluded symbols are removed from the alphabet by clean_alphabet()
+                        self.exclusions.add(line[1:].strip())
+                    elif '\t' in line:
+                        if line.count('\t') > 1:
+                            parse_error('Too many tabs', i)
+                        symbol, weight = line.split('\t')
+                        try:
+                            self.alphabet[symbol] = float(weight)
+                        except ValueError:
+                            parse_error('Unable to parse weight', i)
+                    else:
+                        self.alphabet[line] = 0.0
+                else:
+                    if line.count('\t') > 2 or line.count('\t') == 1:
+                        parse_error('Wrong number of tabs, expected 3 tab-separated columns', i)
+                    elif line.count('\t') == 0:
+                        parse_error('Substitutions and swaps must be tab-separated', i)
+                    lhs, rhs, w = line.split('\t')
+                    try:
+                        weight = float(w)
+                    except ValueError:
+                        parse_error('Unable to parse weight', i)
+                    if ',' in line:
+                        # pairs must be tuples: they are used as dictionary keys
+                        frompair = tuple(lhs.split(','))
+                        topair = tuple(rhs.split(','))
+                        if not (len(frompair) == len(topair) == 2):
+                            parse_error('Swap-specification has wrong number of comma separators', i)
+                        self.swaps.setdefault((frompair, topair), weight)
+                    else:
+                        self.substitutions.setdefault((lhs, rhs), weight)
+
+    def read_optimized_lookup_alphabet(self, fname):
+        """Add the single-character symbols of an optimized-lookup transducer to the alphabet."""
+        with open(fname, "rb") as fin:
+            byt = fin.read(5)
+            if byt == b"HFST\0":
+                # skip the hfst3 header, hoping there's nothing surprising in it
+                fin.read(struct.unpack_from("<H", fin.read(3), 0)[0])
+                byt = fin.read(5)
+            byt += fin.read(56 - 5)
+            symbol_count = struct.unpack_from("<H", byt, 2)[0]
+            for _ in range(symbol_count):
+                s, c = b'', fin.read(1)
+                while c not in (b'', b'\0'):
+                    s, c = s + c, fin.read(1)
+                if not c:  # EOF before the terminating NUL: stop instead of looping forever
+                    sys.exit('Unexpected end of file while reading alphabet from ' + fname)
+                sym = s.decode('utf-8')
+                if len(sym) != 1:
+                    sys.stderr.write("Ignored symbol " + sym + "\n")
+                elif not sym.isspace() and sym not in self.exclusions:
+                    self.alphabet.setdefault(sym, 0.0)
+
+    def extend_alphabet(self, alpha):  # alpha: iterable of symbols (main() passes each command-line string)
+        for c in alpha:
+            if c not in self.exclusions:
+                self.alphabet.setdefault(c, 0.0)  # keeps any weight already assigned
+
+    def clean_alphabet(self):
+        # depending on what order read_input_file(), extend_alphabet()
+        # and read_optimized_lookup_alphabet() are called in, symbols
+        # might get included in the alphabet which should be excluded
+        # so remove those
+        for s in self.exclusions:
+            if s in self.alphabet:
+                del self.alphabet[s]
 
     def generate(self):
         # for substitutions and swaps that weren't defined by the user,
         # generate standard subs and swaps
-        if (self.other, self.epsilon) not in self.substitutions:
-            self.substitutions[(self.other, self.epsilon)] = 1.0
-        for symbol in list(self.alphabet.keys()):
-            if (self.other, symbol) not in self.substitutions:
-                self.substitutions[(self.other, symbol)] = 1.0 + alphabet[symbol]
-            if (self.epsilon, symbol) not in self.substitutions:
-                self.substitutions[(self.epsilon, symbol)] = 1.0 + alphabet[symbol]
-            if (symbol, self.epsilon) not in self.substitutions:
-                self.substitutions[(symbol, self.epsilon)] = 1.0 + alphabet[symbol]
-            for symbol2 in list(self.alphabet.keys()):
+        self.substitutions.setdefault((self.other, self.epsilon), 1.0)
+        for symbol in self.alphabet:
+            w = 1.0 + self.alphabet[symbol]  # default weight for edits involving only this symbol
+            self.substitutions.setdefault((self.other, symbol), w)
+            self.substitutions.setdefault((self.epsilon, symbol), w)
+            self.substitutions.setdefault((symbol, self.epsilon), w)
+            for symbol2 in self.alphabet:
                 if symbol == symbol2: continue
-                if ((symbol, symbol2), (symbol2, symbol)) not in self.swaps:
-                    if ((symbol2, symbol), (symbol, symbol2)) in self.swaps:
-                        self.swaps[((symbol, symbol2), (symbol2, symbol))] = self.swaps[((symbol2, symbol), (symbol, symbol2))]
-                    else:
-                        self.swaps[((symbol, symbol2), (symbol2, symbol))] = 1.0 + alphabet[symbol] + alphabet[symbol2]
-                if (symbol, symbol2) not in self.substitutions:
-                    if (symbol2, symbol) in self.substitutions:
-                        self.substitutions[(symbol, symbol2)] = self.substitutions[(symbol2, symbol)]
-                    else:
-                        self.substitutions[(symbol, symbol2)] = 1.0 + alphabet[symbol] + alphabet[symbol2]
+                w2 = w + self.alphabet[symbol2]  # fresh per pair; 'w += ...' would accumulate over every symbol2
+                p12 = (symbol, symbol2)
+                p21 = (symbol2, symbol)
+                self.swaps.setdefault((p12, p21), self.swaps.get((p21, p12), w2))  # reuse a user-given reverse swap weight
+                self.substitutions.setdefault(p12, self.substitutions.get(p21, w2))  # likewise for substitutions
 
     def next_special(self, state):
         if state == "swap":
@@ -270,7 +178,7 @@ class Transducer:
         if nextstate is None:
             nextstate = state
         ret = []
-        for symbol in list(self.alphabet.keys()):
+        for symbol in self.alphabet:
             if symbol not in (self.epsilon, self.other):
                 ret.append(maketrans(state, nextstate, symbol, symbol, 0.0))
         return ret
@@ -279,7 +187,7 @@ class Transducer:
         if nextstate is None:
             nextstate = state + 1
         ret = []
-        if options.swap:
+        if self.should_swap:
             for swap in self.swaps:
                 ret.append(maketrans(state, self.swapstate, swap[0][0], swap[0][1], self.swaps[swap]))
                 ret.append(maketrans(self.swapstate, nextstate, swap[1][0], swap[1][1], 0.0))
@@ -293,7 +201,7 @@ class Transducer:
             nextstate = state + 1
         ret = []
         for sub in self.substitutions:
-            if (nextstate + 1 >= options.distance) or options.no_elim:
+            if (nextstate + 1 >= self.distance) or not self.should_elim:
                 ret.append(maketrans(state, nextstate, sub[0], sub[1], self.substitutions[sub]))
             elif sub[1] is self.epsilon: # deletion
                 ret.append(maketrans(state, self.skipstate, sub[0], sub[1], self.substitutions[sub]))
@@ -318,40 +226,77 @@ class Transducer:
         return ret
 
     def make_transitions(self):
-        for state in range(options.distance):
+        for state in range(self.distance):  # states 0..distance-1 get identity, substitution and swap transitions
             self.transitions.append(str(state + 1) + "\t0.0") # final states
             self.transitions += self.make_identities(state)
             self.transitions += self.make_substitutions(state)
             self.transitions += self.make_swaps(state)
-        self.transitions += self.make_identities(options.distance)
-
-transducer = Transducer(alphabet)
-
-if options.inputfile != None:
-    while True:
-        line = inputfile.readline()
-        if line.startswith('##'):
-            continue
-        if line == "\n":
-            continue
-        if line == "":
-            break
-        transducer.process(str(line, "utf-8"))
-
-transducer.generate()
-transducer.make_transitions()
-for transition in transducer.transitions:
-    print(transition)
-
-stderr_u8 = codecs.getwriter('utf-8')(sys.stderr)
-
-if options.verbose:
-    stderr_u8.write("\n" + str(max(transducer.skipstate, transducer.swapstate)) + " states and " + str(len(transducer.transitions)) + " transitions written for\n"+
-                     "distance " + str(options.distance) + " and base alphabet size " + str(len(transducer.alphabet)) +"\n\n")
-    stderr_u8.write("The alphabet was:\n")
-    for symbol, weight in alphabet.items():
-        stderr_u8.write(symbol + "\t" + str(weight) + "\n")
-    if len(exclusions) != 0:
-        stderr_u8.write("The exclusions were:\n")
-        for symbol in exclusions:
-            stderr_u8.write(symbol + "\n")
+        self.transitions += self.make_identities(self.distance)  # the last state only gets identity transitions
+
+    def verbose_report(self):  # write a summary of the generated transducer to stderr
+        template = '''
+{states} states and {trans} transitions written for
+distance {dist} and base alphabet size {alpha_size}
+
+The alphabet was:
+{alpha}
+The exclusions were:
+{excl}
+'''
+        alpha = ''.join('%s\t%s\n' % (s, w) for s, w in self.alphabet.items())  # one "symbol<TAB>weight" line per entry
+        excl = '\n'.join(self.exclusions)  # one excluded symbol per line
+        report = template.format(states=max(self.skipstate, self.swapstate),
+                                 trans=len(self.transitions),
+                                 dist=self.distance,
+                                 alpha_size=len(self.alphabet),
+                                 alpha=alpha,
+                                 excl=excl)
+        sys.stderr.write(report)
+
+def main():
+    """Parse the command line, build the edit-distance transducer and print its transitions to stdout."""
+    parser = argparse.ArgumentParser(description=description_string,
+                                     epilog=epilog_string,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('-d', '--distance', type=int, metavar='DIST', default=1,
+                        help='edit depth, default is 1')
+    parser.add_argument('-e', '--epsilon', default='@0@', metavar='EPS',
+                        help='epsilon symbol, default is @0@')
+    parser.add_argument('--no-elim', action='store_true',
+                        help="don't do redundancy elimination")
+    parser.add_argument('-s', '--swap', action='store_true',
+                        help='generate swaps in addition to insertions and deletions')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='print summary to stderr')
+    alpha = parser.add_argument_group('Alphabet', 'Specify the alphabet of the transducer (these may be combined and repeated)')
+    alpha.add_argument('-i', '--input', dest='inputfile', metavar='INPUT',
+                       action='append', default=[],
+                       help='specification file in edit-distance syntax')
+    alpha.add_argument('-a', '--alphabet', dest='alphabetfile', metavar='TRANS',
+                       action='append', default=[],
+                       help='optimized-lookup format transducer to read alphabet from (ignoring multi-character symbols)')
+    alpha.add_argument('alphabet_string', nargs='*',
+                       help='the alphabet as a string on the command line')
+    options = parser.parse_args()
+    transducer = Transducer(options.distance, '@_UNKNOWN_SYMBOL_@', options.epsilon, options.swap, not options.no_elim)
+
+    if not options.inputfile and not options.alphabet_string and not options.alphabetfile:
+        parser.error('Must provide an alphabet')
+    for fl in options.inputfile:
+        transducer.read_input_file(fl)
+    for s in options.alphabet_string:
+        transducer.extend_alphabet(s)
+    for fl in options.alphabetfile:
+        transducer.read_optimized_lookup_alphabet(fl)
+    transducer.clean_alphabet()  # exclusions may have been read after the symbols they exclude
+
+    transducer.generate()
+    transducer.make_transitions()
+    for transition in transducer.transitions:
+        print(transition)
+
+    if options.verbose:
+        transducer.verbose_report()
+
+if __name__ == '__main__':
+    main()