commit fd8844c137d4b1e388812767c1bd31f00ccd068a
Author: Daniel Swanson <popcorn.tomato.dude@gmail.com>
Date:   Fri Jul 9 11:03:31 2021 -0500

    check for errors, clean up

diff --git a/apertium-regtest.py b/apertium-regtest.py
index a9fc44b..b19da60 100755
--- a/apertium-regtest.py
+++ b/apertium-regtest.py
@@ -15,6 +15,7 @@ import shlex
 import socketserver
 import subprocess
 import sys
+import time
 import urllib.parse
 import xml.etree.ElementTree
 import zlib
@@ -23,16 +24,26 @@ def hash_line(s):
     return base64.b64encode(hashlib.sha256(s.encode('utf-8')).digest(), b'-_')[:12].decode('utf-8')
 
 def load_input(fname):
-    with open(fname, 'r') as fin:
-        lines = fin.read().splitlines()
-        ret = {}
-        for i, l_ in enumerate(lines):
-            # TODO: more careful escape handling
-            l = l_.split('#')[0].replace('\\n', '\n').strip()
-            if not l:
-                continue
-            ret[hash_line(l)] = [i, l]
-        return ret
+    '''Map hash -> [line index, text] per non-empty line; exit if file missing.'''
+    try:
+        with open(fname, 'r') as fin:
+            raw_lines = fin.read().splitlines()
+    except FileNotFoundError:
+        print('ERROR: Input file %s does not exist!' % fname)
+        sys.exit(1)
+    entries = {}
+    for idx, raw in enumerate(raw_lines):
+        # TODO: more careful escape handling
+        text = raw.split('#')[0].replace('\\n', '\n').strip()
+        if text:
+            entries[hash_line(text)] = [idx, text]
+    return entries
+
+def load_input_string(fname):
+    '''Return fname's inputs joined as NUL-terminated [hash#line] blocks.'''
+    block = '[%s#%s] %s\n[/%s]\n\0'
+    return ''.join(block % (hsh, line, content, hsh)
+                   for hsh, (line, content) in load_input(fname).items())
 
 # [hash#line] content [/hash]
 txt_out_format = re.compile(r'\[([A-Za-z0-9_-]+)#(\d+)\](.*)\[/\1\]', re.DOTALL)
@@ -54,7 +65,7 @@ def load_output(fname):
     except FileNotFoundError:
         return {}
 
-def save_output(fname, data, sep='\n'):
+def save_output(fname, data):
     with open(fname, 'w') as fout:
         for inhash in sorted(data.keys()):
             fout.write('[%s#0] %s\n[/%s]\n' % (inhash, data[inhash][1], inhash))
@@ -71,13 +82,13 @@ def load_gold(fname):
                         opts.append(o2)
                 if not opts:
                     print('ERROR: Empty entry %s in %s' % (ident, fname))
-                    sys.exit(1)
+                    continue
                 ret[hsh] = opts
             return ret
     except FileNotFoundError:
         return {}
 
-def save_gold(fname, data, sep='\n'):
+def save_gold(fname, data):
     with open(fname, 'w') as fout:
         for inhash in sorted(data.keys()):
             fout.write('[%s]\n' % inhash)
@@ -85,6 +96,27 @@ def save_gold(fname, data, sep='\n'):
                 fout.write('%s [/option]\n' % ln)
             fout.write('[/%s]\n' % inhash)
 
+def run_command(cmd, intxt, outfile, shell=False):
+    '''Feed intxt to cmd on stdin and write its stdout to outfile.
+
+    On a non-zero exit status, append stderr to test/error.log and exit.'''
+    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE, shell=shell)
+    stdout, stderr = proc.communicate(intxt.encode('utf-8'))
+    if proc.returncode != 0:
+        shown = cmd if isinstance(cmd, str) else ' '.join(cmd)
+        print('Failed command: %s' % shown)
+        print('Writing stderr to test/error.log')
+        meta = ('Command: %s\n' % shown, 'Output file: %s\n' % outfile,
+                'Time: %s\n' % time.asctime())
+        with open('test/error.log', 'ab') as fout:
+            fout.write(''.join(meta).encode('utf-8'))
+            fout.write(b'Stderr:\n\n' + stderr + b'\n\n')
+        print('Exiting')
+        sys.exit(1)
+    with open(outfile, 'wb') as fout:
+        fout.write(stdout)
+
 class Step:
     prognames = {
         'cg-proc': 'disam',
@@ -128,40 +160,34 @@ class Step:
         cmd = [self.prog] + self.args
         if self.prog in Step.prognames or self.prog in ['lt-proc', 'hfst-proc']:
             cmd.append('-z')
-        with open(in_name, 'r') as fin:
-            if first:
-                data = load_input(in_name)
-                txt = ''
-                for hsh, (line, content) in data.items():
-                    txt += '[%s#%s] %s\n[/%s]\n\0' % (hsh, line, content, hsh)
-            else:
+        txt = ''
+        if first:
+            txt = load_input_string(in_name)
+        else:
+            with open(in_name, 'r') as fin:
                 txt = fin.read()
-            proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
-                                    stdout=subprocess.PIPE)
-            stdout, stderr = proc.communicate(txt.encode('utf-8'))
-            with open(out_name, 'wb') as fout:
-                fout.write(stdout)
+        run_command(cmd, txt, out_name)
 
 class Mode:
     all_modes = {}
     def __init__(self, xml):
         self.name = xml.attrib['name']
         self.steps = [Step(s) for s in xml[0]]
+        self.commands = {}
         nm = defaultdict(lambda: 0)
-        for s in self.steps:
+        for i, s in enumerate(self.steps):
             nm[s.name] += 1
             if nm[s.name] > 1:
                 s.name += str(nm[s.name])
+            self.commands[s.name] = i
         Mode.all_modes[self.name] = self
-    def run(self, corpusname, filename):
+    def run(self, corpusname, filename, start=None):
         fin = filename
-        for i, step in enumerate(self.steps):
+        idx = self.commands.get(start, 0)
+        for i, step in enumerate(self.steps[idx:]):
             fout = 'test/%s-%s-output.txt' % (corpusname, step.name)
             step.run(fin, fout, first=(i == 0))
             fin = fout
-        with open(fin, 'r') as f1:
-            with open('test/%s-all-output.txt' % corpusname, 'w') as f2:
-                f2.write(f1.read())
     def get_commands(self):
         return [s.name for s in self.steps]
 
@@ -177,7 +203,12 @@ def load_modes():
         print('Parser message: %s' % e.msg)
         sys.exit(1)
     for m in root:
-        Mode(m)
+        try:
+            Mode(m)
+        except Exception:  # not bare: let SystemExit/KeyboardInterrupt through
+            print('Unable to parse modes.xml.')
+            print('Run `apertium-validate-modes` for more information.')
+            sys.exit(1)
 
 def get_url(remote):
     proc = subprocess.run(['git', 'remote', 'get-url', remote],
@@ -232,10 +263,20 @@ def check_git():
 class Corpus:
     all_corpora = {}
     def __init__(self, name, blob):
-        # TODO: more error checking, start-step, command
         self.name = name
-        self.mode = blob['mode']
+        self.mode = blob.get('mode', None)
+        self.shell = blob.get('command', None)
+        if not self.mode and not self.shell:
+            print('Corpus %s must specify either "mode": or "command":' % self.name)
+            sys.exit(1)
+        if self.mode and self.mode not in Mode.all_modes:
+            print('Unknown mode %s in corpus %s' % (self.mode, self.name))
+            sys.exit(1)
+        if 'input' not in blob:
+            print('Corpus %s must specify an input file' % self.name)
+            sys.exit(1)
         self.infile = 'test/' + blob['input']
+        self.start_step = blob.get('start-step', None)
         self.data = {}
         self.loaded = False
         self.unsaved = set()
@@ -243,7 +284,12 @@ class Corpus:
         self.hashes = []
         Corpus.all_corpora[name] = self
     def run(self):
-        Mode.all_modes[self.mode].run(self.name, self.infile)
+        if self.mode:
+            Mode.all_modes[self.mode].run(self.name, self.infile,
+                                          start=self.start_step)
+        else:
+            txt = load_input_string(self.infile)
+            run_command(self.shell, txt, self.out_name('all'), shell=True)
         self.loaded = False
     def exp_name(self, cmd):
         return 'test/%s-%s-expected.txt' % (self.name, cmd)
@@ -263,7 +309,9 @@ class Corpus:
         self.hashes = list(ins.keys())
         self.hashes.sort(key = lambda x: ins[x][0])
         outs = []
-        cmds = Mode.all_modes[self.mode].get_commands()
+        cmds = ['all']
+        if self.mode:
+            cmds = Mode.all_modes[self.mode].get_commands()
         self.data = {
             'inputs': ins,
             'cmds': [],
@@ -370,6 +418,8 @@ class Corpus:
         changes = self.accept_add_del(False)
         for blob in self.data['cmds']:
             for h in (hashes or blob['expect'].keys()):
+                if h not in blob['expect']:
+                    continue
                 if blob['expect'][h][1] != blob['output'][h][1]:
                     blob['expect'][h][1] = blob['output'][h][1]
                     changes.append(h)
@@ -383,10 +433,10 @@ class Corpus:
         blob['gold'][hsh] = vals
         save_gold(self.gold_name(blob['cmd']), blob['gold'])
 
-def load_corpora():
+def load_corpora(static=False):
     if not os.path.isdir('test') or not os.path.isfile('test/tests.json'):
         if os.path.isdir('.git'):
-            if check_git():
+            if not static and check_git():
                 load_corpora()
                 return
         print('Test corpora not found. Please create test/tests.json')
@@ -410,6 +460,7 @@ def test_run(corpora):
     return True, ''
 
 def cb_load(page):
+    # TODO: this actually returns the whole corpus as a single page
     changes = {
         'changed_final': [],
         'changed_any': [],
@@ -454,20 +505,20 @@ class CallbackRequestHandler(http.server.SimpleHTTPRequestHandler):
         status = HTTPStatus.OK
         resp = {}
 
+        # TODO: error checking
+        # TODO: if a catastrophic error occurs, the program exits
+        # thus returning no information to the browser
+        # maybe switch to exceptions elsewhere?
         if params['a'][0] == 'init':
             resp['folder'] = os.path.basename(os.getcwd())
             resp['corpora'] = list(sorted(Corpus.all_corpora.keys()))
         elif params['a'][0] == 'load':
-            #try:
             resp = cb_load(params['p'][0])
-            #except:
-            #    resp['error'] = 'Current state is missing or invalid. You will need to run the regression test for all corpora.'
         elif params['a'][0] == 'run':
             good, output = test_run(params.get('c', ['*']))
             resp['good'] = good
             resp['output'] = output
         elif params['a'][0] == 'accept-nd':
-            # TODO: error checking
             resp['c'] = params['c'][0]
             resp['hs'] = Corpus.all_corpora[resp['c']].accept_add_del()
         elif params['a'][0] == 'accept':
@@ -496,10 +547,10 @@ class CallbackRequestHandler(http.server.SimpleHTTPRequestHandler):
         self.end_headers()
         self.wfile.write(rstr)
 
-def start_server():
+def start_server(port=3000):  # default keeps the old zero-argument call working
     d = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'static/')
     handle = partial(CallbackRequestHandler, directory=d)
-    with socketserver.TCPServer(('', 3000), handle) as httpd:
+    with socketserver.TCPServer(('', port), handle) as httpd:
    	    httpd.serve_forever()
 
 class RegtestShell(cmd.Cmd):
@@ -707,7 +758,6 @@ Abbreviated form: `q`'''
 
 if __name__ == '__main__':
     load_modes()
-    load_corpora()
     import argparse
     parser = argparse.ArgumentParser(
         prog='apertium-regtest',
@@ -724,8 +774,11 @@ apertium-regtest has 3 modes available:
     parser.add_argument('mode', choices=['test', 'web', 'cli'])
     parser.add_argument('--no-autosave', action='store_false', dest='autosave',
                         help="in cli mode, don't automatically save pending changes upon exiting")
+    parser.add_argument('-p', '--port', type=int, default=3000,
+                        help="in web mode, run the server on this port (default 3000)")
     args = parser.parse_args()
     if args.mode == 'test':
+        load_corpora(static=True)
         n = len(Corpus.all_corpora.items())
         changed = False
         for i, (name, corp) in enumerate(Corpus.all_corpora.items(), 1):
@@ -757,8 +810,10 @@ apertium-regtest has 3 modes available:
             print('There were changes! Rerun in interactive mode to update tests.')
             sys.exit(1)
     elif args.mode == 'web':
-        start_server()
+        load_corpora(static=False)
+        start_server(args.port)
     elif args.mode == 'cli':
+        load_corpora(static=False)
         RegtestShell(args.autosave).cmdloop()
     else:
         print("Unknown operation mode. Expected 'test', 'web', or 'cli'.")