commit 4e77669be261084363605c9779901bad44748f8c Author: Daniel Swanson Date: Mon Jul 12 22:08:35 2021 -0500 ongoing script cleanup diff --git a/scripts/biltrans-extract-freq.py b/scripts/biltrans-extract-freq.py index 49fd928..b5e9d9a 100644 --- a/scripts/biltrans-extract-freq.py +++ b/scripts/biltrans-extract-freq.py @@ -18,13 +18,9 @@ class Counter(BCC.BiltransCounter): tokenizer = 'biltrans' line_ids = False - def processs_row(self, frac_count=0): + def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0): global sl_tl - for i in range(len(self.am_row)): - if self.am_row[i].count('/') > 1: - sl = BCC.strip_tags(self.am_row[i], 'sl') - tl = BCC.strip_tags(self.dm_row[i], 'tl') - sl_tl[sl][tl] += 1 + sl_tl[sl][tl] += 1 c = Counter() c.read_files(sys.argv[1], # File with ambiguous biltrans output diff --git a/scripts/biltrans-line-only-pos-ambig.py b/scripts/biltrans-line-only-pos-ambig.py index cef4f50..d0f37d5 100755 --- a/scripts/biltrans-line-only-pos-ambig.py +++ b/scripts/biltrans-line-only-pos-ambig.py @@ -2,61 +2,41 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys; - -pos = ["", "", ""]; - -def process_line(l): #{ - global pos; - w = ''; - in_word = False; - escaped = False; - for c in l: #{ - if c == '\\': #{ - escaped = True; - continue; - #} - if c == '^' and escaped == False: #{ - in_word = True; - #} - if c == '$' and escaped == False: #{ - word_in_pos = False; - for p in pos: #{ - if w.count(p) > 0: #{ - word_in_pos = True; - #} - #} - - if w.count('/') > 1 and word_in_pos == True: #{ - return True; - #} - - w = ''; - in_word = False; - #} - if in_word == True: #{ - w = w + c; - #} - escaped = False; - #} - return False; -#} - -output = False; - -infile = sys.stdin ; - -if len(sys.argv) > 1: #{ - infile = open(sys.argv[1]); -#} - -for line in infile.readlines(): #{ - - output = process_line(line); - - if output == True: #{ - print(line.strip()); - #} +import sys + +pos = ["", "", ""] + +def process_line(l): + global pos + w = '' + in_word = False + escaped = False + for c in l: + if c == '\\': + escaped = True + continue + if c == '^' and escaped == False: + in_word = True + if c == '$' and escaped == False: + if w.count('/') > 1 and any(p in w for p in pos): + return True + w = '' + in_word = False + if in_word == True: + w += c + escaped = False + return False + +output = False + +infile = sys.stdin + +if len(sys.argv) > 1: + infile = open(sys.argv[1]) + +for line in infile.readlines(): + output = process_line(line) + if output == True: + print(line.strip()) # else: -# print(line.strip(), file=sys.stderr); -#} +# print(line.strip(), file=sys.stderr) diff --git a/scripts/biltrans-no-retained.py b/scripts/biltrans-no-retained.py index 7460a71..e8eea5b 100644 --- a/scripts/biltrans-no-retained.py +++ b/scripts/biltrans-no-retained.py @@ -2,58 +2,53 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy; +import sys -lines = [] ; -for line in open(sys.argv[2]).readlines(): #{ - lines.append(int(line.strip())); -#} -print(sys.argv, len(lines), file=sys.stderr); +lines = [] +for line in open(sys.argv[2]).readlines(): + lines.append(int(line.strip())) -lineno = 1; +print(sys.argv, len(lines), file=sys.stderr) -inf = open(sys.argv[1]); -buf = inf.readline(); -while buf != '': #{ - if lineno not in lines: #{ - print(buf.strip()); - elif lineno in lines: #{ - print('Line ' + str(lineno) + ' discarded.', file=sys.stderr); - else: #{ - print('Something weird happened.', file=sys.stderr); - #} +lineno = 1 - lineno = lineno + 1; - buf = inf.readline(); -#} +inf = open(sys.argv[1]) +buf = inf.readline() +while buf != '': + if lineno in lines: + print('Line ' + str(lineno) + ' discarded.', file=sys.stderr) + else: + print(buf.strip()) + lineno += 1 + buf = inf.readline() # -#c = inf.read(1); -#buf = ''; -#while c: #{ -# if c == '\n': #{ -# if lineno in lines: #{ -# print(buf.strip()); -# elif lineno not in lines: #{ -# print('Line ' + str(lineno) + ' discarded.', file=sys.stderr); -# else: #{ -# print('Something weird happened.', file=sys.stderr); -# #} -# lineno = lineno + 1; -# buf = ''; -# #} +#c = inf.read(1) +#buf = '' +#while c: +# if c == '\n': +# if lineno in lines: +# print(buf.strip()) +# elif lineno not in lines: +# print('Line ' + str(lineno) + ' discarded.', file=sys.stderr) +# else: +# print('Something weird happened.', file=sys.stderr) +# +# lineno = lineno + 1 +# buf = '' +# +# +# buf = buf + c +# c = inf.read(1) # -# buf = buf + c; -# c = inf.read(1); -##} ## -#for line in open(sys.argv[1]).readlines(): #{ -# if lineno in lines: #{ -# print(line.strip()); -# elif lineno not in lines: #{ -# print('Line ' + str(lineno) + ' discarded.', file=sys.stderr); -# else: #{ -# print('Something weird happened.', file=sys.stderr); -# #} -# lineno = lineno + 1; -##} +#for line in open(sys.argv[1]).readlines(): +# if lineno in lines: +# print(line.strip()) +# elif lineno not in lines: +# print('Line ' + str(lineno) + ' discarded.', file=sys.stderr) +# else: +# print('Something weird happened.', file=sys.stderr) +# +# lineno = lineno + 1 +# diff --git a/scripts/biltrans-only-retained.py b/scripts/biltrans-only-retained.py index 7508da6..f1b41f1 100644 --- a/scripts/biltrans-only-retained.py +++ b/scripts/biltrans-only-retained.py @@ -2,58 +2,53 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy; +import sys -lines = [] ; -for line in open(sys.argv[2]).readlines(): #{ - lines.append(int(line.strip())); -#} -print(sys.argv, len(lines), file=sys.stderr); +lines = [] +for line in open(sys.argv[2]).readlines(): + lines.append(int(line.strip())) -lineno = 1; +print(sys.argv, len(lines), file=sys.stderr) -inf = open(sys.argv[1]); -buf = inf.readline(); -while buf != '': #{ - if lineno in lines: #{ - print(buf.strip()); - elif lineno not in lines: #{ - print('Line ' + str(lineno) + ' discarded.', file=sys.stderr); - else: #{ - print('Something weird happened.', file=sys.stderr); - #} +lineno = 1 - lineno = lineno + 1; - buf = inf.readline(); -#} +inf = open(sys.argv[1]) +buf = inf.readline() +while buf != '': + if lineno in lines: + print(buf.strip()) + else: + print('Line ' + str(lineno) + ' discarded.', file=sys.stderr) + lineno += 1 + buf = inf.readline() # -#c = inf.read(1); -#buf = ''; -#while c: #{ -# if c == '\n': #{ -# if lineno in lines: #{ -# print(buf.strip()); -# elif lineno not in lines: #{ -# print('Line ' + str(lineno) + ' discarded.', file=sys.stderr); -# else: #{ -# print('Something weird happened.', file=sys.stderr); -# #} -# lineno = lineno + 1; -# buf = ''; -# #} +#c = inf.read(1) +#buf = '' +#while c: +# if c == '\n': +# if lineno in lines: +# print(buf.strip()) +# elif lineno not in lines: +# print('Line ' + str(lineno) + ' discarded.', file=sys.stderr) +# else: +# print('Something weird happened.', file=sys.stderr) +# +# lineno = lineno + 1 +# buf = '' +# +# +# buf = buf + c +# c = inf.read(1) # -# buf = buf + c; -# c = inf.read(1); -##} ## -#for line in open(sys.argv[1]).readlines(): #{ -# if lineno in lines: #{ -# print(line.strip()); -# elif lineno not in lines: #{ -# print('Line ' + str(lineno) + ' discarded.', file=sys.stderr); -# else: #{ -# print('Something weird happened.', file=sys.stderr); -# #} -# lineno = lineno + 1; -##} +#for line in open(sys.argv[1]).readlines(): +# if lineno in lines: +# print(line.strip()) +# elif lineno not in lines: +# print('Line ' + str(lineno) + ' discarded.', file=sys.stderr) +# else: +# print('Something weird happened.', file=sys.stderr) +# +# lineno = lineno + 1 +# diff --git a/scripts/biltrans-to-multitrans-line-recursive.py b/scripts/biltrans-to-multitrans-line-recursive.py index f613529..a15a7fb 100755 --- a/scripts/biltrans-to-multitrans-line-recursive.py +++ b/scripts/biltrans-to-multitrans-line-recursive.py @@ -2,98 +2,53 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy, commands; +import sys from operator import mul -sys.stdin = codecs.getreader('utf-8')(sys.stdin); -sys.stdout = codecs.getwriter('utf-8')(sys.stdout); -sys.stderr = codecs.getwriter('utf-8')(sys.stderr); - t = 0 lineno = 0 -def process_biltrans_unit(lu): #{ - - state = 0; - sl = ''; - tl = []; - for c in lu[1:-1]: #{ - #^worth/valor$ ^\$/\$$^20/20$^*m/*m$ - #print c , sl , tl; - if c == '/': #{ - state = state + 1; - tl.append(sl) - #} - if state == 0: #{ - sl = sl + c; - #} - if state >= 1: #{ - tl[state-1] = tl[state-1] + c; - #} - #} - return (sl, tl); -#} - def parse_input(line): sentence = [] - escaped = False; - reading_word = False; + escaped = False + reading_word = False lu = '' - for c in line: #{ - if c == '\\': #{ - escaped = True; - lu = lu + c; - c = sys.stdin.read(1); - #} - if c == '^': #{ - reading_word = True; - #} - if c == '$' and escaped == False: #{ - lu = lu + c; - reading_word = False; - (sl, tl) = process_biltrans_unit(lu) - sentence.append(tl) - lu = ''; - #} - if c != '\\' and escaped == True: #{ - escaped = False; - #} - if c.isspace(): #{ - if reading_word == False: #{ - continue; - #} - #} - if reading_word: #{ - lu = lu + c; - #} - #}sys.stdout.writesys.stdout.write + for c in line: + if escaped: + if reading_word: + lu += c + elif c == '\\': + if reading_word: + lu += c + escaped = True + elif c == '^': + reading_word = True + elif c == '$': + sentence.append(lu.split('/')[1:]) + reading_word = False + lu = '' + elif reading_word: + lu += c + return sentence -#} -def process(sentence, start, out): #{ + +def process(sentence, start, out): global t global lineno - if start >= len(sentence): #{ - sys.stdout.write ('.[][' + str(lineno) + " " + str(t) + '].[]\t') - for s in out: - sys.stdout.write("^" + s + "$ "); - print '' + if start >= len(sentence): + sen = ' '.join('^'+s+'$' for s in out) + print('.[][%d %d].[]\t%s' % (lineno, t, sen)) t += 1 - return; - #} - tokens = sentence[start] - for token in tokens: #{ - out.append(token) - process(sentence, start + 1, out); - del out[-1] - #} -#} + return + + for token in sentence[start]: + process(sentence, start + 1, out + [token]) -while True: #{ - lineno - string = sys.stdin.readline().rstrip(); +while True: + string = sys.stdin.readline().rstrip() if string == "": - break; + break tokens = parse_input(string) # print map(len, tokens) @@ -102,4 +57,3 @@ while True: #{ process(tokens, 0, []) lineno += 1 t = 0 -#} diff --git a/scripts/biltrans-to-multitrans.py b/scripts/biltrans-to-multitrans.py index e92113f..ecb24be 100644 --- a/scripts/biltrans-to-multitrans.py +++ b/scripts/biltrans-to-multitrans.py @@ -2,115 +2,65 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy, commands; +import sys -sys.stdin = codecs.getreader('utf-8')(sys.stdin); -sys.stdout = codecs.getwriter('utf-8')(sys.stdout); -sys.stderr = codecs.getwriter('utf-8')(sys.stderr); +def process_biltrans_unit(lu, sents): + new_paths = {} + state = 0 + tl = {} + ls = lu[1:-1].split('/') + sl = ls[0] + for i in range(1, len(ls)): + tl[i] = '/' + ls[i] -output_sentences = {}; -output_sentences[''] = ''; -reading_word = False; -lineno = 1; -lu = ''; -c = sys.stdin.read(1); + if len(tl) == 0: + print('ERROR:', lu, file=sys.stderr) + elif len(tl) > 1: + for tid, trad in tl.items(): + for path, sent in sents.items(): + new_paths[path + trad] = sent + '^' + sl + trad + '$' + else: + for path in sents: + new_paths[path] = sents[path] + '^' + sl + tl[1] + '$' + return new_paths -def process_biltrans_unit(lu, sents): #{ - new_paths = {}; +def process_line(line): + escaped = False + in_word = False + cur_id = line.split()[0] + idx = len(cur_id) + 1 + lu = '' + output_sentences = {'':''} + while idx < len(line): + c = line[idx] + if c == '\\': + if in_word: + lu += c + idx += 1 + lu += line[idx] + else: + idx += 1 + elif c == '^': + in_word = True + elif c == '$': + in_word = False + new_paths = process_biltrans_line(lu, output_sentences) + output_sentences = new_paths + lu = '' + elif in_word: + lu += c + elif c.isspace() and c != '\n': + for s in output_sentences: + output_sentences[s] += c + idx += 1 + return cur_id, output_sentences - state = 0; - sl = ''; - tl = {}; - for c in lu[1:-1]: #{ - #^worth/valor$ ^\$/\$$^20/20$^*m/*m$ - #print c , sl , tl; - if c == '/': #{ - state = state + 1; - if state not in tl: #{ - tl[state] = ''; - #} - #} - if state == 0: #{ - sl = sl + c; - #} - if state >= 1: #{ - tl[state] = tl[state] + c; - #} - #} - - if len(tl) > 1: #{ - for trad in tl: #{ - for path in sents: #{ - new_paths[path + tl[trad]] = sents[path] + '^' + sl + tl[trad] + '$'; - #} - #} - else: #{ - for path in sents: #{ - if state not in tl: #{ - print >> sys.stderr, 'ERROR: '; - print >> sys.stderr, sl ; - print >> sys.stderr, tl ; - #} - new_paths[path] = sents[path] + '^' + sl + tl[state] + '$'; - #} - #} - - - return new_paths; -#} - -escaped = False; -seen_newline = True; -cur_id = ''; -while c: #{ - if c == '\\': #{ - escaped = True; - lu = lu + c; - c = sys.stdin.read(1); - #} - if c == '^': #{ - reading_word = True; - #} - if c == '$' and escaped == False: #{ - lu = lu + c; - new_paths = process_biltrans_unit(lu, output_sentences); - del output_sentences; - output_sentences = new_paths; - reading_word = False; - lu = ''; - #} - if c != '\\' and escaped == True: #{ - escaped = False; - #} - if c.isspace(): #{ - seen_newline = False; - if c == '\n': #{ - print >> sys.stderr, 'output_sentences: ', len(output_sentences); - i = 0; - for sentence in output_sentences: #{ - #print '.[][' + str(lineno) + ' ' + str(i) + ' ' + cur_id +'].[]\t' , output_sentences[sentence]; - print '.[][' + cur_id + ' ' + str(i) + '].[]\t' , output_sentences[sentence]; - i = i + 1; - #} - lineno = lineno + 1; - - output_sentences = {}; - output_sentences[''] = ''; - seen_newline = True; - cur_id = ''; - - elif reading_word == False: #{ - for sentence in output_sentences: #{ - output_sentences[sentence] = output_sentences[sentence] + c; - #} - #} - #} - if reading_word: #{ - lu = lu + c; - #} - if seen_newline and c != '\n': #{ - cur_id = cur_id + c; - #} - c = sys.stdin.read(1); -#} +while True: + ln = sys.stdin.readline() + if not ln: + break + cur_id, sentences = process_line(ln) + print('output_sentences:', len(sentences), file=sys.stderr) + for i, sent in enumerate(sentences.values()): + print('.[][%s %s].[]\t%s' % (cur_id, i, sent)) diff --git a/scripts/biltrans-trim-uncovered.py b/scripts/biltrans-trim-uncovered.py index 0c3fa23..f6fcb9f 100644 --- a/scripts/biltrans-trim-uncovered.py +++ b/scripts/biltrans-trim-uncovered.py @@ -2,28 +2,23 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys; +import sys -pos = ["", "", ""]; +pos = ["", "", ""] -output = False; +output = False -infile = sys.stdin ; +infile = sys.stdin -if len(sys.argv) > 1: #{ - infile = open(sys.argv[1]); -#} +if len(sys.argv) > 1: + infile = open(sys.argv[1]) -lineno = 0; -for line in infile.readlines(): #{ - lineno = lineno + 1; - num_lu = float(line.count('$')); - num_unk = float(line.count('*')) / 2.0; - cov = 100.0 - ((num_unk / num_lu) * 100.0); +lineno = 0 +for line in infile.readlines(): + lineno = lineno + 1 + num_lu = float(line.count('$')) + num_unk = float(line.count('*')) / 2.0 + cov = 100.0 - ((num_unk / num_lu) * 100.0) - if cov >= 90.0: #{ - print(line.strip()); - #} - - -#} + if cov >= 90.0: + print(line.strip())