# --- scripts/common.py (post-patch state, reconstructed from a
# whitespace-mangled git diff of commit ccc35b68, "cleaning scripts") ---
import re
import sys

# Matches everything before the first '^' token marker on a line.
re_start = re.compile('(^[^\^]*)')


def ambiguous(bt):
    """Return True when any biltrans token offers more than one TL reading.

    NOTE(review): the body of this function lies above the visible hunk in
    the diff (its name appears only in the '@@' section heading); it is
    reconstructed here from the caller's ``len(token['tls']) > 1`` test --
    confirm against the repository.
    """
    # e.g. legislation -> legislación / ordenamiento
    for token in bt:
        if len(token['tls']) > 1:
            return True
    return False


def wrap(x):
    """Wrap a token body in the stream delimiters '^' and '$'."""
    return '^' + x + '$'


def parse_tags(ptr, line):
    """Collect '<tag>' names starting at line[ptr].

    Returns (last_ptr, tags) where last_ptr is the index just before the
    terminating '/' or '$'. Raises IndexError on unterminated input.
    """
    tags = []
    tag = ''

    while True:
        c = line[ptr]

        if c == '$' or c == '/':
            return (ptr - 1, tags)
        elif c == '>':
            tags.append(tag)
            tag = ''
        elif c != '<':
            tag += c

        ptr += 1


def parse_sl(ptr, line):
    """Parse the source-language side of a token.

    Returns (ptr, (lemma, tags)); an unknown word ('*...') gets empty tags.
    A backslash escapes the following delimiter character.
    """
    out = ''
    if line[ptr] == '*':
        (ptr, out) = parse_unknown(ptr, line)
        return (ptr, (out, []))

    escaped = False
    while True:
        c = line[ptr]
        if c == '\\':
            escaped = True
        elif (c == '/' or c == '$') and not escaped:
            # NOTE(review): returns a bare string (not a (lemma, tags)
            # pair) for a tagless lemma, exactly as the original did.
            return (ptr, out)
        elif c == '<' and not escaped:
            (ptr, tags) = parse_tags(ptr + 1, line)
            return (ptr, (out, tags))
        else:
            out += c
            escaped = False
        ptr += 1


def parse_unknown(ptr, line):
    """Read an unknown word ('*lemma') up to an unescaped '/' or '$'."""
    out = ''
    escaped = False
    while True:
        c = line[ptr]
        if c == '\\':
            escaped = True
        elif (c == '$' or c == '/') and not escaped:
            return (ptr, out)
        else:
            out += c
            escaped = False
        ptr += 1


def parse_tls(ptr, line):
    """Parse the target-language alternatives of a biltrans token.

    Returns (ptr, tls); each entry is a (lemma, tags) pair -- or a bare
    string for a tagless alternative, preserving the original behaviour.
    """
    tls = []
    tl = ''
    out = ''
    escaped = False
    if line[ptr] == '*':
        (ptr, out) = parse_unknown(ptr, line)
        return (ptr, [(out, [])])

    while True:
        if ptr == len(line):
            tls.append(tl)
            return (ptr, tls)
        c = line[ptr]
        if c == '\\':
            escaped = True
        elif c == '/' and tl != '' and not escaped:
            tls.append(tl)
            tl = ''
        elif c == '$' and not escaped:
            if tl != '':
                tls.append(tl)
            return (ptr, tls)
        elif c == '<' and not escaped:
            (ptr, tags) = parse_tags(ptr, line)
            tls.append((tl, tags))
            tl = ''
        elif c != '/' or escaped:
            tl += c
            escaped = False
        ptr += 1


def toBiltransToken(sl, tls):
    """Flatten (lemma, tags) pairs back into 'lemma<t1><t2>' strings."""
    new_tls = []
    for tl in tls:
        new_tls.append(tl[0] + '<' + '><'.join(tl[1]) + '>')
    new_sl = sl[0] + '<' + '><'.join(sl[1]) + '>'

    return (new_sl, new_tls)
def parse_biltrans_token(ptr, line):
    """Parse one '^sl/tl1/tl2$' biltrans token starting just past the '^'.

    Returns (ptr, {'sl': str, 'tls': [str, ...]}) with the lemma+tags
    flattened back into 'lemma<t1><t2>' strings.
    """
    (ptr, sl) = parse_sl(ptr, line)
    (ptr, tls) = parse_tls(ptr + 1, line)
    (sl, tls) = toBiltransToken(sl, tls)

    token = {}
    token['sl'] = sl
    token['tls'] = tls

    return (ptr, token)


def parse_tagger_token(ptr, line):
    """Parse one tagger token; return the flattened 'lemma<tags>' string."""
    (ptr, sl) = parse_sl(ptr, line)
    sl = sl[0] + '<' + '><'.join(sl[1]) + '>'

    return (ptr, sl)


def tokenize_biltrans_line(line):
    """American-spelling alias of tokenise_biltrans_line."""
    return tokenise_biltrans_line(line)


def tokenise_biltrans_line(line):
    """Split a biltrans output line into token dicts.

    NOTE(review): the ptr returned by parse_biltrans_token is discarded --
    'for ptr in range(...)' rebinds ptr each iteration, so token bodies
    are re-scanned; harmless because only an unescaped '^' starts a token.
    """
    out = []
    escaped = False
    for ptr in range(0, len(line)):
        c = line[ptr]
        if c == '^' and not escaped:
            (ptr, token) = parse_biltrans_token(ptr + 1, line)
            out.append(token)
        elif c == '\\':
            escaped = True
        elif escaped:
            escaped = False

    return out


def tokenize_tagger_line(line):
    """American-spelling alias of tokenise_tagger_line."""
    return tokenise_tagger_line(line)


def tokenise_tagger_line(line):
    """Split a tagger output line into flattened 'lemma<tags>' strings."""
    out = []
    escaped = False
    for ptr in range(0, len(line)):
        c = line[ptr]
        if c == '^' and not escaped:
            (ptr, token) = parse_tagger_token(ptr + 1, line)
            out.append(token)
        elif c == '\\':
            escaped = True
        elif escaped:
            escaped = False

    return out


def tokenize_biltrans_line2(line):
    """American-spelling alias of tokenise_biltrans_line2."""
    return tokenise_biltrans_line2(line)
def tokenise_biltrans_line2(line):
    """State-machine tokeniser: return raw '^...$' token bodies as strings."""
    line = clean_biltrans_line(line)[1:-1]
    row = []
    token = ''
    state = 0           # 0 = inside a token, 1 = between tokens

    escaped = False

    for c in line:
        if state == 0:
            # in token
            if c == '$':
                row.append(token)
                token = ''
                state = 1
            elif c == '\\':
                continue
            else:
                token += c
        elif state == 1:
            # between tokens
            if c == '\\':
                escaped = True
            elif c == '^' and not escaped:
                state = 0
                escaped = False
            elif escaped:
                escaped = False

    return row


def clean_biltrans_line(line):
    """Strip everything before the first '^' token marker."""
    line = re_start.sub('', line)
    return line


# --- scripts/extract-biltrans-candidates.py (top of file, reconstructed) ---
# coding=utf-8
# -*- encoding: utf-8 -*-

import sys
import common

# NOTE(review): the usage string originally contained the two positional
# argument placeholders in angle brackets; they were eaten by markup
# stripping in the diff (only "extact-sentences.py [-m|--match-pos]"
# survives). Reconstructed below -- confirm wording against the repository.
if len(sys.argv) < 3 or len(sys.argv) > 4:
    print('extact-sentences.py <phrase-table> <biltrans-out> [-m|--match-pos]')
    exit(1)

match_pos = False

if len(sys.argv) == 4 and sys.argv[3] not in ['-m', '--match-pos']:
    print('extact-sentences.py <phrase-table> <biltrans-out> [-m|--match-pos]')
    exit(1)
elif len(sys.argv) == 4 and sys.argv[3] in ['-m', '--match-pos']:
    match_pos = True
# Input files: Moses-style phrase table and Apertium lexical-transfer output.
phrase_table = open(sys.argv[1])
biltrans_out = open(sys.argv[2])


def bttoken_tostr(token):
    """Render a token dict back to '^sl/tl1/tl2$' stream form."""
    return '^' + token['sl'] + '/' + '/'.join(token['tls']) + '$'


def generate_tags(token):
    """Turn 'lemma<t1><t2>' into '<s n="t1"/><s n="t2"/>' .dix tag elements.

    NOTE(review): the list comprehension was partially destroyed by markup
    stripping in the diff (surviving fragment: '["\') + "\\"/>" for x in
    tags]'); reconstructed to the conventional Apertium form -- confirm
    against the repository.
    """
    tags = filter(lambda x: x != "*>", token.split('<')[1:])
    tags = ["<s n=\"" + x.strip('>') + "\"/>" for x in tags]
    return ''.join(tags)

def generate_entry(slw, tlw):
    """Print a bilingual-dictionary entry mapping SL token slw to TL token tlw.

    NOTE(review): the original template literal was destroyed by markup
    stripping in the diff (only the '%s%s%s%s' placeholders survive);
    reconstructed here as a standard Apertium .dix entry -- confirm the
    exact layout against the repository.
    """
    out = '<e><p><l>%s%s</l><r>%s%s</r></p></e>'
    llemma = slw.split('<')[0]
    ltags = generate_tags(slw)

    rlemma = tlw.split('<')[0]
    rtags = generate_tags(tlw)

    print(out % (llemma, ltags, rlemma, rtags))


def pos_equal(s, t):
    """Return True when the POS (first tag) of SL token s matches TL token t.

    BUG FIX: the original computed *both* sides from ``s`` (so the
    comparison was always True); it also read ``split('>')[1][1:]``, i.e.
    the *second* tag -- '' for one-tag tokens. Both corrected to compare
    the first tag of each token.
    """
    spos = s.split('<')[1].rstrip('>')
    tpos = t.split('<')[1].rstrip('>')

    return spos == tpos
# Main loop: read the phrase table and the biltrans output in lockstep and
# emit candidate .dix entries for ambiguous SL words whose aligned TL word
# is missing from the lexical-transfer alternatives.
reading = True
lineno = 0
total_valid = 0

while reading:
    try:
        lineno = lineno + 1
        pt_line = phrase_table.readline().strip()
        bt_line = biltrans_out.readline().strip()

        # Both files exhausted -> stop. BUG FIX: the original fell through
        # here and relied on row[1] raising IndexError into the blanket
        # except to end the iteration.
        if bt_line == '' and pt_line == '':
            reading = False
            continue

        # Phrase-table fields: target ||| source ||| alignments ...
        row = pt_line.split('|||')
        sl = common.tokenise_tagger_line(row[1])
        tl = common.tokenise_tagger_line(row[0])
        alignments = row[2].strip()
        bt = common.tokenise_biltrans_line(bt_line)

        if not common.ambiguous(bt):
            continue
        if len(sl) < 2 and len(tl) < 2:
            continue

        # Collect, per SL word: its aligned TL word ('tls') and its
        # lexical-transfer alternatives ('bts').
        # e.g. words[0] = ('sl', ['bt1', 'bt2', ...], 'tl')
        translations = {}
        for pair in alignments.split(' '):
            ament = pair.split('-')
            tl_idx = int(ament[0])
            sl_idx = int(ament[1])
            # BUG FIX: the bound check was '> len(tl)' (off by one) and the
            # SL index was unchecked; a bad index previously aborted the
            # whole line via the blanket except instead of skipping the
            # single alignment pair.
            if tl_idx >= len(tl) or sl_idx >= len(sl) or sl_idx >= len(bt):
                continue
            slw = sl[sl_idx]
            if slw not in translations:
                translations[slw] = {}
            translations[slw]['tls'] = tl[tl_idx]
            translations[slw]['bts'] = bt[sl_idx]

        for tran in translations:
            r = translations[tran]
            tlw = r['tls']
            # Only consider words that are ambiguous in lexical transfer.
            if len(r['bts']['tls']) > 1:
                # With --match-pos, skip pairs whose POS tags differ.
                if match_pos and not pos_equal(tran, tlw):
                    continue

                # The aligned TL word is absent from the lexical-transfer
                # alternatives: propose a dictionary entry for it.
                if tlw not in r['bts']['tls']:
                    print(tlw, "not found for", tran, file=sys.stderr)
                    generate_entry(tran, tlw)

    except Exception:
        # Best-effort: skip malformed lines. Narrowed from a bare
        # 'except:' so KeyboardInterrupt/SystemExit still propagate.
        pass