commit 1afdffd74fd3853e7eabedbd071e7d7344bb07c4 Author: vivekvardhanadepu Date: Thu Jun 17 15:25:56 2021 +0530 Scripts fixup: enclosing the code in functions diff --git a/scripts/extract-freq-lexicon.py b/scripts/extract-freq-lexicon.py index 379539a..3534ac6 100755 --- a/scripts/extract-freq-lexicon.py +++ b/scripts/extract-freq-lexicon.py @@ -2,7 +2,8 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, common +import sys +import common # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations @@ -13,128 +14,135 @@ import sys, common # ngrams[ngram][tl_word] = freq # 5 Please rise , then , for this minute 's silence . -#5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. -#5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . -#5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 -#------------------------------------------------------------------------------- - -def wrap (x): - return '^' + x + '$' - -MAX_NGRAMS = 3 - -cur_line = 0 -lineno = 0 -sl_tl = {} -ngrams = {} - -cur_sl_row = [] -cur_tl_row = [] -cur_bt_row = [] -cur_al_row = [] - -if len(sys.argv) < 2: #{ - print('extract-freq-lexicon.py ') - sys.exit(-1) -#} - -#for line in open(sys.argv[1]).readlines(): #{ -with open(sys.argv[1]) as infile: - for line in infile: #{ - line = line.strip() - lineno += 1 - if lineno % 5000 == 0: #{ - sys.stderr.write('.') - if lineno % 100000 == 0: #{ - sys.stderr.write(str(lineno)+'\n') - #} - sys.stderr.flush() - #} - try: - if line[0] == '-': #{ - # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations - # - # sl_tl[sl_word][tl_word] = tl_freq - i = 0 - for slword in cur_sl_row: #{ - if len(cur_bt_row[i]['tls']) > 1: #{ - for al in cur_al_row: #{ - if al == '': - continue - al_sl = int(al.split('-')[1]) - al_tl = int(al.split('-')[0]) - if al_sl != i: #{ - continue - #} - if al_tl < len(cur_tl_row): - tlword = cur_tl_row[al_tl] - else: - tlword = cur_tl_row[-1] - print("alignment out", - "of", - "range", al_tl, - "not in", - "len(", - cur_tl_row, - ")", - file=sys.stderr) - slword = slword - if slword not in sl_tl: #{ - sl_tl[slword] = {} - #} - if tlword not in sl_tl[slword]: #{ - sl_tl[slword][tlword] = 0 - #} - sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1 - # print '+' , slword , tlword , sl_tl[slword][tlword], lineno - #} - #} - i = i + 1 - #} - cur_line = 0 - continue - #} - - line = line.split('\t')[1] - - if cur_line == 0: #{ - cur_sl_row = common.tokenise_tagger_line(line) - elif cur_line == 1: #{ - cur_bt_row = common.tokenise_biltrans_line(line) - elif cur_line == 2: #{ - cur_tl_row = common.tokenise_tagger_line(line) - elif cur_line == 3: #{ - cur_al_row = line.split(' ') - #} - - cur_line = cur_line + 1 - except Exception as e: - print("Error in line", lineno, ":", e, file=sys.stderr) - sys.exit(-1) - #} -#} - -for sl in sl_tl: #{ - - newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x]) - newtl.reverse() - first = True - for tl in newtl: #{ - if tl[0] == '*': #{ - print('Error: tl word unknown', tl, file=sys.stderr) - continue - #} - first_tag_sl = sl.split('<')[1].split('>')[0].strip() - first_tag_tl = tl.split('<')[1].split('>')[0].strip() - if first_tag_sl != first_tag_tl: #{ - print('Error:', first_tag_sl, '!=', first_tag_tl, file=sys.stderr) - continue - #} - if first: #{ - print(sl_tl[sl][tl] , wrap(sl) , wrap(tl) , '@') - first = False - else: #{ - print(sl_tl[sl][tl] , wrap(sl) , 
wrap(tl)) - #} - #} -#} +# 5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. +# 5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . +# 5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 +# ------------------------------------------------------------------------------- + + +def wrap(x): + return '^' + x + '$' + + +def extract_freq_lexicon(canditates): + # MAX_NGRAMS = 3 + + cur_line = 0 + lineno = 0 + sl_tl = {} + # ngrams = {} + + cur_sl_row = [] + cur_tl_row = [] + cur_bt_row = [] + cur_al_row = [] + + # for line in open(sys.argv[1]).readlines(): #{ + with open(canditates) as infile: + for line in infile: # { + line = line.strip() + lineno += 1 + if lineno % 5000 == 0: # { + sys.stderr.write('.') + if lineno % 100000 == 0: # { + sys.stderr.write(str(lineno)+'\n') + # } + sys.stderr.flush() + # } + try: + if line[0] == '-': # { + # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations + # + # sl_tl[sl_word][tl_word] = tl_freq + i = 0 + for slword in cur_sl_row: # { + if len(cur_bt_row[i]['tls']) > 1: # { + for al in cur_al_row: # { + if al == '': + continue + al_sl = int(al.split('-')[1]) + al_tl = int(al.split('-')[0]) + if al_sl != i: # { + continue + # } + if al_tl < len(cur_tl_row): + tlword = cur_tl_row[al_tl] + else: + tlword = cur_tl_row[-1] + print("alignment out", + "of", + "range", al_tl, + "not in", + "len(", + cur_tl_row, + ")", + file=sys.stderr) + slword = slword + if slword not in sl_tl: # { + sl_tl[slword] = {} + # } + if tlword not in sl_tl[slword]: # { + sl_tl[slword][tlword] = 0 + # } + sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1 + # print '+' , slword , tlword , sl_tl[slword][tlword], lineno + # } + # } + i = i + 1 + # } + cur_line = 0 + continue + # } + + line = line.split('\t')[1] + + if cur_line == 0: # { + cur_sl_row = common.tokenise_tagger_line(line) + elif cur_line == 1: # { + cur_bt_row = common.tokenise_biltrans_line(line) + elif cur_line == 2: # { + cur_tl_row = common.tokenise_tagger_line(line) + elif cur_line == 3: # { + cur_al_row = line.split(' ') + # } + + cur_line = cur_line + 1 + except Exception as e: + print("Error in line", lineno, ":", e, file=sys.stderr) + sys.exit(-1) + # } + # } + + for sl in sl_tl: # { + + newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x]) + newtl.reverse() + first = True + for tl in newtl: # { + if tl[0] == '*': # { + print('Error: tl word unknown', tl, file=sys.stderr) + continue + # } + first_tag_sl = sl.split('<')[1].split('>')[0].strip() + first_tag_tl = tl.split('<')[1].split('>')[0].strip() + if first_tag_sl != first_tag_tl: # { + print('Error:', first_tag_sl, '!=', + first_tag_tl, file=sys.stderr) + continue + # } + if first: # { + print(sl_tl[sl][tl], wrap(sl), wrap(tl), '@') + first = False + else: # { + print(sl_tl[sl][tl], wrap(sl), wrap(tl)) + # } + # } + # } + + +if __name__ == '__main__': + if len(sys.argv) < 2: # { + print('extract-freq-lexicon.py ') + exit(1) + # } + extract_freq_lexicon(sys.argv[1]) diff --git a/scripts/extract-sentences.py b/scripts/extract-sentences.py index 0d8834b..01aedf7 100755 --- a/scripts/extract-sentences.py +++ b/scripts/extract-sentences.py @@ -2,99 +2,107 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs +import sys +import codecs import common -if len(sys.argv) < 3: #{ - print('extact-sentences.py ') - sys.exit(-1) -#} - -phrase_table = open(sys.argv[1]) -biltrans_out = open(sys.argv[2]) - -def ambiguous(bt): #{ 
- # legislation/legislación/ordenamiento - - ambig = False - for token in bt: #{ - tls = token['tls'] - if len(tls) > 1: #{ - return True - #} - #} - - return ambig -#} - -reading = True -lineno = 0 -total_valid = 0 -total_errors = 0 - -not_ambiguous = [] - -while reading: #{ - try: - lineno = lineno + 1 - pt_line = phrase_table.readline().strip() - bt_line = biltrans_out.readline().strip() - - if not bt_line.strip() and not pt_line.strip(): #{ - reading = False - break - elif not bt_line.strip() or not pt_line.strip(): #{ - continue - - #} - row = pt_line.split('|||') - bt = common.tokenise_biltrans_line(bt_line.strip()) - sl = common.tokenise_tagger_line(row[1].strip()) - tl = common.tokenise_tagger_line(row[0].strip()) - - if not ambiguous(bt): #{ - not_ambiguous.append(str(lineno)) - if len(not_ambiguous) >= 10: #{ - print ("not ambiguous:", ' '.join(not_ambiguous), file=sys.stderr) - not_ambiguous = [] - #} - continue - #} - if len(sl) < 2 and len(tl) < 2: #{ - continue - #} - - - # Check that the number of words in the lexical transfer, and in the phrasetable matches up - if len(sl) != len(bt): #{ - print ("Error in line", lineno, ": len(sl) != len(bt)", file=sys.stderr) - continue - #} - - # cheking if the alignments are empty - if not row[2].strip(): - print("In line", lineno, ", alignments are empty", file=sys.stderr) - continue - - # Resumption of the session - # Resumption/Reanudación of/de the/el session/sesión - # Reanudación de el periodo de sesión - # 0-0 1-1 2-2 5-3 - - - print(lineno, '\t' + row[1]) - print(lineno, '\t' + bt_line) - print(lineno, '\t' + row[0]) - print(lineno, '\t' + row[2]) - print('-------------------------------------------------------------------------------') - total_valid += 1 - except Exception as e: - print ("Error in line", lineno, ": ", e, file=sys.stderr) - total_errors += 1 - continue - -#} - -print('total:', lineno, file=sys.stderr) -print('valid:', total_valid, '(' + str((total_valid/lineno)*100) + '%)', file=sys.stderr) -print('errors:',total_errors, '(' + str((total_errors/lineno)*100) + '%)', file=sys.stderr) + +def ambiguous(bt): # { + # legislation/legislación/ordenamiento + + ambig = False + for token in bt: # { + tls = token['tls'] + if len(tls) > 1: # { + return True + # } + # } + + return ambig +# } + + +def extract_sentences(phrase_table, biltrans_out): + reading = True + lineno = 0 + total_valid = 0 + total_errors = 0 + + not_ambiguous = [] + + while reading: # { + try: + lineno = lineno + 1 + pt_line = phrase_table.readline().strip() + bt_line = biltrans_out.readline().strip() + + if not bt_line.strip() and not pt_line.strip(): # { + reading = False + break + elif not bt_line.strip() or not pt_line.strip(): # { + continue + + # } + row = pt_line.split('|||') + bt = common.tokenise_biltrans_line(bt_line.strip()) + sl = common.tokenise_tagger_line(row[1].strip()) + tl = common.tokenise_tagger_line(row[0].strip()) + + if not ambiguous(bt): # { + not_ambiguous.append(str(lineno)) + if len(not_ambiguous) >= 10: # { + print("not ambiguous:", ' '.join( + not_ambiguous), file=sys.stderr) + not_ambiguous = [] + # } + continue + # } + if len(sl) < 2 and len(tl) < 2: # { + continue + # } + + # Check that the number of words in the lexical transfer, and in the phrasetable matches up + if len(sl) != len(bt): # { + print("Error in line", lineno, + ": len(sl) != len(bt)", file=sys.stderr) + continue + # } + + # cheking if the alignments are empty + if not row[2].strip(): + print("In line", lineno, ", alignments are empty", file=sys.stderr) + 
continue + + # Resumption of the session + # Resumption/Reanudación of/de the/el session/sesión + # Reanudación de el periodo de sesión + # 0-0 1-1 2-2 5-3 + + print(lineno, '\t' + row[1]) + print(lineno, '\t' + bt_line) + print(lineno, '\t' + row[0]) + print(lineno, '\t' + row[2]) + print( + '-------------------------------------------------------------------------------') + total_valid += 1 + except Exception as e: + print("Error in line", lineno, ": ", e, file=sys.stderr) + total_errors += 1 + continue + + # } + + print('total:', lineno, file=sys.stderr) + print('valid:', total_valid, + '(' + str((total_valid/lineno)*100) + '%)', file=sys.stderr) + print('errors:', total_errors, + '(' + str((total_errors/lineno)*100) + '%)', file=sys.stderr) + + +if __name__ == '__main__': + if len(sys.argv) < 3: # { + print('extact-sentences.py ') + exit(1) + # } + with open(sys.argv[1]) as phrase_table, open(sys.argv[2]) as biltrans_out: + extract_sentences(phrase_table, biltrans_out) diff --git a/scripts/lambdas-to-rules.py b/scripts/lambdas-to-rules.py index ead391d..d02b87f 100644 --- a/scripts/lambdas-to-rules.py +++ b/scripts/lambdas-to-rules.py @@ -1,78 +1,89 @@ -import sys; -import common; - -def wrap (x): - return '^' + x + '$' - -sl_tl_defaults = {}; -sl_tl = {}; - -indexes = {}; -trad_counter = {}; -rindex = {}; - -with open(sys.argv[1]) as d: - for line in d: #{ - if len(line) < 1: #{ - continue; - #} - row = common.tokenise_tagger_line(line); - sl = wrap(row[0].strip()); - tl = wrap(row[1].strip()); - if tl[1] == '*': - tl = tl[:-3] + '$' - - if sl not in sl_tl: #{ - sl_tl[sl] = []; - #} - if sl not in trad_counter: #{ - trad_counter[sl] = 0; - #} - if line.count('@') > 0: #{ - sl_tl_defaults[sl] = tl; - #} - sl_tl[sl].append(tl); - indexes[(sl, tl)] = trad_counter[sl]; - rindex[(sl, trad_counter[sl])] = tl; - trad_counter[sl] = trad_counter[sl] + 1; - - #} - -for pair in rindex: #{ - print(pair[0], pair[1], rindex[pair], file=sys.stderr); -#} - -#ability 0.25652 1 ability to -#ability 1.54548 0 ability to deliver -#ability 1.48162 0 our ability to - -with open(sys.argv[2]) as d: - for line in d: #{ - - row = line.split(' \t '); - slword = row[0].strip(); - l = float(row[1]); - tlid = int(row[2]); - if (slword, tlid) not in rindex: #{ - print ('(', slword, ',', tlid, ') not in index', file=sys.stderr) - continue; - #} - tlword = rindex[(slword, tlid)]; - context = row[3].strip(); - # #+ 0.571428571429 14 8 8 troiñ tourner 8 - #+nature service nature carácter 3 - - - print('+ ' + row[1] + '\t' + slword + '\t' + context + '\t' + tlword + '\t1'); - - # print(' ' % (l)); - # for c in context.split(' '): #{ - # if c.count(slword) == 1: #{ - # print(slword, tlword); - # else: #{ - # print(c); - # #} - # #} - # print(' '); - - #} +import sys +import common + + +def wrap(x): + return '^' + x + '$' + + +def lambdas_to_rules(freq_lexicon, rules): + sl_tl_defaults = {} + sl_tl = {} + + indexes = {} + trad_counter = {} + rindex = {} + + with open(freq_lexicon) as d: + for line in d: # { + if len(line) < 1: # { + continue + # } + row = common.tokenise_tagger_line(line) + sl = wrap(row[0].strip()) + tl = wrap(row[1].strip()) + if tl[1] == '*': + tl = tl[:-3] + '$' + + if sl not in sl_tl: # { + sl_tl[sl] = [] + # } + if sl not in trad_counter: # { + trad_counter[sl] = 0 + # } + if line.count('@') > 0: # { + sl_tl_defaults[sl] = tl + # } + sl_tl[sl].append(tl) + indexes[(sl, tl)] = trad_counter[sl] + rindex[(sl, trad_counter[sl])] = tl + trad_counter[sl] = trad_counter[sl] + 1 + + # } + + for pair in 
rindex: # { + print(pair[0], pair[1], rindex[pair], file=sys.stderr) + # } + + # ability 0.25652 1 ability to + # ability 1.54548 0 ability to deliver + # ability 1.48162 0 our ability to + + with open(rules) as d: + for line in d: # { + + row = line.split(' \t ') + slword = row[0].strip() + l = float(row[1]) + tlid = int(row[2]) + if (slword, tlid) not in rindex: # { + print('(', slword, ',', tlid, ') not in index', file=sys.stderr) + continue + # } + tlword = rindex[(slword, tlid)] + context = row[3].strip() + # #+ 0.571428571429 14 8 8 troiñ tourner 8 + #+nature service nature carácter 3 + + print('+ ' + row[1] + '\t' + slword + + '\t' + context + '\t' + tlword + '\t1') + + # print(' ' % (l)) + # for c in context.split(' '): #{ + # if c.count(slword) == 1: #{ + # print(slword, tlword) + # else: #{ + # print(c) + # #} + # #} + # print(' ') + + # } + + +if __name__ == '__main__': + if len(sys.argv) < 3: # { + print('lambdas-to-rules.py ') + exit(1) + # } + lambdas_to_rules(sys.argv[1], sys.argv[2]) diff --git a/scripts/merge-ngrams-lambdas.py b/scripts/merge-ngrams-lambdas.py index 21264f4..1595d26 100644 --- a/scripts/merge-ngrams-lambdas.py +++ b/scripts/merge-ngrams-lambdas.py @@ -2,43 +2,52 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, random; - -# ngram file -# lambda file -# lexicon file - -ngf = sys.argv[1]; -ldf = sys.argv[2]; - -ngrams = {}; -lambdas = {}; - -for line in open(ngf).readlines(): #{ - #59763 poor in capital - if len(line) < 2: #{ - continue; - #} - row = line.strip().split( '\t' ); - if(len(row) < 2): - row.append(''); - ngid = int(row[0].strip()); - ngrams[ngid] = row[1]; -#} - -with open(ldf) as d: - for line in d: #{ - #59176:0 1.00131 - if line.count('@@') > 0: #{ - continue; - #} - row = line.strip().split('\t'); - - l = float(row[2]); - ngid = int(row[1].split(':')[0]); - ngram = ngrams[ngid]; - - trad = row[1].split(':')[1]; - token = row[0] - print(token, '\t', l, '\t', trad, '\t', ngram); - #} +import sys +import codecs +import random + + +def merge_ngrams_lambdas(ngf, ldf): + # ngram file + # lambda file + # lexicon file + + ngrams = {} + # lambdas = {} + + for line in open(ngf).readlines(): # { + # 59763 poor in capital + if len(line) < 2: # { + continue + # } + row = line.strip().split('\t') + if(len(row) < 2): + row.append('') + ngid = int(row[0].strip()) + ngrams[ngid] = row[1] + # } + + with open(ldf) as d: + for line in d: # { + # 59176:0 1.00131 + if line.count('@@') > 0: # { + continue + # } + row = line.strip().split('\t') + + l = float(row[2]) + ngid = int(row[1].split(':')[0]) + ngram = ngrams[ngid] + + trad = row[1].split(':')[1] + token = row[0] + print(token, '\t', l, '\t', trad, '\t', ngram) + # } + + +if __name__ == '__main__': + if len(sys.argv) < 3: # { + print('merge-ngrams-lambdas.py ') + exit(1) + # } + merge_ngrams_lambdas(sys.argv[1], sys.argv[2]) diff --git a/scripts/ngram-count-patterns-maxent2.py b/scripts/ngram-count-patterns-maxent2.py index 9b77876..463bc3f 100755 --- a/scripts/ngram-count-patterns-maxent2.py +++ b/scripts/ngram-count-patterns-maxent2.py @@ -2,7 +2,9 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy; +import sys +import codecs +import copy import common # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations @@ -14,252 +16,263 @@ import common # ngrams[ngram][tl_word] = freq # 5 Please rise , then , for this minute 's silence . 
-#5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. -#5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . -#5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 -#------------------------------------------------------------------------------- +# 5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. +# 5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . +# 5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 +# ------------------------------------------------------------------------------- THRESHOLD = 0 -if len(sys.argv) not in [3, 4]: #{ - print('count-patterns.py [threshold]') - sys.exit(-1); -#} - -if len(sys.argv) == 4: - THRESHOLD = int(sys.argv[3]) - -MAX_NGRAMS = 3; -cur_line = 0; - -sl_tl_defaults = {}; -sl_tl = {}; -ngrams = {}; - -meevents = {}; # events[slword][counter] = [feat, feat, feat]; -meoutcomes = {}; # meoutcomes[slword][counter] = tlword; -event_counter = 0; - -features = {}; # features[(slword, ['a', 'list'], tlword)] = 3 -feature_counter = 0; - -indexes = {}; -trad_counter = {}; - -def wrap (x): - return '^' + x + '$' - -for line in open(sys.argv[1], 'r').readlines(): #{ - if len(line) < 1: #{ - continue; - #} - w = int(line.split(' ')[0]) - if w < THRESHOLD: - continue; - - row = common.tokenise_tagger_line(line); - sl = wrap(row[0]).lower(); - tl = wrap(row[1].strip()).lower(); - if tl[1] == '*': - tl = tl[:-3] + '$' - - if sl not in sl_tl: #{ - sl_tl[sl] = []; - #} - if sl not in trad_counter: #{ - trad_counter[sl] = 0; - #} - if line.count('@') > 0: #{ - sl_tl_defaults[sl] = tl; - sl_tl[sl].append(tl); - indexes[(sl, tl)] = trad_counter[sl]; - trad_counter[sl] = trad_counter[sl] + 1; - - #} -#} - -cur_sl_row = []; -cur_tl_row = []; -cur_bt_row = []; -cur_al_row = []; - - -for line in open(sys.argv[2], 'r').readlines(): #{ - line = line.strip() - if line[0] == '-': #{ -# print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row); -# print cur_sl_row; -# print cur_bt_row; -# print cur_tl_row; -# print cur_al_row; -# - # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations - # - # sl_tl[sl_word][tl_word] = tl_freq - i = 0; - for slword in cur_sl_row: #{ - if len(cur_bt_row[i]['tls']) > 1: #{ - for al in cur_al_row: #{ - al_sl = int(al.split('-')[1]); - al_tl = int(al.split('-')[0]); - if al_sl != i: #{ - continue; - #} - - tlword = wrap(cur_tl_row[al_tl].lower()); - slword = wrap(slword.lower()); - - if tlword[1] == '*' or slword[1] == '*': - continue; - - if slword not in sl_tl_defaults: #{ -# print >>sys.stderr, 'WARNING: "' + slword + '" not in sl_tl_defaults, skipping'; - continue; - #} - if (slword, tlword) not in indexes: #{ -# print >>sys.stderr, 'WARNING: pair (%s, %s) not found in index' % (slword, tlword); - continue; - #} -# if tlword != sl_tl_defaults[slword]: #{ -# print >>sys.stderr, '+' , slword , sl_tl_defaults[slword] , tlword; -# else: #{ -# print >>sys.stderr, '-' , slword , sl_tl_defaults[slword] , tlword; -# #} -# print >>sys.stderr, cur_sl_row; - for j in range(1, MAX_NGRAMS): #{ -# print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+1] -# print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i:i+j+1] -# print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+j+1] - - - pregram = ' '.join(map(wrap, cur_sl_row[i-j:i+1])); - postgram = ' '.join(map(wrap, cur_sl_row[i:i+j+1])); - roundgram = ' 
'.join(map(wrap, cur_sl_row[i-j:i+j+1])); - - if slword not in ngrams: #{ - ngrams[slword] = {}; - #} - if pregram not in ngrams[slword]: #{ - ngrams[slword][pregram] = {}; - #} - if postgram not in ngrams[slword]: #{ - ngrams[slword][postgram] = {}; - #} - if roundgram not in ngrams[slword]: #{ - ngrams[slword][roundgram] = {}; - #} - if tlword not in ngrams[slword][pregram]: #{ - ngrams[slword][pregram][tlword] = 0; - #} - if tlword not in ngrams[slword][postgram]: #{ - ngrams[slword][postgram][tlword] = 0; - #} - if tlword not in ngrams[slword][roundgram]: #{ - ngrams[slword][roundgram][tlword] = 0; - #} - - ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1; - ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1; - ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1; - #} - #print ',' , len(ngrams[slword]); - if slword not in meevents: #{ - meevents[slword] = {}; - #} - if slword not in meoutcomes: #{ - meoutcomes[slword] = {}; - #} - if event_counter not in meevents: #{ - meevents[slword][event_counter] = []; - #} - if event_counter not in meoutcomes[slword]: #{ - meoutcomes[slword][event_counter] = ''; - #} - for ni in ngrams[slword]: #{ - if ni not in features: #{ - feature_counter = feature_counter + 1; - features[ni] = feature_counter; - #} - meevents[slword][event_counter].append(features[ni]); - #meevents[slword][event_counter].append(feat); - meoutcomes[slword][event_counter] = tlword; - - #} - del ngrams; - ngrams = {}; - if len(sl_tl[slword]) < 2: #{ - continue; - #} - for event in meevents[slword]: #{ - outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # '; - for j in range(0, len(sl_tl[slword])): #{ - for feature in meevents[slword][event]: #{ - outline = outline + str(feature) + ':' + str(j) + ' '; - #} - outline = outline + ' # ' - #} - print(slword , '\t', len(sl_tl[slword]),'\t', outline); - #} - del meevents; - del meoutcomes; - meevents = {}; - meoutcomes = {}; - -# for f in features: #{ -# print >>sys.stderr, features[f] , f; -# #} - - #} - -# for j in range(0, MAX_NGRAMS): #{ -# print cur_sl_row[i-j:i+1]; -# print cur_sl_row[i:i+j]; -# #} - #print ngrams[slword]; - #} - i = i + 1; - - #} - - cur_line = 0; - event_counter = event_counter + 1; - #print line; - continue; - #} - - line = line.split('\t')[1]; - line = line.strip() - - if cur_line == 0: #{ - cur_sl_row = common.tokenise_tagger_line(line); - elif cur_line == 1: #{ - cur_bt_row = common.tokenise_biltrans_line(line); - elif cur_line == 2: #{ - cur_tl_row = common.tokenise_tagger_line(line); - elif cur_line == 3: #{ - cur_al_row = line.split(' '); - #} - - cur_line = cur_line + 1; -#} - -for feature in features: #{ - print(features[feature] , '\t' , feature, file=sys.stderr); -#} - -sys.exit(-1); - -for slword in meevents: #{ - if len(sl_tl[slword]) < 2: #{ - continue; - #} - for event in meevents[slword]: #{ - outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # '; - for j in range(0, len(sl_tl[slword])): #{ - for feature in meevents[slword][event]: #{ - outline = outline + str(feature) + ':' + str(j) + ' '; - #} - outline = outline + ' # ' - #} - print(slword , '\t', len(sl_tl[slword]),'\t', outline); - #} -#} + + +def wrap(x): + return '^' + x + '$' + + +def ngram_count_patterns(freq_lexicon, candidates): + + MAX_NGRAMS = 3 + cur_line = 0 + + sl_tl_defaults = {} + sl_tl = {} + ngrams = {} + + meevents = {} # events[slword][counter] = [feat, feat, feat] + meoutcomes = {} # meoutcomes[slword][counter] = tlword + 
event_counter = 0 + + features = {} # features[(slword, ['a', 'list'], tlword)] = 3 + feature_counter = 0 + + indexes = {} + trad_counter = {} + for line in open(freq_lexicon, 'r').readlines(): # { + if len(line) < 1: # { + continue + # } + w = int(line.split(' ')[0]) + if w < THRESHOLD: + continue + + row = common.tokenise_tagger_line(line) + sl = wrap(row[0]).lower() + tl = wrap(row[1].strip()).lower() + if tl[1] == '*': + tl = tl[:-3] + '$' + + if sl not in sl_tl: # { + sl_tl[sl] = [] + # } + if sl not in trad_counter: # { + trad_counter[sl] = 0 + # } + if line.count('@') > 0: # { + sl_tl_defaults[sl] = tl + sl_tl[sl].append(tl) + indexes[(sl, tl)] = trad_counter[sl] + trad_counter[sl] = trad_counter[sl] + 1 + + # } + # } + + cur_sl_row = [] + cur_tl_row = [] + cur_bt_row = [] + cur_al_row = [] + + for line in open(candidates, 'r').readlines(): # { + line = line.strip() + if line[0] == '-': # { + # print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row); + # print cur_sl_row; + # print cur_bt_row; + # print cur_tl_row; + # print cur_al_row; + # + # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations + # + # sl_tl[sl_word][tl_word] = tl_freq + i = 0 + for slword in cur_sl_row: # { + if len(cur_bt_row[i]['tls']) > 1: # { + for al in cur_al_row: # { + al_sl = int(al.split('-')[1]) + al_tl = int(al.split('-')[0]) + if al_sl != i: # { + continue + # } + + tlword = wrap(cur_tl_row[al_tl].lower()) + slword = wrap(slword.lower()) + + if tlword[1] == '*' or slword[1] == '*': + continue + + if slword not in sl_tl_defaults: # { + # print >>sys.stderr, 'WARNING: "' + slword + '" not in sl_tl_defaults, skipping'; + continue + # } + if (slword, tlword) not in indexes: # { + # print >>sys.stderr, 'WARNING: pair (%s, %s) not found in index' % (slword, tlword); + continue + # } + # if tlword != sl_tl_defaults[slword]: #{ + # print >>sys.stderr, '+' , slword , sl_tl_defaults[slword] , tlword; + # else: #{ + # print >>sys.stderr, '-' , slword , sl_tl_defaults[slword] , tlword; + # #} + # print >>sys.stderr, cur_sl_row; + for j in range(1, MAX_NGRAMS): # { + # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+1] + # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i:i+j+1] + # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+j+1] + + pregram = ' '.join(map(wrap, cur_sl_row[i-j:i+1])) + postgram = ' '.join(map(wrap, cur_sl_row[i:i+j+1])) + roundgram = ' '.join( + map(wrap, cur_sl_row[i-j:i+j+1])) + + if slword not in ngrams: # { + ngrams[slword] = {} + # } + if pregram not in ngrams[slword]: # { + ngrams[slword][pregram] = {} + # } + if postgram not in ngrams[slword]: # { + ngrams[slword][postgram] = {} + # } + if roundgram not in ngrams[slword]: # { + ngrams[slword][roundgram] = {} + # } + if tlword not in ngrams[slword][pregram]: # { + ngrams[slword][pregram][tlword] = 0 + # } + if tlword not in ngrams[slword][postgram]: # { + ngrams[slword][postgram][tlword] = 0 + # } + if tlword not in ngrams[slword][roundgram]: # { + ngrams[slword][roundgram][tlword] = 0 + # } + + ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1 + ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1 + ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1 + # } + # print ',' , len(ngrams[slword]); + if slword not in meevents: # { + meevents[slword] = {} + # } + if slword not in meoutcomes: # { + meoutcomes[slword] = {} + # } + if event_counter not in meevents: # { + meevents[slword][event_counter] = [] + # } 
+ if event_counter not in meoutcomes[slword]: # { + meoutcomes[slword][event_counter] = '' + # } + for ni in ngrams[slword]: # { + if ni not in features: # { + feature_counter = feature_counter + 1 + features[ni] = feature_counter + # } + meevents[slword][event_counter].append( + features[ni]) + # meevents[slword][event_counter].append(feat); + meoutcomes[slword][event_counter] = tlword + + # } + del ngrams + ngrams = {} + if len(sl_tl[slword]) < 2: # { + continue + # } + for event in meevents[slword]: # { + outline = str( + indexes[(slword, meoutcomes[slword][event])]) + ' # ' + for j in range(0, len(sl_tl[slword])): # { + for feature in meevents[slword][event]: # { + outline = outline + \ + str(feature) + ':' + str(j) + ' ' + # } + outline = outline + ' # ' + # } + print(slword, '\t', len( + sl_tl[slword]), '\t', outline) + # } + del meevents + del meoutcomes + meevents = {} + meoutcomes = {} + + # for f in features: #{ + # print >>sys.stderr, features[f] , f; + # #} + + # } + + # for j in range(0, MAX_NGRAMS): #{ + # print cur_sl_row[i-j:i+1]; + # print cur_sl_row[i:i+j]; + # #} + # print ngrams[slword]; + # } + i = i + 1 + + # } + + cur_line = 0 + event_counter = event_counter + 1 + # print line; + continue + # } + + line = line.split('\t')[1] + line = line.strip() + + if cur_line == 0: # { + cur_sl_row = common.tokenise_tagger_line(line) + elif cur_line == 1: # { + cur_bt_row = common.tokenise_biltrans_line(line) + elif cur_line == 2: # { + cur_tl_row = common.tokenise_tagger_line(line) + elif cur_line == 3: # { + cur_al_row = line.split(' ') + # } + + cur_line = cur_line + 1 + # } + + for feature in features: # { + print(features[feature], '\t', feature, file=sys.stderr) + # } + + exit(1) + + for slword in meevents: # { + if len(sl_tl[slword]) < 2: # { + continue + # } + for event in meevents[slword]: # { + outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # ' + for j in range(0, len(sl_tl[slword])): # { + for feature in meevents[slword][event]: # { + outline = outline + str(feature) + ':' + str(j) + ' ' + # } + outline = outline + ' # ' + # } + print(slword, '\t', len(sl_tl[slword]), '\t', outline) + # } + # } + + +if __name__ == '__main__': + if len(sys.argv) not in [3, 4]: # { + print('count-patterns.py [threshold]') + exit(1) + # } + + if len(sys.argv) == 4: + THRESHOLD = int(sys.argv[3]) + + ngram_count_patterns(sys.argv[1], sys.argv[2]) diff --git a/scripts/ngrams-to-rules-me.py b/scripts/ngrams-to-rules-me.py index f22737a..87d0fad 100755 --- a/scripts/ngrams-to-rules-me.py +++ b/scripts/ngrams-to-rules-me.py @@ -2,172 +2,182 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys; -import common; - -#+nature service nature carácter 3 -#+nature The imperialist nature carácter 1 -#+nature the secular nature of State carácter 1 -#+nature its nature prevent carácter 1 -#+nature nature be in carácter 1 +import sys +import common + +# +nature service nature carácter 3 +# +nature The imperialist nature carácter 1 +# +nature the secular nature of State carácter 1 +# +nature its nature prevent carácter 1 +# +nature nature be in carácter 1 # -#FREQMIN = 8.0; - -MINMATCH = 2; - -infile = ''; - -if len(sys.argv) < 2: #{ - print('ngrams-to-rules.py '); - sys.exit(-1); -#} - -infile = open(sys.argv[1]); - - -permitted_tags = ['n', 'vblex', 'adj']; - -print(''); -lineno = 1; -ruleno = 0; -for line in infile: #{ -# print '\n'; -# print line - if len(line) < 2: #{ - continue; - #} - line = line.strip(); - #line = line.decode('utf-8').strip(); - print(line, file=sys.stderr) - #+ 
0.571428571429 14 8 8 troiñ tourner 8 - row = line.split('\t'); - - tipus = row[0].split(' ')[0]; - weight = row[0].replace(' ', ' ').split(' ')[1]; - sl = row[1].strip()[1:-1]; - tl = row[3][1:-1]; - tl_lema = tl.split('<')[0].lower(); - tl_tags = ''.join(tl.split('<')[1:]).replace('>', '.').rstrip('.') - freq = 1 -# freq = float(row[4]); - - pattern = common.tokenize_tagger_line(row[2]); - - if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: #{ - print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr); - continue; - #} - - if tipus == '-' or tipus == '~': #{ - print('DEFAULT_READING', line, file=sys.stderr); - continue; - #} - - # Hacks -# if len(pattern) == 0: #{ -# print('ZERO_PATTERN' , line, file=sys.stderr); -# continue; - #} - - - if len(pattern) < MINMATCH and len(pattern) > 0: #{ - print('BELOW_MINMATCH', line, file=sys.stderr); - continue; - #} - - - - inpattern = False; - for w in pattern: #{ - if w.lower().count(sl) > 0: #{ - inpattern = True; - #} - #} - if len(pattern) > 0 and not inpattern: #{ - print('SL_NOT_IN_PATTERN' , line, file=sys.stderr); - continue; - #} - - if tl_tags.count('adj') > 0 and sl.count('adj') < 1: #{ - print("TAG_MISMATCH" , line, file=sys.stderr); - continue; - #} - if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1: #{ - print("TAG_MISMATCH" , line, file=sys.stderr); - continue; - #} - - if tl_tags.split('.')[0] not in permitted_tags: #{ - print("TAG_NOT_PERMITTED" , tl_tags , '||' , line, file=sys.stderr); - continue; - #} - - sel = False; - ruleno = ruleno + 1; - lineno = lineno + 1; - - commentb = ''; - commente = ''; -# if freq < FREQMIN: #{ -# commentb = ''; -# #} - - print(commentb + ' '); - for word in pattern: #{ - sl_lema = word.split('<')[0].lower(); - if word.count('><') > 0: #{ - sl_tags = '<'.join(word.split('<')[1:]).replace('><', '.').replace('>', ''); - else: #{ - sl_tags = '<'.join(word.split('<')[1:]).strip('<>'); - #} - - # ======================================================================= # - - sl_lema = sl_lema.replace('~', ' '); - tl_lema = tl_lema.replace('~', ' '); -# sl_lema = sl_lema.replace('-', '\-'); -# tl_lema = tl_lema.replace('-', '\-'); -# sl_lema = sl_lema.replace('(', '\('); -# tl_lema = tl_lema.replace('(', '\('); -# sl_lema = sl_lema.replace(')', '\)'); -# tl_lema = tl_lema.replace(')', '\)'); -# - if word.lower().count(sl) > 0: #{ - lineno = lineno + 1; - if sl_lema == '': #{ - print(' '); - #} - sel = True; - else: #{ - lineno = lineno + 1; - if sl_lema == '': #{ - print(' '); - else: #{ - print(' '); - #} - #} - #} - if sel == False and len(pattern) == 0: #{ - sl_lema = sl.split('<')[0]; - if sl.count('><') > 0: #{ - sl_tags = '<'.join(sl.split('<')[1:]).replace('><', '.').replace('>', ''); - else: #{ - sl_tags = '<'.join(sl.split('<')[1:]).strip('<>'); - #} - if sl_lema == '': #{ - print(' '); - print(' ' + commente); - elif sel == False: - print(' '+commente+ ''); - else: #{ - print(' ' + commente); - #} - lineno = lineno + 1; -#} -print(''); + +def ngrams_to_rules(ngrams): + # FREQMIN = 8.0 + + MINMATCH = 2 + + permitted_tags = ['n', 'vblex', 'adj'] + + print('') + lineno = 1 + ruleno = 0 + + with open(ngrams) as infile: + for line in infile: # { + # print '\n' + # print line + if len(line) < 2: # { + continue + # } + line = line.strip() + # line = line.decode('utf-8').strip() + print(line, file=sys.stderr) + # + 0.571428571429 14 8 8 troiñ tourner 8 + row = line.split('\t') + + tipus = row[0].split(' ')[0] + weight = row[0].replace(' ', ' ').split(' ')[1] + sl = 
row[1].strip()[1:-1] + tl = row[3][1:-1] + tl_lema = tl.split('<')[0].lower() + tl_tags = ''.join(tl.split('<')[1:]).replace( + '>', '.').rstrip('.') + freq = 1 + # freq = float(row[4]) + + pattern = common.tokenize_tagger_line(row[2]) + + if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: # { + print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr) + continue + # } + + if tipus == '-' or tipus == '~': # { + print('DEFAULT_READING', line, file=sys.stderr) + continue + # } + + # Hacks + # if len(pattern) == 0: #{ + # print('ZERO_PATTERN' , line, file=sys.stderr); + # continue + # } + + if len(pattern) < MINMATCH and len(pattern) > 0: # { + print('BELOW_MINMATCH', line, file=sys.stderr) + continue + # } + + inpattern = False + for w in pattern: # { + if w.lower().count(sl) > 0: # { + inpattern = True + # } + # } + if len(pattern) > 0 and not inpattern: # { + print('SL_NOT_IN_PATTERN', line, file=sys.stderr) + continue + # } + + if tl_tags.count('adj') > 0 and sl.count('adj') < 1: # { + print("TAG_MISMATCH", line, file=sys.stderr) + continue + # } + if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1: # { + print("TAG_MISMATCH", line, file=sys.stderr) + continue + # } + + if tl_tags.split('.')[0] not in permitted_tags: # { + print("TAG_NOT_PERMITTED", tl_tags, + '||', line, file=sys.stderr) + continue + # } + + sel = False + ruleno = ruleno + 1 + lineno = lineno + 1 + + commentb = '' + commente = '' + # if freq < FREQMIN: #{ + # commentb = '' + # #} + + print(commentb + ' ') + for word in pattern: # { + sl_lema = word.split('<')[0].lower() + if word.count('><') > 0: # { + sl_tags = '<'.join(word.split('<')[1:]).replace( + '><', '.').replace('>', '') + else: # { + sl_tags = '<'.join(word.split('<')[1:]).strip('<>') + # } + + # ======================================================================= # + + sl_lema = sl_lema.replace('~', ' ') + tl_lema = tl_lema.replace('~', ' ') + # sl_lema = sl_lema.replace('-', '\-') + # tl_lema = tl_lema.replace('-', '\-') + # sl_lema = sl_lema.replace('(', '\(') + # tl_lema = tl_lema.replace('(', '\(') + # sl_lema = sl_lema.replace(')', '\)') + # tl_lema = tl_lema.replace(')', '\)') + # + if word.lower().count(sl) > 0: # { + lineno = lineno + 1 + if sl_lema == '': # { + print(' ') + # } + sel = True + else: # { + lineno = lineno + 1 + if sl_lema == '': # { + print(' ') + else: # { + print(' ') + # } + # } + # } + if sel == False and len(pattern) == 0: # { + sl_lema = sl.split('<')[0] + if sl.count('><') > 0: # { + sl_tags = '<'.join(sl.split('<')[1:]).replace( + '><', '.').replace('>', '') + else: # { + sl_tags = '<'.join(sl.split('<')[1:]).strip('<>') + # } + if sl_lema == '': # { + print(' ') + print(' ' + commente) + elif sel == False: + print(' '+commente + + '') + else: # { + print(' ' + commente) + # } + lineno = lineno + 1 + # } + print('') + + +if __name__ == '__main__': + if len(sys.argv) < 2: # { + print('ngrams-to-rules.py ') + exit(1) + # } + ngrams_to_rules(sys.argv[1])
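
A minimal sketch of what this refactor enables: since each script now exposes its logic as a function (extract_freq_lexicon, extract_sentences, lambdas_to_rules, merge_ngrams_lambdas, ngram_count_patterns, ngrams_to_rules) behind an `if __name__ == '__main__'` guard, the scripts can be loaded and driven from other Python code instead of only via the command line. This is not part of the commit itself; the `scripts/` path and the `candidates.txt` file name below are illustrative assumptions, and importlib is used only because the hyphenated file names rule out a plain `import` (common.py must still be importable, e.g. from the same directory on sys.path).

    import importlib.util

    def load_script(path, module_name):
        # Load one of the refactored scripts from disk as a module object.
        spec = importlib.util.spec_from_file_location(module_name, path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module

    # Hypothetical paths and file names; adjust to the actual checkout and corpus.
    freq_lex = load_script('scripts/extract-freq-lexicon.py', 'extract_freq_lexicon')

    # The wrapped entry point takes the candidates file, mirroring the old sys.argv[1];
    # because the module is not named '__main__', the CLI argument check does not run.
    freq_lex.extract_freq_lexicon('candidates.txt')
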