commit 167cf7e81bf77a5bbdc6966fca8474fdff8dd127 Author: vivekvardhanadepu Date: Fri Jul 30 00:10:19 2021 +0530 Scripts fixup: cleaning old syntax and other minor fixes diff --git a/scripts/extract-freq-lexicon.py b/scripts/extract-freq-lexicon.py index ef7b427..0aeb2df 100755 --- a/scripts/extract-freq-lexicon.py +++ b/scripts/extract-freq-lexicon.py @@ -38,34 +38,34 @@ def extract_freq_lexicon(canditates): cur_bt_row = [] cur_al_row = [] - # for line in open(sys.argv[1]).readlines(): #{ + # for line in open(sys.argv[1]).readlines(): with open(canditates) as infile: - for line in infile: # { + for line in infile: line = line.strip() lineno += 1 - if lineno % 5000 == 0: # { + if lineno % 5000 == 0: sys.stderr.write('.') - if lineno % 100000 == 0: # { + if lineno % 100000 == 0: sys.stderr.write(str(lineno)+'\n') - # } + sys.stderr.flush() - # } + try: - if line[0] == '-': # { + if line[0] == '-': # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations # # sl_tl[sl_word][tl_word] = tl_freq i = 0 - for slword in cur_sl_row: # { - if len(cur_bt_row[i]['tls']) > 1: # { - for al in cur_al_row: # { + for slword in cur_sl_row: + if len(cur_bt_row[i]['tls']) > 1: + for al in cur_al_row: if al == '': continue al_sl = int(al.split('-')[1]) al_tl = int(al.split('-')[0]) - if al_sl != i: # { + if al_sl != i: continue - # } + if al_tl < len(cur_tl_row): tlword = cur_tl_row[al_tl] else: @@ -81,72 +81,64 @@ def extract_freq_lexicon(canditates): file=sys.stderr) exit(1) slword = slword - if slword not in sl_tl: # { + if slword not in sl_tl: sl_tl[slword] = {} - # } - if tlword not in sl_tl[slword]: # { + + if tlword not in sl_tl[slword]: sl_tl[slword][tlword] = 0 - # } + sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1 # print '+' , slword , tlword , sl_tl[slword][tlword], lineno - # } - # } + i = i + 1 - # } + cur_line = 0 continue - # } line = line.split('\t')[1] - if cur_line == 0: # { + if cur_line == 0: cur_sl_row = common.tokenise_tagger_line(line) - elif cur_line == 1: # { + elif cur_line == 1: cur_bt_row = common.tokenise_biltrans_line(line) - elif cur_line == 2: # { + elif cur_line == 2: cur_tl_row = common.tokenise_tagger_line(line) - elif cur_line == 3: # { + elif cur_line == 3: cur_al_row = line.split(' ') - # } cur_line = cur_line + 1 except Exception: # print("Error in line", lineno, ":", e, file=sys.stderr) traceback.print_exc() exit(1) - # } - # } - for sl in sl_tl: # { + for sl in sl_tl: newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x]) newtl.reverse() first = True - for tl in newtl: # { - if tl[0] == '*': # { + for tl in newtl: + if tl[0] == '*': print('Error: tl word unknown', tl, file=sys.stderr) continue - # } + first_tag_sl = sl.split('<')[1].split('>')[0].strip() first_tag_tl = tl.split('<')[1].split('>')[0].strip() - if first_tag_sl != first_tag_tl: # { + if first_tag_sl != first_tag_tl: print('Error:', first_tag_sl, '!=', first_tag_tl, file=sys.stderr) continue - # } - if first: # { + + if first: print(sl_tl[sl][tl], wrap(sl), wrap(tl), '@') first = False - else: # { + else: print(sl_tl[sl][tl], wrap(sl), wrap(tl)) - # } - # } - # } if __name__ == '__main__': - if len(sys.argv) < 2: # { + if len(sys.argv) < 2: print('Usage: extract-freq-lexicon.py ', file=sys.stderr) exit(1) - # } + extract_freq_lexicon(sys.argv[1]) diff --git a/scripts/extract-sentences.py b/scripts/extract-sentences.py index c80f77b..fc55a63 100755 --- a/scripts/extract-sentences.py +++ b/scripts/extract-sentences.py @@ -3,23 +3,19 @@ # -*- encoding: utf-8 -*- 
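The loop in extract-freq-lexicon.py above accumulates sl_tl[slword][tlword] counts from each aligned candidate block, then prints every source word's translations in descending frequency, marking the most frequent one with '@'. A minimal sketch of that counting, assuming the script's usual wrap() helper and two made-up aligned pairs (illustration only, not part of the patch; the unknown-word and tag-consistency checks are omitted):

# Sketch of the frequency-lexicon counting done by extract-freq-lexicon.py,
# using collections instead of hand-rolled nested dicts. Not part of the patch.
from collections import Counter, defaultdict

def wrap(x):
    # assumed to match the script's wrap(): Apertium-style ^...$ wrapping
    return '^' + x + '$'

sl_tl = defaultdict(Counter)

# In the real script these pairs come from the aligned candidate corpus;
# they are hard-coded here purely for illustration.
aligned_pairs = [('dog<n>', 'perro<n>'), ('dog<n>', 'can<n>'), ('dog<n>', 'perro<n>')]
for slword, tlword in aligned_pairs:
    sl_tl[slword][tlword] += 1

for sl, counts in sl_tl.items():
    for rank, (tl, freq) in enumerate(counts.most_common()):
        if rank == 0:
            print(freq, wrap(sl), wrap(tl), '@')   # default (most frequent) translation
        else:
            print(freq, wrap(sl), wrap(tl))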
import sys -import codecs import common -def ambiguous(bt): # { +def ambiguous(bt): # legislation/legislación/ordenamiento ambig = False - for token in bt: # { + for token in bt: tls = token['tls'] - if len(tls) > 1: # { + if len(tls) > 1: return True - # } - # } return ambig -# } def extract_sentences(phrase_table_file, biltrans_out_file): @@ -29,42 +25,39 @@ def extract_sentences(phrase_table_file, biltrans_out_file): not_ambiguous = [] with open(phrase_table_file) as phrase_table, open(biltrans_out_file) as biltrans_out: - while True: # { + while True: try: lineno = lineno + 1 pt_line = phrase_table.readline().strip() bt_line = biltrans_out.readline().strip() - if not bt_line.strip() and not pt_line.strip(): # { + if not bt_line.strip() and not pt_line.strip(): break - elif not bt_line.strip() or not pt_line.strip(): # { + elif not bt_line.strip() or not pt_line.strip(): continue - # } row = pt_line.split('|||') bt = common.tokenise_biltrans_line(bt_line.strip()) sl = common.tokenise_tagger_line(row[1].strip()) tl = common.tokenise_tagger_line(row[0].strip()) - if not ambiguous(bt): # { + if not ambiguous(bt): not_ambiguous.append(str(lineno)) - if len(not_ambiguous) >= 10: # { + if len(not_ambiguous) >= 10: print("not ambiguous:", ' '.join( not_ambiguous), file=sys.stderr) not_ambiguous = [] - # } + continue - # } - if len(sl) < 2 and len(tl) < 2: # { + + if len(sl) < 2 and len(tl) < 2: continue - # } # Check that the number of words in the lexical transfer, and in the phrasetable matches up - if len(sl) != len(bt): # { + if len(sl) != len(bt): print("Error in line", lineno, ": len(sl) != len(bt)", file=sys.stderr) continue - # } # cheking if the alignments are empty if not row[2].strip(): @@ -88,8 +81,6 @@ def extract_sentences(phrase_table_file, biltrans_out_file): total_errors += 1 continue - # } - print('total:', lineno, file=sys.stderr) print('valid:', total_valid, '(' + str((total_valid/lineno)*100) + '%)', file=sys.stderr) @@ -98,9 +89,8 @@ def extract_sentences(phrase_table_file, biltrans_out_file): if __name__ == '__main__': - if len(sys.argv) < 3: # { + if len(sys.argv) < 3: print('Usage: extact-sentences.py ', file=sys.stderr) exit(1) - # } extract_sentences(sys.argv[1], sys.argv[2]) diff --git a/scripts/lambdas-to-rules.py b/scripts/lambdas-to-rules.py index 745c7c7..d6214e3 100644 --- a/scripts/lambdas-to-rules.py +++ b/scripts/lambdas-to-rules.py @@ -15,51 +15,48 @@ def lambdas_to_rules(freq_lexicon, rules): rindex = {} with open(freq_lexicon) as d: - for line in d: # { - if len(line) < 1: # { + for line in d: + if len(line) < 1: continue - # } + row = common.tokenise_tagger_line(line) sl = wrap(row[0].strip()) tl = wrap(row[1].strip()) if tl[1] == '*': tl = tl[:-3] + '$' - if sl not in sl_tl: # { + if sl not in sl_tl: sl_tl[sl] = [] - # } - if sl not in trad_counter: # { + + if sl not in trad_counter: trad_counter[sl] = 0 - # } - if line.count('@') > 0: # { + + if line.count('@') > 0: sl_tl_defaults[sl] = tl - # } + sl_tl[sl].append(tl) indexes[(sl, tl)] = trad_counter[sl] rindex[(sl, trad_counter[sl])] = tl trad_counter[sl] = trad_counter[sl] + 1 - # } - - for pair in rindex: # { + for pair in rindex: print(pair[0], pair[1], rindex[pair], file=sys.stderr) - # } # ability 0.25652 1 ability to # ability 1.54548 0 ability to deliver # ability 1.48162 0 our ability to with open(rules) as d: - for line in d: # { + for line in d: row = line.split(' \t ') slword = row[0].strip() l = float(row[1]) tlid = int(row[2]) - if (slword, tlid) not in rindex: # { + if (slword, tlid) not 
in rindex: print('(', slword, ',', tlid, ') not in index', file=sys.stderr) continue - # } + tlword = rindex[(slword, tlid)] context = row[3].strip() # #+ 0.571428571429 14 8 8 troiñ tourner 8 @@ -69,21 +66,17 @@ def lambdas_to_rules(freq_lexicon, rules): '\t' + context + '\t' + tlword + '\t1') # print(' ' % (l)) - # for c in context.split(' '): #{ - # if c.count(slword) == 1: #{ + # for c in context.split(' '): + # if c.count(slword) == 1: # print(slword, tlword) - # else: #{ + # else: # print(c) - # #} - # #} # print(' ') - # } - if __name__ == '__main__': - if len(sys.argv) < 3: # { + if len(sys.argv) < 3: print('Usage: lambdas-to-rules.py ', file=sys.stderr) exit(1) - # } + lambdas_to_rules(sys.argv[1], sys.argv[2]) diff --git a/scripts/merge-ngrams-lambdas.py b/scripts/merge-ngrams-lambdas.py index 3ceb065..fda05d7 100644 --- a/scripts/merge-ngrams-lambdas.py +++ b/scripts/merge-ngrams-lambdas.py @@ -3,7 +3,6 @@ # -*- encoding: utf-8 -*- import sys -import codecs import random @@ -15,24 +14,23 @@ def merge_ngrams_lambdas(ngf, ldf): ngrams = {} # lambdas = {} - for line in open(ngf).readlines(): # { + for line in open(ngf).readlines(): # 59763 poor in capital - if len(line) < 2: # { + if len(line) < 2: continue - # } + row = line.strip().split('\t') if(len(row) < 2): row.append('') ngid = int(row[0].strip()) ngrams[ngid] = row[1] - # } with open(ldf) as d: - for line in d: # { + for line in d: # 59176:0 1.00131 - if line.count('@@') > 0: # { + if line.count('@@') > 0: continue - # } + row = line.strip().split('\t') l = float(row[2]) @@ -42,12 +40,11 @@ def merge_ngrams_lambdas(ngf, ldf): trad = row[1].split(':')[1] token = row[0] print(token, '\t', l, '\t', trad, '\t', ngram) - # } if __name__ == '__main__': - if len(sys.argv) < 3: # { + if len(sys.argv) < 3: print('Usage: merge-ngrams-lambdas.py ', file=sys.stderr) exit(1) - # } + merge_ngrams_lambdas(sys.argv[1], sys.argv[2]) diff --git a/scripts/ngram-count-patterns-maxent2.py b/scripts/ngram-count-patterns-maxent2.py index c3a9714..387336f 100755 --- a/scripts/ngram-count-patterns-maxent2.py +++ b/scripts/ngram-count-patterns-maxent2.py @@ -3,8 +3,6 @@ # -*- encoding: utf-8 -*- import sys -import codecs -import copy import common # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations @@ -46,61 +44,52 @@ def ngram_count_patterns(freq_lexicon, candidates): indexes = {} trad_counter = {} - for line in open(freq_lexicon, 'r').readlines(): # { - if len(line) < 1: # { + for line in open(freq_lexicon, 'r').readlines(): + if len(line) < 1: continue - # } + w = int(line.split(' ')[0]) if w < THRESHOLD: continue row = common.tokenise_tagger_line(line) sl = wrap(row[0]).lower() - tl = wrap(row[1].strip()).lower() + tl = wrap(row[1]).lower() if tl[1] == '*': tl = tl[:-3] + '$' - if sl not in sl_tl: # { + if sl not in sl_tl: sl_tl[sl] = [] - # } - if sl not in trad_counter: # { + + if sl not in trad_counter: trad_counter[sl] = 0 - # } - if line.count('@') > 0: # { + + if line.count('@') > 0: sl_tl_defaults[sl] = tl sl_tl[sl].append(tl) indexes[(sl, tl)] = trad_counter[sl] trad_counter[sl] = trad_counter[sl] + 1 - # } - # } - cur_sl_row = [] cur_tl_row = [] cur_bt_row = [] cur_al_row = [] - for line in open(candidates, 'r').readlines(): # { + for line in open(candidates, 'r').readlines(): line = line.strip() - if line[0] == '-': # { - # print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row); - # print cur_sl_row; - # print cur_bt_row; - # print cur_tl_row; - # print cur_al_row; - 
# + if line[0] == '-': # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations # # sl_tl[sl_word][tl_word] = tl_freq i = 0 - for slword in cur_sl_row: # { - if len(cur_bt_row[i]['tls']) > 1: # { - for al in cur_al_row: # { + for slword in cur_sl_row: + if len(cur_bt_row[i]['tls']) > 1: + for al in cur_al_row: al_sl = int(al.split('-')[1]) al_tl = int(al.split('-')[0]) - if al_sl != i: # { + if al_sl != i: continue - # } + tlword = wrap(cur_tl_row[al_tl].lower()) slword = wrap(slword.lower()) @@ -108,21 +97,20 @@ def ngram_count_patterns(freq_lexicon, candidates): if tlword[1] == '*' or slword[1] == '*': continue - if slword not in sl_tl_defaults: # { - # print >>sys.stderr, 'WARNING: "' + slword + '" not in sl_tl_defaults, skipping'; + if slword not in sl_tl_defaults: + # print >>sys.stderr, 'WARNING: "' + slword + '" not in sl_tl_defaults, skipping' continue - # } - if (slword, tlword) not in indexes: # { - # print >>sys.stderr, 'WARNING: pair (%s, %s) not found in index' % (slword, tlword); + + if (slword, tlword) not in indexes: + # print >>sys.stderr, 'WARNING: pair (%s, %s) not found in index' % (slword, tlword) continue - # } - # if tlword != sl_tl_defaults[slword]: #{ - # print >>sys.stderr, '+' , slword , sl_tl_defaults[slword] , tlword; - # else: #{ - # print >>sys.stderr, '-' , slword , sl_tl_defaults[slword] , tlword; - # #} - # print >>sys.stderr, cur_sl_row; - for j in range(1, MAX_NGRAMS): # { + + # if tlword != sl_tl_defaults[slword]: + # print >>sys.stderr, '+' , slword , sl_tl_defaults[slword] , tlword + # else: + # print >>sys.stderr, '-' , slword , sl_tl_defaults[slword] , tlword + # print >>sys.stderr, cur_sl_row + for j in range(1, MAX_NGRAMS): # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+1] # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i:i+j+1] # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+j+1] @@ -132,147 +120,145 @@ def ngram_count_patterns(freq_lexicon, candidates): roundgram = ' '.join( map(wrap, cur_sl_row[i-j:i+j+1])) - if slword not in ngrams: # { + if slword not in ngrams: ngrams[slword] = {} - # } - if pregram not in ngrams[slword]: # { + + if pregram not in ngrams[slword]: ngrams[slword][pregram] = {} - # } - if postgram not in ngrams[slword]: # { + + if postgram not in ngrams[slword]: ngrams[slword][postgram] = {} - # } - if roundgram not in ngrams[slword]: # { + + if roundgram not in ngrams[slword]: ngrams[slword][roundgram] = {} - # } - if tlword not in ngrams[slword][pregram]: # { + + if tlword not in ngrams[slword][pregram]: ngrams[slword][pregram][tlword] = 0 - # } - if tlword not in ngrams[slword][postgram]: # { + + if tlword not in ngrams[slword][postgram]: ngrams[slword][postgram][tlword] = 0 - # } - if tlword not in ngrams[slword][roundgram]: # { + + if tlword not in ngrams[slword][roundgram]: ngrams[slword][roundgram][tlword] = 0 - # } + ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1 ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1 ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1 - # } - # print ',' , len(ngrams[slword]); - if slword not in meevents: # { + + # print ',' , len(ngrams[slword]) + if slword not in meevents: meevents[slword] = {} - # } - if slword not in meoutcomes: # { + + if slword not in meoutcomes: meoutcomes[slword] = {} - # } - if event_counter not in meevents: # { + + if event_counter not in meevents: meevents[slword][event_counter] = [] - # } - if event_counter not in meoutcomes[slword]: # 
{ + + if event_counter not in meoutcomes[slword]: meoutcomes[slword][event_counter] = '' - # } - for ni in ngrams[slword]: # { - if ni not in features: # { + + for ni in ngrams[slword]: + if ni not in features: feature_counter = feature_counter + 1 features[ni] = feature_counter - # } + meevents[slword][event_counter].append( features[ni]) - # meevents[slword][event_counter].append(feat); + # meevents[slword][event_counter].append(feat) meoutcomes[slword][event_counter] = tlword - # } + del ngrams ngrams = {} - if len(sl_tl[slword]) < 2: # { + if len(sl_tl[slword]) < 2: continue - # } - for event in meevents[slword]: # { + + for event in meevents[slword]: outline = str( indexes[(slword, meoutcomes[slword][event])]) + ' # ' - for j in range(0, len(sl_tl[slword])): # { - for feature in meevents[slword][event]: # { + for j in range(0, len(sl_tl[slword])): + for feature in meevents[slword][event]: outline = outline + \ str(feature) + ':' + str(j) + ' ' - # } + outline = outline + ' # ' - # } + print(slword, '\t', len( sl_tl[slword]), '\t', outline) - # } + del meevents del meoutcomes meevents = {} meoutcomes = {} - # for f in features: #{ - # print >>sys.stderr, features[f] , f; - # #} + # for f in features: + # print >>sys.stderr, features[f] , f - # } + - # for j in range(0, MAX_NGRAMS): #{ - # print cur_sl_row[i-j:i+1]; - # print cur_sl_row[i:i+j]; - # #} - # print ngrams[slword]; - # } + # for j in range(0, MAX_NGRAMS): + # print cur_sl_row[i-j:i+1] + # print cur_sl_row[i:i+j] + # print ngrams[slword] + i = i + 1 - # } + cur_line = 0 event_counter = event_counter + 1 - # print line; + # print line continue - # } + line = line.split('\t')[1] line = line.strip() - if cur_line == 0: # { + if cur_line == 0: cur_sl_row = common.tokenise_tagger_line(line) - elif cur_line == 1: # { + elif cur_line == 1: cur_bt_row = common.tokenise_biltrans_line(line) - elif cur_line == 2: # { + elif cur_line == 2: cur_tl_row = common.tokenise_tagger_line(line) - elif cur_line == 3: # { + elif cur_line == 3: cur_al_row = line.split(' ') - # } + cur_line = cur_line + 1 - # } + - for feature in features: # { + for feature in features: print(features[feature], '\t', feature, file=sys.stderr) - # } + # exit(1) return - for slword in meevents: # { - if len(sl_tl[slword]) < 2: # { - continue - # } - for event in meevents[slword]: # { - outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # ' - for j in range(0, len(sl_tl[slword])): # { - for feature in meevents[slword][event]: # { - outline = outline + str(feature) + ':' + str(j) + ' ' - # } - outline = outline + ' # ' - # } - print(slword, '\t', len(sl_tl[slword]), '\t', outline) - # } - # } + # for slword in meevents: + # if len(sl_tl[slword]) < 2: + # continue + + # for event in meevents[slword]: + # outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # ' + # for j in range(0, len(sl_tl[slword])): + # for feature in meevents[slword][event]: + # outline = outline + str(feature) + ':' + str(j) + ' ' + + # outline = outline + ' # ' + + # print(slword, '\t', len(sl_tl[slword]), '\t', outline) + + if __name__ == '__main__': - if len(sys.argv) not in [3, 4]: # { + if len(sys.argv) not in [3, 4]: print( - 'Usage: count-patterns.py [threshold]', file=sys.stderr) + 'Usage: count-patterns.py [threshold]', file=sys.stderr) exit(1) - # } + if len(sys.argv) == 4: THRESHOLD = int(sys.argv[3]) diff --git a/scripts/ngram-count-patterns.py b/scripts/ngram-count-patterns.py index 91b7ced..52e60ce 100755 --- a/scripts/ngram-count-patterns.py +++ 
b/scripts/ngram-count-patterns.py @@ -2,11 +2,8 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy, commands; +import sys import common -sys.stdin = codecs.getreader('utf-8')(sys.stdin); -sys.stdout = codecs.getwriter('utf-8')(sys.stdout); -sys.stderr = codecs.getwriter('utf-8')(sys.stderr); # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations @@ -17,198 +14,192 @@ sys.stderr = codecs.getwriter('utf-8')(sys.stderr); # ngrams[ngram][tl_word] = freq # 5 Please rise , then , for this minute 's silence . -#5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. -#5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . -#5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 -#------------------------------------------------------------------------------- - -def wrap (x): - return '^' + x + '$' - -if len(sys.argv) < 3: #{ - print ('count-patterns.py '); - sys.exit(-1); -#} - -MAX_NGRAMS = 2; - -crisphold = float(sys.argv[3]); -cur_line = 0; - -sl_tl_defaults = {}; -sl_tl = {}; -ngrams = {}; - -lineno = 0 -for line in file(sys.argv[1]).readlines(): #{ - lineno += 1 - if lineno % 10000 == 0: - print >> sys.stderr, lineno - if len(line) < 1: #{ - continue; - #} - row = common.tokenise_tagger_line(line.decode('utf-8')); - sl = wrap(row[0]); - tl = wrap(row[1]); - if tl[1] == '*': - tl = tl[:-3] + '$' - if line.count('@') > 0: #{ - sl_tl_defaults[sl] = tl; - else: #{ - sl_tl[sl] = tl; - #} -#} - -cur_sl_row = []; -cur_tl_row = []; -cur_bt_row = []; -cur_al_row = []; -lineno = 0 -for line in file(sys.argv[2]).readlines(): #{ - lineno += 1 - line = line.strip().decode('utf-8'); - if lineno % 500 == 0: - print >> sys.stderr, lineno - if line[0] == '-': #{ - # print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row); - # print cur_sl_row; - # print cur_bt_row; - # print cur_tl_row; - # print cur_al_row; - # - # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations - # - # sl_tl[sl_word][tl_word] = tl_freq - i = 0; - for slword in cur_sl_row: #{ - if len(cur_bt_row[i]['tls']) > 1: #{ - for al in cur_al_row: #{ - if al == '': - continue - al_sl = int(al.split('-')[1]); - al_tl = int(al.split('-')[0]); - if al_sl != i: #{ - continue; - #} - tlword = wrap(cur_tl_row[al_tl]); - slword = wrap(slword); - - if slword not in sl_tl_defaults: #{ - print >>sys.stderr, '!', - continue; - #} - - for j in range(1, MAX_NGRAMS): #{ - - pregram = ' '.join(map(wrap, cur_sl_row[i-j:i+1])); - postgram = ' '.join(map(wrap, cur_sl_row[i:i+j+1])); - roundgram = ' '.join(map(wrap, cur_sl_row[i-j:i+j+1])); - - if slword not in ngrams: #{ - ngrams[slword] = {}; - #} - if pregram not in ngrams[slword]: #{ - ngrams[slword][pregram] = {}; - #} - if postgram not in ngrams[slword]: #{ - ngrams[slword][postgram] = {}; - #} - if roundgram not in ngrams[slword]: #{ - ngrams[slword][roundgram] = {}; - #} - if tlword not in ngrams[slword][pregram]: #{ - ngrams[slword][pregram][tlword] = 0; - #} - if tlword not in ngrams[slword][postgram]: #{ - ngrams[slword][postgram][tlword] = 0; - #} - if tlword not in ngrams[slword][roundgram]: #{ - ngrams[slword][roundgram][tlword] = 0; - #} - - ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1; - ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1; - ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1; - #} - #} -# for j in 
range(0, MAX_NGRAMS): #{ -# print cur_sl_row[i-j:i+1]; -# print cur_sl_row[i:i+j]; -# #} - #} - i = i + 1; - #} - cur_line = 0; - #print line; - continue; - #} - - line = line.split('\t')[1]; - - if cur_line == 0: #{ - cur_sl_row = common.tokenise_tagger_line(line) - elif cur_line == 1: #{ - cur_bt_row = common.tokenise_biltrans_line(line) - elif cur_line == 2: #{ - cur_tl_row = common.tokenise_tagger_line(line) - elif cur_line == 3: #{ - cur_al_row = line.split(' '); - #} - - cur_line = cur_line + 1; -#} - - -for sl in ngrams: #{ - - for ngram in ngrams[sl]: #{ - total = 0; - max_freq = -1; - current_tl = ''; - for tl in ngrams[sl][ngram]: #{ - if ngrams[sl][ngram][tl] > max_freq: #{ - max_freq = ngrams[sl][ngram][tl]; - current_tl = tl; - #} - total = total + ngrams[sl][ngram][tl]; - #} - - #> If for each of the rules we include - #> the amount of time the translation is seen with that pattern over the - #> total, we get a number we can try as a threshold. e.g. > 0.6 >0.7 >0.8 - #> etc. (>0.6 would be the same as 2/3 of the time the alternative - #> translation is seen with that ngram, and 1/3 of the time the default - #> translation is). I think this would be easier to explain than the magic - #> number I came up with. - # - #I see this as a way to define how "crispy" the decisions are. I think it - #would be better to express this as a ratio: the ratio of the times the - #alternative translation is seen to the number of times the defaullt - #translation is seen with that n-gram. - # - #It would be "2" in this case: the alternative is seen twice as often as - #the default. - - for tl in ngrams[sl][ngram]: #{ - crispiness = 0.0; - default = sl_tl_defaults[sl]; - alt_crisp = float(ngrams[sl][ngram][tl]) / float(total); - def_crisp = 1.0; - if default in ngrams[sl][ngram]: #{ - def_crisp = float(ngrams[sl][ngram][default] / float(total)); - #} - weight = float(ngrams[sl][ngram][tl]) / float(total); - crispiness = alt_crisp/def_crisp; - - #print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] ; - - if crispiness < crisphold: #{ - print '-', crispiness , weight , total, max_freq, ngrams[sl][ngram][tl], '\t'+ sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]); - else: #{ - - print '+', crispiness , weight , total, max_freq, ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][current_tl]); - #} - - #} - #} -#} +# 5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. +# 5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . 
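In both the old and the new ngram-count-patterns.py, each ambiguous source word at position i contributes three context features per window size j: the preceding, following, and surrounding n-grams. How those slices fall on a token row, with the wrap() calls left out for readability (illustration only, not part of the patch; the sentence is made up):

# Sketch of the context windows built inside the n-gram counting loop.
cur_sl_row = ['please', 'rise', ',', 'then', ',', 'for', 'this', 'minute']
i, j = 6, 1                                  # ambiguous word 'this', window of 1
pregram = ' '.join(cur_sl_row[i-j:i+1])      # 'for this'
postgram = ' '.join(cur_sl_row[i:i+j+1])     # 'this minute'
roundgram = ' '.join(cur_sl_row[i-j:i+j+1])  # 'for this minute'
# Each of the three strings gets a per-translation counter bumped in
# ngrams[slword][context][tlword], as in the loop below.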
+# 5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 +# ------------------------------------------------------------------------------- + + +def wrap(x): + return '^' + x + '$' + + +def ngram_count_patterns(freq_lexicon, candidates, crisphold): + MAX_NGRAMS = 2 + + cur_line = 0 + + sl_tl_defaults = {} + sl_tl = {} + ngrams = {} + + lineno = 0 + for line in open(freq_lexicon).readlines(): + lineno += 1 + if lineno % 10000 == 0: + print(lineno, file=sys.stderr) + if len(line) < 1: + continue + + row = common.tokenise_tagger_line(line) + sl = wrap(row[0]) + tl = wrap(row[1]) + if tl[1] == '*': + tl = tl[:-3] + '$' + if line.count('@') > 0: + sl_tl_defaults[sl] = tl + else: + sl_tl[sl] = tl + + cur_sl_row = [] + cur_tl_row = [] + cur_bt_row = [] + cur_al_row = [] + lineno = 0 + for line in open(candidates).readlines(): + lineno += 1 + line = line.strip() + if lineno % 500 == 0: + print(lineno, file=sys.stderr) + if line[0] == '-': + # print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row) + # print cur_sl_row + # print cur_bt_row + # print cur_tl_row + # print cur_al_row + # + # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations + # + # sl_tl[sl_word][tl_word] = tl_freq + i = 0 + for slword in cur_sl_row: + if len(cur_bt_row[i]['tls']) > 1: + for al in cur_al_row: + if al == '': + continue + al_sl = int(al.split('-')[1]) + al_tl = int(al.split('-')[0]) + if al_sl != i: + continue + + tlword = wrap(cur_tl_row[al_tl]) + slword = wrap(slword) + + if slword not in sl_tl_defaults: + print('!', file=sys.stderr) + continue + + for j in range(1, MAX_NGRAMS): + + pregram = ' '.join(map(wrap, cur_sl_row[i-j:i+1])) + postgram = ' '.join(map(wrap, cur_sl_row[i:i+j+1])) + roundgram = ' '.join( + map(wrap, cur_sl_row[i-j:i+j+1])) + + if slword not in ngrams: + ngrams[slword] = {} + + if pregram not in ngrams[slword]: + ngrams[slword][pregram] = {} + + if postgram not in ngrams[slword]: + ngrams[slword][postgram] = {} + + if roundgram not in ngrams[slword]: + ngrams[slword][roundgram] = {} + + if tlword not in ngrams[slword][pregram]: + ngrams[slword][pregram][tlword] = 0 + + if tlword not in ngrams[slword][postgram]: + ngrams[slword][postgram][tlword] = 0 + + if tlword not in ngrams[slword][roundgram]: + ngrams[slword][roundgram][tlword] = 0 + + ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1 + ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1 + ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1 + + # for j in range(0, MAX_NGRAMS): + # print cur_sl_row[i-j:i+1] + # print cur_sl_row[i:i+j] + + i = i + 1 + + cur_line = 0 + # print line + continue + + line = line.split('\t')[1] + + if cur_line == 0: + cur_sl_row = common.tokenise_tagger_line(line) + elif cur_line == 1: + cur_bt_row = common.tokenise_biltrans_line(line) + elif cur_line == 2: + cur_tl_row = common.tokenise_tagger_line(line) + elif cur_line == 3: + cur_al_row = line.split(' ') + + cur_line = cur_line + 1 + + for sl in ngrams: + + for ngram in ngrams[sl]: + total = 0 + max_freq = -1 + current_tl = '' + for tl in ngrams[sl][ngram]: + if ngrams[sl][ngram][tl] > max_freq: + max_freq = ngrams[sl][ngram][tl] + current_tl = tl + + total = total + ngrams[sl][ngram][tl] + + # > If for each of the rules we include + # > the amount of time the translation is seen with that pattern over the + # > total, we get a number we can try as a threshold. e.g. > 0.6 >0.7 >0.8 + # > etc. 
(>0.6 would be the same as 2/3 of the time the alternative + # > translation is seen with that ngram, and 1/3 of the time the default + # > translation is). I think this would be easier to explain than the magic + # > number I came up with. + # + # I see this as a way to define how "crispy" the decisions are. I think it + # would be better to express this as a ratio: the ratio of the times the + # alternative translation is seen to the number of times the defaullt + # translation is seen with that n-gram. + # + # It would be "2" in this case: the alternative is seen twice as often as + # the default. + + for tl in ngrams[sl][ngram]: + crispiness = 0.0 + default = sl_tl_defaults[sl] + alt_crisp = float(ngrams[sl][ngram][tl]) / float(total) + def_crisp = 1.0 + if default in ngrams[sl][ngram]: + def_crisp = float( + ngrams[sl][ngram][default] / float(total)) + + weight = float(ngrams[sl][ngram][tl]) / float(total) + crispiness = alt_crisp/def_crisp + + # print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] + + if crispiness < crisphold: + print('-', crispiness, weight, total, max_freq, + ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][tl])) + else: + + print('+', crispiness, weight, total, max_freq, + ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][current_tl])) + + +if __name__ == '__main__': + if len(sys.argv) < 4: + print( + 'Usage: count-patterns.py ', file=sys.stderr) + exit(1) + + ngram_count_patterns(sys.argv[1], sys.argv[2], sys.argv[3]) diff --git a/scripts/ngrams-to-rules-me.py b/scripts/ngrams-to-rules-me.py index ded1119..9c67b67 100755 --- a/scripts/ngrams-to-rules-me.py +++ b/scripts/ngrams-to-rules-me.py @@ -10,8 +10,6 @@ import common # +nature the secular nature of State carácter 1 # +nature its nature prevent carácter 1 # +nature nature be in carácter 1 -# - def ngrams_to_rules(ngrams): # FREQMIN = 8.0 @@ -25,12 +23,12 @@ def ngrams_to_rules(ngrams): ruleno = 0 with open(ngrams) as infile: - for line in infile: # { + for line in infile: # print '\n' # print line - if len(line) < 2: # { + if len(line) < 2: continue - # } + line = line.strip() # line = line.decode('utf-8').strip() print(line, file=sys.stderr) @@ -49,52 +47,44 @@ def ngrams_to_rules(ngrams): pattern = common.tokenize_tagger_line(row[2]) - if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: # { + if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr) continue - # } - if tipus == '-' or tipus == '~': # { + if tipus == '-' or tipus == '~': print('DEFAULT_READING', line, file=sys.stderr) continue - # } # Hacks - # if len(pattern) == 0: #{ + # if len(pattern) == 0: # print('ZERO_PATTERN' , line, file=sys.stderr); # continue - # } - if len(pattern) < MINMATCH and len(pattern) > 0: # { + if len(pattern) < MINMATCH and len(pattern) > 0: print('BELOW_MINMATCH', line, file=sys.stderr) continue - # } inpattern = False - for w in pattern: # { - if w.lower().count(sl) > 0: # { + for w in pattern: + if w.lower().count(sl) > 0: inpattern = True - # } - # } - if len(pattern) > 0 and not inpattern: # { + + if len(pattern) > 0 and not inpattern: print('SL_NOT_IN_PATTERN', line, file=sys.stderr) continue - # } - if tl_tags.count('adj') > 0 and sl.count('adj') < 1: # { + if tl_tags.count('adj') > 0 and sl.count('adj') < 1: print("TAG_MISMATCH", line, file=sys.stderr) continue - # } - if tl_tags.count('vbmod') > 0 and 
sl.count('vbmod') < 1: # { + + if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1: print("TAG_MISMATCH", line, file=sys.stderr) continue - # } - if tl_tags.split('.')[0] not in permitted_tags: # { + if tl_tags.split('.')[0] not in permitted_tags: print("TAG_NOT_PERMITTED", tl_tags, '||', line, file=sys.stderr) continue - # } sel = False ruleno = ruleno + 1 @@ -102,21 +92,19 @@ def ngrams_to_rules(ngrams): commentb = '' commente = '' - # if freq < FREQMIN: #{ + # if freq < FREQMIN: # commentb = '' - # #} print(commentb + ' ') - for word in pattern: # { + for word in pattern: sl_lema = word.split('<')[0].lower() - if word.count('><') > 0: # { + if word.count('><') > 0: sl_tags = '<'.join(word.split('<')[1:]).replace( '><', '.').replace('>', '') - else: # { + else: sl_tags = '<'.join(word.split('<')[1:]).strip('<>') - # } # ======================================================================= # @@ -129,55 +117,53 @@ def ngrams_to_rules(ngrams): # sl_lema = sl_lema.replace(')', '\)') # tl_lema = tl_lema.replace(')', '\)') # - if word.lower().count(sl) > 0: # { + if word.lower().count(sl) > 0: lineno = lineno + 1 - if sl_lema == '': # { + if sl_lema == '': print(' ') - # } + sel = True - else: # { + else: lineno = lineno + 1 - if sl_lema == '': # { + if sl_lema == '': print(' ') - else: # { + else: print(' ') - # } - # } - # } - if sel == False and len(pattern) == 0: # { + + if sel == False and len(pattern) == 0: sl_lema = sl.split('<')[0] - if sl.count('><') > 0: # { + if sl.count('><') > 0: sl_tags = '<'.join(sl.split('<')[1:]).replace( '><', '.').replace('>', '') - else: # { + else: sl_tags = '<'.join(sl.split('<')[1:]).strip('<>') - # } - if sl_lema == '': # { + + if sl_lema == '': print(' ') print(' ' + commente) elif sel == False: print(' '+commente + '') - else: # { + else: print(' ' + commente) - # } + lineno = lineno + 1 - # } + print('') if __name__ == '__main__': - if len(sys.argv) < 2: # { - print('Usage: ngrams-to-rules.py ', file=sys.stderr) + if len(sys.argv) < 2: + print('Usage: ngrams-to-rules-me.py ', file=sys.stderr) exit(1) - # } + ngrams_to_rules(sys.argv[1]) diff --git a/scripts/ngrams-to-rules.py b/scripts/ngrams-to-rules.py index 13c2643..329851e 100755 --- a/scripts/ngrams-to-rules.py +++ b/scripts/ngrams-to-rules.py @@ -2,148 +2,140 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys; +import sys import common #+nature service nature carácter 3 -#+nature The imperialist nature carácter 1 -#+nature the secular nature of State carácter 1 -#+nature its nature prevent carácter 1 -#+nature nature be in carácter 1 -# - -infile = ''; - -if len(sys.argv) < 3: #{ - print('ngrams-to-rules.py '); - sys.exit(-1); -#} - -infile = open(sys.argv[1]); -threshold = float(sys.argv[2]); - -permitted_tags = ['n', 'vblex', 'adj', 'n.*', 'vblex.*', 'adj.*']; - -print(''); -lineno = 1; -ruleno = 0; -for line in infile.readlines(): #{ -# print('\n'; -# print(line - if len(line) < 2: #{ - continue; - #} - line = line.strip(); - #line = line.decode('utf-8').strip(); - - - #+ 0.571428571429 14 8 8 troiñ tourner 8 - row = line.split('\t'); - - if len(row) == 3: - row.insert(0, ''); - -# tipus = row[0].split(' ')[0]; - weight = row[0].split(' ')[1]; - sl = row[1].strip()[1:-1]; - tl = row[3][1:-1]; - tl_lema = tl.split('<')[0].lower(); - tl_tags = '<'.join(tl.split('<')[1:]).replace('><', '.').replace('>', ''); - - - freq = row[4]; - pattern = common.tokenize_tagger_line(row[2]); - - if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: #{ - 
print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr); - continue; - #} - - inpattern = False; - for w in pattern: #{ - if w.count(sl) > 0: #{ - inpattern = True; - #} - #} - if inpattern == False: #{ - print('SL_NOT_IN_PATTERN' , line, sl, tl, file=sys.stderr); - continue; - #} - - if tl_tags.count('adj') > 0 and sl.count('adj') < 1: #{ - print("TAG_MISMATCH" , line, file=sys.stderr); - continue; - #} - if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1: #{ - print("TAG_MISMATCH" , line, file=sys.stderr); - continue; - #} - - if tl_tags.split('.')[0] not in permitted_tags: #{ - print("TAG_NOT_PERMITTED" , tl_tags , '||' , line, file=sys.stderr); - continue; - #} - - if float(weight) <= float(threshold): #{ - print("UNDER_THRESHOLD", weight, "<", threshold, "||", line, file=sys.stderr); - continue; - #} - - if any([x.startswith("*") for x in pattern]): #{ - print("UNKNOWN_WORD_IN_PATTERN" , pattern, file=sys.stderr); - continue; - #} - - sel = False; - ruleno = ruleno + 1; - lineno = lineno + 1; - - print(' '); - for word in pattern: #{ - sl_lema = word.split('<')[0].lower(); - if (sl_lema[0] == '*'): - continue; - - if word.count('><') > 0: #{ - sl_tags = '<'.join(word.split('<')[1:]).replace('><', '.').replace('>', ''); - else: #{ - sl_tags = '<'.join(word.split('<')[1:]).strip('<>'); - #} - - # ======================================================================= # - - sl_lema = sl_lema.replace('~', ' '); - tl_lema = tl_lema.replace('~', ' '); - sl_lema = sl_lema.replace('-', '\-'); - tl_lema = tl_lema.replace('-', '\-'); - sl_lema = sl_lema.replace('(', '\('); - tl_lema = tl_lema.replace('(', '\('); - sl_lema = sl_lema.replace(')', '\)'); - tl_lema = tl_lema.replace(')', '\)'); - - if word.lower().count(sl) > 0: #{ - lineno = lineno + 1; - if sl_lema == '': #{ - print(' '); - #} - sel = True; - else: #{ - lineno = lineno + 1; - if sl_lema == '': #{ - print(' '); - else: #{ - print(' '); - #} - #} - #} - if sel == False: #{ - - print(' '); - else: #{ - print(' '); - #} - lineno = lineno + 1; -#} -print(''); +# +nature The imperialist nature carácter 1 +# +nature the secular nature of State carácter 1 +# +nature its nature prevent carácter 1 +# +nature nature be in carácter 1 + +def ngrams_to_rules(ngrams, crisphold): + permitted_tags = ['n', 'vblex', 'adj', 'n.*', 'vblex.*', 'adj.*'] + + print('') + lineno = 1 + ruleno = 0 + for line in open(ngrams).readlines(): + # print('\n'; + # print(line + if len(line) < 2: + continue + + line = line.strip() + #line = line.strip(); + + # + 0.571428571429 14 8 8 troiñ tourner 8 + row = line.split('\t') + + if len(row) == 3: + row.insert(0, '') + + # tipus = row[0].split(' ')[0]; + weight = row[0].split(' ')[1] + sl = row[1].strip()[1:-1] + tl = row[3][1:-1] + tl_lema = tl.split('<')[0].lower() + tl_tags = '<'.join(tl.split('<')[1:]).replace( + '><', '.').replace('>', '') + + freq = row[4] + pattern = common.tokenize_tagger_line(row[2]) + + if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: + print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr) + continue + + inpattern = False + for w in pattern: + if w.count(sl) > 0: + inpattern = True + + if inpattern == False: + print('SL_NOT_IN_PATTERN', line, sl, tl, file=sys.stderr) + continue + + if tl_tags.count('adj') > 0 and sl.count('adj') < 1: + print("TAG_MISMATCH", line, file=sys.stderr) + continue + + if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1: + print("TAG_MISMATCH", line, file=sys.stderr) + continue + + if tl_tags.split('.')[0] not in permitted_tags: + 
print("TAG_NOT_PERMITTED", tl_tags, '||', line, file=sys.stderr) + continue + + if float(weight) <= float(crisphold): + print("UNDER_THRESHOLD", weight, "<", + crisphold, "||", line, file=sys.stderr) + continue + + if any([x.startswith("*") for x in pattern]): + print("UNKNOWN_WORD_IN_PATTERN", pattern, file=sys.stderr) + continue + + sel = False + ruleno = ruleno + 1 + lineno = lineno + 1 + + print(' ') + for word in pattern: + sl_lema = word.split('<')[0].lower() + if (sl_lema[0] == '*'): + continue + + if word.count('><') > 0: + sl_tags = '<'.join(word.split('<')[1:]).replace( + '><', '.').replace('>', '') + else: + sl_tags = '<'.join(word.split('<')[1:]).strip('<>') + + # ======================================================================= # + + sl_lema = sl_lema.replace('~', ' ') + tl_lema = tl_lema.replace('~', ' ') + sl_lema = sl_lema.replace('-', '\-') + tl_lema = tl_lema.replace('-', '\-') + sl_lema = sl_lema.replace('(', '\(') + tl_lema = tl_lema.replace('(', '\(') + sl_lema = sl_lema.replace(')', '\)') + tl_lema = tl_lema.replace(')', '\)') + + if word.lower().count(sl) > 0: + lineno = lineno + 1 + if sl_lema == '': + print(' ') + + sel = True + else: + lineno = lineno + 1 + if sl_lema == '': + print(' ') + else: + print(' ') + + if sel == False: + + print(' ') + else: + print(' ') + + lineno = lineno + 1 + print('') + + +if __name__ == '__main__': + if len(sys.argv) < 3: + print('Usage: ngrams-to-rules.py ', file=sys.stderr) + exit(1) + + ngrams_to_rules(sys.argv[1], sys.argv[2])