commit 41005865beeecaafa5660c24af0677ca3b1dd0e5 Author: Vivek Vardhan Adepu Date: Sat Aug 7 08:44:37 2021 +0530 Scripts: enclosing the code in functions (#81) diff --git a/scripts/biltrans-count-patterns-ngrams.py b/scripts/biltrans-count-patterns-ngrams.py index 9a79c5a..cedac60 100755 --- a/scripts/biltrans-count-patterns-ngrams.py +++ b/scripts/biltrans-count-patterns-ngrams.py @@ -28,39 +28,40 @@ import biltrans_count_common as BCC # d) Crispiness threshold -cur_line = 0 -crisphold = 3.0 # Default -only_max = True -#only_max = False - -if len(sys.argv) == 5: - crisphold = float(sys.argv[4]) - print('crisp:', crisphold, file=sys.stderr) - -# First read in the frequency defaults - -sl_tl, sl_tl_defaults, _ = BCC.read_frequencies(sys.argv[1]) - -print('Reading...', file=sys.stderr) -sys.stderr.flush() - - class Counter(BCC.BiltransCounter): tokenizer = 'biltrans' line_ids = True count_ngrams = True max_ngrams = 3 - - -c = Counter() -c.read_files(sys.argv[2], # File with ambiguous biltrans output - sys.argv[3]) # File with disambiguated biltrans output -ngrams = c.ngrams - -print('Caching counts...', file=sys.stderr) -for sl in ngrams: - for ngram in ngrams[sl]: - for tl in ngrams[sl][ngram]: - print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl)) - -print('\n', file=sys.stderr) + +def biltrans_count_patterns_ngrams(lex_freq, biltrans_ambig, biltrans_annotated, crisphold=3.0): + # First read in the frequency defaults + + BCC.read_frequencies(lex_freq) + + print('Reading...', file=sys.stderr) + sys.stderr.flush() + + c = Counter() + c.read_files(biltrans_ambig, # File with ambiguous biltrans output + biltrans_annotated) # File with disambiguated biltrans output + ngrams = c.ngrams + + print('Caching counts...', file=sys.stderr) + for sl in ngrams: + for ngram in ngrams[sl]: + for tl in ngrams[sl][ngram]: + print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl)) + + print('\n', file=sys.stderr) + +if __name__ == '__main__': + if len(sys.argv) < 4: + print('Usage: biltrans-count-patterns-ngrams.py [crisphold]', file=sys.stderr) + exit(1) + + if len(sys.argv) == 5: + print('crisp:', sys.argv[4], file=sys.stderr) + biltrans_count_patterns_ngrams(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) + else: + biltrans_count_patterns_ngrams(sys.argv[1], sys.argv[2], sys.argv[3]) diff --git a/scripts/biltrans-extract-frac-freq.py b/scripts/biltrans-extract-frac-freq.py index 17211aa..ef1ad66 100644 --- a/scripts/biltrans-extract-frac-freq.py +++ b/scripts/biltrans-extract-frac-freq.py @@ -17,32 +17,38 @@ import common # # -# The sl-tl possible combinations -sl_tl = defaultdict(lambda: defaultdict(lambda: 0.0)) - - class Counter(BCC.BiltransCounter): tokenizer = 'biltrans' line_ids = True + # The sl-tl possible combinations + sl_tl = defaultdict(lambda: defaultdict(lambda: 0.0)) + def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0): - global sl_tl - sl_tl[sl][tl] += frac_count - - -c = Counter() -c.read_files(sys.argv[1], # File with ambiguous biltrans output - sys.argv[2]) # File with disambiguated biltrans output - -for sl in sl_tl: - newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x]) - newtl.reverse() - first = True - for tl in newtl: - if first: - print('%.10f %s %s @' % - (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl))) - first = False - else: - print('%.10f %s %s' % - (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl))) + self.sl_tl[sl][tl] += frac_count + +def biltrans_extract_frac_freq(biltrans_ambig, biltrans_annotated): + + c = Counter() + c.read_files(biltrans_ambig, # File with ambiguous biltrans output + biltrans_annotated) # File with disambiguated biltrans output + + for sl in c.sl_tl: + newtl = sorted(c.sl_tl[sl], key=lambda x: c.sl_tl[sl][x]) + newtl.reverse() + first = True + for tl in newtl: + if first: + print('%.10f %s %s @' % + (c.sl_tl[sl][tl], common.wrap(sl), common.wrap(tl))) + first = False + else: + print('%.10f %s %s' % + (c.sl_tl[sl][tl], common.wrap(sl), common.wrap(tl))) + +if __name__ == '__main__': + if len(sys.argv) < 3: + print('Usage: biltrans-extract-frac-freq.py ', file=sys.stderr) + exit(1) + + biltrans_extract_frac_freq(sys.argv[1], sys.argv[2]) diff --git a/scripts/extract-alig-lrx.py b/scripts/extract-alig-lrx.py index af736c2..371b0e9 100755 --- a/scripts/extract-alig-lrx.py +++ b/scripts/extract-alig-lrx.py @@ -5,51 +5,58 @@ import sys import common -with open(sys.argv[1]) as d: - print('') - for line in d: # { - - sys.stdout.flush() - if line[-2] == '@': # { - row = common.tokenize_tagger_line(line) - - fq = line.split(' ')[0] - sl = row[0] - tl = row[1] - - if line.count('>') < 2: # { - continue - # } - print(sl, tl, file=sys.stderr) - sl_lem = sl.split('<')[0] - tl_lem = tl.split('<')[0] - sl_lem = sl_lem.replace( - '-', '\\-').replace('~', ' ').replace('&', '&') - tl_lem = tl_lem.replace( - '-', '\\-').replace('~', ' ').replace('&', '&') - - sl_tag = sl.replace('><', '.').split('<')[1].strip('>') - tl_tag = tl.replace('><', '.').split('<')[1].strip('>') - - cmb = '' - cma = '' - - if sl_tag.split('.')[0] not in ['adj', 'vblex', 'n']: # { - cmb = '' - else: # { - cma = '' + +def extract_alig_lrx(lex_freq): + with open(lex_freq) as d: + print('') + + for line in d: + sys.stdout.flush() + if line[-2] == '@': + row = common.tokenize_tagger_line(line) + + fq = line.split(' ')[0] + sl = row[0] + tl = row[1] + + if line.count('>') < 2: + continue + + print(sl, tl, file=sys.stderr) + sl_lem = sl.split('<')[0] + tl_lem = tl.split('<')[0] + sl_lem = sl_lem.replace( + '-', '\\-').replace('~', ' ').replace('&', '&') + tl_lem = tl_lem.replace( + '-', '\\-').replace('~', ' ').replace('&', '&') + + sl_tag = sl.replace('><', '.').split('<')[1].strip('>') + tl_tag = tl.replace('><', '.').split('<')[1].strip('>') + cmb = '' - # } + cma = '' + + if sl_tag.split('.')[0] not in ['adj', 'vblex', 'n']: + cmb = '' + else: + cma = '' + cmb = '' + + rule = cmb + '' + # rule = rule + '' + rule = rule + '' + rule = rule + '' + cma + + print(rule) + + print('') - rule = cmb + '' - #rule = rule + '' - rule = rule + '' - rule = rule + '' + cma - print(rule) - # } +if __name__ == '__main__': + if len(sys.argv) < 2: + print('Usage: extract-alig-lrx.py ', file=sys.stderr) + exit(1) - # } - print('') + extract_alig_lrx(sys.argv[1]) diff --git a/scripts/ngram-count-patterns.py b/scripts/ngram-count-patterns.py index 7f4a158..34ade71 100755 --- a/scripts/ngram-count-patterns.py +++ b/scripts/ngram-count-patterns.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # coding=utf-8 # -*- encoding: utf-8 -*- diff --git a/scripts/ngram-pruning-frac.py b/scripts/ngram-pruning-frac.py index 8509715..743e606 100755 --- a/scripts/ngram-pruning-frac.py +++ b/scripts/ngram-pruning-frac.py @@ -17,160 +17,157 @@ import common # d) Crispiness threshold -cur_line = 0 -crisphold = 3.0 # Default -only_max = True -#only_max = False; - -if len(sys.argv) == 4: # { - crisphold = float(sys.argv[3]) - print('crisp:', crisphold, file=sys.stderr) -# } - -sl_tl_defaults = {} -sl_tl = {} -ngrams = {} - -# First read in the frequency defaults - -for line in open(sys.argv[1]).readlines(): # { - if len(line) < 1: # { - continue - # } - - row = common.tokenize_tagger_line(line) - sl = row[0] - tl = row[1] - fr = float(line.split(' ')[0]) - if line.count('@') and fr == 0.0: # { - print('!!! Prolly something went wrong here, the default has a freq of 0.0', file=sys.stderr) - print(' %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr) - # } - if line.count('@') > 0: # { - print('default:', sl, tl, file=sys.stderr) - sl_tl_defaults[sl] = tl - else: # { - sl_tl[sl] = tl - # } - -# } - -max_crispiness = 0.0 -print('Reading...', file=sys.stderr) -sys.stderr.flush() - -# Load counts from cached file - -ngramsf = open(sys.argv[2]) -for line in ngramsf.readlines(): # { - if len(line) < 1: # { - continue - # } - row = line.split('\t') - - freq = float(row[0]) - ngram = row[1] - sl = row[2] - tl = row[3].strip() - - if sl not in ngrams: # { - ngrams[sl] = {} - # } - if ngram not in ngrams[sl]: # { - ngrams[sl][ngram] = {} - # } - if tl not in ngrams[sl][ngram]: # { - ngrams[sl][ngram][tl] = 0.0 - # } - ngrams[sl][ngram][tl] = freq -# } - -for sl in ngrams: # { - if sl == '': # { - continue - # } - for ngram in ngrams[sl]: # { - if ngram == '': # { +def ngram_pruning_frac(lex_freq, ngrams_file, crisphold=3.0): + cur_line = 0 + only_max = True + #only_max = False; + + sl_tl_defaults = {} + sl_tl = {} + ngrams = {} + + # First read in the frequency defaults + + for line in open(lex_freq).readlines(): + if len(line) < 1: continue - # } - total = 0.0 - max_freq = -1.0 - max_tl = '' - for tl in ngrams[sl][ngram]: # { + row = common.tokenize_tagger_line(line) + sl = row[0] + tl = row[1] + fr = float(line.split(' ')[0]) + if line.count('@') and fr == 0.0: + print('!!! Prolly something went wrong here, the default has a freq of 0.0', file=sys.stderr) + print(' %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr) + + if line.count('@') > 0: + print('default:', sl, tl, file=sys.stderr) + sl_tl_defaults[sl] = tl + else: + sl_tl[sl] = tl - if ngrams[sl][ngram][tl] > max_freq: # { - max_freq = ngrams[sl][ngram][tl] - max_tl = tl - # } - total = total + ngrams[sl][ngram][tl] - # } - default = sl_tl_defaults[sl] + max_crispiness = 0.0 + print('Reading...', file=sys.stderr) + sys.stderr.flush() - if max_tl not in ngrams[sl][ngram] and default not in ngrams[sl][ngram]: # { - print('Some shit went down..', file=sys.stderr) - print('= %s\t%s\t%s' % (sl, ngram, max_tl), file=sys.stderr) + # Load counts from cached file + + ngramsf = open(ngrams_file) + for line in ngramsf.readlines(): + if len(line) < 1: continue - # } - if max_freq == 0.0: + + row = line.split('\t') + + freq = float(row[0]) + ngram = row[1] + sl = row[2] + tl = row[3].strip() + + if sl not in ngrams: + ngrams[sl] = {} + + if ngram not in ngrams[sl]: + ngrams[sl][ngram] = {} + + if tl not in ngrams[sl][ngram]: + ngrams[sl][ngram][tl] = 0.0 + + ngrams[sl][ngram][tl] = freq + + + for sl in ngrams: + if sl == '': continue - if only_max == True: # { - crispiness = 0.0 - alt_crisp = float(ngrams[sl][ngram][max_tl]) / float(total) - def_crisp = 1.0 - if default in ngrams[sl][ngram]: # { - def_crisp = float(ngrams[sl][ngram][default] / float(total)) - # } - if def_crisp == 0.0: # { - print('!!! Something wanky happened. :(', file=sys.stderr) - print('%.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % ( - total, max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl]), file=sys.stderr) - print('\tskipping...', file=sys.stderr) + for ngram in ngrams[sl]: + if ngram == '': continue - # } - weight = float(ngrams[sl][ngram][max_tl]) / float(total) - crispiness = alt_crisp/def_crisp - print('- %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, - max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl])) -# print('- %.10f \t%s\t%s\t%s\t%.10f' % (crispiness, sl, ngram, max_tl, ngrams[sl][ngram][max_tl])); + total = 0.0 + max_freq = -1.0 + max_tl = '' + for tl in ngrams[sl][ngram]: + + if ngrams[sl][ngram][tl] > max_freq: + max_freq = ngrams[sl][ngram][tl] + max_tl = tl - if crispiness > max_crispiness: # { - max_crispiness = crispiness - # } + total = total + ngrams[sl][ngram][tl] -# crispiness weight total default max_freq tl_freq sl -# + 2.61845457309 0.7236389238 1.0 0.2763610762 0.7236389238 0.7236389238 aozer aozer an levr organisateur 0.7236389238 -# - 14736.0468727 0.9999321438 1.0 0.9999321438 0.9999321438 treuzkas treuzkas teknologel transfert 0.9999321438 - else: # { + default = sl_tl_defaults[sl] - for tl in ngrams[sl][ngram]: # { + if max_tl not in ngrams[sl][ngram] and default not in ngrams[sl][ngram]: + print('Some shit went down..', file=sys.stderr) + print('= %s\t%s\t%s' % (sl, ngram, max_tl), file=sys.stderr) + continue + + if max_freq == 0.0: + continue + if only_max == True: crispiness = 0.0 - default = sl_tl_defaults[sl] - alt_crisp = float(ngrams[sl][ngram][tl]) / float(total) + alt_crisp = float(ngrams[sl][ngram][max_tl]) / float(total) def_crisp = 1.0 - if default in ngrams[sl][ngram]: # { - def_crisp = float( - ngrams[sl][ngram][default] / float(total)) - # } - weight = float(ngrams[sl][ngram][tl]) / float(total) + if default in ngrams[sl][ngram]: + def_crisp = float(ngrams[sl][ngram][default] / float(total)) + + if def_crisp == 0.0: + print('!!! Something wanky happened. :(', file=sys.stderr) + print('%.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % ( + total, max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl]), file=sys.stderr) + print('\tskipping...', file=sys.stderr) + continue + + weight = float(ngrams[sl][ngram][max_tl]) / float(total) crispiness = alt_crisp/def_crisp - # print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] ; + print('- %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, + max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl])) + # print('- %.10f \t%s\t%s\t%s\t%.10f' % (crispiness, sl, ngram, max_tl, ngrams[sl][ngram][max_tl])); + + if crispiness > max_crispiness: + max_crispiness = crispiness + + + # crispiness weight total default max_freq tl_freq sl + # + 2.61845457309 0.7236389238 1.0 0.2763610762 0.7236389238 0.7236389238 aozer aozer an levr organisateur 0.7236389238 + # - 14736.0468727 0.9999321438 1.0 0.9999321438 0.9999321438 treuzkas treuzkas teknologel transfert 0.9999321438 + else: + + for tl in ngrams[sl][ngram]: + + crispiness = 0.0 + default = sl_tl_defaults[sl] + alt_crisp = float(ngrams[sl][ngram][tl]) / float(total) + def_crisp = 1.0 + if default in ngrams[sl][ngram]: + def_crisp = float( + ngrams[sl][ngram][default] / float(total)) + + weight = float(ngrams[sl][ngram][tl]) / float(total) + crispiness = alt_crisp/def_crisp + + # print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] ; + + print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, + ngrams[sl][ngram][default], max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl])) + # + 1013.01568891 0.9989973752 2.0 1.9979947504 1.9979947504 galloud ha an galloud puissance 1.9979947504 + + if crispiness > max_crispiness: + max_crispiness = crispiness - print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, - ngrams[sl][ngram][default], max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl])) -# + 1013.01568891 0.9989973752 2.0 1.9979947504 1.9979947504 galloud ha an galloud puissance 1.9979947504 - # } - if crispiness > max_crispiness: # { - max_crispiness = crispiness - # } - # } - # } -# } + print('max_crispiness: %.10f' % (max_crispiness), file=sys.stderr) -print('max_crispiness: %.10f' % (max_crispiness), file=sys.stderr) +if __name__ == '__main__': + if len(sys.argv) < 3: + print('Usage: ngram-pruning-frac.py [crisphold]', file=sys.stderr) + exit(1) + + if len(sys.argv) == 4: + print('crisp:', sys.argv[3], file=sys.stderr) + ngram_pruning_frac(sys.argv[1], sys.argv[2], sys.argv[3]) + else: + ngram_pruning_frac(sys.argv[1], sys.argv[2]) diff --git a/scripts/ngrams-to-rules.py b/scripts/ngrams-to-rules.py index 329851e..5900928 100755 --- a/scripts/ngrams-to-rules.py +++ b/scripts/ngrams-to-rules.py @@ -18,13 +18,13 @@ def ngrams_to_rules(ngrams, crisphold): lineno = 1 ruleno = 0 for line in open(ngrams).readlines(): - # print('\n'; - # print(line + # print('\n') + # print(line) if len(line) < 2: continue line = line.strip() - #line = line.strip(); + #line = line.strip() # + 0.571428571429 14 8 8 troiƱ tourner 8 row = line.split('\t') @@ -32,7 +32,7 @@ def ngrams_to_rules(ngrams, crisphold): if len(row) == 3: row.insert(0, '') - # tipus = row[0].split(' ')[0]; + # tipus = row[0].split(' ')[0] weight = row[0].split(' ')[1] sl = row[1].strip()[1:-1] tl = row[3][1:-1]