commit 0bbee0d34ed4b2d4ab19279af56de4d5a2817b28
Author: vivekvardhanadepu
Date:   Sat Jul 31 20:30:46 2021 +0530

    adding scripts(reqd for non-parallel corpora training) for packaging

diff --git a/scripts/Makefile.am b/scripts/Makefile.am
index 1f8a2f1..389d2fb 100644
--- a/scripts/Makefile.am
+++ b/scripts/Makefile.am
@@ -6,4 +6,10 @@ apertium_lex_tools_DATA = \
 	merge-ngrams-lambdas.py \
 	lambdas-to-rules.py \
 	common.py \
-	ngrams-to-rules-me.py
+	ngrams-to-rules-me.py \
+	biltrans-extract-frac-freq.py \
+	extract-alig-lrx.py \
+	biltrans-count-patterns-ngrams.py \
+	ngram-pruning-frac.py \
+	ngrams-to-rules.py \
+	biltrans_count_common.py
diff --git a/scripts/biltrans-count-patterns-ngrams.py b/scripts/biltrans-count-patterns-ngrams.py
index bd5bf68..9a79c5a 100755
--- a/scripts/biltrans-count-patterns-ngrams.py
+++ b/scripts/biltrans-count-patterns-ngrams.py
@@ -23,8 +23,8 @@ import biltrans_count_common as BCC

 # c) Disambiguated biltrans output

-#.[][56011 0].[] ^un/un$ ^digarez/excuse$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.9917274061 |@|
-#.[][56011 1].[] ^un/un$ ^digarez/occasion$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.0082725939 ||
+# .[][56011 0].[] ^un/un$ ^digarez/excuse$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.9917274061 |@|
+# .[][56011 1].[] ^un/un$ ^digarez/occasion$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.0082725939 ||

 # d) Crispiness threshold

@@ -34,8 +34,8 @@ only_max = True
 #only_max = False

 if len(sys.argv) == 5:
-	crisphold = float(sys.argv[4])
-	print('crisp:', crisphold, file=sys.stderr)
+    crisphold = float(sys.argv[4])
+    print('crisp:', crisphold, file=sys.stderr)


 # First read in the frequency defaults
@@ -44,21 +44,23 @@ sl_tl, sl_tl_defaults, _ = BCC.read_frequencies(sys.argv[1])
 print('Reading...', file=sys.stderr)
 sys.stderr.flush()

+
 class Counter(BCC.BiltransCounter):
-	tokenizer = 'biltrans'
-	line_ids = True
-	count_ngrams = True
-	max_ngrams = 3
+    tokenizer = 'biltrans'
+    line_ids = True
+    count_ngrams = True
+    max_ngrams = 3
+

 c = Counter()
-c.read_files(sys.argv[2], # File with ambiguous biltrans output
-             sys.argv[3]) # File with disambiguated biltrans output
+c.read_files(sys.argv[2],  # File with ambiguous biltrans output
+             sys.argv[3])  # File with disambiguated biltrans output

 ngrams = c.ngrams
 print('Caching counts...', file=sys.stderr)
 for sl in ngrams:
-	for ngram in ngrams[sl]:
-		for tl in ngrams[sl][ngram]:
-			print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl))
+    for ngram in ngrams[sl]:
+        for tl in ngrams[sl][ngram]:
+            print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl))

 print('\n', file=sys.stderr)
diff --git a/scripts/biltrans-extract-frac-freq.py b/scripts/biltrans-extract-frac-freq.py
index a203ef2..17211aa 100644
--- a/scripts/biltrans-extract-frac-freq.py
+++ b/scripts/biltrans-extract-frac-freq.py
@@ -12,33 +12,37 @@ import common
 # 56011 ^un/un$ ^digarez/excuse/occasion$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$
 #
 # b) Disambiguated biltrans output
-#.[][56011 0].[] ^un/un$ ^digarez/excuse$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.9917274061 |@|
-#.[][56011 1].[] ^un/un$ ^digarez/occasion$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.0082725939 ||
+# .[][56011 0].[] ^un/un$ ^digarez/excuse$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.9917274061 |@|
+# .[][56011 1].[] ^un/un$ ^digarez/occasion$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.0082725939 ||
 #
 #

 # The sl-tl possible combinations
 sl_tl = defaultdict(lambda: defaultdict(lambda: 0.0))

+
 class Counter(BCC.BiltransCounter):
-	tokenizer = 'biltrans'
-	line_ids = True
+    tokenizer = 'biltrans'
+    line_ids = True
+
+    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
+        global sl_tl
+        sl_tl[sl][tl] += frac_count

-	def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
-		global sl_tl
-		sl_tl[sl][tl] += frac_count
+
 c = Counter()
-c.read_files(sys.argv[1], # File with ambiguous biltrans output
-             sys.argv[2]) # File with disambiguated biltrans output
+c.read_files(sys.argv[1],  # File with ambiguous biltrans output
+             sys.argv[2])  # File with disambiguated biltrans output

 for sl in sl_tl:
-	newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
-	newtl.reverse()
-	first = True
-	for tl in newtl:
-		if first:
-			print('%.10f %s %s @' % (sl_tl[sl][tl] , common.wrap(sl) , common.wrap(tl)))
-			first = False
-		else:
-			print('%.10f %s %s' % (sl_tl[sl][tl] , common.wrap(sl) , common.wrap(tl)))
+    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
+    newtl.reverse()
+    first = True
+    for tl in newtl:
+        if first:
+            print('%.10f %s %s @' %
+                  (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
+            first = False
+        else:
+            print('%.10f %s %s' %
+                  (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
diff --git a/scripts/biltrans_count_common.py b/scripts/biltrans_count_common.py
index 3b14b9d..4663dcf 100644
--- a/scripts/biltrans_count_common.py
+++ b/scripts/biltrans_count_common.py
@@ -8,202 +8,213 @@ import math
 import re
 import sys

+
 def safe_float(s):
-	if not s:
-		return 0.0, False
-	try:
-		f = float(s)
-		if math.isnan(f):
-			return 0.0, False
-		return f, True
-	except:
-		return 0.0, False
+    if not s:
+        return 0.0, False
+    try:
+        f = float(s)
+        if math.isnan(f):
+            return 0.0, False
+        return f, True
+    except:
+        return 0.0, False

+
 def strip_tags(s, side, space=False):
-	idx = 0 if side == 'sl' else 1
-	ret = s.split('/')[idx]
-	if not space:
-		ret = ret.replace(' ', '~')
-	if ret.count('><') > 0:
-		ret = ret.split('><')[0] + '>'
-	return ret
+    idx = 0 if side == 'sl' else 1
+    ret = s.split('/')[idx]
+    if not space:
+        ret = ret.replace(' ', '~')
+    if ret.count('><') > 0:
+        ret = ret.split('><')[0] + '>'
+    return ret

+
 class BiltransCounter:
-	lu_sep = re.compile(r'\$[^\^]*\^')
-	tokenizer = 'regex' # or 'biltrans'
-	line_ids = False
-	count_ngrams = False
-	max_ngrams = 3
-	biltrans_wrap_lus = False
-	def __init__(self):
-		self.reading = False
-
-		self.am_file = None
-		self.am_line = None
-		self.am_row = None
-		self.am_id = None
-		self.am_linenum = 0
-		self.dm_file = None
-		self.dm_line = None
-		self.dm_row = None
-		self.dm_id = None
-		self.dm_linenum = 0
-
-		self.clear_ngrams()
-
-	def __del__(self):
-		if self.am_file:
-			self.am_file.close()
-		if self.dm_file:
-			self.dm_file.close()
-
-	def next_am_line(self):
-		self.am_line = self.am_file.readline()
-		self.am_linenum += 1
-		if not self.am_line:
-			self.am_id, self.am_row = None, []
-			self.reading = False
-			return
-		ls = self.am_line.split('\t')
-		if self.line_ids:
-			self.am_id = int(ls[0].strip())
-		if self.tokenizer == 'regex':
-			self.am_row = self.lu_sep.split(ls[1].strip()[1:-1])
-		elif self.tokenizer == 'biltrans':
-			self.am_row = common.tokenize_biltrans_line(self.am_line)
-
-	def next_dm_line(self):
-		self.dm_linenum += 1
-		self.dm_line = self.dm_file.readline()
-		if not self.dm_line:
-			self.dm_id, self.dm_row = None, []
-			self.reading = False
-			return
-		ls = self.dm_line.split('\t')
-		if self.line_ids:
-			self.dm_id = int(self.dm_line.split('.[][')[1].split()[0])
-		if self.tokenizer == 'regex':
-			self.dm_row = self.lu_sep.split(ls[1].strip()[1:-1])
-		elif self.tokenizer == 'biltrans':
-			self.dm_row = common.tokenize_biltrans_line(self.dm_line)
-
-	def clear_ngrams(self):
-		self.ngrams = defaultdict(
-			lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
-
-	def check_rows(self):
-		if len(self.am_row) != len(self.dm_row):
-			print('Mismatch in number of LUs between analysis and training', file=sys.stderr)
-			print('\t' + self.am_line, file=sys.stderr)
-			print('\t' + self.dm_line, file=sys.stderr)
-			print('...skipping', file=sys.stderr)
-			return False
-		return True
-
-	def read_files_multi_dm(self, am_fname, dm_fname):
-		self.next_dm_line()
-		while self.reading:
-			self.next_am_line()
-			while self.am_id == self.dm_id and self.reading:
-				frac_count = 0
-				if self.dm_line.count('\t') > 1:
-					frac_count, _ = safe_float(self.dm_line.split('\t')[2])
-				if self.check_rows():
-					self.process_row(frac_count)
-				self.next_dm_line()
-			if self.am_linenum % 1000 == 0:
-				print('=> %d SL and %d TL lines read' % (self.am_linenum, self.dm_linenum), file=sys.stderr)
-
-	def read_files(self, am_fname, dm_fname):
-		self.am_file = open(am_fname)
-		self.dm_file = open(dm_fname)
-		self.reading = True
-		if self.line_ids:
-			self.read_files_multi_dm(am_fname, dm_fname)
-			return
-		while self.reading:
-			self.next_am_line()
-			self.next_dm_line()
-			if self.reading and self.check_rows():
-				self.process_row()
-			if self.am_linenum % 1000 == 0:
-				print('=> %d lines read' % self.am_linenum, file=sys.stderr)
-
-	def process_row(self, frac_count=0):
-		if self.tokenizer == 'regex':
-			cur_sl_row = [strip_tags(s, 'sl', space=True) for s in self.am_row]
-			for i in range(len(self.am_row)):
-				if self.am_row[i].count('/') > 1:
-					sl = strip_tags(self.am_row[i], 'sl')
-					tl = strip_tags(self.dm_row[i], 'tl')
-					self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)
-		elif self.tokenizer == 'biltrans':
-			cur_sl_row = [x['sl'] for x in self.am_row]
-			for i in range(len(self.am_row)):
-				if len(self.am_row[i]['tls']) > 1:
-					sl = self.am_row[i]['sl']
-					tl = self.dm_row[i]['tls'][0]
-					if self.biltrans_wrap_lus:
-						sl = common.wrap(sl)
-						tl = common.wrap(tl)
-					self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)
-
-	def process_lu_internal(self, sl, tl, idx, cur_sl_row, frac_count=0):
-		if self.count_ngrams:
-			for j in range(1, self.max_ngrams):
-				pregram = ' '.join(map(common.wrap, cur_sl_row[idx-j:idx+1]))
-				postgram = ' '.join(map(common.wrap, cur_sl_row[idx:idx+j+1]))
-				roundgram = ' '.join(map(common.wrap, cur_sl_row[idx-j:idx+j+1]))
-				self.ngrams[sl][pregram][tl] += frac_count
-				self.ngrams[sl][postgram][tl] += frac_count
-				self.ngrams[sl][roundgram][tl] += frac_count
-		self.process_lu(sl, tl, idx, cur_sl_row, frac_count)
-
-	def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
-		pass
+    lu_sep = re.compile(r'\$[^\^]*\^')
+    tokenizer = 'regex'  # or 'biltrans'
+    line_ids = False
+    count_ngrams = False
+    max_ngrams = 3
+    biltrans_wrap_lus = False
+
+    def __init__(self):
+        self.reading = False
+
+        self.am_file = None
+        self.am_line = None
+        self.am_row = None
+        self.am_id = None
+        self.am_linenum = 0
+        self.dm_file = None
+        self.dm_line = None
+        self.dm_row = None
+        self.dm_id = None
+        self.dm_linenum = 0
+
+        self.clear_ngrams()
+
+    def __del__(self):
+        if self.am_file:
+            self.am_file.close()
+        if self.dm_file:
+            self.dm_file.close()
+
+    def next_am_line(self):
+        self.am_line = self.am_file.readline()
+        self.am_linenum += 1
+        if not self.am_line:
+            self.am_id, self.am_row = None, []
+            self.reading = False
+            return
+        ls = self.am_line.split('\t')
+        if self.line_ids:
+            self.am_id = int(ls[0].strip())
+        if self.tokenizer == 'regex':
+            self.am_row = self.lu_sep.split(ls[1].strip()[1:-1])
+        elif self.tokenizer == 'biltrans':
+            self.am_row = common.tokenize_biltrans_line(self.am_line)
+
+    def next_dm_line(self):
+        self.dm_linenum += 1
+        self.dm_line = self.dm_file.readline()
+        if not self.dm_line:
+            self.dm_id, self.dm_row = None, []
+            self.reading = False
+            return
+        ls = self.dm_line.split('\t')
+        if self.line_ids:
+            self.dm_id = int(self.dm_line.split('.[][')[1].split()[0])
+        if self.tokenizer == 'regex':
+            self.dm_row = self.lu_sep.split(ls[1].strip()[1:-1])
+        elif self.tokenizer == 'biltrans':
+            self.dm_row = common.tokenize_biltrans_line(self.dm_line)
+
+    def clear_ngrams(self):
+        self.ngrams = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+
+    def check_rows(self):
+        if len(self.am_row) != len(self.dm_row):
+            print(
+                'Mismatch in number of LUs between analysis and training', file=sys.stderr)
+            print('\t' + self.am_line, file=sys.stderr)
+            print('\t' + self.dm_line, file=sys.stderr)
+            print('...skipping', file=sys.stderr)
+            return False
+        return True
+
+    def read_files_multi_dm(self, am_fname, dm_fname):
+        self.next_dm_line()
+        while self.reading:
+            self.next_am_line()
+            while self.am_id == self.dm_id and self.reading:
+                frac_count = 0
+                if self.dm_line.count('\t') > 1:
+                    frac_count, _ = safe_float(self.dm_line.split('\t')[2])
+                if self.check_rows():
+                    self.process_row(frac_count)
+                self.next_dm_line()
+            if self.am_linenum % 1000 == 0:
+                print('=> %d SL and %d TL lines read' %
+                      (self.am_linenum, self.dm_linenum), file=sys.stderr)
+
+    def read_files(self, am_fname, dm_fname):
+        self.am_file = open(am_fname)
+        self.dm_file = open(dm_fname)
+        self.reading = True
+        if self.line_ids:
+            self.read_files_multi_dm(am_fname, dm_fname)
+            return
+        while self.reading:
+            self.next_am_line()
+            self.next_dm_line()
+            if self.reading and self.check_rows():
+                self.process_row()
+            if self.am_linenum % 1000 == 0:
+                print('=> %d lines read' % self.am_linenum, file=sys.stderr)
+
+    def process_row(self, frac_count=0):
+        if self.tokenizer == 'regex':
+            cur_sl_row = [strip_tags(s, 'sl', space=True) for s in self.am_row]
+            for i in range(len(self.am_row)):
+                if self.am_row[i].count('/') > 1:
+                    sl = strip_tags(self.am_row[i], 'sl')
+                    tl = strip_tags(self.dm_row[i], 'tl')
+                    self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)
+        elif self.tokenizer == 'biltrans':
+            cur_sl_row = [x['sl'] for x in self.am_row]
+            for i in range(len(self.am_row)):
+                if len(self.am_row[i]['tls']) > 1:
+                    sl = self.am_row[i]['sl']
+                    tl = self.dm_row[i]['tls'][0]
+                    if self.biltrans_wrap_lus:
+                        sl = common.wrap(sl)
+                        tl = common.wrap(tl)
+                    self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)
+
+    def process_lu_internal(self, sl, tl, idx, cur_sl_row, frac_count=0):
+        if self.count_ngrams:
+            for j in range(1, self.max_ngrams):
+                pregram = ' '.join(map(common.wrap, cur_sl_row[idx-j:idx+1]))
+                postgram = ' '.join(map(common.wrap, cur_sl_row[idx:idx+j+1]))
+                roundgram = ' '.join(
+                    map(common.wrap, cur_sl_row[idx-j:idx+j+1]))
+                self.ngrams[sl][pregram][tl] += frac_count
+                self.ngrams[sl][postgram][tl] += frac_count
+                self.ngrams[sl][roundgram][tl] += frac_count
+        self.process_lu(sl, tl, idx, cur_sl_row, frac_count)
+
+    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
+        pass
+

 def features_and_outline(ngrams, sl, tl, sl_tl, features, indexes,
-			 frac_count=None):
-	if not ngrams[sl]:
-		return
-	meevents = []
-	for ni in ngrams[sl]:
-		if ni not in features:
-			feature_counter = len(features) + 1
-			features[ni] = feature_counter
-		meevents.append(features[ni])
-	if sl not in sl_tl or len(sl_tl[sl]) < 2:
-		return
-	outline = str(indexes[(sl, tl)])
-	if frac_count != None:
-		outline += ' $ ' + str(int(frac_count * 10000)) + ' '
-	outline += ' # '
-	for j in range(len(sl_tl[sl])):
-		for feature in meevents:
-			outline += '%s:%s ' % (feature, j)
-		outline += ' # '
-	print('%s\t%s\t%s' % (sl, len(sl_tl[sl]), outline))
+                         frac_count=None):
+    if not ngrams[sl]:
+        return
+    meevents = []
+    for ni in ngrams[sl]:
+        if ni not in features:
+            feature_counter = len(features) + 1
+            features[ni] = feature_counter
+        meevents.append(features[ni])
+    if sl not in sl_tl or len(sl_tl[sl]) < 2:
+        return
+    outline = str(indexes[(sl, tl)])
+    if frac_count != None:
+        outline += ' $ ' + str(int(frac_count * 10000)) + ' '
+    outline += ' # '
+    for j in range(len(sl_tl[sl])):
+        for feature in meevents:
+            outline += '%s:%s ' % (feature, j)
+        outline += ' # '
+    print('%s\t%s\t%s' % (sl, len(sl_tl[sl]), outline))

+
 def read_frequencies(fname):
-	with open(fname) as fin:
-		sl_tl = {}
-		sl_tl_defaults = {}
-		indexes = {}
-		trad_counter = defaultdict(lambda: 0)
-		for line_ in fin.readlines():
-			line = line_.strip()
-			if not line:
-				continue
-			row = line.split(' ')
-			sl = row[1].strip()
-			tl = row[2].strip()
-			indexes[(sl, tl)] = trad_counter[sl]
-			trad_counter[sl] += 1
-			if '@' in line:
-				sl_tl_defaults[sl] = tl
-				if float(row[0]) == 0.0:
-					print('!!! Prolly something went wrong here, the default has freq of 0.0', file=sys.stderr)
-					print('    %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr)
-			else:
-				sl_tl[sl] = tl
-		return sl_tl, sl_tl_defaults, indexes
+    with open(fname) as fin:
+        sl_tl = {}
+        sl_tl_defaults = {}
+        indexes = {}
+        trad_counter = defaultdict(lambda: 0)
+        for line_ in fin.readlines():
+            line = line_.strip()
+            if not line:
+                continue
+            row = line.split(' ')
+            sl = row[1].strip()
+            tl = row[2].strip()
+            indexes[(sl, tl)] = trad_counter[sl]
+            trad_counter[sl] += 1
+            if '@' in line:
+                sl_tl_defaults[sl] = tl
+                if float(row[0]) == 0.0:
+                    print(
+                        '!!! Prolly something went wrong here, the default has freq of 0.0', file=sys.stderr)
+                    print('    %s => %s = %.10f' %
+                          (sl, tl, float(row[0])), file=sys.stderr)
+            else:
+                sl_tl[sl] = tl
+        return sl_tl, sl_tl_defaults, indexes
diff --git a/scripts/extract-alig-lrx.py b/scripts/extract-alig-lrx.py
index d71971a..af736c2 100755
--- a/scripts/extract-alig-lrx.py
+++ b/scripts/extract-alig-lrx.py
@@ -2,53 +2,54 @@
 # coding=utf-8
 # -*- encoding: utf-8 -*-

-import sys;
+import sys
 import common

 with open(sys.argv[1]) as d:
-	print('<rules>');
-	for line in d: #{
-
-		sys.stdout.flush();
-		if line[-2] == '@': #{
-			row = common.tokenize_tagger_line(line)
-
-			fq = line.split(' ')[0];
-			sl = row[0];
-			tl = row[1];
-
-			if line.count('>') < 2: #{
-				continue;
-			#}
-			print(sl, tl, file=sys.stderr)
-			sl_lem = sl.split('<')[0];
-			tl_lem = tl.split('<')[0];
-			sl_lem = sl_lem.replace('-', '\\-').replace('~', ' ').replace('&', '&amp;');
-			tl_lem = tl_lem.replace('-', '\\-').replace('~', ' ').replace('&', '&amp;');
-
-			sl_tag = sl.replace('><', '.').split('<')[1].strip('>');
-			tl_tag = tl.replace('><', '.').split('<')[1].strip('>');
-
-			cmb = '';
-			cma = '';
-
-			if sl_tag.split('.')[0] not in ['adj', 'vblex', 'n']: #{
-				cmb = '';
-			else: #{
-				cma = '';
-				cmb = '';
-			#}
-
-			rule = cmb + '<rule>';
-			#rule = rule + '<match lemma="' + sl_lem + '"/>';
-			rule = rule + '<match lemma="' + sl_lem + '" tags="' + sl_tag + '"><select lemma="' + tl_lem + '" tags="' + tl_tag + '"/></match>';
-			rule = rule + '</rule>' + cma;
-
-			print(rule);
-		#}
-
-
-	#}
-	print('</rules>');
+    print('<rules>')
+    for line in d:  # {
+
+        sys.stdout.flush()
+        if line[-2] == '@':  # {
+            row = common.tokenize_tagger_line(line)
+
+            fq = line.split(' ')[0]
+            sl = row[0]
+            tl = row[1]
+
+            if line.count('>') < 2:  # {
+                continue
+            # }
+            print(sl, tl, file=sys.stderr)
+            sl_lem = sl.split('<')[0]
+            tl_lem = tl.split('<')[0]
+            sl_lem = sl_lem.replace(
+                '-', '\\-').replace('~', ' ').replace('&', '&amp;')
+            tl_lem = tl_lem.replace(
+                '-', '\\-').replace('~', ' ').replace('&', '&amp;')
+
+            sl_tag = sl.replace('><', '.').split('<')[1].strip('>')
+            tl_tag = tl.replace('><', '.').split('<')[1].strip('>')
+
+            cmb = ''
+            cma = ''
+
+            if sl_tag.split('.')[0] not in ['adj', 'vblex', 'n']:  # {
+                cmb = ''
+            else:  # {
+                cma = ''
+                cmb = ''
+            # }
+
+            rule = cmb + '<rule>'
+            #rule = rule + '<match lemma="' + sl_lem + '"/>'
+            rule = rule + '<match lemma="' + sl_lem + '" tags="' + sl_tag + \
+                '"><select lemma="' + tl_lem + '" tags="' + tl_tag + '"/></match>'
+            rule = rule + '</rule>' + cma
+
+            print(rule)
+        # }
+
+    # }
+    print('</rules>')
diff --git a/scripts/ngram-pruning-frac.py b/scripts/ngram-pruning-frac.py
index d4016cc..8509715 100755
--- a/scripts/ngram-pruning-frac.py
+++ b/scripts/ngram-pruning-frac.py
@@ -2,172 +2,175 @@
 # coding=utf-8
 # -*- encoding: utf-8 -*-

-import sys, codecs, copy;
+import sys
 import common

 # Input:
-#0.6000015452 k bukatu ari izan bukatu acabar
-#0.3999984548 k bukatu ari izan bukatu terminar
-#0.2435956440 a eta bukatu bukatu acabar
-#0.7564043560 a eta bukatu bukatu terminar
-#0.0003531084 eta *ed bukatu izan n bukatu acabar
-#0.9996468916 eta *ed bukatu izan n bukatu terminar
-#0.4520909033 *Jazten bukatu bukatu acabar
-#0.5479090967 *Jazten bukatu bukatu terminar
+# 0.6000015452 k bukatu ari izan bukatu acabar
+# 0.3999984548 k bukatu ari izan bukatu terminar
+# 0.2435956440 a eta bukatu bukatu acabar
+# 0.7564043560 a eta bukatu bukatu terminar
+# 0.0003531084 eta *ed bukatu izan n bukatu acabar
+# 0.9996468916 eta *ed bukatu izan n bukatu terminar
+# 0.4520909033 *Jazten bukatu bukatu acabar
+# 0.5479090967 *Jazten bukatu bukatu terminar

 # d) Crispiness threshold
-cur_line = 0;
-crisphold = 3.0 ; # Default
-only_max = True;
+cur_line = 0
+crisphold = 3.0  # Default
+only_max = True
 #only_max = False;

-if len(sys.argv) == 4: #{
-	crisphold = float(sys.argv[3]);
-	print('crisp:', crisphold, file=sys.stderr);
-#}
+if len(sys.argv) == 4:  # {
+    crisphold = float(sys.argv[3])
+    print('crisp:', crisphold, file=sys.stderr)
+# }

-sl_tl_defaults = {};
-sl_tl = {};
-ngrams = {};
+sl_tl_defaults = {}
+sl_tl = {}
+ngrams = {}

 # First read in the frequency defaults

-for line in open(sys.argv[1]).readlines(): #{
-	if len(line) < 1: #{
-		continue;
-	#}
-
-	row = common.tokenize_tagger_line(line)
-	sl = row[0];
-	tl = row[1];
-	fr = float(line.split(' ')[0]);
-	if line.count('@') and fr == 0.0: #{
-		print('!!! Prolly something went wrong here, the default has a freq of 0.0', file=sys.stderr);
-		print('    %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr);
-	#}
-	if line.count('@') > 0: #{
-		print('default:', sl, tl, file=sys.stderr);
-		sl_tl_defaults[sl] = tl;
-	else: #{
-		sl_tl[sl] = tl;
-	#}
-
-#}
-
-max_crispiness = 0.0;
-print('Reading...', file=sys.stderr);
-sys.stderr.flush();
+for line in open(sys.argv[1]).readlines():  # {
+    if len(line) < 1:  # {
+        continue
+    # }
+
+    row = common.tokenize_tagger_line(line)
+    sl = row[0]
+    tl = row[1]
+    fr = float(line.split(' ')[0])
+    if line.count('@') and fr == 0.0:  # {
+        print('!!! Prolly something went wrong here, the default has a freq of 0.0', file=sys.stderr)
+        print('    %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr)
+    # }
+    if line.count('@') > 0:  # {
+        print('default:', sl, tl, file=sys.stderr)
+        sl_tl_defaults[sl] = tl
+    else:  # {
+        sl_tl[sl] = tl
+    # }
+
+# }
+
+max_crispiness = 0.0
+print('Reading...', file=sys.stderr)
+sys.stderr.flush()

 # Load counts from cached file

-ngramsf = open(sys.argv[2]);
-for line in ngramsf.readlines(): #{
-	if len(line) < 1: #{
-		continue;
-	#}
-	row = line.split('\t');
-
-	freq = float(row[0]);
-	ngram = row[1];
-	sl = row[2];
-	tl = row[3].strip();
-
-	if sl not in ngrams: #{
-		ngrams[sl] = {};
-	#}
-	if ngram not in ngrams[sl]: #{
-		ngrams[sl][ngram] = {};
-	#}
-	if tl not in ngrams[sl][ngram]: #{
-		ngrams[sl][ngram][tl] = 0.0;
-	#}
-	ngrams[sl][ngram][tl] = freq;
-#}
-
-for sl in ngrams: #{
-	if sl == '': #{
-		continue;
-	#}
-	for ngram in ngrams[sl]: #{
-		if ngram == '': #{
-			continue;
-		#}
-
-		total = 0.0;
-		max_freq = -1.0;
-		max_tl = '';
-		for tl in ngrams[sl][ngram]: #{
-
-			if ngrams[sl][ngram][tl] > max_freq: #{
-				max_freq = ngrams[sl][ngram][tl];
-				max_tl = tl;
-			#}
-			total = total + ngrams[sl][ngram][tl];
-		#}
-
-		default = sl_tl_defaults[sl];
-
-		if max_tl not in ngrams[sl][ngram] and default not in ngrams[sl][ngram]: #{
-			print('Some shit went down..', file=sys.stderr);
-			print('= %s\t%s\t%s' % (sl, ngram, max_tl), file=sys.stderr);
-			continue;
-		#}
-		if max_freq == 0.0:
-			continue;
-
-
-		if only_max == True: #{
-			crispiness = 0.0;
-			alt_crisp = float(ngrams[sl][ngram][max_tl]) / float(total);
-			def_crisp = 1.0;
-			if default in ngrams[sl][ngram]: #{
-				def_crisp = float(ngrams[sl][ngram][default] / float(total));
-			#}
-			if def_crisp == 0.0: #{
-				print('!!! Something wanky happened. :(', file=sys.stderr);
-				print('%.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (total , max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl]), file=sys.stderr);
-				print('\tskipping...', file=sys.stderr);
-				continue;
-			#}
-			weight = float(ngrams[sl][ngram][max_tl]) / float(total);
-			crispiness = alt_crisp/def_crisp;
-
-			print('- %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total , max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl]));
+ngramsf = open(sys.argv[2])
+for line in ngramsf.readlines():  # {
+    if len(line) < 1:  # {
+        continue
+    # }
+    row = line.split('\t')
+
+    freq = float(row[0])
+    ngram = row[1]
+    sl = row[2]
+    tl = row[3].strip()
+
+    if sl not in ngrams:  # {
+        ngrams[sl] = {}
+    # }
+    if ngram not in ngrams[sl]:  # {
+        ngrams[sl][ngram] = {}
+    # }
+    if tl not in ngrams[sl][ngram]:  # {
+        ngrams[sl][ngram][tl] = 0.0
+    # }
+    ngrams[sl][ngram][tl] = freq
+# }
+
+for sl in ngrams:  # {
+    if sl == '':  # {
+        continue
+    # }
+    for ngram in ngrams[sl]:  # {
+        if ngram == '':  # {
+            continue
+        # }
+
+        total = 0.0
+        max_freq = -1.0
+        max_tl = ''
+        for tl in ngrams[sl][ngram]:  # {
+
+            if ngrams[sl][ngram][tl] > max_freq:  # {
+                max_freq = ngrams[sl][ngram][tl]
+                max_tl = tl
+            # }
+            total = total + ngrams[sl][ngram][tl]
+        # }
+
+        default = sl_tl_defaults[sl]
+
+        if max_tl not in ngrams[sl][ngram] and default not in ngrams[sl][ngram]:  # {
+            print('Some shit went down..', file=sys.stderr)
+            print('= %s\t%s\t%s' % (sl, ngram, max_tl), file=sys.stderr)
+            continue
+        # }
+        if max_freq == 0.0:
+            continue
+
+        if only_max == True:  # {
+            crispiness = 0.0
+            alt_crisp = float(ngrams[sl][ngram][max_tl]) / float(total)
+            def_crisp = 1.0
+            if default in ngrams[sl][ngram]:  # {
+                def_crisp = float(ngrams[sl][ngram][default] / float(total))
+            # }
+            if def_crisp == 0.0:  # {
+                print('!!! Something wanky happened. :(', file=sys.stderr)
+                print('%.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (
+                    total, max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl]), file=sys.stderr)
+                print('\tskipping...', file=sys.stderr)
+                continue
+            # }
+            weight = float(ngrams[sl][ngram][max_tl]) / float(total)
+            crispiness = alt_crisp/def_crisp
+
+            print('- %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total,
+                                                                          max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl]))
 #			print('- %.10f \t%s\t%s\t%s\t%.10f' % (crispiness, sl, ngram, max_tl, ngrams[sl][ngram][max_tl]));
-			if crispiness > max_crispiness: #{
-				max_crispiness = crispiness;
-			#}
+            if crispiness > max_crispiness:  # {
+                max_crispiness = crispiness
+            # }

 #  crispiness  weight  total  default  max_freq  tl_freq  sl
-#+ 2.61845457309 0.7236389238 1.0 0.2763610762 0.7236389238 0.7236389238 aozer aozer an levr organisateur 0.7236389238
-#- 14736.0468727 0.9999321438 1.0 0.9999321438 0.9999321438 treuzkas treuzkas teknologel transfert 0.9999321438
+# + 2.61845457309 0.7236389238 1.0 0.2763610762 0.7236389238 0.7236389238 aozer aozer an levr organisateur 0.7236389238
+# - 14736.0468727 0.9999321438 1.0 0.9999321438 0.9999321438 treuzkas treuzkas teknologel transfert 0.9999321438
-		else: #{
-
-			for tl in ngrams[sl][ngram]: #{
-
-				crispiness = 0.0;
-				default = sl_tl_defaults[sl];
-				alt_crisp = float(ngrams[sl][ngram][tl]) / float(total);
-				def_crisp = 1.0;
-				if default in ngrams[sl][ngram]: #{
-					def_crisp = float(ngrams[sl][ngram][default] / float(total));
-				#}
-				weight = float(ngrams[sl][ngram][tl]) / float(total);
-				crispiness = alt_crisp/def_crisp;
-
-				#print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] ;
-
-				print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl]));
-#+ 1013.01568891 0.9989973752 2.0 1.9979947504 1.9979947504 galloud ha an galloud puissance 1.9979947504
-			#}
-
-			if crispiness > max_crispiness: #{
-				max_crispiness = crispiness;
-			#}
-		#}
-	#}
-#}
-
-print('max_crispiness: %.10f' % (max_crispiness), file=sys.stderr);
+        else:  # {
+
+            for tl in ngrams[sl][ngram]:  # {
+
+                crispiness = 0.0
+                default = sl_tl_defaults[sl]
+                alt_crisp = float(ngrams[sl][ngram][tl]) / float(total)
+                def_crisp = 1.0
+                if default in ngrams[sl][ngram]:  # {
+                    def_crisp = float(
+                        ngrams[sl][ngram][default] / float(total))
+                # }
+                weight = float(ngrams[sl][ngram][tl]) / float(total)
+                crispiness = alt_crisp/def_crisp
+
+                # print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] ;
+
+                print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total,
+                                                                                    ngrams[sl][ngram][default], max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl]))
+# + 1013.01568891 0.9989973752 2.0 1.9979947504 1.9979947504 galloud ha an galloud puissance 1.9979947504
+            # }
+
+            if crispiness > max_crispiness:  # {
+                max_crispiness = crispiness
+            # }
+        # }
+    # }
+# }
+
+print('max_crispiness: %.10f' % (max_crispiness), file=sys.stderr)
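
A note for reviewers on the counting core being packaged: for every ambiguous source word, BiltransCounter.process_lu_internal in biltrans_count_common.py collects the source-language n-gram contexts around the word (the preceding, following, and surrounding windows for each width up to max_ngrams - 1). A minimal standalone sketch of that windowing, assuming a plain list of source tokens and leaving out the common.wrap() escaping; the function name and example tokens below are illustrative, not part of the commit:

    def context_ngrams(cur_sl_row, idx, max_ngrams=3):
        # Mirrors process_lu_internal: for each width j, collect the window
        # ending at idx (pregram), starting at idx (postgram), and centred
        # on idx (roundgram). Slice edge cases behave exactly as in the script.
        grams = []
        for j in range(1, max_ngrams):
            grams.append(' '.join(cur_sl_row[idx-j:idx+1]))    # pregram
            grams.append(' '.join(cur_sl_row[idx:idx+j+1]))    # postgram
            grams.append(' '.join(cur_sl_row[idx-j:idx+j+1]))  # roundgram
        return grams

    # For cur_sl_row = ['un', 'digarez', 'da'] and idx = 1 this yields
    # contexts such as 'un digarez', 'digarez da' and 'un digarez da'.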
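
And the pruning criterion: ngram-pruning-frac.py keeps a context only when its preferred translation is sufficiently "crisper" than the default translation. A minimal sketch of that test, assuming counts have already been aggregated per (sl, ngram) context; the function name, example counts, and usage below are illustrative, not part of the commit:

    def crispiness(counts, default_tl):
        # crispiness = (share of the most frequent translation)
        #            / (share of the default translation),
        # as computed in the only_max branch of ngram-pruning-frac.py.
        total = sum(counts.values())
        best_tl = max(counts, key=counts.get)
        alt_crisp = counts[best_tl] / total
        # As in the script, a default unseen in this context counts as share 1.0.
        def_crisp = counts.get(default_tl, total) / total
        if def_crisp == 0.0:
            return best_tl, float('inf')  # the script warns and skips this case
        return best_tl, alt_crisp / def_crisp

    # Hypothetical counts for 'digarez' in one context (cf. the examples above):
    counts = {'excuse': 0.9917274061, 'occasion': 0.0082725939}
    best, crisp = crispiness(counts, 'excuse')
    # A rule for this context is only worth emitting when crisp exceeds the
    # crispiness threshold (crisphold, default 3.0).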