commit 587e4f0a51c495873e89f866ba797d753e030293 Author: Daniel Swanson Date: Mon Jul 12 12:54:11 2021 -0500 reduce script code duplication diff --git a/scripts/apertium-lex-evaluate.py b/scripts/apertium-lex-evaluate.py index e6bba14..05e59a0 100644 --- a/scripts/apertium-lex-evaluate.py +++ b/scripts/apertium-lex-evaluate.py @@ -2,160 +2,122 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys; - -# src = output of translator up to lt-proc -b -# ref = reference corpus -# tst = output of lexical selection module - -if len(sys.argv) < 4: #{ - print('apertium-lex-evaluate [-d] [-l] '); - sys.exit(-1); -#} - -debug = False; -perLine = False; -quiet = False; - -if len(sys.argv) == 6: #{ - n_tst = sys.argv[5]; - n_ref = sys.argv[4]; - n_src = sys.argv[3]; - - debug = True; - perLine = True; - -elif len(sys.argv) == 5: #{ - n_tst = sys.argv[4]; - n_ref = sys.argv[3]; - n_src = sys.argv[2]; - if sys.argv[1] == '-d': #{ - debug = True; - elif sys.argv[1] == '-q': #{ - quiet = True; - elif sys.argv[1] == '-l': #{ - perLine = True; - else: #{ - printi('`' + sys.argv[1] + "' is not a valid option."); - sys.exit(-1); - #} -elif len(sys.argv) == 4: #{ - - n_tst = sys.argv[3]; - n_ref = sys.argv[2]; - n_src = sys.argv[1]; - -#} - -f_src = open(n_src); -f_ref = open(n_ref); -f_tst = open(n_tst); - -def lineToArray(line): #{ - current_word_sl = ''; - current_word_tl = ''; - current_words_tl = []; - firstWord = False; - inWord = False; - lus = []; - - for c in line: #{ - if c == '^': #{ - inWord = True; - firstWord = True; - continue; - elif c == '$': #{ - current_words_tl.append(current_word_tl); - current_word = (current_word_sl, current_words_tl); - lus.append(current_word); - #print current_word; - current_word_sl = ''; - current_word_tl = ''; - current_words_tl = []; - i = 0; - inWord = False; - continue; - elif c == '/': #{ - if not firstWord: #{ - current_words_tl.append(current_word_tl); - current_word_tl = ''; - elif firstWord: #{ - firstWord = False; - #} - continue; - 
#} - - if inWord and firstWord: #{ - current_word_sl = current_word_sl + c; +import argparse +import sys + +parser = argparse.ArgumentParser() +parser.add_argument('src', help='output of translator up to lt-proc -b') +parser.add_argument('ref', help='reference corpus') +parser.add_argument('tst', help='output of lexical selection module') +parser.add_argument('-d', '--debug', action='store_true') +parser.add_argument('-q', '--quiet', action='store_true') +parser.add_argument('-l', '--line', action='store_true') +args = parser.parse_args() + +def debug(msg): + global args + if args.debug: + print(msg, file=sys.stderr) + +f_src = open(args.src) +f_ref = open(args.ref) +f_tst = open(args.tst) + +def lineToArray(line): + current_word_sl = '' + current_word_tl = '' + current_words_tl = [] + firstWord = False + inWord = False + lus = [] + + for c in line: + if c == '^': + inWord = True + firstWord = True + continue + elif c == '$': + current_words_tl.append(current_word_tl) + current_word = (current_word_sl, current_words_tl) + lus.append(current_word) + current_word_sl = '' + current_word_tl = '' + current_words_tl = [] + i = 0 + inWord = False + continue + elif c == '/': + if firstWord: + firstWord = False + else: + current_words_tl.append(current_word_tl) + current_word_tl = '' + continue + + if inWord and firstWord: + current_word_sl = current_word_sl + c elif inWord and not firstWord: - current_word_tl = current_word_tl + c; - #} - #} - return lus; -#} - -def sanityChecks(l_src, l_ref, l_tst): #{ - if debug: - print('---', file=sys.stderr); - src_lu = []; - ref_lu = []; - tst_lu = []; - - src_lu = lineToArray(l_src); - ref_lu = lineToArray(l_ref); - tst_lu = lineToArray(l_tst); - - if debug: - print('src:' , src_lu, file=sys.stderr); - print('tst:' , tst_lu, file=sys.stderr); - print('ref:' , ref_lu, file=sys.stderr); - - if len(src_lu) != len(ref_lu): #{ - print('WARNING: Source and reference sentence have different number of lexical units.', file=sys.stderr); - 
print('SRC: ' , len(src_lu) , ": " + l_src, file=sys.stderr); - print('REF: ' , len(ref_lu) , ": " + l_ref, file=sys.stderr); - #} - - if len(src_lu) != len(tst_lu): #{ - print('WARNING: Source and test sentence have different number of lexical units.', file=sys.stderr); - print(len(src_lu) , ": " + l_src, file=sys.stderr); - print(len(tst_lu) , ": " + l_tst, file=sys.stderr); - #} + current_word_tl = current_word_tl + c + + return lus + +def sanityChecks(l_src, l_ref, l_tst): + debug('---') + src_lu = [] + ref_lu = [] + tst_lu = [] + + src_lu = lineToArray(l_src) + ref_lu = lineToArray(l_ref) + tst_lu = lineToArray(l_tst) + + debug('src: %s' % src_lu) + debug('tst: %s' % tst_lu) + debug('ref: %s' % ref_lu) + + if len(src_lu) != len(ref_lu): + print('WARNING: Source and reference sentence have different number of lexical units.', file=sys.stderr) + print('SRC: ' , len(src_lu) , ": " + l_src, file=sys.stderr) + print('REF: ' , len(ref_lu) , ": " + l_ref, file=sys.stderr) + + + if len(src_lu) != len(tst_lu): + print('WARNING: Source and test sentence have different number of lexical units.', file=sys.stderr) + print(len(src_lu) , ": " + l_src, file=sys.stderr) + print(len(tst_lu) , ": " + l_tst, file=sys.stderr) + # i) do a sanity check, look for outN in tst that aren't in src: LEX module is outputting strange stuff - for i in range(0, len(tst_lu)): #{ - if len(tst_lu[i][1]) > 1: #{ - print('WARNING: Test sentence has a translation with more than one option.', file=sys.stderr); - print(' ',src_lu[i], file=sys.stderr); - print(' ',ref_lu[i], file=sys.stderr); - print(' ',tst_lu[i][1], file=sys.stderr); - #} - for lu in tst_lu[i][1]: #{ - if lu not in src_lu[i][1]: #{ - print('WARNING: Test sentence has a translation option that can never ', file=sys.stderr); - print(' be generated by the MT system.', file=sys.stderr); - print(' TST: ', tst_lu[i], file=sys.stderr); - print(' SRC: ', src_lu[i], file=sys.stderr); - #} - #} - #} + for i in range(0, len(tst_lu)): + if 
len(tst_lu[i][1]) > 1: + print('WARNING: Test sentence has a translation with more than one option.', file=sys.stderr) + print(' ',src_lu[i], file=sys.stderr) + print(' ',ref_lu[i], file=sys.stderr) + print(' ',tst_lu[i][1], file=sys.stderr) + + for lu in tst_lu[i][1]: + if lu not in src_lu[i][1]: + print('WARNING: Test sentence has a translation option that can never ', file=sys.stderr) + print(' be generated by the MT system.', file=sys.stderr) + print(' TST: ', tst_lu[i], file=sys.stderr) + print(' SRC: ', src_lu[i], file=sys.stderr) + + + # ii) look for outN in ref that aren't in src: MT system has changed - for i in range(0, len(ref_lu)): #{ - for lu in ref_lu[i][1]: #{ - if lu not in src_lu[i][1]: #{ - print('WARNING: Reference sentence has a translation option that can never ', file=sys.stderr); - print(' be generated by the MT system.', file=sys.stderr); - print('REF: ', ref_lu[i], file=sys.stderr); - print('SRC: ', src_lu[i], file=sys.stderr); - #} - #} - #} + for i in range(0, len(ref_lu)): + for lu in ref_lu[i][1]: + if lu not in src_lu[i][1]: + print('WARNING: Reference sentence has a translation option that can never ', file=sys.stderr) + print(' be generated by the MT system.', file=sys.stderr) + print('REF: ', ref_lu[i], file=sys.stderr) + print('SRC: ', src_lu[i], file=sys.stderr) + + return (src_lu, ref_lu, tst_lu) - return (src_lu, ref_lu, tst_lu); -#} # Process: # Read linestep, for each line in the three files: @@ -171,79 +133,78 @@ def sanityChecks(l_src, l_ref, l_tst): #{ # iv) if it is in the ref, increase score for that LU by 1. 
# v) final score is number of good TL translations / total number of TL translations -lines = True; +lines = True + +lineno = 0 -lineno = 0; +total_ambig_lus = 0 +total_fallos = 0 -total_ambig_lus = 0; -total_fallos = 0; +while lines: -while lines: #{ + l_src = f_src.readline() + l_ref = f_ref.readline() + l_tst = f_tst.readline() - l_src = f_src.readline(); - l_ref = f_ref.readline(); - l_tst = f_tst.readline(); + if l_src.strip('[]') == '' and l_ref.strip('[]') == '' and l_tst.strip('[]') == '': + lines = False + continue - if l_src.strip('[]') == '' and l_ref.strip('[]') == '' and l_tst.strip('[]') == '': #{ - lines = False; - continue; - #} - lineno = lineno + 1; + lineno = lineno + 1 - (lu_src, lu_ref, lu_tst) = sanityChecks(l_src, l_ref, l_tst); + (lu_src, lu_ref, lu_tst) = sanityChecks(l_src, l_ref, l_tst) - num_ambig_lus = 0; - num_fallos = 0; + num_ambig_lus = 0 + num_fallos = 0 - for i in range(0, len(lu_tst)): #{ + for i in range(0, len(lu_tst)): # We are only interested in counting a mismatch as an error if the # source LU has more than one possible translation, and # the number of translations is lower in the reference. This means # that if we have two possible translations in both the source and # the reference, it should not be considered ambiguous as both are # valid. 
- if len(lu_src[i][1]) > 1 and len(lu_ref[i][1]) != len(lu_src[i][1]) and lu_ref[i][1] != lu_src[i][1]: #{ + if len(lu_src[i][1]) > 1 and len(lu_ref[i][1]) != len(lu_src[i][1]) and lu_ref[i][1] != lu_src[i][1]: # >> 2 3 station [u'station'] +++ [u'station', u'season', u'ski resort'] # XX station XX [u'station'] - num_ambig_lus = num_ambig_lus + 1; - if debug: - print('>>' , len(lu_tst[i][1]) , len(lu_src[i][1]) , lu_tst[i][1][0] , lu_ref[i][1] , '+++' , lu_src[i][1]); - print('XX', lu_tst[i][1][0] , 'XX ' , lu_ref[i][1]); - if lu_tst[i][1][0] not in lu_ref[i][1]: #{ - num_fallos = num_fallos + 1; - if debug: - print('MISMATCH: ' , lu_tst[i][1][0] , 'not in' , lu_ref[i][1]); - #} - #} - #} - - if num_fallos == 0 and num_ambig_lus == 0: #{ -# print 'WEIRD: ' , l_src ; -# print ' : ' , l_ref ; -# print ' : ' , l_tst ; - continue; - #} - err = float(num_fallos)/float(num_ambig_lus)*100; - errh = str(err).split('.')[0]; - errt = ''.join(str(err).split('.')[1][0:1]); - if perLine: - print(n_tst + ':' + str(lineno) + ' ' + str(num_fallos) + '/' + str(num_ambig_lus) + ' ' + errh + '.' + errt + '%'); - - total_ambig_lus = total_ambig_lus + num_ambig_lus; - total_fallos = total_fallos + num_fallos; -#} - -if total_fallos == 0 or total_ambig_lus == 0: #{ - print('what: ' , total_fallos ,total_ambig_lus); - print("Check you haven't tried to use the source as a reference"); -#} -err = float(total_fallos)/float(total_ambig_lus)*100; -errh = str(err).split('.')[0]; -errt = ''.join(str(err).split('.')[1][0:1]); -#print n_tst + ' ' + str(total_fallos) + '/' + str(total_ambig_lus) + ' ' + errh + '.' + errt + '%'; -if quiet: #{ - print(errh + '.' + errt); -else: #{ - print(str(total_fallos) + '/' + str(total_ambig_lus) + '\t' + errh + '.' 
+ errt + '%'); -#} + num_ambig_lus = num_ambig_lus + 1 + debug('>> %s %s %s +++' % (len(lu_tst[i][1]), len(lu_src[i][1]), + lu_tst[i][1][0], lu_ref[i][1], + lu_src[i][1])) + debug('XX %s XX %s' % (lu_tst[i][1][0], lu_ref[i][1])) + if lu_tst[i][1][0] not in lu_ref[i][1]: + num_fallos = num_fallos + 1 + debug('MISMATCH: %s not in %s' % (lu_tst[i][1][0], lu_ref[i][1])) + + + + + if num_fallos == 0 and num_ambig_lus == 0: +# print('WEIRD: ' , l_src) +# print(' : ' , l_ref) +# print(' : ' , l_tst) + continue + + err = float(num_fallos)/float(num_ambig_lus)*100 + errh = str(err).split('.')[0] + errt = ''.join(str(err).split('.')[1][0:1]) + if args.line: + print(n_tst + ':' + str(lineno) + ' ' + str(num_fallos) + '/' + str(num_ambig_lus) + ' ' + errh + '.' + errt + '%') + + total_ambig_lus = total_ambig_lus + num_ambig_lus + total_fallos = total_fallos + num_fallos + + +if total_fallos == 0 or total_ambig_lus == 0: + print('what: ' , total_fallos ,total_ambig_lus) + print("Check you haven't tried to use the source as a reference") + +err = float(total_fallos)/float(total_ambig_lus)*100 +errh = str(err).split('.')[0] +errt = ''.join(str(err).split('.')[1][0:1]) +#print(n_tst + ' ' + str(total_fallos) + '/' + str(total_ambig_lus) + ' ' + errh + '.' + errt + '%') +if args.quiet: + print(errh + '.' + errt) +else: + print(str(total_fallos) + '/' + str(total_ambig_lus) + '\t' + errh + '.' 
+ errt + '%') diff --git a/scripts/biltrans-count-patterns-frac-maxent.py b/scripts/biltrans-count-patterns-frac-maxent.py index 13c1931..c0962b9 100755 --- a/scripts/biltrans-count-patterns-frac-maxent.py +++ b/scripts/biltrans-count-patterns-frac-maxent.py @@ -2,7 +2,9 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy, math, re, common; +import sys, math, re, common +from collections import defaultdict +import biltrans_count_common as BCC # Input: # a) Frequency lexicon @@ -25,231 +27,50 @@ import sys, codecs, copy, math, re, common; #.[][56011 0].[] ^un/un$ ^digarez/excuse$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.9917274061 |@| #.[][56011 1].[] ^un/un$ ^digarez/occasion$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.0082725939 || -MAX_NGRAMS = 3; # Max = 5-grams -cur_line = 0; +sl_tl_defaults = {} +sl_tl = defaultdict(list) -re_sep = re.compile('\$[^\^]*\^'); +features = {} # features[(slword, ['a', 'list'], tlword)] = 3 -def split_line(line): - line = re_clean_start.sub('', line.split('\t')[1]); - line = re_clean_end.sub('$', line); - line = line[1:-1]; +indexes = {} +trad_counter = defaultdict(lambda: 0) - row = re_sep.split(line); - return row - - - - # am_row = re_sep.sub('$ ^', am_line.split('\t')[1])[1:-1].split('$ ^'); - - -sl_tl_defaults = {}; -sl_tl = {}; -ngrams = {}; +# First read in the frequency defaults -meevents = {}; # events[slword][counter] = [feat, feat, feat]; -meoutcomes = {}; # meoutcomes[slword][counter] = tlword; -event_counter = 0; +for line in open(sys.argv[1]): + line = line.strip() + if len(line) < 1: + continue -features = {}; # features[(slword, ['a', 'list'], tlword)] = 3 -feature_counter = 0; + row = common.tokenize_tagger_line(line) + sl = common.wrap(row[0]) + tl = common.wrap(row[1]) + if tl[1] == '*': + tl = tl[:-3] + '$' -indexes = {}; -trad_counter = {}; + indexes[(sl, tl)] = trad_counter[sl] + trad_counter[sl] += 1 + 
sl_tl[sl].append(tl) -am_counter = 0; -dm_counter = 0; + if line.count('@') > 0: + sl_tl_defaults[sl] = tl +class Counter(BCC.BiltransCounter): + tokenizer = 'biltrans' + line_ids = True + count_ngrams = True + max_ngrams = 3 + biltrans_wrap_lus = True -# First read in the frequency defaults + def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0): + global sl_tl, features, indexes + BCC.features_and_outline(self.ngrams, sl, tl, sl_tl, features, + indexes, frac_count=frac_count) + self.clear_ngrams() -for line in open(sys.argv[1]): #{ - line = line.strip(); - if len(line) < 1: #{ - continue; - #} - row = common.tokenize_tagger_line(line); - sl = common.wrap(row[0]); - tl = common.wrap(row[1]); - if tl[1] == '*': - tl = tl[:-3] + '$' - if sl not in trad_counter: #{ - trad_counter[sl] = 0; - #} - if sl not in sl_tl: #{ - sl_tl[sl] = []; - #} - if line.count('@') > 0: #{ - sl_tl_defaults[sl] = tl; - sl_tl[sl].append(tl); - indexes[(sl, tl)] = trad_counter[sl]; - trad_counter[sl] = trad_counter[sl] + 1; - else: #{ - sl_tl[sl].append(tl); - indexes[(sl, tl)] = trad_counter[sl]; - trad_counter[sl] = trad_counter[sl] + 1; - #} -#} - -am_file = open(sys.argv[2]); # File with ambiguous biltrans output -dm_file = open(sys.argv[3]); # File with disambiguated biltrans output -reading = True; - -current_am_line_id = -1; -current_dm_line_id = -1; - -dm_line = dm_file.readline(); -current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - -while reading: #{ - am_line = am_file.readline(); - - if am_line == '': #{ - reading = False; - continue; - #} - - current_am_line_id = int(am_line.split('\t')[0]) - while current_dm_line_id == current_am_line_id: #{ - am_row = common.tokenize_biltrans_line(am_line); - dm_row = common.tokenize_biltrans_line(dm_line); - - if len(am_row) != len(dm_row): #{ - print('Mismatch in number of LUs between analysis and training', len(am_row), len(dm_row), 'lines', current_am_line_id, current_dm_line_id, file=sys.stderr); - print('\t' + 
am_line, file=sys.stderr); - print('\t' + dm_line, file=sys.stderr); - print('...skipping', file=sys.stderr); - dm_line = dm_file.readline() - current_dm_line_id = int(dm_line.split('\t')[0]); - continue; - #} - - try: - frac_count = 0.0; - s_fc = dm_line.split('\t')[2].strip(); - if s_fc == '' or len(s_fc) == 0: #{ -# print('%d %d :: %d %d :: Frac count is not floatable' % (am_counter, dm_counter, current_am_line_id, current_dm_line_id), file=sys.stderr); - dm_line = dm_file.readline() - current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - continue; - #} - - frac_count = float(s_fc); - - if math.isnan(frac_count): #{ -# print('%d %d :: %d %d :: Frac count is not a number' % (am_counter, dm_counter, current_am_line_id, current_dm_line_id), file=sys.stderr); - frac_count = 0.0; - #} - except: - dm_line = dm_file.readline() - current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - continue; - - cur_sl_row = [x['sl'] for x in am_row] - limit = len(am_row); - for i in range(0, limit): #{ - if len(am_row[i]['tls']) > 1: #{ - sl = common.wrap(am_row[i]['sl']) - tl = common.wrap(dm_row[i]['tls'][0]) - - for j in range(1, MAX_NGRAMS): #{ - pregram = ' '.join(map(common.wrap, cur_sl_row[i-j:i+1])); - postgram = ' '.join(map(common.wrap, cur_sl_row[i:i+j+1])); - roundgram = ' '.join(map(common.wrap, cur_sl_row[i-j:i+j+1])); - - if sl not in ngrams: #{ - ngrams[sl] = {}; - #} - if pregram not in ngrams[sl]: #{ - ngrams[sl][pregram] = {}; - #} - if postgram not in ngrams[sl]: #{ - ngrams[sl][postgram] = {}; - #} - if roundgram not in ngrams[sl]: #{ - ngrams[sl][roundgram] = {}; - #} - if tl not in ngrams[sl][pregram]: #{ - ngrams[sl][pregram][tl] = 0.0; - #} - if tl not in ngrams[sl][postgram]: #{ - ngrams[sl][postgram][tl] = 0.0; - #} - if tl not in ngrams[sl][roundgram]: #{ - ngrams[sl][roundgram][tl] = 0.0; - #} - - ngrams[sl][pregram][tl] = ngrams[sl][pregram][tl] + frac_count; - ngrams[sl][postgram][tl] = ngrams[sl][postgram][tl] + frac_count; - 
ngrams[sl][roundgram][tl] = ngrams[sl][roundgram][tl] + frac_count; - #} - if sl not in meevents: #{ - meevents[sl] = {}; - #} - if sl not in meoutcomes: #{ - meoutcomes[sl] = {}; - #} - if event_counter not in meevents: #{ - meevents[sl][event_counter] = []; - #} - if event_counter not in meoutcomes: #{ - meoutcomes[sl][event_counter] = ''; - #} - for ni in ngrams[sl]: #{ - if ni not in features: #{ - feature_counter = feature_counter + 1; - features[ni] = feature_counter; - #} - meevents[sl][event_counter].append(features[ni]); - #meevents[sl][event_counter].append(feat); - #meoutcomes[sl][event_counter] = (tl, frac_count); - meoutcomes[sl][event_counter] = (tl, int(frac_count * 10000)); - - #} - del ngrams; - ngrams = {}; - if sl not in sl_tl: #{ - continue; - #} - if len(sl_tl[sl]) < 2: #{ - continue; - #} - - for event in meevents[sl]: #{ - outline = str(indexes[(sl, meoutcomes[sl][event][0])]) + ' $ ' ; - outline = outline + str(meoutcomes[sl][event][1]) + ' # '; - for j in range(0, len(sl_tl[sl])): #{ - for feature in meevents[sl][event]: #{ - outline = outline + str(feature) + ':' + str(j) + ' '; - #} - outline = outline + ' # ' - #} - print(sl , '\t', len(sl_tl[sl]),'\t', outline); - #} - del meevents; - del meoutcomes; - meevents = {}; - meoutcomes = {}; - #} - #} - - dm_line = dm_file.readline(); - if dm_line == '': #{ - reading = False; - break; - #} - - current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - event_counter = event_counter + 1; - - dm_counter += 1; - - #} - am_counter += 1; - -#} - -for feature in features: #{ - print(features[feature] , '\t' , feature, file=sys.stderr); -#} +c = Counter() +c.read_files(sys.argv[2], # File with ambiguous biltrans output + sys.argv[3]) # File with disambiguated biltrans output +for feature in features: + print(features[feature] , '\t' , feature, file=sys.stderr) diff --git a/scripts/biltrans-count-patterns-frac.py b/scripts/biltrans-count-patterns-frac.py index 7e03ae7..3384566 100755 --- 
a/scripts/biltrans-count-patterns-frac.py +++ b/scripts/biltrans-count-patterns-frac.py @@ -2,8 +2,10 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy, re; -import common; +import sys, re +import common +from collections import defaultdict +import biltrans_count_common as BCC # Input: # a) Frequency lexicon @@ -28,202 +30,43 @@ import common; # d) Crispiness threshold -MAX_NGRAMS = 3; # Max = 5-grams +crisphold = 3.0 # Default +only_max = True +#only_max = False +cache_counts = open('/tmp/cache_counts.log', 'w+') -cur_line = 0; -crisphold = 3.0 ; # Default -only_max = True; -#only_max = False; -cache_counts = open('/tmp/cache_counts.log', 'w+'); +if len(sys.argv) == 5: + crisphold = float(sys.argv[4]) + print('crisp:', crisphold, file=sys.stderr) -if len(sys.argv) == 5: #{ - crisphold = float(sys.argv[4]); - print('crisp:', crisphold, file=sys.stderr); -#} +# First read in the frequency defaults -sl_tl_defaults = {}; -sl_tl = {}; -ngrams = {}; +sl_tl, sl_tl_defaults, _ = BCC.read_frequencies(sys.argv[1]) -# First read in the frequency defaults +print('Reading...', file=sys.stderr) +sys.stderr.flush() -for line in open(sys.argv[1]).readlines(): #{ - if len(line) < 1: #{ - continue; - #} - row = line.split(' '); - sl = row[1]; - tl = row[2]; - fr = float(row[0]); - if line.count('@') and fr == 0.0: #{ - print('!!! 
Prolly something went wrong here, the default has a freq of 0.0', file=sys.stderr); - print(' %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr); - #} - if line.count('@') > 0: #{ - print('default:', sl, tl, file=sys.stderr); - sl_tl_defaults[sl] = tl; - else: #{ - sl_tl[sl] = tl; - #} - -#} - -print('Reading...', file=sys.stderr); -sys.stderr.flush(); - -am_file = open(sys.argv[2]); # File with ambiguous biltrans output -dm_file = open(sys.argv[3]); # File with disambiguated biltrans output -reading = True; - -current_am_line_id = -1; -current_dm_line_id = -1; - -rsep = re.compile('\$[^\^]*\^'); - -dm_line = dm_file.readline(); -current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - -am_counter = 0; -dm_counter = 0; - - -while reading: #{ - am_line = am_file.readline(); - - if am_line == '': #{ - reading = False; - continue; - #} - - current_am_line_id += 1 - -# # to skip lines in the frac corpus if we have a sub-corpus -# if current_dm_line_id != current_am_line_id: #{ -# print('line_id_mismatch: %d != %d' % (current_am_line_id, current_dm_line_id), file=sys.stderr); -# while current_dm_line_id != current_am_line_id: #{ -# dm_line = dm_file.readline(); -# current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); -# print('skipping %d ...' 
% (current_dm_line_id), file=sys.stderr); -# #} -# #} - while current_dm_line_id == current_am_line_id: #{ - - am_row = common.tokenize_biltrans_line(am_line); - dm_row = common.tokenize_biltrans_line(dm_line); - - if len(am_row) != len(dm_row): #{ - print('Mismatch in number of LUs between analysis and training', file=sys.stderr); - print('\t' + am_line, file=sys.stderr); - print('\t' + dm_line, file=sys.stderr); - print('...skipping', file=sys.stderr); - continue; - #} - - cur_sl_row = []; - for lu in am_row: #{ - sl = lu.split('/')[0]; - if sl.count('><') > 0: #{ - sl = sl.split('><')[0] + '>'; - #} - cur_sl_row.append(sl); - #} +class Counter(BCC.BiltransCounter): + tokenizer = 'biltrans' + line_ids = True + count_ngrams = True + max_ngrams = 3 - try: - frac_count = float(dm_line.split('\t')[2]); - except: - break; - - limit = len(am_row); - for i in range(0, limit): #{ - if am_row[i].count('/') > 1: #{ - #print(am_row[i] , dm_row[i]); - sl = am_row[i].split('/')[0].replace(' ', '~'); - tl = dm_row[i].split('/')[1].replace(' ', '~'); - if sl.count('><') > 0: #{ - sl = sl.split('><')[0] + '>'; - #} - if tl.count('><') > 0: #{ - tl = tl.split('><')[0] + '>'; - #} - -# if tl != sl_tl_defaults[sl]: #{ -# print('+' , sl , sl_tl_defaults[sl] , tl, file=sys.stderr); -# else: #{ -# print('-' , sl , sl_tl_defaults[sl] , tl, file=sys.stderr); -# #} - - for j in range(1, MAX_NGRAMS): #{ - pregram = ' '.join(cur_sl_row[i-j:i+1]); - postgram = ' '.join(cur_sl_row[i:i+j+1]); - roundgram = ' '.join(cur_sl_row[i-j:i+j+1]); - - if sl not in ngrams: #{ - ngrams[sl] = {}; - #} - if pregram not in ngrams[sl]: #{ - ngrams[sl][pregram] = {}; - #} - if postgram not in ngrams[sl]: #{ - ngrams[sl][postgram] = {}; - #} - if roundgram not in ngrams[sl]: #{ - ngrams[sl][roundgram] = {}; - #} - - if tl not in ngrams[sl][pregram]: #{ - ngrams[sl][pregram][tl] = 0.0; - #} - if tl not in ngrams[sl][postgram]: #{ - ngrams[sl][postgram][tl] = 0.0; - #} - if tl not in ngrams[sl][roundgram]: #{ - 
ngrams[sl][roundgram][tl] = 0.0; - #} - ngrams[sl][pregram][tl] = ngrams[sl][pregram][tl] + frac_count; - ngrams[sl][postgram][tl] = ngrams[sl][postgram][tl] + frac_count; - ngrams[sl][roundgram][tl] = ngrams[sl][roundgram][tl] + frac_count; - -# print('=> %s\t[%.10f] %s' % (tl, ngrams[sl][pregram][tl], pregram), file=sys.stderr); -# print('=> %s\t[%.10f] %s' % (tl, ngrams[sl][roundgram][tl], roundgram), file=sys.stderr); -# print('=> %s\t[%.10f] %s' % (tl, ngrams[sl][postgram][tl], postgram), file=sys.stderr); - - - #} - #} - #} - - dm_line = dm_file.readline(); - if dm_line == '': #{ - reading = False; - break; - #} - current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - - dm_counter += 1; - #} - am_counter += 1; - - if am_counter % 10000 == 0: #{ - print('=> %d SL and %d TL lines [id: %d] [ngrams: %d].' % (am_counter, dm_counter, current_am_line_id, len(ngrams)), file=sys.stderr); - sys.stderr.flush(); - #} -#} - -print('Caching counts...', file=sys.stderr); -for sl in ngrams: #{ - - for ngram in ngrams[sl]: #{ - - for tl in ngrams[sl][ngram]: #{ - print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl), file=cache_counts); - #} - #} -#} -print('\n', file=sys.stderr); - -for sl in ngrams: #{ - - for ngram in ngrams[sl]: #{ +c = Counter() +c.read_files(sys.argv[2], # File with ambiguous biltrans output + sys.argv[3]) # File with disambiguated biltrans output +ngrams = c.ngrams + +print('Caching counts...', file=sys.stderr) +for sl in ngrams: + for ngram in ngrams[sl]: + for tl in ngrams[sl][ngram]: + print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl), file=cache_counts) + +print('\n', file=sys.stderr) + +for sl in ngrams: + for ngram in ngrams[sl]: try: #> If for each of the rules we include #> the amount of time the translation is seen with that pattern over the @@ -241,74 +84,65 @@ for sl in ngrams: #{ #It would be "2" in this case: the alternative is seen twice as often as #the default. 
- total = 0.0; - max_freq = 0.0; - max_tl = ''; - for tl in ngrams[sl][ngram]: #{ - if ngrams[sl][ngram][tl] > max_freq: #{ - max_freq = ngrams[sl][ngram][tl]; - max_tl = tl; - #} - total = total + ngrams[sl][ngram][tl]; - #} - - if only_max == True: #{ - crispiness = 0.0; - default = sl_tl_defaults[sl]; - # if default == max_tl: #{ - # print('default=max_tl', default, max_tl, '\t', ngram, file=sys.stderr); - # else:#{ - # print('default!=max_tl', default, max_tl, '\t', ngram, file=sys.stderr); - # #} - alt_crisp = float(ngrams[sl][ngram][max_tl]) / float(total); - def_crisp = 1.0; - if default in ngrams[sl][ngram]: #{ - def_crisp = float(ngrams[sl][ngram][default] / float(total)); - #} - weight = float(ngrams[sl][ngram][max_tl]) / float(total); - crispiness = alt_crisp/def_crisp; - - if crispiness < crisphold: #{ - print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl])); - # print('-', crispiness , weight , total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], '\t'+ sl + '\t' + ngram + '\t' + max_tl + '\t' + str(ngrams[sl][ngram][max_tl])); - else: #{ - - print('+ %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl])); - #print('+', crispiness , weight , total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], '\t' + sl + '\t' + ngram + '\t' + max_tl + '\t' + str(ngrams[sl][ngram][max_tl])); - #} + total = 0.0 + max_freq = 0.0 + max_tl = '' + for tl in ngrams[sl][ngram]: + if ngrams[sl][ngram][tl] > max_freq: + max_freq = ngrams[sl][ngram][tl] + max_tl = tl + + total += ngrams[sl][ngram][tl] + + if only_max == True: + crispiness = 0.0 + default = sl_tl_defaults[sl] + # if default == max_tl: + # print('default=max_tl', default, max_tl, 
'\t', ngram, file=sys.stderr) + # else: + # print('default!=max_tl', default, max_tl, '\t', ngram, file=sys.stderr) + # + alt_crisp = float(ngrams[sl][ngram][max_tl]) / float(total) + def_crisp = 1.0 + if default in ngrams[sl][ngram]: + def_crisp = float(ngrams[sl][ngram][default] / float(total)) + + weight = float(ngrams[sl][ngram][max_tl]) / float(total) + crispiness = alt_crisp/def_crisp + + if crispiness < crisphold: + print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl])) + # print('-', crispiness , weight , total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], '\t'+ sl + '\t' + ngram + '\t' + max_tl + '\t' + str(ngrams[sl][ngram][max_tl])) + else: + + print('+ %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl])) + #print('+', crispiness , weight , total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], '\t' + sl + '\t' + ngram + '\t' + max_tl + '\t' + str(ngrams[sl][ngram][max_tl])) + # crispiness weight total default max_freq tl_freq sl #+ 2.61845457309 0.7236389238 1.0 0.2763610762 0.7236389238 0.7236389238 aozer aozer an levr organisateur 0.7236389238 #- 14736.0468727 0.9999321438 1.0 0.9999321438 0.9999321438 treuzkas treuzkas teknologel transfert 0.9999321438 + else: + for tl in ngrams[sl][ngram]: + crispiness = 0.0 + default = sl_tl_defaults[sl] + alt_crisp = float(ngrams[sl][ngram][tl]) / float(total) + def_crisp = 1.0 + if default in ngrams[sl][ngram]: + def_crisp = float(ngrams[sl][ngram][default] / float(total)) + weight = float(ngrams[sl][ngram][tl]) / float(total) + crispiness = alt_crisp/def_crisp - else: #{ - - for tl in ngrams[sl][ngram]: #{ - - crispiness = 0.0; - default = sl_tl_defaults[sl]; - alt_crisp = 
float(ngrams[sl][ngram][tl]) / float(total); - def_crisp = 1.0; - if default in ngrams[sl][ngram]: #{ - def_crisp = float(ngrams[sl][ngram][default] / float(total)); - #} - weight = float(ngrams[sl][ngram][tl]) / float(total); - crispiness = alt_crisp/def_crisp; + #print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] - #print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] ; + if crispiness < crisphold: + print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl])) + else: + print('+ %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl])) - if crispiness < crisphold: #{ - print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl])); - else: #{ - print('+ %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl])); - #} #+ 1013.01568891 0.9989973752 2.0 1.9979947504 1.9979947504 galloud ha an galloud puissance 1.9979947504 - #} - #} except: pass - #} -#} diff --git a/scripts/biltrans-count-patterns-me.py b/scripts/biltrans-count-patterns-me.py index 2d03483..c5ae682 100755 --- a/scripts/biltrans-count-patterns-me.py +++ b/scripts/biltrans-count-patterns-me.py @@ -2,189 +2,37 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy; +import sys +from collections import defaultdict +import common +import biltrans_count_common as BCC # Input: # a) Frequency lexicon # b) Biltrans output # c) Disambiguated biltrans output -MAX_NGRAMS = 3; +features = {} # features[(sl, ['a', 
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Emit maxent training events from ambiguous vs. disambiguated biltrans
output: one event per ambiguous lexical unit, with context n-grams as
features.

Input:
  a) Frequency lexicon
  b) Biltrans output
  c) Disambiguated biltrans output
"""

import sys
from collections import defaultdict
import common
import biltrans_count_common as BCC

# features[ngram] -> 1-based feature id (filled in by
# BCC.features_and_outline; keys are n-gram context strings).
features = {}

sl_tl, sl_tl_defaults, indexes = BCC.read_frequencies(sys.argv[1])

class Counter(BCC.BiltransCounter):
    tokenizer = 'regex'
    line_ids = False
    count_ngrams = True
    max_ngrams = 3

    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
        global sl_tl, sl_tl_defaults, features, indexes
        # '-' marks the default translation, '+' an alternative.
        sym = '-' if tl == sl_tl_defaults[sl] else '+'
        print(sym, sl, sl_tl_defaults[sl], tl, file=sys.stderr)
        BCC.features_and_outline(self.ngrams, sl, tl, sl_tl, features,
                                 indexes)
        # The n-gram table is per-event here, so reset after each LU.
        self.clear_ngrams()

c = Counter()
c.read_files(sys.argv[2],  # File with ambiguous biltrans output
             sys.argv[3])  # File with disambiguated biltrans output

# Dump the feature-id <-> n-gram mapping for later decoding.
for feature in features:
    print(features[feature], '\t', feature, file=sys.stderr)
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Count (source n-gram, translation) co-occurrences weighted by the
fractional counts carried on the disambiguated lines, and dump the raw
counts for later crispiness scoring.

Input:
  a) Frequency lexicon
  b) Biltrans output
  c) Disambiguated biltrans output
  d) Crispiness threshold (optional, argv[4])
"""

import sys
import biltrans_count_common as BCC

cur_line = 0
crisphold = 3.0  # Default
only_max = True  # NOTE: retained from the pre-refactor script; unused here
#only_max = False

if len(sys.argv) == 5:
    crisphold = float(sys.argv[4])
    print('crisp:', crisphold, file=sys.stderr)

# First read in the frequency defaults
sl_tl, sl_tl_defaults, _ = BCC.read_frequencies(sys.argv[1])

print('Reading...', file=sys.stderr)
sys.stderr.flush()

class Counter(BCC.BiltransCounter):
    tokenizer = 'biltrans'
    line_ids = True       # dm lines carry '.[][N ...' sentence ids
    count_ngrams = True
    max_ngrams = 3

c = Counter()
c.read_files(sys.argv[2],  # File with ambiguous biltrans output
             sys.argv[3])  # File with disambiguated biltrans output
ngrams = c.ngrams

print('Caching counts...', file=sys.stderr)
for sl in ngrams:
    for ngram in ngrams[sl]:
        for tl in ngrams[sl][ngram]:
            print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl))

print('\n', file=sys.stderr)
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Count (source n-gram, translation) co-occurrences and rate each pair
by "crispiness": how much more often an alternative translation is seen
with a pattern than the default translation is.

Input:
  a) Frequency lexicon
  b) Biltrans output
  c) Disambiguated biltrans output
  d) Crispiness threshold (optional, argv[4])
"""

import sys, re
import common
import biltrans_count_common as BCC

cur_line = 0
crisphold = 3.0  # pairs below this threshold are printed with '-'
if len(sys.argv) == 5:
    crisphold = float(sys.argv[4])
    print('crisp:', crisphold, file=sys.stderr)

# sl_tl: last non-default translation per source word;
# sl_tl_defaults: the '@'-marked default translation per source word.
sl_tl, sl_tl_defaults, _ = BCC.read_frequencies(sys.argv[1])

class Counter(BCC.BiltransCounter):
    tokenizer = 'regex'
    line_ids = False
    count_ngrams = True
    max_ngrams = 3

    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
        # Log whether the chosen translation is the default ('-') or an
        # alternative ('+') for this ambiguous LU.
        global sl_tl_defaults
        sym = '-' if tl == sl_tl_defaults[sl] else '+'
        print(sym, sl, sl_tl_defaults[sl], tl, file=sys.stderr)


c = Counter()
c.read_files(sys.argv[2],  # File with ambiguous biltrans output
             sys.argv[3])  # File with disambiguated biltrans output
ngrams = c.ngrams

for sl in ngrams:
    for ngram in ngrams[sl]:
        total = 0
        max_freq = -1
        current_tl = ''
        # Find the most frequent translation for this pattern and the
        # total count across all translations.
        for tl in ngrams[sl][ngram]:
            if ngrams[sl][ngram][tl] > max_freq:
                max_freq = ngrams[sl][ngram][tl]
                current_tl = tl
            total += ngrams[sl][ngram][tl]

        for tl in ngrams[sl][ngram]:
            default = sl_tl_defaults[sl]
            # crispiness = (share of this tl) / (share of the default);
            # def_crisp stays 1.0 when the default never co-occurs with
            # this pattern.
            alt_crisp = float(ngrams[sl][ngram][tl]) / float(total)
            def_crisp = 1.0
            if default in ngrams[sl][ngram]:
                def_crisp = float(ngrams[sl][ngram][default] / float(total))

            weight = float(ngrams[sl][ngram][tl]) / float(total)
            crispiness = alt_crisp / def_crisp

            if crispiness < crisphold:
                print('-', crispiness , weight , total, max_freq, ngrams[sl][ngram][tl], '\t'+ sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]))
            else:
                # NOTE(review): the '+' branch prints current_tl's count
                # in the last column while the '-' branch prints tl's own
                # count; this asymmetry is inherited from the
                # pre-refactor script — confirm it is intentional.
                print('+', crispiness , weight , total, max_freq, ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][current_tl]))
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Extract a fractional-count frequency lexicon from ambiguous and
disambiguated biltrans output.

Input:
  a) Biltrans output (ambiguous)
  b) Disambiguated biltrans output carrying fractional counts
"""

import sys
from collections import defaultdict
import common  # FIX: common.wrap() is used below but the refactor dropped this import (NameError)
import biltrans_count_common as BCC

# The sl-tl possible combinations: sl_tl[sl][tl] accumulates the
# fractional count of each pairing.
sl_tl = defaultdict(lambda: defaultdict(lambda: 0.0))

class Counter(BCC.BiltransCounter):
    tokenizer = 'biltrans'
    line_ids = True   # dm lines carry '.[][N ...' sentence ids

    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
        global sl_tl
        sl_tl[sl][tl] += frac_count

c = Counter()
c.read_files(sys.argv[1],  # File with ambiguous biltrans output
             sys.argv[2])  # File with disambiguated biltrans output

# Emit translations per source word, most frequent first; the first
# (most frequent) one is marked '@' as the default.
for sl in sl_tl:
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True
    for tl in newtl:
        if first:
            print('%.10f %s %s @' % (sl_tl[sl][tl] , common.wrap(sl) , common.wrap(tl)))
            first = False
        else:
            print('%.10f %s %s' % (sl_tl[sl][tl] , common.wrap(sl) , common.wrap(tl)))
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Extract an integer-count frequency lexicon from ambiguous and
disambiguated biltrans output.

Input:
  a) Biltrans output (ambiguous)
  b) Disambiguated biltrans output
"""

import sys
import common
import biltrans_count_common as BCC
from collections import defaultdict

# The sl-tl possible combinations: sl_tl[sl][tl] counts each pairing.
sl_tl = defaultdict(lambda: defaultdict(lambda: 0))

class Counter(BCC.BiltransCounter):
    tokenizer = 'biltrans'
    line_ids = False

    # FIX: the refactor defined `processs_row` (typo), which overrode
    # nothing — and its body string-parsed rows that the 'biltrans'
    # tokenizer actually yields as dicts.  Override process_lu instead
    # and let the base class handle tokenization and ambiguity checks.
    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
        global sl_tl
        sl_tl[sl][tl] += 1

c = Counter()
c.read_files(sys.argv[1],  # File with ambiguous biltrans output
             sys.argv[2])  # File with disambiguated biltrans output

# Emit translations per source word, most frequent first; the first
# (most frequent) one is marked '@' as the default.
for sl in sl_tl:
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True
    for tl in newtl:
        if first:
            print(sl_tl[sl][tl] , sl , tl , '@')
            first = False
        else:
            print(sl_tl[sl][tl] , sl , tl)
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Poor man's frequency lexicon: count, for each ambiguous source word,
which of its possible translations actually appear anywhere in the
(unaligned) target line.

Input:
  a) Biltrans output (ambiguous)
  b) Biltrans output (target side)
"""

import sys
import biltrans_count_common as BCC
from collections import defaultdict

# The sl-tl possible combinations
sl_tl = defaultdict(lambda: defaultdict(lambda: 0))

class Counter(BCC.BiltransCounter):
    # NOTE(review): this override treats self.am_row/self.dm_row as raw
    # LU strings, which matches the 'regex' tokenizer; the 'biltrans'
    # tokenizer yields dicts, so `.count('/')` below would fail —
    # confirm the intended tokenizer against common.tokenize_biltrans_line.
    tokenizer = 'biltrans'
    line_ids = False

    def process_row(self, frac_count=0):
        global sl_tl
        for i in range(len(self.am_row)):
            if self.am_row[i].count('/') > 1:
                # FIX: the refactor referenced bare `am_row`/`dm_row`
                # (NameError); these are attributes on self.
                sl = BCC.strip_tags(self.am_row[i], 'sl', space=True)

                # Every listed translation of this LU is a candidate.
                bts = self.am_row[i].split('/')[1:]
                valid_trads = set(BCC.strip_tags(b, 'sl', space=True)
                                  for b in bts)

                # Count any target-side token that matches a candidate.
                for tl_ in self.dm_row:
                    tl = BCC.strip_tags(tl_, 'sl', space=True)
                    if tl in valid_trads:
                        sl_tl[sl][tl] += 1

c = Counter()
c.read_files(sys.argv[1],  # File with ambiguous biltrans output
             sys.argv[2])  # File with biltrans output

# Emit translations per source word, most frequent first; the first
# (most frequent) one is marked '@' as the default.
for sl in sl_tl:
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True
    for tl in newtl:
        if first:
            print(sl_tl[sl][tl] , sl , tl , '@')
            first = False
        else:
            print(sl_tl[sl][tl] , sl , tl)
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Shared machinery for the biltrans counting scripts: reading the
frequency lexicon, walking an ambiguous/disambiguated pair of biltrans
files in lockstep, and accumulating context n-gram counts."""

from collections import defaultdict
import common
import math
import re
import sys

def safe_float(s):
    """Parse *s* as a float.

    Returns (value, ok); ok is False (and value 0.0) for empty input,
    NaN, or anything float() rejects.
    """
    if not s:
        return 0.0, False
    try:
        f = float(s)
        if math.isnan(f):
            return 0.0, False
        return f, True
    except (TypeError, ValueError):  # narrowed from a bare except
        return 0.0, False

def strip_tags(s, side, space=False):
    """Return the sl ('word/...') or tl ('.../word') half of a biltrans
    LU string, truncated after the first tag; spaces become '~' unless
    space=True."""
    idx = 0 if side == 'sl' else 1
    ret = s.split('/')[idx]
    if not space:
        ret = ret.replace(' ', '~')
    if ret.count('><') > 0:
        ret = ret.split('><')[0] + '>'
    return ret

class BiltransCounter:
    """Reads an ambiguous (am) and a disambiguated (dm) biltrans file in
    parallel and hands each ambiguous LU to process_lu() (or the whole
    line to process_row()), optionally accumulating n-gram counts in
    self.ngrams.  Subclasses configure behaviour via the class
    attributes below.
    """

    lu_sep = re.compile(r'\$[^\^]*\^')  # separator between '$ ... ^' LUs
    tokenizer = 'regex'        # 'regex' or 'biltrans'
    line_ids = False           # True when dm lines carry '.[][N ...' ids
    count_ngrams = False
    max_ngrams = 3
    biltrans_wrap_lus = False  # wrap sl/tl with common.wrap() before use

    def __init__(self):
        self.reading = False

        self.am_file = None
        self.am_line = None
        self.am_row = None
        self.am_id = None
        self.am_linenum = 0
        self.dm_file = None
        self.dm_line = None
        self.dm_row = None
        self.dm_id = None
        # FIX: the original assigned am_linenum a second time here,
        # leaving dm_linenum undefined (AttributeError in next_dm_line).
        self.dm_linenum = 0

        self.clear_ngrams()

    def __del__(self):
        if self.am_file:
            self.am_file.close()
        if self.dm_file:
            self.dm_file.close()

    def next_am_line(self):
        """Advance the ambiguous file by one line, tokenizing into
        self.am_row; clears self.reading at EOF."""
        self.am_line = self.am_file.readline()
        self.am_linenum += 1
        if not self.am_line:
            self.am_id, self.am_row = None, []
            self.reading = False
            return
        ls = self.am_line.split('\t')
        if self.line_ids:
            self.am_id = int(ls[0].strip())
        if self.tokenizer == 'regex':
            self.am_row = self.lu_sep.split(ls[1].strip()[1:-1])
        elif self.tokenizer == 'biltrans':
            self.am_row = common.tokenize_biltrans_line(self.am_line)

    def next_dm_line(self):
        """Advance the disambiguated file by one line (see next_am_line)."""
        self.dm_linenum += 1
        self.dm_line = self.dm_file.readline()
        if not self.dm_line:
            self.dm_id, self.dm_row = None, []
            self.reading = False
            return
        if self.line_ids:
            self.dm_id = int(self.dm_line.split('.[][')[1].split()[0])
        if self.tokenizer == 'regex':
            # FIX: `ls` was never assigned in this method (copy-paste
            # from next_am_line), raising NameError for regex tokenizing.
            ls = self.dm_line.split('\t')
            self.dm_row = self.lu_sep.split(ls[1].strip()[1:-1])
        elif self.tokenizer == 'biltrans':
            self.dm_row = common.tokenize_biltrans_line(self.dm_line)

    def clear_ngrams(self):
        # ngrams[sl][ngram][tl] -> float count
        self.ngrams = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

    def check_rows(self):
        """True when the two tokenized rows align LU-for-LU."""
        if len(self.am_row) != len(self.dm_row):
            print('Mismatch in number of LUs between analysis and training', file=sys.stderr)
            # FIX: am_line/dm_line are attributes; the bare names raised
            # NameError whenever a mismatch was actually hit.
            print('\t' + self.am_line, file=sys.stderr)
            print('\t' + self.dm_line, file=sys.stderr)
            print('...skipping', file=sys.stderr)
            return False
        return True

    # FIX: `self` was missing from this signature and from read_files',
    # so every call raised TypeError.
    def read_files_multi_dm(self, am_fname, dm_fname):
        """Lockstep reader for the line_ids case: each am line may match
        several dm lines carrying the same id plus a fractional count."""
        self.next_dm_line()
        while self.reading:
            self.next_am_line()
            while self.am_id == self.dm_id and self.reading:
                frac_count = 0
                if self.dm_line.count('\t') > 1:
                    frac_count, _ = safe_float(self.dm_line.split('\t')[2])
                if self.check_rows():
                    self.process_row(frac_count)
                self.next_dm_line()
            if self.am_linenum % 1000 == 0:
                print('=> %d SL and %d TL lines read' % (self.am_linenum, self.dm_linenum), file=sys.stderr)

    def read_files(self, am_fname, dm_fname):
        """Open the two files and drive process_row() over aligned lines."""
        self.am_file = open(am_fname)
        self.dm_file = open(dm_fname)
        self.reading = True
        if self.line_ids:
            self.read_files_multi_dm(am_fname, dm_fname)
            return
        while self.reading:
            self.next_am_line()
            self.next_dm_line()
            if self.reading and self.check_rows():
                # FIX: was process_row() with frac_count defaulting to 0,
                # which made process_lu_internal() add 0 to every n-gram
                # count; without fractional counts each occurrence counts
                # once, as in the pre-refactor scripts.
                self.process_row(1)
            if self.am_linenum % 1000 == 0:
                print('=> %d lines read' % self.am_linenum, file=sys.stderr)

    def process_row(self, frac_count=0):
        """Visit every ambiguous LU of the current line pair."""
        if self.tokenizer == 'regex':
            cur_sl_row = [strip_tags(s, 'sl', space=True) for s in self.am_row]
            for i in range(len(self.am_row)):
                if self.am_row[i].count('/') > 1:
                    sl = strip_tags(self.am_row[i], 'sl')
                    tl = strip_tags(self.dm_row[i], 'tl')
                    self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)
        elif self.tokenizer == 'biltrans':
            cur_sl_row = [x['sl'] for x in self.am_row]
            for i in range(len(self.am_row)):
                if len(self.am_row[i]['tls']) > 1:
                    sl = self.am_row[i]['sl']
                    tl = self.dm_row[i]['tls'][0]
                    if self.biltrans_wrap_lus:
                        sl = common.wrap(sl)
                        tl = common.wrap(tl)
                    self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)

    def process_lu_internal(self, sl, tl, idx, cur_sl_row, frac_count=0):
        """Accumulate pre-/post-/round-context n-grams, then delegate to
        the subclass hook."""
        if self.count_ngrams:
            for j in range(1, self.max_ngrams):
                pregram = ' '.join(map(common.wrap, cur_sl_row[idx-j:idx+1]))
                postgram = ' '.join(map(common.wrap, cur_sl_row[idx:idx+j+1]))
                roundgram = ' '.join(map(common.wrap, cur_sl_row[idx-j:idx+j+1]))
                self.ngrams[sl][pregram][tl] += frac_count
                self.ngrams[sl][postgram][tl] += frac_count
                self.ngrams[sl][roundgram][tl] += frac_count
        self.process_lu(sl, tl, idx, cur_sl_row, frac_count)

    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
        """Subclass hook; called once per ambiguous LU."""
        pass

def features_and_outline(ngrams, sl, tl, sl_tl, features, indexes,
                         frac_count=None):
    """Register the n-grams of one event as numbered features and print
    a maxent training line ('<index> [$ <frac>] # feat:alt ... #')."""
    if not ngrams[sl]:
        return
    meevents = []
    for ni in ngrams[sl]:
        if ni not in features:
            feature_counter = len(features) + 1
            features[ni] = feature_counter
        meevents.append(features[ni])
    # NOTE(review): sl_tl[sl] is a single translation *string*, so both
    # the len() test and the range() below operate on its characters,
    # mirroring the pre-refactor script — confirm this is intended.
    if sl not in sl_tl or len(sl_tl[sl]) < 2:
        return
    outline = str(indexes[(sl, tl)])
    if frac_count is not None:
        outline += ' $ ' + str(int(frac_count * 10000)) + ' '
    outline += ' # '
    for j in range(len(sl_tl[sl])):
        for feature in meevents:
            outline += '%s:%s ' % (feature, j)
        outline += ' # '
    print('%s\t%s\t%s' % (sl, len(sl_tl[sl]), outline))

def read_frequencies(fname):
    """Read a '<freq> <sl> <tl> [@]' frequency-lexicon file.

    Returns (sl_tl, sl_tl_defaults, indexes): the last non-default tl
    per sl, the '@'-marked default tl per sl, and a (sl, tl) -> per-sl
    rank mapping.
    """
    sl_tl = {}
    sl_tl_defaults = {}
    indexes = {}
    trad_counter = defaultdict(lambda: 0)
    with open(fname) as fin:
        for line_ in fin:
            line = line_.strip()
            if not line:
                continue
            row = line.split(' ')
            # FIX: parse the frequency once; the warning below printed an
            # undefined name `fr`, raising NameError whenever it fired.
            fr = float(row[0])
            sl = row[1].strip()
            tl = row[2].strip()
            indexes[(sl, tl)] = trad_counter[sl]
            trad_counter[sl] += 1
            if '@' in line:
                sl_tl_defaults[sl] = tl
                if fr == 0.0:
                    print('!!! Prolly something went wrong here, the default has freq of 0.0', file=sys.stderr)
                    print('    %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr)
            else:
                sl_tl[sl] = tl
    return sl_tl, sl_tl_defaults, indexes