commit e3d7e4096c11511de123aae5a0f25e657864c529
Author: vivekvardhanadepu
Date:   Sun Aug 22 21:02:03 2021 +0530

    Scripts fixup: removing redundant code, wrapping issue in read_frequencies fixed

diff --git a/scripts/biltrans-count-patterns-ngrams.py b/scripts/biltrans-count-patterns-ngrams.py
index cedac60..8fc5c6b 100755
--- a/scripts/biltrans-count-patterns-ngrams.py
+++ b/scripts/biltrans-count-patterns-ngrams.py
@@ -34,11 +34,9 @@ class Counter(BCC.BiltransCounter):
     count_ngrams = True
     max_ngrams = 3
 
 
-def biltrans_count_patterns_ngrams(lex_freq, biltrans_ambig, biltrans_annotated, crisphold=3.0):
+def biltrans_count_patterns_ngrams(biltrans_ambig, biltrans_annotated, crisphold=3.0):
     # First read in the frequency defaults
-    BCC.read_frequencies(lex_freq)
-
     print('Reading...', file=sys.stderr)
     sys.stderr.flush()
 
@@ -56,12 +54,12 @@ def biltrans_count_patterns_ngrams(lex_freq, biltrans_ambig, biltrans_annotated,
     print('\n', file=sys.stderr)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) < 4:
-        print('Usage: biltrans-count-patterns-ngrams.py <lex_freq> <biltrans_ambig> <biltrans_annotated> [crisphold]', file=sys.stderr)
+    if len(sys.argv) < 3:
+        print('Usage: biltrans-count-patterns-ngrams.py <biltrans_ambig> <biltrans_annotated> [crisphold]', file=sys.stderr)
         exit(1)
-    if len(sys.argv) == 5:
-        print('crisp:', sys.argv[4], file=sys.stderr)
-        biltrans_count_patterns_ngrams(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
-    else:
+    if len(sys.argv) == 4:
+        print('crisp:', sys.argv[3], file=sys.stderr)
         biltrans_count_patterns_ngrams(sys.argv[1], sys.argv[2], sys.argv[3])
+    else:
+        biltrans_count_patterns_ngrams(sys.argv[1], sys.argv[2])
diff --git a/scripts/biltrans_count_common.py b/scripts/biltrans_count_common.py
index 952c878..1eea89e 100644
--- a/scripts/biltrans_count_common.py
+++ b/scripts/biltrans_count_common.py
@@ -203,15 +203,15 @@ def read_frequencies(fname):
         line = line_.strip()
         if not line:
             continue
-        row = line.split(' ')
-        fr = float(row[0])
-        sl = row[1].strip()
-        tl = row[2].strip()
+        row = common.tokenize_tagger_line(line)
+        sl = row[0]
+        tl = row[1]
+        fr = float(line.split(' ')[0])
         indexes[(sl, tl)] = trad_counter[sl]
         trad_counter[sl] += 1
         if '@' in line:
             sl_tl_defaults[sl] = tl
-            if float(row[0]) == 0.0:
+            if fr == 0.0:
                 print(
                     '!!! Prolly something went wrong here, the default has freq of 0.0', file=sys.stderr)
         else:
diff --git a/scripts/ngram-pruning-frac.py b/scripts/ngram-pruning-frac.py
index 743e606..176078a 100755
--- a/scripts/ngram-pruning-frac.py
+++ b/scripts/ngram-pruning-frac.py
@@ -4,6 +4,8 @@
 import sys
 import common
 
+import biltrans_count_common as BCC
+
 # Input:
 # 0.6000015452 k bukatu ari izan bukatu acabar
 
@@ -20,36 +22,14 @@ import common
 
 
 def ngram_pruning_frac(lex_freq, ngrams_file, crisphold=3.0):
     cur_line = 0
     only_max = True
-    #only_max = False;
-    sl_tl_defaults = {}
-    sl_tl = {}
     ngrams = {}
 
     # First read in the frequency defaults
-
-    for line in open(lex_freq).readlines():
-        if len(line) < 1:
-            continue
-
-        row = common.tokenize_tagger_line(line)
-        sl = row[0]
-        tl = row[1]
-        fr = float(line.split(' ')[0])
-        if line.count('@') and fr == 0.0:
-            print('!!! Prolly something went wrong here, the default has a freq of 0.0', file=sys.stderr)
-            print(' %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr)
-
-        if line.count('@') > 0:
-            print('default:', sl, tl, file=sys.stderr)
-            sl_tl_defaults[sl] = tl
-        else:
-            sl_tl[sl] = tl
-
+    _, sl_tl_defaults, _ = BCC.read_frequencies(lex_freq)
     max_crispiness = 0.0
 
     print('Reading...', file=sys.stderr)
-    sys.stderr.flush()
 
     # Load counts from cached file
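
For orientation, a minimal sketch of how the shared frequency reader is consumed after this commit. The tuple ordering follows the unpacking `_, sl_tl_defaults, _ = BCC.read_frequencies(lex_freq)` shown in the diff; the example path 'lex.freq' and the contents of the two discarded return values are assumptions, not part of this commit, and the snippet expects to run from the scripts/ directory so the local modules import.

# Sketch only: 'lex.freq' is a placeholder path; only the second return value
# (the sl -> tl defaults map) is used here, matching the diff's unpacking.
import sys
import biltrans_count_common as BCC

_, sl_tl_defaults, _ = BCC.read_frequencies('lex.freq')
for sl, tl in sl_tl_defaults.items():
    # Same kind of diagnostic the removed inline loop in ngram-pruning-frac.py printed.
    print('default:', sl, tl, file=sys.stderr)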