commit b98bd6ed4e576d471968fa01677662b44ff4e040 Author: vivekvardhanadepu Date: Wed Aug 18 00:40:36 2021 +0530 Scripts cleanup diff --git a/scripts/extract-freq-lexicon.py b/scripts/extract-freq-lexicon.py index 0aeb2df..a0f7a1c 100755 --- a/scripts/extract-freq-lexicon.py +++ b/scripts/extract-freq-lexicon.py @@ -20,25 +20,17 @@ import traceback # 5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 # ------------------------------------------------------------------------------- - -def wrap(x): - return '^' + x + '$' - - def extract_freq_lexicon(canditates): - # MAX_NGRAMS = 3 cur_line = 0 lineno = 0 sl_tl = {} - # ngrams = {} cur_sl_row = [] cur_tl_row = [] cur_bt_row = [] cur_al_row = [] - # for line in open(sys.argv[1]).readlines(): with open(canditates) as infile: for line in infile: line = line.strip() @@ -130,10 +122,10 @@ def extract_freq_lexicon(canditates): continue if first: - print(sl_tl[sl][tl], wrap(sl), wrap(tl), '@') + print(sl_tl[sl][tl], common.wrap(sl), common.wrap(tl), '@') first = False else: - print(sl_tl[sl][tl], wrap(sl), wrap(tl)) + print(sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)) if __name__ == '__main__': diff --git a/scripts/extract-sentences.py b/scripts/extract-sentences.py index fc55a63..03392fe 100755 --- a/scripts/extract-sentences.py +++ b/scripts/extract-sentences.py @@ -8,14 +8,11 @@ import common def ambiguous(bt): # legislation/legislación/ordenamiento - - ambig = False for token in bt: - tls = token['tls'] - if len(tls) > 1: + if len(token['tls']) > 1: return True - return ambig + return False def extract_sentences(phrase_table_file, biltrans_out_file): diff --git a/scripts/ngram-count-patterns.py b/scripts/ngram-count-patterns.py index 34ade71..341be3c 100755 --- a/scripts/ngram-count-patterns.py +++ b/scripts/ngram-count-patterns.py @@ -19,11 +19,6 @@ import common # 5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 # ------------------------------------------------------------------------------- - -def wrap(x): - return
'^' + x + '$' - - def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules): MAX_NGRAMS = 2 cur_line = 0 @@ -41,8 +36,8 @@ def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules): continue row = common.tokenise_tagger_line(line) - sl = wrap(row[0]) - tl = wrap(row[1]) + sl = common.wrap(row[0]) + tl = common.wrap(row[1]) if tl[1] == '*': tl = tl[:-3] + '$' if line.count('@') > 0: @@ -81,19 +76,18 @@ def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules): if al_sl != i: continue - tlword = wrap(cur_tl_row[al_tl]) - slword = wrap(slword) + tlword = common.wrap(cur_tl_row[al_tl]) + slword = common.wrap(slword) if slword not in sl_tl_defaults: print('!', file=sys.stderr) continue for j in range(1, MAX_NGRAMS): - - pregram = ' '.join(map(wrap, cur_sl_row[i-j:i+1])) - postgram = ' '.join(map(wrap, cur_sl_row[i:i+j+1])) + pregram = ' '.join(map(common.wrap, cur_sl_row[i-j:i+1])) + postgram = ' '.join(map(common.wrap, cur_sl_row[i:i+j+1])) roundgram = ' '.join( - map(wrap, cur_sl_row[i-j:i+j+1])) + map(common.wrap, cur_sl_row[i-j:i+j+1])) if slword not in ngrams: ngrams[slword] = {} @@ -120,10 +114,6 @@ def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules): ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1 ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1 - # for j in range(0, MAX_NGRAMS): - # print cur_sl_row[i-j:i+1] - # print cur_sl_row[i:i+j] - i = i + 1 cur_line = 0