commit 6fecfbfe1959a62cf6dcf9569537af8b8605047a Author: vivekvardhanadepu Date: Fri Aug 6 21:33:37 2021 +0530 Scripts:adding MAX_RULES diff --git a/scripts/Makefile.am b/scripts/Makefile.am index 389d2fb..a1647d3 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -12,4 +12,5 @@ apertium_lex_tools_DATA = \ biltrans-count-patterns-ngrams.py \ ngram-pruning-frac.py \ ngrams-to-rules.py \ - biltrans_count_common.py + biltrans_count_common.py \ + ngram-count-patterns.py diff --git a/scripts/ngram-count-patterns.py b/scripts/ngram-count-patterns.py index 52e60ce..7f4a158 100755 --- a/scripts/ngram-count-patterns.py +++ b/scripts/ngram-count-patterns.py @@ -24,9 +24,8 @@ def wrap(x): return '^' + x + '$' -def ngram_count_patterns(freq_lexicon, candidates, crisphold): +def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules): MAX_NGRAMS = 2 - cur_line = 0 sl_tl_defaults = {} @@ -145,12 +144,14 @@ def ngram_count_patterns(freq_lexicon, candidates, crisphold): cur_line = cur_line + 1 for sl in ngrams: - for ngram in ngrams[sl]: total = 0 max_freq = -1 current_tl = '' - for tl in ngrams[sl][ngram]: + newtl = sorted(ngrams[sl][ngram], key=lambda x: ngrams[sl][ngram][x]) + newtl.reverse() + newtl = newtl[:max_rules] + for tl in newtl: if ngrams[sl][ngram][tl] > max_freq: max_freq = ngrams[sl][ngram][tl] current_tl = tl @@ -173,7 +174,7 @@ def ngram_count_patterns(freq_lexicon, candidates, crisphold): # It would be "2" in this case: the alternative is seen twice as often as # the default. 
- for tl in ngrams[sl][ngram]: + for tl in newtl: crispiness = 0.0 default = sl_tl_defaults[sl] alt_crisp = float(ngrams[sl][ngram][tl]) / float(total) @@ -197,9 +198,9 @@ def ngram_count_patterns(freq_lexicon, candidates, crisphold): if __name__ == '__main__': - if len(sys.argv) < 4: + if len(sys.argv) < 5: print( - 'Usage: count-patterns.py <freq_lexicon> <candidates> <crisphold>', file=sys.stderr) + 'Usage: count-patterns.py <freq_lexicon> <candidates> <crisphold> <max_rules>', file=sys.stderr) exit(1) - ngram_count_patterns(sys.argv[1], sys.argv[2], sys.argv[3]) + ngram_count_patterns(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])