commit 41005865beeecaafa5660c24af0677ca3b1dd0e5
Author: Vivek Vardhan Adepu <vivekvicky839@gmail.com>
Date:   Sat Aug 7 08:44:37 2021 +0530

    Scripts: enclosing the code in functions (#81)

diff --git a/scripts/biltrans-count-patterns-ngrams.py b/scripts/biltrans-count-patterns-ngrams.py
index 9a79c5a..cedac60 100755
--- a/scripts/biltrans-count-patterns-ngrams.py
+++ b/scripts/biltrans-count-patterns-ngrams.py
@@ -28,39 +28,40 @@ import biltrans_count_common as BCC
 
 #	 d) Crispiness threshold
 
-cur_line = 0
-crisphold = 3.0  # Default
-only_max = True
-#only_max = False
-
-if len(sys.argv) == 5:
-    crisphold = float(sys.argv[4])
-    print('crisp:', crisphold, file=sys.stderr)
-
-# First read in the frequency defaults
-
-sl_tl, sl_tl_defaults, _ = BCC.read_frequencies(sys.argv[1])
-
-print('Reading...', file=sys.stderr)
-sys.stderr.flush()
-
-
 class Counter(BCC.BiltransCounter):
     tokenizer = 'biltrans'
     line_ids = True
     count_ngrams = True
     max_ngrams = 3
-
-
-c = Counter()
-c.read_files(sys.argv[2],  # File with ambiguous biltrans output
-             sys.argv[3])  # File with disambiguated biltrans output
-ngrams = c.ngrams
-
-print('Caching counts...', file=sys.stderr)
-for sl in ngrams:
-    for ngram in ngrams[sl]:
-        for tl in ngrams[sl][ngram]:
-            print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl))
-
-print('\n', file=sys.stderr)
+    
+def biltrans_count_patterns_ngrams(lex_freq, biltrans_ambig, biltrans_annotated, crisphold=3.0):
+    # First read in the frequency defaults
+
+    BCC.read_frequencies(lex_freq)
+
+    print('Reading...', file=sys.stderr)
+    sys.stderr.flush()
+
+    c = Counter()
+    c.read_files(biltrans_ambig,  # File with ambiguous biltrans output
+                biltrans_annotated)  # File with disambiguated biltrans output
+    ngrams = c.ngrams
+
+    print('Caching counts...', file=sys.stderr)
+    for sl in ngrams:
+        for ngram in ngrams[sl]:
+            for tl in ngrams[sl][ngram]:
+                print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl))
+
+    print('\n', file=sys.stderr)
+
+if __name__ == '__main__':
+    if len(sys.argv) < 4:
+        print('Usage: biltrans-count-patterns-ngrams.py <lex_freq> <biltrans_ambig> <biltrans_annotated> [crisphold]', file=sys.stderr)
+        exit(1)
+    
+    if len(sys.argv) == 5:
+        print('crisp:', sys.argv[4], file=sys.stderr)
+        biltrans_count_patterns_ngrams(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
+    else:
+        biltrans_count_patterns_ngrams(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/scripts/biltrans-extract-frac-freq.py b/scripts/biltrans-extract-frac-freq.py
index 17211aa..ef1ad66 100644
--- a/scripts/biltrans-extract-frac-freq.py
+++ b/scripts/biltrans-extract-frac-freq.py
@@ -17,32 +17,38 @@ import common
 #
 #
 
-# The sl-tl possible combinations
-sl_tl = defaultdict(lambda: defaultdict(lambda: 0.0))
-
-
 class Counter(BCC.BiltransCounter):
     tokenizer = 'biltrans'
     line_ids = True
+    # The sl-tl possible combinations
+    sl_tl = defaultdict(lambda: defaultdict(lambda: 0.0))
+
 
     def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
-        global sl_tl
-        sl_tl[sl][tl] += frac_count
-
-
-c = Counter()
-c.read_files(sys.argv[1],  # File with ambiguous biltrans output
-             sys.argv[2])  # File with disambiguated biltrans output
-
-for sl in sl_tl:
-    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
-    newtl.reverse()
-    first = True
-    for tl in newtl:
-        if first:
-            print('%.10f %s %s @' %
-                  (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
-            first = False
-        else:
-            print('%.10f %s %s' %
-                  (sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
+        self.sl_tl[sl][tl] += frac_count
+
+def biltrans_extract_frac_freq(biltrans_ambig, biltrans_annotated):
+
+    c = Counter()
+    c.read_files(biltrans_ambig,  # File with ambiguous biltrans output
+                biltrans_annotated)  # File with disambiguated biltrans output
+
+    for sl in c.sl_tl:
+        newtl = sorted(c.sl_tl[sl], key=lambda x: c.sl_tl[sl][x])
+        newtl.reverse()
+        first = True
+        for tl in newtl:
+            if first:
+                print('%.10f %s %s @' %
+                    (c.sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
+                first = False
+            else:
+                print('%.10f %s %s' %
+                    (c.sl_tl[sl][tl], common.wrap(sl), common.wrap(tl)))
+
+if __name__ == '__main__':
+    if len(sys.argv) < 3:
+        print('Usage: biltrans-extract-frac-freq.py <biltrans_ambig> <biltrans_annotated>', file=sys.stderr)
+        exit(1)
+    
+    biltrans_extract_frac_freq(sys.argv[1], sys.argv[2])
diff --git a/scripts/extract-alig-lrx.py b/scripts/extract-alig-lrx.py
index af736c2..371b0e9 100755
--- a/scripts/extract-alig-lrx.py
+++ b/scripts/extract-alig-lrx.py
@@ -5,51 +5,58 @@
 import sys
 import common
 
-with open(sys.argv[1]) as d:
-    print('<rules>')
-    for line in d:  # {
-
-        sys.stdout.flush()
-        if line[-2] == '@':  # {
-            row = common.tokenize_tagger_line(line)
-
-            fq = line.split(' ')[0]
-            sl = row[0]
-            tl = row[1]
-
-            if line.count('>') < 2:  # {
-                continue
-            # }
-            print(sl, tl, file=sys.stderr)
-            sl_lem = sl.split('<')[0]
-            tl_lem = tl.split('<')[0]
-            sl_lem = sl_lem.replace(
-                '-', '\\-').replace('~', ' ').replace('&', '&amp;')
-            tl_lem = tl_lem.replace(
-                '-', '\\-').replace('~', ' ').replace('&', '&amp;')
-
-            sl_tag = sl.replace('><', '.').split('<')[1].strip('>')
-            tl_tag = tl.replace('><', '.').split('<')[1].strip('>')
-
-            cmb = ''
-            cma = ''
-
-            if sl_tag.split('.')[0] not in ['adj', 'vblex', 'n']:  # {
-                cmb = '<!--'
-                cma = '-->'
-            else:  # {
-                cma = ''
+
+def extract_alig_lrx(lex_freq):
+    with open(lex_freq) as d:
+        print('<rules>')
+
+        for line in d:
+            sys.stdout.flush()
+            if line[-2] == '@':
+                row = common.tokenize_tagger_line(line)
+
+                fq = line.split(' ')[0]
+                sl = row[0]
+                tl = row[1]
+
+                if line.count('>') < 2:
+                    continue
+
+                print(sl, tl, file=sys.stderr)
+                sl_lem = sl.split('<')[0]
+                tl_lem = tl.split('<')[0]
+                sl_lem = sl_lem.replace(
+                    '-', '\\-').replace('~', ' ').replace('&', '&amp;')
+                tl_lem = tl_lem.replace(
+                    '-', '\\-').replace('~', ' ').replace('&', '&amp;')
+
+                sl_tag = sl.replace('><', '.').split('<')[1].strip('>')
+                tl_tag = tl.replace('><', '.').split('<')[1].strip('>')
+
                 cmb = ''
-            # }
+                cma = ''
+
+                if sl_tag.split('.')[0] not in ['adj', 'vblex', 'n']:
+                    cmb = '<!--'
+                    cma = '-->'
+                else:
+                    cma = ''
+                    cmb = ''
+
+                rule = cmb + '<rule comment="' + fq + '">'
+                # rule = rule + '<match lemma="' + sl_lem + '" tags="' + sl_tag + '"><select lemma="' + tl_lem + '" tags="' + tl_tag + '"/>'
+                rule = rule + '<match lemma="' + sl_lem + '"><select lemma="' + tl_lem + '"/>'
+                rule = rule + '</match>'
+                rule = rule + '</rule>' + cma
+
+                print(rule)
+
+        print('</rules>')
 
-            rule = cmb + '<rule comment="' + fq + '">'
-            #rule = rule + '<match lemma="' + sl_lem + '" tags="' + sl_tag + '"><select lemma="' + tl_lem + '" tags="' + tl_tag + '"/>';
-            rule = rule + '<match lemma="' + sl_lem + '"><select lemma="' + tl_lem + '"/>'
-            rule = rule + '</match>'
-            rule = rule + '</rule>' + cma
 
-            print(rule)
-        # }
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print('Usage: extract-alig-lrx.py <lex_freq>', file=sys.stderr)
+        exit(1)
 
-    # }
-    print('</rules>')
+    extract_alig_lrx(sys.argv[1])
diff --git a/scripts/ngram-count-patterns.py b/scripts/ngram-count-patterns.py
index 7f4a158..34ade71 100755
--- a/scripts/ngram-count-patterns.py
+++ b/scripts/ngram-count-patterns.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 # coding=utf-8
 # -*- encoding: utf-8 -*-
 
diff --git a/scripts/ngram-pruning-frac.py b/scripts/ngram-pruning-frac.py
index 8509715..743e606 100755
--- a/scripts/ngram-pruning-frac.py
+++ b/scripts/ngram-pruning-frac.py
@@ -17,160 +17,157 @@ import common
 
 #	 d) Crispiness threshold
 
-cur_line = 0
-crisphold = 3.0  # Default
-only_max = True
-#only_max = False;
-
-if len(sys.argv) == 4:  # {
-    crisphold = float(sys.argv[3])
-    print('crisp:', crisphold, file=sys.stderr)
-# }
-
-sl_tl_defaults = {}
-sl_tl = {}
-ngrams = {}
-
-# First read in the frequency defaults
-
-for line in open(sys.argv[1]).readlines():  # {
-    if len(line) < 1:  # {
-        continue
-    # }
-
-    row = common.tokenize_tagger_line(line)
-    sl = row[0]
-    tl = row[1]
-    fr = float(line.split(' ')[0])
-    if line.count('@') and fr == 0.0:  # {
-        print('!!! Prolly something went wrong here, the default has a freq of 0.0', file=sys.stderr)
-        print('    %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr)
-    # }
-    if line.count('@') > 0:  # {
-        print('default:', sl, tl, file=sys.stderr)
-        sl_tl_defaults[sl] = tl
-    else:  # {
-        sl_tl[sl] = tl
-    # }
-
-# }
-
-max_crispiness = 0.0
-print('Reading...', file=sys.stderr)
-sys.stderr.flush()
-
-# Load counts from cached file
-
-ngramsf = open(sys.argv[2])
-for line in ngramsf.readlines():  # {
-    if len(line) < 1:  # {
-        continue
-    # }
-    row = line.split('\t')
-
-    freq = float(row[0])
-    ngram = row[1]
-    sl = row[2]
-    tl = row[3].strip()
-
-    if sl not in ngrams:  # {
-        ngrams[sl] = {}
-    # }
-    if ngram not in ngrams[sl]:  # {
-        ngrams[sl][ngram] = {}
-    # }
-    if tl not in ngrams[sl][ngram]:  # {
-        ngrams[sl][ngram][tl] = 0.0
-    # }
-    ngrams[sl][ngram][tl] = freq
-# }
-
-for sl in ngrams:  # {
-    if sl == '':  # {
-        continue
-    # }
-    for ngram in ngrams[sl]:  # {
-        if ngram == '':  # {
+def ngram_pruning_frac(lex_freq, ngrams_file, crisphold=3.0):
+    cur_line = 0
+    only_max = True
+    #only_max = False;
+
+    sl_tl_defaults = {}
+    sl_tl = {}
+    ngrams = {}
+
+    # First read in the frequency defaults
+
+    for line in open(lex_freq).readlines():
+        if len(line) < 1:
             continue
-        # }
 
-        total = 0.0
-        max_freq = -1.0
-        max_tl = ''
-        for tl in ngrams[sl][ngram]:  # {
+        row = common.tokenize_tagger_line(line)
+        sl = row[0]
+        tl = row[1]
+        fr = float(line.split(' ')[0])
+        if line.count('@') and fr == 0.0:
+            print('!!! Prolly something went wrong here, the default has a freq of 0.0', file=sys.stderr)
+            print('    %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr)
+
+        if line.count('@') > 0:
+            print('default:', sl, tl, file=sys.stderr)
+            sl_tl_defaults[sl] = tl
+        else:
+            sl_tl[sl] = tl
 
-            if ngrams[sl][ngram][tl] > max_freq:  # {
-                max_freq = ngrams[sl][ngram][tl]
-                max_tl = tl
-            # }
-            total = total + ngrams[sl][ngram][tl]
-        # }
 
-        default = sl_tl_defaults[sl]
+    max_crispiness = 0.0
+    print('Reading...', file=sys.stderr)
+    sys.stderr.flush()
 
-        if max_tl not in ngrams[sl][ngram] and default not in ngrams[sl][ngram]:  # {
-            print('Some shit went down..', file=sys.stderr)
-            print('= %s\t%s\t%s' % (sl, ngram, max_tl), file=sys.stderr)
+    # Load counts from cached file
+
+    ngramsf = open(ngrams_file)
+    for line in ngramsf.readlines():
+        if len(line) < 1:
             continue
-        # }
-        if max_freq == 0.0:
+
+        row = line.split('\t')
+
+        freq = float(row[0])
+        ngram = row[1]
+        sl = row[2]
+        tl = row[3].strip()
+
+        if sl not in ngrams:
+            ngrams[sl] = {}
+
+        if ngram not in ngrams[sl]:
+            ngrams[sl][ngram] = {}
+
+        if tl not in ngrams[sl][ngram]:
+            ngrams[sl][ngram][tl] = 0.0
+
+        ngrams[sl][ngram][tl] = freq
+
+
+    for sl in ngrams:
+        if sl == '':
             continue
 
-        if only_max == True:  # {
-            crispiness = 0.0
-            alt_crisp = float(ngrams[sl][ngram][max_tl]) / float(total)
-            def_crisp = 1.0
-            if default in ngrams[sl][ngram]:  # {
-                def_crisp = float(ngrams[sl][ngram][default] / float(total))
-            # }
-            if def_crisp == 0.0:  # {
-                print('!!! Something wanky happened. :(', file=sys.stderr)
-                print('%.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (
-                    total, max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl]), file=sys.stderr)
-                print('\tskipping...', file=sys.stderr)
+        for ngram in ngrams[sl]:
+            if ngram == '':
                 continue
-            # }
-            weight = float(ngrams[sl][ngram][max_tl]) / float(total)
-            crispiness = alt_crisp/def_crisp
 
-            print('- %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total,
-                                                                          max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl]))
-#			print('- %.10f \t%s\t%s\t%s\t%.10f' % (crispiness, sl, ngram, max_tl, ngrams[sl][ngram][max_tl]));
+            total = 0.0
+            max_freq = -1.0
+            max_tl = ''
+            for tl in ngrams[sl][ngram]:
+
+                if ngrams[sl][ngram][tl] > max_freq:
+                    max_freq = ngrams[sl][ngram][tl]
+                    max_tl = tl
 
-            if crispiness > max_crispiness:  # {
-                max_crispiness = crispiness
-            # }
+                total = total + ngrams[sl][ngram][tl]
 
-#   crispiness   weight      total default     max_freq     tl_freq            sl
-# + 2.61845457309 0.7236389238 1.0 0.2763610762 0.7236389238 0.7236389238         aozer<n>        aozer<n> an<det> levr<n>        organisateur<n> 0.7236389238
-# - 14736.0468727 0.9999321438 1.0 0.9999321438 0.9999321438      treuzkas<n>     treuzkas<n> teknologel<adj>     transfert<n>    0.9999321438
-        else:  # {
+            default = sl_tl_defaults[sl]
 
-            for tl in ngrams[sl][ngram]:  # {
+            if max_tl not in ngrams[sl][ngram] and default not in ngrams[sl][ngram]:
+                print('Some shit went down..', file=sys.stderr)
+                print('= %s\t%s\t%s' % (sl, ngram, max_tl), file=sys.stderr)
+                continue
+
+            if max_freq == 0.0:
+                continue
 
+            if only_max == True:
                 crispiness = 0.0
-                default = sl_tl_defaults[sl]
-                alt_crisp = float(ngrams[sl][ngram][tl]) / float(total)
+                alt_crisp = float(ngrams[sl][ngram][max_tl]) / float(total)
                 def_crisp = 1.0
-                if default in ngrams[sl][ngram]:  # {
-                    def_crisp = float(
-                        ngrams[sl][ngram][default] / float(total))
-                # }
-                weight = float(ngrams[sl][ngram][tl]) / float(total)
+                if default in ngrams[sl][ngram]:
+                    def_crisp = float(ngrams[sl][ngram][default] / float(total))
+
+                if def_crisp == 0.0:
+                    print('!!! Something wanky happened. :(', file=sys.stderr)
+                    print('%.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (
+                        total, max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl]), file=sys.stderr)
+                    print('\tskipping...', file=sys.stderr)
+                    continue
+
+                weight = float(ngrams[sl][ngram][max_tl]) / float(total)
                 crispiness = alt_crisp/def_crisp
 
-                # print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] ;
+                print('- %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total,
+                                                                            max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl]))
+                # print('- %.10f \t%s\t%s\t%s\t%.10f' % (crispiness, sl, ngram, max_tl, ngrams[sl][ngram][max_tl]));
+
+                if crispiness > max_crispiness:
+                    max_crispiness = crispiness
+
+
+            #   crispiness   weight      total default     max_freq     tl_freq            sl
+            # + 2.61845457309 0.7236389238 1.0 0.2763610762 0.7236389238 0.7236389238         aozer<n>        aozer<n> an<det> levr<n>        organisateur<n> 0.7236389238
+            # - 14736.0468727 0.9999321438 1.0 0.9999321438 0.9999321438      treuzkas<n>     treuzkas<n> teknologel<adj>     transfert<n>    0.9999321438
+            else:
+
+                for tl in ngrams[sl][ngram]:
+
+                    crispiness = 0.0
+                    default = sl_tl_defaults[sl]
+                    alt_crisp = float(ngrams[sl][ngram][tl]) / float(total)
+                    def_crisp = 1.0
+                    if default in ngrams[sl][ngram]:
+                        def_crisp = float(
+                            ngrams[sl][ngram][default] / float(total))
+
+                    weight = float(ngrams[sl][ngram][tl]) / float(total)
+                    crispiness = alt_crisp/def_crisp
+
+                    # print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] ;
+
+                    print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total,
+                                                                                        ngrams[sl][ngram][default], max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl]))
+                    # + 1013.01568891 0.9989973752 2.0 1.9979947504 1.9979947504 	galloud<n>	ha<cnjcoo> an<det> galloud<n>	puissance<n>	1.9979947504
+
+                if crispiness > max_crispiness:
+                    max_crispiness = crispiness
 
-                print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total,
-                                                                                    ngrams[sl][ngram][default], max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl]))
-# + 1013.01568891 0.9989973752 2.0 1.9979947504 1.9979947504 	galloud<n>	ha<cnjcoo> an<det> galloud<n>	puissance<n>	1.9979947504
-            # }
 
-            if crispiness > max_crispiness:  # {
-                max_crispiness = crispiness
-            # }
-        # }
-    # }
-# }
+    print('max_crispiness: %.10f' % (max_crispiness), file=sys.stderr)
 
-print('max_crispiness: %.10f' % (max_crispiness), file=sys.stderr)
+if __name__ == '__main__':
+    if len(sys.argv) < 3:
+        print('Usage: ngram-pruning-frac.py <lex_freq> <ngrams> [crisphold]', file=sys.stderr)
+        exit(1)
+    
+    if len(sys.argv) == 4:
+        print('crisp:', sys.argv[3], file=sys.stderr)
+        ngram_pruning_frac(sys.argv[1], sys.argv[2], sys.argv[3])
+    else:
+        ngram_pruning_frac(sys.argv[1], sys.argv[2])
diff --git a/scripts/ngrams-to-rules.py b/scripts/ngrams-to-rules.py
index 329851e..5900928 100755
--- a/scripts/ngrams-to-rules.py
+++ b/scripts/ngrams-to-rules.py
@@ -18,13 +18,13 @@ def ngrams_to_rules(ngrams, crisphold):
     lineno = 1
     ruleno = 0
     for line in open(ngrams).readlines():
-        #	print('\n';
-        #	print(line
+        #	print('\n')
+        #	print(line)
         if len(line) < 2:
             continue
 
         line = line.strip()
-        #line = line.strip();
+        #line = line.strip()
 
         # + 0.571428571429 14 8 8 	troiñ<vblex>		tourner<vblex>	8
         row = line.split('\t')
@@ -32,7 +32,7 @@ def ngrams_to_rules(ngrams, crisphold):
         if len(row) == 3:
             row.insert(0, '')
 
-    #	tipus = row[0].split(' ')[0];
+    #	tipus = row[0].split(' ')[0]
         weight = row[0].split(' ')[1]
         sl = row[1].strip()[1:-1]
         tl = row[3][1:-1]