commit 167cf7e81bf77a5bbdc6966fca8474fdff8dd127 Author: vivekvardhanadepu Date: Fri Jul 30 00:10:19 2021 +0530 Scripts fixup: cleaning old syntax and other minor fixes diff --git a/scripts/extract-freq-lexicon.py b/scripts/extract-freq-lexicon.py index ef7b427..0aeb2df 100755 --- a/scripts/extract-freq-lexicon.py +++ b/scripts/extract-freq-lexicon.py @@ -38,34 +38,34 @@ def extract_freq_lexicon(canditates): cur_bt_row = [] cur_al_row = [] - # for line in open(sys.argv[1]).readlines(): #{ + # for line in open(sys.argv[1]).readlines(): with open(canditates) as infile: - for line in infile: # { + for line in infile: line = line.strip() lineno += 1 - if lineno % 5000 == 0: # { + if lineno % 5000 == 0: sys.stderr.write('.') - if lineno % 100000 == 0: # { + if lineno % 100000 == 0: sys.stderr.write(str(lineno)+'\n') - # } + sys.stderr.flush() - # } + try: - if line[0] == '-': # { + if line[0] == '-': # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations # # sl_tl[sl_word][tl_word] = tl_freq i = 0 - for slword in cur_sl_row: # { - if len(cur_bt_row[i]['tls']) > 1: # { - for al in cur_al_row: # { + for slword in cur_sl_row: + if len(cur_bt_row[i]['tls']) > 1: + for al in cur_al_row: if al == '': continue al_sl = int(al.split('-')[1]) al_tl = int(al.split('-')[0]) - if al_sl != i: # { + if al_sl != i: continue - # } + if al_tl < len(cur_tl_row): tlword = cur_tl_row[al_tl] else: @@ -81,72 +81,64 @@ def extract_freq_lexicon(canditates): file=sys.stderr) exit(1) slword = slword - if slword not in sl_tl: # { + if slword not in sl_tl: sl_tl[slword] = {} - # } - if tlword not in sl_tl[slword]: # { + + if tlword not in sl_tl[slword]: sl_tl[slword][tlword] = 0 - # } + sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1 # print '+' , slword , tlword , sl_tl[slword][tlword], lineno - # } - # } + i = i + 1 - # } + cur_line = 0 continue - # } line = line.split('\t')[1] - if cur_line == 0: # { + if cur_line == 0: cur_sl_row = common.tokenise_tagger_line(line) - elif cur_line == 1: # { + elif cur_line == 1: cur_bt_row = common.tokenise_biltrans_line(line) - elif cur_line == 2: # { + elif cur_line == 2: cur_tl_row = common.tokenise_tagger_line(line) - elif cur_line == 3: # { + elif cur_line == 3: cur_al_row = line.split(' ') - # } cur_line = cur_line + 1 except Exception: # print("Error in line", lineno, ":", e, file=sys.stderr) traceback.print_exc() exit(1) - # } - # } - for sl in sl_tl: # { + for sl in sl_tl: newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x]) newtl.reverse() first = True - for tl in newtl: # { - if tl[0] == '*': # { + for tl in newtl: + if tl[0] == '*': print('Error: tl word unknown', tl, file=sys.stderr) continue - # } + first_tag_sl = sl.split('<')[1].split('>')[0].strip() first_tag_tl = tl.split('<')[1].split('>')[0].strip() - if first_tag_sl != first_tag_tl: # { + if first_tag_sl != first_tag_tl: print('Error:', first_tag_sl, '!=', first_tag_tl, file=sys.stderr) continue - # } - if first: # { + + if first: print(sl_tl[sl][tl], wrap(sl), wrap(tl), '@') first = False - else: # { + else: print(sl_tl[sl][tl], wrap(sl), wrap(tl)) - # } - # } - # } if __name__ == '__main__': - if len(sys.argv) < 2: # { + if len(sys.argv) < 2: print('Usage: extract-freq-lexicon.py ', file=sys.stderr) exit(1) - # } + extract_freq_lexicon(sys.argv[1]) diff --git a/scripts/extract-sentences.py b/scripts/extract-sentences.py index c80f77b..fc55a63 100755 --- a/scripts/extract-sentences.py +++ b/scripts/extract-sentences.py @@ -3,23 +3,19 @@ # -*- encoding: utf-8 -*- 
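The loop in extract-freq-lexicon.py above accumulates sl_tl[slword][tlword] counts from each aligned candidate block, then prints every source word's translations in descending frequency, marking the most frequent one with '@'. A minimal sketch of that counting, assuming the script's usual wrap() helper and two made-up aligned pairs (illustration only, not part of the patch; the unknown-word and tag-consistency checks are omitted):

# Sketch of the frequency-lexicon counting done by extract-freq-lexicon.py,
# using collections instead of hand-rolled nested dicts. Not part of the patch.
from collections import Counter, defaultdict

def wrap(x):
    # assumed to match the script's wrap(): Apertium-style ^...$ wrapping
    return '^' + x + '$'

sl_tl = defaultdict(Counter)

# In the real script these pairs come from the aligned candidate corpus;
# they are hard-coded here purely for illustration.
aligned_pairs = [('dog<n>', 'perro<n>'), ('dog<n>', 'can<n>'), ('dog<n>', 'perro<n>')]
for slword, tlword in aligned_pairs:
    sl_tl[slword][tlword] += 1

for sl, counts in sl_tl.items():
    for rank, (tl, freq) in enumerate(counts.most_common()):
        if rank == 0:
            print(freq, wrap(sl), wrap(tl), '@')   # default (most frequent) translation
        else:
            print(freq, wrap(sl), wrap(tl))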
import sys -import codecs import common -def ambiguous(bt): # { +def ambiguous(bt): # legislation/legislación/ordenamiento ambig = False - for token in bt: # { + for token in bt: tls = token['tls'] - if len(tls) > 1: # { + if len(tls) > 1: return True - # } - # } return ambig -# } def extract_sentences(phrase_table_file, biltrans_out_file): @@ -29,42 +25,39 @@ def extract_sentences(phrase_table_file, biltrans_out_file): not_ambiguous = [] with open(phrase_table_file) as phrase_table, open(biltrans_out_file) as biltrans_out: - while True: # { + while True: try: lineno = lineno + 1 pt_line = phrase_table.readline().strip() bt_line = biltrans_out.readline().strip() - if not bt_line.strip() and not pt_line.strip(): # { + if not bt_line.strip() and not pt_line.strip(): break - elif not bt_line.strip() or not pt_line.strip(): # { + elif not bt_line.strip() or not pt_line.strip(): continue - # } row = pt_line.split('|||') bt = common.tokenise_biltrans_line(bt_line.strip()) sl = common.tokenise_tagger_line(row[1].strip()) tl = common.tokenise_tagger_line(row[0].strip()) - if not ambiguous(bt): # { + if not ambiguous(bt): not_ambiguous.append(str(lineno)) - if len(not_ambiguous) >= 10: # { + if len(not_ambiguous) >= 10: print("not ambiguous:", ' '.join( not_ambiguous), file=sys.stderr) not_ambiguous = [] - # } + continue - # } - if len(sl) < 2 and len(tl) < 2: # { + + if len(sl) < 2 and len(tl) < 2: continue - # } # Check that the number of words in the lexical transfer, and in the phrasetable matches up - if len(sl) != len(bt): # { + if len(sl) != len(bt): print("Error in line", lineno, ": len(sl) != len(bt)", file=sys.stderr) continue - # } # cheking if the alignments are empty if not row[2].strip(): @@ -88,8 +81,6 @@ def extract_sentences(phrase_table_file, biltrans_out_file): total_errors += 1 continue - # } - print('total:', lineno, file=sys.stderr) print('valid:', total_valid, '(' + str((total_valid/lineno)*100) + '%)', file=sys.stderr) @@ -98,9 +89,8 @@ def extract_sentences(phrase_table_file, biltrans_out_file): if __name__ == '__main__': - if len(sys.argv) < 3: # { + if len(sys.argv) < 3: print('Usage: extact-sentences.py ', file=sys.stderr) exit(1) - # } extract_sentences(sys.argv[1], sys.argv[2]) diff --git a/scripts/lambdas-to-rules.py b/scripts/lambdas-to-rules.py index 745c7c7..d6214e3 100644 --- a/scripts/lambdas-to-rules.py +++ b/scripts/lambdas-to-rules.py @@ -15,51 +15,48 @@ def lambdas_to_rules(freq_lexicon, rules): rindex = {} with open(freq_lexicon) as d: - for line in d: # { - if len(line) < 1: # { + for line in d: + if len(line) < 1: continue - # } + row = common.tokenise_tagger_line(line) sl = wrap(row[0].strip()) tl = wrap(row[1].strip()) if tl[1] == '*': tl = tl[:-3] + '$' - if sl not in sl_tl: # { + if sl not in sl_tl: sl_tl[sl] = [] - # } - if sl not in trad_counter: # { + + if sl not in trad_counter: trad_counter[sl] = 0 - # } - if line.count('@') > 0: # { + + if line.count('@') > 0: sl_tl_defaults[sl] = tl - # } + sl_tl[sl].append(tl) indexes[(sl, tl)] = trad_counter[sl] rindex[(sl, trad_counter[sl])] = tl trad_counter[sl] = trad_counter[sl] + 1 - # } - - for pair in rindex: # { + for pair in rindex: print(pair[0], pair[1], rindex[pair], file=sys.stderr) - # } # ability 0.25652 1 ability to # ability 1.54548 0 ability to deliver # ability 1.48162 0 our ability to with open(rules) as d: - for line in d: # { + for line in d: row = line.split(' \t ') slword = row[0].strip() l = float(row[1]) tlid = int(row[2]) - if (slword, tlid) not in rindex: # { + if (slword, tlid) not 
in rindex: print('(', slword, ',', tlid, ') not in index', file=sys.stderr) continue - # } + tlword = rindex[(slword, tlid)] context = row[3].strip() # #+ 0.571428571429 14 8 8 troiñ tourner 8 @@ -69,21 +66,17 @@ def lambdas_to_rules(freq_lexicon, rules): '\t' + context + '\t' + tlword + '\t1') # print(' ' % (l)) - # for c in context.split(' '): #{ - # if c.count(slword) == 1: #{ + # for c in context.split(' '): + # if c.count(slword) == 1: # print(slword, tlword) - # else: #{ + # else: # print(c) - # #} - # #} # print(' ') - # } - if __name__ == '__main__': - if len(sys.argv) < 3: # { + if len(sys.argv) < 3: print('Usage: lambdas-to-rules.py ', file=sys.stderr) exit(1) - # } + lambdas_to_rules(sys.argv[1], sys.argv[2]) diff --git a/scripts/merge-ngrams-lambdas.py b/scripts/merge-ngrams-lambdas.py index 3ceb065..fda05d7 100644 --- a/scripts/merge-ngrams-lambdas.py +++ b/scripts/merge-ngrams-lambdas.py @@ -3,7 +3,6 @@ # -*- encoding: utf-8 -*- import sys -import codecs import random @@ -15,24 +14,23 @@ def merge_ngrams_lambdas(ngf, ldf): ngrams = {} # lambdas = {} - for line in open(ngf).readlines(): # { + for line in open(ngf).readlines(): # 59763 poor in capital - if len(line) < 2: # { + if len(line) < 2: continue - # } + row = line.strip().split('\t') if(len(row) < 2): row.append('') ngid = int(row[0].strip()) ngrams[ngid] = row[1] - # } with open(ldf) as d: - for line in d: # { + for line in d: # 59176:0 1.00131 - if line.count('@@') > 0: # { + if line.count('@@') > 0: continue - # } + row = line.strip().split('\t') l = float(row[2]) @@ -42,12 +40,11 @@ def merge_ngrams_lambdas(ngf, ldf): trad = row[1].split(':')[1] token = row[0] print(token, '\t', l, '\t', trad, '\t', ngram) - # } if __name__ == '__main__': - if len(sys.argv) < 3: # { + if len(sys.argv) < 3: print('Usage: merge-ngrams-lambdas.py ', file=sys.stderr) exit(1) - # } + merge_ngrams_lambdas(sys.argv[1], sys.argv[2]) diff --git a/scripts/ngram-count-patterns-maxent2.py b/scripts/ngram-count-patterns-maxent2.py index c3a9714..387336f 100755 --- a/scripts/ngram-count-patterns-maxent2.py +++ b/scripts/ngram-count-patterns-maxent2.py @@ -3,8 +3,6 @@ # -*- encoding: utf-8 -*- import sys -import codecs -import copy import common # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations @@ -46,61 +44,52 @@ def ngram_count_patterns(freq_lexicon, candidates): indexes = {} trad_counter = {} - for line in open(freq_lexicon, 'r').readlines(): # { - if len(line) < 1: # { + for line in open(freq_lexicon, 'r').readlines(): + if len(line) < 1: continue - # } + w = int(line.split(' ')[0]) if w < THRESHOLD: continue row = common.tokenise_tagger_line(line) sl = wrap(row[0]).lower() - tl = wrap(row[1].strip()).lower() + tl = wrap(row[1]).lower() if tl[1] == '*': tl = tl[:-3] + '$' - if sl not in sl_tl: # { + if sl not in sl_tl: sl_tl[sl] = [] - # } - if sl not in trad_counter: # { + + if sl not in trad_counter: trad_counter[sl] = 0 - # } - if line.count('@') > 0: # { + + if line.count('@') > 0: sl_tl_defaults[sl] = tl sl_tl[sl].append(tl) indexes[(sl, tl)] = trad_counter[sl] trad_counter[sl] = trad_counter[sl] + 1 - # } - # } - cur_sl_row = [] cur_tl_row = [] cur_bt_row = [] cur_al_row = [] - for line in open(candidates, 'r').readlines(): # { + for line in open(candidates, 'r').readlines(): line = line.strip() - if line[0] == '-': # { - # print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row); - # print cur_sl_row; - # print cur_bt_row; - # print cur_tl_row; - # print cur_al_row; - 
# + if line[0] == '-': # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations # # sl_tl[sl_word][tl_word] = tl_freq i = 0 - for slword in cur_sl_row: # { - if len(cur_bt_row[i]['tls']) > 1: # { - for al in cur_al_row: # { + for slword in cur_sl_row: + if len(cur_bt_row[i]['tls']) > 1: + for al in cur_al_row: al_sl = int(al.split('-')[1]) al_tl = int(al.split('-')[0]) - if al_sl != i: # { + if al_sl != i: continue - # } + tlword = wrap(cur_tl_row[al_tl].lower()) slword = wrap(slword.lower()) @@ -108,21 +97,20 @@ def ngram_count_patterns(freq_lexicon, candidates): if tlword[1] == '*' or slword[1] == '*': continue - if slword not in sl_tl_defaults: # { - # print >>sys.stderr, 'WARNING: "' + slword + '" not in sl_tl_defaults, skipping'; + if slword not in sl_tl_defaults: + # print >>sys.stderr, 'WARNING: "' + slword + '" not in sl_tl_defaults, skipping' continue - # } - if (slword, tlword) not in indexes: # { - # print >>sys.stderr, 'WARNING: pair (%s, %s) not found in index' % (slword, tlword); + + if (slword, tlword) not in indexes: + # print >>sys.stderr, 'WARNING: pair (%s, %s) not found in index' % (slword, tlword) continue - # } - # if tlword != sl_tl_defaults[slword]: #{ - # print >>sys.stderr, '+' , slword , sl_tl_defaults[slword] , tlword; - # else: #{ - # print >>sys.stderr, '-' , slword , sl_tl_defaults[slword] , tlword; - # #} - # print >>sys.stderr, cur_sl_row; - for j in range(1, MAX_NGRAMS): # { + + # if tlword != sl_tl_defaults[slword]: + # print >>sys.stderr, '+' , slword , sl_tl_defaults[slword] , tlword + # else: + # print >>sys.stderr, '-' , slword , sl_tl_defaults[slword] , tlword + # print >>sys.stderr, cur_sl_row + for j in range(1, MAX_NGRAMS): # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+1] # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i:i+j+1] # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+j+1] @@ -132,147 +120,145 @@ def ngram_count_patterns(freq_lexicon, candidates): roundgram = ' '.join( map(wrap, cur_sl_row[i-j:i+j+1])) - if slword not in ngrams: # { + if slword not in ngrams: ngrams[slword] = {} - # } - if pregram not in ngrams[slword]: # { + + if pregram not in ngrams[slword]: ngrams[slword][pregram] = {} - # } - if postgram not in ngrams[slword]: # { + + if postgram not in ngrams[slword]: ngrams[slword][postgram] = {} - # } - if roundgram not in ngrams[slword]: # { + + if roundgram not in ngrams[slword]: ngrams[slword][roundgram] = {} - # } - if tlword not in ngrams[slword][pregram]: # { + + if tlword not in ngrams[slword][pregram]: ngrams[slword][pregram][tlword] = 0 - # } - if tlword not in ngrams[slword][postgram]: # { + + if tlword not in ngrams[slword][postgram]: ngrams[slword][postgram][tlword] = 0 - # } - if tlword not in ngrams[slword][roundgram]: # { + + if tlword not in ngrams[slword][roundgram]: ngrams[slword][roundgram][tlword] = 0 - # } + ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1 ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1 ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1 - # } - # print ',' , len(ngrams[slword]); - if slword not in meevents: # { + + # print ',' , len(ngrams[slword]) + if slword not in meevents: meevents[slword] = {} - # } - if slword not in meoutcomes: # { + + if slword not in meoutcomes: meoutcomes[slword] = {} - # } - if event_counter not in meevents: # { + + if event_counter not in meevents: meevents[slword][event_counter] = [] - # } - if event_counter not in meoutcomes[slword]: # 
{ + + if event_counter not in meoutcomes[slword]: meoutcomes[slword][event_counter] = '' - # } - for ni in ngrams[slword]: # { - if ni not in features: # { + + for ni in ngrams[slword]: + if ni not in features: feature_counter = feature_counter + 1 features[ni] = feature_counter - # } + meevents[slword][event_counter].append( features[ni]) - # meevents[slword][event_counter].append(feat); + # meevents[slword][event_counter].append(feat) meoutcomes[slword][event_counter] = tlword - # } + del ngrams ngrams = {} - if len(sl_tl[slword]) < 2: # { + if len(sl_tl[slword]) < 2: continue - # } - for event in meevents[slword]: # { + + for event in meevents[slword]: outline = str( indexes[(slword, meoutcomes[slword][event])]) + ' # ' - for j in range(0, len(sl_tl[slword])): # { - for feature in meevents[slword][event]: # { + for j in range(0, len(sl_tl[slword])): + for feature in meevents[slword][event]: outline = outline + \ str(feature) + ':' + str(j) + ' ' - # } + outline = outline + ' # ' - # } + print(slword, '\t', len( sl_tl[slword]), '\t', outline) - # } + del meevents del meoutcomes meevents = {} meoutcomes = {} - # for f in features: #{ - # print >>sys.stderr, features[f] , f; - # #} + # for f in features: + # print >>sys.stderr, features[f] , f - # } + - # for j in range(0, MAX_NGRAMS): #{ - # print cur_sl_row[i-j:i+1]; - # print cur_sl_row[i:i+j]; - # #} - # print ngrams[slword]; - # } + # for j in range(0, MAX_NGRAMS): + # print cur_sl_row[i-j:i+1] + # print cur_sl_row[i:i+j] + # print ngrams[slword] + i = i + 1 - # } + cur_line = 0 event_counter = event_counter + 1 - # print line; + # print line continue - # } + line = line.split('\t')[1] line = line.strip() - if cur_line == 0: # { + if cur_line == 0: cur_sl_row = common.tokenise_tagger_line(line) - elif cur_line == 1: # { + elif cur_line == 1: cur_bt_row = common.tokenise_biltrans_line(line) - elif cur_line == 2: # { + elif cur_line == 2: cur_tl_row = common.tokenise_tagger_line(line) - elif cur_line == 3: # { + elif cur_line == 3: cur_al_row = line.split(' ') - # } + cur_line = cur_line + 1 - # } + - for feature in features: # { + for feature in features: print(features[feature], '\t', feature, file=sys.stderr) - # } + # exit(1) return - for slword in meevents: # { - if len(sl_tl[slword]) < 2: # { - continue - # } - for event in meevents[slword]: # { - outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # ' - for j in range(0, len(sl_tl[slword])): # { - for feature in meevents[slword][event]: # { - outline = outline + str(feature) + ':' + str(j) + ' ' - # } - outline = outline + ' # ' - # } - print(slword, '\t', len(sl_tl[slword]), '\t', outline) - # } - # } + # for slword in meevents: + # if len(sl_tl[slword]) < 2: + # continue + + # for event in meevents[slword]: + # outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # ' + # for j in range(0, len(sl_tl[slword])): + # for feature in meevents[slword][event]: + # outline = outline + str(feature) + ':' + str(j) + ' ' + + # outline = outline + ' # ' + + # print(slword, '\t', len(sl_tl[slword]), '\t', outline) + + if __name__ == '__main__': - if len(sys.argv) not in [3, 4]: # { + if len(sys.argv) not in [3, 4]: print( - 'Usage: count-patterns.py [threshold]', file=sys.stderr) + 'Usage: count-patterns.py [threshold]', file=sys.stderr) exit(1) - # } + if len(sys.argv) == 4: THRESHOLD = int(sys.argv[3]) diff --git a/scripts/ngram-count-patterns.py b/scripts/ngram-count-patterns.py index 91b7ced..52e60ce 100755 --- a/scripts/ngram-count-patterns.py +++ 
b/scripts/ngram-count-patterns.py @@ -2,11 +2,8 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy, commands; +import sys import common -sys.stdin = codecs.getreader('utf-8')(sys.stdin); -sys.stdout = codecs.getwriter('utf-8')(sys.stdout); -sys.stderr = codecs.getwriter('utf-8')(sys.stderr); # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations @@ -17,198 +14,192 @@ sys.stderr = codecs.getwriter('utf-8')(sys.stderr); # ngrams[ngram][tl_word] = freq # 5 Please rise , then , for this minute 's silence . -#5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. -#5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . -#5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 -#------------------------------------------------------------------------------- - -def wrap (x): - return '^' + x + '$' - -if len(sys.argv) < 3: #{ - print ('count-patterns.py '); - sys.exit(-1); -#} - -MAX_NGRAMS = 2; - -crisphold = float(sys.argv[3]); -cur_line = 0; - -sl_tl_defaults = {}; -sl_tl = {}; -ngrams = {}; - -lineno = 0 -for line in file(sys.argv[1]).readlines(): #{ - lineno += 1 - if lineno % 10000 == 0: - print >> sys.stderr, lineno - if len(line) < 1: #{ - continue; - #} - row = common.tokenise_tagger_line(line.decode('utf-8')); - sl = wrap(row[0]); - tl = wrap(row[1]); - if tl[1] == '*': - tl = tl[:-3] + '$' - if line.count('@') > 0: #{ - sl_tl_defaults[sl] = tl; - else: #{ - sl_tl[sl] = tl; - #} -#} - -cur_sl_row = []; -cur_tl_row = []; -cur_bt_row = []; -cur_al_row = []; -lineno = 0 -for line in file(sys.argv[2]).readlines(): #{ - lineno += 1 - line = line.strip().decode('utf-8'); - if lineno % 500 == 0: - print >> sys.stderr, lineno - if line[0] == '-': #{ - # print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row); - # print cur_sl_row; - # print cur_bt_row; - # print cur_tl_row; - # print cur_al_row; - # - # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations - # - # sl_tl[sl_word][tl_word] = tl_freq - i = 0; - for slword in cur_sl_row: #{ - if len(cur_bt_row[i]['tls']) > 1: #{ - for al in cur_al_row: #{ - if al == '': - continue - al_sl = int(al.split('-')[1]); - al_tl = int(al.split('-')[0]); - if al_sl != i: #{ - continue; - #} - tlword = wrap(cur_tl_row[al_tl]); - slword = wrap(slword); - - if slword not in sl_tl_defaults: #{ - print >>sys.stderr, '!', - continue; - #} - - for j in range(1, MAX_NGRAMS): #{ - - pregram = ' '.join(map(wrap, cur_sl_row[i-j:i+1])); - postgram = ' '.join(map(wrap, cur_sl_row[i:i+j+1])); - roundgram = ' '.join(map(wrap, cur_sl_row[i-j:i+j+1])); - - if slword not in ngrams: #{ - ngrams[slword] = {}; - #} - if pregram not in ngrams[slword]: #{ - ngrams[slword][pregram] = {}; - #} - if postgram not in ngrams[slword]: #{ - ngrams[slword][postgram] = {}; - #} - if roundgram not in ngrams[slword]: #{ - ngrams[slword][roundgram] = {}; - #} - if tlword not in ngrams[slword][pregram]: #{ - ngrams[slword][pregram][tlword] = 0; - #} - if tlword not in ngrams[slword][postgram]: #{ - ngrams[slword][postgram][tlword] = 0; - #} - if tlword not in ngrams[slword][roundgram]: #{ - ngrams[slword][roundgram][tlword] = 0; - #} - - ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1; - ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1; - ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1; - #} - #} -# for j in 
range(0, MAX_NGRAMS): #{ -# print cur_sl_row[i-j:i+1]; -# print cur_sl_row[i:i+j]; -# #} - #} - i = i + 1; - #} - cur_line = 0; - #print line; - continue; - #} - - line = line.split('\t')[1]; - - if cur_line == 0: #{ - cur_sl_row = common.tokenise_tagger_line(line) - elif cur_line == 1: #{ - cur_bt_row = common.tokenise_biltrans_line(line) - elif cur_line == 2: #{ - cur_tl_row = common.tokenise_tagger_line(line) - elif cur_line == 3: #{ - cur_al_row = line.split(' '); - #} - - cur_line = cur_line + 1; -#} - - -for sl in ngrams: #{ - - for ngram in ngrams[sl]: #{ - total = 0; - max_freq = -1; - current_tl = ''; - for tl in ngrams[sl][ngram]: #{ - if ngrams[sl][ngram][tl] > max_freq: #{ - max_freq = ngrams[sl][ngram][tl]; - current_tl = tl; - #} - total = total + ngrams[sl][ngram][tl]; - #} - - #> If for each of the rules we include - #> the amount of time the translation is seen with that pattern over the - #> total, we get a number we can try as a threshold. e.g. > 0.6 >0.7 >0.8 - #> etc. (>0.6 would be the same as 2/3 of the time the alternative - #> translation is seen with that ngram, and 1/3 of the time the default - #> translation is). I think this would be easier to explain than the magic - #> number I came up with. - # - #I see this as a way to define how "crispy" the decisions are. I think it - #would be better to express this as a ratio: the ratio of the times the - #alternative translation is seen to the number of times the defaullt - #translation is seen with that n-gram. - # - #It would be "2" in this case: the alternative is seen twice as often as - #the default. - - for tl in ngrams[sl][ngram]: #{ - crispiness = 0.0; - default = sl_tl_defaults[sl]; - alt_crisp = float(ngrams[sl][ngram][tl]) / float(total); - def_crisp = 1.0; - if default in ngrams[sl][ngram]: #{ - def_crisp = float(ngrams[sl][ngram][default] / float(total)); - #} - weight = float(ngrams[sl][ngram][tl]) / float(total); - crispiness = alt_crisp/def_crisp; - - #print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] ; - - if crispiness < crisphold: #{ - print '-', crispiness , weight , total, max_freq, ngrams[sl][ngram][tl], '\t'+ sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]); - else: #{ - - print '+', crispiness , weight , total, max_freq, ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][current_tl]); - #} - - #} - #} -#} +# 5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. +# 5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . 
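In both the old and the new ngram-count-patterns.py, each ambiguous source word at position i contributes three context features per window size j: the preceding, following, and surrounding n-grams. How those slices fall on a token row, with the wrap() calls left out for readability (illustration only, not part of the patch; the sentence is made up):

# Sketch of the context windows built inside the n-gram counting loop.
cur_sl_row = ['please', 'rise', ',', 'then', ',', 'for', 'this', 'minute']
i, j = 6, 1                                  # ambiguous word 'this', window of 1
pregram = ' '.join(cur_sl_row[i-j:i+1])      # 'for this'
postgram = ' '.join(cur_sl_row[i:i+j+1])     # 'this minute'
roundgram = ' '.join(cur_sl_row[i-j:i+j+1])  # 'for this minute'
# Each of the three strings gets a per-translation counter bumped in
# ngrams[slword][context][tlword], as in the loop below.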
+# 5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 +# ------------------------------------------------------------------------------- + + +def wrap(x): + return '^' + x + '$' + + +def ngram_count_patterns(freq_lexicon, candidates, crisphold): + MAX_NGRAMS = 2 + + cur_line = 0 + + sl_tl_defaults = {} + sl_tl = {} + ngrams = {} + + lineno = 0 + for line in open(freq_lexicon).readlines(): + lineno += 1 + if lineno % 10000 == 0: + print(lineno, file=sys.stderr) + if len(line) < 1: + continue + + row = common.tokenise_tagger_line(line) + sl = wrap(row[0]) + tl = wrap(row[1]) + if tl[1] == '*': + tl = tl[:-3] + '$' + if line.count('@') > 0: + sl_tl_defaults[sl] = tl + else: + sl_tl[sl] = tl + + cur_sl_row = [] + cur_tl_row = [] + cur_bt_row = [] + cur_al_row = [] + lineno = 0 + for line in open(candidates).readlines(): + lineno += 1 + line = line.strip() + if lineno % 500 == 0: + print(lineno, file=sys.stderr) + if line[0] == '-': + # print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row) + # print cur_sl_row + # print cur_bt_row + # print cur_tl_row + # print cur_al_row + # + # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations + # + # sl_tl[sl_word][tl_word] = tl_freq + i = 0 + for slword in cur_sl_row: + if len(cur_bt_row[i]['tls']) > 1: + for al in cur_al_row: + if al == '': + continue + al_sl = int(al.split('-')[1]) + al_tl = int(al.split('-')[0]) + if al_sl != i: + continue + + tlword = wrap(cur_tl_row[al_tl]) + slword = wrap(slword) + + if slword not in sl_tl_defaults: + print('!', file=sys.stderr) + continue + + for j in range(1, MAX_NGRAMS): + + pregram = ' '.join(map(wrap, cur_sl_row[i-j:i+1])) + postgram = ' '.join(map(wrap, cur_sl_row[i:i+j+1])) + roundgram = ' '.join( + map(wrap, cur_sl_row[i-j:i+j+1])) + + if slword not in ngrams: + ngrams[slword] = {} + + if pregram not in ngrams[slword]: + ngrams[slword][pregram] = {} + + if postgram not in ngrams[slword]: + ngrams[slword][postgram] = {} + + if roundgram not in ngrams[slword]: + ngrams[slword][roundgram] = {} + + if tlword not in ngrams[slword][pregram]: + ngrams[slword][pregram][tlword] = 0 + + if tlword not in ngrams[slword][postgram]: + ngrams[slword][postgram][tlword] = 0 + + if tlword not in ngrams[slword][roundgram]: + ngrams[slword][roundgram][tlword] = 0 + + ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1 + ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1 + ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1 + + # for j in range(0, MAX_NGRAMS): + # print cur_sl_row[i-j:i+1] + # print cur_sl_row[i:i+j] + + i = i + 1 + + cur_line = 0 + # print line + continue + + line = line.split('\t')[1] + + if cur_line == 0: + cur_sl_row = common.tokenise_tagger_line(line) + elif cur_line == 1: + cur_bt_row = common.tokenise_biltrans_line(line) + elif cur_line == 2: + cur_tl_row = common.tokenise_tagger_line(line) + elif cur_line == 3: + cur_al_row = line.split(' ') + + cur_line = cur_line + 1 + + for sl in ngrams: + + for ngram in ngrams[sl]: + total = 0 + max_freq = -1 + current_tl = '' + for tl in ngrams[sl][ngram]: + if ngrams[sl][ngram][tl] > max_freq: + max_freq = ngrams[sl][ngram][tl] + current_tl = tl + + total = total + ngrams[sl][ngram][tl] + + # > If for each of the rules we include + # > the amount of time the translation is seen with that pattern over the + # > total, we get a number we can try as a threshold. e.g. > 0.6 >0.7 >0.8 + # > etc. 
(>0.6 would be the same as 2/3 of the time the alternative + # > translation is seen with that ngram, and 1/3 of the time the default + # > translation is). I think this would be easier to explain than the magic + # > number I came up with. + # + # I see this as a way to define how "crispy" the decisions are. I think it + # would be better to express this as a ratio: the ratio of the times the + # alternative translation is seen to the number of times the defaullt + # translation is seen with that n-gram. + # + # It would be "2" in this case: the alternative is seen twice as often as + # the default. + + for tl in ngrams[sl][ngram]: + crispiness = 0.0 + default = sl_tl_defaults[sl] + alt_crisp = float(ngrams[sl][ngram][tl]) / float(total) + def_crisp = 1.0 + if default in ngrams[sl][ngram]: + def_crisp = float( + ngrams[sl][ngram][default] / float(total)) + + weight = float(ngrams[sl][ngram][tl]) / float(total) + crispiness = alt_crisp/def_crisp + + # print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] + + if crispiness < crisphold: + print('-', crispiness, weight, total, max_freq, + ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][tl])) + else: + + print('+', crispiness, weight, total, max_freq, + ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][current_tl])) + + +if __name__ == '__main__': + if len(sys.argv) < 4: + print( + 'Usage: count-patterns.py ', file=sys.stderr) + exit(1) + + ngram_count_patterns(sys.argv[1], sys.argv[2], sys.argv[3]) diff --git a/scripts/ngrams-to-rules-me.py b/scripts/ngrams-to-rules-me.py index ded1119..9c67b67 100755 --- a/scripts/ngrams-to-rules-me.py +++ b/scripts/ngrams-to-rules-me.py @@ -10,8 +10,6 @@ import common # +nature the secular nature of State carácter 1 # +nature its nature prevent carácter 1 # +nature nature be in carácter 1 -# - def ngrams_to_rules(ngrams): # FREQMIN = 8.0 @@ -25,12 +23,12 @@ def ngrams_to_rules(ngrams): ruleno = 0 with open(ngrams) as infile: - for line in infile: # { + for line in infile: # print '\n' # print line - if len(line) < 2: # { + if len(line) < 2: continue - # } + line = line.strip() # line = line.decode('utf-8').strip() print(line, file=sys.stderr) @@ -49,52 +47,44 @@ def ngrams_to_rules(ngrams): pattern = common.tokenize_tagger_line(row[2]) - if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: # { + if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr) continue - # } - if tipus == '-' or tipus == '~': # { + if tipus == '-' or tipus == '~': print('DEFAULT_READING', line, file=sys.stderr) continue - # } # Hacks - # if len(pattern) == 0: #{ + # if len(pattern) == 0: # print('ZERO_PATTERN' , line, file=sys.stderr); # continue - # } - if len(pattern) < MINMATCH and len(pattern) > 0: # { + if len(pattern) < MINMATCH and len(pattern) > 0: print('BELOW_MINMATCH', line, file=sys.stderr) continue - # } inpattern = False - for w in pattern: # { - if w.lower().count(sl) > 0: # { + for w in pattern: + if w.lower().count(sl) > 0: inpattern = True - # } - # } - if len(pattern) > 0 and not inpattern: # { + + if len(pattern) > 0 and not inpattern: print('SL_NOT_IN_PATTERN', line, file=sys.stderr) continue - # } - if tl_tags.count('adj') > 0 and sl.count('adj') < 1: # { + if tl_tags.count('adj') > 0 and sl.count('adj') < 1: print("TAG_MISMATCH", line, file=sys.stderr) continue - # } - if tl_tags.count('vbmod') > 0 and 
sl.count('vbmod') < 1: # { + + if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1: print("TAG_MISMATCH", line, file=sys.stderr) continue - # } - if tl_tags.split('.')[0] not in permitted_tags: # { + if tl_tags.split('.')[0] not in permitted_tags: print("TAG_NOT_PERMITTED", tl_tags, '||', line, file=sys.stderr) continue - # } sel = False ruleno = ruleno + 1 @@ -102,21 +92,19 @@ def ngrams_to_rules(ngrams): commentb = '' commente = '' - # if freq < FREQMIN: #{ + # if freq < FREQMIN: # commentb = '' - # #} print(commentb + ' ') - for word in pattern: # { + for word in pattern: sl_lema = word.split('<')[0].lower() - if word.count('><') > 0: # { + if word.count('><') > 0: sl_tags = '<'.join(word.split('<')[1:]).replace( '><', '.').replace('>', '') - else: # { + else: sl_tags = '<'.join(word.split('<')[1:]).strip('<>') - # } # ======================================================================= # @@ -129,55 +117,53 @@ def ngrams_to_rules(ngrams): # sl_lema = sl_lema.replace(')', '\)') # tl_lema = tl_lema.replace(')', '\)') # - if word.lower().count(sl) > 0: # { + if word.lower().count(sl) > 0: lineno = lineno + 1 - if sl_lema == '': # { + if sl_lema == '': print(' ') - # } + sel = True - else: # { + else: lineno = lineno + 1 - if sl_lema == '': # { + if sl_lema == '': print(' ') - else: # { + else: print(' ') - # } - # } - # } - if sel == False and len(pattern) == 0: # { + + if sel == False and len(pattern) == 0: sl_lema = sl.split('<')[0] - if sl.count('><') > 0: # { + if sl.count('><') > 0: sl_tags = '<'.join(sl.split('<')[1:]).replace( '><', '.').replace('>', '') - else: # { + else: sl_tags = '<'.join(sl.split('<')[1:]).strip('<>') - # } - if sl_lema == '': # { + + if sl_lema == '': print(' ') print(' ' + commente) elif sel == False: print(' '+commente + '') - else: # { + else: print(' ' + commente) - # } + lineno = lineno + 1 - # } + print('') if __name__ == '__main__': - if len(sys.argv) < 2: # { - print('Usage: ngrams-to-rules.py ', file=sys.stderr) + if len(sys.argv) < 2: + print('Usage: ngrams-to-rules-me.py ', file=sys.stderr) exit(1) - # } + ngrams_to_rules(sys.argv[1]) diff --git a/scripts/ngrams-to-rules.py b/scripts/ngrams-to-rules.py index 13c2643..329851e 100755 --- a/scripts/ngrams-to-rules.py +++ b/scripts/ngrams-to-rules.py @@ -2,148 +2,140 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys; +import sys import common #+nature service nature carácter 3 -#+nature The imperialist nature carácter 1 -#+nature the secular nature of State carácter 1 -#+nature its nature prevent carácter 1 -#+nature nature be in carácter 1 -# - -infile = ''; - -if len(sys.argv) < 3: #{ - print('ngrams-to-rules.py '); - sys.exit(-1); -#} - -infile = open(sys.argv[1]); -threshold = float(sys.argv[2]); - -permitted_tags = ['n', 'vblex', 'adj', 'n.*', 'vblex.*', 'adj.*']; - -print(''); -lineno = 1; -ruleno = 0; -for line in infile.readlines(): #{ -# print('\n'; -# print(line - if len(line) < 2: #{ - continue; - #} - line = line.strip(); - #line = line.decode('utf-8').strip(); - - - #+ 0.571428571429 14 8 8 troiñ tourner 8 - row = line.split('\t'); - - if len(row) == 3: - row.insert(0, ''); - -# tipus = row[0].split(' ')[0]; - weight = row[0].split(' ')[1]; - sl = row[1].strip()[1:-1]; - tl = row[3][1:-1]; - tl_lema = tl.split('<')[0].lower(); - tl_tags = '<'.join(tl.split('<')[1:]).replace('><', '.').replace('>', ''); - - - freq = row[4]; - pattern = common.tokenize_tagger_line(row[2]); - - if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: #{ - 
print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr); - continue; - #} - - inpattern = False; - for w in pattern: #{ - if w.count(sl) > 0: #{ - inpattern = True; - #} - #} - if inpattern == False: #{ - print('SL_NOT_IN_PATTERN' , line, sl, tl, file=sys.stderr); - continue; - #} - - if tl_tags.count('adj') > 0 and sl.count('adj') < 1: #{ - print("TAG_MISMATCH" , line, file=sys.stderr); - continue; - #} - if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1: #{ - print("TAG_MISMATCH" , line, file=sys.stderr); - continue; - #} - - if tl_tags.split('.')[0] not in permitted_tags: #{ - print("TAG_NOT_PERMITTED" , tl_tags , '||' , line, file=sys.stderr); - continue; - #} - - if float(weight) <= float(threshold): #{ - print("UNDER_THRESHOLD", weight, "<", threshold, "||", line, file=sys.stderr); - continue; - #} - - if any([x.startswith("*") for x in pattern]): #{ - print("UNKNOWN_WORD_IN_PATTERN" , pattern, file=sys.stderr); - continue; - #} - - sel = False; - ruleno = ruleno + 1; - lineno = lineno + 1; - - print(' '); - for word in pattern: #{ - sl_lema = word.split('<')[0].lower(); - if (sl_lema[0] == '*'): - continue; - - if word.count('><') > 0: #{ - sl_tags = '<'.join(word.split('<')[1:]).replace('><', '.').replace('>', ''); - else: #{ - sl_tags = '<'.join(word.split('<')[1:]).strip('<>'); - #} - - # ======================================================================= # - - sl_lema = sl_lema.replace('~', ' '); - tl_lema = tl_lema.replace('~', ' '); - sl_lema = sl_lema.replace('-', '\-'); - tl_lema = tl_lema.replace('-', '\-'); - sl_lema = sl_lema.replace('(', '\('); - tl_lema = tl_lema.replace('(', '\('); - sl_lema = sl_lema.replace(')', '\)'); - tl_lema = tl_lema.replace(')', '\)'); - - if word.lower().count(sl) > 0: #{ - lineno = lineno + 1; - if sl_lema == '': #{ - print(' '); - #} - sel = True; - else: #{ - lineno = lineno + 1; - if sl_lema == '': #{ - print(' '); - else: #{ - print(' '); - #} - #} - #} - if sel == False: #{ - - print(' '); - else: #{ - print(' '); - #} - lineno = lineno + 1; -#} -print(''); +# +nature The imperialist nature carácter 1 +# +nature the secular nature of State carácter 1 +# +nature its nature prevent carácter 1 +# +nature nature be in carácter 1 + +def ngrams_to_rules(ngrams, crisphold): + permitted_tags = ['n', 'vblex', 'adj', 'n.*', 'vblex.*', 'adj.*'] + + print('') + lineno = 1 + ruleno = 0 + for line in open(ngrams).readlines(): + # print('\n'; + # print(line + if len(line) < 2: + continue + + line = line.strip() + #line = line.strip(); + + # + 0.571428571429 14 8 8 troiñ tourner 8 + row = line.split('\t') + + if len(row) == 3: + row.insert(0, '') + + # tipus = row[0].split(' ')[0]; + weight = row[0].split(' ')[1] + sl = row[1].strip()[1:-1] + tl = row[3][1:-1] + tl_lema = tl.split('<')[0].lower() + tl_tags = '<'.join(tl.split('<')[1:]).replace( + '><', '.').replace('>', '') + + freq = row[4] + pattern = common.tokenize_tagger_line(row[2]) + + if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: + print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr) + continue + + inpattern = False + for w in pattern: + if w.count(sl) > 0: + inpattern = True + + if inpattern == False: + print('SL_NOT_IN_PATTERN', line, sl, tl, file=sys.stderr) + continue + + if tl_tags.count('adj') > 0 and sl.count('adj') < 1: + print("TAG_MISMATCH", line, file=sys.stderr) + continue + + if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1: + print("TAG_MISMATCH", line, file=sys.stderr) + continue + + if tl_tags.split('.')[0] not in permitted_tags: + 
print("TAG_NOT_PERMITTED", tl_tags, '||', line, file=sys.stderr) + continue + + if float(weight) <= float(crisphold): + print("UNDER_THRESHOLD", weight, "<", + crisphold, "||", line, file=sys.stderr) + continue + + if any([x.startswith("*") for x in pattern]): + print("UNKNOWN_WORD_IN_PATTERN", pattern, file=sys.stderr) + continue + + sel = False + ruleno = ruleno + 1 + lineno = lineno + 1 + + print(' ') + for word in pattern: + sl_lema = word.split('<')[0].lower() + if (sl_lema[0] == '*'): + continue + + if word.count('><') > 0: + sl_tags = '<'.join(word.split('<')[1:]).replace( + '><', '.').replace('>', '') + else: + sl_tags = '<'.join(word.split('<')[1:]).strip('<>') + + # ======================================================================= # + + sl_lema = sl_lema.replace('~', ' ') + tl_lema = tl_lema.replace('~', ' ') + sl_lema = sl_lema.replace('-', '\-') + tl_lema = tl_lema.replace('-', '\-') + sl_lema = sl_lema.replace('(', '\(') + tl_lema = tl_lema.replace('(', '\(') + sl_lema = sl_lema.replace(')', '\)') + tl_lema = tl_lema.replace(')', '\)') + + if word.lower().count(sl) > 0: + lineno = lineno + 1 + if sl_lema == '': + print(' ') + + sel = True + else: + lineno = lineno + 1 + if sl_lema == '': + print(' ') + else: + print(' ') + + if sel == False: + + print(' ') + else: + print(' ') + + lineno = lineno + 1 + print('') + + +if __name__ == '__main__': + if len(sys.argv) < 3: + print('Usage: ngrams-to-rules.py ', file=sys.stderr) + exit(1) + + ngrams_to_rules(sys.argv[1], sys.argv[2])