commit 1afdffd74fd3853e7eabedbd071e7d7344bb07c4 Author: vivekvardhanadepu Date: Thu Jun 17 15:25:56 2021 +0530 Scripts fixup: enclosing the code in functions diff --git a/scripts/extract-freq-lexicon.py b/scripts/extract-freq-lexicon.py index 379539a..3534ac6 100755 --- a/scripts/extract-freq-lexicon.py +++ b/scripts/extract-freq-lexicon.py @@ -2,7 +2,8 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, common +import sys +import common # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations @@ -13,128 +14,135 @@ import sys, common # ngrams[ngram][tl_word] = freq # 5 Please rise , then , for this minute 's silence . -#5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. -#5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . -#5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 -#------------------------------------------------------------------------------- - -def wrap (x): - return '^' + x + '$' - -MAX_NGRAMS = 3 - -cur_line = 0 -lineno = 0 -sl_tl = {} -ngrams = {} - -cur_sl_row = [] -cur_tl_row = [] -cur_bt_row = [] -cur_al_row = [] - -if len(sys.argv) < 2: #{ - print('extract-freq-lexicon.py ') - sys.exit(-1) -#} - -#for line in open(sys.argv[1]).readlines(): #{ -with open(sys.argv[1]) as infile: - for line in infile: #{ - line = line.strip() - lineno += 1 - if lineno % 5000 == 0: #{ - sys.stderr.write('.') - if lineno % 100000 == 0: #{ - sys.stderr.write(str(lineno)+'\n') - #} - sys.stderr.flush() - #} - try: - if line[0] == '-': #{ - # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations - # - # sl_tl[sl_word][tl_word] = tl_freq - i = 0 - for slword in cur_sl_row: #{ - if len(cur_bt_row[i]['tls']) > 1: #{ - for al in cur_al_row: #{ - if al == '': - continue - al_sl = int(al.split('-')[1]) - al_tl = int(al.split('-')[0]) - if al_sl != i: #{ - continue - #} - if al_tl < len(cur_tl_row): - tlword = cur_tl_row[al_tl] - else: - tlword = cur_tl_row[-1] - print("alignment out", - "of", - "range", al_tl, - "not in", - "len(", - cur_tl_row, - ")", - file=sys.stderr) - slword = slword - if slword not in sl_tl: #{ - sl_tl[slword] = {} - #} - if tlword not in sl_tl[slword]: #{ - sl_tl[slword][tlword] = 0 - #} - sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1 - # print '+' , slword , tlword , sl_tl[slword][tlword], lineno - #} - #} - i = i + 1 - #} - cur_line = 0 - continue - #} - - line = line.split('\t')[1] - - if cur_line == 0: #{ - cur_sl_row = common.tokenise_tagger_line(line) - elif cur_line == 1: #{ - cur_bt_row = common.tokenise_biltrans_line(line) - elif cur_line == 2: #{ - cur_tl_row = common.tokenise_tagger_line(line) - elif cur_line == 3: #{ - cur_al_row = line.split(' ') - #} - - cur_line = cur_line + 1 - except Exception as e: - print("Error in line", lineno, ":", e, file=sys.stderr) - sys.exit(-1) - #} -#} - -for sl in sl_tl: #{ - - newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x]) - newtl.reverse() - first = True - for tl in newtl: #{ - if tl[0] == '*': #{ - print('Error: tl word unknown', tl, file=sys.stderr) - continue - #} - first_tag_sl = sl.split('<')[1].split('>')[0].strip() - first_tag_tl = tl.split('<')[1].split('>')[0].strip() - if first_tag_sl != first_tag_tl: #{ - print('Error:', first_tag_sl, '!=', first_tag_tl, file=sys.stderr) - continue - #} - if first: #{ - print(sl_tl[sl][tl] , wrap(sl) , wrap(tl) , '@') - first = False - else: #{ - print(sl_tl[sl][tl] , wrap(sl) , 
wrap(tl)) - #} - #} -#} +# 5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. +# 5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . +# 5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 +# ------------------------------------------------------------------------------- + + +def wrap(x): + return '^' + x + '$' + + +def extract_freq_lexicon(canditates): + # MAX_NGRAMS = 3 + + cur_line = 0 + lineno = 0 + sl_tl = {} + # ngrams = {} + + cur_sl_row = [] + cur_tl_row = [] + cur_bt_row = [] + cur_al_row = [] + + # for line in open(sys.argv[1]).readlines(): #{ + with open(canditates) as infile: + for line in infile: # { + line = line.strip() + lineno += 1 + if lineno % 5000 == 0: # { + sys.stderr.write('.') + if lineno % 100000 == 0: # { + sys.stderr.write(str(lineno)+'\n') + # } + sys.stderr.flush() + # } + try: + if line[0] == '-': # { + # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations + # + # sl_tl[sl_word][tl_word] = tl_freq + i = 0 + for slword in cur_sl_row: # { + if len(cur_bt_row[i]['tls']) > 1: # { + for al in cur_al_row: # { + if al == '': + continue + al_sl = int(al.split('-')[1]) + al_tl = int(al.split('-')[0]) + if al_sl != i: # { + continue + # } + if al_tl < len(cur_tl_row): + tlword = cur_tl_row[al_tl] + else: + tlword = cur_tl_row[-1] + print("alignment out", + "of", + "range", al_tl, + "not in", + "len(", + cur_tl_row, + ")", + file=sys.stderr) + slword = slword + if slword not in sl_tl: # { + sl_tl[slword] = {} + # } + if tlword not in sl_tl[slword]: # { + sl_tl[slword][tlword] = 0 + # } + sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1 + # print '+' , slword , tlword , sl_tl[slword][tlword], lineno + # } + # } + i = i + 1 + # } + cur_line = 0 + continue + # } + + line = line.split('\t')[1] + + if cur_line == 0: # { + cur_sl_row = common.tokenise_tagger_line(line) + elif cur_line == 1: # { + cur_bt_row = common.tokenise_biltrans_line(line) + elif cur_line == 2: # { + cur_tl_row = common.tokenise_tagger_line(line) + elif cur_line == 3: # { + cur_al_row = line.split(' ') + # } + + cur_line = cur_line + 1 + except Exception as e: + print("Error in line", lineno, ":", e, file=sys.stderr) + sys.exit(-1) + # } + # } + + for sl in sl_tl: # { + + newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x]) + newtl.reverse() + first = True + for tl in newtl: # { + if tl[0] == '*': # { + print('Error: tl word unknown', tl, file=sys.stderr) + continue + # } + first_tag_sl = sl.split('<')[1].split('>')[0].strip() + first_tag_tl = tl.split('<')[1].split('>')[0].strip() + if first_tag_sl != first_tag_tl: # { + print('Error:', first_tag_sl, '!=', + first_tag_tl, file=sys.stderr) + continue + # } + if first: # { + print(sl_tl[sl][tl], wrap(sl), wrap(tl), '@') + first = False + else: # { + print(sl_tl[sl][tl], wrap(sl), wrap(tl)) + # } + # } + # } + + +if __name__ == '__main__': + if len(sys.argv) < 2: # { + print('extract-freq-lexicon.py ') + exit(1) + # } + extract_freq_lexicon(sys.argv[1]) diff --git a/scripts/extract-sentences.py b/scripts/extract-sentences.py index 0d8834b..01aedf7 100755 --- a/scripts/extract-sentences.py +++ b/scripts/extract-sentences.py @@ -2,99 +2,107 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs +import sys +import codecs import common -if len(sys.argv) < 3: #{ - print('extact-sentences.py ') - sys.exit(-1) -#} - -phrase_table = open(sys.argv[1]) -biltrans_out = open(sys.argv[2]) - -def ambiguous(bt): #{ 
- # legislation/legislación/ordenamiento - - ambig = False - for token in bt: #{ - tls = token['tls'] - if len(tls) > 1: #{ - return True - #} - #} - - return ambig -#} - -reading = True -lineno = 0 -total_valid = 0 -total_errors = 0 - -not_ambiguous = [] - -while reading: #{ - try: - lineno = lineno + 1 - pt_line = phrase_table.readline().strip() - bt_line = biltrans_out.readline().strip() - - if not bt_line.strip() and not pt_line.strip(): #{ - reading = False - break - elif not bt_line.strip() or not pt_line.strip(): #{ - continue - - #} - row = pt_line.split('|||') - bt = common.tokenise_biltrans_line(bt_line.strip()) - sl = common.tokenise_tagger_line(row[1].strip()) - tl = common.tokenise_tagger_line(row[0].strip()) - - if not ambiguous(bt): #{ - not_ambiguous.append(str(lineno)) - if len(not_ambiguous) >= 10: #{ - print ("not ambiguous:", ' '.join(not_ambiguous), file=sys.stderr) - not_ambiguous = [] - #} - continue - #} - if len(sl) < 2 and len(tl) < 2: #{ - continue - #} - - - # Check that the number of words in the lexical transfer, and in the phrasetable matches up - if len(sl) != len(bt): #{ - print ("Error in line", lineno, ": len(sl) != len(bt)", file=sys.stderr) - continue - #} - - # cheking if the alignments are empty - if not row[2].strip(): - print("In line", lineno, ", alignments are empty", file=sys.stderr) - continue - - # Resumption of the session - # Resumption/Reanudación of/de the/el session/sesión - # Reanudación de el periodo de sesión - # 0-0 1-1 2-2 5-3 - - - print(lineno, '\t' + row[1]) - print(lineno, '\t' + bt_line) - print(lineno, '\t' + row[0]) - print(lineno, '\t' + row[2]) - print('-------------------------------------------------------------------------------') - total_valid += 1 - except Exception as e: - print ("Error in line", lineno, ": ", e, file=sys.stderr) - total_errors += 1 - continue - -#} - -print('total:', lineno, file=sys.stderr) -print('valid:', total_valid, '(' + str((total_valid/lineno)*100) + '%)', file=sys.stderr) -print('errors:',total_errors, '(' + str((total_errors/lineno)*100) + '%)', file=sys.stderr) + +def ambiguous(bt): # { + # legislation/legislación/ordenamiento + + ambig = False + for token in bt: # { + tls = token['tls'] + if len(tls) > 1: # { + return True + # } + # } + + return ambig +# } + + +def extract_sentences(phrase_table, biltrans_out): + reading = True + lineno = 0 + total_valid = 0 + total_errors = 0 + + not_ambiguous = [] + + while reading: # { + try: + lineno = lineno + 1 + pt_line = phrase_table.readline().strip() + bt_line = biltrans_out.readline().strip() + + if not bt_line.strip() and not pt_line.strip(): # { + reading = False + break + elif not bt_line.strip() or not pt_line.strip(): # { + continue + + # } + row = pt_line.split('|||') + bt = common.tokenise_biltrans_line(bt_line.strip()) + sl = common.tokenise_tagger_line(row[1].strip()) + tl = common.tokenise_tagger_line(row[0].strip()) + + if not ambiguous(bt): # { + not_ambiguous.append(str(lineno)) + if len(not_ambiguous) >= 10: # { + print("not ambiguous:", ' '.join( + not_ambiguous), file=sys.stderr) + not_ambiguous = [] + # } + continue + # } + if len(sl) < 2 and len(tl) < 2: # { + continue + # } + + # Check that the number of words in the lexical transfer, and in the phrasetable matches up + if len(sl) != len(bt): # { + print("Error in line", lineno, + ": len(sl) != len(bt)", file=sys.stderr) + continue + # } + + # cheking if the alignments are empty + if not row[2].strip(): + print("In line", lineno, ", alignments are empty", file=sys.stderr) + 
continue + + # Resumption of the session + # Resumption/Reanudación of/de the/el session/sesión + # Reanudación de el periodo de sesión + # 0-0 1-1 2-2 5-3 + + print(lineno, '\t' + row[1]) + print(lineno, '\t' + bt_line) + print(lineno, '\t' + row[0]) + print(lineno, '\t' + row[2]) + print( + '-------------------------------------------------------------------------------') + total_valid += 1 + except Exception as e: + print("Error in line", lineno, ": ", e, file=sys.stderr) + total_errors += 1 + continue + + # } + + print('total:', lineno, file=sys.stderr) + print('valid:', total_valid, + '(' + str((total_valid/lineno)*100) + '%)', file=sys.stderr) + print('errors:', total_errors, + '(' + str((total_errors/lineno)*100) + '%)', file=sys.stderr) + + +if __name__ == '__main__': + if len(sys.argv) < 3: # { + print('extact-sentences.py ') + exit(1) + # } + with open(sys.argv[1]) as phrase_table, open(sys.argv[2]) as biltrans_out: + extract_sentences(phrase_table, biltrans_out) diff --git a/scripts/lambdas-to-rules.py b/scripts/lambdas-to-rules.py index ead391d..d02b87f 100644 --- a/scripts/lambdas-to-rules.py +++ b/scripts/lambdas-to-rules.py @@ -1,78 +1,89 @@ -import sys; -import common; - -def wrap (x): - return '^' + x + '$' - -sl_tl_defaults = {}; -sl_tl = {}; - -indexes = {}; -trad_counter = {}; -rindex = {}; - -with open(sys.argv[1]) as d: - for line in d: #{ - if len(line) < 1: #{ - continue; - #} - row = common.tokenise_tagger_line(line); - sl = wrap(row[0].strip()); - tl = wrap(row[1].strip()); - if tl[1] == '*': - tl = tl[:-3] + '$' - - if sl not in sl_tl: #{ - sl_tl[sl] = []; - #} - if sl not in trad_counter: #{ - trad_counter[sl] = 0; - #} - if line.count('@') > 0: #{ - sl_tl_defaults[sl] = tl; - #} - sl_tl[sl].append(tl); - indexes[(sl, tl)] = trad_counter[sl]; - rindex[(sl, trad_counter[sl])] = tl; - trad_counter[sl] = trad_counter[sl] + 1; - - #} - -for pair in rindex: #{ - print(pair[0], pair[1], rindex[pair], file=sys.stderr); -#} - -#ability 0.25652 1 ability to -#ability 1.54548 0 ability to deliver -#ability 1.48162 0 our ability to - -with open(sys.argv[2]) as d: - for line in d: #{ - - row = line.split(' \t '); - slword = row[0].strip(); - l = float(row[1]); - tlid = int(row[2]); - if (slword, tlid) not in rindex: #{ - print ('(', slword, ',', tlid, ') not in index', file=sys.stderr) - continue; - #} - tlword = rindex[(slword, tlid)]; - context = row[3].strip(); - # #+ 0.571428571429 14 8 8 troiñ tourner 8 - #+nature service nature carácter 3 - - - print('+ ' + row[1] + '\t' + slword + '\t' + context + '\t' + tlword + '\t1'); - - # print(' ' % (l)); - # for c in context.split(' '): #{ - # if c.count(slword) == 1: #{ - # print(slword, tlword); - # else: #{ - # print(c); - # #} - # #} - # print(' '); - - #} +import sys +import common + + +def wrap(x): + return '^' + x + '$' + + +def lambdas_to_rules(freq_lexicon, rules): + sl_tl_defaults = {} + sl_tl = {} + + indexes = {} + trad_counter = {} + rindex = {} + + with open(freq_lexicon) as d: + for line in d: # { + if len(line) < 1: # { + continue + # } + row = common.tokenise_tagger_line(line) + sl = wrap(row[0].strip()) + tl = wrap(row[1].strip()) + if tl[1] == '*': + tl = tl[:-3] + '$' + + if sl not in sl_tl: # { + sl_tl[sl] = [] + # } + if sl not in trad_counter: # { + trad_counter[sl] = 0 + # } + if line.count('@') > 0: # { + sl_tl_defaults[sl] = tl + # } + sl_tl[sl].append(tl) + indexes[(sl, tl)] = trad_counter[sl] + rindex[(sl, trad_counter[sl])] = tl + trad_counter[sl] = trad_counter[sl] + 1 + + # } + + for pair in 
rindex: # { + print(pair[0], pair[1], rindex[pair], file=sys.stderr) + # } + + # ability 0.25652 1 ability to + # ability 1.54548 0 ability to deliver + # ability 1.48162 0 our ability to + + with open(rules) as d: + for line in d: # { + + row = line.split(' \t ') + slword = row[0].strip() + l = float(row[1]) + tlid = int(row[2]) + if (slword, tlid) not in rindex: # { + print('(', slword, ',', tlid, ') not in index', file=sys.stderr) + continue + # } + tlword = rindex[(slword, tlid)] + context = row[3].strip() + # #+ 0.571428571429 14 8 8 troiñ tourner 8 + #+nature service nature carácter 3 + + print('+ ' + row[1] + '\t' + slword + + '\t' + context + '\t' + tlword + '\t1') + + # print(' ' % (l)) + # for c in context.split(' '): #{ + # if c.count(slword) == 1: #{ + # print(slword, tlword) + # else: #{ + # print(c) + # #} + # #} + # print(' ') + + # } + + +if __name__ == '__main__': + if len(sys.argv) < 3: # { + print('lambdas-to-rules.py ') + exit(1) + # } + lambdas_to_rules(sys.argv[1], sys.argv[2]) diff --git a/scripts/merge-ngrams-lambdas.py b/scripts/merge-ngrams-lambdas.py index 21264f4..1595d26 100644 --- a/scripts/merge-ngrams-lambdas.py +++ b/scripts/merge-ngrams-lambdas.py @@ -2,43 +2,52 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, random; - -# ngram file -# lambda file -# lexicon file - -ngf = sys.argv[1]; -ldf = sys.argv[2]; - -ngrams = {}; -lambdas = {}; - -for line in open(ngf).readlines(): #{ - #59763 poor in capital - if len(line) < 2: #{ - continue; - #} - row = line.strip().split( '\t' ); - if(len(row) < 2): - row.append(''); - ngid = int(row[0].strip()); - ngrams[ngid] = row[1]; -#} - -with open(ldf) as d: - for line in d: #{ - #59176:0 1.00131 - if line.count('@@') > 0: #{ - continue; - #} - row = line.strip().split('\t'); - - l = float(row[2]); - ngid = int(row[1].split(':')[0]); - ngram = ngrams[ngid]; - - trad = row[1].split(':')[1]; - token = row[0] - print(token, '\t', l, '\t', trad, '\t', ngram); - #} +import sys +import codecs +import random + + +def merge_ngrams_lambdas(ngf, ldf): + # ngram file + # lambda file + # lexicon file + + ngrams = {} + # lambdas = {} + + for line in open(ngf).readlines(): # { + # 59763 poor in capital + if len(line) < 2: # { + continue + # } + row = line.strip().split('\t') + if(len(row) < 2): + row.append('') + ngid = int(row[0].strip()) + ngrams[ngid] = row[1] + # } + + with open(ldf) as d: + for line in d: # { + # 59176:0 1.00131 + if line.count('@@') > 0: # { + continue + # } + row = line.strip().split('\t') + + l = float(row[2]) + ngid = int(row[1].split(':')[0]) + ngram = ngrams[ngid] + + trad = row[1].split(':')[1] + token = row[0] + print(token, '\t', l, '\t', trad, '\t', ngram) + # } + + +if __name__ == '__main__': + if len(sys.argv) < 3: # { + print('merge-ngrams-lambdas.py ') + exit(1) + # } + merge_ngrams_lambdas(sys.argv[1], sys.argv[2]) diff --git a/scripts/ngram-count-patterns-maxent2.py b/scripts/ngram-count-patterns-maxent2.py index 9b77876..463bc3f 100755 --- a/scripts/ngram-count-patterns-maxent2.py +++ b/scripts/ngram-count-patterns-maxent2.py @@ -2,7 +2,9 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy; +import sys +import codecs +import copy import common # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations @@ -14,252 +16,263 @@ import common # ngrams[ngram][tl_word] = freq # 5 Please rise , then , for this minute 's silence . 
-#5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. -#5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . -#5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 -#------------------------------------------------------------------------------- +# 5 Please/Complacer rise/aumento ,/, then/entonces ,/, for/para/durante this/este minute/minuto '/' *s/*s silence/silencio ./. +# 5 Invitar a todo a que prpers poner de pie para guardar uno minuto de silencio . +# 5 0-0 4-2 5-3 8-1 9-5 10-6 12-7 13-8 14-9 15-10 +# ------------------------------------------------------------------------------- THRESHOLD = 0 -if len(sys.argv) not in [3, 4]: #{ - print('count-patterns.py [threshold]') - sys.exit(-1); -#} - -if len(sys.argv) == 4: - THRESHOLD = int(sys.argv[3]) - -MAX_NGRAMS = 3; -cur_line = 0; - -sl_tl_defaults = {}; -sl_tl = {}; -ngrams = {}; - -meevents = {}; # events[slword][counter] = [feat, feat, feat]; -meoutcomes = {}; # meoutcomes[slword][counter] = tlword; -event_counter = 0; - -features = {}; # features[(slword, ['a', 'list'], tlword)] = 3 -feature_counter = 0; - -indexes = {}; -trad_counter = {}; - -def wrap (x): - return '^' + x + '$' - -for line in open(sys.argv[1], 'r').readlines(): #{ - if len(line) < 1: #{ - continue; - #} - w = int(line.split(' ')[0]) - if w < THRESHOLD: - continue; - - row = common.tokenise_tagger_line(line); - sl = wrap(row[0]).lower(); - tl = wrap(row[1].strip()).lower(); - if tl[1] == '*': - tl = tl[:-3] + '$' - - if sl not in sl_tl: #{ - sl_tl[sl] = []; - #} - if sl not in trad_counter: #{ - trad_counter[sl] = 0; - #} - if line.count('@') > 0: #{ - sl_tl_defaults[sl] = tl; - sl_tl[sl].append(tl); - indexes[(sl, tl)] = trad_counter[sl]; - trad_counter[sl] = trad_counter[sl] + 1; - - #} -#} - -cur_sl_row = []; -cur_tl_row = []; -cur_bt_row = []; -cur_al_row = []; - - -for line in open(sys.argv[2], 'r').readlines(): #{ - line = line.strip() - if line[0] == '-': #{ -# print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row); -# print cur_sl_row; -# print cur_bt_row; -# print cur_tl_row; -# print cur_al_row; -# - # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations - # - # sl_tl[sl_word][tl_word] = tl_freq - i = 0; - for slword in cur_sl_row: #{ - if len(cur_bt_row[i]['tls']) > 1: #{ - for al in cur_al_row: #{ - al_sl = int(al.split('-')[1]); - al_tl = int(al.split('-')[0]); - if al_sl != i: #{ - continue; - #} - - tlword = wrap(cur_tl_row[al_tl].lower()); - slword = wrap(slword.lower()); - - if tlword[1] == '*' or slword[1] == '*': - continue; - - if slword not in sl_tl_defaults: #{ -# print >>sys.stderr, 'WARNING: "' + slword + '" not in sl_tl_defaults, skipping'; - continue; - #} - if (slword, tlword) not in indexes: #{ -# print >>sys.stderr, 'WARNING: pair (%s, %s) not found in index' % (slword, tlword); - continue; - #} -# if tlword != sl_tl_defaults[slword]: #{ -# print >>sys.stderr, '+' , slword , sl_tl_defaults[slword] , tlword; -# else: #{ -# print >>sys.stderr, '-' , slword , sl_tl_defaults[slword] , tlword; -# #} -# print >>sys.stderr, cur_sl_row; - for j in range(1, MAX_NGRAMS): #{ -# print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+1] -# print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i:i+j+1] -# print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+j+1] - - - pregram = ' '.join(map(wrap, cur_sl_row[i-j:i+1])); - postgram = ' '.join(map(wrap, cur_sl_row[i:i+j+1])); - roundgram = ' 
'.join(map(wrap, cur_sl_row[i-j:i+j+1])); - - if slword not in ngrams: #{ - ngrams[slword] = {}; - #} - if pregram not in ngrams[slword]: #{ - ngrams[slword][pregram] = {}; - #} - if postgram not in ngrams[slword]: #{ - ngrams[slword][postgram] = {}; - #} - if roundgram not in ngrams[slword]: #{ - ngrams[slword][roundgram] = {}; - #} - if tlword not in ngrams[slword][pregram]: #{ - ngrams[slword][pregram][tlword] = 0; - #} - if tlword not in ngrams[slword][postgram]: #{ - ngrams[slword][postgram][tlword] = 0; - #} - if tlword not in ngrams[slword][roundgram]: #{ - ngrams[slword][roundgram][tlword] = 0; - #} - - ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1; - ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1; - ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1; - #} - #print ',' , len(ngrams[slword]); - if slword not in meevents: #{ - meevents[slword] = {}; - #} - if slword not in meoutcomes: #{ - meoutcomes[slword] = {}; - #} - if event_counter not in meevents: #{ - meevents[slword][event_counter] = []; - #} - if event_counter not in meoutcomes[slword]: #{ - meoutcomes[slword][event_counter] = ''; - #} - for ni in ngrams[slword]: #{ - if ni not in features: #{ - feature_counter = feature_counter + 1; - features[ni] = feature_counter; - #} - meevents[slword][event_counter].append(features[ni]); - #meevents[slword][event_counter].append(feat); - meoutcomes[slword][event_counter] = tlword; - - #} - del ngrams; - ngrams = {}; - if len(sl_tl[slword]) < 2: #{ - continue; - #} - for event in meevents[slword]: #{ - outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # '; - for j in range(0, len(sl_tl[slword])): #{ - for feature in meevents[slword][event]: #{ - outline = outline + str(feature) + ':' + str(j) + ' '; - #} - outline = outline + ' # ' - #} - print(slword , '\t', len(sl_tl[slword]),'\t', outline); - #} - del meevents; - del meoutcomes; - meevents = {}; - meoutcomes = {}; - -# for f in features: #{ -# print >>sys.stderr, features[f] , f; -# #} - - #} - -# for j in range(0, MAX_NGRAMS): #{ -# print cur_sl_row[i-j:i+1]; -# print cur_sl_row[i:i+j]; -# #} - #print ngrams[slword]; - #} - i = i + 1; - - #} - - cur_line = 0; - event_counter = event_counter + 1; - #print line; - continue; - #} - - line = line.split('\t')[1]; - line = line.strip() - - if cur_line == 0: #{ - cur_sl_row = common.tokenise_tagger_line(line); - elif cur_line == 1: #{ - cur_bt_row = common.tokenise_biltrans_line(line); - elif cur_line == 2: #{ - cur_tl_row = common.tokenise_tagger_line(line); - elif cur_line == 3: #{ - cur_al_row = line.split(' '); - #} - - cur_line = cur_line + 1; -#} - -for feature in features: #{ - print(features[feature] , '\t' , feature, file=sys.stderr); -#} - -sys.exit(-1); - -for slword in meevents: #{ - if len(sl_tl[slword]) < 2: #{ - continue; - #} - for event in meevents[slword]: #{ - outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # '; - for j in range(0, len(sl_tl[slword])): #{ - for feature in meevents[slword][event]: #{ - outline = outline + str(feature) + ':' + str(j) + ' '; - #} - outline = outline + ' # ' - #} - print(slword , '\t', len(sl_tl[slword]),'\t', outline); - #} -#} + + +def wrap(x): + return '^' + x + '$' + + +def ngram_count_patterns(freq_lexicon, candidates): + + MAX_NGRAMS = 3 + cur_line = 0 + + sl_tl_defaults = {} + sl_tl = {} + ngrams = {} + + meevents = {} # events[slword][counter] = [feat, feat, feat] + meoutcomes = {} # meoutcomes[slword][counter] = tlword + 
event_counter = 0 + + features = {} # features[(slword, ['a', 'list'], tlword)] = 3 + feature_counter = 0 + + indexes = {} + trad_counter = {} + for line in open(freq_lexicon, 'r').readlines(): # { + if len(line) < 1: # { + continue + # } + w = int(line.split(' ')[0]) + if w < THRESHOLD: + continue + + row = common.tokenise_tagger_line(line) + sl = wrap(row[0]).lower() + tl = wrap(row[1].strip()).lower() + if tl[1] == '*': + tl = tl[:-3] + '$' + + if sl not in sl_tl: # { + sl_tl[sl] = [] + # } + if sl not in trad_counter: # { + trad_counter[sl] = 0 + # } + if line.count('@') > 0: # { + sl_tl_defaults[sl] = tl + sl_tl[sl].append(tl) + indexes[(sl, tl)] = trad_counter[sl] + trad_counter[sl] = trad_counter[sl] + 1 + + # } + # } + + cur_sl_row = [] + cur_tl_row = [] + cur_bt_row = [] + cur_al_row = [] + + for line in open(candidates, 'r').readlines(): # { + line = line.strip() + if line[0] == '-': # { + # print len(cur_sl_row), len(cur_tl_row), len(cur_bt_row), len(cur_al_row); + # print cur_sl_row; + # print cur_bt_row; + # print cur_tl_row; + # print cur_al_row; + # + # Read the corpus, make a note of all ambiguous words, their frequency and their possible translations + # + # sl_tl[sl_word][tl_word] = tl_freq + i = 0 + for slword in cur_sl_row: # { + if len(cur_bt_row[i]['tls']) > 1: # { + for al in cur_al_row: # { + al_sl = int(al.split('-')[1]) + al_tl = int(al.split('-')[0]) + if al_sl != i: # { + continue + # } + + tlword = wrap(cur_tl_row[al_tl].lower()) + slword = wrap(slword.lower()) + + if tlword[1] == '*' or slword[1] == '*': + continue + + if slword not in sl_tl_defaults: # { + # print >>sys.stderr, 'WARNING: "' + slword + '" not in sl_tl_defaults, skipping'; + continue + # } + if (slword, tlword) not in indexes: # { + # print >>sys.stderr, 'WARNING: pair (%s, %s) not found in index' % (slword, tlword); + continue + # } + # if tlword != sl_tl_defaults[slword]: #{ + # print >>sys.stderr, '+' , slword , sl_tl_defaults[slword] , tlword; + # else: #{ + # print >>sys.stderr, '-' , slword , sl_tl_defaults[slword] , tlword; + # #} + # print >>sys.stderr, cur_sl_row; + for j in range(1, MAX_NGRAMS): # { + # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+1] + # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i:i+j+1] + # print >>sys.stderr, cur_sl_row[i] , cur_sl_row[i-j:i+j+1] + + pregram = ' '.join(map(wrap, cur_sl_row[i-j:i+1])) + postgram = ' '.join(map(wrap, cur_sl_row[i:i+j+1])) + roundgram = ' '.join( + map(wrap, cur_sl_row[i-j:i+j+1])) + + if slword not in ngrams: # { + ngrams[slword] = {} + # } + if pregram not in ngrams[slword]: # { + ngrams[slword][pregram] = {} + # } + if postgram not in ngrams[slword]: # { + ngrams[slword][postgram] = {} + # } + if roundgram not in ngrams[slword]: # { + ngrams[slword][roundgram] = {} + # } + if tlword not in ngrams[slword][pregram]: # { + ngrams[slword][pregram][tlword] = 0 + # } + if tlword not in ngrams[slword][postgram]: # { + ngrams[slword][postgram][tlword] = 0 + # } + if tlword not in ngrams[slword][roundgram]: # { + ngrams[slword][roundgram][tlword] = 0 + # } + + ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1 + ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1 + ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1 + # } + # print ',' , len(ngrams[slword]); + if slword not in meevents: # { + meevents[slword] = {} + # } + if slword not in meoutcomes: # { + meoutcomes[slword] = {} + # } + if event_counter not in meevents: # { + meevents[slword][event_counter] = [] + # } 
+ if event_counter not in meoutcomes[slword]: # { + meoutcomes[slword][event_counter] = '' + # } + for ni in ngrams[slword]: # { + if ni not in features: # { + feature_counter = feature_counter + 1 + features[ni] = feature_counter + # } + meevents[slword][event_counter].append( + features[ni]) + # meevents[slword][event_counter].append(feat); + meoutcomes[slword][event_counter] = tlword + + # } + del ngrams + ngrams = {} + if len(sl_tl[slword]) < 2: # { + continue + # } + for event in meevents[slword]: # { + outline = str( + indexes[(slword, meoutcomes[slword][event])]) + ' # ' + for j in range(0, len(sl_tl[slword])): # { + for feature in meevents[slword][event]: # { + outline = outline + \ + str(feature) + ':' + str(j) + ' ' + # } + outline = outline + ' # ' + # } + print(slword, '\t', len( + sl_tl[slword]), '\t', outline) + # } + del meevents + del meoutcomes + meevents = {} + meoutcomes = {} + + # for f in features: #{ + # print >>sys.stderr, features[f] , f; + # #} + + # } + + # for j in range(0, MAX_NGRAMS): #{ + # print cur_sl_row[i-j:i+1]; + # print cur_sl_row[i:i+j]; + # #} + # print ngrams[slword]; + # } + i = i + 1 + + # } + + cur_line = 0 + event_counter = event_counter + 1 + # print line; + continue + # } + + line = line.split('\t')[1] + line = line.strip() + + if cur_line == 0: # { + cur_sl_row = common.tokenise_tagger_line(line) + elif cur_line == 1: # { + cur_bt_row = common.tokenise_biltrans_line(line) + elif cur_line == 2: # { + cur_tl_row = common.tokenise_tagger_line(line) + elif cur_line == 3: # { + cur_al_row = line.split(' ') + # } + + cur_line = cur_line + 1 + # } + + for feature in features: # { + print(features[feature], '\t', feature, file=sys.stderr) + # } + + exit(1) + + for slword in meevents: # { + if len(sl_tl[slword]) < 2: # { + continue + # } + for event in meevents[slword]: # { + outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # ' + for j in range(0, len(sl_tl[slword])): # { + for feature in meevents[slword][event]: # { + outline = outline + str(feature) + ':' + str(j) + ' ' + # } + outline = outline + ' # ' + # } + print(slword, '\t', len(sl_tl[slword]), '\t', outline) + # } + # } + + +if __name__ == '__main__': + if len(sys.argv) not in [3, 4]: # { + print('count-patterns.py [threshold]') + exit(1) + # } + + if len(sys.argv) == 4: + THRESHOLD = int(sys.argv[3]) + + ngram_count_patterns(sys.argv[1], sys.argv[2]) diff --git a/scripts/ngrams-to-rules-me.py b/scripts/ngrams-to-rules-me.py index f22737a..87d0fad 100755 --- a/scripts/ngrams-to-rules-me.py +++ b/scripts/ngrams-to-rules-me.py @@ -2,172 +2,182 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys; -import common; - -#+nature service nature carácter 3 -#+nature The imperialist nature carácter 1 -#+nature the secular nature of State carácter 1 -#+nature its nature prevent carácter 1 -#+nature nature be in carácter 1 +import sys +import common + +# +nature service nature carácter 3 +# +nature The imperialist nature carácter 1 +# +nature the secular nature of State carácter 1 +# +nature its nature prevent carácter 1 +# +nature nature be in carácter 1 # -#FREQMIN = 8.0; - -MINMATCH = 2; - -infile = ''; - -if len(sys.argv) < 2: #{ - print('ngrams-to-rules.py '); - sys.exit(-1); -#} - -infile = open(sys.argv[1]); - - -permitted_tags = ['n', 'vblex', 'adj']; - -print(''); -lineno = 1; -ruleno = 0; -for line in infile: #{ -# print '\n'; -# print line - if len(line) < 2: #{ - continue; - #} - line = line.strip(); - #line = line.decode('utf-8').strip(); - print(line, file=sys.stderr) - #+ 
0.571428571429 14 8 8 troiñ tourner 8 - row = line.split('\t'); - - tipus = row[0].split(' ')[0]; - weight = row[0].replace(' ', ' ').split(' ')[1]; - sl = row[1].strip()[1:-1]; - tl = row[3][1:-1]; - tl_lema = tl.split('<')[0].lower(); - tl_tags = ''.join(tl.split('<')[1:]).replace('>', '.').rstrip('.') - freq = 1 -# freq = float(row[4]); - - pattern = common.tokenize_tagger_line(row[2]); - - if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: #{ - print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr); - continue; - #} - - if tipus == '-' or tipus == '~': #{ - print('DEFAULT_READING', line, file=sys.stderr); - continue; - #} - - # Hacks -# if len(pattern) == 0: #{ -# print('ZERO_PATTERN' , line, file=sys.stderr); -# continue; - #} - - - if len(pattern) < MINMATCH and len(pattern) > 0: #{ - print('BELOW_MINMATCH', line, file=sys.stderr); - continue; - #} - - - - inpattern = False; - for w in pattern: #{ - if w.lower().count(sl) > 0: #{ - inpattern = True; - #} - #} - if len(pattern) > 0 and not inpattern: #{ - print('SL_NOT_IN_PATTERN' , line, file=sys.stderr); - continue; - #} - - if tl_tags.count('adj') > 0 and sl.count('adj') < 1: #{ - print("TAG_MISMATCH" , line, file=sys.stderr); - continue; - #} - if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1: #{ - print("TAG_MISMATCH" , line, file=sys.stderr); - continue; - #} - - if tl_tags.split('.')[0] not in permitted_tags: #{ - print("TAG_NOT_PERMITTED" , tl_tags , '||' , line, file=sys.stderr); - continue; - #} - - sel = False; - ruleno = ruleno + 1; - lineno = lineno + 1; - - commentb = ''; - commente = ''; -# if freq < FREQMIN: #{ -# commentb = ''; -# #} - - print(commentb + ' '); - for word in pattern: #{ - sl_lema = word.split('<')[0].lower(); - if word.count('><') > 0: #{ - sl_tags = '<'.join(word.split('<')[1:]).replace('><', '.').replace('>', ''); - else: #{ - sl_tags = '<'.join(word.split('<')[1:]).strip('<>'); - #} - - # ======================================================================= # - - sl_lema = sl_lema.replace('~', ' '); - tl_lema = tl_lema.replace('~', ' '); -# sl_lema = sl_lema.replace('-', '\-'); -# tl_lema = tl_lema.replace('-', '\-'); -# sl_lema = sl_lema.replace('(', '\('); -# tl_lema = tl_lema.replace('(', '\('); -# sl_lema = sl_lema.replace(')', '\)'); -# tl_lema = tl_lema.replace(')', '\)'); -# - if word.lower().count(sl) > 0: #{ - lineno = lineno + 1; - if sl_lema == '': #{ - print(' '); - #} - sel = True; - else: #{ - lineno = lineno + 1; - if sl_lema == '': #{ - print(' '); - else: #{ - print(' '); - #} - #} - #} - if sel == False and len(pattern) == 0: #{ - sl_lema = sl.split('<')[0]; - if sl.count('><') > 0: #{ - sl_tags = '<'.join(sl.split('<')[1:]).replace('><', '.').replace('>', ''); - else: #{ - sl_tags = '<'.join(sl.split('<')[1:]).strip('<>'); - #} - if sl_lema == '': #{ - print(' '); - print(' ' + commente); - elif sel == False: - print(' '+commente+ ''); - else: #{ - print(' ' + commente); - #} - lineno = lineno + 1; -#} -print(''); + +def ngrams_to_rules(ngrams): + # FREQMIN = 8.0 + + MINMATCH = 2 + + permitted_tags = ['n', 'vblex', 'adj'] + + print('') + lineno = 1 + ruleno = 0 + + with open(ngrams) as infile: + for line in infile: # { + # print '\n' + # print line + if len(line) < 2: # { + continue + # } + line = line.strip() + # line = line.decode('utf-8').strip() + print(line, file=sys.stderr) + # + 0.571428571429 14 8 8 troiñ tourner 8 + row = line.split('\t') + + tipus = row[0].split(' ')[0] + weight = row[0].replace(' ', ' ').split(' ')[1] + sl = 
row[1].strip()[1:-1] + tl = row[3][1:-1] + tl_lema = tl.split('<')[0].lower() + tl_tags = ''.join(tl.split('<')[1:]).replace( + '>', '.').rstrip('.') + freq = 1 + # freq = float(row[4]) + + pattern = common.tokenize_tagger_line(row[2]) + + if row[2].count('') > 0 or row[2].count('') > 0 or row[2].count('') > 0: # { + print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr) + continue + # } + + if tipus == '-' or tipus == '~': # { + print('DEFAULT_READING', line, file=sys.stderr) + continue + # } + + # Hacks + # if len(pattern) == 0: #{ + # print('ZERO_PATTERN' , line, file=sys.stderr); + # continue + # } + + if len(pattern) < MINMATCH and len(pattern) > 0: # { + print('BELOW_MINMATCH', line, file=sys.stderr) + continue + # } + + inpattern = False + for w in pattern: # { + if w.lower().count(sl) > 0: # { + inpattern = True + # } + # } + if len(pattern) > 0 and not inpattern: # { + print('SL_NOT_IN_PATTERN', line, file=sys.stderr) + continue + # } + + if tl_tags.count('adj') > 0 and sl.count('adj') < 1: # { + print("TAG_MISMATCH", line, file=sys.stderr) + continue + # } + if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1: # { + print("TAG_MISMATCH", line, file=sys.stderr) + continue + # } + + if tl_tags.split('.')[0] not in permitted_tags: # { + print("TAG_NOT_PERMITTED", tl_tags, + '||', line, file=sys.stderr) + continue + # } + + sel = False + ruleno = ruleno + 1 + lineno = lineno + 1 + + commentb = '' + commente = '' + # if freq < FREQMIN: #{ + # commentb = '' + # #} + + print(commentb + ' ') + for word in pattern: # { + sl_lema = word.split('<')[0].lower() + if word.count('><') > 0: # { + sl_tags = '<'.join(word.split('<')[1:]).replace( + '><', '.').replace('>', '') + else: # { + sl_tags = '<'.join(word.split('<')[1:]).strip('<>') + # } + + # ======================================================================= # + + sl_lema = sl_lema.replace('~', ' ') + tl_lema = tl_lema.replace('~', ' ') + # sl_lema = sl_lema.replace('-', '\-') + # tl_lema = tl_lema.replace('-', '\-') + # sl_lema = sl_lema.replace('(', '\(') + # tl_lema = tl_lema.replace('(', '\(') + # sl_lema = sl_lema.replace(')', '\)') + # tl_lema = tl_lema.replace(')', '\)') + # + if word.lower().count(sl) > 0: # { + lineno = lineno + 1 + if sl_lema == '': # { + print(' ') + # } + sel = True + else: # { + lineno = lineno + 1 + if sl_lema == '': # { + print(' ') + else: # { + print(' ') + # } + # } + # } + if sel == False and len(pattern) == 0: # { + sl_lema = sl.split('<')[0] + if sl.count('><') > 0: # { + sl_tags = '<'.join(sl.split('<')[1:]).replace( + '><', '.').replace('>', '') + else: # { + sl_tags = '<'.join(sl.split('<')[1:]).strip('<>') + # } + if sl_lema == '': # { + print(' ') + print(' ' + commente) + elif sel == False: + print(' '+commente + + '') + else: # { + print(' ' + commente) + # } + lineno = lineno + 1 + # } + print('') + + +if __name__ == '__main__': + if len(sys.argv) < 2: # { + print('ngrams-to-rules.py ') + exit(1) + # } + ngrams_to_rules(sys.argv[1])
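
A minimal sketch of what this refactor enables: since each script now exposes its logic as a function (extract_freq_lexicon, extract_sentences, lambdas_to_rules, merge_ngrams_lambdas, ngram_count_patterns, ngrams_to_rules) behind an `if __name__ == '__main__'` guard, the scripts can be loaded and driven from other Python code instead of only via the command line. This is not part of the commit itself; the `scripts/` path and the `candidates.txt` file name below are illustrative assumptions, and importlib is used only because the hyphenated file names rule out a plain `import` (common.py must still be importable, e.g. from the same directory on sys.path).

    import importlib.util

    def load_script(path, module_name):
        # Load one of the refactored scripts from disk as a module object.
        spec = importlib.util.spec_from_file_location(module_name, path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module

    # Hypothetical paths and file names; adjust to the actual checkout and corpus.
    freq_lex = load_script('scripts/extract-freq-lexicon.py', 'extract_freq_lexicon')

    # The wrapped entry point takes the candidates file, mirroring the old sys.argv[1];
    # because the module is not named '__main__', the CLI argument check does not run.
    freq_lex.extract_freq_lexicon('candidates.txt')
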