commit 587e4f0a51c495873e89f866ba797d753e030293 Author: Daniel Swanson Date: Mon Jul 12 12:54:11 2021 -0500 reduce script code duplication diff --git a/scripts/apertium-lex-evaluate.py b/scripts/apertium-lex-evaluate.py index e6bba14..05e59a0 100644 --- a/scripts/apertium-lex-evaluate.py +++ b/scripts/apertium-lex-evaluate.py @@ -2,160 +2,122 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys; - -# src = output of translator up to lt-proc -b -# ref = reference corpus -# tst = output of lexical selection module - -if len(sys.argv) < 4: #{ - print('apertium-lex-evaluate [-d] [-l] '); - sys.exit(-1); -#} - -debug = False; -perLine = False; -quiet = False; - -if len(sys.argv) == 6: #{ - n_tst = sys.argv[5]; - n_ref = sys.argv[4]; - n_src = sys.argv[3]; - - debug = True; - perLine = True; - -elif len(sys.argv) == 5: #{ - n_tst = sys.argv[4]; - n_ref = sys.argv[3]; - n_src = sys.argv[2]; - if sys.argv[1] == '-d': #{ - debug = True; - elif sys.argv[1] == '-q': #{ - quiet = True; - elif sys.argv[1] == '-l': #{ - perLine = True; - else: #{ - printi('`' + sys.argv[1] + "' is not a valid option."); - sys.exit(-1); - #} -elif len(sys.argv) == 4: #{ - - n_tst = sys.argv[3]; - n_ref = sys.argv[2]; - n_src = sys.argv[1]; - -#} - -f_src = open(n_src); -f_ref = open(n_ref); -f_tst = open(n_tst); - -def lineToArray(line): #{ - current_word_sl = ''; - current_word_tl = ''; - current_words_tl = []; - firstWord = False; - inWord = False; - lus = []; - - for c in line: #{ - if c == '^': #{ - inWord = True; - firstWord = True; - continue; - elif c == '$': #{ - current_words_tl.append(current_word_tl); - current_word = (current_word_sl, current_words_tl); - lus.append(current_word); - #print current_word; - current_word_sl = ''; - current_word_tl = ''; - current_words_tl = []; - i = 0; - inWord = False; - continue; - elif c == '/': #{ - if not firstWord: #{ - current_words_tl.append(current_word_tl); - current_word_tl = ''; - elif firstWord: #{ - firstWord = False; - #} - continue; - 
#} - - if inWord and firstWord: #{ - current_word_sl = current_word_sl + c; +import argparse +import sys + +parser = argparse.ArgumentParser() +parser.add_argument('src', help='output of translator up to lt-proc -b') +parser.add_argument('ref', help='reference corpus') +parser.add_argument('tst', help='output of lexical selection module') +parser.add_argument('-d', '--debug', action='store_true') +parser.add_argument('-q', '--quiet', action='store_true') +parser.add_argument('-l', '--line', action='store_true') +args = parser.parse_args() + +def debug(msg): + global args + if args.debug: + print(msg, file=sys.stderr) + +f_src = open(args.src) +f_ref = open(args.ref) +f_tst = open(args.tst) + +def lineToArray(line): + current_word_sl = '' + current_word_tl = '' + current_words_tl = [] + firstWord = False + inWord = False + lus = [] + + for c in line: + if c == '^': + inWord = True + firstWord = True + continue + elif c == '$': + current_words_tl.append(current_word_tl) + current_word = (current_word_sl, current_words_tl) + lus.append(current_word) + current_word_sl = '' + current_word_tl = '' + current_words_tl = [] + i = 0 + inWord = False + continue + elif c == '/': + if firstWord: + firstWord = False + else: + current_words_tl.append(current_word_tl) + current_word_tl = '' + continue + + if inWord and firstWord: + current_word_sl = current_word_sl + c elif inWord and not firstWord: - current_word_tl = current_word_tl + c; - #} - #} - return lus; -#} - -def sanityChecks(l_src, l_ref, l_tst): #{ - if debug: - print('---', file=sys.stderr); - src_lu = []; - ref_lu = []; - tst_lu = []; - - src_lu = lineToArray(l_src); - ref_lu = lineToArray(l_ref); - tst_lu = lineToArray(l_tst); - - if debug: - print('src:' , src_lu, file=sys.stderr); - print('tst:' , tst_lu, file=sys.stderr); - print('ref:' , ref_lu, file=sys.stderr); - - if len(src_lu) != len(ref_lu): #{ - print('WARNING: Source and reference sentence have different number of lexical units.', file=sys.stderr); - 
print('SRC: ' , len(src_lu) , ": " + l_src, file=sys.stderr); - print('REF: ' , len(ref_lu) , ": " + l_ref, file=sys.stderr); - #} - - if len(src_lu) != len(tst_lu): #{ - print('WARNING: Source and test sentence have different number of lexical units.', file=sys.stderr); - print(len(src_lu) , ": " + l_src, file=sys.stderr); - print(len(tst_lu) , ": " + l_tst, file=sys.stderr); - #} + current_word_tl = current_word_tl + c + + return lus + +def sanityChecks(l_src, l_ref, l_tst): + debug('---') + src_lu = [] + ref_lu = [] + tst_lu = [] + + src_lu = lineToArray(l_src) + ref_lu = lineToArray(l_ref) + tst_lu = lineToArray(l_tst) + + debug('src: %s' % src_lu) + debug('tst: %s' % tst_lu) + debug('ref: %s' % ref_lu) + + if len(src_lu) != len(ref_lu): + print('WARNING: Source and reference sentence have different number of lexical units.', file=sys.stderr) + print('SRC: ' , len(src_lu) , ": " + l_src, file=sys.stderr) + print('REF: ' , len(ref_lu) , ": " + l_ref, file=sys.stderr) + + + if len(src_lu) != len(tst_lu): + print('WARNING: Source and test sentence have different number of lexical units.', file=sys.stderr) + print(len(src_lu) , ": " + l_src, file=sys.stderr) + print(len(tst_lu) , ": " + l_tst, file=sys.stderr) + # i) do a sanity check, look for outN in tst that aren't in src: LEX module is outputting strange stuff - for i in range(0, len(tst_lu)): #{ - if len(tst_lu[i][1]) > 1: #{ - print('WARNING: Test sentence has a translation with more than one option.', file=sys.stderr); - print(' ',src_lu[i], file=sys.stderr); - print(' ',ref_lu[i], file=sys.stderr); - print(' ',tst_lu[i][1], file=sys.stderr); - #} - for lu in tst_lu[i][1]: #{ - if lu not in src_lu[i][1]: #{ - print('WARNING: Test sentence has a translation option that can never ', file=sys.stderr); - print(' be generated by the MT system.', file=sys.stderr); - print(' TST: ', tst_lu[i], file=sys.stderr); - print(' SRC: ', src_lu[i], file=sys.stderr); - #} - #} - #} + for i in range(0, len(tst_lu)): + if 
len(tst_lu[i][1]) > 1: + print('WARNING: Test sentence has a translation with more than one option.', file=sys.stderr) + print(' ',src_lu[i], file=sys.stderr) + print(' ',ref_lu[i], file=sys.stderr) + print(' ',tst_lu[i][1], file=sys.stderr) + + for lu in tst_lu[i][1]: + if lu not in src_lu[i][1]: + print('WARNING: Test sentence has a translation option that can never ', file=sys.stderr) + print(' be generated by the MT system.', file=sys.stderr) + print(' TST: ', tst_lu[i], file=sys.stderr) + print(' SRC: ', src_lu[i], file=sys.stderr) + + + # ii) look for outN in ref that aren't in src: MT system has changed - for i in range(0, len(ref_lu)): #{ - for lu in ref_lu[i][1]: #{ - if lu not in src_lu[i][1]: #{ - print('WARNING: Reference sentence has a translation option that can never ', file=sys.stderr); - print(' be generated by the MT system.', file=sys.stderr); - print('REF: ', ref_lu[i], file=sys.stderr); - print('SRC: ', src_lu[i], file=sys.stderr); - #} - #} - #} + for i in range(0, len(ref_lu)): + for lu in ref_lu[i][1]: + if lu not in src_lu[i][1]: + print('WARNING: Reference sentence has a translation option that can never ', file=sys.stderr) + print(' be generated by the MT system.', file=sys.stderr) + print('REF: ', ref_lu[i], file=sys.stderr) + print('SRC: ', src_lu[i], file=sys.stderr) + + return (src_lu, ref_lu, tst_lu) - return (src_lu, ref_lu, tst_lu); -#} # Process: # Read linestep, for each line in the three files: @@ -171,79 +133,78 @@ def sanityChecks(l_src, l_ref, l_tst): #{ # iv) if it is in the ref, increase score for that LU by 1. 
# v) final score is number of good TL translations / total number of TL translations -lines = True; +lines = True + +lineno = 0 -lineno = 0; +total_ambig_lus = 0 +total_fallos = 0 -total_ambig_lus = 0; -total_fallos = 0; +while lines: -while lines: #{ + l_src = f_src.readline() + l_ref = f_ref.readline() + l_tst = f_tst.readline() - l_src = f_src.readline(); - l_ref = f_ref.readline(); - l_tst = f_tst.readline(); + if l_src.strip('[]') == '' and l_ref.strip('[]') == '' and l_tst.strip('[]') == '': + lines = False + continue - if l_src.strip('[]') == '' and l_ref.strip('[]') == '' and l_tst.strip('[]') == '': #{ - lines = False; - continue; - #} - lineno = lineno + 1; + lineno = lineno + 1 - (lu_src, lu_ref, lu_tst) = sanityChecks(l_src, l_ref, l_tst); + (lu_src, lu_ref, lu_tst) = sanityChecks(l_src, l_ref, l_tst) - num_ambig_lus = 0; - num_fallos = 0; + num_ambig_lus = 0 + num_fallos = 0 - for i in range(0, len(lu_tst)): #{ + for i in range(0, len(lu_tst)): # We are only interested in counting a mismatch as an error if the # source LU has more than one possible translation, and # the number of translations is lower in the reference. This means # that if we have two possible translations in both the source and # the reference, it should not be considered ambiguous as both are # valid. 
- if len(lu_src[i][1]) > 1 and len(lu_ref[i][1]) != len(lu_src[i][1]) and lu_ref[i][1] != lu_src[i][1]: #{ + if len(lu_src[i][1]) > 1 and len(lu_ref[i][1]) != len(lu_src[i][1]) and lu_ref[i][1] != lu_src[i][1]: # >> 2 3 station [u'station'] +++ [u'station', u'season', u'ski resort'] # XX station XX [u'station'] - num_ambig_lus = num_ambig_lus + 1; - if debug: - print('>>' , len(lu_tst[i][1]) , len(lu_src[i][1]) , lu_tst[i][1][0] , lu_ref[i][1] , '+++' , lu_src[i][1]); - print('XX', lu_tst[i][1][0] , 'XX ' , lu_ref[i][1]); - if lu_tst[i][1][0] not in lu_ref[i][1]: #{ - num_fallos = num_fallos + 1; - if debug: - print('MISMATCH: ' , lu_tst[i][1][0] , 'not in' , lu_ref[i][1]); - #} - #} - #} - - if num_fallos == 0 and num_ambig_lus == 0: #{ -# print 'WEIRD: ' , l_src ; -# print ' : ' , l_ref ; -# print ' : ' , l_tst ; - continue; - #} - err = float(num_fallos)/float(num_ambig_lus)*100; - errh = str(err).split('.')[0]; - errt = ''.join(str(err).split('.')[1][0:1]); - if perLine: - print(n_tst + ':' + str(lineno) + ' ' + str(num_fallos) + '/' + str(num_ambig_lus) + ' ' + errh + '.' + errt + '%'); - - total_ambig_lus = total_ambig_lus + num_ambig_lus; - total_fallos = total_fallos + num_fallos; -#} - -if total_fallos == 0 or total_ambig_lus == 0: #{ - print('what: ' , total_fallos ,total_ambig_lus); - print("Check you haven't tried to use the source as a reference"); -#} -err = float(total_fallos)/float(total_ambig_lus)*100; -errh = str(err).split('.')[0]; -errt = ''.join(str(err).split('.')[1][0:1]); -#print n_tst + ' ' + str(total_fallos) + '/' + str(total_ambig_lus) + ' ' + errh + '.' + errt + '%'; -if quiet: #{ - print(errh + '.' + errt); -else: #{ - print(str(total_fallos) + '/' + str(total_ambig_lus) + '\t' + errh + '.' 
+ errt + '%'); -#} + num_ambig_lus = num_ambig_lus + 1 + debug('>> %s %s %s +++' % (len(lu_tst[i][1]), len(lu_src[i][1]), + lu_tst[i][1][0], lu_ref[i][1], + lu_src[i][1])) + debug('XX %s XX %s' % (lu_tst[i][1][0], lu_ref[i][1])) + if lu_tst[i][1][0] not in lu_ref[i][1]: + num_fallos = num_fallos + 1 + debug('MISMATCH: %s not in %s' % (lu_tst[i][1][0], lu_ref[i][1])) + + + + + if num_fallos == 0 and num_ambig_lus == 0: +# print('WEIRD: ' , l_src) +# print(' : ' , l_ref) +# print(' : ' , l_tst) + continue + + err = float(num_fallos)/float(num_ambig_lus)*100 + errh = str(err).split('.')[0] + errt = ''.join(str(err).split('.')[1][0:1]) + if args.line: + print(n_tst + ':' + str(lineno) + ' ' + str(num_fallos) + '/' + str(num_ambig_lus) + ' ' + errh + '.' + errt + '%') + + total_ambig_lus = total_ambig_lus + num_ambig_lus + total_fallos = total_fallos + num_fallos + + +if total_fallos == 0 or total_ambig_lus == 0: + print('what: ' , total_fallos ,total_ambig_lus) + print("Check you haven't tried to use the source as a reference") + +err = float(total_fallos)/float(total_ambig_lus)*100 +errh = str(err).split('.')[0] +errt = ''.join(str(err).split('.')[1][0:1]) +#print(n_tst + ' ' + str(total_fallos) + '/' + str(total_ambig_lus) + ' ' + errh + '.' + errt + '%') +if args.quiet: + print(errh + '.' + errt) +else: + print(str(total_fallos) + '/' + str(total_ambig_lus) + '\t' + errh + '.' 
+ errt + '%') diff --git a/scripts/biltrans-count-patterns-frac-maxent.py b/scripts/biltrans-count-patterns-frac-maxent.py index 13c1931..c0962b9 100755 --- a/scripts/biltrans-count-patterns-frac-maxent.py +++ b/scripts/biltrans-count-patterns-frac-maxent.py @@ -2,7 +2,9 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy, math, re, common; +import sys, math, re, common +from collections import defaultdict +import biltrans_count_common as BCC # Input: # a) Frequency lexicon @@ -25,231 +27,50 @@ import sys, codecs, copy, math, re, common; #.[][56011 0].[] ^un/un$ ^digarez/excuse$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.9917274061 |@| #.[][56011 1].[] ^un/un$ ^digarez/occasion$ ^da$ ^distreiñ/revenir$ ^war/sur$ ^e/son$ ^doare/manière$ ^ober/faire$ ^./.$^./.$ 0.0082725939 || -MAX_NGRAMS = 3; # Max = 5-grams -cur_line = 0; +sl_tl_defaults = {} +sl_tl = defaultdict(list) -re_sep = re.compile('\$[^\^]*\^'); +features = {} # features[(slword, ['a', 'list'], tlword)] = 3 -def split_line(line): - line = re_clean_start.sub('', line.split('\t')[1]); - line = re_clean_end.sub('$', line); - line = line[1:-1]; +indexes = {} +trad_counter = defaultdict(lambda: 0) - row = re_sep.split(line); - return row - - - - # am_row = re_sep.sub('$ ^', am_line.split('\t')[1])[1:-1].split('$ ^'); - - -sl_tl_defaults = {}; -sl_tl = {}; -ngrams = {}; +# First read in the frequency defaults -meevents = {}; # events[slword][counter] = [feat, feat, feat]; -meoutcomes = {}; # meoutcomes[slword][counter] = tlword; -event_counter = 0; +for line in open(sys.argv[1]): + line = line.strip() + if len(line) < 1: + continue -features = {}; # features[(slword, ['a', 'list'], tlword)] = 3 -feature_counter = 0; + row = common.tokenize_tagger_line(line) + sl = common.wrap(row[0]) + tl = common.wrap(row[1]) + if tl[1] == '*': + tl = tl[:-3] + '$' -indexes = {}; -trad_counter = {}; + indexes[(sl, tl)] = trad_counter[sl] + trad_counter[sl] += 1 + 
sl_tl[sl].append(tl) -am_counter = 0; -dm_counter = 0; + if line.count('@') > 0: + sl_tl_defaults[sl] = tl +class Counter(BCC.BiltransCounter): + tokenizer = 'biltrans' + line_ids = True + count_ngrams = True + max_ngrams = 3 + biltrans_wrap_lus = True -# First read in the frequency defaults + def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0): + global sl_tl, features, indexes + BCC.features_and_outline(self.ngrams, sl, tl, sl_tl, features, + indexes, frac_count=frac_count) + self.clear_ngrams() -for line in open(sys.argv[1]): #{ - line = line.strip(); - if len(line) < 1: #{ - continue; - #} - row = common.tokenize_tagger_line(line); - sl = common.wrap(row[0]); - tl = common.wrap(row[1]); - if tl[1] == '*': - tl = tl[:-3] + '$' - if sl not in trad_counter: #{ - trad_counter[sl] = 0; - #} - if sl not in sl_tl: #{ - sl_tl[sl] = []; - #} - if line.count('@') > 0: #{ - sl_tl_defaults[sl] = tl; - sl_tl[sl].append(tl); - indexes[(sl, tl)] = trad_counter[sl]; - trad_counter[sl] = trad_counter[sl] + 1; - else: #{ - sl_tl[sl].append(tl); - indexes[(sl, tl)] = trad_counter[sl]; - trad_counter[sl] = trad_counter[sl] + 1; - #} -#} - -am_file = open(sys.argv[2]); # File with ambiguous biltrans output -dm_file = open(sys.argv[3]); # File with disambiguated biltrans output -reading = True; - -current_am_line_id = -1; -current_dm_line_id = -1; - -dm_line = dm_file.readline(); -current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - -while reading: #{ - am_line = am_file.readline(); - - if am_line == '': #{ - reading = False; - continue; - #} - - current_am_line_id = int(am_line.split('\t')[0]) - while current_dm_line_id == current_am_line_id: #{ - am_row = common.tokenize_biltrans_line(am_line); - dm_row = common.tokenize_biltrans_line(dm_line); - - if len(am_row) != len(dm_row): #{ - print('Mismatch in number of LUs between analysis and training', len(am_row), len(dm_row), 'lines', current_am_line_id, current_dm_line_id, file=sys.stderr); - print('\t' + 
am_line, file=sys.stderr); - print('\t' + dm_line, file=sys.stderr); - print('...skipping', file=sys.stderr); - dm_line = dm_file.readline() - current_dm_line_id = int(dm_line.split('\t')[0]); - continue; - #} - - try: - frac_count = 0.0; - s_fc = dm_line.split('\t')[2].strip(); - if s_fc == '' or len(s_fc) == 0: #{ -# print('%d %d :: %d %d :: Frac count is not floatable' % (am_counter, dm_counter, current_am_line_id, current_dm_line_id), file=sys.stderr); - dm_line = dm_file.readline() - current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - continue; - #} - - frac_count = float(s_fc); - - if math.isnan(frac_count): #{ -# print('%d %d :: %d %d :: Frac count is not a number' % (am_counter, dm_counter, current_am_line_id, current_dm_line_id), file=sys.stderr); - frac_count = 0.0; - #} - except: - dm_line = dm_file.readline() - current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - continue; - - cur_sl_row = [x['sl'] for x in am_row] - limit = len(am_row); - for i in range(0, limit): #{ - if len(am_row[i]['tls']) > 1: #{ - sl = common.wrap(am_row[i]['sl']) - tl = common.wrap(dm_row[i]['tls'][0]) - - for j in range(1, MAX_NGRAMS): #{ - pregram = ' '.join(map(common.wrap, cur_sl_row[i-j:i+1])); - postgram = ' '.join(map(common.wrap, cur_sl_row[i:i+j+1])); - roundgram = ' '.join(map(common.wrap, cur_sl_row[i-j:i+j+1])); - - if sl not in ngrams: #{ - ngrams[sl] = {}; - #} - if pregram not in ngrams[sl]: #{ - ngrams[sl][pregram] = {}; - #} - if postgram not in ngrams[sl]: #{ - ngrams[sl][postgram] = {}; - #} - if roundgram not in ngrams[sl]: #{ - ngrams[sl][roundgram] = {}; - #} - if tl not in ngrams[sl][pregram]: #{ - ngrams[sl][pregram][tl] = 0.0; - #} - if tl not in ngrams[sl][postgram]: #{ - ngrams[sl][postgram][tl] = 0.0; - #} - if tl not in ngrams[sl][roundgram]: #{ - ngrams[sl][roundgram][tl] = 0.0; - #} - - ngrams[sl][pregram][tl] = ngrams[sl][pregram][tl] + frac_count; - ngrams[sl][postgram][tl] = ngrams[sl][postgram][tl] + frac_count; - 
ngrams[sl][roundgram][tl] = ngrams[sl][roundgram][tl] + frac_count; - #} - if sl not in meevents: #{ - meevents[sl] = {}; - #} - if sl not in meoutcomes: #{ - meoutcomes[sl] = {}; - #} - if event_counter not in meevents: #{ - meevents[sl][event_counter] = []; - #} - if event_counter not in meoutcomes: #{ - meoutcomes[sl][event_counter] = ''; - #} - for ni in ngrams[sl]: #{ - if ni not in features: #{ - feature_counter = feature_counter + 1; - features[ni] = feature_counter; - #} - meevents[sl][event_counter].append(features[ni]); - #meevents[sl][event_counter].append(feat); - #meoutcomes[sl][event_counter] = (tl, frac_count); - meoutcomes[sl][event_counter] = (tl, int(frac_count * 10000)); - - #} - del ngrams; - ngrams = {}; - if sl not in sl_tl: #{ - continue; - #} - if len(sl_tl[sl]) < 2: #{ - continue; - #} - - for event in meevents[sl]: #{ - outline = str(indexes[(sl, meoutcomes[sl][event][0])]) + ' $ ' ; - outline = outline + str(meoutcomes[sl][event][1]) + ' # '; - for j in range(0, len(sl_tl[sl])): #{ - for feature in meevents[sl][event]: #{ - outline = outline + str(feature) + ':' + str(j) + ' '; - #} - outline = outline + ' # ' - #} - print(sl , '\t', len(sl_tl[sl]),'\t', outline); - #} - del meevents; - del meoutcomes; - meevents = {}; - meoutcomes = {}; - #} - #} - - dm_line = dm_file.readline(); - if dm_line == '': #{ - reading = False; - break; - #} - - current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - event_counter = event_counter + 1; - - dm_counter += 1; - - #} - am_counter += 1; - -#} - -for feature in features: #{ - print(features[feature] , '\t' , feature, file=sys.stderr); -#} +c = Counter() +c.read_files(sys.argv[2], # File with ambiguous biltrans output + sys.argv[3]) # File with disambiguated biltrans output +for feature in features: + print(features[feature] , '\t' , feature, file=sys.stderr) diff --git a/scripts/biltrans-count-patterns-frac.py b/scripts/biltrans-count-patterns-frac.py index 7e03ae7..3384566 100755 --- 
a/scripts/biltrans-count-patterns-frac.py +++ b/scripts/biltrans-count-patterns-frac.py @@ -2,8 +2,10 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy, re; -import common; +import sys, re +import common +from collections import defaultdict +import biltrans_count_common as BCC # Input: # a) Frequency lexicon @@ -28,202 +30,43 @@ import common; # d) Crispiness threshold -MAX_NGRAMS = 3; # Max = 5-grams +crisphold = 3.0 # Default +only_max = True +#only_max = False +cache_counts = open('/tmp/cache_counts.log', 'w+') -cur_line = 0; -crisphold = 3.0 ; # Default -only_max = True; -#only_max = False; -cache_counts = open('/tmp/cache_counts.log', 'w+'); +if len(sys.argv) == 5: + crisphold = float(sys.argv[4]) + print('crisp:', crisphold, file=sys.stderr) -if len(sys.argv) == 5: #{ - crisphold = float(sys.argv[4]); - print('crisp:', crisphold, file=sys.stderr); -#} +# First read in the frequency defaults -sl_tl_defaults = {}; -sl_tl = {}; -ngrams = {}; +sl_tl, sl_tl_defaults, _ = BCC.read_frequencies(sys.argv[1]) -# First read in the frequency defaults +print('Reading...', file=sys.stderr) +sys.stderr.flush() -for line in open(sys.argv[1]).readlines(): #{ - if len(line) < 1: #{ - continue; - #} - row = line.split(' '); - sl = row[1]; - tl = row[2]; - fr = float(row[0]); - if line.count('@') and fr == 0.0: #{ - print('!!! 
Prolly something went wrong here, the default has a freq of 0.0', file=sys.stderr); - print(' %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr); - #} - if line.count('@') > 0: #{ - print('default:', sl, tl, file=sys.stderr); - sl_tl_defaults[sl] = tl; - else: #{ - sl_tl[sl] = tl; - #} - -#} - -print('Reading...', file=sys.stderr); -sys.stderr.flush(); - -am_file = open(sys.argv[2]); # File with ambiguous biltrans output -dm_file = open(sys.argv[3]); # File with disambiguated biltrans output -reading = True; - -current_am_line_id = -1; -current_dm_line_id = -1; - -rsep = re.compile('\$[^\^]*\^'); - -dm_line = dm_file.readline(); -current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - -am_counter = 0; -dm_counter = 0; - - -while reading: #{ - am_line = am_file.readline(); - - if am_line == '': #{ - reading = False; - continue; - #} - - current_am_line_id += 1 - -# # to skip lines in the frac corpus if we have a sub-corpus -# if current_dm_line_id != current_am_line_id: #{ -# print('line_id_mismatch: %d != %d' % (current_am_line_id, current_dm_line_id), file=sys.stderr); -# while current_dm_line_id != current_am_line_id: #{ -# dm_line = dm_file.readline(); -# current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); -# print('skipping %d ...' 
% (current_dm_line_id), file=sys.stderr); -# #} -# #} - while current_dm_line_id == current_am_line_id: #{ - - am_row = common.tokenize_biltrans_line(am_line); - dm_row = common.tokenize_biltrans_line(dm_line); - - if len(am_row) != len(dm_row): #{ - print('Mismatch in number of LUs between analysis and training', file=sys.stderr); - print('\t' + am_line, file=sys.stderr); - print('\t' + dm_line, file=sys.stderr); - print('...skipping', file=sys.stderr); - continue; - #} - - cur_sl_row = []; - for lu in am_row: #{ - sl = lu.split('/')[0]; - if sl.count('><') > 0: #{ - sl = sl.split('><')[0] + '>'; - #} - cur_sl_row.append(sl); - #} +class Counter(BCC.BiltransCounter): + tokenizer = 'biltrans' + line_ids = True + count_ngrams = True + max_ngrams = 3 - try: - frac_count = float(dm_line.split('\t')[2]); - except: - break; - - limit = len(am_row); - for i in range(0, limit): #{ - if am_row[i].count('/') > 1: #{ - #print(am_row[i] , dm_row[i]); - sl = am_row[i].split('/')[0].replace(' ', '~'); - tl = dm_row[i].split('/')[1].replace(' ', '~'); - if sl.count('><') > 0: #{ - sl = sl.split('><')[0] + '>'; - #} - if tl.count('><') > 0: #{ - tl = tl.split('><')[0] + '>'; - #} - -# if tl != sl_tl_defaults[sl]: #{ -# print('+' , sl , sl_tl_defaults[sl] , tl, file=sys.stderr); -# else: #{ -# print('-' , sl , sl_tl_defaults[sl] , tl, file=sys.stderr); -# #} - - for j in range(1, MAX_NGRAMS): #{ - pregram = ' '.join(cur_sl_row[i-j:i+1]); - postgram = ' '.join(cur_sl_row[i:i+j+1]); - roundgram = ' '.join(cur_sl_row[i-j:i+j+1]); - - if sl not in ngrams: #{ - ngrams[sl] = {}; - #} - if pregram not in ngrams[sl]: #{ - ngrams[sl][pregram] = {}; - #} - if postgram not in ngrams[sl]: #{ - ngrams[sl][postgram] = {}; - #} - if roundgram not in ngrams[sl]: #{ - ngrams[sl][roundgram] = {}; - #} - - if tl not in ngrams[sl][pregram]: #{ - ngrams[sl][pregram][tl] = 0.0; - #} - if tl not in ngrams[sl][postgram]: #{ - ngrams[sl][postgram][tl] = 0.0; - #} - if tl not in ngrams[sl][roundgram]: #{ - 
ngrams[sl][roundgram][tl] = 0.0; - #} - ngrams[sl][pregram][tl] = ngrams[sl][pregram][tl] + frac_count; - ngrams[sl][postgram][tl] = ngrams[sl][postgram][tl] + frac_count; - ngrams[sl][roundgram][tl] = ngrams[sl][roundgram][tl] + frac_count; - -# print('=> %s\t[%.10f] %s' % (tl, ngrams[sl][pregram][tl], pregram), file=sys.stderr); -# print('=> %s\t[%.10f] %s' % (tl, ngrams[sl][roundgram][tl], roundgram), file=sys.stderr); -# print('=> %s\t[%.10f] %s' % (tl, ngrams[sl][postgram][tl], postgram), file=sys.stderr); - - - #} - #} - #} - - dm_line = dm_file.readline(); - if dm_line == '': #{ - reading = False; - break; - #} - current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]); - - dm_counter += 1; - #} - am_counter += 1; - - if am_counter % 10000 == 0: #{ - print('=> %d SL and %d TL lines [id: %d] [ngrams: %d].' % (am_counter, dm_counter, current_am_line_id, len(ngrams)), file=sys.stderr); - sys.stderr.flush(); - #} -#} - -print('Caching counts...', file=sys.stderr); -for sl in ngrams: #{ - - for ngram in ngrams[sl]: #{ - - for tl in ngrams[sl][ngram]: #{ - print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl), file=cache_counts); - #} - #} -#} -print('\n', file=sys.stderr); - -for sl in ngrams: #{ - - for ngram in ngrams[sl]: #{ +c = Counter() +c.read_files(sys.argv[2], # File with ambiguous biltrans output + sys.argv[3]) # File with disambiguated biltrans output +ngrams = c.ngrams + +print('Caching counts...', file=sys.stderr) +for sl in ngrams: + for ngram in ngrams[sl]: + for tl in ngrams[sl][ngram]: + print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl), file=cache_counts) + +print('\n', file=sys.stderr) + +for sl in ngrams: + for ngram in ngrams[sl]: try: #> If for each of the rules we include #> the amount of time the translation is seen with that pattern over the @@ -241,74 +84,65 @@ for sl in ngrams: #{ #It would be "2" in this case: the alternative is seen twice as often as #the default. 
- total = 0.0; - max_freq = 0.0; - max_tl = ''; - for tl in ngrams[sl][ngram]: #{ - if ngrams[sl][ngram][tl] > max_freq: #{ - max_freq = ngrams[sl][ngram][tl]; - max_tl = tl; - #} - total = total + ngrams[sl][ngram][tl]; - #} - - if only_max == True: #{ - crispiness = 0.0; - default = sl_tl_defaults[sl]; - # if default == max_tl: #{ - # print('default=max_tl', default, max_tl, '\t', ngram, file=sys.stderr); - # else:#{ - # print('default!=max_tl', default, max_tl, '\t', ngram, file=sys.stderr); - # #} - alt_crisp = float(ngrams[sl][ngram][max_tl]) / float(total); - def_crisp = 1.0; - if default in ngrams[sl][ngram]: #{ - def_crisp = float(ngrams[sl][ngram][default] / float(total)); - #} - weight = float(ngrams[sl][ngram][max_tl]) / float(total); - crispiness = alt_crisp/def_crisp; - - if crispiness < crisphold: #{ - print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl])); - # print('-', crispiness , weight , total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], '\t'+ sl + '\t' + ngram + '\t' + max_tl + '\t' + str(ngrams[sl][ngram][max_tl])); - else: #{ - - print('+ %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl])); - #print('+', crispiness , weight , total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], '\t' + sl + '\t' + ngram + '\t' + max_tl + '\t' + str(ngrams[sl][ngram][max_tl])); - #} + total = 0.0 + max_freq = 0.0 + max_tl = '' + for tl in ngrams[sl][ngram]: + if ngrams[sl][ngram][tl] > max_freq: + max_freq = ngrams[sl][ngram][tl] + max_tl = tl + + total += ngrams[sl][ngram][tl] + + if only_max == True: + crispiness = 0.0 + default = sl_tl_defaults[sl] + # if default == max_tl: + # print('default=max_tl', default, max_tl, 
'\t', ngram, file=sys.stderr) + # else: + # print('default!=max_tl', default, max_tl, '\t', ngram, file=sys.stderr) + # + alt_crisp = float(ngrams[sl][ngram][max_tl]) / float(total) + def_crisp = 1.0 + if default in ngrams[sl][ngram]: + def_crisp = float(ngrams[sl][ngram][default] / float(total)) + + weight = float(ngrams[sl][ngram][max_tl]) / float(total) + crispiness = alt_crisp/def_crisp + + if crispiness < crisphold: + print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl])) + # print('-', crispiness , weight , total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], '\t'+ sl + '\t' + ngram + '\t' + max_tl + '\t' + str(ngrams[sl][ngram][max_tl])) + else: + + print('+ %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], sl, ngram, max_tl, ngrams[sl][ngram][max_tl])) + #print('+', crispiness , weight , total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][max_tl], '\t' + sl + '\t' + ngram + '\t' + max_tl + '\t' + str(ngrams[sl][ngram][max_tl])) + # crispiness weight total default max_freq tl_freq sl #+ 2.61845457309 0.7236389238 1.0 0.2763610762 0.7236389238 0.7236389238 aozer aozer an levr organisateur 0.7236389238 #- 14736.0468727 0.9999321438 1.0 0.9999321438 0.9999321438 treuzkas treuzkas teknologel transfert 0.9999321438 + else: + for tl in ngrams[sl][ngram]: + crispiness = 0.0 + default = sl_tl_defaults[sl] + alt_crisp = float(ngrams[sl][ngram][tl]) / float(total) + def_crisp = 1.0 + if default in ngrams[sl][ngram]: + def_crisp = float(ngrams[sl][ngram][default] / float(total)) + weight = float(ngrams[sl][ngram][tl]) / float(total) + crispiness = alt_crisp/def_crisp - else: #{ - - for tl in ngrams[sl][ngram]: #{ - - crispiness = 0.0; - default = sl_tl_defaults[sl]; - alt_crisp = 
float(ngrams[sl][ngram][tl]) / float(total); - def_crisp = 1.0; - if default in ngrams[sl][ngram]: #{ - def_crisp = float(ngrams[sl][ngram][default] / float(total)); - #} - weight = float(ngrams[sl][ngram][tl]) / float(total); - crispiness = alt_crisp/def_crisp; + #print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] - #print '%%%' , crispiness , alt_crisp , def_crisp , tl , default , ngrams[sl][ngram] ; + if crispiness < crisphold: + print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl])) + else: + print('+ %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl])) - if crispiness < crisphold: #{ - print('- %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl])); - else: #{ - print('+ %.10f %.10f %.10f %.10f %.10f %.10f\t%s\t%s\t%s\t%.10f' % (crispiness, weight, total, ngrams[sl][ngram][default] , max_freq, ngrams[sl][ngram][tl], sl, ngram, tl, ngrams[sl][ngram][tl])); - #} #+ 1013.01568891 0.9989973752 2.0 1.9979947504 1.9979947504 galloud ha an galloud puissance 1.9979947504 - #} - #} except: pass - #} -#} diff --git a/scripts/biltrans-count-patterns-me.py b/scripts/biltrans-count-patterns-me.py index 2d03483..c5ae682 100755 --- a/scripts/biltrans-count-patterns-me.py +++ b/scripts/biltrans-count-patterns-me.py @@ -2,189 +2,37 @@ # coding=utf-8 # -*- encoding: utf-8 -*- -import sys, codecs, copy; +import sys +from collections import defaultdict +import common +import biltrans_count_common as BCC # Input: # a) Frequency lexicon # b) Biltrans output # c) Disambiguated biltrans output -MAX_NGRAMS = 3; +features = {} # features[(sl, ['a', 
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Emit maxent training events from ambiguous vs. disambiguated biltrans
output: one event per ambiguous lexical unit, with context n-grams as
features.

Input:
  a) Frequency lexicon
  b) Biltrans output
  c) Disambiguated biltrans output
"""

import sys
from collections import defaultdict
import common
import biltrans_count_common as BCC

# features[ngram] -> 1-based feature id (filled in by
# BCC.features_and_outline; keys are n-gram context strings).
features = {}

sl_tl, sl_tl_defaults, indexes = BCC.read_frequencies(sys.argv[1])

class Counter(BCC.BiltransCounter):
    tokenizer = 'regex'
    line_ids = False
    count_ngrams = True
    max_ngrams = 3

    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
        global sl_tl, sl_tl_defaults, features, indexes
        # '-' marks the default translation, '+' an alternative.
        sym = '-' if tl == sl_tl_defaults[sl] else '+'
        print(sym, sl, sl_tl_defaults[sl], tl, file=sys.stderr)
        BCC.features_and_outline(self.ngrams, sl, tl, sl_tl, features,
                                 indexes)
        # The n-gram table is per-event here, so reset after each LU.
        self.clear_ngrams()

c = Counter()
c.read_files(sys.argv[2],  # File with ambiguous biltrans output
             sys.argv[3])  # File with disambiguated biltrans output

# Dump the feature-id <-> n-gram mapping for later decoding.
for feature in features:
    print(features[feature], '\t', feature, file=sys.stderr)
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Count (source n-gram, translation) co-occurrences weighted by the
fractional counts carried on the disambiguated lines, and dump the raw
counts for later crispiness scoring.

Input:
  a) Frequency lexicon
  b) Biltrans output
  c) Disambiguated biltrans output
  d) Crispiness threshold (optional, argv[4])
"""

import sys
import biltrans_count_common as BCC

cur_line = 0
crisphold = 3.0  # Default
only_max = True  # NOTE: retained from the pre-refactor script; unused here
#only_max = False

if len(sys.argv) == 5:
    crisphold = float(sys.argv[4])
    print('crisp:', crisphold, file=sys.stderr)

# First read in the frequency defaults
sl_tl, sl_tl_defaults, _ = BCC.read_frequencies(sys.argv[1])

print('Reading...', file=sys.stderr)
sys.stderr.flush()

class Counter(BCC.BiltransCounter):
    tokenizer = 'biltrans'
    line_ids = True       # dm lines carry '.[][N ...' sentence ids
    count_ngrams = True
    max_ngrams = 3

c = Counter()
c.read_files(sys.argv[2],  # File with ambiguous biltrans output
             sys.argv[3])  # File with disambiguated biltrans output
ngrams = c.ngrams

print('Caching counts...', file=sys.stderr)
for sl in ngrams:
    for ngram in ngrams[sl]:
        for tl in ngrams[sl][ngram]:
            print('%.10f\t%s\t%s\t%s' % (ngrams[sl][ngram][tl], ngram, sl, tl))

print('\n', file=sys.stderr)
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Count (source n-gram, translation) co-occurrences and rate each pair
by "crispiness": how much more often an alternative translation is seen
with a pattern than the default translation is.

Input:
  a) Frequency lexicon
  b) Biltrans output
  c) Disambiguated biltrans output
  d) Crispiness threshold (optional, argv[4])
"""

import sys, re
import common
import biltrans_count_common as BCC

cur_line = 0
crisphold = 3.0  # pairs below this threshold are printed with '-'
if len(sys.argv) == 5:
    crisphold = float(sys.argv[4])
    print('crisp:', crisphold, file=sys.stderr)

# sl_tl: last non-default translation per source word;
# sl_tl_defaults: the '@'-marked default translation per source word.
sl_tl, sl_tl_defaults, _ = BCC.read_frequencies(sys.argv[1])

class Counter(BCC.BiltransCounter):
    tokenizer = 'regex'
    line_ids = False
    count_ngrams = True
    max_ngrams = 3

    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
        # Log whether the chosen translation is the default ('-') or an
        # alternative ('+') for this ambiguous LU.
        global sl_tl_defaults
        sym = '-' if tl == sl_tl_defaults[sl] else '+'
        print(sym, sl, sl_tl_defaults[sl], tl, file=sys.stderr)


c = Counter()
c.read_files(sys.argv[2],  # File with ambiguous biltrans output
             sys.argv[3])  # File with disambiguated biltrans output
ngrams = c.ngrams

for sl in ngrams:
    for ngram in ngrams[sl]:
        total = 0
        max_freq = -1
        current_tl = ''
        # Find the most frequent translation for this pattern and the
        # total count across all translations.
        for tl in ngrams[sl][ngram]:
            if ngrams[sl][ngram][tl] > max_freq:
                max_freq = ngrams[sl][ngram][tl]
                current_tl = tl
            total += ngrams[sl][ngram][tl]

        for tl in ngrams[sl][ngram]:
            default = sl_tl_defaults[sl]
            # crispiness = (share of this tl) / (share of the default);
            # def_crisp stays 1.0 when the default never co-occurs with
            # this pattern.
            alt_crisp = float(ngrams[sl][ngram][tl]) / float(total)
            def_crisp = 1.0
            if default in ngrams[sl][ngram]:
                def_crisp = float(ngrams[sl][ngram][default] / float(total))

            weight = float(ngrams[sl][ngram][tl]) / float(total)
            crispiness = alt_crisp / def_crisp

            if crispiness < crisphold:
                print('-', crispiness , weight , total, max_freq, ngrams[sl][ngram][tl], '\t'+ sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][tl]))
            else:
                # NOTE(review): the '+' branch prints current_tl's count
                # in the last column while the '-' branch prints tl's own
                # count; this asymmetry is inherited from the
                # pre-refactor script — confirm it is intentional.
                print('+', crispiness , weight , total, max_freq, ngrams[sl][ngram][tl], '\t' + sl + '\t' + ngram + '\t' + tl + '\t' + str(ngrams[sl][ngram][current_tl]))
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Extract a fractional-count frequency lexicon from ambiguous and
disambiguated biltrans output.

Input:
  a) Biltrans output (ambiguous)
  b) Disambiguated biltrans output carrying fractional counts
"""

import sys
from collections import defaultdict
import common  # FIX: common.wrap() is used below but the refactor dropped this import (NameError)
import biltrans_count_common as BCC

# The sl-tl possible combinations: sl_tl[sl][tl] accumulates the
# fractional count of each pairing.
sl_tl = defaultdict(lambda: defaultdict(lambda: 0.0))

class Counter(BCC.BiltransCounter):
    tokenizer = 'biltrans'
    line_ids = True   # dm lines carry '.[][N ...' sentence ids

    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
        global sl_tl
        sl_tl[sl][tl] += frac_count

c = Counter()
c.read_files(sys.argv[1],  # File with ambiguous biltrans output
             sys.argv[2])  # File with disambiguated biltrans output

# Emit translations per source word, most frequent first; the first
# (most frequent) one is marked '@' as the default.
for sl in sl_tl:
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True
    for tl in newtl:
        if first:
            print('%.10f %s %s @' % (sl_tl[sl][tl] , common.wrap(sl) , common.wrap(tl)))
            first = False
        else:
            print('%.10f %s %s' % (sl_tl[sl][tl] , common.wrap(sl) , common.wrap(tl)))
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Extract an integer-count frequency lexicon from ambiguous and
disambiguated biltrans output.

Input:
  a) Biltrans output (ambiguous)
  b) Disambiguated biltrans output
"""

import sys
import common
import biltrans_count_common as BCC
from collections import defaultdict

# The sl-tl possible combinations: sl_tl[sl][tl] counts each pairing.
sl_tl = defaultdict(lambda: defaultdict(lambda: 0))

class Counter(BCC.BiltransCounter):
    tokenizer = 'biltrans'
    line_ids = False

    # FIX: the refactor defined `processs_row` (typo), which overrode
    # nothing — and its body string-parsed rows that the 'biltrans'
    # tokenizer actually yields as dicts.  Override process_lu instead
    # and let the base class handle tokenization and ambiguity checks.
    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
        global sl_tl
        sl_tl[sl][tl] += 1

c = Counter()
c.read_files(sys.argv[1],  # File with ambiguous biltrans output
             sys.argv[2])  # File with disambiguated biltrans output

# Emit translations per source word, most frequent first; the first
# (most frequent) one is marked '@' as the default.
for sl in sl_tl:
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True
    for tl in newtl:
        if first:
            print(sl_tl[sl][tl] , sl , tl , '@')
            first = False
        else:
            print(sl_tl[sl][tl] , sl , tl)
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Poor man's frequency lexicon: count, for each ambiguous source word,
which of its possible translations actually appear anywhere in the
(unaligned) target line.

Input:
  a) Biltrans output (ambiguous)
  b) Biltrans output (target side)
"""

import sys
import biltrans_count_common as BCC
from collections import defaultdict

# The sl-tl possible combinations
sl_tl = defaultdict(lambda: defaultdict(lambda: 0))

class Counter(BCC.BiltransCounter):
    # NOTE(review): this override treats self.am_row/self.dm_row as raw
    # LU strings, which matches the 'regex' tokenizer; the 'biltrans'
    # tokenizer yields dicts, so `.count('/')` below would fail —
    # confirm the intended tokenizer against common.tokenize_biltrans_line.
    tokenizer = 'biltrans'
    line_ids = False

    def process_row(self, frac_count=0):
        global sl_tl
        for i in range(len(self.am_row)):
            if self.am_row[i].count('/') > 1:
                # FIX: the refactor referenced bare `am_row`/`dm_row`
                # (NameError); these are attributes on self.
                sl = BCC.strip_tags(self.am_row[i], 'sl', space=True)

                # Every listed translation of this LU is a candidate.
                bts = self.am_row[i].split('/')[1:]
                valid_trads = set(BCC.strip_tags(b, 'sl', space=True)
                                  for b in bts)

                # Count any target-side token that matches a candidate.
                for tl_ in self.dm_row:
                    tl = BCC.strip_tags(tl_, 'sl', space=True)
                    if tl in valid_trads:
                        sl_tl[sl][tl] += 1

c = Counter()
c.read_files(sys.argv[1],  # File with ambiguous biltrans output
             sys.argv[2])  # File with biltrans output

# Emit translations per source word, most frequent first; the first
# (most frequent) one is marked '@' as the default.
for sl in sl_tl:
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True
    for tl in newtl:
        if first:
            print(sl_tl[sl][tl] , sl , tl , '@')
            first = False
        else:
            print(sl_tl[sl][tl] , sl , tl)
#!/usr/bin/python3
# coding=utf-8
# -*- encoding: utf-8 -*-

"""Shared machinery for the biltrans counting scripts: reading the
frequency lexicon, walking an ambiguous/disambiguated pair of biltrans
files in lockstep, and accumulating context n-gram counts."""

from collections import defaultdict
import common
import math
import re
import sys

def safe_float(s):
    """Parse *s* as a float.

    Returns (value, ok); ok is False (and value 0.0) for empty input,
    NaN, or anything float() rejects.
    """
    if not s:
        return 0.0, False
    try:
        f = float(s)
        if math.isnan(f):
            return 0.0, False
        return f, True
    except (TypeError, ValueError):  # narrowed from a bare except
        return 0.0, False

def strip_tags(s, side, space=False):
    """Return the sl ('word/...') or tl ('.../word') half of a biltrans
    LU string, truncated after the first tag; spaces become '~' unless
    space=True."""
    idx = 0 if side == 'sl' else 1
    ret = s.split('/')[idx]
    if not space:
        ret = ret.replace(' ', '~')
    if ret.count('><') > 0:
        ret = ret.split('><')[0] + '>'
    return ret

class BiltransCounter:
    """Reads an ambiguous (am) and a disambiguated (dm) biltrans file in
    parallel and hands each ambiguous LU to process_lu() (or the whole
    line to process_row()), optionally accumulating n-gram counts in
    self.ngrams.  Subclasses configure behaviour via the class
    attributes below.
    """

    lu_sep = re.compile(r'\$[^\^]*\^')  # separator between '$ ... ^' LUs
    tokenizer = 'regex'        # 'regex' or 'biltrans'
    line_ids = False           # True when dm lines carry '.[][N ...' ids
    count_ngrams = False
    max_ngrams = 3
    biltrans_wrap_lus = False  # wrap sl/tl with common.wrap() before use

    def __init__(self):
        self.reading = False

        self.am_file = None
        self.am_line = None
        self.am_row = None
        self.am_id = None
        self.am_linenum = 0
        self.dm_file = None
        self.dm_line = None
        self.dm_row = None
        self.dm_id = None
        # FIX: the original assigned am_linenum a second time here,
        # leaving dm_linenum undefined (AttributeError in next_dm_line).
        self.dm_linenum = 0

        self.clear_ngrams()

    def __del__(self):
        if self.am_file:
            self.am_file.close()
        if self.dm_file:
            self.dm_file.close()

    def next_am_line(self):
        """Advance the ambiguous file by one line, tokenizing into
        self.am_row; clears self.reading at EOF."""
        self.am_line = self.am_file.readline()
        self.am_linenum += 1
        if not self.am_line:
            self.am_id, self.am_row = None, []
            self.reading = False
            return
        ls = self.am_line.split('\t')
        if self.line_ids:
            self.am_id = int(ls[0].strip())
        if self.tokenizer == 'regex':
            self.am_row = self.lu_sep.split(ls[1].strip()[1:-1])
        elif self.tokenizer == 'biltrans':
            self.am_row = common.tokenize_biltrans_line(self.am_line)

    def next_dm_line(self):
        """Advance the disambiguated file by one line (see next_am_line)."""
        self.dm_linenum += 1
        self.dm_line = self.dm_file.readline()
        if not self.dm_line:
            self.dm_id, self.dm_row = None, []
            self.reading = False
            return
        if self.line_ids:
            self.dm_id = int(self.dm_line.split('.[][')[1].split()[0])
        if self.tokenizer == 'regex':
            # FIX: `ls` was never assigned in this method (copy-paste
            # from next_am_line), raising NameError for regex tokenizing.
            ls = self.dm_line.split('\t')
            self.dm_row = self.lu_sep.split(ls[1].strip()[1:-1])
        elif self.tokenizer == 'biltrans':
            self.dm_row = common.tokenize_biltrans_line(self.dm_line)

    def clear_ngrams(self):
        # ngrams[sl][ngram][tl] -> float count
        self.ngrams = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

    def check_rows(self):
        """True when the two tokenized rows align LU-for-LU."""
        if len(self.am_row) != len(self.dm_row):
            print('Mismatch in number of LUs between analysis and training', file=sys.stderr)
            # FIX: am_line/dm_line are attributes; the bare names raised
            # NameError whenever a mismatch was actually hit.
            print('\t' + self.am_line, file=sys.stderr)
            print('\t' + self.dm_line, file=sys.stderr)
            print('...skipping', file=sys.stderr)
            return False
        return True

    # FIX: `self` was missing from this signature and from read_files',
    # so every call raised TypeError.
    def read_files_multi_dm(self, am_fname, dm_fname):
        """Lockstep reader for the line_ids case: each am line may match
        several dm lines carrying the same id plus a fractional count."""
        self.next_dm_line()
        while self.reading:
            self.next_am_line()
            while self.am_id == self.dm_id and self.reading:
                frac_count = 0
                if self.dm_line.count('\t') > 1:
                    frac_count, _ = safe_float(self.dm_line.split('\t')[2])
                if self.check_rows():
                    self.process_row(frac_count)
                self.next_dm_line()
            if self.am_linenum % 1000 == 0:
                print('=> %d SL and %d TL lines read' % (self.am_linenum, self.dm_linenum), file=sys.stderr)

    def read_files(self, am_fname, dm_fname):
        """Open the two files and drive process_row() over aligned lines."""
        self.am_file = open(am_fname)
        self.dm_file = open(dm_fname)
        self.reading = True
        if self.line_ids:
            self.read_files_multi_dm(am_fname, dm_fname)
            return
        while self.reading:
            self.next_am_line()
            self.next_dm_line()
            if self.reading and self.check_rows():
                # FIX: was process_row() with frac_count defaulting to 0,
                # which made process_lu_internal() add 0 to every n-gram
                # count; without fractional counts each occurrence counts
                # once, as in the pre-refactor scripts.
                self.process_row(1)
            if self.am_linenum % 1000 == 0:
                print('=> %d lines read' % self.am_linenum, file=sys.stderr)

    def process_row(self, frac_count=0):
        """Visit every ambiguous LU of the current line pair."""
        if self.tokenizer == 'regex':
            cur_sl_row = [strip_tags(s, 'sl', space=True) for s in self.am_row]
            for i in range(len(self.am_row)):
                if self.am_row[i].count('/') > 1:
                    sl = strip_tags(self.am_row[i], 'sl')
                    tl = strip_tags(self.dm_row[i], 'tl')
                    self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)
        elif self.tokenizer == 'biltrans':
            cur_sl_row = [x['sl'] for x in self.am_row]
            for i in range(len(self.am_row)):
                if len(self.am_row[i]['tls']) > 1:
                    sl = self.am_row[i]['sl']
                    tl = self.dm_row[i]['tls'][0]
                    if self.biltrans_wrap_lus:
                        sl = common.wrap(sl)
                        tl = common.wrap(tl)
                    self.process_lu_internal(sl, tl, i, cur_sl_row, frac_count)

    def process_lu_internal(self, sl, tl, idx, cur_sl_row, frac_count=0):
        """Accumulate pre-/post-/round-context n-grams, then delegate to
        the subclass hook."""
        if self.count_ngrams:
            for j in range(1, self.max_ngrams):
                pregram = ' '.join(map(common.wrap, cur_sl_row[idx-j:idx+1]))
                postgram = ' '.join(map(common.wrap, cur_sl_row[idx:idx+j+1]))
                roundgram = ' '.join(map(common.wrap, cur_sl_row[idx-j:idx+j+1]))
                self.ngrams[sl][pregram][tl] += frac_count
                self.ngrams[sl][postgram][tl] += frac_count
                self.ngrams[sl][roundgram][tl] += frac_count
        self.process_lu(sl, tl, idx, cur_sl_row, frac_count)

    def process_lu(self, sl, tl, idx, cur_sl_row, frac_count=0):
        """Subclass hook; called once per ambiguous LU."""
        pass

def features_and_outline(ngrams, sl, tl, sl_tl, features, indexes,
                         frac_count=None):
    """Register the n-grams of one event as numbered features and print
    a maxent training line ('<index> [$ <frac>] # feat:alt ... #')."""
    if not ngrams[sl]:
        return
    meevents = []
    for ni in ngrams[sl]:
        if ni not in features:
            feature_counter = len(features) + 1
            features[ni] = feature_counter
        meevents.append(features[ni])
    # NOTE(review): sl_tl[sl] is a single translation *string*, so both
    # the len() test and the range() below operate on its characters,
    # mirroring the pre-refactor script — confirm this is intended.
    if sl not in sl_tl or len(sl_tl[sl]) < 2:
        return
    outline = str(indexes[(sl, tl)])
    if frac_count is not None:
        outline += ' $ ' + str(int(frac_count * 10000)) + ' '
    outline += ' # '
    for j in range(len(sl_tl[sl])):
        for feature in meevents:
            outline += '%s:%s ' % (feature, j)
        outline += ' # '
    print('%s\t%s\t%s' % (sl, len(sl_tl[sl]), outline))

def read_frequencies(fname):
    """Read a '<freq> <sl> <tl> [@]' frequency-lexicon file.

    Returns (sl_tl, sl_tl_defaults, indexes): the last non-default tl
    per sl, the '@'-marked default tl per sl, and a (sl, tl) -> per-sl
    rank mapping.
    """
    sl_tl = {}
    sl_tl_defaults = {}
    indexes = {}
    trad_counter = defaultdict(lambda: 0)
    with open(fname) as fin:
        for line_ in fin:
            line = line_.strip()
            if not line:
                continue
            row = line.split(' ')
            # FIX: parse the frequency once; the warning below printed an
            # undefined name `fr`, raising NameError whenever it fired.
            fr = float(row[0])
            sl = row[1].strip()
            tl = row[2].strip()
            indexes[(sl, tl)] = trad_counter[sl]
            trad_counter[sl] += 1
            if '@' in line:
                sl_tl_defaults[sl] = tl
                if fr == 0.0:
                    print('!!! Prolly something went wrong here, the default has freq of 0.0', file=sys.stderr)
                    print('    %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr)
            else:
                sl_tl[sl] = tl
    return sl_tl, sl_tl_defaults, indexes