commit 0d6bbdd09ef8a6836672b62adaee086e73e66c99
Author: Amr Keleg
Date:   Mon Aug 5 17:56:00 2019 +0200

    Improve the fallback weights for unknown words

    When generating a supervised weightlist, the number of times certain tags
    occur in the corpus can be used to provide an estimate for unknown words
    with a matching tag.
    Example: if a tag <tag> occurred m times, then we can assume that analyses
    of the form ?*<tag> have a fallback weight that is roughly proportional to m.
    Additionally, a default weight can be estimated as a further fallback weight
    using Laplace smoothing.
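To make the smoothing concrete, the following is a minimal standalone sketch of the
arithmetic the updated script performs when both --tag_weightlist and
--default_weightlist are supplied. The regexes and counts are made up for
illustration; only the formulas mirror the patch below.

    import math
    from collections import Counter

    # Made-up counts, standing in for the Counters built from a tagged corpus.
    regex_analyses = Counter({'[d o g %<n%>]': 8, '[c a t %<n%>]': 2})  # full analyses
    regex_tags = Counter({'[?* %<n%>]': 10})                            # ?*<tag> fallbacks

    # Laplace-style smoothing: the denominator also counts the tag fallbacks
    # and one extra unseen event, so every weight is a finite -log probability.
    den = (sum(regex_analyses.values())
           + sum(regex_tags.values()) * (1 + len(regex_analyses))
           + 1 + len(regex_analyses) + len(regex_tags))
    num_offset = sum(regex_tags.values()) + 1

    # Seen analyses get the smallest (cheapest) weights ...
    for regex, count in regex_analyses.most_common():
        print('{}::{}'.format(regex, -math.log(count + num_offset) + math.log(den)))
    # ... tag-only fallbacks get larger ones ...
    for regex, count in regex_tags.most_common():
        print('{}::{}'.format(regex, -math.log(count + 1) + math.log(den)))
    # ... and the catch-all default is the most expensive.
    print('[?*]::{}'.format(-math.log(1) + math.log(den)))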
diff --git a/annotated-corpus-to-weightlist b/annotated-corpus-to-weightlist
index 6ce905b..72deb6b 100755
--- a/annotated-corpus-to-weightlist
+++ b/annotated-corpus-to-weightlist
@@ -3,25 +3,57 @@
 import math
 import argparse
 from collections import Counter
-from utils.utils import extract_analysis, generate_regex
+from utils.utils import extract_analysis, generate_regex, extract_tag_from_analysis
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='generate a regex weightlist given an annotated corpus')
     parser.add_argument('tagged_corpus',
                         type=argparse.FileType('r'),
                         help='input tagged corpus')
-    parser.add_argument('output_weighlist',
+    parser.add_argument('analysis_weightlist',
                         type=argparse.FileType('w'),
-                        help='output weightlist')
+                        help='weightlist for specific analyses')
+    parser.add_argument('--tag_weightlist',
+                        type=argparse.FileType('w'),
+                        help='weightlist for specific tags')
+    parser.add_argument('--default_weightlist',
+                        type=argparse.FileType('w'),
+                        help='weightlist for out-of-corpus tokens')
+
     args = parser.parse_args()
 
     TAGGED_CORPUS = args.tagged_corpus
-    OUTPUT_WEIGHTLIST_FILE = args.output_weighlist
+    ANALYSIS_WEIGHTLIST_FILE = args.analysis_weightlist
+    TAG_WEIGHTLIST_FILE = args.tag_weightlist
+    DEFAULT_WEIGHTLIST_FILE = args.default_weightlist
+
+    lines = TAGGED_CORPUS.readlines()
+    analyses = [extract_analysis(line.strip()) for line in lines]
+    regex_analyses = Counter([generate_regex(analysis) for analysis in analyses if not analysis.startswith('*')])
+    den = sum(regex_analyses.values())
+    num_offset = 0
+
+    if TAG_WEIGHTLIST_FILE:
+        tags = [extract_tag_from_analysis(line.strip()) for line in lines]
+        regex_tags = Counter([generate_regex(tag, match_all_prefixes=True) for tag in tags if tag and not tag.startswith('*')])
+        den += sum(regex_tags.values()) * (1+len(regex_analyses))
+        num_offset += sum(regex_tags.values())
+
+    if DEFAULT_WEIGHTLIST_FILE:
+        den += 1 + len(regex_analyses)
+        if TAG_WEIGHTLIST_FILE:
+            den += len(regex_tags)
+        num_offset += 1
+
+    weighted_regex_analyses = ['{}::{}'.format(regex, -math.log(count + num_offset)+math.log(den))
+                               for regex, count in regex_analyses.most_common()]
+    ANALYSIS_WEIGHTLIST_FILE.write('\n'.join(weighted_regex_analyses))
 
-    analyses = [extract_analysis(line.strip()) for line in TAGGED_CORPUS.readlines()]
+    if TAG_WEIGHTLIST_FILE:
+        offset = 1 if DEFAULT_WEIGHTLIST_FILE else 0
+        weighted_regex_tags = ['{}::{}'.format(regex, -math.log(count + offset) +math.log(den))
+                               for regex, count in regex_tags.most_common()]
+        TAG_WEIGHTLIST_FILE.write('\n'.join(weighted_regex_tags))
 
-    regex_tags = [generate_regex(analysis) for analysis in analyses if not analysis.startswith('*')]
-    weighted_regex_tags = ['{}::{}'.format(regex, -math.log(count/len(regex_tags)))
-                           for regex, count in Counter(regex_tags).most_common()]
-    for regex in weighted_regex_tags:
-        OUTPUT_WEIGHTLIST_FILE.write(regex+'\n')
+    if DEFAULT_WEIGHTLIST_FILE:
+        DEFAULT_WEIGHTLIST_FILE.write('[?*]::{}'.format(-math.log(1) +math.log(den)))
diff --git a/eval/unigram_fit.py b/eval/unigram_fit.py
old mode 100644
new mode 100755
index 1d2116c..7b17b4e
--- a/eval/unigram_fit.py
+++ b/eval/unigram_fit.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 import os
 import sys
 import argparse
@@ -13,30 +15,47 @@ if __name__ == '__main__':
                         help='a compiled dictionary')
     parser.add_argument('-o', '--output_directory', required=True,
                         help='output directory for weighted dictionaries')
+    parser.add_argument('-t', '--use_tags', action='store_true',
+                        help='Use a tags weightlist')
 
     args = parser.parse_args()
     input_directory = args.input_directory
     apertium_bin = args.apertium_bin
     output_directory = args.output_directory
+    use_tags = args.use_tags
+
     if not os.path.exists(output_directory):
         os.mkdir(output_directory)
 
     temp_dir = tempfile.mkdtemp()
-
-    temp_weightlist = Path(temp_dir, 'temp_weightlist')
+
+    temp_analysis_weightlist = Path(temp_dir, 'temp_analysis_weightlist')
+    if use_tags:
+        temp_tag_weightlist = Path(temp_dir, 'temp_tag_weightlist')
+    temp_default_weightlist = Path(temp_dir, 'temp_default_weightlist')
     temp_input_file = Path(temp_dir, 'temp_input')
+
     for input_file in sorted(os.listdir(input_directory)):
-        temp_input_files = [Path(input_directory, input_file) for file in sorted(os.listdir(input_directory)) if file!=input_file]
+        temp_input_files = [Path(input_directory, input_file)
+                            for file in sorted(os.listdir(input_directory)) if file!=input_file]
+
         with open(temp_input_file, 'w') as f:
             for file in temp_input_files:
                 with open(file, 'r') as fold_file:
                     f.write(fold_file.read())
-
-        subprocess.run(['python',
-                        'annotated_corpus_to_weightlist.py',
-                        Path(input_directory, temp_input_file), temp_weightlist])
-
+
+
+        subprocess.run([arg for arg in ['./annotated-corpus-to-weightlist',
+                                        Path(input_directory, temp_input_file),
+                                        temp_analysis_weightlist,
+                                        '--tag_weightlist' if use_tags else None,
+                                        temp_tag_weightlist if use_tags else None,
+                                        '--default_weightlist',
+                                        temp_default_weightlist] if arg] )
+
         # Generate a bin file
-        subprocess.run(['./lt-weight',
+        subprocess.run([arg for arg in ['./lt-weight',
                         apertium_bin,
                         Path(output_directory, '{}.bin'.format(input_file)),
-                        temp_weightlist])
\ No newline at end of file
+                        temp_analysis_weightlist,
+                        temp_tag_weightlist if use_tags else None,
+                        temp_default_weightlist] if arg])
diff --git a/utils/utils.py b/utils/utils.py
index 2f06866..a28b96f 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -4,10 +4,38 @@
 import re
 import sys
 
+def extract_surface(tagged_line):
+    """Extract the surface form from a tagged line
+
+    A tagged line takes the form ^surface/analysis$
+    i.e: returns surface
+    """
+    return re.findall(r'\^.*\/', tagged_line)[0][1:-1]
+
 def extract_analysis(tagged_line):
+    """Extract the Analysis form from a tagged line
+
+    A tagged line takes the form ^surface/analysis$
+    i.e.: returns analysis
+    """
     return re.sub(r'[\t ]|^(\^.*\/)|(\$)$', '', tagged_line)
 
-def generate_regex(analysis):
+def extract_tag_from_analysis(tagged_line):
+    """Extract the tags from a tagged line
+
+    A tagged line takes the form ^surface/analysis$
+    i.e.: returns tag
+    """
+    matches = re.findall('<.*>', extract_analysis(tagged_line))
+    if len(matches) == 0:
+        # TODO: HANDLE THE CASE NO TAGS ARE FOUND
+        # raise BaseException()
+        return None
+    return matches[0]
+
+def generate_regex(analysis, match_all_prefixes=False):
+    """Convert an analysis form into XEROX regex"""
+    # Add a space after each token "REGEX concatenation"
     analysis = ' '.join(analysis)
@@ -26,10 +54,15 @@ def generate_regex(analysis):
     analysis = re.sub(r'(\<.*?\>)',
                       lambda multichar_tag: '%<{}%>'.format((re.sub(' ', '', multichar_tag.group(0)[1:-1]))),
                       analysis)
 
-    # Surround regex with []
-    analysis = '[{}]'.format(analysis)
+    analysis = re.sub(r'[^%]([>])', '%>', analysis)
+    analysis = re.sub(r'[^%]([<])', '%<', analysis)
 
     # TODO: Transform the regex into [?*][REGEX][?*]
     # This may be too slow to be feasible
+    if match_all_prefixes:
+        return '[?* {}]'.format(analysis)
+
+    # Surround regex with []
+    analysis = '[{}]'.format(analysis)
 
     return analysis
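
As a quick sanity check of the new helpers in utils/utils.py, the snippet below runs
them on a hypothetical tagged line (assuming it is executed from the repository root,
the same way annotated-corpus-to-weightlist imports them); the expected outputs in the
comments follow from the regexes added above.

    from utils.utils import extract_surface, extract_analysis, extract_tag_from_analysis

    line = '^dogs/dog<n><pl>$'               # hypothetical ^surface/analysis$ pair
    print(extract_surface(line))             # dogs
    print(extract_analysis(line))            # dog<n><pl>
    print(extract_tag_from_analysis(line))   # <n><pl> (None if the analysis has no tags)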