commit c02dc001cc646eb130b2fba67d0185bfa2886b8a Author: Amr Keleg Date: Mon Jun 17 14:06:51 2019 +0200 Separate the weight estimation part from the weighting script Create a script for generating a weightlist in a supervised way. On the other hand, update lt-weight such that it gets a weightlist as an input. diff --git a/annotated-corpus-to-weightlist b/annotated-corpus-to-weightlist new file mode 100755 index 0000000..6ce905b --- /dev/null +++ b/annotated-corpus-to-weightlist @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +import math +import argparse +from collections import Counter +from utils.utils import extract_analysis, generate_regex + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='generate a regex weightlist given an annotated corpus') + parser.add_argument('tagged_corpus', + type=argparse.FileType('r'), + help='input tagged corpus') + parser.add_argument('output_weighlist', + type=argparse.FileType('w'), + help='output weightlist') + args = parser.parse_args() + TAGGED_CORPUS = args.tagged_corpus + OUTPUT_WEIGHTLIST_FILE = args.output_weighlist + + analyses = [extract_analysis(line.strip()) for line in TAGGED_CORPUS.readlines()] + + regex_tags = [generate_regex(analysis) for analysis in analyses if not analysis.startswith('*')] + weighted_regex_tags = ['{}::{}'.format(regex, -math.log(count/len(regex_tags))) + for regex, count in Counter(regex_tags).most_common()] + + for regex in weighted_regex_tags: + OUTPUT_WEIGHTLIST_FILE.write(regex+'\n') diff --git a/lt-weight b/lt-weight index 03cad0d..617d84a 100755 --- a/lt-weight +++ b/lt-weight @@ -1,21 +1,68 @@ #!
/bin/sh + +usage="$(basename "$0"): weight a dictionary file using a regex weightlist +USAGE: $(basename "$0") [-h] input_file output_file weighted_regex +input_file the input compiled dictionary (a finite state transducer) +output_file the weighted dictionary (a finite state transducer) +weighted_regex the weightlist in XEROX regex format + +Options: + -h, --help: show this help +" +while :; do + case $1 in + -h|-\?|--help) + printf "$usage" + exit + ;; + --) + shift + break + ;; + -?*) + printf "WARN: Unknown option (ignored): %s\n" "$1" >&2 + ;; + *) + break + esac + + shift +done + FST=$1 -CORPUS=$2 -OUTPUT_FST=$3 +OUTPUT_FST=$2 +WEIGHTED_REGEXP=$3 -# Temporary directory for intermediate files -TEMP_DIR=".tmp" -# Check if it exists -if ! [ -d "$TEMP_DIR" ]; then - mkdir $TEMP_DIR +no_of_missing_args=0 +if [ ! -f "$FST" ] +then + printf "ERROR: input_file \"%s\" doesn't exist\n" "$FST" >&2 + no_of_missing_args=$((no_of_missing_args + 1)) +fi + +if [ -z "$OUTPUT_FST" ] +then + printf "ERROR: output_file isn't set\n" >&2 + no_of_missing_args=$((no_of_missing_args + 1)) fi -CLEANED_CORPUS="$TEMP_DIR/clean-corpus.tagged" +if [ ! 
-f "$WEIGHTED_REGEXP" ] +then + printf "ERROR: weighted_regex \"%s\" doesn't exist\n" "$WEIGHTED_REGEXP">&2 + no_of_missing_args=$((no_of_missing_args + 1)) +fi + +if [ $no_of_missing_args -gt 0 ] +then + printf "$usage" + exit +fi +# Temporary directory for intermediate files +TEMP_DIR=$(mktemp -d) ATTFST="$TEMP_DIR/transducer.att" HFST_FST="$TEMP_DIR/transducer.hfst" -WEIGHTED_REGEXP="$TEMP_DIR/weighted-regex" WEIGHTED_FST="$TEMP_DIR/weighted-pairs.hfst" COMPOSED_FST="$TEMP_DIR/weighted-transducer.hfst" SUBTRACTED_FST="$TEMP_DIR/subtracted-transducer.hfst" @@ -25,31 +72,22 @@ MINIMIZED_FST="$TEMP_DIR/minimized-weighted-transducer.hfst" MINIMIZED_ATTFST="$TEMP_DIR/weighted-transducer.att" # Convert the input FST to HFST -lt-print "$FST" | sed -e 's/:/\\:/' -e :a -e 's/ /@_SPACE_@/;ta'> $ATTFST - -hfst-txt2fst --epsilon=ε -i $ATTFST -o $HFST_FST - -# Clean the input tagged corpus -# REMOVE EMPTY LINES -sed -e '/^$/d' "$CORPUS" > $CLEANED_CORPUS +lt-print "$FST" | sed -e "s/:/\\:/" -e :a -e "s/ /@_SPACE_@/;ta"> "$ATTFST" +hfst-txt2fst --epsilon=ε -i "$ATTFST" -o "$HFST_FST" # Generate a weighted FST from the string pairs -LINES=$(wc -l $CLEANED_CORPUS | cut -d ' ' -f1) - -sed -e 's/[ \t]//' -e 's/\^.*\///' -e 's/\$$//' $CLEANED_CORPUS > $WEIGHTED_REGEXP -python prepare_regex_strings.py $WEIGHTED_REGEXP -hfst-regexp2fst -j -i $WEIGHTED_REGEXP -o $WEIGHTED_FST +hfst-regexp2fst -j -i "$WEIGHTED_REGEXP" -o "$WEIGHTED_FST" # Compose the input FST and the weighted FST -hfst-compose -1 $HFST_FST -2 $WEIGHTED_FST -v -o $COMPOSED_FST -hfst-subtract $HFST_FST $COMPOSED_FST -o $SUBTRACTED_FST -hfst-reweight -i $SUBTRACTED_FST -o $DEFAULT_WEIGHTED_FST -e -a 1000000 -hfst-disjunct $DEFAULT_WEIGHTED_FST $COMPOSED_FST -o $DISJUNCTED_FST -hfst-minimize $DISJUNCTED_FST -o $MINIMIZED_FST -hfst-fst2txt -i $MINIMIZED_FST -o $MINIMIZED_ATTFST +hfst-compose -1 "$HFST_FST" -2 "$WEIGHTED_FST" -v -o "$COMPOSED_FST" +hfst-subtract "$HFST_FST" "$COMPOSED_FST" -o "$SUBTRACTED_FST" 
+hfst-reweight -i "$SUBTRACTED_FST" -o "$DEFAULT_WEIGHTED_FST" -e -a 1000000 +hfst-disjunct "$DEFAULT_WEIGHTED_FST" "$COMPOSED_FST" -o "$DISJUNCTED_FST" +hfst-minimize "$DISJUNCTED_FST" -o "$MINIMIZED_FST" +hfst-fst2txt -i "$MINIMIZED_FST" -o "$MINIMIZED_ATTFST" # Compile the FST back using lttoolbox -lt-comp lr $MINIMIZED_ATTFST $OUTPUT_FST +../lttoolbox/lt-comp lr "$MINIMIZED_ATTFST" "$OUTPUT_FST" # Delete the temporary files rm -rf "$TEMP_DIR" diff --git a/prepare_regex_strings.py b/prepare_regex_strings.py deleted file mode 100644 index 2f9300a..0000000 --- a/prepare_regex_strings.py +++ /dev/null @@ -1,40 +0,0 @@ -import re -import sys -import numpy as np -import pandas as pd - -#TODO: HANDLE THE REST OF THE SPECIAL CHARACTERS -special_regex_chars = '%,.;!#-—+*:0?[]()~"\'' - -def clean_tag_patterns(reg): - whitesace_free_reg = re.sub(' ', '', reg) - return '%{}%>'.format(whitesace_free_reg[:-1]) - -def clean_line(line): - line = line.strip() - if line.endswith('$"'): - # ERROR LINE LIKE ^./.$" - line = line[:-2] - line = re.sub(r'(.)', r'\1 ', line) - - for special_char in special_regex_chars: - line = re.sub('\\{}'.format(special_char), '%{}'.format(special_char), line) - - line = re.sub(r'(\<.*?\>)', lambda m: - clean_tag_patterns(m.group(0)),line) - # HANDLE TAGS - line = line.strip() - line = '[{}]'.format(line) - # line = '[?*][{}][?*]'.format(line) - return line - -if __name__ == '__main__': - FILE_NAME = sys.argv[1] - with open(FILE_NAME, 'r') as f: - lines =[clean_line(line) for line in f.readlines() if not line.startswith('*')] - - lines = list(pd.Series(lines).value_counts().reset_index().apply(lambda r: '{}::{}'.format(r['index'].strip(), -np.log(r[0]/len(lines))), axis=1)) - - with open(FILE_NAME, 'w') as f: - for line in lines: - f.write(line+'\n') \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..2f06866 --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +#TODO: 
Use streamparser? +import re +import sys + +def extract_analysis(tagged_line): + return re.sub(r'[\t ]|^(\^.*\/)|(\$)$', '', tagged_line) + +def generate_regex(analysis): + # Add a space after each token "REGEX concatenation" + analysis = ' '.join(analysis) + + #TODO: HANDLE THE REST OF THE SPECIAL CHARACTERS + SPECIAL_REGEX_CHARACTERS = '%,.;!#-—+=@&*_:0?[]()~"\'^$°’\\' + # Escape special characters + for special_char in SPECIAL_REGEX_CHARACTERS: + if special_char == '\\': + analysis = re.sub(r'\\', r'%\\', analysis) + else: + analysis = re.sub(r'\{}'.format(special_char), '%{}'.format(special_char), analysis) + + # Fix the multichar tags: + # - Remove intermediate spaces + # - Prepend < and > with % + analysis = re.sub(r'(\<.*?\>)', lambda multichar_tag: + '%<{}%>'.format((re.sub(' ', '', multichar_tag.group(0)[1:-1]))), analysis) + + # Surround regex with [] + analysis = '[{}]'.format(analysis) + + # TODO: Transform the regex into [?*][REGEX][?*] + # This may be too slow to be feasible + + return analysis