# commit 6f16fe2d2e21d120a86003d6f503884b8ea2f5c3
# Author: Amr Keleg
# Date:   Tue May 21 17:59:05 2019 +0200
#
# lt-weight: Update the script to handle non-trivial corpora/transducers
#  * Use weighted regex instead of weighted string pairs
#  * Use fst subtraction to handle analyses that don't have weights
#  * Add a large weight to unweighted final states instead of zero weight

# ===== scripts/lt-weight (post-commit content, reconstructed and fixed) =====
# Usage: lt-weight <lttoolbox-fst> <tagged-corpus> <output-fst>
FST=$1
CORPUS=$2
OUTPUT_FST=$3

# Temporary directory for intermediate files.
TEMP_DIR=".tmp"
mkdir -p "$TEMP_DIR"        # -p: no error if the directory already exists

CLEANED_CORPUS="$TEMP_DIR/clean-corpus.tagged"

ATTFST="$TEMP_DIR/transducer.att"
HFST_FST="$TEMP_DIR/transducer.hfst"

WEIGHTED_REGEXP="$TEMP_DIR/weighted-regex"
WEIGHTED_FST="$TEMP_DIR/weighted-pairs.hfst"
COMPOSED_FST="$TEMP_DIR/weighted-transducer.hfst"
SUBTRACTED_FST="$TEMP_DIR/subtracted-transducer.hfst"
DEFAULT_WEIGHTED_FST="$TEMP_DIR/default-weighted-transducer.hfst"
DISJUNCTED_FST="$TEMP_DIR/disjuncted-weighted-transducer.hfst"
DISJUNCTED_ATTFST="$TEMP_DIR/weighted-transducer.att"

# Convert the input FST to HFST.  ':' is escaped and spaces are replaced by
# @_SPACE_@ so that hfst-txt2fst parses the ATT dump correctly.
lt-print "$FST" | sed -e 's/:/\\:/' -e :a -e 's/ /@_SPACE_@/;ta' > "$ATTFST"
hfst-txt2fst --epsilon=ε -i "$ATTFST" -o "$HFST_FST"

# Clean the input tagged corpus: remove empty lines.
sed -e '/^$/d' "$CORPUS" > "$CLEANED_CORPUS"

# Strip whitespace, the surface form ('^surface/') and the trailing '$' so
# that only analyses remain, then let the helper turn each analysis into a
# weighted hfst regular expression.
# (The helper is looked up next to this script, so the script no longer has
# to be run from inside scripts/.)
sed -e 's/[ \t]//' -e 's/\^.*\///' -e 's/\$$//' "$CLEANED_CORPUS" > "$WEIGHTED_REGEXP"
python "$(dirname "$0")/prepare_regex_strings.py" "$WEIGHTED_REGEXP"
hfst-regexp2fst -j -i "$WEIGHTED_REGEXP" -o "$WEIGHTED_FST"

# Compose the input FST with the corpus weights.  Analyses absent from the
# corpus (obtained by subtraction) get a large default weight instead of
# no weight at all, then both parts are disjuncted back together.
hfst-compose -1 "$HFST_FST" -2 "$WEIGHTED_FST" -o "$COMPOSED_FST"
hfst-subtract "$HFST_FST" "$COMPOSED_FST" -o "$SUBTRACTED_FST"
hfst-reweight -i "$SUBTRACTED_FST" -o "$DEFAULT_WEIGHTED_FST" -e -a 1000000
hfst-disjunct "$DEFAULT_WEIGHTED_FST" "$COMPOSED_FST" -o "$DISJUNCTED_FST"

# BUG FIX: $DISJUNCTED_ATTFST was referenced below but never created --
# lt-comp needs the textual ATT dump, so convert the binary FST first.
hfst-fst2txt -i "$DISJUNCTED_FST" -o "$DISJUNCTED_ATTFST"

# Compile the FST back using lttoolbox.
lt-comp lr "$DISJUNCTED_ATTFST" "$OUTPUT_FST"

# Delete the temporary files.
rm -rf "$TEMP_DIR"

# ===== scripts/prepare_regex_strings.py (new file; module head) =============
import re
import sys

import numpy as np
import pandas as pd

# TODO: handle the rest of the special characters.
# NOTE: '%' must stay FIRST in this string: it is the escape character
# itself, so existing '%'s have to be doubled before any other character
# gets prefixed with '%'.
special_regex_chars = '%,.;!#-—+*:0?[]()~"\''


def clean_tag_patterns(reg):
    """Collapse a spaced-out tag match like '< t a g >' into the single
    multichar symbol '%<tag%>' (with '<' and '>' escaped) so that
    hfst-regexp2fst treats the tag as one token."""
    whitespace_free_reg = re.sub(' ', '', reg)
    # Drop the trailing '>' and re-append it '%'-escaped.
    return '%{}%>'.format(whitespace_free_reg[:-1])


def clean_line(line):
    """Turn one analysis line into a bracketed hfst regexp token sequence."""
    line = line.strip()
    if line.endswith('$"'):
        # Error lines look like ^./.$" -- drop the trailing '$"'.
        line = line[:-2]
    # Separate every character with a space: hfst regexps are token lists.
    line = re.sub(r'(.)', r'\1 ', line)

    # Escape regex metacharacters with '%'.  BUG FIX: the previous code
    # built patterns with re.sub('\\{}'.format(c), ...), and for c == '0'
    # the pattern '\0' is an octal escape for NUL, so the digit '0'
    # (epsilon in xerox regexps) was never escaped.  Plain str.replace has
    # no such pitfall and needs no escaping at all.
    for special_char in special_regex_chars:
        line = line.replace(special_char, '%{}'.format(special_char))

    # Re-join '< t a g >' sequences into single multichar tag symbols.
    line = re.sub(r'(\<.*?\>)', lambda m: clean_tag_patterns(m.group(0)), line)
    line = line.strip()
    return '[{}]'.format(line)
+ +if __name__ == '__main__': + FILE_NAME = sys.argv[1] + with open(FILE_NAME, 'r') as f: + lines =[clean_line(line) for line in f.readlines() if not line.startswith('*')] + + lines = list(pd.Series(lines).value_counts().reset_index().apply(lambda r: '{}::{}'.format(r['index'].strip(), -np.log(r[0]/len(lines))), axis=1)) + + with open(FILE_NAME, 'w') as f: + for line in lines: + f.write(line+'\n') \ No newline at end of file