# commit 6f16fe2d2e21d120a86003d6f503884b8ea2f5c3
# Author: Amr Keleg
# Date:   Tue May 21 17:59:05 2019 +0200
#
# lt-weight: Update the script to handle non-trivial corpora/transducers
#  * Use weighted regex instead of weighted string pairs
#  * Use fst subtraction to handle analyses that don't have weights
#  * Add a large weight to unweighted final states instead of zero weight

# ===== scripts/lt-weight (post-commit content, reconstructed and fixed) =====
# Usage: lt-weight <lttoolbox-fst> <tagged-corpus> <output-fst>
FST=$1
CORPUS=$2
OUTPUT_FST=$3

# Temporary directory for intermediate files.
TEMP_DIR=".tmp"
mkdir -p "$TEMP_DIR"        # -p: no error if the directory already exists

CLEANED_CORPUS="$TEMP_DIR/clean-corpus.tagged"

ATTFST="$TEMP_DIR/transducer.att"
HFST_FST="$TEMP_DIR/transducer.hfst"

WEIGHTED_REGEXP="$TEMP_DIR/weighted-regex"
WEIGHTED_FST="$TEMP_DIR/weighted-pairs.hfst"
COMPOSED_FST="$TEMP_DIR/weighted-transducer.hfst"
SUBTRACTED_FST="$TEMP_DIR/subtracted-transducer.hfst"
DEFAULT_WEIGHTED_FST="$TEMP_DIR/default-weighted-transducer.hfst"
DISJUNCTED_FST="$TEMP_DIR/disjuncted-weighted-transducer.hfst"
DISJUNCTED_ATTFST="$TEMP_DIR/weighted-transducer.att"

# Convert the input FST to HFST.  ':' is escaped and spaces are replaced by
# @_SPACE_@ so that hfst-txt2fst parses the ATT dump correctly.
lt-print "$FST" | sed -e 's/:/\\:/' -e :a -e 's/ /@_SPACE_@/;ta' > "$ATTFST"
hfst-txt2fst --epsilon=ε -i "$ATTFST" -o "$HFST_FST"

# Clean the input tagged corpus: remove empty lines.
sed -e '/^$/d' "$CORPUS" > "$CLEANED_CORPUS"

# Strip whitespace, the surface form ('^surface/') and the trailing '$' so
# that only analyses remain, then let the helper turn each analysis into a
# weighted hfst regular expression.
# (The helper is looked up next to this script, so the script no longer has
# to be run from inside scripts/.)
sed -e 's/[ \t]//' -e 's/\^.*\///' -e 's/\$$//' "$CLEANED_CORPUS" > "$WEIGHTED_REGEXP"
python "$(dirname "$0")/prepare_regex_strings.py" "$WEIGHTED_REGEXP"
hfst-regexp2fst -j -i "$WEIGHTED_REGEXP" -o "$WEIGHTED_FST"

# Compose the input FST with the corpus weights.  Analyses absent from the
# corpus (obtained by subtraction) get a large default weight instead of
# no weight at all, then both parts are disjuncted back together.
hfst-compose -1 "$HFST_FST" -2 "$WEIGHTED_FST" -o "$COMPOSED_FST"
hfst-subtract "$HFST_FST" "$COMPOSED_FST" -o "$SUBTRACTED_FST"
hfst-reweight -i "$SUBTRACTED_FST" -o "$DEFAULT_WEIGHTED_FST" -e -a 1000000
hfst-disjunct "$DEFAULT_WEIGHTED_FST" "$COMPOSED_FST" -o "$DISJUNCTED_FST"

# BUG FIX: $DISJUNCTED_ATTFST was referenced below but never created --
# lt-comp needs the textual ATT dump, so convert the binary FST first.
hfst-fst2txt -i "$DISJUNCTED_FST" -o "$DISJUNCTED_ATTFST"

# Compile the FST back using lttoolbox.
lt-comp lr "$DISJUNCTED_ATTFST" "$OUTPUT_FST"

# Delete the temporary files.
rm -rf "$TEMP_DIR"

# ===== scripts/prepare_regex_strings.py (new file; module head) =============
import re
import sys

import numpy as np
import pandas as pd

# TODO: handle the rest of the special characters.
# NOTE: '%' must stay FIRST in this string: it is the escape character
# itself, so existing '%'s have to be doubled before any other character
# gets prefixed with '%'.
special_regex_chars = '%,.;!#-—+*:0?[]()~"\''


def clean_tag_patterns(reg):
    """Collapse a spaced-out tag match like '< t a g >' into the single
    multichar symbol '%<tag%>' (with '<' and '>' escaped) so that
    hfst-regexp2fst treats the tag as one token."""
    whitespace_free_reg = re.sub(' ', '', reg)
    # Drop the trailing '>' and re-append it '%'-escaped.
    return '%{}%>'.format(whitespace_free_reg[:-1])


def clean_line(line):
    """Turn one analysis line into a bracketed hfst regexp token sequence."""
    line = line.strip()
    if line.endswith('$"'):
        # Error lines look like ^./.$" -- drop the trailing '$"'.
        line = line[:-2]
    # Separate every character with a space: hfst regexps are token lists.
    line = re.sub(r'(.)', r'\1 ', line)

    # Escape regex metacharacters with '%'.  BUG FIX: the previous code
    # built patterns with re.sub('\\{}'.format(c), ...), and for c == '0'
    # the pattern '\0' is an octal escape for NUL, so the digit '0'
    # (epsilon in xerox regexps) was never escaped.  Plain str.replace has
    # no such pitfall and needs no escaping at all.
    for special_char in special_regex_chars:
        line = line.replace(special_char, '%{}'.format(special_char))

    # Re-join '< t a g >' sequences into single multichar tag symbols.
    line = re.sub(r'(\<.*?\>)', lambda m: clean_tag_patterns(m.group(0)), line)
    line = line.strip()
    return '[{}]'.format(line)
+ +if __name__ == '__main__': + FILE_NAME = sys.argv[1] + with open(FILE_NAME, 'r') as f: + lines =[clean_line(line) for line in f.readlines() if not line.startswith('*')] + + lines = list(pd.Series(lines).value_counts().reset_index().apply(lambda r: '{}::{}'.format(r['index'].strip(), -np.log(r[0]/len(lines))), axis=1)) + + with open(FILE_NAME, 'w') as f: + for line in lines: + f.write(line+'\n') \ No newline at end of file