commit c02dc001cc646eb130b2fba67d0185bfa2886b8a Author: Amr Keleg Date: Mon Jun 17 14:06:51 2019 +0200 Separate the weight estimation part from the weighting script Create a script for generating a weightlist in a supervised way. On the other hand, update lt-weight such that it gets a weightlist as an input. diff --git a/annotated-corpus-to-weightlist b/annotated-corpus-to-weightlist new file mode 100755 index 0000000..6ce905b --- /dev/null +++ b/annotated-corpus-to-weightlist @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +import math +import argparse +from collections import Counter +from utils.utils import extract_analysis, generate_regex + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='generate a regex weightlist given an annotated corpus') + parser.add_argument('tagged_corpus', + type=argparse.FileType('r'), + help='input tagged corpus') + parser.add_argument('output_weighlist', + type=argparse.FileType('w'), + help='output weightlist') + args = parser.parse_args() + TAGGED_CORPUS = args.tagged_corpus + OUTPUT_WEIGHTLIST_FILE = args.output_weighlist + + analyses = [extract_analysis(line.strip()) for line in TAGGED_CORPUS.readlines()] + + regex_tags = [generate_regex(analysis) for analysis in analyses if not analysis.startswith('*')] + weighted_regex_tags = ['{}::{}'.format(regex, -math.log(count/len(regex_tags))) + for regex, count in Counter(regex_tags).most_common()] + + for regex in weighted_regex_tags: + OUTPUT_WEIGHTLIST_FILE.write(regex+'\n') diff --git a/lt-weight b/lt-weight index 03cad0d..617d84a 100755 --- a/lt-weight +++ b/lt-weight @@ -1,21 +1,68 @@ #!
/bin/sh + +usage="$(basename "$0"): weight a dictionary file using a regex weightlist +USAGE: $(basename "$0") [-h] input_file output_file weighted_regex +input_file the input compiled dictionary (a finite state transducer) +output_file the weighted dictionary (a finite state transducer) +weighted_regex the weightlist in XEROX regex format + +Options: + -h, --help: show this help +" +while :; do + case $1 in + -h|-\?|--help) + printf "$usage" + exit + ;; + --) + shift + break + ;; + -?*) + printf "WARN: Unknown option (ignored): %s\n" "$1" >&2 + ;; + *) + break + esac + + shift +done + FST=$1 -CORPUS=$2 -OUTPUT_FST=$3 +OUTPUT_FST=$2 +WEIGHTED_REGEXP=$3 -# Temporary directory for intermediate files -TEMP_DIR=".tmp" -# Check if it exists -if ! [ -d "$TEMP_DIR" ]; then - mkdir $TEMP_DIR +no_of_missing_args=0 +if [ ! -f "$FST" ] +then + printf "ERROR: input_file \"%s\" doesn't exist\n" "$FST" >&2 + no_of_missing_args=$((no_of_missing_args + 1)) +fi + +if [ -z "$OUTPUT_FST" ] +then + printf "ERROR: output_file isn't set\n" >&2 + no_of_missing_args=$((no_of_missing_args + 1)) fi -CLEANED_CORPUS="$TEMP_DIR/clean-corpus.tagged" +if [ ! 
-f "$WEIGHTED_REGEXP" ] +then + printf "ERROR: weighted_regex \"%s\" doesn't exist\n" "$WEIGHTED_REGEXP">&2 + no_of_missing_args=$((no_of_missing_args + 1)) +fi + +if [ $no_of_missing_args -gt 0 ] +then + printf "$usage" + exit +fi +# Temporary directory for intermediate files +TEMP_DIR=$(mktemp -d) ATTFST="$TEMP_DIR/transducer.att" HFST_FST="$TEMP_DIR/transducer.hfst" -WEIGHTED_REGEXP="$TEMP_DIR/weighted-regex" WEIGHTED_FST="$TEMP_DIR/weighted-pairs.hfst" COMPOSED_FST="$TEMP_DIR/weighted-transducer.hfst" SUBTRACTED_FST="$TEMP_DIR/subtracted-transducer.hfst" @@ -25,31 +72,22 @@ MINIMIZED_FST="$TEMP_DIR/minimized-weighted-transducer.hfst" MINIMIZED_ATTFST="$TEMP_DIR/weighted-transducer.att" # Convert the input FST to HFST -lt-print "$FST" | sed -e 's/:/\\:/' -e :a -e 's/ /@_SPACE_@/;ta'> $ATTFST - -hfst-txt2fst --epsilon=ε -i $ATTFST -o $HFST_FST - -# Clean the input tagged corpus -# REMOVE EMPTY LINES -sed -e '/^$/d' "$CORPUS" > $CLEANED_CORPUS +lt-print "$FST" | sed -e "s/:/\\:/" -e :a -e "s/ /@_SPACE_@/;ta"> "$ATTFST" +hfst-txt2fst --epsilon=ε -i "$ATTFST" -o "$HFST_FST" # Generate a weighted FST from the string pairs -LINES=$(wc -l $CLEANED_CORPUS | cut -d ' ' -f1) - -sed -e 's/[ \t]//' -e 's/\^.*\///' -e 's/\$$//' $CLEANED_CORPUS > $WEIGHTED_REGEXP -python prepare_regex_strings.py $WEIGHTED_REGEXP -hfst-regexp2fst -j -i $WEIGHTED_REGEXP -o $WEIGHTED_FST +hfst-regexp2fst -j -i "$WEIGHTED_REGEXP" -o "$WEIGHTED_FST" # Compose the input FST and the weighted FST -hfst-compose -1 $HFST_FST -2 $WEIGHTED_FST -v -o $COMPOSED_FST -hfst-subtract $HFST_FST $COMPOSED_FST -o $SUBTRACTED_FST -hfst-reweight -i $SUBTRACTED_FST -o $DEFAULT_WEIGHTED_FST -e -a 1000000 -hfst-disjunct $DEFAULT_WEIGHTED_FST $COMPOSED_FST -o $DISJUNCTED_FST -hfst-minimize $DISJUNCTED_FST -o $MINIMIZED_FST -hfst-fst2txt -i $MINIMIZED_FST -o $MINIMIZED_ATTFST +hfst-compose -1 "$HFST_FST" -2 "$WEIGHTED_FST" -v -o "$COMPOSED_FST" +hfst-subtract "$HFST_FST" "$COMPOSED_FST" -o "$SUBTRACTED_FST" 
+hfst-reweight -i "$SUBTRACTED_FST" -o "$DEFAULT_WEIGHTED_FST" -e -a 1000000 +hfst-disjunct "$DEFAULT_WEIGHTED_FST" "$COMPOSED_FST" -o "$DISJUNCTED_FST" +hfst-minimize "$DISJUNCTED_FST" -o "$MINIMIZED_FST" +hfst-fst2txt -i "$MINIMIZED_FST" -o "$MINIMIZED_ATTFST" # Compile the FST back using lttoolbox -lt-comp lr $MINIMIZED_ATTFST $OUTPUT_FST +../lttoolbox/lt-comp lr "$MINIMIZED_ATTFST" "$OUTPUT_FST" # Delete the temporary files rm -rf "$TEMP_DIR" diff --git a/prepare_regex_strings.py b/prepare_regex_strings.py deleted file mode 100644 index 2f9300a..0000000 --- a/prepare_regex_strings.py +++ /dev/null @@ -1,40 +0,0 @@ -import re -import sys -import numpy as np -import pandas as pd - -#TODO: HANDLE THE REST OF THE SPECIAL CHARACTERS -special_regex_chars = '%,.;!#-—+*:0?[]()~"\'' - -def clean_tag_patterns(reg): - whitesace_free_reg = re.sub(' ', '', reg) - return '%{}%>'.format(whitesace_free_reg[:-1]) - -def clean_line(line): - line = line.strip() - if line.endswith('$"'): - # ERROR LINE LIKE ^./.$" - line = line[:-2] - line = re.sub(r'(.)', r'\1 ', line) - - for special_char in special_regex_chars: - line = re.sub('\\{}'.format(special_char), '%{}'.format(special_char), line) - - line = re.sub(r'(\<.*?\>)', lambda m: - clean_tag_patterns(m.group(0)),line) - # HANDLE TAGS - line = line.strip() - line = '[{}]'.format(line) - # line = '[?*][{}][?*]'.format(line) - return line - -if __name__ == '__main__': - FILE_NAME = sys.argv[1] - with open(FILE_NAME, 'r') as f: - lines =[clean_line(line) for line in f.readlines() if not line.startswith('*')] - - lines = list(pd.Series(lines).value_counts().reset_index().apply(lambda r: '{}::{}'.format(r['index'].strip(), -np.log(r[0]/len(lines))), axis=1)) - - with open(FILE_NAME, 'w') as f: - for line in lines: - f.write(line+'\n') \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..2f06866 --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +#TODO: 
Use streamparser? +import re +import sys + +def extract_analysis(tagged_line): + return re.sub(r'[\t ]|^(\^.*\/)|(\$)$', '', tagged_line) + +def generate_regex(analysis): + # Add a space after each token "REGEX concatenation" + analysis = ' '.join(analysis) + + #TODO: HANDLE THE REST OF THE SPECIAL CHARACTERS + SPECIAL_REGEX_CHARACTERS = '%,.;!#-—+=@&*_:0?[]()~"\'^$°’\\' + # Escape special characters + for special_char in SPECIAL_REGEX_CHARACTERS: + if special_char == '\\': + analysis = re.sub(r'\\', r'%\\', analysis) + else: + analysis = re.sub(r'\{}'.format(special_char), '%{}'.format(special_char), analysis) + + # Fix the multichar tags: + # - Remove intermediate spaces + # - Prepend < and > with % + analysis = re.sub(r'(\<.*?\>)', lambda multichar_tag: + '%<{}%>'.format((re.sub(' ', '', multichar_tag.group(0)[1:-1]))), analysis) + + # Surround regex with [] + analysis = '[{}]'.format(analysis) + + # TODO: Transform the regex into [?*][REGEX][?*] + # This may be too slow to be feasible + + return analysis