commit 58f39eaba2439c692edccbeaf077ecde2c8de81c Author: Amr Keleg Date: Wed Jun 19 16:56:06 2019 +0200 Implement the first unsupervised method of weighting automata In order to weight automata given an untagged corpus: - Tag the whole corpus using the unweighted automata - Use a constraint grammar to filter the analyses - Estimate weights from the filtered analyses diff --git a/scripts/unannotated-corpus-to-weightlist b/scripts/unannotated-corpus-to-weightlist new file mode 100755 index 0000000..819cf8d --- /dev/null +++ b/scripts/unannotated-corpus-to-weightlist @@ -0,0 +1,81 @@ +#! /bin/sh + +usage="$(basename "$0"): generate a regex weightlist given an un-annotated corpus +USAGE: $(basename "$0") [-h] input_file corpus constraint_grammar_file output_weighted_regex +input_file the input compiled dictionary (a finite state transducer) +corpus the large un-annotated corpus +constraint_grammar_file the constraint grammar to filter the analyses +output_weighted_regex the output weightlist in XEROX regex format + +Options: + -h, --help: show this help +" +while :; do + case $1 in + -h|-\?|--help) + printf "$usage" + exit + ;; + --) + shift + break + ;; + -?*) + printf "WARN: Unknown option (ignored): %s\n" "$1" >&2 + ;; + *) + break + esac + + shift +done + +FST=$1 +UNTAGGED_CORPUS=$2 +CONSTRAINT_GRAMMAR=$3 +OUTPUT_WEIGHTLIST_FILE_NAME=$4 +no_of_missing_args=0 + +if [ ! -f "$FST" ] +then + printf "ERROR: input_file \"%s\" doesn't exist\n" "$FST" >&2 + no_of_missing_args=$((no_of_missing_args + 1)) +fi + +if [ ! -f "$UNTAGGED_CORPUS" ] +then + printf "ERROR: corpus \"%s\" doesn't exist\n" "$UNTAGGED_CORPUS" >&2 + no_of_missing_args=$((no_of_missing_args + 1)) +fi + +if [ ! -f "$CONSTRAINT_GRAMMAR" ] +then + printf "ERROR: constraint_grammar_file \"%s\" doesn't exist\n" "$CONSTRAINT_GRAMMAR" >&2 + no_of_missing_args=$((no_of_missing_args + 1)) +fi + +if [ -z "$OUTPUT_WEIGHTLIST_FILE_NAME" ] +then + printf "ERROR: output_file isn't set\n" >&2 + no_of_missing_args=$((no_of_missing_args + 1)) +fi +if [ $no_of_missing_args -gt 0 ] +then + printf "$usage" + exit +fi + +TEMP_DIR=$(mktemp -d) +TEMP_CORPUS_FILE="$TEMP_DIR/tagged_corpus" + +apertium-destxt $UNTAGGED_CORPUS | + ../lttoolbox/lt-proc $FST | + cg-proc $CONSTRAINT_GRAMMAR | + apertium-cleanstream -n | + sed -e 's/^\^//' -e 's/\$$//'| + awk '{n=split($0,array,"/"); for(i=2;i<=n;i++) print "^"array[1]"/" array[i]"$"}' > $TEMP_CORPUS_FILE + +# Estimate the unigram-based weightlist using the filtered analyses +./annotated-corpus-to-weightlist $TEMP_CORPUS_FILE $OUTPUT_WEIGHTLIST_FILE_NAME + +rm -rf $TEMP_DIR