commit f0024c271e7939ece91c7881f97fd7be1e63ea87
Author: Amr Keleg
Date:   Fri Jul 19 15:47:12 2019 +0200

    Implement the third unsupervised method for weighting automata

    Give a weight of 1 to all the edges of the fst

diff --git a/analysis-length-reweight b/analysis-length-reweight
new file mode 100755
index 0000000..90e46d7
--- /dev/null
+++ b/analysis-length-reweight
@@ -0,0 +1,84 @@
+#! /bin/sh
+
+# Give a weight of 1 to every edge of a compiled dictionary, so that its
+# analyses are weighted according to their length.
+
+usage="$(basename "$0"): weight a dictionary file according to the analysis length
+USAGE: $(basename "$0") [-h] input_file output_file
+input_file the input compiled dictionary (a finite state transducer)
+output_file the weighted dictionary (a finite state transducer)
+
+Options:
+ -h, --help: show this help
+"
+
+# Parse the leading options; stop at "--" or at the first positional argument
+while :; do
+    case $1 in
+        -h|-\?|--help)
+            # Print through '%s' so the text is never read as a format string
+            printf '%s' "$usage"
+            exit
+            ;;
+        --)
+            shift
+            break
+            ;;
+        -?*)
+            printf "WARN: Unknown option (ignored): %s\n" "$1" >&2
+            ;;
+        *)
+            break
+    esac
+
+    shift
+done
+
+FST=$1
+OUTPUT_FST=$2
+
+# Report every invalid argument before giving up
+no_of_missing_args=0
+if [ ! -f "$FST" ]
+then
+    printf "ERROR: input_file \"%s\" doesn't exist\n" "$FST" >&2
+    no_of_missing_args=$((no_of_missing_args + 1))
+fi
+
+if [ -z "$OUTPUT_FST" ]
+then
+    printf "ERROR: output_file isn't set\n" >&2
+    no_of_missing_args=$((no_of_missing_args + 1))
+fi
+
+if [ "$no_of_missing_args" -gt 0 ]
+then
+    printf '%s' "$usage" >&2
+    # Non-zero status so that callers can detect the failure
+    exit 1
+fi
+
+# Stop at the first failing command instead of compiling a broken dictionary
+set -e
+
+# Temporary directory for intermediate files, removed when the script exits
+TEMP_DIR=$(mktemp -d)
+trap 'rm -rf "$TEMP_DIR"' EXIT
+
+ATTFST="$TEMP_DIR/transducer.att"
+HFST_FST="$TEMP_DIR/transducer.hfst"
+WEIGHTED_FST="$TEMP_DIR/weighted_transducer.hfst"
+WEIGHTED_ATTFST="$TEMP_DIR/weighted_transducer.att"
+
+# Convert the input FST to HFST
+# NOTE(review): sed receives "s/:/\:/", which only touches the first colon of
+# each line; confirm that this is the intended escaping
+lt-print "$FST" | sed -e "s/:/\\:/" -e :a -e "s/ /@_SPACE_@/;ta"> "$ATTFST"
+hfst-txt2fst --epsilon=ε -i "$ATTFST" -o "$HFST_FST"
+
+# Give a weight of 1 to all the edges of the FST
+hfst-reweight -i "$HFST_FST" -o "$WEIGHTED_FST" -a 1
+hfst-fst2txt -i "$WEIGHTED_FST" -o "$WEIGHTED_ATTFST"
+
+# Compile the FST back using lttoolbox
+../lttoolbox/lt-comp lr "$WEIGHTED_ATTFST" "$OUTPUT_FST"
diff --git a/eval/analysis_length_fit.py b/eval/analysis_length_fit.py
new file mode 100644
index 0000000..5493029
--- /dev/null
+++ b/eval/analysis_length_fit.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+"""Produce one analysis-length weighted dictionary per fold."""
+
+import os
+import sys
+import argparse
+import tempfile
+import subprocess
+from pathlib import Path
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='fit n models using n folds')
+    parser.add_argument('-i', '--input_directory', required=True,
+                        help='input directory of the n folds')
+    parser.add_argument('-b', '--apertium_bin', required=True,
+                        help='a compiled dictionary')
+    parser.add_argument('-o', '--output_directory', required=True,
+                        help='output directory for weighted dictionaries')
+    args = parser.parse_args()
+    input_directory = args.input_directory
+    apertium_bin = args.apertium_bin
+    output_directory = args.output_directory
+
+    # Also creates missing parents and tolerates an existing directory
+    os.makedirs(output_directory, exist_ok=True)
+
+    # The method is unsupervised: a fold only names its output file, its
+    # content is never read.
+    failed_folds = []
+    for input_file in sorted(os.listdir(input_directory)):
+        # Generate a bin file
+        output_file = Path(output_directory, '{}.bin'.format(input_file))
+        result = subprocess.run(['./analysis-length-reweight',
+                                 apertium_bin,
+                                 str(output_file)])
+        if result.returncode != 0:
+            failed_folds.append(input_file)
+
+    if failed_folds:
+        sys.exit('ERROR: weighting failed for: {}'.format(
+            ', '.join(failed_folds)))