commit afbbbdf207efc62a489bc2cd89a8a4d974627e92 Author: Amr Keleg Date: Sun Aug 11 13:31:26 2019 +0200 Make use of laplace smoothing for the constraint grammar weighting method diff --git a/scripts/eval/constraintgrammar_fit.py b/scripts/eval/constraintgrammar_fit.py index c182900..66a20dd 100644 --- a/scripts/eval/constraintgrammar_fit.py +++ b/scripts/eval/constraintgrammar_fit.py @@ -31,12 +31,14 @@ if __name__ == '__main__': temp_dir = tempfile.mkdtemp() temp_weightlist = Path(temp_dir, 'temp_weightlist') + default_weightlist = Path(temp_dir, 'temp_default_weightlist') subprocess.run(['./unannotated-corpus-to-weightlist', - apertium_bin, corpus, constraint_grammar, temp_weightlist]) + apertium_bin, corpus, constraint_grammar, temp_weightlist, default_weightlist]) for input_file in sorted(os.listdir(input_directory)): # Generate a bin file subprocess.run(['./lt-weight', apertium_bin, Path(output_directory, '{}.bin'.format(input_file)), - temp_weightlist]) + temp_weightlist, + default_weightlist]) diff --git a/scripts/unannotated-corpus-to-weightlist b/scripts/unannotated-corpus-to-weightlist index 819cf8d..2c83440 100755 --- a/scripts/unannotated-corpus-to-weightlist +++ b/scripts/unannotated-corpus-to-weightlist @@ -1,11 +1,12 @@ #! 
/bin/sh usage="$(basename "$0"): generate a regex weightlist given an un-annotated corpus -USAGE: $(basename "$0") [-h] input_file corpus constraint_grammar_file output_weighted_regex +USAGE: $(basename "$0") [-h] input_file corpus constraint_grammar_file output_weighted_regex default_weightlist input_file the input compiled dictionary (a finite state transducer) corpus the large un-annotated corpus constraint_grammar_file the constraint grammar to filter the analyses output_weighted_regex the output weightlist in XEROX regex format +default_weightlist the default fallback weightlist in XEROX regex format Options: -h, --help: show this help @@ -13,7 +14,7 @@ Options: while :; do case $1 in -h|-\?|--help) - printf "$usage" + echo "$usage" exit ;; --) @@ -34,6 +35,7 @@ FST=$1 UNTAGGED_CORPUS=$2 CONSTRAINT_GRAMMAR=$3 OUTPUT_WEIGHTLIST_FILE_NAME=$4 +DEFAULT_WEIGHTLIST_FILE_NAME=$5 no_of_missing_args=0 if [ ! -f "$FST" ] @@ -59,23 +61,30 @@ then printf "ERROR: output_file isn't set\n" >&2 no_of_missing_args=$((no_of_missing_args + 1)) fi + +if [ -z "$DEFAULT_WEIGHTLIST_FILE_NAME" ] +then + printf "ERROR: default_weightlist isn't set\n" >&2 + no_of_missing_args=$((no_of_missing_args + 1)) +fi + if [ $no_of_missing_args -gt 0 ] then - printf "$usage" + echo "$usage" exit fi TEMP_DIR=$(mktemp -d) TEMP_CORPUS_FILE="$TEMP_DIR/tagged_corpus" -apertium-destxt $UNTAGGED_CORPUS | - ../lttoolbox/lt-proc $FST | - cg-proc $CONSTRAINT_GRAMMAR | +apertium-destxt "$UNTAGGED_CORPUS" | + lt-proc "$FST" | + cg-proc "$CONSTRAINT_GRAMMAR" | apertium-cleanstream -n | sed -e 's/^\^//' -e 's/\$$//'| - awk '{n=split($0,array,"/"); for(i=2;i<=n;i++) print "^"array[1]"/" array[i]"$"}' > $TEMP_CORPUS_FILE + awk '{n=split($0,array,"/"); for(i=2;i<=n;i++) print "^"array[1]"/" array[i]"$"}' > "$TEMP_CORPUS_FILE" # Estimate the unigram-based weightlist using the filtered analyses -./annotated-corpus-to-weightlist $TEMP_CORPUS_FILE $OUTPUT_WEIGHTLIST_FILE_NAME +./annotated-corpus-to-weightlist 
"$TEMP_CORPUS_FILE" "$OUTPUT_WEIGHTLIST_FILE_NAME" --default_weightlist "$DEFAULT_WEIGHTLIST_FILE_NAME" -rm -rf $TEMP_DIR +rm -rf "$TEMP_DIR"