commit b7a4b3464e2706996631b738d569015ad2f7277f Author: elmurod1202 Date: Thu Aug 20 13:25:59 2020 +0200 TESTVOC: Just a copypasta from uzb-kaa diff --git a/testvoc/lite/README b/testvoc/lite/README new file mode 100644 index 0000000..999f545 --- /dev/null +++ b/testvoc/lite/README @@ -0,0 +1,30 @@ +PROBLEM STATEMENT +================= + +A problem with testvoc is that it takes a long time to run. + +We can come up with a simpler and faster testvoc which is a close approximation +of the full testvoc if we make the following simplifying assumptions: + * monodixes are trimmed; + * there aren't any transfer rules which work selectively on lemmas; + * SL words of a particular POS correspond to words of one particular POS in TL. + +The first assumption holds true, the second and the third are not always the +case, and we will make corresponding tweaks to our 'simple testvoc' process +later. + +If we assume the three things above, then testvocing one word per each paradigm +definition becomes a reasonable substitution for the full testvoc. + +DESCRIPTION OF THE PROCESS +========================== + +Note that in the context of a lexc file, a paradigm definition is a lexicon which is +a direct continuation of stems. For each such lexicon, there is a compressed text file +in apertium-tur/tests/morphotactics, which contains the full paradigm of a word +linked to that lexicon. The name of a file indicates which lexicon it represents, +e.g. A1.txt.gz. + +The job of the testvoc.sh script in this directory is to extract lexical forms +from those text files, and run them through the translator pipeline, as the usual +testvoc script would. \ No newline at end of file diff --git a/testvoc/lite/testvoc.sh b/testvoc/lite/testvoc.sh new file mode 100644 index 0000000..41e6946 --- /dev/null +++ b/testvoc/lite/testvoc.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# A script to run the "lite" ("one-word-per-each-paradigm-") testvoc. +# +# Assumes the pair is compiled. 
+# Extracts lexical units from compressed text files in languages/apertium-uzb/ +# tests/morphotactics/ and languages/apertium-kaa/tests/morphotactics +# and passes them through the translator (=INCONSISTENCY script). +# Produces 'testvoc-summary' files using the INCONSISTENCY_SUMMARY script. +# +# TODO: Generate stats about each file (e.g. N1.txt), not just about the category (e.g. nouns). +# +# Usage: [TMPDIR=/path/to/tmpdir] ./testvoc.sh + +INCONSISTENCY=../standard/./inconsistency.sh +INCONSISTENCY_SUMMARY=../standard/./inconsistency-summary.sh + +if [ -z $TMPDIR ]; then + TMPDIR="/tmp" +fi + +export TMPDIR + +function extract_lexical_units { + sort -u | cut -f2 -d':' | \ + sed 's/^/^/g' | sed 's/$/$ ^.$/g' +} + +#------------------------------------------------------------------------------- +# Uzbek->Karakalpak testvoc +#------------------------------------------------------------------------------- + +PARDEF_FILES=../../../../languages/apertium-uzb/tests/morphotactics/*.txt.gz + +echo "==Uzbek->Karakalpak===========================" + +echo "" > $TMPDIR/uzb-kaa.testvoc + +for file in $PARDEF_FILES; do + zcat $file | extract_lexical_units | + $INCONSISTENCY uzb-kaa >> $TMPDIR/uzb-kaa.testvoc +done + +$INCONSISTENCY_SUMMARY $TMPDIR/uzb-kaa.testvoc uzb-kaa + +#------------------------------------------------------------------------------- +# Karakalpak->Uzbek testvoc +#------------------------------------------------------------------------------- + +# TODO diff --git a/testvoc/standard/inconsistency-summary.sh b/testvoc/standard/inconsistency-summary.sh new file mode 100644 index 0000000..c7ef14c --- /dev/null +++ b/testvoc/standard/inconsistency-summary.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# This script takes the output of the inconsistency.sh script and prints some +# stats about it. +# It is supposed to be invoked by ./testvoc.sh, and not run directly. 
+ +INC="$1" +OUT="testvoc-summary.$2.txt" +POS="n adj v vaux adv cm cnjcoo det guio ij np num post prn cnjsub" + +ECHOE="echo -e" +SED=sed + +if test x$(uname -s) = xDarwin; then + ECHOE="builtin echo" + SED=gsed +fi + + +echo "" > "$OUT"; + +date >> "$OUT" +$ECHOE "===============================================" >> "$OUT" +$ECHOE "POS\tTotal\tClean\tWith @\tWith #\tClean %" >> "$OUT" + +aterrors=$(mktemp -t testvoc.XXXXXXXXXX) +hasherrors=$(mktemp -t testvoc.XXXXXXXXXX) + +<"$INC" sed 's/~#/#/g' | grep -v -e '-> *#' -e REGEX | grep -e '-> *\^@' > "$aterrors" +<"$INC" sed 's/~#/#/g' | grep -v -e '-> *\^@' -e REGEX | grep -e '> *#' > "$hasherrors" + + +for i in $POS; do + if [ "$i" = "det" ]; then + remove-other-pos () { grep -v -e '' -e '' -e '' -e ''; } + elif [ "$i" = "preadv" ]; then + remove-other-pos () { grep -v -e '' -e ''; } + elif [ "$i" = "adj" ]; then + remove-other-pos () { grep -v -e '' -e '' -e '' -e '' -e ''; } + elif [ "$i" = "cnjcoo" ]; then + remove-other-pos () { grep -v -e '' -e '' -e ''; } + elif [ "$i" = "np" ]; then + remove-other-pos () { grep -v -e '' -e '' -e '' -e '' -e '' -e '' -e '' -e '' -e '' -e '' -e '' -e ''; } + elif [ "$i" = "prn" ]; then + remove-other-pos () { grep -v -e '' -e '' -e ''; } + else + remove-other-pos () { cat; } + fi + TOTAL=$(grep -v REGEX "$INC" | remove-other-pos | grep "<$i>" -c ) + AT=$(<"$aterrors" remove-other-pos | grep -c "<$i>" ) + HASH=$(<"$hasherrors" remove-other-pos | grep -c "<$i>" ) + + UNCLEAN=$(calc -p "$AT+$HASH") + CLEAN=$(calc -p "$TOTAL-$UNCLEAN") + PERCLEAN=$(calc -p "$UNCLEAN/$TOTAL*100" | sed 's/~//g' | head -c 5) + echo $PERCLEAN | grep "Err" > /dev/null; + if [ $? 
-eq 0 ]; then + TOTPERCLEAN="100"; + else + TOTPERCLEAN=$(calc -p "100-$PERCLEAN" | sed 's/~//g' | head -c 5) + fi + + $ECHOE "$TOTAL;$i;$CLEAN;$AT;$HASH;$TOTPERCLEAN" +done | sort -gr | awk -F';' '{print $2"\t"$1"\t"$3"\t"$4"\t"$5"\t"$6}' >> "$OUT" + +#rm -f "$aterrors" "$hasherrors" + +$ECHOE "===============================================" >> "$OUT" +cat "$OUT" diff --git a/testvoc/standard/inconsistency.sh b/testvoc/standard/inconsistency.sh new file mode 100644 index 0000000..de1f49c --- /dev/null +++ b/testvoc/standard/inconsistency.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Passes its input -- a list of lexical units -- through the translator +# (transfer modules and target language generator). +# Creates three text files in TMPDIR: +# 1) INPUT, a list of lexical units taken +# 2) TRANSFOUT, this list after passing transfer modules +# 3) GENOUT, this list after TL generator. +# Outputs "paste INPUT TRANSFOUT GENOUT" +# Supposed to be invoked by ./testvoc.sh, and not run directly. + +if [ -z $TMPDIR ]; then + TMPDIR="/tmp" +fi + +INPUT=$TMPDIR/testvoc_input.txt +TRANSFOUT=$TMPDIR/testvoc_transfout.txt +GENOUT=$TMPDIR/testvoc_genout.txt + +DIR=$1 + +if [[ $DIR = "uzb-kaa" ]]; then + + PRETRANSFER="apertium-pretransfer" + LEXTRANSFER="lt-proc -b ../../uzb-kaa.autobil.bin" + LEXSELECTION="lrx-proc -m ../../uzb-kaa.autolex.bin" + TRANSFER="rtx-proc ../../uzb-kaa.rtx.bin" + #TRANSFER_1="apertium-transfer -b ../../apertium-uzb-kaa.uzb-kaa.t1x ../../uzb-kaa.rtx.bin" + #TRANSFER_2="apertium-transfer -n ../../apertium-uzb-kaa.uzb-kaa.t2x ../../uzb-kaa.rlx.bin" + GENERATOR="lt-proc -d ../../uzb-kaa.autogen.bin" + + tee $INPUT | + $PRETRANSFER | $LEXTRANSFER | $LEXSELECTION | + $TRANSFER | tee $TRANSFOUT | + $GENERATOR > $GENOUT + paste -d % $INPUT $TRANSFOUT $GENOUT | + sed 's/\^.\$//g' | sed 's/%/ --> /g' + +else + echo "Usage: ./inconsistency.sh "; +fi diff --git a/testvoc/standard/testvoc.sh b/testvoc/standard/testvoc.sh new file mode 100644 index 0000000..010f51e --- 
/dev/null +++ b/testvoc/standard/testvoc.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# A script to run the standard (=full) testvoc. +# +# Assumes the pair is compiled. +# Expands the source language dictionary/transducer MONODIX and passes the +# result of the expansion through the translator (=inconsistency.sh script). +# Produces 'testvoc-summary' files using the inconsistency-summary.sh. +# +# Usage: [TMPDIR=/path/to/tmpdir] ./testvoc.sh + +if [ -z $TMPDIR ]; then + TMPDIR="/tmp" +fi + +export TMPDIR + +# Testvoc will finish in a reasonable time if we comment out the line +# with numerals regex in bidix: +cd ../../ +sed -i 's_ *\[№.*$__' apertium-uzb-kaa.uzb-kaa.dix +make +cd testvoc/standard/ + +function expand_monodix { + hfst-fst2strings -c1 $MONODIX | sort -u | cut -d':' -f2 | \ + sed 's/^/^/g' | sed 's/$/$ ^.$/g' +} + +#------------------------------------------------------------------------------- +# Uzbek->Karakalpak testvoc +#------------------------------------------------------------------------------- + +MONODIX=../../.deps/uzb-kaa.automorf.trimmed + +echo "==Uzbek->Karakalpak===========================" + +expand_monodix | +bash inconsistency.sh uzb-kaa > $TMPDIR/uzb-kaa.testvoc +bash inconsistency-summary.sh $TMPDIR/uzb-kaa.testvoc uzb-kaa