commit ea52b1f0b5f49bdd144ec3313f6eaf04b6445a74 Author: hectoralos Date: Sat Aug 3 22:44:16 2019 +0300 testvoc marcriera diff --git a/dev/testvoc/inconsistency.sh b/dev/testvoc/inconsistency.sh index 2c22d19..68c83d7 100755 --- a/dev/testvoc/inconsistency.sh +++ b/dev/testvoc/inconsistency.sh @@ -1,23 +1,116 @@ -TMPDIR=/tmp +#!/bin/bash -if [[ $1 = "cat-por" ]]; then +if [[ $# -eq 2 ]]; then + MODE=$1 + MONODIX=$2 +elif [[ $# -eq 3 ]]; then + MODE=$2 + MONODIX=$3 +else + cat >&2 < + + is the pair direction to be tested. + is the path to the source language monolingual dictionary. Set to "auto" for autodetection. + +Use the -e flag to skip testing on enclitics, which slow down the process. + +Example: $0 -e eng-cat ../../../apertium-eng/apertium-eng.eng.dix + +EOF + exit 1 +fi + +while getopts "e" opt; do + case $opt in + e) + ENCLITICS=true + ;; + esac +done -lt-expand $2 | grep -v REGEX | grep -v '' | sed 's/:>:/%/g' | grep -v ':<:' | sed 's/:/%/g' | cut -f2 -d'%' | sed 's/^/^/g' | sed 's/$/$ ^.$/g' | tee $TMPDIR/tmp_testvoc1.txt |\ - apertium-pretransfer|\ - lt-proc -b ../../cat-por.autobil.bin |\ - apertium-transfer -b ../../apertium-por-cat.cat-por.t1x ../../cat-por.t1x.bin | tee $TMPDIR/tmp_testvoc2.txt |\ - lt-proc -d ../../cat-por.autogen.bin > $TMPDIR/tmp_testvoc3.txt -paste -d _ $TMPDIR/tmp_testvoc1.txt $TMPDIR/tmp_testvoc2.txt $TMPDIR/tmp_testvoc3.txt | sed 's/\^.\$//g' | sed 's/_/ ---------> /g' | grep -v '\^@' +expand_poly () { + sed 's/>\//>\/\//g' | sed 's/\/\//\/~\//g' > $POLY1 + while grep -q "//" $POLY1; do + cat $POLY1 | + awk '# This program expands polysemic entries into multiple lines + # so each possibility is tested during testvoc. Each time + # it is executed, an entry per line is modified if necessary. -elif [[ $1 = "por-cat" ]]; then + BEGIN { FS="\\$ "; OFS = "$ " } + { + if ($2 !="") + { + first = $1; + $1 = ""; + j=split(first, a, "//"); + for (i = 2; i <= j; ++i) print a[1] "/~/" a[i] "$+" substr($0,3,length($0)); + } + else print $0; + }' > $POLY2 + mv $POLY2 $POLY1 + done + cat $POLY1 | sed 's/\/\//\//g' | sed "s|>/~/|>/|g" | sed "s|\$+\^|$ ^|g" +} -lt-expand $2 | grep -v REGEX | grep -v '' | sed 's/:>:/%/g' | grep -v ':<:' | sed 's/:/%/g' | cut -f2 -d'%' | sed 's/^/^/g' | sed 's/$/$ ^.$/g' | tee $TMPDIR/tmp_testvoc1.txt |\ - apertium-pretransfer|\ - lt-proc -b ../../por-cat.autobil.bin |\ - apertium-transfer -b ../../apertium-por-cat.por-cat.t1x ../../por-cat.t1x.bin | tee $TMPDIR/tmp_testvoc2.txt |\ - lt-proc -d ../../por-cat.autogen.bin > $TMPDIR/tmp_testvoc3.txt -paste -d _ $TMPDIR/tmp_testvoc1.txt $TMPDIR/tmp_testvoc2.txt $TMPDIR/tmp_testvoc3.txt | sed 's/\^.\$//g' | sed 's/_/ ---------> /g' | grep -v '\^@' +LANG1=$(sed 's/-.*//' <<< $MODE) +LANG2=$(sed 's/.*-//' <<< $MODE) +SRCDIR=$(grep -m1 "^abs_srcdir =" ../../Makefile | sed "s/^.*= //") +LANG1DIR=$(cd $SRCDIR; cd $(grep -m1 "^AP_SRC.*apertium-${LANG1}" Makefile | sed "s/^.*= //") && pwd) +MODE=$SRCDIR"/modes/$MODE.mode" +if ! [[ -e $MODE ]]; then + echo "Mode file ($MODE) not found." + exit 1 else - echo "sh inconsistency.sh "; + if ! [[ $(grep 'apertium-pretransfer' $MODE) ]]; then + echo "Mode file ($MODE) does not seem to contain a valid Apertium pipeline." + exit 1 + else + PIPELINE_ALL=$(grep -m1 'apertium-pretransfer' $MODE |\ + sed 's/.*apertium-pretransfer/apertium-pretransfer/' |\ + sed "s%\ lrx-proc[^|]*|%%" |\ + sed "s%\ lt-proc \$1%\ lt-proc -d%") + PIPELINE_LEX=$(mktemp -t testvoc.XXXXXXXXXXX) + PIPELINE_TFR=$(mktemp -t testvoc.XXXXXXXXXXX) + PIPELINE_GEN=$(mktemp -t testvoc.XXXXXXXXXXX) + echo $PIPELINE_ALL | sed "s%lt-proc -b\([^|]*\)|.*%lt-proc -b\1%" > "$PIPELINE_LEX" + echo $PIPELINE_ALL | sed "s%.*lt-proc -b\([^|]*\)|\(.*\)%\2%" | sed "s/| lt-proc -d.*//" > "$PIPELINE_TFR" + echo $PIPELINE_ALL | sed 's/.*lt-proc -d/lt-proc -d/' > "$PIPELINE_GEN" + TMPFILES+=("$PIPELINE_LEX" "$PIPELINE_TFR" "$PIPELINE_GEN") + fi fi + +if [[ $MONODIX != "auto" ]]; then + if ! [[ -e $MONODIX ]]; then + echo "Monolingual dictionary ($MODE) not found." + exit 1 + fi +else + MONODIX="$LANG1DIR/apertium-$LANG1.$LANG1.dix" + if ! [[ -e $MONODIX ]]; then + echo "Monolingual dictionary ($MONODIX) not found." + exit 1 + fi +fi + +POLY1=$(mktemp .testvoc.XXXXXXXXXXX) # These two are created in the working directory, because they tend to grow very big and may fill /tmp +POLY2=$(mktemp .testvoc.XXXXXXXXXXX) +TMPFILES+=("$POLY1" "$POLY2") + +lt-expand $MONODIX | grep -v 'REGEX' | grep -v ':<:' | # The monodix is expanded, regular expressions and "RL" entries are removed +( [[ $ENCLITICS ]] && grep -v '' || cat ) | # If the -e flag is used, enclitics are removed for faster processing +sed 's/:>:/\'$'\t/g' | sed 's/:/\'$'\t/g' | cut -f2 -d$'\t' | # Surface forms are removed +sed 's/^/^/g' | sed 's/\(.*\)/[\\\1\$]\1/g' | sed 's/$/$ ^.$/g' | # Entries are converted to Apertium pipeline format, preceded by the source form and followed by a full stop +bash "$PIPELINE_LEX" | # Lexical transfer takes place +grep -v '>/@' | # The list of entries is trimmed according to the bidix +expand_poly | # Polysemic entries are expanded into multiple lines +bash "$PIPELINE_TFR" | # Structural transfer takes place +bash "$PIPELINE_GEN" | # Target language surface forms are generated +sed 's/^\[\\\(.*\)\$\]/\1\$ _ /g' | sed 's/ \^.\$//g' | sed 's/ \.//g' | sed 's/ _ / ---------> /g' + +for f in "${TMPFILES[@]}"; do + rm -f "$f" +done + +exit 0 diff --git a/dev/testvoc/testvoc.conf b/dev/testvoc/testvoc.conf new file mode 100755 index 0000000..a2359a2 --- /dev/null +++ b/dev/testvoc/testvoc.conf @@ -0,0 +1,19 @@ +### Testvoc config file + +PairModes=cat-por,por-cat +PairModeNames=Català > Português, Português > Català +PairLangs=cat,por +PairLangNames=Català,Português + +# Summary settings + +POS=adj,adv,cnjadv,cnjcoo,cnjsub,det,ij,rel,n,np,num,pr,preadv,predet,prn,vaux,vbhaver,vblex,vbmod,vbser +Exclude_det=,, +Exclude_preadv=, +Exclude_adv=,, +Exclude_vblex= +Exclude_vaux= +Exclude_pr=,,,,, diff --git a/dev/testvoc/testvoc.sh b/dev/testvoc/testvoc.sh index c773224..5b43986 100755 --- a/dev/testvoc/testvoc.sh +++ b/dev/testvoc/testvoc.sh @@ -1,5 +1,52 @@ -echo "==Catalan->Portuguese=========================="; -bash inconsistency.sh cat-por ../../../apertium-cat/apertium-cat.cat.dix > /tmp/cat-por.testvoc; sh inconsistency-summary.sh /tmp/cat-por.testvoc cat-por -echo "" -echo "==Portuguese->Catalan==========================="; -bash inconsistency.sh por-cat ../../../apertium-por/apertium-por.por.dix > /tmp/por-cat.testvoc; bash inconsistency-summary.sh /tmp/por-cat.testvoc por-cat +#!/bin/bash + +if ! [[ -e testvoc.conf ]]; then + echo "Testvoc configuration file (testvoc.conf) not found." + exit 1 +fi + +while getopts "equ" opt; do + case $opt in + e) + ENCLITICS=true # If the -e flag is used, enclitics are skipped for faster processing + ;; + q) + QUIET=true # If the -q flag is used, no summary is generated + ;; + u) + UNKNOWNS=true # If the -u flag is used, unknown words are checked + ;; + esac +done + +IFS="," +modes=($(grep -m 1 "^PairModes=" testvoc.conf | cut -d = -f 2)) +modenames=($(grep -m 1 "^PairModeNames=" testvoc.conf | cut -d = -f 2)) +langs=($(grep -m 1 "^PairLangs=" testvoc.conf | cut -d = -f 2)) +langnames=($(grep -m 1 "^PairLangNames=" testvoc.conf | cut -d = -f 2)) +unset IFS + +for i in "${!modes[@]}"; do + printf "== %.45s\n" "${modenames[$i]} ============================================" + if [[ $ENCLITICS ]]; then + bash inconsistency.sh -e ${modes[$i]} auto > .testvoc + else + bash inconsistency.sh ${modes[$i]} auto > .testvoc + fi + grep -vP '(?!\\)\/.* ---------> [^#].*\\\/' .testvoc | grep -e ' #' -e '\\\/' > testvoc-errors.${modes[$i]}.txt + + if ! [[ $QUIET ]]; then + bash inconsistency-summary.sh .testvoc ${modes[$i]} + fi + rm .testvoc +done + +if [[ $UNKNOWNS ]]; then + for i in "${!langs[@]}"; do + printf "== %.45s\n" "${langnames[$i]} ============================================" + pushd ../../ > /dev/null; bash dev/testvoc/bidix-unknowns.sh ${langs[$i]} | grep -v ":<:" | grep -v "REGEX" | grep -v "" > dev/testvoc/testvoc-missing.${langs[$i]}.txt; popd > /dev/null; + printf "%s\n" "Missing entries: $(cat testvoc-missing.${langs[$i]}.txt | wc -l)" + done +fi + +exit 0