commit 4fc69b10097c6148e0c9acd6aafbd81f1dfd3627 Author: hectoralos Date: Sat Jun 29 21:09:23 2019 +0300 noves versions agafades d'eng-cat diff --git a/dev/testvoc/inconsistency.sh b/dev/testvoc/inconsistency.sh index 306a0bb..68c83d7 100755 --- a/dev/testvoc/inconsistency.sh +++ b/dev/testvoc/inconsistency.sh @@ -1,29 +1,116 @@ -TMPDIR=/tmp - -if [[ $1 = "ita-cat" ]]; then - -lt-expand $2 | grep -v REGEX | grep -v '' | sed 's/:>:/%/g' | sed 's/:<:/%/g' | sed 's/:/%/g' | cut -f2 -d'%' | sed 's/^/^/g' | sed 's/$/$ ^.$/g' | tee $TMPDIR/tmp_testvoc1-$1.txt | - apertium-pretransfer|\ - lt-proc -b ../../ita-cat.autobil.bin |\ - apertium-transfer -b ../../apertium-cat-ita.ita-cat.t1x ../../ita-cat.t1x.bin | apertium-interchunk ../../apertium-cat-ita.ita-cat.t2x ../../ita-cat.t2x.bin | apertium-postchunk ../../apertium-cat-ita.ita-cat.t3x ../../ita-cat.t3x.bin | tee $TMPDIR/tmp_testvoc2-$1.txt |\ - lt-proc -d ../../ita-cat.autogen.bin > $TMPDIR/tmp_testvoc3-$1.txt -paste -d _ $TMPDIR/tmp_testvoc1-$1.txt $TMPDIR/tmp_testvoc2-$1.txt $TMPDIR/tmp_testvoc3-$1.txt | sed 's/\^.\$//g' | sed 's/_/ ---------> /g' | grep -v '@' -rm $TMPDIR/tmp_testvoc1-$1.txt -rm $TMPDIR/tmp_testvoc2-$1.txt -rm $TMPDIR/tmp_testvoc3-$1.txt - -elif [[ $1 = "cat-ita" ]]; then - -lt-expand $2 | grep -v REGEX | grep -v '' | sed 's/:>:/%/g' | sed 's/:<:/%/g' | sed 's/:/%/g' | cut -f2 -d'%' | sed 's/^/^/g' | sed 's/$/$ ^.$/g' | tee $TMPDIR/tmp_testvoc1-$1.txt | - apertium-pretransfer|\ - lt-proc -b ../../cat-ita.autobil.bin |\ - apertium-transfer -b ../../apertium-cat-ita.cat-ita.t1x ../../cat-ita.t1x.bin | apertium-interchunk ../../apertium-cat-ita.cat-ita.t2x ../../cat-ita.t2x.bin | apertium-postchunk ../../apertium-cat-ita.cat-ita.t3x ../../cat-ita.t3x.bin | tee $TMPDIR/tmp_testvoc2-$1.txt |\ - lt-proc -d ../../cat-ita.autogen.bin > $TMPDIR/tmp_testvoc3-$1.txt -paste -d _ $TMPDIR/tmp_testvoc1-$1.txt $TMPDIR/tmp_testvoc2-$1.txt $TMPDIR/tmp_testvoc3-$1.txt | sed 's/\^.\$//g' | sed 's/_/ ---------> /g' | grep -v '@' -rm $TMPDIR/tmp_testvoc1-$1.txt -rm $TMPDIR/tmp_testvoc2-$1.txt -rm $TMPDIR/tmp_testvoc3-$1.txt +#!/bin/bash +if [[ $# -eq 2 ]]; then + MODE=$1 + MONODIX=$2 +elif [[ $# -eq 3 ]]; then + MODE=$2 + MONODIX=$3 else - echo "sh inconsistency.sh "; + cat >&2 < + + is the pair direction to be tested. + is the path to the source language monolingual dictionary. Set to "auto" for autodetection. + +Use the -e flag to skip testing on enclitics, which slow down the process. + +Example: $0 -e eng-cat ../../../apertium-eng/apertium-eng.eng.dix + +EOF + exit 1 +fi + +while getopts "e" opt; do + case $opt in + e) + ENCLITICS=true + ;; + esac +done + +expand_poly () { + sed 's/>\//>\/\//g' | sed 's/\/\//\/~\//g' > $POLY1 + while grep -q "//" $POLY1; do + cat $POLY1 | + awk '# This program expands polysemic entries into multiple lines + # so each possibility is tested during testvoc. Each time + # it is executed, an entry per line is modified if necessary. + + BEGIN { FS="\\$ "; OFS = "$ " } + { + if ($2 !="") + { + first = $1; + $1 = ""; + j=split(first, a, "//"); + for (i = 2; i <= j; ++i) print a[1] "/~/" a[i] "$+" substr($0,3,length($0)); + } + else print $0; + }' > $POLY2 + mv $POLY2 $POLY1 + done + cat $POLY1 | sed 's/\/\//\//g' | sed "s|>/~/|>/|g" | sed "s|\$+\^|$ ^|g" +} + +LANG1=$(sed 's/-.*//' <<< $MODE) +LANG2=$(sed 's/.*-//' <<< $MODE) +SRCDIR=$(grep -m1 "^abs_srcdir =" ../../Makefile | sed "s/^.*= //") +LANG1DIR=$(cd $SRCDIR; cd $(grep -m1 "^AP_SRC.*apertium-${LANG1}" Makefile | sed "s/^.*= //") && pwd) +MODE=$SRCDIR"/modes/$MODE.mode" + +if ! [[ -e $MODE ]]; then + echo "Mode file ($MODE) not found." + exit 1 +else + if ! [[ $(grep 'apertium-pretransfer' $MODE) ]]; then + echo "Mode file ($MODE) does not seem to contain a valid Apertium pipeline." + exit 1 + else + PIPELINE_ALL=$(grep -m1 'apertium-pretransfer' $MODE |\ + sed 's/.*apertium-pretransfer/apertium-pretransfer/' |\ + sed "s%\ lrx-proc[^|]*|%%" |\ + sed "s%\ lt-proc \$1%\ lt-proc -d%") + PIPELINE_LEX=$(mktemp -t testvoc.XXXXXXXXXXX) + PIPELINE_TFR=$(mktemp -t testvoc.XXXXXXXXXXX) + PIPELINE_GEN=$(mktemp -t testvoc.XXXXXXXXXXX) + echo $PIPELINE_ALL | sed "s%lt-proc -b\([^|]*\)|.*%lt-proc -b\1%" > "$PIPELINE_LEX" + echo $PIPELINE_ALL | sed "s%.*lt-proc -b\([^|]*\)|\(.*\)%\2%" | sed "s/| lt-proc -d.*//" > "$PIPELINE_TFR" + echo $PIPELINE_ALL | sed 's/.*lt-proc -d/lt-proc -d/' > "$PIPELINE_GEN" + TMPFILES+=("$PIPELINE_LEX" "$PIPELINE_TFR" "$PIPELINE_GEN") + fi +fi + +if [[ $MONODIX != "auto" ]]; then + if ! [[ -e $MONODIX ]]; then + echo "Monolingual dictionary ($MODE) not found." + exit 1 + fi +else + MONODIX="$LANG1DIR/apertium-$LANG1.$LANG1.dix" + if ! [[ -e $MONODIX ]]; then + echo "Monolingual dictionary ($MONODIX) not found." + exit 1 + fi fi + +POLY1=$(mktemp .testvoc.XXXXXXXXXXX) # These two are created in the working directory, because they tend to grow very big and may fill /tmp +POLY2=$(mktemp .testvoc.XXXXXXXXXXX) +TMPFILES+=("$POLY1" "$POLY2") + +lt-expand $MONODIX | grep -v 'REGEX' | grep -v ':<:' | # The monodix is expanded, regular expressions and "RL" entries are removed +( [[ $ENCLITICS ]] && grep -v '' || cat ) | # If the -e flag is used, enclitics are removed for faster processing +sed 's/:>:/\'$'\t/g' | sed 's/:/\'$'\t/g' | cut -f2 -d$'\t' | # Surface forms are removed +sed 's/^/^/g' | sed 's/\(.*\)/[\\\1\$]\1/g' | sed 's/$/$ ^.$/g' | # Entries are converted to Apertium pipeline format, preceded by the source form and followed by a full stop +bash "$PIPELINE_LEX" | # Lexical transfer takes place +grep -v '>/@' | # The list of entries is trimmed according to the bidix +expand_poly | # Polysemic entries are expanded into multiple lines +bash "$PIPELINE_TFR" | # Structural transfer takes place +bash "$PIPELINE_GEN" | # Target language surface forms are generated +sed 's/^\[\\\(.*\)\$\]/\1\$ _ /g' | sed 's/ \^.\$//g' | sed 's/ \.//g' | sed 's/ _ / ---------> /g' + +for f in "${TMPFILES[@]}"; do + rm -f "$f" +done + +exit 0 diff --git a/dev/testvoc/testvoc.sh b/dev/testvoc/testvoc.sh index 62abf78..5b43986 100755 --- a/dev/testvoc/testvoc.sh +++ b/dev/testvoc/testvoc.sh @@ -1,5 +1,52 @@ -echo "==Catalan->Italian=========================="; -bash inconsistency.sh cat-ita ../../../apertium-cat/apertium-cat.cat.dix > /tmp/cat-ita.testvoc; bash inconsistency-summary.sh /tmp/cat-ita.testvoc cat-ita; grep ' #' /tmp/cat-ita.testvoc > ./testvoc-errors.cat-ita.txt; grep '@' /tmp/cat-ita.testvoc >> ./testvoc-errors.cat-ita.txt -echo "" -echo "==Italian->Catalan=========================="; -bash inconsistency.sh ita-cat ../../../apertium-ita/apertium-ita.ita.dix > /tmp/ita-cat.testvoc; bash inconsistency-summary.sh /tmp/ita-cat.testvoc ita-cat; grep ' #' /tmp/ita-cat.testvoc > ./testvoc-errors.ita-cat.txt; grep '@' /tmp/ita-cat.testvoc >> ./testvoc-errors.ita-cat.txt +#!/bin/bash + +if ! [[ -e testvoc.conf ]]; then + echo "Testvoc configuration file (testvoc.conf) not found." + exit 1 +fi + +while getopts "equ" opt; do + case $opt in + e) + ENCLITICS=true # If the -e flag is used, enclitics are skipped for faster processing + ;; + q) + QUIET=true # If the -q flag is used, no summary is generated + ;; + u) + UNKNOWNS=true # If the -u flag is used, unknown words are checked + ;; + esac +done + +IFS="," +modes=($(grep -m 1 "^PairModes=" testvoc.conf | cut -d = -f 2)) +modenames=($(grep -m 1 "^PairModeNames=" testvoc.conf | cut -d = -f 2)) +langs=($(grep -m 1 "^PairLangs=" testvoc.conf | cut -d = -f 2)) +langnames=($(grep -m 1 "^PairLangNames=" testvoc.conf | cut -d = -f 2)) +unset IFS + +for i in "${!modes[@]}"; do + printf "== %.45s\n" "${modenames[$i]} ============================================" + if [[ $ENCLITICS ]]; then + bash inconsistency.sh -e ${modes[$i]} auto > .testvoc + else + bash inconsistency.sh ${modes[$i]} auto > .testvoc + fi + grep -vP '(?!\\)\/.* ---------> [^#].*\\\/' .testvoc | grep -e ' #' -e '\\\/' > testvoc-errors.${modes[$i]}.txt + + if ! [[ $QUIET ]]; then + bash inconsistency-summary.sh .testvoc ${modes[$i]} + fi + rm .testvoc +done + +if [[ $UNKNOWNS ]]; then + for i in "${!langs[@]}"; do + printf "== %.45s\n" "${langnames[$i]} ============================================" + pushd ../../ > /dev/null; bash dev/testvoc/bidix-unknowns.sh ${langs[$i]} | grep -v ":<:" | grep -v "REGEX" | grep -v "" > dev/testvoc/testvoc-missing.${langs[$i]}.txt; popd > /dev/null; + printf "%s\n" "Missing entries: $(cat testvoc-missing.${langs[$i]}.txt | wc -l)" + done +fi + +exit 0