commit 4fc69b10097c6148e0c9acd6aafbd81f1dfd3627
Author: hectoralos <hectoralos@gmail.com>
Date:   Sat Jun 29 21:09:23 2019 +0300

    noves versions agafades d'eng-cat

diff --git a/dev/testvoc/inconsistency.sh b/dev/testvoc/inconsistency.sh
index 306a0bb..68c83d7 100755
--- a/dev/testvoc/inconsistency.sh
+++ b/dev/testvoc/inconsistency.sh
@@ -1,29 +1,116 @@
-TMPDIR=/tmp
-
-if [[ $1 = "ita-cat" ]]; then
-
-lt-expand $2 | grep -v REGEX | grep -v '<prn><enc>' | sed 's/:>:/%/g' | sed 's/:<:/%/g' | sed 's/:/%/g' | cut -f2 -d'%' |  sed 's/^/^/g' | sed 's/$/$ ^.<sent>$/g' | tee $TMPDIR/tmp_testvoc1-$1.txt |
-        apertium-pretransfer|\
-        lt-proc -b ../../ita-cat.autobil.bin |\
-        apertium-transfer -b ../../apertium-cat-ita.ita-cat.t1x  ../../ita-cat.t1x.bin | apertium-interchunk ../../apertium-cat-ita.ita-cat.t2x  ../../ita-cat.t2x.bin | apertium-postchunk ../../apertium-cat-ita.ita-cat.t3x  ../../ita-cat.t3x.bin | tee $TMPDIR/tmp_testvoc2-$1.txt |\
-        lt-proc -d ../../ita-cat.autogen.bin > $TMPDIR/tmp_testvoc3-$1.txt
-paste -d _ $TMPDIR/tmp_testvoc1-$1.txt $TMPDIR/tmp_testvoc2-$1.txt $TMPDIR/tmp_testvoc3-$1.txt | sed 's/\^.<sent>\$//g' | sed 's/_/   --------->  /g' | grep -v '@'
-rm $TMPDIR/tmp_testvoc1-$1.txt
-rm $TMPDIR/tmp_testvoc2-$1.txt
-rm $TMPDIR/tmp_testvoc3-$1.txt
-
-elif [[ $1 = "cat-ita" ]]; then
-
-lt-expand $2 | grep -v REGEX | grep -v '<prn><enc>' | sed 's/:>:/%/g' | sed 's/:<:/%/g' | sed 's/:/%/g' | cut -f2 -d'%' |  sed 's/^/^/g' | sed 's/$/$ ^.<sent>$/g' | tee $TMPDIR/tmp_testvoc1-$1.txt |
-        apertium-pretransfer|\
-        lt-proc -b ../../cat-ita.autobil.bin |\
-        apertium-transfer -b ../../apertium-cat-ita.cat-ita.t1x  ../../cat-ita.t1x.bin | apertium-interchunk ../../apertium-cat-ita.cat-ita.t2x  ../../cat-ita.t2x.bin | apertium-postchunk ../../apertium-cat-ita.cat-ita.t3x  ../../cat-ita.t3x.bin | tee $TMPDIR/tmp_testvoc2-$1.txt |\
-        lt-proc -d ../../cat-ita.autogen.bin > $TMPDIR/tmp_testvoc3-$1.txt
-paste -d _ $TMPDIR/tmp_testvoc1-$1.txt $TMPDIR/tmp_testvoc2-$1.txt $TMPDIR/tmp_testvoc3-$1.txt | sed 's/\^.<sent>\$//g' | sed 's/_/   --------->  /g' | grep -v '@'
-rm $TMPDIR/tmp_testvoc1-$1.txt
-rm $TMPDIR/tmp_testvoc2-$1.txt
-rm $TMPDIR/tmp_testvoc3-$1.txt
+#!/bin/bash
 
+if  [[ $# -eq 2 ]]; then  # Two arguments: <direction> <monodix>, no flag
+    MODE=$1
+    MONODIX=$2
+elif  [[ $# -eq 3 ]]; then  # Three arguments: the first one is skipped here (the usage text documents it as the -e flag)
+    MODE=$2
+    MONODIX=$3
 else
-	echo "sh inconsistency.sh <direction>";
+    cat >&2 <<EOF  # Any other argument count: print the usage text on stderr, then exit with status 1
+Usage: $0 [-e] <direction> <monodix>
+
+<direction> is the pair direction to be tested.
+<monodix> is the path to the source language monolingual dictionary. Set to "auto" for autodetection.
+
+Use the -e flag to skip testing on enclitics, which slow down the process.
+
+Example: $0 -e eng-cat ../../../apertium-eng/apertium-eng.eng.dix
+
+EOF
+    exit 1
+fi
+
+while getopts "e" opt; do  # Scan the leading option flags; -e is the only one declared
+  case $opt in
+    e)
+      ENCLITICS=true  # Checked in the main pipeline with [[ $ENCLITICS ]] to filter out '<prn><enc>' lines
+      ;;
+  esac
+done
+
+expand_poly () {  # Filter: reads lines on stdin and writes the expanded lines to stdout, using the files $POLY1 and $POLY2 as scratch space
+    sed 's/>\//>\/\//g' | sed 's/<sent>\/\//<sent>\/~\//g' > $POLY1  # Every ">/" becomes ">//" (expansion marker), except after <sent>, which gets "/~/" instead
+    while grep -q "//" $POLY1; do   # One awk pass per iteration, until no "//" marker is left in $POLY1
+        cat $POLY1 | 
+        awk '# This program expands polysemic entries into multiple lines
+        # so each possibility is tested during testvoc. Each time
+        # it is executed, an entry per line is modified if necessary.
+
+        BEGIN { FS="\\$ "; OFS = "$ " }
+        {
+            if ($2 !="")
+            {
+                first = $1;
+                $1 = "";
+                j=split(first, a, "//");
+                for (i = 2; i <= j; ++i) print a[1] "/~/" a[i] "$+" substr($0,3,length($0));
+            }
+            else print $0;
+        }' > $POLY2  # The pass writes to $POLY2 ...
+        mv $POLY2 $POLY1  # ... which then replaces $POLY1 as the input of the next pass
+    done
+    cat $POLY1 | sed 's/\/\//\//g' | sed "s|>/~/|>/|g" | sed "s|\$+\^|$ ^|g"  # Undo the temporary markers: "//" -> "/", ">/~/" -> ">/", "$+^" -> "$ ^"
+}
+
+LANG1=$(sed 's/-.*//' <<< $MODE)  # Text of the direction before its first hyphen
+LANG2=$(sed 's/.*-//' <<< $MODE)  # Text of the direction after its last hyphen; NOTE(review): not read anywhere in the visible part of this script
+SRCDIR=$(grep -m1 "^abs_srcdir =" ../../Makefile | sed "s/^.*= //")  # Text after "= " on the first "abs_srcdir =" line of the Makefile two directories up
+LANG1DIR=$(cd $SRCDIR; cd $(grep -m1 "^AP_SRC.*apertium-${LANG1}" Makefile | sed "s/^.*= //") && pwd)  # Absolute path of the directory named on the first AP_SRC* Makefile line mentioning apertium-$LANG1
+MODE=$SRCDIR"/modes/$MODE.mode"  # From here on MODE is the path of the mode file, no longer the direction name
+
+# Validate the mode file, then cut its pipeline into three scripts: lexical transfer, structural transfer and generation.
+if ! [[ -e $MODE ]]; then
+    echo "Mode file ($MODE) not found." >&2  # Diagnostics go to stderr; stdout is reserved for the test results
+    exit 1
+fi
+if ! grep -q 'apertium-pretransfer' "$MODE"; then
+    echo "Mode file ($MODE) does not seem to contain a valid Apertium pipeline." >&2
+    exit 1
+fi
+PIPELINE_ALL=$(grep -m1 'apertium-pretransfer' "$MODE" |\
+    sed 's/.*apertium-pretransfer/apertium-pretransfer/' |\
+    sed "s%\ lrx-proc[^|]*|%%" |\
+    sed "s%\ lt-proc \$1%\ lt-proc -d%")  # Keep the line from apertium-pretransfer on, drop the lrx-proc stage, turn "lt-proc $1" into "lt-proc -d"
+PIPELINE_LEX=$(mktemp -t testvoc.XXXXXXXXXXX)  # Will hold the pipeline up to and including the "lt-proc -b" stage
+PIPELINE_TFR=$(mktemp -t testvoc.XXXXXXXXXXX)  # Will hold the stages between "lt-proc -b" and "lt-proc -d"
+PIPELINE_GEN=$(mktemp -t testvoc.XXXXXXXXXXX)  # Will hold the pipeline from "lt-proc -d" on
+TMPFILES+=("$PIPELINE_LEX" "$PIPELINE_TFR" "$PIPELINE_GEN")
+trap 'rm -f "${TMPFILES[@]}"' EXIT  # Delete every registered temporary file on any exit path, including the error exits further down
+echo $PIPELINE_ALL | sed "s%lt-proc -b\([^|]*\)|.*%lt-proc -b\1%" > "$PIPELINE_LEX"
+echo $PIPELINE_ALL | sed "s%.*lt-proc -b\([^|]*\)|\(.*\)%\2%" | sed "s/| lt-proc -d.*//" > "$PIPELINE_TFR"
+echo $PIPELINE_ALL | sed 's/.*lt-proc -d/lt-proc -d/' > "$PIPELINE_GEN"
+
+# Resolve the source dictionary: "auto" selects the one in the source
+# language directory located via the Makefile; any other value is a path.
+if [[ $MONODIX == "auto" ]]; then
+    MONODIX="$LANG1DIR/apertium-$LANG1.$LANG1.dix"
+fi
+# A single existence check covers both cases. The message names the
+# dictionary path and is written to stderr, so it never ends up mixed
+# into the results printed on stdout.
+if ! [[ -e $MONODIX ]]; then
+    echo "Monolingual dictionary ($MONODIX) not found." >&2
+    exit 1
 fi
+
+POLY1=$(mktemp .testvoc.XXXXXXXXXXX)  # These two are created in the working directory, because they tend to grow very big and may fill /tmp
+POLY2=$(mktemp .testvoc.XXXXXXXXXXX)  # Output file of each expand_poly pass, moved over $POLY1 afterwards
+TMPFILES+=("$POLY1" "$POLY2")  # Registered so that the cleanup loop at the end of the script deletes them
+
+lt-expand $MONODIX | grep -v 'REGEX' | grep -v ':<:' |  # The monodix is expanded, regular expressions and "RL" entries are removed
+( [[ $ENCLITICS ]] && grep -v '<prn><enc>' || cat ) |  # If the -e flag is used, enclitics are removed for faster processing
+sed 's/:>:/\'$'\t/g' | sed 's/:/\'$'\t/g' | cut -f2 -d$'\t' |  # Surface forms are removed
+sed 's/^/^/g' | sed 's/\(.*\)/[\\\1\$]\1/g' | sed 's/$/$ ^.<sent>$/g' |  # Entries are converted to Apertium pipeline format, preceded by the source form and followed by a full stop
+bash "$PIPELINE_LEX" |  # Lexical transfer takes place
+grep -v '>/@' |  # The list of entries is trimmed according to the bidix
+expand_poly |  # Polysemic entries are expanded into multiple lines
+bash "$PIPELINE_TFR" |  # Structural transfer takes place
+bash "$PIPELINE_GEN" |  # Target language surface forms are generated
+sed 's/^\[\\\(.*\)\$\]/\1\$ _ /g' | sed 's/ \^.<sent>\$//g' | sed 's/ \.//g' | sed 's/ _ /   --------->   /g'  # The source form saved in the leading [\...$] block is unwrapped, the appended full stop is stripped, and an arrow is put between source form and result
+
+for f in "${TMPFILES[@]}"; do  # Delete every temporary file registered in TMPFILES
+    rm -f "$f"
+done
+
+exit 0  # Unconditional: a run that reaches this line reports success regardless of the pipeline's exit status
diff --git a/dev/testvoc/testvoc.sh b/dev/testvoc/testvoc.sh
index 62abf78..5b43986 100755
--- a/dev/testvoc/testvoc.sh
+++ b/dev/testvoc/testvoc.sh
@@ -1,5 +1,52 @@
-echo "==Catalan->Italian==========================";
-bash inconsistency.sh cat-ita ../../../apertium-cat/apertium-cat.cat.dix > /tmp/cat-ita.testvoc; bash inconsistency-summary.sh /tmp/cat-ita.testvoc cat-ita; grep ' #' /tmp/cat-ita.testvoc > ./testvoc-errors.cat-ita.txt; grep '@' /tmp/cat-ita.testvoc >> ./testvoc-errors.cat-ita.txt
-echo ""
-echo "==Italian->Catalan==========================";
-bash inconsistency.sh ita-cat ../../../apertium-ita/apertium-ita.ita.dix > /tmp/ita-cat.testvoc; bash inconsistency-summary.sh /tmp/ita-cat.testvoc ita-cat; grep ' #' /tmp/ita-cat.testvoc > ./testvoc-errors.ita-cat.txt; grep '@' /tmp/ita-cat.testvoc >> ./testvoc-errors.ita-cat.txt
+#!/bin/bash
+
+if ! [[ -e testvoc.conf ]]; then  # The configuration file is looked up in the current working directory
+    echo "Testvoc configuration file (testvoc.conf) not found."
+    exit 1
+fi
+
+while getopts "equ" opt; do  # Accepted flags: -e, -q and -u; none of them takes an argument
+  case $opt in
+    e)
+      ENCLITICS=true  # If the -e flag is used, enclitics are skipped for faster processing
+      ;;
+    q)
+      QUIET=true  # If the -q flag is used, no summary is generated
+      ;;
+    u)
+      UNKNOWNS=true  # If the -u flag is used, unknown words are checked
+      ;;
+  esac
+done
+
+IFS=","  # Split the unquoted command substitutions below on commas, one array element per item
+modes=($(grep -m 1 "^PairModes=" testvoc.conf | cut -d = -f 2))  # Directions handed to inconsistency.sh; taken from the text after "=" on the first PairModes= line
+modenames=($(grep -m 1 "^PairModeNames=" testvoc.conf | cut -d = -f 2))  # Header titles printed for each direction, matched to modes by index
+langs=($(grep -m 1 "^PairLangs=" testvoc.conf | cut -d = -f 2))  # Values handed to bidix-unknowns.sh when -u is given
+langnames=($(grep -m 1 "^PairLangNames=" testvoc.conf | cut -d = -f 2))  # Header titles printed for each entry of langs, matched by index
+unset IFS  # Back to default word splitting for the rest of the script
+
+for i in "${!modes[@]}"; do  # Loop over indices so that modes and modenames are read in step
+    printf "== %.45s\n" "${modenames[$i]} ============================================"  # Header line; %.45s cuts the padded title at 45 characters
+    if [[ $ENCLITICS ]]; then
+        bash inconsistency.sh -e ${modes[$i]} auto > .testvoc  # -e is forwarded; "auto" is passed as the monodix argument
+    else
+        bash inconsistency.sh ${modes[$i]} auto > .testvoc
+    fi
+    grep -vP '(?!\\)\/.*   --------->   [^#].*\\\/' .testvoc | grep -e ' #' -e '\\\/' > testvoc-errors.${modes[$i]}.txt  # Error report: lines containing " #" or "\/", after discarding the lines matched by the -P pattern
+
+    if ! [[ $QUIET ]]; then
+        bash inconsistency-summary.sh .testvoc ${modes[$i]}  # Summary is skipped with -q
+    fi
+    rm .testvoc  # The raw output is discarded; only the error report is kept
+done
+
+if [[ $UNKNOWNS ]]; then  # This section only runs when -u was given
+    for i in "${!langs[@]}"; do  # Loop over indices so that langs and langnames are read in step
+        printf "== %.45s\n" "${langnames[$i]} ============================================"  # Header line; %.45s cuts the padded title at 45 characters
+        pushd ../../ > /dev/null; bash dev/testvoc/bidix-unknowns.sh ${langs[$i]} | grep -v ":<:" | grep -v "REGEX" | grep -v "<prn><enc>" > dev/testvoc/testvoc-missing.${langs[$i]}.txt; popd > /dev/null;  # Runs two directories up; lines containing ":<:", "REGEX" or "<prn><enc>" are left out of the report
+        printf "%s\n" "Missing entries: $(cat testvoc-missing.${langs[$i]}.txt | wc -l)"  # Line count of the report; NOTE(review): read back relative to the current directory, so this presumably expects to be run from dev/testvoc — confirm
+    done
+fi
+
+exit 0  # Unconditional success status