commit 9d88bb6b85ba133e1c91814f4d63ffb98605b0a4 Author: priyankmodiPM Date: Mon Jun 22 04:25:39 2020 +0530 added testvoc script, removed rule causing issues with postpositions, added few bidix entries diff --git a/apertium-hin-pan.hin-pan.t1x b/apertium-hin-pan.hin-pan.t1x index d769823..bf99258 100644 --- a/apertium-hin-pan.hin-pan.t1x +++ b/apertium-hin-pan.hin-pan.t1x @@ -65,27 +65,6 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/apertium-hin-pan.hin-pan_Guru.dix b/apertium-hin-pan.hin-pan_Guru.dix index b1d9dca..d695e6b 100644 --- a/apertium-hin-pan.hin-pan_Guru.dix +++ b/apertium-hin-pan.hin-pan_Guru.dix @@ -8409,7 +8409,7 @@

नहींਨਾ

नहींਨਾਂਹ

नहींਨਹੀਂ

-

नहीं_तोਉਂਝ

+

नहीं तोਉਂਝ

निकटਨਿਕਟ

नित्यਨਿਤ

निपटਨਿਪਟ

diff --git a/dev/testvoc/inconsistency-summary.sh b/dev/testvoc/inconsistency-summary.sh new file mode 100755 index 0000000..f5b726d --- /dev/null +++ b/dev/testvoc/inconsistency-summary.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +INC=$1 +PAIR=$2 +OUT=testvoc-summary.$PAIR.txt +POS="abbr adj adv cm cnjadv cnjcoo cnjsub det guio ij n np num pr preadv prn rel vaux vbhaver vblex vbser vbmod" + +SED=sed + +echo -n "" > $OUT; + +date >> $OUT +echo -e "===============================================" >> $OUT +echo -e "POS\tTotal\tClean\tWith @\tWith #\tClean %" >> $OUT +for i in $POS; do + if [ "$i" = "det" ]; then + TOTAL=`cat $INC | grep "<$i>" | grep -v -e '' -e '' | grep -v REGEX | wc -l`; + AT=`cat $INC | grep "<$i>" | grep '@' | grep -v -e '' -e '' | grep -v REGEX | wc -l`; + HASH=`cat $INC | grep "<$i>" | grep '> *#' | grep -v -e '' -e '' | grep -v REGEX | wc -l`; + elif [ "$i" = "preadv" ]; then + TOTAL=`cat $INC | grep "<$i>" | grep -v -e '' -e '' | grep -v REGEX | wc -l`; + AT=`cat $INC | grep "<$i>" | grep '@' | grep -v -e '' -e '' | grep -v REGEX | wc -l`; + HASH=`cat $INC | grep "<$i>" | grep '> *#' | grep -v -e '' -e '' | grep -v REGEX | wc -l`; + elif [ "$i" = "adv" ]; then + TOTAL=`cat $INC | grep "<$i>" | grep -v -e '' -e '" | grep '@' | grep -v -e '' -e '" | grep '> *#' | grep -v -e '' -e '" | grep -v -e '" | grep '@' | grep -v -e '" | grep '> *#' | grep -v -e '" | grep -v -e '" | grep '@' | grep -v -e '" | grep '> *#' | grep -v -e '" | grep -v -e '" | grep '@' | grep -v -e '" | grep '> *#' | grep -v -e '" | grep -v -e '" | grep '@' | grep -v -e '" | grep '> *#' | grep -v -e '" | grep -v -e '" | grep '@' | grep -v -e '" | grep '> *#' | grep -v -e '" | grep -v -e '" | grep '@' | grep -v -e '" | grep '> *#' | grep -v -e '" | grep -v -e '" | grep '@' | grep -v -e '" | grep '> *#' | grep -v -e '" | grep -v -e '" | grep '@' | grep -v -e '" | grep '> *#' | grep -v -e '" | grep -v REGEX | wc -l`; + AT=`cat $INC | grep "<$i>" | grep '@' | grep -v REGEX | wc -l`; + HASH=`cat $INC | grep "<$i>" | grep '> *#' | grep -v REGEX | wc -l`; + fi + UNCLEAN=`calc $AT+$HASH`; + CLEAN=`calc $TOTAL-$UNCLEAN`; + PERCLEAN=`calc $UNCLEAN/$TOTAL*100 | $SED 's/^\W*//g' | $SED 's/~//g' | head -c 5`; + echo $PERCLEAN | grep "Err" > /dev/null; + if [ $? -eq 0 ]; then + TOTPERCLEAN="100"; + else + TOTPERCLEAN=`calc 100-$PERCLEAN | $SED 's/^\W*//g' | $SED 's/~//g' | head -c 5`; + fi + + echo -e $TOTAL";"$i";"$CLEAN";"$AT";"$HASH";"$TOTPERCLEAN; +done | sort -gr | awk -F';' '{print $2"\t"$1"\t"$3"\t"$4"\t"$5"\t"$6}' >> $OUT + +echo -e "===============================================" >> $OUT +cat $OUT; diff --git a/dev/testvoc/inconsistency.sh b/dev/testvoc/inconsistency.sh new file mode 100755 index 0000000..d35235d --- /dev/null +++ b/dev/testvoc/inconsistency.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +TMPDIR=/tmp + +DICT=$1 +DIR=$2 + +SED=sed + +if [[ $DIR = "hin-pan" ]]; then + +# Run the bilingual dictionary before to make sure we are only checking things we have. +lt-expand $DICT | grep -v '' | grep -v 'REGEX' | grep -v ':<:' | $SED 's/:>:/%/g' | $SED 's/:/%/g' | cut -f2 -d'%' | $SED 's/^/^/g' | $SED 's/$/$ ^.$/g' | apertium-pretransfer | lt-proc -b ../../hin-pan_Guru.autobil.bin | grep -v '/@' | cut -f1 -d'/' | $SED 's/$/$ ^.$/g' | tee $TMPDIR/$DIR.tmp_testvoc1.txt |\ + apertium-pretransfer|\ + lt-proc -b ../../hin-pan_Guru.autobil.bin | tee $TMPDIR/$DIR.tmp_testvoc2.txt |\ + apertium-transfer -b ../../apertium-hin-pan.hin-pan.t1x ../../hin-pan.t1x.bin | tee $TMPDIR/$DIR.tmp_testvoc3.txt |\ + lt-proc -d ../../hin-pan_Guru.autogen.bin > $TMPDIR/$DIR.tmp_testvoc4.txt +paste -d _ $TMPDIR/$DIR.tmp_testvoc1.txt $TMPDIR/$DIR.tmp_testvoc2.txt $TMPDIR/$DIR.tmp_testvoc3.txt $TMPDIR/$DIR.tmp_testvoc4.txt | $SED 's/\^.\$//g' | $SED 's/_/ ---------> /g' + +elif [[ $DIR = "pan-hin" ]]; then + +# Run the bilingual dictionary before to make sure we are only checking things we have. +lt-expand $DICT | grep -v '' | grep -v 'REGEX' | grep -v ':<:' | $SED 's/:>:/%/g' | $SED 's/:/%/g' | cut -f2 -d'%' | $SED 's/^/^/g' | $SED 's/$/$ ^.$/g' | apertium-pretransfer | lt-proc -b ../../pan_Guru-hin.autobil.bin | grep -v '/@' | cut -f1 -d'/' | $SED 's/$/$ ^.$/g' | tee $TMPDIR/$DIR.tmp_testvoc1.txt |\ + apertium-pretransfer|\ + lt-proc -b ../../pan_Guru-hin.autobil.bin | tee $TMPDIR/$DIR.tmp_testvoc2.txt |\ + apertium-transfer -b ../../apertium-hin-pan.pan-hin.t1x ../../pan-hin.t1x.bin | tee $TMPDIR/$DIR.tmp_testvoc3.txt |\ + lt-proc -d ../../pan-hin.autogen.bin > $TMPDIR/$DIR.tmp_testvoc4.txt +paste -d _ $TMPDIR/$DIR.tmp_testvoc1.txt $TMPDIR/$DIR.tmp_testvoc2.txt $TMPDIR/$DIR.tmp_testvoc3.txt $TMPDIR/$DIR.tmp_testvoc4.txt | $SED 's/\^.\$//g' | $SED 's/_/ ---------> /g' + +else + echo "bash inconsistency.sh "; +fi diff --git a/dev/testvoc/testvoc.sh b/dev/testvoc/testvoc.sh new file mode 100755 index 0000000..81ae7b4 --- /dev/null +++ b/dev/testvoc/testvoc.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +if [[ $2 = "hin-pan" ]]; then +echo "==Hindi->Punjabi==========================="; +bash inconsistency.sh $1 hin-pan > /tmp/hin-pan.testvoc; bash inconsistency-summary.sh /tmp/hin-pan.testvoc hin-pan +echo "" +elif [[ $2 = "pan-hin" ]]; then +echo "==Punjabi->Hindi==========================="; +bash inconsistency.sh $1 pan-hin > /tmp/pan-hin.testvoc; bash inconsistency-summary.sh /tmp/pan-hin.testvoc pan-hin +echo ""; +fi