#!/bin/sh # http://wiki.apertium.org/wiki/Asturian#Calculating_coverage # Example use: # zcat corpa/en.crp.txt.gz | sh corpus-stat.sh #CMD="cat corpa/en.crp.txt" CMD="cat ../corpora/ckt.crp.txt" F=../corpora/corpus-stat-res.txt # Calculate the number of tokenised words in the corpus: # for some reason putting the newline in directly doesn't work, so two seds $CMD | apertium-destxt | hfst-proc ../ckt.mor.hfstol | apertium-retxt | sed 's/\$[^^]*\^/$^/g' | sed 's/\$\^/$\ ^/g' > $F # first=$CMD | apertium-destxt | hfst-lookup ckt.hfst # echo first NUMWORDS=`cat $F | wc -l` echo "Number of tokenised words in the corpus: $NUMWORDS" # Calculate the number of words that are not unknown NUMKNOWNWORDS=`cat $F | grep -v '\*' | wc -l` echo "Number of known words in the corpus: $NUMKNOWNWORDS" echo "Top known words in the corpus:" cat $F | grep -v '\*' | sort -f | uniq -c | sort -gr | head -10 # Calculate the coverage COVERAGE=`calc "round($NUMKNOWNWORDS/$NUMWORDS*1000)/10"` echo "Coverage: $COVERAGE %" #If you don't have calc, change the above line to: #COVERAGE=$(perl -e 'print int($ARGV[0]/$ARGV[1]*1000)/10;' $NUMKNOWNWORDS $NUMWORDS) # Show the top-ten unknown words. echo "Top unknown words in the corpus:" cat $F | grep '\*' | sort -f | uniq -c | sort -gr | head -10 # Update hitparade cat $F | cut -f2 -d'^' | cut -f1 -d'/' | sort -f | uniq -c | sort -gr > ../corpora/ckt.hitparade.txt # Update glossed hitparade bash hitparade.sh > ../corpora/hitparade.txt # Update unknown list cat ../corpora/hitparade.txt | grep '/\*[^0-9]' > ../corpora/unknown.txt