commit b3e99d61ac1c0a2086aa4977d8fb249d729e3ae1 Author: kamush901 Date: Sun Jul 25 13:35:26 2021 +0200 Added a script to calculate WER per each line of the parallel corpus diff --git a/.gitignore b/.gitignore index 0067521..a1accc4 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ docs/ texts/uzb-kaz.txt texts/kaz-uzb.txt +texts/wer_lines.txt diff --git a/apertium-kaz-uzb.kaz-uzb.dix b/apertium-kaz-uzb.kaz-uzb.dix index 6324db3..9842952 100644 --- a/apertium-kaz-uzb.kaz-uzb.dix +++ b/apertium-kaz-uzb.kaz-uzb.dix @@ -10073,5 +10073,32 @@

т.б.boshqalar

ауызсуichimliksuvi

+

дереккөзmanba

+

құйылысquyilishjoyi

+

ныңning

+

микрорегионmikroregion

+

мезорегионmezoregion

+

ГершельGershel

+

УильямWillyam

+

ДжонJon

+

НенецкNenetsk

+

ПечорскPechorsk

+

ДвинскDvinsk

+

геga

+

динамикаdinamika

+

ОКАТОMHOUK

+

АОAJ

+

идентификациялықidentifikatsion

+

фрегезияcherkov

+

автономavtonom

+

бөлінісбөлініс

+

субрегионsubregion

+

комаркаkomarka

+

ВычегдаVichegda

+

МаңғыстауMang'istog'

+

ХантыXanti

+

торабынtarmoq

+

негізделгенasoslangan

# texts/wer_by_line.sh
#
# Calculates WER per line.
# Sorts by WER score in descending order to show which sentences are causing
# the most problems.
#
# Usage: wer_by_line.sh [TEST_FILE [REF_FILE [OUT_FILE]]]
#   TEST_FILE  text to evaluate, one sentence per line  (default: kaz-uzb.txt)
#   REF_FILE   reference text, aligned line by line     (default: uzb-big.txt)
#   OUT_FILE   report file, overwritten on every run    (default: wer_lines.txt)
#
# Requires Bash and apertium-eval-translator on PATH.

# Directory holding the per-sentence scratch files; empty when none exists.
wer_tmpdir=''

# Removes the scratch directory, if there is one. Installed as the EXIT trap
# so an interrupted run does not leave scratch files behind.
wer_cleanup() {
  if [[ -n "$wer_tmpdir" ]]; then
    rm -rf -- "$wer_tmpdir"
    wer_tmpdir=''
  fi
}

# Prints the WER figure reported for one sentence pair.
# Arguments: $1 - file with the reference sentence
#            $2 - file with the sentence under test
# Outputs:   fifth whitespace-separated field of the first line containing
#            "Word error rate (WER):"; nothing if no such line is printed.
wer_for_pair() {
  # stdin comes from /dev/null so the evaluator can never consume lines of
  # the corpus that the calling loop is reading from its own stdin.
  apertium-eval-translator -ref "$1" -test "$2" </dev/null \
    | awk 'index($0, "Word error rate (WER):") { print $5; exit }'
}

# Scores every aligned line pair and writes the report.
# Arguments: $1 - test file, $2 - reference file, $3 - report file
#            (all optional, see the usage header for the defaults)
# Outputs:   "<line number>: <WER>" per pair and a final "Done." on stdout;
#            diagnostics on stderr; "<WER>% <test sentence>" lines in $3,
#            highest WER first, ties kept in source order.
# Returns:   0 on success, 1 if a prerequisite is missing or a write fails.
main() {
  local test_file=${1:-kaz-uzb.txt}
  local ref_file=${2:-uzb-big.txt}
  local out_file=${3:-wer_lines.txt}
  local line_test line_ref wer path
  local count=0
  local -a lines=()

  if ! command -v apertium-eval-translator >/dev/null 2>&1; then
    printf 'wer_by_line: apertium-eval-translator not found\n' >&2
    return 1
  fi
  for path in "$test_file" "$ref_file"; do
    if [[ ! -r "$path" ]]; then
      printf 'wer_by_line: cannot read %s\n' "$path" >&2
      return 1
    fi
  done

  wer_tmpdir=$(mktemp -d) || return 1
  trap wer_cleanup EXIT

  # Reading both files line by line at once. The "|| [[ -n ... ]]" guards keep
  # a final line that has no trailing newline. The loop ends as soon as either
  # file runs out, so surplus lines in the longer file are not scored.
  while { IFS= read -r line_test || [[ -n "$line_test" ]]; } \
    && { IFS= read -r line_ref <&3 || [[ -n "$line_ref" ]]; }; do
    count=$((count + 1))
    # Saving each line in a separate file so apertium-eval-translator can read
    # it. printf with a quoted argument keeps the sentence byte-for-byte (no
    # glob expansion, no whitespace collapsing).
    printf '%s\n' "$line_test" > "$wer_tmpdir/tmp.test"
    printf '%s\n' "$line_ref" > "$wer_tmpdir/tmp.ref"
    wer=$(wer_for_pair "$wer_tmpdir/tmp.ref" "$wer_tmpdir/tmp.test")
    if [[ -z "$wer" ]]; then
      printf 'wer_by_line: no WER reported for line %d\n' "$count" >&2
    fi
    printf '%s: %s\n' "$count" "$wer"
    lines+=("$wer% $line_test")
  done < "$test_file" 3< "$ref_file"

  wer_cleanup

  # Sorting and saving in one step. The key is compared numerically (a plain
  # reverse text sort would rank "9.50%" above "100.00%"), LC_ALL=C pins "."
  # as the decimal separator, and ">" replaces the report of a previous run.
  if ((count > 0)); then
    printf '%s\n' "${lines[@]}" \
      | LC_ALL=C sort -s -k1,1nr > "$out_file" || return 1
  else
    : > "$out_file" || return 1
  fi

  echo "Done."
}

# Deliberately the last command: its status becomes the script's exit status.
main "$@"