commit b3e99d61ac1c0a2086aa4977d8fb249d729e3ae1
Author: kamush901 <kutlimuratovab0712@gmail.com>
Date:   Sun Jul 25 13:35:26 2021 +0200

    Added a script to calculate WER for each line of the parallel corpus

diff --git a/.gitignore b/.gitignore
index 0067521..a1accc4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@
 docs/
 texts/uzb-kaz.txt
 texts/kaz-uzb.txt
+texts/wer_lines.txt
diff --git a/apertium-kaz-uzb.kaz-uzb.dix b/apertium-kaz-uzb.kaz-uzb.dix
index 6324db3..9842952 100644
--- a/apertium-kaz-uzb.kaz-uzb.dix
+++ b/apertium-kaz-uzb.kaz-uzb.dix
@@ -10073,5 +10073,32 @@
 <e><p><l>т.б.<s n="abbr"/></l><r>boshqalar<s n="det"/></r></p></e>
 <e><p><l>ауызсу<s n="n"/></l><r>ichimlik<b/>suvi<s n="n"/></r></p></e>
 
+<e><p><l>дереккөз<s n="n"/></l><r>manba<s n="n"/></r></p></e>
+<e><p><l>құйылыс<s n="adv"/></l><r>quyilish<b/>joyi<s n="adv"/></r></p></e>
+<e><p><l>ның<s n="det"/></l><r>ning<s n="det"/></r></p></e>
+<e><p><l>микрорегион<s n="n"/></l><r>mikroregion<s n="n"/></r></p></e>
+<e><p><l>мезорегион<s n="n"/></l><r>mezoregion<s n="n"/></r></p></e>
+<e><p><l>Гершель<s n="np"/><s n="ant"/></l><r>Gershel<s n="np"/><s n="ant"/></r></p></e>
+<e><p><l>Уильям<s n="np"/><s n="ant"/></l><r>Willyam<s n="np"/><s n="ant"/></r></p></e>
+<e><p><l>Джон<s n="np"/><s n="ant"/></l><r>Jon<s n="np"/><s n="ant"/></r></p></e>
+<e><p><l>Ненецк<s n="np"/><s n="top"/></l><r>Nenetsk<s n="np"/><s n="top"/></r></p></e>
+<e><p><l>Печорск<s n="np"/><s n="top"/></l><r>Pechorsk<s n="np"/><s n="top"/></r></p></e>
+<e><p><l>Двинск<s n="np"/><s n="top"/></l><r>Dvinsk<s n="np"/><s n="top"/></r></p></e>
+<e><p><l>ге<s n="acr"/></l><r>ga<s n="acr"/></r></p></e>
+<e><p><l>динамика<s n="n"/></l><r>dinamika<s n="n"/></r></p></e>
+<e><p><l>ОКАТО<s n="abbr"/></l><r>MHOUK<s n="abbr"/></r></p></e>
+<e><p><l>АО<s n="abbr"/></l><r>AJ<s n="abbr"/></r></p></e>
+<e><p><l>идентификациялық<s n="adj"/></l><r>identifikatsion<s n="adj"/></r></p></e>
+<e><p><l>фрегезия<s n="n"/></l><r>cherkov<s n="n"/></r></p></e>
+<e><p><l>автоном<s n="adj"/></l><r>avtonom<s n="adj"/></r></p></e>
+<e><p><l>бөлініс<s n="v"/></l><r>бөлініс<s n="v"/></r></p></e> <!-- NOTE(review): the target side repeats the Kazakh (Cyrillic) lemma and the entry is tagged "v"; confirm the Uzbek lemma and the part of speech -->
+<e><p><l>субрегион<s n="n"/></l><r>subregion<s n="n"/></r></p></e>
+<e><p><l>комарка<s n="n"/></l><r>komarka<s n="n"/></r></p></e>
+<e><p><l>Вычегда<s n="np"/><s n="top"/></l><r>Vichegda<s n="np"/><s n="top"/></r></p></e>
+<e><p><l>Маңғыстау<s n="np"/><s n="top"/></l><r>Mang'istog'<s n="np"/><s n="top"/></r></p></e>
+<e><p><l>Ханты<s n="np"/><s n="top"/></l><r>Xanti<s n="np"/><s n="top"/></r></p></e>
+<e><p><l>торабын<s n="n"/></l><r>tarmoq<s n="n"/></r></p></e>
+<e><p><l>негізделген<s n="adj"/></l><r>asoslangan<s n="adj"/></r></p></e>
+
 </section>
 </dictionary>
diff --git a/texts/wer_by_line.sh b/texts/wer_by_line.sh
index 9a3782a..e5b7f91 100644
--- a/texts/wer_by_line.sh
+++ b/texts/wer_by_line.sh
@@ -3,7 +3,32 @@
 # Calculates WER per line
 # Sorts by WER score in descending order to show what sentences are causeing the most problem.
 
-while IFS= read -r line1 && IFS= read -r line2 <&3; do
-  echo "File 1: $line_test"
-  echo "File 2: $line_ref"
-done < kaz-uzb.txt 3< uzb-big.txt
\ No newline at end of file
+# Reading both files line by line at once:
+count=0
+declare -a lines
+while IFS= read -r line_test && IFS= read -r line_ref <&3; do
+    count=$((count+1))
+    # Saving each line in a separate file so the apertium-eval can read it:
+    echo $line_test  > tmp.test
+    echo $line_ref  > tmp.ref
+    wer=$(apertium-eval-translator -ref tmp.ref -test tmp.test | grep -F "Word error rate (WER):" | head -1 | awk -F" " '{print $5}')
+    echo "$count: $wer"
+    line="$wer% $line_test"
+    lines[$count]=$line
+    # echo ${lines[$count]}
+    # exit
+done < kaz-uzb.txt 3< uzb-big.txt
+
+rm tmp.test
+rm tmp.ref
+
+# Sorting the array:
+readarray -t sorted_lines < <(for a in "${lines[@]}"; do echo "$a"; done | sort -r)
+
+# Saving the array in a file:
+for a in "${sorted_lines[@]}" 
+do
+    echo "$a" >> wer_lines.txt
+done
+
+echo "Done."
\ No newline at end of file