commit e7f3c96ce39eeedc63d0a9ac6a30d021bb08781c
Author: ruthenian8
Date:   Sat Aug 7 23:15:02 2021 +0300

    add reworked stats

diff --git a/Makefile b/Makefile
index cef3af8..caddad7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,13 @@
 .DEFAULT_GOAL := bagval_numbers.gen.hfst
-.PHONY: check check-gen check-ana
+# Optimised-lookup analyser used by the check-coverage* targets.
+analyser=./merged.tr.hfstol
+# CSV file the test lists (%.pass.txt, %.ignore.txt) are extracted from.
+tests=tests.csv
+# Distinct values of column 5 of $(tests), first line skipped. Simply expanded
+# (:=) so the shell pipeline runs once at parse time, not on every reference.
+test_sources:=$(shell sed -s 1d $(tests) | cut -d, -f5 | sort -u)
+.PHONY: check check-gen check-coverage check-coverage-stats check-coverage-unrecog
+
 %.tr.hfst: %.ana.hfst
 	hfst-compose translit/translit.hfst $< -o $@
 %.lexd.hfst: %.lexd
@@ -14,8 +22,38 @@
 	hfst-compose translit/translit.hfst $< -o $@
 %.tr.hfstol: %.tr.hfst
 	hfst-fst2fst --optimized-lookup-unweighted -i $< -o $@
-check-ana: bagval_numbers.ana.hfst gold-standart.ana.txt
-	bash compare.sh $^
-check-gen: bagval_numbers.gen.hfst gold-standart.txt
-	bash compare.sh $^
-check: check-ana check-gen
\ No newline at end of file
+# <stem>.pass.txt: unique "col1:col2" pairs from rows of $(tests) whose 3rd
+# column is "pass" and whose 4th column equals the stem.
+# NOTE(review): test_sources is read from column 5 but matched against column 4
+# here -- confirm against the layout of tests.csv.
+%.pass.txt: $(tests)
+	awk -F, '$$4 == "$*" && $$3 == "pass" {print $$1 ":" $$2}' $^ | sort -u > $@
+# <stem>.ignore.txt: same extraction for rows whose 3rd column is "ignore".
+%.ignore.txt: $(tests)
+	awk -F, '$$4 == "$*" && $$3 == "ignore" {print $$1 ":" $$2}' $^ | sort -u > $@
+# Run compare.sh on both lists of every test source. The exit status of the
+# ignore-list run is discarded; a failing pass-list run stops the loop and
+# propagates its exit status.
+check-gen: merged.gen.hfst $(foreach t,$(test_sources),$(t).pass.txt $(t).ignore.txt)
+	for t in $(test_sources); do echo $$t; bash compare.sh $< $$t.ignore.txt; bash compare.sh $< $$t.pass.txt || exit $$?; done;
+check: check-gen
+# Run ./stats from inside corpora/ over every *.lat file found there.
+# "&&" keeps find from running in the wrong directory when cd fails.
+check-coverage-stats: corpora $(analyser) stats
+	@ cd corpora && find * -name "*.lat" -print0 | xargs -0 ../stats ../$(analyser)
+# Run ./unrecog over every *.lat file under corpora/.
+check-coverage-unrecog: corpora $(analyser) unrecog
+	@ find corpora -name "*.lat" -print0 | xargs -0 ./unrecog $(analyser)
+# Per-file stats plus one row for all *.lat files concatenated, aligned with
+# column -t, followed by the last 20 lines of the unrecog output. Sub-makes use
+# $(MAKE) so that flags such as -n and -j and command-line variables propagate.
+# NOTE(review): ./stats is called with -q here -- verify the script accepts it.
+check-coverage: corpora $(analyser) stats unrecog
+	@ echo aggregate coverage:
+	@ ($(MAKE) -s check-coverage-stats; find corpora -name "*.lat" -exec cat {} \; | ./stats -q $(analyser) -) | column -t
+	@ echo
+	@ echo unrecognised words:
+	@ $(MAKE) -s check-coverage-unrecog | tail -n20
+# Convert any .hfst transducer to .hfstol with hfst-fst2fst -w.
+%.hfstol: %.hfst
+	echo '?::0' | hfst-regexp2fst | hfst-repeat | hfst-compose -F $< - | hfst-minimise -E | hfst-fst2fst -w -o $@
\ No newline at end
of file
diff --git a/README.md b/README.md
index 3378f14..3380214 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,16 @@ This repository contains a prototype for a Bagvalal morphological analyzer.
 ### Making the analyzers
 * run translit/make
 * run make merged.ana.hfst for cyrillic version
-* run make merged.tr.hfst for simplified IPA version
+* run make merged.tr.hfst for Caucasiologist transcription version
 
 ### Running the analyzers
-* run make corpora/\*corpus name\*.analyzed to analyze with the cyrillic transducer
-* run make corpora/\*corpus name\*.tr.analyzed to analyze with the IPA transducer
-* run bash corpora/tr_stats.sh \*corpus name\*.analyzed to view the statistics
+* run make check-coverage-stats to view the statistics
+* cd to corpora and run make \*corpus name\*.analyzed to analyze with the cyrillic transducer
+* cd to corpora and run make \*corpus name\*.tr.analyzed to analyze with the IPA transducer
+
+### Examples:
+* make check-coverage-stats
 
-###Example:
 * cd corpora
 * make k_newline.tr.analyzed
-* bash tr_stats.sh k_newline.tr.analyzed
-Current performance: ~65%
\ No newline at end of file
+Current performance: Naive Coverage ~82%
\ No newline at end of file
diff --git a/merged.tr.hfstol b/merged.tr.hfstol
index 89a8f35..cb041ce 100644
Binary files a/merged.tr.hfstol and b/merged.tr.hfstol differ
diff --git a/stats b/stats
new file mode 100755
index 0000000..bdd59d9
--- /dev/null
+++ b/stats
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# Print token and type coverage of an hfst-proc analyser over text files.
+#
+# usage: stats [-q] AN.hfstol [TEXT...]
+#   -q         suppress the column header line
+#   AN.hfstol  analyser file handed to hfst-proc
+#   TEXT       input file, read with cat (so "-" means standard input)
+#
+# For each TEXT one row is printed: its name, the counts of recognised and
+# unrecognised tokens and their ratio, then the same figures over distinct tokens.
+
+quiet=0
+if [[ "${1:-}" == "-q" ]]; then
+    quiet=1
+    shift
+fi
+
+if [[ $# -lt 1 ]] || [[ ! -e "$1" ]]; then
+    echo "usage: $(basename "$0") [-q] AN.hfstol [TEXT...]" >&2
+    exit 1
+fi
+
+t="$1"
+shift
+
+if [[ $quiet -eq 0 ]]; then
+    echo "source tokens+ tokens- token-cov types+ types- types-cov"
+fi
+for f in "$@"; do
+    echo -n "$f "
+    # Backslash-escape ] [ / < > in the text (presumably reserved by the stream
+    # format -- TODO confirm), keep only the ^...$ units of the hfst-proc output
+    # and count the units containing "*" as failures.
+    cat "$f" | sed 's@[][/<>]@\\\0@g' | hfst-proc -q "$t" | grep -o '\^[^$]*\$' | awk '
+        $0 ~ /\*/ { fail += 1; fail_h[$0] += 1; }
+        $0 !~ /\*/ { success += 1; success_h[$0] += 1; }
+        END {
+            tokens = success + fail;
+            types = length(success_h) + length(fail_h);
+            # "+ 0" prints a never-incremented counter as 0 instead of an empty
+            # field; the ratio guards avoid a division by zero on empty input.
+            print((success + 0) "\t" (fail + 0) "\t" (tokens ? success / tokens : 0) "\t" length(success_h) "\t" length(fail_h) "\t" (types ? length(success_h) / types : 0));
+        }'
+done
diff --git a/unrecog b/unrecog
new file mode 100755
index 0000000..1ec758d
--- /dev/null
+++ b/unrecog
@@ -0,0 +1,26 @@
+#!/bin/bash
+#
+# Count the ^...$ units of hfst-proc output that contain "*".
+#
+# usage: unrecog AN.hfstol [TEXT...]
+#   AN.hfstol  analyser file handed to hfst-proc
+#   TEXT       input files, concatenated with cat (standard input when none)
+#
+# Output is one "count unit" line per distinct unit (uniq -c), ascending by count.
+
+if [[ $# -lt 1 ]] || [[ ! -e "$1" ]]; then
+    echo "usage: $(basename "$0") AN.hfstol [TEXT...]" >&2
+    exit 1
+fi
+
+t="$1"
+shift
+
+cat "$@" \
+    | sed 's@[][/<>]@\\\0@g' \
+    | hfst-proc "$t" 2>/dev/null \
+    | grep -o '\^[^$]*\$' \
+    | grep '\*' \
+    | sort \
+    | uniq -c \
+    | sort -n