commit 0c3379eaeb3f4b4842cbd914e63fb3493caaa9db Author: MambetniyazovAmir Date: Mon Jun 15 19:48:17 2020 +0500 rebasing master diff --git a/apertium-uzb-kaa.uzb-kaa.dix b/apertium-uzb-kaa.uzb-kaa.dix index e4ccee4..14b356d 100644 --- a/apertium-uzb-kaa.uzb-kaa.dix +++ b/apertium-uzb-kaa.uzb-kaa.dix @@ -5455,7 +5455,7 @@

kuchli kúshli

kuchsiz kúshsiz

kuchsizlik kúshsizlik

-

kuchuk kushık

+

kuchuk kúshik

kudurat qayǵı

kufron kufron

kuf-suf kuf-suf

@@ -19492,7 +19492,7 @@

tashqari sırt

Oygul Aygúl

Baxtiyor Baxtıyar

-

ular olar

+

ular olar

ulkan úlken

qachon qashan

xursand quwanıshli

@@ -19502,7 +19502,7 @@

ber ber

yoki yamasa

yosh jas

-

sen sen

+

sen sen

yer jer

kir kir

bolakay bala

@@ -19517,6 +19517,8 @@

yeyishjew

bekinishjasırınıw

bekinjasırın

+

koʼrkór

+

narsa nárse

diff --git a/tests/wer.sh b/tests/wer.sh index cc7f94f..55f44f7 100755 --- a/tests/wer.sh +++ b/tests/wer.sh @@ -3,15 +3,15 @@ # apertium-quality -cat ../texts/story.uzb.txt | apertium -d . uzb-kaa > ../texts/story.uzb-kaa.txt +cat texts/story.uzb.txt | apertium -d . uzb-kaa > texts/story.uzb-kaa.txt -cat ../texts/story.kaa.txt | apertium -d . kaa-uzb > ../texts/story.kaa-uzb.txt +cat texts/story.kaa.txt | apertium -d . kaa-uzb > texts/story.kaa-uzb.txt echo 'WER uzb-kaa:' -perl ../apertium-eval-translator/apertium-eval-translator-line.pl -test ../texts/story.uzb-kaa.txt -ref ../texts/story.kaa.txt > ../texts/uzb-kaa-wer.txt -grep '(WER)' ../texts/uzb-kaa-wer.txt +perl ../../apertium-eval-translator/apertium-eval-translator-line.pl -test texts/story.uzb-kaa.txt -ref texts/story.kaa.txt > texts/uzb-kaa-wer.txt +grep '(WER)' texts/uzb-kaa-wer.txt echo 'WER kaa-uzb:' -perl ../apertium-eval-translator/apertium-eval-translator-line.pl -test ../texts/story.kaa-uzb.txt -ref ../texts/story.uzb.txt > ../texts/kaa-uzb-wer.txt -grep '(WER)' ../texts/kaa-uzb-wer.txt +perl ../../apertium-eval-translator/apertium-eval-translator-line.pl -test texts/story.kaa-uzb.txt -ref texts/story.uzb.txt > texts/kaa-uzb-wer.txt +grep '(WER)' texts/kaa-uzb-wer.txt diff --git a/texts/story.kaa.txt b/texts/story.kaa.txt index 38d00b0..0ea1d0a 100644 --- a/texts/story.kaa.txt +++ b/texts/story.kaa.txt @@ -1,6 +1,6 @@ 1 Baxtıyar QAY JERDE? 2 Baxtıyar benen Aygúl baǵda. Búgin hawa-rayı jaqsı, júdá jıllı. Biraq keshe júdá suwıq boldı. Olar dalada oynay almadı. Baxtıyar benen Aygúl oynaǵandı jaqsı kóredi, olar barqulla úlken úydiń aldındaǵı baǵda birge oynaydı. -3 Baxtıyar altı jasar kishkene bala. Qız onıń qarındası, ol bes jasta. Baxtıyartıń kishkene bir kúshigi bar, házir ol da baǵda. Kúshik balalar menen oynaǵandı jaqsı kóredi. Kúshigi házir júdá quwanıshlı. +3 Baxtıyar altı jasar kishkene bala. Qız onıń qarındası, ol bes jasta. Baxtıyardıń kishkene bir kúshigi bar, házir ol da baǵda. Kúshik balalar menen oynaǵandı jaqsı kóredi. Kúshigi házir júdá quwanıshlı. 4 Al Aygúldiń kúshigi barma? Yaq, Aygúldiń kúshigi joq, onıń pıshıǵı bar. Biraq pıshıǵı úyde, uyıqlap atır. 5 Olardıń anası pıshıǵı menen birge úyde, ol aynadan Baxtıyar benen Aygúldiń oynaǵanına qarap tur. Baxtıyar eski úlken bir terekke qaray qattı júgirip baratır, ol Aygúlden jasırınıp atır. Ne ushın ekenligin bilesizbe? Aygúl qolı menen kózlerin jawıp otır. Ol hesh nárseni kórip turǵanı joq, ol sanap atır. Aygúl ne ushın bunday qılıp atır? Hám Baxtıyar terektiń qasında ne qılıp atır? 6 Bul oyın. Aygúl sanap bolǵannan keyin átirapına qaradı. Ol "Baxtıyar qay jerge ketti? Onı kórdińizbe?" - dep izlep atır. diff --git a/uzb-kaa.rlx b/uzb-kaa.rlx new file mode 100644 index 0000000..7279ea6 --- /dev/null +++ b/uzb-kaa.rlx @@ -0,0 +1,307 @@ +DELIMITERS = "<.>" "" "" ; +SOFT-DELIMITERS = "<,>" ; + +LIST BOS = (>>>) sent ; # Beginning of sentence +LIST EOS = (<<<) sent ; # End of sentence +LIST Lpar = lpar; +LIST Rpar = rpar ; + +LIST N = n ; +LIST V = v ; +LIST Prop = np ; +LIST Pron = prn ; +LIST Num = num ; +LIST A = adj ; +LIST Det = det ; +LIST Adv = adv ; +LIST CC = cnjcoo ; +LIST CS = cnjsub ; +LIST Interj = ij ; +LIST Post = post ; +LIST Cop = cop ; +LIST IV = iv ; +LIST TV = tv ; +LIST Poss = px1sg px2sg px3sg px1pl px2pl px3pl ; +LIST Poss3 = px3sg px3sp px3pl ; +LIST Poss2 = px2sg ; +LIST Past = past ; +LIST Px3Sp = px3sp ; +LIST Px2Sg = px2sg ; + +LIST 1PS = p1 sg ; +LIST 2PS = p2 sg ; +LIST 3PS = p3 sg ; +LIST 1Pl = p1 pl ; +LIST 2Pl = p2 pl ; +LIST 3Pl = p3 pl ; + +LIST Person = p1 p2 p3 ; + +LIST Nom = nom ; +LIST Gen = gen ; +LIST Abe = abe ; +LIST Acc = acc ; +LIST Dat = dat ; +LIST Loc = loc ; + +LIST Subst = subst ; +LIST Attr = attr ; +LIST Advl = advl ; + +LIST Ant = (np ant); +LIST Cog = (np cog); + +LIST Recip = rec ; +LIST Caus = caus ; + +LIST FiniteVerb = pres aor past ifi ifi_evid fut fut_plan imp opt pih ; + +LIST Ger = ger_past ger_abst ger_inf ger4 ger5 gna2 gna3 gna4 gpr_rsub; #these numbers are to be replaced with actual ones, right? + +LIST Vaux = vaux ; + +LIST rsub = gpr_rsub ; + +LIST Gerinf = ger_inf ; + +LIST Imper = imp ; + +LIST Mistake = mistake ; + +LIST Colon = ":" ; + +SET FINITE = V - Ger ; + +SET PRE-N = Det | Num | Attr | A | Gen | ("-") ; # CC + +SET NOMINAL-HEAD = N | Ger | Subst ; + +SET WORD = N | V | A | Post | Pron | Det | Adv | CC | CS | Interj | Num | ("\?") ; +SET MARK = (",") | ("\\") | ("\;") | ("–"); #" +SET WORDMARK = WORD | MARK ; +SET PHRASEMARK = ("\\") | ("\;") ; #" + + + +REMOVE Mistake ; +#why is there a tag like this anyway? + + +#To be fixed, ugly + +REMOVE Imper IF (NOT 0C Imper) ; + +#There can be no gerund at the end of a sentence + +REMOVE Ger IF (1 EOS OR Lpar); + + +# N+attr selections +REMOVE Attr IF (0 A); +#select adjectives over n.attr + +REMOVE Attr IF (NOT 1 PRE-N) (NOT 1 NOMINAL-HEAD) (NOT 1/1 NOMINAL-HEAD) ; + +REMOVE N + Nom IF (0 Attr OR Nom) (1C Nom) ; + +SELECT Attr (0 Nom) (1C Px3Sp + Nom) ; + +# + + +SELECT Pron IF (0 N) ; + +#REMOVE Cop IF (NOT 1C EOS); + +SELECT SUB:1 Cop IF (1 EOS) ; + + REMOVE SUB:1 Cop IF + (NOT 1 EOS OR MARK OR ("da")) +; + REMOVE SUB:1 Cop IF + (-1 BOS OR MARK) ## Headings or enumerations + (NOT 1 EOS) +; + + SELECT SUB:1 Cop IF + (1 (lpar)) + (2* (rpar) BARRIER EOS) + (NOT -1 Colon) + ; + + + +# + SELECT SUB:1 Cop IF + (1 MARK) + (2*/1 Cop BARRIER EOS) + (NOT 0 Interj) ## Дұрыс, оның мысығы бар. + (NOT 0 FiniteVerb) ## 74 ... барлығы 53 ел [0]қатысты. + (NOT 2 N) +; +## Жоқ, Айгүлдің күшігі [0]жоқ, оның мысығы [0]бар. + + SELECT SUB:1 Cop IF + (1 EOS) + (NOT 0 V OR Vaux) + ; + + + + +#SInce the 3 singular can be mute in some cases, better to remove it if it is not the end of the sentence! + +REMOVE 3PS IF (NOT 1 EOS) ; + +# REMOVE the Intransitive if the previous item is in accusative form + +REMOVE IV IF (-1C Acc) ; + +SELECT TV IF (-1C Acc) ; + +#If following item is an ADV, then select Pron reading + +SELECT Pron IF (0C Det OR Pron) (1 Adv) ; + +# Select Proper noun if it starts with a capital letter while not being after a full stop + +SELECT Prop IF (0 N)(0 Prop) (0 ("[:upper:]+[:lower:]*"r))(NOT -1 BOS) ; + +# IF there is a Noun which is both np and n, and the following name is a cog, then the first one may be as well a proper noun + +SELECT Prop IF (0 N) (0 Ant)(1 Cog) (-1 BOS); + +#If we have a form which is both present as N1 or derivative gerund, select N1 + +SELECT N IF (0 N) (0 Ger) ; + + + +#Construction gen + poss (ataturk'un cumhuriyeti) + +SELECT Poss3 IF (-1 Gen) ; + +SELECT Gen IF (1C Poss3) ; + + +#### POSTPOSITIONS ###### + +"" SELECT Post IF (-1 Ger + Poss) ; + +"" SELECT Post IF (-1 Ger) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT Post IF (-1 Nom) ; + +"" SELECT Post IF (-1 Abe) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT Post IF (-1 Abe) ; + +"" SELECT Post IF (-1 Abe) ; + +"" SELECT Post IF (-1 Abe) ; + +"" SELECT Post IF (-1 Abe) ; + +"" SELECT Post IF (-1 Abe) ; + +"" SELECT Post IF (-1 Abe) ; + +"" SELECT Post IF (-1 Abe) ; + +"" SELECT Post IF (-1 Abe) ; + +"" SELECT Post IF (-1 Gen) ; + +"" SELECT N IF (-1 Num) ; + +"" SELECT Post IF (-1 Dat) ; + +"" SELECT CS IF (-1 MARK) (1 V) ; +#“Qadimgilarga: “Zino qilma”, – deb aytilganini eshitgansizlar. + +# ATTRIBUTIVE ADJ + +#If an adjective is right before a finite verb, select its adverbial reading + +SELECT Advl IF (1C FINITE) ; + +#If an adjective is right before the end of a sentence, we can safely expect it to be an adjective. + +SELECT N IF (0 Ger); +#kurash + +REMOVE Advl IF (0 A)(1 EOS) ; + +#If the following item does not include a copula in its reading, discard subst from the adjective. + +REMOVE Subst IF (0 A) (1 EOS) (NOT 1 Cop) ; + +REMOVE A IF (0 A) (NOT 1 Subst) ; + +#If an adjective is right before a numeral + noun it is an adjective for sure + +SELECT A IF (1C Num) (2C N) ; + +## select A if inbetween nouns +SELECT A IF (-1 N) (1 N) ; + +## select A if before copula (idi, iken) +SELECT A IF (1 Cop) ; + +# 2nd Singular Possessive + +REMOVE Poss2 IF (NOT 0 Gen)(1 Poss3) ; + +REMOVE Gerinf IF (0 Loc) ; +#ketmoqda + +#VERBS + +#Select FINITE FORM (in this case past) if it is the last word of the sentence + +SELECT Past IF (1 EOS) ; + +#Remove V + V reading + +REMOVE V IF (1 FINITE) (2 EOS) ; + +SELECT A IF (-1 N) (0 Nom) (1 N) ; +# Aholining koʻpchilik qismi + +REMOVE Attr IF (1C Cop); + ## Shu bilan birga kamolchilik inqilobi kator salbiy xususiyatlarga xam ega edi. + +REMOVE Interj IF (NOT -1 BOS) (NOT 1 EOS) ; + #yoq, bar + +SELECT rsub IF (-1 N) (1 N) ; +#yor olgan tasvir + +SELECT Ger IF (0 A) ; +#o'tgan + +SELECT Sub:1 FINITE IF + #(0/1 Ger) # FIXME: why does this only work when commented + (1 EOS OR PHRASEMARK) ; + +SELECT Prop IF (0 N) (-1 Post) ; # e.g. asal/Asal