Index: incubator/apertium-pol-rus/apertium-pol-rus.pol-rus.dix =================================================================== --- incubator/apertium-pol-rus/apertium-pol-rus.pol-rus.dix (revision 69701) +++ incubator/apertium-pol-rus/apertium-pol-rus.pol-rus.dix (revision 69703) @@ -625,6 +625,7 @@ +

zarównoкак

aа

aи

alboили

@@ -708,9 +709,6 @@

natomiastно

natomiastтогдакак

nibyбудто

-

nibyдескать

-

nibyподобно

-

nibyпочти

nibyсловно

nibyточно

ponieważпотомучто

@@ -36148,9 +36146,14 @@

gburowatośćгрубость

ichtiozaurихтиозавр

+ +

ludzieлюди

+

rolaихтиозавр

+

kolejихтиозавр

+

abchaskiабхазский

Index: incubator/apertium-pol-rus/dev/from_morpheus.py =================================================================== --- incubator/apertium-pol-rus/dev/from_morpheus.py (nonexistent) +++ incubator/apertium-pol-rus/dev/from_morpheus.py (revision 69703) @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- + +import codecs +import re + +def forms_collector(fname): + '''opens a with smthn from morpheus, reads it and makes a dictionary of lemmas and wordforms''' + with codecs.open(fname, 'r', 'utf-8') as f: + forms = [line.split('\t') for line in f.readlines()] + + morph_d = {} + for line in forms: + if line[1] not in morph_d: + morph_d[line[1]] = [line[0]] + else: + morph_d[line[1]].append(line[0]) + return morph_d + +def info_collector(fname): + '''opens a with smthn from morpheus, reads it and makes a dictionary of lemmas and wordforms''' + with codecs.open(fname, 'r', 'utf-8') as f: + forms = [line.split('\t') for line in f.readlines()] + + gram_d = {} + for line in forms: + if line[1] not in gram_d: + gram_d[line[1]] = line[2].split(':')[-1] + # for key in gram_d: + # print(key + ' : ' + gram_d[key]) + return gram_d + +def paradigm_collector(morph_d): + '''returns a dictionary, where keys are lemmas and values is a tuple of stem and flections''' + paradigms = {} + for lemma in morph_d: + # print(lemma) + stem_len = stem_finder(morph_d[lemma], lemma) + # for form in morph_d[lemma]: + # print(form[:stem_len] + ' : ' + form[stem_len:], end = ', ') + # print('\n') + stem = lemma[:stem_len] + flections = [form[stem_len:] for form in morph_d[lemma]] + paradigms[lemma] = (stem, flections) + return paradigms + +def stem_finder(forms, lemma): + '''finds length of the stem, returns an integer. called in paradigm_collector''' + min_len = len(min(forms, key=len)) + stems_len = min_len + for form in forms: + for i in range(min_len): + if lemma[i:i+1] != form[i:i+1]: + # print(form[i:], end = ', ') + if i < stems_len: + stems_len = i + break + return stems_len + +def find_similar(paradigms): + '''finds similar inflectional types''' + similar = {} + for lemma in paradigms: + if tuple(set(paradigms[lemma][1])) not in similar: + similar[tuple(set(paradigms[lemma][1]))] = [lemma] + else: + similar[tuple(set(paradigms[lemma][1]))].append(lemma) + + # for inventory in similar: + # print(str(inventory)) + # print(str(similar[inventory])) + print('number of paradigms: ' + str(len(similar))) + return similar + +def check_presence(lemmas): + with codecs.open('../../apertium-pol/apertium-pol.pol.dix', 'r', 'utf-8') as f: + hyp = [re.findall('\w+', line) for line in f] + already_there = set([h[0] for h in hyp if len(h) > 0]) + # print(already_there) + intersection = set(lemmas).intersection(set(already_there)) + print('intersection: ' + str(intersection)) + return set(lemmas).difference(set(already_there)) + +def to_morph(to_add, info): + with codecs.open('add_to_monodix.xml', 'w', 'utf-8') as f: + for word in to_add: + f.write(' ' + word[:-4] + '\n') + # if info[word] == 'f': + # f.write(' ' + word + '\n') + # else: + # print('aaa') + # f.write(' ' + word + '\n') + +def to_bidix(to_add, info): + with codecs.open('add_to_bidix.xml', 'w', 'utf-8') as f: + for word in to_add: + if info[word] == 'f': + f.write('

' + word + '' + word + '

\n') + else: + f.write('

' + word + '' + word + '

\n') + +morph_d = forms_collector('verbs_from_morpheus.txt') +# info = info_collector('adjectives_from_morpheus.txt') +paradigms = paradigm_collector(morph_d) +similar = find_similar(paradigms) +inventories = [similar[inventory] for inventory in similar] +# wordclass = sorted(inventories, key = len)[-1] + +for inventory in inventories: + if 'stać' in inventory: + wordclass = inventory + +print(wordclass) + +with codecs.open('added.txt', 'w', 'utf-8') as f: + for lemma in wordclass: + f.write(lemma + '\n') + +for key in similar: + if similar[key] == wordclass: + print(key) + +to_add = check_presence(wordclass) +# to_morph(to_add, info) +# to_bidix(to_add, info) + +# done = [wordclass] +# inventories = [similar[inventory] for inventory in similar if similar[inventory] not in done] + Index: languages/apertium-pol/apertium-pol.pol.dix =================================================================== --- languages/apertium-pol/apertium-pol.pol.dix (revision 69701) +++ languages/apertium-pol/apertium-pol.pol.dix (revision 69703) @@ -5281,6 +5281,16 @@

ni

+ +

j

+

i

+

i

+

j

+

+

i

+

i

+
+

ź

zi

@@ -5309,6 +5319,16 @@

niach

nie

+ + +

je

+

i

+

jom

+

je

+

jami

+

jach

+

je

+

a

@@ -6193,6 +6213,19 @@

ciachcko

cicko

+ + + + +

ziezie

+

zizie

+

ziomzie

+

zizie

+

źmizie

+

ziachzie

+

zizie

+
+

toto

@@ -8329,6 +8362,11 @@

ń

ń

+ + +

ń

+

ń

+

ńń

@@ -21869,6 +21907,7 @@ ciel imi dzie + lud śpioch astronom filozof @@ -23216,7 +23255,11 @@ żarów żylet cór - top + top + + r + kole + fasol aureol ari