Index: incubator/apertium-pol-rus/apertium-pol-rus.pol-rus.t2x
===================================================================
--- incubator/apertium-pol-rus/apertium-pol-rus.pol-rus.t2x	(revision 71826)
+++ incubator/apertium-pol-rus/apertium-pol-rus.pol-rus.t2x	(revision 71827)
@@ -1,4 +1,4 @@
-
+
Index: incubator/apertium-pol-rus/dev/cleaner.py
===================================================================
--- incubator/apertium-pol-rus/dev/cleaner.py	(revision 71826)
+++ incubator/apertium-pol-rus/dev/cleaner.py	(revision 71827)
@@ -41,5 +41,5 @@
 
 info = forms_collector('../../stuffs2') #../../stuffs # someverbs.txt
-with codecs.open('../../verbs_z_experiment.json', 'w', 'utf-8')as f:
+with codecs.open('../../verbs_z.json', 'w', 'utf-8') as f:
     json.dump(info, f, ensure_ascii=False, indent=2)
Index: incubator/apertium-pol-rus/dev/from_morpheus.py
===================================================================
--- incubator/apertium-pol-rus/dev/from_morpheus.py	(revision 71826)
+++ incubator/apertium-pol-rus/dev/from_morpheus.py	(revision 71827)
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+ # -*- coding: utf-8 -*-
 
 import codecs
 import re
Index: incubator/apertium-pol-rus/dev/from_z.py
===================================================================
--- incubator/apertium-pol-rus/dev/from_z.py	(revision 71826)
+++ incubator/apertium-pol-rus/dev/from_z.py	(revision 71827)
@@ -1,5 +1,9 @@
 # -*- coding: utf-8 -*-
+
+### NB: prb is replaced with an empty string in change_tags; such forms should be marked somehow before that happens.
+### why on earth does this happen: pstpss+pstpss
+
 import codecs
 import re
 import json
@@ -9,14 +13,38 @@
 
 def paradigm_collector(gram_d): # working at this
     '''returns a dictionary, where keys are lemmas and values is a tuple of stem and frozenset of tuples of flections and frozensets of grammar tags'''
     morph_d = {lexeme : [el[0] for el in gram_d[lexeme]] for lexeme in gram_d}
+    # print('example of morph_d: ' + str(morph_d[list(morph_d.keys())[0]]))
     paradigms = {}
     for lemma in morph_d:
-        stem_len = stem_finder(morph_d[lemma], lemma)
+        new_lemma = choose_lemma(gram_d[lemma])
+        stem_len = stem_finder(morph_d[lemma], new_lemma)
         stem = lemma[:stem_len]
-        flections = frozenset([pair[0][stem_len:] + ' ' + pair[1] for pair in gram_d[lemma]])
+        flections = frozenset([pair[0][stem_len:] + ' ' + change_tags(pair[1]) for pair in gram_d[lemma]])
         paradigms[lemma] = (stem, flections)
+    # print('example of paradigms: ' + str(paradigms[list(paradigms.keys())[0]]))
     return paradigms
 
+def change_tags(grammar_tags, secondary = True):
+    grammar_tags = grammar_tags.replace(' use/ant', '').replace(' fac', '').replace('pstpss pstpss ', 'pstpss ')
+    if secondary:
+        grammar_tags = grammar_tags.replace('v impf ', '').replace('v perf ', '').replace('tv ', '').replace('iv ', '').replace(' prb', '')
+    return grammar_tags
+
+def choose_lemma(lexeme):
+    '''takes a list of forms and grammar tags and returns a lemma'''
+    if 'pstact' in lexeme[0][1] or 'pstpss' in lexeme[0][1]:
+        for arr in lexeme:
+            if 'nom' in arr[1] and 'sg' in arr[1] and 'msc' in arr[1]:
+                return arr[0]
+    elif 'imp' in lexeme[0][1]:
+        for arr in lexeme:
+            if 'sg' in arr[1]:
+                return arr[0]
+    else:
+        for arr in lexeme:
+            if 'inf' in arr[1]:
+                return arr[0]
+
 def stem_finder(forms, lemma):
     '''finds length of the stem, returns an integer. called in paradigm_collector'''
     min_len = len(min(forms, key = len))
@@ -30,24 +58,11 @@
             break
     return stems_len
 
-def stem_finder_mod(forms, lemma):
-    '''finds length of the stem, returns an integer. called in paradigm_collector'''
-    min_len = len(min(forms, key = len))
-    stems_len = min_len
-    for form in forms:
-        for i in range(min_len):
-            if lemma[i:i+1] != form[i:i+1]:
-                # print(form[i:], end = ', ')
-                if i < stems_len:
-                    stems_len = i
-                break
-    return stems_len
-
 def find_similar(paradigms):
     '''returns dictionary where keys are flections and grammar tags and values are lists of lexemes'''
     similar = {}
     for lemma in paradigms:
-        flecs = frozenset(paradigms[lemma][1])
+        flecs = final_tags(paradigms[lemma][1])
         if flecs not in similar:
             similar[flecs] = [lemma]
         else:
@@ -59,20 +74,29 @@
     print('number of paradigms: ' + str(len(similar)))
     return similar
 
+def final_tags(frozen_info):
+    '''replaces tags'''
+    replacer = {'msc' : 'm', 'anin': 'an', 'fem' : 'f', 'inan' : 'nn', 'anim' : 'aa', 'neu' : 'nt'}
+    new_info = []
+    for wordform in frozen_info:
+        for replacement in replacer:
+            wordform = wordform.replace(replacement, replacer[replacement])
+        new_info.append(wordform)
+    return frozenset(new_info)
+
 # info = forms_collector('../../stuffs') #../../stuffs # someverbs.txt
 # with codecs.open('../../verbs_z_experiment.json', 'w', 'utf-8')as f:
 #     json.dump(info, f, ensure_ascii=False, indent=2)
 
-
 def par_splitter(info):
     imp = {k:[] for k in info.keys()}
     pstact, pstpss, other = copy.deepcopy(imp), copy.deepcopy(imp), copy.deepcopy(imp)
     for lexeme in info:
         for wordform in info[lexeme]:
-            if 'pstpss' in wordform[1]:
+            if 'pstpss' in wordform[1] and 'pred' not in wordform[1]:
                 pstpss[lexeme].append(wordform)
-            elif 'pstact' in wordform[1]:
+            elif 'pstact' in wordform[1] and 'adv' not in wordform[1]:
                 pstact[lexeme].append(wordform)
             elif 'imp' in wordform[1]:
                 imp[lexeme].append(wordform)
@@ -84,8 +108,18 @@
             d.pop(l)
     return pstpss, pstact, imp, other
 
-def pstpss_par_maker(classes):
-    pass
+def pstpss_par_maker(similar):
+    text = '<pardefs>\n\n'
+    for infl_class in similar:
+        text += '<pardef>\n'
+        for item in infl_class:
+            item = item.split()
+            text += '<e><p><l></l><r>' + item[0]
+            for tag in item[2:]:
+                text += '<s n="' + tag + '"/>'
+            text += '</r></p></e>\n'
+        text += '</pardef>\n\n'
+    print(text)
 
 def pstact_par_maker(classes):
     pass
@@ -96,6 +130,16 @@
 def whole_par(classes, pstpss_par, pstact_par, imp_par):
     pass
 
+def lexeme_spliter(info):
+    for lexeme in info:
+        infinitive = 0
+        for wordform in info[lexeme]:
+            if 'inf' in wordform[1]:
+                infinitive += 1
+        if infinitive == 2:
+            print(lexeme)
+
+
 def find_paradigm(word, inventories, similar):
     for inventory in inventories:
@@ -107,22 +151,47 @@
             if similar[key] == wordclass:
                 print(key)
 
+def fun_debugging_time(similar):
+    inventories = similar.values()
+    # greatest = sorted(inventories, key=len)[-1]
+    # print('length of the greatest class: ' + str(len(greatest)))
+    # print('three words from the greatest wordclass: ' + greatest[0] + ', ' + greatest[1] + ', ' + greatest[2])
+    # find_paradigm(greatest[0], inventories, similar)
+    # print('----------------')
+    # second = sorted(inventories, key=len)[-2]
+    # print('length of the second greatest class: ' + str(len(second)))
+    # print('three words from the second greatest wordclass: ' + second[0] + ', ' + second[1] + ', ' + second[2])
+    # find_paradigm(second[0], inventories, similar)
+    # print('----------------')
+    # third = sorted(inventories, key=len)[-3]
+    # print('length of the third greatest class: ' + str(len(third)))
+    # print('two words from the third greatest wordclass: ' + third[0] + ', ' + third[1]) # + ', ' + third[2])
+    # find_paradigm(third[0], inventories, similar)
+    # print('----------------')
+    fourth = sorted(inventories, key=len)[-4]
+    print('length of the fourth greatest class: ' + str(len(fourth)))
+    print('a word from the fourth greatest wordclass: ' + fourth[0]) # + ', ' + third[2])
+    print(fourth)
+    find_paradigm(fourth[0], inventories, similar)
+
 def main():
     with codecs.open('../../verbs_z.json', 'r', 'utf-8')as f:
         info = json.load(f)
+
+    # lexeme_spliter(info) # debug this
     pstpss, pstact, imp, other = par_splitter(info)
     similar_pstact = find_similar(paradigm_collector(pstact))
+    similar_pstpss = find_similar(paradigm_collector(pstpss))
+
+    pstpss_par_maker(similar_pstpss)
 
     # import pickle
     # pickle.dump(similar_pstact, open( "save.p", "wb" ) )
-    inventories = similar_pstact.values()
-    greatest = sorted(inventories, key=len)[2]
-    print('length of the greatest class: ' + str(len(greatest)))
-    print('the greatest class: ')
-    find_paradigm(greatest[0], inventories, similar_pstact)
-    print(greatest)
+    fun_debugging_time(similar_pstpss)
+
+
 main()
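
A note for readers tracing the revision: paradigm_collector delegates the stem/flection split to stem_finder, which measures the longest prefix shared by the (re-chosen) lemma and every form. A minimal standalone sketch of that step, using an invented passive-participle paradigm rather than real verbs_z.json data:

    # Toy run of stem_finder as committed above; the forms are invented.
    def stem_finder(forms, lemma):
        '''finds length of the stem, returns an integer'''
        min_len = len(min(forms, key=len))
        stems_len = min_len
        for form in forms:
            for i in range(min_len):
                if lemma[i:i+1] != form[i:i+1]:
                    if i < stems_len:
                        stems_len = i
                    break
        return stems_len

    forms = ['сделанный', 'сделанного', 'сделанная']  # invented pstpss forms
    lemma = 'сделанный'                               # msc nom sg, as choose_lemma would pick
    n = stem_finder(forms, lemma)
    print(lemma[:n])               # сделанн -- the stem
    print([f[n:] for f in forms])  # ['ый', 'ого', 'ая'] -- the flections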
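
The two tag-rewriting passes can likewise be traced on a single tag string. The sketch below inlines change_tags and a per-string version of what final_tags does to each wordform; the input string is invented for illustration:

    def change_tags(grammar_tags, secondary=True):
        # first pass: drop the use/ant and fac labels, collapse doubled pstpss
        grammar_tags = grammar_tags.replace(' use/ant', '').replace(' fac', '').replace('pstpss pstpss ', 'pstpss ')
        if secondary:
            # second pass: strip v impf/v perf, tv/iv and prb
            grammar_tags = grammar_tags.replace('v impf ', '').replace('v perf ', '').replace('tv ', '').replace('iv ', '').replace(' prb', '')
        return grammar_tags

    replacer = {'msc': 'm', 'anin': 'an', 'fem': 'f', 'inan': 'nn', 'anim': 'aa', 'neu': 'nt'}

    tags = 'v perf tv pstpss msc anin nom sg prb'  # invented example
    tags = change_tags(tags)                       # -> 'pstpss msc anin nom sg'
    for old, new in replacer.items():              # what final_tags does per wordform
        tags = tags.replace(old, new)
    print(tags)                                    # -> 'pstpss m an nom sg'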
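
Finally, a sketch of the kind of output pstpss_par_maker prints for one class member, assuming the standard Apertium monodix pardef markup the function targets; the flection ый and the tag string are the invented example above. Note that the item[2:] slice skips item[1] (here pstpss), so that tag never becomes an <s/> element:

    <pardef>
    <e><p><l></l><r>ый<s n="m"/><s n="an"/><s n="nom"/><s n="sg"/></r></p></e>
    </pardef>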