Index: incubator/apertium-pol-rus/dev/from_z.py
===================================================================
--- incubator/apertium-pol-rus/dev/from_z.py	(revision 71996)
+++ incubator/apertium-pol-rus/dev/from_z.py	(revision 71997)
@@ -8,9 +8,10 @@
 import re
 import json
 import copy
+import time
 
-def paradigm_collector(gram_d): # working on this
+def paradigm_collector(gram_d, secondary = True): # working on this
     '''returns a dictionary where keys are lemmas and values are tuples of a stem and a frozenset of flection + grammar-tag strings'''
     morph_d = {lexeme : [el[0] for el in gram_d[lexeme]] for lexeme in gram_d}
     # print('example of morph_d: ' + str(morph_d[list(morph_d.keys())[0]]))
@@ -17,15 +18,16 @@
     paradigms = {}
     for lemma in morph_d:
         new_lemma = choose_lemma(gram_d[lemma])
+        if new_lemma is not None:
             stem_len = stem_finder(morph_d[lemma], new_lemma)
             stem = lemma[:stem_len]
-            flections = frozenset([pair[0][stem_len:] + ' ' + change_tags(pair[1]) for pair in gram_d[lemma]])
+            flections = frozenset([pair[0][stem_len:] + ' ' + change_tags(pair[1], secondary) for pair in gram_d[lemma]])
             paradigms[lemma] = (stem, flections)
     # print('example of paradigms: ' + str(paradigms[list(paradigms.keys())[0]]))
     return paradigms
 
 def change_tags(grammar_tags, secondary = True):
-    grammar_tags = grammar_tags.replace(' use/ant', '').replace(' fac', '').replace('pstpss pstpss ', 'pstpss ')
+    grammar_tags = grammar_tags.replace(' use/ant', '').replace('pstpss pstpss ', 'pstpss ')
     if secondary:
         grammar_tags = grammar_tags.replace('v impf ', '').replace('v perf ', '').replace('tv ', '').replace('iv ', '').replace(' prb', '')
     return grammar_tags
@@ -32,19 +34,18 @@
 
 def choose_lemma(lexeme):
     '''takes a list of forms and grammar tags and returns a lemma'''
-    if 'pstact' in lexeme[0][1] or 'pstpss' in lexeme[0][1]:
+    if 'pstact' in lexeme[0][1] or 'pstpss' in lexeme[0][1] or 'prsact' in lexeme[0][1] or 'prspss' in lexeme[0][1]:
         for arr in lexeme:
             if 'nom' in arr[1] and 'sg' in arr[1] and 'msc' in arr[1]:
                 return arr[0]
-    elif 'imp' in lexeme[0][1]:
-        for arr in lexeme:
-            if 'sg' in arr[1]:
-                return arr[0]
     else:
         for arr in lexeme:
             if 'inf' in arr[1]:
                 return arr[0]
+    print('no lemma', end = ': ')
+    print(lexeme)
+
 def stem_finder(forms, lemma):
     '''finds length of the stem, returns an integer.
     called in paradigm_collector'''
     min_len = len(min(forms, key = len))
@@ -76,7 +77,7 @@
 
 def final_tags(frozen_info):
     '''replaces tags'''
-    replacer = {'msc' : 'm', 'anin': 'an', 'fem' : 'f', 'inan' : 'nn', 'anim' : 'aa', 'neu' : 'nt'}
+    replacer = {'msc' : 'm', 'anin': 'an', 'fem' : 'f', 'inan' : 'nn', 'anim' : 'aa', 'neu' : 'nt', 'pred' : 'short'}
     new_info = []
     for wordform in frozen_info:
         for replacement in replacer:
@@ -90,8 +91,8 @@
     #     json.dump(info, f, ensure_ascii=False, indent=2)
 
 def par_splitter(info):
-    imp = {k:[] for k in info.keys()}
-    pstact, pstpss, other = copy.deepcopy(imp), copy.deepcopy(imp), copy.deepcopy(imp)
+    pstact = {k:[] for k in info.keys()}
+    pstpss, prsact, prspss, other = copy.deepcopy(pstact), copy.deepcopy(pstact), copy.deepcopy(pstact), copy.deepcopy(pstact)
     for lexeme in info:
         for wordform in info[lexeme]:
             if 'pstpss' in wordform[1] and 'pred' not in wordform[1]:
@@ -98,36 +99,55 @@
                 pstpss[lexeme].append(wordform)
             elif 'pstact' in wordform[1] and 'adv' not in wordform[1]:
                 pstact[lexeme].append(wordform)
-            elif 'imp' in wordform[1]:
-                imp[lexeme].append(wordform)
+            elif 'prsact' in wordform[1] and 'adv' not in wordform[1]:
+                prsact[lexeme].append(wordform)
+            elif 'prspss' in wordform[1]:
+                prspss[lexeme].append(wordform)
             else:
                 other[lexeme].append(wordform)
-    for d in pstpss, pstact, imp, other:
+    for d in pstpss, pstact, prsact, prspss, other:
         for l in info:
             if d[l] == []:
                 d.pop(l)
-    return pstpss, pstact, imp, other
+    return pstpss, pstact, prsact, prspss, other
 
-def secondary_par_maker(similar, pos = 'pstpss'):
+def secondary_par_maker(similar, pos):
     text = '<pardefs>\n\n'
     for infl_class in similar:
-        text += '<pardef n="' + similar[infl_class][0] + '">\n'
+        text += '<pardef n="' + similar[infl_class][0] + '_' + pos + '">\n'
         for item in infl_class:
             item = item.split()
            text += '<e><p><l></l><r>' + item[0]
             for tag in item[2:]:
-                if tag == 'leng':
+                if tag in ['leng', 'use/ant']:
                     text = text.rsplit('\n', 1)[0] + '\n' + text.rsplit('\n', 1)[1].replace('<e>', '<e r="LR">')
                     continue
                 text += '<s n="' + tag + '"/>'
             text += '</r></p></e>\n'
         text += '</pardef>\n\n'
-    print(text)
+    return text
 
-def whole_par(classes, pstpss_par, pstact_par, imp_par):
-    pass
+def whole_par(similar):
+    labels = {}
+    text = '<pardefs>\n\n'
+    for infl_class in similar:
+        label = similar[infl_class][0]
+        labels[label] = similar[infl_class]
+        text += '<pardef n="' + label + '">\n'
+        for item in infl_class:
+            item = item.split()
+            text += '<e><p><l></l><r>' + item[0]
+            for tag in item[2:]:
+                if tag in ['leng', 'use/ant']:
+                    text = text.rsplit('\n', 1)[0] + '\n' + text.rsplit('\n', 1)[1].replace('<e>', '<e r="LR">')
+                    continue
+                text += '<s n="' + tag + '"/>'
+            text += '</r></p></e>\n'
+        text += '</pardef>\n\n'
+    return text, labels
 
 def lexeme_spliter(info):
+    tochange = {}
     for lexeme in info:
         infinitives = []
         for wordform in info[lexeme]:
@@ -135,36 +155,40 @@
                 infinitives.append(wordform)
         if len(infinitives) > 1:
             lexemes = split_correctly_mod(info[lexeme])
+            tochange[lexeme] = lexemes
+    for lemma in tochange:
+        info.pop(lemma)
+        for i in range(len(tochange[lemma])):
+            info[lemma + str(i + 1)] = tochange[lemma][i]
+            # print(lemma + str(i + 1) + ' : ' + str(tochange[lemma][i]))
+    return info
 
+# def find_criterion(infinitives): # it's easier to find criteria of distinction between two things
+#     info = [set(inf[1].split()) for inf in infinitives]
+#     print('info for criterion: ' + str(info))
+#     criterion = info[0].difference(info[1])
+#     if len(criterion):
+#         print(criterion)
+#         return list(criterion)[0]
+#     else:
+#         print('zero difference: ' + str(infinitives))
-def find_criterion(infinitives): # it's easier to find criteria of distinction between two things
-    info = [set(inf[1].split()) for inf in infinitives]
-    print('info for criterion: ' + str(info))
-    criterion = info[0].difference(info[1])
-    if len(criterion):
-        print(criterion)
-        return list(criterion)[0]
-    else:
-        print('zero difference: ' + str(infinitives))
 
+# def split_correctly(lexeme, infinitives):
+#     if len(infinitives) == 2:
+#         criterion = find_criterion(infinitives)
+#         lexemes = [], []
+#         for i in range(len(lexeme)):
+#             if criterion in lexeme[i][1]:
+#                 lexemes[0].append(lexeme[i])
+#             else:
+#                 lexemes[1].append(lexeme[i])
+#         print('lexemes ' + str(lexemes[1]))
+#         return lexemes
+#     elif len(infinitives) > 2:
+#         pass
-def split_correctly(lexeme, infinitives):
-    if len(infinitives) == 2:
-        criterion = find_criterion(infinitives)
-        lexemes = [], []
-        for i in range(len(lexeme)):
-            if criterion in lexeme[i][1]:
-                lexemes[0].append(lexeme[i])
-            else:
-                lexemes[1].append(lexeme[i])
-        print('lexemes ' + str(lexemes[1]))
-        return lexemes
-    elif len(infinitives) > 2:
-        pass
-        # split_correctly()
-    # else:
-    #     print('AAAAAAAA ' + str(infinitives))
 
-def split_correctly_mod(lexeme):
+def split_correctly_mod(lexeme): # needs debugging
     perf_iv, perf_tv, impf_iv, impf_tv = [], [], [], []
     for wordform in lexeme:
         if 'impf iv' in wordform[1]:
@@ -172,19 +196,20 @@
             impf_iv.append(wordform)
         elif 'perf iv' in wordform[1]:
             perf_iv.append(wordform)
         elif 'impf tv' in wordform[1]:
-            impf_iv.append(wordform)
+            impf_tv.append(wordform)
         elif 'perf tv' in wordform[1]:
-            perf_iv.append(wordform)
+            perf_tv.append(wordform)
         else:
             print(wordform)
     lexemes = [perf_iv, perf_tv, impf_iv, impf_tv]
     lexemes = [arr for arr in lexemes if arr != []]
+    # if len(lexemes) != 2:
+    #     print(lexemes)
+    #     quit()
     return lexemes
-
-
 
 def find_paradigm(word, inventories, similar):
     for inventory in inventories:
@@ -200,7 +225,7 @@
 
 def cleaner(info):
     cleaned_info = {}
     for lexeme in info:
-        wordforms = [[wordform[0], wordform[1].replace(' use/ant', '').replace(' fac', '')] for wordform in info[lexeme]]
+        wordforms = [[wordform[0], wordform[1].replace(' fac', '')] for wordform in info[lexeme]]
         cleaned_info[lexeme] = wordforms
     new_blistatj = [w for w in info['блистать'] if not w[0].startswith('и') and not w[0].startswith('е')]
     cleaned_info['блистать'] = new_blistatj
@@ -208,46 +233,73 @@
 
 def fun_debugging_time(similar):
     inventories = similar.values()
-    # greatest = sorted(inventories, key=len)[-1]
-    # print('length of the greatest class: ' + str(len(greatest)))
-    # print('three words from the greatest wordclass: ' + greatest[0] + ', ' + greatest[1] + ', ' + greatest[2])
-    # find_paradigm(greatest[0], inventories, similar)
-    # print('----------------')
-    # second = sorted(inventories, key=len)[-2]
-    # print('length of the second greatest class: ' + str(len(second)))
-    # print('three words from the second greatest wordclass: ' + second[0] + ', ' + second[1] + ', ' + second[2])
-    # find_paradigm(second[0], inventories, similar)
-    # print('----------------')
-    # third = sorted(inventories, key=len)[-3]
-    # print('length of the third greatest class: ' + str(len(third)))
-    # print('three words from the third greatest wordclass: ' + third[0] + ', ' + third[1]) # + ', ' + third[2])
-    # find_paradigm(third[0], inventories, similar)
-    # print('----------------')
+    greatest = sorted(inventories, key=len)[-1]
+    print('length of the greatest class: ' + str(len(greatest)))
+    print('three words from the greatest wordclass: ' + greatest[0] + ', ' + greatest[1] + ', ' + greatest[2])
+    find_paradigm(greatest[0], inventories, similar)
+    print('----------------')
+    second = sorted(inventories, key=len)[-2]
+    print('length of the second greatest class: ' + str(len(second)))
+    print('three words from the second greatest wordclass: ' + second[0] + ', ' + second[1] + ', ' + second[2])
+    find_paradigm(second[0], inventories, similar)
+    print('----------------')
+    third = sorted(inventories, key=len)[-3]
+    print('length of the third greatest class: ' + str(len(third)))
+    print('three words from the third greatest wordclass: ' + third[0] + ', ' + third[1]) # + ', ' + third[2])
+    find_paradigm(third[0], inventories, similar)
+    print('----------------')
     fourth = sorted(inventories, key=len)[-4]
     print('length of the fourth greatest class: ' + str(len(fourth)))
     print('three words from the fourth greatest wordclass: ' + fourth[0]) # + ', ' + fourth[2])
-    print(fourth)
     find_paradigm(fourth[0], inventories, similar)
 
+def entries_maker(similar, labels):
+    text = '\n'*4
+    for wordclass in similar:
+        # t1 = time.clock()
+        for verb in similar[wordclass]:
+            thereis = False
+            for label in labels:
+                if verb in labels[label]:
+                    text += '<e lm="' + verb + '"> <i>' + verb + '</i><par n="' + label + '"/></e>\n'
+                    thereis = True
+            if not thereis:
+                text += '<e lm="' + verb + '"> <i>' + verb + '</i></e>\n'
+        # t2 = time.clock()
+        # print('one wordclass: ' + str(t2 - t1))
+    return text
+
+
 def main():
     with codecs.open('../../verbs_z.json', 'r', 'utf-8') as f:
         info = json.load(f)
     info = cleaner(info)
-    lexeme_spliter(info) # needs debugging
+    lexeme_spliter(info)
 
-    pstpss, pstact, imp, other = par_splitter(info)
+    pstpss, pstact, prsact, prspss, other = par_splitter(info)
     similar_pstact = find_similar(paradigm_collector(pstact))
     similar_pstpss = find_similar(paradigm_collector(pstpss))
-    # similar_imp = find_similar(paradigm_collector(imp))
+    similar_prsact = find_similar(paradigm_collector(prsact))
+    similar_prspss = find_similar(paradigm_collector(prspss))
+    similar_other = find_similar(paradigm_collector(other, secondary = False))
 
-    # secondary_par_maker(similar_imp)
+    russian_verbs = codecs.open('russian_verbs.dix', 'w')
+    russian_verbs.write(secondary_par_maker(similar_pstpss, 'pstpss'))
+    russian_verbs.write(secondary_par_maker(similar_pstact, 'pstact'))
+    russian_verbs.write(secondary_par_maker(similar_prsact, 'prsact'))
+    russian_verbs.write(secondary_par_maker(similar_prspss, 'prspss'))
+    entries, labels = whole_par(similar_other)
+    russian_verbs.write(entries)
+    russian_verbs.write(entries_maker(similar_other, labels))
+    russian_verbs.close()
+
     # import pickle
     # pickle.dump(similar_pstact, open( "save.p", "wb" ) )
 
-    fun_debugging_time(similar_pstpss)
+    fun_debugging_time(similar_other)
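
For orientation, a minimal sketch of the pipeline this revision sets up, using a hypothetical toy lexeme. Everything here is illustrative, not repository data: the lexeme, the tag strings, the expected stem, and the sample pardef output are assumptions; the real input comes from verbs_z.json, and the sketch assumes from_z.py is importable as a module.

    # toy_from_z.py -- illustrative only
    from from_z import paradigm_collector, find_similar, whole_par

    # verbs_z.json maps each lemma to a list of [form, tags] pairs (hypothetical data):
    gram_d = {
        'читать': [
            ['читать', 'v impf tv inf'],
            ['читаю', 'v impf tv prs p1 sg'],
        ]
    }

    # secondary=False keeps the full tag strings, as main() does for the 'other' forms
    paradigms = paradigm_collector(gram_d, secondary = False)
    # expected shape, assuming stem_finder picks the common stem 'чита' (length 4):
    # {'читать': ('чита', frozenset({'ть v impf tv inf', 'ю v impf tv prs p1 sg'}))}

    similar = find_similar(paradigms)    # groups lemmas whose flection sets coincide
    pardefs, labels = whole_par(similar) # serializes each group as a <pardef>
    # pardefs then holds fragments along these lines (item[1], the POS tag, is skipped):
    # <pardef n="читать">
    #   <e><p><l></l><r>ть<s n="impf"/><s n="tv"/><s n="inf"/></r></p></e>
    #   <e><p><l></l><r>ю<s n="impf"/><s n="tv"/><s n="prs"/><s n="p1"/><s n="sg"/></r></p></e>
    # </pardef>

Entries restricted with r="LR" (written for forms tagged leng or use/ant) would be analysed but never generated, the usual Apertium idiom for variant and antiquated forms.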