# -*- coding: utf-8 -*-

import codecs
import re
import json
import copy


def forms_collector(fname):
    """Read a Zaliznyak-style wordform file and build a nested dict:
    ``{lexeme: {wordform: [morphological tags]}}``.

    Each input line looks like ``wordform:lexeme+tag1+tag2...``.
    Combining acute/grave stress marks (U+0301 / U+0300) are stripped so
    stressed and unstressed spellings compare equal.
    """
    with codecs.open(fname, 'r', 'utf-8') as f:
        # Strip stress marks and the trailing newline up front.  The old
        # code removed the newline with line[1][:-1], which chopped a tag
        # character off a final line that had no newline.
        forms = [line.replace(chr(769), '').replace(chr(768), '').rstrip('\n')
                 for line in f]
    forms = kill_duplicates(forms)

    gram_d = {}
    for form in forms:
        entry, tags = form.split('+', 1)        # 'wordform:lexeme', 'tag+tag...'
        wordform, lexeme = entry.split(':')
        gram_d.setdefault(lexeme, {})[wordform] = tags.split('+')
    return gram_d


def kill_duplicates(forms):
    """Drop duplicate lines and е-spelled duplicates of lines containing ё.

    Survivors are keyed by their ё→е normalised spelling, so:
    an exact duplicate of a stored е spelling is skipped, while a ё
    spelling (never itself a key) overwrites its previously stored е twin.
    Insertion order of first occurrences is preserved.
    """
    survivors = {}
    for line in forms:
        if line not in survivors:
            survivors[line.replace('ё', 'е')] = line
    return list(survivors.values())


if __name__ == '__main__':
    # Guarded so importing this module does not immediately read/write the
    # hard-coded relative paths below.
    info = forms_collector('../../stuffs2')  # ../../stuffs # someverbs.txt

    with codecs.open('../../verbs_z_experiment.json', 'w', 'utf-8') as f:
        json.dump(info, f, ensure_ascii=False, indent=2)
def duplicate_killer(forms):
    """Delete repeated lines, and lines that are merely the е-spelled
    twins of lines that contain ё.

    Returns a list in arbitrary order (the original also built its result
    from a set, so callers cannot rely on ordering).

    The previous implementation popped elements out of ``forms`` while
    iterating a pre-built index list: each ``pop(i)`` shifted every later
    element one slot left, so the following index silently skipped an
    element, and ``n.pop(-1)`` mutated the list being iterated.  It also
    rescanned ``withjo`` linearly for every element (O(n*m)).  A single
    set difference expresses the same intent correctly in O(n).
    """
    unique = set(forms)
    # е-spelled twins of every line that actually contains ё; these are
    # the redundant variants we want to drop.
    e_twins = {line.replace('ё', 'е') for line in unique if 'ё' in line}
    return list(unique - e_twins)