Index: languages/apertium-rus/apertium-rus.rus.dix =================================================================== --- languages/apertium-rus/apertium-rus.rus.dix (revision 72291) +++ languages/apertium-rus/apertium-rus.rus.dix (revision 72293) @@ -17,7 +17,7 @@ - + vj @@ -23342,8 +23342,8 @@

аленьк

-

еньшее

-

еньшей

+

еньше

+

еньший

ала

ал

алы

@@ -26654,6 +26654,7 @@

екитеечь

ечённечь

еченечь

+

ёкшечь

ёкечь

еклаечь

@@ -111410,6 +111411,8 @@ походя + + арест ахн @@ -111450,7 +111453,6 @@ бро бу быва - вал вар вв Index: incubator/apertium-pol-rus/dev/from_z.py =================================================================== --- incubator/apertium-pol-rus/dev/from_z.py (revision 72291) +++ incubator/apertium-pol-rus/dev/from_z.py (revision 72293) @@ -38,7 +38,7 @@ for wordform in gram_d[lexeme]: wordform[1] = wordform[1].replace('pstpss pstpss ', 'pstpss ') if secondary: - wordform[1] = wordform[1].replace('v impf ', '').replace('v perf ', '').replace('tv ', '').replace('iv ', '').replace(' prb', '') + wordform[1] = wordform[1].replace('v impf ', '').replace('v perf ', '').replace('tv ', '').replace('iv ', '') elif lexeme in ['иметь1', 'иметь2']: wordform[1] = wordform[1].replace('v impf ', 'vbhaver impf ').replace('v impf ', 'vbhaver perf ') elif lexeme in ['мочь', 'хотеть']: @@ -90,7 +90,7 @@ '''replaces tags''' replacer = {'msc' : 'm', 'anin': 'an', 'fem' : 'f', 'inan' : 'nn', 'anim' : 'aa', 'neu' : 'nt', 'pred' : 'short', 'v' : 'vblex', 'sg1' : 'p1 sg', 'sg2' : 'p2 sg', 'sg3' : 'p3 sg', 'pl1' : 'p1 pl', 'pl2' : 'p2 pl', 'pl3' : 'p3 pl', - 'prs' : 'pres'} + 'prs' : 'pres', 'pstpss' : 'pp pasv', 'pstact' : 'pp actv', 'prspss' : 'pprs pasv', 'prsact' : 'pprs actv'} new_info = [] for wordform in frozen_info: for replacement in replacer: @@ -154,6 +154,8 @@ if tag in ['leng', 'use/ant', 'use/obs']: text = text.rsplit('\n', 1)[0] + '\n' + text.rsplit('\n', 1)[1].replace('', '') continue + elif tag in ['pstpss', 'pstact', 'prspss', 'prsact']: + continue text += '' text += '

\n' text += '
\n\n' @@ -174,8 +176,10 @@ return base, inf_ending + addition def participle_pars(text, label, base_fin, ending): + replacer = {'pstpss' : '', 'pstact' : '', 'prspss' : '', 'prsact' : ''} for el in ['pstpss', 'pstact', 'prsact', 'prspss']: - text += '

#' + base_fin + '#' + ending + '

\n' + tags = replacer[el] + text += '

#' + base_fin + '#' + ending + '' + tags + '

\n' return text def whole_par(similar): @@ -292,9 +296,9 @@ if verb in labels[label]: st_and_fl = paradigms[label][0] ending = st_and_fl[1] - # ending = re.sub('[1234¹²]', '', st_and_fl[1]) # mb will help with pars with the same name + clean_ending = re.sub('[1234¹²]', '', st_and_fl[1]) # mb will help with pars with the same name verb = re.sub('[1234¹²³]', '', verb) - text += ' ' + verb.split(ending)[0] + '\n' + text += ' ' + verb.split(clean_ending)[0] + '\n' thereis = True break if not thereis: Index: incubator/apertium-pol-rus/dev/merge_verbs.py =================================================================== --- incubator/apertium-pol-rus/dev/merge_verbs.py (nonexistent) +++ incubator/apertium-pol-rus/dev/merge_verbs.py (revision 72293) @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +# ok, i'm writing it in Python 2.7 -_- + +import codecs +import re +import lxml +from lxml import etree + +def get_data(): + with codecs.open(u'../../apertium-rus/apertium-rus.rus.dix', u'r') as rus_dix: + lines_rus_dix = rus_dix.read() + with codecs.open('russian_verbs.dix', 'r') as rv_par: + to_add = rv_par.read() + return lines_rus_dix, to_add + +# def get_parts(the_xml): +# dictionary = etree.fromstring(the_xml.replace('', '')) +# paradigms = dictionary.xpath('pardefs/pardef') +# entries = dictionary.xpath('section/e[@lm]') +# return paradigms, entries + +# def del_vblex_pars(paradigms): +# print('length of pars before ' + str(len(paradigms))) +# pars_len = len(paradigms) +# i = 0 +# while i < pars_len: +# if len(paradigms[i].xpath('./e/p/r/s[@n="vblex"]')) > 0: +# # print(paradigms[i].get('n')) +# paradigms.pop(i) +# pars_len -= 1 +# else: +# i += 1 +# print('length of pars after ' + str(len(paradigms))) +# return paradigms + +def del_vblex_ents(entries): + print('entr bfr: ' + str(len(entries))) + for child in entries: + if child.tag == 'e' and 'vblex' in child.xpath('par')[0].get('n'): + entries.remove(child) + print('entr aftr: ' + str(len(entries))) + return entries + + +def del_vblex_pars_mod(paradigms): + print('length of pars before ' + str(len(paradigms))) + pars_len = len(paradigms) + for child in paradigms: + if len(child.xpath('./e/p/r/s[@n="vblex"]')) > 0: + # print(child.get('n')) + paradigms.remove(child) + print('length of pars after ' + str(len(paradigms))) + return paradigms + +def get_parts_mod(the_xml, ent_sec): + dictionary = etree.fromstring(the_xml.replace('', '')) + paradigms = dictionary.xpath('pardefs')[0] + verb_entries = dictionary.xpath(ent_sec)[0] + return paradigms, verb_entries, dictionary + +def write_new_file(dictionary): + new = codecs.open('apertium-rus.rus.dix.new', 'w') + # new_par_text = '\n\n' + # cyr_attr = {} + # for paradigm in paradigms[10:15]: + # cyr_attr[etree.tostring(paradigm, encoding='unicode', pretty_print=True).split('"')[1]] = paradigm.get('n') + # curr_attr = etree.tostring(paradigm, encoding='unicode', pretty_print=True).split('"')[1] + # new.write(etree.tostring(paradigm, encoding='unicode', pretty_print=True).replace(curr_attr, paradigm.get('n'))) + text = etree.tostring(dictionary, encoding='unicode', pretty_print=True) + text = text.replace('>\n \n\n\n