Index: languages/apertium-rus/apertium-rus.rus.dix
===================================================================
--- languages/apertium-rus/apertium-rus.rus.dix (revision 72291)
+++ languages/apertium-rus/apertium-rus.rus.dix (revision 72293)
@@ -17,7 +17,7 @@
-
+ vj
@@ -23342,8 +23342,8 @@
аленьк
- еньшее
- еньшей
+ еньше
+ еньший
ала
ал
алы
@@ -26654,6 +26654,7 @@
екитеечь
ечённечь
еченечь
+
ёкшечь
ёкечь
еклаечь
@@ -111410,6 +111411,8 @@
походя
+
+
арест
ахн
@@ -111450,7 +111453,6 @@
бро
бу
быва
-
вал
вар
вв
Index: incubator/apertium-pol-rus/dev/from_z.py
===================================================================
--- incubator/apertium-pol-rus/dev/from_z.py (revision 72291)
+++ incubator/apertium-pol-rus/dev/from_z.py (revision 72293)
@@ -38,7 +38,7 @@
for wordform in gram_d[lexeme]:
wordform[1] = wordform[1].replace('pstpss pstpss ', 'pstpss ')
if secondary:
- wordform[1] = wordform[1].replace('v impf ', '').replace('v perf ', '').replace('tv ', '').replace('iv ', '').replace(' prb', '')
+ wordform[1] = wordform[1].replace('v impf ', '').replace('v perf ', '').replace('tv ', '').replace('iv ', '')
elif lexeme in ['иметь1', 'иметь2']:
wordform[1] = wordform[1].replace('v impf ', 'vbhaver impf ').replace('v impf ', 'vbhaver perf ')
elif lexeme in ['мочь', 'хотеть']:
@@ -90,7 +90,7 @@
'''replaces tags'''
replacer = {'msc' : 'm', 'anin': 'an', 'fem' : 'f', 'inan' : 'nn', 'anim' : 'aa', 'neu' : 'nt', 'pred' : 'short', 'v' : 'vblex',
'sg1' : 'p1 sg', 'sg2' : 'p2 sg', 'sg3' : 'p3 sg', 'pl1' : 'p1 pl', 'pl2' : 'p2 pl', 'pl3' : 'p3 pl',
- 'prs' : 'pres'}
+ 'prs' : 'pres', 'pstpss' : 'pp pasv', 'pstact' : 'pp actv', 'prspss' : 'pprs pasv', 'prsact' : 'pprs actv'}
new_info = []
for wordform in frozen_info:
for replacement in replacer:
@@ -154,6 +154,8 @@
if tag in ['leng', 'use/ant', 'use/obs']:
text = text.rsplit('\n', 1)[0] + '\n' + text.rsplit('\n', 1)[1].replace('', '')
continue
+ elif tag in ['pstpss', 'pstact', 'prspss', 'prsact']:
+ continue
text += ''
text += '
\n'
text += ' \n\n'
@@ -174,8 +176,10 @@
return base, inf_ending + addition
def participle_pars(text, label, base_fin, ending):
+ replacer = {'pstpss' : '', 'pstact' : '', 'prspss' : '', 'prsact' : ''}
for el in ['pstpss', 'pstact', 'prsact', 'prspss']:
- text += ' #' + base_fin + '#' + ending + '
\n'
+ tags = replacer[el]
+ text += ' #' + base_fin + '#' + ending + '' + tags + '
\n'
return text
def whole_par(similar):
@@ -292,9 +296,9 @@
if verb in labels[label]:
st_and_fl = paradigms[label][0]
ending = st_and_fl[1]
- # ending = re.sub('[1234¹²]', '', st_and_fl[1]) # mb will help with pars with the same name
+ clean_ending = re.sub('[1234¹²]', '', st_and_fl[1]) # mb will help with pars with the same name
verb = re.sub('[1234¹²³]', '', verb)
- text += ' ' + verb.split(ending)[0] + '\n'
+ text += ' ' + verb.split(clean_ending)[0] + '\n'
thereis = True
break
if not thereis:
Index: incubator/apertium-pol-rus/dev/merge_verbs.py
===================================================================
--- incubator/apertium-pol-rus/dev/merge_verbs.py (nonexistent)
+++ incubator/apertium-pol-rus/dev/merge_verbs.py (revision 72293)
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+# ok, i'm writing it in Python 2.7 -_-
+
+import codecs
+import re
+import lxml
+from lxml import etree
+
+def get_data():  # read both input files whole; returns (rus.dix contents, russian_verbs.dix contents)
+    with codecs.open(u'../../apertium-rus/apertium-rus.rus.dix', u'r') as dix_file:
+        dix_text = dix_file.read()
+    with codecs.open('russian_verbs.dix', 'r') as verbs_file:
+        verbs_text = verbs_file.read()
+    return dix_text, verbs_text
+
+# def get_parts(the_xml):
+# dictionary = etree.fromstring(the_xml.replace('', ''))
+# paradigms = dictionary.xpath('pardefs/pardef')
+# entries = dictionary.xpath('section/e[@lm]')
+# return paradigms, entries
+
+# def del_vblex_pars(paradigms):
+# print('length of pars before ' + str(len(paradigms)))
+# pars_len = len(paradigms)
+# i = 0
+# while i < pars_len:
+# if len(paradigms[i].xpath('./e/p/r/s[@n="vblex"]')) > 0:
+# # print(paradigms[i].get('n'))
+# paradigms.pop(i)
+# pars_len -= 1
+# else:
+# i += 1
+# print('length of pars after ' + str(len(paradigms)))
+# return paradigms
+
+def del_vblex_ents(entries):  # drop, in place, each 'e' child whose first 'par' has 'vblex' in its n attribute; returns entries
+    print('entr bfr: ' + str(len(entries)))
+    for child in list(entries):  # snapshot, so removals cannot disturb the traversal
+        if child.tag == 'e' and any('vblex' in (par.get('n') or '') for par in child.xpath('par')[:1]):  # no par / no n -> kept
+            entries.remove(child)
+    print('entr aftr: ' + str(len(entries)))
+    return entries
+
+
+def del_vblex_pars_mod(paradigms):
+    '''Remove, in place, every child matching ./e/p/r/s[@n="vblex"]; print the sizes and return paradigms.'''
+    print('length of pars before ' + str(len(paradigms)))
+    # Iterate over a snapshot so that removals cannot disturb the traversal.
+    for child in list(paradigms):
+        if child.xpath('./e/p/r/s[@n="vblex"]'):
+            paradigms.remove(child)
+    print('length of pars after ' + str(len(paradigms)))
+    return paradigms
+
+def get_parts_mod(the_xml, ent_sec):  # returns (first 'pardefs' match, first ent_sec match, parsed root element)
+    root = etree.fromstring(the_xml.replace('', ''))  # NOTE(review): both literals are empty as seen here - confirm against the repository
+    pardefs = root.xpath('pardefs')[0]
+    section = root.xpath(ent_sec)[0]
+    return pardefs, section, root
+
+def write_new_file(dictionary):
+ new = codecs.open('apertium-rus.rus.dix.new', 'w')
+ # new_par_text = '\n\n'
+ # cyr_attr = {}
+ # for paradigm in paradigms[10:15]:
+ # cyr_attr[etree.tostring(paradigm, encoding='unicode', pretty_print=True).split('"')[1]] = paradigm.get('n')
+ # curr_attr = etree.tostring(paradigm, encoding='unicode', pretty_print=True).split('"')[1]
+ # new.write(etree.tostring(paradigm, encoding='unicode', pretty_print=True).replace(curr_attr, paradigm.get('n')))
+ text = etree.tostring(dictionary, encoding='unicode', pretty_print=True)
+ text = text.replace('>\n \n\n\n