Index: incubator/apertium-pol-rus/dev/from_z.py
===================================================================
--- incubator/apertium-pol-rus/dev/from_z.py (revision 72182)
+++ incubator/apertium-pol-rus/dev/from_z.py (revision 72185)
@@ -1,6 +1,9 @@
# -*- coding: utf-8 -*-
+### todo: write a splitter which disambiguates between verbs with similar
+### infinitives but different meanings and other forms _before_ building paradigms
+
### NB: я заменяю prb на пустую строку в change_tags. пометить их как-то до этого.
### why on earh does this happen: pstpss+pstpss
@@ -70,9 +73,6 @@
else:
similar[flecs].append(lemma)
- # for inventory in similar:
- # print(str(inventory))
- # print(str(similar[inventory]))
print('number of paradigms: ' + str(len(similar)))
return similar
@@ -144,18 +144,19 @@
continue
text += ''
text += '
\n'
- text += '\n\n'
+ text += ' \n\n'
return text, labels
def make_stem(label, infl_class):
for wordform in infl_class:
- if 'inf' in wordform:
+ if 'inf' in wordform and 'pass' not in wordform:
inf_ending = wordform.split(' ')[0]
if label[-1] in '1234':
label = label[:-1]
if label[-1] in '¹²':
label = label[:-1]
- return label.split(inf_ending)[0], inf_ending
+ base = label.split(inf_ending)[0]
+ return base, inf_ending
def participle_pars(text, label, base_fin):
for el in ['pstpss', 'pstact', 'prsact', 'prspss']:
@@ -266,7 +267,7 @@
find_paradigm(fourth[0], inventories, similar)
-def entries_maker(similar, labels):
+def entries_maker(similar, labels, paradigms):
print('building entries ...')
text = '\n'*4
for wordclass in similar:
@@ -274,10 +275,13 @@
thereis = False
for label in labels:
if verb in labels[label]:
- verb = re.sub('[1234¹²]', '', verb)
- text += ' ' + verb + '\n'
+ st_and_fl = paradigms[label][0]
+ ending = re.sub('[1234¹²]', '', st_and_fl[1])
+ text += ' ' + verb.split(ending)[0] + '\n'
thereis = True
+ break
if not thereis:
+ print('Something is wrong with entries_maker: ' + verb)
text += ' ' + verb + '\n'
return text
@@ -298,25 +302,17 @@
base = line.split('#')[1]
if base:
if len(prtcp_base.split(base)) > 1:
- ending = prtcp_base.split(base)[1]
- ptcp_affix = ending.split
- line = line.split('#')[0] + ending + line.split('#')[2]
- # print('SUCCESSFULLY: ' + ending + ', base: ' + base)
+ prtcp_base = prtcp_base.split(base)[1]
+ # print('SUCCESSFULLY: ' + affix + ', base: ' + base)
else:
print('something strange: '+ line + '\nbase: ' + base + ', ptcpl_base: ' + prtcp_base)
else:
print('zero ptcpl ending: ' + line)
- line = line.split('#')[0] + line.split('#')[2]
+ line = line.split('#')[0] + prtcp_base + line.split('#')[2]
return line
-'''
-очень простой алгоритм:
--- берёшь глагол из парадигмы vblex, инфинитивную основу и окончание этого класса причастий
--- находишь от него причастие в nom sg
--- обрезаешь причастие по основе и окончанию
-'''
-
-def secondary_par_matcher(text, ptcpls, info): # pstpss, pstact, prsact, prspss
+def secondary_par_matcher(text, ptcpls, info):
+ '''finds places in vblex pars where there should be references to participle paradigms and creates them; returns a string with the paradigms'''
lines = []
for line in text.split('\n'):
if 'BASE REQUIRED' in line:
@@ -336,6 +332,8 @@
return '\n'.join(lines)
def secondary_par_writer(ptcpls):
+ '''returns a string with participle paradigms and a dictionary where keys are lemmas used in names of participle pars and values
+ are something complicated with other lemmas belonging to the same inflectional class'''
text = ''
labels_s = {}
for part in ptcpls:
@@ -348,15 +346,17 @@
return text, labels_s
-def paradigms_writer(info):
+def paradigms_writer(info):
+ '''returns a string with all paradigms'''
pstpss, pstact, prsact, prspss, other = par_splitter(info)
ptcpls = {'pstpss' : pstpss, 'pstact' : pstact, 'prsact' : prsact, 'prspss' : prspss}
text, labels_s = secondary_par_writer(ptcpls)
- similar_other = find_similar(paradigm_collector(other, secondary = False))
+ paradigms = paradigm_collector(other, secondary = False)
+ similar_other = find_similar(paradigms)
types, labels_vblex = whole_par(similar_other)
text += types
text = secondary_par_matcher(text, labels_s, info)
- text += entries_maker(similar_other, labels_vblex)
+ text += entries_maker(similar_other, labels_vblex, paradigms)
fun_debugging_time(similar_other)