Index: incubator/apertium-pol-rus/dev/from_z.py =================================================================== --- incubator/apertium-pol-rus/dev/from_z.py (revision 72182) +++ incubator/apertium-pol-rus/dev/from_z.py (revision 72185) @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- +### todo: write a splitter which disambiguates between verbs with similar +### infs but differnt meanings and other forms _before_ building paradigms + ### NB: я заменяю prb на пустую строку в change_tags. пометить их как-то до этого. ### why on earh does this happen: pstpss+pstpss @@ -70,9 +73,6 @@ else: similar[flecs].append(lemma) - # for inventory in similar: - # print(str(inventory)) - # print(str(similar[inventory])) print('number of paradigms: ' + str(len(similar))) return similar @@ -144,18 +144,19 @@ continue text += '' text += '

\n' - text += '\n\n' + text += ' \n\n' return text, labels def make_stem(label, infl_class): for wordform in infl_class: - if 'inf' in wordform: + if 'inf' in wordform and 'pass' not in wordform: inf_ending = wordform.split(' ')[0] if label[-1] in '1234': label = label[:-1] if label[-1] in '¹²': label = label[:-1] - return label.split(inf_ending)[0], inf_ending + base = label.split(inf_ending)[0] + return base, inf_ending def participle_pars(text, label, base_fin): for el in ['pstpss', 'pstact', 'prsact', 'prspss']: @@ -266,7 +267,7 @@ find_paradigm(fourth[0], inventories, similar) -def entries_maker(similar, labels): +def entries_maker(similar, labels, paradigms): print('building entries ...') text = '\n'*4 for wordclass in similar: @@ -274,10 +275,13 @@ thereis = False for label in labels: if verb in labels[label]: - verb = re.sub('[1234¹²]', '', verb) - text += ' ' + verb + '\n' + st_and_fl = paradigms[label][0] + ending = re.sub('[1234¹²]', '', st_and_fl[1]) + text += ' ' + verb.split(ending)[0] + '\n' thereis = True + break if not thereis: + print('Something is wrong with entries_maker: ' + verb) text += ' ' + verb + '\n' return text @@ -298,25 +302,17 @@ base = line.split('#')[1] if base: if len(prtcp_base.split(base)) > 1: - ending = prtcp_base.split(base)[1] - ptcp_affix = ending.split - line = line.split('#')[0] + ending + line.split('#')[2] - # print('SUCCESSFULLY: ' + ending + ', base: ' + base) + prtcp_base = prtcp_base.split(base)[1] + # print('SUCCESSFULLY: ' + affix + ', base: ' + base) else: print('something strange: '+ line + '\nbase: ' + base + ', ptcpl_base: ' + prtcp_base) else: print('zero ptcpl ending: ' + line) - line = line.split('#')[0] + line.split('#')[2] + line = line.split('#')[0] + prtcp_base + line.split('#')[2] return line -''' -очень простой алгоритм: --- берёшь глагол из парадигмы vblex, инфинитивную основу и окончание этого класса причастий --- находишь от него причастие в nom sg --- обрезаешь причастие по основе и окончанию -''' - -def secondary_par_matcher(text, ptcpls, info): # pstpss, pstact, prsact, prspss +def secondary_par_matcher(text, ptcpls, info): + '''finds places in vblex pars where there should be references to participle paradigms and makes it, returns string with paradigms''' lines = [] for line in text.split('\n'): if 'BASE REQUIRED' in line: @@ -336,6 +332,8 @@ return '\n'.join(lines) def secondary_par_writer(ptcpls): + '''returns string with participle paradigms and a dictionary where keys are lemmas used in names of ptcple pars and values + are something complicated with other lemmaas belonging to the same inflectional class''' text = '' labels_s = {} for part in ptcpls: @@ -348,15 +346,17 @@ return text, labels_s -def paradigms_writer(info): +def paradigms_writer(info): + '''returns a string with all paradigms''' pstpss, pstact, prsact, prspss, other = par_splitter(info) ptcpls = {'pstpss' : pstpss, 'pstact' : pstact, 'prsact' : prsact, 'prspss' : prspss} text, labels_s = secondary_par_writer(ptcpls) - similar_other = find_similar(paradigm_collector(other, secondary = False)) + paradigms = paradigm_collector(other, secondary = False) + similar_other = find_similar(paradigms) types, labels_vblex = whole_par(similar_other) text += types text = secondary_par_matcher(text, labels_s, info) - text += entries_maker(similar_other, labels_vblex) + text += entries_maker(similar_other, labels_vblex, paradigms) fun_debugging_time(similar_other)