# -*- coding: utf-8 -*-

import codecs
import re
import json
import copy


def forms_collector(fname):
    """Read a Zaliznyak-style wordform file and build a nested dict:
    ``{lexeme: {wordform: [morphological tags]}}``.

    Each input line looks like ``wordform:lexeme+tag1+tag2...``.
    Combining acute/grave stress marks (U+0301 / U+0300) are stripped so
    stressed and unstressed spellings compare equal.
    """
    with codecs.open(fname, 'r', 'utf-8') as f:
        # Strip stress marks and the trailing newline up front.  The old
        # code removed the newline with line[1][:-1], which chopped a tag
        # character off a final line that had no newline.
        forms = [line.replace(chr(769), '').replace(chr(768), '').rstrip('\n')
                 for line in f]
    forms = kill_duplicates(forms)

    gram_d = {}
    for form in forms:
        entry, tags = form.split('+', 1)        # 'wordform:lexeme', 'tag+tag...'
        wordform, lexeme = entry.split(':')
        gram_d.setdefault(lexeme, {})[wordform] = tags.split('+')
    return gram_d


def kill_duplicates(forms):
    """Drop duplicate lines and е-spelled duplicates of lines containing ё.

    Survivors are keyed by their ё→е normalised spelling, so:
    an exact duplicate of a stored е spelling is skipped, while a ё
    spelling (never itself a key) overwrites its previously stored е twin.
    Insertion order of first occurrences is preserved.
    """
    survivors = {}
    for line in forms:
        if line not in survivors:
            survivors[line.replace('ё', 'е')] = line
    return list(survivors.values())


if __name__ == '__main__':
    # Guarded so importing this module does not immediately read/write the
    # hard-coded relative paths below.
    info = forms_collector('../../stuffs2')  # ../../stuffs # someverbs.txt

    with codecs.open('../../verbs_z_experiment.json', 'w', 'utf-8') as f:
        json.dump(info, f, ensure_ascii=False, indent=2)
def duplicate_killer(forms):
    """Delete repeated lines, and lines that are merely the е-spelled
    twins of lines that contain ё.

    Returns a list in arbitrary order (the original also built its result
    from a set, so callers cannot rely on ordering).

    The previous implementation popped elements out of ``forms`` while
    iterating a pre-built index list: each ``pop(i)`` shifted every later
    element one slot left, so the following index silently skipped an
    element, and ``n.pop(-1)`` mutated the list being iterated.  It also
    rescanned ``withjo`` linearly for every element (O(n*m)).  A single
    set difference expresses the same intent correctly in O(n).
    """
    unique = set(forms)
    # е-spelled twins of every line that actually contains ё; these are
    # the redundant variants we want to drop.
    e_twins = {line.replace('ё', 'е') for line in unique if 'ё' in line}
    return list(unique - e_twins)