commit 55276b0ff666085961ea7e2200b51f3919691606 Author: kamush901 Date: Tue Jul 13 17:03:27 2021 +0200 Added a code that generates lexical selection rules from given file with dix-like formatted stems. diff --git a/texts/lexical_selection_generate.py b/texts/lexical_selection_generate.py new file mode 100644 index 0000000..8d65190 --- /dev/null +++ b/texts/lexical_selection_generate.py @@ -0,0 +1,89 @@ +# lexical_selection_generate.py +# Reads manually revieved dix elements +# Creates lexical selection rules for them in a new file. + +from argparse import ArgumentParser +from collections import defaultdict + +# importing element tree +# under the alias of ET +import xml.etree.ElementTree as ET + + +def write_variants(file_name,side_list): + with open(file_name,'w') as out_file: + previous_el = side_list[0].split(" ")[0] + for el in side_list: + tmp = el.split(" ")[0] + if previous_el != tmp: + out_file.write("\n") + previous_el = tmp + out_file.write(el) + out_file.write("\n") + out_file.close() + +if __name__ == "__main__": + # START + print("------------------------------------------------------------------------------------------------------------------------------------") + + # Parsing arguments: + arg_parser = ArgumentParser() + + arg_parser.add_argument("--input_file", help="Path to the dix-like formatted file you want rules tobe generated from.", required=True) + args = arg_parser.parse_args() + + file_in = args.input_file + print("Starting analysis with file:", file_in) + + # file_in = "tmp.txt" + file_out = file_in + "_result.txt" + + output_file = open(file_out,"w") + + # Reading file line by line: + with open(file_in, "r") as input_file: + for line in input_file.readlines(): + line = line.strip() + + # Skipping blank line: + if len(line) == 0: + continue + + # Converting line into a proper xml format: + line = "" + line + "" + + # Reading line as an XML Tree: + tree = ET.fromstring(line) + l = tree.find("l") + l_word = l.text + l_tag = l.find("s").get("n") + r = tree.find("r") + r_word = r.text + r_tag = r.find("s").get("n") + # print(l_word, l_tag,r_word, r_tag) + + # Making LRX format: + + # Multiline option: + # + # + # + + output_file.write("\n".format(l_word)) + rule1 = "\n".format(l_word,l_tag,r_word,r_tag) + print("Rule: ", rule2) + output_file.write(rule1) + output_file.write(rule2) + output_file.write("\n") + + # print(ET.tostring(r, encoding="unicode")) + + input_file.close() + output_file.close() + print("Finished. Resulting rules saved in a file: ", file_out) \ No newline at end of file