commit 820767f11c6cee19e179b410fd4f389341c6f130
Author: kamush901 <kutlimuratovab0712@gmail.com>
Date:   Tue Jul 13 16:56:47 2021 +0200

    Added a code that checks for ambiguous words in a dix file (For Lexical Selection Rules)

diff --git a/texts/lexical_selection_check.py b/texts/lexical_selection_check.py
new file mode 100644
index 0000000..2776175
--- /dev/null
+++ b/texts/lexical_selection_check.py
@@ -0,0 +1,75 @@
+# lexical_selection_check.py
+# Checks the dix file for multiple translations for a single entry for both sides.
+# Generates two files, one per side, listing the ambiguous translations.
+# Output file names: left_variants.txt, right_variants.txt
+
+from argparse import ArgumentParser
+from collections import defaultdict
+
+# importing element tree
+# under the alias of ET
+import xml.etree.ElementTree as ET
+
+def get_variants(side):
+    """Return "key variant" strings for each key of side with 2+ variants; prints each hit."""
+    ambiguous = []
+    for entry, translations in side.items():
+        if len(translations) <= 1:
+            continue  # a single counterpart is not ambiguous
+        print("Found: ", entry, ":  ", translations)
+        ambiguous.extend(entry + " " + item for item in translations)
+    return ambiguous
+
+def write_variants(file_name, side_list):
+    """Write side_list to file_name (UTF-8), one entry per line; [] gives an empty file."""
+    # `with` closes the file; UTF-8 is explicit so output ignores the locale encoding.
+    with open(file_name, 'w', encoding='utf-8') as out_file:
+        previous_el = None  # group token of the previous entry; None before the first
+        for el in side_list:
+            tmp = el.split(" ")[0]  # first space-delimited token names the group
+            if previous_el not in (None, tmp):
+                out_file.write("\n")  # blank line between groups
+            previous_el = tmp
+            out_file.write(el + "\n")
+
+if __name__ == "__main__":
+    # Entry point: read the dix file named by --input_file, pair up the <l> and <r>
+    # side of every entry, and write the ambiguous ones to the two variants files.
+    print("------------------------------------------------------------------------------------------------------------------------------------")
+
+    # Parsing arguments:
+    arg_parser = ArgumentParser()
+    arg_parser.add_argument("--input_file", help="Path to the dix file you want to analyse.", required=True)
+    args = arg_parser.parse_args()
+
+    input_file = args.input_file
+    print("Starting analysis with file:", input_file)
+
+    # Parse the XML document and take its root element.
+    root = ET.parse(input_file).getroot()
+    # Maps from the serialised <l> (resp. <r>) element to every counterpart seen.
+    l_side = defaultdict(list)
+    r_side = defaultdict(list)
+    # Select entries by tag rather than root[2]: the section is not always the
+    # third child of the root, and this also covers files with several sections.
+    for element in root.iterfind("section/e"):
+        # Checking regex, skipping if present:
+        if element.find("re") is not None:
+            continue
+        left, right = element.find("p/l"), element.find("p/r")
+        # No complete <p> pair means nothing to compare; ET.tostring(None) raises.
+        if left is None or right is None:
+            continue
+        l_text = ET.tostring(left, encoding='unicode').strip()
+        r_text = ET.tostring(right, encoding='unicode').strip()
+        l_side[l_text].append(r_text)
+        r_side[r_text].append(l_text)
+
+    l_list = get_variants(l_side)
+    r_list = get_variants(r_side)
+    print("Writing results to a file:")
+    write_variants("left_variants.txt",l_list)
+    write_variants("right_variants.txt",r_list)
+    print("Finished writing to files.")
+
+