commit 27837c29ac763b3f55f99a652cd14ef9c8dbf8ed Author: koguzhan Date: Fri Jul 19 14:24:29 2019 +0200 updated scripts diff --git a/dev/entry.py b/dev/entry.py index a14905a..8fc8e06 100644 --- a/dev/entry.py +++ b/dev/entry.py @@ -20,6 +20,7 @@ def entrify(line): "org":'', "al":'', "num":'', +"abbr":'', "det":'' } epl, rpe, lr = "

", "

", "" diff --git a/dev/monofy.py b/dev/monofy.py index eefcd25..43b7573 100644 --- a/dev/monofy.py +++ b/dev/monofy.py @@ -5,7 +5,7 @@ def monofy(line,left=True): '':"ADV", '':"N1", '':"NP-TOP", - '':"POST", + '':"POST", '':"NP-COG-MF", '':"NP-ORG", '':"NP-AL", @@ -56,6 +56,5 @@ if __name__=="__main__": if m not in text: sys.stdout.write(m + "\n") except KeyError: - print("Unknown tags:",line) continue diff --git a/dev/turmonofy.py b/dev/turmonofy.py index f9404c2..cfda8d0 100644 --- a/dev/turmonofy.py +++ b/dev/turmonofy.py @@ -1,4 +1,4 @@ -import sys, os +import sys, os, re def monofy(line,left=True): dic = { '':'A1', @@ -20,6 +20,7 @@ def monofy(line,left=True): '':'NP-ANT-M', '':'NP-ANT-F', '':"CA", + '':"ABBR", '':"V-IR-TV", '':"DET-QNT", '':"DET-DEM", @@ -30,12 +31,13 @@ def monofy(line,left=True): } if left: - word = line.partition("")[2].partition("")[0] + word = line.partition("")[2].partition("")[0] else: word = line.partition("")[0] - word = word.replace("","% ") + word = re.sub("", "% ", word) entry = word + ":" + word + " " + dic[tags] + " ; !" return entry @@ -48,14 +50,23 @@ if __name__=="__main__": filename = os.path.join(d, '../../apertium-uig/apertium-uig.uig.lexc') else: filename = os.path.join(d, '../../apertium-tur/apertium-tur.tur.lexc') - text = "".join([x for x in open(filename).readlines() if "V-TD" not in x]) + #text = "".join([x for x in open(filename).readlines() if "V-TD" not in x]) + present_entries = set() + entry_re = re.compile("([-\w]*):([-\w]*) *([\w-]*) ;") + with open(filename) as infile: + for line in infile.readlines(): + present_entry = re.match(entry_re, line) + if present_entry is not None: + present_entries.add(present_entry.groups()) for line in sys.stdin.readlines(): if "<" in line: try: m = monofy(line,left) - if m not in text: + new_entry = re.match(entry_re, m) + if not new_entry: + continue + if new_entry.groups() not in present_entries: sys.stdout.write(m + "\n") except KeyError: - print("Unknown tags:",line) continue