commit d06988ab17e655f8a191042b09476ba29f612b3f Author: vivekvardhanadepu Date: Sat Aug 21 18:05:10 2021 +0530 scripts:moving ambiguous to common diff --git a/scripts/common.py b/scripts/common.py index d0f4d08..4d4031f 100644 --- a/scripts/common.py +++ b/scripts/common.py @@ -7,6 +7,15 @@ import re import sys re_start = re.compile('(^[^\^]*)'); + +def ambiguous(bt): + # legislation/legislación/ordenamiento + for token in bt: + if len(token['tls']) > 1: + return True + + return False + def wrap (x): return '^' + x + '$' diff --git a/scripts/extract-biltrans-candidates.py b/scripts/extract-biltrans-candidates.py index 665988a..6b08d04 100644 --- a/scripts/extract-biltrans-candidates.py +++ b/scripts/extract-biltrans-candidates.py @@ -47,21 +47,6 @@ def pos_equal(s, t): return spos == tpos; - -def ambiguous(bt): #{ - # legislation/legislación/ordenamiento - - ambig = False; - for token in bt: #{ - tls = token['tls'] - if len(tls) > 1: #{ - return True; - #} - #} - - return ambig; -#} - reading = True; lineno = 0; total_valid = 0; @@ -82,7 +67,7 @@ while reading: #{ alignments = row[2].strip(); bt = common.tokenise_biltrans_line(bt_line); - if not ambiguous(bt): #{ + if not common.ambiguous(bt): #{ continue; #} if len(sl) < 2 and len(tl) < 2: #{ diff --git a/scripts/extract-sentences.py b/scripts/extract-sentences.py index 03392fe..4796976 100755 --- a/scripts/extract-sentences.py +++ b/scripts/extract-sentences.py @@ -5,16 +5,6 @@ import sys import common - -def ambiguous(bt): - # legislation/legislación/ordenamiento - for token in bt: - if len(token['tls']) > 1: - return True - - return False - - def extract_sentences(phrase_table_file, biltrans_out_file): lineno = 0 total_valid = 0 @@ -38,7 +28,7 @@ def extract_sentences(phrase_table_file, biltrans_out_file): sl = common.tokenise_tagger_line(row[1].strip()) tl = common.tokenise_tagger_line(row[0].strip()) - if not ambiguous(bt): + if not common.ambiguous(bt): not_ambiguous.append(str(lineno)) if len(not_ambiguous) >= 10: print("not ambiguous:", ' '.join(