# --- scripts/common.py (post-patch state, reconstructed from a
# whitespace-mangled git diff of commit ccc35b68, "cleaning scripts") ---
import re
import sys

# Matches everything before the first '^' token marker on a line.
re_start = re.compile('(^[^\^]*)')


def ambiguous(bt):
    """Return True when any biltrans token offers more than one TL reading.

    NOTE(review): the body of this function lies above the visible hunk in
    the diff (its name appears only in the '@@' section heading); it is
    reconstructed here from the caller's ``len(token['tls']) > 1`` test --
    confirm against the repository.
    """
    # e.g. legislation -> legislación / ordenamiento
    for token in bt:
        if len(token['tls']) > 1:
            return True
    return False


def wrap(x):
    """Wrap a token body in the stream delimiters '^' and '$'."""
    return '^' + x + '$'


def parse_tags(ptr, line):
    """Collect '<tag>' names starting at line[ptr].

    Returns (last_ptr, tags) where last_ptr is the index just before the
    terminating '/' or '$'. Raises IndexError on unterminated input.
    """
    tags = []
    tag = ''

    while True:
        c = line[ptr]

        if c == '$' or c == '/':
            return (ptr - 1, tags)
        elif c == '>':
            tags.append(tag)
            tag = ''
        elif c != '<':
            tag += c

        ptr += 1


def parse_sl(ptr, line):
    """Parse the source-language side of a token.

    Returns (ptr, (lemma, tags)); an unknown word ('*...') gets empty tags.
    A backslash escapes the following delimiter character.
    """
    out = ''
    if line[ptr] == '*':
        (ptr, out) = parse_unknown(ptr, line)
        return (ptr, (out, []))

    escaped = False
    while True:
        c = line[ptr]
        if c == '\\':
            escaped = True
        elif (c == '/' or c == '$') and not escaped:
            # NOTE(review): returns a bare string (not a (lemma, tags)
            # pair) for a tagless lemma, exactly as the original did.
            return (ptr, out)
        elif c == '<' and not escaped:
            (ptr, tags) = parse_tags(ptr + 1, line)
            return (ptr, (out, tags))
        else:
            out += c
            escaped = False
        ptr += 1


def parse_unknown(ptr, line):
    """Read an unknown word ('*lemma') up to an unescaped '/' or '$'."""
    out = ''
    escaped = False
    while True:
        c = line[ptr]
        if c == '\\':
            escaped = True
        elif (c == '$' or c == '/') and not escaped:
            return (ptr, out)
        else:
            out += c
            escaped = False
        ptr += 1


def parse_tls(ptr, line):
    """Parse the target-language alternatives of a biltrans token.

    Returns (ptr, tls); each entry is a (lemma, tags) pair -- or a bare
    string for a tagless alternative, preserving the original behaviour.
    """
    tls = []
    tl = ''
    out = ''
    escaped = False
    if line[ptr] == '*':
        (ptr, out) = parse_unknown(ptr, line)
        return (ptr, [(out, [])])

    while True:
        if ptr == len(line):
            tls.append(tl)
            return (ptr, tls)
        c = line[ptr]
        if c == '\\':
            escaped = True
        elif c == '/' and tl != '' and not escaped:
            tls.append(tl)
            tl = ''
        elif c == '$' and not escaped:
            if tl != '':
                tls.append(tl)
            return (ptr, tls)
        elif c == '<' and not escaped:
            (ptr, tags) = parse_tags(ptr, line)
            tls.append((tl, tags))
            tl = ''
        elif c != '/' or escaped:
            tl += c
            escaped = False
        ptr += 1


def toBiltransToken(sl, tls):
    """Flatten (lemma, tags) pairs back into 'lemma<t1><t2>' strings."""
    new_tls = []
    for tl in tls:
        new_tls.append(tl[0] + '<' + '><'.join(tl[1]) + '>')
    new_sl = sl[0] + '<' + '><'.join(sl[1]) + '>'

    return (new_sl, new_tls)
def parse_biltrans_token(ptr, line):
    """Parse one '^sl/tl1/tl2$' biltrans token starting just past the '^'.

    Returns (ptr, {'sl': str, 'tls': [str, ...]}) with the lemma+tags
    flattened back into 'lemma<t1><t2>' strings.
    """
    (ptr, sl) = parse_sl(ptr, line)
    (ptr, tls) = parse_tls(ptr + 1, line)
    (sl, tls) = toBiltransToken(sl, tls)

    token = {}
    token['sl'] = sl
    token['tls'] = tls

    return (ptr, token)


def parse_tagger_token(ptr, line):
    """Parse one tagger token; return the flattened 'lemma<tags>' string."""
    (ptr, sl) = parse_sl(ptr, line)
    sl = sl[0] + '<' + '><'.join(sl[1]) + '>'

    return (ptr, sl)


def tokenize_biltrans_line(line):
    """American-spelling alias of tokenise_biltrans_line."""
    return tokenise_biltrans_line(line)


def tokenise_biltrans_line(line):
    """Split a biltrans output line into token dicts.

    NOTE(review): the ptr returned by parse_biltrans_token is discarded --
    'for ptr in range(...)' rebinds ptr each iteration, so token bodies
    are re-scanned; harmless because only an unescaped '^' starts a token.
    """
    out = []
    escaped = False
    for ptr in range(0, len(line)):
        c = line[ptr]
        if c == '^' and not escaped:
            (ptr, token) = parse_biltrans_token(ptr + 1, line)
            out.append(token)
        elif c == '\\':
            escaped = True
        elif escaped:
            escaped = False

    return out


def tokenize_tagger_line(line):
    """American-spelling alias of tokenise_tagger_line."""
    return tokenise_tagger_line(line)


def tokenise_tagger_line(line):
    """Split a tagger output line into flattened 'lemma<tags>' strings."""
    out = []
    escaped = False
    for ptr in range(0, len(line)):
        c = line[ptr]
        if c == '^' and not escaped:
            (ptr, token) = parse_tagger_token(ptr + 1, line)
            out.append(token)
        elif c == '\\':
            escaped = True
        elif escaped:
            escaped = False

    return out


def tokenize_biltrans_line2(line):
    """American-spelling alias of tokenise_biltrans_line2."""
    return tokenise_biltrans_line2(line)
def tokenise_biltrans_line2(line):
    """State-machine tokeniser: return raw '^...$' token bodies as strings."""
    line = clean_biltrans_line(line)[1:-1]
    row = []
    token = ''
    state = 0           # 0 = inside a token, 1 = between tokens

    escaped = False

    for c in line:
        if state == 0:
            # in token
            if c == '$':
                row.append(token)
                token = ''
                state = 1
            elif c == '\\':
                continue
            else:
                token += c
        elif state == 1:
            # between tokens
            if c == '\\':
                escaped = True
            elif c == '^' and not escaped:
                state = 0
                escaped = False
            elif escaped:
                escaped = False

    return row


def clean_biltrans_line(line):
    """Strip everything before the first '^' token marker."""
    line = re_start.sub('', line)
    return line


# --- scripts/extract-biltrans-candidates.py (top of file, reconstructed) ---
# coding=utf-8
# -*- encoding: utf-8 -*-

import sys
import common

# NOTE(review): the usage string originally contained the two positional
# argument placeholders in angle brackets; they were eaten by markup
# stripping in the diff (only "extact-sentences.py [-m|--match-pos]"
# survives). Reconstructed below -- confirm wording against the repository.
if len(sys.argv) < 3 or len(sys.argv) > 4:
    print('extact-sentences.py <phrase-table> <biltrans-out> [-m|--match-pos]')
    exit(1)

match_pos = False

if len(sys.argv) == 4 and sys.argv[3] not in ['-m', '--match-pos']:
    print('extact-sentences.py <phrase-table> <biltrans-out> [-m|--match-pos]')
    exit(1)
elif len(sys.argv) == 4 and sys.argv[3] in ['-m', '--match-pos']:
    match_pos = True
# Input files: Moses-style phrase table and Apertium lexical-transfer output.
phrase_table = open(sys.argv[1])
biltrans_out = open(sys.argv[2])


def bttoken_tostr(token):
    """Render a token dict back to '^sl/tl1/tl2$' stream form."""
    return '^' + token['sl'] + '/' + '/'.join(token['tls']) + '$'


def generate_tags(token):
    """Turn 'lemma<t1><t2>' into '<s n="t1"/><s n="t2"/>' .dix tag elements.

    NOTE(review): the list comprehension was partially destroyed by markup
    stripping in the diff (surviving fragment: '["\') + "\\"/>" for x in
    tags]'); reconstructed to the conventional Apertium form -- confirm
    against the repository.
    """
    tags = filter(lambda x: x != "*>", token.split('<')[1:])
    tags = ["<s n=\"" + x.strip('>') + "\"/>" for x in tags]
    return ''.join(tags)

def generate_entry(slw, tlw):
    """Print a bilingual-dictionary entry mapping SL token slw to TL token tlw.

    NOTE(review): the original template literal was destroyed by markup
    stripping in the diff (only the '%s%s%s%s' placeholders survive);
    reconstructed here as a standard Apertium .dix entry -- confirm the
    exact layout against the repository.
    """
    out = '<e><p><l>%s%s</l><r>%s%s</r></p></e>'
    llemma = slw.split('<')[0]
    ltags = generate_tags(slw)

    rlemma = tlw.split('<')[0]
    rtags = generate_tags(tlw)

    print(out % (llemma, ltags, rlemma, rtags))


def pos_equal(s, t):
    """Return True when the POS (first tag) of SL token s matches TL token t.

    BUG FIX: the original computed *both* sides from ``s`` (so the
    comparison was always True); it also read ``split('>')[1][1:]``, i.e.
    the *second* tag -- '' for one-tag tokens. Both corrected to compare
    the first tag of each token.
    """
    spos = s.split('<')[1].rstrip('>')
    tpos = t.split('<')[1].rstrip('>')

    return spos == tpos
# Main loop: read the phrase table and the biltrans output in lockstep and
# emit candidate .dix entries for ambiguous SL words whose aligned TL word
# is missing from the lexical-transfer alternatives.
reading = True
lineno = 0
total_valid = 0

while reading:
    try:
        lineno = lineno + 1
        pt_line = phrase_table.readline().strip()
        bt_line = biltrans_out.readline().strip()

        # Both files exhausted -> stop. BUG FIX: the original fell through
        # here and relied on row[1] raising IndexError into the blanket
        # except to end the iteration.
        if bt_line == '' and pt_line == '':
            reading = False
            continue

        # Phrase-table fields: target ||| source ||| alignments ...
        row = pt_line.split('|||')
        sl = common.tokenise_tagger_line(row[1])
        tl = common.tokenise_tagger_line(row[0])
        alignments = row[2].strip()
        bt = common.tokenise_biltrans_line(bt_line)

        if not common.ambiguous(bt):
            continue
        if len(sl) < 2 and len(tl) < 2:
            continue

        # Collect, per SL word: its aligned TL word ('tls') and its
        # lexical-transfer alternatives ('bts').
        # e.g. words[0] = ('sl', ['bt1', 'bt2', ...], 'tl')
        translations = {}
        for pair in alignments.split(' '):
            ament = pair.split('-')
            tl_idx = int(ament[0])
            sl_idx = int(ament[1])
            # BUG FIX: the bound check was '> len(tl)' (off by one) and the
            # SL index was unchecked; a bad index previously aborted the
            # whole line via the blanket except instead of skipping the
            # single alignment pair.
            if tl_idx >= len(tl) or sl_idx >= len(sl) or sl_idx >= len(bt):
                continue
            slw = sl[sl_idx]
            if slw not in translations:
                translations[slw] = {}
            translations[slw]['tls'] = tl[tl_idx]
            translations[slw]['bts'] = bt[sl_idx]

        for tran in translations:
            r = translations[tran]
            tlw = r['tls']
            # Only consider words that are ambiguous in lexical transfer.
            if len(r['bts']['tls']) > 1:
                # With --match-pos, skip pairs whose POS tags differ.
                if match_pos and not pos_equal(tran, tlw):
                    continue

                # The aligned TL word is absent from the lexical-transfer
                # alternatives: propose a dictionary entry for it.
                if tlw not in r['bts']['tls']:
                    print(tlw, "not found for", tran, file=sys.stderr)
                    generate_entry(tran, tlw)

    except Exception:
        # Best-effort: skip malformed lines. Narrowed from a bare
        # 'except:' so KeyboardInterrupt/SystemExit still propagate.
        pass