commit 893b7366f3095f3065cfe226cf1d4ca3fc17a35d Author: vivekvardhanadepu Date: Thu Jun 24 20:03:33 2021 +0530 Scripts fixup diff --git a/scripts/extract-sentences.py b/scripts/extract-sentences.py index 01aedf7..8d7ee06 100755 --- a/scripts/extract-sentences.py +++ b/scripts/extract-sentences.py @@ -22,75 +22,76 @@ def ambiguous(bt): # { # } -def extract_sentences(phrase_table, biltrans_out): +def extract_sentences(phrase_table_file, biltrans_out_file): reading = True lineno = 0 total_valid = 0 total_errors = 0 not_ambiguous = [] + with open(phrase_table_file) as phrase_table, open(biltrans_out_file) as biltrans_out: + while reading: # { + try: + lineno = lineno + 1 + pt_line = phrase_table.readline().strip() + bt_line = biltrans_out.readline().strip() + + if not bt_line.strip() and not pt_line.strip(): # { + reading = False + break + elif not bt_line.strip() or not pt_line.strip(): # { + continue - while reading: # { - try: - lineno = lineno + 1 - pt_line = phrase_table.readline().strip() - bt_line = biltrans_out.readline().strip() - - if not bt_line.strip() and not pt_line.strip(): # { - reading = False - break - elif not bt_line.strip() or not pt_line.strip(): # { - continue - - # } - row = pt_line.split('|||') - bt = common.tokenise_biltrans_line(bt_line.strip()) - sl = common.tokenise_tagger_line(row[1].strip()) - tl = common.tokenise_tagger_line(row[0].strip()) - - if not ambiguous(bt): # { - not_ambiguous.append(str(lineno)) - if len(not_ambiguous) >= 10: # { - print("not ambiguous:", ' '.join( - not_ambiguous), file=sys.stderr) - not_ambiguous = [] # } - continue - # } - if len(sl) < 2 and len(tl) < 2: # { - continue - # } + row = pt_line.split('|||') + bt = common.tokenise_biltrans_line(bt_line.strip()) + sl = common.tokenise_tagger_line(row[1].strip()) + tl = common.tokenise_tagger_line(row[0].strip()) + + if not ambiguous(bt): # { + not_ambiguous.append(str(lineno)) + if len(not_ambiguous) >= 10: # { + print("not ambiguous:", ' '.join( + not_ambiguous), file=sys.stderr) + not_ambiguous = [] + # } + continue + # } + if len(sl) < 2 and len(tl) < 2: # { + continue + # } - # Check that the number of words in the lexical transfer, and in the phrasetable matches up - if len(sl) != len(bt): # { - print("Error in line", lineno, - ": len(sl) != len(bt)", file=sys.stderr) - continue - # } + # Check that the number of words in the lexical transfer, and in the phrasetable matches up + if len(sl) != len(bt): # { + print("Error in line", lineno, + ": len(sl) != len(bt)", file=sys.stderr) + continue + # } - # cheking if the alignments are empty - if not row[2].strip(): - print("In line", lineno, ", alignments are empty", file=sys.stderr) + # cheking if the alignments are empty + if not row[2].strip(): + print("In line", lineno, + ", alignments are empty", file=sys.stderr) + continue + + # Resumption of the session + # Resumption/Reanudación of/de the/el session/sesión + # Reanudación de el periodo de sesión + # 0-0 1-1 2-2 5-3 + + print(lineno, '\t' + row[1]) + print(lineno, '\t' + bt_line) + print(lineno, '\t' + row[0]) + print(lineno, '\t' + row[2]) + print( + '-------------------------------------------------------------------------------') + total_valid += 1 + except Exception as e: + print("Error in line", lineno, ": ", e, file=sys.stderr) + total_errors += 1 continue - # Resumption of the session - # Resumption/Reanudación of/de the/el session/sesión - # Reanudación de el periodo de sesión - # 0-0 1-1 2-2 5-3 - - print(lineno, '\t' + row[1]) - print(lineno, '\t' + bt_line) - print(lineno, '\t' + row[0]) - print(lineno, '\t' + row[2]) - print( - '-------------------------------------------------------------------------------') - total_valid += 1 - except Exception as e: - print("Error in line", lineno, ": ", e, file=sys.stderr) - total_errors += 1 - continue - - # } + # } print('total:', lineno, file=sys.stderr) print('valid:', total_valid, @@ -104,5 +105,5 @@ if __name__ == '__main__': print('extact-sentences.py ') exit(1) # } - with open(sys.argv[1]) as phrase_table, open(sys.argv[2]) as biltrans_out: - extract_sentences(phrase_table, biltrans_out) + + extract_sentences(sys.argv[1], sys.argv[2]) diff --git a/scripts/ngram-count-patterns-maxent2.py b/scripts/ngram-count-patterns-maxent2.py index 463bc3f..693fc82 100755 --- a/scripts/ngram-count-patterns-maxent2.py +++ b/scripts/ngram-count-patterns-maxent2.py @@ -247,7 +247,8 @@ def ngram_count_patterns(freq_lexicon, candidates): print(features[feature], '\t', feature, file=sys.stderr) # } - exit(1) + # exit(1) + return for slword in meevents: # { if len(sl_tl[slword]) < 2: # {