commit 39bce12d39edccdc4d49333705974c29eb9d255c
Author: Amr Keleg
Date:   Sun Jun 23 18:08:03 2019 +0200

    Implement the evaluation scripts

diff --git a/scripts/eval/constraintgrammar_fit.py b/scripts/eval/constraintgrammar_fit.py
new file mode 100644
index 0000000..c182900
--- /dev/null
+++ b/scripts/eval/constraintgrammar_fit.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+import os
+import argparse
+import tempfile
+import subprocess
+from pathlib import Path
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='fit n models using n folds')
+    parser.add_argument('-i', '--input_directory', required=True,
+                        help='input directory of the n folds')
+    parser.add_argument('-b', '--apertium_bin', required=True,
+                        help='a compiled dictionary')
+    parser.add_argument('-c', '--corpus', required=True,
+                        help='an untagged corpus')
+    parser.add_argument('-cg', '--constraint_grammar', required=True,
+                        help='a compiled constraint grammar')
+    parser.add_argument('-o', '--output_directory', required=True,
+                        help='output directory for weighted dictionaries')
+    args = parser.parse_args()
+    input_directory = args.input_directory
+    output_directory = args.output_directory
+    apertium_bin = args.apertium_bin
+    corpus = args.corpus
+    constraint_grammar = args.constraint_grammar
+    if not os.path.exists(output_directory):
+        os.mkdir(output_directory)
+
+    temp_dir = tempfile.mkdtemp()
+
+    # The weightlist is estimated once from the untagged corpus,
+    # so all folds share the same weights
+    temp_weightlist = Path(temp_dir, 'temp_weightlist')
+    subprocess.run(['./unannotated-corpus-to-weightlist',
+                    apertium_bin, corpus, constraint_grammar, temp_weightlist])
+
+    for input_file in sorted(os.listdir(input_directory)):
+        # Generate a weighted bin file named after the fold
+        subprocess.run(['./lt-weight',
+                        apertium_bin,
+                        Path(output_directory, '{}.bin'.format(input_file)),
+                        temp_weightlist])
diff --git a/scripts/eval/corpus_split.py b/scripts/eval/corpus_split.py
new file mode 100755
index 0000000..e35c084
--- /dev/null
+++ b/scripts/eval/corpus_split.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+import os
+import random
+import argparse
+from pathlib import Path
+
+def write_to_file(lines_list, file_name):
+    with open(file_name, 'w') as f:
+        f.write('\n'.join(lines_list))
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='split a tagged corpus into n folds')
+    parser.add_argument('input_tagged_corpus',
+                        type=argparse.FileType('r'),
+                        help='a tagged corpus')
+    parser.add_argument('-n', '--no_of_folds', type=int, default=5,
+                        help='number of folds')
+    parser.add_argument('-s', '--seed', type=int, default=42,
+                        help='random seed for reproducible splits')
+    parser.add_argument('-o', '--output_directory', required=True,
+                        help='output directory')
+    args = parser.parse_args()
+    cv = args.no_of_folds
+    output_directory = args.output_directory
+    input_tagged_corpus = args.input_tagged_corpus
+    random.seed(args.seed)
+
+    if not os.path.exists(output_directory):
+        os.mkdir(output_directory)
+
+    splitted_corpus = [[] for _ in range(cv)]
+
+    # Assign each line to a randomly chosen fold
+    for line in input_tagged_corpus:
+        splitted_corpus[random.randint(0, cv - 1)].append(line.strip())
+
+    # Extract the base file name of the corpus
+    corpus_file_name = Path(input_tagged_corpus.name).name
+
+    for cv_index in range(cv):
+        write_to_file(splitted_corpus[cv_index],
+                      str(Path(output_directory, '{}_{}'.format(corpus_file_name, cv_index))))
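Note that corpus_split.py assigns each line to a fold independently at random, so the folds are only equal-sized in expectation. A minimal sketch of the same sampling, using an invented three-line tagged corpus:

    import random

    random.seed(42)
    folds = [[] for _ in range(5)]
    for line in ['^With/with<pr>$', '^a/a<det>$', '^dog/dog<n>$']:
        # Same per-line assignment as in corpus_split.py
        folds[random.randint(0, 4)].append(line)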
diff --git a/scripts/eval/equalweight_fit.py b/scripts/eval/equalweight_fit.py
new file mode 100644
index 0000000..c3744b1
--- /dev/null
+++ b/scripts/eval/equalweight_fit.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+import os
+import argparse
+import tempfile
+import subprocess
+from pathlib import Path
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='fit n models using n folds')
+    parser.add_argument('-i', '--input_directory', required=True,
+                        help='input directory of the n folds')
+    parser.add_argument('-b', '--apertium_bin', required=True,
+                        help='a compiled dictionary')
+    parser.add_argument('-o', '--output_directory', required=True,
+                        help='output directory for weighted dictionaries')
+    args = parser.parse_args()
+    input_directory = args.input_directory
+    output_directory = args.output_directory
+    apertium_bin = args.apertium_bin
+    if not os.path.exists(output_directory):
+        os.mkdir(output_directory)
+
+    temp_dir = tempfile.mkdtemp()
+
+    # The equal weightlist does not depend on the folds,
+    # so it only needs to be generated once
+    temp_weightlist = Path(temp_dir, 'temp_weightlist')
+    subprocess.run(['./equal-weightlist', temp_weightlist])
+
+    for input_file in sorted(os.listdir(input_directory)):
+        # Generate a weighted bin file named after the fold
+        subprocess.run(['./lt-weight',
+                        apertium_bin,
+                        Path(output_directory, '{}.bin'.format(input_file)),
+                        temp_weightlist])
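The three *_fit.py scripts share one contract: given a directory of folds, each emits one weighted dictionary per fold, named '<fold>.bin'. metrics_report.py (below) pairs folds with dictionaries by sorted file name, which this naming scheme is assumed to preserve; a hypothetical sanity check (the directory names are made up):

    import os

    folds = sorted(os.listdir('folds'))
    bins = sorted(os.listdir('equalweight_bins'))
    # Each weighted dictionary should line up with the fold it was built for
    assert bins == ['{}.bin'.format(fold) for fold in folds]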
diff --git a/scripts/eval/eval_utils.py b/scripts/eval/eval_utils.py
new file mode 100644
index 0000000..dfb1bf8
--- /dev/null
+++ b/scripts/eval/eval_utils.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+import subprocess
+from pathlib import Path
+
+def get_apertium_analyses(X, weighted_bin, base_dir, only_one_analysis=True):
+    # TODO: WHY DOES ADDING A DOT WORK AS A SEPARATOR?
+    joined_X = ' .\n'.join(X + ['\n'])
+
+    base_input_as_file = str(Path(base_dir, 'base_input_as_file'))
+    with open(base_input_as_file, 'w') as f:
+        f.write(joined_X)
+
+    deformatted_input = str(Path(base_dir, 'formatted_input'))
+    assert(subprocess.run(['apertium-destxt', '-n', base_input_as_file, deformatted_input]).returncode == 0)
+
+    analysed_output = str(Path(base_dir, 'analysis_output'))
+    reformatted_output = str(Path(base_dir, 'reformatted_output'))
+    # TODO: Is cleaning the reformatted output file needed?
+
+    processing_command = ['lt-proc']
+    if only_one_analysis:
+        # The flag and its value must be separate argv entries
+        processing_command.extend(['-N', '1'])
+    processing_command.extend([weighted_bin, deformatted_input, analysed_output])
+
+    subprocess.run(processing_command)
+    subprocess.run(['apertium-retxt', analysed_output, reformatted_output])
+
+    with open(reformatted_output, 'r') as f:
+        analyses = [a.strip() for a in f.readlines() if a.strip()]
+    if only_one_analysis:
+        # '^word/analysis$' -> 'analysis'
+        return [analysis[analysis.find('/') + 1: analysis.find('$')]
+                for analysis in analyses]
+    else:
+        # '^word/analysis1/analysis2$' -> ['analysis1', 'analysis2']
+        return [analysis.strip('$').split('/')[1:]
+                for analysis in analyses]
+
+def split_X_y(file_lines):
+    '''Split tagged lines of the form ^With/with$ into (tokens, targets).'''
+    splitted_lines = [line.strip()[1:-1].split('/') for line in file_lines if line.strip()]
+
+    tokens = [l[0] for l in splitted_lines]
+    targets = [l[1] for l in splitted_lines]
+
+    assert(len(tokens) == len(targets)), 'Token and Target vectors size mismatch ({}!={})'.format(len(tokens), len(targets))
+
+    return tokens, targets
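split_X_y expects one '^surface/analysis$' pair per non-empty line; a small illustration with invented tagged lines:

    from eval_utils import split_X_y

    lines = ['^With/with<pr>$', '^the/the<det><def>$']
    tokens, targets = split_X_y(lines)
    assert tokens == ['With', 'the']
    assert targets == ['with<pr>', 'the<det><def>']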
diff --git a/scripts/eval/metrics_report.py b/scripts/eval/metrics_report.py
new file mode 100644
index 0000000..d529f23
--- /dev/null
+++ b/scripts/eval/metrics_report.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+import os
+import argparse
+import tempfile
+import tabulate
+import statistics
+from pathlib import Path
+from eval_utils import get_apertium_analyses, split_X_y
+
+def get_sorted_files_in_directory(directory):
+    return sorted(os.listdir(directory))
+
+def compute_weighted_precision(tag, metrics_dict, corpus_size):
+    den = metrics_dict[tag]['TP'] + metrics_dict[tag]['FP']
+    if not den:
+        return 0
+
+    return ((metrics_dict[tag]['support'] / corpus_size) *
+            (metrics_dict[tag]['TP'] / den))
+
+def compute_weighted_recall(tag, metrics_dict, corpus_size):
+    den = metrics_dict[tag]['TP'] + metrics_dict[tag]['FN']
+    if not den:
+        return 0
+
+    return ((metrics_dict[tag]['support'] / corpus_size) *
+            (metrics_dict[tag]['TP'] / den))
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='compute metrics for a compiled dictionary')
+    parser.add_argument('-i', '--input_directory', required=True,
+                        help='input directory of n folds')
+    parser.add_argument('-b', '--apertium_bins', required=True,
+                        help='directory of compiled dictionaries')
+    args = parser.parse_args()
+
+    base_dir = tempfile.mkdtemp()
+
+    precision = []
+    recall = []
+    for testing_corpus, bin_file in zip(
+            get_sorted_files_in_directory(args.input_directory),
+            get_sorted_files_in_directory(args.apertium_bins)):
+        test_corpus = str(Path(args.input_directory, testing_corpus))
+
+        with open(test_corpus, 'r') as f:
+            X, y = split_X_y(f.readlines())
+        pred = get_apertium_analyses(X, Path(args.apertium_bins, bin_file), base_dir)
+        assert(len(y) == len(pred)), 'Target and Predicted vectors size mismatch ({}!={})'.format(len(y), len(pred))
+        metrics_dict = {}
+
+        metrics_vars = ['TP', 'FP', 'FN', 'support']
+        for target, prediction in zip(y, pred):
+            # IGNORE MISSING TARGET TAGS?
+            if target.startswith('*'):
+                continue
+            # IGNORE MISSING PREDICTION TAGS?
+            if prediction.startswith('*'):
+                continue
+            if target not in metrics_dict:
+                metrics_dict[target] = {var: 0 for var in metrics_vars}
+            metrics_dict[target]['support'] += 1
+            metrics_dict[target]['TP'] += target == prediction
+            metrics_dict[target]['FN'] += target != prediction
+            if prediction not in metrics_dict:
+                metrics_dict[prediction] = {var: 0 for var in metrics_vars}
+            metrics_dict[prediction]['FP'] += target != prediction
+
+        # Sum the per-tag weighted metrics to get the fold's averages
+        average_precision = 0
+        average_recall = 0
+        for tag in metrics_dict:
+            average_precision += compute_weighted_precision(tag, metrics_dict, len(X))
+            average_recall += compute_weighted_recall(tag, metrics_dict, len(X))
+        recall.append(average_recall)
+        precision.append(average_precision)
+
+    results_table = {'testing_corpus': get_sorted_files_in_directory(args.input_directory),
+                     'precision': precision,
+                     'recall': recall}
+
+    print('Precision: {0:0.5f} +- {1:0.5f}'.format(statistics.mean(precision), statistics.stdev(precision)))
+    print('Recall: {0:0.5f} +- {1:0.5f}'.format(statistics.mean(recall), statistics.stdev(recall)))
+
+    print(tabulate.tabulate(results_table, headers=results_table.keys(), showindex=False, tablefmt='github'))
+    print()
diff --git a/scripts/eval/unigram_fit.py b/scripts/eval/unigram_fit.py
new file mode 100644
index 0000000..1d2116c
--- /dev/null
+++ b/scripts/eval/unigram_fit.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+import os
+import argparse
+import tempfile
+import subprocess
+from pathlib import Path
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='fit n models using n folds')
+    parser.add_argument('-i', '--input_directory', required=True,
+                        help='input directory of the n folds')
+    parser.add_argument('-b', '--apertium_bin', required=True,
+                        help='a compiled dictionary')
+    parser.add_argument('-o', '--output_directory', required=True,
+                        help='output directory for weighted dictionaries')
+    args = parser.parse_args()
+    input_directory = args.input_directory
+    apertium_bin = args.apertium_bin
+    output_directory = args.output_directory
+    if not os.path.exists(output_directory):
+        os.mkdir(output_directory)
+
+    temp_dir = tempfile.mkdtemp()
+
+    temp_weightlist = Path(temp_dir, 'temp_weightlist')
+    temp_input_file = Path(temp_dir, 'temp_input')
+    for input_file in sorted(os.listdir(input_directory)):
+        # Train on all folds except the held-out one
+        training_folds = [Path(input_directory, file) for file in sorted(os.listdir(input_directory)) if file != input_file]
+        with open(temp_input_file, 'w') as f:
+            for fold in training_folds:
+                with open(fold, 'r') as fold_file:
+                    f.write(fold_file.read())
+
+        subprocess.run(['python3',
+                        'annotated_corpus_to_weightlist.py',
+                        temp_input_file, temp_weightlist])
+
+        # Generate a weighted bin file named after the held-out fold
+        subprocess.run(['./lt-weight',
+                        apertium_bin,
+                        Path(output_directory, '{}.bin'.format(input_file)),
+                        temp_weightlist])
\ No newline at end of file
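Taken together, the scripts form a cross-validation pipeline: split the tagged corpus, fit one weighted dictionary per fold, then score each held-out fold. A hypothetical end-to-end run (all paths and file names are invented, and the scripts are assumed to run from scripts/eval/ with lt-weight and the weightlist helpers alongside them):

    import subprocess

    subprocess.run(['python3', 'corpus_split.py', 'tagged_corpus.txt',
                    '-n', '5', '-s', '42', '-o', 'folds'])
    subprocess.run(['python3', 'unigram_fit.py', '-i', 'folds',
                    '-b', 'lang.automorf.bin', '-o', 'unigram_bins'])
    subprocess.run(['python3', 'metrics_report.py',
                    '-i', 'folds', '-b', 'unigram_bins'])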