commit 0d6bbdd09ef8a6836672b62adaee086e73e66c99
Author: Amr Keleg
Date:   Mon Aug 5 17:56:00 2019 +0200

    Improve the fallback weights for unknown words

    When generating a supervised weightlist, the number of times certain tags
    occur in the corpus can be used to provide an estimate for unknown words
    with a matching tag.
    Example: if a tag <tag> occurred m times, then we can assume that analyses
    of the form ?*<tag> have a fallback weight that is roughly proportional to m.
    Additionally, a default weight can be estimated as a further fallback weight
    using Laplace smoothing.
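To make the smoothing concrete, the following is a minimal standalone sketch of the
arithmetic the updated script performs when both --tag_weightlist and
--default_weightlist are supplied. The regexes and counts are made up for
illustration; only the formulas mirror the patch below.

    import math
    from collections import Counter

    # Made-up counts, standing in for the Counters built from a tagged corpus.
    regex_analyses = Counter({'[d o g %<n%>]': 8, '[c a t %<n%>]': 2})  # full analyses
    regex_tags = Counter({'[?* %<n%>]': 10})                            # ?*<tag> fallbacks

    # Laplace-style smoothing: the denominator also counts the tag fallbacks
    # and one extra unseen event, so every weight is a finite -log probability.
    den = (sum(regex_analyses.values())
           + sum(regex_tags.values()) * (1 + len(regex_analyses))
           + 1 + len(regex_analyses) + len(regex_tags))
    num_offset = sum(regex_tags.values()) + 1

    # Seen analyses get the smallest (cheapest) weights ...
    for regex, count in regex_analyses.most_common():
        print('{}::{}'.format(regex, -math.log(count + num_offset) + math.log(den)))
    # ... tag-only fallbacks get larger ones ...
    for regex, count in regex_tags.most_common():
        print('{}::{}'.format(regex, -math.log(count + 1) + math.log(den)))
    # ... and the catch-all default is the most expensive.
    print('[?*]::{}'.format(-math.log(1) + math.log(den)))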
diff --git a/annotated-corpus-to-weightlist b/annotated-corpus-to-weightlist
index 6ce905b..72deb6b 100755
--- a/annotated-corpus-to-weightlist
+++ b/annotated-corpus-to-weightlist
@@ -3,25 +3,57 @@
 import math
 import argparse
 from collections import Counter
-from utils.utils import extract_analysis, generate_regex
+from utils.utils import extract_analysis, generate_regex, extract_tag_from_analysis
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='generate a regex weightlist given an annotated corpus')
     parser.add_argument('tagged_corpus',
                         type=argparse.FileType('r'),
                         help='input tagged corpus')
-    parser.add_argument('output_weighlist',
+    parser.add_argument('analysis_weightlist',
                         type=argparse.FileType('w'),
-                        help='output weightlist')
+                        help='weightlist for specific analyses')
+    parser.add_argument('--tag_weightlist',
+                        type=argparse.FileType('w'),
+                        help='weightlist for specific tags')
+    parser.add_argument('--default_weightlist',
+                        type=argparse.FileType('w'),
+                        help='weightlist for out-of-corpus tokens')
+
     args = parser.parse_args()
 
     TAGGED_CORPUS = args.tagged_corpus
-    OUTPUT_WEIGHTLIST_FILE = args.output_weighlist
+    ANALYSIS_WEIGHTLIST_FILE = args.analysis_weightlist
+    TAG_WEIGHTLIST_FILE = args.tag_weightlist
+    DEFAULT_WEIGHTLIST_FILE = args.default_weightlist
+
+    lines = TAGGED_CORPUS.readlines()
+    analyses = [extract_analysis(line.strip()) for line in lines]
+    regex_analyses = Counter([generate_regex(analysis) for analysis in analyses if not analysis.startswith('*')])
+    den = sum(regex_analyses.values())
+    num_offset = 0
+
+    if TAG_WEIGHTLIST_FILE:
+        tags = [extract_tag_from_analysis(line.strip()) for line in lines]
+        regex_tags = Counter([generate_regex(tag, match_all_prefixes=True) for tag in tags if tag and not tag.startswith('*')])
+        den += sum(regex_tags.values()) * (1+len(regex_analyses))
+        num_offset += sum(regex_tags.values())
+
+    if DEFAULT_WEIGHTLIST_FILE:
+        den += 1 + len(regex_analyses)
+        if TAG_WEIGHTLIST_FILE:
+            den += len(regex_tags)
+        num_offset += 1
+
+    weighted_regex_analyses = ['{}::{}'.format(regex, -math.log(count + num_offset)+math.log(den))
+                               for regex, count in regex_analyses.most_common()]
+    ANALYSIS_WEIGHTLIST_FILE.write('\n'.join(weighted_regex_analyses))
 
-    analyses = [extract_analysis(line.strip()) for line in TAGGED_CORPUS.readlines()]
+    if TAG_WEIGHTLIST_FILE:
+        offset = 1 if DEFAULT_WEIGHTLIST_FILE else 0
+        weighted_regex_tags = ['{}::{}'.format(regex, -math.log(count + offset) +math.log(den))
+                               for regex, count in regex_tags.most_common()]
+        TAG_WEIGHTLIST_FILE.write('\n'.join(weighted_regex_tags))
 
-    regex_tags = [generate_regex(analysis) for analysis in analyses if not analysis.startswith('*')]
-    weighted_regex_tags = ['{}::{}'.format(regex, -math.log(count/len(regex_tags)))
-                           for regex, count in Counter(regex_tags).most_common()]
-    for regex in weighted_regex_tags:
-        OUTPUT_WEIGHTLIST_FILE.write(regex+'\n')
+    if DEFAULT_WEIGHTLIST_FILE:
+        DEFAULT_WEIGHTLIST_FILE.write('[?*]::{}'.format(-math.log(1) +math.log(den)))
diff --git a/eval/unigram_fit.py b/eval/unigram_fit.py
old mode 100644
new mode 100755
index 1d2116c..7b17b4e
--- a/eval/unigram_fit.py
+++ b/eval/unigram_fit.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 import os
 import sys
 import argparse
@@ -13,30 +15,47 @@ if __name__ == '__main__':
                         help='a compiled dictionary')
     parser.add_argument('-o', '--output_directory', required=True,
                         help='output directory for weighted dictionaries')
+    parser.add_argument('-t', '--use_tags', action='store_true',
+                        help='Use a tags weightlist')
 
     args = parser.parse_args()
     input_directory = args.input_directory
     apertium_bin = args.apertium_bin
     output_directory = args.output_directory
+    use_tags = args.use_tags
+
     if not os.path.exists(output_directory):
         os.mkdir(output_directory)
 
     temp_dir = tempfile.mkdtemp()
-
-    temp_weightlist = Path(temp_dir, 'temp_weightlist')
+
+    temp_analysis_weightlist = Path(temp_dir, 'temp_analysis_weightlist')
+    if use_tags:
+        temp_tag_weightlist = Path(temp_dir, 'temp_tag_weightlist')
+    temp_default_weightlist = Path(temp_dir, 'temp_default_weightlist')
     temp_input_file = Path(temp_dir, 'temp_input')
+
     for input_file in sorted(os.listdir(input_directory)):
-        temp_input_files = [Path(input_directory, input_file) for file in sorted(os.listdir(input_directory)) if file!=input_file]
+        temp_input_files = [Path(input_directory, input_file)
+                            for file in sorted(os.listdir(input_directory)) if file!=input_file]
+
         with open(temp_input_file, 'w') as f:
             for file in temp_input_files:
                 with open(file, 'r') as fold_file:
                     f.write(fold_file.read())
-
-        subprocess.run(['python',
-                        'annotated_corpus_to_weightlist.py',
-                        Path(input_directory, temp_input_file), temp_weightlist])
-
+
+
+        subprocess.run([arg for arg in ['./annotated-corpus-to-weightlist',
+                                        Path(input_directory, temp_input_file),
+                                        temp_analysis_weightlist,
+                                        '--tag_weightlist' if use_tags else None,
+                                        temp_tag_weightlist if use_tags else None,
+                                        '--default_weightlist',
+                                        temp_default_weightlist] if arg] )
+
         # Generate a bin file
-        subprocess.run(['./lt-weight',
+        subprocess.run([arg for arg in ['./lt-weight',
                         apertium_bin,
                         Path(output_directory, '{}.bin'.format(input_file)),
-                        temp_weightlist])
\ No newline at end of file
+                        temp_analysis_weightlist,
+                        temp_tag_weightlist if use_tags else None,
+                        temp_default_weightlist] if arg])
diff --git a/utils/utils.py b/utils/utils.py
index 2f06866..a28b96f 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -4,10 +4,38 @@
 import re
 import sys
 
+def extract_surface(tagged_line):
+    """Extract the surface form from a tagged line
+
+    A tagged line takes the form ^surface/analysis$
+    i.e: returns surface
+    """
+    return re.findall(r'\^.*\/', tagged_line)[0][1:-1]
+
 def extract_analysis(tagged_line):
+    """Extract the Analysis form from a tagged line
+
+    A tagged line takes the form ^surface/analysis$
+    i.e.: returns analysis
+    """
     return re.sub(r'[\t ]|^(\^.*\/)|(\$)$', '', tagged_line)
 
-def generate_regex(analysis):
+def extract_tag_from_analysis(tagged_line):
+    """Extract the tags from a tagged line
+
+    A tagged line takes the form ^surface/analysis$
+    i.e.: returns tag
+    """
+    matches = re.findall('<.*>', extract_analysis(tagged_line))
+    if len(matches) == 0:
+        # TODO: HANDLE THE CASE NO TAGS ARE FOUND
+        # raise BaseException()
+        return None
+    return matches[0]
+
+def generate_regex(analysis, match_all_prefixes=False):
+    """Convert an analysis form into XEROX regex"""
+    # Add a space after each token "REGEX concatenation"
     analysis = ' '.join(analysis)
@@ -26,10 +54,15 @@ def generate_regex(analysis):
     analysis = re.sub(r'(\<.*?\>)',
                       lambda multichar_tag: '%<{}%>'.format((re.sub(' ', '', multichar_tag.group(0)[1:-1]))),
                       analysis)
 
-    # Surround regex with []
-    analysis = '[{}]'.format(analysis)
+    analysis = re.sub(r'[^%]([>])', '%>', analysis)
+    analysis = re.sub(r'[^%]([<])', '%<', analysis)
 
     # TODO: Transform the regex into [?*][REGEX][?*]
     # This may be too slow to be feasible
+    if match_all_prefixes:
+        return '[?* {}]'.format(analysis)
+
+    # Surround regex with []
+    analysis = '[{}]'.format(analysis)
 
     return analysis
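
As a quick sanity check of the new helpers in utils/utils.py, the snippet below runs
them on a hypothetical tagged line (assuming it is executed from the repository root,
the same way annotated-corpus-to-weightlist imports them); the expected outputs in the
comments follow from the regexes added above.

    from utils.utils import extract_surface, extract_analysis, extract_tag_from_analysis

    line = '^dogs/dog<n><pl>$'               # hypothetical ^surface/analysis$ pair
    print(extract_surface(line))             # dogs
    print(extract_analysis(line))            # dog<n><pl>
    print(extract_tag_from_analysis(line))   # <n><pl> (None if the analysis has no tags)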