commit c3600d891e37dd0da86ff86cdbbf16a40e594d4b
Author: Amr Keleg
Date:   Sun Aug 18 22:24:51 2019 +0200

    Implement the word2vec based weighting method

diff --git a/eval/w2v_fit.py b/eval/w2v_fit.py
new file mode 100644
index 0000000..84981b1
--- /dev/null
+++ b/eval/w2v_fit.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+
+import os
+import argparse
+import tempfile
+import subprocess
+from pathlib import Path
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='fit n models using n folds')
+    parser.add_argument('-i', '--input_directory', required=True,
+                        help='input directory of the n folds')
+    parser.add_argument('-b', '--apertium_bin', required=True,
+                        help='a compiled dictionary')
+    parser.add_argument('-c', '--corpus', required=True,
+                        help='an untagged corpus')
+    parser.add_argument('-o', '--output_directory', required=True,
+                        help='output directory for weighted dictionaries')
+    args = parser.parse_args()
+    input_directory = args.input_directory
+    output_directory = args.output_directory
+    apertium_bin = args.apertium_bin
+    corpus = args.corpus
+    if not os.path.exists(output_directory):
+        os.mkdir(output_directory)
+
+    temp_dir = tempfile.mkdtemp()
+
+    # The weightlists depend only on the corpus and the compiled
+    # dictionary, so generate them once, outside the loop over folds
+    temp_weightlist = Path(temp_dir, 'temp_weightlist')
+    default_weightlist = Path(temp_dir, 'temp_default_weightlist')
+    subprocess.run(['./w2v-weightlist',
+                    corpus, apertium_bin, temp_weightlist, default_weightlist],
+                   check=True)
+
+    for input_file in sorted(os.listdir(input_directory)):
+        # Generate a weighted binary per fold; the fold file is only
+        # used to name the output
+        subprocess.run(['./lt-weight',
+                        apertium_bin,
+                        Path(output_directory, '{}.bin'.format(input_file)),
+                        temp_weightlist,
+                        default_weightlist],
+                       check=True)
diff --git a/utils/w2v_generate_weights.py b/utils/w2v_generate_weights.py
new file mode 100755
index 0000000..daf68fb
--- /dev/null
+++ b/utils/w2v_generate_weights.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+
+import math
+import argparse
+from collections import Counter
+from utils import extract_tag_from_analysis, generate_regex
+
+def get_weight(word_a, similar_a):
+    """
+    Return a Counter mapping each analysis of the word to the number of
+    unambiguous similar words whose analysis carries the same tag
+
+    word_a: The word analyses (a list of size 1)
+    similar_a: The similar words' analyses (list of strings)
+    """
+
+    # TODO: Pass a string instead of a list of size 1
+    word_a = [analysis.strip('$').split('/')[1:]
+              for analysis in word_a if analysis][0]
+    similar_a = [analysis.strip('$').split('/')[1:]
+                 for analysis in similar_a if analysis]
+    if not word_a:
+        return None
+
+    # Skip words unknown to the analyser (marked with '*')
+    if word_a[0].startswith('*'):
+        return None
+
+    if len(word_a) == 1:
+        return Counter({generate_regex(word_a[0]): 1})
+
+    unambig_analyses = [a[0] for a in similar_a
+                        if len(a) == 1 and not a[0].startswith('*')]
+    tags = [extract_tag_from_analysis(word_analysis)
+            for word_analysis in unambig_analyses]
+    tags_count = Counter(tags)
+
+    return Counter({generate_regex(analysis):
+                    tags_count[extract_tag_from_analysis(analysis)]
+                    for analysis in word_a})
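+
+# Illustrative example (hypothetical analyses in the Apertium stream format,
+# assuming extract_tag_from_analysis maps an analysis to its first tag): for
+#   word_a    = ['^casa/casa<n><f><sg>/casar<vblex><pri><p3><sg>$']
+#   similar_a = ['^vivienda/vivienda<n><f><sg>$', '^piso/piso<n><m><sg>$']
+# both similar words are unambiguous nouns, so get_weight counts 2 for the
+# <n> analysis of "casa" and 0 for the <vblex> analysis.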
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Generate a weightlist using a set of words '
+                    'and their similar words given the context')
+    parser.add_argument('--words_file',
+                        type=argparse.FileType('r'),
+                        required=True,
+                        help='words file')
+    parser.add_argument('--similar_file',
+                        type=argparse.FileType('r'),
+                        required=True,
+                        help='similar words file (each line is tab-delimited)')
+    parser.add_argument('--output_weightlist',
+                        type=argparse.FileType('w'),
+                        required=True,
+                        help='The output weightlist using the similar words analyses')
+    parser.add_argument('--default_weightlist',
+                        type=argparse.FileType('w'),
+                        required=True,
+                        help='The weightlist containing a Laplace-smoothed default weight')
+    args = parser.parse_args()
+    words_file = args.words_file
+    similar_words_file = args.similar_file
+    output_weightlist = args.output_weightlist
+    default_weightlist = args.default_weightlist
+
+    words = [[l.strip()] for l in words_file.readlines() if l.strip()]
+    similar_words = [l.strip().split() for l in similar_words_file.readlines()
+                     if l.strip()]
+
+    weights = [get_weight(w, s) for w, s in zip(words, similar_words)]
+    weights = [w for w in weights if w]
+    counts = sum(weights, Counter())
+    # Laplace smoothing: one extra count for every seen analysis, plus one
+    # unit of probability mass reserved for unseen analyses
+    sum_counts = sum(counts.values()) + len(counts) + 1
+
+    for t in counts:
+        output_weightlist.write(
+            '{}::{}\n'.format(t, -math.log((1 + counts[t]) / sum_counts)))
+
+    default_weightlist.write('[?*]::{}\n'.format(-math.log(1 / sum_counts)))
diff --git a/utils/w2v_get_similar_words.py b/utils/w2v_get_similar_words.py
new file mode 100755
index 0000000..603b23f
--- /dev/null
+++ b/utils/w2v_get_similar_words.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+import tqdm
+import gensim
+import argparse
+
+class Dataset:
+    """
+    Wrap a corpus file to avoid loading it completely into memory
+    """
+    def __init__(self, file, window_size=4):
+        self.corpus_file = file
+        self.window_size = window_size
+        self.gram_size = 2 * self.window_size + 1
+        self.corpus_file.seek(0)
+
+    def __iter__(self):
+        # gensim iterates over the corpus several times (vocabulary
+        # building plus one pass per epoch), so rewind on every iteration
+        self.corpus_file.seek(0)
+        self.grams = []
+        return self
+
+    def __next__(self):
+        # TODO: Consider using faster data-structures
+        if not self.grams:
+            self.grams = [self.read_word() for _ in range(self.gram_size)]
+            if not all(self.grams):
+                # The corpus is shorter than a single window
+                raise StopIteration
+            return list(self.grams)
+        word = self.read_word()
+        if not word:
+            raise StopIteration
+        self.grams.pop(0)
+        self.grams.append(word)
+        # Return a copy so that callers can't mutate the sliding window
+        return list(self.grams)
+
+    def read_word(self):
+        # TODO: Improve the way to get a word from a file
+        word = ''
+        while True:
+            c = self.corpus_file.read(1)
+            if not c:
+                return word if word else None
+            if c.isspace():
+                if word:
+                    return word
+                # Skip leading whitespace instead of appending it
+                continue
+            word = word + c
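+
+# Walk-through of the sliding window: with window_size=1 the Dataset yields
+# 3-grams, so a corpus file containing "a b c d" produces the "sentences"
+# ['a', 'b', 'c'] and then ['b', 'c', 'd'].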
+
+def get_similar_tokens(context, word2vec):
+    """ Find the most probable words given a bag of context words
+
+    context: A list of context words
+    word2vec: A fitted word2vec model
+    """
+    similar_words = word2vec.predict_output_word(context)
+    if not similar_words:
+        return []
+    return [w for w, _ in similar_words]
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Generate the set of words and similar words '
+                    'using a raw corpus file')
+    parser.add_argument('--corpus',
+                        type=argparse.FileType('r'),
+                        required=True,
+                        help='large raw corpus file')
+    parser.add_argument('--output_words_file',
+                        type=argparse.FileType('w'),
+                        required=True,
+                        help='The words of the corpus, one word per line')
+    parser.add_argument('--output_similar_words_file',
+                        type=argparse.FileType('w'),
+                        required=True,
+                        help='The similar words for each word of the corpus, tab-delimited')
+    args = parser.parse_args()
+    corpus_file = args.corpus
+    output_words_file = args.output_words_file
+    output_similar_words_file = args.output_similar_words_file
+
+    word2vec = gensim.models.Word2Vec(Dataset(corpus_file, 2), min_count=1)
+
+    for gram in tqdm.tqdm(Dataset(corpus_file, 2)):
+        center_word = gram.pop(len(gram) // 2)
+        if not center_word or not all(gram):
+            continue
+        similar_words = get_similar_tokens(gram, word2vec)
+        if not similar_words:
+            continue
+        output_words_file.write(center_word + '\n')
+        output_similar_words_file.write('\t'.join(similar_words) + '\n')
diff --git a/w2v-weightlist b/w2v-weightlist
new file mode 100755
index 0000000..5f35e37
--- /dev/null
+++ b/w2v-weightlist
@@ -0,0 +1,95 @@
+#!/bin/sh
+
+usage="$(basename "$0"): generate a disambiguated weightlist using a word2vec model
+USAGE: $(basename "$0") [-h] corpus input_file output_weightlist default_weightlist
+corpus                a raw corpus file
+input_file            the input compiled dictionary (a finite state transducer)
+output_weightlist     a weightlist for unambiguous words in corpus
+default_weightlist    a Laplace-smoothed weightlist for OOV words
+
+Options:
+  -h, --help: show this help
+"
+
+while :; do
+    case $1 in
+        -h|-\?|--help)
+            printf '%s' "$usage"
+            exit
+            ;;
+        --)
+            shift
+            break
+            ;;
+        -?*)
+            printf "WARN: Unknown option (ignored): %s\n" "$1" >&2
+            ;;
+        *)
+            break
+    esac
+
+    shift
+done
+
+CORPUS=$1
+INPUT_FST=$2
+OUTPUT_WEIGHTLIST=$3
+DEFAULT_WEIGHTLIST=$4
+
+no_of_missing_args=0
+if [ ! -f "$CORPUS" ]
+then
+    printf "ERROR: corpus file \"%s\" doesn't exist\n" "$CORPUS" >&2
+    no_of_missing_args=$((no_of_missing_args + 1))
+fi
+
+if [ ! -f "$INPUT_FST" ]
+then
+    printf "ERROR: input_file \"%s\" doesn't exist\n" "$INPUT_FST" >&2
+    no_of_missing_args=$((no_of_missing_args + 1))
+fi
+
+if [ -z "$OUTPUT_WEIGHTLIST" ]
+then
+    printf "ERROR: output_weightlist isn't set\n" >&2
+    no_of_missing_args=$((no_of_missing_args + 1))
+fi
+
+if [ -z "$DEFAULT_WEIGHTLIST" ]
+then
+    printf "ERROR: default_weightlist isn't set\n" >&2
+    no_of_missing_args=$((no_of_missing_args + 1))
+fi
+
+if [ $no_of_missing_args -gt 0 ]
+then
+    printf '%s' "$usage"
+    exit 1
+fi
+
+TEMP_DIR=$(mktemp -d)
+WORD_FILE="$TEMP_DIR/word"
+SIMILAR_WORD_FILE="$TEMP_DIR/similar"
+./utils/w2v_get_similar_words.py --corpus "$CORPUS" --output_words_file "$WORD_FILE" --output_similar_words_file "$SIMILAR_WORD_FILE"
+
+ANALYZED_WORDS="$TEMP_DIR/analyzed_words"
+ANALYZED_SIMILAR_WORDS="$TEMP_DIR/analyzed_similar_words"
+
+apertium-destxt "$WORD_FILE" | lt-proc "$INPUT_FST" | apertium-retxt > "$ANALYZED_WORDS"
+apertium-destxt "$SIMILAR_WORD_FILE" | lt-proc "$INPUT_FST" | apertium-retxt > "$ANALYZED_SIMILAR_WORDS"
+
+# TODO: Document why this extra cleaning pass is needed for the words file
+CLEANED_WORDS="$TEMP_DIR/cleaned_words"
+apertium-cleanstream -n < "$ANALYZED_WORDS" > "$CLEANED_WORDS"
+
+./utils/w2v_generate_weights.py --words_file "$CLEANED_WORDS" \
+    --similar_file "$ANALYZED_SIMILAR_WORDS" \
+    --output_weightlist "$OUTPUT_WEIGHTLIST" \
+    --default_weightlist "$DEFAULT_WEIGHTLIST"
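+
+# Example invocation (hypothetical file names; weight values are
+# illustrative):
+#   ./w2v-weightlist corpus.txt eng.automorf.bin weights.txt default.txt
+# Each line of weights.txt has the form <analysis-pattern>::<weight>, the
+# weight being the negative log of the analysis's smoothed relative
+# frequency; default.txt holds a single [?*]::<weight> fallback entry.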