commit c3600d891e37dd0da86ff86cdbbf16a40e594d4b
Author: Amr Keleg
Date:   Sun Aug 18 22:24:51 2019 +0200

    Implement the word2vec based weighting method

diff --git a/eval/w2v_fit.py b/eval/w2v_fit.py
new file mode 100644
index 0000000..84981b1
--- /dev/null
+++ b/eval/w2v_fit.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+
+import os
+import argparse
+import tempfile
+import subprocess
+from pathlib import Path
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='fit n models using n folds')
+    parser.add_argument('-i', '--input_directory', required=True,
+                        help='input directory of the n folds')
+    parser.add_argument('-b', '--apertium_bin', required=True,
+                        help='a compiled dictionary')
+    parser.add_argument('-c', '--corpus', required=True,
+                        help='an untagged corpus')
+    parser.add_argument('-o', '--output_directory', required=True,
+                        help='output directory for weighted dictionaries')
+    args = parser.parse_args()
+    input_directory = args.input_directory
+    output_directory = args.output_directory
+    apertium_bin = args.apertium_bin
+    corpus = args.corpus
+    if not os.path.exists(output_directory):
+        os.mkdir(output_directory)
+
+    temp_dir = tempfile.mkdtemp()
+
+    # The weightlists depend only on the corpus and the compiled
+    # dictionary, so generate them once, outside the loop over folds
+    temp_weightlist = Path(temp_dir, 'temp_weightlist')
+    default_weightlist = Path(temp_dir, 'temp_default_weightlist')
+    subprocess.run(['./w2v-weightlist',
+                    corpus, apertium_bin, temp_weightlist, default_weightlist],
+                   check=True)
+
+    for input_file in sorted(os.listdir(input_directory)):
+        # Generate a weighted binary per fold; the fold file is only
+        # used to name the output
+        subprocess.run(['./lt-weight',
+                        apertium_bin,
+                        Path(output_directory, '{}.bin'.format(input_file)),
+                        temp_weightlist,
+                        default_weightlist],
+                       check=True)
diff --git a/utils/w2v_generate_weights.py b/utils/w2v_generate_weights.py
new file mode 100755
index 0000000..daf68fb
--- /dev/null
+++ b/utils/w2v_generate_weights.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+
+import math
+import argparse
+from collections import Counter
+from utils import extract_tag_from_analysis, generate_regex
+
+def get_weight(word_a, similar_a):
+    """
+    Return a Counter mapping each analysis of the word to the number of
+    unambiguous similar words whose analysis carries the same tag
+
+    word_a: The word analyses (a list of size 1)
+    similar_a: The similar words' analyses (list of strings)
+    """
+
+    # TODO: Pass a string instead of a list of size 1
+    word_a = [analysis.strip('$').split('/')[1:]
+              for analysis in word_a if analysis][0]
+    similar_a = [analysis.strip('$').split('/')[1:]
+                 for analysis in similar_a if analysis]
+    if not word_a:
+        return None
+
+    # Skip words unknown to the analyser (marked with '*')
+    if word_a[0].startswith('*'):
+        return None
+
+    if len(word_a) == 1:
+        return Counter({generate_regex(word_a[0]): 1})
+
+    unambig_analyses = [a[0] for a in similar_a
+                        if len(a) == 1 and not a[0].startswith('*')]
+    tags = [extract_tag_from_analysis(word_analysis)
+            for word_analysis in unambig_analyses]
+    tags_count = Counter(tags)
+
+    return Counter({generate_regex(analysis):
+                    tags_count[extract_tag_from_analysis(analysis)]
+                    for analysis in word_a})
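+
+# Illustrative example (hypothetical analyses in the Apertium stream format,
+# assuming extract_tag_from_analysis maps an analysis to its first tag): for
+#   word_a    = ['^casa/casa<n><f><sg>/casar<vblex><pri><p3><sg>$']
+#   similar_a = ['^vivienda/vivienda<n><f><sg>$', '^piso/piso<n><m><sg>$']
+# both similar words are unambiguous nouns, so get_weight counts 2 for the
+# <n> analysis of "casa" and 0 for the <vblex> analysis.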
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Generate a weightlist using a set of words '
+                    'and their similar words given the context')
+    parser.add_argument('--words_file',
+                        type=argparse.FileType('r'),
+                        required=True,
+                        help='words file')
+    parser.add_argument('--similar_file',
+                        type=argparse.FileType('r'),
+                        required=True,
+                        help='similar words file (each line is tab-delimited)')
+    parser.add_argument('--output_weightlist',
+                        type=argparse.FileType('w'),
+                        required=True,
+                        help='The output weightlist using the similar words analyses')
+    parser.add_argument('--default_weightlist',
+                        type=argparse.FileType('w'),
+                        required=True,
+                        help='The weightlist containing a Laplace-smoothed default weight')
+    args = parser.parse_args()
+    words_file = args.words_file
+    similar_words_file = args.similar_file
+    output_weightlist = args.output_weightlist
+    default_weightlist = args.default_weightlist
+
+    words = [[l.strip()] for l in words_file.readlines() if l.strip()]
+    similar_words = [l.strip().split() for l in similar_words_file.readlines()
+                     if l.strip()]
+
+    weights = [get_weight(w, s) for w, s in zip(words, similar_words)]
+    weights = [w for w in weights if w]
+    counts = sum(weights, Counter())
+    # Laplace smoothing: one extra count for every seen analysis, plus one
+    # unit of probability mass reserved for unseen analyses
+    sum_counts = sum(counts.values()) + len(counts) + 1
+
+    for t in counts:
+        output_weightlist.write(
+            '{}::{}\n'.format(t, -math.log((1 + counts[t]) / sum_counts)))
+
+    default_weightlist.write('[?*]::{}\n'.format(-math.log(1 / sum_counts)))
diff --git a/utils/w2v_get_similar_words.py b/utils/w2v_get_similar_words.py
new file mode 100755
index 0000000..603b23f
--- /dev/null
+++ b/utils/w2v_get_similar_words.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+import tqdm
+import gensim
+import argparse
+
+class Dataset:
+    """
+    Wrap a corpus file to avoid loading it completely into memory
+    """
+    def __init__(self, file, window_size=4):
+        self.corpus_file = file
+        self.window_size = window_size
+        self.gram_size = 2 * self.window_size + 1
+        self.corpus_file.seek(0)
+
+    def __iter__(self):
+        # gensim iterates over the corpus several times (vocabulary
+        # building plus one pass per epoch), so rewind on every iteration
+        self.corpus_file.seek(0)
+        self.grams = []
+        return self
+
+    def __next__(self):
+        # TODO: Consider using faster data-structures
+        if not self.grams:
+            self.grams = [self.read_word() for _ in range(self.gram_size)]
+            if not all(self.grams):
+                # The corpus is shorter than a single window
+                raise StopIteration
+            return list(self.grams)
+        word = self.read_word()
+        if not word:
+            raise StopIteration
+        self.grams.pop(0)
+        self.grams.append(word)
+        # Return a copy so that callers can't mutate the sliding window
+        return list(self.grams)
+
+    def read_word(self):
+        # TODO: Improve the way to get a word from a file
+        word = ''
+        while True:
+            c = self.corpus_file.read(1)
+            if not c:
+                return word if word else None
+            if c.isspace():
+                if word:
+                    return word
+                # Skip leading whitespace instead of appending it
+                continue
+            word = word + c
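+
+# Walk-through of the sliding window: with window_size=1 the Dataset yields
+# 3-grams, so a corpus file containing "a b c d" produces the "sentences"
+# ['a', 'b', 'c'] and then ['b', 'c', 'd'].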
+
+def get_similar_tokens(context, word2vec):
+    """ Find the most probable words given a bag of context words
+
+    context: A list of context words
+    word2vec: A fitted word2vec model
+    """
+    similar_words = word2vec.predict_output_word(context)
+    if not similar_words:
+        return []
+    return [w for w, _ in similar_words]
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Generate the set of words and similar words '
+                    'using a raw corpus file')
+    parser.add_argument('--corpus',
+                        type=argparse.FileType('r'),
+                        required=True,
+                        help='large raw corpus file')
+    parser.add_argument('--output_words_file',
+                        type=argparse.FileType('w'),
+                        required=True,
+                        help='The words of the corpus, one word per line')
+    parser.add_argument('--output_similar_words_file',
+                        type=argparse.FileType('w'),
+                        required=True,
+                        help='The similar words for each word of the corpus, tab-delimited')
+    args = parser.parse_args()
+    corpus_file = args.corpus
+    output_words_file = args.output_words_file
+    output_similar_words_file = args.output_similar_words_file
+
+    word2vec = gensim.models.Word2Vec(Dataset(corpus_file, 2), min_count=1)
+
+    for gram in tqdm.tqdm(Dataset(corpus_file, 2)):
+        center_word = gram.pop(len(gram) // 2)
+        if not center_word or not all(gram):
+            continue
+        similar_words = get_similar_tokens(gram, word2vec)
+        if not similar_words:
+            continue
+        output_words_file.write(center_word + '\n')
+        output_similar_words_file.write('\t'.join(similar_words) + '\n')
diff --git a/w2v-weightlist b/w2v-weightlist
new file mode 100755
index 0000000..5f35e37
--- /dev/null
+++ b/w2v-weightlist
@@ -0,0 +1,95 @@
+#!/bin/sh
+
+usage="$(basename "$0"): generate a disambiguated weightlist using a word2vec model
+USAGE: $(basename "$0") [-h] corpus input_file output_weightlist default_weightlist
+corpus                a raw corpus file
+input_file            the input compiled dictionary (a finite state transducer)
+output_weightlist     a weightlist for unambiguous words in corpus
+default_weightlist    a Laplace-smoothed weightlist for OOV words
+
+Options:
+  -h, --help: show this help
+"
+
+while :; do
+    case $1 in
+        -h|-\?|--help)
+            printf '%s' "$usage"
+            exit
+            ;;
+        --)
+            shift
+            break
+            ;;
+        -?*)
+            printf "WARN: Unknown option (ignored): %s\n" "$1" >&2
+            ;;
+        *)
+            break
+    esac
+
+    shift
+done
+
+CORPUS=$1
+INPUT_FST=$2
+OUTPUT_WEIGHTLIST=$3
+DEFAULT_WEIGHTLIST=$4
+
+no_of_missing_args=0
+if [ ! -f "$CORPUS" ]
+then
+    printf "ERROR: corpus file \"%s\" doesn't exist\n" "$CORPUS" >&2
+    no_of_missing_args=$((no_of_missing_args + 1))
+fi
+
+if [ ! -f "$INPUT_FST" ]
+then
+    printf "ERROR: input_file \"%s\" doesn't exist\n" "$INPUT_FST" >&2
+    no_of_missing_args=$((no_of_missing_args + 1))
+fi
+
+if [ -z "$OUTPUT_WEIGHTLIST" ]
+then
+    printf "ERROR: output_weightlist isn't set\n" >&2
+    no_of_missing_args=$((no_of_missing_args + 1))
+fi
+
+if [ -z "$DEFAULT_WEIGHTLIST" ]
+then
+    printf "ERROR: default_weightlist isn't set\n" >&2
+    no_of_missing_args=$((no_of_missing_args + 1))
+fi
+
+if [ $no_of_missing_args -gt 0 ]
+then
+    printf '%s' "$usage"
+    exit 1
+fi
+
+TEMP_DIR=$(mktemp -d)
+WORD_FILE="$TEMP_DIR/word"
+SIMILAR_WORD_FILE="$TEMP_DIR/similar"
+./utils/w2v_get_similar_words.py --corpus "$CORPUS" --output_words_file "$WORD_FILE" --output_similar_words_file "$SIMILAR_WORD_FILE"
+
+ANALYZED_WORDS="$TEMP_DIR/analyzed_words"
+ANALYZED_SIMILAR_WORDS="$TEMP_DIR/analyzed_similar_words"
+
+apertium-destxt "$WORD_FILE" | lt-proc "$INPUT_FST" | apertium-retxt > "$ANALYZED_WORDS"
+apertium-destxt "$SIMILAR_WORD_FILE" | lt-proc "$INPUT_FST" | apertium-retxt > "$ANALYZED_SIMILAR_WORDS"
+
+# TODO: Document why this extra cleaning pass is needed for the words file
+CLEANED_WORDS="$TEMP_DIR/cleaned_words"
+apertium-cleanstream -n < "$ANALYZED_WORDS" > "$CLEANED_WORDS"
+
+./utils/w2v_generate_weights.py --words_file "$CLEANED_WORDS" \
+    --similar_file "$ANALYZED_SIMILAR_WORDS" \
+    --output_weightlist "$OUTPUT_WEIGHTLIST" \
+    --default_weightlist "$DEFAULT_WEIGHTLIST"
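+
+# Example invocation (hypothetical file names; weight values are
+# illustrative):
+#   ./w2v-weightlist corpus.txt eng.automorf.bin weights.txt default.txt
+# Each line of weights.txt has the form <analysis-pattern>::<weight>, the
+# weight being the negative log of the analysis's smoothed relative
+# frequency; default.txt holds a single [?*]::<weight> fallback entry.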