commit 57b88ac0d0f64ccc8c00210a2f66ce6efefaf29b
Author: Amr Keleg
Date:   Wed Jun 19 22:43:05 2019 +0200

    lt-weight: Use HFST to weight the input fst

diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..b5d303a
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,22 @@
+(c) 2005 Universitat d'Alacant / Universidad de Alicante.
+(c) 2007-2008 Prompsit Language Engineering S.L.
+
+2007-2018, Francis M. Tyers
+2015-2018, Tino Didriksen
+2017-2018, Tommi Pirinen
+2018, Xavi Ivars
+2018, Sushain Cherivirala
+2017, Himanshu Sekhar Nayak
+2008-2017, Jim O'Regan
+2009-2017, Kevin Brubeck Unhammer
+2016, Frankie Robertson
+2014-2015, Hrvoje Peradin
+2007-2013, Sergio Ortiz Rojas
+2011, Pim Otte
+2011, Sjur Nørstebø Moshagen
+2010, Trond Trosterud
+2009, Pasquale Minervini
+2008, Jacob Nordfalk
+2008, Felipe Sánchez Martínez
+2008, Wynand Winterbach
+2007, Stephen Paulger
diff --git a/lt-weight b/lt-weight
new file mode 100755
index 0000000..092fc66
--- /dev/null
+++ b/lt-weight
@@ -0,0 +1,47 @@
+#! /bin/sh
+# Usage: lt-weight FST CORPUS OUTPUT_FST
+#
+# Weights the lttoolbox transducer FST using the entries of CORPUS and writes
+# the recompiled result to OUTPUT_FST. Intermediate files are created in (and
+# removed from) the current directory.
+if [ "$#" -ne 3 ]; then
+    echo "Usage: $0 FST CORPUS OUTPUT_FST" >&2
+    exit 1
+fi
+
+FST=$1
+CORPUS=$2
+OUTPUT_FST=$3
+
+# Temporary intermediate files
+ATTFST='transducer.att'
+HFST_FST='transducer.hfst'
+WEIGHTED_FST='weighted-pairs.hfst'
+COMPOSED_FST='weighted-transducer.hfst'
+MULTICHAR='multichar_symbols'
+
+# Convert the input FST to HFST
+lt-print "$FST" > "$ATTFST"
+hfst-txt2fst --epsilon=ε -i "$ATTFST" -o "$HFST_FST"
+
+# Number of corpus lines, the denominator of the relative frequencies below.
+# Reading from stdin keeps the file name out of the output of wc.
+LINES=$(wc -l < "$CORPUS" | tr -d '[:space:]')
+
+# Prepare the multichar symbols file from the same corpus that is weighted
+awk -F '[<>]' '{print "<"$(NF-1)">"}' "$CORPUS" | tr -d , | sort | uniq > "$MULTICHAR"
+
+# Generate a weighted FST from the string pairs; weight = -log(count / lines)
+sed -e 's/[ \t]//' "$CORPUS" | sed -e 's/\^.*\///' |
+sed -e 's/\$$//' | sort | uniq -c | sed -e 's/^[ \t]*//' |
+awk -v lines="$LINES" '{$1=-log($1/lines); print $2":" $2 "\t" $1}' |
+hfst-strings2fst -j -o "$WEIGHTED_FST" -m "$MULTICHAR"
+
+# Compose the input FST and the weighted FST
+hfst-compose -1 "$HFST_FST" -2 "$WEIGHTED_FST" | hfst-fst2txt > "$COMPOSED_FST"
+
+# Compile the FST back using lttoolbox
+lt-comp lr "$COMPOSED_FST" "$OUTPUT_FST"
+
+# Delete the temporary files
+rm "$ATTFST" "$HFST_FST" "$WEIGHTED_FST" "$MULTICHAR" "$COMPOSED_FST"
diff --git a/tests/basictest.py b/tests/basictest.py
new file mode 100644
index 0000000..0aff6a3
--- /dev/null
+++ b/tests/basictest.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+import signal
+
+
+class Alarm(Exception):
+    """Raised by the SIGALRM handler when a timed call runs too long."""
+
+
+class BasicTest:
+    """Helpers for exchanging NUL-delimited messages with a subprocess."""
+
+    def alarmHandler(self, signum, frame):
+        """Signal handler: turn SIGALRM into an Alarm exception."""
+        raise Alarm
+
+    def withTimeout(self, seconds, cmd, *args, **kwds):
+        """Return cmd(*args, **kwds); Alarm is raised after `seconds`."""
+        signal.signal(signal.SIGALRM, self.alarmHandler)
+        signal.alarm(seconds)
+        try:
+            return cmd(*args, **kwds)
+        finally:
+            # Always cancel the pending alarm, even when cmd raises, so it
+            # cannot fire later inside unrelated code.
+            signal.alarm(0)
+
+    def communicateFlush(self, string, process):
+        """Send `string` to `process` and return its reply as text.
+
+        The reply is read one byte at a time, up to a NUL byte or end of
+        stream, with a two-second timeout per read; on timeout whatever was
+        read so far is returned.
+        """
+        if string:
+            process.stdin.write(string.encode('utf-8'))
+            process.stdin.write(b'\0')
+            process.stdin.flush()
+
+        output = []
+        char = None
+        try:
+            char = self.withTimeout(2, process.stdout.read, 1)
+        except Alarm:
+            pass
+        while char and char != b'\0':
+            output.append(char)
+            try:
+                char = self.withTimeout(2, process.stdout.read, 1)
+            except Alarm:
+                break  # send what we got up till now
+
+        return b"".join(output).decode('utf-8')
diff --git a/tests/run_tests.py b/tests/run_tests.py
new file mode 100755
index 0000000..c394422
--- /dev/null
+++ b/tests/run_tests.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+sys.path.append(os.path.realpath("."))
+
+import unittest
+import lt_proc, lt_trim, lt_print
+
+if __name__ == "__main__":
+    # abspath() guards against dirname() returning "" for a bare script name,
+    # which would make chdir() raise.
+    os.chdir(os.path.dirname(os.path.abspath(__file__)))
+    failures = 0
+    for module in [lt_trim, lt_proc, lt_print]:
+        suite = unittest.TestLoader().loadTestsFromModule(module)
+        res = unittest.TextTestRunner(verbosity = 2).run(suite)
+        # Tests that raise unexpectedly are reported as errors, not failures;
+        # both must make the run fail.
+        failures += len(res.failures) + len(res.errors)
+    # Cap the count so it still fits in a process exit status.
+    sys.exit(min(failures, 255))