Index: branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/Makefile
===================================================================
--- branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/Makefile	(nonexistent)
+++ branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/Makefile	(revision 70904)
@@ -0,0 +1,2 @@
+all:
+	g++ -Wall -Wextra -O3 -g process_tagger_output.cc -o process-tagger-output -I/usr/local/include/apertium-3.4 -I/usr/local/include/lttoolbox-3.3 -I/usr/include/apertium-3.4 -I/usr/include/lttoolbox-3.3 -I/usr/include/libxml2 -llttoolbox3 -lapertium3 -lxml2
Index: branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/process_tagger_output.cc
===================================================================
--- branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/process_tagger_output.cc	(nonexistent)
+++ branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/process_tagger_output.cc	(revision 70904)
@@ -0,0 +1,172 @@
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include <string>
+#include <vector>
+#include <map>
+
+#include <lttoolbox/lt_locale.h>
+#include <lttoolbox/fst_processor.h>
+//#include <lttoolbox/ltstr.h>
+#include <apertium/string_utils.h>
+
+using namespace std;
+
+// Return the position of x in xs, or -1 if it is not present.
+int find(vector<wstring> xs, wstring x) {
+    for (size_t i = 0; i < xs.size(); i++) {
+        if (xs[i] == x)
+            return i;
+    }
+    return -1;
+}
+
+FSTProcessor loadBilingual(char *path) {
+    FSTProcessor bilingual;
+
+    FILE *f_bin = fopen(path, "r");
+    bilingual.load(f_bin);
+    fclose(f_bin);
+    bilingual.initBiltrans();
+    return bilingual;
+}
+
+// Collect the contents of every <...> tag in a lexical unit.
+vector<wstring> parseTags(wstring token) {
+    int state = 0; // outside a tag
+    vector<wstring> tags;
+    wstring buffer;
+    for (size_t i = 0; i < token.size(); i++) {
+        wchar_t c = token[i];
+        if (state == 0) {
+            if (c == L'<') {
+                state = 1;
+            }
+        } else if (state == 1) {
+            if (c == L'>') {
+                tags.push_back(buffer);
+                buffer = L"";
+                state = 0;
+            } else {
+                buffer += c;
+            }
+        }
+    }
+    return tags;
+}
+
+// Split wstr on delim, keeping the delimiter at the end of each token.
+vector<wstring> wsplit(wstring wstr, wchar_t delim) {
+    vector<wstring> tokens;
+    wstring buffer;
+
+    for (size_t i = 0; i < wstr.size(); i++) {
+        buffer += wstr[i];
+        if (wstr[i] == delim) {
+            tokens.push_back(buffer);
+            buffer = L"";
+        }
+    }
+    if (buffer != L"") {
+        tokens.push_back(buffer);
+    }
+    return tokens;
+}
+
+// Everything up to the first tag: for "^lemma<n>..." this is "^lemma".
+wstring getLemma(wstring token) {
+    wstring buffer;
+    for (size_t i = 0; i < token.size(); i++) {
+        if (token[i] != L'<') {
+            buffer += token[i];
+        } else {
+            break;
+        }
+    }
+    return buffer;
+}
+
+void processTaggerOutput(FSTProcessor *bilingual) {
+
+    wstring buffer;
+
+    bool escaped = false;
+    int state = 0; // outside a lexical unit
+    wchar_t c;
+    bilingual->setBiltransSurfaceForms(true);
+    while (wcin.get(c)) {
+
+        if (state == 0) {
+            if (c == L'^' && !escaped) {
+                state = 1; // inside a lexical unit
+                buffer += c;
+            } else if (c == L'\\' && !escaped) {
+                wcout << c;
+                escaped = true;
+            } else {
+                wcout << c;
+                escaped = false;
+            }
+        } else if (state == 1) {
+            if (c == L'$' && !escaped) {
+
+                // Look the unit up in the bidix twice, with and without
+                // the translation queue.
+                vector<wstring> sourceTags = parseTags(buffer);
+                wstring target = bilingual->biltrans(buffer + L"$", true);
+                vector<wstring> targetTags = parseTags(target);
+                wstring targetTrimmed = bilingual->biltransWithoutQueue(buffer + L"$", true);
+                vector<wstring> trimmedTags = parseTags(targetTrimmed);
+                vector<wstring> newTags;
+
+                // Keep a source tag only if it occupies the same position
+                // in both lookups (or is absent from both).
+                for (size_t i = 0; i < sourceTags.size(); i++) {
+                    wstring sourceTag = sourceTags[i];
+                    int idx_1 = find(targetTags, sourceTag);
+                    int idx_2 = find(trimmedTags, sourceTag);
+                    if (idx_1 == idx_2) {
+                        newTags.push_back(sourceTag);
+                    }
+                }
+                wcout << getLemma(buffer);
+                for (size_t i = 0; i < newTags.size(); i++) {
+                    wcout << L'<' << newTags[i] << L'>';
+                }
+
+                // Replace the leading '^' so the translations can be
+                // echoed directly after the source side.
+                targetTrimmed[0] = L'/';
+                if (targetTrimmed == L"/") {
+                    // No translation at all: mark the word as unknown.
+                    buffer[0] = L'@';
+                    wcout << L"/" + buffer + L"$";
+                } else {
+                    vector<wstring> tokens = wsplit(targetTrimmed, L'/');
+                    for (size_t i = 0; i < tokens.size(); i++) {
+                        wcout << tokens[i];
+                    }
+                }
+
+                buffer = L"";
+                state = 0;
+                escaped = false;
+
+            } else if (c == L'\\' && !escaped) {
+                escaped = true;
+                buffer += c;
+            } else {
+                buffer += c;
+                escaped = false;
+            }
+        }
+    }
+}
+
+int main(int argc, char **argv)
+{
+    if (argc != 2) {
+        wcout << L"Usage: " << argv[0] << " bidix_bin_file" << endl;
+        wcout << L"with output from pretransfer on standard input." << endl;
+        exit(-1);
+    }
+
+    LtLocale::tryToSetLocale();
+    FSTProcessor bilingual = loadBilingual(argv[1]);
+    processTaggerOutput(&bilingual);
+
+    return 0;
+}
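The tag-filtering loop above is the core of process_tagger_output.cc: a source-side
tag is kept only if it occupies the same position in the bidix lookup with and
without the translation queue, or is absent from both. A minimal Python sketch of
that rule, with invented tag lists standing in for the parseTags() results:

    # Sketch of the tag-intersection step; the tag lists are invented examples.
    def filter_tags(source_tags, target_tags, trimmed_tags):
        def index(xs, x):
            return xs.index(x) if x in xs else -1  # mirrors find() in the C++ code
        return [t for t in source_tags
                if index(target_tags, t) == index(trimmed_tags, t)]

    # 'pl' is missing from the trimmed lookup, so it is dropped:
    print(filter_tags(['n', 'f', 'pl'], ['n', 'f', 'pl'], ['n', 'f']))  # ['n', 'f']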
Index: branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/giza-to-moses.awk
===================================================================
--- branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/giza-to-moses.awk	(nonexistent)
+++ branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/giza-to-moses.awk	(revision 70904)
@@ -0,0 +1,109 @@
+#!/usr/bin/gawk -f
+
+# (c) 2011 Felipe Sánchez-Martínez
+# (c) 2011 Universitat d'Alacant / Universidad de Alicante
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+# This script reads GIZA++ alignments and processes them, giving as
+# output a more human- (and machine-) readable format with the same
+# information. From this new format it is easier to construct an
+# alignment matrix between source and target sentences.
+#
+# Output format:
+# source_sentence ||| target_sentence ||| alignment
+#
+# Example:
+# Reanudación del período de sesiones ||| Resumption of the session ||| 0-0 1-2 3-1 4-3
+
+function process_alignment(str) {
+  source_sentence="";
+  alignment="";
+
+  ntokens=split(str, tokens, " }) ");
+
+  sl_pos=0;
+
+  for(i=1; i<=ntokens; i++) {
+    if (length(tokens[i])==0)
+      continue;
+
+    nwa=split(tokens[i], wa, " ");
+    if (nwa<2) {
+      print "Error while processing the alignment information at input line " line > "/dev/stderr";
+      exit 1;
+    }
+
+    if (wa[1] == "NULL") # alignments to NULL are ignored
+      continue;
+
+    if(length(source_sentence)>0) {
+      source_sentence = source_sentence " ";
+    }
+    source_sentence = source_sentence wa[1];
+
+    # wa[2] is the literal "({"; wa[3..nwa] are 1-based target positions
+    for (j=3; j<=nwa; j++) {
+      if (length(alignment)>0) {
+        alignment = alignment " ";
+      }
+      alignment = alignment sl_pos "-" (wa[j]-1);
+    }
+
+    sl_pos++;
+  }
+}
+
+function trim(w) {
+  for(i=1;i<=length(w);i++){
+    if(substr(w,i,1)~/[ \t\r\n]/);
+    else break;
+  }
+  liminf=i;
+
+  for(i=length(w);i>=1;i--){
+    if(substr(w,i,1)~/[ \t\r\n]/);
+    else break;
+  }
+  limsup=i;
+
+  return substr(w,liminf,limsup-liminf+1);
+}
+
+BEGIN {
+  line=0;
+  alignment_score=0;
+  target_sentence="";
+  source_sentence="";
+  alignment="";
+  reading_al=0;
+}
+{
+  line++;
+  if (reading_al==0)
+    alignment_score=$NF;
+  else if (reading_al==1)
+    target_sentence=$0;
+  else {
+    process_alignment($0);
+    print trim(source_sentence) " ||| " trim(target_sentence) " ||| " alignment;
+  }
+
+  reading_al++;
+  if (reading_al>2)
+    reading_al=0;
+}
Property changes on: branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/giza-to-moses.awk
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
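For reference, the three-line GIZA++ record that yields the example above would look
roughly like this (the score is illustrative; the positions inside "({ ... })" are
1-based target indices, and GIZA++ ends the line with a trailing space after the
final "})", which is why process_alignment skips empty tokens):

    # Sentence pair (1) source length 5 target length 4 alignment score : 2.8e-05
    Resumption of the session
    NULL ({ }) Reanudación ({ 1 }) del ({ 3 }) período ({ }) de ({ 2 }) sesiones ({ 4 })

which giza-to-moses.awk turns into:

    Reanudación del período de sesiones ||| Resumption of the session ||| 0-0 1-2 3-1 4-3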
Index: branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/strip-empty-lines.py
===================================================================
--- branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/strip-empty-lines.py	(nonexistent)
+++ branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/strip-empty-lines.py	(revision 70904)
@@ -0,0 +1,46 @@
+#!/usr/bin/python3
+# coding=utf-8
+# -*- encoding: utf-8 -*-
+
+import sys;
+
+# Replaces:
+#perl "$MOSESDECODER/clean-corpus-n.perl" data.$SL-$TL/$CORPUS.tagged $SL $TL "data.$SL-$TL/$CORPUS.tagged-clean" 1 40;
+
+prefix = sys.argv[1];
+sl = sys.argv[2];
+tl = sys.argv[3];
+outfix = sys.argv[4];
+
+sl_f = open(prefix + '.' + sl);
+tl_f = open(prefix + '.' + tl);
+
+sl_o = open(outfix + '.' + sl, 'w+');
+tl_o = open(outfix + '.' + tl, 'w+');
+
+reading = True;
+
+inlines = 0;
+outlines = 0;
+
+while reading: #{
+    slline = sl_f.readline();
+    tlline = tl_f.readline();
+
+    if not slline and not tlline: #{
+        break;
+    #}
+
+    if slline.strip() == '' or tlline.strip() == '': #{
+        inlines = inlines + 1;
+        continue;
+    else: #{
+        sl_o.write(slline);
+        tl_o.write(tlline);
+        outlines = outlines + 1;
+    #}
+    inlines = inlines + 1;
+#}
+
+print('in: %d, out: %d' % (inlines, outlines));
Property changes on: branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/strip-empty-lines.py
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
Index: branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/extract-sentences.py
===================================================================
--- branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/extract-sentences.py	(nonexistent)
+++ branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/extract-sentences.py	(revision 70904)
@@ -0,0 +1,96 @@
+#!/usr/bin/python3
+# coding=utf-8
+# -*- encoding: utf-8 -*-
+
+import sys, codecs;
+import common;
+
+if len(sys.argv) < 3: #{
+    print('extract-sentences.py <phrase_table> <biltrans_out>');
+    sys.exit(-1);
+#}
+
+phrase_table = open(sys.argv[1]);
+biltrans_out = open(sys.argv[2]);
+
+def ambiguous(bt): #{
+    # e.g. legislation/legislación/ordenamiento has more than one translation
+    ambig = False;
+    for token in bt: #{
+        tls = token['tls'];
+        if len(tls) > 1: #{
+            return True;
+        #}
+    #}
+    return ambig;
+#}
+
+reading = True;
+lineno = 0;
+total_valid = 0;
+total_errors = 0;
+
+not_ambiguous = [];
+
+while reading: #{
+    try:
+        lineno = lineno + 1;
+        pt_line = phrase_table.readline().strip();
+        bt_line = biltrans_out.readline().strip();
+
+        if not bt_line.strip() and not pt_line.strip(): #{
+            reading = False;
+            break;
+        elif not bt_line.strip() or not pt_line.strip(): #{
+            continue;
+        #}
+
+        row = pt_line.split('|||');
+        bt = common.tokenise_biltrans_line(bt_line.strip());
+        sl = common.tokenise_tagger_line(row[1].strip());
+        tl = common.tokenise_tagger_line(row[0].strip());
+
+        if not ambiguous(bt): #{
+            not_ambiguous.append(str(lineno));
+            if len(not_ambiguous) >= 10: #{
+                print('not ambiguous:', ' '.join(not_ambiguous), file=sys.stderr);
+                not_ambiguous = [];
+            #}
+            continue;
+        #}
+        if len(sl) < 2 and len(tl) < 2: #{
+            continue;
+        #}
+
+        # Check that the number of words in the lexical transfer
+        # and in the phrase table match up
+        if len(sl) != len(bt): #{
+            print('len(sl) != len(bt)', file=sys.stderr);
+            continue;
+        #}
+
+        # Resumption of the session
+        # Resumption/Reanudación of/de the/el session/sesión
+        # Reanudación de el periodo de sesión
+        # 0-0 1-1 2-2 5-3
+
+        print(lineno, '\t' + row[1]);
+        print(lineno, '\t' + bt_line);
+        print(lineno, '\t' + row[0]);
+        print(lineno, '\t' + row[2]);
+        print('-------------------------------------------------------------------------------');
+        total_valid += 1;
+    except:
+        print('error in line', lineno, file=sys.stderr);
+        total_errors += 1;
+        continue;
+#}
+
+print('total:', lineno, file=sys.stderr);
+print('valid:', total_valid, '(' + str((total_valid/lineno)*100) + '%)', file=sys.stderr);
+print('errors:', total_errors, '(' + str((total_errors/lineno)*100) + '%)', file=sys.stderr);
Property changes on: branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/extract-sentences.py
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
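Each record that extract-sentences.py prints, and that split-sentences.py later
consumes, is therefore five lines, schematically:

    lineno <TAB> source sentence (phrase-table field 2)
    lineno <TAB> ambiguous lexical-transfer (biltrans) line
    lineno <TAB> target sentence (phrase-table field 1)
    lineno <TAB> word alignment (phrase-table field 3), e.g. 0-0 1-2 3-1 4-3
    -------------------------------------------------------------------------------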
Index: branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/unused/split-sentences.py
===================================================================
--- branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/unused/split-sentences.py	(nonexistent)
+++ branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/unused/split-sentences.py	(revision 70904)
@@ -0,0 +1,160 @@
+#!/usr/bin/python3
+# coding=utf-8
+# -*- encoding: utf-8 -*-
+
+import sys, codecs, random;
+
+def convert_to_biltrans(bt): #{
+    outline = '';
+    for word in bt.split(' '): #{
+        outline = outline + '^' + word.replace('~', ' ') + '$ ';
+    #}
+    return outline;
+#}
+
+# Take the ambiguous lexical transfer output and return
+# an unambiguous lexical transfer output, as far as possible.
+def disambiguate_with_alig(bt, al, tl): #{
+    outline = '';
+    i = 0;
+    tl_row = tl.split(' ');
+    for word in bt.split(' '): #{
+        resolved = [];
+        if word.count('/') > 1: #{
+            for alig in al.split(' '): #{
+                # e.g. 0-3 2-0 3-2 4-1 5-4 6-7 7-5 8-6 9-8 11-9 12-15 13-16 15-10 16-11 18-12 20-13 21-14 22-17
+                l = int(alig.split('-')[0]);
+                r = int(alig.split('-')[1]);
+                if r == i: #{
+                    #print('x', word, tl_row[l]);
+                    for lu in word.split('/')[1:]: #{
+                        st_lem = lu.split('<')[0].lower();
+                        t_lem = tl_row[l].split('<')[0].lower();
+                        #print(st_lem, t_lem);
+                        if st_lem == t_lem: #{
+                            resolved.append(lu);
+                        #}
+                    #}
+                #}
+            #}
+        #}
+        if len(resolved) > 0: #{
+            slw = word.split('/')[0];
+            out = '^' + slw + '/';
+            for tlw in resolved: #{
+                out = out + tlw + '/';
+            #}
+            out = out + '$ ';
+            outline = outline + out.replace('/$', '$').replace('~', ' ');
+        else: #{
+            outline = outline + '^' + word.replace('~', ' ') + '$ ';
+        #}
+        i = i + 1;
+    #}
+    #print(outline);
+    return outline;
+#}
+
+if len(sys.argv) < 3: #{
+    print('split-sentences.py <candidates> <testlen>');
+    sys.exit(-1);
+#}
+
+candidates = open(sys.argv[1]);
+testlen = int(sys.argv[2]);
+
+# We read through the file and collect all the line numbers.
+# Then we randomise and select the top _testlen_ for the test
+# set and the next top _testlen_ for the dev set.
+# The rest are printed out.
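+# For example, with testlen=2000 and 10000 distinct line numbers, the
+# shuffled list is cut as tst = linenos[0:2000], dev = linenos[2000:4000]
+# and train = linenos[4000:].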
+
+linenos = [];
+
+for line in candidates.readlines(): #{
+    if line.count('--') > 2: #{
+        continue;
+    #}
+    num = line.split('\t');
+    linenos.append(int(num[0]));
+#}
+
+candidates.close();
+
+linenos = set(linenos);
+linenos = list(linenos);
+
+random.shuffle(linenos);
+
+print(len(linenos));
+
+tst = linenos[0:testlen];
+dev = linenos[testlen:testlen*2];
+train = linenos[testlen*2:];
+
+candidates = open(sys.argv[1]);
+
+trainout = open(sys.argv[1].replace('candidates', 'train'), 'w');
+tst_refout = open(sys.argv[1].replace('candidates', 'tst') + '.ref', 'w');
+tst_srcout = open(sys.argv[1].replace('candidates', 'tst') + '.src', 'w');
+dev_refout = open(sys.argv[1].replace('candidates', 'dev') + '.ref', 'w');
+dev_srcout = open(sys.argv[1].replace('candidates', 'dev') + '.src', 'w');
+
+cur_line = -1;
+state = 0;
+sl = '';
+tl = '';
+al = '';
+bt = '';
+for line in candidates.readlines(): #{
+    line = line.strip();
+    if line.count('--') > 2: #{
+        if cur_line in tst: #{
+            #print(cur_line, 'tst');
+            outline = disambiguate_with_alig(bt, al, tl);
+            print(cur_line, ']\t' + outline, file=tst_refout);
+            outline = convert_to_biltrans(bt);
+            print(cur_line, ']\t' + outline, file=tst_srcout);
+        elif cur_line in dev: #{
+            #print(cur_line, 'dev');
+            outline = disambiguate_with_alig(bt, al, tl);
+            print(cur_line, ']\t' + outline, file=dev_refout);
+            outline = convert_to_biltrans(bt);
+            print(cur_line, ']\t' + outline, file=dev_srcout);
+        elif cur_line in train: #{
+            #print(cur_line, 'train');
+            print(cur_line, '\t' + sl, file=trainout);
+            print(cur_line, '\t' + bt, file=trainout);
+            print(cur_line, '\t' + tl, file=trainout);
+            print(cur_line, '\t' + al, file=trainout);
+            print('-------------------------------------------------------------------------------', file=trainout);
+        #}
+        cur_line = -1;
+        state = 0;
+        sl = '';
+        tl = '';
+        al = '';
+        bt = '';
+        continue;
+    #}
+    if cur_line == -1: #{
+        cur_line = int(line.split('\t')[0]);
+    #}
+
+    if cur_line > -1 and state == 0: #{
+        sl = line.split('\t')[1];
+        state = state + 1;
+    elif cur_line > -1 and state == 1: #{
+        bt = line.split('\t')[1];
+        state = state + 1;
+    elif cur_line > -1 and state == 2: #{
+        tl = line.split('\t')[1];
+        state = state + 1;
+    elif cur_line > -1 and state == 3: #{
+        al = line.split('\t')[1];
+        state = state + 1;
+    else: #{
+        print('Something went wrong!', file=sys.stderr);
+    #}
+#}
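In disambiguate_with_alig above, an ambiguous biltrans word keeps only the candidate
translations whose lemma matches the aligned word in the reference target sentence.
A self-contained Python sketch of that matching rule, reduced to a single word, with
invented lexical units for illustration:

    # Matching rule from disambiguate_with_alig(), one word at a time.
    def pick_translation(bt_word, aligned_tl_word):
        sl = bt_word.split('/')[0]
        candidates = bt_word.split('/')[1:]
        tl_lemma = aligned_tl_word.split('<')[0].lower()
        resolved = [c for c in candidates if c.split('<')[0].lower() == tl_lemma]
        # like the script, fall back to the ambiguous entry if nothing matches
        return '^' + '/'.join([sl] + (resolved or candidates)) + '$'

    print(pick_translation('session<n>/sesión<n><f>/período<n><m>',
                           'sesión<n><f><sg>'))  # ^session<n>/sesión<n><f>$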
Index: branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/unused/process-tagger-output.py
===================================================================
--- branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/unused/process-tagger-output.py	(nonexistent)
+++ branches/friendly-lexical-selection-training/apertium-lex-tools/scripts/unused/process-tagger-output.py	(revision 70904)
@@ -0,0 +1,1335 @@
+#!/usr/bin/python
+# coding=utf-8
+# -*- encoding: utf-8 -*-
+
+import sys, codecs, copy, commands;
+
+sys.stdin = codecs.getreader('utf-8')(sys.stdin);
+sys.stdout = codecs.getwriter('utf-8')(sys.stdout);
+sys.stderr = codecs.getwriter('utf-8')(sys.stderr);
+
+c = sys.stdin.read(1);
+
+mk_table = {
+    u'' : u'',
+    u'<@→N>' : u'',
+    u'<@←SPRED>' : u'',
+    u'<@INTERJ>' : u'',
+    u'<@P←>' : u'',
+    u'<@SUBJ→>' : u'',
+    u'