Index: branches/apertium-separable/README =================================================================== --- branches/apertium-separable/README (revision 80319) +++ branches/apertium-separable/README (nonexistent) @@ -1,20 +0,0 @@ -Noun phrase acceptor: - n* - prn* - np* - det* n* - adj(*) n* - adj(*) adj(*) n* - det* adj(*) n* - -* = one or more tags -( ) = optional - -takeout.fst gives: - take# out => ϵ => out - take# out => prn* => ϵ => out - take# out => np* => ϵ => out - take# out => ϵ => ϵ => n* => out - take# out => det* => ϵ => n => out - take# out => ϵ => adj(*) => n => out - take# out => det* => adj(*) => n* => out Index: branches/apertium-separable/examples/Makefile =================================================================== --- branches/apertium-separable/examples/Makefile (revision 80319) +++ branches/apertium-separable/examples/Makefile (nonexistent) @@ -1,7 +0,0 @@ -all: - lt-comp lr example.dix example.bin - lt-print example.bin | python3 convert-mwe-att.py > example.att -# cat example.att | sed 's/ /@_SPACE_@/g' | hfst-txt2fst -e ε -o example.hfst - cat example.2.att | sed 's/ /@_SPACE_@/g' | hfst-txt2fst -e ε -o example.2.hfst - -# echo "pick<$><$><$>up" | hfst-lookup -p example.2.hfst Index: branches/apertium-separable/fst/Makefile =================================================================== --- branches/apertium-separable/fst/Makefile (nonexistent) +++ branches/apertium-separable/fst/Makefile (revision 80320) @@ -0,0 +1,15 @@ +CFLAGS= -I/usr/local/include/lttoolbox-3.3 +LDFLAGS= -L/usr/local/lib -llttoolbox3 + +transducer2: ../src/transducer2.cpp + g++ -ggdb $(CFLAGS) -Wall ../src/transducer2.cpp -o $@ $(LDFLAGS) + +all: transducer2 + +test: + ./transducer2 + +clean: + rm transducer2 + rm -rf transducer2.dSYM + find . -name "*.fst" -type f -delete Index: branches/apertium-separable/fst/README =================================================================== --- branches/apertium-separable/fst/README (nonexistent) +++ branches/apertium-separable/fst/README (revision 80320) @@ -0,0 +1,11 @@ +in fst directory: + +make +python ../scripts/reap_multiwords.py ../../apertium-en-es/apertium-en-es.en.metadix | ./transducer2 + + +generates fst's from dictionary + + +if make clean doesn't work: +shopt -s extglob Index: branches/apertium-separable/scripts/att2dot.py =================================================================== --- branches/apertium-separable/scripts/att2dot.py (nonexistent) +++ branches/apertium-separable/scripts/att2dot.py (revision 80320) @@ -0,0 +1,18 @@ +import sys ; + +print('digraph G {'); +print('rankdir=LR'); +print('node [shape=circle];'); + +for line in sys.stdin.readlines(): #{ + + row = line.strip('\n').split('\t'); + + if len(row) == 5: #{ + print('\t%s->%s [label="%s:%s"];' % (row[0], row[1], row[2], row[3])); + elif len(row) == 1: #{ + print('\t%s [style=double];' % (row[0])); + #} +#} + +print('}'); Index: branches/apertium-separable/scripts/reap_multiwords.py =================================================================== --- branches/apertium-separable/scripts/reap_multiwords.py (nonexistent) +++ branches/apertium-separable/scripts/reap_multiwords.py (revision 80320) @@ -0,0 +1,16 @@ +#python reap_multiwords.py ../../apertium-en-es/apertium-en-es.en.metadix + +#!/usr/bin/env python +import sys, re + +dix_file = sys.argv[1] +sep_words = [] + +with open(dix_file) as input_file: + for line in input_file: + line = line.strip() + if len(line) > 2 and line[0:6] == ' -#include -#include - -#include -#include -#include - -using namespace std; - -FSTProcessor fstp; - -int main(int argc, char **argv) -{ - if(argc < 2) { - wcout << L"Please specify a transducer" << endl; - exit(-1); - } - - LtLocale::tryToSetLocale(); - FILE *t_rl = fopen(argv[1], "r"); - - fstp.load(t_rl); - fclose(t_rl); - fstp.initBiltrans(); - - wstring input = L"^car$"; - wstring trad = fstp.biltrans(input); - - wcout << input << L" --> " << trad << endl; - - return 0; -} \ No newline at end of file Index: branches/apertium-separable/src/test.dix =================================================================== --- branches/apertium-separable/src/test.dix (revision 80319) +++ branches/apertium-separable/src/test.dix (nonexistent) @@ -1,10 +0,0 @@ - - - - - - -
-

carscar

-
-
\ No newline at end of file Index: branches/apertium-separable/src/Makefile =================================================================== --- branches/apertium-separable/src/Makefile (revision 80319) +++ branches/apertium-separable/src/Makefile (revision 80320) @@ -1,5 +1,6 @@ CFLAGS=-I/home/fran/local/include/lttoolbox-3.3 -I/usr/local/include/lttoolbox-3.3 LDFLAGS=-L/home/fran/local/lib -L/usr/local/lib -llttoolbox3 + transducer: transducer.cpp g++ -ggdb $(CFLAGS) -Wall transducer.cpp -o $@ $(LDFLAGS) @@ -15,4 +16,7 @@ all: transducer processor proc2 test: - ./transducer + ./transducer ./processor ./proc2 ./proc3 + +clean: + rm -f *.o transducer processor proc2 proc3 Index: branches/apertium-separable/src/transducer.cpp =================================================================== --- branches/apertium-separable/src/transducer.cpp (revision 80319) +++ branches/apertium-separable/src/transducer.cpp (revision 80320) @@ -46,8 +46,10 @@ int any_char = alphabet(L""); int wb_sym = alphabet(L"<$>"); + /* noun phrase acceptor: see README */ + int initial = t.getInitial(); - int take_out = initial; //0 + int take_out = initial; take_out = t.insertSingleTransduction(alphabet(L't',L't'), take_out); take_out = t.insertSingleTransduction(alphabet(L'a',L'a'), take_out); take_out = t.insertSingleTransduction(alphabet(L'k',L'k'), take_out); @@ -63,18 +65,6 @@ t.linkStates(take_out, loop, 0); take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - /* noun phrase acceptor: - n - prn - det.* n.* - adj. n.* - adj.* n.* - det.* adj n.* - adj.* adj.* n.* - prn.pers.* - prn.dem.* - np - */ int after_takeout = take_out; /* no det */ @@ -162,12 +152,6 @@ /* n */ take_out = t.insertSingleTransduction(alphabet(n_sym,n_sym), lm_noun); - // take_out = after_det; - // take_out = t.insertSingleTransduction(alphabet(n_sym,n_sym), take_out); - // - // take_out = from_adj; - // take_out = t.insertSingleTransduction(alphabet(n_sym,n_sym), take_out); - // loop = take_out; take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); t.linkStates(take_out, loop, 0); Index: branches/apertium-separable/src/transducer2.cpp =================================================================== --- branches/apertium-separable/src/transducer2.cpp (revision 80319) +++ branches/apertium-separable/src/transducer2.cpp (revision 80320) @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -14,42 +15,10 @@ #include #include -struct foo { - Transducer t; - int takeout_state; - int none_state; -} +using namespace std; -foo add_anychar(Alphabet alphabet, int any_char, Transducer t, int take_out); -foo add_anytag(Alphabet alphabet, int any_char, Transducer t, int take_out); - -foo add_anychar(Alphabet alphabet, int any_char, Transducer t, int take_out) { - int loop = take_out; - int none = take_out; - take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop); - none = t.insertSingleTransduction(alphabet(0,0), none); - t.linkStates(take_out, loop, 0); - t.linkStates(none, loop, 0); - - foo bar = {t, take_out, none}; - return bar; -} - -foo add_anytag(Alphabet alphabet, int any_tag, Transducer t, int take_out) { - int loop = take_out; - int none = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - none = t.insertSingleTransduction(alphabet(0,0), none); - t.linkStates(take_out, loop, 0); - t.linkStates(none, loop, 0); - - foo bar = {t, take_out, none}; - return bar; -} - -int main(int argc, char** argv) { +int main (int argc, char** argv) { Alphabet alphabet; - Transducer t; LtLocale::tryToSetLocale(); @@ -59,8 +28,6 @@ alphabet.includeSymbol(L""); alphabet.includeSymbol(L""); alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); alphabet.includeSymbol(L""); alphabet.includeSymbol(L""); @@ -72,26 +39,29 @@ int det_sym = alphabet(L""); int prn_sym = alphabet(L""); int np_sym = alphabet(L""); - int adv_sym = alphabet(L""); - int pr_sym = alphabet(L""); int any_tag = alphabet(L""); int any_char = alphabet(L""); int wb_sym = alphabet(L"<$>"); + /* reap from input file */ + for (string line; getline(cin, line);) { + Transducer t; + string first_token = line.substr(0, line.find(' ')); + string second_token = line.substr(line.find(' ') + 1); + + /* noun phrase acceptor: see README */ + int initial = t.getInitial(); int take_out = initial; - - /* take# out */ - take_out = t.insertSingleTransduction(alphabet(L't',L't'), take_out); //1 - take_out = t.insertSingleTransduction(alphabet(L'a',L'a'), take_out); //2 - take_out = t.insertSingleTransduction(alphabet(L'k',L'k'), take_out); //3 - take_out = t.insertSingleTransduction(alphabet(L'e',L'e'), take_out); //4 - take_out = t.insertSingleTransduction(alphabet(0,L'#'), take_out); //5 - take_out = t.insertSingleTransduction(alphabet(0,L' '), take_out); //6 - take_out = t.insertSingleTransduction(alphabet(0,L'o'), take_out); //7 - take_out = t.insertSingleTransduction(alphabet(0,L'u'), take_out); //8 - take_out = t.insertSingleTransduction(alphabet(0,L't'), take_out); //9 + for (wchar_t c : first_token) { + take_out = t.insertSingleTransduction(alphabet(c,c), take_out); + } + take_out = t.insertSingleTransduction(alphabet(0,L'#'), take_out); + take_out = t.insertSingleTransduction(alphabet(0,L' '), take_out); + for (wchar_t c : second_token) { + take_out = t.insertSingleTransduction(alphabet(0,c), take_out); + } take_out = t.insertSingleTransduction(alphabet(vblex_sym,vblex_sym), take_out); int loop = take_out; take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); @@ -98,65 +68,118 @@ t.linkStates(take_out, loop, 0); take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - /* nothing */ - int reset = take_out; - int none = 0; + int after_takeout = take_out; - /* n */ - take_out = reset; + /* no det */ + int from_nodet = after_takeout; - foobar = add_anychar(alphabet, any_char, t, take_out); - t = foobar.transducer; - take_out = foobar.takeout_state; - none = foobar.none_state; + /* first lemma */ + loop = after_takeout; + take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop); + t.linkStates(take_out, loop, 0); - take_out = t.insertSingleTransduction(alphabet(n_sym,n_sym), take_out); - none = t.insertSingleTransduction(alphabet(0,0), none); + int first_lm = take_out; - foobar = add_anytag(alphabet, any_tag, t, take_out); - t = foobar.transducer; - take_out = foobar.takeout_state; - none = foobar.none_state; + /* prn */ + take_out = t.insertSingleTransduction(alphabet(prn_sym,prn_sym), first_lm); + loop = take_out; + take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); + t.linkStates(take_out, loop, 0); + take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - none = t.insertSingleTransduction(alphabet(0,0), none); - /* pr */ - take_out = reset; + int after_prn = take_out; - foobar = add_anychar(alphabet, any_char, t, take_out); - t = foobar.transducer; - take_out = foobar.takeout_state; - none = foobar.none_state; + /* np */ + take_out = t.insertSingleTransduction(alphabet(np_sym,np_sym), first_lm); - take_out = t.insertSingleTransduction(alphabet(pr_sym,pr_sym), take_out); - none = t.insertSingleTransduction(alphabet(0,0), none); + loop = take_out; + take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); + t.linkStates(take_out, loop, 0); - foobar = add_anytag(alphabet, any_tag, t, take_out); - t = foobar.transducer; - take_out = foobar.takeout_state; - none = foobar.none_state; + take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); + int after_np = take_out; + + /* det */ + take_out = t.insertSingleTransduction(alphabet(det_sym,det_sym), first_lm); + + loop = take_out; + take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); + t.linkStates(take_out, loop, 0); + take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - none = t.insertSingleTransduction(alphabet(0,0), none); + int after_det = take_out; + /* no adj */ + int from_noadj = take_out; //same as after_det + /* lemma for the adj */ + loop = after_det; + take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop); + t.linkStates(take_out, loop, 0); + int lm_adj = take_out; + /* adj */ + take_out = t.insertSingleTransduction(alphabet(adj_sym,adj_sym), lm_adj); + + int optional_adj = take_out; + + loop = take_out; + take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); + t.linkStates(take_out, loop, 0); + + //may not have a second tag + t.linkStates(optional_adj, take_out, 0); + + take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); + + int after_adj = take_out; + + /* lemma for the noun */ + loop = after_adj; + take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop); + t.linkStates(take_out, loop, 0); + + int lm_noun = take_out; + + /* possible subsequent adj */ + t.linkStates(lm_noun, lm_adj, alphabet(adj_sym,adj_sym)); + + /* n */ + take_out = t.insertSingleTransduction(alphabet(n_sym,n_sym), lm_noun); + + loop = take_out; + take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); + t.linkStates(take_out, loop, 0); + + take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); + /* out */ - take_out = t.insertSingleTransduction(alphabet(L'o',0), take_out); - take_out = t.insertSingleTransduction(alphabet(L'u',0), take_out); - take_out = t.insertSingleTransduction(alphabet(L't',0), take_out); + int before_out = take_out; + + for (wchar_t c : second_token) { + take_out = t.insertSingleTransduction(alphabet(c,0), take_out); + } take_out = t.insertSingleTransduction(alphabet(any_tag, 0), take_out); take_out = t.insertSingleTransduction(alphabet(wb_sym,0), take_out); t.setFinal(take_out); - FILE* fst = fopen("takeout.fst", "w+"); + /* final link states */ + t.linkStates(after_takeout, before_out, 0); + t.linkStates(after_prn, before_out, 0); + t.linkStates(after_np, before_out, 0); + t.linkStates(from_nodet, after_det, 0); + t.linkStates(from_noadj, after_adj, 0); + string filename = regex_replace(line,std::regex("\\s+"), "") + ".fst"; + FILE* fst = fopen(filename.c_str(), "w+"); // First write the letter symbols of the alphabet - Compression::wstring_write(L"aekout", fst); + Compression::wstring_write(L"abcdefghijklmnopqrstuvwxyz", fst); // Then write the multicharacter symbols alphabet.write(fst); // Then write then number of transducers @@ -165,8 +188,9 @@ Compression::wstring_write(L"main@standard", fst); // Then write the transducer t.write(fst); - wcout << "t.size(): " << t.size() << endl ; + cout << line << " t.size(): " << t.size() << endl ; fclose(fst); + } return 0; -} +} \ No newline at end of file Index: branches/apertium-separable/testing/acceptor.py =================================================================== --- branches/apertium-separable/testing/acceptor.py (revision 80319) +++ branches/apertium-separable/testing/acceptor.py (nonexistent) @@ -1,40 +0,0 @@ -import sys, re - -def next_token(): - token = sys.stdin.read(1) - if token == '<': - c = '' - while c != '>': - c = sys.stdin.read(1) - token += c - # print token - return token - -def main(): - states = ['^', 't', 'a', 'k', 'e', '', '$', ' ', '^', 'ANY_CHAR', '', '$', ' ', '^', 'o', 'u', 't', '$'] - #states = ['^', 't', 'a', 'k', 'e', '', '', '$', ' ', '^', 'ANY_CHAR', '', '', '$', ' ', '^', 'o', 'u', 't', '$'] - current_state = 0 - - print('input a string:') - c = next_token() - -#^take$ ^the$ ^thing$ ^out$ - - while c: - if current_state == len(states): #end of input - print('Yes') - sys.exit(0) - elif c == states[current_state]: #ok - print c, current_state - current_state += 1 - elif re.match('[a-zA-Z]', c): - print c, current_state - elif re.match('\<(.+?)\>', c): - print c, current_state - else: #error - print('No') - sys.exit(1) - c = next_token() - -if __name__ == '__main__': - main() \ No newline at end of file