commit 950ad03cd99e75b24b5e9ebb1396ed9bc35a2905 Author: Daniel Swanson Date: Fri Jun 11 14:29:47 2021 -0500 unlocked I/O functions and old src files are unused - remove diff --git a/configure.ac b/configure.ac index b200861..ff39669 100644 --- a/configure.ac +++ b/configure.ac @@ -46,7 +46,6 @@ AC_SUBST(ICU_LIBS) # Checks for libraries. AC_CHECK_LIB(xml2, xmlReaderForFile) -AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked]) AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $APERTIUM_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS" diff --git a/src/processor.cc b/src/processor.cc deleted file mode 100644 index aab265e..0000000 --- a/src/processor.cc +++ /dev/null @@ -1,173 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); - - -/* get the text between delim1 and delim2 */ -/* next_token() */ -wstring -readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) -{ - wstring result = L""; - result += delim1; - wchar_t c = delim1; - - while(!feof(input) && c != delim2) - { - c = static_cast(fgetwc(input)); //fget_unlocked - result += c; - } - - return result; -} - -/*** -main -***/ -int main (int argc, char** argv) -{ - Alphabet alphabet; - TransExe transducer; - - LtLocale::tryToSetLocale(); - FILE *fst = fopen(argv[1], "r"); - - set alphabetic_chars; - int len = Compression::multibyte_read(fst); - while(len > 0) - { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(fst))); - len--; - } - - alphabet.read(fst); - wcout << L"alphabet_size: " << alphabet.size() << endl; - - len = Compression::multibyte_read(fst); - len = Compression::multibyte_read(fst); - wcout << len << endl; - wstring name = L""; - while(len > 0) - { - name += static_cast(Compression::multibyte_read(fst)); - len--; - } - wcout << name << endl; - - transducer.read(fst, alphabet); - - FILE *input = stdin; - FILE *output = stdout; - - /* preparing for processing */ - vector alive_states; //A set of alive states is maintained to compute all the possible ways to - set anfinals; //alive node finals ? - set escaped_chars; - - State* initial_state = new State(); - initial_state->init(transducer.getInitial()); // getInitial() returns an int - anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end()); - - set final_states = transducer.getFinals(); - for(auto final_state : final_states) { - final_state.init(transducer.getInitial()); //initialize - } - - - /* processing */ - - vector new_states; - alive_states.push_back(*initial_state); - // TODO: insert the other states - // TODO: insert the final state - - int line_number = 0; - bool accepted = true; - while(!feof(input)) // while true - { - //initialize conditions - int tag_count = 0; - State* current_state = initial_state; - bool in_lemma = false; - bool in_take = false; - bool in_out = false; - - while (alive_states.size() > 1 and !isFinal(current_state)) { - //get the next token - int val = fgetwc(input); // read 1 wide char - bool is_tag = false; - if(val == L'<') // if in tag, get the whole tag - { - in_lemma = false; - is_tag = true; - wstring tag = L""; - tag = readFullBlock(input, L'<', L'>'); - val = static_cast(alphabet(tag)); - - tag_count++; - - cout << "val before: " << val << endl; - cout << "tag_count: " << tag_count << endl; - - if(val == 0 && tag_count > 2) //TODO: val==0? - { - val = static_cast(alphabet(L"")); - } - - cout << "val after: " << val << endl; - fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); - - if (tag == '') { - accepted = true; - } - } - else if(in_lemma && !in_take && !in_out) { - val == static_cast(alphabet(L"&")); - } - - // if (current_state == initial_state && not eof) { - //successfully reached eof - //exit() - - if (current_state == initial_state && val != '\n') { - accepted = true; - break; - } else if (val == '\n') { //or sent - accepted = true; - } - - //step into the next state - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) { //step //for every state in alive_states - State s = *it; - - if (tag_count > 2) { - s.step(val, alphabet(L"")); - } else { - s.step(val) - } - - if(s.size() > 0) - { - new_states.push_back(s); - } - wcout << (wchar_t) val << L" " << L"size: " << s.size() << L" final: " << s.isFinal(anfinals) << endl; - } - - alive_states.swap(new_states); - } - return 0; - } diff --git a/src/transducer.py b/src/transducer.py deleted file mode 100644 index 77fc1b4..0000000 --- a/src/transducer.py +++ /dev/null @@ -1,189 +0,0 @@ -#usage: python transducer.py testfile.txt - -import sys - -transitions = { - (-1,'^') : 0, - (0,'t') : 1, - (1,'a') : 2, - (2,'k') : 3, - (3,'e') : 4, - (4,'') : 5, - (5,'') : 6, - (6,'') : 7, - (6,'$') : 8, - (7,'') : 7, - (7,'$'): 8, - (8,' ') : 9, - (9,'^') : 10, - (10,'&') : 11, - (11,'&') : 11, - (11,'') : 12, - (11,'') : 13, - (11,'') : 14, - (11,'') : 15, - (11,''): 16, - (12,'') : 200, - (200,'') : 201, - (200,'$') : 17, - (201,'') : 201, - (201,'$') : 17, - (13,'') : 225, - (13,'$') : 250, - (225,'') : 225, - (225,'$') : 250, - (250,' '):251, - (251,'^'):252, - (252,'&'):253, - (253,'&'):253, - (253,''):12, - (253,''):13, - (14,'') : 275, - (275,'') : 276, - (275,'$') : 250, - (276,'') : 276, - (276,'$') : 250, - (15,'') : 200, - (16,''): 200, - (100,'') : 100, - (100,'$') : 17, - (17,' ') : 18, #do not go to state 17 unless you are expecting 'out' to be the next word - (18,'^') : 19, - (19,'o') : 20, - (20,'u') : 21, - (21,'t') : 22, - (22,'') : 23, - (22,'') : 24, - (23,'$') : 25, - (24,'$') : 25, - (25,'') : 26, - (25,' ') : 26, - (25,'\n') : 26, - (25,'^') : 27, - (27,'.') : 28, - (28,'') : 29, - (29,'$') : 25 -} - -# is required -# is optional -states = { - -1 : '', - 0 : '^', - 1 : 't', - 2 : 'a', - 3 : 'k', - 4 : 'e', - 5 : '', - 6 : '', #secondary tag is necessary - 7 : '', #third, fourth, fifth...tags are optional - 8 : '$', - 9 : ' ', - 10 : '^', - 11 : '&', #represents any character 'ANY_CHAR - 12 : '', - 13 : '', - 14 : '', - 15 : '', - 16 : '', - 100: '', - 200: '', - 201: '', - 225: '', - 250: '$', - 251: ' ', - 252: '^', - 253: '&', - 275: '', - 276: '', - 17 : '$', - 18 : ' ', - 19 : '^', - 20 : 'o', - 21 : 'u', - 22 : 't', - 23 : '', - 24 : '', - 25 : '$', - 26 : '\n', - 27 : '^', - 28 : '.', - 29 : '', - -} - -def next_token(file, subsequent_tag, in_lemma, in_take, in_out): - original_token = file.read(1) - modified_token = original_token - if original_token == '<': #if in tag - in_lemma = False - c = '' - while c != '>': - c = file.read(1) - original_token += c - modified_token += c - if subsequent_tag: - modified_token = '' - if in_lemma and not in_take and not in_out: - modified_token = '&' #ANY_CHAR - return original_token, modified_token - -def step(state, token): #token is at the next state - next_state = transitions.get((state,token)) - output_token = states.get(next_state) - return next_state, output_token #return the next state, or None if it doesn't exist - -def main(): - f = open(sys.argv[1]) - line_number = 0 - accepted = True - while True: - line = '' - if accepted: - line_number += 1 - current_state = -1 - - subsequent_tag = False - in_lemma = False - in_take = False - in_out = False - - while states.get(current_state) != None and current_state != 26: - original_token, modified_token = next_token(f, subsequent_tag, in_lemma, in_take, in_out) - if current_state == -1 and modified_token == '': - print('successfully reached end of file') - exit(0) - elif current_state == -1 and modified_token == '\n': - accepted = True - break - elif modified_token == '\n': - accepted = True - - current_state, output_token = step(current_state, modified_token) - if output_token == None: - break - - line += original_token - - subsequent_tag = current_state in [5, 6, 7, 12, 13, 14, 15, 16, 100, 200, 201, 225, 275, 276] - in_lemma = current_state in [1, 2, 3, 10, 11, 252, 253, 19, 20, 21, 22] - in_take = current_state in [1, 2, 3, 4] - if current_state == 19: - pos = f.tell() #store the current buffer position - peek = f.read(4) #read in the next 4 chars - f.seek(pos) #return to the original position - if peek == 'out<': - in_out = True - - if current_state == 26: - print str(line_number) + ' ' + line - accepted = True - else: - if accepted: - print str(line_number) + ' string not accepted \n' - accepted = False - current_state = -1 - line_number += 1 - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/src/transducer2.cc b/src/transducer2.cc deleted file mode 100644 index 7042095..0000000 --- a/src/transducer2.cc +++ /dev/null @@ -1,196 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -int main (int argc, char** argv) { - Alphabet alphabet; - - LtLocale::tryToSetLocale(); - - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L""); - alphabet.includeSymbol(L"<$>"); - - int vblex_sym = alphabet(L""); - int n_sym = alphabet(L""); - int adj_sym = alphabet(L""); - int det_sym = alphabet(L""); - int prn_sym = alphabet(L""); - int np_sym = alphabet(L""); - - int any_tag = alphabet(L""); - int any_char = alphabet(L""); - int wb_sym = alphabet(L"<$>"); - - /* reap from input file */ - for (string line; getline(cin, line);) { - Transducer t; - string first_token = line.substr(0, line.find(' ')); - string second_token = line.substr(line.find(' ') + 1); - - /* noun phrase acceptor: see README */ - - int initial = t.getInitial(); - int take_out = initial; - for (wchar_t c : first_token) { - take_out = t.insertSingleTransduction(alphabet(c,c), take_out); - } - take_out = t.insertSingleTransduction(alphabet(0,L'#'), take_out); - take_out = t.insertSingleTransduction(alphabet(0,L' '), take_out); - for (wchar_t c : second_token) { - take_out = t.insertSingleTransduction(alphabet(0,c), take_out); - } - take_out = t.insertSingleTransduction(alphabet(vblex_sym,vblex_sym), take_out); - int loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - int after_takeout = take_out; - - /* no det */ - int from_nodet = after_takeout; - - /* first lemma */ - loop = after_takeout; - take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop); - t.linkStates(take_out, loop, 0); - - int first_lm = take_out; - - /* prn */ - take_out = t.insertSingleTransduction(alphabet(prn_sym,prn_sym), first_lm); - - loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - int after_prn = take_out; - - /* np */ - take_out = t.insertSingleTransduction(alphabet(np_sym,np_sym), first_lm); - - loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - int after_np = take_out; - - /* det */ - take_out = t.insertSingleTransduction(alphabet(det_sym,det_sym), first_lm); - - loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - int after_det = take_out; - - /* no adj */ - int from_noadj = take_out; //same as after_det - - /* lemma for the adj */ - loop = after_det; - take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop); - t.linkStates(take_out, loop, 0); - - int lm_adj = take_out; - - /* adj */ - take_out = t.insertSingleTransduction(alphabet(adj_sym,adj_sym), lm_adj); - - int optional_adj = take_out; - - loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - - //may not have a second tag - t.linkStates(optional_adj, take_out, 0); - - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - int after_adj = take_out; - - /* lemma for the noun */ - loop = after_adj; - take_out = t.insertSingleTransduction(alphabet(any_char,any_char), loop); - t.linkStates(take_out, loop, 0); - - int lm_noun = take_out; - - /* possible subsequent adj */ - t.linkStates(lm_noun, lm_adj, alphabet(adj_sym,adj_sym)); - - /* n */ - take_out = t.insertSingleTransduction(alphabet(n_sym,n_sym), lm_noun); - - loop = take_out; - take_out = t.insertSingleTransduction(alphabet(any_tag,any_tag), loop); - t.linkStates(take_out, loop, 0); - - take_out = t.insertSingleTransduction(alphabet(wb_sym,wb_sym), take_out); - - /* out */ - int before_out = take_out; - - for (wchar_t c : second_token) { - take_out = t.insertSingleTransduction(alphabet(c,0), take_out); - } - take_out = t.insertSingleTransduction(alphabet(any_tag, 0), take_out); - take_out = t.insertSingleTransduction(alphabet(wb_sym,0), take_out); - - t.setFinal(take_out); - - /* final link states */ - t.linkStates(after_takeout, before_out, 0); - t.linkStates(after_prn, before_out, 0); - t.linkStates(after_np, before_out, 0); - t.linkStates(from_nodet, after_det, 0); - t.linkStates(from_noadj, after_adj, 0); - - string filename = regex_replace(line,std::regex("\\s+"), "") + ".fst"; - FILE* fst = fopen(filename.c_str(), "w+"); - // First write the letter symbols of the alphabet - Compression::wstring_write(L"abcdefghijklmnopqrstuvwxyz", fst); - // Then write the multicharacter symbols - alphabet.write(fst); - // Then write then number of transducers - Compression::multibyte_write(1, fst); - // Then write the name of the transducer - Compression::wstring_write(L"main@standard", fst); - // Then write the transducer - t.write(fst); - cout << line << " t.size(): " << t.size() << endl ; - fclose(fst); - } - - return 0; -} \ No newline at end of file