Index: branches/apertium-separable/src/lsx_processor.cc =================================================================== --- branches/apertium-separable/src/lsx_processor.cc (revision 80648) +++ branches/apertium-separable/src/lsx_processor.cc (revision 80649) @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -20,8 +21,8 @@ readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) { wstring result = L""; - result += delim1; wchar_t c = delim1; + result += c; while(!feof(input) && c != delim2) { @@ -28,24 +29,28 @@ c = static_cast(fgetwc(input)); result += c; } - return result; } int main (int argc, char** argv) { + if(argc != 2) + { + wcout << L"./lsx-proc " << endl; + exit(0); + } + Alphabet alphabet; TransExe transducer; - LtLocale::tryToSetLocale(); - if (argc != 2) { - cout << "incorrect usage: needs one input file" << endl; - exit(1); + FILE* fst = fopen(argv[1], "r"); + if(!fst) + { + wcerr << "Error: Cannot open file '" << argv[2] << "'." << endl; + exit(EXIT_FAILURE); } - FILE *fst = fopen(argv[1], "r"); - set alphabetic_chars; int len = Compression::multibyte_read(fst); while(len > 0) @@ -55,12 +60,11 @@ } alphabet.read(fst); - wcerr << L"alphabet_size: " << alphabet.size() << endl; + wcerr << L"alphabet_size: " << alphabet.size() << endl; //NOTE len = Compression::multibyte_read(fst); + len = Compression::multibyte_read(fst); - len = Compression::multibyte_read(fst); - wcerr << len << endl; wstring name = L""; while(len > 0) { @@ -67,7 +71,7 @@ name += static_cast(Compression::multibyte_read(fst)); len--; } - wcerr << name << endl; + wcerr << name << endl; //NOTE transducer.read(fst, alphabet); @@ -75,6 +79,7 @@ FILE *output = stdout; set anfinals; + vector new_states, alive_states; set escaped_chars; escaped_chars.insert(L'['); @@ -93,55 +98,20 @@ initial_state = new State(); initial_state->init(transducer.getInitial()); anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end()); - - - vector new_states; - vector alive_states; - alive_states.push_back(*initial_state); bool outOfWord = true; bool isEscaped = false; + int tagCount = 0; + + int val = 0; while(!feof(input)) { - int val = fgetwc(input); // read 1 wide char + val = fgetwc(input); // read 1 wide char - wcerr << L"| " << (wchar_t)val << L" | val: " << val << L" || s.size(): " << alive_states.size() << L" || " << outOfWord << endl; - - if(val == L'^' && !isEscaped && outOfWord) + if(val == L'<') // tag { - outOfWord = false; - continue; - } - - if((feof(input) || val == L'$') && !isEscaped && !outOfWord) - { - new_states.clear(); - for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - { - State s = *it; - s.step(alphabet(L"<$>")); - if(s.size() > 0) - { - new_states.push_back(s); - } - - if(s.isFinal(anfinals)) - { - wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); - wcerr << "FINAL: " << out << endl; - new_states.push_back(*initial_state); - } - } - alive_states.swap(new_states); - - outOfWord = true; - continue; - } - - if(val == L'<' && !outOfWord) // if in tag, get the whole tag and modify if necessary - { wstring tag = L""; tag = readFullBlock(input, L'<', L'>'); if(!alphabet.isSymbolDefined(tag)) @@ -149,41 +119,136 @@ alphabet.includeSymbol(tag); } val = static_cast(alphabet(tag)); - - fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); + tagCount++; } + new_states.clear(); - if(!outOfWord) - { - new_states.clear(); - wstring res = L""; + cout << "val: " << val << " " << (char) val << " alive_states size: " << alive_states.size() << " tagCount: " << tagCount << endl; for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) { - res = L""; State s = *it; - if(val < 0) - { + + if(val == L'$') { + s.step(alphabet(L"<$>")); + cout << "wb" << endl; + tagCount = 0; + } + else if(alphabet.isTag(val) && tagCount <= 1) { + cout << "first tag" << endl; + // cout << "vblex defined? " << alphabet.isSymbolDefined(L"") << endl; + // s.step(alphabet(L"")); + s.step_override(val, alphabet(L""), val); + + } else if(alphabet.isTag(val) && tagCount > 1) { + cout << "second tag" << endl; s.step_override(val, alphabet(L""), val); } else if(val > 0) { - s.step_override(val, alphabet(L""), val); // deal with cases! + // s.step_override(val, alphabet(L""), val); + s.step(val); + cout << "original char" << endl; } - if(s.size() > 0) + else { + cout << "error?" << endl; + } + if(s.size() > 0) // alive if the vector isn't empty { new_states.push_back(s); } - wcerr << L"| | " << (wchar_t) val << L" " << L"size: " << s.size() << L" final: " << s.isFinal(anfinals) << endl; - wcerr << L"| | cur: " << s.getReadableString(alphabet) << endl; - } - alive_states.swap(new_states); - } - if(outOfWord) + if(s.isFinal(anfinals)) { - continue; + wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); + wcerr << "FINAL: " << out << endl; + new_states.push_back(*initial_state); } + } + // cout << "new-states size: " << new_states.size() << endl; + alive_states.swap(new_states); + + // + // if(val == L'$' && !isEscaped /*&& outOfWord*/) + // { + // cout << "val: " << val << " " << (char) val << endl; + // outOfWord = false; + // continue; + // } + // + // if((feof(input) || val == L'$') && !isEscaped /*&& !outOfWord*/) + // { + // cout << "val: " << val << " " << (char) val << endl; + // new_states.clear(); + // for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) + // { + // State s = *it; + // s.step(alphabet(L"<$>")); + // cout << "alive_states size: " << alive_states.size() << endl; + // if(s.size() > 0) + // { + // new_states.push_back(s); + // } + // + // if(s.isFinal(anfinals)) + // { + // wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); + // wcerr << "FINAL: " << out << endl; + // new_states.push_back(*initial_state); + // } + // } + // alive_states.swap(new_states); + // + // outOfWord = true; + // continue; + // } + // + // if(val == L'<' /*&& !outOfWord*/) // tag + // { + // wstring tag = L""; + // tag = readFullBlock(input, L'<', L'>'); + // if(!alphabet.isSymbolDefined(tag)) + // { + // alphabet.includeSymbol(tag); + // } + // val = static_cast(alphabet(tag)); + // cout << "val: " << val << " " << (char) val << endl; + // + // // fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); + // } + // + // if(!outOfWord) + // { + // cout << "val: " << val << " " << (char) val << endl; + // new_states.clear(); + // wstring res = L""; + // for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) + // { + // res = L""; + // State s = *it; + // if(val < 0) + // { + // s.step_override(val, alphabet(L""), val); + // } + // else if(val > 0) + // { + // s.step_override(val, alphabet(L""), val); // deal with cases! + // } + // if(s.size() > 0) + // { + // new_states.push_back(s); + // } + // // wcerr << L"| | " << (wchar_t) val << L" " << L"size: " << s.size() << L" final: " << s.isFinal(anfinals) << endl; + // // wcerr << L"| | cur: " << s.getReadableString(alphabet) << endl; + // + // } + // alive_states.swap(new_states); + // } + // + // if(outOfWord) + // { + // continue; + // } } return 0;