Index: branches/apertium-separable/src/lsx_processor.cc =================================================================== --- branches/apertium-separable/src/lsx_processor.cc (revision 80740) +++ branches/apertium-separable/src/lsx_processor.cc (revision 80741) @@ -3,9 +3,9 @@ #include #include #include +// #include #include #include -#include #include #include @@ -15,6 +15,7 @@ #include #include +/* get the text between delim1 and delim2 */ wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); wstring @@ -21,8 +22,8 @@ readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) { wstring result = L""; + result += delim1; wchar_t c = delim1; - result += c; while(!feof(input) && c != delim2) { @@ -29,6 +30,7 @@ c = static_cast(fgetwc(input)); result += c; } + return result; } @@ -36,18 +38,19 @@ { if(argc != 2) { - wcout << L"./lsx-proc " << endl; + wcout << L"./lsx-comp " << endl; exit(0); } Alphabet alphabet; TransExe transducer; + LtLocale::tryToSetLocale(); - FILE* fst = fopen(argv[1], "r"); + FILE *fst = fopen(argv[1], "r"); if(!fst) { - wcerr << "Error: Cannot open file '" << argv[2] << "'." << endl; + wcerr << "Error: Cannot open file '" << fst << "'." << endl; exit(EXIT_FAILURE); } @@ -60,11 +63,11 @@ } alphabet.read(fst); - wcerr << L"alphabet_size: " << alphabet.size() << endl; //NOTE + wcerr << L"alphabet_size: " << alphabet.size() << endl; len = Compression::multibyte_read(fst); len = Compression::multibyte_read(fst); - + wcerr << L"len: " << len << endl; wstring name = L""; while(len > 0) { @@ -71,7 +74,7 @@ name += static_cast(Compression::multibyte_read(fst)); len--; } - wcerr << name << endl; //NOTE + wcerr << name << endl << endl; transducer.read(fst, alphabet); @@ -79,7 +82,6 @@ FILE *output = stdout; set anfinals; - vector new_states, alive_states; set escaped_chars; escaped_chars.insert(L'['); @@ -98,20 +100,55 @@ initial_state = new State(); initial_state->init(transducer.getInitial()); anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end()); + + vector new_states; + vector alive_states; + alive_states.push_back(*initial_state); bool outOfWord = true; bool isEscaped = false; - int tagCount = 0; - - int val = 0; while(!feof(input)) { - val = fgetwc(input); // read 1 wide char + int val = fgetwc(input); // read 1 wide char + // wcout << L"| " << (wchar_t)val << L" | val: " << val << L" || as.size(): " << alive_states.size() << L" || out of word: " << outOfWord << endl; - if(val == L'<') // tag + + if((val == L'^' && !isEscaped && outOfWord) /*|| val == L' '*/) { + outOfWord = false; + // wcout << "| continue " << (wchar_t)val << endl; + continue; + } + + if((feof(input) || val == L'$') && !isEscaped && !outOfWord) + { + new_states.clear(); + for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) + { + State s = *it; + s.step(alphabet(L"<$>")); + if(s.size() > 0) + { + new_states.push_back(s); + } + + if(s.isFinal(anfinals)) + { + wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); + // cout << "FINAL: " << /*out <<*/ endl; + new_states.push_back(*initial_state); + } + } + alive_states.swap(new_states); + + outOfWord = true; + continue; + } + + if(val == L'<' && !outOfWord) // tag + { wstring tag = L""; tag = readFullBlock(input, L'<', L'>'); if(!alphabet.isSymbolDefined(tag)) @@ -119,148 +156,50 @@ alphabet.includeSymbol(tag); } val = static_cast(alphabet(tag)); - tagCount++; - } - new_states.clear(); - if(val == L'^') { - outOfWord = false; - continue; + // fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); } - wcout << "val: " << val << " " << (char) val << " alive_states size: " << alive_states.size() << " tagCount: " << tagCount << " isTag: " << alphabet.isTag(val) << endl; + if(!outOfWord) + { + new_states.clear(); + wstring res = L""; for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) { + res = L""; State s = *it; - - if(val == L'$') { - s.step(alphabet(L"<$>")); - wcout << " wb" << endl; - tagCount = 0; - } - else if(alphabet.isTag(val) && tagCount <= 1) { - wcout << " first tag" << endl; - // if ( alphabet(L"") == val) { wcout << "equal" ;} else { wcout << "not equal" ;} - // wcout << "vblex defined? " << alphabet.isSymbolDefined(L"") << endl; - //s.step(val); + if(val < 0) + { s.step_override(val, alphabet(L""), val); - // s.step(-18); - // s.step(alphabet(L"")); - // s.step_override(val, alphabet(L""), val); - - } else if(/*alphabet.isTag(val) &&*/ tagCount > 1) { - wcout << " second tag" << endl; - s.step_override(val, alphabet(L""), val); } else if(val > 0) { - s.step_override(val, alphabet(L""), val); - //s.step(val); - wcout << " original char: " << val << endl; + s.step_override(val, alphabet(L""), val); // deal with cases! } - else { - wcout << "error?" << endl; - } - if(s.size() > 0) // alive if the vector isn't empty + if(s.size() > 0) { - wcout << "pushing new states" << endl; new_states.push_back(s); } - + // wcout << L"| | " << /*(wchar_t) val << */L"val " << L"size: " << s.size() << L" final: " << s.isFinal(anfinals) << endl; + // wcerr << L"| | cur: " << s.getReadableString(alphabet) << endl; if(s.isFinal(anfinals)) { + // cout << "finals size: " << s.size() << endl; wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); - wcerr << "FINAL: " << out << endl; + wcout << out << endl; //FIXME + // wcerr << s.getReadableString(alphabet) << endl; new_states.push_back(*initial_state); + // const wchar_t* ws = out; + // fputws(out, output); } - wcerr << L"| | cur: " << s.getReadableString(alphabet) << endl; } - // wcout << "new-states size: " << new_states.size() << endl; alive_states.swap(new_states); + } - - // - // if(val == L'$' && !isEscaped /*&& outOfWord*/) - // { - // wcout << "val: " << val << " " << (char) val << endl; - // outOfWord = false; - // continue; - // } - // - // if((feof(input) || val == L'$') && !isEscaped /*&& !outOfWord*/) - // { - // wcout << "val: " << val << " " << (char) val << endl; - // new_states.clear(); - // for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - // { - // State s = *it; - // s.step(alphabet(L"<$>")); - // wcout << "alive_states size: " << alive_states.size() << endl; - // if(s.size() > 0) - // { - // new_states.push_back(s); - // } - // - // if(s.isFinal(anfinals)) - // { - // wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); - // wcerr << "FINAL: " << out << endl; - // new_states.push_back(*initial_state); - // } - // } - // alive_states.swap(new_states); - // - // outOfWord = true; - // continue; - // } - // - // if(val == L'<' /*&& !outOfWord*/) // tag - // { - // wstring tag = L""; - // tag = readFullBlock(input, L'<', L'>'); - // if(!alphabet.isSymbolDefined(tag)) - // { - // alphabet.includeSymbol(tag); - // } - // val = static_cast(alphabet(tag)); - // wcout << "val: " << val << " " << (char) val << endl; - // - // // fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); - // } - // - // if(!outOfWord) - // { - // wcout << "val: " << val << " " << (char) val << endl; - // new_states.clear(); - // wstring res = L""; - // for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) - // { - // res = L""; - // State s = *it; - // if(val < 0) - // { - // s.step_override(val, alphabet(L""), val); - // } - // else if(val > 0) - // { - // s.step_override(val, alphabet(L""), val); // deal with cases! - // } - // if(s.size() > 0) - // { - // new_states.push_back(s); - // } - // // wcerr << L"| | " << (wchar_t) val << L" " << L"size: " << s.size() << L" final: " << s.isFinal(anfinals) << endl; - // // wcerr << L"| | cur: " << s.getReadableString(alphabet) << endl; - // - // } - // alive_states.swap(new_states); - // } - // - // if(outOfWord) - // { - // continue; - // } + if(outOfWord) + { + continue; } - + } return 0; -} +} \ No newline at end of file