Index: branches/apertium-separable/src/compiler.cc =================================================================== --- branches/apertium-separable/src/compiler.cc (revision 80609) +++ branches/apertium-separable/src/compiler.cc (revision 80610) @@ -262,7 +262,7 @@ { t.linkStates(nuevo_estado, estado, 0); } - + if(acx_map_ptr != acx_map.end()) { for(set::iterator it = acx_map_ptr->second.begin(); @@ -307,7 +307,6 @@ void Compiler::readString(list &result, wstring const &name) { - wcerr << name << endl; if(name == L"#text") { wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); @@ -353,7 +352,6 @@ result.push_back(alphabet(symbol)); } else if(name == COMPILER_ANYTAG_ELEM) { - wcerr << L"" << endl; result.push_back(alphabet(L"")); } else if(name == COMPILER_ANYCHAR_ELEM) { @@ -362,7 +360,6 @@ else if(name == COMPILER_WB_ELEM) { requireEmptyError(name); result.push_back(alphabet(L"<$>")); - } else @@ -795,8 +792,6 @@ } } -void Compiler::procWb() {} //TODO - void Compiler::procNodeACX() { Index: branches/apertium-separable/src/lsx_compiler.cc =================================================================== --- branches/apertium-separable/src/lsx_compiler.cc (revision 80609) +++ branches/apertium-separable/src/lsx_compiler.cc (revision 80610) @@ -7,6 +7,9 @@ #include #include +#include +#include + #include #include #include @@ -13,32 +16,196 @@ #include #include #include +#include -#include -#include +wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2); -using namespace std; - -int main (int argc, char** argv) +wstring +readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2) { - Alphabet alphabet; - Transducer t; + wstring result = L""; + result += delim1; + wchar_t c = delim1; - LtLocale::tryToSetLocale(); + while(!feof(input) && c != delim2) + { + c = static_cast(fgetwc(input)); + result += c; + } - if(argc < 3) + return result; +} + +int main (int argc, char** argv) +{ + if(argc != 3) { wcout << L"lsx-comp " << endl; exit(0); } + /* compile */ + Compiler c; c.parse(argv[1], L"lr"); FILE* fst = fopen(argv[2], "w+"); + if(!fst) + { + wcerr << "Error: Cannot open file '" << fst << "'." << endl; + exit(EXIT_FAILURE); + } c.write(fst); - fclose(fst); + /* process */ + + Alphabet alphabet; + TransExe transducer; + + LtLocale::tryToSetLocale(); + + fst = fopen(argv[2], "r"); + + set alphabetic_chars; + int len = Compression::multibyte_read(fst); + while(len > 0) + { + alphabetic_chars.insert(static_cast(Compression::multibyte_read(fst))); + len--; + } + + alphabet.read(fst); + wcerr << L"alphabet_size: " << alphabet.size() << endl; + + len = Compression::multibyte_read(fst); + len = Compression::multibyte_read(fst); + + wcerr << len << endl; + wstring name = L""; + while(len > 0) + { + name += static_cast(Compression::multibyte_read(fst)); + len--; + } + wcerr << name << endl; + + transducer.read(fst, alphabet); + + FILE *input = stdin; + FILE *output = stdout; + + set anfinals; + set escaped_chars; + + escaped_chars.insert(L'['); + escaped_chars.insert(L']'); + escaped_chars.insert(L'{'); + escaped_chars.insert(L'}'); + escaped_chars.insert(L'^'); + escaped_chars.insert(L'$'); + escaped_chars.insert(L'/'); + escaped_chars.insert(L'\\'); + escaped_chars.insert(L'@'); + escaped_chars.insert(L'<'); + escaped_chars.insert(L'>'); + + State *initial_state; + initial_state = new State(); + initial_state->init(transducer.getInitial()); + anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end()); + + + vector new_states; + vector alive_states; + + alive_states.push_back(*initial_state); + + bool outOfWord = true; + bool isEscaped = false; + + while(!feof(input)) + { + int val = fgetwc(input); // read 1 wide char + + wcerr << L"| " << (wchar_t)val << L" | val: " << val << L" || s.size(): " << alive_states.size() << L" || " << outOfWord << endl; + + if(/*val == L'^' && */ !isEscaped && outOfWord) + { + outOfWord = false; + continue; + } + + if((feof(input) || val == L'$') && !isEscaped && !outOfWord) + { + new_states.clear(); + for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) + { + State s = *it; + s.step(alphabet(L"<$>")); + if(s.size() > 0) + { + new_states.push_back(s); + } + + if(s.isFinal(anfinals)) + { + wstring out = s.filterFinals(anfinals, alphabet, escaped_chars); + wcerr << "FINAL: " << out << endl; + new_states.push_back(*initial_state); + } + } + alive_states.swap(new_states); + + outOfWord = true; + continue; + } + + if(val == L'<' && !outOfWord) // if in tag, get the whole tag and modify if necessary + { + wstring tag = L""; + tag = readFullBlock(input, L'<', L'>'); + if(!alphabet.isSymbolDefined(tag)) + { + alphabet.includeSymbol(tag); + } + val = static_cast(alphabet(tag)); + + fwprintf(stderr, L"tag %S: %d\n", tag.c_str(), val); + } + + if(!outOfWord) + { + new_states.clear(); + wstring res = L""; + for(vector::const_iterator it = alive_states.begin(); it != alive_states.end(); it++) + { + res = L""; + State s = *it; + if(val < 0) + { + s.step_override(val, alphabet(L""), val); + } + else if(val > 0) + { + s.step_override(val, alphabet(L""), val); // deal with cases! + } + if(s.size() > 0) + { + new_states.push_back(s); + } + wcerr << L"| | " << (wchar_t) val << L" " << L"size: " << s.size() << L" final: " << s.isFinal(anfinals) << endl; + wcerr << L"| | cur: " << s.getReadableString(alphabet) << endl; + } + alive_states.swap(new_states); + } + + if(outOfWord) + { + continue; + } + + } + return 0; }