Index: branches/apertium-separable/examples/new-example.dix =================================================================== --- branches/apertium-separable/examples/new-example.dix (revision 80468) +++ branches/apertium-separable/examples/new-example.dix (revision 80471) @@ -4,16 +4,19 @@ + + - + + -

always

-

never

-

often

-

sometimes

-

rarely

+

always

+

never

+

often

+

sometimes

+

rarely

@@ -32,7 +35,7 @@

-

+

-->

@@ -149,8 +152,7 @@

about

- - +

setsetup

up

@@ -205,30 +207,61 @@

way

- - - bring together - carry out - check in - check out - give back - line up - note down - pull out - send back - set about - set up - shut down - slow down - stir up - take for granted - take off - trace back - - --> - @@ -254,4 +287,4 @@ - --> + Index: branches/apertium-separable/src/compiler_copy.cc =================================================================== --- branches/apertium-separable/src/compiler_copy.cc (nonexistent) +++ branches/apertium-separable/src/compiler_copy.cc (revision 80471) @@ -0,0 +1,975 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + +wstring const Compiler::COMPILER_DICTIONARY_ELEM = L"dictionary"; +wstring const Compiler::COMPILER_ALPHABET_ELEM = L"alphabet"; +wstring const Compiler::COMPILER_SDEFS_ELEM = L"sdefs"; +wstring const Compiler::COMPILER_SDEF_ELEM = L"sdef"; +wstring const Compiler::COMPILER_N_ATTR = L"n"; +wstring const Compiler::COMPILER_PARDEFS_ELEM = L"pardefs"; +wstring const Compiler::COMPILER_PARDEF_ELEM = L"pardef"; +wstring const Compiler::COMPILER_PAR_ELEM = L"par"; +wstring const Compiler::COMPILER_ENTRY_ELEM = L"e"; +wstring const Compiler::COMPILER_RESTRICTION_ATTR = L"r"; +wstring const Compiler::COMPILER_RESTRICTION_LR_VAL = L"LR"; +wstring const Compiler::COMPILER_RESTRICTION_RL_VAL = L"RL"; +wstring const Compiler::COMPILER_PAIR_ELEM = L"p"; +wstring const Compiler::COMPILER_LEFT_ELEM = L"l"; +wstring const Compiler::COMPILER_RIGHT_ELEM = L"r"; +wstring const Compiler::COMPILER_S_ELEM = L"s"; +wstring const Compiler::COMPILER_REGEXP_ELEM = L"re"; +wstring const Compiler::COMPILER_SECTION_ELEM = L"section"; +wstring const Compiler::COMPILER_ID_ATTR = L"id"; +wstring const Compiler::COMPILER_TYPE_ATTR = L"type"; +wstring const Compiler::COMPILER_IDENTITY_ELEM = L"i"; +wstring const Compiler::COMPILER_JOIN_ELEM = L"j"; +wstring const Compiler::COMPILER_BLANK_ELEM = L"b"; +wstring const Compiler::COMPILER_POSTGENERATOR_ELEM = L"a"; +wstring const Compiler::COMPILER_GROUP_ELEM = L"g"; +wstring const Compiler::COMPILER_LEMMA_ATTR = L"lm"; +wstring const Compiler::COMPILER_IGNORE_ATTR = L"i"; +wstring const Compiler::COMPILER_IGNORE_YES_VAL = L"yes"; +wstring const Compiler::COMPILER_ALT_ATTR = L"alt"; +wstring const Compiler::COMPILER_V_ATTR = L"v"; +wstring const Compiler::COMPILER_VL_ATTR = L"vl"; +wstring const Compiler::COMPILER_VR_ATTR = L"vr"; + +wstring const Compiler::COMPILER_ANYTAG_ELEM = L"t"; +wstring const Compiler::COMPILER_ANYCHAR_ELEM = L"w"; +wstring const Compiler::COMPILER_WB_ELEM = L"j"; + +Compiler::Compiler() : +reader(0), +verbose(false), +first_element(false), +acx_current_char(0) +{ +} + +Compiler::~Compiler() +{ +} + +void +Compiler::parseACX(string const &fichero, wstring const &dir) +{ + if(dir == COMPILER_RESTRICTION_LR_VAL) + { + reader = xmlReaderForFile(fichero.c_str(), NULL, 0); + if(reader == NULL) + { + wcerr << "Error: cannot open '" << fichero << "'." << endl; + exit(EXIT_FAILURE); + } + int ret = xmlTextReaderRead(reader); + while(ret == 1) + { + procNodeACX(); + ret = xmlTextReaderRead(reader); + } + } +} + +void +Compiler::parse(string const &fichero, wstring const &dir) +{ + direction = dir; + wcout << direction << endl; //NOTE never prints + + reader = xmlReaderForFile(fichero.c_str(), NULL, 0); + if(reader == NULL) + { + wcerr << "Error: Cannot open '" << fichero << "'." << endl; + exit(EXIT_FAILURE); + } + + int ret = xmlTextReaderRead(reader); + + cout << ret << endl; //NOTE never prints + + while(ret == 1) + { + procNode(); + ret = xmlTextReaderRead(reader); + cout << ret << endl; //NOTE never prints + } + + if(ret != 0) + { + wcerr << L"Error: Parse error at the end of input." << endl; + } + + xmlFreeTextReader(reader); + xmlCleanupParser(); + + + // Minimize transducers + for(map::iterator it = sections.begin(), + limit = sections.end(); + it != limit; it++) + { + (it->second).minimize(); + } +} + + +void +Compiler::procAlphabet() +{ + int tipo=xmlTextReaderNodeType(reader); + + if(tipo != XML_READER_TYPE_END_ELEMENT) + { + int ret = xmlTextReaderRead(reader); + if(ret == 1) + { + xmlChar const *valor = xmlTextReaderConstValue(reader); + letters = XMLParseUtil::towstring(valor); + bool espai = true; + for(unsigned int i = 0; i < letters.length(); i++) + { + if(!isspace(letters.at(i))) + { + espai = false; + break; + } + } + if(espai == true) // libxml2 returns '\n' for , should be empty + { + letters = L""; + } + } + else + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Missing alphabet symbols." << endl; + exit(EXIT_FAILURE); + } + } +} + +void +Compiler::procSDef() +{ + alphabet.includeSymbol(L"<"+attrib(COMPILER_N_ATTR)+L">"); +} + +void +Compiler::procParDef() +{ + int tipo=xmlTextReaderNodeType(reader); + + if(tipo != XML_READER_TYPE_END_ELEMENT) + { + current_paradigm = attrib(COMPILER_N_ATTR); + } + else + { + if(!paradigms[current_paradigm].isEmpty()) + { + paradigms[current_paradigm].minimize(); + paradigms[current_paradigm].joinFinals(); + current_paradigm = L""; + } + } +} + +int +Compiler::matchTransduction(list const &pi, + list const &pd, + int estado, Transducer &t) +{ + list::const_iterator izqda, dcha, limizqda, limdcha; + + if(direction == COMPILER_RESTRICTION_LR_VAL) + { + izqda = pi.begin(); + dcha = pd.begin(); + limizqda = pi.end(); + limdcha = pd.end(); + } + else + { + izqda = pd.begin(); + dcha = pi.begin(); + limizqda = pd.end(); + limdcha = pi.end(); + } + + + if(pi.size() == 0 && pd.size() == 0) + { + estado = t.insertNewSingleTransduction(alphabet(0, 0), estado); + } + else + { + map >::iterator acx_map_ptr; + int rsymbol = 0; + + while(true) + { + int etiqueta; + + acx_map_ptr = acx_map.end(); + + if(izqda == limizqda && dcha == limdcha) + { + break; + } + else if(izqda == limizqda) + { + etiqueta = alphabet(0, *dcha); + dcha++; + } + else if(dcha == limdcha) + { + etiqueta = alphabet(*izqda, 0); + acx_map_ptr = acx_map.find(*izqda); + rsymbol = 0; + izqda++; + } + else + { + etiqueta = alphabet(*izqda, *dcha); + acx_map_ptr = acx_map.find(*izqda); + rsymbol = *dcha; + izqda++; + dcha++; + } + + int nuevo_estado = t.insertSingleTransduction(etiqueta, estado); + + if(acx_map_ptr != acx_map.end()) + { + for(set::iterator it = acx_map_ptr->second.begin(); + it != acx_map_ptr->second.end(); it++) + { + t.linkStates(estado, nuevo_estado, alphabet(*it ,rsymbol)); + } + } + estado = nuevo_estado; + } + } + + return estado; +} + + +void +Compiler::requireEmptyError(wstring const &name) +{ + if(!xmlTextReaderIsEmptyElement(reader)) + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl; + exit(EXIT_FAILURE); + } +} + +bool +Compiler::allBlanks() +{ + bool flag = true; + wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + + for(unsigned int i = 0, limit = text.size(); i < limit; i++) + { + flag = flag && iswspace(text[i]); + } + + return flag; +} + +void +Compiler::readString(list &result, wstring const &name) +{ + + wcout << name << endl; //NOTE never prints + + if(name == L"#text") + { + wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + for(unsigned int i = 0, limit = value.size(); i < limit; i++) + { + result.push_back(static_cast(value[i])); + } + } + else if(name == COMPILER_BLANK_ELEM) + { + requireEmptyError(name); + result.push_back(static_cast(L' ')); + } + // else if(name == COMPILER_JOIN_ELEM) + // { + // requireEmptyError(name); + // result.push_back(static_cast(L'+')); + // } + else if(name == COMPILER_POSTGENERATOR_ELEM) + { + requireEmptyError(name); + result.push_back(static_cast(L'~')); + } + else if(name == COMPILER_GROUP_ELEM) + { + int tipo=xmlTextReaderNodeType(reader); + if(tipo != XML_READER_TYPE_END_ELEMENT) + { + result.push_back(static_cast(L'#')); + } + } + else if(name == COMPILER_S_ELEM) + { + requireEmptyError(name); + wstring symbol = L"<" + attrib(COMPILER_N_ATTR) + L">"; + + if(!alphabet.isSymbolDefined(symbol)) + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Undefined symbol '" << symbol << L"'." << endl; + exit(EXIT_FAILURE); + } + + result.push_back(alphabet(symbol)); + } + + /* additions */ + else if(name == COMPILER_ANYTAG_ELEM) { + // wstring symbol = L"<" + name + L">"; + result.push_back(alphabet(L"")); + } + else if(name == COMPILER_ANYCHAR_ELEM) { + result.push_back(alphabet(L"")); + } + else if(name == COMPILER_WB_ELEM) { + requireEmptyError(name); + wstring symbol = L"<" + name + L">"; + result.push_back(alphabet(symbol)); + } + + else + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Invalid specification of element '<" << name; + wcerr << L">' in this context." << endl; + wcerr << L"anytag_elem: " << COMPILER_ANYTAG_ELEM << endl; + exit(EXIT_FAILURE); + } +} + +void +Compiler::skipBlanks(wstring &name) +{ + while(name == L"#text" || name == L"#comment") + { + if(name != L"#comment") + { + if(!allBlanks()) + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Invalid construction." << endl; + exit(EXIT_FAILURE); + } + } + + xmlTextReaderRead(reader); + name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + } +} + +void +Compiler::skip(wstring &name, wstring const &elem) +{ + skip(name, elem, true); +} + +void +Compiler::skip(wstring &name, wstring const &elem, bool open) +{ + xmlTextReaderRead(reader); + name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + wstring slash; + + if(!open) + { + slash = L"/"; + } + + while(name == L"#text" || name == L"#comment") + { + if(name != L"#comment") + { + if(!allBlanks()) + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Invalid construction." << endl; + exit(EXIT_FAILURE); + } + } + xmlTextReaderRead(reader); + name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + } + + if(name != elem) + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Expected '<" << slash << elem << L">'." << endl; + exit(EXIT_FAILURE); + } +} + +EntryToken +Compiler::procIdentity() +{ + list both_sides; + + if(!xmlTextReaderIsEmptyElement(reader)) + { + wstring name = L""; + + while(true) + { + xmlTextReaderRead(reader); + name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + if(name == COMPILER_IDENTITY_ELEM) + { + break; + } + readString(both_sides, name); + } + } + + if(verbose && first_element && (both_sides.front() == (int)L' ')) + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Entry begins with space." << endl; + } + first_element = false; + EntryToken e; + e.setSingleTransduction(both_sides, both_sides); + return e; +} + +EntryToken +Compiler::procTransduction() +{ + list lhs, rhs; + wstring name; + + skip(name, COMPILER_LEFT_ELEM); + + if(!xmlTextReaderIsEmptyElement(reader)) + { + name = L""; + while(true) + { + xmlTextReaderRead(reader); + name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + if(name == COMPILER_LEFT_ELEM) + { + break; + } + readString(lhs, name); + } + } + + if(verbose && first_element && (lhs.front() == (int)L' ')) + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Entry begins with space." << endl; + } + first_element = false; + + skip(name, COMPILER_RIGHT_ELEM); + + if(!xmlTextReaderIsEmptyElement(reader)) + { + name = L""; + while(true) + { + xmlTextReaderRead(reader); + name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + if(name == COMPILER_RIGHT_ELEM) + { + break; + } + readString(rhs, name); + } + } + + skip(name, COMPILER_PAIR_ELEM, false); + + EntryToken e; + e.setSingleTransduction(lhs, rhs); + return e; +} + +wstring +Compiler::attrib(wstring const &name) +{ + return XMLParseUtil::attrib(reader, name); +} + +EntryToken +Compiler::procPar() +{ + EntryToken e; + wstring nomparadigma = attrib(COMPILER_N_ATTR); + first_element = false; + + if(current_paradigm != L"" && nomparadigma == current_paradigm) + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Paradigm refers to itself '" << nomparadigma << L"'." < const &elements) +{ + if(current_paradigm != L"") + { + // compilation of paradigms + Transducer &t = paradigms[current_paradigm]; + int e = t.getInitial(); + + for(unsigned int i = 0, limit = elements.size(); i < limit; i++) + { + if(elements[i].isParadigm()) + { + e = t.insertTransducer(e, paradigms[elements[i].paradigmName()]); + } + else if(elements[i].isSingleTransduction()) + { + e = matchTransduction(elements[i].left(), + elements[i].right(), e, t); + } + else if(elements[i].isRegexp()) + { + RegexpCompiler analyzer; + analyzer.initialize(&alphabet); + analyzer.compile(elements[i].regExp()); + e = t.insertTransducer(e, analyzer.getTransducer(), alphabet(0,0)); + } + else + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Invalid entry token." << endl; + exit(EXIT_FAILURE); + } + } + t.setFinal(e); + } + else + { + // compilaci�n de dictionary + + Transducer &t = sections[current_section]; + int e = t.getInitial(); + + for(unsigned int i = 0, limit = elements.size(); i < limit; i++) + { + if(elements[i].isParadigm()) + { + if(i == elements.size()-1) + { + // paradigma sufijo + if(suffix_paradigms[current_section].find(elements[i].paradigmName()) != suffix_paradigms[current_section].end()) + { + t.linkStates(e, suffix_paradigms[current_section][elements[i].paradigmName()], 0); + e = postsuffix_paradigms[current_section][elements[i].paradigmName()]; + } + else + { + e = t.insertNewSingleTransduction(alphabet(0, 0), e); + suffix_paradigms[current_section][elements[i].paradigmName()] = e; + e = t.insertTransducer(e, paradigms[elements[i].paradigmName()]); + postsuffix_paradigms[current_section][elements[i].paradigmName()] = e; + } + } + else if(i == 0) + { + // paradigma prefijo + if(prefix_paradigms[current_section].find(elements[i].paradigmName()) != prefix_paradigms[current_section].end()) + { + e = prefix_paradigms[current_section][elements[i].paradigmName()]; + } + else + { + e = t.insertTransducer(e, paradigms[elements[i].paradigmName()]); + prefix_paradigms[current_section][elements[i].paradigmName()] = e; + } + } + else + { + // paradigma intermedio + e = t.insertTransducer(e, paradigms[elements[i].paradigmName()]); + } + } + else if(elements[i].isRegexp()) + { + RegexpCompiler analyzer; + analyzer.initialize(&alphabet); + analyzer.compile(elements[i].regExp()); + e = t.insertTransducer(e, analyzer.getTransducer(), alphabet(0,0)); + } + else + { + e = matchTransduction(elements[i].left(), elements[i].right(), e, t); + } + } + t.setFinal(e); + } +} + + +void +Compiler::requireAttribute(wstring const &value, wstring const &attrname, + wstring const &elemname) +{ + if(value == L"") + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): '<" << elemname; + wcerr << L"' element must specify non-void '"; + wcerr << attrname << L"' attribute." << endl; + exit(EXIT_FAILURE); + } +} + + +void +Compiler::procSection() +{ + int tipo=xmlTextReaderNodeType(reader); + + if(tipo != XML_READER_TYPE_END_ELEMENT) + { + wstring const &id = attrib(COMPILER_ID_ATTR); + wstring const &type = attrib(COMPILER_TYPE_ATTR); + requireAttribute(id, COMPILER_ID_ATTR, COMPILER_SECTION_ELEM); + requireAttribute(type, COMPILER_TYPE_ATTR, COMPILER_SECTION_ELEM); + + current_section = id; + current_section += L"@"; + current_section.append(type); + } + else + { + current_section = L""; + } +} + +void +Compiler::procEntry() +{ + wstring atributo=this->attrib(COMPILER_RESTRICTION_ATTR); + wstring ignore = this->attrib(COMPILER_IGNORE_ATTR); + wstring altval = this->attrib(COMPILER_ALT_ATTR); + wstring varval = this->attrib(COMPILER_V_ATTR); + wstring varl = this->attrib(COMPILER_VL_ATTR); + wstring varr = this->attrib(COMPILER_VR_ATTR); + + //�if entry is masked by a restriction of direction or an ignore mark + if((atributo != L"" && atributo != direction) + || ignore == COMPILER_IGNORE_YES_VAL + || (altval != L"" && altval != alt) + || (direction == COMPILER_RESTRICTION_RL_VAL && varval != L"" && varval != variant) + || (direction == COMPILER_RESTRICTION_RL_VAL && varl != L"" && varl != variant_left) + || (direction == COMPILER_RESTRICTION_LR_VAL && varr != L"" && varr != variant_right)) + { + // parse to the end of the entry + wstring name = L""; + + while(name != COMPILER_ENTRY_ELEM) + { + xmlTextReaderRead(reader); + name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + } + + return; + } + + vector elements; + + while(true) + { + int ret = xmlTextReaderRead(reader); + if(ret != 1) + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Parse error." << endl; + exit(EXIT_FAILURE); + } + wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + skipBlanks(name); + + if(current_paradigm == L"" && verbose) + { + first_element = true; + } + + int tipo = xmlTextReaderNodeType(reader); + if(name == COMPILER_PAIR_ELEM) + { + elements.push_back(procTransduction()); + } + else if(name == COMPILER_IDENTITY_ELEM) + { + elements.push_back(procIdentity()); + } + else if(name == COMPILER_REGEXP_ELEM) + { + elements.push_back(procRegexp()); + } + else if(name == COMPILER_PAR_ELEM) + { + elements.push_back(procPar()); + + // detecci�n del uso de paradigmas no definidos + + wstring const &p = elements.rbegin()->paradigmName(); + + if(paradigms.find(p) == paradigms.end()) + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Undefined paradigm '" << p << L"'." <' into '<" << COMPILER_ENTRY_ELEM; + wcerr << L">'." << endl; + exit(EXIT_FAILURE); + } + } +} + +void Compiler::procTag() {} //TODO + +void Compiler::procChar() {} //TODO + +void Compiler::procWb() {} //TODO + +void +Compiler::procNodeACX() +{ + xmlChar const *xnombre = xmlTextReaderConstName(reader); + wstring nombre = XMLParseUtil::towstring(xnombre); + + if(nombre == L"#text") + { + /* ignore */ + } + else if(nombre == L"analysis-chars") + { + /* ignore */ + } + else if(nombre == L"char") + { + acx_current_char = static_cast(attrib(L"value")[0]); + } + else if(nombre == L"equiv-char") + { + acx_map[acx_current_char].insert(static_cast(attrib(L"value")[0])); + } + else if(nombre == L"#comment") + { + /* ignore */ + } + else + { + wcerr << L"Error in ACX file (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Invalid node '<" << nombre << L">'." << endl; + exit(EXIT_FAILURE); + } +} + +void +Compiler::procNode() +{ + xmlChar const *xnombre = xmlTextReaderConstName(reader); + wstring nombre = XMLParseUtil::towstring(xnombre); + + // HACER: optimizar el orden de ejecuci�n de esta ristra de "ifs" + + if(nombre == L"#text") + { + /* ignorar */ + } + else if(nombre == COMPILER_DICTIONARY_ELEM) + { + /* ignorar */ + } + else if(nombre == COMPILER_ALPHABET_ELEM) + { + procAlphabet(); + } + else if(nombre == COMPILER_SDEFS_ELEM) + { + /* ignorar */ + } + else if(nombre == COMPILER_SDEF_ELEM) + { + procSDef(); + } + else if(nombre == COMPILER_PARDEFS_ELEM) + { + /* ignorar */ + } + else if(nombre == COMPILER_PARDEF_ELEM) + { + procParDef(); + } + else if(nombre == COMPILER_ENTRY_ELEM) + { + procEntry(); + } + else if(nombre == COMPILER_SECTION_ELEM) + { + procSection(); + } + else if(nombre == L"#comment") + { + /* ignorar */ + } + else if(nombre == COMPILER_ANYTAG_ELEM) { + procTag(); + } + else if(nombre == COMPILER_ANYCHAR_ELEM) { + procChar(); + } + else if(nombre == COMPILER_WB_ELEM) { + procWb(); + } + else + { + wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): Invalid node '<" << nombre << L">'." << endl; + exit(EXIT_FAILURE); + } +} + +EntryToken +Compiler::procRegexp() +{ + EntryToken et; + xmlTextReaderRead(reader); + wstring re = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); + et.setRegexp(re); + xmlTextReaderRead(reader); + return et; +} + +void +Compiler::write(FILE *output) +{ + // letters + Compression::wstring_write(letters, output); + + // symbols + alphabet.write(output); + + // transducers + Compression::multibyte_write(sections.size(), output); + + int conta=0; + for(map::iterator it = sections.begin(), + limit = sections.end(); + it != limit; it++) + { + conta++; + wcout << it->first << " " << it->second.size(); + wcout << " " << it->second.numberOfTransitions() << endl; + Compression::wstring_write(it->first, output); + it->second.write(output); + } +} + +void +Compiler::setAltValue(string const &a) +{ + alt = XMLParseUtil::stows(a); +} + +void +Compiler::setVariantValue(string const &v) +{ + variant = XMLParseUtil::stows(v); +} + +void +Compiler::setVariantLeftValue(string const &v) +{ + variant_left = XMLParseUtil::stows(v); +} + +void +Compiler::setVariantRightValue(string const &v) +{ + variant_right = XMLParseUtil::stows(v); +} + +void +Compiler::setVerbose(bool verbosity) +{ + verbose = verbosity; +} Index: branches/apertium-separable/src/lsx_compiler.cc =================================================================== --- branches/apertium-separable/src/lsx_compiler.cc (revision 80468) +++ branches/apertium-separable/src/lsx_compiler.cc (revision 80471) @@ -1,19 +1,3 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ // #ifndef _MYCOMPILER_ // #define _MYCOMPILER_ @@ -24,963 +8,37 @@ #include #include #include -#include -#include -#include +#include #include #include #include +#include #include #include -#include -#include +// #include #include -#include -#include -#include +// #include #include -// #include //file not found using namespace std; -wstring const Compiler::COMPILER_DICTIONARY_ELEM = L"dictionary"; -wstring const Compiler::COMPILER_ALPHABET_ELEM = L"alphabet"; -wstring const Compiler::COMPILER_SDEFS_ELEM = L"sdefs"; -wstring const Compiler::COMPILER_SDEF_ELEM = L"sdef"; -wstring const Compiler::COMPILER_N_ATTR = L"n"; -wstring const Compiler::COMPILER_PARDEFS_ELEM = L"pardefs"; -wstring const Compiler::COMPILER_PARDEF_ELEM = L"pardef"; -wstring const Compiler::COMPILER_PAR_ELEM = L"par"; -wstring const Compiler::COMPILER_ENTRY_ELEM = L"e"; -wstring const Compiler::COMPILER_RESTRICTION_ATTR = L"r"; -wstring const Compiler::COMPILER_RESTRICTION_LR_VAL = L"LR"; -wstring const Compiler::COMPILER_RESTRICTION_RL_VAL = L"RL"; -wstring const Compiler::COMPILER_PAIR_ELEM = L"p"; -wstring const Compiler::COMPILER_LEFT_ELEM = L"l"; -wstring const Compiler::COMPILER_RIGHT_ELEM = L"r"; -wstring const Compiler::COMPILER_S_ELEM = L"s"; -wstring const Compiler::COMPILER_REGEXP_ELEM = L"re"; -wstring const Compiler::COMPILER_SECTION_ELEM = L"section"; -wstring const Compiler::COMPILER_ID_ATTR = L"id"; -wstring const Compiler::COMPILER_TYPE_ATTR = L"type"; -wstring const Compiler::COMPILER_IDENTITY_ELEM = L"i"; -wstring const Compiler::COMPILER_JOIN_ELEM = L"j"; -wstring const Compiler::COMPILER_BLANK_ELEM = L"b"; -wstring const Compiler::COMPILER_POSTGENERATOR_ELEM = L"a"; -wstring const Compiler::COMPILER_GROUP_ELEM = L"g"; -wstring const Compiler::COMPILER_LEMMA_ATTR = L"lm"; -wstring const Compiler::COMPILER_IGNORE_ATTR = L"i"; -wstring const Compiler::COMPILER_IGNORE_YES_VAL = L"yes"; -wstring const Compiler::COMPILER_ALT_ATTR = L"alt"; -wstring const Compiler::COMPILER_V_ATTR = L"v"; -wstring const Compiler::COMPILER_VL_ATTR = L"vl"; -wstring const Compiler::COMPILER_VR_ATTR = L"vr"; +int main (/*int argc, char** argv*/) { + Alphabet alphabet; + Transducer t; -/* add to header -wstring const Compiler::COMPILER_ANYCHAR_ELEM = L"w"; -wstring const Compiler::COMPILER_ANYTAG_ELEM = L"t"; -wstring const Compiler::COMPILER_WB_ELEM = L"j"; -*/ + LtLocale::tryToSetLocale(); -Compiler::Compiler() : -reader(0), -verbose(false), -first_element(false), -acx_current_char(0) -{ -} + cout << "1" << endl; -Compiler::~Compiler() -{ -} + Compiler c; + c.parse("examples/new-example.dix", L"lr"); -void -Compiler::parseACX(string const &fichero, wstring const &dir) -{ - if(dir == COMPILER_RESTRICTION_LR_VAL) - { - reader = xmlReaderForFile(fichero.c_str(), NULL, 0); - if(reader == NULL) - { - cerr << "Error: cannot open '" << fichero << "'." << endl; - exit(EXIT_FAILURE); - } - int ret = xmlTextReaderRead(reader); - while(ret == 1) - { - procNodeACX(); - ret = xmlTextReaderRead(reader); - } - } -} -void -Compiler::parse(string const &fichero, wstring const &dir) -{ - direction = dir; - reader = xmlReaderForFile(fichero.c_str(), NULL, 0); - if(reader == NULL) - { - cerr << "Error: Cannot open '" << fichero << "'." << endl; - exit(EXIT_FAILURE); - } + // xmlTextReaderPtr reader; + // reader = xmlReaderForFile("examples/new-example.dix", NULL, 0); - int ret = xmlTextReaderRead(reader); - while(ret == 1) - { - procNode(); - ret = xmlTextReaderRead(reader); - } - - if(ret != 0) - { - wcerr << L"Error: Parse error at the end of input." << endl; - } - - xmlFreeTextReader(reader); - xmlCleanupParser(); - - - // Minimize transducers - for(map::iterator it = sections.begin(), - limit = sections.end(); - it != limit; it++) - { - (it->second).minimize(); - } -} - - -void -Compiler::procAlphabet() -{ - int tipo=xmlTextReaderNodeType(reader); - - if(tipo != XML_READER_TYPE_END_ELEMENT) - { - int ret = xmlTextReaderRead(reader); - if(ret == 1) - { - xmlChar const *valor = xmlTextReaderConstValue(reader); - letters = XMLParseUtil::towstring(valor); - bool espai = true; - for(unsigned int i = 0; i < letters.length(); i++) - { - if(!isspace(letters.at(i))) - { - espai = false; - break; - } - } - if(espai == true) // libxml2 returns '\n' for , should be empty - { - letters = L""; - } - } - else - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Missing alphabet symbols." << endl; - exit(EXIT_FAILURE); - } - } -} - -void -Compiler::procSDef() -{ - alphabet.includeSymbol(L"<"+attrib(COMPILER_N_ATTR)+L">"); -} - -void -Compiler::procParDef() -{ - int tipo=xmlTextReaderNodeType(reader); - - if(tipo != XML_READER_TYPE_END_ELEMENT) - { - current_paradigm = attrib(COMPILER_N_ATTR); - } - else - { - if(!paradigms[current_paradigm].isEmpty()) - { - paradigms[current_paradigm].minimize(); - paradigms[current_paradigm].joinFinals(); - current_paradigm = L""; - } - } -} - -int -Compiler::matchTransduction(list const &pi, - list const &pd, - int estado, Transducer &t) -{ - list::const_iterator izqda, dcha, limizqda, limdcha; - - if(direction == COMPILER_RESTRICTION_LR_VAL) - { - izqda = pi.begin(); - dcha = pd.begin(); - limizqda = pi.end(); - limdcha = pd.end(); - } - else - { - izqda = pd.begin(); - dcha = pi.begin(); - limizqda = pd.end(); - limdcha = pi.end(); - } - - - if(pi.size() == 0 && pd.size() == 0) - { - estado = t.insertNewSingleTransduction(alphabet(0, 0), estado); - } - else - { - map >::iterator acx_map_ptr; - int rsymbol = 0; - - while(true) - { - int etiqueta; - - acx_map_ptr = acx_map.end(); - - if(izqda == limizqda && dcha == limdcha) - { - break; - } - else if(izqda == limizqda) - { - etiqueta = alphabet(0, *dcha); - dcha++; - } - else if(dcha == limdcha) - { - etiqueta = alphabet(*izqda, 0); - acx_map_ptr = acx_map.find(*izqda); - rsymbol = 0; - izqda++; - } - else - { - etiqueta = alphabet(*izqda, *dcha); - acx_map_ptr = acx_map.find(*izqda); - rsymbol = *dcha; - izqda++; - dcha++; - } - - int nuevo_estado = t.insertSingleTransduction(etiqueta, estado); - - if(acx_map_ptr != acx_map.end()) - { - for(set::iterator it = acx_map_ptr->second.begin(); - it != acx_map_ptr->second.end(); it++) - { - t.linkStates(estado, nuevo_estado, alphabet(*it ,rsymbol)); - } - } - estado = nuevo_estado; - } - } - - return estado; -} - - -void -Compiler::requireEmptyError(wstring const &name) -{ - if(!xmlTextReaderIsEmptyElement(reader)) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Non-empty element '<" << name << L">' should be empty." << endl; - exit(EXIT_FAILURE); - } -} - -bool -Compiler::allBlanks() -{ - bool flag = true; - wstring text = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - - for(unsigned int i = 0, limit = text.size(); i < limit; i++) - { - flag = flag && iswspace(text[i]); - } - - return flag; -} - -/* -@param result: (referenced) empty list -@param name: name of the node -*/ -void -Compiler::readString(list &result, wstring const &name) -{ - wcout << "NAME" << name << endl; - if(name == L"#text") - { - wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); //NOTE returns the (wstring) text value of the node, or NULL if unavailable - for(unsigned int i = 0, limit = value.size(); i < limit; i++) - { - result.push_back(static_cast(value[i])); - } - } - else if(name == COMPILER_BLANK_ELEM) - { - requireEmptyError(name); - result.push_back(static_cast(L' ')); - } - else if(name == L"j" /*COMPILER_WB_ELEM*/) //FIXME "j" - { - requireEmptyError(name); - result.push_back(static_cast(L'$')); - } - else if(name == COMPILER_POSTGENERATOR_ELEM) - { - requireEmptyError(name); - result.push_back(static_cast(L'~')); - } - else if(name == COMPILER_GROUP_ELEM) - { - int tipo=xmlTextReaderNodeType(reader); - if(tipo != XML_READER_TYPE_END_ELEMENT) - { - result.push_back(static_cast(L'#')); - } - } - else if(name == L"w" /*COMPILER_ANYCHAR_ELEM*/) //FIXME "w" - { - wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - for(unsigned int i = 0, limit = value.size(); i < limit; i++) - { - result.push_back(static_cast(value[i])); - } - } - else if(name == L"t" /*COMPILER_ANYTAG_ELEM*/ ) //FIXME "t" - { - wstring value = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - for(unsigned int i = 0, limit = value.size(); i < limit; i++) - { - result.push_back(static_cast(value[i])); - } - } - else if(name == COMPILER_S_ELEM) - { - requireEmptyError(name); - wstring symbol = L"<" + attrib(COMPILER_N_ATTR) + L">"; //NOTE attrib from - - if(!alphabet.isSymbolDefined(symbol)) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined symbol '" << symbol << L"'." << endl; - exit(EXIT_FAILURE); - } - - result.push_back(alphabet(symbol)); - } - else - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid specification of element '<" << name; - wcerr << L">' in this context." << endl; - exit(EXIT_FAILURE); - } -} - -void -Compiler::skipBlanks(wstring &name) -{ - while(name == L"#text" || name == L"#comment") - { - if(name != L"#comment") - { - if(!allBlanks()) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; - exit(EXIT_FAILURE); - } - } - - xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - } -} - -void -Compiler::skip(wstring &name, wstring const &elem) -{ - skip(name, elem, true); -} - -void -Compiler::skip(wstring &name, wstring const &elem, bool open) -{ - xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - wstring slash; - - if(!open) - { - slash = L"/"; - } - - while(name == L"#text" || name == L"#comment") - { - if(name != L"#comment") - { - if(!allBlanks()) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid construction." << endl; - exit(EXIT_FAILURE); - } - } - xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - } - - if(name != elem) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Expected '<" << slash << elem << L">'." << endl; - exit(EXIT_FAILURE); - } -} - -EntryToken -Compiler::procIdentity() -{ - list both_sides; - - if(!xmlTextReaderIsEmptyElement(reader)) - { - wstring name = L""; - - while(true) - { - xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); //NOTE returns the qualified name of the node - if(name == COMPILER_IDENTITY_ELEM) - { - break; - } - readString(both_sides, name); - } - } - - if(verbose && first_element && (both_sides.front() == (int)L' ')) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Entry begins with space." << endl; - } - first_element = false; - EntryToken e; - e.setSingleTransduction(both_sides, both_sides); - return e; -} - -EntryToken -Compiler::procTransduction() -{ - list lhs, rhs; - wstring name; - - skip(name, COMPILER_LEFT_ELEM); - - if(!xmlTextReaderIsEmptyElement(reader)) - { - name = L""; - while(true) - { - xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - if(name == COMPILER_LEFT_ELEM) - { - break; - } - readString(lhs, name); - } - } - - if(verbose && first_element && (lhs.front() == (int)L' ')) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Entry begins with space." << endl; - } - first_element = false; - - skip(name, COMPILER_RIGHT_ELEM); - - if(!xmlTextReaderIsEmptyElement(reader)) - { - name = L""; - while(true) - { - xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - if(name == COMPILER_RIGHT_ELEM) - { - break; - } - readString(rhs, name); - } - } - - skip(name, COMPILER_PAIR_ELEM, false); - - EntryToken e; - e.setSingleTransduction(lhs, rhs); - return e; -} - -wstring -Compiler::attrib(wstring const &name) -{ - return XMLParseUtil::attrib(reader, name); -} - -EntryToken -Compiler::procPar() -{ - EntryToken e; - wstring nomparadigma = attrib(COMPILER_N_ATTR); - first_element = false; - - if(current_paradigm != L"" && nomparadigma == current_paradigm) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Paradigm refers to itself '" << nomparadigma << L"'." < const &elements) -{ - if(current_paradigm != L"") - { - // compilation of paradigms - Transducer &t = paradigms[current_paradigm]; - int e = t.getInitial(); - - for(unsigned int i = 0, limit = elements.size(); i < limit; i++) - { - if(elements[i].isParadigm()) - { - e = t.insertTransducer(e, paradigms[elements[i].paradigmName()]); - } - else if(elements[i].isSingleTransduction()) - { - e = matchTransduction(elements[i].left(), - elements[i].right(), e, t); - } - else if(elements[i].isRegexp()) - { - RegexpCompiler analyzer; - analyzer.initialize(&alphabet); - analyzer.compile(elements[i].regExp()); - e = t.insertTransducer(e, analyzer.getTransducer(), alphabet(0,0)); - } - else - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid entry token." << endl; - exit(EXIT_FAILURE); - } - } - t.setFinal(e); - } - else - { - // compilaci�n de dictionary - - Transducer &t = sections[current_section]; - int e = t.getInitial(); - - for(unsigned int i = 0, limit = elements.size(); i < limit; i++) - { - if(elements[i].isParadigm()) - { - if(i == elements.size()-1) - { - // paradigma sufijo - if(suffix_paradigms[current_section].find(elements[i].paradigmName()) != suffix_paradigms[current_section].end()) - { - t.linkStates(e, suffix_paradigms[current_section][elements[i].paradigmName()], 0); - e = postsuffix_paradigms[current_section][elements[i].paradigmName()]; - } - else - { - e = t.insertNewSingleTransduction(alphabet(0, 0), e); - suffix_paradigms[current_section][elements[i].paradigmName()] = e; - e = t.insertTransducer(e, paradigms[elements[i].paradigmName()]); - postsuffix_paradigms[current_section][elements[i].paradigmName()] = e; - } - } - else if(i == 0) - { - // paradigma prefijo - if(prefix_paradigms[current_section].find(elements[i].paradigmName()) != prefix_paradigms[current_section].end()) - { - e = prefix_paradigms[current_section][elements[i].paradigmName()]; - } - else - { - e = t.insertTransducer(e, paradigms[elements[i].paradigmName()]); - prefix_paradigms[current_section][elements[i].paradigmName()] = e; - } - } - else - { - // paradigma intermedio - e = t.insertTransducer(e, paradigms[elements[i].paradigmName()]); - } - } - else if(elements[i].isRegexp()) - { - RegexpCompiler analyzer; - analyzer.initialize(&alphabet); - analyzer.compile(elements[i].regExp()); - e = t.insertTransducer(e, analyzer.getTransducer(), alphabet(0,0)); - } - else - { - e = matchTransduction(elements[i].left(), elements[i].right(), e, t); - } - } - t.setFinal(e); - } -} - - -void -Compiler::requireAttribute(wstring const &value, wstring const &attrname, - wstring const &elemname) -{ - if(value == L"") - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): '<" << elemname; - wcerr << L"' element must specify non-void '"; - wcerr << attrname << L"' attribute." << endl; - exit(EXIT_FAILURE); - } -} - - -void -Compiler::procSection() -{ - int tipo=xmlTextReaderNodeType(reader); - - if(tipo != XML_READER_TYPE_END_ELEMENT) - { - wstring const &id = attrib(COMPILER_ID_ATTR); - wstring const &type = attrib(COMPILER_TYPE_ATTR); - requireAttribute(id, COMPILER_ID_ATTR, COMPILER_SECTION_ELEM); - requireAttribute(type, COMPILER_TYPE_ATTR, COMPILER_SECTION_ELEM); - - current_section = id; - current_section += L"@"; - current_section.append(type); - } - else - { - current_section = L""; - } -} - -void -Compiler::procEntry() -{ - wstring atributo=this->attrib(COMPILER_RESTRICTION_ATTR); - wstring ignore = this->attrib(COMPILER_IGNORE_ATTR); - wstring altval = this->attrib(COMPILER_ALT_ATTR); - wstring varval = this->attrib(COMPILER_V_ATTR); - wstring varl = this->attrib(COMPILER_VL_ATTR); - wstring varr = this->attrib(COMPILER_VR_ATTR); - - //�if entry is masked by a restriction of direction or an ignore mark - if((atributo != L"" && atributo != direction) - || ignore == COMPILER_IGNORE_YES_VAL - || (altval != L"" && altval != alt) - || (direction == COMPILER_RESTRICTION_RL_VAL && varval != L"" && varval != variant) - || (direction == COMPILER_RESTRICTION_RL_VAL && varl != L"" && varl != variant_left) - || (direction == COMPILER_RESTRICTION_LR_VAL && varr != L"" && varr != variant_right)) - { - // parse to the end of the entry - wstring name = L""; - - while(name != COMPILER_ENTRY_ELEM) - { - xmlTextReaderRead(reader); - name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - } - - return; - } - - vector elements; - - while(true) - { - int ret = xmlTextReaderRead(reader); - if(ret != 1) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Parse error." << endl; - exit(EXIT_FAILURE); - } - wstring name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); - skipBlanks(name); - - if(current_paradigm == L"" && verbose) - { - first_element = true; - } - - int tipo = xmlTextReaderNodeType(reader); - if(name == COMPILER_PAIR_ELEM) - { - elements.push_back(procTransduction()); - } - else if(name == COMPILER_IDENTITY_ELEM) - { - elements.push_back(procIdentity()); - } - else if(name == COMPILER_REGEXP_ELEM) - { - elements.push_back(procRegexp()); - } - else if(name == COMPILER_PAR_ELEM) - { - elements.push_back(procPar()); - - // detecci�n del uso de paradigmas no definidos - - wstring const &p = elements.rbegin()->paradigmName(); - - if(paradigms.find(p) == paradigms.end()) - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Undefined paradigm '" << p << L"'." <' into '<" << COMPILER_ENTRY_ELEM; - wcerr << L">'." << endl; - exit(EXIT_FAILURE); - } - } -} - -void -Compiler::procNodeACX() -{ - xmlChar const *xnombre = xmlTextReaderConstName(reader); - wstring nombre = XMLParseUtil::towstring(xnombre); - if(nombre == L"#text") - { - /* ignore */ - } - else if(nombre == L"analysis-chars") - { - /* ignore */ - } - else if(nombre == L"char") - { - acx_current_char = static_cast(attrib(L"value")[0]); - } - else if(nombre == L"equiv-char") - { - acx_map[acx_current_char].insert(static_cast(attrib(L"value")[0])); - } - else if(nombre == L"#comment") - { - /* ignore */ - } - else - { - wcerr << L"Error in ACX file (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << nombre << L">'." << endl; - exit(EXIT_FAILURE); - } -} - -void -Compiler::procNode() -{ - xmlChar const *xnombre = xmlTextReaderConstName(reader); - wstring nombre = XMLParseUtil::towstring(xnombre); - - // HACER: optimizar el orden de ejecuci�n de esta ristra de "ifs" - - if(nombre == L"#text") - { - /* ignorar */ - } - else if(nombre == COMPILER_DICTIONARY_ELEM) - { - /* ignorar */ - } - else if(nombre == COMPILER_ALPHABET_ELEM) - { - procAlphabet(); - } - else if(nombre == COMPILER_SDEFS_ELEM) - { - /* ignorar */ - } - else if(nombre == COMPILER_SDEF_ELEM) - { - procSDef(); - } - else if(nombre == COMPILER_PARDEFS_ELEM) - { - /* ignorar */ - } - else if(nombre == COMPILER_PARDEF_ELEM) - { - procParDef(); - } - else if(nombre == COMPILER_ENTRY_ELEM) - { - procEntry(); - } - else if(nombre == COMPILER_SECTION_ELEM) - { - procSection(); - } - else if(nombre== L"#comment") - { - /* ignorar */ - } - else - { - wcerr << L"Error (" << xmlTextReaderGetParserLineNumber(reader); - wcerr << L"): Invalid node '<" << nombre << L">'." << endl; - exit(EXIT_FAILURE); - } -} - -EntryToken -Compiler::procRegexp() -{ - EntryToken et; - xmlTextReaderRead(reader); - wstring re = XMLParseUtil::towstring(xmlTextReaderConstValue(reader)); - et.setRegexp(re); - xmlTextReaderRead(reader); - return et; -} - -void -Compiler::write(FILE *output) -{ - // letters - Compression::wstring_write(letters, output); - - // symbols - alphabet.write(output); - - // transducers - Compression::multibyte_write(sections.size(), output); - - int conta=0; - for(map::iterator it = sections.begin(), - limit = sections.end(); - it != limit; it++) - { - conta++; - wcout << it->first << " " << it->second.size(); - wcout << " " << it->second.numberOfTransitions() << endl; - Compression::wstring_write(it->first, output); - it->second.write(output); - } -} - -void -Compiler::setAltValue(string const &a) -{ - alt = XMLParseUtil::stows(a); -} - -void -Compiler::setVariantValue(string const &v) -{ - variant = XMLParseUtil::stows(v); -} - -void -Compiler::setVariantLeftValue(string const &v) -{ - variant_left = XMLParseUtil::stows(v); -} - -void -Compiler::setVariantRightValue(string const &v) -{ - variant_right = XMLParseUtil::stows(v); -} - -void -Compiler::setVerbose(bool verbosity) -{ - verbose = verbosity; -} - - -int main (int argc, char** argv) { - Alphabet alphabet; - Transducer t; - - LtLocale::tryToSetLocale(); - alphabet.includeSymbol(L""); alphabet.includeSymbol(L""); alphabet.includeSymbol(L""); @@ -1009,6 +67,7 @@ int initial = t.getInitial(); int take_out = initial; + take_out = t.insertSingleTransduction(alphabet(L't',L't'), take_out); take_out = t.insertSingleTransduction(alphabet(L'a',L'a'), take_out); take_out = t.insertSingleTransduction(alphabet(L'k',L'k'), take_out); @@ -1026,6 +85,8 @@ int after_takeout = take_out; + + /* no det */ int from_nodet = after_takeout; @@ -1133,18 +194,22 @@ t.linkStates(from_noadj, after_adj, 0); - FILE* fst = fopen("takeout.fst", "w+"); - // First write the letter symbols of the alphabet - Compression::wstring_write(L"aekout", fst); - // Then write the multicharacter symbols - alphabet.write(fst); - // Then write then number of transducers - Compression::multibyte_write(1, fst); - // Then write the name of the transducer - Compression::wstring_write(L"main@standard", fst); - // Then write the transducer - t.write(fst); - wcout << "t.size(): " << t.size() << endl ; + // FILE* fst = fopen("takeout.fst", "w+"); + // // First write the letter symbols of the alphabet + // Compression::wstring_write(L"aekout", fst); + // // Then write the multicharacter symbols + // alphabet.write(fst); + // // Then write then number of transducers + // Compression::multibyte_write(1, fst); + // // Then write the name of the transducer + // Compression::wstring_write(L"main@standard", fst); + // // Then write the transducer + // t.write(fst); + // wcout << "t.size(): " << t.size() << endl ; + + FILE* fst = fopen("lsx-compiler.fst", "w+"); + c.write(fst); + fclose(fst); return 0; Index: branches/apertium-separable/src/compiler_copy.h =================================================================== --- branches/apertium-separable/src/compiler_copy.h (nonexistent) +++ branches/apertium-separable/src/compiler_copy.h (revision 80471) @@ -0,0 +1,385 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _MYCOMPILER_ +#define _MYCOMPILER_ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace std; + +/** + * A compiler of dictionaries to letter transducers + */ +class Compiler +{ +private: + /** + * The libxml2's XML reader + */ + xmlTextReaderPtr reader; + + /** + * The alt value + */ + wstring alt; + + /** + * The variant value (monodix) + */ + wstring variant; + + /** + * The variant value (left side of bidix) + */ + wstring variant_left; + + /** + * The variant value (right side of bidix) + */ + wstring variant_right; + + /** + * The paradigm being compiled + */ + wstring current_paradigm; + + /** + * The dictionary section being compiled + */ + wstring current_section; + + /** + * The direction of the compilation, 'lr' (left-to-right) or 'rl' + * (right-to-left) + */ + wstring direction; + + /** + * List of characters to be considered alphabetic + */ + wstring letters; + + /** + * Set verbose mode: warnings which may or may not be correct + */ + bool verbose; + + /** + * First element (of an entry) + */ + bool first_element; + + /** + * Identifier of all the symbols during the compilation + */ + Alphabet alphabet; + + /** + * List of named transducers-paradigms + */ + map paradigms; + + /** + * List of named dictionary sections + */ + map sections; + + /** + * List of named prefix copy of a paradigm + */ + map, Ltstr> prefix_paradigms; + + /** + * List of named suffix copy of a paradigm + */ + map, Ltstr> suffix_paradigms; + + /** + * List of named endings of a suffix copy of a paradgim + */ + map, Ltstr> postsuffix_paradigms; + + /** + * Mapping of aliases of characters specified in ACX files + */ + map > acx_map; + + /** + * Original char being mapped + */ + int acx_current_char; + + /* + static string range(char const a, char const b); + string readAlphabet(); + */ + + /** + * Method to parse an XML Node + */ + void procNode(); + + /** + * Method to parse an XML Node in ACX files + */ + void procNodeACX(); + + + /** + * Parse the <alphabet> element + */ + void procAlphabet(); + + /** + * Parse the <sdef< element + */ + void procSDef(); + + /** + * Parse the <pardef> element + */ + void procParDef(); + + /** + * Parse the <e> element + */ + void procEntry(); + + + /* + * added + */ + void procTag(); + void procChar(); + void procWb(); + + /** + * Parse the <re> element + * @return a list of tokens from the dictionary's entry + */ + EntryToken procRegexp(); + + /** + * Parse the <section> element + */ + void procSection(); + + /** + * Gets an attribute value with their name and the current context + * @param name the name of the attribute + * @return the value of the attribute + */ + wstring attrib(wstring const &name); + + /** + * Construct symbol pairs by align left side of both parts and insert + * them into a transducer + * @param lp left part of the transduction + * @param rp right part of the transduction + * @param state the state from wich insert the new transduction + * @param t the transducer + * @return the last state of the inserted transduction + */ + int matchTransduction(list const &lp, list const &rp, + int state, Transducer &t); + /** + * Parse the <p< element + * @return a list of tokens from the dictionary's entry + */ + EntryToken procTransduction(); + + /** + * Parse the <i< element + * @return a list of tokens from the dictionary's entry + */ + EntryToken procIdentity(); + + /** + * Parse the <par> element + * @return a list of tokens from the dictionary's entry + */ + EntryToken procPar(); + + /** + * Insert a list of tokens into the paradigm / section being processed + * @param elements the list + */ + void insertEntryTokens(vector const &elements); + + /** + * Skip all document #text nodes before "elem" + * @param name the name of the node + * @param elem the name of the expected node + */ + void skip(wstring &name, wstring const &elem); + + /** + * Skip all document #text nodes before "elem" + * @param name the name of the node + * @param elem the name of the expected node + * @param open true for open element, false for closed + */ + void skip(wstring &name, wstring const &elem, bool open); + + /** + * Skip all blank #text nodes before "name" + * @param name the name of the node + */ + void skipBlanks(wstring &name); + + + void readString(list &result, wstring const &name); + + /** + * Force an element to be empty, and check for it + * @param name the element + */ + void requireEmptyError(wstring const &name); + + /** + * Force an attribute to be specified, amd check for it + * @param value the value of the attribute + * @param attrname the name of the attribute + * @param elemname the parent of the attribute + */ + void requireAttribute(wstring const &value, wstring const &attrname, + wstring const &elemname); + + /** + * True if all the elements in the current node are blanks + * @return true if all are blanks + */ + bool allBlanks(); + +public: + + /* + * Constants to represent the element and the attributes of + * dictionaries + */ + static wstring const COMPILER_DICTIONARY_ELEM; + static wstring const COMPILER_ALPHABET_ELEM; + static wstring const COMPILER_SDEFS_ELEM; + static wstring const COMPILER_SDEF_ELEM; + static wstring const COMPILER_N_ATTR; + static wstring const COMPILER_PARDEFS_ELEM; + static wstring const COMPILER_PARDEF_ELEM; + static wstring const COMPILER_PAR_ELEM; + static wstring const COMPILER_ENTRY_ELEM; + static wstring const COMPILER_RESTRICTION_ATTR; + static wstring const COMPILER_RESTRICTION_LR_VAL; + static wstring const COMPILER_RESTRICTION_RL_VAL; + static wstring const COMPILER_PAIR_ELEM; + static wstring const COMPILER_LEFT_ELEM; + static wstring const COMPILER_RIGHT_ELEM; + static wstring const COMPILER_S_ELEM; + static wstring const COMPILER_REGEXP_ELEM; + static wstring const COMPILER_SECTION_ELEM; + static wstring const COMPILER_ID_ATTR; + static wstring const COMPILER_TYPE_ATTR; + static wstring const COMPILER_IDENTITY_ELEM; + static wstring const COMPILER_JOIN_ELEM; + static wstring const COMPILER_BLANK_ELEM; + static wstring const COMPILER_POSTGENERATOR_ELEM; + static wstring const COMPILER_GROUP_ELEM; + static wstring const COMPILER_LEMMA_ATTR; + static wstring const COMPILER_IGNORE_ATTR; + static wstring const COMPILER_IGNORE_YES_VAL; + static wstring const COMPILER_ALT_ATTR; + static wstring const COMPILER_V_ATTR; + static wstring const COMPILER_VL_ATTR; + static wstring const COMPILER_VR_ATTR; + + static wstring const COMPILER_ANYTAG_ELEM; + static wstring const COMPILER_ANYCHAR_ELEM; + static wstring const COMPILER_WB_ELEM; + + + /** + * Constructor + */ + Compiler(); + + /** + * Destructor + */ + ~Compiler(); + + /** + * Compile dictionary to letter transducers + * @param fichero file + * @param dir direction + */ + void parse(string const &fichero, wstring const &dir); + + /** + * Read ACX file + */ + void parseACX(string const &fichero, wstring const &dir); + + + auto getAlt(); + auto getInt(); + + + /** + * Write the result of compilation + * @param fd the stream where write the result + */ + void write(FILE *fd); + + /** + * Set verbose output + */ + void setVerbose(bool verbosity = false); + + /** + * Set the alt value to use in compilation + * @param a the value + */ + void setAltValue(string const &a); + + /** + * Set the variant value to use in compilation + * @param v the value + */ + void setVariantValue(string const &v); + + /** + * Set the variant_left value to use in compilation + * @param v the value + */ + void setVariantLeftValue(string const &v); + + /** + * Set the variant_right value to use in compilation + * @param v the value + */ + void setVariantRightValue(string const &v); +}; + + +#endif Index: branches/apertium-separable/src/lsx_processor.cc =================================================================== --- branches/apertium-separable/src/lsx_processor.cc (revision 80468) +++ branches/apertium-separable/src/lsx_processor.cc (revision 80471) @@ -39,6 +39,11 @@ LtLocale::tryToSetLocale(); + if (argc != 2) { + cout << "incorrect usage: needs one input file" << endl; + exit(1); + } + FILE *fst = fopen(argv[1], "r"); set alphabetic_chars; Index: branches/apertium-separable/fst/Makefile =================================================================== --- branches/apertium-separable/fst/Makefile (revision 80468) +++ branches/apertium-separable/fst/Makefile (revision 80471) @@ -1,8 +1,8 @@ CFLAGS= -I/usr/local/include/lttoolbox-3.3 LDFLAGS= -L/usr/local/lib -llttoolbox3 -transducer2: ../src/transducer2.cpp - g++ -ggdb $(CFLAGS) -Wall ../src/transducer2.cpp -o $@ $(LDFLAGS) +transducer2: ../src/transducer2.cc + g++ -ggdb $(CFLAGS) -Wall ../src/transducer2.cc -o $@ $(LDFLAGS) all: transducer2