commit b5d6e0758faac64045086fcf184b8859e5650b56 Author: Daniel Swanson Date: Thu Jun 3 12:14:47 2021 -0500 utf-32 in monodix and some type cleanup diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index d4186f2..f4f00fc 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -226,12 +226,12 @@ Compiler::procParDef() } int -Compiler::matchTransduction(list const &pi, - list const &pd, +Compiler::matchTransduction(vector const &pi, + vector const &pd, int state, Transducer &t, double const &entry_weight) { - list::const_iterator left, right, limleft, limright; + vector::const_iterator left, right, limleft, limright; if(direction == COMPILER_RESTRICTION_LR_VAL) { @@ -336,22 +336,18 @@ Compiler::allBlanks() for(auto c : text) { - flag = flag && iswspace(c); + flag = flag && u_isspace(c); } return flag; } void -Compiler::readString(list &result, UString const &name) +Compiler::readString(vector &result, UString const &name) { if(name == COMPILER_TEXT_NODE) { - UString value = XMLParseUtil::readValue(reader); - for(unsigned int i = 0, limit = value.size(); i < limit; i++) - { - result.push_back(static_cast(value[i])); - } + XMLParseUtil::readValueInto32(reader, result); } else if(name == COMPILER_M_ELEM) { @@ -471,7 +467,7 @@ Compiler::skip(UString &name, UString const &elem, bool open) EntryToken Compiler::procIdentity(UString const &wsweight, bool ig) { - list both_sides; + vector both_sides; double entry_weight = stod(wsweight); if(!xmlTextReaderIsEmptyElement(reader)) @@ -499,7 +495,7 @@ Compiler::procIdentity(UString const &wsweight, bool ig) EntryToken e; if(ig) { - list right; + vector right; right.push_back(static_cast(L'#')); right.insert(right.end(), both_sides.begin(), both_sides.end()); e.setSingleTransduction(both_sides, right, entry_weight); @@ -514,7 +510,7 @@ Compiler::procIdentity(UString const &wsweight, bool ig) EntryToken Compiler::procTransduction(UString const &wsweight) { - list lhs, rhs; + vector lhs, rhs; double entry_weight = stod(wsweight); UString name; @@ -941,8 +937,7 @@ Compiler::procRegexp() { EntryToken et; xmlTextReaderRead(reader); - UString re = XMLParseUtil::readValue(reader); - et.setRegexp(re); + et.readRegexp(reader); xmlTextReaderRead(reader); return et; } diff --git a/lttoolbox/compiler.h b/lttoolbox/compiler.h index b03c14a..5ad073d 100644 --- a/lttoolbox/compiler.h +++ b/lttoolbox/compiler.h @@ -216,7 +216,7 @@ private: * @param t the transducer * @return the last state of the inserted transduction */ - int matchTransduction(list const &lp, list const &rp, + int matchTransduction(vector const &lp, vector const &rp, int state, Transducer &t, double const &entry_weight); /** * Parse the <p> element @@ -264,7 +264,7 @@ private: void skipBlanks(UString &name); - void readString(list &result, UString const &name); + void readString(vector &result, UString const &name); /** * Force an element to be empty, and check for it diff --git a/lttoolbox/entry_token.cc b/lttoolbox/entry_token.cc index e23db8a..da00e9a 100644 --- a/lttoolbox/entry_token.cc +++ b/lttoolbox/entry_token.cc @@ -68,7 +68,7 @@ EntryToken::setParadigm(UString const &np) } void -EntryToken::setSingleTransduction(list const &pi, list const &pd, double const ew) +EntryToken::setSingleTransduction(vector const &pi, vector const &pd, double const ew) { weight = ew; leftSide = pi; @@ -79,7 +79,15 @@ EntryToken::setSingleTransduction(list const &pi, list const &pd, doub void EntryToken::setRegexp(UString const &r) { - myregexp = r; + //myregexp = r; + myregexp = vector(r.begin(), r.end()); + type = regexp; +} + +void +EntryToken::readRegexp(xmlTextReaderPtr reader) +{ + XMLParseUtil::readValueInto32(reader, myregexp); type = regexp; } @@ -107,19 +115,19 @@ EntryToken::paradigmName() const return parName; } -list const & +vector const & EntryToken::left() const { return leftSide; } -list const & +vector const & EntryToken::right() const { return rightSide; } -UString const & +vector const & EntryToken::regExp() const { return myregexp; diff --git a/lttoolbox/entry_token.h b/lttoolbox/entry_token.h index 804162e..0b4b43a 100644 --- a/lttoolbox/entry_token.h +++ b/lttoolbox/entry_token.h @@ -18,8 +18,10 @@ #define _ENTRYTOKEN_ -#include +#include #include +#include +#include using namespace std; @@ -52,17 +54,17 @@ private: /** * Left side of transduction (if 'single_transduction') */ - list leftSide; + vector leftSide; /** * Right side of transduction (if 'single_transduction') */ - list rightSide; + vector rightSide; /** * Regular expression (if 'regexp') */ - UString myregexp; + vector myregexp; /** * copy method @@ -107,7 +109,7 @@ public: * @param pd right part * @param ew entry weight */ - void setSingleTransduction(list const &pi, list const &pd, double const ew = 0); + void setSingleTransduction(vector const &pi, vector const &pd, double const ew = 0); /** * Set regular expression. @@ -115,6 +117,12 @@ public: */ void setRegexp(UString const &r); + /** + * More efficient version of setRegexp() + * @param reader the current xml parser state + */ + void readRegexp(xmlTextReaderPtr reader); + /** * eTest EntryToken to detect if is a paradigm. * @return true if it is a paradigm. @@ -143,19 +151,19 @@ public: * Retrieve the left part of the paradigm. * @return the left part of the paradigm. */ - list const & left() const; + vector const & left() const; /** * Retrieve the right part of the paradigm. * @return the right part of the paradigm. */ - list const & right() const; + vector const & right() const; /** * Retrieve the regular expression specification. * @return the regular expression specification. */ - UString const & regExp() const; + vector const & regExp() const; /** * Retrieve the weight value of the entry. diff --git a/lttoolbox/regexp_compiler.cc b/lttoolbox/regexp_compiler.cc index 340f953..96151d8 100644 --- a/lttoolbox/regexp_compiler.cc +++ b/lttoolbox/regexp_compiler.cc @@ -20,6 +20,7 @@ #include RegexpCompiler::RegexpCompiler() : +index(0), token(0), alphabet(0), state(0), @@ -112,14 +113,14 @@ RegexpCompiler::consume(int const t) { if(token == t) { - input = input.substr(1); - if(input.empty()) + index++; + if(index == input.size()) { token = FIN_FICHERO; } else { - token = input[0]; + token = input[index]; } } else @@ -129,10 +130,11 @@ RegexpCompiler::consume(int const t) } void -RegexpCompiler::compile(UString const &er) +RegexpCompiler::compile(vector const &er) { input = er; - token = static_cast(input[0]); + token = input[0]; + index = 0; state = transducer.getInitial(); S(); transducer.setFinal(state, default_weight); diff --git a/lttoolbox/regexp_compiler.h b/lttoolbox/regexp_compiler.h index da428d9..ab7e460 100644 --- a/lttoolbox/regexp_compiler.h +++ b/lttoolbox/regexp_compiler.h @@ -22,6 +22,8 @@ #include #include +#include +#include using namespace std; @@ -42,7 +44,12 @@ private: /** * Input string */ - UString input; + vector input; + + /** + * Location in the input string + */ + size_t index; /** * Alphabet to encode symbols @@ -201,7 +208,7 @@ public: * Function that parses a regular expression and produces a transducer * @param er the regular expression */ - void compile(UString const &er); + void compile(vector const &er); /** * Set the decoder of symbols diff --git a/lttoolbox/xml_parse_util.cc b/lttoolbox/xml_parse_util.cc index d60f62b..eb5da81 100644 --- a/lttoolbox/xml_parse_util.cc +++ b/lttoolbox/xml_parse_util.cc @@ -59,3 +59,12 @@ XMLParseUtil::readValue(xmlTextReaderPtr reader) const xmlChar* val = xmlTextReaderConstValue(reader); return to_ustring(reinterpret_cast(val)); } + +void +XMLParseUtil::readValueInto32(xmlTextReaderPtr reader, vector& vec) +{ + const xmlChar* val = xmlTextReaderConstValue(reader); + auto sz = xmlStrlen(val); + vec.reserve(vec.size() + sz); + utf8::utf8to32(val, val+sz, std::back_inserter(vec)); +} diff --git a/lttoolbox/xml_parse_util.h b/lttoolbox/xml_parse_util.h index 2dfdd0f..a9b5c3c 100644 --- a/lttoolbox/xml_parse_util.h +++ b/lttoolbox/xml_parse_util.h @@ -20,6 +20,8 @@ #include #include #include +#include +#include using namespace std; @@ -35,6 +37,7 @@ public: static UString readName(xmlTextReaderPtr reader); static UString readValue(xmlTextReaderPtr reader); + static void readValueInto32(xmlTextReaderPtr reader, vector& vec); }; #endif