commit 3a293af7262f649d268aec4be95c707ae70a6a9a Author: Daniel Swanson Date: Fri Jun 4 11:48:57 2021 -0500 extracting string constants diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index 12509f5..31d2865 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -55,13 +55,14 @@ AttCompiler::clear() void AttCompiler::convert_hfst(UString& symbol) { - if (symbol == "@0@"_u || symbol == "ε"_u) - { + if (symbol == Transducer::HFST_EPSILON_SYMBOL_SHORT || + symbol == Transducer::HFST_EPSILON_SYMBOL_LONG || + symbol == Transducer::LTTB_EPSILON_SYMBOL) { symbol.clear(); - } - else if (symbol == "@_SPACE_@"_u) - { + } else if (symbol == Transducer::HFST_SPACE_SYMBOL) { symbol = " "_u; + } else if (symbol == Transducer::HFST_TAB_SYMBOL) { + symbol = "\t"_u; } } diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index b016d84..851c68b 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -464,10 +464,9 @@ Compiler::skip(UString &name, UString const &elem, bool open) } EntryToken -Compiler::procIdentity(UString const &wsweight, bool ig) +Compiler::procIdentity(double const entry_weight, bool ig) { vector both_sides; - double entry_weight = stod(wsweight); if(!xmlTextReaderIsEmptyElement(reader)) { @@ -507,10 +506,9 @@ Compiler::procIdentity(UString const &wsweight, bool ig) } EntryToken -Compiler::procTransduction(UString const &wsweight) +Compiler::procTransduction(double const entry_weight) { vector lhs, rhs; - double entry_weight = stod(wsweight); UString name; skip(name, COMPILER_LEFT_ELEM); @@ -718,7 +716,7 @@ Compiler::procSection() requireAttribute(type, COMPILER_TYPE_ATTR, COMPILER_SECTION_ELEM); current_section = id; - current_section += "@"_u; + current_section += '@'; current_section.append(type); } else @@ -758,9 +756,10 @@ Compiler::procEntry() return; } - if(wsweight.empty()) + double weight = 0.0; + if(!wsweight.empty()) { - wsweight = "0.0000"_u; + weight = stod(wsweight); } vector elements; @@ -785,15 +784,15 @@ Compiler::procEntry() int type = xmlTextReaderNodeType(reader); if(name == COMPILER_PAIR_ELEM) { - elements.push_back(procTransduction(wsweight)); + elements.push_back(procTransduction(weight)); } else if(name == COMPILER_IDENTITY_ELEM) { - elements.push_back(procIdentity(wsweight, false)); + elements.push_back(procIdentity(weight, false)); } else if(name == COMPILER_IDENTITYGROUP_ELEM) { - elements.push_back(procIdentity(wsweight, true)); + elements.push_back(procIdentity(weight, true)); } else if(name == COMPILER_REGEXP_ELEM) { diff --git a/lttoolbox/compiler.h b/lttoolbox/compiler.h index 5ad073d..ad18f69 100644 --- a/lttoolbox/compiler.h +++ b/lttoolbox/compiler.h @@ -222,13 +222,13 @@ private: * Parse the <p> element * @return a list of tokens from the dictionary's entry */ - EntryToken procTransduction(UString const &wsweight); + EntryToken procTransduction(double const entry_weight); /** * Parse the <i> element * @return a list of tokens from the dictionary's entry */ - EntryToken procIdentity(UString const &wsweight, bool ig = false); + EntryToken procIdentity(double const entry_weight, bool ig = false); /** * Parse the <par> element diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 90a1212..0bc6cc6 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -27,6 +27,18 @@ using namespace std; +UString const FSTProcessor::XML_TEXT_NODE = "#text"_u; +UString const FSTProcessor::XML_COMMENT_NODE = "#comment"_u; +UString const FSTProcessor::XML_IGNORED_CHARS_ELEM = "ignored-chars"_u; +UString const FSTProcessor::XML_RESTORE_CHAR_ELEM = "restore-char"_u; +UString const FSTProcessor::XML_RESTORE_CHARS_ELEM = "restore-chars"_u; +UString const FSTProcessor::XML_VALUE_ATTR = "value"_u; +UString const FSTProcessor::XML_CHAR_ELEM = "char"_u; +UString const FSTProcessor::WBLANK_START = "[["_u; +UString const FSTProcessor::WBLANK_END = "]]"_u; +UString const FSTProcessor::WBLANK_FINAL = "[[/]]"_u; + + FSTProcessor::FSTProcessor() : default_weight(0.0000), outOfWord(false), @@ -123,19 +135,19 @@ void FSTProcessor::procNodeICX() { UString name = XMLParseUtil::readName(reader); - if(name == "#text"_u) + if(name == XML_TEXT_NODE) { /* ignore */ } - else if(name == "ignored-chars"_u) + else if(name == XML_IGNORED_CHARS_ELEM) { /* ignore */ } - else if(name == "char"_u) + else if(name == XML_CHAR_ELEM) { - ignored_chars.insert(static_cast(XMLParseUtil::attrib(reader, "value"_u)[0])); + ignored_chars.insert(static_cast(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0])); } - else if(name == "#comment"_u) + else if(name == XML_COMMENT_NODE) { /* ignore */ } @@ -157,23 +169,23 @@ void FSTProcessor::procNodeRCX() { UString name = XMLParseUtil::readName(reader); - if(name == "#text"_u) + if(name == XML_TEXT_NODE) { /* ignore */ } - else if(name == "restore-chars"_u) + else if(name == XML_RESTORE_CHARS_ELEM) { /* ignore */ } - else if(name == "char"_u) + else if(name == XML_CHAR_ELEM) { - rcx_current_char = static_cast(XMLParseUtil::attrib(reader, "value"_u)[0]); + rcx_current_char = static_cast(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0]); } - else if(name == "restore-char"_u) + else if(name == XML_RESTORE_CHAR_ELEM) { - rcx_map[rcx_current_char].insert(static_cast(XMLParseUtil::attrib(reader, "value"_u)[0])); + rcx_map[rcx_current_char].insert(static_cast(XMLParseUtil::attrib(reader, XML_VALUE_ATTR)[0])); } - else if(name == "#comment"_u) + else if(name == XML_COMMENT_NODE) { /* ignore */ } @@ -235,8 +247,7 @@ FSTProcessor::readFullBlock(InputFile& input, UChar32 const delim1, UChar32 cons UString FSTProcessor::readWblank(InputFile& input) { - UString result; - result += "[["_u; + UString result = WBLANK_START; UChar32 c = 0; while(!input.eof()) @@ -271,8 +282,7 @@ FSTProcessor::readWblank(InputFile& input) bool FSTProcessor::wblankPostGen(InputFile& input, UFILE *output) { - UString result; - result += "[["_u; + UString result = WBLANK_START; UChar32 c = 0; while(!input.eof()) @@ -781,11 +791,11 @@ FSTProcessor::combineWblanks() while(wblankqueue.size() > 0) { - if(wblankqueue.front().compare("[[/]]"_u) == 0) + if(wblankqueue.front().compare(WBLANK_FINAL) == 0) { if(final_wblank.empty()) { - final_wblank += "[["_u; + final_wblank += WBLANK_START; } else if(final_wblank.size() > 2) { @@ -809,7 +819,7 @@ FSTProcessor::combineWblanks() if(!final_wblank.empty()) { - final_wblank += "]]"_u; + final_wblank += WBLANK_END; need_end_wblank = true; } @@ -1899,7 +1909,7 @@ FSTProcessor::postgeneration(InputFile& input, UFILE *output) { if(need_end_wblank) { - write("[[/]]"_u, output); + write(WBLANK_FINAL, output); need_end_wblank = false; } @@ -1920,7 +1930,7 @@ FSTProcessor::postgeneration(InputFile& input, UFILE *output) if(need_end_wblank) { - write("[[/]]"_u, output); + write(WBLANK_FINAL, output); need_end_wblank = false; } } @@ -2028,7 +2038,7 @@ FSTProcessor::postgeneration(InputFile& input, UFILE *output) if(need_end_wblank) { - write("[[/]]"_u, output); + write(WBLANK_FINAL, output); need_end_wblank = false; u_fputc(sf[space_index], output); flushWblanks(output); @@ -2394,32 +2404,17 @@ FSTProcessor::biltransfull(UString const &input_word, bool with_delim) } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = "^="_u + result.substr(1); - } - else - { - result[0] = '^'; - } + result.clear(); + if(with_delim) { + result += '^'; } - else - { - if(mark) - { - result = "="_u + result.substr(1); - } - else - { - result = result.substr(1); - } + if(mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) @@ -2562,32 +2557,17 @@ FSTProcessor::biltrans(UString const &input_word, bool with_delim) } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = "^="_u + result.substr(1); - } - else - { - result[0] = '^'; - } + result.clear(); + if (with_delim) { + result += '^'; } - else - { - if(mark) - { - result = "="_u + result.substr(1); - } - else - { - result = result.substr(1); - } + if (mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) @@ -2671,6 +2651,7 @@ UString FSTProcessor::compose(UString const &lexforms, UString const &queue) const { UString result; + result += '/'; for(unsigned int i = 1; i< lexforms.size(); i++) { @@ -2686,7 +2667,7 @@ FSTProcessor::compose(UString const &lexforms, UString const &queue) const result += lexforms[i]; } - return "/"_u + result + queue; + return result + queue; } void @@ -2937,32 +2918,17 @@ FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = "^="_u + result.substr(1); - } - else - { - result[0] = '^'; - } + result.clear(); + if (with_delim) { + result += '^'; } - else - { - if(mark) - { - result = "="_u + result.substr(1); - } - else - { - result = result.substr(1); - } + if (mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) @@ -2988,10 +2954,9 @@ FSTProcessor::biltransWithQueue(UString const &input_word, bool with_delim) } if (!seentags - && ""_u == current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0)) + && current_state.filterFinals(all_finals, alphabet, escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).empty()) { // word is not present if(with_delim) @@ -3118,32 +3083,17 @@ FSTProcessor::biltransWithoutQueue(UString const &input_word, bool with_delim) } if(current_state.isFinal(all_finals)) { - result = current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0); - if(with_delim) - { - if(mark) - { - result = "^="_u + result.substr(1); - } - else - { - result[0] = '^'; - } + result.clear(); + if (with_delim) { + result += '^'; } - else - { - if(mark) - { - result = "="_u + result.substr(1); - } - else - { - result = result.substr(1); - } + if (mark) { + result += '='; } + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); } if(current_state.size() == 0) diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 8be5eb0..6e5c218 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -489,6 +489,21 @@ private: xmlTextReaderPtr reader; public: + + /* + * String constants + */ + static UString const XML_TEXT_NODE; + static UString const XML_COMMENT_NODE; + static UString const XML_IGNORED_CHARS_ELEM; + static UString const XML_RESTORE_CHAR_ELEM; + static UString const XML_RESTORE_CHARS_ELEM; + static UString const XML_VALUE_ATTR; + static UString const XML_CHAR_ELEM; + static UString const WBLANK_START; + static UString const WBLANK_END; + static UString const WBLANK_FINAL; + FSTProcessor(); void initAnalysis(); diff --git a/lttoolbox/lt_print.cc b/lttoolbox/lt_print.cc index bc92108..c3c9ec4 100644 --- a/lttoolbox/lt_print.cc +++ b/lttoolbox/lt_print.cc @@ -179,7 +179,7 @@ int main(int argc, char *argv[]) it->second.show(alphabet, output, 0, hfst); if(it != penum) { - u_fputs("--\n"_u, output); + u_fputs("--"_u, output); } } diff --git a/lttoolbox/regexp_compiler.cc b/lttoolbox/regexp_compiler.cc index 677fbf5..96d98c8 100644 --- a/lttoolbox/regexp_compiler.cc +++ b/lttoolbox/regexp_compiler.cc @@ -25,6 +25,7 @@ index(0), alphabet(0), state(0), letter(0), +postop(0), default_weight(0.0000) { } @@ -202,20 +203,20 @@ RegexpCompiler::Term() e = t.insertNewSingleTransduction((*alphabet)(letter, letter), e, default_weight); t.setFinal(e, default_weight); Postop(); - if(postop == "*"_u) + if(postop == '*') { t.zeroOrMore((*alphabet)(0, 0)); } - else if(postop == "+"_u) + else if(postop == '+') { t.oneOrMore((*alphabet)(0, 0)); } - else if(postop == "?"_u) + else if(postop == '?') { t.optional((*alphabet)(0, 0)); } - postop.clear(); + postop = 0; state = transducer.insertTransducer(state, t, (*alphabet)(0, 0)); } else if(token == '(') @@ -229,20 +230,20 @@ RegexpCompiler::Term() consume(')'); transducer.setFinal(state, default_weight); Postop(); - if(postop == "*"_u) + if(postop == '*') { transducer.zeroOrMore((*alphabet)(0, 0)); } - else if(postop == "+"_u) + else if(postop == '+') { transducer.oneOrMore((*alphabet)(0, 0)); } - else if(postop == "?"_u) + else if(postop == '?') { transducer.optional((*alphabet)(0, 0)); } - postop.clear(); + postop = 0; state = t.insertTransducer(e, transducer, (*alphabet)(0, 0)); transducer = t; } @@ -300,17 +301,17 @@ RegexpCompiler::Postop() if(token == '*') { consume('*'); - postop = "*"_u; + postop = '*'; } else if(token == '?') { consume('?'); - postop = "?"_u; + postop = '?'; } else if(token == '+') { consume('+'); - postop = "+"_u; + postop = '+'; } else if(token == '(' || token == '[' || !isReserved(token) || token == '\\' || token == '|' || token == FIN_FICHERO || @@ -369,20 +370,20 @@ RegexpCompiler::Esp() error(); } - if(postop == "+"_u) + if(postop == '+') { t.oneOrMore((*alphabet)(0, 0)); } - else if(postop == "*"_u) + else if(postop == '*') { t.zeroOrMore((*alphabet)(0, 0)); } - else if(postop == "?"_u) + else if(postop == '?') { t.optional((*alphabet)(0, 0)); } brackets.clear(); - postop.clear(); + postop = 0; state = transducer.insertTransducer(state, t, (*alphabet)(0, 0)); } @@ -480,5 +481,5 @@ RegexpCompiler::initialize(Alphabet *a) setAlphabet(a); transducer.clear(); brackets.clear(); - postop.clear(); + postop = 0; } diff --git a/lttoolbox/regexp_compiler.h b/lttoolbox/regexp_compiler.h index ab7e460..e9bdb30 100644 --- a/lttoolbox/regexp_compiler.h +++ b/lttoolbox/regexp_compiler.h @@ -74,7 +74,7 @@ private: /** * Post-operator: '+', '?', '*' */ - UString postop; + UChar32 postop; /** * Default value of weight diff --git a/lttoolbox/state.cc b/lttoolbox/state.cc index d476efc..4edbb85 100644 --- a/lttoolbox/state.cc +++ b/lttoolbox/state.cc @@ -894,7 +894,8 @@ State::restartFinals(const map &finals, int requiredSymbol, Stat UString State::getReadableString(const Alphabet &a) { - UString retval = "["_u; + UString retval; + retval += '['; for(unsigned int i=0; i"_u); // -1 -> numbers - alphabet.includeSymbol(""_u); // -2 -> blanks + alphabet.includeSymbol(TMX_COMPILER_NUMBER_TAG); // -1 -> numbers + alphabet.includeSymbol(TMX_COMPILER_BLANK_TAG); // -2 -> blanks + number_tag = alphabet(TMX_COMPILER_NUMBER_TAG); + blank_tag = alphabet(TMX_COMPILER_BLANK_TAG); } TMXCompiler::~TMXCompiler() @@ -96,23 +102,23 @@ TMXCompiler::requireEmptyError(UString const &name) bool TMXCompiler::allBlanks() { - bool flag = true; UString text = XMLParseUtil::readValue(reader); for(auto c : text) { - flag = flag && u_isspace(c); + if (!u_isspace(c)) { + return false; + } } - - return flag; + return true; } void TMXCompiler::skipBlanks(UString &name) { - while(name == "#text"_u || name == "#comment"_u) + while(name == TMX_COMPILER_TEXT_NODE || name == TMX_COMPILER_COMMENT_NODE) { - if(name != "#comment"_u) + if(name != TMX_COMPILER_COMMENT_NODE) { if(!allBlanks()) { @@ -133,9 +139,9 @@ TMXCompiler::skip(UString &name, UString const &elem) xmlTextReaderRead(reader); name = XMLParseUtil::readName(reader); - while(name == "#text"_u || name == "#comment"_u) + while(name == TMX_COMPILER_TEXT_NODE || name == TMX_COMPILER_COMMENT_NODE) { - if(name != "#comment"_u) + if(name != TMX_COMPILER_COMMENT_NODE) { if(!allBlanks()) { @@ -192,7 +198,7 @@ TMXCompiler::insertTU(vector const &origin, vector const &meta) return; } - if(origin[0] == alphabet(""_u) || meta[0] == alphabet(""_u)) + if(origin[0] == blank_tag || meta[0] == blank_tag) { return; } @@ -268,12 +274,10 @@ TMXCompiler::align_blanks(vector &o, vector &m) vector puntos; vector resultado_o, resultado_m; - int const symbol = alphabet(""_u); - vector > so, sm; - split(o, so, symbol); - split(m, sm, symbol); + split(o, so, blank_tag); + split(m, sm, blank_tag); if(so.size() == sm.size()) { @@ -357,19 +361,15 @@ TMXCompiler::procTU() name = XMLParseUtil::readName(reader); type = xmlTextReaderNodeType(reader); - if(name == "#text"_u) + if(name == TMX_COMPILER_TEXT_NODE) { - UString l = XMLParseUtil::readValue(reader); - for(size_t i = 0, limit = l.size(); i != limit; i++) - { - ref->push_back(l[i]); - } + XMLParseUtil::readValueInto32(reader, *ref); } else if(name == TMX_COMPILER_HI_ELEM || name == TMX_COMPILER_PH_ELEM) { if(type != XML_READER_TYPE_END_ELEMENT) { - ref->push_back(alphabet(""_u)); + ref->push_back(blank_tag); } } } @@ -394,7 +394,7 @@ TMXCompiler::procNode() // HACER: optimizar el orden de ejecución de esta ristra de "ifs" - if(name == "#text"_u) + if(name == TMX_COMPILER_TEXT_NODE) { /* ignorar */ } @@ -418,7 +418,7 @@ TMXCompiler::procNode() { procTU(); } - else if(name== "#comment"_u) + else if(name== TMX_COMPILER_COMMENT_NODE) { /* ignorar */ } @@ -438,14 +438,14 @@ TMXCompiler::write(FILE *output) write_le(output, features); // letters (empty to keep the file format) - Compression::string_write(""_u, output); + Compression::multibyte_write(0, output); // symbols alphabet.write(output); - // transducers + // transducers (1, with empty name) Compression::multibyte_write(1, output); // keeping file format - Compression::string_write(""_u, output); // keeping file format + Compression::multibyte_write(0, output); // keeping file format transducer.write(output); cout << origin_language << "->" << meta_language << " "; @@ -498,7 +498,7 @@ TMXCompiler::align(vector &origin, vector &meta) numbers_origin_start.push_back(i); numbers_origin_length.push_back(nl); i += nl-1; - modified_origin.push_back(alphabet(""_u)); + modified_origin.push_back(number_tag); } else { diff --git a/lttoolbox/tmx_compiler.h b/lttoolbox/tmx_compiler.h index 7d0633a..9cf9595 100644 --- a/lttoolbox/tmx_compiler.h +++ b/lttoolbox/tmx_compiler.h @@ -76,6 +76,9 @@ private: */ UString meta_language_inner_code; + int32_t number_tag; + int32_t blank_tag; + /** * Method to parse an XML Node @@ -163,6 +166,10 @@ public: static UString const TMX_COMPILER_LANG_ATTR; static UString const TMX_COMPILER_SEG_ELEM; static UString const TMX_COMPILER_PROP_ELEM; + static UString const TMX_COMPILER_TEXT_NODE; + static UString const TMX_COMPILER_COMMENT_NODE; + static UString const TMX_COMPILER_NUMBER_TAG; + static UString const TMX_COMPILER_BLANK_TAG; /** diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index cf534a1..e27c972 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -26,6 +26,23 @@ #include #include +UString const Transducer::HFST_EPSILON_SYMBOL_SHORT = "@0@"_u; +UString const Transducer::HFST_EPSILON_SYMBOL_LONG = "@_EPSILON_SYMBOL_@"_u; +// could extend the ""_u helper to include u""_u +// this is the only place that needs it +UString const Transducer::LTTB_EPSILON_SYMBOL = UString(1, (UChar)0x3B5); + // = "ε"_u; +UString const Transducer::HFST_SPACE_SYMBOL = "@_SPACE_@"_u; +UString const Transducer::HFST_TAB_SYMBOL = "@_TAB_@"_u; +UString const Transducer::GROUP_SYMBOL = "#"_u; +UString const Transducer::JOIN_SYMBOL = "+"_u; +UString const Transducer::ANY_TAG_SYMBOL = ""_u; +UString const Transducer::ANY_CHAR_SYMBOL = ""_u; +UString const Transducer::LSX_BOUNDARY_SYMBOL = "<$>"_u; +UString const Transducer::COMPOUND_ONLY_L_SYMBOL = ""_u; +UString const Transducer::COMPOUND_R_SYMBOL = ""_u; + + int Transducer::newState() { @@ -720,21 +737,20 @@ Transducer::escapeSymbol(UString& symbol, bool hfst) const { if(hfst) { - symbol = "@0@"_u; + symbol = HFST_EPSILON_SYMBOL_SHORT; } else { - //symbol = "ε"_u; - symbol += (UChar)949; + symbol = LTTB_EPSILON_SYMBOL; } } else if(hfst && symbol == " "_u) { - symbol = "@_SPACE_@"_u; + symbol = HFST_SPACE_SYMBOL; } else if(hfst && symbol == "\t"_u) { - symbol = "@_TAB_@"_u; + symbol = HFST_TAB_SYMBOL; } } @@ -983,10 +999,6 @@ Transducer Transducer::moveLemqsLast(Alphabet const &alphabet, int const epsilon_tag) { - // TODO: These should be in file which is included by both - // fst_processor.cc and compiler.cc: - UString COMPILER_GROUP_ELEM = "#"_u; - Transducer new_t; typedef int SearchState; std::set seen; @@ -1009,7 +1021,7 @@ Transducer::moveLemqsLast(Alphabet const &alphabet, alphabet.getSymbol(left, alphabet.decode(label).first); int new_src = states_this_new[this_src]; - if(left == COMPILER_GROUP_ELEM) + if(left == GROUP_SYMBOL) { Transducer tagsFirst = copyWithTagsFirst(this_trg, label, alphabet, epsilon_tag); new_t.finals.insert(make_pair( @@ -1054,16 +1066,6 @@ Transducer::intersect(Transducer &trimmer, * The trimmer is typically a bidix passed through appendDotStar. */ - // TODO: These should be in file which is included by both - // fst_processor.cc and compiler.cc: - UString compoundOnlyLSymbol = ""_u; - UString compoundRSymbol = ""_u; - UString COMPILER_JOIN_ELEM = "+"_u; - UString COMPILER_GROUP_ELEM = "#"_u; - UString COMPILER_ANY_TAG = ""_u; - UString COMPILER_ANY_CHAR = ""_u; - UString COMPILER_SEPARABLE_BOUNDARY = "<$>"_u; - // When searching, we need to record (this, (trimmer, trimmer_pre_plus)) typedef std::pair > SearchState; // first: currently searched state in this; @@ -1138,7 +1140,7 @@ Transducer::intersect(Transducer &trimmer, UString this_right; this_a.getSymbol(this_right, this_a.decode(this_label).second); - if(this_right == COMPILER_JOIN_ELEM || this_right == COMPILER_SEPARABLE_BOUNDARY) + if(this_right == JOIN_SYMBOL || this_right == LSX_BOUNDARY_SYMBOL) { if(trimmer_preplus == trimmer_src) { // Keep the old preplus state if it was set; equal to current trimmer state means unset: @@ -1159,13 +1161,13 @@ Transducer::intersect(Transducer &trimmer, trimmed_trg, // toState this_label, // symbol-pair, using this alphabet this_wt); //weight of transduction - if(this_right == COMPILER_SEPARABLE_BOUNDARY && isFinal(this_trg)) + if(this_right == LSX_BOUNDARY_SYMBOL && isFinal(this_trg)) { trimmed.setFinal(trimmed_trg, default_weight); } } - else if ( this_right == compoundOnlyLSymbol - || this_right == compoundRSymbol + else if ( this_right == COMPOUND_ONLY_L_SYMBOL + || this_right == COMPOUND_R_SYMBOL || this_right.empty() ) { // Stay put in the trimmer FST @@ -1196,7 +1198,7 @@ Transducer::intersect(Transducer &trimmer, // Loop through non-epsilon arcs from the live state of trimmer // If we see a hash/group, we may have to rewind our trimmer state first: - if(this_right == COMPILER_GROUP_ELEM && trimmer_preplus != trimmer_src) + if(this_right == GROUP_SYMBOL && trimmer_preplus != trimmer_src) { states_this_trimmed.insert(make_pair(make_pair(this_src, make_pair(trimmer_preplus, trimmer_preplus)), @@ -1218,7 +1220,7 @@ Transducer::intersect(Transducer &trimmer, if(!trimmer_left.empty() && // we've already dealt with trimmer epsilons (this_right == trimmer_left || - (this_right == ((trimmer_left[0] == '<') ? COMPILER_ANY_TAG : COMPILER_ANY_CHAR)))) + (this_right == ((trimmer_left[0] == '<') ? ANY_TAG_SYMBOL : ANY_CHAR_SYMBOL)))) { next = make_pair(this_trg, make_pair(trimmer_trg, trimmer_preplus_next)); if(seen.find(next) == seen.end()) diff --git a/lttoolbox/transducer.h b/lttoolbox/transducer.h index 5dcabba..3dd91d4 100644 --- a/lttoolbox/transducer.h +++ b/lttoolbox/transducer.h @@ -91,6 +91,22 @@ private: public: + /** + * String constants + */ + static UString const HFST_EPSILON_SYMBOL_SHORT; + static UString const HFST_EPSILON_SYMBOL_LONG; + static UString const LTTB_EPSILON_SYMBOL; + static UString const HFST_SPACE_SYMBOL; + static UString const HFST_TAB_SYMBOL; + static UString const GROUP_SYMBOL; + static UString const JOIN_SYMBOL; + static UString const ANY_TAG_SYMBOL; + static UString const ANY_CHAR_SYMBOL; + static UString const LSX_BOUNDARY_SYMBOL; + static UString const COMPOUND_ONLY_L_SYMBOL; + static UString const COMPOUND_R_SYMBOL; + /** * Constructor */