commit 9b075d69579c470dd1014dd776ff851c6913c291 Author: Daniel Swanson Date: Wed Jun 16 13:54:45 2021 -0500 move constant initializers to header and make more use of helpers diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index ea07823..0e63a0a 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -39,10 +39,7 @@ UString const FSTProcessor::WBLANK_END = "]]"_u; UString const FSTProcessor::WBLANK_FINAL = "[[/]]"_u; -FSTProcessor::FSTProcessor() : -default_weight(0.0000), -outOfWord(false), -isLastBlankTM(false) +FSTProcessor::FSTProcessor() { // escaped_chars chars escaped_chars.insert('['); @@ -57,23 +54,6 @@ isLastBlankTM(false) escaped_chars.insert('<'); escaped_chars.insert('>'); - caseSensitive = false; - dictionaryCase = false; - do_decomposition = false; - nullFlush = false; - nullFlushGeneration = false; - useIgnoredChars = false; - useDefaultIgnoredChars = true; - useRestoreChars = false; - displayWeightsMode = false; - showControlSymbols = false; - biltransSurfaceForms = false; - maxAnalyses = INT_MAX; - maxWeightClasses = INT_MAX; - compoundOnlyLSymbol = 0; - compoundRSymbol = 0; - compound_max_elements = 4; - if(useDefaultIgnoredChars) { initDefaultIgnoredCharacters(); @@ -197,88 +177,6 @@ FSTProcessor::procNodeRCX() } } -UChar32 -FSTProcessor::readEscaped(InputFile& input) -{ - if(input.eof()) - { - streamError(); - } - - UChar32 val = input.get(); - - if(input.eof()) - { - streamError(); - } - - return val; -} - -UString -FSTProcessor::readFullBlock(InputFile& input, UChar32 const delim1, UChar32 const delim2) -{ - UString result; - result += delim1; - UChar32 c = delim1; - - while(!input.eof() && c != delim2) - { - c = input.get(); - result += c; - if(c != '\\') - { - continue; - } - else - { - result += readEscaped(input); - } - } - - if(c != delim2) - { - streamError(); - } - - return result; -} - -UString -FSTProcessor::readWblank(InputFile& input) -{ - UString result = WBLANK_START; - UChar32 c = 0; - - while(!input.eof()) - { - c = input.get(); - result += c; - - if(c == '\\') - { - result += readEscaped(input); - } - else if(c == ']') - { - c = input.get(); - result += c; - - if(c == ']') - { - break; - } - } - } - - if(c != ']') - { - streamError(); - } - - return result; -} - bool FSTProcessor::wblankPostGen(InputFile& input, UFILE *output) { @@ -308,7 +206,8 @@ FSTProcessor::wblankPostGen(InputFile& input, UFILE *output) if(c == '\\') { - result += readEscaped(input); + if (input.eof()) streamError(); + result += input.get(); } else if(c == ']') { @@ -368,7 +267,7 @@ FSTProcessor::readAnalysis(InputFile& input) switch(val) { case '<': - altval = alphabet(readFullBlock(input, '<', '>')); + altval = alphabet(input.readBlock('<', '>')); input_buffer.add(altval); return altval; @@ -377,12 +276,12 @@ FSTProcessor::readAnalysis(InputFile& input) if(val == '[') { - blankqueue.push(readWblank(input)); + blankqueue.push(input.finishWBlank()); } else { input.unget(val); - blankqueue.push(readFullBlock(input, '[', ']')); + blankqueue.push(input.readBlock('[', ']')); } input_buffer.add(static_cast(' ')); @@ -426,7 +325,7 @@ FSTProcessor::readTMAnalysis(InputFile& input) switch(val) { case '<': - altval = alphabet(readFullBlock(input, '<', '>')); + altval = alphabet(input.readBlock('<', '>')); input_buffer.add(altval); return altval; @@ -435,12 +334,12 @@ FSTProcessor::readTMAnalysis(InputFile& input) if(val == '[') { - blankqueue.push(readWblank(input)); + blankqueue.push(input.finishWBlank()); } else { input.unget(val); - blankqueue.push(readFullBlock(input, '[', ']')); + blankqueue.push(input.readBlock('[', ']')); } input_buffer.add(static_cast(' ')); @@ -503,7 +402,7 @@ FSTProcessor::readPostgeneration(InputFile& input, UFILE *output) switch(val) { case '<': - altval = alphabet(readFullBlock(input, '<', '>')); + altval = alphabet(input.readBlock('<', '>')); input_buffer.add(altval); return altval; @@ -514,7 +413,7 @@ FSTProcessor::readPostgeneration(InputFile& input, UFILE *output) { if(collect_wblanks) { - wblankqueue.push(readWblank(input)); + wblankqueue.push(input.finishWBlank()); is_wblank = true; return static_cast(' '); } @@ -531,7 +430,7 @@ FSTProcessor::readPostgeneration(InputFile& input, UFILE *output) else { input.unget(val); - blankqueue.push(readFullBlock(input, '[', ']')); + blankqueue.push(input.readBlock('[', ']')); input_buffer.add(static_cast(' ')); return static_cast(' '); @@ -654,19 +553,19 @@ FSTProcessor::readGeneration(InputFile& input, UFILE *output) } else if(val == '<') { - return alphabet(readFullBlock(input, '<', '>')); + return alphabet(input.readBlock('<', '>')); } else if(val == '[') { val = input.get(); if(val == '[') { - write(readWblank(input), output); + write(input.finishWBlank(), output); } else { input.unget(val); - write(readFullBlock(input, '[', ']'), output); + write(input.readBlock('[', ']'), output); } return readGeneration(input, output); @@ -741,7 +640,7 @@ FSTProcessor::readBilingual(InputFile& input, UFILE *output) } else if(val == '<') { - UString cad = readFullBlock(input, '<', '>'); + UString cad = input.readBlock('<', '>'); int res = alphabet(cad); @@ -756,12 +655,12 @@ FSTProcessor::readBilingual(InputFile& input, UFILE *output) val = input.get(); if(val == '[') { - write(readWblank(input), output); + write(input.finishWBlank(), output); } else { input.unget(val); - write(readFullBlock(input, '[', ']'), output); + write(input.readBlock('[', ']'), output); } return readBilingual(input, output); @@ -3177,12 +3076,12 @@ FSTProcessor::readSAO(InputFile& input) { if(val == '<') { - UString str = readFullBlock(input, '<', '>'); + UString str = input.readBlock('<', '>'); if(str.substr(0, 9) == ""_u) { - str.append(readFullBlock(input, '<', '>').substr(1)); + str.append(input.readBlock('<', '>').substr(1)); } blankqueue.push(str); input_buffer.add(static_cast(' ')); diff --git a/lttoolbox/fst_processor.h b/lttoolbox/fst_processor.h index 76d3783..32263ac 100644 --- a/lttoolbox/fst_processor.h +++ b/lttoolbox/fst_processor.h @@ -72,7 +72,7 @@ private: /** * Default value of weight unless specified */ - double default_weight; + double default_weight = 0.0000; /** * The final states of inconditional sections in the dictionaries @@ -152,86 +152,86 @@ private: /** * true if the position of input stream is out of a word */ - bool outOfWord; + bool outOfWord = false; /** * true if we're automatically removing surface forms. */ - bool biltransSurfaceForms; + bool biltransSurfaceForms = false; /** * if true, makes always difference between uppercase and lowercase * characters */ - bool caseSensitive; + bool caseSensitive = false; /** * if true, uses the dictionary case, discarding surface case * information */ - bool dictionaryCase; + bool dictionaryCase = false; /** * if true, flush the output when the null character is found */ - bool nullFlush; + bool nullFlush = false; /** * nullFlush property for the skipUntil function */ - bool nullFlushGeneration; + bool nullFlushGeneration = false; /** * if true, ignore the provided set of characters */ - bool useIgnoredChars; + bool useIgnoredChars = false; /** * if true, attempt simplistic diacritic restoration */ - bool useRestoreChars; + bool useRestoreChars = false; /** * if true, skips loading the default set of ignored characters */ - bool useDefaultIgnoredChars; + bool useDefaultIgnoredChars = true; /** * if true, displays the final weights (if any) */ - bool displayWeightsMode; + bool displayWeightsMode = false; /** * try analysing unknown words as compounds */ - bool do_decomposition; + bool do_decomposition = false; /** * Symbol of CompoundOnlyL */ - int compoundOnlyLSymbol; + int compoundOnlyLSymbol = 0; /** * Symbol of CompoundR */ - int compoundRSymbol; + int compoundRSymbol = 0; /** * Show or not the controls symbols (as compoundRSymbol) */ - bool showControlSymbols; + bool showControlSymbols = false; /** * Max compound elements * Hard coded for now, but there might come a switch one day */ - int compound_max_elements; + int compound_max_elements = 4; /** * Output no more than 'N' number of weighted analyses */ - int maxAnalyses; + int maxAnalyses = INT_MAX; /** * True if a wblank block ([[..]]xyz[[/]]) was just read @@ -251,34 +251,13 @@ private: /** * Output no more than 'N' best weight classes */ - int maxWeightClasses; + int maxWeightClasses = INT_MAX; /** * Prints an error of input stream and exits */ void streamError(); - /** - * Reads a character that is defined in the set of escaped_chars - * @param input the stream to read from - * @return code of the character - */ - UChar32 readEscaped(InputFile& input); - - /** - * Reads a block from the stream input, enclosed by delim1 and delim2 - * @param input the stream being read - * @param delim1 the delimiter of the beginning of the sequence - * @param delim1 the delimiter of the end of the sequence - */ - UString readFullBlock(InputFile& input, UChar32 const delim1, UChar32 const delim2); - - /** - * Reads a wordbound blank from the stream input - * @param input the stream being read - */ - UString readWblank(InputFile& input); - /** * Reads a wordbound blank (opening blank to closing blank) from the stream input -> [[...]]xyz[[/]] * @param input the stream being read @@ -493,7 +472,7 @@ private: void procNodeRCX(); void initDefaultIgnoredCharacters(); - bool isLastBlankTM; + bool isLastBlankTM = false; xmlTextReaderPtr reader; public: diff --git a/lttoolbox/input_file.cc b/lttoolbox/input_file.cc index d12f6b8..2bd9a8f 100644 --- a/lttoolbox/input_file.cc +++ b/lttoolbox/input_file.cc @@ -167,3 +167,23 @@ InputFile::readBlock(const UChar32 start, const UChar32 end) } return ret; } + +UString +InputFile::finishWBlank() +{ + UString ret; + ret += '['; + ret += '['; + UChar32 c = 0; + while (!eof()) { + c = get(); + ret += c; + if (c == '\\') { + ret += get(); + } else if (c == ']' && peek() == ']') { + ret += get(); + break; + } + } + return ret; +} diff --git a/lttoolbox/input_file.h b/lttoolbox/input_file.h index 6426f7c..de031c8 100644 --- a/lttoolbox/input_file.h +++ b/lttoolbox/input_file.h @@ -42,7 +42,13 @@ public: void unget(UChar32 c); bool eof(); void rewind(); + // assumes that start has already been read + // returns string from start to end inclusive + // respects backslash escapes UString readBlock(const UChar32 start, const UChar32 end); + // assumes [[ has already been read, reads to ]] + // returns entire string, including brackets + UString finishWBlank(); }; #endif