commit 811b4788d00646e52d25dd70f295acb86372f92c Author: Daniel Swanson Date: Mon Jun 14 17:40:41 2021 -0500 random cleanup diff --git a/src/chunk.cc b/src/chunk.cc index 2df850c..4a0b4c7 100644 --- a/src/chunk.cc +++ b/src/chunk.cc @@ -1,6 +1,6 @@ #include #include -#include +#include #include @@ -109,7 +109,7 @@ Chunk::getTags(const vector& parentTags) { if(isNum) { - unsigned int n = stoi(target.substr(last+1, j-last-1)); + unsigned int n = StringUtils::stoi(target.substr(last+1, j-last-1)); if(n != 0 && n <= parentTags.size()) { ret.push_back(parentTags[n-1]); @@ -157,7 +157,7 @@ Chunk::updateTags(const vector& parentTags) { if(isNum) { - unsigned int n = stoi(target.substr(last+1, j-last-1)); + unsigned int n = StringUtils::stoi(target.substr(last+1, j-last-1)); if(n != 0 && n <= parentTags.size()) { result += parentTags[n-1]; diff --git a/src/chunk.h b/src/chunk.h index 648fc74..9e6f6d0 100644 --- a/src/chunk.h +++ b/src/chunk.h @@ -3,7 +3,6 @@ #include #include -#include #include #include diff --git a/src/matcher.h b/src/matcher.h index 0312af3..a1c8f78 100644 --- a/src/matcher.h +++ b/src/matcher.h @@ -6,6 +6,7 @@ #include #include #include +#include using namespace std; diff --git a/src/pattern.cc b/src/pattern.cc index b36419b..ea4406b 100644 --- a/src/pattern.cc +++ b/src/pattern.cc @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include @@ -83,7 +83,7 @@ PatternBuilder::insertTags(int const base, const vector& tags) } else { - vector tgs = StringUtils::split_UString(tags[i], "."_u); + vector tgs = StringUtils::split(tags[i], "."_u); for(auto t : tgs) { UString tg = "<"_u + t + ">"_u; @@ -508,7 +508,7 @@ PatternBuilder::loadLexFile(const string& fname) pat.push_back(vector(1, p)); } lex.get(); - lexicalizations[name].push_back(make_pair(stod(weight), pat)); + lexicalizations[name].push_back(make_pair(StringUtils::stod(weight), pat)); } } @@ -562,7 +562,7 @@ PatternBuilder::write(FILE* output, int longest, vector> inpu if(s.compare(0, rule_sym_pre.size(), rule_sym_pre) != 0) { continue; } - const int rule_num = stoi(s.substr(rule_sym_pre.size())); + const int rule_num = StringUtils::stoi(s.substr(rule_sym_pre.size())); transducer.setFinal(src); finals_rules.insert(make_pair(src, make_pair(rule_num, wgt))); } diff --git a/src/rtx_comp.cc b/src/rtx_comp.cc index 9bbf87f..d9deb8e 100644 --- a/src/rtx_comp.cc +++ b/src/rtx_comp.cc @@ -3,13 +3,11 @@ #include #include #include -#include #include #include #include #include -using namespace Apertium; using namespace std; void endProgram(char *name) diff --git a/src/rtx_compiler.cc b/src/rtx_compiler.cc index 72ee13b..962fe29 100644 --- a/src/rtx_compiler.cc +++ b/src/rtx_compiler.cc @@ -1,6 +1,6 @@ #include #include -#include +#include using namespace std; @@ -255,7 +255,7 @@ RTXCompiler::parseInt() { ret += getchar(); } - return stoi(ret); + return StringUtils::stoi(ret); } float @@ -269,7 +269,7 @@ RTXCompiler::parseWeight() float r; try { - r = stod(ret); + r = StringUtils::stod(ret); } catch(const invalid_argument& ia) { diff --git a/src/rtx_proc.cc b/src/rtx_proc.cc index b2ccdb5..63093cb 100644 --- a/src/rtx_proc.cc +++ b/src/rtx_proc.cc @@ -3,6 +3,7 @@ #include #include #include +#include void endProgram(char *name) { diff --git a/src/rtx_processor.cc b/src/rtx_processor.cc index 08afc66..3a71000 100644 --- a/src/rtx_processor.cc +++ b/src/rtx_processor.cc @@ -1,34 +1,17 @@ #include #include #include -#include +//#include #include #include -#include +#include //#include -using namespace Apertium; using namespace std; RTXProcessor::RTXProcessor() { - furtherInput = true; - inword = false; - inwblank = false; - printingSteps = false; - printingRules = false; - printingBranches = false; - printingAll = false; - noCoref = true; - isLinear = false; - null_flush = false; - printingTrees = false; - printingText = true; - treePrintMode = TreeModeNest; - newBranchId = 0; - noFilter = true; - currentBranch = NULL; } RTXProcessor::~RTXProcessor() @@ -175,43 +158,6 @@ RTXProcessor::endsWith(UString const &s1, UString const &s2) const return true; } -UString -RTXProcessor::copycase(UString const &source_word, UString const &target_word) -{ - UString result; - - bool firstupper = iswupper(source_word[0]); - bool uppercase = firstupper && iswupper(source_word[source_word.size()-1]); - bool sizeone = source_word.size() == 1; - - if(!uppercase || (sizeone && uppercase)) - { - if(isLinear) - { - result = target_word; - result[0] = towlower(result[0]); - } - else result = StringUtils::tolower(target_word); - } - else - { - result = StringUtils::toupper(target_word); - } - - if(firstupper) - { - result[0] = towupper(result[0]); - } - - return result; -} - -UString -RTXProcessor::caseOf(UString const &s) -{ - return copycase(s, "aa"_u); -} - inline bool RTXProcessor::popBool() { @@ -789,7 +735,7 @@ RTXProcessor::applyRule(const UString& rule) break; case GETCASE: if(printingSteps) { cerr << "getcase" << endl; } - pushStack(caseOf(popString())); + pushStack(StringUtils::getcase(popString())); if(printingSteps) { cerr << " -> " << theStack[stackIdx].s << endl; } break; case SETCASE: @@ -797,7 +743,7 @@ RTXProcessor::applyRule(const UString& rule) { UString src = popString(); UString dest = popString(); - pushStack(copycase(src, dest)); + pushStack(StringUtils::copycase(src, dest)); } if(printingSteps) { cerr << " -> " << theStack[stackIdx].s << endl; } break; @@ -1038,7 +984,7 @@ RTXProcessor::applyRule(const UString& rule) } Chunk * -RTXProcessor::readToken(FILE *in) +RTXProcessor::readToken() { int pos = 0; UString cur; @@ -1047,12 +993,10 @@ RTXProcessor::readToken(FILE *in) UString dest; UString coref; cur.reserve(256); - bool inSquare = false; while(true) { - int val = fgetwc_unlocked(in); - if(feof(in) || (null_flush && val == 0)) - { + UChar32 val = infile.get(); + if (infile.eof() || (null_flush && val == '\0')) { furtherInput = false; Chunk* ret = chunkPool.next(); ret->target = cur; @@ -1062,11 +1006,11 @@ RTXProcessor::readToken(FILE *in) else if(val == '\\') { cur += '\\'; - cur += wchar_t(fgetwc_unlocked(in)); + cur += infile.get(); } else if(val == '[' && !inword) { - val = fgetwc_unlocked(in); + val = infile.get(); if(val == '[') { @@ -1078,30 +1022,8 @@ RTXProcessor::readToken(FILE *in) } else { - cur += '['; - inSquare = true; - - if(val == '\\') - { - cur += '\\'; - cur += static_cast(fgetwc_unlocked(in)); - } - else - { - cur += val; - if(val == ']') - { - inSquare = false; - } - } - } - } - else if(inSquare) - { - cur += val; - if(val == ']') - { - inSquare = false; + infile.unget(val); + cur += infile.readBlock('[', ']'); } } else if(inwblank) @@ -1109,22 +1031,22 @@ RTXProcessor::readToken(FILE *in) if(val == ']') { cur += val; - val = fgetwc_unlocked(in); + val = infile.get(); if(val == '\\') { cur += '\\'; - cur += static_cast(fgetwc_unlocked(in)); + cur += infile.get(); } else if(val == ']') { cur += val; - val = fgetwc_unlocked(in); + val = infile.get(); if(val == '\\') { cur += '\\'; - cur += static_cast(fgetwc_unlocked(in)); + cur += infile.get(); } else if(val == '^') { @@ -1199,7 +1121,7 @@ RTXProcessor::readToken(FILE *in) } else { - cur += wchar_t(val); + cur += val; } } } @@ -1738,7 +1660,7 @@ RTXProcessor::filterParseGraph() } void -RTXProcessor::processGLR(FILE *in, UFILE *out) +RTXProcessor::processGLR(UFILE *out) { int sentenceId = 1; if(printingAll && treePrintMode == TreeModeLatex) @@ -1747,7 +1669,7 @@ RTXProcessor::processGLR(FILE *in, UFILE *out) } while(furtherInput && inputBuffer.size() < 5) { - inputBuffer.push_back(readToken(in)); + inputBuffer.push_back(readToken()); } bool real_printingAll = printingAll; while(true) @@ -1783,7 +1705,7 @@ RTXProcessor::processGLR(FILE *in, UFILE *out) next->output(out); if(furtherInput) { - inputBuffer.push_back(readToken(in)); + inputBuffer.push_back(readToken()); } if(inputBuffer.empty()) { @@ -1834,7 +1756,7 @@ RTXProcessor::processGLR(FILE *in, UFILE *out) } } } - if(furtherInput) inputBuffer.push_back(readToken(in)); + if(furtherInput) inputBuffer.push_back(readToken()); if(filterParseGraph()) { cerr.flush(); @@ -2037,7 +1959,7 @@ RTXProcessor::processTRXLayer(list& t1x, list& t2x) } void -RTXProcessor::processTRX(FILE *in, UFILE *out) +RTXProcessor::processTRX(UFILE *out) { list t1x; list t2x; @@ -2046,7 +1968,7 @@ RTXProcessor::processTRX(FILE *in, UFILE *out) { while(furtherInput && t1x.size() < 2*longestPattern) { - t1x.push_back(readToken(in)); + t1x.push_back(readToken()); } if(furtherInput) { @@ -2123,18 +2045,19 @@ RTXProcessor::process(FILE* in, UFILE* out) cerr << "\\usepackage[cm]{fullpage}" << endl << endl; cerr << "\\begin{document}" << endl << endl; } + infile.wrap(in); if(null_flush) { - while(!feof(in)) + while(!infile.eof()) { furtherInput = true; if(isLinear) { - processTRX(in, out); + processTRX(out); } else { - processGLR(in, out); + processGLR(out); } u_fputc('\0', out); u_fflush(out); @@ -2150,11 +2073,11 @@ RTXProcessor::process(FILE* in, UFILE* out) } else if(isLinear) { - processTRX(in, out); + processTRX(out); } else { - processGLR(in, out); + processGLR(out); } if(printingAll && treePrintMode == TreeModeLatex) { diff --git a/src/rtx_processor.h b/src/rtx_processor.h index 5272a8b..7607e1c 100644 --- a/src/rtx_processor.h +++ b/src/rtx_processor.h @@ -4,12 +4,11 @@ #include #include #include -#include #include #include #include +#include -#include #include #include #include @@ -112,7 +111,7 @@ private: /** * false if EOF or \0 has been reached in the input stream, true otherwise */ - bool furtherInput; + bool furtherInput = true; /** * The stack used by the virtual machine @@ -209,7 +208,7 @@ private: * Branch of parseGraph currently being operated on * Needed by applyRule() for FETCHCHUNK and SETCHUNK */ - ParseNode* currentBranch; + ParseNode* currentBranch = nullptr; ////////// // SETTINGS @@ -217,113 +216,86 @@ private: /** * true if the next input token should be parsed as an LU, false otherwise - * Initial value: false */ - bool inword; + bool inword = false; /** * true if the next input token should be parsed as a wordbound blank, false otherwise - * Initial value: false */ - bool inwblank; + bool inwblank = false; /** * Whether output should flush on \0 - * Default: false */ - bool null_flush; + bool null_flush = false; /** * If true, each instruction of virtual machine will be printed to wcerr - * Default: false */ - bool printingSteps; + bool printingSteps = false; /** * If true, each rule that is applied will be printed to wcerr - * Default: false */ - bool printingRules; + bool printingRules = false; /** * If true, each action of filterParseGraph() will be logged to wcerr - * Default: false */ - bool printingBranches; + bool printingBranches = false; /** * If true, produce a full report, similar to (printingRules && printingBranches) * Affected by treePrintMode - * Default: false */ - bool printingAll; + bool printingAll = false; /** * false if input comes from apertium-anaphora, true otherwise - * Default: true */ - bool noCoref; + bool noCoref = true; /** * true if rule application should mimic the chunker-interchunk-postchunk * pipeline, false otherwise - * Default: false */ - bool isLinear; + bool isLinear = false; /** * If true, parse tree will be printed according to treePrintMode * before output-time rules are applied - * Default: false */ - bool printingTrees; + bool printingTrees = false; /** * If false, output-time rules will not be applied and linear output * will not be produced - * Default: true */ - bool printingText; + bool printingText = true; /** * Manner in which to print trees * Set by setOutputMode() * Enum defined in chunk.h - * Default: TreeModeNest */ - TreeMode treePrintMode; + TreeMode treePrintMode = TreeModeNest; /** * Counter used to give distinct, consistent identifiers to ParseNodes * for tracing purposes */ - int newBranchId; + int newBranchId = 0; /** * If this is set to true, filterParseGraph() will only discard branches * on parse error */ - bool noFilter; + bool noFilter = true; ////////// // VIRTUAL MACHINE ////////// - /** - * Determine capitalization of a string - * @param str - input string - * @return L"AA", L"Aa", or L"aa" - */ - UString caseOf(UString const &str); - - /** - * Produce a version of target_word with the case of source_word - * @param source_word - source of case - * @param target_word - source of content - * @return generated string - */ - UString copycase(UString const &source_word, UString const &target_word); - /** * Return whether str1 begins with str2 */ @@ -411,13 +383,15 @@ private: // RULE SELECTION AND I/O ////////// + InputFile infile; + /** * Read an LU or a blank * Modifies: furtherInput * @param in - input stream * @return pointer to token read */ - Chunk* readToken(FILE *in); + Chunk* readToken(); bool lookahead(ParseNode* node); @@ -451,7 +425,7 @@ private: * Process input as a GLR parser * Read input, call checkForReduce(), call filterParseGraph(), call outputAll() */ - void processGLR(FILE* in, UFILE* out); + void processGLR(UFILE* out); /** * Apply longest rule matching the beginning of t1x and append the result to t2x @@ -462,7 +436,7 @@ private: * Mimic apertium-transfer | apertium-interchunk | apertium-postchunk * Read input, call processTRXLayer twice, apply output-time rules, output */ - void processTRX(FILE* in, UFILE* out); + void processTRX(UFILE* out); /** * True if clipping lem/lemh/whole diff --git a/src/trx_compiler.cc b/src/trx_compiler.cc index 6c753dc..234655d 100644 --- a/src/trx_compiler.cc +++ b/src/trx_compiler.cc @@ -8,10 +8,9 @@ #include #include #include -#include -#include +#include +#include -using namespace Apertium; using namespace std; TRXCompiler::TRXCompiler() @@ -143,7 +142,7 @@ TRXCompiler::getPos(xmlNode* node, bool isBlank = false) die(node, "Position must be an integer."); } } - int ret = stoi(v); + int ret = StringUtils::stoi(v); if(inOutput && ret == 0) { return ret; @@ -188,7 +187,7 @@ TRXCompiler::processCats(xmlNode* node) cur->lemma = getattr(item, "lemma"); UString tags = requireAttr(item, "tags"); if(tags.empty()) tags = "UNKNOWN:INTERNAL"_u; - cur->tags = StringUtils::split_UString(tags, "."_u); + cur->tags = StringUtils::split(tags, "."_u); pat.push_back(cur); } if(patterns.find(pat_name) != patterns.end()) { @@ -272,7 +271,7 @@ TRXCompiler::gatherMacros(xmlNode* node) continue; } UString name = requireAttr(mac, "n"); - int npar = stoi(requireAttr(mac, "npar")); + int npar = StringUtils::stoi(requireAttr(mac, "npar")); if(macros.find(name) != macros.end()) { warn(mac, "Redefinition of macro '%S' - using later definition", name.c_str()); @@ -348,7 +347,7 @@ TRXCompiler::processRules(xmlNode* node) } if(excludedRules.find(id) == excludedRules.end()) { - PB.addRule(inputRules.size() + 1, (weight.size() > 0 ? stod(weight) : 0.0), pls, StringUtils::split_UString(firstChunk, " "_u), id); + PB.addRule(inputRules.size() + 1, (weight.size() > 0 ? StringUtils::stod(weight) : 0.0), pls, StringUtils::split(firstChunk, " "_u), id); } inputRuleSizes.push_back(pls.size()); }