commit 082a7299dc6194fe8291f37c6c12e962167d3f5b Author: vivekvardhanadepu Date: Sat Jul 31 20:31:23 2021 +0530 removing additional ^ and $ diff --git a/src/tagger_output_processor.cc b/src/tagger_output_processor.cc index 859aae3..e9e65a4 100644 --- a/src/tagger_output_processor.cc +++ b/src/tagger_output_processor.cc @@ -2,53 +2,71 @@ #include #include -int TaggerOutputProcessor::find(vector xs, UString x) { - for (size_t i = 0; i < xs.size(); ++i) { +int TaggerOutputProcessor::find(vector xs, UString x) +{ + for (size_t i = 0; i < xs.size(); ++i) + { if (xs[i] == x) return i; } return -1; } -TaggerToken TaggerOutputProcessor::parseTaggerToken(UString str) { +TaggerToken TaggerOutputProcessor::parseTaggerToken(UString str) +{ TaggerToken token; int state = 0; // lemma; UString buffer; - for (auto& c : str) { - if(c == '<' && state == 0) { + for (auto &c : str) + { + if (c == '<' && state == 0) + { state = 1; token.lemma = buffer; buffer.clear(); } - if (c == '>') { + if (c == '>') + { token.tags.push_back(buffer); buffer.clear(); - } else if (c != '<') { + } + else if (c != '<') + { buffer += c; } } - if(state == 0) { + if (state == 0) + { token.lemma = buffer; } return token; } -vector TaggerOutputProcessor::parseTags(UString token) { +vector TaggerOutputProcessor::parseTags(UString token) +{ int state = 0; // outside vector tags; UString buffer; - for (auto& c : token) { - if (state == 0) { - if (c == '<') { + for (auto &c : token) + { + if (state == 0) + { + if (c == '<') + { state = 1; } - } else if (state == 1) { - if (c == '>') { + } + else if (state == 1) + { + if (c == '>') + { tags.push_back(buffer); buffer.clear(); state = 0; - } else { + } + else + { buffer += c; } } @@ -56,50 +74,69 @@ vector TaggerOutputProcessor::parseTags(UString token) { return tags; } -vector TaggerOutputProcessor::wsplit(UString wstr, UChar delim) { +vector TaggerOutputProcessor::wsplit(UString wstr, UChar delim) +{ vector tokens; UString buffer; - for(size_t i = 0; i < wstr.size(); ++i) { - if(wstr[i] == delim && (i == 0 || wstr[i-1] != '\\')) { + for (size_t i = 0; i < wstr.size(); ++i) + { + if (wstr[i] == delim && (i == 0 || wstr[i - 1] != '\\')) + { tokens.push_back(buffer); buffer.clear(); - } else { + } + else + { buffer += wstr[i]; } } - if(!buffer.empty()) { + if (!buffer.empty()) + { tokens.push_back(buffer); } return tokens; } -UString TaggerOutputProcessor::getLemma(UString token) { +UString TaggerOutputProcessor::getLemma(UString token) +{ UString buffer; - for (auto& c : token) { - if(c != '<') { + for (auto &c : token) + { + if (c != '<') + { buffer += c; - } else { + } + else + { break; } } return buffer; } -void TaggerOutputProcessor::processTaggerOutput(bool nullFlush) { +void TaggerOutputProcessor::processTaggerOutput(bool nullFlush) +{ vector sentence; UChar32 c; - InputFile in; - while (!in.eof()) { - c = in.get(); + InputFile in; + while (!in.eof()) + { + c = in.get(); - if ((c == '\n') || (nullFlush && c == '\0')) { - processSentence(sentence); - sentence.clear(); - } else if (c == '\\') { - in.get(); - } else if (c == '^') { - sentence.push_back(parseTaggerToken(in.readBlock('^', '$'))); + if ((c == '\n') || (nullFlush && c == '\0')) + { + processSentence(sentence); + sentence.clear(); + } + else if (c == '\\') + { + in.get(); + } + else if (c == '^') + { + UString temp = in.readBlock('^', '$'); + sentence.push_back(parseTaggerToken(temp.substr(1, temp.size() - 2))); } } }