commit 19d6f35133274e5eeb7520f96449d53ecf3bcdd5 Author: Eiji Miyamoto Date: Wed Jul 12 12:29:19 2023 +0100 improve cpp file as it did not tokenize non-bracket text and text after bracket diff --git a/buffer_mecab.cpp b/buffer_mecab.cpp index 42b9672..45c1491 100644 --- a/buffer_mecab.cpp +++ b/buffer_mecab.cpp @@ -9,26 +9,33 @@ void process_text(std::istream& sin, std::ostream& sout) { MeCab::Tagger* tagger = MeCab::createTagger("-Owakati"); std::stringstream buffer; - //std::ostringstream buffer; std::string tokenized; bool in_bracket = false; for (char i : text) { buffer << i; + if (i == text.back()){ + std::string token = tagger->parse(buffer.str().c_str()); + token.erase(token.find_last_not_of(" \n\r\t") + 1); + tokenized += token; + } if (i == '[') { + std::string temp = buffer.str(); + temp.pop_back(); + buffer.str(temp); std::string parsed = tagger->parse(buffer.str().c_str()); parsed.erase(parsed.find_last_not_of(" \n\r\t") + 1); tokenized += parsed; - // buffer = tagger->parse(buffer.str().c_str()); - // tokenized += buffer; buffer.str(""); buffer.clear(); in_bracket = true; } - else if (in_bracket) { + if (in_bracket) { tokenized += i; } - else if (i == ']') { + if (i == ']') { in_bracket = false; + buffer.str(""); + buffer.clear(); } }