commit 30bd1f58f02131379f9f48e3f0a1194c938d546a Author: Eiji Miyamoto Date: Wed Jul 12 10:39:09 2023 +0100 fixing bug as it did not tokenize text after brackets and without brackets diff --git a/tokenize.py b/tokenize.py index f8267b7..9df7e3f 100644 --- a/tokenize.py +++ b/tokenize.py @@ -10,16 +10,23 @@ def process_text(sin, sout): for i in text: buffer += i + if i == text[-1]: + buffer = mecab.parse(buffer.strip()).rstrip() + tokenized += buffer + if i == "[": + buffer = buffer[:-1] buffer = mecab.parse(buffer.strip()).rstrip() tokenized += buffer buffer = "" in_blancket = True - elif in_blancket: + + if in_blancket: tokenized += i - elif i == "]": + + if i == "]": in_blancket = False + buffer = "" sout.write(tokenized) - if __name__ == '__main__': - process_text(sys.stdin, sys.stdout) + process_text(sys.stdin, sys.stdout) \ No newline at end of file