commit 4d6b15ad87803e2e2a470bc0d384f6d42be3e20f Author: Tanmai Khanna Date: Wed Aug 19 14:18:45 2020 +0530 Parse wordbound blanks as normal blanks in hfst-proc (#478) Parse wordbound blanks as superblanks diff --git a/tools/src/hfst-proc/tokenizer.cc b/tools/src/hfst-proc/tokenizer.cc index 35a8c43d..255a2e75 100644 --- a/tools/src/hfst-proc/tokenizer.cc +++ b/tools/src/hfst-proc/tokenizer.cc @@ -186,7 +186,27 @@ TokenIOStream::read_delimited(const char delim) { std::string result; int c = EOF; - + bool is_wblank = false; + + if(is && c != delim) + { + c = is.get(); + if(c != EOF) + { + result += c; + if(c == '\\') + result += read_escaped(); + else if(null_flush && c == '\0') + do_null_flush(); + else if(c == '[') + { + int next_char = is.peek(); + if(next_char == '[') //Check if wblank is being read + is_wblank = true; + } + } + } + while(is && c != delim) { c = is.get(); @@ -199,6 +219,22 @@ TokenIOStream::read_delimited(const char delim) if(null_flush && c == '\0') do_null_flush(); } + + if(is_wblank) + { + c = is.get(); + if(c != EOF) + { + if(c != delim) + { + stream_error(std::string("Error in parsing a wordbound blank")); + } + else + { + result += c; + } + } + } if(c != delim) stream_error(std::string("Didn't find delimiting character ")+delim); diff --git a/tools/src/hfst-proc/tokenizer.h b/tools/src/hfst-proc/tokenizer.h index 4c5e3a80..b367101c 100644 --- a/tools/src/hfst-proc/tokenizer.h +++ b/tools/src/hfst-proc/tokenizer.h @@ -187,6 +187,7 @@ class TokenIOStream * Read into the the stream until the delimiting character is found. The * delimiting character is read and included in the string. Charater escaping * is handled. Fails on stream error + * If the input opens with [[ (a wordbound blank), one further delimiter is read, so the result runs up to and including the closing ]]. * @return the string from the stream's current point up to and including * the delimiting character */