commit 2ea36928f9d450b027f469385477937f47065368 Author: Tanmai Khanna Date: Wed Aug 19 03:06:02 2020 +0530 Parse wordbound blanks as superblanks diff --git a/tools/src/hfst-proc/tokenizer.cc b/tools/src/hfst-proc/tokenizer.cc index 35a8c43d..f3eaee7f 100644 --- a/tools/src/hfst-proc/tokenizer.cc +++ b/tools/src/hfst-proc/tokenizer.cc @@ -184,9 +184,30 @@ TokenIOStream::read_escaped() std::string TokenIOStream::read_delimited(const char delim) { + wcerr << "\n\n##TEST##\n\n"; std::string result; int c = EOF; - + bool is_wblank = false; + + if(is && c != delim) //Check if wblank is being read + { + c = is.get(); + if(c == EOF || c != '[') + break; + + result += c; + if(c == '\\') + result += read_escaped(); + else if(null_flush && c == '\0') + do_null_flush(); + else + { + int next_char = is.peek(); + if(next_char == '[') + is_wblank = true; + } + } + while(is && c != delim) { c = is.get(); @@ -199,6 +220,22 @@ TokenIOStream::read_delimited(const char delim) if(null_flush && c == '\0') do_null_flush(); } + + if(is_wblank) + { + c = is.get(); + if(c == EOF) + break; + + if(c != delim) + { + stream_error(std::string("Error in parsing a wordbound blank")); + } + else + { + result += c; + } + } if(c != delim) stream_error(std::string("Didn't find delimiting character ")+delim); diff --git a/tools/src/hfst-proc/tokenizer.h b/tools/src/hfst-proc/tokenizer.h index 4c5e3a80..b367101c 100644 --- a/tools/src/hfst-proc/tokenizer.h +++ b/tools/src/hfst-proc/tokenizer.h @@ -187,6 +187,7 @@ class TokenIOStream * Read into the the stream until the delimiting character is found. The * delimiting character is read and included in the string. Charater escaping * is handled. Fails on stream error + * If a wordbound blank (wblank, opening with [[) is being read, reading continues until the closing ]] is reached. * @return the string from the stream's current point up to and including * the delimiting character */