commit fa7a03034b6c019d98d9033a3db935aa10c0e4d6 Author: Tanmai Khanna Date: Fri Aug 7 16:57:40 2020 +0530 wblank parsing diff --git a/src/chunk.h b/src/chunk.h index 1a185e7..8dfa72d 100644 --- a/src/chunk.h +++ b/src/chunk.h @@ -29,6 +29,7 @@ public: wstring source; wstring target; wstring coref; + wstring wblank; bool isBlank; bool isJoiner; vector contents; @@ -39,8 +40,8 @@ public: Chunk(wstring blankContent) : target(blankContent), isBlank(true), isJoiner(false), rule(-1) {} - Chunk(wstring src, wstring dest, wstring cor) - : source(src), target(dest), coref(cor), isBlank(false), isJoiner(false), rule(-1) + Chunk(wstring src, wstring dest, wstring cor, wstring wbl) + : source(src), target(dest), coref(cor), wblank(wbl), isBlank(false), isJoiner(false), rule(-1) {} Chunk(wstring dest, vector& children, int r = -1) : target(dest), isBlank(false), isJoiner(false), contents(children), rule(r) @@ -50,6 +51,7 @@ public: source = other.source; target = other.target; coref = other.coref; + wblank = other.wblank; isBlank = other.isBlank; isJoiner = other.isJoiner; contents = other.contents; @@ -60,6 +62,7 @@ public: source.swap(other.source); target.swap(other.target); coref.swap(other.coref); + wblank.swap(other.wblank); isBlank = other.isBlank; isJoiner = other.isJoiner; contents.swap(other.contents); @@ -70,6 +73,7 @@ public: source.swap(other.source); target.swap(other.target); coref.swap(other.coref); + wblank.swap(other.wblank); isBlank = other.isBlank; isJoiner = other.isJoiner; contents.swap(other.contents); @@ -84,6 +88,7 @@ public: ret->source = source; ret->target = target; ret->coref = coref; + ret->wblank = wblank; ret->contents.reserve(contents.size()); for(unsigned int i = 0, limit = contents.size(); i < limit; i++) { diff --git a/src/rtx_processor.cc b/src/rtx_processor.cc index 4012c87..d784453 100644 --- a/src/rtx_processor.cc +++ b/src/rtx_processor.cc @@ -16,6 +16,7 @@ RTXProcessor::RTXProcessor() { furtherInput = true; inword = false; + inwblank = false; printingSteps = false; printingRules = false; printingBranches = false; @@ -984,6 +985,7 @@ RTXProcessor::readToken(FILE *in) { int pos = 0; wstring cur; + wstring wbl; wstring src; wstring dest; wstring coref; @@ -1007,8 +1009,32 @@ RTXProcessor::readToken(FILE *in) } else if(val == L'[' && !inword) { - cur += L'['; - inSquare = true; + val = fgetwc_unlocked(in); + + if(val == L'[') + { + inwblank = true; + Chunk* ret = chunkPool.next(); + ret->target = cur; + ret->isBlank = true; + return ret; + } + else + { + cur += L'['; + inSquare = true; + + if(val == L'\\') + { + cur += L'\\'; + cur += wchar_t(fgetwc_unlocked(in)); + } + else if(val == L']') + { + cur += val; + inSquare = false; + } + } } else if(inSquare) { @@ -1018,6 +1044,49 @@ RTXProcessor::readToken(FILE *in) inSquare = false; } } + else if(inwblank) + { + if(val == L']') + { + cur += val; + val = fgetwc_unlocked(in); + + if(val == L'\\') + { + cur += L'\\'; + cur += wchar_t(fgetwc_unlocked(in)); + } + else if(val == L']') + { + cur += val; + val = fgetwc_unlocked(in); + + if(val == L'\\') + { + cur += L'\\'; + cur += wchar_t(fgetwc_unlocked(in)); + } + else if(val == L'^') + { + inwblank = false; + wbl.swap(cur); + inword = true; + } + else + { + //ParseError TODO + } + } + else + { + cur += val; + } + } + else + { + cur += val; + } + } else if(inword && (val == L'$' || val == L'/')) { if(pos == 0) @@ -1041,6 +1110,7 @@ RTXProcessor::readToken(FILE *in) { inword = false; Chunk* ret = chunkPool.next(); + ret->wblank = wbl; ret->source = src; ret->target = dest; ret->coref = coref; diff --git a/src/rtx_processor.h b/src/rtx_processor.h index 607e988..8c68783 100644 --- a/src/rtx_processor.h +++ b/src/rtx_processor.h @@ -195,6 +195,12 @@ private: * Initial value: false */ bool inword; + + /** + * true if the next input token should be parsed as a wordbound blank, false otherwise + * Initial value: false + */ + bool inwblank; /** * Whether output should flush on \0