commit 7b48c2ac6b823873cc6ca2f7bcbc11313913fddd Author: Tanmai Khanna Date: Wed Aug 12 19:31:49 2020 +0200 Wordbound blanks (closes #65) diff --git a/src/Makefile.am b/src/Makefile.am index 85e0bd4..7230be5 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -4,7 +4,7 @@ bin_PROGRAMS = rtx-comp rtx-proc rtx-decomp random-path rtx_comp_SOURCES = rtx_comp.cc rtx_compiler.cc trx_compiler.cc pattern.cc apertium_re.cc -rtx_proc_SOURCES = rtx_proc.cc rtx_processor.cc apertium_re.cc +rtx_proc_SOURCES = rtx_proc.cc rtx_processor.cc apertium_re.cc chunk.cc rtx_decomp_SOURCES = rtx_decomp.cc diff --git a/src/chunk.cc b/src/chunk.cc new file mode 100644 index 0000000..135d4db --- /dev/null +++ b/src/chunk.cc @@ -0,0 +1,621 @@ +#include +#include +#include +#include + +#include + +wstring +combineWblanks(wstring wblank_current, wstring wblank_to_add) +{ + if(wblank_current.empty() && wblank_to_add.empty()) + { + return wblank_current; + } + else if(wblank_current.empty()) + { + return wblank_to_add; + } + else if(wblank_to_add.empty()) + { + return wblank_current; + } + + wstring new_out_wblank; + for(wstring::const_iterator it = wblank_current.begin(); it != wblank_current.end(); it++) + { + if(*it == '\\') + { + new_out_wblank += *it; + it++; + new_out_wblank += *it; + } + else if(*it == ']') + { + if(*(it+1) == ']') + { + new_out_wblank += ';'; + break; + } + } + else + { + new_out_wblank += *it; + } + } + + for(wstring::const_iterator it = wblank_to_add.begin(); it != wblank_to_add.end(); it++) + { + if(*it == '\\') + { + new_out_wblank += *it; + it++; + new_out_wblank += *it; + } + else if(*it == '[') + { + if(*(it+1) == '[') + { + new_out_wblank += ' '; + it++; + } + } + else + { + new_out_wblank += *it; + } + } + + return new_out_wblank; +} + +wstring +Chunk::chunkPart(ApertiumRE const &part, const ClipType side) +{ + string chunk; + switch(side) + { + case SourceClip: + chunk = UtfConverter::toUtf8(source); + break; + case TargetClip: + chunk = UtfConverter::toUtf8(target); + break; + case ReferenceClip: + chunk = UtfConverter::toUtf8(coref); + break; + } + string result = part.match(chunk); + if(result.size() == 0) + { + return wstring(L""); + } + else + { + return UtfConverter::fromUtf8(result); + } +} + +void +Chunk::setChunkPart(ApertiumRE const &part, wstring const &value) +{ + string surf = UtfConverter::toUtf8(target); + if(part.match(surf).size() == 0) + { + //target += value; + } + else + { + string val = UtfConverter::toUtf8(value); + part.replace(surf, val); + target = UtfConverter::fromUtf8(surf); + } +} + +vector +Chunk::getTags(const vector& parentTags) +{ + unsigned int last = 0; + vector ret; + for(unsigned int i = 0, limit = target.size(); i < limit; i++) + { + if(target[i] == L'<') + { + last = i; + bool isNum = true; + for(unsigned int j = i+1; j < limit; j++) + { + if(target[j] == L'>') + { + if(isNum) + { + unsigned int n = stoul(target.substr(last+1, j-last-1)); + if(n != 0 && n <= parentTags.size()) + { + ret.push_back(parentTags[n-1]); + last = j+1; + break; + } + } + wstring tag = target.substr(last, j-last+1); + ret.push_back(tag); + last = j+1; + break; + } + if(!isdigit(target[j])) + { + isNum = false; + } + } + } + else if(target[i] == L'\\') + { + i++; + } + } + return ret; +} + +void +Chunk::updateTags(const vector& parentTags) +{ + if(isBlank) return; + unsigned int last = 0; + wstring result; + result.reserve(target.size() + (2*parentTags.size())); + // a rough estimate - works if most number tags are 1 digit and most new tags are 3 chars or less + for(unsigned int i = 0, limit = target.size(); i < limit; i++) + { + if(target[i] == L'<') + { + result += target.substr(last, i-last); + last = i; + bool isNum = true; + for(unsigned int j = i+1; j < limit; j++) + { + if(target[j] == L'>') + { + if(isNum) + { + unsigned int n = stoul(target.substr(last+1, j-last-1)); + if(n != 0 && n <= parentTags.size()) + { + result += parentTags[n-1]; + } + } + else + { + result += target.substr(last, j-last+1); + } + last = j+1; + break; + } + if(!isdigit(target[j])) + { + isNum = false; + } + } + } + else if(target[i] == L'\\') + { + i++; + } + } + if(last != target.size()-1) + { + result += target.substr(last); + } + target = result; +} + +void +Chunk::output(const vector& parentTags, FILE* out = NULL) +{ + if(contents.size() > 0) + { + vector tags = getTags(parentTags); + for(unsigned int i = 0; i < contents.size(); i++) + { + contents[i]->output(tags, out); + } + } + else if(isBlank) + { + if(out == NULL) + { + cout << UtfConverter::toUtf8(target); + } + else + { + fputs_unlocked(UtfConverter::toUtf8(target).c_str(), out); + } + } + else + { + updateTags(parentTags); + if(target.size() == 0) + { + } + else if(out == NULL) + { + cout << UtfConverter::toUtf8(wblank); + cout << "^"; + cout << UtfConverter::toUtf8(target); + cout << "$"; + } + else + { + fputs_unlocked(UtfConverter::toUtf8(wblank).c_str(), out); + fputc_unlocked('^', out); + fputs_unlocked(UtfConverter::toUtf8(target).c_str(), out); + fputc_unlocked('$', out); + } + } +} + +void +Chunk::output(FILE* out) +{ + vector tags; + output(tags, out); +} + +wstring +Chunk::matchSurface() +{ + if(contents.size() == 0) + { + return source; + } + return target; +} + +void +Chunk::appendChild(Chunk* kid) +{ + contents.push_back(kid); +} + +void +Chunk::conjoin(Chunk* other) +{ + unsigned int lemq_loc = 0; + for(; lemq_loc < target.size(); lemq_loc++) + { + if(target[lemq_loc] == L'\\') + { + lemq_loc++; + continue; + } + else if(target[lemq_loc] == L'#') + { + break; + } + } + target.insert(lemq_loc, L"+" + other->target); + wblank = combineWblanks(other->wblank, wblank); +} + +void +Chunk::writeTree(TreeMode mode, FILE* out) +{ + switch(mode) + { + case TreeModeFlat: writeTreePlain(out, -1); break; + case TreeModeNest: writeTreePlain(out, 0); break; + case TreeModeLatex: + if(isBlank) return; + writeString(L"\\begin{forest}\n%where n children=0{tier=word}{}\n", out); + writeString(L"% Uncomment the preceding line to make the LUs bottom-aligned.\n", out); + writeTreeLatex(out); + writeString(L"\n\\end{forest}\n", out); + break; + case TreeModeDot: + if(isBlank) return; + writeString(L"digraph {", out); + writeTreeDot(out); + writeString(L"}\n", out); + break; + case TreeModeBox: + { + if(isBlank) return; + vector> tree = writeTreeBox(); + if(tree.size() == 0) return; + unsigned int tr = 4, sl = 12, st = 11, tl = 12, tt = 11, rl = 0, rt = 0; + for(unsigned int i = 0; i < tree.size(); i++) + { + if(tree[i][0].size() > tr) tr = tree[i][0].size(); + if(tree[i][1].size() > sl) sl = tree[i][1].size(); + if(tree[i][2].size() > st) st = tree[i][2].size(); + if(tree[i][3].size() > tl) tl = tree[i][3].size(); + if(tree[i][4].size() > tt) tt = tree[i][4].size(); + if(tree[i][5].size() > rl) rl = tree[i][5].size(); + if(tree[i][6].size() > rt) rt = tree[i][6].size(); + } + bool doCoref = (rl > 0 || rt > 0); + if(doCoref && rl < 17) rl = 17; + if(doCoref && rt < 16) rt = 16; + writeString(L"Tree" + wstring(tr-3, L' '), out); + writeString(L"Source Lemma" + wstring(sl - 11, L' '), out); + writeString(L"Source Tags" + wstring(st - 10, L' '), out); + writeString(L"Target Lemma" + wstring(tl - 11, L' '), out); + writeString(L"Target Tags" + wstring(tt - 10, L' '), out); + if(doCoref) + { + writeString(L"Coreference Lemma" + wstring(rl - 16, L' '), out); + writeString(L"Coreference Tags", out); + if(rt > 16) writeString(wstring(rt - 16, L' '), out); + } + writeString(L"\n", out); + writeString(wstring(tr, L'─') + L" ", out); + writeString(wstring(sl, L'─') + L" ", out); + writeString(wstring(st, L'─') + L" ", out); + writeString(wstring(tl, L'─') + L" ", out); + writeString(wstring(tt, L'─'), out); + if(doCoref) writeString(L" " + wstring(rl, L'─'), out); + if(doCoref) writeString(L" " + wstring(rt, L'─'), out); + writeString(L"\n", out); + for(unsigned int i = 0; i < tree.size(); i++) + { + writeString(wstring(tr - tree[i][0].size(), L' ') + tree[i][0] + L" ", out); + writeString(tree[i][1] + wstring(sl - tree[i][1].size() + 1, L' '), out); + writeString(tree[i][2] + wstring(st - tree[i][2].size() + 1, L' '), out); + writeString(tree[i][3] + wstring(tl - tree[i][3].size() + 1, L' '), out); + writeString(tree[i][4] + wstring(tt - tree[i][4].size(), L' '), out); + if(doCoref) + { + writeString(L" " + tree[i][5] + wstring(rl - tree[i][5].size(), L' '), out); + writeString(L" " + tree[i][6], out); + } + writeString(L"\n", out); + } + writeString(L"\n", out); + } + break; + default: + wcerr << L"That tree mode has not yet been implemented." << endl; + } +} + +pair +Chunk::chopString(wstring s) +{ + wstring lem; + wstring tags; + for(unsigned int i = 0; i < s.size(); i++) + { + if(s[i] == L'<') + { + lem = s.substr(0, i); + tags = s.substr(i+1, s.size()-i-2); + break; + } + } + if(lem.size() == 0 && tags.size() == 0 && s.size() > 0) + { + lem = s; + } + return make_pair(lem, StringUtils::substitute(tags, L"><", L".")); +} + +void +Chunk::writeString(wstring s, FILE* out) +{ + if(out == NULL) wcerr << s; + else fputs_unlocked(UtfConverter::toUtf8(s).c_str(), out); +} + +void +Chunk::writeTreePlain(FILE* out, int depth) +{ + if(depth >= 0 && isBlank) return; + wstring base; + for(int i = 0; i < depth; i++) + { + base += L'\t'; + } + if(!isBlank) + { + if(wblank.size() > 0) + { + base += wblank; + } + base += L"^"; + } + if(source.size() > 0) + { + base += source + L"/"; + } + base += target; + if(coref.size() > 0) + { + base += L"/" + coref; + } + writeString(base, out); + if(contents.size() > 0) + { + writeString((depth == -1) ? L"{" : L"{\n", out); + int newdepth = (depth == -1) ? -1 : depth + 1; + for(unsigned int i = 0; i < contents.size(); i++) + { + contents[i]->writeTreePlain(out, newdepth); + } + for(int i = 0; i < depth; i++) + { + writeString(L"\t", out); + } + writeString(L"}", out); + } + if(!isBlank) writeString(L"$", out); + if(depth != -1) writeString(L"\n", out); +} + +void +Chunk::writeTreeLatex(FILE* out) +{ + if(isBlank) return; + wstring nl = L" \\\\ "; + wstring base; + pair p; + if(source.size() > 0) + { + p = chopString(source); + base += L"\\textbf{" + p.first + L"}" + nl + L"\\texttt{" + p.second + L"}" + nl; + } + p = chopString(target); + if(contents.size() == 0) + { + base += L"\\textit{" + p.first + L"}" + nl + L"\\texttt{" + p.second + L"}"; + } + else + { + unsigned int i = 0; + for(; i < p.second.size(); i++) + { + if(p.second[i] == L'.') break; + } + if(i < p.second.size()) + { + base += p.second.substr(0, i) + nl + L"\\textit{" + p.first + L"}"; + base += nl + L"\\texttt{" + p.second.substr(i+1) + L"}"; + } + else + { + base += p.second + nl + L"\\textit{" + p.first + L"}"; + } + } + if(coref.size() > 0) + { + p = chopString(coref); + base += nl + L"\\textit{" + p.first + L"}" + nl + L"\\texttt{" + p.second + L"}"; + } + base = L"[{ \\begin{tabular}{c} " + base + L" \\end{tabular} } "; + base = StringUtils::substitute(base, L"_", L"\\_"); + writeString(base, out); + for(unsigned int i = 0; i < contents.size(); i++) contents[i]->writeTreeLatex(out); + writeString(L" ]", out); +} + +wstring +Chunk::writeTreeDot(FILE* out) +{ + if(isBlank) return L""; + static int nodeId = 0; + nodeId++; + wstring name = L"n" + to_wstring(nodeId); + wstring node = name + L" \\[label=\""; + if(source.size() > 0) + { + node += source + L"\\\\n"; + } + node += target; + if(coref.size() > 0) + { + node += L"\\\\n" + coref; + } + node += L"\"\\];"; + writeString(node, out); + for(unsigned int i = 0; i < contents.size(); i++) + { + wstring kid = contents[i]->writeTreeDot(out); + if(kid.size() > 0) writeString(name + L" -> " + kid + L";", out); + } + return name; +} + +vector> +Chunk::writeTreeBox() +{ + if(contents.size() == 0) + { + vector ret; + ret.resize(7); + pair p = chopString(source); + ret[1] = p.first; ret[2] = p.second; + p = chopString(target); + ret[3] = p.first; ret[4] = p.second; + p = chopString(coref); + ret[5] = p.first; ret[6] = p.second; + return vector>(1, ret); + } + else + { + vector> bounds; + vector> tree; + for(unsigned int i = 0; i < contents.size(); i++) + { + if(!contents[i]->isBlank) + { + vector> temp = contents[i]->writeTreeBox(); + tree.insert(tree.end(), temp.begin(), temp.end()); + if(temp.size() == 1) + { + bounds.push_back(make_pair(tree.size() -1, tree.size() - 1)); + continue; + } + int first = -1, last = -1; + for(unsigned int j = tree.size() - temp.size(); j < tree.size(); j++) + { + if(first == -1 && tree[j][0][0] != L' ') first = j; + else if(first != -1 && last == -1 && tree[j][0][0] == L' ') last = j-1; + } + first = (first == -1) ? tree.size() - temp.size() : first; + last = (last == -1) ? tree.size() - 1 : last; + bounds.push_back(make_pair((unsigned int)first, (unsigned int)last)); + } + } + if(tree.size() == 1) + { + tree[0][0] = L"─" + tree[0][0]; + return tree; + } + unsigned int center = tree.size() / 2; + unsigned int len = 0; + for(unsigned int i = 0; i < tree.size(); i++) + { + if(tree[i][0].size() > len) len = tree[i][0].size(); + } + set lines; + for(unsigned int i = 0; i < bounds.size(); i++) + { + if(bounds[i].second < center) lines.insert(bounds[i].second); + else if(bounds[i].first > center) lines.insert(bounds[i].first); + else lines.insert(center); + } + unsigned int firstLine = *lines.begin(); + unsigned int lastLine = *lines.rbegin(); + for(unsigned int i = 0; i < tree.size(); i++) + { + unsigned int sz = tree[i][0].size(); + if(lines.count(i) == 0) + { + tree[i][0] = wstring(len - sz, L' ') + tree[i][0]; + } + else + { + if(sz > 0) + { + switch(tree[i][0][0]) + { + case L'│': tree[i][0][0] = L'┤'; break; + case L'├': tree[i][0][0] = L'┼'; break; + case L'┌': tree[i][0][0] = L'┬'; break; + case L'└': tree[i][0][0] = L'┴'; break; + default: break; + } + } + tree[i][0] = wstring(len - sz, L'─') + tree[i][0]; + } + if(i < firstLine || i > lastLine) tree[i][0] = L' ' + tree[i][0]; + else if(i == firstLine && i == lastLine) tree[i][0] = L'─' + tree[i][0]; + else if(i == firstLine) tree[i][0] = L'┌' + tree[i][0]; + else if(i > firstLine && i < lastLine) + { + if(lines.count(i) == 0) tree[i][0] = L'│' + tree[i][0]; + else tree[i][0] = L'├' + tree[i][0]; + } + else if(i == lastLine) tree[i][0] = L'└' + tree[i][0]; + } + return tree; + } +} diff --git a/src/chunk.h b/src/chunk.h index 1a185e7..214dc8d 100644 --- a/src/chunk.h +++ b/src/chunk.h @@ -2,11 +2,13 @@ #define __RTXCHUNK__ #include -#include -#include #include #include +#include +#include +#include + enum ClipType { SourceClip, @@ -29,18 +31,20 @@ public: wstring source; wstring target; wstring coref; + wstring wblank; bool isBlank; bool isJoiner; vector contents; int rule; + Chunk() : isBlank(false), isJoiner(false), rule(-1) {} Chunk(wstring blankContent) : target(blankContent), isBlank(true), isJoiner(false), rule(-1) {} - Chunk(wstring src, wstring dest, wstring cor) - : source(src), target(dest), coref(cor), isBlank(false), isJoiner(false), rule(-1) + Chunk(wstring src, wstring dest, wstring cor, wstring wbl) + : source(src), target(dest), coref(cor), wblank(wbl), isBlank(false), isJoiner(false), rule(-1) {} Chunk(wstring dest, vector& children, int r = -1) : target(dest), isBlank(false), isJoiner(false), contents(children), rule(r) @@ -50,6 +54,7 @@ public: source = other.source; target = other.target; coref = other.coref; + wblank = other.wblank; isBlank = other.isBlank; isJoiner = other.isJoiner; contents = other.contents; @@ -60,6 +65,7 @@ public: source.swap(other.source); target.swap(other.target); coref.swap(other.coref); + wblank.swap(other.wblank); isBlank = other.isBlank; isJoiner = other.isJoiner; contents.swap(other.contents); @@ -70,6 +76,7 @@ public: source.swap(other.source); target.swap(other.target); coref.swap(other.coref); + wblank.swap(other.wblank); isBlank = other.isBlank; isJoiner = other.isJoiner; contents.swap(other.contents); @@ -84,6 +91,7 @@ public: ret->source = source; ret->target = target; ret->coref = coref; + ret->wblank = wblank; ret->contents.reserve(contents.size()); for(unsigned int i = 0, limit = contents.size(); i < limit; i++) { @@ -91,516 +99,30 @@ public: } return ret; } - wstring chunkPart(ApertiumRE const &part, const ClipType side) - { - string chunk; - switch(side) - { - case SourceClip: - chunk = UtfConverter::toUtf8(source); - break; - case TargetClip: - chunk = UtfConverter::toUtf8(target); - break; - case ReferenceClip: - chunk = UtfConverter::toUtf8(coref); - break; - } - string result = part.match(chunk); - if(result.size() == 0) - { - return wstring(L""); - } - else - { - return UtfConverter::fromUtf8(result); - } - } - void setChunkPart(ApertiumRE const &part, wstring const &value) - { - string surf = UtfConverter::toUtf8(target); - if(part.match(surf).size() == 0) - { - //target += value; - } - else - { - string val = UtfConverter::toUtf8(value); - part.replace(surf, val); - target = UtfConverter::fromUtf8(surf); - } - } - vector getTags(const vector& parentTags) - { - unsigned int last = 0; - vector ret; - for(unsigned int i = 0, limit = target.size(); i < limit; i++) - { - if(target[i] == L'<') - { - last = i; - bool isNum = true; - for(unsigned int j = i+1; j < limit; j++) - { - if(target[j] == L'>') - { - if(isNum) - { - unsigned int n = stoul(target.substr(last+1, j-last-1)); - if(n != 0 && n <= parentTags.size()) - { - ret.push_back(parentTags[n-1]); - last = j+1; - break; - } - } - wstring tag = target.substr(last, j-last+1); - ret.push_back(tag); - last = j+1; - break; - } - if(!isdigit(target[j])) - { - isNum = false; - } - } - } - else if(target[i] == L'\\') - { - i++; - } - } - return ret; - } - void updateTags(const vector& parentTags) - { - if(isBlank) return; - unsigned int last = 0; - wstring result; - result.reserve(target.size() + (2*parentTags.size())); - // a rough estimate - works if most number tags are 1 digit and most new tags are 3 chars or less - for(unsigned int i = 0, limit = target.size(); i < limit; i++) - { - if(target[i] == L'<') - { - result += target.substr(last, i-last); - last = i; - bool isNum = true; - for(unsigned int j = i+1; j < limit; j++) - { - if(target[j] == L'>') - { - if(isNum) - { - unsigned int n = stoul(target.substr(last+1, j-last-1)); - if(n != 0 && n <= parentTags.size()) - { - result += parentTags[n-1]; - } - } - else - { - result += target.substr(last, j-last+1); - } - last = j+1; - break; - } - if(!isdigit(target[j])) - { - isNum = false; - } - } - } - else if(target[i] == L'\\') - { - i++; - } - } - if(last != target.size()-1) - { - result += target.substr(last); - } - target = result; - } - void output(const vector& parentTags, FILE* out = NULL) - { - if(contents.size() > 0) - { - vector tags = getTags(parentTags); - for(unsigned int i = 0; i < contents.size(); i++) - { - contents[i]->output(tags, out); - } - } - else if(isBlank) - { - if(out == NULL) - { - cout << UtfConverter::toUtf8(target); - } - else - { - fputs_unlocked(UtfConverter::toUtf8(target).c_str(), out); - } - } - else - { - updateTags(parentTags); - if(target.size() == 0) - { - } - else if(out == NULL) - { - cout << "^"; - cout << UtfConverter::toUtf8(target); - cout << "$"; - } - else - { - fputc_unlocked('^', out); - fputs_unlocked(UtfConverter::toUtf8(target).c_str(), out); - fputc_unlocked('$', out); - } - } - } - void output(FILE* out) - { - vector tags; - output(tags, out); - } - wstring matchSurface() - { - if(contents.size() == 0) - { - return source; - } - return target; - } - void appendChild(Chunk* kid) - { - contents.push_back(kid); - } - void conjoin(Chunk* other) - { - unsigned int lemq_loc = 0; - for(; lemq_loc < target.size(); lemq_loc++) - { - if(target[lemq_loc] == L'\\') - { - lemq_loc++; - continue; - } - else if(target[lemq_loc] == L'#') - { - break; - } - } - target.insert(lemq_loc, L"+" + other->target); - } - void writeTree(TreeMode mode, FILE* out) - { - switch(mode) - { - case TreeModeFlat: writeTreePlain(out, -1); break; - case TreeModeNest: writeTreePlain(out, 0); break; - case TreeModeLatex: - if(isBlank) return; - writeString(L"\\begin{forest}\n%where n children=0{tier=word}{}\n", out); - writeString(L"% Uncomment the preceding line to make the LUs bottom-aligned.\n", out); - writeTreeLatex(out); - writeString(L"\n\\end{forest}\n", out); - break; - case TreeModeDot: - if(isBlank) return; - writeString(L"digraph {", out); - writeTreeDot(out); - writeString(L"}\n", out); - break; - case TreeModeBox: - { - if(isBlank) return; - vector> tree = writeTreeBox(); - if(tree.size() == 0) return; - unsigned int tr = 4, sl = 12, st = 11, tl = 12, tt = 11, rl = 0, rt = 0; - for(unsigned int i = 0; i < tree.size(); i++) - { - if(tree[i][0].size() > tr) tr = tree[i][0].size(); - if(tree[i][1].size() > sl) sl = tree[i][1].size(); - if(tree[i][2].size() > st) st = tree[i][2].size(); - if(tree[i][3].size() > tl) tl = tree[i][3].size(); - if(tree[i][4].size() > tt) tt = tree[i][4].size(); - if(tree[i][5].size() > rl) rl = tree[i][5].size(); - if(tree[i][6].size() > rt) rt = tree[i][6].size(); - } - bool doCoref = (rl > 0 || rt > 0); - if(doCoref && rl < 17) rl = 17; - if(doCoref && rt < 16) rt = 16; - writeString(L"Tree" + wstring(tr-3, L' '), out); - writeString(L"Source Lemma" + wstring(sl - 11, L' '), out); - writeString(L"Source Tags" + wstring(st - 10, L' '), out); - writeString(L"Target Lemma" + wstring(tl - 11, L' '), out); - writeString(L"Target Tags" + wstring(tt - 10, L' '), out); - if(doCoref) - { - writeString(L"Coreference Lemma" + wstring(rl - 16, L' '), out); - writeString(L"Coreference Tags", out); - if(rt > 16) writeString(wstring(rt - 16, L' '), out); - } - writeString(L"\n", out); - writeString(wstring(tr, L'─') + L" ", out); - writeString(wstring(sl, L'─') + L" ", out); - writeString(wstring(st, L'─') + L" ", out); - writeString(wstring(tl, L'─') + L" ", out); - writeString(wstring(tt, L'─'), out); - if(doCoref) writeString(L" " + wstring(rl, L'─'), out); - if(doCoref) writeString(L" " + wstring(rt, L'─'), out); - writeString(L"\n", out); - for(unsigned int i = 0; i < tree.size(); i++) - { - writeString(wstring(tr - tree[i][0].size(), L' ') + tree[i][0] + L" ", out); - writeString(tree[i][1] + wstring(sl - tree[i][1].size() + 1, L' '), out); - writeString(tree[i][2] + wstring(st - tree[i][2].size() + 1, L' '), out); - writeString(tree[i][3] + wstring(tl - tree[i][3].size() + 1, L' '), out); - writeString(tree[i][4] + wstring(tt - tree[i][4].size(), L' '), out); - if(doCoref) - { - writeString(L" " + tree[i][5] + wstring(rl - tree[i][5].size(), L' '), out); - writeString(L" " + tree[i][6], out); - } - writeString(L"\n", out); - } - writeString(L"\n", out); - } - break; - default: - wcerr << L"That tree mode has not yet been implemented." << endl; - } - } + + wstring chunkPart(ApertiumRE const &part, const ClipType side); + void setChunkPart(ApertiumRE const &part, wstring const &value); + vector getTags(const vector& parentTags); + void updateTags(const vector& parentTags); + void output(const vector& parentTags, FILE* out); + void output(FILE* out); + wstring matchSurface(); + void appendChild(Chunk* kid); + void conjoin(Chunk* other); + void writeTree(TreeMode mode, FILE* out); + private: - static pair chopString(wstring s) - { - wstring lem; - wstring tags; - for(unsigned int i = 0; i < s.size(); i++) - { - if(s[i] == L'<') - { - lem = s.substr(0, i); - tags = s.substr(i+1, s.size()-i-2); - break; - } - } - if(lem.size() == 0 && tags.size() == 0 && s.size() > 0) - { - lem = s; - } - return make_pair(lem, StringUtils::substitute(tags, L"><", L".")); - } - static void writeString(wstring s, FILE* out) - { - if(out == NULL) wcerr << s; - else fputs_unlocked(UtfConverter::toUtf8(s).c_str(), out); - } - void writeTreePlain(FILE* out, int depth) - { - if(depth >= 0 && isBlank) return; - wstring base; - for(int i = 0; i < depth; i++) - { - base += L'\t'; - } - if(!isBlank) base += L"^"; - if(source.size() > 0) - { - base += source + L"/"; - } - base += target; - if(coref.size() > 0) - { - base += L"/" + coref; - } - writeString(base, out); - if(contents.size() > 0) - { - writeString((depth == -1) ? L"{" : L"{\n", out); - int newdepth = (depth == -1) ? -1 : depth + 1; - for(unsigned int i = 0; i < contents.size(); i++) - { - contents[i]->writeTreePlain(out, newdepth); - } - for(int i = 0; i < depth; i++) - { - writeString(L"\t", out); - } - writeString(L"}", out); - } - if(!isBlank) writeString(L"$", out); - if(depth != -1) writeString(L"\n", out); - } - void writeTreeLatex(FILE* out) - { - if(isBlank) return; - wstring nl = L" \\\\ "; - wstring base; - pair p; - if(source.size() > 0) - { - p = chopString(source); - base += L"\\textbf{" + p.first + L"}" + nl + L"\\texttt{" + p.second + L"}" + nl; - } - p = chopString(target); - if(contents.size() == 0) - { - base += L"\\textit{" + p.first + L"}" + nl + L"\\texttt{" + p.second + L"}"; - } - else - { - unsigned int i = 0; - for(; i < p.second.size(); i++) - { - if(p.second[i] == L'.') break; - } - if(i < p.second.size()) - { - base += p.second.substr(0, i) + nl + L"\\textit{" + p.first + L"}"; - base += nl + L"\\texttt{" + p.second.substr(i+1) + L"}"; - } - else - { - base += p.second + nl + L"\\textit{" + p.first + L"}"; - } - } - if(coref.size() > 0) - { - p = chopString(coref); - base += nl + L"\\textit{" + p.first + L"}" + nl + L"\\texttt{" + p.second + L"}"; - } - base = L"[{ \\begin{tabular}{c} " + base + L" \\end{tabular} } "; - base = StringUtils::substitute(base, L"_", L"\\_"); - writeString(base, out); - for(unsigned int i = 0; i < contents.size(); i++) contents[i]->writeTreeLatex(out); - writeString(L" ]", out); - } - wstring writeTreeDot(FILE* out) - { - if(isBlank) return L""; - static int nodeId = 0; - nodeId++; - wstring name = L"n" + to_wstring(nodeId); - wstring node = name + L" \\[label=\""; - if(source.size() > 0) - { - node += source + L"\\\\n"; - } - node += target; - if(coref.size() > 0) - { - node += L"\\\\n" + coref; - } - node += L"\"\\];"; - writeString(node, out); - for(unsigned int i = 0; i < contents.size(); i++) - { - wstring kid = contents[i]->writeTreeDot(out); - if(kid.size() > 0) writeString(name + L" -> " + kid + L";", out); - } - return name; - } - vector> writeTreeBox() - { - if(contents.size() == 0) - { - vector ret; - ret.resize(7); - pair p = chopString(source); - ret[1] = p.first; ret[2] = p.second; - p = chopString(target); - ret[3] = p.first; ret[4] = p.second; - p = chopString(coref); - ret[5] = p.first; ret[6] = p.second; - return vector>(1, ret); - } - else - { - vector> bounds; - vector> tree; - for(unsigned int i = 0; i < contents.size(); i++) - { - if(!contents[i]->isBlank) - { - vector> temp = contents[i]->writeTreeBox(); - tree.insert(tree.end(), temp.begin(), temp.end()); - if(temp.size() == 1) - { - bounds.push_back(make_pair(tree.size() -1, tree.size() - 1)); - continue; - } - int first = -1, last = -1; - for(unsigned int j = tree.size() - temp.size(); j < tree.size(); j++) - { - if(first == -1 && tree[j][0][0] != L' ') first = j; - else if(first != -1 && last == -1 && tree[j][0][0] == L' ') last = j-1; - } - first = (first == -1) ? tree.size() - temp.size() : first; - last = (last == -1) ? tree.size() - 1 : last; - bounds.push_back(make_pair((unsigned int)first, (unsigned int)last)); - } - } - if(tree.size() == 1) - { - tree[0][0] = L"─" + tree[0][0]; - return tree; - } - unsigned int center = tree.size() / 2; - unsigned int len = 0; - for(unsigned int i = 0; i < tree.size(); i++) - { - if(tree[i][0].size() > len) len = tree[i][0].size(); - } - set lines; - for(unsigned int i = 0; i < bounds.size(); i++) - { - if(bounds[i].second < center) lines.insert(bounds[i].second); - else if(bounds[i].first > center) lines.insert(bounds[i].first); - else lines.insert(center); - } - unsigned int firstLine = *lines.begin(); - unsigned int lastLine = *lines.rbegin(); - for(unsigned int i = 0; i < tree.size(); i++) - { - unsigned int sz = tree[i][0].size(); - if(lines.count(i) == 0) - { - tree[i][0] = wstring(len - sz, L' ') + tree[i][0]; - } - else - { - if(sz > 0) - { - switch(tree[i][0][0]) - { - case L'│': tree[i][0][0] = L'┤'; break; - case L'├': tree[i][0][0] = L'┼'; break; - case L'┌': tree[i][0][0] = L'┬'; break; - case L'└': tree[i][0][0] = L'┴'; break; - default: break; - } - } - tree[i][0] = wstring(len - sz, L'─') + tree[i][0]; - } - if(i < firstLine || i > lastLine) tree[i][0] = L' ' + tree[i][0]; - else if(i == firstLine && i == lastLine) tree[i][0] = L'─' + tree[i][0]; - else if(i == firstLine) tree[i][0] = L'┌' + tree[i][0]; - else if(i > firstLine && i < lastLine) - { - if(lines.count(i) == 0) tree[i][0] = L'│' + tree[i][0]; - else tree[i][0] = L'├' + tree[i][0]; - } - else if(i == lastLine) tree[i][0] = L'└' + tree[i][0]; - } - return tree; - } - } + static pair chopString(wstring s); + static void writeString(wstring s, FILE* out); + void writeTreePlain(FILE* out, int depth); + void writeTreeLatex(FILE* out); + wstring writeTreeDot(FILE* out); + vector> writeTreeBox(); }; +/** + * Combines two wordbound blanks and returns it +*/ +wstring combineWblanks(wstring wblank_current, wstring wblank_to_add); + #endif diff --git a/src/matcher.h b/src/matcher.h index 499aabb..da69cab 100644 --- a/src/matcher.h +++ b/src/matcher.h @@ -358,6 +358,7 @@ public: int lastWord; int id; map stringVars; + map wblankVars; vector chunkVars; ParseNode() : first(0), last(0), firstWord(0), lastWord(0), id(-1) @@ -393,6 +394,7 @@ public: mx = prevNode->mx; length = prev->length+1; stringVars = prev->stringVars; + wblankVars = prev->wblankVars; chunkVars = prev->chunkVars; weight = (w == 0) ? prev->weight : w; if(next->isBlank) @@ -418,6 +420,7 @@ public: firstWord = prev->lastWord+1; lastWord = firstWord; stringVars = prev->stringVars; + wblankVars = prev->wblankVars; chunkVars = prev->chunkVars; if(next->isBlank) { @@ -446,6 +449,7 @@ public: firstWord = other->firstWord; lastWord = other->lastWord; stringVars = other->stringVars; + wblankVars = other->wblankVars; chunkVars = other->chunkVars; } void getChunks(list& chls, int count) diff --git a/src/rtx_processor.cc b/src/rtx_processor.cc index 4012c87..6f9f12f 100644 --- a/src/rtx_processor.cc +++ b/src/rtx_processor.cc @@ -16,6 +16,7 @@ RTXProcessor::RTXProcessor() { furtherInput = true; inword = false; + inwblank = false; printingSteps = false; printingRules = false; printingBranches = false; @@ -295,15 +296,19 @@ RTXProcessor::stackCopy(int src, int dest) { case 0: theStack[dest].b = theStack[src].b; + theWblankStack[dest] = theWblankStack[src]; break; case 1: theStack[dest].i = theStack[src].i; + theWblankStack[dest] = theWblankStack[src]; break; case 2: theStack[dest].s = theStack[src].s; + theWblankStack[dest] = theWblankStack[src]; break; case 3: theStack[dest].c = theStack[src].c; + theWblankStack[dest] = theWblankStack[src]; break; default: wcerr << "Unknown StackElement mode " << theStack[src].mode; @@ -311,6 +316,12 @@ RTXProcessor::stackCopy(int src, int dest) } } +bool +RTXProcessor::gettingLemmaFromWord(wstring attr) +{ + return (attr.compare(L"lem") == 0 || attr.compare(L"lemh") == 0 || attr.compare(L"whole") == 0); +} + bool RTXProcessor::applyRule(const wstring& rule) { @@ -576,6 +587,8 @@ RTXProcessor::applyRule(const wstring& rule) wstring var = popString(); wstring val = popString(); currentBranch->stringVars[var] = val; + currentBranch->wblankVars[var] = theWblankStack[stackIdx+1]; + theWblankStack[stackIdx+1].clear(); if(printingSteps) { wcerr << " -> " << var << " = '" << val << "'" << endl; } } break; @@ -599,6 +612,8 @@ RTXProcessor::applyRule(const wstring& rule) Chunk* temp = chunkPool.next(); temp->isBlank = false; temp->target = ch->target.substr(last, c-last); + temp->wblank = out_wblank; + out_wblank.clear(); if(chunk) currentOutput.back()->contents.push_back(temp); else currentOutput.push_back(temp); last = c+1; @@ -633,7 +648,9 @@ RTXProcessor::applyRule(const wstring& rule) } else { + ch->wblank = out_wblank; currentOutput.push_back(ch); + out_wblank.clear(); } } break; @@ -679,7 +696,17 @@ RTXProcessor::applyRule(const wstring& rule) popString(part); Chunk* ch = popChunk(); if(ch == NULL) pushStack(L""); - else pushStack(ch->chunkPart(attr_items[part], SourceClip)); + else + { + if(gettingLemmaFromWord(part)) + { + pushStack(ch->chunkPart(attr_items[part], SourceClip), ch->wblank); + } + else + { + pushStack(ch->chunkPart(attr_items[part], SourceClip)); + } + } if(printingSteps) { wcerr << " -> " << theStack[stackIdx].s << endl; } } break; @@ -690,7 +717,17 @@ RTXProcessor::applyRule(const wstring& rule) popString(part); Chunk* ch = popChunk(); if(ch == NULL) pushStack(L""); - else pushStack(ch->chunkPart(attr_items[part], TargetClip)); + else + { + if(gettingLemmaFromWord(part)) + { + pushStack(ch->chunkPart(attr_items[part], TargetClip), ch->wblank); + } + else + { + pushStack(ch->chunkPart(attr_items[part], TargetClip)); + } + } if(printingSteps) { wcerr << " -> " << theStack[stackIdx].s << endl; } } break; @@ -731,7 +768,8 @@ RTXProcessor::applyRule(const wstring& rule) { wstring name = popString(); wstring val = currentBranch->stringVars[name]; - pushStack(val); + wstring wblank_val = currentBranch->wblankVars[name]; + pushStack(val, wblank_val); if(printingSteps) { wcerr << " -> " << name << " = " << val << endl; } } break; @@ -794,6 +832,8 @@ RTXProcessor::applyRule(const wstring& rule) Chunk* ch = chunkPool.next(); ch->isBlank = false; ch->target = kid->target.substr(1, j-1); + ch->wblank = out_wblank; + out_wblank.clear(); theStack[stackIdx].c->contents.push_back(ch); ch = chunkPool.next(); ch->isBlank = true; @@ -802,6 +842,8 @@ RTXProcessor::applyRule(const wstring& rule) } else { + kid->wblank = out_wblank; + out_wblank.clear(); theStack[stackIdx].c->contents.push_back(kid); } if(printingSteps) { wcerr << " -> child with surface '" << kid->target << L"' appended" << endl; } @@ -824,10 +866,13 @@ RTXProcessor::applyRule(const wstring& rule) if(theStack[stackIdx+1].mode == 2) { theStack[stackIdx].c->target += theStack[stackIdx+1].s; + out_wblank = combineWblanks(out_wblank, theWblankStack[stackIdx+1]); + theWblankStack[stackIdx+1].clear(); } else { theStack[stackIdx].c->target += theStack[stackIdx+1].c->target; + theStack[stackIdx].c->wblank += theStack[stackIdx+1].c->wblank; } if(printingSteps) { wcerr << " -> " << theStack[stackIdx+1].s << endl; } } @@ -849,10 +894,13 @@ RTXProcessor::applyRule(const wstring& rule) if(theStack[stackIdx+1].mode == 2) { theStack[stackIdx].c->source += theStack[stackIdx+1].s; + out_wblank = combineWblanks(out_wblank, theWblankStack[stackIdx+1]); + theWblankStack[stackIdx+1].clear(); } else { theStack[stackIdx].c->source += theStack[stackIdx+1].c->source; + theStack[stackIdx].c->wblank += theStack[stackIdx+1].c->wblank; } if(printingSteps) { wcerr << " -> " << theStack[stackIdx+1].s << endl; } } @@ -984,6 +1032,7 @@ RTXProcessor::readToken(FILE *in) { int pos = 0; wstring cur; + wstring wbl; wstring src; wstring dest; wstring coref; @@ -1007,8 +1056,32 @@ RTXProcessor::readToken(FILE *in) } else if(val == L'[' && !inword) { - cur += L'['; - inSquare = true; + val = fgetwc_unlocked(in); + + if(val == L'[') + { + inwblank = true; + Chunk* ret = chunkPool.next(); + ret->target = cur; + ret->isBlank = true; + return ret; + } + else + { + cur += L'['; + inSquare = true; + + if(val == L'\\') + { + cur += L'\\'; + cur += static_cast(fgetwc_unlocked(in)); + } + else if(val == L']') + { + cur += val; + inSquare = false; + } + } } else if(inSquare) { @@ -1018,6 +1091,51 @@ RTXProcessor::readToken(FILE *in) inSquare = false; } } + else if(inwblank) + { + if(val == L']') + { + cur += val; + val = fgetwc_unlocked(in); + + if(val == L'\\') + { + cur += L'\\'; + cur += static_cast(fgetwc_unlocked(in)); + } + else if(val == L']') + { + cur += val; + val = fgetwc_unlocked(in); + + if(val == L'\\') + { + cur += L'\\'; + cur += static_cast(fgetwc_unlocked(in)); + } + else if(val == L'^') + { + inwblank = false; + cur = L"[[" + cur; + wbl.swap(cur); + inword = true; + } + else + { + wcerr << L"Parse Error: Wordbound blank should be immediately followed by a Lexical Unit -> [[..]]^..$" << endl; + exit(EXIT_FAILURE); + } + } + else + { + cur += val; + } + } + else + { + cur += val; + } + } else if(inword && (val == L'$' || val == L'/')) { if(pos == 0) @@ -1041,6 +1159,7 @@ RTXProcessor::readToken(FILE *in) { inword = false; Chunk* ret = chunkPool.next(); + ret->wblank = wbl; ret->source = src; ret->target = dest; ret->coref = coref; @@ -1212,6 +1331,7 @@ RTXProcessor::checkForReduce(vector& result, ParseNode* node) cur->init(back, currentOutput[0], weight); } cur->stringVars = node->stringVars; + cur->wblankVars = node->wblankVars; cur->chunkVars = node->chunkVars; cur->id = node->id; if(temp.size() == 0) @@ -1231,6 +1351,7 @@ RTXProcessor::checkForReduce(vector& result, ParseNode* node) cur = parsePool.next(); cur->init(*it, temp.back()); cur->stringVars = (*it)->stringVars; + cur->wblankVars = (*it)->wblankVars; cur->chunkVars = (*it)->chunkVars; cur->firstWord = first; cur->lastWord = last; @@ -1607,6 +1728,7 @@ RTXProcessor::processGLR(FILE *in, FILE *out) temp->init(mx, next); temp->id = ++newBranchId; temp->stringVars = variables; + temp->wblankVars = wblank_variables; temp->chunkVars = vector(varCount, NULL); checkForReduce(parseGraph, temp); } @@ -1621,6 +1743,7 @@ RTXProcessor::processGLR(FILE *in, FILE *out) tempNode->init(parseGraph[i], next, true); tempNode->id = parseGraph[i]->id; tempNode->stringVars = parseGraph[i]->stringVars; + tempNode->wblankVars = parseGraph[i]->wblankVars; tempNode->chunkVars = parseGraph[i]->chunkVars; checkForReduce(temp, tempNode); } @@ -1676,7 +1799,9 @@ RTXProcessor::processGLR(FILE *in, FILE *out) parseGraph.clear(); outputAll(out); variables = currentBranch->stringVars; + wblank_variables = currentBranch->wblankVars; fflush(out); + vector wblanks; vector sources; vector targets; vector corefs; @@ -1695,6 +1820,7 @@ RTXProcessor::processGLR(FILE *in, FILE *out) { unknowns.push_back(false); } + wblanks.push_back(temp->wblank); sources.push_back(temp->source); targets.push_back(temp->target); corefs.push_back(temp->coref); @@ -1714,6 +1840,7 @@ RTXProcessor::processGLR(FILE *in, FILE *out) for(int i = 0; i < N; i++) { Chunk* c = chunkPool.next(); + c->wblank = wblanks[i]; c->source = sources[i]; c->target = targets[i]; c->coref = corefs[i]; diff --git a/src/rtx_processor.h b/src/rtx_processor.h index 607e988..d425abf 100644 --- a/src/rtx_processor.h +++ b/src/rtx_processor.h @@ -56,6 +56,12 @@ private: * name => value */ map variables; + + /** + * Virtual machine global variables to wblank map + * name => value + */ + map wblank_variables; /** * Lists @@ -122,6 +128,17 @@ private: * Index of the top element on theStack */ int stackIdx; + + /** + * A parallel stack to store wordbound blanks that mimics the operations + * of the main stack. wblanks are added everytime lemmas are clipped + */ + wstring theWblankStack[32]; + + /** + * wordbound blank to be output + */ + wstring out_wblank; /** * Input to the virtual machine @@ -195,6 +212,12 @@ private: * Initial value: false */ bool inword; + + /** + * true if the next input token should be parsed as a wordbound blank, false otherwise + * Initial value: false + */ + bool inwblank; /** * Whether output should flush on \0 @@ -350,21 +373,25 @@ private: { theStack[++stackIdx].mode = 0; theStack[stackIdx].b = b; + theWblankStack[stackIdx].clear(); } inline void pushStack(int i) { theStack[++stackIdx].mode = 1; theStack[stackIdx].i = i; + theWblankStack[stackIdx].clear(); } - inline void pushStack(const wstring& s) + inline void pushStack(const wstring& s, wstring wbl = L"") { theStack[++stackIdx].mode = 2; theStack[stackIdx].s.assign(s); + theWblankStack[stackIdx] = wbl; } inline void pushStack(Chunk* c) { theStack[++stackIdx].mode = 3; theStack[stackIdx].c = c; + theWblankStack[stackIdx].clear(); } /** @@ -424,6 +451,11 @@ private: */ void processTRX(FILE* in, FILE* out); + /** + * True if clipping lem/lemh/whole + */ + bool gettingLemmaFromWord(wstring attr); + public: RTXProcessor(); ~RTXProcessor(); diff --git a/tests/WblankBasic.input b/tests/WblankBasic.input new file mode 100644 index 0000000..1f606f9 --- /dev/null +++ b/tests/WblankBasic.input @@ -0,0 +1,2 @@ +[[t:b:123avx]]^llama/llama$ [[t:b:324avx]]^llama/llama$ [[t:b:6345fx]]^red/rojo$ [[t:b:874jfn]]^pajama/pajama$ [[t:b:47567hb]]^llama/llama$ +[[t:b:07vndsd]]^green/verde$ [[t:b:362354]]^sock/calcetine$ diff --git a/tests/WblankBasic.output b/tests/WblankBasic.output new file mode 100644 index 0000000..694607e --- /dev/null +++ b/tests/WblankBasic.output @@ -0,0 +1,2 @@ +[[t:b:123avx]]^llama$ [[t:b:324avx]]^llama$ [[t:b:874jfn]]^pajama$ [[t:b:6345fx]]^rojo$ [[t:b:47567hb]]^llama$ +[[t:b:362354]]^calcetine$ [[t:b:07vndsd]]^verde$ diff --git a/tests/WblankBasic.rtx b/tests/WblankBasic.rtx new file mode 100644 index 0000000..a5afd2e --- /dev/null +++ b/tests/WblankBasic.rtx @@ -0,0 +1,10 @@ +n: _.gender.number; +adj: _.gender.number; +NP: _; + +gender = m f mf; +number = sg pl sp; + +NP -> n {1[number=pl]} | + adj n n {2[number=sp] _1 1[number=pl, gender=f] _2 3[number=sg]} | + adj n {2[number=sp] _1 1[number=sp, gender=m]}; diff --git a/tests/WblankConjoinedLUs.input b/tests/WblankConjoinedLUs.input new file mode 100644 index 0000000..31e0d46 --- /dev/null +++ b/tests/WblankConjoinedLUs.input @@ -0,0 +1,6 @@ +[[t:i:123zxs]]^the/el$ [[t:b:po12ds3]]^green/verde$ [[t:s:213vda]]^dragon/dragón$[[t:x:asd123]]^./.$ +^the/el$ [[t:b:4520dw]]^green/verde$ [[t:x:12asd3; t:b:asm293]]^dragon/dragón# con queso$[[t:i:129cas]]^./.$ +[[t:i:123zxs]]^the/el$ [[t:b:po12ds3; t:x:290cas]]^green/verde$ [[t:s:213vda]]^dragon/dragón$[[t:x:asd123]]^./.$ +^the/el$ ^green/verde$ [[t:x:12asd3; t:b:asm293]]^dragon/dragón# con queso$[[t:i:129cas]]^./.$ +^the/el$ [[t:b:po12ds3; t:y:12vas03]]^green/verde$ ^dragon/dragón$^./.$ +^the/el$ [[t:b:4520dw; t:o:830wic]]^green/verde$ [[t:x:12asd3; t:b:asm293]]^dragon/dragón# con queso$[[t:i:129cas]]^./.$ diff --git a/tests/WblankConjoinedLUs.output b/tests/WblankConjoinedLUs.output new file mode 100644 index 0000000..f2d3a95 --- /dev/null +++ b/tests/WblankConjoinedLUs.output @@ -0,0 +1,6 @@ +[[t:i:123zxs]]^el$ [[t:b:po12ds3; t:s:213vda]]^dragón+verde$[[t:x:asd123]]^.$ +^el$ [[t:b:4520dw; t:x:12asd3; t:b:asm293]]^dragón+verde# con queso$[[t:i:129cas]]^.$ +[[t:i:123zxs]]^el$ [[t:b:po12ds3; t:x:290cas; t:s:213vda]]^dragón+verde$[[t:x:asd123]]^.$ +^el$ [[t:x:12asd3; t:b:asm293]]^dragón+verde# con queso$[[t:i:129cas]]^.$ +^el$ [[t:b:po12ds3; t:y:12vas03]]^dragón+verde$^.$ +^el$ [[t:b:4520dw; t:o:830wic; t:x:12asd3; t:b:asm293]]^dragón+verde# con queso$[[t:i:129cas]]^.$ diff --git a/tests/WblankConjoinedLUs.rtx b/tests/WblankConjoinedLUs.rtx new file mode 100644 index 0000000..014dfe9 --- /dev/null +++ b/tests/WblankConjoinedLUs.rtx @@ -0,0 +1,8 @@ +n: _.gender.number; +adj: _.gender.number; +NP: _; + +gender = m f mf; +number = sg pl sp; + +NP -> adj n {2 + 1}; diff --git a/tests/WblankEmpty.input b/tests/WblankEmpty.input new file mode 100644 index 0000000..5907d52 --- /dev/null +++ b/tests/WblankEmpty.input @@ -0,0 +1,2 @@ +^llama/llama$ [[t:b:324avx]]^llama/llama$ ^red/rojo$ [[t:b:874jfn]]^pajama/pajama$ [[t:b:47567hb]]^llama/llama$ +[[t:b:07vndsd]]^green/verde$ ^sock/calcetine$ diff --git a/tests/WblankEmpty.output b/tests/WblankEmpty.output new file mode 100644 index 0000000..c6d1013 --- /dev/null +++ b/tests/WblankEmpty.output @@ -0,0 +1,2 @@ +^llama$ [[t:b:324avx]]^llama$ [[t:b:874jfn]]^pajama$ ^rojo$ [[t:b:47567hb]]^llama$ +^calcetine$ [[t:b:07vndsd]]^verde$ diff --git a/tests/WblankEmpty.rtx b/tests/WblankEmpty.rtx new file mode 100644 index 0000000..4c5635a --- /dev/null +++ b/tests/WblankEmpty.rtx @@ -0,0 +1,11 @@ +n: _.gender.number; +adj: _.gender.number; +NP: _; + +gender = m f mf; +number = sg pl sp; + +NP -> n {1[number=pl]} | + adj n n {2[number=sp] _1 1[number=pl, gender=f] _2 3[number=sg]} | + adj n {2[number=sp] _1 1[number=sp, gender=m]}; + diff --git a/tests/WblankVariable.input b/tests/WblankVariable.input new file mode 100644 index 0000000..47c6057 --- /dev/null +++ b/tests/WblankVariable.input @@ -0,0 +1,3 @@ +[[t:b:123abc]]^the/el$ [[t:i:asfkl3]]^green/verde$ [[t:s:098sjd]]^dragon/dragón$ [[t:b:po234f]]^sleep/duerme$[[t:b:8snx23]]^./.$ +[[t:i:xyzkm2]]^the/el$ [[t:n:124fcd]]^green/verde$ [[t:b:832dax]]^dragon/dragón$ [[t:s:24nda1]]^sleep/duerme$[[t:i:512rw9]]^./.$ + diff --git a/tests/WblankVariable.output b/tests/WblankVariable.output new file mode 100644 index 0000000..9e5c65e --- /dev/null +++ b/tests/WblankVariable.output @@ -0,0 +1,3 @@ +[[t:b:123abc]]^el$ [[t:s:098sjd]]^dragón$ [[t:i:asfkl3]]^verde$ [[t:b:po234f]]^duerme$[[t:b:8snx23]]^.$ +[[t:i:xyzkm2]]^el$ [[t:b:832dax]]^dragón$ [[t:n:124fcd]]^verde$ [[t:s:24nda1]]^duerme$[[t:i:512rw9]]^.$ + diff --git a/tests/WblankVariable.rtx b/tests/WblankVariable.rtx new file mode 100644 index 0000000..5729ffb --- /dev/null +++ b/tests/WblankVariable.rtx @@ -0,0 +1,12 @@ +n: _.gender.number; +adj: _.gender.number; +vblex: _.number; +NP: _; +V: _; + +gender = m f mf; +number = sg pl sp; + +NP -> adj %n [$%number=2.number/tl,$%lemvar=1.lemh/tl] { 2 _1 1[lemh=$%lemvar,number=2.number/sl] } ; + +V -> %vblex [$%number=(if ($%number = "") sg else $%number),$%lemvar=1.lemh/tl] { 1[lemh=$%lemvar,number=$%number] } ; diff --git a/tests/WblankVariableBasic.input b/tests/WblankVariableBasic.input new file mode 100644 index 0000000..47c6057 --- /dev/null +++ b/tests/WblankVariableBasic.input @@ -0,0 +1,3 @@ +[[t:b:123abc]]^the/el$ [[t:i:asfkl3]]^green/verde$ [[t:s:098sjd]]^dragon/dragón$ [[t:b:po234f]]^sleep/duerme$[[t:b:8snx23]]^./.$ +[[t:i:xyzkm2]]^the/el$ [[t:n:124fcd]]^green/verde$ [[t:b:832dax]]^dragon/dragón$ [[t:s:24nda1]]^sleep/duerme$[[t:i:512rw9]]^./.$ + diff --git a/tests/WblankVariableBasic.output b/tests/WblankVariableBasic.output new file mode 100644 index 0000000..9e5c65e --- /dev/null +++ b/tests/WblankVariableBasic.output @@ -0,0 +1,3 @@ +[[t:b:123abc]]^el$ [[t:s:098sjd]]^dragón$ [[t:i:asfkl3]]^verde$ [[t:b:po234f]]^duerme$[[t:b:8snx23]]^.$ +[[t:i:xyzkm2]]^el$ [[t:b:832dax]]^dragón$ [[t:n:124fcd]]^verde$ [[t:s:24nda1]]^duerme$[[t:i:512rw9]]^.$ + diff --git a/tests/WblankVariableBasic.rtx b/tests/WblankVariableBasic.rtx new file mode 100644 index 0000000..c3d0e9a --- /dev/null +++ b/tests/WblankVariableBasic.rtx @@ -0,0 +1,13 @@ +n: _.gender.number; +adj: _.gender.number; +NP: _; +V: _; +vblex: _.number; + +gender = m f mf; +number = sg pl sp; + +NP -> adj n [$%number=2.number] {2 _1 1[number=2.number]}; + +V -> vblex { 1[number=(if ($%number not = "") $%number else sg)] } ; + diff --git a/tests/WblankVariableTRX.input b/tests/WblankVariableTRX.input new file mode 100644 index 0000000..785faaf --- /dev/null +++ b/tests/WblankVariableTRX.input @@ -0,0 +1,2 @@ +[[t:b:123abc]]^the/el$ [[t:i:asfkl3]]^green/verde$ [[t:s:098sjd]]^dragon/dragón$ [[t:b:po234f]]^sleep/duerme$[[t:b:8snx23]]^./.$ +[[t:i:xyzkm2]]^the/el$ [[t:n:124fcd]]^green/verde$ [[t:b:832dax]]^dragon/dragón$ [[t:s:24nda1]]^sleep/duerme$[[t:i:512rw9]]^./.$ diff --git a/tests/WblankVariableTRX.output b/tests/WblankVariableTRX.output new file mode 100644 index 0000000..775c35c --- /dev/null +++ b/tests/WblankVariableTRX.output @@ -0,0 +1,2 @@ +[[t:b:123abc]]^el$ [[t:s:098sjd; t:s:098sjd]]^dragón$ [[t:i:asfkl3]]^verde$ [[t:b:po234f; t:b:po234f]]^duerme$[[t:b:8snx23]]^.$ +[[t:i:xyzkm2]]^el$ [[t:b:832dax; t:b:832dax]]^dragón$ [[t:n:124fcd]]^verde$ [[t:s:24nda1; t:s:24nda1]]^duerme$[[t:i:512rw9]]^.$ diff --git a/tests/WblankVariableTRX.trx b/tests/WblankVariableTRX.trx new file mode 100644 index 0000000..e9089ab --- /dev/null +++ b/tests/WblankVariableTRX.trx @@ -0,0 +1,95 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/WblankVariableTRXBasic.input b/tests/WblankVariableTRXBasic.input new file mode 100644 index 0000000..785faaf --- /dev/null +++ b/tests/WblankVariableTRXBasic.input @@ -0,0 +1,2 @@ +[[t:b:123abc]]^the/el$ [[t:i:asfkl3]]^green/verde$ [[t:s:098sjd]]^dragon/dragón$ [[t:b:po234f]]^sleep/duerme$[[t:b:8snx23]]^./.$ +[[t:i:xyzkm2]]^the/el$ [[t:n:124fcd]]^green/verde$ [[t:b:832dax]]^dragon/dragón$ [[t:s:24nda1]]^sleep/duerme$[[t:i:512rw9]]^./.$ diff --git a/tests/WblankVariableTRXBasic.output b/tests/WblankVariableTRXBasic.output new file mode 100644 index 0000000..775c35c --- /dev/null +++ b/tests/WblankVariableTRXBasic.output @@ -0,0 +1,2 @@ +[[t:b:123abc]]^el$ [[t:s:098sjd; t:s:098sjd]]^dragón$ [[t:i:asfkl3]]^verde$ [[t:b:po234f; t:b:po234f]]^duerme$[[t:b:8snx23]]^.$ +[[t:i:xyzkm2]]^el$ [[t:b:832dax; t:b:832dax]]^dragón$ [[t:n:124fcd]]^verde$ [[t:s:24nda1; t:s:24nda1]]^duerme$[[t:i:512rw9]]^.$ diff --git a/tests/WblankVariableTRXBasic.trx b/tests/WblankVariableTRXBasic.trx new file mode 100644 index 0000000..e18ffe8 --- /dev/null +++ b/tests/WblankVariableTRXBasic.trx @@ -0,0 +1,92 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +