commit e71d640e2728d1a10e4f3b6441c1dc8d781a8416 Author: aboelhamd Date: Mon Aug 5 18:03:33 2019 +0200 Fix '#' issue in lemmas. diff --git a/src/CLExec.cpp b/src/CLExec.cpp index ecf1041..eca14f9 100644 --- a/src/CLExec.cpp +++ b/src/CLExec.cpp @@ -243,9 +243,18 @@ void CLExec::beamSearch( string num = "_" + ss.str(); // handle the case of two lemmas separated by a space - for (unsigned t = 0; t < slTokens[x].size(); t++) - if (slTokens[x][t] == ' ') + for (unsigned t = 0; t < slTokens[x].size(); t++) { + // remove '#' and put '_' + if (slTokens[x][t] == '#') + if (t + 1 < slTokens[x].length() && slTokens[x][t] == ' ') + slTokens[x].replace(t--, 1, ""); + else + slTokens[x].replace(t, 1, "_"); + + // remove ' ' and put '_' + else if (slTokens[x][t] == ' ') slTokens[x].replace(t, 1, "_"); + } string word = toLowerCase(slTokens[x], localeId) + num; vector wordWeights = classWeights[word]; diff --git a/src/SklearnHandler.cpp b/src/SklearnHandler.cpp index 3c781bc..be4586e 100644 --- a/src/SklearnHandler.cpp +++ b/src/SklearnHandler.cpp @@ -130,7 +130,7 @@ int main(int argc, char **argv) { while (getline(lextorFile, tokenizedSentence)) { // cout << i++ << endl; - // spaces after each token +// spaces after each token vector spaces; // tokens in the sentence order @@ -205,9 +205,19 @@ int main(int argc, char **argv) { for (unsigned x = ambig->firTokId; x < ambig->firTokId + ambig->maxPat; x++) { - for (unsigned t = 0; t < slTokens[x].size(); t++) - if (slTokens[x][t] == ' ') + for (unsigned t = 0; t < slTokens[x].size(); t++) { + // remove '#' and put '_' + if (slTokens[x][t] == '#') + if (t + 1 < slTokens[x].length() + && slTokens[x][t] == ' ') + slTokens[x].replace(t--, 1, ""); + else + slTokens[x].replace(t, 1, "_"); + + // remove ' ' and put '_' + else if (slTokens[x][t] == ' ') slTokens[x].replace(t, 1, "_"); + } predictDataFile << CLExec::toLowerCase(slTokens[x], localeId) diff --git a/src/YasmetFormatter.cpp b/src/YasmetFormatter.cpp index 9f47601..2d9f038 100644 --- a/src/YasmetFormatter.cpp +++ b/src/YasmetFormatter.cpp @@ -274,9 +274,18 @@ int main(int argc, char **argv) { string word = CLExec::toLowerCase(slTokens[z], localeId); - for (unsigned c = 0; c < word.length(); c++) - if (word[c] == ' ') + for (unsigned c = 0; c < word.length(); c++) { + // remove '#' and put '_' + if (word[c] == '#') + if (c + 1 < word.length() && word[c] == ' ') + word.replace(c--, 1, ""); + else + word.replace(c, 1, "_"); + + // remove ' ' and put '_' + else if (word[c] == ' ') word.replace(c, 1, "_"); + } features += " " + word + "_" + num + ":" + label;