commit 7767f5c1c1466e27b4b01bb9f36e41541fd65882 Author: aboelhamd Date: Sat Jun 1 03:26:37 2019 +0200 Fix some bugs in scripts and add option -n to RulesApplier diff --git a/choose-best-sents.py b/choose-best-sents.py index 28b22c6..e9b93ba 100644 --- a/choose-best-sents.py +++ b/choose-best-sents.py @@ -8,10 +8,7 @@ minwerFile = open(sys.argv[3], 'w+') minperFile = open(sys.argv[4], 'w+') minwerperFile = open(sys.argv[5], 'w+') -sents = [] -wers = [] -pers = [] -werspers = [] +sents, wers, pers, werspers = [], [], [], [] minwer, minper, minwerper, minwerI, minperI, minwerperI = 10000.,10000.,10000.,0,0,0 with open(sys.argv[1]) as scoresFile, open(sys.argv[2]) as combFile: @@ -46,6 +43,7 @@ with open(sys.argv[1]) as scoresFile, open(sys.argv[2]) as combFile: minwerperFile.write(sents[minwerperI]) minwer, minper, minwerper, minwerI, minperI, minwerperI = 10000.,10000.,10000.,0,0,0 + sents, wers, pers, werspers = [], [], [], [] scoresFile.close() diff --git a/rem-bad-sents.py b/rem-bad-sents.py index ad2e263..e7aec1d 100644 --- a/rem-bad-sents.py +++ b/rem-bad-sents.py @@ -1,7 +1,7 @@ import sys if (len(sys.argv) != 4) : - print('\nUsage: python rem-bad-sents.py source-file ambig-target-file new-source-file'); + print('\nUsage: python rem-bad-sents.py source-file ambig-target-file(with new lines) new-source-file'); sys.exit() srcFile = open(sys.argv[1], 'r') @@ -15,10 +15,14 @@ for sent in ambigTarFile: sents.append(sent) else : - src = scrFile.readline() + src = srcFile.readline() + bad = False for sent in sents : - if (line.find("#") == -1 and line.find("@") == -1) : - newSrcFile.write() + if (sent.find("*") > -1 or sent.find("#") > -1 or sent.find("@") > -1) : + bad = True + break + if (not bad) : + newSrcFile.write(src) sents.clear() srcFile.close() diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp index ea36284..692ddfb 100644 --- a/src/RulesApplier.cpp +++ b/src/RulesApplier.cpp @@ -28,12 +28,16 @@ int main(int argc, char **argv) { string localeId, transferFilePath, lextorFilePath, chunkerFilePath, newLextorFilePath; + bool newline = false; int opt; - while ((opt = getopt(argc, argv, ":u:")) != -1) { + while ((opt = getopt(argc, argv, ":u:n")) != -1) { switch (opt) { case 'u': newLextorFilePath = optarg; break; + case 'n': + newline = true; + break; case ':': printf("option %c needs a value\n", optopt); return -1; @@ -71,7 +75,7 @@ int main(int argc, char **argv) { cout << "Error in parameters !" << endl; cout << "Parameters are : localeId transferFilePath" - << " lextorFilePath chunkerFilePath [-u newlextorFilePath]" + << " lextorFilePath chunkerFilePath [-u newlextorFilePath] [-n]" << endl; cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" @@ -89,6 +93,9 @@ int main(int argc, char **argv) { cout << "newlextorFilePath : write the new sentences lextor in this lextor file." << endl; + cout + << "-n : put newline after each sentence ambiguous chunker (to use it removing bad sentences)." + << endl; return -1; } @@ -186,7 +193,8 @@ int main(int argc, char **argv) { for (unsigned j = 0; j < outs.size(); j++) { chunkerFile << outs[j] << endl; } -// interInFile << endl; + if (newline) + chunkerFile << endl; // delete AmbigInfo pointers for (unsigned j = 0; j < ambigInfo.size(); j++) {