commit a9ee9289b69ad4bb8a971fb1d5471331a266eacb Author: aboelhamd Date: Mon May 13 23:48:45 2019 +0200 Some modifications and evaluation results diff --git a/choose-best-sents.py b/choose-best-sents.py index 7dac6ab..28b22c6 100644 --- a/choose-best-sents.py +++ b/choose-best-sents.py @@ -16,10 +16,10 @@ minwer, minper, minwerper, minwerI, minperI, minwerperI = 10000.,10000.,10000.,0 with open(sys.argv[1]) as scoresFile, open(sys.argv[2]) as combFile: for scores, sent in zip(scoresFile, combFile): - print(scores.strip()) + #print(scores.strip()) if (scores.strip()) : sents.append(sent) - scoresArr = list(map(int, scores.split())) + scoresArr = list(map(float, scores.split())) wer = scoresArr[0] per = scoresArr[1] werper = wer+per @@ -41,9 +41,9 @@ with open(sys.argv[1]) as scoresFile, open(sys.argv[2]) as combFile: werspers.append(werper) else : - minwerFile.write(sents[minwerI]+"\n") - minperFile.write(sents[minperI]+"\n") - minwerperFile.write(sents[minwerperI]+"\n") + minwerFile.write(sents[minwerI]) + minperFile.write(sents[minperI]) + minwerperFile.write(sents[minwerperI]) minwer, minper, minwerper, minwerI, minperI, minwerperI = 10000.,10000.,10000.,0,0,0 diff --git a/spa-eng evaluation/results-ambig-average.txt b/spa-eng evaluation/results-ambig-average.txt new file mode 100644 index 0000000..27167b7 --- /dev/null +++ b/spa-eng evaluation/results-ambig-average.txt @@ -0,0 +1,14 @@ +Statistics about input files +------------------------------------------------------- +Number of words in reference: 20914420 +Number of words in test: 23314483 +Number of unknown words (marked with a star) in test: 839003 +Percentage of unknown words: 3.599 % + +Results when removing unknown-word marks (stars) +------------------------------------------------------- +Edit distance: 17038189 +Word error rate (WER): 81.466 % +Number of position-independent correct words: 10077193 +Position-independent word error rate (PER): 63.293 % + diff --git a/spa-eng evaluation/results-apertium.txt b/spa-eng evaluation/results-apertium.txt new file mode 100644 index 0000000..f68d93e --- /dev/null +++ b/spa-eng evaluation/results-apertium.txt @@ -0,0 +1,14 @@ +Statistics about input files +------------------------------------------------------- +Number of words in reference: 487335 +Number of words in test: 545016 +Number of unknown words (marked with a star) in test: 19246 +Percentage of unknown words: 3.531 % + +Results when removing unknown-word marks (stars) +------------------------------------------------------- +Edit distance: 395016 +Word error rate (WER): 81.056 % +Number of position-independent correct words: 235838 +Position-independent word error rate (PER): 63.443 % + diff --git a/spa-eng evaluation/results-beam.txt b/spa-eng evaluation/results-beam.txt new file mode 100644 index 0000000..6cd9e90 --- /dev/null +++ b/spa-eng evaluation/results-beam.txt @@ -0,0 +1,14 @@ +Statistics about input files +------------------------------------------------------- +Number of words in reference: 487335 +Number of words in test: 501391 +Number of unknown words (marked with a star) in test: 19246 +Percentage of unknown words: 3.839 % + +Results when removing unknown-word marks (stars) +------------------------------------------------------- +Edit distance: 380845 +Word error rate (WER): 78.149 % +Number of position-independent correct words: 224934 +Position-independent word error rate (PER): 56.728 % + diff --git a/spa-eng evaluation/results-lm.txt b/spa-eng evaluation/results-lm.txt new file mode 100644 index 0000000..690a61a --- /dev/null +++ b/spa-eng evaluation/results-lm.txt @@ -0,0 +1,14 @@ +Statistics about input files +------------------------------------------------------- +Number of words in reference: 487335 +Number of words in test: 530546 +Number of unknown words (marked with a star) in test: 19246 +Percentage of unknown words: 3.628 % + +Results when removing unknown-word marks (stars) +------------------------------------------------------- +Edit distance: 389040 +Word error rate (WER): 79.830 % +Number of position-independent correct words: 232384 +Position-independent word error rate (PER): 61.182 % + diff --git a/spa-eng evaluation/results-per.txt b/spa-eng evaluation/results-per.txt new file mode 100644 index 0000000..326f21f --- /dev/null +++ b/spa-eng evaluation/results-per.txt @@ -0,0 +1,14 @@ +Statistics about input files +------------------------------------------------------- +Number of words in reference: 487335 +Number of words in test: 530466 +Number of unknown words (marked with a star) in test: 19246 +Percentage of unknown words: 3.628 % + +Results when removing unknown-word marks (stars) +------------------------------------------------------- +Edit distance: 389950 +Word error rate (WER): 80.017 % +Number of position-independent correct words: 235288 +Position-independent word error rate (PER): 60.570 % + diff --git a/spa-eng evaluation/results-wer.txt b/spa-eng evaluation/results-wer.txt new file mode 100644 index 0000000..87f24e7 --- /dev/null +++ b/spa-eng evaluation/results-wer.txt @@ -0,0 +1,14 @@ +Statistics about input files +------------------------------------------------------- +Number of words in reference: 487335 +Number of words in test: 530224 +Number of unknown words (marked with a star) in test: 19246 +Percentage of unknown words: 3.630 % + +Results when removing unknown-word marks (stars) +------------------------------------------------------- +Edit distance: 382290 +Word error rate (WER): 78.445 % +Number of position-independent correct words: 233448 +Position-independent word error rate (PER): 60.898 % + diff --git a/spa-eng evaluation/results-werper.txt b/spa-eng evaluation/results-werper.txt new file mode 100644 index 0000000..b3b30b1 --- /dev/null +++ b/spa-eng evaluation/results-werper.txt @@ -0,0 +1,14 @@ +Statistics about input files +------------------------------------------------------- +Number of words in reference: 487335 +Number of words in test: 529447 +Number of unknown words (marked with a star) in test: 19246 +Percentage of unknown words: 3.635 % + +Results when removing unknown-word marks (stars) +------------------------------------------------------- +Edit distance: 383016 +Word error rate (WER): 78.594 % +Number of position-independent correct words: 234864 +Position-independent word error rate (PER): 60.448 % + diff --git a/src/BeamSearch.cpp b/src/BeamSearch.cpp index 8a396c6..90c3f56 100644 --- a/src/BeamSearch.cpp +++ b/src/BeamSearch.cpp @@ -70,7 +70,7 @@ main (int argc, char **argv) cout << "interInFilePath : Output file of this program which is the input for apertium interchunk." << endl; - cout << "modelsDest : Yasmet models destination." << endl; + cout << "modelsDest : Yasmet models merged file destination." << endl; cout << "beamSize : The size of beam in beam search algorithm." << endl; return -1; } @@ -101,10 +101,11 @@ main (int argc, char **argv) stringstream buffer (k); buffer >> beam; +// unsigned i = 0; string tokenizedSentence; while (getline (lextorFile, tokenizedSentence)) { - // cout << i << endl; +// cout << i << endl; // spaces after each token vector spaces; diff --git a/src/BestLangMod.cpp b/src/BestLangMod.cpp index 47ecb90..ea43705 100644 --- a/src/BestLangMod.cpp +++ b/src/BestLangMod.cpp @@ -120,8 +120,8 @@ main (int argc, char **argv) map vars = RuleParser::getVars (transfer); map > lists = RuleParser::getLists (transfer); - string tokenizedSentence; // unsigned i = 0; + string tokenizedSentence; while (getline (lextorFile, tokenizedSentence)) { // cout << i << endl; @@ -186,6 +186,8 @@ main (int argc, char **argv) float weight = strtof (line.c_str (), NULL); normWeights.push_back (weight); } + // beware of the newline + getline (weightFile, line); // read transfer vector normTransfers; @@ -194,6 +196,8 @@ main (int argc, char **argv) getline (transferOutFile, line); normTransfers.push_back (line); } + // beware of the newline + getline (transferOutFile, line); // remove redundant outputs vector outs; diff --git a/src/CombAlign.cpp b/src/CombAlign.cpp index 1aaa27d..60790eb 100644 --- a/src/CombAlign.cpp +++ b/src/CombAlign.cpp @@ -79,13 +79,13 @@ main (int argc, char **argv) cout << "referenceFilePath : Reference parallel target translation file path." << endl; cout << "newRefFilePath : New aligned reference file path." << endl; -// return -1; + return -1; } ifstream lextorFile (lextorFilePath.c_str ()); ofstream chunkerFile (chunkerFilePath.c_str ()); - ifstream referenceFile (referenceFilePath); - ofstream newRefFile (newRefFilePath); + ifstream referenceFile (referenceFilePath.c_str ()); + ofstream newRefFile (newRefFilePath.c_str ()); if (lextorFile.is_open () && chunkerFile.is_open () && referenceFile.is_open () && newRefFile.is_open ()) { @@ -106,11 +106,11 @@ main (int argc, char **argv) map vars = RuleParser::getVars (transfer); map > lists = RuleParser::getLists (transfer); - unsigned i = 0; +// unsigned i = 0; string tokenizedSentence, refSent; while (getline (lextorFile, tokenizedSentence) && getline (referenceFile, refSent)) { - cout << i++ << endl; +// cout << i++ << endl; // spaces after each token vector spaces; @@ -200,7 +200,7 @@ main (int argc, char **argv) chunkerFile.close (); referenceFile.close (); newRefFile.close (); - cout << "CombAlign finished!"; +// cout << "CombAlign finished!"; } else { diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp index 8285a19..5ee3c75 100644 --- a/src/RulesApplier.cpp +++ b/src/RulesApplier.cpp @@ -75,15 +75,6 @@ main (int argc, char **argv) ifstream lextorFile (lextorFilePath.c_str ()); ofstream interInFile (interInFilePath.c_str ()); - ifstream refFile ( - string ("/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test.txt").c_str ()); - ofstream refInFile ( - string ("/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test-mul.txt").c_str ()); - ifstream errFile ( - string ( - "/home/aboelhamd/Downloads/apertium-eval-translator-master/ambig_results.txt").c_str ()); - ofstream bestInFile ( - string ("/home/aboelhamd/eclipse-workspace/machinetranslation/best-chunker.txt").c_str ()); if (lextorFile.is_open () && interInFile.is_open ()) { // load transfer file in an xml document object @@ -103,11 +94,11 @@ main (int argc, char **argv) map vars = RuleParser::getVars (transfer); map > lists = RuleParser::getLists (transfer); - unsigned i = 0; - string tokenizedSentence, refSent; - while (getline (lextorFile, tokenizedSentence) && getline (refFile, refSent)) +// unsigned i = 0; + string tokenizedSentence; + while (getline (lextorFile, tokenizedSentence)) { - cout << i++ << endl; +// cout << i++ << endl; // spaces after each token vector spaces; @@ -159,109 +150,12 @@ main (int argc, char **argv) RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, spaces); -// for (unsigned j = 0; j < tlTokens.size (); j++) -// { -// cout << tlTokens[j] << endl; -// vector > rulees = tokenRules[j]; -// for (unsigned k = 0; k < rulees.size (); k++) -// { -// cout << rulees[k].first << " , " << rulees[k].second << endl; -// } -// cout << endl; -// } -// -// for (unsigned j = 0; j < ambigInfo.size (); j++) -// { -// cout << "firTokId = " << ambigInfo[j]->firTokId << "; maxPat = " -// << ambigInfo[j]->maxPat << endl; -// vector > combinations = -// ambigInfo[j]->combinations; -// cout << endl; -// for (unsigned k = 0; k < combinations.size (); k++) -// { -// vector nodes = combinations[k]; -// for (unsigned l = 1; l < nodes.size (); l++) -// { -// cout << "tok=" << nodes[l]->tokenId << "; rul=" << nodes[l]->ruleId -// << "; pat=" << nodes[l]->patNum << " - "; -// } -// cout << endl; -// } -// cout << endl; -// } -// -// for (map >::iterator it = ruleOutputs.begin (); -// it != ruleOutputs.end (); it++) -// { -// cout << "ruleId=" << it->first << endl; -// map outs = it->second; -// -// for (map::iterator it2 = outs.begin (); -// it2 != outs.end (); it2++) -// { -// cout << "tokId=" << it2->first << " , out = " << it2->second << endl; -// } -// cout << endl; -// } -// cout << endl; -// -// for (unsigned j = 0; j < tlTokens.size (); j++) -// { -// vector nodes = nodesPool[j]; -// cout << "tokId = " << j << " : " << tlTokens[j] << endl; -// for (unsigned k = 0; k < nodes.size (); k++) -// { -// cout << "ruleId = " << nodes[k]->ruleId << "; patNum = " -// << nodes[k]->patNum << endl; -// } -// cout << endl; -// } -// -// for (unsigned j = 0; j < combNodes.size (); j++) -// { -// vector nodes = combNodes[j]; -// for (unsigned k = 0; k < nodes.size (); k++) -// { -// cout << "tok=" << nodes[k]->tokenId << "; rul=" << nodes[k]->ruleId -// << "; pat=" << nodes[k]->patNum << " - "; -// } -// cout << endl; -// } - -// set diffOuts (outs.begin (), outs.end ()); -// -// // write the outs -// for (set::iterator it = diffOuts.begin (); it != diffOuts.end (); it++) -// { -// interInFile << *it << endl; -// refInFile << refSent << endl; -// } - - float min = 100000; - int minInd = -1; - string serr; - float err; - // write the outs for (unsigned j = 0; j < outs.size (); j++) { - getline (errFile, serr); - err = strtof (serr.c_str (), NULL); - - if (err < min) - { - min = err; - minInd = j; - } - interInFile << outs[j] << endl; - refInFile << refSent << endl; } -// cout << minInd << endl; - bestInFile << outs[minInd] << endl; - interInFile << endl; - refInFile << endl; // delete AmbigInfo pointers for (unsigned j = 0; j < ambigInfo.size (); j++) @@ -289,10 +183,7 @@ main (int argc, char **argv) lextorFile.close (); interInFile.close (); - refFile.close (); - refInFile.close (); - bestInFile.close (); - cout << "RulesApplier finished!"; +// cout << "RulesApplier finished!"; } else {