commit 9c8960f29d3cac4df71ecdb53627920ba02d9f07
Author: aboelhamd
Date:   Fri May 17 23:19:44 2019 +0200

    Modify language model analysis classes

diff --git a/src/LangModAnalysis.cpp b/src/LangModAnalysis.cpp
index 183d394..20be8a0 100644
--- a/src/LangModAnalysis.cpp
+++ b/src/LangModAnalysis.cpp
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include <unistd.h>
 #include
 #include
 #include
@@ -26,29 +27,40 @@ using namespace std;
 using namespace pugi;
 using namespace elem;
 
-int
-main (int argc, char **argv)
-{
-  string sentenceFilePath, lextorFilePath, localeId, transferFilePath,
-      transferOutFilePath, weightFilePath, /*outputFilePath,*/ bestModFilePath,
-      randModFilePath;
-
-  if (argc == 9)
-    {
-      localeId = argv[1];
-      transferFilePath = argv[2];
-      sentenceFilePath = argv[3];
-      lextorFilePath = argv[4];
-
-      transferOutFilePath = argv[5];
-      weightFilePath = argv[6];
-
-//      outputFilePath = argv[7];
-      bestModFilePath = argv[7];
-      randModFilePath = argv[8];
-    }
-  else
-    {
+int main(int argc, char **argv) {
+	string sentenceFilePath, lextorFilePath, localeId, transferFilePath,
+			targetFilePath, weightFilePath, analysisFilePath, bestModFilePath,
+			randModFilePath;
+
+	int opt;
+	while ((opt = getopt(argc, argv, ":a:b:r:")) != -1) {
+		switch (opt) {
+		case 'a':
+			analysisFilePath = optarg;
+			break;
+		case 'b':
+			bestModFilePath = optarg;
+			break;
+		case 'r':
+			randModFilePath = optarg;
+			break;
+		case ':':
+			printf("option %c needs a value\n", optopt);
+			return -1;
+		case '?':
+			printf("unknown option: %c\n", optopt);
+			return -1;
+		}
+	}
+
+	// getopt leaves optind at the first positional argument
+	if (argc - optind == 6) {
+		localeId = argv[optind];
+		transferFilePath = argv[optind + 1];
+		sentenceFilePath = argv[optind + 2];
+		lextorFilePath = argv[optind + 3];
+		targetFilePath = argv[optind + 4];
+		weightFilePath = argv[optind + 5];
+	} else {
 //	localeId = "es_ES";
 //	transferFilePath = "transferFile.t1x";
 //	sentenceFilePath = "spa-test.txt";
@@ -69,308 +81,307 @@ main (int argc, char **argv)
 //	bestModFilePath = "bestModFile.txt";
 //	randModFilePath = "randModFile.txt";
 
-      localeId = "es_ES";
-      transferFilePath = "transferFile3.t1x";
-      sentenceFilePath = "spa-toknizer.txt";
-      lextorFilePath = "spa-lextor.txt";
-
-      transferOutFilePath = "spa-transfer.txt";
-      weightFilePath = "spa-weight.txt";
-
-//      outputFilePath = "outAnalysis.txt";
-      bestModFilePath = "bestModFile.txt";
-      randModFilePath = "randModFile.txt";
+		localeId = "es_ES";
+		transferFilePath = "transferFile3.t1x";
+		sentenceFilePath = "spa-toknizer.txt";
+		lextorFilePath = "spa-lextor.txt";
+
+		targetFilePath = "spa-transfer.txt";
+		weightFilePath = "spa-weight.txt";
+
+		analysisFilePath = "outAnalysis.txt";
+		bestModFilePath = "bestModFile.txt";
+		randModFilePath = "randModFile.txt";
 
-      cout << "Error in parameters !" << endl;
-      cout
-	  << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath weightOutFilePath bestModFilePath randModFilePath"
-	  << endl;
-      cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ"
-	  << endl;
-      cout << "transferFilePath : Apertium transfer file of the language pair used."
-	  << endl;
-      cout << "sentenceFilePath : Source language sentences file." << endl;
-      cout << "lextorFilePath : Apertium lextor file for the source language sentences."
-	  << endl;
-      cout
-	  << "transferOutFilePath : Output file of apertium transfer for the source language sentences."
-	  << endl;
-      cout
-	  << "weightOutFilePath : Language model weights file for the source language sentences."
-	  << endl;
-//      cout
-//	  << "outputFilePath : First output file name of this program which is the complete analysis for the source language sentences."
-//	  << endl;
-      cout
-	  << "bestModFilePath : Second output file name which is the best (language model) translations for the source language sentences."
-	  << endl;
-      cout
-	  << "randModFilePath : Third output file name which is random translations from (language model) for the source language sentences."
-	  << endl;
-      return -1;
-    }
+		cout << "Error in parameters!" << endl;
+		cout << "Parameters are : localeId transferFilePath sentenceFilePath"
+				<< " lextorFilePath targetFilePath weightFilePath"
+				<< " [-a analysisFilePath] [-b bestModFilePath] [-r randModFilePath]"
+				<< endl;
+		cout
+				<< "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ"
+				<< endl;
+		cout
+				<< "transferFilePath : Apertium transfer file of the language pair used."
+				<< endl;
+		cout << "sentenceFilePath : Source language sentences file." << endl;
+		cout
+				<< "lextorFilePath : Apertium lextor file for the source language sentences."
+				<< endl;
+		cout
+				<< "targetFilePath : Output file of apertium transfer for the source language sentences."
+				<< endl;
+		cout
+				<< "weightFilePath : Language model weights file for the source language sentences."
+				<< endl;
+		cout
+				<< "analysisFilePath : Optional output file for the complete analysis of the source language sentences."
+				<< endl;
+		cout
+				<< "bestModFilePath : Optional output file for the best (language model) translations of the source language sentences."
+				<< endl;
+		cout
+				<< "randModFilePath : Optional output file for random (language model) translations of the source language sentences."
+				<< endl;
+		return -1;
+	}
 
-  // seed for randomness
-  srand (time (NULL));
+	// seed for randomness
+	srand(time(NULL));
 
-  ifstream lextorFile (lextorFilePath.c_str ());
-  ifstream inSentenceFile (sentenceFilePath.c_str ());
-  if (lextorFile.is_open () && inSentenceFile.is_open ())
-    {
-      // load transfer file in an xml document object
-      xml_document transferDoc;
-      xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ());
-
-      if (string (result.description ()) != "No error")
-	{
-	  cout << "ERROR : " << result.description () << endl;
-	  return -1;
-	}
+	ifstream lextorFile(lextorFilePath.c_str());
+	ifstream inSentenceFile(sentenceFilePath.c_str());
+	if (lextorFile.is_open() && inSentenceFile.is_open()) {
+		// load transfer file in an xml document object
+		xml_document transferDoc;
+		xml_parse_result result = transferDoc.load_file(
+				transferFilePath.c_str());
+
+		if (string(result.description()) != "No error") {
+			cout << "ERROR : " << result.description() << endl;
+			return -1;
+		}
 
-      // xml node of the parent node (transfer) in the transfer file
-      xml_node transfer = transferDoc.child ("transfer");
+		// xml node of the parent node (transfer) in the transfer file
+		xml_node transfer = transferDoc.child("transfer");
 
-      vector<string> sourceSentences, tokenizedSentences;
+		vector<string> sourceSentences, tokenizedSentences;
 
-      string tokenizedSentence;
-      while (getline (lextorFile, tokenizedSentence))
-	{
-	  string sourceSentence;
-	  if (!getline (inSentenceFile, sourceSentence))
-	    sourceSentence = "No more sentences";
-
-	  sourceSentences.push_back (sourceSentence);
-	  tokenizedSentences.push_back (tokenizedSentence);
-	}
-      lextorFile.close ();
-      inSentenceFile.close ();
+		string tokenizedSentence;
+		while (getline(lextorFile, tokenizedSentence)) {
+			string sourceSentence;
+			if (!getline(inSentenceFile, sourceSentence))
+				sourceSentence = "No more sentences";
+
+			sourceSentences.push_back(sourceSentence);
+			tokenizedSentences.push_back(tokenizedSentence);
+		}
+		lextorFile.close();
+		inSentenceFile.close();
 
-      map<string, vector<vector<string> > > attrs = RuleParser::getAttrs (transfer);
-      map<string, string> vars = RuleParser::getVars (transfer);
-      map<string, vector<string> > lists = RuleParser::getLists (transfer);
-
-      // empty output files
-//      ofstream outputFile (outputFilePath.c_str ());
-//      outputFile.close ();
-      ofstream bestModFile (bestModFilePath.c_str ());
-      bestModFile.close ();
-      ofstream randModFile (randModFilePath.c_str ());
-      randModFile.close ();
+		map<string, vector<vector<string> > > attrs = RuleParser::getAttrs(
+				transfer);
+		map<string, string> vars = RuleParser::getVars(transfer);
+		map<string, vector<string> > lists = RuleParser::getLists(transfer);
+
+		if (!analysisFilePath.empty()) {
+			ofstream analysisFile(analysisFilePath.c_str());
+			analysisFile.close();
+		}
+		if (!bestModFilePath.empty()) {
+			ofstream bestModFile(bestModFilePath.c_str());
+			bestModFile.close();
+		}
+		if (!randModFilePath.empty()) {
+			ofstream randModFile(randModFilePath.c_str());
+			randModFile.close();
+		}
 
-      ifstream weightFile (weightFilePath.c_str ());
-      ifstream transferOutFile (transferOutFilePath.c_str ());
+		ifstream weightFile(weightFilePath.c_str());
+		ifstream targetFile(targetFilePath.c_str());
 
-      if (weightFile.is_open () && transferOutFile.is_open ())
-	for (unsigned i = 0; i < sourceSentences.size (); i++)
-	  {
-	    cout << i << endl;
-
-	    string sourceSentence, tokenizedSentence;
-	    sourceSentence = sourceSentences[i];
-	    tokenizedSentence = tokenizedSentences[i];
+		if (weightFile.is_open() && targetFile.is_open())
+			for (unsigned i = 0; i < sourceSentences.size(); i++) {
+//				cout << i << endl;
+
+				string sourceSentence, tokenizedSentence;
+				sourceSentence = sourceSentences[i];
+				tokenizedSentence = tokenizedSentences[i];
 
-	    // spaces after each token
-	    vector<string> spaces;
-
-	    // tokens in the sentence order
-	    vector<string> slTokens, tlTokens;
-
-	    // tags of tokens in order
-	    vector<vector<string> > slTags, tlTags;
-
-	    RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags,
-					   &spaces, tokenizedSentence);
+				// spaces after each token
+				vector<string> spaces;
+
+				// tokens in the sentence order
+				vector<string> slTokens, tlTokens;
+
+				// tags of tokens in order
+				vector<vector<string> > slTags, tlTags;
+
+				RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags,
+						&tlTags, &spaces, tokenizedSentence);
 
-	    // map of tokens ids and their matched categories
-	    map<unsigned, vector<string> > catsApplied;
-
-	    RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer);
-
-	    // map of matched rules and a pair of first token id and patterns number
-	    map<xml_node, vector<pair<unsigned, unsigned> > > rulesApplied;
-
-	    RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer);
+				// map of tokens ids and their matched categories
+				map<unsigned, vector<string> > catsApplied;
+
+				RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer);
+
+				// map of matched rules and a pair of first token id and patterns number
+				map<xml_node, vector<pair<unsigned, unsigned> > > rulesApplied;
+
+				RuleParser::matchRules(&rulesApplied, slTokens, catsApplied,
+						transfer);
 
-	    // rule and (target) token map to specific output
-	    // if rule has many patterns we will choose the first token only
-	    map<unsigned, map<unsigned, string> > ruleOutputs;
-
-	    // map (target) token to all matched rules ids and the number of pattern items of each rule
-	    map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules;
-
-	    RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags,
-				     tlTokens, tlTags, rulesApplied, attrs, lists, &vars,
-				     spaces, localeId);
+				// rule and (target) token map to specific output
+				// if rule has many patterns we will choose the first token only
+				map<unsigned, map<unsigned, string> > ruleOutputs;
+
+				// map (target) token to all matched rules ids and the number of pattern items of each rule
+				map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules;
+
+				RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens,
+						slTags, tlTokens, tlTags, rulesApplied, attrs, lists,
+						&vars, spaces, localeId);
 
-	    // final outputs
-	    vector<string> normOuts;
-	    // number of generated combinations
-	    unsigned compNum;
-	    // nodes for every token and rule
-	    map<unsigned, vector<RuleExecution::Node*> > nodesPool;
-	    // ambiguous informations
-	    vector<RuleExecution::AmbigInfo*> ambigInfo;
-	    // rules combinations
-	    vector<vector<RuleExecution::Node*> > normCombNodes;
-
-	    nodesPool = RuleExecution::getNodesPool (tokenRules);
-
-	    RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum);
-
-	    RuleExecution::getOuts (&normOuts, &normCombNodes, ambigInfo, nodesPool,
-				    ruleOutputs, spaces);
+				// final outputs
+				vector<string> normOuts;
+				// number of generated combinations
+				unsigned compNum;
+				// nodes for every token and rule
+				map<unsigned, vector<RuleExecution::Node*> > nodesPool;
+				// ambiguity information
+				vector<RuleExecution::AmbigInfo*> ambigInfo;
+				// rules combinations
+				vector<vector<RuleExecution::Node*> > normCombNodes;
+
+				nodesPool = RuleExecution::getNodesPool(tokenRules);
+
+				RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo,
+						&compNum);
+
+				RuleExecution::getOuts(&normOuts, &normCombNodes, ambigInfo,
+						nodesPool, ruleOutputs, spaces);
 
-	    // read weights
-	    string line;
-	    vector<float> normWeights;
-	    for (unsigned j = 0; j < normOuts.size (); j++)
-	      {
-		getline (weightFile, line);
-		float weight = strtof (line.c_str (), NULL);
-		normWeights.push_back (weight);
-	      }
+				// read weights
+				string line;
+				vector<float> normWeights;
+				for (unsigned j = 0; j < normOuts.size(); j++) {
+					getline(weightFile, line);
+					float weight = strtof(line.c_str(), NULL);
+					normWeights.push_back(weight);
+				}
 
-	    // read transfer
-	    vector<string> normTransfers;
-	    for (unsigned j = 0; j < normOuts.size (); j++)
-	      {
-		getline (transferOutFile, line);
-		normTransfers.push_back (line);
-	      }
+				// read transfer
+				vector<string> normTransfers;
+				for (unsigned j = 0; j < normOuts.size(); j++) {
+					getline(targetFile, line);
+					normTransfers.push_back(line);
+				}
 
-	    // remove redundant outputs
-	    vector<string> outs;
-	    vector<vector<RuleExecution::Node*> > combNodes;
-	    vector<float> weights;
-	    vector<string> transfers;
-	    for (unsigned j = 0; j < normOuts.size (); j++)
-	      if (find (outs.begin (), outs.end (), normOuts[j]) == outs.end ())
-		{
-		  outs.push_back (normOuts[j]);
-		  combNodes.push_back (normCombNodes[j]);
-		  weights.push_back (normWeights[j]);
-		  transfers.push_back (normTransfers[j]);
-		}
-	    normOuts = outs;
-	    normCombNodes = combNodes;
-	    normWeights = weights;
-	    normTransfers = transfers;
+				// remove redundant outputs
+				vector<string> outs;
+				vector<vector<RuleExecution::Node*> > combNodes;
+				vector<float> weights;
+				vector<string> transfers;
+				for (unsigned j = 0; j < normOuts.size(); j++)
+					if (find(outs.begin(), outs.end(), normOuts[j])
+							== outs.end()) {
+						outs.push_back(normOuts[j]);
+						combNodes.push_back(normCombNodes[j]);
+						weights.push_back(normWeights[j]);
+						transfers.push_back(normTransfers[j]);
+					}
+				normOuts = outs;
+				normCombNodes = combNodes;
+				normWeights = weights;
+				normTransfers = transfers;
 
-	    // normalize weights
-	    RuleExecution::normaliseWeights (&normWeights);
+				// normalize weights
+				RuleExecution::normaliseWeights(&normWeights);
 
-	    // write normal outputs
-//	    ofstream outputFile (outputFilePath.c_str (), ofstream::app);
-//	    if (outputFile.is_open ())
-//	      {
-//		outputFile << "Analysis of sentence : " << endl;
-//		outputFile << sourceSentence << endl << endl << endl;
-//
-//		outputFile << endl;
-//		outputFile << "sentence id ||| coverage id ||| original sentence |||"
-//		    << " lextor ||| rules ||| chunker ||| final sentence ||| score"
-//		    << endl << endl;
-//
-//		for (unsigned j = 0; j < normWeights.size (); j++)
-//		  {
-//		    // sentence id
-//		    outputFile << (i + 1) << " ||| ";
-//		    // coverage id
-//		    outputFile << (j + 1) << " ||| ";
-//		    // original sentence
-//		    outputFile << sourceSentence << " ||| ";
-//		    // lextor
-//		    outputFile << tokenizedSentence << " ||| ";
-//		    // rules
-//		    for (unsigned k = 0; k < normCombNodes[j].size (); k++)
-//		      if (normCombNodes[j][k]->ruleId)
-//			outputFile << normCombNodes[j][k]->ruleId << " ";
-//		    outputFile << "||| ";
-//		    // chuncker
-//		    outputFile << normOuts[j] << " ||| ";
-//		    // final sentence
-//		    outputFile << normTransfers[j] << " ||| ";
-//		    // score
-//		    outputFile << normWeights[j] << endl << endl;
-//		  }
-//
-//		outputFile
-//		    << "---------------------------------------------------------------------------------------------------------"
-//		    << endl << endl;
-//
-//		outputFile.close ();
-//	      }
+				// write normal outputs
+				if (!analysisFilePath.empty()) {
+					ofstream analysisFile(analysisFilePath.c_str(),
+							ofstream::app);
+					if (analysisFile.is_open()) {
+						analysisFile << "Analysis of sentence : " << endl;
+						analysisFile << sourceSentence << endl << endl << endl;
+
+						analysisFile << endl;
+						analysisFile
+								<< "sentence id ||| coverage id ||| original sentence |||"
+								<< " lextor ||| rules ||| chunker ||| final sentence ||| score"
+								<< endl << endl;
+
+						for (unsigned j = 0; j < normWeights.size(); j++) {
+							// sentence id
+							analysisFile << (i + 1) << " ||| ";
+							// coverage id
+							analysisFile << (j + 1) << " ||| ";
+							// original sentence
+							analysisFile << sourceSentence << " ||| ";
+							// lextor
+							analysisFile << tokenizedSentence << " ||| ";
+							// rules
+							for (unsigned k = 0; k < normCombNodes[j].size();
+									k++)
+								if (normCombNodes[j][k]->ruleId)
+									analysisFile << normCombNodes[j][k]->ruleId
+											<< " ";
+							analysisFile << "||| ";
+							// chunker
+							analysisFile << normOuts[j] << " ||| ";
+							// final sentence
+							analysisFile << normTransfers[j] << " ||| ";
+							// score
+							analysisFile << normWeights[j] << endl << endl;
+						}
+
+						analysisFile
+								<< "---------------------------------------------------------------------------------------------------------"
+								<< endl << endl;
+
+						analysisFile.close();
+					}
+				}
 
-	    // Model weighting
-	    // best weight
-	    ofstream bestModFile (bestModFilePath.c_str (), ofstream::app);
-	    if (bestModFile.is_open ())
-	      {
-//		bestModFile
-//		    << "---------------------------------------------------------------------------------------------------------"
-//		    << endl << endl;
-//		bestModFile << (i + 1) << endl;
-//		bestModFile << "Source : " << sourceSentence << endl << endl;
-
-		unsigned maxInd = 0;
-		for (unsigned j = 1; j < normWeights.size (); j++)
-		  {
-		    if (normWeights[j] > normWeights[maxInd])
-		      maxInd = j;
-		  }
-
-		// final sentence
-		bestModFile /*<< "Target : "*/ << normTransfers[maxInd] << endl;
-		// score
-//		bestModFile << "Weight : " << normWeights[maxInd] << endl;
-		// rules
-//		bestModFile << "Rules : ";
-//		for (unsigned k = 0; k < normCombNodes[maxInd].size (); k++)
-//		  if (normCombNodes[maxInd][k]->ruleId)
-//		    bestModFile << normCombNodes[maxInd][k]->ruleId << " ";
-//
-//		bestModFile << endl
-//		    << "---------------------------------------------------------------------------------------------------------"
-//		    << endl << endl << endl;
-	      }
-	    bestModFile.close ();
+				// Model weighting
+				// best weight
+				if (!bestModFilePath.empty()) {
+					ofstream bestModFile(bestModFilePath.c_str(),
+							ofstream::app);
+					if (bestModFile.is_open()) {
+						unsigned maxInd = 0;
+						for (unsigned j = 1; j < normWeights.size(); j++) {
+							if (normWeights[j] > normWeights[maxInd])
+								maxInd = j;
+						}
+
+						// final sentence
+						bestModFile << normTransfers[maxInd] << endl;
+					}
+					bestModFile.close();
+				}
 
-	    // Random weight
-	    ofstream randModFile (randModFilePath.c_str (), ofstream::app);
-	    if (randModFile.is_open ())
-	      {
-//		randModFile << (i + 1) << endl;
-//		randModFile << "Source : " << sourceSentence << endl << endl;
-
-		int random = rand () % normWeights.size ();
-
-		// final sentence
-		randModFile /*<< "Target : "*/ << normTransfers[random] << endl;
-		// score
-//		randModFile << "Weight : " << normWeights[random] << endl;
-		// rules
-//		randModFile << "Rules : ";
-//		for (unsigned k = 0; k < normCombNodes[random].size (); k++)
-//		  if (normCombNodes[random][k]->ruleId)
-//		    randModFile << normCombNodes[random][k]->ruleId << " ";
-//
-//		randModFile << endl
-//		    << "---------------------------------------------------------------------------------------------------------"
-//		    << endl << endl << endl;
-	      }
-	    randModFile.close ();
-	  }
+				if (!randModFilePath.empty()) {
+					// Random weight
+					ofstream randModFile(randModFilePath.c_str(),
+							ofstream::app);
+					if (randModFile.is_open()) {
+						int random = rand() % normWeights.size();
+
+						// final sentence
+						randModFile << normTransfers[random] << endl;
+					}
+					randModFile.close();
+				}
+
+				// delete AmbigInfo pointers
+				for (unsigned j = 0; j < ambigInfo.size(); j++) {
+					// delete the dummy node pointers
+					set<RuleExecution::Node*> dummies;
+					for (unsigned k = 0; k < ambigInfo[j]->combinations.size();
+							k++)
+						dummies.insert(ambigInfo[j]->combinations[k][0]);
+					for (set<RuleExecution::Node*>::iterator it =
+							dummies.begin(); it != dummies.end(); it++)
+						delete (*it);
+
+					delete ambigInfo[j];
+				}
+				// delete Node pointers
+				for (map<unsigned, vector<RuleExecution::Node*> >::iterator it =
+						nodesPool.begin(); it != nodesPool.end(); it++) {
+					for (unsigned j = 0; j < it->second.size(); j++) {
+						delete it->second[j];
+					}
+				}
+			}
-      else
-	{
-	  cout << "ERROR in opening files!" << endl;
-	}
+		else {
+			cout << "ERROR in opening files!" << endl;
+		}
 
-      weightFile.close ();
-      transferOutFile.close ();
-    }
-  else
-    {
-      cout << "ERROR in opening files!" << endl;
-    }
-  return 0;
+		weightFile.close();
+		targetFile.close();
+	} else {
+		cout << "ERROR in opening files!" << endl;
+	}
+	return 0;
 }
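
Note on the new command-line interface: the commit replaces the old eight-argument CLI with getopt(3), so the six input/output paths stay positional and the three result files become optional flags. The following is a minimal, self-contained sketch of the same parsing pattern, not part of the commit (variable names other than the flags are illustrative). The leading ':' in the optstring makes getopt() report a missing option argument as ':' rather than '?', and optind marks where the positional arguments begin once the flags have been consumed.

// sketch.cpp - standalone illustration of the ":a:b:r:" getopt pattern
// build with: g++ sketch.cpp -o sketch
#include <iostream>
#include <string>
#include <unistd.h>   // getopt, optarg, optind, optopt

int main(int argc, char **argv) {
	std::string analysisPath, bestPath, randPath;
	int opt;
	while ((opt = getopt(argc, argv, ":a:b:r:")) != -1) {
		switch (opt) {
		case 'a': analysisPath = optarg; break;   // -a analysisFilePath
		case 'b': bestPath = optarg; break;       // -b bestModFilePath
		case 'r': randPath = optarg; break;       // -r randModFilePath
		case ':':                                 // flag given without a value
			std::cerr << "option -" << (char) optopt << " needs a value\n";
			return 1;
		default:                                  // '?': unknown flag
			std::cerr << "unknown option\n";
			return 1;
		}
	}
	// getopt leaves optind at the first positional argument, so the
	// six mandatory arguments are argv[optind] .. argv[optind + 5]
	if (argc - optind != 6) {
		std::cerr << "expected 6 positional arguments, got "
				<< (argc - optind) << "\n";
		return 1;
	}
	std::string localeId = argv[optind];              // e.g. "es_ES"
	std::string transferFilePath = argv[optind + 1];  // and so on for the
	// sentence, lextor, target and weight file paths
	std::cout << "locale: " << localeId
			<< ", transfer: " << transferFilePath << "\n";
	return 0;
}

A hypothetical invocation would be "./sketch -b best.txt es_ES t.t1x sents.txt lextor.txt target.txt weights.txt"; with GNU getopt the flags may also follow the positional arguments, since glibc permutes argv during parsing.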