commit 0ffd24814364ba30779d96120689a555610e6ab7 Author: aboelhamd Date: Sun May 12 03:57:22 2019 +0200 Add file to choose best sentence of LM diff --git a/src/BeamSearch.cpp b/src/BeamSearch.cpp index f2dffb6..8a396c6 100644 --- a/src/BeamSearch.cpp +++ b/src/BeamSearch.cpp @@ -1,186 +1,186 @@ -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -// -//#include "../pugixml/pugixml.hpp" -//#include "RuleParser.h" -//#include "RuleExecution.h" -//#include "TranElemLiterals.h" -//#include "CLExec.h" -// -//#include -// -//using namespace std; -//using namespace pugi; -//using namespace elem; -// -//int -//main (int argc, char **argv) -//{ -// string lextorFilePath, interInFilePath, localeId, transferFilePath, modelsDest, k; -// -// if (argc == 7) -// { -// localeId = argv[1]; -// transferFilePath = argv[2]; -// lextorFilePath = argv[3]; -// interInFilePath = argv[4]; -// modelsDest = argv[5]; -// k = argv[6]; -// } -// else -// { -// localeId = "es_ES"; -// transferFilePath = "apertium-eng-spa.spa-eng.t1x"; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../pugixml/pugixml.hpp" +#include "RuleParser.h" +#include "RuleExecution.h" +#include "TranElemLiterals.h" +#include "CLExec.h" + +#include + +using namespace std; +using namespace pugi; +using namespace elem; + +int +main (int argc, char **argv) +{ + string lextorFilePath, interInFilePath, localeId, transferFilePath, modelsDest, k; + + if (argc == 7) + { + localeId = argv[1]; + transferFilePath = argv[2]; + lextorFilePath = argv[3]; + interInFilePath = argv[4]; + modelsDest = argv[5]; + k = argv[6]; + } + else + { + localeId = "es_ES"; + transferFilePath = "apertium-eng-spa.spa-eng.t1x"; + lextorFilePath = "lextor.txt"; + interInFilePath = "beaminter.txt"; + modelsDest = "/home/aboelhamd/Downloads/models"; + k = "8"; + +// localeId = "kk_KZ"; +// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +// sentenceFilePath = "src.txt"; // lextorFilePath = "lextor.txt"; -// interInFilePath = "beaminter.txt"; -// modelsDest = "/home/aboelhamd/Downloads/models"; +// interInFilePath = "beam-inter.txt"; +// modelsDest = "./UntitledFolder/models"; // k = "8"; -// -//// localeId = "kk_KZ"; -//// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; -//// sentenceFilePath = "src.txt"; -//// lextorFilePath = "lextor.txt"; -//// interInFilePath = "beam-inter.txt"; -//// modelsDest = "./UntitledFolder/models"; -//// k = "8"; -// -// cout << "Error in parameters !" << endl; -// cout -// << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath modelsDest beamSize" -// << endl; -// cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" -// << endl; -// cout << "transferFilePath : Apertium transfer file of the language pair used." -// << endl; -// cout << "lextorFilePath : Apertium lextor file for the source language sentences." -// << endl; -// cout -// << "interInFilePath : Output file of this program which is the input for apertium interchunk." -// << endl; -// cout << "modelsDest : Yasmet models destination." << endl; -// cout << "beamSize : The size of beam in beam search algorithm." 
<< endl; -// return -1; -// } -// -// ifstream lextorFile (lextorFilePath.c_str ()); -// ofstream interInFile (interInFilePath.c_str ()); -// if (lextorFile.is_open () && interInFile.is_open ()) -// { -// // load transfer file in an xml document object -// xml_document transferDoc; -// xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); -// if (string (result.description ()) != "No error") -// { -// cout << "ERROR : " << result.description () << endl; -// return -1; -// } -// -// // xml node of the parent node (transfer) in the transfer file -// xml_node transfer = transferDoc.child ("transfer"); -// -// map > > attrs = RuleParser::getAttrs (transfer); -// map vars = RuleParser::getVars (transfer); -// map > lists = RuleParser::getLists (transfer); -// map > > classesWeights = -// CLExec::loadYasmetModels (modelsDest); -// -// int beam; -// stringstream buffer (k); -// buffer >> beam; -// -// string tokenizedSentence; -// while (getline (lextorFile, tokenizedSentence)) -// { -// // cout << i << endl; -// -// // spaces after each token -// vector spaces; -// -// // tokens in the sentence order -// vector slTokens, tlTokens; -// -// // tags of tokens in order -// vector > slTags, tlTags; -// -// RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, -// tokenizedSentence); -// -// // map of tokens ids and their matched categories -// map > catsApplied; -// -// RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); -// -// // map of matched rules and a pair of first token id and patterns number -// map > > rulesApplied; -// -// RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); -// -// // rule and (target) token map to specific output -// // if rule has many patterns we will choose the first token only -// map > ruleOutputs; -// -// // map (target) token to all matched rules ids and the number of pattern items of each rule -// map > > tokenRules; -// -// RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, -// tlTags, rulesApplied, attrs, lists, &vars, spaces, -// localeId); -// -// // final outputs -// vector outs; -// // number of generated combinations -// unsigned compNum; -// // nodes for every token and rule -// map > nodesPool; -// // ambiguous informations -// vector ambigInfo; -// // beam tree -// vector, float> > beamTree; -// // rules combinations -// vector > combNodes; -// -// nodesPool = RuleExecution::getNodesPool (tokenRules); -// -// RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); -// -// vector newAmbigInfo; -// for (unsigned j = 0; j < ambigInfo.size (); j++) -// if (ambigInfo[j]->combinations.size () > 1) -// newAmbigInfo.push_back (ambigInfo[j]); -// -// CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, -// localeId); -// -// // take the first sentence only -// beamTree.erase (beamTree.begin () + 1, beamTree.end ()); -// -// RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, -// spaces); -// -// // write the outs -// for (unsigned j = 0; j < outs.size (); j++) -// interInFile << outs[j] << endl; -// -// } -// interInFile.close (); -// lextorFile.close (); -// } -// else -// { -// cout << "ERROR in opening files!" << endl; -// } -// return 0; -//} + + cout << "Error in parameters !" << endl; + cout + << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath modelsDest beamSize" + << endl; + cout << "localeId : ICU locale ID for the source language. 
For Kazakh => kk-KZ" + << endl; + cout << "transferFilePath : Apertium transfer file of the language pair used." + << endl; + cout << "lextorFilePath : Apertium lextor file for the source language sentences." + << endl; + cout + << "interInFilePath : Output file of this program which is the input for apertium interchunk." + << endl; + cout << "modelsDest : Yasmet models destination." << endl; + cout << "beamSize : The size of beam in beam search algorithm." << endl; + return -1; + } + + ifstream lextorFile (lextorFilePath.c_str ()); + ofstream interInFile (interInFilePath.c_str ()); + if (lextorFile.is_open () && interInFile.is_open ()) + { + // load transfer file in an xml document object + xml_document transferDoc; + xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); + if (string (result.description ()) != "No error") + { + cout << "ERROR : " << result.description () << endl; + return -1; + } + + // xml node of the parent node (transfer) in the transfer file + xml_node transfer = transferDoc.child ("transfer"); + + map > > attrs = RuleParser::getAttrs (transfer); + map vars = RuleParser::getVars (transfer); + map > lists = RuleParser::getLists (transfer); + map > > classesWeights = + CLExec::loadYasmetModels (modelsDest); + + int beam; + stringstream buffer (k); + buffer >> beam; + + string tokenizedSentence; + while (getline (lextorFile, tokenizedSentence)) + { + // cout << i << endl; + + // spaces after each token + vector spaces; + + // tokens in the sentence order + vector slTokens, tlTokens; + + // tags of tokens in order + vector > slTags, tlTags; + + RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, + tokenizedSentence); + + // map of tokens ids and their matched categories + map > catsApplied; + + RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); + + // map of matched rules and a pair of first token id and patterns number + map > > rulesApplied; + + RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); + + // rule and (target) token map to specific output + // if rule has many patterns we will choose the first token only + map > ruleOutputs; + + // map (target) token to all matched rules ids and the number of pattern items of each rule + map > > tokenRules; + + RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, + tlTags, rulesApplied, attrs, lists, &vars, spaces, + localeId); + + // final outputs + vector outs; + // number of generated combinations + unsigned compNum; + // nodes for every token and rule + map > nodesPool; + // ambiguous informations + vector ambigInfo; + // beam tree + vector, float> > beamTree; + // rules combinations + vector > combNodes; + + nodesPool = RuleExecution::getNodesPool (tokenRules); + + RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); + + vector newAmbigInfo; + for (unsigned j = 0; j < ambigInfo.size (); j++) + if (ambigInfo[j]->combinations.size () > 1) + newAmbigInfo.push_back (ambigInfo[j]); + + CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, + localeId); + + // take the first sentence only + beamTree.erase (beamTree.begin () + 1, beamTree.end ()); + + RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, + spaces); + + // write the outs + for (unsigned j = 0; j < outs.size (); j++) + interInFile << outs[j] << endl; + + } + interInFile.close (); + lextorFile.close (); + } + else + { + cout << "ERROR in opening files!" 
<< endl; + } + return 0; +} diff --git a/src/BestLangMod.cpp b/src/BestLangMod.cpp new file mode 100644 index 0000000..8601ca7 --- /dev/null +++ b/src/BestLangMod.cpp @@ -0,0 +1,376 @@ +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "../pugixml/pugixml.hpp" +//#include "RuleParser.h" +//#include "RuleExecution.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//#include +// +//using namespace std; +//using namespace pugi; +//using namespace elem; +// +//int +//main (int argc, char **argv) +//{ +// string sentenceFilePath, lextorFilePath, localeId, transferFilePath, +// transferOutFilePath, weightFilePath, outputFilePath, bestModFilePath, +// randModFilePath; +// +// if (argc == 10) +// { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// sentenceFilePath = argv[3]; +// lextorFilePath = argv[4]; +// +// transferOutFilePath = argv[5]; +// weightFilePath = argv[6]; +// +// outputFilePath = argv[7]; +// bestModFilePath = argv[8]; +// randModFilePath = argv[9]; +// } +// else +// { +//// localeId = "es_ES"; +//// transferFilePath = "transferFile.t1x"; +//// sentenceFilePath = "spa-test.txt"; +//// lextorFilePath = "spa-test.lextor"; +//// interInFilePath = "beaminter.out"; +//// modelsDest = "modelstry"; +//// k = "8"; +// +//// localeId = "kk_KZ"; +//// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +//// sentenceFilePath = "sample-sentences.txt"; +//// lextorFilePath = "sample-lextor.txt"; +//// +//// transferOutFilePath = "sample-transfer.txt"; +//// weightFilePath = "sample-weights.txt"; +//// +//// outputFilePath = "outAnalysis.txt"; +//// bestModFilePath = "bestModFile.txt"; +//// randModFilePath = "randModFile.txt"; +// +// localeId = "es_ES"; +// transferFilePath = "transferFile3.t1x"; +// sentenceFilePath = "spa-toknizer.txt"; +// lextorFilePath = "spa-lextor.txt"; +// +// transferOutFilePath = "spa-transfer.txt"; +// weightFilePath = "spa-weight.txt"; +// +// outputFilePath = "outAnalysis.txt"; +// bestModFilePath = "bestModFile.txt"; +// randModFilePath = "randModFile.txt"; +// +// cout << "Error in parameters !" << endl; +// cout +// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath weightOutFilePath outputFilePath bestModFilePath randModFilePath" +// << endl; +// cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "sentenceFilePath : Source language sentences file." << endl; +// cout << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout +// << "transferOutFilePath : Output file of apertium transfer for the source language sentences." +// << endl; +// cout +// << "weightOutFilePath : Language model weights file for the source language sentences." +// << endl; +// cout +// << "outputFilePath : First output file name of this program which is the complete analysis for the source language sentences." +// << endl; +// cout +// << "bestModFilePath : Second output file name which is the best (language model) translations for the source language sentences." +// << endl; +// cout +// << "randModFilePath : Third output file name which is random translations from (language model) for the source language sentences." 
+// << endl; +// return -1; +// } +// +// // seed for randomness +// srand (time (NULL)); +// +// ifstream lextorFile (lextorFilePath.c_str ()); +// ifstream inSentenceFile (sentenceFilePath.c_str ()); +// if (lextorFile.is_open () && inSentenceFile.is_open ()) +// { +// // load transfer file in an xml document object +// xml_document transferDoc; +// xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); +// +// if (string (result.description ()) != "No error") +// { +// cout << "ERROR : " << result.description () << endl; +// return -1; +// } +// +// // xml node of the parent node (transfer) in the transfer file +// xml_node transfer = transferDoc.child ("transfer"); +// +// vector sourceSentences, tokenizedSentences; +// +// string tokenizedSentence; +// while (getline (lextorFile, tokenizedSentence)) +// { +// string sourceSentence; +// if (!getline (inSentenceFile, sourceSentence)) +// sourceSentence = "No more sentences"; +// +// sourceSentences.push_back (sourceSentence); +// tokenizedSentences.push_back (tokenizedSentence); +// } +// lextorFile.close (); +// inSentenceFile.close (); +// +// map > > attrs = RuleParser::getAttrs (transfer); +// map vars = RuleParser::getVars (transfer); +// map > lists = RuleParser::getLists (transfer); +// +// // empty output files +// ofstream outputFile (outputFilePath.c_str ()); +// outputFile.close (); +// ofstream bestModFile (bestModFilePath.c_str ()); +// bestModFile.close (); +// ofstream randModFile (randModFilePath.c_str ()); +// randModFile.close (); +// +// ifstream weightFile (weightFilePath.c_str ()); +// ifstream transferOutFile (transferOutFilePath.c_str ()); +// +// if (weightFile.is_open () && transferOutFile.is_open ()) +// for (unsigned i = 0; i < sourceSentences.size (); i++) +// { +// cout << i << endl; +// +// string sourceSentence, tokenizedSentence; +// sourceSentence = sourceSentences[i]; +// tokenizedSentence = tokenizedSentences[i]; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, +// &spaces, tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, +// tlTokens, tlTags, rulesApplied, attrs, lists, &vars, +// spaces, localeId); +// +// // final outputs +// vector normOuts; +// // number of generated combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // rules combinations +// vector > normCombNodes; +// +// nodesPool = RuleExecution::getNodesPool (tokenRules); +// +// RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); +// +// RuleExecution::getOuts (&normOuts, &normCombNodes, ambigInfo, nodesPool, +// ruleOutputs, 
spaces); +// +// // read weights +// string line; +// vector normWeights; +// for (unsigned j = 0; j < normOuts.size (); j++) +// { +// getline (weightFile, line); +// float weight = strtof (line.c_str (), NULL); +// normWeights.push_back (weight); +// } +// +// // read transfer +// vector normTransfers; +// for (unsigned j = 0; j < normOuts.size (); j++) +// { +// getline (transferOutFile, line); +// normTransfers.push_back (line); +// } +// +// // remove redundant outputs +// vector outs; +// vector > combNodes; +// vector weights; +// vector transfers; +// for (unsigned j = 0; j < normOuts.size (); j++) +// if (find (outs.begin (), outs.end (), normOuts[j]) == outs.end ()) +// { +// outs.push_back (normOuts[j]); +// combNodes.push_back (normCombNodes[j]); +// weights.push_back (normWeights[j]); +// transfers.push_back (normTransfers[j]); +// } +// normOuts = outs; +// normCombNodes = combNodes; +// normWeights = weights; +// normTransfers = transfers; +// +// // normalize weights +// RuleExecution::normaliseWeights (&normWeights); +// +// // write normal outputs +// ofstream outputFile (outputFilePath.c_str (), ofstream::app); +// if (outputFile.is_open ()) +// { +// outputFile << "Analysis of sentence : " << endl; +// outputFile << sourceSentence << endl << endl << endl; +// +// outputFile << endl; +// outputFile << "sentence id ||| coverage id ||| original sentence |||" +// << " lextor ||| rules ||| chunker ||| final sentence ||| score" +// << endl << endl; +// +// for (unsigned j = 0; j < normWeights.size (); j++) +// { +// // sentence id +// outputFile << (i + 1) << " ||| "; +// // coverage id +// outputFile << (j + 1) << " ||| "; +// // original sentence +// outputFile << sourceSentence << " ||| "; +// // lextor +// outputFile << tokenizedSentence << " ||| "; +// // rules +// for (unsigned k = 0; k < normCombNodes[j].size (); k++) +// if (normCombNodes[j][k]->ruleId) +// outputFile << normCombNodes[j][k]->ruleId << " "; +// outputFile << "||| "; +// // chuncker +// outputFile << normOuts[j] << " ||| "; +// // final sentence +// outputFile << normTransfers[j] << " ||| "; +// // score +// outputFile << normWeights[j] << endl << endl; +// } +// +// outputFile +// << "---------------------------------------------------------------------------------------------------------" +// << endl << endl; +// +// outputFile.close (); +// } +// +// // Model weighting +// // best weight +// ofstream bestModFile (bestModFilePath.c_str (), ofstream::app); +// if (bestModFile.is_open ()) +// { +//// bestModFile +//// << "---------------------------------------------------------------------------------------------------------" +//// << endl << endl; +// +//// bestModFile << (i + 1) << endl; +//// bestModFile << "Source : " << sourceSentence << endl << endl; +// +// unsigned maxInd = 0; +// for (unsigned j = 1; j < normWeights.size (); j++) +// { +// if (normWeights[j] > normWeights[maxInd]) +// maxInd = j; +// } +// +// // final sentence +// bestModFile /*<< "Target : "*/ << normTransfers[maxInd] << endl; +// // score +//// bestModFile << "Weight : " << normWeights[maxInd] << endl; +// // rules +//// bestModFile << "Rules : "; +//// for (unsigned k = 0; k < normCombNodes[maxInd].size (); k++) +//// if (normCombNodes[maxInd][k]->ruleId) +//// bestModFile << normCombNodes[maxInd][k]->ruleId << " "; +//// +//// bestModFile << endl +//// << "---------------------------------------------------------------------------------------------------------" +//// << endl << endl << endl; +// } +// bestModFile.close 
(); +// +// // Random weight +// ofstream randModFile (randModFilePath.c_str (), ofstream::app); +// if (randModFile.is_open ()) +// { +// randModFile << (i + 1) << endl; +// randModFile << "Source : " << sourceSentence << endl << endl; +// +// int random = rand () % normWeights.size (); +// +// // final sentence +// randModFile << "Target : " << normTransfers[random] << endl; +// // score +// randModFile << "Weight : " << normWeights[random] << endl; +// // rules +// randModFile << "Rules : "; +// for (unsigned k = 0; k < normCombNodes[random].size (); k++) +// if (normCombNodes[random][k]->ruleId) +// randModFile << normCombNodes[random][k]->ruleId << " "; +// +// randModFile << endl +// << "---------------------------------------------------------------------------------------------------------" +// << endl << endl << endl; +// } +// randModFile.close (); +// } +// else +// { +// cout << "ERROR in opening files!" << endl; +// } +// weightFile.close (); +// transferOutFile.close (); +// } +// else +// { +// cout << "ERROR in opening files!" << endl; +// } +// return 0; +//} diff --git a/src/CombAlign.cpp b/src/CombAlign.cpp index 1aaa27d..762679a 100644 --- a/src/CombAlign.cpp +++ b/src/CombAlign.cpp @@ -1,211 +1,211 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../pugixml/pugixml.hpp" -#include "RuleParser.h" -#include "RuleExecution.h" -#include "TranElemLiterals.h" -#include "CLExec.h" - -using namespace std; -using namespace pugi; -using namespace elem; - -int -main (int argc, char **argv) -{ - string localeId, transferFilePath, lextorFilePath, chunkerFilePath, referenceFilePath, - newRefFilePath; - - if (argc == 7) - { - localeId = argv[1]; - transferFilePath = argv[2]; - lextorFilePath = argv[3]; - chunkerFilePath = argv[4]; - referenceFilePath = argv[5]; - newRefFilePath = argv[6]; - } - else - { +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "../pugixml/pugixml.hpp" +//#include "RuleParser.h" +//#include "RuleExecution.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//using namespace std; +//using namespace pugi; +//using namespace elem; +// +//int +//main (int argc, char **argv) +//{ +// string localeId, transferFilePath, lextorFilePath, chunkerFilePath, referenceFilePath, +// newRefFilePath; +// +// if (argc == 7) +// { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// lextorFilePath = argv[3]; +// chunkerFilePath = argv[4]; +// referenceFilePath = argv[5]; +// newRefFilePath = argv[6]; +// } +// else +// { +//// localeId = "es_ES"; +//// transferFilePath = "transferFile.t1x"; +//// sentenceFilePath = "spa-test.txt"; +//// lextorFilePath = "spa-test.lextor"; +//// interInFilePath = "inter2.txt"; +// +//// localeId = "kk_KZ"; +//// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +//// sentenceFilePath = "sample-sentences.txt"; +//// lextorFilePath = "sample-lextor.txt"; +//// interInFilePath = "sample-inter.txt"; +// // localeId = "es_ES"; -// transferFilePath = "transferFile.t1x"; -// sentenceFilePath = "spa-test.txt"; -// lextorFilePath = "spa-test.lextor"; -// interInFilePath = "inter2.txt"; - -// localeId = "kk_KZ"; -// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; -// sentenceFilePath = "sample-sentences.txt"; -// lextorFilePath = "sample-lextor.txt"; -// interInFilePath = 
"sample-inter.txt"; - - localeId = "es_ES"; - transferFilePath = - "/home/aboelhamd/apertium-eng-spa-ambiguous-rules/apertium-eng-spa.spa-eng.t1x"; - lextorFilePath = - "/home/aboelhamd/eclipse-workspace/machinetranslation/test-lextor.txt"; - chunkerFilePath = - "/home/aboelhamd/eclipse-workspace/machinetranslation/test-chunker.txt"; - referenceFilePath = - "/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test.txt"; - newRefFilePath = - "/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test-mul.txt"; - - cout << "Error in parameters !" << endl; - cout << "Parameters are : localeId transferFilePath lextorFilePath" - << " chunkerFilePath referenceFilePath newRefFilePath" << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout << "chunkerFilePath : chunker file path (output of this program and" - << " input for apertium interchunk)." << endl; - cout << "referenceFilePath : Reference parallel target translation file path." - << endl; - cout << "newRefFilePath : New aligned reference file path." << endl; -// return -1; - } - - ifstream lextorFile (lextorFilePath.c_str ()); - ofstream chunkerFile (chunkerFilePath.c_str ()); - ifstream referenceFile (referenceFilePath); - ofstream newRefFile (newRefFilePath); - if (lextorFile.is_open () && chunkerFile.is_open () && referenceFile.is_open () - && newRefFile.is_open ()) - { - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); - - if (string (result.description ()) != "No error") - { - cout << "ERROR : " << result.description () << endl; - return -1; - } - - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child ("transfer"); - - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); - - unsigned i = 0; - string tokenizedSentence, refSent; - while (getline (lextorFile, tokenizedSentence) && getline (referenceFile, refSent)) - { - cout << i++ << endl; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, - tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); - // final outs - vector outs; - // number of possible combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - - // rules combinations - vector 
> combNodes; - - nodesPool = RuleExecution::getNodesPool (tokenRules); - - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, - spaces); - - // write the outs - for (unsigned j = 0; j < outs.size (); j++) - { - chunkerFile << outs[j] << endl; - newRefFile << refSent << endl; - } - - chunkerFile << endl; - newRefFile << endl; - - // delete AmbigInfo pointers - for (unsigned j = 0; j < ambigInfo.size (); j++) - { - // delete the dummy node pointers - set dummies; - for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++) - dummies.insert (ambigInfo[j]->combinations[k][0]); - for (set::iterator it = dummies.begin (); - it != dummies.end (); it++) - delete (*it); - - delete ambigInfo[j]; - } - // delete Node pointers - for (map >::iterator it = - nodesPool.begin (); it != nodesPool.end (); it++) - { - for (unsigned j = 0; j < it->second.size (); j++) - { - delete it->second[j]; - } - } - } - - lextorFile.close (); - chunkerFile.close (); - referenceFile.close (); - newRefFile.close (); - cout << "CombAlign finished!"; - } - else - { - cout << "ERROR in opening files!" << endl; - } - - return 0; -} +// transferFilePath = +// "/home/aboelhamd/apertium-eng-spa-ambiguous-rules/apertium-eng-spa.spa-eng.t1x"; +// lextorFilePath = +// "/home/aboelhamd/eclipse-workspace/machinetranslation/test-lextor.txt"; +// chunkerFilePath = +// "/home/aboelhamd/eclipse-workspace/machinetranslation/test-chunker.txt"; +// referenceFilePath = +// "/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test.txt"; +// newRefFilePath = +// "/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test-mul.txt"; +// +// cout << "Error in parameters !" << endl; +// cout << "Parameters are : localeId transferFilePath lextorFilePath" +// << " chunkerFilePath referenceFilePath newRefFilePath" << endl; +// cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout << "chunkerFilePath : chunker file path (output of this program and" +// << " input for apertium interchunk)." << endl; +// cout << "referenceFilePath : Reference parallel target translation file path." +// << endl; +// cout << "newRefFilePath : New aligned reference file path." 
<< endl; +//// return -1; +// } +// +// ifstream lextorFile (lextorFilePath.c_str ()); +// ofstream chunkerFile (chunkerFilePath.c_str ()); +// ifstream referenceFile (referenceFilePath); +// ofstream newRefFile (newRefFilePath); +// if (lextorFile.is_open () && chunkerFile.is_open () && referenceFile.is_open () +// && newRefFile.is_open ()) +// { +// // load transfer file in an xml document object +// xml_document transferDoc; +// xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); +// +// if (string (result.description ()) != "No error") +// { +// cout << "ERROR : " << result.description () << endl; +// return -1; +// } +// +// // xml node of the parent node (transfer) in the transfer file +// xml_node transfer = transferDoc.child ("transfer"); +// +// map > > attrs = RuleParser::getAttrs (transfer); +// map vars = RuleParser::getVars (transfer); +// map > lists = RuleParser::getLists (transfer); +// +// unsigned i = 0; +// string tokenizedSentence, refSent; +// while (getline (lextorFile, tokenizedSentence) && getline (referenceFile, refSent)) +// { +// cout << i++ << endl; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, +// tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, +// tlTags, rulesApplied, attrs, lists, &vars, spaces, +// localeId); +// // final outs +// vector outs; +// // number of possible combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// +// // rules combinations +// vector > combNodes; +// +// nodesPool = RuleExecution::getNodesPool (tokenRules); +// +// RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); +// RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, +// spaces); +// +// // write the outs +// for (unsigned j = 0; j < outs.size (); j++) +// { +// chunkerFile << outs[j] << endl; +// newRefFile << refSent << endl; +// } +// +// chunkerFile << endl; +// newRefFile << endl; +// +// // delete AmbigInfo pointers +// for (unsigned j = 0; j < ambigInfo.size (); j++) +// { +// // delete the dummy node pointers +// set dummies; +// for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++) +// dummies.insert (ambigInfo[j]->combinations[k][0]); +// for (set::iterator it = dummies.begin (); +// it != dummies.end (); it++) +// delete (*it); +// +// delete ambigInfo[j]; +// } +// // delete Node pointers +// for (map >::iterator it = +// nodesPool.begin (); it != nodesPool.end (); it++) +// { +// for (unsigned j = 0; j < it->second.size (); j++) +// { +// delete it->second[j]; +// } +// } +// } +// +// 
lextorFile.close (); +// chunkerFile.close (); +// referenceFile.close (); +// newRefFile.close (); +// cout << "CombAlign finished!"; +// } +// else +// { +// cout << "ERROR in opening files!" << endl; +// } +// +// return 0; +//} diff --git a/src/LangModAnalysis.cpp b/src/LangModAnalysis.cpp index 8601ca7..47ecb90 100644 --- a/src/LangModAnalysis.cpp +++ b/src/LangModAnalysis.cpp @@ -1,376 +1,240 @@ -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -// -//#include "../pugixml/pugixml.hpp" -//#include "RuleParser.h" -//#include "RuleExecution.h" -//#include "TranElemLiterals.h" -//#include "CLExec.h" -// -//#include -// -//using namespace std; -//using namespace pugi; -//using namespace elem; -// -//int -//main (int argc, char **argv) -//{ -// string sentenceFilePath, lextorFilePath, localeId, transferFilePath, -// transferOutFilePath, weightFilePath, outputFilePath, bestModFilePath, -// randModFilePath; -// -// if (argc == 10) -// { -// localeId = argv[1]; -// transferFilePath = argv[2]; -// sentenceFilePath = argv[3]; -// lextorFilePath = argv[4]; -// -// transferOutFilePath = argv[5]; -// weightFilePath = argv[6]; -// -// outputFilePath = argv[7]; -// bestModFilePath = argv[8]; -// randModFilePath = argv[9]; -// } -// else -// { -//// localeId = "es_ES"; -//// transferFilePath = "transferFile.t1x"; -//// sentenceFilePath = "spa-test.txt"; -//// lextorFilePath = "spa-test.lextor"; -//// interInFilePath = "beaminter.out"; -//// modelsDest = "modelstry"; -//// k = "8"; -// -//// localeId = "kk_KZ"; -//// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; -//// sentenceFilePath = "sample-sentences.txt"; -//// lextorFilePath = "sample-lextor.txt"; -//// -//// transferOutFilePath = "sample-transfer.txt"; -//// weightFilePath = "sample-weights.txt"; -//// -//// outputFilePath = "outAnalysis.txt"; -//// bestModFilePath = "bestModFile.txt"; -//// randModFilePath = "randModFile.txt"; -// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../pugixml/pugixml.hpp" +#include "RuleParser.h" +#include "RuleExecution.h" +#include "TranElemLiterals.h" +#include "CLExec.h" + +#include + +using namespace std; +using namespace pugi; +using namespace elem; + +int +main (int argc, char **argv) +{ + string lextorFilePath, localeId, transferFilePath, transferOutFilePath, weightFilePath, + bestModFilePath; + + if (argc == 7) + { + localeId = argv[1]; + transferFilePath = argv[2]; + lextorFilePath = argv[3]; + + transferOutFilePath = argv[4]; + weightFilePath = argv[5]; + + bestModFilePath = argv[6]; + } + else + { // localeId = "es_ES"; -// transferFilePath = "transferFile3.t1x"; -// sentenceFilePath = "spa-toknizer.txt"; -// lextorFilePath = "spa-lextor.txt"; -// -// transferOutFilePath = "spa-transfer.txt"; -// weightFilePath = "spa-weight.txt"; +// transferFilePath = "transferFile.t1x"; +// sentenceFilePath = "spa-test.txt"; +// lextorFilePath = "spa-test.lextor"; +// interInFilePath = "beaminter.out"; +// modelsDest = "modelstry"; +// k = "8"; + +// localeId = "kk_KZ"; +// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +// sentenceFilePath = "sample-sentences.txt"; +// lextorFilePath = "sample-lextor.txt"; +// +// transferOutFilePath = "sample-transfer.txt"; +// weightFilePath = "sample-weights.txt"; // // outputFilePath = "outAnalysis.txt"; // bestModFilePath = 
"bestModFile.txt"; // randModFilePath = "randModFile.txt"; -// -// cout << "Error in parameters !" << endl; -// cout -// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath weightOutFilePath outputFilePath bestModFilePath randModFilePath" -// << endl; -// cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" -// << endl; -// cout << "transferFilePath : Apertium transfer file of the language pair used." -// << endl; -// cout << "sentenceFilePath : Source language sentences file." << endl; -// cout << "lextorFilePath : Apertium lextor file for the source language sentences." -// << endl; -// cout -// << "transferOutFilePath : Output file of apertium transfer for the source language sentences." -// << endl; -// cout -// << "weightOutFilePath : Language model weights file for the source language sentences." -// << endl; -// cout -// << "outputFilePath : First output file name of this program which is the complete analysis for the source language sentences." -// << endl; -// cout -// << "bestModFilePath : Second output file name which is the best (language model) translations for the source language sentences." -// << endl; -// cout -// << "randModFilePath : Third output file name which is random translations from (language model) for the source language sentences." -// << endl; -// return -1; -// } -// -// // seed for randomness -// srand (time (NULL)); -// -// ifstream lextorFile (lextorFilePath.c_str ()); -// ifstream inSentenceFile (sentenceFilePath.c_str ()); -// if (lextorFile.is_open () && inSentenceFile.is_open ()) -// { -// // load transfer file in an xml document object -// xml_document transferDoc; -// xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); -// -// if (string (result.description ()) != "No error") -// { -// cout << "ERROR : " << result.description () << endl; -// return -1; -// } -// -// // xml node of the parent node (transfer) in the transfer file -// xml_node transfer = transferDoc.child ("transfer"); -// -// vector sourceSentences, tokenizedSentences; -// -// string tokenizedSentence; -// while (getline (lextorFile, tokenizedSentence)) -// { -// string sourceSentence; -// if (!getline (inSentenceFile, sourceSentence)) -// sourceSentence = "No more sentences"; -// -// sourceSentences.push_back (sourceSentence); -// tokenizedSentences.push_back (tokenizedSentence); -// } -// lextorFile.close (); -// inSentenceFile.close (); -// -// map > > attrs = RuleParser::getAttrs (transfer); -// map vars = RuleParser::getVars (transfer); -// map > lists = RuleParser::getLists (transfer); -// -// // empty output files -// ofstream outputFile (outputFilePath.c_str ()); -// outputFile.close (); -// ofstream bestModFile (bestModFilePath.c_str ()); -// bestModFile.close (); -// ofstream randModFile (randModFilePath.c_str ()); -// randModFile.close (); -// -// ifstream weightFile (weightFilePath.c_str ()); -// ifstream transferOutFile (transferOutFilePath.c_str ()); -// -// if (weightFile.is_open () && transferOutFile.is_open ()) -// for (unsigned i = 0; i < sourceSentences.size (); i++) -// { -// cout << i << endl; -// -// string sourceSentence, tokenizedSentence; -// sourceSentence = sourceSentences[i]; -// tokenizedSentence = tokenizedSentences[i]; -// -// // spaces after each token -// vector spaces; -// -// // tokens in the sentence order -// vector slTokens, tlTokens; -// -// // tags of tokens in order -// vector > slTags, tlTags; -// -// RuleParser::sentenceTokenizer (&slTokens, 
&tlTokens, &slTags, &tlTags, -// &spaces, tokenizedSentence); -// -// // map of tokens ids and their matched categories -// map > catsApplied; -// -// RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); -// -// // map of matched rules and a pair of first token id and patterns number -// map > > rulesApplied; -// -// RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); -// -// // rule and (target) token map to specific output -// // if rule has many patterns we will choose the first token only -// map > ruleOutputs; -// -// // map (target) token to all matched rules ids and the number of pattern items of each rule -// map > > tokenRules; -// -// RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, -// tlTokens, tlTags, rulesApplied, attrs, lists, &vars, -// spaces, localeId); -// -// // final outputs -// vector normOuts; -// // number of generated combinations -// unsigned compNum; -// // nodes for every token and rule -// map > nodesPool; -// // ambiguous informations -// vector ambigInfo; -// // rules combinations -// vector > normCombNodes; -// -// nodesPool = RuleExecution::getNodesPool (tokenRules); -// -// RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); -// -// RuleExecution::getOuts (&normOuts, &normCombNodes, ambigInfo, nodesPool, -// ruleOutputs, spaces); -// -// // read weights -// string line; -// vector normWeights; -// for (unsigned j = 0; j < normOuts.size (); j++) -// { -// getline (weightFile, line); -// float weight = strtof (line.c_str (), NULL); -// normWeights.push_back (weight); -// } -// -// // read transfer -// vector normTransfers; -// for (unsigned j = 0; j < normOuts.size (); j++) -// { -// getline (transferOutFile, line); -// normTransfers.push_back (line); -// } -// -// // remove redundant outputs -// vector outs; -// vector > combNodes; -// vector weights; -// vector transfers; -// for (unsigned j = 0; j < normOuts.size (); j++) -// if (find (outs.begin (), outs.end (), normOuts[j]) == outs.end ()) -// { -// outs.push_back (normOuts[j]); -// combNodes.push_back (normCombNodes[j]); -// weights.push_back (normWeights[j]); -// transfers.push_back (normTransfers[j]); -// } -// normOuts = outs; -// normCombNodes = combNodes; -// normWeights = weights; -// normTransfers = transfers; -// -// // normalize weights -// RuleExecution::normaliseWeights (&normWeights); -// -// // write normal outputs -// ofstream outputFile (outputFilePath.c_str (), ofstream::app); -// if (outputFile.is_open ()) -// { -// outputFile << "Analysis of sentence : " << endl; -// outputFile << sourceSentence << endl << endl << endl; -// -// outputFile << endl; -// outputFile << "sentence id ||| coverage id ||| original sentence |||" -// << " lextor ||| rules ||| chunker ||| final sentence ||| score" -// << endl << endl; -// -// for (unsigned j = 0; j < normWeights.size (); j++) -// { -// // sentence id -// outputFile << (i + 1) << " ||| "; -// // coverage id -// outputFile << (j + 1) << " ||| "; -// // original sentence -// outputFile << sourceSentence << " ||| "; -// // lextor -// outputFile << tokenizedSentence << " ||| "; -// // rules -// for (unsigned k = 0; k < normCombNodes[j].size (); k++) -// if (normCombNodes[j][k]->ruleId) -// outputFile << normCombNodes[j][k]->ruleId << " "; -// outputFile << "||| "; -// // chuncker -// outputFile << normOuts[j] << " ||| "; -// // final sentence -// outputFile << normTransfers[j] << " ||| "; -// // score -// outputFile << normWeights[j] << endl << endl; -// } -// -// 
outputFile -// << "---------------------------------------------------------------------------------------------------------" -// << endl << endl; -// -// outputFile.close (); -// } -// -// // Model weighting -// // best weight -// ofstream bestModFile (bestModFilePath.c_str (), ofstream::app); -// if (bestModFile.is_open ()) -// { -//// bestModFile -//// << "---------------------------------------------------------------------------------------------------------" -//// << endl << endl; -// -//// bestModFile << (i + 1) << endl; -//// bestModFile << "Source : " << sourceSentence << endl << endl; -// -// unsigned maxInd = 0; -// for (unsigned j = 1; j < normWeights.size (); j++) -// { -// if (normWeights[j] > normWeights[maxInd]) -// maxInd = j; -// } -// -// // final sentence -// bestModFile /*<< "Target : "*/ << normTransfers[maxInd] << endl; -// // score -//// bestModFile << "Weight : " << normWeights[maxInd] << endl; -// // rules -//// bestModFile << "Rules : "; -//// for (unsigned k = 0; k < normCombNodes[maxInd].size (); k++) -//// if (normCombNodes[maxInd][k]->ruleId) -//// bestModFile << normCombNodes[maxInd][k]->ruleId << " "; -//// -//// bestModFile << endl -//// << "---------------------------------------------------------------------------------------------------------" -//// << endl << endl << endl; -// } -// bestModFile.close (); -// -// // Random weight -// ofstream randModFile (randModFilePath.c_str (), ofstream::app); -// if (randModFile.is_open ()) -// { -// randModFile << (i + 1) << endl; -// randModFile << "Source : " << sourceSentence << endl << endl; -// -// int random = rand () % normWeights.size (); -// -// // final sentence -// randModFile << "Target : " << normTransfers[random] << endl; -// // score -// randModFile << "Weight : " << normWeights[random] << endl; -// // rules -// randModFile << "Rules : "; -// for (unsigned k = 0; k < normCombNodes[random].size (); k++) -// if (normCombNodes[random][k]->ruleId) -// randModFile << normCombNodes[random][k]->ruleId << " "; -// -// randModFile << endl -// << "---------------------------------------------------------------------------------------------------------" -// << endl << endl << endl; -// } -// randModFile.close (); -// } -// else -// { -// cout << "ERROR in opening files!" << endl; -// } -// weightFile.close (); -// transferOutFile.close (); -// } -// else -// { -// cout << "ERROR in opening files!" << endl; -// } -// return 0; -//} + + localeId = "es_ES"; + transferFilePath = "transferFile3.t1x"; + lextorFilePath = "spa-lextor.txt"; + + transferOutFilePath = "spa-transfer.txt"; + weightFilePath = "spa-weight.txt"; + + bestModFilePath = "bestModFile.txt"; + + cout << "Error in parameters !" << endl; + cout << "Parameters are : localeId transferFilePath lextorFilePath" + << " transferOutFilePath weightOutFilePath bestModFilePath" << endl; + cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + << endl; + cout << "transferFilePath : Apertium transfer file of the language pair used." + << endl; + cout << "lextorFilePath : Apertium lextor file for the source language sentences." + << endl; + cout + << "transferOutFilePath : Output file of apertium transfer for the source language sentences." + << endl; + cout + << "weightOutFilePath : Language model weights file for the source language sentences." + << endl; + cout + << "bestModFilePath : Second output file name which is the best (language model) translations for the source language sentences." 
+ << endl; + return -1; + } + + ifstream lextorFile (lextorFilePath.c_str ()); + ifstream weightFile (weightFilePath.c_str ()); + ifstream transferOutFile (transferOutFilePath.c_str ()); + ofstream bestModFile (bestModFilePath.c_str ()); + + if (lextorFile.is_open () && weightFile.is_open () && transferOutFile.is_open () + && bestModFile.is_open ()) + { + // load transfer file in an xml document object + xml_document transferDoc; + xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); + + if (string (result.description ()) != "No error") + { + cout << "ERROR : " << result.description () << endl; + return -1; + } + + // xml node of the parent node (transfer) in the transfer file + xml_node transfer = transferDoc.child ("transfer"); + + map > > attrs = RuleParser::getAttrs (transfer); + map vars = RuleParser::getVars (transfer); + map > lists = RuleParser::getLists (transfer); + + string tokenizedSentence; + // unsigned i = 0; + while (getline (lextorFile, tokenizedSentence)) + { + // cout << i << endl; + + // spaces after each token + vector spaces; + + // tokens in the sentence order + vector slTokens, tlTokens; + + // tags of tokens in order + vector > slTags, tlTags; + + RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, + tokenizedSentence); + + // map of tokens ids and their matched categories + map > catsApplied; + + RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); + + // map of matched rules and a pair of first token id and patterns number + map > > rulesApplied; + + RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); + + // rule and (target) token map to specific output + // if rule has many patterns we will choose the first token only + map > ruleOutputs; + + // map (target) token to all matched rules ids and the number of pattern items of each rule + map > > tokenRules; + + RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, + tlTags, rulesApplied, attrs, lists, &vars, spaces, + localeId); + + // final outputs + vector normOuts; + // number of generated combinations + unsigned compNum; + // nodes for every token and rule + map > nodesPool; + // ambiguous informations + vector ambigInfo; + // rules combinations + vector > normCombNodes; + + nodesPool = RuleExecution::getNodesPool (tokenRules); + + RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); + + RuleExecution::getOuts (&normOuts, &normCombNodes, ambigInfo, nodesPool, + ruleOutputs, spaces); + + // read weights + string line; + vector normWeights; + for (unsigned j = 0; j < normOuts.size (); j++) + { + getline (weightFile, line); + float weight = strtof (line.c_str (), NULL); + normWeights.push_back (weight); + } + + // read transfer + vector normTransfers; + for (unsigned j = 0; j < normOuts.size (); j++) + { + getline (transferOutFile, line); + normTransfers.push_back (line); + } + + // remove redundant outputs + vector outs; + vector > combNodes; + vector weights; + vector transfers; + for (unsigned j = 0; j < normOuts.size (); j++) + if (find (outs.begin (), outs.end (), normOuts[j]) == outs.end ()) + { + outs.push_back (normOuts[j]); + combNodes.push_back (normCombNodes[j]); + weights.push_back (normWeights[j]); + transfers.push_back (normTransfers[j]); + } + normOuts = outs; + normCombNodes = combNodes; + normWeights = weights; + normTransfers = transfers; + + // normalize weights + RuleExecution::normaliseWeights (&normWeights); + + // best model weight + unsigned maxInd = 0; + 
for (unsigned j = 1; j < normWeights.size (); j++)
+	    {
+	      if (normWeights[j] > normWeights[maxInd])
+		maxInd = j;
+	    }
+	  bestModFile << normTransfers[maxInd] << endl;
+
+	}
+
+      weightFile.close ();
+      transferOutFile.close ();
+      lextorFile.close ();
+      bestModFile.close ();
+    }
+  else
+    {
+      cout << "ERROR in opening files!" << endl;
+    }
+  return 0;
+}
diff --git a/src/OrderAmbigSents.cpp b/src/OrderAmbigSents.cpp
index 84896e3..f54096f 100644
--- a/src/OrderAmbigSents.cpp
+++ b/src/OrderAmbigSents.cpp
@@ -122,7 +122,7 @@ main (int argc, char **argv)
     {
       cout << i++ << endl;
 
-// spaces after each token
+      // spaces after each token
       vector<string> spaces;
 
       // tokens in the sentence order
diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp
index 13476af..8285a19 100644
--- a/src/RulesApplier.cpp
+++ b/src/RulesApplier.cpp
@@ -70,7 +70,7 @@ main (int argc, char **argv)
       cout
	  << "interInFilePath : Output file name of this program which is the input for apertium interchunk."
	  << endl;
-//      return -1;
+      return -1;
     }
 
  ifstream lextorFile (lextorFilePath.c_str ());
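
Note: the selection step this commit adds to LangModAnalysis.cpp — read one LM weight and one transfer line per candidate coverage, drop duplicate chunker outputs, normalise the weights, then write the single highest-weighted translation to bestModFile — can be exercised in isolation. Below is a minimal standalone sketch of that step, not the repository's code: the element types of the containers are assumptions (the diff above lost its template arguments to angle-bracket stripping), divide-by-sum is an assumed behaviour for RuleExecution::normaliseWeights (its source is not in this diff), and the names uniqOuts/uniqTransfers/uniqWeights and the toy candidate data are hypothetical.

// Standalone sketch of the best-LM-candidate selection, under the
// assumptions stated above. Compiles as C++11; toy data stands in for
// the per-sentence candidates read from the weight and transfer files.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int
main ()
{
  // one entry per candidate coverage: chunker output, transfer output,
  // LM weight (toy values; in the tool these come from normOuts,
  // transferOutFile and weightFile respectively)
  std::vector<std::string> outs = { "chunk-a", "chunk-b", "chunk-a" };
  std::vector<std::string> transfers = { "translation A", "translation B",
					 "translation A2" };
  std::vector<float> weights = { 0.2f, 0.5f, 0.3f };

  // remove redundant outputs, keeping the first occurrence of each
  // chunker output, exactly as the dedup loop in the diff does
  std::vector<std::string> uniqOuts, uniqTransfers;
  std::vector<float> uniqWeights;
  for (unsigned j = 0; j < outs.size (); j++)
    if (std::find (uniqOuts.begin (), uniqOuts.end (), outs[j])
	== uniqOuts.end ())
      {
	uniqOuts.push_back (outs[j]);
	uniqTransfers.push_back (transfers[j]);
	uniqWeights.push_back (weights[j]);
      }

  // normalise weights to sum to 1; ASSUMPTION: this is what
  // RuleExecution::normaliseWeights does
  float sum = 0;
  for (float w : uniqWeights)
    sum += w;
  if (sum > 0)
    for (float &w : uniqWeights)
      w /= sum;

  // pick the highest-weighted candidate; this is the argmax loop the
  // diff writes to bestModFile
  unsigned maxInd = 0;
  for (unsigned j = 1; j < uniqWeights.size (); j++)
    if (uniqWeights[j] > uniqWeights[maxInd])
      maxInd = j;

  std::cout << uniqTransfers[maxInd] << std::endl;
  return 0;
}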
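Deduplicating before normalising matters: duplicate chunker outputs would otherwise each carry their own weight, so the normalised scores would be spread over repeated coverages rather than summing to one over distinct ones. The argmax pick mirrors the change to BeamSearch.cpp above, which now erases everything after the first (presumably best-scoring) entry of beamTree so that only one translation per sentence reaches the apertium interchunk input.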