commit 22e024c58e63a3c4ff3c6356703359f2352963a9
Author: aboelhamd
Date:   Fri May 3 00:34:52 2019 +0200

    Some changes to adapt for evaluation

diff --git a/src/BeamResult.cpp b/src/BeamResult.cpp
index cd34a7e..b873638 100644
--- a/src/BeamResult.cpp
+++ b/src/BeamResult.cpp
@@ -1,259 +1,259 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "../pugixml/pugixml.hpp"
-#include "RuleParser.h"
-#include "RuleExecution.h"
-#include "TranElemLiterals.h"
-#include "CLExec.h"
-
-#include
-
-using namespace std;
-using namespace pugi;
-using namespace elem;
-
-int
-main (int argc, char **argv)
-{
-  string sentenceFilePath, lextorFilePath, localeId, transferFilePath, modelsDest,
-      beamSize, transferOutFilePath, beamOutFilePath;
-
-  if (argc == 9)
-    {
-      localeId = argv[1];
-      transferFilePath = argv[2];
-      sentenceFilePath = argv[3];
-      lextorFilePath = argv[4];
-
-      transferOutFilePath = argv[5];
-      beamOutFilePath = argv[6];
-
-      modelsDest = argv[7];
-      beamSize = argv[8];
-    }
-  else
-    {
-//      localeId = "es_ES";
-//      transferFilePath = "transferFile.t1x";
-//      sentenceFilePath = "spa-test.txt";
-//      lextorFilePath = "spa-test.lextor";
-//      interInFilePath = "beaminter.out";
-//      modelsDest = "modelstry";
-//      k = "8";
-
-      localeId = "kk_KZ";
-      transferFilePath = "apertium-kaz-tur.kaz-tur.t1x";
-      sentenceFilePath = "src.txt";
-      lextorFilePath = "lextor.txt";
-
-      transferOutFilePath = "beam-transfer.txt";
-      beamOutFilePath = "beamOutFile.txt";
-
-      modelsDest = "./UntitledFolder/models";
-      beamSize = "8";
-
-      cout << "Error in parameters !" << endl;
-      cout
-          << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath beamOutFilePath modelsDest beamSize"
-          << endl;
-      cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ"
-          << endl;
-      cout << "transferFilePath : Apertium transfer file of the language pair used."
-          << endl;
-      cout << "sentenceFilePath : Source language sentences file." << endl;
-      cout << "lextorFilePath : Apertium lextor file for the source language sentences."
-          << endl;
-      cout
-          << "transferOutFilePath : Output file of apertium transfer for the source language sentences."
-          << endl;
-      cout
-          << "beamOutFilePath : Output file name of this program which is the best translations for the language sentences."
-          << endl;
-      cout << "modelsDest : Yasmet models destination." << endl;
-      cout << "beamSize : The size of beam in beam search algorithm."
-          << endl;
-      return -1;
-    }
-
-  // seed for randomness
-  srand (time (NULL));
-
-  ifstream lextorFile (lextorFilePath.c_str ());
-  ifstream inSentenceFile (sentenceFilePath.c_str ());
-  if (lextorFile.is_open () && inSentenceFile.is_open ())
-    {
-      // load transfer file in an xml document object
-      xml_document transferDoc;
-      xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ());
-
-      if (string (result.description ()) != "No error")
-        {
-          cout << "ERROR : " << result.description () << endl;
-          return -1;
-        }
-
-      // xml node of the parent node (transfer) in the transfer file
-      xml_node transfer = transferDoc.child ("transfer");
-
-      vector<string> sourceSentences, tokenizedSentences;
-
-      string tokenizedSentence;
-      while (getline (lextorFile, tokenizedSentence))
-        {
-          string sourceSentence;
-          if (!getline (inSentenceFile, sourceSentence))
-            sourceSentence = "No more sentences";
-
-          sourceSentences.push_back (sourceSentence);
-          tokenizedSentences.push_back (tokenizedSentence);
-        }
-      lextorFile.close ();
-      inSentenceFile.close ();
-
-      map<string, vector<vector<string> > > attrs = RuleParser::getAttrs (transfer);
-      map<string, string> vars = RuleParser::getVars (transfer);
-      map<string, vector<string> > lists = RuleParser::getLists (transfer);
-
-      map<string, map<string, vector<float> > > classesWeights =
-          CLExec::loadYasmetModels (modelsDest);
-
-//      vector<vector<string> > vouts;
-
-      int beam;
-      stringstream buffer (beamSize);
-      buffer >> beam;
-
-      // empty the output file
-      ofstream beamFile (beamOutFilePath.c_str ());
-      beamFile.close ();
-
-      ifstream transferOutFile (transferOutFilePath.c_str ());
-
-      if (transferOutFile.is_open ())
-        for (unsigned i = 0; i < sourceSentences.size (); i++)
-          {
-            cout << i << endl;
-
-            string sourceSentence, tokenizedSentence;
-            sourceSentence = sourceSentences[i];
-            tokenizedSentence = tokenizedSentences[i];
-
-            // spaces after each token
-            vector<string> spaces;
-
-            // tokens in the sentence order
-            vector<string> slTokens, tlTokens;
-
-            // tags of tokens in order
-            vector<vector<string> > slTags, tlTags;
-
-            RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags,
-                                           &spaces, tokenizedSentence);
-
-            // map of tokens ids and their matched categories
-            map<unsigned, vector<string> > catsApplied;
-
-            RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer);
-
-            // map of matched rules and a pair of first token id and patterns number
-            map<xml_node, vector<pair<unsigned, unsigned> > > rulesApplied;
-
-            RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer);
-
-            // rule and (target) token map to specific output
-            // if rule has many patterns we will choose the first token only
-            map<unsigned, map<unsigned, string> > ruleOutputs;
-
-            // map (target) token to all matched rules ids and the number of pattern items of each rule
-            map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules;
-
-            RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags,
-                                     tlTokens, tlTags, rulesApplied, attrs, lists, &vars,
-                                     spaces, localeId);
-
-            // final outputs
-            vector<string> outs;
-            // number of generated combinations
-            unsigned compNum;
-            // nodes for every token and rule
-            map<unsigned, vector<RuleExecution::Node*> > nodesPool;
-            // ambiguous informations
-            vector<RuleExecution::AmbigInfo*> ambigInfo;
-            // beam tree
-            vector<pair<vector<RuleExecution::Node*>, float> > beamTree;
-            // rules combinations
-            vector<vector<RuleExecution::Node*> > combNodes;
-
-            nodesPool = RuleExecution::getNodesPool (tokenRules);
-
-            RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum);
-
-            vector<RuleExecution::AmbigInfo*> newAmbigInfo;
-            for (unsigned j = 0; j < ambigInfo.size (); j++)
-              if (ambigInfo[j]->combinations.size () > 1)
-                newAmbigInfo.push_back (ambigInfo[j]);
-
-            CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights,
-                                localeId);
-
-            RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs,
-                                    spaces);
-
-            // 
read transfer - string line; - vector beamTransfers; - for (unsigned j = 0; j < outs.size (); j++) - { - getline (transferOutFile, line); - beamTransfers.push_back (line); - } - - // write beam results - ofstream beamFile (beamOutFilePath.c_str (), ofstream::app); - if (beamFile.is_open ()) - { - beamFile << "source sentence (" << (i + 1) << ") : " << endl; - beamFile << sourceSentence << endl << endl; - // just take first best - for (unsigned j = 0; j < /*outs.size ()*/1; j++) - { - beamFile << "target sentence " /*<< (j + 1)*/<< " : " << endl; - beamFile << beamTransfers[j] << endl; - beamFile << "weight = " << beamTree[j].second << endl; - beamFile << "rules : "; - for (unsigned k = 0; k < combNodes[j].size (); k++) - if (combNodes[j][k]->ruleId) - beamFile << combNodes[j][k]->ruleId << " "; - beamFile << endl << endl; - beamFile - << "------------------------------------------------------------------" - << endl << endl; - } - } - beamFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - transferOutFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - return 0; -} +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "../pugixml/pugixml.hpp" +//#include "RuleParser.h" +//#include "RuleExecution.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//#include +// +//using namespace std; +//using namespace pugi; +//using namespace elem; +// +//int +//main (int argc, char **argv) +//{ +// string sentenceFilePath, lextorFilePath, localeId, transferFilePath, modelsDest, +// beamSize, transferOutFilePath, beamOutFilePath; +// +// if (argc == 9) +// { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// sentenceFilePath = argv[3]; +// lextorFilePath = argv[4]; +// +// transferOutFilePath = argv[5]; +// beamOutFilePath = argv[6]; +// +// modelsDest = argv[7]; +// beamSize = argv[8]; +// } +// else +// { +//// localeId = "es_ES"; +//// transferFilePath = "transferFile.t1x"; +//// sentenceFilePath = "spa-test.txt"; +//// lextorFilePath = "spa-test.lextor"; +//// interInFilePath = "beaminter.out"; +//// modelsDest = "modelstry"; +//// k = "8"; +// +// localeId = "kk_KZ"; +// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +// sentenceFilePath = "src.txt"; +// lextorFilePath = "lextor.txt"; +// +// transferOutFilePath = "beam-transfer.txt"; +// beamOutFilePath = "beamOutFile.txt"; +// +// modelsDest = "./UntitledFolder/models"; +// beamSize = "8"; +// +// cout << "Error in parameters !" << endl; +// cout +// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath beamOutFilePath modelsDest beamSize" +// << endl; +// cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "sentenceFilePath : Source language sentences file." << endl; +// cout << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout +// << "transferOutFilePath : Output file of apertium transfer for the source language sentences." +// << endl; +// cout +// << "beamOutFilePath : Output file name of this program which is the best translations for the language sentences." +// << endl; +// cout << "modelsDest : Yasmet models destination." 
<< endl; +// cout << "beamSize : The size of beam in beam search algorithm." << endl; +// return -1; +// } +// +// // seed for randomness +// srand (time (NULL)); +// +// ifstream lextorFile (lextorFilePath.c_str ()); +// ifstream inSentenceFile (sentenceFilePath.c_str ()); +// if (lextorFile.is_open () && inSentenceFile.is_open ()) +// { +// // load transfer file in an xml document object +// xml_document transferDoc; +// xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); +// +// if (string (result.description ()) != "No error") +// { +// cout << "ERROR : " << result.description () << endl; +// return -1; +// } +// +// // xml node of the parent node (transfer) in the transfer file +// xml_node transfer = transferDoc.child ("transfer"); +// +// vector sourceSentences, tokenizedSentences; +// +// string tokenizedSentence; +// while (getline (lextorFile, tokenizedSentence)) +// { +// string sourceSentence; +// if (!getline (inSentenceFile, sourceSentence)) +// sourceSentence = "No more sentences"; +// +// sourceSentences.push_back (sourceSentence); +// tokenizedSentences.push_back (tokenizedSentence); +// } +// lextorFile.close (); +// inSentenceFile.close (); +// +// map > > attrs = RuleParser::getAttrs (transfer); +// map vars = RuleParser::getVars (transfer); +// map > lists = RuleParser::getLists (transfer); +// +// map > > classesWeights = +// CLExec::loadYasmetModels (modelsDest); +// +//// vector > vouts; +// +// int beam; +// stringstream buffer (beamSize); +// buffer >> beam; +// +// // empty the output file +// ofstream beamFile (beamOutFilePath.c_str ()); +// beamFile.close (); +// +// ifstream transferOutFile (transferOutFilePath.c_str ()); +// +// if (transferOutFile.is_open ()) +// for (unsigned i = 0; i < sourceSentences.size (); i++) +// { +// cout << i << endl; +// +// string sourceSentence, tokenizedSentence; +// sourceSentence = sourceSentences[i]; +// tokenizedSentence = tokenizedSentences[i]; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, +// &spaces, tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, +// tlTokens, tlTags, rulesApplied, attrs, lists, &vars, +// spaces, localeId); +// +// // final outputs +// vector outs; +// // number of generated combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // beam tree +// vector, float> > beamTree; +// // rules combinations +// vector > combNodes; +// +// nodesPool = RuleExecution::getNodesPool (tokenRules); +// +// RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); +// +// vector newAmbigInfo; +// for (unsigned j 
= 0; j < ambigInfo.size (); j++) +// if (ambigInfo[j]->combinations.size () > 1) +// newAmbigInfo.push_back (ambigInfo[j]); +// +// CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, +// localeId); +// +// RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, +// spaces); +// +// // read transfer +// string line; +// vector beamTransfers; +// for (unsigned j = 0; j < outs.size (); j++) +// { +// getline (transferOutFile, line); +// beamTransfers.push_back (line); +// } +// +// // write beam results +// ofstream beamFile (beamOutFilePath.c_str (), ofstream::app); +// if (beamFile.is_open ()) +// { +// beamFile << "source sentence (" << (i + 1) << ") : " << endl; +// beamFile << sourceSentence << endl << endl; +// // just take first best +// for (unsigned j = 0; j < /*outs.size ()*/1; j++) +// { +// beamFile << "target sentence " /*<< (j + 1)*/<< " : " << endl; +// beamFile << beamTransfers[j] << endl; +// beamFile << "weight = " << beamTree[j].second << endl; +// beamFile << "rules : "; +// for (unsigned k = 0; k < combNodes[j].size (); k++) +// if (combNodes[j][k]->ruleId) +// beamFile << combNodes[j][k]->ruleId << " "; +// beamFile << endl << endl; +// beamFile +// << "------------------------------------------------------------------" +// << endl << endl; +// } +// } +// beamFile.close (); +// } +// else +// { +// cout << "ERROR in opening files!" << endl; +// } +// transferOutFile.close (); +// } +// else +// { +// cout << "ERROR in opening files!" << endl; +// } +// return 0; +//} diff --git a/src/BeamSearch.cpp b/src/BeamSearch.cpp index 8a396c6..f2dffb6 100644 --- a/src/BeamSearch.cpp +++ b/src/BeamSearch.cpp @@ -1,186 +1,186 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../pugixml/pugixml.hpp" -#include "RuleParser.h" -#include "RuleExecution.h" -#include "TranElemLiterals.h" -#include "CLExec.h" - -#include - -using namespace std; -using namespace pugi; -using namespace elem; - -int -main (int argc, char **argv) -{ - string lextorFilePath, interInFilePath, localeId, transferFilePath, modelsDest, k; - - if (argc == 7) - { - localeId = argv[1]; - transferFilePath = argv[2]; - lextorFilePath = argv[3]; - interInFilePath = argv[4]; - modelsDest = argv[5]; - k = argv[6]; - } - else - { - localeId = "es_ES"; - transferFilePath = "apertium-eng-spa.spa-eng.t1x"; - lextorFilePath = "lextor.txt"; - interInFilePath = "beaminter.txt"; - modelsDest = "/home/aboelhamd/Downloads/models"; - k = "8"; - -// localeId = "kk_KZ"; -// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; -// sentenceFilePath = "src.txt"; +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "../pugixml/pugixml.hpp" +//#include "RuleParser.h" +//#include "RuleExecution.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//#include +// +//using namespace std; +//using namespace pugi; +//using namespace elem; +// +//int +//main (int argc, char **argv) +//{ +// string lextorFilePath, interInFilePath, localeId, transferFilePath, modelsDest, k; +// +// if (argc == 7) +// { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// lextorFilePath = argv[3]; +// interInFilePath = argv[4]; +// modelsDest = argv[5]; +// k = argv[6]; +// } +// else +// { +// localeId = "es_ES"; +// transferFilePath = 
"apertium-eng-spa.spa-eng.t1x"; // lextorFilePath = "lextor.txt"; -// interInFilePath = "beam-inter.txt"; -// modelsDest = "./UntitledFolder/models"; +// interInFilePath = "beaminter.txt"; +// modelsDest = "/home/aboelhamd/Downloads/models"; // k = "8"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath modelsDest beamSize" - << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "interInFilePath : Output file of this program which is the input for apertium interchunk." - << endl; - cout << "modelsDest : Yasmet models destination." << endl; - cout << "beamSize : The size of beam in beam search algorithm." << endl; - return -1; - } - - ifstream lextorFile (lextorFilePath.c_str ()); - ofstream interInFile (interInFilePath.c_str ()); - if (lextorFile.is_open () && interInFile.is_open ()) - { - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); - if (string (result.description ()) != "No error") - { - cout << "ERROR : " << result.description () << endl; - return -1; - } - - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child ("transfer"); - - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); - map > > classesWeights = - CLExec::loadYasmetModels (modelsDest); - - int beam; - stringstream buffer (k); - buffer >> beam; - - string tokenizedSentence; - while (getline (lextorFile, tokenizedSentence)) - { - // cout << i << endl; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, - tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); - - // final outputs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // beam tree - vector, float> > beamTree; - // rules combinations - vector > combNodes; - - nodesPool = RuleExecution::getNodesPool (tokenRules); - - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size (); j++) - if (ambigInfo[j]->combinations.size () > 1) - newAmbigInfo.push_back (ambigInfo[j]); - - CLExec::beamSearch (&beamTree, beam, 
slTokens, newAmbigInfo, classesWeights, - localeId); - - // take the first sentence only - beamTree.erase (beamTree.begin () + 1, beamTree.end ()); - - RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, - spaces); - - // write the outs - for (unsigned j = 0; j < outs.size (); j++) - interInFile << outs[j] << endl; - - } - interInFile.close (); - lextorFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - return 0; -} +// +//// localeId = "kk_KZ"; +//// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +//// sentenceFilePath = "src.txt"; +//// lextorFilePath = "lextor.txt"; +//// interInFilePath = "beam-inter.txt"; +//// modelsDest = "./UntitledFolder/models"; +//// k = "8"; +// +// cout << "Error in parameters !" << endl; +// cout +// << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath modelsDest beamSize" +// << endl; +// cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout +// << "interInFilePath : Output file of this program which is the input for apertium interchunk." +// << endl; +// cout << "modelsDest : Yasmet models destination." << endl; +// cout << "beamSize : The size of beam in beam search algorithm." << endl; +// return -1; +// } +// +// ifstream lextorFile (lextorFilePath.c_str ()); +// ofstream interInFile (interInFilePath.c_str ()); +// if (lextorFile.is_open () && interInFile.is_open ()) +// { +// // load transfer file in an xml document object +// xml_document transferDoc; +// xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); +// if (string (result.description ()) != "No error") +// { +// cout << "ERROR : " << result.description () << endl; +// return -1; +// } +// +// // xml node of the parent node (transfer) in the transfer file +// xml_node transfer = transferDoc.child ("transfer"); +// +// map > > attrs = RuleParser::getAttrs (transfer); +// map vars = RuleParser::getVars (transfer); +// map > lists = RuleParser::getLists (transfer); +// map > > classesWeights = +// CLExec::loadYasmetModels (modelsDest); +// +// int beam; +// stringstream buffer (k); +// buffer >> beam; +// +// string tokenizedSentence; +// while (getline (lextorFile, tokenizedSentence)) +// { +// // cout << i << endl; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, +// tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, +// tlTags, rulesApplied, attrs, lists, &vars, 
spaces, +// localeId); +// +// // final outputs +// vector outs; +// // number of generated combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // beam tree +// vector, float> > beamTree; +// // rules combinations +// vector > combNodes; +// +// nodesPool = RuleExecution::getNodesPool (tokenRules); +// +// RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); +// +// vector newAmbigInfo; +// for (unsigned j = 0; j < ambigInfo.size (); j++) +// if (ambigInfo[j]->combinations.size () > 1) +// newAmbigInfo.push_back (ambigInfo[j]); +// +// CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, +// localeId); +// +// // take the first sentence only +// beamTree.erase (beamTree.begin () + 1, beamTree.end ()); +// +// RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, +// spaces); +// +// // write the outs +// for (unsigned j = 0; j < outs.size (); j++) +// interInFile << outs[j] << endl; +// +// } +// interInFile.close (); +// lextorFile.close (); +// } +// else +// { +// cout << "ERROR in opening files!" << endl; +// } +// return 0; +//} diff --git a/src/ModelResult.cpp b/src/ModelResult.cpp index 3b2c4a2..8601ca7 100644 --- a/src/ModelResult.cpp +++ b/src/ModelResult.cpp @@ -1,376 +1,376 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../pugixml/pugixml.hpp" -#include "RuleParser.h" -#include "RuleExecution.h" -#include "TranElemLiterals.h" -#include "CLExec.h" - -#include - -using namespace std; -using namespace pugi; -using namespace elem; - -int -main (int argc, char **argv) -{ - string sentenceFilePath, lextorFilePath, localeId, transferFilePath, - transferOutFilePath, weightFilePath, outputFilePath, bestModFilePath, - randModFilePath; - - if (argc == 10) - { - localeId = argv[1]; - transferFilePath = argv[2]; - sentenceFilePath = argv[3]; - lextorFilePath = argv[4]; - - transferOutFilePath = argv[5]; - weightFilePath = argv[6]; - - outputFilePath = argv[7]; - bestModFilePath = argv[8]; - randModFilePath = argv[9]; - } - else - { +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "../pugixml/pugixml.hpp" +//#include "RuleParser.h" +//#include "RuleExecution.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//#include +// +//using namespace std; +//using namespace pugi; +//using namespace elem; +// +//int +//main (int argc, char **argv) +//{ +// string sentenceFilePath, lextorFilePath, localeId, transferFilePath, +// transferOutFilePath, weightFilePath, outputFilePath, bestModFilePath, +// randModFilePath; +// +// if (argc == 10) +// { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// sentenceFilePath = argv[3]; +// lextorFilePath = argv[4]; +// +// transferOutFilePath = argv[5]; +// weightFilePath = argv[6]; +// +// outputFilePath = argv[7]; +// bestModFilePath = argv[8]; +// randModFilePath = argv[9]; +// } +// else +// { +//// localeId = "es_ES"; +//// transferFilePath = "transferFile.t1x"; +//// sentenceFilePath = "spa-test.txt"; +//// lextorFilePath = "spa-test.lextor"; +//// interInFilePath = "beaminter.out"; +//// modelsDest = "modelstry"; +//// k = "8"; +// +//// localeId = "kk_KZ"; +//// transferFilePath = 
"apertium-kaz-tur.kaz-tur.t1x"; +//// sentenceFilePath = "sample-sentences.txt"; +//// lextorFilePath = "sample-lextor.txt"; +//// +//// transferOutFilePath = "sample-transfer.txt"; +//// weightFilePath = "sample-weights.txt"; +//// +//// outputFilePath = "outAnalysis.txt"; +//// bestModFilePath = "bestModFile.txt"; +//// randModFilePath = "randModFile.txt"; +// // localeId = "es_ES"; -// transferFilePath = "transferFile.t1x"; -// sentenceFilePath = "spa-test.txt"; -// lextorFilePath = "spa-test.lextor"; -// interInFilePath = "beaminter.out"; -// modelsDest = "modelstry"; -// k = "8"; - -// localeId = "kk_KZ"; -// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; -// sentenceFilePath = "sample-sentences.txt"; -// lextorFilePath = "sample-lextor.txt"; -// -// transferOutFilePath = "sample-transfer.txt"; -// weightFilePath = "sample-weights.txt"; +// transferFilePath = "transferFile3.t1x"; +// sentenceFilePath = "spa-toknizer.txt"; +// lextorFilePath = "spa-lextor.txt"; +// +// transferOutFilePath = "spa-transfer.txt"; +// weightFilePath = "spa-weight.txt"; // // outputFilePath = "outAnalysis.txt"; // bestModFilePath = "bestModFile.txt"; // randModFilePath = "randModFile.txt"; - - localeId = "es_ES"; - transferFilePath = "transferFile3.t1x"; - sentenceFilePath = "spa-toknizer.txt"; - lextorFilePath = "spa-lextor.txt"; - - transferOutFilePath = "spa-transfer.txt"; - weightFilePath = "spa-weight.txt"; - - outputFilePath = "outAnalysis.txt"; - bestModFilePath = "bestModFile.txt"; - randModFilePath = "randModFile.txt"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath weightOutFilePath outputFilePath bestModFilePath randModFilePath" - << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "sentenceFilePath : Source language sentences file." << endl; - cout << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "transferOutFilePath : Output file of apertium transfer for the source language sentences." - << endl; - cout - << "weightOutFilePath : Language model weights file for the source language sentences." - << endl; - cout - << "outputFilePath : First output file name of this program which is the complete analysis for the source language sentences." - << endl; - cout - << "bestModFilePath : Second output file name which is the best (language model) translations for the source language sentences." - << endl; - cout - << "randModFilePath : Third output file name which is random translations from (language model) for the source language sentences." 
- << endl; - return -1; - } - - // seed for randomness - srand (time (NULL)); - - ifstream lextorFile (lextorFilePath.c_str ()); - ifstream inSentenceFile (sentenceFilePath.c_str ()); - if (lextorFile.is_open () && inSentenceFile.is_open ()) - { - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); - - if (string (result.description ()) != "No error") - { - cout << "ERROR : " << result.description () << endl; - return -1; - } - - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child ("transfer"); - - vector sourceSentences, tokenizedSentences; - - string tokenizedSentence; - while (getline (lextorFile, tokenizedSentence)) - { - string sourceSentence; - if (!getline (inSentenceFile, sourceSentence)) - sourceSentence = "No more sentences"; - - sourceSentences.push_back (sourceSentence); - tokenizedSentences.push_back (tokenizedSentence); - } - lextorFile.close (); - inSentenceFile.close (); - - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); - - // empty output files - ofstream outputFile (outputFilePath.c_str ()); - outputFile.close (); - ofstream bestModFile (bestModFilePath.c_str ()); - bestModFile.close (); - ofstream randModFile (randModFilePath.c_str ()); - randModFile.close (); - - ifstream weightFile (weightFilePath.c_str ()); - ifstream transferOutFile (transferOutFilePath.c_str ()); - - if (weightFile.is_open () && transferOutFile.is_open ()) - for (unsigned i = 0; i < sourceSentences.size (); i++) - { - cout << i << endl; - - string sourceSentence, tokenizedSentence; - sourceSentence = sourceSentences[i]; - tokenizedSentence = tokenizedSentences[i]; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, - &spaces, tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, - tlTokens, tlTags, rulesApplied, attrs, lists, &vars, - spaces, localeId); - - // final outputs - vector normOuts; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // rules combinations - vector > normCombNodes; - - nodesPool = RuleExecution::getNodesPool (tokenRules); - - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - - RuleExecution::getOuts (&normOuts, &normCombNodes, ambigInfo, nodesPool, - ruleOutputs, spaces); - - // read weights - string line; - vector normWeights; - for (unsigned j = 0; j < normOuts.size (); j++) - { - getline (weightFile, line); - float weight = strtof (line.c_str (), NULL); - normWeights.push_back (weight); - 
} - - // read transfer - vector normTransfers; - for (unsigned j = 0; j < normOuts.size (); j++) - { - getline (transferOutFile, line); - normTransfers.push_back (line); - } - - // remove redundant outputs - vector outs; - vector > combNodes; - vector weights; - vector transfers; - for (unsigned j = 0; j < normOuts.size (); j++) - if (find (outs.begin (), outs.end (), normOuts[j]) == outs.end ()) - { - outs.push_back (normOuts[j]); - combNodes.push_back (normCombNodes[j]); - weights.push_back (normWeights[j]); - transfers.push_back (normTransfers[j]); - } - normOuts = outs; - normCombNodes = combNodes; - normWeights = weights; - normTransfers = transfers; - - // normalize weights - RuleExecution::normaliseWeights (&normWeights); - - // write normal outputs - ofstream outputFile (outputFilePath.c_str (), ofstream::app); - if (outputFile.is_open ()) - { - outputFile << "Analysis of sentence : " << endl; - outputFile << sourceSentence << endl << endl << endl; - - outputFile << endl; - outputFile << "sentence id ||| coverage id ||| original sentence |||" - << " lextor ||| rules ||| chunker ||| final sentence ||| score" - << endl << endl; - - for (unsigned j = 0; j < normWeights.size (); j++) - { - // sentence id - outputFile << (i + 1) << " ||| "; - // coverage id - outputFile << (j + 1) << " ||| "; - // original sentence - outputFile << sourceSentence << " ||| "; - // lextor - outputFile << tokenizedSentence << " ||| "; - // rules - for (unsigned k = 0; k < normCombNodes[j].size (); k++) - if (normCombNodes[j][k]->ruleId) - outputFile << normCombNodes[j][k]->ruleId << " "; - outputFile << "||| "; - // chuncker - outputFile << normOuts[j] << " ||| "; - // final sentence - outputFile << normTransfers[j] << " ||| "; - // score - outputFile << normWeights[j] << endl << endl; - } - - outputFile - << "---------------------------------------------------------------------------------------------------------" - << endl << endl; - - outputFile.close (); - } - - // Model weighting - // best weight - ofstream bestModFile (bestModFilePath.c_str (), ofstream::app); - if (bestModFile.is_open ()) - { - bestModFile - << "---------------------------------------------------------------------------------------------------------" - << endl << endl; - - bestModFile << (i + 1) << endl; - bestModFile << "Source : " << sourceSentence << endl << endl; - - unsigned maxInd = 0; - for (unsigned j = 1; j < normWeights.size (); j++) - { - if (normWeights[j] > normWeights[maxInd]) - maxInd = j; - } - - // final sentence - bestModFile << "Target : " << normTransfers[maxInd] << endl; - // score - bestModFile << "Weight : " << normWeights[maxInd] << endl; - // rules - bestModFile << "Rules : "; - for (unsigned k = 0; k < normCombNodes[maxInd].size (); k++) - if (normCombNodes[maxInd][k]->ruleId) - bestModFile << normCombNodes[maxInd][k]->ruleId << " "; - - bestModFile << endl - << "---------------------------------------------------------------------------------------------------------" - << endl << endl << endl; - } - bestModFile.close (); - - // Random weight - ofstream randModFile (randModFilePath.c_str (), ofstream::app); - if (randModFile.is_open ()) - { - randModFile << (i + 1) << endl; - randModFile << "Source : " << sourceSentence << endl << endl; - - int random = rand () % normWeights.size (); - - // final sentence - randModFile << "Target : " << normTransfers[random] << endl; - // score - randModFile << "Weight : " << normWeights[random] << endl; - // rules - randModFile << "Rules : "; - for (unsigned k = 0; k < 
normCombNodes[random].size (); k++) - if (normCombNodes[random][k]->ruleId) - randModFile << normCombNodes[random][k]->ruleId << " "; - - randModFile << endl - << "---------------------------------------------------------------------------------------------------------" - << endl << endl << endl; - } - randModFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - weightFile.close (); - transferOutFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - return 0; -} +// +// cout << "Error in parameters !" << endl; +// cout +// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath weightOutFilePath outputFilePath bestModFilePath randModFilePath" +// << endl; +// cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "sentenceFilePath : Source language sentences file." << endl; +// cout << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout +// << "transferOutFilePath : Output file of apertium transfer for the source language sentences." +// << endl; +// cout +// << "weightOutFilePath : Language model weights file for the source language sentences." +// << endl; +// cout +// << "outputFilePath : First output file name of this program which is the complete analysis for the source language sentences." +// << endl; +// cout +// << "bestModFilePath : Second output file name which is the best (language model) translations for the source language sentences." +// << endl; +// cout +// << "randModFilePath : Third output file name which is random translations from (language model) for the source language sentences." 
+// << endl; +// return -1; +// } +// +// // seed for randomness +// srand (time (NULL)); +// +// ifstream lextorFile (lextorFilePath.c_str ()); +// ifstream inSentenceFile (sentenceFilePath.c_str ()); +// if (lextorFile.is_open () && inSentenceFile.is_open ()) +// { +// // load transfer file in an xml document object +// xml_document transferDoc; +// xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); +// +// if (string (result.description ()) != "No error") +// { +// cout << "ERROR : " << result.description () << endl; +// return -1; +// } +// +// // xml node of the parent node (transfer) in the transfer file +// xml_node transfer = transferDoc.child ("transfer"); +// +// vector sourceSentences, tokenizedSentences; +// +// string tokenizedSentence; +// while (getline (lextorFile, tokenizedSentence)) +// { +// string sourceSentence; +// if (!getline (inSentenceFile, sourceSentence)) +// sourceSentence = "No more sentences"; +// +// sourceSentences.push_back (sourceSentence); +// tokenizedSentences.push_back (tokenizedSentence); +// } +// lextorFile.close (); +// inSentenceFile.close (); +// +// map > > attrs = RuleParser::getAttrs (transfer); +// map vars = RuleParser::getVars (transfer); +// map > lists = RuleParser::getLists (transfer); +// +// // empty output files +// ofstream outputFile (outputFilePath.c_str ()); +// outputFile.close (); +// ofstream bestModFile (bestModFilePath.c_str ()); +// bestModFile.close (); +// ofstream randModFile (randModFilePath.c_str ()); +// randModFile.close (); +// +// ifstream weightFile (weightFilePath.c_str ()); +// ifstream transferOutFile (transferOutFilePath.c_str ()); +// +// if (weightFile.is_open () && transferOutFile.is_open ()) +// for (unsigned i = 0; i < sourceSentences.size (); i++) +// { +// cout << i << endl; +// +// string sourceSentence, tokenizedSentence; +// sourceSentence = sourceSentences[i]; +// tokenizedSentence = tokenizedSentences[i]; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, +// &spaces, tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, +// tlTokens, tlTags, rulesApplied, attrs, lists, &vars, +// spaces, localeId); +// +// // final outputs +// vector normOuts; +// // number of generated combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // rules combinations +// vector > normCombNodes; +// +// nodesPool = RuleExecution::getNodesPool (tokenRules); +// +// RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); +// +// RuleExecution::getOuts (&normOuts, &normCombNodes, ambigInfo, nodesPool, +// ruleOutputs, 
spaces); +// +// // read weights +// string line; +// vector normWeights; +// for (unsigned j = 0; j < normOuts.size (); j++) +// { +// getline (weightFile, line); +// float weight = strtof (line.c_str (), NULL); +// normWeights.push_back (weight); +// } +// +// // read transfer +// vector normTransfers; +// for (unsigned j = 0; j < normOuts.size (); j++) +// { +// getline (transferOutFile, line); +// normTransfers.push_back (line); +// } +// +// // remove redundant outputs +// vector outs; +// vector > combNodes; +// vector weights; +// vector transfers; +// for (unsigned j = 0; j < normOuts.size (); j++) +// if (find (outs.begin (), outs.end (), normOuts[j]) == outs.end ()) +// { +// outs.push_back (normOuts[j]); +// combNodes.push_back (normCombNodes[j]); +// weights.push_back (normWeights[j]); +// transfers.push_back (normTransfers[j]); +// } +// normOuts = outs; +// normCombNodes = combNodes; +// normWeights = weights; +// normTransfers = transfers; +// +// // normalize weights +// RuleExecution::normaliseWeights (&normWeights); +// +// // write normal outputs +// ofstream outputFile (outputFilePath.c_str (), ofstream::app); +// if (outputFile.is_open ()) +// { +// outputFile << "Analysis of sentence : " << endl; +// outputFile << sourceSentence << endl << endl << endl; +// +// outputFile << endl; +// outputFile << "sentence id ||| coverage id ||| original sentence |||" +// << " lextor ||| rules ||| chunker ||| final sentence ||| score" +// << endl << endl; +// +// for (unsigned j = 0; j < normWeights.size (); j++) +// { +// // sentence id +// outputFile << (i + 1) << " ||| "; +// // coverage id +// outputFile << (j + 1) << " ||| "; +// // original sentence +// outputFile << sourceSentence << " ||| "; +// // lextor +// outputFile << tokenizedSentence << " ||| "; +// // rules +// for (unsigned k = 0; k < normCombNodes[j].size (); k++) +// if (normCombNodes[j][k]->ruleId) +// outputFile << normCombNodes[j][k]->ruleId << " "; +// outputFile << "||| "; +// // chuncker +// outputFile << normOuts[j] << " ||| "; +// // final sentence +// outputFile << normTransfers[j] << " ||| "; +// // score +// outputFile << normWeights[j] << endl << endl; +// } +// +// outputFile +// << "---------------------------------------------------------------------------------------------------------" +// << endl << endl; +// +// outputFile.close (); +// } +// +// // Model weighting +// // best weight +// ofstream bestModFile (bestModFilePath.c_str (), ofstream::app); +// if (bestModFile.is_open ()) +// { +//// bestModFile +//// << "---------------------------------------------------------------------------------------------------------" +//// << endl << endl; +// +//// bestModFile << (i + 1) << endl; +//// bestModFile << "Source : " << sourceSentence << endl << endl; +// +// unsigned maxInd = 0; +// for (unsigned j = 1; j < normWeights.size (); j++) +// { +// if (normWeights[j] > normWeights[maxInd]) +// maxInd = j; +// } +// +// // final sentence +// bestModFile /*<< "Target : "*/ << normTransfers[maxInd] << endl; +// // score +//// bestModFile << "Weight : " << normWeights[maxInd] << endl; +// // rules +//// bestModFile << "Rules : "; +//// for (unsigned k = 0; k < normCombNodes[maxInd].size (); k++) +//// if (normCombNodes[maxInd][k]->ruleId) +//// bestModFile << normCombNodes[maxInd][k]->ruleId << " "; +//// +//// bestModFile << endl +//// << "---------------------------------------------------------------------------------------------------------" +//// << endl << endl << endl; +// } +// bestModFile.close 
();
+//
+//	  // Random weight
+//	  ofstream randModFile (randModFilePath.c_str (), ofstream::app);
+//	  if (randModFile.is_open ())
+//	    {
+//	      randModFile << (i + 1) << endl;
+//	      randModFile << "Source : " << sourceSentence << endl << endl;
+//
+//	      int random = rand () % normWeights.size ();
+//
+//	      // final sentence
+//	      randModFile << "Target : " << normTransfers[random] << endl;
+//	      // score
+//	      randModFile << "Weight : " << normWeights[random] << endl;
+//	      // rules
+//	      randModFile << "Rules : ";
+//	      for (unsigned k = 0; k < normCombNodes[random].size (); k++)
+//	        if (normCombNodes[random][k]->ruleId)
+//	          randModFile << normCombNodes[random][k]->ruleId << " ";
+//
+//	      randModFile << endl
+//	          << "---------------------------------------------------------------------------------------------------------"
+//	          << endl << endl << endl;
+//	    }
+//	  randModFile.close ();
+//	}
+//      else
+//	{
+//	  cout << "ERROR in opening files!" << endl;
+//	}
+//      weightFile.close ();
+//      transferOutFile.close ();
+//    }
+//  else
+//    {
+//      cout << "ERROR in opening files!" << endl;
+//    }
+//  return 0;
+//}
diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp
index 4a509c3..13476af 100644
--- a/src/RulesApplier.cpp
+++ b/src/RulesApplier.cpp
@@ -53,12 +53,13 @@ main (int argc, char **argv)
       localeId = "es_ES";
       transferFilePath =
           "/home/aboelhamd/apertium-eng-spa-ambiguous-rules/apertium-eng-spa.spa-eng.t1x";
-      lextorFilePath = "/home/aboelhamd/Downloads/es-en/splits/xaa-lextor.txt";
-      interInFilePath = "/home/aboelhamd/Downloads/es-en/splits/xaa-chunker.txt";
+      lextorFilePath =
+          "/home/aboelhamd/eclipse-workspace/machinetranslation/test-lextor.txt";
+      interInFilePath =
+          "/home/aboelhamd/eclipse-workspace/machinetranslation/test-chunker.txt";
 
       cout << "Error in parameters !" << endl;
-      cout
-          << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath"
+      cout << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath"
           << endl;
       cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ"
           << endl;
@@ -69,11 +70,20 @@ main (int argc, char **argv)
       cout
           << "interInFilePath : Output file name of this program which is the input for apertium interchunk."
           << endl;
-      return -1;
+//      return -1;
     }
 
   ifstream lextorFile (lextorFilePath.c_str ());
   ofstream interInFile (interInFilePath.c_str ());
+  ifstream refFile (
+      string ("/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test.txt").c_str ());
+  ofstream refInFile (
+      string ("/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test-mul.txt").c_str ());
+  ifstream errFile (
+      string (
+          "/home/aboelhamd/Downloads/apertium-eval-translator-master/ambig_results.txt").c_str ());
+  ofstream bestInFile (
+      string ("/home/aboelhamd/eclipse-workspace/machinetranslation/best-chunker.txt").c_str ());
   if (lextorFile.is_open () && interInFile.is_open ())
     {
       // load transfer file in an xml document object
@@ -93,11 +103,11 @@ main (int argc, char **argv)
       map<string, string> vars = RuleParser::getVars (transfer);
       map<string, vector<string> > lists = RuleParser::getLists (transfer);
 
-//      unsigned i = 0;
-      string tokenizedSentence;
-      while (getline (lextorFile, tokenizedSentence))
+      unsigned i = 0;
+      string tokenizedSentence, refSent;
+      while (getline (lextorFile, tokenizedSentence) && getline (refFile, refSent))
 	{
-//	  cout << i++ << endl;
+	  cout << i++ << endl;
 
 	  // spaces after each token
 	  vector<string> spaces;
@@ -218,9 +228,40 @@ main (int argc, char **argv)
 //	      cout << endl;
 //	    }
 
+//	  set<string> diffOuts (outs.begin (), outs.end ());
+//
+//	  // write the outs
+//	  for (set<string>::iterator it = diffOuts.begin (); it != diffOuts.end (); it++)
+//	    {
+//	      interInFile << *it << endl;
+//	      refInFile << refSent << endl;
+//	    }
+
+	  float min = 100000;
+	  int minInd = -1;
+	  string serr;
+	  float err;
+
 	  // write the outs
 	  for (unsigned j = 0; j < outs.size (); j++)
-	    interInFile << outs[j] << endl;
+	    {
+	      getline (errFile, serr);
+	      err = strtof (serr.c_str (), NULL);
+
+	      if (err < min)
+		{
+		  min = err;
+		  minInd = j;
+		}
+
+	      interInFile << outs[j] << endl;
+	      refInFile << refSent << endl;
+	    }
+//	  cout << minInd << endl;
+	  bestInFile << outs[minInd] << endl;
+
+	  interInFile << endl;
+	  refInFile << endl;
 
 	  // delete AmbigInfo pointers
 	  for (unsigned j = 0; j < ambigInfo.size (); j++)
@@ -248,6 +289,9 @@ main (int argc, char **argv)
 
   lextorFile.close ();
   interInFile.close ();
+  refFile.close ();
+  refInFile.close ();
+  bestInFile.close ();
   cout << "RulesApplier finished!";
     }
   else
diff --git a/src/YasmetFormatter.cpp b/src/YasmetFormatter.cpp
index 80134a2..0d21e53 100644
--- a/src/YasmetFormatter.cpp
+++ b/src/YasmetFormatter.cpp
@@ -1,290 +1,290 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "../pugixml/pugixml.hpp"
-#include "RuleParser.h"
-#include "RuleExecution.h"
-#include "TranElemLiterals.h"
-#include "CLExec.h"
-
-using namespace std;
-using namespace pugi;
-using namespace elem;
-
-int
-main (int argc, char **argv)
-{
-  string lextorFilePath = "lextor.txt", weightOutFilePath = "weights.txt", localeId =
-      "kk_KZ", transferFilePath = "transferFile.tx1", datasetsPath = "datasets";
-
-  if (argc == 6)
-    {
-      localeId = argv[1];
-      transferFilePath = argv[2];
-      lextorFilePath = argv[3];
-      weightOutFilePath = argv[4];
-      datasetsPath = argv[5];
-    }
-  else
-    {
-//      localeId = "es_ES";
-//      transferFilePath = "transferFile.t1x";
-//      sentenceFilePath = "spa-test.txt";
-//      lextorFilePath = "spa-test.lextor";
-//      transferOutFilePath = "transfer.out";
-//      weightOutFilePath = "weights.txt";
-//      outputFilePath = "output.out";
-//      datasetsPath = "datasetstry2";
-
-      localeId = "kk_KZ";
-      transferFilePath = "apertium-kaz-tur.kaz-tur.t1x";
-      lextorFilePath = "sample-lextor.txt";
-      
weightOutFilePath = "norm-weights.txt"; - datasetsPath = "datasetstry1234"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath lextorFilePath weightOutFilePath datasetsPath" - << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "weightOutFilePath : Language model weights file for the source language sentences." - << endl; - cout << "datasetsPath : Datasets destination to put in the generated yasmet files." - << endl; - return -1; - } - - ifstream lextorFile (lextorFilePath.c_str ()); - ifstream weightOutFile (weightOutFilePath.c_str ()); - if (lextorFile.is_open () && weightOutFile.is_open ()) - { - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); - - if (string (result.description ()) != "No error") - { - cout << "ERROR : " << result.description () << endl; - return -1; - } - - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child ("transfer"); - - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); - - string tokenizedSentence; - while (getline (lextorFile, tokenizedSentence)) - { - // cout << i << endl; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, - tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); - - // final outs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // rules combinations - vector > combNodes; - - nodesPool = RuleExecution::getNodesPool (tokenRules); - - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - - RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, - spaces); - - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size (); j++) - if (ambigInfo[j]->combinations.size () > 1) - newAmbigInfo.push_back (ambigInfo[j]); - ambigInfo = newAmbigInfo; - - // read weights - string line; - vector weights; - for (unsigned j = 0; j < outs.size (); j++) - { - getline (weightOutFile, line); - float weight = strtof (line.c_str (), NULL); - weights.push_back (weight); - } - - RuleExecution::normaliseWeights (&weights, ambigInfo); - - // Yasmet format preparing - // 
make a directory if not found - mkdir (datasetsPath.c_str (), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - - unsigned weigInd = 0; - for (unsigned i = 0; i < ambigInfo.size (); i++) - { - RuleExecution::AmbigInfo* ambig = ambigInfo[i]; - - // name of the file is the concatenation of rules ids - string rulesNums; - for (unsigned x = 0; x < ambig->combinations.size (); x++) - { - // avoid dummy node - for (unsigned y = 1; y < ambig->combinations[x].size (); y++) - { - stringstream ss; -// ss->clear (); - ss << ambig->combinations[x][y]->ruleId; - rulesNums += ss.str (); - - if (y + 1 < ambig->combinations[x].size ()) - rulesNums += "_"; - } - rulesNums += "+"; - } - - // if it's the first time to open , put the number of classes - bool firstTime = true; - if (FILE *file = fopen ((datasetsPath + string ("/") + rulesNums).c_str (), - "r")) - { - firstTime = false; - fclose (file); - } - -// stringstream* dataset = new stringstream (); - ofstream dataset ((datasetsPath + string ("/") + rulesNums).c_str (), - ofstream::app); - - if (firstTime) - dataset << ambig->combinations.size () << endl; - - for (unsigned x = 0; x < ambig->combinations.size (); x++) - { - - dataset << x << " $ "; - - float weight = weights[x + weigInd]; - - dataset << weight << " #"; - - string features; - for (unsigned v = 0; v < ambig->combinations.size (); v++) - { - stringstream ss; -// ss.clear (); - ss << v; - string label = ss.str (); - - for (unsigned z = ambig->firTokId; - z < ambig->firTokId + ambig->maxPat; z++) - { - stringstream ss; -// ss->clear (); - ss << z - ambig->firTokId; - string num = ss.str (); -// *num = ss->str (); - string word = CLExec::toLowerCase (slTokens[z], localeId); - - for (unsigned c = 0; c < word.length (); c++) - if (word[c] == ' ') - word.replace (c, 1, "_"); - - features += " " + word + "_" + num + ":" + label; - } - features += " #"; - } - dataset << features << endl; -// delete (features); - } - weigInd += ambig->combinations.size (); -// dataset.close (); - } - - // delete AmbigInfo pointers - for (unsigned j = 0; j < ambigInfo.size (); j++) - { - // delete the dummy node pointers - set dummies; - for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++) - dummies.insert (ambigInfo[j]->combinations[k][0]); - for (set::iterator it = dummies.begin (); - it != dummies.end (); it++) - delete (*it); - - delete ambigInfo[j]; - } - // delete Node pointers - for (map >::iterator it = - nodesPool.begin (); it != nodesPool.end (); it++) - { - for (unsigned j = 0; j < it->second.size (); j++) - { - delete it->second[j]; - } - } - -// } - } - lextorFile.close (); - weightOutFile.close (); - } - else - { - cout << "ERROR in opening files!" 
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//#include
+//
+//#include "../pugixml/pugixml.hpp"
+//#include "RuleParser.h"
+//#include "RuleExecution.h"
+//#include "TranElemLiterals.h"
+//#include "CLExec.h"
+//
+//using namespace std;
+//using namespace pugi;
+//using namespace elem;
+//
+//int
+//main (int argc, char **argv)
+//{
+//  string lextorFilePath = "lextor.txt", weightOutFilePath = "weights.txt", localeId =
+//      "kk_KZ", transferFilePath = "transferFile.t1x", datasetsPath = "datasets";
+//
+//  if (argc == 6)
+//    {
+//      localeId = argv[1];
+//      transferFilePath = argv[2];
+//      lextorFilePath = argv[3];
+//      weightOutFilePath = argv[4];
+//      datasetsPath = argv[5];
+//    }
+//  else
+//    {
+////      localeId = "es_ES";
+////      transferFilePath = "transferFile.t1x";
+////      sentenceFilePath = "spa-test.txt";
+////      lextorFilePath = "spa-test.lextor";
+////      transferOutFilePath = "transfer.out";
+////      weightOutFilePath = "weights.txt";
+////      outputFilePath = "output.out";
+////      datasetsPath = "datasetstry2";
+//
+//      localeId = "kk_KZ";
+//      transferFilePath = "apertium-kaz-tur.kaz-tur.t1x";
+//      lextorFilePath = "sample-lextor.txt";
+//      weightOutFilePath = "norm-weights.txt";
+//      datasetsPath = "datasetstry1234";
+//
+//      cout << "Error in parameters !" << endl;
+//      cout
+//          << "Parameters are : localeId transferFilePath lextorFilePath weightOutFilePath datasetsPath"
+//          << endl;
+//      cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ"
+//          << endl;
+//      cout << "transferFilePath : Apertium transfer file of the language pair used."
+//          << endl;
+//      cout << "lextorFilePath : Apertium lextor file for the source language sentences."
+//          << endl;
+//      cout
+//          << "weightOutFilePath : Language model weights file for the source language sentences."
+//          << endl;
+//      cout << "datasetsPath : Datasets destination to put in the generated yasmet files."
+//          << endl;
+//      return -1;
+//    }
+//
+//  ifstream lextorFile (lextorFilePath.c_str ());
+//  ifstream weightOutFile (weightOutFilePath.c_str ());
+//  if (lextorFile.is_open () && weightOutFile.is_open ())
+//    {
+//      // load the transfer file into an xml document object
+//      xml_document transferDoc;
+//      xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ());
+//
+//      if (string (result.description ()) != "No error")
+//        {
+//          cout << "ERROR : " << result.description () << endl;
+//          return -1;
+//        }
+//
+//      // xml node of the root (transfer) element of the transfer file
+//      xml_node transfer = transferDoc.child ("transfer");
+//
+//      map<string, vector<vector<string> > > attrs = RuleParser::getAttrs (transfer);
+//      map<string, string> vars = RuleParser::getVars (transfer);
+//      map<string, vector<string> > lists = RuleParser::getLists (transfer);
+//
+//      string tokenizedSentence;
+//      while (getline (lextorFile, tokenizedSentence))
+//        {
+//          // cout << i << endl;
+//
+//          // spaces after each token
+//          vector<string> spaces;
+//
+//          // tokens in sentence order
+//          vector<string> slTokens, tlTokens;
+//
+//          // tags of the tokens, in order
+//          vector<vector<string> > slTags, tlTags;
+//
+//          RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces,
+//                                         tokenizedSentence);
+//
+//          // map of token ids and their matched categories
+//          map<unsigned, vector<string> > catsApplied;
+//
+//          RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer);
+//
+//          // map of matched rules to pairs of (first token id, number of patterns)
+//          map<xml_node, vector<pair<unsigned, unsigned> > > rulesApplied;
+//
+//          RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer);
+//
+//          // maps rule and (target) token to a specific output;
+//          // if a rule has many patterns, we choose the first token only
+//          map<unsigned, map<unsigned, string> > ruleOutputs;
+//
+//          // maps a (target) token to all matched rule ids and each rule's number of pattern items
+//          map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules;
+//
+//          RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens,
+//                                   tlTags, rulesApplied, attrs, lists, &vars, spaces,
+//                                   localeId);
+//
+//          // final outputs
+//          vector<string> outs;
+//          // number of generated combinations
+//          unsigned compNum;
+//          // nodes for every token and rule
+//          map<unsigned, vector<RuleExecution::Node*> > nodesPool;
+//          // ambiguity information
+//          vector<RuleExecution::AmbigInfo*> ambigInfo;
+//          // rule combinations
+//          vector<vector<RuleExecution::Node*> > combNodes;
+//
+//          nodesPool = RuleExecution::getNodesPool (tokenRules);
+//
+//          RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum);
+//
+//          RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs,
+//                                  spaces);
+//
+//          vector<RuleExecution::AmbigInfo*> newAmbigInfo;
+//          for (unsigned j = 0; j < ambigInfo.size (); j++)
+//            if (ambigInfo[j]->combinations.size () > 1)
+//              newAmbigInfo.push_back (ambigInfo[j]);
+//          ambigInfo = newAmbigInfo;
+//
+//          // read weights
+//          string line;
+//          vector<float> weights;
+//          for (unsigned j = 0; j < outs.size (); j++)
+//            {
+//              getline (weightOutFile, line);
+//              float weight = strtof (line.c_str (), NULL);
+//              weights.push_back (weight);
+//            }
+//
+//          RuleExecution::normaliseWeights (&weights, ambigInfo);
+//
+//          // Yasmet format preparation:
+//          // create the datasets directory if it does not exist
+//          mkdir (datasetsPath.c_str (), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+//
+//          unsigned weigInd = 0;
+//          for (unsigned i = 0; i < ambigInfo.size (); i++)
+//            {
+//              RuleExecution::AmbigInfo* ambig = ambigInfo[i];
+//
+//              // the file name is the concatenation of rule ids
+//              string rulesNums;
+//              for (unsigned x = 0; x < ambig->combinations.size (); x++)
+//                {
+//                  // skip the dummy node at index 0
+//                  for (unsigned y = 1; y < ambig->combinations[x].size (); y++)
+//                    {
+//                      stringstream ss;
+////                    ss->clear ();
+//                      ss << ambig->combinations[x][y]->ruleId;
+//                      rulesNums += ss.str ();
+//
+//                      if (y + 1 < ambig->combinations[x].size ())
+//                        rulesNums += "_";
+//                    }
+//                  rulesNums += "+";
+//                }
+//
+//              // if the file is opened for the first time, write the number of classes
+//              bool firstTime = true;
+//              if (FILE *file = fopen ((datasetsPath + string ("/") + rulesNums).c_str (),
+//                                      "r"))
+//                {
+//                  firstTime = false;
+//                  fclose (file);
+//                }
+//
+////            stringstream* dataset = new stringstream ();
+//              ofstream dataset ((datasetsPath + string ("/") + rulesNums).c_str (),
+//                                ofstream::app);
+//
+//              if (firstTime)
+//                dataset << ambig->combinations.size () << endl;
+//
+//              for (unsigned x = 0; x < ambig->combinations.size (); x++)
+//                {
+//
+//                  dataset << x << " $ ";
+//
+//                  float weight = weights[x + weigInd];
+//
+//                  dataset << weight << " #";
+//
+//                  string features;
+//                  for (unsigned v = 0; v < ambig->combinations.size (); v++)
+//                    {
+//                      stringstream ss;
+////                    ss.clear ();
+//                      ss << v;
+//                      string label = ss.str ();
+//
+//                      for (unsigned z = ambig->firTokId;
+//                          z < ambig->firTokId + ambig->maxPat; z++)
+//                        {
+//                          stringstream ss;
+////                        ss->clear ();
+//                          ss << z - ambig->firTokId;
+//                          string num = ss.str ();
+////                        *num = ss->str ();
+//                          string word = CLExec::toLowerCase (slTokens[z], localeId);
+//
+//                          for (unsigned c = 0; c < word.length (); c++)
+//                            if (word[c] == ' ')
+//                              word.replace (c, 1, "_");
+//
+//                          features += " " + word + "_" + num + ":" + label;
+//                        }
+//                      features += " #";
+//                    }
+//                  dataset << features << endl;
+////                delete (features);
+//                }
+//              weigInd += ambig->combinations.size ();
+////            dataset.close ();
+//            }
+//
+//          // delete AmbigInfo pointers
+//          for (unsigned j = 0; j < ambigInfo.size (); j++)
+//            {
+//              // delete the dummy node pointers
+//              set<RuleExecution::Node*> dummies;
+//              for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++)
+//                dummies.insert (ambigInfo[j]->combinations[k][0]);
+//              for (set<RuleExecution::Node*>::iterator it = dummies.begin ();
+//                  it != dummies.end (); it++)
+//                delete (*it);
+//
+//              delete ambigInfo[j];
+//            }
+//          // delete Node pointers
+//          for (map<unsigned, vector<RuleExecution::Node*> >::iterator it =
+//              nodesPool.begin (); it != nodesPool.end (); it++)
+//            {
+//              for (unsigned j = 0; j < it->second.size (); j++)
+//                {
+//                  delete it->second[j];
+//                }
+//            }
+//
+////        }
+//        }
+//      lextorFile.close ();
+//      weightOutFile.close ();
+//    }
+//  else
+//    {
+//      cout << "ERROR in opening files!" << endl;
+//    }
+//
+//  return 0;
+//}
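Both the deleted and the commented-out version rely on RuleExecution::normaliseWeights to turn the raw language-model scores read from weightOutFile into per-ambiguity distributions before the records are written. Its body is not part of this diff, so the following is only a sketch of what such a normalisation could look like; the helper name and signature are invented, and it assumes (as the weights[x + weigInd] indexing above suggests) that the weights of each ambiguity's combinations sit in one contiguous block that should sum to 1:

    // Sketch of per-group weight normalisation; not the repository's
    // RuleExecution::normaliseWeights, whose implementation is not shown here.
    #include <cstddef>
    #include <vector>

    void normaliseGroups (std::vector<float> *weights,
                          const std::vector<std::size_t> &groupSizes)
    {
      std::size_t ind = 0;                     // start of the current group's block
      for (std::size_t g = 0; g < groupSizes.size (); g++)
        {
          float sum = 0;
          for (std::size_t j = 0; j < groupSizes[g]; j++)
            sum += (*weights)[ind + j];
          if (sum > 0)                         // skip all-zero groups
            for (std::size_t j = 0; j < groupSizes[g]; j++)
              (*weights)[ind + j] /= sum;      // scale the group to sum to 1
          ind += groupSizes[g];
        }
    }

Guarding against a zero sum avoids a division by zero when every combination of an ambiguity received weight 0.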