commit a697fc63bb1c72e0b269f7e4df69e208471b1802 Author: aboelhamd Date: Sun Apr 21 19:14:56 2019 +0200 Pointers bug is fixed diff --git a/merge-models.py b/merge-models.py index 8f33b7d..d6287fb 100644 --- a/merge-models.py +++ b/merge-models.py @@ -3,15 +3,15 @@ from os.path import isfile, join import sys if (len(sys.argv) != 4) : - print('Usage: python merge-models.py modelsdest localeid newfile'); + print('Usage: python merge-models.py modelsdest newfile'); sys.exit(-1) -newfile = open(sys.argv[3], 'w') -localeid = sys.argv[2] +newfile = open(sys.argv[2], 'w') +#localeid = sys.argv[2] modelsdest = sys.argv[1] # localeid -newfile.write("%s\n" % localeid) +#newfile.write("%s\n" % localeid) models = [f for f in listdir(modelsdest) if isfile(join(modelsdest, f))] diff --git a/src/BeamResult.cpp b/src/BeamResult.cpp index e1f00e5..31c69bb 100644 --- a/src/BeamResult.cpp +++ b/src/BeamResult.cpp @@ -1,259 +1,259 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../pugixml/pugixml.hpp" -#include "RuleParser.h" -#include "RuleExecution.h" -#include "TranElemLiterals.h" -#include "CLExec.h" - -#include - -using namespace std; -using namespace pugi; -using namespace elem; - -int -main (int argc, char **argv) -{ - string sentenceFilePath, lextorFilePath, localeId, transferFilePath, modelsDest, - beamSize, transferOutFilePath, beamOutFilePath; - - if (argc == 9) - { - localeId = argv[1]; - transferFilePath = argv[2]; - sentenceFilePath = argv[3]; - lextorFilePath = argv[4]; - - transferOutFilePath = argv[5]; - beamOutFilePath = argv[6]; - - modelsDest = argv[7]; - beamSize = argv[8]; - } - else - { -// localeId = "es_ES"; -// transferFilePath = "transferFile.t1x"; -// sentenceFilePath = "spa-test.txt"; -// lextorFilePath = "spa-test.lextor"; -// interInFilePath = "beaminter.out"; -// modelsDest = "modelstry"; -// k = "8"; - - localeId = "kk_KZ"; - transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; - sentenceFilePath = "src.txt"; - lextorFilePath = "lextor.txt"; - - transferOutFilePath = "beam-transfer.txt"; - beamOutFilePath = "beamOutFile.txt"; - - modelsDest = "./UntitledFolder/models"; - beamSize = "8"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath beamOutFilePath modelsDest beamSize" - << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "sentenceFilePath : Source language sentences file." << endl; - cout << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "transferOutFilePath : Output file of apertium transfer for the source language sentences." - << endl; - cout - << "beamOutFilePath : Output file name of this program which is the best translations for the language sentences." - << endl; - cout << "modelsDest : Yasmet models destination." << endl; - cout << "beamSize : The size of beam in beam search algorithm." 
<< endl; - return -1; - } - - // seed for randomness - srand (time (NULL)); - - ifstream lextorFile (lextorFilePath.c_str ()); - ifstream inSentenceFile (sentenceFilePath.c_str ()); - if (lextorFile.is_open () && inSentenceFile.is_open ()) - { - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); - - if (string (result.description ()) != "No error") - { - cout << "ERROR : " << result.description () << endl; - return -1; - } - - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child ("transfer"); - - vector sourceSentences, tokenizedSentences; - - string tokenizedSentence; - while (getline (lextorFile, tokenizedSentence)) - { - string sourceSentence; - if (!getline (inSentenceFile, sourceSentence)) - sourceSentence = "No more sentences"; - - sourceSentences.push_back (sourceSentence); - tokenizedSentences.push_back (tokenizedSentence); - } - lextorFile.close (); - inSentenceFile.close (); - - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); - - map > > classesWeights = - CLExec::loadYasmetModels (modelsDest); - -// vector > vouts; - - int beam; - stringstream buffer (beamSize); - buffer >> beam; - - // empty the output file - ofstream beamFile (beamOutFilePath.c_str ()); - beamFile.close (); - - ifstream transferOutFile (transferOutFilePath.c_str ()); - - if (transferOutFile.is_open ()) - for (unsigned i = 0; i < sourceSentences.size (); i++) - { - cout << i << endl; - - string sourceSentence, tokenizedSentence; - sourceSentence = sourceSentences[i]; - tokenizedSentence = tokenizedSentences[i]; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, - &spaces, tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, - tlTokens, tlTags, rulesApplied, attrs, lists, &vars, - spaces, localeId); - - // final outputs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // beam tree - vector, float> > beamTree; - // rules combinations - vector > combNodes; - - nodesPool = RuleExecution::getNodesPool (tokenRules); - - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size (); j++) - if (ambigInfo[j].combinations.size () > 1) - newAmbigInfo.push_back (ambigInfo[j]); - - CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, - localeId); - - RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, - spaces); - - // 
read transfer - string line; - vector beamTransfers; - for (unsigned j = 0; j < outs.size (); j++) - { - getline (transferOutFile, line); - beamTransfers.push_back (line); - } - - // write beam results - ofstream beamFile (beamOutFilePath.c_str (), ofstream::app); - if (beamFile.is_open ()) - { - beamFile << "source sentence (" << (i + 1) << ") : " << endl; - beamFile << sourceSentence << endl << endl; - // just take first best - for (unsigned j = 0; j < /*outs.size ()*/1; j++) - { - beamFile << "target sentence " /*<< (j + 1)*/<< " : " << endl; - beamFile << beamTransfers[j] << endl; - beamFile << "weight = " << beamTree[j].second << endl; - beamFile << "rules : "; - for (unsigned k = 0; k < combNodes[j].size (); k++) - if (combNodes[j][k].ruleId) - beamFile << combNodes[j][k].ruleId << " "; - beamFile << endl << endl; - beamFile - << "------------------------------------------------------------------" - << endl << endl; - } - } - beamFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - transferOutFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - return 0; -} +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "../pugixml/pugixml.hpp" +//#include "RuleParser.h" +//#include "RuleExecution.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//#include +// +//using namespace std; +//using namespace pugi; +//using namespace elem; +// +//int +//main (int argc, char **argv) +//{ +// string sentenceFilePath, lextorFilePath, localeId, transferFilePath, modelsDest, +// beamSize, transferOutFilePath, beamOutFilePath; +// +// if (argc == 9) +// { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// sentenceFilePath = argv[3]; +// lextorFilePath = argv[4]; +// +// transferOutFilePath = argv[5]; +// beamOutFilePath = argv[6]; +// +// modelsDest = argv[7]; +// beamSize = argv[8]; +// } +// else +// { +//// localeId = "es_ES"; +//// transferFilePath = "transferFile.t1x"; +//// sentenceFilePath = "spa-test.txt"; +//// lextorFilePath = "spa-test.lextor"; +//// interInFilePath = "beaminter.out"; +//// modelsDest = "modelstry"; +//// k = "8"; +// +// localeId = "kk_KZ"; +// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +// sentenceFilePath = "src.txt"; +// lextorFilePath = "lextor.txt"; +// +// transferOutFilePath = "beam-transfer.txt"; +// beamOutFilePath = "beamOutFile.txt"; +// +// modelsDest = "./UntitledFolder/models"; +// beamSize = "8"; +// +// cout << "Error in parameters !" << endl; +// cout +// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath beamOutFilePath modelsDest beamSize" +// << endl; +// cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "sentenceFilePath : Source language sentences file." << endl; +// cout << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout +// << "transferOutFilePath : Output file of apertium transfer for the source language sentences." +// << endl; +// cout +// << "beamOutFilePath : Output file name of this program which is the best translations for the language sentences." +// << endl; +// cout << "modelsDest : Yasmet models destination." 
<< endl; +// cout << "beamSize : The size of beam in beam search algorithm." << endl; +// return -1; +// } +// +// // seed for randomness +// srand (time (NULL)); +// +// ifstream lextorFile (lextorFilePath.c_str ()); +// ifstream inSentenceFile (sentenceFilePath.c_str ()); +// if (lextorFile.is_open () && inSentenceFile.is_open ()) +// { +// // load transfer file in an xml document object +// xml_document transferDoc; +// xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); +// +// if (string (result.description ()) != "No error") +// { +// cout << "ERROR : " << result.description () << endl; +// return -1; +// } +// +// // xml node of the parent node (transfer) in the transfer file +// xml_node transfer = transferDoc.child ("transfer"); +// +// vector sourceSentences, tokenizedSentences; +// +// string tokenizedSentence; +// while (getline (lextorFile, tokenizedSentence)) +// { +// string sourceSentence; +// if (!getline (inSentenceFile, sourceSentence)) +// sourceSentence = "No more sentences"; +// +// sourceSentences.push_back (sourceSentence); +// tokenizedSentences.push_back (tokenizedSentence); +// } +// lextorFile.close (); +// inSentenceFile.close (); +// +// map > > attrs = RuleParser::getAttrs (transfer); +// map vars = RuleParser::getVars (transfer); +// map > lists = RuleParser::getLists (transfer); +// +// map > > classesWeights = +// CLExec::loadYasmetModels (modelsDest); +// +//// vector > vouts; +// +// int beam; +// stringstream buffer (beamSize); +// buffer >> beam; +// +// // empty the output file +// ofstream beamFile (beamOutFilePath.c_str ()); +// beamFile.close (); +// +// ifstream transferOutFile (transferOutFilePath.c_str ()); +// +// if (transferOutFile.is_open ()) +// for (unsigned i = 0; i < sourceSentences.size (); i++) +// { +// cout << i << endl; +// +// string sourceSentence, tokenizedSentence; +// sourceSentence = sourceSentences[i]; +// tokenizedSentence = tokenizedSentences[i]; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, +// &spaces, tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, +// tlTokens, tlTags, rulesApplied, attrs, lists, &vars, +// spaces, localeId); +// +// // final outputs +// vector outs; +// // number of generated combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // beam tree +// vector, float> > beamTree; +// // rules combinations +// vector > combNodes; +// +// nodesPool = RuleExecution::getNodesPool (tokenRules); +// +// RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); +// +// vector newAmbigInfo; +// for (unsigned j 
= 0; j < ambigInfo.size (); j++) +// if (ambigInfo[j].combinations.size () > 1) +// newAmbigInfo.push_back (ambigInfo[j]); +// +// CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, +// localeId); +// +// RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, +// spaces); +// +// // read transfer +// string line; +// vector beamTransfers; +// for (unsigned j = 0; j < outs.size (); j++) +// { +// getline (transferOutFile, line); +// beamTransfers.push_back (line); +// } +// +// // write beam results +// ofstream beamFile (beamOutFilePath.c_str (), ofstream::app); +// if (beamFile.is_open ()) +// { +// beamFile << "source sentence (" << (i + 1) << ") : " << endl; +// beamFile << sourceSentence << endl << endl; +// // just take first best +// for (unsigned j = 0; j < /*outs.size ()*/1; j++) +// { +// beamFile << "target sentence " /*<< (j + 1)*/<< " : " << endl; +// beamFile << beamTransfers[j] << endl; +// beamFile << "weight = " << beamTree[j].second << endl; +// beamFile << "rules : "; +// for (unsigned k = 0; k < combNodes[j].size (); k++) +// if (combNodes[j][k].ruleId) +// beamFile << combNodes[j][k].ruleId << " "; +// beamFile << endl << endl; +// beamFile +// << "------------------------------------------------------------------" +// << endl << endl; +// } +// } +// beamFile.close (); +// } +// else +// { +// cout << "ERROR in opening files!" << endl; +// } +// transferOutFile.close (); +// } +// else +// { +// cout << "ERROR in opening files!" << endl; +// } +// return 0; +//} diff --git a/src/BeamSearch.cpp b/src/BeamSearch.cpp index ecedef4..016d32b 100644 --- a/src/BeamSearch.cpp +++ b/src/BeamSearch.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -18,7 +19,6 @@ #include "RuleExecution.h" #include "TranElemLiterals.h" #include "CLExec.h" -#include "BeamSearch.h" #include @@ -26,365 +26,162 @@ using namespace std; using namespace pugi; using namespace elem; -void BeamSearch::transfer(string transferFilePath, string modelsFileDest, - string k, FILE* lextorFileFile, FILE* outFile) { - - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file(transferFilePath.c_str()); - if (string(result.description()) != "No error") { - cout << "ERROR : " << result.description() << endl; - exit(EXIT_FAILURE); +int +main (int argc, char **argv) +{ + string lextorFilePath, interInFilePath, localeId, transferFilePath, modelsDest, k; + + if (argc == 7) + { + localeId = argv[1]; + transferFilePath = argv[2]; + lextorFilePath = argv[3]; + interInFilePath = argv[4]; + modelsDest = argv[5]; + k = argv[6]; + } + else + { + localeId = "es_ES"; + transferFilePath = "apertium-eng-spa.spa-eng.t1x"; + lextorFilePath = "lextor.txt"; + interInFilePath = "beaminter.txt"; + modelsDest = "/home/aboelhamd/Downloads/models"; + k = "8"; + +// localeId = "kk_KZ"; +// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +// sentenceFilePath = "src.txt"; +// lextorFilePath = "lextor.txt"; +// interInFilePath = "beam-inter.txt"; +// modelsDest = "./UntitledFolder/models"; +// k = "8"; + + cout << "Error in parameters !" << endl; + cout + << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath interInFilePath modelsDest beamSize" + << endl; + cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + << endl; + cout << "transferFilePath : Apertium transfer file of the language pair used." 
+ << endl; + cout << "lextorFilePath : Apertium lextor file for the source language sentences." + << endl; + cout + << "interInFilePath : Output file of this program which is the input for apertium interchunk." + << endl; + cout << "modelsDest : Yasmet models destination." << endl; + cout << "beamSize : The size of beam in beam search algorithm." << endl; + return -1; + } + + ifstream lextorFile (lextorFilePath.c_str ()); + ofstream interInFile (interInFilePath.c_str ()); + if (lextorFile.is_open () && interInFile.is_open ()) + { + // load transfer file in an xml document object + xml_document transferDoc; + xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); + if (string (result.description ()) != "No error") + { + cout << "ERROR : " << result.description () << endl; + return -1; } - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child("transfer"); - - map > > attrs = RuleParser::getAttrs( - transfer); - map vars = RuleParser::getVars(transfer); - map > lists = RuleParser::getLists(transfer); - - string localeId; - map > > classesWeights = - CLExec::loadYasmetModels(modelsFileDest, &localeId); + // xml node of the parent node (transfer) in the transfer file + xml_node transfer = transferDoc.child ("transfer"); - int beam; - stringstream buffer(k); - buffer >> beam; + map > > attrs = RuleParser::getAttrs (transfer); + map vars = RuleParser::getVars (transfer); + map > lists = RuleParser::getLists (transfer); + map > > classesWeights = + CLExec::loadYasmetModels (modelsDest); - char buff[10240]; - string tokenizedSentence; - while (fgets(buff, 10240, lextorFileFile)) { - tokenizedSentence = buff; + int beam; + stringstream buffer (k); + buffer >> beam; - // spaces after each token - vector spaces; + string tokenizedSentence; + while (getline (lextorFile, tokenizedSentence)) + { + // cout << i << endl; - // tokens in the sentence order - vector slTokens, tlTokens; + // spaces after each token + vector spaces; - // tags of tokens in order - vector > slTags, tlTags; + // tokens in the sentence order + vector slTokens, tlTokens; - RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, &tlTags, - &spaces, tokenizedSentence); + // tags of tokens in order + vector > slTags, tlTags; - // map of tokens ids and their matched categories - map > catsApplied; + RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, + tokenizedSentence); - RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); + // map of tokens ids and their matched categories + map > catsApplied; - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; + RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, transfer); + // map of matched rules and a pair of first token id and patterns number + map > > rulesApplied; - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; + RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; + // rule and (target) token map to specific output + // if rule has many patterns we will choose the first token only + map > ruleOutputs; - RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, - tlTokens, tlTags, rulesApplied, attrs, lists, &vars, 
spaces, - localeId); + // map (target) token to all matched rules ids and the number of pattern items of each rule + map > > tokenRules; - // final outputs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // beam tree - vector, float> > beamTree; - // rules combinations - vector > combNodes; + RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, + tlTags, rulesApplied, attrs, lists, &vars, spaces, + localeId); - nodesPool = RuleExecution::getNodesPool(tokenRules); + // final outputs + vector outs; + // number of generated combinations + unsigned compNum; + // nodes for every token and rule + map > nodesPool; + // ambiguous informations + vector ambigInfo; + // beam tree + vector, float> > beamTree; + // rules combinations + vector > combNodes; - RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo, - &compNum); + nodesPool = RuleExecution::getNodesPool (tokenRules); - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size(); j++) - if (ambigInfo[j].combinations.size() > 1) - newAmbigInfo.push_back(ambigInfo[j]); + RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - CLExec::beamSearch(&beamTree, beam, slTokens, newAmbigInfo, - classesWeights, localeId); + vector newAmbigInfo; + for (unsigned j = 0; j < ambigInfo.size (); j++) + if (ambigInfo[j]->combinations.size () > 1) + newAmbigInfo.push_back (ambigInfo[j]); - RuleExecution::getOuts(&outs, &combNodes, beamTree, nodesPool, - ruleOutputs, spaces); + CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, + localeId); - // write the outs - for (unsigned j = 0; j < outs.size(); j++) { - fputs(outs[j].c_str(), outFile); - } - } + RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, + spaces); -} + // write the outs -FILE * open_input(string const &filename) { - FILE *input = fopen(filename.c_str(), "r"); - if (!input) { - wcerr << "Error: can't open input file '"; - wcerr << filename.c_str() << "'." << endl; - exit(EXIT_FAILURE); - } - - return input; -} - -FILE * open_output(string const &filename) { - FILE *output = fopen(filename.c_str(), "w"); - if (!output) { - wcerr << "Error: can't open output file '"; - wcerr << filename.c_str() << "'." << endl; - exit(EXIT_FAILURE); - } - return output; -} + if (interInFile.is_open ()) + { + for (unsigned j = 0; j < outs.size (); j++) + interInFile << outs[j] << endl; + } -//int main(int argc, char **argv) { -// string sentenceFilePath, lextorFilePath, interInFilePath, localeId, -// transferFilePath, modelsDest, k; -// -// if (argc == 8) { -// localeId = argv[1]; -// transferFilePath = argv[2]; -// sentenceFilePath = argv[3]; -// lextorFilePath = argv[4]; -// interInFilePath = argv[5]; -// modelsDest = argv[6]; -// k = argv[7]; -// } else { -// localeId = "es_ES"; -// transferFilePath = "apertium-eng-spa.spa-eng.t1x"; -// sentenceFilePath = "sentences.txt"; -// lextorFilePath = "lextor.txt"; -// interInFilePath = "beaminter.txt"; -// modelsDest = "/home/aboelhamd/Downloads/models"; -// k = "8"; -// -//// localeId = "kk_KZ"; -//// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; -//// sentenceFilePath = "src.txt"; -//// lextorFilePath = "lextor.txt"; -//// interInFilePath = "beam-inter.txt"; -//// modelsDest = "./UntitledFolder/models"; -//// k = "8"; -// -// cout << "Error in parameters !" 
<< endl; -// cout -// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath interInFilePath modelsDest beamSize" -// << endl; -// cout -// << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" -// << endl; -// cout -// << "transferFilePath : Apertium transfer file of the language pair used." -// << endl; -// cout << "sentenceFilePath : Source language sentences file." << endl; -// cout -// << "lextorFilePath : Apertium lextor file for the source language sentences." -// << endl; -// cout -// << "interInFilePath : Output file of this program which is the input for apertium interchunk." -// << endl; -// cout << "modelsDest : Yasmet models destination." << endl; -// cout << "beamSize : The size of beam in beam search algorithm." << endl; -//// return -1; -// } -// -// ifstream lextorFile(lextorFilePath.c_str()); -// ifstream inSentenceFile(sentenceFilePath.c_str()); -// if (lextorFile.is_open() && inSentenceFile.is_open()) { -// // load transfer file in an xml document object -// xml_document transferDoc; -// xml_parse_result result = transferDoc.load_file( -// transferFilePath.c_str()); -// if (string(result.description()) != "No error") { -// cout << "ERROR : " << result.description() << endl; -// return -1; -// } -// -// // xml node of the parent node (transfer) in the transfer file -// xml_node transfer = transferDoc.child("transfer"); -// -// vector sourceSentences, tokenizedSentences; -// -// string tokenizedSentence; -// while (getline(lextorFile, tokenizedSentence)) { -// string sourceSentence; -// if (!getline(inSentenceFile, sourceSentence)) -// sourceSentence = "No more sentences"; -// -// sourceSentences.push_back(sourceSentence); -// tokenizedSentences.push_back(tokenizedSentence); -// } -// lextorFile.close(); -// inSentenceFile.close(); -// -// map > > attrs = RuleParser::getAttrs( -// transfer); -// map vars = RuleParser::getVars(transfer); -// map > lists = RuleParser::getLists(transfer); -// -// map > > classesWeights = -// CLExec::loadYasmetModels(modelsDest); -// -// vector > vouts; -// -// int beam; -// stringstream buffer(k); -// buffer >> beam; -// for (unsigned i = 0; i < sourceSentences.size(); i++) { -// cout << i << endl; -// -// string sourceSentence, tokenizedSentence; -// sourceSentence = sourceSentences[i]; -// tokenizedSentence = tokenizedSentences[i]; -// -// // spaces after each token -// vector spaces; -// -// // tokens in the sentence order -// vector slTokens, tlTokens; -// -// // tags of tokens in order -// vector > slTags, tlTags; -// -// RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, -// &tlTags, &spaces, tokenizedSentence); -// -// // map of tokens ids and their matched categories -// map > catsApplied; -// -// RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); -// -// // map of matched rules and a pair of first token id and patterns number -// map > > rulesApplied; -// -// RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, -// transfer); -// -// // rule and (target) token map to specific output -// // if rule has many patterns we will choose the first token only -// map > ruleOutputs; -// -// // map (target) token to all matched rules ids and the number of pattern items of each rule -// map > > tokenRules; -// -// RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, -// tlTokens, tlTags, rulesApplied, attrs, lists, &vars, spaces, -// localeId); -// -// // final outputs -// vector outs; -// // number of generated combinations -// unsigned compNum; -// // 
nodes for every token and rule -// map > nodesPool; -// // ambiguous informations -// vector ambigInfo; -// // beam tree -// vector, float> > beamTree; -// // rules combinations -// vector > combNodes; -// -// nodesPool = RuleExecution::getNodesPool(tokenRules); -// -// RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo, -// &compNum); -// -// vector newAmbigInfo; -// for (unsigned j = 0; j < ambigInfo.size(); j++) -// if (ambigInfo[j].combinations.size() > 1) -// newAmbigInfo.push_back(ambigInfo[j]); -// -// CLExec::beamSearch(&beamTree, beam, slTokens, newAmbigInfo, -// classesWeights, localeId); -// -// RuleExecution::getOuts(&outs, &combNodes, beamTree, nodesPool, -// ruleOutputs, spaces); -// -// vouts.push_back(outs); -// } -// -// // write the outs -// ofstream interInFile(interInFilePath.c_str()); -// if (interInFile.is_open()) -// for (unsigned i = 0; i < vouts.size(); i++) { -// for (unsigned j = 0; j < vouts[i].size(); j++) -// interInFile << vouts[i][j] << endl; -// } -// else -// cout << "ERROR in opening files!" << endl; -// interInFile.close(); -// -// } else { -// cout << "ERROR in opening files!" << endl; -// } -// return 0; -//} - -//int main(int argc, char *argv[]) { -// -// string transferFilePath, localeId, modelsDest, k; -// FILE *input = stdin, *output = stdout; -// -// if (argc == 7) { -// output = open_output(argv[argc - 1]); -// input = open_input(argv[argc - 2]); -// k = argv[argc - 3]; -// modelsDest = argv[argc - 4]; -// localeId = argv[argc - 5]; -// transferFilePath = argv[argc - 6]; -// } else if (argc == 6) { -// input = open_input(argv[argc - 1]); -// k = argv[argc - 2]; -// modelsDest = argv[argc - 3]; -// localeId = argv[argc - 4]; -// transferFilePath = argv[argc - 5]; -// } else if (argc == 5) { -// k = argv[argc - 1]; -// modelsDest = argv[argc - 2]; -// localeId = argv[argc - 3]; -// transferFilePath = argv[argc - 4]; -// } -// -// BeamSearch::transfer(transferFilePath, localeId, modelsDest, k, input, -// output); -//} - -int main() { - string localeid; - map > > models = CLExec::loadYasmetModels( - "/home/aboelhamd/Downloads/newmodel2.model", &localeid); - cout << localeid << endl; -// cout << models["34+36_32+.model"][""]; - -// vector weights = -// models["33_68+33_67+33_95+33_113+115_71+115_66+.model"]["muy_0"]; -// for (unsigned i = 0; i < weights.size(); i++) { -// cout << weights[i] << endl; -// } -// cout << endl; - for (map > >::iterator it = - models.begin(); it != models.end(); it++) { - cout << "model=" << it->first << endl; - for (map >::iterator it2 = it->second.begin(); - it2 != it->second.end(); it2++) { - cout << "word= " << it2->first << endl; - vector weights = it2->second; - for (unsigned i = 0; i < weights.size(); i++) { - cout << weights[i] << endl; - } - cout << endl; - } + interInFile.close (); + lextorFile.close (); } + } + else + { + cout << "ERROR in opening files!" 
<< endl; + } + return 0; } diff --git a/src/BeamSearch.h b/src/BeamSearch.h index 0872415..6793339 100644 --- a/src/BeamSearch.h +++ b/src/BeamSearch.h @@ -12,9 +12,11 @@ using namespace std; -class BeamSearch { +class BeamSearch +{ public: - static void transfer(string transferFilePath, string modelsFileDest, - string k, FILE* lextorFileFile, FILE* outFile); + static void + transfer (string transferFilePath, string localeId, string modelsFileDest, string k, + FILE* lextorFileFile, FILE* outFile); }; #endif /* SRC_BEAMSEARCH_H_ */ diff --git a/src/BeamSearch2.cpp b/src/BeamSearch2.cpp new file mode 100644 index 0000000..2225b71 --- /dev/null +++ b/src/BeamSearch2.cpp @@ -0,0 +1,426 @@ +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "../pugixml/pugixml.hpp" +//#include "RuleParser.h" +//#include "RuleExecution.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +//#include "BeamSearch.h" +// +//#include +// +//using namespace std; +//using namespace pugi; +//using namespace elem; +// +//void +//BeamSearch::transfer (string transferFilePath, string localeId, string modelsFileDest, +// string k, FILE* lextorFileFile, FILE* outFile) +//{ +// +// // load transfer file in an xml document object +// xml_document transferDoc; +// xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); +// if (string (result.description ()) != "No error") +// { +// cout << "ERROR : " << result.description () << endl; +// exit (EXIT_FAILURE); +// } +// +// // xml node of the parent node (transfer) in the transfer file +// xml_node transfer = transferDoc.child ("transfer"); +// +// map > > attrs = RuleParser::getAttrs (transfer); +// map vars = RuleParser::getVars (transfer); +// map > lists = RuleParser::getLists (transfer); +// +// map > > classesWeights = CLExec::loadYasmetModels ( +// modelsFileDest); +// +// int beam; +// stringstream buffer (k); +// buffer >> beam; +// +// char buff[10240]; +// string tokenizedSentence; +// while (fgets (buff, 10240, lextorFileFile)) +// { +// tokenizedSentence = buff; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, +// tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, +// tlTags, rulesApplied, attrs, lists, &vars, spaces, +// localeId); +// +// // final outputs +// vector outs; +// // number of generated combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // beam tree +// vector, float> > beamTree; +// // rules 
combinations +// vector > combNodes; +// +// nodesPool = RuleExecution::getNodesPool (tokenRules); +// +// RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); +// +// vector newAmbigInfo; +// for (unsigned j = 0; j < ambigInfo.size (); j++) +// if (ambigInfo[j]->combinations.size () > 1) +// newAmbigInfo.push_back (ambigInfo[j]); +// +// CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, +// localeId); +// +// RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, +// spaces); +// +// // write the outs +// for (unsigned j = 0; j < outs.size (); j++) +// { +// fputs (outs[j].c_str (), outFile); +// } +// +// // delete AmbigInfo pointers +// for (unsigned j = 0; j < ambigInfo.size (); j++) +// { +// // delete the dummy node pointers +// set dummies; +// for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++) +// dummies.insert (ambigInfo[j]->combinations[k][0]); +// for (set::iterator it = dummies.begin (); +// it != dummies.end (); it++) +// delete (*it); +// +// delete ambigInfo[j]; +// } +// // delete Node pointers +// for (map >::iterator it = nodesPool.begin (); +// it != nodesPool.end (); it++) +// { +// for (unsigned j = 0; j < it->second.size (); j++) +// { +// delete it->second[j]; +// } +// } +// +// } +// +//} +// +//FILE * +//open_input (string const &filename) +//{ +// FILE *input = fopen (filename.c_str (), "r"); +// if (!input) +// { +// wcerr << "Error: can't open input file '"; +// wcerr << filename.c_str () << "'." << endl; +// exit (EXIT_FAILURE); +// } +// +// return input; +//} +// +//FILE * +//open_output (string const &filename) +//{ +// FILE *output = fopen (filename.c_str (), "w"); +// if (!output) +// { +// wcerr << "Error: can't open output file '"; +// wcerr << filename.c_str () << "'." << endl; +// exit (EXIT_FAILURE); +// } +// return output; +//} +// +////int main(int argc, char **argv) { +//// string sentenceFilePath, lextorFilePath, interInFilePath, localeId, +//// transferFilePath, modelsDest, k; +//// +//// if (argc == 8) { +//// localeId = argv[1]; +//// transferFilePath = argv[2]; +//// sentenceFilePath = argv[3]; +//// lextorFilePath = argv[4]; +//// interInFilePath = argv[5]; +//// modelsDest = argv[6]; +//// k = argv[7]; +//// } else { +//// localeId = "es_ES"; +//// transferFilePath = "apertium-eng-spa.spa-eng.t1x"; +//// sentenceFilePath = "sentences.txt"; +//// lextorFilePath = "lextor.txt"; +//// interInFilePath = "beaminter.txt"; +//// modelsDest = "/home/aboelhamd/Downloads/models"; +//// k = "8"; +//// +////// localeId = "kk_KZ"; +////// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +////// sentenceFilePath = "src.txt"; +////// lextorFilePath = "lextor.txt"; +////// interInFilePath = "beam-inter.txt"; +////// modelsDest = "./UntitledFolder/models"; +////// k = "8"; +//// +//// cout << "Error in parameters !" << endl; +//// cout +//// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath interInFilePath modelsDest beamSize" +//// << endl; +//// cout +//// << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +//// << endl; +//// cout +//// << "transferFilePath : Apertium transfer file of the language pair used." +//// << endl; +//// cout << "sentenceFilePath : Source language sentences file." << endl; +//// cout +//// << "lextorFilePath : Apertium lextor file for the source language sentences." 
+//// << endl; +//// cout +//// << "interInFilePath : Output file of this program which is the input for apertium interchunk." +//// << endl; +//// cout << "modelsDest : Yasmet models destination." << endl; +//// cout << "beamSize : The size of beam in beam search algorithm." << endl; +////// return -1; +//// } +//// +//// ifstream lextorFile(lextorFilePath.c_str()); +//// ifstream inSentenceFile(sentenceFilePath.c_str()); +//// if (lextorFile.is_open() && inSentenceFile.is_open()) { +//// // load transfer file in an xml document object +//// xml_document transferDoc; +//// xml_parse_result result = transferDoc.load_file( +//// transferFilePath.c_str()); +//// if (string(result.description()) != "No error") { +//// cout << "ERROR : " << result.description() << endl; +//// return -1; +//// } +//// +//// // xml node of the parent node (transfer) in the transfer file +//// xml_node transfer = transferDoc.child("transfer"); +//// +//// vector sourceSentences, tokenizedSentences; +//// +//// string tokenizedSentence; +//// while (getline(lextorFile, tokenizedSentence)) { +//// string sourceSentence; +//// if (!getline(inSentenceFile, sourceSentence)) +//// sourceSentence = "No more sentences"; +//// +//// sourceSentences.push_back(sourceSentence); +//// tokenizedSentences.push_back(tokenizedSentence); +//// } +//// lextorFile.close(); +//// inSentenceFile.close(); +//// +//// map > > attrs = RuleParser::getAttrs( +//// transfer); +//// map vars = RuleParser::getVars(transfer); +//// map > lists = RuleParser::getLists(transfer); +//// +//// map > > classesWeights = +//// CLExec::loadYasmetModels(modelsDest); +//// +//// vector > vouts; +//// +//// int beam; +//// stringstream buffer(k); +//// buffer >> beam; +//// for (unsigned i = 0; i < sourceSentences.size(); i++) { +//// cout << i << endl; +//// +//// string sourceSentence, tokenizedSentence; +//// sourceSentence = sourceSentences[i]; +//// tokenizedSentence = tokenizedSentences[i]; +//// +//// // spaces after each token +//// vector spaces; +//// +//// // tokens in the sentence order +//// vector slTokens, tlTokens; +//// +//// // tags of tokens in order +//// vector > slTags, tlTags; +//// +//// RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, +//// &tlTags, &spaces, tokenizedSentence); +//// +//// // map of tokens ids and their matched categories +//// map > catsApplied; +//// +//// RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); +//// +//// // map of matched rules and a pair of first token id and patterns number +//// map > > rulesApplied; +//// +//// RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, +//// transfer); +//// +//// // rule and (target) token map to specific output +//// // if rule has many patterns we will choose the first token only +//// map > ruleOutputs; +//// +//// // map (target) token to all matched rules ids and the number of pattern items of each rule +//// map > > tokenRules; +//// +//// RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, +//// tlTokens, tlTags, rulesApplied, attrs, lists, &vars, spaces, +//// localeId); +//// +//// // final outputs +//// vector outs; +//// // number of generated combinations +//// unsigned compNum; +//// // nodes for every token and rule +//// map > nodesPool; +//// // ambiguous informations +//// vector ambigInfo; +//// // beam tree +//// vector, float> > beamTree; +//// // rules combinations +//// vector > combNodes; +//// +//// nodesPool = RuleExecution::getNodesPool(tokenRules); +//// +//// 
RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo, +//// &compNum); +//// +//// vector newAmbigInfo; +//// for (unsigned j = 0; j < ambigInfo.size(); j++) +//// if (ambigInfo[j].combinations.size() > 1) +//// newAmbigInfo.push_back(ambigInfo[j]); +//// +//// CLExec::beamSearch(&beamTree, beam, slTokens, newAmbigInfo, +//// classesWeights, localeId); +//// +//// RuleExecution::getOuts(&outs, &combNodes, beamTree, nodesPool, +//// ruleOutputs, spaces); +//// +//// vouts.push_back(outs); +//// } +//// +//// // write the outs +//// ofstream interInFile(interInFilePath.c_str()); +//// if (interInFile.is_open()) +//// for (unsigned i = 0; i < vouts.size(); i++) { +//// for (unsigned j = 0; j < vouts[i].size(); j++) +//// interInFile << vouts[i][j] << endl; +//// } +//// else +//// cout << "ERROR in opening files!" << endl; +//// interInFile.close(); +//// +//// } else { +//// cout << "ERROR in opening files!" << endl; +//// } +//// return 0; +////} +// +////int main(int argc, char *argv[]) { +//// +//// string transferFilePath, localeId, modelsDest, k; +//// FILE *input = stdin, *output = stdout; +//// +//// if (argc == 7) { +//// output = open_output(argv[argc - 1]); +//// input = open_input(argv[argc - 2]); +//// k = argv[argc - 3]; +//// modelsDest = argv[argc - 4]; +//// localeId = argv[argc - 5]; +//// transferFilePath = argv[argc - 6]; +//// } else if (argc == 6) { +//// input = open_input(argv[argc - 1]); +//// k = argv[argc - 2]; +//// modelsDest = argv[argc - 3]; +//// localeId = argv[argc - 4]; +//// transferFilePath = argv[argc - 5]; +//// } else if (argc == 5) { +//// k = argv[argc - 1]; +//// modelsDest = argv[argc - 2]; +//// localeId = argv[argc - 3]; +//// transferFilePath = argv[argc - 4]; +//// } +//// +//// BeamSearch::transfer(transferFilePath, localeId, modelsDest, k, input, +//// output); +////} +// +//int +//main () +//{ +// map > > models = CLExec::loadYasmetModels ( +// "/home/aboelhamd/Downloads/newmodel2.model"); +//// cout << models["34+36_32+.model"][""]; +// +//// vector weights = +//// models["33_68+33_67+33_95+33_113+115_71+115_66+.model"]["muy_0"]; +//// for (unsigned i = 0; i < weights.size(); i++) { +//// cout << weights[i] << endl; +//// } +//// cout << endl; +// for (map > >::iterator it = models.begin (); +// it != models.end (); it++) +// { +// cout << "model=" << it->first << endl; +// for (map >::iterator it2 = it->second.begin (); +// it2 != it->second.end (); it2++) +// { +// cout << "word= " << it2->first << endl; +// vector weights = it2->second; +// for (unsigned i = 0; i < weights.size (); i++) +// { +// cout << weights[i] << endl; +// } +// cout << endl; +// } +// } +//} diff --git a/src/CLExec.cpp b/src/CLExec.cpp index 805741f..e54475f 100644 --- a/src/CLExec.cpp +++ b/src/CLExec.cpp @@ -33,103 +33,118 @@ using namespace std; using namespace pugi; using namespace elem; -string exec(string cmd) { - string data; - FILE * stream; - const int max_buffer = 256; - char buffer[max_buffer]; - - stream = popen(cmd.c_str(), "r"); - if (stream) { - while (!feof(stream)) - if (fgets(buffer, max_buffer, stream) != NULL) - data.append(buffer); - pclose(stream); - } - return data; +string +exec (string cmd) +{ + string data; + FILE * stream; + const int max_buffer = 256; + char buffer[max_buffer]; + + stream = popen (cmd.c_str (), "r"); + if (stream) + { + while (!feof (stream)) + if (fgets (buffer, max_buffer, stream) != NULL) + data.append (buffer); + pclose (stream); + } + return data; } -void CLExec::segmenter(string inFilePath, string 
outFilePath) { - // clear file before writing again - ofstream ofs; - ofs.open(outFilePath.c_str(), ofstream::out | ofstream::trunc); - exec( - string("ruby2.3 kazSentenceTokenizer.rb ") + inFilePath - + string(" ") + outFilePath); +void +CLExec::segmenter (string inFilePath, string outFilePath) +{ + // clear file before writing again + ofstream ofs; + ofs.open (outFilePath.c_str (), ofstream::out | ofstream::trunc); + exec ( + string ("ruby2.3 kazSentenceTokenizer.rb ") + inFilePath + string (" ") + + outFilePath); } -void CLExec::biltrans(string inFilePath, string outFilePath) { - // clear file before writing again - ofstream ofs; - ofs.open(outFilePath.c_str(), ofstream::out | ofstream::trunc); - exec( - string("apertium -d $HOME/apertium-kaz-tur kaz-tur-biltrans ") - + inFilePath + string(" ") + outFilePath); +void +CLExec::biltrans (string inFilePath, string outFilePath) +{ + // clear file before writing again + ofstream ofs; + ofs.open (outFilePath.c_str (), ofstream::out | ofstream::trunc); + exec ( + string ("apertium -d $HOME/apertium-kaz-tur kaz-tur-biltrans ") + inFilePath + + string (" ") + outFilePath); } -void CLExec::lextor(string inFilePath, string outFilePath) { - // clear file before writing again - ofstream ofs; - ofs.open(outFilePath.c_str(), ofstream::out | ofstream::trunc); - exec( - string("lrx-proc -m $HOME/apertium-kaz-tur/kaz-tur.autolex.bin ") - + inFilePath + string(" >") + outFilePath); +void +CLExec::lextor (string inFilePath, string outFilePath) +{ + // clear file before writing again + ofstream ofs; + ofs.open (outFilePath.c_str (), ofstream::out | ofstream::trunc); + exec ( + string ("lrx-proc -m $HOME/apertium-kaz-tur/kaz-tur.autolex.bin ") + inFilePath + + string (" >") + outFilePath); } -void CLExec::interchunk(string inFilePath, string outFilePath) { - exec( - string("apertium-interchunk") - + string( - " $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t2x") - + string(" $HOME/apertium-kaz-tur/kaz-tur.t2x.bin ") - + inFilePath + string(" ") + outFilePath); +void +CLExec::interchunk (string inFilePath, string outFilePath) +{ + exec ( + string ("apertium-interchunk") + + string (" $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t2x") + + string (" $HOME/apertium-kaz-tur/kaz-tur.t2x.bin ") + inFilePath + + string (" ") + outFilePath); } -void CLExec::postchunk(string inFilePath, string outFilePath) { - exec( - string("apertium-postchunk") - + string( - " $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t3x") - + string(" $HOME/apertium-kaz-tur/kaz-tur.t3x.bin ") - + inFilePath + string(" ") + outFilePath); +void +CLExec::postchunk (string inFilePath, string outFilePath) +{ + exec ( + string ("apertium-postchunk") + + string (" $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t3x") + + string (" $HOME/apertium-kaz-tur/kaz-tur.t3x.bin ") + inFilePath + + string (" ") + outFilePath); } -void CLExec::transfer(string inFilePath, string outFilePath) { - exec( - string("apertium-transfer -n") - + string( - " $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t4x") - + string(" $HOME/apertium-kaz-tur/kaz-tur.t4x.bin ") - + inFilePath - + string( - " | lt-proc -g $HOME/apertium-kaz-tur/kaz-tur.autogen.bin") - + string( - " | lt-proc -p $HOME/apertium-kaz-tur/kaz-tur.autopgen.bin") - + string(" >") + outFilePath); +void +CLExec::transfer (string inFilePath, string outFilePath) +{ + exec ( + string ("apertium-transfer -n") + + string (" $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t4x") + + string (" $HOME/apertium-kaz-tur/kaz-tur.t4x.bin ") + inFilePath + + string (" 
| lt-proc -g $HOME/apertium-kaz-tur/kaz-tur.autogen.bin") + + string (" | lt-proc -p $HOME/apertium-kaz-tur/kaz-tur.autopgen.bin") + + string (" >") + outFilePath); } -void CLExec::assignWeights(string inFilePath, string outFilePath) { - exec( - (string("python3 $HOME/NormaliseK/exampleken.py <") - + string(inFilePath) + string(">") + string(outFilePath)).c_str()); +void +CLExec::assignWeights (string inFilePath, string outFilePath) +{ + exec ( + (string ("python3 $HOME/NormaliseK/exampleken.py <") + string (inFilePath) + + string (">") + string (outFilePath)).c_str ()); } -vector CLExec::getFilesInDir(string dir) { - vector files; - - DIR *pDIR; - struct dirent *entry; - if ((pDIR = opendir((string("./") + dir).c_str()))) { - while ((entry = readdir(pDIR))) { - if (strcmp(entry->d_name, ".") != 0 - && strcmp(entry->d_name, "..") != 0) { - files.push_back(entry->d_name); - } - } - closedir(pDIR); +vector +CLExec::getFilesInDir (string dir) +{ + vector files; + + DIR *pDIR; + struct dirent *entry; + if ((pDIR = opendir ((string ("./") + dir).c_str ()))) + { + while ((entry = readdir (pDIR))) + { + if (strcmp (entry->d_name, ".") != 0 && strcmp (entry->d_name, "..") != 0) + { + files.push_back (entry->d_name); + } } + closedir (pDIR); + } - return files; + return files; } //void @@ -147,57 +162,63 @@ vector CLExec::getFilesInDir(string dir) { // } //} -map > > CLExec::loadYasmetModels( - string modelsFilePath, string *localeid) { - // map with key yasmet model name and the value is - // another map with key word name and the value is - // vector of weights in order - map > > classWeights; - - ifstream modelsFile((modelsFilePath).c_str()); - - if (modelsFile.is_open()) { - string line, model, token, weight; - - // localeid - getline(modelsFile, line); - *localeid = line; - - while (getline(modelsFile, line)) { - // 0=>word , 1=>rule_num & 2=>wieght - // we don't need rule number , because - // the weights are already sorted - - char lineChar[line.size()]; - strcpy(lineChar, line.c_str()); - - token = strtok(lineChar, ": "); - if (token == "file") { - model = strtok(NULL, ": "); - continue; - } - // skip rule_num - strtok(NULL, ": "); +map > > +CLExec::loadYasmetModels (string modelsFilePath/*, string *localeid*/) +{ + // map with key yasmet model name and the value is + // another map with key word name and the value is + // vector of weights in order + map > > classWeights; + + ifstream modelsFile ((modelsFilePath).c_str ()); + + if (modelsFile.is_open ()) + { + string line, model, token, weight; + + // localeid +// getline (modelsFile, line); +// *localeid = line; + + while (getline (modelsFile, line)) + { + // 0=>word , 1=>rule_num & 2=>wieght + // we don't need rule number , because + // the weights are already sorted + + char lineChar[line.size ()]; + strcpy (lineChar, line.c_str ()); + + token = strtok (lineChar, ": "); + if (token == "file") + { + model = strtok (NULL, ": "); + continue; + } + // skip rule_num + strtok (NULL, ": "); // cout << "rulenum= " << strtok(NULL, ": ") << endl; - weight = strtok(NULL, ": "); + weight = strtok (NULL, ": "); // cout << "weight= " << weight << endl; - float w = strtof(weight.c_str(), NULL); + float w = strtof (weight.c_str (), NULL); // cout << w << endl; // if (w < 0) // cout << w << endl; - classWeights[model][token].push_back(w); + classWeights[model][token].push_back (w); // if (classWeights[model][token][classWeights[model][token].size() - 1] // < 0) // cout << w << endl; // cout // << 
classWeights[model][token][classWeights[model][token].size() // - 1] << endl; - } - } else { - cout << "error in opening models file" << endl; } + } + else + { + cout << "error in opening models file" << endl; + } // for (map > >::iterator it = // classWeights.begin(); it != classWeights.end(); it++) { // cout << "model=" << it->first << endl; @@ -211,71 +232,84 @@ map > > CLExec::loadYasmetModels( // cout << endl; // } // } - return classWeights; + return classWeights; } -string CLExec::toLowerCase(string word, string localeId) { - icu::UnicodeString uString(word.c_str()); - string lowWord; - uString.toLower(localeId.c_str()).toUTF8String(lowWord); - return lowWord; +string +CLExec::toLowerCase (string word, string localeId) +{ + icu::UnicodeString uString (word.c_str ()); + string lowWord; + uString.toLower (localeId.c_str ()).toUTF8String (lowWord); + return lowWord; } -string CLExec::toUpperCase(string word, string localeId) { - icu::UnicodeString uString(word.c_str()); - string upWord; - uString.toUpper(localeId.c_str()).toUTF8String(upWord); - return upWord; +string +CLExec::toUpperCase (string word, string localeId) +{ + icu::UnicodeString uString (word.c_str ()); + string upWord; + uString.toUpper (localeId.c_str ()).toUTF8String (upWord); + return upWord; } -string CLExec::FirLetUpperCase(string word, string localeId) { - icu::UnicodeString uString(word.c_str()); - uString.toLower(localeId.c_str()); - uString.setCharAt(0, - icu::UnicodeString(uString.charAt(0)).toUpper(localeId.c_str()).charAt( - 0)); - - string upWord; - uString.toUTF8String(upWord); - return upWord; +string +CLExec::FirLetUpperCase (string word, string localeId) +{ + icu::UnicodeString uString (word.c_str ()); + uString.toLower (localeId.c_str ()); + uString.setCharAt ( + 0, icu::UnicodeString (uString.charAt (0)).toUpper (localeId.c_str ()).charAt (0)); + + string upWord; + uString.toUTF8String (upWord); + return upWord; } // The result of bitwise character comparison: 0 if this contains // the same characters as text, -1 if the characters in this are // bitwise less than the characters in text, +1 if the characters // in this are bitwise greater than the characters in text. 
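(Editor's note: a minimal self-contained sketch of the ICU-backed caseless comparison that compare/compareCaseless below wrap, assuming only that ICU's icu::UnicodeString from unicode/unistr.h is available; the main driver and the sample words are illustrative, not part of the commit.)

    #include <iostream>
    #include <string>
    #include <unicode/unistr.h>

    // Lowercase both words under the given ICU locale ID, then compare them
    // bitwise: 0 if equal, negative if word1 < word2, positive if word1 > word2.
    int
    compareCaseless (const std::string &word1, const std::string &word2,
                     const std::string &localeId)
    {
      icu::UnicodeString u1 (word1.c_str ());
      u1.toLower (localeId.c_str ());
      icu::UnicodeString u2 (word2.c_str ());
      u2.toLower (localeId.c_str ());
      return u1.compare (u2);
    }

    int
    main ()
    {
      // "kk_KZ" is the Kazakh locale ID used elsewhere in this commit.
      std::cout << compareCaseless ("Сөз", "сөз", "kk_KZ") << std::endl; // prints 0
      return 0;
    }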
 // The result of bitwise character comparison: 0 if this contains
 // the same characters as text, -1 if the characters in this are
 // bitwise less than the characters in text, +1 if the characters
 // in this are bitwise greater than the characters in text.
-int CLExec::compare(string word1, string word2) {
-	icu::UnicodeString uString1(word1.c_str());
-	icu::UnicodeString uString2(word2.c_str());
+int
+CLExec::compare (string word1, string word2)
+{
+  icu::UnicodeString uString1 (word1.c_str ());
+  icu::UnicodeString uString2 (word2.c_str ());
 
-	return uString1.compare(uString2);
+  return uString1.compare (uString2);
 }
 
-int CLExec::compareCaseless(string word1, string word2, string localeId) {
-	icu::UnicodeString uString1(word1.c_str());
-	uString1.toLower(localeId.c_str());
-	icu::UnicodeString uString2(word2.c_str());
-	uString2.toLower(localeId.c_str());
+int
+CLExec::compareCaseless (string word1, string word2, string localeId)
+{
+  icu::UnicodeString uString1 (word1.c_str ());
+  uString1.toLower (localeId.c_str ());
+  icu::UnicodeString uString2 (word2.c_str ());
+  uString2.toLower (localeId.c_str ());
 
-	return uString1.compare(uString2);
+  return uString1.compare (uString2);
 }
 
 // to sort translations from best to worst by their weight
-bool sortParameter(pair<vector<RuleExecution::Node>, float> a,
-		pair<vector<RuleExecution::Node>, float> b) {
-	return (a.second > b.second);
+bool
+sortParameter (pair<vector<RuleExecution::Node*>, float> a,
+	       pair<vector<RuleExecution::Node*>, float> b)
+{
+  return (a.second > b.second);
 }
 
-void CLExec::beamSearch(
-		vector<pair<vector<RuleExecution::Node>, float> > *beamTree,
-		unsigned beam, vector<string> slTokens,
-		vector<RuleExecution::AmbigInfo> ambigInfo,
-		map<string, map<string, vector<float> > > classesWeights,
-		string localeId) {
-	// Initialization
-	(*beamTree).push_back(pair<vector<RuleExecution::Node>, float>());
-
-	for (unsigned i = 0; i < ambigInfo.size(); i++) {
+void
+CLExec::beamSearch (vector<pair<vector<RuleExecution::Node*>, float> > *beamTree,
+		    unsigned beam, vector<string> slTokens,
+		    vector<RuleExecution::AmbigInfo*> ambigInfo,
+		    map<string, map<string, vector<float> > > classesWeights,
+		    string localeId)
+{
+  // Initialization
+  (*beamTree).push_back (pair<vector<RuleExecution::Node*>, float> ());
+
+  for (unsigned i = 0; i < ambigInfo.size (); i++)
+    {
 //      for (unsigned x = 0; x < beamTree->size (); x++)
 //	{
 //	  cout << "weight = " << (*beamTree)[x].second << endl;
@@ -287,129 +321,142 @@ void CLExec::beamSearch(
 //	}
 //      }
 
-		RuleExecution::AmbigInfo ambig = ambigInfo[i];
+      RuleExecution::AmbigInfo* ambig = ambigInfo[i];
 //      pair<unsigned, pair<unsigned, vector<vector<RuleExecution::Node> > > > p =
 //	  ambigInfo[i];
 //      pair<unsigned, unsigned> wordInd = p.first;
 //      vector<vector<RuleExecution::Node> > ambigRules = p.second.second;
-		unsigned ambigRulesSize = ambig.combinations.size();
-
-		// name of the file is the concatenation of rules ids
-		string rulesNums;
-		for (unsigned x = 0; x < ambigRulesSize; x++) {
-			// avoid dummy node
-			for (unsigned y = 1; y < ambig.combinations[x].size(); y++) {
-				stringstream ss;
-				ss << ambig.combinations[x][y].ruleId;
-				rulesNums += ss.str();
-
-				if (y + 1 < ambig.combinations[x].size())
-					rulesNums += "_";
-			}
-			rulesNums += "+";
-		}
+      unsigned ambigRulesSize = ambig->combinations.size ();
+
+      // name of the file is the concatenation of rules ids
+      string rulesNums;
+      for (unsigned x = 0; x < ambigRulesSize; x++)
+	{
+	  // avoid dummy node
+	  for (unsigned y = 1; y < ambig->combinations[x].size (); y++)
+	    {
+	      stringstream ss;
+	      ss << ambig->combinations[x][y]->ruleId;
+	      rulesNums += ss.str ();
+
+	      if (y + 1 < ambig->combinations[x].size ())
+		rulesNums += "_";
+	    }
+	  rulesNums += "+";
	}
 //      cout << rulesNums << endl;
 
-		map<string, vector<float> > classWeights = classesWeights[(rulesNums
-				+ ".model")];
-
-		// build new tree for the new words
-		vector<pair<vector<RuleExecution::Node>, float> > newTree;
-
-		// initialize the new tree
-		for (unsigned x = 0; x < ambigRulesSize; x++) {
-			newTree.push_back(
-					pair<vector<RuleExecution::Node>, float>(
-							vector<RuleExecution::Node>(), 0));
-		}
-		// put rules
-		for (unsigned z = 0; z < ambigRulesSize; z++) {
-			for (unsigned y = 0; y < ambig.combinations[z].size(); y++) {
-				newTree[z].first.push_back(ambig.combinations[z][y]);
-			}
-		}
-
-		for (unsigned x = ambig.firTokId; x < ambig.firTokId + ambig.maxPat;
-				x++) {
-			// word key is the word and its order in the rule
-			stringstream ss;
-			ss << x - ambig.firTokId;
-			string num = "_" + ss.str();
-
-			// handle the case of two lemmas separated by a space
-			for (unsigned t = 0; t < slTokens[x].size(); t++)
-				if (slTokens[x][t] == ' ')
-					slTokens[x].replace(t, 1, "_");
-
-			string word = toLowerCase(slTokens[x], localeId) + num;
-			vector<float> wordWeights = classWeights[word];
-
-			// put weights
-			if (wordWeights.empty()) {
-				for (unsigned z = 0; z < ambigRulesSize; z++)
-					newTree[z].second += 1;
-				cout << "word : " << word << " is not found in dataset : "
-						<< rulesNums << endl;
-			}
-
-			else
-				for (unsigned z = 0; z < ambigRulesSize; z++)
-					newTree[z].second += wordWeights[z];
-
-		}
-
-		// expand beamTree
-		unsigned initSize = beamTree->size();
-		for (unsigned z = 0; z < ambigRulesSize - 1; z++) {
-			for (unsigned x = 0; x < initSize; x++) {
-				beamTree->push_back(
-						pair<vector<RuleExecution::Node>, float>(
-								(*beamTree)[x]));
-			}
-		}
-
-		// merge the two trees
-		for (unsigned z = 0; z < ambigRulesSize; z++) {
-			for (unsigned x = initSize * z; x < initSize * (z + 1); x++) {
-				// put the new rules with the old
-				(*beamTree)[x].first.insert((*beamTree)[x].first.end(),
-						newTree[z].first.begin(), newTree[z].first.end());
-
-				// add their weights
-				(*beamTree)[x].second += newTree[z].second;
-			}
-		}
-
-		// sort beam tree
-		sort(beamTree->begin(), beamTree->end(), sortParameter);
-
-		// remove elements more than (beam)
-		if (beamTree->size() > beam)
-			beamTree->erase(beamTree->begin() + beam, beamTree->end());
+      map<string, vector<float> > classWeights = classesWeights[(rulesNums + ".model")];
+
+      // build new tree for the new words
+      vector<pair<vector<RuleExecution::Node*>, float> > newTree;
+
+      // initialize the new tree
+      for (unsigned x = 0; x < ambigRulesSize; x++)
+	{
+	  newTree.push_back (
+	      pair<vector<RuleExecution::Node*>, float> (vector<RuleExecution::Node*> (),
+							 0));
+	}
+      // put rules
+      for (unsigned z = 0; z < ambigRulesSize; z++)
+	{
+	  for (unsigned y = 0; y < ambig->combinations[z].size (); y++)
+	    {
+	      newTree[z].first.push_back (ambig->combinations[z][y]);
+	    }
+	}
+
+      for (unsigned x = ambig->firTokId; x < ambig->firTokId + ambig->maxPat; x++)
+	{
+	  // word key is the word and its order in the rule
+	  stringstream ss;
+	  ss << x - ambig->firTokId;
+	  string num = "_" + ss.str ();
+
+	  // handle the case of two lemmas separated by a space
+	  for (unsigned t = 0; t < slTokens[x].size (); t++)
+	    if (slTokens[x][t] == ' ')
+	      slTokens[x].replace (t, 1, "_");
+
+	  string word = toLowerCase (slTokens[x], localeId) + num;
+	  vector<float> wordWeights = classWeights[word];
+
+	  // put weights
+	  if (wordWeights.empty ())
+	    {
+	      for (unsigned z = 0; z < ambigRulesSize; z++)
+		newTree[z].second += 1;
+	      cout << "word : " << word << " is not found in dataset : " << rulesNums
+		  << endl;
+	    }
+
+	  else
+	    for (unsigned z = 0; z < ambigRulesSize; z++)
+	      newTree[z].second += wordWeights[z];
+
+	}
+
+      // expand beamTree
+      unsigned initSize = beamTree->size ();
+      for (unsigned z = 0; z < ambigRulesSize - 1; z++)
+	{
+	  for (unsigned x = 0; x < initSize; x++)
+	    {
+	      beamTree->push_back (
+		  pair<vector<RuleExecution::Node*>, float> ((*beamTree)[x]));
+	    }
	}
+
+      // merge the two trees
+      for (unsigned z = 0; z < ambigRulesSize; z++)
+	{
+	  for (unsigned x = initSize * z; x < initSize * (z + 1); x++)
+	    {
+	      // put the new rules with the old
+	      (*beamTree)[x].first.insert ((*beamTree)[x].first.end (),
+					   newTree[z].first.begin (),
+					   newTree[z].first.end ());
+
+	      // add their weights
+	      (*beamTree)[x].second += newTree[z].second;
+	    }
+	}
+
+      // sort beam tree
+      sort (beamTree->begin (), beamTree->end (),
sortParameter); + + // remove elements more than (beam) + if (beamTree->size () > beam) + beamTree->erase (beamTree->begin () + beam, beamTree->end ()); + } } -void CLExec::getTransInds(vector > *transInds, - vector, float> > beamTree, - vector > > rulesIds) { - for (unsigned i = 0; i < beamTree.size(); i++) { - vector transInd = beamTree[i].first; - for (unsigned j = 0; j < rulesIds.size(); j++) { - vector > weigInd = rulesIds[j]; - - unsigned count = 0; - for (unsigned x = 0; x < weigInd.size() && count < transInd.size(); - x++) { - if (transInd[count] == weigInd[x].first) - count++; - } - - if (count == transInd.size()) { - transInds->push_back( - pair(j, beamTree[i].second)); - break; - } - } +void +CLExec::getTransInds (vector > *transInds, + vector, float> > beamTree, + vector > > rulesIds) +{ + for (unsigned i = 0; i < beamTree.size (); i++) + { + vector transInd = beamTree[i].first; + for (unsigned j = 0; j < rulesIds.size (); j++) + { + vector > weigInd = rulesIds[j]; + + unsigned count = 0; + for (unsigned x = 0; x < weigInd.size () && count < transInd.size (); x++) + { + if (transInd[count] == weigInd[x].first) + count++; + } + + if (count == transInd.size ()) + { + transInds->push_back (pair (j, beamTree[i].second)); + break; + } } + } } diff --git a/src/CLExec.h b/src/CLExec.h index e807460..16fbc7b 100644 --- a/src/CLExec.h +++ b/src/CLExec.h @@ -17,68 +17,67 @@ using namespace std; using namespace pugi; -class CLExec { +class CLExec +{ public: - static void - segmenter(string inFilePath, string outFilePath); + static void + segmenter (string inFilePath, string outFilePath); - static void - lextor(string inFilePath, string outFilePath); + static void + lextor (string inFilePath, string outFilePath); - static void - biltrans(string inFilePath, string outFilePath); + static void + biltrans (string inFilePath, string outFilePath); - static void - interchunk(string inFilePath, string outFilePath); + static void + interchunk (string inFilePath, string outFilePath); - static void - postchunk(string inFilePath, string outFilePath); + static void + postchunk (string inFilePath, string outFilePath); - static void - transfer(string inFilePath, string outFilePath); + static void + transfer (string inFilePath, string outFilePath); - static void - assignWeights(string inFilePath, string outFilePath); + static void + assignWeights (string inFilePath, string outFilePath); - static vector - getFilesInDir(string dir); + static vector + getFilesInDir (string dir); // static void // runYasmet (); - static map > > - loadYasmetModels(string modelsDest, string *localeid); + static map > > + loadYasmetModels (string modelsDest/*, string *localeid*/); - static void - handleDatasets(); + static void + handleDatasets (); - static string - toLowerCase(string word, string localeId); + static string + toLowerCase (string word, string localeId); - static string - toUpperCase(string word, string localeId); + static string + toUpperCase (string word, string localeId); - static string - FirLetUpperCase(string word, string localeId); + static string + FirLetUpperCase (string word, string localeId); - static int - compare(string word1, string word2); + static int + compare (string word1, string word2); - static int - compareCaseless(string word1, string word2, string localeId); + static int + compareCaseless (string word1, string word2, string localeId); - static void - beamSearch(vector, float> > *beamTree, - unsigned beam, vector slTokens, - vector ambigInfo, - map > > classesWeights, - string localeId); + 
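beamSearch above grows one hypothesis per combination of ambiguous rules, adds the per-word class weights from the matching yasmet model, sorts with sortParameter and truncates to the beam width. A stripped-down sketch of that expand/score/sort/prune loop, with illustrative names and hard-coded scores standing in for the model weights:

#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>
using namespace std;

// A hypothesis: the alternatives chosen so far and their accumulated score.
typedef pair<vector<unsigned>, float> Hyp;

bool byScoreDesc (Hyp a, Hyp b)
{
  return a.second > b.second;
}

int main ()
{
  // Two ambiguous slots; scores[i][j] is the weight of alternative j.
  vector<vector<float> > scores (2);
  scores[0].push_back (0.7); scores[0].push_back (0.3);
  scores[1].push_back (0.2); scores[1].push_back (0.5); scores[1].push_back (0.3);

  unsigned beam = 2;
  vector<Hyp> tree (1); // initialization: a single empty hypothesis

  for (unsigned i = 0; i < scores.size (); i++)
    {
      vector<Hyp> expanded;
      for (unsigned h = 0; h < tree.size (); h++)       // expand: copy each hypothesis
	for (unsigned j = 0; j < scores[i].size (); j++) // ... once per alternative
	  {
	    Hyp hyp = tree[h];
	    hyp.first.push_back (j);
	    hyp.second += scores[i][j]; // additive scoring, as in beamSearch
	    expanded.push_back (hyp);
	  }
      // sort best-first, then prune everything beyond the beam width
      sort (expanded.begin (), expanded.end (), byScoreDesc);
      if (expanded.size () > beam)
	expanded.erase (expanded.begin () + beam, expanded.end ());
      tree = expanded;
    }

  for (unsigned h = 0; h < tree.size (); h++)
    cout << "score = " << tree[h].second << endl; // 1.2 and 1.0 survive
  return 0;
}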
static void + beamSearch (vector, float> > *beamTree, unsigned beam, + vector slTokens, vector ambigInfo, + map > > classesWeights, string localeId); - static void - getTransInds(vector > *transInds, - vector, float> > beamTree, - vector > > rulesIds); + static void + getTransInds (vector > *transInds, + vector, float> > beamTree, + vector > > rulesIds); }; #endif /* SRC_CLEXEC_H_ */ diff --git a/src/ModelResult.cpp b/src/ModelResult.cpp index 1e4be82..d15f9d0 100644 --- a/src/ModelResult.cpp +++ b/src/ModelResult.cpp @@ -1,376 +1,376 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../pugixml/pugixml.hpp" -#include "RuleParser.h" -#include "RuleExecution.h" -#include "TranElemLiterals.h" -#include "CLExec.h" - -#include - -using namespace std; -using namespace pugi; -using namespace elem; - -int -main (int argc, char **argv) -{ - string sentenceFilePath, lextorFilePath, localeId, transferFilePath, - transferOutFilePath, weightFilePath, outputFilePath, bestModFilePath, - randModFilePath; - - if (argc == 10) - { - localeId = argv[1]; - transferFilePath = argv[2]; - sentenceFilePath = argv[3]; - lextorFilePath = argv[4]; - - transferOutFilePath = argv[5]; - weightFilePath = argv[6]; - - outputFilePath = argv[7]; - bestModFilePath = argv[8]; - randModFilePath = argv[9]; - } - else - { +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "../pugixml/pugixml.hpp" +//#include "RuleParser.h" +//#include "RuleExecution.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//#include +// +//using namespace std; +//using namespace pugi; +//using namespace elem; +// +//int +//main (int argc, char **argv) +//{ +// string sentenceFilePath, lextorFilePath, localeId, transferFilePath, +// transferOutFilePath, weightFilePath, outputFilePath, bestModFilePath, +// randModFilePath; +// +// if (argc == 10) +// { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// sentenceFilePath = argv[3]; +// lextorFilePath = argv[4]; +// +// transferOutFilePath = argv[5]; +// weightFilePath = argv[6]; +// +// outputFilePath = argv[7]; +// bestModFilePath = argv[8]; +// randModFilePath = argv[9]; +// } +// else +// { +//// localeId = "es_ES"; +//// transferFilePath = "transferFile.t1x"; +//// sentenceFilePath = "spa-test.txt"; +//// lextorFilePath = "spa-test.lextor"; +//// interInFilePath = "beaminter.out"; +//// modelsDest = "modelstry"; +//// k = "8"; +// +//// localeId = "kk_KZ"; +//// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +//// sentenceFilePath = "sample-sentences.txt"; +//// lextorFilePath = "sample-lextor.txt"; +//// +//// transferOutFilePath = "sample-transfer.txt"; +//// weightFilePath = "sample-weights.txt"; +//// +//// outputFilePath = "outAnalysis.txt"; +//// bestModFilePath = "bestModFile.txt"; +//// randModFilePath = "randModFile.txt"; +// // localeId = "es_ES"; -// transferFilePath = "transferFile.t1x"; -// sentenceFilePath = "spa-test.txt"; -// lextorFilePath = "spa-test.lextor"; -// interInFilePath = "beaminter.out"; -// modelsDest = "modelstry"; -// k = "8"; - -// localeId = "kk_KZ"; -// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; -// sentenceFilePath = "sample-sentences.txt"; -// lextorFilePath = "sample-lextor.txt"; -// -// transferOutFilePath = "sample-transfer.txt"; -// weightFilePath = "sample-weights.txt"; 
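loadYasmetModels keys each model by the file name that beamSearch later reconstructs: the rule ids of one combination joined with '_', the combinations joined with '+', plus a ".model" suffix. A small sketch of that naming scheme, operating on plain rule-id lists (so the dummy node that the real code skips does not appear; names are illustrative):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

// Build the model key for one ambiguity: rule ids inside a combination
// are joined with '_', combinations with '+', then ".model" is appended.
string modelKey (const vector<vector<unsigned> >& combinations)
{
  string rulesNums;
  for (unsigned x = 0; x < combinations.size (); x++)
    {
      for (unsigned y = 0; y < combinations[x].size (); y++)
	{
	  stringstream ss;
	  ss << combinations[x][y];
	  rulesNums += ss.str ();
	  if (y + 1 < combinations[x].size ())
	    rulesNums += "_";
	}
      rulesNums += "+";
    }
  return rulesNums + ".model";
}

int main ()
{
  vector<vector<unsigned> > combs;
  combs.push_back (vector<unsigned> (1, 12)); // first combination: rule 12
  vector<unsigned> second;
  second.push_back (12);
  second.push_back (7);                       // second combination: rules 12 then 7
  combs.push_back (second);
  cout << modelKey (combs) << endl;           // prints "12+12_7+.model"
  return 0;
}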
+// transferFilePath = "transferFile3.t1x"; +// sentenceFilePath = "spa-toknizer.txt"; +// lextorFilePath = "spa-lextor.txt"; +// +// transferOutFilePath = "spa-transfer.txt"; +// weightFilePath = "spa-weight.txt"; // // outputFilePath = "outAnalysis.txt"; // bestModFilePath = "bestModFile.txt"; // randModFilePath = "randModFile.txt"; - - localeId = "es_ES"; - transferFilePath = "transferFile3.t1x"; - sentenceFilePath = "spa-toknizer.txt"; - lextorFilePath = "spa-lextor.txt"; - - transferOutFilePath = "spa-transfer.txt"; - weightFilePath = "spa-weight.txt"; - - outputFilePath = "outAnalysis.txt"; - bestModFilePath = "bestModFile.txt"; - randModFilePath = "randModFile.txt"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath weightOutFilePath outputFilePath bestModFilePath randModFilePath" - << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "sentenceFilePath : Source language sentences file." << endl; - cout << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "transferOutFilePath : Output file of apertium transfer for the source language sentences." - << endl; - cout - << "weightOutFilePath : Language model weights file for the source language sentences." - << endl; - cout - << "outputFilePath : First output file name of this program which is the complete analysis for the source language sentences." - << endl; - cout - << "bestModFilePath : Second output file name which is the best (language model) translations for the source language sentences." - << endl; - cout - << "randModFilePath : Third output file name which is random translations from (language model) for the source language sentences." 
- << endl; - return -1; - } - - // seed for randomness - srand (time (NULL)); - - ifstream lextorFile (lextorFilePath.c_str ()); - ifstream inSentenceFile (sentenceFilePath.c_str ()); - if (lextorFile.is_open () && inSentenceFile.is_open ()) - { - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); - - if (string (result.description ()) != "No error") - { - cout << "ERROR : " << result.description () << endl; - return -1; - } - - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child ("transfer"); - - vector sourceSentences, tokenizedSentences; - - string tokenizedSentence; - while (getline (lextorFile, tokenizedSentence)) - { - string sourceSentence; - if (!getline (inSentenceFile, sourceSentence)) - sourceSentence = "No more sentences"; - - sourceSentences.push_back (sourceSentence); - tokenizedSentences.push_back (tokenizedSentence); - } - lextorFile.close (); - inSentenceFile.close (); - - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); - - // empty output files - ofstream outputFile (outputFilePath.c_str ()); - outputFile.close (); - ofstream bestModFile (bestModFilePath.c_str ()); - bestModFile.close (); - ofstream randModFile (randModFilePath.c_str ()); - randModFile.close (); - - ifstream weightFile (weightFilePath.c_str ()); - ifstream transferOutFile (transferOutFilePath.c_str ()); - - if (weightFile.is_open () && transferOutFile.is_open ()) - for (unsigned i = 0; i < sourceSentences.size (); i++) - { - cout << i << endl; - - string sourceSentence, tokenizedSentence; - sourceSentence = sourceSentences[i]; - tokenizedSentence = tokenizedSentences[i]; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, - &spaces, tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, - tlTokens, tlTags, rulesApplied, attrs, lists, &vars, - spaces, localeId); - - // final outputs - vector normOuts; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // rules combinations - vector > normCombNodes; - - nodesPool = RuleExecution::getNodesPool (tokenRules); - - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - - RuleExecution::getOuts (&normOuts, &normCombNodes, ambigInfo, nodesPool, - ruleOutputs, spaces); - - // read weights - string line; - vector normWeights; - for (unsigned j = 0; j < normOuts.size (); j++) - { - getline (weightFile, line); - float weight = strtof (line.c_str (), NULL); - normWeights.push_back (weight); - 
} - - // read transfer - vector normTransfers; - for (unsigned j = 0; j < normOuts.size (); j++) - { - getline (transferOutFile, line); - normTransfers.push_back (line); - } - - // remove redundant outputs - vector outs; - vector > combNodes; - vector weights; - vector transfers; - for (unsigned j = 0; j < normOuts.size (); j++) - if (find (outs.begin (), outs.end (), normOuts[j]) == outs.end ()) - { - outs.push_back (normOuts[j]); - combNodes.push_back (normCombNodes[j]); - weights.push_back (normWeights[j]); - transfers.push_back (normTransfers[j]); - } - normOuts = outs; - normCombNodes = combNodes; - normWeights = weights; - normTransfers = transfers; - - // normalize weights - RuleExecution::normaliseWeights (&normWeights); - - // write normal outputs - ofstream outputFile (outputFilePath.c_str (), ofstream::app); - if (outputFile.is_open ()) - { - outputFile << "Analysis of sentence : " << endl; - outputFile << sourceSentence << endl << endl << endl; - - outputFile << endl; - outputFile << "sentence id ||| coverage id ||| original sentence |||" - << " lextor ||| rules ||| chunker ||| final sentence ||| score" - << endl << endl; - - for (unsigned j = 0; j < normWeights.size (); j++) - { - // sentence id - outputFile << (i + 1) << " ||| "; - // coverage id - outputFile << (j + 1) << " ||| "; - // original sentence - outputFile << sourceSentence << " ||| "; - // lextor - outputFile << tokenizedSentence << " ||| "; - // rules - for (unsigned k = 0; k < normCombNodes[j].size (); k++) - if (normCombNodes[j][k].ruleId) - outputFile << normCombNodes[j][k].ruleId << " "; - outputFile << "||| "; - // chuncker - outputFile << normOuts[j] << " ||| "; - // final sentence - outputFile << normTransfers[j] << " ||| "; - // score - outputFile << normWeights[j] << endl << endl; - } - - outputFile - << "---------------------------------------------------------------------------------------------------------" - << endl << endl; - - outputFile.close (); - } - - // Model weighting - // best weight - ofstream bestModFile (bestModFilePath.c_str (), ofstream::app); - if (bestModFile.is_open ()) - { - bestModFile - << "---------------------------------------------------------------------------------------------------------" - << endl << endl; - - bestModFile << (i + 1) << endl; - bestModFile << "Source : " << sourceSentence << endl << endl; - - unsigned maxInd = 0; - for (unsigned j = 1; j < normWeights.size (); j++) - { - if (normWeights[j] > normWeights[maxInd]) - maxInd = j; - } - - // final sentence - bestModFile << "Target : " << normTransfers[maxInd] << endl; - // score - bestModFile << "Weight : " << normWeights[maxInd] << endl; - // rules - bestModFile << "Rules : "; - for (unsigned k = 0; k < normCombNodes[maxInd].size (); k++) - if (normCombNodes[maxInd][k].ruleId) - bestModFile << normCombNodes[maxInd][k].ruleId << " "; - - bestModFile << endl - << "---------------------------------------------------------------------------------------------------------" - << endl << endl << endl; - } - bestModFile.close (); - - // Random weight - ofstream randModFile (randModFilePath.c_str (), ofstream::app); - if (randModFile.is_open ()) - { - randModFile << (i + 1) << endl; - randModFile << "Source : " << sourceSentence << endl << endl; - - int random = rand () % normWeights.size (); - - // final sentence - randModFile << "Target : " << normTransfers[random] << endl; - // score - randModFile << "Weight : " << normWeights[random] << endl; - // rules - randModFile << "Rules : "; - for (unsigned k = 0; k < 
normCombNodes[random].size (); k++) - if (normCombNodes[random][k].ruleId) - randModFile << normCombNodes[random][k].ruleId << " "; - - randModFile << endl - << "---------------------------------------------------------------------------------------------------------" - << endl << endl << endl; - } - randModFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - weightFile.close (); - transferOutFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - return 0; -} +// +// cout << "Error in parameters !" << endl; +// cout +// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath weightOutFilePath outputFilePath bestModFilePath randModFilePath" +// << endl; +// cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "sentenceFilePath : Source language sentences file." << endl; +// cout << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout +// << "transferOutFilePath : Output file of apertium transfer for the source language sentences." +// << endl; +// cout +// << "weightOutFilePath : Language model weights file for the source language sentences." +// << endl; +// cout +// << "outputFilePath : First output file name of this program which is the complete analysis for the source language sentences." +// << endl; +// cout +// << "bestModFilePath : Second output file name which is the best (language model) translations for the source language sentences." +// << endl; +// cout +// << "randModFilePath : Third output file name which is random translations from (language model) for the source language sentences." 
+// << endl; +// return -1; +// } +// +// // seed for randomness +// srand (time (NULL)); +// +// ifstream lextorFile (lextorFilePath.c_str ()); +// ifstream inSentenceFile (sentenceFilePath.c_str ()); +// if (lextorFile.is_open () && inSentenceFile.is_open ()) +// { +// // load transfer file in an xml document object +// xml_document transferDoc; +// xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); +// +// if (string (result.description ()) != "No error") +// { +// cout << "ERROR : " << result.description () << endl; +// return -1; +// } +// +// // xml node of the parent node (transfer) in the transfer file +// xml_node transfer = transferDoc.child ("transfer"); +// +// vector sourceSentences, tokenizedSentences; +// +// string tokenizedSentence; +// while (getline (lextorFile, tokenizedSentence)) +// { +// string sourceSentence; +// if (!getline (inSentenceFile, sourceSentence)) +// sourceSentence = "No more sentences"; +// +// sourceSentences.push_back (sourceSentence); +// tokenizedSentences.push_back (tokenizedSentence); +// } +// lextorFile.close (); +// inSentenceFile.close (); +// +// map > > attrs = RuleParser::getAttrs (transfer); +// map vars = RuleParser::getVars (transfer); +// map > lists = RuleParser::getLists (transfer); +// +// // empty output files +// ofstream outputFile (outputFilePath.c_str ()); +// outputFile.close (); +// ofstream bestModFile (bestModFilePath.c_str ()); +// bestModFile.close (); +// ofstream randModFile (randModFilePath.c_str ()); +// randModFile.close (); +// +// ifstream weightFile (weightFilePath.c_str ()); +// ifstream transferOutFile (transferOutFilePath.c_str ()); +// +// if (weightFile.is_open () && transferOutFile.is_open ()) +// for (unsigned i = 0; i < sourceSentences.size (); i++) +// { +// cout << i << endl; +// +// string sourceSentence, tokenizedSentence; +// sourceSentence = sourceSentences[i]; +// tokenizedSentence = tokenizedSentences[i]; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, +// &spaces, tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, +// tlTokens, tlTags, rulesApplied, attrs, lists, &vars, +// spaces, localeId); +// +// // final outputs +// vector normOuts; +// // number of generated combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // rules combinations +// vector > normCombNodes; +// +// nodesPool = RuleExecution::getNodesPool (tokenRules); +// +// RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); +// +// RuleExecution::getOuts (&normOuts, &normCombNodes, ambigInfo, nodesPool, +// ruleOutputs, 
spaces); +// +// // read weights +// string line; +// vector normWeights; +// for (unsigned j = 0; j < normOuts.size (); j++) +// { +// getline (weightFile, line); +// float weight = strtof (line.c_str (), NULL); +// normWeights.push_back (weight); +// } +// +// // read transfer +// vector normTransfers; +// for (unsigned j = 0; j < normOuts.size (); j++) +// { +// getline (transferOutFile, line); +// normTransfers.push_back (line); +// } +// +// // remove redundant outputs +// vector outs; +// vector > combNodes; +// vector weights; +// vector transfers; +// for (unsigned j = 0; j < normOuts.size (); j++) +// if (find (outs.begin (), outs.end (), normOuts[j]) == outs.end ()) +// { +// outs.push_back (normOuts[j]); +// combNodes.push_back (normCombNodes[j]); +// weights.push_back (normWeights[j]); +// transfers.push_back (normTransfers[j]); +// } +// normOuts = outs; +// normCombNodes = combNodes; +// normWeights = weights; +// normTransfers = transfers; +// +// // normalize weights +// RuleExecution::normaliseWeights (&normWeights); +// +// // write normal outputs +// ofstream outputFile (outputFilePath.c_str (), ofstream::app); +// if (outputFile.is_open ()) +// { +// outputFile << "Analysis of sentence : " << endl; +// outputFile << sourceSentence << endl << endl << endl; +// +// outputFile << endl; +// outputFile << "sentence id ||| coverage id ||| original sentence |||" +// << " lextor ||| rules ||| chunker ||| final sentence ||| score" +// << endl << endl; +// +// for (unsigned j = 0; j < normWeights.size (); j++) +// { +// // sentence id +// outputFile << (i + 1) << " ||| "; +// // coverage id +// outputFile << (j + 1) << " ||| "; +// // original sentence +// outputFile << sourceSentence << " ||| "; +// // lextor +// outputFile << tokenizedSentence << " ||| "; +// // rules +// for (unsigned k = 0; k < normCombNodes[j].size (); k++) +// if (normCombNodes[j][k].ruleId) +// outputFile << normCombNodes[j][k].ruleId << " "; +// outputFile << "||| "; +// // chuncker +// outputFile << normOuts[j] << " ||| "; +// // final sentence +// outputFile << normTransfers[j] << " ||| "; +// // score +// outputFile << normWeights[j] << endl << endl; +// } +// +// outputFile +// << "---------------------------------------------------------------------------------------------------------" +// << endl << endl; +// +// outputFile.close (); +// } +// +// // Model weighting +// // best weight +// ofstream bestModFile (bestModFilePath.c_str (), ofstream::app); +// if (bestModFile.is_open ()) +// { +// bestModFile +// << "---------------------------------------------------------------------------------------------------------" +// << endl << endl; +// +// bestModFile << (i + 1) << endl; +// bestModFile << "Source : " << sourceSentence << endl << endl; +// +// unsigned maxInd = 0; +// for (unsigned j = 1; j < normWeights.size (); j++) +// { +// if (normWeights[j] > normWeights[maxInd]) +// maxInd = j; +// } +// +// // final sentence +// bestModFile << "Target : " << normTransfers[maxInd] << endl; +// // score +// bestModFile << "Weight : " << normWeights[maxInd] << endl; +// // rules +// bestModFile << "Rules : "; +// for (unsigned k = 0; k < normCombNodes[maxInd].size (); k++) +// if (normCombNodes[maxInd][k].ruleId) +// bestModFile << normCombNodes[maxInd][k].ruleId << " "; +// +// bestModFile << endl +// << "---------------------------------------------------------------------------------------------------------" +// << endl << endl << endl; +// } +// bestModFile.close (); +// +// // Random weight +// 
ofstream randModFile (randModFilePath.c_str (), ofstream::app); +// if (randModFile.is_open ()) +// { +// randModFile << (i + 1) << endl; +// randModFile << "Source : " << sourceSentence << endl << endl; +// +// int random = rand () % normWeights.size (); +// +// // final sentence +// randModFile << "Target : " << normTransfers[random] << endl; +// // score +// randModFile << "Weight : " << normWeights[random] << endl; +// // rules +// randModFile << "Rules : "; +// for (unsigned k = 0; k < normCombNodes[random].size (); k++) +// if (normCombNodes[random][k].ruleId) +// randModFile << normCombNodes[random][k].ruleId << " "; +// +// randModFile << endl +// << "---------------------------------------------------------------------------------------------------------" +// << endl << endl << endl; +// } +// randModFile.close (); +// } +// else +// { +// cout << "ERROR in opening files!" << endl; +// } +// weightFile.close (); +// transferOutFile.close (); +// } +// else +// { +// cout << "ERROR in opening files!" << endl; +// } +// return 0; +//} diff --git a/src/RuleExecution.cpp b/src/RuleExecution.cpp index 1caeb92..cc524ef 100644 --- a/src/RuleExecution.cpp +++ b/src/RuleExecution.cpp @@ -18,25 +18,25 @@ using namespace elem; #include "RuleExecution.h" void -putCombination (vector >* combinations, - vector combination) +putCombination (vector >* combinations, + vector combination) { for (unsigned i = 0; i < combinations->size (); i++) (*combinations)[i].insert ((*combinations)[i].end (), combination.begin (), combination.end ()); } -vector > -putCombinations (vector > combinations, - vector > nestedcombinations) +vector > +putCombinations (vector > combinations, + vector > nestedcombinations) { - vector > newcombinations; + vector > newcombinations; for (unsigned i = 0; i < combinations.size (); i++) { for (unsigned j = 0; j < nestedcombinations.size (); j++) { - vector newcombination = vector ( + vector newcombination = vector ( combinations[i]); // +1 to skip dummy node newcombination.insert (newcombination.end (), @@ -116,39 +116,39 @@ putOuts (vector outputs, vector nestedOutputs) //} void -RuleExecution::getOuts (vector* finalOuts, vector >* finalCombNodes, - vector, float> > beamTree, - map > nodesPool, +RuleExecution::getOuts (vector* finalOuts, vector >* finalCombNodes, + vector, float> > beamTree, + map > nodesPool, map > ruleOutputs, vector spaces) { for (unsigned i = 0; i < beamTree.size (); i++) { - map bestNodes; + map bestNodes; for (unsigned j = 0; j < beamTree[i].first.size (); j++) { // unsigned tokId = beamTree[i].first[j].tokenId; // Node node = beamTree[i].first[j]; - bestNodes[beamTree[i].first[j].tokenId] = beamTree[i].first[j]; + bestNodes[beamTree[i].first[j]->tokenId] = beamTree[i].first[j]; // bestNodes.insert (pair (tokId, node)); } - vector nodes; + vector nodes; string out; for (unsigned j = 0; j < nodesPool.size ();) { - Node node; + Node* node; if (bestNodes.count (j)) node = bestNodes[j]; else node = nodesPool[j][0]; - out += ruleOutputs[node.ruleId][node.tokenId] - + spaces[node.tokenId + node.patNum - 1]; + out += ruleOutputs[node->ruleId][node->tokenId] + + spaces[node->tokenId + node->patNum - 1]; nodes.push_back (node); - j += node.patNum; + j += node->patNum; } finalCombNodes->push_back (nodes); @@ -157,43 +157,57 @@ RuleExecution::getOuts (vector* finalOuts, vector >* finalC } void -RuleExecution::getOuts (vector* finalOuts, vector >* finalCombNodes, - vector ambigInfo, - map > nodesPool, +RuleExecution::getOuts (vector* finalOuts, vector >* 
finalCombNodes, + vector ambigInfo, + map > nodesPool, map > ruleOutputs, vector spaces) { - map ambigMap; + map ambigMap; for (unsigned i = 0; i < ambigInfo.size (); i++) { - ambigMap.insert (pair (ambigInfo[i].firTokId, ambigInfo[i])); + ambigMap.insert (pair (ambigInfo[i]->firTokId, ambigInfo[i])); } // cout << ambigInfo.size () << endl; // cout << ambigInfo[0]->combinations.size () << endl; for (unsigned i = 0; (i < ambigInfo.size ()) || (i < 1); i++) { - vector > combNodes; - combNodes.push_back (vector ()); + vector > combNodes; + combNodes.push_back (vector ()); vector outs; outs.push_back (""); for (unsigned j = 0; j < nodesPool.size ();) { - vector nodes = nodesPool[j]; + vector nodes = nodesPool[j]; // cout << "i = " << i << " , curAmbig = " << curAmbig << " , j = " << j << endl; if (nodes.size () > 1 && ambigMap.count (j)) { - vector > combinations = ambigMap[j].combinations; + vector > combinations = ambigMap[j]->combinations; +// cout << endl << endl; +// for (unsigned k = 0; k < combinations.size (); k++) +// { +// vector nodes = combinations[k]; +// for (unsigned l = 1; l < nodes.size (); l++) +// { +// cout << "tok=" << nodes[l].tokenId << "; rul=" << nodes[l].ruleId +// << "; pat=" << nodes[l].patNum << " - "; +// } +// cout << endl; +// } +// cout << endl << endl; // cout << "comNum = " << combinations.size () << " , " // << ambigInfo[i].firTokId << " ;; " // << (combinations.size () > 1 && ambigInfo[i].firTokId == j) << endl; // cout << "size= " << finalOuts->size () << endl; // cout << "size= " << combinations[0].size () << endl; - if (ambigInfo[i].firTokId == j) + if (ambigInfo[i]->firTokId == j) { +// cout << "heeeeeeereereeerer" << endl; + combNodes = putCombinations (combNodes, combinations); vector ambigOuts; @@ -205,9 +219,9 @@ RuleExecution::getOuts (vector* finalOuts, vector >* finalC for (unsigned l = 1; l < combinations[k].size (); l++) { ambigOut += - ruleOutputs[combinations[k][l].ruleId][combinations[k][l].tokenId] - + spaces[combinations[k][l].tokenId - + combinations[k][l].patNum - 1]; + ruleOutputs[combinations[k][l]->ruleId][combinations[k][l]->tokenId] + + spaces[combinations[k][l]->tokenId + + combinations[k][l]->patNum - 1]; // cout << "ambigout : " << ambigOut << endl; // cout << i << " : " << j << " , " << ambigOut << endl; } @@ -220,8 +234,8 @@ RuleExecution::getOuts (vector* finalOuts, vector >* finalC { putCombination ( &combNodes, - vector (combinations[0].begin () + 1, - combinations[0].end ())); + vector (combinations[0].begin () + 1, + combinations[0].end ())); // take the first combination only , while solving the last space issue string ambigOut; // skip the dummy node @@ -229,20 +243,20 @@ RuleExecution::getOuts (vector* finalOuts, vector >* finalC for (; l < combinations[0].size () - 1; l++) { ambigOut += - ruleOutputs[combinations[0][l].ruleId][combinations[0][l].tokenId] - + spaces[combinations[0][l].tokenId - + combinations[0][l].patNum - 1]; + ruleOutputs[combinations[0][l]->ruleId][combinations[0][l]->tokenId] + + spaces[combinations[0][l]->tokenId + + combinations[0][l]->patNum - 1]; // cout << i << " : " << j << " , " << ambigOut << endl; } ambigOut += - ruleOutputs[combinations[0][l].ruleId][combinations[0][l].tokenId]; + ruleOutputs[combinations[0][l]->ruleId][combinations[0][l]->tokenId]; // cout << i << " : " << j << " , " << ambigOut << endl; putOut (&outs, ambigOut, - combinations[0][l].tokenId + combinations[0][l].patNum - 1, + combinations[0][l]->tokenId + combinations[0][l]->patNum - 1, spaces); } - j += 
ambigMap[j].maxPat; + j += ambigMap[j]->maxPat; } // make it else if nodes.size()==1 else @@ -254,16 +268,28 @@ RuleExecution::getOuts (vector* finalOuts, vector >* finalC // << spaces.size () << endl; // cout << i << " : " << j << " , " // << ruleOutputs[nodes[0].ruleId][nodes[0].tokenId] << endl; - putOut (&outs, ruleOutputs[nodes[0].ruleId][nodes[0].tokenId], - nodes[0].tokenId + nodes[0].patNum - 1, spaces); - j += nodes[0].patNum; + putOut (&outs, ruleOutputs[nodes[0]->ruleId][nodes[0]->tokenId], + nodes[0]->tokenId + nodes[0]->patNum - 1, spaces); + j += nodes[0]->patNum; } +// cout << endl << endl; +// for (unsigned k = 0; k < combNodes.size (); k++) +// { +// vector nodes = combNodes[k]; +// for (unsigned l = 0; l < nodes.size (); l++) +// { +// cout << "tok=" << nodes[l].tokenId << "; rul=" << nodes[l].ruleId +// << "; pat=" << nodes[l].patNum << " - "; +// } +// cout << endl; +// } +// cout << endl << endl; } // put only different outputs for (unsigned j = 0; j < outs.size (); j++) { - if ((!ambigInfo.empty () && ambigInfo[i].combinations.size () > 1) + if ((!ambigInfo.empty () && ambigInfo[i]->combinations.size () > 1) || find (finalOuts->begin (), finalOuts->end (), outs[j]) == finalOuts->end ()) { @@ -278,22 +304,24 @@ RuleExecution::getOuts (vector* finalOuts, vector >* finalC } void -RuleExecution::getCombinations (Node root, vector path, - vector >* ambigRules) +RuleExecution::getCombinations (Node* root, vector path, + vector >* ambigRules) { // if (ambigRules->size () >= 100000) // return; path.push_back (root); - for (unsigned i = 0; i < root.neighbors.size (); i++) - getCombinations (root.neighbors[i], path, ambigRules); +// cout << "node: tokId = " << root->tokenId << " , ruleId = " << root->ruleId << endl; + + for (unsigned i = 0; i < root->neighbors.size (); i++) + getCombinations (root->neighbors[i], path, ambigRules); - if (root.neighbors.empty ()) + if (root->neighbors.empty ()) { - // if the rule0 in a combi nation , don't count it + // if the rule0 in a combination , don't count it for (unsigned i = 0; i < path.size (); i++) - if (path[i].ruleId == 0) + if (path[i]->ruleId == 0) return; ambigRules->push_back (path); @@ -352,7 +380,7 @@ RuleExecution::normaliseWeights (vector* weights) void RuleExecution::normaliseWeights (vector >* vweights, - vector >* vambigInfo) + vector >* vambigInfo) { // vector > newvweights; // cout << vambigInfo.size () << " " << vweights->size () << endl; @@ -365,23 +393,23 @@ RuleExecution::normaliseWeights (vector >* vweights, { // get sum of weights of an ambigInfo float sum = 0; - for (unsigned k = 0; k < (*vambigInfo)[i][j].combinations.size (); k++) + for (unsigned k = 0; k < (*vambigInfo)[i][j]->combinations.size (); k++) { sum += (*vweights)[i][weigInd + k]; } // Then normalize it - for (unsigned k = 0; k < (*vambigInfo)[i][j].combinations.size (); k++) + for (unsigned k = 0; k < (*vambigInfo)[i][j]->combinations.size (); k++) { // if sum=0 , to avoid nans we will make them all equal in weights if (sum) (*vweights)[i][weigInd + k] /= sum; else (*vweights)[i][weigInd + k] = 1 - / (*vambigInfo)[i][j].combinations.size (); + / (*vambigInfo)[i][j]->combinations.size (); } // update weighInd - weigInd += (*vambigInfo)[i][j].combinations.size (); + weigInd += (*vambigInfo)[i][j]->combinations.size (); } // newvweights.push_back (weights); } @@ -390,7 +418,7 @@ RuleExecution::normaliseWeights (vector >* vweights, void RuleExecution::normaliseWeights (vector* weights, - vector ambigInfo) + vector ambigInfo) { // vector > newvweights; // 
cout << vambigInfo.size () << " " << vweights->size () << endl; @@ -403,22 +431,22 @@ RuleExecution::normaliseWeights (vector* weights, { // get sum of weights of an ambigInfo float sum = 0; - for (unsigned k = 0; k < ambigInfo[j].combinations.size (); k++) + for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++) { sum += (*weights)[weigInd + k]; } // Then normalize it - for (unsigned k = 0; k < ambigInfo[j].combinations.size (); k++) + for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++) { // if sum=0 , to avoid nans we will make them all equal in weights if (sum) (*weights)[weigInd + k] /= sum; else - (*weights)[weigInd + k] = 1 / ambigInfo[j].combinations.size (); + (*weights)[weigInd + k] = 1 / ambigInfo[j]->combinations.size (); } // update weighInd - weigInd += ambigInfo[j].combinations.size (); + weigInd += ambigInfo[j]->combinations.size (); } // newvweights.push_back (weights); // } @@ -480,8 +508,8 @@ getMaxPat (int curMaxPat, unsigned curToken, void RuleExecution::getAmbigInfo (map > > tokenRules, - map > nodesPool, - vector* ambigInfo, + map > nodesPool, + vector* ambigInfo, unsigned* combNum) { *combNum = 0; @@ -494,24 +522,24 @@ RuleExecution::getAmbigInfo (map > > t // if there is ambiguity if (nodesPool[tokId].size () > 1) { - AmbigInfo ambig = AmbigInfo (tokId, maxPat); + AmbigInfo* ambig = new AmbigInfo (tokId, maxPat); - Node dummy = ambiguousGraph (tokenRules, nodesPool, tokId, maxPat); - getCombinations (dummy, vector (), &ambig.combinations); + Node* dummy = ambiguousGraph (tokenRules, nodesPool, tokId, maxPat); + getCombinations (dummy, vector (), &ambig->combinations); - if (!ambig.combinations.empty ()) + if (!ambig->combinations.empty ()) ambigInfo->push_back (ambig); - *combNum += ambig.combinations.size (); + *combNum += ambig->combinations.size (); } tokId += maxPat; } } -map > +map > RuleExecution::getNodesPool (map > > tokenRules) { - map > nodesPool; + map > nodesPool; for (map > >::iterator it = tokenRules.begin (); it != tokenRules.end (); it++) { @@ -521,62 +549,66 @@ RuleExecution::getNodesPool (map > > t { unsigned ruleId = rules[i].first; unsigned patNum = rules[i].second; - Node node = Node (tokenId, ruleId, patNum); + Node* node = new Node (tokenId, ruleId, patNum); nodesPool[tokenId].push_back (node); } } return nodesPool; } -RuleExecution::Node +RuleExecution::Node* RuleExecution::ambiguousGraph ( map > > tokenRules, - map > nodesPool, unsigned firTok, unsigned maxPat) + map > nodesPool, unsigned firTok, unsigned maxPat) { for (unsigned i = firTok; i < firTok + maxPat; i++) { - vector nodes = nodesPool[i]; +// cout << "node id = " << i << endl; + vector nodes = nodesPool[i]; for (unsigned j = 0; j < nodes.size (); j++) { - Node node = nodes[j]; + Node* node = nodes[j]; // last nodes will point to nothing - if (node.tokenId + node.patNum < firTok + maxPat) - node.neighbors = nodesPool[node.tokenId + node.patNum]; + if (node->tokenId + node->patNum < firTok + maxPat) + { + node->neighbors = nodesPool[node->tokenId + node->patNum]; +// cout << "here " << node->tokenId << endl; + } - nodes[j] = node; +// nodes[j] = node; } - nodesPool[i] = nodes; +// nodesPool[i] = nodes; } // root(dummy) node points to the first token node/s - Node root = Node (-1, -1, -1); - root.neighbors = nodesPool[firTok]; + Node* root = new Node (-1, -1, -1); + root->neighbors = nodesPool[firTok]; return root; } -RuleExecution::Node +RuleExecution::Node* RuleExecution::ambiguousGraph ( map > > tokenRules, - map > nodesPool) + map > nodesPool) { for 
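Both normaliseWeights overloads divide each ambiguity block by its sum and fall back to equal weights when the sum is zero. A compact sketch of that step; note that the fallback needs a floating-point division (1.0f / count), since an integer expression such as 1 / combinations.size() truncates to zero whenever there is more than one combination:

#include <iostream>
#include <vector>
using namespace std;

// Normalize one ambiguity block in place: divide by the sum, or fall
// back to a uniform distribution when the sum is zero (avoids NaNs).
void normaliseBlock (vector<float>* weights, unsigned first, unsigned count)
{
  float sum = 0;
  for (unsigned k = 0; k < count; k++)
    sum += (*weights)[first + k];
  for (unsigned k = 0; k < count; k++)
    (*weights)[first + k] = sum ? (*weights)[first + k] / sum : 1.0f / count;
}

int main ()
{
  vector<float> w;
  w.push_back (2); w.push_back (1); w.push_back (1);
  normaliseBlock (&w, 0, 3); // -> 0.5 0.25 0.25
  for (unsigned i = 0; i < w.size (); i++)
    cout << w[i] << " ";
  cout << endl;
  return 0;
}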
(unsigned i = 0; i < nodesPool.size (); i++) { - vector nodes = nodesPool[i]; + vector nodes = nodesPool[i]; for (unsigned j = 0; j < nodes.size (); j++) { - Node node = nodes[j]; + Node* node = nodes[j]; // last nodes will point to not existent nodes - if (nodesPool.count (node.tokenId + node.patNum)) - node.neighbors = nodesPool[node.tokenId + node.patNum]; + if (nodesPool.count (node->tokenId + node->patNum)) + node->neighbors = nodesPool[node->tokenId + node->patNum]; - nodes[j] = node; +// nodes[j] = node; } - nodesPool[i] = nodes; +// nodesPool[i] = nodes; } // root(dummy) node points to the first token node/s - Node root = Node (-1, -1, -1); - root.neighbors = nodesPool[0]; + Node* root = new Node (-1, -1, -1); + root->neighbors = nodesPool[0]; return root; } diff --git a/src/RuleExecution.h b/src/RuleExecution.h index 76529d4..29d2034 100644 --- a/src/RuleExecution.h +++ b/src/RuleExecution.h @@ -22,7 +22,7 @@ public: unsigned tokenId; unsigned ruleId; unsigned patNum; - vector neighbors; + vector neighbors; Node (unsigned tokenId, unsigned ruleId, unsigned patNum) { this->tokenId = tokenId; @@ -42,7 +42,7 @@ public: public: unsigned firTokId; unsigned maxPat; - vector > combinations; + vector > combinations; AmbigInfo (unsigned firTokId, unsigned maxPat) { this->firTokId = firTokId; @@ -63,24 +63,24 @@ public: static void normaliseWeights (vector >* vweights, - vector >* vambigInfo); + vector >* vambigInfo); static void - normaliseWeights (vector* weights, vector ambigInfo); + normaliseWeights (vector* weights, vector ambigInfo); static void - getOuts (vector* finalOuts, vector >* finalCombNodes, - vector, float> > beamTree, - map > nodesPool, + getOuts (vector* finalOuts, vector >* finalCombNodes, + vector, float> > beamTree, + map > nodesPool, map > ruleOutputs, vector spaces); static void - getOuts (vector* finalOuts, vector >* combNodes, - vector ambigInfo, - map > nodesPool, + getOuts (vector* finalOuts, vector >* finalCombNodes, + vector ambigInfo, + map > nodesPool, map > ruleOutputs, vector spaces); - static map > + static map > getNodesPool (map > > tokenRules); static string @@ -91,21 +91,21 @@ public: // vector > >* ambigRules); static void - getCombinations (Node root, vector path, vector >* ambigRules); + getCombinations (Node* root, vector path, vector >* ambigRules); - static Node + static Node* ambiguousGraph (map > > tokenRules, - map > nodesPool, unsigned firTok, + map > nodesPool, unsigned firTok, unsigned maxPat); - static Node + static Node* ambiguousGraph (map > > tokenRules, - map > nodesPool); + map > nodesPool); static void getAmbigInfo (map > > tokenRules, - map > nodesPool, - vector* ambigInfo, unsigned* combNum); + map > nodesPool, + vector* ambigInfo, unsigned* combNum); // static void // getAmbigInfo ( diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp index 3035cbe..056a2c3 100644 --- a/src/RulesApplier.cpp +++ b/src/RulesApplier.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -50,9 +51,10 @@ main (int argc, char **argv) // interInFilePath = "sample-inter.txt"; localeId = "es_ES"; - transferFilePath = "./issues/apertium-eng-spa.spa-eng.t1x"; - lextorFilePath = "./issues/lextor.txt"; - interInFilePath = "./issues/interIn.txt"; + transferFilePath = + "/home/aboelhamd/apertium-eng-spa-ambiguous-rules/apertium-eng-spa.spa-eng.t1x"; + lextorFilePath = "/home/aboelhamd/Downloads/es-en/splits/xaa-lextor.txt"; + interInFilePath = "/home/aboelhamd/Downloads/es-en/splits/xaa-chunker.txt"; cout << "Error in 
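The header change above is the heart of this commit: Node and AmbigInfo now hold Node* rather than Node values. With value semantics, ambiguousGraph wired neighbors into copies of the pooled nodes, so the graph that getCombinations walked could diverge from the pool; with pointers, every neighbors list refers to the single pooled object, at the price of the manual deletes added below. A minimal illustration of the difference, using a hypothetical struct trimmed to the relevant field:

#include <iostream>
#include <vector>
using namespace std;

// Pointer version: linking through the pool is visible everywhere
// the node is referenced.
struct Node
{
  int id;
  vector<Node*> neighbors;
  Node (int i) : id (i) {}
};

int main ()
{
  vector<Node*> pool;
  pool.push_back (new Node (0));
  pool.push_back (new Node (1));

  pool[0]->neighbors.push_back (pool[1]);
  cout << pool[0]->neighbors.size () << endl; // 1: the pooled node was updated

  // By-value equivalent of the old code: the copy gets the neighbor,
  // the pooled node does not, and the two silently diverge.
  Node copy = *pool[0];
  copy.neighbors.push_back (pool[1]);
  cout << pool[0]->neighbors.size () << endl; // still 1

  for (unsigned i = 0; i < pool.size (); i++) // manual cleanup, as in the diff
    delete pool[i];
  return 0;
}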
parameters !" << endl; cout @@ -67,11 +69,12 @@ main (int argc, char **argv) cout << "interInFilePath : Output file name of this program which is the input for apertium interchunk." << endl; - return -1; +// return -1; } ifstream lextorFile (lextorFilePath.c_str ()); - if (lextorFile.is_open ()) + ofstream interInFile (interInFilePath.c_str ()); + if (lextorFile.is_open () && interInFile.is_open ()) { // load transfer file in an xml document object xml_document transferDoc; @@ -86,85 +89,165 @@ main (int argc, char **argv) // xml node of the parent node (transfer) in the transfer file xml_node transfer = transferDoc.child ("transfer"); - vector tokenizedSentences; + map > > attrs = RuleParser::getAttrs (transfer); + map vars = RuleParser::getVars (transfer); + map > lists = RuleParser::getLists (transfer); +// unsigned i = 0; string tokenizedSentence; while (getline (lextorFile, tokenizedSentence)) { - tokenizedSentences.push_back (tokenizedSentence); - } - lextorFile.close (); +// cout << i++ << endl; - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); + // spaces after each token + vector spaces; - ofstream interInFile (interInFilePath.c_str ()); - if (interInFile.is_open ()) - for (unsigned i = 0; i < tokenizedSentences.size (); i++) - { -// cout << i << endl; + // tokens in the sentence order + vector slTokens, tlTokens; - string tokenizedSentence; - tokenizedSentence = tokenizedSentences[i]; + // tags of tokens in order + vector > slTags, tlTags; - // spaces after each token - vector spaces; + RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, + tokenizedSentence); - // tokens in the sentence order - vector slTokens, tlTokens; + // map of tokens ids and their matched categories + map > catsApplied; - // tags of tokens in order - vector > slTags, tlTags; + RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, - &spaces, tokenizedSentence); + // map of matched rules and a pair of first token id and patterns number + map > > rulesApplied; - // map of tokens ids and their matched categories - map > catsApplied; + RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); + // rule and (target) token map to specific output + // if rule has many patterns we will choose the first token only + map > ruleOutputs; - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; + // map (target) token to all matched rules ids and the number of pattern items of each rule + map > > tokenRules; - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); + RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, + tlTags, rulesApplied, attrs, lists, &vars, spaces, + localeId); + // final outs + vector outs; + // number of possible combinations + unsigned compNum; + // nodes for every token and rule + map > nodesPool; + // ambiguous informations + vector ambigInfo; - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; + // rules combinations + vector > combNodes; - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; + nodesPool = RuleExecution::getNodesPool (tokenRules); - RuleExecution::ruleOuts 
(&ruleOutputs, &tokenRules, slTokens, slTags, - tlTokens, tlTags, rulesApplied, attrs, lists, &vars, - spaces, localeId); - // final outs - vector outs; - // number of possible combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // rules combinations - vector > combNodes; + RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); + RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, + spaces); - nodesPool = RuleExecution::getNodesPool (tokenRules); +// for (unsigned j = 0; j < tlTokens.size (); j++) +// { +// cout << tlTokens[j] << endl; +// vector > rulees = tokenRules[j]; +// for (unsigned k = 0; k < rulees.size (); k++) +// { +// cout << rulees[k].first << " , " << rulees[k].second << endl; +// } +// cout << endl; +// } +// +// for (unsigned j = 0; j < ambigInfo.size (); j++) +// { +// cout << "firTokId = " << ambigInfo[j]->firTokId << "; maxPat = " +// << ambigInfo[j]->maxPat << endl; +// vector > combinations = +// ambigInfo[j]->combinations; +// cout << endl; +// for (unsigned k = 0; k < combinations.size (); k++) +// { +// vector nodes = combinations[k]; +// for (unsigned l = 1; l < nodes.size (); l++) +// { +// cout << "tok=" << nodes[l]->tokenId << "; rul=" << nodes[l]->ruleId +// << "; pat=" << nodes[l]->patNum << " - "; +// } +// cout << endl; +// } +// cout << endl; +// } +// +// for (map >::iterator it = ruleOutputs.begin (); +// it != ruleOutputs.end (); it++) +// { +// cout << "ruleId=" << it->first << endl; +// map outs = it->second; +// +// for (map::iterator it2 = outs.begin (); +// it2 != outs.end (); it2++) +// { +// cout << "tokId=" << it2->first << " , out = " << it2->second << endl; +// } +// cout << endl; +// } +// cout << endl; +// +// for (unsigned j = 0; j < tlTokens.size (); j++) +// { +// vector nodes = nodesPool[j]; +// cout << "tokId = " << j << " : " << tlTokens[j] << endl; +// for (unsigned k = 0; k < nodes.size (); k++) +// { +// cout << "ruleId = " << nodes[k]->ruleId << "; patNum = " +// << nodes[k]->patNum << endl; +// } +// cout << endl; +// } +// +// for (unsigned j = 0; j < combNodes.size (); j++) +// { +// vector nodes = combNodes[j]; +// for (unsigned k = 0; k < nodes.size (); k++) +// { +// cout << "tok=" << nodes[k]->tokenId << "; rul=" << nodes[k]->ruleId +// << "; pat=" << nodes[k]->patNum << " - "; +// } +// cout << endl; +// } - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, - spaces); + // write the outs + for (unsigned j = 0; j < outs.size (); j++) + interInFile << outs[j] << endl; - // write the outs - for (unsigned j = 0; j < outs.size (); j++) - interInFile << outs[j] << endl; - } - else - cout << "ERROR in opening files!" 
<< endl; - interInFile.close (); + // delete AmbigInfo pointers + for (unsigned j = 0; j < ambigInfo.size (); j++) + { + // delete the dummy node pointers + set dummies; + for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++) + dummies.insert (ambigInfo[j]->combinations[k][0]); + for (set::iterator it = dummies.begin (); + it != dummies.end (); it++) + delete (*it); + + delete ambigInfo[j]; + } + // delete Node pointers + for (map >::iterator it = + nodesPool.begin (); it != nodesPool.end (); it++) + { + for (unsigned j = 0; j < it->second.size (); j++) + { + delete it->second[j]; + } + } + } + lextorFile.close (); + interInFile.close (); cout << "RulesApplier finished!"; } else diff --git a/src/YasmetFormatter.cpp b/src/YasmetFormatter.cpp index 91451e2..2ff0461 100644 --- a/src/YasmetFormatter.cpp +++ b/src/YasmetFormatter.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -73,11 +74,12 @@ main (int argc, char **argv) } ifstream lextorFile (lextorFilePath.c_str ()); - if (lextorFile.is_open ()) + ifstream weightOutFile (weightOutFilePath.c_str ()); + if (lextorFile.is_open () && weightOutFile.is_open ()) { // load transfer file in an xml document object - xml_document* transferDoc = new xml_document (); - xml_parse_result result = transferDoc->load_file (transferFilePath.c_str ()); + xml_document transferDoc; + xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); if (string (result.description ()) != "No error") { @@ -86,188 +88,198 @@ main (int argc, char **argv) } // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc->child ("transfer"); - - vector *tokenizedSentences = new vector (); - - string tokenizedSentence; - while (getline (lextorFile, tokenizedSentence)) - { - tokenizedSentences->push_back (tokenizedSentence); - } - lextorFile.close (); + xml_node transfer = transferDoc.child ("transfer"); map > > attrs = RuleParser::getAttrs (transfer); map vars = RuleParser::getVars (transfer); map > lists = RuleParser::getLists (transfer); - ifstream weightOutFile (weightOutFilePath.c_str ()); - if (weightOutFile.is_open ()) - for (unsigned i = 0; i < tokenizedSentences->size (); i++) - { -// cout << i << endl; - - string sourceSentence, tokenizedSentence; - tokenizedSentence = (*tokenizedSentences)[i]; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, - &spaces, tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, - tlTokens, tlTags, rulesApplied, attrs, lists, &vars, - spaces, localeId); - - // final outs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector 
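Several combinations share the same dummy root node, so the cleanup above first collects the pointers into a set and then deletes each element once, avoiding a double free; the pooled nodes are freed separately afterwards. A reduced sketch of that ownership pattern (illustrative struct):

#include <set>
#include <vector>
using namespace std;

struct Node
{
  int id;
  Node (int i) : id (i) {}
};

int main ()
{
  // Three combinations that all start with the same dummy root.
  Node* dummy = new Node (-1);
  vector<vector<Node*> > combinations (3, vector<Node*> (1, dummy));

  // Deduplicate through a set so the shared dummy is deleted only once.
  set<Node*> dummies;
  for (unsigned k = 0; k < combinations.size (); k++)
    dummies.insert (combinations[k][0]);
  for (set<Node*>::iterator it = dummies.begin (); it != dummies.end (); it++)
    delete *it;
  return 0;
}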
ambigInfo; - // rules combinations - vector > combNodes; - - nodesPool = RuleExecution::getNodesPool (tokenRules); - - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - - RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, - spaces); - - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size (); j++) - if (ambigInfo[j].combinations.size () > 1) - newAmbigInfo.push_back (ambigInfo[j]); - ambigInfo = newAmbigInfo; - - // read weights - string line; - vector weights; - weights.reserve (1000); - for (unsigned j = 0; j < outs.size (); j++) - { - getline (weightOutFile, line); - float weight = strtof (line.c_str (), NULL); - weights.push_back (weight); - } - - RuleExecution::normaliseWeights (&weights, ambigInfo); - - // Yasmet format preparing - // make a directory if not found - mkdir (datasetsPath.c_str (), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - - unsigned weigInd = 0; - for (unsigned i = 0; i < ambigInfo.size (); i++) - { - RuleExecution::AmbigInfo ambig = ambigInfo[i]; - - // name of the file is the concatenation of rules ids - string rulesNums; - for (unsigned x = 0; x < ambig.combinations.size (); x++) - { - // avoid dummy node - for (unsigned y = 1; y < ambig.combinations[x].size (); y++) - { - stringstream ss; + string tokenizedSentence; + while (getline (lextorFile, tokenizedSentence)) + { + // cout << i << endl; + + // spaces after each token + vector spaces; + + // tokens in the sentence order + vector slTokens, tlTokens; + + // tags of tokens in order + vector > slTags, tlTags; + + RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, + tokenizedSentence); + + // map of tokens ids and their matched categories + map > catsApplied; + + RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); + + // map of matched rules and a pair of first token id and patterns number + map > > rulesApplied; + + RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); + + // rule and (target) token map to specific output + // if rule has many patterns we will choose the first token only + map > ruleOutputs; + + // map (target) token to all matched rules ids and the number of pattern items of each rule + map > > tokenRules; + + RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, + tlTags, rulesApplied, attrs, lists, &vars, spaces, + localeId); + + // final outs + vector outs; + // number of generated combinations + unsigned compNum; + // nodes for every token and rule + map > nodesPool; + // ambiguous informations + vector ambigInfo; + // rules combinations + vector > combNodes; + + nodesPool = RuleExecution::getNodesPool (tokenRules); + + RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); + + RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, + spaces); + + vector newAmbigInfo; + for (unsigned j = 0; j < ambigInfo.size (); j++) + if (ambigInfo[j]->combinations.size () > 1) + newAmbigInfo.push_back (ambigInfo[j]); + ambigInfo = newAmbigInfo; + + // read weights + string line; + vector weights; + for (unsigned j = 0; j < outs.size (); j++) + { + getline (weightOutFile, line); + float weight = strtof (line.c_str (), NULL); + weights.push_back (weight); + } + + RuleExecution::normaliseWeights (&weights, ambigInfo); + + // Yasmet format preparing + // make a directory if not found + mkdir (datasetsPath.c_str (), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); + + unsigned weigInd = 0; + for (unsigned i = 0; i < 
@@ -86,188 +88,198 @@ main (int argc, char **argv)
         }
 
       // xml node of the parent node (transfer) in the transfer file
-      xml_node transfer = transferDoc->child ("transfer");
-
-      vector<string> *tokenizedSentences = new vector<string> ();
-
-      string tokenizedSentence;
-      while (getline (lextorFile, tokenizedSentence))
-        {
-          tokenizedSentences->push_back (tokenizedSentence);
-        }
-      lextorFile.close ();
+      xml_node transfer = transferDoc.child ("transfer");
 
       map<string, vector<vector<string> > > attrs = RuleParser::getAttrs (transfer);
       map<string, string> vars = RuleParser::getVars (transfer);
       map<string, vector<string> > lists = RuleParser::getLists (transfer);
 
-      ifstream weightOutFile (weightOutFilePath.c_str ());
-      if (weightOutFile.is_open ())
-        for (unsigned i = 0; i < tokenizedSentences->size (); i++)
-          {
-//          cout << i << endl;
-
-            string sourceSentence, tokenizedSentence;
-            tokenizedSentence = (*tokenizedSentences)[i];
-
-            // spaces after each token
-            vector<string> spaces;
-
-            // tokens in the sentence order
-            vector<string> slTokens, tlTokens;
-
-            // tags of tokens in order
-            vector<vector<string> > slTags, tlTags;
-
-            RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags,
-                                           &spaces, tokenizedSentence);
-
-            // map of tokens ids and their matched categories
-            map<unsigned, vector<string> > catsApplied;
-
-            RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer);
-
-            // map of matched rules and a pair of first token id and patterns number
-            map<xml_node, vector<pair<unsigned, unsigned> > > rulesApplied;
-
-            RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer);
-
-            // rule and (target) token map to specific output
-            // if rule has many patterns we will choose the first token only
-            map<unsigned, map<unsigned, string> > ruleOutputs;
-
-            // map (target) token to all matched rules ids and the number of pattern items of each rule
-            map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules;
-
-            RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags,
-                                     tlTokens, tlTags, rulesApplied, attrs, lists, &vars,
-                                     spaces, localeId);
-
-            // final outs
-            vector<string> outs;
-            // number of generated combinations
-            unsigned compNum;
-            // nodes for every token and rule
-            map<unsigned, vector<RuleExecution::Node> > nodesPool;
-            // ambiguous informations
-            vector<RuleExecution::AmbigInfo> ambigInfo;
-            // rules combinations
-            vector<vector<RuleExecution::Node> > combNodes;
-
-            nodesPool = RuleExecution::getNodesPool (tokenRules);
-
-            RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum);
-
-            RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs,
-                                    spaces);
-
-            vector<RuleExecution::AmbigInfo> newAmbigInfo;
-            for (unsigned j = 0; j < ambigInfo.size (); j++)
-              if (ambigInfo[j].combinations.size () > 1)
-                newAmbigInfo.push_back (ambigInfo[j]);
-            ambigInfo = newAmbigInfo;
-
-            // read weights
-            string line;
-            vector<float> weights;
-            weights.reserve (1000);
-            for (unsigned j = 0; j < outs.size (); j++)
-              {
-                getline (weightOutFile, line);
-                float weight = strtof (line.c_str (), NULL);
-                weights.push_back (weight);
-              }
-
-            RuleExecution::normaliseWeights (&weights, ambigInfo);
-
-            // Yasmet format preparing
-            // make a directory if not found
-            mkdir (datasetsPath.c_str (), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
-
-            unsigned weigInd = 0;
-            for (unsigned i = 0; i < ambigInfo.size (); i++)
-              {
-                RuleExecution::AmbigInfo ambig = ambigInfo[i];
-
-                // name of the file is the concatenation of rules ids
-                string rulesNums;
-                for (unsigned x = 0; x < ambig.combinations.size (); x++)
-                  {
-                    // avoid dummy node
-                    for (unsigned y = 1; y < ambig.combinations[x].size (); y++)
-                      {
-                        stringstream ss;
+      string tokenizedSentence;
+      while (getline (lextorFile, tokenizedSentence))
+        {
+          // cout << i << endl;
+
+          // spaces after each token
+          vector<string> spaces;
+
+          // tokens in the sentence order
+          vector<string> slTokens, tlTokens;
+
+          // tags of tokens in order
+          vector<vector<string> > slTags, tlTags;
+
+          RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces,
+                                         tokenizedSentence);
+
+          // map of tokens ids and their matched categories
+          map<unsigned, vector<string> > catsApplied;
+
+          RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer);
+
+          // map of matched rules and a pair of first token id and patterns number
+          map<xml_node, vector<pair<unsigned, unsigned> > > rulesApplied;
+
+          RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer);
+
+          // rule and (target) token map to specific output
+          // if rule has many patterns we will choose the first token only
+          map<unsigned, map<unsigned, string> > ruleOutputs;
+
+          // map (target) token to all matched rules ids and the number of pattern items of each rule
+          map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules;
+
+          RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens,
+                                   tlTags, rulesApplied, attrs, lists, &vars, spaces,
+                                   localeId);
+
+          // final outs
+          vector<string> outs;
+          // number of generated combinations
+          unsigned compNum;
+          // nodes for every token and rule
+          map<unsigned, vector<RuleExecution::Node*> > nodesPool;
+          // ambiguous information
+          vector<RuleExecution::AmbigInfo*> ambigInfo;
+          // rules combinations
+          vector<vector<RuleExecution::Node*> > combNodes;
+
+          nodesPool = RuleExecution::getNodesPool (tokenRules);
+
+          RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum);
+
+          RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs,
+                                  spaces);
+
+          vector<RuleExecution::AmbigInfo*> newAmbigInfo;
+          for (unsigned j = 0; j < ambigInfo.size (); j++)
+            if (ambigInfo[j]->combinations.size () > 1)
+              newAmbigInfo.push_back (ambigInfo[j]);
+          ambigInfo = newAmbigInfo;
+
+          // read weights
+          string line;
+          vector<float> weights;
+          for (unsigned j = 0; j < outs.size (); j++)
+            {
+              getline (weightOutFile, line);
+              float weight = strtof (line.c_str (), NULL);
+              weights.push_back (weight);
+            }
+
+          RuleExecution::normaliseWeights (&weights, ambigInfo);
+
+          // Yasmet format preparing
+          // make a directory if not found
+          mkdir (datasetsPath.c_str (), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+
+          unsigned weigInd = 0;
+          for (unsigned i = 0; i < ambigInfo.size (); i++)
+            {
+              RuleExecution::AmbigInfo* ambig = ambigInfo[i];
+
+              // name of the file is the concatenation of rules ids
+              string rulesNums;
+              for (unsigned x = 0; x < ambig->combinations.size (); x++)
+                {
+                  // avoid dummy node
+                  for (unsigned y = 1; y < ambig->combinations[x].size (); y++)
+                    {
+                      stringstream ss;
 //                    ss->clear ();
-                        ss << ambig.combinations[x][y].ruleId;
-                        rulesNums += ss.str ();
-
-                        if (y + 1 < ambig.combinations[x].size ())
-                          rulesNums += "_";
-                      }
-                    rulesNums += "+";
-                  }
-
-                // if it's the first time to open , put the number of classes
-                bool firstTime = true;
-                if (FILE *file = fopen (
-                    (datasetsPath + string ("/") + rulesNums).c_str (), "r"))
-                  {
-                    firstTime = false;
-                    fclose (file);
-                  }
+                      ss << ambig->combinations[x][y]->ruleId;
+                      rulesNums += ss.str ();
+
+                      if (y + 1 < ambig->combinations[x].size ())
+                        rulesNums += "_";
+                    }
+                  rulesNums += "+";
+                }
+
+              // if it's the first time to open, put the number of classes
+              bool firstTime = true;
+              if (FILE *file = fopen ((datasetsPath + string ("/") + rulesNums).c_str (),
+                                      "r"))
+                {
+                  firstTime = false;
+                  fclose (file);
+                }
 //            stringstream* dataset = new stringstream ();
-                ofstream dataset ((datasetsPath + string ("/") + rulesNums).c_str (),
-                                  ofstream::app);
+              ofstream dataset ((datasetsPath + string ("/") + rulesNums).c_str (),
+                                ofstream::app);
 
-                if (firstTime)
-                  dataset << ambig.combinations.size () << endl;
+              if (firstTime)
+                dataset << ambig->combinations.size () << endl;
 
-                for (unsigned x = 0; x < ambig.combinations.size (); x++)
-                  {
+              for (unsigned x = 0; x < ambig->combinations.size (); x++)
+                {
 
-                    dataset << x << " $ ";
+                  dataset << x << " $ ";
 
-                    float weight = weights[x + weigInd];
+                  float weight = weights[x + weigInd];
 
-                    dataset << weight << " #";
+                  dataset << weight << " #";
 
-                    string features;
-                    for (unsigned v = 0; v < ambig.combinations.size (); v++)
-                      {
-                        stringstream ss;
+                  string features;
+                  for (unsigned v = 0; v < ambig->combinations.size (); v++)
+                    {
+                      stringstream ss;
 //                    ss.clear ();
-                        ss << v;
-                        string label = ss.str ();
+                      ss << v;
+                      string label = ss.str ();
 
-                        for (unsigned z = ambig.firTokId;
-                             z < ambig.firTokId + ambig.maxPat; z++)
-                          {
-                            stringstream ss;
+                      for (unsigned z = ambig->firTokId;
+                           z < ambig->firTokId + ambig->maxPat; z++)
+                        {
+                          stringstream ss;
 //                        ss->clear ();
-                            ss << z - ambig.firTokId;
-                            string num = ss.str ();
+                          ss << z - ambig->firTokId;
+                          string num = ss.str ();
 //                        *num = ss->str ();
-                            string word = CLExec::toLowerCase (slTokens[z], localeId);
+                          string word = CLExec::toLowerCase (slTokens[z], localeId);
 
-                            for (unsigned c = 0; c < word.length (); c++)
-                              if (word[c] == ' ')
-                                word.replace (c, 1, "_");
+                          for (unsigned c = 0; c < word.length (); c++)
+                            if (word[c] == ' ')
+                              word.replace (c, 1, "_");
 
-                            features += " " + word + "_" + num + ":" + label;
-                          }
-                        features += " #";
-                      }
-                    dataset << features << endl;
+                          features += " " + word + "_" + num + ":" + label;
+                        }
+                      features += " #";
+                    }
+                  dataset << features << endl;
 //                delete (features);
-                  }
-                weigInd += ambig.combinations.size ();
+                }
+              weigInd += ambig->combinations.size ();
 //            dataset.close ();
-              }
+            }
+
+          // delete AmbigInfo pointers
+          for (unsigned j = 0; j < ambigInfo.size (); j++)
+            {
+              // delete the dummy node pointers
+              set<RuleExecution::Node*> dummies;
+              for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++)
+                dummies.insert (ambigInfo[j]->combinations[k][0]);
+              for (set<RuleExecution::Node*>::iterator it = dummies.begin ();
+                   it != dummies.end (); it++)
+                delete (*it);
+
+              delete ambigInfo[j];
+            }
+          // delete Node pointers
+          for (map<unsigned, vector<RuleExecution::Node*> >::iterator it =
+                 nodesPool.begin (); it != nodesPool.end (); it++)
+            {
+              for (unsigned j = 0; j < it->second.size (); j++)
+                {
+                  delete it->second[j];
+                }
+            }
 //        }
-      }
+        }
+      lextorFile.close ();
       weightOutFile.close ();
-    }
     }
   else
     {
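
For reference, each dataset file written by the loop above follows the Yasmet training format that the stream operations in the + lines produce: the file is named by concatenating the competing rule ids ("_" joins the rules inside a combination, "+" separates combinations, e.g. 2_3+4+), the first line holds the number of classes, and every record lists the class index, its normalised weight after "$", and after "#" one feature block per class in which each source token contributes word_position:label. A hypothetical dataset for an ambiguity with two rule combinations over the tokens "el" and "gato", with assumed weights 0.73 and 0.27, would look like:

2
0 $ 0.73 # el_0:0 gato_1:0 # el_0:1 gato_1:1 #
1 $ 0.27 # el_0:0 gato_1:0 # el_0:1 gato_1:1 #
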