commit 366f92da0ceddf5ce26d6eb2407550440c94c9d3 Author: aboelhamd Date: Sat Apr 6 00:07:53 2019 +0200 Refactoring for apertium-transfer integration files is finished. diff --git a/src/BeamResult.cpp b/src/BeamResult.cpp index 996beaf..507419d 100644 --- a/src/BeamResult.cpp +++ b/src/BeamResult.cpp @@ -1,246 +1,246 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "RuleParser.h" -#include "ambiguous_transfer.h" -#include "TranElemLiterals.h" -#include "CLExec.h" - -#include - -using namespace std; -using namespace elem; - -int main(int argc, char **argv) { - string sentenceFilePath, lextorFilePath, localeId, transferFilePath, - modelsDest, beamSize, transferOutFilePath, beamOutFilePath; - - if (argc == 9) { - localeId = argv[1]; - transferFilePath = argv[2]; - sentenceFilePath = argv[3]; - lextorFilePath = argv[4]; - - transferOutFilePath = argv[5]; - beamOutFilePath = argv[6]; - - modelsDest = argv[7]; - beamSize = argv[8]; - } else { -// localeId = "es_ES"; -// transferFilePath = "transferFile.t1x"; -// sentenceFilePath = "spa-test.txt"; -// lextorFilePath = "spa-test.lextor"; -// interInFilePath = "beaminter.out"; -// modelsDest = "modelstry"; -// k = "8"; - - localeId = "kk_KZ"; - transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; - sentenceFilePath = "src.txt"; - lextorFilePath = "lextor.txt"; - - transferOutFilePath = "beam-transfer.txt"; - beamOutFilePath = "beamOutFile.txt"; - - modelsDest = "./UntitledFolder/models"; - beamSize = "8"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath beamOutFilePath modelsDest beamSize" - << endl; - cout - << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout - << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "sentenceFilePath : Source language sentences file." << endl; - cout - << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "transferOutFilePath : Output file of apertium transfer for the source language sentences." - << endl; - cout - << "beamOutFilePath : Output file name of this program which is the best translations for the language sentences." - << endl; - cout << "modelsDest : Yasmet models destination." << endl; - cout << "beamSize : The size of beam in beam search algorithm." << endl; - return -1; - } - - // seed for randomness - srand(time(NULL)); - - ifstream lextorFile(lextorFilePath.c_str()); - ifstream inSentenceFile(sentenceFilePath.c_str()); - if (lextorFile.is_open() && inSentenceFile.is_open()) { - - xmlDoc* doc = xmlReadFile(transferFilePath.c_str(), NULL, 0); - - if (doc == NULL) { - cerr << "Error: Could not parse file \'" << transferFilePath - << "\'." 
<< endl; - exit(EXIT_FAILURE); - } - - xmlNode* transfer = xmlDocGetRootElement(doc); - - vector sourceSentences, tokenizedSentences; - - string tokenizedSentence; - while (getline(lextorFile, tokenizedSentence)) { - string sourceSentence; - if (!getline(inSentenceFile, sourceSentence)) - sourceSentence = "No more sentences"; - - sourceSentences.push_back(sourceSentence); - tokenizedSentences.push_back(tokenizedSentence); - } - lextorFile.close(); - inSentenceFile.close(); - - map > > attrs = RuleParser::getAttrs( - transfer); - map vars = RuleParser::getVars(transfer); - map > lists = RuleParser::getLists(transfer); - - map > > classesWeights = - CLExec::loadYasmetModels(modelsDest, &localeId); - -// vector > vouts; - - int beam; - stringstream buffer(beamSize); - buffer >> beam; - - // empty the output file - ofstream beamFile(beamOutFilePath.c_str()); - beamFile.close(); - - ifstream transferOutFile(transferOutFilePath.c_str()); - - if (transferOutFile.is_open()) - for (unsigned i = 0; i < sourceSentences.size(); i++) { - cout << i << endl; - - string sourceSentence, tokenizedSentence; - sourceSentence = sourceSentences[i]; - tokenizedSentence = tokenizedSentences[i]; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, - &tlTags, &spaces, tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, - transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); - - // final outputs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // beam tree - vector, float> > beamTree; - // rules combinations - vector > combNodes; - - nodesPool = getNodesPool(tokenRules); - - getAmbigInfo(tokenRules, nodesPool, &ambigInfo, &compNum); - - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size(); j++) - if (ambigInfo[j].combinations.size() > 1) - newAmbigInfo.push_back(ambigInfo[j]); - - CLExec::beamSearch(&beamTree, beam, slTokens, newAmbigInfo, - classesWeights, localeId); - - getOuts(&outs, &combNodes, beamTree, nodesPool, ruleOutputs, - spaces); - - // read transfer - string line; - vector beamTransfers; - for (unsigned j = 0; j < outs.size(); j++) { - getline(transferOutFile, line); - beamTransfers.push_back(line); - } - - // write beam results - ofstream beamFile(beamOutFilePath.c_str(), ofstream::app); - if (beamFile.is_open()) { -// beamFile << "source sentence (" << (i + 1) << ") : " << endl; -// beamFile << sourceSentence << endl << endl; - // just take first best - for (unsigned j = 0; j < /*outs.size ()*/1; j++) { -// beamFile << "target sentence " /*<< (j + 1)*/<< " : " << endl; - beamFile << beamTransfers[j] << endl; -// beamFile << "weight = " << beamTree[j].second << endl; 
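// Only the first entry is written here because CLExec::beamSearch returns
// beamTree sorted by weight, best first. A minimal sketch of that
// sort-and-prune step, with hypothetical names (the real version is the
// sortParameter/sort/erase sequence inside CLExec.cpp further down; the
// payload type here is a stand-in for the node vector used there):

#include <algorithm>
#include <utility>
#include <vector>

typedef std::pair<std::vector<int>, float> Candidate; // payload, weight

static bool byWeightDesc(const Candidate& a, const Candidate& b) {
    return a.second > b.second;
}

// Sort candidates best-first and keep at most `beam` of them.
static void pruneToBeam(std::vector<Candidate>* tree, unsigned beam) {
    std::sort(tree->begin(), tree->end(), byWeightDesc);
    if (tree->size() > beam)
        tree->erase(tree->begin() + beam, tree->end());
}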
-// beamFile << "rules : "; -// for (unsigned k = 0; k < combNodes[j].size (); k++) -// if (combNodes[j][k].ruleId) -// beamFile << combNodes[j][k].ruleId << " "; -// beamFile << endl << endl; -// beamFile -// << "------------------------------------------------------------------" -// << endl << endl; - } - } - beamFile.close(); - } - else { - cout << "ERROR in opening files!" << endl; - } - transferOutFile.close(); - } else { - cout << "ERROR in opening files!" << endl; - } - return 0; -} +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "RuleParser.h" +//#include "ambiguous_transfer.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//#include +// +//using namespace std; +//using namespace elem; +// +//int main(int argc, char **argv) { +// string sentenceFilePath, lextorFilePath, localeId, transferFilePath, +// modelsDest, beamSize, transferOutFilePath, beamOutFilePath; +// +// if (argc == 9) { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// sentenceFilePath = argv[3]; +// lextorFilePath = argv[4]; +// +// transferOutFilePath = argv[5]; +// beamOutFilePath = argv[6]; +// +// modelsDest = argv[7]; +// beamSize = argv[8]; +// } else { +//// localeId = "es_ES"; +//// transferFilePath = "transferFile.t1x"; +//// sentenceFilePath = "spa-test.txt"; +//// lextorFilePath = "spa-test.lextor"; +//// interInFilePath = "beaminter.out"; +//// modelsDest = "modelstry"; +//// k = "8"; +// +// localeId = "kk_KZ"; +// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +// sentenceFilePath = "src.txt"; +// lextorFilePath = "lextor.txt"; +// +// transferOutFilePath = "beam-transfer.txt"; +// beamOutFilePath = "beamOutFile.txt"; +// +// modelsDest = "./UntitledFolder/models"; +// beamSize = "8"; +// +// cout << "Error in parameters !" << endl; +// cout +// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath beamOutFilePath modelsDest beamSize" +// << endl; +// cout +// << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout +// << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "sentenceFilePath : Source language sentences file." << endl; +// cout +// << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout +// << "transferOutFilePath : Output file of apertium transfer for the source language sentences." +// << endl; +// cout +// << "beamOutFilePath : Output file name of this program which is the best translations for the language sentences." +// << endl; +// cout << "modelsDest : Yasmet models destination." << endl; +// cout << "beamSize : The size of beam in beam search algorithm." << endl; +// return -1; +// } +// +// // seed for randomness +// srand(time(NULL)); +// +// ifstream lextorFile(lextorFilePath.c_str()); +// ifstream inSentenceFile(sentenceFilePath.c_str()); +// if (lextorFile.is_open() && inSentenceFile.is_open()) { +// +// xmlDoc* doc = xmlReadFile(transferFilePath.c_str(), NULL, 0); +// +// if (doc == NULL) { +// cerr << "Error: Could not parse file \'" << transferFilePath +// << "\'." 
<< endl; +// exit(EXIT_FAILURE); +// } +// +// xmlNode* transfer = xmlDocGetRootElement(doc); +// +// vector sourceSentences, tokenizedSentences; +// +// string tokenizedSentence; +// while (getline(lextorFile, tokenizedSentence)) { +// string sourceSentence; +// if (!getline(inSentenceFile, sourceSentence)) +// sourceSentence = "No more sentences"; +// +// sourceSentences.push_back(sourceSentence); +// tokenizedSentences.push_back(tokenizedSentence); +// } +// lextorFile.close(); +// inSentenceFile.close(); +// +// map > > attrs = RuleParser::getAttrs( +// transfer); +// map vars = RuleParser::getVars(transfer); +// map > lists = RuleParser::getLists(transfer); +// +// map > > classesWeights = +// CLExec::loadYasmetModels(modelsDest, &localeId); +// +//// vector > vouts; +// +// int beam; +// stringstream buffer(beamSize); +// buffer >> beam; +// +// // empty the output file +// ofstream beamFile(beamOutFilePath.c_str()); +// beamFile.close(); +// +// ifstream transferOutFile(transferOutFilePath.c_str()); +// +// if (transferOutFile.is_open()) +// for (unsigned i = 0; i < sourceSentences.size(); i++) { +// cout << i << endl; +// +// string sourceSentence, tokenizedSentence; +// sourceSentence = sourceSentences[i]; +// tokenizedSentence = tokenizedSentences[i]; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, +// &tlTags, &spaces, tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, +// transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, +// tlTags, rulesApplied, attrs, lists, &vars, spaces, +// localeId); +// +// // final outputs +// vector outs; +// // number of generated combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // beam tree +// vector, float> > beamTree; +// // rules combinations +// vector > combNodes; +// +// nodesPool = getNodesPool(tokenRules); +// +// getAmbigInfo(tokenRules, nodesPool, &ambigInfo, &compNum); +// +// vector newAmbigInfo; +// for (unsigned j = 0; j < ambigInfo.size(); j++) +// if (ambigInfo[j].combinations.size() > 1) +// newAmbigInfo.push_back(ambigInfo[j]); +// +// CLExec::beamSearch(&beamTree, beam, slTokens, newAmbigInfo, +// classesWeights, localeId); +// +// getOuts(&outs, &combNodes, beamTree, nodesPool, ruleOutputs, +// spaces); +// +// // read transfer +// string line; +// vector beamTransfers; +// for (unsigned j = 0; j < outs.size(); j++) { +// getline(transferOutFile, line); +// beamTransfers.push_back(line); +// } +// +// // write beam results +// ofstream beamFile(beamOutFilePath.c_str(), ofstream::app); +// if (beamFile.is_open()) { +//// beamFile << "source sentence (" << (i + 1) << ") : " << endl; +//// beamFile << sourceSentence << endl << endl; 
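// The classesWeights map loaded above by CLExec::loadYasmetModels is keyed
// by Yasmet model name, then by word; each word maps to its per-class
// weights in class order. A minimal sketch of a compatible reader, assuming
// (from the strtok calls in CLExec.cpp below) a first line holding the ICU
// locale id, "file: <model>" lines starting a new model section, and
// "<word> <class>: <weight>" lines otherwise; loadWeights is a hypothetical
// name, not the project's API:

#include <fstream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

std::map<std::string, std::map<std::string, std::vector<float> > >
loadWeights(const std::string& path, std::string* localeId) {
    std::map<std::string, std::map<std::string, std::vector<float> > > weights;
    std::ifstream in(path.c_str());
    std::string line, model;
    if (std::getline(in, line))
        *localeId = line;                        // first line: ICU locale id
    while (std::getline(in, line)) {
        for (size_t i = 0; i < line.size(); i++) // treat ':' like whitespace,
            if (line[i] == ':')                  // mirroring strtok(": ")
                line[i] = ' ';
        std::istringstream ss(line);
        std::string first;
        if (!(ss >> first))
            continue;
        if (first == "file") {                   // new model section
            ss >> model;
            continue;
        }
        std::string klass;                       // class number is unused:
        float w;                                 // weights arrive already sorted
        if (ss >> klass >> w)
            weights[model][first].push_back(w);
    }
    return weights;
}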
+// // just take first best +// for (unsigned j = 0; j < /*outs.size ()*/1; j++) { +//// beamFile << "target sentence " /*<< (j + 1)*/<< " : " << endl; +// beamFile << beamTransfers[j] << endl; +//// beamFile << "weight = " << beamTree[j].second << endl; +//// beamFile << "rules : "; +//// for (unsigned k = 0; k < combNodes[j].size (); k++) +//// if (combNodes[j][k].ruleId) +//// beamFile << combNodes[j][k].ruleId << " "; +//// beamFile << endl << endl; +//// beamFile +//// << "------------------------------------------------------------------" +//// << endl << endl; +// } +// } +// beamFile.close(); +// } +// else { +// cout << "ERROR in opening files!" << endl; +// } +// transferOutFile.close(); +// } else { +// cout << "ERROR in opening files!" << endl; +// } +// return 0; +//} diff --git a/src/BeamSearch.cpp b/src/BeamSearch.cpp deleted file mode 100644 index 8076150..0000000 --- a/src/BeamSearch.cpp +++ /dev/null @@ -1,261 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "RuleParser.h" -#include "TranElemLiterals.h" -#include "CLExec.h" -#include "BeamSearch.h" - -#include -#include "ambiguous_transfer.h" - -using namespace std; -using namespace elem; - -void -BeamSearch::transfer (string transferFilePath, string modelsFileDest, string k, - FILE* lextorFile, FILE* outFile) -{ - - xmlDoc* doc = xmlReadFile (transferFilePath.c_str (), NULL, 0); - - if (doc == NULL) - { - cerr << "Error: Could not parse file \'" << transferFilePath << "\'." << endl; - exit (EXIT_FAILURE); - } - - xmlNode* transfer = xmlDocGetRootElement (doc); - - map > > attrs = RuleParser::getAttrs (transfer); - -// cout << "size = " << attrs.size () << endl; -// for (map > >::iterator it = attrs.begin (); -// it != attrs.end (); it++) -// { -// cout << "attr-item : " << it->first << endl; -// vector > items = it->second; -// for (unsigned i = 0; i < items.size (); i++) -// { -// for (unsigned j = 0; j < items[i].size (); j++) -// { -// cout << items[i][j] << " "; -// } -// cout << endl; -// } -// cout << endl; -// } - - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); - - string localeId; - map > > classesWeights = CLExec::loadYasmetModels ( - modelsFileDest, &localeId); - - int beam; - stringstream buffer (k); - buffer >> beam; - - char buff[10240]; - string tokenizedSentence; - while (fgets (buff, 10240, lextorFile)) - { - tokenizedSentence = buff; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - -// cout << "here1" << endl; - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, - tokenizedSentence); -// cout << "here2" << endl; - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); -// cout << "size = " << catsApplied.size () << endl; -// for (map >::iterator it = catsApplied.begin (); -// it != catsApplied.end (); it++) -// { -// cout << "tokId : " << it->first << endl; -// vector cats = it->second; -// for (unsigned i = 0; i < cats.size (); i++) -// { -// cout << cats[i] << " "; -// } -// cout << endl; -// } - - // cout << "here3" << endl; - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - -// cout << 
"size = " << rulesApplied.size () << endl; -// for (map > >::iterator it = -// rulesApplied.begin (); it != rulesApplied.end (); it++) -// { -// cout << "ruleId : " << xml_operations::getAttValUnsg (it->first, string ("id")) -// << endl; -//// vector cats = it->second; -//// for (unsigned i = 0; i < cats.size (); i++) -//// { -//// cout << cats[i] << " "; -//// } -//// cout << endl; -// } - -// cout << "here4" << endl; - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - ambiguous_transfer::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); -// cout << "here5" << endl; - // final outputs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // beam tree - vector, float> > beamTree; - // rules combinations - vector > combNodes; - - nodesPool = ambiguous_transfer::getNodesPool (tokenRules); -// cout << "here6" << endl; - ambiguous_transfer::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); -// cout << "ambigsize = " << ambigInfo.size () << endl; - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size (); j++) - if (ambigInfo[j].combinations.size () > 1) - newAmbigInfo.push_back (ambigInfo[j]); - -// cout << "here8" << endl; - CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, - localeId); -// cout << "here9" << endl; - ambiguous_transfer::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, - spaces); -// cout << "here8" << endl; - - for (map >::iterator it = ruleOutputs.begin (); - it != ruleOutputs.end (); it++) - { - cout << "ruleId=" << it->first << endl; - map outs = it->second; - - for (map::iterator it2 = outs.begin (); it2 != outs.end (); - it2++) - { - cout << "tokId=" << it2->first << " , out = " << it2->second << endl; - } - cout << endl; - } - cout << endl; - - for (unsigned j = 0; j < tlTokens.size (); j++) - { - vector nodes = nodesPool[j]; - cout << "tokId = " << j << " : " << tlTokens[j] << endl; - for (unsigned k = 0; k < nodes.size (); k++) - { - cout << "ruleId = " << nodes[k].ruleId << "; patNum = " << nodes[k].patNum - << endl; - } - cout << endl; - } - - for (unsigned j = 0; j < combNodes.size (); j++) - { - vector nodes = combNodes[j]; - for (unsigned k = 0; k < nodes.size (); k++) - { - cout << "tok=" << nodes[k].tokenId << "; rul=" << nodes[k].ruleId - << "; pat=" << nodes[k].patNum << " - "; - } - cout << endl; - } - - for (unsigned j = 0; j < outs.size (); j++) - cout << outs[j] << endl; - - // write the outs - for (unsigned j = 0; j < outs.size (); j++) - { - fputs (outs[j].c_str (), outFile); - } - } - -} - -FILE * -open_input (string const &filename) -{ - FILE *input = fopen (filename.c_str (), "r"); - if (!input) - { - wcerr << "Error: can't open input file '"; - wcerr << filename.c_str () << "'." << endl; - exit (EXIT_FAILURE); - } - - return input; -} - -FILE * -open_output (string const &filename) -{ - FILE *output = fopen (filename.c_str (), "w"); - if (!output) - { - wcerr << "Error: can't open output file '"; - wcerr << filename.c_str () << "'." 
<< endl; - exit (EXIT_FAILURE); - } - return output; -} - -int -main (int argc, char **argv) -{ - string transferFilePath = - "/home/aboelhamd/eclipse-workspace/machinetranslation/apertium-eng-spa.spa-eng.t1x"; - string modelsFileDest = "nomodel"; - string k = "8"; - string lextor = "lex1.txt"; - string out = "here.txt"; - - FILE *input = stdin, *output = stdout; - input = open_input (lextor); - output = open_output (out); - BeamSearch::transfer (transferFilePath, modelsFileDest, k, input, output); -} diff --git a/src/BeamSearch.h b/src/BeamSearch.h deleted file mode 100644 index 0872415..0000000 --- a/src/BeamSearch.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * BeamSearch.h - * - * Created on: Mar 10, 2019 - * Author: aboelhamd - */ - -#ifndef SRC_BEAMSEARCH_H_ -#define SRC_BEAMSEARCH_H_ - -#include - -using namespace std; - -class BeamSearch { -public: - static void transfer(string transferFilePath, string modelsFileDest, - string k, FILE* lextorFileFile, FILE* outFile); -}; -#endif /* SRC_BEAMSEARCH_H_ */ diff --git a/src/CLExec.cpp b/src/CLExec.cpp deleted file mode 100644 index 6416d3a..0000000 --- a/src/CLExec.cpp +++ /dev/null @@ -1,413 +0,0 @@ -/* - * CLExec.cpp - * - * Created on: Jun 21, 2018 - * Author: aboelhamd - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "CLExec.h" -#include "TranElemLiterals.h" - -using namespace std; -using namespace elem; - -string exec(string cmd) { - string data; - FILE * stream; - const int max_buffer = 256; - char buffer[max_buffer]; - - stream = popen(cmd.c_str(), "r"); - if (stream) { - while (!feof(stream)) - if (fgets(buffer, max_buffer, stream) != NULL) - data.append(buffer); - pclose(stream); - } - return data; -} - -void CLExec::segmenter(string inFilePath, string outFilePath) { - // clear file before writing again - ofstream ofs; - ofs.open(outFilePath.c_str(), ofstream::out | ofstream::trunc); - exec( - string("ruby2.3 kazSentenceTokenizer.rb ") + inFilePath - + string(" ") + outFilePath); -} - -void CLExec::biltrans(string inFilePath, string outFilePath) { - // clear file before writing again - ofstream ofs; - ofs.open(outFilePath.c_str(), ofstream::out | ofstream::trunc); - exec( - string("apertium -d $HOME/apertium-kaz-tur kaz-tur-biltrans ") - + inFilePath + string(" ") + outFilePath); -} - -void CLExec::lextor(string inFilePath, string outFilePath) { - // clear file before writing again - ofstream ofs; - ofs.open(outFilePath.c_str(), ofstream::out | ofstream::trunc); - exec( - string("lrx-proc -m $HOME/apertium-kaz-tur/kaz-tur.autolex.bin ") - + inFilePath + string(" >") + outFilePath); -} - -void CLExec::interchunk(string inFilePath, string outFilePath) { - exec( - string("apertium-interchunk") - + string( - " $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t2x") - + string(" $HOME/apertium-kaz-tur/kaz-tur.t2x.bin ") - + inFilePath + string(" ") + outFilePath); -} - -void CLExec::postchunk(string inFilePath, string outFilePath) { - exec( - string("apertium-postchunk") - + string( - " $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t3x") - + string(" $HOME/apertium-kaz-tur/kaz-tur.t3x.bin ") - + inFilePath + string(" ") + outFilePath); -} - -void CLExec::transfer(string inFilePath, string outFilePath) { - exec( - string("apertium-transfer -n") - + string( - " $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t4x") - + string(" $HOME/apertium-kaz-tur/kaz-tur.t4x.bin ") - + inFilePath - 
+ string( - " | lt-proc -g $HOME/apertium-kaz-tur/kaz-tur.autogen.bin") - + string( - " | lt-proc -p $HOME/apertium-kaz-tur/kaz-tur.autopgen.bin") - + string(" >") + outFilePath); -} - -void CLExec::assignWeights(string inFilePath, string outFilePath) { - exec( - (string("python3 $HOME/NormaliseK/exampleken.py <") - + string(inFilePath) + string(">") + string(outFilePath)).c_str()); -} - -vector CLExec::getFilesInDir(string dir) { - vector files; - - DIR *pDIR; - struct dirent *entry; - if ((pDIR = opendir((string("./") + dir).c_str()))) { - while ((entry = readdir(pDIR))) { - if (strcmp(entry->d_name, ".") != 0 - && strcmp(entry->d_name, "..") != 0) { - files.push_back(entry->d_name); - } - } - closedir(pDIR); - } - - return files; -} - -//void -//CLExec::runYasmet () -//{ -// vector datasets = getFilesInDir (DATASETS); -// -// for (unsigned i = 0; i < datasets.size (); i++) -// { -// string dataset = datasets[i]; -// -// exec ( -// (string ("./yasmet <") + DATASETS + string ("/") + dataset + string (">") -// + MODELS + string ("/") + dataset + string (".model")).c_str ()); -// } -//} - -map > > CLExec::loadYasmetModels( - string modelsFilePath, string *localeid) { - // map with key yasmet model name and the value is - // another map with key word name and the value is - // vector of weights in order - map > > classWeights; - - ifstream modelsFile((modelsFilePath).c_str()); - - if (modelsFile.is_open()) { - string line, model, token, weight; - - // localeid - getline(modelsFile, line); - *localeid = line; - - while (getline(modelsFile, line)) { - // 0=>word , 1=>rule_num & 2=>wieght - // we don't need rule number , because - // the weights are already sorted - - char lineChar[line.size()]; - strcpy(lineChar, line.c_str()); - - token = strtok(lineChar, ": "); - if (token == "file") { - model = strtok(NULL, ": "); - continue; - } - // skip rule_num - strtok(NULL, ": "); -// cout << "rulenum= " << strtok(NULL, ": ") << endl; - - weight = strtok(NULL, ": "); -// cout << "weight= " << weight << endl; - - float w = strtof(weight.c_str(), NULL); -// cout << w << endl; -// if (w < 0) -// cout << w << endl; - classWeights[model][token].push_back(w); -// if (classWeights[model][token][classWeights[model][token].size() - 1] -// < 0) -// cout << w << endl; -// cout -// << classWeights[model][token][classWeights[model][token].size() -// - 1] << endl; - } - } else { - cout << "error in opening models file" << endl; - } -// for (map > >::iterator it = -// classWeights.begin(); it != classWeights.end(); it++) { -// cout << "model=" << it->first << endl; -// for (map >::iterator it2 = it->second.begin(); -// it2 != it->second.end(); it2++) { -// cout << "word= " << it2->first << endl; -//// vector weights = it2->second; -// for (unsigned i = 0; i < it2->second.size(); i++) { -// cout << it2->second[i] << endl; -// } -// cout << endl; -// } -// } - return classWeights; -} - -string CLExec::toLowerCase(string word, string localeId) { - icu::UnicodeString uString(word.c_str()); - string lowWord; - uString.toLower(localeId.c_str()).toUTF8String(lowWord); - return lowWord; -} - -string CLExec::toUpperCase(string word, string localeId) { - icu::UnicodeString uString(word.c_str()); - string upWord; - uString.toUpper(localeId.c_str()).toUTF8String(upWord); - return upWord; -} - -string CLExec::FirLetUpperCase(string word, string localeId) { - icu::UnicodeString uString(word.c_str()); - uString.toLower(localeId.c_str()); - uString.setCharAt(0, - 
icu::UnicodeString(uString.charAt(0)).toUpper(localeId.c_str()).charAt( - 0)); - - string upWord; - uString.toUTF8String(upWord); - return upWord; -} - -// The result of bitwise character comparison: 0 if this contains -// the same characters as text, -1 if the characters in this are -// bitwise less than the characters in text, +1 if the characters -// in this are bitwise greater than the characters in text. -int CLExec::compare(string word1, string word2) { - icu::UnicodeString uString1(word1.c_str()); - icu::UnicodeString uString2(word2.c_str()); - - return uString1.compare(uString2); -} - -int CLExec::compareCaseless(string word1, string word2, string localeId) { - icu::UnicodeString uString1(word1.c_str()); - uString1.toLower(localeId.c_str()); - icu::UnicodeString uString2(word2.c_str()); - uString2.toLower(localeId.c_str()); - - return uString1.compare(uString2); -} - -// to sort translations from best to worth by their weight -bool sortParameter(pair, float> a, - pair, float> b) { - return (a.second > b.second); -} - -void CLExec::beamSearch( - vector, float> > *beamTree, - unsigned beam, vector slTokens, - vector ambigInfo, - map > > classesWeights, - string localeId) { - // Initialization - (*beamTree).push_back(pair, float>()); - - for (unsigned i = 0; i < ambigInfo.size(); i++) { -// for (unsigned x = 0; x < beamTree->size (); x++) -// { -// cout << "weight = " << (*beamTree)[x].second << endl; -// for (unsigned j = 0; j < (*beamTree)[x].first.size (); j++) -// { -// cout << (*beamTree)[x].first[j].tokenId << " " -// << (*beamTree)[x].first[j].ruleId << " " -// << (*beamTree)[x].first[j].patNum << endl; -// } -// } - - ambiguous_transfer::AmbigInfo ambig = ambigInfo[i]; -// pair, pair > > > p = -// ambigInfo[i]; -// pair wordInd = p.first; -// vector > ambigRules = p.second.second; - unsigned ambigRulesSize = ambig.combinations.size(); - - // name of the file is the concatenation of rules ids - string rulesNums; - for (unsigned x = 0; x < ambigRulesSize; x++) { - // avoid dummy node - for (unsigned y = 1; y < ambig.combinations[x].size(); y++) { - stringstream ss; - ss << ambig.combinations[x][y].ruleId; - rulesNums += ss.str(); - - if (y + 1 < ambig.combinations[x].size()) - rulesNums += "_"; - } - rulesNums += "+"; - } - -// cout << rulesNums << endl; - - map > classWeights = classesWeights[(rulesNums - + ".model")]; - - // build new tree for the new words - vector, float> > newTree; - - // initialize the new tree - for (unsigned x = 0; x < ambigRulesSize; x++) { - newTree.push_back( - pair, float>( - vector(), 0)); - } - // put rules - for (unsigned z = 0; z < ambigRulesSize; z++) { - for (unsigned y = 0; y < ambig.combinations[z].size(); y++) { - newTree[z].first.push_back(ambig.combinations[z][y]); - } - } - - for (unsigned x = ambig.firTokId; x < ambig.firTokId + ambig.maxPat; - x++) { - // word key is the word and it's order in the rule - stringstream ss; - ss << x - ambig.firTokId; - string num = "_" + ss.str(); - - // handle the case of two lemmas separated by a space - for (unsigned t = 0; t < slTokens[x].size(); t++) - if (slTokens[x][t] == ' ') - slTokens[x].replace(t, 1, "_"); - - string word = toLowerCase(slTokens[x], localeId) + num; - vector wordWeights = classWeights[word]; - - // put weights - if (wordWeights.empty()) { - for (unsigned z = 0; z < ambigRulesSize; z++) - newTree[z].second += 1; - cout << "word : " << word << " is not found in dataset : " - << rulesNums << endl; - } - - else - for (unsigned z = 0; z < ambigRulesSize; z++) - newTree[z].second += 
wordWeights[z]; - - } - - // expand beamTree - unsigned initSize = beamTree->size(); - for (unsigned z = 0; z < ambigRulesSize - 1; z++) { - for (unsigned x = 0; x < initSize; x++) { - beamTree->push_back( - pair, float>( - (*beamTree)[x])); - } - } - - // merge the two trees - for (unsigned z = 0; z < ambigRulesSize; z++) { - for (unsigned x = initSize * z; x < initSize * (z + 1); x++) { - // put the new rules with the old - (*beamTree)[x].first.insert((*beamTree)[x].first.end(), - newTree[z].first.begin(), newTree[z].first.end()); - - // add their wiehgts - (*beamTree)[x].second += newTree[z].second; - } - } - - // sort beam tree - sort(beamTree->begin(), beamTree->end(), sortParameter); - - // remove elements more than (beam) - if (beamTree->size() > beam) - beamTree->erase(beamTree->begin() + beam, beamTree->end()); - } -} - -void CLExec::getTransInds(vector > *transInds, - vector, float> > beamTree, - vector > > rulesIds) { - for (unsigned i = 0; i < beamTree.size(); i++) { - vector transInd = beamTree[i].first; - for (unsigned j = 0; j < rulesIds.size(); j++) { - vector > weigInd = rulesIds[j]; - - unsigned count = 0; - for (unsigned x = 0; x < weigInd.size() && count < transInd.size(); - x++) { - if (transInd[count] == weigInd[x].first) - count++; - } - - if (count == transInd.size()) { - transInds->push_back( - pair(j, beamTree[i].second)); - break; - } - } - } -} diff --git a/src/CLExec.h b/src/CLExec.h deleted file mode 100644 index 2d956af..0000000 --- a/src/CLExec.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * CLExec.h - * - * Created on: Jun 21, 2018 - * Author: aboelhamd - */ - -#ifndef SRC_CLEXEC_H_ -#define SRC_CLEXEC_H_ - -#include -#include - -#include "ambiguous_transfer.h" - -using namespace std; - -class CLExec { -public: - - static void - segmenter(string inFilePath, string outFilePath); - - static void - lextor(string inFilePath, string outFilePath); - - static void - biltrans(string inFilePath, string outFilePath); - - static void - interchunk(string inFilePath, string outFilePath); - - static void - postchunk(string inFilePath, string outFilePath); - - static void - transfer(string inFilePath, string outFilePath); - - static void - assignWeights(string inFilePath, string outFilePath); - - static vector - getFilesInDir(string dir); - -// static void -// runYasmet (); - - static map > > - loadYasmetModels(string modelsDest, string *localeid); - - static void - handleDatasets(); - - static string - toLowerCase(string word, string localeId); - - static string - toUpperCase(string word, string localeId); - - static string - FirLetUpperCase(string word, string localeId); - - static int - compare(string word1, string word2); - - static int - compareCaseless(string word1, string word2, string localeId); - - static void - beamSearch(vector, float> > *beamTree, - unsigned beam, vector slTokens, - vector ambigInfo, - map > > classesWeights, - string localeId); - - static void - getTransInds(vector > *transInds, - vector, float> > beamTree, - vector > > rulesIds); -}; - -#endif /* SRC_CLEXEC_H_ */ diff --git a/src/ModelResult.cpp b/src/ModelResult.cpp index 20c1530..3c5bc55 100644 --- a/src/ModelResult.cpp +++ b/src/ModelResult.cpp @@ -1,363 +1,363 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "RuleParser.h" -#include "ambiguous_transfer.h" -#include "TranElemLiterals.h" -#include "CLExec.h" - -#include - -using namespace std; -using namespace elem; - -int main(int 
argc, char **argv) { - string sentenceFilePath, lextorFilePath, localeId, transferFilePath, - transferOutFilePath, weightFilePath, outputFilePath, - bestModFilePath, randModFilePath; - - if (argc == 10) { - localeId = argv[1]; - transferFilePath = argv[2]; - sentenceFilePath = argv[3]; - lextorFilePath = argv[4]; - - transferOutFilePath = argv[5]; - weightFilePath = argv[6]; - - outputFilePath = argv[7]; - bestModFilePath = argv[8]; - randModFilePath = argv[9]; - } else { -// localeId = "es_ES"; -// transferFilePath = "transferFile.t1x"; -// sentenceFilePath = "spa-test.txt"; -// lextorFilePath = "spa-test.lextor"; -// interInFilePath = "beaminter.out"; -// modelsDest = "modelstry"; -// k = "8"; - -// localeId = "kk_KZ"; -// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; -// sentenceFilePath = "sample-sentences.txt"; -// lextorFilePath = "sample-lextor.txt"; -// -// transferOutFilePath = "sample-transfer.txt"; -// weightFilePath = "sample-weights.txt"; -// -// outputFilePath = "outAnalysis.txt"; -// bestModFilePath = "bestModFile.txt"; -// randModFilePath = "randModFile.txt"; - - localeId = "es_ES"; - transferFilePath = "transferFile3.t1x"; - sentenceFilePath = "spa-toknizer.txt"; - lextorFilePath = "spa-lextor.txt"; - - transferOutFilePath = "spa-transfer.txt"; - weightFilePath = "spa-weight.txt"; - - outputFilePath = "outAnalysis.txt"; - bestModFilePath = "bestModFile.txt"; - randModFilePath = "randModFile.txt"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath weightOutFilePath outputFilePath bestModFilePath randModFilePath" - << endl; - cout - << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout - << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "sentenceFilePath : Source language sentences file." << endl; - cout - << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "transferOutFilePath : Output file of apertium transfer for the source language sentences." - << endl; - cout - << "weightOutFilePath : Language model weights file for the source language sentences." - << endl; - cout - << "outputFilePath : First output file name of this program which is the complete analysis for the source language sentences." - << endl; - cout - << "bestModFilePath : Second output file name which is the best (language model) translations for the source language sentences." - << endl; - cout - << "randModFilePath : Third output file name which is random translations from (language model) for the source language sentences." - << endl; - return -1; - } - - // seed for randomness - srand(time(NULL)); - - ifstream lextorFile(lextorFilePath.c_str()); - ifstream inSentenceFile(sentenceFilePath.c_str()); - if (lextorFile.is_open() && inSentenceFile.is_open()) { - - xmlDoc* doc = xmlReadFile(transferFilePath.c_str(), NULL, 0); - - if (doc == NULL) { - cerr << "Error: Could not parse file \'" << transferFilePath - << "\'." 
<< endl; - exit(EXIT_FAILURE); - } - - xmlNode* transfer = xmlDocGetRootElement(doc); - - vector sourceSentences, tokenizedSentences; - - string tokenizedSentence; - while (getline(lextorFile, tokenizedSentence)) { - string sourceSentence; - if (!getline(inSentenceFile, sourceSentence)) - sourceSentence = "No more sentences"; - - sourceSentences.push_back(sourceSentence); - tokenizedSentences.push_back(tokenizedSentence); - } - lextorFile.close(); - inSentenceFile.close(); - - map > > attrs = RuleParser::getAttrs( - transfer); - map vars = RuleParser::getVars(transfer); - map > lists = RuleParser::getLists(transfer); - - // empty output files - ofstream outputFile(outputFilePath.c_str()); - outputFile.close(); - ofstream bestModFile(bestModFilePath.c_str()); - bestModFile.close(); - ofstream randModFile(randModFilePath.c_str()); - randModFile.close(); - - ifstream weightFile(weightFilePath.c_str()); - ifstream transferOutFile(transferOutFilePath.c_str()); - - if (weightFile.is_open() && transferOutFile.is_open()) - for (unsigned i = 0; i < sourceSentences.size(); i++) { - cout << i << endl; - - string sourceSentence, tokenizedSentence; - sourceSentence = sourceSentences[i]; - tokenizedSentence = tokenizedSentences[i]; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, - &tlTags, &spaces, tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, - transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens, - slTags, tlTokens, tlTags, rulesApplied, attrs, lists, - &vars, spaces, localeId); - - // final outputs - vector normOuts; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // rules combinations - vector > normCombNodes; - - nodesPool = RuleExecution::getNodesPool(tokenRules); - - RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo, - &compNum); - - RuleExecution::getOuts(&normOuts, &normCombNodes, ambigInfo, - nodesPool, ruleOutputs, spaces); - - // read weights - string line; - vector normWeights; - for (unsigned j = 0; j < normOuts.size(); j++) { - getline(weightFile, line); - float weight = strtof(line.c_str(), NULL); - normWeights.push_back(weight); - } - - // read transfer - vector normTransfers; - for (unsigned j = 0; j < normOuts.size(); j++) { - getline(transferOutFile, line); - normTransfers.push_back(line); - } - - // remove redundant outputs - vector outs; - vector > combNodes; - vector weights; - vector transfers; - for (unsigned j = 0; j < normOuts.size(); j++) - if (find(outs.begin(), outs.end(), normOuts[j]) - == outs.end()) { - outs.push_back(normOuts[j]); - combNodes.push_back(normCombNodes[j]); - weights.push_back(normWeights[j]); - transfers.push_back(normTransfers[j]); - } - normOuts = outs; - normCombNodes = combNodes; - 
normWeights = weights; - normTransfers = transfers; - - // normalize weights - RuleExecution::normaliseWeights(&normWeights); - - // write normal outputs - ofstream outputFile(outputFilePath.c_str(), ofstream::app); - if (outputFile.is_open()) { - outputFile << "Analysis of sentence : " << endl; - outputFile << sourceSentence << endl << endl << endl; - - outputFile << endl; - outputFile - << "sentence id ||| coverage id ||| original sentence |||" - << " lextor ||| rules ||| chunker ||| final sentence ||| score" - << endl << endl; - - for (unsigned j = 0; j < normWeights.size(); j++) { - // sentence id - outputFile << (i + 1) << " ||| "; - // coverage id - outputFile << (j + 1) << " ||| "; - // original sentence - outputFile << sourceSentence << " ||| "; - // lextor - outputFile << tokenizedSentence << " ||| "; - // rules - for (unsigned k = 0; k < normCombNodes[j].size(); k++) - if (normCombNodes[j][k].ruleId) - outputFile << normCombNodes[j][k].ruleId << " "; - outputFile << "||| "; - // chuncker - outputFile << normOuts[j] << " ||| "; - // final sentence - outputFile << normTransfers[j] << " ||| "; - // score - outputFile << normWeights[j] << endl << endl; - } - - outputFile - << "---------------------------------------------------------------------------------------------------------" - << endl << endl; - - outputFile.close(); - } - - // Model weighting - // best weight - ofstream bestModFile(bestModFilePath.c_str(), ofstream::app); - if (bestModFile.is_open()) { -// bestModFile -// << "---------------------------------------------------------------------------------------------------------" -// << endl << endl; -// -// bestModFile << (i + 1) << endl; -// bestModFile << "Source : " << sourceSentence << endl << endl; - - unsigned maxInd = 0; - for (unsigned j = 1; j < normWeights.size(); j++) { - if (normWeights[j] > normWeights[maxInd]) - maxInd = j; - } - - // final sentence - bestModFile /*<< "Target : "*/<< normTransfers[maxInd] - << endl; - // score -// bestModFile << "Weight : " << normWeights[maxInd] << endl; - // rules -// bestModFile << "Rules : "; -// for (unsigned k = 0; k < normCombNodes[maxInd].size (); k++) -// if (normCombNodes[maxInd][k].ruleId) -// bestModFile << normCombNodes[maxInd][k].ruleId << " "; -// -// bestModFile << endl -// << "---------------------------------------------------------------------------------------------------------" -// << endl << endl << endl; - } - bestModFile.close(); - - // Random weight - ofstream randModFile(randModFilePath.c_str(), ofstream::app); - if (randModFile.is_open()) { -// randModFile << (i + 1) << endl; -// randModFile << "Source : " << sourceSentence << endl << endl; - - int random = rand() % normWeights.size(); - - // final sentence - randModFile /*<< "Target : "*/<< normTransfers[random] - << endl; - // score -// randModFile << "Weight : " << normWeights[random] << endl; -// // rules -// randModFile << "Rules : "; -// for (unsigned k = 0; k < normCombNodes[random].size (); k++) -// if (normCombNodes[random][k].ruleId) -// randModFile << normCombNodes[random][k].ruleId << " "; -// -// randModFile << endl -// << "---------------------------------------------------------------------------------------------------------" -// << endl << endl << endl; - } - randModFile.close(); - } - else { - cout << "ERROR in opening files!" << endl; - } - weightFile.close(); - transferOutFile.close(); - } else { - cout << "ERROR in opening files!" 
<< endl; - } - return 0; -} +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "RuleParser.h" +//#include "ambiguous_transfer.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//#include +// +//using namespace std; +//using namespace elem; +// +//int main(int argc, char **argv) { +// string sentenceFilePath, lextorFilePath, localeId, transferFilePath, +// transferOutFilePath, weightFilePath, outputFilePath, +// bestModFilePath, randModFilePath; +// +// if (argc == 10) { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// sentenceFilePath = argv[3]; +// lextorFilePath = argv[4]; +// +// transferOutFilePath = argv[5]; +// weightFilePath = argv[6]; +// +// outputFilePath = argv[7]; +// bestModFilePath = argv[8]; +// randModFilePath = argv[9]; +// } else { +//// localeId = "es_ES"; +//// transferFilePath = "transferFile.t1x"; +//// sentenceFilePath = "spa-test.txt"; +//// lextorFilePath = "spa-test.lextor"; +//// interInFilePath = "beaminter.out"; +//// modelsDest = "modelstry"; +//// k = "8"; +// +//// localeId = "kk_KZ"; +//// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +//// sentenceFilePath = "sample-sentences.txt"; +//// lextorFilePath = "sample-lextor.txt"; +//// +//// transferOutFilePath = "sample-transfer.txt"; +//// weightFilePath = "sample-weights.txt"; +//// +//// outputFilePath = "outAnalysis.txt"; +//// bestModFilePath = "bestModFile.txt"; +//// randModFilePath = "randModFile.txt"; +// +// localeId = "es_ES"; +// transferFilePath = "transferFile3.t1x"; +// sentenceFilePath = "spa-toknizer.txt"; +// lextorFilePath = "spa-lextor.txt"; +// +// transferOutFilePath = "spa-transfer.txt"; +// weightFilePath = "spa-weight.txt"; +// +// outputFilePath = "outAnalysis.txt"; +// bestModFilePath = "bestModFile.txt"; +// randModFilePath = "randModFile.txt"; +// +// cout << "Error in parameters !" << endl; +// cout +// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath transferOutFilePath weightOutFilePath outputFilePath bestModFilePath randModFilePath" +// << endl; +// cout +// << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout +// << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "sentenceFilePath : Source language sentences file." << endl; +// cout +// << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout +// << "transferOutFilePath : Output file of apertium transfer for the source language sentences." +// << endl; +// cout +// << "weightOutFilePath : Language model weights file for the source language sentences." +// << endl; +// cout +// << "outputFilePath : First output file name of this program which is the complete analysis for the source language sentences." +// << endl; +// cout +// << "bestModFilePath : Second output file name which is the best (language model) translations for the source language sentences." +// << endl; +// cout +// << "randModFilePath : Third output file name which is random translations from (language model) for the source language sentences." 
+// << endl; +// return -1; +// } +// +// // seed for randomness +// srand(time(NULL)); +// +// ifstream lextorFile(lextorFilePath.c_str()); +// ifstream inSentenceFile(sentenceFilePath.c_str()); +// if (lextorFile.is_open() && inSentenceFile.is_open()) { +// +// xmlDoc* doc = xmlReadFile(transferFilePath.c_str(), NULL, 0); +// +// if (doc == NULL) { +// cerr << "Error: Could not parse file \'" << transferFilePath +// << "\'." << endl; +// exit(EXIT_FAILURE); +// } +// +// xmlNode* transfer = xmlDocGetRootElement(doc); +// +// vector sourceSentences, tokenizedSentences; +// +// string tokenizedSentence; +// while (getline(lextorFile, tokenizedSentence)) { +// string sourceSentence; +// if (!getline(inSentenceFile, sourceSentence)) +// sourceSentence = "No more sentences"; +// +// sourceSentences.push_back(sourceSentence); +// tokenizedSentences.push_back(tokenizedSentence); +// } +// lextorFile.close(); +// inSentenceFile.close(); +// +// map > > attrs = RuleParser::getAttrs( +// transfer); +// map vars = RuleParser::getVars(transfer); +// map > lists = RuleParser::getLists(transfer); +// +// // empty output files +// ofstream outputFile(outputFilePath.c_str()); +// outputFile.close(); +// ofstream bestModFile(bestModFilePath.c_str()); +// bestModFile.close(); +// ofstream randModFile(randModFilePath.c_str()); +// randModFile.close(); +// +// ifstream weightFile(weightFilePath.c_str()); +// ifstream transferOutFile(transferOutFilePath.c_str()); +// +// if (weightFile.is_open() && transferOutFile.is_open()) +// for (unsigned i = 0; i < sourceSentences.size(); i++) { +// cout << i << endl; +// +// string sourceSentence, tokenizedSentence; +// sourceSentence = sourceSentences[i]; +// tokenizedSentence = tokenizedSentences[i]; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, +// &tlTags, &spaces, tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, +// transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens, +// slTags, tlTokens, tlTags, rulesApplied, attrs, lists, +// &vars, spaces, localeId); +// +// // final outputs +// vector normOuts; +// // number of generated combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // rules combinations +// vector > normCombNodes; +// +// nodesPool = RuleExecution::getNodesPool(tokenRules); +// +// RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo, +// &compNum); +// +// RuleExecution::getOuts(&normOuts, &normCombNodes, ambigInfo, +// nodesPool, ruleOutputs, spaces); +// +// // read weights +// string line; +// vector normWeights; +// for (unsigned j = 0; j < normOuts.size(); j++) { +// getline(weightFile, line); +// float weight = strtof(line.c_str(), NULL); +// 
normWeights.push_back(weight); +// } +// +// // read transfer +// vector normTransfers; +// for (unsigned j = 0; j < normOuts.size(); j++) { +// getline(transferOutFile, line); +// normTransfers.push_back(line); +// } +// +// // remove redundant outputs +// vector outs; +// vector > combNodes; +// vector weights; +// vector transfers; +// for (unsigned j = 0; j < normOuts.size(); j++) +// if (find(outs.begin(), outs.end(), normOuts[j]) +// == outs.end()) { +// outs.push_back(normOuts[j]); +// combNodes.push_back(normCombNodes[j]); +// weights.push_back(normWeights[j]); +// transfers.push_back(normTransfers[j]); +// } +// normOuts = outs; +// normCombNodes = combNodes; +// normWeights = weights; +// normTransfers = transfers; +// +// // normalize weights +// RuleExecution::normaliseWeights(&normWeights); +// +// // write normal outputs +// ofstream outputFile(outputFilePath.c_str(), ofstream::app); +// if (outputFile.is_open()) { +// outputFile << "Analysis of sentence : " << endl; +// outputFile << sourceSentence << endl << endl << endl; +// +// outputFile << endl; +// outputFile +// << "sentence id ||| coverage id ||| original sentence |||" +// << " lextor ||| rules ||| chunker ||| final sentence ||| score" +// << endl << endl; +// +// for (unsigned j = 0; j < normWeights.size(); j++) { +// // sentence id +// outputFile << (i + 1) << " ||| "; +// // coverage id +// outputFile << (j + 1) << " ||| "; +// // original sentence +// outputFile << sourceSentence << " ||| "; +// // lextor +// outputFile << tokenizedSentence << " ||| "; +// // rules +// for (unsigned k = 0; k < normCombNodes[j].size(); k++) +// if (normCombNodes[j][k].ruleId) +// outputFile << normCombNodes[j][k].ruleId << " "; +// outputFile << "||| "; +// // chuncker +// outputFile << normOuts[j] << " ||| "; +// // final sentence +// outputFile << normTransfers[j] << " ||| "; +// // score +// outputFile << normWeights[j] << endl << endl; +// } +// +// outputFile +// << "---------------------------------------------------------------------------------------------------------" +// << endl << endl; +// +// outputFile.close(); +// } +// +// // Model weighting +// // best weight +// ofstream bestModFile(bestModFilePath.c_str(), ofstream::app); +// if (bestModFile.is_open()) { +//// bestModFile +//// << "---------------------------------------------------------------------------------------------------------" +//// << endl << endl; +//// +//// bestModFile << (i + 1) << endl; +//// bestModFile << "Source : " << sourceSentence << endl << endl; +// +// unsigned maxInd = 0; +// for (unsigned j = 1; j < normWeights.size(); j++) { +// if (normWeights[j] > normWeights[maxInd]) +// maxInd = j; +// } +// +// // final sentence +// bestModFile /*<< "Target : "*/<< normTransfers[maxInd] +// << endl; +// // score +//// bestModFile << "Weight : " << normWeights[maxInd] << endl; +// // rules +//// bestModFile << "Rules : "; +//// for (unsigned k = 0; k < normCombNodes[maxInd].size (); k++) +//// if (normCombNodes[maxInd][k].ruleId) +//// bestModFile << normCombNodes[maxInd][k].ruleId << " "; +//// +//// bestModFile << endl +//// << "---------------------------------------------------------------------------------------------------------" +//// << endl << endl << endl; +// } +// bestModFile.close(); +// +// // Random weight +// ofstream randModFile(randModFilePath.c_str(), ofstream::app); +// if (randModFile.is_open()) { +//// randModFile << (i + 1) << endl; +//// randModFile << "Source : " << sourceSentence << endl << endl; +// +// int random = 
rand() % normWeights.size(); +// +// // final sentence +// randModFile /*<< "Target : "*/<< normTransfers[random] +// << endl; +// // score +//// randModFile << "Weight : " << normWeights[random] << endl; +//// // rules +//// randModFile << "Rules : "; +//// for (unsigned k = 0; k < normCombNodes[random].size (); k++) +//// if (normCombNodes[random][k].ruleId) +//// randModFile << normCombNodes[random][k].ruleId << " "; +//// +//// randModFile << endl +//// << "---------------------------------------------------------------------------------------------------------" +//// << endl << endl << endl; +// } +// randModFile.close(); +// } +// else { +// cout << "ERROR in opening files!" << endl; +// } +// weightFile.close(); +// transferOutFile.close(); +// } else { +// cout << "ERROR in opening files!" << endl; +// } +// return 0; +//} diff --git a/src/RuleParser.cpp b/src/RuleParser.cpp deleted file mode 100644 index daa1b86..0000000 --- a/src/RuleParser.cpp +++ /dev/null @@ -1,322 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "RuleParser.h" - -using namespace std; -using namespace elem; - -void RuleParser::sentenceTokenizer(vector* slTokens, - vector* tlTokens, vector >* slTags, - vector >* tlTags, vector* spaces, - string tokenizedSentenceStr) { - vector taggedTokens; - // from string to char* - char tokenizedSentence[tokenizedSentenceStr.size()]; - strcpy(tokenizedSentence, tokenizedSentenceStr.c_str()); - - char * taggedToken; - taggedToken = strtok(tokenizedSentence, "^"); - while (taggedToken != NULL) { - taggedTokens.push_back(taggedToken); - taggedToken = strtok(NULL, "^"); - } - - size_t taggedTokensSize = taggedTokens.size(); - for (unsigned i = 0; i < taggedTokensSize; i++) { - // take spaces after token - size_t dolSignInd = taggedTokens[i].find("$"); - spaces->push_back(taggedTokens[i].substr(dolSignInd + 1)); - taggedTokens[i] = taggedTokens[i].substr(0, dolSignInd); - - // remove multiple translations and take only the first one - size_t firSlashInd = taggedTokens[i].find("/"); - - // if no translation , remove that word - if (firSlashInd + 1 == taggedTokens[i].size()) { - taggedTokens.erase(taggedTokens.begin() + i); - spaces->erase(spaces->begin() + i); - taggedTokensSize--; - i--; - continue; - } - - size_t secSlashInd = taggedTokens[i].find("/", firSlashInd + 1); - if (secSlashInd != string::npos) - taggedTokens[i] = taggedTokens[i].substr(0, secSlashInd); - - // split source and target tokens - string target = taggedTokens[i].substr(firSlashInd + 1); - - taggedTokens.push_back(target); - - taggedTokens[i] = taggedTokens[i].substr(0, firSlashInd); - } - - for (unsigned i = 0; i < taggedTokens.size(); i++) { - char taggedToken[taggedTokens[i].size()]; - strcpy(taggedToken, taggedTokens[i].c_str()); - char* split; - - string token; - vector tokTags; - -// cout << "taggedToken : " << taggedToken << endl; - - if (taggedToken[0] != '<') { - split = strtok(taggedToken, "<>"); - token = split; - split = strtok(NULL, "<>"); - } else { - split = strtok(taggedToken, "<>"); - } - -// cout << "word : " << token << endl; - - while (split != NULL) { - string tag = split; - tokTags.push_back(tag); - - split = strtok(NULL, "<>"); - } - - if (i < taggedTokens.size() / 2) { - slTokens->push_back(token); - slTags->push_back(tokTags); - } else { - tlTokens->push_back(token); - tlTags->push_back(tokTags); - } - } -} - -void RuleParser::matchCats(map >* catsApplied, - vector slTokens, vector > tags, - xmlNode* transfer) { - xmlNode* section_def_cats = 
xml_operations::getChild(transfer, SECTION_DEF_CATS); - -// cout << "here" << endl; - - for (xmlNode* def_cat = xml_operations::getFirstChild(section_def_cats); def_cat; def_cat = - xml_operations::getFirstNext(def_cat)) { - - for (xmlNode* cat_item = xml_operations::getFirstChild(def_cat); cat_item; cat_item = - xml_operations::getFirstNext(cat_item)) { - - // separate tags from (t1.t2) format, for easy access - string tagsString = xml_operations::getAttVal(cat_item, TAGS); - - char tagDotted[tagsString.size()]; - strcpy(tagDotted, tagsString.c_str()); - char* split; - split = strtok(tagDotted, "."); - - vector itemTags; - - while (split != NULL) { - string tag = split; - itemTags.push_back(tag); - - split = strtok(NULL, "."); - } - - for (unsigned x = 0; x < slTokens.size(); x++) { - // if cat-item have lemma - if (!xml_operations::getAttVal(cat_item, LEMMA).empty()) { - if (xml_operations::getAttVal(cat_item, LEMMA) != slTokens[x]) { - continue; - } - } - - vector tokTags = tags[x]; - - unsigned i = 0, j = 0; - for (; i < tokTags.size() && j < itemTags.size(); i++) { - if (itemTags[j] == "*") { - if (j + 1 < itemTags.size() && i + 1 < tokTags.size() - && itemTags[j + 1] == tokTags[i + 1]) { - j += 2; - i++; - } - } else if (itemTags[j] == tokTags[i]) { - j++; - } else { - break; - } - } - - if (i == tokTags.size() - && (j == itemTags.size() - || (j + 1 == itemTags.size() - && itemTags[j] == "*" - && itemTags[j - 1] != tokTags[i - 1]))) { -// cout << N < > >* rulesApplied, - vector slTokens, map > catsApplied, - xmlNode* transfer) { - - xmlNode* section_rules = xml_operations::getChild(transfer, SECTION_RULES); - - vector tokensApplied; - - for (xmlNode* rule = xml_operations::getFirstChild(section_rules); rule; rule = - xml_operations::getFirstNext(rule)) { - - xmlNode* pattern = xml_operations::getChild(rule, PATTERN); - - // Put pattern items in vector for ease in processing - vector pattern_items; - for (xmlNode* pattern_item = pattern->children; pattern_item; - pattern_item = pattern_item->next) { - - pattern_items.push_back(pattern_item); - } - - for (unsigned i = 0; - (slTokens.size() >= pattern_items.size()) - && i <= slTokens.size() - pattern_items.size(); i++) { - - vector slMatchedTokens; - for (unsigned j = 0; j < pattern_items.size(); j++) { - - // match cat-item with pattern-item - string slToken = slTokens[i + j]; - vector cats = catsApplied[i + j]; - - for (unsigned k = 0; k < cats.size(); k++) { - // if cat name equals pattern item name - if (xml_operations::getAttVal(pattern_items[j], N) == cats[k]) { - slMatchedTokens.push_back(i + j); - break; - } - } - } - // if matched tokens' size = pattern items' size - // then this rule is matched - if (slMatchedTokens.size() == pattern_items.size()) { - if (slMatchedTokens.size() == 1) - tokensApplied.insert(tokensApplied.end(), - slMatchedTokens.begin(), slMatchedTokens.end()); - (*rulesApplied)[rule].push_back( - pair(slMatchedTokens[0], - slMatchedTokens.size())); - } - - } - - } - - // set a default rule for tokens without rules applied - vector > tokensNotApp; - for (unsigned i = 0; i < slTokens.size(); i++) { - bool found = false; - for (unsigned j = 0; j < tokensApplied.size(); j++) { - if (i == tokensApplied[j]) { - found = true; - break; - } - } - if (!found) { -// vector tokenNotApp; -// tokenNotApp.push_back (i); -// tokensNotApp.push_back (tokenNotApp); - tokensNotApp.push_back(pair(i, 1)); - } - } - - xmlNode* defaultRule; - - (*rulesApplied)[defaultRule] = tokensNotApp; -} - -// to sort attribute tags descendingly 
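The core of matchCats is the tag-pattern walk: a cat-item's tags attribute, split on '.', is matched against a token's tag list, with "*" absorbing any run of tags. A minimal self-contained sketch of that walk follows; tagsMatch is a hypothetical helper, and it simplifies the original's final condition by dropping the extra guard comparing the tag before a trailing wildcard.

#include <iostream>
#include <string>
#include <vector>

using namespace std;

// Does a cat-item tag pattern match a token's tags?  "*" stands for any
// run of tags and is only stepped over when the literal tag after it shows up.
static bool tagsMatch(const vector<string>& itemTags, const vector<string>& tokTags)
{
    unsigned i = 0, j = 0;
    for (; i < tokTags.size() && j < itemTags.size(); i++) {
        if (itemTags[j] == "*") {
            // advance past the wildcard once the next literal tag appears
            if (j + 1 < itemTags.size() && i + 1 < tokTags.size()
                    && itemTags[j + 1] == tokTags[i + 1]) {
                j += 2;
                i++;
            }
        } else if (itemTags[j] == tokTags[i]) {
            j++;
        } else {
            break;  // hard mismatch
        }
    }
    // matched if all token tags were consumed and the pattern is exhausted,
    // or it ends in a trailing "*" that absorbed the rest
    return i == tokTags.size()
            && (j == itemTags.size()
                    || (j + 1 == itemTags.size() && itemTags[j] == "*"));
}

int main()
{
    vector<string> pattern;       // "n.*" split on '.'
    pattern.push_back("n");
    pattern.push_back("*");

    vector<string> tags;          // token tags: n.acc.pl
    tags.push_back("n");
    tags.push_back("acc");
    tags.push_back("pl");

    cout << tagsMatch(pattern, tags) << endl;   // 1
}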
-bool sortParameter(vector a, vector b) { - return (a.size() > b.size()); -} - -map > > RuleParser::getAttrs(xmlNode* transfer) { - map > > attrs; - xmlNode* section_def_attrs = xml_operations::getChild(transfer, SECTION_DEF_ATTRS); - - for (xmlNode* def_attr = xml_operations::getFirstChild(section_def_attrs); def_attr; - def_attr = xml_operations::getFirstNext(def_attr)) { - - vector > allTags; - for (xmlNode* attr_item = xml_operations::getFirstChild(def_attr); attr_item; - attr_item = xml_operations::getFirstNext(attr_item)) { - - // splitting tags by '.' - string tagsString = xml_operations::getAttVal(attr_item, TAGS); - char tagsChars[tagsString.size()]; - strcpy(tagsChars, tagsString.c_str()); - - vector tags; - - char * tag; - tag = strtok(tagsChars, "."); - while (tag != NULL) { - tags.push_back(tag); - tag = strtok(NULL, "."); - } - - allTags.push_back(tags); - } - // sort the tags , descendingly by their size - sort(allTags.begin(), allTags.end(), sortParameter); -// cout << def_attr.attribute (N).value () << endl; - attrs[xml_operations::getAttVal(def_attr, N)] = allTags; - } - - return attrs; -} - -map RuleParser::getVars(xmlNode* transfer) { - map vars; - - xmlNode* section_def_vars = xml_operations::getChild(transfer, SECTION_DEF_VARS); - if (section_def_vars) - for (xmlNode* def_var = xml_operations::getFirstChild(section_def_vars); def_var; - def_var = xml_operations::getFirstNext(def_var)) { - - vars[xml_operations::getAttVal(def_var, N)] = xml_operations::getAttVal(def_var, V); - } - - return vars; -} - -map > RuleParser::getLists(xmlNode* transfer) { - map > lists; - - xmlNode* section_def_lists = xml_operations::getChild(transfer, SECTION_DEF_LISTS); - if (section_def_lists) - for (xmlNode* def_list = xml_operations::getFirstChild(section_def_lists); def_list; - def_list = xml_operations::getFirstNext(def_list)) { - - vector list; - for (xmlNode* list_item = xml_operations::getFirstChild(def_list); list_item; - list_item = xml_operations::getFirstNext(list_item)) { - - list.push_back(xml_operations::getAttVal(list_item, V)); - } - lists[xml_operations::getAttVal(def_list, N)] = list; - } - - return lists; -} diff --git a/src/RuleParser.h b/src/RuleParser.h deleted file mode 100644 index 3041f81..0000000 --- a/src/RuleParser.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * ruleParser.h - * - * Created on: Apr 25, 2018 - * Author: aboelhamd - */ - -#ifndef SRC_RULEPARSER_H_ -#define SRC_RULEPARSER_H_ - -#include "TranElemLiterals.h" -#include "xml_operations.h" - -using namespace std; - -class RuleParser { -public: - static void - sentenceTokenizer(vector* slTokens, vector* tlTokens, - vector >* slTags, vector >* tlTags, - vector* spaces, string tokenizedSentenceStr); - - static void - matchCats(map >* catsApplied, - vector slTokens, vector > tags, - xmlNode* transfer); - - static void - matchRules(map > >* rulesApplied, - vector slTokens, map > catsApplied, - xmlNode* transfer); - - static map > > - getAttrs(xmlNode* transfer); - - static map - getVars(xmlNode* transfer); - - static map > - getLists(xmlNode* transfer); -}; - -#endif /* SRC_RULEPARSER_H_ */ diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp index 757f307..46b0315 100644 --- a/src/RulesApplier.cpp +++ b/src/RulesApplier.cpp @@ -1,215 +1,215 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ambiguous_transfer.h" -#include "RuleParser.h" -#include "TranElemLiterals.h" -#include 
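getVars and getLists above read the section-def-vars and section-def-lists elements through the repo's xml_operations wrapper. For readers without that wrapper, here is a minimal sketch of the def-var pass written directly against libxml2 (the library the code already parses transfer files with); readDefVars is an illustrative name. Build with `xml2-config --cflags --libs`.

#include <iostream>
#include <map>
#include <string>

#include <libxml/parser.h>
#include <libxml/tree.h>

using namespace std;

// Collect <def-var n="..." v="..."/> pairs from <section-def-vars>.
static map<string, string> readDefVars(const char* transferFilePath)
{
    map<string, string> vars;

    xmlDoc* doc = xmlReadFile(transferFilePath, NULL, 0);
    if (doc == NULL) {
        cerr << "Error: Could not parse file '" << transferFilePath << "'." << endl;
        return vars;
    }

    xmlNode* transfer = xmlDocGetRootElement(doc);
    for (xmlNode* sec = transfer->children; sec; sec = sec->next) {
        if (sec->type != XML_ELEMENT_NODE
                || xmlStrcmp(sec->name, (const xmlChar*) "section-def-vars"))
            continue;
        for (xmlNode* defVar = sec->children; defVar; defVar = defVar->next) {
            if (defVar->type != XML_ELEMENT_NODE)
                continue;
            xmlChar* n = xmlGetProp(defVar, (const xmlChar*) "n");
            xmlChar* v = xmlGetProp(defVar, (const xmlChar*) "v");
            if (n)
                vars[(const char*) n] = v ? (const char*) v : "";
            if (n) xmlFree(n);
            if (v) xmlFree(v);
        }
    }

    xmlFreeDoc(doc);
    return vars;
}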
"CLExec.h" - -using namespace std; -using namespace elem; - -int main(int argc, char **argv) { - string localeId, transferFilePath, sentenceFilePath, lextorFilePath, - interInFilePath; - - if (argc == 6) { - localeId = argv[1]; - transferFilePath = argv[2]; - sentenceFilePath = argv[3]; - lextorFilePath = argv[4]; - interInFilePath = argv[5]; - } else { -// localeId = "es_ES"; -// transferFilePath = "transferFile.t1x"; -// sentenceFilePath = "spa-test.txt"; -// lextorFilePath = "spa-test.lextor"; -// interInFilePath = "inter2.txt"; - -// localeId = "kk_KZ"; -// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; -// sentenceFilePath = "sample-sentences.txt"; -// lextorFilePath = "sample-lextor.txt"; -// interInFilePath = "sample-inter.txt"; - - localeId = "es_ES"; - transferFilePath = "apertium-eng-spa.spa-eng.t1x"; - sentenceFilePath = "sentences.txt"; - lextorFilePath = "lextor.txt"; - interInFilePath = "interIn.txt"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath interInFilePath" - << endl; - cout - << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout - << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "sentenceFilePath : Source language sentences file." << endl; - cout - << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "interInFilePath : Output file name of this program which is the input for apertium interchunk." - << endl; - return -1; - } - - ifstream lextorFile(lextorFilePath.c_str()); - ifstream inSentenceFile(sentenceFilePath.c_str()); - if (lextorFile.is_open() && inSentenceFile.is_open()) { - - xmlDoc* doc = xmlReadFile(transferFilePath.c_str(), NULL, 0); - - if (doc == NULL) { - cerr << "Error: Could not parse file \'" << transferFilePath - << "\'." 
<< endl; - exit(EXIT_FAILURE); - } - - xmlNode* transfer = xmlDocGetRootElement(doc); - - vector sourceSentences, tokenizedSentences; - - string tokenizedSentence; - while (getline(lextorFile, tokenizedSentence)) { - string sourceSentence; - if (!getline(inSentenceFile, sourceSentence)) - sourceSentence = "No more sentences"; - - sourceSentences.push_back(sourceSentence); - tokenizedSentences.push_back(tokenizedSentence); - } - lextorFile.close(); - inSentenceFile.close(); - - map > > attrs = RuleParser::getAttrs( - transfer); - map vars = RuleParser::getVars(transfer); - map > lists = RuleParser::getLists(transfer); - - ofstream interInFile(interInFilePath.c_str()); - if (interInFile.is_open()) - for (unsigned i = 0; i < sourceSentences.size(); i++) { - cout << i << endl; - - string sourceSentence, tokenizedSentence; - sourceSentence = sourceSentences[i]; - tokenizedSentence = tokenizedSentences[i]; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, - &tlTags, &spaces, tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, - transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); - // final outs - vector outs; - // number of possible combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // rules combinations - vector > combNodes; - - nodesPool = getNodesPool(tokenRules); - - for (map >::iterator it = - ruleOutputs.begin(); it != ruleOutputs.end(); it++) { - cout << "ruleId=" << it->first << endl; - map outs = it->second; - - for (map::iterator it2 = outs.begin(); - it2 != outs.end(); it2++) { - cout << "tokId=" << it2->first << " , out = " - << it2->second << endl; - } - cout << endl; - } - cout << endl; - - for (unsigned j = 0; j < tlTokens.size(); j++) { - vector nodes = nodesPool[j]; - cout << "tokId = " << j << " : " << tlTokens[j] << endl; - for (unsigned k = 0; k < nodes.size(); k++) { - cout << "ruleId = " << nodes[k].ruleId << "; patNum = " - << nodes[k].patNum << endl; - } - cout << endl; - } - - getAmbigInfo(tokenRules, nodesPool, &ambigInfo, &compNum); - getOuts(&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, - spaces); - - for (unsigned j = 0; j < combNodes.size(); j++) { - vector nodes = combNodes[j]; - for (unsigned k = 0; k < nodes.size(); k++) { - cout << "tok=" << nodes[k].tokenId << "; rul=" - << nodes[k].ruleId << "; pat=" - << nodes[k].patNum << " - "; - } - cout << endl; - } - - // write the outs - for (unsigned j = 0; j < outs.size(); j++) - interInFile << outs[j] << endl; - } - else - cout << "ERROR in opening files!" << endl; - interInFile.close(); - - cout << "RulesApplier finished!"; - } else { - cout << "ERROR in opening files!" 
<< endl; - } - - return 0; -} +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "ambiguous_transfer.h" +//#include "RuleParser.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//using namespace std; +//using namespace elem; +// +//int main(int argc, char **argv) { +// string localeId, transferFilePath, sentenceFilePath, lextorFilePath, +// interInFilePath; +// +// if (argc == 6) { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// sentenceFilePath = argv[3]; +// lextorFilePath = argv[4]; +// interInFilePath = argv[5]; +// } else { +//// localeId = "es_ES"; +//// transferFilePath = "transferFile.t1x"; +//// sentenceFilePath = "spa-test.txt"; +//// lextorFilePath = "spa-test.lextor"; +//// interInFilePath = "inter2.txt"; +// +//// localeId = "kk_KZ"; +//// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +//// sentenceFilePath = "sample-sentences.txt"; +//// lextorFilePath = "sample-lextor.txt"; +//// interInFilePath = "sample-inter.txt"; +// +// localeId = "es_ES"; +// transferFilePath = "apertium-eng-spa.spa-eng.t1x"; +// sentenceFilePath = "sentences.txt"; +// lextorFilePath = "lextor.txt"; +// interInFilePath = "interIn.txt"; +// +// cout << "Error in parameters !" << endl; +// cout +// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath interInFilePath" +// << endl; +// cout +// << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout +// << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "sentenceFilePath : Source language sentences file." << endl; +// cout +// << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout +// << "interInFilePath : Output file name of this program which is the input for apertium interchunk." +// << endl; +// return -1; +// } +// +// ifstream lextorFile(lextorFilePath.c_str()); +// ifstream inSentenceFile(sentenceFilePath.c_str()); +// if (lextorFile.is_open() && inSentenceFile.is_open()) { +// +// xmlDoc* doc = xmlReadFile(transferFilePath.c_str(), NULL, 0); +// +// if (doc == NULL) { +// cerr << "Error: Could not parse file \'" << transferFilePath +// << "\'." 
<< endl; +// exit(EXIT_FAILURE); +// } +// +// xmlNode* transfer = xmlDocGetRootElement(doc); +// +// vector sourceSentences, tokenizedSentences; +// +// string tokenizedSentence; +// while (getline(lextorFile, tokenizedSentence)) { +// string sourceSentence; +// if (!getline(inSentenceFile, sourceSentence)) +// sourceSentence = "No more sentences"; +// +// sourceSentences.push_back(sourceSentence); +// tokenizedSentences.push_back(tokenizedSentence); +// } +// lextorFile.close(); +// inSentenceFile.close(); +// +// map > > attrs = RuleParser::getAttrs( +// transfer); +// map vars = RuleParser::getVars(transfer); +// map > lists = RuleParser::getLists(transfer); +// +// ofstream interInFile(interInFilePath.c_str()); +// if (interInFile.is_open()) +// for (unsigned i = 0; i < sourceSentences.size(); i++) { +// cout << i << endl; +// +// string sourceSentence, tokenizedSentence; +// sourceSentence = sourceSentences[i]; +// tokenizedSentence = tokenizedSentences[i]; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, +// &tlTags, &spaces, tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, +// transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, +// tlTags, rulesApplied, attrs, lists, &vars, spaces, +// localeId); +// // final outs +// vector outs; +// // number of possible combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // rules combinations +// vector > combNodes; +// +// nodesPool = getNodesPool(tokenRules); +// +// for (map >::iterator it = +// ruleOutputs.begin(); it != ruleOutputs.end(); it++) { +// cout << "ruleId=" << it->first << endl; +// map outs = it->second; +// +// for (map::iterator it2 = outs.begin(); +// it2 != outs.end(); it2++) { +// cout << "tokId=" << it2->first << " , out = " +// << it2->second << endl; +// } +// cout << endl; +// } +// cout << endl; +// +// for (unsigned j = 0; j < tlTokens.size(); j++) { +// vector nodes = nodesPool[j]; +// cout << "tokId = " << j << " : " << tlTokens[j] << endl; +// for (unsigned k = 0; k < nodes.size(); k++) { +// cout << "ruleId = " << nodes[k].ruleId << "; patNum = " +// << nodes[k].patNum << endl; +// } +// cout << endl; +// } +// +// getAmbigInfo(tokenRules, nodesPool, &ambigInfo, &compNum); +// getOuts(&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, +// spaces); +// +// for (unsigned j = 0; j < combNodes.size(); j++) { +// vector nodes = combNodes[j]; +// for (unsigned k = 0; k < nodes.size(); k++) { +// cout << "tok=" << nodes[k].tokenId << "; rul=" +// << nodes[k].ruleId << "; pat=" +// << nodes[k].patNum << " - "; +// } +// cout << endl; +// } +// +// // write the outs +// for (unsigned j = 0; j < outs.size(); j++) +// 
interInFile << outs[j] << endl; +// } +// else +// cout << "ERROR in opening files!" << endl; +// interInFile.close(); +// +// cout << "RulesApplier finished!"; +// } else { +// cout << "ERROR in opening files!" << endl; +// } +// +// return 0; +//} diff --git a/src/TranElemLiterals.h b/src/TranElemLiterals.h index 253857b..f6fd861 100644 --- a/src/TranElemLiterals.h +++ b/src/TranElemLiterals.h @@ -8,80 +8,81 @@ #ifndef SRC_TRANELEMLITERALS_H_ #define SRC_TRANELEMLITERALS_H_ -namespace elem { +namespace elem +{ -const static char* ACTION = "action"; -const static char* LET = "let"; -const static char* OUT = "out"; -const static char* CHOOSE = "choose"; -const static char* CALL_MACRO = "call-macro"; -const static char* WITH_PARAM = "with-param"; -const static char* CLIP = "clip"; -const static char* LIT_TAG = "lit-tag"; -const static char* LIT = "lit"; -const static char* VAR = "var"; -const static char* CONCAT = "concat"; -const static char* CHUNK = "chunk"; -const static char* MLU = "mlu"; -const static char* LU = "lu"; -const static char* B = "b"; -const static char* N = "n"; -const static char* V = "v"; -const static char* aa = "aa"; -const static char* AA = "AA"; -const static char* Aa = "Aa"; -const static char* TAGS = "tags"; -const static char* TAG = "tag"; -const static char* WHEN = "when"; -const static char* OTHERWISE = "otherwise"; -const static char* TEST = "test"; -const static char* OR = "or"; -const static char* AND = "and"; -const static char* NOT = "not"; -const static char* BEGINS_WITH = "beigns-with"; -const static char* ENDS_WITH = "ends-with"; -const static char* CONTAINS_SUBSTRING = "contains-substring"; -const static char* IN = "in"; -const static char* EQUAL = "equal"; -const static char* CASE_LESS = "caseless"; -const static char* RULE = "rule"; -const static char* ID = "id"; -const static char* PATTERN = "pattern"; -const static char* PATTERN_ITEM = "pattern-item"; -const static char* SECTION_RULES = "section-rules"; -const static char* SECTION_DEF_MACROS = "section-def-macros"; -const static char* DEF_MACRO = "def-macro"; -const static char* SECTION_DEF_ATTRS = "section-def-attrs"; -const static char* DEF_ATTR = "def-attr"; -const static char* ATTR_ITEM = "attr-item"; -const static char* SECTION_DEF_CATS = "section-def-cats"; -const static char* DEF_CAT = "def-cat"; -const static char* CAT_ITEM = "cat-item"; -const static char* SECTION_DEF_LISTS = "section-def-lists"; -const static char* DEF_LIST = "def-list"; -const static char* LIST_ITEM = "list-item"; -const static char* SECTION_DEF_VARS = "section-def-vars"; -const static char* DEF_VAR = "def-var"; -const static char* POS = "pos"; -const static char* SIDE = "side"; -const static char* PART = "part"; -const static char* LINK_TO = "link-to"; -const static char* SL = "sl"; -const static char* TL = "tl"; -const static char* WHOLE = "whole"; -const static char* ATTR = "attr"; -const static char* LEMMA = "lemma"; -const static char* LEM = "lem"; -const static char* LEMH = "lemh"; -const static char* LEMQ = "lemq"; -const static char* APPEND = "append"; -const static char* GET_CASE_FROM = "get-case-from"; -const static char* CASE_OF = "case-of"; -const static char* MODIFY_CASE = "modify-case"; -const static char* NAME = "name"; -const static char* NAME_FROM = "namefrom"; -const static char* DATASETS = "datasets"; -const static char* MODELS = "models"; + const static char* ACTION = "action"; + const static char* LET = "let"; + const static char* OUT = "out"; + const static char* CHOOSE = "choose"; + const static 
char* CALL_MACRO = "call-macro"; + const static char* WITH_PARAM = "with-param"; + const static char* CLIP = "clip"; + const static char* LIT_TAG = "lit-tag"; + const static char* LIT = "lit"; + const static char* VAR = "var"; + const static char* CONCAT = "concat"; + const static char* CHUNK = "chunk"; + const static char* MLU = "mlu"; + const static char* LU = "lu"; + const static char* B = "b"; + const static char* N = "n"; + const static char* V = "v"; + const static char* aa = "aa"; + const static char* AA = "AA"; + const static char* Aa = "Aa"; + const static char* TAGS = "tags"; + const static char* TAG = "tag"; + const static char* WHEN = "when"; + const static char* OTHERWISE = "otherwise"; + const static char* TEST = "test"; + const static char* OR = "or"; + const static char* AND = "and"; + const static char* NOT = "not"; + const static char* BEGINS_WITH = "beigns-with"; + const static char* ENDS_WITH = "ends-with"; + const static char* CONTAINS_SUBSTRING = "contains-substring"; + const static char* IN = "in"; + const static char* EQUAL = "equal"; + const static char* CASE_LESS = "caseless"; + const static char* RULE = "rule"; + const static char* ID = "id"; + const static char* PATTERN = "pattern"; + const static char* PATTERN_ITEM = "pattern-item"; + const static char* SECTION_RULES = "section-rules"; + const static char* SECTION_DEF_MACROS = "section-def-macros"; + const static char* DEF_MACRO = "def-macro"; + const static char* SECTION_DEF_ATTRS = "section-def-attrs"; + const static char* DEF_ATTR = "def-attr"; + const static char* ATTR_ITEM = "attr-item"; + const static char* SECTION_DEF_CATS = "section-def-cats"; + const static char* DEF_CAT = "def-cat"; + const static char* CAT_ITEM = "cat-item"; + const static char* SECTION_DEF_LISTS = "section-def-lists"; + const static char* DEF_LIST = "def-list"; + const static char* LIST_ITEM = "list-item"; + const static char* SECTION_DEF_VARS = "section-def-vars"; + const static char* DEF_VAR = "def-var"; + const static char* POS = "pos"; + const static char* SIDE = "side"; + const static char* PART = "part"; + const static char* LINK_TO = "link-to"; + const static char* SL = "sl"; + const static char* TL = "tl"; + const static char* WHOLE = "whole"; + const static char* ATTR = "attr"; + const static char* LEMMA = "lemma"; + const static char* LEM = "lem"; + const static char* LEMH = "lemh"; + const static char* LEMQ = "lemq"; + const static char* APPEND = "append"; + const static char* GET_CASE_FROM = "get-case-from"; + const static char* CASE_OF = "case-of"; + const static char* MODIFY_CASE = "modify-case"; + const static char* NAME = "name"; + const static char* NAME_FROM = "namefrom"; + const static char* DATASETS = "datasets"; + const static char* MODELS = "models"; } #endif /* SRC_TRANELEMLITERALS_H_ */ diff --git a/src/YasmetFormatter.cpp b/src/YasmetFormatter.cpp index e1e1886..ffe0432 100644 --- a/src/YasmetFormatter.cpp +++ b/src/YasmetFormatter.cpp @@ -1,293 +1,293 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "RuleParser.h" -#include "ambiguous_transfer.h" -#include "TranElemLiterals.h" -#include "CLExec.h" - -using namespace std; -using namespace elem; - -int main(int argc, char **argv) { - string sentenceFilePath = "sentences.txt", lextorFilePath = "lextor.txt", - weightOutFilePath = "weights.txt", localeId = "kk_KZ", - transferFilePath = "transferFile.tx1", datasetsPath = "datasets"; - - if (argc == 7) { - 
localeId = argv[1]; - transferFilePath = argv[2]; - sentenceFilePath = argv[3]; - lextorFilePath = argv[4]; - weightOutFilePath = argv[5]; - datasetsPath = argv[6]; - } else { -// localeId = "es_ES"; -// transferFilePath = "transferFile.t1x"; -// sentenceFilePath = "spa-test.txt"; -// lextorFilePath = "spa-test.lextor"; -// transferOutFilePath = "transfer.out"; -// weightOutFilePath = "weights.txt"; -// outputFilePath = "output.out"; -// datasetsPath = "datasetstry2"; - -//./yasmet-formatter $localeId sentences.txt lextor.txt transfer.txt weights.txt $outputFile $datasets; - localeId = "kk_KZ"; - transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; - sentenceFilePath = "sample-sentences.txt"; - lextorFilePath = "sample-lextor.txt"; - weightOutFilePath = "norm-weights.txt"; - datasetsPath = "datasetstry1234"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath weightOutFilePath datasetsPath" - << endl; - cout - << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout - << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "sentenceFilePath : Source language sentences file." << endl; - cout - << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "weightOutFilePath : Language model weights file for the source language sentences." - << endl; - cout - << "datasetsPath : Datasets destination to put in the generated yasmet files." - << endl; - return -1; - } - - ifstream lextorFile(lextorFilePath.c_str()); - ifstream inSentenceFile(sentenceFilePath.c_str()); - if (lextorFile.is_open() && inSentenceFile.is_open()) { - - xmlDoc* doc = xmlReadFile(transferFilePath.c_str(), NULL, 0); - - if (doc == NULL) { - cerr << "Error: Could not parse file \'" << transferFilePath - << "\'." 
<< endl; - exit(EXIT_FAILURE); - } - - xmlNode* transfer = xmlDocGetRootElement(doc); - - vector *sourceSentences = new vector(), - *tokenizedSentences = new vector(); - - string tokenizedSentence; - //unsigned i = 0; - while (getline(lextorFile, tokenizedSentence)) { - string sourceSentence; - if (!getline(inSentenceFile, sourceSentence)) - sourceSentence = "No more sentences"; - - sourceSentences->push_back(sourceSentence); - tokenizedSentences->push_back(tokenizedSentence); - //if (i == 100) - // break; - //i++; - } - lextorFile.close(); - inSentenceFile.close(); - - map > > attrs = RuleParser::getAttrs( - transfer); - map vars = RuleParser::getVars(transfer); - map > lists = RuleParser::getLists(transfer); - -// vector >* vslTokens = new vector > (); -// vector vouts; -// vector >* vambigInfo = new vector< -// vector > (); -// vector > > vcompNodes; - - ifstream weightOutFile(weightOutFilePath.c_str()); - if (weightOutFile.is_open()) - for (unsigned i = 0; i < sourceSentences->size(); i++) { -// cout << i << endl; - - string sourceSentence, tokenizedSentence; - sourceSentence = (*sourceSentences)[i]; - tokenizedSentence = (*tokenizedSentences)[i]; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, - &tlTags, &spaces, tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, - transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); - - // final outs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // rules combinations - vector > combNodes; - - nodesPool = getNodesPool(tokenRules); - - getAmbigInfo(tokenRules, nodesPool, &ambigInfo, &compNum); - - getOuts(&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, - spaces); - - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size(); j++) - if (ambigInfo[j].combinations.size() > 1) - newAmbigInfo.push_back(ambigInfo[j]); - ambigInfo = newAmbigInfo; - - // read weights - string line; - vector weights; - weights.reserve(1000); - for (unsigned j = 0; j < outs.size(); j++) { - getline(weightOutFile, line); - float weight = strtof(line.c_str(), NULL); - weights.push_back(weight); - } - - normaliseWeights(&weights, ambigInfo); - - // Yasmet format preparing - // make a directory if not found - mkdir(datasetsPath.c_str(), - S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - - unsigned weigInd = 0; - for (unsigned i = 0; i < ambigInfo.size(); i++) { - AmbigInfo ambig = ambigInfo[i]; - - // name of the file is the concatenation of rules ids - string rulesNums; - for (unsigned x = 0; x < ambig.combinations.size(); x++) { - // avoid dummy node - for (unsigned y = 1; y < ambig.combinations[x].size(); - y++) { - stringstream ss; -// 
ss->clear (); - ss << ambig.combinations[x][y].ruleId; - rulesNums += ss.str(); - - if (y + 1 < ambig.combinations[x].size()) - rulesNums += "_"; - } - rulesNums += "+"; - } - - // if it's the first time to open , put the number of classes - bool firstTime = true; - if (FILE *file = fopen( - (datasetsPath + string("/") + rulesNums).c_str(), - "r")) { - firstTime = false; - fclose(file); - } - -// stringstream* dataset = new stringstream (); - ofstream dataset( - (datasetsPath + string("/") + rulesNums).c_str(), - ofstream::app); - - if (firstTime) - dataset << ambig.combinations.size() << endl; - - for (unsigned x = 0; x < ambig.combinations.size(); x++) { - - dataset << x << " $ "; - - float weight = weights[x + weigInd]; - - dataset << weight << " #"; - - string features; - for (unsigned v = 0; v < ambig.combinations.size(); - v++) { - stringstream ss; -// ss.clear (); - ss << v; - string label = ss.str(); - - for (unsigned z = ambig.firTokId; - z < ambig.firTokId + ambig.maxPat; z++) { - stringstream ss; -// ss->clear (); - ss << z - ambig.firTokId; - string num = ss.str(); -// *num = ss->str (); - string word = CLExec::toLowerCase(slTokens[z], - localeId); - - for (unsigned c = 0; c < word.length(); c++) - if (word[c] == ' ') - word.replace(c, 1, "_"); - - features += " " + word + "_" + num + ":" - + label; - } - features += " #"; - } - dataset << features << endl; -// delete (features); - } - weigInd += ambig.combinations.size(); -// dataset.close (); - } -// } - } - weightOutFile.close(); - - } else { - cout << "ERROR in opening files!" << endl; - } - - return 0; -} +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +// +//#include "RuleParser.h" +//#include "ambiguous_transfer.h" +//#include "TranElemLiterals.h" +//#include "CLExec.h" +// +//using namespace std; +//using namespace elem; +// +//int main(int argc, char **argv) { +// string sentenceFilePath = "sentences.txt", lextorFilePath = "lextor.txt", +// weightOutFilePath = "weights.txt", localeId = "kk_KZ", +// transferFilePath = "transferFile.tx1", datasetsPath = "datasets"; +// +// if (argc == 7) { +// localeId = argv[1]; +// transferFilePath = argv[2]; +// sentenceFilePath = argv[3]; +// lextorFilePath = argv[4]; +// weightOutFilePath = argv[5]; +// datasetsPath = argv[6]; +// } else { +//// localeId = "es_ES"; +//// transferFilePath = "transferFile.t1x"; +//// sentenceFilePath = "spa-test.txt"; +//// lextorFilePath = "spa-test.lextor"; +//// transferOutFilePath = "transfer.out"; +//// weightOutFilePath = "weights.txt"; +//// outputFilePath = "output.out"; +//// datasetsPath = "datasetstry2"; +// +////./yasmet-formatter $localeId sentences.txt lextor.txt transfer.txt weights.txt $outputFile $datasets; +// localeId = "kk_KZ"; +// transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; +// sentenceFilePath = "sample-sentences.txt"; +// lextorFilePath = "sample-lextor.txt"; +// weightOutFilePath = "norm-weights.txt"; +// datasetsPath = "datasetstry1234"; +// +// cout << "Error in parameters !" << endl; +// cout +// << "Parameters are : localeId transferFilePath sentenceFilePath lextorFilePath weightOutFilePath datasetsPath" +// << endl; +// cout +// << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" +// << endl; +// cout +// << "transferFilePath : Apertium transfer file of the language pair used." +// << endl; +// cout << "sentenceFilePath : Source language sentences file." 
<< endl; +// cout +// << "lextorFilePath : Apertium lextor file for the source language sentences." +// << endl; +// cout +// << "weightOutFilePath : Language model weights file for the source language sentences." +// << endl; +// cout +// << "datasetsPath : Datasets destination to put in the generated yasmet files." +// << endl; +// return -1; +// } +// +// ifstream lextorFile(lextorFilePath.c_str()); +// ifstream inSentenceFile(sentenceFilePath.c_str()); +// if (lextorFile.is_open() && inSentenceFile.is_open()) { +// +// xmlDoc* doc = xmlReadFile(transferFilePath.c_str(), NULL, 0); +// +// if (doc == NULL) { +// cerr << "Error: Could not parse file \'" << transferFilePath +// << "\'." << endl; +// exit(EXIT_FAILURE); +// } +// +// xmlNode* transfer = xmlDocGetRootElement(doc); +// +// vector *sourceSentences = new vector(), +// *tokenizedSentences = new vector(); +// +// string tokenizedSentence; +// //unsigned i = 0; +// while (getline(lextorFile, tokenizedSentence)) { +// string sourceSentence; +// if (!getline(inSentenceFile, sourceSentence)) +// sourceSentence = "No more sentences"; +// +// sourceSentences->push_back(sourceSentence); +// tokenizedSentences->push_back(tokenizedSentence); +// //if (i == 100) +// // break; +// //i++; +// } +// lextorFile.close(); +// inSentenceFile.close(); +// +// map > > attrs = RuleParser::getAttrs( +// transfer); +// map vars = RuleParser::getVars(transfer); +// map > lists = RuleParser::getLists(transfer); +// +//// vector >* vslTokens = new vector > (); +//// vector vouts; +//// vector >* vambigInfo = new vector< +//// vector > (); +//// vector > > vcompNodes; +// +// ifstream weightOutFile(weightOutFilePath.c_str()); +// if (weightOutFile.is_open()) +// for (unsigned i = 0; i < sourceSentences->size(); i++) { +//// cout << i << endl; +// +// string sourceSentence, tokenizedSentence; +// sourceSentence = (*sourceSentences)[i]; +// tokenizedSentence = (*tokenizedSentences)[i]; +// +// // spaces after each token +// vector spaces; +// +// // tokens in the sentence order +// vector slTokens, tlTokens; +// +// // tags of tokens in order +// vector > slTags, tlTags; +// +// RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, +// &tlTags, &spaces, tokenizedSentence); +// +// // map of tokens ids and their matched categories +// map > catsApplied; +// +// RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); +// +// // map of matched rules and a pair of first token id and patterns number +// map > > rulesApplied; +// +// RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, +// transfer); +// +// // rule and (target) token map to specific output +// // if rule has many patterns we will choose the first token only +// map > ruleOutputs; +// +// // map (target) token to all matched rules ids and the number of pattern items of each rule +// map > > tokenRules; +// +// ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, +// tlTags, rulesApplied, attrs, lists, &vars, spaces, +// localeId); +// +// // final outs +// vector outs; +// // number of generated combinations +// unsigned compNum; +// // nodes for every token and rule +// map > nodesPool; +// // ambiguous informations +// vector ambigInfo; +// // rules combinations +// vector > combNodes; +// +// nodesPool = getNodesPool(tokenRules); +// +// getAmbigInfo(tokenRules, nodesPool, &ambigInfo, &compNum); +// +// getOuts(&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, +// spaces); +// +// vector newAmbigInfo; +// for (unsigned j = 0; j < ambigInfo.size(); j++) 
+// if (ambigInfo[j].combinations.size() > 1) +// newAmbigInfo.push_back(ambigInfo[j]); +// ambigInfo = newAmbigInfo; +// +// // read weights +// string line; +// vector weights; +// weights.reserve(1000); +// for (unsigned j = 0; j < outs.size(); j++) { +// getline(weightOutFile, line); +// float weight = strtof(line.c_str(), NULL); +// weights.push_back(weight); +// } +// +// normaliseWeights(&weights, ambigInfo); +// +// // Yasmet format preparing +// // make a directory if not found +// mkdir(datasetsPath.c_str(), +// S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); +// +// unsigned weigInd = 0; +// for (unsigned i = 0; i < ambigInfo.size(); i++) { +// AmbigInfo ambig = ambigInfo[i]; +// +// // name of the file is the concatenation of rules ids +// string rulesNums; +// for (unsigned x = 0; x < ambig.combinations.size(); x++) { +// // avoid dummy node +// for (unsigned y = 1; y < ambig.combinations[x].size(); +// y++) { +// stringstream ss; +//// ss->clear (); +// ss << ambig.combinations[x][y].ruleId; +// rulesNums += ss.str(); +// +// if (y + 1 < ambig.combinations[x].size()) +// rulesNums += "_"; +// } +// rulesNums += "+"; +// } +// +// // if it's the first time to open , put the number of classes +// bool firstTime = true; +// if (FILE *file = fopen( +// (datasetsPath + string("/") + rulesNums).c_str(), +// "r")) { +// firstTime = false; +// fclose(file); +// } +// +//// stringstream* dataset = new stringstream (); +// ofstream dataset( +// (datasetsPath + string("/") + rulesNums).c_str(), +// ofstream::app); +// +// if (firstTime) +// dataset << ambig.combinations.size() << endl; +// +// for (unsigned x = 0; x < ambig.combinations.size(); x++) { +// +// dataset << x << " $ "; +// +// float weight = weights[x + weigInd]; +// +// dataset << weight << " #"; +// +// string features; +// for (unsigned v = 0; v < ambig.combinations.size(); +// v++) { +// stringstream ss; +//// ss.clear (); +// ss << v; +// string label = ss.str(); +// +// for (unsigned z = ambig.firTokId; +// z < ambig.firTokId + ambig.maxPat; z++) { +// stringstream ss; +//// ss->clear (); +// ss << z - ambig.firTokId; +// string num = ss.str(); +//// *num = ss->str (); +// string word = CLExec::toLowerCase(slTokens[z], +// localeId); +// +// for (unsigned c = 0; c < word.length(); c++) +// if (word[c] == ' ') +// word.replace(c, 1, "_"); +// +// features += " " + word + "_" + num + ":" +// + label; +// } +// features += " #"; +// } +// dataset << features << endl; +//// delete (features); +// } +// weigInd += ambig.combinations.size(); +//// dataset.close (); +// } +//// } +// } +// weightOutFile.close(); +// +// } else { +// cout << "ERROR in opening files!" 
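The yasmet dataset writing above is the densest part of YasmetFormatter, so a compact sketch of the file format may help: the first line of a dataset file holds the number of classes (competing rule combinations), and each following line is "classIndex $ weight #" plus, per class label, one feature "token_position:label" for every source token in the pattern window. This sketch assumes the tokens are already lowercased with spaces replaced by underscores, as the original does before writing; writeYasmetEntry is an illustrative name.

#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

using namespace std;

static void writeYasmetEntry(const string& path, unsigned numClasses,
                             const vector<float>& weights,      // one per class
                             const vector<string>& patternTokens)
{
    // only the very first write of the file starts with the class count
    bool firstTime = true;
    if (FILE* f = fopen(path.c_str(), "r")) {
        firstTime = false;
        fclose(f);
    }

    ofstream dataset(path.c_str(), ofstream::app);
    if (firstTime)
        dataset << numClasses << endl;

    for (unsigned x = 0; x < numClasses; x++) {
        dataset << x << " $ " << weights[x] << " #";
        for (unsigned v = 0; v < numClasses; v++) {     // one block per label
            for (unsigned z = 0; z < patternTokens.size(); z++)
                dataset << " " << patternTokens[z] << "_" << z << ":" << v;
            dataset << " #";
        }
        dataset << endl;
    }
}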
<< endl; +// } +// +// return 0; +//} diff --git a/src/ambiguous_transfer.cpp b/src/ambiguous_chunker.cpp similarity index 78% rename from src/ambiguous_transfer.cpp rename to src/ambiguous_chunker.cpp index 305e14e..ec3d415 100644 --- a/src/ambiguous_transfer.cpp +++ b/src/ambiguous_chunker.cpp @@ -5,83 +5,387 @@ * Author: aboelhamd */ -#include "ambiguous_transfer.h" -#include "CLExec.h" +#include "ambiguous_chunker.h" + +#include "case_handler.h" using namespace std; using namespace elem; -ambiguous_transfer::Node -ambiguousGraph (map > > tokenRules, - map > nodesPool, - unsigned firTok, unsigned maxPat) +void +AmbiguousChunker::lexFormsTokenizer (vector* slTokens, vector* tlTokens, + vector >* slTags, + vector >* tlTags, + vector* spaces, string tokenizedSentenceStr) { - for (unsigned i = firTok; i < firTok + maxPat; i++) + vector taggedTokens; + // from string to char* + char tokenizedSentence[tokenizedSentenceStr.size ()]; + strcpy (tokenizedSentence, tokenizedSentenceStr.c_str ()); + + char * taggedToken; + taggedToken = strtok (tokenizedSentence, "^"); + while (taggedToken != NULL) { - vector nodes = nodesPool[i]; - for (unsigned j = 0; j < nodes.size (); j++) + taggedTokens.push_back (taggedToken); + taggedToken = strtok (NULL, "^"); + } + + size_t taggedTokensSize = taggedTokens.size (); + for (unsigned i = 0; i < taggedTokensSize; i++) + { + // take spaces after token + size_t dolSignInd = taggedTokens[i].find ("$"); + spaces->push_back (taggedTokens[i].substr (dolSignInd + 1)); + taggedTokens[i] = taggedTokens[i].substr (0, dolSignInd); + + // remove multiple translations and take only the first one + size_t firSlashInd = taggedTokens[i].find ("/"); + + // if no translation , remove that word + if (firSlashInd + 1 == taggedTokens[i].size ()) { - ambiguous_transfer::Node node = nodes[j]; - // last nodes will point to nothing - if (node.tokenId + node.patNum < firTok + maxPat) - node.neighbors = nodesPool[node.tokenId + node.patNum]; + taggedTokens.erase (taggedTokens.begin () + i); + spaces->erase (spaces->begin () + i); + taggedTokensSize--; + i--; + continue; + } - nodes[j] = node; + size_t secSlashInd = taggedTokens[i].find ("/", firSlashInd + 1); + if (secSlashInd != string::npos) + taggedTokens[i] = taggedTokens[i].substr (0, secSlashInd); + + // split source and target tokens + string target = taggedTokens[i].substr (firSlashInd + 1); + + taggedTokens.push_back (target); + + taggedTokens[i] = taggedTokens[i].substr (0, firSlashInd); + } + + for (unsigned i = 0; i < taggedTokens.size (); i++) + { + char taggedToken[taggedTokens[i].size ()]; + strcpy (taggedToken, taggedTokens[i].c_str ()); + char* split; + + string token; + vector tokTags; + + if (taggedToken[0] != '<') + { + split = strtok (taggedToken, "<>"); + token = split; + split = strtok (NULL, "<>"); + } + else + { + split = strtok (taggedToken, "<>"); + } + + while (split != NULL) + { + string tag = split; + tokTags.push_back (tag); + + split = strtok (NULL, "<>"); + } + + if (i < taggedTokens.size () / 2) + { + slTokens->push_back (token); + slTags->push_back (tokTags); + } + else + { + tlTokens->push_back (token); + tlTags->push_back (tokTags); + } + } +} + +void +AmbiguousChunker::matchCats (map >* catsApplied, + vector slTokens, vector > tags, + xmlNode* transfer) +{ + xmlNode* section_def_cats = xml_operations::getChild (transfer, SECTION_DEF_CATS); + + for (xmlNode* def_cat = xml_operations::getFirstChild (section_def_cats); def_cat; + def_cat = xml_operations::getFirstNext (def_cat)) + { + + for (xmlNode* 
cat_item = xml_operations::getFirstChild (def_cat); cat_item; + cat_item = xml_operations::getFirstNext (cat_item)) + { + + // separate tags from (t1.t2) format, for easy access + string tagsString = xml_operations::getAttVal (cat_item, TAGS); + + char tagDotted[tagsString.size ()]; + strcpy (tagDotted, tagsString.c_str ()); + char* split; + split = strtok (tagDotted, "."); + + vector itemTags; + + while (split != NULL) + { + string tag = split; + itemTags.push_back (tag); + + split = strtok (NULL, "."); + } + + for (unsigned x = 0; x < slTokens.size (); x++) + { + // if cat-item have lemma + if (!xml_operations::getAttVal (cat_item, LEMMA).empty ()) + { + if (xml_operations::getAttVal (cat_item, LEMMA) != slTokens[x]) + { + continue; + } + } + + vector tokTags = tags[x]; + + unsigned i = 0, j = 0; + for (; i < tokTags.size () && j < itemTags.size (); i++) + { + if (itemTags[j] == "*") + { + if (j + 1 < itemTags.size () && i + 1 < tokTags.size () + && itemTags[j + 1] == tokTags[i + 1]) + { + j += 2; + i++; + } + } + else if (itemTags[j] == tokTags[i]) + { + j++; + } + else + { + break; + } + } + + if (i == tokTags.size () + && (j == itemTags.size () + || (j + 1 == itemTags.size () && itemTags[j] == "*" + && itemTags[j - 1] != tokTags[i - 1]))) + { + string s = xml_operations::getAttVal (def_cat, N); + (*catsApplied)[x].push_back (s); + } + } } - nodesPool[i] = nodes; } - // root(dummy) node points to the first token node/s - ambiguous_transfer::Node root = ambiguous_transfer::Node (-1, -1, -1); - root.neighbors = nodesPool[firTok]; - return root; } -ambiguous_transfer::Node -ambiguousGraph (map > > tokenRules, - map > nodesPool) +void +AmbiguousChunker::matchRules ( + map > >* rulesApplied, + vector slTokens, map > catsApplied, + xmlNode* transfer) { - for (unsigned i = 0; i < nodesPool.size (); i++) + + xmlNode* section_rules = xml_operations::getChild (transfer, SECTION_RULES); + + vector tokensApplied; + + for (xmlNode* rule = xml_operations::getFirstChild (section_rules); rule; rule = + xml_operations::getFirstNext (rule)) { - vector nodes = nodesPool[i]; - for (unsigned j = 0; j < nodes.size (); j++) + + xmlNode* pattern = xml_operations::getChild (rule, PATTERN); + + // Put pattern items in vector for ease in processing + vector pattern_items; + for (xmlNode* pattern_item = xml_operations::getFirstChild (pattern); pattern_item; + pattern_item = xml_operations::getFirstNext (pattern_item)) { - ambiguous_transfer::Node node = nodes[j]; - // last nodes will point to not existent nodes - if (nodesPool.count (node.tokenId + node.patNum)) - node.neighbors = nodesPool[node.tokenId + node.patNum]; - nodes[j] = node; + pattern_items.push_back (pattern_item); } - nodesPool[i] = nodes; + + for (unsigned i = 0; + (slTokens.size () >= pattern_items.size ()) + && i <= slTokens.size () - pattern_items.size (); i++) + { + + vector slMatchedTokens; + for (unsigned j = 0; j < pattern_items.size (); j++) + { + + // match cat-item with pattern-item + string slToken = slTokens[i + j]; + vector cats = catsApplied[i + j]; + + for (unsigned k = 0; k < cats.size (); k++) + { + // if cat name equals pattern item name + if (xml_operations::getAttVal (pattern_items[j], N) == cats[k]) + { + slMatchedTokens.push_back (i + j); + break; + } + } + } + // if matched tokens' size = pattern items' size + // then this rule is matched + if (slMatchedTokens.size () == pattern_items.size ()) + { + if (slMatchedTokens.size () == 1) + tokensApplied.insert (tokensApplied.end (), slMatchedTokens.begin (), + slMatchedTokens.end 
()); + (*rulesApplied)[rule].push_back ( + pair (slMatchedTokens[0], slMatchedTokens.size ())); + } + + } + } - // root(dummy) node points to the first token node/s - ambiguous_transfer::Node root = ambiguous_transfer::Node (-1, -1, -1); - root.neighbors = nodesPool[0]; - return root; + // set a default rule for tokens without rules applied + vector > tokensNotApp; + for (unsigned i = 0; i < slTokens.size (); i++) + { + bool found = false; + for (unsigned j = 0; j < tokensApplied.size (); j++) + { + if (i == tokensApplied[j]) + { + found = true; + break; + } + } + if (!found) + { + tokensNotApp.push_back (pair (i, 1)); + } + } + + xmlNode* defaultRule = xmlNewNode (NULL, (xmlChar*) "rule"); + + (*rulesApplied)[defaultRule] = tokensNotApp; +} + +// to sort attribute tags descendingly +bool +sortParameter1 (vector a, vector b) +{ + return (a.size () > b.size ()); +} + +map > > +AmbiguousChunker::getAttrs (xmlNode* transfer) +{ + map > > attrs; + xmlNode* section_def_attrs = xml_operations::getChild (transfer, SECTION_DEF_ATTRS); + + for (xmlNode* def_attr = xml_operations::getFirstChild (section_def_attrs); def_attr; + def_attr = xml_operations::getFirstNext (def_attr)) + { + + vector > allTags; + for (xmlNode* attr_item = xml_operations::getFirstChild (def_attr); attr_item; + attr_item = xml_operations::getFirstNext (attr_item)) + { + + // splitting tags by '.' + string tagsString = xml_operations::getAttVal (attr_item, TAGS); + char tagsChars[tagsString.size ()]; + strcpy (tagsChars, tagsString.c_str ()); + + vector tags; + + char * tag; + tag = strtok (tagsChars, "."); + while (tag != NULL) + { + tags.push_back (tag); + tag = strtok (NULL, "."); + } + + allTags.push_back (tags); + } + // sort the tags , descendingly by their size + sort (allTags.begin (), allTags.end (), sortParameter1); + + attrs[xml_operations::getAttVal (def_attr, N)] = allTags; + } + + return attrs; +} + +map +AmbiguousChunker::getVars (xmlNode* transfer) +{ + map vars; + + xmlNode* section_def_vars = xml_operations::getChild (transfer, SECTION_DEF_VARS); + if (section_def_vars) + for (xmlNode* def_var = xml_operations::getFirstChild (section_def_vars); def_var; + def_var = xml_operations::getFirstNext (def_var)) + { + + vars[xml_operations::getAttVal (def_var, N)] = xml_operations::getAttVal (def_var, + V); + } + + return vars; +} + +map > +AmbiguousChunker::getLists (xmlNode* transfer) +{ + map > lists; + + xmlNode* section_def_lists = xml_operations::getChild (transfer, SECTION_DEF_LISTS); + if (section_def_lists) + for (xmlNode* def_list = xml_operations::getFirstChild (section_def_lists); def_list; + def_list = xml_operations::getFirstNext (def_list)) + { + + vector list; + for (xmlNode* list_item = xml_operations::getFirstChild (def_list); list_item; + list_item = xml_operations::getFirstNext (list_item)) + { + + list.push_back (xml_operations::getAttVal (list_item, V)); + } + lists[xml_operations::getAttVal (def_list, N)] = list; + } + + return lists; } void -ambiguous_transfer::getOuts ( - vector* finalOuts, vector >* finalCombNodes, - vector, float> > beamTree, - map > nodesPool, - map > ruleOutputs, vector spaces) +AmbiguousChunker::getOuts (vector* finalOuts, + vector >* finalCombNodes, + vector, float> > beamTree, + map > nodesPool, + map > ruleOutputs, + vector spaces) { for (unsigned i = 0; i < beamTree.size (); i++) { - map bestNodes; + map bestNodes; for (unsigned j = 0; j < beamTree[i].first.size (); j++) { bestNodes[beamTree[i].first[j].tokenId] = beamTree[i].first[j]; } - vector nodes; + vector 
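matchRules tries every rule at every start position: a pattern of P items matches at token i when each of the P consecutive tokens carries the category named by the corresponding pattern-item. A minimal sketch of that sliding window with plain strings in place of XML nodes follows; matchPattern is a hypothetical name, and it returns the same (first token id, pattern length) pairs the original records in rulesApplied.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

using namespace std;

typedef vector<string> Cats;   // categories applied to one token

static vector<pair<unsigned, unsigned> >
matchPattern(const vector<string>& pattern, const vector<Cats>& catsApplied)
{
    vector<pair<unsigned, unsigned> > matches;
    if (catsApplied.size() < pattern.size())
        return matches;

    for (unsigned i = 0; i + pattern.size() <= catsApplied.size(); i++) {
        unsigned matched = 0;
        for (unsigned j = 0; j < pattern.size(); j++) {
            const Cats& cats = catsApplied[i + j];
            for (unsigned k = 0; k < cats.size(); k++)
                if (cats[k] == pattern[j]) {
                    matched++;
                    break;
                }
        }
        // the rule applies here only if every pattern item found its category
        if (matched == pattern.size())
            matches.push_back(make_pair(i, (unsigned) pattern.size()));
    }
    return matches;
}

int main()
{
    vector<Cats> cats(3);
    cats[0].push_back("det");
    cats[1].push_back("adj");
    cats[2].push_back("nom");

    vector<string> pattern;
    pattern.push_back("adj");
    pattern.push_back("nom");

    vector<pair<unsigned, unsigned> > m = matchPattern(pattern, cats);
    cout << m.size() << " match at token " << m[0].first << endl;  // 1 match at token 1
}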
nodes; string out; for (unsigned j = 0; j < nodesPool.size ();) { - ambiguous_transfer::Node node; + AmbiguousChunker::Node node; if (bestNodes.count (j)) node = bestNodes[j]; else @@ -101,35 +405,35 @@ ambiguous_transfer::getOuts ( } void -ambiguous_transfer::getOuts (vector* finalOuts, - vector >* finalCombNodes, - vector ambigInfo, - map > nodesPool, - map > ruleOutputs, - vector spaces) +AmbiguousChunker::getOuts (vector* finalOuts, + vector >* finalCombNodes, + vector ambigInfo, + map > nodesPool, + map > ruleOutputs, + vector spaces) { - map ambigMap; + map ambigMap; for (unsigned i = 0; i < ambigInfo.size (); i++) { ambigMap.insert ( - pair (ambigInfo[i].firTokId, - ambigInfo[i])); + pair (ambigInfo[i].firTokId, + ambigInfo[i])); } for (unsigned i = 0; (i < ambigInfo.size ()) || (i < 1); i++) { - vector > combNodes; - combNodes.push_back (vector ()); + vector > combNodes; + combNodes.push_back (vector ()); vector outs; outs.push_back (""); for (unsigned j = 0; j < nodesPool.size ();) { - vector nodes = nodesPool[j]; + vector nodes = nodesPool[j]; if (nodes.size () > 1 && ambigMap.count (j)) { - vector > combinations = + vector > combinations = ambigMap[j].combinations; if (ambigInfo[i].firTokId == j) @@ -158,8 +462,8 @@ ambiguous_transfer::getOuts (vector* finalOuts, { putCombination ( &combNodes, - vector (combinations[0].begin () + 1, - combinations[0].end ())); + vector (combinations[0].begin () + 1, + combinations[0].end ())); // take the first combination only , while solving the last space issue string ambigOut; // skip the dummy node @@ -206,28 +510,79 @@ ambiguous_transfer::getOuts (vector* finalOuts, } } +AmbiguousChunker::Node +ambiguousGraph (map > > tokenRules, + map > nodesPool, unsigned firTok, + unsigned maxPat) +{ + for (unsigned i = firTok; i < firTok + maxPat; i++) + { + vector nodes = nodesPool[i]; + for (unsigned j = 0; j < nodes.size (); j++) + { + AmbiguousChunker::Node node = nodes[j]; + // last nodes will point to nothing + if (node.tokenId + node.patNum < firTok + maxPat) + node.neighbors = nodesPool[node.tokenId + node.patNum]; + + nodes[j] = node; + } + nodesPool[i] = nodes; + } + + // root(dummy) node points to the first token node/s + AmbiguousChunker::Node root = AmbiguousChunker::Node (-1, -1, -1); + root.neighbors = nodesPool[firTok]; + return root; +} + +AmbiguousChunker::Node +ambiguousGraph (map > > tokenRules, + map > nodesPool) +{ + for (unsigned i = 0; i < nodesPool.size (); i++) + { + vector nodes = nodesPool[i]; + for (unsigned j = 0; j < nodes.size (); j++) + { + AmbiguousChunker::Node node = nodes[j]; + // last nodes will point to not existent nodes + if (nodesPool.count (node.tokenId + node.patNum)) + node.neighbors = nodesPool[node.tokenId + node.patNum]; + + nodes[j] = node; + } + nodesPool[i] = nodes; + } + + // root(dummy) node points to the first token node/s + AmbiguousChunker::Node root = AmbiguousChunker::Node (-1, -1, -1); + root.neighbors = nodesPool[0]; + return root; +} + void -putCombination (vector >* combinations, - vector combination) +putCombination (vector >* combinations, + vector combination) { for (unsigned i = 0; i < combinations->size (); i++) (*combinations)[i].insert ((*combinations)[i].end (), combination.begin (), combination.end ()); } -vector > -putCombinations (vector > combinations, - vector > nestedcombinations) +vector > +putCombinations (vector > combinations, + vector > nestedcombinations) { - vector > newcombinations; + vector > newcombinations; for (unsigned i = 0; i < combinations.size (); i++) { for 
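The two ambiguousGraph overloads above build the structure that everything else consumes: each node is (tokenId, ruleId, patNum), a node whose rule consumes patNum tokens points to all nodes at tokenId + patNum, and a dummy root points to the first token's nodes. A DFS over that graph then enumerates every rule combination covering the window, which is what getCombinations does. The sketch below folds both steps into one recursive walk over the nodes pool; it is a simplification under that reading, not the repo's exact code.

#include <iostream>
#include <map>
#include <vector>

using namespace std;

struct Node {
    int tokenId, ruleId, patNum;
    Node(int t, int r, int p) : tokenId(t), ruleId(r), patNum(p) {}
};

// Enumerate all rule combinations that cover tokens [tok, end).
static void dfs(unsigned tok, const map<unsigned, vector<Node> >& pool,
                unsigned end, vector<Node> path, vector<vector<Node> >* combs)
{
    if (tok >= end) {               // the window is fully covered
        combs->push_back(path);
        return;
    }
    map<unsigned, vector<Node> >::const_iterator it = pool.find(tok);
    if (it == pool.end())
        return;                     // gap: no rule starts here
    const vector<Node>& nodes = it->second;
    for (unsigned i = 0; i < nodes.size(); i++) {
        path.push_back(nodes[i]);
        dfs(tok + nodes[i].patNum, pool, end, path, combs);
        path.pop_back();
    }
}

int main()
{
    map<unsigned, vector<Node> > pool;
    pool[0].push_back(Node(0, 1, 2));   // a two-token rule at token 0
    pool[0].push_back(Node(0, 0, 1));   // the default one-token rule
    pool[1].push_back(Node(1, 0, 1));
    pool[2].push_back(Node(2, 0, 1));

    vector<vector<Node> > combs;
    dfs(0, pool, 3, vector<Node>(), &combs);
    cout << combs.size() << " combinations" << endl;   // 2
}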
(unsigned j = 0; j < nestedcombinations.size (); j++) { - vector newcombination = vector< - ambiguous_transfer::Node> (combinations[i]); + vector newcombination = vector ( + combinations[i]); // +1 to skip dummy node newcombination.insert (newcombination.end (), nestedcombinations[j].begin () + 1, @@ -274,8 +629,8 @@ putOuts (vector outputs, vector nestedOutputs) } void -getCombinations (ambiguous_transfer::Node root, vector path, - vector >* ambigRules) +getCombinations (AmbiguousChunker::Node root, vector path, + vector >* ambigRules) { path.push_back (root); @@ -295,8 +650,8 @@ getCombinations (ambiguous_transfer::Node root, vector } void -ambiguous_transfer::normaliseWeights (vector* weights, - vector ambigInfo) +AmbiguousChunker::normaliseWeights (vector* weights, + vector ambigInfo) { unsigned weigInd = 0; @@ -342,10 +697,10 @@ getMaxPat (int curMaxPat, unsigned curToken, } void -ambiguous_transfer::getAmbigInfo ( +AmbiguousChunker::getAmbigInfo ( map > > tokenRules, - map > nodesPool, - vector* ambigInfo, unsigned* combNum) + map > nodesPool, + vector* ambigInfo, unsigned* combNum) { *combNum = 0; for (unsigned tokId = 0; tokId < tokenRules.size ();) @@ -357,13 +712,11 @@ ambiguous_transfer::getAmbigInfo ( // if there is ambiguity if (nodesPool[tokId].size () > 1) { - ambiguous_transfer::AmbigInfo ambig = ambiguous_transfer::AmbigInfo (tokId, - maxPat); + AmbiguousChunker::AmbigInfo ambig = AmbiguousChunker::AmbigInfo (tokId, maxPat); - ambiguous_transfer::Node dummy = ambiguousGraph (tokenRules, nodesPool, tokId, - maxPat); - getCombinations (dummy, vector (), - &ambig.combinations); + AmbiguousChunker::Node dummy = ambiguousGraph (tokenRules, nodesPool, tokId, + maxPat); + getCombinations (dummy, vector (), &ambig.combinations); if (!ambig.combinations.empty ()) ambigInfo->push_back (ambig); @@ -374,11 +727,11 @@ ambiguous_transfer::getAmbigInfo ( } } -map > -ambiguous_transfer::getNodesPool ( +map > +AmbiguousChunker::getNodesPool ( map > > tokenRules) { - map > nodesPool; + map > nodesPool; for (map > >::iterator it = tokenRules.begin (); it != tokenRules.end (); it++) { @@ -388,21 +741,13 @@ ambiguous_transfer::getNodesPool ( { unsigned ruleId = rules[i].first; unsigned patNum = rules[i].second; - ambiguous_transfer::Node node = ambiguous_transfer::Node (tokenId, ruleId, - patNum); + AmbiguousChunker::Node node = AmbiguousChunker::Node (tokenId, ruleId, patNum); nodesPool[tokenId].push_back (node); } } return nodesPool; } -// to sort rules in tokenRules descendingly by their number of pattern items -bool -sortParameter (pair a, pair b) -{ - return (a.second > b.second); -} - string noRuleOut (vector analysis) { @@ -468,31 +813,37 @@ nestedRules (vector tlTokens, string output, } } +// to sort rules in tokenRules descendingly by their number of pattern items +bool +sortParameter2 (pair a, pair b) +{ + return (a.second > b.second); +} + void pushDistinct (map > >* tokenRules, unsigned tlTokInd, xmlNode* rule, unsigned patNum) { vector > pairs = (*tokenRules)[tlTokInd]; -// cout << "here001" << endl; -// cout << "here002" << endl; + for (unsigned i = 0; i < pairs.size (); i++) { if (pairs[i].first == xml_operations::xml_operations::getAttValUnsg (rule, ID)) return; } -// xml_operations::xml_operations::getAttValUnsg (rule, ID); -// cout << "here003" << endl; + (*tokenRules)[tlTokInd].push_back ( pair (xml_operations::xml_operations::getAttValUnsg (rule, ID), patNum)); -// cout << "here004" << endl; - sort ((*tokenRules)[tlTokInd].begin (), (*tokenRules)[tlTokInd].end (), 
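normaliseWeights above rescales the scores of each ambiguity independently. Assuming, as the surrounding code suggests, that the weights vector stores every ambiguity's combination scores back to back, the operation reduces to per-segment normalization so each segment sums to 1; this sketch takes explicit segment sizes instead of the AmbigInfo vector the real function receives.

#include <iostream>
#include <vector>

using namespace std;

static void normaliseSegments(vector<float>* weights,
                              const vector<unsigned>& segmentSizes)
{
    unsigned weigInd = 0;
    for (unsigned i = 0; i < segmentSizes.size(); i++) {
        float sum = 0;
        for (unsigned j = 0; j < segmentSizes[i]; j++)
            sum += (*weights)[weigInd + j];
        if (sum != 0)                       // leave all-zero segments alone
            for (unsigned j = 0; j < segmentSizes[i]; j++)
                (*weights)[weigInd + j] /= sum;
        weigInd += segmentSizes[i];
    }
}

int main()
{
    vector<float> w;
    w.push_back(1); w.push_back(3);     // first ambiguity: 2 combinations
    w.push_back(2); w.push_back(2);     // second ambiguity: 2 combinations
    vector<unsigned> sizes;
    sizes.push_back(2); sizes.push_back(2);

    normaliseSegments(&w, sizes);
    cout << w[0] << " " << w[1] << endl;   // 0.25 0.75
}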
sortParameter); + + sort ((*tokenRules)[tlTokInd].begin (), (*tokenRules)[tlTokInd].end (), sortParameter2); } +// for debugging void printNodeAttrs (xmlNode* node) { -// cout << node.name() << endl; +// cout << node->name << endl; // for (xmlNode*::attribute_iterator it = node.attributes_begin (); // it != node.attributes_end(); // it++ @@ -512,7 +863,6 @@ ruleOuts (map >* ruleOuts, map > > attrs, map > lists, map* vars, vector spaces, string localeId) { - //cout << "Inside " << "ruleOuts" << endl; for (map > >::iterator it = rulesApplied.begin (); it != rulesApplied.end (); ++it) @@ -562,7 +912,7 @@ ruleOuts (map >* ruleOuts, } void -ambiguous_transfer::ruleOuts ( +AmbiguousChunker::ruleOuts ( map >* ruleOuts, map > >* tokenRules, vector slTokens, vector > slTags, vector tlTokens, @@ -571,7 +921,6 @@ ambiguous_transfer::ruleOuts ( map > > attrs, map > lists, map* vars, vector spaces, string localeId) { -// cout << "Inside " << "ruleOuts" << endl; for (map > >::iterator it = rulesApplied.begin (); it != rulesApplied.end (); ++it) @@ -582,7 +931,6 @@ ambiguous_transfer::ruleOuts ( vector > slAnalysisTokens, tlAnalysisTokens; // format tokens and their tags into analysisTokens -// cout << "here01" << endl; unsigned firstMatTok = rulesApplied[rule][i].first; unsigned patNum = rulesApplied[rule][i].second; @@ -599,21 +947,10 @@ ambiguous_transfer::ruleOuts ( tlAnalysisTokens.push_back (tlAnalysisToken); } -// cout << "here02" << endl; // insert the rule (if not found) then sort the vector pushDistinct (tokenRules, firstMatTok, rule, patNum); -// cout << "here03" << endl; - vector output; - cout << "before" << endl; - for (unsigned j = 0; j < tlAnalysisTokens.size (); j++) - { - for (unsigned k = 0; k < tlAnalysisTokens[j].size (); k++) - { - cout << tlAnalysisTokens[j][k] << " "; - } - cout << endl; - } + vector output; if (xml_operations::getAttValUnsg (rule, ID) == 0) output.push_back (noRuleOut (tlAnalysisTokens[0])); @@ -621,17 +958,6 @@ ambiguous_transfer::ruleOuts ( output = ruleExe (rule, &slAnalysisTokens, &tlAnalysisTokens, attrs, lists, vars, spaces, firstMatTok, localeId); // first pattern index - cout << "after" << endl; - for (unsigned j = 0; j < tlAnalysisTokens.size (); j++) - { - for (unsigned k = 0; k < tlAnalysisTokens[j].size (); k++) - { - cout << tlAnalysisTokens[j][k] << " "; - } - cout << endl; - } - - // cout << "here04" << endl; string str; for (unsigned j = 0; j < output.size (); j++) str += output[j]; @@ -1071,15 +1397,13 @@ equal (xmlNode* equal, vector >* slAnalysisTokens, secondStr += secondResult[i]; } -// cout << "firstStr=" << firstStr << " , secondStr=" << secondStr << endl; - if (xml_operations::getAttVal (equal, CASE_LESS) == "yes") { - return !(CLExec::compareCaseless (firstStr, secondStr, localeId)); + return !(CaseHandler::compareCaseless (firstStr, secondStr)); } else { - return !(CLExec::compare (firstStr, secondStr)); + return !(CaseHandler::compare (firstStr, secondStr)); } } @@ -1090,7 +1414,6 @@ choose (xmlNode* chooseNode, vector >* slAnalysisTokens, map* vars, vector spaces, unsigned firPat, string localeId, map paramToPattern) { - //cout << "Inside " << "choose" << endl; printNodeAttrs (chooseNode); vector output; @@ -1114,7 +1437,6 @@ choose (xmlNode* chooseNode, vector >* slAnalysisTokens, condition = true; } -// cout << "condition=" << condition << endl; if (condition) { for (xmlNode* inst = xml_operations::getFirstChild (child); inst; inst = @@ -1367,7 +1689,7 @@ in (xmlNode* inNode, vector >* slAnalysisTokens, { for (unsigned i = 0; i < 
list.size (); i++) { - if (!CLExec::compareCaseless (firstStr, list[i], localeId)) + if (!CaseHandler::compareCaseless (firstStr, list[i])) return true; } } @@ -1375,7 +1697,7 @@ in (xmlNode* inNode, vector >* slAnalysisTokens, { for (unsigned i = 0; i < list.size (); i++) { - if (!CLExec::compare (firstStr, list[i])) + if (!CaseHandler::compare (firstStr, list[i])) return true; } } @@ -1467,7 +1789,7 @@ var (xmlNode* var, map* vars) string varName = xml_operations::getAttVal (var, N); string varValue = (*vars)[varName]; -// cout << "varname=" << varName << " , value=" << (*vars)[varName] << endl; + return varValue; } @@ -1534,9 +1856,7 @@ let (xmlNode* let, vector >* slAnalysisTokens, resultStr += secondResult[i]; string varName = xml_operations::getAttVal (firstChild, N); -// cout << "varname=" << varName << " , value=" << resultStr << endl; (*vars)[varName] = resultStr; -// cout << "varname=" << varName << " , value=" << (*vars)[varName] << endl; } else if (firstName == CLIP) { @@ -1623,11 +1943,6 @@ clip (xmlNode* clip, vector >* slAnalysisTokens, if (side == TL) analysisToken = (*tlAnalysisTokens)[pos]; -// cout << "analysisToken = "; -// for (unsigned i = 0; i < analysisToken.size (); i++) -// cout << analysisToken[i] << " "; -// cout << endl; - if (part == WHOLE) { result = analysisToken; @@ -1859,9 +2174,9 @@ caseOf (xmlNode* caseOf, vector >* slAnalysisTokens, else token = (*tlAnalysisTokens)[pos][0]; - if (token == CLExec::toLowerCase (token, localeId)) + if (token == CaseHandler::toLowerCase (token)) Case = aa; - else if (token == CLExec::toUpperCase (token, localeId)) + else if (token == CaseHandler::toUpperCase (token)) Case = AA; else Case = Aa; @@ -1911,12 +2226,12 @@ getCaseFrom (xmlNode* getCaseFrom, vector >* slAnalysisTokens, string slToken = (*slAnalysisTokens)[pos][0]; - if (slToken == CLExec::toLowerCase (slToken, localeId)) - result = CLExec::toLowerCase (result, localeId); - else if (slToken == CLExec::toUpperCase (slToken, localeId)) - result = CLExec::toUpperCase (result, localeId); + if (slToken == CaseHandler::toLowerCase (slToken)) + result = CaseHandler::toLowerCase (result); + else if (slToken == CaseHandler::toUpperCase (slToken)) + result = CaseHandler::toUpperCase (result); else - result = CLExec::FirLetUpperCase (result, localeId); + result = CaseHandler::FirLetUpperCase (result); return result; } @@ -1952,11 +2267,11 @@ modifyCase (xmlNode* modifyCase, vector >* slAnalysisTokens, string varName = xml_operations::getAttVal (firstChild, N); if (Case == aa) - (*vars)[varName] = CLExec::toLowerCase ((*vars)[varName], localeId); + (*vars)[varName] = CaseHandler::toLowerCase ((*vars)[varName]); else if (Case == AA) - (*vars)[varName] = CLExec::toUpperCase ((*vars)[varName], localeId); + (*vars)[varName] = CaseHandler::toUpperCase ((*vars)[varName]); else if (Case == Aa) - (*vars)[varName] = CLExec::FirLetUpperCase ((*vars)[varName], localeId); + (*vars)[varName] = CaseHandler::FirLetUpperCase ((*vars)[varName]); } else if (childName == CLIP) @@ -1979,14 +2294,14 @@ modifyCase (xmlNode* modifyCase, vector >* slAnalysisTokens, if (part == LEM) { if (Case == aa) - (*analysisTokens)[pos][0] = CLExec::toLowerCase ((*analysisTokens)[pos][0], - localeId); + (*analysisTokens)[pos][0] = CaseHandler::toLowerCase ( + (*analysisTokens)[pos][0]); else if (Case == AA) - (*analysisTokens)[pos][0] = CLExec::toUpperCase ((*analysisTokens)[pos][0], - localeId); + (*analysisTokens)[pos][0] = CaseHandler::toUpperCase ( + (*analysisTokens)[pos][0]); else if (Case == Aa) - 
(*analysisTokens)[pos][0] = CLExec::FirLetUpperCase ( - (*analysisTokens)[pos][0], localeId); + (*analysisTokens)[pos][0] = CaseHandler::FirLetUpperCase ( + (*analysisTokens)[pos][0]); } else if (part == LEMH || part == LEMQ) { @@ -1996,11 +2311,11 @@ modifyCase (xmlNode* modifyCase, vector >* slAnalysisTokens, if (spaceInd == string::npos) { if (Case == aa) - lem = CLExec::toLowerCase (lem, localeId); + lem = CaseHandler::toLowerCase (lem); else if (Case == AA) - lem = CLExec::toUpperCase (lem, localeId); + lem = CaseHandler::toUpperCase (lem); else if (Case == Aa) - lem = CLExec::FirLetUpperCase (lem, localeId); + lem = CaseHandler::FirLetUpperCase (lem); } else { @@ -2010,20 +2325,20 @@ modifyCase (xmlNode* modifyCase, vector >* slAnalysisTokens, if (part == LEMH) { if (Case == aa) - lemh = CLExec::toLowerCase (lemh, localeId); + lemh = CaseHandler::toLowerCase (lemh); else if (Case == AA) - lemh = CLExec::toUpperCase (lemh, localeId); + lemh = CaseHandler::toUpperCase (lemh); else if (Case == Aa) - lemh = CLExec::FirLetUpperCase (lemh, localeId); + lemh = CaseHandler::FirLetUpperCase (lemh); } else { if (Case == aa) - lemq = CLExec::toLowerCase (lemq, localeId); + lemq = CaseHandler::toLowerCase (lemq); else if (Case == AA) - lemq = CLExec::toUpperCase (lemq, localeId); + lemq = CaseHandler::toUpperCase (lemq); else if (Case == Aa) - lemq = CLExec::FirLetUpperCase (lemq, localeId); + lemq = CaseHandler::FirLetUpperCase (lemq); } lem = lemh + lemq; diff --git a/src/ambiguous_chunker.h b/src/ambiguous_chunker.h new file mode 100644 index 0000000..7ef3231 --- /dev/null +++ b/src/ambiguous_chunker.h @@ -0,0 +1,306 @@ +/* + * ambiguous_transfer.h + * + * Created on: May 5, 2018 + * Author: aboelhamd + */ + +#ifndef SRC_AMBIGUOUS_CHUNKER_H_ +#define SRC_AMBIGUOUS_CHUNKER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "TranElemLiterals.h" +#include "xml_operations.h" + +using namespace std; + +class AmbiguousChunker +{ +public: + class Node + { + public: + unsigned tokenId; + unsigned ruleId; + unsigned patNum; + vector neighbors; + Node (unsigned tokenId, unsigned ruleId, unsigned patNum) + { + this->tokenId = tokenId; + this->ruleId = ruleId; + this->patNum = patNum; + } + Node () + { + this->tokenId = 0; + this->ruleId = 0; + this->patNum = 0; + } + }; + + class AmbigInfo + { + public: + unsigned firTokId; + unsigned maxPat; + vector > combinations; + AmbigInfo (unsigned firTokId, unsigned maxPat) + { + this->firTokId = firTokId; + this->maxPat = maxPat; + } + AmbigInfo () + { + this->firTokId = 0; + this->maxPat = 0; + } + }; + + static void + lexFormsTokenizer (vector* slTokens, vector* tlTokens, + vector >* slTags, vector >* tlTags, + vector* spaces, string tokenizedSentenceStr); + + static void + matchCats (map >* catsApplied, vector slTokens, + vector > tags, xmlNode* transfer); + + static void + matchRules (map > >* rulesApplied, + vector slTokens, map > catsApplied, + xmlNode* transfer); + + static map > > + getAttrs (xmlNode* transfer); + + static map + getVars (xmlNode* transfer); + + static map > + getLists (xmlNode* transfer); + + static void + normaliseWeights (vector* weights, vector ambigInfo); + + static map > + getNodesPool (map > > tokenRules); + + static void + getAmbigInfo (map > > tokenRules, + map > nodesPool, vector* ambigInfo, + unsigned* combNum); + + static void + ruleOuts (map >* ruleOuts, + map > >* tokenRules, + vector slTokens, vector > slTags, + vector tlTokens, vector > tlTags, + map > > rulesApplied, + map > > attrs, + 
map > lists, map* vars, + vector spaces, string localeId); + + static void + getOuts (vector* finalOuts, + vector >* finalCombNodes, + vector, float> > beamTree, + map > nodesPool, + map > ruleOutputs, vector spaces); + + static void + getOuts (vector* finalOuts, vector >* combNodes, + vector ambigInfo, + map > nodesPool, + map > ruleOutputs, vector spaces); +}; + +string +noRuleOut (vector analysis); + +void +putOut (vector* outputs, string output, unsigned tokenIndex, + vector spaces); + +vector +putOuts (vector outputs, vector nestedOutputs); + +void +putCombination (vector >* combinations, + vector combination); + +vector > +putCombinations (vector > combinations, + vector > nestedcombinations); + +AmbiguousChunker::Node +ambiguousGraph (map > > tokenRules, + map > nodesPool, unsigned firTok, + unsigned maxPat); + +AmbiguousChunker::Node +ambiguousGraph (map > > tokenRules, + map > nodesPool); + +bool +outputs ( + vector* outs, + vector > >* rulesIds, + vector > >* outsRules, + vector, pair > > > > *ambigInfo, + vector tlTokens, vector > tags, + map > ruleOuts, + map > > tokenRules, vector spaces); + +vector +ruleExe (xmlNode* rule, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map > lists, + map* vars, vector spaces, unsigned firPat, + string localeId); + +vector +choose (xmlNode* choose, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map > lists, + map* vars, vector spaces, unsigned firPat, + string localeId, map paramToPattern); + +void +let (xmlNode* let, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map* vars, + vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +vector +callMacro (xmlNode* callMacro, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, + map > lists, map* vars, + vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +vector +out (xmlNode* out, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map* vars, + vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +vector +chunk (xmlNode* chunkNode, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map* vars, + vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +vector +formatTokenTags (string token, vector tags); + +vector +findAttrPart (vector tokenTags, vector > attrTags); + +vector +clip (xmlNode* clip, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map* vars, + vector spaces, unsigned firPat, string localeId, + map paramToPattern, + vector > tags = vector > ()); + +vector +concat (xmlNode* concat, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map* vars, + vector spaces, unsigned firPat, string localeId, + map paramToPattern, + vector > tags = vector > ()); + +bool +equal (xmlNode* equal, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map* vars, + vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +bool +test (xmlNode* test, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map > lists, + map* vars, vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +bool +And (xmlNode* And, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map > lists, + map* vars, vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +bool +Or (xmlNode* Or, vector >* slAnalysisTokens, + vector >* 
tlAnalysisTokens, + map > > attrs, map > lists, + map* vars, vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +bool +in (xmlNode* in, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map > lists, + map* vars, vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +bool +Not (xmlNode* Not, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map > lists, + map* vars, vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +vector +litTag (xmlNode* litTag); + +string +lit (xmlNode* lit); + +string +var (xmlNode* var, map* vars); + +void +append (xmlNode* append, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map* vars, + vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +string +b (xmlNode* b, vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +string +caseOf (xmlNode* caseOf, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, string localeId, + map paramToPattern); + +string +getCaseFrom (xmlNode* getCaseFrom, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map* vars, + vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +void +modifyCase (xmlNode* modifyCase, vector >* slAnalysisTokens, + vector >* tlAnalysisTokens, + map > > attrs, map* vars, + vector spaces, unsigned firPat, string localeId, + map paramToPattern); + +#endif /* SRC_AMBIGUOUS_CHUNKER_H_ */ diff --git a/src/ambiguous_tansfer.cpp b/src/ambiguous_tansfer.cpp new file mode 100644 index 0000000..ede7ae8 --- /dev/null +++ b/src/ambiguous_tansfer.cpp @@ -0,0 +1,349 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "TranElemLiterals.h" +#include + +#include "ambiguous_chunker.h" +#include "ambiguous_transfer.h" +#include "case_handler.h" + +using namespace std; +using namespace elem; + +void +AmbiguousTransfer::transfer (string transferFilePath, string modelsFileDest, string k, + FILE* lextorFile, FILE* outFile) +{ + + xmlDoc* doc = xmlReadFile (transferFilePath.c_str (), NULL, 0); + + if (doc == NULL) + { + cerr << "Error: Could not parse file \'" << transferFilePath << "\'." 
<< endl; + exit (EXIT_FAILURE); + } + + xmlNode* transfer = xmlDocGetRootElement (doc); + + map > > attrs = AmbiguousChunker::getAttrs (transfer); + map vars = AmbiguousChunker::getVars (transfer); + map > lists = AmbiguousChunker::getLists (transfer); + + string localeId; + map > > classesWeights = loadYasmetModels ( + modelsFileDest, &localeId); + + int beam; + stringstream buffer (k); + buffer >> beam; + + char buff[10240]; + string tokenizedSentence; + while (fgets (buff, 10240, lextorFile)) + { + tokenizedSentence = buff; + + // spaces after each token + vector spaces; + + // tokens in the sentence order + vector slTokens, tlTokens; + + // tags of tokens in order + vector > slTags, tlTags; + + AmbiguousChunker::lexFormsTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, + &spaces, tokenizedSentence); + + // map of tokens ids and their matched categories + map > catsApplied; + + AmbiguousChunker::matchCats (&catsApplied, slTokens, slTags, transfer); + + // map of matched rules and a pair of first token id and patterns number + map > > rulesApplied; + + AmbiguousChunker::matchRules (&rulesApplied, slTokens, catsApplied, transfer); + + // rule and (target) token map to specific output + // if rule has many patterns we will choose the first token only + map > ruleOutputs; + + // map (target) token to all matched rules ids and the number of pattern items of each rule + map > > tokenRules; + + AmbiguousChunker::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, + tlTags, rulesApplied, attrs, lists, &vars, spaces, + localeId); + + // final outputs + vector outs; + // number of generated combinations + unsigned compNum; + // nodes for every token and rule + map > nodesPool; + // ambiguous information + vector ambigInfo; + // beam tree + vector, float> > beamTree; + // rules combinations + vector > combNodes; + + nodesPool = AmbiguousChunker::getNodesPool (tokenRules); + + AmbiguousChunker::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); + + vector newAmbigInfo; + for (unsigned j = 0; j < ambigInfo.size (); j++) + if (ambigInfo[j].combinations.size () > 1) + newAmbigInfo.push_back (ambigInfo[j]); + + beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, localeId); + + AmbiguousChunker::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, + spaces); + + // write the outs + for (unsigned j = 0; j < outs.size (); j++) + { + fputs (outs[j].c_str (), outFile); + } + } + +} + +// to sort translations from best to worst by their weight +bool +sortParameter (pair, float> a, + pair, float> b) +{ + return (a.second > b.second); +} + +void +beamSearch (vector, float> > *beamTree, unsigned beam, + vector slTokens, vector ambigInfo, + map > > classesWeights, string localeId) +{ + // Initialization + (*beamTree).push_back (pair, float> ()); + + for (unsigned i = 0; i < ambigInfo.size (); i++) + { + + AmbiguousChunker::AmbigInfo ambig = ambigInfo[i]; + + unsigned ambigRulesSize = ambig.combinations.size (); + + // name of the file is the concatenation of rules ids + string rulesNums; + for (unsigned x = 0; x < ambigRulesSize; x++) + { + // avoid dummy node + for (unsigned y = 1; y < ambig.combinations[x].size (); y++) + { + stringstream ss; + ss << ambig.combinations[x][y].ruleId; + rulesNums += ss.str (); + + if (y + 1 < ambig.combinations[x].size ()) + rulesNums += "_"; + } + rulesNums += "+"; + } + + map > classWeights = classesWeights[(rulesNums + ".model")]; + + // build new tree for the new words + vector, float> > newTree; + + // initialize the new tree
+ for (unsigned x = 0; x < ambigRulesSize; x++) + { + newTree.push_back ( + pair, float> ( + vector (), 0)); + } + // put rules + for (unsigned z = 0; z < ambigRulesSize; z++) + { + for (unsigned y = 0; y < ambig.combinations[z].size (); y++) + { + newTree[z].first.push_back (ambig.combinations[z][y]); + } + } + + for (unsigned x = ambig.firTokId; x < ambig.firTokId + ambig.maxPat; x++) + { + // word key is the word and its order in the rule + stringstream ss; + ss << x - ambig.firTokId; + string num = "_" + ss.str (); + + // handle the case of two lemmas separated by a space + for (unsigned t = 0; t < slTokens[x].size (); t++) + if (slTokens[x][t] == ' ') + slTokens[x].replace (t, 1, "_"); + + string word = CaseHandler::toLowerCase (slTokens[x]) + num; + vector wordWeights = classWeights[word]; + + // put weights + if (wordWeights.empty ()) + { + for (unsigned z = 0; z < ambigRulesSize; z++) + newTree[z].second += 1; + cout << "word : " << word << " is not found in dataset : " << rulesNums + << endl; + } + + else + for (unsigned z = 0; z < ambigRulesSize; z++) + newTree[z].second += wordWeights[z]; + + } + + // expand beamTree + unsigned initSize = beamTree->size (); + for (unsigned z = 0; z < ambigRulesSize - 1; z++) + { + for (unsigned x = 0; x < initSize; x++) + { + beamTree->push_back ( + pair, float> ((*beamTree)[x])); + } + } + + // merge the two trees + for (unsigned z = 0; z < ambigRulesSize; z++) + { + for (unsigned x = initSize * z; x < initSize * (z + 1); x++) + { + // put the new rules with the old + (*beamTree)[x].first.insert ((*beamTree)[x].first.end (), + newTree[z].first.begin (), + newTree[z].first.end ()); + + // add their weights + (*beamTree)[x].second += newTree[z].second; + } + } + + // sort beam tree + sort (beamTree->begin (), beamTree->end (), sortParameter); + + // remove elements beyond the beam size + if (beamTree->size () > beam) + beamTree->erase (beamTree->begin () + beam, beamTree->end ()); + } + + // keep only the best sentence + beamTree->erase (beamTree->begin () + 1, beamTree->end ()); +} + +map > > +loadYasmetModels (string modelsFilePath, string *localeid) +{ + // map with key yasmet model name and the value is + // another map with key word name and the value is + // vector of weights in order + map > > classWeights; + + ifstream modelsFile ((modelsFilePath).c_str ()); + + if (modelsFile.is_open ()) + { + string line, model, token, weight; + + // localeid + getline (modelsFile, line); + *localeid = line; + + while (getline (modelsFile, line)) + { + // 0=>word , 1=>rule_num & 2=>weight + // we don't need rule number , because + // the weights are already sorted + + char lineChar[line.size () + 1]; // +1 for the NUL that strcpy writes + strcpy (lineChar, line.c_str ()); + + token = strtok (lineChar, ": "); + if (token == "file") + { + model = strtok (NULL, ": "); + continue; + } + // skip rule_num + strtok (NULL, ": "); + + weight = strtok (NULL, ": "); + + float w = strtof (weight.c_str (), NULL); + + classWeights[model][token].push_back (w); + } + } + else + { + cout << "error in opening models file" << endl; + } + + return classWeights; +} + +FILE * +open_input (string const &filename) +{ + FILE *input = fopen (filename.c_str (), "r"); + if (!input) + { + wcerr << "Error: can't open input file '"; + wcerr << filename.c_str () << "'." 
<< endl; + exit (EXIT_FAILURE); + } + + return input; +} + +FILE * +open_output (string const &filename) +{ + FILE *output = fopen (filename.c_str (), "w"); + if (!output) + { + wcerr << "Error: can't open output file '"; + wcerr << filename.c_str () << "'." << endl; + exit (EXIT_FAILURE); + } + return output; +} + +int +main (int argc, char **argv) +{ + string transferFilePath = + "/home/aboelhamd/eclipse-workspace/machinetranslation/apertium-eng-spa.spa-eng.t1x"; + string modelsFileDest = "nomodel"; + string k = "8"; + string lextor = "lex.txt"; + string out = "out.txt"; + + FILE *input = stdin, *output = stdout; + input = open_input (lextor); + output = open_output (out); + AmbiguousTransfer::transfer (transferFilePath, modelsFileDest, k, input, output); +} diff --git a/src/ambiguous_transfer.h b/src/ambiguous_transfer.h index 3e5b7e2..dd6f8db 100644 --- a/src/ambiguous_transfer.h +++ b/src/ambiguous_transfer.h @@ -1,287 +1,31 @@ /* - * ambiguous_transfer.h + * BeamSearch.h * - * Created on: May 5, 2018 + * Created on: Mar 10, 2019 * Author: aboelhamd */ #ifndef SRC_AMBIGUOUS_TRANSFER_H_ #define SRC_AMBIGUOUS_TRANSFER_H_ -#include -#include -#include #include -#include -#include -#include -#include "TranElemLiterals.h" -#include "xml_operations.h" using namespace std; -class ambiguous_transfer { +class AmbiguousTransfer +{ public: - class Node { - public: - unsigned tokenId; - unsigned ruleId; - unsigned patNum; - vector neighbors; - Node(unsigned tokenId, unsigned ruleId, unsigned patNum) { - this->tokenId = tokenId; - this->ruleId = ruleId; - this->patNum = patNum; - } - Node() { - this->tokenId = 0; - this->ruleId = 0; - this->patNum = 0; - } - }; - - class AmbigInfo { - public: - unsigned firTokId; - unsigned maxPat; - vector > combinations; - AmbigInfo(unsigned firTokId, unsigned maxPat) { - this->firTokId = firTokId; - this->maxPat = maxPat; - } - AmbigInfo() { - this->firTokId = 0; - this->maxPat = 0; - } - }; - - static void - normaliseWeights(vector* weights, vector ambigInfo); - - static map > - getNodesPool(map > > tokenRules); - - static void - getAmbigInfo(map > > tokenRules, - map > nodesPool, - vector* ambigInfo, unsigned* combNum); - - static void - ruleOuts(map >* ruleOuts, - map > >* tokenRules, - vector slTokens, vector > slTags, - vector tlTokens, vector > tlTags, - map > > rulesApplied, - map > > attrs, - map > lists, map* vars, - vector spaces, string localeId); - - static void - getOuts(vector* finalOuts, - vector >* finalCombNodes, - vector, float> > beamTree, - map > nodesPool, - map > ruleOutputs, - vector spaces); - - static void - getOuts(vector* finalOuts, - vector >* combNodes, - vector ambigInfo, - map > nodesPool, - map > ruleOutputs, - vector spaces); + static void + transfer (string transferFilePath, string modelsFileDest, string k, + FILE* lextorFileFile, FILE* outFile); }; -string -noRuleOut(vector analysis); - -void -putOut(vector* outputs, string output, unsigned tokenIndex, - vector spaces); - -vector -putOuts(vector outputs, vector nestedOutputs); - void -putCombination(vector >* combinations, - vector combination); - -vector > -putCombinations(vector > combinations, - vector > nestedcombinations); - -ambiguous_transfer::Node -ambiguousGraph(map > > tokenRules, - map > nodesPool, - unsigned firTok, unsigned maxPat); - -ambiguous_transfer::Node -ambiguousGraph(map > > tokenRules, - map > nodesPool); - -bool -outputs(vector* outs, - vector > >* rulesIds, - vector > >* outsRules, - vector< - pair, - pair > > > > *ambigInfo, - vector tlTokens, vector 
> tags, - map > ruleOuts, - map > > tokenRules, - vector spaces); - -vector -ruleExe(xmlNode* rule, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, - map > lists, map* vars, - vector spaces, unsigned firPat, string localeId); - -vector -choose(xmlNode* choose, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, - map > lists, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -void -let(xmlNode* let, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -vector -callMacro(xmlNode* callMacro, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, - map > lists, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -vector -out(xmlNode* out, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); +beamSearch (vector, float> > *beamTree, unsigned beam, + vector slTokens, vector ambigInfo, + map > > classesWeights, string localeId); -vector -chunk(xmlNode* chunkNode, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -vector -formatTokenTags(string token, vector tags); - -vector -findAttrPart(vector tokenTags, vector > attrTags); - -vector -clip(xmlNode* clip, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern, vector > tags = - vector >()); - -vector -concat(xmlNode* concat, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern, vector > tags = - vector >()); - -bool -equal(xmlNode* equal, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -bool -test(xmlNode* test, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, - map > lists, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -bool -And(xmlNode* And, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, - map > lists, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -bool -Or(xmlNode* Or, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, - map > lists, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -bool -in(xmlNode* in, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, - map > lists, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -bool -Not(xmlNode* Not, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, - map > lists, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -vector -litTag(xmlNode* litTag); - -string -lit(xmlNode* lit); - -string -var(xmlNode* var, map* vars); - -void -append(xmlNode* append, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -string -b(xmlNode* b, vector spaces, unsigned firPat, string localeId, - map paramToPattern); - 
-string -caseOf(xmlNode* caseOf, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, string localeId, - map paramToPattern); - -string -getCaseFrom(xmlNode* getCaseFrom, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); - -void -modifyCase(xmlNode* modifyCase, vector >* slAnalysisTokens, - vector >* tlAnalysisTokens, - map > > attrs, map* vars, - vector spaces, unsigned firPat, string localeId, - map paramToPattern); +map > > +loadYasmetModels (string modelsDest, string *localeid); #endif /* SRC_AMBIGUOUS_TRANSFER_H_ */ diff --git a/src/case_handler.cpp b/src/case_handler.cpp new file mode 100644 index 0000000..73a9a7a --- /dev/null +++ b/src/case_handler.cpp @@ -0,0 +1,85 @@ +/* + * CLExec.cpp + * + * Created on: Jun 21, 2018 + * Author: aboelhamd + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "case_handler.h" + +using namespace std; + +string +CaseHandler::toLowerCase (string s) +{ + wstring ws = UtfConverter::fromUtf8 (s); + ws = StringUtils::tolower (ws); + s = UtfConverter::toUtf8 (ws); + + return s; +} + +string +CaseHandler::toUpperCase (string s) +{ + wstring ws = UtfConverter::fromUtf8 (s); + ws = StringUtils::toupper (ws); + s = UtfConverter::toUtf8 (ws); + + return s; +} + +string +CaseHandler::FirLetUpperCase (string s) +{ + wstring ws = UtfConverter::fromUtf8 (s); + + ws[0] = (wchar_t) towupper (ws[0]); + + s = UtfConverter::toUtf8 (ws); + + return s; +} + +// The result of bitwise character comparison: 0 if this contains +// the same characters as text, -1 if the characters in this are +// bitwise less than the characters in text, +1 if the characters +// in this are bitwise greater than the characters in text. 
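+// (Both functions below first decode UTF-8 into wide strings via
+// UtfConverter::fromUtf8, so the comparison is per code point rather than
+// per raw byte. For example, CaseHandler::compare ("abc", "abd") is
+// negative, and CaseHandler::compareCaseless ("ABC", "abc") returns 0,
+// which callers such as equal() and in() treat as a match.)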
+int +CaseHandler::compare (string s1, string s2) +{ + wstring ws1 = UtfConverter::fromUtf8 (s1); + wstring ws2 = UtfConverter::fromUtf8 (s2); + + return ws1.compare (ws2); +} + +int +CaseHandler::compareCaseless (string s1, string s2) +{ + wstring ws1 = UtfConverter::fromUtf8 (s1); + ws1 = StringUtils::tolower (ws1); + wstring ws2 = UtfConverter::fromUtf8 (s2); + ws2 = StringUtils::tolower (ws2); + + return ws1.compare (ws2); +} diff --git a/src/case_handler.h b/src/case_handler.h new file mode 100644 index 0000000..953dfda --- /dev/null +++ b/src/case_handler.h @@ -0,0 +1,39 @@ +/* + * CLExec.h + * + * Created on: Jun 21, 2018 + * Author: aboelhamd + */ + +#ifndef SRC_CASE_HANDLER_H_ +#define SRC_CASE_HANDLER_H_ + +#include +#include + +#include "ambiguous_chunker.h" + +using namespace std; + +class CaseHandler +{ +public: + + static string + toLowerCase (string s); + + static string + toUpperCase (string s); + + static string + FirLetUpperCase (string s); + + static int + compare (string s1, string s2); + + static int + compareCaseless (string s1, string s2); + +}; + +#endif /* SRC_CASE_HANDLER_H_ */ diff --git a/src/xml_operations.h b/src/xml_operations.h index 8caf706..f0109e0 100644 --- a/src/xml_operations.h +++ b/src/xml_operations.h @@ -14,25 +14,34 @@ using namespace std; -class xml_operations { +class xml_operations +{ public: - static xmlNode* getRoot(xmlNode* node); + static xmlNode* + getRoot (xmlNode* node); - static xmlNode* getFirstNext(xmlNode* node); + static xmlNode* + getFirstNext (xmlNode* node); - static xmlNode* getNext(xmlNode* node, string nextName); + static xmlNode* + getNext (xmlNode* node, string nextName); - static xmlNode* getFirstChild(xmlNode* parent); + static xmlNode* + getFirstChild (xmlNode* parent); - static xmlNode* getChild(xmlNode* parent, string childName); + static xmlNode* + getChild (xmlNode* parent, string childName); - static string getAttVal(xmlNode* node, string attrName); + static string + getAttVal (xmlNode* node, string attrName); - static unsigned getAttValUnsg(xmlNode* node, string attrName); + static unsigned + getAttValUnsg (xmlNode* node, string attrName); - static string getName(xmlNode* node); + static string + getName (xmlNode* node); }; #endif /* SRC_XML_OPERATIONS_H_ */
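
For reference, the combination enumeration behind getCombinations() above is a depth-first traversal from the dummy root node: every root-to-leaf path through the neighbor links is one rule combination for the ambiguous span. The following is a minimal, self-contained sketch of that idea with simplified types (Node here is a stand-in for AmbiguousChunker::Node, not the repository's class):

#include <iostream>
#include <vector>

// Simplified stand-in for AmbiguousChunker::Node.
struct Node
{
  unsigned ruleId;
  std::vector<Node> neighbors;
};

// Collect every root-to-leaf path; each path is one rule combination.
static void
getCombinations (const Node& root, std::vector<unsigned> path,
                 std::vector<std::vector<unsigned> >* combinations)
{
  path.push_back (root.ruleId);
  if (root.neighbors.empty ())
    combinations->push_back (path); // leaf reached: one full combination
  else
    for (unsigned i = 0; i < root.neighbors.size (); i++)
      getCombinations (root.neighbors[i], path, combinations);
}

int
main ()
{
  // Dummy root (rule 0) -> {rule 1, rule 2} -> rule 3
  Node leaf = { 3, { } };
  Node a = { 1, { leaf } }, b = { 2, { leaf } };
  Node root = { 0, { a, b } };

  std::vector<std::vector<unsigned> > combs;
  getCombinations (root, std::vector<unsigned> (), &combs);
  for (unsigned i = 0; i < combs.size (); i++) // prints "0 1 3" and "0 2 3"
    {
      for (unsigned j = 0; j < combs[i].size (); j++)
        std::cout << combs[i][j] << ' ';
      std::cout << '\n';
    }
}

The real code then skips the dummy node when emitting each combination (the "+1 to skip dummy node" step above).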
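The pruning step at the end of each beamSearch() iteration reduces to: expand the hypotheses, sort them from best to worst by accumulated weight, and truncate to the beam width. A self-contained sketch of just that step, with simplified types and illustrative scores (not the repository's code):

#include <algorithm>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

typedef std::pair<std::string, float> Hyp; // (rule-id sequence, score)

// Sort hypotheses from best to worst by score.
static bool
betterHyp (const Hyp& a, const Hyp& b)
{
  return a.second > b.second;
}

// Keep only the `beam` highest-scoring hypotheses.
static void
prune (std::vector<Hyp>* tree, unsigned beam)
{
  std::sort (tree->begin (), tree->end (), betterHyp);
  if (tree->size () > beam)
    tree->erase (tree->begin () + beam, tree->end ());
}

int
main ()
{
  std::vector<Hyp> tree;
  tree.push_back (Hyp ("1_2", 0.2f));
  tree.push_back (Hyp ("3_4", 0.9f));
  tree.push_back (Hyp ("1_4", 0.5f));

  prune (&tree, 2); // keeps "3_4" (0.9) and "1_4" (0.5)

  for (unsigned i = 0; i < tree.size (); i++)
    std::cout << tree[i].first << " " << tree[i].second << std::endl;
}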
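loadYasmetModels() implies a models-file layout that can be reconstructed from its parser; the sample below is illustrative only (file name, words, and weights are invented, not taken from the repository). The first line is the ICU locale id; a line whose first field is "file" starts a new model, whose name must exactly match the rulesNums + ".model" key built in beamSearch() (rule ids of one combination joined by "_", combinations joined by "+"); every other line is word_position, a rule number (skipped by the parser), and a weight, separated by ':' or spaces. Weights for each word accumulate in combination order:

kk_KZ
file: 12_13+14+.model
alma_0 0 0.73
alma_0 1 0.27
bala_1 0 0.55
bala_1 1 0.45

Here "12_13+14+.model" scores two competing combinations (rules 12+13 vs. rule 14), so each word key carries two weights; words absent from the model fall into the "is not found in dataset" branch, which adds a flat 1 to every hypothesis.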