commit de5c00c7318bd8705feb6f18d5ccdc99437bb8de Author: aboelhamd Date: Sat May 25 02:59:58 2019 +0200 Modify beamsearch to work with tags features diff --git a/score-sentences.py b/score-sentences.py index 21de0bf..3db521f 100644 --- a/score-sentences.py +++ b/score-sentences.py @@ -12,7 +12,7 @@ weightfile = open(sys.argv[3], 'w+') model = kenlm.LanguageModel(sys.argv[1]) for sentence in targetfile: - weightfile.write('%f\n' % (1.0/model.score(sentence))) + weightfile.write('%f\n' % -(1.0/model.score(sentence))) targetfile.close() weightfile.close() diff --git a/src/BeamSearch.cpp b/src/BeamSearch.cpp index 9ca5738..2f0186c 100644 --- a/src/BeamSearch.cpp +++ b/src/BeamSearch.cpp @@ -26,28 +26,24 @@ using namespace std; using namespace pugi; using namespace elem; -int -main (int argc, char **argv) -{ - string lextorFilePath, interInFilePath, localeId, transferFilePath, modelsDest, k; - - if (argc == 7) - { - localeId = argv[1]; - transferFilePath = argv[2]; - lextorFilePath = argv[3]; - interInFilePath = argv[4]; - modelsDest = argv[5]; - k = argv[6]; - } - else - { - localeId = "es_ES"; - transferFilePath = "apertium-eng-spa.spa-eng.t1x"; - lextorFilePath = "lextor.txt"; - interInFilePath = "beaminter.txt"; - modelsDest = "/home/aboelhamd/Downloads/models"; - k = "8"; +int main(int argc, char **argv) { + string lextorFilePath, interInFilePath, localeId, transferFilePath, + modelsDest, k; + + if (argc == 7) { + localeId = argv[1]; + transferFilePath = argv[2]; + lextorFilePath = argv[3]; + interInFilePath = argv[4]; + modelsDest = argv[5]; + k = argv[6]; + } else { + localeId = "es_ES"; + transferFilePath = "apertium-eng-spa.spa-eng.t1x"; + lextorFilePath = "lextor.txt"; + interInFilePath = "beaminter.txt"; + modelsDest = "/home/aboelhamd/Downloads/models"; + k = "8"; // localeId = "kk_KZ"; // transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; @@ -57,131 +53,133 @@ main (int argc, char **argv) // modelsDest = "./UntitledFolder/models"; // k = "8"; - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath modelsDest beamSize" - << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" - << endl; - cout << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "interInFilePath : Output file of this program which is the input for apertium interchunk." - << endl; - cout << "modelsDest : Yasmet models merged file destination." << endl; - cout << "beamSize : The size of beam in beam search algorithm." << endl; - return -1; - } - - ifstream lextorFile (lextorFilePath.c_str ()); - ofstream interInFile (interInFilePath.c_str ()); - if (lextorFile.is_open () && interInFile.is_open ()) - { - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); - if (string (result.description ()) != "No error") - { - cout << "ERROR : " << result.description () << endl; - return -1; + cout << "Error in parameters !" << endl; + cout + << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath modelsDest beamSize" + << endl; + cout + << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" + << endl; + cout + << "transferFilePath : Apertium transfer file of the language pair used." + << endl; + cout + << "lextorFilePath : Apertium lextor file for the source language sentences." + << endl; + cout + << "interInFilePath : Output file of this program which is the input for apertium interchunk." + << endl; + cout << "modelsDest : Yasmet models merged file destination." << endl; + cout << "beamSize : The size of beam in beam search algorithm." << endl; + return -1; } - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child ("transfer"); - - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); - map > > classesWeights = - CLExec::loadYasmetModels (modelsDest); - - int beam; - stringstream buffer (k); - buffer >> beam; + ifstream lextorFile(lextorFilePath.c_str()); + ofstream interInFile(interInFilePath.c_str()); + if (lextorFile.is_open() && interInFile.is_open()) { + // load transfer file in an xml document object + xml_document transferDoc; + xml_parse_result result = transferDoc.load_file( + transferFilePath.c_str()); + if (string(result.description()) != "No error") { + cout << "ERROR : " << result.description() << endl; + return -1; + } + + // xml node of the parent node (transfer) in the transfer file + xml_node transfer = transferDoc.child("transfer"); + + map > > attrs = RuleParser::getAttrs( + transfer); + map vars = RuleParser::getVars(transfer); + map > lists = RuleParser::getLists(transfer); + map > > classesWeights = + CLExec::loadYasmetModels(modelsDest); + + int beam; + stringstream buffer(k); + buffer >> beam; // unsigned i = 0; - string tokenizedSentence; - while (getline (lextorFile, tokenizedSentence)) - { + string tokenizedSentence; + while (getline(lextorFile, tokenizedSentence)) { // cout << i << endl; - // spaces after each token - vector spaces; +// spaces after each token + vector spaces; - // tokens in the sentence order - vector slTokens, tlTokens; + // tokens in the sentence order + vector slTokens, tlTokens; - // tags of tokens in order - vector > slTags, tlTags; + // tags of tokens in order + vector > slTags, tlTags; - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, - tokenizedSentence); + RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, + &tlTags, &spaces, tokenizedSentence); - // map of tokens ids and their matched categories - map > catsApplied; + // map of tokens ids and their matched categories + map > catsApplied; - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); + RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; + // map of matched rules and a pair of first token id and patterns number + map > > rulesApplied; - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); + RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, + transfer); - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; + // rule and (target) token map to specific output + // if rule has many patterns we will choose the first token only + map > ruleOutputs; - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; + // map (target) token to all matched rules ids and the number of pattern items of each rule + map > > tokenRules; - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); + RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, + tlTokens, tlTags, rulesApplied, attrs, lists, &vars, spaces, + localeId); - // final outputs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // beam tree - vector, float> > beamTree; - // rules combinations - vector > combNodes; + // final outputs + vector outs; + // number of generated combinations + unsigned compNum; + // nodes for every token and rule + map > nodesPool; + // ambiguous informations + vector ambigInfo; + // beam tree + vector, float> > beamTree; + // rules combinations + vector > combNodes; - nodesPool = RuleExecution::getNodesPool (tokenRules); + nodesPool = RuleExecution::getNodesPool(tokenRules); - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); + RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo, + &compNum); - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size (); j++) - if (ambigInfo[j]->combinations.size () > 1) - newAmbigInfo.push_back (ambigInfo[j]); + vector newAmbigInfo; + for (unsigned j = 0; j < ambigInfo.size(); j++) + if (ambigInfo[j]->combinations.size() > 1) + newAmbigInfo.push_back(ambigInfo[j]); - CLExec::beamSearch (&beamTree, beam, slTokens, newAmbigInfo, classesWeights, - localeId); + CLExec::beamSearch(&beamTree, beam, slTokens, slTags, newAmbigInfo, + classesWeights, localeId); - // take the first sentence only - beamTree.erase (beamTree.begin () + 1, beamTree.end ()); + // take the first sentence only + beamTree.erase(beamTree.begin() + 1, beamTree.end()); - RuleExecution::getOuts (&outs, &combNodes, beamTree, nodesPool, ruleOutputs, - spaces); + RuleExecution::getOuts(&outs, &combNodes, beamTree, nodesPool, + ruleOutputs, spaces); - // write the outs - for (unsigned j = 0; j < outs.size (); j++) - interInFile << outs[j] << endl; + // write the outs + for (unsigned j = 0; j < outs.size(); j++) + interInFile << outs[j] << endl; + } + interInFile.close(); + lextorFile.close(); + } else { + cout << "ERROR in opening files!" << endl; } - interInFile.close (); - lextorFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - return 0; + return 0; } diff --git a/src/BeamSearch.h b/src/BeamSearch.h deleted file mode 100644 index 6793339..0000000 --- a/src/BeamSearch.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * BeamSearch.h - * - * Created on: Mar 10, 2019 - * Author: aboelhamd - */ - -#ifndef SRC_BEAMSEARCH_H_ -#define SRC_BEAMSEARCH_H_ - -#include - -using namespace std; - -class BeamSearch -{ -public: - static void - transfer (string transferFilePath, string localeId, string modelsFileDest, string k, - FILE* lextorFileFile, FILE* outFile); -}; -#endif /* SRC_BEAMSEARCH_H_ */ diff --git a/src/CLExec.cpp b/src/CLExec.cpp index e54475f..e2215db 100644 --- a/src/CLExec.cpp +++ b/src/CLExec.cpp @@ -33,118 +33,103 @@ using namespace std; using namespace pugi; using namespace elem; -string -exec (string cmd) -{ - string data; - FILE * stream; - const int max_buffer = 256; - char buffer[max_buffer]; - - stream = popen (cmd.c_str (), "r"); - if (stream) - { - while (!feof (stream)) - if (fgets (buffer, max_buffer, stream) != NULL) - data.append (buffer); - pclose (stream); - } - return data; +string exec(string cmd) { + string data; + FILE * stream; + const int max_buffer = 256; + char buffer[max_buffer]; + + stream = popen(cmd.c_str(), "r"); + if (stream) { + while (!feof(stream)) + if (fgets(buffer, max_buffer, stream) != NULL) + data.append(buffer); + pclose(stream); + } + return data; } -void -CLExec::segmenter (string inFilePath, string outFilePath) -{ - // clear file before writing again - ofstream ofs; - ofs.open (outFilePath.c_str (), ofstream::out | ofstream::trunc); - exec ( - string ("ruby2.3 kazSentenceTokenizer.rb ") + inFilePath + string (" ") - + outFilePath); +void CLExec::segmenter(string inFilePath, string outFilePath) { + // clear file before writing again + ofstream ofs; + ofs.open(outFilePath.c_str(), ofstream::out | ofstream::trunc); + exec( + string("ruby2.3 kazSentenceTokenizer.rb ") + inFilePath + + string(" ") + outFilePath); } -void -CLExec::biltrans (string inFilePath, string outFilePath) -{ - // clear file before writing again - ofstream ofs; - ofs.open (outFilePath.c_str (), ofstream::out | ofstream::trunc); - exec ( - string ("apertium -d $HOME/apertium-kaz-tur kaz-tur-biltrans ") + inFilePath - + string (" ") + outFilePath); +void CLExec::biltrans(string inFilePath, string outFilePath) { + // clear file before writing again + ofstream ofs; + ofs.open(outFilePath.c_str(), ofstream::out | ofstream::trunc); + exec( + string("apertium -d $HOME/apertium-kaz-tur kaz-tur-biltrans ") + + inFilePath + string(" ") + outFilePath); } -void -CLExec::lextor (string inFilePath, string outFilePath) -{ - // clear file before writing again - ofstream ofs; - ofs.open (outFilePath.c_str (), ofstream::out | ofstream::trunc); - exec ( - string ("lrx-proc -m $HOME/apertium-kaz-tur/kaz-tur.autolex.bin ") + inFilePath - + string (" >") + outFilePath); +void CLExec::lextor(string inFilePath, string outFilePath) { + // clear file before writing again + ofstream ofs; + ofs.open(outFilePath.c_str(), ofstream::out | ofstream::trunc); + exec( + string("lrx-proc -m $HOME/apertium-kaz-tur/kaz-tur.autolex.bin ") + + inFilePath + string(" >") + outFilePath); } -void -CLExec::interchunk (string inFilePath, string outFilePath) -{ - exec ( - string ("apertium-interchunk") - + string (" $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t2x") - + string (" $HOME/apertium-kaz-tur/kaz-tur.t2x.bin ") + inFilePath - + string (" ") + outFilePath); +void CLExec::interchunk(string inFilePath, string outFilePath) { + exec( + string("apertium-interchunk") + + string( + " $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t2x") + + string(" $HOME/apertium-kaz-tur/kaz-tur.t2x.bin ") + + inFilePath + string(" ") + outFilePath); } -void -CLExec::postchunk (string inFilePath, string outFilePath) -{ - exec ( - string ("apertium-postchunk") - + string (" $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t3x") - + string (" $HOME/apertium-kaz-tur/kaz-tur.t3x.bin ") + inFilePath - + string (" ") + outFilePath); +void CLExec::postchunk(string inFilePath, string outFilePath) { + exec( + string("apertium-postchunk") + + string( + " $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t3x") + + string(" $HOME/apertium-kaz-tur/kaz-tur.t3x.bin ") + + inFilePath + string(" ") + outFilePath); } -void -CLExec::transfer (string inFilePath, string outFilePath) -{ - exec ( - string ("apertium-transfer -n") - + string (" $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t4x") - + string (" $HOME/apertium-kaz-tur/kaz-tur.t4x.bin ") + inFilePath - + string (" | lt-proc -g $HOME/apertium-kaz-tur/kaz-tur.autogen.bin") - + string (" | lt-proc -p $HOME/apertium-kaz-tur/kaz-tur.autopgen.bin") - + string (" >") + outFilePath); +void CLExec::transfer(string inFilePath, string outFilePath) { + exec( + string("apertium-transfer -n") + + string( + " $HOME/apertium-kaz-tur/apertium-kaz-tur.kaz-tur.t4x") + + string(" $HOME/apertium-kaz-tur/kaz-tur.t4x.bin ") + + inFilePath + + string( + " | lt-proc -g $HOME/apertium-kaz-tur/kaz-tur.autogen.bin") + + string( + " | lt-proc -p $HOME/apertium-kaz-tur/kaz-tur.autopgen.bin") + + string(" >") + outFilePath); } -void -CLExec::assignWeights (string inFilePath, string outFilePath) -{ - exec ( - (string ("python3 $HOME/NormaliseK/exampleken.py <") + string (inFilePath) - + string (">") + string (outFilePath)).c_str ()); +void CLExec::assignWeights(string inFilePath, string outFilePath) { + exec( + (string("python3 $HOME/NormaliseK/exampleken.py <") + + string(inFilePath) + string(">") + string(outFilePath)).c_str()); } -vector -CLExec::getFilesInDir (string dir) -{ - vector files; - - DIR *pDIR; - struct dirent *entry; - if ((pDIR = opendir ((string ("./") + dir).c_str ()))) - { - while ((entry = readdir (pDIR))) - { - if (strcmp (entry->d_name, ".") != 0 && strcmp (entry->d_name, "..") != 0) - { - files.push_back (entry->d_name); - } +vector CLExec::getFilesInDir(string dir) { + vector files; + + DIR *pDIR; + struct dirent *entry; + if ((pDIR = opendir((string("./") + dir).c_str()))) { + while ((entry = readdir(pDIR))) { + if (strcmp(entry->d_name, ".") != 0 + && strcmp(entry->d_name, "..") != 0) { + files.push_back(entry->d_name); + } + } + closedir(pDIR); } - closedir (pDIR); - } - return files; + return files; } //void @@ -162,63 +147,57 @@ CLExec::getFilesInDir (string dir) // } //} -map > > -CLExec::loadYasmetModels (string modelsFilePath/*, string *localeid*/) -{ - // map with key yasmet model name and the value is - // another map with key word name and the value is - // vector of weights in order - map > > classWeights; +map > > CLExec::loadYasmetModels( + string modelsFilePath/*, string *localeid*/) { + // map with key yasmet model name and the value is + // another map with key word name and the value is + // vector of weights in order + map > > classWeights; - ifstream modelsFile ((modelsFilePath).c_str ()); + ifstream modelsFile((modelsFilePath).c_str()); - if (modelsFile.is_open ()) - { - string line, model, token, weight; + if (modelsFile.is_open()) { + string line, model, token, weight; - // localeid + // localeid // getline (modelsFile, line); // *localeid = line; - while (getline (modelsFile, line)) - { - // 0=>word , 1=>rule_num & 2=>wieght - // we don't need rule number , because - // the weights are already sorted - - char lineChar[line.size ()]; - strcpy (lineChar, line.c_str ()); - - token = strtok (lineChar, ": "); - if (token == "file") - { - model = strtok (NULL, ": "); - continue; - } - // skip rule_num - strtok (NULL, ": "); + while (getline(modelsFile, line)) { + // 0=>word , 1=>rule_num & 2=>wieght + // we don't need rule number , because + // the weights are already sorted + + char lineChar[line.size()]; + strcpy(lineChar, line.c_str()); + + token = strtok(lineChar, ": "); + if (token == "file") { + model = strtok(NULL, ": "); + continue; + } + // skip rule_num + strtok(NULL, ": "); // cout << "rulenum= " << strtok(NULL, ": ") << endl; - weight = strtok (NULL, ": "); + weight = strtok(NULL, ": "); // cout << "weight= " << weight << endl; - float w = strtof (weight.c_str (), NULL); + float w = strtof(weight.c_str(), NULL); // cout << w << endl; // if (w < 0) // cout << w << endl; - classWeights[model][token].push_back (w); + classWeights[model][token].push_back(w); // if (classWeights[model][token][classWeights[model][token].size() - 1] // < 0) // cout << w << endl; // cout // << classWeights[model][token][classWeights[model][token].size() // - 1] << endl; + } + } else { + cout << "error in opening models file" << endl; } - } - else - { - cout << "error in opening models file" << endl; - } // for (map > >::iterator it = // classWeights.begin(); it != classWeights.end(); it++) { // cout << "model=" << it->first << endl; @@ -232,84 +211,71 @@ CLExec::loadYasmetModels (string modelsFilePath/*, string *localeid*/) // cout << endl; // } // } - return classWeights; + return classWeights; } -string -CLExec::toLowerCase (string word, string localeId) -{ - icu::UnicodeString uString (word.c_str ()); - string lowWord; - uString.toLower (localeId.c_str ()).toUTF8String (lowWord); - return lowWord; +string CLExec::toLowerCase(string word, string localeId) { + icu::UnicodeString uString(word.c_str()); + string lowWord; + uString.toLower(localeId.c_str()).toUTF8String(lowWord); + return lowWord; } -string -CLExec::toUpperCase (string word, string localeId) -{ - icu::UnicodeString uString (word.c_str ()); - string upWord; - uString.toUpper (localeId.c_str ()).toUTF8String (upWord); - return upWord; +string CLExec::toUpperCase(string word, string localeId) { + icu::UnicodeString uString(word.c_str()); + string upWord; + uString.toUpper(localeId.c_str()).toUTF8String(upWord); + return upWord; } -string -CLExec::FirLetUpperCase (string word, string localeId) -{ - icu::UnicodeString uString (word.c_str ()); - uString.toLower (localeId.c_str ()); - uString.setCharAt ( - 0, icu::UnicodeString (uString.charAt (0)).toUpper (localeId.c_str ()).charAt (0)); - - string upWord; - uString.toUTF8String (upWord); - return upWord; +string CLExec::FirLetUpperCase(string word, string localeId) { + icu::UnicodeString uString(word.c_str()); + uString.toLower(localeId.c_str()); + uString.setCharAt(0, + icu::UnicodeString(uString.charAt(0)).toUpper(localeId.c_str()).charAt( + 0)); + + string upWord; + uString.toUTF8String(upWord); + return upWord; } // The result of bitwise character comparison: 0 if this contains // the same characters as text, -1 if the characters in this are // bitwise less than the characters in text, +1 if the characters // in this are bitwise greater than the characters in text. -int -CLExec::compare (string word1, string word2) -{ - icu::UnicodeString uString1 (word1.c_str ()); - icu::UnicodeString uString2 (word2.c_str ()); +int CLExec::compare(string word1, string word2) { + icu::UnicodeString uString1(word1.c_str()); + icu::UnicodeString uString2(word2.c_str()); - return uString1.compare (uString2); + return uString1.compare(uString2); } -int -CLExec::compareCaseless (string word1, string word2, string localeId) -{ - icu::UnicodeString uString1 (word1.c_str ()); - uString1.toLower (localeId.c_str ()); - icu::UnicodeString uString2 (word2.c_str ()); - uString2.toLower (localeId.c_str ()); +int CLExec::compareCaseless(string word1, string word2, string localeId) { + icu::UnicodeString uString1(word1.c_str()); + uString1.toLower(localeId.c_str()); + icu::UnicodeString uString2(word2.c_str()); + uString2.toLower(localeId.c_str()); - return uString1.compare (uString2); + return uString1.compare(uString2); } // to sort translations from best to worth by their weight -bool -sortParameter (pair, float> a, - pair, float> b) -{ - return (a.second > b.second); +bool sortParameter(pair, float> a, + pair, float> b) { + return (a.second > b.second); } -void -CLExec::beamSearch (vector, float> > *beamTree, - unsigned beam, vector slTokens, - vector ambigInfo, - map > > classesWeights, - string localeId) -{ - // Initialization - (*beamTree).push_back (pair, float> ()); - - for (unsigned i = 0; i < ambigInfo.size (); i++) - { +void CLExec::beamSearch( + vector, float> > *beamTree, + unsigned beam, vector slTokens, vector > slTags, + vector ambigInfo, + map > > classesWeights, + string localeId) { + // Initialization + (*beamTree).push_back(pair, float>()); + + for (unsigned i = 0; i < ambigInfo.size(); i++) { // for (unsigned x = 0; x < beamTree->size (); x++) // { // cout << "weight = " << (*beamTree)[x].second << endl; @@ -321,142 +287,137 @@ CLExec::beamSearch (vector, float> > *beamTree // } // } - RuleExecution::AmbigInfo* ambig = ambigInfo[i]; + RuleExecution::AmbigInfo* ambig = ambigInfo[i]; // pair, pair > > > p = // ambigInfo[i]; // pair wordInd = p.first; // vector > ambigRules = p.second.second; - unsigned ambigRulesSize = ambig->combinations.size (); - - // name of the file is the concatenation of rules ids - string rulesNums; - for (unsigned x = 0; x < ambigRulesSize; x++) - { - // avoid dummy node - for (unsigned y = 1; y < ambig->combinations[x].size (); y++) - { - stringstream ss; - ss << ambig->combinations[x][y]->ruleId; - rulesNums += ss.str (); - - if (y + 1 < ambig->combinations[x].size ()) - rulesNums += "_"; - } - rulesNums += "+"; - } + unsigned ambigRulesSize = ambig->combinations.size(); + + // name of the file is the concatenation of rules ids + string rulesNums; + for (unsigned x = 0; x < ambigRulesSize; x++) { + // avoid dummy node + for (unsigned y = 1; y < ambig->combinations[x].size(); y++) { + stringstream ss; + ss << ambig->combinations[x][y]->ruleId; + rulesNums += ss.str(); + + if (y + 1 < ambig->combinations[x].size()) + rulesNums += "_"; + } + rulesNums += "+"; + } // cout << rulesNums << endl; - map > classWeights = classesWeights[(rulesNums + ".model")]; - - // build new tree for the new words - vector, float> > newTree; - - // initialize the new tree - for (unsigned x = 0; x < ambigRulesSize; x++) - { - newTree.push_back ( - pair, float> (vector (), - 0)); - } - // put rules - for (unsigned z = 0; z < ambigRulesSize; z++) - { - for (unsigned y = 0; y < ambig->combinations[z].size (); y++) - { - newTree[z].first.push_back (ambig->combinations[z][y]); - } - } - - for (unsigned x = ambig->firTokId; x < ambig->firTokId + ambig->maxPat; x++) - { - // word key is the word and it's order in the rule - stringstream ss; - ss << x - ambig->firTokId; - string num = "_" + ss.str (); - - // handle the case of two lemmas separated by a space - for (unsigned t = 0; t < slTokens[x].size (); t++) - if (slTokens[x][t] == ' ') - slTokens[x].replace (t, 1, "_"); - - string word = toLowerCase (slTokens[x], localeId) + num; - vector wordWeights = classWeights[word]; - - // put weights - if (wordWeights.empty ()) - { - for (unsigned z = 0; z < ambigRulesSize; z++) - newTree[z].second += 1; - cout << "word : " << word << " is not found in dataset : " << rulesNums - << endl; - } - - else - for (unsigned z = 0; z < ambigRulesSize; z++) - newTree[z].second += wordWeights[z]; - - } - - // expand beamTree - unsigned initSize = beamTree->size (); - for (unsigned z = 0; z < ambigRulesSize - 1; z++) - { - for (unsigned x = 0; x < initSize; x++) - { - beamTree->push_back ( - pair, float> ((*beamTree)[x])); - } + map > classWeights = classesWeights[(rulesNums + + ".model")]; + + // build new tree for the new words + vector, float> > newTree; + + // initialize the new tree + for (unsigned x = 0; x < ambigRulesSize; x++) { + newTree.push_back( + pair, float>( + vector(), 0)); + } + // put rules + for (unsigned z = 0; z < ambigRulesSize; z++) { + for (unsigned y = 0; y < ambig->combinations[z].size(); y++) { + newTree[z].first.push_back(ambig->combinations[z][y]); + } + } + + for (unsigned x = ambig->firTokId; x < ambig->firTokId + ambig->maxPat; + x++) { + // word key is the word and it's order in the rule + stringstream ss; + ss << x - ambig->firTokId; + string num = "_" + ss.str(); + + // handle the case of two lemmas separated by a space + for (unsigned t = 0; t < slTokens[x].size(); t++) + if (slTokens[x][t] == ' ') + slTokens[x].replace(t, 1, "_"); + + string word = toLowerCase(slTokens[x], localeId) + num; + vector wordWeights = classWeights[word]; + + // put weights + if (wordWeights.empty()) { + for (unsigned z = 0; z < ambigRulesSize; z++) + newTree[z].second += 1; + cout << "word : " << word << " is not found in dataset : " + << rulesNums << endl; + } + + else { + vector tagWeights; + for (unsigned t = 0; t < slTags[x].size(); t++) { + string tag = slTags[x][t] + num; + tagWeights = classWeights[tag]; + for (unsigned w = 0; w < tagWeights.size(); w++) + wordWeights[w] += tagWeights[w]; + } + for (unsigned z = 0; z < ambigRulesSize; z++) + newTree[z].second += wordWeights[z]; + } + + } + + // expand beamTree + unsigned initSize = beamTree->size(); + for (unsigned z = 0; z < ambigRulesSize - 1; z++) { + for (unsigned x = 0; x < initSize; x++) { + beamTree->push_back( + pair, float>( + (*beamTree)[x])); + } + } + + // merge the two trees + for (unsigned z = 0; z < ambigRulesSize; z++) { + for (unsigned x = initSize * z; x < initSize * (z + 1); x++) { + // put the new rules with the old + (*beamTree)[x].first.insert((*beamTree)[x].first.end(), + newTree[z].first.begin(), newTree[z].first.end()); + + // add their wiehgts + (*beamTree)[x].second += newTree[z].second; + } + } + + // sort beam tree + sort(beamTree->begin(), beamTree->end(), sortParameter); + + // remove elements more than (beam) + if (beamTree->size() > beam) + beamTree->erase(beamTree->begin() + beam, beamTree->end()); } - - // merge the two trees - for (unsigned z = 0; z < ambigRulesSize; z++) - { - for (unsigned x = initSize * z; x < initSize * (z + 1); x++) - { - // put the new rules with the old - (*beamTree)[x].first.insert ((*beamTree)[x].first.end (), - newTree[z].first.begin (), - newTree[z].first.end ()); - - // add their wiehgts - (*beamTree)[x].second += newTree[z].second; - } - } - - // sort beam tree - sort (beamTree->begin (), beamTree->end (), sortParameter); - - // remove elements more than (beam) - if (beamTree->size () > beam) - beamTree->erase (beamTree->begin () + beam, beamTree->end ()); - } } -void -CLExec::getTransInds (vector > *transInds, - vector, float> > beamTree, - vector > > rulesIds) -{ - for (unsigned i = 0; i < beamTree.size (); i++) - { - vector transInd = beamTree[i].first; - for (unsigned j = 0; j < rulesIds.size (); j++) - { - vector > weigInd = rulesIds[j]; - - unsigned count = 0; - for (unsigned x = 0; x < weigInd.size () && count < transInd.size (); x++) - { - if (transInd[count] == weigInd[x].first) - count++; - } - - if (count == transInd.size ()) - { - transInds->push_back (pair (j, beamTree[i].second)); - break; - } +void CLExec::getTransInds(vector > *transInds, + vector, float> > beamTree, + vector > > rulesIds) { + for (unsigned i = 0; i < beamTree.size(); i++) { + vector transInd = beamTree[i].first; + for (unsigned j = 0; j < rulesIds.size(); j++) { + vector > weigInd = rulesIds[j]; + + unsigned count = 0; + for (unsigned x = 0; x < weigInd.size() && count < transInd.size(); + x++) { + if (transInd[count] == weigInd[x].first) + count++; + } + + if (count == transInd.size()) { + transInds->push_back( + pair(j, beamTree[i].second)); + break; + } + } } - } } diff --git a/src/CLExec.h b/src/CLExec.h index 16fbc7b..37f6dfa 100644 --- a/src/CLExec.h +++ b/src/CLExec.h @@ -17,67 +17,68 @@ using namespace std; using namespace pugi; -class CLExec -{ +class CLExec { public: - static void - segmenter (string inFilePath, string outFilePath); + static void + segmenter(string inFilePath, string outFilePath); - static void - lextor (string inFilePath, string outFilePath); + static void + lextor(string inFilePath, string outFilePath); - static void - biltrans (string inFilePath, string outFilePath); + static void + biltrans(string inFilePath, string outFilePath); - static void - interchunk (string inFilePath, string outFilePath); + static void + interchunk(string inFilePath, string outFilePath); - static void - postchunk (string inFilePath, string outFilePath); + static void + postchunk(string inFilePath, string outFilePath); - static void - transfer (string inFilePath, string outFilePath); + static void + transfer(string inFilePath, string outFilePath); - static void - assignWeights (string inFilePath, string outFilePath); + static void + assignWeights(string inFilePath, string outFilePath); - static vector - getFilesInDir (string dir); + static vector + getFilesInDir(string dir); // static void // runYasmet (); - static map > > - loadYasmetModels (string modelsDest/*, string *localeid*/); + static map > > + loadYasmetModels(string modelsDest/*, string *localeid*/); - static void - handleDatasets (); + static void + handleDatasets(); - static string - toLowerCase (string word, string localeId); + static string + toLowerCase(string word, string localeId); - static string - toUpperCase (string word, string localeId); + static string + toUpperCase(string word, string localeId); - static string - FirLetUpperCase (string word, string localeId); + static string + FirLetUpperCase(string word, string localeId); - static int - compare (string word1, string word2); + static int + compare(string word1, string word2); - static int - compareCaseless (string word1, string word2, string localeId); + static int + compareCaseless(string word1, string word2, string localeId); - static void - beamSearch (vector, float> > *beamTree, unsigned beam, - vector slTokens, vector ambigInfo, - map > > classesWeights, string localeId); + static void + beamSearch(vector, float> > *beamTree, + unsigned beam, vector slTokens, vector > slTags, + vector ambigInfo, + map > > classesWeights, + string localeId); - static void - getTransInds (vector > *transInds, - vector, float> > beamTree, - vector > > rulesIds); + static void + getTransInds(vector > *transInds, + vector, float> > beamTree, + vector > > rulesIds); }; #endif /* SRC_CLEXEC_H_ */ diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp index 08eb485..ea36284 100644 --- a/src/RulesApplier.cpp +++ b/src/RulesApplier.cpp @@ -34,6 +34,9 @@ int main(int argc, char **argv) { case 'u': newLextorFilePath = optarg; break; + case ':': + printf("option %c needs a value\n", optopt); + return -1; case '?': printf("unknown option: %c\n", optopt); return -1;