commit 2485561b17caeb53bec2dff3f3f8463a9212b4fc Author: aboelhamd Date: Mon May 20 18:12:20 2019 +0200 Finish adding tags as features and removing bad sentences as an option diff --git a/src/BeamSearch.cpp b/src/BeamSearch.cpp index 90c3f56..9ca5738 100644 --- a/src/BeamSearch.cpp +++ b/src/BeamSearch.cpp @@ -61,7 +61,7 @@ main (int argc, char **argv) cout << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath modelsDest beamSize" << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; cout << "transferFilePath : Apertium transfer file of the language pair used." << endl; diff --git a/src/CombAlign.cpp b/src/CombAlign.cpp index 60790eb..71a4673 100644 --- a/src/CombAlign.cpp +++ b/src/CombAlign.cpp @@ -68,7 +68,7 @@ main (int argc, char **argv) cout << "Error in parameters !" << endl; cout << "Parameters are : localeId transferFilePath lextorFilePath" << " chunkerFilePath referenceFilePath newRefFilePath" << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; cout << "transferFilePath : Apertium transfer file of the language pair used." << endl; diff --git a/src/LangModAnalysis.cpp b/src/LangModAnalysis.cpp index 03a308a..4319ca5 100644 --- a/src/LangModAnalysis.cpp +++ b/src/LangModAnalysis.cpp @@ -102,7 -102,7 @@ int main(int argc, char **argv) { << " [-a analysisFilePath] [-b bestModFilePath] [-r randModFilePath]" << endl; cout - << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; cout << "transferFilePath : Apertium transfer file of the language pair used." << endl; diff --git a/src/OrderAmbigSents.cpp b/src/OrderAmbigSents.cpp index ddde9d8..fac18bc 100644 --- a/src/OrderAmbigSents.cpp +++ b/src/OrderAmbigSents.cpp @@ -70,7 +70,7 @@ main (int argc, char **argv) cout << "Parameters are : localeId transferFilePath sourceFilePath" << " lextorFilePath targetFilePath orderedSrcFilePath orderedTrgFilePath" << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; cout << "transferFilePath : Apertium transfer file of the language pair used." << endl; diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp index db4a15e..c7c2279 100644 --- a/src/RulesApplier.cpp +++ b/src/RulesApplier.cpp @@ -61,7 +61,7 @@ main (int argc, char **argv) cout << "Error in parameters !" << endl; cout << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath" << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; cout << "transferFilePath : Apertium transfer file of the language pair used." 
<< endl; diff --git a/src/YasmetFormatter.cpp b/src/YasmetFormatter.cpp index 80134a2..6b358ff 100644 --- a/src/YasmetFormatter.cpp +++ b/src/YasmetFormatter.cpp @@ -24,22 +24,32 @@ using namespace std; using namespace pugi; using namespace elem; -int -main (int argc, char **argv) -{ - string lextorFilePath = "lextor.txt", weightOutFilePath = "weights.txt", localeId = - "kk_KZ", transferFilePath = "transferFile.tx1", datasetsPath = "datasets"; - - if (argc == 6) - { - localeId = argv[1]; - transferFilePath = argv[2]; - lextorFilePath = argv[3]; - weightOutFilePath = argv[4]; - datasetsPath = argv[5]; - } - else - { +int main(int argc, char **argv) { + string localeId, transferFilePath, lextorFilePath, targetFilePath, + weightsFilePath, datasetsPath; + + int opt; + while ((opt = getopt(argc, argv, ":t:")) != -1) { + switch (opt) { + case 't': + targetFilePath = optarg; + break; + case ':': + printf("option %c needs a value\n", optopt); + return -1; + case '?': + printf("unknown option: %c\n", optopt); + return -1; + } + } + + if (argc - optind == 5) { + localeId = argv[argc - 5]; + transferFilePath = argv[argc - 4]; + lextorFilePath = argv[argc - 3]; + weightsFilePath = argv[argc - 2]; + datasetsPath = argv[argc - 1]; + } else { // localeId = "es_ES"; // transferFilePath = "transferFile.t1x"; // sentenceFilePath = "spa-test.txt"; @@ -49,242 +59,262 @@ main (int argc, char **argv) // outputFilePath = "output.out"; // datasetsPath = "datasetstry2"; - localeId = "kk_KZ"; - transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; - lextorFilePath = "sample-lextor.txt"; - weightOutFilePath = "norm-weights.txt"; - datasetsPath = "datasetstry1234"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath lextorFilePath weightOutFilePath datasetsPath" - << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "weightOutFilePath : Language model weights file for the source language sentences." - << endl; - cout << "datasetsPath : Datasets destination to put in the generated yasmet files." - << endl; - return -1; - } - - ifstream lextorFile (lextorFilePath.c_str ()); - ifstream weightOutFile (weightOutFilePath.c_str ()); - if (lextorFile.is_open () && weightOutFile.is_open ()) - { - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); - - if (string (result.description ()) != "No error") - { - cout << "ERROR : " << result.description () << endl; - return -1; + localeId = "kk_KZ"; + transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; + lextorFilePath = "sample-lextor.txt"; + weightsFilePath = "norm-weights.txt"; + datasetsPath = "datasetstry1234"; + targetFilePath = "target.txt"; + + cout << "Error in parameters !" << endl; + cout << "Parameters are : localeId transferFilePath lextorFilePath" + << " weightOutFilePath datasetsPath -t targetFilePath" << endl; + cout + << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" + << endl; + cout + << "transferFilePath : Apertium transfer file of the language pair used." + << endl; + cout + << "lextorFilePath : Apertium lextor file for the source language sentences." 
+ << endl; + cout + << "weightOutFilePath : Language model weights file for the source language sentences." + << endl; + cout + << "datasetsPath : Datasets destination to put in the generated yasmet files." + << endl; + cout + << "targetFilePath : Target file path, if you want to remove \"bad\" sentences." + << endl; + return -1; } - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child ("transfer"); - - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); - - string tokenizedSentence; - while (getline (lextorFile, tokenizedSentence)) - { - // cout << i << endl; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, - tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); - - // final outs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // rules combinations - vector > combNodes; - - nodesPool = RuleExecution::getNodesPool (tokenRules); - - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - - RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, - spaces); - - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size (); j++) - if (ambigInfo[j]->combinations.size () > 1) - newAmbigInfo.push_back (ambigInfo[j]); - ambigInfo = newAmbigInfo; - - // read weights - string line; - vector weights; - for (unsigned j = 0; j < outs.size (); j++) - { - getline (weightOutFile, line); - float weight = strtof (line.c_str (), NULL); - weights.push_back (weight); - } - - RuleExecution::normaliseWeights (&weights, ambigInfo); - - // Yasmet format preparing - // make a directory if not found - mkdir (datasetsPath.c_str (), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - - unsigned weigInd = 0; - for (unsigned i = 0; i < ambigInfo.size (); i++) - { - RuleExecution::AmbigInfo* ambig = ambigInfo[i]; - - // name of the file is the concatenation of rules ids - string rulesNums; - for (unsigned x = 0; x < ambig->combinations.size (); x++) - { - // avoid dummy node - for (unsigned y = 1; y < ambig->combinations[x].size (); y++) - { - stringstream ss; -// ss->clear (); - ss << ambig->combinations[x][y]->ruleId; - rulesNums += ss.str (); - - if (y + 1 < ambig->combinations[x].size ()) - rulesNums += "_"; - } - rulesNums += "+"; - } - - // if it's the first time to open , put the number of classes - bool firstTime = true; - if (FILE *file = fopen ((datasetsPath + string ("/") + rulesNums).c_str (), - "r")) - { - firstTime = 
false; - fclose (file); + ifstream lextorFile(lextorFilePath.c_str()); + ifstream weightOutFile(weightsFilePath.c_str()); + ifstream targetFile; + // open the optional target file only when a path was given + if (!targetFilePath.empty()) + targetFile.open(targetFilePath.c_str()); + if (lextorFile.is_open() && weightOutFile.is_open() + && (targetFilePath.empty() || targetFile.is_open())) { + // load transfer file in an xml document object + xml_document transferDoc; + xml_parse_result result = transferDoc.load_file( + transferFilePath.c_str()); + + if (string(result.description()) != "No error") { + cout << "ERROR : " << result.description() << endl; + return -1; } -// stringstream* dataset = new stringstream (); - ofstream dataset ((datasetsPath + string ("/") + rulesNums).c_str (), - ofstream::app); - - if (firstTime) - dataset << ambig->combinations.size () << endl; - - for (unsigned x = 0; x < ambig->combinations.size (); x++) - { - - dataset << x << " $ "; - - float weight = weights[x + weigInd]; - - dataset << weight << " #"; - - string features; - for (unsigned v = 0; v < ambig->combinations.size (); v++) - { - stringstream ss; -// ss.clear (); - ss << v; - string label = ss.str (); - - for (unsigned z = ambig->firTokId; - z < ambig->firTokId + ambig->maxPat; z++) - { - stringstream ss; -// ss->clear (); - ss << z - ambig->firTokId; - string num = ss.str (); -// *num = ss->str (); - string word = CLExec::toLowerCase (slTokens[z], localeId); + // xml node of the parent node (transfer) in the transfer file + xml_node transfer = transferDoc.child("transfer"); + + map > > attrs = RuleParser::getAttrs( + transfer); + map vars = RuleParser::getVars(transfer); + map > lists = RuleParser::getLists(transfer); + + unsigned allSents = 0, goodSents = 0; + string tokenizedSentence; + while (getline(lextorFile, tokenizedSentence)) { + allSents++; + // spaces after each token + vector spaces; + + // tokens in the sentence order + vector slTokens, tlTokens; + + // tags of tokens in order + vector > slTags, tlTags; + + RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, + &tlTags, &spaces, tokenizedSentence); + + // map of tokens ids and their matched categories + map > catsApplied; + + RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); + + // map of matched rules and a pair of first token id and patterns number + map > > rulesApplied; + + RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, + transfer); + + // rule and (target) token map to specific output + // if rule has many patterns we will choose the first token only + map > ruleOutputs; + + // map (target) token to all matched rules ids and the number of pattern items of each rule + map > > tokenRules; + + RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, + tlTokens, tlTags, rulesApplied, attrs, lists, &vars, spaces, + localeId); + + // final outs + vector outs; + // number of generated combinations + unsigned compNum; + // nodes for every token and rule + map > nodesPool; + // ambiguous information + vector ambigInfo; + // rules combinations + vector > combNodes; + + nodesPool = RuleExecution::getNodesPool(tokenRules); + + RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo, + &compNum); + + RuleExecution::getOuts(&outs, &combNodes, ambigInfo, nodesPool, + ruleOutputs, spaces); + + vector newAmbigInfo; + for (unsigned j = 0; j < ambigInfo.size(); j++) + if (ambigInfo[j]->combinations.size() > 1) + newAmbigInfo.push_back(ambigInfo[j]); + ambigInfo = newAmbigInfo; + + // remove bad sentences, i.e. those whose target output contains (*,#,@) + string line; + + if 
(!targetFilePath.empty()) { + bool isBad = false; + for (unsigned j = 0; j < outs.size(); j++) { + getline(targetFile, line); + // flag the sentence but keep reading: one target line + // per out must be consumed so the file stays aligned + if (line.find('*') != string::npos + || line.find('#') != string::npos + || line.find('@') != string::npos) + isBad = true; + } + if (isBad) { + // skip this sentence's weights as well, so the + // weights file stays aligned for later sentences + for (unsigned j = 0; j < outs.size(); j++) + getline(weightOutFile, line); + continue; + } + } + goodSents++; + + // read weights + vector weights; + for (unsigned j = 0; j < outs.size(); j++) { + getline(weightOutFile, line); + float weight = strtof(line.c_str(), NULL); + weights.push_back(weight); + } - for (unsigned c = 0; c < word.length (); c++) - if (word[c] == ' ') - word.replace (c, 1, "_"); + RuleExecution::normaliseWeights(&weights, ambigInfo); + + // Yasmet format preparing + // make a directory if not found + mkdir(datasetsPath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); + + unsigned weigInd = 0; + for (unsigned i = 0; i < ambigInfo.size(); i++) { + RuleExecution::AmbigInfo* ambig = ambigInfo[i]; + + // name of the file is the concatenation of rules ids + string rulesNums; + for (unsigned x = 0; x < ambig->combinations.size(); x++) { + // avoid dummy node + for (unsigned y = 1; y < ambig->combinations[x].size(); + y++) { + stringstream ss; + ss << ambig->combinations[x][y]->ruleId; + rulesNums += ss.str(); + + if (y + 1 < ambig->combinations[x].size()) + rulesNums += "_"; + } + rulesNums += "+"; + } + + // if it's the first time to open, put the number of classes + bool firstTime = true; + if (FILE *file = fopen( + (datasetsPath + string("/") + rulesNums).c_str(), + "r")) { + firstTime = false; + fclose(file); + } + + ofstream dataset( + (datasetsPath + string("/") + rulesNums).c_str(), + ofstream::app); + + if (firstTime) + dataset << ambig->combinations.size() << endl; + + for (unsigned x = 0; x < ambig->combinations.size(); x++) { + + dataset << x << " $ "; + + float weight = weights[x + weigInd]; + + dataset << weight << " #"; + + string features; + for (unsigned v = 0; v < ambig->combinations.size(); v++) { + stringstream ss; + ss << v; + string label = ss.str(); + + for (unsigned z = ambig->firTokId; + z < ambig->firTokId + ambig->maxPat; z++) { + stringstream ss; + ss << z - ambig->firTokId; + string num = ss.str(); + string word = CLExec::toLowerCase(slTokens[z], + localeId); + + for (unsigned c = 0; c < word.length(); c++) + if (word[c] == ' ') + word.replace(c, 1, "_"); + + features += " " + word + "_" + num + ":" + label; + for (unsigned d = 0; d < slTags[z].size(); d++) + features += " " + slTags[z][d] + "_" + num + ":" + + label; + } + features += " #"; + } + dataset << features << endl; + } + weigInd += ambig->combinations.size(); + dataset.close(); + } - features += " " + word + "_" + num + ":" + label; + // delete AmbigInfo pointers + for (unsigned j = 0; j < ambigInfo.size(); j++) { + // delete the dummy node pointers + set dummies; + for (unsigned k = 0; k < ambigInfo[j]->combinations.size(); k++) + dummies.insert(ambigInfo[j]->combinations[k][0]); + for (set::iterator it = dummies.begin(); + it != dummies.end(); it++) + delete (*it); + + delete ambigInfo[j]; + } - features += " #"; - } - dataset << features << endl; -// delete (features); - } - weigInd += ambig->combinations.size (); -// dataset.close (); - } - - // delete AmbigInfo pointers - for (unsigned j = 0; j < ambigInfo.size (); j++) - { - // delete the dummy node pointers - set dummies; - for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++) - dummies.insert (ambigInfo[j]->combinations[k][0]); - for (set::iterator it = dummies.begin (); - it != dummies.end (); it++) - delete (*it); - - 
delete ambigInfo[j]; - } - // delete Node pointers - for (map >::iterator it = - nodesPool.begin (); it != nodesPool.end (); it++) - { - for (unsigned j = 0; j < it->second.size (); j++) - { - delete it->second[j]; + // delete Node pointers + for (map >::iterator it = + nodesPool.begin(); it != nodesPool.end(); it++) { + for (unsigned j = 0; j < it->second.size(); j++) { + delete it->second[j]; + } + } + } - } + lextorFile.close(); + weightOutFile.close(); -// } + cout << "There are " << goodSents << " good sentences out of " << allSents + << endl; + } else { + cout << "ERROR in opening files!" << endl; } - lextorFile.close (); - weightOutFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - - return 0; + + return 0; } diff --git a/training-yasmet.sh b/training-yasmet.sh index be738af..74a316a 100644 --- a/training-yasmet.sh +++ b/training-yasmet.sh @@ -16,7 +16,7 @@ pairPar=$4; # 5) model weight program path. modelWeight=$5; -# 6) ICU locale ID for the source language. For Kazakh => kk-KZ +# 6) ICU locale ID for the source language. For Kazakh => kk_KZ localeId=$6; # 7) Analysis output file name.
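A minimal invocation sketch for the updated YasmetFormatter, following the usage message in the diff above. The binary name yasmet-formatter is an assumption for illustration; the file names are the fallback defaults from the code. Since getopt in POSIX mode stops scanning at the first non-option argument, the -t option is placed before the five positional parameters:

    # Hypothetical binary name; the arguments mirror the usage message.
    # -t target.txt enables the new filter that skips sentences whose
    # target translations contain the Apertium marks *, # or @.
    ./yasmet-formatter -t target.txt \
        kk_KZ apertium-kaz-tur.kaz-tur.t1x sample-lextor.txt \
        norm-weights.txt datasetstry1234

GNU getopt also permutes argv, so on glibc systems -t may equally follow the positional arguments; the order shown is the portable one.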
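Each file the formatter writes under datasetsPath is one yasmet training set, named by concatenating the rule ids of the competing combinations (for example 5_7+5_9+, a hypothetical name). As the loop above builds it, the first line holds the number of classes; each following line gives one combination's class index, a "$", its normalised weight, and then one "#"-delimited feature block per class label, listing the lowercased pattern words and their tags indexed by pattern position. A hand-made sample with two classes over the hypothetical tokens kitap/oqy tagged n/v:

    2
    0 $ 0.64 # kitap_0:0 n_0:0 oqy_1:0 v_1:0 # kitap_0:1 n_0:1 oqy_1:1 v_1:1 #
    1 $ 0.36 # kitap_0:0 n_0:0 oqy_1:0 v_1:0 # kitap_0:1 n_0:1 oqy_1:1 v_1:1 #

The tag features (n_0, v_1) are what this commit adds; note that the feature string does not depend on the class index, so it repeats verbatim on every line, and only the index and weight differ.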