commit a770332e4abd5a518100fb1f64c85a36f7c3d0c6 Author: aboelhamd Date: Mon May 20 23:58:01 2019 +0200 Solve bugs, add options and some modifications diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp index c7c2279..08eb485 100644 --- a/src/RulesApplier.cpp +++ b/src/RulesApplier.cpp @@ -24,20 +24,28 @@ using namespace std; using namespace pugi; using namespace elem; -int -main (int argc, char **argv) -{ - string localeId, transferFilePath, lextorFilePath, interInFilePath; - - if (argc == 5) - { - localeId = argv[1]; - transferFilePath = argv[2]; - lextorFilePath = argv[3]; - interInFilePath = argv[4]; - } - else - { +int main(int argc, char **argv) { + string localeId, transferFilePath, lextorFilePath, chunkerFilePath, + newLextorFilePath; + + int opt; + while ((opt = getopt(argc, argv, ":u:")) != -1) { + switch (opt) { + case 'u': + newLextorFilePath = optarg; + break; + case '?': + printf("unknown option: %c\n", optopt); + return -1; + } + } + + if (argc - optind == 4) { + localeId = argv[argc - 4]; + transferFilePath = argv[argc - 3]; + lextorFilePath = argv[argc - 2]; + chunkerFilePath = argv[argc - 1]; + } else { // localeId = "es_ES"; // transferFilePath = "transferFile.t1x"; // sentenceFilePath = "spa-test.txt"; @@ -50,145 +58,163 @@ main (int argc, char **argv) // lextorFilePath = "sample-lextor.txt"; // interInFilePath = "sample-inter.txt"; - localeId = "es_ES"; - transferFilePath = - "/home/aboelhamd/apertium-eng-spa-ambiguous-rules/apertium-eng-spa.spa-eng.t1x"; - lextorFilePath = - "/home/aboelhamd/eclipse-workspace/machinetranslation/test-lextor.txt"; - interInFilePath = - "/home/aboelhamd/eclipse-workspace/machinetranslation/test-chunker.txt"; - - cout << "Error in parameters !" << endl; - cout << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath" - << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" - << endl; - cout << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "interInFilePath : Output file name of this program which is the input for apertium interchunk." - << endl; - return -1; - } - - ifstream lextorFile (lextorFilePath.c_str ()); - ofstream interInFile (interInFilePath.c_str ()); - if (lextorFile.is_open () && interInFile.is_open ()) - { - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); - - if (string (result.description ()) != "No error") - { - cout << "ERROR : " << result.description () << endl; - return -1; + localeId = "es_ES"; + transferFilePath = + "/home/aboelhamd/apertium-eng-spa-ambiguous-rules/apertium-eng-spa.spa-eng.t1x"; + lextorFilePath = + "/home/aboelhamd/eclipse-workspace/machinetranslation/test-lextor.txt"; + chunkerFilePath = + "/home/aboelhamd/eclipse-workspace/machinetranslation/test-chunker.txt"; + + cout << "Error in parameters !" << endl; + cout << "Parameters are : localeId transferFilePath" + << " lextorFilePath chunkerFilePath [-u newlextorFilePath]" + << endl; + cout + << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" + << endl; + cout + << "transferFilePath : Apertium transfer file of the language pair used." + << endl; + cout + << "lextorFilePath : Apertium lextor file for the source language sentences." + << endl; + cout + << "chunkerFilePath : chunker file name of this program which is the input for apertium interchunk." + << endl; + cout << "-u : remove sentences with unknown words." << endl; + cout + << "newlextorFilePath : write the new sentences lextor in this lextor file." + << endl; + return -1; } - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child ("transfer"); + ifstream lextorFile(lextorFilePath.c_str()); + ofstream chunkerFile(chunkerFilePath.c_str()); + ofstream newLextorFile; + if (!newLextorFilePath.empty()) + newLextorFile = ofstream(newLextorFilePath.c_str()); + if (lextorFile.is_open() && chunkerFile.is_open() + && (newLextorFilePath.empty() || newLextorFile.is_open())) { + // load transfer file in an xml document object + xml_document transferDoc; + xml_parse_result result = transferDoc.load_file( + transferFilePath.c_str()); + + if (string(result.description()) != "No error") { + cout << "ERROR : " << result.description() << endl; + return -1; + } - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); + // xml node of the parent node (transfer) in the transfer file + xml_node transfer = transferDoc.child("transfer"); + + map > > attrs = RuleParser::getAttrs( + transfer); + map vars = RuleParser::getVars(transfer); + map > lists = RuleParser::getLists(transfer); + + unsigned allSents = 0, goodSents = 0; + string tokenizedSentence; + while (getline(lextorFile, tokenizedSentence)) { + allSents++; + if (!newLextorFilePath.empty() + && tokenizedSentence.find("^*") != string::npos) + continue; + goodSents++; + // write to new lextor file + newLextorFile << tokenizedSentence << endl; -// unsigned i = 0; - string tokenizedSentence; - while (getline (lextorFile, tokenizedSentence)) - { // cout << i++ << endl; - // spaces after each token - vector spaces; +// spaces after each token + vector spaces; - // tokens in the sentence order - vector slTokens, tlTokens; + // tokens in the sentence order + vector slTokens, tlTokens; - // tags of tokens in order - vector > slTags, tlTags; + // tags of tokens in order + vector > slTags, tlTags; - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, - tokenizedSentence); + RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, + &tlTags, &spaces, tokenizedSentence); - // map of tokens ids and their matched categories - map > catsApplied; + // map of tokens ids and their matched categories + map > catsApplied; - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); + RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; + // map of matched rules and a pair of first token id and patterns number + map > > rulesApplied; - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); + RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, + transfer); - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; + // rule and (target) token map to specific output + // if rule has many patterns we will choose the first token only + map > ruleOutputs; - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; + // map (target) token to all matched rules ids and the number of pattern items of each rule + map > > tokenRules; - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); - // final outs - vector outs; - // number of possible combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; + RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, + tlTokens, tlTags, rulesApplied, attrs, lists, &vars, spaces, + localeId); + // final outs + vector outs; + // number of possible combinations + unsigned compNum; + // nodes for every token and rule + map > nodesPool; + // ambiguous informations + vector ambigInfo; - // rules combinations - vector > combNodes; + // rules combinations + vector > combNodes; - nodesPool = RuleExecution::getNodesPool (tokenRules); + nodesPool = RuleExecution::getNodesPool(tokenRules); - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, - spaces); + RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo, + &compNum); + RuleExecution::getOuts(&outs, &combNodes, ambigInfo, nodesPool, + ruleOutputs, spaces); - // write the outs - for (unsigned j = 0; j < outs.size (); j++) - { - interInFile << outs[j] << endl; - } + // write the outs + for (unsigned j = 0; j < outs.size(); j++) { + chunkerFile << outs[j] << endl; + } // interInFile << endl; - // delete AmbigInfo pointers - for (unsigned j = 0; j < ambigInfo.size (); j++) - { - // delete the dummy node pointers - set dummies; - for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++) - dummies.insert (ambigInfo[j]->combinations[k][0]); - for (set::iterator it = dummies.begin (); - it != dummies.end (); it++) - delete (*it); - - delete ambigInfo[j]; - } - // delete Node pointers - for (map >::iterator it = - nodesPool.begin (); it != nodesPool.end (); it++) - { - for (unsigned j = 0; j < it->second.size (); j++) - { - delete it->second[j]; + // delete AmbigInfo pointers + for (unsigned j = 0; j < ambigInfo.size(); j++) { + // delete the dummy node pointers + set dummies; + for (unsigned k = 0; k < ambigInfo[j]->combinations.size(); k++) + dummies.insert(ambigInfo[j]->combinations[k][0]); + for (set::iterator it = dummies.begin(); + it != dummies.end(); it++) + delete (*it); + + delete ambigInfo[j]; + } + // delete Node pointers + for (map >::iterator it = + nodesPool.begin(); it != nodesPool.end(); it++) { + for (unsigned j = 0; j < it->second.size(); j++) { + delete it->second[j]; + } + } } - } - } - lextorFile.close (); - interInFile.close (); -// cout << "RulesApplier finished!"; - } - else - { - cout << "ERROR in opening files!" << endl; - } + lextorFile.close(); + chunkerFile.close(); + newLextorFile.close(); + + cout << "There are " << goodSents << " good sentences from " << allSents + << endl; + } else { + cout << "ERROR in opening files!" << endl; + } - return 0; + return 0; } diff --git a/src/YasmetFormatter.cpp b/src/YasmetFormatter.cpp index 6b358ff..662fd3f 100644 --- a/src/YasmetFormatter.cpp +++ b/src/YasmetFormatter.cpp @@ -28,10 +28,14 @@ int main(int argc, char **argv) { string localeId, transferFilePath, lextorFilePath, targetFilePath, weightsFilePath, datasetsPath; + bool tagsFeats = false; int opt; - while ((opt = getopt(argc, argv, ":t:")) != -1) { + while ((opt = getopt(argc, argv, ":r:t")) != -1) { switch (opt) { case 't': + tagsFeats = true; + break; + case 'r': targetFilePath = optarg; break; case ':': @@ -68,7 +72,8 @@ int main(int argc, char **argv) { cout << "Error in parameters !" << endl; cout << "Parameters are : localeId transferFilePath lextorFilePath" - << " weightOutFilePath datasetsPath -t targetFilePath" << endl; + << " weightOutFilePath datasetsPath [-r targetFilePath] [-t]" + << endl; cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; @@ -84,9 +89,10 @@ int main(int argc, char **argv) { cout << "datasetsPath : Datasets destination to put in the generated yasmet files." << endl; - cout - << "targetFilePath : Target file path, if you want to remove \"bad\" sentences." + cout << "-r : Remove \"bad\" sentences (with # or @)." << endl; + cout << "targetFilePath : Target file path for these sentences." << endl; + cout << "-t : Tags as features in yasmet." << endl; return -1; } @@ -181,21 +187,17 @@ int main(int argc, char **argv) { // remove bad sentences with (*,#,@) string line; - if (!targetFilePath.empty()) { - bool isBad; - for (unsigned j = 0; j < outs.size(); j++) { - getline(targetFile, line); - if (line.find('*') != string::npos - || line.find('#') != string::npos - || line.find('@') != string::npos) { - isBad = true; - break; - } + bool isBad = false; + for (unsigned j = 0; j < outs.size(); j++) { + getline(targetFile, line); +// cout << line << " " << line.find('#') << " " << line.find('@') +// << endl; + if (line.find('#') != string::npos + || line.find('@') != string::npos) { + isBad = true; + break; } - if (isBad) - continue; } - goodSents++; // read weights vector weights; @@ -205,6 +207,11 @@ int main(int argc, char **argv) { weights.push_back(weight); } + if (!targetFilePath.empty() && isBad) + continue; + + goodSents++; + RuleExecution::normaliseWeights(&weights, ambigInfo); // Yasmet format preparing @@ -274,9 +281,11 @@ int main(int argc, char **argv) { word.replace(c, 1, "_"); features += " " + word + "_" + num + ":" + label; - for (unsigned d = 0; d < slTags[z].size(); d++) - features += " " + slTags[z][d] + "_" + num + ":" - + label; + + if (tagsFeats) + for (unsigned d = 0; d < slTags[z].size(); d++) + features += " " + slTags[z][d] + "_" + num + + ":" + label; } features += " #"; }