commit a770332e4abd5a518100fb1f64c85a36f7c3d0c6
Author: aboelhamd <aboelhamd.abotreka@gmail.com>
Date:   Mon May 20 23:58:01 2019 +0200

    Fix bugs, add command-line options, and make other modifications

diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp
index c7c2279..08eb485 100644
--- a/src/RulesApplier.cpp
+++ b/src/RulesApplier.cpp
@@ -24,20 +24,28 @@ using namespace std;
 using namespace pugi;
 using namespace elem;
 
-int
-main (int argc, char **argv)
-{
-  string localeId, transferFilePath, lextorFilePath, interInFilePath;
-
-  if (argc == 5)
-    {
-      localeId = argv[1];
-      transferFilePath = argv[2];
-      lextorFilePath = argv[3];
-      interInFilePath = argv[4];
-    }
-  else
-    {
+int main(int argc, char **argv) {
+	string localeId, transferFilePath, lextorFilePath, chunkerFilePath,
+			newLextorFilePath;
+
+	int opt;
+	while ((opt = getopt(argc, argv, ":u:")) != -1) {
+		switch (opt) {
+		case 'u':
+			newLextorFilePath = optarg;
+			break;
+		case '?':
+			printf("unknown option: %c\n", optopt);
+			return -1;
+		}
+	}
+
+	if (argc - optind == 4) {
+		localeId = argv[argc - 4];
+		transferFilePath = argv[argc - 3];
+		lextorFilePath = argv[argc - 2];
+		chunkerFilePath = argv[argc - 1];
+	} else {
 //      localeId = "es_ES";
 //      transferFilePath = "transferFile.t1x";
 //      sentenceFilePath = "spa-test.txt";
@@ -50,145 +58,163 @@ main (int argc, char **argv)
 //      lextorFilePath = "sample-lextor.txt";
 //      interInFilePath = "sample-inter.txt";
 
-      localeId = "es_ES";
-      transferFilePath =
-	  "/home/aboelhamd/apertium-eng-spa-ambiguous-rules/apertium-eng-spa.spa-eng.t1x";
-      lextorFilePath =
-	  "/home/aboelhamd/eclipse-workspace/machinetranslation/test-lextor.txt";
-      interInFilePath =
-	  "/home/aboelhamd/eclipse-workspace/machinetranslation/test-chunker.txt";
-
-      cout << "Error in parameters !" << endl;
-      cout << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath"
-	  << endl;
-      cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ"
-	  << endl;
-      cout << "transferFilePath : Apertium transfer file of the language pair used."
-	  << endl;
-      cout << "lextorFilePath : Apertium lextor file for the source language sentences."
-	  << endl;
-      cout
-	  << "interInFilePath : Output file name of this program which is the input for apertium interchunk."
-	  << endl;
-      return -1;
-    }
-
-  ifstream lextorFile (lextorFilePath.c_str ());
-  ofstream interInFile (interInFilePath.c_str ());
-  if (lextorFile.is_open () && interInFile.is_open ())
-    {
-      // load transfer file in an xml document object
-      xml_document transferDoc;
-      xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ());
-
-      if (string (result.description ()) != "No error")
-	{
-	  cout << "ERROR : " << result.description () << endl;
-	  return -1;
+		localeId = "es_ES";
+		transferFilePath =
+				"/home/aboelhamd/apertium-eng-spa-ambiguous-rules/apertium-eng-spa.spa-eng.t1x";
+		lextorFilePath =
+				"/home/aboelhamd/eclipse-workspace/machinetranslation/test-lextor.txt";
+		chunkerFilePath =
+				"/home/aboelhamd/eclipse-workspace/machinetranslation/test-chunker.txt";
+
+		cout << "Error in parameters !" << endl;
+		cout << "Parameters are : localeId transferFilePath"
+				<< " lextorFilePath chunkerFilePath [-u newlextorFilePath]"
+				<< endl;
+		cout
+				<< "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ"
+				<< endl;
+		cout
+				<< "transferFilePath : Apertium transfer file of the language pair used."
+				<< endl;
+		cout
+				<< "lextorFilePath : Apertium lextor file for the source language sentences."
+				<< endl;
+		cout
+				<< "chunkerFilePath : chunker file name of this program which is the input for apertium interchunk."
+				<< endl;
+		cout << "-u : remove sentences with unknown words." << endl;
+		cout
+				<< "newlextorFilePath : write the new sentences lextor in this lextor file."
+				<< endl;
+		return -1;
 	}
 
-      // xml node of the parent node (transfer) in the transfer file
-      xml_node transfer = transferDoc.child ("transfer");
+	ifstream lextorFile(lextorFilePath.c_str());
+	ofstream chunkerFile(chunkerFilePath.c_str());
+	ofstream newLextorFile;
+	if (!newLextorFilePath.empty())
+		newLextorFile = ofstream(newLextorFilePath.c_str());
+	if (lextorFile.is_open() && chunkerFile.is_open()
+			&& (newLextorFilePath.empty() || newLextorFile.is_open())) {
+		// load transfer file in an xml document object
+		xml_document transferDoc;
+		xml_parse_result result = transferDoc.load_file(
+				transferFilePath.c_str());
+
+		if (string(result.description()) != "No error") {
+			cout << "ERROR : " << result.description() << endl;
+			return -1;
+		}
 
-      map<string, vector<vector<string> > > attrs = RuleParser::getAttrs (transfer);
-      map<string, string> vars = RuleParser::getVars (transfer);
-      map<string, vector<string> > lists = RuleParser::getLists (transfer);
+		// xml node of the parent node (transfer) in the transfer file
+		xml_node transfer = transferDoc.child("transfer");
+
+		map<string, vector<vector<string> > > attrs = RuleParser::getAttrs(
+				transfer);
+		map<string, string> vars = RuleParser::getVars(transfer);
+		map<string, vector<string> > lists = RuleParser::getLists(transfer);
+
+		unsigned allSents = 0, goodSents = 0;
+		string tokenizedSentence;
+		while (getline(lextorFile, tokenizedSentence)) {
+			allSents++;
+			if (!newLextorFilePath.empty()
+					&& tokenizedSentence.find("^*") != string::npos)
+				continue;
+			goodSents++;
+			// write to new lextor file
+			newLextorFile << tokenizedSentence << endl;
 
-//      unsigned i = 0;
-      string tokenizedSentence;
-      while (getline (lextorFile, tokenizedSentence))
-	{
 //	  cout << i++ << endl;
 
-	  // spaces after each token
-	  vector<string> spaces;
+// spaces after each token
+			vector<string> spaces;
 
-	  // tokens in the sentence order
-	  vector<string> slTokens, tlTokens;
+			// tokens in the sentence order
+			vector<string> slTokens, tlTokens;
 
-	  // tags of tokens in order
-	  vector<vector<string> > slTags, tlTags;
+			// tags of tokens in order
+			vector<vector<string> > slTags, tlTags;
 
-	  RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces,
-					 tokenizedSentence);
+			RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags,
+					&tlTags, &spaces, tokenizedSentence);
 
-	  // map of tokens ids and their matched categories
-	  map<unsigned, vector<string> > catsApplied;
+			// map of tokens ids and their matched categories
+			map<unsigned, vector<string> > catsApplied;
 
-	  RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer);
+			RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer);
 
-	  // map of matched rules and a pair of first token id and patterns number
-	  map<xml_node, vector<pair<unsigned, unsigned> > > rulesApplied;
+			// map of matched rules and a pair of first token id and patterns number
+			map<xml_node, vector<pair<unsigned, unsigned> > > rulesApplied;
 
-	  RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer);
+			RuleParser::matchRules(&rulesApplied, slTokens, catsApplied,
+					transfer);
 
-	  // rule and (target) token map to specific output
-	  // if rule has many patterns we will choose the first token only
-	  map<unsigned, map<unsigned, string> > ruleOutputs;
+			// rule and (target) token map to specific output
+			// if rule has many patterns we will choose the first token only
+			map<unsigned, map<unsigned, string> > ruleOutputs;
 
-	  // map (target) token to all matched rules ids and the number of pattern items of each rule
-	  map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules;
+			// map (target) token to all matched rules ids and the number of pattern items of each rule
+			map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules;
 
-	  RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens,
-				   tlTags, rulesApplied, attrs, lists, &vars, spaces,
-				   localeId);
-	  // final outs
-	  vector<string> outs;
-	  // number of possible combinations
-	  unsigned compNum;
-	  // nodes for every token and rule
-	  map<unsigned, vector<RuleExecution::Node*> > nodesPool;
-	  // ambiguous informations
-	  vector<RuleExecution::AmbigInfo*> ambigInfo;
+			RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags,
+					tlTokens, tlTags, rulesApplied, attrs, lists, &vars, spaces,
+					localeId);
+			// final outs
+			vector<string> outs;
+			// number of possible combinations
+			unsigned compNum;
+			// nodes for every token and rule
+			map<unsigned, vector<RuleExecution::Node*> > nodesPool;
+			// ambiguous informations
+			vector<RuleExecution::AmbigInfo*> ambigInfo;
 
-	  // rules combinations
-	  vector<vector<RuleExecution::Node*> > combNodes;
+			// rules combinations
+			vector<vector<RuleExecution::Node*> > combNodes;
 
-	  nodesPool = RuleExecution::getNodesPool (tokenRules);
+			nodesPool = RuleExecution::getNodesPool(tokenRules);
 
-	  RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum);
-	  RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs,
-				  spaces);
+			RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo,
+					&compNum);
+			RuleExecution::getOuts(&outs, &combNodes, ambigInfo, nodesPool,
+					ruleOutputs, spaces);
 
-	  // write the outs
-	  for (unsigned j = 0; j < outs.size (); j++)
-	    {
-	      interInFile << outs[j] << endl;
-	    }
+			// write the outs
+			for (unsigned j = 0; j < outs.size(); j++) {
+				chunkerFile << outs[j] << endl;
+			}
 //	  interInFile << endl;
 
-	  // delete AmbigInfo pointers
-	  for (unsigned j = 0; j < ambigInfo.size (); j++)
-	    {
-	      // delete the dummy node pointers
-	      set<RuleExecution::Node*> dummies;
-	      for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++)
-		dummies.insert (ambigInfo[j]->combinations[k][0]);
-	      for (set<RuleExecution::Node*>::iterator it = dummies.begin ();
-		  it != dummies.end (); it++)
-		delete (*it);
-
-	      delete ambigInfo[j];
-	    }
-	  // delete Node pointers
-	  for (map<unsigned, vector<RuleExecution::Node*> >::iterator it =
-	      nodesPool.begin (); it != nodesPool.end (); it++)
-	    {
-	      for (unsigned j = 0; j < it->second.size (); j++)
-		{
-		  delete it->second[j];
+			// delete AmbigInfo pointers
+			for (unsigned j = 0; j < ambigInfo.size(); j++) {
+				// delete the dummy node pointers
+				set<RuleExecution::Node*> dummies;
+				for (unsigned k = 0; k < ambigInfo[j]->combinations.size(); k++)
+					dummies.insert(ambigInfo[j]->combinations[k][0]);
+				for (set<RuleExecution::Node*>::iterator it = dummies.begin();
+						it != dummies.end(); it++)
+					delete (*it);
+
+				delete ambigInfo[j];
+			}
+			// delete Node pointers
+			for (map<unsigned, vector<RuleExecution::Node*> >::iterator it =
+					nodesPool.begin(); it != nodesPool.end(); it++) {
+				for (unsigned j = 0; j < it->second.size(); j++) {
+					delete it->second[j];
+				}
+			}
 		}
-	    }
-	}
 
-      lextorFile.close ();
-      interInFile.close ();
-//      cout << "RulesApplier finished!";
-    }
-  else
-    {
-      cout << "ERROR in opening files!" << endl;
-    }
+		lextorFile.close();
+		chunkerFile.close();
+		newLextorFile.close();
+
+		cout << "There are " << goodSents << " good sentences from " << allSents
+				<< endl;
+	} else {
+		cout << "ERROR in opening files!" << endl;
+	}
 
-  return 0;
+	return 0;
 }
diff --git a/src/YasmetFormatter.cpp b/src/YasmetFormatter.cpp
index 6b358ff..662fd3f 100644
--- a/src/YasmetFormatter.cpp
+++ b/src/YasmetFormatter.cpp
@@ -28,10 +28,14 @@ int main(int argc, char **argv) {
 	string localeId, transferFilePath, lextorFilePath, targetFilePath,
 			weightsFilePath, datasetsPath;
 
+	bool tagsFeats = false;
 	int opt;
-	while ((opt = getopt(argc, argv, ":t:")) != -1) {
+	while ((opt = getopt(argc, argv, ":r:t")) != -1) {
 		switch (opt) {
 		case 't':
+			tagsFeats = true;
+			break;
+		case 'r':
 			targetFilePath = optarg;
 			break;
 		case ':':
@@ -68,7 +72,8 @@ int main(int argc, char **argv) {
 
 		cout << "Error in parameters !" << endl;
 		cout << "Parameters are : localeId transferFilePath lextorFilePath"
-				<< " weightOutFilePath datasetsPath -t targetFilePath" << endl;
+				<< " weightOutFilePath datasetsPath [-r targetFilePath] [-t]"
+				<< endl;
 		cout
 				<< "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ"
 				<< endl;
@@ -84,9 +89,10 @@ int main(int argc, char **argv) {
 		cout
 				<< "datasetsPath : Datasets destination to put in the generated yasmet files."
 				<< endl;
-		cout
-				<< "targetFilePath : Target file path, if you want to remove \"bad\" sentences."
+		cout << "-r : Remove \"bad\" sentences (with # or @)." << endl;
+		cout << "targetFilePath : Target file path for these sentences."
 				<< endl;
+		cout << "-t : Tags as features in yasmet." << endl;
 		return -1;
 	}
 
@@ -181,21 +187,17 @@ int main(int argc, char **argv) {
 			// remove bad sentences with (*,#,@)
 			string line;
 
-			if (!targetFilePath.empty()) {
-				bool isBad;
-				for (unsigned j = 0; j < outs.size(); j++) {
-					getline(targetFile, line);
-					if (line.find('*') != string::npos
-							|| line.find('#') != string::npos
-							|| line.find('@') != string::npos) {
-						isBad = true;
-						break;
-					}
+			bool isBad = false;
+			for (unsigned j = 0; j < outs.size(); j++) {
+				getline(targetFile, line);
+//				cout << line << "  " << line.find('#') << "  " << line.find('@')
+//						<< endl;
+				if (line.find('#') != string::npos
+						|| line.find('@') != string::npos) {
+					isBad = true;
+					break;
 				}
-				if (isBad)
-					continue;
 			}
-			goodSents++;
 
 			// read weights
 			vector<float> weights;
@@ -205,6 +207,11 @@ int main(int argc, char **argv) {
 				weights.push_back(weight);
 			}
 
+			if (!targetFilePath.empty() && isBad)
+				continue;
+
+			goodSents++;
+
 			RuleExecution::normaliseWeights(&weights, ambigInfo);
 
 			// Yasmet format preparing
@@ -274,9 +281,11 @@ int main(int argc, char **argv) {
 									word.replace(c, 1, "_");
 
 							features += " " + word + "_" + num + ":" + label;
-							for (unsigned d = 0; d < slTags[z].size(); d++)
-								features += " " + slTags[z][d] + "_" + num + ":"
-										+ label;
+
+							if (tagsFeats)
+								for (unsigned d = 0; d < slTags[z].size(); d++)
+									features += " " + slTags[z][d] + "_" + num
+											+ ":" + label;
 						}
 						features += " #";
 					}