commit 2485561b17caeb53bec2dff3f3f8463a9212b4fc Author: aboelhamd Date: Mon May 20 18:12:20 2019 +0200 Finish adding tags as features and removing bad sentences as an option diff --git a/src/BeamSearch.cpp b/src/BeamSearch.cpp index 90c3f56..9ca5738 100644 --- a/src/BeamSearch.cpp +++ b/src/BeamSearch.cpp @@ -61,7 +61,7 @@ main (int argc, char **argv) cout << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath modelsDest beamSize" << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; cout << "transferFilePath : Apertium transfer file of the language pair used." << endl; diff --git a/src/CombAlign.cpp b/src/CombAlign.cpp index 60790eb..71a4673 100644 --- a/src/CombAlign.cpp +++ b/src/CombAlign.cpp @@ -68,7 +68,7 @@ main (int argc, char **argv) cout << "Error in parameters !" << endl; cout << "Parameters are : localeId transferFilePath lextorFilePath" << " chunkerFilePath referenceFilePath newRefFilePath" << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; cout << "transferFilePath : Apertium transfer file of the language pair used." << endl; diff --git a/src/LangModAnalysis.cpp b/src/LangModAnalysis.cpp index 03a308a..4319ca5 100644 --- a/src/LangModAnalysis.cpp +++ b/src/LangModAnalysis.cpp @@ -102,7 -102,7 @@ int main(int argc, char **argv) { << " [-a analysisFilePath] [-b bestModFilePath] [-r randModFilePath]" << endl; cout - << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; cout << "transferFilePath : Apertium transfer file of the language pair used." << endl; diff --git a/src/OrderAmbigSents.cpp b/src/OrderAmbigSents.cpp index ddde9d8..fac18bc 100644 --- a/src/OrderAmbigSents.cpp +++ b/src/OrderAmbigSents.cpp @@ -70,7 +70,7 @@ main (int argc, char **argv) cout << "Parameters are : localeId transferFilePath sourceFilePath" << " lextorFilePath targetFilePath orderedSrcFilePath orderedTrgFilePath" << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; cout << "transferFilePath : Apertium transfer file of the language pair used." << endl; diff --git a/src/RulesApplier.cpp b/src/RulesApplier.cpp index db4a15e..c7c2279 100644 --- a/src/RulesApplier.cpp +++ b/src/RulesApplier.cpp @@ -61,7 +61,7 @@ main (int argc, char **argv) cout << "Error in parameters !" << endl; cout << "Parameters are : localeId transferFilePath lextorFilePath interInFilePath" << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" + cout << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" << endl; cout << "transferFilePath : Apertium transfer file of the language pair used." 
<< endl; diff --git a/src/YasmetFormatter.cpp b/src/YasmetFormatter.cpp index 80134a2..6b358ff 100644 --- a/src/YasmetFormatter.cpp +++ b/src/YasmetFormatter.cpp @@ -24,22 +24,32 @@ using namespace std; using namespace pugi; using namespace elem; -int -main (int argc, char **argv) -{ - string lextorFilePath = "lextor.txt", weightOutFilePath = "weights.txt", localeId = - "kk_KZ", transferFilePath = "transferFile.tx1", datasetsPath = "datasets"; - - if (argc == 6) - { - localeId = argv[1]; - transferFilePath = argv[2]; - lextorFilePath = argv[3]; - weightOutFilePath = argv[4]; - datasetsPath = argv[5]; - } - else - { +int main(int argc, char **argv) { + string localeId, transferFilePath, lextorFilePath, targetFilePath, + weightsFilePath, datasetsPath; + + int opt; + while ((opt = getopt(argc, argv, ":t:")) != -1) { + switch (opt) { + case 't': + targetFilePath = optarg; + break; + case ':': + printf("option %c needs a value\n", optopt); + return -1; + case '?': + printf("unknown option: %c\n", optopt); + return -1; + } + } + + if (argc - optind == 5) { + localeId = argv[argc - 5]; + transferFilePath = argv[argc - 4]; + lextorFilePath = argv[argc - 3]; + weightsFilePath = argv[argc - 2]; + datasetsPath = argv[argc - 1]; + } else { // localeId = "es_ES"; // transferFilePath = "transferFile.t1x"; // sentenceFilePath = "spa-test.txt"; @@ -49,242 +59,262 @@ main (int argc, char **argv) // outputFilePath = "output.out"; // datasetsPath = "datasetstry2"; - localeId = "kk_KZ"; - transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; - lextorFilePath = "sample-lextor.txt"; - weightOutFilePath = "norm-weights.txt"; - datasetsPath = "datasetstry1234"; - - cout << "Error in parameters !" << endl; - cout - << "Parameters are : localeId transferFilePath lextorFilePath weightOutFilePath datasetsPath" - << endl; - cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ" - << endl; - cout << "transferFilePath : Apertium transfer file of the language pair used." - << endl; - cout << "lextorFilePath : Apertium lextor file for the source language sentences." - << endl; - cout - << "weightOutFilePath : Language model weights file for the source language sentences." - << endl; - cout << "datasetsPath : Datasets destination to put in the generated yasmet files." - << endl; - return -1; - } - - ifstream lextorFile (lextorFilePath.c_str ()); - ifstream weightOutFile (weightOutFilePath.c_str ()); - if (lextorFile.is_open () && weightOutFile.is_open ()) - { - // load transfer file in an xml document object - xml_document transferDoc; - xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ()); - - if (string (result.description ()) != "No error") - { - cout << "ERROR : " << result.description () << endl; - return -1; + localeId = "kk_KZ"; + transferFilePath = "apertium-kaz-tur.kaz-tur.t1x"; + lextorFilePath = "sample-lextor.txt"; + weightsFilePath = "norm-weights.txt"; + datasetsPath = "datasetstry1234"; + targetFilePath = "target.txt"; + + cout << "Error in parameters !" << endl; + cout << "Parameters are : localeId transferFilePath lextorFilePath" + << " weightOutFilePath datasetsPath -t targetFilePath" << endl; + cout + << "localeId : ICU locale ID for the source language. For Kazakh => kk_KZ" + << endl; + cout + << "transferFilePath : Apertium transfer file of the language pair used." + << endl; + cout + << "lextorFilePath : Apertium lextor file for the source language sentences." 
+ << endl; + cout + << "weightOutFilePath : Language model weights file for the source language sentences." + << endl; + cout + << "datasetsPath : Datasets destination to put in the generated yasmet files." + << endl; + cout + << "targetFilePath : Target file path, if you want to remove \"bad\" sentences." + << endl; + return -1; } - // xml node of the parent node (transfer) in the transfer file - xml_node transfer = transferDoc.child ("transfer"); - - map > > attrs = RuleParser::getAttrs (transfer); - map vars = RuleParser::getVars (transfer); - map > lists = RuleParser::getLists (transfer); - - string tokenizedSentence; - while (getline (lextorFile, tokenizedSentence)) - { - // cout << i << endl; - - // spaces after each token - vector spaces; - - // tokens in the sentence order - vector slTokens, tlTokens; - - // tags of tokens in order - vector > slTags, tlTags; - - RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces, - tokenizedSentence); - - // map of tokens ids and their matched categories - map > catsApplied; - - RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer); - - // map of matched rules and a pair of first token id and patterns number - map > > rulesApplied; - - RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer); - - // rule and (target) token map to specific output - // if rule has many patterns we will choose the first token only - map > ruleOutputs; - - // map (target) token to all matched rules ids and the number of pattern items of each rule - map > > tokenRules; - - RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens, - tlTags, rulesApplied, attrs, lists, &vars, spaces, - localeId); - - // final outs - vector outs; - // number of generated combinations - unsigned compNum; - // nodes for every token and rule - map > nodesPool; - // ambiguous informations - vector ambigInfo; - // rules combinations - vector > combNodes; - - nodesPool = RuleExecution::getNodesPool (tokenRules); - - RuleExecution::getAmbigInfo (tokenRules, nodesPool, &ambigInfo, &compNum); - - RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs, - spaces); - - vector newAmbigInfo; - for (unsigned j = 0; j < ambigInfo.size (); j++) - if (ambigInfo[j]->combinations.size () > 1) - newAmbigInfo.push_back (ambigInfo[j]); - ambigInfo = newAmbigInfo; - - // read weights - string line; - vector weights; - for (unsigned j = 0; j < outs.size (); j++) - { - getline (weightOutFile, line); - float weight = strtof (line.c_str (), NULL); - weights.push_back (weight); - } - - RuleExecution::normaliseWeights (&weights, ambigInfo); - - // Yasmet format preparing - // make a directory if not found - mkdir (datasetsPath.c_str (), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - - unsigned weigInd = 0; - for (unsigned i = 0; i < ambigInfo.size (); i++) - { - RuleExecution::AmbigInfo* ambig = ambigInfo[i]; - - // name of the file is the concatenation of rules ids - string rulesNums; - for (unsigned x = 0; x < ambig->combinations.size (); x++) - { - // avoid dummy node - for (unsigned y = 1; y < ambig->combinations[x].size (); y++) - { - stringstream ss; -// ss->clear (); - ss << ambig->combinations[x][y]->ruleId; - rulesNums += ss.str (); - - if (y + 1 < ambig->combinations[x].size ()) - rulesNums += "_"; - } - rulesNums += "+"; - } - - // if it's the first time to open , put the number of classes - bool firstTime = true; - if (FILE *file = fopen ((datasetsPath + string ("/") + rulesNums).c_str (), - "r")) - { - firstTime = 
false; - fclose (file); + ifstream lextorFile(lextorFilePath.c_str()); + ifstream weightOutFile(weightsFilePath.c_str()); + ifstream targetFile; + // open the optional target file only when a path was given + if (!targetFilePath.empty()) + targetFile.open(targetFilePath.c_str()); + if (lextorFile.is_open() && weightOutFile.is_open() + && (targetFilePath.empty() || targetFile.is_open())) { + // load transfer file in an xml document object + xml_document transferDoc; + xml_parse_result result = transferDoc.load_file( + transferFilePath.c_str()); + + if (string(result.description()) != "No error") { + cout << "ERROR : " << result.description() << endl; + return -1; } -// stringstream* dataset = new stringstream (); - ofstream dataset ((datasetsPath + string ("/") + rulesNums).c_str (), - ofstream::app); - - if (firstTime) - dataset << ambig->combinations.size () << endl; - - for (unsigned x = 0; x < ambig->combinations.size (); x++) - { - - dataset << x << " $ "; - - float weight = weights[x + weigInd]; - - dataset << weight << " #"; - - string features; - for (unsigned v = 0; v < ambig->combinations.size (); v++) - { - stringstream ss; -// ss.clear (); - ss << v; - string label = ss.str (); - - for (unsigned z = ambig->firTokId; - z < ambig->firTokId + ambig->maxPat; z++) - { - stringstream ss; -// ss->clear (); - ss << z - ambig->firTokId; - string num = ss.str (); -// *num = ss->str (); - string word = CLExec::toLowerCase (slTokens[z], localeId); + // xml node of the parent node (transfer) in the transfer file + xml_node transfer = transferDoc.child("transfer"); + + map > > attrs = RuleParser::getAttrs( + transfer); + map vars = RuleParser::getVars(transfer); + map > lists = RuleParser::getLists(transfer); + + unsigned allSents = 0, goodSents = 0; + string tokenizedSentence; + while (getline(lextorFile, tokenizedSentence)) { + allSents++; + // spaces after each token + vector spaces; + + // tokens in the sentence order + vector slTokens, tlTokens; + + // tags of tokens in order + vector > slTags, tlTags; + + RuleParser::sentenceTokenizer(&slTokens, &tlTokens, &slTags, + &tlTags, &spaces, tokenizedSentence); + + // map of tokens ids and their matched categories + map > catsApplied; + + RuleParser::matchCats(&catsApplied, slTokens, slTags, transfer); + + // map of matched rules and a pair of first token id and patterns number + map > > rulesApplied; + + RuleParser::matchRules(&rulesApplied, slTokens, catsApplied, + transfer); + + // rule and (target) token map to specific output + // if rule has many patterns we will choose the first token only + map > ruleOutputs; + + // map (target) token to all matched rules ids and the number of pattern items of each rule + map > > tokenRules; + + RuleExecution::ruleOuts(&ruleOutputs, &tokenRules, slTokens, slTags, + tlTokens, tlTags, rulesApplied, attrs, lists, &vars, spaces, + localeId); + + // final outs + vector outs; + // number of generated combinations + unsigned compNum; + // nodes for every token and rule + map > nodesPool; + // ambiguous information + vector ambigInfo; + // rules combinations + vector > combNodes; + + nodesPool = RuleExecution::getNodesPool(tokenRules); + + RuleExecution::getAmbigInfo(tokenRules, nodesPool, &ambigInfo, + &compNum); + + RuleExecution::getOuts(&outs, &combNodes, ambigInfo, nodesPool, + ruleOutputs, spaces); + + vector newAmbigInfo; + for (unsigned j = 0; j < ambigInfo.size(); j++) + if (ambigInfo[j]->combinations.size() > 1) + newAmbigInfo.push_back(ambigInfo[j]); + ambigInfo = newAmbigInfo; + + // remove bad sentences, i.e. those whose target output contains (*,#,@) + string line; + + if 
(!targetFilePath.empty()) { + bool isBad = false; + for (unsigned j = 0; j < outs.size(); j++) { + getline(targetFile, line); + // flag the sentence but keep reading: one target line + // per out must be consumed so the file stays aligned + if (line.find('*') != string::npos + || line.find('#') != string::npos + || line.find('@') != string::npos) + isBad = true; + } + if (isBad) { + // skip this sentence's weights as well, so the + // weights file stays aligned for later sentences + for (unsigned j = 0; j < outs.size(); j++) + getline(weightOutFile, line); + continue; + } + } + goodSents++; + + // read weights + vector weights; + for (unsigned j = 0; j < outs.size(); j++) { + getline(weightOutFile, line); + float weight = strtof(line.c_str(), NULL); + weights.push_back(weight); + } - for (unsigned c = 0; c < word.length (); c++) - if (word[c] == ' ') - word.replace (c, 1, "_"); + RuleExecution::normaliseWeights(&weights, ambigInfo); + + // Yasmet format preparing + // make a directory if not found + mkdir(datasetsPath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); + + unsigned weigInd = 0; + for (unsigned i = 0; i < ambigInfo.size(); i++) { + RuleExecution::AmbigInfo* ambig = ambigInfo[i]; + + // name of the file is the concatenation of rules ids + string rulesNums; + for (unsigned x = 0; x < ambig->combinations.size(); x++) { + // avoid dummy node + for (unsigned y = 1; y < ambig->combinations[x].size(); + y++) { + stringstream ss; + ss << ambig->combinations[x][y]->ruleId; + rulesNums += ss.str(); + + if (y + 1 < ambig->combinations[x].size()) + rulesNums += "_"; + } + rulesNums += "+"; + } + + // if it's the first time to open, put the number of classes + bool firstTime = true; + if (FILE *file = fopen( + (datasetsPath + string("/") + rulesNums).c_str(), + "r")) { + firstTime = false; + fclose(file); + } + + ofstream dataset( + (datasetsPath + string("/") + rulesNums).c_str(), + ofstream::app); + + if (firstTime) + dataset << ambig->combinations.size() << endl; + + for (unsigned x = 0; x < ambig->combinations.size(); x++) { + + dataset << x << " $ "; + + float weight = weights[x + weigInd]; + + dataset << weight << " #"; + + string features; + for (unsigned v = 0; v < ambig->combinations.size(); v++) { + stringstream ss; + ss << v; + string label = ss.str(); + + for (unsigned z = ambig->firTokId; + z < ambig->firTokId + ambig->maxPat; z++) { + stringstream ss; + ss << z - ambig->firTokId; + string num = ss.str(); + string word = CLExec::toLowerCase(slTokens[z], + localeId); + + for (unsigned c = 0; c < word.length(); c++) + if (word[c] == ' ') + word.replace(c, 1, "_"); + + features += " " + word + "_" + num + ":" + label; + for (unsigned d = 0; d < slTags[z].size(); d++) + features += " " + slTags[z][d] + "_" + num + ":" + + label; + } + features += " #"; + } + dataset << features << endl; + } + weigInd += ambig->combinations.size(); + dataset.close(); + } - features += " " + word + "_" + num + ":" + label; + // delete AmbigInfo pointers + for (unsigned j = 0; j < ambigInfo.size(); j++) { + // delete the dummy node pointers + set dummies; + for (unsigned k = 0; k < ambigInfo[j]->combinations.size(); k++) + dummies.insert(ambigInfo[j]->combinations[k][0]); + for (set::iterator it = dummies.begin(); + it != dummies.end(); it++) + delete (*it); + + delete ambigInfo[j]; + } - features += " #"; - } - dataset << features << endl; -// delete (features); - } - weigInd += ambig->combinations.size (); -// dataset.close (); - } - - // delete AmbigInfo pointers - for (unsigned j = 0; j < ambigInfo.size (); j++) - { - // delete the dummy node pointers - set dummies; - for (unsigned k = 0; k < ambigInfo[j]->combinations.size (); k++) - dummies.insert (ambigInfo[j]->combinations[k][0]); - for (set::iterator it = dummies.begin (); - it != dummies.end (); it++) - delete (*it); - - 
delete ambigInfo[j]; - } - // delete Node pointers - for (map >::iterator it = - nodesPool.begin (); it != nodesPool.end (); it++) - { - for (unsigned j = 0; j < it->second.size (); j++) - { - delete it->second[j]; + // delete Node pointers + for (map >::iterator it = + nodesPool.begin(); it != nodesPool.end(); it++) { + for (unsigned j = 0; j < it->second.size(); j++) { + delete it->second[j]; + } + } + } - } + lextorFile.close(); + weightOutFile.close(); -// } + cout << "There are " << goodSents << " good sentences out of " << allSents + << endl; + } else { + cout << "ERROR in opening files!" << endl; } - lextorFile.close (); - weightOutFile.close (); - } - else - { - cout << "ERROR in opening files!" << endl; - } - - return 0; + + return 0; } diff --git a/training-yasmet.sh b/training-yasmet.sh index be738af..74a316a 100644 --- a/training-yasmet.sh +++ b/training-yasmet.sh @@ -16,7 +16,7 @@ pairPar=$4; # 5) model weight program path. modelWeight=$5; -# 6) ICU locale ID for the source language. For Kazakh => kk-KZ +# 6) ICU locale ID for the source language. For Kazakh => kk_KZ localeId=$6; # 7) Analysis output file name.
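A minimal invocation sketch for the updated YasmetFormatter, following the usage message in the diff above. The binary name yasmet-formatter is an assumption for illustration; the file names are the fallback defaults from the code. Since getopt in POSIX mode stops scanning at the first non-option argument, the -t option is placed before the five positional parameters:

    # Hypothetical binary name; the arguments mirror the usage message.
    # -t target.txt enables the new filter that skips sentences whose
    # target translations contain the Apertium marks *, # or @.
    ./yasmet-formatter -t target.txt \
        kk_KZ apertium-kaz-tur.kaz-tur.t1x sample-lextor.txt \
        norm-weights.txt datasetstry1234

GNU getopt also permutes argv, so on glibc systems -t may equally follow the positional arguments; the order shown is the portable one.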
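Each file the formatter writes under datasetsPath is one yasmet training set, named by concatenating the rule ids of the competing combinations (for example 5_7+5_9+, a hypothetical name). As the loop above builds it, the first line holds the number of classes; each following line gives one combination's class index, a "$", its normalised weight, and then one "#"-delimited feature block per class label, listing the lowercased pattern words and their tags indexed by pattern position. A hand-made sample with two classes over the hypothetical tokens kitap/oqy tagged n/v:

    2
    0 $ 0.64 # kitap_0:0 n_0:0 oqy_1:0 v_1:0 # kitap_0:1 n_0:1 oqy_1:1 v_1:1 #
    1 $ 0.36 # kitap_0:0 n_0:0 oqy_1:0 v_1:0 # kitap_0:1 n_0:1 oqy_1:1 v_1:1 #

The tag features (n_0, v_1) are what this commit adds; note that the feature string does not depend on the class index, so it repeats verbatim on every line, and only the index and weight differ.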