commit 166c05013f1d1cc976a8d00a7208f60955ad056f
Author: aboelhamd
Date:   Mon May 6 23:55:01 2019 +0200

    Adding new class to order sentences by their ambiguity

diff --git a/src/OrderAmbigSents.cpp b/src/OrderAmbigSents.cpp
new file mode 100644
index 0000000..0aa81c6
--- /dev/null
+++ b/src/OrderAmbigSents.cpp
@@ -0,0 +1,218 @@
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iterator>
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include <vector>
+#include <map>
+#include <set>
+#include <utility>
+#include <algorithm>
+#include <ctime>
+#include <cfloat>
+
+#include "../pugixml/pugixml.hpp"
+#include "RuleParser.h"
+#include "RuleExecution.h"
+#include "TranElemLiterals.h"
+#include "CLExec.h"
+
+using namespace std;
+using namespace pugi;
+using namespace elem;
+
+// Reads parallel source / lextor / target files, counts the number of
+// ambiguous rule combinations for every sentence, and writes the source
+// and target sentences back out ordered from most to least ambiguous.
+int
+main (int argc, char **argv)
+{
+  string localeId, transferFilePath, sourceFilePath, srcLexFilePath, targetFilePath,
+      orderedSrcFilePath, orderedTrgFilePath;
+
+  if (argc == 8)
+    {
+      localeId = argv[1];
+      transferFilePath = argv[2];
+      sourceFilePath = argv[3];
+      srcLexFilePath = argv[4];
+      targetFilePath = argv[5];
+      orderedSrcFilePath = argv[6];
+      orderedTrgFilePath = argv[7];
+    }
+  else
+    {
+//      localeId = "es_ES";
+//      transferFilePath = "transferFile.t1x";
+//      sentenceFilePath = "spa-test.txt";
+//      lextorFilePath = "spa-test.lextor";
+//      interInFilePath = "inter2.txt";
+
+//      localeId = "kk_KZ";
+//      transferFilePath = "apertium-kaz-tur.kaz-tur.t1x";
+//      sentenceFilePath = "sample-sentences.txt";
+//      lextorFilePath = "sample-lextor.txt";
+//      interInFilePath = "sample-inter.txt";
+
+      // developer defaults (dead stores: we error out right below anyway)
+      localeId = "es_ES";
+      transferFilePath =
+	  "/home/aboelhamd/apertium-eng-spa-ambiguous-rules/apertium-eng-spa.spa-eng.t1x";
+      sourceFilePath =
+	  "/home/aboelhamd/eclipse-workspace/machinetranslation/xbe-sentences.txt";
+      srcLexFilePath =
+	  "/home/aboelhamd/eclipse-workspace/machinetranslation/xbe-lextor.txt";
+      targetFilePath =
+	  "/home/aboelhamd/eclipse-workspace/machinetranslation/xbe-transfer.txt";
+      orderedSrcFilePath =
+	  "/home/aboelhamd/eclipse-workspace/machinetranslation/ordered-xbe-source.txt";
+      orderedTrgFilePath =
+	  "/home/aboelhamd/eclipse-workspace/machinetranslation/ordered-xbe-target.txt";
+
+      cout << "Error in parameters !" << endl;
+      cout << "Parameters are : localeId transferFilePath sourceFilePath"
+	  << " lextorFilePath targetFilePath orderedSrcFilePath orderedTrgFilePath"
+	  << endl;
+      cout << "localeId : ICU locale ID for the source language. For Kazakh => kk-KZ"
+	  << endl;
+      cout << "transferFilePath : Apertium transfer file of the language pair used."
+	  << endl;
+      cout << "sourceFilePath : File for the source language sentences." << endl;
+      cout << "lextorFilePath : Apertium lextor file for the source language sentences."
+	  << endl;
+      cout << "targetFilePath : File for the target language sentences." << endl;
+      cout
+	  << "orderedSrcFilePath : New file for the ordered source language sentences by most ambiguous."
+	  << endl;
+      cout
+	  << "orderedTrgFilePath : New file for the ordered target language sentences by most ambiguous."
+	  << endl;
+      return -1;
+    }
+
+  ifstream lextorFile (srcLexFilePath.c_str ());
+  ifstream sourceFile (sourceFilePath.c_str ());
+  ifstream targetFile (targetFilePath.c_str ());
+  ofstream orderedSrcFile (orderedSrcFilePath.c_str ());
+  ofstream orderedTrgFile (orderedTrgFilePath.c_str ());
+  if (lextorFile.is_open () && sourceFile.is_open () && orderedSrcFile.is_open ()
+      && targetFile.is_open () && orderedTrgFile.is_open ())
+    {
+      // load transfer file in an xml document object
+      xml_document transferDoc;
+      xml_parse_result result = transferDoc.load_file (transferFilePath.c_str ());
+
+      if (string (result.description ()) != "No error")
+	{
+	  cout << "ERROR : " << result.description () << endl;
+	  return -1;
+	}
+
+      // xml node of the parent node (transfer) in the transfer file
+      xml_node transfer = transferDoc.child ("transfer");
+
+      map<string, vector<vector<string> > > attrs = RuleParser::getAttrs (transfer);
+      map<string, string> vars = RuleParser::getVars (transfer);
+      map<string, vector<string> > lists = RuleParser::getLists (transfer);
+
+      vector<string> orderedSources, orderedTargets;
+      vector<unsigned> ambigCounts;
+
+      unsigned i = 0;
+      string tokenizedSentence, sourceSentence, targetSentence;
+      while (getline (lextorFile, tokenizedSentence)
+	  && getline (sourceFile, sourceSentence) && getline (targetFile, targetSentence))
+	{
+	  cout << i++ << endl;
+
+	  // spaces after each token
+	  vector<string> spaces;
+
+	  // tokens in the sentence order
+	  vector<string> slTokens, tlTokens;
+
+	  // tags of tokens in order
+	  vector<vector<string> > slTags, tlTags;
+
+	  RuleParser::sentenceTokenizer (&slTokens, &tlTokens, &slTags, &tlTags, &spaces,
+					 tokenizedSentence);
+
+	  // map of tokens ids and their matched categories
+	  map<unsigned, vector<string> > catsApplied;
+
+	  RuleParser::matchCats (&catsApplied, slTokens, slTags, transfer);
+
+	  // map of matched rules and a pair of first token id and patterns number
+	  map<xml_node, vector<pair<unsigned, unsigned> > > rulesApplied;
+
+	  RuleParser::matchRules (&rulesApplied, slTokens, catsApplied, transfer);
+
+	  // rule and (target) token map to specific output
+	  // if rule has many patterns we will choose the first token only
+	  map<unsigned, map<unsigned, string> > ruleOutputs;
+
+	  // map (target) token to all matched rules ids and the number of pattern items of each rule
+	  map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules;
+
+	  RuleExecution::ruleOuts (&ruleOutputs, &tokenRules, slTokens, slTags, tlTokens,
+				   tlTags, rulesApplied, attrs, lists, &vars, spaces,
+				   localeId);
+
+	  // number of possible combinations
+	  unsigned compNum;
+	  // nodes for every token and rule
+	  map<unsigned, vector<RuleExecution::Node*> > nodesPool;
+
+	  nodesPool = RuleExecution::getNodesPool (tokenRules);
+
+	  compNum = RuleExecution::getAmbigCount (tokenRules, nodesPool);
+
+	  // insertion sort: keep the lists ordered by descending ambiguity count
+	  bool put = false;
+	  for (unsigned j = 0; j < ambigCounts.size (); j++)
+	    {
+	      if (compNum > ambigCounts[j])
+		{
+		  ambigCounts.insert (ambigCounts.begin () + j, compNum);
+		  orderedSources.insert (orderedSources.begin () + j, sourceSentence);
+		  orderedTargets.insert (orderedTargets.begin () + j, targetSentence);
+		  put = true;
+		  break;
+		}
+	    }
+	  if (!put)
+	    {
+	      ambigCounts.push_back (compNum);
+	      orderedSources.push_back (sourceSentence);
+	      orderedTargets.push_back (targetSentence);
+	    }
+
+	  // delete Node pointers
+	  for (map<unsigned, vector<RuleExecution::Node*> >::iterator it =
+	      nodesPool.begin (); it != nodesPool.end (); it++)
+	    {
+	      for (unsigned j = 0; j < it->second.size (); j++)
+		{
+		  delete it->second[j];
+		}
+	    }
+	}
+
+      // write the ordered sentences
+      for (unsigned j = 0; j < orderedSources.size (); j++)
+	{
+	  orderedSrcFile << ambigCounts[j] << endl << orderedSources[j] << endl << endl;
+	  orderedTrgFile << ambigCounts[j] << endl << orderedTargets[j] << endl << endl;
+	}
+
+      lextorFile.close ();
+      sourceFile.close ();
+      targetFile.close ();
+      orderedSrcFile.close ();
+      orderedTrgFile.close ();
+    }
+  else
+    {
+      cout << "ERROR in opening files!" << endl;
+    }
+
+  return 0;
+}
diff --git a/src/RuleExecution.cpp b/src/RuleExecution.cpp
index cc524ef..b904d01 100644
--- a/src/RuleExecution.cpp
+++ b/src/RuleExecution.cpp
@@ -506,6 +506,36 @@ getMaxPat (int curMaxPat, unsigned curToken,
   getMaxPat (max (curMaxPat - 1, maxPat - curMaxPat), curToken + 1, tokenRules, count);
 }
 
+// Count the ambiguous rule combinations of a sentence without keeping the
+// AmbigInfo objects around; used to rank sentences by ambiguity.
+unsigned
+RuleExecution::getAmbigCount (
+    map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules,
+    map<unsigned, vector<Node*> > nodesPool)
+{
+  unsigned combCount = 0;
+  for (unsigned tokId = 0; tokId < tokenRules.size ();)
+    {
+      unsigned maxPat = 0;
+      vector<pair<unsigned, unsigned> > rules = tokenRules[tokId];
+      getMaxPat (rules[0].second, tokId, tokenRules, &maxPat);
+
+      // if there is ambiguity
+      if (nodesPool[tokId].size () > 1)
+	{
+	  AmbigInfo* ambig = new AmbigInfo (tokId, maxPat);
+
+	  Node* dummy = ambiguousGraph (tokenRules, nodesPool, tokId, maxPat);
+	  getCombinations (dummy, vector<Node*> (), &ambig->combinations);
+
+	  // update combinations count
+	  combCount += ambig->combinations.size ();
+
+	  delete ambig;
+	}
+      tokId += maxPat;
+    }
+  return combCount;
+}
+
 void
 RuleExecution::getAmbigInfo (map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules,
			     map<unsigned, vector<Node*> > nodesPool,
@@ -527,10 +557,15 @@ RuleExecution::getAmbigInfo (map<unsigned, vector<pair<unsigned, unsigned> > > t
 	  Node* dummy = ambiguousGraph (tokenRules, nodesPool, tokId, maxPat);
 	  getCombinations (dummy, vector<Node*> (), &ambig->combinations);
 
+	  // update combinations count
+	  *combNum += ambig->combinations.size ();
+
 	  if (!ambig->combinations.empty ())
 	    ambigInfo->push_back (ambig);
-	  *combNum += ambig->combinations.size ();
+	  // delete the pointer if not pushed to the list
+	  else
+	    delete ambig;
 	}
       tokId += maxPat;
     }
diff --git a/src/RuleExecution.h b/src/RuleExecution.h
index 29d2034..90f396e 100644
--- a/src/RuleExecution.h
+++ b/src/RuleExecution.h
@@ -286,6 +286,10 @@ public:
	     map<string, vector<vector<string> > > attrs, map<string, string>* vars,
	     vector<string> spaces, unsigned firPat, string localeId,
	     map<unsigned, unsigned> paramToPattern);
+
+  static unsigned
+  getAmbigCount (map<unsigned, vector<pair<unsigned, unsigned> > > tokenRules,
+		 map<unsigned, vector<Node*> > nodesPool);
 };
 
 #endif /* SRC_RULEEXECUTION_H_ */