commit 0a0a1617e31f92a7d00ec31cde9cbe745b8deb90 Author: aboelhamd Date: Sat Jul 27 02:14:09 2019 +0200 writing results and testing are left diff --git a/loadmodels.ipynb b/loadmodels.ipynb index 0531b26..6ddce66 100644 --- a/loadmodels.ipynb +++ b/loadmodels.ipynb @@ -15458,8 +15458,8 @@ "\n", "for file in files:\n", " # These are the classifiers that permit training data with sample weights!\n", - " names = [\"NaiveBayes\", \"LinearSVM\", \"RBFSVM\", \"DecisionTree\",\n", - " \"RandomForest\", \"AdaBoost\"]\n", + "# names = [\"NaiveBayes\", \"LinearSVM\", \"RBFSVM\", \"DecisionTree\",\n", + "# \"RandomForest\", \"AdaBoost\"]\n", " \n", " print(\"file name :\", file)\n", " data = pd.read_csv(files[file], delimiter=r\"\\s+\", header=None).dropna()\n", @@ -15469,39 +15469,33 @@ " enc = joblib.load('models/'+'encoder'+'-'+file[:-4])\n", " # remove records with unseen word, will return always 0 for that record\n", " # this will be solved later\n", + " unseen = []\n", + " for i in range(len(data.values)) :\n", + " for j in range(len(data.values[i])) :\n", + " if data.values[i][j] not in enc.categories_[j] :\n", + " unseen.append(data.values[i])\n", " \n", - " \n", - " features = enc.fit_transform(data.iloc[:,2:])\n", - "# display(enc.categories_)\n", - "# display(data.iloc[:,2:],features)\n", - " # target and weights\n", - " target = data.iloc[:,0]\n", - " weights = data.iloc[:,1].values\n", + " seen = [x for x in data.values if x not in unseen]\n", + "\n", + " samples = enc.transform(seen)\n", " \n", "# print(\"file name :\", file)\n", - " print(\"Rules(classes) number :\",target.nunique())\n", - " print(\"Words(features) number :\",features.shape[1])\n", - " print(\"Records number :\",features.shape[0], end = '')\n", - " display(data.iloc[:target.nunique(),:])\n", + "# print(\"Rules(classes) number :\",target.nunique())\n", + " print(\"Words(features) number :\",samples.shape[1])\n", + " print(\"Records number :\",samples.shape[0], end = '')\n", + "# display(data.iloc[:target.nunique(),:])\n", " \n", - " # split to train and test\n", - " X_train, X_test, y_train, y_test, w_train, w_test = \\\n", - " train_test_split(features, target, weights, test_size=.5, random_state=0, stratify=target)\n", "# display(features, target, weights)\n", - "# display(X_train, X_test, y_train, y_test, w_train, w_test)\n", " \n", - " # train models and print their scores\n", - " for name in names:\n", - " print(\"model :\", name, \",\", end = '')\n", - " modelname = 'sklearn-models/'+name+'-'+file[:-4]+'.model'\n", - " loaded_model = joblib.load(modelname)\n", - " score = loaded_model.score(X=X_test, y=y_test, sample_weight=w_test)\n", - " print(\" score =\", score)\n", - " \n", - " # save models\n", - "# name+'-'+file[:-4]+'.model'\n", - "# modelname = 'sklearn-models/'+name+'-'+file[:-4]+'.model'\n", - "# joblib.dump(clf, filename)\n", + " # prediction by using svm\n", + "# print(\"model :\", name, \",\", end = '')\n", + " name = 'LinearSVM'\n", + " modelname = 'sklearn-models/'+name+'-'+file[:-4]+'.model'\n", + " loaded_model = joblib.load(modelname)\n", + " rules = loaded_model.predict(samples)\n", + " \n", + " # write results in file\n", + " \n", " print(\"----------------------------------------------\\n\")\n" ] }, @@ -15551,95 +15545,46 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 72, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
012345
0ruleweightword1word2word3word4
100.314649lopoderhaberser
210.342676lopoderhaberser
320.342676lopoderhaberser
\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5\n", - "0 rule weight word1 word2 word3 word4\n", - "1 0 0.314649 lo poder haber ser\n", - "2 1 0.342676 lo poder haber ser\n", - "3 2 0.342676 lo poder haber ser" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "rule\n", + "weight\n", + "word1\n", + "word2\n", + "word3\n", + "word4\n", + "0\n", + "0.314649\n", + "lo\n", + "poder\n", + "haber\n", + "ser\n", + "1\n", + "0.342676\n", + "lo\n", + "poder\n", + "haber\n", + "ser\n", + "2\n", + "0.342676\n", + "lo\n", + "poder\n", + "haber\n", + "ser\n" + ] } ], "source": [ "data = pd.read_csv(files[file], delimiter=r\"\\s+\", header=None).dropna()\n", - "data" + "data\n", + "for i in range(len(data.values)) :\n", + " for j in range(len(data.values[i])) :\n", + " print(data.values[i][j])" ] }, {