commit ec23e8955bd9cfdd257863e5626843881148983d Author: aboelhamd Date: Wed Jul 31 18:23:04 2019 +0200 Final modifications, it works. diff --git a/sklearn-train.py b/sklearn-train.py index e5cb2ae..74bba38 100644 --- a/sklearn-train.py +++ b/sklearn-train.py @@ -1,4 +1,5 @@ import os +import sys import matplotlib.pyplot as plt import pandas as pd import numpy as np @@ -19,7 +20,7 @@ models_path = sys.argv[2] files = {} # r=root, d=directories, f=files -for r, d, f in os.walk(path): +for r, d, f in os.walk(dataset_path): for file in f: files[file]=os.path.join(r, file) @@ -27,9 +28,9 @@ if not os.path.exists(models_path): os.makedirs(models_path) -# These are the classifiers that permit training data with sample weights! for file in files: +# These are the classifiers that permit training data with sample weights! models_names = ["NaiveBayes", "LinearSVM", "RBFSVM", "DecisionTree", "RandomForest", "AdaBoost"] @@ -44,39 +45,31 @@ for file in files: print("file name :", file) data = pd.read_csv(files[file], delimiter=r"\s+").dropna() -# print (data.shape[0] , data.iloc[:,0].nunique()) # if records equals to classes number, duplicates the data if data.shape[0] == data.iloc[:,0].nunique(): data = data.append(data) -# display(data) - -# print(data.iloc[:,2:]) # words (features) encoding from sklearn.preprocessing import OrdinalEncoder enc = OrdinalEncoder(dtype=np.int32) features = enc.fit_transform(data.iloc[:,2:]) - - # save the encoder - enc_name = models_path+'/'+'encoder'+'-'+file[:-4] + + # save the encoder + enc_name = os.path.join(models_path, 'encoder'+'-'+file[:-4]) joblib.dump(enc, enc_name) -# display(enc.categories_) -# display(data.iloc[:,2:],features) + # target and weights target = data.iloc[:,0] weights = data.iloc[:,1].values -# print("file name :", file) print("Rules(classes) number :",target.nunique()) print("Words(features) number :",features.shape[1]) - print("Records number :",features.shape[0], end = '') - display(data.iloc[:target.nunique(),:]) + print("Records number :",features.shape[0]) + print(data.iloc[:target.nunique(),:] + '\n') # split to train and test X_train, X_test, y_train, y_test, w_train, w_test = \ train_test_split(features, target, weights, test_size=.5, random_state=0, stratify=target) -# display(features, target, weights) -# display(X_train, X_test, y_train, y_test, w_train, w_test) # train models and print their scores for name, model in zip(models_names, classifiers): @@ -86,7 +79,7 @@ for file in files: print(" score =", score) # save models - model_name = models_path+'/'+name+'-'+file[:-4] + model_name = os.path.join(models_path, name+'-'+file[:-4]) joblib.dump(model, model_name) print("----------------------------------------------\n")