commit 1e8268d7f1fdb7cbd8f4964fae00a11e19fd8da4 Author: aboelhamd Date: Mon Aug 19 18:55:06 2019 +0200 Return to sklearn's OrdinalEncoder to compare the random and samples datasets. diff --git a/sklearn-train.py b/sklearn-train.py index 9194543..5bc9438 100644 --- a/sklearn-train.py +++ b/sklearn-train.py @@ -39,17 +39,17 @@ for file in files: classifiers = [SVC(kernel="linear", C=0.025)] print("file name :", file) - data = pd.read_csv(files[file], delimiter=r"\s+").dropna().iloc[:100000] + data = pd.read_csv(files[file], delimiter=r"\s+").dropna().iloc[:200000] # if records equals to classes number, duplicates the data if data.shape[0] == data.iloc[:,0].nunique(): data = data.append(data) - # words(features) encoding - from sklearn.preprocessing import OneHotEncoder - enc = OneHotEncoder(handle_unknown='ignore') - features = enc.fit_transform(data.iloc[:,2:]).toarray() - + # words (features) encoding + from sklearn.preprocessing import OrdinalEncoder + enc = OrdinalEncoder(dtype=np.int32) + features = enc.fit_transform(data.iloc[:,2:]) + # save the encoder enc_name = os.path.join(models_path, 'encoder'+'-'+file_no_ext)[:256] joblib.dump(enc, enc_name)