commit af469143a8c8082b4cc5721d9b4f6946355d8d54
Author: aboelhamd <aboelhamd.abotreka@gmail.com>
Date:   Sat Aug 17 18:49:21 2019 +0200

    Implement one-hot encoding of the word features, built on the custom ordinal encoder.

diff --git a/sklearn-train.py b/sklearn-train.py
index df10b35..e643d5c 100644
--- a/sklearn-train.py
+++ b/sklearn-train.py
@@ -3,7 +3,7 @@ import sys
 import matplotlib.pyplot as plt
 import pandas as pd
 import numpy as np
-from matplotlib.colors import ListedColormap
+import random
 from sklearn.model_selection import train_test_split
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
@@ -39,7 +39,7 @@ for file in files:
   classifiers = [SVC(kernel="linear", C=0.025)]
   
   print("file name :", file)
-  data = pd.read_csv(files[file], delimiter=r"\s+").dropna().iloc[:200000]
+  data = pd.read_csv(files[file], delimiter=r"\s+").dropna().iloc[:100000]
   
   # if records equals to classes number, duplicates the data  
   if data.shape[0] == data.iloc[:,0].nunique():
@@ -48,16 +48,24 @@ for file in files:
   # words(features) encoding
   features = data.iloc[:,2:].values
 
+  feat_set = set(features.reshape(-1))
+  feat_list = list(feat_set)
+  # shuffle the unique features so the encoder indices below are assigned in random order
+  random.shuffle(feat_list)
+  feat_set = feat_list
+
   enc = {}
   c = 0
+  for feature in feat_set :
+    enc[feature]=c
+    c=c+1
+
+  onehot = np.zeros((len(features),c), dtype=int)
   for i in range (len(features)) :
     for j in range (len(features[i])) :
       w = features[i][j]
-      if (w not in enc) :
-        enc[w]=c
-        c=c+1
-      features[i][j]=enc[w]
-
+      onehot[i][enc[w]]=1
+  
   # save the encoder 
   enc_name = os.path.join(models_path, 'encoder'+'-'+file_no_ext)[:256]
   joblib.dump(enc, enc_name)
@@ -70,11 +78,11 @@ for file in files:
   print("Words(features) number :",features.shape[1])
   print("Records number :",features.shape[0])
   print(data.iloc[:target.nunique(),:] , '\n')
-  
+
   # split to train and test
   X_train, X_test, y_train, y_test, w_train, w_test = \
-      train_test_split(features, target, weights, test_size=.5, random_state=0, stratify=target)
-  
+      train_test_split(onehot, target, weights, test_size=.5, random_state=0, stratify=target)
+
   # train models and print their scores
   for name, model in zip(models_names, classifiers):
     print("model :", name, ",", end = '')