In [2]:
import pandas as pd  # to load the dataset
import numpy as np   # for basic array manipulation
from sklearn.ensemble import RandomForestClassifier  # random forests
from sklearn.model_selection import GridSearchCV  # cross-validated grid search
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt  # for a figure or two
%matplotlib inline
In [3]:
pima_tr = pd.read_csv('../data/Pima_tr.csv', sep=";", decimal=",")
pima_te = pd.read_csv('../data/Pima_te.csv', sep=";", decimal=",")
pima_tr.head()
Out[3]:
   npreg  glu  bp  skin   bmi    ped  age type
0      5   86  68    28  30.2  0.364   24   No
1      7  195  70    33  25.1  0.163   55  Yes
2      5   77  82    41  35.8  0.156   35   No
3      0  165  76    43  47.9  0.259   26   No
4      0  107  60    25  26.4  0.133   23   No
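Before modeling, it is worth confirming the sample sizes and the class balance of the response; a minimal sanity check, assuming the CSVs loaded as above (an addition, not part of the original run):

In [ ]:
# Quick sanity check (illustrative addition): split sizes and class balance
print(pima_tr.shape, pima_te.shape)
print(pima_tr['type'].value_counts())
print(pima_te['type'].value_counts())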
In [4]:
# Build the feature matrices and label vectors from the pre-split train/test files
y_train = np.array(pima_tr['type'])
y_test = np.array(pima_te['type'])
X_train = np.array(pima_tr.drop(['type'], axis=1))
X_test = np.array(pima_te.drop(['type'], axis=1))
In [7]:
forest = RandomForestClassifier(n_estimators=500,
   criterion='gini', max_depth=None,
   min_samples_split=2, min_samples_leaf=1,
   max_features='sqrt',  # 'sqrt' replaces the deprecated 'auto' (same behavior for classifiers)
   max_leaf_nodes=None,
   bootstrap=True, oob_score=True)
# fit the forest on the training data
rfFit = forest.fit(X_train, y_train)
# out-of-bag misclassification rate
print(1-rfFit.oob_score_)
0.28500000000000003
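The OOB error typically stabilizes as trees accumulate; a minimal sketch of that curve using warm_start to grow the same forest incrementally (an illustrative addition; random_state=0 is an arbitrary choice, not from the original run):

In [ ]:
# Sketch: OOB error as a function of forest size (illustrative addition)
oob_errors = []
sizes = range(50, 501, 50)
rf_ws = RandomForestClassifier(warm_start=True, oob_score=True,
                               max_features='sqrt', random_state=0)
for n in sizes:
    rf_ws.set_params(n_estimators=n)  # adds trees to the existing forest
    rf_ws.fit(X_train, y_train)
    oob_errors.append(1 - rf_ws.oob_score_)
plt.plot(list(sizes), oob_errors)
plt.xlabel("n_estimators")
plt.ylabel("OOB error")
plt.show()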
In [8]:
# misclassification rate on the test set
print(1-rfFit.score(X_test,y_test))
0.2319277108433735
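Beyond a single error rate, the confusion matrix shows where the forest errs; a short hedged addition (the labels are the 'No'/'Yes' strings from the data):

In [ ]:
# Confusion matrix on the test set (illustrative addition)
from sklearn.metrics import confusion_matrix
y_pred = rfFit.predict(X_test)
print(confusion_matrix(y_test, y_pred, labels=['No', 'Yes']))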
In [16]:
param=[{"max_features":list(range(2,7,1))}]
rf= GridSearchCV(RandomForestClassifier(n_estimators=500),
        param,cv=3,n_jobs=-1)
rfOpt=rf.fit(X_train, y_train)
# paramètre optimal
rfOpt.best_params_['max_features']
Out[16]:
6
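Note that the selected value 6 sits at the upper edge of the grid (the full feature count is 7), so widening the grid could be worthwhile. A minimal look at the per-candidate cross-validated scores, built from the fitted grid search (an illustrative addition):

In [ ]:
# Mean CV accuracy for each candidate max_features (illustrative addition)
for mf, score in zip(rfOpt.cv_results_['param_max_features'],
                     rfOpt.cv_results_['mean_test_score']):
    print(mf, round(score, 4))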
In [17]:
nb_var = rfOpt.best_params_['max_features']
rf = RandomForestClassifier(n_estimators=500,
   criterion='gini', max_depth=None,
   min_samples_split=2, min_samples_leaf=1,
   max_features=nb_var, max_leaf_nodes=None,
   bootstrap=True, oob_score=True)
# refit with the selected max_features
rfFit = rf.fit(X_train, y_train)
# misclassification rate on the test set
print(1-rfFit.score(X_test,y_test))
0.24698795180722888
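The accuracy_score imported at the top (and otherwise unused) gives the same quantity from explicit predictions:

In [ ]:
# Same test error via accuracy_score (illustrative addition)
y_pred = rfFit.predict(X_test)
print(1 - accuracy_score(y_test, y_pred))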
In [18]:
# Variables ranked by decreasing importance
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print(pima_tr.columns[indices[f]], importances[indices[f]])
glu 0.2972967767648795
ped 0.17031903873281914
age 0.15138171180978932
bmi 0.1434182701547072
skin 0.08539659943365831
bp 0.07832789116491452
npreg 0.07385971193923198
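Impurity-based importances can be biased toward features with many distinct values; permutation importance on the test set is a common cross-check. A hedged sketch (assumes scikit-learn >= 0.22, where sklearn.inspection.permutation_importance is available; n_repeats and random_state are illustrative choices):

In [ ]:
# Permutation importance on the test set (illustrative addition,
# requires scikit-learn >= 0.22)
from sklearn.inspection import permutation_importance
perm = permutation_importance(rfFit, X_test, y_test,
                              n_repeats=10, random_state=0)
for i in perm.importances_mean.argsort()[::-1]:
    print(pima_tr.columns[i], round(perm.importances_mean[i], 4))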
In [19]:
# Bar chart of the variable importances
plt.figure()
plt.title("Variable importances")
plt.bar(range(X_train.shape[1]), importances[indices])
plt.xticks(range(X_train.shape[1]), pima_tr.columns[indices])  # label bars with feature names
plt.xlim([-1, X_train.shape[1]])
plt.show()
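The mean importance hides how much the trees disagree; a sketch that adds the standard deviation across trees as error bars (an illustrative addition; the std is computed over the per-tree importances exposed by rf.estimators_):

In [ ]:
# Importances with per-tree variability (illustrative addition)
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
plt.figure()
plt.title("Variable importances (with per-tree std)")
plt.bar(range(X_train.shape[1]), importances[indices], yerr=std[indices])
plt.xticks(range(X_train.shape[1]), pima_tr.columns[indices])
plt.xlim([-1, X_train.shape[1]])
plt.show()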