import pandas as pd # pour importer le jeu de données
import numpy as np # pour les simple manips de base
from sklearn.ensemble import RandomForestClassifier # pour faire des RF
from sklearn.model_selection import GridSearchCV # pour faire de la validation croisée
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt # faire une figure ou deux
%matplotlib inline
# Load the Pima training and test sets; both files use ';' as the field
# separator and ',' as the decimal mark.
csv_options = {"sep": ";", "decimal": ","}
pima_tr = pd.read_csv('../data/Pima_tr.csv', **csv_options)
pima_te = pd.read_csv('../data/Pima_te.csv', **csv_options)
# Preview of the first training rows (only rendered in an interactive session).
pima_tr.head()

# Separate the 'type' target column from the predictor columns and convert
# everything to NumPy arrays for scikit-learn.
X_train = np.array(pima_tr.drop(columns=['type']))
X_test = np.array(pima_te.drop(columns=['type']))
y_train = np.array(pima_tr['type'])
y_test = np.array(pima_te['type'])
# Baseline random forest: 500 trees with out-of-bag scoring enabled, so an
# error estimate is available without touching the test set.
# max_features='sqrt' replaces the former 'auto' alias: for classifiers 'auto'
# meant sqrt(n_features); it was deprecated in scikit-learn 1.1 and removed in
# 1.3, so 'sqrt' keeps the same behaviour while working on current releases.
forest = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=True,
)
# Fit on the training set (fit returns the fitted estimator itself).
rfFit = forest.fit(X_train, y_train)
# Out-of-bag misclassification rate.
print(1 - rfFit.oob_score_)
# Misclassification rate on the test set.
print(1 - rfFit.score(X_test, y_test))
# Candidate values for max_features: 2 through 6.
param = [{"max_features": list(range(2, 7))}]
# Grid search with 3-fold cross-validation over 500-tree forests; n_jobs=-1
# lets scikit-learn run the fits in parallel.
rf = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=500),
    param_grid=param,
    cv=3,
    n_jobs=-1,
)
rfOpt = rf.fit(X_train, y_train)
# Optimal value of max_features selected by the search; the bare expression
# below only displays it in an interactive session.
nb_var = rfOpt.best_params_['max_features']
nb_var
# Rebuild the forest using the max_features value chosen by the grid search
# (nb_var), with out-of-bag scoring enabled.
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=nb_var,
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=True,
)
# Fit on the training set.
rfFit = rf.fit(X_train, y_train)
# Prediction error on the test set (1 - accuracy).
test_accuracy = rfFit.score(X_test, y_test)
print(1 - test_accuracy)
# Variable importances of the fitted forest, ranked in decreasing order.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
# Names are taken from the predictor columns only: `indices` refers to columns
# of the feature matrix (pima_tr without 'type'), so indexing pima_tr.columns
# directly would mislabel variables unless 'type' happened to be the last column.
feature_names = pima_tr.drop(['type'], axis=1).columns
for idx in indices:
    print(feature_names[idx], importances[idx])
# Bar chart of the importances, in decreasing order; each bar is labelled
# with the column index of the corresponding variable in X_train.
n_features = X_train.shape[1]
positions = range(n_features)
plt.figure()
plt.title("Importances des variables")
plt.bar(positions, importances[indices])
plt.xticks(positions, indices)
# Leave one unit of margin on both sides of the bars.
plt.xlim([-1, n_features])
plt.show()