Various libraries

In [12]:
import pandas as pd # to load the dataset
import numpy as np  # for basic array manipulations
from sklearn.ensemble import GradientBoostingClassifier # to fit gradient boosting models
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV # for cross-validated parameter tuning
import matplotlib.pyplot as plt # to draw a figure or two
%matplotlib inline

Reading the datasets

In [8]:
pima_tr = pd.read_csv('../data/Pima_tr.csv', sep=";", decimal = ",")
pima_te = pd.read_csv('../data/Pima_te.csv', sep=";", decimal = ",")
pima_tr.head()
Out[8]:
   npreg  glu  bp  skin   bmi    ped  age type
0      5   86  68    28  30.2  0.364   24   No
1      7  195  70    33  25.1  0.163   55  Yes
2      5   77  82    41  35.8  0.156   35   No
3      0  165  76    43  47.9  0.259   26   No
4      0  107  60    25  26.4  0.133   23   No
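The files use a semicolon as field separator and a comma as decimal mark, so it is worth checking that the numeric columns were actually parsed as floats and that the two sets have the expected sizes. A minimal sanity check, using nothing beyond the two data frames loaded above:

# Check that the European-style CSV (';' separator, ',' decimal) was parsed correctly
print(pima_tr.shape, pima_te.shape)  # number of rows and columns in each set
print(pima_tr.dtypes)                # 'bmi' and 'ped' should be float64, not object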

Training and test datasets

In [9]:
# Train-test split
y_train = np.array(pima_tr['type'])
y_test = np.array(pima_te['type'])
X_train = np.array(pima_tr.drop(['type'], axis=1))
X_test = np.array(pima_te.drop(['type'], axis=1))
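The target column is kept as the raw 'Yes'/'No' strings; scikit-learn classifiers encode such string labels internally, so no manual recoding is needed. A quick look at the class balance of both sets (a minimal sketch reusing the arrays built above):

# Class balance of the diabetes label in the training and test sets
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))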

One way to specify the parameters

In [13]:
# Fit classif model
params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'deviance'}
clf = GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)
acc = accuracy_score(y_test, clf.predict(X_test))
print("Accuracy: %.4f" % acc)
Accuracy: 0.7500
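With 500 shallow trees and a small learning rate, it can be informative to watch how test accuracy evolves as trees are added; GradientBoostingClassifier exposes the per-stage predictions through staged_predict. A minimal sketch reusing clf, X_test and y_test from the cell above:

# Test accuracy after each boosting stage
test_acc = [accuracy_score(y_test, y_stage) for y_stage in clf.staged_predict(X_test)]
plt.figure()
plt.plot(range(1, len(test_acc) + 1), test_acc)
plt.xlabel("Number of boosting stages")
plt.ylabel("Test accuracy")
plt.show()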

Tuning the gradient boosting parameters by cross-validation

In [21]:
param = [{'n_estimators': list(range(10, 150, 10)),
          'max_depth': list(range(1, 7)),
          'min_samples_split': list(range(2, 14, 2)),
          'learning_rate': [0.01, 0.05, 0.1, 0.5, 1]}]

gboost = GridSearchCV(GradientBoostingClassifier(loss="deviance"),
                      param, cv=3, n_jobs=-1)
gboostOptim = gboost.fit(X_train, y_train)
In [22]:
gboostOptim.best_params_
Out[22]:
{'learning_rate': 0.05,
 'max_depth': 1,
 'min_samples_split': 2,
 'n_estimators': 70}
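Besides best_params_, the fitted GridSearchCV object also stores the cross-validated score of the winning combination and the detailed results for the whole grid. A short sketch for inspecting them:

# Cross-validated accuracy of the best parameter combination
print("Best CV accuracy: %.4f" % gboostOptim.best_score_)
# Full grid results, sorted by mean cross-validated score
cv_res = pd.DataFrame(gboostOptim.cv_results_)
print(cv_res[['param_learning_rate', 'param_n_estimators', 'param_max_depth',
              'mean_test_score']].sort_values('mean_test_score', ascending=False).head())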
In [24]:
# Fit best model
params = {'n_estimators': 70, 'max_depth': 1, 'min_samples_split': 2,
          'learning_rate': 0.05, 'loss': 'deviance'}
clf = GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)
acc = accuracy_score(y_test, clf.predict(X_test))
print("Accuracy: %.4f" % acc)
Accuracy: 0.7861
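Accuracy alone hides how the errors split between the two classes; a confusion matrix on the test set makes that explicit. A minimal sketch using sklearn.metrics.confusion_matrix:

# Confusion matrix of the tuned model on the test set
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, clf.predict(X_test), labels=['No', 'Yes']))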

Variable importances

In [25]:
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = pima_tr.drop(['type'], axis=1).columns  # column names matching X_train
for f in range(X_train.shape[1]):
    print(feature_names[indices[f]], importances[indices[f]])
glu 0.5207466807698699
age 0.22029408989324326
bmi 0.13842694841057512
ped 0.0909760942270941
npreg 0.029556186699217692
skin 0.0
bp 0.0
In [26]:
# Plot of the variable importances
plt.figure()
plt.title("Variable importances")
plt.bar(range(X_train.shape[1]), importances[indices])
plt.xticks(range(X_train.shape[1]), feature_names[indices], rotation=45)
plt.xlim([-1, X_train.shape[1]])
plt.show()
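Impurity-based importances from feature_importances_ can be biased towards features offering many split points; permutation importance on the test set is a useful cross-check. A sketch assuming a scikit-learn version that provides sklearn.inspection.permutation_importance (0.22 or later):

# Permutation importance on the test set (requires scikit-learn >= 0.22)
from sklearn.inspection import permutation_importance
perm = permutation_importance(clf, X_test, y_test, n_repeats=30, random_state=0)
for i in perm.importances_mean.argsort()[::-1]:
    print(feature_names[i], "%.4f" % perm.importances_mean[i])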