import pandas as pd  # to load the dataset
import numpy as np  # for basic array manipulation
from sklearn.ensemble import GradientBoostingClassifier  # to fit gradient boosting models
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV  # for cross-validated grid search
import matplotlib.pyplot as plt  # to draw a figure or two
%matplotlib inline
pima_tr = pd.read_csv('../data/Pima_tr.csv', sep=";", decimal = ",")
pima_te = pd.read_csv('../data/Pima_te.csv', sep=";", decimal = ",")
pima_tr.head()
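# Optional check (a small addition): class balance of the target column 'type',
# assuming it holds the diabetes label, as it is used below.
pima_tr['type'].value_counts()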
# Build label vectors and feature matrices (train and test sets come as separate files)
y_train = np.array(pima_tr['type'])
y_test = np.array(pima_te['type'])
X_train = np.array(pima_tr.drop(['type'], axis=1))
X_test = np.array(pima_te.drop(['type'], axis=1))
feature_names = pima_tr.drop(['type'], axis=1).columns  # feature names aligned with the columns of X_train
# Fit a gradient boosting classification model with fixed hyperparameters
params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'deviance'}  # 'deviance' is named 'log_loss' in recent scikit-learn versions
clf = GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)
acc = accuracy_score(y_test, clf.predict(X_test))
print("Accuracy: %.4f" % acc)
# Cross-validated grid search over the main gradient boosting hyperparameters (3-fold CV)
param = [{'n_estimators': list(range(10, 150, 10)),
          'max_depth': list(range(1, 7)),
          'min_samples_split': list(range(2, 14, 2)),
          'learning_rate': [0.01, 0.05, 0.1, 0.5, 1]}]
gboost = GridSearchCV(GradientBoostingClassifier(loss="deviance"),
                      param, cv=3, n_jobs=-1)
gboostOptim = gboost.fit(X_train, y_train)
gboostOptim.best_params_
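# Optional check: mean cross-validated accuracy reached by the best parameter
# combination (exposed by GridSearchCV as best_score_).
gboostOptim.best_score_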
# Refit with the best hyperparameters found by the grid search
params = {'n_estimators': 70, 'max_depth': 1, 'min_samples_split': 2,
          'learning_rate': 0.05, 'loss': 'deviance'}
clf = GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)
acc = accuracy_score(y_test, clf.predict(X_test))
print("Accuracy: %.4f" % acc)
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print(feature_names[indices[f]], importances[indices[f]])
# Plot the feature importances
plt.figure()
plt.title("Variable importances")
plt.bar(range(X_train.shape[1]), importances[indices])
plt.xticks(range(X_train.shape[1]), feature_names[indices])
plt.xlim([-1, X_train.shape[1]])
plt.show()
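# Optional sketch (an addition, not part of the original analysis): test accuracy
# after each boosting stage of the refitted model, using staged_predict from the
# scikit-learn gradient boosting API, to see how the score evolves with the number of trees.
staged_acc = [accuracy_score(y_test, y_pred) for y_pred in clf.staged_predict(X_test)]
plt.figure()
plt.title("Test accuracy per boosting stage")
plt.plot(range(1, len(staged_acc) + 1), staged_acc)
plt.xlabel("Number of trees")
plt.ylabel("Accuracy")
plt.show()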