import pandas as pd  # to load the dataset
import numpy as np  # for basic array manipulation
from sklearn.ensemble import GradientBoostingRegressor  # to fit gradient boosting models
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV  # for cross-validated hyperparameter tuning
import matplotlib.pyplot as plt  # to draw a figure or two
%matplotlib inline
prostate = pd.read_csv('../data/Prostate_Cancer.txt', delimiter=',')
# Binarize gleason: a score of 6 becomes 0, anything above 6 becomes 1
prostate.loc[prostate['gleason'] == 6, 'gleason'] = 0
prostate.loc[prostate['gleason'] > 6, 'gleason'] = 1
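# Quick check (added, optional): class balance of the binarized gleason score
prostate['gleason'].value_counts()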
# Train-test split
y_train = np.array(prostate[prostate.train == "T"]['lpsa'])
y_test = np.array(prostate[prostate.train == "F"]['lpsa'])
X_train = np.array(prostate[prostate.train == "T"].drop(['lpsa', 'train'], axis=1))
X_test = np.array(prostate[prostate.train == "F"].drop(['lpsa', 'train'], axis=1))
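# Sanity check (added, optional): dimensions of the train and test design matrices
print(X_train.shape, X_test.shape)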
# Fit a baseline gradient boosting regression model
params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'squared_error'}  # 'squared_error' replaces the deprecated 'ls' in recent scikit-learn
clf = GradientBoostingRegressor(**params)
clf.fit(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_test, clf.predict(X_test)))
print("RMSE: %.4f" % rmse)
# Cross-validated grid search over the main gradient boosting hyperparameters
param = {'n_estimators': list(range(5, 250, 5)),
         'max_depth': list(range(1, 8)),
         'min_samples_split': list(range(2, 10)),
         'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]}
gboost = GridSearchCV(GradientBoostingRegressor(loss='squared_error'),
                      param, cv=3, n_jobs=-1)
gboostOptim = gboost.fit(X_train, y_train)
gboostOptim.best_params_
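# Added, optional: also report the best cross-validated score next to the selected
# parameters (for a regressor, GridSearchCV's default scoring is R^2)
print("Best CV R^2: %.4f" % gboostOptim.best_score_)
print(gboostOptim.best_params_)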
# Refit with the hyperparameters selected by the grid search
params = {'n_estimators': 30, 'max_depth': 6, 'min_samples_split': 10,
          'learning_rate': 0.5, 'loss': 'squared_error'}
clf = GradientBoostingRegressor(**params)
clf.fit(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_test, clf.predict(X_test)))
print("RMSE: %.4f" % rmse)