In [2]:
import pandas as pd # to load the dataset
import numpy as np  # for basic array manipulation
from sklearn.ensemble import GradientBoostingRegressor # gradient boosting regression (not random forests)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV # for cross-validated hyperparameter tuning
import matplotlib.pyplot as plt # for a figure or two
%matplotlib inline
In [3]:
prostate = pd.read_csv('../data/Prostate_Cancer.txt', delimiter=',')
# Binarize gleason (use .loc to avoid a SettingWithCopyWarning from chained indexing)
prostate.loc[prostate['gleason'] == 6, 'gleason'] = 0
prostate.loc[prostate['gleason'] > 6, 'gleason'] = 1
# Train-test split
y_train = np.array(prostate[prostate.train == "T"]['lpsa'])
y_test = np.array(prostate[prostate.train == "F"]['lpsa'])
X_train = np.array(prostate[prostate.train == "T"].drop(['lpsa', 'train'], axis=1))
X_test = np.array(prostate[prostate.train == "F"].drop(['lpsa', 'train'], axis=1))
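A quick check that the split has the expected layout (a minimal sketch; it only uses the arrays defined above):

In [ ]:
# Sanity check: sizes of the train/test split
print("train:", X_train.shape, y_train.shape)
print("test: ", X_test.shape, y_test.shape)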
In [4]:
# Fit a first gradient boosting regression model with hand-picked parameters
params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_test, clf.predict(X_test)))
print("RMSE: %.4f" % rmse)
RMSE: 0.1670
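Since matplotlib was imported for a figure or two, one natural diagnostic (a sketch, assuming the clf fitted above) is the test error as a function of the number of boosting iterations, obtained with staged_predict:

In [ ]:
# Test-set MSE of the partial ensembles with 1, 2, ..., n_estimators trees
test_errors = [mean_squared_error(y_test, y_pred)
               for y_pred in clf.staged_predict(X_test)]

plt.plot(np.arange(params['n_estimators']) + 1, test_errors)
plt.xlabel("Boosting iterations")
plt.ylabel("Test MSE")
plt.title("Test error vs. number of trees")
plt.show()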
In [ ]:
# Hyperparameter grid: about 19,000 combinations, each evaluated with 3-fold CV
param = [{'n_estimators': list(range(5, 250, 5)),
          'max_depth': list(range(1, 8)),
          'min_samples_split': list(range(2, 10)),
          'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]}]

gboost = GridSearchCV(GradientBoostingRegressor(loss="ls"),
                      param, cv=3, n_jobs=-1)
gboostOptim = gboost.fit(X_train, y_train)
gboostOptim.best_params_
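The exhaustive grid above is fairly expensive for such a small dataset. A lighter alternative (not used here, just a sketch over the same parameter space) is a randomized search:

In [ ]:
from sklearn.model_selection import RandomizedSearchCV

# Sample 100 parameter settings at random instead of trying the full grid
rand_search = RandomizedSearchCV(GradientBoostingRegressor(loss="ls"),
                                 param_distributions=param[0],
                                 n_iter=100, cv=3, n_jobs=-1, random_state=0)
rand_search.fit(X_train, y_train)
rand_search.best_params_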
In [8]:
# Refit with the parameters selected by the grid search above
params = {'n_estimators': 30, 'max_depth': 6, 'min_samples_split': 10,
          'learning_rate': 0.5, 'loss': 'ls'}
clf = GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_test, clf.predict(X_test)))
print("RMSE: %.4f" % rmse)
RMSE: 0.2420
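Note that the cross-validated configuration ends up with a larger test RMSE (0.2420) than the hand-picked one above (0.1670); with a training set this small, 3-fold CV estimates are noisy, so this is not too surprising. As a last figure, a sketch of the variable importances of the last fitted model (it assumes the clf and prostate objects defined above, and recovers the feature names from the columns kept in X):

In [ ]:
# Impurity-based feature importances of the fitted gradient boosting model
feature_names = prostate.drop(['lpsa', 'train'], axis=1).columns
importances = clf.feature_importances_

order = np.argsort(importances)
plt.barh(np.arange(len(order)), importances[order])
plt.yticks(np.arange(len(order)), feature_names[order])
plt.xlabel("Relative importance")
plt.title("Gradient boosting feature importances")
plt.show()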