In [ ]:
import pyreadr as pyR
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
In [ ]:
insurance = pyR.read_r('/home/sedki/Dropbox/enseignement/M2SDS/data/insurance.rda')
print(type(insurance))
print(insurance.keys())
df = insurance['insurance']
df.head()
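In [ ]:
# Optional sanity check on the imported data frame (this cell is not part of
# the original notebook): column types, non-null counts and missing values.
df.info()
print(df.isna().sum())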
In [ ]:
X = pd.get_dummies(df.drop(columns=['charges']))
print(X.head())
y = df['charges']
print(X.shape)
print(y.shape)
# default split: 75% train / 25% test; no random_state, so the split changes between runs
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape)
print(X_test.shape)
In [ ]:
rf = RandomForestRegressor(n_estimators=1000, n_jobs=10, criterion='squared_error',
                           max_depth=None, min_samples_split=2,
                           min_samples_leaf=1, max_leaf_nodes=None,
                           bootstrap=True, oob_score=True)
In [ ]:
# try max_features (number of candidate features per split) from 1 to 11
params = {'max_features': list(range(1, 12))}
gcv = GridSearchCV(estimator=rf,param_grid=params)
gcv.fit(X_train,y_train)
print(gcv.best_params_)
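In [ ]:
# Optional: inspect the full grid-search results, not just the best parameter
# (a sketch using sklearn's cv_results_ dict; this cell is not in the original notebook).
cv_res = pd.DataFrame(gcv.cv_results_)
print(cv_res[['param_max_features', 'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False))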
In [ ]:
model = gcv.best_estimator_
model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print("Racine du mse d'apprentissage", round(np.sqrt(mean_squared_error(y_train, y_train_pred)),2))
print("Racine du mse de test", round(np.sqrt(mean_squared_error(y_test, y_test_pred)),2))
In [ ]:
# Variables sorted by decreasing impurity-based importance
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print(X_train.columns[indices[f]], importances[indices[f]])
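In [ ]:
# Optional visualisation of the impurity-based importances as a bar plot
# (matplotlib is assumed to be available; this cell is not in the original notebook).
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 4))
plt.bar(range(X_train.shape[1]), importances[indices])
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.ylabel("Impurity-based importance")
plt.tight_layout()
plt.show()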
In [ ]:
r = permutation_importance(model, X_test, y_test,
                           n_repeats=30,
                           random_state=0)
In [ ]:
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{X_train.columns[i]:<8}"
              f" {r.importances_mean[i]:.3f}"
              f" +/- {r.importances_std[i]:.3f}")