In [2]:
import pandas as pd  # to load the dataset
import numpy as np   # for basic array manipulation
from sklearn.ensemble import RandomForestClassifier  # random forests
from sklearn.model_selection import GridSearchCV  # cross-validated grid search
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt  # for a figure or two
%matplotlib inline
In [3]:
pima_tr = pd.read_csv('../data/Pima_tr.csv', sep=";", decimal=",")
pima_te = pd.read_csv('../data/Pima_te.csv', sep=";", decimal=",")
pima_tr.head()
Out[3]:
   npreg  glu  bp  skin   bmi    ped  age type
0      5   86  68    28  30.2  0.364   24   No
1      7  195  70    33  25.1  0.163   55  Yes
2      5   77  82    41  35.8  0.156   35   No
3      0  165  76    43  47.9  0.259   26   No
4      0  107  60    25  26.4  0.133   23   No
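Before modeling, it is worth confirming the sample sizes and the class balance of the response; a minimal sanity check, assuming the CSVs loaded as above (an addition, not part of the original run):

In [ ]:
# Quick sanity check (illustrative addition): split sizes and class balance
print(pima_tr.shape, pima_te.shape)
print(pima_tr['type'].value_counts())
print(pima_te['type'].value_counts())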
In [4]:
# Build the feature matrices and label vectors from the pre-split train/test files
y_train = np.array(pima_tr['type'])
y_test = np.array(pima_te['type'])
X_train = np.array(pima_tr.drop(['type'], axis=1))
X_test = np.array(pima_te.drop(['type'], axis=1))
In [7]:
forest = RandomForestClassifier(n_estimators=500,
   criterion='gini', max_depth=None,
   min_samples_split=2, min_samples_leaf=1,
   max_features='sqrt',  # 'sqrt' replaces the deprecated 'auto' (same behavior for classifiers)
   max_leaf_nodes=None,
   bootstrap=True, oob_score=True)
# fit the forest on the training data
rfFit = forest.fit(X_train, y_train)
# out-of-bag misclassification rate
print(1-rfFit.oob_score_)
0.28500000000000003
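The OOB error typically stabilizes as trees accumulate; a minimal sketch of that curve using warm_start to grow the same forest incrementally (an illustrative addition; random_state=0 is an arbitrary choice, not from the original run):

In [ ]:
# Sketch: OOB error as a function of forest size (illustrative addition)
oob_errors = []
sizes = range(50, 501, 50)
rf_ws = RandomForestClassifier(warm_start=True, oob_score=True,
                               max_features='sqrt', random_state=0)
for n in sizes:
    rf_ws.set_params(n_estimators=n)  # adds trees to the existing forest
    rf_ws.fit(X_train, y_train)
    oob_errors.append(1 - rf_ws.oob_score_)
plt.plot(list(sizes), oob_errors)
plt.xlabel("n_estimators")
plt.ylabel("OOB error")
plt.show()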
In [8]:
# misclassification rate on the test set
print(1-rfFit.score(X_test,y_test))
0.2319277108433735
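Beyond a single error rate, the confusion matrix shows where the forest errs; a short hedged addition (the labels are the 'No'/'Yes' strings from the data):

In [ ]:
# Confusion matrix on the test set (illustrative addition)
from sklearn.metrics import confusion_matrix
y_pred = rfFit.predict(X_test)
print(confusion_matrix(y_test, y_pred, labels=['No', 'Yes']))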
In [16]:
param=[{"max_features":list(range(2,7,1))}]
rf= GridSearchCV(RandomForestClassifier(n_estimators=500),
        param,cv=3,n_jobs=-1)
rfOpt=rf.fit(X_train, y_train)
# paramètre optimal
rfOpt.best_params_['max_features']
Out[16]:
6
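Note that the selected value 6 sits at the upper edge of the grid (the full feature count is 7), so widening the grid could be worthwhile. A minimal look at the per-candidate cross-validated scores, built from the fitted grid search (an illustrative addition):

In [ ]:
# Mean CV accuracy for each candidate max_features (illustrative addition)
for mf, score in zip(rfOpt.cv_results_['param_max_features'],
                     rfOpt.cv_results_['mean_test_score']):
    print(mf, round(score, 4))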
In [17]:
nb_var = rfOpt.best_params_['max_features']
rf = RandomForestClassifier(n_estimators=500,
   criterion='gini', max_depth=None,
   min_samples_split=2, min_samples_leaf=1,
   max_features=nb_var, max_leaf_nodes=None,
   bootstrap=True, oob_score=True)
# refit with the selected max_features
rfFit = rf.fit(X_train, y_train)
# misclassification rate on the test set
print(1-rfFit.score(X_test,y_test))
0.24698795180722888
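The accuracy_score imported at the top (and otherwise unused) gives the same quantity from explicit predictions:

In [ ]:
# Same test error via accuracy_score (illustrative addition)
y_pred = rfFit.predict(X_test)
print(1 - accuracy_score(y_test, y_pred))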
In [18]:
# Variables ranked by decreasing importance
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print(pima_tr.columns[indices[f]], importances[indices[f]])
glu 0.2972967767648795
ped 0.17031903873281914
age 0.15138171180978932
bmi 0.1434182701547072
skin 0.08539659943365831
bp 0.07832789116491452
npreg 0.07385971193923198
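Impurity-based importances can be biased toward features with many distinct values; permutation importance on the test set is a common cross-check. A hedged sketch (assumes scikit-learn >= 0.22, where sklearn.inspection.permutation_importance is available; n_repeats and random_state are illustrative choices):

In [ ]:
# Permutation importance on the test set (illustrative addition,
# requires scikit-learn >= 0.22)
from sklearn.inspection import permutation_importance
perm = permutation_importance(rfFit, X_test, y_test,
                              n_repeats=10, random_state=0)
for i in perm.importances_mean.argsort()[::-1]:
    print(pima_tr.columns[i], round(perm.importances_mean[i], 4))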
In [19]:
# Bar chart of the variable importances
plt.figure()
plt.title("Variable importances")
plt.bar(range(X_train.shape[1]), importances[indices])
plt.xticks(range(X_train.shape[1]), pima_tr.columns[indices])  # label bars with feature names
plt.xlim([-1, X_train.shape[1]])
plt.show()
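The mean importance hides how much the trees disagree; a sketch that adds the standard deviation across trees as error bars (an illustrative addition; the std is computed over the per-tree importances exposed by rf.estimators_):

In [ ]:
# Importances with per-tree variability (illustrative addition)
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
plt.figure()
plt.title("Variable importances (with per-tree std)")
plt.bar(range(X_train.shape[1]), importances[indices], yerr=std[indices])
plt.xticks(range(X_train.shape[1]), pima_tr.columns[indices])
plt.xlim([-1, X_train.shape[1]])
plt.show()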