import pyreadr as pyR
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import tree
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
# --- Data loading -----------------------------------------------------------
# The .rda archive is read into a mapping keyed by R object name; the data
# frame of interest is stored under 'fetal_df'.
fetal = pyR.read_r('/home/sedki/Dropbox/enseignement/M2SDS/data/fetal_health.rda')
print(type(fetal))
print(fetal.keys())
df = fetal['fetal_df']
# A bare `df.head()` is silently discarded when the file runs as a script,
# so print it explicitly (as is already done for X below).
print(df.head())

# --- Features / target ------------------------------------------------------
# Every column except the target is a predictor; get_dummies one-hot encodes
# any non-numeric predictor and leaves numeric ones untouched.
X = pd.get_dummies(df.drop(columns=['fetal_health']))
print(X.head())
y = df['fetal_health']
print(X.shape)
print(y.shape)

# --- Train / test split -----------------------------------------------------
# Seed the shuffle so the split, and therefore every error rate reported
# further down, is reproducible from one run to the next.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(x_train.shape)
print(x_test.shape)
# Baseline model: a decision tree with default hyper-parameters and a fixed
# seed, fitted on the training split (fit() returns the estimator itself).
regtree = tree.DecisionTreeClassifier(random_state=0).fit(x_train, y_train)

# Draw the fitted tree on a large canvas, labelling splits with column names.
features = X.columns.tolist()
plt.figure(figsize=(20, 20))
tree.plot_tree(regtree, feature_names=features, filled=True)
plt.show()

# Misclassification rate (1 - accuracy) on the training and test splits.
y_train_pred = regtree.predict(x_train)
y_test_pred = regtree.predict(x_test)
print("Erreur d'apprentissage : ", 1 - accuracy_score(y_true=y_train, y_pred=y_train_pred))
print("Erreur de test : ", 1 - accuracy_score(y_true=y_test, y_pred=y_test_pred))
# Hyper-parameter grid searched exhaustively with cross-validation.
params = {
    'max_depth': [2, 4, 6, 8, 10, 12],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8],
}
# Seed the estimator (as the baseline tree does) so that the search result
# does not vary between runs.
classtree = tree.DecisionTreeClassifier(random_state=0)
gcv = GridSearchCV(estimator=classtree, param_grid=params, n_jobs=-1)
gcv.fit(x_train, y_train)
print(gcv.best_params_)

# GridSearchCV is used with its default refit=True, so best_estimator_ has
# already been refitted on the whole training split; no extra fit is needed.
model = gcv.best_estimator_
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)
print("Erreur d'apprentissage : ", 1-accuracy_score(y_train, y_train_pred))
print("Erreur de test : ", 1-accuracy_score(y_test, y_test_pred))

# Plot the selected tree, labelling splits with the predictor column names.
plt.figure(figsize=(20, 20))
features = list(X.columns)
tree.plot_tree(model, feature_names=features, filled=True)
plt.show()
# Cost-complexity pruning: compute the pruning path of a (practically)
# fully grown tree, then pick the pruning strength by cross-validation.
# The same seed is used for the path, the search and the final tree so that
# the selected alpha applies to the tree that is eventually fitted.
classtree_0 = tree.DecisionTreeClassifier(
    min_samples_leaf=1, min_samples_split=2, max_depth=1000, random_state=0
)
path = classtree_0.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# Floating-point round-off can leave tiny negative alphas in the path, which
# DecisionTreeClassifier rejects (ccp_alpha must be >= 0); clip them to zero.
# Clipping keeps ccp_alphas aligned element-wise with impurities.
ccp_alphas = np.clip(ccp_alphas, 0.0, None)
print(ccp_alphas)

# Cross-validate over every candidate alpha of the pruning path.
params = {'ccp_alpha': ccp_alphas}
classtree = tree.DecisionTreeClassifier(random_state=0)
gcv = GridSearchCV(estimator=classtree, param_grid=params, n_jobs=-1)
gcv.fit(x_train, y_train)
print(gcv.best_params_['ccp_alpha'])

# Fit the pruned tree with the selected alpha and report its error rates
# (1 - accuracy) on the training and test splits.
classtree_cv = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=gcv.best_params_['ccp_alpha'])
classtree_cv.fit(x_train, y_train)
y_train_pred = classtree_cv.predict(x_train)
y_test_pred = classtree_cv.predict(x_test)
print("Erreur d'apprentissage : ", 1-accuracy_score(y_train, y_train_pred))
print("Erreur de test : ", 1-accuracy_score(y_test, y_test_pred))