# simuler 100 réalisations d'une loi normale N(2, 1)
#import numpy as np
import numpy as np
from scipy.stats import norm, uniform, expon
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")

# Seed NumPy's global generator so the draws in this script are reproducible.
np.random.seed(11)
# 100 realisations of a normal variable with mean 2 and standard deviation 1.
x = norm(loc=2, scale=1).rvs(size=100)


# Simulate N = 10000 data sets, each made of n = 100 draws from N(2, 1).
n = 100
N = 10000
# One data set per row: draw all n * N values at once, then fold them into
# an (N, n) matrix.
X = norm.rvs(loc=2, scale=1, size=n * N).reshape(N, n)
# Sample mean of every simulated data set.
t = np.mean(X, axis=1)
# Mean, standard deviation and variance of the N sample means, one per line.
print(t.mean(), t.std(), t.var(), sep="\n")


# Compare the empirical distribution of the sample means (histogram of t)
# with the normal density of mean 2 and standard deviation 1/sqrt(n).
sd_mean = 1 / np.sqrt(n)
# Plotting grid: 100 points between the 1% and 99% quantiles of that density.
q_lo, q_hi = norm.ppf([0.01, 0.99], loc=2, scale=sd_mean)
xx = np.linspace(q_lo, q_hi, 100)
fig, ax = plt.subplots()
ax.plot(xx, norm.pdf(xx, loc=2, scale=sd_mean), 'r-',
        lw=5, alpha=0.6, label='densite')
ax.hist(t, bins=30, density=True, histtype='stepfilled', alpha=0.2)


# Simulate an exponential sample by inverting its distribution function:
# draw 1000 uniform values on [0, 1] ...
x_u = uniform.rvs(size=1000)
# ... and push them through the quantile function of the exponential
# distribution with scale 1/2.
x_e = expon(scale=1 / 2).ppf(x_u)

# Histogram of the inverted sample against the exponential density (scale 1/2),
# drawn on 100 points between the 1% and 99% quantiles of that distribution.
q_lo, q_hi = expon.ppf([0.01, 0.99], scale=1 / 2)
xx = np.linspace(q_lo, q_hi, 100)
fig, ax = plt.subplots()
ax.hist(x_e, bins=50, density=True, histtype='stepfilled', alpha=0.2)
ax.plot(xx, expon.pdf(xx, scale=1 / 2), 'r-',
        lw=5, alpha=0.6, label='densite')


# Load the coronary data set (semicolon-separated) and discard its first column.
coronary = pd.read_csv("data/coronary.csv", sep=';')
coronary = coronary.drop(columns=coronary.columns[0])

# Response: the 'coron' column; predictors: every remaining column.
y = np.array(coronary['coron'])
X = np.array(coronary.drop(columns=['coron']))
print(X[:4])
# Add a constant column to the design matrix so the model has an intercept.
X1 = sm.add_constant(X)
print(X1[:4])

# Fit the logistic regression of y on X1.
model = sm.Logit(y, X1)
result = model.fit()
# Bare expression: the summary table is built but not printed.
result.summary()
#print(result.params)


def boot_logistic_reg(X, y, size=1, model_cls=None):
    """Bootstrap the coefficients of a logistic regression.

    Each replicate resamples the observations (rows of ``X`` together with
    the matching entries of ``y``) with replacement, refits the model on the
    resampled data and records the fitted coefficients.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_obs, n_params)
        Design matrix, passed to the model as-is (no constant is added here).
    y : numpy.ndarray, shape (n_obs,)
        Response values, aligned with the rows of ``X``.
    size : int, default 1
        Number of bootstrap replicates.
    model_cls : callable, optional
        Model constructor called as ``model_cls(y, X)``. The object it returns
        must provide ``fit(disp=False)`` whose result exposes ``params``.
        Defaults to ``sm.Logit``.

    Returns
    -------
    numpy.ndarray, shape (size, n_params)
        One row of fitted coefficients per bootstrap replicate.
    """
    if model_cls is None:
        model_cls = sm.Logit

    n_obs = X.shape[0]
    bs_params_reps = np.empty(shape=(size, X.shape[1]))

    for i in range(size):
        # Row indices drawn with replacement from the global NumPy generator,
        # so np.random.seed() controls reproducibility.
        bs_inds = np.random.choice(n_obs, size=n_obs)
        bs_result = model_cls(y[bs_inds], X[bs_inds]).fit(disp=False)
        # Store into row i only. Slicing with `[i:]` would broadcast the
        # coefficients over every remaining row on each iteration.
        bs_params_reps[i, :] = bs_result.params

    return bs_params_reps

# 100 bootstrap replicates of the logistic-regression coefficients.
toto = boot_logistic_reg(X1, y, size=100)

# Standard deviation of each coefficient across the replicates.
# Bare expression: the value is computed but not printed.
np.std(toto, axis=0)


 inds = np.arange(X.shape[0])
 print(inds)   
 bs_inds = np.random.choice(inds, size=len(inds))
 print(bs_inds)


# Predicted probabilities of the fitted logistic model on the design matrix.
probs = model.predict(params=result.params, exog=X1)
print(probs[:10])

# ROC curve of those probabilities against the observed responses, and the
# area under it.
fpr, tpr, thresholds = metrics.roc_curve(y, probs)
roc_auc = metrics.auc(fpr, tpr)


# Euclidean distance of every ROC point to the corner (0, 1); the threshold
# of the closest point is reported as the optimal one.
d = np.sqrt(fpr ** 2 + (tpr - 1) ** 2)
print('Le seuil optimal est %0.3f' % thresholds[d.argmin()])


# Draw the ROC curve, the diagonal reference line and, in red, the point of
# the curve closest to the corner (0, 1).
i_best = np.argmin(d)
plt.plot(fpr, tpr, color='darkorange', lw=2,
         label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.plot([fpr[i_best]], [tpr[i_best]], marker='o', markersize=5, color="red")
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

# Introduction au bootstrap

# Principe

# Simuler suivant $F_\theta$ ou $f_\theta$

# Une petite parenthèse : générateur de nombres aléatoires

# Inversion de la fonction de répartition

# Simuler une loi normale : Box-Muller

# Alternative : rééchantillonnage

# Courbe ROC