On charge le package leaps et le jeu de données prostate
# Load the leaps package, which provides regsubsets() for best-subset
# and stepwise model selection.
# NOTE(review): library() is preferred over require() for a mandatory
# dependency — require() only returns FALSE on failure instead of erroring.
require(leaps)
## Loading required package: leaps
# NOTE(review): hard-coded absolute path; this only works on the author's
# machine — consider a relative path or here::here().
load("~/Dropbox/enseignement/ml/Lectures/rmds/prostate.rda")
# Variables available in the prostate data set (8 predictors + lcavol).
names(prostate)
## [1] "lcavol" "lweight" "age" "lbph" "svi" "lcp" "gleason"
## [8] "pgg45" "lpsa"
On peut limiter le nombre de variables dans un modèle à tester via le paramètre nvmax
# Exhaustive best-subset selection of lcavol on the 8 other variables.
# The default nvmax is 8, so every subset size is explored here.
regfit.full = regsubsets(lcavol~., data = prostate)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(lcavol ~ ., data = prostate)
## 8 Variables (and intercept)
## Forced in Forced out
## lweight FALSE FALSE
## age FALSE FALSE
## lbph FALSE FALSE
## svi FALSE FALSE
## lcp FALSE FALSE
## gleason FALSE FALSE
## pgg45 FALSE FALSE
## lpsa FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## lweight age lbph svi lcp gleason pgg45 lpsa
## 1 ( 1 ) " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " "*" " " " " "*"
## 3 ( 1 ) " " "*" " " " " "*" " " " " "*"
## 4 ( 1 ) " " "*" "*" " " "*" " " " " "*"
## 5 ( 1 ) " " "*" "*" " " "*" " " "*" "*"
## 6 ( 1 ) " " "*" "*" " " "*" "*" "*" "*"
## 7 ( 1 ) " " "*" "*" "*" "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
# Keep the summary object: it exposes the per-size fit statistics
# (rsq, rss, adjr2, cp, bic) used below to pick a model size.
reg.summary = summary(regfit.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
# 2x2 grid: RSS, adjusted R^2, Cp and BIC as a function of model size,
# with the optimum of each criterion marked by a red dot.
par(mfrow=c(2 ,2))
plot(reg.summary$rss , xlab =" Number of Variables " , ylab =" RSS " ,type ="l")
plot(reg.summary$adjr2 , xlab =" Number of Variables " , ylab =" Adjusted RSq " , type ="l")
# Adjusted R^2 is maximised at 6 variables.
which.max(reg.summary$adjr2)
## [1] 6
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col =" red " , cex =2 , pch =20)
plot(reg.summary$cp , xlab =" Number of Variables " , ylab =" Cp " ,type = "l")
# Mallows' Cp is minimised at 4 variables.
which.min(reg.summary$cp)
## [1] 4
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col =" red " , cex =2 , pch =20)
plot(reg.summary$bic , xlab =" Number of Variables " , ylab =" BIC " , type ="l")
# BIC (the most parsimonious criterion here) is minimised at 2 variables.
which.min(reg.summary$bic)
## [1] 2
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col =" red " , cex =2 , pch =20)
Le package leaps possède sa propre fonction plot
# leaps provides a plot method for regsubsets objects; the scale argument
# selects the criterion used to order the models (r2, bic, adjr2, Cp).
#plot(regfit.full , scale ="r2")
#plot(regfit.full , scale ="bic")
#plot(regfit.full , scale ="adjr2")
plot(regfit.full , scale ="Cp")
On peut accéder aux coefficients de régression du modèle à 1 variable
# Coefficients of the best single-variable model: lcavol on lpsa alone.
coef(regfit.full , 1)
## (Intercept) lpsa
## -0.5085796 0.7499189
On peut faire aussi de la recherche pas à pas en forward et backward
# Forward stepwise selection: variables are added one at a time, so each
# model of size k+1 contains the model of size k (cheaper than exhaustive
# search, but not guaranteed to find the best subset of each size).
regfit.fwd = regsubsets(lcavol~. , data = prostate, method ="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(lcavol ~ ., data = prostate, method = "forward")
## 8 Variables (and intercept)
## Forced in Forced out
## lweight FALSE FALSE
## age FALSE FALSE
## lbph FALSE FALSE
## svi FALSE FALSE
## lcp FALSE FALSE
## gleason FALSE FALSE
## pgg45 FALSE FALSE
## lpsa FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## lweight age lbph svi lcp gleason pgg45 lpsa
## 1 ( 1 ) " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " "*" " " " " "*"
## 3 ( 1 ) " " "*" " " " " "*" " " " " "*"
## 4 ( 1 ) " " "*" "*" " " "*" " " " " "*"
## 5 ( 1 ) " " "*" "*" " " "*" " " "*" "*"
## 6 ( 1 ) " " "*" "*" " " "*" "*" "*" "*"
## 7 ( 1 ) " " "*" "*" "*" "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
plot(regfit.fwd, scale="bic")
# Backward stepwise selection: start from the full model and remove one
# variable at a time. Here it selects the same subsets as forward search.
regfit.bwd = regsubsets(lcavol~. , data = prostate, method ="backward")
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(lcavol ~ ., data = prostate, method = "backward")
## 8 Variables (and intercept)
## Forced in Forced out
## lweight FALSE FALSE
## age FALSE FALSE
## lbph FALSE FALSE
## svi FALSE FALSE
## lcp FALSE FALSE
## gleason FALSE FALSE
## pgg45 FALSE FALSE
## lpsa FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: backward
## lweight age lbph svi lcp gleason pgg45 lpsa
## 1 ( 1 ) " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " "*" " " " " "*"
## 3 ( 1 ) " " "*" " " " " "*" " " " " "*"
## 4 ( 1 ) " " "*" "*" " " "*" " " " " "*"
## 5 ( 1 ) " " "*" "*" " " "*" " " "*" "*"
## 6 ( 1 ) " " "*" "*" " " "*" "*" "*" "*"
## 7 ( 1 ) " " "*" "*" "*" "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
On construit les deux ensembles apprentissage validation aléatoirement
# Reproducible random split: each observation goes to the training set
# with probability 2/3 and to the validation set otherwise.
set.seed(1)
# NOTE(fix): spell out replace = TRUE instead of relying on partial
# argument matching ("rep = TRUE"); behavior is identical.
train = sample(c(TRUE, FALSE), nrow(prostate), prob = c(2/3, 1/3), replace = TRUE)
# Validation set = complement of the training set.
test = (!train)
# Best-subset selection fitted on the training rows only.
regfit.best = regsubsets(lcavol~. , data = prostate[train,])
On peut récupérer la matrice dite de “design” du modèle complet sur les données de validation
# Design matrix of the full model on the validation rows; its columns are
# matched by name against each submodel's coefficient vector below.
test.mat = model.matrix(lcavol~. , data = prostate[test,])
On va maintenant calculer l’erreur de prédiction sur l’ensemble de validation pour chaque nombre de variables dans le modèle
# Validation-set MSE for each model size i = 1..8: multiply the matching
# columns of the validation design matrix by the coefficients of the best
# i-variable model fitted on the training data.
val.errors = rep(NA, 8)
for (i in 1:8) {
  coefi = coef(regfit.best, id = i)
  pred = test.mat[, names(coefi)] %*% coefi
  val.errors[i] = mean((prostate$lcavol[test] - pred)^2)
}
val.errors
## [1] 0.5605286 0.4245755 0.5069872 0.4896560 0.5561193 0.5377202 0.5397409
## [8] 0.5444512
# The 2-variable model minimises the validation error.
which.min(val.errors)
## [1] 2
# NOTE(fix): report the coefficients of the size actually selected by the
# validation step (the original called coef(regfit.full, 1), ignoring the
# which.min result above). As in ISLR, the final model of the chosen size
# is taken from the fit on the full data set.
coef(regfit.full, which.min(val.errors))
On a besoin d’une fonction qui prédit à partir d’un objet regsubsets
# Predict method for regsubsets objects (leaps has none built in).
#
# object:  a fitted regsubsets object
# newdata: data frame containing the predictor (and response) columns
# id:      size of the submodel whose coefficients are used
#
# Rebuilds the model formula from the original call, evaluates the full
# design matrix on newdata, then keeps only the columns named in the
# id-variable submodel's coefficients and returns the linear predictions.
predict.regsubsets <- function(object, newdata, id, ...) {
  model_formula <- as.formula(object$call[[2]])
  design <- model.matrix(model_formula, newdata)
  beta <- coef(object, id = id)
  design[, names(beta)] %*% beta
}
On va faire une validation croisée à 5 folds
# 5-fold cross-validation of the subset size.
k = 5
set.seed(1)
# Randomly assign each observation to one of the k folds.
folds = sample(1:k, nrow(prostate), replace = TRUE)
# cv.errors[j, i] = test MSE of the best i-variable model on fold j.
cv.errors = matrix(NA, k, 8, dimnames = list(NULL, paste(1:8)))
for (j in 1:k) {
  # NOTE(fix): nvmax = 8 — there are only 8 predictors; the previous
  # nvmax = 13 was a leftover from another example (same fits either way,
  # since leaps caps the size at the number of available variables).
  best.fit = regsubsets(lcavol~. , data = prostate[folds != j, ], nvmax = 8)
  for (i in 1:8) {
    # Dispatches to predict.regsubsets() defined above.
    pred = predict(best.fit, prostate[folds == j, ], id = i)
    cv.errors[j, i] = mean((prostate$lcavol[folds == j] - pred)^2)
  }
}
La moyenne de l’erreur sur les 5 folds
# Average the fold errors column-wise to get one CV error per model size.
mean.cv.errors = apply (cv.errors ,2 ,mean)
# Cross-validation also selects the 2-variable model.
which.min(mean.cv.errors)
## 2
## 2