Model selection on the full dataset

We load the leaps package and the prostate dataset

require(leaps)
## Loading required package: leaps
load("~/Dropbox/enseignement/ml/Lectures/rmds/prostate.rda")
names(prostate)
## [1] "lcavol"  "lweight" "age"     "lbph"    "svi"     "lcp"     "gleason"
## [8] "pgg45"   "lpsa"

The number of variables in the candidate models can be capped with the nvmax argument; it defaults to 8, which is why the call below explores every subset size for our 8 predictors.
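
As an illustration (a minimal sketch, not run here), capping the exhaustive search at subsets of 5 variables would read:

regsubsets(lcavol ~ ., data = prostate, nvmax = 5)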

regfit.full = regsubsets(lcavol ~ ., data = prostate)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(lcavol ~ ., data = prostate)
## 8 Variables  (and intercept)
##         Forced in Forced out
## lweight     FALSE      FALSE
## age         FALSE      FALSE
## lbph        FALSE      FALSE
## svi         FALSE      FALSE
## lcp         FALSE      FALSE
## gleason     FALSE      FALSE
## pgg45       FALSE      FALSE
## lpsa        FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          lweight age lbph svi lcp gleason pgg45 lpsa
## 1  ( 1 ) " "     " " " "  " " " " " "     " "   "*" 
## 2  ( 1 ) " "     " " " "  " " "*" " "     " "   "*" 
## 3  ( 1 ) " "     "*" " "  " " "*" " "     " "   "*" 
## 4  ( 1 ) " "     "*" "*"  " " "*" " "     " "   "*" 
## 5  ( 1 ) " "     "*" "*"  " " "*" " "     "*"   "*" 
## 6  ( 1 ) " "     "*" "*"  " " "*" "*"     "*"   "*" 
## 7  ( 1 ) " "     "*" "*"  "*" "*" "*"     "*"   "*" 
## 8  ( 1 ) "*"     "*" "*"  "*" "*" "*"     "*"   "*"
reg.summary = summary(regfit.full)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
par(mfrow = c(2, 2))
plot(reg.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted RSq", type = "l")
which.max(reg.summary$adjr2)
## [1] 6
points(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)], col = "red", cex = 2, pch = 20)

plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
which.min(reg.summary$cp)
## [1] 4
points(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)], col = "red", cex = 2, pch = 20)

plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
which.min(reg.summary$bic)
## [1] 2
points(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)], col = "red", cex = 2, pch = 20)
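
The three criteria point to different sizes here (6, 4 and 2 variables). A compact way to compare them (a minimal sketch) is to tabulate the optimum under each criterion:

data.frame(criterion = c("adjusted R2", "Cp", "BIC"),
           size = c(which.max(reg.summary$adjr2),
                    which.min(reg.summary$cp),
                    which.min(reg.summary$bic)))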

The leaps package provides its own plot method for regsubsets objects

#plot(regfit.full, scale = "r2")
#plot(regfit.full, scale = "bic")
#plot(regfit.full, scale = "adjr2")
plot(regfit.full, scale = "Cp")

We can access the regression coefficients of the one-variable model

coef(regfit.full, 1)
## (Intercept)        lpsa 
##  -0.5085796   0.7499189
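
The same call works with any criterion-selected size; for instance, the coefficients of the 4-variable model favoured by Cp (output omitted):

coef(regfit.full, which.min(reg.summary$cp))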

We can also run stepwise searches, both forward and backward

regfit.fwd = regsubsets(lcavol ~ ., data = prostate, method = "forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(lcavol ~ ., data = prostate, method = "forward")
## 8 Variables  (and intercept)
##         Forced in Forced out
## lweight     FALSE      FALSE
## age         FALSE      FALSE
## lbph        FALSE      FALSE
## svi         FALSE      FALSE
## lcp         FALSE      FALSE
## gleason     FALSE      FALSE
## pgg45       FALSE      FALSE
## lpsa        FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          lweight age lbph svi lcp gleason pgg45 lpsa
## 1  ( 1 ) " "     " " " "  " " " " " "     " "   "*" 
## 2  ( 1 ) " "     " " " "  " " "*" " "     " "   "*" 
## 3  ( 1 ) " "     "*" " "  " " "*" " "     " "   "*" 
## 4  ( 1 ) " "     "*" "*"  " " "*" " "     " "   "*" 
## 5  ( 1 ) " "     "*" "*"  " " "*" " "     "*"   "*" 
## 6  ( 1 ) " "     "*" "*"  " " "*" "*"     "*"   "*" 
## 7  ( 1 ) " "     "*" "*"  "*" "*" "*"     "*"   "*" 
## 8  ( 1 ) "*"     "*" "*"  "*" "*" "*"     "*"   "*"
plot(regfit.fwd, scale = "bic")

regfit.bwd = regsubsets(lcavol ~ ., data = prostate, method = "backward")
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(lcavol ~ ., data = prostate, method = "backward")
## 8 Variables  (and intercept)
##         Forced in Forced out
## lweight     FALSE      FALSE
## age         FALSE      FALSE
## lbph        FALSE      FALSE
## svi         FALSE      FALSE
## lcp         FALSE      FALSE
## gleason     FALSE      FALSE
## pgg45       FALSE      FALSE
## lpsa        FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: backward
##          lweight age lbph svi lcp gleason pgg45 lpsa
## 1  ( 1 ) " "     " " " "  " " " " " "     " "   "*" 
## 2  ( 1 ) " "     " " " "  " " "*" " "     " "   "*" 
## 3  ( 1 ) " "     "*" " "  " " "*" " "     " "   "*" 
## 4  ( 1 ) " "     "*" "*"  " " "*" " "     " "   "*" 
## 5  ( 1 ) " "     "*" "*"  " " "*" " "     "*"   "*" 
## 6  ( 1 ) " "     "*" "*"  " " "*" "*"     "*"   "*" 
## 7  ( 1 ) " "     "*" "*"  "*" "*" "*"     "*"   "*" 
## 8  ( 1 ) "*"     "*" "*"  "*" "*" "*"     "*"   "*"
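
On this dataset the forward and backward searches select exactly the same subsets at every size; when they disagree, comparing the coefficients at a given size makes the difference explicit (output omitted):

coef(regfit.fwd, 4)
coef(regfit.bwd, 4)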

Model selection with training/validation sets and cross-validation

We randomly split the observations into training and validation sets

set.seed(1)
train = sample(c(TRUE, FALSE), nrow(prostate), prob = c(2/3, 1/3), replace = TRUE)
test = (!train)
regfit.best = regsubsets(lcavol ~ ., data = prostate[train, ])

We can retrieve the design matrix of the full model on the validation data

test.mat = model.matrix(lcavol ~ ., data = prostate[test, ])

We now compute the prediction error on the validation set for each model size

val.errors = rep(NA, 8)
for (i in 1:8) {
  coefi = coef(regfit.best, id = i)                       # coefficients of the best i-variable model
  pred = test.mat[, names(coefi)] %*% coefi               # predictions on the validation set
  val.errors[i] = mean((prostate$lcavol[test] - pred)^2)  # validation MSE
}
val.errors
## [1] 0.5605286 0.4245755 0.5069872 0.4896560 0.5561193 0.5377202 0.5397409
## [8] 0.5444512
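
A quick plot makes the minimum easier to spot (a minimal sketch):

plot(val.errors, type = "b", xlab = "Number of Variables", ylab = "Validation MSE")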
which.min(val.errors)
## [1] 2

The validation error is smallest for the two-variable model; following the usual recipe, we read off the coefficients of that size from the search on the full data (output omitted):

coef(regfit.full, which.min(val.errors))

We need a function that predicts from a regsubsets object, since the package does not supply a predict method

predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])  # recover the formula from the original call
  mat = model.matrix(form, newdata)    # design matrix on the new data
  coefi = coef(object, id = id)        # coefficients of the id-variable model
  xvars = names(coefi)
  mat[, xvars] %*% coefi
}
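
Because it follows the S3 naming convention predict.regsubsets, plain predict() dispatches to it automatically. For example, reusing the objects above (pred2 is just an illustrative name; output omitted), the result should match val.errors[2]:

pred2 = predict(regfit.best, prostate[test, ], id = 2)
mean((prostate$lcavol[test] - pred2)^2)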

We now run a 5-fold cross-validation

k = 5
set.seed(1)
folds = sample(1:k, nrow(prostate), replace = TRUE)  # assign each observation to a fold
cv.errors = matrix(NA, k, 8, dimnames = list(NULL, paste(1:8)))
for (j in 1:k) {
  # best subsets of each size, fitted on everything except fold j
  best.fit = regsubsets(lcavol ~ ., data = prostate[folds != j, ], nvmax = 8)
  for (i in 1:8) {
    pred = predict(best.fit, prostate[folds == j, ], id = i)
    cv.errors[j, i] = mean((prostate$lcavol[folds == j] - pred)^2)
  }
}

We average the error over the 5 folds

mean.cv.errors = apply(cv.errors, 2, mean)
which.min(mean.cv.errors)
## 2 
## 2
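
Cross-validation also favours the two-variable model. A natural last step (a minimal sketch, output omitted) is to plot the mean CV error and read off the coefficients of the selected size from the full-data search:

plot(mean.cv.errors, type = "b", xlab = "Number of Variables", ylab = "Mean CV error")
coef(regfit.full, which.min(mean.cv.errors))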