3 Random Forests

3.1 General principle

3.1.1 Instability of a tree

# Instability of a single tree: fit one rpart tree on each of two bootstrap
# samples of the training set `spamApp`, draw both trees, then measure how
# often their class predictions disagree on `spamTest`.
# NOTE(review): rpart() must already be attached (e.g. library(rpart) earlier
# in the file); it is not loaded in this part of the script.

# First bootstrap sample: nrow(spamApp) row indices drawn with replacement.
# seq_len() replaces 1:nrow() so that an empty data set yields an empty index
# vector instead of c(1, 0); for non-empty data the draws are unchanged.
set.seed(368910)
spamBoot1 <- spamApp[
  sample(seq_len(nrow(spamApp)), nrow(spamApp), replace = TRUE),
]
treeBoot1 <- rpart(type ~ ., data = spamBoot1)
plot(treeBoot1)
# xpd = TRUE lets the node labels extend outside the plot region.
text(treeBoot1, xpd = TRUE)

# Second bootstrap sample, drawn with a different seed.
set.seed(368915)
spamBoot2 <- spamApp[
  sample(seq_len(nrow(spamApp)), nrow(spamApp), replace = TRUE),
]
treeBoot2 <- rpart(type ~ ., data = spamBoot2)
plot(treeBoot2)
text(treeBoot2, xpd = TRUE)

# Proportion of test observations on which the two trees predict a different
# class.
mean(
  predict(treeBoot1, spamTest, type = "class") !=
    predict(treeBoot2, spamTest, type = "class")
)

3.1.2 From a tree to an ensemble: Bagging

library(randomForest)
# Bagging obtained from randomForest(): mtry is set to the number of columns
# of spamApp minus one, i.e. every column except the response `type` is a
# candidate at each split.
bagging <- randomForest(
  type ~ .,
  data = spamApp,
  mtry = ncol(spamApp) - 1
)
bagging

# Misclassification rate on the test set.
errTestBagging <- mean(
  predict(bagging, newdata = spamTest) != spamTest[["type"]]
)
# Misclassification rate when re-predicting the training set itself
# (predictions on spamApp, not the OOB estimate printed above).
errEmpBagging <- mean(
  predict(bagging, newdata = spamApp) != spamApp[["type"]]
)

3.3 The randomForest package

# Random forest with default settings, fitted through the formula interface.
RFDef <- randomForest(type ~ ., data = spamApp)
RFDef

# Same default forest through the x / y interface. The predictors and the
# response are selected by the name of the response column rather than by the
# hard-coded position 58, so the call stays correct if columns are reordered
# or added. drop = FALSE keeps x a data frame whatever the number of
# predictors. This second fit overwrites RFDef; the errors below use it.
RFDef <- randomForest(
  spamApp[, names(spamApp) != "type", drop = FALSE],
  spamApp[["type"]]
)

# Misclassification rate on the test set.
errTestRFDef <- mean(predict(RFDef, spamTest) != spamTest$type)
# Misclassification rate when re-predicting the training set.
errEmpRFDef <- mean(predict(RFDef, spamApp) != spamApp$type)

3.5 Parameter settings for prediction

3.5.1 The number of trees: ntree

# Draw the plot method's output for the default forest, to judge how many
# trees are needed.
plot(x = RFDef)

# Refit with 250 trees; do.trace = 25 asks randomForest to report progress
# every 25 trees while the forest is being grown.
RFDoTrace <- randomForest(
  type ~ .,
  data = spamApp,
  ntree = 250,
  do.trace = 25
)

3.5.2 The number of variables chosen at each node: mtry

# Candidate values of mtry: 1 up to the number of columns of spamApp minus
# one (all columns but the response). seq_len() is safe when that count is 0.
nbvars <- seq_len(ncol(spamApp) - 1)

# Final OOB error of a 250-tree forest for each candidate mtry.
# vapply() guarantees one numeric value per mtry (sapply() would silently
# change its return type on empty or ragged results). Unlike sapply(), the
# result does not carry the "OOB" element names; the values are unchanged.
oobsMtry <- vapply(nbvars, function(nbv) {
  RF <- randomForest(type ~ ., spamApp, ntree = 250, mtry = nbv)
  # Last row of err.rate = error once all trees are aggregated.
  RF$err.rate[RF$ntree, "OOB"]
}, numeric(1))

# Reference value: OOB error with the default mtry, averaged over 25
# independently grown 250-tree forests. The row index is taken from the
# fitted object instead of repeating the literal 250.
mean(replicate(n = 25, {
  RF <- randomForest(type ~ ., spamApp, ntree = 250)
  RF$err.rate[RF$ntree, "OOB"]
}))
# Count how many trees of a forest use each variable in the first row of
# forest$bestvar (presumably the root split of each tree -- confirm against
# the randomForest documentation). Variable indices are translated to names
# through `varNames`.
rootSplitCounts <- function(rf, varNames) {
  counts <- table(rf$forest$bestvar[1, ])
  names(counts) <- varNames[as.numeric(names(counts))]
  counts
}

# Bagging of 100 trees limited to two terminal nodes (maxnodes = 2): every
# column except the response is a candidate at the single split.
bagStump <- randomForest(
  type ~ .,
  spamApp,
  ntree = 100,
  mtry = ncol(spamApp) - 1,
  maxnodes = 2
)
bagStumpbestvar <- rootSplitCounts(bagStump, colnames(spamApp))
sort(bagStumpbestvar, decreasing = TRUE)

# Same two-leaf trees, but with the default mtry of randomForest.
RFStump <- randomForest(
  type ~ .,
  spamApp,
  ntree = 100,
  maxnodes = 2
)
RFStumpbestvar <- rootSplitCounts(RFStump, colnames(spamApp))
sort(RFStumpbestvar, decreasing = TRUE)

3.6 Examples

3.6.1 Predicting ozone concentration

library("randomForest")
# Ozone data from the mlbench package; V4 is used as the response below.
data("Ozone", package = "mlbench")

# Regression forest with default settings. na.action = na.omit drops the
# rows that contain missing values before fitting.
OzRFDef <- randomForest(V4 ~ ., Ozone, na.action = na.omit)
OzRFDef
plot(OzRFDef)

# OOB error as a function of mtry for the Ozone data.
# The original line plotted `oobsMtrys` against `nbvars`; `oobsMtrys` is not
# defined anywhere in the visible script and `nbvars` indexes the spam
# predictors, so the call failed (or plotted unrelated data). The curve is
# computed here with section-local names so this block is self-contained.
# NOTE(review): one forest per mtry value is used; average several runs per
# value if a smoother curve is wanted.
OzNbvars <- seq_len(ncol(Ozone) - 1)
OzOobsMtry <- vapply(OzNbvars, function(nbv) {
  RF <- randomForest(V4 ~ ., Ozone, mtry = nbv, na.action = na.omit)
  # For a regression forest the OOB error is the mse component; its last
  # element is the error once all trees are aggregated.
  RF$mse[RF$ntree]
}, numeric(1))
plot(OzNbvars, OzOobsMtry, type = "l", xlab = "mtry", ylab = "Erreur OOB")

# Bin the response into [0, 10), [10, 20) and [20, 40] (right = FALSE with
# include.lowest = TRUE closes the last interval on the right).
bins <- c(0, 10, 20, 40)
V4bin <- cut(Ozone$V4, bins, include.lowest = TRUE, right = FALSE)
OzoneBin <- data.frame(Ozone, V4bin)

# Forest on 200-observation samples, with V9 and the binned response removed
# from the predictors.
# NOTE(review): `strata = V4bin` refers to the full-length vector while
# na.omit may drop rows; confirm that randomForest actually applies
# stratified sampling for a regression forest with a scalar sampsize.
OzRFDefStrat <- randomForest(
  V4 ~ . - V9 - V4bin,
  OzoneBin,
  strata = V4bin,
  sampsize = 200,
  na.action = na.omit
)
OzRFDefStrat

3.6.2 Analyzing genomic data

library(randomForest)
# vac18 data from the mixOmics package: `genes` is used as the predictor
# matrix and `stimulation` as the response.
data("vac18", package = "mixOmics")
geneExpr <- vac18$genes
stimu <- vac18$stimulation

# Forest with mtry set to one third of the number of columns of geneExpr.
VacRFpsur3 <- randomForest(
  x = geneExpr,
  y = stimu,
  mtry = ncol(geneExpr) / 3
)
VacRFpsur3
plot(VacRFpsur3)

# Compare the two mtry choices over nFor independently grown forests each,
# recording the OOB error at tree 500 (row 500 of err.rate).
nFor <- 25

# Default mtry.
VacOOBsqrtp <- replicate(nFor, {
  forest <- randomForest(x = geneExpr, y = stimu)
  forest$err.rate[500, "OOB"]
})

# mtry = number of columns / 3.
VacOOBpsur3 <- replicate(nFor, {
  forest <- randomForest(x = geneExpr, y = stimu, mtry = ncol(geneExpr) / 3)
  forest$err.rate[500, "OOB"]
})

3.6.3 Analyzing dust pollution

library(randomForest)
# Load the `jus` data set shipped with the VSURF package.
data("jus", package = "VSURF")
# Keep complete rows only: na.omit() removes every row containing an NA.
jusComp <- na.omit(jus)
# Forest with default settings explaining PM10 by all the other columns.
jusRF <- randomForest(PM10 ~ ., data = jusComp)
# Partial dependence plot of the forest on the variable "NO", evaluated on
# the training data. Keep x.var as a literal string: partialPlot() captures
# this argument unevaluated, so passing it through a variable is fragile.
partialPlot(jusRF, pred.data = jusComp, x.var = "NO", main = "Effet marginal - NO")