# 4 Variable importance

## 4.1 Notions of importance

```{r impRFDef, fig.height=8, fig.cap="Importance of variables ranked in descending order for `spam` data."}
# Forest with default settings on `spamApp` (defined outside this section);
# importance = TRUE asks for the importance scores to be stored in the fit.
RFDefImp <- randomForest(
  type ~ .,
  data = spamApp,
  importance = TRUE
)

# n.var = ncol(spamApp) - 1 displays every column except the response.
# type = 1 / scale = FALSE: see the randomForest documentation for
# `importance()` (unscaled permutation-based measure).
varImpPlot(
  RFDefImp,
  type = 1,
  scale = FALSE,
  n.var = ncol(spamApp) - 1,
  cex = 0.8,
  main = "Importance des variables"
)
```

## 4.3 Tree diversity and importance

```{r impBagStump, fig.cap="Importance of the 20 most important variables for a Bagging of stumps, `spam` data."}
# Bagging of stumps: mtry is set to the number of non-response columns, and
# maxnodes = 2 caps every tree at two terminal nodes.
bagStumpImp <- randomForest(
  type ~ .,
  data = spamApp,
  mtry = ncol(spamApp) - 1,
  maxnodes = 2,
  importance = TRUE
)

# Only the 20 top-ranked variables are drawn.
varImpPlot(
  bagStumpImp,
  type = 1,
  scale = FALSE,
  n.var = 20,
  cex = 0.8,
  main = "Importance des variables"
)
```

```{r impRFStump, fig.cap="Importance of the 20 most important variables for an RF of stumps with the default value of `mtry` (equal to 7), `spam` data."}
# Same stump constraint as above, but mtry is left at its default.
RFStumpImp <- randomForest(
  type ~ .,
  data = spamApp,
  maxnodes = 2,
  importance = TRUE
)

varImpPlot(
  RFStumpImp,
  type = 1,
  scale = FALSE,
  n.var = 20,
  cex = 0.8,
  main = "Importance des variables"
)
```

## 4.5 Examples

### 4.5.1 An illustration by simulation in regression

```{r impFried, fig.cap="Importance of variables, simulated data in regression."}
library(mlbench)

# Simulate 500 observations with mlbench.friedman1() and bind the inputs and
# the response into one data frame (response column named `y`).
fried1Simu <- mlbench.friedman1(n = 500)
fried1Data <- data.frame(fried1Simu$x, y = fried1Simu$y)

# Regression forest on the simulated data, keeping importance scores.
fried1RFimp <- randomForest(
  y ~ .,
  data = fried1Data,
  importance = TRUE
)

varImpPlot(
  fried1RFimp,
  type = 1,
  scale = FALSE,
  main = "Importance des variables"
)
```

```{r impFriedPartialPlot, fig.cap="Marginal effects, simulated data in regression."}
# Partial dependence of the fitted forest on the variable X1.
partialPlot(
  fried1RFimp,
  pred.data = fried1Data,
  x.var = "X1",
  main = "X1"
)
```

### 4.5.2 Predicting ozone concentration

```{r impOzoneLoad}
library("randomForest")
data("Ozone", package = "mlbench")

# V4 is the response; rows containing missing values are dropped through
# na.action = na.omit before the forest is grown.
OzRFDefImp <- randomForest(
  V4 ~ .,
  data = Ozone,
  na.action = na.omit,
  importance = TRUE
)
```

```{r impOzoneRFDefImpPlot, fig.cap="Importance of variables, obtained with a default RF, `Ozone` data."}
varImpPlot(
  OzRFDefImp,
  type = 1,
  scale = FALSE,
  main = "Importance des variables"
)
```

### 4.5.3 Analyzing genomic data

```{r impVac18load}
library(randomForest)
data("vac18", package = "mixOmics")
```

```{r impVac18DataManage}
# Split the vac18 object into the predictors and the response used below.
geneExpr <- vac18$genes
stimu <- vac18$stimulation
```

```{r impVac18RFDefImp}
# x / y interface instead of a formula; mtry is one third of the number of
# columns of geneExpr.
vacRFDefImp <- randomForest(
  x = geneExpr,
  y = stimu,
  mtry = ncol(geneExpr) / 3,
  importance = TRUE
)
```

```{r impVac18RFDefImpPlot, fig.height=6, fig.cap="Importance of the 30 most important variables sorted in descending order, `Vac18` data."}
# n.var is not given, so varImpPlot() falls back to its default number of
# variables.
varImpPlot(
  vacRFDefImp,
  type = 1,
  scale = FALSE,
  cex = 0.8
)
```

```{r impVac18RFDefAllImpPlot, fig.cap="Importance of all variables sorted in descending order of importance, `Vac18` data."}
# Take the importance column located just after one column per level of
# `stimu` -- presumably the overall permutation importance; confirm against
# the randomForest `importance` documentation.
vacImp <- vacRFDefImp$importance[, nlevels(stimu) + 1]

# Line plot of every variable's importance, largest first.
plot(
  sort(vacImp, decreasing = TRUE),
  type = "l",
  xlab = "Variables",
  ylab = "Importance des variables"
)
```