################################################################################ # R-Code accompanying mboost tutorial # # Statistical Computing 2011 - Reisensburg, Germany # # # # Author: Benjamin Hofner # ################################################################################ ## install package (mboost >= 2.1-0) if necessary if (!require("mboost") || packageVersion("mboost") < 2.1-0){ if (require("mboost")) ## needed to install new version detach("package:mboost", unload = TRUE) install.packages("mboost", repos = "http://R-Forge.R-project.org") } ## load library mboost library("mboost") ## define red color red <- rgb(103,0,31, max = 255) ################################################################################ # importance of centering in glmboost (and bols, intercept = FALSE) # ################################################################################ ## create graphics to show importance of centering set.seed(1907) x <- rnorm(50, mean = 5) y <- rnorm(50, mean = x, sd = 0.3) ## subtract offset as usual in boosting y <- y - mean(y) ## figure without centering of x pdf("graphics/fig_center=false.pdf", width = 5, height = 5) par(mar = c(4, 4, 2, 0) + 0.1) xrange <- range(0,x) plot(y ~ x, xlim = xrange, main = "center = FALSE") abline(lm(y ~ x -1)) abline(h = 0, col = "gray", lwd = 0.5) abline(v = 0, col = "gray", lwd = 0.5) points(0, 0, col = red, cex = 2) points(mean(x), mean(y), col = red, cex = 2, pch = "+") legend("topleft", legend = c("(0,0)", "center of data"), pch = c("o", "+"), col = red, bg = "white", bty = "n") dev.off() ## figure with centering of x mx <- mean(x) x <- x - mean(x) pdf("graphics/fig_center=true.pdf", width = 5, height = 5) par(mar = c(4, 4, 2, 0) + 0.1) plot(y ~ x, xlim = xrange - mx, main = "center = TRUE") abline(lm(y ~ x -1)) abline(h = 0, col = "gray", lwd = 0.5) abline(v = 0, col = "gray", lwd = 0.5) points(0, 0, col = red, cex = 2) points(mean(x), mean(y), col = red, cex = 2, pch = "+") dev.off() ################################################################################ # glmboost # ################################################################################ ## fit simple model for cars data mod.cars <- glmboost(dist ~ speed, data = cars) mod.cars ## create graphics directory dir.create("graphics") ## create plot for speed vs. dist with model fit pdf("graphics/fig_modcars1.pdf") par(mar=c(5,4,4,5) + 0.1) plot(dist ~ speed, data = cars) cf <- coef(mod.cars, off2int = TRUE) abline(cf, lwd = 2) abline(lm(dist ~ speed, data = cars), col = "gray", lty = 2, lwd = 2) dev.off() ## --> glmboost() model practically identical to lm() model ## create coefficient paths pdf("graphics/fig_modcars2.pdf") par(mar=c(5,4,4,5) + 0.1) plot(mod.cars, main = "Coefficient Paths") dev.off() ################################################################################ # glmboost with high-dimensional data # ################################################################################ set.seed(1907) n <- 100 p <- 1000 ptrue <- 10 # simulate data X <- matrix(runif(n * p), nrow = n, ncol = p) # set first 10 effects to 10 beta <- numeric(p) beta[1:ptrue] <- 10 # create response y <- X %*% beta + rnorm(n, sd = 0.1) # drop to vector y <- drop(y) # add intercept to the model matrix X1 <- cbind(1, X) # estimate model mod <- glmboost(y = y, x = X1, control = boost_control(trace = TRUE)) cvr <- cvrisk(mod, grid = 1:500) plot(cvr) mod[mstop(cvr)] plot(mod, off2int = TRUE) coef(mod, off2int = TRUE) ################################################################################ # now glmboost with high-dimensional data and factors # ################################################################################ set.seed(1907) n <- 100 p <- 1000 ptrue <- 10 # simulate data (make factors) X <- matrix(as.factor(sample(1:2, n * p, replace = TRUE)), nrow = n, ncol = p) X <- data.frame(X) ## make model matrix Xm <- model.matrix(as.formula(paste("~", paste(names(X), collapse = " + "))), data = X) # set first 10 effects to 10 beta <- numeric(p+1) beta[2:ptrue+1] <- 10 # create response y <- Xm %*% beta + rnorm(n, sd = 0.1) # drop to vector y <- drop(y) ## estimate model mod1 <- glmboost(y = y, x = Xm, control = boost_control(trace = TRUE)) cvr1 <- cvrisk(mod1, grid = 1:300) ## alternative way fm <- as.formula(paste("y ~ int + ", paste(names(X), collapse = " + "))) data <- cbind(int = 1, y, X) mod2 <- glmboost(fm, data = data, control = boost_control(trace = TRUE)) ## slower as it internally needs to built up the model matrix cvr2 <- cvrisk(mod2, grid = 1:300) # quick again as it uses the built up model ################################################################################ # gamboost # ################################################################################ ### Simulate some data set.seed(1907) n <- 100 x1 <- rnorm(n) x2 <- as.factor(sample(0:3, n, replace = TRUE)) y <- 0.5 * x1 + rnorm(n) mod <- gamboost(y ~ bols(x1), control = boost_control(mstop = 25)) pdf("graphics/fig_bolsx1.pdf", width = 5, height = 5) par(mar = c(4, 4, 2, 0) + 0.1) plot(sort(x1), (0.5 * x1)[order(x1)], type = "l", main = "Linear effect", xlab = expression(x[1]), ylab = expression(f[partial])) lines(sort(x1), fitted(mod, which = 1)[order(x1)], col = red, type = "b") legend("topleft", c("true effect", "model"), lty = c(1, 1), pch = c(-1,1), merge = TRUE, col = c("black", red), bty = "n") dev.off() beta <- c(0, -1, 0.5, 3) y <- drop(model.matrix(~ x2) %*% beta + rnorm(n, sd = 0.3)) mod <- gamboost(y ~ bols(x2), control = boost_control(mstop = 50)) pdf("graphics/fig_bolsx2.pdf", width = 5, height = 5) par(mar = c(4, 4, 2, 0) + 0.1) betaPred <- coef(mod)[[1]] betaPred[1] <- betaPred[1] + attr(coef(mod), "offset") betaTrue <- c(0, -1, 0.5, 3) plot(1:4, betaPred, type = "n", xaxt = "n", main = "Categorical effect", xlab = expression(x[2]), ylab = expression(f[partial]), xlim = c(0.5, 4.5), ylim = c(-1.5, 3.5)) axis(1, at = 1:4, labels = expression(x[2]^(1), x[2]^(2), x[2]^(3), x[2]^(4))) for (i in 1:4) lines(i + c(-0.4, 0.4), rep(betaPred[i], each = 2), lwd = 2, col = red) for (i in 1:4) lines(i + c(-0.4, 0.4), rep(betaTrue[i], each = 2), lwd = 2, col = "black") legend("topleft", c("true effect", "model"), lty = c(1, 1), col = c("black", red), bty = "n") dev.off() ### Now show bbs() set.seed(1907) n <- 100 x1 <- rnorm(n) x2 <- rnorm(n) + 0.25 * x1 x3 <- as.factor(sample(0:1, n, replace = TRUE)) x4 <- gl(4, 25) y <- 3 * sin(x1) + x2^2 + rnorm(n) ## specify knot locations for bbs(x2, ...) knots.x2 <- quantile(x2, c(0.25, 0.5, 0.75)) ## fit the model mod <- gamboost(y ~ bbs(x1, knots = 20, df = 4) + bbs(x2, knots = knots.x2, df = 5) + bols(x3) + bols(x4)) ## plot partial effects pdf("graphics/fig_bbsx1.pdf", width = 5, height = 5) par(mar = c(4, 4, 2, 0) + 0.1) plot(sort(x1), (3 * sin(x1))[order(x1)], type = "l", main = expression(paste("True effect: ", f(x[1])==3 * sin(x[1]))), xlab = expression(x[1]), ylab = expression(f[partial])) lines(sort(x1), fitted(mod, which = 1)[order(x1)], col = red, type = "b") legend("topleft", c("true effect", "model"), lty = c(1, 1), pch = c(-1,1), merge = TRUE, col = c("black", red), bty = "n") dev.off() pdf("graphics/fig_bbsx2.pdf", width = 5, height = 5) par(mar = c(4, 4, 2, 0) + 0.1) plot(sort(x2), (x2^2)[order(x2)], type = "l", main = expression(paste("True effect: ", f(x[2])==x[2]^2)), xlab = expression(x[2]), ylab = expression(f[partial])) ## offset needed to conserve correct level (otherwise center both effects around ## zero to make them comparable) lines(sort(x2), fitted(mod, which = 2)[order(x2)] + mod$offset, col = red, type = "b") dev.off() ### example for cyclic spline set.seed(1907) x <- runif(200, 0,(2*pi)) y <- rnorm(200, mean=sin(x), sd=0.2) newX <- seq(0,2*pi, length=100) mod <- gamboost(y ~ bbs(x, knots = 12)) mod_cyclic <- gamboost(y ~ bbs(x, cyclic = TRUE, knots = 12, boundary.knots = c(0, 2*pi))) pdf("graphics/fig_cyclic.pdf", width = 5, height = 5) plot(x,y, main= expression(paste("True effect: ", f(x)==sin(x))), cex=1, pch = 20, col = rgb(0.5, 0.5, 0.5, alpha = 0.8)) lines(newX, predict(mod, data.frame(x = newX)), col = "black", lwd = 2) lines(newX + 2 * pi, predict(mod, data.frame(x = newX)), col = "black", lwd = 2) legend("bottomleft", c("cyclic = FALSE", "cyclic = TRUE"), lty = c(1, 1), col = c("black", red), lwd = 2, bty = "n") dev.off() # for interactive use pdf("graphics/fig_cyclic2.pdf", width = 5, height = 5) plot(x,y, main= expression(paste("True effect: ", f(x)==sin(x))), cex=1, pch = 20, col = rgb(0.5, 0.5, 0.5, alpha = 0.8)) lines(newX, predict(mod, data.frame(x = newX)), col = "black", lwd = 2) lines(newX + 2 * pi, predict(mod, data.frame(x = newX)), col = "black", lwd = 2) lines(newX, predict(mod_cyclic, data.frame(x = newX)), col = red, lwd = 2) lines(newX + 2 * pi, predict(mod_cyclic, data.frame(x = newX)), col = red, lwd = 2) abline(v = 2 * pi, col = "gray", lty = "dashed", lwd = 2) legend("bottomleft", c("cyclic = FALSE", "cyclic = TRUE"), lty = c(1, 1), col = c("black", red), lwd = 2, bty = "n") dev.off() ### example for bspatial set.seed(1907) x1 <- runif(250,-pi,pi) x2 <- runif(250,-pi,pi) y <- sin(x1) * sin(x2) + rnorm(250, sd = 0.4) modspa <- gamboost(y ~ bspatial(x1, x2, knots = list(x1 = 12, x2 = 12))) # possible to specify different knot mesh for x1 and x2: # modspa <- gamboost(y ~ bspatial(x1, x2, knots = list(x1 = 12, x2 = 20))) library(lattice) library(RColorBrewer) ## set up colorscheme nCols <- 50 cols <- colorRampPalette(brewer.pal(11, "RdBu"), space = "Lab", interpolate = "spline")(nCols) ## make new data on a grid: nd <- expand.grid(x1 = seq(-pi, pi, length = 100), x2 = seq(-pi, pi, length = 100)) preds <- predict(modspa, newdata = nd) breaks <- seq(min(preds), max(preds), length = nCols + 1) pdf("graphics/fig_spatial1.pdf") levelplot(preds ~ nd$x1 + nd$x2, xlab = "x1", ylab = "x2", at = breaks, col.regions = cols, cex = 0.7, colorkey = list(space = "left", at = breaks, width = 1)) dev.off() x1 <- x2 <- seq(-pi, pi, length = 100) z <- matrix(preds, ncol = length(x1)) nrz <- nrow(z) ncz <- ncol(z) jet.colors <- colorRampPalette(paste(brewer.pal(11,"RdBu")[c(10,2)], "E6", sep="")) nbcol <- 100 color <- jet.colors(nbcol) zfacet <- z[-1, -1] + z[-1, -ncz] + z[-nrz, -1] + z[-nrz, -ncz] facetcol <- cut(zfacet, nbcol) pdf("graphics/fig_spatial2.pdf") par(mar=c(1, 0.1, 0.1, 0.1), mgp = c(1.8,1,0)) persp(seq(-pi, pi, length = 100), seq(-pi, pi, length = 100), z, theta = 45, phi = 30, expand = 0.5, col = color[facetcol], ticktype = "detailed", nticks = 3, xlab = "x1", ylab = "x2") dev.off() ################################################################################ # optimal stopping and subsetting # ################################################################################ set.seed(1907) ## fit linear model to data model <- glmboost(DEXfat ~ ., data = bodyfat, center = TRUE, control = boost_control(trace = TRUE)) ## 10-fold cross-validation cv10f <- cv(model.weights(model), type = "kfold") cvm <- cvrisk(model, folds = cv10f, grid = 1:100, papply = lapply) # explicitly sequeltial cvm ## now plot the cross-validated risk pdf("graphics/fig_crossvalidated_risk.pdf") plot(cvm) dev.off() ## extract optimal mstop mstop(cvm) ### now expand the model to this value of mstop: model[mstop(cvm)] ### To see that nothing got lost we now increase mstop to 200: model[200]