##################################### # R - Code for Tutorial Reisensburg # Model based boosting in R # Case study 1: birth weight prediction # # Last mod: Andreas Mayr, 18/05/11 ##################################### # read data dat <- read.table("birthweight.txt", header=TRUE, sep="\t") # remove variable "bmi" dat <- dat[,names(dat)!="bmi"] # histogram of the birthweight ##pdf("hist.pdf", width=3, height=3) par(mar=c(2,2,0.1,0.1)) hist(dat$birthw, prob=TRUE, ylab="", main="", cex.axis=0.7) lines(density(dat$birthw), col=2) dev.off() # center the covariates dat[,2:9] <- data.frame(scale(dat[,2:9], center=T, scale=F)) # display the correlations round(cor(dat[,-1]), 3) # add i"ntercept covariate" dat$INT <- rep(1, 150) # splitting the data frame set.seed(0804) random.split <- sample(1:nrow(dat), 100) dat.train <- dat[random.split,] dat.test <- dat[-random.split,] nrow(dat.train) nrow(dat.test) # Build the formula formula1 <- birthw ~ bols(INT, intercept=FALSE) + bols(volARM, intercept=FALSE) + bbs(volARM, center=TRUE, df=1) + bols(volFEM, intercept=FALSE) + bbs(volFEM, center=TRUE, df=1) + bols(volABDO, intercept=FALSE)+ bbs(volABDO, center=TRUE, df=1) + bols(bpd, intercept=FALSE) + bbs(bpd, center=TRUE, df=1) + bols(hc, intercept=FALSE) + bbs(hc, center=TRUE, df=1) + bols(atd, intercept=FALSE) + bbs(atd, center=TRUE, df=1) + bols(fl, intercept=FALSE) + bbs(fl, center=TRUE, df=1) + bols(ac, intercept=FALSE) + bbs(ac, center=TRUE, df=1) # load mboost library(mboost) # fit the model with 500 iterations m1 <- gamboost(formula1, control=boost_control(mstop=500), data=dat.train) # Warning is OK, just reminds us of mean centering the covariates # - allready done # 15-fold boostrap to optimize predictive accuracy set.seed(0804) cvr <- cvrisk(m1, method="bootstrap", B=25) # if multicore not available, warning is issued, but again: just for information ##pdf("cvr.pdf", width=4, height=4) par(mar=c(2,2,2,.1)) plot(cvr, cex.axis=0.7, cex.main=0.7, cex.lab=0.7) dev.off() # Get the optimal iteration st <- mstop(cvr) st # Set the model to this iteration m1[st] # Get the mse mse_boost <- mean((dat.test$birthw - predict(m1, newdata=dat.test))^2) mse_boost_long <- mean((dat.test$birthw - predict(m1[500], newdata=dat.test))^2) # Boundary warnings from spline are OK mse_boost mse_boost_long # set the model back (was changed when calculating mse_boost_long) m1[st] # Compare to classic gam() library(mgcv) gam1 <- gam(birthw ~ s(volARM) + s(volFEM) + s(volABDO) + s(bpd) + s(hc) + s(atd) + s(fl) + s(ac) , data=dat.train) # Get the mse mse_gam1 <- mean((dat.test$birthw - predict(gam1, newdata=dat.test))^2) # Make some plots, comparing the fits par(mfrow=c(1,2)) plot(dat.train$birthw, fitted(m1[61]), xlim=c(200,1500), ylim=c(200,1500)) abline(a=0,b=1) plot(dat.train$birthw, fitted(gam1), xlim=c(200,1500), ylim=c(200,1500)) abline(a=0,b=1) ##pdf("fits.pdf", width=3, height=3) par(mar=c(2,2,0.1,0.1)) plot(fitted(gam1), fitted(m1[500]), cex.axis=.7, ylab="gamboost()", xlab="gam()", col= rgb(0.1,0.1,0.1,0.3),xlim=c(200,1500), ylim=c(200,1500) ) abline(a=0,b=1) dev.off() # On the whole data set m2 <- gamboost(formula1, control=boost_control(mstop=500), data=dat) # 25-fold bootstrapping set.seed(0804) cvr <- cvrisk(m2, method="bootstrap", B=25) mstop(cvr) # subsetting the model m2 <- m2[74] # effect estimates summary(coef(m2)) coef(m2, which="volFEM") # partial plot #pdf("partial.pdf", width=7, height=3.5) par(mfrow=c(1,2), mar=c(4,4,.1,.1)) plot(m2, which="volFEM", cex.axis=0.75, cex.lab=0.75, cex=0.85) dev.off() ord <- order(dat$volFEM) #pdf("partial_sum.pdf", width=3.5, height=3.5) par(mar=c(4,4,.1,.1)) plot(dat$volFEM[ord], rowSums(predict(m2, which="volFEM"))[ord], type="b", ylab="partial effect [g]", xlab="volFEM", cex.axis=0.75, cex.lab=0.75, cex=0.8) rug(dat$volFEM[ord]) dev.off() #### Partial plots for all effects yl <- c(-250,250) #pdf("partials1.pdf", width=7, height=3.5) par(mfrow=c(1,2), mar=c(4,4,.1,.1)) ord <- order(dat$volARM) plot(dat$volARM[ord], rowSums(predict(m2, which="volARM"))[ord], type="b", ylim=yl, ylab="partial effect [g]", xlab="volARM", cex.axis=0.75, cex.lab=0.75, cex=0.8) rug(dat$volARM[ord]) ord <- order(dat$volFEM) plot(dat$volFEM[ord], rowSums(predict(m2, which="volFEM"))[ord], type="b", ylim=yl, ylab="partial effect [g]", xlab="volFEM", cex.axis=0.75, cex.lab=0.75, cex=0.8) rug(dat$volFEM[ord]) dev.off() #pdf("partials2.pdf", width=7, height=3.5) par(mfrow=c(1,2), mar=c(4,4,.1,.1)) ord <- order(dat$volABDO) plot(dat$volABDO[ord], rowSums(predict(m2, which="volABDO"))[ord], type="b",ylim=yl, ylab="partial effect [g]", xlab="volABDO", cex.axis=0.75, cex.lab=0.75, cex=0.8) rug(dat$volABDO[ord]) ord <- order(dat$bpd) plot(dat$bpd[ord], rowSums(predict(m2, which="bpd"))[ord], type="b", ylim=yl, ylab="partial effect [g]", xlab="bpd", cex.axis=0.75, cex.lab=0.75, cex=0.8) rug(dat$bpd[ord]) dev.off() #pdf("partials3.pdf", width=7, height=3.5) par(mfrow=c(1,2), mar=c(4,4,.1,.1)) ord <- order(dat$hc) plot(dat$hc[ord], rowSums(predict(m2, which="hc"))[ord], type="b", ylim=yl, ylab="partial effect [g]", xlab="hc", cex.axis=0.75, cex.lab=0.75, cex=0.8) rug(dat$hc[ord]) ord <- order(dat$atd) plot(dat$atd[ord], rowSums(predict(m2, which="atd"))[ord], type="b", ylim=yl, ylab="partial effect [g]", xlab="atd", cex.axis=0.75, cex.lab=0.75, cex=0.8) rug(dat$atd[ord]) dev.off() #pdf("partials4.pdf", width=7, height=3.5) par(mfrow=c(1,2), mar=c(4,4,.1,.1)) ord <- order(dat$fl) plot(dat$fl[ord], rowSums(predict(m2, which="fl"))[ord], type="b", ylim=yl, ylab="partial effect [g]", xlab="fl", cex.axis=0.75, cex.lab=0.75, cex=0.8) rug(dat$fl) ord <- order(dat$ac) plot(dat$ac[ord], rowSums(predict(m2, which="ac"))[ord], type="b", ylim=yl, ylab="partial effect [g]", xlab="ac", cex.axis=0.75, cex.lab=0.75, cex=0.8) rug(dat$ac) dev.off()