## R version 2.13.2
## R-package mlr version 0.3.1895

## data is a
# 'data.frame': 1163 obs. of 20 variables:
#  $ age.continuous                        : num 0.806 1.021 0.914 1.237 0.914 ...
#  $ sex                                   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
#  $ bmi.continuous                        : num -0.621 0.4979 -0.8219 -0.5227 -0.0509 ...
#  $ nyha                                  : Factor w/ 3 levels "0","1","2": 2 1 1 1 1 3 3 3 1 2 ...
#  $ myocardial.infarction                 : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 2 2 1 1 ...
#  $ critical.preoperative.state           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
#  $ pulmonary.hypertension                : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
#  $ heart.rhythm                          : Factor w/ 3 levels "0","1","2": 1 1 1 1 2 1 2 2 1 1 ...
#  $ leftventricular.dysfunction           : Factor w/ 3 levels "0","1","2": 1 1 2 1 1 2 3 3 1 1 ...
#  $ angiography.findings                  : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 1 ...
#  $ reoperation                           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#  $ diabetes                              : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
#  $ extracardiac.arteriopathy             : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 2 2 2 1 ...
#  $ pulmonary.disease                     : Factor w/ 3 levels "0","1","2": 1 1 2 1 1 2 1 1 1 2 ...
#  $ neurological.dysfunction              : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 1 1 1 ...
#  $ preoperative.renal.replacement.therapy: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#  $ preoperative.creatinine.level         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#  $ emergency                             : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 2 1 1 ...
#  $ troponin                              : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
#  $ recovery.state                        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 1 1 ...

library(mlr)

# create the classification task
mt <- makeClassifTask(data = data, target = "recovery.state", positive = "1")

### PARAMETER TUNING ###

# 5-fold stratified cross-validation
rinst <- makeResampleDesc("StratCV", iters = 5)
rinst <- makeResampleInstance(rinst, task = mt)

# performance measures (the first one is optimized)
measures <- list(auc, mmce, fpr, fnr, timeboth)

## kknn
learn.kknn <- makeLearner("classif.kknn", predict.type = "prob")
pars <- makeParameterSet(
  makeDiscreteParameter("k", c(5, 10, 20, 50, 100, 200, 500, 600)),
  makeDiscreteParameter("distance", 1:3),
  makeDiscreteParameter("kernel", c("rectangular", "gaussian")))
tune.kknn <- tune(learner = learn.kknn, task = mt, resampling = rinst,
  par.set = pars, control = makeTuneGridControl(), measures = measures)

## ksvm
learn.ksvm <- makeLearner("classif.ksvm", predict.type = "prob")
pars.radial <- makeParameterSet(
  makeDiscreteParameter("kernel", "rbfdot"),
  makeDiscreteParameter("C", 2^seq(-5, 5)),
  makeDiscreteParameter("sigma", 2^seq(-5, 5)))
pars.poly <- makeParameterSet(
  makeDiscreteParameter("kernel", "polydot"),
  makeDiscreteParameter("C", 2^seq(-5, 5)),
  makeDiscreteParameter("degree", 1:3))
tune.ksvm.radial <- tune(learner = learn.ksvm, task = mt, resampling = rinst,
  par.set = pars.radial, control = makeTuneGridControl(), measures = measures)
tune.ksvm.poly <- tune(learner = learn.ksvm, task = mt, resampling = rinst,
  par.set = pars.poly, control = makeTuneGridControl(), measures = measures)

## randomForest
# size of a training set in 5-fold cross-validation (4/5 of the data),
# used to define the sampsize candidates
n <- nrow(data) * 4/5
learn.randomForest <- makeLearner("classif.randomForest", predict.type = "prob")
pars <- makeParameterSet(
  makeDiscreteParameter("ntree", c(500, 1000, 2000, 3000)),
  makeDiscreteParameter("mtry", c(3, 5, 10, 15)),
  makeDiscreteParameter("sampsize", ceiling(c(0.25, 0.5, 0.75) * n)))
tune.rF <- tune(learner = learn.randomForest, task = mt, resampling = rinst,
  par.set = pars, control = makeTuneGridControl(), measures = measures)
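# Added remark (not part of the original analysis): a quick way to inspect a
# tuning result before reusing it. In this historical S4 API of mlr 0.x the
# optimal hyperparameter values are stored in the slot 'x' (accessed further
# below via par.vals = tune.kknn@x); printing the object shows the best setting.
tune.kknn
tune.kknn@x  # list with the optimal values of k, distance and kernel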
## gbm
learn.gbm <- makeLearner("classif.gbm", predict.type = "prob")
pars <- makeParameterSet(
  makeDiscreteParameter("n.trees", c(500, 1000, 2000, 3000)),
  makeDiscreteParameter("interaction.depth", 1:3),
  makeDiscreteParameter("distribution", c("bernoulli", "adaboost")),
  makeDiscreteParameter("shrinkage", c(0.001, 0.01, 0.05)),
  makeDiscreteParameter("bag.fraction", c(0.25, 0.5, 0.75)))
tune.gbm <- tune(learner = learn.gbm, task = mt, resampling = rinst,
  par.set = pars, control = makeTuneGridControl(), measures = measures)

### VARIABLE SELECTION AND PERFORMANCE ASSESSMENT ###

# nested resampling strategy
# outer loop: 25 subsampling iterations
outer <- makeResampleDesc("Subsample", iters = 25, split = 4/5)
outer <- makeResampleInstance(outer, task = mt)
# inner loop: 3-fold stratified cross-validation
inner <- makeResampleDesc("StratCV", iters = 3)

# forward search until the AUC cannot be improved by at least 0.001
control <- makeVarselControlSequential(method = "sfs", alpha = 0.001)

## kknn
learn.kknn <- makeLearner(class = "classif.kknn", predict.type = "prob",
  par.vals = tune.kknn@x, id = "kknn")
learn.kknn.wrapper <- makeVarselWrapper(learner = learn.kknn,
  resampling = inner, measures = measures, control = control)
learn.kknn.wrapper <- setId(learn.kknn.wrapper, "kknn.varsel")

## ksvm
learn.ksvm.radial <- makeLearner(class = "classif.ksvm", predict.type = "prob",
  par.vals = tune.ksvm.radial@x, id = "svm.radial")
learn.ksvm.radial.wrapper <- makeVarselWrapper(learner = learn.ksvm.radial,
  resampling = inner, measures = measures, control = control)
learn.ksvm.radial.wrapper <- setId(learn.ksvm.radial.wrapper, "svm.radial.varsel")

learn.ksvm.poly <- makeLearner(class = "classif.ksvm", predict.type = "prob",
  par.vals = tune.ksvm.poly@x, id = "svm.poly")
learn.ksvm.poly.wrapper <- makeVarselWrapper(learner = learn.ksvm.poly,
  resampling = inner, measures = measures, control = control)
learn.ksvm.poly.wrapper <- setId(learn.ksvm.poly.wrapper, "svm.poly.varsel")

## randomForest
learn.rF <- makeLearner(class = "classif.randomForest", predict.type = "prob",
  par.vals = tune.rF@x, id = "rF")
learn.rF.wrapper <- makeVarselWrapper(learner = learn.rF,
  resampling = inner, measures = measures, control = control)
learn.rF.wrapper <- setId(learn.rF.wrapper, "rF.varsel")

## lda
learn.lda <- makeLearner(class = "classif.lda", predict.type = "prob", id = "lda")
learn.lda.wrapper <- makeVarselWrapper(learner = learn.lda,
  resampling = inner, measures = measures, control = control)
learn.lda.wrapper <- setId(learn.lda.wrapper, "lda.varsel")

## logreg
learn.logreg <- makeLearner(class = "classif.logreg", predict.type = "prob", id = "logreg")
learn.logreg.wrapper <- makeVarselWrapper(learner = learn.logreg,
  resampling = inner, measures = measures, control = control)
learn.logreg.wrapper <- setId(learn.logreg.wrapper, "logreg.varsel")

## gbm
learn.gbm <- makeLearner(class = "classif.gbm", predict.type = "prob",
  par.vals = tune.gbm@x, id = "gbm")
learn.gbm.wrapper <- makeVarselWrapper(learner = learn.gbm,
  resampling = inner, measures = measures, control = control)
learn.gbm.wrapper <- setId(learn.gbm.wrapper, "gbm.varsel")

learners <- c(learn.logreg, learn.logreg.wrapper, learn.lda, learn.lda.wrapper,
  learn.kknn, learn.kknn.wrapper, learn.ksvm.poly, learn.ksvm.poly.wrapper,
  learn.ksvm.radial, learn.ksvm.radial.wrapper, learn.rF, learn.rF.wrapper,
  learn.gbm, learn.gbm.wrapper)

result <- bench.exp(learners = learners, tasks = mt, resampling = outer,
  measures = measures)
result
#                   auc.test.mean auc.test.sd mmce.test.mean mmce.test.sd fpr.test.mean fpr.test.sd fnr.test.mean fnr.test.sd timeboth.test.mean timeboth.test.sd
# logreg                0.8071598  0.07104211     0.07399142   0.01196596   0.016658096 0.009278535     0.7932162  0.10969073            0.17284     8.374847e-02
# logreg.varsel         0.8207111  0.06628239     0.07347639   0.01265427   0.012599565 0.007479000     0.8345289  0.10553406          421.68972     5.436710e+01
# lda                   0.8018197  0.07520445     0.08326180   0.01638976   0.041080653 0.014100631     0.6156837  0.13243794            0.14788     5.423354e-02
# lda.varsel            0.8093730  0.06836761     0.08566524   0.01393363   0.042199969 0.013191150     0.6309453  0.14113748          399.56944     5.076704e+01
# kknn                  0.8146294  0.05858792     0.07416309   0.01290650   0.000000000 0.000000000     1.0000000  0.00000000            0.76608     8.099276e-02
# kknn.varsel           0.7812462  0.06461410     0.07416309   0.01290650   0.000000000 0.000000000     1.0000000  0.00000000          590.21348     1.646269e+02
# svm.poly              0.7214310  0.08032230     0.07416309   0.01290650   0.000000000 0.000000000     1.0000000  0.00000000            0.33100     3.679787e-02
# svm.poly.varsel       0.5486130  0.12461284     0.07416309   0.01290650   0.000000000 0.000000000     1.0000000  0.00000000          299.82240     1.703016e+02
# svm.radial            0.7761683  0.05939727     0.07914163   0.01108704   0.009789133 0.005799594     0.9429037  0.05247327            0.42796     1.224568e-02
# svm.radial.varsel     0.5480310  0.12638513     0.07416309   0.01290650   0.001121593 0.003882123     0.9875280  0.04070159          280.42124     1.941768e+02
# rF                    0.8223086  0.05369785     0.07673820   0.01324691   0.004814723 0.003121756     0.9726135  0.03597640            0.84616     1.381182e-01
# rF.varsel             0.7705418  0.06961739     0.07622318   0.01271237   0.008310137 0.006474144     0.9229443  0.08020209          641.42404     1.034416e+02
# gbm                   0.8202491  0.05943119     0.07673820   0.01265427   0.006675439 0.004039268     0.9524897  0.05496522            0.62144     7.577819e-03
# gbm.varsel            0.7914310  0.07132319     0.07587983   0.01230994   0.003692993 0.004627978     0.9767682  0.04421254          413.96228     9.646875e+01
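# Added remark: for kknn, svm.poly and several of the varsel variants the fpr
# is 0 and the fnr is 1, i.e. these models always predict the majority class
# "0"; their mmce of about 0.074 then simply equals the relative frequency of
# the positive class in the test sets. A quick check of the class distribution
# (sketch, base R only):
prop.table(table(data$recovery.state))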
### R-CODE FOR TABLE 3 AND FIGURE 1 ###

library(lattice)

# variable names
names <- names(result["opt.results"]$data)
# selected features of the variable-selection learners (every second entry)
sel.var <- lapply(names[seq(2, 14, 2)],
  function(x) getSelectedFeatures(br = result, learner.id = x))
# original variable names, for reference
vars <- names(sel.var[[1]])
vars.eng <- c("age", "sex", "bmi", "nyha", "myocardial infarction",
  "critical preoperative state", "pulmonary hypertension", "heart rhythm",
  "left-ventricular dysfunction", "angiography findings", "reoperation",
  "diabetes", "extracardiac arteriopathy", "pulmonary disease",
  "neurological dysfunction", "preoperative renal replacement therapy",
  "preoperative creatinine level", "emergency", "troponin")
names(sel.var) <- names[seq(1, 14, 2)]

# calculate the mean number of selected variables and its standard deviation
nr.sel <- lapply(sel.var, rowSums)
lapply(nr.sel, mean)
# $logreg
# [1] 6.8
# $lda
# [1] 6.68
# $kknn
# [1] 4.48
# $svm.poly
# [1] 3.32
# $svm.radial
# [1] 3.08
# $rF
# [1] 7.72
# $gbm
# [1] 6.04
lapply(nr.sel, sd)
# $logreg
# [1] 1.322876
# $lda
# [1] 1.314027
# $kknn
# [1] 1.636052
# $svm.poly
# [1] 2.688246
# $svm.radial
# [1] 3.581434
# $rF
# [1] 1.791647
# $gbm
# [1] 1.968079

# remove the svm results (not shown in the barplot)
sel.var["svm.poly"] <- NULL
sel.var["svm.radial"] <- NULL

# barplot showing the number of selected variables
names <- names(sel.var)
sel.var <- t(do.call("cbind", sel.var))
sel.var <- data.frame(sel.var, learner = factor(rep(names, each = 19)),
  vars = factor(rep(vars.eng, length(names))))
# selection frequency per variable and learner (over the 25 outer iterations)
sel.var.sum <- data.frame(X = rowSums(sel.var[, 1:25]),
  learner = sel.var$learner, vars = sel.var$vars)
sel.var.sum <- reshape(sel.var.sum, v.names = "X", direction = "wide",
  idvar = "vars", timevar = "learner")
# order the variables by their total selection frequency
ord <- order(rowSums(sel.var.sum[, 2:6]))
sel.var.sum$vars <- factor(sel.var.sum$vars, levels = sel.var.sum$vars[ord])
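# Added sanity check (sketch): after reshaping, the table should contain one
# row per variable and one count column per remaining learner; each count is a
# selection frequency out of the 25 outer subsampling iterations, hence the
# xlim of 125 = 5 * 25 in the stacked barchart below.
str(sel.var.sum)  # 19 rows; columns vars plus X.logreg, X.lda, X.kknn, X.rF, X.gbm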
color <- rainbow(5, s = 0.4)
trellis.par.set(fontsize = list(text = 22))
barchart(vars ~ X.logreg + X.lda + X.kknn + X.rF + X.gbm, data = sel.var.sum,
  origin = 0, xlab = "selection frequency", ylab = "variables", stack = TRUE,
  xlim = c(0, 125), scales = list(x = list(at = seq(25, 125, 25))),
  key = list(text = list(c("logreg", "lda", "kknn", "rF", "gbm")),
    rectangles = list(col = color), columns = 3, just = "right"),
  panel = function(x, ...) {
    panel.grid(h = 0, v = 4, ...)
    panel.barchart(x, col = color, ...)
  })
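# Added remark (not in the original script; the file name "figure1.pdf" is an
# arbitrary choice): to write Figure 1 to a file, the last trellis object can
# be printed on a pdf device.
fig1 <- trellis.last.object()  # the barchart created above
pdf("figure1.pdf", width = 10, height = 8)
print(fig1)
dev.off()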