r - Custom classification threshold for GBM -
i'm trying create custom gbm model tunes classification threshold binary classification problem. there nice example provided on caret website here, when try apply similar gbm receive following error:
error in { : task 1 failed - "argument 1 not vector"
unfortunately, have no idea error , error isn't helpful.
here's example, code i've used defining custom gbm
library(caret) library(gbm) library(proc) #### define custom gbm model probability threshold tuning #### ## model code original gbm method caret customgbm <- getmodelinfo("gbm", regex = false)[[1]] customgbm$type <- c("classification") ## add threshold (i.e. class cutoff) tuning parameter customgbm$parameters <- data.frame(parameter = c("n.trees", "interaction.depth", "shrinkage", "n.minobsinnode", "threshold"), class = rep("numeric", 5), label = c("# boosting iterations", "max tree depth", "shrinkage", "min. terminal node size", "probability cutoff")) ## customise tuning grid: ## paramters fixed. give tuning grid of 2,500 values if len = 100 customgbm$grid <- function(x, y, len = null, search = "grid") { if (search == "grid") { grid <- expand.grid(n.trees = seq(50, 250, 50), interaction.depth = 2, ### fix interaction depth @ 2 shrinkage = 0.0001, ### fix learning rate @ 0.0001 n.minobsinnode = seq(2, 10, 2), threshold = seq(.01, .99, length = len)) } else { grid <- expand.grid(n.trees = floor(runif(len, min = 1, max = 5000)), interaction.depth = sample(1:10, replace = true, size = len), shrinkage = runif(len, min = .001, max = .6), n.minobsinnode = sample(5:25, replace = true, size = len), threshold = runif(1, 0, size = len)) grid <- grid[!duplicated(grid),] ### remove duplicated rows in training grid } grid } ## here fit single gbm model , loop on threshold values predictions ## same gbm model. customgbm$loop = function(grid) { library(plyr) loop <- ddply(grid, c("n.trees", "shrinkage", "interaction.depth", "n.minobsinnode"), function(x) c(threshold = max(x$threshold))) submodels <- vector(mode = "list", length = nrow(loop)) (i in seq(along = loop$threshold)) { index <- which(grid$n.trees == loop$n.trees[i] & grid$interaction.depth == loop$interaction.depth[i] & grid$shrinkage == loop$shrinkage[i] & grid$n.minobsinnode == loop$n.minobsinnode[i]) cuts <- grid[index, "threshold"] submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]]) } list(loop = loop, submodels = submodels) } ## fit model independent of threshold parameter customgbm$fit = function(x, y, wts, param, lev, last, classprobs, ...) { thedots <- list(...) if (any(names(thedots) == "distribution")) { moddist <- thedots$distribution thedots$distribution <- null } else { if (is.numeric(y)) { stop("this works 2-class classification problems") } else moddist <- if (length(lev) == 2) "bernoulli" else stop("this works 2-class classification problems") } # if (length(levels(y)) != 2) # stop("this works 2-class problems") ## check see if weights passed in (and availible) if (!is.null(wts)) thedots$w <- wts if (is.factor(y) && length(lev) == 2) y <- ifelse(y == lev[1], 1, 0) modargs <- list(x = x, y = y, interaction.depth = param$interaction.depth, n.trees = param$n.trees, shrinkage = param$shrinkage, n.minobsinnode = param$n.minobsinnode, distribution = moddist) do.call("gbm.fit", modargs) } ## probability prediction , use different thresholds ## predicted class customgbm$predict = function(modelfit, newdata, submodels = null) { out <- predict(modelfit, newdata, n.trees = modelfit$tunevalue$n.trees, type = "response")#[, modelfit$obslevels[1]] out[is.nan(out)] <- na class1prob <- ifelse(out >= modelfit$tunevalue$threshold, modelfit$obslevels[1], modelfit$obslevels[2]) ## raise threshold class #1 , higher level of ## evidence needed call class 1 should ## decrease sensitivity , increase specificity out <- ifelse(class1prob >= modelfit$tunevalue$threshold, modelfit$obslevels[1], modelfit$obslevels[2]) if (!is.null(submodels)) { tmp2 <- out out <- vector(mode = "list", length = length(submodels$threshold)) out[[1]] <- tmp2 (i in seq(along = submodels$threshold)) { out[[i + 1]] <- ifelse(class1prob >= submodels$threshold[[i]], modelfit$obslevels[1], modelfit$obslevels[2]) } } out } ## probabilities same have create ## mulitple versions of probs evaluate data across ## thresholds customgbm$prob = function(modelfit, newdata, submodels = null) { out <- predict(modelfit, newdata, type = "response", n.trees = modelfit$tunevalue$n.trees) out[is.nan(out)] <- na out <- cbind(out, 1 - out) colnames(out) <- modelfit$obslevels if (!is.null(submodels)) { tmp <- predict(modelfit, newdata, type = "response", n.trees = submodels$n.trees) tmp <- as.list(as.data.frame(tmp)) lapply(tmp, function(x, lvl) { x <- cbind(x, 1 - x) colnames(x) <- lvl x}, lvl = modelfit$obslevels) out <- c(list(out), tmp) } out } fourstats <- function (data, lev = levels(data$obs), model = null) { ## code use area under roc curve , ## sensitivity , specificity values using current candidate ## value of probability threshold. out <- c(twoclasssummary(data, lev = levels(data$obs), model = null)) ## best possible model has sensitivity of 1 , specificity of 1. ## how far value? coords <- matrix(c(1, 1, out["spec"], out["sens"]), ncol = 2, byrow = true) colnames(coords) <- c("spec", "sens") rownames(coords) <- c("best", "current") c(out, dist = dist(coords)[1]) }
and code showing how use custom model
set.seed(949) trainingset <- twoclasssim(500, -9) mod1 <- train(class ~ ., data = trainingset, method = customgbm, metric = "dist", maximize = false, tunelength = 10, trcontrol = traincontrol(method = "cv", number = 5, classprobs = true, summaryfunction = fourstats))
the model appears run, finishes error above. if please me customising gbm model tune gbm parameters, , probability threshold classes great.
Comments
Post a Comment