Solved – GBM package vs. Caret using GBM

boostingcaretr

I have been model tuning using caret, but then re-running the model using the gbm package. It is my understanding that the caret package uses gbm and the output should be the same. However, just a quick test run using data(iris) shows a discrepancy in model of about 5% using RMSE and R^2 as the evaluation metric.
I want to find optimal model performance using caret but re-run in gbm to make use of the partial dependency plots. Code below for reproducibility.

My questions would be:

1) Why am I seeing a difference between these two packages even though they should be the same (I understand that they are stochastic but 5% is somewhat a large difference, especially when I am not using such a nice dataset as iris for my modeling).

2) Are there any advantages or disadvantages to using both packages – if so, which ones?

3) Unrelated: Using the iris dataset the optimal interaction.depth is 5 however it is higher than what I've read should be the maximum using floor(sqrt(ncol(iris))) which would be 2. Is this a strict rule of thumb or is it quite flexible?

library(caret)
library(gbm)
library(hydroGOF)
library(Metrics)
data(iris)

# Using caret
caretGrid <- expand.grid(interaction.depth=c(1, 3, 5), n.trees = (0:50)*50,
                   shrinkage=c(0.01, 0.001),
                   n.minobsinnode=10)
metric <- "RMSE"
trainControl <- trainControl(method="cv", number=10)

set.seed(99)
gbm.caret <- train(Sepal.Length ~ ., data=iris, distribution="gaussian", method="gbm",
              trControl=trainControl, verbose=FALSE, 
              tuneGrid=caretGrid, metric=metric, bag.fraction=0.75)                  

print(gbm.caret)
# caret determines the optimal model to be at n.tress=700, interaction.depth=5, shrinkage=0.01
# and n.minobsinnode=10
# RMSE = 0.3247354
# R^2 = 0.8604

# Using GBM
set.seed(99)
gbm.gbm <- gbm(Sepal.Length ~ ., data=iris, distribution="gaussian", n.trees=700, interaction.depth=5,
           n.minobsinnode=10, shrinkage=0.01, bag.fraction=0.75, cv.folds=10, verbose=FALSE)
best.iter <- gbm.perf(gbm.gbm, method="cv")
print(best.iter)
# Here the optimal n.trees = 540

train.predict <- predict.gbm(object=gbm.gbm, newdata=iris, 700)

print(rmse(iris$Sepal.Length, train.predict))
# RMSE = 0.2377

R2 <- cor(gbm.gbm$fit, iris$Sepal.Length)^2
print(R2)
# R^2 = 0.9178`

Best Answer

Use with the default grid to optimize parameters and use predict to have the same results:

R2.caret-R2.gbm=0.0009125435

rmse.caret-rmse.gbm=-0.001680319

library(caret)
library(gbm)
library(hydroGOF)
library(Metrics)
data(iris)

# Using caret with the default grid to optimize tune parameters automatically
# GBM Tuning parameters:
# n.trees (# Boosting Iterations)
# interaction.depth (Max Tree Depth)
# shrinkage (Shrinkage)
# n.minobsinnode (Min. Terminal Node Size)

metric <- "RMSE"
trainControl <- trainControl(method="cv", number=10)

set.seed(99)
gbm.caret <- train(Sepal.Length ~ .
                   , data=iris
                   , distribution="gaussian"
                   , method="gbm"
                   , trControl=trainControl
                   , verbose=FALSE
                   #, tuneGrid=caretGrid
                   , metric=metric
                   , bag.fraction=0.75
                   )                  

print(gbm.caret)

caret.predict <- predict(gbm.caret, newdata=iris, type="raw")

rmse.caret<-rmse(iris$Sepal.Length, caret.predict)
print(rmse.caret)

R2.caret <- cor(gbm.caret$finalModel$fit, iris$Sepal.Length)^2
print(R2.caret)

#using gbm without caret with the same parameters
set.seed(99)
gbm.gbm <- gbm(Sepal.Length ~ .
               , data=iris
               , distribution="gaussian"
               , n.trees=150
               , interaction.depth=3
               , n.minobsinnode=10
               , shrinkage=0.1
               , bag.fraction=0.75
               , cv.folds=10
               , verbose=FALSE
               )
best.iter <- gbm.perf(gbm.gbm, method="cv")
print(best.iter)

train.predict <- predict.gbm(object=gbm.gbm, newdata=iris, 150)

rmse.gbm<-rmse(iris$Sepal.Length, train.predict)
print(rmse.gbm)

R2.gbm <- cor(gbm.gbm$fit, iris$Sepal.Length)^2
print(R2.gbm)

print(R2.caret-R2.gbm)
print(rmse.caret-rmse.gbm)
Related Question