Gradient Boosted Trees using XGBoost
gbt( dataset, rvar, evar, type = "classification", lev = "", max_depth = 6, learning_rate = 0.3, min_split_loss = 0, min_child_weight = 1, subsample = 1, nrounds = 100, early_stopping_rounds = 10, nthread = 12, wts = "None", seed = NA, data_filter = "", envir = parent.frame(), ... )
dataset | Dataset |
---|---|
rvar | The response variable in the model |
evar | Explanatory variables in the model |
type | Model type (i.e., "classification" or "regression") |
lev | Level to use as the first column in prediction output |
max_depth | Maximum 'depth' of tree |
learning_rate | Learning rate (eta) |
min_split_loss | Minimal improvement (gamma) |
min_child_weight | Minimum number of instances allowed in each node |
subsample | Subsample ratio of the training instances (0-1) |
nrounds | Number of trees to create |
early_stopping_rounds | Early stopping rule |
nthread | Number of parallel threads to use. Defaults to 12 if available |
wts | Weights to use in estimation |
seed | Random seed to use as the starting point |
data_filter | Expression entered in, e.g., Data > View to filter the dataset in Radiant. The expression should be a string (e.g., "price > 10000") |
envir | Environment to extract data from |
... | Further arguments to pass to xgboost |
A list with all variables defined in gbt as an object of class gbt
See https://radiant-rstats.github.io/docs/model/gbt.html for an example in Radiant
summary.gbt
to summarize results
plot.gbt
to plot results
predict.gbt
for prediction
if (FALSE) { gbt(titanic, "survived", c("pclass", "sex"), lev = "Yes") %>% summary() gbt(titanic, "survived", c("pclass", "sex")) %>% str() } gbt(titanic, "survived", c("pclass", "sex"), lev = "Yes", early_stopping_rounds = 0) %>% summary()#> Gradient Boosted Trees (XGBoost) #> Type : Classification #> Data : titanic #> Response variable : survived #> Level : Yes in survived #> Explanatory variables: pclass, sex #> Max depth : 6 #> Learning rate (eta) : 0.3 #> Min split loss : 0 #> Min child weight : 1 #> Sub-sample : 1 #> Nr of rounds (trees) : 100 #> Early stopping rounds: 0 #> Nr obs : 1,043 #> #> Iteration history: #> #> [1] train-auc:0.824504 #> [2] train-auc:0.824504 #> Stopping. Best iteration: #> [1] train-auc:0.824504#> List of 26 #> $ check : chr "" #> $ model :List of 15 #> ..$ handle :Class 'xgb.Booster.handle' <externalptr> #> ..$ raw : raw [1:3937] 43 4f 4e 46 ... #> ..$ best_iteration : num 1 #> ..$ best_ntreelimit: int 1 #> ..$ best_score : num 0.825 #> ..$ best_msg : chr "[1]\ttrain-auc:0.824504" #> ..$ niter : int 2 #> ..$ evaluation_log :Classes ‘data.table’ and 'data.frame': 2 obs. of 2 variables: #> .. ..$ iter : num [1:2] 1 2 #> .. ..$ train_auc: num [1:2] 0.825 0.825 #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ call : language xgb.train(params = params, data = dtrain, nrounds = nrounds, watchlist = watchlist, verbose = verbose, print| __truncated__ ... #> ..$ params :List of 9 #> .. ..$ max_depth : num 6 #> .. ..$ learning_rate : num 0.3 #> .. ..$ min_split_loss : num 0 #> .. ..$ min_child_weight : num 1 #> .. ..$ subsample : num 1 #> .. ..$ nthread : num 12 #> .. ..$ objective : chr "binary:logistic" #> .. ..$ eval_metric : chr "auc" #> .. ..$ validate_parameters: logi TRUE #> ..$ callbacks :List of 3 #> .. ..$ cb.print.evaluation:function (env = parent.frame()) #> .. .. ..- attr(*, "call")= language cb.print.evaluation(period = print_every_n) #> .. .. ..- attr(*, "name")= chr "cb.print.evaluation" #> .. ..$ cb.evaluation.log :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.evaluation.log() #> .. .. ..- attr(*, "name")= chr "cb.evaluation.log" #> .. ..$ cb.early.stop :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, verbose = verbose) #> .. .. ..- attr(*, "name")= chr "cb.early.stop" #> ..$ feature_names : chr [1:3] "pclass2nd" "pclass3rd" "sexmale" #> ..$ nfeatures : int 3 #> ..$ importance :Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables: #> .. ..$ Feature : chr [1:3] "sexmale" "pclass3rd" "pclass2nd" #> .. ..$ Gain : num [1:3] 0.758 0.21 0.032 #> .. ..$ Cover : num [1:3] 0.435 0.435 0.13 #> .. ..$ Frequency: num [1:3] 0.25 0.5 0.25 #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ model : tibble [1,043 × 3] (S3: tbl_df/tbl/data.frame) #> .. ..$ survived: Factor w/ 2 levels "Yes","No": 1 1 2 2 2 1 1 2 1 2 ... #> .. ..$ pclass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ... #> .. ..$ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ... #> .. ..- attr(*, "description")= chr "## Titanic\n\nThis dataset describes the survival status of individual passengers on the Titanic. The titanic d"| __truncated__ #> ..- attr(*, "class")= chr "xgb.Booster" #> $ output : chr [1:7] "[1]\ttrain-auc:0.824504 " "Will train until train_auc hasn't improved in 0 rounds." "" "[2]\ttrain-auc:0.824504 " ... #> $ check_args :function (arg, default, inp = gbt_input) #> $ extra_args_names : NULL #> $ extra_args : list() #> $ gbt_input :List of 10 #> ..$ max_depth : num 6 #> ..$ learning_rate : num 0.3 #> ..$ min_split_loss : num 0 #> ..$ nrounds : num 100 #> ..$ min_child_weight : num 1 #> ..$ subsample : num 1 #> ..$ early_stopping_rounds: num 0 #> ..$ nthread : num 12 #> ..$ objective : chr "binary:logistic" #> ..$ eval_metric : chr "auc" #> $ not_vary : chr(0) #> $ nr_obs : int 1043 #> $ df_name : chr "titanic" #> $ vars : chr [1:2] "pclass" "sex" #> $ rvar : chr "survived" #> $ evar : chr [1:2] "pclass" "sex" #> $ type : chr "classification" #> $ lev : chr "Yes" #> $ max_depth : num 6 #> $ learning_rate : num 0.3 #> $ min_split_loss : num 0 #> $ min_child_weight : num 1 #> $ subsample : num 1 #> $ nrounds : num 100 #> $ early_stopping_rounds: num 0 #> $ nthread : num 12 #> $ wts : NULL #> $ seed : chr NA #> $ data_filter : chr "" #> - attr(*, "class")= chr [1:3] "gbt" "model" "list"#> List of 26 #> $ check : chr "" #> $ model :List of 15 #> ..$ handle :Class 'xgb.Booster.handle' <externalptr> #> ..$ raw : raw [1:8906] 43 4f 4e 46 ... #> ..$ best_iteration : num 1 #> ..$ best_ntreelimit: int 1 #> ..$ best_score : num 0.593 #> ..$ best_msg : chr "[1]\ttrain-error@0.0833333:0.592522" #> ..$ niter : int 11 #> ..$ evaluation_log :Classes ‘data.table’ and 'data.frame': 11 obs. of 2 variables: #> .. ..$ iter : num [1:11] 1 2 3 4 5 6 7 8 9 10 ... #> .. ..$ train_error@0.0833333: num [1:11] 0.593 0.593 0.593 0.593 0.593 ... #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ call : language xgb.train(params = params, data = dtrain, nrounds = nrounds, watchlist = watchlist, verbose = verbose, print| __truncated__ ... #> ..$ params :List of 9 #> .. ..$ max_depth : num 6 #> .. ..$ learning_rate : num 0.3 #> .. ..$ min_split_loss : num 0 #> .. ..$ min_child_weight : num 1 #> .. ..$ subsample : num 1 #> .. ..$ nthread : num 12 #> .. ..$ objective : chr "binary:logistic" #> .. ..$ eval_metric : chr "error@0.0833333333333333" #> .. ..$ validate_parameters: logi TRUE #> ..$ callbacks :List of 3 #> .. ..$ cb.print.evaluation:function (env = parent.frame()) #> .. .. ..- attr(*, "call")= language cb.print.evaluation(period = print_every_n) #> .. .. ..- attr(*, "name")= chr "cb.print.evaluation" #> .. ..$ cb.evaluation.log :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.evaluation.log() #> .. .. ..- attr(*, "name")= chr "cb.evaluation.log" #> .. ..$ cb.early.stop :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, verbose = verbose) #> .. .. ..- attr(*, "name")= chr "cb.early.stop" #> ..$ feature_names : chr [1:3] "pclass2nd" "pclass3rd" "sexmale" #> ..$ nfeatures : int 3 #> ..$ importance :Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables: #> .. ..$ Feature : chr [1:3] "sexmale" "pclass3rd" "pclass2nd" #> .. ..$ Gain : num [1:3] 0.7291 0.2356 0.0353 #> .. ..$ Cover : num [1:3] 0.393 0.364 0.243 #> .. ..$ Frequency: num [1:3] 0.208 0.415 0.377 #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ model : tibble [1,043 × 3] (S3: tbl_df/tbl/data.frame) #> .. ..$ survived: Factor w/ 2 levels "Yes","No": 1 1 2 2 2 1 1 2 1 2 ... #> .. ..$ pclass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ... #> .. ..$ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ... #> .. ..- attr(*, "description")= chr "## Titanic\n\nThis dataset describes the survival status of individual passengers on the Titanic. The titanic d"| __truncated__ #> ..- attr(*, "class")= chr "xgb.Booster" #> $ output : chr [1:16] "[1]\ttrain-error@0.0833333:0.592522 " "Will train until train_error@0.0833333 hasn't improved in 10 rounds." "" "[2]\ttrain-error@0.0833333:0.592522 " ... #> $ check_args :function (arg, default, inp = gbt_input) #> $ extra_args_names : chr "eval_metric" #> $ extra_args :List of 1 #> ..$ eval_metric: chr "error@0.0833333333333333" #> $ gbt_input :List of 10 #> ..$ max_depth : num 6 #> ..$ learning_rate : num 0.3 #> ..$ min_split_loss : num 0 #> ..$ nrounds : num 100 #> ..$ min_child_weight : num 1 #> ..$ subsample : num 1 #> ..$ early_stopping_rounds: num 10 #> ..$ nthread : num 12 #> ..$ objective : chr "binary:logistic" #> ..$ eval_metric : chr "error@0.0833333333333333" #> $ not_vary : chr(0) #> $ nr_obs : int 1043 #> $ df_name : chr "titanic" #> $ vars : chr [1:2] "pclass" "sex" #> $ rvar : chr "survived" #> $ evar : chr [1:2] "pclass" "sex" #> $ type : chr "classification" #> $ lev : chr "Yes" #> $ max_depth : num 6 #> $ learning_rate : num 0.3 #> $ min_split_loss : num 0 #> $ min_child_weight : num 1 #> $ subsample : num 1 #> $ nrounds : num 100 #> $ early_stopping_rounds: num 10 #> $ nthread : num 12 #> $ wts : NULL #> $ seed : chr NA #> $ data_filter : chr "" #> - attr(*, "class")= chr [1:3] "gbt" "model" "list"#> Gradient Boosted Trees (XGBoost) #> Type : Regression #> Data : diamonds #> Response variable : price #> Explanatory variables: carat, clarity #> Max depth : 6 #> Learning rate (eta) : 0.3 #> Min split loss : 0 #> Min child weight : 1 #> Sub-sample : 1 #> Nr of rounds (trees) : 100 #> Early stopping rounds: 10 #> Nr obs : 3,000 #> #> Iteration history: #> #> [02:24:23] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror. #> #> [2] train-rmse:2934.489990 #> [3] train-rmse:2209.099365 #> [4] train-rmse:1731.832275 #> [5] train-rmse:1429.542236 #> [6] train-rmse:1236.558716 #> [7] train-rmse:1112.986206 #> [8] train-rmse:1041.411133 #> [9] train-rmse:994.494568 #> ... #> [91] train-rmse:747.640930 #> [92] train-rmse:747.166992 #> [93] train-rmse:745.880615 #> [94] train-rmse:744.933716 #> [95] train-rmse:744.060425 #> [96] train-rmse:743.978699 #> [97] train-rmse:743.872742 #> [98] train-rmse:743.621277 #> [99] train-rmse:743.458862 #> [100] train-rmse:743.281799rig_wrap <- function(preds, dtrain) { labels <- xgboost::getinfo(dtrain, "label") value <- rig(preds, labels, lev = 1) list(metric = "rig", value = value) } gbt(titanic, "survived", c("pclass", "sex"), eval_metric = rig_wrap, maximize = TRUE) %>% str()#> List of 26 #> $ check : chr "" #> $ model :List of 15 #> ..$ handle :Class 'xgb.Booster.handle' <externalptr> #> ..$ raw : raw [1:8838] 43 4f 4e 46 ... #> ..$ best_iteration : num 1 #> ..$ best_ntreelimit: int 1 #> ..$ best_score : num 0.124 #> ..$ best_msg : chr "[1]\ttrain-rig:0.124424" #> ..$ niter : int 11 #> ..$ evaluation_log :Classes ‘data.table’ and 'data.frame': 11 obs. of 2 variables: #> .. ..$ iter : num [1:11] 1 2 3 4 5 6 7 8 9 10 ... #> .. ..$ train_rig: num [1:11] 0.124 0.124 0.124 0.124 0.124 ... #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ call : language xgb.train(params = params, data = dtrain, nrounds = nrounds, watchlist = watchlist, verbose = verbose, print| __truncated__ ... #> ..$ params :List of 8 #> .. ..$ max_depth : num 6 #> .. ..$ learning_rate : num 0.3 #> .. ..$ min_split_loss : num 0 #> .. ..$ min_child_weight : num 1 #> .. ..$ subsample : num 1 #> .. ..$ nthread : num 12 #> .. ..$ objective : chr "binary:logistic" #> .. ..$ validate_parameters: logi TRUE #> ..$ callbacks :List of 3 #> .. ..$ cb.print.evaluation:function (env = parent.frame()) #> .. .. ..- attr(*, "call")= language cb.print.evaluation(period = print_every_n) #> .. .. ..- attr(*, "name")= chr "cb.print.evaluation" #> .. ..$ cb.evaluation.log :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.evaluation.log() #> .. .. ..- attr(*, "name")= chr "cb.evaluation.log" #> .. ..$ cb.early.stop :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, verbose = verbose) #> .. .. ..- attr(*, "name")= chr "cb.early.stop" #> ..$ feature_names : chr [1:3] "pclass2nd" "pclass3rd" "sexmale" #> ..$ nfeatures : int 3 #> ..$ importance :Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables: #> .. ..$ Feature : chr [1:3] "sexmale" "pclass3rd" "pclass2nd" #> .. ..$ Gain : num [1:3] 0.7291 0.2356 0.0353 #> .. ..$ Cover : num [1:3] 0.393 0.364 0.243 #> .. ..$ Frequency: num [1:3] 0.208 0.415 0.377 #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ model : tibble [1,043 × 3] (S3: tbl_df/tbl/data.frame) #> .. ..$ survived: Factor w/ 2 levels "Yes","No": 1 1 2 2 2 1 1 2 1 2 ... #> .. ..$ pclass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ... #> .. ..$ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ... #> .. ..- attr(*, "description")= chr "## Titanic\n\nThis dataset describes the survival status of individual passengers on the Titanic. The titanic d"| __truncated__ #> ..- attr(*, "class")= chr "xgb.Booster" #> $ output : chr [1:16] "[1]\ttrain-rig:0.124424 " "Will train until train_rig hasn't improved in 10 rounds." "" "[2]\ttrain-rig:0.124424 " ... #> $ check_args :function (arg, default, inp = gbt_input) #> $ extra_args_names : chr [1:2] "eval_metric" "maximize" #> $ extra_args :List of 2 #> ..$ eval_metric:function (preds, dtrain) #> .. ..- attr(*, "srcref")= 'srcref' int [1:8] 9 13 13 1 13 1 9 13 #> .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x10791bf0> #> ..$ maximize : logi TRUE #> $ gbt_input :List of 11 #> ..$ max_depth : num 6 #> ..$ learning_rate : num 0.3 #> ..$ min_split_loss : num 0 #> ..$ nrounds : num 100 #> ..$ min_child_weight : num 1 #> ..$ subsample : num 1 #> ..$ early_stopping_rounds: num 10 #> ..$ nthread : num 12 #> ..$ objective : chr "binary:logistic" #> ..$ eval_metric :function (preds, dtrain) #> .. ..- attr(*, "srcref")= 'srcref' int [1:8] 9 13 13 1 13 1 9 13 #> .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x10791bf0> #> ..$ maximize : logi TRUE #> $ not_vary : chr(0) #> $ nr_obs : int 1043 #> $ df_name : chr "titanic" #> $ vars : chr [1:2] "pclass" "sex" #> $ rvar : chr "survived" #> $ evar : chr [1:2] "pclass" "sex" #> $ type : chr "classification" #> $ lev : chr "Yes" #> $ max_depth : num 6 #> $ learning_rate : num 0.3 #> $ min_split_loss : num 0 #> $ min_child_weight : num 1 #> $ subsample : num 1 #> $ nrounds : num 100 #> $ early_stopping_rounds: num 10 #> $ nthread : num 12 #> $ wts : NULL #> $ seed : chr NA #> $ data_filter : chr "" #> - attr(*, "class")= chr [1:3] "gbt" "model" "list"