Gradient Boosted Trees using XGBoost

gbt(
  dataset,
  rvar,
  evar,
  type = "classification",
  lev = "",
  max_depth = 6,
  learning_rate = 0.3,
  min_split_loss = 0,
  min_child_weight = 1,
  subsample = 1,
  nrounds = 100,
  early_stopping_rounds = 10,
  nthread = 12,
  wts = "None",
  seed = NA,
  data_filter = "",
  envir = parent.frame(),
  ...
)

Arguments

dataset

Dataset

rvar

The response variable in the model

evar

Explanatory variables in the model

type

Model type (i.e., "classification" or "regression")

lev

Level to use as the first column in prediction output

max_depth

Maximum depth of each tree. Deeper trees can capture more complex interactions but are more likely to overfit

learning_rate

Learning rate (eta)

min_split_loss

Minimum loss reduction required to make a further split on a leaf node of a tree (gamma)

min_child_weight

Minimum sum of instance weights (hessian) required in a child node for a split to be made

subsample

Subsample ratio of the training instances (0-1)

nrounds

Number of trees to create

early_stopping_rounds

Stop estimation early if the evaluation metric has not improved for this number of rounds

nthread

Number of parallel threads to use. Defaults to 12 if available

wts

Weights to use in estimation

seed

Random seed to use as the starting point

data_filter

Expression entered in, e.g., Data > View to filter the dataset in Radiant. The expression should be a string (e.g., "price > 10000")

envir

Environment to extract data from

...

Further arguments to pass to xgboost

Value

A list with all variables defined in gbt as an object of class gbt

Details

See https://radiant-rstats.github.io/docs/model/gbt.html for an example in Radiant

See also

summary.gbt to summarize results

plot.gbt to plot results

predict.gbt for prediction

Examples

if (FALSE) {
  gbt(titanic, "survived", c("pclass", "sex"), lev = "Yes") %>% summary()
  gbt(titanic, "survived", c("pclass", "sex")) %>% str()
}
gbt(titanic, "survived", c("pclass", "sex"), lev = "Yes", early_stopping_rounds = 0) %>% summary()
#> Gradient Boosted Trees (XGBoost) #> Type : Classification #> Data : titanic #> Response variable : survived #> Level : Yes in survived #> Explanatory variables: pclass, sex #> Max depth : 6 #> Learning rate (eta) : 0.3 #> Min split loss : 0 #> Min child weight : 1 #> Sub-sample : 1 #> Nr of rounds (trees) : 100 #> Early stopping rounds: 0 #> Nr obs : 1,043 #> #> Iteration history: #> #> [1] train-auc:0.824504 #> [2] train-auc:0.824504 #> Stopping. Best iteration: #> [1] train-auc:0.824504
gbt(titanic, "survived", c("pclass", "sex"), early_stopping_rounds = 0) %>% str()
#> List of 26 #> $ check : chr "" #> $ model :List of 15 #> ..$ handle :Class 'xgb.Booster.handle' <externalptr> #> ..$ raw : raw [1:3937] 43 4f 4e 46 ... #> ..$ best_iteration : num 1 #> ..$ best_ntreelimit: int 1 #> ..$ best_score : num 0.825 #> ..$ best_msg : chr "[1]\ttrain-auc:0.824504" #> ..$ niter : int 2 #> ..$ evaluation_log :Classes ‘data.table’ and 'data.frame': 2 obs. of 2 variables: #> .. ..$ iter : num [1:2] 1 2 #> .. ..$ train_auc: num [1:2] 0.825 0.825 #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ call : language xgb.train(params = params, data = dtrain, nrounds = nrounds, watchlist = watchlist, verbose = verbose, print| __truncated__ ... #> ..$ params :List of 9 #> .. ..$ max_depth : num 6 #> .. ..$ learning_rate : num 0.3 #> .. ..$ min_split_loss : num 0 #> .. ..$ min_child_weight : num 1 #> .. ..$ subsample : num 1 #> .. ..$ nthread : num 12 #> .. ..$ objective : chr "binary:logistic" #> .. ..$ eval_metric : chr "auc" #> .. ..$ validate_parameters: logi TRUE #> ..$ callbacks :List of 3 #> .. ..$ cb.print.evaluation:function (env = parent.frame()) #> .. .. ..- attr(*, "call")= language cb.print.evaluation(period = print_every_n) #> .. .. ..- attr(*, "name")= chr "cb.print.evaluation" #> .. ..$ cb.evaluation.log :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.evaluation.log() #> .. .. ..- attr(*, "name")= chr "cb.evaluation.log" #> .. ..$ cb.early.stop :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, verbose = verbose) #> .. .. ..- attr(*, "name")= chr "cb.early.stop" #> ..$ feature_names : chr [1:3] "pclass2nd" "pclass3rd" "sexmale" #> ..$ nfeatures : int 3 #> ..$ importance :Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables: #> .. ..$ Feature : chr [1:3] "sexmale" "pclass3rd" "pclass2nd" #> .. ..$ Gain : num [1:3] 0.758 0.21 0.032 #> .. 
..$ Cover : num [1:3] 0.435 0.435 0.13 #> .. ..$ Frequency: num [1:3] 0.25 0.5 0.25 #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ model : tibble [1,043 × 3] (S3: tbl_df/tbl/data.frame) #> .. ..$ survived: Factor w/ 2 levels "Yes","No": 1 1 2 2 2 1 1 2 1 2 ... #> .. ..$ pclass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ... #> .. ..$ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ... #> .. ..- attr(*, "description")= chr "## Titanic\n\nThis dataset describes the survival status of individual passengers on the Titanic. The titanic d"| __truncated__ #> ..- attr(*, "class")= chr "xgb.Booster" #> $ output : chr [1:7] "[1]\ttrain-auc:0.824504 " "Will train until train_auc hasn't improved in 0 rounds." "" "[2]\ttrain-auc:0.824504 " ... #> $ check_args :function (arg, default, inp = gbt_input) #> $ extra_args_names : NULL #> $ extra_args : list() #> $ gbt_input :List of 10 #> ..$ max_depth : num 6 #> ..$ learning_rate : num 0.3 #> ..$ min_split_loss : num 0 #> ..$ nrounds : num 100 #> ..$ min_child_weight : num 1 #> ..$ subsample : num 1 #> ..$ early_stopping_rounds: num 0 #> ..$ nthread : num 12 #> ..$ objective : chr "binary:logistic" #> ..$ eval_metric : chr "auc" #> $ not_vary : chr(0) #> $ nr_obs : int 1043 #> $ df_name : chr "titanic" #> $ vars : chr [1:2] "pclass" "sex" #> $ rvar : chr "survived" #> $ evar : chr [1:2] "pclass" "sex" #> $ type : chr "classification" #> $ lev : chr "Yes" #> $ max_depth : num 6 #> $ learning_rate : num 0.3 #> $ min_split_loss : num 0 #> $ min_child_weight : num 1 #> $ subsample : num 1 #> $ nrounds : num 100 #> $ early_stopping_rounds: num 0 #> $ nthread : num 12 #> $ wts : NULL #> $ seed : chr NA #> $ data_filter : chr "" #> - attr(*, "class")= chr [1:3] "gbt" "model" "list"
gbt(titanic, "survived", c("pclass", "sex"), eval_metric = paste0("error@", 0.5 / 6)) %>% str()
#> List of 26 #> $ check : chr "" #> $ model :List of 15 #> ..$ handle :Class 'xgb.Booster.handle' <externalptr> #> ..$ raw : raw [1:8906] 43 4f 4e 46 ... #> ..$ best_iteration : num 1 #> ..$ best_ntreelimit: int 1 #> ..$ best_score : num 0.593 #> ..$ best_msg : chr "[1]\ttrain-error@0.0833333:0.592522" #> ..$ niter : int 11 #> ..$ evaluation_log :Classes ‘data.table’ and 'data.frame': 11 obs. of 2 variables: #> .. ..$ iter : num [1:11] 1 2 3 4 5 6 7 8 9 10 ... #> .. ..$ train_error@0.0833333: num [1:11] 0.593 0.593 0.593 0.593 0.593 ... #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ call : language xgb.train(params = params, data = dtrain, nrounds = nrounds, watchlist = watchlist, verbose = verbose, print| __truncated__ ... #> ..$ params :List of 9 #> .. ..$ max_depth : num 6 #> .. ..$ learning_rate : num 0.3 #> .. ..$ min_split_loss : num 0 #> .. ..$ min_child_weight : num 1 #> .. ..$ subsample : num 1 #> .. ..$ nthread : num 12 #> .. ..$ objective : chr "binary:logistic" #> .. ..$ eval_metric : chr "error@0.0833333333333333" #> .. ..$ validate_parameters: logi TRUE #> ..$ callbacks :List of 3 #> .. ..$ cb.print.evaluation:function (env = parent.frame()) #> .. .. ..- attr(*, "call")= language cb.print.evaluation(period = print_every_n) #> .. .. ..- attr(*, "name")= chr "cb.print.evaluation" #> .. ..$ cb.evaluation.log :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.evaluation.log() #> .. .. ..- attr(*, "name")= chr "cb.evaluation.log" #> .. ..$ cb.early.stop :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, verbose = verbose) #> .. .. ..- attr(*, "name")= chr "cb.early.stop" #> ..$ feature_names : chr [1:3] "pclass2nd" "pclass3rd" "sexmale" #> ..$ nfeatures : int 3 #> ..$ importance :Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables: #> .. 
..$ Feature : chr [1:3] "sexmale" "pclass3rd" "pclass2nd" #> .. ..$ Gain : num [1:3] 0.7291 0.2356 0.0353 #> .. ..$ Cover : num [1:3] 0.393 0.364 0.243 #> .. ..$ Frequency: num [1:3] 0.208 0.415 0.377 #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ model : tibble [1,043 × 3] (S3: tbl_df/tbl/data.frame) #> .. ..$ survived: Factor w/ 2 levels "Yes","No": 1 1 2 2 2 1 1 2 1 2 ... #> .. ..$ pclass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ... #> .. ..$ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ... #> .. ..- attr(*, "description")= chr "## Titanic\n\nThis dataset describes the survival status of individual passengers on the Titanic. The titanic d"| __truncated__ #> ..- attr(*, "class")= chr "xgb.Booster" #> $ output : chr [1:16] "[1]\ttrain-error@0.0833333:0.592522 " "Will train until train_error@0.0833333 hasn't improved in 10 rounds." "" "[2]\ttrain-error@0.0833333:0.592522 " ... #> $ check_args :function (arg, default, inp = gbt_input) #> $ extra_args_names : chr "eval_metric" #> $ extra_args :List of 1 #> ..$ eval_metric: chr "error@0.0833333333333333" #> $ gbt_input :List of 10 #> ..$ max_depth : num 6 #> ..$ learning_rate : num 0.3 #> ..$ min_split_loss : num 0 #> ..$ nrounds : num 100 #> ..$ min_child_weight : num 1 #> ..$ subsample : num 1 #> ..$ early_stopping_rounds: num 10 #> ..$ nthread : num 12 #> ..$ objective : chr "binary:logistic" #> ..$ eval_metric : chr "error@0.0833333333333333" #> $ not_vary : chr(0) #> $ nr_obs : int 1043 #> $ df_name : chr "titanic" #> $ vars : chr [1:2] "pclass" "sex" #> $ rvar : chr "survived" #> $ evar : chr [1:2] "pclass" "sex" #> $ type : chr "classification" #> $ lev : chr "Yes" #> $ max_depth : num 6 #> $ learning_rate : num 0.3 #> $ min_split_loss : num 0 #> $ min_child_weight : num 1 #> $ subsample : num 1 #> $ nrounds : num 100 #> $ early_stopping_rounds: num 10 #> $ nthread : num 12 #> $ wts : NULL #> $ seed : chr NA #> $ data_filter : chr "" #> - attr(*, "class")= chr 
[1:3] "gbt" "model" "list"
gbt(diamonds, "price", c("carat", "clarity"), type = "regression") %>% summary()
#> Gradient Boosted Trees (XGBoost) #> Type : Regression #> Data : diamonds #> Response variable : price #> Explanatory variables: carat, clarity #> Max depth : 6 #> Learning rate (eta) : 0.3 #> Min split loss : 0 #> Min child weight : 1 #> Sub-sample : 1 #> Nr of rounds (trees) : 100 #> Early stopping rounds: 10 #> Nr obs : 3,000 #> #> Iteration history: #> #> [02:24:23] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror. #> #> [2] train-rmse:2934.489990 #> [3] train-rmse:2209.099365 #> [4] train-rmse:1731.832275 #> [5] train-rmse:1429.542236 #> [6] train-rmse:1236.558716 #> [7] train-rmse:1112.986206 #> [8] train-rmse:1041.411133 #> [9] train-rmse:994.494568 #> ... #> [91] train-rmse:747.640930 #> [92] train-rmse:747.166992 #> [93] train-rmse:745.880615 #> [94] train-rmse:744.933716 #> [95] train-rmse:744.060425 #> [96] train-rmse:743.978699 #> [97] train-rmse:743.872742 #> [98] train-rmse:743.621277 #> [99] train-rmse:743.458862 #> [100] train-rmse:743.281799
rig_wrap <- function(preds, dtrain) { labels <- xgboost::getinfo(dtrain, "label") value <- rig(preds, labels, lev = 1) list(metric = "rig", value = value) } gbt(titanic, "survived", c("pclass", "sex"), eval_metric = rig_wrap, maximize = TRUE) %>% str()
#> List of 26 #> $ check : chr "" #> $ model :List of 15 #> ..$ handle :Class 'xgb.Booster.handle' <externalptr> #> ..$ raw : raw [1:8838] 43 4f 4e 46 ... #> ..$ best_iteration : num 1 #> ..$ best_ntreelimit: int 1 #> ..$ best_score : num 0.124 #> ..$ best_msg : chr "[1]\ttrain-rig:0.124424" #> ..$ niter : int 11 #> ..$ evaluation_log :Classes ‘data.table’ and 'data.frame': 11 obs. of 2 variables: #> .. ..$ iter : num [1:11] 1 2 3 4 5 6 7 8 9 10 ... #> .. ..$ train_rig: num [1:11] 0.124 0.124 0.124 0.124 0.124 ... #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ call : language xgb.train(params = params, data = dtrain, nrounds = nrounds, watchlist = watchlist, verbose = verbose, print| __truncated__ ... #> ..$ params :List of 8 #> .. ..$ max_depth : num 6 #> .. ..$ learning_rate : num 0.3 #> .. ..$ min_split_loss : num 0 #> .. ..$ min_child_weight : num 1 #> .. ..$ subsample : num 1 #> .. ..$ nthread : num 12 #> .. ..$ objective : chr "binary:logistic" #> .. ..$ validate_parameters: logi TRUE #> ..$ callbacks :List of 3 #> .. ..$ cb.print.evaluation:function (env = parent.frame()) #> .. .. ..- attr(*, "call")= language cb.print.evaluation(period = print_every_n) #> .. .. ..- attr(*, "name")= chr "cb.print.evaluation" #> .. ..$ cb.evaluation.log :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.evaluation.log() #> .. .. ..- attr(*, "name")= chr "cb.evaluation.log" #> .. ..$ cb.early.stop :function (env = parent.frame(), finalize = FALSE) #> .. .. ..- attr(*, "call")= language cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, verbose = verbose) #> .. .. ..- attr(*, "name")= chr "cb.early.stop" #> ..$ feature_names : chr [1:3] "pclass2nd" "pclass3rd" "sexmale" #> ..$ nfeatures : int 3 #> ..$ importance :Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables: #> .. ..$ Feature : chr [1:3] "sexmale" "pclass3rd" "pclass2nd" #> .. ..$ Gain : num [1:3] 0.7291 0.2356 0.0353 #> .. 
..$ Cover : num [1:3] 0.393 0.364 0.243 #> .. ..$ Frequency: num [1:3] 0.208 0.415 0.377 #> .. ..- attr(*, ".internal.selfref")=<externalptr> #> ..$ model : tibble [1,043 × 3] (S3: tbl_df/tbl/data.frame) #> .. ..$ survived: Factor w/ 2 levels "Yes","No": 1 1 2 2 2 1 1 2 1 2 ... #> .. ..$ pclass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ... #> .. ..$ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ... #> .. ..- attr(*, "description")= chr "## Titanic\n\nThis dataset describes the survival status of individual passengers on the Titanic. The titanic d"| __truncated__ #> ..- attr(*, "class")= chr "xgb.Booster" #> $ output : chr [1:16] "[1]\ttrain-rig:0.124424 " "Will train until train_rig hasn't improved in 10 rounds." "" "[2]\ttrain-rig:0.124424 " ... #> $ check_args :function (arg, default, inp = gbt_input) #> $ extra_args_names : chr [1:2] "eval_metric" "maximize" #> $ extra_args :List of 2 #> ..$ eval_metric:function (preds, dtrain) #> .. ..- attr(*, "srcref")= 'srcref' int [1:8] 9 13 13 1 13 1 9 13 #> .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x10791bf0> #> ..$ maximize : logi TRUE #> $ gbt_input :List of 11 #> ..$ max_depth : num 6 #> ..$ learning_rate : num 0.3 #> ..$ min_split_loss : num 0 #> ..$ nrounds : num 100 #> ..$ min_child_weight : num 1 #> ..$ subsample : num 1 #> ..$ early_stopping_rounds: num 10 #> ..$ nthread : num 12 #> ..$ objective : chr "binary:logistic" #> ..$ eval_metric :function (preds, dtrain) #> .. ..- attr(*, "srcref")= 'srcref' int [1:8] 9 13 13 1 13 1 9 13 #> .. .. 
..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x10791bf0> #> ..$ maximize : logi TRUE #> $ not_vary : chr(0) #> $ nr_obs : int 1043 #> $ df_name : chr "titanic" #> $ vars : chr [1:2] "pclass" "sex" #> $ rvar : chr "survived" #> $ evar : chr [1:2] "pclass" "sex" #> $ type : chr "classification" #> $ lev : chr "Yes" #> $ max_depth : num 6 #> $ learning_rate : num 0.3 #> $ min_split_loss : num 0 #> $ min_child_weight : num 1 #> $ subsample : num 1 #> $ nrounds : num 100 #> $ early_stopping_rounds: num 10 #> $ nthread : num 12 #> $ wts : NULL #> $ seed : chr NA #> $ data_filter : chr "" #> - attr(*, "class")= chr [1:3] "gbt" "model" "list"