Note: save and load Rstudio project (.Rproj) in directory containing the dataset
# setwd("<write/path/to/dataset/>") # if not loading Rproject file
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
data <- read_csv("countdata_large_normed.csv")
## Rows: 11204 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): id, author, born, category, sex, title_monogr, topics
## dbl (17): num_sentences, subst, subst_prop, verb, adj, kolon, punkt, komma, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
inspect the first 6 rows of the data
head(data)
## # A tibble: 6 × 24
## num_s…¹ subst subst…² verb adj kolon punkt komma spm utrop overs…³
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0817 0.203 0.0949 0.152 0.102 0.0155 0.0596 0.0442 0.00442 0 0.0177
## 2 0.0813 0.218 0.119 0.147 0.0923 0 0.0703 0.0462 0 0 0.0110
## 3 0.0622 0.237 0.0705 0.151 0.102 0.00207 0.0560 0.0353 0 0 0.00622
## 4 0.0721 0.232 0.0762 0.140 0.102 0 0.0621 0.0361 0.00200 0 0.00802
## 5 0.0764 0.186 0.0306 0.205 0.0939 0.00437 0.0655 0.0393 0.00655 0 0.00437
## 6 0.120 0.133 0.00451 0.228 0.0632 0.00226 0.115 0.0497 0 0 0.00451
## # … with 13 more variables: ukjent <dbl>, mean_sent_length <dbl>,
## # num_words <dbl>, id <chr>, section_number <dbl>, author <chr>, born <chr>,
## # category <chr>, n_authors <dbl>, publication_year <dbl>, sex <chr>,
## # title_monogr <chr>, topics <chr>, and abbreviated variable names
## # ¹num_sentences, ²subst_prop, ³overskrift
data <- data %>%
rename(noun = subst,
prop_noun = subst_prop,
colon = kolon,
period = punkt,
comma = komma,
exclamation = utrop)
names(data)
## [1] "num_sentences" "noun" "prop_noun" "verb"
## [5] "adj" "colon" "period" "comma"
## [9] "spm" "exclamation" "overskrift" "ukjent"
## [13] "mean_sent_length" "num_words" "id" "section_number"
## [17] "author" "born" "category" "n_authors"
## [21] "publication_year" "sex" "title_monogr" "topics"
table(data$category)
##
## AV SA SK
## 5205 3000 2999
names(data)
## [1] "num_sentences" "noun" "prop_noun" "verb"
## [5] "adj" "colon" "period" "comma"
## [9] "spm" "exclamation" "overskrift" "ukjent"
## [13] "mean_sent_length" "num_words" "id" "section_number"
## [17] "author" "born" "category" "n_authors"
## [21] "publication_year" "sex" "title_monogr" "topics"
input <- data %>%
dplyr::select(category, num_sentences:comma, exclamation)
names(input)
## [1] "category" "num_sentences" "noun" "prop_noun"
## [5] "verb" "adj" "colon" "period"
## [9] "comma" "exclamation"
str(input)
## tibble [11,204 × 10] (S3: tbl_df/tbl/data.frame)
## $ category : chr [1:11204] "AV" "AV" "AV" "AV" ...
## $ num_sentences: num [1:11204] 0.0817 0.0813 0.0622 0.0721 0.0764 ...
## $ noun : num [1:11204] 0.203 0.218 0.237 0.232 0.186 ...
## $ prop_noun : num [1:11204] 0.0949 0.1187 0.0705 0.0762 0.0306 ...
## $ verb : num [1:11204] 0.152 0.147 0.151 0.14 0.205 ...
## $ adj : num [1:11204] 0.1015 0.0923 0.1017 0.1022 0.0939 ...
## $ colon : num [1:11204] 0.01545 0 0.00207 0 0.00437 ...
## $ period : num [1:11204] 0.0596 0.0703 0.056 0.0621 0.0655 ...
## $ comma : num [1:11204] 0.0442 0.0462 0.0353 0.0361 0.0393 ...
## $ exclamation : num [1:11204] 0 0 0 0 0 0 0 0 0 0 ...
input$category <- factor(input$category)
ggplot(data=input, aes(x=category, y=prop_noun, fill=category))+
geom_boxplot()
ggplot(data=input, aes(x=category, y=prop_noun, fill=category))+
geom_boxplot()+
ylim(0, 0.3)
## Warning: Removed 24 rows containing non-finite values (stat_boxplot).
ggplot(data=data, aes(x=category, y=noun, fill=category))+
geom_boxplot()
facet_plot <- input %>%
tidyr::pivot_longer(!category, names_to = "variable", values_to = "value")
facet_plot
## # A tibble: 100,836 × 3
## category variable value
## <fct> <chr> <dbl>
## 1 AV num_sentences 0.0817
## 2 AV noun 0.203
## 3 AV prop_noun 0.0949
## 4 AV verb 0.152
## 5 AV adj 0.102
## 6 AV colon 0.0155
## 7 AV period 0.0596
## 8 AV comma 0.0442
## 9 AV exclamation 0
## 10 AV num_sentences 0.0813
## # … with 100,826 more rows
ggplot(data=facet_plot, aes(x=variable, y=value, fill=category))+
geom_boxplot()
ggplot(data=facet_plot, aes(x=variable, y=value, fill=category))+
geom_boxplot()+
facet_wrap(~variable, scales="free")+
theme(axis.text.x = element_blank(), #optional: remove x axis label (redundant)
axis.ticks.x = element_blank()) #optional: remove x axis tick
pairs(input[,2:ncol(input)], pch = 19, lower.panel = NULL) #pch is symbol type (maybe leave out)
if ( "GGally" %in% rownames(installed.packages()) ){
library(GGally)
input %>%
dplyr::select(-category) %>%
ggpairs(aes(alpha = 0.4))+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
}
input_bin <- input %>%
filter(category != "SK")
table(input_bin$category)
##
## AV SA SK
## 5205 3000 0
input_bin$category <- factor(input_bin$category)
table(input_bin$category)
##
## AV SA
## 5205 3000
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
## ✔ broom 1.0.2 ✔ recipes 0.1.17
## ✔ dials 0.0.9 ✔ rsample 0.0.9
## ✔ infer 0.5.4 ✔ tune 0.1.3
## ✔ modeldata 0.1.0 ✔ workflows 0.2.3
## ✔ parsnip 0.1.5 ✔ yardstick 0.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ workflows::extract_recipe() masks tune::extract_recipe()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
train_set_prop = 0.8
set.seed(561) #for reproducibility (i.e. ensures the "split" is the same for each rerun)
train_test_split <- rsample::initial_split(input_bin, prop=train_set_prop, strata=category)
train <- training(train_test_split)
test <- testing(train_test_split)
table(train$category)
##
## AV SA
## 4165 2401
table(test$category)
##
## AV SA
## 1040 599
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
#Note: within tidymodels, you get same result from
#xboost across runs
#set.seed(169) #XGBoost is "stochastic"
xgb_fit <-
boost_tree(trees=50) %>% #specify that you wanted a boosted tree model
set_engine("xgboost") %>% #specify that you want to use xgboost
set_mode("classification") %>% #specify the "objective" is classification
fit(category ~ ., data=train)
xgb_bin <- xgb_fit #saving for later
xgb_pred = predict(xgb_fit, test)
head(xgb_pred)
## # A tibble: 6 × 1
## .pred_class
## <fct>
## 1 AV
## 2 AV
## 3 AV
## 4 AV
## 5 AV
## 6 AV
xgb_pred_prob = predict(xgb_fit, test, type="prob")
head(xgb_pred_prob)
## # A tibble: 6 × 2
## .pred_AV .pred_SA
## <dbl> <dbl>
## 1 0.989 0.0110
## 2 0.998 0.00195
## 3 0.628 0.372
## 4 0.987 0.0130
## 5 0.995 0.00460
## 6 0.935 0.0650
xgb_res <- bind_cols(test$category, xgb_pred, xgb_pred_prob)
## New names:
## • `` -> `...1`
names(xgb_res) <- c("obs", "pred", "pred_AV", "pred_SA")
xgb_res$pred <- factor(xgb_res$pred) # Note: this seems to be "necessary" for R v4.0 and upwards
xgb_conf <- xgb_res %>%
conf_mat(obs, pred)
xgb_conf
## Truth
## Prediction AV SA
## AV 1017 49
## SA 23 550
xgb_acc <- sum(diag(xgb_conf$table))/sum(xgb_conf$table)
xgb_acc
## [1] 0.9560708
comp_acc <- function(conf_matrix){
accuracy <- sum(diag(conf_matrix))/sum(conf_matrix)
return(accuracy)
}
comp_acc(xgb_conf$table)
## [1] 0.9560708
comp_prec <- function(conf_matrix){
precision <- diag(conf_matrix) / rowSums(conf_matrix)
return(precision)
}
xgb_prec <- comp_prec(xgb_conf$table)
xgb_prec
## AV SA
## 0.9540338 0.9598604
comp_recall <- function(conf_matrix){
recall <- diag(conf_matrix) / colSums(conf_matrix)
return(recall)
}
xgb_rec <- comp_recall(xgb_conf$table)
xgb_rec
## AV SA
## 0.9778846 0.9181970
xgb_res %>%
accuracy(obs, pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.956
xgb_res %>%
recall(obs, pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.978
xgb_res %>%
precision(obs, pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.954
levels(xgb_res$obs)
## [1] "AV" "SA"
xgb_res %>%
recall(obs, pred, event_level = "second")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.918
xgb_res %>%
precision(obs, pred, event_level = "second")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.960
xgb_res %>%
metrics(obs, pred)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.956
## 2 kap binary 0.904
xgb_res %>%
metrics(obs, pred, pred_AV)
## # A tibble: 4 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.956
## 2 kap binary 0.904
## 3 mn_log_loss binary 0.124
## 4 roc_auc binary 0.989
ggplot(xgb_res, aes(x=obs, y=pred_SA, fill=obs))+
geom_boxplot(outlier.shape = NA)+
geom_jitter(width=0.1,alpha=0.1)+
geom_hline(yintercept = 0.5, color = "blue", linetype = "dashed", alpha=0.6)+
theme(axis.text.x = element_blank(), #ev. ekstra
axis.ticks.x = element_blank())+ #ev. ekstra
labs(fill="actual category",
x="")
ggplot(xgb_res, aes(x=obs, y=pred_AV, fill=obs))+
geom_boxplot()+
geom_jitter(width=0.1,alpha=0.2, outlier.shape = NA)+
geom_hline(yintercept = 0.5, color = "blue", linetype = "dashed", alpha=0.6)+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())+
labs(fill="actual category",
x="")
## Warning: Ignoring unknown parameters: outlier.shape
knitr::include_graphics("overfitting_runs.svg")
#https://commons.wikimedia.org/wiki/File:2d-epochs-overfitting.svg
threshold_new = 0.8
xgb_pred_new <- if_else(xgb_pred_prob$.pred_SA >= threshold_new, "SA", "AV")
xgb_res_new <- bind_cols(test$category, xgb_pred_new, xgb_pred_prob)
## New names:
## • `` -> `...1`
## • `` -> `...2`
names(xgb_res_new) <- c("obs", "pred", "pred_AV", "pred_SA")
xgb_res_new$pred <- factor(xgb_res_new$pred) # Note: this seems to be "necessary" for R v4.0 and upwards
xgb_conf_new <- xgb_res_new %>%
conf_mat(obs, pred)
xgb_conf_new
## Truth
## Prediction AV SA
## AV 1034 93
## SA 6 506
comp_acc(xgb_conf_new$table)
## [1] 0.9395973
comp_prec(xgb_conf_new$table)
## AV SA
## 0.9174800 0.9882812
comp_recall(xgb_conf_new$table)
## AV SA
## 0.9942308 0.8447412
roc_curve(xgb_res, obs, pred_AV) %>%
autoplot()
roc_curve(xgb_res, obs, pred_AV) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_line()+ #in this case, same result with geom_path (order in dataframe, NOT x-axis)
geom_abline(lty = 3) +
coord_equal() +
theme_bw()
auc_xgb_bin <- roc_auc(xgb_res, obs, pred_AV)
auc_xgb_bin
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.989
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
vip(xgb_fit)
xgb_features <- vip::vi(xgb_bin)
xgb_features
## # A tibble: 9 × 2
## Variable Importance
## <chr> <dbl>
## 1 colon 0.366
## 2 exclamation 0.233
## 3 prop_noun 0.216
## 4 num_sentences 0.0461
## 5 noun 0.0337
## 6 adj 0.0304
## 7 period 0.0299
## 8 verb 0.0243
## 9 comma 0.0203
xgb_feature_bin <-
ggplot(data=xgb_features, aes(x=reorder(Variable, -Importance),
y=Importance))+
geom_col(aes(fill=Variable))+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
labs(x="",
title="Binary classification\n('AV' vs 'SA')")
xgb_feature_bin
train_set_prop = 0.8
set.seed(231) #for reproducibility
train_test_multi <- rsample::initial_split(input, prop=train_set_prop, strata=category)
train <- training(train_test_multi)
test <- testing(train_test_multi)
#set.seed(1245) #XGBoost is "stochastic"
table(train$category)
##
## AV SA SK
## 4165 2401 2400
table(test$category)
##
## AV SA SK
## 1040 599 599
xgb_multi <-
boost_tree(trees=50) %>%
set_engine("xgboost") %>%
set_mode("classification") %>%
fit(category ~ ., data=train)
xgb_pred = predict(xgb_multi, test)
xgb_pred_prob = predict(xgb_multi, test, type="prob")
xgb_pred
## # A tibble: 2,238 × 1
## .pred_class
## <fct>
## 1 AV
## 2 AV
## 3 AV
## 4 AV
## 5 AV
## 6 AV
## 7 AV
## 8 AV
## 9 AV
## 10 AV
## # … with 2,228 more rows
xgb_pred_prob
## # A tibble: 2,238 × 3
## .pred_AV .pred_SA .pred_SK
## <dbl> <dbl> <dbl>
## 1 0.960 0.0397 0.000584
## 2 0.986 0.0130 0.000864
## 3 0.850 0.150 0.000581
## 4 1.00 0.000211 0.000190
## 5 0.909 0.0839 0.00755
## 6 0.911 0.0807 0.00781
## 7 0.948 0.0493 0.00309
## 8 0.998 0.00198 0.0000314
## 9 1.00 0.000162 0.000275
## 10 0.980 0.0196 0.000340
## # … with 2,228 more rows
xgb_res <- bind_cols(test$category, xgb_pred, xgb_pred_prob)
## New names:
## • `` -> `...1`
names(xgb_res) <- c("obs", "pred", "pred_AV", "pred_SA", "pred_SK")
xgb_res$pred <- factor(xgb_res$pred) # Note: this seems to be "necessary" for R v4.0 and upwards
xgb_conf <- xgb_res %>%
conf_mat(obs, pred)
xgb_conf
## Truth
## Prediction AV SA SK
## AV 997 43 14
## SA 24 503 34
## SK 19 53 551
xgb_res %>%
accuracy(obs, pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.916
multi_prec <- comp_prec(xgb_conf$table)
multi_prec
## AV SA SK
## 0.9459203 0.8966132 0.8844302
multi_recall <- comp_recall(xgb_conf$table)
multi_recall
## AV SA SK
## 0.9586538 0.8397329 0.9198664
macroPrecision <- mean(multi_prec)
macroPrecision
## [1] 0.9089879
precision(xgb_res, obs, pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision macro 0.909
macroRecall <- mean(multi_recall)
macroRecall
## [1] 0.9060844
recall(xgb_res, obs, pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall macro 0.906
xgb_res %>%
roc_auc(obs, pred_AV:pred_SK)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc hand_till 0.981
xgb_res %>%
metrics(obs, pred, pred_AV:pred_SK)
## # A tibble: 4 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.916
## 2 kap multiclass 0.869
## 3 mn_log_loss multiclass 0.223
## 4 roc_auc hand_till 0.981
test_plot <- xgb_res %>%
dplyr::select(obs,pred_AV, pred_SA, pred_SK)
test_plot_long <- test_plot %>%
tidyr::pivot_longer(!obs, names_to = "variable", values_to = "value")
head(test_plot_long)
## # A tibble: 6 × 3
## obs variable value
## <fct> <chr> <dbl>
## 1 AV pred_AV 0.960
## 2 AV pred_SA 0.0397
## 3 AV pred_SK 0.000584
## 4 AV pred_AV 0.986
## 5 AV pred_SA 0.0130
## 6 AV pred_SK 0.000864
ggplot(data=test_plot_long, aes(x=obs, y=value, fill=obs))+
geom_boxplot(outlier.shape = NA)+
geom_jitter(width=0.1,alpha=0.05)+
geom_hline(yintercept = 1/3, color = "blue", linetype = "dashed", alpha=0.6)+
facet_wrap(~variable)+
labs(y="Predicted probability",
fill="Actual category")
vip(xgb_multi)
xgb_features <- vip::vi(xgb_multi)
xgb_feature_multi <-
ggplot(data=xgb_features, aes(x=reorder(Variable, -Importance),
y=Importance))+
geom_col(aes(fill=Variable))+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
labs(x="",
title="Multiclass")
xgb_feature_multi
library("patchwork", quietly = TRUE)
features <- xgb_feature_bin + xgb_feature_multi & theme(legend.position = "bottom")
features + plot_layout(guides = "collect") +
plot_annotation(title = "Feature importance from XGB")
min_n = 2 #choose an integer from range 1-10 (default = 1)
tree_depth = 9 #choose integer from range 1-10 (default = 6)
learn_rate = 0.15 # choose decimal value between 0.01 and 0.5 (default = 0.3)
knitr::include_graphics("CV_image.svg")
#https://upload.wikimedia.org/wikipedia/commons/1/1c/K-fold_cross_validation_EN.jpg
xgb_mod <- boost_tree(trees=50) %>%
set_engine("xgboost") %>%
set_mode("classification")
wflow_xgb <-
workflow() %>%
add_model(xgb_mod) %>%
add_formula(category ~ .)
set.seed(156)
folds <- vfold_cv(input_bin, v = 10, strata = category) #strata for stratified cv
xgb_cv <- fit_resamples(wflow_xgb, folds)
xgb_cv
## # Resampling results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [7384/821]> Fold01 <tibble [2 × 4]> <tibble [0 × 1]>
## 2 <split [7384/821]> Fold02 <tibble [2 × 4]> <tibble [0 × 1]>
## 3 <split [7384/821]> Fold03 <tibble [2 × 4]> <tibble [0 × 1]>
## 4 <split [7384/821]> Fold04 <tibble [2 × 4]> <tibble [0 × 1]>
## 5 <split [7384/821]> Fold05 <tibble [2 × 4]> <tibble [0 × 1]>
## 6 <split [7385/820]> Fold06 <tibble [2 × 4]> <tibble [0 × 1]>
## 7 <split [7385/820]> Fold07 <tibble [2 × 4]> <tibble [0 × 1]>
## 8 <split [7385/820]> Fold08 <tibble [2 × 4]> <tibble [0 × 1]>
## 9 <split [7385/820]> Fold09 <tibble [2 × 4]> <tibble [0 × 1]>
## 10 <split [7385/820]> Fold10 <tibble [2 × 4]> <tibble [0 × 1]>
collect_metrics(xgb_cv)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 accuracy binary 0.960 10 0.00199 Preprocessor1_Model1
## 2 roc_auc binary 0.990 10 0.000949 Preprocessor1_Model1
# the same as
# xgb_cv %>%
# collect_metrics()
collect_metrics(xgb_cv, summarize=FALSE) %>%
filter(.metric =="accuracy")
## # A tibble: 10 × 5
## id .metric .estimator .estimate .config
## <chr> <chr> <chr> <dbl> <fct>
## 1 Fold01 accuracy binary 0.950 Preprocessor1_Model1
## 2 Fold02 accuracy binary 0.960 Preprocessor1_Model1
## 3 Fold03 accuracy binary 0.954 Preprocessor1_Model1
## 4 Fold04 accuracy binary 0.957 Preprocessor1_Model1
## 5 Fold05 accuracy binary 0.962 Preprocessor1_Model1
## 6 Fold06 accuracy binary 0.972 Preprocessor1_Model1
## 7 Fold07 accuracy binary 0.956 Preprocessor1_Model1
## 8 Fold08 accuracy binary 0.963 Preprocessor1_Model1
## 9 Fold09 accuracy binary 0.961 Preprocessor1_Model1
## 10 Fold10 accuracy binary 0.966 Preprocessor1_Model1
#the same as
# xgb_cv %>%
# collect_metrics(summarize=FALSE) %>%
# filter(.metric =="accuracy")
xgb_cv %>%
collect_metrics(summarize=FALSE) %>%
filter(.metric =="accuracy") %>%
summarise(max = max(.estimate), min = min(.estimate))
## # A tibble: 1 × 2
## max min
## <dbl> <dbl>
## 1 0.972 0.950
xgb_cv_res <-
xgb_cv %>%
collect_metrics(summarize=FALSE) %>%
filter(.metric =="accuracy") %>%
summarise(max = max(.estimate), min = min(.estimate),
mean = mean(.estimate), sd = sd(.estimate))
xgb_cv_res$mean + xgb_cv_res$sd
## [1] 0.9664288
xgb_cv_res$mean - xgb_cv_res$sd
## [1] 0.953868
my_metrics <- metric_set(accuracy,roc_auc, precision,
recall)
xgb_cv_rev <- fit_resamples(wflow_xgb, folds,
metrics = my_metrics)
xgb_cv_rev %>%
collect_metrics()
## # A tibble: 4 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 accuracy binary 0.960 10 0.00199 Preprocessor1_Model1
## 2 precision binary 0.956 10 0.00257 Preprocessor1_Model1
## 3 recall binary 0.982 10 0.00178 Preprocessor1_Model1
## 4 roc_auc binary 0.990 10 0.000949 Preprocessor1_Model1
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)
set.seed(156)
xgb_cv_rev <- fit_resamples(wflow_xgb, folds,
metrics = my_metrics, control = keep_pred)
collect_predictions(xgb_cv_rev)
## # A tibble: 8,205 × 7
## id .pred_class .row .pred_AV .pred_SA category .config
## <chr> <fct> <int> <dbl> <dbl> <fct> <fct>
## 1 Fold01 AV 8 0.991 0.00885 AV Preprocessor1_Model1
## 2 Fold01 AV 16 0.969 0.0309 AV Preprocessor1_Model1
## 3 Fold01 AV 30 0.979 0.0213 AV Preprocessor1_Model1
## 4 Fold01 AV 31 0.999 0.000631 AV Preprocessor1_Model1
## 5 Fold01 AV 41 1.00 0.000283 AV Preprocessor1_Model1
## 6 Fold01 AV 47 0.998 0.00226 AV Preprocessor1_Model1
## 7 Fold01 AV 68 0.998 0.00230 AV Preprocessor1_Model1
## 8 Fold01 AV 90 0.998 0.00204 AV Preprocessor1_Model1
## 9 Fold01 AV 96 0.974 0.0263 AV Preprocessor1_Model1
## 10 Fold01 AV 101 0.967 0.0331 AV Preprocessor1_Model1
## # … with 8,195 more rows
xgb_cv_conf <- xgb_cv_rev %>%
collect_predictions() %>%
conf_mat(category, .pred_class)
xgb_cv_conf
## Truth
## Prediction AV SA
## AV 5113 235
## SA 92 2765
comp_prec(xgb_cv_conf$table)
## AV SA
## 0.9560583 0.9677984
comp_recall(xgb_cv_conf$table)
## AV SA
## 0.9823247 0.9216667
xgb_cv_rev %>%
collect_metrics(summarize=FALSE) %>%
filter(.metric=="accuracy") %>%
summarise(mean = mean(.estimate)) %>%
pull()
## [1] 0.9601484
comp_acc(xgb_cv_conf$table)
## [1] 0.9601463
rep_folds <- vfold_cv(input_bin, v = 10, repeats = 50,
strata = category) #strata for stratified cv
xgb_mod_cv <- boost_tree(trees=5) %>%
set_engine("xgboost") %>%
set_mode("classification")
wflow_xgb_cv <-
workflow() %>%
add_model(xgb_mod_cv) %>%
add_formula(category ~ .)
# If doParallel is installed, and your computer has
# several cores, this will make use of parallelization
# (splitting the task into bits to be run simultaneously)
if ("doParallel" %in% rownames(installed.packages())){
library("doParallel")
all_cores <- parallel::detectCores(logical = FALSE)
cl <- makePSOCKcluster(all_cores) #number of cores?
registerDoParallel(cl) #only version > 1.0.16..
}
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
set.seed(777)
xgb_cv_rep <- fit_resamples(wflow_xgb_cv, rep_folds,
metrics = my_metrics, control = keep_pred)
if (exists("cl")){
stopCluster(cl)
}
xgb_cv_rep
## # Resampling results
## # 10-fold cross-validation repeated 50 times using stratification
## # A tibble: 500 × 6
## splits id id2 .metrics .notes .predi…¹
## <list> <chr> <chr> <list> <list> <list>
## 1 <split [7384/821]> Repeat01 Fold01 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## 2 <split [7384/821]> Repeat01 Fold02 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## 3 <split [7384/821]> Repeat01 Fold03 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## 4 <split [7384/821]> Repeat01 Fold04 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## 5 <split [7384/821]> Repeat01 Fold05 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## 6 <split [7385/820]> Repeat01 Fold06 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## 7 <split [7385/820]> Repeat01 Fold07 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## 8 <split [7385/820]> Repeat01 Fold08 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## 9 <split [7385/820]> Repeat01 Fold09 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## 10 <split [7385/820]> Repeat01 Fold10 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## # … with 490 more rows, and abbreviated variable name ¹.predictions
xgb_cv_rep %>%
collect_metrics()
## # A tibble: 4 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 accuracy binary 0.923 500 0.000418 Preprocessor1_Model1
## 2 precision binary 0.916 500 0.000502 Preprocessor1_Model1
## 3 recall binary 0.967 500 0.000394 Preprocessor1_Model1
## 4 roc_auc binary 0.969 500 0.000287 Preprocessor1_Model1
# xgb_cv_rep %>%
# collect_metrics(summarize=FALSE) %>%
# group_by(.metric) %>%
# summarise(max = max(.estimate), min = min(.estimate),
# mean = mean(.estimate), sd = sd(.estimate))
nrow(xgb_cv_rep)
## [1] 500
xgb_cv_rep %>%
collect_metrics(summarize = FALSE) %>%
filter(.metric == "accuracy") %>%
ggplot(aes(x=.estimate))+
geom_histogram()+
labs(x="accuracy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
xgb_cv_rep %>%
collect_predictions() %>%
group_by(id) %>% #id stores nth repeat
accuracy(category, .pred_class) %>%
ggplot(aes(x=.estimate))+
geom_histogram()+
labs(x="accuracy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
xgb_cv_rep %>%
collect_predictions() %>%
group_by(id) %>%
accuracy(category, .pred_class) %>%
summarise(min =min(.estimate), max = max(.estimate))
## # A tibble: 1 × 2
## min max
## <dbl> <dbl>
## 1 0.919 0.926
xgb_cv_rep %>%
collect_predictions() %>%
group_by(id) %>%
accuracy(category, .pred_class) %>%
summarise(sd = sd(.estimate))
## # A tibble: 1 × 1
## sd
## <dbl>
## 1 0.00140
rep_folds <- vfold_cv(input, v = 10, repeats = 25,
strata = category) #strata for stratified cv
if ("doParallel" %in% rownames(installed.packages())){
all_cores <- parallel::detectCores(logical = FALSE)
cl <- makePSOCKcluster(all_cores) #number of cores?
registerDoParallel(cl) #only version > 1.0.16..
}
set.seed(456)
xgb_cv_rep <- fit_resamples(wflow_xgb_cv, rep_folds,
metrics = my_metrics,
control = keep_pred)
if (exists("cl")){
stopCluster(cl)
}
Exercise:
1) Get mean accuracy, auc, precision, recall
2)Plot a histogram for all the accuracy estimates
xgb_cv_rep %>%
collect_metrics()
## # A tibble: 4 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 accuracy multiclass 0.874 250 0.000591 Preprocessor1_Model1
## 2 precision macro 0.867 250 0.000672 Preprocessor1_Model1
## 3 recall macro 0.857 250 0.000673 Preprocessor1_Model1
## 4 roc_auc hand_till 0.962 250 0.000296 Preprocessor1_Model1
xgb_cv_rep %>%
collect_predictions() %>%
group_by(id) %>%
accuracy(category, .pred_class) %>%
ggplot(aes(x=.estimate))+
geom_histogram()+
labs(x="accuracy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
size = nrow(input) * 0.05
set.seed(126)
small <- dplyr::sample_n(input, size)
nrow(small)
## [1] 560
rep_folds <- vfold_cv(small, v = 10, repeats = 50,
strata = category) #strata for stratified cv
if ("doParallel" %in% rownames(installed.packages())){
all_cores <- parallel::detectCores(logical = FALSE)
cl <- makePSOCKcluster(all_cores) #number of cores?
registerDoParallel(cl) #only version > 1.0.16..
}
set.seed(123)
cv_result_small <- fit_resamples(wflow_xgb_cv, rep_folds,
metrics = my_metrics,
control = keep_pred)
if (exists("cl")){
stopCluster(cl)
}
cv_result_small %>%
collect_metrics()
## # A tibble: 4 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 accuracy multiclass 0.783 500 0.00231 Preprocessor1_Model1
## 2 precision macro 0.774 500 0.00257 Preprocessor1_Model1
## 3 recall macro 0.767 500 0.00251 Preprocessor1_Model1
## 4 roc_auc hand_till 0.912 500 0.00144 Preprocessor1_Model1
# cv_result_small %>%
# collect_metrics(summarize=FALSE) %>%
# group_by(.metric) %>%
# summarise(mean = mean(.estimate))
Here you see a large variability in accuracy
cv_result_small %>%
collect_metrics(summarize = FALSE) %>%
filter(.metric == "accuracy") %>%
ggplot(aes(x=.estimate))+
geom_histogram()+
labs(x="accuracy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cv_result_small %>%
collect_predictions() %>%
group_by(id) %>%
accuracy(category, .pred_class) %>%
ggplot(aes(x=.estimate))+
geom_histogram()+
labs(x="accuracy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# if need to split into training and test set again
# train_set_prop = 0.8
# set.seed(231) #for reproducibility
#
# train_test_multi <- rsample::initial_split(input, prop=train_set_prop, strata=category)
# train <- training(train_test_multi)
# test <- testing(train_test_multi)
norm_recipe <- recipe(category ~ ., data = train) %>%
step_normalize(all_predictors())
library("discrim")
##
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
##
## smoothness
lda_mod <-
discrim_linear() %>%
set_engine("MASS")
lda_wfl <- workflow() %>%
add_recipe(norm_recipe) %>%
add_model(lda_mod)
lda_fit_norm <- fit(lda_wfl, data = train)
lda_norm_pred = predict(lda_fit_norm, test)
# head(lda_norm_pred)
train_norm <- norm_recipe %>%
prep(training = train) %>%
bake(train)
test_norm <- norm_recipe %>%
prep(training = train) %>%
bake(test)
head(train_norm)
## # A tibble: 6 × 10
## num_s…¹ noun prop_…² verb adj colon period comma excla…³ categ…⁴
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 0.750 -0.0787 0.749 -0.411 3.65e-1 2.31 0.128 -0.191 -0.429 AV
## 2 0.730 0.250 1.29 -0.599 -7.48e-2 -0.725 0.850 -0.0806 -0.429 AV
## 3 -0.317 0.680 0.194 -0.443 3.70e-1 -0.317 -0.113 -0.680 -0.429 AV
## 4 0.461 -0.476 -0.717 1.55 3.14e-4 0.134 0.525 -0.458 -0.429 AV
## 5 2.83 -1.67 -1.31 2.40 -1.46e+0 -0.281 3.86 0.113 -0.429 AV
## 6 0.163 0.382 -0.652 1.84 -6.93e-1 -0.725 -0.0893 0.0220 -0.429 AV
## # … with abbreviated variable names ¹num_sentences, ²prop_noun, ³exclamation,
## # ⁴category
head(test_norm)
## # A tibble: 6 × 10
## num_sent…¹ noun prop_…² verb adj colon period comma excla…³ categ…⁴
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 0.226 0.588 0.321 -0.857 0.396 -0.725 0.298 -0.636 -0.429 AV
## 2 0.978 0.0850 0.298 -1.44 1.05 0.119 1.03 -0.968 -0.429 AV
## 3 -0.505 0.554 0.224 -0.485 -0.217 1.85 -0.364 0.0172 -0.429 AV
## 4 0.880 -0.580 1.58 -0.225 0.763 -0.725 1.05 -0.249 -0.429 AV
## 5 0.536 -0.0984 0.611 -0.703 -0.555 -0.288 0.304 1.05 1.03 AV
## 6 1.54 -0.808 1.04 -0.414 0.548 0.196 1.00 1.12 1.11 AV
## # … with abbreviated variable names ¹num_sentences, ²prop_noun, ³exclamation,
## # ⁴category
# other_recipe <- recipe(category ~ ., data = input) %>%
# step_normalize(all_predictors())
other_recipe <- recipe(category ~ ., data = input) %>%
step_normalize(all_predictors())
train_other <- other_recipe %>%
prep(training = input) %>%
bake(train)
test_other <- other_recipe %>%
prep(training = input) %>%
bake(test)
head(train_other)
## # A tibble: 6 × 10
## num_s…¹ noun prop_…² verb adj colon period comma excla…³ categ…⁴
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 0.750 -0.0761 0.747 -0.412 0.358 2.33 0.129 -0.194 -0.434 AV
## 2 0.731 0.254 1.29 -0.601 -0.0810 -0.736 0.849 -0.0837 -0.434 AV
## 3 -0.316 0.686 0.191 -0.445 0.363 -0.324 -0.112 -0.683 -0.434 AV
## 4 0.462 -0.475 -0.719 1.56 -0.00594 0.131 0.525 -0.461 -0.434 AV
## 5 2.83 -1.67 -1.31 2.40 -1.46 -0.288 3.86 0.110 -0.434 AV
## 6 0.163 0.387 -0.654 1.84 -0.697 -0.736 -0.0889 0.0189 -0.434 AV
## # … with abbreviated variable names ¹num_sentences, ²prop_noun, ³exclamation,
## # ⁴category
head(test_other)
## # A tibble: 6 × 10
## num_sent…¹ noun prop_…² verb adj colon period comma excla…³ categ…⁴
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 0.227 0.594 0.319 -0.860 0.389 -0.736 0.298 -0.639 -0.434 AV
## 2 0.978 0.0883 0.295 -1.45 1.04 0.116 1.03 -0.971 -0.434 AV
## 3 -0.504 0.559 0.222 -0.486 -0.223 1.86 -0.363 0.0142 -0.434 AV
## 4 0.880 -0.579 1.58 -0.226 0.756 -0.736 1.05 -0.252 -0.434 AV
## 5 0.536 -0.0959 0.609 -0.705 -0.560 -0.295 0.305 1.05 1.04 AV
## 6 1.54 -0.809 1.04 -0.416 0.540 0.194 1.00 1.11 1.12 AV
## # … with abbreviated variable names ¹num_sentences, ²prop_noun, ³exclamation,
## # ⁴category
# norm_recipe <- recipe(category ~ ., data = train) %>%
# step_normalize(all_predictors())
# lda_mod <-
# discrim_linear() %>%
# set_engine("MASS")
xgb_mod <- boost_tree(trees=50) %>%
set_engine("xgboost") %>%
set_mode("classification")
#library(ranger)
rand_mod <- rand_forest(mode="classification") %>%
set_engine("randomForest")
library(workflowsets)
##
## Attaching package: 'workflowsets'
## The following object is masked from 'package:tune':
##
## extract_recipe
all_models_set <-
workflow_set(
preproc = list(norm = norm_recipe),
models = list(lda = lda_mod, rand_for = rand_mod,
xgb = xgb_mod)
)
if ("doParallel" %in% rownames(installed.packages())){
unregister <- function() {
env <- foreach:::.foreachGlobals
rm(list=ls(name=env), pos=env)
}
unregister()
}
folds <- vfold_cv(data, v = 10, strata = category)
all_models <-
all_models_set %>%
workflow_map(
fn = "fit_resamples",
resamples = folds,
verbose = TRUE,
seed = 1234
)
## i 1 of 3 resampling: norm_lda
## ✔ 1 of 3 resampling: norm_lda (7.3s)
## i 2 of 3 resampling: norm_rand_for
## ✔ 2 of 3 resampling: norm_rand_for (1m 16.8s)
## i 3 of 3 resampling: norm_xgb
## ✔ 3 of 3 resampling: norm_xgb (30.9s)
all_models %>%
collect_metrics()
## # A tibble: 6 × 9
## wflow_id .config preproc model .metric .esti…¹ mean n std_err
## <chr> <fct> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 norm_lda Preprocessor1… recipe disc… accura… multic… 0.759 10 2.62e-3
## 2 norm_lda Preprocessor1… recipe disc… roc_auc hand_t… 0.901 10 1.97e-3
## 3 norm_rand_for Preprocessor1… recipe rand… accura… multic… 0.904 10 1.95e-3
## 4 norm_rand_for Preprocessor1… recipe rand… roc_auc hand_t… 0.977 10 6.83e-4
## 5 norm_xgb Preprocessor1… recipe boos… accura… multic… 0.917 10 2.31e-3
## 6 norm_xgb Preprocessor1… recipe boos… roc_auc hand_t… 0.982 10 6.82e-4
## # … with abbreviated variable name ¹.estimator
autoplot(all_models)
all_auc_wide <- all_models %>%
collect_metrics(summarize = FALSE) %>%
filter(.metric == "roc_auc") %>%
dplyr::select(wflow_id, .estimate, id) %>%
pivot_wider(id_cols = "id", names_from = "wflow_id", values_from = ".estimate")
all_auc_wide
## # A tibble: 10 × 4
## id norm_lda norm_rand_for norm_xgb
## <chr> <dbl> <dbl> <dbl>
## 1 Fold01 0.912 0.978 0.979
## 2 Fold02 0.904 0.977 0.981
## 3 Fold03 0.897 0.979 0.985
## 4 Fold04 0.903 0.982 0.983
## 5 Fold05 0.892 0.975 0.979
## 6 Fold06 0.900 0.975 0.982
## 7 Fold07 0.895 0.975 0.979
## 8 Fold08 0.895 0.979 0.982
## 9 Fold09 0.904 0.979 0.984
## 10 Fold10 0.908 0.977 0.980
all_auc_wide %>%
with( t.test(norm_lda, norm_rand_for, paired = TRUE))
##
## Paired t-test
##
## data: norm_lda and norm_rand_for
## t = -40.62, df = 9, p-value = 1.654e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.08078254 -0.07225957
## sample estimates:
## mean of the differences
## -0.07652106
all_auc_wide %>%
with( t.test(norm_lda, norm_xgb, paired = TRUE))
##
## Paired t-test
##
## data: norm_lda and norm_xgb
## t = -37.705, df = 9, p-value = 3.221e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.08537097 -0.07570695
## sample estimates:
## mean of the differences
## -0.08053896
all_auc_wide %>%
with( t.test(norm_rand_for, norm_xgb, paired = TRUE))
##
## Paired t-test
##
## data: norm_rand_for and norm_xgb
## t = -7.2272, df = 9, p-value = 4.936e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.005275520 -0.002760282
## sample estimates:
## mean of the differences
## -0.004017901
xgb_spec <- boost_tree(
trees = 10,
tree_depth = tune(),
loss_reduction = tune()
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_wf <- workflow() %>%
add_formula(category ~ .) %>%
add_model(xgb_spec)
rep_folds <- vfold_cv(input_bin, v = 10, strata = category, repeats = 5)
tree_grid <- grid_regular(tree_depth(), loss_reduction(), levels=2)
tree_grid
## # A tibble: 4 × 2
## tree_depth loss_reduction
## <int> <dbl>
## 1 1 0.0000000001
## 2 15 0.0000000001
## 3 1 31.6
## 4 15 31.6
if ("doParallel" %in% rownames(installed.packages())){
all_cores <- parallel::detectCores(logical = FALSE)
cl <- makePSOCKcluster(all_cores) #number of cores?
registerDoParallel(cl) #only version > 1.0.16..
}
set.seed(234)
xgb_tune <- tune_grid(
xgb_wf,
resamples = rep_folds,
grid = tree_grid
)
if (exists("cl")){
stopCluster(cl)
}
collect_metrics(xgb_tune)
## # A tibble: 8 × 8
## tree_depth loss_reduction .metric .estimator mean n std_err .config
## <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 1 0.0000000001 accuracy binary 0.813 50 0.00207 Preprocess…
## 2 1 0.0000000001 roc_auc binary 0.884 50 0.00158 Preprocess…
## 3 15 0.0000000001 accuracy binary 0.949 50 0.00101 Preprocess…
## 4 15 0.0000000001 roc_auc binary 0.983 50 0.000529 Preprocess…
## 5 1 31.6 accuracy binary 0.813 50 0.00207 Preprocess…
## 6 1 31.6 roc_auc binary 0.884 50 0.00158 Preprocess…
## 7 15 31.6 accuracy binary 0.937 50 0.000979 Preprocess…
## 8 15 31.6 roc_auc binary 0.979 50 0.000630 Preprocess…
show_best(xgb_tune, "roc_auc")
## # A tibble: 4 × 8
## tree_depth loss_reduction .metric .estimator mean n std_err .config
## <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 15 0.0000000001 roc_auc binary 0.983 50 0.000529 Preprocesso…
## 2 15 31.6 roc_auc binary 0.979 50 0.000630 Preprocesso…
## 3 1 0.0000000001 roc_auc binary 0.884 50 0.00158 Preprocesso…
## 4 1 31.6 roc_auc binary 0.884 50 0.00158 Preprocesso…
best_model <- select_best(xgb_tune, "roc_auc")
best_model
## # A tibble: 1 × 3
## tree_depth loss_reduction .config
## <int> <dbl> <fct>
## 1 15 0.0000000001 Preprocessor1_Model2
final_xgb <- finalize_workflow(
xgb_wf,
best_model
)
final_xgb
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Formula
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## category ~ .
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## trees = 10
## tree_depth = 15
## loss_reduction = 1e-10
##
## Computational engine: xgboost
# alternatively, can use base R expand.grid function (same paramater setup)
manual_grid <- tidyr::crossing(tree_depth = c(5, 10),
loss_reduction = c(2, 4))
manual_grid
## # A tibble: 4 × 2
## tree_depth loss_reduction
## <dbl> <dbl>
## 1 5 2
## 2 5 4
## 3 10 2
## 4 10 4
if ("doParallel" %in% rownames(installed.packages())){
all_cores <- parallel::detectCores(logical = FALSE)
cl <- makePSOCKcluster(all_cores) #number of cores?
registerDoParallel(cl) #only version > 1.0.16..
}
set.seed(155)
xgb_tune_manual <- tune_grid(
xgb_wf,
resamples = rep_folds,
grid = manual_grid
)
if (exists("cl")){
stopCluster(cl)
}
show_best(xgb_tune_manual, "roc_auc")
## # A tibble: 4 × 8
## tree_depth loss_reduction .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 10 4 roc_auc binary 0.984 50 0.000423 Preprocesso…
## 2 10 2 roc_auc binary 0.984 50 0.000466 Preprocesso…
## 3 5 2 roc_auc binary 0.980 50 0.000574 Preprocesso…
## 4 5 4 roc_auc binary 0.979 50 0.000535 Preprocesso…
xgb_tune_manual %>%
collect_metrics() %>%
mutate(tree_depth = factor(tree_depth)) %>%
ggplot(aes(loss_reduction, mean, color = tree_depth)) +
geom_line(size = 1.5, alpha = 0.6) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2)
mod_cont_recipe <-
recipe(num_sentences ~ ., data=input) %>%
step_dummy(all_nominal())
xgb_mod_cont <- boost_tree(trees=50) %>%
set_engine("xgboost") %>%
set_mode("regression")
wflow_xgb_cont <-
workflow() %>%
add_model(xgb_mod_cont) %>%
#add_formula(num_sentences ~ .)
add_recipe(mod_cont_recipe)
# input_cont <- input %>%
# dplyr::select(-c(category))
if ("doParallel" %in% rownames(installed.packages())){
all_cores <- parallel::detectCores(logical = FALSE)
cl <- makePSOCKcluster(all_cores)
registerDoParallel(cl)
}
set.seed(156)
folds_cont <- vfold_cv(input, v = 10)
cont_metrics <- metric_set(rmse, rsq, mae)
keep_pred <- control_resamples(save_pred = TRUE)
xgb_cv_cont <- fit_resamples(wflow_xgb_cont, folds_cont,
metrics = cont_metrics,
control = keep_pred)
if (exists("cl")){
stopCluster(cl)
}
collect_predictions(xgb_cv_cont)
## # A tibble: 11,204 × 5
## id .pred .row num_sentences .config
## <chr> <dbl> <int> <dbl> <fct>
## 1 Fold01 0.0830 8 0.0858 Preprocessor1_Model1
## 2 Fold01 0.0524 22 0.0539 Preprocessor1_Model1
## 3 Fold01 0.0539 27 0.0376 Preprocessor1_Model1
## 4 Fold01 0.0976 41 0.0905 Preprocessor1_Model1
## 5 Fold01 0.0569 43 0.0523 Preprocessor1_Model1
## 6 Fold01 0.0555 46 0.0492 Preprocessor1_Model1
## 7 Fold01 0.0826 51 0.0766 Preprocessor1_Model1
## 8 Fold01 0.0728 70 0.0823 Preprocessor1_Model1
## 9 Fold01 0.0874 86 0.0886 Preprocessor1_Model1
## 10 Fold01 0.0713 142 0.0735 Preprocessor1_Model1
## # … with 11,194 more rows
collect_predictions(xgb_cv_cont) %>%
ggplot(aes(x=num_sentences, y=.pred))+
geom_point(alpha=0.5)+
geom_smooth(method="lm", se = TRUE)+
coord_fixed()
## `geom_smooth()` using formula 'y ~ x'
xgb_cv_cont %>%
collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 mae standard 0.00472 10 0.0000493 Preprocessor1_Model1
## 2 rmse standard 0.00711 10 0.000139 Preprocessor1_Model1
## 3 rsq standard 0.848 10 0.00461 Preprocessor1_Model1
cor(collect_predictions(xgb_cv_cont)$num_sentences,
collect_predictions(xgb_cv_cont)$.pred)
## [1] 0.920497