1 Exploratory Data Analysis (EDA)

Note: save and load Rstudio project (.Rproj) in directory containing the dataset

# setwd("<write/path/to/dataset/>") # if not loading Rproject file

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
data <- read_csv("countdata_large_normed.csv")
## Rows: 11204 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): id, author, born, category, sex, title_monogr, topics
## dbl (17): num_sentences, subst, subst_prop, verb, adj, kolon, punkt, komma, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

inspect the first 6 rows of the data

head(data)
## # A tibble: 6 × 24
##   num_s…¹ subst subst…²  verb    adj   kolon  punkt  komma     spm utrop overs…³
##     <dbl> <dbl>   <dbl> <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl> <dbl>   <dbl>
## 1  0.0817 0.203 0.0949  0.152 0.102  0.0155  0.0596 0.0442 0.00442     0 0.0177 
## 2  0.0813 0.218 0.119   0.147 0.0923 0       0.0703 0.0462 0           0 0.0110 
## 3  0.0622 0.237 0.0705  0.151 0.102  0.00207 0.0560 0.0353 0           0 0.00622
## 4  0.0721 0.232 0.0762  0.140 0.102  0       0.0621 0.0361 0.00200     0 0.00802
## 5  0.0764 0.186 0.0306  0.205 0.0939 0.00437 0.0655 0.0393 0.00655     0 0.00437
## 6  0.120  0.133 0.00451 0.228 0.0632 0.00226 0.115  0.0497 0           0 0.00451
## # … with 13 more variables: ukjent <dbl>, mean_sent_length <dbl>,
## #   num_words <dbl>, id <chr>, section_number <dbl>, author <chr>, born <chr>,
## #   category <chr>, n_authors <dbl>, publication_year <dbl>, sex <chr>,
## #   title_monogr <chr>, topics <chr>, and abbreviated variable names
## #   ¹​num_sentences, ²​subst_prop, ³​overskrift

1.0.1 Description of dataset

data <- data %>% 
  rename(noun = subst,
         prop_noun = subst_prop,
         colon = kolon,
         period = punkt,
         comma = komma,
         exclamation = utrop)
names(data)
##  [1] "num_sentences"    "noun"             "prop_noun"        "verb"            
##  [5] "adj"              "colon"            "period"           "comma"           
##  [9] "spm"              "exclamation"      "overskrift"       "ukjent"          
## [13] "mean_sent_length" "num_words"        "id"               "section_number"  
## [17] "author"           "born"             "category"         "n_authors"       
## [21] "publication_year" "sex"              "title_monogr"     "topics"
table(data$category)
## 
##   AV   SA   SK 
## 5205 3000 2999
names(data)
##  [1] "num_sentences"    "noun"             "prop_noun"        "verb"            
##  [5] "adj"              "colon"            "period"           "comma"           
##  [9] "spm"              "exclamation"      "overskrift"       "ukjent"          
## [13] "mean_sent_length" "num_words"        "id"               "section_number"  
## [17] "author"           "born"             "category"         "n_authors"       
## [21] "publication_year" "sex"              "title_monogr"     "topics"
input <- data %>% 
  dplyr::select(category, num_sentences:comma, exclamation) 

names(input)
##  [1] "category"      "num_sentences" "noun"          "prop_noun"    
##  [5] "verb"          "adj"           "colon"         "period"       
##  [9] "comma"         "exclamation"
str(input)
## tibble [11,204 × 10] (S3: tbl_df/tbl/data.frame)
##  $ category     : chr [1:11204] "AV" "AV" "AV" "AV" ...
##  $ num_sentences: num [1:11204] 0.0817 0.0813 0.0622 0.0721 0.0764 ...
##  $ noun         : num [1:11204] 0.203 0.218 0.237 0.232 0.186 ...
##  $ prop_noun    : num [1:11204] 0.0949 0.1187 0.0705 0.0762 0.0306 ...
##  $ verb         : num [1:11204] 0.152 0.147 0.151 0.14 0.205 ...
##  $ adj          : num [1:11204] 0.1015 0.0923 0.1017 0.1022 0.0939 ...
##  $ colon        : num [1:11204] 0.01545 0 0.00207 0 0.00437 ...
##  $ period       : num [1:11204] 0.0596 0.0703 0.056 0.0621 0.0655 ...
##  $ comma        : num [1:11204] 0.0442 0.0462 0.0353 0.0361 0.0393 ...
##  $ exclamation  : num [1:11204] 0 0 0 0 0 0 0 0 0 0 ...
input$category <- factor(input$category)
ggplot(data=input, aes(x=category, y=prop_noun, fill=category))+
    geom_boxplot()

ggplot(data=input, aes(x=category, y=prop_noun, fill=category))+
    geom_boxplot()+
    ylim(0, 0.3)
## Warning: Removed 24 rows containing non-finite values (stat_boxplot).

1.0.1.1 Exercise

ggplot(data=data, aes(x=category, y=noun, fill=category))+
    geom_boxplot()

facet_plot <- input %>% 
  tidyr::pivot_longer(!category, names_to = "variable", values_to = "value")

facet_plot
## # A tibble: 100,836 × 3
##    category variable       value
##    <fct>    <chr>          <dbl>
##  1 AV       num_sentences 0.0817
##  2 AV       noun          0.203 
##  3 AV       prop_noun     0.0949
##  4 AV       verb          0.152 
##  5 AV       adj           0.102 
##  6 AV       colon         0.0155
##  7 AV       period        0.0596
##  8 AV       comma         0.0442
##  9 AV       exclamation   0     
## 10 AV       num_sentences 0.0813
## # … with 100,826 more rows
ggplot(data=facet_plot, aes(x=variable, y=value, fill=category))+
    geom_boxplot()

ggplot(data=facet_plot, aes(x=variable, y=value, fill=category))+
    geom_boxplot()+
    facet_wrap(~variable, scales="free")+
    theme(axis.text.x = element_blank(), #optional: remove x axis label (redundant) 
          axis.ticks.x = element_blank()) #optional: remove x axis tick

pairs(input[,2:ncol(input)], pch = 19, lower.panel = NULL) #pch is symbol type (maybe leave out)

if ( "GGally" %in% rownames(installed.packages()) ){
  
  library(GGally)
  
  input %>%
  dplyr::select(-category) %>%
  ggpairs(aes(alpha = 0.4))+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
     
}

2 Binary Classification

2.1 Data preparation

input_bin <- input %>% 
  filter(category != "SK")

table(input_bin$category)
## 
##   AV   SA   SK 
## 5205 3000    0
input_bin$category <- factor(input_bin$category)
table(input_bin$category)
## 
##   AV   SA 
## 5205 3000

2.2 Training and test sets

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
## ✔ broom     1.0.2      ✔ recipes   0.1.17
## ✔ dials     0.0.9      ✔ rsample   0.0.9 
## ✔ infer     0.5.4      ✔ tune      0.1.3 
## ✔ modeldata 0.1.0      ✔ workflows 0.2.3 
## ✔ parsnip   0.1.5      ✔ yardstick 0.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard()           masks purrr::discard()
## ✖ workflows::extract_recipe() masks tune::extract_recipe()
## ✖ dplyr::filter()             masks stats::filter()
## ✖ recipes::fixed()            masks stringr::fixed()
## ✖ dplyr::lag()                masks stats::lag()
## ✖ yardstick::spec()           masks readr::spec()
## ✖ recipes::step()             masks stats::step()
train_set_prop = 0.8
set.seed(561) #for reproducibility (i.e. ensures the "split" is the same for each rerun)

train_test_split <- rsample::initial_split(input_bin, prop=train_set_prop, strata=category)
train <- training(train_test_split)
test <- testing(train_test_split)
table(train$category)
## 
##   AV   SA 
## 4165 2401
table(test$category)
## 
##   AV   SA 
## 1040  599

2.3 XGBoost: Training and evaluating our machine learning model

library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
#Note: within tidymodels, you get same result from
#xboost across runs
#set.seed(169) #XGBoost is "stochastic"

xgb_fit <-
  boost_tree(trees=50) %>% #specify that you wanted a boosted tree model
  set_engine("xgboost") %>% #specify that you want to use xgboost
  set_mode("classification") %>% #specify the "objective" is classification
  fit(category ~ ., data=train)


xgb_bin <- xgb_fit #saving for later
xgb_pred = predict(xgb_fit, test)
head(xgb_pred)
## # A tibble: 6 × 1
##   .pred_class
##   <fct>      
## 1 AV         
## 2 AV         
## 3 AV         
## 4 AV         
## 5 AV         
## 6 AV
xgb_pred_prob = predict(xgb_fit, test, type="prob")
head(xgb_pred_prob)
## # A tibble: 6 × 2
##   .pred_AV .pred_SA
##      <dbl>    <dbl>
## 1    0.989  0.0110 
## 2    0.998  0.00195
## 3    0.628  0.372  
## 4    0.987  0.0130 
## 5    0.995  0.00460
## 6    0.935  0.0650
xgb_res <- bind_cols(test$category, xgb_pred, xgb_pred_prob)
## New names:
## • `` -> `...1`
names(xgb_res) <- c("obs", "pred", "pred_AV", "pred_SA")

xgb_res$pred <- factor(xgb_res$pred) # Note: this seems to be "necessary" for R v4.0 and upwards

xgb_conf <- xgb_res %>% 
  conf_mat(obs, pred)

xgb_conf
##           Truth
## Prediction   AV   SA
##         AV 1017   49
##         SA   23  550

2.3.1 Accuracy

xgb_acc <- sum(diag(xgb_conf$table))/sum(xgb_conf$table)
xgb_acc
## [1] 0.9560708
comp_acc <- function(conf_matrix){
  accuracy <- sum(diag(conf_matrix))/sum(conf_matrix)
  return(accuracy)
}
comp_acc(xgb_conf$table)
## [1] 0.9560708

2.3.2 Precision

comp_prec <- function(conf_matrix){
  precision <- diag(conf_matrix) / rowSums(conf_matrix)
  return(precision)
}
xgb_prec <- comp_prec(xgb_conf$table)
xgb_prec
##        AV        SA 
## 0.9540338 0.9598604

2.3.3 Recall

comp_recall <- function(conf_matrix){
  recall <- diag(conf_matrix) / colSums(conf_matrix) 
  return(recall)
}
xgb_rec <- comp_recall(xgb_conf$table)
xgb_rec
##        AV        SA 
## 0.9778846 0.9181970
xgb_res %>% 
  accuracy(obs, pred) 
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.956
xgb_res %>% 
  recall(obs, pred) 
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 recall  binary         0.978
xgb_res %>% 
  precision(obs, pred) 
## # A tibble: 1 × 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.954
levels(xgb_res$obs)
## [1] "AV" "SA"
xgb_res %>% 
  recall(obs, pred, event_level = "second")
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 recall  binary         0.918
xgb_res %>% 
  precision(obs, pred, event_level = "second") 
## # A tibble: 1 × 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision binary         0.960
xgb_res %>% 
  metrics(obs, pred)
## # A tibble: 2 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.956
## 2 kap      binary         0.904
xgb_res %>% 
  metrics(obs, pred, pred_AV)
## # A tibble: 4 × 3
##   .metric     .estimator .estimate
##   <chr>       <chr>          <dbl>
## 1 accuracy    binary         0.956
## 2 kap         binary         0.904
## 3 mn_log_loss binary         0.124
## 4 roc_auc     binary         0.989

2.3.4 Plots

ggplot(xgb_res, aes(x=obs, y=pred_SA, fill=obs))+
    geom_boxplot(outlier.shape = NA)+
    geom_jitter(width=0.1,alpha=0.1)+
    geom_hline(yintercept = 0.5, color = "blue", linetype = "dashed", alpha=0.6)+
    theme(axis.text.x = element_blank(), #ev. ekstra
          axis.ticks.x = element_blank())+ #ev. ekstra
    labs(fill="actual category",
        x="")    

ggplot(xgb_res, aes(x=obs, y=pred_AV, fill=obs))+
    geom_boxplot()+
    geom_jitter(width=0.1,alpha=0.2, outlier.shape = NA)+
    geom_hline(yintercept = 0.5, color = "blue", linetype = "dashed", alpha=0.6)+
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_blank())+
    labs(fill="actual category",
        x="")    
## Warning: Ignoring unknown parameters: outlier.shape

knitr::include_graphics("overfitting_runs.svg")

#https://commons.wikimedia.org/wiki/File:2d-epochs-overfitting.svg

2.3.5 Thresholds

threshold_new = 0.8
xgb_pred_new <- if_else(xgb_pred_prob$.pred_SA >= threshold_new, "SA", "AV")
xgb_res_new <- bind_cols(test$category, xgb_pred_new, xgb_pred_prob)
## New names:
## • `` -> `...1`
## • `` -> `...2`
names(xgb_res_new) <- c("obs", "pred", "pred_AV", "pred_SA")

xgb_res_new$pred <- factor(xgb_res_new$pred) # Note: this seems to be "necessary" for R v4.0 and upwards

xgb_conf_new <- xgb_res_new %>% 
  conf_mat(obs, pred)

xgb_conf_new
##           Truth
## Prediction   AV   SA
##         AV 1034   93
##         SA    6  506
comp_acc(xgb_conf_new$table) 
## [1] 0.9395973
comp_prec(xgb_conf_new$table)
##        AV        SA 
## 0.9174800 0.9882812
comp_recall(xgb_conf_new$table)
##        AV        SA 
## 0.9942308 0.8447412

2.4 ROC and AUC

2.4.1 ROC

roc_curve(xgb_res, obs, pred_AV) %>% 
    autoplot()

roc_curve(xgb_res, obs, pred_AV) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line()+ #in this case, same result with geom_path (order in dataframe, NOT x-axis)
  geom_abline(lty = 3) +
  coord_equal() +
  theme_bw()

2.4.2 AUC

auc_xgb_bin <- roc_auc(xgb_res, obs, pred_AV)
auc_xgb_bin
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.989

2.5 Feature importance (binary classification)

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
vip(xgb_fit)

xgb_features <- vip::vi(xgb_bin)
xgb_features
## # A tibble: 9 × 2
##   Variable      Importance
##   <chr>              <dbl>
## 1 colon             0.366 
## 2 exclamation       0.233 
## 3 prop_noun         0.216 
## 4 num_sentences     0.0461
## 5 noun              0.0337
## 6 adj               0.0304
## 7 period            0.0299
## 8 verb              0.0243
## 9 comma             0.0203
xgb_feature_bin <-
ggplot(data=xgb_features, aes(x=reorder(Variable, -Importance),
                              y=Importance))+
    geom_col(aes(fill=Variable))+
    theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())+
    labs(x="",
        title="Binary classification\n('AV' vs 'SA')")

xgb_feature_bin

3 Multiclass classification

train_set_prop = 0.8
set.seed(231) #for reproducibility

train_test_multi <- rsample::initial_split(input, prop=train_set_prop, strata=category)
train <- training(train_test_multi)
test <- testing(train_test_multi)

#set.seed(1245) #XGBoost is "stochastic"
table(train$category)
## 
##   AV   SA   SK 
## 4165 2401 2400
table(test$category)
## 
##   AV   SA   SK 
## 1040  599  599
xgb_multi <-
  boost_tree(trees=50) %>%
  set_engine("xgboost") %>%
  set_mode("classification") %>%
  fit(category ~ ., data=train)

xgb_pred = predict(xgb_multi, test)
xgb_pred_prob = predict(xgb_multi, test, type="prob")
xgb_pred
## # A tibble: 2,238 × 1
##    .pred_class
##    <fct>      
##  1 AV         
##  2 AV         
##  3 AV         
##  4 AV         
##  5 AV         
##  6 AV         
##  7 AV         
##  8 AV         
##  9 AV         
## 10 AV         
## # … with 2,228 more rows
xgb_pred_prob
## # A tibble: 2,238 × 3
##    .pred_AV .pred_SA  .pred_SK
##       <dbl>    <dbl>     <dbl>
##  1    0.960 0.0397   0.000584 
##  2    0.986 0.0130   0.000864 
##  3    0.850 0.150    0.000581 
##  4    1.00  0.000211 0.000190 
##  5    0.909 0.0839   0.00755  
##  6    0.911 0.0807   0.00781  
##  7    0.948 0.0493   0.00309  
##  8    0.998 0.00198  0.0000314
##  9    1.00  0.000162 0.000275 
## 10    0.980 0.0196   0.000340 
## # … with 2,228 more rows
xgb_res <- bind_cols(test$category, xgb_pred, xgb_pred_prob)
## New names:
## • `` -> `...1`
names(xgb_res) <- c("obs", "pred", "pred_AV", "pred_SA", "pred_SK")

xgb_res$pred <- factor(xgb_res$pred) # Note: this seems to be "necessary" for R v4.0 and upwards

xgb_conf <- xgb_res %>% 
  conf_mat(obs, pred)

xgb_conf
##           Truth
## Prediction  AV  SA  SK
##         AV 997  43  14
##         SA  24 503  34
##         SK  19  53 551

3.0.1 Accuracy

xgb_res %>% 
    accuracy(obs, pred)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.916

3.0.2 Precision

multi_prec <- comp_prec(xgb_conf$table)
multi_prec
##        AV        SA        SK 
## 0.9459203 0.8966132 0.8844302

3.0.3 Recall

multi_recall <- comp_recall(xgb_conf$table)
multi_recall 
##        AV        SA        SK 
## 0.9586538 0.8397329 0.9198664

3.0.4 Macro-averaged metrics

macroPrecision <- mean(multi_prec)
macroPrecision
## [1] 0.9089879
precision(xgb_res, obs, pred)
## # A tibble: 1 × 3
##   .metric   .estimator .estimate
##   <chr>     <chr>          <dbl>
## 1 precision macro          0.909
macroRecall <- mean(multi_recall)
macroRecall
## [1] 0.9060844
recall(xgb_res, obs, pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 recall  macro          0.906

3.0.5 Multiclass AUC

xgb_res %>% 
    roc_auc(obs, pred_AV:pred_SK) 
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc hand_till      0.981
xgb_res %>% 
  metrics(obs, pred, pred_AV:pred_SK)
## # A tibble: 4 × 3
##   .metric     .estimator .estimate
##   <chr>       <chr>          <dbl>
## 1 accuracy    multiclass     0.916
## 2 kap         multiclass     0.869
## 3 mn_log_loss multiclass     0.223
## 4 roc_auc     hand_till      0.981

3.0.6 Optional: Plotting predicted probability with actual category

test_plot <- xgb_res %>% 
  dplyr::select(obs,pred_AV, pred_SA, pred_SK)

test_plot_long <- test_plot %>% 
  tidyr::pivot_longer(!obs, names_to = "variable", values_to = "value")

head(test_plot_long)
## # A tibble: 6 × 3
##   obs   variable    value
##   <fct> <chr>       <dbl>
## 1 AV    pred_AV  0.960   
## 2 AV    pred_SA  0.0397  
## 3 AV    pred_SK  0.000584
## 4 AV    pred_AV  0.986   
## 5 AV    pred_SA  0.0130  
## 6 AV    pred_SK  0.000864
ggplot(data=test_plot_long, aes(x=obs, y=value, fill=obs))+
    geom_boxplot(outlier.shape = NA)+
    geom_jitter(width=0.1,alpha=0.05)+
    geom_hline(yintercept = 1/3, color = "blue", linetype = "dashed", alpha=0.6)+
    facet_wrap(~variable)+
    labs(y="Predicted probability",
        fill="Actual category")

3.1 Feature importance (multiclass classification)

vip(xgb_multi)

xgb_features <- vip::vi(xgb_multi)

xgb_feature_multi <-
ggplot(data=xgb_features, aes(x=reorder(Variable, -Importance),
                              y=Importance))+
    geom_col(aes(fill=Variable))+
    theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())+
    labs(x="",
        title="Multiclass")


xgb_feature_multi

library("patchwork", quietly = TRUE)

features <- xgb_feature_bin + xgb_feature_multi & theme(legend.position = "bottom")


features + plot_layout(guides = "collect") +
    plot_annotation(title = "Feature importance from XGB")

3.2 Exercise (optional)

min_n = 2 #choose an integer from range 1-10 (default = 1)
tree_depth = 9  #choose integer from range 1-10 (default = 6)
learn_rate =  0.15 # choose decimal value between 0.01 and 0.5 (default = 0.3)
  1. Calculate model evaluation metrics (accuracy, macro-precision, macro-recall, auc)

4 K-fold cross-validation

4.1 Cross-validation for binary classification

knitr::include_graphics("CV_image.svg")

#https://upload.wikimedia.org/wikipedia/commons/1/1c/K-fold_cross_validation_EN.jpg
xgb_mod <- boost_tree(trees=50) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

wflow_xgb <-
  workflow() %>% 
  add_model(xgb_mod) %>% 
  add_formula(category ~ .)
set.seed(156) 
folds <- vfold_cv(input_bin, v = 10, strata = category) #strata for stratified cv
xgb_cv <- fit_resamples(wflow_xgb, folds)
xgb_cv
## # Resampling results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 4
##    splits             id     .metrics         .notes          
##    <list>             <chr>  <list>           <list>          
##  1 <split [7384/821]> Fold01 <tibble [2 × 4]> <tibble [0 × 1]>
##  2 <split [7384/821]> Fold02 <tibble [2 × 4]> <tibble [0 × 1]>
##  3 <split [7384/821]> Fold03 <tibble [2 × 4]> <tibble [0 × 1]>
##  4 <split [7384/821]> Fold04 <tibble [2 × 4]> <tibble [0 × 1]>
##  5 <split [7384/821]> Fold05 <tibble [2 × 4]> <tibble [0 × 1]>
##  6 <split [7385/820]> Fold06 <tibble [2 × 4]> <tibble [0 × 1]>
##  7 <split [7385/820]> Fold07 <tibble [2 × 4]> <tibble [0 × 1]>
##  8 <split [7385/820]> Fold08 <tibble [2 × 4]> <tibble [0 × 1]>
##  9 <split [7385/820]> Fold09 <tibble [2 × 4]> <tibble [0 × 1]>
## 10 <split [7385/820]> Fold10 <tibble [2 × 4]> <tibble [0 × 1]>

4.1.1 Accuracy

collect_metrics(xgb_cv)
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n  std_err .config             
##   <chr>    <chr>      <dbl> <int>    <dbl> <fct>               
## 1 accuracy binary     0.960    10 0.00199  Preprocessor1_Model1
## 2 roc_auc  binary     0.990    10 0.000949 Preprocessor1_Model1
# the same as
# xgb_cv %>% 
#   collect_metrics()
collect_metrics(xgb_cv, summarize=FALSE) %>% 
  filter(.metric =="accuracy")
## # A tibble: 10 × 5
##    id     .metric  .estimator .estimate .config             
##    <chr>  <chr>    <chr>          <dbl> <fct>               
##  1 Fold01 accuracy binary         0.950 Preprocessor1_Model1
##  2 Fold02 accuracy binary         0.960 Preprocessor1_Model1
##  3 Fold03 accuracy binary         0.954 Preprocessor1_Model1
##  4 Fold04 accuracy binary         0.957 Preprocessor1_Model1
##  5 Fold05 accuracy binary         0.962 Preprocessor1_Model1
##  6 Fold06 accuracy binary         0.972 Preprocessor1_Model1
##  7 Fold07 accuracy binary         0.956 Preprocessor1_Model1
##  8 Fold08 accuracy binary         0.963 Preprocessor1_Model1
##  9 Fold09 accuracy binary         0.961 Preprocessor1_Model1
## 10 Fold10 accuracy binary         0.966 Preprocessor1_Model1
#the same as
# xgb_cv %>% 
#   collect_metrics(summarize=FALSE) %>% 
#   filter(.metric =="accuracy")
xgb_cv %>% 
  collect_metrics(summarize=FALSE) %>% 
  filter(.metric =="accuracy") %>% 
  summarise(max = max(.estimate), min = min(.estimate))
## # A tibble: 1 × 2
##     max   min
##   <dbl> <dbl>
## 1 0.972 0.950
xgb_cv_res <- 
  xgb_cv %>% 
  collect_metrics(summarize=FALSE) %>% 
  filter(.metric =="accuracy") %>% 
  summarise(max = max(.estimate), min = min(.estimate), 
            mean = mean(.estimate), sd = sd(.estimate))
xgb_cv_res$mean + xgb_cv_res$sd
## [1] 0.9664288
xgb_cv_res$mean - xgb_cv_res$sd
## [1] 0.953868

4.1.2 Other metrics

my_metrics <- metric_set(accuracy,roc_auc, precision,
                         recall)

xgb_cv_rev <- fit_resamples(wflow_xgb, folds,
                            metrics = my_metrics)



xgb_cv_rev %>% 
  collect_metrics()
## # A tibble: 4 × 6
##   .metric   .estimator  mean     n  std_err .config             
##   <chr>     <chr>      <dbl> <int>    <dbl> <fct>               
## 1 accuracy  binary     0.960    10 0.00199  Preprocessor1_Model1
## 2 precision binary     0.956    10 0.00257  Preprocessor1_Model1
## 3 recall    binary     0.982    10 0.00178  Preprocessor1_Model1
## 4 roc_auc   binary     0.990    10 0.000949 Preprocessor1_Model1

4.1.3 Out-of-sample predictions

keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)

set.seed(156) 
xgb_cv_rev <- fit_resamples(wflow_xgb, folds,
                            metrics = my_metrics, control = keep_pred)
collect_predictions(xgb_cv_rev)
## # A tibble: 8,205 × 7
##    id     .pred_class  .row .pred_AV .pred_SA category .config             
##    <chr>  <fct>       <int>    <dbl>    <dbl> <fct>    <fct>               
##  1 Fold01 AV              8    0.991 0.00885  AV       Preprocessor1_Model1
##  2 Fold01 AV             16    0.969 0.0309   AV       Preprocessor1_Model1
##  3 Fold01 AV             30    0.979 0.0213   AV       Preprocessor1_Model1
##  4 Fold01 AV             31    0.999 0.000631 AV       Preprocessor1_Model1
##  5 Fold01 AV             41    1.00  0.000283 AV       Preprocessor1_Model1
##  6 Fold01 AV             47    0.998 0.00226  AV       Preprocessor1_Model1
##  7 Fold01 AV             68    0.998 0.00230  AV       Preprocessor1_Model1
##  8 Fold01 AV             90    0.998 0.00204  AV       Preprocessor1_Model1
##  9 Fold01 AV             96    0.974 0.0263   AV       Preprocessor1_Model1
## 10 Fold01 AV            101    0.967 0.0331   AV       Preprocessor1_Model1
## # … with 8,195 more rows
xgb_cv_conf <- xgb_cv_rev %>% 
  collect_predictions() %>% 
  conf_mat(category, .pred_class)

xgb_cv_conf
##           Truth
## Prediction   AV   SA
##         AV 5113  235
##         SA   92 2765
comp_prec(xgb_cv_conf$table)
##        AV        SA 
## 0.9560583 0.9677984
comp_recall(xgb_cv_conf$table)
##        AV        SA 
## 0.9823247 0.9216667
xgb_cv_rev %>% 
  collect_metrics(summarize=FALSE) %>% 
  filter(.metric=="accuracy") %>% 
  summarise(mean = mean(.estimate)) %>% 
  pull()
## [1] 0.9601484
comp_acc(xgb_cv_conf$table)
## [1] 0.9601463

4.2 Repeated cross-validation

rep_folds <- vfold_cv(input_bin, v = 10, repeats = 50,
                      strata = category) #strata for stratified cv
xgb_mod_cv <- boost_tree(trees=5) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

wflow_xgb_cv <-
  workflow() %>% 
  add_model(xgb_mod_cv) %>% 
  add_formula(category ~ .)
# If doParallel is installed, and your computer has
# several cores, this will make use of parallelization
# (splitting the task into bits to be run simultaneously)

if ("doParallel" %in% rownames(installed.packages())){
  library("doParallel")
  all_cores <- parallel::detectCores(logical = FALSE)
  cl <- makePSOCKcluster(all_cores) #number of cores?
  registerDoParallel(cl) #only version > 1.0.16..
}
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
set.seed(777)
xgb_cv_rep <- fit_resamples(wflow_xgb_cv, rep_folds, 
                            metrics = my_metrics, control = keep_pred)

if (exists("cl")){
  stopCluster(cl) 
}
xgb_cv_rep
## # Resampling results
## # 10-fold cross-validation repeated 50 times using stratification 
## # A tibble: 500 × 6
##    splits             id       id2    .metrics         .notes           .predi…¹
##    <list>             <chr>    <chr>  <list>           <list>           <list>  
##  1 <split [7384/821]> Repeat01 Fold01 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
##  2 <split [7384/821]> Repeat01 Fold02 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
##  3 <split [7384/821]> Repeat01 Fold03 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
##  4 <split [7384/821]> Repeat01 Fold04 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
##  5 <split [7384/821]> Repeat01 Fold05 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
##  6 <split [7385/820]> Repeat01 Fold06 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
##  7 <split [7385/820]> Repeat01 Fold07 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
##  8 <split [7385/820]> Repeat01 Fold08 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
##  9 <split [7385/820]> Repeat01 Fold09 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## 10 <split [7385/820]> Repeat01 Fold10 <tibble [4 × 4]> <tibble [0 × 1]> <tibble>
## # … with 490 more rows, and abbreviated variable name ¹​.predictions
xgb_cv_rep %>%
  collect_metrics()
## # A tibble: 4 × 6
##   .metric   .estimator  mean     n  std_err .config             
##   <chr>     <chr>      <dbl> <int>    <dbl> <fct>               
## 1 accuracy  binary     0.923   500 0.000418 Preprocessor1_Model1
## 2 precision binary     0.916   500 0.000502 Preprocessor1_Model1
## 3 recall    binary     0.967   500 0.000394 Preprocessor1_Model1
## 4 roc_auc   binary     0.969   500 0.000287 Preprocessor1_Model1
# xgb_cv_rep %>%
#   collect_metrics(summarize=FALSE) %>%
#   group_by(.metric) %>%
#   summarise(max = max(.estimate), min = min(.estimate),
#             mean = mean(.estimate), sd = sd(.estimate))
nrow(xgb_cv_rep)
## [1] 500
xgb_cv_rep %>%
  collect_metrics(summarize = FALSE) %>% 
  filter(.metric == "accuracy") %>% 
  ggplot(aes(x=.estimate))+
  geom_histogram()+
  labs(x="accuracy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

xgb_cv_rep %>% 
    collect_predictions() %>% 
    group_by(id) %>%  #id stores nth repeat
    accuracy(category, .pred_class) %>% 
    ggplot(aes(x=.estimate))+
      geom_histogram()+
      labs(x="accuracy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


xgb_cv_rep %>% 
    collect_predictions() %>% 
    group_by(id) %>% 
    accuracy(category, .pred_class) %>% 
    summarise(min =min(.estimate), max = max(.estimate))
## # A tibble: 1 × 2
##     min   max
##   <dbl> <dbl>
## 1 0.919 0.926
xgb_cv_rep %>% 
    collect_predictions() %>% 
    group_by(id) %>% 
    accuracy(category, .pred_class) %>% 
    summarise(sd = sd(.estimate))
## # A tibble: 1 × 1
##        sd
##     <dbl>
## 1 0.00140

4.3 Multiclass classification with cross-validation

rep_folds <- vfold_cv(input, v = 10, repeats = 25,
                      strata = category) #strata for stratified cv


if ("doParallel" %in% rownames(installed.packages())){
  all_cores <- parallel::detectCores(logical = FALSE)
  cl <- makePSOCKcluster(all_cores) #number of cores?
  registerDoParallel(cl) #only version > 1.0.16..
}

set.seed(456)
xgb_cv_rep <- fit_resamples(wflow_xgb_cv, rep_folds, 
                            metrics = my_metrics,
                            control = keep_pred)


if (exists("cl")){
  stopCluster(cl) 
}

Exercise:
1) Get mean accuracy, auc, precision, recall
2)Plot a histogram for all the accuracy estimates

xgb_cv_rep %>% 
  collect_metrics()
## # A tibble: 4 × 6
##   .metric   .estimator  mean     n  std_err .config             
##   <chr>     <chr>      <dbl> <int>    <dbl> <fct>               
## 1 accuracy  multiclass 0.874   250 0.000591 Preprocessor1_Model1
## 2 precision macro      0.867   250 0.000672 Preprocessor1_Model1
## 3 recall    macro      0.857   250 0.000673 Preprocessor1_Model1
## 4 roc_auc   hand_till  0.962   250 0.000296 Preprocessor1_Model1
xgb_cv_rep %>% 
  collect_predictions() %>% 
  group_by(id) %>% 
  accuracy(category, .pred_class) %>% 
  ggplot(aes(x=.estimate))+
  geom_histogram()+
  labs(x="accuracy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4.3.1 Impact of sample size

size = nrow(input) * 0.05
set.seed(126)
small <- dplyr::sample_n(input, size)
nrow(small)
## [1] 560
rep_folds <- vfold_cv(small, v = 10, repeats = 50,
                      strata = category) #strata for stratified cv



if ("doParallel" %in% rownames(installed.packages())){
  all_cores <- parallel::detectCores(logical = FALSE)
  cl <- makePSOCKcluster(all_cores) #number of cores?
  registerDoParallel(cl) #only version > 1.0.16..
}

set.seed(123)
cv_result_small <- fit_resamples(wflow_xgb_cv, rep_folds, 
                            metrics = my_metrics,
                            control = keep_pred)

if (exists("cl")){
  stopCluster(cl) 
}
cv_result_small %>% 
  collect_metrics()
## # A tibble: 4 × 6
##   .metric   .estimator  mean     n std_err .config             
##   <chr>     <chr>      <dbl> <int>   <dbl> <fct>               
## 1 accuracy  multiclass 0.783   500 0.00231 Preprocessor1_Model1
## 2 precision macro      0.774   500 0.00257 Preprocessor1_Model1
## 3 recall    macro      0.767   500 0.00251 Preprocessor1_Model1
## 4 roc_auc   hand_till  0.912   500 0.00144 Preprocessor1_Model1
# cv_result_small %>% 
#   collect_metrics(summarize=FALSE) %>% 
#   group_by(.metric) %>% 
#   summarise(mean = mean(.estimate))

Here you see a large variability in accuracy

cv_result_small %>%
  collect_metrics(summarize = FALSE) %>% 
  filter(.metric == "accuracy") %>% 
  ggplot(aes(x=.estimate))+
  geom_histogram()+
  labs(x="accuracy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cv_result_small %>% 
  collect_predictions() %>% 
  group_by(id) %>% 
  accuracy(category, .pred_class) %>% 
  ggplot(aes(x=.estimate))+
  geom_histogram()+
    labs(x="accuracy")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5 Preprocessing data with “recipe”

# if need to split into training and test set again
# train_set_prop = 0.8
# set.seed(231) #for reproducibility
# 
# train_test_multi <- rsample::initial_split(input, prop=train_set_prop, strata=category)
# train <- training(train_test_multi)
# test <- testing(train_test_multi)
norm_recipe <- recipe(category ~ ., data = train) %>%
  step_normalize(all_predictors())
library("discrim")
## 
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
## 
##     smoothness
lda_mod <-
  discrim_linear() %>%
  set_engine("MASS")

lda_wfl <- workflow() %>% 
  add_recipe(norm_recipe) %>% 
  add_model(lda_mod)
lda_fit_norm <- fit(lda_wfl, data = train)

lda_norm_pred = predict(lda_fit_norm, test)
# head(lda_norm_pred)
train_norm <- norm_recipe %>% 
  prep(training = train) %>% 
  bake(train)

test_norm  <- norm_recipe %>% 
  prep(training = train) %>% 
  bake(test)
head(train_norm)
## # A tibble: 6 × 10
##   num_s…¹    noun prop_…²   verb      adj  colon  period   comma excla…³ categ…⁴
##     <dbl>   <dbl>   <dbl>  <dbl>    <dbl>  <dbl>   <dbl>   <dbl>   <dbl> <fct>  
## 1   0.750 -0.0787   0.749 -0.411  3.65e-1  2.31   0.128  -0.191   -0.429 AV     
## 2   0.730  0.250    1.29  -0.599 -7.48e-2 -0.725  0.850  -0.0806  -0.429 AV     
## 3  -0.317  0.680    0.194 -0.443  3.70e-1 -0.317 -0.113  -0.680   -0.429 AV     
## 4   0.461 -0.476   -0.717  1.55   3.14e-4  0.134  0.525  -0.458   -0.429 AV     
## 5   2.83  -1.67    -1.31   2.40  -1.46e+0 -0.281  3.86    0.113   -0.429 AV     
## 6   0.163  0.382   -0.652  1.84  -6.93e-1 -0.725 -0.0893  0.0220  -0.429 AV     
## # … with abbreviated variable names ¹​num_sentences, ²​prop_noun, ³​exclamation,
## #   ⁴​category
head(test_norm)
## # A tibble: 6 × 10
##   num_sent…¹    noun prop_…²   verb    adj  colon period   comma excla…³ categ…⁴
##        <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>   <dbl> <fct>  
## 1      0.226  0.588    0.321 -0.857  0.396 -0.725  0.298 -0.636   -0.429 AV     
## 2      0.978  0.0850   0.298 -1.44   1.05   0.119  1.03  -0.968   -0.429 AV     
## 3     -0.505  0.554    0.224 -0.485 -0.217  1.85  -0.364  0.0172  -0.429 AV     
## 4      0.880 -0.580    1.58  -0.225  0.763 -0.725  1.05  -0.249   -0.429 AV     
## 5      0.536 -0.0984   0.611 -0.703 -0.555 -0.288  0.304  1.05     1.03  AV     
## 6      1.54  -0.808    1.04  -0.414  0.548  0.196  1.00   1.12     1.11  AV     
## # … with abbreviated variable names ¹​num_sentences, ²​prop_noun, ³​exclamation,
## #   ⁴​category
# other_recipe <- recipe(category ~ ., data = input) %>%
#   step_normalize(all_predictors())

other_recipe <- recipe(category ~ ., data = input) %>%
  step_normalize(all_predictors())

train_other <- other_recipe %>% 
  prep(training = input) %>% 
  bake(train)

test_other  <- other_recipe %>% 
  prep(training = input) %>% 
  bake(test)
head(train_other)
## # A tibble: 6 × 10
##   num_s…¹    noun prop_…²   verb      adj  colon  period   comma excla…³ categ…⁴
##     <dbl>   <dbl>   <dbl>  <dbl>    <dbl>  <dbl>   <dbl>   <dbl>   <dbl> <fct>  
## 1   0.750 -0.0761   0.747 -0.412  0.358    2.33   0.129  -0.194   -0.434 AV     
## 2   0.731  0.254    1.29  -0.601 -0.0810  -0.736  0.849  -0.0837  -0.434 AV     
## 3  -0.316  0.686    0.191 -0.445  0.363   -0.324 -0.112  -0.683   -0.434 AV     
## 4   0.462 -0.475   -0.719  1.56  -0.00594  0.131  0.525  -0.461   -0.434 AV     
## 5   2.83  -1.67    -1.31   2.40  -1.46    -0.288  3.86    0.110   -0.434 AV     
## 6   0.163  0.387   -0.654  1.84  -0.697   -0.736 -0.0889  0.0189  -0.434 AV     
## # … with abbreviated variable names ¹​num_sentences, ²​prop_noun, ³​exclamation,
## #   ⁴​category
head(test_other)
## # A tibble: 6 × 10
##   num_sent…¹    noun prop_…²   verb    adj  colon period   comma excla…³ categ…⁴
##        <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>   <dbl> <fct>  
## 1      0.227  0.594    0.319 -0.860  0.389 -0.736  0.298 -0.639   -0.434 AV     
## 2      0.978  0.0883   0.295 -1.45   1.04   0.116  1.03  -0.971   -0.434 AV     
## 3     -0.504  0.559    0.222 -0.486 -0.223  1.86  -0.363  0.0142  -0.434 AV     
## 4      0.880 -0.579    1.58  -0.226  0.756 -0.736  1.05  -0.252   -0.434 AV     
## 5      0.536 -0.0959   0.609 -0.705 -0.560 -0.295  0.305  1.05     1.04  AV     
## 6      1.54  -0.809    1.04  -0.416  0.540  0.194  1.00   1.11     1.12  AV     
## # … with abbreviated variable names ¹​num_sentences, ²​prop_noun, ³​exclamation,
## #   ⁴​category

6 Build and evaluate multiple models simultaneously

# norm_recipe <- recipe(category ~ ., data = train) %>%
#   step_normalize(all_predictors())

# lda_mod <-
#   discrim_linear() %>%
#   set_engine("MASS")

xgb_mod <- boost_tree(trees=50) %>%
  set_engine("xgboost") %>%
  set_mode("classification")


#library(ranger)
rand_mod <- rand_forest(mode="classification") %>% 
  set_engine("randomForest")
library(workflowsets)
## 
## Attaching package: 'workflowsets'
## The following object is masked from 'package:tune':
## 
##     extract_recipe
all_models_set <-
  workflow_set(
    preproc = list(norm = norm_recipe),
    models = list(lda = lda_mod, rand_for = rand_mod,
                  xgb = xgb_mod)
  )
if ("doParallel" %in% rownames(installed.packages())){
  
  unregister <- function() {
    env <- foreach:::.foreachGlobals
    rm(list=ls(name=env), pos=env)
  }
  
  unregister()
}


folds <- vfold_cv(data, v = 10, strata = category)



all_models <-
  all_models_set %>%
  workflow_map(
    fn = "fit_resamples",
    resamples = folds,
    verbose = TRUE,
    seed = 1234
  )
## i 1 of 3 resampling: norm_lda
## ✔ 1 of 3 resampling: norm_lda (7.3s)
## i 2 of 3 resampling: norm_rand_for
## ✔ 2 of 3 resampling: norm_rand_for (1m 16.8s)
## i 3 of 3 resampling: norm_xgb
## ✔ 3 of 3 resampling: norm_xgb (30.9s)
all_models %>%
  collect_metrics()
## # A tibble: 6 × 9
##   wflow_id      .config        preproc model .metric .esti…¹  mean     n std_err
##   <chr>         <fct>          <chr>   <chr> <chr>   <chr>   <dbl> <int>   <dbl>
## 1 norm_lda      Preprocessor1… recipe  disc… accura… multic… 0.759    10 2.62e-3
## 2 norm_lda      Preprocessor1… recipe  disc… roc_auc hand_t… 0.901    10 1.97e-3
## 3 norm_rand_for Preprocessor1… recipe  rand… accura… multic… 0.904    10 1.95e-3
## 4 norm_rand_for Preprocessor1… recipe  rand… roc_auc hand_t… 0.977    10 6.83e-4
## 5 norm_xgb      Preprocessor1… recipe  boos… accura… multic… 0.917    10 2.31e-3
## 6 norm_xgb      Preprocessor1… recipe  boos… roc_auc hand_t… 0.982    10 6.82e-4
## # … with abbreviated variable name ¹​.estimator
autoplot(all_models)

6.1 Statistically compare models

all_auc_wide <- all_models %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == "roc_auc") %>%
  dplyr::select(wflow_id, .estimate, id) %>%
  pivot_wider(id_cols = "id", names_from = "wflow_id", values_from = ".estimate")

all_auc_wide
## # A tibble: 10 × 4
##    id     norm_lda norm_rand_for norm_xgb
##    <chr>     <dbl>         <dbl>    <dbl>
##  1 Fold01    0.912         0.978    0.979
##  2 Fold02    0.904         0.977    0.981
##  3 Fold03    0.897         0.979    0.985
##  4 Fold04    0.903         0.982    0.983
##  5 Fold05    0.892         0.975    0.979
##  6 Fold06    0.900         0.975    0.982
##  7 Fold07    0.895         0.975    0.979
##  8 Fold08    0.895         0.979    0.982
##  9 Fold09    0.904         0.979    0.984
## 10 Fold10    0.908         0.977    0.980
all_auc_wide %>%
  with( t.test(norm_lda, norm_rand_for, paired = TRUE))
## 
##  Paired t-test
## 
## data:  norm_lda and norm_rand_for
## t = -40.62, df = 9, p-value = 1.654e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.08078254 -0.07225957
## sample estimates:
## mean of the differences 
##             -0.07652106
all_auc_wide %>%
  with( t.test(norm_lda, norm_xgb, paired = TRUE))
## 
##  Paired t-test
## 
## data:  norm_lda and norm_xgb
## t = -37.705, df = 9, p-value = 3.221e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.08537097 -0.07570695
## sample estimates:
## mean of the differences 
##             -0.08053896
all_auc_wide %>%
  with( t.test(norm_rand_for, norm_xgb, paired = TRUE))
## 
##  Paired t-test
## 
## data:  norm_rand_for and norm_xgb
## t = -7.2272, df = 9, p-value = 4.936e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.005275520 -0.002760282
## sample estimates:
## mean of the differences 
##            -0.004017901

7 Hyperparameter tuning

8 Brief example of predicting a continuous variable

mod_cont_recipe <- 
  recipe(num_sentences ~ ., data=input) %>% 
  step_dummy(all_nominal())
xgb_mod_cont <- boost_tree(trees=50) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

wflow_xgb_cont <-
  workflow() %>% 
  add_model(xgb_mod_cont) %>% 
  #add_formula(num_sentences ~ .)
  add_recipe(mod_cont_recipe)
# input_cont <- input %>% 
#   dplyr::select(-c(category))

if ("doParallel" %in% rownames(installed.packages())){
  all_cores <- parallel::detectCores(logical = FALSE)
  cl <- makePSOCKcluster(all_cores) 
  registerDoParallel(cl) 
}

set.seed(156) 
folds_cont <- vfold_cv(input, v = 10) 
cont_metrics <- metric_set(rmse, rsq, mae)


keep_pred <- control_resamples(save_pred = TRUE)

xgb_cv_cont <- fit_resamples(wflow_xgb_cont, folds_cont,
                             metrics = cont_metrics,
                             control = keep_pred)

if (exists("cl")){
  stopCluster(cl) 
}
collect_predictions(xgb_cv_cont)
## # A tibble: 11,204 × 5
##    id      .pred  .row num_sentences .config             
##    <chr>   <dbl> <int>         <dbl> <fct>               
##  1 Fold01 0.0830     8        0.0858 Preprocessor1_Model1
##  2 Fold01 0.0524    22        0.0539 Preprocessor1_Model1
##  3 Fold01 0.0539    27        0.0376 Preprocessor1_Model1
##  4 Fold01 0.0976    41        0.0905 Preprocessor1_Model1
##  5 Fold01 0.0569    43        0.0523 Preprocessor1_Model1
##  6 Fold01 0.0555    46        0.0492 Preprocessor1_Model1
##  7 Fold01 0.0826    51        0.0766 Preprocessor1_Model1
##  8 Fold01 0.0728    70        0.0823 Preprocessor1_Model1
##  9 Fold01 0.0874    86        0.0886 Preprocessor1_Model1
## 10 Fold01 0.0713   142        0.0735 Preprocessor1_Model1
## # … with 11,194 more rows
collect_predictions(xgb_cv_cont) %>% 
  ggplot(aes(x=num_sentences, y=.pred))+
    geom_point(alpha=0.5)+
    geom_smooth(method="lm", se = TRUE)+
    coord_fixed()
## `geom_smooth()` using formula 'y ~ x'

xgb_cv_cont %>% 
  collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator    mean     n   std_err .config             
##   <chr>   <chr>        <dbl> <int>     <dbl> <fct>               
## 1 mae     standard   0.00472    10 0.0000493 Preprocessor1_Model1
## 2 rmse    standard   0.00711    10 0.000139  Preprocessor1_Model1
## 3 rsq     standard   0.848      10 0.00461   Preprocessor1_Model1
cor(collect_predictions(xgb_cv_cont)$num_sentences, 
    collect_predictions(xgb_cv_cont)$.pred)
## [1] 0.920497