Libraries
Code
# data manipulation
library(readxl)
library(tidyverse)
library(janitor)
# modelling
library(tidymodels)
library(finetune)
library(bundle)
library(plotly)
# processing power
library(doParallel)
library(parallelly)
tidymodels_prefer()
Data
Code
# Load Data ----
promotions_tbl <- readxl::read_excel(path = "2022_12_15_promotions.xlsx")

promotions_tbl <- promotions_tbl %>%
    mutate(
        promoted = forcats::as_factor(promoted) %>% forcats::fct_relevel("promoted", "not promoted")
    ) %>%
    mutate_at(.vars = c("gender", "work_site", "management_level"), .funs = ~ forcats::as_factor(.))
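Before splitting the data, a quick structural check can catch problems early. A minimal sketch (not in the original post; glimpse() and count() are standard tidyverse calls, and janitor, loaded above, is commonly used to standardise column names):

Code
# optional sanity checks before modelling
promotions_tbl %>% glimpse()        # column types and dimensions
promotions_tbl %>% count(promoted)  # outcome class balance

# janitor::clean_names() would coerce column names to snake_case
# promotions_tbl <- promotions_tbl %>% janitor::clean_names()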
Building an ML Model
### 1. Splitting the data
Code
# Spending the dataset ----
set.seed(836)
promotion_split     <- initial_split(promotions_tbl, strata = promoted)
promotion_train_tbl <- training(promotion_split)
promotion_test_tbl  <- testing(promotion_split)
set.seed(234)
promotion_folds <- bootstraps(promotion_train_tbl,
                              times = 75, # default is 25 - inflated to accommodate racing method of tuning
                              strata = promoted)
# check the promotion_folds
# promotion_folds
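As a quick check (a sketch, not part of the original workflow), stratifying on promoted should leave the class balance roughly equal across the training and test sets:

Code
# confirm the stratified split preserved the outcome distribution
promotion_train_tbl %>% count(promoted) %>% mutate(prop = n / sum(n))
promotion_test_tbl %>% count(promoted) %>% mutate(prop = n / sum(n))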
### 2. Pre-processing the data
Code
# Data Pre-processing ----
xgboost_recipe <-
    recipe(formula = promoted ~ ., data = promotion_train_tbl) %>%
    recipes::update_role(employee_id, new_role = "id") %>%
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
    step_zv(all_predictors())
# check the recipe
# xgboost_recipe
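To see exactly what this recipe will hand to xgboost, it can be prepped on the training data and the result baked out; a minimal sketch using the standard recipes verbs:

Code
# optional: inspect the processed design matrix the recipe produces
xgboost_recipe %>%
    prep() %>%
    bake(new_data = NULL) %>% # NULL returns the retained training data
    glimpse()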
### 3. Create a model specification
Code
# Model Set-up ----
xgboost_spec <-
    boost_tree(trees = 1000,
               tree_depth = tune(), min_n = tune(),
               loss_reduction = tune(),
               sample_size = tune(), mtry = tune(),
               learn_rate = tune()) %>%
    set_engine("xgboost") %>%
    set_mode("classification")
# check the model specification
# xgboost_spec
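A quick way to confirm which hyperparameters are flagged for tuning is to pull the parameter set from the specification (a sketch using tidymodels' extract_parameter_set_dials()):

Code
# list the six hyperparameters marked with tune() above
extract_parameter_set_dials(xgboost_spec)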
### 4. Workflow setup
Code
# Workflow setup ----
xgboost_workflow <-
    workflow() %>%
    add_recipe(xgboost_recipe) %>%
    add_model(xgboost_spec)

# check the workflow
# xgboost_workflow
### 5. Tuning the model
Code
# specify the metrics of interest
# NOTE: The first metric listed will be used for tuning
promotion_metrics <- metric_set(
    roc_auc,
    accuracy,
    sensitivity,
    specificity
)
# establish parallel processing based on the number of available cores
doParallel::registerDoParallel(cores = parallelly::availableCores())
set.seed(826)
racing_resamples <- finetune::tune_race_anova(
    xgboost_workflow,
    resamples = promotion_folds,
    grid = 100, # cast a wide grid to optimise the results -
                # works best with many resamples - set earlier to 75
    metrics = promotion_metrics,
    control = control_race(
        verbose_elim = TRUE,
        save_pred = TRUE
    )
)
# racing_resamples
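Before collecting every metric, show_best() offers a quick look at the configurations that survived the race (a sketch; results vary with the seed):

Code
# top candidates ranked by the tuning metric
show_best(racing_resamples, metric = "roc_auc")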
### 6. Assess model performance
Code
first_model_metrics_tbl <- collect_metrics(racing_resamples)
tuning_plot <- plotly_build(plot_race(racing_resamples))

xaringanExtra::use_panelset()
Promotion Metrics
| mtry | min_n | tree_depth | learn_rate | loss_reduction | sample_size | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 8 | 13 | 0.002555174 | 0.0003583973 | 0.8054182 | accuracy | binary | 0.7927507 | 75 | 0.002301022 | Preprocessor1_Model044 |
| 3 | 8 | 13 | 0.002555174 | 0.0003583973 | 0.8054182 | roc_auc | binary | 0.8479777 | 75 | 0.002578232 | Preprocessor1_Model044 |
| 3 | 8 | 13 | 0.002555174 | 0.0003583973 | 0.8054182 | sensitivity | binary | 0.6046792 | 75 | 0.006856507 | Preprocessor1_Model044 |
| 3 | 8 | 13 | 0.002555174 | 0.0003583973 | 0.8054182 | specificity | binary | 0.9112788 | 75 | 0.005088285 | Preprocessor1_Model044 |
Model Tuning Visualisation

[Interactive plot_race visualisation of the tuning process; rendered as a Plotly figure in the original post.]
### 7. Finalise the workflow
Code
last_fit_xgboost_workflow <- xgboost_workflow %>%
    finalize_workflow(select_best(racing_resamples, metric = "roc_auc")) %>%
    last_fit(promotion_split)
# last_fit_xgboost_workflow
# test the fit
collect_metrics(last_fit_xgboost_workflow) %>% gt::gt()
| .metric | .estimator | .estimate | .config |
|---|---|---|---|
| accuracy | binary | 0.8190045 | Preprocessor1_Model1 |
| roc_auc | binary | 0.8768303 | Preprocessor1_Model1 |
Code
# extract the model workflow for further testing & saving
final_model_workflow <- last_fit_xgboost_workflow %>%
    extract_workflow()
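With the fitted workflow in hand, variable importance is a natural next check. The sketch below assumes the {vip} package, which is not loaded above:

Code
# assumed: {vip} is installed; plot xgboost feature importance
final_model_workflow %>%
    extract_fit_parsnip() %>%
    vip::vip(num_features = 10)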
### 8. Re-assess model performance
Code
# test the model
pred_test <- final_model_workflow %>%
    predict(promotion_test_tbl) %>%
    bind_cols(promotion_test_tbl)

# Visualise the performance using a confusion matrix
cm <- conf_mat(pred_test, promoted, .pred_class)

autoplot(cm, type = "heatmap") %>%
    plotly::plotly_build()
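The confusion matrix can be complemented with the same class metrics used during tuning, now computed on the held-out test set. A sketch (roc_auc is omitted because predict() above returns hard classes rather than probabilities):

Code
# class metrics on the test predictions
class_metrics <- metric_set(accuracy, sensitivity, specificity)

pred_test %>%
    class_metrics(truth = promoted, estimate = .pred_class)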
Save the model
Code
# save the model for future use
model_bundle <- bundle::bundle(final_model_workflow)
readr::write_rds(model_bundle, file = "model_bundle.rds")
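In a later session the bundle can be restored with bundle::unbundle() and used for scoring. A minimal sketch (new_employees_tbl is an illustrative name, not data from the post):

Code
# restore the saved workflow in a fresh R session
model_bundle <- readr::read_rds("model_bundle.rds")
restored_workflow <- bundle::unbundle(model_bundle)

# score new data with the restored workflow
# predict(restored_workflow, new_data = new_employees_tbl)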
Citation
BibTeX citation:
@online{dmckinnon2022,
author = {Adam D McKinnon},
title = {Predicting {Promotions} {Through} {Machine} {Learning}},
date = {2022-12-30},
url = {https://www.adam-d-mckinnon.com//posts/2022-12-30-promotion_prediction},
langid = {en}
}
For attribution, please cite this work as:
Adam D McKinnon. 2022. “Predicting Promotions Through Machine
Learning.” December 30, 2022. https://www.adam-d-mckinnon.com//posts/2022-12-30-promotion_prediction.