Predicting Promotions Through Machine Learning

Building an XGBoost model in the Tidymodels ecosystem to predict whether an employee will be promoted.
Tidymodels
XGBoost
R
Machine Learning
Employee Promotions
Author

Adam D McKinnon

Published

December 30, 2022


Libraries

Code
# data manipulation
library(readxl)
library(tidyverse)
library(janitor)

# modelling
library(tidymodels)
library(finetune)
library(bundle)

# visualisation
library(plotly)

# parallel processing
library(doParallel)
library(parallelly)


tidymodels_prefer()


Data

Code
# Load Data ----
promotions_tbl <- readxl::read_excel(path = "2022_12_15_promotions.xlsx")


promotions_tbl <- promotions_tbl %>% 
    mutate(
        # make "promoted" the first factor level so yardstick treats it as the event class
        promoted = forcats::as_factor(promoted) %>% forcats::fct_relevel("promoted", "not promoted")
    ) %>% 
    mutate(across(c(gender, work_site, management_level), forcats::as_factor))
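Before modelling, it helps to confirm the class balance of the outcome, since promotions are typically the minority class. A minimal check (column names as loaded above):

Code
# check the class balance of the outcome
promotions_tbl %>% 
    count(promoted) %>% 
    mutate(proportion = n / sum(n))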


Building an ML Model

1. Splitting the data

Code
# Spending the data budget ----

set.seed(836)
promotion_split     <- initial_split(promotions_tbl, strata = promoted)
promotion_train_tbl <- training(promotion_split)
promotion_test_tbl  <- testing(promotion_split)


set.seed(234)
promotion_folds <- bootstraps(promotion_train_tbl, 
                              times = 75, # default is 25 - inflated to accommodate racing method of tuning 
                              strata = promoted)

# check the promotion_folds 
# promotion_folds
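Because the split is stratified on promoted, the training and test sets should mirror the full dataset's class proportions. A quick verification sketch:

Code
# confirm stratification preserved the class balance in both splits
promotion_train_tbl %>% count(promoted) %>% mutate(prop = n / sum(n))
promotion_test_tbl  %>% count(promoted) %>% mutate(prop = n / sum(n))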


2. Pre-processing the data

Code
# Data Pre-processing ----
xgboost_recipe <- 
    recipe(formula = promoted ~ ., data = promotion_train_tbl) %>% 
    recipes::update_role(employee_id, new_role = "id") %>% 
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% 
    step_zv(all_predictors()) 


# check the recipe
# xgboost_recipe
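To see exactly what the pre-processing produces (one-hot encoded dummies, zero-variance columns removed), the recipe can be estimated on the training data and the baked output inspected. A sketch:

Code
# estimate the recipe and inspect the processed training data
xgboost_recipe %>% 
    prep() %>% 
    bake(new_data = NULL) %>% 
    glimpse()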


3. Create a model specification

Code
# Model Set-up ----
xgboost_spec <- 
    boost_tree(trees = 1000, 
               tree_depth = tune(), min_n = tune(), 
               loss_reduction = tune(), 
               sample_size = tune(), mtry = tune(),
               learn_rate = tune()) %>% 
    set_engine("xgboost") %>% 
    set_mode("classification")


# check the model specification
# xgboost_spec
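Six hyperparameters are flagged with tune(). Their default ranges can be inspected before tuning; note that mtry has a data-dependent upper bound, which the tuning function finalises automatically when given an integer grid. A sketch:

Code
# inspect the tunable parameters and their default ranges
xgboost_spec %>% 
    extract_parameter_set_dials()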


4. Workflow setup

Code
# Workflow setup
xgboost_workflow <- 
    workflow() %>% 
    add_recipe(xgboost_recipe) %>% 
    add_model(xgboost_spec) 

# Check the workflow
# xgboost_workflow


5. Tuning the model

Code
# specify the metrics of interest
# NOTE: The first metric listed will be used for tuning
promotion_metrics <- metric_set(
                            roc_auc, 
                            accuracy, 
                            sensitivity, 
                            specificity
                            )


# establish parallel processing based on the number of available cores
doParallel::registerDoParallel(cores = parallelly::availableCores())


set.seed(826)
racing_resamples <- finetune::tune_race_anova(
    xgboost_workflow,
    resamples = promotion_folds,
    grid = 100, # cast a wide grid to optimise the results -
                # works best with many resamples - set earlier to 75
    metrics = promotion_metrics,
    control = control_race(
        verbose_elim = TRUE,
        save_pred    = TRUE
        )
)


# racing_resamples
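Once racing finishes, the surviving configurations can be ranked on the tuning metric, and the parallel workers released. A sketch:

Code
# rank the surviving hyperparameter configurations by ROC AUC
racing_resamples %>% 
    show_best(metric = "roc_auc", n = 5)

# release the parallel workers registered earlier
doParallel::stopImplicitCluster()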


6. Assess model performance

Code
first_model_metrics_tbl <- collect_metrics(racing_resamples)
tuning_plot <- plotly_build(plot_race(racing_resamples))

# enable panelsets for displaying the tuning results below
xaringanExtra::use_panelset()

Promotion Metrics

Best configuration from racing (Preprocessor1_Model044): mtry = 3, min_n = 8, tree_depth = 13, learn_rate = 0.002555174, loss_reduction = 0.0003583973, sample_size = 0.8054182.

.metric       .estimator   mean        n    std_err
accuracy      binary       0.7927507   75   0.002301022
roc_auc       binary       0.8479777   75   0.002578232
sensitivity   binary       0.6046792   75   0.006856507
specificity   binary       0.9112788   75   0.005088285

Model Tuning Visualisation

[Interactive racing plot from plot_race(): candidate hyperparameter configurations are progressively eliminated as resamples accumulate, leaving only the best-performing configurations.]
7. Finalise the workflow

Code
last_fit_xgboost_workflow <- xgboost_workflow %>%
    finalize_workflow(select_best(racing_resamples, metric = "roc_auc")) %>%
    last_fit(promotion_split)


# last_fit_xgboost_workflow

# test the fit
collect_metrics(last_fit_xgboost_workflow) %>% gt::gt()
.metric    .estimator   .estimate   .config
accuracy   binary       0.8190045   Preprocessor1_Model1
roc_auc    binary       0.8768303   Preprocessor1_Model1
Code
# extract the model workflow for further testing & saving
final_model_workflow <- last_fit_xgboost_workflow %>%
    extract_workflow()
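With the fitted workflow extracted, variable importance can be examined to see which features drive the predictions. A minimal sketch, assuming the vip package is installed (it is not loaded above):

Code
# plot variable importance from the fitted xgboost model
# (assumes the vip package is available)
final_model_workflow %>% 
    extract_fit_parsnip() %>% 
    vip::vip(num_features = 10)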


8. Re-assess model performance

Code
# test the model
pred_test <- final_model_workflow %>% 
    predict(promotion_test_tbl) %>%
    bind_cols(promotion_test_tbl)

# Visualise the performance using a confusion matrix
cm <- conf_mat(pred_test, promoted, .pred_class)
autoplot(cm, type = "heatmap") %>% 
    plotly::plotly_build()
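The confusion matrix can also be summarised numerically; calling summary() on a conf_mat object returns the full set of derived classification metrics. A sketch:

Code
# derive summary statistics (sensitivity, specificity, precision, etc.)
summary(cm)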


Save the model

Code
# save the model for future use 
model_bundle <- bundle::bundle(final_model_workflow)
readr::write_rds(model_bundle, file = "model_bundle.rds")
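The bundled workflow can later be restored in a fresh R session and used for scoring. A sketch, reusing the file path above (the test set stands in for new employee data):

Code
# restore the bundled workflow and score new data
model_bundle_restored <- readr::read_rds("model_bundle.rds")
scoring_workflow      <- bundle::unbundle(model_bundle_restored)

predict(scoring_workflow, new_data = promotion_test_tbl)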

Citation

BibTeX citation:
@online{dmckinnon2022,
  author = {Adam D McKinnon},
  title = {Predicting {Promotions} {Through} {Machine} {Learning}},
  date = {2022-12-30},
  url = {https://www.adam-d-mckinnon.com//posts/2022-12-30-promotion_prediction},
  langid = {en}
}
For attribution, please cite this work as:
Adam D McKinnon. 2022. “Predicting Promotions Through Machine Learning.” December 30, 2022. https://www.adam-d-mckinnon.com//posts/2022-12-30-promotion_prediction.