Regression Modeling with tidymodels
Jon Geiger, Noel Goodwin, Abigail Joppa

library(tidyverse, warn.conflicts = FALSE)
library(tidymodels, warn.conflicts = FALSE)
library(tidylog)
library(ggthemes)
theme_set(theme_clean())
source("../scripts/prune_race_variables.R") # provides prune_and_predom(), used below
cores <- parallel::detectCores()

Data Import

hh <- read_csv("../data/hh.csv") %>%
    mutate(leaid = as.integer(leaid)) %>% 
    # keep districts where at least one margin of error is below 0.5 (if_any),
    # then drop the MOE columns themselves
    filter(
        if_any(ends_with("MOE"), ~ .x < 0.5)
    ) %>%
    select(-ends_with("MOE"))
## Rows: 13313 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): state, dist, region
## dbl (15): leaid, children, pct_pov, pct_SP, SP_MOE, pct_HHVJ, HHVJ_MOE, pct_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## mutate: converted 'leaid' from double to integer (0 new NA)
## 
## filter: removed 46 rows (<1%), 13,267 rows remaining
## 
## select: dropped 6 variables (SP_MOE, HHVJ_MOE, CC_MOE, nci_MOE, CD_MOE, …)
grad <- read_csv("../data/grad.csv") %>%
    mutate(leaid = as.integer(leaid))
## Rows: 12663 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): leaid, grad_rate_midpt
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## mutate: converted 'leaid' from double to integer (0 new NA)
race <- read_csv("../data/race.csv") %>% 
    prune_and_predom() %>% # sourced helper: prunes race variables, derives predom_race
    mutate(leaid = as.integer(leaid), 
           predom_race = as.character(predom_race))
## Rows: 11910 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): geo_id, dist, year
## dbl (63): leaid, total_pop_est, total_pop_moe, total_hisp_latino, total_hisp...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## select: dropped 59 variables (geo_id, dist, year, total_pop_est, total_pop_moe, …)
## 
## mutate: converted 'leaid' from double to integer (0 new NA)
## 
## select: dropped one variable (leaid)
## 
## mutate: new variable 'predom_race' (factor) with 5 unique values and 0% NA
## 
## select: dropped 6 variables (pct_hisp_latino, pct_white, pct_black, pct_native, pct_asian, …)
## 
## mutate: converted 'predom_race' from factor to character (0 new NA)
# assess <- read_csv("../data/assess.csv") %>%
#     mutate(leaid = as.integer(leaid))

finance <- read_csv("../data/finance.csv") %>%
    mutate(leaid = as.integer(leaid))
## Rows: 13314 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): leaid, fed_per_child, state_per_child, local_per_child
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## mutate: converted 'leaid' from double to integer (0 new NA)
data <- hh %>%
    left_join(grad,    by = "leaid") %>%
    left_join(race,    by = "leaid") %>%
#    left_join(assess,  by = "leaid") %>%
    left_join(finance, by = "leaid") %>%
    select(
        -state, -leaid, -ends_with(".y") # drop join duplicates, if any
    ) %>%
    rename_with(~ str_remove(.x, "\\.x$"), ends_with(".x"))
## left_join: added one column (grad_rate_midpt)
##            > rows only in x    2,505
##            > rows only in y  ( 1,901)
##            > matched rows     10,762
##            >                 ========
##            > rows total       13,267
## left_join: added 7 columns (predom_race, pct_hisp_latino, pct_white, pct_black, pct_native, …)
##            > rows only in x    1,358
##            > rows only in y  (     1)
##            > matched rows     11,909
##            >                 ========
##            > rows total       13,267
## left_join: added 3 columns (fed_per_child, state_per_child, local_per_child)
##            > rows only in x        0
##            > rows only in y  (    47)
##            > matched rows     13,267
##            >                 ========
##            > rows total       13,267
## select: dropped 2 variables (state, leaid)
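The tidylog output shows 2,505 districts with no matching graduation rate (and 1,358 with no race data). Before those rows are dropped below, a quick sketch like this could show where the unmatched districts fall:

hh %>%
    anti_join(grad, by = "leaid") %>% # districts with no graduation-rate match
    count(region, sort = TRUE)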
nrow(data)
## [1] 13267
data <- data %>%
    na.omit()

nrow(data)
## [1] 10192
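na.omit() drops roughly 3,000 districts. To see which columns carry the missingness, a sketch like this could be run against data before the na.omit() above:

data %>%
    summarise(across(everything(), ~ sum(is.na(.x)))) %>% # NA count per column
    pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
    arrange(desc(n_missing))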
data %>% skimr::skim()
Data summary

|                                  | Values     |
|:---------------------------------|:-----------|
| Name                             | Piped data |
| Number of rows                   | 10192      |
| Number of columns                | 21         |
| Column type frequency: character | 3          |
| Column type frequency: numeric   | 18         |
| Group variables                  | None       |

Variable type: character

| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|:--------------|----------:|--------------:|----:|----:|------:|---------:|-----------:|
| dist          | 0 | 1 | 4 | 68 | 0 | 9733 | 0 |
| region        | 0 | 1 | 4 | 13 | 0 | 4    | 0 |
| predom_race   | 0 | 1 | 5 | 15 | 0 | 5    | 0 |

Variable type: numeric

| skim_variable   | n_missing | complete_rate | mean    | sd       | p0     | p25     | p50     | p75     | p100      | hist  |
|:----------------|----------:|--------------:|--------:|---------:|-------:|--------:|--------:|--------:|----------:|:------|
| children        | 0 | 1 | 5066.03 | 15935.34 | 9.00   | 720.00  | 1689.00 | 4108.50 | 724446.00 | ▇▁▁▁▁ |
| pct_pov         | 0 | 1 | 0.16    | 0.09     | 0.00   | 0.10    | 0.15    | 0.21    | 0.65      | ▇▇▂▁▁ |
| pct_SP          | 0 | 1 | 0.31    | 0.13     | 0.00   | 0.22    | 0.30    | 0.38    | 0.96      | ▂▇▃▁▁ |
| pct_HHVJ        | 0 | 1 | 0.26    | 0.05     | 0.00   | 0.23    | 0.26    | 0.29    | 0.57      | ▁▂▇▁▁ |
| pct_CC          | 0 | 1 | 0.02    | 0.03     | 0.00   | 0.01    | 0.02    | 0.03    | 0.56      | ▇▁▁▁▁ |
| pct_NCI         | 0 | 1 | 0.14    | 0.11     | 0.00   | 0.06    | 0.12    | 0.19    | 0.88      | ▇▃▁▁▁ |
| pct_CD          | 0 | 1 | 0.06    | 0.04     | 0.00   | 0.04    | 0.05    | 0.08    | 0.32      | ▇▅▁▁▁ |
| pct_CLI         | 0 | 1 | 0.02    | 0.04     | 0.00   | 0.00    | 0.00    | 0.02    | 0.66      | ▇▁▁▁▁ |
| grad_rate_midpt | 0 | 1 | 86.08   | 7.85     | 3.23   | 82.25   | 87.85   | 91.68   | 97.93     | ▁▁▁▂▇ |
| pct_hisp_latino | 0 | 1 | 13.91   | 18.93    | 0.00   | 2.50    | 6.50    | 16.10   | 100.00    | ▇▁▁▁▁ |
| pct_white       | 0 | 1 | 70.87   | 25.77    | 0.00   | 56.50   | 80.40   | 91.00   | 100.00    | ▁▁▂▃▇ |
| pct_black       | 0 | 1 | 7.00    | 14.87    | 0.00   | 0.00    | 1.15    | 5.60    | 99.10     | ▇▁▁▁▁ |
| pct_native      | 0 | 1 | 2.13    | 9.42     | 0.00   | 0.00    | 0.00    | 0.40    | 100.00    | ▇▁▁▁▁ |
| pct_asian       | 0 | 1 | 1.86    | 4.53     | 0.00   | 0.00    | 0.20    | 1.80    | 65.20     | ▇▁▁▁▁ |
| pct_PI          | 0 | 1 | 0.06    | 0.39     | 0.00   | 0.00    | 0.00    | 0.00    | 13.10     | ▇▁▁▁▁ |
| fed_per_child   | 0 | 1 | 991.51  | 1323.87  | 11.51  | 517.14  | 786.57  | 1157.86 | 74305.56  | ▇▁▁▁▁ |
| state_per_child | 0 | 1 | 6579.35 | 5971.48  | 275.17 | 4271.43 | 5942.27 | 7851.77 | 393027.78 | ▇▁▁▁▁ |
| local_per_child | 0 | 1 | 6210.00 | 5804.47  | 152.86 | 3160.68 | 4832.63 | 7620.01 | 309722.22 | ▇▁▁▁▁ |

Data Splitting and Folding

Our goal is to build a regression model that predicts graduation rate from household conditions and race data. Using the tidymodels framework, we first split the data into training and testing sets, stratifying by region, and then create 10 folds of the training data for 10-fold cross-validation.

set.seed(1234)
distr_split <- initial_split(data, strata = region)
distr_train <- training(distr_split)
distr_test  <- testing(distr_split)

set.seed(4321)
distr_folds <- vfold_cv(distr_train, v = 10, strata = region)
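
Since we stratify by region, a quick check (a minimal sketch against the objects created above) can confirm that the training and testing sets have a similar regional makeup:

distr_train %>% count(region) %>% mutate(prop = n / sum(n)) # region proportions, training
distr_test  %>% count(region) %>% mutate(prop = n / sum(n)) # region proportions, testing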

Preprocessing

We can now create a recipe, or preprocessor. It marks the district name as an ID variable rather than a predictor, builds all pairwise interactions between the numeric predictors, removes near-zero-variance predictors, creates dummy variables for the nominal predictor(s), and centers and scales everything; a final near-zero-variance filter then drops any degenerate dummy or interaction columns.

distr_rec <- 
    recipe(grad_rate_midpt ~ ., data = distr_train) %>%
    update_role(dist, new_role = "ID") %>%      # keep district name for reference only
    step_interact(~ all_numeric_predictors():all_numeric_predictors()) %>% # all pairwise interactions
    step_nzv(all_numeric_predictors()) %>%      # drop near-zero-variance columns
    step_dummy(all_nominal_predictors()) %>%    # dummy-code nominal predictors
    step_scale(all_numeric_predictors()) %>%    # divide by the standard deviation ...
    step_center(all_numeric_predictors()) %>%   # ... then subtract the mean
    step_nzv(all_numeric_predictors())          # re-filter degenerate dummies/interactions
distr_rec %>%
    prep() %>%
    bake(new_data = NULL)
## # A tibble: 7,642 × 140
##    dist          children pct_pov pct_SP pct_HHVJ pct_CC pct_NCI  pct_CD pct_CLI
##    <fct>            <dbl>   <dbl>  <dbl>    <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
##  1 Casey-Westfi…   -0.247  -0.199  1.12    -0.505 -0.758 -0.608   2.23   -0.492 
##  2 Blue Ridge C…   -0.266  -0.544  0.265    1.42  -0.259 -0.105  -1.19   -0.492 
##  3 Prairie Cent…   -0.191  -0.588 -0.259   -0.709 -0.402 -0.0651 -0.0501 -0.0603
##  4 Eastland Com…   -0.270  -0.438 -0.839   -1.10  -0.822 -0.857   0.0936 -0.492 
##  5 Goreville Co…   -0.268  -0.651 -0.427   -1.55  -0.719 -0.435  -1.25   -0.492 
##  6 Joppa-Maple …   -0.283   0.822 -0.526   -0.498  1.50   3.37   -0.258  -0.492 
##  7 Farmington C…   -0.222  -0.645 -0.460    0.536 -0.331 -0.562   0.268  -0.492 
##  8 Gallatin Com…   -0.260   1.07   1.23    -0.939 -0.356 -0.283   2.54   -0.492 
##  9 Georgetown-R…   -0.240   0.565  0.110    0.116 -0.520  0.153  -0.0723 -0.492 
## 10 Norris City-…   -0.256   0.201 -0.340   -1.21  -0.593 -0.238   1.12   -0.492 
## # … with 7,632 more rows, and 131 more variables: pct_hisp_latino <dbl>,
## #   pct_white <dbl>, pct_black <dbl>, fed_per_child <dbl>,
## #   state_per_child <dbl>, local_per_child <dbl>, grad_rate_midpt <dbl>,
## #   children_x_pct_pov <dbl>, children_x_pct_SP <dbl>,
## #   children_x_pct_HHVJ <dbl>, children_x_pct_CC <dbl>,
## #   children_x_pct_NCI <dbl>, children_x_pct_CD <dbl>,
## #   children_x_pct_CLI <dbl>, children_x_pct_hisp_latino <dbl>, …
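
To see what each step did once the recipe is prepped — in particular, which columns the two near-zero-variance filters removed — we can tidy the prepped recipe. A sketch, assuming step numbers follow the order the steps were added above:

prepped <- prep(distr_rec)
tidy(prepped)               # one row per step, in order
tidy(prepped, number = 2)   # columns removed by the first step_nzv
tidy(prepped, number = 6)   # columns removed by the final step_nzv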

Model Specifications

We’ll now create our model specifications, which we’ll later combine with the recipe into a workflow set. We’ll specify the following models (engines in parentheses):

  • Linear Regression (lm)
  • Lasso Regression (glmnet)
  • (NOT WORKING) Partial Least Squares (mixOmics)
  • Multivariate Adaptive Regression Spline (earth)
  • Support Vector Regression (kernlab)
  • Decision Tree (rpart)
  • Random Forest (ranger)
  • Gradient Boosted Trees (xgboost)
  • K-Nearest Neighbors (kknn)
lm_spec <-
    linear_reg() %>%
    set_engine('lm')

lasso_spec <-
    linear_reg(
        mixture = 1, 
        penalty = tune()
    ) %>%
    set_engine('glmnet')

pls_spec <-
    pls(
        predictor_prop = tune(), 
        num_comp = tune()
    ) %>%
    set_engine('mixOmics') %>%
    set_mode('regression')

mars_spec <-
    mars(
        prod_degree = tune()
    ) %>%
    set_engine('earth') %>%
    set_mode('regression')

svm_spec <-
    svm_linear(
        cost = tune(), 
        margin = tune()
    ) %>%
    set_engine('kernlab') %>%
    set_mode('regression')

dtree_spec <-
    decision_tree(
        tree_depth = tune(), 
        min_n = tune(), 
        cost_complexity = tune()
    ) %>%
    set_engine('rpart') %>%
    set_mode('regression')

rf_spec <-
    rand_forest(
        trees = 1000,
        mtry = tune(),
        min_n = tune()
    ) %>%
    set_engine('ranger', num.threads = cores, importance = "impurity") %>%
    set_mode('regression')

xgb_spec <- 
    boost_tree(
        trees = 1000, 
        tree_depth = tune(),
        min_n = tune(),
        mtry = tune(),
        sample_size = tune(),
        learn_rate = tune()
    ) %>%
    set_engine("xgboost") %>%
    set_mode("regression")

# knn_spec <-
#     nearest_neighbor(
#         neighbors = tune(), 
#         weight_func = tune(), 
#         dist_power = tune()
#     ) %>%
#     set_engine('kknn') %>%
#     set_mode('regression')
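
These specifications are engine-agnostic; parsnip’s translate() shows the underlying engine call a spec will generate, with tune() placeholders left in place. For example:

lasso_spec %>% translate() # show the glmnet call template for the lasso spec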

Now that we have our model specifications, we can put all of these into a workflow set.

distr_workflowset <- 
    workflow_set(
        preproc = list("rec" = distr_rec), 
        models = list(
            "lm" = lm_spec, 
            "lasso" = lasso_spec, 
            # "pls" = pls_spec, 
            "mars" = mars_spec, 
            "svm" = svm_spec, 
            "dtree" = dtree_spec, 
            "rf" = rf_spec, 
            "xgboost" = xgb_spec
            # "knn" = knn_spec
        )
    ) %>%
    mutate(wflow_id = wflow_id %>% str_remove_all("rec_"))
## mutate: changed 7 values (100%) of 'wflow_id' (0 new NA)
distr_workflowset
## # A workflow set/tibble: 7 × 4
##   wflow_id info             option    result    
##   <chr>    <list>           <list>    <list>    
## 1 lm       <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 lasso    <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 mars     <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 svm      <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 dtree    <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 rf       <tibble [1 × 4]> <opts[0]> <list [0]>
## 7 xgboost  <tibble [1 × 4]> <opts[0]> <list [0]>
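
Each row of the set pairs the shared recipe with one model specification; extract_workflow() pulls any single workflow back out for inspection:

distr_workflowset %>%
    extract_workflow("lasso") # the recipe bundled with the lasso spec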

Model Training

We can now tune our models, evaluating up to 30 grid points of candidate hyperparameters per workflow across the 10 cross-validation folds:

distr_grid_ctrl <- 
    control_grid(
        save_pred = TRUE, 
        parallel_over = "everything", 
        save_workflow = TRUE
    )

doParallel::registerDoParallel(cores = cores)
distr_tune <- 
    distr_workflowset %>%
    workflow_map("tune_grid", seed = 2314, 
                resamples = distr_folds, 
                grid = 30, 
                control = distr_grid_ctrl, 
                verbose = TRUE)

saveRDS(distr_tune, paste0("trained_models/", Sys.Date(), "_distr_tune.rds"))
i 1 of 7 resampling: lm
✔ 1 of 7 resampling: lm (7.2s)
i 2 of 7 tuning:     lasso
✔ 2 of 7 tuning:     lasso (7.5s)
i 3 of 7 tuning:     mars
✔ 3 of 7 tuning:     mars (31.5s)
i 4 of 7 tuning:     svm
✔ 4 of 7 tuning:     svm (1h 32m 55.8s)
i 5 of 7 tuning:     dtree
✔ 5 of 7 tuning:     dtree (4m 18.9s)
i 6 of 7 tuning:     rf
✔ 6 of 7 tuning:     rf (2h 35m 19.7s)
i 7 of 7 tuning:     xgboost
✔ 7 of 7 tuning:     xgboost (32m 56.5s)
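
The full run takes several hours, most of it spent on the SVM and random forest. When iterating on a single model, it can be cheaper to extract just that workflow and tune it with a smaller grid — a minimal sketch using the objects above:

lasso_quick <- distr_workflowset %>%
    extract_workflow("lasso") %>%
    tune_grid(resamples = distr_folds, grid = 10, control = distr_grid_ctrl)
collect_metrics(lasso_quick)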

Model Evaluation

distr_tune <- readRDS("trained_models/2022-05-25_distr_tune.rds")
autoplot(distr_tune, select_best = TRUE)

(ranks <- distr_tune %>%
    rank_results(select_best = TRUE) %>%
    select(-std_err) %>%
    pivot_wider(names_from = .metric, values_from = mean) %>%
    select(wflow_id, rank, rmse, rsq))
## select: dropped one variable (std_err)
## pivot_wider: reorganized (.metric, mean) into (rmse, rsq) [was 14x8, now 7x8]
## select: dropped 4 variables (.config, n, preprocessor, model)
## # A tibble: 7 × 4
##   wflow_id  rank  rmse   rsq
##   <chr>    <int> <dbl> <dbl>
## 1 xgboost      1  5.74 0.455
## 2 rf           2  5.79 0.445
## 3 mars         3  6.12 0.385
## 4 lasso        4  6.36 0.332
## 5 dtree        5  6.40 0.323
## 6 lm           6  6.45 0.320
## 7 svm          7  6.47 0.325
(best_wflow_id <- ranks %>% 
    head(1) %>% pull(wflow_id))
## [1] "xgboost"
best_results <- 
    distr_tune %>%
    extract_workflow_set_result(best_wflow_id) %>%
    select_best(metric = "rmse")

rf_results <- 
    distr_tune %>%
    extract_workflow_set_result("rf") %>%
    select_best(metric = "rmse")

best_results
## # A tibble: 1 × 6
##    mtry min_n tree_depth learn_rate sample_size .config              
##   <int> <int>      <int>      <dbl>       <dbl> <chr>                
## 1    19    13         11    0.00565       0.960 Preprocessor1_Model10
rf_results
## # A tibble: 1 × 3
##    mtry min_n .config              
##   <int> <int> <chr>                
## 1   104    13 Preprocessor1_Model23
test_results <- 
    distr_tune %>%
    extract_workflow(best_wflow_id) %>%
    finalize_workflow(best_results) %>%
    last_fit(split = distr_split)

rf_test_results <- 
    distr_tune %>%
    extract_workflow("rf") %>%
    finalize_workflow(rf_results) %>%
    last_fit(split = distr_split)

collect_metrics(test_results)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       5.98  Preprocessor1_Model1
## 2 rsq     standard       0.450 Preprocessor1_Model1
collect_metrics(rf_test_results)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       6.06  Preprocessor1_Model1
## 2 rsq     standard       0.435 Preprocessor1_Model1
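Both finalized models land close to their resampling estimates on the held-out test set, which suggests the tuning process did not overfit. Since RMSE emphasizes large errors, a complementary check is mean absolute error, computed from the collected predictions (a sketch):

test_results %>%
    collect_predictions() %>%
    mae(truth = grad_rate_midpt, estimate = .pred) # average absolute miss, in grad-rate points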
test_results %>%
    collect_predictions() %>%
    ggplot(aes(x = grad_rate_midpt, y = .pred)) + 
    geom_abline(color = "gray50", lty = 2) + 
    geom_point(alpha = 0.5) + 
    coord_obs_pred() + 
    labs(x = "observed graduation rate", y = "predicted graduation rate", 
         title = paste0("Predicted vs. Observed Graduation Rate with ", best_wflow_id))

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
rf_test_results %>%
    extract_fit_parsnip() %>%
    vip(num_features = 20)
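
vip() plots the top features; to work with the same importance scores as data — for example, listing the ten most important predictors — vi() returns them as a tibble (a sketch, assuming vip’s support for parsnip fits):

rf_test_results %>%
    extract_fit_parsnip() %>%
    vi() %>%                         # tibble with Variable and Importance columns
    slice_max(Importance, n = 10)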
