library(tidyverse, warn.conflicts = F)
library(tidymodels, warn.conflicts = F)
library(tidylog)
library(ggthemes)
theme_set(theme_clean())
source("../scripts/prune_race_variables.R")
cores <- parallel::detectCores()

hh <- read_csv("../data/hh.csv") %>%
  mutate(leaid = as.integer(leaid)) %>%
  # keep districts where at least one margin-of-error column is below 0.5,
  # then drop the MOE columns themselves
  filter(if_any(ends_with("MOE"), ~ .x < 0.5)) %>%
  select(-ends_with("MOE"))

## Rows: 13313 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): state, dist, region
## dbl (15): leaid, children, pct_pov, pct_SP, SP_MOE, pct_HHVJ, HHVJ_MOE, pct_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## mutate: converted 'leaid' from double to integer (0 new NA)
##
## filter: removed 46 rows (<1%), 13,267 rows remaining
##
## select: dropped 6 variables (SP_MOE, HHVJ_MOE, CC_MOE, nci_MOE, CD_MOE, …)
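Note that `if_any()` keeps a district as long as *at least one* of its margin-of-error columns is below 0.5; if the intent were to require every estimate to be reliable, `if_all()` would be the stricter choice. As a quick sanity check, the 46 dropped districts should (modulo NA handling) be exactly those failing the complement condition:

```r
# Sanity check (sketch): districts where every MOE column is >= 0.5
# are the ones the filter above removed.
read_csv("../data/hh.csv") %>%
  filter(if_all(ends_with("MOE"), ~ .x >= 0.5)) %>%
  nrow()
```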
grad <- read_csv("../data/grad.csv") %>%
mutate(leaid = as.integer(leaid))

## Rows: 12663 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): leaid, grad_rate_midpt
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## mutate: converted 'leaid' from double to integer (0 new NA)
race <- read_csv("../data/race.csv") %>%
prune_and_predom() %>%
mutate(leaid = as.integer(leaid),
predom_race = as.character(predom_race))

## Rows: 11910 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): geo_id, dist, year
## dbl (63): leaid, total_pop_est, total_pop_moe, total_hisp_latino, total_hisp...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## select: dropped 59 variables (geo_id, dist, year, total_pop_est, total_pop_moe, …)
##
## mutate: converted 'leaid' from double to integer (0 new NA)
##
## select: dropped one variable (leaid)
##
## mutate: new variable 'predom_race' (factor) with 5 unique values and 0% NA
##
## select: dropped 6 variables (pct_hisp_latino, pct_white, pct_black, pct_native, pct_asian, …)
##
## mutate: converted 'predom_race' from factor to character (0 new NA)
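The `prune_and_predom()` helper comes from the sourced script and isn't shown here. Based on the tidylog messages above, a minimal sketch of what it might do (the actual definition lives in `../scripts/prune_race_variables.R`, so treat this as an assumption):

```r
# Hypothetical sketch of prune_and_predom(), inferred from the tidylog
# output above; not the actual implementation.
prune_and_predom <- function(df) {
  pct <- df %>% select(starts_with("pct_"))
  df %>%
    select(leaid, starts_with("pct_")) %>%
    mutate(
      # label each district by its largest racial/ethnic share
      predom_race = factor(
        str_remove(names(pct)[max.col(pct, ties.method = "first")], "pct_")
      )
    )
}
```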
# assess <- read_csv("../data/assess.csv") %>%
# mutate(leaid = as.integer(leaid))
finance <- read_csv("../data/finance.csv") %>%
mutate(leaid = as.integer(leaid))

## Rows: 13314 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): leaid, fed_per_child, state_per_child, local_per_child
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## mutate: converted 'leaid' from double to integer (0 new NA)
data <- hh %>%
  left_join(grad, by = "leaid") %>%
  left_join(race, by = "leaid") %>%
  # left_join(assess, by = "leaid") %>%
  left_join(finance, by = "leaid") %>%
  select(-state, -leaid, -ends_with(".y")) %>%
  rename_with(~ str_remove(.x, "\\.x$"), ends_with(".x"))

## left_join: added one column (grad_rate_midpt)
## > rows only in x 2,505
## > rows only in y ( 1,901)
## > matched rows 10,762
## > ========
## > rows total 13,267
## left_join: added 7 columns (predom_race, pct_hisp_latino, pct_white, pct_black, pct_native, …)
## > rows only in x 1,358
## > rows only in y ( 1)
## > matched rows 11,909
## > ========
## > rows total 13,267
## left_join: added 3 columns (fed_per_child, state_per_child, local_per_child)
## > rows only in x 0
## > rows only in y ( 47)
## > matched rows 13,267
## > ========
## > rows total 13,267
## select: dropped 2 variables (state, leaid)
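Those tidylog summaries are worth a glance: about 2,500 districts have no graduation-rate match. A quick `anti_join()` is one way to see who they are (an illustrative check, not part of the original pipeline):

```r
# Which districts in hh lack a graduation rate in grad?
hh %>%
  anti_join(grad, by = "leaid") %>%
  count(state, sort = TRUE)
```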
nrow(data)

## [1] 13267
data <- data %>%
na.omit()
nrow(data)

## [1] 10192
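`na.omit()` discards roughly 3,000 districts, so before committing to the drop it can be worth seeing which columns carry the missingness. A per-column NA count (a sketch to run before the `na.omit()` call above):

```r
# Count NAs per column to see which join contributes the incomplete rows.
data %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
  arrange(desc(n_missing))
```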
data %>% skimr::skim()

|                        |            |
|:-----------------------|:-----------|
| Name                   | Piped data |
| Number of rows         | 10192      |
| Number of columns      | 21         |
| Column type frequency: |            |
| character              | 3          |
| numeric                | 18         |
| Group variables        | None       |

Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| dist | 0 | 1 | 4 | 68 | 0 | 9733 | 0 |
| region | 0 | 1 | 4 | 13 | 0 | 4 | 0 |
| predom_race | 0 | 1 | 5 | 15 | 0 | 5 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| children | 0 | 1 | 5066.03 | 15935.34 | 9.00 | 720.00 | 1689.00 | 4108.50 | 724446.00 | ▇▁▁▁▁ |
| pct_pov | 0 | 1 | 0.16 | 0.09 | 0.00 | 0.10 | 0.15 | 0.21 | 0.65 | ▇▇▂▁▁ |
| pct_SP | 0 | 1 | 0.31 | 0.13 | 0.00 | 0.22 | 0.30 | 0.38 | 0.96 | ▂▇▃▁▁ |
| pct_HHVJ | 0 | 1 | 0.26 | 0.05 | 0.00 | 0.23 | 0.26 | 0.29 | 0.57 | ▁▂▇▁▁ |
| pct_CC | 0 | 1 | 0.02 | 0.03 | 0.00 | 0.01 | 0.02 | 0.03 | 0.56 | ▇▁▁▁▁ |
| pct_NCI | 0 | 1 | 0.14 | 0.11 | 0.00 | 0.06 | 0.12 | 0.19 | 0.88 | ▇▃▁▁▁ |
| pct_CD | 0 | 1 | 0.06 | 0.04 | 0.00 | 0.04 | 0.05 | 0.08 | 0.32 | ▇▅▁▁▁ |
| pct_CLI | 0 | 1 | 0.02 | 0.04 | 0.00 | 0.00 | 0.00 | 0.02 | 0.66 | ▇▁▁▁▁ |
| grad_rate_midpt | 0 | 1 | 86.08 | 7.85 | 3.23 | 82.25 | 87.85 | 91.68 | 97.93 | ▁▁▁▂▇ |
| pct_hisp_latino | 0 | 1 | 13.91 | 18.93 | 0.00 | 2.50 | 6.50 | 16.10 | 100.00 | ▇▁▁▁▁ |
| pct_white | 0 | 1 | 70.87 | 25.77 | 0.00 | 56.50 | 80.40 | 91.00 | 100.00 | ▁▁▂▃▇ |
| pct_black | 0 | 1 | 7.00 | 14.87 | 0.00 | 0.00 | 1.15 | 5.60 | 99.10 | ▇▁▁▁▁ |
| pct_native | 0 | 1 | 2.13 | 9.42 | 0.00 | 0.00 | 0.00 | 0.40 | 100.00 | ▇▁▁▁▁ |
| pct_asian | 0 | 1 | 1.86 | 4.53 | 0.00 | 0.00 | 0.20 | 1.80 | 65.20 | ▇▁▁▁▁ |
| pct_PI | 0 | 1 | 0.06 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 13.10 | ▇▁▁▁▁ |
| fed_per_child | 0 | 1 | 991.51 | 1323.87 | 11.51 | 517.14 | 786.57 | 1157.86 | 74305.56 | ▇▁▁▁▁ |
| state_per_child | 0 | 1 | 6579.35 | 5971.48 | 275.17 | 4271.43 | 5942.27 | 7851.77 | 393027.78 | ▇▁▁▁▁ |
| local_per_child | 0 | 1 | 6210.00 | 5804.47 | 152.86 | 3160.68 | 4832.63 | 7620.01 | 309722.22 | ▇▁▁▁▁ |
Our goal is to create a regression model that predicts graduation rate
from household conditions and race data. We will use the tidymodels
framework, first creating an initial split of training and testing data
and then 10-fold cross-validation folds, both stratified by region.
set.seed(1234)
distr_split <- initial_split(data, strata = region)
distr_train <- training(distr_split)
distr_test <- testing(distr_split)
set.seed(4321)
distr_folds <- vfold_cv(distr_train, v = 10, strata = region)

We can now create a recipe, or preprocessor, which builds two-way interactions among the numeric predictors, creates dummy variables for our nominal variable(s), centers and scales the predictors, and removes near-zero-variance predictors both before and after those transformations.
distr_rec <-
recipe(grad_rate_midpt ~ ., data = distr_train) %>%
update_role(dist, new_role = "ID") %>%
step_interact(~ all_numeric_predictors():all_numeric_predictors()) %>%
step_nzv(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_center(all_numeric_predictors()) %>%
step_nzv(all_numeric_predictors())
distr_rec %>%
prep() %>%
bake(new_data = NULL)

## # A tibble: 7,642 × 140
## dist children pct_pov pct_SP pct_HHVJ pct_CC pct_NCI pct_CD pct_CLI
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Casey-Westfi… -0.247 -0.199 1.12 -0.505 -0.758 -0.608 2.23 -0.492
## 2 Blue Ridge C… -0.266 -0.544 0.265 1.42 -0.259 -0.105 -1.19 -0.492
## 3 Prairie Cent… -0.191 -0.588 -0.259 -0.709 -0.402 -0.0651 -0.0501 -0.0603
## 4 Eastland Com… -0.270 -0.438 -0.839 -1.10 -0.822 -0.857 0.0936 -0.492
## 5 Goreville Co… -0.268 -0.651 -0.427 -1.55 -0.719 -0.435 -1.25 -0.492
## 6 Joppa-Maple … -0.283 0.822 -0.526 -0.498 1.50 3.37 -0.258 -0.492
## 7 Farmington C… -0.222 -0.645 -0.460 0.536 -0.331 -0.562 0.268 -0.492
## 8 Gallatin Com… -0.260 1.07 1.23 -0.939 -0.356 -0.283 2.54 -0.492
## 9 Georgetown-R… -0.240 0.565 0.110 0.116 -0.520 0.153 -0.0723 -0.492
## 10 Norris City-… -0.256 0.201 -0.340 -1.21 -0.593 -0.238 1.12 -0.492
## # … with 7,632 more rows, and 131 more variables: pct_hisp_latino <dbl>,
## # pct_white <dbl>, pct_black <dbl>, fed_per_child <dbl>,
## # state_per_child <dbl>, local_per_child <dbl>, grad_rate_midpt <dbl>,
## # children_x_pct_pov <dbl>, children_x_pct_SP <dbl>,
## # children_x_pct_HHVJ <dbl>, children_x_pct_CC <dbl>,
## # children_x_pct_NCI <dbl>, children_x_pct_CD <dbl>,
## # children_x_pct_CLI <dbl>, children_x_pct_hisp_latino <dbl>, …
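Once prepped, the recipe's `tidy()` method lists each step in order and, given a step number, the terms it affected; for example, which columns the first `step_nzv()` removed (a quick inspection, not part of the modeling pipeline):

```r
prepped <- prep(distr_rec)
tidy(prepped)              # one row per step, in execution order
tidy(prepped, number = 2)  # columns removed by the first step_nzv()
```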
We'll now create some model specifications, which we will later combine into a workflow set. We'll make the following models (engine in parentheses):

- linear regression (`lm`)
- lasso regression (`glmnet`)
- partial least squares (`mixOmics`)
- MARS (`earth`)
- linear support vector machine (`kernlab`)
- decision tree (`rpart`)
- random forest (`ranger`)
- boosted trees (`xgboost`)
- k-nearest neighbors (`kknn`)

lm_spec <-
linear_reg() %>%
set_engine('lm')
lasso_spec <-
linear_reg(
mixture = 1,
penalty = tune()
) %>%
set_engine('glmnet')
pls_spec <-
pls(
predictor_prop = tune(),
num_comp = tune()
) %>%
set_engine('mixOmics') %>%
set_mode('regression')
mars_spec <-
mars(
prod_degree = tune()
) %>%
set_engine('earth') %>%
set_mode('regression')
svm_spec <-
svm_linear(
cost = tune(),
margin = tune()
) %>%
set_engine('kernlab') %>%
set_mode('regression')
dtree_spec <-
decision_tree(
tree_depth = tune(),
min_n = tune(),
cost_complexity = tune()
) %>%
set_engine('rpart') %>%
set_mode('regression')
rf_spec <-
rand_forest(
trees = 1000,
mtry = tune(),
min_n = tune()
) %>%
set_engine('ranger', num.threads = cores, importance = "impurity") %>%
set_mode('regression')
xgb_spec <-
boost_tree(
trees = 1000,
tree_depth = tune(),
min_n = tune(),
mtry = tune(),
sample_size = tune(),
learn_rate = tune()
) %>%
set_engine("xgboost") %>%
set_mode("regression")
# knn_spec <-
# nearest_neighbor(
# neighbors = tune(),
# weight_func = tune(),
# dist_power = tune()
# ) %>%
# set_engine('kknn') %>%
# set_mode('regression')

Now that we have our model specifications, we can put all of these into a workflow set.
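As an aside, before mapping over the set it can help to confirm which arguments are actually flagged for tuning; `extract_parameter_set_dials()` lists them for any spec (shown for the boosted tree as a quick check, not part of the original pipeline):

```r
# Which xgboost arguments did we mark with tune()?
extract_parameter_set_dials(xgb_spec)
# mtry has an unknown upper bound here; tune_grid() finalizes it from the
# training data when building the grid.
```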
distr_workflowset <-
workflow_set(
preproc = list("rec" = distr_rec),
models = list(
"lm" = lm_spec,
"lasso" = lasso_spec,
# "pls" = pls_spec,
"mars" = mars_spec,
"svm" = svm_spec,
"dtree" = dtree_spec,
"rf" = rf_spec,
"xgboost" = xgb_spec
# "knn" = knn_spec
)
) %>%
mutate(wflow_id = wflow_id %>% str_remove_all("rec_"))

## mutate: changed 7 values (100%) of 'wflow_id' (0 new NA)
distr_workflowset

## # A workflow set/tibble: 7 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 lm <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 lasso <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 svm <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 dtree <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 rf <tibble [1 × 4]> <opts[0]> <list [0]>
## 7 xgboost <tibble [1 × 4]> <opts[0]> <list [0]>
We can now tune our models:
distr_grid_ctrl <-
control_grid(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
doParallel::registerDoParallel(cores = cores)
distr_tune <-
distr_workflowset %>%
workflow_map("tune_grid", seed = 2314,
resamples = distr_folds,
grid = 30,
control = distr_grid_ctrl,
verbose = TRUE)
saveRDS(distr_tune, paste0("trained_models/", Sys.Date(), "_distr_tune.rds"))

i 1 of 7 resampling: lm
✔ 1 of 7 resampling: lm (7.2s)
i 2 of 7 tuning: lasso
✔ 2 of 7 tuning: lasso (7.5s)
i 3 of 7 tuning: mars
✔ 3 of 7 tuning: mars (31.5s)
i 4 of 7 tuning: svm
✔ 4 of 7 tuning: svm (1h 32m 55.8s)
i 5 of 7 tuning: dtree
✔ 5 of 7 tuning: dtree (4m 18.9s)
i 6 of 7 tuning: rf
✔ 6 of 7 tuning: rf (2h 35m 19.7s)
i 7 of 7 tuning: xgboost
✔ 7 of 7 tuning: xgboost (32m 56.5s)
distr_tune <- readRDS("trained_models/2022-05-25_distr_tune.rds")

autoplot(distr_tune, select_best = TRUE)

(ranks <- distr_tune %>%
rank_results(select_best = TRUE) %>%
select(-std_err) %>%
pivot_wider(names_from = .metric, values_from = mean) %>%
select(wflow_id, rank, rmse, rsq))

## select: dropped one variable (std_err)
## pivot_wider: reorganized (.metric, mean) into (rmse, rsq) [was 14x8, now 7x8]
## select: dropped 4 variables (.config, n, preprocessor, model)
## # A tibble: 7 × 4
## wflow_id rank rmse rsq
## <chr> <int> <dbl> <dbl>
## 1 xgboost 1 5.74 0.455
## 2 rf 2 5.79 0.445
## 3 mars 3 6.12 0.385
## 4 lasso 4 6.36 0.332
## 5 dtree 5 6.40 0.323
## 6 lm 6 6.45 0.320
## 7 svm 7 6.47 0.325
(best_wflow_id <- ranks %>%
head(1) %>% pull(wflow_id))

## [1] "xgboost"
best_results <-
distr_tune %>%
extract_workflow_set_result(best_wflow_id) %>%
select_best(metric = "rmse")
rf_results <-
distr_tune %>%
extract_workflow_set_result("rf") %>%
select_best(metric = "rmse")
best_results

## # A tibble: 1 × 6
## mtry min_n tree_depth learn_rate sample_size .config
## <int> <int> <int> <dbl> <dbl> <chr>
## 1 19 13 11 0.00565 0.960 Preprocessor1_Model10
rf_results

## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 104 13 Preprocessor1_Model23
test_results <-
distr_tune %>%
extract_workflow(best_wflow_id) %>%
finalize_workflow(best_results) %>%
last_fit(split = distr_split)
rf_test_results <-
distr_tune %>%
extract_workflow("rf") %>%
finalize_workflow(rf_results) %>%
last_fit(split = distr_split)
collect_metrics(test_results)

## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 5.98 Preprocessor1_Model1
## 2 rsq standard 0.450 Preprocessor1_Model1
collect_metrics(rf_test_results)

## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 6.06 Preprocessor1_Model1
## 2 rsq standard 0.435 Preprocessor1_Model1
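The same numbers can be recovered by hand from the held-out predictions, a useful cross-check that `last_fit()` really evaluated on the test set:

```r
# Manual cross-check of the test metrics (returns rmse, rsq, and mae).
test_results %>%
  collect_predictions() %>%
  yardstick::metrics(truth = grad_rate_midpt, estimate = .pred)
```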
test_results %>%
collect_predictions() %>%
ggplot(aes(x = grad_rate_midpt, y = .pred)) +
geom_abline(color = "gray50", lty = 2) +
geom_point(alpha = 0.5) +
coord_obs_pred() +
labs(x = "observed graduation rate", y = "predicted graduation rate",
title = paste0("Predicted vs. Observed Graduation Rate with ", best_wflow_id))

library(vip)

##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
rf_test_results %>%
extract_fit_parsnip() %>%
vip(num_features = 20)
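A possible final step, not part of the original pipeline: extract and persist the fitted workflow from `last_fit()` so the winning model can score new districts without re-tuning (file name illustrative):

```r
# Save the finalized, fitted workflow for later prediction.
final_wflow <- extract_workflow(test_results)
saveRDS(final_wflow,
        paste0("trained_models/", Sys.Date(), "_final_", best_wflow_id, ".rds"))
# Later: predict(readRDS("..."), new_data = new_districts)
# For xgboost engines, the bundle package can give a more portable serialization.
```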