tidymodels
library(tidyverse, warn.conflicts = F)
library(tidymodels, warn.conflicts = F)
library(tidylog)
library(ggthemes)
theme_set(theme_clean())
source("../scripts/prune_race_variables.R")
cores <- parallel::detectCores()
<- read_csv("../data/hh.csv") %>%
hh mutate(leaid = as.integer(leaid)) %>%
filter(
if_any(ends_with("MOE"),
function(x) {x < 0.5})
%>%
) select(-ends_with("MOE"))
## Rows: 13313 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): state, dist, region
## dbl (15): leaid, children, pct_pov, pct_SP, SP_MOE, pct_HHVJ, HHVJ_MOE, pct_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## mutate: converted 'leaid' from double to integer (0 new NA)
##
## filter: removed 46 rows (<1%), 13,267 rows remaining
##
## select: dropped 6 variables (SP_MOE, HHVJ_MOE, CC_MOE, nci_MOE, CD_MOE, …)
<- read_csv("../data/grad.csv") %>%
grad mutate(leaid = as.integer(leaid))
## Rows: 12663 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): leaid, grad_rate_midpt
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## mutate: converted 'leaid' from double to integer (0 new NA)
<- read_csv("../data/race.csv") %>%
race prune_and_predom() %>%
mutate(leaid = as.integer(leaid),
predom_race = as.character(predom_race))
## Rows: 11910 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): geo_id, dist, year
## dbl (63): leaid, total_pop_est, total_pop_moe, total_hisp_latino, total_hisp...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## select: dropped 59 variables (geo_id, dist, year, total_pop_est, total_pop_moe, …)
##
## mutate: converted 'leaid' from double to integer (0 new NA)
##
## select: dropped one variable (leaid)
##
## mutate: new variable 'predom_race' (factor) with 5 unique values and 0% NA
##
## select: dropped 6 variables (pct_hisp_latino, pct_white, pct_black, pct_native, pct_asian, …)
##
## mutate: converted 'predom_race' from factor to character (0 new NA)
# assess <- read_csv("../data/assess.csv") %>%
# mutate(leaid = as.integer(leaid))
<- read_csv("../data/finance.csv") %>%
finance mutate(leaid = as.integer(leaid))
## Rows: 13314 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): leaid, fed_per_child, state_per_child, local_per_child
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## mutate: converted 'leaid' from double to integer (0 new NA)
data <- hh %>%
  left_join(grad, by = "leaid") %>%
  left_join(race, by = "leaid") %>%
  # left_join(assess, by = "leaid") %>%
  left_join(finance, by = "leaid") %>%
  # drop id columns and clean up any suffixes left behind by the joins
  select(
    -state, -leaid, -ends_with(".y")
  ) %>%
  rename_with(~ str_remove_all(.x, "\\.x$"), ends_with(".x"))
## left_join: added one column (grad_rate_midpt)
## > rows only in x 2,505
## > rows only in y ( 1,901)
## > matched rows 10,762
## > ========
## > rows total 13,267
## left_join: added 7 columns (predom_race, pct_hisp_latino, pct_white, pct_black, pct_native, …)
## > rows only in x 1,358
## > rows only in y ( 1)
## > matched rows 11,909
## > ========
## > rows total 13,267
## left_join: added 3 columns (fed_per_child, state_per_child, local_per_child)
## > rows only in x 0
## > rows only in y ( 47)
## > matched rows 13,267
## > ========
## > rows total 13,267
## select: dropped 2 variables (state, leaid)
nrow(data)
## [1] 13267
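About 3,000 rows contain at least one missing value and are dropped by na.omit() below; most come from districts that failed to match in the grad and race joins above. A quick diagnostic like this (a sketch, not part of the original pipeline) shows which columns drive the missingness:

# Sketch: count NAs per column before rows are dropped
data %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
  arrange(desc(n_missing))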
data <- data %>%
  na.omit()
nrow(data)
## [1] 10192
data %>% skimr::skim()
Name | Piped data |
Number of rows | 10192 |
Number of columns | 21 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 18 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
dist | 0 | 1 | 4 | 68 | 0 | 9733 | 0 |
region | 0 | 1 | 4 | 13 | 0 | 4 | 0 |
predom_race | 0 | 1 | 5 | 15 | 0 | 5 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
children | 0 | 1 | 5066.03 | 15935.34 | 9.00 | 720.00 | 1689.00 | 4108.50 | 724446.00 | ▇▁▁▁▁ |
pct_pov | 0 | 1 | 0.16 | 0.09 | 0.00 | 0.10 | 0.15 | 0.21 | 0.65 | ▇▇▂▁▁ |
pct_SP | 0 | 1 | 0.31 | 0.13 | 0.00 | 0.22 | 0.30 | 0.38 | 0.96 | ▂▇▃▁▁ |
pct_HHVJ | 0 | 1 | 0.26 | 0.05 | 0.00 | 0.23 | 0.26 | 0.29 | 0.57 | ▁▂▇▁▁ |
pct_CC | 0 | 1 | 0.02 | 0.03 | 0.00 | 0.01 | 0.02 | 0.03 | 0.56 | ▇▁▁▁▁ |
pct_NCI | 0 | 1 | 0.14 | 0.11 | 0.00 | 0.06 | 0.12 | 0.19 | 0.88 | ▇▃▁▁▁ |
pct_CD | 0 | 1 | 0.06 | 0.04 | 0.00 | 0.04 | 0.05 | 0.08 | 0.32 | ▇▅▁▁▁ |
pct_CLI | 0 | 1 | 0.02 | 0.04 | 0.00 | 0.00 | 0.00 | 0.02 | 0.66 | ▇▁▁▁▁ |
grad_rate_midpt | 0 | 1 | 86.08 | 7.85 | 3.23 | 82.25 | 87.85 | 91.68 | 97.93 | ▁▁▁▂▇ |
pct_hisp_latino | 0 | 1 | 13.91 | 18.93 | 0.00 | 2.50 | 6.50 | 16.10 | 100.00 | ▇▁▁▁▁ |
pct_white | 0 | 1 | 70.87 | 25.77 | 0.00 | 56.50 | 80.40 | 91.00 | 100.00 | ▁▁▂▃▇ |
pct_black | 0 | 1 | 7.00 | 14.87 | 0.00 | 0.00 | 1.15 | 5.60 | 99.10 | ▇▁▁▁▁ |
pct_native | 0 | 1 | 2.13 | 9.42 | 0.00 | 0.00 | 0.00 | 0.40 | 100.00 | ▇▁▁▁▁ |
pct_asian | 0 | 1 | 1.86 | 4.53 | 0.00 | 0.00 | 0.20 | 1.80 | 65.20 | ▇▁▁▁▁ |
pct_PI | 0 | 1 | 0.06 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 13.10 | ▇▁▁▁▁ |
fed_per_child | 0 | 1 | 991.51 | 1323.87 | 11.51 | 517.14 | 786.57 | 1157.86 | 74305.56 | ▇▁▁▁▁ |
state_per_child | 0 | 1 | 6579.35 | 5971.48 | 275.17 | 4271.43 | 5942.27 | 7851.77 | 393027.78 | ▇▁▁▁▁ |
local_per_child | 0 | 1 | 6210.00 | 5804.47 | 152.86 | 3160.68 | 4832.63 | 7620.01 | 309722.22 | ▇▁▁▁▁ |
Our goal is to create a regression model that predicts graduation rate
from household conditions and race data. We will use the tidymodels
framework for the analysis, first creating an initial split of training
and testing data (stratified by region). We will also create folds for
10-fold cross-validation, again stratified by region.
set.seed(1234)
distr_split <- initial_split(data, strata = region)
distr_train <- training(distr_split)
distr_test <- testing(distr_split)

set.seed(4321)
distr_folds <- vfold_cv(distr_train, v = 10, strata = region)
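As a quick sanity check (not part of the original analysis), we can confirm that stratification kept the region proportions similar across the training and testing sets:

# Sketch: compare region proportions in the training and testing sets
bind_rows(train = distr_train, test = distr_test, .id = "set") %>%
  count(set, region) %>%
  group_by(set) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()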
We can now create a recipe, or preprocessor, that adds pairwise interactions between the numeric predictors, creates dummy variables for our nominal predictor(s), centers and scales the numeric predictors, and removes near-zero-variance predictors.
distr_rec <-
  recipe(grad_rate_midpt ~ ., data = distr_train) %>%
  update_role(dist, new_role = "ID") %>%
  step_interact(~ all_numeric_predictors():all_numeric_predictors()) %>%
  step_nzv(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_center(all_numeric_predictors()) %>%
  step_nzv(all_numeric_predictors())

distr_rec %>%
  prep() %>%
  bake(new_data = NULL)
## # A tibble: 7,642 × 140
## dist children pct_pov pct_SP pct_HHVJ pct_CC pct_NCI pct_CD pct_CLI
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Casey-Westfi… -0.247 -0.199 1.12 -0.505 -0.758 -0.608 2.23 -0.492
## 2 Blue Ridge C… -0.266 -0.544 0.265 1.42 -0.259 -0.105 -1.19 -0.492
## 3 Prairie Cent… -0.191 -0.588 -0.259 -0.709 -0.402 -0.0651 -0.0501 -0.0603
## 4 Eastland Com… -0.270 -0.438 -0.839 -1.10 -0.822 -0.857 0.0936 -0.492
## 5 Goreville Co… -0.268 -0.651 -0.427 -1.55 -0.719 -0.435 -1.25 -0.492
## 6 Joppa-Maple … -0.283 0.822 -0.526 -0.498 1.50 3.37 -0.258 -0.492
## 7 Farmington C… -0.222 -0.645 -0.460 0.536 -0.331 -0.562 0.268 -0.492
## 8 Gallatin Com… -0.260 1.07 1.23 -0.939 -0.356 -0.283 2.54 -0.492
## 9 Georgetown-R… -0.240 0.565 0.110 0.116 -0.520 0.153 -0.0723 -0.492
## 10 Norris City-… -0.256 0.201 -0.340 -1.21 -0.593 -0.238 1.12 -0.492
## # … with 7,632 more rows, and 131 more variables: pct_hisp_latino <dbl>,
## # pct_white <dbl>, pct_black <dbl>, fed_per_child <dbl>,
## # state_per_child <dbl>, local_per_child <dbl>, grad_rate_midpt <dbl>,
## # children_x_pct_pov <dbl>, children_x_pct_SP <dbl>,
## # children_x_pct_HHVJ <dbl>, children_x_pct_CC <dbl>,
## # children_x_pct_NCI <dbl>, children_x_pct_CD <dbl>,
## # children_x_pct_CLI <dbl>, children_x_pct_hisp_latino <dbl>, …
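To see what the trained recipe actually did, e.g. which terms the near-zero-variance filters removed, tidy() on the prepped recipe is one option (a sketch; I'm assuming step numbers count only the step_*() calls, which would make the first step_nzv() step 2):

# Sketch: list the trained steps, then inspect the first step_nzv()
prepped <- prep(distr_rec)
tidy(prepped)              # one row per step
tidy(prepped, number = 2)  # terms removed by the first step_nzv()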
We’ll now create the model specifications that we will later combine into a workflow set, one for each of the following model types (engine package in parentheses):
- linear regression (lm)
- lasso regression (glmnet)
- partial least squares (mixOmics)
- MARS (earth)
- linear SVM (kernlab)
- decision tree (rpart)
- random forest (ranger)
- boosted trees (xgboost)
- k-nearest neighbors (kknn)

lm_spec <-
  linear_reg() %>%
  set_engine('lm')
lasso_spec <-
  linear_reg(
    mixture = 1,
    penalty = tune()
  ) %>%
  set_engine('glmnet')

pls_spec <-
  pls(
    predictor_prop = tune(),
    num_comp = tune()
  ) %>%
  set_engine('mixOmics') %>%
  set_mode('regression')

mars_spec <-
  mars(
    prod_degree = tune()
  ) %>%
  set_engine('earth') %>%
  set_mode('regression')

svm_spec <-
  svm_linear(
    cost = tune(),
    margin = tune()
  ) %>%
  set_engine('kernlab') %>%
  set_mode('regression')

dtree_spec <-
  decision_tree(
    tree_depth = tune(),
    min_n = tune(),
    cost_complexity = tune()
  ) %>%
  set_engine('rpart') %>%
  set_mode('regression')

rf_spec <-
  rand_forest(
    trees = 1000,
    mtry = tune(),
    min_n = tune()
  ) %>%
  set_engine('ranger', num.threads = cores, importance = "impurity") %>%
  set_mode('regression')

xgb_spec <-
  boost_tree(
    trees = 1000,
    tree_depth = tune(),
    min_n = tune(),
    mtry = tune(),
    sample_size = tune(),
    learn_rate = tune()
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
# knn_spec <-
# nearest_neighbor(
# neighbors = tune(),
# weight_func = tune(),
# dist_power = tune()
# ) %>%
# set_engine('kknn') %>%
# set_mode('regression')
Now that we have our model specifications, we can put all of these into a workflow set.
distr_workflowset <-
  workflow_set(
    preproc = list("rec" = distr_rec),
    models = list(
      "lm" = lm_spec,
      "lasso" = lasso_spec,
      # "pls" = pls_spec,
      "mars" = mars_spec,
      "svm" = svm_spec,
      "dtree" = dtree_spec,
      "rf" = rf_spec,
      "xgboost" = xgb_spec
      # "knn" = knn_spec
    )
  ) %>%
  mutate(wflow_id = wflow_id %>% str_remove_all("rec_"))
## mutate: changed 7 values (100%) of 'wflow_id' (0 new NA)
distr_workflowset
## # A workflow set/tibble: 7 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 lm <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 lasso <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 svm <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 dtree <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 rf <tibble [1 × 4]> <opts[0]> <list [0]>
## 7 xgboost <tibble [1 × 4]> <opts[0]> <list [0]>
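Before tuning, any single entry can be pulled out of the set by its id to confirm that the recipe and model spec were paired as intended (a quick inspection, not part of the original pipeline):

# Sketch: inspect one workflow from the set by id
distr_workflowset %>%
  extract_workflow("lasso")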
We can now tune our models:
distr_grid_ctrl <-
  control_grid(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )
doParallel::registerDoParallel(cores = cores)

distr_tune <-
  distr_workflowset %>%
  workflow_map("tune_grid", seed = 2314,
               resamples = distr_folds,
               grid = 30,
               control = distr_grid_ctrl,
               verbose = TRUE)
saveRDS(distr_tune, paste0("trained_models/", Sys.Date(), "_distr_tune.rds"))
i 1 of 7 resampling: lm
✔ 1 of 7 resampling: lm (7.2s)
i 2 of 7 tuning: lasso
✔ 2 of 7 tuning: lasso (7.5s)
i 3 of 7 tuning: mars
✔ 3 of 7 tuning: mars (31.5s)
i 4 of 7 tuning: svm
✔ 4 of 7 tuning: svm (1h 32m 55.8s)
i 5 of 7 tuning: dtree
✔ 5 of 7 tuning: dtree (4m 18.9s)
i 6 of 7 tuning: rf
✔ 6 of 7 tuning: rf (2h 35m 19.7s)
i 7 of 7 tuning: xgboost
✔ 7 of 7 tuning: xgboost (32m 56.5s)
<- readRDS("trained_models/2022-05-25_distr_tune.rds") distr_tune
autoplot(distr_tune, select_best = TRUE)
(ranks <- distr_tune %>%
  rank_results(select_best = TRUE) %>%
  select(-std_err) %>%
  pivot_wider(names_from = .metric, values_from = mean) %>%
  select(wflow_id, rank, rmse, rsq))
## select: dropped one variable (std_err)
## pivot_wider: reorganized (.metric, mean) into (rmse, rsq) [was 14x8, now 7x8]
## select: dropped 4 variables (.config, n, preprocessor, model)
## # A tibble: 7 × 4
## wflow_id rank rmse rsq
## <chr> <int> <dbl> <dbl>
## 1 xgboost 1 5.74 0.455
## 2 rf 2 5.79 0.445
## 3 mars 3 6.12 0.385
## 4 lasso 4 6.36 0.332
## 5 dtree 5 6.40 0.323
## 6 lm 6 6.45 0.320
## 7 svm 7 6.47 0.325
(best_wflow_id <- ranks %>%
  head(1) %>%
  pull(wflow_id))
## [1] "xgboost"
best_results <-
  distr_tune %>%
  extract_workflow_set_result(best_wflow_id) %>%
  select_best(metric = "rmse")

rf_results <-
  distr_tune %>%
  extract_workflow_set_result("rf") %>%
  select_best(metric = "rmse")
best_results
## # A tibble: 1 × 6
## mtry min_n tree_depth learn_rate sample_size .config
## <int> <int> <int> <dbl> <dbl> <chr>
## 1 19 13 11 0.00565 0.960 Preprocessor1_Model10
rf_results
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 104 13 Preprocessor1_Model23
test_results <-
  distr_tune %>%
  extract_workflow(best_wflow_id) %>%
  finalize_workflow(best_results) %>%
  last_fit(split = distr_split)

rf_test_results <-
  distr_tune %>%
  extract_workflow("rf") %>%
  finalize_workflow(rf_results) %>%
  last_fit(split = distr_split)
collect_metrics(test_results)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 5.98 Preprocessor1_Model1
## 2 rsq standard 0.450 Preprocessor1_Model1
collect_metrics(rf_test_results)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 6.06 Preprocessor1_Model1
## 2 rsq standard 0.435 Preprocessor1_Model1
test_results %>%
  collect_predictions() %>%
  ggplot(aes(x = grad_rate_midpt, y = .pred)) +
  geom_abline(color = "gray50", lty = 2) +
  geom_point(alpha = 0.5) +
  coord_obs_pred() +
  labs(x = "observed graduation rate", y = "predicted graduation rate",
       title = paste0("Predicted vs. Observed Graduation Rate with ", best_wflow_id))
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
rf_test_results %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20)
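If a table is preferable to the plot, vi() from the same package returns the importances as a tibble (a sketch using the fitted random forest above):

# Sketch: top 10 impurity-based importances as a tibble
rf_test_results %>%
  extract_fit_parsnip() %>%
  vi() %>%
  slice_max(Importance, n = 10)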