library(tidyverse)
library(openintro)
library(scales)
library(corrplot)
library(ggthemes)
source("../scripts/prune_race_variables.R")
theme_set(theme_clean())
<- read_csv("../data/assess.csv") assess
## Rows: 13328 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): leaid, read_score, math_score, total_score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- read_csv("../data/hh.csv") %>%
hh select(-contains("MOE"))
## Rows: 13313 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): state, dist, region
## dbl (15): leaid, children, pct_pov, pct_SP, SP_MOE, pct_HHVJ, HHVJ_MOE, pct_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- read_csv("../data/race.csv") %>%
race prune_and_predom() %>%
mutate(leaid = as.integer(leaid),
predom_race = as.character(predom_race))
## Rows: 11910 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): geo_id, dist, year
## dbl (63): leaid, total_pop_est, total_pop_moe, total_hisp_latino, total_hisp...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- read_csv("../data/grad.csv") grad
## Rows: 12663 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): leaid, grad_rate_midpt
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- hh %>%
data left_join(assess, by = "leaid")%>%
left_join(race, by = "leaid") %>%
left_join(grad, by = "leaid")
<- data %>%
data select(leaid,
dist, children,
grad_rate_midpt, starts_with("pct"),
-pct_PI) %>%
mutate(
across(.cols = pct_pov:pct_CLI,
.fns = function(x) {round(x*100, 3)})
)names(data)
## [1] "leaid" "dist" "children" "grad_rate_midpt"
## [5] "pct_pov" "pct_SP" "pct_HHVJ" "pct_CC"
## [9] "pct_NCI" "pct_CD" "pct_CLI" "pct_hisp_latino"
## [13] "pct_white" "pct_black" "pct_native" "pct_asian"
Let’s look at the distributions of all the different variables.
%>%
data pivot_longer(cols = contains("pct"),
names_to = "type",
values_to = "vals") %>%
ggplot(aes(x = vals)) +
geom_histogram() +
facet_wrap(vars(type))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7020 rows containing non-finite values (stat_bin).
%>%
data select(pct_native) %>%
filter(pct_native > 0) %>%
na.omit() %>%
ggplot(aes(x = pct_native)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Because of the heavy right skew of the distribution of Native American proportions across school districts, we will replace this column with an indicator. To account for all the difference races, we can make this indicator column correspond to which race is the predominant race, indicated by a given race having the maximum proportion in the partition of races within a given school district.
<- data %>%
data select(12:16) %>%
mutate(
predom_race = names(.)[max.col(.)],
predom_race = predom_race %>%
str_replace_all("pct_", ""),
predom_race = predom_race %>%
factor(levels = c("hisp_latino", "white",
"black", "native", "asian"),
labels = c("Hispanic/Latino", "White",
"Black", "Native American", "Asian")
)%>%
) select(predom_race) %>%
bind_cols(data) %>%
relocate(predom_race, .after = children)
%>%
data count(predom_race)
## # A tibble: 6 × 2
## predom_race n
## <fct> <int>
## 1 Hispanic/Latino 1213
## 2 White 9917
## 3 Black 530
## 4 Native American 198
## 5 Asian 51
## 6 <NA> 1404
Now that we’ve added an indicator for the predominant race, let’s visualize some of the graduation rates as broken up by that most predominant race.
%>%
data filter(!is.na(predom_race)) %>%
ggplot(aes(
x = grad_rate_midpt,
y = after_stat(density)
+
)) geom_freqpoly(aes(color = predom_race)) +
scale_x_continuous(labels = scales::percent_format(scale = 1)) +
labs(x = "Graduation Rate",
y = "",
title = "Graduation Rate Distribution per Race",
fill = "Predominant Race",
color = "Predominant Race")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1708 rows containing non-finite values (stat_bin).
%>%
data filter(!is.na(predom_race)) %>%
ggplot(aes(
x = grad_rate_midpt,
y = after_stat(density)
+
)) geom_histogram(aes(color = predom_race,
fill = predom_race)) +
geom_freqpoly(aes(color = predom_race)) +
scale_x_continuous(labels = scales::percent_format(scale = 1)) +
labs(x = "Graduation Rate",
y = "",
title = "Graduation Rate Distribution per Race",
fill = "Predominant Race",
color = "Predominant Race")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1708 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1708 rows containing non-finite values (stat_bin).
%>%
data filter(!is.na(predom_race)) %>%
ggplot(aes(
x = grad_rate_midpt,
y = after_stat(density)
+
)) geom_histogram() +
geom_density() +
facet_grid(rows = vars(predom_race)) +
scale_x_continuous(labels = scales::percent_format(scale = 1)) +
labs(x = "Graduation Rate",
y = "",
title = "Graduation Rate Distribution per Race")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1708 rows containing non-finite values (stat_bin).
## Warning: Removed 1708 rows containing non-finite values (stat_density).
Now that we’ve split up our school districts by the most prevalent race in that district, we can now look at the distributions of our household conditions by race to see what we can see.
%>%
data select(!(pct_hisp_latino:last_col())) %>%
pivot_longer(cols = contains("pct"),
names_to = "type",
values_to = "vals") %>%
ggplot(aes(x = vals, y = after_stat(density))) +
geom_histogram() +
facet_grid(rows = vars(type),
cols = vars(predom_race))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- data %>%
hh_conds select(predom_race, pct_pov:pct_CLI) %>%
filter(!is.na(predom_race))
<- hh_conds %>%
races count(predom_race) %>%
pull(predom_race) %>%
as.character()
%>%
hh_conds filter(
== races[1]
predom_race %>%
) select(!predom_race) %>%
cor() %>%
corrplot(method = "number")
For school districts which are primarily Hispanic/Latino, we can see that the following variables are moderately correlated with each other (\(|r| > 0.4\)): - Percent Poverty and Percent Single Parent - Percent Poverty and Percent No Computer/Internet Access - Percent Crowded Conditions and Percent of Linguistically Isolated Children
%>%
hh_conds filter(
== races[2]
predom_race %>%
) select(!predom_race) %>%
cor() %>%
corrplot(method = "number")
For school districts which are primarily White, we can see that the following variables are moderately correlated with each other (\(|r| > 0.4\)): - Percent Poverty and Percent Single Parent - Percent Poverty and Percent No Computer/Internet Access
%>%
hh_conds filter(
== races[3]
predom_race %>%
) select(!predom_race) %>%
cor() %>%
corrplot(method = "number")
For school districts which are primarily Black, we can see that the following variables are moderately correlated with each other (\(|r| > 0.4\)): - Percent Poverty and Percent Single Parent - Percent Poverty and Percent No Computer/Internet Access
%>%
hh_conds filter(
== races[4]
predom_race %>%
) select(!predom_race) %>%
cor() %>%
corrplot(method = "number")
For school districts which are primarily White, we can see that the following variables are moderately correlated with each other (\(|r| > 0.4\)): - Percent Poverty and Percent Crowded Conditions - Percent Poverty and Percent No Computer/Internet Access
%>%
hh_conds filter(
== races[5]
predom_race %>%
) select(!predom_race) %>%
cor() %>%
corrplot(method = "number")
For school districts which are primarily Asian, we can see that almost all of the variables are strongly correlated with one another. This will require some thought for our regression analysis, though it is important to note that there are only 51 school districts recorded which are predominantly Asian.