library(tidyverse)
library(readr)
library(corrplot)
<- read_csv("../data/finance_data.csv") finance_data
## Rows: 74772 Columns: 134
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): leaid, censusid
## dbl (129): year, fips, rev_total, rev_fed_total, rev_fed_child_nutrition_act...
## lgl (3): rev_fed_arra, exp_current_arra, outlay_capital_arra
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- read_csv("../data/grad_raceP_household.csv") grad_raceP_household
## Rows: 13314 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): school_ID, state, dist, abbr_state, state_code, LEAID, year
## dbl (16): children, grad_rate_midpt, pct_pov, pct_SP, pct_HHVJ, pct_CC, pct_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Lets filter by revenues that are above 0 and group by LEAID so we can average the total revenue across all four years. Note: the finance data does not include year 2018.
<- finance_data %>%
finance_per_student filter(rev_total > 0) %>%
group_by(leaid)%>%
summarize(rev_total_avg = mean(rev_total))
<- grad_raceP_household %>%
finance_hh_join left_join(finance_per_student, by = c("LEAID" = "leaid"))
%>%
finance_hh_join is.na()%>%
colSums
## school_ID state dist abbr_state state_code
## 0 0 0 0 0
## LEAID year children grad_rate_midpt pct_pov
## 1404 1404 0 3113 0
## pct_SP pct_HHVJ pct_CC pct_NCI pct_CD
## 0 0 0 0 0
## pct_CLI pct_hisp_latino pct_white pct_black pct_native
## 0 1404 1404 1404 1404
## pct_asian pct_PI student_total rev_total_avg
## 1404 1404 3113 1522
There are about 1522 missing total revenues. We would expect 1404 revenue values not to join because we are missing LEAIDs for 1404 districts. However, there are 118 districts that do not have finance information. This number is relatively small, so we don’t need to be super concerned about it.
Now lets divide the revenue by the number of children in each district. This will give us a more accurate way to compare the districts.
<- finance_hh_join %>%
grad_raceP_household_rev mutate(rev_per_cap = rev_total_avg/children)
Lets checkout the distribution of revenue per student. It looks like there is an extreme outlier that has almost 800 thousand dollars in funding per student. This makes sense given that there are only 9 children that are documented to attend Boys Ranch Independent School District.
ggplot(grad_raceP_household_rev, aes(x=rev_per_cap)) +
geom_boxplot()
## Warning: Removed 1522 rows containing non-finite values (stat_boxplot).
%>%
grad_raceP_household_rev select(dist, state, children, rev_per_cap)%>%
arrange(desc(rev_per_cap))
## # A tibble: 13,314 × 4
## dist state children rev_per_cap
## <chr> <chr> <dbl> <dbl>
## 1 Boys Ranch Independent School District Texas 9 777056.
## 2 Galena City School District Alaska 119 290910.
## 3 Julian Union Elementary School District Californ… 270 161892.
## 4 Wink-Loving Independent School District Texas 221 132308.
## 5 Williamsburg City Public Schools Virginia 1122 131633.
## 6 Fort Elliott Consolidated Independent School … Texas 150 102787.
## 7 Fremont County School District 24 Wyoming 231 88415.
## 8 Iraan-Sheffield Independent School District Texas 215 80829.
## 9 Niobrara Public Schools Nebraska 79 80256.
## 10 Karnes City Independent School District Texas 814 78077.
## # … with 13,304 more rows
Lets take the outlier out and see what our distribution looks like.
<- grad_raceP_household_rev %>%
grad_raceP_household_rev filter(rev_per_cap < 700000)
ggplot(grad_raceP_household_rev, aes(x=rev_per_cap)) +
geom_boxplot()
ggplot(grad_raceP_household_rev, aes(x=rev_per_cap)) +
geom_histogram(bins = 100)
%>%
grad_raceP_household_rev select(dist, state, children, rev_per_cap)%>%
arrange(desc(rev_per_cap))
## # A tibble: 11,791 × 4
## dist state children rev_per_cap
## <chr> <chr> <dbl> <dbl>
## 1 Galena City School District Alaska 119 290910.
## 2 Julian Union Elementary School District Californ… 270 161892.
## 3 Wink-Loving Independent School District Texas 221 132308.
## 4 Williamsburg City Public Schools Virginia 1122 131633.
## 5 Fort Elliott Consolidated Independent School … Texas 150 102787.
## 6 Fremont County School District 24 Wyoming 231 88415.
## 7 Iraan-Sheffield Independent School District Texas 215 80829.
## 8 Niobrara Public Schools Nebraska 79 80256.
## 9 Karnes City Independent School District Texas 814 78077.
## 10 Lake and Peninsula Borough School District Alaska 276 75269.
## # … with 11,781 more rows
This is better, but we still have a significant number of outliers.
It looks like filtering the data to only include districts with more than 10,000 students does not change the nature of the distribution. We can see through the histogram that this distribution is highly skewed to the right.
<- grad_raceP_household_rev %>%
grad_raceP_household_rev_filtered filter(children < 10000)
ggplot(grad_raceP_household_rev_filtered, aes(x=rev_per_cap)) +
geom_boxplot()
ggplot(grad_raceP_household_rev_filtered, aes(x=rev_per_cap)) +
geom_histogram(bins = 100)
Here is the joined data set that includes the HH conditions, race data and total revenue.
%>%
grad_raceP_household_rev write_csv(file = "../data/grad_raceP_household_rev.csv")
I am curious how total revenue per child is correlated with grad rates, hh conditions and race.
<- grad_raceP_household_rev %>%
grad_raceP_household_rev_race select(grad_rate_midpt, pct_hisp_latino:pct_PI, rev_per_cap) %>%
na.omit()
1:ncol(grad_raceP_household_rev_race)] %>%
grad_raceP_household_rev_race[,cor() %>%
corrplot(method = "number")
It looks like revenue per student does not correlate with race at all.
<- grad_raceP_household_rev %>%
grad_raceP_household_rev_HHC select(grad_rate_midpt:pct_CLI, rev_per_cap) %>%
na.omit()
1:ncol(grad_raceP_household_rev_HHC)] %>%
grad_raceP_household_rev_HHC[,cor() %>%
corrplot(method = "number")
It looks like revenue per student does not correlate with household conditions at all. These findings are perplexing. We would have guessed that total revenue per student would be correlated to a certain extent with race or household conditions, or even graduation rates.This makes me wonder if it would make more sense to use a different revenue calculation rather than the total. These are divided by federal revenue, state revenue, and local revenue. Perhaps local or state revenue would be more correlated with our other indicators.