library(tidyverse)
library(readr)
library(corrplot)

Download Packages

finance_data <- read_csv("../data/finance_data.csv")
## Rows: 74772 Columns: 134
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (2): leaid, censusid
## dbl (129): year, fips, rev_total, rev_fed_total, rev_fed_child_nutrition_act...
## lgl   (3): rev_fed_arra, exp_current_arra, outlay_capital_arra
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
grad_raceP_household <- read_csv("../data/grad_raceP_household.csv")
## Rows: 13314 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): school_ID, state, dist, abbr_state, state_code, LEAID, year
## dbl (16): children, grad_rate_midpt, pct_pov, pct_SP, pct_HHVJ, pct_CC, pct_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Average revenue (2014-2017)

Lets filter by revenues that are above 0 and group by LEAID so we can average the total revenue across all four years. Note: the finance data does not include year 2018.

finance_per_student <- finance_data %>%
  filter(rev_total > 0) %>%
  group_by(leaid)%>%
  summarize(rev_total_avg = mean(rev_total))

Join by LEAID

finance_hh_join <- grad_raceP_household %>%
  left_join(finance_per_student, by = c("LEAID" = "leaid"))

finance_hh_join %>%
  is.na()%>%
  colSums
##       school_ID           state            dist      abbr_state      state_code 
##               0               0               0               0               0 
##           LEAID            year        children grad_rate_midpt         pct_pov 
##            1404            1404               0            3113               0 
##          pct_SP        pct_HHVJ          pct_CC         pct_NCI          pct_CD 
##               0               0               0               0               0 
##         pct_CLI pct_hisp_latino       pct_white       pct_black      pct_native 
##               0            1404            1404            1404            1404 
##       pct_asian          pct_PI   student_total   rev_total_avg 
##            1404            1404            3113            1522

There are about 1522 missing total revenues. We would expect 1404 revenue values not to join because we are missing LEAIDs for 1404 districts. However, there are 118 districts that do not have finance information. This number is relatively small, so we don’t need to be super concerned about it.

Total Revenue per Child

Now lets divide the revenue by the number of children in each district. This will give us a more accurate way to compare the districts.

grad_raceP_household_rev <- finance_hh_join %>%
  mutate(rev_per_cap = rev_total_avg/children)

Lets checkout the distribution of revenue per student. It looks like there is an extreme outlier that has almost 800 thousand dollars in funding per student. This makes sense given that there are only 9 children that are documented to attend Boys Ranch Independent School District.

ggplot(grad_raceP_household_rev, aes(x=rev_per_cap)) +
  geom_boxplot()
## Warning: Removed 1522 rows containing non-finite values (stat_boxplot).

 grad_raceP_household_rev %>%
  select(dist, state, children, rev_per_cap)%>%
  arrange(desc(rev_per_cap))
## # A tibble: 13,314 × 4
##    dist                                           state     children rev_per_cap
##    <chr>                                          <chr>        <dbl>       <dbl>
##  1 Boys Ranch Independent School District         Texas            9     777056.
##  2 Galena City School District                    Alaska         119     290910.
##  3 Julian Union Elementary School District        Californ…      270     161892.
##  4 Wink-Loving Independent School District        Texas          221     132308.
##  5 Williamsburg City Public Schools               Virginia      1122     131633.
##  6 Fort Elliott Consolidated Independent School … Texas          150     102787.
##  7 Fremont County School District 24              Wyoming        231      88415.
##  8 Iraan-Sheffield Independent School District    Texas          215      80829.
##  9 Niobrara Public Schools                        Nebraska        79      80256.
## 10 Karnes City Independent School District        Texas          814      78077.
## # … with 13,304 more rows

Lets take the outlier out and see what our distribution looks like.

 grad_raceP_household_rev <- grad_raceP_household_rev %>%
  filter(rev_per_cap < 700000)

ggplot(grad_raceP_household_rev, aes(x=rev_per_cap)) +
  geom_boxplot()

ggplot(grad_raceP_household_rev, aes(x=rev_per_cap)) +
  geom_histogram(bins = 100)

 grad_raceP_household_rev %>%
  select(dist, state, children, rev_per_cap)%>%
  arrange(desc(rev_per_cap))
## # A tibble: 11,791 × 4
##    dist                                           state     children rev_per_cap
##    <chr>                                          <chr>        <dbl>       <dbl>
##  1 Galena City School District                    Alaska         119     290910.
##  2 Julian Union Elementary School District        Californ…      270     161892.
##  3 Wink-Loving Independent School District        Texas          221     132308.
##  4 Williamsburg City Public Schools               Virginia      1122     131633.
##  5 Fort Elliott Consolidated Independent School … Texas          150     102787.
##  6 Fremont County School District 24              Wyoming        231      88415.
##  7 Iraan-Sheffield Independent School District    Texas          215      80829.
##  8 Niobrara Public Schools                        Nebraska        79      80256.
##  9 Karnes City Independent School District        Texas          814      78077.
## 10 Lake and Peninsula Borough School District     Alaska         276      75269.
## # … with 11,781 more rows

This is better, but we still have a significant number of outliers.

It looks like filtering the data to only include districts with more than 10,000 students does not change the nature of the distribution. We can see through the histogram that this distribution is highly skewed to the right.

grad_raceP_household_rev_filtered <- grad_raceP_household_rev %>%
  filter(children < 10000)

ggplot(grad_raceP_household_rev_filtered, aes(x=rev_per_cap)) +
  geom_boxplot()

ggplot(grad_raceP_household_rev_filtered, aes(x=rev_per_cap)) +
  geom_histogram(bins = 100)

Here is the joined data set that includes the HH conditions, race data and total revenue.

 grad_raceP_household_rev %>% 
  write_csv(file = "../data/grad_raceP_household_rev.csv")

I am curious how total revenue per child is correlated with grad rates, hh conditions and race.

Correlations

Race and total revenue per child correlation

 grad_raceP_household_rev_race <- grad_raceP_household_rev %>%
  select(grad_rate_midpt, pct_hisp_latino:pct_PI, rev_per_cap) %>%
  na.omit()

grad_raceP_household_rev_race[,1:ncol(grad_raceP_household_rev_race)] %>% 
    cor() %>%
    corrplot(method = "number")

It looks like revenue per student does not correlate with race at all.

HH conditions and total revenue per child correlation

grad_raceP_household_rev_HHC <- grad_raceP_household_rev %>%
  select(grad_rate_midpt:pct_CLI, rev_per_cap) %>%
  na.omit()

grad_raceP_household_rev_HHC[,1:ncol(grad_raceP_household_rev_HHC)] %>% 
    cor() %>%
    corrplot(method = "number")

It looks like revenue per student does not correlate with household conditions at all. These findings are perplexing. We would have guessed that total revenue per student would be correlated to a certain extent with race or household conditions, or even graduation rates.This makes me wonder if it would make more sense to use a different revenue calculation rather than the total. These are divided by federal revenue, state revenue, and local revenue. Perhaps local or state revenue would be more correlated with our other indicators.

LS0tCnRpdGxlOiAiSm9pbmluZyBSZXZlbnVlIHBlciBDaGlsZCIKYXV0aG9yOiAiSm9obiBHZWlnZXIsIE5vZWwgR29vZHdpbiwgQWJpZ2FpbCBKb3BwYSIKZGF0ZTogImByIFN5cy5EYXRlKClgIgpvdXRwdXQ6IG9wZW5pbnRybzo6bGFiX3JlcG9ydAotLS0KCmBgYHtyIGxvYWQtcGFja2FnZXMsIG1lc3NhZ2U9RkFMU0V9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KHJlYWRyKQpsaWJyYXJ5KGNvcnJwbG90KQpgYGAKCiMjIyBEb3dubG9hZCBQYWNrYWdlcwoKYGBge3J9CmZpbmFuY2VfZGF0YSA8LSByZWFkX2NzdigiLi4vZGF0YS9maW5hbmNlX2RhdGEuY3N2IikKZ3JhZF9yYWNlUF9ob3VzZWhvbGQgPC0gcmVhZF9jc3YoIi4uL2RhdGEvZ3JhZF9yYWNlUF9ob3VzZWhvbGQuY3N2IikKCmBgYAoKCiMjIyBBdmVyYWdlIHJldmVudWUgKDIwMTQtMjAxNykKCkxldHMgZmlsdGVyIGJ5IHJldmVudWVzIHRoYXQgYXJlIGFib3ZlIDAgYW5kIGdyb3VwIGJ5IExFQUlEIHNvIHdlIGNhbiBhdmVyYWdlIHRoZSB0b3RhbCByZXZlbnVlIGFjcm9zcyBhbGwgZm91ciB5ZWFycy4gIE5vdGU6IHRoZSBmaW5hbmNlIGRhdGEgZG9lcyBub3QgaW5jbHVkZSB5ZWFyIDIwMTguIApgYGB7cn0KZmluYW5jZV9wZXJfc3R1ZGVudCA8LSBmaW5hbmNlX2RhdGEgJT4lCiAgZmlsdGVyKHJldl90b3RhbCA+IDApICU+JQogIGdyb3VwX2J5KGxlYWlkKSU+JQogIHN1bW1hcml6ZShyZXZfdG90YWxfYXZnID0gbWVhbihyZXZfdG90YWwpKQoKYGBgCgoKIyMjIEpvaW4gYnkgTEVBSUQKCmBgYHtyfQpmaW5hbmNlX2hoX2pvaW4gPC0gZ3JhZF9yYWNlUF9ob3VzZWhvbGQgJT4lCiAgbGVmdF9qb2luKGZpbmFuY2VfcGVyX3N0dWRlbnQsIGJ5ID0gYygiTEVBSUQiID0gImxlYWlkIikpCgpmaW5hbmNlX2hoX2pvaW4gJT4lCiAgaXMubmEoKSU+JQogIGNvbFN1bXMKYGBgClRoZXJlIGFyZSBhYm91dCAxNTIyIG1pc3NpbmcgdG90YWwgcmV2ZW51ZXMuIFdlIHdvdWxkIGV4cGVjdCAxNDA0IHJldmVudWUgdmFsdWVzIG5vdCB0byBqb2luIGJlY2F1c2Ugd2UgYXJlIG1pc3NpbmcgTEVBSURzIGZvciAxNDA0IGRpc3RyaWN0cy4gSG93ZXZlciwgdGhlcmUgYXJlIDExOCBkaXN0cmljdHMgdGhhdCBkbyBub3QgaGF2ZSBmaW5hbmNlIGluZm9ybWF0aW9uLiBUaGlzIG51bWJlciBpcyByZWxhdGl2ZWx5IHNtYWxsLCBzbyB3ZSBkb24ndCBuZWVkIHRvIGJlIHN1cGVyIGNvbmNlcm5lZCBhYm91dCBpdC4gCgojIyMgVG90YWwgUmV2ZW51ZSBwZXIgQ2hpbGQKCk5vdyBsZXRzIGRpdmlkZSB0aGUgcmV2ZW51ZSBieSB0aGUgbnVtYmVyIG9mIGNoaWxkcmVuIGluIGVhY2ggZGlzdHJpY3QuIFRoaXMgd2lsbCBnaXZlIHVzIGEgbW9yZSBhY2N1cmF0ZSB3YXkgdG8gY29tcGFyZSB0aGUgZGlzdHJpY3RzLiAKYGBge3J9CmdyYWRfcmFjZVBfaG91c2Vob2xkX3JldiA8LSBmaW5hbmNlX2hoX2pvaW4gJT4lCiAgbXV0YXRlKHJldl9wZXJfY2FwID0gcmV2X3RvdGFsX2F2Zy9jaGlsZHJlbikKYGBgCgpMZXRzIGNoZWNrb3V0IHRoZSBkaXN0cmlidXRpb24gb2YgcmV2ZW51ZSBwZXIgc3R1ZGVudC4gSXQgbG9va3MgbGlrZSB0aGVyZSBpcyBhbiBleHRyZW1lIG91dGxpZXIgdGhhdCBoYXMgYWxtb3N0IDgwMCB0aG91c2FuZCBkb2xsYXJzIGluIGZ1bmRpbmcgcGVyIHN0dWRlbnQuIFRoaXMgbWFrZXMgc2Vuc2UgZ2l2ZW4gdGhhdCB0aGVyZSBhcmUgb25seSA5IGNoaWxkcmVuIHRoYXQgYXJlIGRvY3VtZW50ZWQgdG8gYXR0ZW5kIEJveXMgUmFuY2ggSW5kZXBlbmRlbnQgU2Nob29sIERpc3RyaWN0LiAKYGBge3J9CgpnZ3Bsb3QoZ3JhZF9yYWNlUF9ob3VzZWhvbGRfcmV2LCBhZXMoeD1yZXZfcGVyX2NhcCkpICsKICBnZW9tX2JveHBsb3QoKQoKIGdyYWRfcmFjZVBfaG91c2Vob2xkX3JldiAlPiUKICBzZWxlY3QoZGlzdCwgc3RhdGUsIGNoaWxkcmVuLCByZXZfcGVyX2NhcCklPiUKICBhcnJhbmdlKGRlc2MocmV2X3Blcl9jYXApKQoKYGBgCgpMZXRzIHRha2UgdGhlIG91dGxpZXIgb3V0IGFuZCBzZWUgd2hhdCBvdXIgZGlzdHJpYnV0aW9uIGxvb2tzIGxpa2UuIApgYGB7cn0KIGdyYWRfcmFjZVBfaG91c2Vob2xkX3JldiA8LSBncmFkX3JhY2VQX2hvdXNlaG9sZF9yZXYgJT4lCiAgZmlsdGVyKHJldl9wZXJfY2FwIDwgNzAwMDAwKQoKZ2dwbG90KGdyYWRfcmFjZVBfaG91c2Vob2xkX3JldiwgYWVzKHg9cmV2X3Blcl9jYXApKSArCiAgZ2VvbV9ib3hwbG90KCkKCmdncGxvdChncmFkX3JhY2VQX2hvdXNlaG9sZF9yZXYsIGFlcyh4PXJldl9wZXJfY2FwKSkgKwogIGdlb21faGlzdG9ncmFtKGJpbnMgPSAxMDApCgogZ3JhZF9yYWNlUF9ob3VzZWhvbGRfcmV2ICU+JQogIHNlbGVjdChkaXN0LCBzdGF0ZSwgY2hpbGRyZW4sIHJldl9wZXJfY2FwKSU+JQogIGFycmFuZ2UoZGVzYyhyZXZfcGVyX2NhcCkpCmBgYApUaGlzIGlzIGJldHRlciwgYnV0IHdlIHN0aWxsIGhhdmUgYSBzaWduaWZpY2FudCBudW1iZXIgb2Ygb3V0bGllcnMuIAoKSXQgbG9va3MgbGlrZSBmaWx0ZXJpbmcgdGhlIGRhdGEgdG8gb25seSBpbmNsdWRlIGRpc3RyaWN0cyB3aXRoIG1vcmUgdGhhbiAxMCwwMDAgc3R1ZGVudHMgZG9lcyBub3QgY2hhbmdlIHRoZSBuYXR1cmUgb2YgdGhlIGRpc3RyaWJ1dGlvbi4gV2UgY2FuIHNlZSB0aHJvdWdoIHRoZSBoaXN0b2dyYW0gdGhhdCB0aGlzIGRpc3RyaWJ1dGlvbiBpcyBoaWdobHkgc2tld2VkIHRvIHRoZSByaWdodC4gCgpgYGB7cn0KZ3JhZF9yYWNlUF9ob3VzZWhvbGRfcmV2X2ZpbHRlcmVkIDwtIGdyYWRfcmFjZVBfaG91c2Vob2xkX3JldiAlPiUKICBmaWx0ZXIoY2hpbGRyZW4gPCAxMDAwMCkKCmdncGxvdChncmFkX3JhY2VQX2hvdXNlaG9sZF9yZXZfZmlsdGVyZWQsIGFlcyh4PXJldl9wZXJfY2FwKSkgKwogIGdlb21fYm94cGxvdCgpCgpnZ3Bsb3QoZ3JhZF9yYWNlUF9ob3VzZWhvbGRfcmV2X2ZpbHRlcmVkLCBhZXMoeD1yZXZfcGVyX2NhcCkpICsKICBnZW9tX2hpc3RvZ3JhbShiaW5zID0gMTAwKQpgYGAKSGVyZSBpcyB0aGUgam9pbmVkIGRhdGEgc2V0IHRoYXQgaW5jbHVkZXMgdGhlIEhIIGNvbmRpdGlvbnMsIHJhY2UgZGF0YSBhbmQgdG90YWwgcmV2ZW51ZS4gCmBgYHtyfQogZ3JhZF9yYWNlUF9ob3VzZWhvbGRfcmV2ICU+JSAKICB3cml0ZV9jc3YoZmlsZSA9ICIuLi9kYXRhL2dyYWRfcmFjZVBfaG91c2Vob2xkX3Jldi5jc3YiKQpgYGAKCkkgYW0gY3VyaW91cyBob3cgdG90YWwgcmV2ZW51ZSBwZXIgY2hpbGQgaXMgY29ycmVsYXRlZCB3aXRoIGdyYWQgcmF0ZXMsIGhoIGNvbmRpdGlvbnMgYW5kIHJhY2UuICAKCiMjIyBDb3JyZWxhdGlvbnMKCiMjIyMgUmFjZSBhbmQgdG90YWwgcmV2ZW51ZSBwZXIgY2hpbGQgY29ycmVsYXRpb24gCgpgYGB7cn0KIGdyYWRfcmFjZVBfaG91c2Vob2xkX3Jldl9yYWNlIDwtIGdyYWRfcmFjZVBfaG91c2Vob2xkX3JldiAlPiUKICBzZWxlY3QoZ3JhZF9yYXRlX21pZHB0LCBwY3RfaGlzcF9sYXRpbm86cGN0X1BJLCByZXZfcGVyX2NhcCkgJT4lCiAgbmEub21pdCgpCgpncmFkX3JhY2VQX2hvdXNlaG9sZF9yZXZfcmFjZVssMTpuY29sKGdyYWRfcmFjZVBfaG91c2Vob2xkX3Jldl9yYWNlKV0gJT4lIAogICAgY29yKCkgJT4lCiAgICBjb3JycGxvdChtZXRob2QgPSAibnVtYmVyIikKYGBgCkl0IGxvb2tzIGxpa2UgcmV2ZW51ZSBwZXIgc3R1ZGVudCBkb2VzIG5vdCBjb3JyZWxhdGUgd2l0aCByYWNlIGF0IGFsbC4gCgojIyMjIEhIIGNvbmRpdGlvbnMgYW5kIHRvdGFsIHJldmVudWUgcGVyIGNoaWxkIGNvcnJlbGF0aW9uCgpgYGB7cn0KZ3JhZF9yYWNlUF9ob3VzZWhvbGRfcmV2X0hIQyA8LSBncmFkX3JhY2VQX2hvdXNlaG9sZF9yZXYgJT4lCiAgc2VsZWN0KGdyYWRfcmF0ZV9taWRwdDpwY3RfQ0xJLCByZXZfcGVyX2NhcCkgJT4lCiAgbmEub21pdCgpCgpncmFkX3JhY2VQX2hvdXNlaG9sZF9yZXZfSEhDWywxOm5jb2woZ3JhZF9yYWNlUF9ob3VzZWhvbGRfcmV2X0hIQyldICU+JSAKICAgIGNvcigpICU+JQogICAgY29ycnBsb3QobWV0aG9kID0gIm51bWJlciIpCgpgYGAKCgpJdCBsb29rcyBsaWtlIHJldmVudWUgcGVyIHN0dWRlbnQgZG9lcyBub3QgY29ycmVsYXRlIHdpdGggaG91c2Vob2xkIGNvbmRpdGlvbnMgYXQgYWxsLiBUaGVzZSBmaW5kaW5ncyBhcmUgcGVycGxleGluZy4gV2Ugd291bGQgaGF2ZSBndWVzc2VkIHRoYXQgdG90YWwgcmV2ZW51ZSBwZXIgc3R1ZGVudCB3b3VsZCBiZSBjb3JyZWxhdGVkIHRvIGEgY2VydGFpbiBleHRlbnQgd2l0aCByYWNlIG9yIGhvdXNlaG9sZCBjb25kaXRpb25zLCBvciBldmVuIGdyYWR1YXRpb24gcmF0ZXMuVGhpcyBtYWtlcyBtZSB3b25kZXIgaWYgaXQgd291bGQgbWFrZSBtb3JlIHNlbnNlIHRvIHVzZSBhIGRpZmZlcmVudCByZXZlbnVlIGNhbGN1bGF0aW9uIHJhdGhlciB0aGFuIHRoZSB0b3RhbC4gVGhlc2UgYXJlIGRpdmlkZWQgYnkgZmVkZXJhbCByZXZlbnVlLCBzdGF0ZSByZXZlbnVlLCBhbmQgbG9jYWwgcmV2ZW51ZS4gUGVyaGFwcyBsb2NhbCBvciBzdGF0ZSByZXZlbnVlIHdvdWxkIGJlIG1vcmUgY29ycmVsYXRlZCB3aXRoIG91ciBvdGhlciBpbmRpY2F0b3JzLiAgCg==