<- read_csv("../data/grad.csv",
grad_data show_col_types = FALSE) %>%
mutate(leaid = as.integer(leaid))
## mutate: converted 'leaid' from double to integer (0 new NA)
<- read_csv("../data/hh.csv",
hh_data show_col_types = FALSE) %>%
mutate(leaid = as.integer(leaid))
## mutate: converted 'leaid' from double to integer (0 new NA)
<- read_csv("../data/race.csv",
race_data show_col_types = FALSE) %>%
mutate(leaid = as.integer(leaid))
## mutate: converted 'leaid' from double to integer (0 new NA)
Let’s take a look at the variables in each of these data sets:
%>% glimpse() grad_data
## Rows: 12,663
## Columns: 2
## $ leaid <int> 100005, 100006, 100007, 100008, 100011, 100012, 100013…
## $ grad_rate_midpt <dbl> 90.62127, 88.86380, 92.01013, 95.81859, 90.77311, 86.4…
%>% glimpse() race_data
## Rows: 11,910
## Columns: 66
## $ geo_id <chr> "97000US2700106", "97000US4500690", "97000US5…
## $ dist <chr> "A.C.G.C. Public School District", "Abbeville…
## $ leaid <int> 2700106, 4500690, 5500030, 4807380, 2800360, …
## $ year <chr> "2014-2018", "2014-2018", "2014-2018", "2014-…
## $ total_pop_est <dbl> 885, 3420, 685, 190, 1405, 3480, 745, 805, 15…
## $ total_pop_moe <dbl> 105, 223, 124, 58, 233, 347, 196, 166, 602, 1…
## $ total_hisp_latino <dbl> 50, 25, 285, 4, 10, 980, 365, 515, 6515, 60, …
## $ total_hisp_latino_moe <dbl> 35, 37, 83, 6, 18, 249, 138, 144, 501, 51, 93…
## $ pct_hisp_latino <dbl> 5.6, 0.7, 41.6, 2.1, 0.7, 28.2, 49.0, 64.0, 4…
## $ pct_hisp_latino_moe <dbl> 4.0, 1.1, 9.0, 3.8, 1.3, 6.3, 14.7, 11.4, 2.4…
## $ total_mexican <dbl> 50, 4, 275, 4, 10, 795, 365, 485, 6045, 40, 5…
## $ total_mexican_moe <dbl> 34, 12, 82, 6, 18, 237, 138, 146, 504, 45, 68…
## $ pct_mexican <dbl> 5.6, 0.1, 40.1, 2.1, 0.7, 22.8, 49.0, 60.2, 3…
## $ pct_mexican_moe <dbl> 3.8, 0.4, 9.0, 3.8, 1.3, 6.5, 14.7, 11.9, 2.6…
## $ total_puertrican <dbl> 0, 20, 4, 0, 0, 0, 0, 4, 170, 0, 15, 10, 10, …
## $ total_puertrican_moe <dbl> 12, 33, 12, 13, 18, 22, 12, 7, 132, 15, 16, 1…
## $ pct_puertrican <dbl> 0.0, 0.6, 0.6, 0.0, 0.0, 0.0, 0.0, 0.5, 1.1, …
## $ pct_puertrican_moe <dbl> 2.0, 1.0, 2.9, 18.9, 2.6, 0.9, 4.3, 0.9, 0.9,…
## $ total_cuban <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0,…
## $ total_cuban_moe <dbl> 12, 23, 9, 13, 18, 22, 12, 13, 31, 28, 17, 11…
## $ pct_cuban <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ pct_cuban_moe <dbl> 2.0, 1.0, 2.5, 18.9, 2.6, 0.9, 4.3, 4.8, 0.3,…
## $ total_other_hl <dbl> 4, 0, 0, 0, 0, 185, 0, 20, 305, 0, 80, 0, 4, …
## $ total_other_hl_moe <dbl> 3, 23, 9, 13, 18, 169, 12, 22, 156, 15, 71, 1…
## $ pct_other_hl <dbl> 0.5, 0.0, 0.0, 0.0, 0.0, 5.3, 0.0, 2.5, 2.0, …
## $ pct_other_hl_moe <dbl> 1.0, 1.0, 2.5, 18.9, 2.6, 4.7, 4.3, 2.8, 1.0,…
## $ total_NOT_hl <dbl> 830, 3395, 405, 185, 1395, 2500, 380, 295, 88…
## $ total_NOT_hlmoe <dbl> 95, 220, 93, 57, 235, 315, 155, 108, 444, 165…
## $ pct_NOT_hl <dbl> 93.8, 99.3, 59.1, 97.4, 99.3, 71.8, 51.0, 36.…
## $ pct_NOT_hl_moe <dbl> 3.4, 1.1, 9.1, 3.3, 1.3, 6.3, 14.7, 11.4, 2.4…
## $ total_white <dbl> 830, 2050, 380, 185, 140, 2010, 285, 255, 603…
## $ total_white_moe <dbl> 95, 184, 88, 57, 154, 321, 118, 105, 448, 168…
## $ pct_white <dbl> 93.8, 59.9, 55.5, 97.4, 10.0, 57.8, 38.3, 31.…
## $ pct_white_moe <dbl> 3.5, 2.7, 8.9, 3.3, 10.0, 7.1, 13.1, 11.3, 2.…
## $ total_black <dbl> 0, 1120, 0, 0, 1225, 4, 0, 0, 2110, 45, 85, 4…
## $ total_black_moe <dbl> 12, 168, 9, 13, 185, 3, 12, 13, 225, 42, 58, …
## $ pct_black <dbl> 0.0, 32.7, 0.0, 0.0, 87.2, 0.1, 0.0, 0.0, 13.…
## $ pct_black_moe <dbl> 2.0, 4.7, 2.5, 18.9, 10.0, 0.4, 4.3, 4.8, 1.5…
## $ total_native <dbl> 0, 0, 0, 0, 0, 115, 0, 0, 65, 0, 0, 10, 0, 0,…
## $ total_native_moe <dbl> 12, 23, 9, 13, 18, 81, 12, 13, 26, 15, 17, 20…
## $ pct_native <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 3.3, 0.0, 0.0, 0.4, …
## $ pct_native_moe <dbl> 2.0, 1.0, 2.5, 18.9, 2.6, 2.4, 4.3, 4.8, 0.2,…
## $ total_asian <dbl> 0, 4, 4, 0, 0, 60, 15, 0, 185, 0, 55, 0, 90, …
## $ total_asian_moe <dbl> 12, 7, 3, 13, 18, 47, 33, 13, 108, 15, 34, 11…
## $ pct_asian <dbl> 0.0, 0.1, 0.6, 0.0, 0.0, 1.7, 2.0, 0.0, 1.2, …
## $ pct_asian_moe <dbl> 2.0, 0.2, 1.1, 18.9, 2.6, 1.3, 4.2, 4.8, 0.7,…
## $ total_PI <dbl> 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 10, 0,…
## $ total_PI_moe <dbl> 12, 23, 9, 13, 18, 3, 12, 13, 31, 15, 17, 11,…
## $ pct_PI <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, …
## $ pct_PI_moe <dbl> 2.0, 1.0, 2.5, 18.9, 2.6, 0.4, 4.3, 4.8, 0.3,…
## $ total_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 35, 0, 0, 0, 0, 0, 4,…
## $ total_other_moe <dbl> 12, 23, 9, 13, 18, 22, 12, 13, 51, 15, 17, 11…
## $ pct_other <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, …
## $ pct_other_moe <dbl> 2.0, 1.0, 2.5, 18.9, 2.6, 0.9, 4.3, 4.8, 0.3,…
## $ total_nonhl_2race <dbl> 4, 220, 20, 0, 30, 315, 75, 35, 465, 0, 0, 10…
## $ total_nonhl_2race_moe <dbl> 3, 127, 20, 13, 38, 139, 79, 29, 168, 15, 17,…
## $ pct_nonhl_2race <dbl> 0.5, 6.4, 2.9, 0.0, 2.1, 9.1, 10.1, 4.3, 3.0,…
## $ pct_nonhl_2race_moe <dbl> 0.8, 3.7, 3.0, 18.9, 2.9, 4.0, 9.8, 3.8, 1.1,…
## $ total_nonhl_2_other <dbl> 0, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, …
## $ total_nonhl_2_other_moe <dbl> 12, 2, 9, 13, 18, 22, 12, 9, 31, 15, 17, 11, …
## $ pct_nonhl_2_other <dbl> 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, …
## $ pct_nonhl_2_other_moe <dbl> 2.0, 0.3, 2.5, 18.9, 2.6, 0.9, 4.3, 1.4, 0.3,…
## $ total_nonhl_2_3other <dbl> 4, 220, 20, 0, 30, 315, 75, 30, 465, 0, 0, 10…
## $ total_nonhl_2_3other_moe <dbl> 3, 127, 20, 13, 38, 139, 79, 26, 168, 15, 17,…
## $ pct_nonhl_2_3other <dbl> 0.5, 6.4, 2.9, 0.0, 2.1, 9.1, 10.1, 3.7, 3.0,…
## $ pct_nonhl_2_3other_moe <dbl> 0.8, 3.7, 3.0, 18.9, 2.9, 4.0, 9.8, 3.4, 1.1,…
Initially, these data were in a different, more raw form, since transformed by the download_grad_data.R
script in the scripts/
directory. One row represented one cohort in one school district in one year, so in order to transform the data, we needed to take weighted averages across all the cohorts in a school district, then across all the years for a school district, in order to make one row equal to one school district in the 2014-2018 time period.
From EDFacts documentation:
“The definition of adjusted four-year cohort graduation rate data provided to the SEAs in the 2008 non-regulatory guidance and for the purposes of submitting data files to EDFacts is ‘the number of students who graduate in four years with a regular high school diploma divided by the number of students who form the adjusted cohort for the graduating class.’ From the beginning of 9th grade (or the earliest high school grade), students who are entering that grade for the first time form a cohort that is “adjusted” by adding any students who subsequently transfer into the cohort and subtracting any students who subsequently transfer out, emigrate to another country, or die.”
In cleaning the data, we ignored all of the columns giving extraneous information, and just included the LEAID, the year, the cohort number, and the grduation rate midpoint.
If the variable cohort_num
represents the number of students in that cohort (rather than the cohort index, per se), then we can add the total number of students in a school district in a year, then take the weighted average of the graduation rates. We can then summarize across cohort, and across year, in order to get the five-year average for the school districts. When summarizing across the years, we also cannot assume that the total number of students across all cohorts is the same, so we also need to take a weighted average across all of the years. This is not too difficult, as we simply need to take column sums in the grouped data frames, create weighting proportions, and sum the product of the weighs with the graduation rates. This is demonstrated below, and the final product is a data frame for which one row is one high school district, with graduation rate data properly averaged for the five-year measurement.
<- grad_data %>%
grad_data_summarized filter(
!is.na(cohort_num), grad_rate_midpt > 0
%>%
)group_by(
leaid, year%>%
) mutate(
cohort_total = sum(cohort_num, na.rm = TRUE),
cohort_weight = cohort_num/cohort_total,
.after = cohort_num
%>%
) summarize( # Weighted Averages within year based on cohort size
cohort_total = max(cohort_total),
grad_rate_midpt = sum(cohort_weight * grad_rate_midpt)
%>%
) group_by(leaid) %>%
mutate(
student_total = sum(cohort_total, na.rm = TRUE),
student_weight = cohort_total/student_total,
.after = cohort_total
%>%
) summarize( # Weighted Averages within district based on total size
student_total = max(student_total),
grad_rate_midpt = sum(student_weight * grad_rate_midpt)
)
Let’s join the graduation data with the race data, and take a look at the school districts that didn’t properly join with the graduation data. We’ll left join the race data with the graduation data.
nrow(race_data)
## [1] 11910
nrow(grad_data)
## [1] 12663
<- hh_data %>%
data_joined left_join(race_data,
by = c("leaid" = "leaid")) %>%
left_join(grad_data,
by = c("leaid" = "leaid"))
## left_join: added 66 columns (dist.x, geo_id, dist.y, year, total_pop_est, …)
## > rows only in x 1,404
## > rows only in y ( 1)
## > matched rows 11,909
## > ========
## > rows total 13,313
## left_join: added one column (grad_rate_midpt)
## > rows only in x 2,550
## > rows only in y ( 1,900)
## > matched rows 10,763
## > ========
## > rows total 13,313
<- hh_data %>%
data_not_joined left_join(race_data,
by = c("leaid" = "leaid")) %>%
anti_join(grad_data,
by = c("leaid" = "leaid")) %>%
select(-ends_with(".y"))
## left_join: added 66 columns (dist.x, geo_id, dist.y, year, total_pop_est, …)
## > rows only in x 1,404
## > rows only in y ( 1)
## > matched rows 11,909
## > ========
## > rows total 13,313
## anti_join: added no columns
## > rows only in x 2,550
## > rows only in y ( 1,900)
## > matched rows (10,763)
## > ========
## > rows total 2,550
## select: dropped one variable (dist.y)
names(data_not_joined) <- names(data_not_joined) %>%
str_remove_all(".x")
Now that we’ve joined our data, let’s take a look at some of the data that wasn’t properly joined. First, let’s look at some of the larger ones.
%>%
data_not_joined is.na() %>%
colSums()
## state leaid dist
## 0 0 0
## children pct_pov pct_SP
## 0 0 0
## SP_MOE pct_HHVJ HHVJ_MOE
## 0 0 0
## pct_CC CC_MOE pct_NCI
## 0 0 0
## nci_MOE pct_CD CD_MOE
## 0 0 0
## pct_CLI CLI_MOE region
## 0 0 0
## geo_id year total_pop_est
## 842 842 842
## total_pop_moe total_hisp_latino total_hisp_latino_moe
## 842 842 842
## pct_hisp_latino pct_hisp_latino_moe total_mican
## 842 842 842
## total_mican_moe pct_mican pct_mican_moe
## 842 842 842
## total_puertrican total_puertrican_moe pct_puertrican
## 842 842 842
## pct_puertrican_moe total_cuban total_cuban_moe
## 842 842 842
## pct_cuban pct_cuban_moe total_other_hl
## 842 842 842
## total_other_hl_moe pct_other_hl pct_other_hl_moe
## 842 842 842
## total_NOT_hl total_NOT_hlmoe pct_NOT_hl
## 842 842 842
## pct_NOT_hl_moe total_white total_white_moe
## 842 842 842
## pct_white pct_white_moe total_black
## 842 842 842
## total_black_moe pct_black pct_black_moe
## 842 842 842
## total_native total_native_moe pct_native
## 842 842 842
## pct_native_moe total_asian total_asian_moe
## 842 842 842
## pct_asian pct_asian_moe total_PI
## 842 842 842
## total_PI_moe pct_PI pct_PI_moe
## 842 842 842
## total_other total_other_moe pct_other
## 842 842 842
## pct_other_moe total_nonhl_2race total_nonhl_2race_moe
## 842 842 842
## pct_nonhl_2race pct_nonhl_2race_moe total_nonhl_2_other
## 842 842 842
## total_nonhl_2_other_moe pct_nonhl_2_other pct_nonhl_2_other_moe
## 842 842 842
## total_nonhl_2_3other total_nonhl_2_3other_moe pct_nonhl_2_3other
## 842 842 842
## pct_nonhl_2_3other_moe
## 842
%>%
data_not_joined arrange(desc(children)) %>%
head(10) %>%
select(dist, children)
## select: dropped 80 variables (state, leaid, pct_pov, pct_SP, SP_MOE, …)
## # A tibble: 10 × 2
## dist children
## <chr> <dbl>
## 1 Chula Vista Elementary School District 56164
## 2 Bakersfield City School District 45236
## 3 Washington Elementary District 41426
## 4 Anaheim Elementary School District 37965
## 5 Ontario-Montclair School District 35615
## 6 Cartwright Elementary District 33532
## 7 Palmdale Elementary School District 30270
## 8 Cupertino Union Elementary School District 29481
## 9 Cajon Valley Union Elementary School District 29397
## 10 Escondido Union Elementary School District 28450
Many of these school districts didn’t join properly because they lack LEA IDs.
Initially upon running this analysis, we observed the New York City Department of Education to be the first and foremost unjoined row. For the sake of data cleanliness, we have since removed this row from the original, cleaned hh.csv
file for the following reason:
“New York – Data for the New York City School District (NCES LEAID ‘3620580’) has been submitted as a supervisory union with 32 subordinate school districts. Each record within this file includes information about the local education agency (LEA) to which the school belongs. The schools included in this file are reported as they were submitted to EDFacts, with associations for all New York City being to these subordinate school districts. All of the subordinate school districts have the name “New York City Geographic District ##” where ## is a number between 1 and 32. If you are interested in aggregating the submitted school level data to the level of the New York City School District, use the names and LEA IDs in the Table 16 to identify the proper records within the data file.”
Unfortunately, our original data set doesn’t distinguish between all of these geographical districts, so we will not provide graduation rate data for this conglomerate of 32 districts.
%>%
data_not_joined ggplot(aes(x = children)) +
scale_x_log10() +
geom_boxplot() +
geom_density()
## Warning: Transformation introduced infinite values in continuous x-axis
## Transformation introduced infinite values in continuous x-axis
## Warning: Removed 46 rows containing non-finite values (stat_boxplot).
## Warning: Removed 46 rows containing non-finite values (stat_density).
About 50% of the school district for which there is no data on graduation rates (didn’t join properly, revealed by anti_join()
) have between about 100 and 1000 students—in other words, these are very small school districts.
It turns out that there was also no graduation rate data for the New York City Department of Education, which is a conglomeration of 32 individual school districts. While we could replace this department of education with its constituent school districts, the data we were given initially in our data set included this row, which is not, in fact, a school district. Similar to the New York Times at some of the newer COVID outbreaks, we will ignore New York City in our analysis.