Data Import and Glimpsing

grad_data <- read_csv("../data/grad.csv", 
                       show_col_types = FALSE) %>%
    mutate(leaid = as.integer(leaid))

## mutate: converted 'leaid' from double to integer (0 new NA)

hh_data <- read_csv("../data/hh.csv", 
                    show_col_types = FALSE) %>%
    mutate(leaid = as.integer(leaid))

## mutate: converted 'leaid' from double to integer (0 new NA)

race_data <- read_csv("../data/race.csv", 
                       show_col_types = FALSE) %>%
    mutate(leaid = as.integer(leaid))

## mutate: converted 'leaid' from double to integer (0 new NA)

Let’s take a look at the variables in each of these data sets:

grad_data %>% glimpse()

## Rows: 12,663
## Columns: 2
## $ leaid           <int> 100005, 100006, 100007, 100008, 100011, 100012, 100013…
## $ grad_rate_midpt <dbl> 90.62127, 88.86380, 92.01013, 95.81859, 90.77311, 86.4…

race_data %>% glimpse()

## Rows: 11,910
## Columns: 66
## $ geo_id                   <chr> "97000US2700106", "97000US4500690", "97000US5…
## $ dist                     <chr> "A.C.G.C. Public School District", "Abbeville…
## $ leaid                    <int> 2700106, 4500690, 5500030, 4807380, 2800360, …
## $ year                     <chr> "2014-2018", "2014-2018", "2014-2018", "2014-…
## $ total_pop_est            <dbl> 885, 3420, 685, 190, 1405, 3480, 745, 805, 15…
## $ total_pop_moe            <dbl> 105, 223, 124, 58, 233, 347, 196, 166, 602, 1…
## $ total_hisp_latino        <dbl> 50, 25, 285, 4, 10, 980, 365, 515, 6515, 60, …
## $ total_hisp_latino_moe    <dbl> 35, 37, 83, 6, 18, 249, 138, 144, 501, 51, 93…
## $ pct_hisp_latino          <dbl> 5.6, 0.7, 41.6, 2.1, 0.7, 28.2, 49.0, 64.0, 4…
## $ pct_hisp_latino_moe      <dbl> 4.0, 1.1, 9.0, 3.8, 1.3, 6.3, 14.7, 11.4, 2.4…
## $ total_mexican            <dbl> 50, 4, 275, 4, 10, 795, 365, 485, 6045, 40, 5…
## $ total_mexican_moe        <dbl> 34, 12, 82, 6, 18, 237, 138, 146, 504, 45, 68…
## $ pct_mexican              <dbl> 5.6, 0.1, 40.1, 2.1, 0.7, 22.8, 49.0, 60.2, 3…
## $ pct_mexican_moe          <dbl> 3.8, 0.4, 9.0, 3.8, 1.3, 6.5, 14.7, 11.9, 2.6…
## $ total_puertrican         <dbl> 0, 20, 4, 0, 0, 0, 0, 4, 170, 0, 15, 10, 10, …
## $ total_puertrican_moe     <dbl> 12, 33, 12, 13, 18, 22, 12, 7, 132, 15, 16, 1…
## $ pct_puertrican           <dbl> 0.0, 0.6, 0.6, 0.0, 0.0, 0.0, 0.0, 0.5, 1.1, …
## $ pct_puertrican_moe       <dbl> 2.0, 1.0, 2.9, 18.9, 2.6, 0.9, 4.3, 0.9, 0.9,…
## $ total_cuban              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0,…
## $ total_cuban_moe          <dbl> 12, 23, 9, 13, 18, 22, 12, 13, 31, 28, 17, 11…
## $ pct_cuban                <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ pct_cuban_moe            <dbl> 2.0, 1.0, 2.5, 18.9, 2.6, 0.9, 4.3, 4.8, 0.3,…
## $ total_other_hl           <dbl> 4, 0, 0, 0, 0, 185, 0, 20, 305, 0, 80, 0, 4, …
## $ total_other_hl_moe       <dbl> 3, 23, 9, 13, 18, 169, 12, 22, 156, 15, 71, 1…
## $ pct_other_hl             <dbl> 0.5, 0.0, 0.0, 0.0, 0.0, 5.3, 0.0, 2.5, 2.0, …
## $ pct_other_hl_moe         <dbl> 1.0, 1.0, 2.5, 18.9, 2.6, 4.7, 4.3, 2.8, 1.0,…
## $ total_NOT_hl             <dbl> 830, 3395, 405, 185, 1395, 2500, 380, 295, 88…
## $ total_NOT_hlmoe          <dbl> 95, 220, 93, 57, 235, 315, 155, 108, 444, 165…
## $ pct_NOT_hl               <dbl> 93.8, 99.3, 59.1, 97.4, 99.3, 71.8, 51.0, 36.…
## $ pct_NOT_hl_moe           <dbl> 3.4, 1.1, 9.1, 3.3, 1.3, 6.3, 14.7, 11.4, 2.4…
## $ total_white              <dbl> 830, 2050, 380, 185, 140, 2010, 285, 255, 603…
## $ total_white_moe          <dbl> 95, 184, 88, 57, 154, 321, 118, 105, 448, 168…
## $ pct_white                <dbl> 93.8, 59.9, 55.5, 97.4, 10.0, 57.8, 38.3, 31.…
## $ pct_white_moe            <dbl> 3.5, 2.7, 8.9, 3.3, 10.0, 7.1, 13.1, 11.3, 2.…
## $ total_black              <dbl> 0, 1120, 0, 0, 1225, 4, 0, 0, 2110, 45, 85, 4…
## $ total_black_moe          <dbl> 12, 168, 9, 13, 185, 3, 12, 13, 225, 42, 58, …
## $ pct_black                <dbl> 0.0, 32.7, 0.0, 0.0, 87.2, 0.1, 0.0, 0.0, 13.…
## $ pct_black_moe            <dbl> 2.0, 4.7, 2.5, 18.9, 10.0, 0.4, 4.3, 4.8, 1.5…
## $ total_native             <dbl> 0, 0, 0, 0, 0, 115, 0, 0, 65, 0, 0, 10, 0, 0,…
## $ total_native_moe         <dbl> 12, 23, 9, 13, 18, 81, 12, 13, 26, 15, 17, 20…
## $ pct_native               <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 3.3, 0.0, 0.0, 0.4, …
## $ pct_native_moe           <dbl> 2.0, 1.0, 2.5, 18.9, 2.6, 2.4, 4.3, 4.8, 0.2,…
## $ total_asian              <dbl> 0, 4, 4, 0, 0, 60, 15, 0, 185, 0, 55, 0, 90, …
## $ total_asian_moe          <dbl> 12, 7, 3, 13, 18, 47, 33, 13, 108, 15, 34, 11…
## $ pct_asian                <dbl> 0.0, 0.1, 0.6, 0.0, 0.0, 1.7, 2.0, 0.0, 1.2, …
## $ pct_asian_moe            <dbl> 2.0, 0.2, 1.1, 18.9, 2.6, 1.3, 4.2, 4.8, 0.7,…
## $ total_PI                 <dbl> 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 10, 0,…
## $ total_PI_moe             <dbl> 12, 23, 9, 13, 18, 3, 12, 13, 31, 15, 17, 11,…
## $ pct_PI                   <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, …
## $ pct_PI_moe               <dbl> 2.0, 1.0, 2.5, 18.9, 2.6, 0.4, 4.3, 4.8, 0.3,…
## $ total_other              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 35, 0, 0, 0, 0, 0, 4,…
## $ total_other_moe          <dbl> 12, 23, 9, 13, 18, 22, 12, 13, 51, 15, 17, 11…
## $ pct_other                <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, …
## $ pct_other_moe            <dbl> 2.0, 1.0, 2.5, 18.9, 2.6, 0.9, 4.3, 4.8, 0.3,…
## $ total_nonhl_2race        <dbl> 4, 220, 20, 0, 30, 315, 75, 35, 465, 0, 0, 10…
## $ total_nonhl_2race_moe    <dbl> 3, 127, 20, 13, 38, 139, 79, 29, 168, 15, 17,…
## $ pct_nonhl_2race          <dbl> 0.5, 6.4, 2.9, 0.0, 2.1, 9.1, 10.1, 4.3, 3.0,…
## $ pct_nonhl_2race_moe      <dbl> 0.8, 3.7, 3.0, 18.9, 2.9, 4.0, 9.8, 3.8, 1.1,…
## $ total_nonhl_2_other      <dbl> 0, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, …
## $ total_nonhl_2_other_moe  <dbl> 12, 2, 9, 13, 18, 22, 12, 9, 31, 15, 17, 11, …
## $ pct_nonhl_2_other        <dbl> 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, …
## $ pct_nonhl_2_other_moe    <dbl> 2.0, 0.3, 2.5, 18.9, 2.6, 0.9, 4.3, 1.4, 0.3,…
## $ total_nonhl_2_3other     <dbl> 4, 220, 20, 0, 30, 315, 75, 30, 465, 0, 0, 10…
## $ total_nonhl_2_3other_moe <dbl> 3, 127, 20, 13, 38, 139, 79, 26, 168, 15, 17,…
## $ pct_nonhl_2_3other       <dbl> 0.5, 6.4, 2.9, 0.0, 2.1, 9.1, 10.1, 3.7, 3.0,…
## $ pct_nonhl_2_3other_moe   <dbl> 0.8, 3.7, 3.0, 18.9, 2.9, 4.0, 9.8, 3.4, 1.1,…

Initially, these data were in a different, more raw form, since transformed by the download_grad_data.R script in the scripts/ directory. One row represented one cohort in one school district in one year, so in order to transform the data, we needed to take weighted averages across all the cohorts in a school district, then across all the years for a school district, in order to make one row equal to one school district in the 2014-2018 time period.

From EDFacts documentation:

“The definition of adjusted four-year cohort graduation rate data provided to the SEAs in the 2008 non-regulatory guidance and for the purposes of submitting data files to EDFacts is ‘the number of students who graduate in four years with a regular high school diploma divided by the number of students who form the adjusted cohort for the graduating class.’ From the beginning of 9th grade (or the earliest high school grade), students who are entering that grade for the first time form a cohort that is “adjusted” by adding any students who subsequently transfer into the cohort and subtracting any students who subsequently transfer out, emigrate to another country, or die.”

Data Cleaning

In cleaning the data, we ignored all of the columns giving extraneous information, and just included the LEAID, the year, the cohort number, and the grduation rate midpoint.

If the variable cohort_num represents the number of students in that cohort (rather than the cohort index, per se), then we can add the total number of students in a school district in a year, then take the weighted average of the graduation rates. We can then summarize across cohort, and across year, in order to get the five-year average for the school districts. When summarizing across the years, we also cannot assume that the total number of students across all cohorts is the same, so we also need to take a weighted average across all of the years. This is not too difficult, as we simply need to take column sums in the grouped data frames, create weighting proportions, and sum the product of the weighs with the graduation rates. This is demonstrated below, and the final product is a data frame for which one row is one high school district, with graduation rate data properly averaged for the five-year measurement.

grad_data_summarized <- grad_data %>% 
    filter(
        !is.na(cohort_num), grad_rate_midpt > 0
    )%>%
    group_by(
        leaid, year
    ) %>% 
    mutate(
        cohort_total = sum(cohort_num, na.rm = TRUE),
        cohort_weight = cohort_num/cohort_total,
        .after = cohort_num
    ) %>% 
    summarize( # Weighted Averages within year based on cohort size
        cohort_total = max(cohort_total), 
        grad_rate_midpt = sum(cohort_weight * grad_rate_midpt)
    ) %>% 
    group_by(leaid) %>%
    mutate(
        student_total = sum(cohort_total, na.rm = TRUE),
        student_weight = cohort_total/student_total,
        .after = cohort_total
    ) %>%
    summarize( # Weighted Averages within district based on total size
        student_total = max(student_total), 
        grad_rate_midpt = sum(student_weight * grad_rate_midpt)
    )

Joining the Data

Let’s join the graduation data with the race data, and take a look at the school districts that didn’t properly join with the graduation data. We’ll left join the race data with the graduation data.

nrow(race_data)

## [1] 11910

nrow(grad_data)

## [1] 12663

data_joined <- hh_data %>%
    left_join(race_data, 
              by = c("leaid" = "leaid")) %>% 
    left_join(grad_data, 
              by = c("leaid" = "leaid"))

## left_join: added 66 columns (dist.x, geo_id, dist.y, year, total_pop_est, …)

##            > rows only in x    1,404

##            > rows only in y  (     1)

##            > matched rows     11,909

##            >                 ========

##            > rows total       13,313

## left_join: added one column (grad_rate_midpt)

##            > rows only in x    2,550

##            > rows only in y  ( 1,900)

##            > matched rows     10,763

##            >                 ========

##            > rows total       13,313

data_not_joined <- hh_data %>%
    left_join(race_data, 
              by = c("leaid" = "leaid")) %>% 
    anti_join(grad_data, 
              by = c("leaid" = "leaid")) %>%
    select(-ends_with(".y"))

## left_join: added 66 columns (dist.x, geo_id, dist.y, year, total_pop_est, …)

##            > rows only in x    1,404

##            > rows only in y  (     1)

##            > matched rows     11,909

##            >                 ========

##            > rows total       13,313

## anti_join: added no columns

##            > rows only in x    2,550

##            > rows only in y  ( 1,900)

##            > matched rows    (10,763)

##            >                 ========

##            > rows total        2,550

## select: dropped one variable (dist.y)

names(data_not_joined) <- names(data_not_joined) %>%
    str_remove_all(".x")

Explaining Un-joined Data

Now that we’ve joined our data, let’s take a look at some of the data that wasn’t properly joined. First, let’s look at some of the larger ones.

data_not_joined %>% 
    is.na() %>% 
    colSums()

##                    state                    leaid                     dist 
##                        0                        0                        0 
##                 children                  pct_pov                   pct_SP 
##                        0                        0                        0 
##                   SP_MOE                 pct_HHVJ                 HHVJ_MOE 
##                        0                        0                        0 
##                   pct_CC                   CC_MOE                  pct_NCI 
##                        0                        0                        0 
##                  nci_MOE                   pct_CD                   CD_MOE 
##                        0                        0                        0 
##                  pct_CLI                  CLI_MOE                   region 
##                        0                        0                        0 
##                   geo_id                     year            total_pop_est 
##                      842                      842                      842 
##            total_pop_moe        total_hisp_latino    total_hisp_latino_moe 
##                      842                      842                      842 
##          pct_hisp_latino      pct_hisp_latino_moe              total_mican 
##                      842                      842                      842 
##          total_mican_moe                pct_mican            pct_mican_moe 
##                      842                      842                      842 
##         total_puertrican     total_puertrican_moe           pct_puertrican 
##                      842                      842                      842 
##       pct_puertrican_moe              total_cuban          total_cuban_moe 
##                      842                      842                      842 
##                pct_cuban            pct_cuban_moe           total_other_hl 
##                      842                      842                      842 
##       total_other_hl_moe             pct_other_hl         pct_other_hl_moe 
##                      842                      842                      842 
##             total_NOT_hl          total_NOT_hlmoe               pct_NOT_hl 
##                      842                      842                      842 
##           pct_NOT_hl_moe              total_white          total_white_moe 
##                      842                      842                      842 
##                pct_white            pct_white_moe              total_black 
##                      842                      842                      842 
##          total_black_moe                pct_black            pct_black_moe 
##                      842                      842                      842 
##             total_native         total_native_moe               pct_native 
##                      842                      842                      842 
##           pct_native_moe              total_asian          total_asian_moe 
##                      842                      842                      842 
##                pct_asian            pct_asian_moe                 total_PI 
##                      842                      842                      842 
##             total_PI_moe                   pct_PI               pct_PI_moe 
##                      842                      842                      842 
##              total_other          total_other_moe                pct_other 
##                      842                      842                      842 
##            pct_other_moe        total_nonhl_2race    total_nonhl_2race_moe 
##                      842                      842                      842 
##          pct_nonhl_2race      pct_nonhl_2race_moe      total_nonhl_2_other 
##                      842                      842                      842 
##  total_nonhl_2_other_moe        pct_nonhl_2_other    pct_nonhl_2_other_moe 
##                      842                      842                      842 
##     total_nonhl_2_3other total_nonhl_2_3other_moe       pct_nonhl_2_3other 
##                      842                      842                      842 
##   pct_nonhl_2_3other_moe 
##                      842

data_not_joined %>% 
    arrange(desc(children)) %>% 
    head(10) %>% 
    select(dist, children)

## select: dropped 80 variables (state, leaid, pct_pov, pct_SP, SP_MOE, …)

## # A tibble: 10 × 2
##    dist                                          children
##    <chr>                                            <dbl>
##  1 Chula Vista Elementary School District           56164
##  2 Bakersfield City School District                 45236
##  3 Washington Elementary District                   41426
##  4 Anaheim Elementary School District               37965
##  5 Ontario-Montclair School District                35615
##  6 Cartwright Elementary District                   33532
##  7 Palmdale Elementary School District              30270
##  8 Cupertino Union Elementary School District       29481
##  9 Cajon Valley Union Elementary School District    29397
## 10 Escondido Union Elementary School District       28450

Many of these school districts didn’t join properly because they lack LEA IDs.

Initially upon running this analysis, we observed the New York City Department of Education to be the first and foremost unjoined row. For the sake of data cleanliness, we have since removed this row from the original, cleaned hh.csv file for the following reason:

“New York – Data for the New York City School District (NCES LEAID ‘3620580’) has been submitted as a supervisory union with 32 subordinate school districts. Each record within this file includes information about the local education agency (LEA) to which the school belongs. The schools included in this file are reported as they were submitted to EDFacts, with associations for all New York City being to these subordinate school districts. All of the subordinate school districts have the name “New York City Geographic District ##” where ## is a number between 1 and 32. If you are interested in aggregating the submitted school level data to the level of the New York City School District, use the names and LEA IDs in the Table 16 to identify the proper records within the data file.”

Unfortunately, our original data set doesn’t distinguish between all of these geographical districts, so we will not provide graduation rate data for this conglomerate of 32 districts.

data_not_joined %>% 
    ggplot(aes(x = children)) + 
    scale_x_log10() + 
    geom_boxplot() + 
    geom_density()

## Warning: Transformation introduced infinite values in continuous x-axis
## Transformation introduced infinite values in continuous x-axis

## Warning: Removed 46 rows containing non-finite values (stat_boxplot).

## Warning: Removed 46 rows containing non-finite values (stat_density).

About 50% of the school district for which there is no data on graduation rates (didn’t join properly, revealed by anti_join()) have between about 100 and 1000 students—in other words, these are very small school districts.

It turns out that there was also no graduation rate data for the New York City Department of Education, which is a conglomeration of 32 individual school districts. While we could replace this department of education with its constituent school districts, the data we were given initially in our data set included this row, which is not, in fact, a school district. Similar to the New York Times at some of the newer COVID outbreaks, we will ignore New York City in our analysis.

LS0tCnRpdGxlOiAiSm9pbmluZyBHcmFkdWF0aW9uIERhdGEiCmF1dGhvcjogIkpvbiBHZWlnZXIsIE5vZWwgR29vZHdpbiwgQWJpZ2FpbCBKb3BwYSIKZGF0ZTogImByIFN5cy5EYXRlKClgIgpvdXRwdXQ6IG9wZW5pbnRybzo6bGFiX3JlcG9ydAotLS0KCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQprbml0cjo6b3B0c19jaHVuayRzZXQoZWNobyA9IFRSVUUpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KHRpZHlsb2cpCmxpYnJhcnkoZ2d0aGVtZXMpCnRoZW1lX3NldCh0aGVtZV9jbGVhbigpKQpgYGAKCiMgRGF0YSBJbXBvcnQgYW5kIEdsaW1wc2luZwoKYGBge3IgaW1wb3J0LWRhdGF9CmdyYWRfZGF0YSA8LSByZWFkX2NzdigiLi4vZGF0YS9ncmFkLmNzdiIsIAogICAgICAgICAgICAgICAgICAgICAgIHNob3dfY29sX3R5cGVzID0gRkFMU0UpICU+JQogICAgbXV0YXRlKGxlYWlkID0gYXMuaW50ZWdlcihsZWFpZCkpCmhoX2RhdGEgPC0gcmVhZF9jc3YoIi4uL2RhdGEvaGguY3N2IiwgCiAgICAgICAgICAgICAgICAgICAgc2hvd19jb2xfdHlwZXMgPSBGQUxTRSkgJT4lCiAgICBtdXRhdGUobGVhaWQgPSBhcy5pbnRlZ2VyKGxlYWlkKSkKcmFjZV9kYXRhIDwtIHJlYWRfY3N2KCIuLi9kYXRhL3JhY2UuY3N2IiwgCiAgICAgICAgICAgICAgICAgICAgICAgc2hvd19jb2xfdHlwZXMgPSBGQUxTRSkgJT4lCiAgICBtdXRhdGUobGVhaWQgPSBhcy5pbnRlZ2VyKGxlYWlkKSkKYGBgCgpMZXQncyB0YWtlIGEgbG9vayBhdCB0aGUgdmFyaWFibGVzIGluIGVhY2ggb2YgdGhlc2UgZGF0YSBzZXRzOiAKYGBge3J9CmdyYWRfZGF0YSAlPiUgZ2xpbXBzZSgpCnJhY2VfZGF0YSAlPiUgZ2xpbXBzZSgpCmBgYAoKSW5pdGlhbGx5LCB0aGVzZSBkYXRhIHdlcmUgaW4gYSBkaWZmZXJlbnQsIG1vcmUgcmF3IGZvcm0sIHNpbmNlIHRyYW5zZm9ybWVkIGJ5IHRoZSBgZG93bmxvYWRfZ3JhZF9kYXRhLlJgIHNjcmlwdCBpbiB0aGUgYHNjcmlwdHMvYCBkaXJlY3RvcnkuIE9uZSByb3cgcmVwcmVzZW50ZWQgb25lIGNvaG9ydCBpbiBvbmUgc2Nob29sIGRpc3RyaWN0IGluIG9uZSB5ZWFyLCBzbyBpbiBvcmRlciB0byB0cmFuc2Zvcm0gdGhlIGRhdGEsIHdlIG5lZWRlZCB0byB0YWtlIHdlaWdodGVkIGF2ZXJhZ2VzIGFjcm9zcyBhbGwgdGhlIGNvaG9ydHMgaW4gYSBzY2hvb2wgZGlzdHJpY3QsIHRoZW4gYWNyb3NzIGFsbCB0aGUgeWVhcnMgZm9yIGEgc2Nob29sIGRpc3RyaWN0LCBpbiBvcmRlciB0byBtYWtlIG9uZSByb3cgZXF1YWwgdG8gb25lIHNjaG9vbCBkaXN0cmljdCBpbiB0aGUgMjAxNC0yMDE4IHRpbWUgcGVyaW9kLiAKCkZyb20gRURGYWN0cyBkb2N1bWVudGF0aW9uOiAKCj4gIlRoZSBkZWZpbml0aW9uIG9mIGFkanVzdGVkIGZvdXIteWVhciBjb2hvcnQgZ3JhZHVhdGlvbiByYXRlIGRhdGEgcHJvdmlkZWQgdG8gdGhlIFNFQXMgaW4gdGhlIDIwMDggbm9uLXJlZ3VsYXRvcnkgZ3VpZGFuY2UgYW5kIGZvciB0aGUgcHVycG9zZXMgb2Ygc3VibWl0dGluZyBkYXRhIGZpbGVzIHRvIEVERmFjdHMgaXMgJ3RoZSBudW1iZXIgb2Ygc3R1ZGVudHMgd2hvIGdyYWR1YXRlIGluIGZvdXIgeWVhcnMgd2l0aCBhIHJlZ3VsYXIgaGlnaCBzY2hvb2wgZGlwbG9tYSBkaXZpZGVkIGJ5IHRoZSBudW1iZXIgb2Ygc3R1ZGVudHMgd2hvIGZvcm0gdGhlIGFkanVzdGVkIGNvaG9ydCBmb3IgdGhlIGdyYWR1YXRpbmcgY2xhc3MuJyBGcm9tIHRoZSBiZWdpbm5pbmcgb2YgOXRoIGdyYWRlIChvciB0aGUgZWFybGllc3QgaGlnaCBzY2hvb2wgZ3JhZGUpLCBzdHVkZW50cyB3aG8gYXJlIGVudGVyaW5nIHRoYXQgZ3JhZGUgZm9yIHRoZSBmaXJzdCB0aW1lIGZvcm0gYSBjb2hvcnQgdGhhdCBpcyDigJxhZGp1c3RlZOKAnSBieSBhZGRpbmcgYW55IHN0dWRlbnRzIHdobyBzdWJzZXF1ZW50bHkgdHJhbnNmZXIgaW50byB0aGUgY29ob3J0IGFuZCBzdWJ0cmFjdGluZyBhbnkgc3R1ZGVudHMgd2hvIHN1YnNlcXVlbnRseSB0cmFuc2ZlciBvdXQsIGVtaWdyYXRlIHRvIGFub3RoZXIgY291bnRyeSwgb3IgZGllLiIKCiMgRGF0YSBDbGVhbmluZwoKSW4gY2xlYW5pbmcgdGhlIGRhdGEsIHdlIGlnbm9yZWQgYWxsIG9mIHRoZSBjb2x1bW5zIGdpdmluZyBleHRyYW5lb3VzIGluZm9ybWF0aW9uLCBhbmQganVzdCBpbmNsdWRlZCB0aGUgTEVBSUQsIHRoZSB5ZWFyLCB0aGUgY29ob3J0IG51bWJlciwgYW5kIHRoZSBncmR1YXRpb24gcmF0ZSBtaWRwb2ludC4KCklmIHRoZSB2YXJpYWJsZSBgY29ob3J0X251bWAgcmVwcmVzZW50cyB0aGUgbnVtYmVyIG9mIHN0dWRlbnRzIGluIHRoYXQgY29ob3J0IChyYXRoZXIgdGhhbiB0aGUgY29ob3J0IGluZGV4LCAqcGVyIHNlKiksIHRoZW4gd2UgY2FuIGFkZCB0aGUgdG90YWwgbnVtYmVyIG9mIHN0dWRlbnRzIGluIGEgc2Nob29sIGRpc3RyaWN0IGluIGEgeWVhciwgdGhlbiB0YWtlIHRoZSB3ZWlnaHRlZCBhdmVyYWdlIG9mIHRoZSBncmFkdWF0aW9uIHJhdGVzLiBXZSBjYW4gdGhlbiBzdW1tYXJpemUgYWNyb3NzIGNvaG9ydCwgYW5kIGFjcm9zcyB5ZWFyLCBpbiBvcmRlciB0byBnZXQgdGhlIGZpdmUteWVhciBhdmVyYWdlIGZvciB0aGUgc2Nob29sIGRpc3RyaWN0cy4gV2hlbiBzdW1tYXJpemluZyBhY3Jvc3MgdGhlIHllYXJzLCB3ZSBhbHNvIGNhbm5vdCBhc3N1bWUgdGhhdCB0aGUgdG90YWwgbnVtYmVyIG9mIHN0dWRlbnRzIGFjcm9zcyBhbGwgY29ob3J0cyBpcyB0aGUgc2FtZSwgc28gd2UgYWxzbyBuZWVkIHRvIHRha2UgYSB3ZWlnaHRlZCBhdmVyYWdlIGFjcm9zcyBhbGwgb2YgdGhlIHllYXJzLiBUaGlzIGlzIG5vdCB0b28gZGlmZmljdWx0LCBhcyB3ZSBzaW1wbHkgbmVlZCB0byB0YWtlIGNvbHVtbiBzdW1zIGluIHRoZSBncm91cGVkIGRhdGEgZnJhbWVzLCBjcmVhdGUgd2VpZ2h0aW5nIHByb3BvcnRpb25zLCBhbmQgc3VtIHRoZSBwcm9kdWN0IG9mIHRoZSB3ZWlnaHMgd2l0aCB0aGUgZ3JhZHVhdGlvbiByYXRlcy4gVGhpcyBpcyBkZW1vbnN0cmF0ZWQgYmVsb3csIGFuZCB0aGUgZmluYWwgcHJvZHVjdCBpcyBhIGRhdGEgZnJhbWUgZm9yIHdoaWNoIG9uZSByb3cgaXMgb25lIGhpZ2ggc2Nob29sIGRpc3RyaWN0LCB3aXRoIGdyYWR1YXRpb24gcmF0ZSBkYXRhIHByb3Blcmx5IGF2ZXJhZ2VkIGZvciB0aGUgZml2ZS15ZWFyIG1lYXN1cmVtZW50LiAKCmBgYHtyLCBldmFsID0gRn0KZ3JhZF9kYXRhX3N1bW1hcml6ZWQgPC0gZ3JhZF9kYXRhICU+JSAKICAgIGZpbHRlcigKICAgICAgICAhaXMubmEoY29ob3J0X251bSksIGdyYWRfcmF0ZV9taWRwdCA+IDAKICAgICklPiUKICAgIGdyb3VwX2J5KAogICAgICAgIGxlYWlkLCB5ZWFyCiAgICApICU+JSAKICAgIG11dGF0ZSgKICAgICAgICBjb2hvcnRfdG90YWwgPSBzdW0oY29ob3J0X251bSwgbmEucm0gPSBUUlVFKSwKICAgICAgICBjb2hvcnRfd2VpZ2h0ID0gY29ob3J0X251bS9jb2hvcnRfdG90YWwsCiAgICAgICAgLmFmdGVyID0gY29ob3J0X251bQogICAgKSAlPiUgCiAgICBzdW1tYXJpemUoICMgV2VpZ2h0ZWQgQXZlcmFnZXMgd2l0aGluIHllYXIgYmFzZWQgb24gY29ob3J0IHNpemUKICAgICAgICBjb2hvcnRfdG90YWwgPSBtYXgoY29ob3J0X3RvdGFsKSwgCiAgICAgICAgZ3JhZF9yYXRlX21pZHB0ID0gc3VtKGNvaG9ydF93ZWlnaHQgKiBncmFkX3JhdGVfbWlkcHQpCiAgICApICU+JSAKICAgIGdyb3VwX2J5KGxlYWlkKSAlPiUKICAgIG11dGF0ZSgKICAgICAgICBzdHVkZW50X3RvdGFsID0gc3VtKGNvaG9ydF90b3RhbCwgbmEucm0gPSBUUlVFKSwKICAgICAgICBzdHVkZW50X3dlaWdodCA9IGNvaG9ydF90b3RhbC9zdHVkZW50X3RvdGFsLAogICAgICAgIC5hZnRlciA9IGNvaG9ydF90b3RhbAogICAgKSAlPiUKICAgIHN1bW1hcml6ZSggIyBXZWlnaHRlZCBBdmVyYWdlcyB3aXRoaW4gZGlzdHJpY3QgYmFzZWQgb24gdG90YWwgc2l6ZQogICAgICAgIHN0dWRlbnRfdG90YWwgPSBtYXgoc3R1ZGVudF90b3RhbCksIAogICAgICAgIGdyYWRfcmF0ZV9taWRwdCA9IHN1bShzdHVkZW50X3dlaWdodCAqIGdyYWRfcmF0ZV9taWRwdCkKICAgICkKYGBgCgojIEpvaW5pbmcgdGhlIERhdGEKCkxldCdzIGpvaW4gdGhlIGdyYWR1YXRpb24gZGF0YSB3aXRoIHRoZSByYWNlIGRhdGEsIGFuZCB0YWtlIGEgbG9vayBhdCB0aGUgc2Nob29sIGRpc3RyaWN0cyB0aGF0IGRpZG4ndCBwcm9wZXJseSBqb2luIHdpdGggdGhlIGdyYWR1YXRpb24gZGF0YS4gV2UnbGwgbGVmdCBqb2luIHRoZSByYWNlIGRhdGEgd2l0aCB0aGUgZ3JhZHVhdGlvbiBkYXRhLiAKCgpgYGB7ciBqb2luLWRhdGFzZXRzfQpucm93KHJhY2VfZGF0YSkKbnJvdyhncmFkX2RhdGEpCgpkYXRhX2pvaW5lZCA8LSBoaF9kYXRhICU+JQogICAgbGVmdF9qb2luKHJhY2VfZGF0YSwgCiAgICAgICAgICAgICAgYnkgPSBjKCJsZWFpZCIgPSAibGVhaWQiKSkgJT4lIAogICAgbGVmdF9qb2luKGdyYWRfZGF0YSwgCiAgICAgICAgICAgICAgYnkgPSBjKCJsZWFpZCIgPSAibGVhaWQiKSkKICAgIAoKZGF0YV9ub3Rfam9pbmVkIDwtIGhoX2RhdGEgJT4lCiAgICBsZWZ0X2pvaW4ocmFjZV9kYXRhLCAKICAgICAgICAgICAgICBieSA9IGMoImxlYWlkIiA9ICJsZWFpZCIpKSAlPiUgCiAgICBhbnRpX2pvaW4oZ3JhZF9kYXRhLCAKICAgICAgICAgICAgICBieSA9IGMoImxlYWlkIiA9ICJsZWFpZCIpKSAlPiUKICAgIHNlbGVjdCgtZW5kc193aXRoKCIueSIpKQpuYW1lcyhkYXRhX25vdF9qb2luZWQpIDwtIG5hbWVzKGRhdGFfbm90X2pvaW5lZCkgJT4lCiAgICBzdHJfcmVtb3ZlX2FsbCgiLngiKQpgYGAKCiMgRXhwbGFpbmluZyBVbi1qb2luZWQgRGF0YQoKTm93IHRoYXQgd2UndmUgam9pbmVkIG91ciBkYXRhLCBsZXQncyB0YWtlIGEgbG9vayBhdCBzb21lIG9mIHRoZSBkYXRhIHRoYXQgd2Fzbid0IHByb3Blcmx5IGpvaW5lZC4gRmlyc3QsIGxldCdzIGxvb2sgYXQgc29tZSBvZiB0aGUgbGFyZ2VyIG9uZXMuCgpgYGB7cn0KZGF0YV9ub3Rfam9pbmVkICU+JSAKICAgIGlzLm5hKCkgJT4lIAogICAgY29sU3VtcygpCgpkYXRhX25vdF9qb2luZWQgJT4lIAogICAgYXJyYW5nZShkZXNjKGNoaWxkcmVuKSkgJT4lIAogICAgaGVhZCgxMCkgJT4lIAogICAgc2VsZWN0KGRpc3QsIGNoaWxkcmVuKQpgYGAKCk1hbnkgb2YgdGhlc2Ugc2Nob29sIGRpc3RyaWN0cyBkaWRuJ3Qgam9pbiBwcm9wZXJseSBiZWNhdXNlIHRoZXkgbGFjayBMRUEgSURzLiAKCkluaXRpYWxseSB1cG9uIHJ1bm5pbmcgdGhpcyBhbmFseXNpcywgd2Ugb2JzZXJ2ZWQgdGhlIE5ldyBZb3JrIENpdHkgRGVwYXJ0bWVudCBvZiBFZHVjYXRpb24gdG8gYmUgdGhlIGZpcnN0IGFuZCBmb3JlbW9zdCB1bmpvaW5lZCByb3cuIEZvciB0aGUgc2FrZSBvZiBkYXRhIGNsZWFubGluZXNzLCB3ZSBoYXZlIHNpbmNlIHJlbW92ZWQgdGhpcyByb3cgZnJvbSB0aGUgb3JpZ2luYWwsIGNsZWFuZWQgYGhoLmNzdmAgZmlsZSBmb3IgdGhlIGZvbGxvd2luZyByZWFzb246Cgo+ICJOZXcgWW9yayDigJMgRGF0YSBmb3IgdGhlIE5ldyBZb3JrIENpdHkgU2Nob29sIERpc3RyaWN0IChOQ0VTIExFQUlEIOKAmDM2MjA1ODDigJkpIGhhcyBiZWVuIHN1Ym1pdHRlZCBhcyBhIHN1cGVydmlzb3J5IHVuaW9uIHdpdGggMzIgc3Vib3JkaW5hdGUgc2Nob29sIGRpc3RyaWN0cy4gRWFjaCByZWNvcmQgd2l0aGluIHRoaXMgZmlsZSBpbmNsdWRlcyBpbmZvcm1hdGlvbiBhYm91dCB0aGUgbG9jYWwgZWR1Y2F0aW9uIGFnZW5jeSAoTEVBKSB0byB3aGljaCB0aGUgc2Nob29sIGJlbG9uZ3MuIFRoZSBzY2hvb2xzIGluY2x1ZGVkIGluIHRoaXMgZmlsZSBhcmUgcmVwb3J0ZWQgYXMgdGhleSB3ZXJlIHN1Ym1pdHRlZCB0byBFREZhY3RzLCB3aXRoIGFzc29jaWF0aW9ucyBmb3IgYWxsIE5ldyBZb3JrIENpdHkgYmVpbmcgdG8gdGhlc2Ugc3Vib3JkaW5hdGUgc2Nob29sIGRpc3RyaWN0cy4gQWxsIG9mIHRoZSBzdWJvcmRpbmF0ZSBzY2hvb2wgZGlzdHJpY3RzIGhhdmUgdGhlIG5hbWUg4oCcTmV3IFlvcmsgQ2l0eSBHZW9ncmFwaGljIERpc3RyaWN0ICMj4oCdIHdoZXJlICMjIGlzIGEgbnVtYmVyIGJldHdlZW4gMSBhbmQgMzIuIElmIHlvdSBhcmUgaW50ZXJlc3RlZCBpbiBhZ2dyZWdhdGluZyB0aGUgc3VibWl0dGVkIHNjaG9vbCBsZXZlbCBkYXRhIHRvIHRoZSBsZXZlbCBvZiB0aGUgTmV3IFlvcmsgQ2l0eSBTY2hvb2wgRGlzdHJpY3QsIHVzZSB0aGUgbmFtZXMgYW5kIExFQSBJRHMgaW4gdGhlIFRhYmxlIDE2IHRvIGlkZW50aWZ5IHRoZSBwcm9wZXIgcmVjb3JkcyB3aXRoaW4gdGhlIGRhdGEgZmlsZS4iCgpVbmZvcnR1bmF0ZWx5LCBvdXIgb3JpZ2luYWwgZGF0YSBzZXQgZG9lc24ndCBkaXN0aW5ndWlzaCBiZXR3ZWVuIGFsbCBvZiB0aGVzZSBnZW9ncmFwaGljYWwgZGlzdHJpY3RzLCBzbyB3ZSB3aWxsIG5vdCBwcm92aWRlIGdyYWR1YXRpb24gcmF0ZSBkYXRhIGZvciB0aGlzIGNvbmdsb21lcmF0ZSBvZiAzMiBkaXN0cmljdHMuIAoKYGBge3J9CmRhdGFfbm90X2pvaW5lZCAlPiUgCiAgICBnZ3Bsb3QoYWVzKHggPSBjaGlsZHJlbikpICsgCiAgICBzY2FsZV94X2xvZzEwKCkgKyAKICAgIGdlb21fYm94cGxvdCgpICsgCiAgICBnZW9tX2RlbnNpdHkoKQpgYGAKCgpBYm91dCA1MCUgb2YgdGhlIHNjaG9vbCBkaXN0cmljdCBmb3Igd2hpY2ggdGhlcmUgaXMgbm8gZGF0YSBvbiBncmFkdWF0aW9uIHJhdGVzIChkaWRuJ3Qgam9pbiBwcm9wZXJseSwgcmV2ZWFsZWQgYnkgYGFudGlfam9pbigpYCkgaGF2ZSBiZXR3ZWVuIGFib3V0IDEwMCBhbmQgMTAwMCBzdHVkZW50cy0tLWluIG90aGVyIHdvcmRzLCB0aGVzZSBhcmUgdmVyeSBzbWFsbCBzY2hvb2wgZGlzdHJpY3RzLiAKCkl0IHR1cm5zIG91dCB0aGF0IHRoZXJlIHdhcyBhbHNvIG5vIGdyYWR1YXRpb24gcmF0ZSBkYXRhIGZvciB0aGUgTmV3IFlvcmsgQ2l0eSBEZXBhcnRtZW50IG9mIEVkdWNhdGlvbiwgd2hpY2ggaXMgYSBjb25nbG9tZXJhdGlvbiBvZiAzMiBpbmRpdmlkdWFsIHNjaG9vbCBkaXN0cmljdHMuIFdoaWxlIHdlIGNvdWxkIHJlcGxhY2UgdGhpcyBkZXBhcnRtZW50IG9mIGVkdWNhdGlvbiB3aXRoIGl0cyBjb25zdGl0dWVudCBzY2hvb2wgZGlzdHJpY3RzLCB0aGUgZGF0YSB3ZSB3ZXJlIGdpdmVuIGluaXRpYWxseSBpbiBvdXIgZGF0YSBzZXQgaW5jbHVkZWQgdGhpcyByb3csIHdoaWNoIGlzIG5vdCwgaW4gZmFjdCwgYSBzY2hvb2wgZGlzdHJpY3QuIFNpbWlsYXIgdG8gdGhlIE5ldyBZb3JrIFRpbWVzIGF0IHNvbWUgb2YgdGhlIG5ld2VyIENPVklEIG91dGJyZWFrcywgd2Ugd2lsbCBpZ25vcmUgTmV3IFlvcmsgQ2l0eSBpbiBvdXIgYW5hbHlzaXMuIA==

Joining Graduation Data

Jon Geiger, Noel Goodwin, Abigail Joppa

2022-06-07

Data Import and Glimpsing

Data Cleaning

Joining the Data

Explaining Un-joined Data