library(tidyverse)
library(openintro)
library(ggthemes)
theme_set(theme_clean())
<- read_delim("../raw/race_raw.txt", delim = "|", escape_double = FALSE, trim_ws = TRUE)
race_data <- read_csv("../data/hh.csv") hh
names(race_data)
## [1] "GeoId" "Geography" "LEAID" "Year"
## [5] "Iteration" "CDP05.3_51est" "CDP05.3_51moe" "CDP05.3_52est"
## [9] "CDP05.3_52moe" "CDP05.3_52pct" "CDP05.3_52pctmoe" "CDP05.3_53est"
## [13] "CDP05.3_53moe" "CDP05.3_53pct" "CDP05.3_53pctmoe" "CDP05.3_54est"
## [17] "CDP05.3_54moe" "CDP05.3_54pct" "CDP05.3_54pctmoe" "CDP05.3_55est"
## [21] "CDP05.3_55moe" "CDP05.3_55pct" "CDP05.3_55pctmoe" "CDP05.3_56est"
## [25] "CDP05.3_56moe" "CDP05.3_56pct" "CDP05.3_56pctmoe" "CDP05.3_57est"
## [29] "CDP05.3_57moe" "CDP05.3_57pct" "CDP05.3_57pctmoe" "CDP05.3_58est"
## [33] "CDP05.3_58moe" "CDP05.3_58pct" "CDP05.3_58pctmoe" "CDP05.3_59est"
## [37] "CDP05.3_59moe" "CDP05.3_59pct" "CDP05.3_59pctmoe" "CDP05.3_60est"
## [41] "CDP05.3_60moe" "CDP05.3_60pct" "CDP05.3_60pctmoe" "CDP05.3_61est"
## [45] "CDP05.3_61moe" "CDP05.3_61pct" "CDP05.3_61pctmoe" "CDP05.3_62est"
## [49] "CDP05.3_62moe" "CDP05.3_62pct" "CDP05.3_62pctmoe" "CDP05.3_63est"
## [53] "CDP05.3_63moe" "CDP05.3_63pct" "CDP05.3_63pctmoe" "CDP05.3_64est"
## [57] "CDP05.3_64moe" "CDP05.3_64pct" "CDP05.3_64pctmoe" "CDP05.3_65est"
## [61] "CDP05.3_65moe" "CDP05.3_65pct" "CDP05.3_65pctmoe" "CDP05.3_66est"
## [65] "CDP05.3_66moe" "CDP05.3_66pct" "CDP05.3_66pctmoe"
I used the data dictionary provided to rename the variable names. *note: It looks like they only looked combined all of the Asian population which is a pretty big umbrella. It may be worth looking into what exactly this variable includes.
<- c("geo_id",
race_names"dist",
"leaid",
"year",
"Iteration",
"total_pop_est", #Total population
"total_pop_moe", # total population MOE
"total_hisp_latino", #Total population; Hispanic or Latino (of any race)
"total_hisp_latino_moe", #Total population; Hispanic or Latino (of any race) MOE
"pct_hisp_latino", # % total population: Hispanic or Latino (of any race)
"pct_hisp_latino_moe", # % total population: Hispanic or Latino (of any race) MOE
"total_mexican", # total mexican population
"total_mexican_moe", # total mexican population MOE
"pct_mexican", # % mexican population
"pct_mexican_moe", # % mexican population MOE
"total_puertrican", # total puertorican population
"total_puertrican_moe", # total puertorican population MOE
"pct_puertrican", # % puertorican population
"pct_puertrican_moe", # % puertorican population MOE
"total_cuban", # total cuban population
"total_cuban_moe", # total cuban population MOE
"pct_cuban", # % cuban population
"pct_cuban_moe", # % cuban population MOE
"total_other_hl", # total population of other hispanic and latino
"total_other_hl_moe", # total population of other hispanic and latino MOE
"pct_other_hl", # % population of other hispanic and latino
"pct_other_hl_moe", # % population of other hispanic and latino MOE
"total_NOT_hl", # total population that are not hispanic nor latino
"total_NOT_hlmoe", # total population that are not hispanic nor latino MOE
"pct_NOT_hl", # % population that are not hispanic nor latino
"pct_NOT_hl_moe", # % population that are not hispanic nor latino MOE
"total_white", #total population white
"total_white_moe", #total population white MOE
"pct_white", # % population white
"pct_white_moe", # % population white MOE
"total_black", #total population black or african american
"total_black_moe", #total population black or african american MOE
"pct_black", # % population black or african american
"pct_black_moe", # % population black or african american MOE
"total_native", #total native population
"total_native_moe", #total native population MOE
"pct_native", # % native population
"pct_native_moe", # % native population MOE
"total_asian", #total asian population
"total_asian_moe", #total asian population MOE
"pct_asian", # % asian population
"pct_asian_moe", # % asian population MOE
"total_PI", # total PI = Native Hawiian/PAcific Islander
"total_PI_moe", # total PI = Native Hawiian/PAcific Islander MOE
"pct_PI", # % PI
"pct_PI_moe", # % PI MOE
"total_other", #total people of other race (nonhisp/latino)
"total_other_moe", #total people of other race (nonhisp/latino) MOE
"pct_other", # % people of other race (nonhisp/latino)
"pct_other_moe", #% people of other race (nonhisp/latino) MOE
"total_nonhl_2race", #non hispanic or latio with 2 races or more
"total_nonhl_2race_moe", #non hispanic or latio with 2 races or more MOE
"pct_nonhl_2race", # % non hispanic or latio with 2 races or more
"pct_nonhl_2race_moe", # % non hispanic or latio with 2 races or more MOE
"total_nonhl_2_other", #Total population; Not Hispanic or Latino; Two or more races; Two races including Some other race
"total_nonhl_2_other_moe", #MOE
"pct_nonhl_2_other", # % population; Not Hispanic or Latino; Two or more races; Two races including Some other race
"pct_nonhl_2_other_moe",
"total_nonhl_2_3other", # Total population; Not Hispanic or Latino; Two or more races; Two races excluding Some other race, and Three or more races
"total_nonhl_2_3other_moe",
"pct_nonhl_2_3other", # % population; Not Hispanic or Latino; Two or more races; Two races excluding Some other race, and Three or more races
"pct_nonhl_2_3other_moe"
)names(race_data) <- race_names
<- race_data %>%
race_data select(-"Iteration") %>% # This variable is not relevant, as it is the same through each entry. We can keep in mind that it is 105.
mutate(leaid = as.integer(leaid))
%>% head(10) race_data
## # A tibble: 10 × 66
## geo_id dist leaid year total_pop_est total_pop_moe total_hisp_lati…
## <chr> <chr> <int> <chr> <dbl> <dbl> <dbl>
## 1 97000US27001… A.C.… 2.70e6 2014… 885 105 50
## 2 97000US45006… Abbe… 4.50e6 2014… 3420 223 25
## 3 97000US55000… Abbo… 5.50e6 2014… 685 124 285
## 4 97000US48073… Abbo… 4.81e6 2014… 190 58 4
## 5 97000US28003… Aber… 2.80e6 2014… 1405 233 10
## 6 97000US53000… Aber… 5.30e6 2014… 3480 347 980
## 7 97000US16000… Aber… 1.60e6 2014… 745 196 365
## 8 97000US48074… Aber… 4.81e6 2014… 805 166 515
## 9 97000US48074… Abil… 4.81e6 2014… 15410 602 6515
## 10 97000US20031… Abil… 2.00e6 2014… 1270 177 60
## # … with 59 more variables: total_hisp_latino_moe <dbl>, pct_hisp_latino <dbl>,
## # pct_hisp_latino_moe <dbl>, total_mexican <dbl>, total_mexican_moe <dbl>,
## # pct_mexican <dbl>, pct_mexican_moe <dbl>, total_puertrican <dbl>,
## # total_puertrican_moe <dbl>, pct_puertrican <dbl>, pct_puertrican_moe <dbl>,
## # total_cuban <dbl>, total_cuban_moe <dbl>, pct_cuban <dbl>,
## # pct_cuban_moe <dbl>, total_other_hl <dbl>, total_other_hl_moe <dbl>,
## # pct_other_hl <dbl>, pct_other_hl_moe <dbl>, total_NOT_hl <dbl>, …
It appears that the geoid** is a combination of some sort of US code (97000US), followed by a two digit state fips code, and lastly the district ID used in the original NHGIS district data. The same names of school districts, without specification about which state they are from (see Lincoln County School District). This may be problematic in the joining process.
%>%
race_data filter(str_detect(dist, 'Lincoln County School District'))
## # A tibble: 9 × 66
## geo_id dist leaid year total_pop_est total_pop_moe total_hisp_lati…
## <chr> <chr> <int> <chr> <dbl> <dbl> <dbl>
## 1 97000US1303330 Linc… 1.30e6 2014… 1070 120 55
## 2 97000US2103480 Linc… 2.10e6 2014… 3675 279 140
## 3 97000US3200270 Linc… 3.20e6 2014… 645 192 115
## 4 97000US4107500 Linc… 4.11e6 2014… 5060 233 1010
## 5 97000US5400660 Linc… 5.40e6 2014… 3395 152 40
## 6 97000US4702490 Linc… 4.70e6 2014… 4540 296 345
## 7 97000US2802640 Linc… 2.80e6 2014… 3160 374 150
## 8 97000US5604030 Linc… 5.60e6 2014… 610 126 50
## 9 97000US5604060 Linc… 5.60e6 2014… 2900 270 185
## # … with 59 more variables: total_hisp_latino_moe <dbl>, pct_hisp_latino <dbl>,
## # pct_hisp_latino_moe <dbl>, total_mexican <dbl>, total_mexican_moe <dbl>,
## # pct_mexican <dbl>, pct_mexican_moe <dbl>, total_puertrican <dbl>,
## # total_puertrican_moe <dbl>, pct_puertrican <dbl>, pct_puertrican_moe <dbl>,
## # total_cuban <dbl>, total_cuban_moe <dbl>, pct_cuban <dbl>,
## # pct_cuban_moe <dbl>, total_other_hl <dbl>, total_other_hl_moe <dbl>,
## # pct_other_hl <dbl>, pct_other_hl_moe <dbl>, total_NOT_hl <dbl>, …
Update: After some exploration, we now know that that the two digit state code (fips) and the district ID make up the leaid which is already included in the race data. Instead of separating the geoid as we did previously, we will use the leaid to join with the household data.
There are 1,404 missing values which is the difference between the two data sets - household data (13314), race_data (11910). This is because the household data is missing 1404 leaids. We successfully joined all of the available race data with the district data.
<- hh %>%
dist_race_join left_join(race_data, by=c("leaid"))%>%
select(-dist.y)%>%
rename(dist = dist.x) %>%
relocate(where(is.numeric), .after = where(is.character))
colSums(is.na(dist_race_join))
## state dist region
## 0 0 2
## geo_id year leaid
## 1404 1404 0
## children pct_pov pct_SP
## 0 0 0
## SP_MOE pct_HHVJ HHVJ_MOE
## 0 0 0
## pct_CC CC_MOE pct_NCI
## 0 0 0
## nci_MOE pct_CD CD_MOE
## 0 0 0
## pct_CLI CLI_MOE total_pop_est
## 0 0 1404
## total_pop_moe total_hisp_latino total_hisp_latino_moe
## 1404 1404 1404
## pct_hisp_latino pct_hisp_latino_moe total_mexican
## 1404 1404 1404
## total_mexican_moe pct_mexican pct_mexican_moe
## 1404 1404 1404
## total_puertrican total_puertrican_moe pct_puertrican
## 1404 1404 1404
## pct_puertrican_moe total_cuban total_cuban_moe
## 1404 1404 1404
## pct_cuban pct_cuban_moe total_other_hl
## 1404 1404 1404
## total_other_hl_moe pct_other_hl pct_other_hl_moe
## 1404 1404 1404
## total_NOT_hl total_NOT_hlmoe pct_NOT_hl
## 1404 1404 1404
## pct_NOT_hl_moe total_white total_white_moe
## 1404 1404 1404
## pct_white pct_white_moe total_black
## 1404 1404 1404
## total_black_moe pct_black pct_black_moe
## 1404 1404 1404
## total_native total_native_moe pct_native
## 1404 1404 1404
## pct_native_moe total_asian total_asian_moe
## 1404 1404 1404
## pct_asian pct_asian_moe total_PI
## 1404 1404 1404
## total_PI_moe pct_PI pct_PI_moe
## 1404 1404 1404
## total_other total_other_moe pct_other
## 1404 1404 1404
## pct_other_moe total_nonhl_2race total_nonhl_2race_moe
## 1404 1404 1404
## pct_nonhl_2race pct_nonhl_2race_moe total_nonhl_2_other
## 1404 1404 1404
## total_nonhl_2_other_moe pct_nonhl_2_other pct_nonhl_2_other_moe
## 1404 1404 1404
## total_nonhl_2_3other total_nonhl_2_3other_moe pct_nonhl_2_3other
## 1404 1404 1404
## pct_nonhl_2_3other_moe
## 1404
Lets see which districts did not join and therefore are the missing school districts in the race data set.
<- dist_race_join %>%
missing_race_dist filter_all(any_vars(is.na(.))) %>%
select(leaid, state, dist, children)
%>% head(10) missing_race_dist
## # A tibble: 10 × 4
## leaid state dist children
## <dbl> <chr> <chr> <dbl>
## 1 102130 Alabama Linden City School District 566
## 2 200010 Alaska Aleutian Region School District 64
## 3 200060 Alaska Cordova City School District 448
## 4 200540 Alaska Nenana City School District 83
## 5 200630 Alaska Pelican City School District 3
## 6 200715 Alaska Tanana City School District 39
## 7 200780 Alaska Valdez City School District 634
## 8 200800 Alaska Chugach School District 81
## 9 400005 Arizona Salome Consolidated Elementary District 158
## 10 400022 Arizona Vernon Elementary District 139
This may not be too big of an issue if most of those schools are small districts that we were going to drop anyway, but we can see that there are about 463 missing school districts that have more than 150 students which may be important for our analysis.
%>%
missing_race_dist filter(children >= 150)%>%
dim()
## [1] 465 4