10.8 Census data with tidycensus
See R/census
This is example code that can be used to pull data from the US Census into R as a data frame.
At the top of `get.census.data.r`, there are URLs you can use to determine which variables you want to pull. For the census data used in these notes, I used the ACS link, and when I got to that page, I chose HTML format for 2019.
If you keep the line `us <- unique(fips_codes$state)[1]` the same, it will only pull census info for the first state, Alabama. It’s quick and a good way to check that your code is running properly. To do all states and DC, change `[1]` to `[1:51]`. The data also include some US territories, so you likely want to stop at 51.
## get.census.data.r
## Script retrieves, cleans and saves Census data to rds file
## (GEOID, Median Income, Total Population, Total Age, etc)
## See these for the variable codes:
# Census API: https://www.census.gov/data/developers/data-sets.html
# ACS: https://www.census.gov/data/developers/data-sets/acs-1year.html
# Decennial: https://www.census.gov/data/developers/data-sets/decennial-census.html
library(tidycensus)
library(tidyverse)
## Survey year to request
year <- 2019

## A Census API key is required; sign up for one at
## https://api.census.gov/data/key_signup.html
## Store it once with
##   census_api_key('yourAPIkeyHere', install=TRUE)
## which saves it in your .Renviron file (this should only be needed once).
## After that, re-reading the file is enough:
readRenviron("~/.Renviron")

## States to query. Only the first entry is used here as a quick test run;
## change [1] to [1:51] for all states and DC.
us <- unique(fips_codes$state)[1]
## ACS variable codes to request, built table by table.
## Each code is "<table>_<three-digit cell number>".
vars <- c(
  # B01001: Total, Male, Female
  sprintf("B01001_%03d", c(1L, 2L, 26L)),
  # B01002: Median age, Male age, Female age
  sprintf("B01002_%03d", 1:3),
  # B02001 race: White, Black, Native, Asian, Pacific, Other, 2+
  sprintf("B02001_%03d", 2:8),
  # B03002: white non-hispanic, hispanic, white hisp, black hisp
  sprintf("B03002_%03d", c(3L, 12L, 13L, 14L)),
  # B19001 household income: _001 is the household total, followed by the
  # ranges <10k, 10000-14999, ... , 150000-199999, >200k
  sprintf("B19001_%03d", 1:17),
  "B19013_001", # Median Household Income
  "B25077_001", # Median Housing Value
  "B05009_001"  # Children
)
# Tract-level ACS estimates for the selected states and variables.
# geometry = TRUE made this request fail, so it is not asked for here;
# tract info comes from elsewhere anyway.
d <- get_acs(
  state = us,
  year = year,
  geography = "tract",
  variables = vars
)
# Decennial-census variant, kept for reference:
# census_data <- get_decennial(geography = "tract",
#                              variables = vars,
#                              state = 'CT',
#                              year=2020)
## Reshape the long ACS result `d` into one row per tract (GEOID) with
## readable column names, returned as a plain data.frame.
##
## NAME is expected to hold three fields, e.g.
##   "Census Tract 201, Autauga County, Alabama"
## Each field is extracted by its position. The previous county pattern
## stripped the state with a single-word match (', [A-z]+$'), so for
## multi-word states such as "New York" or "District of Columbia" the state
## name ended up in the county column.
## NOTE(review): ';' is accepted as a separator too, since later ACS vintages
## appear to use "; " between the fields -- confirm before relying on it.
dd <- d %>%
  mutate(
    # first field, without the "Census Tract " label
    tract = gsub("[,;].+|Census Tract ", "", NAME),
    # middle field; left as the full NAME if the three-field shape is absent
    county = sub("^[^,;]+[,;] ([^,;]+)[,;] .+$", "\\1", NAME),
    # everything after the last separator
    state.full = sub("^.+[,;] ", "", NAME)
  ) %>%
  # keep only the estimate (other columns of `d` are dropped) so that
  # pivot_wider() yields exactly one row per tract
  select(GEOID, tract, county, state.full, variable, estimate) %>%
  pivot_wider(
    names_from = variable,
    values_from = estimate
  ) %>%
  rename(
    "pop" = "B01001_001", ## total population
    "male" = "B01001_002", ## sex
    "female" = "B01001_026",
    "age" = "B01002_001", ## age
    "male.age" = "B01002_002",
    "female.age" = "B01002_003",
    "white" = "B02001_002", ## race
    "black" = "B02001_003",
    "indian.alaskan" = "B02001_004",
    "asian" = "B02001_005",
    "pacific" = "B02001_006",
    "other" = "B02001_007",
    "two.or.more" = "B02001_008",
    "white.not.hisp" = "B03002_003",
    "hisp" = "B03002_012",
    "white.hisp" = "B03002_013",
    "black.hisp" = "B03002_014",
    "households" = "B19001_001",
    "i10orless" = "B19001_002", ## income
    "i10to14" = "B19001_003",
    "i15to19" = "B19001_004",
    "i20to24" = "B19001_005",
    "i25to29" = "B19001_006",
    "i30to34" = "B19001_007",
    "i35to39" = "B19001_008",
    "i40to44" = "B19001_009",
    "i45to49" = "B19001_010",
    "i50to59" = "B19001_011",
    "i60to74" = "B19001_012",
    "i75to99" = "B19001_013",
    "i100to124" = "B19001_014",
    "i125to149" = "B19001_015",
    "i150to199" = "B19001_016",
    "i200ormore" = "B19001_017",
    "hh.income" = "B19013_001",
    "house.value" = "B25077_001",
    "TotalChildren" = "B05009_001" # doesn't exist for 2009
  ) %>%
  as.data.frame()
head(dd)
## save census data
# filename = paste0('rawdata/census', year, '.rds')
# saveRDS(dd, file = filename)
Census tracts
library(sf) ## st_centroid() function
library(tigris) ## census shapefile. tracts() function. fips_codes object.
## Year of the tract boundaries to download
year <- 2019
## Vector of states: the first 51 entries of fips_codes (echoed when run
## interactively). Note that the tract download below does not use it.
(us <- unique(fips_codes$state)[1:51])
## Census tracts for state 'CT', requesting county codes 1 through 999
tracts <- tracts(
  state = "CT",
  county = seq_len(999),
  cb = TRUE,
  year = year
)
## Centroid of each tract, taken from its largest polygon
centroids <- st_centroid(tracts, of_largest_polygon = TRUE)