UI-Research · brchen3 · Oct 20, 2025 · Oct 21, 2025 · Jan 22, 2026
diff --git a/.Rhistory b/.Rhistory
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -64,5 +64,7 @@ Suggests:
     knitr,
     qualtRics,
     rmarkdown,
+    testthat (>= 3.0.0),
     tidyverse
 VignetteBuilder: knitr
+Config/testthat/edition: 3
diff --git a/R/get_business_patterns.R b/R/get_business_patterns.R
@@ -1,9 +1,78 @@
+#' @importFrom magrittr %>%
+#'
 #' Obtain County Business Patterns (CBP) Estimates per County
 #'
-#' @param year The vintage of CBP data desired. Data are available from 1986, though this function likely only supports more recent years (it it tested on 2022-vintage data only). Default is 2022.
-#' @param naics_code_digits One of c(2, 3). Default is 2. NAICS codes range in specificity; 2-digit codes describe the highest groupings of industries, while six-digit codes are exceedingly detailed. There are 20 2-digit NAICS codes and 196 3-digit codes.
-#' @param naics_codes A vector of NAICS codes to query. If NULL, the function will query all available codes with the specified number of digits. If not NULL, this argument overrides the `naics_code_digits` argument.
-#' @return A tibble with data on county-level employees, employers, and aggregate annual payrolls by industry and employer size
+#' @param year The vintage of CBP data desired. Data are available from 1986,
+#'     though this function likely only supports more recent years (it is tested on 2022-vintage data only).
+#'     Default is 2022.
+#' @param geo The level of geography of CBP data desired. Either "county" or "zipcode". Note that
+#'     zipcode-level data are more limited: the ZIP Code Business Patterns (ZBP) dataset includes the number of establishments,
+#'     employment during the week of March 12th, and first-quarter and annual payroll for NAICS 00 (total for all sectors).
+#'     Additionally, the number of establishments (but not employment or payroll) is available by employment
+#'     size of the establishment for 2- through 6-digit NAICS.
+#' @param naics_code_digits One of c(2, 3). Default is 2. NAICS codes range in
+#'     specificity; 2-digit codes describe the highest groupings of industries,
+#'     while six-digit codes are exceedingly detailed. There are 20 2-digit NAICS
+#'     codes and 196 3-digit codes. If more specific codes are desired, leave this
+#'     argument as NULL and supply the desired codes as the argument to `naics_codes`.
+#' @param naics_codes A vector of NAICS codes to query. If NULL, the function will
+#'     query all available codes with the specified number of digits. If not NULL,
+#'     this argument overrides the `naics_code_digits` argument.
+#'
+#'
+#' @details
+#' County Business Patterns (CBP) is an annual series that provides subnational
+#' economic data for establishments with paid employees by industry and employment size.
+#' This series includes the number of establishments, employment during the week of
+#' March 12, first quarter payroll, and annual payroll. Industry classification of business
+#' establishments in CBP is according to the North American Industry Classification System (NAICS)
+#' https://www.census.gov/naics/
+#'
+#' CBP data are useful for studying economic activity of small areas. Federal agencies
+#' use the data to determine employee concentrations and trends by industry.
+#' State and local government offices use the data to assess business changes, develop
+#' fiscal policies, and plan future policies and programs. CBP data are used to benchmark
+#' public and private sector statistical series, surveys, and databases between economic census years.
+#'
+#' While similar to LEHD Origin-Destination Employment Statistics (LODES) data in its coverage of employment
+#' statistics, CBP differs mainly due to its broader geographies (county vs. tract) and
+#' focus on framing the statistics at an establishment/company level rather than at the individual/job
+#' level found in LODES data. CBP also does not offer information on locations of the jobs in relation to
+#' where the employee actually resides.
+#'
+#' The series excludes data on self-employed individuals, employees of private households,
+#' railroad employees, agricultural production employees, and most government employees.
+#' A certain amount of undercoverage occurs in the universe, as the Census Bureau does
+#' not create a multi-unit company structure in the Business Register for very small employers
+#' (less than 10 employees) identified in the Economic Census.
+#'
+#' CBP covers most NAICS industries excluding Crop and Animal Production (NAICS 111,112);
+#' Rail Transportation (NAICS 482); Postal Service (NAICS 491); Pension, Health, Welfare,
+#' and Other Insurance Funds (NAICS 525110, 525120, 525190); Trusts, Estates, and Agency
+#' Accounts (NAICS 525920); Offices of Notaries (NAICS 541120); Private Households (NAICS 814);
+#' and Public Administration (NAICS 92)
+#'
+#'
+#'
+#' @return A tibble with data on county-level employees, employers, and aggregate
+#'     annual payrolls by industry and employer size
+#'     \describe{
+#'         \item{year}{the year for which CBP data are pulled}
+#'         \item{state}{A two-digit state identifier.}
+#'         \item{county}{A three-digit county identifier.}
+#'         \item{employees}{number of individual employees employed in that particular industry
+#'              and establishment size combination}
+#'         \item{employers}{number of establishments of each employment size}
+#'         \item{annual_payroll}{total annual payroll expenditures measured in $1,000's of USD}
+#'         \item{industry}{industry classification according to North American Industry Classification System.
+#'              Refer to details for additional information}
+#'         \item{employee_size_range_label}{range for the employment size of establishments included in each
+#'              given grouping}
+#'         \item{employee_size_range_code}{three-digit code used to categorize employment sizes}
+#'         \item{naics_code}{two to six-digit code used by the NAICS to categorize and sub-categorize industries}
+#'         }
+#'
+#'
 #' @export
 #'
 #' @examples
@@ -17,16 +86,22 @@
 #'  naics_codes = c(221111, 221112))
 #' }
 
-get_business_patterns = function(year = 2022, naics_code_digits = 2, naics_codes = NULL) {
+get_business_patterns = function(year = 2022, geo = "county", naics_code_digits = 2, naics_codes = NULL) {
   if (year < 1986) { stop("Year must be 1986 or later.") }
+  if (year > 2023) { stop("Most recent year for data is 2023.") }
+  if (! geo %in% c("county", "zipcode")) { stop("`geo` must be one of 'county' or 'zipcode'.") }
   if (! naics_code_digits %in% c(2, 3)) {
     stop("`naics_code_digits` must be one of c(2, 3). For more detailed codes, explicitly pass desired codes to the `naics_codes` parameter.") }
 
   naics_codes_metadata = censusapi::listCensusMetadata(
-    name = "cbp",
-    vintage = "2022",
-    type = "variables",
-    include_values = TRUE)
+      name = "cbp",
+      vintage = "2022",
+      type = "variables",
+      include_values = TRUE) %>%
+    #filter out codes 92 and 95 which do not appear to have data associated and
+    #don't appear on the census list of naics codes at
+    #https://www2.census.gov/programs-surveys/cbp/technical-documentation/reference/naics-descriptions/naics2017.txt
+    dplyr::filter(!stringr::str_starts(values_code, "92|95"))
 
   if (!is.null(naics_codes)) {
     naics_code_check = naics_codes_metadata %>%
@@ -62,30 +137,34 @@ get_business_patterns = function(year = 2022, naics_code_digits = 2, naics_codes
     ~ tryCatch({
       censusapi::getCensus(
         name = "cbp",
-        vintage = 2022,
+        vintage = year,
         vars = c(
           "EMP",
+          "YEAR",
           "ESTAB",
           "PAYANN",
           "EMPSZES",
           "NAICS2017_LABEL"),
-        region = "county:*",
-        NAICS2017 = .x)},
+        region = paste0(geo, ":*"),
+        NAICS2017 = .x) %>%
+        mutate(naics_code = .x)},
       error = function(e) {
         message("Error in NAICS2017: ", .x)
         return(tibble::tibble())})) %>%
-    dplyr::select(
-      state,
-      county,
+    dplyr::mutate(
+      # state,
+      # county,
       employees = EMP,
       employers = ESTAB,
       annual_payroll = PAYANN,
       employee_size_range = EMPSZES,
-      industry = NAICS2017_LABEL) %>%
+      industry = NAICS2017_LABEL,
+      naics_code) %>%
     dplyr::mutate(
       industry = industry %>%
         stringr::str_to_lower() %>%
         stringr::str_replace_all(c(" " = "_", ",|\\(|\\)|_for_all_sectors|and_" = "")),
+      year = year,
       ## this recoding is mapped from: https://www2.census.gov/programs-surveys/bds/technical-documentation/label_empszes.csv
       employee_size_range_label = dplyr::case_when(
         employee_size_range == "001" ~ "All establishments",
@@ -137,12 +216,45 @@ get_business_patterns = function(year = 2022, naics_code_digits = 2, naics_codes
         stringr::str_extract(employee_size_range_label, "[0-9]{4}") %>% as.numeric >= 1000 ~ "1000+",
         TRUE ~ employee_size_range_label)) %>%
     dplyr::rename(employee_size_range_code = employee_size_range) %>%
-    dplyr::select(state, county, employees, employers, annual_payroll, industry, employee_size_range_label, employee_size_range_code)
+    {
+    if (geo == "county") {
+      dplyr::select(.,
+        year, state, county, employees, employers, annual_payroll,
+        industry, employee_size_range_label, employee_size_range_code, naics_code
+      )
+    } else if (geo == "zipcode") {
+      dplyr::select(.,
+        year, zip_code, employees, employers, annual_payroll,
+        industry, employee_size_range_label, employee_size_range_code, naics_code)
+    }
+    }
+
+#    dplyr::select(year, state, county, employees, employers, annual_payroll, industry, employee_size_range_label, employee_size_range_code, naics_code)
+
+  high_missingness = cbp %>%
+    skimr::skim() %>%
+    dplyr::filter(complete_rate < .9) %>%
+    dplyr::pull(skim_variable)
+
+
+  if (length(high_missingness) > 0) {
+    base::warning(
+      stringr::str_c(
+        "Variables with high missingness in County Business Patterns",
+        ": ",
+        base::paste(high_missingness, collapse = ", ")
+      ),
+      call. = FALSE
+    )
+  } else {
+    base::print("No variables have high missingness (complete_rate >= 0.9).")
+  }
+
 
   return(cbp)
 }
 
 utils::globalVariables(
   c("EMP", "EMPSZES", "ESTAB", "NAICS2017_LABEL", "PAYANN", "annual_payroll",
     "employee_size_range", "employee_size_range_code", "employee_size_range_label",
-    "employees", "employers", "industry", "values_code"))
+    "employees", "employers", "industry", "values_code", "naics_code"))
diff --git a/R/get_lodes.R b/R/get_lodes.R
@@ -93,14 +93,72 @@ rename_lodes_variables = function(.df) {
 #' Get LEHD Origin-Destination Employment Statistics (LODES) data
 #' Returned data are from LODES Version 8, which is enumerated in 2020-vintage geometries.
 #'
-#' @param lodes_type One of c("rac", "wac", "od"). "rac" = Residence Area Characteristics, where jobs are associated with employees' residences. "wac" = Workplace Area Characteristics, where jobs are associated with employees' workplaces. "od" = Origin-Destination data, where jobs are associated with both workers' residences and their workplaces.
-#' @param jobs_type One of c("all", "primary"). Default is "all", which includes multiple jobs for workers with multiple jobs. "primary" includes only the highest-paying job per worker.
+#' @param lodes_type One of c("rac", "wac", "od"). "rac" = Residence Area
+#'     Characteristics, where jobs are associated with employees' residences.
+#'     "wac" = Workplace Area Characteristics, where jobs are associated with
+#'     employees' workplaces. "od" = Origin-Destination data, where jobs are associated
+#'     with both workers' residences and their workplaces.
+#' @param jobs_type One of c("all", "primary"). Default is "all", which includes
+#'     multiple jobs for workers with multiple jobs. "primary" includes only the
+#'     highest-paying job per worker.
 #' @param states A vector of state abbreviations.
 #' @param years A vector of years.
-#' @param geography One of c("block", "block group", "tract", "county", "state"). Default is "tract".
-#' @param state_part One of c("main", "aux"). Default is "main", which includes only workers who reside inside the state where they work. "aux" returns only workers who work in the specified state but live outside of that state.
+#' @param geography One of c("block", "block group", "tract", "county", "state").
+#'     Default is "tract".
+#' @param state_part One of c("main", "aux"). Default is "main", which includes
+#'     only workers who reside inside the state where they work. "aux" returns
+#'     only workers who work in the specified state but live outside of that state.
+#'
+#' @details
+#' The Longitudinal Employer-Household Dynamics (LEHD) data at the U.S. Census Bureau
+#' is a quarterly database of linked employer-employee data covering over 95% of employment
+#' in the United States. The LEHD data are generated by merging previously collected survey
+#' and administrative data on jobs, businesses, and workers.
+#'
+#' LEHD Origin-Destination Employment Statistics (LODES) is a partially synthetic dataset
+#' that describes geographic patterns of jobs by their employment locations and residential
+#' locations as well as the connections between the two locations. The microdata link employee
+#' and employer data by combining administrative state unemployment insurance wage records
+#' with other administrative and survey data. The source data are aggregated and adjusted
+#' to protect confidentiality.
+#'
+#' LODES data includes three datasets:
+#'    Residence Area Characteristics (RAC):
+#'    This file lists the total number of jobs by the census block where the employee lives.
+#'
+#'    Workplace Area Characteristics (WAC):
+#'    This file lists the total number of jobs by the census block where the employee works.
+#'
+#'    Origin-Destination (OD):
+#'    This file lists job totals by both the census block where the employee lives and the
+#'    census block where the employee works
+#'
+#' While similar to County Business Patterns (CBP) data in its coverage of employment
+#' statistics, LODES differs mainly due to its more granular geographies (tract vs. county) and
+#' its focus on framing the statistics at the individual/job level rather than at the establishment/company level found in CBP data.
+#'
+#' @return A tibble with one record per geography per year per job type. Attributes
+#'     include total jobs and jobs by worker earnings, industry, and demographics;
+#'     the origin-destination results have more limited demographics compared to
+#'     the "wac" and "rac" results.
+#'     \describe{
+#'         \item{year}{the year for which LODES data are pulled}
+#'         \item{state}{A two-digit state identifier.}
+#'         \item{GEOID}{11 digit identifier denoted as either h_GEOID representing the employees' residence census block code or w_GEOID representing the employees' workplace census block code}
+#'         \item{job_type}{one of either 'all' jobs or only 'federal' jobs}
+#'         \item{total_jobs}{total number of jobs in a given tract}
+#'         \item{jobs_workers_age}{number of employees by given age range}
+#'         \item{jobs_earnings}{number of employees by given monthly earnings range}
+#'         \item{jobs_industry}{number of employees by given industry}
+#'         \item{jobs_workers_race}{number of employees by given race, inclusive of hispanic or latino; only available in 'wac' and 'rac' datasets}
+#'         \item{jobs_workers_ethnicity}{number of employees by Hispanic or Latino status, regardless of race; only available in 'wac' and 'rac' datasets}
+#'         \item{jobs_workers_educational_attainment}{number of employees by highest level of education attained; only available in 'wac' and 'rac' datasets}
+#'         \item{jobs_workers_sex}{number of employees by sex; only available in 'wac' and 'rac' datasets}
+#'         \item{jobs_firm_age}{number of employees by the age of employing firm; only available in 'wac' datasets}
+#'         \item{jobs_firm_size}{number of employees for a given range in employer size; only available in 'wac' datasets}
+#'     }
+#'
 #'
-#' @return A tibble with one record per geography per year per job type. Attributes include total jobs and jobs by worker earnings, industry, and demographics; the origin-destination results have more limited demographics compared to the "wac" and "rac" results.
 #' @export
 get_lodes = function(
     lodes_type,
@@ -124,6 +182,12 @@ get_lodes = function(
   if (!state_part %in% c("main", "aux")) {
     stop("`state_part` must be one of 'main' or 'aux'.")}
 
+
+  # if states == "all" then set states parameter as all 50 states plus DC
+  if ("all" %in% states) {
+    states <-  c(state.abb, "DC")
+    }
+
   years = years %>% as.numeric
 
   states = states %>% stringr::str_to_lower()
@@ -147,6 +211,9 @@ form of multi-year job count comparison, we return by default federal job counts
 alongside those for all jobs; users can subtract federal job counts to create
 a temporally-consistent measure of total jobs.\n") }
 
+
+# list of state & year combinations that are missing data as of 12/2025. Sourced from
+# here: https://lehd.ces.census.gov/doc/help/onthemap/LODESTechDoc.pdf
   state_years_missing = tibble::tribble(
     ~ year, ~ state,
     2002, "AK",
@@ -172,16 +239,15 @@ a temporally-consistent measure of total jobs.\n") }
     2009, "DC",
     2009, "MA",
     2010, "MA",
+    2017, "AK",
     2018, "AK",
     2019, "AK",
-    2019, "MS",
     2020, "AK",
-    2020, "MS",
     2021, "AK",
-    2021, "MS",
     2022, "AK",
-    2022, "MS",
-    2022, "MI")
+    2022, "MI",
+    2023, "AK",
+    2023, "MI")
 
   state_years_supplied = expand.grid(years, states %>% stringr::str_to_upper()) %>%
     tibble::as_tibble() %>%
@@ -202,7 +268,7 @@ Returning for only those states that are available for all specified years.\n")
 
   #https://lehd.ces.census.gov/doc/help/onthemap/LODESDataNote-FedEmp2015.pdf
 
-  ## geography identifying variables are variably-named across different geography
+  ## geography-identifying variables are variably named across different geography
   ## parameters; we standardize these to always be "GEOID"
   geoid_rename = c("_geocode|_tract|_bg|_county|_state" = "_GEOID")
 
@@ -212,12 +278,7 @@ Returning for only those states that are available for all specified years.\n")
     jobs_type_all = "JT01"
     jobs_type_federal = "JT05" }
 
-  # states = "TX"
-  # years = 2022
-  # agg_geo = "tract"
-  # lodes_type = "od"
-
-  ## else this is noisy
+  ## suppress messages/warnings; otherwise this is noisy
   suppressWarnings({suppressMessages({
     lodes_all_jobs = lehdr::grab_lodes(
         state = states,
@@ -239,6 +300,14 @@ Returning for only those states that are available for all specified years.\n")
 include federal jobs for 2010 and later. Records for pre-2010 federal jobs are listed
 as NA.\n") }
 
+
+  ## if all requested years are pre-2010, return jobs without federal job data
+  if (years %>% max < 2010) {
+
+    return(lodes_all_jobs)
+
+  } else {
+
   suppressWarnings({suppressMessages({
     lodes_federal_jobs = lehdr::grab_lodes(
         state = states,
@@ -255,8 +324,14 @@ as NA.\n") }
       dplyr::select(-dplyr::matches("create")) })})
 
   join_by = c("year", "GEOID")
+
   if (lodes_type == "od") {
-    join_by = c("year", "w_GEOID", "h_GEOID") }
+    join_by = c("year", "w_GEOID", "h_GEOID") } else if (lodes_type == "rac") {
+      join_by = c("year", "h_GEOID")
+    } else if (lodes_type == "wac") {
+      join_by = c("year", "w_GEOID")
+  }
+
 
   ## both all jobs and all federal jobs
   lodes_all_nonfederal_jobs = lodes_all_jobs %>%
@@ -287,6 +362,7 @@ as NA.\n") }
     rename_lodes_variables()
 
   return(lodes_all_nonfederal_jobs)
+  }
 }
 
 utils::globalVariables(c(