From cee0081564271350e187772999602552e4c45581 Mon Sep 17 00:00:00 2001
From: Will Curran-Groome <wcurrangroome@urban.org>
Date: Sun, 7 Dec 2025 16:31:22 -0500
Subject: [PATCH] adding a new caching function

---
 NAMESPACE       |   1 +
 R/cache_it.R    | 213 ++++++++++++++++++++++++++++++++++++++++++++++++
 _pkgdown.yml    |   1 +
 man/cache_it.Rd |  84 +++++++++++++++++++
 4 files changed, 299 insertions(+)
 create mode 100644 R/cache_it.R
 create mode 100644 man/cache_it.Rd

diff --git a/NAMESPACE b/NAMESPACE
index e3f4fd0..0e38021 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(cache_it)
 export(convert_table_text_to_dataframe)
 export(estimate_units_per_parcel)
 export(estimate_zoning_envelope)
diff --git a/R/cache_it.R b/R/cache_it.R
new file mode 100644
index 0000000..a66a423
--- /dev/null
+++ b/R/cache_it.R
@@ -0,0 +1,213 @@
+#' Cache an object to a parquet file; optionally read from disk
+#'
+#' This function writes an R object to a parquet file with an automatic datestamp
+#' (YYYY_MM_DD format) in the filename. It can also read from an existing cached
+#' file if one exists. For sf objects, the function automatically uses sfarrow for
+#' reading/writing and adds "_sf" to the filename to indicate the file format.
+#'
+#' @param object A dataframe, tibble, or sf object to cache. Can be provided as
+#'   either a quoted or unquoted name. Optional when reading from cache - in this
+#'   case, file_name must be provided.
+#' @param file_name File name (without extension). Optional when object is provided
+#'   (uses object's name). Required when object is missing and reading from cache.
+#' @param path Directory path where the file should be saved/read. Defaults to /data.
+#'   If the path does not exist, the user will be prompted to create it (in
+#'   interactive sessions) or an error will be thrown (in non-interactive sessions).
+#' @param read Logical or character. TRUE by default.
+#'   - TRUE: Find and read the most recent cached version based on datestamp.
+#'   - FALSE: Skip reading, always write a new cached file
+#'   - Character: Read the specific file with this exact filename (including extension).
+#'   Defaults to TRUE.
+#'
+#' @return The object that was cached (either written or read)
+#'
+#' @examples
+#' \dontrun{
+#' ## Note: datestamps in filenames are illustrative; user results will
+#' ## vary depending on the date at runtime
+#'
+#' # Regular data frames
+#' my_data <- tibble(x = 1:10, y = letters[1:10])
+#'
+#' # Cache with automatic naming and datestamp
+#' cache_it(my_data)  # Creates: my_data_2025_12_07.parquet
+#'
+#' # Cache with custom filename
+#' cache_it(my_data, file_name = "custom_name")
+#'
+#' # Read most recent cached version if exists, otherwise write
+#' cached_data <- cache_it(my_data, read = TRUE)
+#'
+#' # Always write a new file, don't read from cache
+#' cache_it(my_data, read = FALSE)
+#'
+#' # Read a specific cached file by name
+#' old_data <- cache_it(my_data, read = "my_data_2025_12_01.parquet")
+#'
+#' # Read from cache when object doesn't exist in environment yet (using file_name)
+#' my_data <- cache_it(file_name = "my_data", read = TRUE)
+#'
+#' # Read from cache when object doesn't exist (using quoted name)
+#' my_data <- cache_it("my_data", read = TRUE)
+#'
+#' # Read from cache when object doesn't exist (using unquoted name)
+#' my_data <- cache_it(my_data, read = TRUE)
+#'
+#' # Read specific file when object doesn't exist
+#' old_data <- cache_it(read = "my_data_2025_12_01.parquet")
+#'
+#' # SF objects (automatically uses sfarrow)
+#' my_sf <- sf::st_read(system.file("shape/nc.shp", package="sf"))
+#' cache_it(my_sf)  # Creates: my_sf_2025_12_07_sf.parquet
+#'
+#' # Read most recent sf cached file
+#' cached_sf <- cache_it(my_sf, read = TRUE)
+#'
+#' # Read specific sf cached file
+#' old_sf <- cache_it(my_sf, read = "my_sf_2025_12_01_sf.parquet")
+#' }
+#'
+#' @export
+cache_it <- function(object,
+                     file_name = NULL,
+                     path = "/data",
+                     read = TRUE) {
+
+  # Overview:
+  #   1. Resolve the cache name from `file_name` or from the expression passed
+  #      as `object`.
+  #   2. Depending on `read`, return the newest cached file for that name,
+  #      return one specific file, or skip reading.
+  #   3. If nothing was returned, write `object` to a new datestamped parquet
+  #      file under `path` and return it.
+  #
+  # Returns the object that was read from, or written to, the cache.
+  # Stops if no name can be resolved, if `path` does not exist and is not
+  # created, if a specifically requested file does not exist, or if a write
+  # is needed but no object value is available.
+
+  object_provided <- !missing(object)
+  # A character `read` names the exact file to load, so no cache name is needed
+  read_specific <- is.character(read)
+
+  # Capture the unevaluated argument. A string literal such as "my_data" is a
+  # cache name to look up rather than data to write, whether or not
+  # `file_name` was also supplied.
+  obj_expr <- if (object_provided) substitute(object) else NULL
+  is_string_literal <- is.character(obj_expr)
+
+  if (is.null(file_name)) {
+    if (object_provided) {
+      # deparse1() always yields one string, even for long expressions
+      file_name <- if (is_string_literal) obj_expr else deparse1(obj_expr)
+    } else if (!read_specific) {
+      stop("Either 'object' or 'file_name' must be provided")
+    }
+  }
+
+  # The object may be named but not yet defined (read-only use); evaluating
+  # it then raises an error, which is treated as "no value available".
+  has_object_value <- object_provided && !is_string_literal &&
+    tryCatch({
+      force(object)
+      TRUE
+    }, error = function(e) {
+      FALSE
+    })
+
+  # sf objects are written and read with sfarrow instead of arrow
+  is_sf <- has_object_value && inherits(object, "sf")
+
+  # The "_sf.parquet" suffix marks files to read via sfarrow; others use arrow
+  read_cached <- function(file) {
+    if (stringr::str_detect(file, "_sf\\.parquet$")) {
+      sfarrow::st_read_parquet(file)
+    } else {
+      arrow::read_parquet(file)
+    }
+  }
+
+  # if the specified `path` does not exist, check with user about creating it
+  if (!dir.exists(path)) {
+    if (interactive()) {
+      create_dir <- readline(prompt = stringr::str_c("The specified `path` does not exist. Do you want to create a directory at ", path, "? Y/N: "))
+      if (create_dir %in% c("Y", "y")) {
+        dir.create(path, recursive = TRUE)
+      } else {
+        stop("Specify alternate parameters.")
+      }
+    } else {
+      stop("Path does not exist: ", path)
+    }
+  }
+
+  if (isTRUE(read)) {
+    # Match on the datestamp suffix only, then compare the remaining stem to
+    # `file_name` literally so regex metacharacters in the name (e.g. ".")
+    # cannot match unrelated files.
+    stamp_pattern <- "_\\d{4}_\\d{2}_\\d{2}(_sf)?\\.parquet$"
+    cached_files <- list.files(path, pattern = stamp_pattern, full.names = TRUE)
+    stems <- stringr::str_remove(basename(cached_files), stamp_pattern)
+    cached_files <- cached_files[stems == file_name]
+
+    # Parse the trailing datestamp (not the first date-like run, which could
+    # belong to the name itself); impossible dates become NA and are dropped.
+    file_dates <- cached_files |>
+      basename() |>
+      stringr::str_extract(stamp_pattern) |>
+      as.Date(format = "_%Y_%m_%d")
+    cached_files <- cached_files[!is.na(file_dates)]
+    file_dates <- file_dates[!is.na(file_dates)]
+
+    if (length(cached_files) > 0) {
+      # which.max() returns the first maximum, so ties keep list.files() order
+      newest <- which.max(file_dates)
+      most_recent_file <- cached_files[newest]
+      most_recent_date <- format(file_dates[newest], "%Y_%m_%d")
+
+      message(stringr::str_c("Reading most recent cached file: ", basename(most_recent_file),
+                            " (dated ", most_recent_date, ")"))
+      return(read_cached(most_recent_file))
+    }
+
+    # Nothing usable on disk: fall through to the write step below
+    message(stringr::str_c("No cached files found for '", file_name,
+                          "'. Writing new file."))
+
+  } else if (read_specific) {
+    # Read exactly the requested file; a missing file is an error, not a write
+    specific_path <- file.path(path, read)
+    if (!file.exists(specific_path)) {
+      stop("Specified file does not exist: ", specific_path)
+    }
+
+    message(stringr::str_c("Reading specified cached file: ", read))
+    return(read_cached(specific_path))
+
+  } else if (isFALSE(read)) {
+    # Don't read, proceed to writing
+    message(stringr::str_c("Skipping read. Writing new cached file."))
+  }
+
+  # Nothing was read, so a real object is required from here on
+  if (!has_object_value) {
+    stop("No cached file found and no object provided to write. Please provide an object or check the file_name/path.")
+  }
+
+  # Build the datestamped target, with an "_sf" marker for sf objects
+  date_str <- format(Sys.Date(), "%Y_%m_%d")
+  suffix <- if (is_sf) "_sf.parquet" else ".parquet"
+  full_path <- file.path(path, stringr::str_c(file_name, "_", date_str, suffix))
+
+  # Write with the backend that matches the object type
+  if (is_sf) {
+    sfarrow::st_write_parquet(obj = object, dsn = full_path)
+    message(stringr::str_c("Cached sf object to: ", basename(full_path)))
+  } else {
+    arrow::write_parquet(object, full_path)
+    message(stringr::str_c("Cached object to: ", basename(full_path)))
+  }
+
+  # Return the object visibly so the call can be assigned or piped
+  return(object)
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
index cfe7e91..def5c1f 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -56,3 +56,4 @@ reference:
   - polygons_to_linestring
   - read_ipums_cached
   - inflation_adjust
+  - cache_it
diff --git a/man/cache_it.Rd b/man/cache_it.Rd
new file mode 100644
index 0000000..bd2b832
--- /dev/null
+++ b/man/cache_it.Rd
@@ -0,0 +1,84 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cache_it.R
+\name{cache_it}
+\alias{cache_it}
+\title{Cache an object to a parquet file; optionally read from disk}
+\usage{
+cache_it(object, file_name = NULL, path = "/data", read = TRUE)
+}
+\arguments{
+\item{object}{A dataframe, tibble, or sf object to cache. Can be provided as
+either a quoted or unquoted name. Optional when reading from cache - in this
+case, file_name must be provided.}
+
+\item{file_name}{File name (without extension). Optional when object is provided
+(uses object's name). Required when object is missing and reading from cache.}
+
+\item{path}{Directory path where the file should be saved/read. Defaults to /data.
+If the path does not exist, the user will be prompted to create it (in
+interactive sessions) or an error will be thrown (in non-interactive sessions).}
+
+\item{read}{Logical or character. TRUE by default.
+\itemize{
+\item TRUE: Find and read the most recent cached version based on datestamp.
+\item FALSE: Skip reading, always write a new cached file
+\item Character: Read the specific file with this exact filename (including extension).
+Defaults to TRUE.
+}}
+}
+\value{
+The object that was cached (either written or read)
+}
+\description{
+This function writes an R object to a parquet file with an automatic datestamp
+(YYYY_MM_DD format) in the filename. It can also read from an existing cached
+file if one exists. For sf objects, the function automatically uses sfarrow for
+reading/writing and adds "_sf" to the filename to indicate the file format.
+}
+\examples{
+\dontrun{
+## Note: datestamps in filenames are illustrative; user results will
+## vary depending on the date at runtime
+
+# Regular data frames
+my_data <- tibble(x = 1:10, y = letters[1:10])
+
+# Cache with automatic naming and datestamp
+cache_it(my_data)  # Creates: my_data_2025_12_07.parquet
+
+# Cache with custom filename
+cache_it(my_data, file_name = "custom_name")
+
+# Read most recent cached version if exists, otherwise write
+cached_data <- cache_it(my_data, read = TRUE)
+
+# Always write a new file, don't read from cache
+cache_it(my_data, read = FALSE)
+
+# Read a specific cached file by name
+old_data <- cache_it(my_data, read = "my_data_2025_12_01.parquet")
+
+# Read from cache when object doesn't exist in environment yet (using file_name)
+my_data <- cache_it(file_name = "my_data", read = TRUE)
+
+# Read from cache when object doesn't exist (using quoted name)
+my_data <- cache_it("my_data", read = TRUE)
+
+# Read from cache when object doesn't exist (using unquoted name)
+my_data <- cache_it(my_data, read = TRUE)
+
+# Read specific file when object doesn't exist
+old_data <- cache_it(read = "my_data_2025_12_01.parquet")
+
+# SF objects (automatically uses sfarrow)
+my_sf <- sf::st_read(system.file("shape/nc.shp", package="sf"))
+cache_it(my_sf)  # Creates: my_sf_2025_12_07_sf.parquet
+
+# Read most recent sf cached file
+cached_sf <- cache_it(my_sf, read = TRUE)
+
+# Read specific sf cached file
+old_sf <- cache_it(my_sf, read = "my_sf_2025_12_01_sf.parquet")
+}
+
+}