diff --git a/flow/flow.gap.fill.nonrglr/Dockerfile b/flow/flow.gap.fill.nonrglr/Dockerfile new file mode 100644 index 000000000..4478d4d25 --- /dev/null +++ b/flow/flow.gap.fill.nonrglr/Dockerfile @@ -0,0 +1,27 @@ +# Dockerfile for NEON IS Data Processing - flow.gap.fill.nonrglr + +# Start with the neon-is-base-r image. +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.7.0 + +ARG FLOW_DIR="./flow" +ARG APP_DIR="flow.gap.fill.nonrglr" + +# maintainer handle +MAINTAINER "Nora Catolico" ncatolico@battelleecology.org + +# Copy the lockfile and restore known working versions of R dependency packages +COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock +RUN R -e 'renv::restore(lockfile="/renv.lock")' + +# Create app user +RUN groupadd app && \ + useradd app -g app +WORKDIR /home/app + +# Copy in application code +COPY ${FLOW_DIR}/${APP_DIR}/flow.gap.fill.nonrglr.R . +COPY ${FLOW_DIR}/${APP_DIR}/wrap.gap.fill.nonrglr.R . + +# Run as app user +RUN chown app:app -R /home/app +USER app diff --git a/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R new file mode 100644 index 000000000..e3ee724b7 --- /dev/null +++ b/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R @@ -0,0 +1,249 @@ +############################################################################################## +#' @title Gap filling module for non-regularized data in NEON IS data processing. + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} \cr + +#' @description Workflow. Bin data to generate a regular time sequence of observations. +#' General code workflow: +#' Parse input parameters +#' Read in output schemas if indicated in parameters +#' Determine datums to process (set of files/folders to process as a single unit) +#' For each datum: +#' Create output directories and copy (by symbolic link) unmodified components +#' Read regularization frequency from location file (if not in input parameters) +#' Loop through all data files +#' Regularize data in each file +#' Write out the gap filled data +#' +#' This script is run at the command line with the following arguments. Each argument must be a +#' string in the format "Para=value", where "Para" is the intended parameter name and "value" is +#' the value of the parameter. Note: If the "value" string begins with a $ (e.g. $DIR_IN), the +#' value of the parameter will be assigned from the system environment variable matching the value +#' string. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", where value is the path to the input data directory. NOTE: This path must be a +#' parent of the terminal directory where the data to be gap filled reside. See argument "DirFill" +#' below to indicate the terminal directory. +#' +#' The input path is structured as follows: #/pfs/BASE_REPO/#/yyyy/mm/dd/#, where # indicates any +#' number of parent and child directories of any name, so long as they are not 'pfs', the same name +#' as the terminal directory indicated in argument "DirFill", or recognizable as the 'yyyy/mm/dd' +#' structure which indicates the 4-digit year, 2-digit month, and 2-digit day of the data contained +#' in the folder. +#' +#' For example: +#' Input path = /scratch/pfs/sunav2_fill_date_gaps/sunav2/2019/01/01 +#' +#' 2. "DirOut=value", where the value is the output path that will replace the #/pfs/BASE_REPO portion +#' of DirIn. +#' +#' 3. 
"DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of DirIn. +#' +#' 4. "DirFill=value", where value is the name of the terminal directory where the data to be +#' gap filled resides. This will be one or more child levels away from "DirIn". All files in the +#' terminal directory will be gap filled. The value may also be a vector of terminal directories, +#' separated by pipes (|). All terminal directories must be present and at the same directory level. +#' For example, "DirFill=data|flags" indicates to regularize the data files within each the data +#' and flags directories. +#' +#' #' 5. "FileSchm=value" (optional), where value is the full path to schema for data output by +#' this workflow. The value may be NA, in which case the output schema will be the same as the input +#' data. The value may be a single file, in which case it will apply to all output, or +#' multiple values in which case the argument is formatted as dir:value|dir:value... +#' where dir is one of the directories specified in DirFill and value is the path to the schema file +#' for the output of that directory. Multiple dir:value pairs are separated by pipes (|). +#' For example, "FileSchm=data:/path/to/schemaData.avsc|flags:NA" indicates that the +#' output from the data directory will be written with the schema /path/to/schemaData.avsc and the +#' output from the flags directory will be the same as the input files found in that +#' directory. +#' +#' 6. "WndwFill=value", where value is the window in minutes in which data are expected. It is formatted as a 3 character sequence, +#' representing the number of minutes over which any number of measurements are expected. +#' For example, "WndwFill=015" refers to a 15-minute interval, while "WndwAgr=030" refers to a +#' 30-minute interval. +#' +#' 7. "DirSubCopy=value" (optional), where value is the names of additional subfolders, separated by +#' pipes, at the same level as the regularization folder in the input path that are to be copied with a +#' symbolic link to the output path. +#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. +#' +#' @return gap filled data and flag output in Parquet format in DirOut, where DirOut directory +#' replaces BASE_REPO but otherwise retains the child directory structure of the input path. 
+ +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' Stepping through the code in R studio +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg<-c( "DirIn=~/pfs/sunav2_fill_date_gaps/sunav2/2025/06/23/CFGLOC110819", +# "DirOut=~/pfs/out ", +# "DirErr=~/pfs/out/errored_datums ", +# "DirFill=data|flags", +# "WndwFill=015", +# "FileSchm=data:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_logfilled.avsc|flags:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_calibration_flags.avsc|flags:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_log_flags.avsc", +# "DirSubCopy=location|uncertainty_coef") +#' @seealso \code{\link[eddy4R.base]{def.rglr}} + +# changelog and author contributions / copyrights +# Nora Catolico (12/4/2025) +# original creation +############################################################################################## +library(foreach) +library(doParallel) +library(dplyr) + +# Source the wrapper function. Assume it is in the working directory +source("./wrap.gap.fill.nonrglr.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- + NEONprocIS.base::def.arg.pars( + arg = arg, + NameParaReqd = c( + "DirIn", + "DirOut", + "DirErr", + "DirFill", + "WndwFill" + ), + NameParaOptn = c( + "DirSubCopy", + "FileSchm" + ), + log = log + ) + +# Retrieve output schema(s) +log$debug(base::paste0( + 'Output schema(s) for regularized data: ', + base::paste0(Para$FileSchm, collapse = ',') +)) +if(length(Para$FileSchm)>0){ + SchmFill <- + NEONprocIS.base::def.vect.pars.pair( + vect = Para$FileSchm, + KeyExp = Para$DirFill, + ValuDflt = 'NA', + NameCol = c('DirFill', 'FileSchmFill'), + log = log + ) + # Read in the schema(s) + SchmFill$SchmFill <- NA + for (idxSchmFill in 1:base::length(SchmFill$FileSchmFill)) { + if (SchmFill$FileSchmFill[idxSchmFill] != 'NA') { + SchmFill$SchmFill[idxSchmFill] <- + base::paste0(base::readLines(SchmFill$FileSchmFill[idxSchmFill]), + collapse = '') + } + } +}else{ + SchmFill <- NA +} + + +# Echo arguments +log$debug(base::paste0('Input directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0( + 'Terminal Directories to regularize: ', + base::paste0(Para$DirFill, collapse = ',') +)) + +# Retrieve intervals for gap filling +WndwFill <- base::as.numeric(Para$WndwFill) +log$debug(base::paste0('Interval for gap filling, in minutes: ',base::paste0(WndwFill,collapse=','))) + + +# Retrieve output schema(s) +log$debug(base::paste0( + 'Output schema(s) for gap filled data: ', + base::paste0(Para$FileSchmFill, collapse = ',') +)) + +# Retrieve optional subdirectories to copy over +DirSubCopy <- + base::unique(base::setdiff(Para$DirSubCopy, Para$DirFill)) +log$debug(base::paste0( + 'Additional subdirectories to copy: ', + base::paste0(DirSubCopy, collapse = ',') 
+)) + +nameDirSub <- base::as.list(c(Para$DirFill)) +log$debug(base::paste0( + 'Expected subdirectories of each datum path: ', + base::paste0(nameDirSub, collapse = ',') +)) + +# Find all the input paths (datums). We will process each one. +DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = nameDirSub, + log = log) + +# Process each datum +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxDirIn = DirIn) %dopar% { + + log$info(base::paste0('Processing datum path: ', idxDirIn)) + + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.gap.fill.nonrglr(DirIn=idxDirIn, + DirOutBase=Para$DirOut, + WndwFill=WndwFill, + DirFill=Para$DirFill, + SchmFill=SchmFill, + DirSubCopy=DirSubCopy, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + + # Re-route the failed datum + NEONprocIS.base::def.err.datm( + err=err, + call.stack=call.stack, + DirDatm=idxDirIn, + DirErrBase=Para$DirErr, + RmvDatmOut=TRUE, + DirOutBase=Para$DirOut, + log=log + ) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + + return() + +} # End loop around datum paths diff --git a/flow/flow.gap.fill.nonrglr/renv.lock b/flow/flow.gap.fill.nonrglr/renv.lock new file mode 100644 index 000000000..e1243e0b3 --- /dev/null +++ b/flow/flow.gap.fill.nonrglr/renv.lock @@ -0,0 +1,235 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "R6": { + "Package": "R6", + "Version": "2.6.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d4335fe7207f1c01ab8c41762f5840d4", + "Requirements": [] + }, + "cli": { + "Package": "cli", + "Version": "3.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "16850760556401a2eeb27d39bd11c9cb", + "Requirements": [] + }, + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec", + "Requirements": [ + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "pillar", + "rlang", + "tibble", + "tidyselect", + "vctrs" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "fs": { + "Package": "fs", + "Version": "1.6.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7eb1e342eee7e0a7449c49cdaa526d39", + "Requirements": [] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "glue": { + "Package": "glue", + "Version": "1.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5899f1eaa825580172bb56c08266f37c", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": 
"8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8552d117e1b808b09a832f589b79035", + "Requirements": [ + "cli", + "glue", + "rlang" + ] + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7ce2733a9826b3aeb1775d56fd305472", + "Requirements": [] + }, + "pillar": { + "Package": "pillar", + "Version": "1.10.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1098920a19b5cd5a15bacdc74a89979d", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "utf8", + "vctrs" + ] + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "01f28d4278f15c76cddbea05899c5d6f", + "Requirements": [] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1", + "Requirements": [] + }, + "tibble": { + "Package": "tibble", + "Version": "3.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "784b27d0801c3829de602105757b2cd7", + "Requirements": [ + "cli", + "lifecycle", + "magrittr", + "pillar", + "pkgconfig", + "rlang", + "vctrs" + ] + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "829f27b9c4919c16b593794a6344d6c0", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ] + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d526d558be176e9ceb68c3d1e83479b7", + "Requirements": [] + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c03fa420630029418f7e6da3667aac4a", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang" + ] + }, + "withr": { + "Package": "withr", + "Version": "3.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cc2d62c76458d425210d1eb1478b30b4", + "Requirements": [] + } + } +} diff --git a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R new file mode 100644 index 000000000..31bb7c01a --- /dev/null +++ b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R @@ -0,0 +1,202 @@ +############################################################################################## +#' @title Gap filling module for non-regularized data in NEON IS data processing. + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} + +#' @description Wrapper function. Bin data to generate a regular time sequence of observations. +#' General code workflow: +#' Error-check input parameters +#' Read regularization frequency from location file if expected +#' Create output directories and copy (by symbolic link) unmodified components +#' Loop through all data files +#' Regularize data in each file +#' Write out the regularized data +#' +#' +#' @param DirIn Character value. 
The input path to the data from a single sensor or location, structured as follows: +#' #/pfs/BASE_REPO/#/yyyy/mm/dd/#/id, where # indicates any number of parent and child directories +#' of any name, so long as they are not 'pfs' or recognizable as the 'yyyy/mm/dd' structure which indicates +#' the 4-digit year, 2-digit month, and' 2-digit day. The id is the unique identifier of the sensor or location. \cr +#' +#' Nested within this path are the folders: +#' /data +#' /flags +#' +#' @param DirOutBase Character value. The output path that will replace the #/pfs/BASE_REPO portion of DirIn. +#' +#' @param DirFill List of the terminal directories where the data to be +#' gap filled resides. This will be one or more child levels away from "DirIn". All files in the +#' terminal directory will be gap filled. The value may also be a vector of terminal directories, +#' separated by pipes (|). All terminal directories must be present and at the same directory level. +#' For example, "DirFill=data|flags" indicates to regularize the data files within each the data +#' and flags directories. +#' +#' @param FileSchm Character value (optional), where value is the full path to schema for data output by +#' this workflow. The value may be NA, in which case the output schema will be the same as the input +#' data. The value may be a single file, in which case it will apply to all output, or +#' multiple values in which case the argument is formatted as dir:value|dir:value... +#' where dir is one of the directories specified in DirFill and value is the path to the schema file +#' for the output of that directory. Multiple dir:value pairs are separated by pipes (|). +#' For example, "FileSchm=data:/path/to/schemaData.avsc|flags:NA" indicates that the +#' output from the data directory will be written with the schema /path/to/schemaData.avsc and the +#' output from the flags directory will be the same as the input files found in that +#' directory. +#' +#' @param WndwFill Character value. The window in minutes in which data are expected. It is formatted as a 3 character sequence, +#' representing the number of minutes over which any number of measurements are expected. +#' For example, "WndwFill=015" refers to a 15-minute interval, while "WndwAgr=030" refers to a +#' 30-minute interval. +#' +#' @param DirSubCopy (optional) Character vector. The names of additional subfolders at +#' the same level as the location folder in the input path that are to be copied with a symbolic link to the +#' output path (i.e. not combined but carried through as-is). + +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. + +#' @return Regularized data output in Parquet format in DirOutBase, where DirOutBase directory +#' replaces BASE_REPO of DirIn but otherwise retains the child directory structure of the input path. 
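+#'
+#' For reference, gap detection floors each readout_time to its window start before comparing
+#' against the expected sequence of window starts; a minimal sketch with hypothetical values:
+# WndwFill    <- 15                                      # minutes
+# readoutTime <- as.POSIXct("2019-01-01 00:07:30", tz = "UTC")
+# wndwStart   <- as.POSIXct(floor(as.numeric(readoutTime) / (WndwFill*60)) * (WndwFill*60),
+#                           origin = "1970-01-01", tz = "UTC")
+# wndwStart                                              # "2019-01-01 00:00:00 UTC"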
+ +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' # Not run + +#' @seealso None currently + +# changelog and author contributions / copyrights +# Nora Catolico (2025-12-4) +# original creation +############################################################################################## +wrap.gap.fill.nonrglr <- function(DirIn, + DirOutBase, + DirFill, + WndwFill, + SchmFill, + DirSubCopy=NULL, + log=NULL +){ + + # Start logging if not already + if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + # Gather info about the input directory (including date) and create the output directory. + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn,log=log) + dirOut <- base::paste0(DirOutBase, InfoDirIn$dirRepo) + + timeBgn <-InfoDirIn$time # Earliest possible start date for the data + timeEnd <- InfoDirIn$time + base::as.difftime(1, units = 'days') + # All minute window start times in [timeBgn, timeEnd) + all_starts <- seq(timeBgn, timeEnd - WndwFill*60, by = WndwFill*60) + + # Helper to floor readout_times to window starts + floor_15m <- function(x) { + as.POSIXct(floor(as.numeric(x) / (WndwFill*60)) * (WndwFill*60), + origin = "1970-01-01", tz = attr(x, "tzone")) + } + + # Copy with a symbolic link the desired subfolders + if (base::length(DirSubCopy) > 0) { + NEONprocIS.base::def.dir.copy.symb(base::paste0(DirIn, '/', DirSubCopy), + dirOut, + log = log) + } + + + # --------- loop through the directories ---------- + for (i in 1:length(DirFill)){ + + subDir<-DirFill[i] + + # Take stock of our files. + subDirIn <- fs::path(DirIn,subDir) + files <- base::list.files(subDirIn,full.names=FALSE) + + #loop through files in directory + for (j in 1:length(files)){ + fileName <- files[j] + + # Load in file in parquet format into data frame + df <- + base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(subDirIn, '/', fileName), + log = log), + silent = FALSE) + if (base::any(base::class(df) == 'try-error')) { + # Generate error and stop execution + log$error(base::paste0('File ', subDirIn, '/', fileName, ' is unreadable.')) + base::stop() + } + df$readout_time <- base::as.POSIXlt(df$readout_time) + + # Windows that already have at least one observation + present <- unique(floor_15m(df$readout_time)) + + # Missing windows + missing <- all_starts[!all_starts %in% present] + + # Build blank rows for missing windows + blanks <- data.frame(readout_time = missing) + + # Combine and sort + df_filled <- bind_rows(df, blanks) + df_filled <- df_filled[order(df_filled$readout_time), ] + + #add in source id if needed + if("source_id" %in% colnames(df_filled)){ + source_id<-unique(df_filled$source_id[!is.na(df_filled$source_id)]) + if(length(source_id>0)){ + df_filled$source_id[is.na(df_filled$source_id)]<-source_id[1] + }else{ + df_filled$source_id[is.na(df_filled$source_id)]<-"99999" + } + } + + # create output directories + subDirOut <- paste0(dirOut,'/',subDir,'/') + base::dir.create(subDirOut,recursive=TRUE) + + # select output schema + if(!is.na(SchmFill)){ + FileSchmFill<-SchmFill$FileSchmFill[grepl(subDir,SchmFill$DirFill)] + if(length(FileSchmFill)>1){ + #specific to suna for now. 
can be updated if needed down the road + if(grepl("log",fileName,ignore.case = TRUE)){ + FileSchmFill<-FileSchmFill[grepl("log",FileSchmFill,ignore.case = TRUE)] + } + if(grepl("cal",fileName,ignore.case = TRUE)){ + FileSchmFill<-FileSchmFill[grepl("cal",FileSchmFill,ignore.case = TRUE)] + } + } + if (base::is.na(FileSchmFill)|FileSchmFill=="NA"|length(FileSchmFill)>1) { + # use the output data to generate a schema + idxSchmFill <- base::attr(df_filled, 'schema') + } else { + idxSchmFill <- SchmFill$SchmFill[SchmFill$FileSchmFill==FileSchmFill] + } + }else{ + # use the output data to generate a schema + idxSchmFill <- base::attr(df_filled, 'schema') + } + + + # write out data + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = df_filled, + NameFile = base::paste0(subDirOut,fileName),Schm = idxSchmFill),silent=TRUE) + if(class(rptOut)[1] == 'try-error'){ + log$error(base::paste0('Cannot write file to ',base::paste0(subDirOut,fileName),'. ',attr(rptOut, "condition"))) + stop() + } else { + log$info(base::paste0('File written successfully in ', base::paste0(subDirOut,fileName))) + } + + } + } + +} diff --git a/flow/flow.insufficient.data/Dockerfile b/flow/flow.insufficient.data/Dockerfile new file mode 100644 index 000000000..0e560f46c --- /dev/null +++ b/flow/flow.insufficient.data/Dockerfile @@ -0,0 +1,20 @@ +# Dockerfile for NEON IS Data Processing - insufficient data + +# Start with the NEON IS base package image +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.7.0 + +ARG FLOW_DIR="./flow" +ARG APP_DIR="flow.insufficient.data" + +# maintainer handle +MAINTAINER "Bobby Hensley" hensley@battelleecology.org + +# Copy the lockfile and restore known working versions of R dependency packages +# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image +COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock +RUN R -e 'renv::restore(lockfile="/renv.lock")' + +# Copy in sunav2 flag workflow +COPY ${FLOW_DIR}/${APP_DIR}/flow.insufficient.data.R . +COPY ${FLOW_DIR}/${APP_DIR}/wrap.insufficient.data.R . + diff --git a/flow/flow.insufficient.data/flow.insufficient.data.R b/flow/flow.insufficient.data/flow.insufficient.data.R new file mode 100644 index 000000000..f24efc991 --- /dev/null +++ b/flow/flow.insufficient.data/flow.insufficient.data.R @@ -0,0 +1,166 @@ +############################################################################################## +#' @title Workflow for insufficient data calculations + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} + +#' @description Workflow. Uses number of measuremnts in averaging window to determine whether insufficient +#' data quality flag should be applied. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", The base file path to the statistics data (including number of points) and the QM data. +#' +#' 2. "minPoints=value", The minimum number of points required to not trigger the insufficient data quality flag. +#' Currently set in the yaml. +#' +#' 3. "DirOut=value", The base file path for the output data. +#' +#' 4. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of \code{DirIn}. +#' +#' 5. "SchmStats=value" (optional), The avro schema for the input and output stats file. +#' +#' 6. "SchmQMs=value" (optional), The avro schema for the updated QMs (insufficientDataQF added). +#' +#' 7. 
"DirSubCopy=value" (optional), where value is the names of additional subfolders, separated by +#' pipes, that are to be copied with a symbolic link to the output path. +#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. +#' +#' @return Updated stats and QMs data files in daily parquets. + +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' flow.insufficient.data <- function(DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", +#' minPoints=10, +#' DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" , +#' SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse=''), +#' SchmQMs<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse=''), +#' log=log) +#' Stepping through the code in R studio +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt_updated/nitrate-surfacewater_SUGG103100", +# "minPoints=5","DirOut=~/pfs/nitrate_null_gap_ucrt_updated2","DirErr=~/pfs/out/errored_datums","DirSubCopy=location", +# "SchmQMs=~/pfs/nitrate_avro_schemas/nitrate/nitrate_insufficient_data.avsc") +# rm(list=setdiff(ls(),c('arg','log'))) + +#' @seealso None currently + +# changelog and author contributions / copyrights +#' Bobby Hensley (2025-10-31) +#' Initial creation. +#' Nora Catolico (2025-11-04) +#' add in copied directories +#' Nora Catolico (2025-12-11) +#' fix schema outputs + +############################################################################################## +options(digits.secs = 3) +library(foreach) +library(doParallel) + +# Source the wrapper function. Assume it is in the working directory +source("./wrap.insufficient.data.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","minPoints","DirOut","DirErr"), + NameParaOptn = c("SchmStats","SchmQMs","DirSubCopy"),log = log) + +# Echo arguments +log$debug(base::paste0('Input data directory: ', Para$DirIn)) +log$debug(base::paste0('Minimum points: ', Para$minPoints)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0('Schema for output stats: ', Para$SchmStats)) +log$debug(base::paste0('Schema for output QMs: ', Para$SchmQMs)) +log$debug(base::paste0('Director to copy: ', Para$DirSubCopy)) + +# Read in the schemas so we only have to do it once and not every time in the avro writer. 
+if(base::is.null(Para$SchmStats) || Para$SchmStats == 'NA'){ + SchmStats <- NULL +} else { + SchmStats <- base::paste0(base::readLines(Para$SchmStats),collapse='') +} +if(base::is.null(Para$SchmQMs) || Para$SchmQMs == 'NA'){ + SchmQMs <- NULL +} else { + SchmQMs <- base::paste0(base::readLines(Para$SchmQMs),collapse='') +} + + +# Find all the input paths (datums). We will process each one. +DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = c('stats','quality_metrics'), + log = log) + +# Retrieve optional subdirectories to copy over +DirSubCopy <- base::unique(base::setdiff(Para$DirSubCopy,'stats')) +log$debug(base::paste0('Additional subdirectories to copy: ',base::paste0(DirSubCopy,collapse=','))) + +# Process each datum path +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxFileIn = DirIn) %dopar% { + log$info(base::paste0('Processing path to file: ', idxFileIn)) + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.insufficient.data( + DirIn=idxFileIn, + minPoints=Para$minPoints, + DirOutBase=Para$DirOut, + SchmStats=SchmStats, + SchmQMs=SchmQMs, + DirSubCopy=DirSubCopy, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + log$error(err$message) + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(idxFileIn, + log = log) + DirSub <- strsplit(InfoDirIn$dirRepo,".", fixed = TRUE)[[1]][1] + NEONprocIS.base::def.dir.crea(DirBgn = Para$DirErr, DirSub = DirSub, + log = log) + csvname <- DirSub %>% + strsplit( "/" ) %>% + sapply( tail, 1 ) + nameFileErr <- base::paste0(Para$DirErr, DirSub, "/",csvname) + log$info(base::paste0("Re-routing failed datum path to ", nameFileErr)) + con <- base::file(nameFileErr, "w") + base::close(con) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + return() +} + + + + diff --git a/flow/flow.insufficient.data/renv.lock b/flow/flow.insufficient.data/renv.lock new file mode 100644 index 000000000..f2d45d4f4 --- /dev/null +++ b/flow/flow.insufficient.data/renv.lock @@ -0,0 +1,256 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "R6": { + "Package": "R6", + "Version": "2.6.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d4335fe7207f1c01ab8c41762f5840d4", + "Requirements": [] + }, + "cli": { + "Package": "cli", + "Version": "3.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "16850760556401a2eeb27d39bd11c9cb", + "Requirements": [] + }, + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2720e3fd3dad08f34b19b56b3d6f073d", + "Requirements": [] + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec", + "Requirements": [ + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "pillar", + "rlang", + "tibble", + "tidyselect", + "vctrs" + ] + }, + "foreach": { + 
"Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "glue": { + "Package": "glue", + "Version": "1.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5899f1eaa825580172bb56c08266f37c", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8552d117e1b808b09a832f589b79035", + "Requirements": [ + "cli", + "glue", + "rlang" + ] + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "680ad542fbcf801442c83a6ac5a2126c", + "Requirements": [ + "generics", + "timechange" + ] + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7ce2733a9826b3aeb1775d56fd305472", + "Requirements": [] + }, + "pillar": { + "Package": "pillar", + "Version": "1.10.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1098920a19b5cd5a15bacdc74a89979d", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "utf8", + "vctrs" + ] + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "01f28d4278f15c76cddbea05899c5d6f", + "Requirements": [] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1", + "Requirements": [] + }, + "tibble": { + "Package": "tibble", + "Version": "3.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "784b27d0801c3829de602105757b2cd7", + "Requirements": [ + "cli", + "lifecycle", + "magrittr", + "pillar", + "pkgconfig", + "rlang", + "vctrs" + ] + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "829f27b9c4919c16b593794a6344d6c0", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ] + }, + "timechange": { + "Package": "timechange", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8548b44f79a35ba1791308b61e6012d7", + "Requirements": [ + "cpp11" + ] + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d526d558be176e9ceb68c3d1e83479b7", + "Requirements": [] + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c03fa420630029418f7e6da3667aac4a", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang" + ] + }, + "withr": { + "Package": "withr", + "Version": "3.0.2", + "Source": "Repository", + 
"Repository": "CRAN", + "Hash": "cc2d62c76458d425210d1eb1478b30b4", + "Requirements": [] + } + } +} diff --git a/flow/flow.insufficient.data/wrap.insufficient.data.R b/flow/flow.insufficient.data/wrap.insufficient.data.R new file mode 100644 index 000000000..f9df78b23 --- /dev/null +++ b/flow/flow.insufficient.data/wrap.insufficient.data.R @@ -0,0 +1,161 @@ +############################################################################################## +#' @title Wrapper for insufficient data calculations + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} +#' +#' @description Wrapper function. Determines the number of available measurements within an +#' averaging period, and whether an insufficient data quality flag should be applied. +#' This insufficient data quality flag is then used to determine whether the final quality +#' flag should be applied. It assumes that measurements that have failed individual +#' plausibility and sensor-specific tests have been removed and the number of remaining +#' measurements available for averaging is the only factor determining the final data quality. +#' +#' @param DirIn Character value. The base file path to the averaged stats and quality metrics. +#' +#' @param minPoints Character value. The minimum number of points required to not trigger the insufficient data quality flag. +#' +#' @param DirOut Character value. The base file path for the output data. +#' +#' @param SchmStats (optional), A json-formatted character string containing the schema for the output averaged stats parquet. +#' Should be the same as the input. +#' +#' @param SchmQMs (optional), A json-formatted character string containing the schema for the output quality metrics parquet +#' with insufficient data quality flag added. +#' +#' @param DirSubCopy (optional) Character vector. The names of additional subfolders at +#' the same level as the location folder in the input path that are to be copied with a symbolic link to the +#' output path (i.e. not combined but carried through as-is). +#' +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. +#' +#' @return Averaged stats file and quality metric file in daily parquets. +#' +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 +#' +#' @keywords Currently none +#' +#' @examples +#' # Not run +# DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" +# minPoints=5 +# DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" +# SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse='') +# SchmQMs<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse='') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +#' +#' +#' @changelog +#' Bobby Hensley (2025-10-31) +#' Initial creation. +#' +#' Bobby Hensley (2025-12-18) +#' Updated so that finalQF is solely determined by insufficientDataQF. +############################################################################################## +wrap.insufficient.data <- function(DirIn, + minPoints, + DirOutBase, + SchmStats=NULL, + SchmQMs=NULL, + DirSubCopy=NULL, + log=NULL +){ + + #' Start logging if not already. 
+ if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) + DirInStats <- paste0(DirIn,"/stats") + DirInQMs <- paste0(DirIn,"/quality_metrics") + DirOut <- base::paste0(DirOutBase,InfoDirIn$dirRepo) + DirOutStats <- base::paste0(DirOut,"/stats") + base::dir.create(DirOutStats,recursive=TRUE) + DirOutQMs <- base::paste0(DirOut,"/quality_metrics") + base::dir.create(DirOutQMs,recursive=TRUE) + + # Copy with a symbolic link the desired subfolders + if(base::length(DirSubCopy) > 0){ + NEONprocIS.base::def.dir.copy.symb(DirSrc=base::paste0(DirIn,'/',DirSubCopy), + DirDest=DirOut, + LnkSubObj=TRUE, + log=log) + } + + #' Read in parquet file of averaged stats. + statsFileName<-base::list.files(DirInStats,full.names=FALSE) + if(length(statsFileName)==0){ + log$error(base::paste0('Stats file not found in ', DirInStats)) + stop() + } else { + statsData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInStats, '/', statsFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',statsFileName)) + } + + #' Read in parquet file of quality metrics. + qmFileName<-base::list.files(DirInQMs,full.names=FALSE) + if(length(qmFileName)==0){ + log$error(base::paste0('Quality metrics not found in ', DirInQMs)) + stop() + } else { + qmData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInQMs, '/', qmFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',qmFileName)) + } + + #' Identify the column name with the number of points and finalQF + ptsColName<-grep("NumPts",names(statsData),value=TRUE) + finalQfColName<-grep("FinalQF",names(qmData),value=TRUE) + + #' If the number of points is NA, set it to 0. + for(i in 1:nrow(statsData)){ + if(is.na(statsData[i,which(colnames(statsData)==ptsColName)])){ + statsData[i,which(colnames(statsData)==ptsColName)]=0}} + + #' If the number of points is greater than or equal to the minimum required, + #' revert the insufficient data quality flag (default is to apply it). + qmData$insufficientDataQF=1 + minPoints<-as.numeric(minPoints) + for(i in 1:nrow(statsData)){ + if(statsData[i,which(colnames(statsData)==ptsColName)]>=minPoints){ + qmData[i,which(colnames(qmData)=='insufficientDataQF')]=0}} + + #' If there is insufficient data, set the final quality flag to 1. + #' If there is sufficient data, set the final quality flag to 0. + for(i in 1:nrow(qmData)){ + if(qmData[i,which(colnames(qmData)=='insufficientDataQF')]==1){ + qmData[i,which(colnames(qmData)==finalQfColName)]=1} + else{qmData[i,which(colnames(qmData)==finalQfColName)]=0}} + qmData <- qmData[c(setdiff(names(qmData), finalQfColName), finalQfColName)] #' Move finalQF back to the end + + #' Write out stats file. + rptOutStats <- try(NEONprocIS.base::def.wrte.parq(data = statsData, + NameFile = base::paste0(DirOutStats,'/',statsFileName), + Schm = SchmStats),silent=TRUE) + if(class(rptOutStats)[1] == 'try-error'){ + log$error(base::paste0('Cannot write updated stats to ',base::paste0(DirOutStats,'/',statsFileName),'. ',attr(rptOutStats, "condition"))) + stop() + } else { + log$info(base::paste0('Updated stats written successfully in ', base::paste0(DirOutStats,'/',statsFileName))) + } + + #' Write out QMs file. 
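+  #' (Hypothetical worked example: with minPoints = 5, a window retaining only 3 points is
+  #' written with insufficientDataQF = 1 and its final QF set to 1, while a window retaining
+  #' 7 points is written with both set to 0.)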
+ rptOutQMs <- try(NEONprocIS.base::def.wrte.parq(data = qmData, + NameFile = base::paste0(DirOutQMs,'/',qmFileName), + Schm = SchmQMs),silent=TRUE) + if(class(rptOutQMs)[1] == 'try-error'){ + log$error(base::paste0('Cannot write updated QMs to ',base::paste0(DirOutQMs,'/',qmFileName),'. ',attr(rptOutQMs, "condition"))) + stop() + } else { + log$info(base::paste0('Updated QMs written successfully in ', base::paste0(DirOutQMs,'/',qmFileName))) + } + +} + + + diff --git a/flow/flow.sunav2.exp.uncert/dockerfile_in_combined_module.txt b/flow/flow.sunav2.exp.uncert/dockerfile_in_combined_module.txt new file mode 100644 index 000000000..75b5d274e --- /dev/null +++ b/flow/flow.sunav2.exp.uncert/dockerfile_in_combined_module.txt @@ -0,0 +1 @@ +sunav2_ucrt_group \ No newline at end of file diff --git a/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R new file mode 100644 index 000000000..d8416befc --- /dev/null +++ b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R @@ -0,0 +1,150 @@ +############################################################################################## +#' @title Workflow for SUNA expanded uncertainty calculation + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} + +#' @description Workflow. Calculates the expanded uncertainty for each SUNA burst. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", The base file path to the statistics data and calibration coefficients +#' +#' 2. "DirOut=value", The base file path for the output data. +#' +#' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of \code{DirIn}. +#' +#' 4. "SchmStats=value" (optional), The avro schema for the input and output stats file. +#' +#' 5. "DirSubCopy=value" (optional), where value is the names of additional subfolders, separated by +#' pipes, that are to be copied with a symbolic link to the output path. +#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. +#' +#' @return Updated stats files with expanded uncertainty in daily parquets. + +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' flow.sunav2.exp.uncert <- function(DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", +#' DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" , +#' SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse=''), +#' log=log) +#' Stepping through the code in R studio +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=~/pfs/testing/nitrate-surfacewater_SUGG103100", +# "DirOut=~/pfs/nitrate_null_gap_ucrt_updated", +# "DirErr=~/pfs/nitrate_null_gap_ucrt_updated/errored_datums", +# "DirSubCopy=group|location|quality_metrics", +# "SchmStats=~/pfs/nitrate_avro_schemas/nitrate/nitrate_ucrt.avsc") +# rm(list=setdiff(ls(),c('arg','log'))) + +#' @seealso None currently + +# changelog and author contributions / copyrights +#' Bobby Hensley (2025-10-31) +#' Initial creation. 
+#' Nora Catolico (2025-11-04) +#' add in copied directories + +############################################################################################## +options(digits.secs = 3) +library(foreach) +library(doParallel) +library(lubridate) + +# Source the wrapper function. Assume it is in the working directory +source("./wrap.sunav2.exp.uncert.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","DirOut","DirErr"), + NameParaOptn = c("SchmStats","DirSubCopy"),log = log) + +# Echo arguments +log$debug(base::paste0('Input data directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0('Schema for output stats: ', Para$SchmStats)) +log$debug(base::paste0('Director to copy: ', Para$DirSubCopy)) + +# Read in the schemas so we only have to do it once and not every time in the avro writer. +if(base::is.null(Para$SchmStats) || Para$SchmStats == 'NA'){ + SchmStats <- NULL +} else { + SchmStats <- base::paste0(base::readLines(Para$SchmStats),collapse='') +} + + +# Find all the input paths (datums). We will process each one. 
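+# Each returned datum path is a terminal directory containing both 'stats' and 'uncertainty_coef',
+# e.g. (hypothetical): ~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733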
+DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = c('stats','uncertainty_coef'), + log = log) + +# Retrieve optional subdirectories to copy over +DirSubCopy <- base::unique(base::setdiff(Para$DirSubCopy,'stats')) +log$debug(base::paste0('Additional subdirectories to copy: ',base::paste0(DirSubCopy,collapse=','))) + +# Process each datum path +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxFileIn = DirIn) %dopar% { + log$info(base::paste0('Processing path to file: ', idxFileIn)) + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.sunav2.exp.uncert( + DirIn=idxFileIn, + DirOutBase=Para$DirOut, + SchmStats=SchmStats, + DirSubCopy=DirSubCopy, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + log$error(err$message) + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(idxFileIn, + log = log) + DirSub <- strsplit(InfoDirIn$dirRepo,".", fixed = TRUE)[[1]][1] + NEONprocIS.base::def.dir.crea(DirBgn = Para$DirErr, DirSub = DirSub, + log = log) + csvname <- DirSub %>% + strsplit( "/" ) %>% + sapply( tail, 1 ) + nameFileErr <- base::paste0(Para$DirErr, DirSub, "/",csvname) + log$info(base::paste0("Re-routing failed datum path to ", nameFileErr)) + con <- base::file(nameFileErr, "w") + base::close(con) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + return() +} + + + + diff --git a/flow/flow.sunav2.exp.uncert/renv.lock b/flow/flow.sunav2.exp.uncert/renv.lock new file mode 100644 index 000000000..7283865a0 --- /dev/null +++ b/flow/flow.sunav2.exp.uncert/renv.lock @@ -0,0 +1,101 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2720e3fd3dad08f34b19b56b3d6f073d", + "Requirements": [] + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "680ad542fbcf801442c83a6ac5a2126c", + "Requirements": [ + "generics", + "timechange" + ] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": 
"9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "timechange": { + "Package": "timechange", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8548b44f79a35ba1791308b61e6012d7", + "Requirements": [ + "cpp11" + ] + } + } +} diff --git a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R new file mode 100644 index 000000000..fe0704c46 --- /dev/null +++ b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R @@ -0,0 +1,171 @@ +############################################################################################## +#' @title Wrapper for SUNA expanded uncertainty calculation + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} +#' +#' @description Wrapper function. Calculates the expanded uncertainty for each SUNA burst. +#' +#' @param DirIn Character value. The base file path to the averaged stats and uncertainty coefficients. +#' +#' @param DirOutBase Character value. The base file path for the output data. +#' +#' @param SchmStats (optional), A json-formatted character string containing the schema for the output averaged stats parquet. +#' +#' @param DirSubCopy (optional) Character vector. The names of additional subfolders at +#' the same level as the location folder in the input path that are to be copied with a symbolic link to the +#' output path (i.e. not combined but carried through as-is). +#' +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. +#' +#' @return Averaged stats file and quality metric file in daily parquets. +#' +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 +#' +#' @keywords Currently none +#' +#' @examples +#' # Not run +# DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" +# DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" +# SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse='') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +#' +#' +#' @changelog +#' Bobby Hensley (2025-11-03) +#' Initial creation. +#' +############################################################################################## +wrap.sunav2.exp.uncert <- function(DirIn, + DirOutBase, + SchmStats=NULL, + DirSubCopy=NULL, + log=NULL +){ + + #' Start logging if not already. + if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) + DirInStats <- paste0(DirIn,"/stats") + DirInCoeff <- paste0(DirIn,"/uncertainty_coef") + DirOut <- base::paste0(DirOutBase,InfoDirIn$dirRepo) + DirOutStats <- base::paste0(DirOut,"/stats") + base::dir.create(DirOutStats,recursive=TRUE) + + # Copy with a symbolic link the desired subfolders + if(base::length(DirSubCopy) > 0){ + NEONprocIS.base::def.dir.copy.symb(DirSrc=base::paste0(DirIn,'/',DirSubCopy), + DirDest=DirOut, + LnkSubObj=TRUE, + log=log) + } + + #' Read in parquet file of averaged stats. 
+ statsFileName<-base::list.files(DirInStats,full.names=FALSE) + if(length(statsFileName)==0){ + log$error(base::paste0('Stats file not found in ', DirInStats)) + stop() + } else { + statsData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInStats, '/', statsFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',statsFileName)) + } + + #' Read in json file of uncertainty coefficients. + coeffileName<-base::list.files(DirInCoeff,full.names=FALSE) + if(length(coeffileName)==0){ + log$error(base::paste0('Uncertainty coefficient not found in ', DirInCoeff)) + stop() + } else { + uncertCoeff<-base::try(NEONprocIS.cal::def.read.ucrt.coef.fdas(NameFile = base::paste0(DirInCoeff, '/', coeffileName)), + silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',coeffileName)) + } + + if(length(uncertCoeff)>0){ + #' Converts uncertainty coefficient dates to POSIXct and values to numeric + uncertCoeff$start_date <- as.POSIXct(uncertCoeff$start_date, format = "%Y-%m-%dT%H:%M:%S", tz='utc') + uncertCoeff$end_date <- as.POSIXct(uncertCoeff$end_date, format = "%Y-%m-%dT%H:%M:%S", tz='utc') + uncertCoeff$Value<-as.numeric(uncertCoeff$Value) + + #' Determines which uncertainty coefficients to be applied to each time interval. + #' (In case there are more than one on a particular day) + uncertCoeff<-uncertCoeff[order(uncertCoeff$start_date), ] + uncertCoeffA1<-uncertCoeff[(uncertCoeff$Name=="U_CVALA1"),] + statsData$uncertCoeffA1<-NA + for (i in 1:nrow(statsData)){ + for (j in 1:nrow(uncertCoeffA1)){ + if(statsData[i,which(colnames(statsData)=="startDateTime")]>=uncertCoeffA1[j,which(colnames(uncertCoeffA1)=="start_date")]){ + statsData[i,which(colnames(statsData)=="uncertCoeffA1")]=uncertCoeffA1[j,which(colnames(uncertCoeffA1)=="Value")]}}} + uncertCoeffA3<-uncertCoeff[(uncertCoeff$Name=="U_CVALA3"),] + statsData$uncertCoeffA3<-NA + for (i in 1:nrow(statsData)){ + for (j in 1:nrow(uncertCoeffA3)){ + if(statsData[i,which(colnames(statsData)=="startDateTime")]>=uncertCoeffA3[j,which(colnames(uncertCoeffA3)=="start_date")]){ + statsData[i,which(colnames(statsData)=="uncertCoeffA3")]=uncertCoeffA3[j,which(colnames(uncertCoeffA3)=="Value")]}}} + + #' Identify the column name with the mean, variance and number of points + meanName<-grep("Mean",names(statsData),value=TRUE) + varianceName<-grep("Variance",names(statsData),value=TRUE) + pointsName<-grep("NumPts",names(statsData),value=TRUE) + + #' Calculates calibration uncertainty. See ATBD for more details. + #' Concentrations <= 20 mg/L have fixed calibration uncertainty equal to coeffA1. + #' Concentrations greater than 20 mg/L uncertainty equals concentration times coeffA1. + #' Note stats data concentrations are in uM so threshold needs to be converted from mg/L by dividing by 0.014 (14 g/mol / 1000 ug/mg) + statsData$calUncert<-NA + for (i in 1:nrow(statsData)){ + if(is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="calUncert")]=NA} + if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){ + if(statsData[i,which(colnames(statsData)==meanName)]<=(20/0.014)){statsData[i,which(colnames(statsData)=="calUncert")]=statsData[i,which(colnames(statsData)=="uncertCoeffA1")]} + if(statsData[i,which(colnames(statsData)==meanName)]>(20/0.014)){statsData[i,which(colnames(statsData)=="calUncert")]=statsData[i,which(colnames(statsData)=="uncertCoeffA3")]} + } + } + + #' Calculates the repeatability (natural variation). See ATBD for more details. 
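+  #' (Hypothetical worked example: a burst with variance 4.0 and 25 points gives a
+  #' repeatability of sqrt(4.0 / 25) = 0.4, in the same units as the burst mean.)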
+ statsData$natVar<-NA + for (i in 1:nrow(statsData)){ + if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="natVar")]= + sqrt(statsData[i,which(colnames(statsData)==varianceName)]/statsData[i,which(colnames(statsData)==pointsName)])} + } + + #' Calculates the expanded uncertainty, which is estimated as 2x the combined uncertainty. See ATBD for more details. + statsData$surfWaterNitrateExpUncert<-NA + for (i in 1:nrow(statsData)){ + if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="surfWaterNitrateExpUncert")]= + 2*sqrt(statsData[i,which(colnames(statsData)=="natVar")]+statsData[i,which(colnames(statsData)=="calUncert")])} + } + + #' Removes unnecessary columns. + statsData<-subset(statsData,select=-c(uncertCoeffA3,uncertCoeffA1,calUncert,natVar)) + }else{ + #add required columns to stats data + statsData$surfWaterNitrateExpUncert<-NA + } + + statsData$surfWaterNitrateMean[is.nan(statsData$surfWaterNitrateMean)]<-NA + + + #' Write out updated stats file. + rptOutStats <- try(NEONprocIS.base::def.wrte.parq(data = statsData, + NameFile = base::paste0(DirOutStats,'/',statsFileName), + Schm = SchmStats),silent=TRUE) + if(class(rptOutStats)[1] == 'try-error'){ + log$error(base::paste0('Cannot write updated stats to ',base::paste0(DirOutStats,'/',statsFileName),'. ',attr(rptOutStats, "condition"))) + stop() + } else { + log$info(base::paste0('Updated stats written successfully in ', base::paste0(DirOutStats,'/',statsFileName))) + } + + +} + + + diff --git a/flow/flow.sunav2.logfiles.fill/dockerfile_in_combined_module.txt b/flow/flow.sunav2.logfiles.fill/dockerfile_in_combined_module.txt new file mode 100644 index 000000000..c3ba2706c --- /dev/null +++ b/flow/flow.sunav2.logfiles.fill/dockerfile_in_combined_module.txt @@ -0,0 +1 @@ +sunav2_logs_group_and_fill \ No newline at end of file diff --git a/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R new file mode 100644 index 000000000..10389faf0 --- /dev/null +++ b/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R @@ -0,0 +1,150 @@ +############################################################################################## +#' @title Workflow for SUNA Log File Comparison and Gap Filling + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} + +#' @description Workflow. Compares logged data to streamed data and fills gaps. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", The input path to the data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/source-id.The source-id folder may have multiple csv log files. +#' The source-id is the unique identifier of the sensor.#' +#' +#' 2. "DirOut=value", where the value is the output path that will replace the #/pfs/BASE_REPO portion +#' of DirIn. +#' +#' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of \code{DirIn}. +#' +#' 4. "FileSchmData=value" (optional), where values is the full path to the avro schema for the output data +#' file. If this input is not provided, the output schema for the data will be the same as the input data +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. +#' +#' 5. "FileSchmFlags=value" (optional), where values is the full path to the avro schema for the output flags +#' file. 
If this input is not provided, the output schema for the data will be the same as the input data +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. +#' +#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. +#' +#' @return Cleaned sunav2 log files in daily parquets. + +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' Stepping through the code in Rstudio +# Sys.setenv(DirIn='/home/NEON/ncatolico/pfs/sunav2_logjam_assign_clean_files/sunav2/2024/09/10/20349') #cleaned log data +# Sys.setenv(DirIn='/home/NEON/ncatolico/pfs/sunav2_trino_data_parser/sunav2/2024/09/11/20349') #streamed L0 data +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=$DirIn","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums","FileSchmData=~/pfs/sunav2_avro_schemas/sunav2/sunav2_logfilled.avsc") +#' rm(list=setdiff(ls(),c('arg','log'))) + +#' @seealso None currently + +# changelog and author contributions / copyrights +# Nora Catolico (2024-01-30) original creation +# Bobby Hensley \email{hensley@battelleecology.org} + +############################################################################################## +options(digits.secs = 3) +library(foreach) +library(doParallel) +library(lubridate) + +# Source the wrapper function. Assume it is in the working directory +source("./wrap.sunav2.logfiles.fill.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","DirOut","DirErr"), + NameParaOptn = c("FileSchmData","FileSchmFlags"),log = log) + +# Echo arguments +log$debug(base::paste0('Input directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0('Schema for output data: ', Para$FileSchmData)) +log$debug(base::paste0('Schema for output flags: ', Para$FileSchmFlags)) + + +# Read in the schemas so we only have to do it once and not every time in the avro writer. +if(base::is.null(Para$FileSchmData) || Para$FileSchmData == 'NA'){ + SchmDataOut <- NULL +} else { + SchmDataOut <- base::paste0(base::readLines(Para$FileSchmData),collapse='') +} +if(base::is.null(Para$FileSchmFlags) || Para$FileSchmFlags == 'NA'){ + SchmFlagsOut <- NULL +} else { + SchmFlagsOut <- base::paste0(base::readLines(Para$FileSchmFlags),collapse='') +} + +# Find all the input paths (datums). We will process each one. 
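+# For example, each datum path found below should resemble the example input above, e.g. (hypothetical)
+#   ~/pfs/sunav2_logjam_assign_clean_files/sunav2/2024/09/10/20349
+# Hedged sketch (commented out): the equivalent serial call on one datum, useful for debugging
+# without doParallel:
+#   wrap.sunav2.logfiles.fill(DirIn = DirIn[1], DirOutBase = Para$DirOut,
+#                             SchmDataOut = SchmDataOut, SchmFlagsOut = SchmFlagsOut, log = log)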
+DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = 'data', + log = log) + + +# Process each datum path +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxDirIn = DirIn) %dopar% { + log$info(base::paste0('Processing path to datum: ', idxDirIn)) + + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.sunav2.logfiles.fill( + DirIn=idxDirIn, + DirOutBase=Para$DirOut, + SchmDataOut=SchmDataOut, + SchmFlagsOut=SchmFlagsOut, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + + # Re-route the failed datum + NEONprocIS.base::def.err.datm( + err=err, + call.stack=call.stack, + DirDatm=idxDirIn, + DirErrBase=Para$DirErr, + RmvDatmOut=TRUE, + DirOutBase=Para$DirOut, + log=log + ) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + return() +} + + diff --git a/flow/flow.sunav2.logfiles.fill/renv.lock b/flow/flow.sunav2.logfiles.fill/renv.lock new file mode 100644 index 000000000..04ce2a180 --- /dev/null +++ b/flow/flow.sunav2.logfiles.fill/renv.lock @@ -0,0 +1,308 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "R6": { + "Package": "R6", + "Version": "2.6.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d4335fe7207f1c01ab8c41762f5840d4", + "Requirements": [] + }, + "cli": { + "Package": "cli", + "Version": "3.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "16850760556401a2eeb27d39bd11c9cb", + "Requirements": [] + }, + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2720e3fd3dad08f34b19b56b3d6f073d", + "Requirements": [] + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec", + "Requirements": [ + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "pillar", + "rlang", + "tibble", + "tidyselect", + "vctrs" + ] + }, + "ellipsis": { + "Package": "ellipsis", + "Version": "0.3.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "bb0eec2fe32e88d9e2836c2f73ea2077", + "Requirements": [ + "rlang" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "fs": { + "Package": "fs", + "Version": "1.6.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7eb1e342eee7e0a7449c49cdaa526d39", + "Requirements": [] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "glue": { + "Package": "glue", + "Version": "1.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5899f1eaa825580172bb56c08266f37c", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + 
"Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8552d117e1b808b09a832f589b79035", + "Requirements": [ + "cli", + "glue", + "rlang" + ] + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "680ad542fbcf801442c83a6ac5a2126c", + "Requirements": [ + "generics", + "timechange" + ] + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7ce2733a9826b3aeb1775d56fd305472", + "Requirements": [] + }, + "pillar": { + "Package": "pillar", + "Version": "1.10.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1098920a19b5cd5a15bacdc74a89979d", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "utf8", + "vctrs" + ] + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "01f28d4278f15c76cddbea05899c5d6f", + "Requirements": [] + }, + "purrr": { + "Package": "purrr", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cc8b5d43f90551fa6df0a6be5d640a4f", + "Requirements": [ + "cli", + "lifecycle", + "magrittr", + "rlang", + "vctrs" + ] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "892124978869b74935dc3934c42bfe5a", + "Requirements": [] + }, + "tibble": { + "Package": "tibble", + "Version": "3.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "784b27d0801c3829de602105757b2cd7", + "Requirements": [ + "cli", + "lifecycle", + "magrittr", + "pillar", + "pkgconfig", + "rlang", + "vctrs" + ] + }, + "tidyr": { + "Package": "tidyr", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d8b95b7fee945d7da6888cf7eb71a49c", + "Requirements": [ + "cpp11", + "dplyr", + "ellipsis", + "glue", + "lifecycle", + "magrittr", + "purrr", + "rlang", + "tibble", + "tidyselect", + "vctrs" + ] + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "829f27b9c4919c16b593794a6344d6c0", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ] + }, + "timechange": { + "Package": "timechange", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8548b44f79a35ba1791308b61e6012d7", + "Requirements": [ + "cpp11" + ] + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d526d558be176e9ceb68c3d1e83479b7", + "Requirements": [] + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c03fa420630029418f7e6da3667aac4a", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang" + ] + }, + "withr": { + "Package": "withr", + "Version": "3.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": 
"cc2d62c76458d425210d1eb1478b30b4", + "Requirements": [] + } + } +} diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R new file mode 100644 index 000000000..de0eda989 --- /dev/null +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -0,0 +1,188 @@ +############################################################################################## +#' @title Wrapper for SUNA Log File Comparison and Gap Filling + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} +#' Bobby Hensley \email{hensley@battelleecology.org} +#' +#' @description Wrapper function. Compares logged data to streamed data. +#' +#' @param DirIn Character value. The input path to the data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/sensor/yyyy/mm/dd/source-id. The source-id is the unique identifier of the sensor. \cr#' +#' +#' @param DirInStream (optional) Character value. This input is used for testing purposes only prior to joining repos. +#' The input path to the streamed L0 data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/sensor/yyyy/mm/dd/source-id. The source-id is the unique identifier of the sensor. \cr#' +#' +#' @param DirInLogs (optional) Character value. This input is used for testing purposes only prior to joining repos. +#' The input path to the log data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/sensor/yyyy/mm/dd/source-id. The source-id is the unique identifier of the sensor. \cr#' +#' +#' @param DirOut Character value. The output path that will replace the #/pfs/BASE_REPO portion of DirIn. +#' +#' @param SchmDataOut (optional), A json-formatted character string containing the schema for the output data +#' file. If this input is not provided, the output schema for the data will be the same as the input data +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. +#' +#' @param SchmFlagsOut (optional), A json-formatted character string containing the schema for the output flags +#' file. If this input is not provided, the output schema for the data will be the same as the input flags +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. +#' +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. +#' +#' @return Combined logged and streamed L0 data in daily parquets. 
+#' +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 +#' +#' @keywords Currently none +#' +#' @examples +#' # Not run +# DirInLogs<-"~/pfs/sunav2_logjam_assign_clean_files/sunav2/2024/09/11/20349" #cleaned log data +# DirInStream<-"~/pfs/sunav2_trino_data_parser/sunav2/2025/06/22/20345" #streamed L0 data +# DirIn<-NULL +# DirOutBase="~/pfs/out" +# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2/sunav2_logfilled.avsc'),collapse='') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_log_flags.avsc'),collapse='') +#' +#' @changelog +#' Nora Catolico (2024-01-30) original creation +#' Bobby Hensley (2025-05-30) adapted for suna +#' +############################################################################################## +wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, + DirInStream=NULL, + DirIn, + DirOutBase, + SchmDataOut=NULL, + SchmFlagsOut=NULL, + log=NULL +){ + + # Start logging if not already + if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + # Gather info about the input directory (including date), and create base output directory + if(is.null(DirInLogs)){ + DirInLogs<-DirIn #only need one dir if this is run after filter joiner + } + if(is.null(DirInStream)){ + DirInStream<-DirIn #only need one dir if this is run after filter joiner + } + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirInStream) + dirInDataStream <- fs::path(DirInStream,'data') + dirInDataLogs <- fs::path(DirInLogs,'data') + timeBgn <- InfoDirIn$time # Earliest possible start date for the data + DirOut <- base::paste0(DirOutBase,InfoDirIn$dirRepo) + DirOutData <- base::paste0(DirOut,'/data') + base::dir.create(DirOutData,recursive=TRUE) + DirOutFlags <- base::paste0(DirOut,'/flags') + base::dir.create(DirOutFlags,recursive=TRUE) + +#' Load any L0 streamed data + fileDataStream<-base::list.files(dirInDataStream,full.names=FALSE) + L0File <- fileDataStream[!grepl('_log',fileDataStream)] + if(length(L0File)>=1){ + L0Data <- + base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(dirInDataStream, '/', L0File), + log = log),silent = FALSE) + if (base::any(base::class(L0Data) == 'try-error')) { + # Generate error and stop execution + log$error(base::paste0('File ', dirInDataStream, '/', L0File, ' is unreadable.')) + base::stop()} + }else{ + L0Data<-NULL + } + +#' Load any logged data + fileDataLogs<-base::list.files(dirInDataLogs,full.names=FALSE) + logFile <- fileDataLogs[grepl('_log',fileDataLogs)] + if(length(logFile)>=1){ + logData <- + base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(dirInDataLogs, '/', logFile), + log = log),silent = FALSE) + if (base::any(base::class(logData) == 'try-error')) { + # Generate error and stop execution + log$error(base::paste0('File ', dirInDataLogs, '/', logFile, ' is unreadable.')) + base::stop()} + }else{ + logData<-NULL + } + +#' update columns to same format + if(length(L0Data)>=1){ + L0Data$spectrum_channels <- lapply(L0Data$spectrum_channels, function(x) paste(x, collapse = ";")) + } + + +#' Determine whether to use logged or streamed data. 
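+ #' (Hedged note: if neither a log file nor streamed L0 data is present for this datum, neither
+ #'  branch below creates dataOut, so the later write step fails and the datum is expected to be
+ #'  re-routed to the errored_datums path by the calling workflow.)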
+ #' Logged data is used if available, and log data flag set to 1 + if(length(logFile)>=1){ + dataOut<-as.data.frame(logData) + flagsOut<-data.frame(matrix(ncol=2,nrow=nrow(dataOut), dimnames=list(NULL, c("readout_time", "sunaLogDataQF")))) + flagsOut$readout_time<-dataOut$readout_time + flagsOut$sunaLogDataQF<-1 + } + #' Streamed data is used if no logged data is available, and log data flags set to 0 + if(length(logFile)<1 & length(L0Data)>=1){ + dataOut<-as.data.frame(L0Data) + flagsOut<-data.frame(matrix(ncol=2,nrow=nrow(dataOut), dimnames=list(NULL, c("readout_time", "sunaLogDataQF")))) + flagsOut$readout_time<-dataOut$readout_time + flagsOut$sunaLogDataQF<-0 + } + +#' Write out data file and log flags file + + #write out data file + fileOutSplt <- base::strsplit(DirInStream,'[/]')[[1]] # Separate underscore-delimited components of the file name + asset<-tail(x=fileOutSplt,n=1) + csv_name <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d")) + + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = dataOut, + NameFile = base::paste0(DirOutData,'/',csv_name,".parquet"), + Schm = SchmDataOut),silent=TRUE) + if(class(rptOut)[1] == 'try-error'){ + log$error(base::paste0('Cannot write Data to ',base::paste0(DirOutData,'/',csv_name,".parquet"),'. ',attr(rptOut, "condition"))) + stop() + } else { + log$info(base::paste0('Data written successfully in ', base::paste0(DirOutData,'/',csv_name,".parquet"))) + } + + #write out log flags file + csv_name_flags <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d"),'_logFlags') + + rptOutFlags <- try(NEONprocIS.base::def.wrte.parq(data = flagsOut, + NameFile = base::paste0(DirOutFlags,'/',csv_name_flags,".parquet"), + Schm = SchmFlagsOut),silent=TRUE) + if(class(rptOutFlags)[1] == 'try-error'){ + log$error(base::paste0('Cannot write Flags to ',base::paste0(DirOutFlags,'/',csv_name_flags,".parquet"),'. ',attr(rptOutFlags, "condition"))) + stop() + } else { + log$info(base::paste0('Flags written successfully in ', base::paste0(DirOutFlags,'/',csv_name_flags,".parquet"))) + } + +} + + + + + + + + + + + + + + + + diff --git a/flow/flow.sunav2.logfiles/Dockerfile b/flow/flow.sunav2.logfiles/Dockerfile new file mode 100644 index 000000000..74ae616be --- /dev/null +++ b/flow/flow.sunav2.logfiles/Dockerfile @@ -0,0 +1,20 @@ +# Dockerfile for NEON IS Data Processing - sunav2 Logfile Processing + +# Start with the neon-is-base-r image. +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.7.0 + +ARG FLOW_DIR="./flow" +ARG APP_DIR="flow.sunav2.logfiles" + +# maintainer handle +MAINTAINER "Nora Catolico" ncatolico@battelleecology.org + +# Copy the lockfile and restore known working versions of R dependency packages +# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image +COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock +RUN R -e 'renv::restore(lockfile="/renv.lock")' + +# Copy in sunav2 flag workflow +COPY ${FLOW_DIR}/${APP_DIR}/flow.sunav2.logfiles.R . +COPY ${FLOW_DIR}/${APP_DIR}/wrap.sunav2.logfiles.R . 
+ diff --git a/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R b/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R new file mode 100644 index 000000000..a653364ca --- /dev/null +++ b/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R @@ -0,0 +1,149 @@ +############################################################################################## +#' @title Workflow for SUNA Log File Processing + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} +#' Bobby Hensley \email{hensley@battelleecology.org} + +#' @description Workflow. Validates, cleans, and formats sunav2 log files into daily parquets. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", The input path to the data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/source-id.The source-id folder may have multiple csv log files. +#' The source-id is the unique identifier of the sensor.#' +#' +#' 2. "DirOut=value", where the value is the output path that will replace the #/pfs/BASE_REPO portion +#' of DirIn. +#' +#' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of \code{DirIn}. +#' +#' 4. "FileSchmData=value" (optional), where values is the full path to the avro schema for the output data +#' file. If this input is not provided, the output schema for the data will be the same as the input data +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. +#' +#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. +#' +#' @return Cleaned sunav2 log files in daily parquets. + +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' flow.sunav2.logfiles <- function(FileIn = "~/pfs/sunav2_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv", +#' DirOut="~/pfs/out", +#' SchmDataOut=NULL, +#' log=log) +#' Stepping through the code in R studio +# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/sunav2_logjam_load_files/20349') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=$DIR_IN","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums","FileSchmData=~/pfs/sunav2_avro_schemas/sunav2/sunav2_logfilled.avsc") +#' rm(list=setdiff(ls(),c('arg','log'))) + +#' @seealso None currently + +# changelog and author contributions / copyrights +#' Nora Catolico (2024-01-09) original creation +#' Bobby Hensley (2025-04-09) adapted for SUNA +# +############################################################################################## +options(digits.secs = 3) +library(foreach) +library(doParallel) +library(lubridate) + +# Source the wrapper function. 
Assume it is in the working directory +source("./wrap.sunav2.logfiles.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn", "DirOut","DirErr"), + NameParaOptn = c("FileSchmData"),log = log) + +# Echo arguments +log$debug(base::paste0('Input directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0('Schema for output data: ', Para$FileSchmData)) + + +# Read in the schemas so we only have to do it once and not every time in the avro writer. +if(base::is.null(Para$FileSchmData) || Para$FileSchmData == 'NA'){ + SchmDataOut <- NULL +} else { + SchmDataOut <- base::paste0(base::readLines(Para$FileSchmData),collapse='') +} + +# Find all the input paths (datums). We will process each one. +DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = NULL, + log = log) + +# Take stock of our data files. +fileData <- base::list.files(DirIn,full.names=TRUE) +log$debug(base::paste0('Files identified:', fileData)) + + +# Process each datum path +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxFileIn = fileData) %dopar% { + log$info(base::paste0('Processing path to file: ', idxFileIn)) + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.sunav2.logfiles( + FileIn=idxFileIn, + DirOut=Para$DirOut, + SchmDataOut=SchmDataOut, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + log$error(err$message) + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(idxFileIn, + log = log) + DirSub <- strsplit(InfoDirIn$dirRepo,".", fixed = TRUE)[[1]][1] + NEONprocIS.base::def.dir.crea(DirBgn = Para$DirErr, DirSub = DirSub, + log = log) + csvname <- DirSub %>% + strsplit( "/" ) %>% + sapply( tail, 1 ) + nameFileErr <- base::paste0(Para$DirErr, DirSub, "/",csvname) + log$info(base::paste0("Re-routing failed datum path to ", nameFileErr)) + con <- base::file(nameFileErr, "w") + base::close(con) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + return() +} + + + + diff --git a/flow/flow.sunav2.logfiles/renv.lock b/flow/flow.sunav2.logfiles/renv.lock new file mode 100644 index 000000000..7283865a0 --- /dev/null +++ b/flow/flow.sunav2.logfiles/renv.lock @@ -0,0 +1,101 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2720e3fd3dad08f34b19b56b3d6f073d", + "Requirements": [] + }, + "doParallel": { + 
"Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "680ad542fbcf801442c83a6ac5a2126c", + "Requirements": [ + "generics", + "timechange" + ] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "timechange": { + "Package": "timechange", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8548b44f79a35ba1791308b61e6012d7", + "Requirements": [ + "cpp11" + ] + } + } +} diff --git a/flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R b/flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R new file mode 100644 index 000000000..796ce8a32 --- /dev/null +++ b/flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R @@ -0,0 +1,216 @@ +############################################################################################## +#' @title Wrapper for SUNA Log File Processing + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} +#' Bobby Hensley \email{hensley@battelleecology.org} +#' +#' @description Wrapper function. Validates, cleans, and formats SUNA log files into daily parquets. +#' +#' @param FileIn Character value. The input path to the data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/source-id/file. +#' The source-id is the unique identifier of the sensor. \cr#' +#' +#' @param DirOut Character value. The output path that will replace the #/pfs/BASE_REPO portion of FileIn. +#' +#' @param SchmDataOut (optional), A json-formatted character string containing the schema for the output data +#' file. If this input is not provided, the output schema for the data will be the same as the input data +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. +#' +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. +#' +#' @return Data from SUNA log files in daily parquets. 
+#' +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 +#' +#' @keywords Currently none +#' +#' @examples +#' # Not run +# FileIn <- "~/pfs/sunav2_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv" +# DirOut="~/pfs/sunav2_logs_output" +# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2.avsc'),collapse='') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +#' wrap.sunav2.logfiles <- function(FileIn = "~/pfs/sunav2_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv", +#' DirOut="~/pfs/out", +#' SchmDataOut=NULL, +#' log=log) +#' +#' @changelog +#' Nora Catolico (2024-01-09) original creation +#' Bobby Hensley (2025-04-09) adapted for SUNA +############################################################################################## +wrap.sunav2.logfiles <- function(FileIn, + DirOut, + SchmDataOut=NULL, + log=NULL +){ + +#' Start logging if not already + if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + +#' Load in the csv log file(s) + logFile <- + base::try(read.table(paste0(FileIn), header = FALSE, sep = ",", + col.names = paste0("V",seq_len(286)),encoding = 'utf-8', + stringsAsFactors = FALSE,fill = TRUE,strip.white = TRUE,na.strings=c(-1,''))) + if (base::any(base::class(logFile) == 'try-error')) { + # Generate error and stop execution + log$error(base::paste0('File ', FileIn, ' is unreadable. Likely not a data file.')) + base::stop() + } + if(any(grepl('TROLL',logFile))){ + log$debug(base::paste0('skipping troll file: ', FileIn)) + base::stop() + }else if(any(grepl('Turbidity',logFile))){ + log$debug(base::paste0('skipping sonde file: ', FileIn)) + base::stop() + } + +#' Find row where data actually starts + start<-which(grepl('Zeiss Coefficient',logFile$V2))+1 + # Separate data and metadata + logData<-logFile[start:(nrow(logFile)),] + logMetadata<-logFile[1:(start-1),2:6] + +#' Update names of existing columns to match avro schema + names(logData)<-c("header_serial_number","year_and_day","time","nitrate_concentration","nitrogen_in_nitrate","absorbance_254nm","absorbance_350nm", + "bromide_trace","spectrum_average","dark_value_used_for_fit","integration_time_factor", + "channel_1","channel_2","channel_3","channel_4","channel_5","channel_6","channel_7","channel_8","channel_9","channel_10", + "channel_11","channel_12","channel_13","channel_14","channel_15","channel_16","channel_17","channel_18","channel_19","channel_20", + "channel_21","channel_22","channel_23","channel_24","channel_25","channel_26","channel_27","channel_28","channel_29","channel_30", + "channel_31","channel_32","channel_33","channel_34","channel_35","channel_36","channel_37","channel_38","channel_39","channel_40", + "channel_41","channel_42","channel_43","channel_44","channel_45","channel_46","channel_47","channel_48","channel_49","channel_50", + "channel_51","channel_52","channel_53","channel_54","channel_55","channel_56","channel_57","channel_58","channel_59","channel_60", + "channel_61","channel_62","channel_63","channel_64","channel_65","channel_66","channel_67","channel_68","channel_69","channel_70", + "channel_71","channel_72","channel_73","channel_74","channel_75","channel_76","channel_77","channel_78","channel_79","channel_80", + "channel_81","channel_82","channel_83","channel_84","channel_85","channel_86","channel_87","channel_88","channel_89","channel_90", + 
"channel_91","channel_92","channel_93","channel_94","channel_95","channel_96","channel_97","channel_98","channel_99","channel_100", + "channel_101","channel_102","channel_103","channel_104","channel_105","channel_106","channel_107","channel_108","channel_109","channel_110", + "channel_111","channel_112","channel_113","channel_114","channel_115","channel_116","channel_117","channel_118","channel_119","channel_120", + "channel_121","channel_122","channel_123","channel_124","channel_125","channel_126","channel_127","channel_128","channel_129","channel_130", + "channel_131","channel_132","channel_133","channel_134","channel_135","channel_136","channel_137","channel_138","channel_139","channel_140", + "channel_141","channel_142","channel_143","channel_144","channel_145","channel_146","channel_147","channel_148","channel_149","channel_150", + "channel_151","channel_152","channel_153","channel_154","channel_155","channel_156","channel_157","channel_158","channel_159","channel_160", + "channel_161","channel_162","channel_163","channel_164","channel_165","channel_166","channel_167","channel_168","channel_169","channel_170", + "channel_171","channel_172","channel_173","channel_174","channel_175","channel_176","channel_177","channel_178","channel_179","channel_180", + "channel_181","channel_182","channel_183","channel_184","channel_185","channel_186","channel_187","channel_188","channel_189","channel_190", + "channel_191","channel_192","channel_193","channel_194","channel_195","channel_196","channel_197","channel_198","channel_199","channel_200", + "channel_201","channel_202","channel_203","channel_204","channel_205","channel_206","channel_207","channel_208","channel_209","channel_210", + "channel_211","channel_212","channel_213","channel_214","channel_215","channel_216","channel_217","channel_218","channel_219","channel_220", + "channel_221","channel_222","channel_223","channel_224","channel_225","channel_226","channel_227","channel_228","channel_229","channel_230", + "channel_231","channel_232","channel_233","channel_234","channel_235","channel_236","channel_237","channel_238","channel_239","channel_240", + "channel_241","channel_242","channel_243","channel_244","channel_245","channel_246","channel_247","channel_248","channel_249","channel_250", + "channel_251","channel_252","channel_253","channel_254","channel_255","channel_256", + "internal_temperature","spectrometer_temperature","lamp_temperature","lamp_on_time","relative_humidity","main_voltage","lamp_voltage", + "internal_voltage","main_current","fit_aux_1","fit_aux_2","fit_base_1","fit_base_2","fit_rmse","ctd_time","ctd_salinity","ctd_temperature", + "ctd_pressure","check_sum") + +#' Checks that each data burst is complete (Right now only checks whether last column is a value or not) + logData$error_missing_data<-NA + for(i in 1:nrow(logData)){if(is.na(logData[i,which(colnames(logData)=="check_sum")])){logData[i,which(colnames(logData)=="error_missing_data")]=TRUE} + else{logData[i,which(colnames(logData)=="error_missing_data")]=FALSE}} + +#' Combines all 256 spectrum channels into single array + logData$spectrum_channels<-paste(logData$channel_1,logData$channel_2,logData$channel_3,logData$channel_4,logData$channel_5,logData$channel_6,logData$channel_7,logData$channel_8,logData$channel_9,logData$channel_10, + logData$channel_11,logData$channel_12,logData$channel_13,logData$channel_14,logData$channel_15,logData$channel_16,logData$channel_17,logData$channel_18,logData$channel_19,logData$channel_20, + 
logData$channel_21,logData$channel_22,logData$channel_23,logData$channel_24,logData$channel_25,logData$channel_26,logData$channel_27,logData$channel_28,logData$channel_29,logData$channel_30, + logData$channel_31,logData$channel_32,logData$channel_33,logData$channel_34,logData$channel_35,logData$channel_36,logData$channel_37,logData$channel_38,logData$channel_39,logData$channel_40, + logData$channel_41,logData$channel_42,logData$channel_43,logData$channel_44,logData$channel_45,logData$channel_46,logData$channel_47,logData$channel_48,logData$channel_49,logData$channel_50, + logData$channel_51,logData$channel_52,logData$channel_53,logData$channel_54,logData$channel_55,logData$channel_56,logData$channel_57,logData$channel_58,logData$channel_59,logData$channel_60, + logData$channel_61,logData$channel_62,logData$channel_63,logData$channel_64,logData$channel_65,logData$channel_66,logData$channel_67,logData$channel_68,logData$channel_69,logData$channel_70, + logData$channel_71,logData$channel_72,logData$channel_73,logData$channel_74,logData$channel_75,logData$channel_76,logData$channel_77,logData$channel_78,logData$channel_79,logData$channel_80, + logData$channel_81,logData$channel_82,logData$channel_83,logData$channel_84,logData$channel_85,logData$channel_86,logData$channel_87,logData$channel_88,logData$channel_89,logData$channel_90, + logData$channel_91,logData$channel_92,logData$channel_93,logData$channel_94,logData$channel_95,logData$channel_96,logData$channel_97,logData$channel_98,logData$channel_99,logData$channel_100, + logData$channel_101,logData$channel_102,logData$channel_103,logData$channel_104,logData$channel_105,logData$channel_106,logData$channel_107,logData$channel_108,logData$channel_109,logData$channel_110, + logData$channel_111,logData$channel_112,logData$channel_113,logData$channel_114,logData$channel_115,logData$channel_116,logData$channel_117,logData$channel_118,logData$channel_119,logData$channel_120, + logData$channel_121,logData$channel_122,logData$channel_123,logData$channel_124,logData$channel_125,logData$channel_126,logData$channel_127,logData$channel_128,logData$channel_129,logData$channel_130, + logData$channel_131,logData$channel_132,logData$channel_133,logData$channel_134,logData$channel_135,logData$channel_136,logData$channel_137,logData$channel_138,logData$channel_139,logData$channel_140, + logData$channel_141,logData$channel_142,logData$channel_143,logData$channel_144,logData$channel_145,logData$channel_146,logData$channel_147,logData$channel_148,logData$channel_149,logData$channel_150, + logData$channel_151,logData$channel_152,logData$channel_153,logData$channel_154,logData$channel_155,logData$channel_156,logData$channel_157,logData$channel_158,logData$channel_159,logData$channel_160, + logData$channel_161,logData$channel_162,logData$channel_163,logData$channel_164,logData$channel_165,logData$channel_166,logData$channel_167,logData$channel_168,logData$channel_169,logData$channel_170, + logData$channel_171,logData$channel_172,logData$channel_173,logData$channel_174,logData$channel_175,logData$channel_176,logData$channel_177,logData$channel_178,logData$channel_179,logData$channel_180, + logData$channel_181,logData$channel_182,logData$channel_183,logData$channel_184,logData$channel_185,logData$channel_186,logData$channel_187,logData$channel_188,logData$channel_189,logData$channel_190, + 
logData$channel_191,logData$channel_192,logData$channel_193,logData$channel_194,logData$channel_195,logData$channel_196,logData$channel_197,logData$channel_198,logData$channel_199,logData$channel_200, + logData$channel_201,logData$channel_202,logData$channel_203,logData$channel_204,logData$channel_205,logData$channel_206,logData$channel_207,logData$channel_208,logData$channel_209,logData$channel_210, + logData$channel_211,logData$channel_212,logData$channel_213,logData$channel_214,logData$channel_215,logData$channel_216,logData$channel_217,logData$channel_218,logData$channel_219,logData$channel_220, + logData$channel_221,logData$channel_222,logData$channel_223,logData$channel_224,logData$channel_225,logData$channel_226,logData$channel_227,logData$channel_228,logData$channel_229,logData$channel_230, + logData$channel_231,logData$channel_232,logData$channel_233,logData$channel_234,logData$channel_235,logData$channel_236,logData$channel_237,logData$channel_238,logData$channel_239,logData$channel_240, + logData$channel_241,logData$channel_242,logData$channel_243,logData$channel_244,logData$channel_245,logData$channel_246,logData$channel_247,logData$channel_248,logData$channel_249,logData$channel_250, + logData$channel_251,logData$channel_252,logData$channel_253,logData$channel_254,logData$channel_255,logData$channel_256,sep=";") + +#' Calculates the readout date and time in POSIXct format + logData$readout_time<-lubridate::parse_date_time(as.character(logData$year_and_day),order="yj") + op <- options(digits.secs=3) + logData$readout_time<-lubridate::with_tz(logData$readout_time+(as.numeric(logData$time)*60*60),'UTC') + +#' Create additional header columns needed to match avro schema + asset_string <- regexpr("\\/[0-9]{5}\\/",FileIn) #' For SUNA asset info not included in log file header. Need it from input file folder name. 
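+ #' (Worked example: for FileIn = ".../sunav2_logjam_load_files/20349/logjam_prod_20349_<hash>.csv",
+ #'  the pattern "\\/[0-9]{5}\\/" matches "/20349/", so the gsub below yields asset = "20349".)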
+ asset<-gsub("\\/","",substr(FileIn,asset_string[1],asset_string[1]+attributes(asset_string)$match.length-1)) + logData$source_id<-asset + logData$site_id<-NA #' This can be left blank for now + serial_number<-as.data.frame(strsplit(logMetadata[1,2],":")) + logData$header_manufacturer<-"SATS" + logData$header_serial_number<-serial_number[2,1] + logData$header_light_frame<-NA + for(i in 1:nrow(logData)){if(logData[i,which(colnames(logData)=="dark_value_used_for_fit")]==0){logData[i,which(colnames(logData)=="header_light_frame")]=0} + else{logData[i,which(colnames(logData)=="header_light_frame")]=1}} + +#' Re-orders columns so they match the avro schema + logData<-logData[,c("source_id","site_id","readout_time","header_manufacturer","header_serial_number","header_light_frame","year_and_day","time","nitrate_concentration", + "nitrogen_in_nitrate","absorbance_254nm","absorbance_350nm","bromide_trace","spectrum_average","dark_value_used_for_fit","integration_time_factor", + "spectrum_channels","internal_temperature","spectrometer_temperature","lamp_temperature","lamp_on_time","relative_humidity","main_voltage","lamp_voltage", + "internal_voltage","main_current","fit_aux_1","fit_aux_2","fit_base_1","fit_base_2","fit_rmse","ctd_time","ctd_salinity","ctd_temperature","ctd_pressure", + "check_sum","error_missing_data")] + +#' Checks that there are no dates prior to when NEON began collecting IS data + if(any(logData$readout_time<"2014-01-01 00:00:00 UTC")){ + log$debug(base::paste0("Data contains dates prior to when NEON began collecting IS data"))} +#' Checks that there are no future dates after the current date + if(any(logData$readout_time>Sys.time())){ + log$debug(base::paste0("Data contains future dates after the current date"))} + +#' Output file + #' Create output directory + year <- substr(logData$readout_time[1],1,4) + month <- substr(logData$readout_time[1],6,7) + day <- substr(logData$readout_time[1],9,10) + DirOutLogFile <- paste0(DirOut,'/sunav2/',year,'/',month,'/',day,'/',asset,'/data/') + base::dir.create(DirOutLogFile,recursive=TRUE) + csv_name <-paste0('sunav2_',asset,'_',year,'-',month,'-',day,'_log') + #' Writes parquet file to output directory + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = logData, + NameFile = base::paste0(DirOutLogFile,csv_name,".parquet"), + Schm = SchmDataOut),silent=TRUE) + if(class(rptOut)[1] == 'try-error'){ + log$error(base::paste0('Cannot write Data to ',base::paste0(DirOutLogFile,csv_name,".parquet"),'. ',attr(rptOut, "condition"))) + stop() + } else { + log$info(base::paste0('Data written successfully in ', base::paste0(DirOutLogFile,csv_name,".parquet"))) + } + +} +#' End of file + + + + + + + + + + + + + + + + + diff --git a/flow/flow.sunav2.quality.flags/Dockerfile b/flow/flow.sunav2.quality.flags/Dockerfile new file mode 100644 index 000000000..c1a98605b --- /dev/null +++ b/flow/flow.sunav2.quality.flags/Dockerfile @@ -0,0 +1,20 @@ +# Dockerfile for NEON IS Data Processing - sunav2 sensor-specific quality flags + +# Start with the neon-is-pack-qaqc-r image. 
+FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-qaqc-r:v1.1.10 + +ARG FLOW_DIR="./flow" +ARG APP_DIR="flow.sunav2.quality.flags" + +# maintainer handle +MAINTAINER "Bobby Hensley" hensley@battelleecology.org + +# Copy the lockfile and restore known working versions of R dependency packages +# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image +COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock +RUN R -e 'renv::restore(lockfile="/renv.lock")' + +# Copy in sunav2 flag workflow +COPY ${FLOW_DIR}/${APP_DIR}/flow.sunav2.quality.flags.R . +COPY ${FLOW_DIR}/${APP_DIR}/wrap.sunav2.quality.flags.R . + diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R new file mode 100644 index 000000000..2b69bd44e --- /dev/null +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -0,0 +1,158 @@ +############################################################################################## +#' @title Workflow for SUNA Sensor-specific Quality Flags + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} + +#' @description Workflow. Uses thresholds to apply sensor-specific quality flags to SUNA data. +#' Measurements where the lamp has not had enough time to stabilze (nitrateLampStabilizeQF=1) are removed. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", The base file path to the input data, QA/QC plausibility flags and quality flag thresholds. +#' #/pfs/BASE_REPO/date/location/sunav2/cfgloc, where files will then be in /data, /flags and /threshold sub-folders. +#' +#' 2. "DirOut=value", The base file path for the output data. +#' +#' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of \code{DirIn}. +#' +#' 4. "FileSchmData=value" (optional), The avro schema for the input and output data file. +#' +#' 5. "FileSchmQf=value" (optional), The avro schema for the combined flag file. +#' +#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. +#' +#' @return Sensor-specific quality flag files in daily parquets. + +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' flow.sunav2.quality.flags <- function(DirIn="~/pfs/nitrate_thresh_select_ts_pad/2025/06/25/nitrate_HOPB112100", +#' DirOut="~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733", +#' FileSchmQf=base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') +#' log=log) +#' Stepping through the code in R studio +# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100", +# "DirOut=~/pfs/out", +# "DirErr=~/pfs/out/errored_datums") +#' rm(list=setdiff(ls(),c('arg','log'))) + +#' @seealso None currently + +# changelog and author contributions / copyrights +#' Bobby Hensley (2025-08-30) +#' Initial creation. +#' +#' Bobby Hensley (2025-09-18) +#' Updated so that measurements prior to lamp stabilization (never intended to be +#' used in downstream pipeline) are removed. 
+#' +#' Nora Catolico (2025-09-22) +#' combined input df and updated error logging +# + +############################################################################################## +options(digits.secs = 3) +library(foreach) +library(doParallel) +library(lubridate) + +# Source the wrapper function. Assume it is in the working directory +source("./wrap.sunav2.quality.flags.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","DirOut","DirErr"), + NameParaOptn = c("FileSchmData","FileSchmQf"),log = log) + +# Echo arguments +log$debug(base::paste0('Input data directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0('Schema for output data: ', Para$FileSchmData)) +log$debug(base::paste0('Schema for output flags: ', Para$FileSchmQf)) + +# Read in the schemas so we only have to do it once and not every time in the avro writer. +if(base::is.null(Para$FileSchmData) || Para$FileSchmData == 'NA'){ + SchmDataOut <- NULL +} else { + SchmDataOut <- base::paste0(base::readLines(Para$FileSchmData),collapse='') +} +if(base::is.null(Para$FileSchmQf) || Para$FileSchmQf == 'NA'){ + SchmFlagsOut <- NULL +} else { + SchmFlagsOut <- base::paste0(base::readLines(Para$FileSchmQf),collapse='') +} + + +# Find all the input paths (datums). We will process each one. 
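+# A datum here is a terminal path containing the expected subfolders, e.g. (following the example
+# in the wrapper)
+#   ~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/01/nitrate-surfacewater_CRAM103100/sunav2/CFGLOC110733
+# with data/, flags/ and threshold/ subfolders inside; def.dir.in below keys on the 'data' and
+# 'flags' subfolders being present.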
+DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = c('data','flags'), + log = log) + +# Process each datum path +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxFileIn = DirIn) %dopar% { + log$info(base::paste0('Processing path to file: ', idxFileIn)) + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.sunav2.quality.flags( + DirIn=idxFileIn, + DirOutBase=Para$DirOut, + SchmDataOut=SchmDataOut, + SchmFlagsOut=SchmFlagsOut, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + log$error(err$message) + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(idxFileIn, + log = log) + DirSub <- strsplit(InfoDirIn$dirRepo,".", fixed = TRUE)[[1]][1] + NEONprocIS.base::def.dir.crea(DirBgn = Para$DirErr, DirSub = DirSub, + log = log) + csvname <- DirSub %>% + strsplit( "/" ) %>% + sapply( tail, 1 ) + nameFileErr <- base::paste0(Para$DirErr, DirSub, "/",csvname) + log$info(base::paste0("Re-routing failed datum path to ", nameFileErr)) + con <- base::file(nameFileErr, "w") + base::close(con) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + return() +} + + + + diff --git a/flow/flow.sunav2.quality.flags/renv.lock b/flow/flow.sunav2.quality.flags/renv.lock new file mode 100644 index 000000000..dd613294d --- /dev/null +++ b/flow/flow.sunav2.quality.flags/renv.lock @@ -0,0 +1,182 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "cli": { + "Package": "cli", + "Version": "3.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "16850760556401a2eeb27d39bd11c9cb", + "Requirements": [] + }, + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2720e3fd3dad08f34b19b56b3d6f073d", + "Requirements": [] + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "glue": { + "Package": "glue", + "Version": "1.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5899f1eaa825580172bb56c08266f37c", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8552d117e1b808b09a832f589b79035", + "Requirements": [ + "cli", + "glue", + "rlang" + ] + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "680ad542fbcf801442c83a6ac5a2126c", + 
"Requirements": [ + "generics", + "timechange" + ] + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7ce2733a9826b3aeb1775d56fd305472", + "Requirements": [] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1", + "Requirements": [] + }, + "stringi": { + "Package": "stringi", + "Version": "1.7.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "bba431031d30789535745a9627ac9271", + "Requirements": [] + }, + "stringr": { + "Package": "stringr", + "Version": "1.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "960e2ae9e09656611e0b8214ad543207", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "magrittr", + "rlang", + "stringi", + "vctrs" + ] + }, + "timechange": { + "Package": "timechange", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8548b44f79a35ba1791308b61e6012d7", + "Requirements": [ + "cpp11" + ] + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c03fa420630029418f7e6da3667aac4a", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang" + ] + } + } +} diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R new file mode 100644 index 000000000..b18dafcce --- /dev/null +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -0,0 +1,286 @@ +############################################################################################## +#' @title Wrapper for SUNA sensor-specific quality flagging + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} +#' +#' @description Wrapper function. Uses thresholds to apply sensor-specific quality flags to SUNA data. +#' Measurements where the lamp has not had enough time to stabilize (nitrateLampStabilizeQF=1) are removed. +#' +#' @param DirIn Character value. The base file path to the input data, QA/QC plausibility flags and quality flag thresholds. +#' +#' @param DirOut Character value. The base file path for the output data. +#' +#' @param SchmDataOut (optional), A json-formatted character string containing the schema for the data file. +#' This should be the same for the input as the output. Only the number of rows of measurements should change. +#' +#' @param SchmFlagsOut (optional), A json-formatted character string containing the schema for the output flags. +#' +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. +#' +#' @return SUNA data file and combined flag file in daily parquets. 
+#' +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 +#' +#' @keywords Currently none +#' +#' @examples +#' # Not run +# DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/01/nitrate-surfacewater_CRAM103100/sunav2/CFGLOC110733" +# DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/01/nitrate-surfacewater_CRAM103100/sunav2/CFGLOC110733" +# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_logfilled.avsc'),collapse='') +# SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_all_flags.avsc'),collapse='') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +#' +#' +#' @changelog +#' Bobby Hensley (2025-08-30) +#' Initial creation. +#' +#' Bobby Hensley (2025-09-18) +#' Updated so that measurements prior to lamp stabilization (never intended to be +#' used in downstream pipeline) are removed. +#' +#' Bobby Hensley (2025-09-22) +#' Updated to use single input directory and added check that data and flag file +#' have same number of measurements. +#' +#' Bobby Hensley (2025-10-30) +#' Updated to revert over-flagged measurements at end of burst. +#' +#' Bobby Hensley (2025-12-10) +#' Updated lamp stabilization to pass added null "filler" for completely missing bursts. +#' +#' Bobby Hensley (2025-12-16) +#' Updated so that dark measurements caused by lamp temperature cutoff are still counted as part of same burst. +#' Updated so that any low transmittance error codes ("-1") are always flagged and set to NA. +#' +#' Bobby Hensley (2025-12-18) +#' Updated so lamp stabilization test sets failed values to NA rather than removing entire line. +############################################################################################## +wrap.sunav2.quality.flags <- function(DirIn, + DirOutBase, + SchmDataOut=NULL, + SchmFlagsOut=NULL, + log=NULL +){ + + #' Start logging if not already. + if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) + DirInData <- paste0(DirIn,"/data") + DirInFlags <- paste0(DirIn,"/flags") + DirInThresholds <- paste0(DirIn,"/threshold") + DirOut <- base::paste0(DirOutBase,InfoDirIn$dirRepo) + DirOutData <- base::paste0(DirOut,"/data") + base::dir.create(DirOutData,recursive=TRUE) + DirOutFlags <- base::paste0(DirOut,"/flags") + base::dir.create(DirOutFlags,recursive=TRUE) + + #' Read in parquet file of SUNA data. + dataFileName<-base::list.files(DirInData,full.names=FALSE) + if(length(dataFileName)==0){ + log$error(base::paste0('Data file not found in ', DirInData)) + stop() + } else { + sunaData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInData, '/', dataFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',dataFileName)) + } + + #' Read in parquet file of QAQC plausibility flags. + plausFileName<-grep("flagsPlaus",base::list.files(DirInFlags,full.names=FALSE),value=TRUE) + if(length(plausFileName)==0){ + log$error(base::paste0('Plausibility flags not found in ', DirInFlags)) + stop() + } else { + plausFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInFlags, '/', plausFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',plausFileName)) + } + + #' Read in parquet file of calibration flags. 
+  calFileName<-grep("flagsCal",base::list.files(DirInFlags,full.names=FALSE),value=TRUE)
+  if(length(calFileName)==0){
+    log$error(base::paste0('Calibration flags not found in ', DirInFlags))
+    stop()
+  } else {
+    calFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInFlags, '/', calFileName),
+                                                       log = log),silent = FALSE)
+    log$debug(base::paste0('Successfully read in file: ',calFileName))
+  }
+
+  #' Read in parquet file of logged file flags.
+  logFileName<-grep("logFlags",base::list.files(DirInFlags,full.names=FALSE),value=TRUE)
+  if(length(logFileName)==0){
+    log$error(base::paste0('Log flags not found in ', DirInFlags))
+    stop()
+  } else {
+    logFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInFlags, '/', logFileName),
+                                                       log = log),silent = FALSE)
+    log$debug(base::paste0('Successfully read in file: ',logFileName))
+  }
+
+  #' Convert measurements to be tested from class character to numeric.
+  sunaData$relative_humidity<-as.numeric(sunaData$relative_humidity)
+  sunaData$lamp_temperature<-as.numeric(sunaData$lamp_temperature)
+  sunaData$spec_average<-as.numeric(sunaData$spec_average)
+  sunaData$dark_signal_average<-as.numeric(sunaData$dark_signal_average)
+
+  #' Create data frame of input data file readout_times to serve as basis of sensor specific flag file.
+  sensorFlags<-as.data.frame(sunaData$readout_time)
+  colnames(sensorFlags)<-c("readout_time")
+
+  #' Read in json file of quality flag thresholds.
+  thresholdFileName<-base::list.files(DirInThresholds,full.names=FALSE)
+  sunaThresholds<-base::try(NEONprocIS.qaqc::def.read.thsh.qaqc.df(NameFile = base::paste0(DirInThresholds, '/', thresholdFileName)),silent = FALSE)
+
+  #' Perform internal humidity test.
+  humidityThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Maximum Internal humidity"),]
+  maxHumidity<-humidityThreshold$number_value
+  sensorFlags$nitrateHumidityQF<-NA
+  for(i in 1:nrow(sunaData)){
+    if(is.na(sunaData[i,which(colnames(sunaData)=='relative_humidity')])){
+      sensorFlags[i,which(colnames(sensorFlags)=='nitrateHumidityQF')]=-1}
+    if(!is.na(sunaData[i,which(colnames(sunaData)=='relative_humidity')])){
+      if(sunaData[i,which(colnames(sunaData)=='relative_humidity')]>maxHumidity){
+        sensorFlags[i,which(colnames(sensorFlags)=='nitrateHumidityQF')]=1}
+      else{sensorFlags[i,which(colnames(sensorFlags)=='nitrateHumidityQF')]=0}}
+  }
+
+  #' Perform lamp temperature test (New condition needs to be created. Using default for now).
+  # lampTempThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Maximum Lamp Temperature"),]
+  # maxLampTemp<-lampTempThreshold$number_value
+  maxLampTemp=35 #' Hard-coded until thresholds are updated.
+  sensorFlags$nitrateLampTempQF<-NA
+  for(i in 1:nrow(sunaData)){
+    if(is.na(sunaData[i,which(colnames(sunaData)=='lamp_temperature')])){
+      sensorFlags[i,which(colnames(sensorFlags)=='nitrateLampTempQF')]=-1}
+    if(!is.na(sunaData[i,which(colnames(sunaData)=='lamp_temperature')])){
+      if(sunaData[i,which(colnames(sunaData)=='lamp_temperature')]>maxLampTemp){
+        sensorFlags[i,which(colnames(sensorFlags)=='nitrateLampTempQF')]=1}
+      else{sensorFlags[i,which(colnames(sensorFlags)=='nitrateLampTempQF')]=0}}
+  }
+
+  #' Perform light to dark spectral ratio test.
+ spectralRatioThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Minimum Light to Dark Spec Average Ratio"),] + minLightDarkRatio<-spectralRatioThreshold$number_value + sensorFlags$nitrateLightDarkRatioQF<-NA + for(i in 1:nrow(sunaData)){ + if(is.na(sunaData[i,which(colnames(sunaData)=='dark_signal_average')])|is.na(sunaData[i,which(colnames(sunaData)=='spec_average')])){ + sensorFlags[i,which(colnames(sensorFlags)=='nitrateLightDarkRatioQF')]=-1} + if(!is.na(sunaData[i,which(colnames(sunaData)=='dark_signal_average')])&!is.na(sunaData[i,which(colnames(sunaData)=='spec_average')])){ + if(sunaData[i,which(colnames(sunaData)=='spec_average')]/sunaData[i,which(colnames(sunaData)=='dark_signal_average')] /dev/null 2>&1; then + for DIR in $OUT_PATH/NEON.DOM.SITE*; do + echo "Starting non-MDP sites==================" + echo "Syncing $DIR to bucket $BUCKET_NAME" + # Parse the product + [[ "$DIR" =~ ^$OUT_PATH/(.*)$ ]] + PRODUCT="${BASH_REMATCH[1]}" + echo "PRODUCT is $PRODUCT" + rclone \ + --no-check-dest \ + --copy-links \ + --gcs-bucket-policy-only \ + --gcs-no-check-bucket \ + copy \ + "${OUT_PATH}/${PRODUCT}" \ + ":gcs://${BUCKET_NAME}/${PRODUCT}" + done + echo "============ Done for non-MDP sites" + else + echo "No pub output to egress" + fi + + # + # Do the same for MDP sites if mdp sites exists in the output + # Check to see if the output need to be sent to the staging or not + # For example, BUCKET_NAME_MDP: neon-aa-dev-md03-staging/Publication for staging SITE=MD03 + # Read mdp_site_list from githubusercontent + # + if ls $OUT_PATH_MDP/NEON.DOM.SITE* 1> /dev/null 2>&1; then + for DIR in $OUT_PATH_MDP/NEON.DOM.SITE*; do + echo "=" + echo "Starting MDP sites==================" + # Parse the product + [[ "$DIR" =~ ^$OUT_PATH_MDP/(.*)$ ]] + PRODUCT="${BASH_REMATCH[1]}" + echo "PRODUCT is $PRODUCT" + for DIR_SUB in $DIR/MD*; do + echo "DIR is $DIR" + echo "DIR_SUB is $DIR_SUB" + # Parse the site + [[ "$DIR_SUB" =~ ^$DIR/(.*)$ ]] + SITE="${BASH_REMATCH[1]}" + # to change to lowercase in case + # export site="${SITE,,}" + # + while read -r mdpsite prod staging bucket_name + do + if [[ $SITE == $mdpsite ]] && [[ $prod == $PROD ]] && [[ $staging == $STAGING ]]; then + BUCKET_NAME_MDP=$bucket_name + echo "$mdpsite products to $bucket_name bucket" + else echo "**** No products available for $mdpsite to $bucket_name bucket" + fi + done < $OUT_MDP_SITES + echo "Syncing $SITE products directory $DIR to mdp bucket $BUCKET_NAME_MDP" + done + rclone \ + --no-check-dest \ + --copy-links \ + --gcs-bucket-policy-only \ + --gcs-no-check-bucket \ + copy \ + "${OUT_PATH_MDP}/${PRODUCT}" \ + ":gcs://${BUCKET_NAME_MDP}/${PRODUCT}" + done + echo "============ Done for MDP sites" + cp -f "$OUT_MDP_SITES" $OUT_PATH_MDP/mdp_sites.txt + else + echo "No MDP pub output to egress" + fi + + # Run second module - pub_upload (using environment variables below as input parameters) + echo "run pub uploader ..." + export DATA_PATH=$OUT_PATH + python3 -m pub_uploader.pub_uploader_main + # Run third module - pub_sync (using environment variables below as input parameters) + echo "run pub sync sites ..." + python3 -m pub_sync.pub_sync_main + EOF + env: + LOG_LEVEL: INFO + + # Environment variables for 1st module: pub_egress. The pub bucket and egress url are specified via secrets below. 
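+    # The MDP site list at OUT_MDP_SITES is expected to hold whitespace-separated columns of
+    # <site> <prod> <staging> <bucket_name>, one line per site; this is what the while-read loop
+    # in the script above parses to pick BUCKET_NAME_MDP for each MDP site.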
+ OUT_PATH: "/pfs/out" + OUT_PATH_MDP: "/pfs/out/mdp" + OUT_MDP_SITES: "/tmp/mdp_sites.txt" + # ERR_PATH can be changed, it is user specified + ERR_PATH: /pfs/out/errored_datums + STARTING_PATH_INDEX: "2" # starting path index to process pub packages. Use "2" to process the whole repo with path structure /pfs/repo_name/... + PROD: "false" # false for non-prod, true for prod + STAGING: "true" # The default is true. + + # Environment variables for 2nd module: pub_upload. + # DATA_PATH is set in the code above to the output from the egress module + # Uses STARTING_PATH_INDEX above + VERSION: 'pachyderm_v1' + CHANGE_BY: pachyderm + + # Environment variables for 3rd module: pub_sync. + # Uses DATE_PATH from input spec. DATA_PATH is set in the code above to the output from the egress module + # Uses CHANGE_BY above + DATE_PATH_YEAR_INDEX: "3" + DATE_PATH_MONTH_INDEX: "4" + DATA_PATH_PRODUCT_INDEX: "3" + DATA_PATH_SITE_INDEX: "4" + DATA_PATH_DATE_INDEX: "5" + DATA_PATH_PACKAGE_INDEX: "6" + PRODUCTS: NEON.DOM.SITE.DP1.20033.001 # CAN BE MULTIPLE, COMMA-SEPARATED + SITES: "all" # CAN BE MULTIPLE, COMMA-SEPARATED array of NEON site codes. "all" will find all sites with pub records in the database. + + secrets: + - name: pdr-secret + mount_path: /var/db_secret + - name: pub-bucket + env_var: BUCKET_NAME + key: BUCKET_NAME + - name: pub-bucket + env_var: EGRESS_URL + key: EGRESS_URL + +input: + group: + - join: + - pfs: + name: DATA_PATH + repo: nitrate_pub_format_and_package + # Glob must be at each intended pub datum (i.e. each site/year/month), grouped by month + glob: /*/*/(*/*) + joinOn: $1 + group_by: $1 + - pfs: + name: DATE_PATH + repo: nitrate_cron_monthly_and_pub_control + glob: /(*/*) + joinOn: $1 + outer_join: True # We want to run even if no data so pub_sync runs + group_by: $1 + empty_files: true +autoscaling: true +resource_requests: + memory: 500M + cpu: .5 +resource_limits: + memory: 1G + cpu: 1.3 +sidecar_resource_requests: + memory: 2G + cpu: 1.3 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/nitrate/nitrate_pub_format_and_package.yaml b/pipe/nitrate/nitrate_pub_format_and_package.yaml new file mode 100644 index 000000000..d3db6206c --- /dev/null +++ b/pipe/nitrate/nitrate_pub_format_and_package.yaml @@ -0,0 +1,163 @@ +--- +pipeline: + name: nitrate_pub_format_and_package +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pub-grp-pack:v4.2.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf $OUT_PATH_TRANSFORMER + rm -rf $OUT_PATH_PACKAGER + rm -rf $OUT_PATH_PUBFILES + rm -rf $OUT_PATH_MAINTFILES + mkdir $OUT_PATH_TRANSFORMER + mkdir $OUT_PATH_PACKAGER + mkdir $OUT_PATH_PUBFILES + mkdir $OUT_PATH_MAINTFILES + + # Set some environment variables for the pub transformer module + export DATA_PATH=$GROUPED_PATH + export OUT_PATH=$OUT_PATH_TRANSFORMER + + # Run pub_workbook_loader to load pub workbooks for pub_transformer and os_table_loader. + python3 -m pub_workbook_loader.pub_workbook_loader_main + + # Run pub_transformer (using environment variables below as input parameters) + python3 -m pub_transformer.pub_transformer_main + + # Run pub_packager. Packager needs to be run at monthly glob. Get those paths. 
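+      # The transformer output under $OUT_PATH_TRANSFORMER is expected to be organized as
+      # product/year/month, so the */*/* glob below yields one product-month path per loop
+      # iteration; each is exported as DATA_PATH so the packager builds one monthly package at a time.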
+ export OUT_PATH=$OUT_PATH_PACKAGER + product_month_paths="${OUT_PATH_TRANSFORMER}/*/*/*" + for path in $product_month_paths; do + echo "Processing product-month path $path" + export DATA_PATH=$path + python3 -m pub_packager.pub_packager_main + done + + # Clean up after pub_transformer. + rm -rf $OUT_PATH_TRANSFORMER + + # Run pub_files. + export OUT_PATH=$OUT_PATH_PUBFILES + export IN_PATH=$OUT_PATH_PACKAGER + export LOCATION_PATH=$GROUPED_PATH + python3 -m pub_files.main + + # Run os_table_loader for maintenance files + export IN_PATH=$OUT_PATH_PUBFILES + export OUT_PATH=$OUT_PATH_MAINTFILES + export WORKBOOK_PATH=$WORKBOOK_PATH + export PARTIAL_TABLE_NAME="maintenance" + export FILE_TYPE="csv" + export DB_CONFIG_SOURCE="mount" + export LOG_LEVEL=$LOG_LEVEL + export INPUT_PATH_PARSE_INDEX="3" + export DATA_PRODUCT_PATH_INDEX="3" + export SITE_PATH_INDEX="4" + export YEAR_PATH_INDEX="5" + export MONTH_PATH_INDEX="6" + export PACKAGE_TYPE_PATH_INDEX="7" + python3 -m os_table_loader.publication_main + + # Run os_table_loader for SUNA clean/cal files + export IN_PATH=$OUT_PATH_MAINTFILES + export OUT_PATH="/pfs/out" + export WORKBOOK_PATH=$WORKBOOK_PATH + export PARTIAL_TABLE_NAME="sunaCleanAndCal" + export FILE_TYPE="csv" + export DB_CONFIG_SOURCE="mount" + export LOG_LEVEL=$LOG_LEVEL + export INPUT_PATH_PARSE_INDEX="3" + export DATA_PRODUCT_PATH_INDEX="3" + export SITE_PATH_INDEX="4" + export YEAR_PATH_INDEX="5" + export MONTH_PATH_INDEX="6" + export PACKAGE_TYPE_PATH_INDEX="7" + python3 -m os_table_loader.publication_main + + + EOF + env: + # Environment variables for 2nd (part A) module: pub_workbook_loader. + OUT_PATH_WORKBOOK: "/tmp/pub_workbooks" + PRODUCTS: NEON.DOM.SITE.DP1.20033.001 # Format: NEON.DOM.SITE.DPX.XXXXX.XXX,NEON.DOM.SITE.DPX.XXXXX.XXX,etc + + # Environment variables for 2nd module (part B): pub_transformer. + LOG_LEVEL: INFO + PRODUCT_INDEX: '3' # input path index of the data product identifier. Also shared with pub_packager. + YEAR_INDEX: '4' + MONTH_INDEX: '5' + DAY_INDEX: '7' + DATA_TYPE_INDEX: '8' + GROUP_METADATA_DIR: group + DATA_PATH_PARSE_INDEX: '2' + OUT_PATH_TRANSFORMER: "/tmp/pub_transformer" + WORKBOOK_PATH: "/tmp/pub_workbooks" + + # Environment variables for module: pub_packager. Also uses PRODUCT_INDEX from pub_transformer. + OUT_PATH_PACKAGER: "/tmp/pub_packager" + ERR_PATH_PACKAGER: "/pfs/out/packager/errored_datums" + PUBLOC_INDEX: '6' # input path index of the pub package location (typically the site) + DATE_INDEX: '4' # Starting index of date in path (i.e. year index) + DATE_INDEX_LENGTH: '2' # length of date index for pub package (should be 2 for monthly) + SORT_INDEX: '10' # File name index corresponding to date field (delimiter = .) + + # Environment variables for module: pub_files. 
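+    # pub_files consumes the IN_PATH/OUT_PATH/LOCATION_PATH values exported in the script above;
+    # the GitHub settings below point it at the readme and EML templates used to build the
+    # package metadata files.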
+ OUT_PATH_PUBFILES: "/tmp/pub_files" + RELATIVE_PATH_INDEX: '3' + DB_SECRETS_PATH: /var/db_secret + GITHUB_PEM_PATH: /var/github_secret/key + GITHUB_APP_ID: '300002' + GITHUB_INSTALLATION_ID: '34765458' + GITHUB_HOST: https://api.github.com + GITHUB_REPO_OWNER: NEONScience + GITHUB_README_REPO: neon-metadata-docs + GITHUB_README_PATH: readme/template.j2 + GITHUB_EML_REPO: neon-metadata-docs + GITHUB_EML_BOILERPLATE_PATH: eml/neon_components/NEON_EML_Boilerplate.xml + GITHUB_EML_CONTACT_PATH: eml/neon_components/neon_contact.xml + GITHUB_EML_INTELLECTUAL_RIGHTS_PATH: eml/neon_components/neon_intellectualRights.xml + GITHUB_EML_UNIT_TYPES_PATH: eml/neon_components/neon_unitTypes.xml + GITHUB_EML_UNITS_PATH: eml/neon_components/NEON_units.txt + GITHUB_BRANCH: main + + # Environment variables for module: ais_maintenance table loader + OUT_PATH_MAINTFILES: "/tmp/maint_out" + + secrets: + - name: pdr-secret + mount_path: /var/db_secret + - name: github-neonscience-app-secret + mount_path: /var/github_secret + +input: + pfs: + name: GROUPED_PATH + repo: nitrate_pub_group + # Glob must be product-monthly or product-site-monthly. Product-site-month datums reduce unneccesary republication. + # path structure is e.g. DP1.00098.001/2023/04/CPER/04 (product/year/month/site/day) + glob: /*/*/*/* +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 400M + cpu: 1.2 +resource_limits: + memory: 800M + cpu: 1.2 +sidecar_resource_requests: + memory: 3.5G + cpu: 0.4 +datum_set_spec: + number: 5 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/nitrate/nitrate_pub_group.yaml b/pipe/nitrate/nitrate_pub_group.yaml new file mode 100644 index 000000000..c20de26de --- /dev/null +++ b/pipe/nitrate/nitrate_pub_group.yaml @@ -0,0 +1,53 @@ +--- +pipeline: + name: nitrate_pub_group +transform: +# image_pull_secrets: [battelleecology-quay-read-all-pull-secret] + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pub-grp-pack:v4.2.0 + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - '# Run first module - pub_grouper (using environment variables below as input parameters)' + - python3 -m pub_grouper.pub_grouper_main + env: + # Environment variables for 1st module: pub_grouper. 
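+    # As in the other pipeline specs in this change, path indices count from the root: 0 = /,
+    # 1 = 'pfs', 2 = the repo name, so YEAR_INDEX '3' is the year directory directly under the
+    # DATA_PATH repo and GROUP_INDEX '6' is the group directory.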
+ LOG_LEVEL: INFO + OUT_PATH: "/pfs/out" + ERR_PATH_GROUPER: "pfs/out/errored_datums" + YEAR_INDEX: '3' + GROUP_INDEX: '6' + DATA_TYPE_INDEX: '7' # Also shared with pub_transform + GROUP_METADATA_DIR: group + PUBLOC_KEY: site + LINK_TYPE: SYMLINK + +input: + join: + - pfs: + name: DATA_PATH + repo: nitrate_level1_group_consolidate_srf + # Glob should be monthly and joined with pub_control to hold pub until month is likely complete + glob: /(*/*) + joinOn: $1 + - pfs: + repo: nitrate_cron_monthly_and_pub_control + glob: /(*/*) + joinOn: $1 + empty_files: true +parallelism_spec: + constant: 2 +autoscaling: true +resource_requests: + memory: 1.8G + cpu: 1 +resource_limits: + memory: 2.5G + cpu: 1.5 +sidecar_resource_requests: + memory: 3G + cpu: 1 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/nitrate/nitrate_qm_group_and_compute.yaml b/pipe/nitrate/nitrate_qm_group_and_compute.yaml new file mode 100644 index 000000000..7b65a30d2 --- /dev/null +++ b/pipe/nitrate/nitrate_qm_group_and_compute.yaml @@ -0,0 +1,68 @@ +--- +pipeline: + name: nitrate_qm_group_and_compute +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-qaqc-qm-grp:v2.1.1 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # ---- Run module - quality metrics (averaged) ---- + Rscript ./flow.qaqc.qm.R \ + DirIn=$QAQC_PLAUSIBILITY_PATH \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + "WndwAgr=015" \ + "WghtAlphBeta=2|1" \ + Thsh=0.2 \ + FileSchmQm=$FILE_SCHEMA_QM \ + "GrpQfAlph1=nitrate:nitrateRangeQF|nitrateStepQF|nitrateSpikeQF|nitratePersistenceQF|nitrateHumidityQF|nitrateLampTempQF|nitrateLightDarkRatioQF|nitrateLampStabilizeQF" \ + "GrpQfBeta1=nitrate:nitrateRangeQF|nitrateStepQF|nitrateSpikeQF" + env: + LOG_LEVEL: DEBUG +input: + cross: + - pfs: + name: FILE_SCHEMA_QM + repo: nitrate_avro_schemas + glob: /nitrate/nitrate_quality_metrics.avsc + - pfs: + name: QAQC_PLAUSIBILITY_PATH + repo: nitrate_flags_specific + glob: /*/*/*/* +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 500M + cpu: 1.2 +resource_limits: + memory: 1G + cpu: 2 +sidecar_resource_requests: + memory: 2G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_srf_assignment.yaml b/pipe/nitrate/nitrate_srf_assignment.yaml new file mode 100644 index 000000000..dbea14b0d --- /dev/null +++ b/pipe/nitrate/nitrate_srf_assignment.yaml @@ -0,0 +1,58 @@ +--- +pipeline: + name: nitrate_srf_assignment +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - export ERR_PATH="/pfs/out/errored_datums$FILE_YEAR" + - Rscript + ./flow.srf.asgn.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + FileYear=$FILE_YEAR + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-srf-asgn:v1.1.2 + env: + LOG_LEVEL: INFO +input: + cross: + - pfs: + name: DIR_IN + repo: nitrate_srf_loader + glob: /* + - pfs: + name: FILE_YEAR + repo: sunav2_cron_daily_and_date_control + 
glob: /data_year*.txt +parallelism_spec: + constant: 4 +autoscaling: true +resource_requests: + memory: 200M + cpu: 0.8 +resource_limits: + memory: 800M + cpu: 1.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.2 +datum_set_spec: + number: 10 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_srf_loader.yaml b/pipe/nitrate/nitrate_srf_loader.yaml new file mode 100644 index 000000000..7e1d59cd9 --- /dev/null +++ b/pipe/nitrate/nitrate_srf_loader.yaml @@ -0,0 +1,51 @@ +pipeline: + name: nitrate_srf_loader +transform: + cmd: + - /bin/bash + env: + GROUP_PREFIX: nitrate-surfacewater_ + LOG_LEVEL: INFO + OUT_PATH: /pfs/out + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-srf-loader:v1.0.0 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + secrets: + - name: pdr-secret + mount_path: /var/db_secret + stdin: + - '#!/bin/bash' + - python3 -m srf_loader.srf_loader_main +input: + pfs: + branch: master + empty_files: true + glob: /* + repo: sunav2_cron_daily_and_date_control_tick +autoscaling: true +resource_requests: + memory: 50M + cpu: 0.1 +resource_limits: + memory: 300M + cpu: 1.5 +sidecar_resource_requests: + memory: 500M + cpu: 0.3 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_stats_group_and_compute.yaml b/pipe/nitrate/nitrate_stats_group_and_compute.yaml new file mode 100644 index 000000000..9ec23ab58 --- /dev/null +++ b/pipe/nitrate/nitrate_stats_group_and_compute.yaml @@ -0,0 +1,89 @@ +--- +pipeline: + name: nitrate_stats_group_and_compute +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-stat-basc-grp:v2.0.2 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -r -f /tmp/pfs/filter_joined + mkdir -p /tmp/pfs/filter_joined + # Run first module - filter-joiner (using environment variables below as input parameters) + python3 -m filter_joiner.filter_joiner_main + # Run second module - basic stats + Rscript ./flow.stat.basc.R \ + DirIn=/tmp/pfs/filter_joined \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + WndwAgr=015 \ + FileSchmStat=$FILE_SCHEMA_STATS \ + "TermStat1=nitrate:mean|minimum|maximum|variance|numPts|stdEr" + EOF + env: + # Environment variables for filter-joiner + CONFIG: | + --- + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. 
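+      # For example, a (hypothetical) QAQC_PATH file such as
+      # /pfs/QAQC_PATH/2025/06/01/nitrate-surfacewater_CRAM103100/sunav2/CFGLOC110733/data/file.parquet
+      # indexes as 3=year, 4=month, 5=day, 6=group, 7=source type, 8=named location, which is why
+      # both inputs below join on index 6.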
+ # Use unix-style glob pattern to select the desired directories in each repo + input_paths: + - path: + name: QAQC_PATH + # Filter for data & uncertainty_data directories + glob_pattern: /pfs/QAQC_PATH/*/*/*/*/*/*/data/** + # Join on named location (already joined below by day) + join_indices: [6] + - path: + name: UNCERTAINTY_PATH + # Filter for data directory + glob_pattern: /pfs/UNCERTAINTY_PATH/*/*/*/*/*/*/uncertainty*/** + # Join on named location (already joined below by day) + join_indices: [6] + OUT_PATH: /tmp/pfs/filter_joined + LOG_LEVEL: INFO + RELATIVE_PATH_INDEX: "3" + LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined module. + # Environment variables for calibration module + PARALLELIZATION_INTERNAL: '5' # Option for stats module +input: + cross: + - pfs: + name: FILE_SCHEMA_STATS + repo: nitrate_avro_schemas + glob: /nitrate/nitrate_stats.avsc + - join: + - pfs: + name: QAQC_PATH + repo: nitrate_flags_specific + glob: /(*/*/*) + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + - pfs: + name: UNCERTAINTY_PATH + repo: nitrate_group_path + glob: /(*/*/*) + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY +parallelism_spec: + constant: 5 +resource_requests: + memory: 1.8G + cpu: 6 +resource_limits: + memory: 3G + cpu: 7 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +autoscaling: true +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml b/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml new file mode 100644 index 000000000..a15e3a4de --- /dev/null +++ b/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml @@ -0,0 +1,81 @@ +--- +pipeline: + name: nitrate_thresh_select_ts_pad +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-thsh-slct-ts-pad:v2.1.1 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf /tmp/threshold_select + rm -rf /tmp/threshold_selectCopy + mkdir -p /tmp/threshold_select + # Run first module - threshold_select' + Rscript ./flow.thsh.slct.R \ + DirIn=$REPO_LOCATIONS \ + DirOut=/tmp/threshold_select \ + DirErr=/pfs/out/errored_datums \ + FileThsh=$FILE_THRESHOLDS \ + "TermCtxt1=nitrate" \ + "DirSubCopy=location|data|flags" + # Copy output to another interim folder to destroy links (cannot daisy chain links from pfs input to output) + cp -rL /tmp/threshold_select /tmp/threshold_selectCopy || : # Allow to fail without exit code (happens if step above produced no output) || : # Allow to fail without exit code (happens if step above produced no output) + rm -r -f /tmp/threshold_select + # Run second module - timeseries_padder + python3 -m timeseries_padder.timeseries_padder.variable_pad_main --yearindex 3 --monthindex 4 --dayindex 5 --locindex 8 --subdirindex 9 + EOF + env: + DATA_PATH: /tmp/threshold_selectCopy + OUT_PATH: /pfs/out + LOG_LEVEL: INFO + PAD_DIR: data + COPY_DIR: flags # Can be multiple, separated by commas without spaces. Directories other than the pad directory and threshold directory to copy to the output (e.g. location,flags). Set to something like 'none' if none other are desired. 
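+    # The timeseries_padder index arguments in the script above (--yearindex 3 --monthindex 4
+    # --dayindex 5 --locindex 8 --subdirindex 9) assume the
+    # /tmp/threshold_selectCopy/yyyy/mm/dd/<group>/<source type>/<named location>/<subdir>
+    # layout produced by the threshold select step (index 0 = /, 1 = tmp, 2 = threshold_selectCopy).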
+ RELATIVE_PATH_INDEX: '3' + PARALLELIZATION_INTERNAL: '1' # For threshold select module +output_branch: master +input: + cross: + - pfs: + name: REPO_LOCATIONS + repo: nitrate_group_path + glob: /*/*/* + - pfs: + name: FILE_THRESHOLDS + repo: nitrate_threshold + glob: /thresholds.json +parallelism_spec: + constant: 1 +autoscaling: true +resource_requests: + memory: 300M + cpu: 1.1 +resource_limits: + memory: 1.5G + cpu: 5 +sidecar_resource_requests: + memory: 3G + cpu: 0.7 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_threshold.yaml b/pipe/nitrate/nitrate_threshold.yaml new file mode 100644 index 000000000..18faa901e --- /dev/null +++ b/pipe/nitrate/nitrate_threshold.yaml @@ -0,0 +1,53 @@ +--- +pipeline: + name: nitrate_threshold +transform: + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-threshold-loader:v1.0.0 + cmd: + - /bin/bash + stdin: + - '#!/bin/bash' + - python3 -m threshold_loader.threshold_loader_main + env: + OUT_PATH: /pfs/out + LOG_LEVEL: INFO + # Separate multiple terms with a pipe (|). Enter "none" to retrieve all terms + TERM: nitrate + CTXT: nitrate + secrets: + - name: pdr-secret + mount_path: /var/db_secret +input: + pfs: + repo: sunav2_cron_daily_and_date_control_tick + glob: /* + empty_files: true +autoscaling: true +resource_requests: + memory: 32M + cpu: 0.05 +resource_limits: + memory: 200M + cpu: 0.5 +sidecar_resource_requests: + memory: 200M + cpu: 0.5 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/pipe_list_nitrate.txt b/pipe/nitrate/pipe_list_nitrate.txt new file mode 100644 index 000000000..bf8f2e611 --- /dev/null +++ b/pipe/nitrate/pipe_list_nitrate.txt @@ -0,0 +1,17 @@ +nitrate_group_loader.yaml +nitrate_group_assignment.yaml +nitrate_srf_loader.yaml +nitrate_srf_assignment.yaml +nitrate_group_path.yaml +nitrate_threshold.yaml +nitrate_thresh_select_ts_pad.yaml +nitrate_analyze_pad_and_qaqc_plau.yaml +nitrate_flags_specific.yaml +nitrate_stats_group_and_compute.yaml +nitrate_null_gap_ucrt.yaml +nitrate_qm_group_and_compute.yaml +nitrate_level1_group_consolidate_srf.yaml +nitrate_cron_monthly_and_pub_control.yaml +nitrate_pub_group.yaml +nitrate_pub_format_and_package.yaml +nitrate_pub_egress_and_publish.yaml diff --git a/pipe/sunav2/data_source_sunav2_list_years.json b/pipe/sunav2/data_source_sunav2_list_years.json deleted file mode 100644 index 386f47bae..000000000 --- a/pipe/sunav2/data_source_sunav2_list_years.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "pipeline": { - "name": "data_source_sunav2_list_years" - }, - "transform": { - "image":"registry.access.redhat.com/ubi8/ubi-minimal:8.3", - "cmd": [ - "/bin/bash" - ], - "stdin": [ - "#!/bin/bash", - "ls $REPO_IN > /pfs/out/data_years.txt" - ] - }, - 
"input": { - "pfs": { - "name": "REPO_IN", - "repo": "data_source_sunav2", - "glob": "/sunav2", - "empty_files": true - } - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "1K", - "cpu": 0.01 - } -} diff --git a/pipe/sunav2/pipe_list_sunav2.txt b/pipe/sunav2/pipe_list_sunav2.txt new file mode 100644 index 000000000..1fb91b452 --- /dev/null +++ b/pipe/sunav2/pipe_list_sunav2.txt @@ -0,0 +1,18 @@ +sunav2_cron_daily_and_date_control.yaml +sunav2_logjam_list_files.yaml +sunav2_logjam_load_files.yaml +sunav2_logjam_assign_clean_files.yaml +sunav2_data_source_kafka.yaml +sunav2_data_source_trino.yaml +sunav2_trino_data_parser.yaml +sunav2_fill_log_files.yaml +sunav2_calibration_list_files.yaml +sunav2_calibration_loader.yaml +sunav2_calibration_assignment.yaml +sunav2_calibration_group_and_convert.yaml +sunav2_location_asset.yaml +sunav2_location_asset_assignment.yaml +sunav2_location_loader.yaml +sunav2_location_active_dates_assignment.yaml +sunav2_location_group_and_restructure.yaml +sunav2_fill_date_gaps.yaml \ No newline at end of file diff --git a/pipe/sunav2/site-list.json b/pipe/sunav2/site-list.json new file mode 100644 index 000000000..c7b5ce526 --- /dev/null +++ b/pipe/sunav2/site-list.json @@ -0,0 +1,142 @@ +[ + { + "site" : "ARIK", + "kafka_start_date" : "2024-03-01" + }, + { + "site" : "BARC", + "kafka_start_date" : "2024-08-11" + }, + { + "site" : "BIGC", + "kafka_start_date" : "2024-06-01" + }, + { + "site" : "BLDE", + "kafka_start_date" : "2024-05-08" + }, + { + "site" : "BLUE", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "BLWA", + "kafka_start_date" : "2024-08-22" + }, + { + "site" : "CARI", + "kafka_start_date" : "2024-03-01" + }, + { + "site" : "COMO", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "CRAM", + "kafka_start_date" : "2024-07-20" + }, + { + "site" : "CUPE", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "FLNT", + "kafka_start_date" : "2024-08-11" + }, + { + "site" : "GUIL", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "HOPB", + "kafka_start_date" : "2024-01-17" + }, + { + "site" : "HQTW", + "kafka_start_date" : "2023-06-01", + }, + { + "site" : "KING", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "LECO", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "LEWI", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "LIRO", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "MART", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "MAYF", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "MCDI", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "MCRA", + "kafka_start_date" : "2024-02-05" + }, + { + "site" : "OKSR", + "kafka_start_date" : "2024-04-06" + }, + { + "site" : "POSE", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "PRIN", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "PRLA", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "PRPO", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "REDB", + "kafka_start_date" : "2024-02-06" + }, + { + "site" : "SUGG", + "kafka_start_date" : "2024-08-11" + }, + { + "site" : "SYCA", + "kafka_start_date" : "2024-04-11" + }, + { + "site" : "TECR", + "kafka_start_date" : "2024-03-17" + }, + { + "site" : "TOMB", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "TOOK", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "WALK", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "WLOU", + "kafka_start_date" : "2024-02-06" + } +] diff --git 
a/pipe/sunav2/sunav2_calibrated_location_group.yaml b/pipe/sunav2/sunav2_calibrated_location_group.yaml deleted file mode 100644 index a044e48ce..000000000 --- a/pipe/sunav2/sunav2_calibrated_location_group.yaml +++ /dev/null @@ -1,51 +0,0 @@ ---- -pipeline: - name: sunav2_calibrated_location_group -transform: -# image_pull_secrets: [battelleecology-quay-read-all-pull-secret] - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-filt-join:v1.0.0 - cmd: ["/bin/bash"] - stdin: - - "#!/bin/bash" - - "python3 -m filter_joiner.filter_joiner_main" - env: - CONFIG: | - --- - # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. - # Metadata indices will typically begin at index 3. - input_paths: - - path: - name: DATA_PATH - # Filter for data directory - glob_pattern: /pfs/DATA_PATH/sunav2/*/*/*/*/** - # Join on named location (already joined below by day) - join_indices: [7] - outer_join: true - - path: - name: LOCATION_PATH - # Filter for data directory - glob_pattern: /pfs/LOCATION_PATH/sunav2/*/*/*/*/** - # Join on named location (already joined below by day) - join_indices: [7] - OUT_PATH: /pfs/out - LOG_LEVEL: INFO - RELATIVE_PATH_INDEX: "3" -input: - join: - - pfs: - name: DATA_PATH - repo: sunav2_data_calibration_group - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - outer_join: true - empty_files: true - - pfs: - name: LOCATION_PATH - repo: sunav2_location_asset_assignment - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - empty_files: true -parallelism_spec: - constant: "1" -enable_stats: false -standby: true diff --git a/pipe/sunav2/sunav2_calibration_assignment.json b/pipe/sunav2/sunav2_calibration_assignment.json deleted file mode 100644 index bd8d52ee7..000000000 --- a/pipe/sunav2/sunav2_calibration_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_calibration_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.cal.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "PadDay=-1|1" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v2.0.3", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "calibration", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "210M", - "cpu": 0.3 - }, - "parallelism_spec": { - "constant": "4" - } -} diff --git a/pipe/sunav2/sunav2_calibration_assignment.yaml b/pipe/sunav2/sunav2_calibration_assignment.yaml new file mode 100644 index 000000000..bfd45d287 --- /dev/null +++ b/pipe/sunav2/sunav2_calibration_assignment.yaml @@ -0,0 +1,46 @@ +--- +pipeline: + name: sunav2_calibration_assignment +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - export ERR_PATH="/pfs/out/errored_datums$FILE_YEAR" + - Rscript + ./flow.cal.asgn.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + FileYear=$FILE_YEAR + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v2.0.3 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + env: + LOG_LEVEL: INFO +input: + cross: + - pfs: + name: DIR_IN + repo: sunav2_calibration_loader + glob: /sunav2/* + - pfs: + name: FILE_YEAR + repo: sunav2_cron_daily_and_date_control + glob: /data_year*.txt +parallelism_spec: + constant: 2 
+autoscaling: true +resource_requests: + memory: 200M + cpu: 0.8 +resource_limits: + memory: 600M + cpu: 1.5 +sidecar_resource_requests: + memory: 3G + cpu: 1 +datum_set_spec: + number: 5 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml new file mode 100644 index 000000000..2d445c76e --- /dev/null +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -0,0 +1,103 @@ +--- +pipeline: + name: sunav2_calibration_group_and_convert +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-grp-conv:v3.1.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -r -f /tmp/pfs/filter_joined + rm -rf $OUT_PATH + mkdir -p /tmp/pfs/filter_joined + mkdir -p $OUT_PATH # R modules must have pfs in the repo structure + + # Run filter-joiner for data (using environment variables below as input parameters) + python3 -m filter_joiner.filter_joiner_main + + # Run calibration conversion module + Rscript ./flow.cal.conv.R \ + DirIn=/tmp/pfs/filter_joined \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + TermQf=nitrate \ + FileSchmData=$FILE_SCHEMA_DATA \ + FileSchmQf=$FILE_SCHEMA_FLAGS \ + DirSubCopy=flags + EOF + env: + # Environment variables for filter-joiner. + CONFIG: | + --- + # Configuration for filter-joiner module that will bring together the data and calibrations + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. + input_paths: + - path: + name: DATA_PATH + # Filter for data directory + glob_pattern: /pfs/DATA_PATH/sunav2/*/*/*/*/** + # Join on named location (already joined below by source type and day) + join_indices: [7] + outer_join: true + - path: + name: CALIBRATION_PATH + # Filter for data directory + glob_pattern: /pfs/CALIBRATION_PATH/sunav2/*/*/*/*/** + # Join on named location (already joined below by day) + join_indices: [7] + OUT_PATH: /tmp/pfs/filter_joined # Note that R modules use "pfs" in the path structure to determine datums + LOG_LEVEL: INFO + RELATIVE_PATH_INDEX: "3" # Must be consistent across inputs + LINK_TYPE: COPY # options are COPY or SYMLINK. MUST BE SIMLINK IF USING COMBINED MODULE. + # Environment variables for calibration module + PARALLELIZATION_INTERNAL: '3' # Option for calibration conversion module +input: + cross: + - pfs: + name: FILE_SCHEMA_FLAGS + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_calibration_flags.avsc + - pfs: + name: FILE_SCHEMA_DATA + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_logfilled.avsc + # Outer join all repos so that varying sensors between kafka and trino loaders will all get joined with calibrations. Filter-joiner will narrow down. + - join: + - pfs: + name: CALIBRATION_PATH + repo: sunav2_calibration_assignment + glob: /sunav2/(*)/(*)/(*) + joinOn: $1/$2/$3 + outer_join: true + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - pfs: + name: DATA_PATH + repo: sunav2_fill_log_files + glob: /sunav2/(*)/(*)/(*) + joinOn: $1/$2/$3 + outer_join: true + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
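+# The join above pairs calibrations with data on the captured /sunav2/(year)/(month)/(day) path
+# components; both sides are outer-joined so days present in only one repo (e.g. Kafka-only
+# sensors) still flow through, and the filter-joiner CONFIG then narrows the pairing to matching
+# directories at join index 7 within each day.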
+parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 1.5G + cpu: 3.3 +resource_limits: + memory: 3G + cpu: 4.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.6 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_calibration_list_files.yaml b/pipe/sunav2/sunav2_calibration_list_files.yaml new file mode 100644 index 000000000..3d42f72ca --- /dev/null +++ b/pipe/sunav2/sunav2_calibration_list_files.yaml @@ -0,0 +1,30 @@ +--- +pipeline: + name: sunav2_calibration_list_files +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v3.0.0 + cmd: ["/bin/bash"] + env: + CVAL_INGEST_BUCKET: neon-cval + OUT_PATH: /pfs/out + stdin: + - "#!/bin/bash" + - python3 -m calval_loader.calval_loader +input: + pfs: + repo: sunav2_cron_daily_and_date_control_tick + glob: /* + empty_files: true +autoscaling: true +resource_requests: + memory: 500M + cpu: 0.4 +resource_limits: + memory: 1G + cpu: 1.5 +sidecar_resource_requests: + memory: 1G + cpu: 0.5 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml new file mode 100644 index 000000000..71f17324b --- /dev/null +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -0,0 +1,50 @@ +--- +pipeline: + name: sunav2_calibration_loader +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v3.0.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-sstrict-mode/ + set -euo pipefail + IFS=$'\n\t' + + python3 -m calval_loader.load_all_calval_files # run the calibration loader + + EOF + env: + CVAL_INGEST_BUCKET: neon-cval + LOG_LEVEL: INFO + OUT_PATH: /pfs/out + SOURCE_TYPE: "sunav2" + SCHEMA_NAME: "sunav2_raw" + STARTING_PATH_INDEX: "5" + secrets: + - name: pdr-secret + mount_path: /var/db_secret +input: + pfs: + name: IN_PATH + repo: sunav2_calibration_list_files + glob: /*/*/*/* + empty_files: true +parallelism_spec: + constant: 10 +autoscaling: true +resource_requests: + memory: 500M + cpu: 0.5 +resource_limits: + memory: 1G + cpu: 1.5 +sidecar_resource_requests: + memory: 800M + cpu: 0.2 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml new file mode 100644 index 000000000..22eb8fbe7 --- /dev/null +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -0,0 +1,44 @@ +--- +pipeline: + name: sunav2_cron_daily_and_date_control +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-date-cntl:v2.0.1 + cmd: ["/bin/bash"] + env: + # START_DATE ("YYYY-MM-DD") and END_DATE ("YYYY-MM-DD") indicate the max date range (inclusive) to create the /Y/M/D folder structure + # If START_DATE is not set (remove line entirely to unset), the start_date and/or the kafka_start_date for each site will be used, as indicated in the site-list json file + # start_date field in the site-list file is the earliest date to pull data from a site + # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka + # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. 
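+    # Example: with the START_DATE/END_DATE set below, this pipeline lays out the date-control
+    # structure /pfs/out/sunav2/2019/10/29 through /pfs/out/sunav2/2019/12/02 (one folder per day,
+    # with per-site trigger files) for the sites in SITE_FILE; downstream pipelines glob this
+    # repo at /sunav2/*/*/* to define their daily datums.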
+ OUT_PATH: /pfs/out + START_DATE: "2019-10-29" # Inclusive + END_DATE: "2019-12-02" # Inclusive + SOURCE_TYPE: "sunav2" + stdin: + - "#!/bin/bash" + - python3 -m cron_daily_and_date_control.cron_daily_and_date_control_main +input: + cross: + # This cron is the central driver for daily scheduled updates, such as data ingest and metadata loaders. + - cron: + name: tick + spec: "@never" + #spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) + overwrite: true + - pfs: + name: SITE_FILE + repo: sunav2_site_list + glob: /site-list.json +resource_requests: + memory: 100M + cpu: 1 +resource_limits: + memory: 300M + cpu: 1.5 +sidecar_resource_requests: + memory: 500M + cpu: 0.5 +autoscaling: true +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml new file mode 100644 index 000000000..bdcdf9aae --- /dev/null +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml @@ -0,0 +1,43 @@ +--- +pipeline: + name: sunav2_cron_daily_and_date_control_kafka +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-date-cntl:v2.0.1 + cmd: ["/bin/bash"] + env: + # START_DATE ("YYYY-MM-DD") and END_DATE ("YYYY-MM-DD") indicate the max date range (inclusive) to create the /Y/M/D folder structure + # If START_DATE is not set (remove line entirely to unset), the start_date and/or the kafka_start_date for each site will be used, as indicated in the site-list json file + # start_date field in the site-list file is the earliest date to pull data from a site + # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka + # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. + OUT_PATH: /pfs/out + START_DATE: "2025-08-25" # Inclusive + SOURCE_TYPE: "sunav2" + stdin: + - "#!/bin/bash" + - python3 -m cron_daily_and_date_control.cron_daily_and_date_control_main +input: + cross: + # This cron is the central driver for daily scheduled updates, such as data ingest and metadata loaders. + - cron: + name: tick + #spec: "@never" + spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) + overwrite: true + - pfs: + name: SITE_FILE + repo: sunav2_site_list + glob: /site-list.json +resource_requests: + memory: 100M + cpu: 1 +resource_limits: + memory: 300M + cpu: 1.5 +sidecar_resource_requests: + memory: 500M + cpu: 0.5 +autoscaling: true +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_data_calibration_group.yaml b/pipe/sunav2/sunav2_data_calibration_group.yaml deleted file mode 100644 index 8e54d6158..000000000 --- a/pipe/sunav2/sunav2_data_calibration_group.yaml +++ /dev/null @@ -1,51 +0,0 @@ ---- -pipeline: - name: sunav2_data_calibration_group -transform: -# image_pull_secrets: [battelleecology-quay-read-all-pull-secret] - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-filt-join:v1.0.0 - cmd: ["/bin/bash"] - stdin: - - "#!/bin/bash" - - "python3 -m filter_joiner.filter_joiner_main" - env: - CONFIG: | - --- - # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. - # Metadata indices will typically begin at index 3. 
- input_paths: - - path: - name: DATA_PATH - # Filter for data directory - glob_pattern: /pfs/DATA_PATH/sunav2/*/*/*/*/** - # Join on named location (already joined below by day) - join_indices: [7] - outer_join: true - - path: - name: CALIBRATION_PATH - # Filter for data directory - glob_pattern: /pfs/CALIBRATION_PATH/sunav2/*/*/*/*/** - # Join on named location (already joined below by day) - join_indices: [7] - OUT_PATH: /pfs/out - LOG_LEVEL: INFO - RELATIVE_PATH_INDEX: "3" -input: - join: - - pfs: - name: DATA_PATH - repo: data_source_sunav2 - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - outer_join: true - empty_files: true - - pfs: - name: CALIBRATION_PATH - repo: sunav2_calibration_assignment - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - empty_files: true -parallelism_spec: - constant: "1" -enable_stats: false -standby: true diff --git a/pipe/sunav2/sunav2_data_source_kafka.yaml b/pipe/sunav2/sunav2_data_source_kafka.yaml new file mode 100644 index 000000000..7b612f90b --- /dev/null +++ b/pipe/sunav2/sunav2_data_source_kafka.yaml @@ -0,0 +1,209 @@ +--- +pipeline: + name: sunav2_data_source_kafka +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.10.1 + env: + # environment variables for kafka loader + OUT_PATH: /pfs/out #also used for parser + SOURCE_TYPE: "sunav2_raw" + LOG_LEVEL: INFO + YEAR_INDEX: "5" + MONTH_INDEX: "6" + DAY_INDEX: "7" + KAFKA_RETENTION_DAYS: "15" + + # environment variables for the parser + PARSE_FIELD: serial_output + RELATIVE_PATH_INDEX: "4" + PARSED_SCHEMA_PATH: /usr/src/app/parsed-schemas/sunav2/sunav2_parsed.avsc + SOURCE_TYPE: 'sunav2_raw' + DATA_PATH: /pfs/out # takes output of kafka loader as it's input to parse + UPDATE_TRIGGER_TABLE: "False" + RM_OFFSETS: "False" + secrets: + - name: pachyderm-kafka-auth + env_var: KAFKA_USER + key: KAFKA_USER + - name: pachyderm-kafka-auth + env_var: KAFKA_PASSWORD + key: KAFKA_PASSWORD + - name: pachyderm-kafka-auth + env_var: KAFKA_BROKER + key: KAFKA_BROKER + - name: l0-bucket + env_var: BUCKET_NAME + key: LO_BUCKET + - name: pdr-secret + env_var: PDR_HOST + key: hostname + - name: pdr-secret + env_var: PDR_DBNAME + key: database + - name: pdr-secret + env_var: PDR_USER + key: username + - name: pdr-secret + env_var: PDR_PASSWORD + key: password + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Get GCP zone + meta=$(curl -sH "Metadata-Flavor: Google" "http://metadata/computeMetadata/v1/instance/zone") + zone=$(echo $meta | cut -d "/" -f 4) + echo $zone + + # Get today's date for evaluating kafka data retention period + date_today=$(date -u +%Y-%m-%d) + kafka_min_date=$(date -u -d "$KAFKA_RETENTION_DAYS days ago" +%Y-%m-%d) + + # Get date from input path. Terminal path structure must be /SOURCE_TYPE/YYYY/MM/DD/SITE_FILE + # Datum must be set at /SOURCE_TYPE/YYYY/MM/DD or /SOURCE_TYPE/YYYY/MM/DD/SITE_FILE + date_path=$(echo $import_trigger | cut -f $YEAR_INDEX,$MONTH_INDEX,$DAY_INDEX -d "/") + echo $date_path + date_str=$(date -u +%Y-%m-%d -d $date_path) + + # Get each site to run + if [[ -f ${import_trigger} ]]; then + import_trigger_glob="${import_trigger}" + else + import_trigger_glob="${import_trigger}/*" + fi + + sites_output=() + + for site_kafka in $import_trigger_glob; do + site_file=$(basename $site_kafka) # Strip off any path prefix + site=$(echo $site_file | cut -f 1 -d "." --only-delimited) # Extract the site from site.kafka. 
Ignore site-only files (e.g. CPER vs. CPER.kafka) + type=$(echo $site_file | cut -f 2 -d "." --only-delimited) # Extract the 'kafka' from site.kafka + if [ "$type" != "kafka" ] + then + echo "$site_file is not indicated to be streaming from Kafka. Skipping..." + continue + elif [ "$(date -u +%s -d "$date_str")" -lt "$(date -u +%s -d "$kafka_min_date")" ] + then + echo -n "Cannot extract $date_str Kafka data for $site. " + echo -n "Today's date ($date_today) is beyond the Kafka retention period ($KAFKA_RETENTION_DAYS days). Skipping..." + continue + fi + + # We are ok to run + echo "Extracting $date_str kafka data for $SOURCE_TYPE at $site" + + # Get "current data" - data that came in on the specified day, which is the same day it was measured + # Note: We cannot use the --removeoffset flag on the kafka loader (which removes the offsets from the filenames. This will often violate the Pachyderm requirement that different datums cannot write the same file) + ./extract-kafka-sensor.py -s $site -S $SOURCE_TYPE -D "$OUT_PATH/$SOURCE_TYPE" -d $date_str --only current --consumer client.rack=$zone + + # Get "non-current data" - data that came in on the specified day, which is NOT the same day it was measured + date_str_1=$(date +%Y-%m-%d -d "$date_str + 1 day") + ./extract-kafka-sensor.py -s $site -S $SOURCE_TYPE -D "$OUT_PATH/$SOURCE_TYPE" -d $date_str_1 --only noncurrent --consumer client.rack=$zone + + sites_output+=($site) + + done + + # Upload L0 files to bucket, compacting with any existing file with the same name + if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then + linkdir=$(mktemp -d) + shopt -s globstar + out_parquet_glob="${OUT_PATH}/**/*.parquet" + # /pfs/out/li191r/2023/01/01/12345/data/file.parquet + echo "Linking output files to ${linkdir}" + # set -x # Uncomment for debugging + for f in $out_parquet_glob; do + # Parse the path + [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/(.*)/(.*)$ ]] + fsourcetype="${BASH_REMATCH[1]}" + fyear="${BASH_REMATCH[2]}" + fmonth="${BASH_REMATCH[3]}" + fday="${BASH_REMATCH[4]}" + fsourceid="${BASH_REMATCH[5]}" + fname="${BASH_REMATCH[7]}" + # fname_out="${fsourcetype}_${fsourceid}_${fyear}-${fmonth}-${fday}.parquet" # Remove offsets from the filename + outdir="${linkdir}/v2/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}" + mkdir -p "${outdir}" + ln -s "${f}" "${outdir}/${fname}" + + done + + # Upload to bucket, compacting with any existing file + ./compact-bucket-copy.py --sourcepath "${linkdir}" --destbucket "${BUCKET_NAME}" --stripoffset + + # Update the airflow triggering table + for site_output in "${sites_output[@]}"; do + ./update-trigger-table.py -s $site_output -S $SOURCE_TYPE -D "$OUT_PATH/$SOURCE_TYPE" + done + + # set +x # Uncomment for debugging + rm -rf $linkdir + fi + + # run data parser + if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then + python3 -m raw_data_parsers.raw_data_parser.suna_data_parser_main + + # save parsed data to gcs + export SOURCE_TYPE=sunav2 + + linkdir=$(mktemp -d) + shopt -s globstar + out_parquet_glob="${OUT_PATH}/${SOURCE_TYPE}/**/*.parquet" + # /pfs/out/sunav2/2023/01/01/12345/data/file.parquet + echo "Linking output files to ${linkdir}" + # set -x + for f in $out_parquet_glob; do + # Parse the path + [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/(.*)/(.*)$ ]] + fsourcetype="${BASH_REMATCH[1]}" + fyear="${BASH_REMATCH[2]}" + fmonth="${BASH_REMATCH[3]}" + fday="${BASH_REMATCH[4]}" + fsourceid="${BASH_REMATCH[5]}" + fname="${BASH_REMATCH[7]}" + # 
fname_out="${fsourcetype}_${fsourceid}_${fyear}-${fmonth}-${fday}.parquet" # Remove offsets from the filename + outdir="${linkdir}/v2/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}" + mkdir -p "${outdir}" + ln -s "${f}" "${outdir}/${fname}" + + done + + # Upload to bucket, compacting with any existing file + ./compact-bucket-copy.py --sourcepath "${linkdir}" --destbucket "${BUCKET_NAME}" --stripoffset + + # set +x # Uncomment for debugging + rm -rf $linkdir + + fi + + EOF +input: + pfs: + name: import_trigger + repo: sunav2_cron_daily_and_date_control_kafka + # Must be datum by day (e.g. /SOURCE_TYPE/*/*/*) or by day/site (e.g. /SOURCE_TYPE/*/*/*/*) + glob: "/sunav2/*/*/*" +parallelism_spec: + constant: 3 +autoscaling: true +resource_requests: + memory: 2.5G + cpu: 2 +resource_limits: + memory: 3G + cpu: 2.5 +sidecar_resource_requests: + memory: 2G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_data_source_trino.yaml b/pipe/sunav2/sunav2_data_source_trino.yaml new file mode 100644 index 000000000..15a3b6079 --- /dev/null +++ b/pipe/sunav2/sunav2_data_source_trino.yaml @@ -0,0 +1,146 @@ +--- +pipeline: + name: sunav2_data_source_trino +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-data-src-trino:v2.3.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + interimDir="/tmp/interimData" + rm -rf $interimDir + + # Get today's date for evaluating kafka data retention period + date_today=$(date -u +%Y-%m-%d) + kafka_min_date=$(date -u -d "$KAFKA_RETENTION_DAYS days ago" +%Y-%m-%d) + + # Run first module - data_source_site (pull data from database by site) + # Split data source path + for path in $(find -L $import_trigger -type f); do + echo "Processing $path" + p=${path#/pfs} + IFS="/"; arr=($p); unset IFS; + year=${arr[3]} + month=${arr[4]} + day=${arr[5]} + site=${arr[6]} + type=$(echo $site | cut -f 2 -d "." --only-delimited); # Extract the "kafka" from site.kafka if present + if [ "$type" = "kafka" ] && [ $(date -u +%s -d $year-$month-$day) -lt $(date -u +%s -d $kafka_min_date) ] + then + site=$(echo $site | cut -f 1 -d "." --only-delimited); # Extract the site from site.kafka. + echo "$year/$month/$day for $site is indicated to be streaming from Kafka but has passed the Kafka retention period ($KAFKA_RETENTION_DAYS days)." + elif [ "$type" = "kafka" ] + then + echo "$year/$month/$day/$site is indicated to be streaming from Kafka. Skipping..." 
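+          # Within the Kafka retention window this day is expected to arrive via the Kafka loader pipeline, so skip Trino extraction for it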
+ continue + fi + echo "Extracting data from Trino for $year/$month/$day/$site" + export GEN_DATE=$year-$month-$day + export GEN_SITE_NAME=$site + export GEN_OUTPUT_DIR=$interimDir/$SOURCE_TYPE/$year/$month/$day + export REQUESTS_CA_BUNDLE=/etc/pki/tls/cert.pem + mkdir -p $GEN_OUTPUT_DIR + /usr/src/app/genscript/genparquet.py --storesitename --codec gzip + done + + # Run second module - parquet_linkmerge (merges data from a source id that collected data from multiple sites in one day + python3 -m parquet_linkmerge.parquet_linkmerge_main + + # Export L0 data to bucket + if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then + linkdir=$(mktemp -d) + shopt -s globstar + out_parquet_glob="${OUT_PATH}/**/*.parquet" + # Example: /pfs/out/sunav2_raw/2023/01/01/12345/data/file.parquet + echo "Linking output files to ${linkdir}" + # set -x # Uncomment for debugging + for f in $out_parquet_glob; do + # Parse the path + [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/data/(.*)$ ]] + fsourcetype="${BASH_REMATCH[1]}" + fyear="${BASH_REMATCH[2]}" + fmonth="${BASH_REMATCH[3]}" + fday="${BASH_REMATCH[4]}" + fsourceid="${BASH_REMATCH[5]}" + fname="${BASH_REMATCH[6]}" + outdir="${linkdir}/v2/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}" + mkdir -p "${outdir}" + ln -s "${f}" "${outdir}/${fname}" + done + + echo "Syncing files to bucket" + rclone \ + --no-check-dest \ + --copy-links \ + --gcs-bucket-policy-only \ + --gcs-no-check-bucket \ + --metadata-set "content-type=application/vnd.apache.parquet" \ + copy \ + "${linkdir}" \ + ":gcs://${BUCKET_NAME}" + + echo "Removing temporary files" + rm -rf $linkdir + + # set +x # Uncomment for debugging + fi + EOF + env: + # Environment variables for data conversion step + GEN_YAML_CONF: "/usr/src/app/genscript/configs/sunav2_streams.yaml" + GEN_SCHEMA_FILE: "/usr/src/app/schemas/sunav2/sunav2_raw.avsc" + LOG_LEVEL: INFO + REQUESTS_CA_BUNDLE: "/etc/pki/tls/cert.pem" + # Environment variables for linkmerge step + IN_PATH: /tmp/interimData + OUT_PATH: /pfs/out + SOURCE_TYPE_INDEX: '3' + YEAR_INDEX: '4' + MONTH_INDEX: '5' + DAY_INDEX: '6' + SOURCE_ID_INDEX: '7' + KAFKA_RETENTION_DAYS: "15" + # Environment variables for bash code + SOURCE_TYPE: 'sunav2_raw' + secrets: + - name: pachd-trino-secret + key: TRINO_HOST + env_var: PRESTO_HOST + - name: pachd-trino-secret + key: TRINO_PASSWORD + env_var: PRESTO_PASSWORD + - name: pachd-trino-secret + key: TRINO_USER + env_var: PRESTO_USER + - name: l0-bucket + env_var: BUCKET_NAME + key: LO_BUCKET +input: + pfs: + name: import_trigger + repo: sunav2_cron_daily_and_date_control #update in cert + glob: "/sunav2/*/*/*" +output_branch: master +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 800M + cpu: 1.2 +resource_limits: + memory: 1600M + cpu: 2 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_fill_date_gaps.yaml b/pipe/sunav2/sunav2_fill_date_gaps.yaml new file mode 100644 index 000000000..91fd9860f --- /dev/null +++ b/pipe/sunav2/sunav2_fill_date_gaps.yaml @@ -0,0 +1,113 @@ +--- +pipeline: + name: sunav2_fill_date_gaps +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-gap-fill-nonrglr:sha-98108c3 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -r -f /tmp/pfs/interim + rm -rf $OUT_PATH + mkdir -p /tmp/pfs/interim + mkdir -p $OUT_PATH # R modules must have pfs in the repo structure + + # Run first module - date-gap-filler (using environment variables below as input parameters) + python3 -m date_gap_filler.date_gap_filler_main + + #run gap filler for nonregularized data + Rscript ./flow.gap.fill.nonrglr.R \ + DirIn=/tmp/pfs/interim \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + "DirFill=data|flags" \ + WndwFill="015" \ + "DirSubCopy=location|uncertainty_coef" \ + "FileSchm=data:$FILE_SCHEMA_DATA|flags:$FILE_SCHEMA_CAL_FLAGS|flags:$FILE_SCHEMA_LOG_FLAGS" + + EOF + env: + # Environment variables for date gap filler + LOG_LEVEL: INFO + OUT_PATH: /tmp/pfs/interim + OUTPUT_DIRECTORIES: data,location,uncertainty_coef,flags + DATA_SOURCE_TYPE_INDEX: '3' + DATA_YEAR_INDEX: '4' + DATA_MONTH_INDEX: '5' + DATA_DAY_INDEX: '6' + DATA_LOCATION_INDEX: '7' + DATA_TYPE_INDEX: '8' + LOCATION_SOURCE_TYPE_INDEX: '3' + LOCATION_YEAR_INDEX: '4' + LOCATION_MONTH_INDEX: '5' + LOCATION_DAY_INDEX: '6' + LOCATION_INDEX: '7' + EMPTY_FILE_TYPE_INDEX: '4' + LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined modules. + # Environment variables for regularizer + PARALLELIZATION_INTERNAL: '3' # Parallelization within R. If increased, adjust resource requests appropriately. +input: + cross: + - pfs: + name: EMPTY_FILE_PATH + repo: sunav2_empty_files + glob: /sunav2 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - pfs: + name: FILE_SCHEMA_DATA + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_logfilled.avsc + - pfs: + name: FILE_SCHEMA_CAL_FLAGS + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_calibration_flags.avsc + - pfs: + name: FILE_SCHEMA_LOG_FLAGS + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_log_flags.avsc + - group: + - pfs: + name: DATA_PATH + repo: sunav2_location_group_and_restructure + glob: /(*/*/*/*) + group_by: $1 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - join: + - pfs: + name: LOCATION_PATH + repo: sunav2_location_active_dates_assignment + glob: /(*/*/*/*) + joinOn: $1 + group_by: $1 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - pfs: + name: DATE_LIMITER_PATH + repo: sunav2_cron_daily_and_date_control + glob: /(*/*/*/*) + joinOn: $1 + group_by: $1 + empty_files: true # This can remain true even if LINK_TYPE=COPY + +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 2G + cpu: 3.3 +resource_limits: + memory: 3G + cpu: 4.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml new file mode 100644 index 000000000..272f9cf96 --- /dev/null +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -0,0 +1,188 @@ +--- +pipeline: + name: sunav2_fill_log_files +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-4fa99b1 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + # + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf /tmp/kafka_merged + rm -rf $OUT_PATH_LIMIT_LOGFILES + rm -rf $OUT_PATH_JOIN_SOURCES + mkdir -p /tmp/kafka_merged # Filter joiner relies on the same path positions among inputs (i.e. repo name in 2nd position) + mkdir -p $OUT_PATH_LIMIT_LOGFILES # Filter joiner relies on the same path positions among inputs (i.e. repo name in 2nd position) + mkdir -p $OUT_PATH_JOIN_SOURCES # R modules must have pfs in the repo structure + # + # Check if there is any data (could just be the DATE_CONTROL, in which case we'll skip) + data="F" + if [ ${DATA_PATH_KAFKA+x} ]; then + data="T" + fi + if [ ${DATA_PATH_TRINO+x} ]; then + data="T" + fi + if [ ${DATA_PATH_LOG+x} ]; then + data="T" + fi + if [ $data = "F" ]; then + echo "No actual data in datum. Skipping..." + exit 0 + fi + # + # Get source type + path_glob="/pfs/DATA_PATH_*/*/" + for path in $path_glob; do + # Parse the path + [[ "$path" =~ ^/pfs/DATA_PATH_(.*)/(.*)/$ ]] + source_type="${BASH_REMATCH[2]}" + done + # + # If we have log files, limit them to the dates in the date_control pipeline + echo "Running filter-joiner to limit log files" + export CONFIG=$CONFIG_LIMIT_LOGFILES + export OUT_PATH=$OUT_PATH_LIMIT_LOGFILES + python3 -m filter_joiner.filter_joiner_main + # + # If data come from Kafka, run the Kafka-merger (could be multiple files) + if [ ${DATA_PATH_KAFKA+x} ]; then + # Data from kafka. + # Run kafka combiner + Rscript ./flow.kfka.comb.R \ + DirIn=$DATA_PATH_KAFKA \ + DirOut=/tmp/kafka_merged \ + DirErr=/pfs/out/errored_datums + fi + # Run the filter joiner to merge files from all sources. + echo "Running filter-joiner to merge all data sources" + export CONFIG=$CONFIG_JOIN_SOURCES + export OUT_PATH=$OUT_PATH_JOIN_SOURCES + python3 -m filter_joiner.filter_joiner_main + # + # Run log filler script + Rscript ./flow.sunav2.logfiles.fill.R \ + DirIn=$OUT_PATH_JOIN_SOURCES \ + DirOut=/pfs/out \ + FileSchmData=$FILE_SCHEMA_DATA \ + DirErr=/pfs/out/errored_datums + EOF + env: + # Environment variables for filter-joiner. + # Ensure the path for the kafka data is listed prior to that for the archive data. When a conflict arises, + # such as when Kafka re-streams data, the Kafka data will take precedence because it is + # the latest and greatest. + CONFIG_LIMIT_LOGFILES: | + --- + # Configuration for filter-joiner module that will limit log files to the dates in + # the date control pipeline + # Make sure the DATE_CONTROL path is second. We actually don't want these files and + # they won't be copied if log files for the site are present + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. + input_paths: + - path: + name: DATA_PATH_LOG + # Filter for data directory + glob_pattern: /pfs/DATA_PATH_LOG/*/*/*/** + # Join on y/m/d and sourceID + join_indices: [3,4,5,6] + outer_join: False + - path: + name: DATE_CONTROL + # Filter for data directory + glob_pattern: /pfs/DATE_CONTROL/*/*/*/** + # Join on y/m/d and sourceID + join_indices: [3,4,5,6] + outer_join: False + CONFIG_JOIN_SOURCES: | + --- + # Configuration for filter-joiner module that will bring together all sources of data + # Make sure the DATA_PATH_LOG path is second. Any site files from the date_control pipeline + # won't be copied if there are files from the archive, kafka, or the log files. 
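+      # Example (hypothetical path): for /pfs/DATA_PATH_TRINO/sunav2/2019/01/01/<file>,
+      # join_indices [3,4,5,6] below select sunav2/2019/01/01, i.e. the source type and
+      # calendar day, and the first input_paths entry that matches a given file takes precedence.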
+ # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. + input_paths: + # - path: + # name: DATA_PATH_KAFKA + # # Filter for data directory + # glob_pattern: /tmp/kafka_merged/*/*/*/*/*/** + # # Join on named location (already joined below by day) + # join_indices: [3,4,5,6] + # outer_join: true + - path: + name: DATA_PATH_TRINO + # Filter for data directory + glob_pattern: /pfs/DATA_PATH_TRINO/*/*/*/** + # Join on y/m/d and sourceID + join_indices: [3,4,5,6] + outer_join: True + - path: + name: DATA_PATH_LOG + # Filter for data directory + glob_pattern: /tmp/log_limited/*/*/*/** + # Join on y/m/d and sourceID + join_indices: [3,4,5,6] + outer_join: True + OUT_PATH_LIMIT_LOGFILES: /tmp/log_limited + OUT_PATH_JOIN_SOURCES: /tmp/pfs/filter_joined # Note that R modules use "pfs" in the path structure to determine datums + LOG_LEVEL: DEBUG + RELATIVE_PATH_INDEX: "3" # Must be consistent across inputs + LINK_TYPE: COPY # options are COPY or SYMLINK. MUST BE SIMLINK IF USING COMBINED MODULE. +input: + cross: + - pfs: + name: FILE_SCHEMA_DATA + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_logfilled.avsc + - join: + - pfs: + name: DATA_PATH_TRINO + repo: sunav2_trino_data_parser + glob: /(sunav2/*/*/*) #sunav2/Y/M/D + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + outer_join: true + # - pfs: + # name: DATA_PATH_KAFKA + # repo: sunav2_data_source_kafka + # glob: /(sunav2/*/*/*) + # joinOn: $1 + # empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + # outer_join: true + - pfs: + name: DATA_PATH_LOG + repo: sunav2_logjam_assign_clean_files + glob: /(sunav2/*/*/*) #sunav2/Y/M/D + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + outer_join: true + - pfs: + name: DATE_CONTROL + repo: sunav2_cron_daily_and_date_control + glob: /(sunav2/*/*/*) #sunav2/Y/M/D + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + outer_join: true +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 1G + cpu: 1.5 +resource_limits: + memory: 2G + cpu: 2 +sidecar_resource_requests: + memory: 2G + cpu: 0.3 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_location_active_dates_assignment.json b/pipe/sunav2/sunav2_location_active_dates_assignment.json deleted file mode 100644 index 30012ffca..000000000 --- a/pipe/sunav2/sunav2_location_active_dates_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_location_active_dates_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.grp.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "TypeFile=namedLocation" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.3.0", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "location_loader", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "200M", - "cpu": 1 - }, - "parallelism_spec": { - "constant": "8" - } -} diff --git a/pipe/sunav2/sunav2_location_active_dates_assignment.yaml 
b/pipe/sunav2/sunav2_location_active_dates_assignment.yaml new file mode 100644 index 000000000..d685e6dab --- /dev/null +++ b/pipe/sunav2/sunav2_location_active_dates_assignment.yaml @@ -0,0 +1,48 @@ +--- +pipeline: + name: sunav2_location_active_dates_assignment +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - export ERR_PATH="/pfs/out/errored_datums$FILE_YEAR" + - Rscript + ./flow.loc.grp.asgn.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + FileYear=$FILE_YEAR + TypeFile=namedLocation + "Prop=HOR|VER|name|description|site|Data Rate|active_periods" + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.3.0 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + env: + LOG_LEVEL: INFO +input: + cross: + - pfs: + name: DIR_IN + repo: sunav2_location_loader + glob: /sunav2/* + - pfs: + name: FILE_YEAR + repo: sunav2_cron_daily_and_date_control + glob: /data_year*.txt +parallelism_spec: + constant: 2 +autoscaling: true +resource_requests: + memory: 210M + cpu: 1.2 +resource_limits: + memory: 500M + cpu: 1.6 +sidecar_resource_requests: + memory: 2G + cpu: 0.3 +datum_set_spec: + number: 5 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_location_asset.yaml b/pipe/sunav2/sunav2_location_asset.yaml new file mode 100644 index 000000000..c44f94456 --- /dev/null +++ b/pipe/sunav2/sunav2_location_asset.yaml @@ -0,0 +1,54 @@ +--- +pipeline: + name: sunav2_location_asset +transform: + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-asset-loader:v1.1.0 + + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf $OUT_PATH + mkdir -p $OUT_PATH # R modules must have pfs in the repo structure + + python3 -m location_asset_loader.location_asset_loader_main + + cp -r $OUT_PATH/$SOURCE_TYPE /pfs/out/$SOURCE_TYPE_OUT + + EOF + env: + OUT_PATH: /tmp/out + # ERR_PATH can be changed, it is user specified + ERR_PATH: /pfs/out/errored_datums + LOG_LEVEL: INFO + SOURCE_TYPE: sunav2_raw + SOURCE_TYPE_OUT: sunav2 + secrets: + - name: pdr-secret + mount_path: /var/db_secret +input: + pfs: + repo: sunav2_cron_daily_and_date_control_tick + glob: /* + empty_files: true +autoscaling: true +resource_requests: + memory: 100M + cpu: 0.15 +resource_limits: + memory: 300M + cpu: 0.5 +sidecar_resource_requests: + memory: 250M + cpu: 0.3 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_location_asset_assignment.json b/pipe/sunav2/sunav2_location_asset_assignment.json deleted file mode 100644 index 63121cab2..000000000 --- a/pipe/sunav2/sunav2_location_asset_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_location_asset_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.grp.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "TypeFile=asset" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.3.0", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "location_asset", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "210M", - "cpu": 0.3 - }, - "parallelism_spec": { - "constant": "8" - } -} diff --git a/pipe/sunav2/sunav2_location_asset_assignment.yaml b/pipe/sunav2/sunav2_location_asset_assignment.yaml new file mode 100644 index 000000000..209ebf699 --- /dev/null +++ b/pipe/sunav2/sunav2_location_asset_assignment.yaml @@ -0,0 +1,48 @@ +--- +pipeline: + name: sunav2_location_asset_assignment +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - export ERR_PATH="/pfs/out/errored_datums$FILE_YEAR" + - Rscript + ./flow.loc.grp.asgn.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + FileYear=$FILE_YEAR + TypeFile=asset + "Prop=HOR|VER|install_date|remove_date|name|site|Data Rate" + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.3.0 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + env: + LOG_LEVEL: INFO +input: + cross: + - pfs: + name: DIR_IN + repo: sunav2_location_asset + glob: /sunav2/* + - pfs: + name: FILE_YEAR + repo: sunav2_cron_daily_and_date_control + glob: /data_year*.txt +parallelism_spec: + constant: 2 +autoscaling: true +resource_requests: + memory: 400M + cpu: 1.5 +resource_limits: + memory: 800M + cpu: 2 +sidecar_resource_requests: + memory: 2G + cpu: 0.3 +datum_set_spec: + number: 5 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_location_group_and_restructure.yaml b/pipe/sunav2/sunav2_location_group_and_restructure.yaml new file mode 100644 index 000000000..ab889695d --- /dev/null +++ 
b/pipe/sunav2/sunav2_location_group_and_restructure.yaml @@ -0,0 +1,97 @@ +--- +pipeline: + name: sunav2_location_group_and_restructure +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-strc-comb:v1.2.1 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf /tmp/pfs/filter_joined + rm -rf /tmp/pfs/structured + rm -rf /tmp/pfs/structuredCopy + mkdir -p /tmp/pfs/filter_joined + # Run first module - filter-joiner (using environment variables below as input parameters) + python3 -m filter_joiner.filter_joiner_main + # Run second module - structure repo by location + Rscript ./flow.loc.repo.strc.R \ + DirIn=/tmp/pfs/filter_joined \ + DirOut=/tmp/pfs/structured \ + DirErr=/pfs/out/errored_datums \ + Comb=TRUE + # Copy output to another interim folder to destroy links (cannot daisy chain links from pfs input to output) + cp -rL /tmp/pfs/structured /tmp/pfs/structuredCopy || : # Allow to fail without exit code (happens if step above produced no output) + rm -rf /tmp/pfs/filter_joined + rm -rf /tmp/pfs/structured + # Run third module - merge data by location + Rscript ./flow.loc.data.trnc.comb.R \ + DirIn=/tmp/pfs/structuredCopy \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + "DirSubCombData=data|flags" \ + DirSubCombUcrt=uncertainty_coef \ + DirSubCopy=location + EOF + env: + # Environment variables for filter-joiner + CONFIG: | + --- + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. + input_paths: + - path: + name: DATA_PATH + # Filter for data directory + glob_pattern: /pfs/DATA_PATH/sunav2/*/*/*/*/** + # Join on named location (already joined below by day) + join_indices: [7] + outer_join: true + - path: + name: LOCATION_PATH + # Filter for data directory + glob_pattern: /pfs/LOCATION_PATH/sunav2/*/*/*/*/** + # Join on named location (already joined below by day) + join_indices: [7] + OUT_PATH: /tmp/pfs/filter_joined + LOG_LEVEL: INFO + RELATIVE_PATH_INDEX: "3" + LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined module. + # Environment variables for R modules + PARALLELIZATION_INTERNAL: '3' +input: + join: + - pfs: + name: DATA_PATH + repo: sunav2_calibration_group_and_convert + glob: /sunav2/(*)/(*)/(*) + joinOn: $1/$2/$3 + outer_join: true + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - pfs: + name: LOCATION_PATH + repo: sunav2_location_asset_assignment + glob: /sunav2/(*)/(*)/(*) + joinOn: $1/$2/$3 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
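+    # Note on the join above (assuming the /sunav2/yyyy/mm/dd repo layout): the glob
+    # /sunav2/(*)/(*)/(*) captures year, month, and day, so joinOn: $1/$2/$3 pairs each
+    # day of calibrated data with the location assignments for the same day, and
+    # outer_join: true on DATA_PATH keeps days that have data but no location match.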
+parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 2.2G + cpu: 3.3 +resource_limits: + memory: 4G + cpu: 4.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_location_loader.yaml b/pipe/sunav2/sunav2_location_loader.yaml new file mode 100644 index 000000000..1f820410a --- /dev/null +++ b/pipe/sunav2/sunav2_location_loader.yaml @@ -0,0 +1,53 @@ +--- +pipeline: + name: sunav2_location_loader +transform: + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-loader:v1.0.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf $OUT_PATH + mkdir -p $OUT_PATH # R modules must have pfs in the repo structure + + python3 -m location_loader.location_loader_main #run the location loader + + cp -r $OUT_PATH/$SOURCE_TYPE /pfs/out/$SOURCE_TYPE_OUT + + EOF + env: + LOCATION_TYPE: CONFIG + SOURCE_TYPE: sunav2_raw + SOURCE_TYPE_OUT: sunav2 + OUT_PATH: /tmp/out + LOG_LEVEL: INFO + ERR_PATH: /pfs/out/errored_datums + secrets: + - name: pdr-secret + mount_path: /var/db_secret +input: + pfs: + repo: sunav2_cron_daily_and_date_control_tick + glob: /* + empty_files: true +autoscaling: true +resource_requests: + memory: 100M + cpu: 0.1 +resource_limits: + memory: 300M + cpu: 0.5 +sidecar_resource_requests: + memory: 300M + cpu: 0.3 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml new file mode 100644 index 000000000..e382a87d3 --- /dev/null +++ b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml @@ -0,0 +1,47 @@ +--- +pipeline: + name: sunav2_logjam_assign_clean_files +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - Rscript + ./flow.sunav2.logfiles.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + FileSchmData=$FILE_SCHEMA_DATA + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:sha-538bf66 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + env: + LOG_LEVEL: DEBUG + ERR_PATH: /pfs/out/errored_datums +input: + cross: + - pfs: + name: FILE_SCHEMA_DATA + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_logfilled.avsc + - join: + - pfs: + name: DIR_IN + repo: sunav2_logjam_load_files + glob: /* +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 1G + cpu: 1.5 +resource_limits: + memory: 1.5G + cpu: 2 +sidecar_resource_requests: + memory: 3G + cpu: 1 +datum_set_spec: + number: 5 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_logjam_list_files.yaml b/pipe/sunav2/sunav2_logjam_list_files.yaml new file mode 100644 index 000000000..68db8f5cf --- /dev/null +++ b/pipe/sunav2/sunav2_logjam_list_files.yaml @@ -0,0 +1,32 @@ +--- +pipeline: + name: sunav2_logjam_list_files +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-logjam-loader:v2.0.0 + cmd: ["/bin/bash"] + env: + LOGJAM_INGEST_BUCKET: neon-is-logjam-ingest + OUT_PATH: /pfs/out + LOG_LEVEL: DEBUG + stdin: 
+ - "#!/bin/bash" + - python3 --version + - python3 -m logjam_loader.logjam_loader +input: + pfs: + repo: nitrate_cron_monthly_and_pub_control_tick + glob: /(*-*)-* + empty_files: true +autoscaling: true +resource_requests: + memory: 1G + cpu: 1 +resource_limits: + memory: 1.5G + cpu: 1.5 +sidecar_resource_requests: + memory: 1G + cpu: 0.4 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_logjam_load_files.yaml b/pipe/sunav2/sunav2_logjam_load_files.yaml new file mode 100644 index 000000000..ffd1e04bf --- /dev/null +++ b/pipe/sunav2/sunav2_logjam_load_files.yaml @@ -0,0 +1,45 @@ +--- +pipeline: + name: sunav2_logjam_load_files +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-logjam-loader:v2.0.0 + cmd: + - /bin/bash + stdin: + - '#!/bin/bash' + - python3 --version + - python3 -c "import environs; print(environs.__version__)" + - python3 -c "import marshmallow; print(marshmallow.__version__)" + - python3 -m logjam_loader.load_all_logjam_files + + env: + LOGJAM_INGEST_BUCKET: neon-is-logjam-ingest + OUT_PATH: /pfs/out + LOG_LEVEL: DEBUG + STARTING_PATH_INDEX: "7" + secrets: + - name: pdr-secret + mount_path: /var/db_secret +input: + pfs: + name: IN_PATH + repo: sunav2_logjam_list_files + glob: /*/*/*/logjam_prod/sunav2/ + empty_files: true +parallelism_spec: + constant: 10 +autoscaling: true +resource_requests: + memory: 500M + cpu: 0.5 +resource_limits: + memory: 1G + cpu: 1.5 +sidecar_resource_requests: + memory: 2G + cpu: 0.2 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_merge_data_by_location.json b/pipe/sunav2/sunav2_merge_data_by_location.json deleted file mode 100644 index bfdfea21f..000000000 --- a/pipe/sunav2/sunav2_merge_data_by_location.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_merge_data_by_location" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.data.trnc.comb.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "DirSubCombData=data", - "DirSubCopy=location|calibration" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-data-trnc-comb:v1.1.1", - "env": { - "LOG_LEVEL": "INFO", - "PARALLELIZATION_INTERNAL": "1" - } - }, - "input": { - "pfs": { - "name": "DIR_IN", - "repo": "sunav2_structure_repo_by_location", - "glob": "/sunav2/*/*/*" - } - }, - "enable_stats": false, - "standby": false, - "resource_requests": { - "memory": "80M", - "cpu": 0.3 - } -} diff --git a/pipe/sunav2/sunav2_structure_repo_by_location.json b/pipe/sunav2/sunav2_structure_repo_by_location.json deleted file mode 100644 index edb59b29c..000000000 --- a/pipe/sunav2/sunav2_structure_repo_by_location.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_structure_repo_by_location" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.repo.strc.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "Comb=TRUE" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-repo-strc:v1.0.7", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "pfs": { - "name": "DIR_IN", - "repo": "sunav2_calibrated_location_group", - "glob": "/sunav2/*/*/*" - } - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "100M", - "cpu": 0.13 - } -} diff --git a/pipe/sunav2/sunav2_trino_data_parser.yaml 
b/pipe/sunav2/sunav2_trino_data_parser.yaml new file mode 100644 index 000000000..d15be2042 --- /dev/null +++ b/pipe/sunav2/sunav2_trino_data_parser.yaml @@ -0,0 +1,98 @@ +pipeline: + name: sunav2_trino_data_parser +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.13.1 + env: + # if use default PARSED_START_INDEX and PARSED_END_INDEX, parse all elements in parse_field + # if use default for FIELD_START_INDEX and FIELD_END_INDEX, + # skip first 3 fields (source_id, site_id, readout_time) in parsed schema + LOG_LEVEL: DEBUG + OUT_PATH: /pfs/out + PARSE_FIELD: serial_output + RELATIVE_PATH_INDEX: "4" + PARSED_SCHEMA_PATH: /usr/src/app/parsed-schemas/sunav2/sunav2_parsed.avsc + SOURCE_TYPE: 'sunav2_raw' + secrets: + - name: l0-bucket + env_var: BUCKET_NAME + key: LO_BUCKET + - name: pdr-secret + env_var: PDR_HOST + key: hostname + - name: pdr-secret + env_var: PDR_DBNAME + key: database + - name: pdr-secret + env_var: PDR_USER + key: username + - name: pdr-secret + env_var: PDR_PASSWORD + key: password + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # run data parser + python3 -m raw_data_parsers.raw_data_parser.suna_data_parser_main + + # Upload L0 files to bucket, compacting with any existing file with the same name + # when SOURCE_TYPE is sunav2_raw, OUT_SOURCE_TYPE is sunav2 + OUT_SOURCE_TYPE=${SOURCE_TYPE%%_raw} + if [[ -d "$OUT_PATH/$OUT_SOURCE_TYPE" ]]; then + linkdir=$(mktemp -d) + shopt -s globstar + out_parquet_glob="${OUT_PATH}/**/*.parquet" + # /pfs/out/sunav2/2023/01/01/12345/data/file.parquet + echo "Linking output files to ${linkdir}" + # set -x # Uncomment for debugging + for f in $out_parquet_glob; do + # Parse the path + [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/data/(.*)$ ]] + fsourcetype="${BASH_REMATCH[1]}" + fyear="${BASH_REMATCH[2]}" + fmonth="${BASH_REMATCH[3]}" + fday="${BASH_REMATCH[4]}" + fsourceid="${BASH_REMATCH[5]}" + fname="${BASH_REMATCH[6]}" + outdir="${linkdir}/v2/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}" + mkdir -p "${outdir}" + ln -s "${f}" "${outdir}/${fname}" + + done + + # Upload to bucket, compacting with any existing file + ./compact-bucket-copy.py --sourcepath "${linkdir}" --destbucket "${BUCKET_NAME}" + + # set +x # Uncomment for debugging + rm -rf $linkdir + fi + + EOF +input: + pfs: + name: DATA_PATH + repo: sunav2_data_source_trino + glob: /sunav2_raw/*/*/* +parallelism_spec: + constant: 3 +autoscaling: true +resource_requests: + memory: 1.5G + cpu: 0.5 +resource_limits: + memory: 3G + cpu: 1.5 +sidecar_resource_requests: + memory: 2G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/utilities/vignettes/stand_up_product_dag_example.sh b/utilities/vignettes/stand_up_product_dag_example.sh index 43f21b6f9..1c8301954 100644 --- a/utilities/vignettes/stand_up_product_dag_example.sh +++ b/utilities/vignettes/stand_up_product_dag_example.sh @@ -9,12 +9,12 @@ # Define paths data_path='/scratch/pfs' # Where base repos like avro_schemas, empty_files, etc. 
are stored -git_path_pipelines='/home/NEON/ncatolico/NEON-IS-data-processing-homeDir/pipe' -git_path_avro='/home/NEON/ncatolico/NEON-IS-avro-schemas' -git_path_avro_l0='/home/NEON/ncatolico/neon-avro-schemas' +git_path_pipelines='/home/NEON/ncatolico/R/NEON-IS-data-processing/pipe' +git_path_avro='/home/NEON/ncatolico/R/NEON-IS-avro-schemas' +#git_path_avro_l0='/home/NEON/ncatolico/neon-avro-schemas' pipe_list_prefix='pipe_list_' -source_type='tchain' -product='tempSpecificDepthLakes' +source_type='sunav2' +product='nitrate' # Define paths based on base paths and product information above spec_path_source_type=$git_path_pipelines/$source_type
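A minimal sketch of how these variables are typically used to stand up the DAG, assuming pachctl is installed and that a pipe list file named with pipe_list_prefix and the source type enumerates the pipeline spec filenames in creation order (that file name and its contents are assumptions, not shown in this diff):

# Create each pipeline listed for the source type, in order
while read -r pipe_spec; do
  pachctl create pipeline -f "$spec_path_source_type/$pipe_spec"
done < "$spec_path_source_type/${pipe_list_prefix}${source_type}"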