From 478c060df92ea0f39aa35bff4df3bd5b8435444e Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Wed, 9 Apr 2025 11:52:06 -0600 Subject: [PATCH 001/182] suna log file checker for pachyderm. --- flow/flow.suna.logfiles/wrap.suna.logfiles.R | 393 +++++++++++++++++++ 1 file changed, 393 insertions(+) create mode 100644 flow/flow.suna.logfiles/wrap.suna.logfiles.R diff --git a/flow/flow.suna.logfiles/wrap.suna.logfiles.R b/flow/flow.suna.logfiles/wrap.suna.logfiles.R new file mode 100644 index 000000000..a038ea562 --- /dev/null +++ b/flow/flow.suna.logfiles/wrap.suna.logfiles.R @@ -0,0 +1,393 @@ +############################################################################################## +#' @title Wrapper for SUNA Log File Processing + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} +#' Bobby Hensley \email{hensley@battelleecology.org} +#' +#' @description Wrapper function. Validates, cleans, and formats SUNA log files into daily parquets. +#' +#' @param FileIn Character value. The input path to the data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/source-id/file. +#' The source-id is the unique identifier of the sensor. \cr#' +#' +#' @param DirOut Character value. The output path that will replace the #/pfs/BASE_REPO portion of FileIn. +#' +#' @param SchmDataOut (optional), A json-formatted character string containing the schema for the output data +#' file. If this input is not provided, the output schema for the data will be the same as the input data +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. +#' +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. +#' +#' @return Cleaned SUNA log files in daily parquets. 
+#' +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 +#' +#' @keywords Currently none +#' +#' @examples +#' # Not run +#' FileIn <- "~/pfs/suna_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv" +#' log <- NEONprocIS.base::def.log.init(Lvl = "debug") +#' wrap.suna.logfiles <- function(FileIn = "~/pfs/suna_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv", +#' DirOut="~/pfs/out", +#' SchmDataOut=NULL, +#' log=log) +#' +#' @changelog +#' Nora Catolico (2024-01-09) original creation +#' Bobby Hensley (2025-04-09) adapted for SUNA +############################################################################################## +wrap.suna.logfiles <- function(FileIn, + DirOut, + SchmDataOut=NULL, + log=NULL +){ + + # Start logging if not already + if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + # --------- Load the data ---------- + # Load in the csv log file(s) + log_file <- + base::try(read.table(paste0(FileIn), header = FALSE, sep = ",", + col.names = paste0("V",seq_len(286)),encoding = 'utf-8', + stringsAsFactors = FALSE,fill = TRUE,strip.white = TRUE,na.strings=c(-1,''))) + if (base::any(base::class(log_file) == 'try-error')) { + # Generate error and stop execution + log$error(base::paste0('File ', FileIn, ' is unreadable. 
Likely not a data file.')) + base::stop() + } + if(any(grepl('TROLL',log_file))){ + log$debug(base::paste0('skipping troll file: ', FileIn)) + base::stop() + }else if(any(grepl('Turbidity',log_file))){ + log$debug(base::paste0('skipping sonde file: ', FileIn)) + base::stop() + }else{ + #find row where data actually starts + start<-which(grepl('Zeiss Coefficient',log_file$V2))+1 + #figure out column order and standardize headers (sometimes differs based on log settings/ version) + col1<-log_file$V1[start-1] + col2<-log_file$V2[start-1] + col3<-log_file$V3[start-1] + col3<-substr(col3,1,14) + col4<-log_file$V4[start-1] + col4<-substr(col4,1,14) + col5<-log_file$V5[start-1] + col5<-substr(col5,1,14) + col6<-log_file$V6[start-1] + col6<-substr(col6,1,14) + + if(start>0){ + log_data<-log_file[start:(length(log_file$V1)),1:6] + if(grepl('date', tolower(col1))){ + colnames(log_data)[1]<-'readout_time' + }else{ + log$error(base::paste0('File Error: No datetime column where expected in ', FileIn)) + } + if(grepl('seconds', tolower(col2))){ + colnames(log_data)[2]<-'seconds' + }else{ + log$error(base::paste0('File Error: No seconds column where expected in ', FileIn)) + } + if(grepl('pressure', tolower(col3))){ + colnames(log_data)[3]<-'pressure' + }else if(grepl('temp', tolower(col3))){ + colnames(log_data)[3]<-'temperature' + }else if(grepl('cond', tolower(col3))){ + colnames(log_data)[3]<-'conductivity' + }else if(grepl('depth', tolower(col3))){ + colnames(log_data)[3]<-'depth' + }else{ + log$error(base::paste0('File Error: No expected streams present in column 3 of ', FileIn)) + } + if(grepl('pressure', tolower(col4))){ + colnames(log_data)[4]<-'pressure' + }else if(grepl('temp', tolower(col4))){ + colnames(log_data)[4]<-'temperature' + }else if(grepl('cond', tolower(col4))){ + colnames(log_data)[4]<-'conductivity' + }else if(grepl('depth', tolower(col4))){ + colnames(log_data)[4]<-'depth' + }else{ + log$error(base::paste0('File Error: No expected streams present in 
column 4 of ', FileIn)) + } + if(!is.na(col5)){ + if(grepl('cond', tolower(col5))){ + colnames(log_data)[5]<-'conductivity' + }else if(grepl('pressure', tolower(col5))){ + colnames(log_data)[5]<-'pressure' + }else if(grepl('temp', tolower(col5))){ + colnames(log_data)[5]<-'temperature' + }else if(grepl('depth', tolower(col5))|grepl('elevation', tolower(col5))){ + colnames(log_data)[5]<-'depth' + }else{ + log$error(base::paste0('File Error: No expected streams present in column 5 of ', FileIn)) + } + } + if(!is.na(col6)){ + if(grepl('cond', tolower(col6))){ + colnames(log_data)[6]<-'conductivity' + }else if(grepl('pressure', tolower(col6))){ + colnames(log_data)[6]<-'pressure' + }else if(grepl('temp', tolower(col6))){ + colnames(log_data)[6]<-'temperature' + }else if(grepl('depth', tolower(col6))|grepl('elevation', tolower(col6))){ + colnames(log_data)[6]<-'depth' + }else{ + log$error(base::paste0('File Error: No expected streams present in column 5 of ', FileIn)) + } + } + log_data<-log_data[!is.na(log_data$readout_time),] + log_metadata<-log_file[1:start,] + }else{ + log$error(base::paste0('File Error: No data in ', FileIn)) + } + + #check timezone. lot's of different styles... 
+ if(any(grepl('Time Zone: ',log_metadata$V1))){ + timezone<-log_metadata$V1[grepl('Time Zone: ',log_metadata$V1)] + timezone<-gsub('Time Zone: ','',timezone) + }else if(any(grepl('Time Zone',log_metadata$V1))){ + timezone<-log_metadata$V2[grepl('Time Zone',log_metadata$V1)] + }else if(any(grepl('Time Zone',log_metadata$V2))){ + timezone<-log_metadata$V3[grepl('Time Zone',log_metadata$V2)] + }else{ + timezone<-'ERROR' + log$error(base::paste0('File Error: timezone not specified in ', FileIn)) + } + #then clean up TZ + #grep("Dateline", OlsonNames(), value=TRUE) + if(timezone=="Coordinated Universal Time"){ + timezone<-'UTC' + }else if(grepl('Eastern Standard Time',timezone)|grepl('Eastern Daylight Time',timezone)|grepl('Dateline',timezone)){ + timezone<-'EST' + }else if(grepl('Central Daylight Time',timezone)|grepl('Central Standard Time',timezone)){ + timezone<-'US/Central' + }else if(grepl('Pacific Daylight Time',timezone)|grepl('Pacific Standard Time',timezone)|grepl('UTC-08',timezone)){ + timezone<-'US/Pacific' + }else if(grepl('Mountain Daylight Time',timezone)|grepl('Mountain Standard Time',timezone)){ + timezone<-'US/Mountain' + }else if(grepl('Alaskan Daylight Time',timezone)|grepl('Alaskan Standard Time',timezone)|grepl('UTC-09',timezone)){ + timezone<-'US/Alaska' + }else if(grepl('SA Western Daylight Time',timezone)|grepl('SA Western Standard Time',timezone)){ + timezone<-'America/Puerto_Rico' + }else if(grepl('GMT',timezone)|grepl('Greenwich Standard Time',timezone)){ + timezone<-'GMT' + }else if(timezone=='Unknown'){ + if(any(grepl('UTC',log_metadata))){ + timezone<-'UTC' + } + } + + #clean up metadata + removeAfter<-which(log_metadata$V1=='Log Notes:') + if(length(removeAfter)>0){ + log_metadata <- as.data.frame(log_metadata[1:(removeAfter),]) + } + log_metadata$V1[is.na(log_metadata$V1)]<-log_metadata$V2[is.na(log_metadata$V1)] + log_metadata$V2[!is.na(log_metadata$V3)]<-log_metadata$V3[!is.na(log_metadata$V3)] + log_metadata<-log_metadata[,1:2] + 
colnames(log_metadata)<-c("label","value") + + #Metadata values + logName <- log_metadata$value[!is.na(log_metadata$label) & (log_metadata$label=="Log Name"|log_metadata$label=="File Name")][1] + Troll_SN <- log_metadata$value[!is.na(log_metadata$label) & log_metadata$label=="Serial Number"][1] + Asset <- log_metadata$value[!is.na(log_metadata$label) & log_metadata$label=="Device Name"][1] + #log$debug(base::paste0('metadata: ',logName,'_',Troll_SN,'_',Asset)) + if(length(Asset)<1 || Asset == " " || nchar(Asset) == 0){ + log$error(base::paste0('File Info: No asset specified in ', FileIn)) + stop() + } + #define Site + Site <- log_metadata$value[!is.na(log_metadata$label) & log_metadata$label=="Site"] + if(length(Site)<1){ + log$info(base::paste0('File Info: No site specified in ', FileIn)) + }else if(Site == 'Default Site'){ + Site <- NA + log$info(base::paste0('File Info: Default site specified in ', FileIn)) + }else if(length(Site)>1){ + log$info(base::paste0('File Info: More than one site specified in ', FileIn)) + }else if(nchar(Site)>4){ + Site <-substr(Site,5,8) + } + #fix for specific use case + if(grepl('Central America Standard Time',timezone) & !is.na(Site) & (Site == "MCDI"|Site == "KING")){ + timezone<-'US/Central' + } + Device <- log_metadata$value[!is.na(log_metadata$label) & log_metadata$label=="Device"][1] + if(!is.na(Device) & grepl('level',tolower(Device))){ + Device<-"Level TROLL 500" + Context <- "surfacewater" + sensor <- "leveltroll500" + keep<-c('readout_time','seconds','pressure','temperature') + log_data<-log_data[keep] + }else if(!is.na(Device) & grepl('aqua',tolower(Device))){ + Device<-"Aqua TROLL 200" + sensor <- "aquatroll200" + if(!is.na(logName) & (grepl("IN",logName)|grepl("Inlet",logName)|grepl("OT",logName)|grepl("Outlet",logName)|grepl("L1",logName)| + grepl("L2",logName)|grepl("Lit",logName)|grepl("S1",logName)|grepl("S2",logName))){ + Context <- "surfacewater" + }else if(!is.na(logName) & 
(grepl("GW",logName)|any(grepl("conductivity",tolower(colnames(log_data)))))){ + Context <- "groundwater" + }else{ + log$error(base::paste0('File Error: Context not specified in ', FileIn)) + } + }else{ + log$error(base::paste0('File Error: Device not specified in ', FileIn)) + } + + + ###check and update date format + #sometimes ymd others mdy, sometimes has / others -, some don't have seconds + #check if date contains seconds + if(length(base::gregexpr(':', log_data$readout_time[1])[[1]])==2){ #if 2 : then it has seconds + #Check if date begins with year (assume 4 digit year, which seems to always be true) + if(grepl('^\\d{4}', log_data$readout_time[1])){ + #ymd format + log_data$dateTime <- lubridate::ymd_hms(log_data$readout_time, tz = timezone) + }else{ + #assume mdy format + log_data$dateTime <- lubridate::mdy_hms(log_data$readout_time, tz = timezone) + } + }else if(length(base::gregexpr(':', log_data$readout_time[1])[[1]])==1){ + #doesn't have seconds + #Check if date begins with year (assuming 4 digit year) + if(grepl('^\\d{4}', log_data$readout_time[1])){ + #ymd format + log_data$dateTime <- lubridate::ymd_hm(log_data$readout_time, tz = timezone) + }else{ + #assume mdy format + log_data$dateTime <- lubridate::mdy_hm(log_data$readout_time, tz = timezone) + } + }else{ + log$error(base::paste0('File Error: Invalid date time format',log_data$readout_time[1],' in ', FileIn))#this shouldn't happen + } + log_data<-log_data[!is.na(log_data$dateTime),] + + #add date as UTC + log_data$dateUTC<-lubridate::with_tz(log_data$dateTime,'UTC') + + #check that dates are 2018 or later (some files have 1970 error) + log_data$logFlag<-1 + log_data$logDateErrorFlag<-0 + + if(any(log_data$dateUTC<"2018-01-01 00:00:00 UTC")){ + log$debug(base::paste0("Data contains dates prior to NEON logging implementation. 
Attempt will be made to align and flag data.")) + logDateError<-which(log_data$dateUTC<"2018-01-01 00:00:00 UTC") + if(logDateError[1]!=1){ #If there is a good date before the 1970 shift we can try to continue the data and add a flag + # sampling frequency + if(Context=='groundwater'){ + freq <- 300 #5 min in seconds + }else{ + freq <- 60 + } + idx_start<-logDateError[1] + idx_end<-logDateError[length(logDateError)] + time_last_read<-log_data$dateUTC[idx_start-1] + first_new_time<-time_last_read + freq + num_readings<-length(logDateError) + new_times <- seq(first_new_time, by = freq, length = num_readings) + log_data$dateUTC[idx_start:idx_end]<-new_times + log_data$logDateErrorFlag[idx_start:idx_end]<-1 + }else{ + #cannot use log data with bad dates + #log$debug(base::paste0("Log data contains erroneous dates that cannot be linked to the correct time.")) + log_data<-log_data[log_data$dateUTC>"2018-01-01 00:00:00 UTC",] + log$debug(base::paste0('File Error: ALL DATA 1970 in ', FileIn)) + } + } + if(nrow(log_data)>0){ + log_data$readout_time<-log_data$dateUTC + + #round to minute + if(Context=='surfacewater'){ + log_data$readout_time<-lubridate::round_date(log_data$dateUTC,unit = "minute") + }else if(Context=='groundwater'){ + log_data$readout_time<-lubridate::round_date(log_data$dateUTC,unit = "5 minutes") + } + + log_data$day<-lubridate::floor_date(log_data$dateUTC,"days") + + log_data$source_id<-Asset + + #format output file + #create any missing columns in log file + if(!'pressure' %in% names(log_data)){log_data$pressure<-NA} + if(!'temperature' %in% names(log_data)){log_data$temperature<-NA} + if(sensor=='aquatroll200'){ + if(!'conductivity' %in% names(log_data)){log_data$conductivity<-NA} + out_columns <- c('source_id','readout_time','pressure','temperature','conductivity','logFlag','logDateErrorFlag','day') + }else if(sensor=='leveltroll500'){ + out_columns <- c('source_id','readout_time','pressure','temperature','logFlag','logDateErrorFlag','day') + } + 
out<-log_data[out_columns] + + first_reading<-log_data$dateUTC[1] + if(length(log_data$dateUTC)>0){ + last_reading<-log_data$dateUTC[length(log_data$dateUTC)] + }else{ + last_reading<-NA + } + + ###subset into 1-day data files + all_days<-split(out, as.Date(out$day)) + #output daily files + if(length(all_days)>0){ + for(j in 1:length(all_days)){ + #create DF + out_file <- as.data.frame(all_days[j]) + colnames(out_file) <- out_columns + year <- substr(out_file$day[1],1,4) + month <- substr(out_file$day[1],6,7) + day <- substr(out_file$day[1],9,10) + if(sensor=='aquatroll200'){ + out_file <- out_file[,c('source_id','readout_time','pressure','temperature','conductivity','logFlag','logDateErrorFlag')] + }else if(sensor=='leveltroll500'){ + out_file <- out_file[,c('source_id','readout_time','pressure','temperature','logFlag','logDateErrorFlag')] + } + #create output directory + DirOutLogFile <- paste0(DirOut,'/',sensor,'/',year,'/',month,'/',day,'/',Asset,'/data/') + base::dir.create(DirOutLogFile,recursive=TRUE) + csv_name <-paste0(sensor,'_',Asset,'_',year,'-',month,'-',day,'_log') + + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = out_file, + NameFile = base::paste0(DirOutLogFile,csv_name,".parquet"), + Schm = SchmDataOut),silent=TRUE) + if(class(rptOut)[1] == 'try-error'){ + log$error(base::paste0('Cannot write Data to ',base::paste0(DirOutLogFile,csv_name,".parquet"),'. 
',attr(rptOut, "condition"))) + stop() + } else { + log$info(base::paste0('Data written successfully in ', base::paste0(DirOutLogFile,csv_name,".parquet"))) + } + }#end of days loop + }else{ + log$error(base::paste0('No days can be written out for ', FileIn)) + } + } + } +} #end of file + + + + + + + + + + + + + + + + + From 139ec5001b10c194ee9f8ecd25004eb53f8c7033 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Wed, 9 Apr 2025 12:45:05 -0600 Subject: [PATCH 002/182] Additional work on suna log file checker --- flow/flow.suna.logfiles/wrap.suna.logfiles.R | 103 +++++-------------- 1 file changed, 24 insertions(+), 79 deletions(-) diff --git a/flow/flow.suna.logfiles/wrap.suna.logfiles.R b/flow/flow.suna.logfiles/wrap.suna.logfiles.R index a038ea562..5e598c709 100644 --- a/flow/flow.suna.logfiles/wrap.suna.logfiles.R +++ b/flow/flow.suna.logfiles/wrap.suna.logfiles.R @@ -70,86 +70,31 @@ wrap.suna.logfiles <- function(FileIn, }else if(any(grepl('Turbidity',log_file))){ log$debug(base::paste0('skipping sonde file: ', FileIn)) base::stop() - }else{ - #find row where data actually starts - start<-which(grepl('Zeiss Coefficient',log_file$V2))+1 - #figure out column order and standardize headers (sometimes differs based on log settings/ version) - col1<-log_file$V1[start-1] - col2<-log_file$V2[start-1] - col3<-log_file$V3[start-1] - col3<-substr(col3,1,14) - col4<-log_file$V4[start-1] - col4<-substr(col4,1,14) - col5<-log_file$V5[start-1] - col5<-substr(col5,1,14) - col6<-log_file$V6[start-1] - col6<-substr(col6,1,14) + } + # Find row where data actually starts + start<-which(grepl('Zeiss Coefficient',log_file$V2))+1 + # Separate data and metadata + log_data<-log_file[start:(length(log_file)),] + log_metadata<-log_file[1:(start-1),2:6] + + + + + + + + + + + + + + - if(start>0){ - log_data<-log_file[start:(length(log_file$V1)),1:6] - if(grepl('date', tolower(col1))){ - colnames(log_data)[1]<-'readout_time' - }else{ - log$error(base::paste0('File Error: No datetime 
column where expected in ', FileIn)) - } - if(grepl('seconds', tolower(col2))){ - colnames(log_data)[2]<-'seconds' - }else{ - log$error(base::paste0('File Error: No seconds column where expected in ', FileIn)) - } - if(grepl('pressure', tolower(col3))){ - colnames(log_data)[3]<-'pressure' - }else if(grepl('temp', tolower(col3))){ - colnames(log_data)[3]<-'temperature' - }else if(grepl('cond', tolower(col3))){ - colnames(log_data)[3]<-'conductivity' - }else if(grepl('depth', tolower(col3))){ - colnames(log_data)[3]<-'depth' - }else{ - log$error(base::paste0('File Error: No expected streams present in column 3 of ', FileIn)) - } - if(grepl('pressure', tolower(col4))){ - colnames(log_data)[4]<-'pressure' - }else if(grepl('temp', tolower(col4))){ - colnames(log_data)[4]<-'temperature' - }else if(grepl('cond', tolower(col4))){ - colnames(log_data)[4]<-'conductivity' - }else if(grepl('depth', tolower(col4))){ - colnames(log_data)[4]<-'depth' - }else{ - log$error(base::paste0('File Error: No expected streams present in column 4 of ', FileIn)) - } - if(!is.na(col5)){ - if(grepl('cond', tolower(col5))){ - colnames(log_data)[5]<-'conductivity' - }else if(grepl('pressure', tolower(col5))){ - colnames(log_data)[5]<-'pressure' - }else if(grepl('temp', tolower(col5))){ - colnames(log_data)[5]<-'temperature' - }else if(grepl('depth', tolower(col5))|grepl('elevation', tolower(col5))){ - colnames(log_data)[5]<-'depth' - }else{ - log$error(base::paste0('File Error: No expected streams present in column 5 of ', FileIn)) - } - } - if(!is.na(col6)){ - if(grepl('cond', tolower(col6))){ - colnames(log_data)[6]<-'conductivity' - }else if(grepl('pressure', tolower(col6))){ - colnames(log_data)[6]<-'pressure' - }else if(grepl('temp', tolower(col6))){ - colnames(log_data)[6]<-'temperature' - }else if(grepl('depth', tolower(col6))|grepl('elevation', tolower(col6))){ - colnames(log_data)[6]<-'depth' - }else{ - log$error(base::paste0('File Error: No expected streams present in column 5 of ', 
FileIn)) - } - } - log_data<-log_data[!is.na(log_data$readout_time),] - log_metadata<-log_file[1:start,] - }else{ - log$error(base::paste0('File Error: No data in ', FileIn)) - } + + + + #check timezone. lot's of different styles... if(any(grepl('Time Zone: ',log_metadata$V1))){ From 4b9bb3cb448affdf4aa6d1499d6a30c8306bb6cc Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 9 Apr 2025 14:50:33 -0600 Subject: [PATCH 003/182] suna pipeline structure --- pipe/suna/pipe_list_suna.txt | 16 ++ pipe/suna/site-list.json | 5 + pipe/suna/suna_calibration_assignment.yaml | 60 ++++++ .../suna_calibration_group_and_convert.yaml | 170 +++++++++++++++++ pipe/suna/suna_calibration_list_files.yaml | 44 +++++ pipe/suna/suna_calibration_loader.yaml | 58 ++++++ .../suna_cron_daily_and_date_control.yaml | 57 ++++++ pipe/suna/suna_data_source_gcs.yaml | 90 +++++++++ pipe/suna/suna_data_source_kafka.yaml | 177 ++++++++++++++++++ .../suna_fill_date_gaps_and_regularize.yaml | 117 ++++++++++++ ...suna_location_active_dates_assignment.yaml | 62 ++++++ pipe/suna/suna_location_asset.yaml | 53 ++++++ pipe/suna/suna_location_asset_assignment.yaml | 62 ++++++ .../suna_location_group_and_restructure.yaml | 111 +++++++++++ pipe/suna/suna_location_loader.yaml | 54 ++++++ 15 files changed, 1136 insertions(+) create mode 100644 pipe/suna/pipe_list_suna.txt create mode 100644 pipe/suna/site-list.json create mode 100644 pipe/suna/suna_calibration_assignment.yaml create mode 100644 pipe/suna/suna_calibration_group_and_convert.yaml create mode 100644 pipe/suna/suna_calibration_list_files.yaml create mode 100644 pipe/suna/suna_calibration_loader.yaml create mode 100644 pipe/suna/suna_cron_daily_and_date_control.yaml create mode 100644 pipe/suna/suna_data_source_gcs.yaml create mode 100644 pipe/suna/suna_data_source_kafka.yaml create mode 100644 pipe/suna/suna_fill_date_gaps_and_regularize.yaml create mode 100644 pipe/suna/suna_location_active_dates_assignment.yaml create mode 100644 
pipe/suna/suna_location_asset.yaml create mode 100644 pipe/suna/suna_location_asset_assignment.yaml create mode 100644 pipe/suna/suna_location_group_and_restructure.yaml create mode 100644 pipe/suna/suna_location_loader.yaml diff --git a/pipe/suna/pipe_list_suna.txt b/pipe/suna/pipe_list_suna.txt new file mode 100644 index 000000000..c47513d44 --- /dev/null +++ b/pipe/suna/pipe_list_suna.txt @@ -0,0 +1,16 @@ +suna_cron_daily_and_date_control.yaml +suna_logjam_list_files.yaml +suna_logjam_load_files.yaml +suna_logjam_assign_clean_files.yaml +suna_data_source_kafka.yaml +suna_data_source_gcs.yaml +suna_calibration_list_files.yaml +suna_calibration_loader.yaml +suna_location_asset.yaml +suna_location_loader.yaml +suna_calibration_assignment.yaml +suna_location_asset_assignment.yaml +suna_location_active_dates_assignment.yaml +suna_calibration_group_and_convert.yaml +suna_location_group_and_restructure.yaml +suna_fill_date_gaps_and_regularize.yaml diff --git a/pipe/suna/site-list.json b/pipe/suna/site-list.json new file mode 100644 index 000000000..57f7a998c --- /dev/null +++ b/pipe/suna/site-list.json @@ -0,0 +1,5 @@ +[ + { + "site" : "CRAM" + } +] \ No newline at end of file diff --git a/pipe/suna/suna_calibration_assignment.yaml b/pipe/suna/suna_calibration_assignment.yaml new file mode 100644 index 000000000..3bb18ef16 --- /dev/null +++ b/pipe/suna/suna_calibration_assignment.yaml @@ -0,0 +1,60 @@ +--- +pipeline: + name: suna_calibration_assignment +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - export ERR_PATH="/pfs/out/errored_datums$FILE_YEAR" + - Rscript + ./flow.cal.asgn.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + FileYear=$FILE_YEAR + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v1.0.6 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + env: + LOG_LEVEL: INFO +input: + cross: + - pfs: + name: DIR_IN + repo: suna_calibration_loader + glob: /suna/* + - pfs: + name: 
FILE_YEAR + repo: suna_cron_daily_and_date_control + glob: /data_year*.txt +parallelism_spec: + constant: 2 +autoscaling: true +resource_requests: + memory: 200M + cpu: 0.8 +resource_limits: + memory: 600M + cpu: 1.5 +sidecar_resource_requests: + memory: 3G + cpu: 1 +datum_set_spec: + number: 5 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_calibration_group_and_convert.yaml b/pipe/suna/suna_calibration_group_and_convert.yaml new file mode 100644 index 000000000..5a8af6221 --- /dev/null +++ b/pipe/suna/suna_calibration_group_and_convert.yaml @@ -0,0 +1,170 @@ +--- +pipeline: + name: suna_calibration_group_and_convert +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-grp-conv:v1.3.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf /tmp/kafka_merged + rm -rf $OUT_PATH + mkdir -p /tmp/kafka_merged # Filter joiner relies on the same path positions among inputs (i.e. repo name in 2nd position) + mkdir -p $OUT_PATH # R modules must have pfs in the repo structure + + # Detect if we have data coming from Kafka or the archive + # Note that we run the filter-joiner in sequential if statements rather than an elif statement + # ... so that if there is any overlap in sensor data coming from both Kafka and the archive on the same day, the + # ... kafka data wins (filter joiner will not copy a file if it is already in the destination). This scenario + # ... 
should only arise during initial data load and a site back-streams data from kafka outside the Kafka + # ... retention period for data that have already been loaded from the archive. + # ... When a conflict does arise, the kafka data will take precedence, assuming that it is the latest + # ... and greatest. + + if [ ${KAFKA_UNMERGED_DATA+x} ]; then + # Data from kafka. + + # Run kafka combiner + Rscript ./flow.kfka.comb.R \ + DirIn=$KAFKA_UNMERGED_DATA \ + DirOut=/tmp/kafka_merged \ + DirErr=/pfs/out/errored_datums \ + FileSchmL0=$FILE_SCHEMA_L0 + + # Run filter joiner + python3 -m filter_joiner.filter_joiner_main + fi + if [ ${DATA_PATH_ARCHIVE+x} ]; then + # Data from the archive. + + # Run kafka combiner - note that this works for both trino-loaded data and kafka loaded data. If both + # exist in the folder for the same sensor and day, likely there will be duplicate data written to file + # because the Trino timestamps are truncated to the second whereas Kafka readout times are not. However, + # this scenario should be rare and duplicates will be removed in the regularization module. + Rscript ./flow.kfka.comb.R \ + DirIn=$DATA_PATH_ARCHIVE \ + DirOut=/tmp/kafka_merged \ + DirErr=/pfs/out/errored_datums \ + FileSchmL0=$FILE_SCHEMA_L0 + + # Run filter joiner + python3 -m filter_joiner.filter_joiner_main + fi + + # Run calibration conversion module + Rscript ./flow.cal.conv.R \ + DirIn=/tmp/pfs/filter_joined \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + FileSchmData=$FILE_SCHEMA_DATA \ + FileSchmQf=$FILE_SCHEMA_FLAGS \ + TermFuncConv=voltage:def.cal.conv.poly \ + TermQf=voltage \ + TermFuncUcrt=voltage:def.ucrt.meas.mult,def.ucrt.fdas.volt.poly \ + FileUcrtFdas=$FILE_UNCERTAINTY_FDAS + EOF + env: + # Environment variables for filter-joiner. + CONFIG: | + --- + # Configuration for filter-joiner module that will bring together the data and calibrations + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. 
+ # Metadata indices will typically begin at index 3. + input_paths: + - path: + name: DATA_PATH_KAFKA_MERGED + # Filter for data directory + glob_pattern: /tmp/kafka_merged/suna/*/*/*/*/** + # Join on named location (already joined below by day) + join_indices: [7] + outer_join: true + - path: + name: CALIBRATION_PATH + # Filter for data directory + glob_pattern: /pfs/CALIBRATION_PATH/suna/*/*/*/*/** + # Join on named location (already joined below by day) + join_indices: [7] + OUT_PATH: /tmp/pfs/filter_joined # Note that R modules use "pfs" in the path structure to determine datums + LOG_LEVEL: INFO + RELATIVE_PATH_INDEX: "3" # Must be consistent across inputs + LINK_TYPE: COPY # options are COPY or SYMLINK. MUST BE SIMLINK IF USING COMBINED MODULE. + # Environment variables for calibration module + PARALLELIZATION_INTERNAL: '3' # Option for calibration conversion module +input: + cross: + - pfs: + name: FILE_SCHEMA_L0 + repo: suna_avro_schemas + glob: /suna/suna.avsc + - pfs: + name: FILE_SCHEMA_DATA + repo: suna_avro_schemas + glob: /suna/suna_calibrated.avsc + - pfs: + name: FILE_SCHEMA_FLAGS + repo: suna_avro_schemas + glob: /suna/flags_calibration_suna.avsc + - pfs: + name: FILE_UNCERTAINTY_FDAS + repo: suna_uncertainty_fdas + glob: /fdas_calibration_uncertainty_general.json + # Outer join all days so that varying sensors between kafka and archive loaders will all get joined with calibrations. Filter-joiner will narrow down. + - join: + - pfs: + name: CALIBRATION_PATH + repo: suna_calibration_assignment + glob: /suna/(*)/(*)/(*) + joinOn: $1/$2/$3 + outer_join: true + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - pfs: + name: DATA_PATH_ARCHIVE + repo: suna_data_source_gcs + glob: /suna/(*)/(*)/(*) + joinOn: $1/$2/$3 + outer_join: true + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
+ - pfs: + name: KAFKA_UNMERGED_DATA + repo: suna_data_source_kafka + glob: /suna/(*)/(*)/(*) + joinOn: $1/$2/$3 + outer_join: true + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 1.5G + cpu: 3.3 +resource_limits: + memory: 3G + cpu: 4.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.6 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_calibration_list_files.yaml b/pipe/suna/suna_calibration_list_files.yaml new file mode 100644 index 000000000..a87c26a04 --- /dev/null +++ b/pipe/suna/suna_calibration_list_files.yaml @@ -0,0 +1,44 @@ +--- +pipeline: + name: suna_calibration_list_files +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v2.3.3 + cmd: ["/bin/bash"] + env: + CVAL_INGEST_BUCKET: neon-cval + OUT_PATH: /pfs/out + stdin: + - "#!/bin/bash" + - python3 -m calval_loader.calval_loader +input: + pfs: + repo: suna_cron_daily_and_date_control_tick + glob: /* + empty_files: true +autoscaling: true +resource_requests: + memory: 500M + cpu: 0.4 +resource_limits: + memory: 1G + cpu: 1.5 +sidecar_resource_requests: + memory: 1G + cpu: 0.5 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + 
"operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_calibration_loader.yaml b/pipe/suna/suna_calibration_loader.yaml new file mode 100644 index 000000000..551f9c58e --- /dev/null +++ b/pipe/suna/suna_calibration_loader.yaml @@ -0,0 +1,58 @@ +--- +pipeline: + name: suna_calibration_loader +transform: + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v2.3.3 + cmd: + - /bin/bash + stdin: + - '#!/bin/bash' + - python3 -m calval_loader.load_all_calval_files + env: + CVAL_INGEST_BUCKET: neon-cval + OUT_PATH: /pfs/out + LOG_LEVEL: INFO + SOURCE_TYPE: suna + STARTING_PATH_INDEX: "5" + secrets: + - name: pdr-secret + mount_path: /var/db_secret +input: + pfs: + name: IN_PATH + repo: suna_calibration_list_files + glob: /*/*/*/* + empty_files: true +parallelism_spec: + constant: 10 +autoscaling: true +resource_requests: + memory: 500M + cpu: 0.5 +resource_limits: + memory: 1G + cpu: 1.5 +sidecar_resource_requests: + memory: 2G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_cron_daily_and_date_control.yaml b/pipe/suna/suna_cron_daily_and_date_control.yaml new file mode 100644 index 000000000..4be0808dc --- /dev/null +++ b/pipe/suna/suna_cron_daily_and_date_control.yaml @@ -0,0 +1,57 @@ +--- +pipeline: + name: suna_cron_daily_and_date_control +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-date-cntl:v2.0.1 + cmd: ["/bin/bash"] + env: + # START_DATE ("YYYY-MM-DD") and END_DATE ("YYYY-MM-DD") indicate the max date range 
(inclusive) to create the /Y/M/D folder structure + # If START_DATE is not set (remove line entirely to unset), the start_date and/or the kafka_start_date for each site will be used, as indicated in the site-list json file + # start_date field in the site-list file is the earliest date to pull data from a site + # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka + # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. + OUT_PATH: /pfs/out + START_DATE: "2024-07-28" # Inclusive + END_DATE: "2027-08-10" # Inclusive + SOURCE_TYPE: "suna" + stdin: + - "#!/bin/bash" + - python3 -m cron_daily_and_date_control.cron_daily_and_date_control_main +input: + cross: + # This cron is the central driver for daily scheduled updates, such as data ingest and metadata loaders. + - cron: + name: tick + spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) + overwrite: true + - pfs: + name: SITE_FILE + repo: suna_site_list + glob: /site-list.json +resource_requests: + memory: 100M + cpu: 1 +resource_limits: + memory: 300M + cpu: 1.5 +sidecar_resource_requests: + memory: 500M + cpu: 0.5 +autoscaling: true +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_data_source_gcs.yaml b/pipe/suna/suna_data_source_gcs.yaml new file mode 100644 index 000000000..001eead59 --- /dev/null +++ b/pipe/suna/suna_data_source_gcs.yaml @@ -0,0 +1,90 @@ +--- +pipeline: + name: suna_data_source_gcs +transform: + image_pull_secrets: + - battelleecology-quay-read-all-pull-secret + image: 
us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-l0-gcs-loader:v2.0.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Get today's date for evaluating kafka data retention period + date_today=$(date -u +%Y-%m-%d) + kafka_min_date=$(date -u -d "$KAFKA_RETENTION_DAYS days ago" +%Y-%m-%d) + + # Get data from GCS bucket + echo "Processing $import_trigger" + p=${import_trigger#/pfs} + IFS="/"; arr=($p); unset IFS; + source_type=${arr[2]} + year=${arr[3]} + month=${arr[4]} + day=${arr[5]} + if [ $(date -u +%s -d $year-$month-$day) -lt $(date -u +%s -d $kafka_min_date) ] + then + echo "Extracting $year-$month-$day for $source_type from GCS" + python3 -m l0_gcs_loader.l0_gcs_loader + else + echo "$year/$month/$day is within the Kafka retention period and should be loaded from Kafka. Skipping..." + fi + + EOF + env: + LOG_LEVEL: INFO + OUT_PATH: /pfs/out + KAFKA_RETENTION_DAYS: "15" + BUCKET_VERSION_PATH: "v2" # The root path of the bucket, indicative of the version (e.g. 
v2) + SOURCE_TYPE_INDEX: "3" + YEAR_INDEX: "4" + MONTH_INDEX: "5" + DAY_INDEX: "6" + BUCKET_NAME: neon-l0-ingest # Always pull from prod bucket + # secrets: + # - name: l0-bucket # Using this secret will use the dev/cert/prod bucket linked to the Pachyderm environment + # env_var: BUCKET_NAME + # key: LO_BUCKET + +input: + pfs: + name: import_trigger + repo: suna_cron_daily_and_date_control + # Glob must be daily + glob: "/*/*/*/*" +output_branch: master +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 400M + cpu: 0.5 +resource_limits: + memory: 800M + cpu: 1.5 +sidecar_resource_requests: + memory: 2.4G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_data_source_kafka.yaml b/pipe/suna/suna_data_source_kafka.yaml new file mode 100644 index 000000000..c1153e380 --- /dev/null +++ b/pipe/suna/suna_data_source_kafka.yaml @@ -0,0 +1,177 @@ +--- +pipeline: + name: suna_data_source_kafka +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.4.0 + image_pull_secrets: + - battelleecology-quay-read-all-pull-secret + env: + OUT_PATH: /pfs/out + SOURCE_TYPE: "suna" + LOG_LEVEL: INFO + YEAR_INDEX: "5" + MONTH_INDEX: "6" + DAY_INDEX: "7" + KAFKA_RETENTION_DAYS: "15" + secrets: + - name: pachyderm-kafka-auth + env_var: KAFKA_USER + key: KAFKA_USER + - name: pachyderm-kafka-auth + env_var: KAFKA_PASSWORD + key: KAFKA_PASSWORD + - name: pachyderm-kafka-auth + env_var: KAFKA_BROKER + key: KAFKA_BROKER + - name: l0-bucket + env_var: BUCKET_NAME + key: LO_BUCKET + - name: pdr-secret + env_var: PDR_HOST + key: 
hostname + - name: pdr-secret + env_var: PDR_DBNAME + key: database + - name: pdr-secret + env_var: PDR_USER + key: username + - name: pdr-secret + env_var: PDR_PASSWORD + key: password + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Get GCP zone + meta=$(curl -sH "Metadata-Flavor: Google" "http://metadata/computeMetadata/v1/instance/zone") + zone=$(echo $meta | cut -d "/" -f 4) + echo $zone + + # Get today's date for evaluating kafka data retention period + date_today=$(date -u +%Y-%m-%d) + kafka_min_date=$(date -u -d "$KAFKA_RETENTION_DAYS days ago" +%Y-%m-%d) + + # Get date from input path. Terminal path structure must be /SOURCE_TYPE/YYYY/MM/DD/SITE_FILE + # Datum must be set at /SOURCE_TYPE/YYYY/MM/DD or /SOURCE_TYPE/YYYY/MM/DD/SITE_FILE + date_path=$(echo $import_trigger | cut -f $YEAR_INDEX,$MONTH_INDEX,$DAY_INDEX -d "/") + echo $date_path + date_str=$(date -u +%Y-%m-%d -d $date_path) + + # Get each site to run + if [[ -f ${import_trigger} ]]; then + import_trigger_glob="${import_trigger}" + else + import_trigger_glob="${import_trigger}/*" + fi + + sites_output=() + + for site_kafka in $import_trigger_glob; do + site_file=$(basename $site_kafka) # Strip off any path prefix + site=$(echo $site_file | cut -f 1 -d "." --only-delimited) # Extract the site from site.kafka. Ignore site-only files (e.g. CPER vs. CPER.kafka) + type=$(echo $site_file | cut -f 2 -d "." --only-delimited) # Extract the 'kafka' from site.kafka + if [ "$type" != "kafka" ] + then + echo "$site_file is not indicated to be streaming from Kafka. Skipping..." + continue + elif [ "$(date -u +%s -d "$date_str")" -lt "$(date -u +%s -d "$kafka_min_date")" ] + then + echo -n "Cannot extract $date_str Kafka data for $site. " + echo -n "Today's date ($date_today) is beyond the Kafka retention period ($KAFKA_RETENTION_DAYS days). Skipping..." 
+ continue + fi + + # We are ok to run + echo "Extracting $date_str kafka data for $site" + + # Get "current data" - data that came in on the specified day, which is the same day it was measured + # Note: We cannot use the --removeoffset flag on the kafka loader (which removes the offsets from the filenames. This will often violate the Pachyderm requirement that different datums cannot write the same file) + ./extract-kafka-sensor.py -s $site -S $SOURCE_TYPE -D "$OUT_PATH/$SOURCE_TYPE" -d $date_str --only current --consumer client.rack=$zone + + # Get "non-current data" - data that came in on the specified day, which is NOT the same day it was measured + date_str_1=$(date +%Y-%m-%d -d "$date_str + 1 day") + ./extract-kafka-sensor.py -s $site -S $SOURCE_TYPE -D "$OUT_PATH/$SOURCE_TYPE" -d $date_str_1 --only noncurrent --consumer client.rack=$zone + + sites_output+=($site) + + done + + # Upload L0 files to bucket, compacting with any existing file with the same name + if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then + linkdir=$(mktemp -d) + shopt -s globstar + out_parquet_glob="${OUT_PATH}/**/*.parquet" + # /pfs/out/suna/2023/01/01/12345/data/file.parquet + echo "Linking output files to ${linkdir}" + # set -x # Uncomment for debugging + for f in $out_parquet_glob; do + # Parse the path + [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/data/(.*)$ ]] + fsourcetype="${BASH_REMATCH[1]}" + fyear="${BASH_REMATCH[2]}" + fmonth="${BASH_REMATCH[3]}" + fday="${BASH_REMATCH[4]}" + fsourceid="${BASH_REMATCH[5]}" + fname="${BASH_REMATCH[6]}" + fname_out="${fsourcetype}_${fsourceid}_${fyear}-${fmonth}-${fday}.parquet" # Remove offsets from the filename + outdir="${linkdir}/v2/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}" + mkdir -p "${outdir}" + ln -s "${f}" "${outdir}/${fname_out}" + + # Upload to bucket, compacting with any existing file + ./compact-bucket-copy.py --sourcepath "${linkdir}" --destbucket "${BUCKET_NAME}" + rm -rf "${outdir}" + done + + # Update 
the airflow triggering table + for site_output in "${sites_output[@]}"; do + ./update-trigger-table.py -s $site_output -S $SOURCE_TYPE -D "$OUT_PATH/$SOURCE_TYPE" + done + + # set +x # Uncomment for debugging + rm -rf $linkdir + fi + EOF +input: + pfs: + name: import_trigger + repo: suna_cron_daily_and_date_control + # Must be datum by day (e.g. /SOURCE_TYPE/*/*/*) or by day/site (e.g. /SOURCE_TYPE/*/*/*/*) + glob: "/suna/*/*/*" +parallelism_spec: + constant: 3 +autoscaling: true +resource_requests: + memory: 300M + cpu: 1.6 +resource_limits: + memory: 1.5G + cpu: 2 +sidecar_resource_requests: + memory: 2G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_fill_date_gaps_and_regularize.yaml b/pipe/suna/suna_fill_date_gaps_and_regularize.yaml new file mode 100644 index 000000000..45cb7b7b3 --- /dev/null +++ b/pipe/suna/suna_fill_date_gaps_and_regularize.yaml @@ -0,0 +1,117 @@ +--- +pipeline: + name: suna_fill_date_gaps_and_regularize +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-gf-rglr:v1.1.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf $OUT_PATH + mkdir -p $OUT_PATH + # Run first module - date-gap-filler (using environment variables below as input parameters) + python3 -m date_gap_filler.date_gap_filler_main + # Run second module - regularize + Rscript ./flow.rglr.R \ + DirIn=/tmp/pfs/date_filled \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + "DirRglr=data|uncertainty_data|flags" \ + MethRglr=CybiEc \ + WndwRglr=Trlg \ + IdxWndw=IdxWndwMin \ + RptTimeWndw=FALSE \ + DropNotNumc=FALSE \ + "DirSubCopy=location|uncertainty_coef" + EOF + env: + # Environment variables for date gap filler + LOG_LEVEL: INFO + OUT_PATH: /tmp/pfs/date_filled + OUTPUT_DIRECTORIES: data,location,uncertainty_data,uncertainty_coef,flags + DATA_SOURCE_TYPE_INDEX: '3' + DATA_YEAR_INDEX: '4' + DATA_MONTH_INDEX: '5' + DATA_DAY_INDEX: '6' + DATA_LOCATION_INDEX: '7' + DATA_TYPE_INDEX: '8' + LOCATION_SOURCE_TYPE_INDEX: '3' + LOCATION_YEAR_INDEX: '4' + LOCATION_MONTH_INDEX: '5' + LOCATION_DAY_INDEX: '6' + LOCATION_INDEX: '7' + EMPTY_FILE_TYPE_INDEX: '4' + LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined modules. + # Environment variables for regularizer + PARALLELIZATION_INTERNAL: '3' # Parallelization within R. If increased, adjust resource requests appropriately. +input: + cross: + - pfs: + name: EMPTY_FILE_PATH + repo: suna_empty_files + glob: /suna + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - group: + - pfs: + name: DATA_PATH + repo: suna_location_group_and_restructure + # For full-scale daily processing, glob should be /suna/(*/*/*). 
To limit to particular CFGLOCs, note the parentheses and enter something like /suna/(*/*/*/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) + glob: /suna/(*/*/*) + #glob: /suna/(*/*/*/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) + group_by: $1 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - join: + - pfs: + name: LOCATION_PATH + repo: suna_location_active_dates_assignment + # For full-scale daily processing, glob should be /suna/(*/*/*). To limit to particular CFGLOCs, note the parentheses and enter something like /suna/((*/*/*)/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) + glob: /suna/(*/*/*) + #glob: /suna/((*/*/*)/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) + # For full-scale daily processing, joinOn be $1. When limiting to particular CFGLOCs, joinOn will be $2 to match parentheses around (*/*/*) + joinOn: $1 + #joinOn: $2 + group_by: $1 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
+ - pfs: + name: DATE_LIMITER_PATH + repo: suna_cron_daily_and_date_control + glob: /suna/(*/*/*) + joinOn: $1 + group_by: $1 + empty_files: true # This can remain true even if LINK_TYPE=COPY +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 2G + cpu: 3.3 +resource_limits: + memory: 3G + cpu: 4.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_location_active_dates_assignment.yaml b/pipe/suna/suna_location_active_dates_assignment.yaml new file mode 100644 index 000000000..535a0a8d8 --- /dev/null +++ b/pipe/suna/suna_location_active_dates_assignment.yaml @@ -0,0 +1,62 @@ +--- +pipeline: + name: suna_location_active_dates_assignment +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - export ERR_PATH="/pfs/out/errored_datums$FILE_YEAR" + - Rscript + ./flow.loc.grp.asgn.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + FileYear=$FILE_YEAR + TypeFile=namedLocation + "Prop=HOR|VER|name|description|site|Data Rate|active_periods" + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.0 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + env: + LOG_LEVEL: INFO +input: + cross: + - pfs: + name: DIR_IN + repo: suna_location_loader + glob: /suna/* + - pfs: + name: FILE_YEAR + repo: suna_cron_daily_and_date_control + glob: /data_year*.txt +parallelism_spec: + constant: 2 +autoscaling: true +resource_requests: + memory: 210M + cpu: 1.2 +resource_limits: + memory: 500M + cpu: 1.6 +sidecar_resource_requests: + memory: 2G + cpu: 0.3 
+datum_set_spec: + number: 5 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_location_asset.yaml b/pipe/suna/suna_location_asset.yaml new file mode 100644 index 000000000..53d66749f --- /dev/null +++ b/pipe/suna/suna_location_asset.yaml @@ -0,0 +1,53 @@ +--- +pipeline: + name: suna_location_asset +transform: + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-asset-loader:v1.1.0 + cmd: + - /bin/bash + stdin: + - '#!/bin/bash' + - python3 -m location_asset_loader.location_asset_loader_main + env: + OUT_PATH: /pfs/out + # ERR_PATH can be changed, it is user specified + ERR_PATH: /pfs/out/errored_datums + LOG_LEVEL: INFO + SOURCE_TYPE: suna + secrets: + - name: pdr-secret + mount_path: /var/db_secret +input: + pfs: + repo: suna_cron_daily_and_date_control_tick + glob: /* + empty_files: true +autoscaling: true +resource_requests: + memory: 100M + cpu: 0.15 +resource_limits: + memory: 300M + cpu: 0.5 +sidecar_resource_requests: + memory: 250M + cpu: 0.3 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_location_asset_assignment.yaml b/pipe/suna/suna_location_asset_assignment.yaml new file mode 100644 index 000000000..8ad6031fb --- /dev/null +++ 
b/pipe/suna/suna_location_asset_assignment.yaml @@ -0,0 +1,62 @@ +--- +pipeline: + name: suna_location_asset_assignment +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - export ERR_PATH="/pfs/out/errored_datums$FILE_YEAR" + - Rscript + ./flow.loc.grp.asgn.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + FileYear=$FILE_YEAR + TypeFile=asset + "Prop=HOR|VER|install_date|remove_date|name|site|Data Rate" + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.0 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + env: + LOG_LEVEL: INFO +input: + cross: + - pfs: + name: DIR_IN + repo: suna_location_asset + glob: /suna/* + - pfs: + name: FILE_YEAR + repo: suna_cron_daily_and_date_control + glob: /data_year*.txt +parallelism_spec: + constant: 2 +autoscaling: true +resource_requests: + memory: 400M + cpu: 1.5 +resource_limits: + memory: 800M + cpu: 2 +sidecar_resource_requests: + memory: 2G + cpu: 0.3 +datum_set_spec: + number: 5 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_location_group_and_restructure.yaml b/pipe/suna/suna_location_group_and_restructure.yaml new file mode 100644 index 000000000..9a9b6221b --- /dev/null +++ b/pipe/suna/suna_location_group_and_restructure.yaml @@ -0,0 +1,111 @@ +--- +pipeline: + name: suna_location_group_and_restructure +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-strc-comb:v1.2.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf /tmp/pfs/filter_joined + rm -rf /tmp/pfs/structured + rm -rf /tmp/pfs/structuredCopy + mkdir -p /tmp/pfs/filter_joined + # Run first module - filter-joiner (using environment variables below as input parameters) + python3 -m filter_joiner.filter_joiner_main + # Run second module - structure repo by location + Rscript ./flow.loc.repo.strc.R \ + DirIn=/tmp/pfs/filter_joined \ + DirOut=/tmp/pfs/structured \ + DirErr=/pfs/out/errored_datums \ + Comb=TRUE + # Copy output to another interim folder to destroy links (cannot daisy chain links from pfs input to output) + cp -rL /tmp/pfs/structured /tmp/pfs/structuredCopy || : # Allow to fail without exit code (happens if step above produced no output) + rm -rf /tmp/pfs/filter_joined + rm -rf /tmp/pfs/structured + # Run third module - merge data by location + Rscript ./flow.loc.data.trnc.comb.R \ + DirIn=/tmp/pfs/structuredCopy \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + "DirSubCombData=data|flags|uncertainty_data" \ + DirSubCombUcrt=uncertainty_coef \ + DirSubCopy=location + EOF + env: + # Environment variables for filter-joiner + CONFIG: | + --- + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. + input_paths: + - path: + name: DATA_PATH + # Filter for data directory + glob_pattern: /pfs/DATA_PATH/suna/*/*/*/*/** + # Join on named location (already joined below by day) + join_indices: [7] + outer_join: true + - path: + name: LOCATION_PATH + # Filter for data directory + glob_pattern: /pfs/LOCATION_PATH/suna/*/*/*/*/** + # Join on named location (already joined below by day) + join_indices: [7] + OUT_PATH: /tmp/pfs/filter_joined + LOG_LEVEL: INFO + RELATIVE_PATH_INDEX: "3" + LINK_TYPE: COPY # options are COPY or SYMLINK. 
Use COPY for combined module. + # Environment variables for R modules + PARALLELIZATION_INTERNAL: '3' +input: + join: + - pfs: + name: DATA_PATH + repo: suna_calibration_group_and_convert + glob: /suna/(*)/(*)/(*) + joinOn: $1/$2/$3 + outer_join: true + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - pfs: + name: LOCATION_PATH + repo: suna_location_asset_assignment + glob: /suna/(*)/(*)/(*) + joinOn: $1/$2/$3 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 2.2G + cpu: 3.3 +resource_limits: + memory: 4G + cpu: 4.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/suna/suna_location_loader.yaml b/pipe/suna/suna_location_loader.yaml new file mode 100644 index 000000000..6c1fa1bd5 --- /dev/null +++ b/pipe/suna/suna_location_loader.yaml @@ -0,0 +1,54 @@ +--- +pipeline: + name: suna_location_loader +transform: + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-loader:v1.0.0 + cmd: + - /bin/bash + stdin: + - '#!/bin/bash' + - python3 -m location_loader.location_loader_main + env: + LOCATION_TYPE: CONFIG + SOURCE_TYPE: suna + OUT_PATH: /pfs/out + # ERR_PATH can be changed, it is user specified + ERR_PATH: /pfs/out/errored_datums + LOG_LEVEL: INFO + secrets: + - name: pdr-secret + mount_path: /var/db_secret +input: + pfs: + repo: 
suna_cron_daily_and_date_control_tick + glob: /* + empty_files: true +autoscaling: true +resource_requests: + memory: 100M + cpu: 0.1 +resource_limits: + memory: 300M + cpu: 0.5 +sidecar_resource_requests: + memory: 300M + cpu: 0.3 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } From ec0eb5ccf7572ef8ca6571b146e05ffa51aa591d Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 9 Apr 2025 15:32:59 -0600 Subject: [PATCH 004/182] trino data --- .../suna_calibration_group_and_convert.yaml | 2 +- pipe/suna/suna_data_source_gcs.yaml | 90 ---------- pipe/suna/suna_data_source_trino.yaml | 161 ++++++++++++++++++ 3 files changed, 162 insertions(+), 91 deletions(-) delete mode 100644 pipe/suna/suna_data_source_gcs.yaml create mode 100644 pipe/suna/suna_data_source_trino.yaml diff --git a/pipe/suna/suna_calibration_group_and_convert.yaml b/pipe/suna/suna_calibration_group_and_convert.yaml index 5a8af6221..3c4a95096 100644 --- a/pipe/suna/suna_calibration_group_and_convert.yaml +++ b/pipe/suna/suna_calibration_group_and_convert.yaml @@ -125,7 +125,7 @@ input: empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
- pfs: name: DATA_PATH_ARCHIVE - repo: suna_data_source_gcs + repo: suna_data_source_trino glob: /suna/(*)/(*)/(*) joinOn: $1/$2/$3 outer_join: true diff --git a/pipe/suna/suna_data_source_gcs.yaml b/pipe/suna/suna_data_source_gcs.yaml deleted file mode 100644 index 001eead59..000000000 --- a/pipe/suna/suna_data_source_gcs.yaml +++ /dev/null @@ -1,90 +0,0 @@ ---- -pipeline: - name: suna_data_source_gcs -transform: - image_pull_secrets: - - battelleecology-quay-read-all-pull-secret - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-l0-gcs-loader:v2.0.0 - cmd: - - sh - - "-c" - - |- - /bin/bash <<'EOF' - # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ - set -euo pipefail - IFS=$'\n\t' - - # Get today's date for evaluating kafka data retention period - date_today=$(date -u +%Y-%m-%d) - kafka_min_date=$(date -u -d "$KAFKA_RETENTION_DAYS days ago" +%Y-%m-%d) - - # Get data from GCS bucket - echo "Processing $import_trigger" - p=${import_trigger#/pfs} - IFS="/"; arr=($p); unset IFS; - source_type=${arr[2]} - year=${arr[3]} - month=${arr[4]} - day=${arr[5]} - if [ $(date -u +%s -d $year-$month-$day) -lt $(date -u +%s -d $kafka_min_date) ] - then - echo "Extracting $year-$month-$day for $source_type from GCS" - python3 -m l0_gcs_loader.l0_gcs_loader - else - echo "$year/$month/$day is within the Kafka retention period and should be loaded from Kafka. Skipping..." - fi - - EOF - env: - LOG_LEVEL: INFO - OUT_PATH: /pfs/out - KAFKA_RETENTION_DAYS: "15" - BUCKET_VERSION_PATH: "v2" # The root path of the bucket, indicative of the version (e.g. 
v2) - SOURCE_TYPE_INDEX: "3" - YEAR_INDEX: "4" - MONTH_INDEX: "5" - DAY_INDEX: "6" - BUCKET_NAME: neon-l0-ingest # Always pull from prod bucket - # secrets: - # - name: l0-bucket # Using this secret will use the dev/cert/prod bucket linked to the Pachyderm environment - # env_var: BUCKET_NAME - # key: LO_BUCKET - -input: - pfs: - name: import_trigger - repo: suna_cron_daily_and_date_control - # Glob must be daily - glob: "/*/*/*/*" -output_branch: master -parallelism_spec: - constant: 5 -autoscaling: true -resource_requests: - memory: 400M - cpu: 0.5 -resource_limits: - memory: 800M - cpu: 1.5 -sidecar_resource_requests: - memory: 2.4G - cpu: 0.5 -datum_set_spec: - number: 1 -scheduling_spec: - node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } diff --git a/pipe/suna/suna_data_source_trino.yaml b/pipe/suna/suna_data_source_trino.yaml new file mode 100644 index 000000000..0769e5327 --- /dev/null +++ b/pipe/suna/suna_data_source_trino.yaml @@ -0,0 +1,161 @@ +--- +pipeline: + name: suna_data_source_trino +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-data-src-trino:v2.1.1 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + interimDir="/tmp/interimData" + rm -rf $interimDir + + # Get today's date for evaluating kafka data retention period + date_today=$(date -u +%Y-%m-%d) + kafka_min_date=$(date -u -d "$KAFKA_RETENTION_DAYS days ago" +%Y-%m-%d) + + # Run first module - data_source_site (pull data from database by site) + # Split data source path + for path in $(find -L $import_trigger -type f); do + echo "Processing $path" + p=${path#/pfs} + IFS="/"; arr=($p); unset IFS; + source_type=${arr[2]} + export SOURCE_TYPE=$source_type + year=${arr[3]} + month=${arr[4]} + day=${arr[5]} + site=${arr[6]} + type=$(echo $site | cut -f 2 -d "." --only-delimited); # Extract the "kafka" from site.kafka if present + if [ "$type" = "kafka" ] && [ $(date -u +%s -d $year-$month-$day) -lt $(date -u +%s -d $kafka_min_date) ] + then + site=$(echo $site | cut -f 1 -d "." --only-delimited); # Extract the site from site.kafka. + echo "$year/$month/$day for $site is indicated to be streaming from Kafka but has passed the Kafka retention period ($KAFKA_RETENTION_DAYS days)." + elif [ "$type" = "kafka" ] + then + echo "$year/$month/$day/$site is indicated to be streaming from Kafka. Skipping..." 
+ continue + fi + + # Set env vars for trino loader + export GEN_DATE=$year-$month-$day + export GEN_SITE_NAME=$site + export REQUESTS_CA_BUNDLE=/etc/pki/tls/cert.pem + export GEN_YAML_CONF="/usr/src/app/genscript/configs/$(echo $SOURCE_TYPE)_streams.yaml" + export GEN_SCHEMA_FILE="/usr/src/app/schemas/$(echo $SOURCE_TYPE)/$(echo $SOURCE_TYPE).avsc" + echo "Extracting $SOURCE_TYPE from Trino for $year/$month/$day/$site" + export GEN_OUTPUT_DIR=$interimDir/$SOURCE_TYPE/$year/$month/$day + mkdir -p $GEN_OUTPUT_DIR + /usr/src/app/genscript/genparquet.py --storesitename --codec gzip + done + + # Run second module - parquet_linkmerge (merges data from a source id that collected data from multiple sites in one day + python3 -m parquet_linkmerge.parquet_linkmerge_main + + # Export L0 data to bucket + if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then + linkdir=$(mktemp -d) + shopt -s globstar + out_parquet_glob="${OUT_PATH}/**/*.parquet" + # Example: /pfs/out/li191r/2023/01/01/12345/data/file.parquet + echo "Linking output files to ${linkdir}" + # set -x # Uncomment for debugging + for f in $out_parquet_glob; do + # Parse the path + [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/data/(.*)$ ]] + fsourcetype="${BASH_REMATCH[1]}" + fyear="${BASH_REMATCH[2]}" + fmonth="${BASH_REMATCH[3]}" + fday="${BASH_REMATCH[4]}" + fsourceid="${BASH_REMATCH[5]}" + fname="${BASH_REMATCH[6]}" + outdir="${linkdir}/v2/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}" + mkdir -p "${outdir}" + ln -s "${f}" "${outdir}/${fname}" + done + + echo "Syncing files to bucket" + rclone \ + --no-check-dest \ + --copy-links \ + --gcs-bucket-policy-only \ + --gcs-no-check-bucket \ + copy \ + "${linkdir}" \ + ":gcs://${BUCKET_NAME}" + + echo "Removing temporary files" + rm -rf $linkdir + + # set +x # Uncomment for debugging + fi + EOF + env: + # Static environment variables for data conversion step + LOG_LEVEL: INFO + REQUESTS_CA_BUNDLE: "/etc/pki/tls/cert.pem" + # Environment variables for 
linkmerge step + IN_PATH: /tmp/interimData + OUT_PATH: /pfs/out + SOURCE_TYPE_INDEX: '3' + YEAR_INDEX: '4' + MONTH_INDEX: '5' + DAY_INDEX: '6' + SOURCE_ID_INDEX: '7' + KAFKA_RETENTION_DAYS: "15" + secrets: + - name: pachd-trino-secret + key: TRINO_HOST + env_var: PRESTO_HOST + - name: pachd-trino-secret + key: TRINO_PASSWORD + env_var: PRESTO_PASSWORD + - name: pachd-trino-secret + key: TRINO_USER + env_var: PRESTO_USER + - name: l0-bucket + env_var: BUCKET_NAME + key: LO_BUCKET +input: + pfs: + name: import_trigger + repo: suna_cron_daily_and_date_control + glob: "/*/*/*/*" +output_branch: master +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 400M + cpu: 1.2 +resource_limits: + memory: 800M + cpu: 2 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } From 14fb0442476a8552c246e2f8edd32a611d45f8a7 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Thu, 10 Apr 2025 11:31:58 -0600 Subject: [PATCH 005/182] Additional updated to SUNA log checker --- flow/flow.suna.logfiles/wrap.suna.logfiles.R | 45 +++++++++++++++++--- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/flow/flow.suna.logfiles/wrap.suna.logfiles.R b/flow/flow.suna.logfiles/wrap.suna.logfiles.R index 5e598c709..f4527d4da 100644 --- a/flow/flow.suna.logfiles/wrap.suna.logfiles.R +++ b/flow/flow.suna.logfiles/wrap.suna.logfiles.R @@ -74,15 +74,46 @@ wrap.suna.logfiles <- function(FileIn, # Find row where data actually starts start<-which(grepl('Zeiss Coefficient',log_file$V2))+1 # Separate data and metadata - log_data<-log_file[start:(length(log_file)),] 
+ log_data<-log_file[start:(nrow(log_file)),] log_metadata<-log_file[1:(start-1),2:6] + # Create column names for data + names(log_data)<-c("serial_number","date","time","nitrate_uM","nitrate_mgL","absorbance_254","absorbance_350","bromide", + "spec_avg","dark_value","int_time_factor", + "channel_1","channel_2","channel_3","channel_4","channel_5","channel_6","channel_7","channel_8","channel_9","channel_10", + "channel_11","channel_12","channel_13","channel_14","channel_15","channel_16","channel_17","channel_18","channel_19","channel_20", + "channel_21","channel_22","channel_23","channel_24","channel_25","channel_26","channel_27","channel_28","channel_29","channel_30", + "channel_31","channel_32","channel_33","channel_34","channel_35","channel_36","channel_37","channel_38","channel_39","channel_40", + "channel_41","channel_42","channel_43","channel_44","channel_45","channel_46","channel_47","channel_48","channel_49","channel_50", + "channel_51","channel_52","channel_53","channel_54","channel_55","channel_56","channel_57","channel_58","channel_59","channel_60", + "channel_61","channel_62","channel_63","channel_64","channel_65","channel_66","channel_67","channel_68","channel_69","channel_70", + "channel_71","channel_72","channel_73","channel_74","channel_75","channel_76","channel_77","channel_78","channel_79","channel_80", + "channel_81","channel_82","channel_83","channel_84","channel_85","channel_86","channel_87","channel_88","channel_89","channel_90", + "channel_91","channel_92","channel_93","channel_94","channel_95","channel_96","channel_97","channel_98","channel_99","channel_100", + "channel_101","channel_102","channel_103","channel_104","channel_105","channel_106","channel_107","channel_108","channel_109","channel_110", + "channel_111","channel_112","channel_113","channel_114","channel_115","channel_116","channel_117","channel_118","channel_119","channel_120", + 
"channel_121","channel_122","channel_123","channel_124","channel_125","channel_126","channel_127","channel_128","channel_129","channel_130", + "channel_131","channel_132","channel_133","channel_134","channel_135","channel_136","channel_137","channel_138","channel_139","channel_140", + "channel_141","channel_142","channel_143","channel_144","channel_145","channel_146","channel_147","channel_148","channel_149","channel_150", + "channel_151","channel_152","channel_153","channel_154","channel_155","channel_156","channel_157","channel_158","channel_159","channel_160", + "channel_161","channel_162","channel_163","channel_164","channel_165","channel_166","channel_167","channel_168","channel_169","channel_170", + "channel_171","channel_172","channel_173","channel_174","channel_175","channel_176","channel_177","channel_178","channel_179","channel_180", + "channel_181","channel_182","channel_183","channel_184","channel_185","channel_186","channel_187","channel_188","channel_189","channel_190", + "channel_191","channel_192","channel_193","channel_194","channel_195","channel_196","channel_197","channel_198","channel_199","channel_200", + "channel_201","channel_202","channel_203","channel_204","channel_205","channel_206","channel_207","channel_208","channel_209","channel_210", + "channel_211","channel_212","channel_213","channel_214","channel_215","channel_216","channel_217","channel_218","channel_219","channel_220", + "channel_221","channel_222","channel_223","channel_224","channel_225","channel_226","channel_227","channel_228","channel_229","channel_230", + "channel_231","channel_232","channel_233","channel_234","channel_235","channel_236","channel_237","channel_238","channel_239","channel_240", + "channel_241","channel_242","channel_243","channel_244","channel_245","channel_246","channel_247","channel_248","channel_249","channel_250", + "channel_251","channel_252","channel_253","channel_254","channel_255","channel_256", + 
"internal_temp","spec_temp","lamp_temp","cum_lamp_time","humidity","main_volt","lamp_volt","internal_volt","current","fit_aux_1","fit_aux_2", + "fit_base_1","fit_base_2","fit_RMSE","ctd_time","ctd_salinity","ctd_temp","ctd_pressure","check_sum") - - - - - - + # Calculates the date and time in POSIXct format + log_data$date<-lubridate::parse_date_time(as.character(log_data$date),order="yj") + log_data$date<-lubridate::with_tz(log_data$date+(as.numeric(log_data$time)*60*60),'UTC') + # Checks that there are no dates prior to when NEON began collecting IS data + if(any(log_data$date<"2014-01-01 00:00:00 UTC")){ + log$debug(base::paste0("Data contains dates prior to when NEON began collecting IS data"))} From 1738b81c854f92f6181de4151c5f75a33f9b67ce Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 10 Apr 2025 14:48:46 -0600 Subject: [PATCH 006/182] add LECO --- pipe/suna/site-list.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipe/suna/site-list.json b/pipe/suna/site-list.json index 57f7a998c..42a514c5e 100644 --- a/pipe/suna/site-list.json +++ b/pipe/suna/site-list.json @@ -1,5 +1,8 @@ [ { "site" : "CRAM" + }, + { + "site" : "LECO" } ] \ No newline at end of file From a05d72d438ef13a12e89ca465c1b6db53e3c92d2 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Tue, 15 Apr 2025 10:56:12 -0600 Subject: [PATCH 007/182] Updates to suna logged data checker. 
--- flow/flow.suna.logfiles/wrap.suna.logfiles.R | 50 +++++++++++++++++++- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/flow/flow.suna.logfiles/wrap.suna.logfiles.R b/flow/flow.suna.logfiles/wrap.suna.logfiles.R index f4527d4da..47f3cfd8b 100644 --- a/flow/flow.suna.logfiles/wrap.suna.logfiles.R +++ b/flow/flow.suna.logfiles/wrap.suna.logfiles.R @@ -53,7 +53,6 @@ wrap.suna.logfiles <- function(FileIn, log <- NEONprocIS.base::def.log.init() } - # --------- Load the data ---------- # Load in the csv log file(s) log_file <- base::try(read.table(paste0(FileIn), header = FALSE, sep = ",", @@ -71,6 +70,7 @@ wrap.suna.logfiles <- function(FileIn, log$debug(base::paste0('skipping sonde file: ', FileIn)) base::stop() } + # Find row where data actually starts start<-which(grepl('Zeiss Coefficient',log_file$V2))+1 # Separate data and metadata @@ -108,6 +108,11 @@ wrap.suna.logfiles <- function(FileIn, "internal_temp","spec_temp","lamp_temp","cum_lamp_time","humidity","main_volt","lamp_volt","internal_volt","current","fit_aux_1","fit_aux_2", "fit_base_1","fit_base_2","fit_RMSE","ctd_time","ctd_salinity","ctd_temp","ctd_pressure","check_sum") + # Gets metadata + sensor<-"suna" + serial_number<-log_metadata[1,2] + eprom<-"20349" #' Need to figure out a way to get this from the folder name the file came from since it's not included in the file itself + # Calculates the date and time in POSIXct format log_data$date<-lubridate::parse_date_time(as.character(log_data$date),order="yj") log_data$date<-lubridate::with_tz(log_data$date+(as.numeric(log_data$time)*60*60),'UTC') @@ -115,13 +120,54 @@ wrap.suna.logfiles <- function(FileIn, if(any(log_data$date<"2014-01-01 00:00:00 UTC")){ log$debug(base::paste0("Data contains dates prior to when NEON began collecting IS data"))} + # Output file + # Create output directory + year <- substr(log_data$date[1],1,4) + month <- substr(log_data$date[1],6,7) + day <- substr(log_data$date[1],9,10) + DirOutLogFile <- 
paste0(DirOut,'/',sensor,'/',year,'/',month,'/',day,'/',eprom,'/data/') + base::dir.create(DirOutLogFile,recursive=TRUE) + csv_name <-paste0(sensor,'_',eprom,'_',year,'-',month,'-',day,'_log') + # Writes parquet file to output directory + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = log_data, + NameFile = base::paste0(DirOutLogFile,csv_name,".parquet"), + Schm = SchmDataOut),silent=TRUE) + if(class(rptOut)[1] == 'try-error'){ + log$error(base::paste0('Cannot write Data to ',base::paste0(DirOutLogFile,csv_name,".parquet"),'. ',attr(rptOut, "condition"))) + stop() + } else { + log$info(base::paste0('Data written successfully in ', base::paste0(DirOutLogFile,csv_name,".parquet"))) + } + + + + + + + - + + + + + + + + + + + + + + + + + From 360624798c358f7b097aa45711cfd7a85111616f Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Tue, 15 Apr 2025 14:07:27 -0600 Subject: [PATCH 008/182] Updates to SUNA log file checker. --- flow/flow.suna.logfiles/wrap.suna.logfiles.R | 258 +------------------ 1 file changed, 1 insertion(+), 257 deletions(-) diff --git a/flow/flow.suna.logfiles/wrap.suna.logfiles.R b/flow/flow.suna.logfiles/wrap.suna.logfiles.R index 47f3cfd8b..6c9ed0e42 100644 --- a/flow/flow.suna.logfiles/wrap.suna.logfiles.R +++ b/flow/flow.suna.logfiles/wrap.suna.logfiles.R @@ -138,264 +138,8 @@ wrap.suna.logfiles <- function(FileIn, } else { log$info(base::paste0('Data written successfully in ', base::paste0(DirOutLogFile,csv_name,".parquet"))) } - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #check timezone. lot's of different styles... 
- if(any(grepl('Time Zone: ',log_metadata$V1))){ - timezone<-log_metadata$V1[grepl('Time Zone: ',log_metadata$V1)] - timezone<-gsub('Time Zone: ','',timezone) - }else if(any(grepl('Time Zone',log_metadata$V1))){ - timezone<-log_metadata$V2[grepl('Time Zone',log_metadata$V1)] - }else if(any(grepl('Time Zone',log_metadata$V2))){ - timezone<-log_metadata$V3[grepl('Time Zone',log_metadata$V2)] - }else{ - timezone<-'ERROR' - log$error(base::paste0('File Error: timezone not specified in ', FileIn)) - } - #then clean up TZ - #grep("Dateline", OlsonNames(), value=TRUE) - if(timezone=="Coordinated Universal Time"){ - timezone<-'UTC' - }else if(grepl('Eastern Standard Time',timezone)|grepl('Eastern Daylight Time',timezone)|grepl('Dateline',timezone)){ - timezone<-'EST' - }else if(grepl('Central Daylight Time',timezone)|grepl('Central Standard Time',timezone)){ - timezone<-'US/Central' - }else if(grepl('Pacific Daylight Time',timezone)|grepl('Pacific Standard Time',timezone)|grepl('UTC-08',timezone)){ - timezone<-'US/Pacific' - }else if(grepl('Mountain Daylight Time',timezone)|grepl('Mountain Standard Time',timezone)){ - timezone<-'US/Mountain' - }else if(grepl('Alaskan Daylight Time',timezone)|grepl('Alaskan Standard Time',timezone)|grepl('UTC-09',timezone)){ - timezone<-'US/Alaska' - }else if(grepl('SA Western Daylight Time',timezone)|grepl('SA Western Standard Time',timezone)){ - timezone<-'America/Puerto_Rico' - }else if(grepl('GMT',timezone)|grepl('Greenwich Standard Time',timezone)){ - timezone<-'GMT' - }else if(timezone=='Unknown'){ - if(any(grepl('UTC',log_metadata))){ - timezone<-'UTC' - } - } - - #clean up metadata - removeAfter<-which(log_metadata$V1=='Log Notes:') - if(length(removeAfter)>0){ - log_metadata <- as.data.frame(log_metadata[1:(removeAfter),]) - } - log_metadata$V1[is.na(log_metadata$V1)]<-log_metadata$V2[is.na(log_metadata$V1)] - log_metadata$V2[!is.na(log_metadata$V3)]<-log_metadata$V3[!is.na(log_metadata$V3)] - log_metadata<-log_metadata[,1:2] - 
colnames(log_metadata)<-c("label","value") - - #Metadata values - logName <- log_metadata$value[!is.na(log_metadata$label) & (log_metadata$label=="Log Name"|log_metadata$label=="File Name")][1] - Troll_SN <- log_metadata$value[!is.na(log_metadata$label) & log_metadata$label=="Serial Number"][1] - Asset <- log_metadata$value[!is.na(log_metadata$label) & log_metadata$label=="Device Name"][1] - #log$debug(base::paste0('metadata: ',logName,'_',Troll_SN,'_',Asset)) - if(length(Asset)<1 || Asset == " " || nchar(Asset) == 0){ - log$error(base::paste0('File Info: No asset specified in ', FileIn)) - stop() - } - #define Site - Site <- log_metadata$value[!is.na(log_metadata$label) & log_metadata$label=="Site"] - if(length(Site)<1){ - log$info(base::paste0('File Info: No site specified in ', FileIn)) - }else if(Site == 'Default Site'){ - Site <- NA - log$info(base::paste0('File Info: Default site specified in ', FileIn)) - }else if(length(Site)>1){ - log$info(base::paste0('File Info: More than one site specified in ', FileIn)) - }else if(nchar(Site)>4){ - Site <-substr(Site,5,8) - } - #fix for specific use case - if(grepl('Central America Standard Time',timezone) & !is.na(Site) & (Site == "MCDI"|Site == "KING")){ - timezone<-'US/Central' - } - Device <- log_metadata$value[!is.na(log_metadata$label) & log_metadata$label=="Device"][1] - if(!is.na(Device) & grepl('level',tolower(Device))){ - Device<-"Level TROLL 500" - Context <- "surfacewater" - sensor <- "leveltroll500" - keep<-c('readout_time','seconds','pressure','temperature') - log_data<-log_data[keep] - }else if(!is.na(Device) & grepl('aqua',tolower(Device))){ - Device<-"Aqua TROLL 200" - sensor <- "aquatroll200" - if(!is.na(logName) & (grepl("IN",logName)|grepl("Inlet",logName)|grepl("OT",logName)|grepl("Outlet",logName)|grepl("L1",logName)| - grepl("L2",logName)|grepl("Lit",logName)|grepl("S1",logName)|grepl("S2",logName))){ - Context <- "surfacewater" - }else if(!is.na(logName) & 
(grepl("GW",logName)|any(grepl("conductivity",tolower(colnames(log_data)))))){ - Context <- "groundwater" - }else{ - log$error(base::paste0('File Error: Context not specified in ', FileIn)) - } - }else{ - log$error(base::paste0('File Error: Device not specified in ', FileIn)) - } - - - ###check and update date format - #sometimes ymd others mdy, sometimes has / others -, some don't have seconds - #check if date contains seconds - if(length(base::gregexpr(':', log_data$readout_time[1])[[1]])==2){ #if 2 : then it has seconds - #Check if date begins with year (assume 4 digit year, which seems to always be true) - if(grepl('^\\d{4}', log_data$readout_time[1])){ - #ymd format - log_data$dateTime <- lubridate::ymd_hms(log_data$readout_time, tz = timezone) - }else{ - #assume mdy format - log_data$dateTime <- lubridate::mdy_hms(log_data$readout_time, tz = timezone) - } - }else if(length(base::gregexpr(':', log_data$readout_time[1])[[1]])==1){ - #doesn't have seconds - #Check if date begins with year (assuming 4 digit year) - if(grepl('^\\d{4}', log_data$readout_time[1])){ - #ymd format - log_data$dateTime <- lubridate::ymd_hm(log_data$readout_time, tz = timezone) - }else{ - #assume mdy format - log_data$dateTime <- lubridate::mdy_hm(log_data$readout_time, tz = timezone) - } - }else{ - log$error(base::paste0('File Error: Invalid date time format',log_data$readout_time[1],' in ', FileIn))#this shouldn't happen - } - log_data<-log_data[!is.na(log_data$dateTime),] - - #add date as UTC - log_data$dateUTC<-lubridate::with_tz(log_data$dateTime,'UTC') - - #check that dates are 2018 or later (some files have 1970 error) - log_data$logFlag<-1 - log_data$logDateErrorFlag<-0 - - if(any(log_data$dateUTC<"2018-01-01 00:00:00 UTC")){ - log$debug(base::paste0("Data contains dates prior to NEON logging implementation. 
Attempt will be made to align and flag data.")) - logDateError<-which(log_data$dateUTC<"2018-01-01 00:00:00 UTC") - if(logDateError[1]!=1){ #If there is a good date before the 1970 shift we can try to continue the data and add a flag - # sampling frequency - if(Context=='groundwater'){ - freq <- 300 #5 min in seconds - }else{ - freq <- 60 - } - idx_start<-logDateError[1] - idx_end<-logDateError[length(logDateError)] - time_last_read<-log_data$dateUTC[idx_start-1] - first_new_time<-time_last_read + freq - num_readings<-length(logDateError) - new_times <- seq(first_new_time, by = freq, length = num_readings) - log_data$dateUTC[idx_start:idx_end]<-new_times - log_data$logDateErrorFlag[idx_start:idx_end]<-1 - }else{ - #cannot use log data with bad dates - #log$debug(base::paste0("Log data contains erroneous dates that cannot be linked to the correct time.")) - log_data<-log_data[log_data$dateUTC>"2018-01-01 00:00:00 UTC",] - log$debug(base::paste0('File Error: ALL DATA 1970 in ', FileIn)) - } - } - if(nrow(log_data)>0){ - log_data$readout_time<-log_data$dateUTC - - #round to minute - if(Context=='surfacewater'){ - log_data$readout_time<-lubridate::round_date(log_data$dateUTC,unit = "minute") - }else if(Context=='groundwater'){ - log_data$readout_time<-lubridate::round_date(log_data$dateUTC,unit = "5 minutes") - } - - log_data$day<-lubridate::floor_date(log_data$dateUTC,"days") - - log_data$source_id<-Asset - - #format output file - #create any missing columns in log file - if(!'pressure' %in% names(log_data)){log_data$pressure<-NA} - if(!'temperature' %in% names(log_data)){log_data$temperature<-NA} - if(sensor=='aquatroll200'){ - if(!'conductivity' %in% names(log_data)){log_data$conductivity<-NA} - out_columns <- c('source_id','readout_time','pressure','temperature','conductivity','logFlag','logDateErrorFlag','day') - }else if(sensor=='leveltroll500'){ - out_columns <- c('source_id','readout_time','pressure','temperature','logFlag','logDateErrorFlag','day') - } - 
out<-log_data[out_columns] - - first_reading<-log_data$dateUTC[1] - if(length(log_data$dateUTC)>0){ - last_reading<-log_data$dateUTC[length(log_data$dateUTC)] - }else{ - last_reading<-NA - } - - ###subset into 1-day data files - all_days<-split(out, as.Date(out$day)) - #output daily files - if(length(all_days)>0){ - for(j in 1:length(all_days)){ - #create DF - out_file <- as.data.frame(all_days[j]) - colnames(out_file) <- out_columns - year <- substr(out_file$day[1],1,4) - month <- substr(out_file$day[1],6,7) - day <- substr(out_file$day[1],9,10) - if(sensor=='aquatroll200'){ - out_file <- out_file[,c('source_id','readout_time','pressure','temperature','conductivity','logFlag','logDateErrorFlag')] - }else if(sensor=='leveltroll500'){ - out_file <- out_file[,c('source_id','readout_time','pressure','temperature','logFlag','logDateErrorFlag')] - } - #create output directory - DirOutLogFile <- paste0(DirOut,'/',sensor,'/',year,'/',month,'/',day,'/',Asset,'/data/') - base::dir.create(DirOutLogFile,recursive=TRUE) - csv_name <-paste0(sensor,'_',Asset,'_',year,'-',month,'-',day,'_log') - - rptOut <- try(NEONprocIS.base::def.wrte.parq(data = out_file, - NameFile = base::paste0(DirOutLogFile,csv_name,".parquet"), - Schm = SchmDataOut),silent=TRUE) - if(class(rptOut)[1] == 'try-error'){ - log$error(base::paste0('Cannot write Data to ',base::paste0(DirOutLogFile,csv_name,".parquet"),'. ',attr(rptOut, "condition"))) - stop() - } else { - log$info(base::paste0('Data written successfully in ', base::paste0(DirOutLogFile,csv_name,".parquet"))) - } - }#end of days loop - }else{ - log$error(base::paste0('No days can be written out for ', FileIn)) - } - } - } -} #end of file +} # End of file From 26a1fafa14c2c7c2b6286a13b30a3d37b75eb638 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Fri, 18 Apr 2025 15:02:28 -0600 Subject: [PATCH 009/182] Update to SUNA log file formatter that pulls asset information from folder name. 
--- flow/flow.suna.logfiles/wrap.suna.logfiles.R | 21 ++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/flow/flow.suna.logfiles/wrap.suna.logfiles.R b/flow/flow.suna.logfiles/wrap.suna.logfiles.R index 6c9ed0e42..64080f10e 100644 --- a/flow/flow.suna.logfiles/wrap.suna.logfiles.R +++ b/flow/flow.suna.logfiles/wrap.suna.logfiles.R @@ -22,7 +22,7 @@ #' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init #' for more details. #' -#' @return Cleaned SUNA log files in daily parquets. +#' @return Data from SUNA log files in daily parquets. #' #' @references #' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 @@ -76,6 +76,13 @@ wrap.suna.logfiles <- function(FileIn, # Separate data and metadata log_data<-log_file[start:(nrow(log_file)),] log_metadata<-log_file[1:(start-1),2:6] + + # Gets metadata + sensor<-"suna" + serial_number<-log_metadata[1,2] + asset_string <- regexpr("\\/[0-9]{5}\\/",FileIn) #' For SUNA asset info not included in log file header. Need it from input file folder name. 
+ asset<-gsub("\\/","",substr(FileIn,asset_string[1],asset_string[1]+attributes(asset_string)$match.length-1)) + # Create column names for data names(log_data)<-c("serial_number","date","time","nitrate_uM","nitrate_mgL","absorbance_254","absorbance_350","bromide", "spec_avg","dark_value","int_time_factor", @@ -108,26 +115,24 @@ wrap.suna.logfiles <- function(FileIn, "internal_temp","spec_temp","lamp_temp","cum_lamp_time","humidity","main_volt","lamp_volt","internal_volt","current","fit_aux_1","fit_aux_2", "fit_base_1","fit_base_2","fit_RMSE","ctd_time","ctd_salinity","ctd_temp","ctd_pressure","check_sum") - # Gets metadata - sensor<-"suna" - serial_number<-log_metadata[1,2] - eprom<-"20349" #' Need to figure out a way to get this from the folder name the file came from since it's not included in the file itself - # Calculates the date and time in POSIXct format log_data$date<-lubridate::parse_date_time(as.character(log_data$date),order="yj") log_data$date<-lubridate::with_tz(log_data$date+(as.numeric(log_data$time)*60*60),'UTC') # Checks that there are no dates prior to when NEON began collecting IS data if(any(log_data$date<"2014-01-01 00:00:00 UTC")){ log$debug(base::paste0("Data contains dates prior to when NEON began collecting IS data"))} + # Checks that there are no future dates after the current date + if(any(log_data$date>Sys.time())){ + log$debug(base::paste0("Data contains future dates after the current date"))} # Output file # Create output directory year <- substr(log_data$date[1],1,4) month <- substr(log_data$date[1],6,7) day <- substr(log_data$date[1],9,10) - DirOutLogFile <- paste0(DirOut,'/',sensor,'/',year,'/',month,'/',day,'/',eprom,'/data/') + DirOutLogFile <- paste0(DirOut,'/',sensor,'/',year,'/',month,'/',day,'/',asset,'/data/') base::dir.create(DirOutLogFile,recursive=TRUE) - csv_name <-paste0(sensor,'_',eprom,'_',year,'-',month,'-',day,'_log') + csv_name <-paste0(sensor,'_',asset,'_',year,'-',month,'-',day,'_log') # Writes parquet file to 
output directory rptOut <- try(NEONprocIS.base::def.wrte.parq(data = log_data, NameFile = base::paste0(DirOutLogFile,csv_name,".parquet"), From 2fcaea8f993a00e8dd8e1727723cb7625e07a080 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 29 May 2025 16:42:44 -0600 Subject: [PATCH 010/182] update to sunav2 --- pipe/suna/pipe_list_suna.txt | 16 ------ .../sunav2/data_source_sunav2_list_years.json | 29 ----------- pipe/sunav2/pipe_list_sunav2.txt | 16 ++++++ pipe/{suna => sunav2}/site-list.json | 0 .../sunav2_calibrated_location_group.yaml | 51 ------------------- .../sunav2/sunav2_calibration_assignment.json | 47 ----------------- .../sunav2_calibration_assignment.yaml} | 8 +-- ...sunav2_calibration_group_and_convert.yaml} | 32 ++++++------ .../sunav2_calibration_list_files.yaml} | 4 +- .../sunav2_calibration_loader.yaml} | 6 +-- .../sunav2_cron_daily_and_date_control.yaml} | 6 +-- .../sunav2/sunav2_data_calibration_group.yaml | 51 ------------------- .../sunav2_data_source_kafka.yaml} | 12 ++--- .../sunav2_data_source_trino.yaml} | 4 +- ...sunav2_fill_date_gaps_and_regularize.yaml} | 26 +++++----- ...nav2_location_active_dates_assignment.json | 47 ----------------- ...av2_location_active_dates_assignment.yaml} | 8 +-- .../sunav2_location_asset.yaml} | 6 +-- .../sunav2_location_asset_assignment.json | 47 ----------------- .../sunav2_location_asset_assignment.yaml} | 8 +-- ...unav2_location_group_and_restructure.yaml} | 14 ++--- .../sunav2_location_loader.yaml} | 6 +-- .../sunav2/sunav2_merge_data_by_location.json | 34 ------------- .../sunav2_structure_repo_by_location.json | 32 ------------ 24 files changed, 86 insertions(+), 424 deletions(-) delete mode 100644 pipe/suna/pipe_list_suna.txt delete mode 100644 pipe/sunav2/data_source_sunav2_list_years.json create mode 100644 pipe/sunav2/pipe_list_sunav2.txt rename pipe/{suna => sunav2}/site-list.json (100%) delete mode 100644 pipe/sunav2/sunav2_calibrated_location_group.yaml delete mode 100644 
pipe/sunav2/sunav2_calibration_assignment.json rename pipe/{suna/suna_calibration_assignment.yaml => sunav2/sunav2_calibration_assignment.yaml} (89%) rename pipe/{suna/suna_calibration_group_and_convert.yaml => sunav2/sunav2_calibration_group_and_convert.yaml} (89%) rename pipe/{suna/suna_calibration_list_files.yaml => sunav2/sunav2_calibration_list_files.yaml} (91%) rename pipe/{suna/suna_calibration_loader.yaml => sunav2/sunav2_calibration_loader.yaml} (92%) rename pipe/{suna/suna_cron_daily_and_date_control.yaml => sunav2/sunav2_cron_daily_and_date_control.yaml} (94%) delete mode 100644 pipe/sunav2/sunav2_data_calibration_group.yaml rename pipe/{suna/suna_data_source_kafka.yaml => sunav2/sunav2_data_source_kafka.yaml} (96%) rename pipe/{suna/suna_data_source_trino.yaml => sunav2/sunav2_data_source_trino.yaml} (98%) rename pipe/{suna/suna_fill_date_gaps_and_regularize.yaml => sunav2/sunav2_fill_date_gaps_and_regularize.yaml} (78%) delete mode 100644 pipe/sunav2/sunav2_location_active_dates_assignment.json rename pipe/{suna/suna_location_active_dates_assignment.yaml => sunav2/sunav2_location_active_dates_assignment.yaml} (89%) rename pipe/{suna/suna_location_asset.yaml => sunav2/sunav2_location_asset.yaml} (91%) delete mode 100644 pipe/sunav2/sunav2_location_asset_assignment.json rename pipe/{suna/suna_location_asset_assignment.yaml => sunav2/sunav2_location_asset_assignment.yaml} (89%) rename pipe/{suna/suna_location_group_and_restructure.yaml => sunav2/sunav2_location_group_and_restructure.yaml} (91%) rename pipe/{suna/suna_location_loader.yaml => sunav2/sunav2_location_loader.yaml} (91%) delete mode 100644 pipe/sunav2/sunav2_merge_data_by_location.json delete mode 100644 pipe/sunav2/sunav2_structure_repo_by_location.json diff --git a/pipe/suna/pipe_list_suna.txt b/pipe/suna/pipe_list_suna.txt deleted file mode 100644 index c47513d44..000000000 --- a/pipe/suna/pipe_list_suna.txt +++ /dev/null @@ -1,16 +0,0 @@ -suna_cron_daily_and_date_control.yaml 
-suna_logjam_list_files.yaml -suna_logjam_load_files.yaml -suna_logjam_assign_clean_files.yaml -suna_data_source_kafka.yaml -suna_data_source_gcs.yaml -suna_calibration_list_files.yaml -suna_calibration_loader.yaml -suna_location_asset.yaml -suna_location_loader.yaml -suna_calibration_assignment.yaml -suna_location_asset_assignment.yaml -suna_location_active_dates_assignment.yaml -suna_calibration_group_and_convert.yaml -suna_location_group_and_restructure.yaml -suna_fill_date_gaps_and_regularize.yaml diff --git a/pipe/sunav2/data_source_sunav2_list_years.json b/pipe/sunav2/data_source_sunav2_list_years.json deleted file mode 100644 index 386f47bae..000000000 --- a/pipe/sunav2/data_source_sunav2_list_years.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "pipeline": { - "name": "data_source_sunav2_list_years" - }, - "transform": { - "image":"registry.access.redhat.com/ubi8/ubi-minimal:8.3", - "cmd": [ - "/bin/bash" - ], - "stdin": [ - "#!/bin/bash", - "ls $REPO_IN > /pfs/out/data_years.txt" - ] - }, - "input": { - "pfs": { - "name": "REPO_IN", - "repo": "data_source_sunav2", - "glob": "/sunav2", - "empty_files": true - } - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "1K", - "cpu": 0.01 - } -} diff --git a/pipe/sunav2/pipe_list_sunav2.txt b/pipe/sunav2/pipe_list_sunav2.txt new file mode 100644 index 000000000..9fb24f30a --- /dev/null +++ b/pipe/sunav2/pipe_list_sunav2.txt @@ -0,0 +1,16 @@ +sunav2_cron_daily_and_date_control.yaml +sunav2_logjam_list_files.yaml +sunav2_logjam_load_files.yaml +sunav2_logjam_assign_clean_files.yaml +sunav2_data_source_kafka.yaml +sunav2_data_source_trino.yaml +sunav2_calibration_list_files.yaml +sunav2_calibration_loader.yaml +sunav2_location_asset.yaml +sunav2_location_loader.yaml +sunav2_calibration_assignment.yaml +sunav2_location_asset_assignment.yaml +sunav2_location_active_dates_assignment.yaml +sunav2_calibration_group_and_convert.yaml +sunav2_location_group_and_restructure.yaml 
+sunav2_fill_date_gaps_and_regularize.yaml diff --git a/pipe/suna/site-list.json b/pipe/sunav2/site-list.json similarity index 100% rename from pipe/suna/site-list.json rename to pipe/sunav2/site-list.json diff --git a/pipe/sunav2/sunav2_calibrated_location_group.yaml b/pipe/sunav2/sunav2_calibrated_location_group.yaml deleted file mode 100644 index a044e48ce..000000000 --- a/pipe/sunav2/sunav2_calibrated_location_group.yaml +++ /dev/null @@ -1,51 +0,0 @@ ---- -pipeline: - name: sunav2_calibrated_location_group -transform: -# image_pull_secrets: [battelleecology-quay-read-all-pull-secret] - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-filt-join:v1.0.0 - cmd: ["/bin/bash"] - stdin: - - "#!/bin/bash" - - "python3 -m filter_joiner.filter_joiner_main" - env: - CONFIG: | - --- - # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. - # Metadata indices will typically begin at index 3. - input_paths: - - path: - name: DATA_PATH - # Filter for data directory - glob_pattern: /pfs/DATA_PATH/sunav2/*/*/*/*/** - # Join on named location (already joined below by day) - join_indices: [7] - outer_join: true - - path: - name: LOCATION_PATH - # Filter for data directory - glob_pattern: /pfs/LOCATION_PATH/sunav2/*/*/*/*/** - # Join on named location (already joined below by day) - join_indices: [7] - OUT_PATH: /pfs/out - LOG_LEVEL: INFO - RELATIVE_PATH_INDEX: "3" -input: - join: - - pfs: - name: DATA_PATH - repo: sunav2_data_calibration_group - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - outer_join: true - empty_files: true - - pfs: - name: LOCATION_PATH - repo: sunav2_location_asset_assignment - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - empty_files: true -parallelism_spec: - constant: "1" -enable_stats: false -standby: true diff --git a/pipe/sunav2/sunav2_calibration_assignment.json b/pipe/sunav2/sunav2_calibration_assignment.json deleted file mode 100644 index 7c255757d..000000000 --- 
a/pipe/sunav2/sunav2_calibration_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_calibration_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.cal.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "PadDay=-1|1" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v1.0.6", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "calibration", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "210M", - "cpu": 0.3 - }, - "parallelism_spec": { - "constant": "4" - } -} diff --git a/pipe/suna/suna_calibration_assignment.yaml b/pipe/sunav2/sunav2_calibration_assignment.yaml similarity index 89% rename from pipe/suna/suna_calibration_assignment.yaml rename to pipe/sunav2/sunav2_calibration_assignment.yaml index 3bb18ef16..cf7072399 100644 --- a/pipe/suna/suna_calibration_assignment.yaml +++ b/pipe/sunav2/sunav2_calibration_assignment.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_calibration_assignment + name: sunav2_calibration_assignment transform: cmd: ["/bin/bash"] stdin: @@ -21,11 +21,11 @@ input: cross: - pfs: name: DIR_IN - repo: suna_calibration_loader - glob: /suna/* + repo: sunav2_calibration_loader + glob: /sunav2/* - pfs: name: FILE_YEAR - repo: suna_cron_daily_and_date_control + repo: sunav2_cron_daily_and_date_control glob: /data_year*.txt parallelism_spec: constant: 2 diff --git a/pipe/suna/suna_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml similarity index 89% rename from pipe/suna/suna_calibration_group_and_convert.yaml rename to pipe/sunav2/sunav2_calibration_group_and_convert.yaml index 3c4a95096..7890fd170 100644 
--- a/pipe/suna/suna_calibration_group_and_convert.yaml +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_calibration_group_and_convert + name: sunav2_calibration_group_and_convert transform: image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-grp-conv:v1.3.0 cmd: @@ -80,14 +80,14 @@ transform: - path: name: DATA_PATH_KAFKA_MERGED # Filter for data directory - glob_pattern: /tmp/kafka_merged/suna/*/*/*/*/** + glob_pattern: /tmp/kafka_merged/sunav2/*/*/*/*/** # Join on named location (already joined below by day) join_indices: [7] outer_join: true - path: name: CALIBRATION_PATH # Filter for data directory - glob_pattern: /pfs/CALIBRATION_PATH/suna/*/*/*/*/** + glob_pattern: /pfs/CALIBRATION_PATH/sunav2/*/*/*/*/** # Join on named location (already joined below by day) join_indices: [7] OUT_PATH: /tmp/pfs/filter_joined # Note that R modules use "pfs" in the path structure to determine datums @@ -100,40 +100,40 @@ input: cross: - pfs: name: FILE_SCHEMA_L0 - repo: suna_avro_schemas - glob: /suna/suna.avsc + repo: sunav2_avro_schemas + glob: /sunav2/sunav2.avsc - pfs: name: FILE_SCHEMA_DATA - repo: suna_avro_schemas - glob: /suna/suna_calibrated.avsc + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_calibrated.avsc - pfs: name: FILE_SCHEMA_FLAGS - repo: suna_avro_schemas - glob: /suna/flags_calibration_suna.avsc + repo: sunav2_avro_schemas + glob: /sunav2/flags_calibration_sunav2.avsc - pfs: name: FILE_UNCERTAINTY_FDAS - repo: suna_uncertainty_fdas + repo: sunav2_uncertainty_fdas glob: /fdas_calibration_uncertainty_general.json # Outer join all days so that varying sensors between kafka and archive loaders will all get joined with calibrations. Filter-joiner will narrow down. 
- join: - pfs: name: CALIBRATION_PATH - repo: suna_calibration_assignment - glob: /suna/(*)/(*)/(*) + repo: sunav2_calibration_assignment + glob: /sunav2/(*)/(*)/(*) joinOn: $1/$2/$3 outer_join: true empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - pfs: name: DATA_PATH_ARCHIVE - repo: suna_data_source_trino - glob: /suna/(*)/(*)/(*) + repo: sunav2_data_source_trino + glob: /sunav2/(*)/(*)/(*) joinOn: $1/$2/$3 outer_join: true empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - pfs: name: KAFKA_UNMERGED_DATA - repo: suna_data_source_kafka - glob: /suna/(*)/(*)/(*) + repo: sunav2_data_source_kafka + glob: /sunav2/(*)/(*)/(*) joinOn: $1/$2/$3 outer_join: true empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. diff --git a/pipe/suna/suna_calibration_list_files.yaml b/pipe/sunav2/sunav2_calibration_list_files.yaml similarity index 91% rename from pipe/suna/suna_calibration_list_files.yaml rename to pipe/sunav2/sunav2_calibration_list_files.yaml index a87c26a04..e7cb05b79 100644 --- a/pipe/suna/suna_calibration_list_files.yaml +++ b/pipe/sunav2/sunav2_calibration_list_files.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_calibration_list_files + name: sunav2_calibration_list_files transform: image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v2.3.3 cmd: ["/bin/bash"] @@ -12,7 +12,7 @@ transform: - python3 -m calval_loader.calval_loader input: pfs: - repo: suna_cron_daily_and_date_control_tick + repo: sunav2_cron_daily_and_date_control_tick glob: /* empty_files: true autoscaling: true diff --git a/pipe/suna/suna_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml similarity index 92% rename from pipe/suna/suna_calibration_loader.yaml rename to pipe/sunav2/sunav2_calibration_loader.yaml index 551f9c58e..35da08966 100644 --- 
a/pipe/suna/suna_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_calibration_loader + name: sunav2_calibration_loader transform: # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret @@ -14,7 +14,7 @@ transform: CVAL_INGEST_BUCKET: neon-cval OUT_PATH: /pfs/out LOG_LEVEL: INFO - SOURCE_TYPE: suna + SOURCE_TYPE: sunav2 STARTING_PATH_INDEX: "5" secrets: - name: pdr-secret @@ -22,7 +22,7 @@ transform: input: pfs: name: IN_PATH - repo: suna_calibration_list_files + repo: sunav2_calibration_list_files glob: /*/*/*/* empty_files: true parallelism_spec: diff --git a/pipe/suna/suna_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml similarity index 94% rename from pipe/suna/suna_cron_daily_and_date_control.yaml rename to pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index 4be0808dc..97f6c00dd 100644 --- a/pipe/suna/suna_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_cron_daily_and_date_control + name: sunav2_cron_daily_and_date_control transform: image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-date-cntl:v2.0.1 cmd: ["/bin/bash"] @@ -13,7 +13,7 @@ transform: OUT_PATH: /pfs/out START_DATE: "2024-07-28" # Inclusive END_DATE: "2027-08-10" # Inclusive - SOURCE_TYPE: "suna" + SOURCE_TYPE: "sunav2" stdin: - "#!/bin/bash" - python3 -m cron_daily_and_date_control.cron_daily_and_date_control_main @@ -26,7 +26,7 @@ input: overwrite: true - pfs: name: SITE_FILE - repo: suna_site_list + repo: sunav2_site_list glob: /site-list.json resource_requests: memory: 100M diff --git a/pipe/sunav2/sunav2_data_calibration_group.yaml b/pipe/sunav2/sunav2_data_calibration_group.yaml deleted file mode 100644 index 8e54d6158..000000000 --- a/pipe/sunav2/sunav2_data_calibration_group.yaml +++ /dev/null @@ -1,51 +0,0 @@ ---- -pipeline: - name: 
sunav2_data_calibration_group -transform: -# image_pull_secrets: [battelleecology-quay-read-all-pull-secret] - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-filt-join:v1.0.0 - cmd: ["/bin/bash"] - stdin: - - "#!/bin/bash" - - "python3 -m filter_joiner.filter_joiner_main" - env: - CONFIG: | - --- - # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. - # Metadata indices will typically begin at index 3. - input_paths: - - path: - name: DATA_PATH - # Filter for data directory - glob_pattern: /pfs/DATA_PATH/sunav2/*/*/*/*/** - # Join on named location (already joined below by day) - join_indices: [7] - outer_join: true - - path: - name: CALIBRATION_PATH - # Filter for data directory - glob_pattern: /pfs/CALIBRATION_PATH/sunav2/*/*/*/*/** - # Join on named location (already joined below by day) - join_indices: [7] - OUT_PATH: /pfs/out - LOG_LEVEL: INFO - RELATIVE_PATH_INDEX: "3" -input: - join: - - pfs: - name: DATA_PATH - repo: data_source_sunav2 - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - outer_join: true - empty_files: true - - pfs: - name: CALIBRATION_PATH - repo: sunav2_calibration_assignment - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - empty_files: true -parallelism_spec: - constant: "1" -enable_stats: false -standby: true diff --git a/pipe/suna/suna_data_source_kafka.yaml b/pipe/sunav2/sunav2_data_source_kafka.yaml similarity index 96% rename from pipe/suna/suna_data_source_kafka.yaml rename to pipe/sunav2/sunav2_data_source_kafka.yaml index c1153e380..b9bf26a75 100644 --- a/pipe/suna/suna_data_source_kafka.yaml +++ b/pipe/sunav2/sunav2_data_source_kafka.yaml @@ -1,13 +1,13 @@ --- pipeline: - name: suna_data_source_kafka + name: sunav2_data_source_kafka transform: - image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.4.0 + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.7.0 image_pull_secrets: - 
battelleecology-quay-read-all-pull-secret env: OUT_PATH: /pfs/out - SOURCE_TYPE: "suna" + SOURCE_TYPE: "sunav2" LOG_LEVEL: INFO YEAR_INDEX: "5" MONTH_INDEX: "6" @@ -107,7 +107,7 @@ transform: linkdir=$(mktemp -d) shopt -s globstar out_parquet_glob="${OUT_PATH}/**/*.parquet" - # /pfs/out/suna/2023/01/01/12345/data/file.parquet + # /pfs/out/li191r/2023/01/01/12345/data/file.parquet echo "Linking output files to ${linkdir}" # set -x # Uncomment for debugging for f in $out_parquet_glob; do @@ -141,9 +141,9 @@ transform: input: pfs: name: import_trigger - repo: suna_cron_daily_and_date_control + repo: sunav2_cron_daily_and_date_control # Must be datum by day (e.g. /SOURCE_TYPE/*/*/*) or by day/site (e.g. /SOURCE_TYPE/*/*/*/*) - glob: "/suna/*/*/*" + glob: "/sunav2/*/*/*" parallelism_spec: constant: 3 autoscaling: true diff --git a/pipe/suna/suna_data_source_trino.yaml b/pipe/sunav2/sunav2_data_source_trino.yaml similarity index 98% rename from pipe/suna/suna_data_source_trino.yaml rename to pipe/sunav2/sunav2_data_source_trino.yaml index 0769e5327..24c66fe65 100644 --- a/pipe/suna/suna_data_source_trino.yaml +++ b/pipe/sunav2/sunav2_data_source_trino.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_data_source_trino + name: sunav2_data_source_trino transform: image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-data-src-trino:v2.1.1 cmd: @@ -125,7 +125,7 @@ transform: input: pfs: name: import_trigger - repo: suna_cron_daily_and_date_control + repo: sunav2_cron_daily_and_date_control glob: "/*/*/*/*" output_branch: master parallelism_spec: diff --git a/pipe/suna/suna_fill_date_gaps_and_regularize.yaml b/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml similarity index 78% rename from pipe/suna/suna_fill_date_gaps_and_regularize.yaml rename to pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml index 45cb7b7b3..370d181c8 100644 --- a/pipe/suna/suna_fill_date_gaps_and_regularize.yaml +++ b/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml @@ 
-1,6 +1,6 @@ --- pipeline: - name: suna_fill_date_gaps_and_regularize + name: sunav2_fill_date_gaps_and_regularize transform: image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-gf-rglr:v1.1.0 cmd: @@ -53,25 +53,25 @@ input: cross: - pfs: name: EMPTY_FILE_PATH - repo: suna_empty_files - glob: /suna + repo: sunav2_empty_files + glob: /sunav2 empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - group: - pfs: name: DATA_PATH - repo: suna_location_group_and_restructure - # For full-scale daily processing, glob should be /suna/(*/*/*). To limit to particular CFGLOCs, note the parentheses and enter something like /suna/(*/*/*/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) - glob: /suna/(*/*/*) - #glob: /suna/(*/*/*/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) + repo: sunav2_location_group_and_restructure + # For full-scale daily processing, glob should be /sunav2/(*/*/*). To limit to particular CFGLOCs, note the parentheses and enter something like /sunav2/(*/*/*/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) + glob: /sunav2/(*/*/*) + #glob: /sunav2/(*/*/*/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) group_by: $1 empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - join: - pfs: name: LOCATION_PATH - repo: suna_location_active_dates_assignment - # For full-scale daily processing, glob should be /suna/(*/*/*). 
To limit to particular CFGLOCs, note the parentheses and enter something like /suna/((*/*/*)/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) - glob: /suna/(*/*/*) - #glob: /suna/((*/*/*)/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) + repo: sunav2_location_active_dates_assignment + # For full-scale daily processing, glob should be /sunav2/(*/*/*). To limit to particular CFGLOCs, note the parentheses and enter something like /sunav2/((*/*/*)/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) + glob: /sunav2/(*/*/*) + #glob: /sunav2/((*/*/*)/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) # For full-scale daily processing, joinOn be $1. When limiting to particular CFGLOCs, joinOn will be $2 to match parentheses around (*/*/*) joinOn: $1 #joinOn: $2 @@ -79,8 +79,8 @@ input: empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
- pfs: name: DATE_LIMITER_PATH - repo: suna_cron_daily_and_date_control - glob: /suna/(*/*/*) + repo: sunav2_cron_daily_and_date_control + glob: /sunav2/(*/*/*) joinOn: $1 group_by: $1 empty_files: true # This can remain true even if LINK_TYPE=COPY diff --git a/pipe/sunav2/sunav2_location_active_dates_assignment.json b/pipe/sunav2/sunav2_location_active_dates_assignment.json deleted file mode 100644 index 316548474..000000000 --- a/pipe/sunav2/sunav2_location_active_dates_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_location_active_dates_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.grp.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "TypeFile=namedLocation" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.0", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "location_loader", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "200M", - "cpu": 1 - }, - "parallelism_spec": { - "constant": "8" - } -} diff --git a/pipe/suna/suna_location_active_dates_assignment.yaml b/pipe/sunav2/sunav2_location_active_dates_assignment.yaml similarity index 89% rename from pipe/suna/suna_location_active_dates_assignment.yaml rename to pipe/sunav2/sunav2_location_active_dates_assignment.yaml index 535a0a8d8..e8961ac81 100644 --- a/pipe/suna/suna_location_active_dates_assignment.yaml +++ b/pipe/sunav2/sunav2_location_active_dates_assignment.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_location_active_dates_assignment + name: sunav2_location_active_dates_assignment transform: cmd: ["/bin/bash"] stdin: @@ -23,11 +23,11 @@ input: cross: - pfs: name: DIR_IN - 
repo: suna_location_loader - glob: /suna/* + repo: sunav2_location_loader + glob: /sunav2/* - pfs: name: FILE_YEAR - repo: suna_cron_daily_and_date_control + repo: sunav2_cron_daily_and_date_control glob: /data_year*.txt parallelism_spec: constant: 2 diff --git a/pipe/suna/suna_location_asset.yaml b/pipe/sunav2/sunav2_location_asset.yaml similarity index 91% rename from pipe/suna/suna_location_asset.yaml rename to pipe/sunav2/sunav2_location_asset.yaml index 53d66749f..cca69b65a 100644 --- a/pipe/suna/suna_location_asset.yaml +++ b/pipe/sunav2/sunav2_location_asset.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_location_asset + name: sunav2_location_asset transform: # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret @@ -15,13 +15,13 @@ transform: # ERR_PATH can be changed, it is user specified ERR_PATH: /pfs/out/errored_datums LOG_LEVEL: INFO - SOURCE_TYPE: suna + SOURCE_TYPE: sunav2 secrets: - name: pdr-secret mount_path: /var/db_secret input: pfs: - repo: suna_cron_daily_and_date_control_tick + repo: sunav2_cron_daily_and_date_control_tick glob: /* empty_files: true autoscaling: true diff --git a/pipe/sunav2/sunav2_location_asset_assignment.json b/pipe/sunav2/sunav2_location_asset_assignment.json deleted file mode 100644 index 017a818f4..000000000 --- a/pipe/sunav2/sunav2_location_asset_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_location_asset_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.grp.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "TypeFile=asset" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.0", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "location_asset", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": 
"/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "210M", - "cpu": 0.3 - }, - "parallelism_spec": { - "constant": "8" - } -} diff --git a/pipe/suna/suna_location_asset_assignment.yaml b/pipe/sunav2/sunav2_location_asset_assignment.yaml similarity index 89% rename from pipe/suna/suna_location_asset_assignment.yaml rename to pipe/sunav2/sunav2_location_asset_assignment.yaml index 8ad6031fb..074ecf4b5 100644 --- a/pipe/suna/suna_location_asset_assignment.yaml +++ b/pipe/sunav2/sunav2_location_asset_assignment.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_location_asset_assignment + name: sunav2_location_asset_assignment transform: cmd: ["/bin/bash"] stdin: @@ -23,11 +23,11 @@ input: cross: - pfs: name: DIR_IN - repo: suna_location_asset - glob: /suna/* + repo: sunav2_location_asset + glob: /sunav2/* - pfs: name: FILE_YEAR - repo: suna_cron_daily_and_date_control + repo: sunav2_cron_daily_and_date_control glob: /data_year*.txt parallelism_spec: constant: 2 diff --git a/pipe/suna/suna_location_group_and_restructure.yaml b/pipe/sunav2/sunav2_location_group_and_restructure.yaml similarity index 91% rename from pipe/suna/suna_location_group_and_restructure.yaml rename to pipe/sunav2/sunav2_location_group_and_restructure.yaml index 9a9b6221b..4adf27e22 100644 --- a/pipe/suna/suna_location_group_and_restructure.yaml +++ b/pipe/sunav2/sunav2_location_group_and_restructure.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_location_group_and_restructure + name: sunav2_location_group_and_restructure transform: image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-strc-comb:v1.2.0 cmd: @@ -47,14 +47,14 @@ transform: - path: name: DATA_PATH # Filter for data directory - glob_pattern: /pfs/DATA_PATH/suna/*/*/*/*/** + glob_pattern: /pfs/DATA_PATH/sunav2/*/*/*/*/** # Join on named location (already joined below by day) join_indices: [7] outer_join: true - path: name: LOCATION_PATH # Filter 
for data directory - glob_pattern: /pfs/LOCATION_PATH/suna/*/*/*/*/** + glob_pattern: /pfs/LOCATION_PATH/sunav2/*/*/*/*/** # Join on named location (already joined below by day) join_indices: [7] OUT_PATH: /tmp/pfs/filter_joined @@ -67,15 +67,15 @@ input: join: - pfs: name: DATA_PATH - repo: suna_calibration_group_and_convert - glob: /suna/(*)/(*)/(*) + repo: sunav2_calibration_group_and_convert + glob: /sunav2/(*)/(*)/(*) joinOn: $1/$2/$3 outer_join: true empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - pfs: name: LOCATION_PATH - repo: suna_location_asset_assignment - glob: /suna/(*)/(*)/(*) + repo: sunav2_location_asset_assignment + glob: /sunav2/(*)/(*)/(*) joinOn: $1/$2/$3 empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. parallelism_spec: diff --git a/pipe/suna/suna_location_loader.yaml b/pipe/sunav2/sunav2_location_loader.yaml similarity index 91% rename from pipe/suna/suna_location_loader.yaml rename to pipe/sunav2/sunav2_location_loader.yaml index 6c1fa1bd5..672ea2797 100644 --- a/pipe/suna/suna_location_loader.yaml +++ b/pipe/sunav2/sunav2_location_loader.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_location_loader + name: sunav2_location_loader transform: # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret @@ -12,7 +12,7 @@ transform: - python3 -m location_loader.location_loader_main env: LOCATION_TYPE: CONFIG - SOURCE_TYPE: suna + SOURCE_TYPE: sunav2 OUT_PATH: /pfs/out # ERR_PATH can be changed, it is user specified ERR_PATH: /pfs/out/errored_datums @@ -22,7 +22,7 @@ transform: mount_path: /var/db_secret input: pfs: - repo: suna_cron_daily_and_date_control_tick + repo: sunav2_cron_daily_and_date_control_tick glob: /* empty_files: true autoscaling: true diff --git a/pipe/sunav2/sunav2_merge_data_by_location.json b/pipe/sunav2/sunav2_merge_data_by_location.json deleted file mode 100644 index 5644fd32f..000000000 
--- a/pipe/sunav2/sunav2_merge_data_by_location.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_merge_data_by_location" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.data.trnc.comb.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "DirSubCombData=data", - "DirSubCopy=location|calibration" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-data-trnc-comb:v1.1.0", - "env": { - "LOG_LEVEL": "INFO", - "PARALLELIZATION_INTERNAL": "1" - } - }, - "input": { - "pfs": { - "name": "DIR_IN", - "repo": "sunav2_structure_repo_by_location", - "glob": "/sunav2/*/*/*" - } - }, - "enable_stats": false, - "standby": false, - "resource_requests": { - "memory": "80M", - "cpu": 0.3 - } -} diff --git a/pipe/sunav2/sunav2_structure_repo_by_location.json b/pipe/sunav2/sunav2_structure_repo_by_location.json deleted file mode 100644 index bb6f52494..000000000 --- a/pipe/sunav2/sunav2_structure_repo_by_location.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_structure_repo_by_location" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.repo.strc.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "Comb=TRUE" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-repo-strc:sha-86151412", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "pfs": { - "name": "DIR_IN", - "repo": "sunav2_calibrated_location_group", - "glob": "/sunav2/*/*/*" - } - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "100M", - "cpu": 0.13 - } -} From b6e16abcd9fa322cfff5541cad4ff65e6a017cdb Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 29 May 2025 17:00:12 -0600 Subject: [PATCH 011/182] debug --- pipe/sunav2/sunav2_data_source_kafka.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_data_source_kafka.yaml 
b/pipe/sunav2/sunav2_data_source_kafka.yaml index b9bf26a75..ff42aab76 100644 --- a/pipe/sunav2/sunav2_data_source_kafka.yaml +++ b/pipe/sunav2/sunav2_data_source_kafka.yaml @@ -8,7 +8,7 @@ transform: env: OUT_PATH: /pfs/out SOURCE_TYPE: "sunav2" - LOG_LEVEL: INFO + LOG_LEVEL: DEBUG YEAR_INDEX: "5" MONTH_INDEX: "6" DAY_INDEX: "7" From d65913361a2c6619d97c703e18c7265551dbf8bc Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 29 May 2025 17:17:34 -0600 Subject: [PATCH 012/182] update dates --- pipe/sunav2/site-list.json | 6 ++++-- pipe/sunav2/sunav2_cron_daily_and_date_control.yaml | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pipe/sunav2/site-list.json b/pipe/sunav2/site-list.json index 42a514c5e..72c62c1df 100644 --- a/pipe/sunav2/site-list.json +++ b/pipe/sunav2/site-list.json @@ -1,8 +1,10 @@ [ { - "site" : "CRAM" + "site" : "CRAM", + "kafka_start_date" : "2024-03-01" }, { - "site" : "LECO" + "site" : "LECO", + "kafka_start_date" : "2024-03-01" } ] \ No newline at end of file diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index 97f6c00dd..c1b18c95c 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -12,7 +12,7 @@ transform: # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. 
OUT_PATH: /pfs/out START_DATE: "2024-07-28" # Inclusive - END_DATE: "2027-08-10" # Inclusive + END_DATE: "2024-08-10" # Inclusive SOURCE_TYPE: "sunav2" stdin: - "#!/bin/bash" From 5be5712a2fd342dd3a4bc65a1010f91574a16a84 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 30 May 2025 09:44:09 -0600 Subject: [PATCH 013/182] latest --- pipe/sunav2/sunav2_cron_daily_and_date_control.yaml | 4 ++-- pipe/sunav2/sunav2_data_source_kafka.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index c1b18c95c..f37b37c41 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -11,8 +11,8 @@ transform: # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. OUT_PATH: /pfs/out - START_DATE: "2024-07-28" # Inclusive - END_DATE: "2024-08-10" # Inclusive + START_DATE: "2025-05-18" # Inclusive + END_DATE: "2025-05-28" # Inclusive SOURCE_TYPE: "sunav2" stdin: - "#!/bin/bash" diff --git a/pipe/sunav2/sunav2_data_source_kafka.yaml b/pipe/sunav2/sunav2_data_source_kafka.yaml index ff42aab76..d3d0e48b5 100644 --- a/pipe/sunav2/sunav2_data_source_kafka.yaml +++ b/pipe/sunav2/sunav2_data_source_kafka.yaml @@ -7,7 +7,7 @@ transform: - battelleecology-quay-read-all-pull-secret env: OUT_PATH: /pfs/out - SOURCE_TYPE: "sunav2" + SOURCE_TYPE: "sunav2_raw" LOG_LEVEL: DEBUG YEAR_INDEX: "5" MONTH_INDEX: "6" From 18dd3ca0e7cb8cd24c7d5822be7fdf5047fc0101 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 30 May 2025 11:09:35 -0600 Subject: [PATCH 014/182] suna trino loader --- pipe/sunav2/sunav2_cron_daily_and_date_control.yaml | 4 ++-- pipe/sunav2/sunav2_data_source_trino.yaml | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git 
a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index f37b37c41..678a1faa9 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -11,8 +11,8 @@ transform: # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. OUT_PATH: /pfs/out - START_DATE: "2025-05-18" # Inclusive - END_DATE: "2025-05-28" # Inclusive + START_DATE: "2024-09-05" # Inclusive + END_DATE: "2024-09-15" # Inclusive SOURCE_TYPE: "sunav2" stdin: - "#!/bin/bash" diff --git a/pipe/sunav2/sunav2_data_source_trino.yaml b/pipe/sunav2/sunav2_data_source_trino.yaml index 24c66fe65..d8bb50554 100644 --- a/pipe/sunav2/sunav2_data_source_trino.yaml +++ b/pipe/sunav2/sunav2_data_source_trino.yaml @@ -26,8 +26,6 @@ transform: echo "Processing $path" p=${path#/pfs} IFS="/"; arr=($p); unset IFS; - source_type=${arr[2]} - export SOURCE_TYPE=$source_type year=${arr[3]} month=${arr[4]} day=${arr[5]} @@ -48,7 +46,7 @@ transform: export GEN_SITE_NAME=$site export REQUESTS_CA_BUNDLE=/etc/pki/tls/cert.pem export GEN_YAML_CONF="/usr/src/app/genscript/configs/$(echo $SOURCE_TYPE)_streams.yaml" - export GEN_SCHEMA_FILE="/usr/src/app/schemas/$(echo $SOURCE_TYPE)/$(echo $SOURCE_TYPE).avsc" + export GEN_SCHEMA_FILE="/usr/src/app/schemas/$(echo $SOURCE_TYPE)/$(echo $SOURCE_TYPE)_raw.avsc" echo "Extracting $SOURCE_TYPE from Trino for $year/$month/$day/$site" export GEN_OUTPUT_DIR=$interimDir/$SOURCE_TYPE/$year/$month/$day mkdir -p $GEN_OUTPUT_DIR @@ -98,7 +96,7 @@ transform: EOF env: # Static environment variables for data conversion step - LOG_LEVEL: INFO + LOG_LEVEL: DEBUG REQUESTS_CA_BUNDLE: "/etc/pki/tls/cert.pem" # Environment variables for linkmerge step IN_PATH: /tmp/interimData @@ -109,6 +107,7 @@ transform: DAY_INDEX: '6' SOURCE_ID_INDEX: '7' 
KAFKA_RETENTION_DAYS: "15" + SOURCE_TYPE: "sunav2" secrets: - name: pachd-trino-secret key: TRINO_HOST @@ -126,7 +125,7 @@ input: pfs: name: import_trigger repo: sunav2_cron_daily_and_date_control - glob: "/*/*/*/*" + glob: "/sunav2/*/*/*" output_branch: master parallelism_spec: constant: 5 From b39dacc4106b0b0b447aae570e03accf4e3d7c82 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 27 Jun 2025 08:26:59 -0600 Subject: [PATCH 015/182] fast forward --- .../sunav2/sunav2_calibration_assignment.json | 47 ++++++++++++++++ pipe/sunav2/sunav2_data_parser.yaml | 55 +++++++++++++++++++ ...nav2_location_active_dates_assignment.json | 47 ++++++++++++++++ .../sunav2_location_asset_assignment.json | 47 ++++++++++++++++ .../sunav2/sunav2_merge_data_by_location.json | 34 ++++++++++++ .../sunav2_structure_repo_by_location.json | 32 +++++++++++ 6 files changed, 262 insertions(+) create mode 100644 pipe/sunav2/sunav2_calibration_assignment.json create mode 100644 pipe/sunav2/sunav2_data_parser.yaml create mode 100644 pipe/sunav2/sunav2_location_active_dates_assignment.json create mode 100644 pipe/sunav2/sunav2_location_asset_assignment.json create mode 100644 pipe/sunav2/sunav2_merge_data_by_location.json create mode 100644 pipe/sunav2/sunav2_structure_repo_by_location.json diff --git a/pipe/sunav2/sunav2_calibration_assignment.json b/pipe/sunav2/sunav2_calibration_assignment.json new file mode 100644 index 000000000..0671e27e0 --- /dev/null +++ b/pipe/sunav2/sunav2_calibration_assignment.json @@ -0,0 +1,47 @@ +{ + "pipeline": { + "name": "sunav2_calibration_assignment" + }, + "transform": { + "cmd": [ + "Rscript", + "./flow.cal.asgn.R", + "DirIn=$DIR_IN", + "DirOut=/pfs/out", + "DirErr=/pfs/out/errored_datums", + "FileYear=$FILE_YEAR", + "PadDay=-1|1" + ], + "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v2.0.2", + "env": { + "LOG_LEVEL": "INFO" + } + }, + "input": { + "cross": [ + { + "pfs": { + "name": "DIR_IN", + "repo": "calibration", 
+ "glob": "/sunav2/*" + } + }, + { + "pfs": { + "name": "FILE_YEAR", + "repo": "data_source_sunav2_list_years", + "glob": "/data_year*.txt" + } + } + ] + }, + "enable_stats": false, + "standby": true, + "resource_requests": { + "memory": "210M", + "cpu": 0.3 + }, + "parallelism_spec": { + "constant": "4" + } +} diff --git a/pipe/sunav2/sunav2_data_parser.yaml b/pipe/sunav2/sunav2_data_parser.yaml new file mode 100644 index 000000000..f494dde87 --- /dev/null +++ b/pipe/sunav2/sunav2_data_parser.yaml @@ -0,0 +1,55 @@ +pipeline: + name: sunav2_data_parser +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-raw-data-parser:v4.3.0 + cmd: + - /bin/bash + stdin: + - '#!/bin/bash' + - python3 -m raw_data_parser.data_parser_main + env: + # if use default PARSED_START_INDEX and PARSED_END_INDEX, parse all elements in parse_field + # if use default for FIELD_START_INDEX and FIELD_END_INDEX, + # skip first 3 fields (source_id, site_id, readout_time) in parsed schema + LOG_LEVEL: INFO + OUT_PATH: /pfs/out + PARSE_FIELD: serial_output + RELATIVE_PATH_INDEX: "4" + PARSED_SCHEMA_PATH: /usr/src/app/schemas/sunav2/sunav2.avsc + SOURCE_TYPE: 'sunav2_raw' +input: + pfs: + name: DATA_PATH + repo: sunav2_data_source_kafka + glob: /sunav2_raw/*/*/* +parallelism_spec: + constant: 3 +autoscaling: true +resource_requests: + memory: 500M + cpu: 0.5 +resource_limits: + memory: 1G + cpu: 1.5 +sidecar_resource_requests: + memory: 2G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: 'true' + nodepool.neonscience.org/pipeline: 'yes' + cloud.google.com/gke-spot: 'true' +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/sunav2/sunav2_location_active_dates_assignment.json 
b/pipe/sunav2/sunav2_location_active_dates_assignment.json new file mode 100644 index 000000000..44350351f --- /dev/null +++ b/pipe/sunav2/sunav2_location_active_dates_assignment.json @@ -0,0 +1,47 @@ +{ + "pipeline": { + "name": "sunav2_location_active_dates_assignment" + }, + "transform": { + "cmd": [ + "Rscript", + "./flow.loc.grp.asgn.R", + "DirIn=$DIR_IN", + "DirOut=/pfs/out", + "DirErr=/pfs/out/errored_datums", + "FileYear=$FILE_YEAR", + "TypeFile=namedLocation" + ], + "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.1", + "env": { + "LOG_LEVEL": "INFO" + } + }, + "input": { + "cross": [ + { + "pfs": { + "name": "DIR_IN", + "repo": "location_loader", + "glob": "/sunav2/*" + } + }, + { + "pfs": { + "name": "FILE_YEAR", + "repo": "data_source_sunav2_list_years", + "glob": "/data_year*.txt" + } + } + ] + }, + "enable_stats": false, + "standby": true, + "resource_requests": { + "memory": "200M", + "cpu": 1 + }, + "parallelism_spec": { + "constant": "8" + } +} diff --git a/pipe/sunav2/sunav2_location_asset_assignment.json b/pipe/sunav2/sunav2_location_asset_assignment.json new file mode 100644 index 000000000..2ea094c2a --- /dev/null +++ b/pipe/sunav2/sunav2_location_asset_assignment.json @@ -0,0 +1,47 @@ +{ + "pipeline": { + "name": "sunav2_location_asset_assignment" + }, + "transform": { + "cmd": [ + "Rscript", + "./flow.loc.grp.asgn.R", + "DirIn=$DIR_IN", + "DirOut=/pfs/out", + "DirErr=/pfs/out/errored_datums", + "FileYear=$FILE_YEAR", + "TypeFile=asset" + ], + "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.1", + "env": { + "LOG_LEVEL": "INFO" + } + }, + "input": { + "cross": [ + { + "pfs": { + "name": "DIR_IN", + "repo": "location_asset", + "glob": "/sunav2/*" + } + }, + { + "pfs": { + "name": "FILE_YEAR", + "repo": "data_source_sunav2_list_years", + "glob": "/data_year*.txt" + } + } + ] + }, + "enable_stats": false, + "standby": true, + "resource_requests": { + 
"memory": "210M", + "cpu": 0.3 + }, + "parallelism_spec": { + "constant": "8" + } +} diff --git a/pipe/sunav2/sunav2_merge_data_by_location.json b/pipe/sunav2/sunav2_merge_data_by_location.json new file mode 100644 index 000000000..bfdfea21f --- /dev/null +++ b/pipe/sunav2/sunav2_merge_data_by_location.json @@ -0,0 +1,34 @@ +{ + "pipeline": { + "name": "sunav2_merge_data_by_location" + }, + "transform": { + "cmd": [ + "Rscript", + "./flow.loc.data.trnc.comb.R", + "DirIn=$DIR_IN", + "DirOut=/pfs/out", + "DirErr=/pfs/out/errored_datums", + "DirSubCombData=data", + "DirSubCopy=location|calibration" + ], + "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-data-trnc-comb:v1.1.1", + "env": { + "LOG_LEVEL": "INFO", + "PARALLELIZATION_INTERNAL": "1" + } + }, + "input": { + "pfs": { + "name": "DIR_IN", + "repo": "sunav2_structure_repo_by_location", + "glob": "/sunav2/*/*/*" + } + }, + "enable_stats": false, + "standby": false, + "resource_requests": { + "memory": "80M", + "cpu": 0.3 + } +} diff --git a/pipe/sunav2/sunav2_structure_repo_by_location.json b/pipe/sunav2/sunav2_structure_repo_by_location.json new file mode 100644 index 000000000..edb59b29c --- /dev/null +++ b/pipe/sunav2/sunav2_structure_repo_by_location.json @@ -0,0 +1,32 @@ +{ + "pipeline": { + "name": "sunav2_structure_repo_by_location" + }, + "transform": { + "cmd": [ + "Rscript", + "./flow.loc.repo.strc.R", + "DirIn=$DIR_IN", + "DirOut=/pfs/out", + "DirErr=/pfs/out/errored_datums", + "Comb=TRUE" + ], + "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-repo-strc:v1.0.7", + "env": { + "LOG_LEVEL": "INFO" + } + }, + "input": { + "pfs": { + "name": "DIR_IN", + "repo": "sunav2_calibrated_location_group", + "glob": "/sunav2/*/*/*" + } + }, + "enable_stats": false, + "standby": true, + "resource_requests": { + "memory": "100M", + "cpu": 0.13 + } +} From 8a124f73b16c1f236da2948b3ff54068e19908d9 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Fri, 27 
Jun 2025 08:46:07 -0600 Subject: [PATCH 016/182] Updates to SUNA modules. --- .../wrap.suna.logfiles.fill.R | 295 ++++++++++++++++++ flow/flow.suna.logfiles/wrap.suna.logfiles.R | 132 +++++--- 2 files changed, 387 insertions(+), 40 deletions(-) create mode 100644 flow/flow.suna.logfiles.fill/wrap.suna.logfiles.fill.R diff --git a/flow/flow.suna.logfiles.fill/wrap.suna.logfiles.fill.R b/flow/flow.suna.logfiles.fill/wrap.suna.logfiles.fill.R new file mode 100644 index 000000000..8ed73ad36 --- /dev/null +++ b/flow/flow.suna.logfiles.fill/wrap.suna.logfiles.fill.R @@ -0,0 +1,295 @@ +############################################################################################## +#' @title Wrapper for SUNA Log File Comparison and Gap Filling + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} +#' Bobby Hensley \email{hensley@battelleecology.org} +#' +#' @description Wrapper function. Compares logged data to streamed data. +#' +#' @param DirIn Character value. The input path to the data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/sensor/yyyy/mm/dd/source-id. The source-id is the unique identifier of the sensor. \cr#' +#' +#' @param DirInStream (optional) Character value. This input is used for testing purposes only prior to joining repos. +#' The input path to the streamed L0 data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/sensor/yyyy/mm/dd/source-id. The source-id is the unique identifier of the sensor. \cr#' +#' +#' @param DirInLogs (optional) Character value. This input is used for testing purposes only prior to joining repos. +#' The input path to the log data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/sensor/yyyy/mm/dd/source-id. The source-id is the unique identifier of the sensor. \cr#' +#' +#' @param DirOut Character value. The output path that will replace the #/pfs/BASE_REPO portion of DirIn. 
+#' +#' @param SchmDataOut (optional), A json-formatted character string containing the schema for the output data +#' file. If this input is not provided, the output schema for the data will be the same as the input data +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. Note that you will need to distinguish between the aquatroll200 (outputs conductivity) and the +#' leveltroll500 (does not output conductivity) in your schema. +#' +#' @param SchmFlagsOut (optional), A json-formatted character string containing the schema for the output flags +#' file. If this input is not provided, the output schema for the data will be the same as the input flags +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. Note that you will need to distinguish between the aquatroll200 (outputs conductivity) and the +#' leveltroll500 (does not output conductivity) in your schema. +#' +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. +#' +#' @return Combined logged and streamed L0 data in daily parquets. 
+#' +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 +#' +#' @keywords Currently none +#' +#' @examples +#' # Not run +DirInLogs<-"~/pfs/sunav2_logs_output/sunav2/2024/09/10/20349" #cleaned log data +DirInStream<-"~/pfs/sunav2_data_source_trino/sunav2/2024/09/10/20349" #streamed L0 data +DirIn<-NULL +DirOutBase="~/pfs/sunav2_filled_output" +SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2.avsc'),collapse='') +log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_log_flags.avsc'),collapse='') +# wrap.troll.logfiles.fill( +# DirInLogs=DirInLogs, +# DirInStream=DirInStream, +# DirIn=DirIn, +# DirOutBase="~/pfs/out", +# SchmDataOut="~/pfs/aquatroll200_avro_schemas/aquatroll200/aquatroll200_log_data.avsc", +# SchmFlagsOut=SchmFlagsOut, +# log=log) +#' +#' @changelog +#' Nora Catolico (2024-01-30) original creation +#' Bobby Hensley (2025-05-30) adapted for suna +#' +############################################################################################## +wrap.suna.logfiles.fill <- function(DirInLogs=NULL, + DirInStream=NULL, + DirIn, + DirOutBase, + SchmDataOut=NULL, + SchmFlagsOut=NULL, + log=NULL +){ + + # Start logging if not already + if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + # Gather info about the input directory (including date), and create base output directory + if(is.null(DirInLogs)){ + DirInLogs<-DirIn #only need one dir if this is run after filter joiner + } + if(is.null(DirInStream)){ + DirInStream<-DirIn #only need one dir if this is run after filter joiner + } + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirInStream) + dirInDataStream <- fs::path(DirInStream,'data') + dirInDataLogs <- fs::path(DirInLogs,'data') + timeBgn <- InfoDirIn$time # Earliest possible start date for the data + DirOut <- base::paste0(DirOutBase,InfoDirIn$dirRepo) + DirOutData <- 
base::paste0(DirOut,'/data') + base::dir.create(DirOutData,recursive=TRUE) + DirOutFlags <- base::paste0(DirOut,'/flags') + base::dir.create(DirOutFlags,recursive=TRUE) + +#' Load any L0 streamed data + fileDataStream<-base::list.files(dirInDataStream,full.names=FALSE) + L0File <- fileDataStream[!grepl('_log',fileDataStream)] + if(length(L0File)>=1){ + L0Data <- + base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(dirInDataStream, '/', L0File), + log = log),silent = FALSE) + if (base::any(base::class(L0Data) == 'try-error')) { + # Generate error and stop execution + log$error(base::paste0('File ', dirInDataStream, '/', L0File, ' is unreadable.')) + base::stop()} + } + +#' Load any logged data + fileDataLogs<-base::list.files(dirInDataLogs,full.names=FALSE) + logFile <- fileDataLogs[grepl('_log',fileDataLogs)] + if(length(logFile)>=1){ + logData <- + base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(dirInDataLogs, '/', logFile), + log = log),silent = FALSE) + if (base::any(base::class(logData) == 'try-error')) { + # Generate error and stop execution + log$error(base::paste0('File ', dirInDataLogs, '/', logFile, ' is unreadable.')) + base::stop()} + } + +#' Parse serial output into individual columns + L0DataParsed<-tidyr::separate(data = L0Data,col = serial_output,sep = ",|:", + into = c("pos_0","header_light_frame","pos_1","year_and_day","pos_2","time","pos_3","nitrate_concentration","pos_4","nitrogen_in_nitrate", + "pos_5","absorbance_254nm","pos_6","absorbance_350nm","pos_7","bromide_trace","pos_8","spectrum_average","pos_9","dark_value_used_for_fit","pos_10","integration_time_factor", + "pos_11","channel_1","pos_12","channel_2","pos_13","channel_3","pos_14","channel_4","pos_15","channel_5", + "pos_16","channel_6","pos_17","channel_7","pos_18","channel_8","pos_19","channel_9","pos_20","channel_10", + "pos_21","channel_11","pos_22","channel_12","pos_23","channel_13","pos_24","channel_14","pos_25","channel_15", + 
"pos_26","channel_16","pos_27","channel_17","pos_28","channel_18","pos_29","channel_19","pos_30","channel_20", + "pos_31","channel_21","pos_32","channel_22","pos_33","channel_23","pos_34","channel_24","pos_35","channel_25", + "pos_36","channel_26","pos_37","channel_27","pos_38","channel_28","pos_39","channel_29","pos_40","channel_30", + "pos_41","channel_31","pos_42","channel_32","pos_43","channel_33","pos_44","channel_34","pos_45","channel_35", + "pos_46","channel_36","pos_47","channel_37","pos_48","channel_38","pos_49","channel_39","pos_50","channel_40", + "pos_51","channel_41","pos_52","channel_42","pos_53","channel_43","pos_54","channel_44","pos_55","channel_45", + "pos_56","channel_46","pos_57","channel_47","pos_58","channel_48","pos_59","channel_49","pos_60","channel_50", + "pos_61","channel_51","pos_62","channel_52","pos_63","channel_53","pos_64","channel_54","pos_65","channel_55", + "pos_66","channel_56","pos_67","channel_57","pos_68","channel_58","pos_69","channel_59","pos_70","channel_60", + "pos_71","channel_61","pos_72","channel_62","pos_73","channel_63","pos_74","channel_64","pos_75","channel_65", + "pos_76","channel_66","pos_77","channel_67","pos_78","channel_68","pos_79","channel_69","pos_80","channel_70", + "pos_81","channel_71","pos_82","channel_72","pos_83","channel_73","pos_84","channel_74","pos_85","channel_75", + "pos_86","channel_76","pos_87","channel_77","pos_88","channel_78","pos_89","channel_79","pos_90","channel_80", + "pos_91","channel_81","pos_92","channel_82","pos_93","channel_83","pos_94","channel_84","pos_95","channel_85", + "pos_96","channel_86","pos_97","channel_87","pos_98","channel_88","pos_99","channel_89","pos_100","channel_90", + "pos_101","channel_91","pos_102","channel_92","pos_103","channel_93","pos_104","channel_94","pos_105","channel_95", + "pos_106","channel_96","pos_107","channel_97","pos_108","channel_98","pos_109","channel_99","pos_110","channel_100", + 
"pos_111","channel_101","pos_112","channel_102","pos_113","channel_103","pos_114","channel_104","pos_115","channel_105", + "pos_116","channel_106","pos_117","channel_107","pos_118","channel_108","pos_119","channel_109","pos_120","channel_110", + "pos_121","channel_111","pos_122","channel_112","pos_123","channel_113","pos_124","channel_114","pos_125","channel_115", + "pos_126","channel_116","pos_127","channel_117","pos_128","channel_118","pos_129","channel_119","pos_130","channel_120", + "pos_131","channel_121","pos_132","channel_122","pos_133","channel_123","pos_134","channel_124","pos_135","channel_125", + "pos_136","channel_126","pos_137","channel_127","pos_138","channel_128","pos_139","channel_129","pos_140","channel_130", + "pos_141","channel_131","pos_142","channel_132","pos_143","channel_133","pos_144","channel_134","pos_145","channel_135", + "pos_146","channel_136","pos_147","channel_137","pos_148","channel_138","pos_149","channel_139","pos_150","channel_140", + "pos_151","channel_141","pos_152","channel_142","pos_153","channel_143","pos_154","channel_144","pos_155","channel_145", + "pos_156","channel_146","pos_157","channel_147","pos_158","channel_148","pos_159","channel_149","pos_160","channel_150", + "pos_161","channel_151","pos_162","channel_152","pos_163","channel_153","pos_164","channel_154","pos_165","channel_155", + "pos_166","channel_156","pos_167","channel_157","pos_168","channel_158","pos_169","channel_159","pos_170","channel_160", + "pos_171","channel_161","pos_172","channel_162","pos_173","channel_163","pos_174","channel_164","pos_175","channel_165", + "pos_176","channel_166","pos_177","channel_167","pos_178","channel_168","pos_179","channel_169","pos_180","channel_170", + "pos_181","channel_171","pos_182","channel_172","pos_183","channel_173","pos_184","channel_174","pos_185","channel_175", + "pos_186","channel_176","pos_187","channel_177","pos_188","channel_178","pos_189","channel_179","pos_190","channel_180", + 
"pos_191","channel_181","pos_192","channel_182","pos_193","channel_183","pos_194","channel_184","pos_195","channel_185", + "pos_196","channel_186","pos_197","channel_187","pos_198","channel_188","pos_199","channel_189","pos_200","channel_190", + "pos_201","channel_191","pos_202","channel_192","pos_203","channel_193","pos_204","channel_194","pos_205","channel_195", + "pos_206","channel_196","pos_207","channel_197","pos_208","channel_198","pos_209","channel_199","pos_210","channel_200", + "pos_211","channel_201","pos_212","channel_202","pos_213","channel_203","pos_214","channel_204","pos_215","channel_205", + "pos_216","channel_206","pos_217","channel_207","pos_218","channel_208","pos_219","channel_209","pos_220","channel_210", + "pos_221","channel_211","pos_222","channel_212","pos_223","channel_213","pos_224","channel_214","pos_225","channel_215", + "pos_226","channel_216","pos_227","channel_217","pos_228","channel_218","pos_229","channel_219","pos_230","channel_220", + "pos_231","channel_221","pos_232","channel_222","pos_233","channel_223","pos_234","channel_224","pos_235","channel_225", + "pos_236","channel_226","pos_237","channel_227","pos_238","channel_228","pos_239","channel_229","pos_240","channel_230", + "pos_241","channel_231","pos_242","channel_232","pos_243","channel_233","pos_244","channel_234","pos_245","channel_235", + "pos_246","channel_236","pos_247","channel_237","pos_248","channel_238","pos_249","channel_239","pos_250","channel_240", + "pos_251","channel_241","pos_252","channel_242","pos_253","channel_243","pos_254","channel_244","pos_255","channel_245", + "pos_256","channel_246","pos_257","channel_247","pos_258","channel_248","pos_259","channel_249","pos_260","channel_250", + "pos_261","channel_251","pos_262","channel_252","pos_263","channel_253","pos_264","channel_254","pos_265","channel_255", + "pos_266","channel_256", + "pos_267","internal_temperature","pos_268","spectrometer_temperature","pos_269","lamp_temperature","pos_270","lamp_on_time", + 
"pos_271","relative_humidity","pos_272","main_voltage","pos_273","lamp_voltage","pos_274","internal_voltage", + "pos_275","main_current","pos_276","fit_aux_1","pos_277","fit_aux_2","pos_278","fit_base_1","pos_279","fit_base_2", + "pos_280","fit_rmse","pos_281","ctd_time","pos_282","ctd_salinity","pos_283","ctd_temperature", + "pos_284","ctd_pressure","pos_285","check_sum")) + +#' Drops serial output position columns + L0DataParsed<-L0DataParsed[!grepl("pos",names(L0DataParsed))] + +#' Combines all 256 spectrum channels into single array + L0DataParsed$spectrum_channels<-paste(L0DataParsed$channel_1,L0DataParsed$channel_2,L0DataParsed$channel_3,L0DataParsed$channel_4,L0DataParsed$channel_5,L0DataParsed$channel_6,L0DataParsed$channel_7,L0DataParsed$channel_8,L0DataParsed$channel_9,L0DataParsed$channel_10, + L0DataParsed$channel_11,L0DataParsed$channel_12,L0DataParsed$channel_13,L0DataParsed$channel_14,L0DataParsed$channel_15,L0DataParsed$channel_16,L0DataParsed$channel_17,L0DataParsed$channel_18,L0DataParsed$channel_19,L0DataParsed$channel_20, + L0DataParsed$channel_21,L0DataParsed$channel_22,L0DataParsed$channel_23,L0DataParsed$channel_24,L0DataParsed$channel_25,L0DataParsed$channel_26,L0DataParsed$channel_27,L0DataParsed$channel_28,L0DataParsed$channel_29,L0DataParsed$channel_30, + L0DataParsed$channel_31,L0DataParsed$channel_32,L0DataParsed$channel_33,L0DataParsed$channel_34,L0DataParsed$channel_35,L0DataParsed$channel_36,L0DataParsed$channel_37,L0DataParsed$channel_38,L0DataParsed$channel_39,L0DataParsed$channel_40, + L0DataParsed$channel_41,L0DataParsed$channel_42,L0DataParsed$channel_43,L0DataParsed$channel_44,L0DataParsed$channel_45,L0DataParsed$channel_46,L0DataParsed$channel_47,L0DataParsed$channel_48,L0DataParsed$channel_49,L0DataParsed$channel_50, + 
L0DataParsed$channel_51,L0DataParsed$channel_52,L0DataParsed$channel_53,L0DataParsed$channel_54,L0DataParsed$channel_55,L0DataParsed$channel_56,L0DataParsed$channel_57,L0DataParsed$channel_58,L0DataParsed$channel_59,L0DataParsed$channel_60, + L0DataParsed$channel_61,L0DataParsed$channel_62,L0DataParsed$channel_63,L0DataParsed$channel_64,L0DataParsed$channel_65,L0DataParsed$channel_66,L0DataParsed$channel_67,L0DataParsed$channel_68,L0DataParsed$channel_69,L0DataParsed$channel_70, + L0DataParsed$channel_71,L0DataParsed$channel_72,L0DataParsed$channel_73,L0DataParsed$channel_74,L0DataParsed$channel_75,L0DataParsed$channel_76,L0DataParsed$channel_77,L0DataParsed$channel_78,L0DataParsed$channel_79,L0DataParsed$channel_80, + L0DataParsed$channel_81,L0DataParsed$channel_82,L0DataParsed$channel_83,L0DataParsed$channel_84,L0DataParsed$channel_85,L0DataParsed$channel_86,L0DataParsed$channel_87,L0DataParsed$channel_88,L0DataParsed$channel_89,L0DataParsed$channel_90, + L0DataParsed$channel_91,L0DataParsed$channel_92,L0DataParsed$channel_93,L0DataParsed$channel_94,L0DataParsed$channel_95,L0DataParsed$channel_96,L0DataParsed$channel_97,L0DataParsed$channel_98,L0DataParsed$channel_99,L0DataParsed$channel_100, + L0DataParsed$channel_101,L0DataParsed$channel_102,L0DataParsed$channel_103,L0DataParsed$channel_104,L0DataParsed$channel_105,L0DataParsed$channel_106,L0DataParsed$channel_107,L0DataParsed$channel_108,L0DataParsed$channel_109,L0DataParsed$channel_110, + L0DataParsed$channel_111,L0DataParsed$channel_112,L0DataParsed$channel_113,L0DataParsed$channel_114,L0DataParsed$channel_115,L0DataParsed$channel_116,L0DataParsed$channel_117,L0DataParsed$channel_118,L0DataParsed$channel_119,L0DataParsed$channel_120, + L0DataParsed$channel_121,L0DataParsed$channel_122,L0DataParsed$channel_123,L0DataParsed$channel_124,L0DataParsed$channel_125,L0DataParsed$channel_126,L0DataParsed$channel_127,L0DataParsed$channel_128,L0DataParsed$channel_129,L0DataParsed$channel_130, + 
L0DataParsed$channel_131,L0DataParsed$channel_132,L0DataParsed$channel_133,L0DataParsed$channel_134,L0DataParsed$channel_135,L0DataParsed$channel_136,L0DataParsed$channel_137,L0DataParsed$channel_138,L0DataParsed$channel_139,L0DataParsed$channel_140, + L0DataParsed$channel_141,L0DataParsed$channel_142,L0DataParsed$channel_143,L0DataParsed$channel_144,L0DataParsed$channel_145,L0DataParsed$channel_146,L0DataParsed$channel_147,L0DataParsed$channel_148,L0DataParsed$channel_149,L0DataParsed$channel_150, + L0DataParsed$channel_151,L0DataParsed$channel_152,L0DataParsed$channel_153,L0DataParsed$channel_154,L0DataParsed$channel_155,L0DataParsed$channel_156,L0DataParsed$channel_157,L0DataParsed$channel_158,L0DataParsed$channel_159,L0DataParsed$channel_160, + L0DataParsed$channel_161,L0DataParsed$channel_162,L0DataParsed$channel_163,L0DataParsed$channel_164,L0DataParsed$channel_165,L0DataParsed$channel_166,L0DataParsed$channel_167,L0DataParsed$channel_168,L0DataParsed$channel_169,L0DataParsed$channel_170, + L0DataParsed$channel_171,L0DataParsed$channel_172,L0DataParsed$channel_173,L0DataParsed$channel_174,L0DataParsed$channel_175,L0DataParsed$channel_176,L0DataParsed$channel_177,L0DataParsed$channel_178,L0DataParsed$channel_179,L0DataParsed$channel_180, + L0DataParsed$channel_181,L0DataParsed$channel_182,L0DataParsed$channel_183,L0DataParsed$channel_184,L0DataParsed$channel_185,L0DataParsed$channel_186,L0DataParsed$channel_187,L0DataParsed$channel_188,L0DataParsed$channel_189,L0DataParsed$channel_190, + L0DataParsed$channel_191,L0DataParsed$channel_192,L0DataParsed$channel_193,L0DataParsed$channel_194,L0DataParsed$channel_195,L0DataParsed$channel_196,L0DataParsed$channel_197,L0DataParsed$channel_198,L0DataParsed$channel_199,L0DataParsed$channel_200, + 
L0DataParsed$channel_201,L0DataParsed$channel_202,L0DataParsed$channel_203,L0DataParsed$channel_204,L0DataParsed$channel_205,L0DataParsed$channel_206,L0DataParsed$channel_207,L0DataParsed$channel_208,L0DataParsed$channel_209,L0DataParsed$channel_210, + L0DataParsed$channel_211,L0DataParsed$channel_212,L0DataParsed$channel_213,L0DataParsed$channel_214,L0DataParsed$channel_215,L0DataParsed$channel_216,L0DataParsed$channel_217,L0DataParsed$channel_218,L0DataParsed$channel_219,L0DataParsed$channel_220, + L0DataParsed$channel_221,L0DataParsed$channel_222,L0DataParsed$channel_223,L0DataParsed$channel_224,L0DataParsed$channel_225,L0DataParsed$channel_226,L0DataParsed$channel_227,L0DataParsed$channel_228,L0DataParsed$channel_229,L0DataParsed$channel_230, + L0DataParsed$channel_231,L0DataParsed$channel_232,L0DataParsed$channel_233,L0DataParsed$channel_234,L0DataParsed$channel_235,L0DataParsed$channel_236,L0DataParsed$channel_237,L0DataParsed$channel_238,L0DataParsed$channel_239,L0DataParsed$channel_240, + L0DataParsed$channel_241,L0DataParsed$channel_242,L0DataParsed$channel_243,L0DataParsed$channel_244,L0DataParsed$channel_245,L0DataParsed$channel_246,L0DataParsed$channel_247,L0DataParsed$channel_248,L0DataParsed$channel_249,L0DataParsed$channel_250, + L0DataParsed$channel_251,L0DataParsed$channel_252,L0DataParsed$channel_253,L0DataParsed$channel_254,L0DataParsed$channel_255,L0DataParsed$channel_256,sep=";") + +#' Checks that each data burst is complete (Right now only checks whether last column is a value or not) + L0DataParsed$error_missing_data<-NA + for(i in 1:nrow(L0DataParsed)){if(is.na(L0DataParsed[i,which(colnames(L0DataParsed)=="check_sum")])){L0DataParsed[i,which(colnames(L0DataParsed)=="error_missing_data")]=TRUE} + else{L0DataParsed[i,which(colnames(L0DataParsed)=="error_missing_data")]=FALSE}} + +#' Create additional header columns needed to match avro schema + L0DataParsed$header_manufacturer<-"SATS" + L0DataParsed$header_serial_number<-NA #' Can leave this 
blank for now + +#' Re-orders columns so they match the avro schema + L0DataParsed<-L0DataParsed[,c("source_id","site_id","readout_time","header_manufacturer","header_serial_number","header_light_frame","year_and_day","time","nitrate_concentration", + "nitrogen_in_nitrate","absorbance_254nm","absorbance_350nm","bromide_trace","spectrum_average","dark_value_used_for_fit","integration_time_factor", + "spectrum_channels","internal_temperature","spectrometer_temperature","lamp_temperature","lamp_on_time","relative_humidity","main_voltage","lamp_voltage", + "internal_voltage","main_current","fit_aux_1","fit_aux_2","fit_base_1","fit_base_2","fit_rmse","ctd_time","ctd_salinity","ctd_temperature","ctd_pressure", + "check_sum","error_missing_data")] + +#' Determine whether to use logged or streamed data. + #' Preference is to use logged data if available + if(!is.null(logData)){dataOut<-logData} + if(is.null(logData) & !is.null(L0DataParsed)){dataOut<-L0DataParsed} + if(is.null(logData) & is.null(L0DataParsed)){dataOut<-L0DataParsed} + +#' Write out data + + + + + + + + + + #write out data + fileOutSplt <- base::strsplit(DirInStream,'[/]')[[1]] # Separate underscore-delimited components of the file name + asset<-tail(x=fileOutSplt,n=1) + csv_name <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d"),'_filled') + + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = dataOut, + NameFile = base::paste0(DirOutData,'/',csv_name,".parquet"), + Schm = SchmDataOut),silent=TRUE) + if(class(rptOut)[1] == 'try-error'){ + log$error(base::paste0('Cannot write Data to ',base::paste0(DirOutData,'/',csv_name,".parquet"),'. 
',attr(rptOut, "condition"))) + stop() + } else { + log$info(base::paste0('Data written successfully in ', base::paste0(DirOutData,'/',csv_name,".parquet"))) + } + + #write out flags + csv_name_flags <-paste0(sensor,'_',asset,'_',format(timeBgn,format = "%Y-%m-%d"),'_logFlags') + + rptOutFlags <- try(NEONprocIS.base::def.wrte.parq(data = flagsOut, + NameFile = base::paste0(DirOutFlags,'/',csv_name_flags,".parquet"), + Schm = SchmFlagsOut),silent=TRUE) + if(class(rptOutFlags)[1] == 'try-error'){ + log$error(base::paste0('Cannot write Flags to ',base::paste0(DirOutFlags,'/',csv_name_flags,".parquet"),'. ',attr(rptOutFlags, "condition"))) + stop() + } else { + log$info(base::paste0('Flags written successfully in ', base::paste0(DirOutFlags,'/',csv_name_flags,".parquet"))) + } + +} + + + + + + + + + + + + + + + + diff --git a/flow/flow.suna.logfiles/wrap.suna.logfiles.R b/flow/flow.suna.logfiles/wrap.suna.logfiles.R index 64080f10e..aaa1ce938 100644 --- a/flow/flow.suna.logfiles/wrap.suna.logfiles.R +++ b/flow/flow.suna.logfiles/wrap.suna.logfiles.R @@ -31,8 +31,10 @@ #' #' @examples #' # Not run -#' FileIn <- "~/pfs/suna_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv" -#' log <- NEONprocIS.base::def.log.init(Lvl = "debug") +FileIn <- "~/pfs/sunav2_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv" +DirOut="~/pfs/sunav2_logs_output" +SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2.avsc'),collapse='') +log <- NEONprocIS.base::def.log.init(Lvl = "debug") #' wrap.suna.logfiles <- function(FileIn = "~/pfs/suna_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv", #' DirOut="~/pfs/out", #' SchmDataOut=NULL, @@ -48,44 +50,38 @@ wrap.suna.logfiles <- function(FileIn, log=NULL ){ - # Start logging if not already +#' Start logging if not already if(base::is.null(log)){ log <- NEONprocIS.base::def.log.init() } - # Load in the csv log file(s) - log_file <- +#' Load 
in the csv log file(s) + logFile <- base::try(read.table(paste0(FileIn), header = FALSE, sep = ",", col.names = paste0("V",seq_len(286)),encoding = 'utf-8', stringsAsFactors = FALSE,fill = TRUE,strip.white = TRUE,na.strings=c(-1,''))) - if (base::any(base::class(log_file) == 'try-error')) { + if (base::any(base::class(logFile) == 'try-error')) { # Generate error and stop execution log$error(base::paste0('File ', FileIn, ' is unreadable. Likely not a data file.')) base::stop() } - if(any(grepl('TROLL',log_file))){ + if(any(grepl('TROLL',logFile))){ log$debug(base::paste0('skipping troll file: ', FileIn)) base::stop() - }else if(any(grepl('Turbidity',log_file))){ + }else if(any(grepl('Turbidity',logFile))){ log$debug(base::paste0('skipping sonde file: ', FileIn)) base::stop() } - # Find row where data actually starts - start<-which(grepl('Zeiss Coefficient',log_file$V2))+1 +#' Find row where data actually starts + start<-which(grepl('Zeiss Coefficient',logFile$V2))+1 # Separate data and metadata - log_data<-log_file[start:(nrow(log_file)),] - log_metadata<-log_file[1:(start-1),2:6] + logData<-logFile[start:(nrow(logFile)),] + logMetadata<-logFile[1:(start-1),2:6] - # Gets metadata - sensor<-"suna" - serial_number<-log_metadata[1,2] - asset_string <- regexpr("\\/[0-9]{5}\\/",FileIn) #' For SUNA asset info not included in log file header. Need it from input file folder name. 
- asset<-gsub("\\/","",substr(FileIn,asset_string[1],asset_string[1]+attributes(asset_string)$match.length-1)) - - # Create column names for data - names(log_data)<-c("serial_number","date","time","nitrate_uM","nitrate_mgL","absorbance_254","absorbance_350","bromide", - "spec_avg","dark_value","int_time_factor", +#' Update names of existing columns to match avro schema + names(logData)<-c("header_serial_number","year_and_day","time","nitrate_concentration","nitrogen_in_nitrate","absorbance_254nm","absorbance_350nm", + "bromide_trace","spectrum_average","dark_value_used_for_fit","integration_time_factor", "channel_1","channel_2","channel_3","channel_4","channel_5","channel_6","channel_7","channel_8","channel_9","channel_10", "channel_11","channel_12","channel_13","channel_14","channel_15","channel_16","channel_17","channel_18","channel_19","channel_20", "channel_21","channel_22","channel_23","channel_24","channel_25","channel_26","channel_27","channel_28","channel_29","channel_30", @@ -112,29 +108,84 @@ wrap.suna.logfiles <- function(FileIn, "channel_231","channel_232","channel_233","channel_234","channel_235","channel_236","channel_237","channel_238","channel_239","channel_240", "channel_241","channel_242","channel_243","channel_244","channel_245","channel_246","channel_247","channel_248","channel_249","channel_250", "channel_251","channel_252","channel_253","channel_254","channel_255","channel_256", - "internal_temp","spec_temp","lamp_temp","cum_lamp_time","humidity","main_volt","lamp_volt","internal_volt","current","fit_aux_1","fit_aux_2", - "fit_base_1","fit_base_2","fit_RMSE","ctd_time","ctd_salinity","ctd_temp","ctd_pressure","check_sum") + "internal_temperature","spectrometer_temperature","lamp_temperature","lamp_on_time","relative_humidity","main_voltage","lamp_voltage", + "internal_voltage","main_current","fit_aux_1","fit_aux_2","fit_base_1","fit_base_2","fit_rmse","ctd_time","ctd_salinity","ctd_temperature", + "ctd_pressure","check_sum") + +#' Checks that 
each data burst is complete (Right now only checks whether last column is a value or not) + logData$error_missing_data<-NA + for(i in 1:nrow(logData)){if(is.na(logData[i,which(colnames(logData)=="check_sum")])){logData[i,which(colnames(logData)=="error_missing_data")]=TRUE} + else{logData[i,which(colnames(logData)=="error_missing_data")]=FALSE}} + +#' Combines all 256 spectrum channels into single array + logData$spectrum_channels<-paste(logData$channel_1,logData$channel_2,logData$channel_3,logData$channel_4,logData$channel_5,logData$channel_6,logData$channel_7,logData$channel_8,logData$channel_9,logData$channel_10, + logData$channel_11,logData$channel_12,logData$channel_13,logData$channel_14,logData$channel_15,logData$channel_16,logData$channel_17,logData$channel_18,logData$channel_19,logData$channel_20, + logData$channel_21,logData$channel_22,logData$channel_23,logData$channel_24,logData$channel_25,logData$channel_26,logData$channel_27,logData$channel_28,logData$channel_29,logData$channel_30, + logData$channel_31,logData$channel_32,logData$channel_33,logData$channel_34,logData$channel_35,logData$channel_36,logData$channel_37,logData$channel_38,logData$channel_39,logData$channel_40, + logData$channel_41,logData$channel_42,logData$channel_43,logData$channel_44,logData$channel_45,logData$channel_46,logData$channel_47,logData$channel_48,logData$channel_49,logData$channel_50, + logData$channel_51,logData$channel_52,logData$channel_53,logData$channel_54,logData$channel_55,logData$channel_56,logData$channel_57,logData$channel_58,logData$channel_59,logData$channel_60, + logData$channel_61,logData$channel_62,logData$channel_63,logData$channel_64,logData$channel_65,logData$channel_66,logData$channel_67,logData$channel_68,logData$channel_69,logData$channel_70, + logData$channel_71,logData$channel_72,logData$channel_73,logData$channel_74,logData$channel_75,logData$channel_76,logData$channel_77,logData$channel_78,logData$channel_79,logData$channel_80, + 
logData$channel_81,logData$channel_82,logData$channel_83,logData$channel_84,logData$channel_85,logData$channel_86,logData$channel_87,logData$channel_88,logData$channel_89,logData$channel_90, + logData$channel_91,logData$channel_92,logData$channel_93,logData$channel_94,logData$channel_95,logData$channel_96,logData$channel_97,logData$channel_98,logData$channel_99,logData$channel_100, + logData$channel_101,logData$channel_102,logData$channel_103,logData$channel_104,logData$channel_105,logData$channel_106,logData$channel_107,logData$channel_108,logData$channel_109,logData$channel_110, + logData$channel_111,logData$channel_112,logData$channel_113,logData$channel_114,logData$channel_115,logData$channel_116,logData$channel_117,logData$channel_118,logData$channel_119,logData$channel_120, + logData$channel_121,logData$channel_122,logData$channel_123,logData$channel_124,logData$channel_125,logData$channel_126,logData$channel_127,logData$channel_128,logData$channel_129,logData$channel_130, + logData$channel_131,logData$channel_132,logData$channel_133,logData$channel_134,logData$channel_135,logData$channel_136,logData$channel_137,logData$channel_138,logData$channel_139,logData$channel_140, + logData$channel_141,logData$channel_142,logData$channel_143,logData$channel_144,logData$channel_145,logData$channel_146,logData$channel_147,logData$channel_148,logData$channel_149,logData$channel_150, + logData$channel_151,logData$channel_152,logData$channel_153,logData$channel_154,logData$channel_155,logData$channel_156,logData$channel_157,logData$channel_158,logData$channel_159,logData$channel_160, + logData$channel_161,logData$channel_162,logData$channel_163,logData$channel_164,logData$channel_165,logData$channel_166,logData$channel_167,logData$channel_168,logData$channel_169,logData$channel_170, + 
logData$channel_171,logData$channel_172,logData$channel_173,logData$channel_174,logData$channel_175,logData$channel_176,logData$channel_177,logData$channel_178,logData$channel_179,logData$channel_180, + logData$channel_181,logData$channel_182,logData$channel_183,logData$channel_184,logData$channel_185,logData$channel_186,logData$channel_187,logData$channel_188,logData$channel_189,logData$channel_190, + logData$channel_191,logData$channel_192,logData$channel_193,logData$channel_194,logData$channel_195,logData$channel_196,logData$channel_197,logData$channel_198,logData$channel_199,logData$channel_200, + logData$channel_201,logData$channel_202,logData$channel_203,logData$channel_204,logData$channel_205,logData$channel_206,logData$channel_207,logData$channel_208,logData$channel_209,logData$channel_210, + logData$channel_211,logData$channel_212,logData$channel_213,logData$channel_214,logData$channel_215,logData$channel_216,logData$channel_217,logData$channel_218,logData$channel_219,logData$channel_220, + logData$channel_221,logData$channel_222,logData$channel_223,logData$channel_224,logData$channel_225,logData$channel_226,logData$channel_227,logData$channel_228,logData$channel_229,logData$channel_230, + logData$channel_231,logData$channel_232,logData$channel_233,logData$channel_234,logData$channel_235,logData$channel_236,logData$channel_237,logData$channel_238,logData$channel_239,logData$channel_240, + logData$channel_241,logData$channel_242,logData$channel_243,logData$channel_244,logData$channel_245,logData$channel_246,logData$channel_247,logData$channel_248,logData$channel_249,logData$channel_250, + logData$channel_251,logData$channel_252,logData$channel_253,logData$channel_254,logData$channel_255,logData$channel_256,sep=";") + +#' Calculates the readout date and time in POSIXct format + logData$readout_time<-lubridate::parse_date_time(as.character(logData$year_and_day),order="yj") + op <- options(digits.secs=3) + 
logData$readout_time<-lubridate::with_tz(logData$readout_time+(as.numeric(logData$time)*60*60),'UTC') + +#' Create additional header columns needed to match avro schema + asset_string <- regexpr("\\/[0-9]{5}\\/",FileIn) #' For SUNA asset info not included in log file header. Need it from input file folder name. + asset<-gsub("\\/","",substr(FileIn,asset_string[1],asset_string[1]+attributes(asset_string)$match.length-1)) + logData$source_id<-asset + logData$site_id<-NA #' This can be left blank for now + serial_number<-as.data.frame(strsplit(logMetadata[1,2],":")) + logData$header_manufacturer<-"SATS" + logData$header_serial_number<-serial_number[2,1] + logData$header_light_frame<-NA + for(i in 1:nrow(logData)){if(logData[i,which(colnames(logData)=="dark_value_used_for_fit")]==0){logData[i,which(colnames(logData)=="header_light_frame")]=0} + else{logData[i,which(colnames(logData)=="header_light_frame")]=1}} + +#' Re-orders columns so they match the avro schema + logData<-logData[,c("source_id","site_id","readout_time","header_manufacturer","header_serial_number","header_light_frame","year_and_day","time","nitrate_concentration", + "nitrogen_in_nitrate","absorbance_254nm","absorbance_350nm","bromide_trace","spectrum_average","dark_value_used_for_fit","integration_time_factor", + "spectrum_channels","internal_temperature","spectrometer_temperature","lamp_temperature","lamp_on_time","relative_humidity","main_voltage","lamp_voltage", + "internal_voltage","main_current","fit_aux_1","fit_aux_2","fit_base_1","fit_base_2","fit_rmse","ctd_time","ctd_salinity","ctd_temperature","ctd_pressure", + "check_sum","error_missing_data")] - # Calculates the date and time in POSIXct format - log_data$date<-lubridate::parse_date_time(as.character(log_data$date),order="yj") - log_data$date<-lubridate::with_tz(log_data$date+(as.numeric(log_data$time)*60*60),'UTC') - # Checks that there are no dates prior to when NEON began collecting IS data - if(any(log_data$date<"2014-01-01 00:00:00 
UTC")){ +#' Checks that there are no dates prior to when NEON began collecting IS data + if(any(logData$readout_time<"2014-01-01 00:00:00 UTC")){ log$debug(base::paste0("Data contains dates prior to when NEON began collecting IS data"))} - # Checks that there are no future dates after the current date - if(any(log_data$date>Sys.time())){ +#' Checks that there are no future dates after the current date + if(any(logData$readout_time>Sys.time())){ log$debug(base::paste0("Data contains future dates after the current date"))} - # Output file - # Create output directory - year <- substr(log_data$date[1],1,4) - month <- substr(log_data$date[1],6,7) - day <- substr(log_data$date[1],9,10) - DirOutLogFile <- paste0(DirOut,'/',sensor,'/',year,'/',month,'/',day,'/',asset,'/data/') +#' Output file + #' Create output directory + year <- substr(logData$readout_time[1],1,4) + month <- substr(logData$readout_time[1],6,7) + day <- substr(logData$readout_time[1],9,10) + DirOutLogFile <- paste0(DirOut,'/sunav2/',year,'/',month,'/',day,'/',asset,'/data/') base::dir.create(DirOutLogFile,recursive=TRUE) - csv_name <-paste0(sensor,'_',asset,'_',year,'-',month,'-',day,'_log') - # Writes parquet file to output directory - rptOut <- try(NEONprocIS.base::def.wrte.parq(data = log_data, + csv_name <-paste0('sunav2_',asset,'_',year,'-',month,'-',day,'_log') + #' Writes parquet file to output directory + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = logData, NameFile = base::paste0(DirOutLogFile,csv_name,".parquet"), Schm = SchmDataOut),silent=TRUE) if(class(rptOut)[1] == 'try-error'){ @@ -144,7 +195,8 @@ wrap.suna.logfiles <- function(FileIn, log$info(base::paste0('Data written successfully in ', base::paste0(DirOutLogFile,csv_name,".parquet"))) } -} # End of file +} +#' End of file From 66b0dc6dfd9185e6c30029ddf6cb2ae268d041d7 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 27 Jun 2025 10:02:29 -0600 Subject: [PATCH 017/182] latest --- flow/flow.sunav2.logfiles.fill/Dockerfile | 20 ++ 
.../flow.sunav2.logfiles.fill.R | 152 +++++++++ flow/flow.sunav2.logfiles.fill/renv.lock | 308 ++++++++++++++++++ .../wrap.sunav2.logfiles.fill.R} | 26 +- flow/flow.sunav2.logfiles/Dockerfile | 20 ++ .../flow.sunav2.logfiles.R | 149 +++++++++ flow/flow.sunav2.logfiles/renv.lock | 101 ++++++ .../wrap.sunav2.logfiles.R} | 8 +- .../nitrate_cron_monthly_and_pub_control.yaml | 49 +++ .../sunav2_logjam_assign_clean_files.yaml | 55 ++++ pipe/sunav2/sunav2_logjam_list_files.yaml | 48 +++ pipe/sunav2/sunav2_logjam_load_files.yaml | 59 ++++ 12 files changed, 973 insertions(+), 22 deletions(-) create mode 100644 flow/flow.sunav2.logfiles.fill/Dockerfile create mode 100644 flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R create mode 100644 flow/flow.sunav2.logfiles.fill/renv.lock rename flow/{flow.suna.logfiles.fill/wrap.suna.logfiles.fill.R => flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R} (95%) create mode 100644 flow/flow.sunav2.logfiles/Dockerfile create mode 100644 flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R create mode 100644 flow/flow.sunav2.logfiles/renv.lock rename flow/{flow.suna.logfiles/wrap.suna.logfiles.R => flow.sunav2.logfiles/wrap.sunav2.logfiles.R} (98%) create mode 100644 pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml create mode 100644 pipe/sunav2/sunav2_logjam_assign_clean_files.yaml create mode 100644 pipe/sunav2/sunav2_logjam_list_files.yaml create mode 100644 pipe/sunav2/sunav2_logjam_load_files.yaml diff --git a/flow/flow.sunav2.logfiles.fill/Dockerfile b/flow/flow.sunav2.logfiles.fill/Dockerfile new file mode 100644 index 000000000..3305d1127 --- /dev/null +++ b/flow/flow.sunav2.logfiles.fill/Dockerfile @@ -0,0 +1,20 @@ +# Dockerfile for NEON IS Data Processing - sunav2 Logfile Processing + +# Start with the neon-is-base-r image. 
+FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 + +ARG FLOW_DIR="./flow" +ARG APP_DIR="flow.sunav2.logfiles.fill" + +# maintainer handle +MAINTAINER "Nora Catolico" ncatolico@battelleecology.org + +# Copy the lockfile and restore known working versions of R dependency packages +# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image +COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock +RUN R -e 'renv::restore(lockfile="/renv.lock")' + +# Copy in sunav2 flag workflow +COPY ${FLOW_DIR}/${APP_DIR}/flow.sunav2.logfiles.fill.R . +COPY ${FLOW_DIR}/${APP_DIR}/wrap.sunav2.logfiles.fill.R . + diff --git a/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R new file mode 100644 index 000000000..76a0cbde8 --- /dev/null +++ b/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R @@ -0,0 +1,152 @@ +############################################################################################## +#' @title Workflow for SUNA Log File Comparison and Gap Filling + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} + +#' @description Workflow. Compares logged data to streamed data and fills gaps. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", The input path to the data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/source-id.The source-id folder may have multiple csv log files. +#' The source-id is the unique identifier of the sensor.#' +#' +#' 2. "DirOut=value", where the value is the output path that will replace the #/pfs/BASE_REPO portion +#' of DirIn. +#' +#' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of \code{DirIn}. +#' +#' 4. "FileSchmData=value" (optional), where values is the full path to the avro schema for the output data +#' file. 
If this input is not provided, the output schema for the data will be the same as the input data +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. +#' +#' 5. "FileSchmFlags=value" (optional), where values is the full path to the avro schema for the output flags +#' file. If this input is not provided, the output schema for the data will be the same as the input data +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. +#' +#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. +#' +#' @return Cleaned sunav2 log files in daily parquets. + +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' Stepping through the code in Rstudio +# Sys.setenv(DirIn='/home/NEON/ncatolico/pfs/sunav2_logjam_assign_clean_files/sunav2/2024/09/10/20349') #cleaned log data +# Sys.setenv(DirIn='/home/NEON/ncatolico/pfs/sunav2_data_source_trino/sunav2/2024/09/10/20349') #streamed L0 data +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=$DirIn","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums","FileSchmData=~/pfs/sunav2_avro_schemas/sunav2.avsc") +#' rm(list=setdiff(ls(),c('arg','log'))) + +#' @seealso None currently + +# changelog and author contributions / copyrights +# Nora Catolico (2024-01-30) original creation +# Bobby Hensley \email{hensley@battelleecology.org} + +############################################################################################## +options(digits.secs = 3) +library(foreach) +library(doParallel) +library(lubridate) + +# Source the wrapper function. 
Assume it is in the working directory +source("./wrap.sunav2.logfiles.fill.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","DirOut","DirErr"), + NameParaOptn = c("FileSchmData","FileSchmFlags"),log = log) + +# Echo arguments +log$debug(base::paste0('Input directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0('Schema for output data: ', Para$FileSchmData)) +log$debug(base::paste0('Schema for output flags: ', Para$FileSchmFlags)) + + +# Read in the schemas so we only have to do it once and not every time in the avro writer. +if(base::is.null(Para$FileSchmData) || Para$FileSchmData == 'NA'){ + SchmDataOut <- NULL +} else { + SchmDataOut <- base::paste0(base::readLines(Para$FileSchmData),collapse='') +} +if(base::is.null(Para$FileSchmFlags) || Para$FileSchmFlags == 'NA'){ + SchmFlagsOut <- NULL +} else { + SchmFlagsOut <- base::paste0(base::readLines(Para$FileSchmFlags),collapse='') +} + +# Find all the input paths (datums). We will process each one. 
+DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = 'data', + log = log) + + +# Process each datum path +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxDirIn = DirIn) %dopar% { + log$info(base::paste0('Processing path to datum: ', idxDirIn)) + + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.sunav2.logfiles.fill( + DirIn=idxDirIn, + DirOutBase=Para$DirOut, + SchmDataOut=SchmDataOut, + SchmFlagsOut=SchmFlagsOut, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + + # Re-route the failed datum + NEONprocIS.base::def.err.datm( + err=err, + call.stack=call.stack, + DirDatm=idxDirIn, + DirErrBase=Para$DirErr, + RmvDatmOut=TRUE, + DirOutBase=Para$DirOut, + log=log + ) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + return() +} + + + + diff --git a/flow/flow.sunav2.logfiles.fill/renv.lock b/flow/flow.sunav2.logfiles.fill/renv.lock new file mode 100644 index 000000000..04ce2a180 --- /dev/null +++ b/flow/flow.sunav2.logfiles.fill/renv.lock @@ -0,0 +1,308 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "R6": { + "Package": "R6", + "Version": "2.6.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d4335fe7207f1c01ab8c41762f5840d4", + "Requirements": [] + }, + "cli": { + "Package": "cli", + "Version": "3.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "16850760556401a2eeb27d39bd11c9cb", + "Requirements": [] + }, + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": 
"2720e3fd3dad08f34b19b56b3d6f073d", + "Requirements": [] + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec", + "Requirements": [ + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "pillar", + "rlang", + "tibble", + "tidyselect", + "vctrs" + ] + }, + "ellipsis": { + "Package": "ellipsis", + "Version": "0.3.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "bb0eec2fe32e88d9e2836c2f73ea2077", + "Requirements": [ + "rlang" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "fs": { + "Package": "fs", + "Version": "1.6.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7eb1e342eee7e0a7449c49cdaa526d39", + "Requirements": [] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "glue": { + "Package": "glue", + "Version": "1.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5899f1eaa825580172bb56c08266f37c", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8552d117e1b808b09a832f589b79035", + "Requirements": [ + "cli", + "glue", + "rlang" + ] + }, + "lubridate": { + "Package": "lubridate", + 
"Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "680ad542fbcf801442c83a6ac5a2126c", + "Requirements": [ + "generics", + "timechange" + ] + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7ce2733a9826b3aeb1775d56fd305472", + "Requirements": [] + }, + "pillar": { + "Package": "pillar", + "Version": "1.10.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1098920a19b5cd5a15bacdc74a89979d", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "utf8", + "vctrs" + ] + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "01f28d4278f15c76cddbea05899c5d6f", + "Requirements": [] + }, + "purrr": { + "Package": "purrr", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cc8b5d43f90551fa6df0a6be5d640a4f", + "Requirements": [ + "cli", + "lifecycle", + "magrittr", + "rlang", + "vctrs" + ] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "892124978869b74935dc3934c42bfe5a", + "Requirements": [] + }, + "tibble": { + "Package": "tibble", + "Version": "3.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "784b27d0801c3829de602105757b2cd7", + "Requirements": [ + "cli", + "lifecycle", + "magrittr", + "pillar", + "pkgconfig", + "rlang", + "vctrs" + ] + }, + "tidyr": { + "Package": "tidyr", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d8b95b7fee945d7da6888cf7eb71a49c", + 
"Requirements": [ + "cpp11", + "dplyr", + "ellipsis", + "glue", + "lifecycle", + "magrittr", + "purrr", + "rlang", + "tibble", + "tidyselect", + "vctrs" + ] + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "829f27b9c4919c16b593794a6344d6c0", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ] + }, + "timechange": { + "Package": "timechange", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8548b44f79a35ba1791308b61e6012d7", + "Requirements": [ + "cpp11" + ] + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d526d558be176e9ceb68c3d1e83479b7", + "Requirements": [] + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c03fa420630029418f7e6da3667aac4a", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang" + ] + }, + "withr": { + "Package": "withr", + "Version": "3.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cc2d62c76458d425210d1eb1478b30b4", + "Requirements": [] + } + } +} diff --git a/flow/flow.suna.logfiles.fill/wrap.suna.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R similarity index 95% rename from flow/flow.suna.logfiles.fill/wrap.suna.logfiles.fill.R rename to flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index 8ed73ad36..5b077aa92 100644 --- a/flow/flow.suna.logfiles.fill/wrap.suna.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -23,14 +23,12 @@ #' @param SchmDataOut (optional), A json-formatted character string containing the schema for the output data #' file. If this input is not provided, the output schema for the data will be the same as the input data #' file. 
If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF -#' THE INPUT DATA. Note that you will need to distinguish between the aquatroll200 (outputs conductivity) and the -#' leveltroll500 (does not output conductivity) in your schema. +#' THE INPUT DATA. #' #' @param SchmFlagsOut (optional), A json-formatted character string containing the schema for the output flags #' file. If this input is not provided, the output schema for the data will be the same as the input flags #' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF -#' THE INPUT DATA. Note that you will need to distinguish between the aquatroll200 (outputs conductivity) and the -#' leveltroll500 (does not output conductivity) in your schema. +#' THE INPUT DATA. #' #' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log #' output. Defaults to NULL, in which the logger will be created and used within the function. 
See NEONprocIS.base::def.log.init @@ -45,21 +43,13 @@ #' #' @examples #' # Not run -DirInLogs<-"~/pfs/sunav2_logs_output/sunav2/2024/09/10/20349" #cleaned log data -DirInStream<-"~/pfs/sunav2_data_source_trino/sunav2/2024/09/10/20349" #streamed L0 data -DirIn<-NULL -DirOutBase="~/pfs/sunav2_filled_output" -SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2.avsc'),collapse='') -log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# DirInLogs<-"~/pfs/sunav2_logs_output/sunav2/2024/09/10/20349" #cleaned log data +# DirInStream<-"~/pfs/sunav2_data_source_trino/sunav2/2024/09/10/20349" #streamed L0 data +# DirIn<-NULL +# DirOutBase="~/pfs/sunav2_filled_output" +# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2.avsc'),collapse='') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_log_flags.avsc'),collapse='') -# wrap.troll.logfiles.fill( -# DirInLogs=DirInLogs, -# DirInStream=DirInStream, -# DirIn=DirIn, -# DirOutBase="~/pfs/out", -# SchmDataOut="~/pfs/aquatroll200_avro_schemas/aquatroll200/aquatroll200_log_data.avsc", -# SchmFlagsOut=SchmFlagsOut, -# log=log) #' #' @changelog #' Nora Catolico (2024-01-30) original creation diff --git a/flow/flow.sunav2.logfiles/Dockerfile b/flow/flow.sunav2.logfiles/Dockerfile new file mode 100644 index 000000000..b417409b1 --- /dev/null +++ b/flow/flow.sunav2.logfiles/Dockerfile @@ -0,0 +1,20 @@ +# Dockerfile for NEON IS Data Processing - sunav2 Logfile Processing + +# Start with the neon-is-base-r image. 
+FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 + +ARG FLOW_DIR="./flow" +ARG APP_DIR="flow.sunav2.logfiles" + +# maintainer handle +MAINTAINER "Nora Catolico" ncatolico@battelleecology.org + +# Copy the lockfile and restore known working versions of R dependency packages +# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image +COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock +RUN R -e 'renv::restore(lockfile="/renv.lock")' + +# Copy in sunav2 flag workflow +COPY ${FLOW_DIR}/${APP_DIR}/flow.sunav2.logfiles.R . +COPY ${FLOW_DIR}/${APP_DIR}/wrap.sunav2.logfiles.R . + diff --git a/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R b/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R new file mode 100644 index 000000000..b84cc09db --- /dev/null +++ b/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R @@ -0,0 +1,149 @@ +############################################################################################## +#' @title Workflow for SUNA Log File Processing + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} +#' Bobby Hensley \email{hensley@battelleecology.org} + +#' @description Workflow. Validates, cleans, and formats sunav2 log files into daily parquets. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", The input path to the data from a single source ID, structured as follows: +#' #/pfs/BASE_REPO/source-id.The source-id folder may have multiple csv log files. +#' The source-id is the unique identifier of the sensor.#' +#' +#' 2. "DirOut=value", where the value is the output path that will replace the #/pfs/BASE_REPO portion +#' of DirIn. +#' +#' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of \code{DirIn}. +#' +#' 4. "FileSchmData=value" (optional), where values is the full path to the avro schema for the output data +#' file. 
If this input is not provided, the output schema for the data will be the same as the input data
+#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF
+#' THE INPUT DATA.
+#'
+#'
+#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}},
+#' which uses system environment variables if available.
+#'
+#' @return Cleaned sunav2 log files in daily parquets.
+
+#' @references
+#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007
+
+#' @keywords Currently none
+
+#' @examples
+#' flow.sunav2.logfiles <- function(FileIn = "~/pfs/sunav2_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv",
+#' DirOut="~/pfs/out",
+#' SchmDataOut=NULL,
+#' log=log)
+#' Stepping through the code in R studio
+# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/sunav2_logjam_load_files/20349')
+# log <- NEONprocIS.base::def.log.init(Lvl = "debug")
+# arg <- c("DirIn=$DIR_IN","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums")
+#' rm(list=setdiff(ls(),c('arg','log')))
+
+#' @seealso None currently
+
+# changelog and author contributions / copyrights
+#' Nora Catolico (2024-01-09) original creation
+#' Bobby Hensley (2025-04-09) adapted for SUNA
+#
+##############################################################################################
+options(digits.secs = 3)
+library(foreach)
+library(doParallel)
+library(lubridate)
+
+# Source the wrapper function. 
Assume it is in the working directory +source("./wrap.sunav2.logfiles.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn", "DirOut","DirErr"), + NameParaOptn = c("FileSchmData"),log = log) + +# Echo arguments +log$debug(base::paste0('Input directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0('Schema for output data: ', Para$FileSchmData)) + + +# Read in the schemas so we only have to do it once and not every time in the avro writer. +if(base::is.null(Para$FileSchmData) || Para$FileSchmData == 'NA'){ + SchmDataOut <- NULL +} else { + SchmDataOut <- base::paste0(base::readLines(Para$FileSchmData),collapse='') +} + +# Find all the input paths (datums). We will process each one. +DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = NULL, + log = log) + +# Take stock of our data files. 
+fileData <- base::list.files(DirIn,full.names=TRUE) +log$debug(base::paste0('Files identified:', fileData)) + + +# Process each datum path +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxFileIn = fileData) %dopar% { + log$info(base::paste0('Processing path to file: ', idxFileIn)) + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.sunav2.logfiles( + FileIn=idxFileIn, + DirOut=Para$DirOut, + SchmDataOut=SchmDataOut, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + log$error(err$message) + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(idxFileIn, + log = log) + DirSub <- strsplit(InfoDirIn$dirRepo,".", fixed = TRUE)[[1]][1] + NEONprocIS.base::def.dir.crea(DirBgn = Para$DirErr, DirSub = DirSub, + log = log) + csvname <- DirSub %>% + strsplit( "/" ) %>% + sapply( tail, 1 ) + nameFileErr <- base::paste0(Para$DirErr, DirSub, "/",csvname) + log$info(base::paste0("Re-routing failed datum path to ", nameFileErr)) + con <- base::file(nameFileErr, "w") + base::close(con) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + return() +} + + + + diff --git a/flow/flow.sunav2.logfiles/renv.lock b/flow/flow.sunav2.logfiles/renv.lock new file mode 100644 index 000000000..7283865a0 --- /dev/null +++ b/flow/flow.sunav2.logfiles/renv.lock @@ -0,0 +1,101 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2720e3fd3dad08f34b19b56b3d6f073d", + "Requirements": [] + }, + "doParallel": { + "Package": 
"doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "680ad542fbcf801442c83a6ac5a2126c", + "Requirements": [ + "generics", + "timechange" + ] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "timechange": { + "Package": "timechange", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8548b44f79a35ba1791308b61e6012d7", + "Requirements": [ + "cpp11" + ] + } + } +} diff --git a/flow/flow.suna.logfiles/wrap.suna.logfiles.R b/flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R similarity index 98% rename from flow/flow.suna.logfiles/wrap.suna.logfiles.R rename to flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R index aaa1ce938..402dea047 100644 --- a/flow/flow.suna.logfiles/wrap.suna.logfiles.R +++ b/flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R @@ -31,10 +31,10 @@ #' #' @examples #' # Not run -FileIn <- 
"~/pfs/sunav2_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv" -DirOut="~/pfs/sunav2_logs_output" -SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2.avsc'),collapse='') -log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# FileIn <- "~/pfs/sunav2_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv" +# DirOut="~/pfs/sunav2_logs_output" +# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2.avsc'),collapse='') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") #' wrap.suna.logfiles <- function(FileIn = "~/pfs/suna_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv", #' DirOut="~/pfs/out", #' SchmDataOut=NULL, diff --git a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml new file mode 100644 index 000000000..f8337ec9b --- /dev/null +++ b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml @@ -0,0 +1,49 @@ +--- +pipeline: + name: nitrate_cron_monthly_and_pub_control +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pub-cntl:v1.1.0 + cmd: ["/bin/bash"] + env: + # START_MONTH and END_MONTH indicate the date range (inclusive) to create the /Y/M folder structure + # START_DATE must be set, format "YYYY-MM" + # END_DATE can be set or unset (comment or remove line to unset). If unset, end month will be last month. + OUT_PATH: /pfs/out + START_MONTH: "2024-09" + #END_MONTH: "2024-09" # Inclusive. 
Run the pipeline with END_MONTH set to initialize, then comment out and update pipeline (no reprocess) to let the cron take over + stdin: + - "#!/bin/bash" + - ./cron_monthly_and_pub_control/populate_pub_months.sh +input: + # Choose a monthly cron date to be something sufficiently after the 1st to allow kafka lag and timeseries pad + cron: + name: tick + spec: "0 7 5 * *" # Run at 00:00 MST (07:00 GMT) on the 5th of the month + overwrite: true +autoscaling: true +resource_requests: + memory: 64M + cpu: 0.1 +resource_limits: + memory: 200M + cpu: 1 +sidecar_resource_requests: + memory: 200M + cpu: 0.1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml new file mode 100644 index 000000000..af0596985 --- /dev/null +++ b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml @@ -0,0 +1,55 @@ +--- +pipeline: + name: troll_logjam_assign_clean_files +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - Rscript + ./flow.troll.logfiles.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-troll-logfiles:v1.0.1 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + env: + LOG_LEVEL: INFO + ERR_PATH: /pfs/out/errored_datums +input: + cross: + - pfs: + name: DIR_IN + repo: troll_logjam_load_files + glob: /* +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 1G + cpu: 1.5 +resource_limits: + memory: 1.5G + cpu: 2 +sidecar_resource_requests: + memory: 3G + cpu: 1 +datum_set_spec: + number: 5 
+scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/sunav2/sunav2_logjam_list_files.yaml b/pipe/sunav2/sunav2_logjam_list_files.yaml new file mode 100644 index 000000000..5e2dc0b02 --- /dev/null +++ b/pipe/sunav2/sunav2_logjam_list_files.yaml @@ -0,0 +1,48 @@ +--- +pipeline: + name: sunav2_logjam_list_files +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-logjam-loader:v2.0.0 + cmd: ["/bin/bash"] + env: + LOGJAM_INGEST_BUCKET: neon-nonprod-is-logjam-ingest + OUT_PATH: /pfs/out + LOG_LEVEL: DEBUG + stdin: + - "#!/bin/bash" + - python3 --version + - python3 -c "import environs; print(environs.__version__)" + - python3 -c "import marshmallow; print(marshmallow.__version__)" + - python3 -m logjam_loader.logjam_loader +input: + pfs: + repo: nitrate_cron_monthly_and_pub_control_tick + glob: /* + empty_files: true +autoscaling: true +resource_requests: + memory: 1G + cpu: 1 +resource_limits: + memory: 1.5G + cpu: 1.5 +sidecar_resource_requests: + memory: 1G + cpu: 0.4 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/sunav2/sunav2_logjam_load_files.yaml b/pipe/sunav2/sunav2_logjam_load_files.yaml new file mode 100644 index 000000000..f981ce62b --- /dev/null +++ b/pipe/sunav2/sunav2_logjam_load_files.yaml @@ -0,0 +1,59 @@ +--- +pipeline: + 
name: sunav2_logjam_load_files +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-logjam-loader:v2.0.0 + cmd: + - /bin/bash + stdin: + - '#!/bin/bash' + - python3 --version + - python3 -c "import environs; print(environs.__version__)" + - python3 -c "import marshmallow; print(marshmallow.__version__)" + - python3 -m logjam_loader.load_all_logjam_files + + env: + LOGJAM_INGEST_BUCKET: neon-nonprod-is-logjam-ingest + OUT_PATH: /pfs/out + LOG_LEVEL: DEBUG + STARTING_PATH_INDEX: "7" + secrets: + - name: pdr-secret + mount_path: /var/db_secret +input: + pfs: + name: IN_PATH + repo: sunav2_logjam_list_files + glob: /*/*/*/logjam_dev/sunav2/ + empty_files: true +parallelism_spec: + constant: 10 +autoscaling: true +resource_requests: + memory: 500M + cpu: 0.5 +resource_limits: + memory: 1G + cpu: 1.5 +sidecar_resource_requests: + memory: 2G + cpu: 0.2 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } From efa81a18f3b256be593758ec3468bddb6c3a173b Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 27 Jun 2025 10:21:32 -0600 Subject: [PATCH 018/182] latest --- pipe/sunav2/sunav2_logjam_list_files.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pipe/sunav2/sunav2_logjam_list_files.yaml b/pipe/sunav2/sunav2_logjam_list_files.yaml index 5e2dc0b02..cd52d3be5 100644 --- a/pipe/sunav2/sunav2_logjam_list_files.yaml +++ b/pipe/sunav2/sunav2_logjam_list_files.yaml @@ -11,13 +11,11 @@ transform: stdin: - "#!/bin/bash" - python3 --version - - python3 -c "import environs; print(environs.__version__)" - - python3 -c "import marshmallow; 
print(marshmallow.__version__)" - python3 -m logjam_loader.logjam_loader input: pfs: repo: nitrate_cron_monthly_and_pub_control_tick - glob: /* + glob: /(*-*)-* empty_files: true autoscaling: true resource_requests: From 13c8203d2a164b5d501ea52d97389adc36db6563 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 27 Jun 2025 11:18:52 -0600 Subject: [PATCH 019/182] latest --- pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml | 6 +++--- pipe/sunav2/sunav2_logjam_assign_clean_files.yaml | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml index f8337ec9b..56db88a3f 100644 --- a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml +++ b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml @@ -9,8 +9,8 @@ transform: # START_DATE must be set, format "YYYY-MM" # END_DATE can be set or unset (comment or remove line to unset). If unset, end month will be last month. OUT_PATH: /pfs/out - START_MONTH: "2024-09" - #END_MONTH: "2024-09" # Inclusive. Run the pipeline with END_MONTH set to initialize, then comment out and update pipeline (no reprocess) to let the cron take over + START_MONTH: "2024-08" + # END_MONTH: "2024-09" # Inclusive. 
Run the pipeline with END_MONTH set to initialize, then comment out and update pipeline (no reprocess) to let the cron take over stdin: - "#!/bin/bash" - ./cron_monthly_and_pub_control/populate_pub_months.sh @@ -18,7 +18,7 @@ input: # Choose a monthly cron date to be something sufficiently after the 1st to allow kafka lag and timeseries pad cron: name: tick - spec: "0 7 5 * *" # Run at 00:00 MST (07:00 GMT) on the 5th of the month + spec: "0 7 10 * *" # Run at 00:00 MST (07:00 GMT) on the 10th of the month overwrite: true autoscaling: true resource_requests: diff --git a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml index af0596985..354b55f11 100644 --- a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml +++ b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml @@ -1,16 +1,16 @@ --- pipeline: - name: troll_logjam_assign_clean_files + name: sunav2_logjam_assign_clean_files transform: cmd: ["/bin/bash"] stdin: - "#!/bin/bash" - Rscript - ./flow.troll.logfiles.R + ./flow.sunav2.logfiles.R DirIn=$DIR_IN DirOut=/pfs/out DirErr=$ERR_PATH - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-troll-logfiles:v1.0.1 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:v1.0.1 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: @@ -20,7 +20,7 @@ input: cross: - pfs: name: DIR_IN - repo: troll_logjam_load_files + repo: sunav2_logjam_load_files glob: /* parallelism_spec: constant: 5 From b2df2d3cf9aad19f1cbe675227faad137116b79b Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 27 Jun 2025 11:50:08 -0600 Subject: [PATCH 020/182] latest --- flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R b/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R index b84cc09db..18a6ec8e2 100644 --- a/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R 
+++ b/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R @@ -41,9 +41,9 @@ #' SchmDataOut=NULL, #' log=log) #' Stepping through the code in R studio -Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/sunav2_logjam_load_files/20349') -log <- NEONprocIS.base::def.log.init(Lvl = "debug") -arg <- c("DirIn=$DIR_IN","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums") +# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/sunav2_logjam_load_files/20349') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=$DIR_IN","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums") #' rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently From 75cbee26394063712640905eddd42f86ff50e820 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 27 Jun 2025 12:24:10 -0600 Subject: [PATCH 021/182] latest --- flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R | 6 +++--- flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R b/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R index 18a6ec8e2..b84cc09db 100644 --- a/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R +++ b/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R @@ -41,9 +41,9 @@ #' SchmDataOut=NULL, #' log=log) #' Stepping through the code in R studio -# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/sunav2_logjam_load_files/20349') -# log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- c("DirIn=$DIR_IN","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums") +Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/sunav2_logjam_load_files/20349') +log <- NEONprocIS.base::def.log.init(Lvl = "debug") +arg <- c("DirIn=$DIR_IN","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums") #' rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently diff --git a/flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R b/flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R index 402dea047..796ce8a32 100644 --- 
a/flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R +++ b/flow/flow.sunav2.logfiles/wrap.sunav2.logfiles.R @@ -35,7 +35,7 @@ # DirOut="~/pfs/sunav2_logs_output" # SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -#' wrap.suna.logfiles <- function(FileIn = "~/pfs/suna_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv", +#' wrap.sunav2.logfiles <- function(FileIn = "~/pfs/sunav2_logjam_load_files/20349/logjam_prod_20349_0b05a4c0da3bb05af840fece674fe34c.csv", #' DirOut="~/pfs/out", #' SchmDataOut=NULL, #' log=log) @@ -44,7 +44,7 @@ #' Nora Catolico (2024-01-09) original creation #' Bobby Hensley (2025-04-09) adapted for SUNA ############################################################################################## -wrap.suna.logfiles <- function(FileIn, +wrap.sunav2.logfiles <- function(FileIn, DirOut, SchmDataOut=NULL, log=NULL From 431b28e13683b339f84768a10b7b2946b6186dfa Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 27 Jun 2025 12:30:45 -0600 Subject: [PATCH 022/182] latest --- flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R | 2 +- pipe/sunav2/sunav2_logjam_assign_clean_files.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index 5b077aa92..c629ca530 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -56,7 +56,7 @@ #' Bobby Hensley (2025-05-30) adapted for suna #' ############################################################################################## -wrap.suna.logfiles.fill <- function(DirInLogs=NULL, +wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, DirInStream=NULL, DirIn, DirOutBase, diff --git a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml 
b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml index 354b55f11..0942307a7 100644 --- a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml +++ b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml @@ -10,7 +10,7 @@ transform: DirIn=$DIR_IN DirOut=/pfs/out DirErr=$ERR_PATH - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:v1.0.1 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:sha-75cbee2 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: From 4982871accbd1921e37dff149b2954bff00810a7 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 27 Jun 2025 13:02:47 -0600 Subject: [PATCH 023/182] latest --- pipe/sunav2/pipe_list_sunav2.txt | 7 + .../sunav2/sunav2_calibration_assignment.json | 47 ----- .../sunav2/sunav2_calibration_assignment.yaml | 2 +- pipe/sunav2/sunav2_fill_log_files.yaml | 196 ++++++++++++++++++ ...nav2_location_active_dates_assignment.json | 47 ----- .../sunav2_location_asset_assignment.json | 47 ----- .../sunav2_logjam_assign_clean_files.yaml | 2 +- .../sunav2/sunav2_merge_data_by_location.json | 34 --- .../sunav2_structure_repo_by_location.json | 32 --- 9 files changed, 205 insertions(+), 209 deletions(-) delete mode 100644 pipe/sunav2/sunav2_calibration_assignment.json create mode 100644 pipe/sunav2/sunav2_fill_log_files.yaml delete mode 100644 pipe/sunav2/sunav2_location_active_dates_assignment.json delete mode 100644 pipe/sunav2/sunav2_location_asset_assignment.json delete mode 100644 pipe/sunav2/sunav2_merge_data_by_location.json delete mode 100644 pipe/sunav2/sunav2_structure_repo_by_location.json diff --git a/pipe/sunav2/pipe_list_sunav2.txt b/pipe/sunav2/pipe_list_sunav2.txt index 9fb24f30a..d446a73fd 100644 --- a/pipe/sunav2/pipe_list_sunav2.txt +++ b/pipe/sunav2/pipe_list_sunav2.txt @@ -4,8 +4,15 @@ sunav2_logjam_load_files.yaml sunav2_logjam_assign_clean_files.yaml sunav2_data_source_kafka.yaml sunav2_data_source_trino.yaml 
+sunav2_fill_log_files.yaml sunav2_calibration_list_files.yaml sunav2_calibration_loader.yaml +sunav2_calibration_assignment.yaml +sunav2_calibration_group_and_convert.yaml + + + + sunav2_location_asset.yaml sunav2_location_loader.yaml sunav2_calibration_assignment.yaml diff --git a/pipe/sunav2/sunav2_calibration_assignment.json b/pipe/sunav2/sunav2_calibration_assignment.json deleted file mode 100644 index 0671e27e0..000000000 --- a/pipe/sunav2/sunav2_calibration_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_calibration_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.cal.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "PadDay=-1|1" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v2.0.2", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "calibration", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "210M", - "cpu": 0.3 - }, - "parallelism_spec": { - "constant": "4" - } -} diff --git a/pipe/sunav2/sunav2_calibration_assignment.yaml b/pipe/sunav2/sunav2_calibration_assignment.yaml index cf7072399..42fd8d720 100644 --- a/pipe/sunav2/sunav2_calibration_assignment.yaml +++ b/pipe/sunav2/sunav2_calibration_assignment.yaml @@ -12,7 +12,7 @@ transform: DirOut=/pfs/out DirErr=$ERR_PATH FileYear=$FILE_YEAR - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v1.0.6 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v2.0.2 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml new file 
mode 100644 index 000000000..f8d22cdfb --- /dev/null +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -0,0 +1,196 @@ +--- +pipeline: + name: sunav2_fill_log_files +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles-fill:sha-431b28e + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + # + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf /tmp/kafka_merged + rm -rf $OUT_PATH_LIMIT_LOGFILES + rm -rf $OUT_PATH_JOIN_SOURCES + mkdir -p /tmp/kafka_merged # Filter joiner relies on the same path positions among inputs (i.e. repo name in 2nd position) + mkdir -p $OUT_PATH_LIMIT_LOGFILES # Filter joiner relies on the same path positions among inputs (i.e. repo name in 2nd position) + mkdir -p $OUT_PATH_JOIN_SOURCES # R modules must have pfs in the repo structure + # + # Check if there is any data (could just be the DATE_CONTROL, in which case we'll skip) + data="F" + if [ ${DATA_PATH_KAFKA+x} ]; then + data="T" + fi + if [ ${DATA_PATH_TRINO+x} ]; then + data="T" + fi + if [ ${DATA_PATH_LOG+x} ]; then + data="T" + fi + if [ $data = "F" ]; then + echo "No actual data in datum. Skipping..." + exit 0 + fi + # + # Get source type + path_glob="/pfs/DATA_PATH_*/*/" + for path in $path_glob; do + # Parse the path + [[ "$path" =~ ^/pfs/DATA_PATH_(.*)/(.*)/$ ]] + source_type="${BASH_REMATCH[2]}" + done + # + # If we have log files, limit them to the dates in the date_control pipeline + echo "Running filter-joiner to limit log files" + export CONFIG=$CONFIG_LIMIT_LOGFILES + export OUT_PATH=$OUT_PATH_LIMIT_LOGFILES + python3 -m filter_joiner.filter_joiner_main + # + # If data come from Kafka, run the Kafka-merger (could be multiple files) + if [ ${DATA_PATH_KAFKA+x} ]; then + # Data from kafka. 
+ # Run kafka combiner + Rscript ./flow.kfka.comb.R \ + DirIn=$DATA_PATH_KAFKA \ + DirOut=/tmp/kafka_merged \ + DirErr=/pfs/out/errored_datums + fi + # Run the filter joiner to merge files from all sources. + echo "Running filter-joiner to merge all data sources" + export CONFIG=$CONFIG_JOIN_SOURCES + export OUT_PATH=$OUT_PATH_JOIN_SOURCES + python3 -m filter_joiner.filter_joiner_main + # + # Run log filler script + Rscript ./flow.sunav2.logfiles.fill.R \ + DirIn=$OUT_PATH_JOIN_SOURCES \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums + EOF + env: + # Environment variables for filter-joiner. + # Ensure the path for the kafka data is listed prior to that for the archive data. When a conflict arises, + # such as when Kafka re-streams data, the Kafka data will take precedence because it is + # the latest and greatest. + CONFIG_LIMIT_LOGFILES: | + --- + # Configuration for filter-joiner module that will limit log files to the dates in + # the date control pipeline + # Make sure the DATE_CONTROL path is second. We actually don't want these files and + # they won't be copied if log files for the site are present + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. + input_paths: + - path: + name: DATA_PATH_LOG + # Filter for data directory + glob_pattern: /pfs/DATA_PATH_LOG/*/*/*/** + # Join on y/m/d and sourceID + join_indices: [3,4,5,6] + outer_join: False + - path: + name: DATE_CONTROL + # Filter for data directory + glob_pattern: /pfs/DATE_CONTROL/*/*/*/** + # Join on y/m/d and sourceID + join_indices: [3,4,5,6] + outer_join: False + CONFIG_JOIN_SOURCES: | + --- + # Configuration for filter-joiner module that will bring together all sources of data + # Make sure the DATA_PATH_LOG path is second. Any site files from the date_control pipeline + # won't be copied if there are files from the archive, kafka, or the log files. 
+ # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. + input_paths: + - path: + name: DATA_PATH_KAFKA + # Filter for data directory + glob_pattern: /tmp/kafka_merged/*/*/*/*/*/** + # Join on named location (already joined below by day) + join_indices: [3,4,5,6] + outer_join: true + - path: + name: DATA_PATH_ARCHIVE + # Filter for data directory + glob_pattern: /pfs/DATA_PATH_ARCHIVE/*/*/*/** + # Join on y/m/d and sourceID + join_indices: [3,4,5,6] + outer_join: True + - path: + name: DATA_PATH_LOG + # Filter for data directory + glob_pattern: /tmp/log_limited/*/*/*/** + # Join on y/m/d and sourceID + join_indices: [3,4,5,6] + outer_join: True + OUT_PATH_LIMIT_LOGFILES: /tmp/log_limited + OUT_PATH_JOIN_SOURCES: /tmp/pfs/filter_joined # Note that R modules use "pfs" in the path structure to determine datums + LOG_LEVEL: DEBUG + RELATIVE_PATH_INDEX: "3" # Must be consistent across inputs + LINK_TYPE: COPY # options are COPY or SYMLINK. MUST BE SIMLINK IF USING COMBINED MODULE. +input: + join: + - pfs: + name: DATA_PATH_TRINO + repo: sunav2_data_source_trino + glob: /(*/*/*/*) #sunav2/Y/M/D + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + outer_join: true + - pfs: + name: DATA_PATH_KAFKA + repo: sunav2_data_source_kafka + glob: /(*/*/*/*) + joinOn: $1 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
+ outer_join: true + - pfs: + name: DATA_PATH_LOG + repo: sunav2_logjam_assign_clean_files + glob: /(*/*/*/*) #sunav2/Y/M/D + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + outer_join: true + - pfs: + name: DATE_CONTROL + repo: sunav2_cron_daily_and_date_control + glob: /(*/*/*/*) #sunav2/Y/M/D + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + outer_join: true +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 400M + cpu: 1.5 +resource_limits: + memory: 800M + cpu: 2 +sidecar_resource_requests: + memory: 2G + cpu: 0.3 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/sunav2/sunav2_location_active_dates_assignment.json b/pipe/sunav2/sunav2_location_active_dates_assignment.json deleted file mode 100644 index 44350351f..000000000 --- a/pipe/sunav2/sunav2_location_active_dates_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_location_active_dates_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.grp.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "TypeFile=namedLocation" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.1", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "location_loader", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": 
false, - "standby": true, - "resource_requests": { - "memory": "200M", - "cpu": 1 - }, - "parallelism_spec": { - "constant": "8" - } -} diff --git a/pipe/sunav2/sunav2_location_asset_assignment.json b/pipe/sunav2/sunav2_location_asset_assignment.json deleted file mode 100644 index 2ea094c2a..000000000 --- a/pipe/sunav2/sunav2_location_asset_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_location_asset_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.grp.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "TypeFile=asset" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.1", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "location_asset", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "210M", - "cpu": 0.3 - }, - "parallelism_spec": { - "constant": "8" - } -} diff --git a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml index 0942307a7..891ab896f 100644 --- a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml +++ b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml @@ -14,7 +14,7 @@ transform: # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: - LOG_LEVEL: INFO + LOG_LEVEL: DEBUG ERR_PATH: /pfs/out/errored_datums input: cross: diff --git a/pipe/sunav2/sunav2_merge_data_by_location.json b/pipe/sunav2/sunav2_merge_data_by_location.json deleted file mode 100644 index bfdfea21f..000000000 --- a/pipe/sunav2/sunav2_merge_data_by_location.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_merge_data_by_location" - }, - "transform": { - "cmd": [ - "Rscript", - 
"./flow.loc.data.trnc.comb.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "DirSubCombData=data", - "DirSubCopy=location|calibration" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-data-trnc-comb:v1.1.1", - "env": { - "LOG_LEVEL": "INFO", - "PARALLELIZATION_INTERNAL": "1" - } - }, - "input": { - "pfs": { - "name": "DIR_IN", - "repo": "sunav2_structure_repo_by_location", - "glob": "/sunav2/*/*/*" - } - }, - "enable_stats": false, - "standby": false, - "resource_requests": { - "memory": "80M", - "cpu": 0.3 - } -} diff --git a/pipe/sunav2/sunav2_structure_repo_by_location.json b/pipe/sunav2/sunav2_structure_repo_by_location.json deleted file mode 100644 index edb59b29c..000000000 --- a/pipe/sunav2/sunav2_structure_repo_by_location.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_structure_repo_by_location" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.repo.strc.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "Comb=TRUE" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-repo-strc:v1.0.7", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "pfs": { - "name": "DIR_IN", - "repo": "sunav2_calibrated_location_group", - "glob": "/sunav2/*/*/*" - } - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "100M", - "cpu": 0.13 - } -} From 8f87d5ca610ea4231b84204fcaf9d8229d891742 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 30 Jun 2025 09:22:25 -0600 Subject: [PATCH 024/182] latest --- .../wrap.sunav2.logfiles.fill.R | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index c629ca530..54f408cf7 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ 
b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -43,10 +43,10 @@ #' #' @examples #' # Not run -# DirInLogs<-"~/pfs/sunav2_logs_output/sunav2/2024/09/10/20349" #cleaned log data +# DirInLogs<-"~/pfs/sunav2_logjam_assign_clean_files/sunav2/2024/09/10/20349" #cleaned log data # DirInStream<-"~/pfs/sunav2_data_source_trino/sunav2/2024/09/10/20349" #streamed L0 data # DirIn<-NULL -# DirOutBase="~/pfs/sunav2_filled_output" +# DirOutBase="~/pfs/out" # SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_log_flags.avsc'),collapse='') @@ -230,14 +230,6 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, #' Write out data - - - - - - - - #write out data fileOutSplt <- base::strsplit(DirInStream,'[/]')[[1]] # Separate underscore-delimited components of the file name asset<-tail(x=fileOutSplt,n=1) @@ -254,7 +246,7 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, } #write out flags - csv_name_flags <-paste0(sensor,'_',asset,'_',format(timeBgn,format = "%Y-%m-%d"),'_logFlags') + csv_name_flags <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d"),'_logFlags') rptOutFlags <- try(NEONprocIS.base::def.wrte.parq(data = flagsOut, NameFile = base::paste0(DirOutFlags,'/',csv_name_flags,".parquet"), From 1a9c472f5f11336ff3e693808ff6af2e5455038f Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Mon, 30 Jun 2025 10:59:30 -0600 Subject: [PATCH 025/182] Added code to generate SUNA log QF output file. 
--- .../wrap.sunav2.logfiles.fill.R | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index 54f408cf7..3e3ca1911 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -223,14 +223,24 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, "check_sum","error_missing_data")] #' Determine whether to use logged or streamed data. - #' Preference is to use logged data if available - if(!is.null(logData)){dataOut<-logData} - if(is.null(logData) & !is.null(L0DataParsed)){dataOut<-L0DataParsed} - if(is.null(logData) & is.null(L0DataParsed)){dataOut<-L0DataParsed} + #' Logged data is used if available, and log data flag set to 1 + if(!is.null(logData)){ + dataOut<-logData + flagsOut<-data.frame(matrix(ncol=2,nrow=nrow(dataOut), dimnames=list(NULL, c("readout_time", "sunaLogDataQF")))) + flagsOut$readout_time<-dataOut$readout_time + flagsOut$sunaLogDataQF<-1 + } + #' Streamed data is used if no logged data is available, and log data flags set to 0 + if(is.null(logData) & !is.null(L0DataParsed)){ + dataOut<-L0DataParsed + flagsOut<-data.frame(matrix(ncol=2,nrow=nrow(dataOut), dimnames=list(NULL, c("readout_time", "sunaLogDataQF")))) + flagsOut$readout_time<-dataOut$readout_time + flagsOut$sunaLogDataQF<-0 + } -#' Write out data +#' Write out data file and log flags file - #write out data + #write out data file fileOutSplt <- base::strsplit(DirInStream,'[/]')[[1]] # Separate underscore-delimited components of the file name asset<-tail(x=fileOutSplt,n=1) csv_name <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d"),'_filled') @@ -245,7 +255,7 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, log$info(base::paste0('Data written successfully in ', base::paste0(DirOutData,'/',csv_name,".parquet"))) } - #write out flags + 
#write out log flags file csv_name_flags <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d"),'_logFlags') rptOutFlags <- try(NEONprocIS.base::def.wrte.parq(data = flagsOut, From dd94e8cd36d530b4067d1963ccfe893704ad041e Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 30 Jun 2025 12:14:27 -0600 Subject: [PATCH 026/182] combined suna log fill module --- .../troll_logs_group_and_fill/Dockerfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules_combined/troll_logs_group_and_fill/Dockerfile b/modules_combined/troll_logs_group_and_fill/Dockerfile index 44f6b6507..6b8625a32 100644 --- a/modules_combined/troll_logs_group_and_fill/Dockerfile +++ b/modules_combined/troll_logs_group_and_fill/Dockerfile @@ -1,6 +1,6 @@ -# Dockerfile for NEON IS Data Processing - Troll logs fill module combined with filter-joiner +# Dockerfile for NEON IS Data Processing - SUNA logs fill module combined with filter-joiner # Example command (must be run from project parent directory to include modules/ and flow/ paths in Docker context): -# docker build -t neon-is-troll-logs-group-fill -f ./modules_combined/troll_logs_group_and_fill/Dockerfile . +# docker build -t neon-is-sunav2-logs-group-fill -f ./modules_combined/sunav2_logs_group_and_fill/Dockerfile . # Start with the base R image. FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 @@ -48,15 +48,15 @@ COPY ./flow/flow.kfka.comb/wrap.kfka.comb.R . # Build in the log file filler module ARG MODULE_DIR="flow" -ARG APP_DIR_1="flow.troll.logfiles.fill" +ARG APP_DIR_1="flow.sunav2.logfiles.fill" # Copy the lockfile and restore known working versions of R dependency packages -COPY ./flow/flow.troll.logfiles.fill/renv.lock . +COPY ./flow/flow.sunav2.logfiles.fill/renv.lock . RUN R -e 'renv::restore(lockfile="./renv.lock")' # Copy in R code -COPY ./flow/flow.troll.logfiles.fill/flow.troll.logfiles.fill.R . 
-COPY ./flow/flow.troll.logfiles.fill/wrap.troll.logfiles.fill.R . +COPY ./flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R . +COPY ./flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R . # Run as app user USER appuser From d48d008f13a829ff3910fc0603dff433be12cf6d Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 30 Jun 2025 12:18:58 -0600 Subject: [PATCH 027/182] combined module git action --- pipe/sunav2/sunav2_fill_log_files.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index f8d22cdfb..cc807051b 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles-fill:sha-431b28e + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles-fill:sha-1a9c472 cmd: - sh - "-c" @@ -115,9 +115,9 @@ transform: join_indices: [3,4,5,6] outer_join: true - path: - name: DATA_PATH_ARCHIVE + name: DATA_PATH_TRINO # Filter for data directory - glob_pattern: /pfs/DATA_PATH_ARCHIVE/*/*/*/** + glob_pattern: /pfs/DATA_PATH_TRINO/*/*/*/** # Join on y/m/d and sourceID join_indices: [3,4,5,6] outer_join: True From 91a3f4da5f41b920306d460dacf26f35aae4e997 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 30 Jun 2025 12:19:33 -0600 Subject: [PATCH 028/182] combined module --- .../sunav2_logs_group_and_fill/Dockerfile | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 modules_combined/sunav2_logs_group_and_fill/Dockerfile diff --git a/modules_combined/sunav2_logs_group_and_fill/Dockerfile b/modules_combined/sunav2_logs_group_and_fill/Dockerfile new file mode 100644 index 000000000..44f6b6507 --- /dev/null +++ b/modules_combined/sunav2_logs_group_and_fill/Dockerfile @@ -0,0 +1,62 @@ +# Dockerfile for NEON IS Data 
Processing - Troll logs fill module combined with filter-joiner +# Example command (must be run from project parent directory to include modules/ and flow/ paths in Docker context): +# docker build -t neon-is-troll-logs-group-fill -f ./modules_combined/troll_logs_group_and_fill/Dockerfile . + +# Start with the base R image. +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 + +# maintainer handle +MAINTAINER "Nora Catolico" ncatolico@battelleecology.org + +# Add in the python-based filter-joiner module +ARG MODULE_DIR="modules" +ARG APP_DIR="filter_joiner" +ARG COMMON_DIR="common" +ARG CONTAINER_APP_DIR="/usr/src/app" +ENV PYTHONPATH="${PYTHONPATH}:${CONTAINER_APP_DIR}" + +WORKDIR ${CONTAINER_APP_DIR} + +COPY ${MODULE_DIR}/${APP_DIR}/requirements.txt ${CONTAINER_APP_DIR}/${APP_DIR}/requirements.txt + + +RUN apt update && \ + apt-get install -y --no-install-recommends \ + python3.8 && \ + apt install -y python3-pip && \ + python3 -mpip install --no-cache-dir --upgrade pip setuptools wheel && \ + python3 -mpip install --no-cache-dir -r ${CONTAINER_APP_DIR}/${APP_DIR}/requirements.txt && \ + apt-get autoremove -y && \ + apt-get autoclean -y && \ + rm -rf /var/lib/apt/lists/* && \ + groupadd -g 9999 appuser && \ + useradd -r -u 9999 -g appuser appuser + +# Copy in python code +COPY ${MODULE_DIR}/${APP_DIR} ${CONTAINER_APP_DIR}/${APP_DIR} +COPY ${MODULE_DIR}/${COMMON_DIR} ${CONTAINER_APP_DIR}/${COMMON_DIR} + +# Load kafka combiner +# Copy the lockfile and restore known working versions of R dependency packages +# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image +COPY ./flow/flow.kfka.comb/renv.lock . +RUN R -e 'renv::restore(lockfile="./renv.lock")' + +# Copy in R code +COPY ./flow/flow.kfka.comb/flow.kfka.comb.R . +COPY ./flow/flow.kfka.comb/wrap.kfka.comb.R . 
+ +# Build in the log file filler module +ARG MODULE_DIR="flow" +ARG APP_DIR_1="flow.troll.logfiles.fill" + +# Copy the lockfile and restore known working versions of R dependency packages +COPY ./flow/flow.troll.logfiles.fill/renv.lock . +RUN R -e 'renv::restore(lockfile="./renv.lock")' + +# Copy in R code +COPY ./flow/flow.troll.logfiles.fill/flow.troll.logfiles.fill.R . +COPY ./flow/flow.troll.logfiles.fill/wrap.troll.logfiles.fill.R . + +# Run as app user +USER appuser From c5f1411e3db2c50360c720c1c5441c0329c4f1fe Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 30 Jun 2025 13:05:35 -0600 Subject: [PATCH 029/182] latest --- .../sunav2_logs_group_and_fill/Dockerfile | 12 ++++++------ pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/modules_combined/sunav2_logs_group_and_fill/Dockerfile b/modules_combined/sunav2_logs_group_and_fill/Dockerfile index 44f6b6507..da1a5e850 100644 --- a/modules_combined/sunav2_logs_group_and_fill/Dockerfile +++ b/modules_combined/sunav2_logs_group_and_fill/Dockerfile @@ -1,6 +1,6 @@ -# Dockerfile for NEON IS Data Processing - Troll logs fill module combined with filter-joiner +# Dockerfile for NEON IS Data Processing - sunav2 logs fill module combined with filter-joiner # Example command (must be run from project parent directory to include modules/ and flow/ paths in Docker context): -# docker build -t neon-is-troll-logs-group-fill -f ./modules_combined/troll_logs_group_and_fill/Dockerfile . +# docker build -t neon-is-sunav2-logs-group-fill -f ./modules_combined/sunav2_logs_group_and_fill/Dockerfile . # Start with the base R image. FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 @@ -48,15 +48,15 @@ COPY ./flow/flow.kfka.comb/wrap.kfka.comb.R . 
# Build in the log file filler module ARG MODULE_DIR="flow" -ARG APP_DIR_1="flow.troll.logfiles.fill" +ARG APP_DIR_1="flow.sunav2.logfiles.fill" # Copy the lockfile and restore known working versions of R dependency packages -COPY ./flow/flow.troll.logfiles.fill/renv.lock . +COPY ./flow/flow.sunav2.logfiles.fill/renv.lock . RUN R -e 'renv::restore(lockfile="./renv.lock")' # Copy in R code -COPY ./flow/flow.troll.logfiles.fill/flow.troll.logfiles.fill.R . -COPY ./flow/flow.troll.logfiles.fill/wrap.troll.logfiles.fill.R . +COPY ./flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R . +COPY ./flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R . # Run as app user USER appuser diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index cc807051b..2cb0a412d 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles-fill:sha-1a9c472 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-d92ba06 cmd: - sh - "-c" From 2476643026101a34691ffe8b17eb1668a54b8b0d Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 30 Jun 2025 15:25:19 -0600 Subject: [PATCH 030/182] minor --- flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R | 2 +- pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index 3e3ca1911..004feb664 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -243,7 +243,7 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, #write out data file fileOutSplt <- base::strsplit(DirInStream,'[/]')[[1]] # Separate 
underscore-delimited components of the file name asset<-tail(x=fileOutSplt,n=1) - csv_name <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d"),'_filled') + csv_name <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d")) rptOut <- try(NEONprocIS.base::def.wrte.parq(data = dataOut, NameFile = base::paste0(DirOutData,'/',csv_name,".parquet"), diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index 2cb0a412d..f3941004c 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-d92ba06 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-c5f1411 cmd: - sh - "-c" From 6f50b9827158f7273eec4f79ffb517f73a42f5c8 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 30 Jun 2025 16:02:11 -0600 Subject: [PATCH 031/182] latest --- .../sunav2_calibration_group_and_convert.yaml | 129 +++++------------- pipe/sunav2/sunav2_calibration_loader.yaml | 2 +- pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- 3 files changed, 35 insertions(+), 98 deletions(-) diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml index 7890fd170..e2fac2f5f 100644 --- a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_calibration_group_and_convert transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-grp-conv:v1.3.0 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-grp-conv:v2.3.1 cmd: - sh - "-c" @@ -11,63 +11,25 @@ transform: # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ set -euo pipefail IFS=$'\n\t' - + # # Refresh interim directories with each datum (otherwise they persist and cause probs) - rm -rf /tmp/kafka_merged + rm -r -f /tmp/pfs/filter_joined rm -rf $OUT_PATH - mkdir -p /tmp/kafka_merged # Filter joiner relies on the same path positions among inputs (i.e. repo name in 2nd position) + mkdir -p /tmp/pfs/filter_joined mkdir -p $OUT_PATH # R modules must have pfs in the repo structure - - # Detect if we have data coming from Kafka or the archive - # Note that we run the filter-joiner in sequential if statements rather than an elif statement - # ... so that if there is any overlap in sensor data coming from both Kafka and the archive on the same day, the - # ... kafka data wins (filter joiner will not copy a file if it is already in the destination). This scenario - # ... should only arise during initial data load and a site back-streams data from kafka outside the Kafka - # ... retention period for data that have already been loaded from the archive. - # ... When a conflict does arise, the kafka data will take precedence, assuming that it is the latest - # ... and greatest. - - if [ ${KAFKA_UNMERGED_DATA+x} ]; then - # Data from kafka. - - # Run kafka combiner - Rscript ./flow.kfka.comb.R \ - DirIn=$KAFKA_UNMERGED_DATA \ - DirOut=/tmp/kafka_merged \ - DirErr=/pfs/out/errored_datums \ - FileSchmL0=$FILE_SCHEMA_L0 - - # Run filter joiner - python3 -m filter_joiner.filter_joiner_main - fi - if [ ${DATA_PATH_ARCHIVE+x} ]; then - # Data from the archive. - - # Run kafka combiner - note that this works for both trino-loaded data and kafka loaded data. If both - # exist in the folder for the same sensor and day, likely there will be duplicate data written to file - # because the Trino timestamps are truncated to the second whereas Kafka readout times are not. However, - # this scenario should be rare and duplicates will be removed in the regularization module. 
- Rscript ./flow.kfka.comb.R \ - DirIn=$DATA_PATH_ARCHIVE \ - DirOut=/tmp/kafka_merged \ - DirErr=/pfs/out/errored_datums \ - FileSchmL0=$FILE_SCHEMA_L0 - - # Run filter joiner - python3 -m filter_joiner.filter_joiner_main - fi - + # + # Run filter-joiner for data (using environment variables below as input parameters) + python3 -m filter_joiner.filter_joiner_main + # # Run calibration conversion module - Rscript ./flow.cal.conv.R \ - DirIn=/tmp/pfs/filter_joined \ - DirOut=/pfs/out \ - DirErr=/pfs/out/errored_datums \ - FileSchmData=$FILE_SCHEMA_DATA \ - FileSchmQf=$FILE_SCHEMA_FLAGS \ - TermFuncConv=voltage:def.cal.conv.poly \ - TermQf=voltage \ - TermFuncUcrt=voltage:def.ucrt.meas.mult,def.ucrt.fdas.volt.poly \ - FileUcrtFdas=$FILE_UNCERTAINTY_FDAS + Rscript ./flow.cal.conv.R \ + DirIn=/tmp/pfs/filter_joined \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + DirSubCopy=flags \ + "TermFuncConv=pressure:def.cal.conv.poly.b|temperature:def.cal.conv.poly|conductivity:def.cal.conv.poly.split" \ + "TermQf=pressure|temperature|conductivity" \ + "TermFuncUcrt=pressure:def.ucrt.meas.cnst|temperature:def.ucrt.meas.cnst|conductivity:def.ucrt.meas.cnst" EOF env: # Environment variables for filter-joiner. @@ -78,16 +40,16 @@ transform: # Metadata indices will typically begin at index 3. 
input_paths: - path: - name: DATA_PATH_KAFKA_MERGED + name: DATA_PATH # Filter for data directory - glob_pattern: /tmp/kafka_merged/sunav2/*/*/*/*/** - # Join on named location (already joined below by day) + glob_pattern: /pfs/DATA_PATH/*/*/*/*/*/** + # Join on named location (already joined below by source type and day) join_indices: [7] outer_join: true - path: name: CALIBRATION_PATH # Filter for data directory - glob_pattern: /pfs/CALIBRATION_PATH/sunav2/*/*/*/*/** + glob_pattern: /pfs/CALIBRATION_PATH/*/*/*/*/*/** # Join on named location (already joined below by day) join_indices: [7] OUT_PATH: /tmp/pfs/filter_joined # Note that R modules use "pfs" in the path structure to determine datums @@ -97,46 +59,21 @@ transform: # Environment variables for calibration module PARALLELIZATION_INTERNAL: '3' # Option for calibration conversion module input: - cross: - - pfs: - name: FILE_SCHEMA_L0 - repo: sunav2_avro_schemas - glob: /sunav2/sunav2.avsc - - pfs: - name: FILE_SCHEMA_DATA - repo: sunav2_avro_schemas - glob: /sunav2/sunav2_calibrated.avsc + # Outer join all days + join: - pfs: - name: FILE_SCHEMA_FLAGS - repo: sunav2_avro_schemas - glob: /sunav2/flags_calibration_sunav2.avsc + name: CALIBRATION_PATH + repo: sunav2_calibration_assignment + glob: /(*)/(*)/(*)/(*) + joinOn: $1/$2/$3/$4 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - pfs: - name: FILE_UNCERTAINTY_FDAS - repo: sunav2_uncertainty_fdas - glob: /fdas_calibration_uncertainty_general.json - # Outer join all days so that varying sensors between kafka and archive loaders will all get joined with calibrations. Filter-joiner will narrow down. - - join: - - pfs: - name: CALIBRATION_PATH - repo: sunav2_calibration_assignment - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - outer_join: true - empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
- - pfs: - name: DATA_PATH_ARCHIVE - repo: sunav2_data_source_trino - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - outer_join: true - empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - - pfs: - name: KAFKA_UNMERGED_DATA - repo: sunav2_data_source_kafka - glob: /sunav2/(*)/(*)/(*) - joinOn: $1/$2/$3 - outer_join: true - empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + name: DATA_PATH + repo: sunav2_fill_log_files + glob: /(*)/(*)/(*)/(*) + joinOn: $1/$2/$3/$4 + outer_join: true + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. parallelism_spec: constant: 5 autoscaling: true diff --git a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml index 35da08966..edb1fe148 100644 --- a/pipe/sunav2/sunav2_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -23,7 +23,7 @@ input: pfs: name: IN_PATH repo: sunav2_calibration_list_files - glob: /*/*/*/* + glob: /*/*/*/*/* empty_files: true parallelism_spec: constant: 10 diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index f3941004c..da76a0ed7 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-c5f1411 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-2476643 cmd: - sh - "-c" From 63873b37d72fc7945515045ef0f0b9580b361a25 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 2 Jul 2025 14:35:15 -0600 Subject: [PATCH 032/182] fix --- pipe/sunav2/sunav2_calibration_loader.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml index edb1fe148..9405478c3 100644 --- a/pipe/sunav2/sunav2_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -14,7 +14,7 @@ transform: CVAL_INGEST_BUCKET: neon-cval OUT_PATH: /pfs/out LOG_LEVEL: INFO - SOURCE_TYPE: sunav2 + SOURCE_TYPE: suna STARTING_PATH_INDEX: "5" secrets: - name: pdr-secret @@ -23,7 +23,7 @@ input: pfs: name: IN_PATH repo: sunav2_calibration_list_files - glob: /*/*/*/*/* + glob: /*/*/*/* empty_files: true parallelism_spec: constant: 10 From 4946c27abb8571146798b1dcbbaeaec4dddacf62 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 2 Jul 2025 16:13:56 -0600 Subject: [PATCH 033/182] testing --- modules/calval_loader/get_avro_schema_name.py | 4 ++-- modules/calval_loader/get_calibration_stream_name.py | 2 +- modules/calval_loader/load_all_calval_files.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/calval_loader/get_avro_schema_name.py b/modules/calval_loader/get_avro_schema_name.py index 7a854d245..f7a0a72a7 100644 --- a/modules/calval_loader/get_avro_schema_name.py +++ b/modules/calval_loader/get_avro_schema_name.py @@ -19,12 +19,12 @@ def get_avro_schema_name(connection, asset_uid : int) -> Optional[str]: iaa2.asset_uid = %(asset_uid)s ''' with closing(connection.cursor()) as cursor: - #print('avro schema sql is', sql) + print('avro schema sql is', sql) cursor.execute(sql, dict(asset_uid=asset_uid)) row = cursor.fetchone() if row is None: logging.error(f'Avro schema name not found for asset id ID {asset_uid} .') return None avro_schema_name = row[0] - #print(f'avro_schema_name: {avro_schema_name}') + print(f'avro_schema_name: {avro_schema_name}') return avro_schema_name diff --git a/modules/calval_loader/get_calibration_stream_name.py b/modules/calval_loader/get_calibration_stream_name.py index 19178af4c..5f31b3a1d 100644 --- a/modules/calval_loader/get_calibration_stream_name.py +++ 
b/modules/calval_loader/get_calibration_stream_name.py @@ -36,5 +36,5 @@ def get_calibration_stream_name(connection, asset_type: str, stream_number: int) logging.error(f'Stream name not found for stream ID {stream_number} and asset type {asset_type}.') return None stream_name = row[0] - # print(f'asset_type: {asset_type} stream_name: {stream_name}') + print(f'asset_type: {asset_type} stream_name: {stream_name}') return stream_name diff --git a/modules/calval_loader/load_all_calval_files.py b/modules/calval_loader/load_all_calval_files.py index 6184d4085..6a4305b87 100644 --- a/modules/calval_loader/load_all_calval_files.py +++ b/modules/calval_loader/load_all_calval_files.py @@ -17,7 +17,7 @@ def load() -> None: env = environs.Env() ingest_bucket_name = env.str('CVAL_INGEST_BUCKET') in_path: Path = env.path('IN_PATH') - # print("IN_PATH value is:", in_path) + print("IN_PATH value is:", in_path) output_directory: Path = env.path('OUT_PATH') sensor_type = env.list('SOURCE_TYPE') db_config = read_from_mount(Path('/var/db_secret')) From e5a94b82871b2d41854c7db66c2c773e0273e282 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 2 Jul 2025 16:26:24 -0600 Subject: [PATCH 034/182] update environs --- modules/calval_loader/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/calval_loader/requirements.txt b/modules/calval_loader/requirements.txt index 7ccbc99e8..ec1141ef7 100644 --- a/modules/calval_loader/requirements.txt +++ b/modules/calval_loader/requirements.txt @@ -1,7 +1,8 @@ functions-framework==3.5.0 betterproto==2.0.0b4 certifi==2024.7.4 -environs==6.0.0 +environs==11.0.0 +marshmallow==3.21.3 python-pachyderm==7.4.0 google-cloud==0.34.0 google-cloud-logging==3.1.2 From cfa9a118b3bed9e1435bd0a484026b70dcb874d1 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 3 Jul 2025 11:06:28 -0600 Subject: [PATCH 035/182] more logs --- modules/calval_loader/get_avro_schema_name.py | 4 ++-- modules/calval_loader/load_all_calval_files.py 
| 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/calval_loader/get_avro_schema_name.py b/modules/calval_loader/get_avro_schema_name.py index f7a0a72a7..7a854d245 100644 --- a/modules/calval_loader/get_avro_schema_name.py +++ b/modules/calval_loader/get_avro_schema_name.py @@ -19,12 +19,12 @@ def get_avro_schema_name(connection, asset_uid : int) -> Optional[str]: iaa2.asset_uid = %(asset_uid)s ''' with closing(connection.cursor()) as cursor: - print('avro schema sql is', sql) + #print('avro schema sql is', sql) cursor.execute(sql, dict(asset_uid=asset_uid)) row = cursor.fetchone() if row is None: logging.error(f'Avro schema name not found for asset id ID {asset_uid} .') return None avro_schema_name = row[0] - print(f'avro_schema_name: {avro_schema_name}') + #print(f'avro_schema_name: {avro_schema_name}') return avro_schema_name diff --git a/modules/calval_loader/load_all_calval_files.py b/modules/calval_loader/load_all_calval_files.py index 6a4305b87..ba0fdf931 100644 --- a/modules/calval_loader/load_all_calval_files.py +++ b/modules/calval_loader/load_all_calval_files.py @@ -17,7 +17,7 @@ def load() -> None: env = environs.Env() ingest_bucket_name = env.str('CVAL_INGEST_BUCKET') in_path: Path = env.path('IN_PATH') - print("IN_PATH value is:", in_path) + #print("IN_PATH value is:", in_path) output_directory: Path = env.path('OUT_PATH') sensor_type = env.list('SOURCE_TYPE') db_config = read_from_mount(Path('/var/db_secret')) @@ -37,13 +37,15 @@ def load() -> None: filename = pathname.split('/') filename = filename[-1] + ".xml" - print("FileName is: ", filename) + #print("FileName is: ", filename) blob = ingest_bucket.get_blob(filename) with blob.open("r") as f: root = ET.fromstring(blob.download_as_string()) asset_id = root.find('SensorID').find('MxAssetID').text avro_schema_name = get_avro_schema_name(connector.get_connection(), asset_id) + print('sensor_type:', sensor_type) + print('avro_schema_name:', sensor_type) if 
((avro_schema_name != None) and (avro_schema_name in sensor_type)): stream_id = root.find('StreamCalVal').find('StreamID').text stream_name = get_calibration_stream_name(connector.get_connection(), avro_schema_name, From a243d3e980fa3f58c6f62ccb521fbd8c58f19576 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 3 Jul 2025 13:13:31 -0600 Subject: [PATCH 036/182] logs --- modules/calval_loader/load_all_calval_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/calval_loader/load_all_calval_files.py b/modules/calval_loader/load_all_calval_files.py index ba0fdf931..194db139a 100644 --- a/modules/calval_loader/load_all_calval_files.py +++ b/modules/calval_loader/load_all_calval_files.py @@ -45,7 +45,7 @@ def load() -> None: asset_id = root.find('SensorID').find('MxAssetID').text avro_schema_name = get_avro_schema_name(connector.get_connection(), asset_id) print('sensor_type:', sensor_type) - print('avro_schema_name:', sensor_type) + print('avro_schema_name:', avro_schema_name) if ((avro_schema_name != None) and (avro_schema_name in sensor_type)): stream_id = root.find('StreamCalVal').find('StreamID').text stream_name = get_calibration_stream_name(connector.get_connection(), avro_schema_name, From 6025c5129f9e221c8f34cea0a7c76cbd8ab5221b Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 3 Jul 2025 15:26:53 -0600 Subject: [PATCH 037/182] latest --- pipe/sunav2/sunav2_calibration_assignment.yaml | 2 +- pipe/sunav2/sunav2_calibration_loader.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pipe/sunav2/sunav2_calibration_assignment.yaml b/pipe/sunav2/sunav2_calibration_assignment.yaml index 42fd8d720..f5fbd64cf 100644 --- a/pipe/sunav2/sunav2_calibration_assignment.yaml +++ b/pipe/sunav2/sunav2_calibration_assignment.yaml @@ -22,7 +22,7 @@ input: - pfs: name: DIR_IN repo: sunav2_calibration_loader - glob: /sunav2/* + glob: /sunav2_raw/* - pfs: name: FILE_YEAR repo: sunav2_cron_daily_and_date_control diff --git 
a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml index 9405478c3..a751f9dbb 100644 --- a/pipe/sunav2/sunav2_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -4,7 +4,7 @@ pipeline: transform: # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v2.3.3 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:sha-a243d3e cmd: - /bin/bash stdin: @@ -13,8 +13,8 @@ transform: env: CVAL_INGEST_BUCKET: neon-cval OUT_PATH: /pfs/out - LOG_LEVEL: INFO - SOURCE_TYPE: suna + LOG_LEVEL: DEBUG + SOURCE_TYPE: sunav2_raw STARTING_PATH_INDEX: "5" secrets: - name: pdr-secret From e100ec62d93325de260f539a464468d98fd76e64 Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Fri, 11 Jul 2025 09:34:35 -0600 Subject: [PATCH 038/182] Update sunav2_data_parser.yaml --- pipe/sunav2/sunav2_data_parser.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_data_parser.yaml b/pipe/sunav2/sunav2_data_parser.yaml index f494dde87..e6f3707c5 100644 --- a/pipe/sunav2/sunav2_data_parser.yaml +++ b/pipe/sunav2/sunav2_data_parser.yaml @@ -1,7 +1,7 @@ pipeline: name: sunav2_data_parser transform: - image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-raw-data-parser:v4.3.0 + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-raw-data-parser:v4.9.7 cmd: - /bin/bash stdin: From afb2bfa0f2dc1d2b5743e57fda68eb6773a95e25 Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Fri, 11 Jul 2025 09:36:00 -0600 Subject: [PATCH 039/182] Update sunav2_data_parser.yaml point to parsed schema --- pipe/sunav2/sunav2_data_parser.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_data_parser.yaml b/pipe/sunav2/sunav2_data_parser.yaml index e6f3707c5..9f9fae789 100644 --- a/pipe/sunav2/sunav2_data_parser.yaml +++ 
b/pipe/sunav2/sunav2_data_parser.yaml @@ -15,7 +15,7 @@ transform: OUT_PATH: /pfs/out PARSE_FIELD: serial_output RELATIVE_PATH_INDEX: "4" - PARSED_SCHEMA_PATH: /usr/src/app/schemas/sunav2/sunav2.avsc + PARSED_SCHEMA_PATH: /usr/src/app/parsed-schemas/sunav2/sunav2_parsed.avsc SOURCE_TYPE: 'sunav2_raw' input: pfs: From 696cae751373e5a653338650b67fcc9baae30611 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 11 Jul 2025 12:55:37 -0600 Subject: [PATCH 040/182] latest --- ...av2_cron_daily_and_date_control_kafka.yaml | 56 +++++++++++++++ pipe/sunav2/sunav2_data_parser.yaml | 71 +++++++++++++++++-- pipe/sunav2/sunav2_data_source_kafka.yaml | 4 +- 3 files changed, 122 insertions(+), 9 deletions(-) create mode 100644 pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml new file mode 100644 index 000000000..171c3ec40 --- /dev/null +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml @@ -0,0 +1,56 @@ +--- +pipeline: + name: sunav2_cron_daily_and_date_control_kafka +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-date-cntl:v2.0.1 + cmd: ["/bin/bash"] + env: + # START_DATE ("YYYY-MM-DD") and END_DATE ("YYYY-MM-DD") indicate the max date range (inclusive) to create the /Y/M/D folder structure + # If START_DATE is not set (remove line entirely to unset), the start_date and/or the kafka_start_date for each site will be used, as indicated in the site-list json file + # start_date field in the site-list file is the earliest date to pull data from a site + # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka + # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. 
+ OUT_PATH: /pfs/out + START_DATE: "2025-06-01" # Inclusive + SOURCE_TYPE: "sunav2" + stdin: + - "#!/bin/bash" + - python3 -m cron_daily_and_date_control.cron_daily_and_date_control_main +input: + cross: + # This cron is the central driver for daily scheduled updates, such as data ingest and metadata loaders. + - cron: + name: tick + spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) + overwrite: true + - pfs: + name: SITE_FILE + repo: sunav2_site_list + glob: /site-list.json +resource_requests: + memory: 100M + cpu: 1 +resource_limits: + memory: 300M + cpu: 1.5 +sidecar_resource_requests: + memory: 500M + cpu: 0.5 +autoscaling: true +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/sunav2/sunav2_data_parser.yaml b/pipe/sunav2/sunav2_data_parser.yaml index 9f9fae789..0150ef9ee 100644 --- a/pipe/sunav2/sunav2_data_parser.yaml +++ b/pipe/sunav2/sunav2_data_parser.yaml @@ -1,22 +1,79 @@ pipeline: name: sunav2_data_parser transform: - image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-raw-data-parser:v4.9.7 - cmd: - - /bin/bash - stdin: - - '#!/bin/bash' - - python3 -m raw_data_parser.data_parser_main + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.9.7 env: # if use default PARSED_START_INDEX and PARSED_END_INDEX, parse all elements in parse_field # if use default for FIELD_START_INDEX and FIELD_END_INDEX, # skip first 3 fields (source_id, site_id, readout_time) in parsed schema - LOG_LEVEL: INFO + LOG_LEVEL: DEBUG OUT_PATH: /pfs/out PARSE_FIELD: serial_output RELATIVE_PATH_INDEX: "4" PARSED_SCHEMA_PATH: /usr/src/app/parsed-schemas/sunav2/sunav2_parsed.avsc SOURCE_TYPE: 
'sunav2_raw'
+  secrets:
+  - name: l0-bucket
+    env_var: BUCKET_NAME
+    key: LO_BUCKET
+  - name: pdr-secret
+    env_var: PDR_HOST
+    key: hostname
+  - name: pdr-secret
+    env_var: PDR_DBNAME
+    key: database
+  - name: pdr-secret
+    env_var: PDR_USER
+    key: username
+  - name: pdr-secret
+    env_var: PDR_PASSWORD
+    key: password
+  cmd:
+  - sh
+  - "-c"
+  - |-
+    /bin/bash <<'EOF'
+
+    # Use bash-strict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/
+    set -euo pipefail
+    IFS=$'\n\t'
+
+    # run data parser
+    python3 -m raw_data_parsers.raw_data_parser.data_parser_main
+
+    # Upload L0 files to bucket, compacting with any existing file with the same name
+    # when SOURCE_TYPE is sunav2_raw, OUT_SOURCE_TYPE is sunav2
+    OUT_SOURCE_TYPE=${SOURCE_TYPE%%_raw}
+    if [[ -d "$OUT_PATH/$OUT_SOURCE_TYPE" ]]; then
+      linkdir=$(mktemp -d)
+      shopt -s globstar
+      out_parquet_glob="${OUT_PATH}/**/*.parquet"
+      # /pfs/out/sunav2/2023/01/01/12345/data/file.parquet
+      echo "Linking output files to ${linkdir}"
+      # set -x # Uncomment for debugging
+      for f in $out_parquet_glob; do
+        # Parse the path
+        [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/data/(.*)$ ]]
+        fsourcetype="${BASH_REMATCH[1]}"
+        fyear="${BASH_REMATCH[2]}"
+        fmonth="${BASH_REMATCH[3]}"
+        fday="${BASH_REMATCH[4]}"
+        fsourceid="${BASH_REMATCH[5]}"
+        fname="${BASH_REMATCH[6]}"
+        outdir="${linkdir}/v2/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}"
+        mkdir -p "${outdir}"
+        ln -s "${f}" "${outdir}/${fname}"
+
+      done
+
+      # Upload to bucket, compacting with any existing file
+      ./compact-bucket-copy.py --sourcepath "${linkdir}" --destbucket "${BUCKET_NAME}"
+
+      # set +x # Uncomment for debugging
+      rm -rf $linkdir
+    fi
+
+    EOF
 input:
   pfs:
     name: DATA_PATH
diff --git a/pipe/sunav2/sunav2_data_source_kafka.yaml b/pipe/sunav2/sunav2_data_source_kafka.yaml
index d3d0e48b5..dbab74d51 100644
--- a/pipe/sunav2/sunav2_data_source_kafka.yaml
+++ b/pipe/sunav2/sunav2_data_source_kafka.yaml
@@ -2,7 +2,7 @@ pipeline: 
name: sunav2_data_source_kafka transform: - image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.7.0 + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.9.7 image_pull_secrets: - battelleecology-quay-read-all-pull-secret env: @@ -141,7 +141,7 @@ transform: input: pfs: name: import_trigger - repo: sunav2_cron_daily_and_date_control + repo: sunav2_cron_daily_and_date_control_kafka # Must be datum by day (e.g. /SOURCE_TYPE/*/*/*) or by day/site (e.g. /SOURCE_TYPE/*/*/*/*) glob: "/sunav2/*/*/*" parallelism_spec: From 5646b28ba201250db0534b3df54e75e5300dad72 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Tue, 15 Jul 2025 08:56:15 -0600 Subject: [PATCH 041/182] Deleted parsing of L0 data within sunav2.log.fill, since we realized this will already have happened prior to this. --- .../wrap.sunav2.logfiles.fill.R | 119 +----------------- 1 file changed, 5 insertions(+), 114 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index 004feb664..1b44bc532 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -43,11 +43,11 @@ #' #' @examples #' # Not run -# DirInLogs<-"~/pfs/sunav2_logjam_assign_clean_files/sunav2/2024/09/10/20349" #cleaned log data -# DirInStream<-"~/pfs/sunav2_data_source_trino/sunav2/2024/09/10/20349" #streamed L0 data +# DirInLogs<-"~/pfs/sunav2_logs_output/sunav2/2024/09/10/20349" #cleaned log data +# DirInStream<-"~/pfs/sunav2_data_parser_trino/sunav2/2024/09/10/20349" #streamed L0 data # DirIn<-NULL # DirOutBase="~/pfs/out" -# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2.avsc'),collapse='') +# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_parsed.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") # 
SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_log_flags.avsc'),collapse='') #' @@ -113,115 +113,6 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, base::stop()} } -#' Parse serial output into individual columns - L0DataParsed<-tidyr::separate(data = L0Data,col = serial_output,sep = ",|:", - into = c("pos_0","header_light_frame","pos_1","year_and_day","pos_2","time","pos_3","nitrate_concentration","pos_4","nitrogen_in_nitrate", - "pos_5","absorbance_254nm","pos_6","absorbance_350nm","pos_7","bromide_trace","pos_8","spectrum_average","pos_9","dark_value_used_for_fit","pos_10","integration_time_factor", - "pos_11","channel_1","pos_12","channel_2","pos_13","channel_3","pos_14","channel_4","pos_15","channel_5", - "pos_16","channel_6","pos_17","channel_7","pos_18","channel_8","pos_19","channel_9","pos_20","channel_10", - "pos_21","channel_11","pos_22","channel_12","pos_23","channel_13","pos_24","channel_14","pos_25","channel_15", - "pos_26","channel_16","pos_27","channel_17","pos_28","channel_18","pos_29","channel_19","pos_30","channel_20", - "pos_31","channel_21","pos_32","channel_22","pos_33","channel_23","pos_34","channel_24","pos_35","channel_25", - "pos_36","channel_26","pos_37","channel_27","pos_38","channel_28","pos_39","channel_29","pos_40","channel_30", - "pos_41","channel_31","pos_42","channel_32","pos_43","channel_33","pos_44","channel_34","pos_45","channel_35", - "pos_46","channel_36","pos_47","channel_37","pos_48","channel_38","pos_49","channel_39","pos_50","channel_40", - "pos_51","channel_41","pos_52","channel_42","pos_53","channel_43","pos_54","channel_44","pos_55","channel_45", - "pos_56","channel_46","pos_57","channel_47","pos_58","channel_48","pos_59","channel_49","pos_60","channel_50", - "pos_61","channel_51","pos_62","channel_52","pos_63","channel_53","pos_64","channel_54","pos_65","channel_55", - "pos_66","channel_56","pos_67","channel_57","pos_68","channel_58","pos_69","channel_59","pos_70","channel_60", - 
"pos_71","channel_61","pos_72","channel_62","pos_73","channel_63","pos_74","channel_64","pos_75","channel_65", - "pos_76","channel_66","pos_77","channel_67","pos_78","channel_68","pos_79","channel_69","pos_80","channel_70", - "pos_81","channel_71","pos_82","channel_72","pos_83","channel_73","pos_84","channel_74","pos_85","channel_75", - "pos_86","channel_76","pos_87","channel_77","pos_88","channel_78","pos_89","channel_79","pos_90","channel_80", - "pos_91","channel_81","pos_92","channel_82","pos_93","channel_83","pos_94","channel_84","pos_95","channel_85", - "pos_96","channel_86","pos_97","channel_87","pos_98","channel_88","pos_99","channel_89","pos_100","channel_90", - "pos_101","channel_91","pos_102","channel_92","pos_103","channel_93","pos_104","channel_94","pos_105","channel_95", - "pos_106","channel_96","pos_107","channel_97","pos_108","channel_98","pos_109","channel_99","pos_110","channel_100", - "pos_111","channel_101","pos_112","channel_102","pos_113","channel_103","pos_114","channel_104","pos_115","channel_105", - "pos_116","channel_106","pos_117","channel_107","pos_118","channel_108","pos_119","channel_109","pos_120","channel_110", - "pos_121","channel_111","pos_122","channel_112","pos_123","channel_113","pos_124","channel_114","pos_125","channel_115", - "pos_126","channel_116","pos_127","channel_117","pos_128","channel_118","pos_129","channel_119","pos_130","channel_120", - "pos_131","channel_121","pos_132","channel_122","pos_133","channel_123","pos_134","channel_124","pos_135","channel_125", - "pos_136","channel_126","pos_137","channel_127","pos_138","channel_128","pos_139","channel_129","pos_140","channel_130", - "pos_141","channel_131","pos_142","channel_132","pos_143","channel_133","pos_144","channel_134","pos_145","channel_135", - "pos_146","channel_136","pos_147","channel_137","pos_148","channel_138","pos_149","channel_139","pos_150","channel_140", - 
"pos_151","channel_141","pos_152","channel_142","pos_153","channel_143","pos_154","channel_144","pos_155","channel_145", - "pos_156","channel_146","pos_157","channel_147","pos_158","channel_148","pos_159","channel_149","pos_160","channel_150", - "pos_161","channel_151","pos_162","channel_152","pos_163","channel_153","pos_164","channel_154","pos_165","channel_155", - "pos_166","channel_156","pos_167","channel_157","pos_168","channel_158","pos_169","channel_159","pos_170","channel_160", - "pos_171","channel_161","pos_172","channel_162","pos_173","channel_163","pos_174","channel_164","pos_175","channel_165", - "pos_176","channel_166","pos_177","channel_167","pos_178","channel_168","pos_179","channel_169","pos_180","channel_170", - "pos_181","channel_171","pos_182","channel_172","pos_183","channel_173","pos_184","channel_174","pos_185","channel_175", - "pos_186","channel_176","pos_187","channel_177","pos_188","channel_178","pos_189","channel_179","pos_190","channel_180", - "pos_191","channel_181","pos_192","channel_182","pos_193","channel_183","pos_194","channel_184","pos_195","channel_185", - "pos_196","channel_186","pos_197","channel_187","pos_198","channel_188","pos_199","channel_189","pos_200","channel_190", - "pos_201","channel_191","pos_202","channel_192","pos_203","channel_193","pos_204","channel_194","pos_205","channel_195", - "pos_206","channel_196","pos_207","channel_197","pos_208","channel_198","pos_209","channel_199","pos_210","channel_200", - "pos_211","channel_201","pos_212","channel_202","pos_213","channel_203","pos_214","channel_204","pos_215","channel_205", - "pos_216","channel_206","pos_217","channel_207","pos_218","channel_208","pos_219","channel_209","pos_220","channel_210", - "pos_221","channel_211","pos_222","channel_212","pos_223","channel_213","pos_224","channel_214","pos_225","channel_215", - "pos_226","channel_216","pos_227","channel_217","pos_228","channel_218","pos_229","channel_219","pos_230","channel_220", - 
"pos_231","channel_221","pos_232","channel_222","pos_233","channel_223","pos_234","channel_224","pos_235","channel_225", - "pos_236","channel_226","pos_237","channel_227","pos_238","channel_228","pos_239","channel_229","pos_240","channel_230", - "pos_241","channel_231","pos_242","channel_232","pos_243","channel_233","pos_244","channel_234","pos_245","channel_235", - "pos_246","channel_236","pos_247","channel_237","pos_248","channel_238","pos_249","channel_239","pos_250","channel_240", - "pos_251","channel_241","pos_252","channel_242","pos_253","channel_243","pos_254","channel_244","pos_255","channel_245", - "pos_256","channel_246","pos_257","channel_247","pos_258","channel_248","pos_259","channel_249","pos_260","channel_250", - "pos_261","channel_251","pos_262","channel_252","pos_263","channel_253","pos_264","channel_254","pos_265","channel_255", - "pos_266","channel_256", - "pos_267","internal_temperature","pos_268","spectrometer_temperature","pos_269","lamp_temperature","pos_270","lamp_on_time", - "pos_271","relative_humidity","pos_272","main_voltage","pos_273","lamp_voltage","pos_274","internal_voltage", - "pos_275","main_current","pos_276","fit_aux_1","pos_277","fit_aux_2","pos_278","fit_base_1","pos_279","fit_base_2", - "pos_280","fit_rmse","pos_281","ctd_time","pos_282","ctd_salinity","pos_283","ctd_temperature", - "pos_284","ctd_pressure","pos_285","check_sum")) - -#' Drops serial output position columns - L0DataParsed<-L0DataParsed[!grepl("pos",names(L0DataParsed))] - -#' Combines all 256 spectrum channels into single array - L0DataParsed$spectrum_channels<-paste(L0DataParsed$channel_1,L0DataParsed$channel_2,L0DataParsed$channel_3,L0DataParsed$channel_4,L0DataParsed$channel_5,L0DataParsed$channel_6,L0DataParsed$channel_7,L0DataParsed$channel_8,L0DataParsed$channel_9,L0DataParsed$channel_10, - 
L0DataParsed$channel_11,L0DataParsed$channel_12,L0DataParsed$channel_13,L0DataParsed$channel_14,L0DataParsed$channel_15,L0DataParsed$channel_16,L0DataParsed$channel_17,L0DataParsed$channel_18,L0DataParsed$channel_19,L0DataParsed$channel_20, - L0DataParsed$channel_21,L0DataParsed$channel_22,L0DataParsed$channel_23,L0DataParsed$channel_24,L0DataParsed$channel_25,L0DataParsed$channel_26,L0DataParsed$channel_27,L0DataParsed$channel_28,L0DataParsed$channel_29,L0DataParsed$channel_30, - L0DataParsed$channel_31,L0DataParsed$channel_32,L0DataParsed$channel_33,L0DataParsed$channel_34,L0DataParsed$channel_35,L0DataParsed$channel_36,L0DataParsed$channel_37,L0DataParsed$channel_38,L0DataParsed$channel_39,L0DataParsed$channel_40, - L0DataParsed$channel_41,L0DataParsed$channel_42,L0DataParsed$channel_43,L0DataParsed$channel_44,L0DataParsed$channel_45,L0DataParsed$channel_46,L0DataParsed$channel_47,L0DataParsed$channel_48,L0DataParsed$channel_49,L0DataParsed$channel_50, - L0DataParsed$channel_51,L0DataParsed$channel_52,L0DataParsed$channel_53,L0DataParsed$channel_54,L0DataParsed$channel_55,L0DataParsed$channel_56,L0DataParsed$channel_57,L0DataParsed$channel_58,L0DataParsed$channel_59,L0DataParsed$channel_60, - L0DataParsed$channel_61,L0DataParsed$channel_62,L0DataParsed$channel_63,L0DataParsed$channel_64,L0DataParsed$channel_65,L0DataParsed$channel_66,L0DataParsed$channel_67,L0DataParsed$channel_68,L0DataParsed$channel_69,L0DataParsed$channel_70, - L0DataParsed$channel_71,L0DataParsed$channel_72,L0DataParsed$channel_73,L0DataParsed$channel_74,L0DataParsed$channel_75,L0DataParsed$channel_76,L0DataParsed$channel_77,L0DataParsed$channel_78,L0DataParsed$channel_79,L0DataParsed$channel_80, - L0DataParsed$channel_81,L0DataParsed$channel_82,L0DataParsed$channel_83,L0DataParsed$channel_84,L0DataParsed$channel_85,L0DataParsed$channel_86,L0DataParsed$channel_87,L0DataParsed$channel_88,L0DataParsed$channel_89,L0DataParsed$channel_90, - 
L0DataParsed$channel_91,L0DataParsed$channel_92,L0DataParsed$channel_93,L0DataParsed$channel_94,L0DataParsed$channel_95,L0DataParsed$channel_96,L0DataParsed$channel_97,L0DataParsed$channel_98,L0DataParsed$channel_99,L0DataParsed$channel_100, - L0DataParsed$channel_101,L0DataParsed$channel_102,L0DataParsed$channel_103,L0DataParsed$channel_104,L0DataParsed$channel_105,L0DataParsed$channel_106,L0DataParsed$channel_107,L0DataParsed$channel_108,L0DataParsed$channel_109,L0DataParsed$channel_110, - L0DataParsed$channel_111,L0DataParsed$channel_112,L0DataParsed$channel_113,L0DataParsed$channel_114,L0DataParsed$channel_115,L0DataParsed$channel_116,L0DataParsed$channel_117,L0DataParsed$channel_118,L0DataParsed$channel_119,L0DataParsed$channel_120, - L0DataParsed$channel_121,L0DataParsed$channel_122,L0DataParsed$channel_123,L0DataParsed$channel_124,L0DataParsed$channel_125,L0DataParsed$channel_126,L0DataParsed$channel_127,L0DataParsed$channel_128,L0DataParsed$channel_129,L0DataParsed$channel_130, - L0DataParsed$channel_131,L0DataParsed$channel_132,L0DataParsed$channel_133,L0DataParsed$channel_134,L0DataParsed$channel_135,L0DataParsed$channel_136,L0DataParsed$channel_137,L0DataParsed$channel_138,L0DataParsed$channel_139,L0DataParsed$channel_140, - L0DataParsed$channel_141,L0DataParsed$channel_142,L0DataParsed$channel_143,L0DataParsed$channel_144,L0DataParsed$channel_145,L0DataParsed$channel_146,L0DataParsed$channel_147,L0DataParsed$channel_148,L0DataParsed$channel_149,L0DataParsed$channel_150, - L0DataParsed$channel_151,L0DataParsed$channel_152,L0DataParsed$channel_153,L0DataParsed$channel_154,L0DataParsed$channel_155,L0DataParsed$channel_156,L0DataParsed$channel_157,L0DataParsed$channel_158,L0DataParsed$channel_159,L0DataParsed$channel_160, - 
L0DataParsed$channel_161,L0DataParsed$channel_162,L0DataParsed$channel_163,L0DataParsed$channel_164,L0DataParsed$channel_165,L0DataParsed$channel_166,L0DataParsed$channel_167,L0DataParsed$channel_168,L0DataParsed$channel_169,L0DataParsed$channel_170, - L0DataParsed$channel_171,L0DataParsed$channel_172,L0DataParsed$channel_173,L0DataParsed$channel_174,L0DataParsed$channel_175,L0DataParsed$channel_176,L0DataParsed$channel_177,L0DataParsed$channel_178,L0DataParsed$channel_179,L0DataParsed$channel_180, - L0DataParsed$channel_181,L0DataParsed$channel_182,L0DataParsed$channel_183,L0DataParsed$channel_184,L0DataParsed$channel_185,L0DataParsed$channel_186,L0DataParsed$channel_187,L0DataParsed$channel_188,L0DataParsed$channel_189,L0DataParsed$channel_190, - L0DataParsed$channel_191,L0DataParsed$channel_192,L0DataParsed$channel_193,L0DataParsed$channel_194,L0DataParsed$channel_195,L0DataParsed$channel_196,L0DataParsed$channel_197,L0DataParsed$channel_198,L0DataParsed$channel_199,L0DataParsed$channel_200, - L0DataParsed$channel_201,L0DataParsed$channel_202,L0DataParsed$channel_203,L0DataParsed$channel_204,L0DataParsed$channel_205,L0DataParsed$channel_206,L0DataParsed$channel_207,L0DataParsed$channel_208,L0DataParsed$channel_209,L0DataParsed$channel_210, - L0DataParsed$channel_211,L0DataParsed$channel_212,L0DataParsed$channel_213,L0DataParsed$channel_214,L0DataParsed$channel_215,L0DataParsed$channel_216,L0DataParsed$channel_217,L0DataParsed$channel_218,L0DataParsed$channel_219,L0DataParsed$channel_220, - L0DataParsed$channel_221,L0DataParsed$channel_222,L0DataParsed$channel_223,L0DataParsed$channel_224,L0DataParsed$channel_225,L0DataParsed$channel_226,L0DataParsed$channel_227,L0DataParsed$channel_228,L0DataParsed$channel_229,L0DataParsed$channel_230, - 
L0DataParsed$channel_231,L0DataParsed$channel_232,L0DataParsed$channel_233,L0DataParsed$channel_234,L0DataParsed$channel_235,L0DataParsed$channel_236,L0DataParsed$channel_237,L0DataParsed$channel_238,L0DataParsed$channel_239,L0DataParsed$channel_240, - L0DataParsed$channel_241,L0DataParsed$channel_242,L0DataParsed$channel_243,L0DataParsed$channel_244,L0DataParsed$channel_245,L0DataParsed$channel_246,L0DataParsed$channel_247,L0DataParsed$channel_248,L0DataParsed$channel_249,L0DataParsed$channel_250, - L0DataParsed$channel_251,L0DataParsed$channel_252,L0DataParsed$channel_253,L0DataParsed$channel_254,L0DataParsed$channel_255,L0DataParsed$channel_256,sep=";") - -#' Checks that each data burst is complete (Right now only checks whether last column is a value or not) - L0DataParsed$error_missing_data<-NA - for(i in 1:nrow(L0DataParsed)){if(is.na(L0DataParsed[i,which(colnames(L0DataParsed)=="check_sum")])){L0DataParsed[i,which(colnames(L0DataParsed)=="error_missing_data")]=TRUE} - else{L0DataParsed[i,which(colnames(L0DataParsed)=="error_missing_data")]=FALSE}} - -#' Create additional header columns needed to match avro schema - L0DataParsed$header_manufacturer<-"SATS" - L0DataParsed$header_serial_number<-NA #' Can leave this blank for now - -#' Re-orders columns so they match the avro schema - L0DataParsed<-L0DataParsed[,c("source_id","site_id","readout_time","header_manufacturer","header_serial_number","header_light_frame","year_and_day","time","nitrate_concentration", - "nitrogen_in_nitrate","absorbance_254nm","absorbance_350nm","bromide_trace","spectrum_average","dark_value_used_for_fit","integration_time_factor", - "spectrum_channels","internal_temperature","spectrometer_temperature","lamp_temperature","lamp_on_time","relative_humidity","main_voltage","lamp_voltage", - "internal_voltage","main_current","fit_aux_1","fit_aux_2","fit_base_1","fit_base_2","fit_rmse","ctd_time","ctd_salinity","ctd_temperature","ctd_pressure", - "check_sum","error_missing_data")] - #' 
Determine whether to use logged or streamed data. #' Logged data is used if available, and log data flag set to 1 if(!is.null(logData)){ @@ -231,8 +122,8 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, flagsOut$sunaLogDataQF<-1 } #' Streamed data is used if no logged data is available, and log data flags set to 0 - if(is.null(logData) & !is.null(L0DataParsed)){ - dataOut<-L0DataParsed + if(is.null(logData) & !is.null(L0Data)){ + dataOut<-L0Data flagsOut<-data.frame(matrix(ncol=2,nrow=nrow(dataOut), dimnames=list(NULL, c("readout_time", "sunaLogDataQF")))) flagsOut$readout_time<-dataOut$readout_time flagsOut$sunaLogDataQF<-0 From 06f552b929acaddcbac63bfe12ad58e3e81fa802 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Wed, 16 Jul 2025 18:31:27 -0600 Subject: [PATCH 042/182] load and parse suna data --- pipe/sunav2/pipe_list_sunav2.txt | 1 + pipe/sunav2/site-list-full.json | 138 ++++++++++++++++++ ...av2_cron_daily_and_date_control_kafka.yaml | 4 +- pipe/sunav2/sunav2_data_source_kafka.yaml | 72 +++++++-- pipe/sunav2/sunav2_data_source_trino.yaml | 22 +-- pipe/sunav2/sunav2_fill_log_files.yaml | 10 +- pipe/sunav2/sunav2_location_asset.yaml | 29 +++- pipe/sunav2/sunav2_location_loader.yaml | 29 +++- ...ser.yaml => sunav2_trino_data_parser.yaml} | 8 +- 9 files changed, 263 insertions(+), 50 deletions(-) create mode 100644 pipe/sunav2/site-list-full.json rename pipe/sunav2/{sunav2_data_parser.yaml => sunav2_trino_data_parser.yaml} (94%) diff --git a/pipe/sunav2/pipe_list_sunav2.txt b/pipe/sunav2/pipe_list_sunav2.txt index d446a73fd..a35e92f55 100644 --- a/pipe/sunav2/pipe_list_sunav2.txt +++ b/pipe/sunav2/pipe_list_sunav2.txt @@ -4,6 +4,7 @@ sunav2_logjam_load_files.yaml sunav2_logjam_assign_clean_files.yaml sunav2_data_source_kafka.yaml sunav2_data_source_trino.yaml +sunav2_trino_data_parser.yaml sunav2_fill_log_files.yaml sunav2_calibration_list_files.yaml sunav2_calibration_loader.yaml diff --git a/pipe/sunav2/site-list-full.json 
b/pipe/sunav2/site-list-full.json new file mode 100644 index 000000000..38df06e24 --- /dev/null +++ b/pipe/sunav2/site-list-full.json @@ -0,0 +1,138 @@ +[ + { + "site" : "ARIK", + "kafka_start_date" : "2024-03-01" + }, + { + "site" : "BARC", + "kafka_start_date" : "2024-08-11" + }, + { + "site" : "BIGC", + "kafka_start_date" : "2024-06-01" + }, + { + "site" : "BLDE", + "kafka_start_date" : "2024-05-08" + }, + { + "site" : "BLUE", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "BLWA", + "kafka_start_date" : "2024-08-22" + }, + { + "site" : "CARI", + "kafka_start_date" : "2024-03-01" + }, + { + "site" : "COMO", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "CRAM", + "kafka_start_date" : "2024-07-20" + }, + { + "site" : "CUPE", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "FLNT", + "kafka_start_date" : "2024-08-11" + }, + { + "site" : "GUIL", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "HOPB", + "kafka_start_date" : "2024-01-17" + }, + { + "site" : "KING", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "LECO", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "LEWI", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "LIRO", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "MART", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "MAYF", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "MCDI", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "MCRA", + "kafka_start_date" : "2024-02-05" + }, + { + "site" : "OKSR", + "kafka_start_date" : "2024-04-06" + }, + { + "site" : "POSE", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "PRIN", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "PRLA", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "PRPO", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "REDB", + "kafka_start_date" : "2024-02-06" + }, + { + "site" : "SUGG", + "kafka_start_date" : "2024-08-11" + }, + { + "site" : "SYCA", + "kafka_start_date" : 
"2024-04-11" + }, + { + "site" : "TECR", + "kafka_start_date" : "2024-03-17" + }, + { + "site" : "TOMB", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "TOOK", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "WALK", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "WLOU", + "kafka_start_date" : "2024-02-06" + } +] \ No newline at end of file diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml index 171c3ec40..6b5026a7a 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml @@ -11,7 +11,7 @@ transform: # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. OUT_PATH: /pfs/out - START_DATE: "2025-06-01" # Inclusive + START_DATE: "2025-07-01" # Inclusive SOURCE_TYPE: "sunav2" stdin: - "#!/bin/bash" @@ -25,7 +25,7 @@ input: overwrite: true - pfs: name: SITE_FILE - repo: sunav2_site_list + repo: sunav2_site_list_kafka glob: /site-list.json resource_requests: memory: 100M diff --git a/pipe/sunav2/sunav2_data_source_kafka.yaml b/pipe/sunav2/sunav2_data_source_kafka.yaml index dbab74d51..b6c2a2820 100644 --- a/pipe/sunav2/sunav2_data_source_kafka.yaml +++ b/pipe/sunav2/sunav2_data_source_kafka.yaml @@ -2,17 +2,25 @@ pipeline: name: sunav2_data_source_kafka transform: - image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.9.7 - image_pull_secrets: - - battelleecology-quay-read-all-pull-secret + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:sha-37cf497 env: - OUT_PATH: /pfs/out + # environment variables for kafka loader + OUT_PATH: /pfs/out #also used for parser SOURCE_TYPE: "sunav2_raw" - LOG_LEVEL: DEBUG + LOG_LEVEL: INFO YEAR_INDEX: "5" MONTH_INDEX: "6" DAY_INDEX: "7" 
KAFKA_RETENTION_DAYS: "15"
+
+    # environment variables for the parser
+    PARSE_FIELD: serial_output
+    RELATIVE_PATH_INDEX: "4"
+    PARSED_SCHEMA_PATH: /usr/src/app/parsed-schemas/sunav2/sunav2_parsed.avsc
+    SOURCE_TYPE: 'sunav2_raw'
+    DATA_PATH: /pfs/out # takes output of kafka loader as its input to parse
+    UPDATE_TRIGGER_TABLE: "False"
+    RM_OFFSETS: "False"
   secrets:
   - name: pachyderm-kafka-auth
     env_var: KAFKA_USER
@@ -88,7 +96,7 @@ transform:
       fi
 
       # We are ok to run
-      echo "Extracting $date_str kafka data for $site"
+      echo "Extracting $date_str kafka data for $SOURCE_TYPE at $site"
 
       # Get "current data" - data that came in on the specified day, which is the same day it was measured
       # Note: We cannot use the --removeoffset flag on the kafka loader (which removes the offsets from the filenames. This will often violate the Pachyderm requirement that different datums cannot write the same file)
@@ -112,23 +120,23 @@ transform:
        # set -x # Uncomment for debugging
        for f in $out_parquet_glob; do
          # Parse the path
-         [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/data/(.*)$ ]]
+         [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/(.*)/(.*)$ ]]
          fsourcetype="${BASH_REMATCH[1]}"
          fyear="${BASH_REMATCH[2]}"
          fmonth="${BASH_REMATCH[3]}"
          fday="${BASH_REMATCH[4]}"
          fsourceid="${BASH_REMATCH[5]}"
-         fname="${BASH_REMATCH[6]}"
-         fname_out="${fsourcetype}_${fsourceid}_${fyear}-${fmonth}-${fday}.parquet" # Remove offsets from the filename
+         fname="${BASH_REMATCH[7]}"
+         # fname_out="${fsourcetype}_${fsourceid}_${fyear}-${fmonth}-${fday}.parquet" # Remove offsets from the filename
          outdir="${linkdir}/v2/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}"
          mkdir -p "${outdir}"
-         ln -s "${f}" "${outdir}/${fname_out}"
+         ln -s "${f}" "${outdir}/${fname}"
 
-         # Upload to bucket, compacting with any existing file
-         ./compact-bucket-copy.py --sourcepath "${linkdir}" --destbucket "${BUCKET_NAME}"
-         rm -rf "${outdir}"
        done
 
+       # Upload to bucket, compacting with any existing file
+       
./compact-bucket-copy.py --sourcepath "${linkdir}" --destbucket "${BUCKET_NAME}" --stripoffset + # Update the airflow triggering table for site_output in "${sites_output[@]}"; do ./update-trigger-table.py -s $site_output -S $SOURCE_TYPE -D "$OUT_PATH/$SOURCE_TYPE" @@ -137,6 +145,44 @@ transform: # set +x # Uncomment for debugging rm -rf $linkdir fi + + # run data parser + if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then + python3 -m raw_data_parsers.raw_data_parser.suna_data_parser_main + + # save parsed data to gcs + export SOURCE_TYPE=sunav2 + + linkdir=$(mktemp -d) + shopt -s globstar + out_parquet_glob="${OUT_PATH}/${SOURCE_TYPE}/**/*.parquet" + # /pfs/out/sunav2/2023/01/01/12345/data/file.parquet + echo "Linking output files to ${linkdir}" + # set -x + for f in $out_parquet_glob; do + # Parse the path + [[ "$f" =~ ^$OUT_PATH/(.*)/([0-9]+)/([0-9]+)/([0-9]+)/(.*)/(.*)/(.*)$ ]] + fsourcetype="${BASH_REMATCH[1]}" + fyear="${BASH_REMATCH[2]}" + fmonth="${BASH_REMATCH[3]}" + fday="${BASH_REMATCH[4]}" + fsourceid="${BASH_REMATCH[5]}" + fname="${BASH_REMATCH[7]}" + # fname_out="${fsourcetype}_${fsourceid}_${fyear}-${fmonth}-${fday}.parquet" # Remove offsets from the filename + outdir="${linkdir}/v2/${fsourcetype}/ms=${fyear}-${fmonth}/source_id=${fsourceid}" + mkdir -p "${outdir}" + ln -s "${f}" "${outdir}/${fname}" + + done + + # Upload to bucket, compacting with any existing file + ./compact-bucket-copy.py --sourcepath "${linkdir}" --destbucket "${BUCKET_NAME}" --stripoffset + + # set +x # Uncomment for debugging + rm -rf $linkdir + + fi + EOF input: pfs: diff --git a/pipe/sunav2/sunav2_data_source_trino.yaml b/pipe/sunav2/sunav2_data_source_trino.yaml index d8bb50554..42d53cdf2 100644 --- a/pipe/sunav2/sunav2_data_source_trino.yaml +++ b/pipe/sunav2/sunav2_data_source_trino.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_data_source_trino transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-data-src-trino:v2.1.1 + image: 
us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-data-src-trino:v2.2.4 cmd: - sh - "-c" @@ -40,15 +40,11 @@ transform: echo "$year/$month/$day/$site is indicated to be streaming from Kafka. Skipping..." continue fi - - # Set env vars for trino loader + echo "Extracting data from Trino for $year/$month/$day/$site" export GEN_DATE=$year-$month-$day export GEN_SITE_NAME=$site - export REQUESTS_CA_BUNDLE=/etc/pki/tls/cert.pem - export GEN_YAML_CONF="/usr/src/app/genscript/configs/$(echo $SOURCE_TYPE)_streams.yaml" - export GEN_SCHEMA_FILE="/usr/src/app/schemas/$(echo $SOURCE_TYPE)/$(echo $SOURCE_TYPE)_raw.avsc" - echo "Extracting $SOURCE_TYPE from Trino for $year/$month/$day/$site" export GEN_OUTPUT_DIR=$interimDir/$SOURCE_TYPE/$year/$month/$day + export REQUESTS_CA_BUNDLE=/etc/pki/tls/cert.pem mkdir -p $GEN_OUTPUT_DIR /usr/src/app/genscript/genparquet.py --storesitename --codec gzip done @@ -61,7 +57,7 @@ transform: linkdir=$(mktemp -d) shopt -s globstar out_parquet_glob="${OUT_PATH}/**/*.parquet" - # Example: /pfs/out/li191r/2023/01/01/12345/data/file.parquet + # Example: /pfs/out/sunav2_raw/2023/01/01/12345/data/file.parquet echo "Linking output files to ${linkdir}" # set -x # Uncomment for debugging for f in $out_parquet_glob; do @@ -84,6 +80,7 @@ transform: --copy-links \ --gcs-bucket-policy-only \ --gcs-no-check-bucket \ + --metadata-set "content-type=application/vnd.apache.parquet" \ copy \ "${linkdir}" \ ":gcs://${BUCKET_NAME}" @@ -95,8 +92,10 @@ transform: fi EOF env: - # Static environment variables for data conversion step - LOG_LEVEL: DEBUG + # Environment variables for data conversion step + GEN_YAML_CONF: "/usr/src/app/genscript/configs/sunav2_streams.yaml" + GEN_SCHEMA_FILE: "/usr/src/app/schemas/sunav2/sunav2_raw.avsc" + LOG_LEVEL: INFO REQUESTS_CA_BUNDLE: "/etc/pki/tls/cert.pem" # Environment variables for linkmerge step IN_PATH: /tmp/interimData @@ -107,7 +106,8 @@ transform: DAY_INDEX: '6' SOURCE_ID_INDEX: '7' KAFKA_RETENTION_DAYS: 
"15" - SOURCE_TYPE: "sunav2" + # Environment variables for bash code + SOURCE_TYPE: 'sunav2_raw' secrets: - name: pachd-trino-secret key: TRINO_HOST diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index da76a0ed7..a89c68113 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -137,29 +137,29 @@ input: join: - pfs: name: DATA_PATH_TRINO - repo: sunav2_data_source_trino - glob: /(*/*/*/*) #sunav2/Y/M/D + repo: sunav2_trino_data_parser + glob: /(sunav2/*/*/*) #sunav2/Y/M/D joinOn: $1 empty_files: false # Make sure this is false for LINK_TYPE=COPY outer_join: true - pfs: name: DATA_PATH_KAFKA repo: sunav2_data_source_kafka - glob: /(*/*/*/*) + glob: /(sunav2/*/*/*) joinOn: $1 empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. outer_join: true - pfs: name: DATA_PATH_LOG repo: sunav2_logjam_assign_clean_files - glob: /(*/*/*/*) #sunav2/Y/M/D + glob: /(sunav2/*/*/*) #sunav2/Y/M/D joinOn: $1 empty_files: false # Make sure this is false for LINK_TYPE=COPY outer_join: true - pfs: name: DATE_CONTROL repo: sunav2_cron_daily_and_date_control - glob: /(*/*/*/*) #sunav2/Y/M/D + glob: /(sunav2/*/*/*) #sunav2/Y/M/D joinOn: $1 empty_files: false # Make sure this is false for LINK_TYPE=COPY outer_join: true diff --git a/pipe/sunav2/sunav2_location_asset.yaml b/pipe/sunav2/sunav2_location_asset.yaml index cca69b65a..902a1d9ac 100644 --- a/pipe/sunav2/sunav2_location_asset.yaml +++ b/pipe/sunav2/sunav2_location_asset.yaml @@ -4,18 +4,33 @@ pipeline: transform: # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-asset-loader:v1.1.0 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-asset-loader:v1.0.0 + cmd: - - /bin/bash - stdin: - - '#!/bin/bash' - - python3 -m location_asset_loader.location_asset_loader_main + - 
sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf $OUT_PATH + mkdir -p $OUT_PATH # R modules must have pfs in the repo structure + + python3 -m location_asset_loader.location_asset_loader_main + + cp -r $OUT_PATH/$SOURCE_TYPE /pfs/out/$SOURCE_TYPE_OUT + + EOF env: - OUT_PATH: /pfs/out + OUT_PATH: /tmp/out # ERR_PATH can be changed, it is user specified ERR_PATH: /pfs/out/errored_datums LOG_LEVEL: INFO - SOURCE_TYPE: sunav2 + SOURCE_TYPE: sunav2_raw + SOURCE_TYPE_OUT: sunav2 secrets: - name: pdr-secret mount_path: /var/db_secret diff --git a/pipe/sunav2/sunav2_location_loader.yaml b/pipe/sunav2/sunav2_location_loader.yaml index 672ea2797..b5815e716 100644 --- a/pipe/sunav2/sunav2_location_loader.yaml +++ b/pipe/sunav2/sunav2_location_loader.yaml @@ -6,17 +6,30 @@ transform: # - battelleecology-quay-read-all-pull-secret image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-loader:v1.0.0 cmd: - - /bin/bash - stdin: - - '#!/bin/bash' - - python3 -m location_loader.location_loader_main + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf $OUT_PATH + mkdir -p $OUT_PATH # R modules must have pfs in the repo structure + + python3 -m location_loader.location_loader_main #run the location loader + + cp -r $OUT_PATH/$SOURCE_TYPE /pfs/out/$SOURCE_TYPE_OUT + + EOF env: LOCATION_TYPE: CONFIG - SOURCE_TYPE: sunav2 - OUT_PATH: /pfs/out - # ERR_PATH can be changed, it is user specified - ERR_PATH: /pfs/out/errored_datums + SOURCE_TYPE: sunav2_raw + SOURCE_TYPE_OUT: sunav2 + OUT_PATH: /tmp/out LOG_LEVEL: INFO + ERR_PATH: /pfs/out/errored_datums secrets: - name: pdr-secret mount_path: /var/db_secret diff --git a/pipe/sunav2/sunav2_data_parser.yaml b/pipe/sunav2/sunav2_trino_data_parser.yaml similarity index 94% rename from pipe/sunav2/sunav2_data_parser.yaml rename to pipe/sunav2/sunav2_trino_data_parser.yaml index 0150ef9ee..14a432103 100644 --- a/pipe/sunav2/sunav2_data_parser.yaml +++ b/pipe/sunav2/sunav2_trino_data_parser.yaml @@ -1,7 +1,7 @@ pipeline: - name: sunav2_data_parser + name: sunav2_trino_data_parser transform: - image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.9.7 + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:sha-37cf497 env: # if use default PARSED_START_INDEX and PARSED_END_INDEX, parse all elements in parse_field # if use default for FIELD_START_INDEX and FIELD_END_INDEX, @@ -39,7 +39,7 @@ transform: IFS=$'\n\t' # run data parser - python3 -m raw_data_parsers.raw_data_parser.data_parser_main + python3 -m raw_data_parsers.raw_data_parser.suna_data_parser_main # Upload L0 files to bucket, compacting with any existing file with the same name # when SOURCE_TYPE is sunav2_raw, OUT_SOURCE_TYPE is sunav2 @@ -77,7 +77,7 @@ transform: input: pfs: name: DATA_PATH - repo: sunav2_data_source_kafka + repo: sunav2_data_source_trino glob: 
/sunav2_raw/*/*/* parallelism_spec: constant: 3 From ff8d2f7ea38cae8a94d2aa5b65be95e10e1e1fd5 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Wed, 16 Jul 2025 18:32:17 -0600 Subject: [PATCH 043/182] change source type in suna calibration loader to "sunav2" instead of "sunav2_raw" --- .../sunav2/sunav2_calibration_assignment.yaml | 2 +- pipe/sunav2/sunav2_calibration_loader.yaml | 42 ++++++++++++------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/pipe/sunav2/sunav2_calibration_assignment.yaml b/pipe/sunav2/sunav2_calibration_assignment.yaml index f5fbd64cf..42fd8d720 100644 --- a/pipe/sunav2/sunav2_calibration_assignment.yaml +++ b/pipe/sunav2/sunav2_calibration_assignment.yaml @@ -22,7 +22,7 @@ input: - pfs: name: DIR_IN repo: sunav2_calibration_loader - glob: /sunav2_raw/* + glob: /sunav2/* - pfs: name: FILE_YEAR repo: sunav2_cron_daily_and_date_control diff --git a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml index a751f9dbb..10abcc746 100644 --- a/pipe/sunav2/sunav2_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -1,20 +1,34 @@ --- pipeline: - name: sunav2_calibration_loader + name: suna_calibration_loader transform: - # image_pull_secrets: - # - battelleecology-quay-read-all-pull-secret - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:sha-a243d3e + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v2.3.3 cmd: - - /bin/bash - stdin: - - '#!/bin/bash' - - python3 -m calval_loader.load_all_calval_files + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-sstrict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf $OUT_PATH + mkdir -p $OUT_PATH # R modules must have pfs in the repo structure + + python3 -m calval_loader.load_all_calval_files #run the calibration loader + + if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then + cp -r $OUT_PATH/$SOURCE_TYPE /pfs/out/$SOURCE_TYPE_OUT + fi + + EOF env: CVAL_INGEST_BUCKET: neon-cval - OUT_PATH: /pfs/out - LOG_LEVEL: DEBUG - SOURCE_TYPE: sunav2_raw + OUT_PATH: /tmp/out + LOG_LEVEL: INFO + SOURCE_TYPE: "suna_raw" + SOURCE_TYPE_OUT: "suna" STARTING_PATH_INDEX: "5" secrets: - name: pdr-secret @@ -22,7 +36,7 @@ transform: input: pfs: name: IN_PATH - repo: sunav2_calibration_list_files + repo: suna_calibration_list_files glob: /*/*/*/* empty_files: true parallelism_spec: @@ -35,8 +49,8 @@ resource_limits: memory: 1G cpu: 1.5 sidecar_resource_requests: - memory: 2G - cpu: 0.5 + memory: 800M + cpu: 0.2 datum_set_spec: number: 1 scheduling_spec: From 638456ab3a3f37144abea8ee24f73c64b8362569 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Wed, 16 Jul 2025 18:38:44 -0600 Subject: [PATCH 044/182] bug fix --- pipe/sunav2/sunav2_calibration_loader.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml index 10abcc746..9a6757367 100644 --- a/pipe/sunav2/sunav2_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -36,7 +36,7 @@ transform: input: pfs: name: IN_PATH - repo: suna_calibration_list_files + repo: sunav2_calibration_list_files glob: /*/*/*/* empty_files: true parallelism_spec: From 9666aec3caeca58b93a9b00444ab3960e351284f Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Wed, 16 Jul 2025 20:30:10 -0600 Subject: [PATCH 045/182] fix source type --- pipe/sunav2/sunav2_calibration_loader.yaml | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml index 9a6757367..1772a996d 100644 --- a/pipe/sunav2/sunav2_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: suna_calibration_loader + name: sunav2_calibration_loader transform: image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v2.3.3 cmd: @@ -27,8 +27,8 @@ transform: CVAL_INGEST_BUCKET: neon-cval OUT_PATH: /tmp/out LOG_LEVEL: INFO - SOURCE_TYPE: "suna_raw" - SOURCE_TYPE_OUT: "suna" + SOURCE_TYPE: "sunav2_raw" + SOURCE_TYPE_OUT: "sunav2" STARTING_PATH_INDEX: "5" secrets: - name: pdr-secret From 155c3d89b7e83742a18f891443b896d9d717a1f9 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 17 Jul 2025 07:45:37 -0600 Subject: [PATCH 046/182] update image --- pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index a89c68113..13f18af38 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-2476643 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-9666aec cmd: - sh - "-c" From 704b1496004b78e75b41497e9a2c3baf0c4dce55 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 17 Jul 2025 09:22:18 -0600 Subject: [PATCH 047/182] ignor kafka data for now --- pipe/sunav2/sunav2_fill_log_files.yaml | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index 13f18af38..dd1f2dfea 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ 
b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-9666aec + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-155c3d8 cmd: - sh - "-c" @@ -107,13 +107,13 @@ transform: # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. # Metadata indices will typically begin at index 3. input_paths: - - path: - name: DATA_PATH_KAFKA - # Filter for data directory - glob_pattern: /tmp/kafka_merged/*/*/*/*/*/** - # Join on named location (already joined below by day) - join_indices: [3,4,5,6] - outer_join: true + # - path: + # name: DATA_PATH_KAFKA + # # Filter for data directory + # glob_pattern: /tmp/kafka_merged/*/*/*/*/*/** + # # Join on named location (already joined below by day) + # join_indices: [3,4,5,6] + # outer_join: true - path: name: DATA_PATH_TRINO # Filter for data directory @@ -142,13 +142,13 @@ input: joinOn: $1 empty_files: false # Make sure this is false for LINK_TYPE=COPY outer_join: true - - pfs: - name: DATA_PATH_KAFKA - repo: sunav2_data_source_kafka - glob: /(sunav2/*/*/*) - joinOn: $1 - empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - outer_join: true + # - pfs: + # name: DATA_PATH_KAFKA + # repo: sunav2_data_source_kafka + # glob: /(sunav2/*/*/*) + # joinOn: $1 + # empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
+ # outer_join: true - pfs: name: DATA_PATH_LOG repo: sunav2_logjam_assign_clean_files From b28f2da63a6775067d47e0b351b249aaf7feedae Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 17 Jul 2025 10:42:39 -0600 Subject: [PATCH 048/182] latest --- pipe/sunav2/sunav2_location_group_and_restructure.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipe/sunav2/sunav2_location_group_and_restructure.yaml b/pipe/sunav2/sunav2_location_group_and_restructure.yaml index 4adf27e22..1edea9484 100644 --- a/pipe/sunav2/sunav2_location_group_and_restructure.yaml +++ b/pipe/sunav2/sunav2_location_group_and_restructure.yaml @@ -33,8 +33,7 @@ transform: DirIn=/tmp/pfs/structuredCopy \ DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ - "DirSubCombData=data|flags|uncertainty_data" \ - DirSubCombUcrt=uncertainty_coef \ + "DirSubCombData=data|flags" \ #add in uncertainty data later DirSubCopy=location EOF env: From 898036cd9053ec45f592579cdda2e95f0d0fa383 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 17 Jul 2025 10:42:39 -0600 Subject: [PATCH 049/182] latest --- pipe/sunav2/pipe_list_sunav2.txt | 9 ++++----- pipe/sunav2/sunav2_location_group_and_restructure.yaml | 3 +-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pipe/sunav2/pipe_list_sunav2.txt b/pipe/sunav2/pipe_list_sunav2.txt index a35e92f55..71302f636 100644 --- a/pipe/sunav2/pipe_list_sunav2.txt +++ b/pipe/sunav2/pipe_list_sunav2.txt @@ -9,16 +9,15 @@ sunav2_fill_log_files.yaml sunav2_calibration_list_files.yaml sunav2_calibration_loader.yaml sunav2_calibration_assignment.yaml -sunav2_calibration_group_and_convert.yaml - +sunav2_calibration_group_and_convert.yaml sunav2_location_asset.yaml -sunav2_location_loader.yaml -sunav2_calibration_assignment.yaml sunav2_location_asset_assignment.yaml +sunav2_location_loader.yaml sunav2_location_active_dates_assignment.yaml -sunav2_calibration_group_and_convert.yaml + + sunav2_location_group_and_restructure.yaml 
sunav2_fill_date_gaps_and_regularize.yaml diff --git a/pipe/sunav2/sunav2_location_group_and_restructure.yaml b/pipe/sunav2/sunav2_location_group_and_restructure.yaml index 4adf27e22..1edea9484 100644 --- a/pipe/sunav2/sunav2_location_group_and_restructure.yaml +++ b/pipe/sunav2/sunav2_location_group_and_restructure.yaml @@ -33,8 +33,7 @@ transform: DirIn=/tmp/pfs/structuredCopy \ DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ - "DirSubCombData=data|flags|uncertainty_data" \ - DirSubCombUcrt=uncertainty_coef \ + "DirSubCombData=data|flags" \ #add in uncertainty data later DirSubCopy=location EOF env: From 976f388d7424efad626ea901d6db35578d4b75e3 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Thu, 17 Jul 2025 11:24:38 -0600 Subject: [PATCH 050/182] update kafka image --- pipe/sunav2/sunav2_data_source_kafka.yaml | 2 +- pipe/sunav2/sunav2_trino_data_parser.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipe/sunav2/sunav2_data_source_kafka.yaml b/pipe/sunav2/sunav2_data_source_kafka.yaml index b6c2a2820..582122965 100644 --- a/pipe/sunav2/sunav2_data_source_kafka.yaml +++ b/pipe/sunav2/sunav2_data_source_kafka.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_data_source_kafka transform: - image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:sha-37cf497 + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.10.1 env: # environment variables for kafka loader OUT_PATH: /pfs/out #also used for parser diff --git a/pipe/sunav2/sunav2_trino_data_parser.yaml b/pipe/sunav2/sunav2_trino_data_parser.yaml index 14a432103..38d6148cb 100644 --- a/pipe/sunav2/sunav2_trino_data_parser.yaml +++ b/pipe/sunav2/sunav2_trino_data_parser.yaml @@ -1,7 +1,7 @@ pipeline: name: sunav2_trino_data_parser transform: - image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:sha-37cf497 + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.10.1 env: # if 
use default PARSED_START_INDEX and PARSED_END_INDEX, parse all elements in parse_field # if use default for FIELD_START_INDEX and FIELD_END_INDEX, From 3919b2e1c8422bf6e54ebeba4fe8f394af430fbe Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Wed, 30 Jul 2025 17:42:17 -0600 Subject: [PATCH 051/182] update resource request --- pipe/sunav2/sunav2_data_source_kafka.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipe/sunav2/sunav2_data_source_kafka.yaml b/pipe/sunav2/sunav2_data_source_kafka.yaml index 582122965..b4ba246a7 100644 --- a/pipe/sunav2/sunav2_data_source_kafka.yaml +++ b/pipe/sunav2/sunav2_data_source_kafka.yaml @@ -194,10 +194,10 @@ parallelism_spec: constant: 3 autoscaling: true resource_requests: - memory: 300M + memory: 1G cpu: 1.6 resource_limits: - memory: 1.5G + memory: 2G cpu: 2 sidecar_resource_requests: memory: 2G From 8f8994d8c237194c33b0953d03dba5a59cab2dd7 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Thu, 31 Jul 2025 10:54:46 -0600 Subject: [PATCH 052/182] update resource requests --- pipe/sunav2/sunav2_data_source_kafka.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipe/sunav2/sunav2_data_source_kafka.yaml b/pipe/sunav2/sunav2_data_source_kafka.yaml index b4ba246a7..bae43cb2f 100644 --- a/pipe/sunav2/sunav2_data_source_kafka.yaml +++ b/pipe/sunav2/sunav2_data_source_kafka.yaml @@ -194,11 +194,11 @@ parallelism_spec: constant: 3 autoscaling: true resource_requests: - memory: 1G - cpu: 1.6 -resource_limits: - memory: 2G + memory: 2.5G cpu: 2 +resource_limits: + memory: 3G + cpu: 2.5 sidecar_resource_requests: memory: 2G cpu: 0.5 From 4324225d5b878d625fd8856e39a083f57b8c44ff Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 18 Aug 2025 15:10:41 -0600 Subject: [PATCH 053/182] latest suna pipeline --- pipe/sunav2/pipe_list_sunav2.txt | 5 --- .../sunav2_calibration_group_and_convert.yaml | 40 ++++++++++--------- .../sunav2_fill_date_gaps_and_regularize.yaml | 9 ++--- 
pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- ...sunav2_location_group_and_restructure.yaml | 3 +- .../sunav2_logjam_assign_clean_files.yaml | 2 +- 6 files changed, 29 insertions(+), 32 deletions(-) diff --git a/pipe/sunav2/pipe_list_sunav2.txt b/pipe/sunav2/pipe_list_sunav2.txt index 71302f636..c5d097b53 100644 --- a/pipe/sunav2/pipe_list_sunav2.txt +++ b/pipe/sunav2/pipe_list_sunav2.txt @@ -9,15 +9,10 @@ sunav2_fill_log_files.yaml sunav2_calibration_list_files.yaml sunav2_calibration_loader.yaml sunav2_calibration_assignment.yaml - sunav2_calibration_group_and_convert.yaml - - sunav2_location_asset.yaml sunav2_location_asset_assignment.yaml sunav2_location_loader.yaml sunav2_location_active_dates_assignment.yaml - - sunav2_location_group_and_restructure.yaml sunav2_fill_date_gaps_and_regularize.yaml diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml index e2fac2f5f..19bb228bb 100644 --- a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -26,10 +26,9 @@ transform: DirIn=/tmp/pfs/filter_joined \ DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ - DirSubCopy=flags \ - "TermFuncConv=pressure:def.cal.conv.poly.b|temperature:def.cal.conv.poly|conductivity:def.cal.conv.poly.split" \ - "TermQf=pressure|temperature|conductivity" \ - "TermFuncUcrt=pressure:def.ucrt.meas.cnst|temperature:def.ucrt.meas.cnst|conductivity:def.ucrt.meas.cnst" + TermQf=nitrate_concentration \ + FileSchmQf=$FILE_SCHEMA_FLAGS \ + DirSubCopy=flags EOF env: # Environment variables for filter-joiner. 
@@ -59,21 +58,26 @@ transform: # Environment variables for calibration module PARALLELIZATION_INTERNAL: '3' # Option for calibration conversion module input: - # Outer join all days - join: - - pfs: - name: CALIBRATION_PATH - repo: sunav2_calibration_assignment - glob: /(*)/(*)/(*)/(*) - joinOn: $1/$2/$3/$4 - empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + cross: - pfs: - name: DATA_PATH - repo: sunav2_fill_log_files - glob: /(*)/(*)/(*)/(*) - joinOn: $1/$2/$3/$4 - outer_join: true - empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + name: FILE_SCHEMA_FLAGS + repo: sunav2_avro_schemas + glob: /sunav2/flags_calibration_sunav2.avsc + # Outer join all days + - join: + - pfs: + name: CALIBRATION_PATH + repo: sunav2_calibration_assignment + glob: /(*)/(*)/(*)/(*) + joinOn: $1/$2/$3/$4 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - pfs: + name: DATA_PATH + repo: sunav2_fill_log_files + glob: /(*)/(*)/(*)/(*) + joinOn: $1/$2/$3/$4 + outer_join: true + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. parallelism_spec: constant: 5 autoscaling: true diff --git a/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml b/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml index 370d181c8..a3d8ed4a7 100644 --- a/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml +++ b/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml @@ -11,11 +11,14 @@ transform: # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ set -euo pipefail IFS=$'\n\t' + # Refresh interim directories with each datum (otherwise they persist and cause probs) rm -rf $OUT_PATH mkdir -p $OUT_PATH + # Run first module - date-gap-filler (using environment variables below as input parameters) python3 -m date_gap_filler.date_gap_filler_main + # Run second module - regularize Rscript ./flow.rglr.R \ DirIn=/tmp/pfs/date_filled \ @@ -60,21 +63,15 @@ input: - pfs: name: DATA_PATH repo: sunav2_location_group_and_restructure - # For full-scale daily processing, glob should be /sunav2/(*/*/*). To limit to particular CFGLOCs, note the parentheses and enter something like /sunav2/(*/*/*/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) glob: /sunav2/(*/*/*) - #glob: /sunav2/(*/*/*/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) group_by: $1 empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - join: - pfs: name: LOCATION_PATH repo: sunav2_location_active_dates_assignment - # For full-scale daily processing, glob should be /sunav2/(*/*/*). To limit to particular CFGLOCs, note the parentheses and enter something like /sunav2/((*/*/*)/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) glob: /sunav2/(*/*/*) - #glob: /sunav2/((*/*/*)/(CFGLOC100445|CFGLOC100218|CFGLOC100219|CFGLOC100446|CFGLOC100449|CFGLOC100087)) - # For full-scale daily processing, joinOn be $1. When limiting to particular CFGLOCs, joinOn will be $2 to match parentheses around (*/*/*) joinOn: $1 - #joinOn: $2 group_by: $1 empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
- pfs: diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index dd1f2dfea..0148b79b9 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-155c3d8 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-a1ff444 cmd: - sh - "-c" diff --git a/pipe/sunav2/sunav2_location_group_and_restructure.yaml b/pipe/sunav2/sunav2_location_group_and_restructure.yaml index 1edea9484..dde304e5a 100644 --- a/pipe/sunav2/sunav2_location_group_and_restructure.yaml +++ b/pipe/sunav2/sunav2_location_group_and_restructure.yaml @@ -33,7 +33,8 @@ transform: DirIn=/tmp/pfs/structuredCopy \ DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ - "DirSubCombData=data|flags" \ #add in uncertainty data later + "DirSubCombData=data|flags" \ + DirSubCombUcrt=uncertainty_coef \ DirSubCopy=location EOF env: diff --git a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml index 891ab896f..432e3324e 100644 --- a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml +++ b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml @@ -10,7 +10,7 @@ transform: DirIn=$DIR_IN DirOut=/pfs/out DirErr=$ERR_PATH - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:sha-75cbee2 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:sha-a1ff444 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: From 390f2d57fe8cb71bb1406a07547dabac0a1241a3 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Wed, 20 Aug 2025 16:30:52 -0600 Subject: [PATCH 054/182] Started work on SUNA flagging module. 
--- .../wrap.suna.quality.flags.R | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R diff --git a/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R new file mode 100644 index 000000000..7aff81a43 --- /dev/null +++ b/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R @@ -0,0 +1,124 @@ +############################################################################################## +#' @title Wrapper for SUNA quality flagging + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} +#' +#' @description Wrapper function. Uses thresholds to apply quality flags to SUNA data. +#' +#' @param DirIn Character value. The file path to the input data. +#' +#' @param DirOut Character value. The file path for the output data. +#' +#' @param DirThresholds Character value. The file path for the quality flag thresholds. +#' +#' @param SchmDataOut (optional), A json-formatted character string containing the schema for the output data +#' file. If this input is not provided, the output schema for the data will be the same as the input data +#' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF +#' THE INPUT DATA. +#' +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. +#' +#' @return SUNA data with quality flags applied in daily parquets. 
+#' +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 +#' +#' @keywords Currently none +#' +#' @examples +#' # Not run +# DirIn<-"~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data" +# DirOut<-"~/pfs/sunav2_quality_flagged_data/sunav2/2024/09/10/CFGLOC110733" +# DirThresholds<-"~/pfs/sunav2_thresholds" +# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_flagged.avsc'),collapse='') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +#' +#' @changelog +#' Bobby Hensley (2025-08-30) created +#' +############################################################################################## +wrap.sunav2.quality.flags <- function(DirIn=NULL, + DirOut=NULL, + SchmDataOut=NULL, + log=NULL +){ + + #' Start logging if not already + if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + #' Read in parquet file of input data + fileName<-base::list.files(DirIn,full.names=FALSE) + sunaData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirIn, '/', fileName), + log = log),silent = FALSE) + + #' Read in csv file of quality flag thresholds + sunaThresholds<-read.csv(file=(base::paste0(DirThresholds,'/sunav2_thresholds.csv'))) + ############################################################################################# + #' Will be a different module that find the thresholds for the site and date range of the data + #' This is just a temporary workaround + siteThresholds<-sunaThresholds[(sunaThresholds$Named.Location.Name=="CRAM"),] + ############################################################################################# + + + + + + + + + + + + + + + + + + + + + + + + + + #' Write out data file and log flags file + + #write out data file + fileOutSplt <- base::strsplit(DirInStream,'[/]')[[1]] # Separate underscore-delimited components of the file name + asset<-tail(x=fileOutSplt,n=1) + csv_name 
<-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d")) + + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = dataOut, + NameFile = base::paste0(DirOutData,'/',csv_name,".parquet"), + Schm = SchmDataOut),silent=TRUE) + if(class(rptOut)[1] == 'try-error'){ + log$error(base::paste0('Cannot write Data to ',base::paste0(DirOutData,'/',csv_name,".parquet"),'. ',attr(rptOut, "condition"))) + stop() + } else { + log$info(base::paste0('Data written successfully in ', base::paste0(DirOutData,'/',csv_name,".parquet"))) + } + + #write out log flags file + csv_name_flags <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d"),'_logFlags') + + rptOutFlags <- try(NEONprocIS.base::def.wrte.parq(data = flagsOut, + NameFile = base::paste0(DirOutFlags,'/',csv_name_flags,".parquet"), + Schm = SchmFlagsOut),silent=TRUE) + if(class(rptOutFlags)[1] == 'try-error'){ + log$error(base::paste0('Cannot write Flags to ',base::paste0(DirOutFlags,'/',csv_name_flags,".parquet"),'. ',attr(rptOutFlags, "condition"))) + stop() + } else { + log$info(base::paste0('Flags written successfully in ', base::paste0(DirOutFlags,'/',csv_name_flags,".parquet"))) + } + +} + + + From e1d6a9a316ee6d2f77c11a7810fa0488c9c5d525 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 21 Aug 2025 10:23:58 -0600 Subject: [PATCH 055/182] nitrate latest --- .../nitrate_analyze_pad_and_qaqc_plau.yaml | 87 +++++++++++++++++++ pipe/nitrate/nitrate_group_assignment.yaml | 61 +++++++++++++ pipe/nitrate/nitrate_group_loader.yaml | 53 +++++++++++ pipe/nitrate/nitrate_group_path.yaml | 87 +++++++++++++++++++ pipe/nitrate/nitrate_srf_assignment.yaml | 58 +++++++++++++ pipe/nitrate/nitrate_srf_loader.yaml | 51 +++++++++++ .../nitrate/nitrate_thresh_select_ts_pad.yaml | 81 +++++++++++++++++ pipe/nitrate/nitrate_threshold.yaml | 53 +++++++++++ pipe/nitrate/pipe_list_nitrate.txt | 16 ++++ pipe/sunav2/pipe_list_sunav2.txt | 3 +- 10 files changed, 548 insertions(+), 2 deletions(-) create mode 100644 
pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml create mode 100644 pipe/nitrate/nitrate_group_assignment.yaml create mode 100644 pipe/nitrate/nitrate_group_loader.yaml create mode 100644 pipe/nitrate/nitrate_group_path.yaml create mode 100644 pipe/nitrate/nitrate_srf_assignment.yaml create mode 100644 pipe/nitrate/nitrate_srf_loader.yaml create mode 100644 pipe/nitrate/nitrate_thresh_select_ts_pad.yaml create mode 100644 pipe/nitrate/nitrate_threshold.yaml create mode 100644 pipe/nitrate/pipe_list_nitrate.txt diff --git a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml new file mode 100644 index 000000000..39b29eacd --- /dev/null +++ b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml @@ -0,0 +1,87 @@ +--- +pipeline: + name: parQuantumLine_analyze_pad_and_qaqc_plau +transform: + image_pull_secrets: + - battelleecology-quay-read-all-pull-secret + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-ts-pad-anls-qaqc-plau:v1.1.2 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + + # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf /tmp/pfs/padded_analyzer + rm -rf /tmp/pfs/padded_analyzerCopy + mkdir -p /tmp/pfs/padded_analyzer + + # Run first module - padded_timeseries_analyzer + python3 -m padded_timeseries_analyzer.padded_timeseries_analyzer.padded_timeseries_analyzer_main + + # Copy output to another interim folder to destroy links (cannot daisy chain links from pfs input to output) + cp -rL /tmp/pfs/padded_analyzer /tmp/pfs/padded_analyzerCopy || : # Allow to fail without exit code (happens if step above produced no output) + rm -r -f /tmp/pfs/padded_analyzer + + # Run second module - qaqc plausibility + Rscript ./flow.qaqc.plau.R \ + DirIn=/tmp/pfs/padded_analyzerCopy \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + FileSchmQf=$SCHEMA_FLAGS \ + "TermTest1=par:null|gap|range(rmv)|step(rmv)|persistence" + + EOF + env: + # Environment variables for padded timeseries analyzer + OUT_PATH: /tmp/pfs/padded_analyzer + LOG_LEVEL: INFO + RELATIVE_PATH_INDEX: '3' + ERR_PATH: /pfs/out/errored_datums + # Environment variables for qaqc plausibility + PARALLELIZATION_INTERNAL: '5' +input: + cross: + - pfs: + name: DATA_PATH + repo: parQuantumLine_thresh_select_ts_pad + glob: /*/*/* + - pfs: + name: SCHEMA_FLAGS + repo: parQuantumLine_avro_schemas + glob: /parQuantumLine/flags_plausibility_parQuantumLine.avsc +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 2G + cpu: 5.5 +resource_limits: + memory: 4G + cpu: 7 +sidecar_resource_requests: + memory: 3G + cpu: 0.3 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" 
+ }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_group_assignment.yaml b/pipe/nitrate/nitrate_group_assignment.yaml new file mode 100644 index 000000000..a7cefa03e --- /dev/null +++ b/pipe/nitrate/nitrate_group_assignment.yaml @@ -0,0 +1,61 @@ +--- +pipeline: + name: nitrate_group_assignment +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - export ERR_PATH="/pfs/out/errored_datums$FILE_YEAR" + - Rscript + ./flow.loc.grp.asgn.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + FileYear=$FILE_YEAR + TypeFile=group + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.1 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + env: + LOG_LEVEL: INFO +input: + cross: + - pfs: + name: DIR_IN + repo: nitrate_group_loader + glob: /nitrate/* + - pfs: + name: FILE_YEAR + repo: sunav2_cron_daily_and_date_control + glob: /data_year_*.txt +parallelism_spec: + constant: 2 +autoscaling: true +resource_requests: + memory: 250M + cpu: 1 +resource_limits: + memory: 400M + cpu: 2 +sidecar_resource_requests: + memory: 1G + cpu: 0.5 +datum_set_spec: + number: 5 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_group_loader.yaml b/pipe/nitrate/nitrate_group_loader.yaml new file mode 100644 index 000000000..8a8657c68 --- /dev/null +++ b/pipe/nitrate/nitrate_group_loader.yaml @@ -0,0 +1,53 @@ +pipeline: + name: nitrate_group_loader +transform: + cmd: + - /bin/bash + env: + GROUP_PREFIX: nitrate_ + LOG_LEVEL: INFO + OUT_PATH: /pfs/out + # ERR_PATH can be 
changed, it is user specified + ERR_PATH: /pfs/out/errored_datums + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-group-loader:v1.0.0 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + secrets: + - name: pdr-secret + mount_path: /var/db_secret + stdin: + - '#!/bin/bash' + - python3 -m group_loader.group_loader_main +input: + pfs: + branch: master + empty_files: true + glob: /* + repo: sunav2_cron_daily_and_date_control_tick +autoscaling: true +resource_requests: + memory: 50M + cpu: 0.1 +resource_limits: + memory: 300M + cpu: 1.5 +sidecar_resource_requests: + memory: 200M + cpu: 0.2 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_group_path.yaml b/pipe/nitrate/nitrate_group_path.yaml new file mode 100644 index 000000000..5c3360247 --- /dev/null +++ b/pipe/nitrate/nitrate_group_path.yaml @@ -0,0 +1,87 @@ +--- +pipeline: + name: nitrate_group_path +transform: + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-group-path:v1.0.0 + cmd: + - /bin/bash + stdin: + - '#!/bin/bash' + - python3 -m group_path.group_path_main + env: + GROUP: nitrate_ + LOG_LEVEL: INFO + OUT_PATH: /pfs/out + # ERR_PATH can be changed, it is user specified + ERR_PATH: /pfs/out/errored_datums + GROUP_ASSIGNMENT_YEAR_INDEX: '4' + GROUP_ASSIGNMENT_MONTH_INDEX: '5' + GROUP_ASSIGNMENT_DAY_INDEX: '6' + GROUP_ASSIGNMENT_MEMBER_INDEX: '7' + GROUP_ASSIGNMENT_DATA_TYPE_INDEX: '8' + LOCATION_FOCUS_SOURCE_TYPE_INDEX: '3' + LOCATION_FOCUS_YEAR_INDEX: '4' + LOCATION_FOCUS_MONTH_INDEX: '5' + 
LOCATION_FOCUS_DAY_INDEX: '6' + LOCATION_FOCUS_LOCATION_INDEX: '7' + LOCATION_FOCUS_DATA_TYPE_INDEX: '8' + GROUP_FOCUS_YEAR_INDEX: '3' + GROUP_FOCUS_MONTH_INDEX: '4' + GROUP_FOCUS_DAY_INDEX: '5' + GROUP_FOCUS_GROUP_INDEX: '6' +input: +# The input to the group_path module must be a join between the group_assignment module and the union of any/all +# data repos that are currently in a location focus structure or a (different/L1+ dependency) group focus structure + join: + - pfs: + # name must be GROUP_ASSIGNMENT_PATH + name: GROUP_ASSIGNMENT_PATH + repo: nitrate_group_assignment + glob: /nitrate/(*/*/*) + joinOn: $1 + - union: + - pfs: + # Any/all repos in location focus name must be named LOCATION_FOCUS_PATH + name: LOCATION_FOCUS_PATH + repo: sunav2_location_group_and_restructure + glob: /sunav2/(*/*/*) + joinOn: $1 +# - pfs: +# # Any/all repos in L1+ dependency group focus name must be named GROUP_FOCUS_PATH +# name: GROUP_FOCUS_PATH +# repo: +# glob: /(*/*/*) +# joinOn: $1 +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 600M + cpu: 0.4 +resource_limits: + memory: 2G + cpu: 1.2 +sidecar_resource_requests: + memory: 3G + cpu: 1 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_srf_assignment.yaml b/pipe/nitrate/nitrate_srf_assignment.yaml new file mode 100644 index 000000000..dbea14b0d --- /dev/null +++ b/pipe/nitrate/nitrate_srf_assignment.yaml @@ -0,0 +1,58 @@ +--- +pipeline: + name: nitrate_srf_assignment +transform: + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - export ERR_PATH="/pfs/out/errored_datums$FILE_YEAR" + - Rscript + 
./flow.srf.asgn.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=$ERR_PATH + FileYear=$FILE_YEAR + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-srf-asgn:v1.1.2 + env: + LOG_LEVEL: INFO +input: + cross: + - pfs: + name: DIR_IN + repo: nitrate_srf_loader + glob: /* + - pfs: + name: FILE_YEAR + repo: sunav2_cron_daily_and_date_control + glob: /data_year*.txt +parallelism_spec: + constant: 4 +autoscaling: true +resource_requests: + memory: 200M + cpu: 0.8 +resource_limits: + memory: 800M + cpu: 1.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.2 +datum_set_spec: + number: 10 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_srf_loader.yaml b/pipe/nitrate/nitrate_srf_loader.yaml new file mode 100644 index 000000000..dd0902e7b --- /dev/null +++ b/pipe/nitrate/nitrate_srf_loader.yaml @@ -0,0 +1,51 @@ +pipeline: + name: nitrate_srf_loader +transform: + cmd: + - /bin/bash + env: + GROUP_PREFIX: nitrate_ + LOG_LEVEL: INFO + OUT_PATH: /pfs/out + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-srf-loader:v1.0.0 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + secrets: + - name: pdr-secret + mount_path: /var/db_secret + stdin: + - '#!/bin/bash' + - python3 -m srf_loader.srf_loader_main +input: + pfs: + branch: master + empty_files: true + glob: /* + repo: sunav2_cron_daily_and_date_control_tick +autoscaling: true +resource_requests: + memory: 50M + cpu: 0.1 +resource_limits: + memory: 300M + cpu: 1.5 +sidecar_resource_requests: + memory: 500M + cpu: 0.3 +scheduling_spec: + node_selector: + 
cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml b/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml new file mode 100644 index 000000000..036afa7a5 --- /dev/null +++ b/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml @@ -0,0 +1,81 @@ +--- +pipeline: + name: nitrate_thresh_select_ts_pad +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-thsh-slct-ts-pad:v2.1.1 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf /tmp/threshold_select + rm -rf /tmp/threshold_selectCopy + mkdir -p /tmp/threshold_select + # Run first module - threshold_select' + Rscript ./flow.thsh.slct.R \ + DirIn=$REPO_LOCATIONS \ + DirOut=/tmp/threshold_select \ + DirErr=/pfs/out/errored_datums \ + FileThsh=$FILE_THRESHOLDS \ + "TermCtxt1=rawNitrateSingleCompressedStream" \ + "DirSubCopy=location|data" + # Copy output to another interim folder to destroy links (cannot daisy chain links from pfs input to output) + cp -rL /tmp/threshold_select /tmp/threshold_selectCopy || : # Allow to fail without exit code (happens if step above produced no output) || : # Allow to fail without exit code (happens if step above produced no output) + rm -r -f /tmp/threshold_select + # Run second module - timeseries_padder + python3 -m timeseries_padder.timeseries_padder.variable_pad_main --yearindex 3 --monthindex 4 --dayindex 5 --locindex 8 --subdirindex 9 + EOF + env: + DATA_PATH: /tmp/threshold_selectCopy + OUT_PATH: 
/pfs/out + LOG_LEVEL: INFO + PAD_DIR: data + COPY_DIR: none # Can be multiple, separated by commas without spaces. Directories other than the pad directory and threshold directory to copy to the output (e.g. location,flags). Set to something like 'none' if none other are desired. + RELATIVE_PATH_INDEX: '3' + PARALLELIZATION_INTERNAL: '3' # For threshold select module +output_branch: master +input: + cross: + - pfs: + name: REPO_LOCATIONS + repo: nitrate_group_path + glob: /*/*/* + - pfs: + name: FILE_THRESHOLDS + repo: nitrate_threshold + glob: /thresholds.json +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 800M + cpu: 3.3 +resource_limits: + memory: 1.5G + cpu: 5 +sidecar_resource_requests: + memory: 3G + cpu: 0.7 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_threshold.yaml b/pipe/nitrate/nitrate_threshold.yaml new file mode 100644 index 000000000..4aed0f27a --- /dev/null +++ b/pipe/nitrate/nitrate_threshold.yaml @@ -0,0 +1,53 @@ +--- +pipeline: + name: nitrate_threshold +transform: + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-threshold-loader:v1.0.0 + cmd: + - /bin/bash + stdin: + - '#!/bin/bash' + - python3 -m threshold_loader.threshold_loader_main + env: + OUT_PATH: /pfs/out + LOG_LEVEL: INFO + # Separate multiple terms with a pipe (|). 
Enter "none" to retrieve all terms + TERM: rawNitrateSingleCompressedStream + CTXT: nitrate + secrets: + - name: pdr-secret + mount_path: /var/db_secret +input: + pfs: + repo: sunav2_cron_daily_and_date_control_tick + glob: /* + empty_files: true +autoscaling: true +resource_requests: + memory: 32M + cpu: 0.05 +resource_limits: + memory: 200M + cpu: 0.5 +sidecar_resource_requests: + memory: 200M + cpu: 0.5 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/pipe_list_nitrate.txt b/pipe/nitrate/pipe_list_nitrate.txt new file mode 100644 index 000000000..833973908 --- /dev/null +++ b/pipe/nitrate/pipe_list_nitrate.txt @@ -0,0 +1,16 @@ +nitrate_group_loader.yaml +nitrate_group_assignment.yaml +nitrate_srf_loader.yaml +nitrate_srf_assignment.yaml +nitrate_group_path.yaml +nitrate_threshold.yaml +nitrate_thresh_select_ts_pad.yaml +nitrate_analyze_pad_and_qaqc_plau.yaml +nitrate_pre_stats_qm.yaml +nitrate_stats_group_and_compute.yaml +nitrate_qm_group_and_compute.yaml +nitrate_level1_group_consolidate_srf.yaml +nitrate_cron_monthly_and_pub_control.yaml +nitrate_pub_group.yaml +nitrate_pub_format_and_package.yaml +nitrate_pub_egress_and_publish.yaml diff --git a/pipe/sunav2/pipe_list_sunav2.txt b/pipe/sunav2/pipe_list_sunav2.txt index c5d097b53..f520bd707 100644 --- a/pipe/sunav2/pipe_list_sunav2.txt +++ b/pipe/sunav2/pipe_list_sunav2.txt @@ -14,5 +14,4 @@ sunav2_location_asset.yaml sunav2_location_asset_assignment.yaml sunav2_location_loader.yaml sunav2_location_active_dates_assignment.yaml -sunav2_location_group_and_restructure.yaml -sunav2_fill_date_gaps_and_regularize.yaml 
+sunav2_location_group_and_restructure.yaml \ No newline at end of file From d2c004ea4d378da286537f6bc71b6d97c3a15845 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 22 Aug 2025 07:36:07 -0600 Subject: [PATCH 056/182] update column name --- .../wrap.sunav2.logfiles.fill.R | 1 + .../nitrate_analyze_pad_and_qaqc_plau.yaml | 18 ++++++------------ pipe/nitrate/nitrate_thresh_select_ts_pad.yaml | 2 +- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index 1b44bc532..884d918ce 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -128,6 +128,7 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, flagsOut$readout_time<-dataOut$readout_time flagsOut$sunaLogDataQF<-0 } + names(dataOut)[names(dataOut) == 'nitrate_concentration'] <- 'nitrate' #' Write out data file and log flags file diff --git a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml index 39b29eacd..4edff5cd8 100644 --- a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml +++ b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml @@ -1,6 +1,6 @@ --- pipeline: - name: parQuantumLine_analyze_pad_and_qaqc_plau + name: nitrate_analyze_pad_and_qaqc_plau transform: image_pull_secrets: - battelleecology-quay-read-all-pull-secret @@ -32,8 +32,7 @@ transform: DirIn=/tmp/pfs/padded_analyzerCopy \ DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ - FileSchmQf=$SCHEMA_FLAGS \ - "TermTest1=par:null|gap|range(rmv)|step(rmv)|persistence" + "TermTest1=nitrate:range|step|persistence|spike" EOF env: @@ -45,15 +44,10 @@ transform: # Environment variables for qaqc plausibility PARALLELIZATION_INTERNAL: '5' input: - cross: - - pfs: - name: DATA_PATH - repo: parQuantumLine_thresh_select_ts_pad - glob: /*/*/* - - pfs: - name: SCHEMA_FLAGS - repo: 
parQuantumLine_avro_schemas - glob: /parQuantumLine/flags_plausibility_parQuantumLine.avsc + pfs: + name: DATA_PATH + repo: nitrate_thresh_select_ts_pad + glob: /*/*/* parallelism_spec: constant: 5 autoscaling: true diff --git a/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml b/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml index 036afa7a5..69cd2a584 100644 --- a/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml +++ b/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml @@ -21,7 +21,7 @@ transform: DirOut=/tmp/threshold_select \ DirErr=/pfs/out/errored_datums \ FileThsh=$FILE_THRESHOLDS \ - "TermCtxt1=rawNitrateSingleCompressedStream" \ + "TermCtxt1=nitrate" \ "DirSubCopy=location|data" # Copy output to another interim folder to destroy links (cannot daisy chain links from pfs input to output) cp -rL /tmp/threshold_select /tmp/threshold_selectCopy || : # Allow to fail without exit code (happens if step above produced no output) || : # Allow to fail without exit code (happens if step above produced no output) From df76ba6ea59bf8246cf52d99e6a32b9b68925c02 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Fri, 22 Aug 2025 10:21:21 -0600 Subject: [PATCH 057/182] SUNA sensor-specific QAQC --- .../wrap.suna.quality.flags.R | 131 +++++++++++++++--- 1 file changed, 113 insertions(+), 18 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R index 7aff81a43..a9b66b638 100644 --- a/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R @@ -31,11 +31,18 @@ #' @examples #' # Not run # DirIn<-"~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data" +# DirIn<-"~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733" # DirOut<-"~/pfs/sunav2_quality_flagged_data/sunav2/2024/09/10/CFGLOC110733" # DirThresholds<-"~/pfs/sunav2_thresholds" # 
SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_flagged.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -#' +#' +#' ParaTest <- list(nitrate_concentration=list(term='nitrate_concentration',test=c("null","gap","range","step","spike","persistence"), +#' rmv=c(FALSE,FALSE,TRUE,TRUE,FALSE,TRUE))) +#' +#' +#' +#' #' @changelog #' Bobby Hensley (2025-08-30) created #' @@ -56,32 +63,120 @@ wrap.sunav2.quality.flags <- function(DirIn=NULL, sunaData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirIn, '/', fileName), log = log),silent = FALSE) + #' Identify each burst using dark measurements + sunaData$burst_number<-1 + for(i in 2:nrow(sunaData)){ + if(sunaData[i,which(colnames(sunaData)=='header_light_frame')]=='0'){ + sunaData[i,which(colnames(sunaData)=='burst_number')]=sunaData[i-1,which(colnames(sunaData)=='burst_number')]+1} + else{sunaData[i,which(colnames(sunaData)=='burst_number')]=sunaData[i-1,which(colnames(sunaData)=='burst_number')]} + } + + #' Identify measurement number within burst + sunaData$number_within_burst<-1 + for(i in 2:nrow(sunaData)){ + if(sunaData[i,which(colnames(sunaData)=='burst_number')]==sunaData[i-1,which(colnames(sunaData)=='burst_number')]){ + sunaData[i,which(colnames(sunaData)=='number_within_burst')]=sunaData[i-1,which(colnames(sunaData)=='number_within_burst')]+1} + else{sunaData[i,which(colnames(sunaData)=='number_within_burst')]=1} + } + #' Read in csv file of quality flag thresholds sunaThresholds<-read.csv(file=(base::paste0(DirThresholds,'/sunav2_thresholds.csv'))) ############################################################################################# #' Will be a different module that find the thresholds for the site and date range of the data #' This is just a temporary workaround - siteThresholds<-sunaThresholds[(sunaThresholds$Named.Location.Name=="CRAM"),] + sunaThresholds<-sunaThresholds[(sunaThresholds$Named.Location.Name=="CRAM"),] 
############################################################################################# + #' Loads individual quality flag thresholds + HumidityMax<-sunaThresholds$Nitrates.Maximum.Internal.humidity + MinLightDarkRatio<-sunaThresholds$Nitrates.Minimum.Light.to.Dark.Spec.Average.Ratio + #' LampTempMax<-sunaThresholds$Nitrates.Maximum.Lamp.Temp #' New test we need to add + #' LampTempMax<-35 + RangeMin<-sunaThresholds$Range.Threshold.Hard.Min + RangeMax<-sunaThresholds$Range.Threshold.Hard.Max + StepMax<-sunaThresholds$Step.Test.value + Gap.Test.value.....missing.points<-sunaThresholds$Gap.Test.value.....missing.points + Persistence..time...seconds.<-sunaThresholds$Persistence..time...seconds. + Persistence..change.<-sunaThresholds$Persistence..change. + Despiking.Method<-sunaThresholds$Despiking.Method + Despiking.window.size...points<-sunaThresholds$Despiking.window.size...points + Despiking.window.step...points.<-sunaThresholds$Despiking.window.step...points. + Despiking.maximum.consecutive.points..n.<-sunaThresholds$Despiking.maximum.consecutive.points..n. 
+ Despiking.maximum.....missing.points.per.window<-sunaThresholds$Despiking.maximum.....missing.points.per.window
+ Despiking.MAD<-sunaThresholds$Despiking.MAD
+
+ #' Converts measurements to be tested from class character to numeric
+ sunaData$nitrate_concentration<-as.numeric(sunaData$nitrate_concentration)
+ sunaData$relative_humidity<-as.numeric(sunaData$relative_humidity)
+ sunaData$lamp_temperature<-as.numeric(sunaData$lamp_temperature)
+ sunaData$spectrum_average<-as.numeric(sunaData$spectrum_average)
+ sunaData$dark_value_used_for_fit<-as.numeric(sunaData$dark_value_used_for_fit)
+
+ #' Performs range test
+ sunaData$rangeQF<-NA
+ for(i in 1:nrow(sunaData)){
+ if(is.na(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')])){
+ sunaData[i,which(colnames(sunaData)=='rangeQF')]=-1}
+ if(!is.na(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')])){
+ if(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')]<RangeMin|sunaData[i,which(colnames(sunaData)=='nitrate_concentration')]>RangeMax){
+ sunaData[i,which(colnames(sunaData)=='rangeQF')]=1}
+ else{sunaData[i,which(colnames(sunaData)=='rangeQF')]=0}}
+ }
+
+ #' Performs step test (only applied if sequential measurements are in the same burst)
+ sunaData$stepQF<-NA
+ for(i in 2:nrow(sunaData)){
+ if(is.na(sunaData[i-1,which(colnames(sunaData)=='nitrate_concentration')])|is.na(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')])){
+ sunaData[i,which(colnames(sunaData)=='stepQF')]=-1}
+ if(!is.na(sunaData[i-1,which(colnames(sunaData)=='nitrate_concentration')])&!is.na(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')])){
+ if((abs(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')]-sunaData[i-1,which(colnames(sunaData)=='nitrate_concentration')])>StepMax)&
+ (sunaData[i,which(colnames(sunaData)=='burst_number')]==sunaData[i-1,which(colnames(sunaData)=='burst_number')])){
+ (sunaData[i,which(colnames(sunaData)=='stepQF')]=1)&(sunaData[i-1,which(colnames(sunaData)=='stepQF')]=1)}
+ 
else{sunaData[i,which(colnames(sunaData)=='stepQF')]=0}} + } + #' Performs internal humidity test + sunaData$humidityQF<-NA + for(i in 1:nrow(sunaData)){ + if(is.na(sunaData[i,which(colnames(sunaData)=='relative_humidity')])){ + sunaData[i,which(colnames(sunaData)=='humidityQF')]=-1} + if(!is.na(sunaData[i,which(colnames(sunaData)=='relative_humidity')])){ + if(sunaData[i,which(colnames(sunaData)=='relative_humidity')]>HumidityMax){ + sunaData[i,which(colnames(sunaData)=='humidityQF')]=1} + else{sunaData[i,which(colnames(sunaData)=='humidityQF')]=0}} + } - - - - - - - - - - - - - - - - + #' Performs lamp temperature test + sunaData$lampTempQF<-NA + for(i in 1:nrow(sunaData)){ + if(is.na(sunaData[i,which(colnames(sunaData)=='lamp_temperature')])){ + sunaData[i,which(colnames(sunaData)=='lampTempQF')]=-1} + if(!is.na(sunaData[i,which(colnames(sunaData)=='lamp_temperature')])){ + if(sunaData[i,which(colnames(sunaData)=='lamp_temperature')]>LampTempMax){ + sunaData[i,which(colnames(sunaData)=='lampTempQF')]=1} + else{sunaData[i,which(colnames(sunaData)=='lampTempQF')]=0}} + } + + #' Performs light to dark spectral ratio test + sunaData$spectralRatioQF<-NA + for(i in 1:nrow(sunaData)){ + if(is.na(sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')])|is.na(sunaData[i,which(colnames(sunaData)=='spectrum_average')])){ + sunaData[i,which(colnames(sunaData)=='spectralRatioQF')]=-1} + if(!is.na(sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')])&!is.na(sunaData[i,which(colnames(sunaData)=='spectrum_average')])){ + if(sunaData[i,which(colnames(sunaData)=='spectrum_average')]/sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')] Date: Fri, 22 Aug 2025 10:31:16 -0600 Subject: [PATCH 058/182] latest --- .../wrap.sunav2.logfiles.fill.R | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index 
884d918ce..c924b7b12 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -43,8 +43,8 @@ #' #' @examples #' # Not run -# DirInLogs<-"~/pfs/sunav2_logs_output/sunav2/2024/09/10/20349" #cleaned log data -# DirInStream<-"~/pfs/sunav2_data_parser_trino/sunav2/2024/09/10/20349" #streamed L0 data +# DirInLogs<-"~/pfs/sunav2_logjam_assign_clean_files/sunav2/2024/09/11/20349" #cleaned log data +# DirInStream<-"~/pfs/sunav2_trino_data_parser/sunav2/2024/09/11/20349" #streamed L0 data # DirIn<-NULL # DirOutBase="~/pfs/out" # SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_parsed.avsc'),collapse='') @@ -115,20 +115,23 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, #' Determine whether to use logged or streamed data. #' Logged data is used if available, and log data flag set to 1 - if(!is.null(logData)){ - dataOut<-logData + if(length(logFile)>=1){ + dataOut<-as.data.frame(logData) flagsOut<-data.frame(matrix(ncol=2,nrow=nrow(dataOut), dimnames=list(NULL, c("readout_time", "sunaLogDataQF")))) flagsOut$readout_time<-dataOut$readout_time flagsOut$sunaLogDataQF<-1 } #' Streamed data is used if no logged data is available, and log data flags set to 0 - if(is.null(logData) & !is.null(L0Data)){ - dataOut<-L0Data + if(length(logFile)<1 & length(L0Data)>=1){ + dataOut<-as.data.frame(L0Data) flagsOut<-data.frame(matrix(ncol=2,nrow=nrow(dataOut), dimnames=list(NULL, c("readout_time", "sunaLogDataQF")))) flagsOut$readout_time<-dataOut$readout_time flagsOut$sunaLogDataQF<-0 } - names(dataOut)[names(dataOut) == 'nitrate_concentration'] <- 'nitrate' + dataOut$spectrum_channels<-NULL #remove list + dataOutFrame<-as.data.frame(dataOut) + names(dataOutFrame)[names(dataOutFrame) == 'nitrate_concentration'] <- 'nitrate' + #' Write out data file and log flags file @@ -137,9 +140,9 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, asset<-tail(x=fileOutSplt,n=1) 
csv_name <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d")) - rptOut <- try(NEONprocIS.base::def.wrte.parq(data = dataOut, + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = dataOutFrame, NameFile = base::paste0(DirOutData,'/',csv_name,".parquet"), - Schm = SchmDataOut),silent=TRUE) + Schm = SchmFlagsOut),silent=TRUE) if(class(rptOut)[1] == 'try-error'){ log$error(base::paste0('Cannot write Data to ',base::paste0(DirOutData,'/',csv_name,".parquet"),'. ',attr(rptOut, "condition"))) stop() From bae8aad9f2215f757906d2a12f82557d843ce72f Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 22 Aug 2025 12:59:15 -0600 Subject: [PATCH 059/182] latest --- .../flow.sunav2.logfiles.fill.R | 6 +- .../wrap.sunav2.logfiles.fill.R | 4 +- .../sunav2_calibration_group_and_convert.yaml | 2 +- pipe/sunav2/sunav2_fill_log_files.yaml | 65 ++++++++++--------- 4 files changed, 41 insertions(+), 36 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R index 76a0cbde8..d7243e9da 100644 --- a/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R @@ -42,9 +42,9 @@ #' @examples #' Stepping through the code in Rstudio # Sys.setenv(DirIn='/home/NEON/ncatolico/pfs/sunav2_logjam_assign_clean_files/sunav2/2024/09/10/20349') #cleaned log data -# Sys.setenv(DirIn='/home/NEON/ncatolico/pfs/sunav2_data_source_trino/sunav2/2024/09/10/20349') #streamed L0 data +# Sys.setenv(DirIn='/home/NEON/ncatolico/pfs/sunav2_trino_data_parser/sunav2/2024/09/11/20349') #streamed L0 data # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- c("DirIn=$DirIn","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums","FileSchmData=~/pfs/sunav2_avro_schemas/sunav2.avsc") +# arg <- c("DirIn=$DirIn","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums","FileSchmData=~/pfs/sunav2_avro_schemas/sunav2/sunav2_calibrated.avsc") #' 
rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently @@ -148,5 +148,3 @@ foreach::foreach(idxDirIn = DirIn) %dopar% { } - - diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index c924b7b12..07d6e9b6f 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -47,7 +47,7 @@ # DirInStream<-"~/pfs/sunav2_trino_data_parser/sunav2/2024/09/11/20349" #streamed L0 data # DirIn<-NULL # DirOutBase="~/pfs/out" -# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_parsed.avsc'),collapse='') +# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2/sunav2_calibrated.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_log_flags.avsc'),collapse='') #' @@ -142,7 +142,7 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, rptOut <- try(NEONprocIS.base::def.wrte.parq(data = dataOutFrame, NameFile = base::paste0(DirOutData,'/',csv_name,".parquet"), - Schm = SchmFlagsOut),silent=TRUE) + Schm = SchmDataOut),silent=TRUE) if(class(rptOut)[1] == 'try-error'){ log$error(base::paste0('Cannot write Data to ',base::paste0(DirOutData,'/',csv_name,".parquet"),'. 
',attr(rptOut, "condition"))) stop() diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml index 19bb228bb..ba750f544 100644 --- a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -26,7 +26,7 @@ transform: DirIn=/tmp/pfs/filter_joined \ DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ - TermQf=nitrate_concentration \ + TermQf=nitrate \ FileSchmQf=$FILE_SCHEMA_FLAGS \ DirSubCopy=flags EOF diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index 0148b79b9..0d05494b6 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-a1ff444 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-1a1ca7a cmd: - sh - "-c" @@ -69,6 +69,7 @@ transform: Rscript ./flow.sunav2.logfiles.fill.R \ DirIn=$OUT_PATH_JOIN_SOURCES \ DirOut=/pfs/out \ + FileSchmData=$FILE_SCHEMA_DATA \ DirErr=/pfs/out/errored_datums EOF env: @@ -134,35 +135,41 @@ transform: RELATIVE_PATH_INDEX: "3" # Must be consistent across inputs LINK_TYPE: COPY # options are COPY or SYMLINK. MUST BE SIMLINK IF USING COMBINED MODULE. input: - join: + cross: - pfs: - name: DATA_PATH_TRINO - repo: sunav2_trino_data_parser - glob: /(sunav2/*/*/*) #sunav2/Y/M/D - joinOn: $1 - empty_files: false # Make sure this is false for LINK_TYPE=COPY - outer_join: true - # - pfs: - # name: DATA_PATH_KAFKA - # repo: sunav2_data_source_kafka - # glob: /(sunav2/*/*/*) - # joinOn: $1 - # empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
- # outer_join: true - - pfs: - name: DATA_PATH_LOG - repo: sunav2_logjam_assign_clean_files - glob: /(sunav2/*/*/*) #sunav2/Y/M/D - joinOn: $1 - empty_files: false # Make sure this is false for LINK_TYPE=COPY - outer_join: true - - pfs: - name: DATE_CONTROL - repo: sunav2_cron_daily_and_date_control - glob: /(sunav2/*/*/*) #sunav2/Y/M/D - joinOn: $1 - empty_files: false # Make sure this is false for LINK_TYPE=COPY - outer_join: true + name: FILE_SCHEMA_DATA + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_calibrated.avsc + + - join: + - pfs: + name: DATA_PATH_TRINO + repo: sunav2_trino_data_parser + glob: /(sunav2/*/*/*) #sunav2/Y/M/D + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + outer_join: true + # - pfs: + # name: DATA_PATH_KAFKA + # repo: sunav2_data_source_kafka + # glob: /(sunav2/*/*/*) + # joinOn: $1 + # empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + # outer_join: true + - pfs: + name: DATA_PATH_LOG + repo: sunav2_logjam_assign_clean_files + glob: /(sunav2/*/*/*) #sunav2/Y/M/D + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + outer_join: true + - pfs: + name: DATE_CONTROL + repo: sunav2_cron_daily_and_date_control + glob: /(sunav2/*/*/*) #sunav2/Y/M/D + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + outer_join: true parallelism_spec: constant: 5 autoscaling: true From a91bb0921ca0ffb2ee1bdb7111ca12d007aa2866 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 22 Aug 2025 14:42:33 -0600 Subject: [PATCH 060/182] latest --- pipe/sunav2/sunav2_fill_log_files.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index 0d05494b6..10f6df4bb 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - 
image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-1a1ca7a + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-bae8aad cmd: - sh - "-c" @@ -140,7 +140,6 @@ input: name: FILE_SCHEMA_DATA repo: sunav2_avro_schemas glob: /sunav2/sunav2_calibrated.avsc - - join: - pfs: name: DATA_PATH_TRINO From b3411bc09358d9b98f8beca465ee078ef27a0022 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 22 Aug 2025 14:57:32 -0600 Subject: [PATCH 061/182] latest --- .../flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index 07d6e9b6f..d0bea0db8 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -128,9 +128,9 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, flagsOut$readout_time<-dataOut$readout_time flagsOut$sunaLogDataQF<-0 } - dataOut$spectrum_channels<-NULL #remove list - dataOutFrame<-as.data.frame(dataOut) - names(dataOutFrame)[names(dataOutFrame) == 'nitrate_concentration'] <- 'nitrate' + #dataOut$spectrum_channels<-NULL #remove list + #dataOutFrame<-as.data.frame(dataOut) + #names(dataOutFrame)[names(dataOutFrame) == 'nitrate_concentration'] <- 'nitrate' #' Write out data file and log flags file @@ -140,7 +140,7 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, asset<-tail(x=fileOutSplt,n=1) csv_name <-paste0('sunav2_',asset,'_',format(timeBgn,format = "%Y-%m-%d")) - rptOut <- try(NEONprocIS.base::def.wrte.parq(data = dataOutFrame, + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = dataOut, NameFile = base::paste0(DirOutData,'/',csv_name,".parquet"), Schm = SchmDataOut),silent=TRUE) if(class(rptOut)[1] == 'try-error'){ From 9220a18f22942a56ae6c5fb76da8fcd0457a6a70 Mon Sep 17 
00:00:00 2001 From: ncatolico Date: Fri, 22 Aug 2025 14:57:32 -0600 Subject: [PATCH 062/182] latest --- pipe/nitrate/nitrate_threshold.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/nitrate/nitrate_threshold.yaml b/pipe/nitrate/nitrate_threshold.yaml index 4aed0f27a..18faa901e 100644 --- a/pipe/nitrate/nitrate_threshold.yaml +++ b/pipe/nitrate/nitrate_threshold.yaml @@ -14,7 +14,7 @@ transform: OUT_PATH: /pfs/out LOG_LEVEL: INFO # Separate multiple terms with a pipe (|). Enter "none" to retrieve all terms - TERM: rawNitrateSingleCompressedStream + TERM: nitrate CTXT: nitrate secrets: - name: pdr-secret From 46ec1c4fa4397aa40429f1dfee7b3a3841b3e8bb Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Mon, 25 Aug 2025 14:54:48 -0600 Subject: [PATCH 063/182] Updates to SUNA-specific quality flag module. --- .../wrap.suna.quality.flags.R | 196 +++++------------- 1 file changed, 57 insertions(+), 139 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R index a9b66b638..d93a57bb2 100644 --- a/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R @@ -1,5 +1,5 @@ ############################################################################################## -#' @title Wrapper for SUNA quality flagging +#' @title Wrapper for SUNA sensor-specific quality flagging #' @author #' Bobby Hensley \email{hensley@battelleecology.org} @@ -21,7 +21,7 @@ #' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init #' for more details. #' -#' @return SUNA data with quality flags applied in daily parquets. +#' @return SUNA data with sensor-specific quality flags applied in daily parquets. 
#' #' @references #' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 @@ -30,15 +30,12 @@ #' #' @examples #' # Not run -# DirIn<-"~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data" -# DirIn<-"~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733" -# DirOut<-"~/pfs/sunav2_quality_flagged_data/sunav2/2024/09/10/CFGLOC110733" -# DirThresholds<-"~/pfs/sunav2_thresholds" -# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_flagged.avsc'),collapse='') +# DirInData<-"~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data" +# DirInThresholds<-"~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold" +# DirOutFlags<-"~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags/" +# SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") #' -#' ParaTest <- list(nitrate_concentration=list(term='nitrate_concentration',test=c("null","gap","range","step","spike","persistence"), -#' rmv=c(FALSE,FALSE,TRUE,TRUE,FALSE,TRUE))) #' #' #' @@ -47,9 +44,10 @@ #' Bobby Hensley (2025-08-30) created #' ############################################################################################## -wrap.sunav2.quality.flags <- function(DirIn=NULL, - DirOut=NULL, - SchmDataOut=NULL, +wrap.sunav2.quality.flags <- function(DirInData=NULL, + DirInThresholds=NULL, + DirOutFlags=NULL, + SchmFlagsOut=NULL, log=NULL ){ @@ -58,161 +56,81 @@ wrap.sunav2.quality.flags <- function(DirIn=NULL, log <- NEONprocIS.base::def.log.init() } - #' Read in parquet file of input data - fileName<-base::list.files(DirIn,full.names=FALSE) - sunaData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirIn, '/', fileName), + #' Read in parquet file of SUNA data + 
dataFileName<-base::list.files(DirInData,full.names=FALSE) + sunaData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInData, '/', dataFileName), log = log),silent = FALSE) - #' Identify each burst using dark measurements - sunaData$burst_number<-1 - for(i in 2:nrow(sunaData)){ - if(sunaData[i,which(colnames(sunaData)=='header_light_frame')]=='0'){ - sunaData[i,which(colnames(sunaData)=='burst_number')]=sunaData[i-1,which(colnames(sunaData)=='burst_number')]+1} - else{sunaData[i,which(colnames(sunaData)=='burst_number')]=sunaData[i-1,which(colnames(sunaData)=='burst_number')]} - } - - #' Identify measurement number within burst - sunaData$number_within_burst<-1 - for(i in 2:nrow(sunaData)){ - if(sunaData[i,which(colnames(sunaData)=='burst_number')]==sunaData[i-1,which(colnames(sunaData)=='burst_number')]){ - sunaData[i,which(colnames(sunaData)=='number_within_burst')]=sunaData[i-1,which(colnames(sunaData)=='number_within_burst')]+1} - else{sunaData[i,which(colnames(sunaData)=='number_within_burst')]=1} - } - - #' Read in csv file of quality flag thresholds - sunaThresholds<-read.csv(file=(base::paste0(DirThresholds,'/sunav2_thresholds.csv'))) - ############################################################################################# - #' Will be a different module that find the thresholds for the site and date range of the data - #' This is just a temporary workaround - sunaThresholds<-sunaThresholds[(sunaThresholds$Named.Location.Name=="CRAM"),] - ############################################################################################# - - #' Loads individual quality flag thresholds - HumidityMax<-sunaThresholds$Nitrates.Maximum.Internal.humidity - MinLightDarkRatio<-sunaThresholds$Nitrates.Minimum.Light.to.Dark.Spec.Average.Ratio - #' LampTempMax<-sunaThresholds$Nitrates.Maximum.Lamp.Temp #' New test we need to add - #' LampTempMax<-35 - RangeMin<-sunaThresholds$Range.Threshold.Hard.Min - 
RangeMax<-sunaThresholds$Range.Threshold.Hard.Max - StepMax<-sunaThresholds$Step.Test.value - Gap.Test.value.....missing.points<-sunaThresholds$Gap.Test.value.....missing.points - Persistence..time...seconds.<-sunaThresholds$Persistence..time...seconds. - Persistence..change.<-sunaThresholds$Persistence..change. - Despiking.Method<-sunaThresholds$Despiking.Method - Despiking.window.size...points<-sunaThresholds$Despiking.window.size...points - Despiking.window.step...points.<-sunaThresholds$Despiking.window.step...points. - Despiking.maximum.consecutive.points..n.<-sunaThresholds$Despiking.maximum.consecutive.points..n. - Despiking.maximum.....missing.points.per.window<-sunaThresholds$Despiking.maximum.....missing.points.per.window - Despiking.MAD<-sunaThresholds$Despiking.MAD - - #' Converts measurements to be tested from class character to numeric - sunaData$nitrate_concentration<-as.numeric(sunaData$nitrate_concentration) + #' Convert measurements to be tested from class character to numeric sunaData$relative_humidity<-as.numeric(sunaData$relative_humidity) sunaData$lamp_temperature<-as.numeric(sunaData$lamp_temperature) sunaData$spectrum_average<-as.numeric(sunaData$spectrum_average) sunaData$dark_value_used_for_fit<-as.numeric(sunaData$dark_value_used_for_fit) - #' Performs range test - sunaData$rangeQF<-NA - for(i in 1:nrow(sunaData)){ - if(is.na(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')])){ - sunaData[i,which(colnames(sunaData)=='rangeQF')]=-1} - if(!is.na(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')])){ - if(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')]RangeMax){ - sunaData[i,which(colnames(sunaData)=='rangeQF')]=1} - else{sunaData[i,which(colnames(sunaData)=='rangeQF')]=0}} - } - - #' Performs step test (only applied if sequential measurements are in the same burst) - sunaData$stepQF<-NA - for(i in 2:nrow(sunaData)){ - 
if(is.na(sunaData[i-1,which(colnames(sunaData)=='nitrate_concentration')])|is.na(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')])){ - sunaData[i,which(colnames(sunaData)=='stepQF')]=-1} - if(!is.na(sunaData[i-1,which(colnames(sunaData)=='nitrate_concentration')])&!is.na(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')])){ - if((abs(sunaData[i,which(colnames(sunaData)=='nitrate_concentration')]-sunaData[i-1,which(colnames(sunaData)=='nitrate_concentration')])>StepMax)& - (sunaData[i,which(colnames(sunaData)=='burst_number')]==sunaData[i-1,which(colnames(sunaData)=='burst_number')])){ - (sunaData[i,which(colnames(sunaData)=='stepQF')]=1)&(sunaData[i-1,which(colnames(sunaData)=='stepQF')]=1)} - else{sunaData[i,which(colnames(sunaData)=='stepQF')]=0}} - } + #' Create data frame of input file readout_times to serve as basis of output flag file + flagFile<-as.data.frame(sunaData$readout_time) + colnames(flagFile)<-c("readout_time") - #' Performs internal humidity test - sunaData$humidityQF<-NA + #' Read in json file of quality flag thresholds + thresholdFileName<-base::list.files(DirInThresholds,full.names=FALSE) + sunaThresholds<-base::try(NEONprocIS.qaqc::def.read.thsh.qaqc.df(NameFile = base::paste0(DirInThresholds, '/', thresholdFileName)),silent = FALSE) + + #' Perform internal humidity test + humidityThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Maximum Internal humidity"),] + maxHumidity<-humidityThreshold$number_value + flagFile$nitrateHumidityQF<-NA for(i in 1:nrow(sunaData)){ if(is.na(sunaData[i,which(colnames(sunaData)=='relative_humidity')])){ - sunaData[i,which(colnames(sunaData)=='humidityQF')]=-1} + flagFile[i,which(colnames(flagFile)=='nitrateHumidityQF')]=-1} if(!is.na(sunaData[i,which(colnames(sunaData)=='relative_humidity')])){ - if(sunaData[i,which(colnames(sunaData)=='relative_humidity')]>HumidityMax){ - sunaData[i,which(colnames(sunaData)=='humidityQF')]=1} - 
else{sunaData[i,which(colnames(sunaData)=='humidityQF')]=0}} + if(sunaData[i,which(colnames(sunaData)=='relative_humidity')]>maxHumidity){ + flagFile[i,which(colnames(flagFile)=='nitrateHumidityQF')]=1} + else{flagFile[i,which(colnames(flagFile)=='nitrateHumidityQF')]=0}} } - #' Performs lamp temperature test - sunaData$lampTempQF<-NA + #' Perform lamp temperature test (New condition need to be created. Using default for now) + # lampTempThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Maximum Lamp Temperature"),] + # maxLampTemp<-lampTempThreshold$number_value + maxLampTemp=35 + flagFile$nitrateLampTempQF<-NA for(i in 1:nrow(sunaData)){ if(is.na(sunaData[i,which(colnames(sunaData)=='lamp_temperature')])){ - sunaData[i,which(colnames(sunaData)=='lampTempQF')]=-1} + flagFile[i,which(colnames(flagFile)=='nitrateLampTempQF')]=-1} if(!is.na(sunaData[i,which(colnames(sunaData)=='lamp_temperature')])){ - if(sunaData[i,which(colnames(sunaData)=='lamp_temperature')]>LampTempMax){ - sunaData[i,which(colnames(sunaData)=='lampTempQF')]=1} - else{sunaData[i,which(colnames(sunaData)=='lampTempQF')]=0}} - } - - #' Performs light to dark spectral ratio test - sunaData$spectralRatioQF<-NA + if(sunaData[i,which(colnames(sunaData)=='lamp_temperature')]>maxLampTemp){ + flagFile[i,which(colnames(flagFile)=='nitrateLampTempQF')]=1} + else{flagFile[i,which(colnames(flagFile)=='nitrateLampTempQF')]=0}} + } + + #' Perform light to dark spectral ratio test + spectralRatioThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Minimum Light to Dark Spec Average Ratio"),] + minLightDarkRatio<-spectralRatioThreshold$number_value + flagFile$nitrateLightDarkRatioQF<-NA for(i in 1:nrow(sunaData)){ if(is.na(sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')])|is.na(sunaData[i,which(colnames(sunaData)=='spectrum_average')])){ - sunaData[i,which(colnames(sunaData)=='spectralRatioQF')]=-1} + 
flagFile[i,which(colnames(flagFile)=='nitrateLightDarkRatioQF')]=-1} if(!is.na(sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')])&!is.na(sunaData[i,which(colnames(sunaData)=='spectrum_average')])){ - if(sunaData[i,which(colnames(sunaData)=='spectrum_average')]/sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')] Date: Mon, 25 Aug 2025 14:59:37 -0600 Subject: [PATCH 064/182] Fixed param names in header. --- flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R index d93a57bb2..c168c7def 100644 --- a/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R @@ -6,11 +6,11 @@ #' #' @description Wrapper function. Uses thresholds to apply quality flags to SUNA data. #' -#' @param DirIn Character value. The file path to the input data. +#' @param DirInData Character value. The file path to the input data. +#' +#' @param DirThresholds Character value. The file path for the quality flag thresholds. #' -#' @param DirOut Character value. The file path for the output data. -#' -#' @param DirThresholds Character value. The file path for the quality flag thresholds. +#' @param DirOutFlags Character value. The file path for the output data. #' #' @param SchmDataOut (optional), A json-formatted character string containing the schema for the output data #' file. 
If this input is not provided, the output schema for the data will be the same as the input data From e375937f6a6c4961a04562aa7deb2bff1bb78a0c Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 26 Aug 2025 08:26:49 -0600 Subject: [PATCH 065/182] latest --- .../nitrate_analyze_pad_and_qaqc_plau.yaml | 6 +- .../nitrate_cron_monthly_and_pub_control.yaml | 2 +- .../nitrate/nitrate_qm_group_and_compute.yaml | 102 ++++++++++++++++++ .../nitrate/nitrate_thresh_select_ts_pad.yaml | 8 +- .../sunav2_cron_daily_and_date_control.yaml | 4 +- 5 files changed, 112 insertions(+), 10 deletions(-) create mode 100644 pipe/nitrate/nitrate_qm_group_and_compute.yaml diff --git a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml index 4edff5cd8..00e0da170 100644 --- a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml +++ b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml @@ -42,7 +42,7 @@ transform: RELATIVE_PATH_INDEX: '3' ERR_PATH: /pfs/out/errored_datums # Environment variables for qaqc plausibility - PARALLELIZATION_INTERNAL: '5' + PARALLELIZATION_INTERNAL: '1' input: pfs: name: DATA_PATH @@ -52,8 +52,8 @@ parallelism_spec: constant: 5 autoscaling: true resource_requests: - memory: 2G - cpu: 5.5 + memory: 700M + cpu: 2 resource_limits: memory: 4G cpu: 7 diff --git a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml index 56db88a3f..f1cc96635 100644 --- a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml +++ b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml @@ -9,7 +9,7 @@ transform: # START_DATE must be set, format "YYYY-MM" # END_DATE can be set or unset (comment or remove line to unset). If unset, end month will be last month. OUT_PATH: /pfs/out - START_MONTH: "2024-08" + START_MONTH: "2024-03" # END_MONTH: "2024-09" # Inclusive. 
Run the pipeline with END_MONTH set to initialize, then comment out and update pipeline (no reprocess) to let the cron take over stdin: - "#!/bin/bash" diff --git a/pipe/nitrate/nitrate_qm_group_and_compute.yaml b/pipe/nitrate/nitrate_qm_group_and_compute.yaml new file mode 100644 index 000000000..2cb7ea5b2 --- /dev/null +++ b/pipe/nitrate/nitrate_qm_group_and_compute.yaml @@ -0,0 +1,102 @@ +--- +pipeline: + name: nitrate_qm_group_and_compute +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-qaqc-qm-grp:v2.1.1 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -r -f /tmp/pfs/filter_joined + mkdir -p /tmp/pfs/filter_joined + + # ---- Run first module - filter-joiner (using environment variables below as input parameters) ---- + python3 -m filter_joiner.filter_joiner_main + + # ---- Run second module - quality metrics (averaged) ---- + Rscript ./flow.qaqc.qm.R \ + DirIn=/tmp/pfs/filter_joined \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + "WndwAgr=015" \ + "WghtAlphBeta=2|1" \ + Thsh=0.2 \ + "GrpQfAlph1=nitrate:nitrateRangeQF|nitrateStepQF|nitrateSpikeQF|nitratePersistenceQF" \ + "GrpQfBeta1=nitrate:nitrateRangeQF|nitrateStepQF|nitratePersistenceQF" + EOF + env: + # Environment variables for filter-joiner + CONFIG: | + --- + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. 
+ # Use unix-style glob pattern to select the desired directories in each repo + input_paths: + - path: + name: QAQC_PLAUSIBILITY_PATH + # Filter for flags directory + glob_pattern: /pfs/QAQC_PLAUSIBILITY_PATH/*/*/*/*/*/*/flags/** + # Join on named location (already joined below by day) + join_indices: [6] + - path: + name: FLAGS_PATH + # Filter for flags directory + glob_pattern: /pfs/REGULARIZED_FLAGS_PATH/*/*/*/*/*/*/flags/** + # Join on named location (already joined below by day) + join_indices: [6] + OUT_PATH: /tmp/pfs/filter_joined + LOG_LEVEL: DEBUG + RELATIVE_PATH_INDEX: "3" + LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined module. + PARALLELIZATION_INTERNAL: '1' # Option for quality metrics module +input: + join: + - pfs: + name: QAQC_PLAUSIBILITY_PATH + repo: nitrate_analyze_pad_and_qaqc_plau + glob: /(*/*/*) + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + - pfs: + name: FLAGS_PATH + repo: nitrate_group_path + glob: /(*/*/*) + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 500M + cpu: 1.2 +resource_limits: + memory: 1G + cpu: 2 +sidecar_resource_requests: + memory: 2G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/gke-ephemeral-storage-local-ssd: "true" + nodepool.neonscience.org/pipeline: "yes" + cloud.google.com/gke-spot: "true" +pod_spec: |- + { "tolerations": [ + { + "key": "nodepool.neonscience.org/pipeline", + "operator": "Exists" + }, + { + "effect": "NoSchedule", + "key": "cloud.google.com/gke-spot", + "operator": "Exists" + } + ] } diff --git a/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml b/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml index 69cd2a584..c95ff3e9b 100644 --- a/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml +++ b/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml @@ -36,7 +36,7 @@ transform: PAD_DIR: data COPY_DIR: none # 
Can be multiple, separated by commas without spaces. Directories other than the pad directory and threshold directory to copy to the output (e.g. location,flags). Set to something like 'none' if none other are desired. RELATIVE_PATH_INDEX: '3' - PARALLELIZATION_INTERNAL: '3' # For threshold select module + PARALLELIZATION_INTERNAL: '1' # For threshold select module output_branch: master input: cross: @@ -49,11 +49,11 @@ input: repo: nitrate_threshold glob: /thresholds.json parallelism_spec: - constant: 5 + constant: 1 autoscaling: true resource_requests: - memory: 800M - cpu: 3.3 + memory: 300M + cpu: 1.1 resource_limits: memory: 1.5G cpu: 5 diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index 678a1faa9..7ebf6cc71 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -11,8 +11,8 @@ transform: # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. OUT_PATH: /pfs/out - START_DATE: "2024-09-05" # Inclusive - END_DATE: "2024-09-15" # Inclusive + START_DATE: "2024-03-14" # Inclusive + END_DATE: "2024-03-24" # Inclusive SOURCE_TYPE: "sunav2" stdin: - "#!/bin/bash" From 9951b0135566ef5274d517784e31dac1d61851e1 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Tue, 26 Aug 2025 11:34:26 -0600 Subject: [PATCH 066/182] Work on .flow and docker files for suna quality flags. 
--- .../DockerfileDockerfile | 20 +++ .../flow.sunav2.quality.flags.R | 158 ++++++++++++++++++ ...ty.flags.R => wrap.sunav2.quality.flags.R} | 0 3 files changed, 178 insertions(+) create mode 100644 flow/flow.sunav2.quality.flags/DockerfileDockerfile create mode 100644 flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R rename flow/flow.sunav2.quality.flags/{wrap.suna.quality.flags.R => wrap.sunav2.quality.flags.R} (100%) diff --git a/flow/flow.sunav2.quality.flags/DockerfileDockerfile b/flow/flow.sunav2.quality.flags/DockerfileDockerfile new file mode 100644 index 000000000..f02fce2d3 --- /dev/null +++ b/flow/flow.sunav2.quality.flags/DockerfileDockerfile @@ -0,0 +1,20 @@ +# Dockerfile for NEON IS Data Processing - sunav2 sensor-specific quality flags + +# Start with the neon-is-base-r image. +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 + +ARG FLOW_DIR="./flow" +ARG APP_DIR="flow.sunav2.quality.flags" + +# maintainer handle +MAINTAINER "Bobby Hensley" hensley@battelleecology.org + +# Copy the lockfile and restore known working versions of R dependency packages +# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image +COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock +RUN R -e 'renv::restore(lockfile="/renv.lock")' + +# Copy in sunav2 flag workflow +COPY ${FLOW_DIR}/${APP_DIR}/flow.sunav2.quality.flags.R . +COPY ${FLOW_DIR}/${APP_DIR}/wrap.sunav2.quality.flags.R . 
+ diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R new file mode 100644 index 000000000..03bfb7b7b --- /dev/null +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -0,0 +1,158 @@ +############################################################################################## +#' @title Workflow for SUNA Sensor-specific Quality Flags + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} + +#' @description Workflow. Calculates quality flags for SUNA internal humidity, lamp temperature +#' and light to dark spectral ratio, and saves into daily parquets. +#' +#' The arguments are: +#' +#' 1. "DirInData=value", The input path to the data, structured as follows: +#' #/pfs/BASE_REPO/date/source-id/data. +#' +#' 2. "DirInThresholds=value", The input path to the test thresholds, structured as follows: +#' #/pfs/BASE_REPO/date/source-id/thresholds. +#' +#' 2. "DirOutFlags=value", where the value is the output path. +#' +#' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of \code{DirInData}. +#' +#' 4. "SchmDataOut=value" (optional), where values is the full path to the avro schema for the output data +#' file. +#' +#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. +#' +#' @return Sensor-specific quality flag files in daily parquets. 
+ +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' flow.sunav2.quality.flags <- function(DirInData="~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", +#' DirInThresholds="~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", +#' DirOutFlags="~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", +#' SchmDataOut=base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') +#' log=log) +#' Stepping through the code in R studio +Sys.setenv(DIR_IN='/home/NEON/hensley/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data') +log <- NEONprocIS.base::def.log.init(Lvl = "debug") +arg <- c("DirInData=~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", + "DirInThresholds=~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", + "DirOutFlags=~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", + "DirErr=~/pfs/out/errored_datums") +#' rm(list=setdiff(ls(),c('arg','log'))) + +#' @seealso None currently + +# changelog and author contributions / copyrights +#' Bobby Hensley (2025-08-26) Original creation +# +############################################################################################## +options(digits.secs = 3) +library(foreach) +library(doParallel) +library(lubridate) + +# Source the wrapper function. 
Assume it is in the working directory +source("./wrap.sunav2.quality.flags.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirInData", "DirInThresholds","DirOutFlags","DirErr"), + NameParaOptn = c("SchmDataOut"),log = log) + +# Echo arguments +log$debug(base::paste0('Input data directory: ', Para$DirInData)) +log$debug(base::paste0('Input thresholds directory: ', Para$DirInThresholds)) +log$debug(base::paste0('Output directory: ', Para$DirOutFlags)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0('Schema for output data: ', Para$SchmDataOut)) + + +# Read in the schemas so we only have to do it once and not every time in the avro writer. +if(base::is.null(Para$SchmDataOut) || Para$SchmDataOut == 'NA'){ + SchmDataOut <- NULL +} else { + SchmDataOut <- base::paste0(base::readLines(Para$SchmDataOut),collapse='') +} + +# Find all the input paths (datums). We will process each one. +DirInData <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirInData, + nameDirSub = NULL, + log = log) +DirInThresholds <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirInThresholds, + nameDirSub = NULL, + log = log) +# Take stock of our data files. 
+fileData <- base::list.files(DirInData,full.names=TRUE) +log$debug(base::paste0('Data Files identified:', fileData)) + +fileThresholds <- base::list.files(DirInThresholds,full.names=TRUE) +log$debug(base::paste0('Threshold Files identified:', fileThresholds)) + +# Process each datum path +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxFileIn = fileData) %dopar% { + log$info(base::paste0('Processing path to file: ', idxFileIn)) + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.sunav2.quality.flags( + FileInData=idxFileIn, + FileInThresholds=fileThresholds, + DirOut=Para$DirOut, + SchmDataOut=SchmDataOut, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + log$error(err$message) + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(idxFileIn, + log = log) + DirSub <- strsplit(InfoDirIn$dirRepo,".", fixed = TRUE)[[1]][1] + NEONprocIS.base::def.dir.crea(DirBgn = Para$DirErr, DirSub = DirSub, + log = log) + csvname <- DirSub %>% + strsplit( "/" ) %>% + sapply( tail, 1 ) + nameFileErr <- base::paste0(Para$DirErr, DirSub, "/",csvname) + log$info(base::paste0("Re-routing failed datum path to ", nameFileErr)) + con <- base::file(nameFileErr, "w") + base::close(con) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + return() +} + + + + diff --git a/flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R similarity index 100% rename from flow/flow.sunav2.quality.flags/wrap.suna.quality.flags.R rename to flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R From dccb601e86da44f96ffb61ef2890fdcc56e093b1 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Tue, 26 Aug 2025 12:14:10 -0600 Subject: [PATCH 067/182] Updates to suna quality flag .flow and docker. 
--- .../flow.sunav2.quality.flags.R | 30 ++++++++----------- .../wrap.sunav2.quality.flags.R | 4 +-- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R index 03bfb7b7b..7d56cb9e4 100644 --- a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -20,7 +20,7 @@ #' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will #' replace the #/pfs/BASE_REPO portion of \code{DirInData}. #' -#' 4. "SchmDataOut=value" (optional), where values is the full path to the avro schema for the output data +#' 4. "SchmFlagsOut=value" (optional), where values is the full path to the avro schema for the output data #' file. #' #' @@ -38,7 +38,7 @@ #' flow.sunav2.quality.flags <- function(DirInData="~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", #' DirInThresholds="~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", #' DirOutFlags="~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", -#' SchmDataOut=base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') +#' SchmFlagsOut=base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') #' log=log) #' Stepping through the code in R studio Sys.setenv(DIR_IN='/home/NEON/hensley/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data') @@ -82,21 +82,21 @@ log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used # Parse the input arguments into parameters Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirInData", "DirInThresholds","DirOutFlags","DirErr"), - NameParaOptn = c("SchmDataOut"),log = log) + NameParaOptn = c("SchmFlagsOut"),log = 
log) # Echo arguments log$debug(base::paste0('Input data directory: ', Para$DirInData)) log$debug(base::paste0('Input thresholds directory: ', Para$DirInThresholds)) log$debug(base::paste0('Output directory: ', Para$DirOutFlags)) log$debug(base::paste0('Error directory: ', Para$DirErr)) -log$debug(base::paste0('Schema for output data: ', Para$SchmDataOut)) +log$debug(base::paste0('Schema for output data: ', Para$SchmFlagsOut)) # Read in the schemas so we only have to do it once and not every time in the avro writer. -if(base::is.null(Para$SchmDataOut) || Para$SchmDataOut == 'NA'){ - SchmDataOut <- NULL +if(base::is.null(Para$SchmFlagsOut) || Para$SchmFlagsOut == 'NA'){ + SchmFlagsOut <- NULL } else { - SchmDataOut <- base::paste0(base::readLines(Para$SchmDataOut),collapse='') + SchmFlagsOut <- base::paste0(base::readLines(Para$SchmFlagsOut),collapse='') } # Find all the input paths (datums). We will process each one. @@ -108,25 +108,19 @@ DirInThresholds <- NEONprocIS.base::def.dir.in(DirBgn = Para$DirInThresholds, nameDirSub = NULL, log = log) -# Take stock of our data files. 
-fileData <- base::list.files(DirInData,full.names=TRUE) -log$debug(base::paste0('Data Files identified:', fileData)) - -fileThresholds <- base::list.files(DirInThresholds,full.names=TRUE) -log$debug(base::paste0('Threshold Files identified:', fileThresholds)) # Process each datum path doParallel::registerDoParallel(numCoreUse) -foreach::foreach(idxFileIn = fileData) %dopar% { +foreach::foreach(idxFileIn = DirInData) %dopar% { log$info(base::paste0('Processing path to file: ', idxFileIn)) # Run the wrapper function for each datum, with error routing tryCatch( withCallingHandlers( wrap.sunav2.quality.flags( - FileInData=idxFileIn, - FileInThresholds=fileThresholds, - DirOut=Para$DirOut, - SchmDataOut=SchmDataOut, + DirInData=idxFileIn, + DirInThresholds=DirInThresholds, + DirOutFlags=Para$DirOutFlags, + SchmFlagsOut=SchmFlagsOut, log=log ), error = function(err) { diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index c168c7def..498b24d26 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -12,7 +12,7 @@ #' #' @param DirOutFlags Character value. The file path for the output data. #' -#' @param SchmDataOut (optional), A json-formatted character string containing the schema for the output data +#' @param SchmFlagsOut (optional), A json-formatted character string containing the schema for the output data #' file. If this input is not provided, the output schema for the data will be the same as the input data #' file. If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF #' THE INPUT DATA. 
@@ -117,7 +117,7 @@ wrap.sunav2.quality.flags <- function(DirInData=NULL, #' Write out data file and log flags file base::dir.create(DirOutFlags,recursive=TRUE) - sensorFlagFileName<-paste0(stringr::str_remove(fileName,".parquet"),'_sensor_specific_flags') + sensorFlagFileName<-paste0(stringr::str_remove(dataFileName,".parquet"),'_sensor_specific_flags') rptOutFlags <- try(NEONprocIS.base::def.wrte.parq(data = flagFile, NameFile = base::paste0(DirOutFlags,'/',sensorFlagFileName,".parquet"), From 31cb3b2b207c8a6e6005d2ddfcaf35027ce875d4 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 26 Aug 2025 14:46:38 -0600 Subject: [PATCH 068/182] latest --- .../flow.sunav2.quality.flags.R | 68 +++++++++---------- .../wrap.sunav2.quality.flags.R | 16 +++-- flow/flow.troll.flags/flow.troll.flags.R | 6 +- .../nitrate_analyze_pad_and_qaqc_plau.yaml | 3 +- pipe/nitrate/pipe_list_nitrate.txt | 3 +- 5 files changed, 49 insertions(+), 47 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R index 7d56cb9e4..72e576e21 100644 --- a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -9,18 +9,15 @@ #' #' The arguments are: #' -#' 1. "DirInData=value", The input path to the data, structured as follows: -#' #/pfs/BASE_REPO/date/source-id/data. -#' -#' 2. "DirInThresholds=value", The input path to the test thresholds, structured as follows: -#' #/pfs/BASE_REPO/date/source-id/thresholds. +#' 1. "DirIn=value", The input path to the data, structured as follows: +#' #/pfs/BASE_REPO/date/source-id/data. #' -#' 2. "DirOutFlags=value", where the value is the output path. +#' 2. "DirOut=value", where the value is the output path. #' #' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will -#' replace the #/pfs/BASE_REPO portion of \code{DirInData}. 
+#' replace the #/pfs/BASE_REPO portion of \code{DirIn}. #' -#' 4. "SchmFlagsOut=value" (optional), where values is the full path to the avro schema for the output data +#' 4. "FileSchmQf=value" (optional), where values is the full path to the avro schema for the output data #' file. #' #' @@ -35,18 +32,21 @@ #' @keywords Currently none #' @examples -#' flow.sunav2.quality.flags <- function(DirInData="~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", +#' flow.sunav2.quality.flags <- function(DirIn="~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", #' DirInThresholds="~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", -#' DirOutFlags="~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", -#' SchmFlagsOut=base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') +#' DirOut="~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", +#' FileSchmQf=base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') #' log=log) #' Stepping through the code in R studio -Sys.setenv(DIR_IN='/home/NEON/hensley/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data') -log <- NEONprocIS.base::def.log.init(Lvl = "debug") -arg <- c("DirInData=~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", - "DirInThresholds=~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", - "DirOutFlags=~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", - "DirErr=~/pfs/out/errored_datums") +# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2024/09/10/nitrate_CRAM103100') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", +# 
"DirInThresholds=~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", +# "DirOut=~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", +# "DirErr=~/pfs/out/errored_datums") +# arg <- c("DirIn=$DIR_IN", +# "DirOut=~/pfs/out", +# "DirErr=~/pfs/out/errored_datums") #' rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently @@ -81,46 +81,40 @@ if(numCoreUse > numCoreAvail){ log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) # Parse the input arguments into parameters -Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirInData", "DirInThresholds","DirOutFlags","DirErr"), - NameParaOptn = c("SchmFlagsOut"),log = log) +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn", "DirOut","DirErr"), + NameParaOptn = c("FileSchmQf"),log = log) # Echo arguments -log$debug(base::paste0('Input data directory: ', Para$DirInData)) -log$debug(base::paste0('Input thresholds directory: ', Para$DirInThresholds)) -log$debug(base::paste0('Output directory: ', Para$DirOutFlags)) +log$debug(base::paste0('Input data directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) log$debug(base::paste0('Error directory: ', Para$DirErr)) -log$debug(base::paste0('Schema for output data: ', Para$SchmFlagsOut)) +log$debug(base::paste0('Schema for output data: ', Para$FileSchmQf)) # Read in the schemas so we only have to do it once and not every time in the avro writer. -if(base::is.null(Para$SchmFlagsOut) || Para$SchmFlagsOut == 'NA'){ - SchmFlagsOut <- NULL +if(base::is.null(Para$FileSchmQf) || Para$FileSchmQf == 'NA'){ + FileSchmQf <- NULL } else { - SchmFlagsOut <- base::paste0(base::readLines(Para$SchmFlagsOut),collapse='') + FileSchmQf <- base::paste0(base::readLines(Para$FileSchmQf),collapse='') } # Find all the input paths (datums). We will process each one. 
-DirInData <- - NEONprocIS.base::def.dir.in(DirBgn = Para$DirInData, - nameDirSub = NULL, - log = log) -DirInThresholds <- - NEONprocIS.base::def.dir.in(DirBgn = Para$DirInThresholds, +DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, nameDirSub = NULL, log = log) # Process each datum path doParallel::registerDoParallel(numCoreUse) -foreach::foreach(idxFileIn = DirInData) %dopar% { +foreach::foreach(idxFileIn = DirIn) %dopar% { log$info(base::paste0('Processing path to file: ', idxFileIn)) # Run the wrapper function for each datum, with error routing tryCatch( withCallingHandlers( wrap.sunav2.quality.flags( - DirInData=idxFileIn, - DirInThresholds=DirInThresholds, - DirOutFlags=Para$DirOutFlags, - SchmFlagsOut=SchmFlagsOut, + DirIn=idxFileIn, + DirOut=Para$DirOut, + SchmFlagsOut=FileSchmQf, log=log ), error = function(err) { diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 498b24d26..2a9a3403c 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -6,9 +6,7 @@ #' #' @description Wrapper function. Uses thresholds to apply quality flags to SUNA data. #' -#' @param DirInData Character value. The file path to the input data. -#' -#' @param DirThresholds Character value. The file path for the quality flag thresholds. +#' @param DirIn Character value. The file path to the input data and quality flag thresholds. #' #' @param DirOutFlags Character value. The file path for the output data. 
#' @@ -44,8 +42,7 @@ #' Bobby Hensley (2025-08-30) created #' ############################################################################################## -wrap.sunav2.quality.flags <- function(DirInData=NULL, - DirInThresholds=NULL, +wrap.sunav2.quality.flags <- function(DirIn, DirOutFlags=NULL, SchmFlagsOut=NULL, log=NULL @@ -56,6 +53,15 @@ wrap.sunav2.quality.flags <- function(DirInData=NULL, log <- NEONprocIS.base::def.log.init() } + DirInData <- + NEONprocIS.base::def.dir.in(DirBgn = DirIn, + nameDirSub = "data", + log = log) + DirInThresholds <- + NEONprocIS.base::def.dir.in(DirBgn = DirIn, + nameDirSub = "threshold", + log = log) + #' Read in parquet file of SUNA data dataFileName<-base::list.files(DirInData,full.names=FALSE) sunaData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInData, '/', dataFileName), diff --git a/flow/flow.troll.flags/flow.troll.flags.R b/flow/flow.troll.flags/flow.troll.flags.R index 8cf44ba47..1c5f25a31 100644 --- a/flow/flow.troll.flags/flow.troll.flags.R +++ b/flow/flow.troll.flags/flow.troll.flags.R @@ -118,7 +118,7 @@ if(base::is.null(Para$FileSchmQf) || Para$FileSchmQf == 'NA'){ DirSubCopy <- base::unique(base::setdiff( Para$DirSubCopy, - c('data') + c('data','flags') )) log$debug(base::paste0( 'Additional subdirectories to copy: ', @@ -127,9 +127,9 @@ log$debug(base::paste0( #what are the expected subdirectories of each input path -nameDirSub <- c('data','flags') +nameDirSub <- c('data','flags','threshold') log$debug(base::paste0( - 'Additional subdirectories to copy: ', + 'Expected subdirectories: ', base::paste0(nameDirSub, collapse = ',') )) diff --git a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml index 00e0da170..d2119c317 100644 --- a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml +++ b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml @@ -32,7 +32,8 @@ transform: DirIn=/tmp/pfs/padded_analyzerCopy \ DirOut=/pfs/out \ 
DirErr=/pfs/out/errored_datums \ - "TermTest1=nitrate:range|step|persistence|spike" + "TermTest1=nitrate:range|step|persistence|spike" \ + DirSubCopy=threshold EOF env: diff --git a/pipe/nitrate/pipe_list_nitrate.txt b/pipe/nitrate/pipe_list_nitrate.txt index 833973908..51c5af457 100644 --- a/pipe/nitrate/pipe_list_nitrate.txt +++ b/pipe/nitrate/pipe_list_nitrate.txt @@ -6,7 +6,8 @@ nitrate_group_path.yaml nitrate_threshold.yaml nitrate_thresh_select_ts_pad.yaml nitrate_analyze_pad_and_qaqc_plau.yaml -nitrate_pre_stats_qm.yaml + +nitrate_flags_specific.yaml nitrate_stats_group_and_compute.yaml nitrate_qm_group_and_compute.yaml nitrate_level1_group_consolidate_srf.yaml From 4d0c318ad09d7eb1789ced981212a25b6ac3e111 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 26 Aug 2025 14:54:18 -0600 Subject: [PATCH 069/182] rename dockerfile --- .../{DockerfileDockerfile => Dockerfile} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename flow/flow.sunav2.quality.flags/{DockerfileDockerfile => Dockerfile} (100%) diff --git a/flow/flow.sunav2.quality.flags/DockerfileDockerfile b/flow/flow.sunav2.quality.flags/Dockerfile similarity index 100% rename from flow/flow.sunav2.quality.flags/DockerfileDockerfile rename to flow/flow.sunav2.quality.flags/Dockerfile From fe2857821e241df71c13716ac655b92c6bea7c18 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 26 Aug 2025 15:02:26 -0600 Subject: [PATCH 070/182] renv file --- flow/flow.sunav2.quality.flags/renv.lock | 182 +++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 flow/flow.sunav2.quality.flags/renv.lock diff --git a/flow/flow.sunav2.quality.flags/renv.lock b/flow/flow.sunav2.quality.flags/renv.lock new file mode 100644 index 000000000..5ed0cd2d6 --- /dev/null +++ b/flow/flow.sunav2.quality.flags/renv.lock @@ -0,0 +1,182 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "cli": { + "Package": "cli", + 
"Version": "3.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "16850760556401a2eeb27d39bd11c9cb", + "Requirements": [] + }, + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2720e3fd3dad08f34b19b56b3d6f073d", + "Requirements": [] + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "glue": { + "Package": "glue", + "Version": "1.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5899f1eaa825580172bb56c08266f37c", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8552d117e1b808b09a832f589b79035", + "Requirements": [ + "cli", + "glue", + "rlang" + ] + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "680ad542fbcf801442c83a6ac5a2126c", + "Requirements": [ + "generics", + "timechange" + ] + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + 
"Source": "Repository", + "Repository": "CRAN", + "Hash": "7ce2733a9826b3aeb1775d56fd305472", + "Requirements": [] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "892124978869b74935dc3934c42bfe5a", + "Requirements": [] + }, + "stringi": { + "Package": "stringi", + "Version": "1.7.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "bba431031d30789535745a9627ac9271", + "Requirements": [] + }, + "stringr": { + "Package": "stringr", + "Version": "1.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "960e2ae9e09656611e0b8214ad543207", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "magrittr", + "rlang", + "stringi", + "vctrs" + ] + }, + "timechange": { + "Package": "timechange", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8548b44f79a35ba1791308b61e6012d7", + "Requirements": [ + "cpp11" + ] + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c03fa420630029418f7e6da3667aac4a", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang" + ] + } + } +} From b982e99b6e84d70b09926ab455c1282dc02bf099 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 28 Aug 2025 10:13:31 -0600 Subject: [PATCH 071/182] test continuous data --- pipe/sunav2/site-list.json | 4 ++++ pipe/sunav2/sunav2_cron_daily_and_date_control.yaml | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pipe/sunav2/site-list.json b/pipe/sunav2/site-list.json index 72c62c1df..d9ba56eb9 100644 --- a/pipe/sunav2/site-list.json +++ 
b/pipe/sunav2/site-list.json @@ -1,4 +1,8 @@ [ + { + "site" : "HOPB", + "kafka_start_date" : "2024-03-01" + }, { "site" : "CRAM", "kafka_start_date" : "2024-03-01" diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index 7ebf6cc71..1ac072cd3 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -11,8 +11,8 @@ transform: # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. OUT_PATH: /pfs/out - START_DATE: "2024-03-14" # Inclusive - END_DATE: "2024-03-24" # Inclusive + START_DATE: "2025-06-19" # Inclusive + END_DATE: "2025-06-29" # Inclusive SOURCE_TYPE: "sunav2" stdin: - "#!/bin/bash" From 10563889214122900f48b679c285a2e14e5e4f7e Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 28 Aug 2025 13:27:46 -0600 Subject: [PATCH 072/182] latest --- .../flow.sunav2.quality.flags.R | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R index 72e576e21..7b8e192ad 100644 --- a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -32,21 +32,21 @@ #' @keywords Currently none #' @examples -#' flow.sunav2.quality.flags <- function(DirIn="~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", +#' flow.sunav2.quality.flags <- function(DirIn="~/pfs/nitrate_thresh_select_ts_pad/2025/06/25/nitrate_HOPB112100/sunav2/CFGLOC113620", #' DirInThresholds="~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", #' DirOut="~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", #' 
FileSchmQf=base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') #' log=log) #' Stepping through the code in R studio -# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2024/09/10/nitrate_CRAM103100') -# log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- c("DirIn=~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", -# "DirInThresholds=~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", -# "DirOut=~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", -# "DirErr=~/pfs/out/errored_datums") -# arg <- c("DirIn=$DIR_IN", -# "DirOut=~/pfs/out", -# "DirErr=~/pfs/out/errored_datums") +Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/25/nitrate_HOPB112100/sunav2/CFGLOC113620') +log <- NEONprocIS.base::def.log.init(Lvl = "debug") +arg <- c("DirIn=~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", + "DirInThresholds=~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", + "DirOut=~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", + "DirErr=~/pfs/out/errored_datums") +arg <- c("DirIn=$DIR_IN", + "DirOut=~/pfs/out", + "DirErr=~/pfs/out/errored_datums") #' rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently @@ -101,7 +101,7 @@ if(base::is.null(Para$FileSchmQf) || Para$FileSchmQf == 'NA'){ # Find all the input paths (datums). We will process each one. 
DirIn <- NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, - nameDirSub = NULL, + nameDirSub = 'data', log = log) # Process each datum path @@ -113,7 +113,7 @@ foreach::foreach(idxFileIn = DirIn) %dopar% { withCallingHandlers( wrap.sunav2.quality.flags( DirIn=idxFileIn, - DirOut=Para$DirOut, + DirOutFlags=Para$DirOut, SchmFlagsOut=FileSchmQf, log=log ), From bf5a842453d4447cddc169cfc584e37f91e0d5ac Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Tue, 2 Sep 2025 15:27:21 -0600 Subject: [PATCH 073/182] update node selector --- pipe/sunav2/sunav2_calibration_assignment.yaml | 16 +--------------- .../sunav2_calibration_group_and_convert.yaml | 16 +--------------- pipe/sunav2/sunav2_calibration_list_files.yaml | 16 +--------------- pipe/sunav2/sunav2_calibration_loader.yaml | 16 +--------------- .../sunav2_cron_daily_and_date_control.yaml | 16 +--------------- ...nav2_cron_daily_and_date_control_kafka.yaml | 18 ++---------------- pipe/sunav2/sunav2_data_source_kafka.yaml | 16 +--------------- pipe/sunav2/sunav2_data_source_trino.yaml | 16 +--------------- .../sunav2_fill_date_gaps_and_regularize.yaml | 16 +--------------- pipe/sunav2/sunav2_fill_log_files.yaml | 16 +--------------- ...unav2_location_active_dates_assignment.yaml | 16 +--------------- pipe/sunav2/sunav2_location_asset.yaml | 16 +--------------- .../sunav2_location_asset_assignment.yaml | 16 +--------------- .../sunav2_location_group_and_restructure.yaml | 16 +--------------- pipe/sunav2/sunav2_location_loader.yaml | 16 +--------------- .../sunav2_logjam_assign_clean_files.yaml | 16 +--------------- pipe/sunav2/sunav2_logjam_list_files.yaml | 16 +--------------- pipe/sunav2/sunav2_logjam_load_files.yaml | 16 +--------------- pipe/sunav2/sunav2_trino_data_parser.yaml | 16 +--------------- 19 files changed, 20 insertions(+), 286 deletions(-) diff --git a/pipe/sunav2/sunav2_calibration_assignment.yaml b/pipe/sunav2/sunav2_calibration_assignment.yaml index 42fd8d720..02ada5f91 100644 --- 
a/pipe/sunav2/sunav2_calibration_assignment.yaml +++ b/pipe/sunav2/sunav2_calibration_assignment.yaml @@ -43,18 +43,4 @@ datum_set_spec: number: 5 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml index ba750f544..e2a09a7fc 100644 --- a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -94,18 +94,4 @@ datum_set_spec: number: 1 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_calibration_list_files.yaml b/pipe/sunav2/sunav2_calibration_list_files.yaml index e7cb05b79..8c73e9ed9 100644 --- a/pipe/sunav2/sunav2_calibration_list_files.yaml +++ b/pipe/sunav2/sunav2_calibration_list_files.yaml @@ -27,18 +27,4 @@ sidecar_resource_requests: cpu: 0.5 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", 
- "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml index 1772a996d..6c2367985 100644 --- a/pipe/sunav2/sunav2_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -55,18 +55,4 @@ datum_set_spec: number: 1 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index 1ac072cd3..a3fa1aa43 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -40,18 +40,4 @@ sidecar_resource_requests: autoscaling: true scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml index 6b5026a7a..044e10f73 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml @@ -11,7 +11,7 @@ transform: # kafka_start_date in the site-list file is the first full day from which data began streaming via 
Kafka # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. OUT_PATH: /pfs/out - START_DATE: "2025-07-01" # Inclusive + START_DATE: "2025-08-25" # Inclusive SOURCE_TYPE: "sunav2" stdin: - "#!/bin/bash" @@ -39,18 +39,4 @@ sidecar_resource_requests: autoscaling: true scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_data_source_kafka.yaml b/pipe/sunav2/sunav2_data_source_kafka.yaml index bae43cb2f..7b612f90b 100644 --- a/pipe/sunav2/sunav2_data_source_kafka.yaml +++ b/pipe/sunav2/sunav2_data_source_kafka.yaml @@ -206,18 +206,4 @@ datum_set_spec: number: 1 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_data_source_trino.yaml b/pipe/sunav2/sunav2_data_source_trino.yaml index 42d53cdf2..b2100b20c 100644 --- a/pipe/sunav2/sunav2_data_source_trino.yaml +++ b/pipe/sunav2/sunav2_data_source_trino.yaml @@ -143,18 +143,4 @@ datum_set_spec: number: 1 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - 
"operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml b/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml index a3d8ed4a7..bd4caa382 100644 --- a/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml +++ b/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml @@ -97,18 +97,4 @@ datum_set_spec: number: 1 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index 10f6df4bb..fbe59fa6c 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -185,18 +185,4 @@ datum_set_spec: number: 1 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_location_active_dates_assignment.yaml b/pipe/sunav2/sunav2_location_active_dates_assignment.yaml index e8961ac81..40eec8646 100644 --- a/pipe/sunav2/sunav2_location_active_dates_assignment.yaml +++ b/pipe/sunav2/sunav2_location_active_dates_assignment.yaml @@ -45,18 +45,4 @@ datum_set_spec: number: 5 scheduling_spec: 
node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_location_asset.yaml b/pipe/sunav2/sunav2_location_asset.yaml index 902a1d9ac..904c178a8 100644 --- a/pipe/sunav2/sunav2_location_asset.yaml +++ b/pipe/sunav2/sunav2_location_asset.yaml @@ -51,18 +51,4 @@ sidecar_resource_requests: cpu: 0.3 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_location_asset_assignment.yaml b/pipe/sunav2/sunav2_location_asset_assignment.yaml index 074ecf4b5..bb072eea2 100644 --- a/pipe/sunav2/sunav2_location_asset_assignment.yaml +++ b/pipe/sunav2/sunav2_location_asset_assignment.yaml @@ -45,18 +45,4 @@ datum_set_spec: number: 5 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_location_group_and_restructure.yaml b/pipe/sunav2/sunav2_location_group_and_restructure.yaml 
index dde304e5a..6a493ac1e 100644 --- a/pipe/sunav2/sunav2_location_group_and_restructure.yaml +++ b/pipe/sunav2/sunav2_location_group_and_restructure.yaml @@ -94,18 +94,4 @@ datum_set_spec: number: 1 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_location_loader.yaml b/pipe/sunav2/sunav2_location_loader.yaml index b5815e716..1f820410a 100644 --- a/pipe/sunav2/sunav2_location_loader.yaml +++ b/pipe/sunav2/sunav2_location_loader.yaml @@ -50,18 +50,4 @@ sidecar_resource_requests: cpu: 0.3 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml index 432e3324e..73340a674 100644 --- a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml +++ b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml @@ -38,18 +38,4 @@ datum_set_spec: number: 5 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": 
"cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_logjam_list_files.yaml b/pipe/sunav2/sunav2_logjam_list_files.yaml index cd52d3be5..9ed435342 100644 --- a/pipe/sunav2/sunav2_logjam_list_files.yaml +++ b/pipe/sunav2/sunav2_logjam_list_files.yaml @@ -29,18 +29,4 @@ sidecar_resource_requests: cpu: 0.4 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_logjam_load_files.yaml b/pipe/sunav2/sunav2_logjam_load_files.yaml index f981ce62b..3a2685b52 100644 --- a/pipe/sunav2/sunav2_logjam_load_files.yaml +++ b/pipe/sunav2/sunav2_logjam_load_files.yaml @@ -42,18 +42,4 @@ datum_set_spec: number: 1 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: "true" - nodepool.neonscience.org/pipeline: "yes" - cloud.google.com/gke-spot: "true" -pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_trino_data_parser.yaml b/pipe/sunav2/sunav2_trino_data_parser.yaml index 38d6148cb..3985ef178 100644 --- a/pipe/sunav2/sunav2_trino_data_parser.yaml +++ b/pipe/sunav2/sunav2_trino_data_parser.yaml @@ -95,18 +95,4 @@ datum_set_spec: number: 1 scheduling_spec: node_selector: - cloud.google.com/gke-ephemeral-storage-local-ssd: 'true' - nodepool.neonscience.org/pipeline: 'yes' - cloud.google.com/gke-spot: 'true' 
-pod_spec: |- - { "tolerations": [ - { - "key": "nodepool.neonscience.org/pipeline", - "operator": "Exists" - }, - { - "effect": "NoSchedule", - "key": "cloud.google.com/gke-spot", - "operator": "Exists" - } - ] } + cloud.google.com/compute-class: pach-pipeline-class From 598a6f6b96e24b9c9f547ce1aacdb34cd9d9e13a Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Tue, 2 Sep 2025 15:28:34 -0600 Subject: [PATCH 074/182] point to correct site list --- pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml index 044e10f73..2886da0f2 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml @@ -25,8 +25,8 @@ input: overwrite: true - pfs: name: SITE_FILE - repo: sunav2_site_list_kafka - glob: /site-list.json + repo: sunav2_site_list + glob: /site-list-full.json resource_requests: memory: 100M cpu: 1 From d568fcfa196e45f19c62cbfad83b91483e78f9a2 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Wed, 10 Sep 2025 09:49:20 -0600 Subject: [PATCH 075/182] stop triggering daily cron --- pipe/sunav2/sunav2_cron_daily_and_date_control.yaml | 1 + pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index a3fa1aa43..c7224e564 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -22,6 +22,7 @@ input: # This cron is the central driver for daily scheduled updates, such as data ingest and metadata loaders. 
- cron: name: tick + spec: "@never" spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) overwrite: true - pfs: diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml index 2886da0f2..2bb4cfb3c 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml @@ -21,7 +21,8 @@ input: # This cron is the central driver for daily scheduled updates, such as data ingest and metadata loaders. - cron: name: tick - spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) + spec: "@never" + #spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) overwrite: true - pfs: name: SITE_FILE From 47da90c2cf39214d17dfb74117a65356b7a46b62 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Sep 2025 09:17:32 -0600 Subject: [PATCH 076/182] udpated for dev testing --- pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml | 4 ++-- pipe/sunav2/sunav2_data_source_trino.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml index f1cc96635..5f8536e22 100644 --- a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml +++ b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml @@ -9,8 +9,8 @@ transform: # START_DATE must be set, format "YYYY-MM" # END_DATE can be set or unset (comment or remove line to unset). If unset, end month will be last month. OUT_PATH: /pfs/out - START_MONTH: "2024-03" - # END_MONTH: "2024-09" # Inclusive. Run the pipeline with END_MONTH set to initialize, then comment out and update pipeline (no reprocess) to let the cron take over + START_MONTH: "2025-06" + END_MONTH: "2025-06" # Inclusive. 
Run the pipeline with END_MONTH set to initialize, then comment out and update pipeline (no reprocess) to let the cron take over stdin: - "#!/bin/bash" - ./cron_monthly_and_pub_control/populate_pub_months.sh diff --git a/pipe/sunav2/sunav2_data_source_trino.yaml b/pipe/sunav2/sunav2_data_source_trino.yaml index b2100b20c..420dc0804 100644 --- a/pipe/sunav2/sunav2_data_source_trino.yaml +++ b/pipe/sunav2/sunav2_data_source_trino.yaml @@ -124,7 +124,7 @@ transform: input: pfs: name: import_trigger - repo: sunav2_cron_daily_and_date_control + repo: nitrate_cron_monthly_and_pub_control #update in cert glob: "/sunav2/*/*/*" output_branch: master parallelism_spec: From e69bd7aaed29e1311b5a895dae18afd3e975ba3b Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Sep 2025 11:40:50 -0600 Subject: [PATCH 077/182] latest --- .../flow.sunav2.quality.flags.R | 18 ++++---- pipe/nitrate/nitrate_flags_specific.yaml | 46 +++++++++++++++++++ .../sunav2_calibration_group_and_convert.yaml | 15 +++--- pipe/sunav2/sunav2_data_source_trino.yaml | 2 +- 4 files changed, 64 insertions(+), 17 deletions(-) create mode 100644 pipe/nitrate/nitrate_flags_specific.yaml diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R index 7b8e192ad..35c817a50 100644 --- a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -38,15 +38,15 @@ #' FileSchmQf=base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') #' log=log) #' Stepping through the code in R studio -Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/25/nitrate_HOPB112100/sunav2/CFGLOC113620') -log <- NEONprocIS.base::def.log.init(Lvl = "debug") -arg <- c("DirIn=~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", - 
"DirInThresholds=~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", - "DirOut=~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", - "DirErr=~/pfs/out/errored_datums") -arg <- c("DirIn=$DIR_IN", - "DirOut=~/pfs/out", - "DirErr=~/pfs/out/errored_datums") +# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/25/nitrate_HOPB112100/sunav2/CFGLOC113620') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", +# "DirInThresholds=~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", +# "DirOut=~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", +# "DirErr=~/pfs/out/errored_datums") +# arg <- c("DirIn=$DIR_IN", +# "DirOut=~/pfs/out", +# "DirErr=~/pfs/out/errored_datums") #' rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml new file mode 100644 index 000000000..7be2c7109 --- /dev/null +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -0,0 +1,46 @@ +--- +pipeline: + name: troll_flags_specific +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-troll-flags:v1.0.1 + # image_pull_secrets: + # - battelleecology-quay-read-all-pull-secret + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - Rscript ./flow.troll.flags.R + DirIn=$DIR_IN + DirOut=/pfs/out + DirErr=/pfs/out/errored_datums + FileSchmQf=$SCHEMA_FLAGS + "DirSubCopy=uncertainty_data|uncertainty_coef" + env: + LOG_LEVEL: INFO +input: + cross: + - pfs: + name: DIR_IN + repo: troll_calibration_group_and_convert + # /source_type/YYYY/MM/DD + glob: /*/*/*/*/ + - pfs: + name: SCHEMA_FLAGS + repo: troll_avro_schemas + glob: /troll_shared/flags_troll_specific.avsc +parallelism_spec: + constant: 6 +autoscaling: true 
+resource_requests: + memory: 500M + cpu: 1.2 +resource_limits: + memory: 1G + cpu: 2 +sidecar_resource_requests: + memory: 2G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml index e2a09a7fc..bcc3af595 100644 --- a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -41,14 +41,14 @@ transform: - path: name: DATA_PATH # Filter for data directory - glob_pattern: /pfs/DATA_PATH/*/*/*/*/*/** + glob_pattern: /pfs/DATA_PATH/sunav2/*/*/*/*/** # Join on named location (already joined below by source type and day) join_indices: [7] outer_join: true - path: name: CALIBRATION_PATH # Filter for data directory - glob_pattern: /pfs/CALIBRATION_PATH/*/*/*/*/*/** + glob_pattern: /pfs/CALIBRATION_PATH/sunav2/*/*/*/*/** # Join on named location (already joined below by day) join_indices: [7] OUT_PATH: /tmp/pfs/filter_joined # Note that R modules use "pfs" in the path structure to determine datums @@ -63,19 +63,20 @@ input: name: FILE_SCHEMA_FLAGS repo: sunav2_avro_schemas glob: /sunav2/flags_calibration_sunav2.avsc - # Outer join all days + # Outer join all repos so that varying sensors between kafka and trino loaders will all get joined with calibrations. Filter-joiner will narrow down. - join: - pfs: name: CALIBRATION_PATH repo: sunav2_calibration_assignment - glob: /(*)/(*)/(*)/(*) - joinOn: $1/$2/$3/$4 + glob: /sunav2/(*)/(*)/(*) + joinOn: $1/$2/$3 + outer_join: true empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - pfs: name: DATA_PATH repo: sunav2_fill_log_files - glob: /(*)/(*)/(*)/(*) - joinOn: $1/$2/$3/$4 + glob: /sunav2/(*)/(*)/(*) + joinOn: $1/$2/$3 outer_join: true empty_files: false # Make sure to use false if LINK_TYPE=COPY. 
Can also be set to false for LINK_TYPE=SYMLINK. parallelism_spec: diff --git a/pipe/sunav2/sunav2_data_source_trino.yaml b/pipe/sunav2/sunav2_data_source_trino.yaml index 420dc0804..7afa755cf 100644 --- a/pipe/sunav2/sunav2_data_source_trino.yaml +++ b/pipe/sunav2/sunav2_data_source_trino.yaml @@ -124,7 +124,7 @@ transform: input: pfs: name: import_trigger - repo: nitrate_cron_monthly_and_pub_control #update in cert + repo: sunav2_cron_daily_and_date_control #update in cert glob: "/sunav2/*/*/*" output_branch: master parallelism_spec: From f361dd50dc13a29a7e8561b76ed7d3aa81d66157 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Sep 2025 12:56:36 -0600 Subject: [PATCH 078/182] minor --- .../flow.sunav2.quality.flags.R | 11 +++------- .../wrap.sunav2.quality.flags.R | 20 +++++++------------ 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R index 35c817a50..55d5950b7 100644 --- a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -32,18 +32,13 @@ #' @keywords Currently none #' @examples -#' flow.sunav2.quality.flags <- function(DirIn="~/pfs/nitrate_thresh_select_ts_pad/2025/06/25/nitrate_HOPB112100/sunav2/CFGLOC113620", -#' DirInThresholds="~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", -#' DirOut="~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", +#' flow.sunav2.quality.flags <- function(DirIn="~/pfs/nitrate_thresh_select_ts_pad/2025/06/25/nitrate_HOPB112100", +#' DirOut="~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733", #' FileSchmQf=base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') #' log=log) #' Stepping through the code in R studio -# 
Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/25/nitrate_HOPB112100/sunav2/CFGLOC113620') +# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- c("DirIn=~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data", -# "DirInThresholds=~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold", -# "DirOut=~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags", -# "DirErr=~/pfs/out/errored_datums") # arg <- c("DirIn=$DIR_IN", # "DirOut=~/pfs/out", # "DirErr=~/pfs/out/errored_datums") diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 2a9a3403c..c2c65476e 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -53,14 +53,8 @@ wrap.sunav2.quality.flags <- function(DirIn, log <- NEONprocIS.base::def.log.init() } - DirInData <- - NEONprocIS.base::def.dir.in(DirBgn = DirIn, - nameDirSub = "data", - log = log) - DirInThresholds <- - NEONprocIS.base::def.dir.in(DirBgn = DirIn, - nameDirSub = "threshold", - log = log) + DirInData <- paste0(DirIn,"/data") + DirInThresholds <- paste0(DirIn,"/threshold") #' Read in parquet file of SUNA data dataFileName<-base::list.files(DirInData,full.names=FALSE) @@ -70,8 +64,8 @@ wrap.sunav2.quality.flags <- function(DirIn, #' Convert measurements to be tested from class character to numeric sunaData$relative_humidity<-as.numeric(sunaData$relative_humidity) sunaData$lamp_temperature<-as.numeric(sunaData$lamp_temperature) - sunaData$spectrum_average<-as.numeric(sunaData$spectrum_average) - sunaData$dark_value_used_for_fit<-as.numeric(sunaData$dark_value_used_for_fit) + sunaData$spec_average<-as.numeric(sunaData$spec_average) + 
sunaData$dark_signal_average<-as.numeric(sunaData$dark_signal_average) #' Create data frame of input file readout_times to serve as basis of output flag file flagFile<-as.data.frame(sunaData$readout_time) @@ -113,10 +107,10 @@ wrap.sunav2.quality.flags <- function(DirIn, minLightDarkRatio<-spectralRatioThreshold$number_value flagFile$nitrateLightDarkRatioQF<-NA for(i in 1:nrow(sunaData)){ - if(is.na(sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')])|is.na(sunaData[i,which(colnames(sunaData)=='spectrum_average')])){ + if(is.na(sunaData[i,which(colnames(sunaData)=='dark_signal_average')])|is.na(sunaData[i,which(colnames(sunaData)=='spec_average')])){ flagFile[i,which(colnames(flagFile)=='nitrateLightDarkRatioQF')]=-1} - if(!is.na(sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')])&!is.na(sunaData[i,which(colnames(sunaData)=='spectrum_average')])){ - if(sunaData[i,which(colnames(sunaData)=='spectrum_average')]/sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')] Date: Mon, 15 Sep 2025 13:22:01 -0600 Subject: [PATCH 079/182] update docker file --- flow/flow.sunav2.quality.flags/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/Dockerfile b/flow/flow.sunav2.quality.flags/Dockerfile index f02fce2d3..603cc646d 100644 --- a/flow/flow.sunav2.quality.flags/Dockerfile +++ b/flow/flow.sunav2.quality.flags/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile for NEON IS Data Processing - sunav2 sensor-specific quality flags -# Start with the neon-is-base-r image. -FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 +# Start with the neon-is-pack-qaqc-r image. 
+FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-qaqc-r:v1.1.8 ARG FLOW_DIR="./flow" ARG APP_DIR="flow.sunav2.quality.flags" From ecbf4e71ec2ff5c31dd2c16161f7cb524155e941 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Sep 2025 13:22:01 -0600 Subject: [PATCH 080/182] update docker file --- flow/flow.sunav2.quality.flags/Dockerfile | 4 ++-- flow/flow.sunav2.quality.flags/renv.lock | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/Dockerfile b/flow/flow.sunav2.quality.flags/Dockerfile index f02fce2d3..603cc646d 100644 --- a/flow/flow.sunav2.quality.flags/Dockerfile +++ b/flow/flow.sunav2.quality.flags/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile for NEON IS Data Processing - sunav2 sensor-specific quality flags -# Start with the neon-is-base-r image. -FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 +# Start with the neon-is-pack-qaqc-r image. +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-qaqc-r:v1.1.8 ARG FLOW_DIR="./flow" ARG APP_DIR="flow.sunav2.quality.flags" diff --git a/flow/flow.sunav2.quality.flags/renv.lock b/flow/flow.sunav2.quality.flags/renv.lock index 5ed0cd2d6..dd613294d 100644 --- a/flow/flow.sunav2.quality.flags/renv.lock +++ b/flow/flow.sunav2.quality.flags/renv.lock @@ -125,10 +125,10 @@ }, "rlang": { "Package": "rlang", - "Version": "1.1.6", + "Version": "1.1.4", "Source": "Repository", "Repository": "CRAN", - "Hash": "892124978869b74935dc3934c42bfe5a", + "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1", "Requirements": [] }, "stringi": { From d6f7873b5177b677509f174c812b9ccce23f606c Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Sep 2025 13:42:47 -0600 Subject: [PATCH 081/182] flags specific pipeline --- pipe/nitrate/nitrate_flags_specific.yaml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pipe/nitrate/nitrate_flags_specific.yaml 
b/pipe/nitrate/nitrate_flags_specific.yaml index 7be2c7109..143b652f5 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -1,32 +1,31 @@ --- pipeline: - name: troll_flags_specific + name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-troll-flags:v1.0.1 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-612f69c # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] stdin: - "#!/bin/bash" - - Rscript ./flow.troll.flags.R + - Rscript ./flow.sunav2.quality.flags.R DirIn=$DIR_IN DirOut=/pfs/out DirErr=/pfs/out/errored_datums FileSchmQf=$SCHEMA_FLAGS - "DirSubCopy=uncertainty_data|uncertainty_coef" env: LOG_LEVEL: INFO input: cross: - pfs: name: DIR_IN - repo: troll_calibration_group_and_convert + repo: nitrate_analyze_pad_and_qaqc_plau # /source_type/YYYY/MM/DD glob: /*/*/*/*/ - pfs: name: SCHEMA_FLAGS - repo: troll_avro_schemas - glob: /troll_shared/flags_troll_specific.avsc + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_sensor_specific_flags.avsc parallelism_spec: constant: 6 autoscaling: true From 954fbcad187f66265af9bab3a51b2ff9785e9861 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Sep 2025 14:30:16 -0600 Subject: [PATCH 082/182] update output path --- flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R | 2 +- flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R index 55d5950b7..c4310c908 100644 --- a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -108,7 +108,7 @@ foreach::foreach(idxFileIn = DirIn) %dopar% { withCallingHandlers( wrap.sunav2.quality.flags( DirIn=idxFileIn, - 
DirOutFlags=Para$DirOut, + DirOut=Para$DirOut, SchmFlagsOut=FileSchmQf, log=log ), diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index c2c65476e..9bfa8948b 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -43,7 +43,7 @@ #' ############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, - DirOutFlags=NULL, + DirOut, SchmFlagsOut=NULL, log=NULL ){ @@ -53,8 +53,10 @@ wrap.sunav2.quality.flags <- function(DirIn, log <- NEONprocIS.base::def.log.init() } + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) DirInData <- paste0(DirIn,"/data") DirInThresholds <- paste0(DirIn,"/threshold") + DirOutFlags <- base::paste0(DirOut,InfoDirIn$dirRepo,'flags') #' Read in parquet file of SUNA data dataFileName<-base::list.files(DirInData,full.names=FALSE) From 39d717116f73aaa9946000a30855fcf3cbaab450 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Sep 2025 14:38:42 -0600 Subject: [PATCH 083/182] minor --- flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 9bfa8948b..7035a7160 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -56,7 +56,7 @@ wrap.sunav2.quality.flags <- function(DirIn, InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) DirInData <- paste0(DirIn,"/data") DirInThresholds <- paste0(DirIn,"/threshold") - DirOutFlags <- base::paste0(DirOut,InfoDirIn$dirRepo,'flags') + DirOutFlags <- base::paste0(DirOut,InfoDirIn$dirRepo,'/flags') #' Read in parquet file of SUNA data 
dataFileName<-base::list.files(DirInData,full.names=FALSE) From 5deb085ac9ffc6a9e0aabb40aa82336a98086e9c Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Sep 2025 16:06:04 -0600 Subject: [PATCH 084/182] latest --- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index 143b652f5..2850fc56e 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-612f69c + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-39d7171 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] From 67fd9bc45df9baed1c77a65e1484accd4b082156 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Tue, 16 Sep 2025 12:53:21 -0600 Subject: [PATCH 085/182] Updates to SUNA sensor specific flagging module. Determines measurments at begining of burst to drop. 
--- .../wrap.sunav2.quality.flags.R | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 498b24d26..5cbe890e9 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -112,8 +112,31 @@ wrap.sunav2.quality.flags <- function(DirInData=NULL, if(!is.na(sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')])&!is.na(sunaData[i,which(colnames(sunaData)=='spectrum_average')])){ if(sunaData[i,which(colnames(sunaData)=='spectrum_average')]/sunaData[i,which(colnames(sunaData)=='dark_value_used_for_fit')] Date: Tue, 16 Sep 2025 13:10:41 -0600 Subject: [PATCH 086/182] Updates to SUNA sensor specific flag module. --- .../wrap.sunav2.quality.flags.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 46b910631..6908eb31e 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -28,9 +28,9 @@ #' #' @examples #' # Not run -# DirInData<-"~/pfs/sunav2_location_group_and_restructure/sunav2/2024/09/10/CFGLOC110733/data" +# DirInData<-"~/pfs/sunav2_location_group_and_restructure/2024/09/10/CFGLOC110733/data" # DirInThresholds<-"~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold" -# DirOutFlags<-"~/pfs/sunav2_sensor_specific_flags/sunav2/2024/09/10/CFGLOC110733/flags/" +# DirOutFlags<-"~/pfs/sunav2_sensor_specific_flags/2024/09/10/CFGLOC110733/flags/" # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") #' @@ -114,7 +114,7 @@ 
wrap.sunav2.quality.flags <- function(DirIn, if(!is.na(sunaData[i,which(colnames(sunaData)=='dark_signal_average')])&!is.na(sunaData[i,which(colnames(sunaData)=='spec_average')])){ if(sunaData[i,which(colnames(sunaData)=='spec_average')]/sunaData[i,which(colnames(sunaData)=='dark_signal_average')] Date: Tue, 16 Sep 2025 14:11:17 -0600 Subject: [PATCH 087/182] latest --- ...v2_cron_daily_and_date_control_logjam.yaml | 44 +++++++++++++++++++ pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 pipe/sunav2/sunav2_cron_daily_and_date_control_logjam.yaml diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control_logjam.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control_logjam.yaml new file mode 100644 index 000000000..84df5e383 --- /dev/null +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control_logjam.yaml @@ -0,0 +1,44 @@ +--- +pipeline: + name: sunav2_cron_daily_and_date_control_logjam +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-date-cntl:v2.0.1 + cmd: ["/bin/bash"] + env: + # START_DATE ("YYYY-MM-DD") and END_DATE ("YYYY-MM-DD") indicate the max date range (inclusive) to create the /Y/M/D folder structure + # If START_DATE is not set (remove line entirely to unset), the start_date and/or the kafka_start_date for each site will be used, as indicated in the site-list json file + # start_date field in the site-list file is the earliest date to pull data from a site + # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka + # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. 
+ OUT_PATH: /pfs/out + START_DATE: "2025-06-19" # Inclusive + END_DATE: "2025-06-29" # Inclusive + SOURCE_TYPE: "sunav2" + stdin: + - "#!/bin/bash" + - python3 -m cron_daily_and_date_control.cron_daily_and_date_control_main +input: + cross: + # This cron is the central driver for daily scheduled updates, such as data ingest and metadata loaders. + - cron: + name: tick + spec: "@never" + spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) + overwrite: true + - pfs: + name: SITE_FILE + repo: sunav2_site_list + glob: /site-list.json +resource_requests: + memory: 100M + cpu: 1 +resource_limits: + memory: 300M + cpu: 1.5 +sidecar_resource_requests: + memory: 500M + cpu: 0.5 +autoscaling: true +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index fbe59fa6c..1d25a9648 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -139,7 +139,7 @@ input: - pfs: name: FILE_SCHEMA_DATA repo: sunav2_avro_schemas - glob: /sunav2/sunav2_calibrated.avsc + glob: /sunav2/sunav2_logfilled.avsc - join: - pfs: name: DATA_PATH_TRINO From 4b4f1cef30e0aefa4b2bf57acc76e3f96db585a8 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 16 Sep 2025 15:19:20 -0600 Subject: [PATCH 088/182] ignore problematic spec_channels for now --- flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R | 2 +- flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R index d7243e9da..10389faf0 100644 --- a/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/flow.sunav2.logfiles.fill.R @@ -44,7 +44,7 @@ # Sys.setenv(DirIn='/home/NEON/ncatolico/pfs/sunav2_logjam_assign_clean_files/sunav2/2024/09/10/20349') 
#cleaned log data # Sys.setenv(DirIn='/home/NEON/ncatolico/pfs/sunav2_trino_data_parser/sunav2/2024/09/11/20349') #streamed L0 data # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- c("DirIn=$DirIn","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums","FileSchmData=~/pfs/sunav2_avro_schemas/sunav2/sunav2_calibrated.avsc") +# arg <- c("DirIn=$DirIn","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums","FileSchmData=~/pfs/sunav2_avro_schemas/sunav2/sunav2_logfilled.avsc") #' rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index d0bea0db8..143f35a5d 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -128,9 +128,7 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, flagsOut$readout_time<-dataOut$readout_time flagsOut$sunaLogDataQF<-0 } - #dataOut$spectrum_channels<-NULL #remove list - #dataOutFrame<-as.data.frame(dataOut) - #names(dataOutFrame)[names(dataOutFrame) == 'nitrate_concentration'] <- 'nitrate' + dataOut$spectrum_channels<-0 #remove list for now #' Write out data file and log flags file From 086338d69fb86ed352f499dd8e595cd04689e67d Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 16 Sep 2025 15:40:26 -0600 Subject: [PATCH 089/182] change field names back --- flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R | 4 ++-- pipe/sunav2/sunav2_calibration_group_and_convert.yaml | 2 +- pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 6908eb31e..bcb0cf45b 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -66,8 +66,8 @@ 
wrap.sunav2.quality.flags <- function(DirIn, #' Convert measurements to be tested from class character to numeric sunaData$relative_humidity<-as.numeric(sunaData$relative_humidity) sunaData$lamp_temperature<-as.numeric(sunaData$lamp_temperature) - sunaData$spec_average<-as.numeric(sunaData$spec_average) - sunaData$dark_signal_average<-as.numeric(sunaData$dark_signal_average) + sunaData$spectrum_average<-as.numeric(sunaData$spectrum_average) + sunaData$dark_value_used_for_fit<-as.numeric(sunaData$dark_value_used_for_fit) #' Create data frame of input file readout_times to serve as basis of output flag file flagFile<-as.data.frame(sunaData$readout_time) diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml index bcc3af595..d61659758 100644 --- a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -62,7 +62,7 @@ input: - pfs: name: FILE_SCHEMA_FLAGS repo: sunav2_avro_schemas - glob: /sunav2/flags_calibration_sunav2.avsc + glob: /sunav2/sunav2_calibration_flags.avsc # Outer join all repos so that varying sensors between kafka and trino loaders will all get joined with calibrations. Filter-joiner will narrow down. 
- join: - pfs: diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index 1d25a9648..e9bc016da 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-bae8aad + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-4b4f1ce cmd: - sh - "-c" From 326b3b4acc091c5cb5e28793bc235a6631ee2827 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 16 Sep 2025 17:03:05 -0600 Subject: [PATCH 090/182] schema update --- flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R | 2 +- pipe/sunav2/sunav2_calibration_group_and_convert.yaml | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index bcb0cf45b..9094fc4b4 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -67,7 +67,7 @@ wrap.sunav2.quality.flags <- function(DirIn, sunaData$relative_humidity<-as.numeric(sunaData$relative_humidity) sunaData$lamp_temperature<-as.numeric(sunaData$lamp_temperature) sunaData$spectrum_average<-as.numeric(sunaData$spectrum_average) - sunaData$dark_value_used_for_fit<-as.numeric(sunaData$dark_value_used_for_fit) + sunaData$dark_signal_average<-as.numeric(sunaData$dark_signal_average) #' Create data frame of input file readout_times to serve as basis of output flag file flagFile<-as.data.frame(sunaData$readout_time) diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml index d61659758..8dafbaa5e 100644 --- a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml +++ 
b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -27,6 +27,7 @@ transform: DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ TermQf=nitrate \ + FileSchmData=$FILE_SCHEMA_DATA \ FileSchmQf=$FILE_SCHEMA_FLAGS \ DirSubCopy=flags EOF @@ -63,6 +64,10 @@ input: name: FILE_SCHEMA_FLAGS repo: sunav2_avro_schemas glob: /sunav2/sunav2_calibration_flags.avsc + - pfs: + name: FILE_SCHEMA_DATA + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_logfilled.avsc # Outer join all repos so that varying sensors between kafka and trino loaders will all get joined with calibrations. Filter-joiner will narrow down. - join: - pfs: From 8b0b9a26619e5c5e7692a4f76ede533362836f95 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 16 Sep 2025 17:14:02 -0600 Subject: [PATCH 091/182] latest --- flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 9094fc4b4..6908eb31e 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -66,7 +66,7 @@ wrap.sunav2.quality.flags <- function(DirIn, #' Convert measurements to be tested from class character to numeric sunaData$relative_humidity<-as.numeric(sunaData$relative_humidity) sunaData$lamp_temperature<-as.numeric(sunaData$lamp_temperature) - sunaData$spectrum_average<-as.numeric(sunaData$spectrum_average) + sunaData$spec_average<-as.numeric(sunaData$spec_average) sunaData$dark_signal_average<-as.numeric(sunaData$dark_signal_average) #' Create data frame of input file readout_times to serve as basis of output flag file From d75b5b3e52dde76eef16fa61b0acb496bb04f0fb Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 17 Sep 2025 11:20:00 -0600 Subject: [PATCH 092/182] spec channels update --- .../wrap.sunav2.logfiles.fill.R | 10 +++++++--- 1 file changed, 7 insertions(+), 
3 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index 143f35a5d..5141171ef 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -111,7 +111,13 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, # Generate error and stop execution log$error(base::paste0('File ', dirInDataLogs, '/', logFile, ' is unreadable.')) base::stop()} - } + } + +#' update columns to same format + if(length(L0Data>=1)){ + L0Data$spectrum_channels <- lapply(L0Data$spectrum_channels, function(x) paste(x, collapse = ";")) + } + #' Determine whether to use logged or streamed data. #' Logged data is used if available, and log data flag set to 1 @@ -128,8 +134,6 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, flagsOut$readout_time<-dataOut$readout_time flagsOut$sunaLogDataQF<-0 } - dataOut$spectrum_channels<-0 #remove list for now - #' Write out data file and log flags file From 67564eb67574f5ad958ec0a749ca9611af694749 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 17 Sep 2025 11:25:50 -0600 Subject: [PATCH 093/182] update images --- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index 2850fc56e..373ddd34b 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-39d7171 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-8b0b9a2 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] diff --git 
a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index e9bc016da..2bf408898 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-4b4f1ce + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-d75b5b3 cmd: - sh - "-c" From 5bf1c425b8f35492f362baf31b26b42ec9696db3 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 17 Sep 2025 11:51:38 -0600 Subject: [PATCH 094/182] minor --- flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index 5141171ef..bd59cac83 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -98,7 +98,9 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, # Generate error and stop execution log$error(base::paste0('File ', dirInDataStream, '/', L0File, ' is unreadable.')) base::stop()} - } + }else{ + L0Data<-NULL + } #' Load any logged data fileDataLogs<-base::list.files(dirInDataLogs,full.names=FALSE) @@ -111,6 +113,8 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, # Generate error and stop execution log$error(base::paste0('File ', dirInDataLogs, '/', logFile, ' is unreadable.')) base::stop()} + }else{ + logData<-NULL } #' update columns to same format From a9c9ab031d17968c3abcc820fcd6f6a787e65529 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 17 Sep 2025 12:47:42 -0600 Subject: [PATCH 095/182] latest --- .../wrap.sunav2.logfiles.fill.R | 6 +++--- flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R | 6 +++--- pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- 
pipe/sunav2/sunav2_logjam_assign_clean_files.yaml | 12 +++++++++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R index bd59cac83..de0eda989 100644 --- a/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R +++ b/flow/flow.sunav2.logfiles.fill/wrap.sunav2.logfiles.fill.R @@ -44,10 +44,10 @@ #' @examples #' # Not run # DirInLogs<-"~/pfs/sunav2_logjam_assign_clean_files/sunav2/2024/09/11/20349" #cleaned log data -# DirInStream<-"~/pfs/sunav2_trino_data_parser/sunav2/2024/09/11/20349" #streamed L0 data +# DirInStream<-"~/pfs/sunav2_trino_data_parser/sunav2/2025/06/22/20345" #streamed L0 data # DirIn<-NULL # DirOutBase="~/pfs/out" -# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2/sunav2_calibrated.avsc'),collapse='') +# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2/sunav2_logfilled.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_log_flags.avsc'),collapse='') #' @@ -118,7 +118,7 @@ wrap.sunav2.logfiles.fill <- function(DirInLogs=NULL, } #' update columns to same format - if(length(L0Data>=1)){ + if(length(L0Data)>=1){ L0Data$spectrum_channels <- lapply(L0Data$spectrum_channels, function(x) paste(x, collapse = ";")) } diff --git a/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R b/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R index b84cc09db..a653364ca 100644 --- a/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R +++ b/flow/flow.sunav2.logfiles/flow.sunav2.logfiles.R @@ -41,9 +41,9 @@ #' SchmDataOut=NULL, #' log=log) #' Stepping through the code in R studio -Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/sunav2_logjam_load_files/20349') -log <- NEONprocIS.base::def.log.init(Lvl = "debug") -arg <- c("DirIn=$DIR_IN","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums") 
+# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/sunav2_logjam_load_files/20349') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=$DIR_IN","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums","FileSchmData=~/pfs/sunav2_avro_schemas/sunav2/sunav2_logfilled.avsc") #' rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index 2bf408898..3d96781ad 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-d75b5b3 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-5bf1c42 cmd: - sh - "-c" diff --git a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml index 73340a674..7776659db 100644 --- a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml +++ b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml @@ -10,6 +10,7 @@ transform: DirIn=$DIR_IN DirOut=/pfs/out DirErr=$ERR_PATH + FileSchmData=$FILE_SCHEMA_DATA image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:sha-a1ff444 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret @@ -19,9 +20,14 @@ transform: input: cross: - pfs: - name: DIR_IN - repo: sunav2_logjam_load_files - glob: /* + name: FILE_SCHEMA_DATA + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_logfilled.avsc + - join: + - pfs: + name: DIR_IN + repo: sunav2_logjam_load_files + glob: /* parallelism_spec: constant: 5 autoscaling: true From f284f8668bfd6c17e1ba995ea33615d8dbbaa2da Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 17 Sep 2025 13:37:45 -0600 Subject: [PATCH 096/182] image --- pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index 3d96781ad..eebd1fbd4 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-5bf1c42 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-a9c9ab0 cmd: - sh - "-c" From 1c32a7057ee0f9761f183ea1acbab922bfad9d22 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Fri, 19 Sep 2025 10:35:04 -0600 Subject: [PATCH 097/182] Updated suna quality flag module to remove measuremnts where lamp has not had enough time to stabilize since these are not intended to be used in downstream pipeline. --- .../flow.sunav2.quality.flags.R | 55 ++++-- .../wrap.sunav2.quality.flags.R | 171 +++++++++++------- 2 files changed, 147 insertions(+), 79 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R index c4310c908..621b3709f 100644 --- a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -4,21 +4,24 @@ #' @author #' Bobby Hensley \email{hensley@battelleecology.org} -#' @description Workflow. Calculates quality flags for SUNA internal humidity, lamp temperature -#' and light to dark spectral ratio, and saves into daily parquets. +#' @description Workflow. Uses thresholds to apply sensor-specific quality flags to SUNA data. +#' Measurements where the lamp has not had enough time to stabilze (nitrateLampStabilizeQF=1) are removed. #' #' The arguments are: #' -#' 1. "DirIn=value", The input path to the data, structured as follows: -#' #/pfs/BASE_REPO/date/source-id/data. +#' 1. "DirIn=value", The base file path to the input data, QA/QC plausibility flags and quality flag thresholds. 
+#' #/pfs/BASE_REPO/date/location/sunav2/cfgloc, where files will then be in /data, /flags and /threshold sub-folders. #' -#' 2. "DirOut=value", where the value is the output path. +#' 2. "DirInAdditional=value", The file path to the log file flags and calibration flags. +#' +#' 2. "DirOut=value", The base file path for the output data. #' #' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will #' replace the #/pfs/BASE_REPO portion of \code{DirIn}. #' -#' 4. "FileSchmQf=value" (optional), where values is the full path to the avro schema for the output data -#' file. +#' 4. "SchmData=value" (optional), The avro schema for the input and output data file. +#' +#' 5. "SchmFlagsOut=value" (optional), The avro schema for the combined flag file. #' #' #' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, @@ -39,15 +42,21 @@ #' Stepping through the code in R studio # Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- c("DirIn=$DIR_IN", -# "DirOut=~/pfs/out", -# "DirErr=~/pfs/out/errored_datums") +arg <- c("DirIn=~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620", + "DirInAdditional=~/pfs/nitrate_group_path/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620/flags", + "DirOut=~/pfs/out", + "DirErr=~/pfs/out/errored_datums") #' rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently # changelog and author contributions / copyrights -#' Bobby Hensley (2025-08-26) Original creation +#' Bobby Hensley (2025-08-30) +#' Initial creation. +#' +#' Bobby Hensley (2025-09-18) +#' Updated so that measurements prior to lamp stabilization (never intended to be +#' used in downstream pipeline) are removed. 
# ############################################################################################## options(digits.secs = 3) @@ -76,21 +85,27 @@ if(numCoreUse > numCoreAvail){ log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) # Parse the input arguments into parameters -Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn", "DirOut","DirErr"), - NameParaOptn = c("FileSchmQf"),log = log) +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn", "DirInAdditional","DirOut","DirErr"), + NameParaOptn = c("SchmData","SchmFlagsOut"),log = log) # Echo arguments log$debug(base::paste0('Input data directory: ', Para$DirIn)) +log$debug(base::paste0('Additional input data directory: ', Para$DirInAdditional)) log$debug(base::paste0('Output directory: ', Para$DirOut)) log$debug(base::paste0('Error directory: ', Para$DirErr)) -log$debug(base::paste0('Schema for output data: ', Para$FileSchmQf)) - +log$debug(base::paste0('Schema for output data: ', Para$SchmData)) +log$debug(base::paste0('Schema for output flags: ', Para$SchmFlagsOut)) # Read in the schemas so we only have to do it once and not every time in the avro writer. -if(base::is.null(Para$FileSchmQf) || Para$FileSchmQf == 'NA'){ - FileSchmQf <- NULL +if(base::is.null(Para$SchmData) || Para$SchmData == 'NA'){ + SchmData <- NULL } else { - FileSchmQf <- base::paste0(base::readLines(Para$FileSchmQf),collapse='') + SchmData <- base::paste0(base::readLines(Para$SchmData),collapse='') +} +if(base::is.null(Para$SchmFlagsOut) || Para$SchmFlagsOut == 'NA'){ + SchmFlagsOut <- NULL +} else { + SchmFlagsOut <- base::paste0(base::readLines(Para$SchmFlagsOut),collapse='') } # Find all the input paths (datums). We will process each one. 
@@ -98,6 +113,10 @@ DirIn <- NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, nameDirSub = 'data', log = log) +DirInAdditional <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirInAdditional, + nameDirSub = 'flags', + log = log) # Process each datum path doParallel::registerDoParallel(numCoreUse) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 6908eb31e..3db9531e4 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -4,22 +4,25 @@ #' @author #' Bobby Hensley \email{hensley@battelleecology.org} #' -#' @description Wrapper function. Uses thresholds to apply quality flags to SUNA data. +#' @description Wrapper function. Uses thresholds to apply sensor-specific quality flags to SUNA data. +#' Measurements where the lamp has not had enough time to stabilze (nitrateLampStabilizeQF=1) are removed. #' -#' @param DirIn Character value. The file path to the input data and quality flag thresholds. +#' @param DirIn Character value. The base file path to the input data, QA/QC plausibility flags and quality flag thresholds. +#' +#' @param DirInAdditional Character value. The file path to the log file flags and calibration flags. #' -#' @param DirOutFlags Character value. The file path for the output data. +#' @param DirOut Character value. The base file path for the output data. +#' +#' @param SchmData (optional), A json-formatted character string containing the schema for the data file. +#' This should be the same for the input as the output. Only the number of rows of measurements should change. #' -#' @param SchmFlagsOut (optional), A json-formatted character string containing the schema for the output data -#' file. If this input is not provided, the output schema for the data will be the same as the input data -#' file. 
If a schema is provided, ENSURE THAT ANY PROVIDED OUTPUT SCHEMA FOR THE DATA MATCHES THE COLUMN ORDER OF -#' THE INPUT DATA. +#' @param SchmFlagsOut (optional), A json-formatted character string containing the schema for the output flags. #' #' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log #' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init #' for more details. #' -#' @return SUNA data with sensor-specific quality flags applied in daily parquets. +#' @return SUNA data file and combined flag file in daily parquets. #' #' @references #' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 @@ -28,134 +31,180 @@ #' #' @examples #' # Not run -# DirInData<-"~/pfs/sunav2_location_group_and_restructure/2024/09/10/CFGLOC110733/data" -# DirInThresholds<-"~/pfs/nitrate_thresh_select_ts_pad/2024/09/10/nitrate_CRAM103100/sunav2/CFGLOC110733/threshold" -# DirOutFlags<-"~/pfs/sunav2_sensor_specific_flags/2024/09/10/CFGLOC110733/flags/" -# SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_sensor_specific_flags.avsc'),collapse='') +# DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620" +# DirInAdditional<-"~/pfs/nitrate_group_path/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620/flags" +# DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620" +# SchmData<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_logfilled.avsc'),collapse='') +# SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_all_flags.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") #' -#' -#' -#' #' #' @changelog -#' Bobby Hensley (2025-08-30) created +#' Bobby Hensley (2025-08-30) +#' Initial creation. 
+#' +#' Bobby Hensley (2025-09-18) +#' Updated so that measurements prior to lamp stabilization (never intended to be +#' used in downstream pipeline) are removed. #' ############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, + DirInAdditional, DirOut, + SchmData=NULL, SchmFlagsOut=NULL, log=NULL ){ - #' Start logging if not already + #' Start logging if not already. if(base::is.null(log)){ log <- NEONprocIS.base::def.log.init() } - InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) DirInData <- paste0(DirIn,"/data") + DirInPlaus <- paste0(DirIn,"/flags") DirInThresholds <- paste0(DirIn,"/threshold") - DirOutFlags <- base::paste0(DirOut,InfoDirIn$dirRepo,'/flags') + DirOutData <- base::paste0(DirOut,"/data") + DirOutFlags <- base::paste0(DirOut,"/flags") - #' Read in parquet file of SUNA data + #' Read in parquet file of SUNA data. dataFileName<-base::list.files(DirInData,full.names=FALSE) sunaData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInData, '/', dataFileName), log = log),silent = FALSE) - #' Convert measurements to be tested from class character to numeric + #' Read in parquet file of QAQC plausibility flags. + plausFileName<-base::list.files(DirInPlaus,full.names=FALSE) + plausFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInPlaus, '/', plausFileName), + log = log),silent = FALSE) + + #' Read in parquet file of calibration flags. + calFileName<-grep("flagsCal",base::list.files(DirInAdditional,full.names=FALSE),value=TRUE) + calFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInAdditional, '/', calFileName), + log = log),silent = FALSE) + + #' Read in parquet file of logged file flags. 
+ logFileName<-grep("logFlags",base::list.files(DirInAdditional,full.names=FALSE),value=TRUE) + logFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInAdditional, '/', logFileName), + log = log),silent = FALSE) + + #' Convert measurements to be tested from class character to numeric. sunaData$relative_humidity<-as.numeric(sunaData$relative_humidity) sunaData$lamp_temperature<-as.numeric(sunaData$lamp_temperature) sunaData$spec_average<-as.numeric(sunaData$spec_average) sunaData$dark_signal_average<-as.numeric(sunaData$dark_signal_average) - #' Create data frame of input file readout_times to serve as basis of output flag file - flagFile<-as.data.frame(sunaData$readout_time) - colnames(flagFile)<-c("readout_time") + #' Create data frame of input data file readout_times to serve as basis of sensor specific flag file. + sensorFlags<-as.data.frame(sunaData$readout_time) + colnames(sensorFlags)<-c("readout_time") - #' Read in json file of quality flag thresholds + #' Read in json file of quality flag thresholds. thresholdFileName<-base::list.files(DirInThresholds,full.names=FALSE) sunaThresholds<-base::try(NEONprocIS.qaqc::def.read.thsh.qaqc.df(NameFile = base::paste0(DirInThresholds, '/', thresholdFileName)),silent = FALSE) - #' Perform internal humidity test + #' Perform internal humidity test. 
humidityThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Maximum Internal humidity"),] maxHumidity<-humidityThreshold$number_value - flagFile$nitrateHumidityQF<-NA + sensorFlags$nitrateHumidityQF<-NA for(i in 1:nrow(sunaData)){ if(is.na(sunaData[i,which(colnames(sunaData)=='relative_humidity')])){ - flagFile[i,which(colnames(flagFile)=='nitrateHumidityQF')]=-1} + sensorFlags[i,which(colnames(sensorFlags)=='nitrateHumidityQF')]=-1} if(!is.na(sunaData[i,which(colnames(sunaData)=='relative_humidity')])){ if(sunaData[i,which(colnames(sunaData)=='relative_humidity')]>maxHumidity){ - flagFile[i,which(colnames(flagFile)=='nitrateHumidityQF')]=1} - else{flagFile[i,which(colnames(flagFile)=='nitrateHumidityQF')]=0}} + sensorFlags[i,which(colnames(sensorFlags)=='nitrateHumidityQF')]=1} + else{sensorFlags[i,which(colnames(sensorFlags)=='nitrateHumidityQF')]=0}} } - #' Perform lamp temperature test (New condition need to be created. Using default for now) + #' Perform lamp temperature test (New condition need to be created. Using default for now). 
# lampTempThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Maximum Lamp Temperature"),] # maxLampTemp<-lampTempThreshold$number_value maxLampTemp=35 - flagFile$nitrateLampTempQF<-NA + sensorFlags$nitrateLampTempQF<-NA for(i in 1:nrow(sunaData)){ if(is.na(sunaData[i,which(colnames(sunaData)=='lamp_temperature')])){ - flagFile[i,which(colnames(flagFile)=='nitrateLampTempQF')]=-1} + sensorFlags[i,which(colnames(sensorFlags)=='nitrateLampTempQF')]=-1} if(!is.na(sunaData[i,which(colnames(sunaData)=='lamp_temperature')])){ if(sunaData[i,which(colnames(sunaData)=='lamp_temperature')]>maxLampTemp){ - flagFile[i,which(colnames(flagFile)=='nitrateLampTempQF')]=1} - else{flagFile[i,which(colnames(flagFile)=='nitrateLampTempQF')]=0}} + sensorFlags[i,which(colnames(sensorFlags)=='nitrateLampTempQF')]=1} + else{sensorFlags[i,which(colnames(sensorFlags)=='nitrateLampTempQF')]=0}} } - #' Perform light to dark spectral ratio test + #' Perform light to dark spectral ratio test. spectralRatioThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Minimum Light to Dark Spec Average Ratio"),] minLightDarkRatio<-spectralRatioThreshold$number_value - flagFile$nitrateLightDarkRatioQF<-NA + sensorFlags$nitrateLightDarkRatioQF<-NA for(i in 1:nrow(sunaData)){ if(is.na(sunaData[i,which(colnames(sunaData)=='dark_signal_average')])|is.na(sunaData[i,which(colnames(sunaData)=='spec_average')])){ - flagFile[i,which(colnames(flagFile)=='nitrateLightDarkRatioQF')]=-1} + sensorFlags[i,which(colnames(sensorFlags)=='nitrateLightDarkRatioQF')]=-1} if(!is.na(sunaData[i,which(colnames(sunaData)=='dark_signal_average')])&!is.na(sunaData[i,which(colnames(sunaData)=='spec_average')])){ if(sunaData[i,which(colnames(sunaData)=='spec_average')]/sunaData[i,which(colnames(sunaData)=='dark_signal_average')] Date: Mon, 22 Sep 2025 13:01:12 -0600 Subject: [PATCH 098/182] latest --- .../flow.sunav2.quality.flags.R | 42 ++++++------- .../wrap.sunav2.quality.flags.R | 59 
+++++++++++++------ .../nitrate_analyze_pad_and_qaqc_plau.yaml | 3 +- .../nitrate/nitrate_thresh_select_ts_pad.yaml | 4 +- ...v2_cron_daily_and_date_control_logjam.yaml | 44 -------------- .../sunav2_logjam_assign_clean_files.yaml | 2 +- 6 files changed, 62 insertions(+), 92 deletions(-) delete mode 100644 pipe/sunav2/sunav2_cron_daily_and_date_control_logjam.yaml diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R index 621b3709f..56956c2bf 100644 --- a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -12,16 +12,14 @@ #' 1. "DirIn=value", The base file path to the input data, QA/QC plausibility flags and quality flag thresholds. #' #/pfs/BASE_REPO/date/location/sunav2/cfgloc, where files will then be in /data, /flags and /threshold sub-folders. #' -#' 2. "DirInAdditional=value", The file path to the log file flags and calibration flags. -#' #' 2. "DirOut=value", The base file path for the output data. #' #' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will #' replace the #/pfs/BASE_REPO portion of \code{DirIn}. #' -#' 4. "SchmData=value" (optional), The avro schema for the input and output data file. +#' 4. "FileSchmData=value" (optional), The avro schema for the input and output data file. #' -#' 5. "SchmFlagsOut=value" (optional), The avro schema for the combined flag file. +#' 5. "FileSchmQf=value" (optional), The avro schema for the combined flag file. 
#' #' #' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, @@ -42,10 +40,9 @@ #' Stepping through the code in R studio # Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -arg <- c("DirIn=~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620", - "DirInAdditional=~/pfs/nitrate_group_path/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620/flags", - "DirOut=~/pfs/out", - "DirErr=~/pfs/out/errored_datums") +# arg <- c("DirIn=~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620", +# "DirOut=~/pfs/out", +# "DirErr=~/pfs/out/errored_datums") #' rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently @@ -85,37 +82,33 @@ if(numCoreUse > numCoreAvail){ log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) # Parse the input arguments into parameters -Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn", "DirInAdditional","DirOut","DirErr"), - NameParaOptn = c("SchmData","SchmFlagsOut"),log = log) +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","DirOut","DirErr"), + NameParaOptn = c("FileSchmData","FileSchmQf"),log = log) # Echo arguments log$debug(base::paste0('Input data directory: ', Para$DirIn)) -log$debug(base::paste0('Additional input data directory: ', Para$DirInAdditional)) log$debug(base::paste0('Output directory: ', Para$DirOut)) log$debug(base::paste0('Error directory: ', Para$DirErr)) -log$debug(base::paste0('Schema for output data: ', Para$SchmData)) -log$debug(base::paste0('Schema for output flags: ', Para$SchmFlagsOut)) +log$debug(base::paste0('Schema for output data: ', Para$FileSchmData)) +log$debug(base::paste0('Schema for output flags: ', Para$FileSchmQf)) # Read in the schemas so we only have to do it once and not 
every time in the avro writer. -if(base::is.null(Para$SchmData) || Para$SchmData == 'NA'){ - SchmData <- NULL +if(base::is.null(Para$FileSchmData) || Para$FileSchmData == 'NA'){ + SchmDataOut <- NULL } else { - SchmData <- base::paste0(base::readLines(Para$SchmData),collapse='') + SchmDataOut <- base::paste0(base::readLines(Para$FileSchmData),collapse='') } -if(base::is.null(Para$SchmFlagsOut) || Para$SchmFlagsOut == 'NA'){ +if(base::is.null(Para$FileSchmQf) || Para$FileSchmQf == 'NA'){ SchmFlagsOut <- NULL } else { - SchmFlagsOut <- base::paste0(base::readLines(Para$SchmFlagsOut),collapse='') + SchmFlagsOut <- base::paste0(base::readLines(Para$FileSchmQf),collapse='') } + # Find all the input paths (datums). We will process each one. DirIn <- NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, - nameDirSub = 'data', - log = log) -DirInAdditional <- - NEONprocIS.base::def.dir.in(DirBgn = Para$DirInAdditional, - nameDirSub = 'flags', + nameDirSub = c('data','flags'), log = log) # Process each datum path @@ -128,7 +121,8 @@ foreach::foreach(idxFileIn = DirIn) %dopar% { wrap.sunav2.quality.flags( DirIn=idxFileIn, DirOut=Para$DirOut, - SchmFlagsOut=FileSchmQf, + SchmDataOut=SchmDataOut, + SchmFlagsOut=SchmFlagsOut, log=log ), error = function(err) { diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 3db9531e4..4bb692e81 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -8,12 +8,10 @@ #' Measurements where the lamp has not had enough time to stabilze (nitrateLampStabilizeQF=1) are removed. #' #' @param DirIn Character value. The base file path to the input data, QA/QC plausibility flags and quality flag thresholds. -#' -#' @param DirInAdditional Character value. The file path to the log file flags and calibration flags. #' #' @param DirOut Character value. The base file path for the output data. 
#' -#' @param SchmData (optional), A json-formatted character string containing the schema for the data file. +#' @param SchmDataOut (optional), A json-formatted character string containing the schema for the data file. #' This should be the same for the input as the output. Only the number of rows of measurements should change. #' #' @param SchmFlagsOut (optional), A json-formatted character string containing the schema for the output flags. @@ -32,7 +30,7 @@ #' @examples #' # Not run # DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620" -# DirInAdditional<-"~/pfs/nitrate_group_path/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620/flags" +# DirIn<-"~/pfs/nitrate_group_path/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620/flags" # DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620" # SchmData<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_logfilled.avsc'),collapse='') # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_all_flags.avsc'),collapse='') @@ -49,9 +47,8 @@ #' ############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, - DirInAdditional, DirOut, - SchmData=NULL, + SchmDataOut=NULL, SchmFlagsOut=NULL, log=NULL ){ @@ -62,30 +59,54 @@ wrap.sunav2.quality.flags <- function(DirIn, } DirInData <- paste0(DirIn,"/data") - DirInPlaus <- paste0(DirIn,"/flags") + DirInFlags <- paste0(DirIn,"/flags") DirInThresholds <- paste0(DirIn,"/threshold") DirOutData <- base::paste0(DirOut,"/data") DirOutFlags <- base::paste0(DirOut,"/flags") #' Read in parquet file of SUNA data. 
dataFileName<-base::list.files(DirInData,full.names=FALSE) - sunaData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInData, '/', dataFileName), - log = log),silent = FALSE) + if(length(dataFileName)==0){ + log$error(base::paste0('Data file not found in ', DirInData)) + stop() + } else { + sunaData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInData, '/', dataFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',dataFileName)) + } #' Read in parquet file of QAQC plausibility flags. - plausFileName<-base::list.files(DirInPlaus,full.names=FALSE) - plausFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInPlaus, '/', plausFileName), - log = log),silent = FALSE) + plausFileName<-grep("flagsPlaus",base::list.files(DirInFlags,full.names=FALSE),value=TRUE) + if(length(plausFileName)==0){ + log$error(base::paste0('Plausibility flags not found in ', DirInFlags)) + stop() + } else { + plausFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInFlags, '/', plausFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',plausFileName)) + } #' Read in parquet file of calibration flags. - calFileName<-grep("flagsCal",base::list.files(DirInAdditional,full.names=FALSE),value=TRUE) - calFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInAdditional, '/', calFileName), - log = log),silent = FALSE) + calFileName<-grep("flagsCal",base::list.files(DirInFlags,full.names=FALSE),value=TRUE) + if(length(calFileName)==0){ + log$error(base::paste0('Calibration flags not found in ', DirInFlags)) + stop() + } else { + calFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInFlags, '/', calFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',calFileName)) + } #' Read in parquet file of logged file flags. 
- logFileName<-grep("logFlags",base::list.files(DirInAdditional,full.names=FALSE),value=TRUE) - logFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInAdditional, '/', logFileName), - log = log),silent = FALSE) + logFileName<-grep("logFlags",base::list.files(DirInFlags,full.names=FALSE),value=TRUE) + if(length(calFileName)==0){ + log$error(base::paste0('Log flags not found in ', DirInFlags)) + stop() + } else { + logFlags<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInFlags, '/', logFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',logFileName)) + } #' Convert measurements to be tested from class character to numeric. sunaData$relative_humidity<-as.numeric(sunaData$relative_humidity) @@ -183,7 +204,7 @@ wrap.sunav2.quality.flags <- function(DirIn, base::dir.create(DirOutData,recursive=TRUE) rptOutData <- try(NEONprocIS.base::def.wrte.parq(data = sunaData, NameFile = base::paste0(DirOutData,'/',dataFileName), - Schm = SchmData),silent=TRUE) + Schm = SchmDataOut),silent=TRUE) if(class(rptOutData)[1] == 'try-error'){ log$error(base::paste0('Cannot write Data to ',base::paste0(DirOutData,'/',dataFileName,".parquet"),'. 
',attr(rptOutData, "condition"))) stop() diff --git a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml index d2119c317..a4f09ac21 100644 --- a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml +++ b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml @@ -33,8 +33,7 @@ transform: DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ "TermTest1=nitrate:range|step|persistence|spike" \ - DirSubCopy=threshold - + "DirSubCopy=threshold|flags" EOF env: # Environment variables for padded timeseries analyzer diff --git a/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml b/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml index c95ff3e9b..a15e3a4de 100644 --- a/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml +++ b/pipe/nitrate/nitrate_thresh_select_ts_pad.yaml @@ -22,7 +22,7 @@ transform: DirErr=/pfs/out/errored_datums \ FileThsh=$FILE_THRESHOLDS \ "TermCtxt1=nitrate" \ - "DirSubCopy=location|data" + "DirSubCopy=location|data|flags" # Copy output to another interim folder to destroy links (cannot daisy chain links from pfs input to output) cp -rL /tmp/threshold_select /tmp/threshold_selectCopy || : # Allow to fail without exit code (happens if step above produced no output) || : # Allow to fail without exit code (happens if step above produced no output) rm -r -f /tmp/threshold_select @@ -34,7 +34,7 @@ transform: OUT_PATH: /pfs/out LOG_LEVEL: INFO PAD_DIR: data - COPY_DIR: none # Can be multiple, separated by commas without spaces. Directories other than the pad directory and threshold directory to copy to the output (e.g. location,flags). Set to something like 'none' if none other are desired. + COPY_DIR: flags # Can be multiple, separated by commas without spaces. Directories other than the pad directory and threshold directory to copy to the output (e.g. location,flags). Set to something like 'none' if none other are desired. 
RELATIVE_PATH_INDEX: '3' PARALLELIZATION_INTERNAL: '1' # For threshold select module output_branch: master diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control_logjam.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control_logjam.yaml deleted file mode 100644 index 84df5e383..000000000 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control_logjam.yaml +++ /dev/null @@ -1,44 +0,0 @@ ---- -pipeline: - name: sunav2_cron_daily_and_date_control_logjam -transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-date-cntl:v2.0.1 - cmd: ["/bin/bash"] - env: - # START_DATE ("YYYY-MM-DD") and END_DATE ("YYYY-MM-DD") indicate the max date range (inclusive) to create the /Y/M/D folder structure - # If START_DATE is not set (remove line entirely to unset), the start_date and/or the kafka_start_date for each site will be used, as indicated in the site-list json file - # start_date field in the site-list file is the earliest date to pull data from a site - # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka - # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. - OUT_PATH: /pfs/out - START_DATE: "2025-06-19" # Inclusive - END_DATE: "2025-06-29" # Inclusive - SOURCE_TYPE: "sunav2" - stdin: - - "#!/bin/bash" - - python3 -m cron_daily_and_date_control.cron_daily_and_date_control_main -input: - cross: - # This cron is the central driver for daily scheduled updates, such as data ingest and metadata loaders. 
- - cron: - name: tick - spec: "@never" - spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) - overwrite: true - - pfs: - name: SITE_FILE - repo: sunav2_site_list - glob: /site-list.json -resource_requests: - memory: 100M - cpu: 1 -resource_limits: - memory: 300M - cpu: 1.5 -sidecar_resource_requests: - memory: 500M - cpu: 0.5 -autoscaling: true -scheduling_spec: - node_selector: - cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml index 7776659db..31e310c02 100644 --- a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml +++ b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml @@ -11,7 +11,7 @@ transform: DirOut=/pfs/out DirErr=$ERR_PATH FileSchmData=$FILE_SCHEMA_DATA - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:sha-a1ff444 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:sha-4b4f1ce # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: From b90a00b3c7013a0cc122b98a22130acb40f6f50f Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 22 Sep 2025 13:08:19 -0600 Subject: [PATCH 099/182] image update --- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index 373ddd34b..b307aec0f 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-8b0b9a2 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-054e370 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] From 17df9ef212a72a1499d5917c1a26135616d35fa9 Mon Sep 17 00:00:00 2001 
From: ncatolico Date: Mon, 22 Sep 2025 13:37:16 -0600 Subject: [PATCH 100/182] latest --- .../flow.sunav2.quality.flags.R | 6 +++++- .../wrap.sunav2.quality.flags.R | 10 ++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R index 56956c2bf..3f7ce2b5e 100644 --- a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -54,7 +54,11 @@ #' Bobby Hensley (2025-09-18) #' Updated so that measurements prior to lamp stabilization (never intended to be #' used in downstream pipeline) are removed. +#' +#' Nora Catolico (2025-09-22) +#' combined input df and updated error logging # + ############################################################################################## options(digits.secs = 3) library(foreach) @@ -120,7 +124,7 @@ foreach::foreach(idxFileIn = DirIn) %dopar% { withCallingHandlers( wrap.sunav2.quality.flags( DirIn=idxFileIn, - DirOut=Para$DirOut, + DirOutBase=Para$DirOut, SchmDataOut=SchmDataOut, SchmFlagsOut=SchmFlagsOut, log=log diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 4bb692e81..2dbaf5490 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -9,7 +9,7 @@ #' #' @param DirIn Character value. The base file path to the input data, QA/QC plausibility flags and quality flag thresholds. #' -#' @param DirOut Character value. The base file path for the output data. +#' @param DirOutBase Character value. The base file path for the output data. #' #' @param SchmDataOut (optional), A json-formatted character string containing the schema for the data file. #' This should be the same for the input as the output. Only the number of rows of measurements should change. 
@@ -47,7 +47,7 @@ #' ############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, - DirOut, + DirOutBase, SchmDataOut=NULL, SchmFlagsOut=NULL, log=NULL @@ -58,11 +58,15 @@ wrap.sunav2.quality.flags <- function(DirIn, log <- NEONprocIS.base::def.log.init() } + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) DirInData <- paste0(DirIn,"/data") DirInFlags <- paste0(DirIn,"/flags") DirInThresholds <- paste0(DirIn,"/threshold") + DirOut <- base::paste0(DirOutBase,InfoDirIn$dirRepo) DirOutData <- base::paste0(DirOut,"/data") + base::dir.create(DirOutData,recursive=TRUE) DirOutFlags <- base::paste0(DirOut,"/flags") + base::dir.create(DirOutFlags,recursive=TRUE) #' Read in parquet file of SUNA data. dataFileName<-base::list.files(DirInData,full.names=FALSE) @@ -201,7 +205,6 @@ wrap.sunav2.quality.flags <- function(DirIn, sunaData<-sunaData[,c(2,3,1,4:37)] #' Write out data file. - base::dir.create(DirOutData,recursive=TRUE) rptOutData <- try(NEONprocIS.base::def.wrte.parq(data = sunaData, NameFile = base::paste0(DirOutData,'/',dataFileName), Schm = SchmDataOut),silent=TRUE) @@ -213,7 +216,6 @@ wrap.sunav2.quality.flags <- function(DirIn, } #' Write out flags file. 
- base::dir.create(DirOutFlags,recursive=TRUE) allFlagFileName<-paste0(stringr::str_remove(dataFileName,".parquet"),'_all_flags') rptOutFlags <- try(NEONprocIS.base::def.wrte.parq(data = allFlags, From b4d82d2d5044e1729543105f26b112d0c58e25e6 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 22 Sep 2025 13:42:17 -0600 Subject: [PATCH 101/182] update image --- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index b307aec0f..b723d3f7e 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-054e370 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-17df9ef # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] From ca8ce76748f6a8c4fd9e88942c844fa94ec0b13b Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Mon, 22 Sep 2025 14:54:35 -0600 Subject: [PATCH 102/182] Added check that output data and flags have same number of measuremnts. --- .../wrap.sunav2.quality.flags.R | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 2dbaf5490..ab2eccde8 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -9,7 +9,7 @@ #' #' @param DirIn Character value. The base file path to the input data, QA/QC plausibility flags and quality flag thresholds. #' -#' @param DirOutBase Character value. The base file path for the output data. +#' @param DirOut Character value. The base file path for the output data. 
#' #' @param SchmDataOut (optional), A json-formatted character string containing the schema for the data file. #' This should be the same for the input as the output. Only the number of rows of measurements should change. @@ -30,9 +30,8 @@ #' @examples #' # Not run # DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620" -# DirIn<-"~/pfs/nitrate_group_path/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620/flags" # DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620" -# SchmData<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_logfilled.avsc'),collapse='') +# SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_logfilled.avsc'),collapse='') # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_all_flags.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") #' @@ -45,6 +44,10 @@ #' Updated so that measurements prior to lamp stabilization (never intended to be #' used in downstream pipeline) are removed. #' +#' Bobby Hensley (2025-09-22) +#' Updated to use single input directory and added check that data and flag file +#' have same number of measurements. 
+#' ############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, DirOutBase, @@ -62,7 +65,6 @@ wrap.sunav2.quality.flags <- function(DirIn, DirInData <- paste0(DirIn,"/data") DirInFlags <- paste0(DirIn,"/flags") DirInThresholds <- paste0(DirIn,"/threshold") - DirOut <- base::paste0(DirOutBase,InfoDirIn$dirRepo) DirOutData <- base::paste0(DirOut,"/data") base::dir.create(DirOutData,recursive=TRUE) DirOutFlags <- base::paste0(DirOut,"/flags") @@ -204,6 +206,14 @@ wrap.sunav2.quality.flags <- function(DirIn, sunaData<-sunaData[,-which(colnames(sunaData)=='nitrateLampStabilizeQF')] sunaData<-sunaData[,c(2,3,1,4:37)] + #' Checks that data file and flag file have same number of measurements + if(nrow(sunaData) != nrow(allFlags)){ + log$error(base::paste0('Error: Data and flags have different number of measuremnts')) + stop() + } else { + log$info(base::paste0('Data and flags have same number of measurements')) + } + #' Write out data file. 
rptOutData <- try(NEONprocIS.base::def.wrte.parq(data = sunaData, NameFile = base::paste0(DirOutData,'/',dataFileName), From 03d73eb466f9a1a221b90dec478bcc2d93f85c04 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 22 Sep 2025 15:42:46 -0600 Subject: [PATCH 103/182] updated image --- pipe/nitrate/nitrate_flags_specific.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index b723d3f7e..403437653 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-17df9ef + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-ca8ce76 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] @@ -25,7 +25,7 @@ input: - pfs: name: SCHEMA_FLAGS repo: sunav2_avro_schemas - glob: /sunav2/sunav2_sensor_specific_flags.avsc + glob: /sunav2/sunav2_all_flags.avsc parallelism_spec: constant: 6 autoscaling: true From 8c3129c84982b7bc366a33bb19716d54a392dcd3 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Fri, 17 Oct 2025 09:10:49 -0600 Subject: [PATCH 104/182] test image --- pipe/sunav2/sunav2_calibration_loader.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml index 6c2367985..1ae8a4d8b 100644 --- a/pipe/sunav2/sunav2_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_calibration_loader transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v2.3.3 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:sha-c694dca cmd: - sh - "-c" @@ -16,6 
+16,12 @@ transform: rm -rf $OUT_PATH mkdir -p $OUT_PATH # R modules must have pfs in the repo structure + # NOTE: sunav2_raw is the name for both the sensor_type and the avro_schema_name in the database + # This is because the airflow transitions use the sunav2_raw L0 data + # When Airflow transitions are no longer active, best to update the sensor_type + # to "sunav2" in the database and use the commented env vars in the typical + # workflow for this pipeline (i.e. not loading to a tempdir and renaming) + python3 -m calval_loader.load_all_calval_files #run the calibration loader if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then @@ -29,6 +35,8 @@ transform: LOG_LEVEL: INFO SOURCE_TYPE: "sunav2_raw" SOURCE_TYPE_OUT: "sunav2" + # SOURCE_TYPE: "sunav2" + # SCHEMA_NAME: "sunav2_raw" STARTING_PATH_INDEX: "5" secrets: - name: pdr-secret From 8fba6efef316f38e0a43c5041fbcffe001c5006f Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Fri, 17 Oct 2025 15:12:11 -0600 Subject: [PATCH 105/182] update with semver tag --- pipe/sunav2/sunav2_calibration_loader.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml index 1ae8a4d8b..7daab43a0 100644 --- a/pipe/sunav2/sunav2_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_calibration_loader transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:sha-c694dca + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v3.0.0 cmd: - sh - "-c" From eb554074c49717494fba8351d62569713aaf3b3c Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 20 Oct 2025 12:19:57 -0600 Subject: [PATCH 106/182] update expired image --- pipe/sunav2/sunav2_fill_log_files.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index 
eebd1fbd4..1516f4d7f 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-a9c9ab0 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-b95b90a cmd: - sh - "-c" From 26aeb64aae275eb41980f60de5729dcd024fb260 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Mon, 27 Oct 2025 17:24:09 -0600 Subject: [PATCH 107/182] update image version --- pipe/nitrate/nitrate_group_path.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/nitrate/nitrate_group_path.yaml b/pipe/nitrate/nitrate_group_path.yaml index 5c3360247..c4b3850d1 100644 --- a/pipe/nitrate/nitrate_group_path.yaml +++ b/pipe/nitrate/nitrate_group_path.yaml @@ -4,7 +4,7 @@ pipeline: transform: # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-group-path:v1.0.0 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-group-path:v1.0.1 cmd: - /bin/bash stdin: From 06aa4ba888de93c87b707b6e5ffba3353bd9b357 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 28 Oct 2025 13:30:42 -0600 Subject: [PATCH 108/182] image update --- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index 403437653..1b56ecb6a 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-ca8ce76 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-26aeb64 # 
image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] From 3780a7aa8b76e8b2cb08ba251bcbf02c5c299ab1 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 29 Oct 2025 08:23:54 -0600 Subject: [PATCH 109/182] minor fix --- flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R | 2 +- flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R index 3f7ce2b5e..2b69bd44e 100644 --- a/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/flow.sunav2.quality.flags.R @@ -40,7 +40,7 @@ #' Stepping through the code in R studio # Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- c("DirIn=~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620", +# arg <- c("DirIn=~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100", # "DirOut=~/pfs/out", # "DirErr=~/pfs/out/errored_datums") #' rm(list=setdiff(ls(),c('arg','log'))) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index ab2eccde8..69e778fc3 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -65,9 +65,9 @@ wrap.sunav2.quality.flags <- function(DirIn, DirInData <- paste0(DirIn,"/data") DirInFlags <- paste0(DirIn,"/flags") DirInThresholds <- paste0(DirIn,"/threshold") - DirOutData <- base::paste0(DirOut,"/data") + DirOutData <- base::paste0(DirOutBase,"/data") base::dir.create(DirOutData,recursive=TRUE) - DirOutFlags <- base::paste0(DirOut,"/flags") + DirOutFlags <- base::paste0(DirOutBase,"/flags") 
base::dir.create(DirOutFlags,recursive=TRUE) #' Read in parquet file of SUNA data. From 3d22ecf15e1e156c7d65dd3456dba1159e7d1c34 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 29 Oct 2025 08:54:34 -0600 Subject: [PATCH 110/182] latest --- flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 69e778fc3..c50bc9719 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -65,9 +65,10 @@ wrap.sunav2.quality.flags <- function(DirIn, DirInData <- paste0(DirIn,"/data") DirInFlags <- paste0(DirIn,"/flags") DirInThresholds <- paste0(DirIn,"/threshold") - DirOutData <- base::paste0(DirOutBase,"/data") + DirOut <- base::paste0(DirOutBase,InfoDirIn$dirRepo) + DirOutData <- base::paste0(DirOut,"/data") base::dir.create(DirOutData,recursive=TRUE) - DirOutFlags <- base::paste0(DirOutBase,"/flags") + DirOutFlags <- base::paste0(DirOut,"/flags") base::dir.create(DirOutFlags,recursive=TRUE) #' Read in parquet file of SUNA data. 
From 2b2d433809c87b6d48a14270035adac7d13cdd95 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 29 Oct 2025 12:02:52 -0600 Subject: [PATCH 111/182] latest --- .../wrap.sunav2.quality.flags.R | 12 ++++++++++-- pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index c50bc9719..8b3071c76 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -212,11 +212,19 @@ wrap.sunav2.quality.flags <- function(DirIn, log$error(base::paste0('Error: Data and flags have different number of measuremnts')) stop() } else { - log$info(base::paste0('Data and flags have same number of measurements')) + log$debug(base::paste0('Data and flags have same number of measurements')) } + #replace with NA's so that falgged data is excluded from averaging + dataOut<-merge(sunaData,allFlags,by='readout_time') + dataOut$nitrate[dataOut$nitrateHumidityQF==1]<-NA + dataOut$nitrate[dataOut$nitrateLampTempQF==1]<-NA + dataOut$nitrate[dataOut$nitrateLightDarkRatioQF==1]<-NA + dataOut$nitrate[dataOut$nitrateLampStabilizeQF==1]<-NA + dataOut<-dataOut[,which(colnames(dataOut)%in%colnames(sunaData))] + #' Write out data file. 
- rptOutData <- try(NEONprocIS.base::def.wrte.parq(data = sunaData, + rptOutData <- try(NEONprocIS.base::def.wrte.parq(data = dataOut, NameFile = base::paste0(DirOutData,'/',dataFileName), Schm = SchmDataOut),silent=TRUE) if(class(rptOutData)[1] == 'try-error'){ diff --git a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml index a4f09ac21..77f604354 100644 --- a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml +++ b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml @@ -32,7 +32,7 @@ transform: DirIn=/tmp/pfs/padded_analyzerCopy \ DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ - "TermTest1=nitrate:range|step|persistence|spike" \ + "TermTest1=nitrate:range(rmv)|step(rmv)|persistence(rmv)|spike(rmv)" \ "DirSubCopy=threshold|flags" EOF env: From d582b02b22540a496c835b661b9878ec538caf6d Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 29 Oct 2025 14:34:15 -0600 Subject: [PATCH 112/182] latest --- pipe/nitrate/nitrate_flags_specific.yaml | 6 +- pipe/nitrate/nitrate_null_gap_ucrt.yaml | 102 ++++++++++++++++++ .../nitrate/nitrate_qm_group_and_compute.yaml | 57 ++-------- .../nitrate_stats_group_and_compute.yaml | 88 +++++++++++++++ 4 files changed, 202 insertions(+), 51 deletions(-) create mode 100644 pipe/nitrate/nitrate_null_gap_ucrt.yaml create mode 100644 pipe/nitrate/nitrate_stats_group_and_compute.yaml diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index 1b56ecb6a..fc112b229 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-26aeb64 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-2b2d433 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] @@ -20,8 
+20,8 @@ input: - pfs: name: DIR_IN repo: nitrate_analyze_pad_and_qaqc_plau - # /source_type/YYYY/MM/DD - glob: /*/*/*/*/ + # /YYYY/MM/DD/group/sourcetype/CFGLOC + glob: /(*/*/*/*) - pfs: name: SCHEMA_FLAGS repo: sunav2_avro_schemas diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml new file mode 100644 index 000000000..86d784dea --- /dev/null +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -0,0 +1,102 @@ +--- +pipeline: + name: nitrate_null_gap_ucrt +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-levl1-grp-cons-srf:v2.2.1 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Run first module - filter-joiner (using environment variables below as input parameters) + python3 -m filter_joiner.filter_joiner_main + + EOF + env: + # Environment variables for 1st filter-joiner. Need to join by day again here because an outer join was used on + # these repos in order to pull them in with or without the SRF + CONFIG: | + --- + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. 
+ # Use unix-style glob pattern to select the desired directories in each repo + input_paths: + - path: + name: QUALITY_METRICS_PATH + # Filter for data directory + glob_pattern: /pfs/QUALITY_METRICS_PATH/*/*/*/*/** + # Join on Y/M/D/group ID + join_indices: [3,4,5,6] + - path: + name: STATISTICS_PATH + # Filter for data directory + glob_pattern: /pfs/STATISTICS_PATH/*/*/*/*/** + # Join on Y/M/D/group ID + join_indices: [3,4,5,6] + - path: + name: GROUP_PATH + # Grab group information + glob_pattern: /pfs/GROUP_PATH/*/*/*/*/group/** + # Join on Y/M/D/group ID + join_indices: [3,4,5,6] + - path: + name: GROUP_PATH + # Grab location information + glob_pattern: /pfs/GROUP_PATH/*/*/*/*/*/*/location/** + # Join on Y/M/D/group ID + join_indices: [3,4,5,6] + - path: + name: GROUP_PATH + # Grab location information + glob_pattern: /pfs/GROUP_PATH/*/*/*/*/*/*/uncertainty*/** + # Join on Y/M/D/group ID + join_indices: [3,4,5,6] + OUT_PATH: /pfs/out # Transfered to OUT_PATH for the first module + RELATIVE_PATH_INDEX: "3" # This is shared among the 2 filter joiners and consolidation module + LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined modules. 
Also shared with 2nd & 3rd modules + LOG_LEVEL: INFO # Shared among all modules + +input: + join: + - pfs: + name: QUALITY_METRICS_PATH + repo: nitrate_qm_group_and_compute + glob: /(*/*/*) + joinOn: $1 + outer_join: true # Need outer join to pull in with or without SRFs + empty_files: false # Make sure this is false for LINK_TYPE=COPY + - pfs: + name: STATISTICS_PATH + repo: nitrate_stats_group_and_compute + glob: /(*/*/*) + joinOn: $1 + outer_join: true # Need outer join to pull in with or without SRFs + empty_files: false # Make sure this is false for LINK_TYPE=COPY + - pfs: + name: GROUP_PATH + repo: nitrate_group_path + glob: /(*/*/*) + joinOn: $1 + outer_join: true # Need outer join to pull in with or without SRFs + empty_files: false # Make sure this is false for LINK_TYPE=COPY +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 1G + cpu: 2.2 +resource_limits: + memory: 2G + cpu: 3.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/nitrate/nitrate_qm_group_and_compute.yaml b/pipe/nitrate/nitrate_qm_group_and_compute.yaml index 2cb7ea5b2..e40600b9b 100644 --- a/pipe/nitrate/nitrate_qm_group_and_compute.yaml +++ b/pipe/nitrate/nitrate_qm_group_and_compute.yaml @@ -12,63 +12,24 @@ transform: set -euo pipefail IFS=$'\n\t' - # Refresh interim directories with each datum (otherwise they persist and cause probs) - rm -r -f /tmp/pfs/filter_joined - mkdir -p /tmp/pfs/filter_joined - - # ---- Run first module - filter-joiner (using environment variables below as input parameters) ---- - python3 -m filter_joiner.filter_joiner_main - - # ---- Run second module - quality metrics (averaged) ---- + # ---- Run module - quality metrics (averaged) ---- Rscript ./flow.qaqc.qm.R \ - DirIn=/tmp/pfs/filter_joined \ + DirIn=$QAQC_PLAUSIBILITY_PATH \ DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ "WndwAgr=015" \ 
"WghtAlphBeta=2|1" \ Thsh=0.2 \ - "GrpQfAlph1=nitrate:nitrateRangeQF|nitrateStepQF|nitrateSpikeQF|nitratePersistenceQF" \ - "GrpQfBeta1=nitrate:nitrateRangeQF|nitrateStepQF|nitratePersistenceQF" - EOF + "GrpQfAlph1=nitrate:nitrateRangeQF|nitrateStepQF|nitrateSpikeQF|nitratePersistenceQF|nitrateHumidityQF|nitrateLampTempQF|nitrateLightDarkRatioQF|nitrateLampStabilizeQF" \ + "GrpQfBeta1=nitrate:nitrateRangeQF|nitrateStepQF|nitrateSpikeQF" env: - # Environment variables for filter-joiner - CONFIG: | - --- - # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. - # Metadata indices will typically begin at index 3. - # Use unix-style glob pattern to select the desired directories in each repo - input_paths: - - path: - name: QAQC_PLAUSIBILITY_PATH - # Filter for flags directory - glob_pattern: /pfs/QAQC_PLAUSIBILITY_PATH/*/*/*/*/*/*/flags/** - # Join on named location (already joined below by day) - join_indices: [6] - - path: - name: FLAGS_PATH - # Filter for flags directory - glob_pattern: /pfs/REGULARIZED_FLAGS_PATH/*/*/*/*/*/*/flags/** - # Join on named location (already joined below by day) - join_indices: [6] - OUT_PATH: /tmp/pfs/filter_joined LOG_LEVEL: DEBUG - RELATIVE_PATH_INDEX: "3" - LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined module. 
- PARALLELIZATION_INTERNAL: '1' # Option for quality metrics module input: - join: - - pfs: - name: QAQC_PLAUSIBILITY_PATH - repo: nitrate_analyze_pad_and_qaqc_plau - glob: /(*/*/*) - joinOn: $1 - empty_files: false # Make sure this is false for LINK_TYPE=COPY - - pfs: - name: FLAGS_PATH - repo: nitrate_group_path - glob: /(*/*/*) - joinOn: $1 - empty_files: false # Make sure this is false for LINK_TYPE=COPY + cross: + - pfs: + name: QAQC_PLAUSIBILITY_PATH + repo: nitrate_flags_specific + glob: /*/*/*/* parallelism_spec: constant: 5 autoscaling: true diff --git a/pipe/nitrate/nitrate_stats_group_and_compute.yaml b/pipe/nitrate/nitrate_stats_group_and_compute.yaml new file mode 100644 index 000000000..aa25553c6 --- /dev/null +++ b/pipe/nitrate/nitrate_stats_group_and_compute.yaml @@ -0,0 +1,88 @@ +--- +pipeline: + name: nitrate_stats_group_and_compute +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-stat-basc-grp:v2.0.2 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -r -f /tmp/pfs/filter_joined + mkdir -p /tmp/pfs/filter_joined + # Run first module - filter-joiner (using environment variables below as input parameters) + python3 -m filter_joiner.filter_joiner_main + # Run second module - basic stats + Rscript ./flow.stat.basc.R \ + DirIn=/tmp/pfs/filter_joined \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + WndwAgr=015 \ + "TermStat1=nitrate:mean|minimum|maximum|variance|numPts|stdEr" + EOF + env: + # Environment variables for filter-joiner + CONFIG: | + --- + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. 
+ # Use unix-style glob pattern to select the desired directories in each repo + input_paths: + - path: + name: QAQC_PATH + # Filter for data & uncertainty_data directories + glob_pattern: /pfs/QAQC_PATH/*/*/*/*/*/*/data/** + # Join on named location (already joined below by day) + join_indices: [6] + - path: + name: UNCERTAINTY_PATH + # Filter for data directory + glob_pattern: /pfs/UNCERTAINTY_PATH/*/*/*/*/*/*/uncertainty*/** + # Join on named location (already joined below by day) + join_indices: [6] + OUT_PATH: /tmp/pfs/filter_joined + LOG_LEVEL: INFO + RELATIVE_PATH_INDEX: "3" + LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined module. + # Environment variables for calibration module + PARALLELIZATION_INTERNAL: '5' # Option for stats module +input: + cross: + # - pfs: + # name: FILE_SCHEMA_STATS + # repo: nitrate_avro_schemas + # glob: /nitrate/nitrate_dp01_stats.avsc + - join: + - pfs: + name: QAQC_PATH + repo: nitrate_flags_specific + glob: /(*/*/*) + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY + - pfs: + name: UNCERTAINTY_PATH + repo: nitrate_group_path + glob: /(*/*/*) + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY +parallelism_spec: + constant: 5 +resource_requests: + memory: 1.8G + cpu: 6 +resource_limits: + memory: 3G + cpu: 7 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +autoscaling: true +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class From 1485cc0f2ab08f15aed46388cac6762764caab5a Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Thu, 30 Oct 2025 09:56:36 -0600 Subject: [PATCH 113/182] Updated to revert plausibility flags at end of each burst. 
--- .../wrap.sunav2.quality.flags.R | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index ab2eccde8..8e246ad64 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -144,7 +144,7 @@ wrap.sunav2.quality.flags <- function(DirIn, #' Perform lamp temperature test (New condition need to be created. Using default for now). # lampTempThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Maximum Lamp Temperature"),] # maxLampTemp<-lampTempThreshold$number_value - maxLampTemp=35 + maxLampTemp=35 #' Hard-coded until thresholds are updated. sensorFlags$nitrateLampTempQF<-NA for(i in 1:nrow(sunaData)){ if(is.na(sunaData[i,which(colnames(sunaData)=='lamp_temperature')])){ @@ -173,7 +173,7 @@ wrap.sunav2.quality.flags <- function(DirIn, #' Identifies light measurement number within burst and performs lamp stabilization test. # lampStabilizeThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Lamp Stabilization Points"),] # lampStabilizePoints<-lampStabilizeThreshold$number_value - lampStabilizePoints=9 + lampStabilizePoints=9 #' Hard-coded until thresholds are updated. sensorFlags$burstNumber<-0 #' Assumes each burst starts with a dark measurement. for(i in 2:nrow(sunaData)){ if(is.na(sunaData[i,which(colnames(sunaData)=='light_dark_frame')])){ @@ -189,13 +189,24 @@ wrap.sunav2.quality.flags <- function(DirIn, if(sensorFlags[i,which(colnames(sensorFlags)=='burstNumber')]<=lampStabilizePoints){ sensorFlags[i,which(colnames(sensorFlags)=='nitrateLampStabilizeQF')]=1} } - sensorFlags<-sensorFlags[,-which(colnames(sensorFlags)=='burstNumber')] #' Drops this column since it's no longer needed. #' Combines all flags into a single file. 
allFlags<-base::merge(plausFlags,sensorFlags) allFlags<-base::merge(allFlags,calFlags) allFlags<-base::merge(allFlags,logFlags) + #' Revert plausibility flags for last measurement of each burst to prevent over-flagging. + #' (Plausibility tests were run across bursts, where the time step is much larger than between measuremnts within bursts) + for(i in 3:nrow(allFlags)){ + if((allFlags[i,which(colnames(allFlags)=='burstNumber')]==0)&(allFlags[i-2,which(colnames(allFlags)=='nitrateStepQF')]==0)){ + allFlags[i-1,which(colnames(allFlags)=='nitrateStepQF')]=0} + } + for(i in 3:nrow(allFlags)){ + if((allFlags[i,which(colnames(allFlags)=='burstNumber')]==0)&(allFlags[i-2,which(colnames(allFlags)=='nitratePersistenceQF')]==0)){ + allFlags[i-1,which(colnames(allFlags)=='nitratePersistenceQF')]=0} + } + allFlags<-allFlags[,-which(colnames(allFlags)=='burstNumber')] #' Drops this column since it's no longer needed. + #' Removes all measurements where lamp has not stabilized from data and flag files. lampStabilizeFlagsOnly<-sensorFlags[,c("readout_time","nitrateLampStabilizeQF")] sunaData<-base::merge(sunaData,lampStabilizeFlagsOnly) #' Adds lamp stabilize QF to data file From b7136856ad114dd7bcdc09ba488640398ed90e88 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Thu, 30 Oct 2025 10:26:00 -0600 Subject: [PATCH 114/182] Updated change log. --- flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index ecccfc888..43f749113 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -46,7 +46,10 @@ #' #' Bobby Hensley (2025-09-22) #' Updated to use single input directory and added check that data and flag file -#' have same number of measurements. +#' have same number of measurements. 
+#' +#' Bobby Hensley (2025-10-30) +#' Updated to revert over-flagged measuremnts at end of burst. #' ############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, From 4dc1f5040040689e65bf430ce2a3373012cfebda Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 30 Oct 2025 13:00:16 -0600 Subject: [PATCH 115/182] update image --- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index fc112b229..a0c158538 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-2b2d433 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-72c2253 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] From e28141b3740b8ca8b546396ae2f8fecf2460d2cf Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Fri, 31 Oct 2025 13:20:27 -0600 Subject: [PATCH 116/182] Created insufficient data quality flagging module. 
--- .../flow.sunav2.insufficient.data.R | 154 ++++++++++++++++++ .../wrap.sunav2.insufficient.data.R | 138 ++++++++++++++++ .../wrap.sunav2.quality.flags.R | 4 +- 3 files changed, 294 insertions(+), 2 deletions(-) create mode 100644 flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R create mode 100644 flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R diff --git a/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R b/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R new file mode 100644 index 000000000..f258c342a --- /dev/null +++ b/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R @@ -0,0 +1,154 @@ +############################################################################################## +#' @title Workflow for insufficient data calculations + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} + +#' @description Workflow. Uses number of measuremnts in averaging window to determine whether insufficient +#' data quality flag should be applied. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", The base file path to the statistics data (including number of points) and the QM data. +#' +#' 2. "minPoints=value", The minimum number of points required to not trigger the insufficient data quality flag. +#' Currently set in the yaml. +#' +#' 3. "DirOut=value", The base file path for the output data. +#' +#' 4. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of \code{DirIn}. +#' +#' 5. "SchmStats=value" (optional), The avro schema for the input and output stats file. +#' +#' 6. "SchmQMsOut=value" (optional), The avro schema for the updated QMs (insufficientDataQF added). +#' +#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. 
+#' +#' @return Updated stats and QMs data files in daily parquets. + +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' flow.sunav2.insufficient.data <- function(DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", +#' minPoints=10, +#' DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" , +#' SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse=''), +#' SchmQMsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse=''), +#' log=log) +#' Stepping through the code in R studio +# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", +# "DirOut=~/pfs/out", +# "DirErr=~/pfs/out/errored_datums") +#' rm(list=setdiff(ls(),c('arg','log'))) + +#' @seealso None currently + +# changelog and author contributions / copyrights +#' Bobby Hensley (2025-10-31) +#' Initial creation. + +############################################################################################## +options(digits.secs = 3) +library(foreach) +library(doParallel) +library(lubridate) + +# Source the wrapper function. 
Assume it is in the working directory +source("./wrap.sunav2.insufficient.data.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","numPoints","DirOut","DirErr"), + NameParaOptn = c("SchmStats","SchmQMsOut"),log = log) + +# Echo arguments +log$debug(base::paste0('Input data directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0('Schema for output stats: ', Para$SchmStats)) +log$debug(base::paste0('Schema for output QMs: ', Para$SchmQMsOut)) + +# Read in the schemas so we only have to do it once and not every time in the avro writer. +if(base::is.null(Para$SchmStats) || Para$SchmStats == 'NA'){ + SchmStats <- NULL +} else { + SchmStats <- base::paste0(base::readLines(Para$SchmStats),collapse='') +} +if(base::is.null(Para$SchmQMsOut) || Para$SchmQMsOut == 'NA'){ + SchmQMsOut <- NULL +} else { + SchmQMsOut <- base::paste0(base::readLines(Para$SchmQMsOut),collapse='') +} + + +# Find all the input paths (datums). We will process each one. 
+DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = c('stats','quality_metrics'), + log = log) + +# Process each datum path +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxFileIn = DirIn) %dopar% { + log$info(base::paste0('Processing path to file: ', idxFileIn)) + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.sunav2.quality.flags( + DirIn=idxFileIn, + DirOutBase=Para$DirOut, + SchmStats=SchmStats, + SchmQMsOut=SchmQMsOut, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + log$error(err$message) + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(idxFileIn, + log = log) + DirSub <- strsplit(InfoDirIn$dirRepo,".", fixed = TRUE)[[1]][1] + NEONprocIS.base::def.dir.crea(DirBgn = Para$DirErr, DirSub = DirSub, + log = log) + csvname <- DirSub %>% + strsplit( "/" ) %>% + sapply( tail, 1 ) + nameFileErr <- base::paste0(Para$DirErr, DirSub, "/",csvname) + log$info(base::paste0("Re-routing failed datum path to ", nameFileErr)) + con <- base::file(nameFileErr, "w") + base::close(con) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + return() +} + + + + diff --git a/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R b/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R new file mode 100644 index 000000000..9fae3dd94 --- /dev/null +++ b/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R @@ -0,0 +1,138 @@ +############################################################################################## +#' @title Wrapper for insufficient data calculations + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} +#' +#' @description Wrapper function. Determines the number of available measurements within an +#' averaging period, and whether an insufficient data quality flag should be applied. 
+#' +#' @param DirIn Character value. The base file path to the averaged stats and quality metrics. +#' +#' @param minPoints Character value. The minimum number of points required to not trigger the insufficient data quality flag. +#' +#' @param DirOut Character value. The base file path for the output data. +#' +#' @param SchmStats (optional), A json-formatted character string containing the schema for the output averaged stats parquet. +#' Should be the same as the input. +#' +#' @param SchmQMsOut (optional), A json-formatted character string containing the schema for the output quality metrics parquet +#' with insufficient data quality flag added. +#' +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. +#' +#' @return Averaged stats file and quality metric file in daily parquets. +#' +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 +#' +#' @keywords Currently none +#' +#' @examples +#' # Not run +# DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" +# minPoints=10 +# DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" +# SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse='') +# SchmQMsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse='') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +#' +#' +#' @changelog +#' Bobby Hensley (2025-10-31) +#' Initial creation. +#' +############################################################################################## +wrap.sunav2.insufficient.data <- function(DirIn, + minPoints, + DirOut, + SchmStatsOut=NULL, + SchmQMsOut=NULL, + log=NULL +){ + + #' Start logging if not already. 
+ if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) + DirInStats <- paste0(DirIn,"/stats") + DirInQMs <- paste0(DirIn,"/quality_metrics") + DirOutStats <- base::paste0(DirOut,"/stats") + base::dir.create(DirOutStats,recursive=TRUE) + DirOutQMs <- base::paste0(DirOut,"/quality_metrics") + base::dir.create(DirOutQMs,recursive=TRUE) + + #' Read in parquet file of averaged stats. + statsFileName<-base::list.files(DirInStats,full.names=FALSE) + if(length(statsFileName)==0){ + log$error(base::paste0('Stats file not found in ', DirInStats)) + stop() + } else { + statsData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInStats, '/', statsFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',statsFileName)) + } + + #' Read in parquet file of quality metrics. + qmFileName<-base::list.files(DirInQMs,full.names=FALSE) + if(length(qmFileName)==0){ + log$error(base::paste0('Quality metrics not found in ', DirInQMs)) + stop() + } else { + qmData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInQMs, '/', qmFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',qmFileName)) + } + + #' Identify the column name with the number of points and finalQF + ptsColName<-grep("NumPts",names(statsData),value=TRUE) + finalQfColName<-grep("FinalQF",names(qmData),value=TRUE) + + #' If the number of points is NA, set it to 0. + for(i in 1:nrow(statsData)){ + if(is.na(statsData[i,which(colnames(statsData)==ptsColName)])){ + statsData[i,which(colnames(statsData)==ptsColName)]=0}} + + #' If the number of points is greater than or equal to the minimum required, + #' revert the insufficient data quality flag (default is to apply it). 
+ qmData$insufficientDataQF=1 + for(i in 1:nrow(statsData)){ + if(statsData[i,which(colnames(statsData)==ptsColName)]>=minPoints){ + qmData[i,which(colnames(qmData)=='insufficientDataQF')]=0}} + + #' If the insufficient data quality flag has been applied, update the final quality flag. + for(i in 1:nrow(qmData)){ + if(qmData[i,which(colnames(qmData)=='insufficientDataQF')]==1){ + qmData[i,which(colnames(qmData)==finalQfColName)]=1}} + qmData <- qmData %>% dplyr::relocate(finalQfColName, .after = last_col()) #' Move finalQF back to the end + + #' Write out stats file. + rptOutStats <- try(NEONprocIS.base::def.wrte.parq(data = statsData, + NameFile = base::paste0(DirOutStats,'/',statsFileName), + Schm = NULL),silent=TRUE) + if(class(rptOutStats)[1] == 'try-error'){ + log$error(base::paste0('Cannot write updated stats to ',base::paste0(DirOutStats,'/',statsFileName,".parquet"),'. ',attr(rptOutStats, "condition"))) + stop() + } else { + log$info(base::paste0('Updated stats written successfully in ', base::paste0(DirOutStats,'/',statsFileName,".parquet"))) + } + + #' Write out QMs file. + rptOutQMs <- try(NEONprocIS.base::def.wrte.parq(data = qmData, + NameFile = base::paste0(DirOutQMs,'/',qmFileName), + Schm = SchmQMsOut),silent=TRUE) + if(class(rptOutQMs)[1] == 'try-error'){ + log$error(base::paste0('Cannot write updated QMs to ',base::paste0(DirOutQMs,'/',qmFileName,".parquet"),'. ',attr(rptOutFlags, "condition"))) + stop() + } else { + log$info(base::paste0('Updated QMs written successfully in ', base::paste0(DirOutQMs,'/',qmFileName,".parquet"))) + } + +} + + + diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 43f749113..392889f7e 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -5,7 +5,7 @@ #' Bobby Hensley \email{hensley@battelleecology.org} #' #' @description Wrapper function. 
Uses thresholds to apply sensor-specific quality flags to SUNA data. -#' Measurements where the lamp has not had enough time to stabilze (nitrateLampStabilizeQF=1) are removed. +#' Measurements where the lamp has not had enough time to stabilize (nitrateLampStabilizeQF=1) are removed. #' #' @param DirIn Character value. The base file path to the input data, QA/QC plausibility flags and quality flag thresholds. #' @@ -200,7 +200,7 @@ wrap.sunav2.quality.flags <- function(DirIn, allFlags<-base::merge(allFlags,logFlags) #' Revert plausibility flags for last measurement of each burst to prevent over-flagging. - #' (Plausibility tests were run across bursts, where the time step is much larger than between measuremnts within bursts) + #' (Plausibility tests were run across bursts, where the time step is much larger than between measurements within bursts) for(i in 3:nrow(allFlags)){ if((allFlags[i,which(colnames(allFlags)=='burstNumber')]==0)&(allFlags[i-2,which(colnames(allFlags)=='nitrateStepQF')]==0)){ allFlags[i-1,which(colnames(allFlags)=='nitrateStepQF')]=0} From d571c852f07f425eec31f4231b93497438a52ba6 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Fri, 31 Oct 2025 13:54:23 -0600 Subject: [PATCH 117/182] Updates to flow script. 
--- .../flow.sunav2.insufficient.data.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R b/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R index f258c342a..40bbf1151 100644 --- a/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R +++ b/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R @@ -42,12 +42,13 @@ #' SchmQMsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse=''), #' log=log) #' Stepping through the code in R studio -# Sys.setenv(DIR_IN='/home/NEON/ncatolico/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733') -# log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", -# "DirOut=~/pfs/out", -# "DirErr=~/pfs/out/errored_datums") -#' rm(list=setdiff(ls(),c('arg','log'))) +Sys.setenv(DIR_IN='/home/NEON/hensley/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733') +log <- NEONprocIS.base::def.log.init(Lvl = "debug") +arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", + "minPoints=10", + "DirOut=~/pfs/out", + "DirErr=~/pfs/out/errored_datums") + rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently @@ -82,7 +83,7 @@ if(numCoreUse > numCoreAvail){ log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) # Parse the input arguments into parameters -Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","numPoints","DirOut","DirErr"), +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","minPoints","DirOut","DirErr"), NameParaOptn = c("SchmStats","SchmQMsOut"),log = log) # Echo arguments @@ -118,9 +119,9 @@ foreach::foreach(idxFileIn = DirIn) %dopar% { # Run the wrapper 
function for each datum, with error routing tryCatch( withCallingHandlers( - wrap.sunav2.quality.flags( + wrap.sunav2.insufficient.data( DirIn=idxFileIn, - DirOutBase=Para$DirOut, + DirOut=Para$DirOut, SchmStats=SchmStats, SchmQMsOut=SchmQMsOut, log=log From 1f2d34bb229734601eabcdeb4c12cc730ed80279 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Fri, 31 Oct 2025 14:07:16 -0600 Subject: [PATCH 118/182] Updates --- .../flow.sunav2.insufficient.data.R | 2 ++ .../wrap.sunav2.insufficient.data.R | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R b/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R index 40bbf1151..45f85598c 100644 --- a/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R +++ b/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R @@ -88,6 +88,7 @@ Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","minPoi # Echo arguments log$debug(base::paste0('Input data directory: ', Para$DirIn)) +log$debug(base::paste0('Minimum points: ', Para$minPoints)) log$debug(base::paste0('Output directory: ', Para$DirOut)) log$debug(base::paste0('Error directory: ', Para$DirErr)) log$debug(base::paste0('Schema for output stats: ', Para$SchmStats)) @@ -121,6 +122,7 @@ foreach::foreach(idxFileIn = DirIn) %dopar% { withCallingHandlers( wrap.sunav2.insufficient.data( DirIn=idxFileIn, + minPoints=Para$minPoints, DirOut=Para$DirOut, SchmStats=SchmStats, SchmQMsOut=SchmQMsOut, diff --git a/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R b/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R index 9fae3dd94..2c2964a33 100644 --- a/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R +++ b/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R @@ -126,7 +126,7 @@ wrap.sunav2.insufficient.data <- function(DirIn, NameFile = base::paste0(DirOutQMs,'/',qmFileName), Schm = 
SchmQMsOut),silent=TRUE) if(class(rptOutQMs)[1] == 'try-error'){ - log$error(base::paste0('Cannot write updated QMs to ',base::paste0(DirOutQMs,'/',qmFileName,".parquet"),'. ',attr(rptOutFlags, "condition"))) + log$error(base::paste0('Cannot write updated QMs to ',base::paste0(DirOutQMs,'/',qmFileName,".parquet"),'. ',attr(rptOutQMs, "condition"))) stop() } else { log$info(base::paste0('Updated QMs written successfully in ', base::paste0(DirOutQMs,'/',qmFileName,".parquet"))) From 3b8cb4108e102d0ad9211c3dc5b00b4078dd0039 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Mon, 3 Nov 2025 11:09:27 -0700 Subject: [PATCH 119/182] Created expanded uncertainty module for SUNA. --- .../wrap.sunav2.exp.uncert.R | 113 ++++++++++++++++++ .../wrap.sunav2.insufficient.data.R | 2 +- 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R diff --git a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R new file mode 100644 index 000000000..b7f579c94 --- /dev/null +++ b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R @@ -0,0 +1,113 @@ +############################################################################################## +#' @title Wrapper for SUNA expanded uncertainty calculation + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} +#' +#' @description Wrapper function. Calculates the expanded uncertainty for each SUNA burst. +#' +#' @param DirIn Character value. The base file path to the averaged stats and uncertainty coefficients. +#' +#' @param DirOut Character value. The base file path for the output data. +#' +#' @param SchmStats (optional), A json-formatted character string containing the schema for the output averaged stats parquet. +#' +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. 
Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. +#' +#' @return Averaged stats file and quality metric file in daily parquets. +#' +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 +#' +#' @keywords Currently none +#' +#' @examples +#' # Not run +# DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" +# DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" +# SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse='') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +#' +#' +#' @changelog +#' Bobby Hensley (2025-11-03) +#' Initial creation. +#' +############################################################################################## +wrap.sunav2.exp.uncert <- function(DirIn, + DirOut, + SchmStats=NULL, + log=NULL +){ + + #' Start logging if not already. + if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) + DirInStats <- paste0(DirIn,"/stats") + DirInCoeff <- paste0(DirIn,"/uncertainty_coef") + DirOutStats <- base::paste0(DirOut,"/stats") + base::dir.create(DirOutStats,recursive=TRUE) + + #' Read in parquet file of averaged stats. + statsFileName<-base::list.files(DirInStats,full.names=FALSE) + if(length(statsFileName)==0){ + log$error(base::paste0('Stats file not found in ', DirInStats)) + stop() + } else { + statsData<-base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(DirInStats, '/', statsFileName), + log = log),silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',statsFileName)) + } + + #' Read in json file of uncertainty coefficients. 
+ coeffileName<-base::list.files(DirInCoeff,full.names=FALSE) + if(length(coeffileName)==0){ + log$error(base::paste0('Quality metrics not found in ', DirInCoeff)) + stop() + } else { + uncertCoeff<-base::try(NEONprocIS.cal::def.read.ucrt.coef.fdas(NameFile = base::paste0(DirInCoeff, '/', coeffileName)), + silent = FALSE) + log$debug(base::paste0('Successfully read in file: ',coeffileName)) + } + + + + + + + + + + + + + + + + + + + + + + #' Write out updated stats file. + rptOutStats <- try(NEONprocIS.base::def.wrte.parq(data = statsData, + NameFile = base::paste0(DirOutStats,'/',statsFileName), + Schm = NULL),silent=TRUE) + if(class(rptOutStats)[1] == 'try-error'){ + log$error(base::paste0('Cannot write updated stats to ',base::paste0(DirOutStats,'/',statsFileName,".parquet"),'. ',attr(rptOutStats, "condition"))) + stop() + } else { + log$info(base::paste0('Updated stats written successfully in ', base::paste0(DirOutStats,'/',statsFileName,".parquet"))) + } + + +} + + + diff --git a/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R b/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R index 2c2964a33..964d14c1a 100644 --- a/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R +++ b/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R @@ -48,7 +48,7 @@ wrap.sunav2.insufficient.data <- function(DirIn, minPoints, DirOut, - SchmStatsOut=NULL, + SchmStats=NULL, SchmQMsOut=NULL, log=NULL ){ From 2f8ea6d90652de29d82e93b9b0b07ebeff673670 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Mon, 3 Nov 2025 13:35:32 -0700 Subject: [PATCH 120/182] Updates to SUNA expanded uncertainty module. 
--- .../flow.sunav2.exp.uncert.R | 140 ++++++++++++++++++ .../wrap.sunav2.exp.uncert.R | 70 ++++++--- 2 files changed, 193 insertions(+), 17 deletions(-) create mode 100644 flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R diff --git a/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R new file mode 100644 index 000000000..c3b7980fb --- /dev/null +++ b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R @@ -0,0 +1,140 @@ +############################################################################################## +#' @title Workflow for SUNA expanded uncertainty calculation + +#' @author +#' Bobby Hensley \email{hensley@battelleecology.org} + +#' @description Workflow. Calculates the expanded uncertainty for each SUNA burst. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", The base file path to the statistics data and calibration coefficients +#' +#' 2. "DirOut=value", The base file path for the output data. +#' +#' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of \code{DirIn}. +#' +#' 4. "SchmStats=value" (optional), The avro schema for the input and output stats file. +#' +#' +#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. +#' +#' @return Updated stats files with expanded uncertainty in daily parquets. 
+ +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' flow.sunav2.exp.uncert <- function(DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", +#' DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" , +#' SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse=''), +#' log=log) +#' Stepping through the code in R studio +Sys.setenv(DIR_IN='/home/NEON/hensley/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733') +log <- NEONprocIS.base::def.log.init(Lvl = "debug") +arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", + "DirOut=~/pfs/out", + "DirErr=~/pfs/out/errored_datums") + rm(list=setdiff(ls(),c('arg','log'))) + +#' @seealso None currently + +# changelog and author contributions / copyrights +#' Bobby Hensley (2025-10-31) +#' Initial creation. + +############################################################################################## +options(digits.secs = 3) +library(foreach) +library(doParallel) +library(lubridate) + +# Source the wrapper function. 
Assume it is in the working directory +source("./wrap.sunav2.exp.uncert.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","DirOut","DirErr"), + NameParaOptn = c("SchmStats"),log = log) + +# Echo arguments +log$debug(base::paste0('Input data directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0('Schema for output stats: ', Para$SchmStats)) + +# Read in the schemas so we only have to do it once and not every time in the avro writer. +if(base::is.null(Para$SchmStats) || Para$SchmStats == 'NA'){ + SchmStats <- NULL +} else { + SchmStats <- base::paste0(base::readLines(Para$SchmStats),collapse='') +} + + +# Find all the input paths (datums). We will process each one. 
+DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = c('stats','uncertainty_coef'), + log = log) + +# Process each datum path +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxFileIn = DirIn) %dopar% { + log$info(base::paste0('Processing path to file: ', idxFileIn)) + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.sunav2.exp.uncert( + DirIn=idxFileIn, + DirOut=Para$DirOut, + SchmStats=SchmStats, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + log$error(err$message) + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(idxFileIn, + log = log) + DirSub <- strsplit(InfoDirIn$dirRepo,".", fixed = TRUE)[[1]][1] + NEONprocIS.base::def.dir.crea(DirBgn = Para$DirErr, DirSub = DirSub, + log = log) + csvname <- DirSub %>% + strsplit( "/" ) %>% + sapply( tail, 1 ) + nameFileErr <- base::paste0(Para$DirErr, DirSub, "/",csvname) + log$info(base::paste0("Re-routing failed datum path to ", nameFileErr)) + con <- base::file(nameFileErr, "w") + base::close(con) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + return() +} + + + + diff --git a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R index b7f579c94..2de87b2de 100644 --- a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R +++ b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R @@ -75,25 +75,61 @@ wrap.sunav2.exp.uncert <- function(DirIn, log$debug(base::paste0('Successfully read in file: ',coeffileName)) } - - - - - - - - - - - - - - - - - + #' Converts uncertainty coefficient dates to POSIXct and values to numeric + uncertCoeff$start_date <- as.POSIXct(uncertCoeff$start_date, format = "%Y-%m-%dT%H:%M:%S", tz='utc') + uncertCoeff$end_date <- as.POSIXct(uncertCoeff$end_date, format = "%Y-%m-%dT%H:%M:%S", tz='utc') + 
uncertCoeff$Value<-as.numeric(uncertCoeff$Value) + + #' Determines which uncertainty coefficients to be applied to each time interval. + #' (In case there are more than one on a particular day) + uncertCoeff<-uncertCoeff[order(uncertCoeff$start_date), ] + uncertCoeffA1<-uncertCoeff[(uncertCoeff$Name=="U_CVALA1"),] + statsData$uncertCoeffA1<-NA + for (i in 1:nrow(statsData)){ + for (j in 1:nrow(uncertCoeffA1)){ + if(statsData[i,which(colnames(statsData)=="startDateTime")]>=uncertCoeffA1[j,which(colnames(uncertCoeffA1)=="start_date")]){ + statsData[i,which(colnames(statsData)=="uncertCoeffA1")]=uncertCoeffA1[j,which(colnames(uncertCoeffA1)=="Value")]}}} + uncertCoeffA3<-uncertCoeff[(uncertCoeff$Name=="U_CVALA3"),] + statsData$uncertCoeffA3<-NA + for (i in 1:nrow(statsData)){ + for (j in 1:nrow(uncertCoeffA3)){ + if(statsData[i,which(colnames(statsData)=="startDateTime")]>=uncertCoeffA3[j,which(colnames(uncertCoeffA3)=="start_date")]){ + statsData[i,which(colnames(statsData)=="uncertCoeffA3")]=uncertCoeffA3[j,which(colnames(uncertCoeffA3)=="Value")]}}} + + #' Identify the column name with the mean, variance and number of points + meanName<-grep("Mean",names(statsData),value=TRUE) + varianceName<-grep("Variance",names(statsData),value=TRUE) + pointsName<-grep("NumPts",names(statsData),value=TRUE) + + #' Calculates calibration uncertainty. See ATBD for more details. + #' Concentrations <= 20 mg/L have fixed calibration uncertainty equal to coeffA1. + #' Concentrations greater than 20 mg/L uncertainty equals concentration times coeffA1. 
+ #' Note stats data concentrations are in uM so threshold needs to be converted from mg/L by dividing by 0.014 (14 g/mol / 1000 ug/mg) + statsData$calUncert<-NA + for (i in 1:nrow(statsData)){ + if(is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="calUncert")]=NA} + if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){ + if(statsData[i,which(colnames(statsData)==meanName)]<=(20/0.014)){statsData[i,which(colnames(statsData)=="calUncert")]=statsData[i,which(colnames(statsData)=="uncertCoeffA1")]} + if(statsData[i,which(colnames(statsData)==meanName)]>(20/0.014)){statsData[i,which(colnames(statsData)=="calUncert")]=statsData[i,which(colnames(statsData)=="uncertCoeffA3")]} + } + } + + #' Calculates the repeatability (natural variation). See ATBD for more details. + statsData$natVar<-NA + for (i in 1:nrow(statsData)){ + if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="natVar")]= + sqrt(statsData[i,which(colnames(statsData)==varianceName)]/statsData[i,which(colnames(statsData)==pointsName)])} + } + #' Calculates the expanded uncertainty, which is estimated as 2x the combined uncertainty. See ATBD for more details. + statsData$surfWaterNitrateExpUncert<-NA + for (i in 1:nrow(statsData)){ + if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="surfWaterNitrateExpUncert")]= + 2*sqrt(statsData[i,which(colnames(statsData)=="natVar")]+statsData[i,which(colnames(statsData)=="calUncert")])} + } + #' Removes unnecessary columns. + statsData<-subset(statsData,select=-c(uncertCoeffA3,uncertCoeffA1,calUncert,natVar)) #' Write out updated stats file. 
rptOutStats <- try(NEONprocIS.base::def.wrte.parq(data = statsData, From 1187b1cfbc6fe284d3f174491c12647107280c20 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 4 Nov 2025 10:45:00 -0700 Subject: [PATCH 121/182] combined module for uncertainty --- modules_combined/sunav2_ucrt_group/Dockerfile | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 modules_combined/sunav2_ucrt_group/Dockerfile diff --git a/modules_combined/sunav2_ucrt_group/Dockerfile b/modules_combined/sunav2_ucrt_group/Dockerfile new file mode 100644 index 000000000..9a7775f29 --- /dev/null +++ b/modules_combined/sunav2_ucrt_group/Dockerfile @@ -0,0 +1,51 @@ +# Dockerfile for NEON IS Data Processing - sunav2 uncertainty module combined with filter-joiner + +# Start with the base R image. +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 + +# maintainer handle +MAINTAINER "Nora Catolico" ncatolico@battelleecology.org + +# Add in the python-based filter-joiner module +ARG MODULE_DIR="modules" +ARG APP_DIR="filter_joiner" +ARG COMMON_DIR="common" +ARG CONTAINER_APP_DIR="/usr/src/app" +ENV PYTHONPATH="${PYTHONPATH}:${CONTAINER_APP_DIR}" + +WORKDIR ${CONTAINER_APP_DIR} + +COPY ${MODULE_DIR}/${APP_DIR}/requirements.txt ${CONTAINER_APP_DIR}/${APP_DIR}/requirements.txt + + +RUN apt update && \ + apt-get install -y --no-install-recommends \ + python3.8 && \ + apt install -y python3-pip && \ + python3 -mpip install --no-cache-dir --upgrade pip setuptools wheel && \ + python3 -mpip install --no-cache-dir -r ${CONTAINER_APP_DIR}/${APP_DIR}/requirements.txt && \ + apt-get autoremove -y && \ + apt-get autoclean -y && \ + rm -rf /var/lib/apt/lists/* && \ + groupadd -g 9999 appuser && \ + useradd -r -u 9999 -g appuser appuser + +# Copy in python code +COPY ${MODULE_DIR}/${APP_DIR} ${CONTAINER_APP_DIR}/${APP_DIR} +COPY ${MODULE_DIR}/${COMMON_DIR} ${CONTAINER_APP_DIR}/${COMMON_DIR} + + +# Build in the uncertainty module +ARG MODULE_DIR="flow" +ARG 
APP_DIR_1="flow.sunav2.exp.uncert" + +# Copy the lockfile and restore known working versions of R dependency packages +COPY ./flow/flow.sunav2.exp.uncert/renv.lock . +RUN R -e 'renv::restore(lockfile="./renv.lock")' + +# Copy in R code +COPY ./flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R . +COPY ./flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R . + +# Run as app user +USER appuser From ee522b528228d1f685c5c3b87d4c6dec4abc5bc4 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 4 Nov 2025 11:00:34 -0700 Subject: [PATCH 122/182] build flow image --- flow/flow.sunav2.exp.uncert/Dockerfile | 20 ++++ .../flow.sunav2.exp.uncert.R | 11 +- flow/flow.sunav2.exp.uncert/renv.lock | 101 ++++++++++++++++++ .../wrap.sunav2.exp.uncert.R | 4 +- 4 files changed, 128 insertions(+), 8 deletions(-) create mode 100644 flow/flow.sunav2.exp.uncert/Dockerfile create mode 100644 flow/flow.sunav2.exp.uncert/renv.lock diff --git a/flow/flow.sunav2.exp.uncert/Dockerfile b/flow/flow.sunav2.exp.uncert/Dockerfile new file mode 100644 index 000000000..c7ea1f777 --- /dev/null +++ b/flow/flow.sunav2.exp.uncert/Dockerfile @@ -0,0 +1,20 @@ +# Dockerfile for NEON IS Data Processing - sunav2 expanded uncertainty + +# Start with the NEON IS stats package image +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-stat-r:v1.0.2 + +ARG FLOW_DIR="./flow" +ARG APP_DIR="flow.sunav2.exp.uncert" + +# maintainer handle +MAINTAINER "Bobby Hensley" hensley@battelleecology.org + +# Copy the lockfile and restore known working versions of R dependency packages +# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image +COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock +RUN R -e 'renv::restore(lockfile="/renv.lock")' + +# Copy in sunav2 flag workflow +COPY ${FLOW_DIR}/${APP_DIR}/flow.sunav2.exp.uncert.R . +COPY ${FLOW_DIR}/${APP_DIR}/wrap.sunav2.exp.uncert.R . 
+ diff --git a/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R index c3b7980fb..2571521ef 100644 --- a/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R +++ b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R @@ -35,12 +35,11 @@ #' SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse=''), #' log=log) #' Stepping through the code in R studio -Sys.setenv(DIR_IN='/home/NEON/hensley/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733') -log <- NEONprocIS.base::def.log.init(Lvl = "debug") -arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", - "DirOut=~/pfs/out", - "DirErr=~/pfs/out/errored_datums") - rm(list=setdiff(ls(),c('arg','log'))) +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", +# "DirOut=~/pfs/out", +# "DirErr=~/pfs/out/errored_datums") +# rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently diff --git a/flow/flow.sunav2.exp.uncert/renv.lock b/flow/flow.sunav2.exp.uncert/renv.lock new file mode 100644 index 000000000..7283865a0 --- /dev/null +++ b/flow/flow.sunav2.exp.uncert/renv.lock @@ -0,0 +1,101 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2720e3fd3dad08f34b19b56b3d6f073d", + "Requirements": [] + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + 
"Requirements": [ + "foreach", + "iterators" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "680ad542fbcf801442c83a6ac5a2126c", + "Requirements": [ + "generics", + "timechange" + ] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "timechange": { + "Package": "timechange", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8548b44f79a35ba1791308b61e6012d7", + "Requirements": [ + "cpp11" + ] + } + } +} diff --git a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R index 2de87b2de..0a7f105bc 100644 --- a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R +++ b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R @@ -136,10 +136,10 @@ wrap.sunav2.exp.uncert <- function(DirIn, NameFile = base::paste0(DirOutStats,'/',statsFileName), Schm = NULL),silent=TRUE) if(class(rptOutStats)[1] == 'try-error'){ - log$error(base::paste0('Cannot write updated stats to ',base::paste0(DirOutStats,'/',statsFileName,".parquet"),'. 
',attr(rptOutStats, "condition"))) + log$error(base::paste0('Cannot write updated stats to ',base::paste0(DirOutStats,'/',statsFileName),'. ',attr(rptOutStats, "condition"))) stop() } else { - log$info(base::paste0('Updated stats written successfully in ', base::paste0(DirOutStats,'/',statsFileName,".parquet"))) + log$info(base::paste0('Updated stats written successfully in ', base::paste0(DirOutStats,'/',statsFileName))) } From 036fd47cc4114c7a21f111f999db5e7dbbe4ae83 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 4 Nov 2025 11:54:29 -0700 Subject: [PATCH 123/182] latest --- modules_combined/sunav2_ucrt_group/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules_combined/sunav2_ucrt_group/Dockerfile b/modules_combined/sunav2_ucrt_group/Dockerfile index 9a7775f29..543130515 100644 --- a/modules_combined/sunav2_ucrt_group/Dockerfile +++ b/modules_combined/sunav2_ucrt_group/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile for NEON IS Data Processing - sunav2 uncertainty module combined with filter-joiner # Start with the base R image. 
-FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-stat-r:v1.0.2 # maintainer handle MAINTAINER "Nora Catolico" ncatolico@battelleecology.org From 3d1fee33c790af2e49a271cb7fbb4c7f554cc2a8 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 4 Nov 2025 12:09:18 -0700 Subject: [PATCH 124/182] add in cal package --- flow/flow.sunav2.exp.uncert/Dockerfile | 2 +- modules_combined/sunav2_ucrt_group/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/flow.sunav2.exp.uncert/Dockerfile b/flow/flow.sunav2.exp.uncert/Dockerfile index c7ea1f777..0020ef36f 100644 --- a/flow/flow.sunav2.exp.uncert/Dockerfile +++ b/flow/flow.sunav2.exp.uncert/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile for NEON IS Data Processing - sunav2 expanded uncertainty # Start with the NEON IS stats package image -FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-stat-r:v1.0.2 +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-cal-r:v2.1.1 ARG FLOW_DIR="./flow" ARG APP_DIR="flow.sunav2.exp.uncert" diff --git a/modules_combined/sunav2_ucrt_group/Dockerfile b/modules_combined/sunav2_ucrt_group/Dockerfile index 543130515..75f619d8b 100644 --- a/modules_combined/sunav2_ucrt_group/Dockerfile +++ b/modules_combined/sunav2_ucrt_group/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile for NEON IS Data Processing - sunav2 uncertainty module combined with filter-joiner # Start with the base R image. 
-FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-stat-r:v1.0.2 +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-cal-r:v2.1.1 # maintainer handle MAINTAINER "Nora Catolico" ncatolico@battelleecology.org From d590e7ab036eb3bac418321102c7d7ff575823f4 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 4 Nov 2025 12:45:55 -0700 Subject: [PATCH 125/182] latest --- .../flow.sunav2.exp.uncert.R | 18 ++++++++++++++---- .../wrap.sunav2.exp.uncert.R | 18 ++++++++++++++++-- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R index 2571521ef..6f7619d68 100644 --- a/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R +++ b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R @@ -17,7 +17,8 @@ #' #' 4. "SchmStats=value" (optional), The avro schema for the input and output stats file. #' -#' +#' 5. "DirSubCopy=value" (optional), where value is the names of additional subfolders, separated by +#' pipes, that are to be copied with a symbolic link to the output path. #' #' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, #' which uses system environment variables if available. @@ -38,7 +39,8 @@ # log <- NEONprocIS.base::def.log.init(Lvl = "debug") # arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", # "DirOut=~/pfs/out", -# "DirErr=~/pfs/out/errored_datums") +# "DirErr=~/pfs/out/errored_datums", +# "DirSubCopy=location|quality_metrics") # rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently @@ -46,6 +48,8 @@ # changelog and author contributions / copyrights #' Bobby Hensley (2025-10-31) #' Initial creation. 
+#' Nora Catolico (2025-11-04) +#' add in copied directories ############################################################################################## options(digits.secs = 3) @@ -75,13 +79,14 @@ log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used # Parse the input arguments into parameters Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","DirOut","DirErr"), - NameParaOptn = c("SchmStats"),log = log) + NameParaOptn = c("SchmStats","DirSubCopy"),log = log) # Echo arguments log$debug(base::paste0('Input data directory: ', Para$DirIn)) log$debug(base::paste0('Output directory: ', Para$DirOut)) log$debug(base::paste0('Error directory: ', Para$DirErr)) log$debug(base::paste0('Schema for output stats: ', Para$SchmStats)) +log$debug(base::paste0('Directory to copy: ', Para$DirSubCopy)) # Read in the schemas so we only have to do it once and not every time in the avro writer. if(base::is.null(Para$SchmStats) || Para$SchmStats == 'NA'){ @@ -97,6 +102,10 @@ DirIn <- nameDirSub = c('stats','uncertainty_coef'), log = log) +# Retrieve optional subdirectories to copy over +DirSubCopy <- base::unique(base::setdiff(Para$DirSubCopy,'stats')) +log$debug(base::paste0('Additional subdirectories to copy: ',base::paste0(DirSubCopy,collapse=','))) + # Process each datum path doParallel::registerDoParallel(numCoreUse) foreach::foreach(idxFileIn = DirIn) %dopar% { @@ -106,8 +115,9 @@ foreach::foreach(idxFileIn = DirIn) %dopar% { withCallingHandlers( wrap.sunav2.exp.uncert( DirIn=idxFileIn, - DirOut=Para$DirOut, + DirOutBase=Para$DirOut, SchmStats=SchmStats, + DirSubCopy=DirSubCopy, log=log ), error = function(err) { diff --git a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R index 0a7f105bc..3ca32d30f 100644 --- a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R +++ b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R @@ -8,10 +8,14 @@ #' #' @param DirIn Character 
value. The base file path to the averaged stats and uncertainty coefficients. #' -#' @param DirOut Character value. The base file path for the output data. +#' @param DirOutBase Character value. The base file path for the output data. #' #' @param SchmStats (optional), A json-formatted character string containing the schema for the output averaged stats parquet. #' +#' @param DirSubCopy (optional) Character vector. The names of additional subfolders at +#' the same level as the location folder in the input path that are to be copied with a symbolic link to the +#' output path (i.e. not combined but carried through as-is). +#' #' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log #' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init #' for more details. @@ -37,8 +41,9 @@ #' ############################################################################################## wrap.sunav2.exp.uncert <- function(DirIn, - DirOut, + DirOutBase, SchmStats=NULL, + DirSubCopy=NULL, log=NULL ){ @@ -50,9 +55,18 @@ wrap.sunav2.exp.uncert <- function(DirIn, InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) DirInStats <- paste0(DirIn,"/stats") DirInCoeff <- paste0(DirIn,"/uncertainty_coef") + DirOut <- base::paste0(DirOutBase,InfoDirIn$dirRepo) DirOutStats <- base::paste0(DirOut,"/stats") base::dir.create(DirOutStats,recursive=TRUE) + # Copy with a symbolic link the desired subfolders + if(base::length(DirSubCopy) > 0){ + NEONprocIS.base::def.dir.copy.symb(DirSrc=base::paste0(DirIn,'/',DirSubCopy), + DirDest=DirOut, + LnkSubObj=TRUE, + log=log) + } + #' Read in parquet file of averaged stats. 
statsFileName<-base::list.files(DirInStats,full.names=FALSE) if(length(statsFileName)==0){ From 4a9eb212161068fe8546dea11d311d98b756bc76 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 4 Nov 2025 13:00:59 -0700 Subject: [PATCH 126/182] latest --- pipe/nitrate/nitrate_null_gap_ucrt.yaml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index 86d784dea..4fe28c826 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_null_gap_ucrt transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-levl1-grp-cons-srf:v2.2.1 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-d590e7a cmd: - sh - "-c" @@ -11,10 +11,21 @@ transform: # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ set -euo pipefail IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -r -f /tmp/pfs/filter_joined + mkdir -p /tmp/pfs/filter_joined # Run first module - filter-joiner (using environment variables below as input parameters) python3 -m filter_joiner.filter_joiner_main + # Run second module - basic stats + Rscript ./flow.sunav2.exp.uncert.R \ + DirIn=/tmp/pfs/filter_joined \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + "DirSubCopy=location|quality_metrics" + EOF env: # Environment variables for 1st filter-joiner. 
Need to join by day again here because an outer join was used on @@ -55,7 +66,7 @@ transform: glob_pattern: /pfs/GROUP_PATH/*/*/*/*/*/*/uncertainty*/** # Join on Y/M/D/group ID join_indices: [3,4,5,6] - OUT_PATH: /pfs/out # Transfered to OUT_PATH for the first module + OUT_PATH: /tmp/pfs/filter_joined RELATIVE_PATH_INDEX: "3" # This is shared among the 2 filter joiners and consolidation module LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined modules. Also shared with 2nd & 3rd modules LOG_LEVEL: INFO # Shared among all modules From 1bdfe3ecbb5da0a012950a9d0d9fb601175465d3 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 4 Nov 2025 15:09:05 -0700 Subject: [PATCH 127/182] create docker image --- flow/flow.insufficient.data/Dockerfile | 20 ++ .../flow.insufficient.data.R} | 6 +- flow/flow.insufficient.data/renv.lock | 256 ++++++++++++++++++ .../wrap.insufficient.data.R} | 2 +- flow/flow.sunav2.exp.uncert/Dockerfile | 2 +- modules_combined/sunav2_ucrt_group/Dockerfile | 13 +- 6 files changed, 292 insertions(+), 7 deletions(-) create mode 100644 flow/flow.insufficient.data/Dockerfile rename flow/{flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R => flow.insufficient.data/flow.insufficient.data.R} (96%) create mode 100644 flow/flow.insufficient.data/renv.lock rename flow/{flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R => flow.insufficient.data/wrap.insufficient.data.R} (99%) diff --git a/flow/flow.insufficient.data/Dockerfile b/flow/flow.insufficient.data/Dockerfile new file mode 100644 index 000000000..106206edc --- /dev/null +++ b/flow/flow.insufficient.data/Dockerfile @@ -0,0 +1,20 @@ +# Dockerfile for NEON IS Data Processing - insufficient data + +# Start with the NEON IS base package image +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 + +ARG FLOW_DIR="./flow" +ARG APP_DIR="flow.insufficient.data" + +# maintainer handle +MAINTAINER "Bobby Hensley" 
hensley@battelleecology.org + +# Copy the lockfile and restore known working versions of R dependency packages +# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image +COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock +RUN R -e 'renv::restore(lockfile="/renv.lock")' + +# Copy in sunav2 flag workflow +COPY ${FLOW_DIR}/${APP_DIR}/flow.insufficient.data.R . +COPY ${FLOW_DIR}/${APP_DIR}/wrap.insufficient.data.R . + diff --git a/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R b/flow/flow.insufficient.data/flow.insufficient.data.R similarity index 96% rename from flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R rename to flow/flow.insufficient.data/flow.insufficient.data.R index 45f85598c..dd9fcb9f2 100644 --- a/flow/flow.sunav2.insufficient.data/flow.sunav2.insufficient.data.R +++ b/flow/flow.insufficient.data/flow.insufficient.data.R @@ -35,7 +35,7 @@ #' @keywords Currently none #' @examples -#' flow.sunav2.insufficient.data <- function(DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", +#' flow.insufficient.data <- function(DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", #' minPoints=10, #' DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" , #' SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse=''), @@ -63,7 +63,7 @@ library(doParallel) library(lubridate) # Source the wrapper function. 
Assume it is in the working directory -source("./wrap.sunav2.insufficient.data.R") +source("./wrap.insufficient.data.R") # Pull in command line arguments (parameters) arg <- base::commandArgs(trailingOnly = TRUE) @@ -120,7 +120,7 @@ foreach::foreach(idxFileIn = DirIn) %dopar% { # Run the wrapper function for each datum, with error routing tryCatch( withCallingHandlers( - wrap.sunav2.insufficient.data( + wrap.insufficient.data( DirIn=idxFileIn, minPoints=Para$minPoints, DirOut=Para$DirOut, diff --git a/flow/flow.insufficient.data/renv.lock b/flow/flow.insufficient.data/renv.lock new file mode 100644 index 000000000..f2d45d4f4 --- /dev/null +++ b/flow/flow.insufficient.data/renv.lock @@ -0,0 +1,256 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "R6": { + "Package": "R6", + "Version": "2.6.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d4335fe7207f1c01ab8c41762f5840d4", + "Requirements": [] + }, + "cli": { + "Package": "cli", + "Version": "3.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "16850760556401a2eeb27d39bd11c9cb", + "Requirements": [] + }, + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2720e3fd3dad08f34b19b56b3d6f073d", + "Requirements": [] + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec", + "Requirements": [ + "R6", + "cli", + "generics", + "glue", 
+ "lifecycle", + "magrittr", + "pillar", + "rlang", + "tibble", + "tidyselect", + "vctrs" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "glue": { + "Package": "glue", + "Version": "1.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5899f1eaa825580172bb56c08266f37c", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8552d117e1b808b09a832f589b79035", + "Requirements": [ + "cli", + "glue", + "rlang" + ] + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "680ad542fbcf801442c83a6ac5a2126c", + "Requirements": [ + "generics", + "timechange" + ] + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7ce2733a9826b3aeb1775d56fd305472", + "Requirements": [] + }, + "pillar": { + "Package": "pillar", + "Version": "1.10.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1098920a19b5cd5a15bacdc74a89979d", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "utf8", + "vctrs" + ] + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "01f28d4278f15c76cddbea05899c5d6f", + "Requirements": [] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": 
"GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + "RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1", + "Requirements": [] + }, + "tibble": { + "Package": "tibble", + "Version": "3.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "784b27d0801c3829de602105757b2cd7", + "Requirements": [ + "cli", + "lifecycle", + "magrittr", + "pillar", + "pkgconfig", + "rlang", + "vctrs" + ] + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "829f27b9c4919c16b593794a6344d6c0", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ] + }, + "timechange": { + "Package": "timechange", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8548b44f79a35ba1791308b61e6012d7", + "Requirements": [ + "cpp11" + ] + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d526d558be176e9ceb68c3d1e83479b7", + "Requirements": [] + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c03fa420630029418f7e6da3667aac4a", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang" + ] + }, + "withr": { + "Package": "withr", + "Version": "3.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cc2d62c76458d425210d1eb1478b30b4", + "Requirements": [] + } + } +} diff --git a/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R b/flow/flow.insufficient.data/wrap.insufficient.data.R similarity index 99% rename from 
flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R rename to flow/flow.insufficient.data/wrap.insufficient.data.R index 964d14c1a..2aff818d5 100644 --- a/flow/flow.sunav2.insufficient.data/wrap.sunav2.insufficient.data.R +++ b/flow/flow.insufficient.data/wrap.insufficient.data.R @@ -45,7 +45,7 @@ #' Initial creation. #' ############################################################################################## -wrap.sunav2.insufficient.data <- function(DirIn, +wrap.insufficient.data <- function(DirIn, minPoints, DirOut, SchmStats=NULL, diff --git a/flow/flow.sunav2.exp.uncert/Dockerfile b/flow/flow.sunav2.exp.uncert/Dockerfile index 0020ef36f..44cf9c4b0 100644 --- a/flow/flow.sunav2.exp.uncert/Dockerfile +++ b/flow/flow.sunav2.exp.uncert/Dockerfile @@ -1,6 +1,6 @@ # Dockerfile for NEON IS Data Processing - sunav2 expanded uncertainty -# Start with the NEON IS stats package image +# Start with the NEON IS cal package image FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-cal-r:v2.1.1 ARG FLOW_DIR="./flow" diff --git a/modules_combined/sunav2_ucrt_group/Dockerfile b/modules_combined/sunav2_ucrt_group/Dockerfile index 75f619d8b..a7ec96f17 100644 --- a/modules_combined/sunav2_ucrt_group/Dockerfile +++ b/modules_combined/sunav2_ucrt_group/Dockerfile @@ -1,6 +1,6 @@ -# Dockerfile for NEON IS Data Processing - sunav2 uncertainty module combined with filter-joiner +# Dockerfile for NEON IS Data Processing - sunav2 uncertainty module combined with filter-joiner and insufficient data -# Start with the base R image. +# Start with the cal package image. 
FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-cal-r:v2.1.1 # maintainer handle @@ -38,6 +38,7 @@ COPY ${MODULE_DIR}/${COMMON_DIR} ${CONTAINER_APP_DIR}/${COMMON_DIR} # Build in the uncertainty module ARG MODULE_DIR="flow" ARG APP_DIR_1="flow.sunav2.exp.uncert" +ARG APP_DIR_2="flow.insufficient.data" # Copy the lockfile and restore known working versions of R dependency packages COPY ./flow/flow.sunav2.exp.uncert/renv.lock . @@ -47,5 +48,13 @@ RUN R -e 'renv::restore(lockfile="./renv.lock")' COPY ./flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R . COPY ./flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R . +# Copy the lockfile and restore known working versions of R dependency packages +COPY ./flow/flow.insufficient.data/renv.lock . +RUN R -e 'renv::restore(lockfile="./renv.lock")' + +# Copy in R code +COPY ./flow/flow.insufficient.data/flow.insufficient.data.R . +COPY ./flow/flow.insufficient.data/wrap.insufficient.data.R . + # Run as app user USER appuser From db7a28324ac630f7e7223adc88f283495898c7c4 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 5 Nov 2025 08:41:50 -0700 Subject: [PATCH 128/182] latest --- .../flow.insufficient.data.R | 43 +++++++++++-------- .../wrap.insufficient.data.R | 18 +++++++- pipe/nitrate/nitrate_null_gap_ucrt.yaml | 16 ++++++- 3 files changed, 55 insertions(+), 22 deletions(-) diff --git a/flow/flow.insufficient.data/flow.insufficient.data.R b/flow/flow.insufficient.data/flow.insufficient.data.R index dd9fcb9f2..0feccbffe 100644 --- a/flow/flow.insufficient.data/flow.insufficient.data.R +++ b/flow/flow.insufficient.data/flow.insufficient.data.R @@ -21,9 +21,11 @@ #' #' 5. "SchmStats=value" (optional), The avro schema for the input and output stats file. #' -#' 6. "SchmQMsOut=value" (optional), The avro schema for the updated QMs (insufficientDataQF added). +#' 6. "SchmQMs=value" (optional), The avro schema for the updated QMs (insufficientDataQF added). +#' +#' 7. 
"DirSubCopy=value" (optional), where value is the names of additional subfolders, separated by +#' pipes, that are to be copied with a symbolic link to the output path. #' -#' #' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, #' which uses system environment variables if available. #' @@ -39,28 +41,27 @@ #' minPoints=10, #' DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" , #' SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse=''), -#' SchmQMsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse=''), +#' SchmQMs<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse=''), #' log=log) #' Stepping through the code in R studio -Sys.setenv(DIR_IN='/home/NEON/hensley/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733') -log <- NEONprocIS.base::def.log.init(Lvl = "debug") -arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", - "minPoints=10", - "DirOut=~/pfs/out", - "DirErr=~/pfs/out/errored_datums") - rm(list=setdiff(ls(),c('arg','log'))) +# Sys.setenv(DIR_IN='~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733') +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", +# "minPoints=10","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums","DirSubCopy=location") +# rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently # changelog and author contributions / copyrights #' Bobby Hensley (2025-10-31) #' Initial creation. 
+#' Nora Catolico (2025-11-04) +#' add in copied directories ############################################################################################## options(digits.secs = 3) library(foreach) library(doParallel) -library(lubridate) # Source the wrapper function. Assume it is in the working directory source("./wrap.insufficient.data.R") @@ -84,7 +85,7 @@ log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used # Parse the input arguments into parameters Para <- NEONprocIS.base::def.arg.pars(arg = arg,NameParaReqd = c("DirIn","minPoints","DirOut","DirErr"), - NameParaOptn = c("SchmStats","SchmQMsOut"),log = log) + NameParaOptn = c("SchmStats","SchmQMs","DirSubCopy"),log = log) # Echo arguments log$debug(base::paste0('Input data directory: ', Para$DirIn)) @@ -92,7 +93,8 @@ log$debug(base::paste0('Minimum points: ', Para$minPoints)) log$debug(base::paste0('Output directory: ', Para$DirOut)) log$debug(base::paste0('Error directory: ', Para$DirErr)) log$debug(base::paste0('Schema for output stats: ', Para$SchmStats)) -log$debug(base::paste0('Schema for output QMs: ', Para$SchmQMsOut)) +log$debug(base::paste0('Schema for output QMs: ', Para$SchmQMs)) +log$debug(base::paste0('Director to copy: ', Para$DirSubCopy)) # Read in the schemas so we only have to do it once and not every time in the avro writer. 
if(base::is.null(Para$SchmStats) || Para$SchmStats == 'NA'){ @@ -100,10 +102,10 @@ if(base::is.null(Para$SchmStats) || Para$SchmStats == 'NA'){ } else { SchmStats <- base::paste0(base::readLines(Para$SchmStats),collapse='') } -if(base::is.null(Para$SchmQMsOut) || Para$SchmQMsOut == 'NA'){ - SchmQMsOut <- NULL +if(base::is.null(Para$SchmQMs) || Para$SchmQMs == 'NA'){ + SchmQMs <- NULL } else { - SchmQMsOut <- base::paste0(base::readLines(Para$SchmQMsOut),collapse='') + SchmQMs <- base::paste0(base::readLines(Para$SchmQMs),collapse='') } @@ -113,6 +115,10 @@ DirIn <- nameDirSub = c('stats','quality_metrics'), log = log) +# Retrieve optional subdirectories to copy over +DirSubCopy <- base::unique(base::setdiff(Para$DirSubCopy,'stats')) +log$debug(base::paste0('Additional subdirectories to copy: ',base::paste0(DirSubCopy,collapse=','))) + # Process each datum path doParallel::registerDoParallel(numCoreUse) foreach::foreach(idxFileIn = DirIn) %dopar% { @@ -123,9 +129,10 @@ foreach::foreach(idxFileIn = DirIn) %dopar% { wrap.insufficient.data( DirIn=idxFileIn, minPoints=Para$minPoints, - DirOut=Para$DirOut, + DirOutBase=Para$DirOut, SchmStats=SchmStats, - SchmQMsOut=SchmQMsOut, + SchmQMs=SchmQMs, + DirSubCopy=DirSubCopy, log=log ), error = function(err) { diff --git a/flow/flow.insufficient.data/wrap.insufficient.data.R b/flow/flow.insufficient.data/wrap.insufficient.data.R index 2aff818d5..8a1b6c709 100644 --- a/flow/flow.insufficient.data/wrap.insufficient.data.R +++ b/flow/flow.insufficient.data/wrap.insufficient.data.R @@ -19,6 +19,10 @@ #' @param SchmQMsOut (optional), A json-formatted character string containing the schema for the output quality metrics parquet #' with insufficient data quality flag added. #' +#' @param DirSubCopy (optional) Character vector. The names of additional subfolders at +#' the same level as the location folder in the input path that are to be copied with a symbolic link to the +#' output path (i.e. not combined but carried through as-is). 
+#' #' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log #' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init #' for more details. @@ -47,9 +51,10 @@ ############################################################################################## wrap.insufficient.data <- function(DirIn, minPoints, - DirOut, + DirOutBase, SchmStats=NULL, SchmQMsOut=NULL, + DirSubCopy=NULL, log=NULL ){ @@ -61,11 +66,20 @@ wrap.insufficient.data <- function(DirIn, InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn) DirInStats <- paste0(DirIn,"/stats") DirInQMs <- paste0(DirIn,"/quality_metrics") + DirOut <- base::paste0(DirOutBase,InfoDirIn$dirRepo) DirOutStats <- base::paste0(DirOut,"/stats") base::dir.create(DirOutStats,recursive=TRUE) DirOutQMs <- base::paste0(DirOut,"/quality_metrics") base::dir.create(DirOutQMs,recursive=TRUE) + # Copy with a symbolic link the desired subfolders + if(base::length(DirSubCopy) > 0){ + NEONprocIS.base::def.dir.copy.symb(DirSrc=base::paste0(DirIn,'/',DirSubCopy), + DirDest=DirOut, + LnkSubObj=TRUE, + log=log) + } + #' Read in parquet file of averaged stats. statsFileName<-base::list.files(DirInStats,full.names=FALSE) if(length(statsFileName)==0){ @@ -108,7 +122,7 @@ wrap.insufficient.data <- function(DirIn, for(i in 1:nrow(qmData)){ if(qmData[i,which(colnames(qmData)=='insufficientDataQF')]==1){ qmData[i,which(colnames(qmData)==finalQfColName)]=1}} - qmData <- qmData %>% dplyr::relocate(finalQfColName, .after = last_col()) #' Move finalQF back to the end + qmData <- qmData[c(setdiff(names(qmData), finalQfColName), finalQfColName)] #' Move finalQF back to the end #' Write out stats file. 
rptOutStats <- try(NEONprocIS.base::def.wrte.parq(data = statsData, diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index 4fe28c826..3a2e2eaa0 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -14,18 +14,30 @@ transform: # Refresh interim directories with each datum (otherwise they persist and cause probs) rm -r -f /tmp/pfs/filter_joined + rm -rf /tmp/interimA + rm -rf /tmp/interimUcrt mkdir -p /tmp/pfs/filter_joined + mkdir /tmp/interimUcrt + # Run first module - filter-joiner (using environment variables below as input parameters) python3 -m filter_joiner.filter_joiner_main - # Run second module - basic stats + # Run second module - uncertianty Rscript ./flow.sunav2.exp.uncert.R \ DirIn=/tmp/pfs/filter_joined \ - DirOut=/pfs/out \ + DirOut=/pfs/tmp/interimUcrt \ DirErr=/pfs/out/errored_datums \ "DirSubCopy=location|quality_metrics" + # Run third module - insufficient data + Rscript ./flow.insufficient.data.R \ + DirIn=/tmp/pfs/filter_joined \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + minPoints=10 \ + "DirSubCopy=location" + EOF env: # Environment variables for 1st filter-joiner. 
Need to join by day again here because an outer join was used on From 238d31da58667cdbada1f15612417cffdf552b1b Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 5 Nov 2025 09:03:35 -0700 Subject: [PATCH 129/182] combine modules --- flow/flow.sunav2.exp.uncert/Dockerfile | 20 ------------------- .../dockerfile_in_combined_module.txt | 1 + flow/flow.sunav2.logfiles.fill/Dockerfile | 20 ------------------- .../dockerfile_in_combined_module.txt | 1 + 4 files changed, 2 insertions(+), 40 deletions(-) delete mode 100644 flow/flow.sunav2.exp.uncert/Dockerfile create mode 100644 flow/flow.sunav2.exp.uncert/dockerfile_in_combined_module.txt delete mode 100644 flow/flow.sunav2.logfiles.fill/Dockerfile create mode 100644 flow/flow.sunav2.logfiles.fill/dockerfile_in_combined_module.txt diff --git a/flow/flow.sunav2.exp.uncert/Dockerfile b/flow/flow.sunav2.exp.uncert/Dockerfile deleted file mode 100644 index 44cf9c4b0..000000000 --- a/flow/flow.sunav2.exp.uncert/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -# Dockerfile for NEON IS Data Processing - sunav2 expanded uncertainty - -# Start with the NEON IS cal package image -FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-cal-r:v2.1.1 - -ARG FLOW_DIR="./flow" -ARG APP_DIR="flow.sunav2.exp.uncert" - -# maintainer handle -MAINTAINER "Bobby Hensley" hensley@battelleecology.org - -# Copy the lockfile and restore known working versions of R dependency packages -# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image -COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock -RUN R -e 'renv::restore(lockfile="/renv.lock")' - -# Copy in sunav2 flag workflow -COPY ${FLOW_DIR}/${APP_DIR}/flow.sunav2.exp.uncert.R . -COPY ${FLOW_DIR}/${APP_DIR}/wrap.sunav2.exp.uncert.R . 
- diff --git a/flow/flow.sunav2.exp.uncert/dockerfile_in_combined_module.txt b/flow/flow.sunav2.exp.uncert/dockerfile_in_combined_module.txt new file mode 100644 index 000000000..75b5d274e --- /dev/null +++ b/flow/flow.sunav2.exp.uncert/dockerfile_in_combined_module.txt @@ -0,0 +1 @@ +sunav2_ucrt_group \ No newline at end of file diff --git a/flow/flow.sunav2.logfiles.fill/Dockerfile b/flow/flow.sunav2.logfiles.fill/Dockerfile deleted file mode 100644 index 3305d1127..000000000 --- a/flow/flow.sunav2.logfiles.fill/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -# Dockerfile for NEON IS Data Processing - sunav2 Logfile Processing - -# Start with the neon-is-base-r image. -FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 - -ARG FLOW_DIR="./flow" -ARG APP_DIR="flow.sunav2.logfiles.fill" - -# maintainer handle -MAINTAINER "Nora Catolico" ncatolico@battelleecology.org - -# Copy the lockfile and restore known working versions of R dependency packages -# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image -COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock -RUN R -e 'renv::restore(lockfile="/renv.lock")' - -# Copy in sunav2 flag workflow -COPY ${FLOW_DIR}/${APP_DIR}/flow.sunav2.logfiles.fill.R . -COPY ${FLOW_DIR}/${APP_DIR}/wrap.sunav2.logfiles.fill.R . 
- diff --git a/flow/flow.sunav2.logfiles.fill/dockerfile_in_combined_module.txt b/flow/flow.sunav2.logfiles.fill/dockerfile_in_combined_module.txt new file mode 100644 index 000000000..c3ba2706c --- /dev/null +++ b/flow/flow.sunav2.logfiles.fill/dockerfile_in_combined_module.txt @@ -0,0 +1 @@ +sunav2_logs_group_and_fill \ No newline at end of file From 71a3f841f2f8207aba6cd2ae17f26fbe5585b93e Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 5 Nov 2025 15:07:32 -0700 Subject: [PATCH 130/182] latest --- pipe/nitrate/nitrate_null_gap_ucrt.yaml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index 3a2e2eaa0..5478817ac 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_null_gap_ucrt transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-d590e7a + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-fee1f15 cmd: - sh - "-c" @@ -14,10 +14,9 @@ transform: # Refresh interim directories with each datum (otherwise they persist and cause probs) rm -r -f /tmp/pfs/filter_joined - rm -rf /tmp/interimA - rm -rf /tmp/interimUcrt + rm -rf /tmp/pfs/interimUcrt mkdir -p /tmp/pfs/filter_joined - mkdir /tmp/interimUcrt + mkdir /tmp/pfs/interimUcrt # Run first module - filter-joiner (using environment variables below as input parameters) @@ -26,13 +25,13 @@ transform: # Run second module - uncertianty Rscript ./flow.sunav2.exp.uncert.R \ DirIn=/tmp/pfs/filter_joined \ - DirOut=/pfs/tmp/interimUcrt \ + DirOut=/tmp/pfs/interimUcrt \ DirErr=/pfs/out/errored_datums \ "DirSubCopy=location|quality_metrics" # Run third module - insufficient data Rscript ./flow.insufficient.data.R \ - DirIn=/tmp/pfs/filter_joined \ + DirIn=/tmp/pfs/interimUcrt \ DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ 
minPoints=10 \ From 43af1e4af74bd61108eb5d3597a5005acc385695 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 20 Nov 2025 11:17:31 -0700 Subject: [PATCH 131/182] update cron --- pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml | 3 ++- pipe/sunav2/sunav2_cron_daily_and_date_control.yaml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml index 5f8536e22..afd5bb8ba 100644 --- a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml +++ b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml @@ -18,7 +18,8 @@ input: # Choose a monthly cron date to be something sufficiently after the 1st to allow kafka lag and timeseries pad cron: name: tick - spec: "0 7 10 * *" # Run at 00:00 MST (07:00 GMT) on the 10th of the month + spec: "@never" + #spec: "0 7 10 * *" # Run at 00:00 MST (07:00 GMT) on the 10th of the month overwrite: true autoscaling: true resource_requests: diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index c7224e564..4f78ec833 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -23,7 +23,7 @@ input: - cron: name: tick spec: "@never" - spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) + #spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) overwrite: true - pfs: name: SITE_FILE From 4fa99b1f9912bc282c65e95034d664113733135e Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 2 Dec 2025 08:17:37 -0700 Subject: [PATCH 132/182] full site list --- pipe/sunav2/site-list-full.json | 138 ------------------ pipe/sunav2/site-list.json | 130 ++++++++++++++++- ...av2_cron_daily_and_date_control_kafka.yaml | 2 +- .../vignettes/stand_up_product_dag_example.sh | 10 +- 4 files changed, 133 insertions(+), 147 deletions(-) delete mode 100644 pipe/sunav2/site-list-full.json diff --git 
a/pipe/sunav2/site-list-full.json b/pipe/sunav2/site-list-full.json deleted file mode 100644 index 38df06e24..000000000 --- a/pipe/sunav2/site-list-full.json +++ /dev/null @@ -1,138 +0,0 @@ -[ - { - "site" : "ARIK", - "kafka_start_date" : "2024-03-01" - }, - { - "site" : "BARC", - "kafka_start_date" : "2024-08-11" - }, - { - "site" : "BIGC", - "kafka_start_date" : "2024-06-01" - }, - { - "site" : "BLDE", - "kafka_start_date" : "2024-05-08" - }, - { - "site" : "BLUE", - "kafka_start_date" : "2024-02-09" - }, - { - "site" : "BLWA", - "kafka_start_date" : "2024-08-22" - }, - { - "site" : "CARI", - "kafka_start_date" : "2024-03-01" - }, - { - "site" : "COMO", - "kafka_start_date" : "2024-02-09" - }, - { - "site" : "CRAM", - "kafka_start_date" : "2024-07-20" - }, - { - "site" : "CUPE", - "kafka_start_date" : "2024-02-09" - }, - { - "site" : "FLNT", - "kafka_start_date" : "2024-08-11" - }, - { - "site" : "GUIL", - "kafka_start_date" : "2024-01-25" - }, - { - "site" : "HOPB", - "kafka_start_date" : "2024-01-17" - }, - { - "site" : "KING", - "kafka_start_date" : "2024-01-25" - }, - { - "site" : "LECO", - "kafka_start_date" : "2024-02-09" - }, - { - "site" : "LEWI", - "kafka_start_date" : "2024-01-25" - }, - { - "site" : "LIRO", - "kafka_start_date" : "2024-08-10" - }, - { - "site" : "MART", - "kafka_start_date" : "2024-01-25" - }, - { - "site" : "MAYF", - "kafka_start_date" : "2024-01-25" - }, - { - "site" : "MCDI", - "kafka_start_date" : "2024-01-25" - }, - { - "site" : "MCRA", - "kafka_start_date" : "2024-02-05" - }, - { - "site" : "OKSR", - "kafka_start_date" : "2024-04-06" - }, - { - "site" : "POSE", - "kafka_start_date" : "2024-01-25" - }, - { - "site" : "PRIN", - "kafka_start_date" : "2024-02-09" - }, - { - "site" : "PRLA", - "kafka_start_date" : "2024-08-10" - }, - { - "site" : "PRPO", - "kafka_start_date" : "2024-08-10" - }, - { - "site" : "REDB", - "kafka_start_date" : "2024-02-06" - }, - { - "site" : "SUGG", - "kafka_start_date" : "2024-08-11" - }, - { - "site" : 
"SYCA", - "kafka_start_date" : "2024-04-11" - }, - { - "site" : "TECR", - "kafka_start_date" : "2024-03-17" - }, - { - "site" : "TOMB", - "kafka_start_date" : "2024-08-10" - }, - { - "site" : "TOOK", - "kafka_start_date" : "2024-08-10" - }, - { - "site" : "WALK", - "kafka_start_date" : "2024-02-09" - }, - { - "site" : "WLOU", - "kafka_start_date" : "2024-02-06" - } -] \ No newline at end of file diff --git a/pipe/sunav2/site-list.json b/pipe/sunav2/site-list.json index d9ba56eb9..38df06e24 100644 --- a/pipe/sunav2/site-list.json +++ b/pipe/sunav2/site-list.json @@ -1,14 +1,138 @@ [ { - "site" : "HOPB", + "site" : "ARIK", "kafka_start_date" : "2024-03-01" }, { - "site" : "CRAM", + "site" : "BARC", + "kafka_start_date" : "2024-08-11" + }, + { + "site" : "BIGC", + "kafka_start_date" : "2024-06-01" + }, + { + "site" : "BLDE", + "kafka_start_date" : "2024-05-08" + }, + { + "site" : "BLUE", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "BLWA", + "kafka_start_date" : "2024-08-22" + }, + { + "site" : "CARI", "kafka_start_date" : "2024-03-01" }, { + "site" : "COMO", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "CRAM", + "kafka_start_date" : "2024-07-20" + }, + { + "site" : "CUPE", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "FLNT", + "kafka_start_date" : "2024-08-11" + }, + { + "site" : "GUIL", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "HOPB", + "kafka_start_date" : "2024-01-17" + }, + { + "site" : "KING", + "kafka_start_date" : "2024-01-25" + }, + { "site" : "LECO", - "kafka_start_date" : "2024-03-01" + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "LEWI", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "LIRO", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "MART", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "MAYF", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "MCDI", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "MCRA", + "kafka_start_date" : "2024-02-05" + }, + { + 
"site" : "OKSR", + "kafka_start_date" : "2024-04-06" + }, + { + "site" : "POSE", + "kafka_start_date" : "2024-01-25" + }, + { + "site" : "PRIN", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "PRLA", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "PRPO", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "REDB", + "kafka_start_date" : "2024-02-06" + }, + { + "site" : "SUGG", + "kafka_start_date" : "2024-08-11" + }, + { + "site" : "SYCA", + "kafka_start_date" : "2024-04-11" + }, + { + "site" : "TECR", + "kafka_start_date" : "2024-03-17" + }, + { + "site" : "TOMB", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "TOOK", + "kafka_start_date" : "2024-08-10" + }, + { + "site" : "WALK", + "kafka_start_date" : "2024-02-09" + }, + { + "site" : "WLOU", + "kafka_start_date" : "2024-02-06" } ] \ No newline at end of file diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml index 2bb4cfb3c..375692d63 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml @@ -27,7 +27,7 @@ input: - pfs: name: SITE_FILE repo: sunav2_site_list - glob: /site-list-full.json + glob: /site-list.json resource_requests: memory: 100M cpu: 1 diff --git a/utilities/vignettes/stand_up_product_dag_example.sh b/utilities/vignettes/stand_up_product_dag_example.sh index 43f21b6f9..1c8301954 100644 --- a/utilities/vignettes/stand_up_product_dag_example.sh +++ b/utilities/vignettes/stand_up_product_dag_example.sh @@ -9,12 +9,12 @@ # Define paths data_path='/scratch/pfs' # Where base repos like avro_schemas, empty_files, etc. 
are stored -git_path_pipelines='/home/NEON/ncatolico/NEON-IS-data-processing-homeDir/pipe' -git_path_avro='/home/NEON/ncatolico/NEON-IS-avro-schemas' -git_path_avro_l0='/home/NEON/ncatolico/neon-avro-schemas' +git_path_pipelines='/home/NEON/ncatolico/R/NEON-IS-data-processing/pipe' +git_path_avro='/home/NEON/ncatolico/R/NEON-IS-avro-schemas' +#git_path_avro_l0='/home/NEON/ncatolico/neon-avro-schemas' pipe_list_prefix='pipe_list_' -source_type='tchain' -product='tempSpecificDepthLakes' +source_type='sunav2' +product='nitrate' # Define paths based on base paths and product information above spec_path_source_type=$git_path_pipelines/$source_type From 70b037227e0a4719094193abe80946be78107d62 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Wed, 3 Dec 2025 09:52:30 -0700 Subject: [PATCH 133/182] update cal module image --- pipe/sunav2/sunav2_calibration_group_and_convert.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml index 8dafbaa5e..8aa2adb8c 100644 --- a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_calibration_group_and_convert transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-grp-conv:v2.3.1 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-grp-conv:v3.0.0 cmd: - sh - "-c" From 34dac9fb49c2340b54077cf7d9e8f87ff1cfebab Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Wed, 3 Dec 2025 14:12:48 -0700 Subject: [PATCH 134/182] remove json version --- .../sunav2/sunav2_calibration_assignment.json | 47 ------------------- 1 file changed, 47 deletions(-) delete mode 100644 pipe/sunav2/sunav2_calibration_assignment.json diff --git a/pipe/sunav2/sunav2_calibration_assignment.json b/pipe/sunav2/sunav2_calibration_assignment.json deleted file mode 100644 index 
bd8d52ee7..000000000 --- a/pipe/sunav2/sunav2_calibration_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_calibration_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.cal.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "PadDay=-1|1" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v2.0.3", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "calibration", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "210M", - "cpu": 0.3 - }, - "parallelism_spec": { - "constant": "4" - } -} From d7a4a0db5ba25623109e2891e1f75d9f1b39dbfa Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Wed, 3 Dec 2025 14:13:17 -0700 Subject: [PATCH 135/182] update image --- pipe/sunav2/sunav2_calibration_assignment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_calibration_assignment.yaml b/pipe/sunav2/sunav2_calibration_assignment.yaml index 02ada5f91..bfd45d287 100644 --- a/pipe/sunav2/sunav2_calibration_assignment.yaml +++ b/pipe/sunav2/sunav2_calibration_assignment.yaml @@ -12,7 +12,7 @@ transform: DirOut=/pfs/out DirErr=$ERR_PATH FileYear=$FILE_YEAR - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v2.0.2 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-asgn:v2.0.3 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: From 6dd24fc828f5b05b659388353d826c55283f8b37 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 3 Dec 2025 15:40:12 -0700 Subject: [PATCH 136/182] latest --- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- 
pipe/nitrate/nitrate_group_assignment.yaml | 2 +- pipe/nitrate/nitrate_group_loader.yaml | 2 +- pipe/nitrate/nitrate_group_path.yaml | 6 +- .../nitrate_level1_group_consolidate_srf.yaml | 214 ++++++++++++++++++ pipe/nitrate/nitrate_null_gap_ucrt.yaml | 2 +- pipe/sunav2/sunav2_data_source_trino.yaml | 6 +- pipe/sunav2/sunav2_fill_date_gaps.yaml | 84 +++++++ pipe/sunav2/sunav2_fill_log_files.yaml | 6 +- pipe/sunav2/sunav2_trino_data_parser.yaml | 6 +- 10 files changed, 314 insertions(+), 16 deletions(-) create mode 100644 pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml create mode 100644 pipe/sunav2/sunav2_fill_date_gaps.yaml diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index a0c158538..c4013c22d 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-72c2253 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-4fa99b1 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] diff --git a/pipe/nitrate/nitrate_group_assignment.yaml b/pipe/nitrate/nitrate_group_assignment.yaml index a7cefa03e..d93a4451d 100644 --- a/pipe/nitrate/nitrate_group_assignment.yaml +++ b/pipe/nitrate/nitrate_group_assignment.yaml @@ -23,7 +23,7 @@ input: - pfs: name: DIR_IN repo: nitrate_group_loader - glob: /nitrate/* + glob: /nitrate-surfacewater/* - pfs: name: FILE_YEAR repo: sunav2_cron_daily_and_date_control diff --git a/pipe/nitrate/nitrate_group_loader.yaml b/pipe/nitrate/nitrate_group_loader.yaml index 8a8657c68..cefa9bef5 100644 --- a/pipe/nitrate/nitrate_group_loader.yaml +++ b/pipe/nitrate/nitrate_group_loader.yaml @@ -4,7 +4,7 @@ transform: cmd: - /bin/bash env: - GROUP_PREFIX: nitrate_ + GROUP_PREFIX: nitrate-surfacewater_ LOG_LEVEL: 
INFO OUT_PATH: /pfs/out # ERR_PATH can be changed, it is user specified diff --git a/pipe/nitrate/nitrate_group_path.yaml b/pipe/nitrate/nitrate_group_path.yaml index c4b3850d1..956cc9e39 100644 --- a/pipe/nitrate/nitrate_group_path.yaml +++ b/pipe/nitrate/nitrate_group_path.yaml @@ -11,7 +11,7 @@ transform: - '#!/bin/bash' - python3 -m group_path.group_path_main env: - GROUP: nitrate_ + GROUP: nitrate-surfacewater_ LOG_LEVEL: INFO OUT_PATH: /pfs/out # ERR_PATH can be changed, it is user specified @@ -39,13 +39,13 @@ input: # name must be GROUP_ASSIGNMENT_PATH name: GROUP_ASSIGNMENT_PATH repo: nitrate_group_assignment - glob: /nitrate/(*/*/*) + glob: /nitrate-surfacewater/(*/*/*) joinOn: $1 - union: - pfs: # Any/all repos in location focus name must be named LOCATION_FOCUS_PATH name: LOCATION_FOCUS_PATH - repo: sunav2_location_group_and_restructure + repo: sunav2_fill_date_gaps glob: /sunav2/(*/*/*) joinOn: $1 # - pfs: diff --git a/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml b/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml new file mode 100644 index 000000000..cd704a96a --- /dev/null +++ b/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml @@ -0,0 +1,214 @@ +--- +pipeline: + name: nitrate_level1_group_consolidate_srf +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-levl1-grp-cons-srf:v2.2.1 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + # + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf /tmp/interimA + rm -rf /tmp/interimB + rm -rf /tmp/pfs/interimC + mkdir /tmp/interimA + mkdir /tmp/interimB + mkdir -p /tmp/pfs/interimC + # + # Set some environment variables for the first module + export OUT_PATH=$OUT_PATH_1 + # + # Run first module - filter-joiner (using environment variables below as input parameters) + python3 -m filter_joiner.filter_joiner_main + # Set some environment variables for the second module + export OUT_PATH=$OUT_PATH_2 + export CONFIG=$CONFIG2 + # + # Run second module - filter-joiner to bring in SRF (using environment variables below as input parameters) + python3 -m filter_joiner.filter_joiner_main + # Clean up 1st interim directory (this is the only one we can clean up bc the rest use symlinks) + rm -rf /tmp/interimA + # Set some environment variables for the 3rd module + export OUT_PATH=$OUT_PATH_3 + # + # Run third module - level 1 consolidate (using environment variables below as input parameters) + python3 -m level1_consolidate.level1_consolidate_main + # + # Run fourth module - pub workbook loader (using environment variables below as input parameters) + python3 -m pub_workbook_loader.pub_workbook_loader_main + # + # Run fifth and final module - create pub tables and apply science review flags (if any) + Rscript ./flow.pub.tabl.srf.R \ + DirIn=/tmp/pfs/interimC \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + "DirData=stats|quality_metrics" \ + PathPubWb=$PUB_WORKBOOKS \ + "DirSubCopy=science_review_flags|group|location" + # + # + # Export Level 1 data to bucket + export OUT_PATH=/pfs/out + linkdir=$(mktemp -d) + shopt -s globstar + out_parquet_glob="${OUT_PATH}/**/*.parquet" + # Example: /2024/01/18/par-quantum-line_UKFS001000/data/par-quantum-line_UKFS001000_2024-01-18_PARQL_1min_001.parquet + echo "Linking output 
files to ${linkdir}" + #set -x # Echo commands to output for debugging + fname="" + for f in $out_parquet_glob; do + if [[ -f "$f" ]]; then + # Parse the path + [[ "$f" =~ ^$OUT_PATH/([0-9]+)/([0-9]+)/([0-9]+)/(${GROUP_PREFIX}_[A-Za-z0-9]+)/data/(.*)$ ]] + fyear="${BASH_REMATCH[1]}" + fmonth="${BASH_REMATCH[2]}" + fday="${BASH_REMATCH[3]}" + fgroup="${BASH_REMATCH[4]}" + fname="${BASH_REMATCH[5]}" + # Now get the timing index from the file name + [[ "$fname" =~ ^${GROUP_PREFIX}_[A-Za-z0-9]+_${fyear}-${fmonth}-${fday}_[A-Za-z0-9]+_([A-Za-z0-9]+)_([A-Za-z0-9]+).parquet ]] + avg_int="${BASH_REMATCH[2]}" + #Form the output path and link + outdir="${linkdir}/v2/${GROUP_PREFIX}/${avg_int}/group=${fgroup}/ms=${fyear}-${fmonth}" + mkdir -p "${outdir}" + ln -s "${f}" "${outdir}/${fname}" + fi + done + #set +x + if [[ "${fname}" ]]; then + echo "Syncing files to bucket" + rclone \ + --no-check-dest \ + --copy-links \ + --gcs-bucket-policy-only \ + --gcs-no-check-bucket \ + copy \ + "${linkdir}" \ + ":gcs://${BUCKET_NAME}" + echo "Removing temporary files" + rm -rf $linkdir + fi + EOF + env: + # Environment variables for 1st filter-joiner. Need to join by day again here because an outer join was used on + # these repos in order to pull them in with or without the SRF + CONFIG: | + --- + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. 
+ # Use unix-style glob pattern to select the desired directories in each repo + input_paths: + - path: + name: DATA_PATH + # Filter for data directory + glob_pattern: /pfs/DATA_PATH/*/*/*/*/** + # Join on Y/M/D/group ID + join_indices: [3,4,5,6] + - path: + name: GROUP_PATH + # Grab group information + glob_pattern: /pfs/GROUP_PATH/*/*/*/*/group/** + # Join on Y/M/D/group ID + join_indices: [3,4,5,6] + OUT_PATH_1: /tmp/interimA # Transfered to OUT_PATH for the first module + RELATIVE_PATH_INDEX: "3" # This is shared among the 2 filter joiners and consolidation module + LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined modules. Also shared with 2nd & 3rd modules + LOG_LEVEL: INFO # Shared among all modules + +# Below are the environment variables for 2nd filter-joiner bringing in the Science review flags +# Can't do this in first filter-joiner bc there are only data in the srf assignment +# repo for groups that have applicable SRFs for the day. Need to pass through the +# consolidated output with an outer join. + CONFIG2: | + --- + # In Pachyderm root will be index 0, 'pfs' index 1, and the repo name index 2. + # Metadata indices will typically begin at index 3. 
+ # Use unix-style glob pattern to select the desired directories in each repo + input_paths: + - path: + name: CONSOLIDATED_PATH + # Filter for data directory + glob_pattern: /tmp/interimA/*/*/*/*/** + # Join on group ID (already joined below by day) + join_indices: [6] + outer_join: True + - path: + name: SRF_PATH + # Filter for data directory + glob_pattern: /pfs/SRF_PATH/*/*/*/*/** + # Join on group ID(already joined below by day) + join_indices: [6] + OUT_PATH_2: /tmp/interimB # This will be transfered to OUT_PATH for the this module + +# Environment variables for level 1 consolidation + IN_PATH: /tmp/interimB + OUT_PATH_3: /tmp/pfs/interimC # This will be transfered to OUT_PATH for the second module + GROUP_INDEX: "6" # path index of names of group-level metadata to include in the output + GROUP_METADATA_INDEX: "7" + GROUP_METADATA_NAMES: group,science_review_flags + # path index of names of directories to include in the output + DATA_TYPE_INDEX: "9" + DATA_TYPE_NAMES: location,stats,quality_metrics + +# Environment variables for pub_workbook_loader + OUT_PATH_WORKBOOK: /tmp/pub_workbooks + PRODUCTS: NEON.DOM.SITE.DP1.20033.001 # Format: NEON.DOM.SITE.DPX.XXXXX.XXX,NEON.DOM.SITE.DPX.XXXXX.XXX,etc + +# Environment variables for pub table and srf module + PUB_WORKBOOKS: /tmp/pub_workbooks + PARALLELIZATION_INTERNAL: '2' + +# Environment variables for the L1 archiver + GROUP_PREFIX: nitrate-surfacewater # no ending "_" + + secrets: + - name: pdr-secret + mount_path: /var/db_secret + - name: l1-bucket + env_var: BUCKET_NAME + key: L1_BUCKET + +input: + join: + - pfs: + name: DATA_PATH + repo: nitrate_qm_group_and_compute + glob: /(*/*/*) + joinOn: $1 + outer_join: true # Need outer join to pull in with or without SRFs + empty_files: false # Make sure this is false for LINK_TYPE=COPY + - pfs: + name: GROUP_PATH + repo: nitrate_group_path + glob: /(*/*/*) + joinOn: $1 + outer_join: true # Need outer join to pull in with or without SRFs + empty_files: false # Make 
sure this is false for LINK_TYPE=COPY + - pfs: + name: SRF_PATH + repo: nitrate_srf_assignment + glob: /(*/*/*) + joinOn: $1 + empty_files: false # Make sure this is false for LINK_TYPE=COPY +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 1G + cpu: 2.2 +resource_limits: + memory: 2G + cpu: 3.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index 5478817ac..968a7b0ff 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -27,7 +27,7 @@ transform: DirIn=/tmp/pfs/filter_joined \ DirOut=/tmp/pfs/interimUcrt \ DirErr=/pfs/out/errored_datums \ - "DirSubCopy=location|quality_metrics" + "DirSubCopy=group|location|quality_metrics" # Run third module - insufficient data Rscript ./flow.insufficient.data.R \ diff --git a/pipe/sunav2/sunav2_data_source_trino.yaml b/pipe/sunav2/sunav2_data_source_trino.yaml index 7afa755cf..15a3b6079 100644 --- a/pipe/sunav2/sunav2_data_source_trino.yaml +++ b/pipe/sunav2/sunav2_data_source_trino.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_data_source_trino transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-data-src-trino:v2.2.4 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-data-src-trino:v2.3.0 cmd: - sh - "-c" @@ -131,10 +131,10 @@ parallelism_spec: constant: 5 autoscaling: true resource_requests: - memory: 400M + memory: 800M cpu: 1.2 resource_limits: - memory: 800M + memory: 1600M cpu: 2 sidecar_resource_requests: memory: 3G diff --git a/pipe/sunav2/sunav2_fill_date_gaps.yaml b/pipe/sunav2/sunav2_fill_date_gaps.yaml new file mode 100644 index 000000000..f3dcd452c --- /dev/null +++ b/pipe/sunav2/sunav2_fill_date_gaps.yaml @@ -0,0 +1,84 @@ +--- +pipeline: + name: sunav2_fill_date_gaps 
+transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-gf-rglr:v1.2.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Run first module - date-gap-filler (using environment variables below as input parameters) + python3 -m date_gap_filler.date_gap_filler_main + + EOF + env: + # Environment variables for date gap filler + LOG_LEVEL: INFO + OUT_PATH: /pfs/out + OUTPUT_DIRECTORIES: data,location,uncertainty_coef,flags + DATA_SOURCE_TYPE_INDEX: '3' + DATA_YEAR_INDEX: '4' + DATA_MONTH_INDEX: '5' + DATA_DAY_INDEX: '6' + DATA_LOCATION_INDEX: '7' + DATA_TYPE_INDEX: '8' + LOCATION_SOURCE_TYPE_INDEX: '3' + LOCATION_YEAR_INDEX: '4' + LOCATION_MONTH_INDEX: '5' + LOCATION_DAY_INDEX: '6' + LOCATION_INDEX: '7' + EMPTY_FILE_TYPE_INDEX: '4' + LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined modules. + # Environment variables for regularizer + PARALLELIZATION_INTERNAL: '3' # Parallelization within R. If increased, adjust resource requests appropriately. +input: + cross: + - pfs: + name: EMPTY_FILE_PATH + repo: sunav2_empty_files + glob: /sunav2 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - group: + - pfs: + name: DATA_PATH + repo: sunav2_location_group_and_restructure + glob: /(*/*/*/*) + group_by: $1 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. + - join: + - pfs: + name: LOCATION_PATH + repo: sunav2_location_active_dates_assignment + glob: /(*/*/*/*) + joinOn: $1 + group_by: $1 + empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
+ - pfs: + name: DATE_LIMITER_PATH + repo: sunav2_cron_daily_and_date_control + glob: /(*/*/*/*) + joinOn: $1 + group_by: $1 + empty_files: true # This can remain true even if LINK_TYPE=COPY +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 2G + cpu: 3.3 +resource_limits: + memory: 3G + cpu: 4.5 +sidecar_resource_requests: + memory: 3G + cpu: 0.5 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/sunav2/sunav2_fill_log_files.yaml b/pipe/sunav2/sunav2_fill_log_files.yaml index 1516f4d7f..272f9cf96 100644 --- a/pipe/sunav2/sunav2_fill_log_files.yaml +++ b/pipe/sunav2/sunav2_fill_log_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_log_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-b95b90a + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logs-group-fill:sha-4fa99b1 cmd: - sh - "-c" @@ -173,10 +173,10 @@ parallelism_spec: constant: 5 autoscaling: true resource_requests: - memory: 400M + memory: 1G cpu: 1.5 resource_limits: - memory: 800M + memory: 2G cpu: 2 sidecar_resource_requests: memory: 2G diff --git a/pipe/sunav2/sunav2_trino_data_parser.yaml b/pipe/sunav2/sunav2_trino_data_parser.yaml index 3985ef178..d15be2042 100644 --- a/pipe/sunav2/sunav2_trino_data_parser.yaml +++ b/pipe/sunav2/sunav2_trino_data_parser.yaml @@ -1,7 +1,7 @@ pipeline: name: sunav2_trino_data_parser transform: - image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.10.1 + image: us-central1-docker.pkg.dev/neon-shared-service/bei/neon-avro-kafka-loader:v4.13.1 env: # if use default PARSED_START_INDEX and PARSED_END_INDEX, parse all elements in parse_field # if use default for FIELD_START_INDEX and FIELD_END_INDEX, @@ -83,10 +83,10 @@ parallelism_spec: constant: 3 autoscaling: true resource_requests: - memory: 500M + memory: 1.5G cpu: 0.5 
resource_limits: - memory: 1G + memory: 3G cpu: 1.5 sidecar_resource_requests: memory: 2G From 68973316ae47fa7eb8c28a74234d695dc72f9019 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 3 Dec 2025 15:57:58 -0700 Subject: [PATCH 137/182] fill date incorporated --- .../sunav2_fill_date_gaps_and_regularize.yaml | 100 ------------------ 1 file changed, 100 deletions(-) delete mode 100644 pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml diff --git a/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml b/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml deleted file mode 100644 index bd4caa382..000000000 --- a/pipe/sunav2/sunav2_fill_date_gaps_and_regularize.yaml +++ /dev/null @@ -1,100 +0,0 @@ ---- -pipeline: - name: sunav2_fill_date_gaps_and_regularize -transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-gf-rglr:v1.1.0 - cmd: - - sh - - "-c" - - |- - /bin/bash <<'EOF' - # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ - set -euo pipefail - IFS=$'\n\t' - - # Refresh interim directories with each datum (otherwise they persist and cause probs) - rm -rf $OUT_PATH - mkdir -p $OUT_PATH - - # Run first module - date-gap-filler (using environment variables below as input parameters) - python3 -m date_gap_filler.date_gap_filler_main - - # Run second module - regularize - Rscript ./flow.rglr.R \ - DirIn=/tmp/pfs/date_filled \ - DirOut=/pfs/out \ - DirErr=/pfs/out/errored_datums \ - "DirRglr=data|uncertainty_data|flags" \ - MethRglr=CybiEc \ - WndwRglr=Trlg \ - IdxWndw=IdxWndwMin \ - RptTimeWndw=FALSE \ - DropNotNumc=FALSE \ - "DirSubCopy=location|uncertainty_coef" - EOF - env: - # Environment variables for date gap filler - LOG_LEVEL: INFO - OUT_PATH: /tmp/pfs/date_filled - OUTPUT_DIRECTORIES: data,location,uncertainty_data,uncertainty_coef,flags - DATA_SOURCE_TYPE_INDEX: '3' - DATA_YEAR_INDEX: '4' - DATA_MONTH_INDEX: '5' - DATA_DAY_INDEX: '6' - DATA_LOCATION_INDEX: '7' - DATA_TYPE_INDEX: '8' - 
LOCATION_SOURCE_TYPE_INDEX: '3' - LOCATION_YEAR_INDEX: '4' - LOCATION_MONTH_INDEX: '5' - LOCATION_DAY_INDEX: '6' - LOCATION_INDEX: '7' - EMPTY_FILE_TYPE_INDEX: '4' - LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined modules. - # Environment variables for regularizer - PARALLELIZATION_INTERNAL: '3' # Parallelization within R. If increased, adjust resource requests appropriately. -input: - cross: - - pfs: - name: EMPTY_FILE_PATH - repo: sunav2_empty_files - glob: /sunav2 - empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - - group: - - pfs: - name: DATA_PATH - repo: sunav2_location_group_and_restructure - glob: /sunav2/(*/*/*) - group_by: $1 - empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. - - join: - - pfs: - name: LOCATION_PATH - repo: sunav2_location_active_dates_assignment - glob: /sunav2/(*/*/*) - joinOn: $1 - group_by: $1 - empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
- - pfs: - name: DATE_LIMITER_PATH - repo: sunav2_cron_daily_and_date_control - glob: /sunav2/(*/*/*) - joinOn: $1 - group_by: $1 - empty_files: true # This can remain true even if LINK_TYPE=COPY -parallelism_spec: - constant: 5 -autoscaling: true -resource_requests: - memory: 2G - cpu: 3.3 -resource_limits: - memory: 3G - cpu: 4.5 -sidecar_resource_requests: - memory: 3G - cpu: 0.5 -datum_set_spec: - number: 1 -scheduling_spec: - node_selector: - cloud.google.com/compute-class: pach-pipeline-class From be62238b75645fee7ada0dd56f1eba47ddf1ddf9 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 4 Dec 2025 15:36:31 -0700 Subject: [PATCH 138/182] new function for gap filling non-regularized data --- flow/flow.gap.fill.nonrglr/Dockerfile | 27 +++ .../flow.gap.fill.nonrglr.R | 208 ++++++++++++++++++ .../wrap.gap.fill.nonrglr.R | 166 ++++++++++++++ 3 files changed, 401 insertions(+) create mode 100644 flow/flow.gap.fill.nonrglr/Dockerfile create mode 100644 flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R create mode 100644 flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R diff --git a/flow/flow.gap.fill.nonrglr/Dockerfile b/flow/flow.gap.fill.nonrglr/Dockerfile new file mode 100644 index 000000000..5fde50d60 --- /dev/null +++ b/flow/flow.gap.fill.nonrglr/Dockerfile @@ -0,0 +1,27 @@ +# Dockerfile for NEON IS Data Processing - flow.gap.fill.nonrglr + +# Start with the neon-is-base-r image. +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.6.0 + +ARG FLOW_DIR="./flow" +ARG APP_DIR="flow.gap.fill.nonrglr" + +# maintainer handle +MAINTAINER "Nora Catolico" ncatolico@battelleecology.org + +# Copy the lockfile and restore known working versions of R dependency packages +COPY ${FLOW_DIR}/${APP_DIR}/renv.lock /renv.lock +RUN R -e 'renv::restore(lockfile="/renv.lock")' + +# Create app user +RUN groupadd app && \ + useradd app -g app +WORKDIR /home/app + +# Copy in application code +COPY ${FLOW_DIR}/${APP_DIR}/flow.gap.fill.nonrglr.R . 
+COPY ${FLOW_DIR}/${APP_DIR}/wrap.gap.fill.nonrglr.R . + +# Run as app user +RUN chown app:app -R /home/app +USER app diff --git a/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R new file mode 100644 index 000000000..3931ae08f --- /dev/null +++ b/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R @@ -0,0 +1,208 @@ +############################################################################################## +#' @title Gap filling module for non-regularized data in NEON IS data processing. + +#' @author +#' Nora Catolico \email{ncatolico@battelleecology.org} \cr + +#' @description Workflow. Bin data to generate a regular time sequence of observations. +#' General code workflow: +#' Parse input parameters +#' Read in output schemas if indicated in parameters +#' Determine datums to process (set of files/folders to process as a single unit) +#' For each datum: +#' Create output directories and copy (by symbolic link) unmodified components +#' Read regularization frequency from location file (if not in input parameters) +#' Loop through all data files +#' Regularize data in each file +#' Write out the gap filled data +#' +#' This script is run at the command line with the following arguments. Each argument must be a +#' string in the format "Para=value", where "Para" is the intended parameter name and "value" is +#' the value of the parameter. Note: If the "value" string begins with a $ (e.g. $DIR_IN), the +#' value of the parameter will be assigned from the system environment variable matching the value +#' string. +#' +#' The arguments are: +#' +#' 1. "DirIn=value", where value is the path to the input data directory. NOTE: This path must be a +#' parent of the terminal directory where the data to be gap filled reside. See argument "DirFill" +#' below to indicate the terminal directory. 
+#' +#' The input path is structured as follows: #/pfs/BASE_REPO/#/yyyy/mm/dd/#, where # indicates any +#' number of parent and child directories of any name, so long as they are not 'pfs', the same name +#' as the terminal directory indicated in argument "DirFill", or recognizable as the 'yyyy/mm/dd' +#' structure which indicates the 4-digit year, 2-digit month, and 2-digit day of the data contained +#' in the folder. +#' +#' For example: +#' Input path = /scratch/pfs/sunav2_fill_date_gaps/sunav2/2019/01/01 +#' +#' 2. "DirOut=value", where the value is the output path that will replace the #/pfs/BASE_REPO portion +#' of DirIn. +#' +#' 3. "DirErr=value", where the value is the output path to place the path structure of errored datums that will +#' replace the #/pfs/BASE_REPO portion of DirIn. +#' +#' 4. "DirFill=value", where value is the name of the terminal directory where the data to be +#' gap filled resides. This will be one or more child levels away from "DirIn". All files in the +#' terminal directory will be gap filled. The value may also be a vector of terminal directories, +#' separated by pipes (|). All terminal directories must be present and at the same directory level. +#' For example, "DirFill=data|flags" indicates to regularize the data files within each the data +#' and flags directories. +#' +#' 5. "WndwFill=value", where value is the window in minutes in which data are expected. It is formatted as a 3 character sequence, +#' representing the number of minutes over which any number of measurements are expected. +#' For example, "WndwFill=015" refers to a 15-minute interval, while "WndwAgr=030" refers to a +#' 30-minute interval. +#' +#' 6. "DirSubCopy=value" (optional), where value is the names of additional subfolders, separated by +#' pipes, at the same level as the regularization folder in the input path that are to be copied with a +#' symbolic link to the output path. 
+#' +#' Note: This script implements logging described in \code{\link[NEONprocIS.base]{def.log.init}}, +#' which uses system environment variables if available. +#' +#' @return gap filled data and flag output in Parquet format in DirOut, where DirOut directory +#' replaces BASE_REPO but otherwise retains the child directory structure of the input path. + +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' Stepping through the code in R studio +# log <- NEONprocIS.base::def.log.init(Lvl = "debug") +# arg<-c( "DirIn=~/pfs/sunav2_fill_date_gaps/sunav2/2025/06/22/CFGLOC110819", +# "DirOut=~/pfs/out ", +# "DirErr=~/pfs/out/errored_datums ", +# "DirFill=data|flags", +# "WndwFill=015", +# "DirSubCopy=location|uncertainty_coef") + +#' @seealso \code{\link[eddy4R.base]{def.rglr}} + +# changelog and author contributions / copyrights +# Nora Catolico (12/4/2025) +# original creation +############################################################################################## +library(foreach) +library(doParallel) +library(dplyr) + +# Source the wrapper function. 
Assume it is in the working directory +source("./wrap.gap.fill.nonrglr.R") + +# Pull in command line arguments (parameters) +arg <- base::commandArgs(trailingOnly = TRUE) + +# Start logging +log <- NEONprocIS.base::def.log.init() + +# Use environment variable to specify how many cores to run on +numCoreUse <- base::as.numeric(Sys.getenv('PARALLELIZATION_INTERNAL')) +numCoreAvail <- parallel::detectCores() +if (base::is.na(numCoreUse)){ + numCoreUse <- 1 +} +if(numCoreUse > numCoreAvail){ + numCoreUse <- numCoreAvail +} +log$debug(paste0(numCoreUse, ' of ',numCoreAvail, ' available cores will be used for internal parallelization.')) + +# Parse the input arguments into parameters +Para <- + NEONprocIS.base::def.arg.pars( + arg = arg, + NameParaReqd = c( + "DirIn", + "DirOut", + "DirErr", + "DirFill", + "WndwFill" + ), + NameParaOptn = c( + "DirSubCopy" + ), + log = log + ) + +# Echo arguments +log$debug(base::paste0('Input directory: ', Para$DirIn)) +log$debug(base::paste0('Output directory: ', Para$DirOut)) +log$debug(base::paste0('Error directory: ', Para$DirErr)) +log$debug(base::paste0( + 'Terminal Directories to regularize: ', + base::paste0(Para$DirFill, collapse = ',') +)) + +# Retrieve intervals for gap filling +WndwFill <- base::as.numeric(Para$WndwFill) +log$debug(base::paste0('Interval for gap filling, in minutes: ',base::paste0(WndwFill,collapse=','))) + + +# Retrieve output schema(s) +log$debug(base::paste0( + 'Output schema(s) for gap filled data: ', + base::paste0(Para$FileSchmFill, collapse = ',') +)) + +# Retrieve optional subdirectories to copy over +DirSubCopy <- + base::unique(base::setdiff(Para$DirSubCopy, Para$DirFill)) +log$debug(base::paste0( + 'Additional subdirectories to copy: ', + base::paste0(DirSubCopy, collapse = ',') +)) + +nameDirSub <- base::as.list(c(Para$DirFill)) +log$debug(base::paste0( + 'Expected subdirectories of each datum path: ', + base::paste0(nameDirSub, collapse = ',') +)) + +# Find all the input paths (datums). 
We will process each one. +DirIn <- + NEONprocIS.base::def.dir.in(DirBgn = Para$DirIn, + nameDirSub = nameDirSub, + log = log) + +# Process each datum +doParallel::registerDoParallel(numCoreUse) +foreach::foreach(idxDirIn = DirIn) %dopar% { + + log$info(base::paste0('Processing datum path: ', idxDirIn)) + + # Run the wrapper function for each datum, with error routing + tryCatch( + withCallingHandlers( + wrap.gap.fill.nonrglr(DirIn=idxDirIn, + DirOutBase=Para$DirOut, + WndwFill=WndwFill, + DirFill=Para$DirFill, + DirSubCopy=DirSubCopy, + log=log + ), + error = function(err) { + call.stack <- base::sys.calls() # is like a traceback within "withCallingHandlers" + + # Re-route the failed datum + NEONprocIS.base::def.err.datm( + err=err, + call.stack=call.stack, + DirDatm=idxDirIn, + DirErrBase=Para$DirErr, + RmvDatmOut=TRUE, + DirOutBase=Para$DirOut, + log=log + ) + } + ), + # This simply to avoid returning the error + error=function(err) {} + ) + + + return() + +} # End loop around datum paths diff --git a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R new file mode 100644 index 000000000..1cbb2b988 --- /dev/null +++ b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R @@ -0,0 +1,166 @@ +############################################################################################## +#' @title Gap filling module for non-regularized data in NEON IS data processing. + +#' @author +#' Cove Sturtevant \email{csturtevant@battelleecology.org} + +#' @description Wrapper function. Bin data to generate a regular time sequence of observations. +#' General code workflow: +#' Error-check input parameters +#' Read regularization frequency from location file if expected +#' Create output directories and copy (by symbolic link) unmodified components +#' Loop through all data files +#' Regularize data in each file +#' Write out the regularized data +#' +#' +#' @param DirIn Character value. 
The input path to the data from a single sensor or location, structured as follows: +#' #/pfs/BASE_REPO/#/yyyy/mm/dd/#/id, where # indicates any number of parent and child directories +#' of any name, so long as they are not 'pfs' or recognizable as the 'yyyy/mm/dd' structure which indicates +#' the 4-digit year, 2-digit month, and' 2-digit day. The id is the unique identifier of the sensor or location. \cr +#' +#' Nested within this path are the folders: +#' /data +#' /flags +#' +#' @param DirOutBase Character value. The output path that will replace the #/pfs/BASE_REPO portion of DirIn. +#' +#' @param DirFill List of the terminal directories where the data to be +#' gap filled resides. This will be one or more child levels away from "DirIn". All files in the +#' terminal directory will be gap filled. The value may also be a vector of terminal directories, +#' separated by pipes (|). All terminal directories must be present and at the same directory level. +#' For example, "DirFill=data|flags" indicates to regularize the data files within each the data +#' and flags directories. +#' +#' @param WndwFill Character value. The window in minutes in which data are expected. It is formatted as a 3 character sequence, +#' representing the number of minutes over which any number of measurements are expected. +#' For example, "WndwFill=015" refers to a 15-minute interval, while "WndwAgr=030" refers to a +#' 30-minute interval. +#' +#' @param DirSubCopy (optional) Character vector. The names of additional subfolders at +#' the same level as the location folder in the input path that are to be copied with a symbolic link to the +#' output path (i.e. not combined but carried through as-is). + +#' @param log A logger object as produced by NEONprocIS.base::def.log.init to produce structured log +#' output. Defaults to NULL, in which the logger will be created and used within the function. See NEONprocIS.base::def.log.init +#' for more details. 
+ +#' @return Regularized data output in Parquet format in DirOutBase, where DirOutBase directory +#' replaces BASE_REPO of DirIn but otherwise retains the child directory structure of the input path. + +#' @references +#' License: (example) GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 + +#' @keywords Currently none + +#' @examples +#' # Not run + +#' @seealso None currently + +# changelog and author contributions / copyrights +# Nora Catolico (2025-12-4) +# original creation +############################################################################################## +wrap.gap.fill.nonrglr <- function(DirIn, + DirOutBase, + DirFill, + WndwFill, + DirSubCopy=NULL, + log=NULL +){ + + # Start logging if not already + if(base::is.null(log)){ + log <- NEONprocIS.base::def.log.init() + } + + # Gather info about the input directory (including date) and create the output directory. + InfoDirIn <- NEONprocIS.base::def.dir.splt.pach.time(DirIn,log=log) + dirOut <- base::paste0(DirOutBase, InfoDirIn$dirRepo) + + timeBgn <-InfoDirIn$time # Earliest possible start date for the data + timeEnd <- InfoDirIn$time + base::as.difftime(1, units = 'days') + # All minute window start times in [timeBgn, timeEnd) + all_starts <- seq(timeBgn, timeEnd - WndwFill*60, by = WndwFill*60) + + # Helper to floor readout_times to window starts + floor_15m <- function(x) { + as.POSIXct(floor(as.numeric(x) / (WndwFill*60)) * (WndwFill*60), + origin = "1970-01-01", tz = attr(x, "tzone")) + } + + # Copy with a symbolic link the desired subfolders + if (base::length(DirSubCopy) > 0) { + NEONprocIS.base::def.dir.copy.symb(base::paste0(DirIn, '/', DirSubCopy), + dirOut, + log = log) + } + + + # --------- loop through the directories ---------- + for (i in 1:length(DirFill)){ + + subDir<-DirFill[i] + + # Take stock of our files. 
+ subDirIn <- fs::path(DirIn,subDir) + files <- base::list.files(subDirIn,full.names=FALSE) + + #loop through files in directory + for (j in 1:length(files)){ + fileName <- files[j] + + # Load in file in parquet format into data frame + df <- + base::try(NEONprocIS.base::def.read.parq(NameFile = base::paste0(subDirIn, '/', fileName), + log = log), + silent = FALSE) + if (base::any(base::class(df) == 'try-error')) { + # Generate error and stop execution + log$error(base::paste0('File ', subDirIn, '/', fileName, ' is unreadable.')) + base::stop() + } + df$readout_time <- base::as.POSIXlt(df$readout_time) + + # Windows that already have at least one observation + present <- unique(floor_15m(df$readout_time)) + + # Missing windows + missing <- all_starts[!all_starts %in% present] + + # Build blank rows for missing windows + blanks <- data.frame(readout_time = missing) + + # Combine and sort + df_filled <- bind_rows(df, blanks) + df_filled <- df_filled[order(df_filled$readout_time), ] + + #add in source id if needed + if("source_id" %in% colnames(df_filled)){ + source_id<-unique(df_filled$source_id[!is.na(df_filled$source_id)]) + if(length(source_id>0)){ + df_filled$source_id[is.na(df_filled$source_id)]<-source_id[1] + }else{ + df_filled$source_id[is.na(df_filled$source_id)]<-"99999" + } + } + + # create output directories + subDirOut <- paste0(dirOut,'/',subDir,'/') + base::dir.create(subDirOut,recursive=TRUE) + + # write out data + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = data_filled, + NameFile = base::paste0(subDirOut,fileName)),silent=TRUE) + if(class(rptOut)[1] == 'try-error'){ + log$error(base::paste0('Cannot write file to ',base::paste0(subDirOut,fileName),'. 
',attr(rptOut, "condition"))) + stop() + } else { + log$info(base::paste0('File written successfully in ', base::paste0(subDirOut,fileName))) + } + + } + } + +} From a7ff45509b70ea383eef74b0b987ab03e22ab4b2 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 4 Dec 2025 15:59:17 -0700 Subject: [PATCH 139/182] combined module --- flow/flow.gap.fill.nonrglr/renv.lock | 235 ++++++++++++++++++ .../fill_date_gaps_nonregularized/Dockerfile | 57 +++++ 2 files changed, 292 insertions(+) create mode 100644 flow/flow.gap.fill.nonrglr/renv.lock create mode 100644 modules_combined/fill_date_gaps_nonregularized/Dockerfile diff --git a/flow/flow.gap.fill.nonrglr/renv.lock b/flow/flow.gap.fill.nonrglr/renv.lock new file mode 100644 index 000000000..e1243e0b3 --- /dev/null +++ b/flow/flow.gap.fill.nonrglr/renv.lock @@ -0,0 +1,235 @@ +{ + "R": { + "Version": "4.1.3", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "R6": { + "Package": "R6", + "Version": "2.6.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d4335fe7207f1c01ab8c41762f5840d4", + "Requirements": [] + }, + "cli": { + "Package": "cli", + "Version": "3.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "16850760556401a2eeb27d39bd11c9cb", + "Requirements": [] + }, + "codetools": { + "Package": "codetools", + "Version": "0.2-18", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "019388fc48e48b3da0d3a76ff94608a8", + "Requirements": [] + }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "451e5edf411987991ab6a5410c45011f", + "Requirements": [ + "foreach", + "iterators" + ] + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec", + "Requirements": [ + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "pillar", + "rlang", + 
"tibble", + "tidyselect", + "vctrs" + ] + }, + "foreach": { + "Package": "foreach", + "Version": "1.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "618609b42c9406731ead03adf5379850", + "Requirements": [ + "codetools", + "iterators" + ] + }, + "fs": { + "Package": "fs", + "Version": "1.6.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7eb1e342eee7e0a7449c49cdaa526d39", + "Requirements": [] + }, + "generics": { + "Package": "generics", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4b29bf698d0c7bdb9f1e4976e7ade41d", + "Requirements": [] + }, + "glue": { + "Package": "glue", + "Version": "1.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5899f1eaa825580172bb56c08266f37c", + "Requirements": [] + }, + "iterators": { + "Package": "iterators", + "Version": "1.0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8954069286b4b2b0d023d1b288dce978", + "Requirements": [] + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b8552d117e1b808b09a832f589b79035", + "Requirements": [ + "cli", + "glue", + "rlang" + ] + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "7ce2733a9826b3aeb1775d56fd305472", + "Requirements": [] + }, + "pillar": { + "Package": "pillar", + "Version": "1.10.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "1098920a19b5cd5a15bacdc74a89979d", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "utf8", + "vctrs" + ] + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "01f28d4278f15c76cddbea05899c5d6f", + "Requirements": [] + }, + "renv": { + "Package": "renv", + "Version": "0.16.0", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteRepo": "renv", + 
"RemoteUsername": "rstudio", + "RemoteRef": "0.16.0", + "RemoteSha": "0e3aab27a928eb261819a3fc45a3ee2b4ba902a5", + "Hash": "9e5e2246d73254a29a4182f4e8257c09", + "Requirements": [] + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1", + "Requirements": [] + }, + "tibble": { + "Package": "tibble", + "Version": "3.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "784b27d0801c3829de602105757b2cd7", + "Requirements": [ + "cli", + "lifecycle", + "magrittr", + "pillar", + "pkgconfig", + "rlang", + "vctrs" + ] + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "829f27b9c4919c16b593794a6344d6c0", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ] + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.6", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d526d558be176e9ceb68c3d1e83479b7", + "Requirements": [] + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c03fa420630029418f7e6da3667aac4a", + "Requirements": [ + "cli", + "glue", + "lifecycle", + "rlang" + ] + }, + "withr": { + "Package": "withr", + "Version": "3.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "cc2d62c76458d425210d1eb1478b30b4", + "Requirements": [] + } + } +} diff --git a/modules_combined/fill_date_gaps_nonregularized/Dockerfile b/modules_combined/fill_date_gaps_nonregularized/Dockerfile new file mode 100644 index 000000000..1e7f6c046 --- /dev/null +++ b/modules_combined/fill_date_gaps_nonregularized/Dockerfile @@ -0,0 +1,57 @@ +# Dockerfile for NEON IS Data Processing - Combined date gap filler and nonregularized gap filler +# This image combines the two modules: date_gap_filler and flow.gap.fill.nonrglr + +# Start with the base R image. 
+FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.6.0 + +# maintainer handle +MAINTAINER "Nora Catolico" ncatolico@battelleecology.org + +# Build in the filter-joiner python module +ARG MODULE_DIR="modules" +ARG APP_DIR="date_gap_filler" +ARG COMMON_DIR="common" +ARG CONTAINER_APP_DIR="/usr/src/app" +ENV PYTHONPATH="${PYTHONPATH}:${CONTAINER_APP_DIR}" + +WORKDIR ${CONTAINER_APP_DIR} + +COPY ${MODULE_DIR}/${APP_DIR}/requirements.txt ${CONTAINER_APP_DIR}/${APP_DIR}/requirements.txt + + +RUN apt update && \ + apt-get install -y --no-install-recommends \ + python3.8 && \ + apt install -y python3-pip && \ + python3 -mpip install --no-cache-dir --upgrade pip setuptools wheel && \ + python3 -mpip install --no-cache-dir -r ${CONTAINER_APP_DIR}/${APP_DIR}/requirements.txt && \ + apt-get autoremove -y && \ + apt-get autoclean -y && \ + rm -rf /var/lib/apt/lists/* + + +# Copy in python code +COPY ${MODULE_DIR}/${APP_DIR} ${CONTAINER_APP_DIR}/${APP_DIR} +COPY ${MODULE_DIR}/${COMMON_DIR} ${CONTAINER_APP_DIR}/${COMMON_DIR} + +# Build in the module +ARG MODULE_DIR="flow" + +# Copy the lockfile and restore known working versions of R dependency packages +# ENSURE that the renv.lock file is up-to-date and thus has all listed dependencies prior to creating this docker image +COPY ./${MODULE_DIR}/flow.gap.fill.nonrglr/renv.lock ./renv.lock +RUN R -e 'renv::restore(lockfile="./renv.lock")' +#RUN git clone -b deve https://github.com/NEONScience/eddy4R.git +#RUN R -e 'renv::install("./eddy4R/pack/eddy4R.base",repos=c(remotes::bioc_install_repos(),"https://cran.rstudio.com/"))' + +# Create app user +RUN groupadd appuser && \ + useradd appuser -g appuser +WORKDIR /usr/src/app + +# Copy in application code +COPY ./${MODULE_DIR}/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R . +COPY ./${MODULE_DIR}/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R . 
+ +# Run as app user +USER appuser From b1024435f593b7a0c4c835857cb6c99c82ec54c3 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 4 Dec 2025 16:24:53 -0700 Subject: [PATCH 140/182] fix name --- flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R index 1cbb2b988..b0b415489 100644 --- a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R +++ b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R @@ -151,7 +151,7 @@ wrap.gap.fill.nonrglr <- function(DirIn, base::dir.create(subDirOut,recursive=TRUE) # write out data - rptOut <- try(NEONprocIS.base::def.wrte.parq(data = data_filled, + rptOut <- try(NEONprocIS.base::def.wrte.parq(data = df_filled, NameFile = base::paste0(subDirOut,fileName)),silent=TRUE) if(class(rptOut)[1] == 'try-error'){ log$error(base::paste0('Cannot write file to ',base::paste0(subDirOut,fileName),'. ',attr(rptOut, "condition"))) From e6a716a5ee8ca62aea13271b967b1c66e6eddbf4 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 8 Dec 2025 13:58:32 -0700 Subject: [PATCH 141/182] add in file schema parameters --- .../flow.gap.fill.nonrglr.R | 44 +++++++++++++++++-- .../wrap.gap.fill.nonrglr.R | 36 ++++++++++++++- 2 files changed, 75 insertions(+), 5 deletions(-) diff --git a/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R index 3931ae08f..6a7901bd5 100644 --- a/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R +++ b/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R @@ -50,12 +50,23 @@ #' For example, "DirFill=data|flags" indicates to regularize the data files within each the data #' and flags directories. #' -#' 5. "WndwFill=value", where value is the window in minutes in which data are expected. It is formatted as a 3 character sequence, +#' #' 5. 
"FileSchm=value" (optional), where value is the full path to schema for data output by +#' this workflow. The value may be NA, in which case the output schema will be the same as the input +#' data. The value may be a single file, in which case it will apply to all output, or +#' multiple values in which case the argument is formatted as dir:value|dir:value... +#' where dir is one of the directories specified in DirFill and value is the path to the schema file +#' for the output of that directory. Multiple dir:value pairs are separated by pipes (|). +#' For example, "FileSchm=data:/path/to/schemaData.avsc|flags:NA" indicates that the +#' output from the data directory will be written with the schema /path/to/schemaData.avsc and the +#' output from the flags directory will be the same as the input files found in that +#' directory. +#' +#' 6. "WndwFill=value", where value is the window in minutes in which data are expected. It is formatted as a 3 character sequence, #' representing the number of minutes over which any number of measurements are expected. #' For example, "WndwFill=015" refers to a 15-minute interval, while "WndwAgr=030" refers to a #' 30-minute interval. #' -#' 6. "DirSubCopy=value" (optional), where value is the names of additional subfolders, separated by +#' 7. "DirSubCopy=value" (optional), where value is the names of additional subfolders, separated by #' pipes, at the same level as the regularization folder in the input path that are to be copied with a #' symbolic link to the output path. 
#' @@ -77,6 +88,7 @@ # "DirOut=~/pfs/out ", # "DirErr=~/pfs/out/errored_datums ", # "DirFill=data|flags", +# "FileSchm=data:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_logfilled.avsc|flags:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_calibration_flags.avsc|flags:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_log_flags.avsc", # "WndwFill=015", # "DirSubCopy=location|uncertainty_coef") @@ -122,11 +134,36 @@ Para <- "WndwFill" ), NameParaOptn = c( - "DirSubCopy" + "DirSubCopy", + "FileSchm" ), log = log ) +# Retrieve output schema(s) +log$debug(base::paste0( + 'Output schema(s) for regularized data: ', + base::paste0(Para$FileSchm, collapse = ',') +)) +SchmFill <- + NEONprocIS.base::def.vect.pars.pair( + vect = Para$FileSchm, + KeyExp = Para$DirFill, + ValuDflt = 'NA', + NameCol = c('DirFill', 'FileSchmFill'), + log = log + ) + +# Read in the schema(s) +SchmFill$SchmFill <- NA +for (idxSchmFill in 1:base::length(SchmFill$FileSchmFill)) { + if (SchmFill$FileSchmFill[idxSchmFill] != 'NA') { + SchmFill$SchmFill[idxSchmFill] <- + base::paste0(base::readLines(SchmFill$FileSchmFill[idxSchmFill]), + collapse = '') + } +} + # Echo arguments log$debug(base::paste0('Input directory: ', Para$DirIn)) log$debug(base::paste0('Output directory: ', Para$DirOut)) @@ -180,6 +217,7 @@ foreach::foreach(idxDirIn = DirIn) %dopar% { DirOutBase=Para$DirOut, WndwFill=WndwFill, DirFill=Para$DirFill, + SchmFill=SchmFill, DirSubCopy=DirSubCopy, log=log ), diff --git a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R index b0b415489..22d53656f 100644 --- a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R +++ b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R @@ -2,7 +2,7 @@ #' @title Gap filling module for non-regularized data in NEON IS data processing. 
#' @author -#' Cove Sturtevant \email{csturtevant@battelleecology.org} +#' Nora Catolico \email{ncatolico@battelleecology.org} #' @description Wrapper function. Bin data to generate a regular time sequence of observations. #' General code workflow: @@ -32,6 +32,17 @@ #' For example, "DirFill=data|flags" indicates to regularize the data files within each the data #' and flags directories. #' +#' @param FileSchm Character value (optional), where value is the full path to schema for data output by +#' this workflow. The value may be NA, in which case the output schema will be the same as the input +#' data. The value may be a single file, in which case it will apply to all output, or +#' multiple values in which case the argument is formatted as dir:value|dir:value... +#' where dir is one of the directories specified in DirFill and value is the path to the schema file +#' for the output of that directory. Multiple dir:value pairs are separated by pipes (|). +#' For example, "FileSchm=data:/path/to/schemaData.avsc|flags:NA" indicates that the +#' output from the data directory will be written with the schema /path/to/schemaData.avsc and the +#' output from the flags directory will be the same as the input files found in that +#' directory. +#' #' @param WndwFill Character value. The window in minutes in which data are expected. It is formatted as a 3 character sequence, #' representing the number of minutes over which any number of measurements are expected. 
#' For example, "WndwFill=015" refers to a 15-minute interval, while "WndwAgr=030" refers to a @@ -66,6 +77,7 @@ wrap.gap.fill.nonrglr <- function(DirIn, DirOutBase, DirFill, WndwFill, + SchmFill, DirSubCopy=NULL, log=NULL ){ @@ -150,9 +162,29 @@ wrap.gap.fill.nonrglr <- function(DirIn, subDirOut <- paste0(dirOut,'/',subDir,'/') base::dir.create(subDirOut,recursive=TRUE) + # select output schema + FileSchmFill<-SchmFill$FileSchmFill[grepl(subDir,SchmFill$DirFill)] + if(length(FileSchmFill>1)){ + #specific to suna for now. can be updated if needed down the road + if(grepl("log",fileName,ignore.case = TRUE)){ + FileSchmFill<-FileSchmFill[grepl("log",FileSchmFill,ignore.case = TRUE)] + } + if(grepl("cal",fileName,ignore.case = TRUE)){ + FileSchmFill<-FileSchmFill[grepl("cal",FileSchmFill,ignore.case = TRUE)] + } + } + + + if (base::is.na(FileSchmFill)|FileSchmFill=="NA"|length(FileSchmFill)>1) { + # use the output data to generate a schema + idxSchmFill <- base::attr(df_filled, 'schema') + } else { + idxSchmFill <- SchmFill$SchmFill[SchmFill$FileSchmFill==FileSchmFill] + } + # write out data rptOut <- try(NEONprocIS.base::def.wrte.parq(data = df_filled, - NameFile = base::paste0(subDirOut,fileName)),silent=TRUE) + NameFile = base::paste0(subDirOut,fileName),Schm = idxSchmFill),silent=TRUE) if(class(rptOut)[1] == 'try-error'){ log$error(base::paste0('Cannot write file to ',base::paste0(subDirOut,fileName),'. 
',attr(rptOut, "condition"))) stop() From 98108c3772c0501f652067efac3c97ea7ede627d Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 8 Dec 2025 16:14:47 -0700 Subject: [PATCH 142/182] no schema case --- .../flow.gap.fill.nonrglr.R | 27 +++++++++-------- .../wrap.gap.fill.nonrglr.R | 30 +++++++++++-------- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R index 6a7901bd5..259b442af 100644 --- a/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R +++ b/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R @@ -84,14 +84,13 @@ #' @examples #' Stepping through the code in R studio # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg<-c( "DirIn=~/pfs/sunav2_fill_date_gaps/sunav2/2025/06/22/CFGLOC110819", +# arg<-c( "DirIn=~/pfs/sunav2_fill_date_gaps/sunav2/2025/06/22", # "DirOut=~/pfs/out ", # "DirErr=~/pfs/out/errored_datums ", # "DirFill=data|flags", -# "FileSchm=data:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_logfilled.avsc|flags:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_calibration_flags.avsc|flags:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_log_flags.avsc", # "WndwFill=015", +# "FileSchm=data:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_logfilled.avsc|flags:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_calibration_flags.avsc|flags:/home/NEON/ncatolico/pfs/sunav2_avro_schemas/sunav2/sunav2_log_flags.avsc", # "DirSubCopy=location|uncertainty_coef") - #' @seealso \code{\link[eddy4R.base]{def.rglr}} # changelog and author contributions / copyrights @@ -145,7 +144,8 @@ log$debug(base::paste0( 'Output schema(s) for regularized data: ', base::paste0(Para$FileSchm, collapse = ',') )) -SchmFill <- +if(length(Para$FileSchm)>0){ + SchmFill <- NEONprocIS.base::def.vect.pars.pair( vect = Para$FileSchm, KeyExp = Para$DirFill, @@ -153,17 +153,20 @@ SchmFill <- NameCol = c('DirFill', 
'FileSchmFill'), log = log ) - -# Read in the schema(s) -SchmFill$SchmFill <- NA -for (idxSchmFill in 1:base::length(SchmFill$FileSchmFill)) { - if (SchmFill$FileSchmFill[idxSchmFill] != 'NA') { - SchmFill$SchmFill[idxSchmFill] <- - base::paste0(base::readLines(SchmFill$FileSchmFill[idxSchmFill]), - collapse = '') + # Read in the schema(s) + SchmFill$SchmFill <- NA + for (idxSchmFill in 1:base::length(SchmFill$FileSchmFill)) { + if (SchmFill$FileSchmFill[idxSchmFill] != 'NA') { + SchmFill$SchmFill[idxSchmFill] <- + base::paste0(base::readLines(SchmFill$FileSchmFill[idxSchmFill]), + collapse = '') + } } +}else{ + SchmFill <- NA } + # Echo arguments log$debug(base::paste0('Input directory: ', Para$DirIn)) log$debug(base::paste0('Output directory: ', Para$DirOut)) diff --git a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R index 22d53656f..31bb7c01a 100644 --- a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R +++ b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R @@ -163,25 +163,29 @@ wrap.gap.fill.nonrglr <- function(DirIn, base::dir.create(subDirOut,recursive=TRUE) # select output schema - FileSchmFill<-SchmFill$FileSchmFill[grepl(subDir,SchmFill$DirFill)] - if(length(FileSchmFill>1)){ - #specific to suna for now. can be updated if needed down the road - if(grepl("log",fileName,ignore.case = TRUE)){ - FileSchmFill<-FileSchmFill[grepl("log",FileSchmFill,ignore.case = TRUE)] + if(!is.na(SchmFill)){ + FileSchmFill<-SchmFill$FileSchmFill[grepl(subDir,SchmFill$DirFill)] + if(length(FileSchmFill)>1){ + #specific to suna for now. 
can be updated if needed down the road + if(grepl("log",fileName,ignore.case = TRUE)){ + FileSchmFill<-FileSchmFill[grepl("log",FileSchmFill,ignore.case = TRUE)] + } + if(grepl("cal",fileName,ignore.case = TRUE)){ + FileSchmFill<-FileSchmFill[grepl("cal",FileSchmFill,ignore.case = TRUE)] + } } - if(grepl("cal",fileName,ignore.case = TRUE)){ - FileSchmFill<-FileSchmFill[grepl("cal",FileSchmFill,ignore.case = TRUE)] + if (base::is.na(FileSchmFill)|FileSchmFill=="NA"|length(FileSchmFill)>1) { + # use the output data to generate a schema + idxSchmFill <- base::attr(df_filled, 'schema') + } else { + idxSchmFill <- SchmFill$SchmFill[SchmFill$FileSchmFill==FileSchmFill] } - } - - - if (base::is.na(FileSchmFill)|FileSchmFill=="NA"|length(FileSchmFill)>1) { + }else{ # use the output data to generate a schema idxSchmFill <- base::attr(df_filled, 'schema') - } else { - idxSchmFill <- SchmFill$SchmFill[SchmFill$FileSchmFill==FileSchmFill] } + # write out data rptOut <- try(NEONprocIS.base::def.wrte.parq(data = df_filled, NameFile = base::paste0(subDirOut,fileName),Schm = idxSchmFill),silent=TRUE) From 7a91b1d5361718f73f4c22f4a95f23382e4f6c69 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 10 Dec 2025 08:28:38 -0700 Subject: [PATCH 143/182] added indicator for gap filling --- .../flow.gap.fill.nonrglr.R | 2 +- .../wrap.gap.fill.nonrglr.R | 6 ++++ pipe/nitrate/nitrate_null_gap_ucrt.yaml | 2 +- pipe/sunav2/sunav2_fill_date_gaps.yaml | 33 +++++++++++++++++-- 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R index 259b442af..e3ee724b7 100644 --- a/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R +++ b/flow/flow.gap.fill.nonrglr/flow.gap.fill.nonrglr.R @@ -84,7 +84,7 @@ #' @examples #' Stepping through the code in R studio # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg<-c( "DirIn=~/pfs/sunav2_fill_date_gaps/sunav2/2025/06/22", +# arg<-c( 
"DirIn=~/pfs/sunav2_fill_date_gaps/sunav2/2025/06/23/CFGLOC110819", # "DirOut=~/pfs/out ", # "DirErr=~/pfs/out/errored_datums ", # "DirFill=data|flags", diff --git a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R index 31bb7c01a..70f77f3e6 100644 --- a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R +++ b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R @@ -72,6 +72,8 @@ # changelog and author contributions / copyrights # Nora Catolico (2025-12-4) # original creation +# Nora Catolico (2025-12-10) +# added indicator column for gap filled timestamps (needed for SUNA) ############################################################################################## wrap.gap.fill.nonrglr <- function(DirIn, DirOutBase, @@ -134,6 +136,7 @@ wrap.gap.fill.nonrglr <- function(DirIn, base::stop() } df$readout_time <- base::as.POSIXlt(df$readout_time) + df$addedRow<-0 # Windows that already have at least one observation present <- unique(floor_15m(df$readout_time)) @@ -143,6 +146,9 @@ wrap.gap.fill.nonrglr <- function(DirIn, # Build blank rows for missing windows blanks <- data.frame(readout_time = missing) + if(length(blanks$readout_time)>0){ + blanks$addedRow<-1 + } # Combine and sort df_filled <- bind_rows(df, blanks) diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index 968a7b0ff..a5dcf2a11 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_null_gap_ucrt transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-fee1f15 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-98108c3 cmd: - sh - "-c" diff --git a/pipe/sunav2/sunav2_fill_date_gaps.yaml b/pipe/sunav2/sunav2_fill_date_gaps.yaml index f3dcd452c..91fd9860f 100644 --- a/pipe/sunav2/sunav2_fill_date_gaps.yaml +++ 
b/pipe/sunav2/sunav2_fill_date_gaps.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_fill_date_gaps transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-gf-rglr:v1.2.0 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-gap-fill-nonrglr:sha-98108c3 cmd: - sh - "-c" @@ -12,14 +12,30 @@ transform: set -euo pipefail IFS=$'\n\t' + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -r -f /tmp/pfs/interim + rm -rf $OUT_PATH + mkdir -p /tmp/pfs/interim + mkdir -p $OUT_PATH # R modules must have pfs in the repo structure + # Run first module - date-gap-filler (using environment variables below as input parameters) python3 -m date_gap_filler.date_gap_filler_main + + #run gap filler for nonregularized data + Rscript ./flow.gap.fill.nonrglr.R \ + DirIn=/tmp/pfs/interim \ + DirOut=/pfs/out \ + DirErr=/pfs/out/errored_datums \ + "DirFill=data|flags" \ + WndwFill="015" \ + "DirSubCopy=location|uncertainty_coef" \ + "FileSchm=data:$FILE_SCHEMA_DATA|flags:$FILE_SCHEMA_CAL_FLAGS|flags:$FILE_SCHEMA_LOG_FLAGS" EOF env: # Environment variables for date gap filler LOG_LEVEL: INFO - OUT_PATH: /pfs/out + OUT_PATH: /tmp/pfs/interim OUTPUT_DIRECTORIES: data,location,uncertainty_coef,flags DATA_SOURCE_TYPE_INDEX: '3' DATA_YEAR_INDEX: '4' @@ -43,6 +59,18 @@ input: repo: sunav2_empty_files glob: /sunav2 empty_files: false # Make sure to use false if LINK_TYPE=COPY. Can also be set to false for LINK_TYPE=SYMLINK. 
+ - pfs: + name: FILE_SCHEMA_DATA + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_logfilled.avsc + - pfs: + name: FILE_SCHEMA_CAL_FLAGS + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_calibration_flags.avsc + - pfs: + name: FILE_SCHEMA_LOG_FLAGS + repo: sunav2_avro_schemas + glob: /sunav2/sunav2_log_flags.avsc - group: - pfs: name: DATA_PATH @@ -65,6 +93,7 @@ input: joinOn: $1 group_by: $1 empty_files: true # This can remain true even if LINK_TYPE=COPY + parallelism_spec: constant: 5 autoscaling: true From d5e56d6fb92a58b578d33a7a7cff94bd5d5d8209 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 10 Dec 2025 08:51:39 -0700 Subject: [PATCH 144/182] reverting added row --- flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R | 6 ------ 1 file changed, 6 deletions(-) diff --git a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R index 70f77f3e6..31bb7c01a 100644 --- a/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R +++ b/flow/flow.gap.fill.nonrglr/wrap.gap.fill.nonrglr.R @@ -72,8 +72,6 @@ # changelog and author contributions / copyrights # Nora Catolico (2025-12-4) # original creation -# Nora Catolico (2025-12-10) -# added indicator column for gap filled timestamps (needed for SUNA) ############################################################################################## wrap.gap.fill.nonrglr <- function(DirIn, DirOutBase, @@ -136,7 +134,6 @@ wrap.gap.fill.nonrglr <- function(DirIn, base::stop() } df$readout_time <- base::as.POSIXlt(df$readout_time) - df$addedRow<-0 # Windows that already have at least one observation present <- unique(floor_15m(df$readout_time)) @@ -146,9 +143,6 @@ wrap.gap.fill.nonrglr <- function(DirIn, # Build blank rows for missing windows blanks <- data.frame(readout_time = missing) - if(length(blanks$readout_time)>0){ - blanks$addedRow<-1 - } # Combine and sort df_filled <- bind_rows(df, blanks) From 0c617063430caba1cbb8a156d16f0c23550c24d7 Mon Sep 17 00:00:00 2001 From: Bobby 
Hensley Date: Wed, 10 Dec 2025 09:22:26 -0700 Subject: [PATCH 145/182] Updated SUNA sensor specific flagging module to pass added null filler for completely missing burst. --- .../wrap.sunav2.quality.flags.R | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 392889f7e..497c7dd6b 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -29,8 +29,8 @@ #' #' @examples #' # Not run -# DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620" -# DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/24/nitrate_HOPB112100/sunav2/CFGLOC113620" +# DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/23/nitrate-surfacewater_SUGG103100/sunav2/CFGLOC110819" +# DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/23/nitrate-surfacewater_SUGG103100/sunav2/CFGLOC110819" # SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_logfilled.avsc'),collapse='') # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_all_flags.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") @@ -49,7 +49,10 @@ #' have same number of measurements. #' #' Bobby Hensley (2025-10-30) -#' Updated to revert over-flagged measuremnts at end of burst. +#' Updated to revert over-flagged measurements at end of burst. +#' +#' Bobby Hensley (2025-12-10) +#' Updated lamp stabilization to pass added null "filler" for completely missing bursts. #' ############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, @@ -180,14 +183,16 @@ wrap.sunav2.quality.flags <- function(DirIn, lampStabilizePoints=9 #' Hard-coded until thresholds are updated. 
sensorFlags$burstNumber<-0 #' Assumes each burst starts with a dark measurement. for(i in 2:nrow(sunaData)){ - if(is.na(sunaData[i,which(colnames(sunaData)=='light_dark_frame')])){ - sensorFlags[i,which(colnames(sensorFlags)=='burstNumber')]=0} - #' If header is missing, assumes a dark measurement starting a new burst. if(!is.na(sunaData[i,which(colnames(sunaData)=='light_dark_frame')])){ if(sunaData[i,which(colnames(sunaData)=='light_dark_frame')]==1){ sensorFlags[i,which(colnames(sensorFlags)=='burstNumber')]=sensorFlags[i-1,which(colnames(sensorFlags)=='burstNumber')]+1} else{sensorFlags[i,which(colnames(sensorFlags)=='burstNumber')]=0}} - } + } + #' If light dark header is missing, assumes value was added null "filler" for a missing burst that needs to be passed. + for(i in 1:nrow(sunaData)){ + if(is.na(sunaData[i,which(colnames(sunaData)=='light_dark_frame')])){ + sensorFlags[i,which(colnames(sensorFlags)=='burstNumber')]=9999} + } sensorFlags$nitrateLampStabilizeQF<-0 for(i in 1:nrow(sensorFlags)){ if(sensorFlags[i,which(colnames(sensorFlags)=='burstNumber')]<=lampStabilizePoints){ @@ -229,7 +234,7 @@ wrap.sunav2.quality.flags <- function(DirIn, log$debug(base::paste0('Data and flags have same number of measurements')) } - #replace with NA's so that falgged data is excluded from averaging + #replace with NA's so that flagged data is excluded from averaging dataOut<-merge(sunaData,allFlags,by='readout_time') dataOut$nitrate[dataOut$nitrateHumidityQF==1]<-NA dataOut$nitrate[dataOut$nitrateLampTempQF==1]<-NA From 62da0a93e8635e84f2b596847fb7c07383436bea Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 10 Dec 2025 12:36:01 -0700 Subject: [PATCH 146/182] updates for case of blank ucrt coef file --- .../flow.sunav2.exp.uncert.R | 8 +- .../wrap.sunav2.exp.uncert.R | 114 +++++++++--------- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- 3 files changed, 65 insertions(+), 59 deletions(-) diff --git a/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R 
b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R index 6f7619d68..18a2c920f 100644 --- a/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R +++ b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R @@ -37,10 +37,10 @@ #' log=log) #' Stepping through the code in R studio # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", -# "DirOut=~/pfs/out", -# "DirErr=~/pfs/out/errored_datums", -# "DirSubCopy=location|quality_metrics") +# arg <- c("DirIn=~/pfs/testing/nitrate-surfacewater_SUGG103100", +# "DirOut=~/pfs/testing2", +# "DirErr=~/pfs/testing2/errored_datums", +# "DirSubCopy=group|location|quality_metrics") # rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently diff --git a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R index 3ca32d30f..db6a59e6d 100644 --- a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R +++ b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R @@ -81,7 +81,7 @@ wrap.sunav2.exp.uncert <- function(DirIn, #' Read in json file of uncertainty coefficients. 
coeffileName<-base::list.files(DirInCoeff,full.names=FALSE) if(length(coeffileName)==0){ - log$error(base::paste0('Quality metrics not found in ', DirInCoeff)) + log$error(base::paste0('Uncertainty coefficient not found in ', DirInCoeff)) stop() } else { uncertCoeff<-base::try(NEONprocIS.cal::def.read.ucrt.coef.fdas(NameFile = base::paste0(DirInCoeff, '/', coeffileName)), @@ -89,66 +89,72 @@ wrap.sunav2.exp.uncert <- function(DirIn, log$debug(base::paste0('Successfully read in file: ',coeffileName)) } - #' Converts uncertainty coefficient dates to POSIXct and values to numeric - uncertCoeff$start_date <- as.POSIXct(uncertCoeff$start_date, format = "%Y-%m-%dT%H:%M:%S", tz='utc') - uncertCoeff$end_date <- as.POSIXct(uncertCoeff$end_date, format = "%Y-%m-%dT%H:%M:%S", tz='utc') - uncertCoeff$Value<-as.numeric(uncertCoeff$Value) - - #' Determines which uncertainty coefficients to be applied to each time interval. - #' (In case there are more than one on a particular day) - uncertCoeff<-uncertCoeff[order(uncertCoeff$start_date), ] - uncertCoeffA1<-uncertCoeff[(uncertCoeff$Name=="U_CVALA1"),] - statsData$uncertCoeffA1<-NA - for (i in 1:nrow(statsData)){ - for (j in 1:nrow(uncertCoeffA1)){ - if(statsData[i,which(colnames(statsData)=="startDateTime")]>=uncertCoeffA1[j,which(colnames(uncertCoeffA1)=="start_date")]){ - statsData[i,which(colnames(statsData)=="uncertCoeffA1")]=uncertCoeffA1[j,which(colnames(uncertCoeffA1)=="Value")]}}} - uncertCoeffA3<-uncertCoeff[(uncertCoeff$Name=="U_CVALA3"),] - statsData$uncertCoeffA3<-NA - for (i in 1:nrow(statsData)){ - for (j in 1:nrow(uncertCoeffA3)){ - if(statsData[i,which(colnames(statsData)=="startDateTime")]>=uncertCoeffA3[j,which(colnames(uncertCoeffA3)=="start_date")]){ - statsData[i,which(colnames(statsData)=="uncertCoeffA3")]=uncertCoeffA3[j,which(colnames(uncertCoeffA3)=="Value")]}}} - - #' Identify the column name with the mean, variance and number of points - meanName<-grep("Mean",names(statsData),value=TRUE) - 
varianceName<-grep("Variance",names(statsData),value=TRUE) - pointsName<-grep("NumPts",names(statsData),value=TRUE) - - #' Calculates calibration uncertainty. See ATBD for more details. - #' Concentrations <= 20 mg/L have fixed calibration uncertainty equal to coeffA1. - #' Concentrations greater than 20 mg/L uncertainty equals concentration times coeffA1. - #' Note stats data concentrations are in uM so threshold needs to be converted from mg/L by dividing by 0.014 (14 g/mol / 1000 ug/mg) - statsData$calUncert<-NA - for (i in 1:nrow(statsData)){ - if(is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="calUncert")]=NA} - if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){ - if(statsData[i,which(colnames(statsData)==meanName)]<=(20/0.014)){statsData[i,which(colnames(statsData)=="calUncert")]=statsData[i,which(colnames(statsData)=="uncertCoeffA1")]} - if(statsData[i,which(colnames(statsData)==meanName)]>(20/0.014)){statsData[i,which(colnames(statsData)=="calUncert")]=statsData[i,which(colnames(statsData)=="uncertCoeffA3")]} + if(length(uncertCoeff)>0){ + #' Converts uncertainty coefficient dates to POSIXct and values to numeric + uncertCoeff$start_date <- as.POSIXct(uncertCoeff$start_date, format = "%Y-%m-%dT%H:%M:%S", tz='utc') + uncertCoeff$end_date <- as.POSIXct(uncertCoeff$end_date, format = "%Y-%m-%dT%H:%M:%S", tz='utc') + uncertCoeff$Value<-as.numeric(uncertCoeff$Value) + + #' Determines which uncertainty coefficients to be applied to each time interval. 
+ #' (In case there are more than one on a particular day) + uncertCoeff<-uncertCoeff[order(uncertCoeff$start_date), ] + uncertCoeffA1<-uncertCoeff[(uncertCoeff$Name=="U_CVALA1"),] + statsData$uncertCoeffA1<-NA + for (i in 1:nrow(statsData)){ + for (j in 1:nrow(uncertCoeffA1)){ + if(statsData[i,which(colnames(statsData)=="startDateTime")]>=uncertCoeffA1[j,which(colnames(uncertCoeffA1)=="start_date")]){ + statsData[i,which(colnames(statsData)=="uncertCoeffA1")]=uncertCoeffA1[j,which(colnames(uncertCoeffA1)=="Value")]}}} + uncertCoeffA3<-uncertCoeff[(uncertCoeff$Name=="U_CVALA3"),] + statsData$uncertCoeffA3<-NA + for (i in 1:nrow(statsData)){ + for (j in 1:nrow(uncertCoeffA3)){ + if(statsData[i,which(colnames(statsData)=="startDateTime")]>=uncertCoeffA3[j,which(colnames(uncertCoeffA3)=="start_date")]){ + statsData[i,which(colnames(statsData)=="uncertCoeffA3")]=uncertCoeffA3[j,which(colnames(uncertCoeffA3)=="Value")]}}} + + #' Identify the column name with the mean, variance and number of points + meanName<-grep("Mean",names(statsData),value=TRUE) + varianceName<-grep("Variance",names(statsData),value=TRUE) + pointsName<-grep("NumPts",names(statsData),value=TRUE) + + #' Calculates calibration uncertainty. See ATBD for more details. + #' Concentrations <= 20 mg/L have fixed calibration uncertainty equal to coeffA1. + #' Concentrations greater than 20 mg/L uncertainty equals concentration times coeffA1. 
+ #' Note stats data concentrations are in uM so threshold needs to be converted from mg/L by dividing by 0.014 (14 g/mol / 1000 ug/mg) + statsData$calUncert<-NA + for (i in 1:nrow(statsData)){ + if(is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="calUncert")]=NA} + if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){ + if(statsData[i,which(colnames(statsData)==meanName)]<=(20/0.014)){statsData[i,which(colnames(statsData)=="calUncert")]=statsData[i,which(colnames(statsData)=="uncertCoeffA1")]} + if(statsData[i,which(colnames(statsData)==meanName)]>(20/0.014)){statsData[i,which(colnames(statsData)=="calUncert")]=statsData[i,which(colnames(statsData)=="uncertCoeffA3")]} + } } - } - - #' Calculates the repeatability (natural variation). See ATBD for more details. - statsData$natVar<-NA - for (i in 1:nrow(statsData)){ - if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="natVar")]= - sqrt(statsData[i,which(colnames(statsData)==varianceName)]/statsData[i,which(colnames(statsData)==pointsName)])} - } - - #' Calculates the expanded uncertainty, which is estimated as 2x the combined uncertainty. See ATBD for more details. - statsData$surfWaterNitrateExpUncert<-NA - for (i in 1:nrow(statsData)){ - if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="surfWaterNitrateExpUncert")]= - 2*sqrt(statsData[i,which(colnames(statsData)=="natVar")]+statsData[i,which(colnames(statsData)=="calUncert")])} + + #' Calculates the repeatability (natural variation). See ATBD for more details. 
+ statsData$natVar<-NA + for (i in 1:nrow(statsData)){ + if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="natVar")]= + sqrt(statsData[i,which(colnames(statsData)==varianceName)]/statsData[i,which(colnames(statsData)==pointsName)])} + } + + #' Calculates the expanded uncertainty, which is estimated as 2x the combined uncertainty. See ATBD for more details. + statsData$surfWaterNitrateExpUncert<-NA + for (i in 1:nrow(statsData)){ + if(!is.na(statsData[i,which(colnames(statsData)==meanName)])){statsData[i,which(colnames(statsData)=="surfWaterNitrateExpUncert")]= + 2*sqrt(statsData[i,which(colnames(statsData)=="natVar")]+statsData[i,which(colnames(statsData)=="calUncert")])} + } + + #' Removes unnecessary columns. + statsData<-subset(statsData,select=-c(uncertCoeffA3,uncertCoeffA1,calUncert,natVar)) + }else{ + #add required columns to stats data + statsData$surfWaterNitrateExpUncert<-NA } - #' Removes unnecessary columns. - statsData<-subset(statsData,select=-c(uncertCoeffA3,uncertCoeffA1,calUncert,natVar)) #' Write out updated stats file. rptOutStats <- try(NEONprocIS.base::def.wrte.parq(data = statsData, NameFile = base::paste0(DirOutStats,'/',statsFileName), - Schm = NULL),silent=TRUE) + Schm = SchmStats),silent=TRUE) if(class(rptOutStats)[1] == 'try-error'){ log$error(base::paste0('Cannot write updated stats to ',base::paste0(DirOutStats,'/',statsFileName),'. 
',attr(rptOutStats, "condition"))) stop() diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index c4013c22d..33912d549 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-4fa99b1 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-6fa7b75 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] From 4ec5d703d072cf51da74a4e53030bc24919bf779 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 11 Dec 2025 14:27:19 -0700 Subject: [PATCH 147/182] latest --- .../flow.insufficient.data.R | 8 +-- .../wrap.insufficient.data.R | 18 +++---- .../nitrate_analyze_pad_and_qaqc_plau.yaml | 9 ++-- pipe/nitrate/nitrate_flags_specific.yaml | 4 +- .../nitrate_level1_group_consolidate_srf.yaml | 6 +-- pipe/nitrate/nitrate_null_gap_ucrt.yaml | 50 +++++++++++-------- .../nitrate/nitrate_qm_group_and_compute.yaml | 5 ++ pipe/nitrate/nitrate_srf_loader.yaml | 2 +- .../nitrate_stats_group_and_compute.yaml | 9 ++-- 9 files changed, 63 insertions(+), 48 deletions(-) diff --git a/flow/flow.insufficient.data/flow.insufficient.data.R b/flow/flow.insufficient.data/flow.insufficient.data.R index 0feccbffe..1f14f46b0 100644 --- a/flow/flow.insufficient.data/flow.insufficient.data.R +++ b/flow/flow.insufficient.data/flow.insufficient.data.R @@ -44,10 +44,10 @@ #' SchmQMs<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse=''), #' log=log) #' Stepping through the code in R studio -# Sys.setenv(DIR_IN='~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- 
c("DirIn=~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733", -# "minPoints=10","DirOut=~/pfs/out","DirErr=~/pfs/out/errored_datums","DirSubCopy=location") +# arg <- c("DirIn=~/pfs/testing/nitrate-surfacewater_CARI102100/sunav2", +# "minPoints=10","DirOut=~/pfs/testing2","DirErr=~/pfs/out/errored_datums","DirSubCopy=location", +# "SchmQMs=~/pfs/nitrate_avro_schemas/nitrate/nitrate_insufficient_data.avsc") # rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently @@ -57,6 +57,8 @@ #' Initial creation. #' Nora Catolico (2025-11-04) #' add in copied directories +#' Nora Catolico (2025-12-11) +#' fix schema outputs ############################################################################################## options(digits.secs = 3) diff --git a/flow/flow.insufficient.data/wrap.insufficient.data.R b/flow/flow.insufficient.data/wrap.insufficient.data.R index 8a1b6c709..2cc7453d2 100644 --- a/flow/flow.insufficient.data/wrap.insufficient.data.R +++ b/flow/flow.insufficient.data/wrap.insufficient.data.R @@ -16,7 +16,7 @@ #' @param SchmStats (optional), A json-formatted character string containing the schema for the output averaged stats parquet. #' Should be the same as the input. #' -#' @param SchmQMsOut (optional), A json-formatted character string containing the schema for the output quality metrics parquet +#' @param SchmQMs (optional), A json-formatted character string containing the schema for the output quality metrics parquet #' with insufficient data quality flag added. #' #' @param DirSubCopy (optional) Character vector. 
The names of additional subfolders at @@ -40,7 +40,7 @@ # minPoints=10 # DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" # SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse='') -# SchmQMsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse='') +# SchmQMs<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") #' #' @@ -53,7 +53,7 @@ wrap.insufficient.data <- function(DirIn, minPoints, DirOutBase, SchmStats=NULL, - SchmQMsOut=NULL, + SchmQMs=NULL, DirSubCopy=NULL, log=NULL ){ @@ -127,23 +127,23 @@ wrap.insufficient.data <- function(DirIn, #' Write out stats file. rptOutStats <- try(NEONprocIS.base::def.wrte.parq(data = statsData, NameFile = base::paste0(DirOutStats,'/',statsFileName), - Schm = NULL),silent=TRUE) + Schm = SchmStats),silent=TRUE) if(class(rptOutStats)[1] == 'try-error'){ - log$error(base::paste0('Cannot write updated stats to ',base::paste0(DirOutStats,'/',statsFileName,".parquet"),'. ',attr(rptOutStats, "condition"))) + log$error(base::paste0('Cannot write updated stats to ',base::paste0(DirOutStats,'/',statsFileName),'. ',attr(rptOutStats, "condition"))) stop() } else { - log$info(base::paste0('Updated stats written successfully in ', base::paste0(DirOutStats,'/',statsFileName,".parquet"))) + log$info(base::paste0('Updated stats written successfully in ', base::paste0(DirOutStats,'/',statsFileName))) } #' Write out QMs file. rptOutQMs <- try(NEONprocIS.base::def.wrte.parq(data = qmData, NameFile = base::paste0(DirOutQMs,'/',qmFileName), - Schm = SchmQMsOut),silent=TRUE) + Schm = SchmQMs),silent=TRUE) if(class(rptOutQMs)[1] == 'try-error'){ - log$error(base::paste0('Cannot write updated QMs to ',base::paste0(DirOutQMs,'/',qmFileName,".parquet"),'. 
',attr(rptOutQMs, "condition"))) + log$error(base::paste0('Cannot write updated QMs to ',base::paste0(DirOutQMs,'/',qmFileName),'. ',attr(rptOutQMs, "condition"))) stop() } else { - log$info(base::paste0('Updated QMs written successfully in ', base::paste0(DirOutQMs,'/',qmFileName,".parquet"))) + log$info(base::paste0('Updated QMs written successfully in ', base::paste0(DirOutQMs,'/',qmFileName))) } } diff --git a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml index 77f604354..4641f1c8d 100644 --- a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml +++ b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml @@ -44,10 +44,11 @@ transform: # Environment variables for qaqc plausibility PARALLELIZATION_INTERNAL: '1' input: - pfs: - name: DATA_PATH - repo: nitrate_thresh_select_ts_pad - glob: /*/*/* + cross: + - pfs: + name: DATA_PATH + repo: nitrate_thresh_select_ts_pad + glob: /*/*/* parallelism_spec: constant: 5 autoscaling: true diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index 33912d549..921345d15 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -24,8 +24,8 @@ input: glob: /(*/*/*/*) - pfs: name: SCHEMA_FLAGS - repo: sunav2_avro_schemas - glob: /sunav2/sunav2_all_flags.avsc + repo: nitrate_avro_schemas + glob: /nitrate/nitrate_all_flags.avsc parallelism_spec: constant: 6 autoscaling: true diff --git a/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml b/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml index cd704a96a..22401d71c 100644 --- a/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml +++ b/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml @@ -71,7 +71,7 @@ transform: fgroup="${BASH_REMATCH[4]}" fname="${BASH_REMATCH[5]}" # Now get the timing index from the file name - [[ "$fname" =~ 
^${GROUP_PREFIX}_[A-Za-z0-9]+_${fyear}-${fmonth}-${fday}_[A-Za-z0-9]+_([A-Za-z0-9]+)_([A-Za-z0-9]+).parquet ]] + [[ "$fname" =~ ^${GROUP_PREFIX}_[A-Za-z0-9]+_${fyear}-${fmonth}-${fday}_[A-Za-z0-9]+_([A-Za-z0-9]+)+_([A-Za-z0-9]+)_([A-Za-z0-9]+).parquet ]] avg_int="${BASH_REMATCH[2]}" #Form the output path and link outdir="${linkdir}/v2/${GROUP_PREFIX}/${avg_int}/group=${fgroup}/ms=${fyear}-${fmonth}" @@ -118,7 +118,7 @@ transform: OUT_PATH_1: /tmp/interimA # Transfered to OUT_PATH for the first module RELATIVE_PATH_INDEX: "3" # This is shared among the 2 filter joiners and consolidation module LINK_TYPE: COPY # options are COPY or SYMLINK. Use COPY for combined modules. Also shared with 2nd & 3rd modules - LOG_LEVEL: INFO # Shared among all modules + LOG_LEVEL: DEBUG # Shared among all modules # Below are the environment variables for 2nd filter-joiner bringing in the Science review flags # Can't do this in first filter-joiner bc there are only data in the srf assignment @@ -177,7 +177,7 @@ input: join: - pfs: name: DATA_PATH - repo: nitrate_qm_group_and_compute + repo: nitrate_null_gap_ucrt glob: /(*/*/*) joinOn: $1 outer_join: true # Need outer join to pull in with or without SRFs diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index a5dcf2a11..13e307c3c 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_null_gap_ucrt transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-98108c3 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-62da0a9 cmd: - sh - "-c" @@ -27,6 +27,7 @@ transform: DirIn=/tmp/pfs/filter_joined \ DirOut=/tmp/pfs/interimUcrt \ DirErr=/pfs/out/errored_datums \ + SchmStats=$FILE_SCHEMA_UCRT \ "DirSubCopy=group|location|quality_metrics" # Run third module - insufficient data @@ -83,28 +84,33 @@ transform: LOG_LEVEL: 
INFO # Shared among all modules input: - join: + cross: - pfs: - name: QUALITY_METRICS_PATH - repo: nitrate_qm_group_and_compute - glob: /(*/*/*) - joinOn: $1 - outer_join: true # Need outer join to pull in with or without SRFs - empty_files: false # Make sure this is false for LINK_TYPE=COPY - - pfs: - name: STATISTICS_PATH - repo: nitrate_stats_group_and_compute - glob: /(*/*/*) - joinOn: $1 - outer_join: true # Need outer join to pull in with or without SRFs - empty_files: false # Make sure this is false for LINK_TYPE=COPY - - pfs: - name: GROUP_PATH - repo: nitrate_group_path - glob: /(*/*/*) - joinOn: $1 - outer_join: true # Need outer join to pull in with or without SRFs - empty_files: false # Make sure this is false for LINK_TYPE=COPY + name: FILE_SCHEMA_UCRT + repo: nitrate_avro_schemas + glob: /nitrate/nitrate_ucrt.avsc + - join: + - pfs: + name: QUALITY_METRICS_PATH + repo: nitrate_qm_group_and_compute + glob: /(*/*/*) + joinOn: $1 + outer_join: true # Need outer join to pull in with or without SRFs + empty_files: false # Make sure this is false for LINK_TYPE=COPY + - pfs: + name: STATISTICS_PATH + repo: nitrate_stats_group_and_compute + glob: /(*/*/*) + joinOn: $1 + outer_join: true # Need outer join to pull in with or without SRFs + empty_files: false # Make sure this is false for LINK_TYPE=COPY + - pfs: + name: GROUP_PATH + repo: nitrate_group_path + glob: /(*/*/*) + joinOn: $1 + outer_join: true # Need outer join to pull in with or without SRFs + empty_files: false # Make sure this is false for LINK_TYPE=COPY parallelism_spec: constant: 5 autoscaling: true diff --git a/pipe/nitrate/nitrate_qm_group_and_compute.yaml b/pipe/nitrate/nitrate_qm_group_and_compute.yaml index e40600b9b..7b65a30d2 100644 --- a/pipe/nitrate/nitrate_qm_group_and_compute.yaml +++ b/pipe/nitrate/nitrate_qm_group_and_compute.yaml @@ -20,12 +20,17 @@ transform: "WndwAgr=015" \ "WghtAlphBeta=2|1" \ Thsh=0.2 \ + FileSchmQm=$FILE_SCHEMA_QM \ 
"GrpQfAlph1=nitrate:nitrateRangeQF|nitrateStepQF|nitrateSpikeQF|nitratePersistenceQF|nitrateHumidityQF|nitrateLampTempQF|nitrateLightDarkRatioQF|nitrateLampStabilizeQF" \ "GrpQfBeta1=nitrate:nitrateRangeQF|nitrateStepQF|nitrateSpikeQF" env: LOG_LEVEL: DEBUG input: cross: + - pfs: + name: FILE_SCHEMA_QM + repo: nitrate_avro_schemas + glob: /nitrate/nitrate_quality_metrics.avsc - pfs: name: QAQC_PLAUSIBILITY_PATH repo: nitrate_flags_specific diff --git a/pipe/nitrate/nitrate_srf_loader.yaml b/pipe/nitrate/nitrate_srf_loader.yaml index dd0902e7b..7e1d59cd9 100644 --- a/pipe/nitrate/nitrate_srf_loader.yaml +++ b/pipe/nitrate/nitrate_srf_loader.yaml @@ -4,7 +4,7 @@ transform: cmd: - /bin/bash env: - GROUP_PREFIX: nitrate_ + GROUP_PREFIX: nitrate-surfacewater_ LOG_LEVEL: INFO OUT_PATH: /pfs/out image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-srf-loader:v1.0.0 diff --git a/pipe/nitrate/nitrate_stats_group_and_compute.yaml b/pipe/nitrate/nitrate_stats_group_and_compute.yaml index aa25553c6..9ec23ab58 100644 --- a/pipe/nitrate/nitrate_stats_group_and_compute.yaml +++ b/pipe/nitrate/nitrate_stats_group_and_compute.yaml @@ -22,6 +22,7 @@ transform: DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ WndwAgr=015 \ + FileSchmStat=$FILE_SCHEMA_STATS \ "TermStat1=nitrate:mean|minimum|maximum|variance|numPts|stdEr" EOF env: @@ -52,10 +53,10 @@ transform: PARALLELIZATION_INTERNAL: '5' # Option for stats module input: cross: - # - pfs: - # name: FILE_SCHEMA_STATS - # repo: nitrate_avro_schemas - # glob: /nitrate/nitrate_dp01_stats.avsc + - pfs: + name: FILE_SCHEMA_STATS + repo: nitrate_avro_schemas + glob: /nitrate/nitrate_stats.avsc - join: - pfs: name: QAQC_PATH From 38f5444391bd97de0cbd5ba2e3f10ce8644ab16f Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 11 Dec 2025 14:33:48 -0700 Subject: [PATCH 148/182] update image --- pipe/nitrate/nitrate_null_gap_ucrt.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git 
a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index 13e307c3c..057629311 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_null_gap_ucrt transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-62da0a9 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-4ec5d70 cmd: - sh - "-c" @@ -36,6 +36,7 @@ transform: DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ minPoints=10 \ + SchmQMs=$FILE_SCHEMA_INSUF \ "DirSubCopy=location" EOF @@ -89,6 +90,10 @@ input: name: FILE_SCHEMA_UCRT repo: nitrate_avro_schemas glob: /nitrate/nitrate_ucrt.avsc + - pfs: + name: FILE_SCHEMA_INSUF + repo: nitrate_avro_schemas + glob: /nitrate/nitrate_insufficient_data.avsc - join: - pfs: name: QUALITY_METRICS_PATH From e0a94827ef6ff00fb2ffeca999502b063a36b26f Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Fri, 12 Dec 2025 13:37:42 -0700 Subject: [PATCH 149/182] turn daily kafka loading back on for sunav2 so ENG can test issues --- pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml index 375692d63..bdcdf9aae 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control_kafka.yaml @@ -21,8 +21,8 @@ input: # This cron is the central driver for daily scheduled updates, such as data ingest and metadata loaders. 
- cron: name: tick - spec: "@never" - #spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) + #spec: "@never" + spec: "0 7 * * *" # Run at 00:00 MST (07:00 GMT) overwrite: true - pfs: name: SITE_FILE From 861109892c70523135d08cdd34755a38e848930e Mon Sep 17 00:00:00 2001 From: ncatolico Date: Fri, 12 Dec 2025 16:10:35 -0700 Subject: [PATCH 150/182] fix nan --- flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R index db6a59e6d..fe0704c46 100644 --- a/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R +++ b/flow/flow.sunav2.exp.uncert/wrap.sunav2.exp.uncert.R @@ -150,6 +150,8 @@ wrap.sunav2.exp.uncert <- function(DirIn, statsData$surfWaterNitrateExpUncert<-NA } + statsData$surfWaterNitrateMean[is.nan(statsData$surfWaterNitrateMean)]<-NA + #' Write out updated stats file. rptOutStats <- try(NEONprocIS.base::def.wrte.parq(data = statsData, From d8bedbee9c478efd79bb7a4697e70d194f205e90 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Dec 2025 11:23:12 -0700 Subject: [PATCH 151/182] latest --- pipe/nitrate/nitrate_null_gap_ucrt.yaml | 4 ++-- pipe/nitrate/pipe_list_nitrate.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index 057629311..c6a3977ed 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_null_gap_ucrt transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-4ec5d70 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-e0a9482 cmd: - sh - "-c" @@ -35,7 +35,7 @@ transform: DirIn=/tmp/pfs/interimUcrt \ DirOut=/pfs/out \ DirErr=/pfs/out/errored_datums \ - minPoints=10 \ + minPoints=5 \ SchmQMs=$FILE_SCHEMA_INSUF \ 
"DirSubCopy=location" diff --git a/pipe/nitrate/pipe_list_nitrate.txt b/pipe/nitrate/pipe_list_nitrate.txt index 51c5af457..bf8f2e611 100644 --- a/pipe/nitrate/pipe_list_nitrate.txt +++ b/pipe/nitrate/pipe_list_nitrate.txt @@ -6,9 +6,9 @@ nitrate_group_path.yaml nitrate_threshold.yaml nitrate_thresh_select_ts_pad.yaml nitrate_analyze_pad_and_qaqc_plau.yaml - nitrate_flags_specific.yaml nitrate_stats_group_and_compute.yaml +nitrate_null_gap_ucrt.yaml nitrate_qm_group_and_compute.yaml nitrate_level1_group_consolidate_srf.yaml nitrate_cron_monthly_and_pub_control.yaml From 32315a4162ac4e6b35886708d22a1d83af36f9ca Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Dec 2025 12:30:11 -0700 Subject: [PATCH 152/182] latest --- flow/flow.insufficient.data/flow.insufficient.data.R | 4 ++-- flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/flow/flow.insufficient.data/flow.insufficient.data.R b/flow/flow.insufficient.data/flow.insufficient.data.R index 1f14f46b0..f24efc991 100644 --- a/flow/flow.insufficient.data/flow.insufficient.data.R +++ b/flow/flow.insufficient.data/flow.insufficient.data.R @@ -45,8 +45,8 @@ #' log=log) #' Stepping through the code in R studio # log <- NEONprocIS.base::def.log.init(Lvl = "debug") -# arg <- c("DirIn=~/pfs/testing/nitrate-surfacewater_CARI102100/sunav2", -# "minPoints=10","DirOut=~/pfs/testing2","DirErr=~/pfs/out/errored_datums","DirSubCopy=location", +# arg <- c("DirIn=~/pfs/nitrate_null_gap_ucrt_updated/nitrate-surfacewater_SUGG103100", +# "minPoints=5","DirOut=~/pfs/nitrate_null_gap_ucrt_updated2","DirErr=~/pfs/out/errored_datums","DirSubCopy=location", # "SchmQMs=~/pfs/nitrate_avro_schemas/nitrate/nitrate_insufficient_data.avsc") # rm(list=setdiff(ls(),c('arg','log'))) diff --git a/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R index 18a2c920f..d8416befc 100644 --- 
a/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R +++ b/flow/flow.sunav2.exp.uncert/flow.sunav2.exp.uncert.R @@ -38,9 +38,10 @@ #' Stepping through the code in R studio # log <- NEONprocIS.base::def.log.init(Lvl = "debug") # arg <- c("DirIn=~/pfs/testing/nitrate-surfacewater_SUGG103100", -# "DirOut=~/pfs/testing2", -# "DirErr=~/pfs/testing2/errored_datums", -# "DirSubCopy=group|location|quality_metrics") +# "DirOut=~/pfs/nitrate_null_gap_ucrt_updated", +# "DirErr=~/pfs/nitrate_null_gap_ucrt_updated/errored_datums", +# "DirSubCopy=group|location|quality_metrics", +# "SchmStats=~/pfs/nitrate_avro_schemas/nitrate/nitrate_ucrt.avsc") # rm(list=setdiff(ls(),c('arg','log'))) #' @seealso None currently From 0c988f062056d71850c4eedf4ae0dff178654646 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 15 Dec 2025 15:32:50 -0700 Subject: [PATCH 153/182] publication pipelines --- pipe/nitrate/nitrate_null_gap_ucrt.yaml | 2 +- .../nitrate_pub_egress_and_publish.yaml | 173 ++++++++++++++++++ .../nitrate_pub_format_and_package.yaml | 163 +++++++++++++++++ pipe/nitrate/nitrate_pub_group.yaml | 53 ++++++ 4 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 pipe/nitrate/nitrate_pub_egress_and_publish.yaml create mode 100644 pipe/nitrate/nitrate_pub_format_and_package.yaml create mode 100644 pipe/nitrate/nitrate_pub_group.yaml diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index c6a3977ed..e24e7c9c6 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_null_gap_ucrt transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-e0a9482 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-32315a4 cmd: - sh - "-c" diff --git a/pipe/nitrate/nitrate_pub_egress_and_publish.yaml b/pipe/nitrate/nitrate_pub_egress_and_publish.yaml new file mode 100644 
index 000000000..b0defc9e5 --- /dev/null +++ b/pipe/nitrate/nitrate_pub_egress_and_publish.yaml @@ -0,0 +1,173 @@ +--- +pipeline: + name: nitrate_pub_egress_and_publish +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pub-egrs-publ:v5.0.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + curl -o $OUT_MDP_SITES https://raw.githubusercontent.com/NEONScience/NEON-IS-data-processing-inputs/refs/heads/main/mdp_sites_list.txt + + # Run first module - pub_egress (using environment variables below as input parameters) + if [[ $(echo $DATA_PATH) ]]; then + python3 -m pub_egress.pub_egress_main + fi + # If there is output, egress it + if ls $OUT_PATH/NEON.DOM.SITE* 1> /dev/null 2>&1; then + for DIR in $OUT_PATH/NEON.DOM.SITE*; do + echo "Starting non-MDP sites==================" + echo "Syncing $DIR to bucket $BUCKET_NAME" + # Parse the product + [[ "$DIR" =~ ^$OUT_PATH/(.*)$ ]] + PRODUCT="${BASH_REMATCH[1]}" + echo "PRODUCT is $PRODUCT" + rclone \ + --no-check-dest \ + --copy-links \ + --gcs-bucket-policy-only \ + --gcs-no-check-bucket \ + copy \ + "${OUT_PATH}/${PRODUCT}" \ + ":gcs://${BUCKET_NAME}/${PRODUCT}" + done + echo "============ Done for non-MDP sites" + else + echo "No pub output to egress" + fi + + # + # Do the same for MDP sites if mdp sites exists in the output + # Check to see if the output need to be sent to the staging or not + # For example, BUCKET_NAME_MDP: neon-aa-dev-md03-staging/Publication for staging SITE=MD03 + # Read mdp_site_list from githubusercontent + # + if ls $OUT_PATH_MDP/NEON.DOM.SITE* 1> /dev/null 2>&1; then + for DIR in $OUT_PATH_MDP/NEON.DOM.SITE*; do + echo "=" + echo "Starting MDP sites==================" + # Parse the product + [[ "$DIR" =~ ^$OUT_PATH_MDP/(.*)$ ]] + PRODUCT="${BASH_REMATCH[1]}" + echo "PRODUCT is $PRODUCT" + for DIR_SUB in $DIR/MD*; do + echo "DIR is 
$DIR" + echo "DIR_SUB is $DIR_SUB" + # Parse the site + [[ "$DIR_SUB" =~ ^$DIR/(.*)$ ]] + SITE="${BASH_REMATCH[1]}" + # to change to lowercase in case + # export site="${SITE,,}" + # + while read -r mdpsite prod staging bucket_name + do + if [[ $SITE == $mdpsite ]] && [[ $prod == $PROD ]] && [[ $staging == $STAGING ]]; then + BUCKET_NAME_MDP=$bucket_name + echo "$mdpsite products to $bucket_name bucket" + else echo "**** No products available for $mdpsite to $bucket_name bucket" + fi + done < $OUT_MDP_SITES + echo "Syncing $SITE products directory $DIR to mdp bucket $BUCKET_NAME_MDP" + done + rclone \ + --no-check-dest \ + --copy-links \ + --gcs-bucket-policy-only \ + --gcs-no-check-bucket \ + copy \ + "${OUT_PATH_MDP}/${PRODUCT}" \ + ":gcs://${BUCKET_NAME_MDP}/${PRODUCT}" + done + echo "============ Done for MDP sites" + cp -f "$OUT_MDP_SITES" $OUT_PATH_MDP/mdp_sites.txt + else + echo "No MDP pub output to egress" + fi + + # Run second module - pub_upload (using environment variables below as input parameters) + echo "run pub uploader ..." + export DATA_PATH=$OUT_PATH + python3 -m pub_uploader.pub_uploader_main + # Run third module - pub_sync (using environment variables below as input parameters) + echo "run pub sync sites ..." + python3 -m pub_sync.pub_sync_main + EOF + env: + LOG_LEVEL: INFO + + # Environment variables for 1st module: pub_egress. The pub bucket and egress url are specified via secrets below. + OUT_PATH: "/pfs/out" + OUT_PATH_MDP: "/pfs/out/mdp" + OUT_MDP_SITES: "/tmp/mdp_sites.txt" + # ERR_PATH can be changed, it is user specified + ERR_PATH: /pfs/out/errored_datums + STARTING_PATH_INDEX: "2" # starting path index to process pub packages. Use "2" to process the whole repo with path structure /pfs/repo_name/... + PROD: "false" # false for non-prod, true for prod + STAGING: "true" # The default is true. + + # Environment variables for 2nd module: pub_upload. 
+ # DATA_PATH is set in the code above to the output from the egress module + # Uses STARTING_PATH_INDEX above + VERSION: 'pachyderm_v1' + CHANGE_BY: pachyderm + + # Environment variables for 3rd module: pub_sync. + # Uses DATE_PATH from input spec. DATA_PATH is set in the code above to the output from the egress module + # Uses CHANGE_BY above + DATE_PATH_YEAR_INDEX: "3" + DATE_PATH_MONTH_INDEX: "4" + DATA_PATH_PRODUCT_INDEX: "3" + DATA_PATH_SITE_INDEX: "4" + DATA_PATH_DATE_INDEX: "5" + DATA_PATH_PACKAGE_INDEX: "6" + PRODUCTS: NEON.DOM.SITE.DP1.20033.001 # CAN BE MULTIPLE, COMMA-SEPARATED + SITES: "all" # CAN BE MULTIPLE, COMMA-SEPARATED array of NEON site codes. "all" will find all sites with pub records in the database. + + secrets: + - name: pdr-secret + mount_path: /var/db_secret + - name: pub-bucket + env_var: BUCKET_NAME + key: BUCKET_NAME + - name: pub-bucket + env_var: EGRESS_URL + key: EGRESS_URL + +input: + group: + - join: + - pfs: + name: DATA_PATH + repo: nitrate_pub_format_and_package + # Glob must be at each intended pub datum (i.e. 
each site/year/month), grouped by month + glob: /*/*/(*/*) + joinOn: $1 + group_by: $1 + - pfs: + name: DATE_PATH + repo: nitrate_cron_monthly_and_pub_control + glob: /(*/*) + joinOn: $1 + outer_join: True # We want to run even if no data so pub_sync runs + group_by: $1 + empty_files: true +autoscaling: true +resource_requests: + memory: 500M + cpu: .5 +resource_limits: + memory: 1G + cpu: 1.3 +sidecar_resource_requests: + memory: 2G + cpu: 1.3 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/nitrate/nitrate_pub_format_and_package.yaml b/pipe/nitrate/nitrate_pub_format_and_package.yaml new file mode 100644 index 000000000..d3db6206c --- /dev/null +++ b/pipe/nitrate/nitrate_pub_format_and_package.yaml @@ -0,0 +1,163 @@ +--- +pipeline: + name: nitrate_pub_format_and_package +transform: + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pub-grp-pack:v4.2.0 + cmd: + - sh + - "-c" + - |- + /bin/bash <<'EOF' + # Use bash-scrict mode. See http://redsymbol.net/articles/unofficial-bash-strict-mode/ + set -euo pipefail + IFS=$'\n\t' + + # Refresh interim directories with each datum (otherwise they persist and cause probs) + rm -rf $OUT_PATH_TRANSFORMER + rm -rf $OUT_PATH_PACKAGER + rm -rf $OUT_PATH_PUBFILES + rm -rf $OUT_PATH_MAINTFILES + mkdir $OUT_PATH_TRANSFORMER + mkdir $OUT_PATH_PACKAGER + mkdir $OUT_PATH_PUBFILES + mkdir $OUT_PATH_MAINTFILES + + # Set some environment variables for the pub transformer module + export DATA_PATH=$GROUPED_PATH + export OUT_PATH=$OUT_PATH_TRANSFORMER + + # Run pub_workbook_loader to load pub workbooks for pub_transformer and os_table_loader. + python3 -m pub_workbook_loader.pub_workbook_loader_main + + # Run pub_transformer (using environment variables below as input parameters) + python3 -m pub_transformer.pub_transformer_main + + # Run pub_packager. Packager needs to be run at monthly glob. Get those paths. 
+ export OUT_PATH=$OUT_PATH_PACKAGER + product_month_paths="${OUT_PATH_TRANSFORMER}/*/*/*" + for path in $product_month_paths; do + echo "Processing product-month path $path" + export DATA_PATH=$path + python3 -m pub_packager.pub_packager_main + done + + # Clean up after pub_transformer. + rm -rf $OUT_PATH_TRANSFORMER + + # Run pub_files. + export OUT_PATH=$OUT_PATH_PUBFILES + export IN_PATH=$OUT_PATH_PACKAGER + export LOCATION_PATH=$GROUPED_PATH + python3 -m pub_files.main + + # Run os_table_loader for maintenance files + export IN_PATH=$OUT_PATH_PUBFILES + export OUT_PATH=$OUT_PATH_MAINTFILES + export WORKBOOK_PATH=$WORKBOOK_PATH + export PARTIAL_TABLE_NAME="maintenance" + export FILE_TYPE="csv" + export DB_CONFIG_SOURCE="mount" + export LOG_LEVEL=$LOG_LEVEL + export INPUT_PATH_PARSE_INDEX="3" + export DATA_PRODUCT_PATH_INDEX="3" + export SITE_PATH_INDEX="4" + export YEAR_PATH_INDEX="5" + export MONTH_PATH_INDEX="6" + export PACKAGE_TYPE_PATH_INDEX="7" + python3 -m os_table_loader.publication_main + + # Run os_table_loader for SUNA clean/cal files + export IN_PATH=$OUT_PATH_MAINTFILES + export OUT_PATH="/pfs/out" + export WORKBOOK_PATH=$WORKBOOK_PATH + export PARTIAL_TABLE_NAME="sunaCleanAndCal" + export FILE_TYPE="csv" + export DB_CONFIG_SOURCE="mount" + export LOG_LEVEL=$LOG_LEVEL + export INPUT_PATH_PARSE_INDEX="3" + export DATA_PRODUCT_PATH_INDEX="3" + export SITE_PATH_INDEX="4" + export YEAR_PATH_INDEX="5" + export MONTH_PATH_INDEX="6" + export PACKAGE_TYPE_PATH_INDEX="7" + python3 -m os_table_loader.publication_main + + + EOF + env: + # Environment variables for 2nd (part A) module: pub_workbook_loader. + OUT_PATH_WORKBOOK: "/tmp/pub_workbooks" + PRODUCTS: NEON.DOM.SITE.DP1.20033.001 # Format: NEON.DOM.SITE.DPX.XXXXX.XXX,NEON.DOM.SITE.DPX.XXXXX.XXX,etc + + # Environment variables for 2nd module (part B): pub_transformer. + LOG_LEVEL: INFO + PRODUCT_INDEX: '3' # input path index of the data product identifier. Also shared with pub_packager. 
+ YEAR_INDEX: '4' + MONTH_INDEX: '5' + DAY_INDEX: '7' + DATA_TYPE_INDEX: '8' + GROUP_METADATA_DIR: group + DATA_PATH_PARSE_INDEX: '2' + OUT_PATH_TRANSFORMER: "/tmp/pub_transformer" + WORKBOOK_PATH: "/tmp/pub_workbooks" + + # Environment variables for module: pub_packager. Also uses PRODUCT_INDEX from pub_transformer. + OUT_PATH_PACKAGER: "/tmp/pub_packager" + ERR_PATH_PACKAGER: "/pfs/out/packager/errored_datums" + PUBLOC_INDEX: '6' # input path index of the pub package location (typically the site) + DATE_INDEX: '4' # Starting index of date in path (i.e. year index) + DATE_INDEX_LENGTH: '2' # length of date index for pub package (should be 2 for monthly) + SORT_INDEX: '10' # File name index corresponding to date field (delimiter = .) + + # Environment variables for module: pub_files. + OUT_PATH_PUBFILES: "/tmp/pub_files" + RELATIVE_PATH_INDEX: '3' + DB_SECRETS_PATH: /var/db_secret + GITHUB_PEM_PATH: /var/github_secret/key + GITHUB_APP_ID: '300002' + GITHUB_INSTALLATION_ID: '34765458' + GITHUB_HOST: https://api.github.com + GITHUB_REPO_OWNER: NEONScience + GITHUB_README_REPO: neon-metadata-docs + GITHUB_README_PATH: readme/template.j2 + GITHUB_EML_REPO: neon-metadata-docs + GITHUB_EML_BOILERPLATE_PATH: eml/neon_components/NEON_EML_Boilerplate.xml + GITHUB_EML_CONTACT_PATH: eml/neon_components/neon_contact.xml + GITHUB_EML_INTELLECTUAL_RIGHTS_PATH: eml/neon_components/neon_intellectualRights.xml + GITHUB_EML_UNIT_TYPES_PATH: eml/neon_components/neon_unitTypes.xml + GITHUB_EML_UNITS_PATH: eml/neon_components/NEON_units.txt + GITHUB_BRANCH: main + + # Environment variables for module: ais_maintenance table loader + OUT_PATH_MAINTFILES: "/tmp/maint_out" + + secrets: + - name: pdr-secret + mount_path: /var/db_secret + - name: github-neonscience-app-secret + mount_path: /var/github_secret + +input: + pfs: + name: GROUPED_PATH + repo: nitrate_pub_group + # Glob must be product-monthly or product-site-monthly. Product-site-month datums reduce unneccesary republication. 
+ # path structure is e.g. DP1.00098.001/2023/04/CPER/04 (product/year/month/site/day) + glob: /*/*/*/* +parallelism_spec: + constant: 5 +autoscaling: true +resource_requests: + memory: 400M + cpu: 1.2 +resource_limits: + memory: 800M + cpu: 1.2 +sidecar_resource_requests: + memory: 3.5G + cpu: 0.4 +datum_set_spec: + number: 5 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class diff --git a/pipe/nitrate/nitrate_pub_group.yaml b/pipe/nitrate/nitrate_pub_group.yaml new file mode 100644 index 000000000..c20de26de --- /dev/null +++ b/pipe/nitrate/nitrate_pub_group.yaml @@ -0,0 +1,53 @@ +--- +pipeline: + name: nitrate_pub_group +transform: +# image_pull_secrets: [battelleecology-quay-read-all-pull-secret] + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pub-grp-pack:v4.2.0 + cmd: ["/bin/bash"] + stdin: + - "#!/bin/bash" + - '# Run first module - pub_grouper (using environment variables below as input parameters)' + - python3 -m pub_grouper.pub_grouper_main + env: + # Environment variables for 1st module: pub_grouper. 
+ LOG_LEVEL: INFO + OUT_PATH: "/pfs/out" + ERR_PATH_GROUPER: "pfs/out/errored_datums" + YEAR_INDEX: '3' + GROUP_INDEX: '6' + DATA_TYPE_INDEX: '7' # Also shared with pub_transform + GROUP_METADATA_DIR: group + PUBLOC_KEY: site + LINK_TYPE: SYMLINK + +input: + join: + - pfs: + name: DATA_PATH + repo: nitrate_level1_group_consolidate_srf + # Glob should be monthly and joined with pub_control to hold pub until month is likely complete + glob: /(*/*) + joinOn: $1 + - pfs: + repo: nitrate_cron_monthly_and_pub_control + glob: /(*/*) + joinOn: $1 + empty_files: true +parallelism_spec: + constant: 2 +autoscaling: true +resource_requests: + memory: 1.8G + cpu: 1 +resource_limits: + memory: 2.5G + cpu: 1.5 +sidecar_resource_requests: + memory: 3G + cpu: 1 +datum_set_spec: + number: 1 +scheduling_spec: + node_selector: + cloud.google.com/compute-class: pach-pipeline-class From 7162cce205423a647fe893ec7ef54f7cfb4dca35 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 16 Dec 2025 08:01:01 -0700 Subject: [PATCH 154/182] latest --- pipe/sunav2/pipe_list_sunav2.txt | 3 ++- pipe/sunav2/sunav2_cron_daily_and_date_control.yaml | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pipe/sunav2/pipe_list_sunav2.txt b/pipe/sunav2/pipe_list_sunav2.txt index f520bd707..1fb91b452 100644 --- a/pipe/sunav2/pipe_list_sunav2.txt +++ b/pipe/sunav2/pipe_list_sunav2.txt @@ -14,4 +14,5 @@ sunav2_location_asset.yaml sunav2_location_asset_assignment.yaml sunav2_location_loader.yaml sunav2_location_active_dates_assignment.yaml -sunav2_location_group_and_restructure.yaml \ No newline at end of file +sunav2_location_group_and_restructure.yaml +sunav2_fill_date_gaps.yaml \ No newline at end of file diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index 4f78ec833..cca85fb70 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -11,8 +11,8 @@ 
transform: # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. OUT_PATH: /pfs/out - START_DATE: "2025-06-19" # Inclusive - END_DATE: "2025-06-29" # Inclusive + START_DATE: "2025-05-29" # Inclusive + END_DATE: "2025-07-02" # Inclusive SOURCE_TYPE: "sunav2" stdin: - "#!/bin/bash" From 736e55ee88066c87a1cb145fb05361b75ba89938 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 16 Dec 2025 10:12:00 -0700 Subject: [PATCH 155/182] change minPoints to numeric --- flow/flow.insufficient.data/wrap.insufficient.data.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flow/flow.insufficient.data/wrap.insufficient.data.R b/flow/flow.insufficient.data/wrap.insufficient.data.R index 2cc7453d2..b3c52da0f 100644 --- a/flow/flow.insufficient.data/wrap.insufficient.data.R +++ b/flow/flow.insufficient.data/wrap.insufficient.data.R @@ -37,7 +37,7 @@ #' @examples #' # Not run # DirIn<-"~/pfs/nitrate_null_gap_ucrt/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" -# minPoints=10 +# minPoints=5 # DirOut<-"~/pfs/nitrate_null_gap_ucrt_updated/2025/06/24/nitrate_CRAM103100/sunav2/CFGLOC110733" # SchmStats<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_stats.avsc'),collapse='') # SchmQMs<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_quality_metrics.avsc'),collapse='') @@ -114,6 +114,7 @@ wrap.insufficient.data <- function(DirIn, #' If the number of points is greater than or equal to the minimum required, #' revert the insufficient data quality flag (default is to apply it). 
qmData$insufficientDataQF=1 + minPoints<-as.numeric(minPoints) for(i in 1:nrow(statsData)){ if(statsData[i,which(colnames(statsData)==ptsColName)]>=minPoints){ qmData[i,which(colnames(qmData)=='insufficientDataQF')]=0}} From 1610ebb250476e7f33553b96ffb9489ea9281cf8 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Tue, 16 Dec 2025 13:30:10 -0700 Subject: [PATCH 156/182] Updated SUNA specific flag script so dark measuremts caused by lamp temperature shutoff are still cunted as part of same burst. --- .../wrap.sunav2.quality.flags.R | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 497c7dd6b..399f743ab 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -29,8 +29,8 @@ #' #' @examples #' # Not run -# DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/23/nitrate-surfacewater_SUGG103100/sunav2/CFGLOC110819" -# DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/23/nitrate-surfacewater_SUGG103100/sunav2/CFGLOC110819" +# DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/01/nitrate-surfacewater_SYCA102100/sunav2/CFGLOC111015" +# DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/01/nitrate-surfacewater_SYCA102100/sunav2/CFGLOC111015" # SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_logfilled.avsc'),collapse='') # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_all_flags.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") @@ -54,6 +54,8 @@ #' Bobby Hensley (2025-12-10) #' Updated lamp stabilization to pass added null "filler" for completely missing bursts. #' +#' Bobby Hensley (2025-12-16) +#' Updated so that dark measurements caused by lamp temperature cutoff are still counted as part of same burst. 
############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, DirOutBase, @@ -182,9 +184,10 @@ wrap.sunav2.quality.flags <- function(DirIn, # lampStabilizePoints<-lampStabilizeThreshold$number_value lampStabilizePoints=9 #' Hard-coded until thresholds are updated. sensorFlags$burstNumber<-0 #' Assumes each burst starts with a dark measurement. + #' If measurement is a light frame, or if the lamp temp caused a dark measurement, it is counted as the next measuremnt in a burst. for(i in 2:nrow(sunaData)){ if(!is.na(sunaData[i,which(colnames(sunaData)=='light_dark_frame')])){ - if(sunaData[i,which(colnames(sunaData)=='light_dark_frame')]==1){ + if(sunaData[i,which(colnames(sunaData)=='light_dark_frame')]==1|sensorFlags[i,which(colnames(sensorFlags)=='nitrateLampTempQF')]==1){ sensorFlags[i,which(colnames(sensorFlags)=='burstNumber')]=sensorFlags[i-1,which(colnames(sensorFlags)=='burstNumber')]+1} else{sensorFlags[i,which(colnames(sensorFlags)=='burstNumber')]=0}} } @@ -214,7 +217,9 @@ wrap.sunav2.quality.flags <- function(DirIn, if((allFlags[i,which(colnames(allFlags)=='burstNumber')]==0)&(allFlags[i-2,which(colnames(allFlags)=='nitratePersistenceQF')]==0)){ allFlags[i-1,which(colnames(allFlags)=='nitratePersistenceQF')]=0} } - allFlags<-allFlags[,-which(colnames(allFlags)=='burstNumber')] #' Drops this column since it's no longer needed. + + #' Drops burst number column since it's no longer needed. + allFlags<-allFlags[,-which(colnames(allFlags)=='burstNumber')] #' Removes all measurements where lamp has not stabilized from data and flag files. 
lampStabilizeFlagsOnly<-sensorFlags[,c("readout_time","nitrateLampStabilizeQF")] @@ -236,6 +241,7 @@ wrap.sunav2.quality.flags <- function(DirIn, #replace with NA's so that flagged data is excluded from averaging dataOut<-merge(sunaData,allFlags,by='readout_time') + dataOut$nitrate[dataOut$light_dark_frame==0]<-NA dataOut$nitrate[dataOut$nitrateHumidityQF==1]<-NA dataOut$nitrate[dataOut$nitrateLampTempQF==1]<-NA dataOut$nitrate[dataOut$nitrateLightDarkRatioQF==1]<-NA From ae78d24d97a77a4990be219ae837955e676ddd6c Mon Sep 17 00:00:00 2001 From: ncatolico Date: Tue, 16 Dec 2025 14:34:29 -0700 Subject: [PATCH 157/182] update images --- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- pipe/nitrate/nitrate_null_gap_ucrt.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index 921345d15..dd3c10866 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-6fa7b75 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-7eaa562 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index e24e7c9c6..3c506001f 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_null_gap_ucrt transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-32315a4 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-736e55e cmd: - sh - "-c" From f5e027bb2b002d5c523ad8dba018513b4eb1a565 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Tue, 
16 Dec 2025 15:21:36 -0700 Subject: [PATCH 158/182] Update to to SUNA flagging script to set error codes that didn't otherwise get flagged to NA. --- .../wrap.sunav2.quality.flags.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 399f743ab..2ccb4f9d8 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -29,8 +29,8 @@ #' #' @examples #' # Not run -# DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/01/nitrate-surfacewater_SYCA102100/sunav2/CFGLOC111015" -# DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/01/nitrate-surfacewater_SYCA102100/sunav2/CFGLOC111015" +# DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/02/nitrate-surfacewater_SYCA102100/sunav2/CFGLOC111015" +# DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/02/nitrate-surfacewater_SYCA102100/sunav2/CFGLOC111015" # SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_logfilled.avsc'),collapse='') # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_all_flags.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") @@ -56,6 +56,7 @@ #' #' Bobby Hensley (2025-12-16) #' Updated so that dark measurements caused by lamp temperature cutoff are still counted as part of same burst. +#' Update so that any low transmittance error codes (-1) are set to NA. 
############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, DirOutBase, @@ -239,9 +240,10 @@ wrap.sunav2.quality.flags <- function(DirIn, log$debug(base::paste0('Data and flags have same number of measurements')) } - #replace with NA's so that flagged data is excluded from averaging + #' Replace with NA's so that flagged data is excluded from averaging dataOut<-merge(sunaData,allFlags,by='readout_time') - dataOut$nitrate[dataOut$light_dark_frame==0]<-NA + dataOut$nitrate[dataOut$nitrate==-1&dataOut$nitrogen_in_nitrate==-1]<-NA #' Low transmittance error codes + dataOut$nitrate[dataOut$light_dark_frame==0]<-NA #' Measurements where lamp may have failed to turn on dataOut$nitrate[dataOut$nitrateHumidityQF==1]<-NA dataOut$nitrate[dataOut$nitrateLampTempQF==1]<-NA dataOut$nitrate[dataOut$nitrateLightDarkRatioQF==1]<-NA From c5d41aa7557a8533727c17878971d2830f4d9b2e Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Tue, 16 Dec 2025 17:22:53 -0700 Subject: [PATCH 159/182] Changed where in script low transmittence values are quality flagged. 
--- .../wrap.sunav2.quality.flags.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index 2ccb4f9d8..f3b56de34 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -179,6 +179,12 @@ wrap.sunav2.quality.flags <- function(DirIn, sensorFlags[i,which(colnames(sensorFlags)=='nitrateLightDarkRatioQF')]=1} else{sensorFlags[i,which(colnames(sensorFlags)=='nitrateLightDarkRatioQF')]=0}} } + #' Extra test so that low transmittance error codes (-1) always trigger spectral ratio test regardless of threshold + for(i in 1:nrow(sunaData)){ + if(!is.na(sunaData[i,which(colnames(sunaData)=='nitrate')])&!is.na(sunaData[i,which(colnames(sunaData)=='nitrogen_in_nitrate')])){ + if(sunaData[i,which(colnames(sunaData)=='nitrate')]==-1){ + if(sunaData[i,which(colnames(sunaData)=='nitrogen_in_nitrate')]==-1){ + sensorFlags[i,which(colnames(sensorFlags)=='nitrateLightDarkRatioQF')]=1}}}} #' Identifies light measurement number within burst and performs lamp stabilization test. 
# lampStabilizeThreshold<-sunaThresholds[(sunaThresholds$threshold_name=="Nitrates Lamp Stabilization Points"),] @@ -242,8 +248,7 @@ wrap.sunav2.quality.flags <- function(DirIn, #' Replace with NA's so that flagged data is excluded from averaging dataOut<-merge(sunaData,allFlags,by='readout_time') - dataOut$nitrate[dataOut$nitrate==-1&dataOut$nitrogen_in_nitrate==-1]<-NA #' Low transmittance error codes - dataOut$nitrate[dataOut$light_dark_frame==0]<-NA #' Measurements where lamp may have failed to turn on + dataOut$nitrate[dataOut$light_dark_frame==0]<-NA #' Set any dark measurements to NA (just in case) dataOut$nitrate[dataOut$nitrateHumidityQF==1]<-NA dataOut$nitrate[dataOut$nitrateLampTempQF==1]<-NA dataOut$nitrate[dataOut$nitrateLightDarkRatioQF==1]<-NA From 538bf66faddf83e10b7200cd3604e8f4f3b64054 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Tue, 16 Dec 2025 17:24:07 -0700 Subject: [PATCH 160/182] Updated change log. --- flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index f3b56de34..a7223765f 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -56,7 +56,7 @@ #' #' Bobby Hensley (2025-12-16) #' Updated so that dark measurements caused by lamp temperature cutoff are still counted as part of same burst. -#' Update so that any low transmittance error codes (-1) are set to NA. +#' Update so that any low transmittance error codes (-1) are always flagged. 
############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, DirOutBase, From e42f297999438a7f803f048b8d25c4e87a8025d8 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Thu, 18 Dec 2025 08:35:38 -0700 Subject: [PATCH 161/182] dates and images updated --- pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml | 2 +- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- pipe/sunav2/sunav2_cron_daily_and_date_control.yaml | 2 +- pipe/sunav2/sunav2_logjam_assign_clean_files.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml index afd5bb8ba..f3ba31e7e 100644 --- a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml +++ b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml @@ -9,7 +9,7 @@ transform: # START_DATE must be set, format "YYYY-MM" # END_DATE can be set or unset (comment or remove line to unset). If unset, end month will be last month. OUT_PATH: /pfs/out - START_MONTH: "2025-06" + START_MONTH: "2025-05" END_MONTH: "2025-06" # Inclusive. 
Run the pipeline with END_MONTH set to initialize, then comment out and update pipeline (no reprocess) to let the cron take over stdin: - "#!/bin/bash" diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index dd3c10866..530ad2646 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-7eaa562 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-538bf66 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index cca85fb70..177fec1b7 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -11,7 +11,7 @@ transform: # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka # END_DATE can be set or unset (remove line entirely to unset). If unset, end date will be yesterday. 
OUT_PATH: /pfs/out - START_DATE: "2025-05-29" # Inclusive + START_DATE: "2025-04-29" # Inclusive END_DATE: "2025-07-02" # Inclusive SOURCE_TYPE: "sunav2" stdin: diff --git a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml index 85e5c0686..e382a87d3 100644 --- a/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml +++ b/pipe/sunav2/sunav2_logjam_assign_clean_files.yaml @@ -11,7 +11,7 @@ transform: DirOut=/pfs/out DirErr=$ERR_PATH FileSchmData=$FILE_SCHEMA_DATA - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:sha-71a3f84 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-logfiles:sha-538bf66 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: From 658d7fbdef2814f291670136010cb4043428392f Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Thu, 18 Dec 2025 11:40:36 -0700 Subject: [PATCH 162/182] Update to insufficient data quality flag script that makes it the only input to the final quality flag. --- .../wrap.insufficient.data.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/flow/flow.insufficient.data/wrap.insufficient.data.R b/flow/flow.insufficient.data/wrap.insufficient.data.R index b3c52da0f..f9df78b23 100644 --- a/flow/flow.insufficient.data/wrap.insufficient.data.R +++ b/flow/flow.insufficient.data/wrap.insufficient.data.R @@ -5,7 +5,11 @@ #' Bobby Hensley \email{hensley@battelleecology.org} #' #' @description Wrapper function. Determines the number of available measurements within an -#' averaging period, and whether an insufficient data quality flag should be applied. +#' averaging period, and whether an insufficient data quality flag should be applied. +#' This insufficient data quality flag is then used to determine whether the final quality +#' flag should be applied. 
It assumes that measurements that have failed individual +#' plausibility and sensor-specific tests have been removed and the number of remaining +#' measurements available for averaging is the only factor determining the final data quality. #' #' @param DirIn Character value. The base file path to the averaged stats and quality metrics. #' @@ -48,6 +52,8 @@ #' Bobby Hensley (2025-10-31) #' Initial creation. #' +#' Bobby Hensley (2025-12-18) +#' Updated so that finalQF is solely determined by insufficientDataQF. ############################################################################################## wrap.insufficient.data <- function(DirIn, minPoints, @@ -119,10 +125,12 @@ wrap.insufficient.data <- function(DirIn, if(statsData[i,which(colnames(statsData)==ptsColName)]>=minPoints){ qmData[i,which(colnames(qmData)=='insufficientDataQF')]=0}} - #' If the insufficient data quality flag has been applied, update the final quality flag. + #' If there is insufficient data, set the final quality flag to 1. + #' If there is sufficient data, set the final quality flag to 0. for(i in 1:nrow(qmData)){ if(qmData[i,which(colnames(qmData)=='insufficientDataQF')]==1){ - qmData[i,which(colnames(qmData)==finalQfColName)]=1}} + qmData[i,which(colnames(qmData)==finalQfColName)]=1} + else{qmData[i,which(colnames(qmData)==finalQfColName)]=0}} qmData <- qmData[c(setdiff(names(qmData), finalQfColName), finalQfColName)] #' Move finalQF back to the end #' Write out stats file. From 0d132ba34fe428f7a6fa08f36e51e7d71b91cad6 Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Thu, 18 Dec 2025 13:44:11 -0700 Subject: [PATCH 163/182] Updated suna specific flag script to set lamp warmup measurements to NA rather than deleting line. 
--- .../wrap.sunav2.quality.flags.R | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index a7223765f..e857ae2ce 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -29,8 +29,8 @@ #' #' @examples #' # Not run -# DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/02/nitrate-surfacewater_SYCA102100/sunav2/CFGLOC111015" -# DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/02/nitrate-surfacewater_SYCA102100/sunav2/CFGLOC111015" +# DirIn<-"~/pfs/nitrate_analyze_pad_and_qaqc_plau/2025/06/01/nitrate-surfacewater_CRAM103100/sunav2/CFGLOC110733" +# DirOut<-"~/pfs/nitrate_sensor_flag_and_remove/2025/06/01/nitrate-surfacewater_CRAM103100/sunav2/CFGLOC110733" # SchmDataOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_logfilled.avsc'),collapse='') # SchmFlagsOut<-base::paste0(base::readLines('~/pfs/sunav2_avro_schemas/sunav2_all_flags.avsc'),collapse='') # log <- NEONprocIS.base::def.log.init(Lvl = "debug") @@ -56,7 +56,10 @@ #' #' Bobby Hensley (2025-12-16) #' Updated so that dark measurements caused by lamp temperature cutoff are still counted as part of same burst. -#' Update so that any low transmittance error codes (-1) are always flagged. +#' Updated so that any low transmittance error codes ("-1") are always flagged and set to NA. +#' +#' Bobby Hensley (2025-12-18) +#' Updated so lamp stabilization test sets failed values to NA rather than removing entire line. ############################################################################################## wrap.sunav2.quality.flags <- function(DirIn, DirOutBase, @@ -228,15 +231,13 @@ wrap.sunav2.quality.flags <- function(DirIn, #' Drops burst number column since it's no longer needed. 
allFlags<-allFlags[,-which(colnames(allFlags)=='burstNumber')] - #' Removes all measurements where lamp has not stabilized from data and flag files. - lampStabilizeFlagsOnly<-sensorFlags[,c("readout_time","nitrateLampStabilizeQF")] - sunaData<-base::merge(sunaData,lampStabilizeFlagsOnly) #' Adds lamp stabilize QF to data file - sunaData<-sunaData[(sunaData$nitrateLampStabilizeQF==0),] - allFlags<-allFlags[(allFlags$nitrateLampStabilizeQF==0),] - - #' Rearranges data file to match schema again. - sunaData<-sunaData[,-which(colnames(sunaData)=='nitrateLampStabilizeQF')] - sunaData<-sunaData[,c(2,3,1,4:37)] + #' Removes measurements where lamp has not stabilized from data and flag files. + #lampStabilizeFlagsOnly<-sensorFlags[,c("readout_time","nitrateLampStabilizeQF")] + #sunaData<-base::merge(sunaData,lampStabilizeFlagsOnly) #' Adds lamp stabilize QF to data file + #sunaData<-sunaData[(sunaData$nitrateLampStabilizeQF==0),] + #allFlags<-allFlags[(allFlags$nitrateLampStabilizeQF==0),] + #sunaData<-sunaData[,-which(colnames(sunaData)=='nitrateLampStabilizeQF')] + #sunaData<-sunaData[,c(2,3,1,4:37)] #' Checks that data file and flag file have same number of measurements if(nrow(sunaData) != nrow(allFlags)){ From 016ceb3511d87b5b048df3ca5b4feebfad0b9928 Mon Sep 17 00:00:00 2001 From: ncatolico Date: Mon, 22 Dec 2025 08:16:25 -0700 Subject: [PATCH 164/182] update image --- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- pipe/nitrate/nitrate_null_gap_ucrt.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index 530ad2646..9f2898ec5 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-538bf66 + image: 
us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-0d132ba # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] diff --git a/pipe/nitrate/nitrate_null_gap_ucrt.yaml b/pipe/nitrate/nitrate_null_gap_ucrt.yaml index 3c506001f..a99706df5 100644 --- a/pipe/nitrate/nitrate_null_gap_ucrt.yaml +++ b/pipe/nitrate/nitrate_null_gap_ucrt.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_null_gap_ucrt transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-736e55e + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-ucrt-group:sha-2025667 cmd: - sh - "-c" From 95e1912979f2de4e733be63804721ff07a9002ad Mon Sep 17 00:00:00 2001 From: Bobby Hensley Date: Mon, 22 Dec 2025 12:46:35 -0700 Subject: [PATCH 165/182] Updated light dark ratio test for dark measurements. --- flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R index e857ae2ce..b18dafcce 100644 --- a/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R +++ b/flow/flow.sunav2.quality.flags/wrap.sunav2.quality.flags.R @@ -178,8 +178,8 @@ wrap.sunav2.quality.flags <- function(DirIn, if(!is.na(sunaData[i,which(colnames(sunaData)=='dark_signal_average')])&!is.na(sunaData[i,which(colnames(sunaData)=='spec_average')])){ if(sunaData[i,which(colnames(sunaData)=='spec_average')]/sunaData[i,which(colnames(sunaData)=='dark_signal_average')] Date: Mon, 22 Dec 2025 13:49:10 -0700 Subject: [PATCH 166/182] latest --- pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml | 4 ++-- pipe/nitrate/nitrate_flags_specific.yaml | 2 +- pipe/sunav2/sunav2_cron_daily_and_date_control.yaml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git 
a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml index f3ba31e7e..dfd20350f 100644 --- a/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml +++ b/pipe/nitrate/nitrate_cron_monthly_and_pub_control.yaml @@ -9,8 +9,8 @@ transform: # START_DATE must be set, format "YYYY-MM" # END_DATE can be set or unset (comment or remove line to unset). If unset, end month will be last month. OUT_PATH: /pfs/out - START_MONTH: "2025-05" - END_MONTH: "2025-06" # Inclusive. Run the pipeline with END_MONTH set to initialize, then comment out and update pipeline (no reprocess) to let the cron take over + START_MONTH: "2019-11" + END_MONTH: "2019-11" # Inclusive. Run the pipeline with END_MONTH set to initialize, then comment out and update pipeline (no reprocess) to let the cron take over stdin: - "#!/bin/bash" - ./cron_monthly_and_pub_control/populate_pub_months.sh diff --git a/pipe/nitrate/nitrate_flags_specific.yaml b/pipe/nitrate/nitrate_flags_specific.yaml index 9f2898ec5..1d84027d9 100644 --- a/pipe/nitrate/nitrate_flags_specific.yaml +++ b/pipe/nitrate/nitrate_flags_specific.yaml @@ -2,7 +2,7 @@ pipeline: name: nitrate_flags_specific transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-0d132ba + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-sunav2-flags-specific:sha-95e1912 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret cmd: ["/bin/bash"] diff --git a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml index 177fec1b7..22eb8fbe7 100644 --- a/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml +++ b/pipe/sunav2/sunav2_cron_daily_and_date_control.yaml @@ -11,8 +11,8 @@ transform: # kafka_start_date in the site-list file is the first full day from which data began streaming via Kafka # END_DATE can be set or unset (remove line entirely to unset). 
If unset, end date will be yesterday. OUT_PATH: /pfs/out - START_DATE: "2025-04-29" # Inclusive - END_DATE: "2025-07-02" # Inclusive + START_DATE: "2019-10-29" # Inclusive + END_DATE: "2019-12-02" # Inclusive SOURCE_TYPE: "sunav2" stdin: - "#!/bin/bash" From dc902603d2be9c04d687af81b9276156e89759ac Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Mon, 22 Dec 2025 14:03:48 -0700 Subject: [PATCH 167/182] update image tags --- pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml | 2 +- pipe/sunav2/sunav2_location_active_dates_assignment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml index 4641f1c8d..9ef95d35b 100644 --- a/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml +++ b/pipe/nitrate/nitrate_analyze_pad_and_qaqc_plau.yaml @@ -4,7 +4,7 @@ pipeline: transform: image_pull_secrets: - battelleecology-quay-read-all-pull-secret - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-ts-pad-anls-qaqc-plau:v1.1.2 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-ts-pad-anls-qaqc-plau:v2.0.0 cmd: - sh - "-c" diff --git a/pipe/sunav2/sunav2_location_active_dates_assignment.yaml b/pipe/sunav2/sunav2_location_active_dates_assignment.yaml index 40eec8646..d685e6dab 100644 --- a/pipe/sunav2/sunav2_location_active_dates_assignment.yaml +++ b/pipe/sunav2/sunav2_location_active_dates_assignment.yaml @@ -14,7 +14,7 @@ transform: FileYear=$FILE_YEAR TypeFile=namedLocation "Prop=HOR|VER|name|description|site|Data Rate|active_periods" - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.0 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.3.0 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: From b1c46325a70d73ccd7b0917b3dc764742c008bfd Mon Sep 17 00:00:00 2001 From: ncatolico Date: Wed, 21 Jan 2026 
15:15:02 -0700 Subject: [PATCH 168/182] change dev to prod for log file bucket --- pipe/sunav2/sunav2_logjam_list_files.yaml | 2 +- pipe/sunav2/sunav2_logjam_load_files.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipe/sunav2/sunav2_logjam_list_files.yaml b/pipe/sunav2/sunav2_logjam_list_files.yaml index 9ed435342..68db8f5cf 100644 --- a/pipe/sunav2/sunav2_logjam_list_files.yaml +++ b/pipe/sunav2/sunav2_logjam_list_files.yaml @@ -5,7 +5,7 @@ transform: image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-logjam-loader:v2.0.0 cmd: ["/bin/bash"] env: - LOGJAM_INGEST_BUCKET: neon-nonprod-is-logjam-ingest + LOGJAM_INGEST_BUCKET: neon-is-logjam-ingest OUT_PATH: /pfs/out LOG_LEVEL: DEBUG stdin: diff --git a/pipe/sunav2/sunav2_logjam_load_files.yaml b/pipe/sunav2/sunav2_logjam_load_files.yaml index 3a2685b52..ffd1e04bf 100644 --- a/pipe/sunav2/sunav2_logjam_load_files.yaml +++ b/pipe/sunav2/sunav2_logjam_load_files.yaml @@ -13,7 +13,7 @@ transform: - python3 -m logjam_loader.load_all_logjam_files env: - LOGJAM_INGEST_BUCKET: neon-nonprod-is-logjam-ingest + LOGJAM_INGEST_BUCKET: neon-is-logjam-ingest OUT_PATH: /pfs/out LOG_LEVEL: DEBUG STARTING_PATH_INDEX: "7" @@ -24,7 +24,7 @@ input: pfs: name: IN_PATH repo: sunav2_logjam_list_files - glob: /*/*/*/logjam_dev/sunav2/ + glob: /*/*/*/logjam_prod/sunav2/ empty_files: true parallelism_spec: constant: 10 From b9de05355ed8f64d146bdea07b4d150bf7ef1710 Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Thu, 22 Jan 2026 18:18:39 -0700 Subject: [PATCH 169/182] Uncomment print statements for debugging Reverting minor change from master --- modules/calval_loader/load_all_calval_files.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/calval_loader/load_all_calval_files.py b/modules/calval_loader/load_all_calval_files.py index 923ffaaa6..959890e06 100644 --- a/modules/calval_loader/load_all_calval_files.py +++ 
b/modules/calval_loader/load_all_calval_files.py @@ -17,7 +17,7 @@ def load() -> None: env = environs.Env() ingest_bucket_name = env.str('CVAL_INGEST_BUCKET') in_path: Path = env.path('IN_PATH') - #print("IN_PATH value is:", in_path) + # print("IN_PATH value is:", in_path) output_directory: Path = env.path('OUT_PATH') sensor_type = env.str('SOURCE_TYPE') schema_name = env.str('SCHEMA_NAME',sensor_type) @@ -38,7 +38,7 @@ def load() -> None: filename = pathname.split('/') filename = filename[-1] + ".xml" - #print("FileName is: ", filename) + print("FileName is: ", filename) blob = ingest_bucket.get_blob(filename) with blob.open("r") as f: From 643709804d7ea073b6655e5751e45ffc67d07d0d Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Thu, 22 Jan 2026 18:20:43 -0700 Subject: [PATCH 170/182] Update base image version to v1.7.0 bring base image to current --- modules_combined/fill_date_gaps_nonregularized/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules_combined/fill_date_gaps_nonregularized/Dockerfile b/modules_combined/fill_date_gaps_nonregularized/Dockerfile index 1e7f6c046..6dc11c104 100644 --- a/modules_combined/fill_date_gaps_nonregularized/Dockerfile +++ b/modules_combined/fill_date_gaps_nonregularized/Dockerfile @@ -2,7 +2,7 @@ # This image combines the two modules: date_gap_filler and flow.gap.fill.nonrglr # Start with the base R image. 
-FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.6.0 +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.7.0 # maintainer handle MAINTAINER "Nora Catolico" ncatolico@battelleecology.org From 2e6eaa79aba53365843b5efd4c3f316956e63ecc Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Thu, 22 Jan 2026 18:21:12 -0700 Subject: [PATCH 171/182] Update base image version in Dockerfile bring base image to current --- flow/flow.gap.fill.nonrglr/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.gap.fill.nonrglr/Dockerfile b/flow/flow.gap.fill.nonrglr/Dockerfile index 5fde50d60..4478d4d25 100644 --- a/flow/flow.gap.fill.nonrglr/Dockerfile +++ b/flow/flow.gap.fill.nonrglr/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile for NEON IS Data Processing - flow.gap.fill.nonrglr # Start with the neon-is-base-r image. -FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.6.0 +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.7.0 ARG FLOW_DIR="./flow" ARG APP_DIR="flow.gap.fill.nonrglr" From 186f0f2988def321ef3e73456d9f12efea93d6d5 Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Thu, 22 Jan 2026 18:21:47 -0700 Subject: [PATCH 172/182] Update base image version in Dockerfile bring base image to current --- flow/flow.insufficient.data/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.insufficient.data/Dockerfile b/flow/flow.insufficient.data/Dockerfile index 106206edc..0e560f46c 100644 --- a/flow/flow.insufficient.data/Dockerfile +++ b/flow/flow.insufficient.data/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile for NEON IS Data Processing - insufficient data # Start with the NEON IS base package image -FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.7.0 
ARG FLOW_DIR="./flow" ARG APP_DIR="flow.insufficient.data" From 140f1da708590d3f3d010e0d7a73d9a49f5b4263 Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Thu, 22 Jan 2026 18:22:29 -0700 Subject: [PATCH 173/182] Update Docker base image version to v1.7.0 bring base image to current --- flow/flow.sunav2.logfiles/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.sunav2.logfiles/Dockerfile b/flow/flow.sunav2.logfiles/Dockerfile index b417409b1..74ae616be 100644 --- a/flow/flow.sunav2.logfiles/Dockerfile +++ b/flow/flow.sunav2.logfiles/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile for NEON IS Data Processing - sunav2 Logfile Processing # Start with the neon-is-base-r image. -FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.7.0 ARG FLOW_DIR="./flow" ARG APP_DIR="flow.sunav2.logfiles" From 84aa10ddf46abd814e4c7d843d16b71f4d7deac7 Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Thu, 22 Jan 2026 18:23:19 -0700 Subject: [PATCH 174/182] Update Dockerfile bring qaqc image to current --- flow/flow.sunav2.quality.flags/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.sunav2.quality.flags/Dockerfile b/flow/flow.sunav2.quality.flags/Dockerfile index 603cc646d..c1a98605b 100644 --- a/flow/flow.sunav2.quality.flags/Dockerfile +++ b/flow/flow.sunav2.quality.flags/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile for NEON IS Data Processing - sunav2 sensor-specific quality flags # Start with the neon-is-pack-qaqc-r image. 
-FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-qaqc-r:v1.1.8 +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-qaqc-r:v1.1.10 ARG FLOW_DIR="./flow" ARG APP_DIR="flow.sunav2.quality.flags" From 29c70ccfad3edbc92f5ea3d6be8b603afb343a73 Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Thu, 22 Jan 2026 18:23:55 -0700 Subject: [PATCH 175/182] Update base image version in Dockerfile bring base image to current --- modules_combined/sunav2_logs_group_and_fill/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules_combined/sunav2_logs_group_and_fill/Dockerfile b/modules_combined/sunav2_logs_group_and_fill/Dockerfile index da1a5e850..fa005b80b 100644 --- a/modules_combined/sunav2_logs_group_and_fill/Dockerfile +++ b/modules_combined/sunav2_logs_group_and_fill/Dockerfile @@ -3,7 +3,7 @@ # docker build -t neon-is-sunav2-logs-group-fill -f ./modules_combined/sunav2_logs_group_and_fill/Dockerfile . # Start with the base R image. 
-FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.5.2 +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-base-r:v1.7.0 # maintainer handle MAINTAINER "Nora Catolico" ncatolico@battelleecology.org From 89bd7ec9e8b2e447225adcff881a7e3a419d60fb Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Thu, 22 Jan 2026 18:25:27 -0700 Subject: [PATCH 176/182] Update base image version in Dockerfile bring cal image to current --- modules_combined/sunav2_ucrt_group/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules_combined/sunav2_ucrt_group/Dockerfile b/modules_combined/sunav2_ucrt_group/Dockerfile index a7ec96f17..a1a784e92 100644 --- a/modules_combined/sunav2_ucrt_group/Dockerfile +++ b/modules_combined/sunav2_ucrt_group/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile for NEON IS Data Processing - sunav2 uncertainty module combined with filter-joiner and insufficient data # Start with the cal package image. 
-FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-cal-r:v2.1.1 +FROM us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-pack-cal-r:v3.1.0 # maintainer handle MAINTAINER "Nora Catolico" ncatolico@battelleecology.org From 0c0d4dbbee3f8e5f1e8cfd50a50e97c5456aa1ea Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Fri, 23 Jan 2026 15:07:50 -0700 Subject: [PATCH 177/182] Add HQTW --- pipe/sunav2/site-list.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pipe/sunav2/site-list.json b/pipe/sunav2/site-list.json index 38df06e24..c7b5ce526 100644 --- a/pipe/sunav2/site-list.json +++ b/pipe/sunav2/site-list.json @@ -52,6 +52,10 @@ "kafka_start_date" : "2024-01-17" }, { + "site" : "HQTW", + "kafka_start_date" : "2023-06-01" + }, + { "site" : "KING", "kafka_start_date" : "2024-01-25" }, @@ -135,4 +139,4 @@ "site" : "WLOU", "kafka_start_date" : "2024-02-06" } -] \ No newline at end of file +] From cd20787265d1e19e5da3af9f188e7d9b4a084ad8 Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Fri, 23 Jan 2026 16:04:54 -0700 Subject: [PATCH 178/182] Update Docker image version in YAML configuration bring image to current --- pipe/sunav2/sunav2_calibration_group_and_convert.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml index 8aa2adb8c..6f40baa12 100644 --- a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_calibration_group_and_convert transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-grp-conv:v3.0.0 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cal-grp-conv:v3.1.0 cmd: - sh - "-c" From 97f1049d2297776a0215b44633ed070f14baf944 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Fri, 23 Jan 2026 17:14:26 
-0700 Subject: [PATCH 179/182] fix L1 bucket structure --- pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml b/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml index 22401d71c..da3e25e72 100644 --- a/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml +++ b/pipe/nitrate/nitrate_level1_group_consolidate_srf.yaml @@ -72,7 +72,7 @@ transform: fname="${BASH_REMATCH[5]}" # Now get the timing index from the file name [[ "$fname" =~ ^${GROUP_PREFIX}_[A-Za-z0-9]+_${fyear}-${fmonth}-${fday}_[A-Za-z0-9]+_([A-Za-z0-9]+)+_([A-Za-z0-9]+)_([A-Za-z0-9]+).parquet ]] - avg_int="${BASH_REMATCH[2]}" + avg_int="${BASH_REMATCH[3]}" #Form the output path and link outdir="${linkdir}/v2/${GROUP_PREFIX}/${avg_int}/group=${fgroup}/ms=${fyear}-${fmonth}" mkdir -p "${outdir}" From fc59d423648717e2bfb9a6c6bca3a36b56dabbb1 Mon Sep 17 00:00:00 2001 From: Cove Sturtevant Date: Mon, 26 Jan 2026 08:00:27 -0700 Subject: [PATCH 180/182] Update image version for sunav2_location_asset bring image to current --- pipe/sunav2/sunav2_location_asset.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_location_asset.yaml b/pipe/sunav2/sunav2_location_asset.yaml index 904c178a8..c44f94456 100644 --- a/pipe/sunav2/sunav2_location_asset.yaml +++ b/pipe/sunav2/sunav2_location_asset.yaml @@ -4,7 +4,7 @@ pipeline: transform: # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-asset-loader:v1.0.0 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-asset-loader:v1.1.0 cmd: - sh From 4931319c6fc77d4e7e48832dc3e0102a95f825e3 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Mon, 26 Jan 2026 08:32:35 -0700 Subject: [PATCH 181/182] bring images to current --- pipe/nitrate/nitrate_group_assignment.yaml | 2 +- 
.../sunav2_calibration_group_and_convert.yaml | 6 +-- .../sunav2/sunav2_calibration_list_files.yaml | 2 +- pipe/sunav2/sunav2_calibration_loader.yaml | 24 ++-------- ...nav2_location_active_dates_assignment.json | 47 ------------------- .../sunav2_location_asset_assignment.json | 47 ------------------- .../sunav2_location_asset_assignment.yaml | 2 +- 7 files changed, 10 insertions(+), 120 deletions(-) delete mode 100644 pipe/sunav2/sunav2_location_active_dates_assignment.json delete mode 100644 pipe/sunav2/sunav2_location_asset_assignment.json diff --git a/pipe/nitrate/nitrate_group_assignment.yaml b/pipe/nitrate/nitrate_group_assignment.yaml index d93a4451d..372679c61 100644 --- a/pipe/nitrate/nitrate_group_assignment.yaml +++ b/pipe/nitrate/nitrate_group_assignment.yaml @@ -13,7 +13,7 @@ transform: DirErr=$ERR_PATH FileYear=$FILE_YEAR TypeFile=group - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.1 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.3.0 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: diff --git a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml index 6f40baa12..2d445c76e 100644 --- a/pipe/sunav2/sunav2_calibration_group_and_convert.yaml +++ b/pipe/sunav2/sunav2_calibration_group_and_convert.yaml @@ -11,16 +11,16 @@ transform: # Use bash-scrict mode. 
See http://redsymbol.net/articles/unofficial-bash-strict-mode/ set -euo pipefail IFS=$'\n\t' - # + # Refresh interim directories with each datum (otherwise they persist and cause probs) rm -r -f /tmp/pfs/filter_joined rm -rf $OUT_PATH mkdir -p /tmp/pfs/filter_joined mkdir -p $OUT_PATH # R modules must have pfs in the repo structure - # + # Run filter-joiner for data (using environment variables below as input parameters) python3 -m filter_joiner.filter_joiner_main - # + # Run calibration conversion module Rscript ./flow.cal.conv.R \ DirIn=/tmp/pfs/filter_joined \ diff --git a/pipe/sunav2/sunav2_calibration_list_files.yaml b/pipe/sunav2/sunav2_calibration_list_files.yaml index 8c73e9ed9..3d42f72ca 100644 --- a/pipe/sunav2/sunav2_calibration_list_files.yaml +++ b/pipe/sunav2/sunav2_calibration_list_files.yaml @@ -2,7 +2,7 @@ pipeline: name: sunav2_calibration_list_files transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v2.3.3 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-cval-loader:v3.0.0 cmd: ["/bin/bash"] env: CVAL_INGEST_BUCKET: neon-cval diff --git a/pipe/sunav2/sunav2_calibration_loader.yaml b/pipe/sunav2/sunav2_calibration_loader.yaml index 7daab43a0..71f17324b 100644 --- a/pipe/sunav2/sunav2_calibration_loader.yaml +++ b/pipe/sunav2/sunav2_calibration_loader.yaml @@ -12,31 +12,15 @@ transform: set -euo pipefail IFS=$'\n\t' - # Refresh interim directories with each datum (otherwise they persist and cause probs) - rm -rf $OUT_PATH - mkdir -p $OUT_PATH # R modules must have pfs in the repo structure - - # NOTE: sunav2_raw is the name for both the sensor_type and the avro_schema_name in the database - # This is because the airflow transitions use the sunav2_raw L0 data - # When Airflow transitions are no longer active, best to update the sensor_type - # to "sunav2" in the database and use the commented env vars in the typical - # workflow for this pipeline (i.e. 
not loading to a tempdir and renaming) - - python3 -m calval_loader.load_all_calval_files #run the calibration loader - - if [[ -d "$OUT_PATH/$SOURCE_TYPE" ]]; then - cp -r $OUT_PATH/$SOURCE_TYPE /pfs/out/$SOURCE_TYPE_OUT - fi + python3 -m calval_loader.load_all_calval_files # run the calibration loader EOF env: CVAL_INGEST_BUCKET: neon-cval - OUT_PATH: /tmp/out LOG_LEVEL: INFO - SOURCE_TYPE: "sunav2_raw" - SOURCE_TYPE_OUT: "sunav2" - # SOURCE_TYPE: "sunav2" - # SCHEMA_NAME: "sunav2_raw" + OUT_PATH: /pfs/out + SOURCE_TYPE: "sunav2" + SCHEMA_NAME: "sunav2_raw" STARTING_PATH_INDEX: "5" secrets: - name: pdr-secret diff --git a/pipe/sunav2/sunav2_location_active_dates_assignment.json b/pipe/sunav2/sunav2_location_active_dates_assignment.json deleted file mode 100644 index 30012ffca..000000000 --- a/pipe/sunav2/sunav2_location_active_dates_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": "sunav2_location_active_dates_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.grp.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "TypeFile=namedLocation" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.3.0", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "location_loader", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "200M", - "cpu": 1 - }, - "parallelism_spec": { - "constant": "8" - } -} diff --git a/pipe/sunav2/sunav2_location_asset_assignment.json b/pipe/sunav2/sunav2_location_asset_assignment.json deleted file mode 100644 index 63121cab2..000000000 --- a/pipe/sunav2/sunav2_location_asset_assignment.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "pipeline": { - "name": 
"sunav2_location_asset_assignment" - }, - "transform": { - "cmd": [ - "Rscript", - "./flow.loc.grp.asgn.R", - "DirIn=$DIR_IN", - "DirOut=/pfs/out", - "DirErr=/pfs/out/errored_datums", - "FileYear=$FILE_YEAR", - "TypeFile=asset" - ], - "image": "us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.3.0", - "env": { - "LOG_LEVEL": "INFO" - } - }, - "input": { - "cross": [ - { - "pfs": { - "name": "DIR_IN", - "repo": "location_asset", - "glob": "/sunav2/*" - } - }, - { - "pfs": { - "name": "FILE_YEAR", - "repo": "data_source_sunav2_list_years", - "glob": "/data_year*.txt" - } - } - ] - }, - "enable_stats": false, - "standby": true, - "resource_requests": { - "memory": "210M", - "cpu": 0.3 - }, - "parallelism_spec": { - "constant": "8" - } -} diff --git a/pipe/sunav2/sunav2_location_asset_assignment.yaml b/pipe/sunav2/sunav2_location_asset_assignment.yaml index bb072eea2..209ebf699 100644 --- a/pipe/sunav2/sunav2_location_asset_assignment.yaml +++ b/pipe/sunav2/sunav2_location_asset_assignment.yaml @@ -14,7 +14,7 @@ transform: FileYear=$FILE_YEAR TypeFile=asset "Prop=HOR|VER|install_date|remove_date|name|site|Data Rate" - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.2.0 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-asgn:v1.3.0 # image_pull_secrets: # - battelleecology-quay-read-all-pull-secret env: From f6ecf1a66d4953978b1bbd05463ee7fcd3e09ba4 Mon Sep 17 00:00:00 2001 From: covesturtevant Date: Mon, 26 Jan 2026 08:41:43 -0700 Subject: [PATCH 182/182] bring image to current --- pipe/sunav2/sunav2_location_group_and_restructure.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipe/sunav2/sunav2_location_group_and_restructure.yaml b/pipe/sunav2/sunav2_location_group_and_restructure.yaml index 6a493ac1e..ab889695d 100644 --- a/pipe/sunav2/sunav2_location_group_and_restructure.yaml +++ b/pipe/sunav2/sunav2_location_group_and_restructure.yaml 
@@ -2,7 +2,7 @@ pipeline: name: sunav2_location_group_and_restructure transform: - image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-strc-comb:v1.2.0 + image: us-central1-docker.pkg.dev/neon-shared-service/neonscience/neon-is-loc-grp-strc-comb:v1.2.1 cmd: - sh - "-c"