diff --git a/bin/config_CP_input_dauer.R b/bin/config_CP_input_dauer.R deleted file mode 100644 index 17e0690..0000000 --- a/bin/config_CP_input_dauer.R +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env Rscript -library(dplyr) -library(tidyr) -library(tibble) -library(stringr) -library(readr) -library(glue) -library(purrr) -library(data.table) - -#==============================================================================# -# Arguments -#==============================================================================# -# 1 - full path to project directory -# 2 - the path to the well_mask - HARDCODE NOW -# 3 - the group argument from main.nf - default is "plate,well" -# 4 - the edited pipeline path -# 5 - the out path -args <- commandArgs(trailingOnly = TRUE) - -#==============================================================================# -# Make Metadata NEEDS TO BE ADAPATBLE TO MULTIPLE WAVELENGTHS -#==============================================================================# -projDir <- args[1] -projName <- stringr::str_extract(projDir, pattern = "([^/]+$)") - -raw_imagesDir <- paste0(projDir, "/raw_images") - -# parse file names from directory - need wavelength in file name -meta1 <- tibble::tibble(file = list.files(path = raw_imagesDir), - file_path = list.files(path = raw_imagesDir, full.names = T)) %>% - dplyr::mutate(copy = file) %>% - tidyr::separate(col = copy, into = c("date","exp","plate","mag"), sep = "-") %>% - tidyr::separate(col = mag, into = c("mag","well", "wave"), sep = "_") %>% - tidyr::separate(col = wave, into = c("wave","TIF"), sep = "[.]") %>% - dplyr::select(-TIF) %>% - dplyr::mutate(row = stringr::str_extract(well, pattern = "[A-Z]"), - col = stringr::str_extract(well, pattern = "[0-9][0-9]"), - Image_PathName_wellmask_98.png = stringr::str_replace(args[2], pattern = "([^/]+$)", replacement = ""), - Image_FileName_wellmask_98.png = stringr::str_extract(args[2], pattern = "([^/]+$)")) - -# num of wavelengths - add logic for how to make 
metadata from multiple wavelengths -n_wave <- length(unique(meta1$wave)) - -# add group -groups <- stringr::str_split(args[3], pattern = ",")[[1]] -meta1$group <- apply( meta1[, groups], 1, paste, collapse = "_") - -# add image types and set metadata names - hardcode image names - needs to be flexible for multiple pipeline profiles -meta2 <- meta1 %>% - tidyr::pivot_wider(names_from = wave, values_from = c(file, file_path)) %>% - dplyr::rename(Image_FileName_RawBF = file_w1, - Image_PathName_RawBF = file_path_w1, - Image_FileName_RawRFP = file_w2, - Image_PathName_RawRFP = file_path_w2) %>% - dplyr::mutate(Image_PathName_RawRFP = stringr::str_replace(Image_PathName_RawRFP, pattern = "([^/]+$)", replacement = ""), - Image_PathName_RawBF = stringr::str_replace(Image_PathName_RawBF, pattern = "([^/]+$)", replacement = "")) %>% - dplyr::select(Metadata_Experiment = exp, - Metadata_Date = date, - Metadata_Plate = plate, - Metadata_Well = well, - #Metadata_Column = col, - #Metadata_Row = row, - Metadata_Group = group, - Metadata_Magnification = mag, - Image_FileName_RawBF, - Image_PathName_RawBF, - Image_FileName_RawRFP, - Image_PathName_RawRFP, - Image_FileName_wellmask_98.png, - Image_PathName_wellmask_98.png) - -write.table(meta2, file = glue::glue("metadata.csv"), quote=FALSE, sep=',', row.names = F) - -#==============================================================================# -# Make groups.tsv file for runCP -#==============================================================================# -gs <- meta2 %>% - dplyr::distinct(Metadata_Group, .keep_all=T) %>% - dplyr::mutate(group = paste0("Metadata_Group=", Metadata_Group), - pipeline = args[4], - output = paste0(args[5], "/CP_output/", Metadata_Group)) %>% - dplyr::select(group:output) - -write.table(gs, file = glue::glue("groups.tsv"), quote=FALSE, sep='\t', row.names = F) - -#==============================================================================# -# Make dirs for CP output 
-#==============================================================================# -for(i in unique(gs$output)){ - dir.create(i, recursive = T) -} diff --git a/bin/config_CP_input_toxin.R b/bin/config_CP_input_toxin.R deleted file mode 100644 index a169b0b..0000000 --- a/bin/config_CP_input_toxin.R +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env Rscript -library(dplyr) -library(tidyr) -library(tibble) -library(stringr) -library(readr) -library(glue) -library(purrr) -library(data.table) - -#==============================================================================# -# Arguments -#==============================================================================# -# 1 - full path to project directory -# 2 - the path to the well_mask - HARDCODE NOW -# 3 - the group argument from main.nf - default is "plate,well" -# 4 - the edited pipeline path -# 5 - the out path -# args <- c("/projects/b1059/projects/Tim/cellprofiler-nf/projects/20220128_GWA09", "/projects/b1059/projects/Tim/cellprofiler-nf/input_data/well_masks/wellmask_98.png", -# "plate,well", "/projects/b1059/projects/Tim/cellprofiler-nf/projects/20220128_GWA09/pipelines/pipeline.cppipe", "/projects/b1059/projects/Tim/cellprofiler-nf/projects/20220128_GWA09/CP_output") -args <- commandArgs(trailingOnly = TRUE) - -#==============================================================================# -# Make Metadata NEEDS TO BE ADAPATBLE TO MULTIPLE WAVELENGTHS -#==============================================================================# -projDir <- args[1] -projName <- stringr::str_extract(projDir, pattern = "([^/]+$)") - -raw_imagesDir <- paste0(projDir, "/raw_images") - -# parse file names from directory - need wavelength in file name -meta1 <- tibble::tibble(file = list.files(path = raw_imagesDir), - file_path = list.files(path = raw_imagesDir, full.names = T)) %>% - dplyr::mutate(copy = file) %>% - tidyr::separate(col = copy, into = c("date","exp","plate","mag"), sep = "-") %>% - tidyr::separate(col = mag, into = 
c("mag","well"), sep = "_") %>% - tidyr::separate(col = well, into = c("well","TIF"), sep = "[.]") %>% - dplyr::select(-TIF) %>% - dplyr::mutate(row = stringr::str_extract(well, pattern = "[A-Z]"), - col = stringr::str_extract(well, pattern = "[0-9][0-9]"), - Image_PathName_wellmask_98.png = stringr::str_replace(args[2], pattern = "([^/]+$)", replacement = ""), - Image_FileName_wellmask_98.png = stringr::str_extract(args[2], pattern = "([^/]+$)")) - -# add group -groups <- stringr::str_split(args[3], pattern = ",")[[1]] -meta1$group <- apply( meta1[, groups], 1, paste, collapse = "_") - -meta2 <- meta1 %>% - dplyr::mutate(Image_PathName_RawBF = stringr::str_replace(file_path, pattern = "([^/]+$)", replacement = "")) %>% - dplyr::select(Metadata_Experiment = exp, - Metadata_Date = date, - Metadata_Plate = plate, - Metadata_Well = well, - Metadata_Group = group, - Metadata_Magnification = mag, - Image_FileName_RawBF = file, - Image_PathName_RawBF, - Image_FileName_wellmask_98.png, - Image_PathName_wellmask_98.png) - -write.table(meta2, file = glue::glue("metadata.csv"), quote=FALSE, sep=',', row.names = F) - -#==============================================================================# -# Make groups.tsv file for runCP -#==============================================================================# -gs <- meta2 %>% - dplyr::distinct(Metadata_Group, .keep_all=T) %>% - dplyr::mutate(group = paste0("Metadata_Group=", Metadata_Group), - pipeline = args[4], - output = paste0(args[5], "/CP_output/", Metadata_Group)) %>% - dplyr::select(group:output) - -write.table(gs, file = glue::glue("groups.tsv"), quote=FALSE, sep='\t', row.names = F) - -#==============================================================================# -# Make dirs for CP output -#==============================================================================# -for(i in unique(gs$output)){ - dir.create(i, recursive = T) -} diff --git a/bin/makeMetadata_dauer.R b/bin/makeMetadata_dauer.R new file mode 
100755 index 0000000..0cba482 --- /dev/null +++ b/bin/makeMetadata_dauer.R @@ -0,0 +1,72 @@ +#!/usr/bin/env -S Rscript --vanilla +library(dplyr) +library(tidyr) +library(tibble) +library(stringr) +library(readr) +library(purrr) +library(data.table) + +#==============================================================================# +# Arguments +#==============================================================================# +# 1 - A list of input files +# 2 - the path to the well_mask - HARDCODE NOW +# 3 - the group argument from main.nf - default is "plate,well" +# 4 - the edited pipeline path +# 5 - the out path +args <- commandArgs(trailingOnly = TRUE) +#==============================================================================# +# Make Metadata NEEDS TO BE ADAPATBLE TO MULTIPLE WAVELENGTHS +#==============================================================================# +# parse file names from directory - need wavelength in file name +meta1 <- read_delim( + args[1], + col_names = FALSE, + delim = "\t") %>% + select(file_path = X1) %>% + extract(file_path, into = "file", remove = FALSE, regex = ".*/(.*)$") %>% + extract(file, + remove = FALSE, + regex = "^(.*)-(.*)-(.*)-(.*)_(.*)_(.*)\\.(.*)$", + into = c("date","exp","plate","mag","well","wave","TIF")) %>% + select(-TIF) %>% + dplyr::mutate(row = stringr::str_extract(well, pattern = "[A-Z]"), + col = stringr::str_extract(well, pattern = "[0-9][0-9]"), + Image_PathName_wellmask_98.png = stringr::str_replace(args[2], pattern = "([^/]+$)", replacement = ""), + Image_FileName_wellmask_98.png = stringr::str_extract(args[2], pattern = "([^/]+$)")) + +# add group +groups <- stringr::str_split(args[3], pattern = ",")[[1]] +meta1$group <- apply( meta1[, groups], 1, paste, collapse = "_") + +# add image types and set metadata names - hardcode image names - needs to be flexible for multiple pipeline profiles +meta2 <- meta1 %>% + tidyr::pivot_wider(names_from = wave, values_from = c(file, file_path), ) %>% + 
dplyr::rename(Image_FileName_RawBF = file_w1, + Image_PathName_RawBF = file_path_w1, + Image_FileName_RawRFP = file_w2, + Image_PathName_RawRFP = file_path_w2) %>% + dplyr::mutate( + Image_PathName_RawRFP = stringr::str_replace( + Image_PathName_RawRFP, pattern = "([^/]+$)", replacement = "" + ), + Image_PathName_RawBF = stringr::str_replace( + Image_PathName_RawBF, pattern = "([^/]+$)", replacement = "" + ) + ) %>% + dplyr::select( + Metadata_Experiment = exp, + Metadata_Date = date, + Metadata_Plate = plate, + Metadata_Well = well, + Metadata_Group = group, + Metadata_Magnification = mag, + Image_FileName_RawBF, + Image_PathName_RawBF, + Image_FileName_RawRFP, + Image_PathName_RawRFP, + Image_FileName_wellmask_98.png, + Image_PathName_wellmask_98.png) + +write.table(meta2, file = "metadata.csv", quote=FALSE, sep=',', row.names = F) \ No newline at end of file diff --git a/bin/makeMetadata_toxin.R b/bin/makeMetadata_toxin.R new file mode 100755 index 0000000..c4e70d0 --- /dev/null +++ b/bin/makeMetadata_toxin.R @@ -0,0 +1,58 @@ +#!/usr/bin/env -S Rscript --vanilla +library(dplyr) +library(tidyr) +library(tibble) +library(stringr) +library(readr) +library(purrr) +library(data.table) + +#==============================================================================# +# Arguments +#==============================================================================# +# 1 - A list of input files +# 2 - the path to the well_mask - HARDCODE NOW +# 3 - the group argument from main.nf - default is "plate,well" +# 4 - the edited pipeline path +# 5 - the out path +args <- commandArgs(trailingOnly = TRUE) + +#==============================================================================# +# Make Metadata NEEDS TO BE ADAPATBLE TO MULTIPLE WAVELENGTHS +#==============================================================================# +# parse file names from directory - need wavelength in file name +meta1 <- read_delim( + args[1], + col_names = FALSE, + delim = "\t") %>% + 
select(file_path = X1) %>% + extract(file_path, into = "file", remove = FALSE, regex = ".*/(.*)$") %>% + extract(file, + remove = FALSE, + regex = "^(.*)-(.*)-(.*)-(.*)_(.*)\\.(.*)$", + into = c("date","exp","plate","mag","well","TIF")) %>% + select(-TIF) %>% + dplyr::mutate(row = stringr::str_extract(well, pattern = "[A-Z]"), + col = stringr::str_extract(well, pattern = "[0-9][0-9]"), + Image_PathName_wellmask_98.png = stringr::str_replace(args[2], pattern = "([^/]+$)", replacement = ""), + Image_FileName_wellmask_98.png = stringr::str_extract(args[2], pattern = "([^/]+$)")) + +# add group +groups <- stringr::str_split(args[3], pattern = ",")[[1]] +meta1$group <- apply( meta1[, groups], 1, paste, collapse = "_") + +# add image types and set metadata names - hardcode image names - needs to be flexible for multiple pipeline profiles +meta2 <- meta1 %>% + dplyr::mutate(Image_PathName_RawBF = stringr::str_replace(file_path, pattern = "([^/]+$)", replacement = "")) %>% + dplyr::select(Metadata_Experiment = exp, + Metadata_Date = date, + Metadata_Plate = plate, + Metadata_Well = well, + Metadata_Group = group, + Metadata_Magnification = mag, + Image_FileName_RawBF = file, + Image_PathName_RawBF, + Image_FileName_wellmask_98.png, + Image_PathName_wellmask_98.png) + +write.table(meta2, file = "metadata.csv", quote=FALSE, sep=',', row.names = F) \ No newline at end of file diff --git a/bin/proc_CP_output_dauer.R b/bin/proc_CP_output.R old mode 100644 new mode 100755 similarity index 74% rename from bin/proc_CP_output_dauer.R rename to bin/proc_CP_output.R index 92ad0b1..67ce906 --- a/bin/proc_CP_output_dauer.R +++ b/bin/proc_CP_output.R @@ -1,24 +1,25 @@ -# /usr/bin/Rscript +#!/usr/bin/env -S Rscript --vanilla library(fs) library(dplyr) library(tidyr) library(tibble) library(stringr) library(readr) -library(glue) library(purrr) #==============================================================================# # Arguments 
#==============================================================================# # 1 - out directory path +# 2 - project name +# 3 - run stamp args <- commandArgs(trailingOnly = TRUE) #==============================================================================# # Read CP output data #==============================================================================# # get the output for each model -dir <- glue::glue("{args[1]}/processed_data") +dir <- "processed_data" # read in files and manipulate with model_df <- dir %>% @@ -36,10 +37,7 @@ model_df_list <- split.data.frame(model_df, model_df$model) lapply(seq_along(model_df_list), function(i) assign(names(model_df_list)[i], model_df_list[[i]], envir = .GlobalEnv)) # save as R.data -proj_name <- stringr::str_extract(args[1], pattern = "[^\\/]+(?=(?:\\/[^\\/]+){1}$)") -run_stamp <- stringr::str_extract(args[1], pattern = "([^/]+$)") +proj_name <- args[1] +run_stamp <- args[2] save(list = c(ls(pattern = "model.outputs")), - file = glue::glue("{args[1]}/processed_data/{proj_name}_{run_stamp}.RData")) - -# clean up extra CP_outputs for now -system(command = glue::glue("if [ -d {args[1]}/CP_output ]; then rm -Rf {args[1]}/CP_output; fi")) \ No newline at end of file + file = paste0("processed_data/", args[1], "_", args[2], ".RData")) \ No newline at end of file diff --git a/bin/proc_CP_output_toxin.R b/bin/proc_CP_output_toxin.R deleted file mode 100644 index 92ad0b1..0000000 --- a/bin/proc_CP_output_toxin.R +++ /dev/null @@ -1,45 +0,0 @@ -# /usr/bin/Rscript -library(fs) -library(dplyr) -library(tidyr) -library(tibble) -library(stringr) -library(readr) -library(glue) -library(purrr) - -#==============================================================================# -# Arguments -#==============================================================================# -# 1 - out directory path -args <- commandArgs(trailingOnly = TRUE) - -#==============================================================================# -# Read CP 
output data -#==============================================================================# -# get the output for each model -dir <- glue::glue("{args[1]}/processed_data") - -# read in files and manipulate with -model_df <- dir %>% - fs::dir_ls(regexp = "\\.csv$") %>% # find paths to csvs in dir - purrr::map_dfr(readr::read_csv, .id = "model") %>% - dplyr::mutate(Metadata_Date = as.integer(Metadata_Date), - model = stringr::str_remove(fs::path_file(as.character(model)), pattern = ".csv"), - model = paste0(model, ".model.outputs")) %>% - dplyr::arrange(model, ImageNumber) - -# split to list -model_df_list <- split.data.frame(model_df, model_df$model) - -# export list items to global env -lapply(seq_along(model_df_list), function(i) assign(names(model_df_list)[i], model_df_list[[i]], envir = .GlobalEnv)) - -# save as R.data -proj_name <- stringr::str_extract(args[1], pattern = "[^\\/]+(?=(?:\\/[^\\/]+){1}$)") -run_stamp <- stringr::str_extract(args[1], pattern = "([^/]+$)") -save(list = c(ls(pattern = "model.outputs")), - file = glue::glue("{args[1]}/processed_data/{proj_name}_{run_stamp}.RData")) - -# clean up extra CP_outputs for now -system(command = glue::glue("if [ -d {args[1]}/CP_output ]; then rm -Rf {args[1]}/CP_output; fi")) \ No newline at end of file diff --git a/input_data/CP_pipelines/dauer-nf.cppipe b/input_data/CP_pipelines/dauer-nf.cppipe old mode 100644 new mode 100755 index efc0392..0536a9c --- a/input_data/CP_pipelines/dauer-nf.cppipe +++ b/input_data/CP_pipelines/dauer-nf.cppipe @@ -6,8 +6,8 @@ ModuleCount:31 HasImagePlaneDetails:False LoadData:[module_num:1|svn_version:'Unknown'|variable_revision_number:6|show_window:True|notes:[]|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] - Input data file location:Elsewhere...|METADATA_DIR - Name of the file:METADATA_CSV_FILE + Input data file location:Elsewhere...|. 
+ Name of the file:metadata.csv Load images based on this data?:Yes Base image location:Elsewhere...| Process just a range of rows?:No @@ -167,8 +167,8 @@ UntangleWorms:[module_num:10|svn_version:'Unknown'|variable_revision_number:2|sh Overlap style:Both Name the output overlapping worm objects:dauerMod_OverlappingWorms Name the output non-overlapping worm objects:dauerMod_NonOverlappingWorms - Training set file location:Elsewhere...|WORM_MODEL_DIR - Training set file name:MODEL1_XML_FILE + Training set file location:Elsewhere...|input_data/worm_models + Training set file name:dauerMod.xml Use training set weights?:Yes Overlap weight:5.0 Leftover weight:10.0 @@ -199,8 +199,8 @@ UntangleWorms:[module_num:11|svn_version:'Unknown'|variable_revision_number:2|sh Overlap style:Both Name the output overlapping worm objects:nondauerMod_OverlappingWorms Name the output non-overlapping worm objects:nondauerMod_NonOverlappingWorms - Training set file location:Elsewhere...|WORM_MODEL_DIR - Training set file name:MODEL2_XML_FILE + Training set file location:Elsewhere...|input_data/worm_models + Training set file name:nondauerMod.xml Use training set weights?:Yes Overlap weight:5.0 Leftover weight:10.0 @@ -230,8 +230,8 @@ StraightenWorms:[module_num:12|svn_version:'Unknown'|variable_revision_number:3| Select the input untangled worm objects:dauerMod_NonOverlappingWorms Name the output straightened worm objects:dauerMod_StraightenedWorms Worm width:20 - Training set file location:Elsewhere...|WORM_MODEL_DIR - Training set file name:MODEL1_XML_FILE + Training set file location:Elsewhere...|input_data/worm_models + Training set file name:dauerMod.xml Image count:1 Measure intensity distribution?:Yes Number of transverse segments:1 @@ -245,8 +245,8 @@ StraightenWorms:[module_num:13|svn_version:'Unknown'|variable_revision_number:3| Select the input untangled worm objects:nondauerMod_NonOverlappingWorms Name the output straightened worm objects:nondauerMod_StraightenedWorms Worm 
width:20 - Training set file location:Elsewhere...|WORM_MODEL_DIR - Training set file name:MODEL2_XML_FILE + Training set file location:Elsewhere...|input_data/worm_models + Training set file name:nondauerMod.xml Image count:1 Measure intensity distribution?:Yes Number of transverse segments:1 diff --git a/input_data/CP_pipelines/toxin-nf.cppipe b/input_data/CP_pipelines/toxin-nf.cppipe old mode 100644 new mode 100755 index 070fb30..7e580b7 --- a/input_data/CP_pipelines/toxin-nf.cppipe +++ b/input_data/CP_pipelines/toxin-nf.cppipe @@ -6,8 +6,8 @@ ModuleCount:22 HasImagePlaneDetails:False LoadData:[module_num:1|svn_version:'Unknown'|variable_revision_number:6|show_window:False|notes:[]|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] - Input data file location:Elsewhere...|METADATA_DIR - Name of the file:METADATA_CSV_FILE + Input data file location:Elsewhere...|. + Name of the file:metadata.csv Load images based on this data?:Yes Base image location:Default Input Folder| Process just a range of rows?:No @@ -159,8 +159,8 @@ UntangleWorms:[module_num:9|svn_version:'Unknown'|variable_revision_number:2|sho Overlap style:Both Name the output overlapping worm objects:L4_N2_HB101_100w_OverlappingWorms Name the output non-overlapping worm objects:L4_N2_HB101_100w_NonOverlappingWorms - Training set file location:Elsewhere...|WORM_MODEL_DIR - Training set file name:MODEL1_XML_FILE + Training set file location:Elsewhere...|input_data/worm_models + Training set file name:L4_N2_HB101_100w.xml Use training set weights?:Yes Overlap weight:5.0 Leftover weight:10.0 @@ -191,8 +191,8 @@ UntangleWorms:[module_num:10|svn_version:'Unknown'|variable_revision_number:2|sh Overlap style:Both Name the output overlapping worm objects:L2L3_N2_HB101_100w_OverlappingWorms Name the output non-overlapping worm objects:L2L3_N2_HB101_100w_NonOverlappingWorms - Training set file location:Elsewhere...|WORM_MODEL_DIR - Training set file name:MODEL2_XML_FILE + Training set file 
location:Elsewhere...|input_data/worm_models + Training set file name:L2L3_N2_HB101_100w.xml Use training set weights?:Yes Overlap weight:5.0 Leftover weight:10.0 @@ -223,8 +223,8 @@ UntangleWorms:[module_num:11|svn_version:'Unknown'|variable_revision_number:2|sh Overlap style:Both Name the output overlapping worm objects:L1_N2_HB101_100w_OverlappingWorms Name the output non-overlapping worm objects:L1_N2_HB101_100w_NonOverlappingWorms - Training set file location:Elsewhere...|WORM_MODEL_DIR - Training set file name:MODEL3_XML_FILE + Training set file location:Elsewhere...|input_data/worm_models + Training set file name:L1_N2_HB101_100w.xml Use training set weights?:Yes Overlap weight:5.0 Leftover weight:10.0 @@ -255,8 +255,8 @@ UntangleWorms:[module_num:12|svn_version:'Unknown'|variable_revision_number:2|sh Overlap style:Both Name the output overlapping worm objects:MDHD_OverlappingWorms Name the output non-overlapping worm objects:MDHD_NonOverlappingWorms - Training set file location:Elsewhere...|WORM_MODEL_DIR - Training set file name:MODEL4_XML_FILE + Training set file location:Elsewhere...|input_data/worm_models + Training set file name:MDHD.xml Use training set weights?:Yes Overlap weight:5.0 Leftover weight:10.0 diff --git a/main.nf b/main.nf index 6ad3fdc..ed08ab4 100644 --- a/main.nf +++ b/main.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow -// Use DSL2 -nextflow.preview.dsl=2 +// Using DSL-2 +nextflow.enable.dsl=2 // QUEST nextflow version message if( !nextflow.version.matches('>20.0') ) { @@ -10,63 +10,66 @@ if( !nextflow.version.matches('>20.0') ) { exit 1 } +// INCLUDE modules +// include { +// dauer_workflow +// } from './modules/dauerWorkflow.nf' +// include { +// toxin_workflow +// } from './modules/toxinWorkflow.nf' + /* ~ ~ ~ > * PARAMETERS SETUP */ // Variables date = new Date().format('yyyyMMdd') -// model_name = "NonOverlappingWorms" // JUST FOR NOW // Setup pipeline parameter params.pipeline = null -if("${params.pipeline}" == "dauer") { - pipe = 
"dauer-nf" - worm_model1 = "dauerMod.xml" - worm_model2 = "nondauerMod.xml" - model_name1 = "dauerMod_NonOverlappingWorms" - model_name2 = "nondauerMod_NonOverlappingWorms" -} else if( "${params.pipeline}" == "toxin" ) { - pipe = "toxin-nf" - worm_model1 = "L4_N2_HB101_100w.xml" - worm_model2 = "L2L3_N2_HB101_100w.xml" - worm_model3 = "L1_N2_HB101_100w.xml" - worm_model4 = "MDHD.xml" - model_name1 = "L4_N2_HB101_100w_NonOverlappingWorms" - model_name2 = "L2L3_N2_HB101_100w_NonOverlappingWorms" - model_name3 = "L1_N2_HB101_100w_NonOverlappingWorms" - model_name4 = "MDHD_NonOverlappingWorms" -} else if(!params.pipeline) { +if(!params.pipeline) { println """ Error: pipeline parameter not specified. Please enter --pipeline dauer or --pipeline toxin in command. """ System.exit(1) -} else if("${params.pipeline}" != "toxin" || "${params.pipeline}" != "dauer" ) { +} else if(params.pipeline != "toxin" && params.pipeline != "dauer" ) { println """ Error: pipeline (${params.pipeline}) does not match expected value. Please enter either dauer or toxin. 
""" System.exit(1) } -// Configure other parameters +date = new Date().format('yyyyMMdd') + +// Help: params.help = null -params.debug = null +// project directory params.project = null +// Groups to use params.groups = "plate,well" +// directory with input data params.data_dir = "${workflow.projectDir}/input_data" // this is different for gcp -params.bin_dir = "${workflow.projectDir}/bin" // this is different for gcp -params.well_mask = "${params.data_dir}/well_masks/wellmask_98.png" +// mask for the well +params.well_mask = "input_data/well_masks/wellmask_98.png" +// location to put output files params.out = "${params.project}/Analysis-${date}" -params.raw_pipe_dir = "${params.data_dir}/CP_pipelines" -params.raw_pipe = "${params.raw_pipe_dir}/${pipe}.cppipe" -params.edited_pipe = "${params.out}/pipeline/pipeline.cppipe" -params.metadata_dir = "${params.out}/metadata" -params.metadata = "metadata.csv" -params.worm_model_dir = "${params.data_dir}/worm_models" -/* -~ ~ ~ > * LOG AND HELP MESSAGE SETUP -*/ +params.project_name = params.project.split("/").last() +params.project_tag = "Analysis-${date}" + +params.container__general = "docker://andersenlab/nemascan:20220411181933701519" +params.container__cellprofiler = "cellprofiler/cellprofiler:4.2.1" + +include { + listFiles + makeMetadata +} from './modules/process_input.nf' +include { + runCP +} from './modules/cellprofiler.nf' +include { + proc_CP_output +} from './modules/process_output.nf' if (!params.help) { log.info ''' @@ -74,7 +77,8 @@ C E L L P R O F I L E R - N F P I P E L I N E =============================================== ''' log.info "" - log.info "Project = ${params.project}" + log.info "Projcect = ${params.project_name}" + log.info "Project Dir = ${params.project}" log.info "CP pipeline = ${params.pipeline}" log.info "Groups = ${params.groups}" log.info "Output = ${params.out}" @@ -93,256 +97,37 @@ C E L L P R O F I L E R - N F P I P E L I N E log.info "--pipeline The CP pipeline to use: toxin, 
dauer" log.info "" log.info "Optional arguments:" - log.info "--groups comma separated metadata groupings for CellProfiler, default is plate,well" - log.info "--outdir Output directory to place files, default is project/Analysis-{current date}" + log.info "--project_name A name for the project. Default detects from the project directory." + log.info "--groups Comma separated metadata groupings for CellProfiler, default is plate,well" + log.info "--out Output directory to place files, default is project/Analysis-{current date}" log.info "--help This usage statement." exit 1 } -/* -~ ~ ~ > * WORKFLOW -*/ - workflow { - - if("${params.pipeline}" == "dauer") { - // configure inputs for CellProfiler ONLY FOR DAUER NOW NEED TO CHANGE MODELS IF OTHER - config_cp = Channel.fromPath("${params.raw_pipe}") - .combine(Channel.from("${params.metadata_dir}")) - .combine(Channel.from("${params.metadata}")) - .combine(Channel.from("${params.worm_model_dir}")) - .combine(Channel.from(worm_model1)) // edit here - .combine(Channel.from(worm_model2)) // edit here - .combine(Channel.fromPath("${params.bin_dir}/config_CP_input_dauer.R")) - .combine(Channel.from("${params.project}")) - .combine(Channel.from("${params.well_mask}")) - .combine(Channel.from("${params.groups}")) - .combine(Channel.from("${params.edited_pipe}")) - .combine(Channel.from("${params.out}")) | config_CP_input_dauer - //.view() - - // Run CellProfiler - groups = config_CP_input_dauer.out.groups_file - .splitCsv(header:true, sep: "\t") - .map { row -> - [row.group, file("${row.pipeline}"), file("${row.output}")] - } - //.view() - - runCP(groups) - - // Preprocess CellProfiler output files - proc_cp = runCP.out.cp_output - .last() // This ensures that all items are emitted from runCP - .combine(Channel.from("${params.out}")) - .combine(Channel.from(model_name1)) // HARDCODE VARIABLE NOW MAKE DEPENDENT ON PROFILE - .combine(Channel.from(model_name2)) // HARDCODE VARIABLE NOW MAKE DEPENDENT ON PROFILE - 
.combine(Channel.fromPath("${params.bin_dir}/proc_CP_output_dauer.R")) - //.view() - - proc_CP_output_dauer(proc_cp) - } - - if("${params.pipeline}" == "toxin") { - // configure inputs for CellProfiler ONLY FOR DAUER NOW NEED TO CHANGE MODELS IF OTHER - config_cp = Channel.fromPath("${params.raw_pipe}") - .combine(Channel.from("${params.metadata_dir}")) - .combine(Channel.from("${params.metadata}")) - .combine(Channel.from("${params.worm_model_dir}")) - .combine(Channel.from(worm_model1)) // edit here - .combine(Channel.from(worm_model2)) // edit here - .combine(Channel.from(worm_model3)) // edit here - .combine(Channel.from(worm_model4)) // edit here - .combine(Channel.fromPath("${params.bin_dir}/config_CP_input_toxin.R")) - .combine(Channel.from("${params.project}")) - .combine(Channel.from("${params.well_mask}")) - .combine(Channel.from("${params.groups}")) - .combine(Channel.from("${params.edited_pipe}")) - .combine(Channel.from("${params.out}")) | config_CP_input_toxin - //.view() - - // Run CellProfiler - groups = config_CP_input_toxin.out.groups_file - .splitCsv(header:true, sep: "\t") - .map { row -> - [row.group, file("${row.pipeline}"), file("${row.output}")] - } - //.view() - - runCP(groups) - - // Preprocess CellProfiler output files - proc_cp = runCP.out.cp_output - .last() // This ensures that all items are emitted from runCP - .combine(Channel.from("${params.out}")) - .combine(Channel.from(model_name1)) // HARDCODE VARIABLE NOW MAKE DEPENDENT ON PROFILE - .combine(Channel.from(model_name2)) // HARDCODE VARIABLE NOW MAKE DEPENDENT ON PROFILE - .combine(Channel.from(model_name3)) // HARDCODE VARIABLE NOW MAKE DEPENDENT ON PROFILE - .combine(Channel.from(model_name4)) // HARDCODE VARIABLE NOW MAKE DEPENDENT ON PROFILE - .combine(Channel.fromPath("${params.bin_dir}/proc_CP_output_toxin.R")) | proc_CP_output_toxin - //.view() - } -} - -/* -~ ~ ~ > * CONFIGURE FILES FOR CELLPROFILER -*/ - -process config_CP_input_dauer { - publishDir 
"${params.out}/pipeline", mode: 'copy', pattern: "*.cppipe" - publishDir "${params.out}/metadata", mode: 'copy', pattern: "metadata.csv" - publishDir "${params.out}/groups", mode: 'copy', pattern: "groups.tsv" - - input: - tuple file(raw_pipe), val(meta_dir), val(meta), val(model_dir), val(model1), val(model2), - file(config_script), val(project), val(mask), val(group), val(edited_pipe), val(out) - - output: - path "*.cppipe", emit: cp_pipeline_file - path "metadata.csv", emit: metadata_file - path "groups.tsv", emit: groups_file - - - """ - # Configure the raw pipeline for CellProfiler - awk '{gsub(/METADATA_DIR/,"${meta_dir}"); print}' ${raw_pipe} | \\ - awk '{gsub(/METADATA_CSV_FILE/,"${meta}"); print}' | \\ - awk '{gsub(/WORM_MODEL_DIR/,"${model_dir}"); print}' | \\ - awk '{gsub(/MODEL1_XML_FILE/,"${model1}"); print}' | \\ - awk '{gsub(/MODEL2_XML_FILE/,"${model2}"); print}' > pipeline.cppipe - - # Configure metadata and groups for CellProfiller with config_CP_input.R - Rscript --vanilla ${config_script} ${project} ${mask} ${group} ${edited_pipe} ${out} - - """ -} - -process config_CP_input_toxin { - publishDir "${params.out}/pipeline", mode: 'copy', pattern: "*.cppipe" - publishDir "${params.out}/metadata", mode: 'copy', pattern: "metadata.csv" - publishDir "${params.out}/groups", mode: 'copy', pattern: "groups.tsv" - - input: - tuple file(raw_pipe), val(meta_dir), val(meta), val(model_dir), val(model1), val(model2), val(model3), val(model4), - file(config_script), val(project), val(mask), val(group), val(edited_pipe), val(out) - - output: - path "*.cppipe", emit: cp_pipeline_file - path "metadata.csv", emit: metadata_file - path "groups.tsv", emit: groups_file - - - """ - # Configure the raw pipeline for CellProfiler - awk '{gsub(/METADATA_DIR/,"${meta_dir}"); print}' ${raw_pipe} | \\ - awk '{gsub(/METADATA_CSV_FILE/,"${meta}"); print}' | \\ - awk '{gsub(/WORM_MODEL_DIR/,"${model_dir}"); print}' | \\ - awk '{gsub(/MODEL1_XML_FILE/,"${model1}"); print}' | \\ - 
awk '{gsub(/MODEL2_XML_FILE/,"${model2}"); print}' | \\ - awk '{gsub(/MODEL3_XML_FILE/,"${model3}"); print}' | \\ - awk '{gsub(/MODEL4_XML_FILE/,"${model4}"); print}' > pipeline.cppipe - - # Configure metadata and groups for CellProfiller with config_CP_input.R - Rscript --vanilla ${config_script} ${project} ${mask} ${group} ${edited_pipe} ${out} - - """ -} - -/* -~ ~ ~ > * RUN CELLPROFILER -*/ - -process runCP { - - label "cellpro" - - input: - tuple val(group), file(pipeline), file(output) - - output: - stdout emit: cp_output //tuple file("*.csv"), file("*.png"), emit: cp_output - - """ - # Run cellprofiler headless - cellprofiler -c -r -p ${pipeline} \ - -g ${group} \ - -o ${output} - - """ + in_data_dir = Channel.fromPath("${params.project}") + + listFiles(in_data_dir) + makeMetadata(listFiles.out) + groups = makeMetadata.out + .splitCsv(header: true) + .map { + row -> ["${row.Metadata_Group}", + "input_data/CP_pipelines/${params.pipeline}-nf.cppipe"] + } + runCP_input = groups + .combine(in_data_dir) + .combine( + Channel.fromPath( + "${workflow.projectDir}/input_data" + ) + ) + .combine(makeMetadata.out) + runCP(runCP_input) + concat_outputs = runCP.out.output_files.toList() + proc_CP_output(concat_outputs) } -/* -~ ~ ~ > * PROCESS CELLPROFILER OUTPUTS -*/ - -process proc_CP_output_dauer { - - //publishDir "${params.out}/processed_data", mode: 'copy', pattern: "*.RData" - - input: - tuple val(cp_output), val(out_dir), val(model_name1), val(model_name2), file(proc_CP_out_script) - - output: - //path "*.RData", emit: cp_out_dat - - """ - # remove exisitng directories if present and make fresh - if [ -d ${out_dir}/processed_data ]; then rm -Rf ${out_dir}/processed_data; fi - mkdir ${out_dir}/processed_data - if [ -d ${out_dir}/processed_images ]; then rm -Rf ${out_dir}/processed_images; fi - mkdir ${out_dir}/processed_images - # find .csv files, concatenate them, and write new file - find ${out_dir}/CP_output -type f -name '${model_name1}.csv' -print0 | xargs -0 
awk 'FNR>1 || NR==1 {print}' > ${out_dir}/processed_data/${model_name1}.csv - - # find .csv files, concatenate them, and write new file - find ${out_dir}/CP_output -type f -name '${model_name2}.csv' -print0 | xargs -0 awk 'FNR>1 || NR==1 {print}' > ${out_dir}/processed_data/${model_name2}.csv - - # move all the output images to process_images directory END WITH /? - find ${out_dir}/CP_output -name '*.png' -exec mv {} ${out_dir}/processed_images \\; - # Process the CellProfiler output with proc_CP_output.R - Rscript --vanilla ${proc_CP_out_script} ${out_dir} - - """ -} - -process proc_CP_output_toxin { - - //publishDir "${params.out}/processed_data", mode: 'copy', pattern: "*.RData" - - input: - tuple val(cp_output), val(out_dir), val(model_name1), val(model_name2), - val(model_name3), val(model_name4), file(proc_CP_out_script) - - output: - //path "*.RData", emit: cp_out_dat - - """ - # remove exisitng directories if present and make fresh - if [ -d ${out_dir}/processed_data ]; then rm -Rf ${out_dir}/processed_data; fi - mkdir ${out_dir}/processed_data - if [ -d ${out_dir}/processed_images ]; then rm -Rf ${out_dir}/processed_images; fi - mkdir ${out_dir}/processed_images - # find .csv files, concatenate them, and write new file - find ${out_dir}/CP_output -type f -name '${model_name1}.csv' -print0 | xargs -0 awk 'FNR>1 || NR==1 {print}' > ${out_dir}/processed_data/${model_name1}.csv - - # find .csv files, concatenate them, and write new file - find ${out_dir}/CP_output -type f -name '${model_name2}.csv' -print0 | xargs -0 awk 'FNR>1 || NR==1 {print}' > ${out_dir}/processed_data/${model_name2}.csv - # find .csv files, concatenate them, and write new file - find ${out_dir}/CP_output -type f -name '${model_name3}.csv' -print0 | xargs -0 awk 'FNR>1 || NR==1 {print}' > ${out_dir}/processed_data/${model_name3}.csv - # find .csv files, concatenate them, and write new file - find ${out_dir}/CP_output -type f -name '${model_name4}.csv' -print0 | xargs -0 awk 'FNR>1 || NR==1 
{print}' > ${out_dir}/processed_data/${model_name4}.csv - - # move all the output images to process_images directory END WITH /? - find ${out_dir}/CP_output -name '*.png' -exec mv {} ${out_dir}/processed_images \\; - # Process the CellProfiler output with proc_CP_output.R - Rscript --vanilla ${proc_CP_out_script} ${out_dir} - - """ -} - -/* -~ ~ ~ > * GENERATE REPORT -*/ workflow.onComplete { summary = """ @@ -357,7 +142,8 @@ workflow.onComplete { Git info: $workflow.repository - $workflow.revision [$workflow.commitId] { Parameters } --------------------------- - Project = ${params.project} + Project = ${params.project_name} + Project Dir = ${params.project} Pipeline Used = ${params.pipeline} Result Directory = ${params.out} """ diff --git a/modules/cellprofiler.nf b/modules/cellprofiler.nf new file mode 100644 index 0000000..1f626ac --- /dev/null +++ b/modules/cellprofiler.nf @@ -0,0 +1,28 @@ +#!/usr/bin/env nextflow + +// Using DSL-2 +nextflow.enable.dsl=2 + +process runCP { + container "${params.container__cellprofiler}" + label "cellpro" + + input: + tuple val(group), + val(pipeline), + path(input_data), + path("input_data"), + path("metadata.csv") + + output: + stdout emit: cp_output + path "${group}", emit: output_files + //tuple file("*.csv"), file("*.png"), emit: cp_output + + // publishDir "${params.out}/${params.project}/Analysis-${date}", + // mode: 'copy', + // pattern: "CP_output/${group}" + + script: + template 'runCP.sh' +} \ No newline at end of file diff --git a/modules/process_input.nf b/modules/process_input.nf new file mode 100644 index 0000000..350d57a --- /dev/null +++ b/modules/process_input.nf @@ -0,0 +1,33 @@ +#!/usr/bin/env nextflow + +// Using DSL-2 +nextflow.enable.dsl=2 + +process listFiles { + container "${params.container__general}" + input: + path input_dir + + output: + path "fileList.txt" + + script: + template 'listFiles.sh' +} + +process makeMetadata { + container "${params.container__general}" + + input: + path in_fileList + + 
output: + path "metadata.csv" + + publishDir "${params.out}/metadata", + mode: 'copy', + pattern: "metadata.csv" + + script: + template 'makeMetadata.sh' +} diff --git a/modules/process_output.nf b/modules/process_output.nf new file mode 100644 index 0000000..971e506 --- /dev/null +++ b/modules/process_output.nf @@ -0,0 +1,22 @@ +#!/usr/bin/env nextflow + +// Using DSL-2 +nextflow.enable.dsl=2 + +process proc_CP_output { + container "${params.container__general}" + + publishDir "${params.out}", + mode: 'copy', + pattern: "processed_{data,images}" + + input: + path cp_output + + output: + path "processed_data", emit: proc_dat + path "processed_images", emit: proc_img + + script: + template "proc_output_${params.pipeline}.sh" +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 4f46da7..634a662 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,6 +21,7 @@ timeline { } executor { + name='slurm' queueSize = 5000 submitRateLimit = 10 } @@ -29,22 +30,14 @@ singularity { pullTimeout = '45 min' enabled = true autoMounts = true - cacheDir = "/projects/b1059/singularity" // this is QUEST specific still - } process { executor = 'slurm' - queue = 'genomicsguestA' - clusterOptions = '-A b1042 -t 04:00:00 -e errlog.txt' - container = 'andersenlab/nemascan:20220407173056db3227' // still need to make cp docker img errorStrategy='retry' maxRetries=3 withLabel: cellpro { - container = 'cellprofiler/cellprofiler:4.2.1' - queue = 'genomicsguestA' - clusterOptions = '-A b1042' memory = { 8.GB * task.attempt } time = { 15.min * task.attempt } errorStrategy='retry' diff --git a/templates/listFiles.sh b/templates/listFiles.sh new file mode 100644 index 0000000..b5eb3aa --- /dev/null +++ b/templates/listFiles.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -e +set -o pipefail +set -x + +ls -1 ${input_dir}/raw_images/* > fileList.txt \ No newline at end of file diff --git a/templates/makeMetadata.sh b/templates/makeMetadata.sh new file mode 100644 index 
0000000..7fe6758
--- /dev/null
+++ b/templates/makeMetadata.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+set -x
+
+makeMetadata_${params.pipeline}.R \\
+${in_fileList} \\
+${params.well_mask} \\
+${params.groups}
+echo "done"
\ No newline at end of file
diff --git a/templates/proc_output_dauer.sh b/templates/proc_output_dauer.sh
new file mode 100644
index 0000000..808a383
--- /dev/null
+++ b/templates/proc_output_dauer.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+set -x
+
+# make the directory that receives the concatenated tables
+mkdir processed_data
+
+
+# find .csv files, concatenate them, and write new file: the first file is
+# copied whole (header included), later files are appended minus their header
+for model_name in dauerMod_NonOverlappingWorms nondauerMod_NonOverlappingWorms; do
+  first_line=TRUE
+  for fIter in \$(ls -1 */\${model_name}.csv); do
+    if [ "\${first_line}" = "TRUE" ]; then
+      cat \${fIter} > processed_data/\${model_name}.csv
+      # clear the flag so the remaining files append instead of overwriting
+      first_line=FALSE
+    else
+      tail -n +2 \${fIter} >> processed_data/\${model_name}.csv
+    fi
+  done
+done
+
+mkdir processed_images
+# copy all the output images into the processed_images directory
+cp */*.png processed_images/
+
+# Process the CellProfiler output with proc_CP_output.R
+proc_CP_output.R \\
+${params.project_name} \\
+${params.project_tag}
\ No newline at end of file
diff --git a/templates/proc_output_toxin.sh b/templates/proc_output_toxin.sh
new file mode 100644
index 0000000..b4f7f43
--- /dev/null
+++ b/templates/proc_output_toxin.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+set -x
+
+# make the directory that receives the concatenated tables
+mkdir processed_data
+
+# find .csv files, concatenate them, and write new file: the first file is
+# copied whole (header included), later files are appended minus their header
+for model_name in L4_N2_HB101_100w_NonOverlappingWorms L2L3_N2_HB101_100w_NonOverlappingWorms L1_N2_HB101_100w_NonOverlappingWorms MDHD_NonOverlappingWorms; do
+  first_line=TRUE
+  for fIter in \$(ls -1 */\${model_name}.csv); do
+    if [ "\${first_line}" = "TRUE" ]; then
+      cat \${fIter} > processed_data/\${model_name}.csv
+      # clear the flag so the remaining files append instead of overwriting
+      first_line=FALSE
+    else
+      tail -n +2 \${fIter} >> processed_data/\${model_name}.csv
+    fi
+  done
+done
+
+mkdir processed_images
+
+# copy all the output images into the processed_images directory
+cp */*.png processed_images/
+
+# Process the CellProfiler output with proc_CP_output.R
+proc_CP_output.R \\
+${params.project_name} \\
+${params.project_tag}
\ No newline at end of file
diff --git a/templates/runCP.sh b/templates/runCP.sh
new file mode 100644
index 0000000..bb73e03
--- /dev/null
+++ b/templates/runCP.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+set -x
+
+pwd
+ls -lh
+
+chmod a+x ${pipeline}
+export MPLCONFIGDIR=cellProfiler_tmp
+
+mkdir cellProfiler_tmp
+mkdir -p ${group}
+
+# Run cellprofiler headless
+cellprofiler -c -r \\
+-i \$(pwd) \\
+-p ${pipeline} \\
+-g Metadata_Group=${group} \\
+-o ${group} \\
+-t cellProfiler_tmp
\ No newline at end of file