From f1f04e247c05e8a3e95ccda73ace7be642f9a792 Mon Sep 17 00:00:00 2001 From: Julian Paganini Date: Thu, 17 Feb 2022 01:03:22 +0100 Subject: [PATCH 1/6] initial commit of the file --- format_results_gplas.sh | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100755 format_results_gplas.sh diff --git a/format_results_gplas.sh b/format_results_gplas.sh new file mode 100755 index 0000000..4f20d5f --- /dev/null +++ b/format_results_gplas.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +while getopts :i:o: flag; do + case $flag in + i) input_file=$OPTARG;; + o) output_file=$OPTARG + esac +done + + +#1. Create file with proper header to hold the results +echo -e '"Prob_Chromosome"\t"Prob_Plasmid"\t"Prediction"\t"Contig_name"\t"Contig_length"' > ${output_file} +#2. Cat the Plascope results and process line by line +cat ${input_file} | while read line +do +classification=$(echo $line | cut -f 2 -d ' ') +contig=$(echo $line | cut -f 1 -d ' ') +length=$(echo $contig | cut -f 3 -d : | cut -f 1 -d _ ) +if [ $classification = 'plasmid' ] +then +echo -e 0'\t'1'\t''"Plasmid"''\t''"'$contig'"''\t'$length >> ${output_file} +elif [ $classification = 'chromosome' ] +then +echo -e 1'\t'0'\t''"Chromosome"''\t''"'$contig'"''\t'$length >> ${output_file} +elif [ $classification = 'unclassified' ] +then +echo -e 0.5'\t'0.5'\t''"Plasmid"''\t''"'$contig'"''\t'$length >> ${output_file} +fi +done + From 31f08745d040d303bc1f786032636e720b8c6974 Mon Sep 17 00:00:00 2001 From: Julian Paganini Date: Thu, 17 Feb 2022 01:03:52 +0100 Subject: [PATCH 2/6] Added code for optionally format output for gplas --- plaScope.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/plaScope.sh b/plaScope.sh index 264885d..9fe3838 100755 --- a/plaScope.sh +++ b/plaScope.sh @@ -63,7 +63,7 @@ Mode 1: SPAdes assembly + contig classification Mode 2: contig classification of a fasta file (only if you already have your SPAdes or Unicycler assembly!) 
--fasta SPAdes or Unicycler assembly fasta file [MANDATORY] -a Specify the assembler used: spades or unicycler. Default=spades. - + -g gplas format [OPTIONAL]. Provide results in format compatible with gplas. Example mode 1: plaScope.sh -1 my_reads_1.fastq.gz -2 my_reads_2.fastq.gz -o output_directory --db_dir path/to/DB --db_name chromosome_plasmid_db --sample name_of_my_sample @@ -296,7 +296,7 @@ output { print > output }' $contigsortingfile $contigfile #Establish default value for assembler assembler='spades' -while getopts ":1:2:o:t:-:h:v:n:a:" optchar; do +while getopts ":1:2:o:t:-:h:v:n:a:g" optchar; do case "${optchar}" in -) case "${OPTARG}" in @@ -349,6 +349,10 @@ while getopts ":1:2:o:t:-:h:v:n:a:" optchar; do a) assembler=${OPTARG} ;; + g) + gplas_output='true' + ;; + ?) usage exit 1 @@ -509,6 +513,11 @@ fi contig_extraction ${FASTA} ${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_list ${OUTPUT}/${PREFIX}_PlaScope/PlaScope_predictions/${PREFIX} +#Create gplas formatted result file +if [[ "$gplas_output"==true ]]; then + mkdir ${OUTPUT}/gplas_formatted_results + ./format_results_gplas.sh -i ${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_list -o ${OUTPUT}/gplas_formatted_results/${PREFIX}_plasmid_prediction.tab +fi echo "If you use PlaScope please cite: ..." exit 0 From 99c119197830123b0b0fa9dec1c66d9f4ddc3dc7 Mon Sep 17 00:00:00 2001 From: Julian Paganini Date: Thu, 17 Feb 2022 01:21:04 +0100 Subject: [PATCH 3/6] Added info of gplas format --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8e02a70..fdc52b5 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ Mode 1: SPAdes assembly + contig classification Mode 2: contig classification of a fasta file (only if you already have your SPAdes or Unicycler assembly!) --fasta SPAdes assembly fasta file [MANDATORY] -a Specify the assembler used: spades or unicycler. Default=spades. - + -g gplas format [OPTIONAL]. 
Provide results in format compatible with gplas. Example mode 1: plaScope.sh -1 my_reads_1.fastq.gz -2 my_reads_2.fastq.gz -o output_directory --db_dir path/to/DB --db_name chromosome_plasmid_db --sample name_of_my_sample @@ -179,4 +179,4 @@ centrifuge-build -p 10 --conversion-table seqid_to_taxid.map --taxonomy-tree nod - Guilhem Royer (CEA-Genoscope, now at Pasteur): design, implementation, evaluation - David Valllenet (CEA-Genoscope): design - - Julian Paganini (UMC Utrecht): new feature: accept unicycler assemblies + - Julian Paganini (UMC Utrecht): new features. 1-Accept unicycler assemblies. 2- Format output for use with gplas From 1c838ad869686330d7ac2ea05cbe035f0850ad9a Mon Sep 17 00:00:00 2001 From: Julian Paganini Date: Mon, 21 Feb 2022 17:50:20 +0100 Subject: [PATCH 4/6] removed an extra ' --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fdc52b5..8d64f01 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ usage: plaScope.sh [OPTIONS] [ARGUMENTS] General options: -h, --help display this message and exit -v, --version display version number and exit - -n, --no-banner don't print beautiful banners + -n, --no-banner do not print beautiful banners -t number of threads[OPTIONAL] [default : 8] -o output directory [OPTIONAL] [default : current directory] --sample Sample name [MANDATORY] @@ -104,7 +104,7 @@ plaScope.sh --fasta my_fastafile.fasta -o output_directory --db_dir path/to/DB - Github: https://github.com/GuilhemRoyer/PlaScope -```` +``` `PlaScope` uses a database (see [this section](#DB)) made of 3 files. The argument `--db_dir` is the path to the directory where these 3 files are located. 
From 24529de48b5d6bfc6f380e6842151da16e7c3843 Mon Sep 17 00:00:00 2001 From: Julian Paganini Date: Wed, 2 Mar 2022 17:52:21 +0100 Subject: [PATCH 5/6] Make the gplas output as default, fix contig_extraction function for including unicycler assemblies --- plaScope.sh | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/plaScope.sh b/plaScope.sh index 9fe3838..16322fa 100755 --- a/plaScope.sh +++ b/plaScope.sh @@ -63,7 +63,7 @@ Mode 1: SPAdes assembly + contig classification Mode 2: contig classification of a fasta file (only if you already have your SPAdes or Unicycler assembly!) --fasta SPAdes or Unicycler assembly fasta file [MANDATORY] -a Specify the assembler used: spades or unicycler. Default=spades. - -g gplas format [OPTIONAL]. Provide results in format compatible with gplas. + -g Don't produce the gplas formatted output [OPTIONAL]. Example mode 1: plaScope.sh -1 my_reads_1.fastq.gz -2 my_reads_2.fastq.gz -o output_directory --db_dir path/to/DB --db_name chromosome_plasmid_db --sample name_of_my_sample @@ -206,7 +206,7 @@ local contigcov=${CONTIGCOV} local contiglength=${CONTIGLENGTH} local hitlength=${HITLENGTH} -awk -F'\t' -v contigcov=${contigcov} -v contiglength=${contiglength} -v hitlength=${hitlength} ' +awk -F'\t' -v contiglength=${contiglength} -v hitlength=${hitlength} ' BEGIN { TPLASCOPERES[0]="unclassified" TPLASCOPERES[1]="unclassified" @@ -218,12 +218,8 @@ OFS="\t" getline } -{clab=$1; split(clab,T,":") ; ccov=T[5]; - -if ( $7>=contiglength && $6>=hitlength && ccov>contigcov ) print $1,TPLASCOPERES[$3] - +{ if ( $7>=contiglength && $6>=hitlength ) print $1,TPLASCOPERES[$3] else print $1,TPLASCOPERES[0] - }' $plascopeextendres } @@ -271,7 +267,7 @@ local contigfile="$1" local contigsortingfile="$2" local contigfileprefix="$3" -awk -F'\t' -v contigfileprefix=${contigfileprefix} ' +awk -F'\t| ' -v contigfileprefix=${contigfileprefix} ' NR==FNR{Tcontig[">"$1]=$2;next} /^>/ { @@ -514,9 +510,9 @@ fi contig_extraction 
${FASTA} ${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_list ${OUTPUT}/${PREFIX}_PlaScope/PlaScope_predictions/${PREFIX} #Create gplas formatted result file -if [[ "$gplas_output"==true ]]; then +if [ -z "${gplas_output+x}" ]; then mkdir ${OUTPUT}/gplas_formatted_results - ./format_results_gplas.sh -i ${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_list -o ${OUTPUT}/gplas_formatted_results/${PREFIX}_plasmid_prediction.tab + ./format_results_gplas.sh -i ${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_extendedresult -o ${OUTPUT}/gplas_formatted_results/${PREFIX}_plasmid_prediction.tab -p ${OUTPUT}/${PREFIX}_PlaScope/PlaScope_predictions/ -a ${assembler} fi echo "If you use PlaScope please cite: ..." From f32511bbbad10eb9829d25e3a5fee995c0ea5d36 Mon Sep 17 00:00:00 2001 From: Julian Paganini Date: Wed, 2 Mar 2022 17:53:17 +0100 Subject: [PATCH 6/6] Considered separately processing of files with spades and unicycler headers --- format_results_gplas.sh | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/format_results_gplas.sh b/format_results_gplas.sh index 4f20d5f..e6682db 100755 --- a/format_results_gplas.sh +++ b/format_results_gplas.sh @@ -1,9 +1,11 @@ #!/bin/bash -while getopts :i:o: flag; do +while getopts :i:o:a:p: flag; do case $flag in i) input_file=$OPTARG;; - o) output_file=$OPTARG + o) output_file=$OPTARG;; + p) plascope_directory=$OPTARG;; + a) assembler=$OPTARG esac done @@ -11,20 +13,40 @@ done #1. Create file with proper header to hold the results echo -e '"Prob_Chromosome"\t"Prob_Plasmid"\t"Prediction"\t"Contig_name"\t"Contig_length"' > ${output_file} #2. 
Cat the Plascope results and process line by line -cat ${input_file} | while read line +if [[ $assembler == 'unicycler' ]] +then +tail -n+2 ${input_file} | while read line do -classification=$(echo $line | cut -f 2 -d ' ') -contig=$(echo $line | cut -f 1 -d ' ') -length=$(echo $contig | cut -f 3 -d : | cut -f 1 -d _ ) -if [ $classification = 'plasmid' ] +classification=$(echo $line | cut -f 3 -d ' ') +contig_number=$(echo $line | cut -f 1 -d ' ') +echo 'contig_number',${contig_number} +contig=$(grep -w '>'${contig_number} ${plascope_directory}*fasta | cut -f 2 -d : | sed 's/>//g' ) +length=$(echo $line | cut -f 7 -d ' ') +if [ $classification == '3' ] then echo -e 0'\t'1'\t''"Plasmid"''\t''"'$contig'"''\t'$length >> ${output_file} -elif [ $classification = 'chromosome' ] +elif [ $classification == '2' ] then echo -e 1'\t'0'\t''"Chromosome"''\t''"'$contig'"''\t'$length >> ${output_file} -elif [ $classification = 'unclassified' ] -then +else echo -e 0.5'\t'0.5'\t''"Plasmid"''\t''"'$contig'"''\t'$length >> ${output_file} fi done +else +tail -n+2 ${input_file} | while read line +do +classification=$(echo $line | cut -f 3 -d ' ') +contig=$(echo $line | cut -f 1 -d ' ') +length=$(echo $line | cut -f 7 -d ' ') +if [ $classification == '3' ] +then +echo -e 0'\t'1'\t''"Plasmid"''\t''"'$contig'"''\t'$length >> ${output_file} +elif [ $classification == '2' ] +then +echo -e 1'\t'0'\t''"Chromosome"''\t''"'$contig'"''\t'$length >> ${output_file} +else +echo -e 0.5'\t'0.5'\t''"Plasmid"''\t''"'$contig'"''\t'$length >> ${output_file} +fi +done +fi