From f1f04e247c05e8a3e95ccda73ace7be642f9a792 Mon Sep 17 00:00:00 2001
From: Julian Paganini <j.a.paganini@umcutrecht.nl>
Date: Thu, 17 Feb 2022 01:03:22 +0100
Subject: [PATCH 1/6] initial commit of the file

---
 format_results_gplas.sh | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100755 format_results_gplas.sh

diff --git a/format_results_gplas.sh b/format_results_gplas.sh
new file mode 100755
index 0000000..4f20d5f
--- /dev/null
+++ b/format_results_gplas.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+while getopts :i:o: flag; do
+	case $flag in
+		i) input_file=$OPTARG;;
+                o) output_file=$OPTARG
+	esac
+done
+
+
+#1. Create file with proper header to hold the results
+echo -e '"Prob_Chromosome"\t"Prob_Plasmid"\t"Prediction"\t"Contig_name"\t"Contig_length"' > ${output_file}
+#2. Cat the Plascope results and process line by line
+cat ${input_file} | while read line
+do
+classification=$(echo $line | cut -f 2 -d ' ')
+contig=$(echo $line | cut -f 1 -d ' ')
+length=$(echo $contig | cut -f 3 -d : | cut -f 1 -d _ )
+if [ $classification = 'plasmid' ]
+then
+echo -e 0'\t'1'\t''"Plasmid"''\t''"'$contig'"''\t'$length >> ${output_file}
+elif [ $classification = 'chromosome' ]
+then
+echo -e 1'\t'0'\t''"Chromosome"''\t''"'$contig'"''\t'$length >> ${output_file}
+elif [ $classification = 'unclassified' ]
+then
+echo -e 0.5'\t'0.5'\t''"Plasmid"''\t''"'$contig'"''\t'$length >> ${output_file}
+fi
+done
+

From 31f08745d040d303bc1f786032636e720b8c6974 Mon Sep 17 00:00:00 2001
From: Julian Paganini <j.a.paganini@umcutrecht.nl>
Date: Thu, 17 Feb 2022 01:03:52 +0100
Subject: [PATCH 2/6] Added code for optionally format output for gplas

---
 plaScope.sh | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/plaScope.sh b/plaScope.sh
index 264885d..9fe3838 100755
--- a/plaScope.sh
+++ b/plaScope.sh
@@ -63,7 +63,7 @@ Mode 1: SPAdes assembly + contig classification
 Mode 2: contig classification of a fasta file (only if you already have your SPAdes or Unicycler assembly!)
   --fasta		SPAdes or Unicycler assembly fasta file [MANDATORY]
   -a			Specify the assembler used: spades or unicycler. Default=spades.
-
+  -g			gplas format [OPTIONAL]. Provide results in format compatible with gplas. 
 
 Example mode 1:
 plaScope.sh -1 my_reads_1.fastq.gz -2 my_reads_2.fastq.gz -o output_directory  --db_dir path/to/DB --db_name chromosome_plasmid_db --sample name_of_my_sample
@@ -296,7 +296,7 @@ output { print >  output }' $contigsortingfile $contigfile
 #Establish default value for assembler
 assembler='spades'
 
-while getopts ":1:2:o:t:-:h:v:n:a:" optchar; do
+while getopts ":1:2:o:t:-:h:v:n:a:g" optchar; do
 	case "${optchar}" in
 		 -)
 			case "${OPTARG}" in
@@ -349,6 +349,10 @@ while getopts ":1:2:o:t:-:h:v:n:a:" optchar; do
 		a)
 			assembler=${OPTARG}
 			;;
+                g)
+                        gplas_output='true'
+                        ;; 
+                
 		?)
 			usage
 			exit 1
@@ -509,6 +513,11 @@ fi
 
 contig_extraction ${FASTA} ${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_list ${OUTPUT}/${PREFIX}_PlaScope/PlaScope_predictions/${PREFIX}
 
+#Create gplas formatted result file
+if [[ "$gplas_output"==true ]]; then
+   mkdir ${OUTPUT}/gplas_formatted_results
+   ./format_results_gplas.sh -i ${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_list -o ${OUTPUT}/gplas_formatted_results/${PREFIX}_plasmid_prediction.tab
+fi
 echo "If you use PlaScope please cite: ..."
 
 exit 0

From 99c119197830123b0b0fa9dec1c66d9f4ddc3dc7 Mon Sep 17 00:00:00 2001
From: Julian Paganini <j.a.paganini@umcutrecht.nl>
Date: Thu, 17 Feb 2022 01:21:04 +0100
Subject: [PATCH 3/6] Added info of gplas format

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8e02a70..fdc52b5 100644
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ Mode 1: SPAdes assembly + contig classification
 Mode 2: contig classification of a fasta file (only if you already have your SPAdes or Unicycler assembly!)
   --fasta		SPAdes assembly fasta file [MANDATORY]
   -a                    Specify the assembler used: spades or unicycler. Default=spades.
-
+  -g                    Do not produce the gplas formatted output [OPTIONAL].
 
 Example mode 1:
 plaScope.sh -1 my_reads_1.fastq.gz -2 my_reads_2.fastq.gz -o output_directory  --db_dir path/to/DB --db_name chromosome_plasmid_db --sample name_of_my_sample
@@ -179,4 +179,4 @@ centrifuge-build -p 10 --conversion-table seqid_to_taxid.map --taxonomy-tree nod
 
   - Guilhem Royer (CEA-Genoscope, now at Pasteur): design, implementation, evaluation
   - David Valllenet (CEA-Genoscope): design
-  - Julian Paganini (UMC Utrecht): new feature: accept unicycler assemblies
+  - Julian Paganini (UMC Utrecht): new features: (1) accept Unicycler assemblies; (2) format output for use with gplas

From 1c838ad869686330d7ac2ea05cbe035f0850ad9a Mon Sep 17 00:00:00 2001
From: Julian Paganini <j.a.paganini@umcutrecht.nl>
Date: Mon, 21 Feb 2022 17:50:20 +0100
Subject: [PATCH 4/6] removed an extra '

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index fdc52b5..8d64f01 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ usage: plaScope.sh [OPTIONS] [ARGUMENTS]
 General options:
   -h, --help		display this message and exit
   -v, --version		display version number and exit
-  -n, --no-banner	don't print beautiful banners
+  -n, --no-banner	do not print beautiful banners
   -t			number of threads[OPTIONAL] [default : 8]
   -o			output directory [OPTIONAL] [default : current directory]
   --sample		Sample name [MANDATORY]
@@ -104,7 +104,7 @@ plaScope.sh --fasta my_fastafile.fasta -o output_directory --db_dir path/to/DB -
 
 Github:
 https://github.com/GuilhemRoyer/PlaScope
-````
+```
 
 `PlaScope` uses a database (see [this section](#DB)) made of 3 files.
 The argument `--db_dir` is the path to the directory where these 3 files are located.

From 24529de48b5d6bfc6f380e6842151da16e7c3843 Mon Sep 17 00:00:00 2001
From: Julian Paganini <j.a.paganini@umcutrecht.nl>
Date: Wed, 2 Mar 2022 17:52:21 +0100
Subject: [PATCH 5/6] Make the gplas output as default, fix contig_extraction
 function for including unicycler assemblies

---
 plaScope.sh | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/plaScope.sh b/plaScope.sh
index 9fe3838..16322fa 100755
--- a/plaScope.sh
+++ b/plaScope.sh
@@ -63,7 +63,7 @@ Mode 1: SPAdes assembly + contig classification
 Mode 2: contig classification of a fasta file (only if you already have your SPAdes or Unicycler assembly!)
   --fasta		SPAdes or Unicycler assembly fasta file [MANDATORY]
   -a			Specify the assembler used: spades or unicycler. Default=spades.
-  -g			gplas format [OPTIONAL]. Provide results in format compatible with gplas. 
+  -g			Don't produce the gplas formatted output [OPTIONAL]. 
 
 Example mode 1:
 plaScope.sh -1 my_reads_1.fastq.gz -2 my_reads_2.fastq.gz -o output_directory  --db_dir path/to/DB --db_name chromosome_plasmid_db --sample name_of_my_sample
@@ -206,7 +206,7 @@ local contigcov=${CONTIGCOV}
 local contiglength=${CONTIGLENGTH}
 local hitlength=${HITLENGTH}
 
-awk -F'\t' -v contigcov=${contigcov} -v contiglength=${contiglength} -v hitlength=${hitlength} '
+awk -F'\t' -v contiglength=${contiglength} -v hitlength=${hitlength} '
 BEGIN {
 TPLASCOPERES[0]="unclassified"
 TPLASCOPERES[1]="unclassified"
@@ -218,12 +218,8 @@ OFS="\t"
 getline
 }
 
-{clab=$1; split(clab,T,":") ; ccov=T[5];
-
-if ( $7>=contiglength && $6>=hitlength && ccov>contigcov )  print $1,TPLASCOPERES[$3]
-	
+{ if ( $7>=contiglength && $6>=hitlength )  print $1,TPLASCOPERES[$3]
 else print $1,TPLASCOPERES[0]
-
 }' $plascopeextendres
 
 }
@@ -271,7 +267,7 @@ local contigfile="$1"
 local contigsortingfile="$2"
 local contigfileprefix="$3"
 
-awk -F'\t' -v contigfileprefix=${contigfileprefix} '
+awk -F'\t| ' -v contigfileprefix=${contigfileprefix} '
 NR==FNR{Tcontig[">"$1]=$2;next}
 
 /^>/ {
@@ -514,9 +510,9 @@ fi
 contig_extraction ${FASTA} ${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_list ${OUTPUT}/${PREFIX}_PlaScope/PlaScope_predictions/${PREFIX}
 
 #Create gplas formatted result file
-if [[ "$gplas_output"==true ]]; then
+if [ -z "${gplas_output+x}" ]; then  # gplas_output is only set by -g, which switches this step off
    mkdir ${OUTPUT}/gplas_formatted_results
-   ./format_results_gplas.sh -i ${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_list -o ${OUTPUT}/gplas_formatted_results/${PREFIX}_plasmid_prediction.tab
+   "$(dirname "${BASH_SOURCE[0]}")"/format_results_gplas.sh -i "${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_extendedresult" -o "${OUTPUT}/gplas_formatted_results/${PREFIX}_plasmid_prediction.tab" -p "${OUTPUT}/${PREFIX}_PlaScope/PlaScope_predictions/" -a "${assembler}"  # helper is looked up next to this script, not in the caller's cwd
 fi
 echo "If you use PlaScope please cite: ..."
 

From f32511bbbad10eb9829d25e3a5fee995c0ea5d36 Mon Sep 17 00:00:00 2001
From: Julian Paganini <j.a.paganini@umcutrecht.nl>
Date: Wed, 2 Mar 2022 17:53:17 +0100
Subject: [PATCH 6/6] Process files with spades and unicycler headers
 separately

---
 format_results_gplas.sh | 42 +++++++++++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/format_results_gplas.sh b/format_results_gplas.sh
index 4f20d5f..e6682db 100755
--- a/format_results_gplas.sh
+++ b/format_results_gplas.sh
@@ -1,9 +1,11 @@
 #!/bin/bash
 
-while getopts :i:o: flag; do
+while getopts :i:o:a:p: flag; do
 	case $flag in
 		i) input_file=$OPTARG;;
-                o) output_file=$OPTARG
+                o) output_file=$OPTARG;;
+                p) plascope_directory=$OPTARG;;
+                a) assembler=$OPTARG
 	esac
 done
 
@@ -11,20 +13,40 @@ done
 #1. Create file with proper header to hold the results
 echo -e '"Prob_Chromosome"\t"Prob_Plasmid"\t"Prediction"\t"Contig_name"\t"Contig_length"' > ${output_file}
 #2. Cat the Plascope results and process line by line
-cat ${input_file} | while read line
+if [[ $assembler == 'unicycler' ]]
+then
+tail -n +2 "${input_file}" | while read -r contig_number _ classification _ _ _ length _
 do
-classification=$(echo $line | cut -f 2 -d ' ')
-contig=$(echo $line | cut -f 1 -d ' ')
-length=$(echo $contig | cut -f 3 -d : | cut -f 1 -d _ )
-if [ $classification = 'plasmid' ]
+# read splits each row on whitespace: field 1 = contig number, 3 = class code, 7 = length.
+# The per-contig trace goes to stderr so stdout is not polluted.
+echo 'contig_number',"${contig_number}" >&2
+# Look up the full header in the extracted fasta files; -h omits the "file:" prefix.
+contig=$(grep -h -w -- ">${contig_number}" "${plascope_directory}"*fasta | sed 's/>//g')
+if [[ "${classification}" == '3' ]]
 then
 echo -e 0'\t'1'\t''"Plasmid"''\t''"'$contig'"''\t'$length >> ${output_file}
-elif [ $classification = 'chromosome' ]
+elif [[ "${classification}" == '2' ]]
 then
 echo -e 1'\t'0'\t''"Chromosome"''\t''"'$contig'"''\t'$length >> ${output_file}
-elif [ $classification = 'unclassified' ]
-then
+else
 echo -e 0.5'\t'0.5'\t''"Plasmid"''\t''"'$contig'"''\t'$length >> ${output_file}
 fi
 done
 
+else
+# Skip the header row; read splits on whitespace: field 1 = contig, 3 = class code, 7 = length.
+tail -n +2 "${input_file}" | while read -r contig _ classification _ _ _ length _
+do
+  case "${classification}" in
+    3) # written as a certain plasmid (0 / 1)
+      printf '0\t1\t"Plasmid"\t"%s"\t%s\n' "${contig}" "${length}" >> "${output_file}"
+      ;;
+    2) # written as a certain chromosome (1 / 0)
+      printf '1\t0\t"Chromosome"\t"%s"\t%s\n' "${contig}" "${length}" >> "${output_file}"
+      ;;
+    *) # any other code gets an undecided 0.5 / 0.5 score, labelled "Plasmid"
+      printf '0.5\t0.5\t"Plasmid"\t"%s"\t%s\n' "${contig}" "${length}" >> "${output_file}"
+      ;;
+  esac
+done
+fi