diff --git a/README.md b/README.md index 8e02a70..8d64f01 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ usage: plaScope.sh [OPTIONS] [ARGUMENTS] General options: -h, --help display this message and exit -v, --version display version number and exit - -n, --no-banner don't print beautiful banners + -n, --no-banner do not print beautiful banners -t number of threads[OPTIONAL] [default : 8] -o output directory [OPTIONAL] [default : current directory] --sample Sample name [MANDATORY] @@ -92,7 +92,7 @@ Mode 1: SPAdes assembly + contig classification Mode 2: contig classification of a fasta file (only if you already have your SPAdes or Unicycler assembly!) --fasta SPAdes assembly fasta file [MANDATORY] -a Specify the assembler used: spades or unicycler. Default=spades. - + -g Do not produce the gplas formatted output [OPTIONAL]. By default the results are also written in a format compatible with gplas. Example mode 1: plaScope.sh -1 my_reads_1.fastq.gz -2 my_reads_2.fastq.gz -o output_directory --db_dir path/to/DB --db_name chromosome_plasmid_db --sample name_of_my_sample @@ -104,7 +104,7 @@ plaScope.sh --fasta my_fastafile.fasta -o output_directory --db_dir path/to/DB - Github: https://github.com/GuilhemRoyer/PlaScope -```` +``` `PlaScope` uses a database (see [this section](#DB)) made of 3 files. The argument `--db_dir` is the path to the directory where these 3 files are located. @@ -179,4 +179,4 @@ centrifuge-build -p 10 --conversion-table seqid_to_taxid.map --taxonomy-tree nod - Guilhem Royer (CEA-Genoscope, now at Pasteur): design, implementation, evaluation - David Valllenet (CEA-Genoscope): design - - Julian Paganini (UMC Utrecht): new feature: accept unicycler assemblies + - Julian Paganini (UMC Utrecht): new features. 1-Accept unicycler assemblies. 
2- Format output for use with gplas
diff --git a/format_results_gplas.sh b/format_results_gplas.sh
new file mode 100755
index 0000000..e6682db
--- /dev/null
+++ b/format_results_gplas.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+#
+# format_results_gplas.sh - convert a PlaScope extended result table into the
+# tab-separated prediction table read by gplas.
+#
+# Usage:
+#   format_results_gplas.sh -i RESULTS -o OUTPUT [-a ASSEMBLER] [-p PREFIX]
+#
+#   -i  extended result table; its first line is skipped as a header
+#   -o  file that is overwritten with the gplas-formatted predictions
+#   -a  assembler that produced the contigs: "unicycler", or anything else
+#       for SPAdes-style contig names (default: spades)
+#   -p  path prefix of the prediction FASTA files, normally a directory ending
+#       in "/"; every file matching "<PREFIX>*fasta" is read (unicycler only)
+#
+# Columns 1, 3 and 7 of the table supply the contig id, the class and the
+# contig length. Class 3 becomes a plasmid row, class 2 a chromosome row.
+# NOTE(review): any other class is written as 0.5/0.5 but still labelled
+# "Plasmid", exactly as the first version did - confirm gplas expects this.
+
+die() {
+  printf 'format_results_gplas.sh: %s\n' "$*" >&2
+  exit 1
+}
+
+input_file=''
+output_file=''
+plascope_directory=''
+assembler='spades'
+
+while getopts ':i:o:a:p:' flag; do
+  case "${flag}" in
+    i) input_file=${OPTARG} ;;
+    o) output_file=${OPTARG} ;;
+    p) plascope_directory=${OPTARG} ;;
+    a) assembler=${OPTARG} ;;
+    :) die "option -${OPTARG} requires an argument" ;;
+    *) die "unknown option -${OPTARG}" ;;
+  esac
+done
+
+[[ -n "${input_file}" ]] || die 'missing -i <extended result table>'
+[[ -n "${output_file}" ]] || die 'missing -o <output file>'
+[[ -r "${input_file}" ]] || die "cannot read ${input_file}"
+
+# Unicycler contigs appear in the table as a bare number, while gplas needs
+# the full FASTA header, so the prediction FASTA files are read as well.
+fasta_files=()
+if [[ "${assembler}" == 'unicycler' ]]; then
+  shopt -s nullglob
+  fasta_files=("${plascope_directory}"*fasta)
+  shopt -u nullglob
+  if (( ${#fasta_files[@]} == 0 )); then
+    printf 'format_results_gplas.sh: no FASTA file matches %s*fasta; keeping contig numbers\n' \
+      "${plascope_directory}" >&2
+  fi
+fi
+
+# A single awk pass replaces the per-line cut/grep/sed pipelines. The "mode"
+# assignments between the operands tell awk which kind of file it is reading;
+# the table arrives on stdin ("-").
+{
+  printf '"Prob_Chromosome"\t"Prob_Plasmid"\t"Prediction"\t"Contig_name"\t"Contig_length"\n'
+  awk -v assembler="${assembler}" '
+    mode == "fasta" {
+      # Remember each header (without ">") under its first word.
+      if (substr($0, 1, 1) == ">") {
+        id = $1
+        sub(/^>/, "", id)
+        if (!(id in header_of)) header_of[id] = substr($0, 2)
+      }
+      next
+    }
+    FNR == 1 || NF == 0 { next }   # table header and blank lines
+    {
+      contig = $1
+      if (assembler == "unicycler" && ($1 in header_of)) contig = header_of[$1]
+      if ($3 == "3") {
+        prob_chr = "0"; prob_pla = "1"; label = "Plasmid"
+      } else if ($3 == "2") {
+        prob_chr = "1"; prob_pla = "0"; label = "Chromosome"
+      } else {
+        prob_chr = "0.5"; prob_pla = "0.5"; label = "Plasmid"
+      }
+      printf "%s\t%s\t\"%s\"\t\"%s\"\t%s\n", prob_chr, prob_pla, label, contig, $7
+    }
+  ' mode=fasta "${fasta_files[@]}" mode=table - < "${input_file}"
+} > "${output_file}"
diff --git a/plaScope.sh b/plaScope.sh
index 264885d..16322fa 100755
--- a/plaScope.sh
+++ b/plaScope.sh
@@ -63,7 +63,7 @@ Mode 1: SPAdes assembly + contig classification
 Mode
2: contig classification of a fasta file (only if you already have your SPAdes or Unicycler assembly!) --fasta SPAdes or Unicycler assembly fasta file [MANDATORY] -a Specify the assembler used: spades or unicycler. Default=spades. - + -g Don't produce the gplas formatted output [OPTIONAL]. Example mode 1: plaScope.sh -1 my_reads_1.fastq.gz -2 my_reads_2.fastq.gz -o output_directory --db_dir path/to/DB --db_name chromosome_plasmid_db --sample name_of_my_sample @@ -206,7 +206,7 @@ local contigcov=${CONTIGCOV} local contiglength=${CONTIGLENGTH} local hitlength=${HITLENGTH} -awk -F'\t' -v contigcov=${contigcov} -v contiglength=${contiglength} -v hitlength=${hitlength} ' +awk -F'\t' -v contiglength=${contiglength} -v hitlength=${hitlength} ' BEGIN { TPLASCOPERES[0]="unclassified" TPLASCOPERES[1]="unclassified" @@ -218,12 +218,8 @@ OFS="\t" getline } -{clab=$1; split(clab,T,":") ; ccov=T[5]; - -if ( $7>=contiglength && $6>=hitlength && ccov>contigcov ) print $1,TPLASCOPERES[$3] - +{ if ( $7>=contiglength && $6>=hitlength ) print $1,TPLASCOPERES[$3] else print $1,TPLASCOPERES[0] - }' $plascopeextendres } @@ -271,7 +267,7 @@ local contigfile="$1" local contigsortingfile="$2" local contigfileprefix="$3" -awk -F'\t' -v contigfileprefix=${contigfileprefix} ' +awk -F'\t| ' -v contigfileprefix=${contigfileprefix} ' NR==FNR{Tcontig[">"$1]=$2;next} /^>/ { @@ -296,7 +292,7 @@ output { print > output }' $contigsortingfile $contigfile #Establish default value for assembler assembler='spades' -while getopts ":1:2:o:t:-:h:v:n:a:" optchar; do +while getopts ":1:2:o:t:-:h:v:n:a:g" optchar; do case "${optchar}" in -) case "${OPTARG}" in @@ -349,6 +345,10 @@ while getopts ":1:2:o:t:-:h:v:n:a:" optchar; do a) assembler=${OPTARG} ;; + g) + gplas_output='true' + ;; + ?) 
usage exit 1 @@ -509,6 +509,11 @@ fi contig_extraction ${FASTA} ${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_list ${OUTPUT}/${PREFIX}_PlaScope/PlaScope_predictions/${PREFIX} +#Create gplas formatted result file unless -g was given; the helper is looked up next to plaScope.sh so the current working directory does not matter +if [ -z "${gplas_output+x}" ]; then + mkdir -p "${OUTPUT}/gplas_formatted_results" + "$(dirname "$0")/format_results_gplas.sh" -i "${OUTPUT}/${PREFIX}_PlaScope/Centrifuge_results/${PREFIX}_extendedresult" -o "${OUTPUT}/gplas_formatted_results/${PREFIX}_plasmid_prediction.tab" -p "${OUTPUT}/${PREFIX}_PlaScope/PlaScope_predictions/" -a "${assembler}" +fi echo "If you use PlaScope please cite: ..." exit 0