From 32fc20b9c85ec321ba0318d218f92fc5733b6d7f Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Tue, 10 Dec 2019 13:03:21 -0500 Subject: [PATCH 1/8] changed analysis errors print statement --- pima.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pima.py b/pima.py index 9c0f995..6270876 100755 --- a/pima.py +++ b/pima.py @@ -1839,7 +1839,7 @@ def main(opts) : parser.print_help() print(Colors.ENDC) print(Colors.FAIL + '\n\nErrors:') - [print(i) for i in analysis.errors] + print([(i) for i in analysis.errors]) print(Colors.ENDC) sys.exit() From eade59ce63204d178821e0cbfae930e3c5b26ad2 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Tue, 10 Dec 2019 13:06:43 -0500 Subject: [PATCH 2/8] changed shebang to handle virt envs; rearranged modules to differentiate std lib from externals; updated version --- pima.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pima.py b/pima.py index 6270876..3144dc4 100755 --- a/pima.py +++ b/pima.py @@ -1,27 +1,26 @@ -#!/bin/env python +#!/usr/bin/env python -import os +from argparse import ArgumentParser, HelpFormatter import copy +import csv +import datetime import glob -import sys -import time -import shutil -import subprocess import logging +import os import re -import datetime +import shutil +import subprocess +import sys +import time + import numpy -import pandas -import csv import joblib +import pandas import pathos.multiprocessing as mp -import Bio.SeqIO - -from argparse import ArgumentParser, HelpFormatter pandas.set_option('display.max_colwidth', 200) -VERSION=0.1 +VERSION='0.1.1' amr_database_default = re.sub('pima.py', '', os.path.abspath(__file__)) + 'amr.fasta' inc_database_default = re.sub('pima.py', '', os.path.abspath(__file__)) + 'inc.fasta' From d29268eb71d351db5341dae9541fad5f92294843 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Tue, 10 Dec 2019 13:09:40 -0500 Subject: [PATCH 3/8] add example install; list dependencies; list citations --- README.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d7b734b..d54be48 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,70 @@ -# pima +# **pima** Plasmid and antimicrobial resistance pipeline + +### Example Installation +Pima is not yet packaged, but its primary dependencies are _mostly_ available. `guppy` must be manually installed with info [here](https://nanoporetech.com/community). If you don't yet have `conda` installed you'll need to do that [here](https://docs.anaconda.com/anaconda/install/). Once installed you can create an environment where `pima.py` will run. +```sh +conda create -n pima_env2 python=2.7.16 +source activate pima_env2 +# install python modules +conda install -c conda-forge joblib numpy pandas pathos -y +# install bioinfx packages +conda install -c bioconda bcftools bedtools blast flye mummer medaka miniasm minimap2 nanopolish parallel porechop qcat racon samtools spades wtdbg -y +git clone https://github.com/abconley/pima.git $HOME +python ~/pima/pima.py --help +``` + +##### Compatibility +Currently several unix utilities are used, so this is not compatible with Windows OS. CentOS and Ubuntu are both known working OSes. + + + +##### Dependencies +| System Command | Package | +| -------------- | ------- | +| `bcftools` | BCFtools | +| `bedtools` | BEDTools | +| `blastn` | BLAST+ | +| `dnadiff` | MUMmer | +| `faidx` | SAMtools | +| `flye` | Flye | +| `parallel` | GNU parallel | +| `guppy_basecaller` | Guppy | +| `makeblastdb` | BLAST+ | +| `medaka_consensus` | Medaka | +| `miniasm` | miniasm | +| `minimap2` | minimap2 | +| `nanopolish` | Nanopolish | +| `nanopolish_makerange.py` | Nanopolish | +| `pblat` | pblat | +| `pChunks.R` | local R script | +| `porechop` | Porechop | +| `qcat` | Qcat | +| `racon` | racon | +| `read_fast5_basecaller.py` | Guppy | +| `samtools` | SAMtools | +| `spades.py` | SPAdes | +| `wtdbg2` | Wtdbg2 | +| `wtpoa-cns` | Wtdbg2 | +| | | +| `awk` | Unix | +| `cat` | Unix | +| `grep` | Unix | +| `head` | Unix | +| `ls` | Unix | +| `sed` | Unix | +| `sort` | Unix | + +#### Literature Citations +- McLaughlin HP, Bugrysheva JV, Conley AB, Gulvik CA, Kolton CB, Marston C, Swaney E, Lonsway DR, Cherney B, Gargis AS, Kongphet-Tran T, Lascols C, Michel P, Villanueva J, Hoffmaster ER, Gee JE, Sue D. 2020. When minutes matter: rapid nanopore whole genome sequencing for anthrax emergency preparedness. Emerging Infectious Diseases +- [BCFtools and SAMtools](https://www.ncbi.nlm.nih.gov/pubmed/19505943) +- [BEDTools](https://www.ncbi.nlm.nih.gov/pubmed/25199790) +- [BLAST+](https://www.ncbi.nlm.nih.gov/pubmed/20003500) +- [GNU parallel](https://www.usenix.org/publications/login/february-2011-volume-36-number-1/gnu-parallel-command-line-power-tool) +- [Flye](https://www.ncbi.nlm.nih.gov/pubmed/30936562) +- [miniasm and minimap2](https://www.ncbi.nlm.nih.gov/pubmed/27153593) +- [MUMmer](https://www.ncbi.nlm.nih.gov/pubmed/14759262) +- [pblat](https://www.ncbi.nlm.nih.gov/pubmed/30646844) +- [racon](https://www.ncbi.nlm.nih.gov/pubmed/28100585) +- [SPAdes](https://www.ncbi.nlm.nih.gov/pubmed/22506599) +- [Wtdbg2](https://www.nature.com/articles/s41592-019-0669-3) From 84a6de0ac802172957dfc3331f6dd1a4494d629a Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Thu, 12 Dec 2019 23:42:53 -0500 Subject: [PATCH 4/8] py2.7 compatible with recursive Fast5 searches, use realpath instead of abspath to handle symlinks, allow ~ input with expanduser, use path.join for nonsystem call var assignments, cleaned extra spaces on empty lines --- pima.py | 454 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 227 insertions(+), 227 deletions(-) diff --git a/pima.py b/pima.py index 3144dc4..ed25e86 100755 --- a/pima.py +++ b/pima.py @@ -20,12 +20,12 @@ pandas.set_option('display.max_colwidth', 200) -VERSION='0.1.1' +VERSION='0.1.2' -amr_database_default = re.sub('pima.py', '', os.path.abspath(__file__)) + 'amr.fasta' -inc_database_default = re.sub('pima.py', '', os.path.abspath(__file__)) + 'inc.fasta' -plasmid_database_default = re.sub('pima.py', '', os.path.abspath(__file__)) + 'plasmids_and_vectors.fasta' -reference_dir_default = re.sub('pima.py', '', os.path.abspath(__file__)) + 'reference_sequences' +amr_database_default = re.sub('pima.py', '', os.path.realpath(__file__)) + 'amr.fasta' +inc_database_default = re.sub('pima.py', '', os.path.realpath(__file__)) + 'inc.fasta' +plasmid_database_default = re.sub('pima.py', '', os.path.realpath(__file__)) + 'plasmids_and_vectors.fasta' +reference_dir_default = re.sub('pima.py', '', os.path.realpath(__file__)) + 'reference_sequences' class Colors: HEADER = '\033[95m' @@ -36,7 +36,7 @@ class Colors: ENDC = '\033[0m' BOLD = '\033[1m' UNDERLINE = '\033[4m' - + class Analysis : def __init__(self, opts, unknown_args) : @@ -44,23 +44,25 @@ def __init__(self, opts, unknown_args) : self.input_given = [] # Input options - self.ont_fast5 = opts.ont_fast5 + self.ont_fast5 = os.path.realpath(os.path.expanduser(opts.ont_fast5)) self.ont_fast5_limit = opts.ont_fast5_limit self.basecaller = opts.basecaller self.no_trim = opts.no_trim - self.albacore_seq_files_file = opts.albacore_seq_file + self.albacore_seq_files_file = os.path.realpath(os.path.expanduser( + opts.albacore_seq_file)) self.barcode_min_fraction = 2.5 - self.ont_fastq = opts.ont_fastq + self.ont_fastq = os.path.realpath(os.path.expanduser(opts.ont_fastq)) self.ont_fastq_dir = None self.only_basecall = opts.only_basecall self.multiplexed = opts.multiplexed self.demux = opts.demux self.error_correct = opts.error_correct - self.illumina_fastq = opts.illumina_fastq - self.genome_fasta = opts.genome + self.illumina_fastq = os.path.realpath(os.path.expanduser( + opts.illumina_fastq)) + self.genome_fasta = os.path.realpath(os.path.expanduser(opts.genome)) self.will_have_genome_fasta = False - - self.output_dir = opts.output + + self.output_dir = os.path.realpath(os.path.expanduser(opts.output)) self.overwrite = opts.overwrite # Assembly options @@ -74,7 +76,7 @@ def __init__(self, opts, unknown_args) : self.pilon = opts.pilon self.no_assembly = opts.no_assembly self.will_have_ont_assembly = False - + # Plasmid and feature options self.plasmid_database = opts.plasmid_database self.plasmids = opts.plasmids @@ -88,23 +90,23 @@ def __init__(self, opts, unknown_args) : self.feature_dirs = [] self.feature_names = [] self.feature_colors = [] - + # Reference options self.reference_dir = opts.reference_dir self.organism = opts.organism self.no_reference = opts.no_reference self.no_mutations = opts.no_mutations - + self.threads = opts.threads self.errors = [] - + # How much stuff to print self.verbosity = opts.verbosity # Dont' actully run any commands self.fake_run = opts.fake_run - + # Verbosity levels and colors self.error_color = Colors.FAIL self.main_process_verbosity = 1 @@ -114,42 +116,42 @@ def __init__(self, opts, unknown_args) : self.sub_process_verbosity = 2 self.sub_process_color = Colors.OKBLUE self.command_verbosity = 3 - + # The actual steps to carry out in the analysis held as a list self.analysis = [] - + # See if we got any unknown args. Not allowed. if len(unknown_args) != 0 : self.errors = self.errors + ['Unknown argument: ' + unknown for unknown in unknown_args] - + def print_and_log(self, text, verbosity, color = Colors.ENDC) : if verbosity <= self.verbosity : time_string = '[' + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + ']' print(time_string + ' ' + color + text + Colors.ENDC) - + def run_command(self, command) : if not self.fake_run : subprocess.call(command, shell = True) - + def print_and_run(self, command) : self.print_and_log(command, self.command_verbosity) self.run_command(command) - + def validate_file(self, the_file) : return os.path.isfile(the_file) - + def validate_file_size(self, the_file) : if not self.fake_run : return os.stat(the_file).st_size > 0 else : return True - + def validate_file_and_size_or_error(self, the_file, error_prefix = 'The file', presence_suffix = 'doesn\'t exist', size_suffix = 'is size 0') : @@ -161,7 +163,7 @@ def validate_file_and_size_or_error(self, the_file, error_prefix = 'The file', self.print_and_log(' '.join([error_prefix, the_file, size_suffix]), 0, Colors.FAIL) self.error_out() - + def minimap_ont_fastq(self, genome, fastq, bam) : command = ' '.join(['minimap2 -a', '-t', str(self.threads), @@ -176,7 +178,7 @@ def minimap_ont_fastq(self, genome, fastq, bam) : self.validate_file_and_size_or_error(bam) self.index_bam(bam) - + def minimap_illumina_fastq(self, genome, fastq, bam) : command = ' '.join(['minimap2 -a', '-t', str(self.threads), @@ -191,8 +193,8 @@ def minimap_illumina_fastq(self, genome, fastq, bam) : self.print_and_run(command) self.validate_file_and_size_or_error(bam) self.index_bam(bam) - - + + def index_bam(self, bam) : command = ' '.join(['samtools index', bam, @@ -201,7 +203,7 @@ def index_bam(self, bam) : index_bai = bam + '.bai' self.validate_file_and_size_or_error(index_bai) - + def bwa_index(self, target, std_dir) : bwa_stdout, bwa_stderr = [os.path.join(std_dir, 'bwa_index.' + i) for i in ['stdout', 'stderr']] @@ -209,16 +211,16 @@ def bwa_index(self, target, std_dir) : '1>' + bwa_stdout, '2>' + bwa_stderr]) self.print_and_run(command) - - + + def validate_utility(self, utility, error) : if not shutil.which(utility) : self.errors += [error] return False - + def error_out(self) : - exit(1) + sys.exit(1) def unless_only_basecall(function) : @@ -236,46 +238,44 @@ def wrapper(self) : function(self) return wrapper - + def validate_ont_fast5(self) : if self.only_basecall and (not self.ont_fast5 and not self.ont_fastq) : self.errors += ['--only-basecall requires --ont-fast5 or --ont-fastq'] return - + if not self.ont_fast5 : return - + self.print_and_log('Validating ONT fast5 files and utilities', self.main_process_verbosity, self.main_process_color) - + if not os.path.isdir(self.ont_fast5) : self.errors += ['Input FAST5 directory ' + self.ont_fast5 + ' cannot be found'] return - - #1 - Look for FAST5 files in the directory - #Start with the usual format for FAST5 data, one directory with a bunch of 0..n subdirectories - in_subdirs = False - search_string = self.ont_fast5 + "/**/*fast5" - fast5_files = glob.glob(search_string, recursive = True) - if not len(fast5_files) == 0: - in_subdirs = True - - #If we can't find things in the subdirs, look in the main fast5 dir - in_maindir = False - search_string = self.ont_fast5 + "/*fast5" - fast5_files = glob.glob(search_string, recursive = True) - if not len(fast5_files) == 0: - in_maindir = True - - if not in_subdirs and not in_maindir : - self.errors += ['Could not find FAST5 files in ' + self.ont_fast5 + ' or subdirectories'] + + # Look for FAST5 files in the directory and subdirectories + if sys.version_info < (3, 5): + fast5_files = [] + import fnmatch + for base, dirs, files in os.walk(self.ont_fast5): + found_files = fnmatch.filter(files, '*.fast5') + fast5_files.extend(os.path.join(base, f) for f in found_files) + else + fast5_files = glob.glob(os.path.join(self.ont_fast5, + '**', '*.fast5'), recursive=True) + + if len(fast5_files) == 0 : + self.errors += ['Could not find FAST5 files in ' + self.ont_fast5 + ' or its subdirectories'] + else: + self.print_and_log('Found {} FAST5 files'.format(len(fast5_files)), self.sub_process_verbosity, self.sub_process_color) if not self.ont_fast5_limit is None: if self.ont_fast5_limit <= 0 : - self.errors += ['FAST5 limit must be greater than 0. Got ' + str(self.ont_fast5_limie)] + self.errors += ['FAST5 limit must be greater than 0. Specified input was ' + str(self.ont_fast5_limit)] if self.ont_fast5 is None : self.errors += ['FAST5 limit must be accompanied by FAST5 data.'] - + def validate_ont_fastq(self): if not self.ont_fastq : @@ -283,7 +283,7 @@ def validate_ont_fastq(self): if not os.path.isfile(self.ont_fastq) : self.errors += ['Input FASTQ file ' + self.ont_fastq + ' cannot be found'] - + def validate_genome_fasta(self) : if not self.genome_fasta : return @@ -291,7 +291,7 @@ def validate_genome_fasta(self) : self.errors += ['Input genome FASTA ' + self.genome_fasta + ' cannot be found'] self.will_have_genome_fasta = True - + def validate_output_dir(self) : if not opts.output : self.errors += ['No output directory given (--output)'] @@ -301,13 +301,13 @@ def validate_output_dir(self) : def validate_guppy(self) : - + if self.basecaller != 'guppy' : return if not self.ont_fast5 : return - + if self.ont_fastq : return @@ -315,17 +315,17 @@ def validate_guppy(self) : for utility in ['guppy_basecaller', 'guppy_aligner', 'guppy_barcoder'] : self.validate_utility(utility, utility + ' is not on the PATH (required by --basecaller albacore)') - + self.analysis += ['guppy_ont_fast5'] - + def validate_albacore(self) : if self.basecaller != 'albacore' : return if not self.ont_fast5 : return - + if self.ont_fastq : return @@ -333,10 +333,10 @@ def validate_albacore(self) : for utility in ['read_fast5_basecaller.py'] : self.validate_utility(utility, utility + ' is not on the PATH (required by --basecaller albacore)') - + self.analysis += ['albacore_ont_fast5'] - + def validate_qcat(self) : if self.no_trim : @@ -344,7 +344,7 @@ def validate_qcat(self) : if self.demux != 'qcat' : return - + if not (self.ont_fast5 or self.ont_fastq) : return @@ -354,9 +354,9 @@ def validate_qcat(self) : self.analysis += ['qcat_ont_fastq'] - + def validate_porechop(self) : - + if self.no_trim : return @@ -374,9 +374,9 @@ def validate_porechop(self) : self.analysis += ['porechop_ont_fastq'] - + def validate_lorma(self) : - + if not self.error_correct : return @@ -390,9 +390,9 @@ def validate_lorma(self) : self.analysis += ['lorma_ont_fastq'] - + def validate_racon_correction(self) : - + if self.no_error_correct : return @@ -405,31 +405,31 @@ def validate_racon_correction(self) : self.analysis += ['racon_ont_fastq'] - + @unless_only_basecall @unless_given_genome def validate_miniasm(self) : if self.assembler != 'miniasm' : return - + if self.no_assembly : return - + if not self.ont_fast5 and not self.ont_fastq : self.errors += ['Miniasm assembly requires --ont-fast5 and/or --ont-fastq'] - + self.print_and_log('Validating miniasm utilities', self.main_process_verbosity, self.main_process_color) - + for utility in ['minimap2', 'miniasm'] : self.validate_utility(utility, utility + ' is not on the PATH (required by --assembler miniasm)') self.will_have_genome_fasta = True - + self.analysis += ['miniasm_ont_fastq'] self.racon = True - - + + @unless_only_basecall @unless_given_genome def validate_wtdbg2(self) : @@ -438,23 +438,23 @@ def validate_wtdbg2(self) : if self.no_assembly : return - + if not (self.ont_fast5 or self.ont_fastq) : return - + self.print_and_log('Validating wtdbg2 utilities', self.main_process_verbosity, self.main_process_color) - + if not self.genome_size : self.errors += ['wtdbg2 requires --genome-size'] elif not re.match('[0-9]+(\\.[0-9]+)?[mkMK]', self.genome_size) : self.errors += ['--genome-size needs to be a floating point number in Mega or kilobases, got ' + str(self.genome_size)] - + for utility in ['pgzf', 'kbm2', 'wtdbg2', 'wtpoa-cns', 'wtdbg-cns'] : self.validate_utility(utility, utility + ' is not on the PATH (required by --assembler wtdbg2)') self.will_have_ont_assembly = True self.will_have_genome_fasta = True - + self.analysis += ['wtdbg2_ont_fastq'] # self.analysis += ['racon_ont_assembly'] @@ -468,9 +468,9 @@ def validate_flye(self) : if not self.ont_fastq and not self.ont_fast5 : return - + self.print_and_log('Validating flye utilities', self.main_process_verbosity, self.main_process_color) - + if not self.ont_fast5 and not self.ont_fastq : self.errors += ['Assembly requires --ont-fast5 and/or --ont-fastq'] @@ -478,7 +478,7 @@ def validate_flye(self) : self.errors += ['Flye requires --genome-size'] elif not re.match('[0-9]+(\\.[0-9]+)?[mkMK]', self.genome_size) : self.errors += ['--genome-size needs to be a floating point number in Mega or kilobases, got ' + str(self.genome_size)] - + # TODO deal with Flye and 2.7/3.6 issues # for utility in ['flye'] : @@ -489,32 +489,32 @@ def validate_flye(self) : self.analysis += ['flye_ont_fastq'] - + @unless_only_basecall def validate_racon(self) : if not self.racon : return - + if not self.ont_fast5 and not self.ont_fastq : self.errors += ['Assembly requires --ont-fast5 and/or --ont-fastq'] - + self.print_and_log('Validating racon', self.main_process_verbosity, self.main_process_color) - + self.validate_utility('racon', 'racon' + ' is not on the PATH (required by --assembler ' + self.assembler +')') self.analysis += ['racon_ont_assembly'] - + @unless_only_basecall def validate_medaka(self) : - + # if not self.medaka : # return if not self.will_have_ont_assembly : return - + self.print_and_log('Validating medaka', self.main_process_verbosity, self.main_process_color) if not self.ont_fast5 and not self.ont_fastq : @@ -522,18 +522,18 @@ def validate_medaka(self) : if not self.will_have_genome_fasta : self.errors += ['--medaka requires that a genome be assembled or supplied'] - + self.validate_utility('medaka_consensus', 'medaka_consensus is not on the PATH (required by --medaka)') self.analysis += ['medaka_ont_assembly'] - - @unless_only_basecall + + @unless_only_basecall def validate_nanopolish(self) : - + if not self.nanopolish : - return - + return + self.print_and_log('Validating Nanopolish', self.main_process_verbosity, self.main_process_color) if not self.ont_fast5 : @@ -541,24 +541,24 @@ def validate_nanopolish(self) : if not self.will_have_genome_fasta : self.errors += ['--nanopolish requires that a genome be assembled or supplied'] - + self.validate_utility('nanopolish', 'nanopolish is not on the PATH (required by --nanopolish)') self.analysis += ['nanopolish_ont_assembly'] - @unless_only_basecall + @unless_only_basecall def validate_illumina_fastq(self) : - + if not self.illumina_fastq : return self.print_and_log('Validating Illumina data', self.main_process_verbosity, self.main_process_color) - + for r_fastq in self.illumina_fastq : if not os.path.isfile(r_fastq) : self.errors += ['Illumian FASTQ file' + r_fastq + ' cannot be found'] - + # Assume we want to use Illumina data to polish a given genome or ONT assembly if self.ont_fast5 or self.ont_fastq or self.genome_fasta : self.validate_utility('pilon', 'pilon is not on the PATH (required by --illumina-fastq with --ont-fastq, --ont-fast, or --genome)') @@ -568,28 +568,28 @@ def validate_illumina_fastq(self) : self.validate_utility('spades.py', 'spades.py is not on the PATH (required by --illumina-fastq)') self.analysis += ['spades_illumina_fastq'] self.will_have_genome_fasta = True - + @unless_only_basecall def validate_features(self) : if self.feature_fastas is None : self.feature_fastas = [] - + if not self.no_amr : self.feature_fastas += [self.amr_database] if not self.no_inc : self.feature_fastas += [self.inc_database] - + if len(self.feature_fastas) == 0 : return if not self.will_have_genome_fasta : return - + self.print_and_log('Validating feature sets', self.main_process_verbosity, self.main_process_color) - + for feature_fasta in self.feature_fastas : if not os.path.isfile(feature_fasta) : self.errors += ['Can\'t find feature database ' + feature_fasta] @@ -603,16 +603,16 @@ def validate_blast(self) : if not self.will_have_genome_fasta : return - + self.print_and_log('Validating blast utilities', self.main_process_verbosity, self.main_process_color) for util in ['makeblastdb', 'blastn', 'bedtools'] : if not shutil.which(util) : self.errors += ['Can\'t find ' + util + ' on the PATH'] - + self.analysis = self.analysis + ['blast_feature_sets'] - + @unless_only_basecall def validate_reference(self) : @@ -620,7 +620,7 @@ def validate_reference(self) : return self.print_and_log('Validating reference genome and utilities', self.main_process_verbosity, self.main_process_color) - + #1 - Check for mummer utils for util in ['dnadiff', 'nucmer', 'mummer'] : if not shutil.which(util) : @@ -637,7 +637,7 @@ def validate_reference(self) : self.organism_dir = os.path.join(self.reference_dir, self.organism) if not os.path.isdir(self.organism_dir) : self.errors += ['Can\'t find organism directory ' + self.organism_dir] - + #It should be in reference_sequences/organism self.reference_fasta = os.path.join(self.organism_dir, 'genome.fasta') if not os.path.isfile(self.reference_fasta) : @@ -645,14 +645,14 @@ def validate_reference(self) : self.mutation_regions_bed = os.path.join(self.reference_dir, self.organism, 'mutation_regions.bed') if not os.path.isfile(self.mutation_regions_bed) : self.errors += ['Can\'t find mutation_regions.bed in the reference directory.'] - + if self.will_have_genome_fasta : self.analysis = self.analysis + ['call_insertions'] - - @unless_only_basecall + + @unless_only_basecall def validate_mutations(self) : - + if self.no_mutations : return @@ -660,18 +660,18 @@ def validate_mutations(self) : return self.print_and_log('Validating mapping and variant utilities', self.main_process_verbosity, self.main_process_color) - + if not (self.ont_fast5 or self.ont_fastq or self.illumina_fastq): self.errors += ['Can\'t call mutations without a FAST5 or FASTQ dataset'] if not self.organism or self.no_reference : self.errors += ['Can\'t call mutations without a reference genome'] - + for utility in ['minimap2', 'samtools', 'bcftools'] : self.validate_utility(utility, utility + ' is not on the PATH (required for AMR mutations)') - + self.analysis = self.analysis + ['call_amr_mutations'] - + @unless_only_basecall def validate_plasmids(self) : @@ -679,44 +679,44 @@ def validate_plasmids(self) : return self.print_and_log('Validating plasmid database and utilities', self.main_process_verbosity, self.main_process_color) - + for utility in ['pblat', 'Rscript', 'R', 'pChunks.R'] : self.validate_utility(utility, utility + ' is not on the PATH (required for plasmid finding)') - + if not os.path.isfile(self.plasmid_database) : self.errors += ['Can\'t find plasmid database ' + self.plasmid_database] - + if not self.will_have_genome_fasta : self.errors += ['Can\'t call plasmids without a genome or an assembly'] self.analysis = self.analysis + ['call_plasmids'] - + def validate_options(self) : self.validate_output_dir() self.validate_ont_fast5() self.validate_ont_fastq() - + self.validate_guppy() self.validate_albacore() - + self.validate_qcat() self.validate_porechop() - + self.validate_lorma() - + self.validate_genome_fasta() - + self.validate_miniasm() self.validate_wtdbg2() self.validate_flye() self.validate_racon() - + self.validate_medaka() self.validate_nanopolish() - + self.validate_illumina_fastq() self.validate_features() @@ -728,7 +728,7 @@ def validate_options(self) : if len(self.analysis) == 0 : self.errors = self.errors + ['Nothing to do!'] - + def load_organisms() : print('Pretending to load organisms') @@ -736,7 +736,7 @@ def list_organisms() : ''' Lists the set of reference organisms available to Pima ''' - + def make_output_dir(self) : if os.path.isdir(self.output_dir) : @@ -744,22 +744,22 @@ def make_output_dir(self) : os.mkdir(self.output_dir) - + def start_logging(self): self.logging_file = os.path.join(self.output_dir, 'log.txt') - + def guppy_ont_fast5(self) : - + self.print_and_log('Basecalling ONT reads with Guppy', self.main_process_verbosity, self.main_process_color) - + # Make the directory for new FASTQ files self.print_and_log('Running Guppy on raw ONT FAST5', self.sub_process_verbosity, self.sub_process_color) self.ont_fastq_dir = os.path.join(self.output_dir, 'ont_fastq') os.makedirs(self.ont_fastq_dir) stdout_file = os.path.join(self.ont_fastq_dir, 'guppy.stdout') stderr_file = os.path.join(self.ont_fastq_dir, 'guppy.stderr') - + # Run the basecalling with Guppy command = ' '.join(['guppy_basecaller', '-i', self.ont_fast5, @@ -780,11 +780,11 @@ def guppy_ont_fast5(self) : # Make sure the merged FASTQ file exists and has size > 0 self.validate_file_and_size_or_error(self.ont_raw_fastq, 'ONT raw FASTQ file', 'cannot be found after albacore', 'is empty') - + def albacore_ont_fast5(self) : - + self.print_and_log('Basecalling ONT reads with Albacore', self.main_process_verbosity, self.main_process_color) - + # Make the directory for new FASTQ files self.print_and_log('Running Albacore on raw ONT FAST5', self.sub_process_verbosity, self.sub_process_color) self.ont_fastq_dir = os.path.join(self.output_dir, 'ont_fastq') @@ -814,7 +814,7 @@ def albacore_ont_fast5(self) : self.albacore_seq_files = pandas.DataFrame(albacore_seq_files) self.albacore_seq_files_file = os.path.join(self.ont_fastq_dir, 'albacore_seq_files.txt') self.albacore_seq_files.to_csv(path_or_buf = self.albacore_seq_files_file, header = False, index = False) - + # Merge the smaller FASTQ files self.print_and_log('Merging Albacore runs into raw ONT FASTQ', self.sub_process_verbosity, self.sub_process_color) self.ont_raw_fastq = os.path.join(self.ont_fastq_dir, 'ont_raw.fastq') @@ -824,7 +824,7 @@ def albacore_ont_fast5(self) : # Make sure the merged FASTQ file exists and has size > 0 self.validate_file_and_size_or_error(self.ont_raw_fastq, 'ONT raw FASTQ file', 'cannot be found after albacore', 'is empty') - + def porechop_ont_fastq(self) : self.print_and_log('Running Porechop on raw ONT FASTQ', self.sub_process_verbosity, self.sub_process_color) @@ -857,7 +857,7 @@ def qcat_ont_fastq(self) : qcat_input_fastq = self.ont_raw_fastq else : qcat_input_fastq = self.ont_fastq - + self.print_and_log('Running qcat on raw ONT FASTQ', self.sub_process_verbosity, self.sub_process_color) stdout_file, stderr_file = [os.path.join(self.demultiplexed_dir, 'qcat.' + i) for i in ['stdout', 'stderr']] command = ' '.join(['qcat', @@ -883,9 +883,9 @@ def qcat_ont_fastq(self) : self.validate_file_and_size_or_error(barcode_summary_tsv, 'Barcode summary', 'cannot be found after qcat', 'is empty') self.barcode_summary = pandas.read_csv(filepath_or_buffer = barcode_summary_tsv, sep = '\t', header = None) - self.barcode_summary = self.barcode_summary.loc[self.barcode_summary.iloc[:, 2] >= self.barcode_min_fraction, :] + self.barcode_summary = self.barcode_summary.loc[self.barcode_summary.iloc[:, 2] >= self.barcode_min_fraction, :] self.barcodes = self.barcode_summary.iloc[:, 0].values - + # In the case of a single barcode just go on as normal, otherwise, make an Analysis for each barcode. # TODO - handle this differntly if demultiplexing was requested. if len(self.barcodes) == 1 or not self.multiplexed : @@ -893,7 +893,7 @@ def qcat_ont_fastq(self) : else : self.analysis = ['start_barcode_analysis'] - + def start_barcode_analysis(self) : for barcode in self.barcodes : @@ -914,7 +914,7 @@ def lorma_ont_fastq(self) : if not self.ont_fastq_dir : self.ont_fastq_dir = os.path.join(self.output_dir, 'ont_fastq') os.makedirs(self.ont_fastq_dir) - + # Use lordec-correct to ONT reads self.print_and_log('Running LoRMA on the ONT reads', self.sub_process_verbosity, self.sub_process_color) lorma_fasta, lorma_fastq = [os.path.join(self.ont_fastq_dir, 'lorma.' + i) for i in ['fasta', 'fastq']] @@ -944,7 +944,7 @@ def lorma_ont_fastq(self) : self.ont_fastq = lorma_fastq - + def racon_ont_fastq(self) : self.print_and_log('Using RACON to error-correct ONT reads', self.main_process_verbosity, self.main_process_color) @@ -952,7 +952,7 @@ def racon_ont_fastq(self) : if not self.ont_fastq_dir : self.ont_fastq_dir = os.path.join(self.output_dir, 'ont_fastq') os.makedirs(self.ont_fastq_dir) - + # Use lordec-correct to ONT reads self.print_and_log('Running LoRMA on the ONT reads', self.sub_process_verbosity, self.sub_process_color) lorma_fasta, lorma_fastq = [os.path.join(self.ont_fastq_dir, 'lorma.' + i) for i in ['fasta', 'fastq']] @@ -981,8 +981,8 @@ def racon_ont_fastq(self) : self.validate_file_and_size_or_error(lorma_fasta, 'LoRMA FASTQ', 'cannot be found after FASTQ conversion', 'is empty') self.ont_fastq = lorma_fastq - - + + def miniasm_ont_fastq(self) : self.print_and_log('Assembling ONT reads using minimap2/miniasm', self.main_process_verbosity, self.main_process_color) @@ -1026,7 +1026,7 @@ def miniasm_ont_fastq(self) : self.print_and_run(command) self.validate_file_and_size_or_error(self.genome_fasta, 'ONT miniasm FASTA', 'cannot be found after miniasm', 'is empty') - + def wtdbg2_ont_fastq(self) : self.print_and_log('Assembling ONT reads using wtdbg2', self.main_process_verbosity, self.main_process_color) @@ -1035,7 +1035,7 @@ def wtdbg2_ont_fastq(self) : self.ont_assembly_dir = os.path.join(self.output_dir, 'ont_assembly') os.makedirs(self.ont_assembly_dir) - # Use wtdbg2 to generate a layout + # Use wtdbg2 to generate a layout self.print_and_log('Running wtdbg2', self.sub_process_verbosity, self.sub_process_color) wtdbg2_prefix = os.path.join(self.ont_assembly_dir, 'assembly') wtdbg2_stdout, wtdbg2_stderr = [os.path.join(self.ont_assembly_dir, 'wtdbg2.' + i) for i in ['stdout', 'stderr']] @@ -1063,7 +1063,7 @@ def wtdbg2_ont_fastq(self) : self.print_and_run(command) self.validate_file_and_size_or_error(self.genome_fasta, 'WTDBG2 assembly fasta', 'cannot be found after miniasm', 'is empty') - + def flye_ont_fastq(self) : self.print_and_log('Assembling ONT reads using flye', self.main_process_verbosity, self.main_process_color) @@ -1077,7 +1077,7 @@ def flye_ont_fastq(self) : # flye_output_dir = os.path.join(self.ont_assembly_dir, 'flye') flye_output_dir = self.ont_assembly_dir flye_stdout, flye_stderr = [os.path.join(self.ont_assembly_dir, 'flye.' + i) for i in ['stdout', 'stderr']] - flye_fasta = flye_output_dir + '/assembly.fasta' + flye_fasta = os.path.join(flye_output_dir, 'assembly.fasta') command = ' '.join(['flye', '--plasmid', '--asm-coverage 50', @@ -1092,17 +1092,17 @@ def flye_ont_fastq(self) : self.validate_file_and_size_or_error(flye_fasta, 'Flye fasta', 'cannot be found after flye', 'is empty') # Copy it to the standard location - self.genome_fasta = self.ont_assembly_dir + '/assembly.fasta' + self.genome_fasta = os.path.join(self.ont_assembly_dir, 'assembly.fasta') command = ' '.join(['cp', flye_fasta, self.genome_fasta]) self.print_and_run(command) self.validate_file_and_size_or_error(self.genome_fasta, 'Genome fasta', 'cannot be found after copying Flye output', 'is empty') - + def racon_ont_assembly(self) : self.racon_dir = os.path.join(self.output_dir, 'racon') os.makedirs(self.racon_dir) - + # Use RACON to generate a consensus assembly self.ont_rva_paf = [] self.ont_racon_fasta = [] @@ -1127,7 +1127,7 @@ def racon_ont_assembly(self) : ont_racon_fasta_i = os.path.join(self.racon_dir, 'ont_racon_' + str(i) + '.fasta') self.ont_racon_fasta = self.ont_racon_fasta + [ont_racon_fasta_i] stderr_file = os.path.join(self.racon_dir, 'racon_' + str(i) + '.stderr') - command = ' '.join(['racon -m 8 -x 6 -g -8 -w 500', + command = ' '.join(['racon -m 8 -x 6 -g -8 -w 500', '-t', str(self.threads), self.ont_fastq, ont_rva_paf_i, input_assembly, '1>' + ont_racon_fasta_i, @@ -1146,14 +1146,14 @@ def racon_ont_assembly(self) : self.print_and_run(command) self.validate_file_and_size_or_error(self.genome_fasta, 'Genome assembly', 'cannot be found after fixing names', 'is empty') - + def medaka_ont_assembly(self) : self.print_and_log('Running Medaka on ONT assembly', self.main_process_verbosity, self.main_process_color) self.medaka_dir = os.path.join(self.output_dir, 'medaka') os.makedirs(self.medaka_dir) - + # Actually run Medaka self.print_and_log('Starting medaka', self.sub_process_verbosity, self.sub_process_color) self.medaka_fasta = os.path.join(self.medaka_dir, 'consensus.fasta') @@ -1176,8 +1176,8 @@ def medaka_ont_assembly(self) : '>', self.genome_fasta]) self.print_and_run(command) self.validate_file_and_size_or_error(self.genome_fasta, 'Genome assembly', 'cannot be found after fixing names', 'is empty') - - + + def nanopolish_ont_assembly(self) : self.print_and_log('Running Nanopolish on ONT assembly', self.main_process_verbosity, self.main_process_color) @@ -1192,7 +1192,7 @@ def nanopolish_ont_assembly(self) : self.nanopolish_bam = os.path.join(self.nanopolish_dir, 'minimap.bam') self.minimap_ont_fastq(self.genome_fasta, self.ont_fastq, self.nanopolish_bam) self.index_bam(self.nanopolish_bam) - + # Find the coverage of each contig to see if we need to downsample coverage_file = os.path.join(self.nanopolish_dir, 'contig_coverage.tsv') command = ' '.join(['samtools depth', self.nanopolish_bam, @@ -1201,7 +1201,7 @@ def nanopolish_ont_assembly(self) : self.print_and_run(command) coverage = pandas.read_table(filepath_or_buffer = coverage_file, header = None, index_col = 0) coverage.columns = ['coverage'] - + # If any need to be downsampled, just do them all if (coverage.iloc[:,0] > self.nanopolish_coverage_max * 1.25).any() : contig_bams = [] @@ -1228,11 +1228,11 @@ def nanopolish_ont_assembly(self) : else : command = ' '.join(['cp', contig_bams[0], downsampled_bam]) self.print_and_run(command) - + self.nanopolish_bam = downsampled_bam self.index_bam(self.nanopolish_bam) - + # Index the FAST5 data self.print_and_log('Indexing FAST5 data', self.sub_process_verbosity, self.sub_process_color) index_stdout, index_stderr = [os.path.join(self.nanopolish_dir, 'index.' + i) for i in ['stdout', 'stderr']] @@ -1252,7 +1252,7 @@ def nanopolish_ont_assembly(self) : self.print_and_run(command) self.nanopolish_prefix = os.path.join(self.nanopolish_dir, 'nanopolish') - + # Collect the list of ranges that Nanopolish will work on nanopolish_ranges_file = self.nanopolish_prefix + '_ranges.txt' command = ' '.join(['nanopolish_makerange.py', self.genome_fasta, '>', nanopolish_ranges_file]) @@ -1276,7 +1276,7 @@ def nanopolish_ont_assembly(self) : self.validate_file_and_size_or_error(self.nanopolish_fasta, 'Nanpolish FASTA', 'cannot be found after samtools', 'is empty') self.genome_fasta = self.nanopolish_fasta - + def run_nanopolish_range(self, nanopolish_range) : print(nanopolish_range) nanopolish_vcf = self.nanopolish_prefix + '.' + nanopolish_range + '.vcf' @@ -1324,11 +1324,11 @@ def spades_ont_assembly(self) : self.genome_fasta = assembly_fasta self.validate_file_and_size_or_error(self.genome_fasta, 'Genome assembly', 'cannot be found after SPAdes', 'is empty') - + def spades_illumina_fastq(self) : self.print_and_log('Assembling Illumina FASTQ data', self.main_process_verbosity, self.main_process_color) - + # Figure out where the assembly is going self.spades_dir = os.path.join(self.output_dir, 'spades') os.makedirs(self.spades_dir) @@ -1350,12 +1350,12 @@ def spades_illumina_fastq(self) : self.print_and_run(command) self.genome_fasta = assembly_fasta self.validate_file_and_size_or_error(self.genome_fasta, 'Genome assembly', 'cannot be found after SPAdes', 'is empty') - - + + def pilon_assembly(self) : self.print_and_log('Running Pilon on genome assembly', self.main_process_verbosity, self.main_process_color) - + self.pilon_dir = os.path.join(self.output_dir, 'pilon') os.makedirs(self.pilon_dir) @@ -1378,7 +1378,7 @@ def pilon_assembly(self) : '2>', pilon_stderr]) self.print_and_run(command) self.validate_file_and_size_or_error(self.pilon_fasta, 'Pilon FASTA', 'cannot be found after pilon', 'is empty') - + self.genome_fasta = self.pilon_fasta @@ -1386,7 +1386,7 @@ def blast_feature_sets(self) : self.features_dir = os.path.join(self.output_dir, 'features') os.makedirs(self.features_dir) - + for feature_number in range(len(self.feature_fastas)) : feature_fasta = self.feature_fastas[feature_number] feature_name = re.sub('\\.f.*', '', os.path.basename(feature_fasta)) @@ -1394,10 +1394,10 @@ def blast_feature_sets(self) : self.blast_features(feature_fasta, feature_dir) self.feature_dirs += [feature_dir] self.feature_names += [feature_name] - - + + def blast_features(self, feature_fasta, feature_dir) : - + # Make a directory for the new features os.makedirs(feature_dir) @@ -1410,7 +1410,7 @@ def blast_features(self, feature_fasta, feature_dir) : '1>' + stdout_file, '2>' + stderr_file]) self.print_and_run(command) - + # BLASTn the feature set blast_output = os.path.join(feature_dir, 'blast_output.tsv') self.print_and_log('BLASTing features against the assembly', self.sub_process_verbosity, self.sub_process_color) @@ -1426,7 +1426,7 @@ def blast_features(self, feature_fasta, feature_dir) : # Clean up the results into a handy BED file self.print_and_log('Converting feature hits to BED', self.sub_process_verbosity, self.sub_process_color) all_bed = os.path.join(feature_dir, 'all.bed') - command = ' '.join(['cat', blast_output, + command = ' '.join(['cat', blast_output, '| awk -F \'\\t\' \'($3 >= 95) && ($4 / $14 >= .90){OFS = "\\t";' + \ 'print $2,($9 < $10 ? $9 : $10),($9 < $10 ? $10 : $9),$1,$3/100,($9 < $10 ? "+" : "-")}\'', '| sort -k 1,1 -k 2,2n >', all_bed]) @@ -1440,7 +1440,7 @@ def blast_features(self, feature_fasta, feature_dir) : '1>' + merge_bed, '2>' + stderr_file]) self.print_and_run(command) - + # Pick the best hit for each cluster self.print_and_log('Finding the best hit for each feature cluster', self.sub_process_verbosity, self.sub_process_color) best_bed = os.path.join(feature_dir, 'best.bed') @@ -1455,14 +1455,14 @@ def blast_features(self, feature_fasta, feature_dir) : '>' + best_bed]) self.print_and_run(command) - + def call_amr_mutations(self) : - + self.print_and_log('Calling AMR mutations', self.main_process_verbosity, self.main_process_color) # Start of table of mutation results self.amr_mutations = None - + # Make the directory for new assembly files self.mutations_dir = os.path.join(self.output_dir, 'mutations') os.makedirs(self.mutations_dir) @@ -1499,7 +1499,7 @@ def call_amr_mutations(self) : self.validate_file_and_size_or_error(region_bam, 'Region SAM file', 'cannot be found', 'is empty') self.index_bam(region_bam) - + region_pileup = os.path.join(region_dir, 'region.mpileup') stderr_file = os.path.join(region_dir, 'samtools_index.stderr') command = ' '.join(['samtools mpileup', @@ -1524,7 +1524,7 @@ def call_amr_mutations(self) : '2>' + stderr_file]) self.print_and_run(command) self.validate_file_and_size_or_error(region_text_pileup, 'Region plain text MPILEUP file', 'cannot be found', 'is empty') - + # Call the actual SNPs region_vcf = os.path.join(region_dir, 'region.vcf') stderr_file = os.path.join(region_dir, 'bcftools.stderr') @@ -1547,11 +1547,11 @@ def call_amr_mutations(self) : region_mutations.columns = self.amr_mutations.columns self.amr_mutations = self.amr_mutations.append(region_mutations) - + def call_plasmids(self) : self.print_and_log('Calling plasmids', self.main_process_verbosity, self.main_process_color) - + # Make a directory for plasmid stuff self.plasmid_dir = os.path.join(self.output_dir, 'plasmids') os.makedirs(self.plasmid_dir) @@ -1563,7 +1563,7 @@ def call_plasmids(self) : '| awk \'($2 <= 500000){print $1}\'', '| parallel -n1 -n1 faidx', self.genome_fasta, '>', self.smaller_contigs_fasta]) self.print_and_run(command) - + # Query plasmid sequences against the assembly self.print_and_log('Running PBLAT against the plasmid database', self.sub_process_verbosity, self.sub_process_color) self.plasmid_psl = os.path.join(self.plasmid_dir, 'plasmid_hits.psl') @@ -1596,14 +1596,14 @@ def call_plasmids(self) : command = ' '.join(['cp', self.plasmid_tsv, new_plasmid_tsv]) self.print_and_run(command) self.plasmid_tsv = new_plasmid_tsv - + self.plasmids = pandas.read_csv(filepath_or_buffer = self.plasmid_tsv, sep = '\t') - + def call_insertions(self) : self.print_and_log('Calling insertions', self.main_process_verbosity, self.main_process_color) - + # Make the directory for new assembly files self.insertions_dir = os.path.join(self.output_dir, 'insertions') os.makedirs(self.insertions_dir) @@ -1649,7 +1649,7 @@ def call_insertions(self) : '| awk \'($3 - $2 >= 100){OFS = "\\t";print $1,$2,$3,($3 - $2)}\'', '>', reference_insertions_bed]) self.print_and_run(command) - + command = ' '.join(['bedtools complement', '-i', genome_aligned_bed, '-g', genome_sizes, @@ -1664,13 +1664,13 @@ def clean_up(self) : final_fasta = os.path.join(self.output_dir, 'assembly.fasta') command = ' '.join(['cp', self.genome_fasta, final_fasta]) self.print_and_run(command) - + def go(self) : self.analysis = ['make_output_dir', 'start_logging'] + self.analysis + ['clean_up'] analysis_string = '\n'.join([str(i+1) + ') ' + self.analysis[i] for i in range(len(self.analysis))]) print(self.main_process_color + analysis_string + Colors.ENDC) - + while (True) : step = self.analysis[0] self.analysis = self.analysis[1:] @@ -1682,11 +1682,11 @@ def go(self) : def main(opts) : - """ + """ """ if __name__ == '__main__': - + parser = ArgumentParser(prog = "pima.py", add_help = False, description = @@ -1694,14 +1694,14 @@ def main(opts) : P.I.M.A. bacterial genome analysis pipeline ''', formatter_class = lambda prog: HelpFormatter(prog, width = 120, max_help_position = 120)) - + parser._optionals.title = 'Help and version' parser.add_argument('--help', action = 'store_true', help = 'Print this help and exit.') parser.add_argument('--version', action = 'version', help = 'Print the software version.', version = 'PIMA microbial genome analysis pipeline (version {})'.format(VERSION)) - + # Input arguments input_group = parser.add_argument_group('Input and basecalling options') input_group.add_argument('--ont-fast5', required = False, default = None, metavar = '', @@ -1722,21 +1722,21 @@ def main(opts) : help = 'Use LORMA to error-correct ONT reads (default : %(default)s)') input_group.add_argument('--only-basecall', required = False, default = False, action = 'store_true', help = 'Just basecall and demultiplex/trim/correct. No downstream analysis.') - + input_group.add_argument('--illumina-fastq', required = False, default = None, nargs = 2, metavar = '', help = 'Files containing R1 & R2 Illumina reads') input_group.add_argument('--genome', required = False, default = None, metavar = '', help = 'A genome FASTA file to be used in place of assembly') - + output_group = parser.add_argument_group('Output options') output_group.add_argument('--output', required = False, default = None, metavar = '', help = 'Output directory for the analysis') output_group.add_argument('--overwrite', required = False, default = False, action = 'store_true', help = 'Overwrite an existing output directory (default : %(default)s)') - + # Assembly options assembly_group = parser.add_argument_group('Assembly options') assembly_group.add_argument('--assembler', required = False, default = 'flye', type = str, @@ -1760,38 +1760,38 @@ def main(opts) : help = 'Run Pilon if Illumina reads are given (default : %(default)s)') assembly_group.add_argument('--no-assembly', required = False, default = False, action = 'store_true', help = 'Don\'t assemble the given reads (default : %(default)s)') - + # Plasmid options plasmid_group = parser.add_argument_group('Plasmid and vector search options') - plasmid_group.add_argument('--plasmids', required = False, default = False, action = 'store_true', + plasmid_group.add_argument('--plasmids', required = False, default = False, action = 'store_true', help = 'Do a plasmid search (default : %(default)s)') - plasmid_group.add_argument('--plasmid-database', required = False, default = plasmid_database_default, metavar = '', + plasmid_group.add_argument('--plasmid-database', required = False, default = plasmid_database_default, metavar = '', help = 'Path to a FASTA file with reference plasmid sequences') - + # AMR gene options amr_group = parser.add_argument_group('AMR gene search options') amr_group.add_argument('--amr-database', required = False, default = amr_database_default, metavar = '', help = 'Path to a FASTA file with AMR gene sequences (default : %(default)s)') - amr_group.add_argument('--no-amr', required = False, default = False, action = 'store_true', + amr_group.add_argument('--no-amr', required = False, default = False, action = 'store_true', help = 'Skip AMR search (default : %(default)s)') - + # Inc group options inc_group = parser.add_argument_group('Incompatibility group search options') inc_group.add_argument('--inc-database', required = False, default = inc_database_default, metavar = '', help = 'Path to a FASTA file with incompatibility group sequences (default : %(default)s)') - inc_group.add_argument('--no-inc', required = False, default = False, action = 'store_true', + inc_group.add_argument('--no-inc', required = False, default = False, action = 'store_true', help = 'Skip incompatibility group search (default : %(default)s)') - + # Pull in custom feature sets other_feature_group = parser.add_argument_group('Other feature search options') other_feature_group.add_argument('--feature', required = False, default = None, metavar = '', action = 'append', help = 'Path to a FASTA file with feature sequences') - - + + # Which organism are we working with here organism_group = parser.add_argument_group('Organism options') organism_group.add_argument('--reference-dir', required = False, default = reference_dir_default, metavar = '', @@ -1804,8 +1804,8 @@ def main(opts) : help = 'Skip reference organism comparison') organism_group.add_argument('--no-mutations', required = False, action = 'store_true', help = 'Skip looking for AMR conferring mutations') - - + + # Other arguments other_group = parser.add_argument_group('Other options') other_group.add_argument('--threads', required = False, type=int, default = 1, metavar = '', @@ -1815,10 +1815,10 @@ def main(opts) : other_group.add_argument('--fake-run', required = False, default = False, action = 'store_true', help = 'Don\'t actually run the pipeline, just pretend to (default : %(default)s)') - + opts, unknown_args = parser.parse_known_args() opts.logfile = 'pipeline.log' - + if opts.list_organisms: print_organisms() sys.exit(0) @@ -1830,9 +1830,9 @@ def main(opts) : # Start the analysis analysis = Analysis(opts, unknown_args) - + analysis.validate_options() - + if len(analysis.errors) > 0 : print(Colors.HEADER) parser.print_help() From 43687bc0f934dcabcf6ecbccc30750be8cb4bf96 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Mon, 16 Dec 2019 17:06:13 -0500 Subject: [PATCH 5/8] drop unused modules, use mp stdlib, reorganize database_default lines, use and handle None args especially with paths, add filesize requirement >0 for validate_file() to return True, avoid shutil.which for py2 compatibility, use validate_utility() and validate_file() when possible, guppy --device default auto, clarify --genome-size infmt requirement, print analysis.errors as separate lines not as unformatted python list on screen, remove space before colons --- pima.py | 640 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 338 insertions(+), 302 deletions(-) diff --git a/pima.py b/pima.py index ed25e86..f6b8fb3 100755 --- a/pima.py +++ b/pima.py @@ -2,30 +2,28 @@ from argparse import ArgumentParser, HelpFormatter import copy -import csv import datetime import glob -import logging +import multiprocessing as mp import os import re import shutil import subprocess import sys -import time import numpy -import joblib import pandas -import pathos.multiprocessing as mp pandas.set_option('display.max_colwidth', 200) -VERSION='0.1.2' +VERSION='0.1.3' -amr_database_default = re.sub('pima.py', '', os.path.realpath(__file__)) + 'amr.fasta' -inc_database_default = re.sub('pima.py', '', os.path.realpath(__file__)) + 'inc.fasta' -plasmid_database_default = re.sub('pima.py', '', os.path.realpath(__file__)) + 'plasmids_and_vectors.fasta' -reference_dir_default = re.sub('pima.py', '', os.path.realpath(__file__)) + 'reference_sequences' +pima_directory = os.path.dirname(os.path.realpath(__file__)) +amr_database_default = os.path.join(pima_directory, 'amr.fasta') +inc_database_default = os.path.join(pima_directory, 'inc.fasta') +plasmid_database_default = os.path.join(pima_directory, + 'plasmids_and_vectors.fasta') +reference_dir_default = os.path.join(pima_directory, 'reference_sequences') class Colors: HEADER = '\033[95m' @@ -37,32 +35,54 @@ class Colors: BOLD = '\033[1m' UNDERLINE = '\033[4m' -class Analysis : +class Analysis: - def __init__(self, opts, unknown_args) : + def __init__(self, opts, unknown_args): self.input_given = [] # Input options - self.ont_fast5 = os.path.realpath(os.path.expanduser(opts.ont_fast5)) + if opts.ont_fast5 is not None: + self.ont_fast5 = os.path.realpath(os.path.expanduser( + opts.ont_fast5)) + else: + self.ont_fast5 = None self.ont_fast5_limit = opts.ont_fast5_limit self.basecaller = opts.basecaller self.no_trim = opts.no_trim - self.albacore_seq_files_file = os.path.realpath(os.path.expanduser( - opts.albacore_seq_file)) + if opts.albacore_seq_file is not None: + self.albacore_seq_files_file = os.path.realpath( + os.path.expanduser(opts.albacore_seq_file)) + else: + self.albacore_seq_files_file = None self.barcode_min_fraction = 2.5 - self.ont_fastq = os.path.realpath(os.path.expanduser(opts.ont_fastq)) + if opts.ont_fastq is not None: + self.ont_fastq = os.path.realpath(os.path.expanduser( + opts.ont_fastq)) + else: + self.ont_fastq = None self.ont_fastq_dir = None self.only_basecall = opts.only_basecall self.multiplexed = opts.multiplexed self.demux = opts.demux self.error_correct = opts.error_correct - self.illumina_fastq = os.path.realpath(os.path.expanduser( - opts.illumina_fastq)) - self.genome_fasta = os.path.realpath(os.path.expanduser(opts.genome)) + if opts.illumina_fastq is not None: + self.illumina_fastq = os.path.realpath(os.path.expanduser( + opts.illumina_fastq)) + else: + self.illumina_fastq = None + if opts.genome is not None: + self.genome_fasta = os.path.realpath(os.path.expanduser( + opts.genome)) + else: + self.genome_fasta = None self.will_have_genome_fasta = False - self.output_dir = os.path.realpath(os.path.expanduser(opts.output)) + if opts.output is not None: + self.output_dir = os.path.realpath(os.path.expanduser( + opts.output)) + else: + self.output_dir = None self.overwrite = opts.overwrite # Assembly options @@ -78,21 +98,37 @@ def __init__(self, opts, unknown_args) : self.will_have_ont_assembly = False # Plasmid and feature options - self.plasmid_database = opts.plasmid_database + if self.validate_file(opts.plasmid_database): + self.plasmid_database = opts.plasmid_database + else: + self.plasmid_database = None self.plasmids = opts.plasmids - self.amr_database = opts.amr_database + if self.validate_file(opts.amr_database): + self.amr_database = opts.amr_database + else: + self.amr_database = None self.no_amr = opts.no_amr - self.inc_database = opts.inc_database + if self.validate_file(opts.inc_database): + self.inc_database = opts.inc_database + else: + self.inc_database = None self.no_inc = opts.no_inc - self.feature_fastas = opts.feature + if opts.feature is not None: + self.feature_fastas = os.path.realpath(os.path.expanduser( + opts.feature)) + else: + self.feature_fastas = None self.feature_dirs = [] self.feature_names = [] self.feature_colors = [] # Reference options - self.reference_dir = opts.reference_dir + if os.path.isdir(opts.reference_dir): + self.reference_dir = os.path.realpath(opts.reference_dir) + else: + self.reference_dir = None self.organism = opts.organism self.no_reference = opts.no_reference self.no_mutations = opts.no_mutations @@ -104,7 +140,7 @@ def __init__(self, opts, unknown_args) : # How much stuff to print self.verbosity = opts.verbosity - # Dont' actully run any commands + # Don't actually run any commands self.fake_run = opts.fake_run # Verbosity levels and colors @@ -121,40 +157,40 @@ def __init__(self, opts, unknown_args) : self.analysis = [] # See if we got any unknown args. Not allowed. - if len(unknown_args) != 0 : - self.errors = self.errors + ['Unknown argument: ' + unknown for unknown in unknown_args] + if len(unknown_args) != 0: + self.errors += ['Unknown argument: ' + str(u) for u in unknown_args] - def print_and_log(self, text, verbosity, color = Colors.ENDC) : - if verbosity <= self.verbosity : - time_string = '[' + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + ']' + def print_and_log(self, text, verbosity, color = Colors.ENDC): + if verbosity <= self.verbosity: + time_string = '[' + str(datetime.datetime.now().\ + strftime("%Y-%m-%d %H:%M:%S")) + ']' print(time_string + ' ' + color + text + Colors.ENDC) - def run_command(self, command) : - if not self.fake_run : + def run_command(self, command): + if not self.fake_run: subprocess.call(command, shell = True) - def print_and_run(self, command) : + def print_and_run(self, command): self.print_and_log(command, self.command_verbosity) self.run_command(command) - def validate_file(self, the_file) : - return os.path.isfile(the_file) + def validate_file(self, the_file): + return os.path.isfile(the_file) and os.path.getsize(the_file) > 0 - - def validate_file_size(self, the_file) : - if not self.fake_run : + def validate_file_size(self, the_file): + if not self.fake_run: return os.stat(the_file).st_size > 0 - else : + else: return True def validate_file_and_size_or_error(self, the_file, error_prefix = 'The file', presence_suffix = 'doesn\'t exist', - size_suffix = 'is size 0') : + size_suffix = 'is size 0'): if not self.validate_file(the_file) and not self.fake_run: self.print_and_log(' '.join([error_prefix, the_file, presence_suffix]), 0, Colors.FAIL) self.error_out() @@ -164,7 +200,7 @@ def validate_file_and_size_or_error(self, the_file, error_prefix = 'The file', self.error_out() - def minimap_ont_fastq(self, genome, fastq, bam) : + def minimap_ont_fastq(self, genome, fastq, bam): command = ' '.join(['minimap2 -a', '-t', str(self.threads), '-x map-ont', @@ -179,7 +215,7 @@ def minimap_ont_fastq(self, genome, fastq, bam) : self.index_bam(bam) - def minimap_illumina_fastq(self, genome, fastq, bam) : + def minimap_illumina_fastq(self, genome, fastq, bam): command = ' '.join(['minimap2 -a', '-t', str(self.threads), '-x sr', @@ -195,7 +231,7 @@ def minimap_illumina_fastq(self, genome, fastq, bam) : self.index_bam(bam) - def index_bam(self, bam) : + def index_bam(self, bam): command = ' '.join(['samtools index', bam, '1>/dev/null 2>/dev/null']) @@ -204,7 +240,7 @@ def index_bam(self, bam) : self.validate_file_and_size_or_error(index_bai) - def bwa_index(self, target, std_dir) : + def bwa_index(self, target, std_dir): bwa_stdout, bwa_stderr = [os.path.join(std_dir, 'bwa_index.' + i) for i in ['stdout', 'stderr']] command = ' '.join(['bwa index', target, @@ -213,44 +249,48 @@ def bwa_index(self, target, std_dir) : self.print_and_run(command) - def validate_utility(self, utility, error) : - if not shutil.which(utility) : - self.errors += [error] - return False + def validate_utility(self, utility, *extra_error_message): + for path in os.environ.get('PATH', '').split(':'): + if os.path.exists(os.path.join(path, utility)) and \ + not os.path.isdir(os.path.join(path, utility)): + return True + self.errors += ['Cannot find {} in PATH {}'.format(utility, + ''.join(extra_error_message))] + return False - def error_out(self) : + def error_out(self): sys.exit(1) - def unless_only_basecall(function) : - def wrapper(self) : - if self.only_basecall : + def unless_only_basecall(function): + def wrapper(self): + if self.only_basecall: return function(self) return wrapper - def unless_given_genome(function) : - def wrapper(self) : - if self.genome_fasta : + def unless_given_genome(function): + def wrapper(self): + if self.genome_fasta: return function(self) return wrapper - def validate_ont_fast5(self) : + def validate_ont_fast5(self): - if self.only_basecall and (not self.ont_fast5 and not self.ont_fastq) : + if self.only_basecall and (not self.ont_fast5 and not self.ont_fastq): self.errors += ['--only-basecall requires --ont-fast5 or --ont-fastq'] return - if not self.ont_fast5 : + if self.ont_fast5 is None: return self.print_and_log('Validating ONT fast5 files and utilities', self.main_process_verbosity, self.main_process_color) - if not os.path.isdir(self.ont_fast5) : + if not os.path.isdir(self.ont_fast5): self.errors += ['Input FAST5 directory ' + self.ont_fast5 + ' cannot be found'] return @@ -261,167 +301,167 @@ def validate_ont_fast5(self) : for base, dirs, files in os.walk(self.ont_fast5): found_files = fnmatch.filter(files, '*.fast5') fast5_files.extend(os.path.join(base, f) for f in found_files) - else + else: fast5_files = glob.glob(os.path.join(self.ont_fast5, '**', '*.fast5'), recursive=True) - if len(fast5_files) == 0 : + if len(fast5_files) == 0: self.errors += ['Could not find FAST5 files in ' + self.ont_fast5 + ' or its subdirectories'] else: self.print_and_log('Found {} FAST5 files'.format(len(fast5_files)), self.sub_process_verbosity, self.sub_process_color) - if not self.ont_fast5_limit is None: - if self.ont_fast5_limit <= 0 : + if self.ont_fast5_limit is not None: + if self.ont_fast5_limit <= 0: self.errors += ['FAST5 limit must be greater than 0. Specified input was ' + str(self.ont_fast5_limit)] - if self.ont_fast5 is None : + if self.ont_fast5 is None: self.errors += ['FAST5 limit must be accompanied by FAST5 data.'] def validate_ont_fastq(self): - if not self.ont_fastq : + if self.ont_fastq is None: return - if not os.path.isfile(self.ont_fastq) : - self.errors += ['Input FASTQ file ' + self.ont_fastq + ' cannot be found'] + if not self.validate_file(self.ont_fastq): + self.errors += ['Input FASTQ file ' + self.ont_fastq + ' is empty or missing'] - def validate_genome_fasta(self) : - if not self.genome_fasta : + def validate_genome_fasta(self): + if self.genome_fasta is None: return - if not os.path.isfile(self.genome_fasta) : - self.errors += ['Input genome FASTA ' + self.genome_fasta + ' cannot be found'] + if not self.validate_file(self.genome_fasta): + self.errors += ['Input genome FASTA ' + self.genome_fasta + ' is empty or missing'] self.will_have_genome_fasta = True - def validate_output_dir(self) : - if not opts.output : + def validate_output_dir(self): + if opts.output is None: self.errors += ['No output directory given (--output)'] return - elif self.output_dir and os.path.isdir(self.output_dir) and not self.overwrite : - self.errors += ['Output directory ' + self.output_dir + ' already exists. Add --overwrite to ignore'] + elif self.output_dir and os.path.isdir(self.output_dir) and not self.overwrite: + self.errors += ['Output directory ' + self.output_dir + ' already exists. Add --overwrite to ignore'] - def validate_guppy(self) : + def validate_guppy(self): - if self.basecaller != 'guppy' : + if self.basecaller != 'guppy': return - if not self.ont_fast5 : + if self.ont_fast5 is None: return - if self.ont_fastq : + if self.ont_fastq is not None: return self.print_and_log('Validating Guppy basecalling utilities', self.main_process_verbosity, self.main_process_color) - for utility in ['guppy_basecaller', 'guppy_aligner', 'guppy_barcoder'] : - self.validate_utility(utility, utility + ' is not on the PATH (required by --basecaller albacore)') + for utility in ['guppy_basecaller', 'guppy_aligner', 'guppy_barcoder']: + self.validate_utility(utility, '(required by --basecaller albacore)') self.analysis += ['guppy_ont_fast5'] - def validate_albacore(self) : - if self.basecaller != 'albacore' : + def validate_albacore(self): + if self.basecaller != 'albacore': return - if not self.ont_fast5 : + if self.ont_fast5 is None: return - if self.ont_fastq : + if self.ont_fastq is not None: return self.print_and_log('Validating Albacore basecalling utilities', self.main_process_verbosity, self.main_process_color) - for utility in ['read_fast5_basecaller.py'] : - self.validate_utility(utility, utility + ' is not on the PATH (required by --basecaller albacore)') + for utility in ['read_fast5_basecaller.py']: + self.validate_utility(utility, '(required by --basecaller albacore)') self.analysis += ['albacore_ont_fast5'] - def validate_qcat(self) : + def validate_qcat(self): - if self.no_trim : + if self.no_trim: return - if self.demux != 'qcat' : + if self.demux != 'qcat': return - if not (self.ont_fast5 or self.ont_fastq) : + if any(x is not None for x in [self.ont_fast5, self.ont_fastq]): return self.print_and_log('Validating qcat demultiplexer', self.main_process_verbosity, self.main_process_color) - self.validate_utility('qcat', 'qcat is not on the PATH') + self.validate_utility('qcat') self.analysis += ['qcat_ont_fastq'] - def validate_porechop(self) : + def validate_porechop(self): - if self.no_trim : + if self.no_trim: return - if self.demux != 'porechop' : + if self.demux != 'porechop': return - if not (self.ont_fast5 or self.ont_fastq) : + if any(x is not None for x in [self.ont_fast5, self.ont_fastq]): return self.print_and_log('Validating porechop trimmer/demultiplexer', self.main_process_verbosity, self.main_process_color) self.print_and_log('Warning: Porechop is no longer supported and is only included for older studies; qcat is preferred', self.warning_verbosity, self.warning_color) - self.validate_utility('porechop', 'porechop is not on the PATH (required by --demux porechop)') + self.validate_utility('porechop', '(required by --demux porechop)') self.analysis += ['porechop_ont_fastq'] - def validate_lorma(self) : + def validate_lorma(self): - if not self.error_correct : + if not self.error_correct: return - if not (self.ont_fast5 or self.ont_fastq) : + if any(x is not None for x in [self.ont_fast5, self.ont_fastq]): self.errors += ['--error-correct requires --ont-fast5 and/or --ont-fastq'] return self.print_and_log('Validating lorma error corrector', self.main_process_verbosity, self.main_process_color) - self.validate_utility('lordec-correct', 'LoRMA is not on the PATH (required for ONT error-correction)') + self.validate_utility('lordec-correct', '(required for ONT error-correction)') self.analysis += ['lorma_ont_fastq'] - def validate_racon_correction(self) : + def validate_racon_correction(self): - if self.no_error_correct : + if self.no_error_correct: return - if not (self.ont_fast5 or self.ont_fastq) : + if any(x is not None for x in [self.ont_fast5, self.ont_fastq]): return self.print_and_log('Validating RACON error corrector', self.main_process_verbosity, self.main_process_color) - self.validate_utility('racon', 'RACON is not on the PATH (required for ONT error-correction)') + self.validate_utility('racon', '(required for ONT error-correction)') self.analysis += ['racon_ont_fastq'] @unless_only_basecall @unless_given_genome - def validate_miniasm(self) : - if self.assembler != 'miniasm' : + def validate_miniasm(self): + if self.assembler != 'miniasm': return - if self.no_assembly : + if self.no_assembly: return - if not self.ont_fast5 and not self.ont_fastq : + if not self.ont_fast5 and not self.ont_fastq: self.errors += ['Miniasm assembly requires --ont-fast5 and/or --ont-fastq'] self.print_and_log('Validating miniasm utilities', self.main_process_verbosity, self.main_process_color) - for utility in ['minimap2', 'miniasm'] : - self.validate_utility(utility, utility + ' is not on the PATH (required by --assembler miniasm)') + for utility in ['minimap2', 'miniasm']: + self.validate_utility(utility, '(required by --assembler miniasm)') self.will_have_genome_fasta = True @@ -432,25 +472,25 @@ def validate_miniasm(self) : @unless_only_basecall @unless_given_genome - def validate_wtdbg2(self) : - if self.assembler != 'wtdbg2' : + def validate_wtdbg2(self): + if self.assembler != 'wtdbg2': return - if self.no_assembly : + if self.no_assembly: return - if not (self.ont_fast5 or self.ont_fastq) : + if any(x is not None for x in [self.ont_fast5, self.ont_fastq]): return self.print_and_log('Validating wtdbg2 utilities', self.main_process_verbosity, self.main_process_color) - if not self.genome_size : + if self.genome_size is None: self.errors += ['wtdbg2 requires --genome-size'] - elif not re.match('[0-9]+(\\.[0-9]+)?[mkMK]', self.genome_size) : + elif not re.match('[0-9]+(\\.[0-9]+)?[mkMK]', self.genome_size): self.errors += ['--genome-size needs to be a floating point number in Mega or kilobases, got ' + str(self.genome_size)] - for utility in ['pgzf', 'kbm2', 'wtdbg2', 'wtpoa-cns', 'wtdbg-cns'] : - self.validate_utility(utility, utility + ' is not on the PATH (required by --assembler wtdbg2)') + for utility in ['pgzf', 'kbm2', 'wtdbg2', 'wtpoa-cns', 'wtdbg-cns']: + self.validate_utility(utility, '(required by --assembler wtdbg2)') self.will_have_ont_assembly = True self.will_have_genome_fasta = True @@ -461,28 +501,28 @@ def validate_wtdbg2(self) : @unless_only_basecall @unless_given_genome - def validate_flye(self) : + def validate_flye(self): - if self.assembler != 'flye' : + if self.assembler != 'flye': return - if not self.ont_fastq and not self.ont_fast5 : + if not self.ont_fastq and not self.ont_fast5: return self.print_and_log('Validating flye utilities', self.main_process_verbosity, self.main_process_color) - if not self.ont_fast5 and not self.ont_fastq : + if not self.ont_fast5 and not self.ont_fastq: self.errors += ['Assembly requires --ont-fast5 and/or --ont-fastq'] - if not self.genome_size : + if self.genome_size is None: self.errors += ['Flye requires --genome-size'] - elif not re.match('[0-9]+(\\.[0-9]+)?[mkMK]', self.genome_size) : + elif not re.match('[0-9]+(\\.[0-9]+)?[mkMK]', self.genome_size): self.errors += ['--genome-size needs to be a floating point number in Mega or kilobases, got ' + str(self.genome_size)] # TODO deal with Flye and 2.7/3.6 issues -# for utility in ['flye'] : -# self.validate_utility(utility, utility + ' is not on the PATH (required by --assembler flye)') +# for utility in ['flye']: +# self.validate_utility(utility, '(required by --assembler flye)') self.will_have_ont_assembly = True self.will_have_genome_fasta = True @@ -491,208 +531,206 @@ def validate_flye(self) : @unless_only_basecall - def validate_racon(self) : + def validate_racon(self): - if not self.racon : + if not self.racon: return - if not self.ont_fast5 and not self.ont_fastq : + if not self.ont_fast5 and not self.ont_fastq: self.errors += ['Assembly requires --ont-fast5 and/or --ont-fastq'] self.print_and_log('Validating racon', self.main_process_verbosity, self.main_process_color) - self.validate_utility('racon', 'racon' + ' is not on the PATH (required by --assembler ' + self.assembler +')') + self.validate_utility('racon', '(required for assembly)') self.analysis += ['racon_ont_assembly'] @unless_only_basecall - def validate_medaka(self) : + def validate_medaka(self): -# if not self.medaka : +# if not self.medaka: # return - if not self.will_have_ont_assembly : + if not self.will_have_ont_assembly: return self.print_and_log('Validating medaka', self.main_process_verbosity, self.main_process_color) - if not self.ont_fast5 and not self.ont_fastq : + if not self.ont_fast5 and not self.ont_fastq: self.errors += ['--medaka requires --ont-fast5 and/or --ont-fastq'] - if not self.will_have_genome_fasta : + if not self.will_have_genome_fasta: self.errors += ['--medaka requires that a genome be assembled or supplied'] - self.validate_utility('medaka_consensus', 'medaka_consensus is not on the PATH (required by --medaka)') + self.validate_utility('medaka_consensus', '(required by --medaka)') self.analysis += ['medaka_ont_assembly'] @unless_only_basecall - def validate_nanopolish(self) : + def validate_nanopolish(self): - if not self.nanopolish : + if not self.nanopolish: return self.print_and_log('Validating Nanopolish', self.main_process_verbosity, self.main_process_color) - if not self.ont_fast5 : + if not self.ont_fast5: self.errors += ['--nanopolish requires --ont-fast5'] - if not self.will_have_genome_fasta : + if not self.will_have_genome_fasta: self.errors += ['--nanopolish requires that a genome be assembled or supplied'] - self.validate_utility('nanopolish', 'nanopolish is not on the PATH (required by --nanopolish)') + self.validate_utility('nanopolish', '(required by --nanopolish)') self.analysis += ['nanopolish_ont_assembly'] @unless_only_basecall - def validate_illumina_fastq(self) : + def validate_illumina_fastq(self): - if not self.illumina_fastq : + if self.illumina_fastq is None: return self.print_and_log('Validating Illumina data', self.main_process_verbosity, self.main_process_color) - for r_fastq in self.illumina_fastq : - if not os.path.isfile(r_fastq) : - self.errors += ['Illumian FASTQ file' + r_fastq + ' cannot be found'] + for r_fastq in self.illumina_fastq: + if not self.validate_file(r_fastq): + self.errors += ['Illumian FASTQ file' + r_fastq + ' is empty or missing'] # Assume we want to use Illumina data to polish a given genome or ONT assembly - if self.ont_fast5 or self.ont_fastq or self.genome_fasta : - self.validate_utility('pilon', 'pilon is not on the PATH (required by --illumina-fastq with --ont-fastq, --ont-fast, or --genome)') + if self.ont_fast5 or self.ont_fastq or self.genome_fasta: + self.validate_utility('pilon', '(required by --illumina-fastq with --ont-fastq, --ont-fast, or --genome)') #self.analysis += ['spades_ont_assembly'] self.analysis += ['pilon_assembly'] - else : - self.validate_utility('spades.py', 'spades.py is not on the PATH (required by --illumina-fastq)') + else: + self.validate_utility('spades.py', '(required by --illumina-fastq)') self.analysis += ['spades_illumina_fastq'] self.will_have_genome_fasta = True @unless_only_basecall - def validate_features(self) : + def validate_features(self): - if self.feature_fastas is None : + if self.feature_fastas is None: self.feature_fastas = [] - if not self.no_amr : + if not self.no_amr: self.feature_fastas += [self.amr_database] - if not self.no_inc : + if not self.no_inc: self.feature_fastas += [self.inc_database] - if len(self.feature_fastas) == 0 : + if len(self.feature_fastas) == 0: return - if not self.will_have_genome_fasta : + if not self.will_have_genome_fasta: return self.print_and_log('Validating feature sets', self.main_process_verbosity, self.main_process_color) - for feature_fasta in self.feature_fastas : - if not os.path.isfile(feature_fasta) : - self.errors += ['Can\'t find feature database ' + feature_fasta] + for feature_fasta in self.feature_fastas: + if not self.validate_file(feature_fasta): + self.errors += ['Missing or empty feature database ' + feature_fasta] @unless_only_basecall - def validate_blast(self) : + def validate_blast(self): - if len(self.feature_fastas) == 0 : + if len(self.feature_fastas) == 0: return - if not self.will_have_genome_fasta : + if not self.will_have_genome_fasta: return self.print_and_log('Validating blast utilities', self.main_process_verbosity, self.main_process_color) - for util in ['makeblastdb', 'blastn', 'bedtools'] : - if not shutil.which(util) : - self.errors += ['Can\'t find ' + util + ' on the PATH'] + for util in ['makeblastdb', 'blastn', 'bedtools']: + self.validate_utility(util) - self.analysis = self.analysis + ['blast_feature_sets'] + self.analysis += ['blast_feature_sets'] @unless_only_basecall - def validate_reference(self) : + def validate_reference(self): - if not self.organism : + if not self.organism: return self.print_and_log('Validating reference genome and utilities', self.main_process_verbosity, self.main_process_color) #1 - Check for mummer utils - for util in ['dnadiff', 'nucmer', 'mummer'] : - if not shutil.which(util) : - self.errors += ['Can\'t find ' + util + ' on the PATH'] + for util in ['dnadiff', 'nucmer', 'mummer']: + self.validate_utility(util) #2 - Check for the reference sequence. - if not os.path.isdir(self.reference_dir) : + if not os.path.isdir(self.reference_dir): self.errors += ['Can\'t find reference directory ' + self.reference_dir] - if not self.no_reference : - if self.organism == None : - self.errors += ['Organism not given. Add --no-reference to skip reference comparison'] - else : + if not self.no_reference: + if self.organism == None: + self.errors += ['Organism not given. Add --no-reference to skip reference comparison'] + else: self.organism_dir = os.path.join(self.reference_dir, self.organism) - if not os.path.isdir(self.organism_dir) : + if not os.path.isdir(self.organism_dir): self.errors += ['Can\'t find organism directory ' + self.organism_dir] #It should be in reference_sequences/organism self.reference_fasta = os.path.join(self.organism_dir, 'genome.fasta') - if not os.path.isfile(self.reference_fasta) : - self.errors += ['Can\'t find genome.fasta in the reference directory.'] + if not self.validate_file(self.reference_fasta): + self.errors += ['Missing or empty genome.fasta in the reference directory'] self.mutation_regions_bed = os.path.join(self.reference_dir, self.organism, 'mutation_regions.bed') - if not os.path.isfile(self.mutation_regions_bed) : - self.errors += ['Can\'t find mutation_regions.bed in the reference directory.'] + if not self.validate_file(self.mutation_regions_bed): + self.errors += ['Missing or empty mutation_regions.bed in the reference directory'] - if self.will_have_genome_fasta : - self.analysis = self.analysis + ['call_insertions'] + if self.will_have_genome_fasta: + self.analysis += ['call_insertions'] @unless_only_basecall - def validate_mutations(self) : + def validate_mutations(self): - if self.no_mutations : + if self.no_mutations: return - if not self.organism : + if not self.organism: return self.print_and_log('Validating mapping and variant utilities', self.main_process_verbosity, self.main_process_color) if not (self.ont_fast5 or self.ont_fastq or self.illumina_fastq): self.errors += ['Can\'t call mutations without a FAST5 or FASTQ dataset'] - if not self.organism or self.no_reference : + if not self.organism or self.no_reference: self.errors += ['Can\'t call mutations without a reference genome'] - for utility in ['minimap2', 'samtools', 'bcftools'] : - self.validate_utility(utility, utility + ' is not on the PATH (required for AMR mutations)') + for utility in ['minimap2', 'samtools', 'bcftools']: + self.validate_utility(utility, '(required for AMR mutations)') - self.analysis = self.analysis + ['call_amr_mutations'] + self.analysis += ['call_amr_mutations'] @unless_only_basecall - def validate_plasmids(self) : + def validate_plasmids(self): - if not self.plasmids : + if not self.plasmids: return self.print_and_log('Validating plasmid database and utilities', self.main_process_verbosity, self.main_process_color) - for utility in ['pblat', 'Rscript', 'R', 'pChunks.R'] : - self.validate_utility(utility, utility + ' is not on the PATH (required for plasmid finding)') + for utility in ['pblat', 'Rscript', 'R', 'pChunks.R']: + self.validate_utility(utility, '(required for plasmid finding)') - if not os.path.isfile(self.plasmid_database) : - self.errors += ['Can\'t find plasmid database ' + self.plasmid_database] + if not self.validate_file(self.plasmid_database): + self.errors += ['Missing or empty plasmid database ' + self.plasmid_database] - if not self.will_have_genome_fasta : + if not self.will_have_genome_fasta: self.errors += ['Can\'t call plasmids without a genome or an assembly'] - self.analysis = self.analysis + ['call_plasmids'] + self.analysis += ['call_plasmids'] - def validate_options(self) : + def validate_options(self): self.validate_output_dir() @@ -725,21 +763,21 @@ def validate_options(self) : self.validate_mutations() self.validate_plasmids() - if len(self.analysis) == 0 : - self.errors = self.errors + ['Nothing to do!'] + if len(self.analysis) == 0: + self.errors += ['Nothing to do!'] - def load_organisms() : + def load_organisms(): print('Pretending to load organisms') - def list_organisms() : + def list_organisms(): ''' Lists the set of reference organisms available to Pima ''' - def make_output_dir(self) : + def make_output_dir(self): - if os.path.isdir(self.output_dir) : + if os.path.isdir(self.output_dir): shutil.rmtree(self.output_dir) os.mkdir(self.output_dir) @@ -749,7 +787,7 @@ def start_logging(self): self.logging_file = os.path.join(self.output_dir, 'log.txt') - def guppy_ont_fast5(self) : + def guppy_ont_fast5(self): self.print_and_log('Basecalling ONT reads with Guppy', self.main_process_verbosity, self.main_process_color) @@ -765,7 +803,7 @@ def guppy_ont_fast5(self) : '-i', self.ont_fast5, '-s', self.ont_fastq_dir, '--compress-fastq', - '--device "cuda:0"', + '--device auto', '--flowcell FLO-MIN106 --kit SQK-RBK004', '1>' + stdout_file, '2>' + stderr_file]) @@ -781,7 +819,7 @@ def guppy_ont_fast5(self) : self.validate_file_and_size_or_error(self.ont_raw_fastq, 'ONT raw FASTQ file', 'cannot be found after albacore', 'is empty') - def albacore_ont_fast5(self) : + def albacore_ont_fast5(self): self.print_and_log('Basecalling ONT reads with Albacore', self.main_process_verbosity, self.main_process_color) @@ -793,7 +831,7 @@ def albacore_ont_fast5(self) : stderr_file = os.path.join(self.ont_fastq_dir, 'albacore') # Run the basecalling with the ONT basecaller command = ['ls', self.ont_fast5, '|'] - if self.ont_fast5_limit : + if self.ont_fast5_limit: command += ['sort -n | head', '-' + str(self.ont_fast5_limit), '|'] command += ['parallel -n1', '-j' + str(self.threads), @@ -825,7 +863,7 @@ def albacore_ont_fast5(self) : self.validate_file_and_size_or_error(self.ont_raw_fastq, 'ONT raw FASTQ file', 'cannot be found after albacore', 'is empty') - def porechop_ont_fastq(self) : + def porechop_ont_fastq(self): self.print_and_log('Running Porechop on raw ONT FASTQ', self.sub_process_verbosity, self.sub_process_color) self.ont_fastq = os.path.join(self.ont_fastq_dir, 'ont.fastq') @@ -842,9 +880,9 @@ def porechop_ont_fastq(self) : self.validate_file_and_size_or_error(self.ont_fastq, 'ONT FASTQ file', 'cannot be found after porechop', 'is empty') - def qcat_ont_fastq(self) : + def qcat_ont_fastq(self): - if not self.ont_fastq_dir : + if self.ont_fastq_dir is None: self.ont_fastq_dir = os.path.join(self.output_dir, 'ont_fastq') os.makedirs(self.ont_fastq_dir) @@ -853,9 +891,9 @@ def qcat_ont_fastq(self) : # Find the FASTQ file to use qcat_input_fastq = None - if hasattr(self, 'ont_raw_fastq') : + if hasattr(self, 'ont_raw_fastq'): qcat_input_fastq = self.ont_raw_fastq - else : + else: qcat_input_fastq = self.ont_fastq self.print_and_log('Running qcat on raw ONT FASTQ', self.sub_process_verbosity, self.sub_process_color) @@ -888,15 +926,15 @@ def qcat_ont_fastq(self) : # In the case of a single barcode just go on as normal, otherwise, make an Analysis for each barcode. # TODO - handle this differntly if demultiplexing was requested. - if len(self.barcodes) == 1 or not self.multiplexed : + if len(self.barcodes) == 1 or not self.multiplexed: self.ont_fastq = os.path.join(self.demultiplexed_dir, self.barcode_summary.iloc[0,0] + '.fastq') - else : - self.analysis = ['start_barcode_analysis'] + else: + self.analysis += ['start_barcode_analysis'] - def start_barcode_analysis(self) : + def start_barcode_analysis(self): - for barcode in self.barcodes : + for barcode in self.barcodes: barcode_analysis = copy.deepcopy(self) barcode_analysis.no_trim = True barcode_analysis.multiplexed = False @@ -907,11 +945,11 @@ def start_barcode_analysis(self) : barcode_analysis.go() - def lorma_ont_fastq(self) : + def lorma_ont_fastq(self): self.print_and_log('Using lordec-correct to error-correct ONT reads', self.main_process_verbosity, self.main_process_color) - if not self.ont_fastq_dir : + if self.ont_fastq_dir is None: self.ont_fastq_dir = os.path.join(self.output_dir, 'ont_fastq') os.makedirs(self.ont_fastq_dir) @@ -935,8 +973,8 @@ def lorma_ont_fastq(self) : # Make a FASTQish file from the LoRMA output self.print_and_log('Converting LoRMA FASTA to a FASTQ', self.sub_process_verbosity, self.sub_process_color) quality = 12 - with open(lorma_fasta, 'r') as lorma_fasta_handle, open(lorma_fastq, 'w') as lorma_fastq_handle : - for fasta in Bio.SeqIO.parse(lorma_fasta_handle, 'fasta') : + with open(lorma_fasta, 'r') as lorma_fasta_handle, open(lorma_fastq, 'w') as lorma_fastq_handle: + for fasta in Bio.SeqIO.parse(lorma_fasta_handle, 'fasta'): fasta.letter_annotations['phred_quality'] = [quality] * len(fasta) Bio.SeqIO.write(fasta, lorma_fastq_handle, 'fastq') @@ -945,11 +983,11 @@ def lorma_ont_fastq(self) : self.ont_fastq = lorma_fastq - def racon_ont_fastq(self) : + def racon_ont_fastq(self): self.print_and_log('Using RACON to error-correct ONT reads', self.main_process_verbosity, self.main_process_color) - if not self.ont_fastq_dir : + if self.ont_fastq_dir is None: self.ont_fastq_dir = os.path.join(self.output_dir, 'ont_fastq') os.makedirs(self.ont_fastq_dir) @@ -973,8 +1011,8 @@ def racon_ont_fastq(self) : # Make a FASTQish file from the LoRMA output self.print_and_log('Converting LoRMA FASTA to a FASTQ', self.sub_process_verbosity, self.sub_process_color) quality = 12 - with open(lorma_fasta, 'r') as lorma_fasta_handle, open(lorma_fastq, 'w') as lorma_fastq_handle : - for fasta in Bio.SeqIO.parse(lorma_fasta_handle, 'fasta') : + with open(lorma_fasta, 'r') as lorma_fasta_handle, open(lorma_fastq, 'w') as lorma_fastq_handle: + for fasta in Bio.SeqIO.parse(lorma_fasta_handle, 'fasta'): fasta.letter_annotations['phred_quality'] = [quality] * len(fasta) Bio.SeqIO.write(fasta, lorma_fastq_handle, 'fastq') @@ -983,7 +1021,7 @@ def racon_ont_fastq(self) : self.ont_fastq = lorma_fastq - def miniasm_ont_fastq(self) : + def miniasm_ont_fastq(self): self.print_and_log('Assembling ONT reads using minimap2/miniasm', self.main_process_verbosity, self.main_process_color) @@ -1027,7 +1065,7 @@ def miniasm_ont_fastq(self) : self.validate_file_and_size_or_error(self.genome_fasta, 'ONT miniasm FASTA', 'cannot be found after miniasm', 'is empty') - def wtdbg2_ont_fastq(self) : + def wtdbg2_ont_fastq(self): self.print_and_log('Assembling ONT reads using wtdbg2', self.main_process_verbosity, self.main_process_color) @@ -1064,7 +1102,7 @@ def wtdbg2_ont_fastq(self) : self.validate_file_and_size_or_error(self.genome_fasta, 'WTDBG2 assembly fasta', 'cannot be found after miniasm', 'is empty') - def flye_ont_fastq(self) : + def flye_ont_fastq(self): self.print_and_log('Assembling ONT reads using flye', self.main_process_verbosity, self.main_process_color) @@ -1098,7 +1136,7 @@ def flye_ont_fastq(self) : self.validate_file_and_size_or_error(self.genome_fasta, 'Genome fasta', 'cannot be found after copying Flye output', 'is empty') - def racon_ont_assembly(self) : + def racon_ont_assembly(self): self.racon_dir = os.path.join(self.output_dir, 'racon') os.makedirs(self.racon_dir) @@ -1108,7 +1146,7 @@ def racon_ont_assembly(self) : self.ont_racon_fasta = [] input_assembly = self.genome_fasta - for i in range(0, self.racon_rounds) : + for i in range(0, self.racon_rounds): # Run minimap2 for round 1 self.print_and_log('Running minimap2 round' + str(i), self.sub_process_verbosity, self.sub_process_color) @@ -1147,7 +1185,7 @@ def racon_ont_assembly(self) : self.validate_file_and_size_or_error(self.genome_fasta, 'Genome assembly', 'cannot be found after fixing names', 'is empty') - def medaka_ont_assembly(self) : + def medaka_ont_assembly(self): self.print_and_log('Running Medaka on ONT assembly', self.main_process_verbosity, self.main_process_color) @@ -1178,10 +1216,10 @@ def medaka_ont_assembly(self) : self.validate_file_and_size_or_error(self.genome_fasta, 'Genome assembly', 'cannot be found after fixing names', 'is empty') - def nanopolish_ont_assembly(self) : + def nanopolish_ont_assembly(self): self.print_and_log('Running Nanopolish on ONT assembly', self.main_process_verbosity, self.main_process_color) - if not self.ont_fastq and self.ont_fast5 : + if any(x is not None for x in [self.ont_fastq, self.ont_fast5]): self.print_and_log('Need both ONT FAST5 and ONT FASTQ data to run Nanopolish', 0, self.error_color) self.nanopolish_dir = os.path.join(self.output_dir, 'nanopolish') @@ -1203,13 +1241,13 @@ def nanopolish_ont_assembly(self) : coverage.columns = ['coverage'] # If any need to be downsampled, just do them all - if (coverage.iloc[:,0] > self.nanopolish_coverage_max * 1.25).any() : + if (coverage.iloc[:,0] > self.nanopolish_coverage_max * 1.25).any(): contig_bams = [] - for contig in coverage.index.values : + for contig in coverage.index.values: contig_bam = os.path.join(self.nanopolish_dir, contig + '.bam') - contig_bams += [contig_bam] + contig_bams += [contig_bam] downsampling_fraction = self.nanopolish_coverage_max / coverage.loc[contig,'coverage'] - if downsampling_fraction > 1 : + if downsampling_fraction > 1: downsampling_fraction = 1 command = ' '.join(['samtools view -b', '-s', str(float(downsampling_fraction)), @@ -1220,12 +1258,12 @@ def nanopolish_ont_assembly(self) : # Merge the resulting bams downsampled_bam = os.path.join(self.nanopolish_dir, 'downsampled.bam') - if (len(contig_bams) > 1) : + if (len(contig_bams) > 1): command = ' '.join(['samtools merge', downsampled_bam, ' '.join(contig_bams)]) self.print_and_run(command) - else : + else: command = ' '.join(['cp', contig_bams[0], downsampled_bam]) self.print_and_run(command) @@ -1236,7 +1274,7 @@ def nanopolish_ont_assembly(self) : # Index the FAST5 data self.print_and_log('Indexing FAST5 data', self.sub_process_verbosity, self.sub_process_color) index_stdout, index_stderr = [os.path.join(self.nanopolish_dir, 'index.' + i) for i in ['stdout', 'stderr']] - if self.albacore_seq_files_file is None : + if self.albacore_seq_files_file is None: # Find the Albacore sequencing files search_string = os.path.join(self.ont_fastq_dir, '*', 'sequencing_summary.txt') albacore_seq_files = glob.glob(search_string) @@ -1277,10 +1315,10 @@ def nanopolish_ont_assembly(self) : self.genome_fasta = self.nanopolish_fasta - def run_nanopolish_range(self, nanopolish_range) : + def run_nanopolish_range(self, nanopolish_range): print(nanopolish_range) nanopolish_vcf = self.nanopolish_prefix + '.' + nanopolish_range + '.vcf' - while (1) : + while (1): command = ' '.join(['nanopolish variants --faster --consensus', '-o', nanopolish_vcf, '-w', nanopolish_range, @@ -1291,14 +1329,14 @@ def run_nanopolish_range(self, nanopolish_range) : '1>' + self.nanopolish_stdout, '2>' + self.nanopolish_stderr]) self.print_and_run(command) - if self.validate_file(nanopolish_vcf) : + if self.validate_file(nanopolish_vcf): break - else : + else: self.print_and_log('Nanopolish VCF' + nanopolish_vcf + 'failed; trying again', self.sub_process_verbosity, self.sub_process_color) - def spades_ont_assembly(self) : + def spades_ont_assembly(self): self.print_and_log('Running spades with trusted ONT contigs', self.main_process_verbosity, self.main_process_color) @@ -1325,7 +1363,7 @@ def spades_ont_assembly(self) : self.validate_file_and_size_or_error(self.genome_fasta, 'Genome assembly', 'cannot be found after SPAdes', 'is empty') - def spades_illumina_fastq(self) : + def spades_illumina_fastq(self): self.print_and_log('Assembling Illumina FASTQ data', self.main_process_verbosity, self.main_process_color) @@ -1352,7 +1390,7 @@ def spades_illumina_fastq(self) : self.validate_file_and_size_or_error(self.genome_fasta, 'Genome assembly', 'cannot be found after SPAdes', 'is empty') - def pilon_assembly(self) : + def pilon_assembly(self): self.print_and_log('Running Pilon on genome assembly', self.main_process_verbosity, self.main_process_color) @@ -1382,12 +1420,12 @@ def pilon_assembly(self) : self.genome_fasta = self.pilon_fasta - def blast_feature_sets(self) : + def blast_feature_sets(self): self.features_dir = os.path.join(self.output_dir, 'features') os.makedirs(self.features_dir) - for feature_number in range(len(self.feature_fastas)) : + for feature_number in range(len(self.feature_fastas)): feature_fasta = self.feature_fastas[feature_number] feature_name = re.sub('\\.f.*', '', os.path.basename(feature_fasta)) feature_dir = os.path.join(self.features_dir, feature_name) @@ -1396,7 +1434,7 @@ def blast_feature_sets(self) : self.feature_names += [feature_name] - def blast_features(self, feature_fasta, feature_dir) : + def blast_features(self, feature_fasta, feature_dir): # Make a directory for the new features os.makedirs(feature_dir) @@ -1456,7 +1494,7 @@ def blast_features(self, feature_fasta, feature_dir) : self.print_and_run(command) - def call_amr_mutations(self) : + def call_amr_mutations(self): self.print_and_log('Calling AMR mutations', self.main_process_verbosity, self.main_process_color) @@ -1469,16 +1507,16 @@ def call_amr_mutations(self) : # Map the reads to the reference sequence self.reference_mapping_bam = os.path.join(self.mutations_dir, 'reference_mapping.bam') - if self.illumina_fastq : + if self.illumina_fastq is not None: self.minimap_illumina_fastq(self.reference_fasta, self.illumina_fastq, self.reference_mapping_bam) - else : + else: self.minimap_ont_fastq(self.reference_fasta, self.ont_fastq, self.reference_mapping_bam) self.index_bam(self.reference_mapping_bam) # Pull out each region, one at a time region_data = pandas.read_csv(filepath_or_buffer = self.mutation_regions_bed, sep = '\t', header = None) region_data.columns = ['contig', 'start', 'stop', 'description'] - for region_i in range(region_data.shape[0]) : + for region_i in range(region_data.shape[0]): region_i_description = region_data.loc[region_i,'description'] self.print_and_log('Finding AMR mutations for ' + region_i_description, self.sub_process_verbosity, self.sub_process_color) @@ -1540,15 +1578,15 @@ def call_amr_mutations(self) : region_mutations = pandas.read_csv(filepath_or_buffer = region_vcf, sep = '\t') region_mutations = pandas.concat([region_mutations, pandas.DataFrame(numpy.repeat(region_i_description, region_mutations.shape[0]))], axis = 1) - if self.amr_mutations is None : + if self.amr_mutations is None: self.amr_mutations = region_mutations self.amr_mutations.columns.values[len(self.amr_mutations.columns) - 1] = 'sample' - else : + else: region_mutations.columns = self.amr_mutations.columns self.amr_mutations = self.amr_mutations.append(region_mutations) - def call_plasmids(self) : + def call_plasmids(self): self.print_and_log('Calling plasmids', self.main_process_verbosity, self.main_process_color) @@ -1600,7 +1638,7 @@ def call_plasmids(self) : self.plasmids = pandas.read_csv(filepath_or_buffer = self.plasmid_tsv, sep = '\t') - def call_insertions(self) : + def call_insertions(self): self.print_and_log('Calling insertions', self.main_process_verbosity, self.main_process_color) @@ -1621,7 +1659,7 @@ def call_insertions(self) : # Pull out the aligned regions of the two genomes self.print_and_log('Finding reference and query specific insertions', self.sub_process_verbosity, self.sub_process_color) one_coords = self.dnadiff_prefix + '.1coords' - reference_aligned_bed = os.path.join(self.insertions_dir, 'reference_aligned.bed') + reference_aligned_bed = os.path.join(self.insertions_dir, 'reference_aligned.bed') genome_aligned_bed = os.path.join(self.insertions_dir, 'genome_aligned.bed') command = ' '.join(['cat', one_coords, '| awk \'{OFS = "\\t"; if ($2 < $1){t = $2; $2 = $1; $1 = t} print $12,$1,$2}\' | sort -k 1,1 -k 2,2n', @@ -1658,29 +1696,29 @@ def call_insertions(self) : self.print_and_run(command) - def clean_up(self) : + def clean_up(self): - if (self.genome_fasta) : + if (self.genome_fasta): final_fasta = os.path.join(self.output_dir, 'assembly.fasta') command = ' '.join(['cp', self.genome_fasta, final_fasta]) self.print_and_run(command) - def go(self) : + def go(self): self.analysis = ['make_output_dir', 'start_logging'] + self.analysis + ['clean_up'] analysis_string = '\n'.join([str(i+1) + ') ' + self.analysis[i] for i in range(len(self.analysis))]) print(self.main_process_color + analysis_string + Colors.ENDC) - while (True) : + while (True): step = self.analysis[0] self.analysis = self.analysis[1:] function = getattr(self, step) function() - if (len(self.analysis) == 0) : + if (len(self.analysis) == 0): break -def main(opts) : +def main(opts): """ @@ -1707,25 +1745,23 @@ def main(opts) : input_group.add_argument('--ont-fast5', required = False, default = None, metavar = '', help = 'Directory containing ONT FAST5 files') input_group.add_argument('--ont-fast5-limit', required = False, type = int, default = None, metavar = '', - help = 'Limit on the number of FAST5 directories to include in the analysis (default : all dirs)') + help = 'Limit on the number of FAST5 directories to include in the analysis (default: all dirs)') input_group.add_argument('--basecaller', required = False, default = 'guppy', choices = ['guppy', 'albacore'], - help = 'The basecaller for ONT FAST5 data (default : %(default)s)') + help = 'The basecaller for ONT FAST5 data (default: %(default)s)') input_group.add_argument('--ont-fastq', required = False, default = None, metavar = '', help = 'File containing basecalled ONT reads') input_group.add_argument('--multiplexed', required = False, default = False, action = 'store_true', - help = 'The ONT data are multiplexed (default : %(default)s)') + help = 'The ONT data are multiplexed (default: %(default)s)') input_group.add_argument('--demux', required = False, default = 'qcat', choices = ['qcat', 'porechop'], - help = 'Demultiplexer/trimmer to use (default : %(default)s)') + help = 'Demultiplexer/trimmer to use (default: %(default)s)') input_group.add_argument('--no-trim', required = False, default = False, action = 'store_true', - help = 'Do no trim the ONT reads (default : %(default)s)') + help = 'Do no trim the ONT reads (default: %(default)s)') input_group.add_argument('--error-correct', required = False, default = False, action = 'store_true', - help = 'Use LORMA to error-correct ONT reads (default : %(default)s)') + help = 'Use LORMA to error-correct ONT reads (default: %(default)s)') input_group.add_argument('--only-basecall', required = False, default = False, action = 'store_true', - help = 'Just basecall and demultiplex/trim/correct. No downstream analysis.') - + help = 'Just basecall and demultiplex/trim/correct. No downstream analysis.') input_group.add_argument('--illumina-fastq', required = False, default = None, nargs = 2, metavar = '', help = 'Files containing R1 & R2 Illumina reads') - input_group.add_argument('--genome', required = False, default = None, metavar = '', help = 'A genome FASTA file to be used in place of assembly') @@ -1734,38 +1770,38 @@ def main(opts) : output_group.add_argument('--output', required = False, default = None, metavar = '', help = 'Output directory for the analysis') output_group.add_argument('--overwrite', required = False, default = False, action = 'store_true', - help = 'Overwrite an existing output directory (default : %(default)s)') + help = 'Overwrite an existing output directory (default: %(default)s)') # Assembly options assembly_group = parser.add_argument_group('Assembly options') assembly_group.add_argument('--assembler', required = False, default = 'flye', type = str, choices = ['miniasm', 'wtdbg2', 'flye'], - help = 'Assembler to use (default : %(default)s)') + help = 'Assembler to use (default: %(default)s)') assembly_group.add_argument('--genome-size', required = False, default = None, type = str, metavar = '', - help = 'Genome size estimate for Flye (default : %(default)s))') + help = 'Genome size estimate for flye or wtdbg2 in human readable format, e.g., 4.5M (default: %(default)s))') assembly_group.add_argument('--racon', required = False, default = False, action = 'store_true', - help = 'Force the generation a racon consenus (default : %(default)s)') + help = 'Force the generation a racon consenus (default: %(default)s)') assembly_group.add_argument('--racon-rounds', required = False, default = 4, type = int, metavar = '', - help = 'Number of RACON rounds used to generate a consensus (default : %(default)s)') + help = 'Number of RACON rounds used to generate a consensus (default: %(default)s)') assembly_group.add_argument('--medaka', required = False, default = False, action = 'store_true', - help = 'Run Medaka on the assembly (faster) (default : %(default)s)') + help = 'Run Medaka on the assembly (faster) (default: %(default)s)') assembly_group.add_argument('--nanopolish', required = False, default = False, action = 'store_true', - help = 'Run Nanopolish on the RACON consensus (slow) (default : %(default)s)') + help = 'Run Nanopolish on the RACON consensus (slow) (default: %(default)s)') assembly_group.add_argument('--max-nanopolish-coverage', required = False, type = int, default = 100, metavar = '', - help = 'Maximum coverage before downsampling nanopolish input (default : %(default)s)') + help = 'Maximum coverage before downsampling nanopolish input (default: %(default)s)') assembly_group.add_argument('--albacore-seq-file', required = False, type = str, default = None, metavar = '', - help = 'List of albacore sequencing summary files (default : %(default)s)') + help = 'List of albacore sequencing summary files (default: %(default)s)') assembly_group.add_argument('--pilon', required = False, default = False, action = 'store_true', - help = 'Run Pilon if Illumina reads are given (default : %(default)s)') + help = 'Run Pilon if Illumina reads are given (default: %(default)s)') assembly_group.add_argument('--no-assembly', required = False, default = False, action = 'store_true', - help = 'Don\'t assemble the given reads (default : %(default)s)') + help = 'Don\'t assemble the given reads (default: %(default)s)') # Plasmid options plasmid_group = parser.add_argument_group('Plasmid and vector search options') plasmid_group.add_argument('--plasmids', required = False, default = False, action = 'store_true', - help = 'Do a plasmid search (default : %(default)s)') + help = 'Do a plasmid search (default: %(default)s)') plasmid_group.add_argument('--plasmid-database', required = False, default = plasmid_database_default, metavar = '', help = 'Path to a FASTA file with reference plasmid sequences') @@ -1773,17 +1809,17 @@ def main(opts) : # AMR gene options amr_group = parser.add_argument_group('AMR gene search options') amr_group.add_argument('--amr-database', required = False, default = amr_database_default, metavar = '', - help = 'Path to a FASTA file with AMR gene sequences (default : %(default)s)') + help = 'Path to a FASTA file with AMR gene sequences (default: %(default)s)') amr_group.add_argument('--no-amr', required = False, default = False, action = 'store_true', - help = 'Skip AMR search (default : %(default)s)') + help = 'Skip AMR search (default: %(default)s)') # Inc group options inc_group = parser.add_argument_group('Incompatibility group search options') inc_group.add_argument('--inc-database', required = False, default = inc_database_default, metavar = '', - help = 'Path to a FASTA file with incompatibility group sequences (default : %(default)s)') + help = 'Path to a FASTA file with incompatibility group sequences (default: %(default)s)') inc_group.add_argument('--no-inc', required = False, default = False, action = 'store_true', - help = 'Skip incompatibility group search (default : %(default)s)') + help = 'Skip incompatibility group search (default: %(default)s)') # Pull in custom feature sets @@ -1795,7 +1831,7 @@ def main(opts) : # Which organism are we working with here organism_group = parser.add_argument_group('Organism options') organism_group.add_argument('--reference-dir', required = False, default = reference_dir_default, metavar = '', - help = 'Directory containing refrence organisms (default : %(default)s)' ) + help = 'Directory containing refrence organisms (default: %(default)s)' ) organism_group.add_argument('--organism', required = False, default = None, metavar = '', help = 'Reference organism to compare against') organism_group.add_argument('--list-organisms', required = False, action = 'store_true', @@ -1809,11 +1845,11 @@ def main(opts) : # Other arguments other_group = parser.add_argument_group('Other options') other_group.add_argument('--threads', required = False, type=int, default = 1, metavar = '', - help = 'Number of worker threads to use (default : %(default)s)') + help = 'Number of worker threads to use (default: %(default)s)') other_group.add_argument('--verbosity', required = False, type=int, default = 1, metavar = '', - help = 'How much information to print as PIMA runs (default : %(default)s)') + help = 'How much information to print as PIMA runs (default: %(default)s)') other_group.add_argument('--fake-run', required = False, default = False, action = 'store_true', - help = 'Don\'t actually run the pipeline, just pretend to (default : %(default)s)') + help = 'Don\'t actually run the pipeline, just pretend to (default: %(default)s)') opts, unknown_args = parser.parse_known_args() @@ -1822,7 +1858,7 @@ def main(opts) : if opts.list_organisms: print_organisms() sys.exit(0) - elif opts.help : + elif opts.help: print(Colors.HEADER) parser.print_help() print(Colors.ENDC) @@ -1830,15 +1866,15 @@ def main(opts) : # Start the analysis analysis = Analysis(opts, unknown_args) - analysis.validate_options() - if len(analysis.errors) > 0 : + if len(analysis.errors) > 0: print(Colors.HEADER) parser.print_help() print(Colors.ENDC) print(Colors.FAIL + '\n\nErrors:') - print([(i) for i in analysis.errors]) + for error_message in analysis.errors: + print(error_message) print(Colors.ENDC) sys.exit() From bd2c4e770fbee3229ed35a3d23becd109365bbf8 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Wed, 18 Dec 2019 20:39:20 -0500 Subject: [PATCH 6/8] added -h and -V opts --- pima.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pima.py b/pima.py index f6b8fb3..02e4da5 100755 --- a/pima.py +++ b/pima.py @@ -1727,16 +1727,13 @@ def main(opts): parser = ArgumentParser(prog = "pima.py", add_help = False, - description = - ''' - P.I.M.A. bacterial genome analysis pipeline - ''', + description = 'P.I.M.A. bacterial genome analysis pipeline', formatter_class = lambda prog: HelpFormatter(prog, width = 120, max_help_position = 120)) parser._optionals.title = 'Help and version' - parser.add_argument('--help', action = 'store_true', + parser.add_argument('-h', '--help', action = 'store_true', help = 'Print this help and exit.') - parser.add_argument('--version', action = 'version', + parser.add_argument('-V', '--version', action = 'version', help = 'Print the software version.', version = 'PIMA microbial genome analysis pipeline (version {})'.format(VERSION)) From bafe86bd55290d0fccb5cbe5d31d37418f5c89c2 Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Thu, 19 Dec 2019 00:52:13 -0500 Subject: [PATCH 7/8] default uses all CPUs, unspecified args exits with error code 1, warn --list-organisms not implemented yet --- pima.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/pima.py b/pima.py index 02e4da5..c264b7e 100755 --- a/pima.py +++ b/pima.py @@ -16,7 +16,7 @@ pandas.set_option('display.max_colwidth', 200) -VERSION='0.1.3' +VERSION='0.1.4' pima_directory = os.path.dirname(os.path.realpath(__file__)) amr_database_default = os.path.join(pima_directory, 'amr.fasta') @@ -24,6 +24,7 @@ plasmid_database_default = os.path.join(pima_directory, 'plasmids_and_vectors.fasta') reference_dir_default = os.path.join(pima_directory, 'reference_sequences') +cpus_default = mp.cpu_count() class Colors: HEADER = '\033[95m' @@ -133,7 +134,10 @@ def __init__(self, opts, unknown_args): self.no_reference = opts.no_reference self.no_mutations = opts.no_mutations - self.threads = opts.threads + if opts.threads < 1: + self.threads = mp.cpu_count() + else: + self.threads = opts.threads self.errors = [] @@ -156,9 +160,9 @@ def __init__(self, opts, unknown_args): # The actual steps to carry out in the analysis held as a list self.analysis = [] - # See if we got any unknown args. Not allowed. + # Report unknown arguments if len(unknown_args) != 0: - self.errors += ['Unknown argument: ' + str(u) for u in unknown_args] + self.errors += ['Unknown option: ' + str(u) for u in unknown_args] def print_and_log(self, text, verbosity, color = Colors.ENDC): @@ -1828,7 +1832,7 @@ def main(opts): # Which organism are we working with here organism_group = parser.add_argument_group('Organism options') organism_group.add_argument('--reference-dir', required = False, default = reference_dir_default, metavar = '', - help = 'Directory containing refrence organisms (default: %(default)s)' ) + help = 'Directory containing reference organisms (default: %(default)s)' ) organism_group.add_argument('--organism', required = False, default = None, metavar = '', help = 'Reference organism to compare against') organism_group.add_argument('--list-organisms', required = False, action = 'store_true', @@ -1841,7 +1845,7 @@ def main(opts): # Other arguments other_group = parser.add_argument_group('Other options') - other_group.add_argument('--threads', required = False, type=int, default = 1, metavar = '', + other_group.add_argument('--threads', required = False, type=int, default = cpus_default, metavar = '', help = 'Number of worker threads to use (default: %(default)s)') other_group.add_argument('--verbosity', required = False, type=int, default = 1, metavar = '', help = 'How much information to print as PIMA runs (default: %(default)s)') @@ -1853,7 +1857,8 @@ def main(opts): opts.logfile = 'pipeline.log' if opts.list_organisms: - print_organisms() + # print_organisms() + print('print_organisms() function not yet implemented') sys.exit(0) elif opts.help: print(Colors.HEADER) @@ -1873,7 +1878,7 @@ def main(opts): for error_message in analysis.errors: print(error_message) print(Colors.ENDC) - sys.exit() + sys.exit(1) - #If we're still good, start the actual analysis + # If no errors, start the system calls analysis.go() From b26c6d9154c2167c57aae65db3dfce81b215294d Mon Sep 17 00:00:00 2001 From: Chris Gulvik Date: Fri, 20 Dec 2019 17:45:14 -0500 Subject: [PATCH 8/8] added print_organisms function so --list-organisms now works --- pima.py | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/pima.py b/pima.py index c264b7e..cf221d6 100755 --- a/pima.py +++ b/pima.py @@ -16,7 +16,7 @@ pandas.set_option('display.max_colwidth', 200) -VERSION='0.1.4' +VERSION='0.1.5' pima_directory = os.path.dirname(os.path.realpath(__file__)) amr_database_default = os.path.join(pima_directory, 'amr.fasta') @@ -668,7 +668,7 @@ def validate_reference(self): for util in ['dnadiff', 'nucmer', 'mummer']: self.validate_utility(util) - #2 - Check for the reference sequence. + #2 - Check for the reference sequence if not os.path.isdir(self.reference_dir): self.errors += ['Can\'t find reference directory ' + self.reference_dir] @@ -680,7 +680,7 @@ def validate_reference(self): if not os.path.isdir(self.organism_dir): self.errors += ['Can\'t find organism directory ' + self.organism_dir] - #It should be in reference_sequences/organism + # It should be in reference_sequences/organism self.reference_fasta = os.path.join(self.organism_dir, 'genome.fasta') if not self.validate_file(self.reference_fasta): self.errors += ['Missing or empty genome.fasta in the reference directory'] @@ -771,13 +771,24 @@ def validate_options(self): self.errors += ['Nothing to do!'] - def load_organisms(): - print('Pretending to load organisms') + def print_organisms(self): + + if self.reference_dir is None: + return + + i = 0 + for _, dirs, _ in os.walk(self.reference_dir): + for subdirname in dirs: + organism_file = os.path.join(self.reference_dir, subdirname, + 'genome.fasta') + if self.validate_file(organism_file): + print('Found organism: {}'.format(subdirname)) + i += 1 + else: + print('Missing or empty genome.fasta file in {}'.format( + os.path.join(self.reference_dir, subdirname))) + print('\n{} total reference organisms available'.format(i)) - def list_organisms(): - ''' - Lists the set of reference organisms available to Pima - ''' def make_output_dir(self): @@ -1856,18 +1867,16 @@ def main(opts): opts, unknown_args = parser.parse_known_args() opts.logfile = 'pipeline.log' + # Start the analysis + analysis = Analysis(opts, unknown_args) if opts.list_organisms: - # print_organisms() - print('print_organisms() function not yet implemented') + analysis.print_organisms() sys.exit(0) elif opts.help: print(Colors.HEADER) parser.print_help() print(Colors.ENDC) sys.exit(0) - - # Start the analysis - analysis = Analysis(opts, unknown_args) analysis.validate_options() if len(analysis.errors) > 0: