From 3dd461e252211a3aa336361651dbc079d36fb6c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Mokrej=C5=A1?= Date: Thu, 25 Jan 2018 19:58:54 +0100 Subject: [PATCH 1/2] Support multiple input SAM/BAM files Iterate over all input files while increasing internal counters and after that print out the statistics. --- src/bin/sga-astat.py | 70 ++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/src/bin/sga-astat.py b/src/bin/sga-astat.py index 819b6a2c..a0ef71f3 100755 --- a/src/bin/sga-astat.py +++ b/src/bin/sga-astat.py @@ -30,8 +30,8 @@ def computeAStat(arrivalRate, len, n): arrivalRate = 0 def usage(): - print 'usage: sga-astat.py in.bam' - print 'Compute Myers\' a-statistic for a set of contigs using the read alignments in in.bam' + print 'usage: sga-astat.py [OPTIONS] *.bam' + print 'Compute Myers\' a-statistic for a set of contigs using the read alignments from at least one SAM/BAM file' print 'Options:' print ' -m=INT only compute a-stat for contigs at least INT bases in length' print ' -b=INT use the longest INT contigs to perform the initial estimate' @@ -62,53 +62,53 @@ def usage(): usage() sys.exit(1) -if len(args) == 0: - print 'Error: a BAM file must be provided\n' +if not args: + print 'Error: at least one SAM/BAM file must be provided\n' usage() sys.exit(2) -bamFilename = args[0] - -sys.stderr.write('Reading alignments from ' + bamFilename + '\n') contigData = list() - -# Read the contig names and lengths from the bam -bamFile = pysam.Samfile(bamFilename, "rb") - -for (name, length) in zip(bamFile.references, bamFile.lengths): - t = name, length, 0 - contigData.append(ContigData(name, length)) - #print 'Name: ' + name - #print 'Length: ' + str(len) - -# Read the alignments and populate the counts totalReads = 0 sumReadLength = 0 - last_ref_idx = -1 last_pos = -1 -for alignment in bamFile: +for bamFilename in args: + sys.stderr.write('Reading alignments from ' + bamFilename + '\n') + + # Read the contig names and lengths from the bam + bamFile = pysam.Samfile(bamFilename, "rb") + + for (name, length) in zip(bamFile.references, bamFile.lengths): + t = name, length, 0 + contigData.append(ContigData(name, length)) + #print 'Name: ' + name + #print 'Length: ' + str(len) + + # Read the alignments and populate the counts + for alignment in bamFile: + + if alignment.is_unmapped: + continue - if alignment.is_unmapped: - continue + ref_idx = alignment.rname + ref_name = bamFile.getrname(ref_idx) - ref_idx = alignment.rname - ref_name = bamFile.getrname(ref_idx) + pos = alignment.pos - pos = alignment.pos + if ref_idx == last_ref_idx and pos == last_pos and not bKeepDuplicates: + continue - if ref_idx == last_ref_idx and pos == last_pos and not bKeepDuplicates: - continue + # Update the count + cd = contigData[ref_idx] + cd.n += 1 + totalReads += 1 + sumReadLength += alignment.rlen + assert(cd.name == ref_name) + last_ref_idx = ref_idx + last_pos = pos - # Update the count - cd = contigData[ref_idx] - cd.n += 1 - totalReads += 1 - sumReadLength += alignment.rlen - assert(cd.name == ref_name) - last_ref_idx = ref_idx - last_pos = pos + bamFile.close() avgReadLen = sumReadLength / totalReads contigData.sort(key=lambda c : c.len, reverse=True) From 5fdc9b76f4db3e62cd379e905b72fee9ea4194a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Mokrej=C5=A1?= Date: Fri, 2 Feb 2018 19:39:16 +0100 Subject: [PATCH 2/2] Try to automatically convert from CIGARs v1.4 to v1.3 format One can use the full version of rtg-tools (called rtg-core) or use reformat.sh wrapper from Brian Bushnell's BBmap bundle. The patch is untested but passes 'perl -w' checks. --- src/bin/sga-bam2de.pl | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/bin/sga-bam2de.pl b/src/bin/sga-bam2de.pl index b0f6f58d..450d0e10 100755 --- a/src/bin/sga-bam2de.pl +++ b/src/bin/sga-bam2de.pl @@ -54,7 +54,24 @@ my $cmd; # fix-mate info -$cmd = "abyss-fixmate -h $prefix.tmp.hist $bamFile | samtools view -Sb - > $prefix.diffcontigs.bam"; +# use reformat.sh from BBmap bundle to convert from v1.4 CIGAR strings into v1.3, alternatively one could have used RTG-CORE (unlike RTG-TOOLS which is the stripped version lacking sammerge feature) +my $mypath = `which reformat.sh`; +my $cigar_convertor_cmd = ""; +my $rtg_core_distdir = ""; +if ($mypath ne '') +{ + $cigar_convertor_cmd = "reformat.sh sam=1.3 in=stdin.sam out=stdout.sam |"; +} +else +{ + $mypath = `which rtg`; + if ($mypath ne '') + { + $rtg_core_distdir = `dirname $mypath`; + $cigar_convertor_cmd = "java -Djava.library.path=${rtg_core_distdir} -XX:ErrorFile=./hs_err_pid'$$'.log -Dreferences.dir=${rtg_core_distdir}/references -Dmodels.dir=${rtg_core_distdir}/models -Dtalkback=false -Xmx38g -jar ${rtg_core_distdir}/RTG.jar sammerge ${bamFile} --legacy-cigars --exclude-duplicates --exclude-unmapped --exclude-unplaced --output=- --no-gzip |"; + } +} +$cmd = "samtools view -h -F 0x800 -F 0x200 -F 0x400 -F 0x100 $bamFile | $cigar_convertor_cmd abyss-fixmate -h $prefix.tmp.hist | samtools view -h -Sb -o $prefix.diffcontigs.bam"; runCmd($cmd); # DistanceEst is very slow when the learned insert size distribution is very wide @@ -88,7 +105,7 @@ sub runCmd { my($c) = @_; print "$c\n"; - system($c); + system($c) and warn "Error: ", chomp($c), " failed with: $!\n" and die; } # Check whether the programs are found and executable @@ -97,7 +114,7 @@ sub checkDependency while(my $program = shift) { my $ret = system("/bin/bash -c \"hash $program\""); if($ret != 0) { - print STDERR "Could not find program $program. Please install it or update your PATH.\n"; + print STDERR "Could not find program $program: $!. Please install it or update your PATH.\n"; print STDERR "Return: $ret\n"; exit(1); }