From 718c93f40d6e0e7d2dd3948709b7843298a9d2ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Mokrej=C5=A1?= Date: Fri, 26 Jan 2018 01:33:16 +0100 Subject: [PATCH] Support multiple input SAM/BAM files (v2) Add a wrapper loop around existing code so that users does not need to merge multiple BAM's together. We iterate over all input files while increasing internal counters and checking that we indeed poked onto same chromosome with same length as in a previously parsed BAM file. After that print out the statistics. I received same values when running the modified version on same BAM file. Running the modified version on same file twice (provided as 2 arguments on the command line) at least yields same 'Initial genome size estimate' values, the other numbers differ a bit. --- src/bin/sga-astat.py | 80 +++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/src/bin/sga-astat.py b/src/bin/sga-astat.py index 819b6a2c..6b4225ce 100755 --- a/src/bin/sga-astat.py +++ b/src/bin/sga-astat.py @@ -30,8 +30,8 @@ def computeAStat(arrivalRate, len, n): arrivalRate = 0 def usage(): - print 'usage: sga-astat.py in.bam' - print 'Compute Myers\' a-statistic for a set of contigs using the read alignments in in.bam' + print 'usage: sga-astat.py [OPTIONS] *.bam' + print 'Compute Myers\' a-statistic for a set of contigs using the read alignments from at least one SAM/BAM file' print 'Options:' print ' -m=INT only compute a-stat for contigs at least INT bases in length' print ' -b=INT use the longest INT contigs to perform the initial estimate' @@ -62,53 +62,63 @@ def usage(): usage() sys.exit(1) -if len(args) == 0: - print 'Error: a BAM file must be provided\n' +if not args: + print 'Error: at least one SAM/BAM file must be provided\n' usage() sys.exit(2) -bamFilename = args[0] - -sys.stderr.write('Reading alignments from ' + bamFilename + '\n') contigData = list() - -# Read the contig names and lengths from the bam -bamFile = pysam.Samfile(bamFilename, "rb") - -for (name, length) in zip(bamFile.references, bamFile.lengths): - t = name, length, 0 - contigData.append(ContigData(name, length)) - #print 'Name: ' + name - #print 'Length: ' + str(len) - -# Read the alignments and populate the counts totalReads = 0 sumReadLength = 0 - last_ref_idx = -1 last_pos = -1 +firstbamalreadydone = False + +for bamFilename in args: + sys.stderr.write('Reading alignments from ' + bamFilename + '\n') + + # Read the contig names and lengths from the bam + bamFile = pysam.Samfile(bamFilename, "rb") + refnr = 0 + + for (name, length) in zip(bamFile.references, bamFile.lengths): + #t = name, length, 0 + if firstbamalreadydone: + if contigData[refnr].name == name and contigData[refnr].len == length: + pass + else: + raise RuntimeError, "All input SAM/BAM files should be mapped against same reference and coordinate-sorted" + else: + contigData.append(ContigData(name, length)) + #print 'Name: ' + name + #print 'Length: ' + str(len) + refnr += 1 + + # Read the alignments and populate the counts + for alignment in bamFile: -for alignment in bamFile: + if alignment.is_unmapped: + continue - if alignment.is_unmapped: - continue + ref_idx = alignment.rname + ref_name = bamFile.getrname(ref_idx) - ref_idx = alignment.rname - ref_name = bamFile.getrname(ref_idx) + pos = alignment.pos - pos = alignment.pos + if ref_idx == last_ref_idx and pos == last_pos and not bKeepDuplicates: + continue - if ref_idx == last_ref_idx and pos == last_pos and not bKeepDuplicates: - continue + # Update the count + cd = contigData[ref_idx] + cd.n += 1 + totalReads += 1 + sumReadLength += alignment.rlen + assert(cd.name == ref_name) + last_ref_idx = ref_idx + last_pos = pos - # Update the count - cd = contigData[ref_idx] - cd.n += 1 - totalReads += 1 - sumReadLength += alignment.rlen - assert(cd.name == ref_name) - last_ref_idx = ref_idx - last_pos = pos + bamFile.close() + firstbamalreadydone = True avgReadLen = sumReadLength / totalReads contigData.sort(key=lambda c : c.len, reverse=True)