From 718c93f40d6e0e7d2dd3948709b7843298a9d2ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Mokrej=C5=A1?= <mmokrejs@gmail.com>
Date: Fri, 26 Jan 2018 01:33:16 +0100
Subject: [PATCH] Support multiple input SAM/BAM files (v2)

Add a wrapper loop around existing code so that users does not need
to merge multiple BAM's together. We iterate over all input
files while increasing internal counters and checking that we
indeed poked onto same chromosome with same length as in a previously
parsed BAM file.

After that print out the statistics. I received same values when running
the modified version on same BAM file. Running the modified version
on same file twice (provided as 2 arguments on the command line) at least
yields same 'Initial genome size estimate' values, the other numbers
differ a bit.
---
 src/bin/sga-astat.py | 80 +++++++++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 35 deletions(-)

diff --git a/src/bin/sga-astat.py b/src/bin/sga-astat.py
index 819b6a2c..6b4225ce 100755
--- a/src/bin/sga-astat.py
+++ b/src/bin/sga-astat.py
@@ -30,8 +30,8 @@ def computeAStat(arrivalRate, len, n):
 arrivalRate = 0
 
 def usage():
-    print 'usage: sga-astat.py in.bam'
-    print 'Compute Myers\' a-statistic for a set of contigs using the read alignments in in.bam'
+    print 'usage: sga-astat.py [OPTIONS] *.bam'
+    print 'Compute Myers\' a-statistic for a set of contigs using the read alignments from at least one SAM/BAM file'
     print 'Options:'
     print '    -m=INT          only compute a-stat for contigs at least INT bases in length'
     print '    -b=INT          use the longest INT contigs to perform the initial estimate'
@@ -62,53 +62,63 @@ def usage():
             usage()
             sys.exit(1)
 
-if len(args) == 0:
-    print 'Error: a BAM file must be provided\n'
+if not args:
+    print 'Error: at least one SAM/BAM file must be provided\n'
     usage()
     sys.exit(2)
 
-bamFilename = args[0]
-
-sys.stderr.write('Reading alignments from ' + bamFilename + '\n')
 contigData = list()
-
-# Read the contig names and lengths from the bam
-bamFile = pysam.Samfile(bamFilename, "rb")
-
-for (name, length) in zip(bamFile.references, bamFile.lengths):
-    t = name, length, 0
-    contigData.append(ContigData(name, length))
-    #print 'Name: ' + name
-    #print 'Length: ' + str(len)
-
-# Read the alignments and populate the counts
 totalReads = 0
 sumReadLength = 0
-
 last_ref_idx = -1
 last_pos = -1
+firstbamalreadydone = False
+
+for bamFilename in args:
+    sys.stderr.write('Reading alignments from ' + bamFilename + '\n')
+
+    # Read the contig names and lengths from the bam
+    bamFile = pysam.Samfile(bamFilename, "rb")
+    refnr = 0
+
+    for (name, length) in zip(bamFile.references, bamFile.lengths):
+        #t = name, length, 0
+        if firstbamalreadydone:
+            if contigData[refnr].name == name and contigData[refnr].len == length:
+                pass
+            else:
+                raise RuntimeError, "All input SAM/BAM files should be mapped against same reference and coordinate-sorted"
+        else:
+            contigData.append(ContigData(name, length))
+        #print 'Name: ' + name
+        #print 'Length: ' + str(len)
+        refnr += 1
+
+    # Read the alignments and populate the counts
+    for alignment in bamFile:
 
-for alignment in bamFile:
+        if alignment.is_unmapped:
+            continue
 
-    if alignment.is_unmapped:
-        continue
+        ref_idx = alignment.rname
+        ref_name = bamFile.getrname(ref_idx)
 
-    ref_idx = alignment.rname
-    ref_name = bamFile.getrname(ref_idx)
+        pos = alignment.pos
 
-    pos = alignment.pos
+        if ref_idx == last_ref_idx and pos == last_pos and not bKeepDuplicates:
+            continue
 
-    if ref_idx == last_ref_idx and pos == last_pos and not bKeepDuplicates:
-        continue
+        # Update the count
+        cd = contigData[ref_idx]
+        cd.n += 1
+        totalReads += 1
+        sumReadLength += alignment.rlen
+        assert(cd.name == ref_name)
+        last_ref_idx = ref_idx
+        last_pos = pos
 
-    # Update the count
-    cd = contigData[ref_idx]
-    cd.n += 1
-    totalReads += 1
-    sumReadLength += alignment.rlen
-    assert(cd.name == ref_name)
-    last_ref_idx = ref_idx
-    last_pos = pos
+    bamFile.close()
+    firstbamalreadydone = True
 
 avgReadLen = sumReadLength / totalReads
 contigData.sort(key=lambda c : c.len, reverse=True)