From e5b519d8c8e28b51327649be1bc25c3e169c963b Mon Sep 17 00:00:00 2001
From: Dale Roberts <dale.roberts1@anu.edu.au>
Date: Thu, 12 Jun 2025 10:17:31 +1000
Subject: [PATCH 1/2] Unzip artefacts one at a time to save memory

---
 memprof_plotter/plotter.py | 77 ++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 32 deletions(-)

diff --git a/memprof_plotter/plotter.py b/memprof_plotter/plotter.py
index e8bac4a..6b1c7be 100644
--- a/memprof_plotter/plotter.py
+++ b/memprof_plotter/plotter.py
@@ -40,7 +40,7 @@
 """
 
 
-def download_artefact(url: str) -> bytes | None:
+def download_artefact(url: str) -> zipfile.ZipFile | None:
     """
     PyGithub does not support retrieving artefacts into buffers, so we have to resort
     to requests
@@ -58,13 +58,15 @@ def download_artefact(url: str) -> bytes | None:
         return None
     zf = zipfile.ZipFile(BytesIO(req.content))
     if "tsp_db.sqlite3" in zf.namelist():
-        return zf.read("tsp_db.sqlite3")
+        return zf
     else:
         print("Artefact does not contain required TSP database")
         return None
 
 
-def get_artefacts(nruns: int, workflow: github.Workflow.Workflow, artefact: str, filter: list[str]) -> dict[int, bytes]:
+def get_artefacts(
+    nruns: int, workflow: github.Workflow.Workflow, artefact: str, filter: list[str]
+) -> dict[str, zipfile.ZipFile]:
     irun = 0
     runs = {}
     for run in workflow.get_runs(status="success"):
@@ -84,6 +86,38 @@ def get_artefacts(nruns: int, workflow: github.Workflow.Workflow, artefact: str,
     return runs
 
 
+class Zip_to_sql_conn:
+    """
+    Context manager that opens the "tsp_db.sqlite3" member of a zip archive as an
+    sqlite3 connection and closes it (plus any backing temp file) on exit.
+    """
+
+    def __init__(self, zip: zipfile.ZipFile):
+        # The raw bytes stay local so they can be freed as soon as the database
+        # has been loaded, rather than living as long as this object.
+        db = zip.read("tsp_db.sqlite3")
+        self.tmpfile = None
+        if hasattr(sqlite3.Connection, "deserialize"):
+            self.conn = sqlite3.connect(":memory:")
+            self.conn.deserialize(db)
+        else:
+            # No Connection.deserialize() (Python < 3.11): go via a temp file.
+            self.tmpfile = tempfile.NamedTemporaryFile()
+            self.tmpfile.write(db)
+            # sqlite3 reopens the file by name, so buffered data must reach the
+            # file first or the database is read truncated.
+            self.tmpfile.flush()
+            self.conn = sqlite3.connect(self.tmpfile.name)
+
+    def __enter__(self) -> sqlite3.Connection:
+        return self.conn
+
+    def __exit__(self, type, value, traceback):
+        self.conn.close()
+        if self.tmpfile:
+            self.tmpfile.close()
+
+
 def main():
     if gh_token == "BAD_KEY":
         raise KeyError("GH_TOKEN must be set in environment")
@@ -136,35 +159,25 @@ def main():
     d_cat = {}
     d_names = {}
 
-    tmpfile = None
-
-    for runid, run in runs.items():
-        conn = sqlite3.connect(":memory:")
-        if hasattr(conn, "deserialize"):
-            conn.deserialize(run)
-        else:
-            tmpfile = tempfile.NamedTemporaryFile()
-            tmpfile.write(run)
-            conn = sqlite3.connect(tmpfile.name)
-        cur = conn.cursor()
-        cur.execute(get_all_cmds_query)
-        for cmd, cat in cur.fetchall():
-            d_cat[f"{cat}_{cmd}"] = cat or "other"
-            d_times[f"{cat}_{cmd}"][runid] = []
-            d_rss[f"{cat}_{cmd}"][runid] = []
-            d_names[f"{cat}_{cmd}"] = cmd
-
-        try:
-            cur.execute(get_mem_query)
-        except sqlite3.OperationalError:
-            ### No such table memprof
-            continue
-        for cmd, cat, time, rss in cur.fetchall():
-            d_times[f"{cat}_{cmd}"][runid].append(time)
-            d_rss[f"{cat}_{cmd}"][runid].append(rss)
-        conn.close()
-        if tmpfile:
-            tmpfile.close()
+    for runid, zf in runs.items():
+        ##conn = zip_to_sql_conn(zf)
+        with Zip_to_sql_conn(zf) as conn:
+            cur = conn.cursor()
+            cur.execute(get_all_cmds_query)
+            for cmd, cat in cur.fetchall():
+                d_cat[f"{cat}_{cmd}"] = cat or "other"
+                d_times[f"{cat}_{cmd}"][runid] = []
+                d_rss[f"{cat}_{cmd}"][runid] = []
+                d_names[f"{cat}_{cmd}"] = cmd
+
+            try:
+                cur.execute(get_mem_query)
+            except sqlite3.OperationalError:
+                ### No such table memprof
+                continue
+            for cmd, cat, time, rss in cur.fetchall():
+                d_times[f"{cat}_{cmd}"][runid].append(time)
+                d_rss[f"{cat}_{cmd}"][runid].append(rss)
 
     for k, v in d_rss.items():
         os.makedirs(f"{ns.outdir}/{d_cat[k]}", exist_ok=True)

From dc92292eeb0a0f9c036abcf4ae333e0410ef3dfb Mon Sep 17 00:00:00 2001
From: Dale Roberts <dale.roberts1@anu.edu.au>
Date: Thu, 12 Jun 2025 10:29:07 +1000
Subject: [PATCH 2/2] Remove comment

---
 memprof_plotter/plotter.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/memprof_plotter/plotter.py b/memprof_plotter/plotter.py
index 6b1c7be..d1e7cce 100644
--- a/memprof_plotter/plotter.py
+++ b/memprof_plotter/plotter.py
@@ -160,7 +160,6 @@ def main():
     d_names = {}
 
     for runid, zf in runs.items():
-        ##conn = zip_to_sql_conn(zf)
         with Zip_to_sql_conn(zf) as conn:
             cur = conn.cursor()
             cur.execute(get_all_cmds_query)