diff --git a/.gitignore b/.gitignore index 6b9c6d8..21f1420 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ website/core.db build/ __pycache__/ *.sqlite3 +*.sqlite env diff --git a/README.md b/README.md index 1619144..f1df3de 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ python sample_registry/app.py How you want to deploy this will depend on your needs, facilities, and ability. We have it deployed by a Kubernetes cluster but you could also 1) just run it in development mode from a lab computer or 2) setup Nginx/Apache on a dedicated server or 3) run it serverlessly in the cloud (e.g. with [Zappa](https://github.com/zappa/Zappa) on AWS) or 4) do something else. There are lots of well documented examples of deploying Flask sites out there, look around and find what works best for you. -When running, it will default to using a SQLite3 database located in the root of this repository (automatically created if it doesn't already exist). You can change to use a different backend by setting the `SAMPLE_REGISTRY_DB_URI` environment variable before running the app. For example, another sqlite database could be specified with a URI like this: `export SAMPLE_REGISTRY_DB_URI=sqlite:////path/to/db.sqlite3`. +When running, it will default to using a SQLite3 database located in the root of this repository (automatically created if it doesn't already exist). You can change to use a different backend by setting the `SAMPLE_REGISTRY_DB_URI` environment variable before running the app. For example, another sqlite database could be specified with a URI like this: `export SAMPLE_REGISTRY_DB_URI=sqlite:////path/to/db.sqlite`. ## Using the library diff --git a/sample_registry/__init__.py b/sample_registry/__init__.py index d95a285..62e061e 100644 --- a/sample_registry/__init__.py +++ b/sample_registry/__init__.py @@ -8,6 +8,10 @@ __version__ = "1.3.0" +# Define archive root path +ARCHIVE_ROOT = Path( + os.environ.get("SAMPLE_REGISTRY_ARCHIVE_ROOT", "/mnt/isilon/microbiome/") +) # Doesn't include "NA" because that's what we fill in for missing values NULL_VALUES: list[Optional[str]] = [ None, @@ -34,7 +38,7 @@ def sample_registry_version(): "Missing database connection information in environment, using test SQLite database\n" ) SQLALCHEMY_DATABASE_URI = ( - f"sqlite:///{Path(__file__).parent.parent.resolve()}/sample_registry.sqlite3" + f"sqlite:///{Path(__file__).parent.parent.resolve()}/sample_registry.sqlite" ) @@ -42,8 +46,9 @@ def sample_registry_version(): # Set SQLALCHEMY_DATABASE_URI to an in-memory SQLite database for testing SQLALCHEMY_DATABASE_URI = "sqlite:///:memory:" -# Create database engine -engine = create_engine(SQLALCHEMY_DATABASE_URI, echo=False) + +sys.stderr.write(f"Connecting to database at {SQLALCHEMY_DATABASE_URI}\n") +engine = create_engine(SQLALCHEMY_DATABASE_URI) # Create database session Session = sessionmaker(bind=engine) diff --git a/sample_registry/app.py b/sample_registry/app.py index 723d34e..966201d 100644 --- a/sample_registry/app.py +++ b/sample_registry/app.py @@ -1,6 +1,8 @@ import csv import pickle import os +from collections import defaultdict +from datetime import datetime from flask import ( Flask, make_response, @@ -10,11 +12,12 @@ redirect, send_file, send_from_directory, + jsonify, ) from flask_sqlalchemy import SQLAlchemy from io import StringIO from pathlib import Path -from sample_registry import SQLALCHEMY_DATABASE_URI +from sample_registry import ARCHIVE_ROOT, SQLALCHEMY_DATABASE_URI from sample_registry.models import ( Base, Annotation, @@ -33,14 +36,15 @@ # whatever production server you are using instead. It's ok to leave this in when running the dev server. app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1, x_prefix=1) +# Sanitize and RO db connection +SQLALCHEMY_DATABASE_URI = f"{SQLALCHEMY_DATABASE_URI.split('?')[0]}?mode=ro" app.config["SQLALCHEMY_DATABASE_URI"] = SQLALCHEMY_DATABASE_URI print(SQLALCHEMY_DATABASE_URI) +# Ensure SQLite explicitly opens in read-only mode +app.config["SQLALCHEMY_ENGINE_OPTIONS"] = {"connect_args": {"uri": True}} db = SQLAlchemy(model_class=Base) db.init_app(app) -with app.app_context(): - db.create_all() - @app.route("/favicon.ico") def favicon(): @@ -294,6 +298,76 @@ def show_stats(): ) +def _parsed_month(date_str: str): + for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%y", "%m/%d/%Y"): + try: + return datetime.strptime(date_str, fmt).strftime("%Y-%m") + except ValueError: + continue + return None + + +def _archive_size_for_run(run, warnings): + archive_path = (ARCHIVE_ROOT / run.data_uri).parent + run_label = f"CMR{run.run_accession:06d}" + + if not archive_path.exists(): + warnings.append(f"{run_label}: Archive path {archive_path} does not exist") + return 0 + + if not archive_path.is_dir(): + warnings.append(f"{run_label}: Archive path {archive_path} is not a directory") + return 0 + + total_size = 0 + for entry in archive_path.rglob("*"): + try: + if entry.is_file(): + total_size += entry.stat().st_size + except OSError as exc: + warnings.append(f"{run_label}: Error accessing {entry}: {exc}") + + if total_size == 0: + warnings.append(f"{run_label}: Archive at {archive_path} has size 0 bytes") + + return total_size + + +@app.route("/api/archive_sizes") +def archive_sizes(): + runs = db.session.query(Run).all() + warnings = [] + max_warnings = 50 + totals_by_month = defaultdict(int) + + for run in runs: + month_label = _parsed_month(run.run_date) + if not month_label: + if len(warnings) < max_warnings: + warnings.append( + f"CMR{run.run_accession:06d}: Unable to parse run_date '{run.run_date}'" + ) + continue + + archive_size = _archive_size_for_run(run, warnings) + totals_by_month[month_label] += archive_size + + if len(warnings) >= max_warnings: + warnings.append("... Additional warnings truncated ...") + + by_month = [ + {"month": month, "size_bytes": totals_by_month[month]} + for month in sorted(totals_by_month.keys()) + ] + + return jsonify({"by_month": by_month, "warnings": warnings}) + + +@app.route("/archive") +def archive(): + return render_template("archive.html") + + @app.route("/download/", methods=["GET", "POST"]) def download(run_acc): ext = run_acc[-4:] diff --git a/sample_registry/db.py b/sample_registry/db.py index 9b9a911..6322348 100644 --- a/sample_registry/db.py +++ b/sample_registry/db.py @@ -13,7 +13,6 @@ StandardHostSpecies, ) - STANDARD_TAGS = { "SampleType": "sample_type", "SubjectID": "subject_id", diff --git a/sample_registry/register.py b/sample_registry/register.py index 7f0d8a1..1530470 100644 --- a/sample_registry/register.py +++ b/sample_registry/register.py @@ -9,7 +9,6 @@ from sample_registry.registrar import SampleRegistry from seqBackupLib.illumina import IlluminaFastq - SAMPLES_DESC = """\ Add new samples to the registry, with annotations. """ diff --git a/sample_registry/templates/archive.html b/sample_registry/templates/archive.html new file mode 100644 index 0000000..8462e4f --- /dev/null +++ b/sample_registry/templates/archive.html @@ -0,0 +1,98 @@ +{% extends 'base.html' %} + +{% block head %} + +{% endblock %} + +{% block body %} +
+
+
+

Archive usage

+

Net archive size grouped by month. Data are gathered directly from the NFS archive paths for each run.

+
+
+
+
+ +
+
+
+
+

+ +

+
+
    +
    +
    +
    +
    + + +{% endblock %} diff --git a/sample_registry/templates/base.html b/sample_registry/templates/base.html index 30f5c90..641b076 100644 --- a/sample_registry/templates/base.html +++ b/sample_registry/templates/base.html @@ -41,10 +41,11 @@

    - Runs - Metadata - Stats -
    + Runs + Metadata + Stats + Archive +
    diff --git a/sample_registry/templates/browse_runs.html b/sample_registry/templates/browse_runs.html index 0db170f..f322884 100644 --- a/sample_registry/templates/browse_runs.html +++ b/sample_registry/templates/browse_runs.html @@ -19,13 +19,8 @@

    Sequencing runs

    - {% for run, sample_count in sample_counts.items() %} - {% if run.machine_type.startswith('Illumina') %} - {% set platform = 'Illumina' %} - {% else %} - {% set platform = run.machine_type %} - {% endif %} - {% set platform = platform + ' ' + run.machine_kit %} + {% for run, sample_count in sample_counts.items() %} + {% set platform = run.machine_type + ' ' + run.machine_kit %} {{ "CMR{:06d}".format(run.run_accession) }} {{ run.run_date }} diff --git a/sample_registry/templates/show_run.html b/sample_registry/templates/show_run.html index 6142295..a092818 100644 --- a/sample_registry/templates/show_run.html +++ b/sample_registry/templates/show_run.html @@ -19,7 +19,7 @@

  • Date: {{ run.run_date }}
  • Lane: {{ run.lane }}
  • Platform: {{ run.machine_type }} {{ run.machine_kit }}
  • -
  • Data file: {{ run.data_uri.split('/')|last }}
  • +
  • Data file: /mnt/isilon/microbiome/{{ run.data_uri }}
  • Export metadata for all samples:
    diff --git a/tests/test_mapping.py b/tests/test_mapping.py index c5fc035..33c3c8f 100644 --- a/tests/test_mapping.py +++ b/tests/test_mapping.py @@ -1,7 +1,6 @@ import io from sample_registry.mapping import SampleTable - NORMAL_TSV = """\ SampleID BarcodeSequence HostSpecies SubjectID S1 GCCT Human Hu23 diff --git a/tests/test_register.py b/tests/test_register.py index 7d4d574..12ccfbf 100644 --- a/tests/test_register.py +++ b/tests/test_register.py @@ -25,7 +25,6 @@ register_host_species, ) - samples = [ { "SampleID": "abc123",