122 changes: 122 additions & 0 deletions dockers/dictys_v4/Dockerfile
@@ -0,0 +1,122 @@
FROM ubuntu:22.04

ARG DEBIAN_FRONTEND=noninteractive
ENV TZ="America/New_York"

# Base OS deps (build tools + common libs) + libpng for matplotlib
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl ca-certificates git unzip zip \
    build-essential pkg-config \
    zlib1g-dev libbz2-dev liblzma-dev \
    libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev \
    libfreetype6-dev libpng-dev \
    python3-venv python3-distutils python3-dev \
    # bio tools via apt instead of building
    samtools tabix \
    perl \
    && rm -rf /var/lib/apt/lists/*

# Install CPython 3.9.17 from source
ARG PYTHON_VERSION=3.9.17
RUN set -eux; \
    cd /tmp; \
    curl -fsSLO https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz; \
    tar -xzf Python-${PYTHON_VERSION}.tgz; \
    cd Python-${PYTHON_VERSION}; \
    ./configure --enable-optimizations; \
    make -j"$(nproc)"; \
    make install; \
    cd /; rm -rf /tmp/Python-${PYTHON_VERSION}*; \
    ln -s /usr/local/bin/python3 /usr/local/bin/python; \
    ln -s /usr/local/bin/pip3 /usr/local/bin/pip

# Make constraints global for all pip installs
COPY constraints.txt /tmp/constraints.txt
ENV PIP_CONSTRAINT=/tmp/constraints.txt \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_DEFAULT_TIMEOUT=180
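
# How the constraint file behaves (illustrative sketch, not executed during
# the build): with PIP_CONSTRAINT exported, every later `pip install` resolves
# against the pins without repeating them, e.g.
#   pip install numpy          # resolves to numpy==1.26.4
#   pip install matplotlib     # resolves to a 3.8.x release (pin is <3.9)
#   pip install "numpy>=2"     # fails fast instead of upgrading past the pin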

# Clean any existing numpy/matplotlib remnants aggressively
RUN python - <<'PY'
import sys, site, pkgutil, shutil, pathlib
paths = set(site.getsitepackages() + [site.getusersitepackages()])
for p in list(paths):
    if not p:
        continue
    for name in ("numpy", "matplotlib"):
        for m in pathlib.Path(p).glob(name):
            shutil.rmtree(m, ignore_errors=True)
        for m in pathlib.Path(p).glob(f"{name}-*.dist-info"):
            shutil.rmtree(m, ignore_errors=True)
        for m in pathlib.Path(p).glob(f"{name}-*.egg-info"):
            shutil.rmtree(m, ignore_errors=True)
print("Cleaned numpy/matplotlib from:", *paths, sep="\n - ")
PY

# Install bedtools
RUN apt-get update && apt-get install -y --no-install-recommends bedtools \
    && rm -rf /var/lib/apt/lists/*

# Install tools + exact pins
RUN python -m pip install --no-cache-dir -U pip setuptools wheel \
    && pip install --no-cache-dir --upgrade --force-reinstall \
    "numpy==1.26.4" "matplotlib==3.8.4" "cython<3"

# Install MACS2. Build-from-source packages must reuse our pinned toolchain
RUN pip install --no-cache-dir --no-build-isolation MACS2==2.2.9.1
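
# Why --no-build-isolation (a note on pip's behavior): by default pip builds
# sdists in an isolated environment and would pull the newest Cython there,
# ignoring our pins; disabling isolation makes the MACS2 build reuse the
# cython<3 and numpy==1.26.4 installed above, keeping its compiled extensions
# consistent with the pins.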

# Install Dictys without dependencies (we'll install them manually right after)
RUN pip install --no-cache-dir --no-build-isolation --no-deps \
    git+https://github.com/pinellolab/dictys.git@a82930fe8030af2785f9069ef5e909e49acc866f

# Install Dictys dependencies plus additional analysis/notebook packages
RUN pip install --no-cache-dir --prefer-binary \
    pandas scipy networkx h5py threadpoolctl joblib \
    jupyter jupyterlab adjustText pyro-ppl docutils requests

# Install pyDNase, anndata, and supporting packages without dependencies so pyDNase can't pin matplotlib<2
RUN pip install --no-cache-dir --no-build-isolation --no-deps pyDNase clint pysam packaging array_api_compat legacy-api-wrap zarr natsort anndata
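
# Optional sanity check (a sketch; not part of the original build): the
# packages above skipped dependency resolution, so `pip check` would surface
# any requirement left unsatisfied:
# RUN pip check || true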

# Install a pybedtools version that works with cython<3
RUN pip install --no-cache-dir --no-build-isolation "pybedtools==0.9.1"

# Install pytorch
# RUN pip install --no-cache-dir --prefer-binary --index-url https://download.pytorch.org/whl/cpu torch

# HOMER prerequisites
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget perl unzip build-essential zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install HOMER core + hg38 genome
RUN set -eux; \
    mkdir -p /opt/homer && cd /opt/homer; \
    curl -fsSLO http://homer.ucsd.edu/homer/configureHomer.pl; \
    chmod +x configureHomer.pl; \
    perl configureHomer.pl -install homer; \
    perl configureHomer.pl -install homerTools; \
    perl configureHomer.pl -install hg38
ENV PATH="/opt/homer/bin:${PATH}"

# hg38 annotations (restrict HOMER's update list to hg38 entries, then fetch the UCSC annotations)
RUN set -eux; \
    cd /opt/homer; \
    grep "hg38" update.txt > tmp.txt && mv tmp.txt update.txt; \
    cd update && ./updateUCSCGenomeAnnotations.pl ../update.txt
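
# Optional smoke test (illustrative; not part of the original build) to
# confirm HOMER is on PATH and the hg38 package resolved:
# RUN homerTools || true            # prints usage when installed correctly
# RUN findMotifsGenome.pl || true   # core motif-finding entry point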

# Install CUDA
# RUN curl -fsSLo /etc/apt/preferences.d/cuda-repository-pin-600 \
# https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
# curl -fsSLo /usr/share/keyrings/nvidia-cuda.gpg \
# https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
# echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" \
# > /etc/apt/sources.list.d/cuda.list && \
# apt-get update && apt-get install -y --no-install-recommends cuda && \
# rm -rf /var/lib/apt/lists/*

# Install AWS CLI
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
    unzip awscliv2.zip && \
    ./aws/install

CMD ["/bin/bash"]
3 changes: 3 additions & 0 deletions dockers/dictys_v4/constraints.txt
@@ -0,0 +1,3 @@
numpy==1.26.4
matplotlib<3.9
cython<3
31 changes: 31 additions & 0 deletions dockers/dictys_v5/Dockerfile
@@ -0,0 +1,31 @@
FROM lfwa/dictys-cpu:latest

ARG DEBIAN_FRONTEND=noninteractive
ENV TZ="America/New_York"

# Install bio tools
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates curl unzip less groff \
    tabix samtools \
    && rm -rf /var/lib/apt/lists/*

# Install anndata
RUN pip install --no-cache-dir anndata

# Install AWS CLI v2
RUN curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip" \
    && unzip -q /tmp/awscliv2.zip -d /tmp \
    && /tmp/aws/install --update \
    && rm -rf /tmp/aws /tmp/awscliv2.zip

# Replace MACS2 with MACS3 inside the dictys env, and add a macs2-named wrapper that dispatches to macs3
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate dictys \
    && (pip uninstall -y MACS2 || true) \
    && (conda remove -y macs2 || true) \
    && conda install -y -c conda-forge -c bioconda macs3 \
    && printf '#!/usr/bin/env bash\nexec macs3 "$@"\n' > "$(conda run -n dictys --no-capture-output python -c 'import sys,os; print(os.path.join(os.path.dirname(sys.executable), "macs2"))')" \
    && chmod +x "$(conda run -n dictys --no-capture-output python -c 'import sys,os; print(os.path.join(os.path.dirname(sys.executable), "macs2"))')" \
    && conda clean -afy
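
# With the wrapper in place, legacy `macs2` invocations inside the dictys env
# dispatch to MACS3, e.g. (illustrative): `macs2 callpeak -t cells.bam -f BAM
# -g hs -n test` actually runs `macs3 callpeak ...`; MACS3's callpeak CLI is
# broadly compatible with these MACS2-era flags.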

CMD ["/bin/bash"]
3 changes: 3 additions & 0 deletions dockers/dictys_v5/constraints.txt
@@ -0,0 +1,3 @@
numpy==1.26.4
matplotlib<3.9
cython<3
128 changes: 92 additions & 36 deletions src/methods/dictys/helper.py
@@ -1,6 +1,9 @@
import os
os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
os.environ["MKL_THREADING_LAYER"] = "GNU"
import shutil
import json
from typing import Optional, List

import numpy as np
import pandas as pd
@@ -11,6 +14,31 @@
warnings.filterwarnings("ignore")


OVERRIDE_DOWNLOAD = True
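# (Assumed intent of the flag above: when True, reference files are re-fetched
# even if a cached copy exists on disk; set False to reuse prior downloads.)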


def run_cmd(cmd: List[str], cwd: Optional[str] = None) -> None:
    kwargs = {}
    if cwd is not None:
        kwargs['cwd'] = cwd
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        **kwargs
    ) as proc:
        for line in proc.stdout:
            print(line, end="")
        rc = proc.wait()
    if rc != 0:
        raise RuntimeError(f"Command {cmd} failed with exit code {rc}")


def run_cmd_inside_dictys(cmd: List[str], cwd: Optional[str] = None) -> None:
    run_cmd(["conda", "run", "-n", "dictys", "--no-capture-output", *cmd], cwd=cwd)
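
# Illustrative usage (not in the original file): both helpers stream output
# line by line as the process runs and raise on a non-zero exit code, e.g.
#   run_cmd(["samtools", "--version"])
#   run_cmd_inside_dictys(["dictys_helper", "makefile_check.py"], cwd="/tmp")  # hypothetical args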


def define_vars(par):
    os.makedirs(par['temp_dir'], exist_ok=True)
@@ -28,6 +56,7 @@ def define_vars(par):
    par['bams_dir'] = f"{par['data_dir']}/bams/"

    par['gene_bed'] = f"{par['data_dir']}/gene.bed"
    par['make_dir'] = f"{par['temp_dir']}/makefiles"


def extract_exp(par):
@@ -88,6 +117,7 @@ def extract_atac(par):
    print(f'Sort and compress tsv file {frags_path}')
    os.system(f"sort -k1,1 -k2,2n {temp_path} | bgzip -c > {frags_path}")


def create_bam(par):
    print('Creating BAM file from fragments', flush=True)
    cmd = f"python {par['frag_to_bam']} --fnames {par['frags_path']} --barcodes {par['barcodes']}"
@@ -107,9 +137,24 @@ def bam_to_bams(par):
    - 'bams_dir': path to output folder for per-cell BAMs
    - 'exp_path': path to reference expression file
    """

    print('Delete temp BAM directories', flush=True)
    folders = [
        par['bams_dir'],
        os.path.join(par['bams_dir'], '..', 'bams_text'),
        os.path.join(par['bams_dir'], '..', 'bams_header')
    ]
    for folder in folders:
        if os.path.exists(folder):
            shutil.rmtree(folder)

    print('Splitting BAM into per-cell BAMs', flush=True)
    cmd = f"bash dictys_helper split_bam.sh {par['bam_name']} {par['bams_dir']} --section CB:Z: --ref_expression {par['exp_path']}"
    run_cmd(cmd)
    run_cmd_inside_dictys([
        "dictys_helper", "split_bam.sh", par['bam_name'], par['bams_dir'],
        "--section", "CB:Z:", "--ref_expression", par['exp_path']
    ])


def extrac_clusters(par):
    print('Extracting clusters', flush=True)
    subsets = f"{par['data_dir']}/subsets.txt"
@@ -127,15 +172,6 @@ def extrac_clusters(par):
    subprocess.run(cp, shell=True, check=True)
    print('Extracting clusters successful', flush=True)

def run_cmd(cmd):
    try:
        result = subprocess.run(cmd, check=True, text=True, capture_output=True, shell=True)
        print("STDOUT:", result.stdout)
        print("STDERR:", result.stderr)
    except subprocess.CalledProcessError as e:
        print("Command failed with exit code", e.returncode)
        print("STDOUT:", e.stdout)
        print("STDERR:", e.stderr)

def download_file(url, dest):
    import requests
@@ -145,66 +181,85 @@ def download_file(url, dest):
    with open(dest, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)


def get_priors(par):
    import gzip
    import shutil
    # - get the genome
    print('Getting genome ...', flush=True)
    os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True)
    cmd = f"aws s3 cp s3://openproblems-data/resources/grn/supp_data/genome/genome.fa {par['data_dir']}/genome/ --no-sign-request"
    try:
        run_cmd(cmd)
    except:
    if OVERRIDE_DOWNLOAD or (not os.path.exists(f"{par['data_dir']}/genome/genome.fa")):
        os.makedirs(f"{par['data_dir']}/genome/", exist_ok=True)
        try:
            cmd = f"cp resources/supp_data/genome/genome.fa {par['data_dir']}/genome/"
            run_cmd(cmd)
            run_cmd([
                "aws", "s3", "cp", "s3://openproblems-data/resources/grn/supp_data/genome/genome.fa",
                f"{par['data_dir']}/genome/", "--no-sign-request"
            ])
        except:
            raise ValueError("Could not get the genome")
            try:
                run_cmd([
                    "cp", "resources/supp_data/genome/genome.fa", f"{par['data_dir']}/genome/"
                ])
            except:
                raise ValueError("Could not get the reference genome")

    # - get gene annotation
    print('Getting gene annotation ...', flush=True)
    data_dir = Path(par['data_dir'])
    gtf_gz = data_dir / "gene.gtf.gz"
    gtf = data_dir / "gene.gtf"
    url = "http://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf.gz"
    download_file(url, gtf_gz)
    if OVERRIDE_DOWNLOAD or (not os.path.exists(gtf_gz)):
        download_file(url, gtf_gz)

    with gzip.open(gtf_gz, "rb") as f_in, open(gtf, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    gtf_gz.unlink()

    cmd = f"bash dictys_helper gene_gtf.sh {gtf} {par['gene_bed']}"
    print('Making bed files for gene annotation ...', flush=True)
    run_cmd(cmd)
    run_cmd_inside_dictys([
        "dictys_helper", "gene_gtf.sh", str(gtf), str(par['gene_bed'])
    ])

    print('Downloading motif file...', flush=True)

    url='https://hocomoco11.autosome.org/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.0001.motif'
    motif_file = data_dir / 'motifs.motif'
    download_file(url, motif_file)

    if OVERRIDE_DOWNLOAD or (not os.path.exists(motif_file)):
        download_file(url, motif_file)


def configure(par):
    import json
    device='cuda:0' #cuda:0 , cpu
    par['make_dir'] = f"{par['temp_dir']}/makefiles"

    device = 'cpu'
    os.makedirs(par['make_dir'], exist_ok=True)
    cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_template.sh common.mk config.mk env_none.mk static.mk"
    run_cmd(cmd)
    run_cmd_inside_dictys([
        "dictys_helper", "makefile_template.sh", "common.mk", "config.mk", "env_none.mk", "static.mk"
    ], cwd=par['make_dir'])

    json_arg = json.dumps({
        "DEVICE": device,
        "GENOME_MACS2": "hs",
        "JOINT": "1"
    })

    cmd = f"cd {par['make_dir']} && bash dictys_helper makefile_update.py config.mk '{json_arg}'"
    run_cmd(cmd)
    cmd = f"cd {par['temp_dir']} && bash dictys_helper makefile_check.py"
    run_cmd(cmd)
    run_cmd_inside_dictys([
        "dictys_helper", "makefile_update.py", "config.mk", json_arg
    ], cwd=par['make_dir'])

    run_cmd_inside_dictys([
        "dictys_helper", "makefile_check.py", "--dir_makefiles", par['make_dir'],
        "--dir_data", par['data_dir']
    ])
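    # Assumed effect of the two calls above (a sketch, based on the JSON payload):
    # makefile_update.py rewrites the matching assignments in config.mk, roughly
    #   DEVICE=cpu
    #   GENOME_MACS2=hs
    #   JOINT=1
    # and makefile_check.py then validates the makefiles against the data dir.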


def infer_grn(par):
    print('Inferring GRNs', flush=True)
    cmd = f"cd {par['temp_dir']} && bash dictys_helper network_inference.sh -j {par['num_workers']} -J 1 static"
    run_cmd(cmd)
    run_cmd_inside_dictys([
        "dictys_helper", "network_inference.sh", "-j", str(par['num_workers']), "-J", "1", "static"
    ], cwd=par['temp_dir'])


def export_net(par):
    from util import process_links
    from dictys.net import network
@@ -223,9 +278,10 @@ def export_net(par):
    output = ad.AnnData(X=None, uns={"method_id": "dictys", "dataset_id": ad.read_h5ad(par['rna'], backed='r').uns['dataset_id'], "prediction": net[["source", "target", "weight"]]})
    output.write(par['prediction'])


def main(par):
    define_vars(par)

    define_vars(par)
    extract_exp(par)
    extract_atac(par)
    create_bam(par)