99 changes: 74 additions & 25 deletions .github/workflows/main.yml
@@ -11,28 +11,85 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  # 1. SETUP: Dynamically find the samples
+  # 1. SETUP: Dynamically find the samples and create chunks for large benchmarks
   setup:
     runs-on: ubuntu-latest
     outputs:
-      samples: ${{ steps.set-matrix.outputs.samples }}
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
       - uses: actions/checkout@v4
       - id: set-matrix
         run: |
-          # Auto-discover benchmarks from resources/regions/, but only keep
-          # those that include a target regions BED (required by the workflow).
-          SAMPLES=$(for d in resources/regions/*/; do [ -f "$d/target-regions.bed" ] && basename "$d"; done | jq -R -s -c 'split("\n")[:-1]')
-          echo "samples=$SAMPLES" >> $GITHUB_OUTPUT
-
-  # 2. EVALUATE: The matrix-based analysis
+          # Create a matrix that splits large benchmarks into chunks to avoid timeouts.
+          # Benchmarks with many variant calls are split into smaller groups.
+          cat > /tmp/generate_matrix.py << 'EOF'
+          import json
+          import yaml
+          import os
+
+          # Load config
+          with open('config/config.yaml', 'r') as f:
+              config = yaml.safe_load(f)
+
+          # Find samples (benchmarks with target-regions.bed)
+          samples = []
+          for entry in os.listdir('resources/regions'):
+              path = os.path.join('resources/regions', entry, 'target-regions.bed')
+              if os.path.isfile(path):
+                  samples.append(entry)
+
+          # Group variant calls by benchmark
+          benchmarks = {}
+          for key, callset in config['variant-calls'].items():
+              benchmark = callset.get('benchmark')
+              if benchmark and benchmark in samples:
+                  if benchmark not in benchmarks:
+                      benchmarks[benchmark] = []
+                  benchmarks[benchmark].append(key)
+
+          # Create matrix entries, splitting large benchmarks into chunks
+          matrix = []
+          CHUNK_SIZE = 4  # Max variant calls per job to avoid timeouts
+
+          for sample in sorted(samples):
+              calls = benchmarks.get(sample, [])
+              num_calls = len(calls)
+
+              if num_calls <= CHUNK_SIZE:
+                  # Small benchmark - process as a single job
+                  matrix.append({
+                      'sample': sample,
+                      'chunk': 'all',
+                      'chunk_index': 0
+                  })
+              else:
+                  # Large benchmark - split into chunks
+                  num_chunks = (num_calls + CHUNK_SIZE - 1) // CHUNK_SIZE
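+                  # Ceiling division: e.g. 10 calls with CHUNK_SIZE=4 -> (10 + 3) // 4 = 3 chunks (sizes 4, 4, 2)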
+                  for i in range(num_chunks):
+                      matrix.append({
+                          'sample': sample,
+                          'chunk': f'{i+1}of{num_chunks}',
+                          'chunk_index': i
+                      })
+
+          print(json.dumps({'include': matrix}))
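+          # Prints e.g. {"include": [{"sample": "giab-demo", "chunk": "1of3", "chunk_index": 0}, ...]} (hypothetical names)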
+          EOF
+
+          python3 /tmp/generate_matrix.py > /tmp/matrix.json
+          MATRIX=$(cat /tmp/matrix.json)
+          echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
+          echo "Generated matrix:" >> $GITHUB_STEP_SUMMARY
+          echo '```json' >> $GITHUB_STEP_SUMMARY
+          cat /tmp/matrix.json | jq . >> $GITHUB_STEP_SUMMARY
+          echo '```' >> $GITHUB_STEP_SUMMARY
+
+  # 2. EVALUATE: The matrix-based analysis (split by sample and chunk)
   evaluate:
     needs: setup
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
-      matrix:
-        sample: ${{ fromJson(needs.setup.outputs.samples) }}
+      matrix: ${{ fromJson(needs.setup.outputs.matrix) }}
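+      # The matrix JSON has the shape {"include": [...]}; GitHub Actions runs one
+      # job per entry, exposing matrix.sample, matrix.chunk and matrix.chunk_index.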
     env:
       FTP_PASSWORD: ${{ secrets.FTP_PASSWORD }}
       ZENODO_TOKEN: ${{ secrets.ZENODO_TOKEN }}
@@ -73,18 +130,7 @@ jobs:
         with:
           directory: "."
           snakefile: "workflow/Snakefile"
args: "--sdm conda --cores 1 --conda-cleanup-pkgs cache --until benchmark_get_truth --config benchmark=${{ matrix.sample }}"
stagein: |
pip install snakemake-storage-plugin-zenodo
pip install snakemake-storage-plugin-http

- name: Fix modification dates
uses: snakemake/snakemake-github-action@v2.0.3
with:
directory: "."
snakefile: "workflow/Snakefile"
# Only touch the files for THIS matrix sample
args: "--cores 1 --sdm conda --touch resources/regions/${{ matrix.sample }}/test-regions.cov-*.bed"
args: "--sdm conda --cores 1 --conda-cleanup-pkgs cache --until benchmark_get_truth --config benchmark=${{ matrix.sample }} chunk_index=${{ matrix.chunk_index }} chunk_size=4"
           stagein: |
             pip install snakemake-storage-plugin-zenodo
             pip install snakemake-storage-plugin-http
@@ -96,7 +142,7 @@ jobs:
         snakefile: "workflow/Snakefile"
         args: >
           benchmark_all
-          --config benchmark=${{ matrix.sample }}
+          --config benchmark=${{ matrix.sample }} chunk_index=${{ matrix.chunk_index }} chunk_size=4
           --cores 4 --sdm conda --conda-cleanup-pkgs cache --rerun-triggers mtime --all-temp
         stagein: |
           pip install snakemake-storage-plugin-zenodo
@@ -105,22 +151,25 @@ jobs:
       - name: Upload results
         uses: actions/upload-artifact@v4
         with:
-          name: ${{ matrix.sample }}
+          name: ${{ matrix.sample }}-${{ matrix.chunk }}
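+          # e.g. "giab-demo-1of3", or "giab-demo-all" for an unchunked benchmark (hypothetical names)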
           # Upload the workflow outputs (keeps the on-disk paths Snakemake expects).
           path: results/
           retention-days: 1
 
   # 3. REPORT: Combine and Deploy
   report:
     needs: evaluate
+    # Run the report even when some evaluate jobs fail; skip only if the run was cancelled
+    if: ${{ !cancelled() && (success() || failure()) }}
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
 
       - name: Download all results
        uses: actions/download-artifact@v4
         with:
-          # Merge all benchmark artifacts back into the workspace so Snakemake sees `results/`.
+          # Download all available benchmark artifacts (pattern: giab-*)
+          pattern: "giab-*"
           path: .
           merge-multiple: true
 
21 changes: 21 additions & 0 deletions workflow/rules/common.smk
@@ -25,6 +25,27 @@ if "benchmark" in config:
     }
 
 
+# Filter callsets by chunk if specified via --config chunk_index=... and chunk_size=...
+# This allows splitting large benchmarks into smaller groups to avoid timeouts.
+if "chunk_index" in config and "chunk_size" in config:
+    # Get all variant calls for this benchmark (already filtered above)
+    all_calls = sorted(config["variant-calls"].keys())
+    chunk_index = int(config["chunk_index"])
+    chunk_size = int(config["chunk_size"])
+
+    # Select the chunk
+    start_idx = chunk_index * chunk_size
+    end_idx = start_idx + chunk_size
+    chunk_calls = all_calls[start_idx:end_idx]
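+    # Note: slicing past the end of the list yields an empty chunk rather than an
+    # error, and sorted() keeps chunk membership deterministic across matrix jobs.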

+    # Filter to only this chunk
+    config["variant-calls"] = {
+        key: callset
+        for key, callset in config["variant-calls"].items()
+        if key in chunk_calls
+    }
+
+
 # add path to callsets
 for key, callset in config["variant-calls"].items():
     if "zenodo" in callset:
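
Note: the chunking logic appears in two places. generate_matrix.py decides how many chunks each benchmark needs, and common.smk slices out the callsets for one chunk_index. Below is a minimal sketch (hypothetical callset names; it assumes CHUNK_SIZE matches the chunk_size=4 passed via --config) showing why every callset lands in exactly one chunk:

# chunking_roundtrip.py - illustrative sketch, not part of the PR
CHUNK_SIZE = 4  # must match the chunk_size passed to Snakemake via --config

def select_chunk(all_calls, chunk_index, chunk_size=CHUNK_SIZE):
    # Same logic as common.smk: sort for determinism, then slice.
    ordered = sorted(all_calls)
    start = chunk_index * chunk_size
    return ordered[start:start + chunk_size]

calls = [f"caller{i}-giab-demo" for i in range(10)]  # hypothetical callset keys
# Same ceiling division as generate_matrix.py: (10 + 3) // 4 = 3 chunks
num_chunks = (len(calls) + CHUNK_SIZE - 1) // CHUNK_SIZE

chunks = [select_chunk(calls, i) for i in range(num_chunks)]
assert sorted(sum(chunks, [])) == sorted(calls)  # each callset in exactly one chunk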