diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index a0f4e74..373c694 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -11,28 +11,85 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  # 1. SETUP: Dynamically find the samples
+  # 1. SETUP: Dynamically find the samples and create chunks for large benchmarks
   setup:
     runs-on: ubuntu-latest
     outputs:
-      samples: ${{ steps.set-matrix.outputs.samples }}
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
       - uses: actions/checkout@v4
       - id: set-matrix
         run: |
-          # Auto-discover benchmarks from resources/regions/, but only keep
-          # those that include a target regions BED (required by the workflow).
-          SAMPLES=$(for d in resources/regions/*/; do [ -f "$d/target-regions.bed" ] && basename "$d"; done | jq -R -s -c 'split("\n")[:-1]')
-          echo "samples=$SAMPLES" >> $GITHUB_OUTPUT
-
-  # 2. EVALUATE: The matrix-based analysis
+          # Create a matrix that splits large benchmarks into chunks to avoid timeouts.
+          # Benchmarks with many variant calls are split into smaller groups.
+          cat > /tmp/generate_matrix.py << 'EOF'
+          import json
+          import os
+          import yaml
+
+          # Load config
+          with open('config/config.yaml', 'r') as f:
+              config = yaml.safe_load(f)
+
+          # Find samples (benchmarks with a target-regions.bed)
+          samples = []
+          for entry in os.listdir('resources/regions'):
+              path = os.path.join('resources/regions', entry, 'target-regions.bed')
+              if os.path.isfile(path):
+                  samples.append(entry)
+
+          # Group variant calls by benchmark
+          benchmarks = {}
+          for key, callset in config['variant-calls'].items():
+              benchmark = callset.get('benchmark')
+              if benchmark and benchmark in samples:
+                  if benchmark not in benchmarks:
+                      benchmarks[benchmark] = []
+                  benchmarks[benchmark].append(key)
+
+          # Create matrix entries, splitting large benchmarks into chunks
+          matrix = []
+          CHUNK_SIZE = 4  # Max variant calls per job; keep in sync with chunk_size=4 in the evaluate args
+
+          for sample in sorted(samples):
+              calls = benchmarks.get(sample, [])
+              num_calls = len(calls)
+
+              if num_calls <= CHUNK_SIZE:
+                  # Small benchmark: process as a single job
+                  matrix.append({
+                      'sample': sample,
+                      'chunk': 'all',
+                      'chunk_index': 0
+                  })
+              else:
+                  # Large benchmark: split into ceil(num_calls / CHUNK_SIZE) jobs
+                  num_chunks = (num_calls + CHUNK_SIZE - 1) // CHUNK_SIZE
+                  for i in range(num_chunks):
+                      matrix.append({
+                          'sample': sample,
+                          'chunk': f'{i+1}of{num_chunks}',
+                          'chunk_index': i
+                      })
+
+          print(json.dumps({'include': matrix}))
+          EOF
+
+          python3 /tmp/generate_matrix.py > /tmp/matrix.json
+          MATRIX=$(cat /tmp/matrix.json)
+          echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
+          echo "Generated matrix:" >> $GITHUB_STEP_SUMMARY
+          echo '```json' >> $GITHUB_STEP_SUMMARY
+          jq . /tmp/matrix.json >> $GITHUB_STEP_SUMMARY
+          echo '```' >> $GITHUB_STEP_SUMMARY
+
+  # 2. EVALUATE: The matrix-based analysis (split by sample and chunk)
   evaluate:
     needs: setup
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
-      matrix:
-        sample: ${{ fromJson(needs.setup.outputs.samples) }}
+      matrix: ${{ fromJson(needs.setup.outputs.matrix) }}
     env:
       FTP_PASSWORD: ${{ secrets.FTP_PASSWORD }}
       ZENODO_TOKEN: ${{ secrets.ZENODO_TOKEN }}
@@ -73,18 +130,7 @@ jobs:
         with:
           directory: "."
           snakefile: "workflow/Snakefile"
-          args: "--sdm conda --cores 1 --conda-cleanup-pkgs cache --until benchmark_get_truth --config benchmark=${{ matrix.sample }}"
-          stagein: |
-            pip install snakemake-storage-plugin-zenodo
-            pip install snakemake-storage-plugin-http
-
-      - name: Fix modification dates
-        uses: snakemake/snakemake-github-action@v2.0.3
-        with:
-          directory: "."
- snakefile: "workflow/Snakefile" - # Only touch the files for THIS matrix sample - args: "--cores 1 --sdm conda --touch resources/regions/${{ matrix.sample }}/test-regions.cov-*.bed" + args: "--sdm conda --cores 1 --conda-cleanup-pkgs cache --until benchmark_get_truth --config benchmark=${{ matrix.sample }} chunk_index=${{ matrix.chunk_index }} chunk_size=4" stagein: | pip install snakemake-storage-plugin-zenodo pip install snakemake-storage-plugin-http @@ -96,7 +142,7 @@ jobs: snakefile: "workflow/Snakefile" args: > benchmark_all - --config benchmark=${{ matrix.sample }} + --config benchmark=${{ matrix.sample }} chunk_index=${{ matrix.chunk_index }} chunk_size=4 --cores 4 --sdm conda --conda-cleanup-pkgs cache --rerun-triggers mtime --all-temp stagein: | pip install snakemake-storage-plugin-zenodo @@ -105,7 +151,7 @@ jobs: - name: Upload results uses: actions/upload-artifact@v4 with: - name: ${{ matrix.sample }} + name: ${{ matrix.sample }}-${{ matrix.chunk }} # Upload the workflow outputs (keeps the on-disk paths Snakemake expects). path: results/ retention-days: 1 @@ -113,6 +159,8 @@ jobs: # 3. REPORT: Combine and Deploy report: needs: evaluate + # Run report even if some evaluate jobs fail, as long as at least one succeeded + if: ${{ !cancelled() && (success() || failure()) }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -120,7 +168,8 @@ jobs: - name: Download all results uses: actions/download-artifact@v4 with: - # Merge all benchmark artifacts back into the workspace so Snakemake sees `results/`. + # Download all available benchmark artifacts (pattern: giab-*-*) + pattern: "giab-*" path: . merge-multiple: true diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 2795bf3..0a7c00c 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -25,6 +25,27 @@ if "benchmark" in config: } +# Filter callsets by chunk if specified via --config chunk_index=... and chunk_size=... +# This allows splitting large benchmarks into smaller groups to avoid timeouts. +if "chunk_index" in config and "chunk_size" in config: + # Get all variant calls for this benchmark (already filtered above) + all_calls = sorted(config["variant-calls"].keys()) + chunk_index = int(config["chunk_index"]) + chunk_size = int(config["chunk_size"]) + + # Select the chunk + start_idx = chunk_index * chunk_size + end_idx = start_idx + chunk_size + chunk_calls = all_calls[start_idx:end_idx] + + # Filter to only this chunk + config["variant-calls"] = { + key: callset + for key, callset in config["variant-calls"].items() + if key in chunk_calls + } + + # add path to callsets for key, callset in config["variant-calls"].items(): if "zenodo" in callset: