diff --git a/.github/actions/upload-benchmark-results/action.yml b/.github/actions/upload-benchmark-results/action.yml
index 521f16e776..1a33fcd378 100644
--- a/.github/actions/upload-benchmark-results/action.yml
+++ b/.github/actions/upload-benchmark-results/action.yml
@@ -3,7 +3,10 @@ name: Upload benchmark results
 inputs:
   benchmark-results-dir:
     description: 'The path to the directory with all the results in JSON format'
-    required: True
+    required: true
+  benchmark-name:
+    description: 'Manually set the name of the benchmark'
+    default: ''
   dry-run:
     default: 'true'
   schema-version:
@@ -12,7 +15,6 @@ inputs:
     default: ''
   venv:
     description: 'Path to virtual environment to activate'
-    required: false
     default: ''
 
 runs:
@@ -26,26 +28,53 @@ runs:
         if [[ -n "${{ inputs.venv }}" ]]; then
           source "${{ inputs.venv }}"
         fi
-        python3 -mpip install boto3==1.35.33 psutil==7.0.0 pynvml==12.0.0
+        python3 -mpip install boto3==1.35.33 psutil==7.0.0 nvidia-ml-py==13.580.82
 
-        DEVICE_NAME=""
-        DEVICE_TYPE=""
+    - name: Get device name
+      shell: bash
+      run: |
+        set -eux
 
         if command -v nvidia-smi; then
-          # NB: I'm using PyTorch here to get the device name, however, it needs to
-          # install the correct version of PyTorch manually for now. Any PyTorch
-          # version is fine, I just use 2.7.1 to satify PYPIDEP linter
-          python3 -mpip install torch==2.7.1
-        elif command -v rocminfo; then
-          # NB: Installing torch on ROCm runner with pip here causes CI to fail
-          # with a memoryview is too large error only on MI300 runners. Is pip
-          # version on ROCm runner there too old? As a workaround, let's use the
-          # GPU device name coming from rocminfo instead
+          DEVICE_NAME=cuda
+          nvidia-smi
+        elif command -v rocm-smi; then
           DEVICE_NAME=rocm
-          DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+          rocm-smi
+        elif command -v hl-smi; then
+          DEVICE_NAME=hpu
+          hl-smi
+        else
+          arch=$(uname -m)
+
+          case "$arch" in
+            aarch64|arm64)
+              DEVICE_NAME=arm64-cpu
+              ;;
+            *)
+              DEVICE_NAME=cpu
+              ;;
+          esac
+          lscpu
         fi
-
         echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
+
+    - name: Get device type
+      shell: bash
+      run: |
+        set -eux
+
+        if [[ "${DEVICE_NAME}" == "cuda" ]]; then
+          DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
+        elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
+          DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+        elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
+          DEVICE_TYPE="Intel Gaudi3 "$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+        elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+          DEVICE_TYPE="$(lscpu | grep "Model name" | sed -E 's/.*Model name:[[:space:]]*//; s/Intel\(R\)//g; s/\(R\)//g; s/\(TM\)//g; s/CPU//g; s/Processor//g; s/[[:space:]]+/ /g; s/^ //; s/ $//; s/ /_/g')_$(awk -F: '/Core\(s\) per socket/ {c=$2} /Socket\(s\)/ {s=$2} END {gsub(/ /,"",c); gsub(/ /,"",s); printf "%sc", c*s}' < <(lscpu))"
+        elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
+          DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
+        fi
         echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
 
     - name: Check that GITHUB_TOKEN is defined
@@ -79,6 +108,7 @@ runs:
         RUN_ATTEMPT: ${{ github.run_attempt }}
         JOB_ID: ${{ inputs.github-token != '' && steps.get-job-id.outputs.job-id || '0' }}
         JOB_NAME: ${{ inputs.github-token != '' && steps.get-job-id.outputs.job-name || '' }}
+        BENCHMARK_NAME: ${{ inputs.benchmark-name || '' }}
       run: |
         set -eux
 
@@ -121,6 +151,7 @@ runs:
       shell: bash
       env:
         BENCHMARK_RESULTS_DIR: ${{ inputs.benchmark-results-dir }}
+        BENCHMARK_NAME: ${{ inputs.benchmark-name || '' }}
         DRY_RUN: ${{ inputs.dry-run }}
         # Additional information about the benchmarks
         BENCHMARK_METADATA: ${{ steps.gather-metadata.outputs.metadata }}
diff --git a/.github/scripts/benchmarks/gather_metadata.py b/.github/scripts/benchmarks/gather_metadata.py
index e38c8b5bdf..8bb9969fa4 100755
--- a/.github/scripts/benchmarks/gather_metadata.py
+++ b/.github/scripts/benchmarks/gather_metadata.py
@@ -83,7 +83,9 @@ def main() -> None:
     metadata = {
         "timestamp": int(time.time()),
         "schema_version": args.schema_version,
-        "name": args.job_name,
+        # The action always exports BENCHMARK_NAME (empty by default), so use
+        # `or` rather than a getenv default to fall back to the job name.
+        "name": os.getenv("BENCHMARK_NAME") or args.job_name,
         "repo": args.repo,
         "head_branch": args.head_branch,
         "head_sha": args.head_sha,
diff --git a/.github/scripts/upload_benchmark_results.py b/.github/scripts/upload_benchmark_results.py
index 5513030003..93fe9f0e42 100755
--- a/.github/scripts/upload_benchmark_results.py
+++ b/.github/scripts/upload_benchmark_results.py
@@ -10,6 +10,7 @@
 import json
 import logging
 import os
+import sys
 import time
 from argparse import Action, ArgumentParser, Namespace
 from decimal import Decimal
@@ -185,7 +186,7 @@ def upload_to_dynamodb(
 
 
 def read_benchmark_results(filepath: str) -> List[Dict[str, Any]]:
-    benchmark_results = []
+    benchmark_results: List[Dict[str, Any]] = []
     with open(filepath) as f:
         try:
             r = json.load(f)
@@ -216,6 +217,15 @@ def read_benchmark_results(filepath: str) -> List[Dict[str, Any]]:
                 except JSONDecodeError:
                     warn(f"Invalid JSON {line}, skipping")
 
+    # Overwrite the benchmark name if needed
+    if os.getenv("BENCHMARK_NAME"):
+        benchmark_name = os.getenv("BENCHMARK_NAME")
+        for bresult in benchmark_results:
+            if bresult.get("benchmark", {}) and bresult.get("benchmark", {}).get(
+                "name"
+            ):
+                bresult["benchmark"]["name"] = benchmark_name
+
     return benchmark_results
 
 
@@ -319,6 +329,7 @@ def upload_to_s3(
 def main() -> None:
     args = parse_args()
 
+    has_results_uploaded = False
     for file in os.listdir(args.benchmark_results_dir):
         if not file.endswith(".json"):
             continue
@@ -349,6 +360,7 @@ def main() -> None:
         if not benchmark_results:
             continue
 
+        has_results_uploaded = True
         upload_to_s3(
             s3_bucket=OSSCI_BENCHMARKS_BUCKET,
             filepath=filepath,
@@ -357,6 +369,12 @@ def main() -> None:
             dry_run=args.dry_run,
         )
 
+    # When there are no benchmark results, treat it as a failure. This is
+    # better than failing silently.
+    if not has_results_uploaded:
+        warn(f"Find no benchmark results in {args.benchmark_results_dir}")
+        sys.exit(1)
+
 
 if __name__ == "__main__":
     main()