Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 47 additions & 16 deletions .github/actions/upload-benchmark-results/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ name: Upload benchmark results
inputs:
benchmark-results-dir:
description: 'The path to the directory with all the results in JSON format'
required: True
required: true
benchmark-name:
description: 'Manually set the name of the benchmark'
default: ''
dry-run:
default: 'true'
schema-version:
Expand All @@ -12,7 +15,6 @@ inputs:
default: ''
venv:
description: 'Path to virtual environment to activate'
required: false
default: ''

runs:
Expand All @@ -26,26 +28,53 @@ runs:
if [[ -n "${{ inputs.venv }}" ]]; then
source "${{ inputs.venv }}"
fi
python3 -mpip install boto3==1.35.33 psutil==7.0.0 pynvml==12.0.0
python3 -mpip install boto3==1.35.33 psutil==7.0.0 nvidia-ml-py==13.580.82

DEVICE_NAME=""
DEVICE_TYPE=""
- name: Get device name
shell: bash
run: |
set -eux

if command -v nvidia-smi; then
# NB: I'm using PyTorch here to get the device name; however, the correct
# version of PyTorch needs to be installed manually for now. Any PyTorch
# version is fine, I just use 2.7.1 to satisfy the PYPIDEP linter
python3 -mpip install torch==2.7.1
elif command -v rocminfo; then
# NB: Installing torch on the ROCm runner with pip here causes CI to fail
# with a "memoryview is too large" error, but only on MI300 runners. Is the
# pip version on the ROCm runner too old? As a workaround, let's use the
# GPU device name coming from rocminfo instead
DEVICE_NAME=cuda
nvidia-smi
elif command -v rocm-smi; then
DEVICE_NAME=rocm
DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
rocm-smi
elif command -v hl-smi; then
DEVICE_NAME=hpu
hl-smi
else
arch=$(uname -m)

case "$arch" in
aarch64|arm64)
DEVICE_NAME=arm64-cpu
;;
*)
DEVICE_NAME=cpu
;;
esac
lscpu
fi

echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV

- name: Get device type
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logic is from https://github.com/pytorch/pytorch-integration-testing/blob/main/.github/workflows/vllm-benchmark.yml#L139. There is a task to consolidate this into this official upload benchmark results GHA

shell: bash
run: |
set -eux

if [[ "${DEVICE_NAME}" == "cuda" ]]; then
DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
elif [[ "${DEVICE_NAME}" == "hpu" ]]; then
DEVICE_TYPE="Intel Gaudi3 "$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
DEVICE_TYPE="$(lscpu | grep "Model name" | sed -E 's/.*Model name:[[:space:]]*//; s/Intel\(R\)//g; s/\(R\)//g; s/\(TM\)//g; s/CPU//g; s/Processor//g; s/[[:space:]]+/ /g; s/^ //; s/ $//; s/ /_/g')_$(awk -F: '/Core\(s\) per socket/ {c=$2} /Socket\(s\)/ {s=$2} END {gsub(/ /,"",c); gsub(/ /,"",s); printf "%sc", c*s}' < <(lscpu))"
elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
fi
echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV

- name: Check that GITHUB_TOKEN is defined
Expand Down Expand Up @@ -79,6 +108,7 @@ runs:
RUN_ATTEMPT: ${{ github.run_attempt }}
JOB_ID: ${{ inputs.github-token != '' && steps.get-job-id.outputs.job-id || '0' }}
JOB_NAME: ${{ inputs.github-token != '' && steps.get-job-id.outputs.job-name || '' }}
BENCHMARK_NAME: ${{ inputs.benchmark-name || '' }}
run: |
set -eux

Expand Down Expand Up @@ -121,6 +151,7 @@ runs:
shell: bash
env:
BENCHMARK_RESULTS_DIR: ${{ inputs.benchmark-results-dir }}
BENCHMARK_NAME: ${{ inputs.benchmark-name || '' }}
DRY_RUN: ${{ inputs.dry-run }}
# Additional information about the benchmarks
BENCHMARK_METADATA: ${{ steps.gather-metadata.outputs.metadata }}
Expand Down
2 changes: 1 addition & 1 deletion .github/scripts/benchmarks/gather_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def main() -> None:
metadata = {
"timestamp": int(time.time()),
"schema_version": args.schema_version,
"name": args.job_name,
"name": os.getenv("BENCHMARK_NAME", args.job_name),
"repo": args.repo,
"head_branch": args.head_branch,
"head_sha": args.head_sha,
Expand Down
20 changes: 19 additions & 1 deletion .github/scripts/upload_benchmark_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import json
import logging
import os
import sys
import time
from argparse import Action, ArgumentParser, Namespace
from decimal import Decimal
Expand Down Expand Up @@ -185,7 +186,7 @@ def upload_to_dynamodb(


def read_benchmark_results(filepath: str) -> List[Dict[str, Any]]:
benchmark_results = []
benchmark_results: List[Dict[str, Any]] = []
with open(filepath) as f:
try:
r = json.load(f)
Expand Down Expand Up @@ -216,6 +217,15 @@ def read_benchmark_results(filepath: str) -> List[Dict[str, Any]]:
except JSONDecodeError:
warn(f"Invalid JSON {line}, skipping")

# Overwrite the benchmark name if needed
if os.getenv("BENCHMARK_NAME"):
benchmark_name = os.getenv("BENCHMARK_NAME")
for bresult in benchmark_results:
if bresult.get("benchmark", {}) and bresult.get("benchmark", {}).get(
"name"
):
bresult["benchmark"]["name"] = benchmark_name

return benchmark_results


Expand Down Expand Up @@ -319,6 +329,7 @@ def upload_to_s3(
def main() -> None:
args = parse_args()

has_results_uploaded = False
for file in os.listdir(args.benchmark_results_dir):
if not file.endswith(".json"):
continue
Expand Down Expand Up @@ -349,6 +360,7 @@ def main() -> None:
if not benchmark_results:
continue

has_results_uploaded = True
upload_to_s3(
s3_bucket=OSSCI_BENCHMARKS_BUCKET,
filepath=filepath,
Expand All @@ -357,6 +369,12 @@ def main() -> None:
dry_run=args.dry_run,
)

# When there are no benchmark results, treat it as a failure. This is better
# than failing silently.
if not has_results_uploaded:
warn(f"Find no benchmark results in {args.benchmark_results}")
sys.exit(1)


if __name__ == "__main__":
main()