From 8a392e33d26092bfc83f38d32f4ae5f3ff091ad0 Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Mon, 15 Dec 2025 18:27:21 +0000 Subject: [PATCH 1/7] Build Ops Agent GPU Testing Images --- .../gpu-image-builder/build_packer_builder.sh | 20 ++++ .../gpu-image-builder/check_source_image.sh | 31 ++++++ cloudbuild/gpu-image-builder/cloudbuild.yaml | 53 +++++++++ cloudbuild/gpu-image-builder/packer.pkr.hcl | 104 ++++++++++++++++++ .../scripts/debian-11/setup_vm.sh | 1 + .../scripts/debian-12/post_reboot.sh | 66 +++++++++++ .../scripts/debian-12/setup_vm.sh | 28 +++++ .../scripts/debian-13/setup_vm.sh | 25 +++++ cloudbuild/gpu-image-builder/scripts/noop.sh | 0 .../scripts/rocky-linux-8/setup_vm.sh | 55 +++++++++ .../scripts/rocky-linux-9/setup_vm.sh | 55 +++++++++ .../scripts/sles-15/setup_vm.sh | 67 +++++++++++ .../scripts/ubuntu-2204-lts/setup_vm.sh | 57 ++++++++++ .../scripts/ubuntu-2404-lts-amd64/setup_vm.sh | 57 ++++++++++ 14 files changed, 619 insertions(+) create mode 100644 cloudbuild/gpu-image-builder/build_packer_builder.sh create mode 100644 cloudbuild/gpu-image-builder/check_source_image.sh create mode 100644 cloudbuild/gpu-image-builder/cloudbuild.yaml create mode 100644 cloudbuild/gpu-image-builder/packer.pkr.hcl create mode 100644 cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh create mode 100644 cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh create mode 100644 cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh create mode 100644 cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh create mode 100644 cloudbuild/gpu-image-builder/scripts/noop.sh create mode 100644 cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh create mode 100644 cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh create mode 100644 cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh create mode 100644 cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh create mode 100644 
cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh diff --git a/cloudbuild/gpu-image-builder/build_packer_builder.sh b/cloudbuild/gpu-image-builder/build_packer_builder.sh new file mode 100644 index 0000000000..594db7f4c3 --- /dev/null +++ b/cloudbuild/gpu-image-builder/build_packer_builder.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# build_packer_builder.sh +# Builds the custom Packer Cloud Build builder if it doesn't exist. +# https://docs.cloud.google.com/build/docs/building/build-vm-images-with-packer + +set -euo pipefail + +PROJECT_ID="${1}" +PACKER_BUILDER_IMAGE="gcr.io/${PROJECT_ID}/packer" + +if gcloud container images describe "${PACKER_BUILDER_IMAGE}" > /dev/null 2>&1; then + echo "Packer builder image '${PACKER_BUILDER_IMAGE}' exists, skipping build." +else + echo "Packer builder image not found. Building it now..." + git clone https://github.com/GoogleCloudPlatform/cloud-builders-community.git --depth=1 + cd cloud-builders-community/packer + gcloud builds submit --project="${PROJECT_ID}" . + cd - + echo "Packer builder image built." +fi \ No newline at end of file diff --git a/cloudbuild/gpu-image-builder/check_source_image.sh b/cloudbuild/gpu-image-builder/check_source_image.sh new file mode 100644 index 0000000000..f0b21b2b92 --- /dev/null +++ b/cloudbuild/gpu-image-builder/check_source_image.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# check_source_image.sh +# Checks if the latest public image is newer than the source of our last build. 
+ +set -euo pipefail + +PROJECT_ID="${1}" +SOURCE_IMAGE_FAMILY="${2}" +SOURCE_IMAGE_PROJECT="${3}" +TARGET_IMAGE_FAMILY="${4}" + +echo "--- Checking for New Source Image ---" +LATEST_PUBLIC_IMAGE=$(gcloud compute images describe-from-family "${SOURCE_IMAGE_FAMILY}" --project="${SOURCE_IMAGE_PROJECT}" --format="value(name)") +echo "Latest available public image: ${LATEST_PUBLIC_IMAGE}" + +LAST_CURATED_SOURCE_IMAGE="" +if gcloud compute images describe-from-family "${TARGET_IMAGE_FAMILY}" --project="${PROJECT_ID}" &> /dev/null; then + LAST_CURATED_SOURCE_IMAGE=$(gcloud compute images describe-from-family "${TARGET_IMAGE_FAMILY}" --project="${PROJECT_ID}" --format="value(labels.source-gce-image)") + echo "Source image of our latest curated image: ${LAST_CURATED_SOURCE_IMAGE}" +else + echo "Image family '${TARGET_IMAGE_FAMILY}' not found. Assuming this is the first build." +fi + +if [[ "${LATEST_PUBLIC_IMAGE}" == "${LAST_CURATED_SOURCE_IMAGE}" ]]; then + echo "Source image '${LATEST_PUBLIC_IMAGE}' has not changed. Signaling to skip build." + echo "SKIP" > /workspace/build_status.txt +else + echo "New source image '${LATEST_PUBLIC_IMAGE}' detected or first run. Signaling to run build." + echo "${LATEST_PUBLIC_IMAGE}" > /workspace/new_source_image.txt + echo "RUN" > /workspace/build_status.txt +fi diff --git a/cloudbuild/gpu-image-builder/cloudbuild.yaml b/cloudbuild/gpu-image-builder/cloudbuild.yaml new file mode 100644 index 0000000000..4b4a95c93b --- /dev/null +++ b/cloudbuild/gpu-image-builder/cloudbuild.yaml @@ -0,0 +1,53 @@ +# cloudbuild.yaml +steps: +# Check for new source image. Runs 'check_source_image.sh'. 
+- id: 'check-source-image' + name: 'gcr.io/cloud-builders/gcloud' + entrypoint: 'bash' + args: + - '-c' + - | + chmod +x check_source_image.sh + ./check_source_image.sh "${PROJECT_ID}" \ + "${_LOUHI_PARAM_SOURCE_IMAGE_FAMILY}" \ + "${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \ + "${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" + waitFor: ['-'] + +# Conditionally build the Packer builder image. Runs 'build_packer_builder.sh'. +- id: 'build-packer-builder' + name: 'gcr.io/cloud-builders/gcloud' + entrypoint: 'bash' + args: + - '-c' + - | + chmod +x build_packer_builder.sh + ./build_packer_builder.sh "${PROJECT_ID}" + waitFor: ['-'] # Can run in parallel with check-source-image + +# 2. Run Packer to build the GCE image, but only if 'check-source-image' signaled to RUN. +- id: 'packer-build-gpu-image' + name: 'gcr.io/${PROJECT_ID}/packer' # Use the custom Packer builder image + entrypoint: 'bash' + args: + - '-c' + - | + if [[ "$(cat /workspace/build_status.txt)" == "SKIP" ]]; then + echo "Skipping Packer build as source image has not changed." 
+ exit 0 + fi + + /usr/bin/packer build \ + -var "project_id=${PROJECT_ID}" \ + -var "image_name=${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}-$(date +%m-%d-%Y)" \ + -var "image_family=${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" \ + -var "source_image=$(cat /workspace/new_source_image.txt)" \ + -var "source_image_project=${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \ + -var "gpu_driver_version=535.161.01" \ + -var "cuda_version=12.8.0" \ + -var "zone=us-central1-a" \ + -var "build_id=${BUILD_ID}" \ + packer.pkr.hcl + waitFor: ['check-source-image', 'build-packer-builder'] + +timeout: 14400s diff --git a/cloudbuild/gpu-image-builder/packer.pkr.hcl b/cloudbuild/gpu-image-builder/packer.pkr.hcl new file mode 100644 index 0000000000..c0d63462dd --- /dev/null +++ b/cloudbuild/gpu-image-builder/packer.pkr.hcl @@ -0,0 +1,104 @@ +// packer.pkr.hcl +variable "project_id" { + type = string + description = "GCP Project ID" +} + +variable "image_name" { + type = string + description = "Name of the created GCE image" +} + +variable "image_family" { + type = string + description = "Image family for the created GCE image" +} + +variable "source_image" { + type = string + description = "The specific source GCE image name (e.g., ubuntu-2204-jammy-v20240115)" +} + +variable "source_image_project" { + type = string + description = "The specific source GCE image project (e.g., ubuntu-os-cloud)" +} + + +variable "gpu_driver_version" { + type = string + default = "535.161.01" // Pin specific NVIDIA driver version + description = "Specific NVIDIA GPU driver version to install" +} + +variable "cuda_version" { + type = string + default = "12.2.2" // Pin specific CUDA Toolkit version + description = "Specific CUDA Toolkit version to install" +} + +variable "zone" { + type = string + default = "us-central1-a" + description = "GCP zone for the temporary build instance" +} + +variable "build_id" { + type = string + description = "Cloud Build ID for traceability" + default = "manual" +} + +source "googlecompute" 
"gpu_image" { + project_id = var.project_id + zone = var.zone + source_image = var.source_image + source_image_project_id = [var.source_image_project] + image_name = var.image_name + image_family = var.image_family + ssh_username = "packer" + disk_size = 50 + disk_type = "pd-standard" + machine_type = "n1-standard-4" // Use a standard VM for building, no GPU needed here + tags = ["packer-build"] + + // *** IMPORTANT: Label the created image with its source image *** + image_labels = { + source-gce-image = "${var.source_image}" + built-by = "louhi" + cloud-build-id = "${var.build_id}" + } +} + +build { + sources = ["source.googlecompute.gpu_image"] + provisioner "shell" { + script = "./scripts/${var.image_family}/setup_vm.sh" + # Packer will pass these variables as PACKER_VAR_* env vars + environment_vars = [ + "PACKER_VAR_project_id=${var.project_id}", + "PACKER_VAR_gpu_driver_version=${var.gpu_driver_version}", + "PACKER_VAR_cuda_version=${var.cuda_version}", + "PACKER_VAR_build_id=${var.build_id}" + ] + # Expect a disconnect/reboot after GPU driver install + expect_disconnect = true + # Give some time for SSH to come back up + timeout = "240m" + } + + // Provisioner 2: Handles the post-reboot part, ONLY for Debian 12. + provisioner "shell" { + script = var.image_family == "debian-12" ? "./scripts/${var.image_family}/post_reboot.sh" : "./scripts/noop.sh" + environment_vars = [ + "PACKER_VAR_project_id=${var.project_id}", + "PACKER_VAR_gpu_driver_version=${var.gpu_driver_version}", + "PACKER_VAR_cuda_version=${var.cuda_version}", + "PACKER_VAR_build_id=${var.build_id}" + ] + # Wait for the reboot to be complete + pause_before = "60s" + expect_disconnect = false // No reboot expected in this second phase. 
+ timeout = "240m" + } +} diff --git a/cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh new file mode 100644 index 0000000000..004c2e34da --- /dev/null +++ b/cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh @@ -0,0 +1 @@ +/opt/deeplearning/install-driver.sh \ No newline at end of file diff --git a/cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh b/cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh new file mode 100644 index 0000000000..4795a1b5f0 --- /dev/null +++ b/cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# post_reboot_gpu_setup.sh - Runs setup steps after the VM has rebooted on Debian 12. +set -euo pipefail +echo "--- Starting Packer Post-Reboot Provisioning on Debian 12 $(date) ---" + +PROJECT_ID="${PACKER_VAR_project_id}" +CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 +BUILD_ID="${PACKER_VAR_build_id}" + +echo "Fetched variables from PACKER_VAR_*:" +echo " PROJECT_ID: ${PROJECT_ID}" +echo " CUDA_VERSION: ${CUDA_VERSION}" +echo " BUILD_ID: ${BUILD_ID}" + +# --- Persistent Installer Path --- +INSTALLER_DIR="/var/lib/cuda-installer" +CUDA_INSTALLER_PATH="${INSTALLER_DIR}/cuda_installer.pyz" + +# Ensure the installer exists (it should, as it was downloaded in the first phase) +if [ ! -f "${CUDA_INSTALLER_PATH}" ]; then + echo "ERROR: cuda_installer.pyz not found at ${CUDA_INSTALLER_PATH}!" 
+ exit 1 +fi + +echo "Running cuda_installer.pyz install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb" +sudo python3 "${CUDA_INSTALLER_PATH}" install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb || { echo "ERROR: cuda_installer.pyz install_driver failed!"; exit 1; } + +echo "Running cuda_installer.pyz install_cuda --ignore-no-gpu --installation-mode=repo --installation-branch=nfb" +sudo python3 "${CUDA_INSTALLER_PATH}" install_cuda --ignore-no-gpu --installation-mode=repo --installation-branch=nfb || { echo "ERROR: cuda_installer.pyz install_cuda failed!"; exit 1; } + +# # --- Install CUDA Samples --- +# echo "--- Installing CUDA Samples for ${CUDA_VERSION} ---" +# # Convert CUDA_VERSION (e.g., 12.2.2) to the apt package format (e.g., 12-2) +# CUDA_VERSION_SHORT="${CUDA_VERSION%.*}" # 12.2 +# CUDA_VERSION_DASHED="${CUDA_VERSION_SHORT//./-}" # 12-2 +# CUDA_SAMPLES_PACKAGE="cuda-demo-suite-${CUDA_VERSION_DASHED}" + +# # Ensure NVIDIA repo is added - it should have been by cuda_installer.pyz or is pre-configured +# sudo apt-get update -y + +# if apt-cache show "${CUDA_SAMPLES_PACKAGE}" &> /dev/null; then +# echo "Package '${CUDA_SAMPLES_PACKAGE}' found. Installing..." +# sudo apt-get install -y --no-install-recommends "${CUDA_SAMPLES_PACKAGE}" || { echo "ERROR: Failed to install ${CUDA_SAMPLES_PACKAGE}!"; exit 1; } +# echo "CUDA Samples installed." +# else +# echo "WARNING: CUDA Samples package '${CUDA_SAMPLES_PACKAGE}' not found in repositories." +# echo "You may need to manually add the NVIDIA repos or check package naming for Debian 12." 
+# fi + +# # --- Create Symbolic Link --- +# TARGET_DIR="/usr/local/cuda-${CUDA_VERSION_SHORT}" +# LINK_NAME="/usr/local/cuda" +# if [ -d "$TARGET_DIR" ]; then +# echo "Creating symbolic link: $LINK_NAME -> $TARGET_DIR" +# sudo ln -snf "$TARGET_DIR" "$LINK_NAME" +# echo "Symlink created at $LINK_NAME" +# else +# echo "WARNING: Target directory $TARGET_DIR not found for symlink." +# fi + +# echo "--- Cleaning up ---" +# sudo apt-get clean +# sudo rm -rf /var/lib/apt/lists/* +# # The cuda_installer.pyz is deliberately left in ${INSTALLER_DIR} as requested. +# echo "--- Packer post-reboot setup complete $(date) ---" +# echo "--- Provisioning script finished $(date) ---" diff --git a/cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh new file mode 100644 index 0000000000..b72cef1291 --- /dev/null +++ b/cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# initial_gpu_setup.sh - Runs initial setup and GPU driver installation on Debian 12. 
+set -euo pipefail +echo "--- Starting Packer Initial Provisioning on Debian 12 $(date) ---" + +# --- Input Variables from PACKER_VAR_* environment variables --- +PROJECT_ID="${PACKER_VAR_project_id}" +# GPU_DRIVER_VERSION is not directly used by cuda_installer.pyz install_driver +BUILD_ID="${PACKER_VAR_build_id}" + +echo "Fetched variables from PACKER_VAR_*:" +echo " PROJECT_ID: ${PROJECT_ID}" +echo " BUILD_ID: ${BUILD_ID}" + +echo "--- Running apt updates and installing prerequisites ---" +sudo apt-get update -y +sudo apt-get install -y --no-install-recommends python3 python3-pip wget curl gnupg git || { echo "ERROR: Failed to install prerequisites!"; exit 1; } + +echo "--- Installing GPU Driver using cuda_installer.pyz ---" +INSTALLER_DIR="/var/lib/cuda-installer" +CUDA_INSTALLER_PATH="${INSTALLER_DIR}/cuda_installer.pyz" +sudo mkdir -p "${INSTALLER_DIR}" +sudo curl -L https://storage.googleapis.com/compute-gpu-installation-us/installer/latest/cuda_installer.pyz --output "${CUDA_INSTALLER_PATH}" +sudo chmod +x "${CUDA_INSTALLER_PATH}" + +echo "Running cuda_installer.pyz install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb" +sudo python3 "${CUDA_INSTALLER_PATH}" install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb || { echo "ERROR: cuda_installer.pyz install_driver failed!"; exit 1; } +# The script will reboot \ No newline at end of file diff --git a/cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh new file mode 100644 index 0000000000..f7f4295bb2 --- /dev/null +++ b/cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# initial_gpu_setup.sh - Runs initial setup and GPU driver installation on Debian 13. 
+set -euo pipefail +echo "--- Starting Packer Initial Provisioning on Debian 13 $(date) ---" + +# --- Input Variables from PACKER_VAR_* environment variables --- +PROJECT_ID="${PACKER_VAR_project_id}" +# GPU_DRIVER_VERSION is not directly used by cuda_installer.pyz install_driver +BUILD_ID="${PACKER_VAR_build_id}" + +echo "Fetched variables from PACKER_VAR_*:" +echo " PROJECT_ID: ${PROJECT_ID}" +echo " BUILD_ID: ${BUILD_ID}" + + +sudo apt update +KERNEL_VERSION=`uname -r` +sudo apt install -y linux-headers-${KERNEL_VERSION} pciutils gcc make dkms wget git + +wget https://developer.download.nvidia.com/compute/cuda/repos/debian13/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update + +sudo apt-get -y install cuda-13-1 + diff --git a/cloudbuild/gpu-image-builder/scripts/noop.sh b/cloudbuild/gpu-image-builder/scripts/noop.sh new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh new file mode 100644 index 0000000000..910598494a --- /dev/null +++ b/cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# setup_gpu_apps.sh - Provisioning script for Packer on Rocky Linux 8, executed via Shell Provisioner. 
+set -euo pipefail + +echo "--- Starting Packer Provisioning on Rocky Linux 8 $(date) ---" + +# --- Input Variables from PACKER_VAR_* environment variables --- +PROJECT_ID="${PACKER_VAR_project_id}" +GPU_DRIVER_VERSION="${PACKER_VAR_gpu_driver_version}" # e.g., 535.161.01 +CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 +BUILD_ID="${PACKER_VAR_build_id}" + +echo "Fetched variables from PACKER_VAR_*:" +echo " PROJECT_ID: ${PROJECT_ID}" +echo " GPU_DRIVER_VERSION: ${GPU_DRIVER_VERSION}" +echo " CUDA_VERSION: ${CUDA_VERSION}" +echo " BUILD_ID: ${BUILD_ID}" + +# Convert CUDA_VERSION (e.g., 12.2.2) to the format used in package names (e.g., 12-2) +CUDA_VERSION_SHORT="${CUDA_VERSION%.*}" # 12.2 +CUDA_VERSION_DASHED="${CUDA_VERSION_SHORT//./-}" # 12-2 + +# --- NVIDIA Repository Setup for Rocky Linux 8 --- +echo "--- Configuring NVIDIA CUDA Repository for Rocky Linux 8 ---" +sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo + +sudo dnf clean all +sudo dnf makecache +sudo dnf install -y git make + +# --- Install CUDA Toolkit and Samples --- +echo "--- Installing CUDA Toolkit ${CUDA_VERSION} and Samples ---" + +echo "Installing CUDA Toolkit and Samples..." +sudo dnf install -y cuda-"${CUDA_VERSION_DASHED}" + +# # --- Create Symbolic Link --- +# # CUDA on Linux often installs to /usr/local/cuda-${CUDA_VERSION_SHORT} +# TARGET_DIR="/usr/local/cuda-${CUDA_VERSION_SHORT}" +# LINK_NAME="/usr/local/cuda" + +# if [ -d "$TARGET_DIR" ]; then +# echo "Creating symbolic link: $LINK_NAME -> $TARGET_DIR" +# sudo ln -snf "$TARGET_DIR" "$LINK_NAME" + +# echo "----------------------------------------------------------------" +# echo "Success! CUDA Toolkit and Samples installed." 
+# echo "Symlink created at $LINK_NAME" +# echo "Samples located at: $LINK_NAME/samples/" +# echo "----------------------------------------------------------------" +# else +# echo "Error: Installation completed, but target directory $TARGET_DIR was not found." +# exit 1 +# fi +echo "--- Provisioning script finished $(date) ---" diff --git a/cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh new file mode 100644 index 0000000000..1507e853e5 --- /dev/null +++ b/cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# setup_gpu_apps.sh - Provisioning script for Packer on Rocky Linux 9, executed via Shell Provisioner. +set -euo pipefail + +echo "--- Starting Packer Provisioning on Rocky Linux 9 $(date) ---" + +# --- Input Variables from PACKER_VAR_* environment variables --- +PROJECT_ID="${PACKER_VAR_project_id}" +GPU_DRIVER_VERSION="${PACKER_VAR_gpu_driver_version}" # e.g., 535.161.01 +CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 +BUILD_ID="${PACKER_VAR_build_id}" + +echo "Fetched variables from PACKER_VAR_*:" +echo " PROJECT_ID: ${PROJECT_ID}" +echo " GPU_DRIVER_VERSION: ${GPU_DRIVER_VERSION}" +echo " CUDA_VERSION: ${CUDA_VERSION}" +echo " BUILD_ID: ${BUILD_ID}" + +# Convert CUDA_VERSION (e.g., 12.2.2) to the format used in package names (e.g., 12-2) +CUDA_VERSION_SHORT="${CUDA_VERSION%.*}" # 12.2 +CUDA_VERSION_DASHED="${CUDA_VERSION_SHORT//./-}" # 12-2 + +# --- NVIDIA Repository Setup for Rocky Linux 9 --- +echo "--- Configuring NVIDIA CUDA Repository for Rocky Linux 9 ---" +sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo + +sudo dnf clean all +sudo dnf makecache +sudo dnf install -y git make + +# --- Install CUDA Toolkit and Samples --- +echo "--- Installing CUDA Toolkit ${CUDA_VERSION} and Samples ---" + +echo "Installing CUDA Toolkit and Samples..." 
+sudo dnf install -y cuda-"${CUDA_VERSION_DASHED}" + +# # --- Create Symbolic Link --- +# # CUDA on Linux often installs to /usr/local/cuda-${CUDA_VERSION_SHORT} +# TARGET_DIR="/usr/local/cuda-${CUDA_VERSION_SHORT}" +# LINK_NAME="/usr/local/cuda" + +# if [ -d "$TARGET_DIR" ]; then +# echo "Creating symbolic link: $LINK_NAME -> $TARGET_DIR" +# sudo ln -snf "$TARGET_DIR" "$LINK_NAME" + +# echo "----------------------------------------------------------------" +# echo "Success! CUDA Toolkit and Samples installed." +# echo "Symlink created at $LINK_NAME" +# echo "Samples located at: $LINK_NAME/samples/" +# echo "----------------------------------------------------------------" +# else +# echo "Error: Installation completed, but target directory $TARGET_DIR was not found." +# exit 1 +# fi +echo "--- Provisioning script finished $(date) ---" diff --git a/cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh new file mode 100644 index 0000000000..e033762b57 --- /dev/null +++ b/cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# setup_gpu_apps.sh - Provisioning script for Packer, executed via Shell Provisioner. 
+set -euo pipefail + +echo "--- Starting Packer Provisioning $(date) ---" + +# --- Input Variables from PACKER_VAR_* environment variables --- +PROJECT_ID="${PACKER_VAR_project_id}" +GPU_DRIVER_VERSION="${PACKER_VAR_gpu_driver_version}" # e.g., 535.161.01 +CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 +BUILD_ID="${PACKER_VAR_build_id}" + +echo "Fetched variables from PACKER_VAR_*:" +echo " PROJECT_ID: ${PROJECT_ID}" +echo " GPU_DRIVER_VERSION: ${GPU_DRIVER_VERSION}" +echo " CUDA_VERSION: ${CUDA_VERSION}" +echo " BUILD_ID: ${BUILD_ID}" + +retry_command() { + local max_attempts="$1" + local sleep_time="$2" + local cmd="$3" + + echo "Starting command: $cmd" + echo "----------------------------------------" + + for ((i=1; i<=max_attempts; i++)); do + echo "[Attempt $i/$max_attempts] Running..." + + # Run the command using bash -c to handle complex commands (like those with &&) + if bash -c "$cmd"; then + echo "----------------------------------------" + echo "Success!" + return 0 + fi + + echo "Attempt failed." + + # Sleep only if we have attempts left + if [ $i -lt $max_attempts ]; then + echo "Waiting $sleep_time seconds before retrying..." + sleep $sleep_time + fi + done + + echo "----------------------------------------" + echo "Error: Command failed after $max_attempts attempts." 
+ exit 1 +} + +retry_command 5 5 "sudo /usr/sbin/registercloudguest --force" +retry_command 120 5 "sudo zypper --non-interactive --gpg-auto-import-keys refresh && sudo zypper --non-interactive install --force coreutils" + +sudo zypper --non-interactive install -y kernel-default-devel=$(uname -r | sed 's/\-default//') pciutils gcc make wget git + +# Install CUDA and driver together, since the `exercise` script needs to run a +# CUDA sample app to generating GPU process metrics +# Prefer to install from the package manager since it is normally faster and has +# less errors on installation; fallback to the runfile method if the package +# manager's package is not working or not compitible with the GPU model +DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//') +echo "Installing latest version of NVIDIA CUDA and driver" +sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo +sudo zypper --gpg-auto-import-keys --non-interactive refresh +sudo zypper --non-interactive install -y nvidia-compute-utils-G06 +sudo zypper --non-interactive install -y cuda-12-9 + diff --git a/cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh new file mode 100644 index 0000000000..db6dc0f735 --- /dev/null +++ b/cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# setup_gpu_apps.sh - Provisioning script for Packer, executed via Shell Provisioner. 
+set -euo pipefail + +echo "--- Starting Packer Provisioning $(date) ---" + +# --- Input Variables from PACKER_VAR_* environment variables --- +PROJECT_ID="${PACKER_VAR_project_id}" +GPU_DRIVER_VERSION="${PACKER_VAR_gpu_driver_version}" # e.g., 535.161.01 +CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 +BUILD_ID="${PACKER_VAR_build_id}" + +echo "Fetched variables from PACKER_VAR_*:" +echo " PROJECT_ID: ${PROJECT_ID}" +echo " GPU_DRIVER_VERSION: ${GPU_DRIVER_VERSION}" +echo " CUDA_VERSION: ${CUDA_VERSION}" +echo " BUILD_ID: ${BUILD_ID}" + +CUDA_VERSION_SHORT="${CUDA_VERSION%.*}" +CUDA_VERSION_DASHED="${CUDA_VERSION_SHORT//./-}" +PACKAGE_NAME="cuda-${CUDA_VERSION_DASHED}" + +echo "Target package name: $PACKAGE_NAME" + +wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get install -y build-essential + +# Check if the exact package exists and install +if apt-cache show "$PACKAGE_NAME" &> /dev/null; then + echo "Package '$PACKAGE_NAME' found. Installing..." + + sudo apt-get install -y --no-install-recommends "$PACKAGE_NAME" + + # Create Symbolic Link + # We use -snf to force the link creation and prevent dereferencing if it already exists + TARGET_DIR="/usr/local/cuda-$CUDA_VERSION_SHORT" + LINK_NAME="/usr/local/cuda" + + if [ -d "$TARGET_DIR" ]; then + echo "Creating symbolic link: $LINK_NAME -> $TARGET_DIR" + sudo ln -snf "$TARGET_DIR" "$LINK_NAME" + + echo "----------------------------------------------------------------" + echo "Success! Samples installed." + echo "Symlink created at $LINK_NAME" + echo "Binaries located at: $LINK_NAME/extras/demo_suite/" + echo "----------------------------------------------------------------" + else + echo "Error: Installation completed, but target directory $TARGET_DIR was not found." + exit 1 + fi +else + echo "Error: Package '$PACKAGE_NAME' was not found in your repositories." 
+ exit 1 +fi \ No newline at end of file diff --git a/cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh new file mode 100644 index 0000000000..47b2534b1a --- /dev/null +++ b/cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# setup_gpu_apps.sh - Provisioning script for Packer, executed via Shell Provisioner. +set -euo pipefail + +echo "--- Starting Packer Provisioning $(date) ---" + +# --- Input Variables from PACKER_VAR_* environment variables --- +PROJECT_ID="${PACKER_VAR_project_id}" +GPU_DRIVER_VERSION="${PACKER_VAR_gpu_driver_version}" # e.g., 535.161.01 +CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 +BUILD_ID="${PACKER_VAR_build_id}" + +echo "Fetched variables from PACKER_VAR_*:" +echo " PROJECT_ID: ${PROJECT_ID}" +echo " GPU_DRIVER_VERSION: ${GPU_DRIVER_VERSION}" +echo " CUDA_VERSION: ${CUDA_VERSION}" +echo " BUILD_ID: ${BUILD_ID}" + +CUDA_VERSION_SHORT="${CUDA_VERSION%.*}" +CUDA_VERSION_DASHED="${CUDA_VERSION_SHORT//./-}" +PACKAGE_NAME="cuda-${CUDA_VERSION_DASHED}" + +echo "Target package name: $PACKAGE_NAME" + +wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get install -y build-essential + +# Check if the exact package exists and install +if apt-cache show "$PACKAGE_NAME" &> /dev/null; then + echo "Package '$PACKAGE_NAME' found. Installing..." 
+ + sudo apt-get install -y --no-install-recommends "$PACKAGE_NAME" + + # Create Symbolic Link + # We use -snf to force the link creation and prevent dereferencing if it already exists + TARGET_DIR="/usr/local/cuda-$CUDA_VERSION_SHORT" + LINK_NAME="/usr/local/cuda" + + if [ -d "$TARGET_DIR" ]; then + echo "Creating symbolic link: $LINK_NAME -> $TARGET_DIR" + sudo ln -snf "$TARGET_DIR" "$LINK_NAME" + + echo "----------------------------------------------------------------" + echo "Success! Samples installed." + echo "Symlink created at $LINK_NAME" + echo "Binaries located at: $LINK_NAME/extras/demo_suite/" + echo "----------------------------------------------------------------" + else + echo "Error: Installation completed, but target directory $TARGET_DIR was not found." + exit 1 + fi +else + echo "Error: Package '$PACKAGE_NAME' was not found in your repositories." + exit 1 +fi \ No newline at end of file From f1426575d994232b857cad32fdcff9c6cbee58a9 Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Fri, 2 Jan 2026 16:35:55 +0000 Subject: [PATCH 2/7] Update image building scripts --- .../gpu-image-builder/check_source_image.sh | 10 +++- cloudbuild/gpu-image-builder/cloudbuild.yaml | 5 +- .../scripts/debian-11/setup_vm.sh | 9 +++ .../scripts/debian-12/post_reboot.sh | 60 +------------------ .../scripts/debian-12/setup_vm.sh | 22 ++----- .../scripts/debian-13/setup_vm.sh | 21 +++---- cloudbuild/gpu-image-builder/scripts/noop.sh | 2 + .../scripts/rocky-linux-8/setup_vm.sh | 57 +++--------------- .../scripts/rocky-linux-9/setup_vm.sh | 57 +++--------------- .../scripts/sles-15/setup_vm.sh | 31 ++++------ .../scripts/ubuntu-2204-lts/setup_vm.sh | 60 +++---------------- .../scripts/ubuntu-2404-lts-amd64/setup_vm.sh | 60 +++---------------- 12 files changed, 82 insertions(+), 312 deletions(-) diff --git a/cloudbuild/gpu-image-builder/check_source_image.sh b/cloudbuild/gpu-image-builder/check_source_image.sh index f0b21b2b92..6f20467f1b 100644 --- 
a/cloudbuild/gpu-image-builder/check_source_image.sh +++ b/cloudbuild/gpu-image-builder/check_source_image.sh @@ -1,6 +1,6 @@ #!/bin/bash # check_source_image.sh -# Checks if the latest public image is newer than the source of our last build. +# Checks if the latest public image is newer than the source of our last build and if we need a new build set -euo pipefail @@ -8,6 +8,8 @@ PROJECT_ID="${1}" SOURCE_IMAGE_FAMILY="${2}" SOURCE_IMAGE_PROJECT="${3}" TARGET_IMAGE_FAMILY="${4}" +# Louhi set trigger type as either "cron-trigger" or "git-change-trigger" +LOUHI_TRIGGER_TYPE="${5}" echo "--- Checking for New Source Image ---" LATEST_PUBLIC_IMAGE=$(gcloud compute images describe-from-family "${SOURCE_IMAGE_FAMILY}" --project="${SOURCE_IMAGE_PROJECT}" --format="value(name)") @@ -21,9 +23,13 @@ else echo "Image family '${TARGET_IMAGE_FAMILY}' not found. Assuming this is the first build." fi -if [[ "${LATEST_PUBLIC_IMAGE}" == "${LAST_CURATED_SOURCE_IMAGE}" ]]; then +# Only skip when running nightly, and there is no new base image +if [[ "${LATEST_PUBLIC_IMAGE}" == "${LAST_CURATED_SOURCE_IMAGE}" ]] && \ + [[ "${LOUHI_TRIGGER_TYPE}" == "cron-trigger" ]]; then echo "Source image '${LATEST_PUBLIC_IMAGE}' has not changed. Signaling to skip build." echo "SKIP" > /workspace/build_status.txt +# Else, we either have a new image, or this is trigger by git changes +# Note that we set the Louhi Git trigger to only watch cloudbuild/gpu-image-builder directory else echo "New source image '${LATEST_PUBLIC_IMAGE}' detected or first run. Signaling to run build." 
echo "${LATEST_PUBLIC_IMAGE}" > /workspace/new_source_image.txt diff --git a/cloudbuild/gpu-image-builder/cloudbuild.yaml b/cloudbuild/gpu-image-builder/cloudbuild.yaml index 4b4a95c93b..1786eb641f 100644 --- a/cloudbuild/gpu-image-builder/cloudbuild.yaml +++ b/cloudbuild/gpu-image-builder/cloudbuild.yaml @@ -11,7 +11,8 @@ steps: ./check_source_image.sh "${PROJECT_ID}" \ "${_LOUHI_PARAM_SOURCE_IMAGE_FAMILY}" \ "${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \ - "${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" + "${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" \ + "${_LOUHI_TRIGGER_TYPE}" waitFor: ['-'] # Conditionally build the Packer builder image. Runs 'build_packer_builder.sh'. @@ -39,7 +40,7 @@ steps: /usr/bin/packer build \ -var "project_id=${PROJECT_ID}" \ - -var "image_name=${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}-$(date +%m-%d-%Y)" \ + -var "image_name=${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}-$(date -u +%Y%m%d-%H%M%S)" \ -var "image_family=${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" \ -var "source_image=$(cat /workspace/new_source_image.txt)" \ -var "source_image_project=${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \ diff --git a/cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh index 004c2e34da..5123ade43a 100644 --- a/cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh +++ b/cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh @@ -1 +1,10 @@ +#!/bin/bash +# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner. +set -euo pipefail + +# Source Image: ml-images:common-gpu-debian-11-py310 +# Source Image description: Google, Deep Learning VM with CUDA 11.8, M126, Debian 11, Python 3.10. With CUDA 11.8 preinstalled. +# Output Image: stackdriver-test-143416:debian-11 + +# DLVM images come with a script to install the driver and CUDA toolkit. 
/opt/deeplearning/install-driver.sh \ No newline at end of file diff --git a/cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh b/cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh index 4795a1b5f0..00a026ae69 100644 --- a/cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh +++ b/cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh @@ -1,66 +1,12 @@ #!/bin/bash -# post_reboot_gpu_setup.sh - Runs setup steps after the VM has rebooted on Debian 12. +# post_reboot.sh - Runs setup steps after the VM has rebooted on Debian 12. Provisioning script for Packer, executed via Shell Provisioner. set -euo pipefail -echo "--- Starting Packer Post-Reboot Provisioning on Debian 12 $(date) ---" -PROJECT_ID="${PACKER_VAR_project_id}" -CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 -BUILD_ID="${PACKER_VAR_build_id}" - -echo "Fetched variables from PACKER_VAR_*:" -echo " PROJECT_ID: ${PROJECT_ID}" -echo " CUDA_VERSION: ${CUDA_VERSION}" -echo " BUILD_ID: ${BUILD_ID}" - -# --- Persistent Installer Path --- INSTALLER_DIR="/var/lib/cuda-installer" CUDA_INSTALLER_PATH="${INSTALLER_DIR}/cuda_installer.pyz" -# Ensure the installer exists (it should, as it was downloaded in the first phase) -if [ ! -f "${CUDA_INSTALLER_PATH}" ]; then - echo "ERROR: cuda_installer.pyz not found at ${CUDA_INSTALLER_PATH}!" 
- exit 1 -fi - -echo "Running cuda_installer.pyz install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb" +# Rerun `install_driver` to finish driver installation sudo python3 "${CUDA_INSTALLER_PATH}" install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb || { echo "ERROR: cuda_installer.pyz install_driver failed!"; exit 1; } -echo "Running cuda_installer.pyz install_cuda --ignore-no-gpu --installation-mode=repo --installation-branch=nfb" +# Install CUDA toolkit sudo python3 "${CUDA_INSTALLER_PATH}" install_cuda --ignore-no-gpu --installation-mode=repo --installation-branch=nfb || { echo "ERROR: cuda_installer.pyz install_cuda failed!"; exit 1; } - -# # --- Install CUDA Samples --- -# echo "--- Installing CUDA Samples for ${CUDA_VERSION} ---" -# # Convert CUDA_VERSION (e.g., 12.2.2) to the apt package format (e.g., 12-2) -# CUDA_VERSION_SHORT="${CUDA_VERSION%.*}" # 12.2 -# CUDA_VERSION_DASHED="${CUDA_VERSION_SHORT//./-}" # 12-2 -# CUDA_SAMPLES_PACKAGE="cuda-demo-suite-${CUDA_VERSION_DASHED}" - -# # Ensure NVIDIA repo is added - it should have been by cuda_installer.pyz or is pre-configured -# sudo apt-get update -y - -# if apt-cache show "${CUDA_SAMPLES_PACKAGE}" &> /dev/null; then -# echo "Package '${CUDA_SAMPLES_PACKAGE}' found. Installing..." -# sudo apt-get install -y --no-install-recommends "${CUDA_SAMPLES_PACKAGE}" || { echo "ERROR: Failed to install ${CUDA_SAMPLES_PACKAGE}!"; exit 1; } -# echo "CUDA Samples installed." -# else -# echo "WARNING: CUDA Samples package '${CUDA_SAMPLES_PACKAGE}' not found in repositories." -# echo "You may need to manually add the NVIDIA repos or check package naming for Debian 12." 
-# fi - -# # --- Create Symbolic Link --- -# TARGET_DIR="/usr/local/cuda-${CUDA_VERSION_SHORT}" -# LINK_NAME="/usr/local/cuda" -# if [ -d "$TARGET_DIR" ]; then -# echo "Creating symbolic link: $LINK_NAME -> $TARGET_DIR" -# sudo ln -snf "$TARGET_DIR" "$LINK_NAME" -# echo "Symlink created at $LINK_NAME" -# else -# echo "WARNING: Target directory $TARGET_DIR not found for symlink." -# fi - -# echo "--- Cleaning up ---" -# sudo apt-get clean -# sudo rm -rf /var/lib/apt/lists/* -# # The cuda_installer.pyz is deliberately left in ${INSTALLER_DIR} as requested. -# echo "--- Packer post-reboot setup complete $(date) ---" -# echo "--- Provisioning script finished $(date) ---" diff --git a/cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh index b72cef1291..5e75b56794 100644 --- a/cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh +++ b/cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh @@ -1,28 +1,18 @@ #!/bin/bash -# initial_gpu_setup.sh - Runs initial setup and GPU driver installation on Debian 12. +# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner. 
set -euo pipefail -echo "--- Starting Packer Initial Provisioning on Debian 12 $(date) ---" -# --- Input Variables from PACKER_VAR_* environment variables --- -PROJECT_ID="${PACKER_VAR_project_id}" -# GPU_DRIVER_VERSION is not directly used by cuda_installer.pyz install_driver -BUILD_ID="${PACKER_VAR_build_id}" +# Source Image: debian-cloud:debian-12 +# Output Image: stackdriver-test-143416:debian-12 -echo "Fetched variables from PACKER_VAR_*:" -echo " PROJECT_ID: ${PROJECT_ID}" -echo " BUILD_ID: ${BUILD_ID}" +sudo apt update -y +sudo apt install -y --no-install-recommends python3 python3-pip wget curl gnupg git || { echo "ERROR: Failed to install prerequisites!"; exit 1; } -echo "--- Running apt updates and installing prerequisites ---" -sudo apt-get update -y -sudo apt-get install -y --no-install-recommends python3 python3-pip wget curl gnupg git || { echo "ERROR: Failed to install prerequisites!"; exit 1; } - -echo "--- Installing GPU Driver using cuda_installer.pyz ---" INSTALLER_DIR="/var/lib/cuda-installer" CUDA_INSTALLER_PATH="${INSTALLER_DIR}/cuda_installer.pyz" sudo mkdir -p "${INSTALLER_DIR}" sudo curl -L https://storage.googleapis.com/compute-gpu-installation-us/installer/latest/cuda_installer.pyz --output "${CUDA_INSTALLER_PATH}" sudo chmod +x "${CUDA_INSTALLER_PATH}" -echo "Running cuda_installer.pyz install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb" sudo python3 "${CUDA_INSTALLER_PATH}" install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb || { echo "ERROR: cuda_installer.pyz install_driver failed!"; exit 1; } -# The script will reboot \ No newline at end of file +# The script will reboot diff --git a/cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh index f7f4295bb2..3a2632e756 100644 --- a/cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh +++ b/cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh @@ -1,25 
+1,18 @@ #!/bin/bash -# initial_gpu_setup.sh - Runs initial setup and GPU driver installation on Debian 13. +# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner. set -euo pipefail -echo "--- Starting Packer Initial Provisioning on Debian 13 $(date) ---" -# --- Input Variables from PACKER_VAR_* environment variables --- -PROJECT_ID="${PACKER_VAR_project_id}" -# GPU_DRIVER_VERSION is not directly used by cuda_installer.pyz install_driver -BUILD_ID="${PACKER_VAR_build_id}" +# Source Image: debian-cloud:debian-13 +# Output Image: stackdriver-test-143416:debian-13 -echo "Fetched variables from PACKER_VAR_*:" -echo " PROJECT_ID: ${PROJECT_ID}" -echo " BUILD_ID: ${BUILD_ID}" - - -sudo apt update +# Install driver and CUDA toolkit +sudo apt update -y KERNEL_VERSION=`uname -r` sudo apt install -y linux-headers-${KERNEL_VERSION} pciutils gcc make dkms wget git wget https://developer.download.nvidia.com/compute/cuda/repos/debian13/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb -sudo apt-get update +sudo apt update -sudo apt-get -y install cuda-13-1 +sudo apt -y install cuda-13-1 diff --git a/cloudbuild/gpu-image-builder/scripts/noop.sh b/cloudbuild/gpu-image-builder/scripts/noop.sh index e69de29bb2..138e5ca3a8 100644 --- a/cloudbuild/gpu-image-builder/scripts/noop.sh +++ b/cloudbuild/gpu-image-builder/scripts/noop.sh @@ -0,0 +1,2 @@ +#!/bin/bash +# Empty Script as a placeholder for noop steps \ No newline at end of file diff --git a/cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh index 910598494a..28cefc0cdb 100644 --- a/cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh +++ b/cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh @@ -1,55 +1,14 @@ #!/bin/bash -# setup_gpu_apps.sh - Provisioning script for Packer on Rocky Linux 8, executed via Shell Provisioner. 
+# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner. set -euo pipefail -echo "--- Starting Packer Provisioning on Rocky Linux 8 $(date) ---" +# Source Image: rocky-linux-accelerator-cloud:rocky-linux-8-optimized-gcp-nvidia-580 +# Source Image Description: Rocky Linux, Rocky Linux, 8 with the Nvidia 580 driver, x86_64 optimized for GCP built on {date} +# Output Image: stackdriver-test-143416:rocky-linux-8 -# --- Input Variables from PACKER_VAR_* environment variables --- -PROJECT_ID="${PACKER_VAR_project_id}" -GPU_DRIVER_VERSION="${PACKER_VAR_gpu_driver_version}" # e.g., 535.161.01 -CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 -BUILD_ID="${PACKER_VAR_build_id}" - -echo "Fetched variables from PACKER_VAR_*:" -echo " PROJECT_ID: ${PROJECT_ID}" -echo " GPU_DRIVER_VERSION: ${GPU_DRIVER_VERSION}" -echo " CUDA_VERSION: ${CUDA_VERSION}" -echo " BUILD_ID: ${BUILD_ID}" - -# Convert CUDA_VERSION (e.g., 12.2.2) to the format used in package names (e.g., 12-2) -CUDA_VERSION_SHORT="${CUDA_VERSION%.*}" # 12.2 -CUDA_VERSION_DASHED="${CUDA_VERSION_SHORT//./-}" # 12-2 - -# --- NVIDIA Repository Setup for Rocky Linux 8 --- -echo "--- Configuring NVIDIA CUDA Repository for Rocky Linux 8 ---" +# The accelerator image already has the driver (R580) installed. +# Follow https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Rocky&target_version=8&target_type=rpm_network +# to install the matching CUDA toolkit 13.0 (without driver) sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo - sudo dnf clean all -sudo dnf makecache -sudo dnf install -y git make - -# --- Install CUDA Toolkit and Samples --- -echo "--- Installing CUDA Toolkit ${CUDA_VERSION} and Samples ---" - -echo "Installing CUDA Toolkit and Samples..." 
-sudo dnf install -y cuda-"${CUDA_VERSION_DASHED}" - -# # --- Create Symbolic Link --- -# # CUDA on Linux often installs to /usr/local/cuda-${CUDA_VERSION_SHORT} -# TARGET_DIR="/usr/local/cuda-${CUDA_VERSION_SHORT}" -# LINK_NAME="/usr/local/cuda" - -# if [ -d "$TARGET_DIR" ]; then -# echo "Creating symbolic link: $LINK_NAME -> $TARGET_DIR" -# sudo ln -snf "$TARGET_DIR" "$LINK_NAME" - -# echo "----------------------------------------------------------------" -# echo "Success! CUDA Toolkit and Samples installed." -# echo "Symlink created at $LINK_NAME" -# echo "Samples located at: $LINK_NAME/samples/" -# echo "----------------------------------------------------------------" -# else -# echo "Error: Installation completed, but target directory $TARGET_DIR was not found." -# exit 1 -# fi -echo "--- Provisioning script finished $(date) ---" +sudo dnf -y install cuda-toolkit-13-0 git make diff --git a/cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh index 1507e853e5..5c470d4b9e 100644 --- a/cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh +++ b/cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh @@ -1,55 +1,14 @@ #!/bin/bash -# setup_gpu_apps.sh - Provisioning script for Packer on Rocky Linux 9, executed via Shell Provisioner. +# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner. 
set -euo pipefail -echo "--- Starting Packer Provisioning on Rocky Linux 9 $(date) ---" +# Source Image: rocky-linux-accelerator-cloud:rocky-linux-9-optimized-gcp-nvidia-580 +# Source Image Description: Rocky Linux, Rocky Linux, 9 with the Nvidia 580 driver, x86_64 optimized for GCP with the Nvidia 580 driver built on {date} +# Output Image: stackdriver-test-143416:rocky-linux-9 -# --- Input Variables from PACKER_VAR_* environment variables --- -PROJECT_ID="${PACKER_VAR_project_id}" -GPU_DRIVER_VERSION="${PACKER_VAR_gpu_driver_version}" # e.g., 535.161.01 -CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 -BUILD_ID="${PACKER_VAR_build_id}" - -echo "Fetched variables from PACKER_VAR_*:" -echo " PROJECT_ID: ${PROJECT_ID}" -echo " GPU_DRIVER_VERSION: ${GPU_DRIVER_VERSION}" -echo " CUDA_VERSION: ${CUDA_VERSION}" -echo " BUILD_ID: ${BUILD_ID}" - -# Convert CUDA_VERSION (e.g., 12.2.2) to the format used in package names (e.g., 12-2) -CUDA_VERSION_SHORT="${CUDA_VERSION%.*}" # 12.2 -CUDA_VERSION_DASHED="${CUDA_VERSION_SHORT//./-}" # 12-2 - -# --- NVIDIA Repository Setup for Rocky Linux 9 --- -echo "--- Configuring NVIDIA CUDA Repository for Rocky Linux 9 ---" +# The accelerator image already has the driver (R580) installed. +# Follow https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Rocky&target_version=9&target_type=rpm_network +# to install the matching CUDA toolkit 13.0 (without driver) sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo - sudo dnf clean all -sudo dnf makecache -sudo dnf install -y git make - -# --- Install CUDA Toolkit and Samples --- -echo "--- Installing CUDA Toolkit ${CUDA_VERSION} and Samples ---" - -echo "Installing CUDA Toolkit and Samples..." 
-sudo dnf install -y cuda-"${CUDA_VERSION_DASHED}" - -# # --- Create Symbolic Link --- -# # CUDA on Linux often installs to /usr/local/cuda-${CUDA_VERSION_SHORT} -# TARGET_DIR="/usr/local/cuda-${CUDA_VERSION_SHORT}" -# LINK_NAME="/usr/local/cuda" - -# if [ -d "$TARGET_DIR" ]; then -# echo "Creating symbolic link: $LINK_NAME -> $TARGET_DIR" -# sudo ln -snf "$TARGET_DIR" "$LINK_NAME" - -# echo "----------------------------------------------------------------" -# echo "Success! CUDA Toolkit and Samples installed." -# echo "Symlink created at $LINK_NAME" -# echo "Samples located at: $LINK_NAME/samples/" -# echo "----------------------------------------------------------------" -# else -# echo "Error: Installation completed, but target directory $TARGET_DIR was not found." -# exit 1 -# fi -echo "--- Provisioning script finished $(date) ---" +sudo dnf -y install cuda-toolkit-13-0 git make \ No newline at end of file diff --git a/cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh index e033762b57..0f106cbbdb 100644 --- a/cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh +++ b/cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh @@ -1,21 +1,12 @@ #!/bin/bash -# setup_gpu_apps.sh - Provisioning script for Packer, executed via Shell Provisioner. +# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner. 
set -euo pipefail -echo "--- Starting Packer Provisioning $(date) ---" - -# --- Input Variables from PACKER_VAR_* environment variables --- -PROJECT_ID="${PACKER_VAR_project_id}" -GPU_DRIVER_VERSION="${PACKER_VAR_gpu_driver_version}" # e.g., 535.161.01 -CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 -BUILD_ID="${PACKER_VAR_build_id}" - -echo "Fetched variables from PACKER_VAR_*:" -echo " PROJECT_ID: ${PROJECT_ID}" -echo " GPU_DRIVER_VERSION: ${GPU_DRIVER_VERSION}" -echo " CUDA_VERSION: ${CUDA_VERSION}" -echo " BUILD_ID: ${BUILD_ID}" +# Source Image: suse-cloud:sles-15 +# Output Image: stackdriver-test-143416:sles-15 +# Mimic our prepareSLES() logic in gce_testing.go +# https://github.com/GoogleCloudPlatform/opentelemetry-operations-collector/blob/ec757f2f48c865c7aa1afaed27891d8727a28f2e/integration_test/gce-testing-internal/gce/gce_testing.go#L1057 retry_command() { local max_attempts="$1" local sleep_time="$2" @@ -54,14 +45,12 @@ retry_command 120 5 "sudo zypper --non-interactive --gpg-auto-import-keys refres sudo zypper --non-interactive install -y kernel-default-devel=$(uname -r | sed 's/\-default//') pciutils gcc make wget git # Install CUDA and driver together, since the `exercise` script needs to run a -# CUDA sample app to generating GPU process metrics +# CUDA app to generating GPU process metrics # Prefer to install from the package manager since it is normally faster and has -# less errors on installation; fallback to the runfile method if the package -# manager's package is not working or not compitible with the GPU model -DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//') -echo "Installing latest version of NVIDIA CUDA and driver" -sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo +# less errors on installation. 
The cuda-12-9 mega-package installs driver and +# CUDA together +sudo zypper --non-interactive addrepo https://developer.download.nvidia.com/compute/cuda/repos/sles15/x86_64/cuda-sles15.repo sudo zypper --gpg-auto-import-keys --non-interactive refresh +# CUDA 13 is not yet working with the SLES 15 image sudo zypper --non-interactive install -y nvidia-compute-utils-G06 sudo zypper --non-interactive install -y cuda-12-9 - diff --git a/cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh index db6dc0f735..7719319a07 100644 --- a/cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh +++ b/cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh @@ -1,57 +1,15 @@ #!/bin/bash -# setup_gpu_apps.sh - Provisioning script for Packer, executed via Shell Provisioner. +# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner. set -euo pipefail -echo "--- Starting Packer Provisioning $(date) ---" +# Source Image: ubuntu-os-accelerator-images:ubuntu-accelerator-2204-amd64-with-nvidia-580 +# Source Image Description: Canonical, Ubuntu, 22.04 LTS NVIDIA version: 580, amd64 jammy image built on {date} +# Output Image: stackdriver-test-143416:ubuntu-2204-lts -# --- Input Variables from PACKER_VAR_* environment variables --- -PROJECT_ID="${PACKER_VAR_project_id}" -GPU_DRIVER_VERSION="${PACKER_VAR_gpu_driver_version}" # e.g., 535.161.01 -CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 -BUILD_ID="${PACKER_VAR_build_id}" - -echo "Fetched variables from PACKER_VAR_*:" -echo " PROJECT_ID: ${PROJECT_ID}" -echo " GPU_DRIVER_VERSION: ${GPU_DRIVER_VERSION}" -echo " CUDA_VERSION: ${CUDA_VERSION}" -echo " BUILD_ID: ${BUILD_ID}" - -CUDA_VERSION_SHORT="${CUDA_VERSION%.*}" -CUDA_VERSION_DASHED="${CUDA_VERSION_SHORT//./-}" -PACKAGE_NAME="cuda-${CUDA_VERSION_DASHED}" - -echo "Target package name: $PACKAGE_NAME" - -wget --no-verbose 
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +# The accelerator image already has the driver (R580) installed. +# Follow https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network +# to install the matching CUDA toolkit 13.0 (without driver) +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update -sudo apt-get install -y build-essential - -# Check if the exact package exists and install -if apt-cache show "$PACKAGE_NAME" &> /dev/null; then - echo "Package '$PACKAGE_NAME' found. Installing..." - - sudo apt-get install -y --no-install-recommends "$PACKAGE_NAME" - - # Create Symbolic Link - # We use -snf to force the link creation and prevent dereferencing if it already exists - TARGET_DIR="/usr/local/cuda-$CUDA_VERSION_SHORT" - LINK_NAME="/usr/local/cuda" - - if [ -d "$TARGET_DIR" ]; then - echo "Creating symbolic link: $LINK_NAME -> $TARGET_DIR" - sudo ln -snf "$TARGET_DIR" "$LINK_NAME" - - echo "----------------------------------------------------------------" - echo "Success! Samples installed." - echo "Symlink created at $LINK_NAME" - echo "Binaries located at: $LINK_NAME/extras/demo_suite/" - echo "----------------------------------------------------------------" - else - echo "Error: Installation completed, but target directory $TARGET_DIR was not found." - exit 1 - fi -else - echo "Error: Package '$PACKAGE_NAME' was not found in your repositories." 
- exit 1 -fi \ No newline at end of file +sudo apt-get -y install build-essential cuda-toolkit-13-0 \ No newline at end of file diff --git a/cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh index 47b2534b1a..2ea45ce29c 100644 --- a/cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh +++ b/cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh @@ -1,57 +1,15 @@ #!/bin/bash -# setup_gpu_apps.sh - Provisioning script for Packer, executed via Shell Provisioner. +# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner. set -euo pipefail -echo "--- Starting Packer Provisioning $(date) ---" +# Source Image: ubuntu-os-accelerator-images:ubuntu-accelerator-2404-amd64-with-nvidia-580 +# Source Image Description: Canonical, Ubuntu, 24.04 LTS NVIDIA version: 580, amd64 noble image built on {date} +# Output Image: stackdriver-test-143416:ubuntu-2404-lts -# --- Input Variables from PACKER_VAR_* environment variables --- -PROJECT_ID="${PACKER_VAR_project_id}" -GPU_DRIVER_VERSION="${PACKER_VAR_gpu_driver_version}" # e.g., 535.161.01 -CUDA_VERSION="${PACKER_VAR_cuda_version}" # e.g., 12.2.2 -BUILD_ID="${PACKER_VAR_build_id}" - -echo "Fetched variables from PACKER_VAR_*:" -echo " PROJECT_ID: ${PROJECT_ID}" -echo " GPU_DRIVER_VERSION: ${GPU_DRIVER_VERSION}" -echo " CUDA_VERSION: ${CUDA_VERSION}" -echo " BUILD_ID: ${BUILD_ID}" - -CUDA_VERSION_SHORT="${CUDA_VERSION%.*}" -CUDA_VERSION_DASHED="${CUDA_VERSION_SHORT//./-}" -PACKAGE_NAME="cuda-${CUDA_VERSION_DASHED}" - -echo "Target package name: $PACKAGE_NAME" - -wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb +# The accelerator image already has the driver (R580) installed. 
+# Follow https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=24.04&target_type=deb_network +# to install the matching CUDA toolkit 13.0 (without driver) +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update -sudo apt-get install -y build-essential - -# Check if the exact package exists and install -if apt-cache show "$PACKAGE_NAME" &> /dev/null; then - echo "Package '$PACKAGE_NAME' found. Installing..." - - sudo apt-get install -y --no-install-recommends "$PACKAGE_NAME" - - # Create Symbolic Link - # We use -snf to force the link creation and prevent dereferencing if it already exists - TARGET_DIR="/usr/local/cuda-$CUDA_VERSION_SHORT" - LINK_NAME="/usr/local/cuda" - - if [ -d "$TARGET_DIR" ]; then - echo "Creating symbolic link: $LINK_NAME -> $TARGET_DIR" - sudo ln -snf "$TARGET_DIR" "$LINK_NAME" - - echo "----------------------------------------------------------------" - echo "Success! Samples installed." - echo "Symlink created at $LINK_NAME" - echo "Binaries located at: $LINK_NAME/extras/demo_suite/" - echo "----------------------------------------------------------------" - else - echo "Error: Installation completed, but target directory $TARGET_DIR was not found." - exit 1 - fi -else - echo "Error: Package '$PACKAGE_NAME' was not found in your repositories." 
- exit 1 -fi \ No newline at end of file +sudo apt-get -y install build-essential cuda-toolkit-13-0 \ No newline at end of file From 46aa6aa99bf3a90198cb43b2ed956c18609cce08 Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Mon, 5 Jan 2026 18:31:23 +0000 Subject: [PATCH 3/7] Update the GCB script to run with Louhi --- .../gpu-image-builder/check_source_image.sh | 6 +++- cloudbuild/gpu-image-builder/cloudbuild.yaml | 16 ++++----- cloudbuild/gpu-image-builder/packer.pkr.hcl | 35 +++---------------- 3 files changed, 16 insertions(+), 41 deletions(-) diff --git a/cloudbuild/gpu-image-builder/check_source_image.sh b/cloudbuild/gpu-image-builder/check_source_image.sh index 6f20467f1b..d8507350e8 100644 --- a/cloudbuild/gpu-image-builder/check_source_image.sh +++ b/cloudbuild/gpu-image-builder/check_source_image.sh @@ -31,7 +31,11 @@ if [[ "${LATEST_PUBLIC_IMAGE}" == "${LAST_CURATED_SOURCE_IMAGE}" ]] && \ # Else, we either have a new image, or this is trigger by git changes # Note that we set the Louhi Git trigger to only watch cloudbuild/gpu-image-builder directory else - echo "New source image '${LATEST_PUBLIC_IMAGE}' detected or first run. Signaling to run build." + if [[ "${LATEST_PUBLIC_IMAGE}" != "${LAST_CURATED_SOURCE_IMAGE}" ]]; then + echo "New source image '${LATEST_PUBLIC_IMAGE}' detected or first run. Signaling to run build." 
+ else + echo "New image building triggered by GitHub changes (Louhi trigger type = '${LOUHI_TRIGGER_TYPE}')" + fi echo "${LATEST_PUBLIC_IMAGE}" > /workspace/new_source_image.txt echo "RUN" > /workspace/build_status.txt fi diff --git a/cloudbuild/gpu-image-builder/cloudbuild.yaml b/cloudbuild/gpu-image-builder/cloudbuild.yaml index 1786eb641f..f8015c8d6b 100644 --- a/cloudbuild/gpu-image-builder/cloudbuild.yaml +++ b/cloudbuild/gpu-image-builder/cloudbuild.yaml @@ -7,8 +7,8 @@ steps: args: - '-c' - | - chmod +x check_source_image.sh - ./check_source_image.sh "${PROJECT_ID}" \ + chmod +x louhi_ws/cloudbuild/gpu-image-builder/gpu-check_source_image.sh + louhi_ws/cloudbuild/gpu-image-builder/check_source_image.sh "${PROJECT_ID}" \ "${_LOUHI_PARAM_SOURCE_IMAGE_FAMILY}" \ "${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \ "${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" \ @@ -22,13 +22,13 @@ steps: args: - '-c' - | - chmod +x build_packer_builder.sh - ./build_packer_builder.sh "${PROJECT_ID}" + chmod +x louhi_ws/cloudbuild/gpu-image-builder/build_packer_builder.sh + louhi_ws/cloudbuild/gpu-image-builder/build_packer_builder.sh "${PROJECT_ID}" waitFor: ['-'] # Can run in parallel with check-source-image -# 2. Run Packer to build the GCE image, but only if 'check-source-image' signaled to RUN. +# Run Packer to build the GCE image, but only if 'check-source-image' signaled to RUN. 
- id: 'packer-build-gpu-image' - name: 'gcr.io/${PROJECT_ID}/packer' # Use the custom Packer builder image + name: 'gcr.io/${PROJECT_ID}/packer' entrypoint: 'bash' args: - '-c' @@ -44,11 +44,9 @@ steps: -var "image_family=${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" \ -var "source_image=$(cat /workspace/new_source_image.txt)" \ -var "source_image_project=${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \ - -var "gpu_driver_version=535.161.01" \ - -var "cuda_version=12.8.0" \ -var "zone=us-central1-a" \ -var "build_id=${BUILD_ID}" \ - packer.pkr.hcl + louhi_ws/cloudbuild/gpu-image-builder/packer.pkr.hcl waitFor: ['check-source-image', 'build-packer-builder'] timeout: 14400s diff --git a/cloudbuild/gpu-image-builder/packer.pkr.hcl b/cloudbuild/gpu-image-builder/packer.pkr.hcl index c0d63462dd..d9f28ddbf8 100644 --- a/cloudbuild/gpu-image-builder/packer.pkr.hcl +++ b/cloudbuild/gpu-image-builder/packer.pkr.hcl @@ -24,19 +24,6 @@ variable "source_image_project" { description = "The specific source GCE image project (e.g., ubuntu-os-cloud)" } - -variable "gpu_driver_version" { - type = string - default = "535.161.01" // Pin specific NVIDIA driver version - description = "Specific NVIDIA GPU driver version to install" -} - -variable "cuda_version" { - type = string - default = "12.2.2" // Pin specific CUDA Toolkit version - description = "Specific CUDA Toolkit version to install" -} - variable "zone" { type = string default = "us-central1-a" @@ -72,32 +59,18 @@ source "googlecompute" "gpu_image" { build { sources = ["source.googlecompute.gpu_image"] + + // Provisioner 1: Most distros only need one step provisioner "shell" { script = "./scripts/${var.image_family}/setup_vm.sh" - # Packer will pass these variables as PACKER_VAR_* env vars - environment_vars = [ - "PACKER_VAR_project_id=${var.project_id}", - "PACKER_VAR_gpu_driver_version=${var.gpu_driver_version}", - "PACKER_VAR_cuda_version=${var.cuda_version}", - "PACKER_VAR_build_id=${var.build_id}" - ] - # Expect a disconnect/reboot 
after GPU driver install - expect_disconnect = true - # Give some time for SSH to come back up + expect_disconnect = true // Expect a disconnect/reboot after GPU driver install timeout = "240m" } // Provisioner 2: Handles the post-reboot part, ONLY for Debian 12. provisioner "shell" { script = var.image_family == "debian-12" ? "./scripts/${var.image_family}/post_reboot.sh" : "./scripts/noop.sh" - environment_vars = [ - "PACKER_VAR_project_id=${var.project_id}", - "PACKER_VAR_gpu_driver_version=${var.gpu_driver_version}", - "PACKER_VAR_cuda_version=${var.cuda_version}", - "PACKER_VAR_build_id=${var.build_id}" - ] - # Wait for the reboot to be complete - pause_before = "60s" + pause_before = "60s" // Wait for the reboot to be complete expect_disconnect = false // No reboot expected in this second phase. timeout = "240m" } From 4e13ed5d100c7985b96d12ce9d78818b7da68b23 Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Mon, 5 Jan 2026 18:48:29 +0000 Subject: [PATCH 4/7] Update GCB path again --- cloudbuild/gpu-image-builder/cloudbuild.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cloudbuild/gpu-image-builder/cloudbuild.yaml b/cloudbuild/gpu-image-builder/cloudbuild.yaml index f8015c8d6b..7a78ee3015 100644 --- a/cloudbuild/gpu-image-builder/cloudbuild.yaml +++ b/cloudbuild/gpu-image-builder/cloudbuild.yaml @@ -7,8 +7,8 @@ steps: args: - '-c' - | - chmod +x louhi_ws/cloudbuild/gpu-image-builder/gpu-check_source_image.sh - louhi_ws/cloudbuild/gpu-image-builder/check_source_image.sh "${PROJECT_ID}" \ + chmod +x /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/check_source_image.sh + /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/check_source_image.sh "${PROJECT_ID}" \ "${_LOUHI_PARAM_SOURCE_IMAGE_FAMILY}" \ "${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \ "${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" \ @@ -22,8 +22,8 @@ steps: args: - '-c' - | - chmod +x louhi_ws/cloudbuild/gpu-image-builder/build_packer_builder.sh - 
louhi_ws/cloudbuild/gpu-image-builder/build_packer_builder.sh "${PROJECT_ID}" + chmod +x /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/build_packer_builder.sh + /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/build_packer_builder.sh "${PROJECT_ID}" waitFor: ['-'] # Can run in parallel with check-source-image # Run Packer to build the GCE image, but only if 'check-source-image' signaled to RUN. @@ -46,7 +46,7 @@ steps: -var "source_image_project=${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \ -var "zone=us-central1-a" \ -var "build_id=${BUILD_ID}" \ - louhi_ws/cloudbuild/gpu-image-builder/packer.pkr.hcl + /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/packer.pkr.hcl waitFor: ['check-source-image', 'build-packer-builder'] timeout: 14400s From d027f7ff95f6a0e35de60864de35ff2fc26409c3 Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Tue, 6 Jan 2026 18:52:20 +0000 Subject: [PATCH 5/7] Specify SA for nested GCB --- cloudbuild/gpu-image-builder/build_packer_builder.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudbuild/gpu-image-builder/build_packer_builder.sh b/cloudbuild/gpu-image-builder/build_packer_builder.sh index 594db7f4c3..d68cb8a5c3 100644 --- a/cloudbuild/gpu-image-builder/build_packer_builder.sh +++ b/cloudbuild/gpu-image-builder/build_packer_builder.sh @@ -3,7 +3,7 @@ # Builds the custom Packer Cloud Build builder if it doesn't exist. # https://docs.cloud.google.com/build/docs/building/build-vm-images-with-packer -set -euo pipefail +set -xeuo pipefail PROJECT_ID="${1}" PACKER_BUILDER_IMAGE="gcr.io/${PROJECT_ID}/packer" @@ -14,7 +14,7 @@ else echo "Packer builder image not found. Building it now..." git clone https://github.com/GoogleCloudPlatform/cloud-builders-community.git --depth=1 cd cloud-builders-community/packer - gcloud builds submit --project="${PROJECT_ID}" . 
+ gcloud builds submit --project="${PROJECT_ID}" --service-account=projects/stackdriver-test-143416/serviceAccounts/build-and-test@stackdriver-test-143416.iam.gserviceaccount.com . cd - echo "Packer builder image built." fi \ No newline at end of file From 8a141382900e16d070be542bb96fe9992acaedca Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Wed, 7 Jan 2026 02:49:49 +0000 Subject: [PATCH 6/7] Using git source for nested GCB --- cloudbuild/gpu-image-builder/build_packer_builder.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cloudbuild/gpu-image-builder/build_packer_builder.sh b/cloudbuild/gpu-image-builder/build_packer_builder.sh index d68cb8a5c3..6b75fd3422 100644 --- a/cloudbuild/gpu-image-builder/build_packer_builder.sh +++ b/cloudbuild/gpu-image-builder/build_packer_builder.sh @@ -14,7 +14,13 @@ else echo "Packer builder image not found. Building it now..." git clone https://github.com/GoogleCloudPlatform/cloud-builders-community.git --depth=1 cd cloud-builders-community/packer - gcloud builds submit --project="${PROJECT_ID}" --service-account=projects/stackdriver-test-143416/serviceAccounts/build-and-test@stackdriver-test-143416.iam.gserviceaccount.com . + gcloud builds submit \ + https://github.com/GoogleCloudPlatform/cloud-builders-community \ + --git-source-revision=master \ + --git-source-dir=./packer/ \ + --project="${PROJECT_ID}" \ + --service-account=projects/stackdriver-test-143416/serviceAccounts/build-and-test@stackdriver-test-143416.iam.gserviceaccount.com \ + --gcs-log-dir=gs://cloud-built-otel-collector-buckets-test-logs cd - echo "Packer builder image built." 
fi \ No newline at end of file From 29a4314634058af53e5aab2627f89dceae184573 Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Wed, 7 Jan 2026 03:12:34 +0000 Subject: [PATCH 7/7] Update packer config file paths --- cloudbuild/gpu-image-builder/packer.pkr.hcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudbuild/gpu-image-builder/packer.pkr.hcl b/cloudbuild/gpu-image-builder/packer.pkr.hcl index d9f28ddbf8..3700846f9b 100644 --- a/cloudbuild/gpu-image-builder/packer.pkr.hcl +++ b/cloudbuild/gpu-image-builder/packer.pkr.hcl @@ -62,14 +62,14 @@ build { // Provisioner 1: Most distros only need one step provisioner "shell" { - script = "./scripts/${var.image_family}/setup_vm.sh" + script = "/workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/scripts/${var.image_family}/setup_vm.sh" expect_disconnect = true // Expect a disconnect/reboot after GPU driver install timeout = "240m" } // Provisioner 2: Handles the post-reboot part, ONLY for Debian 12. provisioner "shell" { - script = var.image_family == "debian-12" ? "./scripts/${var.image_family}/post_reboot.sh" : "./scripts/noop.sh" + script = var.image_family == "debian-12" ? "/workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/scripts/${var.image_family}/post_reboot.sh" : "/workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/scripts/noop.sh" pause_before = "60s" // Wait for the reboot to be complete expect_disconnect = false // No reboot expected in this second phase. timeout = "240m"