From 089d7d295d4491b468e21585b85fb253aa3ad491 Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 00:16:00 +0000 Subject: [PATCH 01/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=false platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 200 ++++++++++++++++++++++ mpi/cuda-mpich-base/manifest.json | 35 ++++ mpi/cuda-mpich-base/mpich_patches.tgz | Bin 0 -> 499 bytes 3 files changed, 235 insertions(+) create mode 100644 mpi/cuda-mpich-base/dockerfile.dockerfile create mode 100644 mpi/cuda-mpich-base/manifest.json create mode 100644 mpi/cuda-mpich-base/mpich_patches.tgz diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile new file mode 100644 index 0000000..aec78be --- /dev/null +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -0,0 +1,200 @@ +# ========================= Common Args ========================= +ARG OS_VERSION="24.04" +ARG LINUX_KERNEL="6.8.0-31" +ARG LIBFABRIC_VERSION="1.18.1" +ARG MPICH_VERSION="3.4.3" +ARG MPI4PY_VERSION="3.1.5" +ARG ENABLE_OSU="1" +ARG CUDA_VERSION="13.0.2" +ARG OSU_VERSION="7.3" +ARG IMAGE_NAME="nvidia/cuda" + +# ====================== Stage 1: Builder (full build environment) ====================== +FROM ${IMAGE_NAME}:${CUDA_VERSION}-devel-ubuntu${OS_VERSION} AS builder + +ARG OS_VERSION +ARG LINUX_KERNEL +ARG LIBFABRIC_VERSION +ARG MPICH_VERSION +ARG CUDA_VERSION +ARG OSU_VERSION +ARG ENABLE_OSU +ENV DEBIAN_FRONTEND=noninteractive + +# Install all build dependencies +RUN apt-get update -qq && apt-get -y --no-install-recommends install \ + build-essential gcc-12 g++-12 gfortran-12 \ + gnupg gnupg2 ca-certificates gdb wget git curl \ + python3-six python3-setuptools python3-numpy python3-pip python3-scipy python3-venv python3-dev \ + patchelf strace ltrace \ + libcrypt-dev libcurl4-openssl-dev libpython3-dev libreadline-dev libssl-dev \ + sudo autoconf automake bison flex gcovr libtool m4 make openssh-server patch \ + subversion tzdata valgrind vim xsltproc zlib1g-dev ninja-build libnuma-dev swig \ + linux-tools-generic linux-source software-properties-common \ + libkeyutils-dev libnl-genl-3-dev libyaml-dev libmount-dev pkg-config \ + linux-headers-${LINUX_KERNEL}-generic linux-headers-${LINUX_KERNEL} \ + fakeroot devscripts dpkg-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install modern CMake +RUN wget -q https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-aarch64.sh \ + && chmod +x cmake-3.31.7-linux-aarch64.sh \ + && yes | ./cmake-3.31.7-linux-aarch64.sh --prefix=/usr \ + && cmake --version \ + && rm -f cmake-3.31.7-linux-aarch64.sh + +# Generate kernel config for Lustre +RUN echo "deb-src http://archive.ubuntu.com/ubuntu noble main restricted" >> /etc/apt/sources.list \ + && apt-get update -qq \ + && cd /tmp \ + && apt-get source linux \ + && cd linux-* \ + && chmod +x ./debian/scripts/misc/annotations \ + && ./debian/scripts/misc/annotations --arch arm64 --flavour generic --export > .config \ + && cp .config /usr/lib/modules/${LINUX_KERNEL}-generic/build/ \ + && cd /tmp && rm -rf linux-* + +# Build libfabric +RUN mkdir -p /tmp/build && cd /tmp/build \ + && wget -q https://github.com/ofiwg/libfabric/archive/refs/tags/v${LIBFABRIC_VERSION}.tar.gz \ + && tar xf v${LIBFABRIC_VERSION}.tar.gz \ + && cd libfabric-${LIBFABRIC_VERSION} \ + && ./autogen.sh && ./configure \ + && make -j"$(nproc)" && make install \ + && rm -rf /tmp/build/libfabric-* + +# Build Lustre (using GitHub mirror) +RUN mkdir -p /tmp/lustre-build && cd /tmp/lustre-build \ + && for i in 1 2 3; do \ + echo "Cloning Lustre (attempt $i)..." && \ + git clone --depth 1 https://github.com/lustre/lustre-release.git && break || { \ + echo "Clone failed. Retrying in 5s..."; sleep 5; \ + }; \ + done \ + && cd lustre-release \ + && bash autogen.sh \ + && ./configure --disable-server --enable-client \ + --with-linux=/usr/lib/modules/${LINUX_KERNEL}-generic/build \ + --disable-tests \ + CFLAGS=-Wno-error=attribute-warning \ + && make -j"$(nproc)" \ + && make install \ + && ldconfig \ + && cd / && rm -rf /tmp/lustre-build + +# Build MPICH with Lustre support +COPY mpich_patches.tgz /tmp/ +RUN mkdir -p /tmp/mpich-build && cd /tmp/mpich-build \ + && wget -q http://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz \ + && tar xf mpich-${MPICH_VERSION}.tar.gz \ + && cd mpich-${MPICH_VERSION} \ + && tar xf /tmp/mpich_patches.tgz \ + && patch -p0 < csel.patch \ + && patch -p0 < ch4r_init.patch \ + && ./configure \ + --without-mpe --enable-fortran=all --enable-shared --enable-sharedlibs=gcc \ + --enable-debuginfo --enable-yield=sched_yield --enable-g=mem \ + --with-device=ch4:ofi --with-namepublisher=file \ + --with-shared-memory=sysv --disable-allowport --with-pm=gforker \ + --with-file-system=ufs+lustre+nfs \ + --enable-threads=runtime --enable-fast=O2 --enable-thread-cs=global \ + CC=gcc-12 CXX=g++-12 FC=gfortran-12 FFLAGS=-fallow-argument-mismatch \ + && make -j"$(nproc)" \ + && make install \ + && ldconfig \ + && cd / && rm -rf /tmp/mpich-build + + +# Build aws-ofi-nccl (CUDA version of aws-ofi-rccl) +RUN cd /tmp \ + && git clone --depth 1 https://github.com/aws/aws-ofi-nccl.git \ + && cd aws-ofi-nccl \ + && ./autogen.sh \ + && ./configure --prefix=/usr --with-mpi=/usr --with-libfabric=/usr --with-cuda=/usr/local/cuda --with-nccl=/usr CC=gcc-12 CXX=g++-12 \ + && make -j"$(nproc)" \ + && make install \ + && ldconfig \ + && cd /tmp && rm -rf aws-ofi-nccl + +# Build OSU microbenchmarks +RUN if [ "${ENABLE_OSU}" = "1" ]; then \ + cd /tmp && \ + wget -q http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${OSU_VERSION}.tar.gz && \ + tar xf osu-micro-benchmarks-${OSU_VERSION}.tar.gz && \ + cd osu-micro-benchmarks-${OSU_VERSION} && \ + ./configure --prefix=/usr/local CC=mpicc CXX=mpicxx CFLAGS=-O3 --enable-cuda --with-cuda=/usr/local/cuda && \ + make -j"$(nproc)" && \ + make install && \ + cd /tmp && rm -rf osu-micro-benchmarks-*; \ + fi + +# Check installed files for debugging +RUN echo "=== Checking Lustre files ===" \ + && find /usr -name "*lustre*" -o -name "liblustreapi*" 2>/dev/null | head -20 || true + +# ====================== Stage 2: Runtime (minimal runtime environment) ====================== +ARG TARGETARCH +FROM ${IMAGE_NAME}:${CUDA_VERSION}-runtime-ubuntu${OS_VERSION} AS runtime + +ARG MPI4PY_VERSION +ARG CUDA_VERSION +ARG ENABLE_OSU +ENV DEBIAN_FRONTEND=noninteractive + +# Install minimal runtime dependencies +RUN apt-get update -qq && apt-get install -y --no-install-recommends \ + bash ca-certificates wget gnupg lsb-release \ + libnuma1 libgfortran5 libgcc-s1 libstdc++6 \ + libyaml-0-2 keyutils \ + python3 python3-pip python3-venv \ + tzdata \ + && rm -rf /var/lib/apt/lists/* + +# Copy runtime files from builder +COPY --from=builder /usr/lib/liblustreapi* /usr/lib/ +COPY --from=builder /usr/lib/libfabric* /usr/lib/ +COPY --from=builder /usr/lib/libmpi* /usr/lib/ +COPY --from=builder /usr/lib/libmpich* /usr/lib/ +COPY --from=builder /usr/lib/libmpl* /usr/lib/ +COPY --from=builder /usr/lib/libopa* /usr/lib/ +COPY --from=builder /usr/lib/libnccl_net.so* /usr/lib/ +COPY --from=builder /usr/bin/mpi* /usr/bin/ +COPY --from=builder /usr/bin/hydra* /usr/bin/ +COPY --from=builder /usr/bin/parkill /usr/bin/ +COPY --from=builder /usr/local/libexec/osu-micro-benchmarks /usr/local/libexec/osu-micro-benchmarks + +RUN ldconfig + +# Install mpi4py +RUN pip install --break-system-packages mpi4py==${MPI4PY_VERSION} + +# Set up environment +ENV PATH="/usr/local/libexec/osu-micro-benchmarks/mpi/collective:/usr/local/libexec/osu-micro-benchmarks/mpi/one-sided:/usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt:/usr/local/libexec/osu-micro-benchmarks/mpi/startup:$PATH" \ + NCCL_SOCKET_IFNAME=hsn \ + CXI_FORK_SAFE=1 \ + CXI_FORK_SAFE_HP=1 \ + FI_CXI_DISABLE_CQ_HUGETLB=1 \ + CUDA_PATH=/usr/local/cuda \ + LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} + +# Singularity environment injection +RUN mkdir -p /.singularity.d/env/ \ + && echo "export NCCL_SOCKET_IFNAME=hsn" >> /.singularity.d/env/91-environment.sh \ + && echo "export CXI_FORK_SAFE=1" >> /.singularity.d/env/91-environment.sh \ + && echo "export CXI_FORK_SAFE_HP=1" >> /.singularity.d/env/91-environment.sh \ + && echo "export FI_CXI_DISABLE_CQ_HUGETLB=1" >> /.singularity.d/env/91-environment.sh \ + && echo "export CUDA_PATH=/usr/local/cuda" >> /.singularity.d/env/91-environment.sh \ + && echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:\${LD_LIBRARY_PATH}" >> /.singularity.d/env/91-environment.sh + +# Cleanup +RUN rm -rf /usr/share/doc/* /usr/share/man/* /usr/share/locale/* || true + +# Debug: Check what was copied +RUN echo "=== Runtime libraries check ===" \ + && ls -lh /usr/lib/liblustreapi* || echo "No Lustre libs" \ + && ls -lh /usr/lib/libmpi* || echo "No MPI libs" \ + && which mpicc || echo "No mpicc" \ + && which mpirun || echo "No mpirun" + +WORKDIR /workspace +LABEL org.opencontainers.image.version=0.0.1 org.opencontainers.image.devmode=true org.opencontainers.image.noscan=false org.opencontainers.image.platform=arm diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json new file mode 100644 index 0000000..90a6b3c --- /dev/null +++ b/mpi/cuda-mpich-base/manifest.json @@ -0,0 +1,35 @@ +{ + "main": "mpi", + "feature": "cuda-mpich-base", + "version": "0.0.1", + "devmode": true, + "noscan": false, + "platform": "arm", + "shpc": false, + "targets": [ + "setonix-registry" + ], + "private-targets": [ + "liu268" + ], + "github": { + "repo": "Quleaf/ImageManagerAction", + "branch": "cicd-mpi/cuda-mpich-base", + "path": "mpi/cuda-mpich-base/" + }, + "metadata": { + "timestamp": "2025-12-12T00:16:00.905156Z", + "correlation_id": "cf4055ba-fa2f-4783-865f-99ed83baa014" + }, + "template": { + "OS_VERSION": "24.04", + "LINUX_KERNEL": "6.8.0-31", + "LIBFABRIC_VERSION": "1.18.1", + "MPICH_VERSION": "3.4.3", + "MPI4PY_VERSION": "3.1.5", + "ENABLE_OSU": "1", + "CUDA_VERSION": "13.0.2", + "OSU_VERSION": "7.3", + "IMAGE_NAME": "nvidia/cuda" + } +} \ No newline at end of file diff --git a/mpi/cuda-mpich-base/mpich_patches.tgz b/mpi/cuda-mpich-base/mpich_patches.tgz new file mode 100644 index 0000000000000000000000000000000000000000..0549268865db82e82efece8ec7f3703cc110bc56 GIT binary patch literal 499 zcmVnO7?HqU`eEo3H*oLjS1YiB1qU-& z1h85e@Um<~HDo!QsYiCn_^9G+5`W$Adq*9bw2L>>@VCBgbjZ99ZABxiKVrJNuG29U z43PU>p53+#orErqYs%(0!AX@+JQQD=2MSOCIQ!zfFCBo zw$ZJ*zVxn=_+$Jzxm3&g{g=(#joMXPWs@J-J=XE20c9iM!>4!?zZ>5!5p7+ctH4#V zR^MO_I&ksox5O;(W!LVH=YDU?4CZ&)>TgKWs%k}2=k Date: Fri, 12 Dec 2025 00:27:06 +0000 Subject: [PATCH 02/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=false platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 5 ++++- mpi/cuda-mpich-base/manifest.json | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index aec78be..b5eacae 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -39,7 +39,10 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \ # Install modern CMake RUN wget -q https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-aarch64.sh \ && chmod +x cmake-3.31.7-linux-aarch64.sh \ - && yes | ./cmake-3.31.7-linux-aarch64.sh --prefix=/usr \ + && ./cmake-3.31.7-linux-aarch64.sh --skip-license --prefix=/usr --include-subdir \ + && ln -sf /usr/cmake-3.31.7-linux-aarch64/bin/cmake /usr/bin/cmake \ + && ln -sf /usr/cmake-3.31.7-linux-aarch64/bin/ctest /usr/bin/ctest \ + && ln -sf /usr/cmake-3.31.7-linux-aarch64/bin/cpack /usr/bin/cpack \ && cmake --version \ && rm -f cmake-3.31.7-linux-aarch64.sh diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 90a6b3c..244681b 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T00:16:00.905156Z", - "correlation_id": "cf4055ba-fa2f-4783-865f-99ed83baa014" + "timestamp": "2025-12-12T00:27:06.166625Z", + "correlation_id": "f1d18b87-ad34-42e0-8310-8d1d2d229e57" }, "template": { "OS_VERSION": "24.04", From 40b221fb401ed61e29179b7f4af5f0740da00089 Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 00:33:56 +0000 Subject: [PATCH 03/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=false platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 1 + mpi/cuda-mpich-base/manifest.json | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index b5eacae..f7cce4b 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -32,6 +32,7 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \ subversion tzdata valgrind vim xsltproc zlib1g-dev ninja-build libnuma-dev swig \ linux-tools-generic linux-source software-properties-common \ libkeyutils-dev libnl-genl-3-dev libyaml-dev libmount-dev pkg-config \ + libhwloc-dev hwloc \ linux-headers-${LINUX_KERNEL}-generic linux-headers-${LINUX_KERNEL} \ fakeroot devscripts dpkg-dev \ && rm -rf /var/lib/apt/lists/* diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 244681b..a38f952 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T00:27:06.166625Z", - "correlation_id": "f1d18b87-ad34-42e0-8310-8d1d2d229e57" + "timestamp": "2025-12-12T00:33:56.679855Z", + "correlation_id": "a4481fd7-fb90-4b52-99db-a75556aec774" }, "template": { "OS_VERSION": "24.04", From e066b8c2980d1a63650d891bf452b6991743b0ce Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 00:43:21 +0000 Subject: [PATCH 04/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 3 ++- mpi/cuda-mpich-base/manifest.json | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index f7cce4b..fad4fe1 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -24,6 +24,7 @@ ENV DEBIAN_FRONTEND=noninteractive # Install all build dependencies RUN apt-get update -qq && apt-get -y --no-install-recommends install \ build-essential gcc-12 g++-12 gfortran-12 \ + libc6-dev libc6-dev-arm64 \ gnupg gnupg2 ca-certificates gdb wget git curl \ python3-six python3-setuptools python3-numpy python3-pip python3-scipy python3-venv python3-dev \ patchelf strace ltrace \ @@ -201,4 +202,4 @@ RUN echo "=== Runtime libraries check ===" \ && which mpirun || echo "No mpirun" WORKDIR /workspace -LABEL org.opencontainers.image.version=0.0.1 org.opencontainers.image.devmode=true org.opencontainers.image.noscan=false org.opencontainers.image.platform=arm +LABEL org.opencontainers.image.version=0.0.1 org.opencontainers.image.devmode=true org.opencontainers.image.noscan=true org.opencontainers.image.platform=arm diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index a38f952..2109c26 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -3,7 +3,7 @@ "feature": "cuda-mpich-base", "version": "0.0.1", "devmode": true, - "noscan": false, + "noscan": true, "platform": "arm", "shpc": false, "targets": [ @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T00:33:56.679855Z", - "correlation_id": "a4481fd7-fb90-4b52-99db-a75556aec774" + "timestamp": "2025-12-12T00:43:21.137536Z", + "correlation_id": "824fc54e-04f6-4c83-b32b-29ba2f10cafc" }, "template": { "OS_VERSION": "24.04", From 37a5669de8c624d08d6c87b2dd4c225e12bb04a5 Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 00:46:01 +0000 Subject: [PATCH 05/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 2 +- mpi/cuda-mpich-base/manifest.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index fad4fe1..8bf276c 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -24,7 +24,7 @@ ENV DEBIAN_FRONTEND=noninteractive # Install all build dependencies RUN apt-get update -qq && apt-get -y --no-install-recommends install \ build-essential gcc-12 g++-12 gfortran-12 \ - libc6-dev libc6-dev-arm64 \ + libc6-dev \ gnupg gnupg2 ca-certificates gdb wget git curl \ python3-six python3-setuptools python3-numpy python3-pip python3-scipy python3-venv python3-dev \ patchelf strace ltrace \ diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 2109c26..5ef5774 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T00:43:21.137536Z", - "correlation_id": "824fc54e-04f6-4c83-b32b-29ba2f10cafc" + "timestamp": "2025-12-12T00:46:01.048774Z", + "correlation_id": "23921b33-102f-4436-9c59-f3707104b5be" }, "template": { "OS_VERSION": "24.04", From a9ea16e6036d1bd76f9ae174abd2fb69ecb3c49d Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 01:09:57 +0000 Subject: [PATCH 06/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 9 ++++----- mpi/cuda-mpich-base/manifest.json | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index 8bf276c..c0392a2 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -21,10 +21,9 @@ ARG OSU_VERSION ARG ENABLE_OSU ENV DEBIAN_FRONTEND=noninteractive -# Install all build dependencies +# Install all build dependencies (use system default compiler from Ubuntu 24.04) RUN apt-get update -qq && apt-get -y --no-install-recommends install \ - build-essential gcc-12 g++-12 gfortran-12 \ - libc6-dev \ + build-essential gfortran \ gnupg gnupg2 ca-certificates gdb wget git curl \ python3-six python3-setuptools python3-numpy python3-pip python3-scipy python3-venv python3-dev \ patchelf strace ltrace \ @@ -103,7 +102,7 @@ RUN mkdir -p /tmp/mpich-build && cd /tmp/mpich-build \ --with-shared-memory=sysv --disable-allowport --with-pm=gforker \ --with-file-system=ufs+lustre+nfs \ --enable-threads=runtime --enable-fast=O2 --enable-thread-cs=global \ - CC=gcc-12 CXX=g++-12 FC=gfortran-12 FFLAGS=-fallow-argument-mismatch \ + FFLAGS=-fallow-argument-mismatch \ && make -j"$(nproc)" \ && make install \ && ldconfig \ @@ -115,7 +114,7 @@ RUN cd /tmp \ && git clone --depth 1 https://github.com/aws/aws-ofi-nccl.git \ && cd aws-ofi-nccl \ && ./autogen.sh \ - && ./configure --prefix=/usr --with-mpi=/usr --with-libfabric=/usr --with-cuda=/usr/local/cuda --with-nccl=/usr CC=gcc-12 CXX=g++-12 \ + && ./configure --prefix=/usr --with-mpi=/usr --with-libfabric=/usr --with-cuda=/usr/local/cuda \ && make -j"$(nproc)" \ && make install \ && ldconfig \ diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 5ef5774..6ecab27 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T00:46:01.048774Z", - "correlation_id": "23921b33-102f-4436-9c59-f3707104b5be" + "timestamp": "2025-12-12T01:09:57.701260Z", + "correlation_id": "128be99a-6781-48e2-9633-af222dab2d58" }, "template": { "OS_VERSION": "24.04", From 2a201ae78541b1b3b55c1b6610d332062b792fa9 Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 01:21:14 +0000 Subject: [PATCH 07/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 11 +++++++++++ mpi/cuda-mpich-base/manifest.json | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index c0392a2..5e4dc42 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -22,8 +22,10 @@ ARG ENABLE_OSU ENV DEBIAN_FRONTEND=noninteractive # Install all build dependencies (use system default compiler from Ubuntu 24.04) +# libc6-dev and linux-libc-dev are required for C/C++ standard library headers (stdlib.h) RUN apt-get update -qq && apt-get -y --no-install-recommends install \ build-essential gfortran \ + libc6-dev linux-libc-dev \ gnupg gnupg2 ca-certificates gdb wget git curl \ python3-six python3-setuptools python3-numpy python3-pip python3-scipy python3-venv python3-dev \ patchelf strace ltrace \ @@ -110,6 +112,15 @@ RUN mkdir -p /tmp/mpich-build && cd /tmp/mpich-build \ # Build aws-ofi-nccl (CUDA version of aws-ofi-rccl) +# Debug: verify stdlib.h exists and check include paths +RUN ls -la /usr/include/stdlib.h && echo "stdlib.h found" || echo "stdlib.h NOT FOUND" +RUN echo | cpp -v 2>&1 | grep -A 20 "include" + +# Clear CUDA-related env vars that may interfere with include paths, then build +ENV CPLUS_INCLUDE_PATH="" +ENV C_INCLUDE_PATH="" +ENV CPATH="" + RUN cd /tmp \ && git clone --depth 1 https://github.com/aws/aws-ofi-nccl.git \ && cd aws-ofi-nccl \ diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 6ecab27..068782d 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T01:09:57.701260Z", - "correlation_id": "128be99a-6781-48e2-9633-af222dab2d58" + "timestamp": "2025-12-12T01:21:14.945935Z", + "correlation_id": "5ceb01e6-9be6-4154-8d6f-20e43f43b4a9" }, "template": { "OS_VERSION": "24.04", From ae9658df0c91ccb93a72153eef61e411cde7abd6 Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 01:34:24 +0000 Subject: [PATCH 08/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 33 +++++++++++++---------- mpi/cuda-mpich-base/manifest.json | 4 +-- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index 5e4dc42..27e2129 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -10,7 +10,8 @@ ARG OSU_VERSION="7.3" ARG IMAGE_NAME="nvidia/cuda" # ====================== Stage 1: Builder (full build environment) ====================== -FROM ${IMAGE_NAME}:${CUDA_VERSION}-devel-ubuntu${OS_VERSION} AS builder +# Use clean Ubuntu image to avoid NVIDIA CUDA image's broken C++ include paths +FROM ubuntu:${OS_VERSION} AS builder ARG OS_VERSION ARG LINUX_KERNEL @@ -21,11 +22,9 @@ ARG OSU_VERSION ARG ENABLE_OSU ENV DEBIAN_FRONTEND=noninteractive -# Install all build dependencies (use system default compiler from Ubuntu 24.04) -# libc6-dev and linux-libc-dev are required for C/C++ standard library headers (stdlib.h) +# Install all build dependencies first (clean Ubuntu environment) RUN apt-get update -qq && apt-get -y --no-install-recommends install \ build-essential gfortran \ - libc6-dev linux-libc-dev \ gnupg gnupg2 ca-certificates gdb wget git curl \ python3-six python3-setuptools python3-numpy python3-pip python3-scipy python3-venv python3-dev \ patchelf strace ltrace \ @@ -39,6 +38,21 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \ fakeroot devscripts dpkg-dev \ && rm -rf /var/lib/apt/lists/* +# Install CUDA toolkit from NVIDIA repository +RUN wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \ + && dpkg -i cuda-keyring_1.1-1_all.deb \ + && rm cuda-keyring_1.1-1_all.deb \ + && apt-get update -qq \ + && apt-get install -y --no-install-recommends \ + cuda-toolkit-13-0 \ + libnccl2 libnccl-dev \ + && rm -rf /var/lib/apt/lists/* + +# Set CUDA environment variables +ENV PATH="/usr/local/cuda/bin:${PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +ENV CUDA_HOME="/usr/local/cuda" + # Install modern CMake RUN wget -q https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-aarch64.sh \ && chmod +x cmake-3.31.7-linux-aarch64.sh \ @@ -111,16 +125,7 @@ RUN mkdir -p /tmp/mpich-build && cd /tmp/mpich-build \ && cd / && rm -rf /tmp/mpich-build -# Build aws-ofi-nccl (CUDA version of aws-ofi-rccl) -# Debug: verify stdlib.h exists and check include paths -RUN ls -la /usr/include/stdlib.h && echo "stdlib.h found" || echo "stdlib.h NOT FOUND" -RUN echo | cpp -v 2>&1 | grep -A 20 "include" - -# Clear CUDA-related env vars that may interfere with include paths, then build -ENV CPLUS_INCLUDE_PATH="" -ENV C_INCLUDE_PATH="" -ENV CPATH="" - +# Build aws-ofi-nccl (CUDA NCCL plugin for libfabric) RUN cd /tmp \ && git clone --depth 1 https://github.com/aws/aws-ofi-nccl.git \ && cd aws-ofi-nccl \ diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 068782d..1a77d82 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T01:21:14.945935Z", - "correlation_id": "5ceb01e6-9be6-4154-8d6f-20e43f43b4a9" + "timestamp": "2025-12-12T01:34:24.337958Z", + "correlation_id": "596e890a-f7ae-4f4f-bc6c-c2a5c0303dd2" }, "template": { "OS_VERSION": "24.04", From 2d60edbfe35cff16252569f6e9ac392cf9971bce Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 02:33:49 +0000 Subject: [PATCH 09/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 61 ++++++++++++++--------- mpi/cuda-mpich-base/manifest.json | 10 ++-- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index 27e2129..bd1f5ee 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -5,12 +5,11 @@ ARG LIBFABRIC_VERSION="1.18.1" ARG MPICH_VERSION="3.4.3" ARG MPI4PY_VERSION="3.1.5" ARG ENABLE_OSU="1" -ARG CUDA_VERSION="13.0.2" +ARG CUDA_VERSION="13-0" ARG OSU_VERSION="7.3" ARG IMAGE_NAME="nvidia/cuda" # ====================== Stage 1: Builder (full build environment) ====================== -# Use clean Ubuntu image to avoid NVIDIA CUDA image's broken C++ include paths FROM ubuntu:${OS_VERSION} AS builder ARG OS_VERSION @@ -22,9 +21,10 @@ ARG OSU_VERSION ARG ENABLE_OSU ENV DEBIAN_FRONTEND=noninteractive -# Install all build dependencies first (clean Ubuntu environment) +# Install all build dependencies first (use gcc-12 like ROCm version for stability) RUN apt-get update -qq && apt-get -y --no-install-recommends install \ - build-essential gfortran \ + build-essential \ + gcc-12 g++-12 gfortran-12 \ gnupg gnupg2 ca-certificates gdb wget git curl \ python3-six python3-setuptools python3-numpy python3-pip python3-scipy python3-venv python3-dev \ patchelf strace ltrace \ @@ -38,15 +38,20 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \ fakeroot devscripts dpkg-dev \ && rm -rf /var/lib/apt/lists/* -# Install CUDA toolkit from NVIDIA repository +# Install minimal CUDA packages from NVIDIA repository (avoid full toolkit that may break system) RUN wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \ && dpkg -i cuda-keyring_1.1-1_all.deb \ && rm cuda-keyring_1.1-1_all.deb \ && apt-get update -qq \ && apt-get install -y --no-install-recommends \ - cuda-toolkit-13-0 \ + cuda-cudart-dev-${CUDA_VERSION} \ + cuda-nvcc-${CUDA_VERSION} \ + cuda-crt-${CUDA_VERSION} \ + cuda-cudart-${CUDA_VERSION} \ + cuda-driver-dev-${CUDA_VERSION} \ libnccl2 libnccl-dev \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + && ln -s /usr/local/cuda-13.0 /usr/local/cuda # Set CUDA environment variables ENV PATH="/usr/local/cuda/bin:${PATH}" @@ -103,49 +108,58 @@ RUN mkdir -p /tmp/lustre-build && cd /tmp/lustre-build \ && cd / && rm -rf /tmp/lustre-build # Build MPICH with Lustre support +ARG MPICH_CONFIGURE_OPTIONS="--without-mpe --enable-fortran=all --enable-shared --enable-sharedlibs=gcc \ +--enable-debuginfo --enable-yield=sched_yield --enable-g=mem \ +--with-device=ch4:ofi --with-namepublisher=file \ +--with-shared-memory=sysv --disable-allowport --with-pm=gforker \ +--with-file-system=ufs+lustre+nfs \ +--enable-threads=runtime --enable-fast=O2 --enable-thread-cs=global \ +CC=gcc-12 CXX=g++-12 FC=gfortran-12 FFLAGS=-fallow-argument-mismatch" COPY mpich_patches.tgz /tmp/ -RUN mkdir -p /tmp/mpich-build && cd /tmp/mpich-build \ +RUN echo "Building MPICH..." \ + && mkdir -p /tmp/mpich-build && cd /tmp/mpich-build \ && wget -q http://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz \ && tar xf mpich-${MPICH_VERSION}.tar.gz \ && cd mpich-${MPICH_VERSION} \ && tar xf /tmp/mpich_patches.tgz \ && patch -p0 < csel.patch \ && patch -p0 < ch4r_init.patch \ - && ./configure \ - --without-mpe --enable-fortran=all --enable-shared --enable-sharedlibs=gcc \ - --enable-debuginfo --enable-yield=sched_yield --enable-g=mem \ - --with-device=ch4:ofi --with-namepublisher=file \ - --with-shared-memory=sysv --disable-allowport --with-pm=gforker \ - --with-file-system=ufs+lustre+nfs \ - --enable-threads=runtime --enable-fast=O2 --enable-thread-cs=global \ - FFLAGS=-fallow-argument-mismatch \ + && ./configure ${MPICH_CONFIGURE_OPTIONS} \ && make -j"$(nproc)" \ && make install \ && ldconfig \ - && cd / && rm -rf /tmp/mpich-build + && cd / && rm -rf /tmp/mpich-build \ + && echo "Finished building MPICH" # Build aws-ofi-nccl (CUDA NCCL plugin for libfabric) -RUN cd /tmp \ +# Use gcc-12/g++-12 explicitly like ROCm version for stability +ARG NCCL_CONFIGURE_OPTIONS="--prefix=/usr --with-mpi=/usr --with-libfabric=/usr --with-cuda=/usr/local/cuda CC=gcc-12 CXX=g++-12" +RUN echo "Build aws-ofi-nccl" \ + && cd /tmp \ && git clone --depth 1 https://github.com/aws/aws-ofi-nccl.git \ && cd aws-ofi-nccl \ && ./autogen.sh \ - && ./configure --prefix=/usr --with-mpi=/usr --with-libfabric=/usr --with-cuda=/usr/local/cuda \ + && ./configure ${NCCL_CONFIGURE_OPTIONS} \ && make -j"$(nproc)" \ && make install \ && ldconfig \ - && cd /tmp && rm -rf aws-ofi-nccl + && cd /tmp && rm -rf aws-ofi-nccl \ + && echo "Done" # Build OSU microbenchmarks +ARG OSU_CONFIGURE_OPTIONS="--prefix=/usr/local CC=mpicc CXX=mpicxx CFLAGS=-O3 --enable-cuda --with-cuda=/usr/local/cuda" RUN if [ "${ENABLE_OSU}" = "1" ]; then \ + echo "Building OSU..." && \ cd /tmp && \ wget -q http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${OSU_VERSION}.tar.gz && \ tar xf osu-micro-benchmarks-${OSU_VERSION}.tar.gz && \ cd osu-micro-benchmarks-${OSU_VERSION} && \ - ./configure --prefix=/usr/local CC=mpicc CXX=mpicxx CFLAGS=-O3 --enable-cuda --with-cuda=/usr/local/cuda && \ + ./configure ${OSU_CONFIGURE_OPTIONS} && \ make -j"$(nproc)" && \ make install && \ - cd /tmp && rm -rf osu-micro-benchmarks-*; \ + cd /tmp && rm -rf osu-micro-benchmarks-* && \ + echo "Done"; \ fi # Check installed files for debugging @@ -153,8 +167,7 @@ RUN echo "=== Checking Lustre files ===" \ && find /usr -name "*lustre*" -o -name "liblustreapi*" 2>/dev/null | head -20 || true # ====================== Stage 2: Runtime (minimal runtime environment) ====================== -ARG TARGETARCH -FROM ${IMAGE_NAME}:${CUDA_VERSION}-runtime-ubuntu${OS_VERSION} AS runtime +FROM ${IMAGE_NAME}:13.0.2-runtime-ubuntu${OS_VERSION} AS runtime ARG MPI4PY_VERSION ARG CUDA_VERSION diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 1a77d82..e6eef46 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T01:34:24.337958Z", - "correlation_id": "596e890a-f7ae-4f4f-bc6c-c2a5c0303dd2" + "timestamp": "2025-12-12T02:33:49.134941Z", + "correlation_id": "31f4d7b6-2c40-4e71-b426-da5248f1423e" }, "template": { "OS_VERSION": "24.04", @@ -28,8 +28,10 @@ "MPICH_VERSION": "3.4.3", "MPI4PY_VERSION": "3.1.5", "ENABLE_OSU": "1", - "CUDA_VERSION": "13.0.2", + "CUDA_VERSION": "13-0", "OSU_VERSION": "7.3", - "IMAGE_NAME": "nvidia/cuda" + "IMAGE_NAME": "nvidia/cuda", + "NCCL_CONFIGURE_OPTIONS": "--prefix=/usr --with-mpi=/usr --with-libfabric=/usr --with-cuda=/usr/local/cuda CC=gcc-12 CXX=g++-12", + "OSU_CONFIGURE_OPTIONS": "--prefix=/usr/local CC=mpicc CXX=mpicxx CFLAGS=-O3 --enable-cuda --with-cuda=/usr/local/cuda" } } \ No newline at end of file From 1a2b9bd8bb88c21bd0648ea19a2a975142bb4353 Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 02:43:04 +0000 Subject: [PATCH 10/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 11 ++++++++--- mpi/cuda-mpich-base/manifest.json | 5 ++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index bd1f5ee..3bd7907 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -38,7 +38,7 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \ fakeroot devscripts dpkg-dev \ && rm -rf /var/lib/apt/lists/* -# Install minimal CUDA packages from NVIDIA repository (avoid full toolkit that may break system) +# Install CUDA packages from NVIDIA repository RUN wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \ && dpkg -i cuda-keyring_1.1-1_all.deb \ && rm cuda-keyring_1.1-1_all.deb \ @@ -49,6 +49,8 @@ RUN wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/ cuda-crt-${CUDA_VERSION} \ cuda-cudart-${CUDA_VERSION} \ cuda-driver-dev-${CUDA_VERSION} \ + cuda-libraries-dev-${CUDA_VERSION} \ + libcudnn9-dev-cuda-12 \ libnccl2 libnccl-dev \ && rm -rf /var/lib/apt/lists/* \ && ln -s /usr/local/cuda-13.0 /usr/local/cuda @@ -134,13 +136,16 @@ RUN echo "Building MPICH..." \ # Build aws-ofi-nccl (CUDA NCCL plugin for libfabric) # Use gcc-12/g++-12 explicitly like ROCm version for stability -ARG NCCL_CONFIGURE_OPTIONS="--prefix=/usr --with-mpi=/usr --with-libfabric=/usr --with-cuda=/usr/local/cuda CC=gcc-12 CXX=g++-12" RUN echo "Build aws-ofi-nccl" \ && cd /tmp \ && git clone --depth 1 https://github.com/aws/aws-ofi-nccl.git \ && cd aws-ofi-nccl \ && ./autogen.sh \ - && ./configure ${NCCL_CONFIGURE_OPTIONS} \ + && ./configure --prefix=/usr --with-mpi=/usr --with-libfabric=/usr --with-cuda=/usr/local/cuda \ + CC=gcc-12 CXX=g++-12 \ + LDFLAGS="-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib64/stubs" \ + CFLAGS="-I/usr/local/cuda/include" \ + CXXFLAGS="-I/usr/local/cuda/include" \ && make -j"$(nproc)" \ && make install \ && ldconfig \ diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index e6eef46..674d9b4 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T02:33:49.134941Z", - "correlation_id": "31f4d7b6-2c40-4e71-b426-da5248f1423e" + "timestamp": "2025-12-12T02:43:04.235785Z", + "correlation_id": "93c3a62c-44e8-434f-a3a2-342182b04a7d" }, "template": { "OS_VERSION": "24.04", @@ -31,7 +31,6 @@ "CUDA_VERSION": "13-0", "OSU_VERSION": "7.3", "IMAGE_NAME": "nvidia/cuda", - "NCCL_CONFIGURE_OPTIONS": "--prefix=/usr --with-mpi=/usr --with-libfabric=/usr --with-cuda=/usr/local/cuda CC=gcc-12 CXX=g++-12", "OSU_CONFIGURE_OPTIONS": "--prefix=/usr/local CC=mpicc CXX=mpicxx CFLAGS=-O3 --enable-cuda --with-cuda=/usr/local/cuda" } } \ No newline at end of file From 693c030852e3e543f387e65308eb68d5fb091f9d Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 03:10:18 +0000 Subject: [PATCH 11/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=x86 targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 34 ++++++++++------------- mpi/cuda-mpich-base/manifest.json | 6 ++-- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index 3bd7907..b73c721 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -21,9 +21,10 @@ ARG OSU_VERSION ARG ENABLE_OSU ENV DEBIAN_FRONTEND=noninteractive -# Install all build dependencies first (use gcc-12 like ROCm version for stability) +# Build toolchain & headers RUN apt-get update -qq && apt-get -y --no-install-recommends install \ build-essential \ + libc6-dev \ # <<< CHANGED: 确保 stdlib.h 等 C 头文件存在 gcc-12 g++-12 gfortran-12 \ gnupg gnupg2 ca-certificates gdb wget git curl \ python3-six python3-setuptools python3-numpy python3-pip python3-scipy python3-venv python3-dev \ @@ -55,12 +56,11 @@ RUN wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/ && rm -rf /var/lib/apt/lists/* \ && ln -s /usr/local/cuda-13.0 /usr/local/cuda -# Set CUDA environment variables ENV PATH="/usr/local/cuda/bin:${PATH}" ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" ENV CUDA_HOME="/usr/local/cuda" -# Install modern CMake +# Modern CMake RUN wget -q https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-aarch64.sh \ && chmod +x cmake-3.31.7-linux-aarch64.sh \ && ./cmake-3.31.7-linux-aarch64.sh --skip-license --prefix=/usr --include-subdir \ @@ -70,7 +70,7 @@ RUN wget -q https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.3 && cmake --version \ && rm -f cmake-3.31.7-linux-aarch64.sh -# Generate kernel config for Lustre +# Kernel config for Lustre RUN echo "deb-src http://archive.ubuntu.com/ubuntu noble main restricted" >> /etc/apt/sources.list \ && apt-get update -qq \ && cd /tmp \ @@ -90,7 +90,7 @@ RUN mkdir -p /tmp/build && cd /tmp/build \ && make -j"$(nproc)" && make install \ && rm -rf /tmp/build/libfabric-* -# Build Lustre (using GitHub mirror) +# Build Lustre client RUN mkdir -p /tmp/lustre-build && cd /tmp/lustre-build \ && for i in 1 2 3; do \ echo "Cloning Lustre (attempt $i)..." && \ @@ -110,13 +110,13 @@ RUN mkdir -p /tmp/lustre-build && cd /tmp/lustre-build \ && cd / && rm -rf /tmp/lustre-build # Build MPICH with Lustre support -ARG MPICH_CONFIGURE_OPTIONS="--without-mpe --enable-fortran=all --enable-shared --enable-sharedlibs=gcc \ +ARG MPICH_CONFIGURE_OPTIONS="--prefix=/usr --without-mpe --enable-fortran=all --enable-shared --enable-sharedlibs=gcc \ --enable-debuginfo --enable-yield=sched_yield --enable-g=mem \ --with-device=ch4:ofi --with-namepublisher=file \ --with-shared-memory=sysv --disable-allowport --with-pm=gforker \ --with-file-system=ufs+lustre+nfs \ --enable-threads=runtime --enable-fast=O2 --enable-thread-cs=global \ -CC=gcc-12 CXX=g++-12 FC=gfortran-12 FFLAGS=-fallow-argument-mismatch" +CC=gcc-12 CXX=g++-12 FC=gfortran-12 FFLAGS=-fallow-argument-mismatch" # <<< CHANGED: 加 prefix=/usr COPY mpich_patches.tgz /tmp/ RUN echo "Building MPICH..." \ && mkdir -p /tmp/mpich-build && cd /tmp/mpich-build \ @@ -133,19 +133,18 @@ RUN echo "Building MPICH..." \ && cd / && rm -rf /tmp/mpich-build \ && echo "Finished building MPICH" - # Build aws-ofi-nccl (CUDA NCCL plugin for libfabric) -# Use gcc-12/g++-12 explicitly like ROCm version for stability RUN echo "Build aws-ofi-nccl" \ && cd /tmp \ && git clone --depth 1 https://github.com/aws/aws-ofi-nccl.git \ && cd aws-ofi-nccl \ && ./autogen.sh \ - && ./configure --prefix=/usr --with-mpi=/usr --with-libfabric=/usr --with-cuda=/usr/local/cuda \ - CC=gcc-12 CXX=g++-12 \ - LDFLAGS="-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib64/stubs" \ - CFLAGS="-I/usr/local/cuda/include" \ - CXXFLAGS="-I/usr/local/cuda/include" \ + && CC=gcc-12 CXX=g++-12 \ + ./configure --prefix=/usr \ + --with-mpi=/usr \ + --with-libfabric=/usr \ + --with-cuda=/usr/local/cuda \ + LDFLAGS="-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib64/stubs" \ && make -j"$(nproc)" \ && make install \ && ldconfig \ @@ -179,7 +178,6 @@ ARG CUDA_VERSION ARG ENABLE_OSU ENV DEBIAN_FRONTEND=noninteractive -# Install minimal runtime dependencies RUN apt-get update -qq && apt-get install -y --no-install-recommends \ bash ca-certificates wget gnupg lsb-release \ libnuma1 libgfortran5 libgcc-s1 libstdc++6 \ @@ -203,10 +201,8 @@ COPY --from=builder /usr/local/libexec/osu-micro-benchmarks /usr/local/libexec/o RUN ldconfig -# Install mpi4py RUN pip install --break-system-packages mpi4py==${MPI4PY_VERSION} -# Set up environment ENV PATH="/usr/local/libexec/osu-micro-benchmarks/mpi/collective:/usr/local/libexec/osu-micro-benchmarks/mpi/one-sided:/usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt:/usr/local/libexec/osu-micro-benchmarks/mpi/startup:$PATH" \ NCCL_SOCKET_IFNAME=hsn \ CXI_FORK_SAFE=1 \ @@ -224,10 +220,8 @@ RUN mkdir -p /.singularity.d/env/ \ && echo "export CUDA_PATH=/usr/local/cuda" >> /.singularity.d/env/91-environment.sh \ && echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:\${LD_LIBRARY_PATH}" >> /.singularity.d/env/91-environment.sh -# Cleanup RUN rm -rf /usr/share/doc/* /usr/share/man/* /usr/share/locale/* || true -# Debug: Check what was copied RUN echo "=== Runtime libraries check ===" \ && ls -lh /usr/lib/liblustreapi* || echo "No Lustre libs" \ && ls -lh /usr/lib/libmpi* || echo "No MPI libs" \ @@ -235,4 +229,4 @@ RUN echo "=== Runtime libraries check ===" \ && which mpirun || echo "No mpirun" WORKDIR /workspace -LABEL org.opencontainers.image.version=0.0.1 org.opencontainers.image.devmode=true org.opencontainers.image.noscan=true org.opencontainers.image.platform=arm +LABEL org.opencontainers.image.version=0.0.1 org.opencontainers.image.devmode=true org.opencontainers.image.noscan=true org.opencontainers.image.platform=x86 diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 674d9b4..1a55141 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -4,7 +4,7 @@ "version": "0.0.1", "devmode": true, "noscan": true, - "platform": "arm", + "platform": "x86", "shpc": false, "targets": [ "setonix-registry" @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T02:43:04.235785Z", - "correlation_id": "93c3a62c-44e8-434f-a3a2-342182b04a7d" + "timestamp": "2025-12-12T03:10:17.985972Z", + "correlation_id": "fae3fd69-5b8d-40cc-8522-3296fbf1c273" }, "template": { "OS_VERSION": "24.04", From c38d9533df3ae4b1fcd69bf5b5c9e4b4771807ec Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 03:13:54 +0000 Subject: [PATCH 12/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 2 +- mpi/cuda-mpich-base/manifest.json | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index b73c721..402123d 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -229,4 +229,4 @@ RUN echo "=== Runtime libraries check ===" \ && which mpirun || echo "No mpirun" WORKDIR /workspace -LABEL org.opencontainers.image.version=0.0.1 org.opencontainers.image.devmode=true org.opencontainers.image.noscan=true org.opencontainers.image.platform=x86 +LABEL org.opencontainers.image.version=0.0.1 org.opencontainers.image.devmode=true org.opencontainers.image.noscan=true org.opencontainers.image.platform=arm diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 1a55141..66c7160 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -4,7 +4,7 @@ "version": "0.0.1", "devmode": true, "noscan": true, - "platform": "x86", + "platform": "arm", "shpc": false, "targets": [ "setonix-registry" @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T03:10:17.985972Z", - "correlation_id": "fae3fd69-5b8d-40cc-8522-3296fbf1c273" + "timestamp": "2025-12-12T03:13:54.411538Z", + "correlation_id": "c2d4abd7-e416-450a-b8e9-dca633de3391" }, "template": { "OS_VERSION": "24.04", From 74b10b7e6a255e177c370ad842e1bffb8e17b427 Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 03:16:55 +0000 Subject: [PATCH 13/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 2 +- mpi/cuda-mpich-base/manifest.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index 402123d..c8e9c03 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -24,7 +24,7 @@ ENV DEBIAN_FRONTEND=noninteractive # Build toolchain & headers RUN apt-get update -qq && apt-get -y --no-install-recommends install \ build-essential \ - libc6-dev \ # <<< CHANGED: 确保 stdlib.h 等 C 头文件存在 + libc6-dev \ gcc-12 g++-12 gfortran-12 \ gnupg gnupg2 ca-certificates gdb wget git curl \ python3-six python3-setuptools python3-numpy python3-pip python3-scipy python3-venv python3-dev \ diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 66c7160..b5b98e5 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T03:13:54.411538Z", - "correlation_id": "c2d4abd7-e416-450a-b8e9-dca633de3391" + "timestamp": "2025-12-12T03:16:55.621358Z", + "correlation_id": "2d9b93f0-d726-403a-9e8d-c46c6fe12553" }, "template": { "OS_VERSION": "24.04", From 2b39f274d5723a5e5c75858322660a2f918edda4 Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 03:33:48 +0000 Subject: [PATCH 14/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 32 +++++++++++------------ mpi/cuda-mpich-base/manifest.json | 4 +-- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index c8e9c03..481d2ee 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -134,22 +134,22 @@ RUN echo "Building MPICH..." \ && echo "Finished building MPICH" # Build aws-ofi-nccl (CUDA NCCL plugin for libfabric) -RUN echo "Build aws-ofi-nccl" \ - && cd /tmp \ - && git clone --depth 1 https://github.com/aws/aws-ofi-nccl.git \ - && cd aws-ofi-nccl \ - && ./autogen.sh \ - && CC=gcc-12 CXX=g++-12 \ - ./configure --prefix=/usr \ - --with-mpi=/usr \ - --with-libfabric=/usr \ - --with-cuda=/usr/local/cuda \ - LDFLAGS="-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib64/stubs" \ - && make -j"$(nproc)" \ - && make install \ - && ldconfig \ - && cd /tmp && rm -rf aws-ofi-nccl \ - && echo "Done" +# RUN echo "Build aws-ofi-nccl" \ +# && cd /tmp \ +# && git clone --depth 1 https://github.com/aws/aws-ofi-nccl.git \ +# && cd aws-ofi-nccl \ +# && ./autogen.sh \ +# && CC=gcc-12 CXX=g++-12 \ +# ./configure --prefix=/usr \ +# --with-mpi=/usr \ +# --with-libfabric=/usr \ +# --with-cuda=/usr/local/cuda \ +# LDFLAGS="-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib64/stubs" \ +# && make -j"$(nproc)" \ +# && make install \ +# && ldconfig \ +# && cd /tmp && rm -rf aws-ofi-nccl \ +# && echo "Done" # Build OSU microbenchmarks ARG OSU_CONFIGURE_OPTIONS="--prefix=/usr/local CC=mpicc CXX=mpicxx CFLAGS=-O3 --enable-cuda --with-cuda=/usr/local/cuda" diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index b5b98e5..1a01d2a 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T03:16:55.621358Z", - "correlation_id": "2d9b93f0-d726-403a-9e8d-c46c6fe12553" + "timestamp": "2025-12-12T03:33:48.299423Z", + "correlation_id": "fb1d1849-c687-4212-a294-c512cd59c207" }, "template": { "OS_VERSION": "24.04", From 7bf0c8fe403e73f671aadd93abbafcd1d671dffb Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 03:37:18 +0000 Subject: [PATCH 15/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 7 ++++--- mpi/cuda-mpich-base/manifest.json | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index 481d2ee..0700322 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -187,13 +187,14 @@ RUN apt-get update -qq && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # Copy runtime files from builder -COPY --from=builder /usr/lib/liblustreapi* /usr/lib/ -COPY --from=builder /usr/lib/libfabric* /usr/lib/ +# libfabric and lustre install to /usr/local/lib by default +# mpich installs to /usr (--prefix=/usr) +COPY --from=builder /usr/local/lib/liblustreapi* /usr/local/lib/ +COPY --from=builder /usr/local/lib/libfabric* /usr/local/lib/ COPY --from=builder /usr/lib/libmpi* /usr/lib/ COPY --from=builder /usr/lib/libmpich* /usr/lib/ COPY --from=builder /usr/lib/libmpl* /usr/lib/ COPY --from=builder /usr/lib/libopa* /usr/lib/ -COPY --from=builder /usr/lib/libnccl_net.so* /usr/lib/ COPY --from=builder /usr/bin/mpi* /usr/bin/ COPY --from=builder /usr/bin/hydra* /usr/bin/ COPY --from=builder /usr/bin/parkill /usr/bin/ diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 1a01d2a..519aa78 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T03:33:48.299423Z", - "correlation_id": "fb1d1849-c687-4212-a294-c512cd59c207" + "timestamp": "2025-12-12T03:37:18.513986Z", + "correlation_id": "01d88513-cc5d-4ae3-bddf-27ccc510be11" }, "template": { "OS_VERSION": "24.04", From 8afc94a53172f4337eefbe77148cf7a41d47477b Mon Sep 17 00:00:00 2001 From: ImageManager Bot Date: Fri, 12 Dec 2025 03:40:54 +0000 Subject: [PATCH 16/20] feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true platform=arm targets=setonix-registry] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with ImageManager Bot --- mpi/cuda-mpich-base/dockerfile.dockerfile | 27 ++++++++++++++++------- mpi/cuda-mpich-base/manifest.json | 4 ++-- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index 0700322..7c6cc6b 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -166,9 +166,13 @@ RUN if [ "${ENABLE_OSU}" = "1" ]; then \ echo "Done"; \ fi -# Check installed files for debugging -RUN echo "=== Checking Lustre files ===" \ - && find /usr -name "*lustre*" -o -name "liblustreapi*" 2>/dev/null | head -20 || true +# Check installed files for debugging - find actual library locations +RUN echo "=== Checking library locations ===" \ + && echo "--- Lustre libs ---" && find /usr -name "liblustreapi*" -type f 2>/dev/null \ + && echo "--- libfabric libs ---" && find /usr -name "libfabric*" -type f 2>/dev/null \ + && echo "--- MPI libs ---" && find /usr -name "libmpi*" -type f 2>/dev/null | head -10 \ + && echo "--- MPICH libs ---" && find /usr -name "libmpich*" -type f 2>/dev/null | head -10 \ + && echo "=== Done ===" # ====================== Stage 2: Runtime (minimal runtime environment) ====================== FROM ${IMAGE_NAME}:13.0.2-runtime-ubuntu${OS_VERSION} AS runtime @@ -186,11 +190,8 @@ RUN apt-get update -qq && apt-get install -y --no-install-recommends \ tzdata \ && rm -rf /var/lib/apt/lists/* -# Copy runtime files from builder -# libfabric and lustre install to /usr/local/lib by default -# mpich installs to /usr (--prefix=/usr) -COPY --from=builder /usr/local/lib/liblustreapi* /usr/local/lib/ -COPY --from=builder /usr/local/lib/libfabric* /usr/local/lib/ +# Copy runtime files from builder using multi-stage COPY +# MPI libraries and binaries (installed to /usr with --prefix=/usr) COPY --from=builder /usr/lib/libmpi* /usr/lib/ COPY --from=builder /usr/lib/libmpich* /usr/lib/ COPY --from=builder /usr/lib/libmpl* /usr/lib/ @@ -198,6 +199,16 @@ COPY --from=builder /usr/lib/libopa* /usr/lib/ COPY --from=builder /usr/bin/mpi* /usr/bin/ COPY --from=builder /usr/bin/hydra* /usr/bin/ COPY --from=builder /usr/bin/parkill /usr/bin/ + +# libfabric (installed to /usr/local by default) +COPY --from=builder /usr/local/lib/libfabric* /usr/local/lib/ + +# Lustre libraries (check actual location - may be in /usr/lib or /usr/lib64) +# Using RUN to handle flexible paths +RUN mkdir -p /usr/lib /usr/local/lib /usr/lib64 +COPY --from=builder /usr/lib/liblustreapi* /usr/lib/ + +# OSU benchmarks COPY --from=builder /usr/local/libexec/osu-micro-benchmarks /usr/local/libexec/osu-micro-benchmarks RUN ldconfig diff --git a/mpi/cuda-mpich-base/manifest.json b/mpi/cuda-mpich-base/manifest.json index 519aa78..4a6fee4 100644 --- a/mpi/cuda-mpich-base/manifest.json +++ b/mpi/cuda-mpich-base/manifest.json @@ -18,8 +18,8 @@ "path": "mpi/cuda-mpich-base/" }, "metadata": { - "timestamp": "2025-12-12T03:37:18.513986Z", - "correlation_id": "01d88513-cc5d-4ae3-bddf-27ccc510be11" + "timestamp": "2025-12-12T03:40:54.769384Z", + "correlation_id": "812057ba-44ee-41f1-8249-2438f9e8b78b" }, "template": { "OS_VERSION": "24.04", From c9f6b726088321ba653bcf7a73859bef997b894b Mon Sep 17 00:00:00 2001 From: Shusen Liu Date: Fri, 12 Dec 2025 14:54:59 +1100 Subject: [PATCH 17/20] feat(docker): add optional MPI binaries handling in cuda-mpich-base Dockerfile - Create a tar archive of optional MPI binaries (hydra* and parkill) if they exist. - Update the Dockerfile to copy the tar archive and extract it during the runtime stage. - Ensure the process handles cases where the binaries may not be present. --- mpi/cuda-mpich-base/dockerfile.dockerfile | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index 7c6cc6b..7f31578 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -174,6 +174,16 @@ RUN echo "=== Checking library locations ===" \ && echo "--- MPICH libs ---" && find /usr -name "libmpich*" -type f 2>/dev/null | head -10 \ && echo "=== Done ===" +# Create a tar archive of optional MPI binaries (handles missing hydra* and parkill) +RUN mkdir -p /tmp/mpi-binaries-optional \ + && (cp -a /usr/bin/hydra* /tmp/mpi-binaries-optional/ 2>/dev/null || true) \ + && (cp -a /usr/bin/parkill /tmp/mpi-binaries-optional/ 2>/dev/null || true) \ + && if [ -n "$(ls -A /tmp/mpi-binaries-optional/* 2>/dev/null)" ]; then \ + tar czf /tmp/mpi-binaries-optional.tar.gz -C /tmp/mpi-binaries-optional .; \ + else \ + touch /tmp/mpi-binaries-optional.tar.gz; \ + fi + # ====================== Stage 2: Runtime (minimal runtime environment) ====================== FROM ${IMAGE_NAME}:13.0.2-runtime-ubuntu${OS_VERSION} AS runtime @@ -197,8 +207,12 @@ COPY --from=builder /usr/lib/libmpich* /usr/lib/ COPY --from=builder /usr/lib/libmpl* /usr/lib/ COPY --from=builder /usr/lib/libopa* /usr/lib/ COPY --from=builder /usr/bin/mpi* /usr/bin/ -COPY --from=builder /usr/bin/hydra* /usr/bin/ -COPY --from=builder /usr/bin/parkill /usr/bin/ +# Copy optional binaries (hydra* and parkill may not exist with gforker PM) +COPY --from=builder /tmp/mpi-binaries-optional.tar.gz /tmp/ +RUN if [ -s /tmp/mpi-binaries-optional.tar.gz ]; then \ + tar xzf /tmp/mpi-binaries-optional.tar.gz -C /usr/bin/ 2>/dev/null || true; \ + fi \ + && rm -f /tmp/mpi-binaries-optional.tar.gz # libfabric (installed to /usr/local by default) COPY --from=builder /usr/local/lib/libfabric* /usr/local/lib/ From a3a78f8a2a20153b30ad1ad25f059453a4c2d530 Mon Sep 17 00:00:00 2001 From: Shusen Liu Date: Fri, 12 Dec 2025 14:59:10 +1100 Subject: [PATCH 18/20] feat(docker): enhance cuda-mpich-base Dockerfile with build tools and cleanup --- mpi/cuda-mpich-base/dockerfile.dockerfile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index 7f31578..72ffdc5 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -196,7 +196,9 @@ RUN apt-get update -qq && apt-get install -y --no-install-recommends \ bash ca-certificates wget gnupg lsb-release \ libnuma1 libgfortran5 libgcc-s1 libstdc++6 \ libyaml-0-2 keyutils \ - python3 python3-pip python3-venv \ + python3 python3-pip python3-venv python3-dev \ + gcc-12 g++-12 gfortran-12 \ + libc6-dev \ tzdata \ && rm -rf /var/lib/apt/lists/* @@ -227,8 +229,14 @@ COPY --from=builder /usr/local/libexec/osu-micro-benchmarks /usr/local/libexec/o RUN ldconfig +# Install mpi4py (requires build tools) RUN pip install --break-system-packages mpi4py==${MPI4PY_VERSION} +# Remove build tools to reduce image size (keep only runtime libraries) +RUN apt-get remove -y gcc-12 g++-12 gfortran-12 libc6-dev python3-dev \ + && apt-get autoremove -y \ + && rm -rf /var/lib/apt/lists/* + ENV PATH="/usr/local/libexec/osu-micro-benchmarks/mpi/collective:/usr/local/libexec/osu-micro-benchmarks/mpi/one-sided:/usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt:/usr/local/libexec/osu-micro-benchmarks/mpi/startup:$PATH" \ NCCL_SOCKET_IFNAME=hsn \ CXI_FORK_SAFE=1 \ From d4d374c8997aae1602233c91b7800fe88c962e82 Mon Sep 17 00:00:00 2001 From: Shusen Liu Date: Fri, 12 Dec 2025 15:02:56 +1100 Subject: [PATCH 19/20] feat(docker): add MPI header files handling in cuda-mpich-base Dockerfile --- mpi/cuda-mpich-base/dockerfile.dockerfile | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index 72ffdc5..abde0c2 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -184,6 +184,19 @@ RUN mkdir -p /tmp/mpi-binaries-optional \ touch /tmp/mpi-binaries-optional.tar.gz; \ fi +# Create a tar archive of MPI header files (required for mpi4py compilation) +RUN mkdir -p /tmp/mpi-headers \ + && (cp -a /usr/include/mpi* /tmp/mpi-headers/ 2>/dev/null || true) \ + && (cp -a /usr/include/mpif* /tmp/mpi-headers/ 2>/dev/null || true) \ + && if [ -d /usr/include/mpich ]; then \ + cp -r /usr/include/mpich* /tmp/mpi-headers/ 2>/dev/null || true; \ + fi \ + && if [ -n "$(ls -A /tmp/mpi-headers/* 2>/dev/null)" ]; then \ + tar czf /tmp/mpi-headers.tar.gz -C /tmp/mpi-headers .; \ + else \ + echo "Warning: No MPI headers found" && touch /tmp/mpi-headers.tar.gz; \ + fi + # ====================== Stage 2: Runtime (minimal runtime environment) ====================== FROM ${IMAGE_NAME}:13.0.2-runtime-ubuntu${OS_VERSION} AS runtime @@ -209,6 +222,13 @@ COPY --from=builder /usr/lib/libmpich* /usr/lib/ COPY --from=builder /usr/lib/libmpl* /usr/lib/ COPY --from=builder /usr/lib/libopa* /usr/lib/ COPY --from=builder /usr/bin/mpi* /usr/bin/ +# Copy MPI header files (required for mpi4py compilation) +COPY --from=builder /tmp/mpi-headers.tar.gz /tmp/ +RUN mkdir -p /usr/include \ + && if [ -s /tmp/mpi-headers.tar.gz ]; then \ + tar xzf /tmp/mpi-headers.tar.gz -C /usr/include/ 2>/dev/null || true; \ + fi \ + && rm -f /tmp/mpi-headers.tar.gz # Copy optional binaries (hydra* and parkill may not exist with gforker PM) COPY --from=builder /tmp/mpi-binaries-optional.tar.gz /tmp/ RUN if [ -s /tmp/mpi-binaries-optional.tar.gz ]; then \ From 05b2ece88cebbd907c41beed5b7e17a46f99d86a Mon Sep 17 00:00:00 2001 From: Shusen Liu Date: Fri, 12 Dec 2025 15:20:40 +1100 Subject: [PATCH 20/20] feat(docker): update cuda-mpich-base Dockerfile with CUDA runtime version and cleanup --- mpi/cuda-mpich-base/dockerfile.dockerfile | 54 ++++++++++++++--------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/mpi/cuda-mpich-base/dockerfile.dockerfile b/mpi/cuda-mpich-base/dockerfile.dockerfile index abde0c2..1d81f4c 100644 --- a/mpi/cuda-mpich-base/dockerfile.dockerfile +++ b/mpi/cuda-mpich-base/dockerfile.dockerfile @@ -6,6 +6,7 @@ ARG MPICH_VERSION="3.4.3" ARG MPI4PY_VERSION="3.1.5" ARG ENABLE_OSU="1" ARG CUDA_VERSION="13-0" +ARG CUDA_RUNTIME_VERSION="13.0.2" ARG OSU_VERSION="7.3" ARG IMAGE_NAME="nvidia/cuda" @@ -17,6 +18,7 @@ ARG LINUX_KERNEL ARG LIBFABRIC_VERSION ARG MPICH_VERSION ARG CUDA_VERSION +ARG CUDA_RUNTIME_VERSION ARG OSU_VERSION ARG ENABLE_OSU ENV DEBIAN_FRONTEND=noninteractive @@ -54,7 +56,9 @@ RUN wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/ libcudnn9-dev-cuda-12 \ libnccl2 libnccl-dev \ && rm -rf /var/lib/apt/lists/* \ - && ln -s /usr/local/cuda-13.0 /usr/local/cuda + && CUDA_MAJOR=$(echo ${CUDA_VERSION} | cut -d'-' -f1) \ + && CUDA_MINOR=$(echo ${CUDA_VERSION} | cut -d'-' -f2) \ + && ln -sf /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} /usr/local/cuda ENV PATH="/usr/local/cuda/bin:${PATH}" ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" @@ -88,7 +92,7 @@ RUN mkdir -p /tmp/build && cd /tmp/build \ && cd libfabric-${LIBFABRIC_VERSION} \ && ./autogen.sh && ./configure \ && make -j"$(nproc)" && make install \ - && rm -rf /tmp/build/libfabric-* + && cd / && rm -rf /tmp/build # Build Lustre client RUN mkdir -p /tmp/lustre-build && cd /tmp/lustre-build \ @@ -116,7 +120,7 @@ ARG MPICH_CONFIGURE_OPTIONS="--prefix=/usr --without-mpe --enable-fortran=all -- --with-shared-memory=sysv --disable-allowport --with-pm=gforker \ --with-file-system=ufs+lustre+nfs \ --enable-threads=runtime --enable-fast=O2 --enable-thread-cs=global \ -CC=gcc-12 CXX=g++-12 FC=gfortran-12 FFLAGS=-fallow-argument-mismatch" # <<< CHANGED: 加 prefix=/usr +CC=gcc-12 CXX=g++-12 FC=gfortran-12 FFLAGS=-fallow-argument-mismatch" COPY mpich_patches.tgz /tmp/ RUN echo "Building MPICH..." \ && mkdir -p /tmp/mpich-build && cd /tmp/mpich-build \ @@ -166,14 +170,6 @@ RUN if [ "${ENABLE_OSU}" = "1" ]; then \ echo "Done"; \ fi -# Check installed files for debugging - find actual library locations -RUN echo "=== Checking library locations ===" \ - && echo "--- Lustre libs ---" && find /usr -name "liblustreapi*" -type f 2>/dev/null \ - && echo "--- libfabric libs ---" && find /usr -name "libfabric*" -type f 2>/dev/null \ - && echo "--- MPI libs ---" && find /usr -name "libmpi*" -type f 2>/dev/null | head -10 \ - && echo "--- MPICH libs ---" && find /usr -name "libmpich*" -type f 2>/dev/null | head -10 \ - && echo "=== Done ===" - # Create a tar archive of optional MPI binaries (handles missing hydra* and parkill) RUN mkdir -p /tmp/mpi-binaries-optional \ && (cp -a /usr/bin/hydra* /tmp/mpi-binaries-optional/ 2>/dev/null || true) \ @@ -182,7 +178,8 @@ RUN mkdir -p /tmp/mpi-binaries-optional \ tar czf /tmp/mpi-binaries-optional.tar.gz -C /tmp/mpi-binaries-optional .; \ else \ touch /tmp/mpi-binaries-optional.tar.gz; \ - fi + fi \ + && rm -rf /tmp/mpi-binaries-optional # Create a tar archive of MPI header files (required for mpi4py compilation) RUN mkdir -p /tmp/mpi-headers \ @@ -195,10 +192,16 @@ RUN mkdir -p /tmp/mpi-headers \ tar czf /tmp/mpi-headers.tar.gz -C /tmp/mpi-headers .; \ else \ echo "Warning: No MPI headers found" && touch /tmp/mpi-headers.tar.gz; \ - fi + fi \ + && rm -rf /tmp/mpi-headers # ====================== Stage 2: Runtime (minimal runtime environment) ====================== -FROM ${IMAGE_NAME}:13.0.2-runtime-ubuntu${OS_VERSION} AS runtime +# CUDA runtime version for base image tag (e.g., "13.0.2" for CUDA_VERSION="13-0") +# This should match the patch version of the CUDA runtime image +ARG CUDA_RUNTIME_VERSION="13.0.2" +ARG OS_VERSION +ARG IMAGE_NAME +FROM ${IMAGE_NAME}:${CUDA_RUNTIME_VERSION}-runtime-ubuntu${OS_VERSION} AS runtime ARG MPI4PY_VERSION ARG CUDA_VERSION @@ -244,8 +247,12 @@ COPY --from=builder /usr/local/lib/libfabric* /usr/local/lib/ RUN mkdir -p /usr/lib /usr/local/lib /usr/lib64 COPY --from=builder /usr/lib/liblustreapi* /usr/lib/ -# OSU benchmarks +# OSU benchmarks (only if enabled) +RUN mkdir -p /usr/local/libexec COPY --from=builder /usr/local/libexec/osu-micro-benchmarks /usr/local/libexec/osu-micro-benchmarks +RUN if [ "${ENABLE_OSU}" != "1" ] && [ -d /usr/local/libexec/osu-micro-benchmarks ]; then \ + rm -rf /usr/local/libexec/osu-micro-benchmarks; \ + fi RUN ldconfig @@ -257,13 +264,16 @@ RUN apt-get remove -y gcc-12 g++-12 gfortran-12 libc6-dev python3-dev \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* -ENV PATH="/usr/local/libexec/osu-micro-benchmarks/mpi/collective:/usr/local/libexec/osu-micro-benchmarks/mpi/one-sided:/usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt:/usr/local/libexec/osu-micro-benchmarks/mpi/startup:$PATH" \ - NCCL_SOCKET_IFNAME=hsn \ +ENV NCCL_SOCKET_IFNAME=hsn \ CXI_FORK_SAFE=1 \ CXI_FORK_SAFE_HP=1 \ FI_CXI_DISABLE_CQ_HUGETLB=1 \ CUDA_PATH=/usr/local/cuda \ - LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} + LD_LIBRARY_PATH=/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} +# Add OSU to PATH if enabled (paths are harmless if directory doesn't exist) +RUN if [ "${ENABLE_OSU}" = "1" ]; then \ + echo 'export PATH="/usr/local/libexec/osu-micro-benchmarks/mpi/collective:/usr/local/libexec/osu-micro-benchmarks/mpi/one-sided:/usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt:/usr/local/libexec/osu-micro-benchmarks/mpi/startup:$PATH"' >> /etc/profile.d/osu.sh; \ + fi # Singularity environment injection RUN mkdir -p /.singularity.d/env/ \ @@ -272,15 +282,17 @@ RUN mkdir -p /.singularity.d/env/ \ && echo "export CXI_FORK_SAFE_HP=1" >> /.singularity.d/env/91-environment.sh \ && echo "export FI_CXI_DISABLE_CQ_HUGETLB=1" >> /.singularity.d/env/91-environment.sh \ && echo "export CUDA_PATH=/usr/local/cuda" >> /.singularity.d/env/91-environment.sh \ - && echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:\${LD_LIBRARY_PATH}" >> /.singularity.d/env/91-environment.sh + && echo "export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:\${LD_LIBRARY_PATH}" >> /.singularity.d/env/91-environment.sh RUN rm -rf /usr/share/doc/* /usr/share/man/* /usr/share/locale/* || true RUN echo "=== Runtime libraries check ===" \ && ls -lh /usr/lib/liblustreapi* || echo "No Lustre libs" \ && ls -lh /usr/lib/libmpi* || echo "No MPI libs" \ - && which mpicc || echo "No mpicc" \ - && which mpirun || echo "No mpirun" + && echo "--- MPI executables ---" \ + && (which mpicc && echo "mpicc: $(which mpicc)") || echo "No mpicc" \ + && (which mpiexec && echo "mpiexec: $(which mpiexec)") || echo "No mpiexec" \ + && (which mpirun && echo "mpirun: $(which mpirun)") || echo "No mpirun (using gforker PM, mpiexec should be used instead)" WORKDIR /workspace LABEL org.opencontainers.image.version=0.0.1 org.opencontainers.image.devmode=true org.opencontainers.image.noscan=true org.opencontainers.image.platform=arm