Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
089d7d2
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=false p…
Dec 12, 2025
bb42d84
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=false p…
Dec 12, 2025
40b221f
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=false p…
Dec 12, 2025
e066b8c
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
37a5669
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
a9ea16e
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
2a201ae
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
ae9658d
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
2d60edb
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
1a2b9bd
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
693c030
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
c38d953
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
74b10b7
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
2b39f27
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
7bf0c8f
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
8afc94a
feat(upload): mpi/cuda-mpich-base v0.0.1 [devmode=true noscan=true pl…
Dec 12, 2025
c9f6b72
feat(docker): add optional MPI binaries handling in cuda-mpich-base D…
Quleaf Dec 12, 2025
a3a78f8
feat(docker): enhance cuda-mpich-base Dockerfile with build tools and…
Quleaf Dec 12, 2025
d4d374c
feat(docker): add MPI header files handling in cuda-mpich-base Docker…
Quleaf Dec 12, 2025
05b2ece
feat(docker): update cuda-mpich-base Dockerfile with CUDA runtime ver…
Quleaf Dec 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
298 changes: 298 additions & 0 deletions mpi/cuda-mpich-base/dockerfile.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
# ========================= Common Args =========================
# Global build arguments. They are declared before the first FROM so that FROM
# lines can reference them; each stage re-declares (without a value) the ones
# it uses in its own instructions.

# Base OS release and the kernel release whose headers are installed in the builder
ARG OS_VERSION=24.04
ARG LINUX_KERNEL=6.8.0-31

# Communication stack versions
ARG LIBFABRIC_VERSION=1.18.1
ARG MPICH_VERSION=3.4.3
ARG MPI4PY_VERSION=3.1.5

# CUDA: apt package suffix ("<major>-<minor>"), runtime image tag version, image repository
ARG CUDA_VERSION=13-0
ARG CUDA_RUNTIME_VERSION=13.0.2
ARG IMAGE_NAME=nvidia/cuda

# OSU micro-benchmarks: "1" enables the build; version of the tarball to fetch
ARG ENABLE_OSU=1
ARG OSU_VERSION=7.3

# ====================== Stage 1: Builder (full build environment) ======================
FROM ubuntu:${OS_VERSION} AS builder

# Bring the global build arguments into this stage's scope
ARG CUDA_RUNTIME_VERSION
ARG CUDA_VERSION
ARG ENABLE_OSU
ARG LIBFABRIC_VERSION
ARG LINUX_KERNEL
ARG MPICH_VERSION
ARG OS_VERSION
ARG OSU_VERSION

# Keep apt/dpkg from prompting during package installation in this stage
ENV DEBIAN_FRONTEND="noninteractive"

# Build toolchain & headers
# One package per line, sorted alphabetically, so additions/removals diff cleanly.
# The apt package index is deleted in the same layer.
RUN apt-get update -qq \
    && apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        bison \
        build-essential \
        ca-certificates \
        curl \
        devscripts \
        dpkg-dev \
        fakeroot \
        flex \
        g++-12 \
        gcc-12 \
        gcovr \
        gdb \
        gfortran-12 \
        git \
        gnupg \
        gnupg2 \
        hwloc \
        libc6-dev \
        libcrypt-dev \
        libcurl4-openssl-dev \
        libhwloc-dev \
        libkeyutils-dev \
        libmount-dev \
        libnl-genl-3-dev \
        libnuma-dev \
        libpython3-dev \
        libreadline-dev \
        libssl-dev \
        libtool \
        libyaml-dev \
        linux-headers-${LINUX_KERNEL} \
        linux-headers-${LINUX_KERNEL}-generic \
        linux-source \
        linux-tools-generic \
        ltrace \
        m4 \
        make \
        ninja-build \
        openssh-server \
        patch \
        patchelf \
        pkg-config \
        python3-dev \
        python3-numpy \
        python3-pip \
        python3-scipy \
        python3-setuptools \
        python3-six \
        python3-venv \
        software-properties-common \
        strace \
        subversion \
        sudo \
        swig \
        tzdata \
        valgrind \
        vim \
        wget \
        xsltproc \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install CUDA packages from NVIDIA repository
# CUDA_VERSION uses the apt package suffix form "<major>-<minor>" (e.g. "13-0").
# It is split into CUDA_MAJOR / CUDA_MINOR up front so that:
#   * the cuDNN dev package is selected for the same CUDA major as the toolkit
#     (it was hard-coded to "cuda-12", which does not match CUDA_VERSION=13-0);
#   * /usr/local/cuda can be pointed at /usr/local/cuda-<major>.<minor>.
# `ln -sfn` is used because /usr/local/cuda may already exist as a symlink to a
# directory; without -n the new link would be created inside that directory
# instead of replacing the link itself.
# NOTE(review): the keyring URL targets the ubuntu2404/sbsa (arm64 server) repo.
RUN CUDA_MAJOR=$(echo ${CUDA_VERSION} | cut -d'-' -f1) \
    && CUDA_MINOR=$(echo ${CUDA_VERSION} | cut -d'-' -f2) \
    && wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \
    && dpkg -i cuda-keyring_1.1-1_all.deb \
    && rm cuda-keyring_1.1-1_all.deb \
    && apt-get update -qq \
    && apt-get install -y --no-install-recommends \
        cuda-cudart-dev-${CUDA_VERSION} \
        cuda-nvcc-${CUDA_VERSION} \
        cuda-crt-${CUDA_VERSION} \
        cuda-cudart-${CUDA_VERSION} \
        cuda-driver-dev-${CUDA_VERSION} \
        cuda-libraries-dev-${CUDA_VERSION} \
        libcudnn9-dev-cuda-${CUDA_MAJOR} \
        libnccl2 libnccl-dev \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sfn /usr/local/cuda-${CUDA_MAJOR}.${CUDA_MINOR} /usr/local/cuda

# Expose the CUDA toolchain to the build steps that follow in this stage.
ENV PATH="/usr/local/cuda/bin:${PATH}"
# LD_LIBRARY_PATH is not defined earlier in this stage, so appending
# ":${LD_LIBRARY_PATH}" would expand to a trailing ":" -- an empty search-path
# entry, which the dynamic loader interprets as the current directory.
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64"
ENV CUDA_HOME="/usr/local/cuda"

# Modern CMake
# Runs the Kitware aarch64 self-extracting installer with --include-subdir, so
# the tree lands in /usr/<package name>; cmake, ctest and cpack are then linked
# into /usr/bin and the installer script is removed.
RUN CMAKE_PKG="cmake-3.31.7-linux-aarch64" \
    && wget -q "https://github.com/Kitware/CMake/releases/download/v3.31.7/${CMAKE_PKG}.sh" \
    && chmod +x "${CMAKE_PKG}.sh" \
    && "./${CMAKE_PKG}.sh" --skip-license --prefix=/usr --include-subdir \
    && for tool in cmake ctest cpack; do \
         ln -sf "/usr/${CMAKE_PKG}/bin/${tool}" "/usr/bin/${tool}" || exit 1; \
       done \
    && cmake --version \
    && rm -f "${CMAKE_PKG}.sh"

# Kernel config for Lustre
# Adds a deb-src entry, fetches the Ubuntu "linux" source package into /tmp,
# exports the arm64 / generic-flavour kernel configuration from the package's
# annotations script, and copies it as .config into the kernel headers build
# directory for ${LINUX_KERNEL}-generic (the directory the Lustre configure
# step is later pointed at via --with-linux). The source tree is removed again
# at the end of the same layer.
# NOTE(review): the deb-src line hard-codes the "noble" suite; presumably it
# has to be kept in step with OS_VERSION -- confirm.
RUN echo "deb-src http://archive.ubuntu.com/ubuntu noble main restricted" >> /etc/apt/sources.list \
&& apt-get update -qq \
&& cd /tmp \
&& apt-get source linux \
&& cd linux-* \
&& chmod +x ./debian/scripts/misc/annotations \
&& ./debian/scripts/misc/annotations --arch arm64 --flavour generic --export > .config \
&& cp .config /usr/lib/modules/${LINUX_KERNEL}-generic/build/ \
&& cd /tmp && rm -rf linux-*

# Build libfabric
# Fetches the tagged release tarball from GitHub, builds it with the default
# configure options and installs it; the scratch directory is deleted in the
# same layer.
RUN mkdir -p /tmp/build \
    && cd /tmp/build \
    && wget -q "https://github.com/ofiwg/libfabric/archive/refs/tags/v${LIBFABRIC_VERSION}.tar.gz" \
    && tar xf "v${LIBFABRIC_VERSION}.tar.gz" \
    && cd "libfabric-${LIBFABRIC_VERSION}" \
    && ./autogen.sh \
    && ./configure \
    && make -j"$(nproc)" \
    && make install \
    && cd / \
    && rm -rf /tmp/build

# Build Lustre client
# Shallow-clones lustre-release (no tag or branch is given, so whatever the
# repository's default branch currently points at is built), trying up to three
# times with a 5 second pause after a failed attempt. If every attempt fails the
# loop itself still exits successfully and the build stops at `cd lustre-release`.
# The client is configured against the kernel headers build directory for
# ${LINUX_KERNEL}-generic, with server components and tests disabled, and
# attribute warnings are kept from being treated as errors.
# The build directory is removed in the same layer.
RUN mkdir -p /tmp/lustre-build && cd /tmp/lustre-build \
&& for i in 1 2 3; do \
echo "Cloning Lustre (attempt $i)..." && \
git clone --depth 1 https://github.com/lustre/lustre-release.git && break || { \
echo "Clone failed. Retrying in 5s..."; sleep 5; \
}; \
done \
&& cd lustre-release \
&& bash autogen.sh \
&& ./configure --disable-server --enable-client \
--with-linux=/usr/lib/modules/${LINUX_KERNEL}-generic/build \
--disable-tests \
CFLAGS=-Wno-error=attribute-warning \
&& make -j"$(nproc)" \
&& make install \
&& ldconfig \
&& cd / && rm -rf /tmp/lustre-build

# Build MPICH with Lustre support
# Configure options (overridable via --build-arg MPICH_CONFIGURE_OPTIONS):
#   * installs under /usr, shared libraries, Fortran bindings enabled;
#   * ch4:ofi device, gforker process manager;
#   * file systems ufs+lustre+nfs;
#   * built with the gcc-12 / g++-12 / gfortran-12 toolchain.
# mpich_patches.tgz (from the build context) is unpacked inside the MPICH
# source tree and csel.patch / ch4r_init.patch are applied before configuring.
# The source tarball is fetched over HTTPS rather than plain HTTP so the
# download cannot be tampered with in transit.
# NOTE(review): no checksum is verified for the tarball -- consider pinning one.
ARG MPICH_CONFIGURE_OPTIONS="--prefix=/usr --without-mpe --enable-fortran=all --enable-shared --enable-sharedlibs=gcc \
--enable-debuginfo --enable-yield=sched_yield --enable-g=mem \
--with-device=ch4:ofi --with-namepublisher=file \
--with-shared-memory=sysv --disable-allowport --with-pm=gforker \
--with-file-system=ufs+lustre+nfs \
--enable-threads=runtime --enable-fast=O2 --enable-thread-cs=global \
CC=gcc-12 CXX=g++-12 FC=gfortran-12 FFLAGS=-fallow-argument-mismatch"
COPY mpich_patches.tgz /tmp/
RUN echo "Building MPICH..." \
    && mkdir -p /tmp/mpich-build && cd /tmp/mpich-build \
    && wget -q https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz \
    && tar xf mpich-${MPICH_VERSION}.tar.gz \
    && cd mpich-${MPICH_VERSION} \
    && tar xf /tmp/mpich_patches.tgz \
    && patch -p0 < csel.patch \
    && patch -p0 < ch4r_init.patch \
    && ./configure ${MPICH_CONFIGURE_OPTIONS} \
    && make -j"$(nproc)" \
    && make install \
    && ldconfig \
    && cd / && rm -rf /tmp/mpich-build \
    && echo "Finished building MPICH"

# Build aws-ofi-nccl (CUDA NCCL plugin for libfabric)
# RUN echo "Build aws-ofi-nccl" \
# && cd /tmp \
# && git clone --depth 1 https://github.com/aws/aws-ofi-nccl.git \
# && cd aws-ofi-nccl \
# && ./autogen.sh \
# && CC=gcc-12 CXX=g++-12 \
# ./configure --prefix=/usr \
# --with-mpi=/usr \
# --with-libfabric=/usr \
# --with-cuda=/usr/local/cuda \
# LDFLAGS="-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib64/stubs" \
# && make -j"$(nproc)" \
# && make install \
# && ldconfig \
# && cd /tmp && rm -rf aws-ofi-nccl \
# && echo "Done"

# Build OSU microbenchmarks
# Built only when ENABLE_OSU is "1": compiled with the MPI compiler wrappers,
# CUDA support enabled, installed under /usr/local.
# When OSU is disabled an empty /usr/local/libexec/osu-micro-benchmarks
# directory is still created: the runtime stage copies that path from this
# stage unconditionally, and COPY fails the build if the source path is missing.
# The tarball is fetched over HTTPS rather than plain HTTP.
ARG OSU_CONFIGURE_OPTIONS="--prefix=/usr/local CC=mpicc CXX=mpicxx CFLAGS=-O3 --enable-cuda --with-cuda=/usr/local/cuda"
RUN if [ "${ENABLE_OSU}" = "1" ]; then \
        echo "Building OSU..." && \
        cd /tmp && \
        wget -q https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${OSU_VERSION}.tar.gz && \
        tar xf osu-micro-benchmarks-${OSU_VERSION}.tar.gz && \
        cd osu-micro-benchmarks-${OSU_VERSION} && \
        ./configure ${OSU_CONFIGURE_OPTIONS} && \
        make -j"$(nproc)" && \
        make install && \
        cd /tmp && rm -rf osu-micro-benchmarks-* && \
        echo "Done"; \
    else \
        mkdir -p /usr/local/libexec/osu-micro-benchmarks; \
    fi

# Create a tar archive of optional MPI binaries (handles missing hydra* and parkill)
# Whatever exists is staged in a scratch directory and archived. When nothing
# was staged a zero-byte file is written instead, so the archive path always
# exists for the runtime stage to copy.
RUN OPT_DIR=/tmp/mpi-binaries-optional \
    && mkdir -p "${OPT_DIR}" \
    && { cp -a /usr/bin/hydra* "${OPT_DIR}/" 2>/dev/null || true; } \
    && { cp -a /usr/bin/parkill "${OPT_DIR}/" 2>/dev/null || true; } \
    && if [ -z "$(ls -A "${OPT_DIR}"/* 2>/dev/null)" ]; then \
         touch "${OPT_DIR}.tar.gz"; \
       else \
         tar czf "${OPT_DIR}.tar.gz" -C "${OPT_DIR}" .; \
       fi \
    && rm -rf "${OPT_DIR}"

# Create a tar archive of MPI header files (required for mpi4py compilation)
# Everything under /usr/include matching mpi*, mpif* and (when /usr/include/mpich
# is a directory) mpich* is staged in /tmp/mpi-headers and archived to
# /tmp/mpi-headers.tar.gz. Copy failures are ignored; if nothing was staged a
# warning is printed and a zero-byte archive file is created so the path still
# exists for the runtime stage.
# NOTE(review): the mpif* and mpich* patterns are subsets of mpi*, so the second
# and third copies look redundant -- confirm before simplifying.
RUN mkdir -p /tmp/mpi-headers \
&& (cp -a /usr/include/mpi* /tmp/mpi-headers/ 2>/dev/null || true) \
&& (cp -a /usr/include/mpif* /tmp/mpi-headers/ 2>/dev/null || true) \
&& if [ -d /usr/include/mpich ]; then \
cp -r /usr/include/mpich* /tmp/mpi-headers/ 2>/dev/null || true; \
fi \
&& if [ -n "$(ls -A /tmp/mpi-headers/* 2>/dev/null)" ]; then \
tar czf /tmp/mpi-headers.tar.gz -C /tmp/mpi-headers .; \
else \
echo "Warning: No MPI headers found" && touch /tmp/mpi-headers.tar.gz; \
fi \
&& rm -rf /tmp/mpi-headers

# ====================== Stage 2: Runtime (minimal runtime environment) ======================
# The base image reference is assembled from the global ARGs IMAGE_NAME,
# CUDA_RUNTIME_VERSION and OS_VERSION declared before the first FROM
# (with the defaults: nvidia/cuda:13.0.2-runtime-ubuntu24.04).
# FROM only resolves ARGs from that global scope, so they are not re-declared
# here: an ARG placed after the builder's FROM would belong to the builder stage
# and would not influence this line.
# CUDA_RUNTIME_VERSION should match the patch version of the CUDA runtime image
# that corresponds to CUDA_VERSION (e.g. "13.0.2" for CUDA_VERSION="13-0").
FROM ${IMAGE_NAME}:${CUDA_RUNTIME_VERSION}-runtime-ubuntu${OS_VERSION} AS runtime

# Bring the global build arguments used by this stage into scope
ARG MPI4PY_VERSION
ARG CUDA_VERSION
ARG ENABLE_OSU
# Build-time only: visible to the RUN steps of this stage (so apt/dpkg do not
# prompt) but, unlike ENV, not persisted into the environment of containers
# started from the final image.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime packages plus the compilers/headers needed to build mpi4py later in
# this stage. One package per line, sorted alphabetically.
RUN apt-get update -qq \
    && apt-get install -y --no-install-recommends \
        bash \
        ca-certificates \
        g++-12 \
        gcc-12 \
        gfortran-12 \
        gnupg \
        keyutils \
        libc6-dev \
        libgcc-s1 \
        libgfortran5 \
        libnuma1 \
        libstdc++6 \
        libyaml-0-2 \
        lsb-release \
        python3 \
        python3-dev \
        python3-pip \
        python3-venv \
        tzdata \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Copy runtime files from builder using multi-stage COPY
# MPI libraries and binaries (installed to /usr with --prefix=/usr)
# NOTE(review): the libmpich* pattern is a subset of libmpi*, so the second COPY
# presumably re-copies files already matched by the first -- confirm.
COPY --from=builder /usr/lib/libmpi* /usr/lib/
COPY --from=builder /usr/lib/libmpich* /usr/lib/
COPY --from=builder /usr/lib/libmpl* /usr/lib/
COPY --from=builder /usr/lib/libopa* /usr/lib/
COPY --from=builder /usr/bin/mpi* /usr/bin/
# Copy MPI header files (required for mpi4py compilation)
# The archive is unpacked into /usr/include only when it is non-empty (-s); the
# builder writes a zero-byte file when it found no headers. Extraction errors
# are suppressed and ignored, so a broken archive does not fail this step.
# The archive is deleted afterwards.
COPY --from=builder /tmp/mpi-headers.tar.gz /tmp/
RUN mkdir -p /usr/include \
&& if [ -s /tmp/mpi-headers.tar.gz ]; then \
tar xzf /tmp/mpi-headers.tar.gz -C /usr/include/ 2>/dev/null || true; \
fi \
&& rm -f /tmp/mpi-headers.tar.gz
# Copy optional binaries (hydra* and parkill may not exist with gforker PM)
# Same pattern as the headers: unpack into /usr/bin only when the archive is
# non-empty, ignore extraction errors, then delete the archive.
COPY --from=builder /tmp/mpi-binaries-optional.tar.gz /tmp/
RUN if [ -s /tmp/mpi-binaries-optional.tar.gz ]; then \
tar xzf /tmp/mpi-binaries-optional.tar.gz -C /usr/bin/ 2>/dev/null || true; \
fi \
&& rm -f /tmp/mpi-binaries-optional.tar.gz

# libfabric (installed to /usr/local by default)
COPY --from=builder /usr/local/lib/libfabric* /usr/local/lib/

# Lustre libraries
# The RUN below only makes sure the candidate library directories exist in this
# stage; the COPY itself takes liblustreapi* from the builder's /usr/lib only.
# NOTE(review): if the Lustre build installs into a different directory
# (e.g. /usr/lib64) this COPY will not pick the libraries up -- confirm the
# actual install location.
RUN mkdir -p /usr/lib /usr/local/lib /usr/lib64
COPY --from=builder /usr/lib/liblustreapi* /usr/lib/

# OSU benchmarks (only if enabled)
# The COPY is unconditional (COPY has no conditional form); the following RUN
# removes the directory again when ENABLE_OSU is not "1".
# NOTE(review): COPY requires /usr/local/libexec/osu-micro-benchmarks to exist
# in the builder stage even when OSU is disabled -- verify the builder
# guarantees that, otherwise this step fails the build.
RUN mkdir -p /usr/local/libexec
COPY --from=builder /usr/local/libexec/osu-micro-benchmarks /usr/local/libexec/osu-micro-benchmarks
RUN if [ "${ENABLE_OSU}" != "1" ] && [ -d /usr/local/libexec/osu-micro-benchmarks ]; then \
rm -rf /usr/local/libexec/osu-micro-benchmarks; \
fi

# Refresh the dynamic linker cache so the libraries copied from the builder
# are resolvable.
RUN ldconfig

# Install mpi4py (requires build tools)
# --no-cache-dir keeps pip's download/wheel cache out of the image layer.
# --break-system-packages is needed because the install targets the
# distribution-managed Python environment rather than a virtual environment.
RUN pip install --no-cache-dir --break-system-packages mpi4py==${MPI4PY_VERSION}

# Remove build tools (keep only runtime libraries)
# NOTE(review): these packages were installed in an earlier layer, so removing
# them here takes them out of the final filesystem but does not reclaim the
# space of the layer that installed them.
RUN apt-get remove -y gcc-12 g++-12 gfortran-12 libc6-dev python3-dev \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

# Runtime environment: CUDA location, CXI/libfabric and NCCL settings, and a
# library search path that puts /usr/local/lib and the CUDA library directories
# in front of whatever the base image already defines.
ENV CUDA_PATH="/usr/local/cuda" \
    CXI_FORK_SAFE="1" \
    CXI_FORK_SAFE_HP="1" \
    FI_CXI_DISABLE_CQ_HUGETLB="1" \
    LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH}" \
    NCCL_SOCKET_IFNAME="hsn"
# When OSU is enabled, append a profile script that puts the benchmark
# directories on PATH; nothing is written otherwise.
RUN [ "${ENABLE_OSU}" != "1" ] \
    || echo 'export PATH="/usr/local/libexec/osu-micro-benchmarks/mpi/collective:/usr/local/libexec/osu-micro-benchmarks/mpi/one-sided:/usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt:/usr/local/libexec/osu-micro-benchmarks/mpi/startup:$PATH"' >> /etc/profile.d/osu.sh

# Singularity environment injection
# Appends export lines for the same variables set via ENV in this stage to
# /.singularity.d/env/91-environment.sh. The escaped \${LD_LIBRARY_PATH} is
# written literally so it is expanded when the script is sourced, not at build time.
RUN mkdir -p /.singularity.d/env/ \
    && { \
         echo "export NCCL_SOCKET_IFNAME=hsn" \
         && echo "export CXI_FORK_SAFE=1" \
         && echo "export CXI_FORK_SAFE_HP=1" \
         && echo "export FI_CXI_DISABLE_CQ_HUGETLB=1" \
         && echo "export CUDA_PATH=/usr/local/cuda" \
         && echo "export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:\${LD_LIBRARY_PATH}"; \
       } >> /.singularity.d/env/91-environment.sh

# Drop documentation, man pages and locale data; failures are ignored.
RUN rm -rf /usr/share/doc/* /usr/share/man/* /usr/share/locale/* || true

# Informational sanity check printed into the build log: lists the Lustre and
# MPI libraries and reports where mpicc / mpiexec / mpirun resolve on PATH.
# Every probe has an `|| echo ...` fallback, so a missing item is reported in
# the log rather than stopping the chain at that probe.
# NOTE(review): because the final element is an `|| echo`, this step appears to
# always succeed and therefore does not gate the build -- confirm that is intended.
RUN echo "=== Runtime libraries check ===" \
&& ls -lh /usr/lib/liblustreapi* || echo "No Lustre libs" \
&& ls -lh /usr/lib/libmpi* || echo "No MPI libs" \
&& echo "--- MPI executables ---" \
&& (which mpicc && echo "mpicc: $(which mpicc)") || echo "No mpicc" \
&& (which mpiexec && echo "mpiexec: $(which mpiexec)") || echo "No mpiexec" \
&& (which mpirun && echo "mpirun: $(which mpirun)") || echo "No mpirun (using gforker PM, mpiexec should be used instead)"

# Default working directory for containers started from this image
WORKDIR /workspace
# Image metadata labels: version, devmode, noscan and platform
LABEL org.opencontainers.image.version="0.0.1" \
      org.opencontainers.image.devmode="true" \
      org.opencontainers.image.noscan="true" \
      org.opencontainers.image.platform="arm"
36 changes: 36 additions & 0 deletions mpi/cuda-mpich-base/manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"main": "mpi",
"feature": "cuda-mpich-base",
"version": "0.0.1",
"devmode": true,
"noscan": true,
"platform": "arm",
"shpc": false,
"targets": [
"setonix-registry"
],
"private-targets": [
"liu268"
],
"github": {
"repo": "Quleaf/ImageManagerAction",
"branch": "cicd-mpi/cuda-mpich-base",
"path": "mpi/cuda-mpich-base/"
},
"metadata": {
"timestamp": "2025-12-12T03:40:54.769384Z",
"correlation_id": "812057ba-44ee-41f1-8249-2438f9e8b78b"
},
"template": {
"OS_VERSION": "24.04",
"LINUX_KERNEL": "6.8.0-31",
"LIBFABRIC_VERSION": "1.18.1",
"MPICH_VERSION": "3.4.3",
"MPI4PY_VERSION": "3.1.5",
"ENABLE_OSU": "1",
"CUDA_VERSION": "13-0",
"OSU_VERSION": "7.3",
"IMAGE_NAME": "nvidia/cuda",
"OSU_CONFIGURE_OPTIONS": "--prefix=/usr/local CC=mpicc CXX=mpicxx CFLAGS=-O3 --enable-cuda --with-cuda=/usr/local/cuda"
}
}
Binary file added mpi/cuda-mpich-base/mpich_patches.tgz
Binary file not shown.