
GEOS latest branch failing to build on HPC cluster with Slurm-enabled OpenMPI with CUDA: hypre_CheckMemoryLocation error #3907


Describe the bug

The build fails to pass all ctests, with a repeated hypre error of the form:

hypre_CheckMemoryLocation(void*, hypre_MemoryLocation): Assertion `location == location_ptr' failed.
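This assertion typically indicates a host/device memory-location mismatch inside hypre (for example, hypre built for device memory being handed host pointers, or the tests running where no GPU is visible). As a rough first check (the exact cause here is not confirmed), the following can be run in the same environment that executes ctest:

# A minimal sketch, assuming the tests are expected to run on a GPU node:
# confirm a GPU is actually visible to the process running the tests
nvidia-smi -L
echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-<unset>}"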

To Reproduce

CMake host-config script

set( CONFIG_NAME "dev-gcc-ompi-cuda" ) 

# Set compilers path
set(CMAKE_C_COMPILER "/opt/rh/gcc-toolset-12/root/usr/bin/gcc" CACHE PATH "")   # This is typically something like /usr/bin/gcc ... or clang
set(CMAKE_CXX_COMPILER "/opt/rh/gcc-toolset-12/root/usr/bin/g++" CACHE PATH "") # This is typically something like /usr/bin/g++ ... or clang++
set(CMAKE_Fortran_COMPILER "/opt/rh/gcc-toolset-12/root/usr/bin/gfortran" CACHE PATH "") # This is typically something like /usr/bin/gfortran
set(ENABLE_FORTRAN ON CACHE BOOL "" FORCE)

# Set paths to mpi
set(ENABLE_MPI ON CACHE BOOL "")
set(MPI_C_COMPILER "/sw/mpi/openmpi/4.1.6/gcc-cuda12/bin/mpicc" CACHE PATH "")    # This is typically something like /usr/bin/mpicc
set(MPI_CXX_COMPILER "/sw/mpi/openmpi/4.1.6/gcc-cuda12/bin/mpicxx" CACHE PATH "") # This is typically something like /usr/bin/mpicxx
set(MPIEXEC "/sw/mpi/openmpi/4.1.6/gcc-cuda12/bin/mpirun" CACHE PATH "")          # This is typically something like /usr/bin/mpirun

# Set paths to blas and lapack
set( BLAS_LIBRARIES "/sw/libs/openblas/0.3.25/lib/libopenblas.a" CACHE PATH "" FORCE )     # This is typically something like /usr/lib64/libblas.so 
set( LAPACK_LIBRARIES "/sw/libs/openblas/0.3.25/lib/libopenblas.a" CACHE PATH "" FORCE ) # This is typically something like /usr/lib64/liblapack.so
#set( CONDUIT_DIR "/sw/apps/GEOS-project/dev/conduit/build" CACHE PATH "" FORCE ) # This is typically something like /usr/lib64/liblapack.so



# CUDA
set(ENABLE_CUDA ON CACHE BOOL "" FORCE)
set(ENABLE_HYPRE_CUDA ON CACHE BOOL "" FORCE)

set(CUDA_TOOLKIT_ROOT_DIR "/sw/libs/cuda/12.9.1" CACHE PATH "")
set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc CACHE STRING "")
set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE PATH "")
set(CMAKE_CUDA_STANDARD 17)

set(CUDA_SEPARABLE_COMPILATION ON CACHE BOOL "")
set(CMAKE_CUDA_ARCHITECTURES "70" CACHE STRING "")
set(CUDA_ARCH "sm_70" CACHE STRING "")
#set(CMAKE_CUDA_FLAGS "-O2 -restrict -arch ${CUDA_ARCH} --extended-lambda" CACHE STRING "")
# set(CMAKE_CUDA_LINK_FLAGS "-Xlinker -rpath -Xlinker /usr/bin/mpicxx" CACHE STRING "")

#set(CMAKE_CUDA_FLAGS "-restrict --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations -arch sm_70" CACHE STRING "")
set(CMAKE_CUDA_FLAGS "-restrict --expt-extended-lambda -Wno-deprecated-gpu-targets -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "")
set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "")


# Python options
set(ENABLE_PYLVARRAY ON CACHE BOOL "")
set(ENABLE_PYGEOSX ON CACHE BOOL "")
set(Python3_ROOT_DIR "/beegfs/projects/shared_conda_envs/envs/geos_env" CACHE PATH "")
set(Python3_EXECUTABLE "/beegfs/projects/shared_conda_envs/envs/geos_env/bin/python" CACHE PATH "")
set(PYTHON_EXECUTABLE "/beegfs/projects/shared_conda_envs/envs/geos_env/bin/python" CACHE PATH "")


# OpenMP
set( ENABLE_OPENMP OFF CACHE BOOL "" FORCE )

# hypre
set(ENABLE_HYPRE ON CACHE BOOL "")
set(ENABLE_HYPRE_DEVICE "CUDA" CACHE STRING "")




# enable PAMELA and PVTPackage
set(ENABLE_PAMELA ON CACHE BOOL "" FORCE)
set(ENABLE_PVTPackage ON CACHE BOOL "" FORCE)

# OpenMP (this later setting overrides the OFF above)
set( ENABLE_OPENMP ON CACHE BOOL "" FORCE )


# TPLs
set( ENABLE_TRILINOS OFF CACHE BOOL "" FORCE )
set( ENABLE_CALIPER OFF CACHE BOOL "" FORCE )
set( ENABLE_DOXYGEN OFF CACHE BOOL "" FORCE)
set( ENABLE_MATHPRESSO OFF CACHE BOOL "" FORCE )

if (DEFINED ENV{GEOSX_TPL_DIR})
    set(GEOS_TPL_DIR "$ENV{GEOSX_TPL_DIR}" CACHE PATH "" FORCE)
endif()
#include(${CMAKE_CURRENT_LIST_DIR}/tpls.cmake)
#include(${GEOS_TPL_DIR}/install-wendian-quick-start-debug/tpls.cmake)
include(/sw/apps/GEOS-project/dev-gcc-ompi-cuda/tpls.cmake)
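For reference, a quick sanity check (using the paths from the host-config above) that the compiler wrappers and CUDA toolkit the cache variables point at actually resolve on the build node:

# Verify the tools referenced by the host-config exist and report sane versions
/opt/rh/gcc-toolset-12/root/usr/bin/g++ --version
/sw/mpi/openmpi/4.1.6/gcc-cuda12/bin/mpicxx --version
/sw/libs/cuda/12.9.1/bin/nvcc --version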

And the tpls.cmake that the host-config includes:

#
# Performance portability
#
message("in tpls.cmake GEOS_TPL_DIR=${GEOS_TPL_DIR}")

#
# General TPL Folder verifications
#
if(NOT EXISTS ${GEOS_TPL_DIR})
  message(WARNING "'GEOS_TPL_DIR' does not exist.\n")
endif()


if(EXISTS ${GEOS_TPL_DIR}/raja)
  set(RAJA_DIR ${GEOS_TPL_DIR}/raja CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/blt)
  set(BLT_DIR ${GEOS_TPL_DIR}/blt CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/chai)
  set(UMPIRE_DIR ${GEOS_TPL_DIR}/chai CACHE PATH "" FORCE)
  set(CHAI_DIR ${GEOS_TPL_DIR}/chai CACHE PATH "" FORCE)
endif()

#
# IO TPLs
#
if(EXISTS ${GEOS_TPL_DIR}/hdf5)
  set(HDF5_DIR ${GEOS_TPL_DIR}/hdf5 CACHE PATH "" FORCE)
  message(STATUS "HDF5_DIR = ${HDF5_DIR}")
endif()

if(EXISTS ${GEOS_TPL_DIR}/conduit)
  set(CONDUIT_DIR ${GEOS_TPL_DIR}/conduit CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/silo)
  set(SILO_DIR ${GEOS_TPL_DIR}/silo CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/adiak)
  set(ADIAK_DIR ${GEOS_TPL_DIR}/adiak CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/caliper)
  set(CALIPER_DIR ${GEOS_TPL_DIR}/caliper CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/pugixml)
  set(PUGIXML_DIR ${GEOS_TPL_DIR}/pugixml CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/vtk)
  set(VTK_DIR ${GEOS_TPL_DIR}/vtk CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/fmt)
  set(FMT_DIR ${GEOS_TPL_DIR}/fmt CACHE PATH "" FORCE)
endif()

#
# Math TPLs
#
if(EXISTS ${GEOS_TPL_DIR}/metis)
  set(METIS_DIR ${GEOS_TPL_DIR}/metis CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/parmetis)
  set(PARMETIS_DIR ${GEOS_TPL_DIR}/parmetis CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/superlu_dist)
  set(SUPERLU_DIST_DIR ${GEOS_TPL_DIR}/superlu_dist CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/suitesparse)
  set(SUITESPARSE_DIR ${GEOS_TPL_DIR}/suitesparse CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/trilinos)
  set(TRILINOS_DIR ${GEOS_TPL_DIR}/trilinos CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/hypre)
  set(HYPRE_DIR ${GEOS_TPL_DIR}/hypre CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/scotch)
  set(SCOTCH_DIR ${GEOS_TPL_DIR}/scotch CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/petsc AND (NOT DEFINED ENABLE_PETSC OR ENABLE_PETSC))
  set(PETSC_DIR ${GEOS_TPL_DIR}/petsc CACHE PATH "" FORCE)
endif()

#
# Development tools
#
if(EXISTS ${GEOS_TPL_DIR}/uncrustify/bin/uncrustify)
  set(UNCRUSTIFY_EXECUTABLE ${GEOS_TPL_DIR}/uncrustify/bin/uncrustify CACHE PATH "" FORCE)
endif()

if(EXISTS ${GEOS_TPL_DIR}/doxygen/bin/doxygen)
  set(DOXYGEN_EXECUTABLE ${GEOS_TPL_DIR}/doxygen/bin/doxygen CACHE PATH "" FORCE)
endif()

#
# Other
#
if(EXISTS ${GEOS_TPL_DIR}/mathpresso)
  set(MATHPRESSO_DIR ${GEOS_TPL_DIR}/mathpresso CACHE PATH "" FORCE)
endif()
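A minimal check that the TPL install tree actually contains the subdirectories the EXISTS() guards above look for (assuming GEOSX_TPL_DIR is exported as in the build script below):

# List the installed TPLs; expect raja, chai, hdf5, conduit, hypre, fmt, metis, ...
ls "$GEOSX_TPL_DIR"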

Build script

#!/bin/bash 
# the last build step requires connecting to a GPU, so I recommend running this on a GPU node interactively
source /projects/shared_conda_envs/setup_env.sh
download_source=$1
conda env remove -n geos_env -y
conda create -n geos_env -c conda-forge numpy python=3.11 -y
conda activate geos_env
module load compilers/gcc/12.2.1
module load libs/cuda/12.9.1
module load mpi/openmpi/gcc-cuda12/4.1.8

# go to GEOS project folder
build_target="dev-gcc-ompi-cuda"
env > env_geos_build.out

mkdir -p /sw/apps/GEOS-project/${build_target}-git

cp ${build_target}.cmake  /sw/apps/GEOS-project/${build_target}-git/
cp tpls.cmake  /sw/apps/GEOS-project/${build_target}-git/

cd /sw/apps/GEOS-project/${build_target}-git


if [[ "$download_source" == 1 ]]; then
	rm -rf thirdPartyLibs 
	git clone https://github.com/GEOS-DEV/thirdPartyLibs.git
	cd thirdPartyLibs 
	git submodule init
	git submodule update
else
	cd thirdPartyLibs 
fi



if [[ "$download_source" == 1 ]]; then
# grab all the tpls manually from github using wget
cd tplMirror
rm *
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/Caliper-2.12.0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/RAJA-v2025.03.0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/SuiteSparse-5.10.1.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/VTK-9.4.2.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/adiak-0.4.0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/asmjit-2e93826.zip
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/chai-2025.03.0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/conduit-0.9.2.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/doxygen-1.8.20.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/fmt-11.0.1.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/hdf5-1.12.1.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/hypre-v2.33.0-12-gbe52325a3.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/mathpresso-24d60e5.zip
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/parmetis-4.0.3-IDXTYPEWIDTH_64.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/petsc-3.19.4.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/pugixml-1.13.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/scotch-v7.0.8.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/silo-4.11.1-bsd.tar.xz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/silo-4.11.patch
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/superlu_dist-0f6efc3.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/superlu_dist-9.0.0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/trilinos-release-16-1-0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/uncrustify-401a409.zip
fi


cd /sw/apps/GEOS-project/${build_target}-git/thirdPartyLibs
python scripts/config-build.py -hc /sw/apps/GEOS-project/${build_target}-git/${build_target}.cmake

cd build-${build_target}-debug
make -j6

export GEOS_TPL_DIR=/sw/apps/GEOS-project/dev-gcc-ompi-cuda-git/thirdPartyLibs/install-dev-gcc-ompi-cuda-debug
export GEOSX_TPL_DIR=$GEOS_TPL_DIR

cd /sw/apps/GEOS-project/${build_target}-git
if [[ "$download_source" == 1 ]]; then
	rm -rf GEOS
	git clone https://github.com/GEOS-DEV/GEOS.git
	cd GEOS
	git submodule init
	git submodule update
else
	cd GEOS
fi
python scripts/config-build.py -hc /sw/apps/GEOS-project/dev-gcc-ompi-cuda/${build_target}.cmake
cd build-${build_target}-debug
make -j6
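Since the last build step needs a GPU (as noted at the top of the script), a rough sketch of running the final make and the subsequent ctest interactively on a GPU node under Slurm (partition/account options omitted; adjust for the cluster):

# Grab one GPU interactively, then run the GPU-dependent build/test steps on it
salloc -N 1 --gres=gpu:1 -t 02:00:00
srun --pty bash
cd /sw/apps/GEOS-project/dev-gcc-ompi-cuda-git/GEOS/build-dev-gcc-ompi-cuda-debug
make -j6
ctest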

With OpenMPI built with PMI

this leads to the following error at the end of the build (during the geosx_generate_install_schema step):

[g003:3126332] OPAL ERROR: Unreachable in file pmix3x_client.c at line 111
--------------------------------------------------------------------------
The application appears to have been direct launched using "srun",
but OMPI was not built with SLURM's PMI support and therefore cannot
execute. There are several options for building PMI support under
SLURM, depending upon the SLURM version you are using:

  version 16.05 or later: you can use SLURM's PMIx support. This
  requires that you configure and build SLURM --with-pmix.

  Versions earlier than 16.05: you must use either SLURM's PMI-1 or
  PMI-2 support. SLURM builds PMI-1 by default, or you can manually
  install PMI-2. You must then build Open MPI using --with-pmi pointing
  to the SLURM PMI library location.

Please configure as appropriate and try again.
--------------------------------------------------------------------------
*** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
***    and potentially your MPI job)
[g003:3126332] Local abort before MPI_INIT completed completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!
make[2]: *** [CMakeFiles/geosx_generate_install_schema.dir/build.make:71: CMakeFiles/geosx_generate_install_schema] Error 1
make[1]: *** [CMakeFiles/Makefile2:2426: CMakeFiles/geosx_generate_install_schema.dir/all] Error 2
make[1]: *** Waiting for unfinished jobs....
[100%] Linking CXX executable ../../../tests/testWavePropagationAttenuationAcousticVTI
[100%] Built target testWavePropagationAttenuationAcousticVTI
[100%] Linking CXX executable ../../../tests/testWavePropagationAcousticFirstOrder
[100%] Linking CXX executable ../../../tests/testWavePropagationAdjoint1
[100%] Built target testWavePropagationAcousticFirstOrder
[100%] Built target testWavePropagationAdjoint1
[100%] Linking CXX shared library ../lib/pygeosx.so
[100%] Built target pygeosx
make: *** [Makefile:146: all] Error 2
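The message above suggests a mismatch between how the tests are launched (direct launch via srun) and the PMI support compiled into this OpenMPI. A quick way to compare what the OpenMPI install and Slurm each offer (commands only; output depends on the installs):

# What PMI/PMIx/Slurm support was compiled into this OpenMPI?
ompi_info | grep -i -E 'pmix|pmi|slurm'
# Which MPI plugin types does Slurm's srun offer on this cluster?
srun --mpi=list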

Running ctest with this build, the notable recurring error is:

197: testFlowStatistics: memory.c:99: void hypre_CheckMemoryLocation(void*, hypre_MemoryLocation): Assertion `location == location_ptr' failed.

And the final summary:

92% tests passed, 20 tests failed out of 265

Total Test time (real) = 627.93 sec

The following tests FAILED:
          4 - blt_mpi_smoke (SEGFAULT)
          9 - blt_cuda_mpi_smoke (SEGFAULT)
         16 - testArrayOfSets (Subprocess aborted)
         85 - testPyCRSMatrix (SEGFAULT)
        107 - testMpiWrapper_mpi (SEGFAULT)
        183 - testGraphColoringMPI_mpi (SEGFAULT)
        188 - testMatrices (SEGFAULT)
        189 - testVectors (SEGFAULT)
        190 - testExternalSolvers (SEGFAULT)
        191 - testKrylovSolvers (SEGFAULT)
        192 - testReverseCutHillMcKeeOrdering (SEGFAULT)
        193 - testLAIHelperFunctions (SEGFAULT)
        195 - testHDFParallelFile (SEGFAULT)
        197 - testFlowStatistics (Failed)
        205 - testDofManager (SEGFAULT)
        233 - testNeighborCommunicator_mpi (SEGFAULT)
        234 - testVTKImport_mpi (SEGFAULT)
        235 - testSolverStats (Failed)
        238 - testTransmissibility (Failed)
        240 - testSinglePhaseMFDPolyhedral (Failed)
Errors while running CTest
Output from these tests are in: /wendianSoftware/x86_64/sw/apps/GEOS-project/dev-gcc-ompi-cuda-git/GEOS/build-dev-gcc-ompi-cuda-debug/Testing/Temporary/LastTest.log
Use "--rerun-failed --output-on-failure" to re-run the failed cases verbosely.
Job has finished!
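To get the full output of just the failing cases (including the complete hypre assertion backtrace), the failed tests can be re-run verbosely from the build directory, e.g.:

cd /wendianSoftware/x86_64/sw/apps/GEOS-project/dev-gcc-ompi-cuda-git/GEOS/build-dev-gcc-ompi-cuda-debug
ctest --rerun-failed --output-on-failure
ctest -R testFlowStatistics --output-on-failure   # or target a single failing test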

Platform:

  • Machine: On-premise HPC cluster called "Wendian" (system specs: https://rc-docs.mines.edu/pages/systems.html#wendian), x86-based with Intel CPUs + NVIDIA V100 GPUs
  • Compiler: GCC 12.2.1, CUDA 12.9.1, OpenMPI 4.1.8, Python 3.11 (conda environment with numpy installed)
  • GEOS Version: Latest dev branch (commit 889ea4e)
