-
Notifications
You must be signed in to change notification settings - Fork 97
Open
Labels
type: bugSomething isn't workingSomething isn't workingtype: newA new issue has been created and requires attentionA new issue has been created and requires attention
Description
Describe the bug
Build fails to pass all ctests; repeated hypre error of the form:
hypre_CheckMemoryLocation(void*, hypre_MemoryLocation): Assertion `location == location_ptr' failed.
To Reproduce
CMAKE script
set( CONFIG_NAME "dev-gcc-ompi-cuda" )
# Set compilers path
set(CMAKE_C_COMPILER "/opt/rh/gcc-toolset-12/root/usr/bin/gcc" CACHE PATH "") # This is typically something like /usr/bin/gcc ... or clang
set(CMAKE_CXX_COMPILER "/opt/rh/gcc-toolset-12/root/usr/bin/g++" CACHE PATH "") # This is typically something like /usr/bin/g++ ... or clang++
set(CMAKE_FC_COMPILER "/opt/rh/gcc-toolset-12/root/usr/bin/gfortran" CACHE PATH "") # This is typically something like /usr/bin/g++ ... or clang++
set(ENABLE_FORTRAN ON CACHE BOOL "" FORCE)
# Set paths to mpi
set(ENABLE_MPI ON CACHE PATH "")
set(MPI_C_COMPILER "/sw/mpi/openmpi/4.1.6/gcc-cuda12/bin/mpicc" CACHE PATH "") # This is typically something like /usr/bin/mpicc
set(MPI_CXX_COMPILER "/sw/mpi/openmpi/4.1.6/gcc-cuda12/bin/mpicxx" CACHE PATH "") # This is typically something like /usr/bin/mpicxx
set(MPIEXEC "/sw/mpi/openmpi/4.1.6/gcc-cuda12/bin/mpirun" CACHE PATH "") # This is typically something like /usr/bin/mpirun
# Set paths to blas and lapack
set( BLAS_LIBRARIES "/sw/libs/openblas/0.3.25/lib/libopenblas.a" CACHE PATH "" FORCE ) # This is typically something like /usr/lib64/libblas.so
set( LAPACK_LIBRARIES "/sw/libs/openblas/0.3.25/lib/libopenblas.a" CACHE PATH "" FORCE ) # This is typically something like /usr/lib64/liblapack.so
#set( CONDUIT_DIR "/sw/apps/GEOS-project/dev/conduit/build" CACHE PATH "" FORCE ) # This is typically something like /usr/lib64/liblapack.so
# CUDA
set(ENABLE_CUDA ON CACHE BOOL "" FORCE)
set(ENABLE_HYPRE_CUDA ON CACHE BOOL "" FORCE)
set(CUDA_TOOLKIT_ROOT_DIR "/sw/libs/cuda/12.9.1" CACHE PATH "")
set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc CACHE STRING "")
set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE PATH "")
set(CMAKE_CUDA_STANDARD 17)
set(CUDA_SEPARABLE_COMPILIATION ON CACHE BOOL "")
set(CMAKE_CUDA_ARCHITECTURES "70" CACHE STRING "")
set(CUDA_ARCH "sm_70" CACHE STRING "")
#set(CMAKE_CUDA_FLAGS "-O2 -restrict -arch ${CUDA_ARCH} --extended-lambda" CACHE STRING "")
# set(CMAKE_CUDA_LINK_FLAGS "-Xlinker –rpath –Xlinker /usr/bin/mpicxx" CACHE STRING "")
#set(CMAKE_CUDA_FLAGS "-restrict --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations -arch sm_70" CACHE STRING "")
set(CMAKE_CUDA_FLAGS "-restrict --expt-extended-lambda -Wno-deprecated-gpu-targets -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "")
set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "")
# Python options
set(ENABLE_PYLVARRAY ON CACHE BOOL "")
set(ENABLE_PYGEOSX ON CACHE BOOL "")
set(Python3_ROOT_DIR "/beegfs/projects/shared_conda_envs/envs/geos_env/bin/python" CACHE PATH "")
set(Python3_EXECUTABLE "/beegfs/projects/shared_conda_envs/envs/geos_env/bin/python" CACHE PATH "")
set(PYTHON_EXECUTABLE "/beegfs/projects/shared_conda_envs/envs/geos_env/bin/python" CACHE PATH "")
# openMP
set( ENABLE_OPENMP OFF CACHE PATH "" FORCE )
set(ENABLE_HYPRE ON CACHE BOOL "")
set(ENABLE_HYPRE_DEVICE "CUDA" CACHE BOOL "")
# enable PAMELA and PVTPackage
set(ENABLE_PAMELA ON CACHE BOOL "" FORCE)
set(ENABLE_PVTPackage ON CACHE BOOL "" FORCE)
# Python options
set(ENABLE_PYLVARRAY ON CACHE BOOL "")
set(ENABLE_PYGEOSX ON CACHE BOOL "")
set( ENABLE_OPENMP ON CACHE PATH "" FORCE )
# TPLs
set( ENABLE_TRILINOS OFF CACHE PATH "" FORCE )
set( ENABLE_CALIPER OFF CACHE PATH "" FORCE )
set( ENABLE_DOXYGEN OFF CACHE BOOL "" FORCE)
set( ENABLE_MATHPRESSO OFF CACHE BOOL "" FORCE )
if (DEFINED ENV{GEOSX_TPL_DIR})
set(GEOS_TPL_DIR "$ENV{GEOSX_TPL_DIR}" CACHE PATH "" FORCE)
endif()
#include(${CMAKE_CURRENT_LIST_DIR}/tpls.cmake)
#include(${GEOS_TPL_DIR}/install-wendian-quick-start-debug/tpls.cmake)
include(/sw/apps/GEOS-project/dev-gcc-ompi-cuda/tpls.cmake)
and the tpls.cmake:
#
# Performance portability
#
message("in tpls.cmake GEOS_TPL_DIR=${GEOS_TPL_DIR}")
#
# General TPL Folder verifications
#
if(NOT EXISTS ${GEOS_TPL_DIR})
message(WARNING "'GEOS_TPL_DIR' does not exist.\n")
endif()
if(EXISTS ${GEOS_TPL_DIR}/raja)
set(RAJA_DIR ${GEOS_TPL_DIR}/raja CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/blt)
set(BLT_DIR ${GEOS_TPL_DIR}/blt CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/chai)
set(UMPIRE_DIR ${GEOS_TPL_DIR}/chai CACHE PATH "" FORCE)
set(CHAI_DIR ${GEOS_TPL_DIR}/chai CACHE PATH "" FORCE)
endif()
#
# IO TPLs
#
if(EXISTS ${GEOS_TPL_DIR}/hdf5)
set(HDF5_DIR ${GEOS_TPL_DIR}/hdf5 CACHE PATH "" FORCE)
message(STATUS "HDF5_DIR = ${HDF5_DIR}")
endif()
if(EXISTS ${GEOS_TPL_DIR}/conduit)
set(CONDUIT_DIR ${GEOS_TPL_DIR}/conduit CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/silo)
set(SILO_DIR ${GEOS_TPL_DIR}/silo CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/adiak)
set(ADIAK_DIR ${GEOS_TPL_DIR}/adiak CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/caliper)
set(CALIPER_DIR ${GEOS_TPL_DIR}/caliper CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/pugixml)
set(PUGIXML_DIR ${GEOS_TPL_DIR}/pugixml CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/vtk)
set(VTK_DIR ${GEOS_TPL_DIR}/vtk CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/fmt)
# set(FMT_DIR ${GEOS_TPL_DIR}/fmt CACHE PATH "" FORCE)
set(FMT_DIR ${GEOS_TPL_DIR}/chai CACHE PATH "" FORCE)
endif()
#
# Math TPLs
#
if(EXISTS ${GEOS_TPL_DIR}/metis)
set(METIS_DIR ${GEOS_TPL_DIR}/metis CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/parmetis)
set(PARMETIS_DIR ${GEOS_TPL_DIR}/parmetis CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/superlu_dist)
set(SUPERLU_DIST_DIR ${GEOS_TPL_DIR}/superlu_dist CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/suitesparse)
set(SUITESPARSE_DIR ${GEOS_TPL_DIR}/suitesparse CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/trilinos)
set(TRILINOS_DIR ${GEOS_TPL_DIR}/trilinos CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/hypre)
set(HYPRE_DIR ${GEOS_TPL_DIR}/hypre CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/scotch)
set(SCOTCH_DIR ${GEOS_TPL_DIR}/scotch CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/petsc AND (NOT DEFINED ENABLE_PETSC OR ENABLE_PETSC))
set(PETSC_DIR ${GEOS_TPL_DIR}/petsc CACHE PATH "" FORCE)
endif()
#
# Development tools
#
if(EXISTS ${GEOS_TPL_DIR}/uncrustify/bin/uncrustify)
set(UNCRUSTIFY_EXECUTABLE ${GEOS_TPL_DIR}/uncrustify/bin/uncrustify CACHE PATH "" FORCE)
endif()
if(EXISTS ${GEOS_TPL_DIR}/doxygen/bin/doxygen)
set(DOXYGEN_EXECUTABLE ${GEOS_TPL_DIR}/doxygen/bin/doxygen CACHE PATH "" FORCE)
endif()
#
# Other
#
if(EXISTS ${GEOS_TPL_DIR}/mathpresso)
set(MATHPRESSO_DIR ${GEOS_TPL_DIR}/mathpresso CACHE PATH "" FORCE)
endif()
Build script
#!/bin/bash
# the last build setp requires connecting to a GPU, so I recommend running this on a GPU node interactively
source /projects/shared_conda_envs/setup_env.sh
download_source=$1
conda env remove -n geos_env -y
conda create -n geos_env -c conda-forge numpy python=3.11 -y
conda activate geos_env
module load compilers/gcc/12.2.1
module load libs/cuda/12.9.1
module load mpi/openmpi/gcc-cuda12/4.1.8
# go to GEOS project folder
build_target="dev-gcc-ompi-cuda"
env > env_geos_build.out
mkdir -p /sw/apps/GEOS-project/${build_target}-git
cp ${build_target}.cmake /sw/apps/GEOS-project/${build_target}-git/
cp tpls.cmake /sw/apps/GEOS-project/${build_target}-git/
cd /sw/apps/GEOS-project/${build_target}-git
if [[ "$download_source" == 1 ]]; then
rm -rf thirdPartyLibs
git clone https://github.com/GEOS-DEV/thirdPartyLibs.git
cd thirdPartyLibs
git submodule init
git submodule update
else
cd thirdPartyLibs
fi
if [[ "$download_source" == 1 ]]; then
# grab all the tpls manually from github using wget
cd tplMirror
rm *
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/Caliper-2.12.0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/RAJA-v2025.03.0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/SuiteSparse-5.10.1.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/VTK-9.4.2.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/adiak-0.4.0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/asmjit-2e93826.zip
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/chai-2025.03.0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/conduit-0.9.2.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/doxygen-1.8.20.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/fmt-11.0.1.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/hdf5-1.12.1.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/hypre-v2.33.0-12-gbe52325a3.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/mathpresso-24d60e5.zip
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/parmetis-4.0.3-IDXTYPEWIDTH_64.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/petsc-3.19.4.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/pugixml-1.13.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/scotch-v7.0.8.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/silo-4.11.1-bsd.tar.xz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/silo-4.11.patch
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/superlu_dist-0f6efc3.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/superlu_dist-9.0.0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/trilinos-release-16-1-0.tar.gz
wget https://github.com/GEOS-DEV/thirdPartyLibs/raw/refs/heads/master/tplMirror/uncrustify-401a409.zip
fi
cd /sw/apps/GEOS-project/${build_target}-git/thirdPartyLibs
python scripts/config-build.py -hc /sw/apps/GEOS-project/${build_target}-git/${build_target}.cmake
cd build-${build_target}-debug
make -j6
export GEOS_TPL_DIR=/sw/apps/GEOS-project/dev-gcc-ompi-cuda-git/thirdPartyLibs/install-dev-gcc-ompi-cuda-debug
export GEOSX_TPL_DIR=$GEOS_TPL_DIR
cd /sw/apps/GEOS-project/${build_target}-git
if [[ "$download_source" == 1 ]]; then
rm -rf GEOS
git clone https://github.com/GEOS-DEV/GEOS.git
cd GEOS
git submodule init
git submodule update
else
cd GEOS
fi
python scripts/config-build.py -hc /sw/apps/GEOS-project/dev-gcc-ompi-cuda/${build_target}.cmake
cd build-${build_target}-debug
make -j6
With OpenMPI build with PMI
Leads to the following error at the end of build
[g003:3126332] OPAL ERROR: Unreachable in file pmix3x_client.c at line 111
--------------------------------------------------------------------------
The application appears to have been direct launched using "srun",
but OMPI was not built with SLURM's PMI support and therefore cannot
execute. There are several options for building PMI support under
SLURM, depending upon the SLURM version you are using:
version 16.05 or later: you can use SLURM's PMIx support. This
requires that you configure and build SLURM --with-pmix.
Versions earlier than 16.05: you must use either SLURM's PMI-1 or
PMI-2 support. SLURM builds PMI-1 by default, or you can manually
install PMI-2. You must then build Open MPI using --with-pmi pointing
to the SLURM PMI library location.
Please configure as appropriate and try again.
--------------------------------------------------------------------------
*** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
*** and potentially your MPI job)
[g003:3126332] Local abort before MPI_INIT completed completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!
make[2]: *** [CMakeFiles/geosx_generate_install_schema.dir/build.make:71: CMakeFiles/geosx_generate_install_schema] Error 1
make[1]: *** [CMakeFiles/Makefile2:2426: CMakeFiles/geosx_generate_install_schema.dir/all] Error 2
make[1]: *** Waiting for unfinished jobs....
[100%] Linking CXX executable ../../../tests/testWavePropagationAttenuationAcousticVTI
[100%] Built target testWavePropagationAttenuationAcousticVTI
[100%] Linking CXX executable ../../../tests/testWavePropagationAcousticFirstOrder
[100%] Linking CXX executable ../../../tests/testWavePropagationAdjoint1
[100%] Built target testWavePropagationAcousticFirstOrder
[100%] Built target testWavePropagationAdjoint1
[100%] Linking CXX shared library ../lib/pygeosx.so
[100%] Built target pygeosx
make: *** [Makefile:146: all] Error 2
Running ctest with this build, notable errors:
197: testFlowStatistics: memory.c:99: void hypre_CheckMemoryLocation(void*, hypre_MemoryLocation): Assertion `location == location_ptr' failed.
And the final summary
92% tests passed, 20 tests failed out of 265
Total Test time (real) = 627.93 sec
The following tests FAILED:
4 - blt_mpi_smoke (SEGFAULT)
9 - blt_cuda_mpi_smoke (SEGFAULT)
16 - testArrayOfSets (Subprocess aborted)
85 - testPyCRSMatrix (SEGFAULT)
107 - testMpiWrapper_mpi (SEGFAULT)
183 - testGraphColoringMPI_mpi (SEGFAULT)
188 - testMatrices (SEGFAULT)
189 - testVectors (SEGFAULT)
190 - testExternalSolvers (SEGFAULT)
191 - testKrylovSolvers (SEGFAULT)
192 - testReverseCutHillMcKeeOrdering (SEGFAULT)
193 - testLAIHelperFunctions (SEGFAULT)
195 - testHDFParallelFile (SEGFAULT)
197 - testFlowStatistics (Failed)
205 - testDofManager (SEGFAULT)
233 - testNeighborCommunicator_mpi (SEGFAULT)
234 - testVTKImport_mpi (SEGFAULT)
235 - testSolverStats (Failed)
238 - testTransmissibility (Failed)
240 - testSinglePhaseMFDPolyhedral (Failed)
Errors while running CTest
Output from these tests are in: /wendianSoftware/x86_64/sw/apps/GEOS-project/dev-gcc-ompi-cuda-git/GEOS/build-dev-gcc-ompi-cuda-debug/Testing/Temporary/LastTest.log
Use "--rerun-failed --output-on-failure" to re-run the failed cases verbosely.
Job has finished!
Platform:
- Machine: On-premise HPC cluster called "Wendian" System specs here: https://rc-docs.mines.edu/pages/systems.html#wendian, x86-based with Intel CPUs + NVIDIA V100 GPU's
- Compiler: GCC 12.2.1, Cuda 12.9.1, OpenMPI 4.18, Python 3.11 (conda environment base with numpy installed)
- GEOS Version: Latest dev branch (commit
889ea4e)
Metadata
Metadata
Assignees
Labels
type: bugSomething isn't workingSomething isn't workingtype: newA new issue has been created and requires attentionA new issue has been created and requires attention