Skip to content

Commit 6f7dd88

Browse files
authored
Revert obcount runtime config on branch-25.10 (#3034)
1 parent 7ad0c45 commit 6f7dd88

File tree

9 files changed

+48
-63
lines changed

9 files changed

+48
-63
lines changed

.github/workflows/gh-gasnet.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
name: Build Gasnet wrapper
1717
strategy:
1818
fail-fast: false
19-
runs-on: linux-amd64-gpu-l4-latest-1
19+
runs-on: linux-amd64-cpu4
2020
timeout-minutes: 15
2121
container:
2222
options: -u root

.github/workflows/gh-mpi.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
name: Build MPI wrapper
1717
strategy:
1818
fail-fast: false
19-
runs-on: linux-amd64-gpu-l4-latest-1
19+
runs-on: linux-amd64-cpu4
2020
timeout-minutes: 15
2121
container:
2222
options: -u root

conda/gasnet_wrapper/build-gex-wrapper.sh

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ set -eo pipefail
88
readonly DEFAULT_CONDUIT="ofi"
99
readonly DEFAULT_SYSTEM_CONFIG="slingshot11"
1010
readonly DEFAULT_CUDA="ON"
11+
# Threading suffix used by GASNet archives (libgasnet-<conduit>-<thread>.a)
12+
readonly DEFAULT_THREADING="par"
13+
# Pinned GASNet commit (matches what we validated on Perlmutter)
14+
readonly GASNET_GITREF_SHA="e0af36ac9d3d632824be1851bcb3bc23bf05e489"
1115

1216
# Determine script directory dynamically
1317
readonly SCRIPT_DIR="${CONDA_PREFIX}/gex-wrapper"
@@ -16,22 +20,25 @@ readonly SCRIPT_DIR="${CONDA_PREFIX}/gex-wrapper"
1620
conduit="${DEFAULT_CONDUIT}"
1721
system_config="${DEFAULT_SYSTEM_CONFIG}"
1822
cuda="${DEFAULT_CUDA}"
23+
threading="${DEFAULT_THREADING}"
24+
extra_linker_flags=""
1925

2026
# Help function to display usage
2127
gex_wrapper_help() {
22-
echo "Usage: build-gex-wrapper [-h | --help] [-c conduit | --conduit conduit] [-s system_config | --system_config system_config] [-u ON/OFF | --use-cuda ON/OFF]"
28+
echo "Usage: build-gex-wrapper [-h | --help] [-c conduit | --conduit conduit] [-s system_config | --system_config system_config] [-u ON/OFF | --use-cuda ON/OFF] [-f flags | --linker-flags \"<flags>\"]"
2329
echo "Build the Realm GASNet-EX wrapper in your conda environment."
2430
echo
2531
echo "Options:"
2632
echo " -h, --help Display this help and exit"
27-
echo " -c, --conduit CONDUIT Specify the GASNet conduit to use (default '${DEFAULT_CONDUIT}')"
28-
echo " -s, --system_config SYS Specify the system-specific configuration (default '${DEFAULT_SYSTEM_CONFIG}')"
33+
echo " -c, --conduit CONDUIT GASNet conduit to use (default '${DEFAULT_CONDUIT}')"
34+
echo " -s, --system_config SYS System-specific configuration (default '${DEFAULT_SYSTEM_CONFIG}')"
2935
echo " -u, --use-cuda ON/OFF Enable (ON) or disable (OFF) CUDA (default '${DEFAULT_CUDA}')"
36+
echo " -f, --linker-flags STR Extra linker flags to append (default '-lhugetlbfs' when conduit='ofi' and system='slingshot11')"
3037
echo
3138
}
3239

3340
# Parse command-line options (supporting both single-dash and double-dash)
34-
ARGS=$(getopt -o hc:s:u: -l help,conduit:,system_config:,use-cuda: -- "$@") || {
41+
ARGS=$(getopt -o hc:s:u:f: -l help,conduit:,system_config:,use-cuda:,linker-flags: -- "$@") || {
3542
gex_wrapper_help
3643
exit 1
3744
}
@@ -59,6 +66,10 @@ while true; do
5966
fi
6067
shift 2
6168
;;
69+
-f | --linker-flags)
70+
extra_linker_flags="$2"
71+
shift 2
72+
;;
6273
--)
6374
shift
6475
break
@@ -71,11 +82,16 @@ while true; do
7182
esac
7283
done
7384

85+
# Default linker flags for Perlmutter OFI/slingshot11, unless overridden
86+
if [[ -z "${extra_linker_flags}" && "${conduit}" == "ofi" && "${system_config}" == "slingshot11" ]]; then
87+
extra_linker_flags="-lhugetlbfs"
88+
fi
89+
7490
# Ensure CONDA_PREFIX is set
7591
if [[ -z "${CONDA_PREFIX}" ]]; then
7692
echo "Error: Please activate a conda environment before running this script."
7793
echo "Run:"
78-
echo " \$ conda activate <your-env-name>"
94+
echo " $ conda activate <your-env-name>"
7995
echo "Then re-run this script."
8096
exit 1
8197
fi
@@ -84,7 +100,7 @@ fi
84100
if ! command -v cmake &>/dev/null; then
85101
echo "Error: cmake is not installed or not in PATH."
86102
echo "Please install it via your package manager or conda:"
87-
echo " \$ conda install -c conda-forge cmake"
103+
echo " $ conda install -c conda-forge cmake"
88104
exit 1
89105
fi
90106

@@ -110,19 +126,35 @@ CMAKE_ARGS=(
110126
-DGASNet_CONDUIT="${conduit}"
111127
-DGASNet_SYSTEM="${system_config}"
112128
-DGEX_WRAPPER_BUILD_SHARED=ON
129+
-DGASNet_GITREF="${GASNET_GITREF_SHA}"
113130
)
114131

115132
if [[ "${cuda}" == "ON" ]]; then
116133
CMAKE_ARGS+=(-DGASNet_CONFIGURE_ARGS="--enable-kind-cuda-uva")
117134
fi
118135

136+
# Whole-archive embed of the conduit archive into the wrapper DSO.
137+
# Note: libgasnet-<conduit>-par.a already contains gasnet_tools, so do NOT also
138+
# link libgasnet_tools-par.a to avoid duplicate symbols.
139+
GASNET_LIBDIR_EMBED="${SCRIPT_DIR}/src/build/embed-gasnet/install/lib"
140+
MAIN_A="${GASNET_LIBDIR_EMBED}/libgasnet-${conduit}-${threading}.a"
141+
LINK_FLAGS=("-Wl,--whole-archive,${MAIN_A},-no-whole-archive")
142+
143+
if [[ -n "${extra_linker_flags}" ]]; then
144+
read -r -a extra_linker_flags_array <<< "${extra_linker_flags}"
145+
LINK_FLAGS+=("${extra_linker_flags_array[@]}")
146+
fi
147+
148+
CMAKE_ARGS+=(-DCMAKE_SHARED_LINKER_FLAGS="${LINK_FLAGS[*]}")
149+
119150
cmake "${CMAKE_ARGS[@]}" ..
120151
cmake --build .
121152
cmake --install .
122153

123154
echo
124155
echo "Reactivate the conda environment to set necessary environment variables:"
125156
echo
126-
echo " \$ conda deactivate"
127157
# shellcheck disable=SC2154
128-
echo " \$ conda activate ${CONDA_DEFAULT_ENV}"
158+
echo " $ conda deactivate"
159+
# shellcheck disable=SC2154
160+
echo " $ conda activate ${CONDA_DEFAULT_ENV}"

conda/gasnet_wrapper/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ source:
1919

2020
build:
2121
include_recipe: false
22-
number: 18
22+
number: 22
2323
skip: true # [not linux]
2424
noarch: generic
2525
script_env:

conda/mpi_wrapper/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ source:
1515

1616
build:
1717
include_recipe: false
18-
number: 19
18+
number: 20
1919
skip: true # [not linux]
2020
noarch: generic
2121
script_env:

share/legate/mpi_wrapper/install.bash

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@ cmake_install_args=(--install "${script_dir}/build")
3838

3939
if [[ "${prefix}" != "" ]]; then
4040
cmake_configure_args+=("-DCMAKE_INSTALL_PREFIX=${prefix}")
41-
# Export the same value as all 3
4241
export CMAKE_INSTALL_PREFIX="${prefix}"
43-
export DESTDIR="${prefix}"
4442
export PREFIX="${prefix}"
4543
fi
4644

src/cpp/legate/runtime/detail/argument_parsing/util.cc

Lines changed: 4 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
#include <legate/utilities/assert.h>
1212
#include <legate/utilities/detail/env.h>
13-
#include <legate/utilities/detail/string_utils.h>
1413
#include <legate/utilities/macros.h>
1514

1615
#include <kvikio/shim/cufile.hpp>
@@ -26,32 +25,17 @@
2625

2726
namespace legate::detail {
2827

29-
unsigned int num_ranks()
28+
bool multi_node_job()
3029
{
3130
constexpr EnvironmentVariable<std::uint32_t> OMPI_COMM_WORLD_SIZE{"OMPI_COMM_WORLD_SIZE"};
3231
constexpr EnvironmentVariable<std::uint32_t> MV2_COMM_WORLD_SIZE{"MV2_COMM_WORLD_SIZE"};
3332
constexpr EnvironmentVariable<std::uint32_t> SLURM_NTASKS{"SLURM_NTASKS"};
3433

35-
const auto ompi_comm_world_size = OMPI_COMM_WORLD_SIZE.get().value_or(1);
36-
if (ompi_comm_world_size > 1) {
37-
return ompi_comm_world_size;
38-
}
39-
40-
const auto mv2_comm_world_size = MV2_COMM_WORLD_SIZE.get().value_or(1);
41-
if (mv2_comm_world_size > 1) {
42-
return mv2_comm_world_size;
43-
}
44-
45-
const auto slurm_ntasks = SLURM_NTASKS.get().value_or(1);
46-
if (slurm_ntasks > 1) {
47-
return slurm_ntasks;
48-
}
49-
50-
return 1;
34+
return OMPI_COMM_WORLD_SIZE.get().value_or(1) > 1 || //
35+
MV2_COMM_WORLD_SIZE.get().value_or(1) > 1 || //
36+
SLURM_NTASKS.get().value_or(1) > 1;
5137
}
5238

53-
bool multi_node_job() { return num_ranks() > 1; }
54-
5539
std::vector<std::string> deduplicate_command_line_flags(Span<const std::string> args)
5640
{
5741
// A dummy name that is used only in case the first arguments are positional. Currently

src/cpp/legate/runtime/detail/argument_parsing/util.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,6 @@
1313

1414
namespace legate::detail {
1515

16-
/**
17-
* @brief Determine the number of ranks in use based on environment variables
18-
* OMPI_COMM_WORLD_SIZE, MV2_COMM_WORLD_SIZE, and SLURM_NTASKS.
19-
*
20-
* @return the number of ranks in use
21-
*/
22-
[[nodiscard]] std::uint32_t num_ranks();
23-
2416
/**
2517
* @return `true` when Legate is being invoked as a multi-node job, `false` otherwise.
2618
*/

src/cpp/legate/runtime/detail/runtime.cc

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@
6161
#include <legate/utilities/detail/env_defaults.h>
6262
#include <legate/utilities/detail/formatters.h>
6363
#include <legate/utilities/detail/linearize.h>
64-
#include <legate/utilities/detail/string_utils.h>
6564
#include <legate/utilities/detail/traced_exception.h>
6665
#include <legate/utilities/detail/tuple.h>
6766
#include <legate/utilities/hash.h>
@@ -1731,7 +1730,6 @@ void handle_realm_default_args(bool need_network_init)
17311730
"Legate was run on multiple nodes but was not built with networking support. Please "
17321731
"install Legate again with networking support (e.g. configured \"--with-ucx\")"};
17331732
}
1734-
17351733
// We have to pass an explicit `-ll:networks` flag, otherwise Realm will silently continue with
17361734
// single-node execution if network initialization fails. Therefore, even though Realm's default
17371735
// priority list for network modules is good enough for us, we have to duplicate it here.
@@ -1740,25 +1738,6 @@ void handle_realm_default_args(bool need_network_init)
17401738
#endif
17411739
#ifdef REALM_USE_GASNETEX
17421740
ss << " -ll:networks gasnetex";
1743-
1744-
// Need to set -gex:obcount appropriately, see
1745-
// https://github.com/StanfordLegion/realm/issues/239
1746-
const auto num_gpus = Runtime::get_runtime().local_machine().total_gpu_count();
1747-
auto ranks = num_ranks();
1748-
1749-
if (ranks == 1) {
1750-
constexpr EnvironmentVariable<std::string> WORKER_PEERS_INFO{"WORKER_PEERS_INFO"};
1751-
const auto realm_ucp_bootstrap_mode = REALM_UCP_BOOTSTRAP_MODE.get();
1752-
1753-
if (realm_ucp_bootstrap_mode == "p2p") {
1754-
if (const auto workers_peer_info = WORKER_PEERS_INFO.get(); workers_peer_info.has_value()) {
1755-
ranks = string_split(*workers_peer_info).size();
1756-
}
1757-
}
1758-
}
1759-
const auto obcount = 4 * ranks + 2 * num_gpus;
1760-
1761-
ss << " -gex:obcount " << obcount;
17621741
#endif
17631742
#ifdef REALM_USE_GASNET1
17641743
ss << " -ll:networks gasnet1";

0 commit comments

Comments
 (0)