From 320f7066196cafac6193743b862283a5b66a7d9c Mon Sep 17 00:00:00 2001 From: Vasanth Sabavat Date: Wed, 31 Dec 2025 02:52:10 +0000 Subject: [PATCH 1/2] cache rootfs of containers --- Makefile | 4 + PLAN_CONTAINER_CACHE.md | 168 +++++++++++ args.c | 33 +++ args.h | 1 + config.c | 27 ++ config.h | 3 + pyxis_slurmstepd.c | 589 ++++++++++++++++++++++++++++++++++++- tests/container_cache.bats | 386 ++++++++++++++++++++++++ 8 files changed, 1203 insertions(+), 8 deletions(-) create mode 100644 PLAN_CONTAINER_CACHE.md create mode 100644 tests/container_cache.bats diff --git a/Makefile b/Makefile index cea4a27..6e232d5 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,10 @@ CPPFLAGS := -D_GNU_SOURCE -D_FORTIFY_SOURCE=2 -DPYXIS_VERSION=\"$(PYXIS_VER)\" $ CFLAGS := -std=gnu11 -O2 -g -Wall -Wunused-variable -fstack-protector-strong -fpic $(CFLAGS) LDFLAGS := -Wl,-znoexecstack -Wl,-zrelro -Wl,-znow $(LDFLAGS) +ifneq ($(strip $(SLURM_ROOT)),) +CPPFLAGS += -I$(SLURM_ROOT)/include +endif + C_SRCS := common.c args.c pyxis_slurmstepd.c pyxis_slurmd.c pyxis_srun.c pyxis_alloc.c pyxis_dispatch.c config.c enroot.c importer.c C_OBJS := $(C_SRCS:.c=.o) diff --git a/PLAN_CONTAINER_CACHE.md b/PLAN_CONTAINER_CACHE.md new file mode 100644 index 0000000..80061d3 --- /dev/null +++ b/PLAN_CONTAINER_CACHE.md @@ -0,0 +1,168 @@ +# Plan: `--container-cache` (Module 3A) — Persistent RootFS Reuse with Pyxis + Enroot + +This file is a working plan/spec for implementing and validating **Module 3A** in Pyxis: + +- New user-facing flag: `srun --container-cache` +- Goal: reuse the **unpacked** Enroot rootfs across jobs on the **same node** to achieve near-1s warm starts +- Constraints: + - Must work with the existing cluster cleanup behavior (Epilog deletes `pyxis_${SLURM_JOB_ID}*`) + - **Disallow** `--container-writable` and `--container-save` in cache mode (to avoid unsafe cross-job state) + - Must not require users to encode digest/version into `--container-name` + +--- + +## Background / Why this exists + +On our cluster: +- Cold start (`--container-name` first use) costs ~14–18s/node +- Warm start can be ~1s **only if** the unpacked rootfs persists +- Today rootfs does **not** persist across jobs because: + - job-scoped naming (`pyxis_${JOBID}_...`) and + - `/etc/slurm/epilog.d/70-enroot-container-cleanup.sh` removing `pyxis_${SLURM_JOB_ID}*` + +So we need a **Pyxis-native** mechanism to: +1) generate stable cache identities and +2) ensure the resulting rootfs directories do not match job-scoped cleanup patterns. + +--- + +## Design summary + +### User interface + +- **Flag:** `--container-cache` + - Also available via env: `PYXIS_CONTAINER_CACHE=1` +- When set: + - Requires `--container-image` + - Rejects `--container-writable` + - Rejects `--container-save` + - Forces read-only rootfs (`ENROOT_ROOTFS_WRITABLE=n`) + +### How caching works + +1) **Stable cache key** is derived from the image identity: + - For `.sqsh` paths: `abs_path + mtime + size` (fast; avoids hashing huge files) + - For non-path images: hash the image string (future improvement: OCI digest) + +2) **Stable container name** is auto-generated from the cache key: + - Example prefix: `pyxis_cache__` + - **Important:** In cache mode, naming must be **non-job-scoped** (no jobid prefix). + +3) **Reuse behavior** + - If the container already exists: Pyxis reuses the filesystem and skips `enroot create` + - Otherwise: Pyxis creates it once (cold path), then it persists + +### Cache directory layout + +- **Required config for cache mode** (plugstack arg): + - `container_cache_data_path=/raid/containers/data` +- **Per-user cache directory**: + - Pyxis creates/uses `/` with mode `0700` and ownership `:` + - For cache mode, Pyxis sets `ENROOT_DATA_PATH=/` for all Enroot calls +- **Cached rootfs directory name**: + - `pyxis_cache_u_` + - Full path: `//pyxis_cache_u_` +- **Locking + "last used" signal**: + - Pyxis creates `/.pyxis_cache_lock` + - jobs hold a **shared** lock for the job lifetime + - GC tries an **exclusive non-blocking** lock; if it can’t lock, the entry is treated as in-use + - Each time a cached rootfs is used, Pyxis `touch`es the rootfs directory to update its `mtime` + - GC uses this `mtime` as the LRU timestamp (no reliance on filesystem `atime`) + +### GC / LRU eviction (global across users) + +Goal: prevent jobs failing due to the cache filesystem being full. + +- **When GC runs**: + - Opportunistic, at job start in cache mode **only if** we’re about to create a new cached rootfs (cold path) + - Reuse path should not trigger GC +- **Watermarks** (admin-configurable): + - `container_cache_gc_high=85` (default): start evicting when used% is \(\ge\) high + - `container_cache_gc_low=80` (default): stop evicting when used% drops below low +- **Global serialization**: + - GC takes an exclusive lock on `/pyxis-container-cache-gc.lock` so multiple jobs don’t evict concurrently +- **Candidate selection (LRU)**: + - Scan all user dirs: `/*/pyxis_cache_*` + - Sort candidates by directory `mtime` (oldest first) +- **Eviction loop**: + - For each candidate, try to acquire an **exclusive non-blocking** lock on `/.pyxis_cache_lock` + - if locked: recursively delete the candidate directory + - if not lockable: skip (in-use) + - Re-check used% and stop once below `container_cache_gc_low` +- **Cross-user behavior**: + - Because `/` is `0700`, GC must run in a privileged SPANK hook so it can traverse/evict other users’ entries. + +--- + +## Code changes (high-level) + +### 1) Add new flag and env var plumbing +- `args.h`: add `int container_cache;` +- `args.c`: + - register `--container-cache` + - parse `PYXIS_CONTAINER_CACHE` + +### 2) Implement cache mode in `pyxis_slurmstepd.c` +- During `slurm_spank_user_init`: + - validate incompatible flags (`--container-writable`, `--container-save`) + - compute stable name + - force `container_scope=global` for cache mode + - derive a per-user cache directory from `container_cache_data_path` (`/`) + +- During Enroot execution (`enroot_set_env`): + - set `ENROOT_DATA_PATH` for cache mode (`/`) + +- During container create: + - run GC if needed (global `/raid` thresholds) + - touch/lock cached entries to prevent eviction while in-use + +### 3) Config knobs +Extend `config.[ch]` to parse: +- `container_cache_data_path=...` +- `container_cache_gc_high=...` +- `container_cache_gc_low=...` + +--- + +## Testing plan (cluster) + +### Automated (BATS) +- Ensure Slurm env is set (cluster-specific; adjust paths as needed): + +```bash +export SLURM_ROOT=/cm/local/apps/slurm/24.11 +export PATH="$SLURM_ROOT/bin:$SLURM_ROOT/sbin:$PATH" +export LD_LIBRARY_PATH="$SLURM_ROOT/lib64:${LD_LIBRARY_PATH:-}" +export SLURM_CONF=/etc/slurm/slurm.conf +``` + +- Run: `bats tests/container_cache.bats` + - Covers: policy enforcement, stable naming/layout under `/`, read-only enforcement, env var enablement, and GC behavior (including cross-user eviction) when usage is above the configured high watermark. + - If needed: `PYXIS_TEST_SQSH_IMAGE=/path/to/image.sqsh bats tests/container_cache.bats` + +### Functional correctness (manual) +1) Cold create: + - `srun --container-cache --container-image= ...` + - expect: rootfs directory `//pyxis_cache_u_` is created +2) Warm reuse (separate job, same node): + - run the same command on the same node + - expect: the same cached rootfs is reused (near-1s startup) + +### Cleanup compatibility +- Verify cached directories are **non-job-scoped** (e.g. `pyxis_cache_u_*`) and do **not** match Epilog `pyxis_${JOBID}*` patterns. + +### GC/LRU (manual) +- Trigger GC by filling the cache filesystem above `container_cache_gc_high`, then create new cached rootfs entries. +- Confirm oldest caches are evicted first and locked/in-use caches are not evicted. + +--- + +## Open items / future improvements + +- Use image digest for OCI images (instead of hashing only the image string) +- More robust last-used tracking (avoid relying on directory `mtime`) +- Expand test coverage for concurrency/stress scenarios on a multi-node cluster + +--- + + diff --git a/args.c b/args.c index d4d5800..a3cc799 100644 --- a/args.c +++ b/args.c @@ -17,6 +17,7 @@ static struct plugin_args pyxis_args = { .workdir = NULL, .container_name = NULL, .container_name_flags = NULL, + .container_cache = -1, .container_save = NULL, .mount_home = -1, .remap_root = -1, @@ -31,6 +32,7 @@ static int spank_option_image(int val, const char *optarg, int remote); static int spank_option_mount(int val, const char *optarg, int remote); static int spank_option_workdir(int val, const char *optarg, int remote); static int spank_option_container_name(int val, const char *optarg, int remote); +static int spank_option_container_cache(int val, const char *optarg, int remote); static int spank_option_container_save(int val, const char *optarg, int remote); static int spank_option_container_mount_home(int val, const char *optarg, int remote); static int spank_option_container_remap_root(int val, const char *optarg, int remote); @@ -69,6 +71,15 @@ struct spank_option spank_opts[] = "If a container with this name already exists, the existing container is used and the import is skipped.", 1, 0, spank_option_container_name }, + { + "container-cache", + NULL, + "[pyxis] enable persistent container root filesystem caching. " + "When set, pyxis derives a stable container name from the image identity and attempts to reuse " + "the container filesystem across jobs on the same node. " + "Incompatible with --container-writable and --container-save.", + 0, 1, spank_option_container_cache + }, { "container-save", "PATH", @@ -185,6 +196,13 @@ void pyxis_args_check_environment_variables(spank_t sp) if (env_val != NULL && pyxis_args.container_name == NULL) spank_option_container_name(0, env_val, 0); + env_val = get_env_var(sp, "PYXIS_CONTAINER_CACHE", buf, sizeof(buf)); + if (env_val != NULL && pyxis_args.container_cache == -1) { + ret = parse_bool(env_val); + if (ret >= 0) + spank_option_container_cache(ret, NULL, 0); + } + env_val = get_env_var(sp, "PYXIS_CONTAINER_SAVE", buf, sizeof(buf)); if (env_val != NULL && pyxis_args.container_save == NULL) spank_option_container_save(0, env_val, 0); @@ -490,6 +508,19 @@ static int spank_option_container_name(int val, const char *optarg, int remote) return (rv); } +static int spank_option_container_cache(int val, const char *optarg, int remote) +{ + (void)optarg; + (void)remote; + + /* Slurm may call us multiple times with the same value. */ + if (pyxis_args.container_cache == val) + return (0); + + pyxis_args.container_cache = val; + return (0); +} + static int spank_option_container_save(int val, const char *optarg, int remote) { if (optarg == NULL || *optarg == '\0') { @@ -630,6 +661,8 @@ struct plugin_args *pyxis_args_register(spank_t sp) bool pyxis_args_enabled(void) { if (pyxis_args.image == NULL && pyxis_args.container_name == NULL) { + if (pyxis_args.container_cache == 1) + return (true); if (pyxis_args.mounts_len > 0) slurm_error("pyxis: ignoring --container-mounts because neither --container-image nor --container-name is set"); if (pyxis_args.workdir != NULL) diff --git a/args.h b/args.h index b55da3f..c5976dc 100644 --- a/args.h +++ b/args.h @@ -17,6 +17,7 @@ struct plugin_args { char *workdir; char *container_name; char *container_name_flags; + int container_cache; char *container_save; int mount_home; int remap_root; diff --git a/config.c b/config.c index 02be6fc..4066be3 100644 --- a/config.c +++ b/config.c @@ -3,6 +3,7 @@ */ #include +#include #include #include @@ -24,6 +25,8 @@ int pyxis_config_parse(struct plugin_config *config, int ac, char **av) { int ret; const char *optarg; + const char *cache_data_prefix = "container_cache_data_path="; + const size_t cache_data_prefix_len = sizeof("container_cache_data_path=") - 1; memset(config, 0, sizeof(*config)); @@ -38,6 +41,9 @@ int pyxis_config_parse(struct plugin_config *config, int ac, char **av) config->sbatch_support = true; config->use_enroot_load = false; config->importer_path[0] = '\0'; + config->container_cache_data_path[0] = '\0'; + config->container_cache_gc_high = 85; + config->container_cache_gc_low = 80; for (int i = 0; i < ac; ++i) { if (strncmp("runtime_path=", av[i], 13) == 0) { @@ -88,11 +94,32 @@ int pyxis_config_parse(struct plugin_config *config, int ac, char **av) slurm_error("pyxis: importer: path too long: %s", optarg); return (-1); } + } else if (strncmp(cache_data_prefix, av[i], cache_data_prefix_len) == 0) { + optarg = av[i] + cache_data_prefix_len; + ret = snprintf(config->container_cache_data_path, sizeof(config->container_cache_data_path), "%s", optarg); + if (ret < 0 || ret >= (int)sizeof(config->container_cache_data_path)) { + slurm_error("pyxis: container_cache_data_path: path too long: %s", optarg); + return (-1); + } + } else if (strncmp("container_cache_gc_high=", av[i], 24) == 0) { + optarg = av[i] + 24; + config->container_cache_gc_high = atoi(optarg); + } else if (strncmp("container_cache_gc_low=", av[i], 23) == 0) { + optarg = av[i] + 23; + config->container_cache_gc_low = atoi(optarg); } else { slurm_error("pyxis: unknown configuration option: %s", av[i]); return (-1); } } + if (config->container_cache_gc_high < 1 || config->container_cache_gc_high > 99 || + config->container_cache_gc_low < 1 || config->container_cache_gc_low > 99 || + config->container_cache_gc_low >= config->container_cache_gc_high) { + slurm_error("pyxis: invalid container cache GC configuration: high=%d low=%d", + config->container_cache_gc_high, config->container_cache_gc_low); + return (-1); + } + return (0); } diff --git a/config.h b/config.h index e2165df..3fdb36f 100644 --- a/config.h +++ b/config.h @@ -20,6 +20,9 @@ struct plugin_config { bool sbatch_support; bool use_enroot_load; char importer_path[PATH_MAX]; + char container_cache_data_path[PATH_MAX]; + int container_cache_gc_high; + int container_cache_gc_low; }; int pyxis_config_parse(struct plugin_config *config, int ac, char **av); diff --git a/pyxis_slurmstepd.c b/pyxis_slurmstepd.c index b128ec7..98a1bb4 100644 --- a/pyxis_slurmstepd.c +++ b/pyxis_slurmstepd.c @@ -4,14 +4,18 @@ #include +#include #include #include +#include #include #include #include #include #include +#include +#include #include #include #include @@ -34,6 +38,40 @@ #include "enroot.h" #include "importer.h" +static bool pyxis_debug = false; + +#define PYXIS_DEBUG_LOG(fmt, ...) do { \ + if (pyxis_debug) \ + slurm_spank_log("pyxis: debug: " fmt, ##__VA_ARGS__); \ +} while (0) + +static bool env_bool(spank_t sp, const char *name, bool def) +{ + char buf[64]; + const char *val = NULL; + + if (sp != NULL && spank_context() == S_CTX_REMOTE) { + spank_err_t rc = spank_getenv(sp, name, buf, sizeof(buf)); + if (rc == ESPANK_SUCCESS) + val = buf; + } + if (val == NULL) + val = getenv(name); + if (val == NULL) + return def; + + int ret = parse_bool(val); + if (ret >= 0) + return ret != 0; + + char *end = NULL; + long n = strtol(val, &end, 10); + if (end != NULL && end != val && *end == '\0') + return n != 0; + + return def; +} + struct container { char *name; char *squashfs_path; @@ -41,6 +79,10 @@ struct container { bool reuse_rootfs; bool reuse_ns; bool temporary_rootfs; + bool cache_mode; + char *cache_data_path_root; + char *cache_data_path; + int cache_lock_fd; bool use_enroot_import; bool use_enroot_load; bool use_importer; @@ -87,6 +129,368 @@ static double timespec_diff_ms(const struct timespec *start, const struct timesp return (end->tv_sec - start->tv_sec) * 1000.0 + (end->tv_nsec - start->tv_nsec) / 1000000.0; } +/* ---- container-cache helpers ---- */ + +#define PYXIS_CACHE_CONTAINER_BASENAME_PREFIX "cache_u" +#define PYXIS_CACHE_CONTAINER_PREFIX "pyxis_cache_" +#define PYXIS_CACHE_LOCKFILE ".pyxis_cache_lock" + +static uint64_t fnv1a64_update(uint64_t h, const void *data, size_t len) +{ + const unsigned char *p = (const unsigned char *)data; + for (size_t i = 0; i < len; ++i) { + h ^= (uint64_t)p[i]; + h *= 1099511628211ULL; + } + return h; +} + +static bool image_looks_like_path(const char *image) +{ + /* Matches the logic used later in user_init() when detecting squashfs paths. */ + return image != NULL && strspn(image, "./") > 0; +} + +static int container_cache_build_basename(char **out, const char *image, uid_t uid) +{ + struct stat st; + uint64_t h = 1469598103934665603ULL; + char buf[128]; + int ret; + + if (out == NULL || image == NULL) + return (-1); + + h = fnv1a64_update(h, image, strlen(image)); + + if (image_looks_like_path(image) && stat(image, &st) == 0) { + ret = snprintf(buf, sizeof(buf), "|%lld|%lld", + (long long)st.st_mtime, (long long)st.st_size); + if (ret > 0 && (size_t)ret < sizeof(buf)) + h = fnv1a64_update(h, buf, (size_t)ret); + } + + ret = xasprintf(out, "%s%u_%016" PRIx64, PYXIS_CACHE_CONTAINER_BASENAME_PREFIX, (unsigned)uid, h); + if (ret < 0) + return (-1); + + return (0); +} + +static char *cache_data_path_for_uid(const char *root, uid_t uid) +{ + char *out = NULL; + int ret; + + if (root == NULL) + return NULL; + + ret = xasprintf(&out, "%s/%u", root, (unsigned)uid); + if (ret < 0) + return NULL; + return out; +} + +static int mkdir_p_owned(const char *path, uid_t uid, gid_t gid, mode_t mode) +{ + char tmp[PATH_MAX]; + size_t len; + struct stat st; + int ret; + + if (path == NULL) + return (-1); + + len = strnlen(path, sizeof(tmp)); + if (len == 0 || len >= sizeof(tmp)) + return (-1); + memcpy(tmp, path, len + 1); + + for (char *p = tmp + 1; *p; ++p) { + if (*p != '/') + continue; + *p = '\0'; + (void)mkdir(tmp, 0755); + *p = '/'; + } + + ret = mkdir(tmp, mode); + if (ret < 0 && errno != EEXIST) + return (-1); + + if (stat(tmp, &st) < 0) + return (-1); + if (!S_ISDIR(st.st_mode)) { + errno = ENOTDIR; + return (-1); + } + + if (st.st_uid != uid || st.st_gid != gid) { + if (chown(tmp, uid, gid) < 0) { + slurm_error("pyxis: couldn't chown %s: %s", tmp, strerror(errno)); + return (-1); + } + } + + if ((st.st_mode & 07777) != mode) { + if (chmod(tmp, mode) < 0) { + slurm_error("pyxis: couldn't chmod %s: %s", tmp, strerror(errno)); + return (-1); + } + } + return (0); +} + +static int touch_path(const char *path) +{ + struct timespec ts[2]; + + if (path == NULL) + return (-1); + + clock_gettime(CLOCK_REALTIME, &ts[0]); + ts[1] = ts[0]; + return utimensat(AT_FDCWD, path, ts, 0); +} + +static int cache_lock_shared(const char *container_dir, int *out_fd) +{ + char lock_path[PATH_MAX]; + int fd; + int ret; + + if (container_dir == NULL || out_fd == NULL) + return (-1); + + ret = snprintf(lock_path, sizeof(lock_path), "%s/%s", container_dir, PYXIS_CACHE_LOCKFILE); + if (ret < 0 || ret >= (int)sizeof(lock_path)) + return (-1); + + fd = open(lock_path, O_CREAT | O_RDWR | O_CLOEXEC, 0644); + if (fd < 0) + return (-1); + + if (flock(fd, LOCK_SH) < 0) { + close(fd); + return (-1); + } + + *out_fd = fd; + return (0); +} + +static int cache_gc_lock_fd(const char *data_path_root) +{ + char path[PATH_MAX]; + int fd; + int ret; + + if (data_path_root == NULL || data_path_root[0] == '\0') + return (-1); + + ret = snprintf(path, sizeof(path), "%s/pyxis-container-cache-gc.lock", data_path_root); + if (ret < 0 || ret >= (int)sizeof(path)) + return (-1); + + fd = open(path, O_CREAT | O_RDWR | O_CLOEXEC, 0644); + if (fd < 0) + return (-1); + return fd; +} + +static int rm_rf_cb(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) +{ + (void)sb; + (void)ftwbuf; + if (typeflag == FTW_DP || typeflag == FTW_D) + (void)rmdir(fpath); + else + (void)unlink(fpath); + return 0; +} + +static int rm_rf(const char *path) +{ + if (path == NULL) + return (-1); + return nftw(path, rm_rf_cb, 64, FTW_DEPTH | FTW_PHYS); +} + +struct cache_candidate { + char *path; + time_t mtime; +}; + +static int cache_candidate_cmp(const void *a, const void *b) +{ + const struct cache_candidate *ca = (const struct cache_candidate *)a; + const struct cache_candidate *cb = (const struct cache_candidate *)b; + if (ca->mtime < cb->mtime) + return -1; + if (ca->mtime > cb->mtime) + return 1; + return 0; +} + +static int cache_collect_candidates(const char *user_dirs_glob, struct cache_candidate **out, size_t *out_len) +{ + glob_t g_users = {0}, g_cache = {0}; + struct cache_candidate *list = NULL; + size_t list_len = 0; + int ret; + + if (out == NULL || out_len == NULL || user_dirs_glob == NULL) + return (-1); + + ret = glob(user_dirs_glob, 0, NULL, &g_users); + if (ret != 0) { + globfree(&g_users); + *out = NULL; + *out_len = 0; + return (0); + } + + for (size_t i = 0; i < g_users.gl_pathc; ++i) { + char pattern[PATH_MAX]; + int n = snprintf(pattern, sizeof(pattern), "%s/%s*", g_users.gl_pathv[i], PYXIS_CACHE_CONTAINER_PREFIX); + if (n < 0 || n >= (int)sizeof(pattern)) + continue; + + memset(&g_cache, 0, sizeof(g_cache)); + if (glob(pattern, 0, NULL, &g_cache) != 0) { + globfree(&g_cache); + continue; + } + + for (size_t j = 0; j < g_cache.gl_pathc; ++j) { + struct stat st; + if (lstat(g_cache.gl_pathv[j], &st) < 0) + continue; + if (!S_ISDIR(st.st_mode)) + continue; + + struct cache_candidate *tmp = realloc(list, (list_len + 1) * sizeof(*list)); + if (tmp == NULL) + continue; + list = tmp; + list[list_len].path = strdup(g_cache.gl_pathv[j]); + list[list_len].mtime = st.st_mtime; + list_len++; + } + globfree(&g_cache); + } + + globfree(&g_users); + *out = list; + *out_len = list_len; + return (0); +} + +static void cache_free_candidates(struct cache_candidate *list, size_t len) +{ + if (list == NULL) + return; + for (size_t i = 0; i < len; ++i) + free(list[i].path); + free(list); +} + +static int cache_fs_used_percent(const char *path, int *out_used_percent) +{ + struct statvfs vfs; + unsigned long long total, avail, used; + int used_pct; + + if (path == NULL || out_used_percent == NULL) + return (-1); + + if (statvfs(path, &vfs) < 0) + return (-1); + + total = (unsigned long long)vfs.f_blocks * (unsigned long long)vfs.f_frsize; + avail = (unsigned long long)vfs.f_bavail * (unsigned long long)vfs.f_frsize; + used = (total > avail) ? (total - avail) : 0; + + used_pct = (total == 0) ? 0 : (int)((used * 100ULL) / total); + *out_used_percent = used_pct; + return (0); +} + +static int cache_gc_if_needed(const char *data_path_root, int high_water, int low_water) +{ + int used_pct; + int ret; + int lock_fd = -1; + struct cache_candidate *cands = NULL; + size_t cands_len = 0; + char *user_glob = NULL; + + if (data_path_root == NULL) + return (0); + + ret = cache_fs_used_percent(data_path_root, &used_pct); + if (ret < 0) + return (0); + + if (used_pct < high_water) + return (0); + + /* Serialize GC for the whole cache root. */ + lock_fd = cache_gc_lock_fd(data_path_root); + if (lock_fd < 0) + return (0); + if (flock(lock_fd, LOCK_EX) < 0) { + close(lock_fd); + return (0); + } + + ret = xasprintf(&user_glob, "%s/*", data_path_root); + if (ret < 0 || user_glob == NULL) + goto out; + + ret = cache_collect_candidates(user_glob, &cands, &cands_len); + if (ret < 0) + goto out; + + qsort(cands, cands_len, sizeof(*cands), cache_candidate_cmp); + + for (size_t i = 0; i < cands_len; ++i) { + char lock_path[PATH_MAX]; + int fd; + int n; + + ret = cache_fs_used_percent(data_path_root, &used_pct); + if (ret == 0 && used_pct < low_water) + break; + + n = snprintf(lock_path, sizeof(lock_path), "%s/%s", cands[i].path, PYXIS_CACHE_LOCKFILE); + if (n < 0 || n >= (int)sizeof(lock_path)) + continue; + + fd = open(lock_path, O_CREAT | O_RDWR | O_CLOEXEC, 0644); + if (fd < 0) + continue; + + if (flock(fd, LOCK_EX | LOCK_NB) < 0) { + close(fd); + continue; /* in use */ + } + + slurm_info("pyxis: container-cache GC: evicting %s", cands[i].path); + (void)rm_rf(cands[i].path); + close(fd); + } + +out: + cache_free_candidates(cands, cands_len); + free(user_glob); + flock(lock_fd, LOCK_UN); + close(lock_fd); + return (0); +} + +/* ---- end container-cache helpers ---- */ + static struct plugin_context context = { .enabled = false, .log_fd = -1, @@ -101,6 +505,7 @@ static struct plugin_context context = { .container = { .name = NULL, .squashfs_path = NULL, .save_path = NULL, .reuse_rootfs = false, .reuse_ns = false, .temporary_rootfs = false, + .cache_mode = false, .cache_data_path_root = NULL, .cache_data_path = NULL, .cache_lock_fd = -1, .use_enroot_import = false, .use_enroot_load = false, .use_importer = false, .userns_fd = -1, .mntns_fd = -1, .cgroupns_fd = -1, .cwd_fd = -1 @@ -268,6 +673,7 @@ int pyxis_slurmstepd_post_opt(spank_t sp, int ac, char **av) /* Check environment variables for default values after command-line processing */ pyxis_args_check_environment_variables(sp); + pyxis_debug = env_bool(sp, "PYXIS_DEBUG", false); if (!pyxis_args_enabled()) return (0); @@ -282,6 +688,60 @@ int pyxis_slurmstepd_post_opt(spank_t sp, int ac, char **av) if (ret < 0) return (-1); + /* Pre-compute cache paths and run GC early (this hook runs with elevated privileges). */ + if (context.args->container_cache == 1) { + context.container.cache_mode = true; + + if (context.config.container_cache_data_path[0] == '\0') { + slurm_error("pyxis: error: --container-cache requires container_cache_data_path to be configured"); + return (-1); + } + + if (context.container.cache_data_path_root == NULL) { + char *root = NULL; + + root = strdup(context.config.container_cache_data_path); + if (root != NULL) { + context.container.cache_data_path_root = root; + context.container.cache_data_path = cache_data_path_for_uid(root, context.job.uid); + if (context.container.cache_data_path != NULL) + (void)mkdir_p_owned(context.container.cache_data_path, context.job.uid, context.job.gid, 0700); + } else + return (-1); + } + + /* + * Only run GC when we're about to create a new cached rootfs. + * (Avoid evicting a rootfs this job will just reuse.) + */ + if (context.args->image != NULL && + context.container.cache_data_path_root != NULL && + context.container.cache_data_path != NULL) { + char *cache_basename = NULL; + char *container_name = NULL; + char dir_path[PATH_MAX]; + struct stat st; + int n; + + ret = container_cache_build_basename(&cache_basename, context.args->image, context.job.uid); + if (ret == 0) { + ret = xasprintf(&container_name, "pyxis_%s", cache_basename); + } + if (ret >= 0 && container_name != NULL) { + n = snprintf(dir_path, sizeof(dir_path), "%s/%s", context.container.cache_data_path, container_name); + if (n > 0 && n < (int)sizeof(dir_path) && + (stat(dir_path, &st) < 0 || !S_ISDIR(st.st_mode))) { + (void)cache_gc_if_needed(context.container.cache_data_path_root, + context.config.container_cache_gc_high, + context.config.container_cache_gc_low); + } + } + + free(container_name); + free(cache_basename); + } + } + return (0); } @@ -424,6 +884,12 @@ static int enroot_set_env(void) /* If writable/readonly was not set by the user, we rely on the setting specified in the enroot config. */ } + /* container-cache may override ENROOT_DATA_PATH to a persistent, node-local directory. */ + if (context.container.cache_mode && context.container.cache_data_path != NULL) { + if (setenv("ENROOT_DATA_PATH", context.container.cache_data_path, 1) < 0) + return (-1); + } + if (setenv("PYXIS_RUNTIME_PATH", context.config.runtime_path, 1) < 0) return (-1); @@ -713,6 +1179,21 @@ static int enroot_container_create(void) clock_gettime(CLOCK_MONOTONIC, &end_time); slurm_info("pyxis: completed container setup in %.0f ms", timespec_diff_ms(&start_time, &end_time)); + /* Mark the cached rootfs as recently used and hold a shared lock for the job lifetime */ + if (context.container.cache_mode && context.container.cache_data_path != NULL && context.container.cache_lock_fd < 0) { + char dir_path[PATH_MAX]; + ret = snprintf(dir_path, sizeof(dir_path), "%s/%s", context.container.cache_data_path, context.container.name); + if (ret > 0 && ret < (int)sizeof(dir_path)) { + if (touch_path(dir_path) < 0) + PYXIS_DEBUG_LOG("container-cache: touch failed: %s", dir_path); + if (cache_lock_shared(dir_path, &context.container.cache_lock_fd) < 0) { + PYXIS_DEBUG_LOG("container-cache: lock failed: %s/%s", dir_path, PYXIS_CACHE_LOCKFILE); + } else { + PYXIS_DEBUG_LOG("container-cache: locked: %s/%s", dir_path, PYXIS_CACHE_LOCKFILE); + } + } + } + rv = 0; fail: @@ -1040,8 +1521,12 @@ int slurm_spank_user_init(spank_t sp, int ac, char **av) int spank_argc = 0; char **spank_argv = NULL; char *container_name = NULL; + char *cache_basename = NULL; pid_t pid; int rv = -1; + enum container_scope name_scope; + const char *requested_name; + const char *requested_flags; if (!context.enabled) return (0); @@ -1075,11 +1560,80 @@ int slurm_spank_user_init(spank_t sp, int ac, char **av) } } - if (context.args->container_name != NULL) { - if (context.config.container_scope == SCOPE_JOB) - ret = xasprintf(&container_name, "pyxis_%u_%s", context.job.jobid, context.args->container_name); + name_scope = context.config.container_scope; + requested_name = context.args->container_name; + requested_flags = context.args->container_name_flags; + + if (context.args->container_cache == 1) { + /* container-cache requires an image to derive a stable cache key */ + if (context.args->image == NULL) { + slurm_error("pyxis: error: --container-cache requires --container-image"); + goto fail; + } + if (context.args->container_save != NULL) { + slurm_error("pyxis: error: --container-cache is incompatible with --container-save"); + goto fail; + } + if (context.args->writable == 1) { + slurm_error("pyxis: error: --container-cache is incompatible with --container-writable"); + goto fail; + } + /* Force read-only containers in cache mode (prevents cross-job contamination) */ + context.args->writable = 0; + + /* Disallow special container-name flags in cache mode */ + if (requested_flags != NULL && strcmp(requested_flags, "auto") != 0) { + slurm_error("pyxis: error: --container-cache is incompatible with --container-name flags (use plain --container-name or omit it)"); + goto fail; + } + + /* Compute a stable cache basename for this user+image */ + ret = container_cache_build_basename(&cache_basename, context.args->image, context.job.uid); + if (ret < 0) { + slurm_error("pyxis: error: --container-cache: couldn't derive stable name"); + goto fail; + } + PYXIS_DEBUG_LOG("container-cache: basename=%s", cache_basename); + + requested_name = cache_basename; + requested_flags = "auto"; + /* Override job-scoped naming; cached containers must outlive a single job */ + name_scope = SCOPE_GLOBAL; + context.container.cache_mode = true; + } + + /* In cache mode, determine a persistent ENROOT_DATA_PATH root and derive a per-user directory. */ + if (context.container.cache_mode && + (context.container.cache_data_path_root == NULL || context.container.cache_data_path == NULL)) { + char *root = NULL; + + if (context.config.container_cache_data_path[0] == '\0') { + slurm_error("pyxis: error: --container-cache requires container_cache_data_path to be configured"); + goto fail; + } + + root = strdup(context.config.container_cache_data_path); + + if (root != NULL) { + context.container.cache_data_path_root = root; + context.container.cache_data_path = cache_data_path_for_uid(root, context.job.uid); + PYXIS_DEBUG_LOG("container-cache: ENROOT_DATA_PATH root=%s", root); + PYXIS_DEBUG_LOG("container-cache: ENROOT_DATA_PATH=%s", + context.container.cache_data_path ? context.container.cache_data_path : "(null)"); + if (context.container.cache_data_path != NULL) { + if (mkdir_p_owned(context.container.cache_data_path, context.job.uid, context.job.gid, 0700) < 0) + PYXIS_DEBUG_LOG("container-cache: couldn't init ENROOT_DATA_PATH=%s", + context.container.cache_data_path); + } + } else + goto fail; + } + + if (requested_name != NULL) { + if (name_scope == SCOPE_JOB) + ret = xasprintf(&container_name, "pyxis_%u_%s", context.job.jobid, requested_name); else - ret = xasprintf(&container_name, "pyxis_%s", context.args->container_name); + ret = xasprintf(&container_name, "pyxis_%s", requested_name); if (ret < 0) goto fail; @@ -1089,15 +1643,15 @@ int slurm_spank_user_init(spank_t sp, int ac, char **av) goto fail; } - if (strcmp(context.args->container_name_flags, "create") == 0 && pid >= 0) { + if (requested_flags != NULL && strcmp(requested_flags, "create") == 0 && pid >= 0) { slurm_error("pyxis: error: \"create\" flag was passed to --container-name but the container already exists"); goto fail; } - if (strcmp(context.args->container_name_flags, "exec") == 0 && pid <= 0) { + if (requested_flags != NULL && strcmp(requested_flags, "exec") == 0 && pid <= 0) { slurm_error("pyxis: error: \"exec\" flag was passed to --container-name but the container is not running"); goto fail; } - if (strcmp(context.args->container_name_flags, "no_exec") == 0 && pid > 0) + if (requested_flags != NULL && strcmp(requested_flags, "no_exec") == 0 && pid > 0) pid = 0; if (pid > 0) { @@ -1116,7 +1670,7 @@ int slurm_spank_user_init(spank_t sp, int ac, char **av) context.container.name = container_name; container_name = NULL; } else { - if (context.config.container_scope == SCOPE_JOB) + if (name_scope == SCOPE_JOB) ret = xasprintf(&context.container.name, "pyxis_%u_%u.%u", context.job.jobid, context.job.jobid, context.job.stepid); else ret = xasprintf(&context.container.name, "pyxis_%u.%u", context.job.jobid, context.job.stepid); @@ -1165,6 +1719,21 @@ int slurm_spank_user_init(spank_t sp, int ac, char **av) goto fail; } + /* If cache mode is enabled and the rootfs already exists, mark it as recently used and lock it */ + if (context.container.cache_mode && context.container.reuse_rootfs && context.container.cache_data_path != NULL) { + char dir_path[PATH_MAX]; + ret = snprintf(dir_path, sizeof(dir_path), "%s/%s", context.container.cache_data_path, context.container.name); + if (ret > 0 && ret < (int)sizeof(dir_path)) { + if (touch_path(dir_path) < 0) + PYXIS_DEBUG_LOG("container-cache: touch failed: %s", dir_path); + if (cache_lock_shared(dir_path, &context.container.cache_lock_fd) < 0) { + PYXIS_DEBUG_LOG("container-cache: lock failed: %s/%s", dir_path, PYXIS_CACHE_LOCKFILE); + } else { + PYXIS_DEBUG_LOG("container-cache: locked: %s/%s", dir_path, PYXIS_CACHE_LOCKFILE); + } + } + } + rv = 0; fail: @@ -1179,6 +1748,7 @@ int slurm_spank_user_init(spank_t sp, int ac, char **av) slurm_debug("pyxis: user_init() failed with rc=%d; postponing error for now, will report later", rv); context.user_init_rv = rv; free(container_name); + free(cache_basename); return (0); } @@ -1518,6 +2088,8 @@ int pyxis_slurmstepd_exit(spank_t sp, int ac, char **av) free(context.container.name); free(context.container.squashfs_path); free(context.container.save_path); + free(context.container.cache_data_path_root); + free(context.container.cache_data_path); if (context.job.environ != NULL) { for (int i = 0; context.job.environ[i] != NULL; ++i) @@ -1529,6 +2101,7 @@ int pyxis_slurmstepd_exit(spank_t sp, int ac, char **av) xclose(context.container.mntns_fd); xclose(context.container.cgroupns_fd); xclose(context.container.cwd_fd); + xclose(context.container.cache_lock_fd); xclose(context.log_fd); ret = shm_destroy(context.shm); diff --git a/tests/container_cache.bats b/tests/container_cache.bats new file mode 100644 index 0000000..690e7ef --- /dev/null +++ b/tests/container_cache.bats @@ -0,0 +1,386 @@ +#!/usr/bin/env bats + +load ./common + +function _test_image() { + local img="${PYXIS_TEST_SQSH_IMAGE:-/home/horde/vsabavat-code/enroot/ubuntu+latest.sqsh}" + if [ ! -f "${img}" ]; then + skip "set PYXIS_TEST_SQSH_IMAGE to a local .sqsh image path" + fi + echo "${img}" +} + +function _cache_root() { + if [ -n "${PYXIS_TEST_CACHE_ROOT:-}" ]; then + echo "${PYXIS_TEST_CACHE_ROOT}" + return 0 + fi + if [ -r /opt/slurm-test/etc/plugstack.conf ]; then + sed -n 's/.*container_cache_data_path=\([^ ]*\).*/\1/p' /opt/slurm-test/etc/plugstack.conf | head -n1 + return 0 + fi + echo "/tmp/enroot-data" +} + +function _gc_high() { + if [ -n "${PYXIS_TEST_GC_HIGH:-}" ]; then + echo "${PYXIS_TEST_GC_HIGH}" + return 0 + fi + if [ -r /opt/slurm-test/etc/plugstack.conf ]; then + local v + v="$(sed -n 's/.*container_cache_gc_high=\([^ ]*\).*/\1/p' /opt/slurm-test/etc/plugstack.conf | head -n1)" + if [ -n "${v}" ]; then + echo "${v}" + return 0 + fi + fi + echo "85" +} + +function _gc_low() { + if [ -n "${PYXIS_TEST_GC_LOW:-}" ]; then + echo "${PYXIS_TEST_GC_LOW}" + return 0 + fi + if [ -r /opt/slurm-test/etc/plugstack.conf ]; then + local v + v="$(sed -n 's/.*container_cache_gc_low=\([^ ]*\).*/\1/p' /opt/slurm-test/etc/plugstack.conf | head -n1)" + if [ -n "${v}" ]; then + echo "${v}" + return 0 + fi + fi + echo "80" +} + +function _mkimglink() { + local target="$1" + local link + link="$(mktemp -p /tmp pyxis-cache-img.XXXXXX.sqsh)" + rm -f "${link}" + ln -s "${target}" "${link}" + echo "${link}" +} + +function _cache_container_name_for_image() { + local image="$1" + local uid + uid="$(id -u)" + python3 - "${image}" "${uid}" <<'PY' +import os, sys + +image = sys.argv[1] +uid = int(sys.argv[2]) + +FNV_OFFSET = 1469598103934665603 +FNV_PRIME = 1099511628211 +MASK = (1 << 64) - 1 + +def fnv1a64_update(h: int, data: bytes) -> int: + for b in data: + h ^= b + h = (h * FNV_PRIME) & MASK + return h + +h = FNV_OFFSET +h = fnv1a64_update(h, image.encode()) + +if image and image[0] in ('.', '/'): + try: + st = os.stat(image) + extra = f"|{int(st.st_mtime)}|{int(st.st_size)}".encode() + h = fnv1a64_update(h, extra) + except FileNotFoundError: + pass + +print(f"pyxis_cache_u{uid}_{h:016x}") +PY +} + +function _cache_container_dir_for_image() { + local image="$1" + local uid + uid="$(id -u)" + local root + root="$(_cache_root)" + echo "${root}/${uid}/$(_cache_container_name_for_image "${image}")" +} + +function _cleanup_cache_for_image() { + local image="$1" + local dir + dir="$(_cache_container_dir_for_image "${image}")" + rm -rf "${dir}" || true +} + +function setup() { + unset PYXIS_DEBUG || true +} + +function teardown() { + unset PYXIS_DEBUG || true +} + +@test "--container-cache is incompatible with --container-writable" { + local img link + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + run_srun_unchecked --container-cache --container-image="${link}" --container-writable true + [ "${status}" -ne 0 ] + [[ "${output}" == *"--container-cache is incompatible with --container-writable"* ]] + + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true +} + +@test "--container-cache is incompatible with --container-save" { + local img link + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + run_srun_unchecked --container-cache --container-image="${link}" --container-save=/tmp/pyxis-cache-test.sqsh true + [ "${status}" -ne 0 ] + [[ "${output}" == *"--container-cache is incompatible with --container-save"* ]] + + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true +} + +@test "--container-cache requires --container-image" { + run_srun_unchecked --container-cache true + [ "${status}" -ne 0 ] + [[ "${output}" == *"--container-cache requires --container-image"* ]] +} + +@test "--container-cache is incompatible with --container-name flags" { + local img link + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + run_srun_unchecked --container-cache --container-image="${link}" --container-name=cache-test:exec true + [ "${status}" -ne 0 ] + [[ "${output}" == *"--container-cache is incompatible with --container-name flags"* ]] + + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true +} + +@test "--container-cache creates a stable pyxis_cache_u_* rootfs under container_cache_data_path" { + local img link uid root name dir + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + uid="$(id -u)" + root="$(_cache_root)" + name="$(_cache_container_name_for_image "${link}")" + dir="$(_cache_container_dir_for_image "${link}")" + + rm -rf "${dir}" || true + + run_srun --container-cache --container-image="${link}" true + + [ -d "${dir}" ] + [ -f "${dir}/.pyxis_cache_lock" ] + [ "$(stat -c %a "${root}/${uid}")" = "700" ] + [ "$(stat -c %u "${root}/${uid}")" = "${uid}" ] + [[ "${name}" == "pyxis_cache_u${uid}_"* ]] + [[ "${dir}" == "${root}/${uid}/"* ]] + + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true +} + +@test "--container-cache ignores plain --container-name (stable name is still used)" { + local img link uid root stable_dir ignored_name ignored_dir + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + uid="$(id -u)" + root="$(_cache_root)" + + ignored_name="cache-name-ignored-${RANDOM}" + ignored_dir="${root}/${uid}/pyxis_${ignored_name}" + stable_dir="$(_cache_container_dir_for_image "${link}")" + + rm -rf "${ignored_dir}" "${stable_dir}" || true + + run_srun --container-cache --container-image="${link}" --container-name="${ignored_name}" true + + [ -d "${stable_dir}" ] + [ ! -e "${ignored_dir}" ] + + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true +} + +@test "--container-cache forces a read-only rootfs (even with --container-remap-root)" { + local img link + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + run_srun_unchecked --container-cache --container-image="${link}" --container-remap-root sh -c 'touch /pyxis_ro_test' + [ "${status}" -ne 0 ] + [[ "${output}" == *"Read-only file system"* ]] + + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true +} + +@test "container-cache GC does not evict cache entries that are locked/in-use" { + if ! command -v flock >/dev/null 2>&1; then + skip "flock not available" + fi + + local img link uid root cache_dir locked_dir unlocked_dir lock_file lock_pid + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + uid="$(id -u)" + root="$(_cache_root)" + cache_dir="${root}/${uid}" + + mkdir -p "${cache_dir}" + + local high used_pct + high="$(_gc_high)" + used_pct="$(python3 - "${cache_dir}" <<'PY' +import os +import sys + +path = sys.argv[1] +st = os.statvfs(path) +total = st.f_blocks * st.f_frsize +avail = st.f_bavail * st.f_frsize +used = total - avail +pct = int((used * 100) // total) if total else 0 +print(pct) +PY +)" + if [ "${used_pct}" -lt "${high}" ]; then + rm -f "${link}" || true + skip "cache fs usage ${used_pct}% < ${high}% (GC won't run)" + fi + + locked_dir="${cache_dir}/pyxis_cache_u${uid}_lockedtest_${RANDOM}" + unlocked_dir="${cache_dir}/pyxis_cache_u${uid}_unlockedtest_${RANDOM}" + + mkdir -p "${locked_dir}" "${unlocked_dir}" + lock_file="${locked_dir}/.pyxis_cache_lock" + : > "${lock_file}" + + touch -d "1970-01-01 00:00:00" "${locked_dir}" 2>/dev/null || true + touch -d "1970-01-01 00:00:01" "${unlocked_dir}" 2>/dev/null || true + + flock -s "${lock_file}" -c 'sleep 300' & + lock_pid=$! + + run_srun --container-cache --container-image="${link}" true + + kill "${lock_pid}" >/dev/null 2>&1 || true + wait "${lock_pid}" >/dev/null 2>&1 || true + + [ -d "${locked_dir}" ] + [ ! -e "${unlocked_dir}" ] + + rm -rf "${locked_dir}" || true + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true +} + +@test "container-cache GC can evict entries in other user dirs when space is tight" { + local img link uid root high used_pct other_uid other_dir victim + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + uid="$(id -u)" + root="$(_cache_root)" + high="$(_gc_high)" + + used_pct="$(python3 - "${root}" <<'PY' +import os +import sys + +path = sys.argv[1] +st = os.statvfs(path) +total = st.f_blocks * st.f_frsize +avail = st.f_bavail * st.f_frsize +used = total - avail +pct = int((used * 100) // total) if total else 0 +print(pct) +PY +)" + if [ "${used_pct}" -lt "${high}" ]; then + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true + skip "cache fs usage ${used_pct}% < ${high}% (GC won't run)" + fi + + other_uid="9999${RANDOM}" + other_dir="${root}/${other_uid}" + victim="${other_dir}/pyxis_cache_u${other_uid}_victim_${RANDOM}" + + cleanup() { + chmod 700 "${other_dir}" 2>/dev/null || true + rm -rf "${other_dir}" 2>/dev/null || true + _cleanup_cache_for_image "${link}" 2>/dev/null || true + rm -f "${link}" 2>/dev/null || true + } + trap cleanup EXIT + + if ! mkdir -p "${victim}"; then + skip "cannot create ${victim} (cache root not writable?)" + fi + : > "${victim}/.pyxis_cache_lock" + touch -d "1970-01-01 00:00:00" "${victim}" 2>/dev/null || true + chmod 000 "${other_dir}" 2>/dev/null || true + + run_srun --container-cache --container-image="${link}" true + + chmod 700 "${other_dir}" 2>/dev/null || true + [ ! -e "${victim}" ] +} + +@test "container-cache uses a stable rootfs directory name across runs" { + local img link container_dir + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + container_dir="$(_cache_container_dir_for_image "${link}")" + rm -rf "${container_dir}" || true + + run_srun --container-cache --container-image="${link}" true + [ -d "${container_dir}" ] + + run_srun --container-cache --container-image="${link}" true + [ -d "${container_dir}" ] + + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true +} + +@test "PYXIS_CONTAINER_CACHE=1 enables cache mode (no --container-cache flag needed)" { + local img link uid root name dir + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + uid="$(id -u)" + root="$(_cache_root)" + name="$(_cache_container_name_for_image "${link}")" + dir="$(_cache_container_dir_for_image "${link}")" + + rm -rf "${dir}" || true + + export PYXIS_CONTAINER_CACHE=1 + run_srun --container-image="${link}" true + unset PYXIS_CONTAINER_CACHE + + [ -d "${dir}" ] + [[ "${name}" == "pyxis_cache_u${uid}_"* ]] + + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true +} + + From 4aa489cc2622c35dc699e78843ad55811db46881 Mon Sep 17 00:00:00 2001 From: Vasanth Sabavat Date: Tue, 6 Jan 2026 17:24:48 +0000 Subject: [PATCH 2/2] use a temp directory during create and atomic move to rootfs dir --- pyxis_slurmstepd.c | 109 +++++++++++++++++++++++++++++++++---- tests/container_cache.bats | 32 ++++++++++- 2 files changed, 129 insertions(+), 12 deletions(-) diff --git a/pyxis_slurmstepd.c b/pyxis_slurmstepd.c index 98a1bb4..c0b0523 100644 --- a/pyxis_slurmstepd.c +++ b/pyxis_slurmstepd.c @@ -134,6 +134,7 @@ static double timespec_diff_ms(const struct timespec *start, const struct timesp #define PYXIS_CACHE_CONTAINER_BASENAME_PREFIX "cache_u" #define PYXIS_CACHE_CONTAINER_PREFIX "pyxis_cache_" #define PYXIS_CACHE_LOCKFILE ".pyxis_cache_lock" +#define PYXIS_CACHE_TMP_CONTAINER_PREFIX "pyxis_tmp_" static uint64_t fnv1a64_update(uint64_t h, const void *data, size_t len) { @@ -266,7 +267,33 @@ static int cache_lock_shared(const char *container_dir, int *out_fd) if (ret < 0 || ret >= (int)sizeof(lock_path)) return (-1); - fd = open(lock_path, O_CREAT | O_RDWR | O_CLOEXEC, 0644); + fd = open(lock_path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return (-1); + + if (flock(fd, LOCK_SH) < 0) { + close(fd); + return (-1); + } + + *out_fd = fd; + return (0); +} + +static int cache_lock_create_shared(const char *container_dir, int *out_fd) +{ + char lock_path[PATH_MAX]; + int fd; + int ret; + + if (container_dir == NULL || out_fd == NULL) + return (-1); + + ret = snprintf(lock_path, sizeof(lock_path), "%s/%s", container_dir, PYXIS_CACHE_LOCKFILE); + if (ret < 0 || ret >= (int)sizeof(lock_path)) + return (-1); + + fd = open(lock_path, O_CREAT | O_RDONLY | O_CLOEXEC, 0644); if (fd < 0) return (-1); @@ -467,9 +494,14 @@ static int cache_gc_if_needed(const char *data_path_root, int high_water, int lo if (n < 0 || n >= (int)sizeof(lock_path)) continue; - fd = open(lock_path, O_CREAT | O_RDWR | O_CLOEXEC, 0644); - if (fd < 0) + fd = open(lock_path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + if (errno == ENOENT) { + slurm_info("pyxis: container-cache GC: removing stale (missing lock): %s", cands[i].path); + (void)rm_rf(cands[i].path); + } continue; + } if (flock(fd, LOCK_EX | LOCK_NB) < 0) { close(fd); @@ -1110,9 +1142,24 @@ static int enroot_container_create(void) { int ret; char *enroot_uri = NULL; + const char *final_name = context.container.name; + char *create_name = context.container.name; + char *tmp_name = NULL; + char tmp_dir_path[PATH_MAX]; + char final_dir_path[PATH_MAX]; + bool cache_atomic = false; struct timespec start_time, end_time; int rv = -1; + /* In cache mode, build into a temporary rootfs name and atomically publish it via rename(2). */ + if (context.container.cache_mode && context.container.cache_data_path != NULL && context.container.name != NULL) { + cache_atomic = true; + ret = xasprintf(&tmp_name, "%s%s_%u", PYXIS_CACHE_TMP_CONTAINER_PREFIX, context.container.name, (unsigned)getpid()); + if (ret < 0 || tmp_name == NULL) + goto fail; + create_name = tmp_name; + } + if (context.container.use_enroot_import || context.container.use_enroot_load || context.container.use_importer) { if (strncmp("docker://", context.args->image, sizeof("docker://") - 1) == 0 || strncmp("dockerd://", context.args->image, sizeof("dockerd://") - 1) == 0) { @@ -1137,7 +1184,7 @@ static int enroot_container_create(void) clock_gettime(CLOCK_MONOTONIC, &start_time); if (context.container.use_enroot_load) { - ret = enroot_exec_wait_ctx((char *const[]){ "enroot", "load", "--name", context.container.name, enroot_uri, NULL }); + ret = enroot_exec_wait_ctx((char *const[]){ "enroot", "load", "--name", create_name, enroot_uri, NULL }); if (ret < 0) { slurm_error("pyxis: failed to import docker image"); enroot_print_log_ctx(true); @@ -1165,9 +1212,9 @@ static int enroot_container_create(void) } if (context.container.squashfs_path != NULL) { - slurm_info("pyxis: creating container filesystem: %s", context.container.name); + slurm_info("pyxis: creating container filesystem: %s", final_name); - ret = enroot_exec_wait_ctx((char *const[]){ "enroot", "create", "--name", context.container.name, context.container.squashfs_path, NULL }); + ret = enroot_exec_wait_ctx((char *const[]){ "enroot", "create", "--name", create_name, context.container.squashfs_path, NULL }); if (ret < 0) { slurm_error("pyxis: failed to create container filesystem"); enroot_print_log_ctx(true); @@ -1176,20 +1223,53 @@ static int enroot_container_create(void) } } + /* Publish the cache rootfs only after a successful create/load. */ + if (cache_atomic) { + ret = snprintf(tmp_dir_path, sizeof(tmp_dir_path), "%s/%s", context.container.cache_data_path, create_name); + if (ret <= 0 || ret >= (int)sizeof(tmp_dir_path)) + goto fail; + ret = snprintf(final_dir_path, sizeof(final_dir_path), "%s/%s", context.container.cache_data_path, final_name); + if (ret <= 0 || ret >= (int)sizeof(final_dir_path)) + goto fail; + + /* Create+hold the cache lock before the final name becomes visible. */ + if (context.container.cache_lock_fd < 0) { + if (cache_lock_create_shared(tmp_dir_path, &context.container.cache_lock_fd) < 0) + goto fail; + PYXIS_DEBUG_LOG("container-cache: locked: %s/%s", tmp_dir_path, PYXIS_CACHE_LOCKFILE); + } + + if (rename(tmp_dir_path, final_dir_path) < 0) { + if (errno == EEXIST || errno == ENOTEMPTY) { + /* Another job won the race; drop our temp rootfs and reuse the published one. */ + xclose(context.container.cache_lock_fd); + context.container.cache_lock_fd = -1; + (void)rm_rf(tmp_dir_path); + if (cache_lock_shared(final_dir_path, &context.container.cache_lock_fd) < 0) + PYXIS_DEBUG_LOG("container-cache: lock failed: %s/%s", final_dir_path, PYXIS_CACHE_LOCKFILE); + else + PYXIS_DEBUG_LOG("container-cache: locked: %s/%s", final_dir_path, PYXIS_CACHE_LOCKFILE); + } else { + goto fail; + } + } + } + clock_gettime(CLOCK_MONOTONIC, &end_time); slurm_info("pyxis: completed container setup in %.0f ms", timespec_diff_ms(&start_time, &end_time)); /* Mark the cached rootfs as recently used and hold a shared lock for the job lifetime */ - if (context.container.cache_mode && context.container.cache_data_path != NULL && context.container.cache_lock_fd < 0) { + if (context.container.cache_mode && context.container.cache_data_path != NULL) { char dir_path[PATH_MAX]; ret = snprintf(dir_path, sizeof(dir_path), "%s/%s", context.container.cache_data_path, context.container.name); if (ret > 0 && ret < (int)sizeof(dir_path)) { if (touch_path(dir_path) < 0) PYXIS_DEBUG_LOG("container-cache: touch failed: %s", dir_path); - if (cache_lock_shared(dir_path, &context.container.cache_lock_fd) < 0) { - PYXIS_DEBUG_LOG("container-cache: lock failed: %s/%s", dir_path, PYXIS_CACHE_LOCKFILE); - } else { - PYXIS_DEBUG_LOG("container-cache: locked: %s/%s", dir_path, PYXIS_CACHE_LOCKFILE); + if (context.container.cache_lock_fd < 0) { + if (cache_lock_create_shared(dir_path, &context.container.cache_lock_fd) < 0) + PYXIS_DEBUG_LOG("container-cache: lock failed: %s/%s", dir_path, PYXIS_CACHE_LOCKFILE); + else + PYXIS_DEBUG_LOG("container-cache: locked: %s/%s", dir_path, PYXIS_CACHE_LOCKFILE); } } } @@ -1198,6 +1278,13 @@ static int enroot_container_create(void) fail: free(enroot_uri); + if (cache_atomic && tmp_name != NULL && context.container.cache_data_path != NULL) { + char path[PATH_MAX]; + ret = snprintf(path, sizeof(path), "%s/%s", context.container.cache_data_path, tmp_name); + if (ret > 0 && ret < (int)sizeof(path)) + (void)rm_rf(path); + } + free(tmp_name); if (context.container.use_enroot_import && context.container.squashfs_path != NULL) { ret = unlink(context.container.squashfs_path); if (ret < 0) diff --git a/tests/container_cache.bats b/tests/container_cache.bats index 690e7ef..7651fd0 100644 --- a/tests/container_cache.bats +++ b/tests/container_cache.bats @@ -273,7 +273,7 @@ PY touch -d "1970-01-01 00:00:00" "${locked_dir}" 2>/dev/null || true touch -d "1970-01-01 00:00:01" "${unlocked_dir}" 2>/dev/null || true - flock -s "${lock_file}" -c 'sleep 300' & + flock -s "${lock_file}" sleep 300 & lock_pid=$! run_srun --container-cache --container-image="${link}" true @@ -360,6 +360,36 @@ PY rm -f "${link}" || true } +@test "container-cache ignores stale temp build dirs and still creates the final cache rootfs" { + local img link uid root name dir tmp_dir tmp_sentinel + img="$(_test_image)" + link="$(_mkimglink "${img}")" + + uid="$(id -u)" + root="$(_cache_root)" + name="$(_cache_container_name_for_image "${link}")" + dir="$(_cache_container_dir_for_image "${link}")" + + tmp_dir="${root}/${uid}/pyxis_tmp_${name}_${RANDOM}" + tmp_sentinel="${tmp_dir}/.pyxis_tmp_sentinel" + + rm -rf "${dir}" "${tmp_dir}" || true + mkdir -p "${tmp_dir}" + echo "tmp" > "${tmp_sentinel}" + rm -f "${tmp_dir}/.pyxis_cache_lock" || true + + run_srun --container-cache --container-image="${link}" true + + [ -d "${dir}" ] + [ -f "${dir}/.pyxis_cache_lock" ] + [ -d "${tmp_dir}" ] + [ -f "${tmp_sentinel}" ] + + rm -rf "${tmp_dir}" || true + _cleanup_cache_for_image "${link}" + rm -f "${link}" || true +} + @test "PYXIS_CONTAINER_CACHE=1 enables cache mode (no --container-cache flag needed)" { local img link uid root name dir img="$(_test_image)"