From 8ce48d39766f381a0b1368a3de6f82987ded3e9c Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Tue, 20 Jan 2026 00:45:50 -0800 Subject: [PATCH 1/9] incorporated new shm code --- Makefile | 2 +- src/platform_linux/platform_heap.c | 23 +- src/platform_linux/platform_heap.h | 46 +- src/platform_linux/shmem.c | 1299 --------------------- src/platform_linux/shmem.h | 109 -- src/splinterdb.c | 3 - tests/config.c | 14 - tests/unit/splinter_shmem_test.c | 628 ---------- tests/unit/splinterdb_forked_child_test.c | 37 - tests/unit/splinterdb_heap_id_mgmt_test.c | 336 ++++-- 10 files changed, 255 insertions(+), 2242 deletions(-) delete mode 100644 src/platform_linux/shmem.c delete mode 100644 src/platform_linux/shmem.h delete mode 100644 tests/unit/splinter_shmem_test.c diff --git a/Makefile b/Makefile index 216af299..8931373f 100644 --- a/Makefile +++ b/Makefile @@ -394,7 +394,7 @@ PLATFORM_SYS = $(OBJDIR)/$(SRCDIR)/$(PLATFORM_DIR)/platform_assert.o \ $(OBJDIR)/$(SRCDIR)/$(PLATFORM_DIR)/platform_log.o \ $(OBJDIR)/$(SRCDIR)/$(PLATFORM_DIR)/platform_mutex.o \ $(OBJDIR)/$(SRCDIR)/$(PLATFORM_DIR)/platform_threads.o \ - $(OBJDIR)/$(SRCDIR)/$(PLATFORM_DIR)/shmem.o + $(OBJDIR)/$(SRCDIR)/$(PLATFORM_DIR)/shmalloc.o PLATFORM_IO_SYS = $(OBJDIR)/$(SRCDIR)/$(PLATFORM_DIR)/platform_io.o \ $(OBJDIR)/$(SRCDIR)/$(PLATFORM_DIR)/laio.o diff --git a/src/platform_linux/platform_heap.c b/src/platform_linux/platform_heap.c index 654bafbf..624e09b0 100644 --- a/src/platform_linux/platform_heap.c +++ b/src/platform_linux/platform_heap.c @@ -3,6 +3,7 @@ #include "platform_heap.h" #include "platform_status.h" +#include /* * Declare globals to track heap handle/ID that may have been created when @@ -25,24 +26,28 @@ platform_heap_create(platform_module_id UNUSED_PARAM(module_id), bool use_shmem, platform_heap_id *heap_id) { - *heap_id = PROCESS_PRIVATE_HEAP_ID; - if (use_shmem) { - platform_status rc = platform_shmcreate(max, (shmem_heap **)heap_id); - if (SUCCESS(rc)) { - Heap_id = *heap_id; + shmallocator *shm = mmap( + NULL, max, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (shm == MAP_FAILED) { + return STATUS_NO_MEMORY; } - return rc; + shmallocator_init(shm, max / 4096, max); + *heap_id = (platform_heap_id)shm; + + } else { + *heap_id = PROCESS_PRIVATE_HEAP_ID; } - *heap_id = NULL; return STATUS_OK; } void platform_heap_destroy(platform_heap_id *heap_id) { - // If shared segment was allocated, it's being tracked thru heap ID. 
if (*heap_id) { - return platform_shmdestroy((shmem_heap **)heap_id); + size_t size = shmallocator_size((shmallocator *)*heap_id); + shmallocator_deinit((shmallocator *)*heap_id); + munmap((void *)*heap_id, size); + *heap_id = NULL; } } diff --git a/src/platform_linux/platform_heap.h b/src/platform_linux/platform_heap.h index 061be9d2..bc22c042 100644 --- a/src/platform_linux/platform_heap.h +++ b/src/platform_linux/platform_heap.h @@ -8,7 +8,7 @@ #include "platform_util.h" #include "platform_machine.h" #include "platform_log.h" -#include "shmem.h" +#include "shmalloc.h" #include #include @@ -73,24 +73,13 @@ platform_aligned_malloc(const platform_heap_id heap_id, { // Requirement for aligned_alloc platform_assert(IS_POWER_OF_2(alignment)); + size_t aligned_size = (size + alignment - 1) & ~((uintptr_t)alignment - 1); - /* - * aligned_alloc requires size to be a multiple of alignment - * round up to nearest multiple of alignment - * - * Note that since this is inlined, the compiler will turn the constant - * (power of 2) alignment mod operations into bitwise & - */ - // RESOLVE: Delete this padding from caller. Push this down to - // platform_shm_alloc(). - const size_t padding = platform_align_bytes_reqd(alignment, size); - const size_t required = (size + padding); - - void *retptr = - (heap_id - ? platform_shm_alloc(heap_id, required, objname, func, file, lineno) - : aligned_alloc(alignment, required)); - return retptr; + if (heap_id) { + return shmalloc(heap_id, alignment, size); + } else { + return aligned_alloc(alignment, aligned_size); + } } /* @@ -115,16 +104,7 @@ platform_realloc(const platform_heap_id heap_id, // Farm control off to shared-memory based realloc, if it's configured if (heap_id) { - // The shmem-based allocator is expecting all memory requests to be of - // aligned sizes, as that's what platform_aligned_malloc() does. So, to - // keep that allocator happy, align this memory request if needed. - // As this is the case of realloc, we assume that it would suffice to - // align at platform's natural cacheline boundary. - const size_t padding = - platform_align_bytes_reqd(PLATFORM_CACHELINE_SIZE, newsize); - const size_t required = (newsize + padding); - return platform_shm_realloc( - heap_id, ptr, oldsize, required, __func__, __FILE__, __LINE__); + return shrealloc(heap_id, ptr, newsize); } else { return realloc(ptr, newsize); } @@ -139,14 +119,12 @@ platform_free_from_heap(platform_heap_id heap_id, int lineno) { if (heap_id) { - platform_shm_free(heap_id, ptr, objname, func, file, lineno); + shfree(heap_id, ptr); } else { free(ptr); } } -typedef struct shmem_heap shmem_heap; - platform_status platform_heap_create(platform_module_id module_id, size_t max, @@ -156,12 +134,6 @@ platform_heap_create(platform_module_id module_id, void platform_heap_destroy(platform_heap_id *heap_id); -void -platform_shm_set_splinterdb_handle(platform_heap_id heap_id, void *addr); - -shmem_heap * -platform_heap_id_to_shmaddr(platform_heap_id hid); - /* * Similar to the TYPED_MALLOC functions, for all the free functions we need to * call platform_get_heap_id() from a macro instead of an inline function diff --git a/src/platform_linux/shmem.c b/src/platform_linux/shmem.c deleted file mode 100644 index 504967ed..00000000 --- a/src/platform_linux/shmem.c +++ /dev/null @@ -1,1299 +0,0 @@ -// Copyright 2018-2026 VMware, Inc. 
-// SPDX-License-Identifier: Apache-2.0 - -/* - * shmem.c -- - * - * This file contains the implementation for managing shared memory created - * for use by SplinterDB and all its innards. - */ -#include "shmem.h" -#include "splinterdb/platform_linux/public_platform.h" -#include "platform_util.h" -#include "platform_machine.h" -#include "platform_threads.h" -#include "platform_log.h" -#include "platform_heap.h" -#include "platform_assert.h" -#include "platform_status.h" -#include "platform_spinlock.h" -#include "platform_units.h" -#include - -// SplinterDB's shared segment magic identifier. Mainly needed for diagnostics. -#define SPLINTERDB_SHMEM_MAGIC (uint64)0x543e4a6d - -// Boolean globals controlling tracing of shared memory allocs / frees -static bool Trace_shmem_allocs = FALSE; -static bool Trace_shmem_frees = FALSE; -static bool Trace_shmem = FALSE; -static bool Trace_large_frags = FALSE; - -/* - * --------------------------------------------------------------------------- - * shm_large_frag_info{} - Struct describing a large memory fragment allocation. - * - * This is a specialized memory-fragment tracker solely constructed to satisfy - * large memory requests. In Splinter large memory requests, i.e. something over - * 1 M bytes, occur rarely, mainly when we do stuff like compact or pack. - * And these operations are generally short-lived but could occur very - * frequently in heavy insert workloads. This mechanism is a way to track a - * free-list of such memory fragments that were allocated previously and are now - * 'freed'. They will be re-allocated to the next requestor, thereby, keeping - * the overall space required in shared memory to be somewhat optimal. - * - * NOTE: {to_pid, to_tid} and {by_pid, by_tid} fields go hand-in-hand. - * We track both for improved debugging. - * - * Lifecyle: - * - When a large fragment is initially allocated, frag_addr / frag_size will - * be set. - * - (allocated_to_pid != 0) && (freed_by_pid == 0) - Fragment is in use. - * - (allocated_to_pid != 0) && (freed_by_pid != 0) - Fragment is free. - * --------------------------------------------------------------------------- - */ -typedef struct shm_large_frag_info { - void *frag_addr; // Start address of this memory fragment - // NULL => tracking fragment is empty - size_t frag_size; // bytes (Used in re-allocation logic.) - - // Following fields are used mainly for assertions and diagnostics. - int frag_allocated_to_pid; // Allocated to this OS-pid - threadid frag_allocated_to_tid; // Allocated to this Splinter thread-ID - int frag_freed_by_pid; // OS-pid that freed this large fragment - threadid frag_freed_by_tid; // Splinter thread-ID that freed fragment -} shm_large_frag_info; - -/* - * All memory allocations of this size or larger will be tracked in the - * above fragment tracker array. For large inserts workload, we allocate large - * memory chunks for fingerprint array, which is more than a MiB. For scans, - * splinterdb_iterator_init() allocates memory for an iterator which is ~92+KiB. - * Set this to a lower value so we can re-cycle free fragments for iterators - * also. - */ -#if SPLINTER_DEBUG -# define SHM_LARGE_FRAG_SIZE (90 * KiB) -#else -# define SHM_LARGE_FRAG_SIZE (38 * KiB) -#endif // SPLINTER_DEBUG - -/* - * In the worst case we may have all threads performing activities that need - * such large memory fragments. We track up to twice the # of configured - * threads, which is still a small array to search. 
- */ -#define SHM_NUM_LARGE_FRAGS (MAX_THREADS * 2) - -/* - * ------------------------------------------------------------------------ - * Shared-memory usage statistics & metrics: - * - * Set of usage-stats fields copied from shmem_info{} struct, so that we - * can print these after shared segment has been destroyed. - * ------------------------------------------------------------------------ - */ -typedef struct shminfo_usage_stats { - size_t total_bytes; // Total size of shared segment allocated initially. - size_t used_bytes; // Used bytes of memory left (that were allocated) - size_t free_bytes; // Free bytes of memory left (that can be allocated) - size_t used_bytes_HWM; // High-water mark of memory used bytes - size_t nfrees; // # of calls to free memory - size_t nfrees_last_frag; // Freed last small-fragment - size_t nf_search_skipped; - size_t used_by_large_frags_bytes; // Actually reserved - uint32 nlarge_frags_tracked; - uint32 nlarge_frags_inuse; - uint32 nlarge_frags_inuse_HWM; - int nlarge_frags_found_in_use; - int shmid; -} shminfo_usage_stats; - -/* - * ----------------------------------------------------------------------------- - * shmem_heap{}: Shared memory Control Block: Used as a heap for memory allocs - * - * Core structure describing shared memory segment created. This lives right - * at the start of the allocated shared segment. - * - * NOTE(s): - * - shm_large_frag_hip tracks the highest-address of all the large fragments - * that are tracked. This is an optimization to short-circuit the search - * done when freeing any fragment, to see if it's a large-fragment. - * ----------------------------------------------------------------------------- - */ -struct shmem_heap { - void *shm_start; // Points to start address of shared segment. - void *shm_end; // Points to end address; one past end of sh segment - void *shm_next; // Points to next 'free' address to allocate from. - void *shm_last_alloc; // Points to address most-recently allocated - void *shm_splinterdb_handle; - void *shm_large_frag_hip; // Highest addr of large-fragments tracked - - platform_spinlock shm_mem_lock; // To sync alloc / free - - platform_spinlock shm_mem_frags_lock; - // Protected by shm_mem_frags_lock. Must hold to read or modify. 
- shm_large_frag_info shm_large_frags[SHM_NUM_LARGE_FRAGS]; - - shminfo_usage_stats usage; - uint64 shm_magic; // Magic identifier for shared memory segment - int shm_id; // Shared memory ID returned by shmget() - -} PLATFORM_CACHELINE_ALIGNED; - -/* Permissions for accessing shared memory and IPC objects */ -#define PLATFORM_IPC_OBJS_PERMS (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP) - -// Function prototypes - -static void -platform_shm_track_large_alloc(shmem_heap *shm, void *addr, size_t size); - -static void -platform_shm_track_free(shmem_heap *shm, - void *addr, - const char *objname, - const char *func, - const char *file, - const int lineno); - -static void * -platform_shm_find_large(shmem_heap *shm, - size_t size, - const char *objname, - const char *func, - const char *file, - const int lineno); - -static void -platform_shm_trace_allocs(shmem_heap *shm, - const size_t size, - const char *verb, - const void *retptr, - const char *objname, - const char *func, - const char *file, - const int lineno); - -static int -platform_trace_large_frags(shmem_heap *shm); - -bool -platform_shm_heap_valid(shmem_heap *shmheap); - -/* - * PLATFORM_HEAP_ID_TO_SHMADDR() -- - * - * The shared memory create function returns the address of shmem_heap->shm_id - * as the platform_heap_id heap-ID to the caller. Rest of Splinter will use this - * heap-ID as a 'handle' to manage / allocate shared memory. This macro converts - * the heap-ID handle to the shared memory's start address, from which the - * location of the next-free-byte can be tracked. - */ -shmem_heap * -platform_heap_id_to_shmaddr(platform_heap_id hid) -{ - debug_assert(hid != NULL); - return (shmem_heap *)hid; -} - -static inline void -shm_lock_mem(shmem_heap *shm) -{ - platform_spin_lock(&shm->shm_mem_lock); -} - -static inline void -shm_unlock_mem(shmem_heap *shm) -{ - platform_spin_unlock(&shm->shm_mem_lock); -} - -static inline void -shm_lock_mem_frags(shmem_heap *shm) -{ - platform_spin_lock(&shm->shm_mem_frags_lock); -} - -static inline void -shm_unlock_mem_frags(shmem_heap *shm) -{ - platform_spin_unlock(&shm->shm_mem_frags_lock); -} - -/* - * platform_valid_addr_in_heap(), platform_valid_addr_in_shm() - * - * Address 'addr' is valid if it's just past end of control block and within - * shared segment. - */ -static inline bool -platform_valid_addr_in_shm(shmem_heap *shmaddr, const void *addr) -{ - return ((addr >= ((void *)shmaddr + platform_shm_ctrlblock_size())) - && (addr < shmaddr->shm_end)); -} - -/* - * Validate that input address 'addr' is a valid address within shared segment - * region. - */ -bool -platform_valid_addr_in_heap(shmem_heap *heap, const void *addr) -{ - return platform_valid_addr_in_shm(heap, addr); -} - -/* - * Produce a formatted one-line output of shared memory usage stats / metrics. 
- */ -void -platform_shm_print_usage_stats(shminfo_usage_stats *usage) -{ - fraction used_bytes_pct; - fraction used_bytes_HWM_pct; - fraction free_bytes_pct; - fraction freed_last_frag_pct = zero_fraction; - fraction nf_search_skipped_pct = zero_fraction; - - used_bytes_pct = init_fraction(usage->used_bytes, usage->total_bytes); - used_bytes_HWM_pct = - init_fraction(usage->used_bytes_HWM, usage->total_bytes); - free_bytes_pct = init_fraction(usage->free_bytes, usage->total_bytes); - if (usage->nfrees) { - freed_last_frag_pct = - init_fraction(usage->nfrees_last_frag, usage->nfrees); - nf_search_skipped_pct = - init_fraction(usage->nf_search_skipped, usage->nfrees); - } - - // clang-format off - platform_default_log( - "Shared memory usage stats shmid=%d:" - " Total=%lu bytes (%s)" - ", Used=%lu bytes (%s, " FRACTION_FMT(4, 2) " %%)" - ", UsedHWM=%lu bytes (%s, " FRACTION_FMT(4, 2) " %%)" - ", Free=%lu bytes (%s, " FRACTION_FMT(4, 2) " %%)" - ", nfrees=%lu" - ", nfrees-last-small-frag=%lu (" FRACTION_FMT(4, 2) " %%)" - ", nf_search_skipped=%lu (" FRACTION_FMT(4, 2) " %%)" - ", Large fragments in-use HWM=%u (found in-use=%d)" - ", consumed=%lu bytes (%s)" - ".\n", - usage->shmid, - usage->total_bytes, size_str(usage->total_bytes), - - usage->used_bytes, size_str(usage->used_bytes), - (FRACTION_ARGS(used_bytes_pct) * 100), - - usage->used_bytes_HWM, size_str(usage->used_bytes_HWM), - (FRACTION_ARGS(used_bytes_HWM_pct) * 100), - - usage->free_bytes, size_str(usage->free_bytes), - (FRACTION_ARGS(free_bytes_pct) * 100), - - usage->nfrees, - usage->nfrees_last_frag, - (FRACTION_ARGS(freed_last_frag_pct) * 100), - - usage->nf_search_skipped, - (FRACTION_ARGS(nf_search_skipped_pct) * 100), - - usage->nlarge_frags_inuse_HWM, - usage->nlarge_frags_found_in_use, - usage->used_by_large_frags_bytes, - size_str(usage->used_by_large_frags_bytes)); - // clang-format on -} - -/* - * Save off shared memory usage stats in a usage struct. This is really - * needed to print usage-stats before we dismantle the shared segment - * entirely. - * - * Returns: # of large free-fragments found in-use. Usually when this - * function is called for diagnostics, this may not matter. When the - * shared segment is dismantled, a non-zero count for "in-use" large fragments - * is an indication that something is amiss. - */ -int -platform_save_usage_stats(shminfo_usage_stats *usage, shmem_heap *shm) -{ - *usage = shm->usage; - usage->nlarge_frags_found_in_use = platform_trace_large_frags(shm); - return usage->nlarge_frags_found_in_use; -} - -/* - * ----------------------------------------------------------------------------- - * Interface to print shared memory usage stats. (Callable from the debugger) - * This is mainly intended as a diagnostics tool, so we don't work too hard - * to grab metrics under exclusive access. - */ -void -platform_shm_print_usage(platform_heap_id hid) -{ - shmem_heap *shm = platform_heap_id_to_shmaddr(hid); - shminfo_usage_stats usage; - platform_save_usage_stats(&usage, shm); - platform_shm_print_usage_stats(&usage); -} - -/* - * ----------------------------------------------------------------------------- - * platform_shmcreate() -- Create a new shared memory segment. - * - * For a given heap ID, we expect that this create method will only be called - * once. [ Otherwise, it means some code is erroneously creating - * the shared segment twice, clobbering previously established handles. 
] - * ----------------------------------------------------------------------------- - */ -platform_status -platform_shmcreate(size_t size, shmem_heap **heap) // Out -{ - platform_assert((*heap == NULL), - "Heap handle is expected to be NULL while creating a new " - "shared segment.\n"); - - int shmid = shmget(0, size, (IPC_CREAT | PLATFORM_IPC_OBJS_PERMS)); - if (shmid == -1) { - platform_error_log( - "Failed to created shared segment of size %lu bytes (%s).\n", - size, - size_str(size)); - return STATUS_NO_MEMORY; - } - platform_default_log( - "Created shared memory of size %lu bytes (%s), shmid=%d.\n", - size, - size_str(size), - shmid); - - // Get start of allocated shared segment - void *shmaddr = shmat(shmid, NULL, 0); - - if (shmaddr == (void *)-1) { - platform_error_log("Failed to attach to shared segment, shmid=%d.\n", - shmid); - return STATUS_NO_MEMORY; - } - - // Setup shared segment's control block at head of shared segment. - shmem_heap *shm = (shmem_heap *)shmaddr; - - shm->shm_start = shmaddr; - shm->shm_end = (shmaddr + size); - shm->shm_next = (shmaddr + sizeof(shmem_heap)); - shm->shm_id = shmid; - shm->shm_magic = SPLINTERDB_SHMEM_MAGIC; - - size_t free_bytes = (size - sizeof(shmem_heap)); - shm->usage.total_bytes = size; - shm->usage.free_bytes = free_bytes; - - // Return heap handle pointing to start addr of shared segment. - if (heap) { - *heap = shm; - } - - platform_spinlock_init(&shm->shm_mem_lock); - - // Initialize spinlock needed to access memory fragments tracker - platform_spinlock_init(&shm->shm_mem_frags_lock); - - // Always trace creation of shared memory segment. - platform_default_log("Completed setup of shared memory of size " - "%lu bytes (%s), shmaddr=%p, shmid=%d," - " available memory = %lu bytes (%s).\n", - size, - size_str(size), - shmaddr, - shmid, - free_bytes, - size_str(free_bytes)); - return STATUS_OK; -} - -/* - * ----------------------------------------------------------------------------- - * platform_shmdestroy() -- Destroy a shared memory created for SplinterDB. - * ----------------------------------------------------------------------------- - */ -void -platform_shmdestroy(shmem_heap **heap) -{ - if (!heap) { - platform_error_log( - "Error! Attempt to destroy shared memory with NULL heap pointer!"); - return; - } - - const void *shmaddr = (const void *)(*heap); - - // Heap pointer may be coming from the shared segment, itself, that we will - // be detaching from now and freeing, below. So, an attempt to NULL out - // this handle after memory is freed will run into an exception. Clear - // out this handle prior to all this circus. - *heap = NULL; - - // Use a cached copy in case we are dealing with bogus input shmem address. - shmem_heap shmem_heap_struct; - memmove(&shmem_heap_struct, shmaddr, sizeof(shmem_heap_struct)); - - shmem_heap *shm = &shmem_heap_struct; - - if (shm->shm_magic != SPLINTERDB_SHMEM_MAGIC) { - platform_error_log("%s(): Input heap, %p, does not seem to be a " - "valid SplinterDB shared segment's start address." 
- " Found magic 0x%lX does not match expected" - " magic 0x%lX.\n", - __func__, - shmaddr, - shm->shm_magic, - SPLINTERDB_SHMEM_MAGIC); - return; - } - - // Retain some memory usage stats before releasing shmem - shminfo_usage_stats usage; - platform_save_usage_stats(&usage, shm); - - int shmid = shm->shm_id; - int rv = shmdt(shmaddr); - if (rv != 0) { - platform_error_log("Failed to detach from shared segment at address " - "%p, shmid=%d.\n", - shmaddr, - shmid); - return; - } - - // Externally, heap_id is pointing to this field. In anticipation that the - // removal of shared segment will succeed, below, clear this out. This way, - // any future accesses to this shared segment by its heap-ID will run into - // assertions. - shm->shm_id = 0; - - rv = shmctl(shmid, IPC_RMID, NULL); - if (rv != 0) { - platform_error_log( - "shmctl failed to remove shared segment at address %p, shmid=%d.\n", - shmaddr, - shmid); - - // restore state - shm->shm_id = shmid; - return; - } - - // Reset globals to NULL; to avoid accessing stale handles. - Heap_id = NULL; - - // Always trace destroy of shared memory segment. - platform_default_log("Deallocated SplinterDB shared memory " - "segment at %p, shmid=%d.\n", - shmaddr, - shmid); - - platform_shm_print_usage_stats(&usage); -} - -/* - * ----------------------------------------------------------------------------- - * platform_shm_alloc() -- Allocate n-bytes from shared memory segment. - * - * Allocation request is expected to have added-in pad-bytes required for - * alignment to some power-of-2 # of bytes. As a result, we can assert that - * the addr-of-next-free-byte is always aligned to PLATFORM_CACHELINE_SIZE. - * ----------------------------------------------------------------------------- - */ -// RESOLVE: Pass down user requested alignment and handle it here. -void * -platform_shm_alloc(shmem_heap *heap, - const size_t size, - const char *objname, - const char *func, - const char *file, - const int lineno) -{ - shmem_heap *shm = heap; - - debug_assert((platform_shm_heap_valid(shm) == TRUE), - "Shared memory heap at %p is not a valid shared memory ptr.", - heap); - - debug_assert(((size % PLATFORM_CACHELINE_SIZE) == 0), - "size=%lu is not aligned to PLATFORM_CACHELINE_SIZE", - size); - platform_assert(((((uint64)shm->shm_next) % PLATFORM_CACHELINE_SIZE) == 0), - "[%s:%d] Next free-addr is not aligned: " - "shm_next=%p, total_bytes=%lu, used_bytes=%lu" - ", free_bytes=%lu", - file, - lineno, - shm->shm_next, - shm->usage.total_bytes, - shm->usage.used_bytes, - shm->usage.free_bytes); - - void *retptr = NULL; - - // See if we can satisfy requests for large memory fragments from a cached - // list of used/free fragments that are tracked separately. - if ((size >= SHM_LARGE_FRAG_SIZE) - && ((retptr = - platform_shm_find_large(shm, size, objname, func, file, lineno)) - != NULL)) - { - return retptr; - } - - _Static_assert(sizeof(void *) == sizeof(size_t), - "check our casts are valid"); - - shm_lock_mem(shm); - - // Optimistically, allocate the requested 'size' bytes of memory. - retptr = __sync_fetch_and_add(&shm->shm_next, (void *)size); - - if ((retptr + size) > shm->shm_end) { - // This memory request cannot fit in available space. Reset. - __sync_fetch_and_sub(&shm->shm_next, (void *)size); - shm_unlock_mem(shm); - - platform_error_log( - "[%s:%d::%s()]: Insufficient memory in shared segment" - " to allocate %lu bytes for '%s'. Approx free space=%lu bytes." 
- " nlarge_frags_tracked=%u, nlarge_frags_inuse=%u (HWM=%u).\n", - file, - lineno, - func, - size, - objname, - shm->usage.free_bytes, - shm->usage.nlarge_frags_tracked, - shm->usage.nlarge_frags_inuse, - shm->usage.nlarge_frags_inuse_HWM); - platform_trace_large_frags(shm); - return NULL; - } - - shm->shm_last_alloc = retptr; - // Track approx memory usage metrics; mainly for troubleshooting - __sync_fetch_and_add(&shm->usage.used_bytes, size); - __sync_fetch_and_sub(&shm->usage.free_bytes, size); - if (shm->usage.used_bytes > shm->usage.used_bytes_HWM) { - shm->usage.used_bytes_HWM = shm->usage.used_bytes; - } - shm_unlock_mem(shm); - - if (size >= SHM_LARGE_FRAG_SIZE) { - platform_shm_track_large_alloc(shm, retptr, size); - } - - // Trace shared memory allocation; then return memory ptr. - if (Trace_shmem || Trace_shmem_allocs - || (Trace_large_frags && (size >= SHM_LARGE_FRAG_SIZE))) - { - platform_shm_trace_allocs(shm, - size, - "Allocated new fragment", - retptr, - objname, - func, - file, - lineno); - } - return retptr; -} - -/* - * ----------------------------------------------------------------------------- - * platform_shm_realloc() -- Re-allocate n-bytes from shared segment. - * - * Functionally is similar to 'realloc' system call. We allocate required # of - * bytes, copy over the old contents (if any), and do a fake free of the oldptr. - * ----------------------------------------------------------------------------- - */ -void * -platform_shm_realloc(shmem_heap *heap, - void *oldptr, - const size_t oldsize, - const size_t newsize, - const char *func, - const char *file, - const int lineno) -{ - debug_assert(((oldptr == NULL) && (oldsize == 0)) || (oldptr && oldsize), - "oldptr=%p, oldsize=%lu", - oldptr, - oldsize); - - // We can only realloc from an oldptr that's allocated from shmem - debug_assert(!oldptr || platform_valid_addr_in_heap(heap, oldptr), - "oldptr=%p is not allocated from shared memory", - oldptr); - - void *retptr = - platform_shm_alloc(heap, newsize, "Unknown", func, file, lineno); - if (retptr) { - - // Copy over old contents, if any, and free that memory piece - if (oldptr) { - memcpy(retptr, oldptr, oldsize); - platform_shm_free(heap, oldptr, "Unknown", func, file, lineno); - } - } else { - // Report approx memory usage metrics w/o spinlock (diagnostics) - shmem_heap *shm = heap; - size_t total_bytes = shm->usage.total_bytes; - size_t used_bytes = shm->usage.used_bytes; - size_t free_bytes = shm->usage.free_bytes; - size_t num_frees = shm->usage.nfrees; - fraction used_bytes_pct; - fraction free_bytes_pct; - used_bytes_pct = init_fraction(used_bytes, total_bytes); - free_bytes_pct = init_fraction(free_bytes, total_bytes); - - // clang-format off - platform_error_log("%s() failed to reallocate newsize=%lu bytes (%s)" - ", oldsize=%lu bytes (%s)" - ", Used=%lu bytes (%s, " FRACTION_FMT(4, 2) - " %%), Free=%lu bytes (%s, " FRACTION_FMT(4, 2) - " %%)" - ", num-free-calls=%lu\n", - __func__, - newsize, - size_str(newsize), - oldsize, - size_str(oldsize), - used_bytes, - size_str(used_bytes), - (FRACTION_ARGS(used_bytes_pct) * 100), - free_bytes, - size_str(free_bytes), - (FRACTION_ARGS(free_bytes_pct) * 100), - num_frees); - // clang-format off - } - return retptr; -} - -/* - * ----------------------------------------------------------------------------- - * platform_shm_free() -- 'Free' the memory fragment at given address in shmem. - * - * We expect that the 'ptr' is a valid address within the shared segment. 
- * Otherwise, it means that Splinter was configured to run with shared memory, - * -and- in some code path we allocated w/o using shared memory - * (i.e. PROCESS_PRIVATE_HEAP_ID interface), but ended up calling shmem-free - * interface. That would be a code error which results in a memory leak. - * ----------------------------------------------------------------------------- - */ -void -platform_shm_free(shmem_heap *heap, - void *ptr, - const char *objname, - const char *func, - const char *file, - const int lineno) -{ - shmem_heap *shm = heap; - - debug_assert( - (platform_shm_heap_valid(shm) == TRUE), - "Shared memory heap at %p is not a valid shared memory handle.", - heap); - - if (!platform_valid_addr_in_heap(heap, ptr)) { - platform_error_log("[%s:%d::%s()] -> %s: Requesting to free memory" - " at %p, for object '%s' which is a memory chunk not" - " allocated from shared memory {start=%p, end=%p}.\n", - file, - lineno, - func, - __func__, - ptr, - objname, - (void *)shm, - (void *)(shm->shm_end - 1)); - return; - } - - // Micro-optimization for very-last-fragment-allocated being freed - bool maybe_large_frag = TRUE; - size_t frag_size = 0; - - shm_lock_mem(shm); - shm->usage.nfrees++; - if (shm->shm_last_alloc == ptr) { - debug_assert( - shm->shm_next > ptr, "shm_next=%p, free-ptr=%p", shm->shm_next, ptr); - frag_size = (shm->shm_next - ptr); - if (frag_size < SHM_LARGE_FRAG_SIZE) { - // Recycle the most-recently-allocated-small-fragment, now being freed. - shm->shm_next = ptr; - shm->shm_last_alloc = NULL; - shm->usage.free_bytes += frag_size; - shm->usage.used_bytes -= frag_size; - shm->usage.nfrees_last_frag += 1; - - // We know fragment being freed is not a large fragment - maybe_large_frag = FALSE; - } - } - shm_unlock_mem(shm); - - if (maybe_large_frag) { - platform_shm_track_free(shm, ptr, objname, func, file, lineno); - } - - if (Trace_shmem || Trace_shmem_frees) { - platform_default_log(" [%s:%d::%s()] -> %s: Request to free memory at " - "%p for object '%s'.\n", - file, - lineno, - func, - __func__, - ptr, - objname); - } - return; -} - -/* - * ----------------------------------------------------------------------------- - * platform_shm_track_large_alloc() - Track the allocation of this large fragment. - * 'Tracking' here means we record this large-fragment in an array tracking - * large-memory fragments allocated. - * ----------------------------------------------------------------------------- - */ -static void -platform_shm_track_large_alloc(shmem_heap *shm, void *addr, size_t size) -{ - debug_assert( - (size >= SHM_LARGE_FRAG_SIZE), - "Incorrect usage of this interface for requested size=%lu bytes." - " Size should be >= %lu bytes.\n", - size, - SHM_LARGE_FRAG_SIZE); - - - // Iterate through the list of memory fragments being tracked. - int fctr = 0; - shm_large_frag_info *frag = shm->shm_large_frags; - shm_lock_mem_frags(shm); - while ((fctr < ARRAY_SIZE(shm->shm_large_frags)) && frag->frag_addr) { - // As this is a newly allocated fragment being tracked, it should - // not be found elsewhere in the tracker array. - platform_assert((frag->frag_addr != addr), - "Error! Newly allocated large memory fragment at %p" - " is already tracked at slot %d." - " Fragment is allocated to PID=%d, to tid=%lu," - " and is %s (freed by PID=%d, by tid=%lu)\n.", - addr, - fctr, - frag->frag_allocated_to_pid, - frag->frag_allocated_to_tid, - (frag->frag_freed_by_pid ? 
"free" : "in use"), - frag->frag_freed_by_pid, - frag->frag_freed_by_tid); - fctr++; - frag++; - } - // If we found a free slot, track our memory fragment at fctr'th slot. - if (fctr < ARRAY_SIZE(shm->shm_large_frags)) { - shm->usage.nlarge_frags_tracked++; - shm->usage.nlarge_frags_inuse++; - shm->usage.used_by_large_frags_bytes += size; - if (shm->usage.nlarge_frags_inuse > shm->usage.nlarge_frags_inuse_HWM) { - shm->usage.nlarge_frags_inuse_HWM = shm->usage.nlarge_frags_inuse; - } - - // We should really assert that the other fields are zero, but for now - // re-init this fragment tracker. - memset(frag, 0, sizeof(*frag)); - frag->frag_addr = addr; - frag->frag_size = size; - - frag->frag_allocated_to_pid = platform_get_os_pid(); - frag->frag_allocated_to_tid = platform_get_tid(); - - // The freed_by_pid/freed_by_tid == 0 means fragment is still allocated. - - // Track highest address of large-fragment that is being tracked. - if (shm->shm_large_frag_hip < addr) { - shm->shm_large_frag_hip = addr; - } - } - - shm_unlock_mem_frags(shm); -} - -/* - * ----------------------------------------------------------------------------- - * platform_shm_track_free() - See if this memory fragment being freed is - * already being tracked. If so, it's a large fragment allocation, which can be - * re-cycled after this free. Do the book-keeping accordingly to record that - * this large-fragment is no longer in-use and can be recycled. - * ----------------------------------------------------------------------------- - */ -static void -platform_shm_track_free(shmem_heap *shm, - void *addr, - const char *objname, - const char *func, - const char *file, - const int lineno) -{ - shm_lock_mem_frags(shm); - - // If we are freeing a fragment beyond the high-address of all - // large fragments tracked, then this is certainly not a large - // fragment. So, no further need to see if it's a tracked fragment. - if (addr > shm->shm_large_frag_hip) { - shm_unlock_mem_frags(shm); - return; - } - bool found_tracked_frag = FALSE; - bool trace_shmem = (Trace_shmem || Trace_shmem_frees); - - shm_large_frag_info *frag = shm->shm_large_frags; - int fctr = 0; - while ((fctr < ARRAY_SIZE(shm->shm_large_frags)) - && (!frag->frag_addr || (frag->frag_addr != addr))) - { - fctr++; - frag++; - } - if (fctr < ARRAY_SIZE(shm->shm_large_frags)) { - debug_assert(frag->frag_addr == addr); - found_tracked_frag = TRUE; - - // Cross-check the recording we did when fragment was allocated - // We cannot check the tid as the parent process' tid may be 0. - // We could have come across a free fragment that was previously - // used by the parent process. - // debug_assert(frag->frag_allocated_to_tid != 0); - debug_assert(frag->frag_allocated_to_pid != 0); - debug_assert(frag->frag_size != 0); - - shm->usage.nlarge_frags_inuse--; - - // Mark the fragment as in-use by recording the process/thread that's - // doing the free. 
- frag->frag_freed_by_pid = platform_get_os_pid(); - frag->frag_freed_by_tid = platform_get_tid(); - - if (trace_shmem) { - platform_default_log("OS-pid=%d, ThreadID=%lu" - ", Track freed fragment of size=%lu bytes" - ", at slot=%d, addr=%p" - ", allocated_to_pid=%d, allocated_to_tid=%lu" - ", shm_large_frag_hip=%p\n", - frag->frag_freed_by_pid, - frag->frag_freed_by_tid, - frag->frag_size, - fctr, - addr, - frag->frag_allocated_to_pid, - frag->frag_allocated_to_tid, - shm->shm_large_frag_hip); - } - } - shm_unlock_mem_frags(shm); - - if (!found_tracked_frag && trace_shmem) { - platform_default_log("[OS-pid=%d, ThreadID=%lu, %s:%d::%s()] " - ", Fragment %p for object '%s' is not tracked\n", - platform_get_os_pid(), - platform_get_tid(), - file, - lineno, - func, - addr, - objname); - } -} - -/* - * ----------------------------------------------------------------------------- - * platform_shm_find_large() - Search the array of large-fragments being tracked - * to see if there is an already allocated and now-free large memory fragment. - * If so, allocate that fragment to this requester. Do the book-keeping - * accordingly. - * ----------------------------------------------------------------------------- - */ -static void * -platform_shm_find_large(shmem_heap *shm, - size_t size, - const char *objname, - const char *func, - const char *file, - const int lineno) -{ - debug_assert((size >= SHM_LARGE_FRAG_SIZE), - "Incorrect usage of this interface for requested" - " size=%lu bytes. Size should be >= %lu bytes.\n", - size, - SHM_LARGE_FRAG_SIZE); - - void *retptr = NULL; - shm_large_frag_info *frag = shm->shm_large_frags; - int local_in_use = 0; // Tracked while iterating in this fn, locally - - int found_at_fctr = -1; - bool found_tracked_frag = FALSE; - - shm_lock_mem_frags(shm); - - uint32 nlarge_frags_tracked = shm->usage.nlarge_frags_tracked; - uint32 nlarge_frags_inuse = shm->usage.nlarge_frags_inuse; - - for (int fctr = 0; fctr < ARRAY_SIZE(shm->shm_large_frags); fctr++, frag++) { - if (!frag->frag_addr || (frag->frag_size < size)) { - continue; - } - - // Skip fragment if it's still in-use - if (frag->frag_freed_by_pid == 0) { - platform_assert((frag->frag_freed_by_tid == 0), - "Invalid state found for fragment at index %d," - "freed_by_pid=%d but freed_by_tid=%lu " - "(which should also be 0)\n", - fctr, - frag->frag_freed_by_pid, - frag->frag_freed_by_tid); - - local_in_use++; - continue; - } - found_tracked_frag = TRUE; - found_at_fctr = fctr; - - // Record the process/thread to which free fragment is being allocated - frag->frag_allocated_to_pid = platform_get_os_pid(); - frag->frag_allocated_to_tid = platform_get_tid(); - - shm->usage.nlarge_frags_inuse++; - if (shm->usage.nlarge_frags_inuse > shm->usage.nlarge_frags_inuse_HWM) { - shm->usage.nlarge_frags_inuse_HWM = shm->usage.nlarge_frags_inuse; - } - nlarge_frags_inuse = shm->usage.nlarge_frags_inuse; - - // Now, mark that this fragment is in-use - frag->frag_freed_by_pid = 0; - frag->frag_freed_by_tid = 0; - - retptr = frag->frag_addr; - - // Zero out the recycled large-memory fragment, just to be sure ... - memset(retptr, 0, frag->frag_size); - break; - } - shm_unlock_mem_frags(shm); - - // Trace whether we found tracked fragment or not. - if (Trace_shmem || Trace_shmem_allocs) { - char msg[200]; - if (found_tracked_frag) { - /* - * In this trace message, don't be confused if you see a wide gap - * between local_in_use and nlarge_frags_inuse. The latter is a global - * counter while the former is a local counter. 
We may have found a - * free fragment to reallocate early in the array w/o processing the - * full array. Hence, these two values are likely to diff (by a big - * margin, even). - */ - snprintf(msg, - sizeof(msg), - "Reallocated free fragment at slot=%d, addr=%p, " - "nlarge_frags_tracked=%u, nlarge_frags_inuse=%u" - " (local_in_use=%d)", - found_at_fctr, - retptr, - nlarge_frags_tracked, - nlarge_frags_inuse, - local_in_use); - } else { - snprintf(msg, - sizeof(msg), - "Did not find free fragment of size=%lu bytes to reallocate." - " nlarge_frags_tracked=%u, nlarge_frags_inuse=%u" - " (local_in_use=%d)", - size, - nlarge_frags_tracked, - nlarge_frags_inuse, - local_in_use); - } - platform_shm_trace_allocs( - shm, size, msg, retptr, objname, func, file, lineno); - } - return retptr; -} - -/* - * ----------------------------------------------------------------------------- - * platform_trace_large_frags() - Walk through large-fragments tracking array - * and dump info about fragments that still appear "in-use". - * This diagnostic routine will -always- be called when shared segment is - * being destroyed. If any "in-use" large-fragments are found, a message will - * be generated. - * ----------------------------------------------------------------------------- - */ -static int -platform_trace_large_frags(shmem_heap *shm) -{ - int local_in_use = 0; // Tracked while iterating in this fn, locally - shm_large_frag_info *frag = shm->shm_large_frags; - - threadid thread_tid = platform_get_tid(); - bool print_new_line = false; - // Walk the tracked-fragments array looking for an in-use fragment - for (int fctr = 0; fctr < ARRAY_SIZE(shm->shm_large_frags); fctr++, frag++) { - if (!frag->frag_addr) { - continue; - } - - // Skip freed fragments. - if (frag->frag_freed_by_pid != 0) { - continue; - } - // Found a large fragment that is still "in-use" - local_in_use++; - - // Do -NOT- assert here. As this is a diagnostic routine, report - // the inconsistency, and continue to find more stray large fragments. - if (frag->frag_freed_by_tid != 0) { - platform_error_log("Invalid state found for fragment at index %d," - "freed_by_pid=%d but freed_by_tid=%lu " - "(which should also be 0)\n", - fctr, - frag->frag_freed_by_pid, - frag->frag_freed_by_tid); - } - - if (!print_new_line) { - platform_error_log("\n**** [TID=%lu] Large fragment usage " - "diagnostics:\n", - thread_tid); - print_new_line = true; - } - - platform_error_log(" **** [TID=%lu] Fragment at slot=%d, addr=%p" - ", size=%lu (%s) is in-use, allocated_to_pid=%d" - ", allocated_to_tid=%lu\n", - thread_tid, - fctr, - frag->frag_addr, - frag->frag_size, - size_str(frag->frag_size), - frag->frag_allocated_to_pid, - frag->frag_allocated_to_tid); - } - return local_in_use; -} - -/* - * ----------------------------------------------------------------------------- - * Accessor interfaces - mainly intended as assert / testing / debugging - * hooks. - * ----------------------------------------------------------------------------- - */ -bool -platform_shm_heap_valid(shmem_heap *shmheap) -{ - // Use a cached copy in case we are dealing with a bogus input shmem - // address. - shmem_heap shmem_heap_struct; - memmove(&shmem_heap_struct, (void *)shmheap, sizeof(shmem_heap_struct)); - - shmem_heap *shm = &shmem_heap_struct; - - if (shm->shm_magic != SPLINTERDB_SHMEM_MAGIC) { - platform_error_log( - "%s(): Input shared memory heap, %p, does not seem to be a valid " - "SplinterDB shared segment's start address." 
- " Found magic 0x%lX does not match expected magic 0x%lX.\n", - __func__, - shmheap, - shm->shm_magic, - SPLINTERDB_SHMEM_MAGIC); - return FALSE; - } - - return TRUE; -} - -/* - * ----------------------------------------------------------------------------- - * Warning! Testing & Diagnostics interfaces, which are written to support - * verification of splinterdb handle from forked child processes when running - * Splinter configured with shared-segment. - * - * platform_heap_set_splinterdb_handle() - Save-off the handle to splinterdb * - * in the shared segment's control block. - * - * platform_heap_get_splinterdb_handle() - Return the handle to splinterdb * - * saved-off in the shared segment's control block. - * ----------------------------------------------------------------------------- - */ -bool -platform_shm_heap_id_valid(const platform_heap_id heap_id) -{ - const shmem_heap *shm = platform_heap_id_to_shmaddr(heap_id); - return (shm->shm_magic == SPLINTERDB_SHMEM_MAGIC); -} - -void -platform_shm_set_splinterdb_handle(platform_heap_id heap_id, void *addr) -{ - debug_assert(platform_shm_heap_id_valid(heap_id)); - shmem_heap *shm = platform_heap_id_to_shmaddr(heap_id); - shm->shm_splinterdb_handle = addr; -} - -void * -platform_heap_get_splinterdb_handle(shmem_heap *heap) -{ - debug_assert(platform_shm_heap_valid(heap)); - return heap->shm_splinterdb_handle; -} - -/* - * Initialize tracing of shared memory allocs / frees. This is invoked as a - * result of parsing command-line args: - */ -void -platform_shm_tracing_init(const bool trace_shmem, - const bool trace_shmem_allocs, - const bool trace_shmem_frees) -{ - if (trace_shmem) { - Trace_shmem = TRUE; - } - if (trace_shmem_allocs) { - Trace_shmem_allocs = TRUE; - } - if (trace_shmem_frees) { - Trace_shmem_frees = TRUE; - } -} - -/* - * Action-methods to enable / disable tracing of shared memory operations: - * ops == allocs & frees. - */ -void -platform_enable_tracing_shm_ops() -{ - Trace_shmem = TRUE; -} - -void -platform_enable_tracing_shm_allocs() -{ - Trace_shmem_allocs = TRUE; -} - -void -platform_enable_tracing_shm_frees() -{ - Trace_shmem_frees = TRUE; -} - -void -platform_disable_tracing_shm_ops() -{ - Trace_shmem = FALSE; - Trace_shmem_allocs = FALSE; - Trace_shmem_frees = FALSE; -} - -void -platform_disable_tracing_shm_allocs() -{ - Trace_shmem_allocs = FALSE; -} - -void -platform_disable_tracing_shm_frees() -{ - Trace_shmem_frees = FALSE; -} - -void -platform_enable_tracing_large_frags() -{ - Trace_large_frags = TRUE; -} - -void -platform_disable_tracing_large_frags() -{ - Trace_large_frags = FALSE; -} - -/* Size of control block at start of shared memory describing shared segment - */ -size_t -platform_shm_ctrlblock_size() -{ - return sizeof(shmem_heap); -} - -/* - * Shmem-accessor interfaces by heap. 
- */ -size_t -platform_shmsize(shmem_heap *heap) -{ - return (heap->usage.total_bytes); -} - -size_t -platform_shmbytes_used(shmem_heap *heap) -{ - return (heap->usage.used_bytes); -} -size_t -platform_shmbytes_free(shmem_heap *heap) -{ - return (heap->usage.free_bytes); -} - -void * -platform_shm_next_free_addr(shmem_heap *heap) -{ - return (heap->shm_next); -} - -static void -platform_shm_trace_allocs(shmem_heap *shm, - const size_t size, - const char *verb, - const void *retptr, - const char *objname, - const char *func, - const char *file, - const int lineno) -{ - platform_default_log(" [OS-pid=%d,ThreadID=%lu, %s:%d::%s()] " - "-> %s: %s size=%lu bytes (%s)" - " for object '%s', at %p, " - "free bytes=%lu (%s).\n", - platform_get_os_pid(), - platform_get_tid(), - file, - lineno, - func, - __func__, - verb, - size, - size_str(size), - objname, - retptr, - shm->usage.free_bytes, - size_str(shm->usage.free_bytes)); -} diff --git a/src/platform_linux/shmem.h b/src/platform_linux/shmem.h deleted file mode 100644 index 5fe5a71e..00000000 --- a/src/platform_linux/shmem.h +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2018-2026 VMware, Inc. -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "platform_status.h" -#include -#include - -typedef struct shmem_heap shmem_heap; - -platform_status -platform_shmcreate(size_t size, shmem_heap **heap); - -void -platform_shmdestroy(shmem_heap **heap); - -/* - * Allocate memory fragment from the shared memory of requested 'size'. - */ -void * -platform_shm_alloc(shmem_heap *heap, - const size_t size, - const char *objname, - const char *func, - const char *file, - const int lineno); - -/* - * Free the memory fragment at 'ptr' address. - */ -void -platform_shm_free(shmem_heap *heap, - void *ptr, - const char *objname, - const char *func, - const char *file, - const int lineno); - -/* - * Reallocate the memory (fragment) at 'oldptr' of size 'oldsize' bytes. - * Any contents at 'oldptr' are copied to 'newptr' for 'oldsize' bytes. - * - * NOTE: This interface does -not- do any cache-line alignment for 'newsize' - * request. Caller is expected to do so. platform_realloc() takes care of it. - * - * Returns ptr to re-allocated memory of 'newsize' bytes. - */ -void * -platform_shm_realloc(shmem_heap *heap, - void *oldptr, - const size_t oldsize, - const size_t newsize, - const char *func, - const char *file, - const int lineno); - -void -platform_shm_tracing_init(const bool trace_shmem, - const bool trace_shmem_allocs, - const bool trace_shmem_frees); - -void -platform_enable_tracing_shm_ops(); - -void -platform_enable_tracing_shm_allocs(); - -void -platform_enable_tracing_shm_frees(); - -void -platform_disable_tracing_shm_ops(); - -void -platform_disable_tracing_shm_allocs(); - -void -platform_disable_tracing_shm_frees(); - -void -platform_enable_tracing_large_frags(); - -void -platform_disable_tracing_large_frags(); - -size_t -platform_shm_ctrlblock_size(); - -/* - * Interfaces to retrieve size(s) using heap. 
- */ -size_t -platform_shmsize(shmem_heap *heap); - -size_t -platform_shmbytes_free(shmem_heap *heap); - -size_t -platform_shmbytes_used(shmem_heap *heap); - -void * -platform_shm_next_free_addr(shmem_heap *heap); - -bool -platform_valid_addr_in_heap(shmem_heap *heap, const void *addr); - -void * -platform_heap_get_splinterdb_handle(shmem_heap *heap); diff --git a/src/splinterdb.c b/src/splinterdb.c index 55db337a..361bd14e 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -300,9 +300,6 @@ splinterdb_create_or_open(const splinterdb_config *kvs_cfg, // IN // All future memory allocation should come from shared memory, if so // configured. kvs->heap_id = use_this_heap_id; - if (we_created_heap) { - platform_shm_set_splinterdb_handle(use_this_heap_id, (void *)kvs); - } kvs->io_handle = io_handle_create(&kvs->io_cfg, kvs->heap_id); if (kvs->io_handle == NULL) { diff --git a/tests/config.c b/tests/config.c index 3e310139..2c259215 100644 --- a/tests/config.c +++ b/tests/config.c @@ -158,7 +158,6 @@ config_usage() platform_error_log( "\t--use-shmem **** Experimental feature ****\n"); // clang-format off - platform_error_log("\t [ --trace-shmem | --trace-shmem-allocs | --trace-shmem-frees ]\n"); platform_error_log("\t [ --shmem-capacity-mib (%lu) | --shmem-capacity-gib (%d) ]\n", (TEST_CONFIG_DEFAULT_SHMEM_SIZE_GB * KiB), TEST_CONFIG_DEFAULT_SHMEM_SIZE_GB); @@ -355,19 +354,6 @@ config_parse(master_config *cfg, const uint8 num_config, int argc, char *argv[]) } config_set_mib("shmem-capacity", cfg, shmem_size) {} config_set_gib("shmem-capacity", cfg, shmem_size) {} - config_has_option("trace-shmem-allocs") - { - platform_enable_tracing_shm_allocs(); - } - config_has_option("trace-shmem-frees") - { - platform_enable_tracing_shm_frees(); - } - config_has_option("trace-shmem") - { - // Trace both allocations & frees from shared memory segment. - platform_enable_tracing_shm_ops(); - } // Parameter should only be used with --use-shmem argument. config_has_option("fork-child") { diff --git a/tests/unit/splinter_shmem_test.c b/tests/unit/splinter_shmem_test.c deleted file mode 100644 index 7cb078cd..00000000 --- a/tests/unit/splinter_shmem_test.c +++ /dev/null @@ -1,628 +0,0 @@ -// Copyright 2021-2026 VMware, Inc. -// SPDX-License-Identifier: Apache-2.0 - -/* - * ----------------------------------------------------------------------------- - * splinter_shmem_test.c -- - * - * Exercises the interfaces in SplinterDB shared memory allocation module. - * ----------------------------------------------------------------------------- - */ -#include "platform_threads.h" -#include "platform_units.h" -#include "platform_typed_alloc.h" -#include "platform_log.h" -#include "unit_tests.h" -#include "ctest.h" // This is required for all test-case files. -#include "shmem.h" -#include "splinterdb/splinterdb.h" -#include "splinterdb/default_data_config.h" - -#define TEST_MAX_KEY_SIZE 42 // Just something to get going ... - -// Test these many threads concurrently performing memory allocation. -#define TEST_MAX_THREADS 8 - -/* - * To test heavily concurrent memory allocation from the shared memory, each - * thread will allocate a small fragment described by this structure. We then - * validate that the fragments are not clobbered by concurrent allocations. 
- */ -typedef struct shm_memfrag { - threadid owner; - struct shm_memfrag *next; -} shm_memfrag; - -// Configuration for each worker thread -typedef struct { - splinterdb *splinter; - platform_thread this_thread_id; // OS-generated thread ID - threadid exp_thread_idx; // Splinter-generated expected thread index - shm_memfrag *start; // Start of chain of allocated memfrags -} thread_config; - -// Function prototypes -static void -setup_cfg_for_test(splinterdb_config *out_cfg, data_config *default_data_cfg); - -static void -exec_thread_memalloc(void *arg); - -/* - * Global data declaration macro: - */ -CTEST_DATA(splinter_shmem) -{ - // Declare heap handles to shake out shared memory based allocation. - size_t shmem_capacity; // In bytes - platform_heap_id hid; -}; - -// By default, all test cases will deal with small shared memory segment. -CTEST_SETUP(splinter_shmem) -{ - platform_register_thread(); - data->shmem_capacity = (256 * MiB); // bytes - platform_status rc = platform_heap_create( - platform_get_module_id(), data->shmem_capacity, TRUE, &data->hid); - ASSERT_TRUE(SUCCESS(rc)); - - // Enable tracing all allocs / frees from shmem for this test. - platform_enable_tracing_shm_ops(); -} - -// Tear down the test shared segment. -CTEST_TEARDOWN(splinter_shmem) -{ - platform_heap_destroy(&data->hid); - platform_deregister_thread(); -} - -/* - * Basic test case. This goes through the basic create / destroy - * interfaces to setup a shared memory segment. While at it, run through - * few lookup interfaces to validate sizes. - */ -CTEST2(splinter_shmem, test_create_destroy_shmem) -{ - platform_heap_id hid = NULL; - size_t requested = (512 * MiB); // bytes - size_t heap_capacity = requested; - platform_status rc = - platform_heap_create(platform_get_module_id(), heap_capacity, TRUE, &hid); - ASSERT_TRUE(SUCCESS(rc)); - - // Total size of shared segment must be what requested for. - ASSERT_EQUAL(platform_shmsize(hid), requested); - - // A small chunk at the head is used for shmem_info{} tracking struct - ASSERT_EQUAL(platform_shmbytes_free(hid), - (requested - platform_shm_ctrlblock_size())); - - // Destroy shared memory and release memory. - platform_shmdestroy((shmem_heap **)&hid); - ASSERT_TRUE(hid == NULL); -} - -/* - * --------------------------------------------------------------------------- - * Test that used space and pad-bytes tracking is happening correctly - * when all allocation requests are fully aligned. No pad bytes should - * have been generated for alignment. - * --------------------------------------------------------------------------- - */ -CTEST2(splinter_shmem, test_aligned_allocations) -{ - int keybuf_size = 64; - int msgbuf_size = (2 * keybuf_size); - - // Self-documenting assertion ... to future-proof this area. - ASSERT_EQUAL(keybuf_size, PLATFORM_CACHELINE_SIZE); - - void *next_free = platform_shm_next_free_addr(data->hid); - uint8 *keybuf = TYPED_MANUAL_MALLOC(data->hid, keybuf, keybuf_size); - - // Validate returned memory-ptrs, knowing that no pad bytes were needed. - ASSERT_TRUE((void *)keybuf == next_free); - - next_free = platform_shm_next_free_addr(data->hid); - uint8 *msgbuf = TYPED_MANUAL_MALLOC(data->hid, msgbuf, msgbuf_size); - ASSERT_TRUE((void *)msgbuf == next_free); - - // Sum of requested alloc-sizes == total # of used-bytes - ASSERT_EQUAL((keybuf_size + msgbuf_size), platform_shmbytes_used(data->hid)); - - // Free bytes left in shared segment == (sum of requested alloc sizes, less - // a small bit of the control block.) 
- ASSERT_EQUAL((data->shmem_capacity - - (keybuf_size + msgbuf_size + platform_shm_ctrlblock_size())), - platform_shmbytes_free(data->hid)); -} - -/* - * --------------------------------------------------------------------------- - * Test that used space and pad-bytes tracking is happening correctly - * when some allocation requests are not-fully aligned. Test verifies the - * tracking and computation of pad-bytes, free/used space. - * --------------------------------------------------------------------------- - */ -CTEST2(splinter_shmem, test_unaligned_allocations) -{ - void *next_free = platform_shm_next_free_addr(data->hid); - int keybuf_size = 42; - uint8 *keybuf = TYPED_MANUAL_MALLOC(data->hid, keybuf, keybuf_size); - - int keybuf_pad = - platform_align_bytes_reqd(PLATFORM_CACHELINE_SIZE, keybuf_size); - - // Sum of requested allocation + pad-bytes == total # of used-bytes - ASSERT_EQUAL((keybuf_size + keybuf_pad), platform_shmbytes_used(data->hid)); - - // Should have allocated what was previously determined as next free byte. - ASSERT_TRUE((void *)keybuf == next_free); - - // Validate returned memory-ptrs, knowing that pad bytes were needed. - next_free = platform_shm_next_free_addr(data->hid); - ASSERT_TRUE(next_free == (void *)keybuf + keybuf_size + keybuf_pad); - - int msgbuf_size = 100; - int msgbuf_pad = - platform_align_bytes_reqd(PLATFORM_CACHELINE_SIZE, msgbuf_size); - uint8 *msgbuf = TYPED_MANUAL_MALLOC(data->hid, msgbuf, msgbuf_size); - - // Next allocation will abut prev-allocation + pad-bytes - ASSERT_TRUE((void *)msgbuf == (void *)keybuf + keybuf_size + keybuf_pad); - - // Sum of requested allocation + pad-bytes == total # of used-bytes - ASSERT_EQUAL((keybuf_size + keybuf_pad + msgbuf_size + msgbuf_pad), - platform_shmbytes_used(data->hid)); - - // After accounting for the control block, next-free-addr should be - // exactly past the 2 allocations + their pad-bytes. - next_free = platform_shm_next_free_addr(data->hid); - void *exp_free = ((void *)platform_heap_id_to_shmaddr(data->hid) - + platform_shm_ctrlblock_size() + keybuf_size + keybuf_pad - + msgbuf_size + msgbuf_pad); - ASSERT_TRUE(next_free == exp_free, - "next_free=%p != exp_free=%p\n", - next_free, - exp_free); -} - -/* - * --------------------------------------------------------------------------- - * Test allocation requests that result in an OOM from shared segment. - * Verify limits of memory allocation and handling of free/used bytes. - * These stats are maintained w/o full spinlocks, so will be approximate - * in concurrent scenarios. But for single-threaded allocations, these stats - * should be accurate even when shmem-OOMs occur. - * --------------------------------------------------------------------------- - */ -CTEST2(splinter_shmem, test_allocations_causing_OOMs) -{ - int keybuf_size = 64; - - // Self-documenting assertion ... to future-proof this area. - ASSERT_EQUAL(keybuf_size, PLATFORM_CACHELINE_SIZE); - - void *next_free = platform_shm_next_free_addr(data->hid); - uint8 *keybuf = TYPED_MANUAL_MALLOC(data->hid, keybuf, keybuf_size); - - // Validate returned memory-ptr, knowing that no pad bytes were needed. - ASSERT_TRUE((void *)keybuf == next_free); - - next_free = platform_shm_next_free_addr(data->hid); - - size_t space_left = - (data->shmem_capacity - (keybuf_size + platform_shm_ctrlblock_size())); - - ASSERT_EQUAL(space_left, platform_shmbytes_free(data->hid)); - - platform_error_log("\nNOTE: Test case intentionally triggers out-of-space" - " errors in shared segment. 
'Insufficient memory'" - " error messages below are to be expected.\n"); - - // Note that although we have asked for 1 more byte than free space available - // the allocation interfaces round-up the # bytes for alignment. So the - // requested # of bytes will be a bit larger than free space in the error - // message you will see below. - keybuf_size = (space_left + 1); - uint8 *keybuf_oom = TYPED_MANUAL_MALLOC(data->hid, keybuf_oom, keybuf_size); - ASSERT_TRUE(keybuf_oom == NULL); - - // Free space counter is not touched if allocation fails. - ASSERT_EQUAL(space_left, platform_shmbytes_free(data->hid)); - - // As every memory request is rounded-up for alignment, the space left - // counter should always be an integral multiple of this constant. - ASSERT_EQUAL(0, (space_left % PLATFORM_CACHELINE_SIZE)); - - // If we request exactly what's available, it should succeed. - keybuf_size = space_left; - uint8 *keybuf_no_oom = - TYPED_MANUAL_MALLOC(data->hid, keybuf_no_oom, keybuf_size); - ASSERT_TRUE(keybuf_no_oom != NULL); - CTEST_LOG_INFO("Successfully allocated all remaining %lu bytes " - "from shared segment.\n", - space_left); - - // We should be out of space by now. - ASSERT_EQUAL(0, platform_shmbytes_free(data->hid)); - - // This should fail. - keybuf_size = 1; - keybuf_oom = TYPED_MANUAL_MALLOC(data->hid, keybuf_oom, keybuf_size); - ASSERT_TRUE(keybuf_oom == NULL); - - // Free allocated memory before exiting. - platform_free(data->hid, keybuf); - platform_free(data->hid, keybuf_no_oom); -} - -/* - * --------------------------------------------------------------------------- - * Test allocation interface using platform_get_heap_id() accessor, which - * is supposed to return in-use heap-ID. But, by default, this is NULL. This - * test shows that using this API will [correctly] allocate from shared memory - * once we've created the shared segment, and, therefore, all call-sites in - * the running library to platform_get_heap_id() should return the right - * handle(s) to the shared segment. - * --------------------------------------------------------------------------- - */ -CTEST2(splinter_shmem, test_allocations_using_get_heap_id) -{ - int keybuf_size = 64; - - void *next_free = platform_shm_next_free_addr(data->hid); - uint8 *keybuf = - TYPED_MANUAL_MALLOC(platform_get_heap_id(), keybuf, keybuf_size); - - // Validate returned memory-ptrs, knowing that no pad bytes were needed. - ASSERT_TRUE((void *)keybuf == next_free); -} - -/* - * --------------------------------------------------------------------------- - * Currently 'free' is a no-op; no space is released. Do minimal testing of - * this feature, to ensure that at least the code flow is exectuing correctly. - * --------------------------------------------------------------------------- - */ -CTEST2(splinter_shmem, test_free) -{ - int keybuf_size = 64; - uint8 *keybuf = TYPED_MANUAL_MALLOC(data->hid, keybuf, keybuf_size); - - int msgbuf_size = (2 * keybuf_size); - uint8 *msgbuf = TYPED_MANUAL_MALLOC(data->hid, msgbuf, msgbuf_size); - - size_t mem_used = platform_shmbytes_used(data->hid); - - void *next_free = platform_shm_next_free_addr(data->hid); - - platform_free(data->hid, keybuf); - - // Even though we freed some memory, the next addr-to-allocate is unchanged. 
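   /*
    * (This no-op behavior is specific to the allocator being removed here.
    * The shmalloc()-based allocator added later in this series really does
    * reclaim space: shfree() coalesces the chunk with any free physical
    * neighbors and returns it to a size-class free list, so a free followed
    * by a similar-sized request can be satisfied from the same region.
    * A minimal sketch, with hypothetical sizes, not part of this test:
    *
    *    void *a = shmalloc(shm, 64, 4096);   // alignment 64, size 4096
    *    shfree(shm, a);
    *    void *b = shmalloc(shm, 64, 4096);   // may reuse the space 'a' held
    */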
- ASSERT_TRUE(next_free == platform_shm_next_free_addr(data->hid)); - - // Space used remains unchanged, as free didn't quite return any memory - ASSERT_EQUAL(mem_used, platform_shmbytes_used(data->hid)); -} - -/* - * --------------------------------------------------------------------------- - * test_concurrent_allocs_by_n_threads() - Verify concurrency control - * implemented during shared memory allocation. - * - * Exercise concurrent memory allocations from the shared memory of small - * memory fragments. Each thread will record its ownership on the fragment - * allocated. After all memory is exhausted, we cross-check the chain of - * fragments allocated by each thread to verify that fragment still shows up - * as owned by the allocating thread. - * - * In the rudimentary version of allocation from shared memory, we did not have - * any concurrency control for allocations. So, it's likely that we may have - * been clobbering allocated memory. - * - * This test case does a basic verification of the fixes implemented to avoid - * such races during concurrent memory allocation. - * - * NOTE: This test case will exit immediately upon finding the first fragment - * whose ownership is flawed. That may still leave many other fragments waiting - * to be discovered with flawed ownership. - * --------------------------------------------------------------------------- - */ -CTEST2(splinter_shmem, test_concurrent_allocs_by_n_threads) -{ - splinterdb *kvsb; - splinterdb_config cfg; - data_config default_data_cfg; - - platform_disable_tracing_shm_ops(); - - ZERO_STRUCT(cfg); - ZERO_STRUCT(default_data_cfg); - - default_data_config_init(TEST_MAX_KEY_SIZE, &default_data_cfg); - setup_cfg_for_test(&cfg, &default_data_cfg); - - int rv = splinterdb_create(&cfg, &kvsb); - ASSERT_EQUAL(0, rv); - - // Setup multiple threads for concurrent memory allocation. - platform_thread new_thread; - thread_config thread_cfg[TEST_MAX_THREADS]; - thread_config *thread_cfgp = NULL; - int tctr = 0; - platform_status rc = STATUS_OK; - - ZERO_ARRAY(thread_cfg); - - platform_error_log("\nExecute %d concurrent threads peforming memory" - " allocation till we run out of memory in the shared" - " segment.\n'Insufficient memory' error messages" - " below are to be expected.\n", - TEST_MAX_THREADS); - - // Start-up n-threads, record their expected thread-IDs, which will be - // validated by the thread's execution function below. - for (tctr = 1, thread_cfgp = &thread_cfg[tctr]; - tctr < ARRAY_SIZE(thread_cfg); - tctr++, thread_cfgp++) - { - // These are independent of the new thread's creation. - thread_cfgp->splinter = kvsb; - thread_cfgp->exp_thread_idx = tctr; - - rc = platform_thread_create( - &new_thread, FALSE, exec_thread_memalloc, thread_cfgp, NULL); - ASSERT_TRUE(SUCCESS(rc)); - - thread_cfgp->this_thread_id = new_thread; - } - - // Complete execution of n-threads. Worker fn does the validation. - for (tctr = 1, thread_cfgp = &thread_cfg[tctr]; - tctr < ARRAY_SIZE(thread_cfg); - tctr++, thread_cfgp++) - { - rc = platform_thread_join(&thread_cfgp->this_thread_id); - ASSERT_TRUE(SUCCESS(rc)); - } - - // Now run thru memory fragments allocated by each thread and verify that - // the identity recorded is kosher. If the same memory fragment was allocated - // to multiple threads, we should catch that error here. 
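   /*
    * Rationale for the walk below: each worker stamps its own thread index
    * into every shm_memfrag it receives. If the allocator ever handed
    * overlapping memory to two threads, both could end up writing their
    * index into the same owner field, so at least one per-thread chain
    * would then contain a fragment whose recorded owner differs from the
    * walking thread's index.
    */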
- for (tctr = 1, thread_cfgp = &thread_cfg[tctr]; - tctr < ARRAY_SIZE(thread_cfg); - tctr++, thread_cfgp++) - { - shm_memfrag *this_frag = thread_cfgp->start; - while (this_frag) { - ASSERT_EQUAL(tctr, - this_frag->owner, - "Owner=%lu of memory frag=%p is not expected owner=%lu\n", - this_frag->owner, - this_frag, - tctr); - this_frag = this_frag->next; - } - } - - splinterdb_close(&kvsb); - - platform_enable_tracing_shm_ops(); -} - -/* - * --------------------------------------------------------------------------- - * Test allocation, free and re-allocation of a large fragment should find - * this large fragment in the local tracker. That previously allocated - * fragment should be re-allocated. "Next-free-ptr" should, therefore, remain - * unchanged. - * --------------------------------------------------------------------------- - */ -CTEST2(splinter_shmem, test_realloc_of_large_fragment) -{ - void *next_free = platform_shm_next_free_addr(data->hid); - - // Large fragments are tracked if their size >= this size. - size_t size = (1 * MiB); - uint8 *keybuf = TYPED_MANUAL_MALLOC(data->hid, keybuf, size); - - // Validate that a new large fragment will create a new allocation. - ASSERT_TRUE((void *)keybuf == next_free); - - // Re-establish next-free-ptr after this large allocation. We will use it - // below to assert that this location will not change when we re-use this - // large fragment for reallocation after it's been freed. - next_free = platform_shm_next_free_addr(data->hid); - - // Save this off, as free below will NULL out handle. - uint8 *keybuf_old = keybuf; - - // If you free this fragment and reallocate exactly the same size, - // it should recycle the freed fragment. - platform_free(data->hid, keybuf); - - uint8 *keybuf_new = TYPED_MANUAL_MALLOC(data->hid, keybuf_new, size); - ASSERT_TRUE((keybuf_old == keybuf_new), - "keybuf_old=%p, keybuf_new=%p\n", - keybuf_old, - keybuf_new); - - // We have re-used freed fragment, so the next-free-ptr should be unchanged. - ASSERT_TRUE(next_free == platform_shm_next_free_addr(data->hid)); - - platform_free(data->hid, keybuf_new); -} - -/* - * --------------------------------------------------------------------------- - * Test that free followed by a request of the same size will reallocate the - * recently-freed fragment, avoiding any existing in-use fragments of the same - * size. - * --------------------------------------------------------------------------- - */ -CTEST2(splinter_shmem, test_free_realloc_around_inuse_fragments) -{ - void *next_free = platform_shm_next_free_addr(data->hid); - - // Large fragments are tracked if their size >= this size. - size_t size = (1 * MiB); - uint8 *keybuf1_1MiB = TYPED_MANUAL_MALLOC(data->hid, keybuf1_1MiB, size); - uint8 *keybuf2_1MiB = TYPED_MANUAL_MALLOC(data->hid, keybuf2_1MiB, size); - uint8 *keybuf3_1MiB = TYPED_MANUAL_MALLOC(data->hid, keybuf3_1MiB, size); - - // Re-establish next-free-ptr after this large allocation. We will use it - // below to assert that this location will not change when we re-use a - // large fragment for reallocation after it's been freed. - next_free = platform_shm_next_free_addr(data->hid); - - // Save off fragment handles as free will NULL out ptr. - uint8 *old_keybuf2_1MiB = keybuf2_1MiB; - - // Free the middle fragment that should get reallocated, below. - platform_free(data->hid, keybuf2_1MiB); - - // Re-request (new) fragments of the same size. 
- keybuf2_1MiB = TYPED_MANUAL_MALLOC(data->hid, keybuf2_1MiB, size); - ASSERT_TRUE((keybuf2_1MiB == old_keybuf2_1MiB), - "Expected to satisfy new 1MiB request at %p" - " with old 1MiB fragment ptr at %p\n", - keybuf2_1MiB, - old_keybuf2_1MiB); - - ASSERT_TRUE(next_free == platform_shm_next_free_addr(data->hid)); - - // As large-fragments allocated / freed are tracked in an array, verify - // that we will find the 1st one upon a re-request after a free. - uint8 *old_keybuf1_1MiB = keybuf1_1MiB; - platform_free(data->hid, keybuf1_1MiB); - platform_free(data->hid, keybuf2_1MiB); - - // This re-request should re-allocate the 1st free fragment found. - keybuf2_1MiB = TYPED_MANUAL_MALLOC(data->hid, keybuf2_1MiB, size); - ASSERT_TRUE((keybuf2_1MiB == old_keybuf1_1MiB), - "Expected to satisfy new 1MiB request at %p" - " with old 1MiB fragment ptr at %p\n", - keybuf2_1MiB, - old_keybuf1_1MiB); - - // We've already freed keybuf1_1MiB; can't free a NULL ptr again. - // platform_free(data->hid, keybuf1_1MiB); - - platform_free(data->hid, keybuf2_1MiB); - platform_free(data->hid, keybuf3_1MiB); -} - -/* - * --------------------------------------------------------------------------- - * Finding a free-fragment that's tracked for re-allocation implements a - * very naive linear-search; first-fit algorigthm. This test case verifies - * that: - * - * - Allocate 3 fragments of 1MiB, 5MiB, 2MiB - * - Free them all. - * - Request for a 2MiB fragment. 2nd free fragment (5MiB) will be used. - * - Request for a 5MiB fragment. We will allocate a new fragment. - * - * An improved best-fit search algorithm would have allocated the free 2MiB - * and then satisfy the next request with the free 5 MiB fragment. - * --------------------------------------------------------------------------- - */ -CTEST2(splinter_shmem, test_realloc_of_free_fragments_uses_first_fit) -{ - void *next_free = platform_shm_next_free_addr(data->hid); - - // Large fragments are tracked if their size >= this size. - size_t size = (1 * MiB); - uint8 *keybuf_1MiB = TYPED_MANUAL_MALLOC(data->hid, keybuf_1MiB, size); - - size = (5 * MiB); - uint8 *keybuf_5MiB = TYPED_MANUAL_MALLOC(data->hid, keybuf_5MiB, size); - - size = (2 * MiB); - uint8 *keybuf_2MiB = TYPED_MANUAL_MALLOC(data->hid, keybuf_2MiB, size); - - // Re-establish next-free-ptr after this large allocation. We will use it - // below to assert that this location will not change when we re-use a - // large fragment for reallocation after it's been freed. - next_free = platform_shm_next_free_addr(data->hid); - - // Save off fragment handles as free will NULL out ptr. - uint8 *old_keybuf_1MiB = keybuf_1MiB; - uint8 *old_keybuf_5MiB = keybuf_5MiB; - uint8 *old_keybuf_2MiB = keybuf_2MiB; - - // Order in which we free these fragments does not matter. - platform_free(data->hid, keybuf_1MiB); - platform_free(data->hid, keybuf_2MiB); - platform_free(data->hid, keybuf_5MiB); - - // Re-request (new) fragments in diff size order. - size = (2 * MiB); - keybuf_2MiB = TYPED_MANUAL_MALLOC(data->hid, keybuf_2MiB, size); - ASSERT_TRUE((keybuf_2MiB == old_keybuf_5MiB), - "Expected to satisfy new 2MiB request at %p" - " with old 5MiB fragment ptr at %p\n", - keybuf_2MiB, - old_keybuf_5MiB); - ; - - // We have re-used freed 5MiB fragment; so next-free-ptr should be unchanged. 
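   /*
    * (The linear first-fit scan exercised here is also gone in the
    * shmalloc()-based allocator added later in this series: there, a free
    * chunk of size s sits on the free list for class floor(log2(s)), and a
    * request is served from the smallest non-empty class at or above
    * 1 + floor(log2(request)), so any chunk found is guaranteed to fit;
    * the unused tail is split off and re-freed. Worked out for the sizes in
    * this test: a freed 5 MiB chunk lands in class 22, and a 2 MiB request
    * searches classes 22 and up, so it can be carved out of that 5 MiB
    * chunk with the roughly 3 MiB remainder returned to the free lists.)
    */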
- ASSERT_TRUE(next_free == platform_shm_next_free_addr(data->hid)); - - size = (5 * MiB); - keybuf_5MiB = TYPED_MANUAL_MALLOC(data->hid, keybuf_5MiB, size); - - // We allocated a new fragment at next-free-ptr - ASSERT_TRUE(keybuf_5MiB != old_keybuf_1MiB); - ASSERT_TRUE(keybuf_5MiB != old_keybuf_2MiB); - ASSERT_TRUE(keybuf_5MiB == next_free); - - platform_free(data->hid, keybuf_2MiB); - platform_free(data->hid, keybuf_5MiB); -} - -static void -setup_cfg_for_test(splinterdb_config *out_cfg, data_config *default_data_cfg) -{ - *out_cfg = (splinterdb_config){.filename = TEST_DB_NAME, - .cache_size = 512 * Mega, - .disk_size = 2 * Giga, - .use_shmem = TRUE, - .data_cfg = default_data_cfg}; -} - -/* - * exec_thread_memalloc() - Worker fn for each thread to do concurrent memory - * allocation from the shared segment. - */ -static void -exec_thread_memalloc(void *arg) -{ - thread_config *thread_cfg = (thread_config *)arg; - - // Allocate a new memory fragment and connect head to output variable for - // thread - shm_memfrag **fragpp = &thread_cfg->start; - shm_memfrag *new_frag = NULL; - - uint64 nallocs = 0; - threadid this_thread_idx = thread_cfg->exp_thread_idx; - - // Keep allocating fragments till we run out of memory. - // Build a linked list of memory fragments for this thread. - while ((new_frag = TYPED_ZALLOC(platform_get_heap_id(), new_frag)) != NULL) { - *fragpp = new_frag; - new_frag->owner = this_thread_idx; - fragpp = &new_frag->next; - nallocs++; - } - - platform_default_log( - "Thread-ID=%lu allocated %lu memory fragments of %lu bytes each.\n", - this_thread_idx, - nallocs, - sizeof(*new_frag)); -} diff --git a/tests/unit/splinterdb_forked_child_test.c b/tests/unit/splinterdb_forked_child_test.c index f5776155..bdf44938 100644 --- a/tests/unit/splinterdb_forked_child_test.c +++ b/tests/unit/splinterdb_forked_child_test.c @@ -17,7 +17,6 @@ #include "splinterdb/default_data_config.h" #include "splinterdb/splinterdb.h" -#include "shmem.h" #include "config.h" #include "splinterdb_tests_private.h" #include "test_common.h" @@ -131,42 +130,6 @@ CTEST2(splinterdb_forked_child, test_data_structures_handles) } else if (pid == 0) { platform_register_thread(); - // Verify that child process sees the same handle to a running Splinter - // as seen by the parent. (We cross-check using the copy saved off in - // shared memory.) - platform_heap_id spl_heap_id = splinterdb_get_heap_id(spl_handle); - - void *child_splinterdb_handle = - platform_heap_get_splinterdb_handle(spl_heap_id); - - ASSERT_TRUE((void *)spl_handle == child_splinterdb_handle, - "spl_handle=%p != child_splinterdb_handle=%p\n", - spl_handle, - child_splinterdb_handle); - - // Verify that the splinter handle and handles to other sub-systems are - // all valid addresses allocated from the shared segment setup by the main - // process. 
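   /*
    * The shmallocator API that replaces this (shmalloc.h later in this
    * series) exposes no equivalent of platform_valid_addr_in_heap(), so
    * these checks have no direct replacement. If such a check were wanted
    * again, a crude range test could be built from shmallocator_size();
    * the helper below is purely illustrative and is not part of any patch
    * in this series:
    *
    *    static bool addr_in_shm(shmallocator *shm, const void *p)
    *    {
    *       const char *base = (const char *)shm;
    *       return base <= (const char *)p
    *              && (const char *)p < base + shmallocator_size(shm);
    *    }
    */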
- - ASSERT_TRUE(platform_valid_addr_in_heap( - spl_heap_id, splinterdb_get_task_system_handle(spl_handle))); - - ASSERT_TRUE( - platform_valid_addr_in_heap(splinterdb_get_heap_id(spl_handle), - splinterdb_get_io_handle(spl_handle))); - - ASSERT_TRUE(platform_valid_addr_in_heap( - spl_heap_id, splinterdb_get_allocator_handle(spl_handle))); - - ASSERT_TRUE(platform_valid_addr_in_heap( - spl_heap_id, splinterdb_get_cache_handle(spl_handle))); - - ASSERT_TRUE(platform_valid_addr_in_heap( - spl_heap_id, splinterdb_get_trunk_handle(spl_handle))); - - ASSERT_TRUE(platform_valid_addr_in_heap( - spl_heap_id, splinterdb_get_memtable_context_handle(spl_handle))); - // After registering w/Splinter, child process' tid will change. ASSERT_EQUAL(1, platform_get_tid()); diff --git a/tests/unit/splinterdb_heap_id_mgmt_test.c b/tests/unit/splinterdb_heap_id_mgmt_test.c index 2a8bf0b1..620287b9 100644 --- a/tests/unit/splinterdb_heap_id_mgmt_test.c +++ b/tests/unit/splinterdb_heap_id_mgmt_test.c @@ -19,146 +19,272 @@ * task_system_test.c etc. * ----------------------------------------------------------------------------- */ -#include "platform_units.h" -#include "splinterdb/splinterdb.h" -#include "splinterdb/default_data_config.h" -#include "config.h" -#include "unit_tests.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "shmalloc.h" +#include "ctest.h" -#define TEST_MAX_KEY_SIZE 20 +#define IS_ALIGNED(ptr, alignment) \ + ((((uintptr_t)(ptr)) & ((uintptr_t)(alignment) - 1)) == 0) + +uint64_t log_stress_iterations = 100000; +uint64_t log_max_allocations = 1000000; +uint64_t log_max_allocation_size = 22; /* 4 MB */ +uint64_t fast_mode = 0; + +typedef struct allocation { + pthread_spinlock_t lock; + uint8_t *p; + size_t size; + size_t alignment; + int c; +} allocation; + +allocation *allocs = NULL; + +uint64_t +random_size(unsigned int *seed) +{ + uint64_t oom = rand_r(seed) % (log_max_allocation_size + 2); + if (oom == log_max_allocation_size) { + return 0; + } + if (oom == log_max_allocation_size + 1) { + return 1ULL << log_max_allocation_size; + } + return (1ULL << oom) + (rand_r(seed) % (1ULL << oom)); +} static void -create_default_cfg(splinterdb_config *out_kvs_cfg, - data_config *default_data_cfg, - bool use_shmem); +validate_data(uint8_t *ptr, size_t old_size, size_t new_size, int c) +{ + if (fast_mode) { + if (new_size < old_size) { + ASSERT_EQUAL(ptr[0], c); + } else if (old_size > 0) { + ASSERT_EQUAL(ptr[0], c); + ASSERT_EQUAL(ptr[old_size - 1], c); + } + } else { + uint64_t size = old_size < new_size ? 
old_size : new_size; + for (size_t j = 0; j < size; j++) { + ASSERT_EQUAL(ptr[j], c); + } + } +} /* - * Global data declaration macro: + * Stress test: Random allocations and deallocations */ -CTEST_DATA(splinterdb_heap_id_mgmt) -{ - data_config default_data_cfg; - platform_heap_id hid; - splinterdb_config kvs_cfg; - splinterdb *kvs; - size_t shmem_size; // for assert validation -}; - -CTEST_SETUP(splinterdb_heap_id_mgmt) +static void * +thread_worker(void *arg) { - platform_status rc = STATUS_OK; + shmallocator *shm = (shmallocator *)arg; + + unsigned int seed = (unsigned int)(time(NULL) ^ pthread_self()); - platform_register_thread(); + uint64_t failed_allocations = 0; - bool use_shmem = config_parse_use_shmem(Ctest_argc, (char **)Ctest_argv); + for (int i = 0; i < (1ULL << log_stress_iterations); i++) { + /* Randomly decide: allocate or deallocate */ + int idx = rand_r(&seed) % (1ULL << log_max_allocations); + while (pthread_spin_trylock(&allocs[idx].lock) != 0) { + idx = rand_r(&seed) % (1ULL << log_max_allocations); + } - uint64 heap_capacity = (256 * MiB); // small heap is sufficient. - default_data_config_init(TEST_MAX_KEY_SIZE, &data->default_data_cfg); + if (allocs[idx].p == NULL || rand_r(&seed) % 2) { + ASSERT_TRUE(allocs[idx].p != NULL || allocs[idx].size == 0); - memset(&data->kvs_cfg, 0, sizeof(data->kvs_cfg)); - create_default_cfg(&data->kvs_cfg, &data->default_data_cfg, use_shmem); + /* Allocate */ + size_t size = random_size(&seed); + size_t alignment; - platform_module_id mid = platform_get_module_id(); + uint8_t *ptr; + if (allocs[idx].p == NULL && rand_r(&seed) % 2) { + alignment = 1ULL << (rand_r(&seed) % 13); + ptr = shmalloc(shm, alignment, size); + } else { + alignment = 1; + ptr = shrealloc(shm, allocs[idx].p, size); + } - rc = platform_heap_create( - mid, heap_capacity, use_shmem, &data->kvs_cfg.heap_id); - platform_assert_status_ok(rc); + if (allocs[idx].p != NULL && ptr != NULL) { + validate_data(ptr, allocs[idx].size, size, allocs[idx].c); + } - // Remember the heap-ID created here; used to cross-check in assertions - data->hid = data->kvs_cfg.heap_id; - if (data->kvs_cfg.use_shmem) { - data->shmem_size = platform_shmsize(data->hid); + if (ptr != NULL || size == 0) { + if (allocs[idx].p == NULL) { + allocs[idx].alignment = alignment; + } + allocs[idx].p = ptr; + allocs[idx].size = size; + + + ASSERT_TRUE(IS_ALIGNED(ptr, allocs[idx].alignment)); + + /* Fill with pattern based on thread number */ + allocs[idx].c = (unsigned char)(seed & 0xFF); + if (fast_mode && size > 0) { + ptr[0] = allocs[idx].c; + ptr[size - 1] = allocs[idx].c; + } else { + memset(ptr, allocs[idx].c, size); + } + } else { + failed_allocations++; + } + } else { + /* Deallocate a random allocation */ + uint8_t *ptr = allocs[idx].p; + size_t size = allocs[idx].size; + + validate_data(ptr, size, size, allocs[idx].c); + + shfree(shm, ptr); + + allocs[idx].p = NULL; + allocs[idx].size = 0; + } + + pthread_spin_unlock(&allocs[idx].lock); } -} -CTEST_TEARDOWN(splinterdb_heap_id_mgmt) -{ - platform_heap_destroy(&data->kvs_cfg.heap_id); - platform_deregister_thread(); + /* Free all remaining allocations */ + for (int i = 0; i < (1ULL << log_max_allocations); i++) { + if (pthread_spin_trylock(&allocs[i].lock) == 0) { + if (allocs[i].p != NULL) { + shfree(shm, allocs[i].p); + allocs[i].p = NULL; + allocs[i].size = 0; + } + pthread_spin_unlock(&allocs[i].lock); + } + } + + printf("Failed allocations: %lu\n", failed_allocations); + return NULL; } -/* - * Test create and close interfaces. 
- */ -CTEST2(splinterdb_heap_id_mgmt, test_create_close) + +static void +stress_test(uint64_t lg_stress_iterations, + uint64_t lg_max_allocations, + uint64_t lg_max_allocation_size, + uint64_t fst_mode, + uint64_t log_shared_region_size, + uint64_t num_processes, + uint64_t num_threads_per_process) { - int rc = splinterdb_create(&data->kvs_cfg, &data->kvs); - ASSERT_EQUAL(0, rc); + log_stress_iterations = lg_stress_iterations; + log_max_allocations = lg_max_allocations; + log_max_allocation_size = lg_max_allocation_size; + fast_mode = fst_mode; - ASSERT_EQUAL((size_t)data->hid, (size_t)data->kvs_cfg.heap_id); + allocs = mmap(NULL, + sizeof(allocation) * (1ULL << log_max_allocations), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + -1, + 0); + if (allocs == MAP_FAILED) { + perror("mmap failed"); + exit(1); + } - splinterdb_close(&data->kvs); + for (int i = 0; i < (1ULL << log_max_allocations); i++) { + pthread_spin_init(&allocs[i].lock, PTHREAD_PROCESS_SHARED); + } - // As we created the heap externally, in this test, close should not - // have destroyed the heap. Exercise interfaces which will seg-fault - // otherwise. - if (data->kvs_cfg.use_shmem) { - ASSERT_EQUAL(data->shmem_size, platform_shmsize(data->hid)); + shmallocator *shm = mmap(NULL, + (1ULL << log_shared_region_size), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + -1, + 0); + if (shm == MAP_FAILED) { + perror("mmap failed"); + exit(1); } -} -/* - * Test create, close followed by re-open of SplinterDB. Re-use the previously - * setup splinterdb_config{} struct, which should have a handle to the platform - * heap. The same (shared memory) heap should be re-used. - */ -CTEST2(splinterdb_heap_id_mgmt, test_create_close_and_reopen) -{ - int rc = splinterdb_create(&data->kvs_cfg, &data->kvs); - ASSERT_EQUAL(0, rc); - splinterdb_close(&data->kvs); + if (shmallocator_init( + shm, (1ULL << log_max_allocations), (1ULL << log_shared_region_size)) + != 0) + { + perror("shmallocator_init failed"); + exit(1); + } - rc = splinterdb_open(&data->kvs_cfg, &data->kvs); + ASSERT_TRUE(num_processes <= 64); - // The heap ID should not change. - ASSERT_EQUAL((size_t)data->hid, (size_t)data->kvs_cfg.heap_id); + pid_t pids[64]; + memset(pids, 0, sizeof(pids)); - splinterdb_close(&data->kvs); - // Shared memory heap should still be around and accessible. - if (data->kvs_cfg.use_shmem) { - ASSERT_EQUAL(data->shmem_size, platform_shmsize(data->hid)); + for (int i = 0; i < num_processes - 1; i++) { + pid_t pid = fork(); + ASSERT_TRUE(pid >= 0); + if (pid == 0) { + printf("Child %d created\n", getpid()); + break; + } else { + pids[i] = pid; + } } -} -/* - * Test error path while creating SplinterDB which can trip-up due to a bad - * configuration. Induce such an error and verify that platform heap is not - * messed-up due to backout code. - */ -CTEST2(splinterdb_heap_id_mgmt, test_failed_init_config) -{ - size_t save_cache_size = data->kvs_cfg.cache_size; - data->kvs_cfg.cache_size = 0; - int rc = splinterdb_create(&data->kvs_cfg, &data->kvs); - ASSERT_NOT_EQUAL(0, rc); + ASSERT_TRUE(num_threads_per_process <= 64); + pthread_t threads[64]; + for (int i = 0; i < num_threads_per_process - 1; i++) { + ASSERT_TRUE(pthread_create(&threads[i], NULL, thread_worker, shm) == 0); + } - data->kvs_cfg.cache_size = save_cache_size; + thread_worker(shm); - // The heap ID should not change. 
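   /*
    * Shape of the stress test introduced by this hunk: stress_test() forks
    * num_processes - 1 children, and every process (parent included) runs
    * num_threads_per_process workers via thread_worker(), all hammering a
    * single MAP_SHARED shmallocator region. The shared allocs[] slot table
    * keeps them from colliding: each slot carries a PTHREAD_PROCESS_SHARED
    * spinlock, and a worker only touches a slot it has trylocked, so no two
    * workers allocate, validate, or free the same pointer concurrently even
    * though the allocator itself is exercised from many processes at once.
    */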
- ASSERT_EQUAL((size_t)data->hid, (size_t)data->kvs_cfg.heap_id); + for (int i = 0; i < num_threads_per_process - 1; i++) { + ASSERT_TRUE(pthread_join(threads[i], NULL) == 0); + } - // Shared memory heap should still be around and accessible. - if (data->kvs_cfg.use_shmem) { - ASSERT_EQUAL(data->shmem_size, platform_shmsize(data->hid)); + // If we are the parent process, wait for all children to complete + if (num_processes > 1) { + if (pids[num_processes - 2] != 0) { + for (int i = 0; i < num_processes - 1; i++) { + int child_pid; + int status; + child_pid = wait(&status); + ASSERT_TRUE(child_pid != -1); + ASSERT_EQUAL(WIFEXITED(status), 1); + ASSERT_EQUAL(WEXITSTATUS(status), 0); + } + shmallocator_deinit(shm); + } + } + + + munmap(allocs, sizeof(allocation) * (1ULL << log_max_allocations)); + munmap(shm, (1ULL << log_shared_region_size)); + + if (num_processes > 1 && pids[num_processes - 2] == 0) { + printf("Child %d completed\n", getpid()); + exit(0); } } -/* - * Helper routine to create a valid Splinter configuration using default - * page- and extent-size. Shared-memory usage is OFF by default. - */ -static void -create_default_cfg(splinterdb_config *out_kvs_cfg, - data_config *default_data_cfg, - bool use_shmem) + +CTEST_DATA(shmalloc){}; + +CTEST_SETUP(shmalloc) {} + +CTEST_TEARDOWN(shmalloc) {} + +CTEST2(shmalloc, stress_test) { - *out_kvs_cfg = - (splinterdb_config){.filename = TEST_DB_NAME, - .cache_size = 64 * Mega, - .disk_size = 127 * Mega, - .page_size = TEST_CONFIG_DEFAULT_PAGE_SIZE, - .extent_size = TEST_CONFIG_DEFAULT_EXTENT_SIZE, - .use_shmem = use_shmem, - .data_cfg = default_data_cfg}; + stress_test(18, 16, 22, 0, 32, 4, 4); } From 336c15ad87aa2985cbb44bd108d71b782bbe62a3 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Tue, 20 Jan 2026 11:07:48 -0800 Subject: [PATCH 2/9] change splinter_perf range lookup counts --- tests/functional/splinter_test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/functional/splinter_test.c b/tests/functional/splinter_test.c index 4b5ebb04..656bd9ee 100644 --- a/tests/functional/splinter_test.c +++ b/tests/functional/splinter_test.c @@ -1410,10 +1410,10 @@ test_splinter_perf(system_config *cfg, // clang-format off // Define a set of parameters to drive trunk range query perf trunk_range_perf_params perf_ranges[] = { - // number min max - {"Small range" , 128 , 1 , 100 }, + // number min max + {"Small range" , 2048 , 1 , 100 }, {"Medium range" , 512 , 512 , 1024 }, - {"Large range" , 2048 , (131072 - 16384) , 131072 } + {"Large range" , 128 , (131072 - 16384) , 131072 } }; // clang-format on From e5c2f470d4ec45641d27c3fd196851913b76dfab Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Tue, 20 Jan 2026 11:24:56 -0800 Subject: [PATCH 3/9] add shmalloc.[hc] Signed-off-by: Rob Johnson --- src/platform_linux/shmalloc.c | 613 ++++++++++++++++++++++++++++++++++ src/platform_linux/shmalloc.h | 26 ++ 2 files changed, 639 insertions(+) create mode 100644 src/platform_linux/shmalloc.c create mode 100644 src/platform_linux/shmalloc.h diff --git a/src/platform_linux/shmalloc.c b/src/platform_linux/shmalloc.c new file mode 100644 index 00000000..59ce70ed --- /dev/null +++ b/src/platform_linux/shmalloc.c @@ -0,0 +1,613 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "platform_units.h" +#include "shmalloc.h" + +#define IS_ALIGNED(ptr, alignment) \ + ((((uintptr_t)(ptr)) & ((uintptr_t)(alignment) - 1)) == 0) +#define ALIGN_UP(value, alignment) \ + (((uintptr_t)(value) + 
(uintptr_t)(alignment) - 1) \ + & ~((uintptr_t)(alignment) - 1)) +#define ALIGN_DOWN(value, alignment) \ + (((uintptr_t)(value)) & ~((uintptr_t)(alignment) - 1)) + +typedef enum chunk_state { + CHUNK_STATE_UNUSED = 0, + CHUNK_STATE_FREE = 1, + CHUNK_STATE_ALLOCATED = 2, +} chunk_state; + +typedef struct chunk { + uint64_t size : 55; + uint64_t requested_alignment : 6; + uint64_t hash_index : 1; + chunk_state state : 2; + void *ptr; + struct chunk *phys_pred; + struct chunk *size_pred; + struct chunk *size_next; +} chunk; + +#define NUM_SIZE_CLASSES (48) + +/* The chunk table uses power-of-two-choice hashing. The fingerprint array is + * used to speed up lookups.*/ + +#define CHUNK_TABLE_BUCKET_SIZE (64) +_Static_assert(CHUNK_TABLE_BUCKET_SIZE <= 256, "CHUNK_TABLE_BUCKET_SIZE < 256"); + +typedef struct chunk_table_bucket { + uint8_t occupancy; + uint8_t fingerprints[CHUNK_TABLE_BUCKET_SIZE]; + chunk chunks[CHUNK_TABLE_BUCKET_SIZE]; +} chunk_table_bucket; + +_Static_assert(CHUNK_TABLE_BUCKET_SIZE + < (1ULL << 8 * sizeof((chunk_table_bucket *)NULL)->occupancy), + "CHUNK_TABLE_BUCKET_SIZE is too large for the occupancy field"); + +static const uint64_t chunk_table_hash_seeds[2] = {0x1234567890abcdefULL, + 0xabcdef1234567890ULL}; + +typedef struct shmallocator { + pthread_spinlock_t lock; + void *end; + void *max_allocated_end; + uint64_t chunk_table_num_buckets; + chunk *free[NUM_SIZE_CLASSES]; + chunk_table_bucket chunk_table[]; +} shmallocator; + +typedef struct hash_info { + uint64_t bucket_idx; + uint8_t fingerprint; +} hash_info; + +static hash_info +pointer_hash(shmallocator *shm, void *ptr, uint64_t seed) +{ + hash_info info; + uint64_t hash = XXH64(&ptr, sizeof(void *), chunk_table_hash_seeds[seed]); + info.bucket_idx = (hash >> 8) % shm->chunk_table_num_buckets; + info.fingerprint = (uint8_t)(hash & 0xFF); + if (info.fingerprint == 0) { + info.fingerprint = 1; + } + return info; +} + +static chunk * +chunk_table_alloc_entry(shmallocator *shm, void *ptr) +{ + uint64_t chosen_bucket_idx = 0; + hash_info chosen_bucket_info; + uint8_t chosen_bucket_occupancy = UINT8_MAX; + for (uint64_t i = 0; i < 2; i++) { + hash_info info = pointer_hash(shm, ptr, i); + chunk_table_bucket *bucket = &shm->chunk_table[info.bucket_idx]; + if (bucket->occupancy < chosen_bucket_occupancy) { + chosen_bucket_idx = i; + chosen_bucket_info = info; + chosen_bucket_occupancy = bucket->occupancy; + } + } + + if (chosen_bucket_occupancy == CHUNK_TABLE_BUCKET_SIZE) { + return NULL; + } + + chunk_table_bucket *bucket = + &shm->chunk_table[chosen_bucket_info.bucket_idx]; + assert(bucket->occupancy < CHUNK_TABLE_BUCKET_SIZE); + for (uint64_t j = 0; j < CHUNK_TABLE_BUCKET_SIZE; j++) { + if (bucket->fingerprints[j] == 0) { + assert(bucket->chunks[j].state == CHUNK_STATE_UNUSED); + bucket->fingerprints[j] = chosen_bucket_info.fingerprint; + bucket->chunks[j].hash_index = chosen_bucket_idx; + bucket->occupancy++; + return &bucket->chunks[j]; + } + } + assert(0); +} + +static void +chunk_table_free_entry(shmallocator *shm, chunk *chnk) +{ + hash_info info = pointer_hash(shm, chnk->ptr, chnk->hash_index); + chunk_table_bucket *bucket = &shm->chunk_table[info.bucket_idx]; + assert(&bucket->chunks[0] <= chnk + && chnk < &bucket->chunks[CHUNK_TABLE_BUCKET_SIZE]); + uint64_t j = chnk - &bucket->chunks[0]; + assert(bucket->fingerprints[j] == info.fingerprint); + memset(chnk, 0, sizeof(chunk)); + chnk->state = CHUNK_STATE_UNUSED; + bucket->fingerprints[j] = 0; + bucket->occupancy--; +} + +static chunk * 
+chunk_table_get_entry(shmallocator *shm, void *ptr) +{ + for (uint64_t i = 0; i < 2; i++) { + hash_info info = pointer_hash(shm, ptr, i); + chunk_table_bucket *bucket = &shm->chunk_table[info.bucket_idx]; + for (uint64_t j = 0; j < CHUNK_TABLE_BUCKET_SIZE; j++) { + if (bucket->fingerprints[j] == info.fingerprint + && bucket->chunks[j].ptr == ptr) + { + return &bucket->chunks[j]; + } + } + } + return NULL; +} + +static chunk * +chunk_get_physical_successor(shmallocator *shm, chunk *chnk) +{ + if (chnk->ptr + chnk->size == shm->end) { + return NULL; + } + return chunk_table_get_entry(shm, chnk->ptr + chnk->size); +} + +static uint64_t +log_size(uint64_t size) +{ + assert(size > 0); + return 63 - __builtin_clzll(size); +} + +static void +free_list_add(shmallocator *shm, chunk *chnk) +{ + uint64_t lsize = log_size(chnk->size); + assert(lsize < NUM_SIZE_CLASSES); + assert(chnk->state == CHUNK_STATE_FREE); + assert(chnk->size_pred == NULL); + assert(chnk->size_next == NULL); + chnk->size_pred = NULL; + chnk->size_next = shm->free[lsize]; + if (chnk->size_next) { + chnk->size_next->size_pred = chnk; + } + shm->free[lsize] = chnk; +} + +static chunk * +free_list_alloc(shmallocator *shm, size_t size) +{ + uint64_t lsize = 1 + log_size(size); + assert(lsize < NUM_SIZE_CLASSES); + while (lsize < NUM_SIZE_CLASSES && !shm->free[lsize]) { + lsize++; + } + if (lsize >= NUM_SIZE_CLASSES) { + return NULL; + } + chunk *chnk = shm->free[lsize]; + shm->free[lsize] = chnk->size_next; + if (chnk->size_next) { + chnk->size_next->size_pred = NULL; + } + chnk->size_pred = NULL; + chnk->size_next = NULL; + return chnk; +} + +static void +free_list_remove(shmallocator *shm, chunk *chnk) +{ + assert(chnk->state == CHUNK_STATE_FREE); + if (chnk->size_pred) { + chnk->size_pred->size_next = chnk->size_next; + } else { + uint64_t lsize = log_size(chnk->size); + assert(lsize < NUM_SIZE_CLASSES); + assert(shm->free[lsize] == chnk); + shm->free[lsize] = chnk->size_next; + } + if (chnk->size_next) { + chnk->size_next->size_pred = chnk->size_pred; + } + chnk->size_pred = NULL; + chnk->size_next = NULL; +} + +static chunk * +chunk_remove_tail(shmallocator *shm, chunk *chnk, size_t size) +{ + assert(chnk->state == CHUNK_STATE_ALLOCATED); + + chunk *psucc = chunk_get_physical_successor(shm, chnk); + + chunk *new_chunk = chunk_table_alloc_entry(shm, chnk->ptr + size); + if (!new_chunk) { + return NULL; + } + + new_chunk->size = chnk->size - size; + new_chunk->state = CHUNK_STATE_FREE; + new_chunk->ptr = chnk->ptr + size; + new_chunk->phys_pred = chnk; + + if (psucc) { + psucc->phys_pred = new_chunk; + } + + chnk->size = size; + return new_chunk; +} + +static void +chunks_merge(shmallocator *shm, chunk *chnk, chunk *psucc) +{ + assert(chnk->state == CHUNK_STATE_FREE); + assert(psucc->state == CHUNK_STATE_FREE); + + chunk *ppsucc = chunk_get_physical_successor(shm, psucc); + + chnk->size += psucc->size; + chunk_table_free_entry(shm, psucc); + if (ppsucc) { + ppsucc->phys_pred = chnk; + } +} + +static void +free_list_is_free(shmallocator *shm) +{ + uint64_t free_list_count = 0; + for (uint64_t i = 0; i < NUM_SIZE_CLASSES; i++) { + chunk *chnk = shm->free[i]; + while (chnk) { + assert(chnk->state == CHUNK_STATE_FREE); + free_list_count++; + chnk = chnk->size_next; + } + } + + uint64_t chunk_table_count = 0; + for (uint64_t i = 0; i < shm->chunk_table_num_buckets; i++) { + chunk_table_bucket *bucket = &shm->chunk_table[i]; + for (uint64_t j = 0; j < CHUNK_TABLE_BUCKET_SIZE; j++) { + chunk *chnk = &bucket->chunks[j]; + if (chnk->state 
== CHUNK_STATE_FREE) { + chunk_table_count++; + } + } + + assert(free_list_count == chunk_table_count); + } +} + +static void +inuse_chunks_are_gettable(shmallocator *shm) +{ + for (uint64_t i = 0; i < shm->chunk_table_num_buckets; i++) { + chunk_table_bucket *bucket = &shm->chunk_table[i]; + for (uint64_t j = 0; j < CHUNK_TABLE_BUCKET_SIZE; j++) { + chunk *chnk = &bucket->chunks[j]; + if (chnk->state == CHUNK_STATE_ALLOCATED + || chnk->state == CHUNK_STATE_FREE) + { + assert(chnk->ptr != NULL); + assert(chunk_table_get_entry(shm, chnk->ptr) == chnk); + } + } + } +} + +static void +available_chunks_are_zeroed(shmallocator *shm) +{ + for (uint64_t i = 0; i < shm->chunk_table_num_buckets; i++) { + chunk_table_bucket *bucket = &shm->chunk_table[i]; + for (uint64_t j = 0; j < CHUNK_TABLE_BUCKET_SIZE; j++) { + chunk *chnk = &bucket->chunks[j]; + if (chnk->state == CHUNK_STATE_UNUSED) { + assert(chnk->ptr == NULL); + assert(chnk->size == 0); + assert(chnk->phys_pred == NULL); + assert(chnk->size_pred == NULL); + assert(chnk->size_next == NULL); + } + } + } +} + +static void +allocated_chunks_have_null_size_pred_and_size_next(shmallocator *shm) +{ + for (uint64_t i = 0; i < shm->chunk_table_num_buckets; i++) { + chunk_table_bucket *bucket = &shm->chunk_table[i]; + for (uint64_t j = 0; j < CHUNK_TABLE_BUCKET_SIZE; j++) { + chunk *chnk = &bucket->chunks[j]; + if (chnk->state == CHUNK_STATE_ALLOCATED) { + assert(chnk->size_pred == NULL); + assert(chnk->size_next == NULL); + } + } + } +} + +static void +physical_ordering_is_consistent(shmallocator *shm) +{ + void *ptr = &shm->chunk_table[shm->chunk_table_num_buckets]; + chunk *chnk = chunk_table_get_entry(shm, ptr); + assert(chnk); + while (chnk) { + assert(chnk->ptr == ptr); + ptr = chnk->ptr + chnk->size; + chunk *next_chnk = chunk_get_physical_successor(shm, chnk); + if (next_chnk) { + assert(next_chnk->phys_pred == chnk); + } + chnk = next_chnk; + } +} + +static void +no_adjacent_free_chunks(shmallocator *shm) +{ + void *ptr = &shm->chunk_table[shm->chunk_table_num_buckets]; + chunk *chnk = chunk_table_get_entry(shm, ptr); + assert(chnk); + while (chnk) { + assert(chnk->state == CHUNK_STATE_FREE + || chnk->state == CHUNK_STATE_ALLOCATED); + assert(chnk->ptr == ptr); + ptr = chnk->ptr + chnk->size; + chunk *next_chnk = chunk_get_physical_successor(shm, chnk); + if (next_chnk) { + assert(next_chnk->state == CHUNK_STATE_ALLOCATED + || chnk->state == CHUNK_STATE_ALLOCATED); + } + + chnk = next_chnk; + } +} + +static void +no_overlapping_chunks(shmallocator *shm) +{ + uint64_t physical_count = 0; + void *ptr = &shm->chunk_table[shm->chunk_table_num_buckets]; + chunk *chnk = chunk_table_get_entry(shm, ptr); + assert(chnk); + while (chnk) { + physical_count++; + ptr = chnk->ptr + chnk->size; + chnk = chunk_get_physical_successor(shm, chnk); + } + + uint64_t chunk_table_count = 0; + for (uint64_t i = 0; i < shm->chunk_table_num_buckets; i++) { + chunk_table_bucket *bucket = &shm->chunk_table[i]; + for (uint64_t j = 0; j < CHUNK_TABLE_BUCKET_SIZE; j++) { + chunk *chnk = &bucket->chunks[j]; + if (chnk->state == CHUNK_STATE_ALLOCATED + || chnk->state == CHUNK_STATE_FREE) + { + chunk_table_count++; + } + } + } + assert(physical_count == chunk_table_count); +} + +static void +all_invariants(shmallocator *shm) +{ + return; + free_list_is_free(shm); + inuse_chunks_are_gettable(shm); + available_chunks_are_zeroed(shm); + allocated_chunks_have_null_size_pred_and_size_next(shm); + physical_ordering_is_consistent(shm); + no_adjacent_free_chunks(shm); + 
no_overlapping_chunks(shm); +} + +int +shmallocator_init(shmallocator *shm, uint64_t max_allocations, size_t size) +{ + /* There chunk table holds all the free and allocated chunks. There may be + * up to one free chunk between each pair of allocated chunks. So we need to + * double the number of slots in the chunk table. Furthermore, we need to + * allocate a few extra slots to be able to resolve collisions. So, all + * total, we allocate 2.4x as many slots as the max number of allocations.*/ + uint64_t chunk_table_num_buckets = + (24 * max_allocations / 10 + CHUNK_TABLE_BUCKET_SIZE - 1) + / CHUNK_TABLE_BUCKET_SIZE; + + + uint64_t shmsize = sizeof(shmallocator) + + chunk_table_num_buckets * sizeof(chunk_table_bucket); + if (size < shmsize) { + return -1; + } + + memset(shm, 0, shmsize); + shm->chunk_table_num_buckets = chunk_table_num_buckets; + shm->end = ((void *)shm) + size; + + void *ptr = &shm->chunk_table[chunk_table_num_buckets]; + chunk *chnk = chunk_table_alloc_entry(shm, ptr); + chnk->size = size - shmsize; + chnk->state = CHUNK_STATE_FREE; + chnk->ptr = ptr; + chnk->phys_pred = NULL; + chnk->size_pred = NULL; + chnk->size_next = NULL; + free_list_add(shm, chnk); + pthread_spin_init(&shm->lock, PTHREAD_PROCESS_SHARED); + all_invariants(shm); + return 0; +} + +void +shmallocator_deinit(shmallocator *shm) +{ + // printf("shmallocator_deinit: max needed MiBs: %lu (%lu%%)\n", + // B_TO_MiB(shm->max_allocated_end - (void *)shm), + // (uint64_t)(shm->max_allocated_end - (void *)shm) * 100 + // / shmallocator_size(shm)); + pthread_spin_destroy(&shm->lock); +} + +size_t +shmallocator_size(shmallocator *shm) +{ + return shm->end - (void *)shm; +} + +void * +shmalloc(shmallocator *shm, size_t alignment, size_t size) +{ + if (alignment < 16) { + alignment = 16; + } + uint64_t requested_alignment = log_size(alignment); + if (requested_alignment > 63) { + return NULL; + } + if (alignment != (1ULL << requested_alignment)) { + return NULL; + } + + if (size == 0) { + size = 1; + } + + pthread_spin_lock(&shm->lock); + + all_invariants(shm); + + chunk *chnk = free_list_alloc(shm, size + alignment - 1); + if (!chnk) { + all_invariants(shm); + pthread_spin_unlock(&shm->lock); + return NULL; + } + chnk->state = CHUNK_STATE_ALLOCATED; + + if (!IS_ALIGNED(chnk->ptr, alignment)) { + void *aligned_ptr = (void *)ALIGN_UP(chnk->ptr, alignment); + size_t padding = aligned_ptr - chnk->ptr; + chunk *new_chunk = chunk_remove_tail(shm, chnk, padding); + chnk->state = CHUNK_STATE_FREE; + free_list_add(shm, chnk); + if (!new_chunk) { + all_invariants(shm); + pthread_spin_unlock(&shm->lock); + return NULL; + } + chnk = new_chunk; + chnk->state = CHUNK_STATE_ALLOCATED; + } + + if (size < chnk->size) { + chunk *new_chunk = chunk_remove_tail(shm, chnk, size); + if (new_chunk) { + free_list_add(shm, new_chunk); + } + } + + chnk->requested_alignment = requested_alignment; + + if (chnk->ptr + size > shm->max_allocated_end) { + shm->max_allocated_end = chnk->ptr + size; + } + + all_invariants(shm); + pthread_spin_unlock(&shm->lock); + assert((void *)&shm->chunk_table[shm->chunk_table_num_buckets] <= chnk->ptr); + assert(size <= chnk->size); + assert(chnk->ptr + size <= shm->end); + assert(IS_ALIGNED(chnk->ptr, alignment)); + return chnk->ptr; +} + +void +shfree(shmallocator *shm, void *ptr) +{ + pthread_spin_lock(&shm->lock); + all_invariants(shm); + chunk *chnk = chunk_table_get_entry(shm, ptr); + if (!chnk) { + all_invariants(shm); + pthread_spin_unlock(&shm->lock); + assert(0); + } + assert(chnk->state == 
CHUNK_STATE_ALLOCATED); + assert(chnk->ptr == ptr); + chnk->state = CHUNK_STATE_FREE; + chunk *ppred = chnk->phys_pred; + if (ppred && ppred->state == CHUNK_STATE_FREE) { + free_list_remove(shm, ppred); + chunks_merge(shm, ppred, chnk); + chnk = ppred; + } + chunk *psucc = chunk_get_physical_successor(shm, chnk); + if (psucc && psucc->state == CHUNK_STATE_FREE) { + free_list_remove(shm, psucc); + chunks_merge(shm, chnk, psucc); + } + free_list_add(shm, chnk); + + all_invariants(shm); + pthread_spin_unlock(&shm->lock); +} + +void * +shrealloc(shmallocator *shm, void *ptr, size_t size) +{ + if (ptr == NULL) { + return shmalloc(shm, 0, size); + } + if (size == 0) { + shfree(shm, ptr); + return NULL; + } + + pthread_spin_lock(&shm->lock); + all_invariants(shm); + chunk *chnk = chunk_table_get_entry(shm, ptr); + if (!chnk) { + pthread_spin_unlock(&shm->lock); + assert(0); + } + if (size < chnk->size / 2) { + chunk *new_chunk = chunk_remove_tail(shm, chnk, size); + if (new_chunk) { + free_list_add(shm, new_chunk); + } + all_invariants(shm); + pthread_spin_unlock(&shm->lock); + return chnk->ptr; + } else if (size > chnk->size) { + pthread_spin_unlock(&shm->lock); + if (size < 2 * chnk->size) { + size = 2 * chnk->size; + } + void *pnew = shmalloc(shm, 1ULL << chnk->requested_alignment, size); + if (pnew) { + memcpy(pnew, ptr, chnk->size); + shfree(shm, ptr); + } + return pnew; + } else { + all_invariants(shm); + pthread_spin_unlock(&shm->lock); + return ptr; + } +} diff --git a/src/platform_linux/shmalloc.h b/src/platform_linux/shmalloc.h new file mode 100644 index 00000000..518b838c --- /dev/null +++ b/src/platform_linux/shmalloc.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include + +typedef struct shmallocator shmallocator; + +int +shmallocator_init(shmallocator *shmallocator, + uint64_t chunk_table_size, + size_t size); + +size_t +shmallocator_size(shmallocator *shmallocator); + +void +shmallocator_deinit(shmallocator *shmallocator); + +void * +shmalloc(shmallocator *shmallocator, size_t alignment, size_t size); + +void +shfree(shmallocator *shmallocator, void *ptr); + +void * +shrealloc(shmallocator *shmallocator, void *ptr, size_t size); \ No newline at end of file From 1464ee450346f7861ab8f5a387047f44aa925439 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Tue, 20 Jan 2026 12:16:06 -0800 Subject: [PATCH 4/9] dial down heap capacities --- src/platform_linux/shmalloc.c | 36 ++++++++++++++++++++++--------- src/splinterdb.c | 5 +++-- tests/functional/btree_test.c | 4 ++-- tests/functional/cache_test.c | 4 ++-- tests/functional/filter_test.c | 4 ++-- tests/functional/log_test.c | 4 ++-- tests/functional/splinter_test.c | 12 +++++------ tests/functional/ycsb_test.c | 2 +- tests/unit/btree_stress_test.c | 2 +- tests/unit/btree_test.c | 2 +- tests/unit/config_parse_test.c | 2 +- tests/unit/limitations_test.c | 2 +- tests/unit/splinter_test.c | 4 +--- tests/unit/writable_buffer_test.c | 2 +- 14 files changed, 50 insertions(+), 35 deletions(-) diff --git a/src/platform_linux/shmalloc.c b/src/platform_linux/shmalloc.c index 59ce70ed..44f1a017 100644 --- a/src/platform_linux/shmalloc.c +++ b/src/platform_linux/shmalloc.c @@ -249,7 +249,6 @@ chunk_remove_tail(shmallocator *shm, chunk *chnk, size_t size) static void chunks_merge(shmallocator *shm, chunk *chnk, chunk *psucc) { - assert(chnk->state == CHUNK_STATE_FREE); assert(psucc->state == CHUNK_STATE_FREE); chunk *ppsucc = chunk_get_physical_successor(shm, psucc); @@ -595,16 +594,33 @@ shrealloc(shmallocator *shm, void *ptr, size_t size) 
pthread_spin_unlock(&shm->lock); return chnk->ptr; } else if (size > chnk->size) { - pthread_spin_unlock(&shm->lock); - if (size < 2 * chnk->size) { - size = 2 * chnk->size; - } - void *pnew = shmalloc(shm, 1ULL << chnk->requested_alignment, size); - if (pnew) { - memcpy(pnew, ptr, chnk->size); - shfree(shm, ptr); + chunk *next_chunk = chunk_get_physical_successor(shm, chnk); + if (next_chunk && next_chunk->state == CHUNK_STATE_FREE + && size <= chnk->size + next_chunk->size) + { + free_list_remove(shm, next_chunk); + chunks_merge(shm, chnk, next_chunk); + if (size < chnk->size / 2) { + chunk *new_chunk = chunk_remove_tail(shm, chnk, size); + if (new_chunk) { + free_list_add(shm, new_chunk); + } + } + all_invariants(shm); + pthread_spin_unlock(&shm->lock); + return chnk->ptr; + } else { + pthread_spin_unlock(&shm->lock); + if (size < 2 * chnk->size) { + size = 2 * chnk->size; + } + void *pnew = shmalloc(shm, 1ULL << chnk->requested_alignment, size); + if (pnew) { + memcpy(pnew, ptr, chnk->size); + shfree(shm, ptr); + } + return pnew; } - return pnew; } else { all_invariants(shm); pthread_spin_unlock(&shm->lock); diff --git a/src/splinterdb.c b/src/splinterdb.c index 361bd14e..125c44ee 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -259,8 +259,9 @@ splinterdb_create_or_open(const splinterdb_config *kvs_cfg, // IN // (Some tests externally create the platform_heap, so we should // only create one if it does not already exist.) if (kvs_cfg->use_shmem && (use_this_heap_id == NULL)) { - size_t shmem_size = (kvs_cfg->shmem_size ? kvs_cfg->shmem_size : 2 * GiB); - status = platform_heap_create( + size_t shmem_size = + (kvs_cfg->shmem_size ? kvs_cfg->shmem_size : 512 * MiB); + status = platform_heap_create( platform_get_module_id(), shmem_size, TRUE, &use_this_heap_id); if (!SUCCESS(status)) { platform_error_log( diff --git a/tests/functional/btree_test.c b/tests/functional/btree_test.c index 0926d1f2..945edba9 100644 --- a/tests/functional/btree_test.c +++ b/tests/functional/btree_test.c @@ -1558,8 +1558,8 @@ btree_test(int argc, char *argv[]) // Create a heap for io, allocator, cache and splinter platform_heap_id hid = NULL; - rc = - platform_heap_create(platform_get_module_id(), 1 * GiB, use_shmem, &hid); + rc = platform_heap_create( + platform_get_module_id(), 512 * MiB, use_shmem, &hid); platform_assert_status_ok(rc); uint64 num_bg_threads[NUM_TASK_TYPES] = {0}; // no bg threads diff --git a/tests/functional/cache_test.c b/tests/functional/cache_test.c index 46890eb7..3f93bdae 100644 --- a/tests/functional/cache_test.c +++ b/tests/functional/cache_test.c @@ -933,8 +933,8 @@ cache_test(int argc, char *argv[]) // Create a heap for io, allocator, cache and splinter platform_heap_id hid = NULL; - rc = - platform_heap_create(platform_get_module_id(), 1 * GiB, use_shmem, &hid); + rc = platform_heap_create( + platform_get_module_id(), 512 * MiB, use_shmem, &hid); platform_assert_status_ok(rc); uint64 num_bg_threads[NUM_TASK_TYPES] = {0}; // no bg threads diff --git a/tests/functional/filter_test.c b/tests/functional/filter_test.c index d8954462..8846836c 100644 --- a/tests/functional/filter_test.c +++ b/tests/functional/filter_test.c @@ -306,8 +306,8 @@ filter_test(int argc, char *argv[]) // Create a heap for io, allocator, cache and splinter platform_heap_id hid = NULL; - rc = - platform_heap_create(platform_get_module_id(), 1 * GiB, use_shmem, &hid); + rc = platform_heap_create( + platform_get_module_id(), 512 * MiB, use_shmem, &hid); platform_assert_status_ok(rc); uint64 
num_memtable_bg_threads_unused = 0; diff --git a/tests/functional/log_test.c b/tests/functional/log_test.c index 67999c07..d7defa8d 100644 --- a/tests/functional/log_test.c +++ b/tests/functional/log_test.c @@ -259,8 +259,8 @@ log_test(int argc, char *argv[]) // Create a heap for io, allocator, cache and splinter platform_heap_id hid = NULL; - status = - platform_heap_create(platform_get_module_id(), 1 * GiB, use_shmem, &hid); + status = platform_heap_create( + platform_get_module_id(), 512 * MiB, use_shmem, &hid); platform_assert_status_ok(status); core_config *cfg = TYPED_MALLOC(hid, cfg); diff --git a/tests/functional/splinter_test.c b/tests/functional/splinter_test.c index 656bd9ee..619264c4 100644 --- a/tests/functional/splinter_test.c +++ b/tests/functional/splinter_test.c @@ -1411,9 +1411,9 @@ test_splinter_perf(system_config *cfg, // Define a set of parameters to drive trunk range query perf trunk_range_perf_params perf_ranges[] = { // number min max - {"Small range" , 2048 , 1 , 100 }, + {"Small range" , 128 , 1 , 100 }, {"Medium range" , 512 , 512 , 1024 }, - {"Large range" , 128 , (131072 - 16384) , 131072 } + {"Large range" , 8192 , (131072 - 16384) , 131072 } }; // clang-format on @@ -2611,10 +2611,10 @@ splinter_test(int argc, char *argv[]) * whichever is greater. * Heap capacity should be within [2 * GiB, UINT32_MAX]. */ - uint8 num_caches = cache_per_table ? num_tables : 1; - uint64 heap_capacity = MAX(1024 * MiB * num_caches, 512 * MiB * num_tables); - heap_capacity = MIN(heap_capacity, UINT32_MAX); - heap_capacity = MAX(heap_capacity, 8 * GiB); + uint8 num_caches = cache_per_table ? num_tables : 1; + // uint64 heap_capacity = MAX(1024 * MiB * num_caches, 512 * MiB * + // num_tables); heap_capacity = MIN(heap_capacity, UINT32_MAX); + uint64 heap_capacity = 512 * MiB; if (use_shmem) { platform_default_log( "Attempt to create shared segment of size %lu bytes.\n", diff --git a/tests/functional/ycsb_test.c b/tests/functional/ycsb_test.c index 73d22569..469c08d5 100644 --- a/tests/functional/ycsb_test.c +++ b/tests/functional/ycsb_test.c @@ -1176,7 +1176,7 @@ ycsb_test(int argc, char *argv[]) // Create a heap for io, allocator, cache and splinter platform_heap_id hid; - rc = platform_heap_create(platform_get_module_id(), 1 * GiB, FALSE, &hid); + rc = platform_heap_create(platform_get_module_id(), 512 * MiB, FALSE, &hid); platform_assert_status_ok(rc); system_config *system_cfg = TYPED_MALLOC(hid, system_cfg); diff --git a/tests/unit/btree_stress_test.c b/tests/unit/btree_stress_test.c index 4b1c1e8f..47f65588 100644 --- a/tests/unit/btree_stress_test.c +++ b/tests/unit/btree_stress_test.c @@ -146,7 +146,7 @@ CTEST_SETUP(btree_stress) // Create a heap for io, allocator, cache and splinter if (!SUCCESS(platform_heap_create(platform_get_module_id(), - 1 * GiB, + 512 * MiB, data->master_cfg.use_shmem, &data->hid))) { diff --git a/tests/unit/btree_test.c b/tests/unit/btree_test.c index 62992511..78001a7d 100644 --- a/tests/unit/btree_test.c +++ b/tests/unit/btree_test.c @@ -76,7 +76,7 @@ CTEST_SETUP(btree) { platform_register_thread(); config_set_defaults(&data->master_cfg); - uint64 heap_capacity = (1 * GiB); + uint64 heap_capacity = (512 * MiB); if (!SUCCESS( config_parse(&data->master_cfg, 1, Ctest_argc, (char **)Ctest_argv))) diff --git a/tests/unit/config_parse_test.c b/tests/unit/config_parse_test.c index 9a728e7d..fb967fc4 100644 --- a/tests/unit/config_parse_test.c +++ b/tests/unit/config_parse_test.c @@ -34,7 +34,7 @@ CTEST_DATA(config_parse) CTEST_SETUP(config_parse) { 
platform_register_thread(); - uint64 heap_capacity = (1024 * MiB); + uint64 heap_capacity = (512 * MiB); // Create a heap for io, allocator, cache and splinter platform_status rc = platform_heap_create( platform_get_module_id(), heap_capacity, FALSE, &data->hid); diff --git a/tests/unit/limitations_test.c b/tests/unit/limitations_test.c index 981b2a6d..46464fbd 100644 --- a/tests/unit/limitations_test.c +++ b/tests/unit/limitations_test.c @@ -62,7 +62,7 @@ CTEST_SETUP(limitations) // All test cases in this test usually deal with error handling set_log_streams_for_tests(MSG_LEVEL_ERRORS); - uint64 heap_capacity = (1 * GiB); + uint64 heap_capacity = (512 * MiB); data->use_shmem = config_parse_use_shmem(Ctest_argc, (char **)Ctest_argv); // Create a heap for io, allocator, cache and splinter diff --git a/tests/unit/splinter_test.c b/tests/unit/splinter_test.c index 3e7139f6..83aa471b 100644 --- a/tests/unit/splinter_test.c +++ b/tests/unit/splinter_test.c @@ -118,9 +118,7 @@ CTEST_SETUP(splinter) bool32 cache_per_table = FALSE; int num_tables = data->spl_num_tables; // Cache, for re-use below uint8 num_caches = (cache_per_table ? num_tables : 1); - uint64 heap_capacity = MAX(1024 * MiB * num_caches, 512 * MiB * num_tables); - heap_capacity = MIN(heap_capacity, UINT32_MAX); - heap_capacity = MAX(heap_capacity, 2 * GiB); + uint64 heap_capacity = 512 * MiB; // Create a heap for io, allocator, cache and splinter platform_status rc = platform_heap_create(platform_get_module_id(), diff --git a/tests/unit/writable_buffer_test.c b/tests/unit/writable_buffer_test.c index 675b463c..dd978046 100644 --- a/tests/unit/writable_buffer_test.c +++ b/tests/unit/writable_buffer_test.c @@ -34,7 +34,7 @@ CTEST_SETUP(writable_buffer) data->use_shmem = config_parse_use_shmem(Ctest_argc, (char **)Ctest_argv); platform_status rc = platform_heap_create( - platform_get_module_id(), (1 * GiB), data->use_shmem, &data->hid); + platform_get_module_id(), (512 * MiB), data->use_shmem, &data->hid); platform_assert_status_ok(rc); } From 0b69db90c354043d90af536cbc84afdab1bd531a Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Tue, 20 Jan 2026 23:53:27 -0800 Subject: [PATCH 5/9] add physical next to shmalloc nodes Signed-off-by: Rob Johnson --- src/platform_linux/shmalloc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/platform_linux/shmalloc.c b/src/platform_linux/shmalloc.c index 44f1a017..4b1492dc 100644 --- a/src/platform_linux/shmalloc.c +++ b/src/platform_linux/shmalloc.c @@ -30,6 +30,7 @@ typedef struct chunk { chunk_state state : 2; void *ptr; struct chunk *phys_pred; + void *phys_next; struct chunk *size_pred; struct chunk *size_next; } chunk; @@ -152,10 +153,7 @@ chunk_table_get_entry(shmallocator *shm, void *ptr) static chunk * chunk_get_physical_successor(shmallocator *shm, chunk *chnk) { - if (chnk->ptr + chnk->size == shm->end) { - return NULL; - } - return chunk_table_get_entry(shm, chnk->ptr + chnk->size); + return chnk->phys_next; } static uint64_t @@ -237,12 +235,15 @@ chunk_remove_tail(shmallocator *shm, chunk *chnk, size_t size) new_chunk->state = CHUNK_STATE_FREE; new_chunk->ptr = chnk->ptr + size; new_chunk->phys_pred = chnk; + new_chunk->phys_next = psucc; if (psucc) { psucc->phys_pred = new_chunk; } - chnk->size = size; + chnk->size = size; + chnk->phys_next = new_chunk; + return new_chunk; } @@ -254,6 +255,7 @@ chunks_merge(shmallocator *shm, chunk *chnk, chunk *psucc) chunk *ppsucc = chunk_get_physical_successor(shm, psucc); chnk->size += psucc->size; + 
chnk->phys_next = psucc->phys_next; chunk_table_free_entry(shm, psucc); if (ppsucc) { ppsucc->phys_pred = chnk; From 771be22cf79148be9ea28c3b2f8a05c60eeb78c2 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Wed, 21 Jan 2026 08:18:18 -0800 Subject: [PATCH 6/9] move some allocations out of global heap --- src/core.c | 58 +++++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/core.c b/src/core.c index 29f48f0a..804673eb 100644 --- a/src/core.c +++ b/src/core.c @@ -168,7 +168,7 @@ core_set_super_block(core_handle *spl, if (root_addr != 0) { super->root_addr = root_addr; rc = trunk_inc_ref(spl->cfg.trunk_node_cfg, - spl->heap_id, + PROCESS_PRIVATE_HEAP_ID, spl->cc, spl->al, spl->ts, @@ -205,7 +205,7 @@ core_set_super_block(core_handle *spl, if (old_root_addr != 0 && !is_create) { rc = trunk_dec_ref(spl->cfg.trunk_node_cfg, - spl->heap_id, + PROCESS_PRIVATE_HEAP_ID, spl->cc, spl->al, spl->ts, @@ -384,8 +384,12 @@ core_memtable_insert(core_handle *spl, key tuple_key, message msg) // this call is safe because we hold the insert lock memtable *mt = core_get_memtable(spl, generation); uint64 leaf_generation; // used for ordering the log - rc = memtable_insert( - &spl->mt_ctxt, mt, spl->heap_id, tuple_key, msg, &leaf_generation); + rc = memtable_insert(&spl->mt_ctxt, + mt, + PROCESS_PRIVATE_HEAP_ID, + tuple_key, + msg, + &leaf_generation); if (!SUCCESS(rc)) { goto unlock_insert_lock; } @@ -822,10 +826,10 @@ core_range_iterator_init(core_handle *spl, range_itor->can_prev = TRUE; range_itor->can_next = TRUE; - key_buffer_init(&range_itor->min_key, spl->heap_id); - key_buffer_init(&range_itor->max_key, spl->heap_id); - key_buffer_init(&range_itor->local_min_key, spl->heap_id); - key_buffer_init(&range_itor->local_max_key, spl->heap_id); + key_buffer_init(&range_itor->min_key, PROCESS_PRIVATE_HEAP_ID); + key_buffer_init(&range_itor->max_key, PROCESS_PRIVATE_HEAP_ID); + key_buffer_init(&range_itor->local_min_key, PROCESS_PRIVATE_HEAP_ID); + key_buffer_init(&range_itor->local_max_key, PROCESS_PRIVATE_HEAP_ID); if (core_key_compare(spl, min_key, start_key) > 0) { // in bounds, start at min @@ -971,7 +975,7 @@ core_range_iterator_init(core_handle *spl, range_itor->itor[i] = &btree_itor->super; } - rc = merge_iterator_create(spl->heap_id, + rc = merge_iterator_create(PROCESS_PRIVATE_HEAP_ID, spl->cfg.data_cfg, range_itor->num_branches, range_itor->itor, @@ -994,7 +998,7 @@ core_range_iterator_init(core_handle *spl, if (core_key_compare(spl, local_max, max_key) < 0) { key_buffer local_max_buffer; rc = key_buffer_init_from_key( - &local_max_buffer, spl->heap_id, local_max); + &local_max_buffer, PROCESS_PRIVATE_HEAP_ID, local_max); core_range_iterator_deinit(range_itor); if (!SUCCESS(rc)) { return rc; @@ -1023,7 +1027,7 @@ core_range_iterator_init(core_handle *spl, if (core_key_compare(spl, local_min, min_key) > 0) { key_buffer local_min_buffer; rc = key_buffer_init_from_key( - &local_min_buffer, spl->heap_id, local_min); + &local_min_buffer, PROCESS_PRIVATE_HEAP_ID, local_min); core_range_iterator_deinit(range_itor); if (!SUCCESS(rc)) { return rc; @@ -1075,21 +1079,21 @@ core_range_iterator_next(iterator *itor) if (!range_itor->can_next) { KEY_CREATE_LOCAL_COPY(rc, min_key, - range_itor->spl->heap_id, + PROCESS_PRIVATE_HEAP_ID, key_buffer_key(&range_itor->min_key)); if (!SUCCESS(rc)) { return rc; } KEY_CREATE_LOCAL_COPY(rc, max_key, - range_itor->spl->heap_id, + PROCESS_PRIVATE_HEAP_ID, key_buffer_key(&range_itor->max_key)); if (!SUCCESS(rc)) { 
return rc; } KEY_CREATE_LOCAL_COPY(rc, local_max_key, - range_itor->spl->heap_id, + PROCESS_PRIVATE_HEAP_ID, key_buffer_key(&range_itor->local_max_key)); if (!SUCCESS(rc)) { return rc; @@ -1134,21 +1138,21 @@ core_range_iterator_prev(iterator *itor) if (!range_itor->can_prev) { KEY_CREATE_LOCAL_COPY(rc, min_key, - range_itor->spl->heap_id, + PROCESS_PRIVATE_HEAP_ID, key_buffer_key(&range_itor->min_key)); if (!SUCCESS(rc)) { return rc; } KEY_CREATE_LOCAL_COPY(rc, max_key, - range_itor->spl->heap_id, + PROCESS_PRIVATE_HEAP_ID, key_buffer_key(&range_itor->max_key)); if (!SUCCESS(rc)) { return rc; } KEY_CREATE_LOCAL_COPY(rc, local_min_key, - range_itor->spl->heap_id, + PROCESS_PRIVATE_HEAP_ID, key_buffer_key(&range_itor->local_min_key)); if (!SUCCESS(rc)) { return rc; @@ -1198,7 +1202,7 @@ core_range_iterator_deinit(core_range_iterator *range_itor) { core_handle *spl = range_itor->spl; if (range_itor->merge_itor != NULL) { - merge_iterator_destroy(range_itor->spl->heap_id, &range_itor->merge_itor); + merge_iterator_destroy(PROCESS_PRIVATE_HEAP_ID, &range_itor->merge_itor); for (uint64 i = 0; i < range_itor->num_branches; i++) { btree_iterator *btree_itor = &range_itor->btree_itor[i]; if (range_itor->compacted[i]) { @@ -1766,22 +1770,22 @@ core_print_insertion_stats(platform_log_handle *log_handle, const core_handle *s core_stats *global; - global = TYPED_ZALLOC(spl->heap_id, global); + global = TYPED_ZALLOC(PROCESS_PRIVATE_HEAP_ID, global); if (global == NULL) { platform_error_log("Out of memory for statistics"); return; } histogram_handle insert_lat_accum, update_lat_accum, delete_lat_accum; - histogram_create(spl->heap_id, + histogram_create(PROCESS_PRIVATE_HEAP_ID, LATENCYHISTO_SIZE + 1, latency_histo_buckets, &insert_lat_accum); - histogram_create(spl->heap_id, + histogram_create(PROCESS_PRIVATE_HEAP_ID, LATENCYHISTO_SIZE + 1, latency_histo_buckets, &update_lat_accum); - histogram_create(spl->heap_id, + histogram_create(PROCESS_PRIVATE_HEAP_ID, LATENCYHISTO_SIZE + 1, latency_histo_buckets, &delete_lat_accum); @@ -1840,9 +1844,9 @@ core_print_insertion_stats(platform_log_handle *log_handle, const core_handle *s histogram_print(insert_lat_accum, "Insert Latency Histogram (ns):", log_handle); histogram_print(update_lat_accum, "Update Latency Histogram (ns):", log_handle); histogram_print(delete_lat_accum, "Delete Latency Histogram (ns):", log_handle); - histogram_destroy(spl->heap_id, &insert_lat_accum); - histogram_destroy(spl->heap_id, &update_lat_accum); - histogram_destroy(spl->heap_id, &delete_lat_accum); + histogram_destroy(PROCESS_PRIVATE_HEAP_ID, &insert_lat_accum); + histogram_destroy(PROCESS_PRIVATE_HEAP_ID, &update_lat_accum); + histogram_destroy(PROCESS_PRIVATE_HEAP_ID, &delete_lat_accum); platform_log(log_handle, "Flush Statistics\n"); @@ -1892,7 +1896,7 @@ core_print_insertion_stats(platform_log_handle *log_handle, const core_handle *s platform_log(log_handle, "------------------------------------------------------------------------------------\n"); cache_print_stats(log_handle, spl->cc); platform_log(log_handle, "\n"); - platform_free(spl->heap_id, global); + platform_free(PROCESS_PRIVATE_HEAP_ID, global); } void @@ -1929,7 +1933,7 @@ void core_print_lookup(core_handle *spl, key target, platform_log_handle *log_handle) { merge_accumulator data; - merge_accumulator_init(&data, spl->heap_id); + merge_accumulator_init(&data, PROCESS_PRIVATE_HEAP_ID); platform_stream_handle stream; platform_open_log_stream(&stream); From da2f42b49866d2ec8887769481f83e93979a93ea Mon Sep 17 00:00:00 
2001 From: Rob Johnson Date: Wed, 21 Jan 2026 12:14:20 -0800 Subject: [PATCH 7/9] move more allocations to private heap Signed-off-by: Rob Johnson --- src/platform_linux/shmalloc.c | 8 +- src/trunk.c | 138 +++++++++++++++++++--------------- 2 files changed, 82 insertions(+), 64 deletions(-) diff --git a/src/platform_linux/shmalloc.c b/src/platform_linux/shmalloc.c index 4b1492dc..cf2e9c90 100644 --- a/src/platform_linux/shmalloc.c +++ b/src/platform_linux/shmalloc.c @@ -458,10 +458,10 @@ shmallocator_init(shmallocator *shm, uint64_t max_allocations, size_t size) void shmallocator_deinit(shmallocator *shm) { - // printf("shmallocator_deinit: max needed MiBs: %lu (%lu%%)\n", - // B_TO_MiB(shm->max_allocated_end - (void *)shm), - // (uint64_t)(shm->max_allocated_end - (void *)shm) * 100 - // / shmallocator_size(shm)); + printf("shmallocator_deinit: max needed MiBs: %lu (%lu%%)\n", + B_TO_MiB(shm->max_allocated_end - (void *)shm), + (uint64_t)(shm->max_allocated_end - (void *)shm) * 100 + / shmallocator_size(shm)); pthread_spin_destroy(&shm->lock); } diff --git a/src/trunk.c b/src/trunk.c index e3dc1819..7a5106ce 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -10,6 +10,7 @@ #include "trunk.h" #include "platform.h" #include "data_internal.h" +#include "platform_heap.h" #include "util.h" #include "btree.h" #include "routing_filter.h" @@ -818,7 +819,7 @@ static void trunk_node_deinit(trunk_node *node, const trunk_context *context) { VECTOR_APPLY_TO_ELTS( - &node->pivots, vector_apply_platform_free, context->hid); + &node->pivots, vector_apply_platform_free, PROCESS_PRIVATE_HEAP_ID); VECTOR_APPLY_TO_PTRS(&node->pivot_bundles, bundle_deinit); VECTOR_APPLY_TO_PTRS(&node->inflight_bundles, bundle_deinit); vector_deinit(&node->pivots); @@ -1483,9 +1484,9 @@ trunk_node_deserialize(const trunk_context *context, trunk_pivot_vector pivots; bundle_vector inflight_bundles; bundle_vector pivot_bundles; - vector_init(&pivots, context->hid); - vector_init(&inflight_bundles, context->hid); - vector_init(&pivot_bundles, context->hid); + vector_init(&pivots, PROCESS_PRIVATE_HEAP_ID); + vector_init(&inflight_bundles, PROCESS_PRIVATE_HEAP_ID); + vector_init(&pivot_bundles, PROCESS_PRIVATE_HEAP_ID); rc = vector_ensure_capacity(&pivots, header->num_pivots); if (!SUCCESS(rc)) { @@ -1513,7 +1514,8 @@ trunk_node_deserialize(const trunk_context *context, } for (uint64 i = 0; i < header->num_pivots; i++) { - trunk_pivot *imp = trunk_pivot_deserialize(context->hid, &handle, i); + trunk_pivot *imp = + trunk_pivot_deserialize(PROCESS_PRIVATE_HEAP_ID, &handle, i); if (imp == NULL) { platform_error_log( "%s():%d: pivot_deserialize() failed", __func__, __LINE__); @@ -1526,7 +1528,7 @@ trunk_node_deserialize(const trunk_context *context, __func__, __LINE__, platform_status_to_string(rc)); - trunk_pivot_destroy(imp, context->hid); + trunk_pivot_destroy(imp, PROCESS_PRIVATE_HEAP_ID); goto cleanup; } } @@ -1541,7 +1543,7 @@ trunk_node_deserialize(const trunk_context *context, goto cleanup; } rc = VECTOR_EMPLACE_APPEND( - &pivot_bundles, bundle_deserialize, context->hid, odb); + &pivot_bundles, bundle_deserialize, PROCESS_PRIVATE_HEAP_ID, odb); if (!SUCCESS(rc)) { platform_error_log("%s():%d: VECTOR_EMPLACE_APPEND() failed: %s", __func__, @@ -1565,8 +1567,10 @@ trunk_node_deserialize(const trunk_context *context, rc = STATUS_IO_ERROR; goto cleanup; } - rc = VECTOR_EMPLACE_APPEND( - &inflight_bundles, bundle_deserialize, context->hid, odb); + rc = VECTOR_EMPLACE_APPEND(&inflight_bundles, + bundle_deserialize, + 
PROCESS_PRIVATE_HEAP_ID, + odb); if (!SUCCESS(rc)) { platform_error_log("%s():%d: VECTOR_EMPLACE_APPEND() failed: %s", __func__, @@ -1602,7 +1606,7 @@ trunk_node_deserialize(const trunk_context *context, return STATUS_OK; cleanup: - VECTOR_APPLY_TO_ELTS(&pivots, trunk_pivot_destroy, context->hid); + VECTOR_APPLY_TO_ELTS(&pivots, trunk_pivot_destroy, PROCESS_PRIVATE_HEAP_ID); VECTOR_APPLY_TO_PTRS(&pivot_bundles, bundle_deinit); VECTOR_APPLY_TO_PTRS(&inflight_bundles, bundle_deinit); vector_deinit(&pivots); @@ -2472,7 +2476,7 @@ trunk_apply_changes_internal(trunk_context *context, } trunk_ondisk_node_ref_vector new_child_refs; - vector_init(&new_child_refs, context->hid); + vector_init(&new_child_refs, PROCESS_PRIVATE_HEAP_ID); if (trunk_node_height(&node) == height) { rc = func(context, addr, &node, arg); @@ -3171,7 +3175,7 @@ maplet_compaction_task(task *arg) ZERO_STRUCT(apply_args); apply_args.state = state; - vector_init(&apply_args.branches, context->hid); + vector_init(&apply_args.branches, PROCESS_PRIVATE_HEAP_ID); if (state->abandoned) { if (context->stats) { @@ -3415,7 +3419,7 @@ bundle_compaction_task(task *arg) trunk_branch_merger merger; trunk_branch_merger_init(&merger, - context->hid, + PROCESS_PRIVATE_HEAP_ID, context->cfg->data_cfg, key_buffer_key(&state->key), key_buffer_key(&state->ubkey), @@ -3743,8 +3747,10 @@ trunk_node_receive_bundles(trunk_context *context, } if (pivot_bundle && 0 < bundle_num_branches(pivot_bundle)) { - rc = VECTOR_EMPLACE_APPEND( - &node->inflight_bundles, bundle_init_copy, pivot_bundle, context->hid); + rc = VECTOR_EMPLACE_APPEND(&node->inflight_bundles, + bundle_init_copy, + pivot_bundle, + PROCESS_PRIVATE_HEAP_ID); if (!SUCCESS(rc)) { platform_error_log("node_receive_bundles: bundle_init_copy failed: " "%d\n", @@ -3755,8 +3761,10 @@ trunk_node_receive_bundles(trunk_context *context, for (uint64 i = inflight_start; i < vector_length(inflight); i++) { bundle *bndl = vector_get_ptr(inflight, i); - rc = VECTOR_EMPLACE_APPEND( - &node->inflight_bundles, bundle_init_copy, bndl, context->hid); + rc = VECTOR_EMPLACE_APPEND(&node->inflight_bundles, + bundle_init_copy, + bndl, + PROCESS_PRIVATE_HEAP_ID); if (!SUCCESS(rc)) { platform_error_log("node_receive_bundles: bundle_init_copy failed: " "%d\n", @@ -3826,7 +3834,7 @@ leaf_estimate_unique_keys(trunk_context *context, debug_assert(trunk_node_is_well_formed_leaf(context->cfg->data_cfg, leaf)); routing_filter_vector maplets; - vector_init(&maplets, context->hid); + vector_init(&maplets, PROCESS_PRIVATE_HEAP_ID); rc = vector_ensure_capacity(&maplets, vector_length(&leaf->inflight_bundles) + 1); if (!SUCCESS(rc)) { @@ -3891,7 +3899,7 @@ leaf_estimate_unique_keys(trunk_context *context, uint32 num_globally_unique_fp = routing_filter_estimate_unique_fp(context->cc, context->cfg->filter_cfg, - context->hid, + PROCESS_PRIVATE_HEAP_ID, vector_data(&maplets), vector_length(&maplets)); @@ -3976,7 +3984,7 @@ leaf_split_select_pivots(trunk_context *context, key max_key = ondisk_key_to_key(&last->key); rc = VECTOR_EMPLACE_APPEND( - pivots, key_buffer_init_from_key, context->hid, min_key); + pivots, key_buffer_init_from_key, PROCESS_PRIVATE_HEAP_ID, min_key); if (!SUCCESS(rc)) { platform_error_log("leaf_split_select_pivots: " "VECTOR_EMPLACE_APPEND failed: %d\n", @@ -3986,7 +3994,7 @@ leaf_split_select_pivots(trunk_context *context, trunk_branch_merger merger; trunk_branch_merger_init(&merger, - context->hid, + PROCESS_PRIVATE_HEAP_ID, context->cfg->data_cfg, min_key, max_key, @@ -4048,8 +4056,10 @@ 
leaf_split_select_pivots(trunk_context *context, && next_boundary <= new_cumulative_kv_bytes) || rflimit < new_tuples) { - rc = VECTOR_EMPLACE_APPEND( - pivots, key_buffer_init_from_key, context->hid, curr_key); + rc = VECTOR_EMPLACE_APPEND(pivots, + key_buffer_init_from_key, + PROCESS_PRIVATE_HEAP_ID, + curr_key); if (!SUCCESS(rc)) { platform_error_log("leaf_split_select_pivots: " "VECTOR_EMPLACE_APPEND failed: %d\n", @@ -4066,7 +4076,7 @@ leaf_split_select_pivots(trunk_context *context, } rc = VECTOR_EMPLACE_APPEND( - pivots, key_buffer_init_from_key, context->hid, max_key); + pivots, key_buffer_init_from_key, PROCESS_PRIVATE_HEAP_ID, max_key); if (!SUCCESS(rc)) { platform_error_log("leaf_split_select_pivots: " "VECTOR_EMPLACE_APPEND failed: %d\n", @@ -4098,7 +4108,8 @@ leaf_split_init(trunk_node *new_leaf, trunk_pivot *pvt = trunk_node_pivot(leaf, 0); - rc = trunk_node_init_empty_leaf(new_leaf, context->hid, min_key, max_key); + rc = trunk_node_init_empty_leaf( + new_leaf, PROCESS_PRIVATE_HEAP_ID, min_key, max_key); if (!SUCCESS(rc)) { platform_error_log("leaf_split_init: node_init_empty_leaf failed: %d\n", rc.r); @@ -4182,7 +4193,7 @@ leaf_split(trunk_context *context, } *abandon_compactions = FALSE; return VECTOR_EMPLACE_APPEND( - new_leaves, trunk_node_copy_init, leaf, context->hid); + new_leaves, trunk_node_copy_init, leaf, PROCESS_PRIVATE_HEAP_ID); } if (context->stats) { @@ -4193,7 +4204,7 @@ leaf_split(trunk_context *context, key_buffer_vector pivots; - vector_init(&pivots, context->hid); + vector_init(&pivots, PROCESS_PRIVATE_HEAP_ID); rc = vector_ensure_capacity(&pivots, target_num_leaves + 1); if (!SUCCESS(rc)) { platform_error_log("leaf_split: vector_ensure_capacity failed: %d\n", @@ -4255,7 +4266,7 @@ index_init_split(trunk_node *new_index, platform_status rc; trunk_pivot_vector pivots; - vector_init(&pivots, hid); + vector_init(&pivots, PROCESS_PRIVATE_HEAP_ID); rc = vector_ensure_capacity(&pivots, end_child_num - start_child_num + 1); if (!SUCCESS(rc)) { platform_error_log( @@ -4264,7 +4275,7 @@ index_init_split(trunk_node *new_index, } for (uint64 i = start_child_num; i < end_child_num + 1; i++) { trunk_pivot *pvt = vector_get(&index->pivots, i); - trunk_pivot *copy = trunk_pivot_copy(pvt, hid); + trunk_pivot *copy = trunk_pivot_copy(pvt, PROCESS_PRIVATE_HEAP_ID); if (copy == NULL) { platform_error_log("index_init_split: pivot_copy failed\n"); rc = STATUS_NO_MEMORY; @@ -4275,7 +4286,7 @@ index_init_split(trunk_node *new_index, } bundle_vector pivot_bundles; - vector_init(&pivot_bundles, hid); + vector_init(&pivot_bundles, PROCESS_PRIVATE_HEAP_ID); rc = vector_ensure_capacity(&pivot_bundles, end_child_num - start_child_num); if (!SUCCESS(rc)) { platform_error_log( @@ -4286,7 +4297,7 @@ index_init_split(trunk_node *new_index, rc = VECTOR_EMPLACE_APPEND(&pivot_bundles, bundle_init_copy, vector_get_ptr(&index->pivot_bundles, i), - hid); + PROCESS_PRIVATE_HEAP_ID); if (!SUCCESS(rc)) { platform_error_log("index_init_split: bundle_init_copy failed: %d\n", rc.r); @@ -4295,9 +4306,11 @@ index_init_split(trunk_node *new_index, } bundle_vector inflight_bundles; - vector_init(&inflight_bundles, hid); - rc = VECTOR_EMPLACE_MAP_PTRS( - &inflight_bundles, bundle_init_copy, &index->inflight_bundles, hid); + vector_init(&inflight_bundles, PROCESS_PRIVATE_HEAP_ID); + rc = VECTOR_EMPLACE_MAP_PTRS(&inflight_bundles, + bundle_init_copy, + &index->inflight_bundles, + PROCESS_PRIVATE_HEAP_ID); if (!SUCCESS(rc)) { platform_error_log("index_init_split: VECTOR_EMPLACE_MAP_PTRS failed: " "%d\n", @@ 
-4321,7 +4334,7 @@ index_init_split(trunk_node *new_index, VECTOR_APPLY_TO_PTRS(&pivot_bundles, bundle_deinit); vector_deinit(&pivot_bundles); cleanup_pivots: - VECTOR_APPLY_TO_ELTS(&pivots, trunk_pivot_destroy, hid); + VECTOR_APPLY_TO_ELTS(&pivots, trunk_pivot_destroy, PROCESS_PRIVATE_HEAP_ID); vector_deinit(&pivots); return rc; } @@ -4349,7 +4362,7 @@ index_split(trunk_context *context, for (uint64 i = 0; i < num_nodes; i++) { rc = VECTOR_EMPLACE_APPEND(new_indexes, index_init_split, - context->hid, + PROCESS_PRIVATE_HEAP_ID, index, i * num_children / num_nodes, (i + 1) * num_children / num_nodes); @@ -4385,7 +4398,7 @@ restore_balance_leaf(trunk_context *context, incorporation_tasks *itasks) { trunk_node_vector new_nodes; - vector_init(&new_nodes, context->hid); + vector_init(&new_nodes, PROCESS_PRIVATE_HEAP_ID); bool32 abandon_compactions = FALSE; platform_status rc = @@ -4482,7 +4495,7 @@ flush_to_one_child(trunk_context *context, // Perform the flush, getting back the new children trunk_ondisk_node_ref_vector new_childrefs; - vector_init(&new_childrefs, context->hid); + vector_init(&new_childrefs, PROCESS_PRIVATE_HEAP_ID); rc = flush_then_compact(context, &child, trunk_node_pivot_bundle(index, pivot_num), @@ -4499,7 +4512,7 @@ flush_to_one_child(trunk_context *context, // Construct our new pivots for the new children trunk_pivot_vector new_pivots; - vector_init(&new_pivots, context->hid); + vector_init(&new_pivots, PROCESS_PRIVATE_HEAP_ID); rc = vector_ensure_capacity(&new_pivots, vector_length(&new_childrefs)); if (!SUCCESS(rc)) { platform_error_log("flush_to_one_child: vector_ensure_capacity failed: " @@ -4510,7 +4523,7 @@ flush_to_one_child(trunk_context *context, rc = VECTOR_MAP_ELTS(&new_pivots, trunk_pivot_create_from_ondisk_node_ref, &new_childrefs, - context->hid); + PROCESS_PRIVATE_HEAP_ID); if (!SUCCESS(rc)) { platform_error_log("flush_to_one_child: VECTOR_MAP_ELTS failed: %d\n", rc.r); @@ -4525,7 +4538,7 @@ flush_to_one_child(trunk_context *context, // Construct the new empty pivot bundles for the new children bundle_vector new_pivot_bundles; rc = bundle_vector_init_empty( - &new_pivot_bundles, vector_length(&new_pivots), context->hid); + &new_pivot_bundles, vector_length(&new_pivots), PROCESS_PRIVATE_HEAP_ID); if (!SUCCESS(rc)) { platform_error_log("flush_to_one_child: bundle_vector_init_empty failed: " "%d\n", @@ -4571,7 +4584,7 @@ flush_to_one_child(trunk_context *context, context, trunk_pivot_key(pvt), trunk_node_height(index)); // Replace the old pivot and pivot bundles with the new ones - trunk_pivot_destroy(pvt, context->hid); + trunk_pivot_destroy(pvt, PROCESS_PRIVATE_HEAP_ID); rc = vector_replace( &index->pivots, pivot_num, 1, &new_pivots, 0, vector_length(&new_pivots)); platform_assert_status_ok(rc); @@ -4617,7 +4630,7 @@ restore_balance_index(trunk_context *context, debug_assert(trunk_node_is_well_formed_index(context->cfg->data_cfg, index)); trunk_ondisk_node_ref_vector all_new_childrefs; - vector_init(&all_new_childrefs, context->hid); + vector_init(&all_new_childrefs, PROCESS_PRIVATE_HEAP_ID); uint64 fullest_child = 0; uint64 fullest_kv_bytes = 0; @@ -4660,7 +4673,7 @@ restore_balance_index(trunk_context *context, } trunk_node_vector new_nodes; - vector_init(&new_nodes, context->hid); + vector_init(&new_nodes, PROCESS_PRIVATE_HEAP_ID); rc = index_split(context, index, &new_nodes); if (!SUCCESS(rc)) { platform_error_log("restore_balance_index: index_split failed: %d\n", @@ -4752,7 +4765,7 @@ build_new_roots(trunk_context *context, // Create the pivots vector 
for the new root trunk_pivot_vector pivots; - vector_init(&pivots, context->hid); + vector_init(&pivots, PROCESS_PRIVATE_HEAP_ID); rc = vector_ensure_capacity(&pivots, vector_length(node_refs) + 1); if (!SUCCESS(rc)) { platform_error_log("build_new_roots: vector_ensure_capacity failed: %d\n", @@ -4762,12 +4775,12 @@ build_new_roots(trunk_context *context, rc = VECTOR_MAP_ELTS(&pivots, trunk_pivot_create_from_ondisk_node_ref, node_refs, - context->hid); + PROCESS_PRIVATE_HEAP_ID); if (!SUCCESS(rc)) { platform_error_log("build_new_roots: VECTOR_MAP_ELTS failed: %d\n", rc.r); goto cleanup_pivots; } - trunk_pivot *ub_pivot = trunk_pivot_create(context->hid, + trunk_pivot *ub_pivot = trunk_pivot_create(PROCESS_PRIVATE_HEAP_ID, POSITIVE_INFINITY_KEY, 0, 0, @@ -4784,7 +4797,7 @@ build_new_roots(trunk_context *context, // Build a new vector of empty pivot bundles. bundle_vector pivot_bundles; rc = bundle_vector_init_empty( - &pivot_bundles, vector_length(&pivots) - 1, context->hid); + &pivot_bundles, vector_length(&pivots) - 1, PROCESS_PRIVATE_HEAP_ID); if (!SUCCESS(rc)) { platform_error_log( "build_new_roots: bundle_vector_init_empty failed: %d\n", rc.r); @@ -4793,7 +4806,7 @@ build_new_roots(trunk_context *context, // Build a new empty inflight bundle vector bundle_vector inflight; - vector_init(&inflight, context->hid); + vector_init(&inflight, PROCESS_PRIVATE_HEAP_ID); // Build the new root trunk_node new_root; @@ -4805,7 +4818,7 @@ build_new_roots(trunk_context *context, // into the new root. trunk_node_vector new_nodes; - vector_init(&new_nodes, context->hid); + vector_init(&new_nodes, PROCESS_PRIVATE_HEAP_ID); rc = index_split(context, &new_root, &new_nodes); trunk_node_deinit(&new_root, context); if (!SUCCESS(rc)) { @@ -4816,7 +4829,7 @@ build_new_roots(trunk_context *context, } trunk_ondisk_node_ref_vector new_ondisk_node_refs; - vector_init(&new_ondisk_node_refs, context->hid); + vector_init(&new_ondisk_node_refs, PROCESS_PRIVATE_HEAP_ID); rc = serialize_nodes(context, &new_nodes, &new_ondisk_node_refs); VECTOR_APPLY_TO_PTRS(&new_nodes, trunk_node_deinit, context); vector_deinit(&new_nodes); @@ -4841,7 +4854,7 @@ build_new_roots(trunk_context *context, context->hid); vector_deinit(&new_ondisk_node_refs); cleanup_pivots: - VECTOR_APPLY_TO_ELTS(&pivots, trunk_pivot_destroy, context->hid); + VECTOR_APPLY_TO_ELTS(&pivots, trunk_pivot_destroy, PROCESS_PRIVATE_HEAP_ID); vector_deinit(&pivots); return rc; @@ -4860,18 +4873,21 @@ trunk_incorporate_prepare(trunk_context *context, uint64 branch_addr) branch_ref branch = create_branch_ref(branch_addr); bundle_vector inflight; - vector_init(&inflight, context->hid); + vector_init(&inflight, PROCESS_PRIVATE_HEAP_ID); trunk_ondisk_node_ref_vector new_node_refs; - vector_init(&new_node_refs, context->hid); + vector_init(&new_node_refs, PROCESS_PRIVATE_HEAP_ID); trunk_pivot_vector new_pivot; - vector_init(&new_pivot, context->hid); + vector_init(&new_pivot, PROCESS_PRIVATE_HEAP_ID); // Construct a vector of inflight bundles with one singleton bundle for // the new branch. - rc = VECTOR_EMPLACE_APPEND( - &inflight, bundle_init_single, context->hid, NULL_ROUTING_FILTER, branch); + rc = VECTOR_EMPLACE_APPEND(&inflight, + bundle_init_single, + PROCESS_PRIVATE_HEAP_ID, + NULL_ROUTING_FILTER, + branch); if (!SUCCESS(rc)) { platform_error_log( "trunk_incorporate: VECTOR_EMPLACE_APPEND failed: %d\n", rc.r); @@ -4889,8 +4905,10 @@ trunk_incorporate_prepare(trunk_context *context, uint64 branch_addr) } } else { // If there is no root, create an empty one. 
- rc = trunk_node_init_empty_leaf( - &root, context->hid, NEGATIVE_INFINITY_KEY, POSITIVE_INFINITY_KEY); + rc = trunk_node_init_empty_leaf(&root, + PROCESS_PRIVATE_HEAP_ID, + NEGATIVE_INFINITY_KEY, + POSITIVE_INFINITY_KEY); if (!SUCCESS(rc)) { platform_error_log( "trunk_incorporate: node_init_empty_leaf failed: %d\n", rc.r); @@ -5174,7 +5192,7 @@ trunk_ondisk_bundle_merge_lookup(trunk_context *context, if (log) { merge_accumulator ma; - merge_accumulator_init(&ma, context->hid); + merge_accumulator_init(&ma, PROCESS_PRIVATE_HEAP_ID); rc = btree_lookup_and_merge(context->cc, context->cfg->btree_cfg, branch_ref_addr(bndl->branches[idx]), From d6e68e118cef79873248b82c1c156529300d4275 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Wed, 21 Jan 2026 12:21:39 -0800 Subject: [PATCH 8/9] remove space usage printf Signed-off-by: Rob Johnson --- src/platform_linux/shmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/platform_linux/shmalloc.c b/src/platform_linux/shmalloc.c index cf2e9c90..4b1492dc 100644 --- a/src/platform_linux/shmalloc.c +++ b/src/platform_linux/shmalloc.c @@ -458,10 +458,10 @@ shmallocator_init(shmallocator *shm, uint64_t max_allocations, size_t size) void shmallocator_deinit(shmallocator *shm) { - printf("shmallocator_deinit: max needed MiBs: %lu (%lu%%)\n", - B_TO_MiB(shm->max_allocated_end - (void *)shm), - (uint64_t)(shm->max_allocated_end - (void *)shm) * 100 - / shmallocator_size(shm)); + // printf("shmallocator_deinit: max needed MiBs: %lu (%lu%%)\n", + // B_TO_MiB(shm->max_allocated_end - (void *)shm), + // (uint64_t)(shm->max_allocated_end - (void *)shm) * 100 + // / shmallocator_size(shm)); pthread_spin_destroy(&shm->lock); } From 1a01d1fc0a8810048ceaee634f8e2d6514f099e6 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Wed, 21 Jan 2026 20:21:39 -0800 Subject: [PATCH 9/9] delete racey assert Signed-off-by: Rob Johnson --- src/clockcache.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/clockcache.c b/src/clockcache.c index 1188b66c..e428aad5 100644 --- a/src/clockcache.c +++ b/src/clockcache.c @@ -810,8 +810,6 @@ clockcache_try_set_writeback(clockcache *cc, entry_number, cc->cfg->page_capacity); - platform_assert(cc->entry[entry_number].waiters.head == NULL); - volatile uint32 *status = &cc->entry[entry_number].status; if (__sync_bool_compare_and_swap( status, CC_CLEANABLE1_STATUS, CC_WRITEBACK1_STATUS))
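Taken together, the heap changes in this series split allocations between the shmalloc-backed heap (created with use_shmem) and the process-private heap. Below is a minimal usage sketch of that pattern, assuming only the calls and signatures visible in the diffs above (platform_heap_create, TYPED_MALLOC, TYPED_ZALLOC, platform_free, platform_heap_destroy); demo_open() is a hypothetical name used purely for illustration, not part of the patches.

/*
 * Sketch only: long-lived, potentially cross-process state is carved out of
 * the (possibly shared) heap, while transient scratch memory goes to
 * PROCESS_PRIVATE_HEAP_ID, mirroring the spl->heap_id ->
 * PROCESS_PRIVATE_HEAP_ID changes in patches 6 and 7.
 */
static platform_status
demo_open(bool32 use_shmem)
{
   platform_heap_id hid = NULL;

   // 512 MiB is the capacity the updated tests settle on for the new allocator.
   platform_status rc = platform_heap_create(
      platform_get_module_id(), 512 * MiB, use_shmem, &hid);
   if (!SUCCESS(rc)) {
      return rc;
   }

   // Shared/long-lived state comes from the heap handle (shmalloc-backed when
   // use_shmem is set, process-private otherwise). Error handling elided.
   core_config *cfg = TYPED_MALLOC(hid, cfg);

   // Short-lived scratch allocations stay on the process-private heap, which
   // falls back to the plain aligned_alloc/free path.
   core_stats *scratch = TYPED_ZALLOC(PROCESS_PRIVATE_HEAP_ID, scratch);

   /* ... use cfg and scratch ... */

   platform_free(PROCESS_PRIVATE_HEAP_ID, scratch);
   platform_free(hid, cfg);
   platform_heap_destroy(&hid);
   return STATUS_OK;
}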