From d1fc97a1516b1bfe7ee246d279e4a747b54e8f44 Mon Sep 17 00:00:00 2001 From: Hazem Awadallah Date: Tue, 27 Jan 2026 15:42:46 -0800 Subject: [PATCH 01/16] feat(kv-cache): MLPerf v3.0 compliance and configuration overhaul - Add ConfigLoader class with YAML config file support and schema validation - Add cfg() helper function for config-driven parameter access - Add validate_args() with safety limits for protected system paths - Rename all nvme_* metrics to storage_* for MLPerf terminology compliance - Add extended QoS percentiles: P99.9 and P99.99 latency tracking - Add per-tier bandwidth metrics (read/write GB/s per tier) - Add per-tier KV bytes tracking for detailed storage analysis - Fix GPU metadata desync bug via on_eviction_callback pattern - Change eviction from single-shot to iterative loop until space freed - Replace print statements with Python logging module - Add waterfall LRU eviction with configurable high/low watermarks - Add storage_health section with PASS/FAIL criteria - Add storage_throughput_tokens_per_sec as primary MLPerf metric --- kv_cache_benchmark/kv-cache.py | 1114 +++++++++++++++++++++++++------- 1 file changed, 868 insertions(+), 246 deletions(-) diff --git a/kv_cache_benchmark/kv-cache.py b/kv_cache_benchmark/kv-cache.py index 106418a5..70194664 100644 --- a/kv_cache_benchmark/kv-cache.py +++ b/kv_cache_benchmark/kv-cache.py @@ -47,6 +47,17 @@ from collections import defaultdict import argparse import csv +import logging + +# Configure module-level logger +logger = logging.getLogger(__name__) + +# Optional YAML support for config file loading +try: + import yaml + YAML_AVAILABLE = True +except ImportError: + YAML_AVAILABLE = False # Attempt to import optional GPU libraries (torch, cupy) # The benchmark can run in a CPU-only environment if these are not found. @@ -82,6 +93,207 @@ OPENPYXL_AVAILABLE = False +# ============================================================================ +# CONFIGURATION LOADER +# Loads benchmark configuration from YAML files with strict validation. +# ============================================================================ + +class ConfigLoader: + """ + Loads and validates benchmark configuration from YAML files. + + Raises errors on invalid/unknown keys to prevent silent misconfigurations + in MLPerf competition submissions. 
+ """ + + # Define the valid configuration schema with expected types + VALID_SCHEMA = { + 'user_templates': { + 'chatbot': {'context_range': list, 'generation_range': list, 'think_time_range': list}, + 'coding': {'context_range': list, 'generation_range': list, 'think_time_range': list}, + 'document': {'context_range': list, 'generation_range': list, 'think_time_range': list}, + }, + 'generation_timing': { + 'none': (int, float), + 'fast': (int, float), + 'realistic': (int, float), + }, + 'qos_profiles': { + 'interactive': {'target_latency_p95_ms': (int, float), 'target_latency_p99_ms': (int, float), + 'target_latency_p999_ms': (int, float), 'target_latency_p9999_ms': (int, float), 'priority': int}, + 'responsive': {'target_latency_p95_ms': (int, float), 'target_latency_p99_ms': (int, float), + 'target_latency_p999_ms': (int, float), 'target_latency_p9999_ms': (int, float), 'priority': int}, + 'batch': {'target_latency_p95_ms': (int, float), 'target_latency_p99_ms': (int, float), + 'target_latency_p999_ms': (int, float), 'target_latency_p9999_ms': (int, float), 'priority': int}, + }, + 'qos_distribution': { + 'interactive_probability': (int, float), + 'responsive_threshold': (int, float), + }, + 'eviction': { + 'max_recursion_depth': int, + 'target_usage_ratio': (int, float), + 'large_entry_limit_ratio': (int, float), + 'max_evictions_hard_cap': int, + 'max_evictions_min': int, + }, + 'gpu_backend': { + 'memory_fraction': (int, float), + 'max_eviction_attempts': int, + 'free_memory_threshold': (int, float), + }, + 'prefix_cache': { + 'min_prefix_length': int, + 'max_prefix_entries': int, + 'system_prompt_hit_probability': (int, float), + }, + 'rag': { + 'chunk_size_tokens': int, + 'top_k_chunks': int, + 'max_chunk_bytes': int, + }, + 'conversation': { + 'max_conversations': int, + 'max_turns_per_conv': int, + 'end_conversation_probability': (int, float), + }, + 'autoscaler': { + 'min_users': int, + 'max_users': int, + 'scale_up_factor': (int, float), + 'scale_down_factor': (int, float), + 'consecutive_samples_required': int, + }, + 'decode': { + 'batch_size': int, + }, + 'sharegpt': { + 'max_context_tokens': int, + 'max_generation_tokens': int, + 'chars_per_token_estimate': int, + }, + 'saturation_detection': { + 'read_latency_p95_threshold_ms': (int, float), + 'write_latency_p95_threshold_ms': (int, float), + 'queue_depth_threshold': int, + 'history_window_size': int, + }, + 'validation_limits': { + 'max_users': int, + 'max_duration_seconds': int, + 'max_gpu_memory_gb': int, + 'max_cpu_memory_gb': int, + }, + } + + def __init__(self, config_path: Optional[str] = None): + """ + Initialize the ConfigLoader. + + Args: + config_path: Path to YAML config file. If None, uses built-in defaults. + """ + self.config_path = config_path + self.config = {} + + if config_path: + self._load_and_validate(config_path) + + def _load_and_validate(self, config_path: str) -> None: + """Load YAML config and validate strictly against schema.""" + if not YAML_AVAILABLE: + raise RuntimeError("pyyaml is required for config file support. 
Install with: pip install pyyaml") + + path = Path(config_path) + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + + with open(path, 'r') as f: + self.config = yaml.safe_load(f) or {} + + # Validate all keys against schema + self._validate_keys(self.config, self.VALID_SCHEMA, path_prefix='') + + logger.info(f"Loaded configuration from {config_path}") + + def _validate_keys(self, config: dict, schema: dict, path_prefix: str) -> None: + """Recursively validate config keys against schema. Raises on unknown keys.""" + for key, value in config.items(): + full_path = f"{path_prefix}.{key}" if path_prefix else key + + if key not in schema: + raise ValueError(f"Unknown configuration key: '{full_path}'. " + f"Valid keys at this level: {list(schema.keys())}") + + expected_type = schema[key] + + # If schema expects a dict, recurse + if isinstance(expected_type, dict): + if not isinstance(value, dict): + raise ValueError(f"Config key '{full_path}' must be a dict, got {type(value).__name__}") + self._validate_keys(value, expected_type, full_path) + else: + # Validate type + if isinstance(expected_type, tuple): + if not isinstance(value, expected_type): + raise ValueError(f"Config key '{full_path}' must be one of {expected_type}, " + f"got {type(value).__name__}") + elif not isinstance(value, expected_type): + raise ValueError(f"Config key '{full_path}' must be {expected_type.__name__}, " + f"got {type(value).__name__}") + + def get(self, *keys, default=None): + """ + Get a nested configuration value. + + Args: + *keys: Path to the config value (e.g., 'qos_profiles', 'interactive', 'priority') + default: Default value if key not found + + Returns: + The config value or default + """ + value = self.config + for key in keys: + if isinstance(value, dict) and key in value: + value = value[key] + else: + return default + return value + + +# Global config instance (set from main() when --config is provided) +_global_config: Optional[ConfigLoader] = None + + +def get_config() -> Optional[ConfigLoader]: + """Get the global configuration loader instance.""" + return _global_config + + +def set_config(config: ConfigLoader) -> None: + """Set the global configuration loader instance.""" + global _global_config + _global_config = config + + +def cfg(*keys, default=None): + """ + Get a configuration value from the global config, with fallback to default. + + Args: + *keys: Path to the config value (e.g., 'qos_profiles', 'interactive', 'priority') + default: Default value if config not loaded or key not found + + Returns: + The config value or default + """ + config = get_config() + if config is None: + return default + return config.get(*keys, default=default) + + # ============================================================================ # CORE DATA MODELS # Defines the basic data structures used throughout the benchmark. @@ -213,8 +425,10 @@ class QoSSLA: Defines the performance targets and tracks violations. """ qos_level: QoSLevel - target_latency_p95_ms: float # The 95th percentile latency target. - target_latency_p99_ms: float # The 99th percentile latency target. + target_latency_p95_ms: float # The 95th percentile latency target. + target_latency_p99_ms: float # The 99th percentile latency target. + target_latency_p999_ms: float # The 99.9th percentile latency target (3 nines). + target_latency_p9999_ms: float # The 99.99th percentile latency target (4 nines). priority: int # An integer priority level (higher is more important). 
# SLA violation tracking @@ -229,29 +443,46 @@ def sla_compliance(self) -> float: return 1.0 - (self.violations / self.total_requests) -# Pre-defined QoS profiles mapping each level to a specific SLA. -QOS_PROFILES = { - QoSLevel.INTERACTIVE: QoSSLA( - qos_level=QoSLevel.INTERACTIVE, - target_latency_p95_ms=50, - target_latency_p99_ms=100, - priority=3 - ), - QoSLevel.RESPONSIVE: QoSSLA( - qos_level=QoSLevel.RESPONSIVE, - target_latency_p95_ms=100, - target_latency_p99_ms=200, - priority=2 - ), - QoSLevel.BATCH: QoSSLA( - qos_level=QoSLevel.BATCH, - target_latency_p95_ms=1000, - target_latency_p99_ms=5000, - priority=1 - ) +# Default QoS profile values (overridden by config.yaml when loaded) +_DEFAULT_QOS_PROFILES = { + 'interactive': {'target_latency_p95_ms': 50, 'target_latency_p99_ms': 100, + 'target_latency_p999_ms': 150, 'target_latency_p9999_ms': 200, 'priority': 3}, + 'responsive': {'target_latency_p95_ms': 100, 'target_latency_p99_ms': 200, + 'target_latency_p999_ms': 350, 'target_latency_p9999_ms': 500, 'priority': 2}, + 'batch': {'target_latency_p95_ms': 1000, 'target_latency_p99_ms': 5000, + 'target_latency_p999_ms': 7500, 'target_latency_p9999_ms': 10000, 'priority': 1}, } +def get_qos_profiles() -> Dict[QoSLevel, QoSSLA]: + """ + Returns QoS profiles, using config.yaml values if loaded, otherwise defaults. + """ + profiles = {} + for level in QoSLevel: + level_key = level.value # 'interactive', 'responsive', 'batch' + defaults = _DEFAULT_QOS_PROFILES[level_key] + + profiles[level] = QoSSLA( + qos_level=level, + target_latency_p95_ms=cfg('qos_profiles', level_key, 'target_latency_p95_ms', + default=defaults['target_latency_p95_ms']), + target_latency_p99_ms=cfg('qos_profiles', level_key, 'target_latency_p99_ms', + default=defaults['target_latency_p99_ms']), + target_latency_p999_ms=cfg('qos_profiles', level_key, 'target_latency_p999_ms', + default=defaults['target_latency_p999_ms']), + target_latency_p9999_ms=cfg('qos_profiles', level_key, 'target_latency_p9999_ms', + default=defaults['target_latency_p9999_ms']), + priority=cfg('qos_profiles', level_key, 'priority', default=defaults['priority']), + ) + return profiles + + +# For backward compatibility, QOS_PROFILES can still be used as a dict +# but code should prefer get_qos_profiles() to pick up config changes +QOS_PROFILES = get_qos_profiles() + + @dataclass class UserProfile: """Represents a simulated user with specific behavior patterns.""" @@ -352,10 +583,10 @@ class ConversationState: class ConversationManager: """Manages the lifecycle of all multi-turn conversations and enables cache reuse.""" - def __init__(self, max_conversations: int = 1000, max_turns_per_conv: int = 50): + def __init__(self, max_conversations: int = None, max_turns_per_conv: int = None): self.conversations: Dict[str, ConversationState] = {} - self.max_conversations = max_conversations - self.max_turns_per_conv = max_turns_per_conv + self.max_conversations = max_conversations if max_conversations is not None else cfg('conversation', 'max_conversations', default=1000) + self.max_turns_per_conv = max_turns_per_conv if max_turns_per_conv is not None else cfg('conversation', 'max_turns_per_conv', default=50) self.lock = threading.Lock() # Protects access to the shared conversations dictionary. 
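
# The config-driven defaults above all resolve the same way: an explicit argument
# wins, otherwise cfg() walks the loaded YAML by key path, otherwise the hard-coded
# default applies. A minimal standalone sketch of that nested lookup, mirroring the
# behaviour of ConfigLoader.get(); the dictionary values are illustrative only:

def _nested_cfg_get(config: dict, *keys, default=None):
    """Walk a nested dict by key path, returning `default` on any miss."""
    value = config
    for key in keys:
        if isinstance(value, dict) and key in value:
            value = value[key]
        else:
            return default
    return value

_demo_cfg = {'conversation': {'max_conversations': 250}}
assert _nested_cfg_get(_demo_cfg, 'conversation', 'max_conversations', default=1000) == 250
assert _nested_cfg_get(_demo_cfg, 'conversation', 'max_turns_per_conv', default=50) == 50
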
def start_conversation(self, user_id: str, system_prompt: Optional[str] = None) -> str: @@ -535,8 +766,8 @@ class PrefixMatcher: "You are a professional writing assistant.", ] - def __init__(self, min_prefix_length: int = 50): - self.min_prefix_length = min_prefix_length + def __init__(self, min_prefix_length: int = None): + self.min_prefix_length = min_prefix_length if min_prefix_length is not None else cfg('prefix_cache', 'min_prefix_length', default=50) self.prefix_index: Dict[str, PrefixCacheEntry] = {} self.prefix_frequency: Dict[str, int] = {} self.lock = threading.Lock() @@ -548,8 +779,9 @@ def hash_prefix(self, text: str, token_count: int) -> str: def detect_system_prompt(self, context_tokens: int) -> Optional[PrefixCacheEntry]: """Simulates the detection of a common system prompt at the start of a request.""" - # In this simulation, 20% of requests are assumed to start with a common system prompt. - if random.random() < 0.2: + # Probability of requests having a common system prompt (configurable, default 20%). + system_prompt_hit_probability = cfg('prefix_cache', 'system_prompt_hit_probability', default=0.2) + if random.random() < system_prompt_hit_probability: system_prompt = random.choice(self.COMMON_SYSTEM_PROMPTS) prefix_hash = self.hash_prefix(system_prompt, len(system_prompt.split())) @@ -578,9 +810,9 @@ def detect_system_prompt(self, context_tokens: int) -> Optional[PrefixCacheEntry class PrefixCacheManager: """Orchestrates the prefix matching and caching logic.""" - def __init__(self, cache, max_prefix_entries: int = 1000): + def __init__(self, cache, max_prefix_entries: int = None): self.cache = cache # A reference to the main MultiTierCache. - self.max_prefix_entries = max_prefix_entries + self.max_prefix_entries = max_prefix_entries if max_prefix_entries is not None else cfg('prefix_cache', 'max_prefix_entries', default=1000) self.prefix_matcher = PrefixMatcher() self.lock = threading.Lock() @@ -672,10 +904,10 @@ def total_context_tokens(self) -> int: class RAGDocumentManager: """Manages the ingestion and retrieval of RAG document chunks.""" - def __init__(self, cache, chunk_size: int = 512, top_k_chunks: int = 5): + def __init__(self, cache, chunk_size: int = None, top_k_chunks: int = None): self.cache = cache # A reference to the main MultiTierCache. - self.chunk_size = chunk_size - self.top_k_chunks = top_k_chunks + self.chunk_size = chunk_size if chunk_size is not None else cfg('rag', 'chunk_size_tokens', default=512) + self.top_k_chunks = top_k_chunks if top_k_chunks is not None else cfg('rag', 'top_k_chunks', default=5) self.documents: Dict[str, RAGDocument] = {} self.chunk_index: Dict[str, RAGChunk] = {} @@ -685,12 +917,12 @@ def ingest_document(self, doc_id: str, total_tokens: int, model_config: ModelCon This involves splitting it into chunks and pre-calculating and storing the KV cache for each chunk in the multi-tier cache. """ - max_chunk_bytes = 256 * 1024**2 # Target ~256MB per chunk to limit memory pressure. 
+ max_chunk_bytes = cfg('rag', 'max_chunk_bytes', default=256 * 1024**2) # Target ~256MB per chunk bytes_per_token = max(model_config.kv_cache_size_per_token, 1) max_tokens_per_chunk = max(1, min(self.chunk_size, max_chunk_bytes // bytes_per_token)) if max_tokens_per_chunk < self.chunk_size: - print(f"[RAG] Adjusting chunk size for {doc_id} to {max_tokens_per_chunk} tokens " + logger.debug(f"Adjusting chunk size for {doc_id} to {max_tokens_per_chunk} tokens " f"to stay under {max_chunk_bytes / 1024**2:.0f} MB per chunk.") num_chunks = (total_tokens + max_tokens_per_chunk - 1) // max_tokens_per_chunk @@ -721,14 +953,14 @@ def ingest_document(self, doc_id: str, total_tokens: int, model_config: ModelCon num_tokens=chunk_tokens ) except MemoryError: - print(f"[RAG] MemoryError while ingesting chunk {chunk.chunk_id}; skipping remaining chunks.") + logger.error(f"MemoryError while ingesting chunk {chunk.chunk_id}; skipping remaining chunks.") break except Exception as exc: - print(f"[RAG] Error ingesting chunk {chunk.chunk_id}: {exc}") + logger.error(f"Error ingesting chunk {chunk.chunk_id}: {exc}") continue if not success: - print(f"[RAG] Warning: Failed to allocate cache for chunk {chunk.chunk_id}.") + logger.warning(f"Failed to allocate cache for chunk {chunk.chunk_id}.") continue chunk.storage_tier = location @@ -808,14 +1040,26 @@ class GPUMemoryBackend(StorageBackend): Uses PyTorch or CuPy for GPU operations. This is the fastest tier. """ - def __init__(self, use_torch=True): + def __init__(self, use_torch=True, on_eviction_callback=None): + """ + Initialize the GPU memory backend. + + Args: + use_torch: Whether to use PyTorch (vs CuPy) for GPU operations. + on_eviction_callback: Optional callback function called when entries are evicted + during OOM handling. Signature: callback(key: str, tier: str) + This allows the parent CacheManager to sync its metadata. + """ + self.on_eviction_callback = on_eviction_callback + if use_torch and TORCH_AVAILABLE: self.backend = 'torch' self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if self.device.type == 'cpu': raise RuntimeError("No GPU available for PyTorch backend") # Pre-allocate a large chunk of GPU memory to simulate a real server environment. - torch.cuda.set_per_process_memory_fraction(0.8, 0) + memory_fraction = cfg('gpu_backend', 'memory_fraction', default=0.8) + torch.cuda.set_per_process_memory_fraction(memory_fraction, 0) torch.cuda.empty_cache() elif CUPY_AVAILABLE: self.backend = 'cupy' @@ -832,16 +1076,65 @@ def write(self, key: str, data: np.ndarray) -> StorageBackend.IOTiming: Writes a NumPy array from CPU to GPU VRAM. Uses pinned memory and non-blocking transfers for maximum performance. """ - # Simple eviction mechanism if GPU runs out of memory. + # FIX: Iterative eviction mechanism for GPU OOM handling. + # The original code only evicted ONE entry which is insufficient for large allocations. + # We now evict multiple entries until there's enough space or we've exhausted options. 
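
# A standalone sketch of the evict-until-it-fits pattern used by the loop that
# follows: a plain insertion-ordered dict stands in for the GPU cache and an
# integer for torch.cuda.mem_get_info()[0]; sizes and headroom are illustrative:

def _evict_until_fits(cache: dict, required: int, free_bytes: int, headroom: float = 0.9) -> int:
    """Pop oldest entries (insertion order) until `required` fits under headroom."""
    evicted = 0
    while required > free_bytes * headroom and cache:
        oldest_key = next(iter(cache))        # dicts preserve insertion order
        free_bytes += cache.pop(oldest_key)   # value = entry size in bytes
        evicted += 1
    return evicted

_demo_gpu_cache = {'k0': 400, 'k1': 300, 'k2': 300}
assert _evict_until_fits(_demo_gpu_cache, required=900, free_bytes=500) == 2
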
if self.backend == 'torch' and torch.cuda.is_available(): - free_memory = torch.cuda.mem_get_info()[0] - if data.nbytes > free_memory * 0.9: + required_bytes = data.nbytes + max_eviction_attempts = cfg('gpu_backend', 'max_eviction_attempts', default=100) + eviction_count = 0 + # Threshold for free memory (inverted: 0.1 means keep 10% free, so use 90%) + free_memory_threshold = cfg('gpu_backend', 'free_memory_threshold', default=0.1) + usable_fraction = 1.0 - free_memory_threshold # e.g., 0.9 if threshold is 0.1 + + while eviction_count < max_eviction_attempts: + free_memory = torch.cuda.mem_get_info()[0] + # Use configurable threshold to leave headroom + if required_bytes <= free_memory * usable_fraction: + break # We have enough space + + # Try clearing the CUDA cache first torch.cuda.empty_cache() - if data.nbytes > torch.cuda.mem_get_info()[0] * 0.9: - if len(self.cache) > 0: - oldest_key = list(self.cache.keys())[0] - del self.cache[oldest_key] - torch.cuda.empty_cache() + free_memory = torch.cuda.mem_get_info()[0] + if required_bytes <= free_memory * usable_fraction: + break + + # If no entries to evict, we're out of options + if len(self.cache) == 0: + # Log warning and let the allocation proceed (it may OOM) + logger.warning( + f"GPU OOM: Need {required_bytes / 1024**2:.1f}MB, " + f"have {free_memory / 1024**2:.1f}MB, no entries to evict" + ) + break + + # Evict the oldest entry (first key in dict, which is insertion-ordered) + oldest_key = next(iter(self.cache)) + evicted_tensor = self.cache.pop(oldest_key) + evicted_size = evicted_tensor.element_size() * evicted_tensor.nelement() + del evicted_tensor + + # Also clean up pinned memory if present + if oldest_key in self.pinned_memory: + del self.pinned_memory[oldest_key] + + # Notify parent CacheManager to sync its metadata + if self.on_eviction_callback: + try: + self.on_eviction_callback(oldest_key, 'gpu', evicted_size) + except Exception as e: + logger.warning(f"GPU eviction callback failed for {oldest_key}: {e}") + + eviction_count += 1 + logger.debug( + f"GPU eviction #{eviction_count}: evicted {oldest_key} " + f"({evicted_size / 1024**2:.1f}MB)" + ) + + # Final cache clear after evictions + if eviction_count > 0: + torch.cuda.empty_cache() + logger.debug(f"GPU: evicted {eviction_count} entries to make room for {key}") start = time.perf_counter() @@ -1063,8 +1356,7 @@ def clear(self): def __del__(self): """Cleans up the temporary directory when the object is destroyed.""" if self.temp_dir: - import shutil - shutil.rmtree(self.temp_dir, ignore_errors=True) + self.temp_dir.cleanup() class KVCacheGenerator: @@ -1080,7 +1372,7 @@ def __init__(self, model_config: ModelConfig, global_seed: Optional[int] = None) self.buffer_size_elements = 128 * 1024 * 1024 # 128 million elements (~256MB for float16) self.dtype = np.float16 if 'float16' in self.model_config.dtype else np.float32 - print(f"[KVCacheGenerator] Pre-generating {self.buffer_size_elements * 2 / 1024**2:.0f} MB noise buffer...") + logger.info(f"Pre-generating {self.buffer_size_elements * 2 / 1024**2:.0f} MB noise buffer...") rng = np.random.default_rng(self.global_seed) self.precomputed_buffer = rng.uniform(-1.0, 1.0, size=self.buffer_size_elements).astype(self.dtype) @@ -1165,9 +1457,13 @@ def __init__(self, self.backends = {} try: if TORCH_AVAILABLE or CUPY_AVAILABLE: - self.backends['gpu'] = GPUMemoryBackend(use_torch=TORCH_AVAILABLE) + # Pass eviction callback to sync metadata when GPU OOM forces evictions + self.backends['gpu'] = GPUMemoryBackend( + 
use_torch=TORCH_AVAILABLE, + on_eviction_callback=self._handle_gpu_eviction + ) except Exception as e: - print(f"Warning: Could not initialize GPU backend: {e}") + logger.warning(f"Could not initialize GPU backend: {e}") self.backends['cpu'] = CPUMemoryBackend() self.backends['nvme'] = NVMeBackend(base_path=cache_dir) @@ -1193,22 +1489,41 @@ def __init__(self, self.allocation_semaphore = None # Dictionary for collecting a wide range of performance metrics. + # NAMING CONVENTION (MLPerf v3.0): + # - "storage" refers to the NVMe/SSD tier (was "nvme" in earlier versions) + # - "tier_X_kv_bytes_written" = KV cache bytes written to tier X + # - "tier_X_kv_bytes_read" = KV cache bytes read from tier X self.stats = { 'cache_hits': 0, 'cache_misses': 0, 'evictions': 0, - 'offloads_cpu': 0, # Prefills that went directly to CPU. - 'offloads_nvme': 0, # Prefills that went directly to NVMe. + 'offloads_cpu': 0, # Writes that went directly to CPU tier. + 'offloads_storage': 0, # Writes that went directly to Storage tier. # Latency lists for each tier and operation. - 'gpu_read_latencies': [], 'cpu_read_latencies': [], 'nvme_read_latencies': [], - 'gpu_write_latencies': [], 'cpu_write_latencies': [], 'nvme_write_latencies': [], - 'nvme_read_device_latencies': [], 'nvme_read_host_latencies': [], - 'nvme_write_device_latencies': [], 'nvme_write_host_latencies': [], - - # Phase-specific I/O metrics. + # + # LATENCY TERMINOLOGY: + # - Total latency = Host + Device latency (full operation time) + # - Host latency = CPU/memory work (serialization, copying, page cache ops) + # - Device latency = Actual storage I/O (fsync for writes, file read for reads) + # + # For Storage tier (NVMe/SSD): + # Write: host = np.save() time, device = fsync() time + # Read: host = page cache drop + array copy, device = np.load() time + # + 'gpu_read_latencies': [], 'cpu_read_latencies': [], 'storage_read_latencies': [], + 'gpu_write_latencies': [], 'cpu_write_latencies': [], 'storage_write_latencies': [], + # Storage-tier-specific breakdown (device = disk I/O, host = serialization) + 'storage_read_device_latencies': [], 'storage_read_host_latencies': [], + 'storage_write_device_latencies': [], 'storage_write_host_latencies': [], + + # Phase-specific I/O metrics (aggregate - kept for backward compatibility). 'prefill_writes': 0, 'decode_reads': 0, - 'prefill_bytes_written': 0, 'decode_bytes_read': 0, + + # Tier-specific KV cache bytes (NEW NAMING - MLPerf v3.0) + # Written = data stored to tier, Read = data retrieved from tier + 'tier_gpu_kv_bytes_written': 0, 'tier_cpu_kv_bytes_written': 0, 'tier_storage_kv_bytes_written': 0, + 'tier_gpu_kv_bytes_read': 0, 'tier_cpu_kv_bytes_read': 0, 'tier_storage_kv_bytes_read': 0, # Cache type metrics for analyzing hit sources. 'system_prompt_hits': 0, 'common_phrase_hits': 0, @@ -1218,8 +1533,8 @@ def __init__(self, 'total_read_bytes': 0, 'total_write_bytes': 0, 'read_operations': 0, 'write_operations': 0, - # New counter for NVMe tokens processed (for throughput assessment) - 'nvme_tokens_processed': 0, + # Counter for storage tier tokens processed (for throughput assessment) + 'storage_tokens_processed': 0, } def _get_entry_lock(self, key: str) -> threading.Lock: @@ -1229,6 +1544,33 @@ def _get_entry_lock(self, key: str) -> threading.Lock: self.entry_locks[key] = threading.Lock() return self.entry_locks[key] + def _handle_gpu_eviction(self, key: str, tier: str, evicted_size: int) -> None: + """ + Callback invoked by GPUMemoryBackend when it evicts entries during OOM handling. 
+ + This syncs the CacheManager's metadata with the actual GPU cache state. + Without this callback, cache_entries would still reference evicted entries, + causing KeyErrors on subsequent read attempts. + + Args: + key: The cache key that was evicted + tier: The tier from which eviction occurred (always 'gpu' for this callback) + evicted_size: Size in bytes of the evicted entry + """ + with self.metadata_lock: + if key in self.cache_entries: + del self.cache_entries[key] + if key in self.entry_locks: + del self.entry_locks[key] + + with self.memory_lock: + self.gpu_memory_used = max(0, self.gpu_memory_used - evicted_size) + + with self.stats_lock: + self.stats['evictions'] += 1 + + logger.debug(f"GPU eviction synced: removed {key} from cache metadata") + # ======================================================================== # WATERFALL LRU EVICTION METHODS # These methods implement a hierarchical cache eviction strategy where @@ -1347,39 +1689,24 @@ def _demote_entry(self, key: str, from_tier: str, to_tier: str) -> Tuple[bool, f if to_tier == 'cpu': self.stats['offloads_cpu'] += 1 elif to_tier == 'nvme': - self.stats['offloads_nvme'] += 1 - # Track tokens processed for NVMe throughput calculation - # Assuming size is bytes, and we know dtype size from model config - # But simpler: we can estimate tokens from size if needed, or just track bytes - # The user asked for 'nvme_tokens_processed'. - # We can approximate tokens = size / (2 * layers * heads * dim * dtype_size) - # Or just use the 'num_tokens' if we had it. - # Since we don't have num_tokens easily here without looking up the key again or storing it, - # let's look at the entry dict which should have it if we stored it. - # The current cache_entries dict stores: 'location', 'size', 'last_access', 'access_count'. - # It does NOT store num_tokens. - # However, size is directly proportional. - # Let's just track bytes for now and convert later if needed, OR - # better yet, let's add num_tokens to the cache entry metadata in allocate_cache. - # For now, to fix the immediate request without changing data structures too much: - # We will estimate tokens based on size. - # size = num_tokens * layers * 2 * heads * dim * 2 (for float16) - # so num_tokens = size / (layers * 4 * heads * dim) - bytes_per_token = ( - self.model_config.num_layers * - 2 * # K and V - self.model_config.kv_heads * - self.model_config.kv_dim_per_head * - 2 # float16 bytes - ) - tokens = int(size / bytes_per_token) - self.stats['nvme_tokens_processed'] += tokens + self.stats['offloads_storage'] += 1 + # Track tokens processed for Storage tier throughput calculation + # FIX: Use pre-computed property to avoid integer overflow on 32-bit systems. + # The ModelConfig.kv_cache_size_per_token property already computes this correctly. + # Python 3's // operator uses arbitrary-precision integers, avoiding overflow. 
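
# Worked numbers for the bytes-per-token product behind ModelConfig.kv_cache_size_per_token,
# using an assumed 32-layer / 32-KV-head / head-dim-128 / float16 shape (illustrative;
# the benchmark takes the real values from its ModelConfig):

_layers, _kv_heads, _head_dim, _dtype_bytes = 32, 32, 128, 2
_bytes_per_token = _layers * 2 * _kv_heads * _head_dim * _dtype_bytes   # K and V planes
assert _bytes_per_token == 524_288                                      # 512 KiB per token
_entry_bytes = 256 * 1024**2                                            # a 256 MiB cache entry
assert _entry_bytes // _bytes_per_token == 512                          # tokens recovered via // division
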
+ bytes_per_token = self.model_config.kv_cache_size_per_token + if bytes_per_token > 0: + # Pure integer division - Python 3 int has unlimited precision + tokens = size // bytes_per_token + self.stats['storage_tokens_processed'] += tokens + else: + logger.warning("bytes_per_token is 0, skipping token count update") total_latency = read_timing.total + write_timing.total return True, total_latency except Exception as e: - print(f"[KVCache] Failed to demote {key} from {from_tier} to {to_tier}: {e}") + logger.error(f"Failed to demote {key} from {from_tier} to {to_tier}: {e}") return False, 0.0 def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: int = 0) -> bool: @@ -1405,10 +1732,10 @@ def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: if tier == 'nvme': return True - # Safety limit to prevent runaway eviction cascades - max_recursion = 10 + # Safety limit to prevent runaway eviction cascades (configurable) + max_recursion = cfg('eviction', 'max_recursion_depth', default=10) if recursion_depth > max_recursion: - print(f"[KVCache] Warning: Hit recursion limit in _ensure_space_in_tier") + logger.warning("Hit recursion limit in _ensure_space_in_tier") return False tier_order = self._get_tier_order() @@ -1422,28 +1749,31 @@ def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: return False limit = self._get_tier_limit(tier) - target_usage = limit * 0.8 # Keep 20% buffer consistent with original code + target_usage_ratio = cfg('eviction', 'target_usage_ratio', default=0.8) + target_usage = limit * target_usage_ratio # Keep buffer consistent with config # If the entry is larger than the tier can physically hold, skip to next tier - if required_bytes > limit * 0.95: # Allow up to 95% for a single large entry + large_entry_limit_ratio = cfg('eviction', 'large_entry_limit_ratio', default=0.95) + if required_bytes > limit * large_entry_limit_ratio: return False # Calculate a reasonable eviction limit based on tier capacity. # For large models (e.g., 70B), entries can be hundreds of MB each, # so we may need to evict many entries to make room for one large request. - # Use the number of entries in the tier as a guide, with a minimum of 1000. + # Use the number of entries in the tier as a guide, with a minimum from config. entries_in_tier = len(self._get_lru_entries_in_tier(tier)) # FIX: Cap the max evictions to prevent infinite loops if we can't clear enough space # The previous logic could loop forever if entries_in_tier kept growing or didn't reduce fast enough. - # We set a hard cap of 5000 or slightly more than current entries. 
- max_evictions_per_call = min(5000, max(1000, entries_in_tier + 100)) + max_evictions_hard_cap = cfg('eviction', 'max_evictions_hard_cap', default=5000) + max_evictions_min = cfg('eviction', 'max_evictions_min', default=1000) + max_evictions_per_call = min(max_evictions_hard_cap, max(max_evictions_min, entries_in_tier + 100)) eviction_count = 0 while eviction_count < max_evictions_per_call: # Check if we have enough space now with self.memory_lock: current_usage = self._get_tier_usage(tier) - # Normal case: fit within the 80% target + # Normal case: fit within the target if current_usage + required_bytes <= target_usage: # FIX: Atomic Reservation # We must reserve the space NOW, inside the lock, to prevent other threads @@ -1451,8 +1781,8 @@ def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: self._update_tier_usage(tier, required_bytes) return True - # Large entry case: if we've cleared the tier, allow up to 95% of limit - if current_usage < limit * 0.05 and required_bytes <= limit * 0.95: + # Large entry case: if we've cleared the tier, allow up to large_entry_limit_ratio of limit + if current_usage < limit * 0.05 and required_bytes <= limit * large_entry_limit_ratio: # FIX: Atomic Reservation here too self._update_tier_usage(tier, required_bytes) return True @@ -1500,7 +1830,7 @@ def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: # Recursively ensure the next tier has space for this entry if not self._ensure_space_in_tier(next_tier, lru_size, recursion_depth + 1): - print(f"[KVCache] Warning: Could not make space in {next_tier} for demotion") + logger.warning(f"Could not make space in {next_tier} for demotion") # If we can't move the LRU item, we can't make space. # We should probably abort to avoid spinning. return False @@ -1555,10 +1885,10 @@ def _allocate_cache_inner(self, key: str, num_tokens: int, phase: InferencePhase try: data = self.generator.generate(sequence_length=num_tokens, key=key) except MemoryError: - print(f"[KVCache] MemoryError generating cache for key {key} ({num_tokens} tokens)") + logger.error(f"MemoryError generating cache for key {key} ({num_tokens} tokens)") return False, 'none', 0.0 except Exception as exc: - print(f"[KVCache] Failed to generate cache for key {key}: {exc}") + logger.error(f"Failed to generate cache for key {key}: {exc}") return False, 'none', 0.0 size_bytes = data.nbytes @@ -1567,7 +1897,6 @@ def _allocate_cache_inner(self, key: str, num_tokens: int, phase: InferencePhase with self.stats_lock: if phase == InferencePhase.PREFILL: self.stats['prefill_writes'] += 1 - self.stats['prefill_bytes_written'] += size_bytes self.stats['write_operations'] += 1 self.stats['total_write_bytes'] += size_bytes @@ -1608,7 +1937,7 @@ def _allocate_cache_inner(self, key: str, num_tokens: int, phase: InferencePhase allocated_tier = tier break - # Final fallback to NVMe if all else fails + # Final fallback to storage tier if all else fails if allocated_tier is None: allocated_tier = 'nvme' @@ -1630,17 +1959,23 @@ def _allocate_cache_inner(self, key: str, num_tokens: int, phase: InferencePhase 'access_count': 1 } - # Record latency and offload stats. + # Record latency, offload stats, and tier-specific KV bytes written. 
with self.stats_lock: + # Map internal tier name to stats key ('nvme' -> 'storage') + tier_stats_name = 'storage' if allocated_tier == 'nvme' else allocated_tier + + # Track KV bytes written per tier + self.stats[f'tier_{tier_stats_name}_kv_bytes_written'] += size_bytes + if allocated_tier == 'cpu': self.stats['offloads_cpu'] += 1 self.stats['cpu_write_latencies'].append(timing.total) elif allocated_tier == 'nvme': - self.stats['offloads_nvme'] += 1 - self.stats['nvme_write_latencies'].append(timing.total) - self.stats['nvme_write_device_latencies'].append(timing.device) - self.stats['nvme_write_host_latencies'].append(timing.host) - self.stats['nvme_tokens_processed'] += num_tokens + self.stats['offloads_storage'] += 1 + self.stats['storage_write_latencies'].append(timing.total) + self.stats['storage_write_device_latencies'].append(timing.device) + self.stats['storage_write_host_latencies'].append(timing.host) + self.stats['storage_tokens_processed'] += num_tokens elif allocated_tier == 'gpu': self.stats['gpu_write_latencies'].append(timing.total) @@ -1698,15 +2033,20 @@ def access_cache(self, key: str, phase: InferencePhase = InferencePhase.DECODE, elif cache_type == 'multi_turn': self.stats['multi_turn_hits'] += 1 else: self.stats['user_cache_hits'] += 1 - # Track phase-specific I/O. + # Map internal tier name to stats key ('nvme' -> 'storage') + tier_stats_name = 'storage' if location == 'nvme' else location + + # Track KV bytes read per tier + self.stats[f'tier_{tier_stats_name}_kv_bytes_read'] += entry_size + + # Track aggregate decode reads if phase == InferencePhase.DECODE: self.stats['decode_reads'] += 1 - self.stats['decode_bytes_read'] += entry_size self.stats['read_operations'] += 1 self.stats['total_read_bytes'] += entry_size - # Perform the actual read from the correct backend (GPU, CPU, or NVMe). + # Perform the actual read from the correct backend (GPU, CPU, or Storage). try: _, timing = self.backends[location].read(key) @@ -1717,16 +2057,15 @@ def access_cache(self, key: str, phase: InferencePhase = InferencePhase.DECODE, elif location == 'cpu': self.stats['cpu_read_latencies'].append(timing.total) else: - self.stats['nvme_read_latencies'].append(timing.total) - self.stats['nvme_read_device_latencies'].append(timing.device) - self.stats['nvme_read_host_latencies'].append(timing.host) + self.stats['storage_read_latencies'].append(timing.total) + self.stats['storage_read_device_latencies'].append(timing.device) + self.stats['storage_read_host_latencies'].append(timing.host) - #The access_cache function already retrieves the size of the entry in bytes: entry_size = entry['size']. - #The number of tokens can be calculated by dividing entry_size by the size of a single token's KV cache, which is available via self.model_config.kv_cache_size_per_token. - #This calculation should happen only when the read is from the 'nvme' tier. + # Calculate tokens from entry size for throughput assessment. + # This calculation applies only for the storage tier. 
if self.model_config.kv_cache_size_per_token > 0: num_tokens = entry_size / self.model_config.kv_cache_size_per_token - self.stats['nvme_tokens_processed'] += num_tokens + self.stats['storage_tokens_processed'] += num_tokens return location, timing.total except Exception as e: @@ -1743,10 +2082,10 @@ def _evaluate_storage_performance(self, duration: float) -> Dict: # Throughput-focused profile for MLPerf submission if self.performance_profile == 'throughput': - # Criterion: Throughput should be based on tokens processed by the NVMe tier. - nvme_tokens = self.stats.get('nvme_tokens_processed', 0) + # Criterion: Throughput should be based on tokens processed by the storage tier. + storage_tokens = self.stats.get('storage_tokens_processed', 0) # Correctly use the benchmark's full duration for an accurate tok/s calculation. - throughput = nvme_tokens / duration if duration > 0 else 0 + throughput = storage_tokens / duration if duration > 0 else 0 passed = throughput > 0 # Simple check to ensure it ran criteria.append({ @@ -1763,29 +2102,33 @@ def _evaluate_storage_performance(self, duration: float) -> Dict: } # Latency-focused profile (default) - # Criterion 1: NVMe Write P95 latency should be less than 500ms. - nvme_write_device = self.stats.get('nvme_write_device_latencies', []) - nvme_write_total = self.stats.get('nvme_write_latencies', []) - nvme_write_basis = nvme_write_device if nvme_write_device else nvme_write_total - if nvme_write_basis: - nvme_write_p95 = np.percentile(nvme_write_basis, 95) * 1000 - passed = nvme_write_p95 < 500 + # Criterion 1: Storage tier Write Device P95 latency should be less than 500ms. + # "Device" = actual disk I/O (fsync), excludes serialization overhead. + storage_write_device = self.stats.get('storage_write_device_latencies', []) + storage_write_total = self.stats.get('storage_write_latencies', []) + storage_write_basis = storage_write_device if storage_write_device else storage_write_total + latency_type = 'Device' if storage_write_device else 'Total' + if storage_write_basis: + storage_write_p95 = np.percentile(storage_write_basis, 95) * 1000 + passed = storage_write_p95 < 500 criteria.append({ - 'name': 'NVMe Write P95 < 500ms', - 'target': 500, 'actual': nvme_write_p95, 'unit': 'ms', 'passed': passed + 'name': f'Storage Tier Write {latency_type} P95 < 500ms', + 'target': 500, 'actual': storage_write_p95, 'unit': 'ms', 'passed': passed }) all_passed = all_passed and passed - # Criterion 2: NVMe Read P95 latency should be less than 200ms. - nvme_read_device = self.stats.get('nvme_read_device_latencies', []) - nvme_read_total = self.stats.get('nvme_read_latencies', []) - nvme_read_basis = nvme_read_device if nvme_read_device else nvme_read_total - if nvme_read_basis: - nvme_read_p95 = np.percentile(nvme_read_basis, 95) * 1000 - passed = nvme_read_p95 < 200 + # Criterion 2: Storage tier Read Device P95 latency should be less than 200ms. + # "Device" = actual disk I/O (np.load), excludes page cache/copy overhead. 
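
# Sketch of how a latency series is reduced to the percentiles used in these criteria
# and in get_stats(), including the extended P99.9 / P99.99 points; the sample data is
# synthetic, while the 200 ms target is this patch's storage-read criterion:

import numpy as np

_lat_s = np.random.default_rng(0).gamma(shape=2.0, scale=0.010, size=10_000)      # synthetic latencies, seconds
_p95, _p99, _p999, _p9999 = np.percentile(_lat_s, [95, 99, 99.9, 99.99]) * 1000   # convert to milliseconds
_read_criterion_passed = _p95 < 200   # storage-tier read device P95 target
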
+ storage_read_device = self.stats.get('storage_read_device_latencies', []) + storage_read_total = self.stats.get('storage_read_latencies', []) + storage_read_basis = storage_read_device if storage_read_device else storage_read_total + latency_type = 'Device' if storage_read_device else 'Total' + if storage_read_basis: + storage_read_p95 = np.percentile(storage_read_basis, 95) * 1000 + passed = storage_read_p95 < 200 criteria.append({ - 'name': 'NVMe Read P95 < 200ms', - 'target': 200, 'actual': nvme_read_p95, 'unit': 'ms', 'passed': passed + 'name': f'Storage Tier Read {latency_type} P95 < 200ms', + 'target': 200, 'actual': storage_read_p95, 'unit': 'ms', 'passed': passed }) all_passed = all_passed and passed @@ -1841,22 +2184,45 @@ def get_stats(self, duration: float) -> Dict: # Get the pass/fail assessment. storage_health = self._evaluate_storage_performance(duration) + # Calculate per-tier bandwidth (GB/s) + tier_gpu_read_bytes = self.stats['tier_gpu_kv_bytes_read'] + tier_gpu_write_bytes = self.stats['tier_gpu_kv_bytes_written'] + tier_cpu_read_bytes = self.stats['tier_cpu_kv_bytes_read'] + tier_cpu_write_bytes = self.stats['tier_cpu_kv_bytes_written'] + tier_storage_read_bytes = self.stats['tier_storage_kv_bytes_read'] + tier_storage_write_bytes = self.stats['tier_storage_kv_bytes_written'] + stats = { 'cache_hit_rate': hit_rate, 'cache_hits': stats_snapshot['cache_hits'], 'cache_misses': stats_snapshot['cache_misses'], 'gpu_entries': gpu_entries, 'cpu_entries': cpu_entries, - 'nvme_entries': nvme_entries, + 'storage_entries': nvme_entries, # Renamed from nvme_entries 'gpu_memory_used_gb': gpu_mem_used / 1024**3, 'cpu_memory_used_gb': cpu_mem_used / 1024**3, 'offloads_cpu': stats_snapshot['offloads_cpu'], - 'offloads_nvme': stats_snapshot['offloads_nvme'], + 'offloads_storage': stats_snapshot['offloads_storage'], # Renamed from offloads_nvme 'storage_health': storage_health, 'prefill_writes': self.stats['prefill_writes'], 'decode_reads': self.stats['decode_reads'], - 'prefill_bytes_written_gb': self.stats['prefill_bytes_written'] / 1024**3, - 'decode_bytes_read_gb': self.stats['decode_bytes_read'] / 1024**3, + + # Tier-specific KV cache bytes (NEW NAMING - MLPerf v3.0) + 'tier_gpu_kv_bytes_written_gb': tier_gpu_write_bytes / 1024**3, + 'tier_cpu_kv_bytes_written_gb': tier_cpu_write_bytes / 1024**3, + 'tier_storage_kv_bytes_written_gb': tier_storage_write_bytes / 1024**3, + 'tier_gpu_kv_bytes_read_gb': tier_gpu_read_bytes / 1024**3, + 'tier_cpu_kv_bytes_read_gb': tier_cpu_read_bytes / 1024**3, + 'tier_storage_kv_bytes_read_gb': tier_storage_read_bytes / 1024**3, + + # Per-tier bandwidth metrics (GB/s) + 'tier_gpu_read_bandwidth_gbps': (tier_gpu_read_bytes / 1024**3) / duration if duration > 0 else 0, + 'tier_gpu_write_bandwidth_gbps': (tier_gpu_write_bytes / 1024**3) / duration if duration > 0 else 0, + 'tier_cpu_read_bandwidth_gbps': (tier_cpu_read_bytes / 1024**3) / duration if duration > 0 else 0, + 'tier_cpu_write_bandwidth_gbps': (tier_cpu_write_bytes / 1024**3) / duration if duration > 0 else 0, + 'tier_storage_read_bandwidth_gbps': (tier_storage_read_bytes / 1024**3) / duration if duration > 0 else 0, + 'tier_storage_write_bandwidth_gbps': (tier_storage_write_bytes / 1024**3) / duration if duration > 0 else 0, + 'system_prompt_hits': self.stats['system_prompt_hits'], 'common_phrase_hits': self.stats['common_phrase_hits'], 'user_cache_hits': self.stats['user_cache_hits'], @@ -1868,33 +2234,41 @@ def get_stats(self, duration: float) -> Dict: 'read_write_ratio': 
self.stats['total_read_bytes'] / max(self.stats['total_write_bytes'], 1), 'read_iops': self.stats['read_operations'], 'write_iops': self.stats['write_operations'], - 'nvme_tokens_processed': self.stats['nvme_tokens_processed'], + 'storage_tokens_processed': self.stats['storage_tokens_processed'], } - # Add latency percentiles for each tier. - for tier in ['gpu', 'cpu', 'nvme']: + # Add latency percentiles for each tier (including p99.9 and p99.99). + # Map internal tier names to output names ('nvme' -> 'storage') + tier_mapping = {'gpu': 'gpu', 'cpu': 'cpu', 'nvme': 'storage'} + for internal_tier, output_tier in [('gpu', 'gpu'), ('cpu', 'cpu'), ('storage', 'storage')]: for op in ['read', 'write']: - latencies = self.stats[f'{tier}_{op}_latencies'] + latencies = self.stats.get(f'{internal_tier}_{op}_latencies', []) if latencies: lat_array = np.array(latencies) - stats[f'{tier}_{op}_p50_ms'] = np.percentile(lat_array, 50) * 1000 - stats[f'{tier}_{op}_p95_ms'] = np.percentile(lat_array, 95) * 1000 - stats[f'{tier}_{op}_p99_ms'] = np.percentile(lat_array, 99) * 1000 + stats[f'{output_tier}_{op}_p50_ms'] = np.percentile(lat_array, 50) * 1000 + stats[f'{output_tier}_{op}_p95_ms'] = np.percentile(lat_array, 95) * 1000 + stats[f'{output_tier}_{op}_p99_ms'] = np.percentile(lat_array, 99) * 1000 + stats[f'{output_tier}_{op}_p999_ms'] = np.percentile(lat_array, 99.9) * 1000 + stats[f'{output_tier}_{op}_p9999_ms'] = np.percentile(lat_array, 99.99) * 1000 - # Expose NVMe latency component breakdowns when present. + # Expose storage tier latency component breakdowns when present. for op in ['read', 'write']: - device_latencies = self.stats[f'nvme_{op}_device_latencies'] - host_latencies = self.stats[f'nvme_{op}_host_latencies'] + device_latencies = self.stats.get(f'storage_{op}_device_latencies', []) + host_latencies = self.stats.get(f'storage_{op}_host_latencies', []) if device_latencies: device_array = np.array(device_latencies) - stats[f'nvme_{op}_device_p50_ms'] = np.percentile(device_array, 50) * 1000 - stats[f'nvme_{op}_device_p95_ms'] = np.percentile(device_array, 95) * 1000 - stats[f'nvme_{op}_device_p99_ms'] = np.percentile(device_array, 99) * 1000 + stats[f'storage_{op}_device_p50_ms'] = np.percentile(device_array, 50) * 1000 + stats[f'storage_{op}_device_p95_ms'] = np.percentile(device_array, 95) * 1000 + stats[f'storage_{op}_device_p99_ms'] = np.percentile(device_array, 99) * 1000 + stats[f'storage_{op}_device_p999_ms'] = np.percentile(device_array, 99.9) * 1000 + stats[f'storage_{op}_device_p9999_ms'] = np.percentile(device_array, 99.99) * 1000 if host_latencies: host_array = np.array(host_latencies) - stats[f'nvme_{op}_host_p50_ms'] = np.percentile(host_array, 50) * 1000 - stats[f'nvme_{op}_host_p95_ms'] = np.percentile(host_array, 95) * 1000 - stats[f'nvme_{op}_host_p99_ms'] = np.percentile(host_array, 99) * 1000 + stats[f'storage_{op}_host_p50_ms'] = np.percentile(host_array, 50) * 1000 + stats[f'storage_{op}_host_p95_ms'] = np.percentile(host_array, 95) * 1000 + stats[f'storage_{op}_host_p99_ms'] = np.percentile(host_array, 99) * 1000 + stats[f'storage_{op}_host_p999_ms'] = np.percentile(host_array, 99.9) * 1000 + stats[f'storage_{op}_host_p9999_ms'] = np.percentile(host_array, 99.99) * 1000 return stats @@ -1975,23 +2349,32 @@ def collect_metrics(self, cache, queue_size): write_iops = int((write_delta / (16 * 1024)) / elapsed) if elapsed > 0 else 0 # Default to 0.0 if the keys don't exist (e.g., at the start of the run). 
- read_latency_p95_ms = stats.get('nvme_read_p95_ms', 0.0) - write_latency_p95_ms = stats.get('nvme_write_p95_ms', 0.0) + read_latency_p95_ms = stats.get('storage_read_p95_ms', 0.0) + write_latency_p95_ms = stats.get('storage_write_p95_ms', 0.0) # --- Saturation Detection Logic --- + # Read thresholds from config (with fallback to original hardcoded values) + read_lat_threshold = cfg('saturation_detection', 'read_latency_p95_threshold_ms', default=100) + write_lat_threshold = cfg('saturation_detection', 'write_latency_p95_threshold_ms', default=50) + queue_depth_threshold = cfg('saturation_detection', 'queue_depth_threshold', default=100) + is_saturated = False if len(self.metrics_history) >= 2: # Compare with the previous metric prev_metric = self.metrics_history[-2] - if (prev_metric.read_latency_p95_ms < 100 and prev_metric.write_latency_p95_ms < 50 and prev_metric.queue_depth < 100): + if (prev_metric.read_latency_p95_ms < read_lat_threshold and + prev_metric.write_latency_p95_ms < write_lat_threshold and + prev_metric.queue_depth < queue_depth_threshold): # If the previous metric was not saturated, check for a sudden increase in latency or queue depth if (abs(prev_metric.read_latency_p95_ms - read_latency_p95_ms) > 20 or abs(prev_metric.write_latency_p95_ms - write_latency_p95_ms) > 10 or abs(prev_metric.queue_depth - queue_depth) > 10): is_saturated = True else: - # If the previous metric was saturated, check if it's still above the thresholds - if (read_latency_p95_ms > 120 or write_latency_p95_ms > 60 or queue_depth > 120): + # If the previous metric was saturated, check if it's still above the thresholds (with 20% margin) + if (read_latency_p95_ms > read_lat_threshold * 1.2 or + write_latency_p95_ms > write_lat_threshold * 1.2 or + queue_depth > queue_depth_threshold * 1.2): is_saturated = True # Create a new StorageMetrics object for this sample @@ -2067,8 +2450,11 @@ def __init__(self, self.current_users = initial_users self.target_saturation = target_saturation self.scale_interval = scale_interval_seconds - self.min_users = 1 - self.max_users = 10000 + self.min_users = cfg('autoscaler', 'min_users', default=1) + self.max_users = cfg('autoscaler', 'max_users', default=10000) + self.scale_up_factor = cfg('autoscaler', 'scale_up_factor', default=1.2) + self.scale_down_factor = cfg('autoscaler', 'scale_down_factor', default=0.8) + self.consecutive_samples_required = cfg('autoscaler', 'consecutive_samples_required', default=2) self.scaling_history = [] self.lock = threading.Lock() @@ -2171,7 +2557,7 @@ def _calculate_capacity_action(self, current_throughput: float) -> Tuple[str, in self.downward_trend_count += 1 if self.downward_trend_count >= 2: self.capacity_test_finished = True - print(f"INFO: Peak capacity found at {self.peak_throughput:.2f} tok/s. Stopping test.") + logger.info(f"Peak capacity found at {self.peak_throughput:.2f} tok/s. Stopping test.") return 'stop', self.current_users return 'hold', self.current_users @@ -2331,8 +2717,8 @@ def validate_benchmark(self, benchmark_results: Dict) -> Dict: class UserSimulator: """Generates realistic user workloads based on pre-defined templates.""" - # Templates for different user personas (chatbot, coding, document analysis). - USER_TEMPLATES = { + # Default templates for different user personas (can be overridden from config). 
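
# A condensed, single-signal sketch of the saturation hysteresis configured in
# collect_metrics() above: a previously healthy sample flags saturation on a sudden
# latency jump, and an already-saturated sample stays flagged until it falls back
# under the 20% margin. The 100 ms threshold mirrors this patch's default and the
# 20 ms jump window is the patch's existing constant:

def _is_saturated(prev_p95_ms: float, cur_p95_ms: float, threshold_ms: float = 100.0) -> bool:
    """Single-signal version of the read-latency saturation rule."""
    if prev_p95_ms < threshold_ms:
        return abs(prev_p95_ms - cur_p95_ms) > 20      # sudden jump while previously healthy
    return cur_p95_ms > threshold_ms * 1.2             # stay flagged until back under the margin

assert _is_saturated(prev_p95_ms=60.0, cur_p95_ms=95.0) is True     # +35 ms jump
assert _is_saturated(prev_p95_ms=150.0, cur_p95_ms=110.0) is False  # recovered below 120 ms
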
+ DEFAULT_USER_TEMPLATES = { 'chatbot': { 'context_range': (256, 1024), 'generation_range': (50, 150), 'think_time_range': (0.1, 0.5), }, @@ -2344,11 +2730,25 @@ class UserSimulator: }, } + @classmethod + def _get_user_templates(cls) -> Dict: + """Get user templates from config, falling back to defaults.""" + templates = {} + for user_type in ['chatbot', 'coding', 'document']: + default = cls.DEFAULT_USER_TEMPLATES[user_type] + templates[user_type] = { + 'context_range': tuple(cfg('user_templates', user_type, 'context_range', default=list(default['context_range']))), + 'generation_range': tuple(cfg('user_templates', user_type, 'generation_range', default=list(default['generation_range']))), + 'think_time_range': tuple(cfg('user_templates', user_type, 'think_time_range', default=list(default['think_time_range']))), + } + return templates + @classmethod def generate_user(cls, user_id: str, user_type: str = 'chatbot', priority: int = 1, qos_level: QoSLevel = QoSLevel.BATCH) -> UserProfile: """Generates a single user profile based on a template.""" - template = cls.USER_TEMPLATES.get(user_type, cls.USER_TEMPLATES['chatbot']) + templates = cls._get_user_templates() + template = templates.get(user_type, templates['chatbot']) return UserProfile( user_id=user_id, context_length=random.randint(*template['context_range']), @@ -2361,16 +2761,19 @@ def generate_user(cls, user_id: str, user_type: str = 'chatbot', priority: int = @classmethod def generate_mixed_users(cls, num_users: int) -> List[UserProfile]: """Generates a list of users with a realistic distribution of types and QoS levels.""" + # Read QoS distribution from config + interactive_prob = cfg('qos_distribution', 'interactive_probability', default=0.15) + responsive_threshold = cfg('qos_distribution', 'responsive_threshold', default=0.50) + users = [] for i in range(num_users): user_type = random.choice(['chatbot', 'coding', 'document']) - # Simulate a realistic QoS distribution. - # 15% Interactive, 35% Responsive, 50% Batch. + # Simulate a realistic QoS distribution from config. 
rand = random.random() - if rand < 0.15: + if rand < interactive_prob: qos_level, priority = QoSLevel.INTERACTIVE, 3 - elif rand < 0.50: + elif rand < responsive_threshold: qos_level, priority = QoSLevel.RESPONSIVE, 2 else: qos_level, priority = QoSLevel.BATCH, 1 @@ -2413,7 +2816,7 @@ def __init__(self, dataset_path: str, max_conversations: int = 1000, seed: Optio def _load_dataset(self): """Load and process the ShareGPT dataset.""" if not os.path.exists(self.dataset_path): - print(f"[ShareGPT] Warning: Dataset not found at {self.dataset_path}") + logger.warning(f"Dataset not found at {self.dataset_path}") return try: @@ -2426,7 +2829,7 @@ def _load_dataset(self): pass if tokenizer is None: - print("[ShareGPT] Tiktoken not available, using approximate token counting") + logger.info("Tiktoken not available, using approximate token counting") with open(self.dataset_path, 'r', encoding='utf-8') as f: data = json.load(f) @@ -2504,12 +2907,12 @@ def _load_dataset(self): 'total_turns': sum(len(c['turns']) for c in self.conversations) } - print(f"[ShareGPT] Loaded {len(self.conversations)} conversations with {self.token_stats['total_turns']} turns") - print(f"[ShareGPT] Context tokens: mean={self.token_stats['context_mean']:.1f}, p50={self.token_stats['context_p50']:.1f}, p95={self.token_stats['context_p95']:.1f}") - print(f"[ShareGPT] Generation tokens: mean={self.token_stats['generation_mean']:.1f}, p50={self.token_stats['generation_p50']:.1f}, p95={self.token_stats['generation_p95']:.1f}") + logger.info(f"Loaded {len(self.conversations)} conversations with {self.token_stats['total_turns']} turns") + logger.info(f"Context tokens: mean={self.token_stats['context_mean']:.1f}, p50={self.token_stats['context_p50']:.1f}, p95={self.token_stats['context_p95']:.1f}") + logger.info(f"Generation tokens: mean={self.token_stats['generation_mean']:.1f}, p50={self.token_stats['generation_p50']:.1f}, p95={self.token_stats['generation_p95']:.1f}") except Exception as e: - print(f"[ShareGPT] Error loading dataset: {e}") + logger.error(f"Error loading dataset: {e}") self.conversations = [] def get_random_conversation(self) -> Optional[Dict]: @@ -2656,7 +3059,7 @@ def __init__(self, def _ingest_rag_documents(self, num_docs: int, stop_event: Optional[threading.Event] = None): """Ingests RAG documents for the workload.""" - print(f"Ingesting {num_docs} RAG documents...") + logger.info(f"Ingesting {num_docs} RAG documents...") for i in range(num_docs): if stop_event and stop_event.is_set(): break @@ -2675,7 +3078,7 @@ def _ingest_rag_documents(self, num_docs: int, stop_event: Optional[threading.Ev def _load_burst_trace(self): """Loads requests from the BurstGPT CSV trace file.""" if not self.burst_trace_path: - print("Error: --use-burst-trace flag requires --burst-trace-path to be set.") + logger.error("--use-burst-trace flag requires --burst-trace-path to be set.") sys.exit(1) try: with open(self.burst_trace_path, 'r', encoding='utf-8') as f: @@ -2687,12 +3090,12 @@ def _load_burst_trace(self): self.burst_requests.append((context_tokens, generate_tokens)) except (ValueError, KeyError): continue - print(f"Loaded {len(self.burst_requests)} requests from BurstGPT trace.") + logger.info(f"Loaded {len(self.burst_requests)} requests from BurstGPT trace.") except FileNotFoundError: - print(f"Error: Trace file not found at {self.burst_trace_path}") + logger.error(f"Trace file not found at {self.burst_trace_path}") sys.exit(1) except Exception as e: - print(f"Error reading trace file: {e}") + logger.error(f"Error reading 
trace file: {e}") sys.exit(1) def _generate_requests_from_trace(self, stop_event: threading.Event): @@ -2700,7 +3103,7 @@ def _generate_requests_from_trace(self, stop_event: threading.Event): request_index = 0 while not stop_event.is_set(): if not self.burst_requests: - print("Warning: BurstGPT trace is empty. No requests to generate.") + logger.warning("BurstGPT trace is empty. No requests to generate.") time.sleep(1) continue @@ -2748,7 +3151,7 @@ def _generate_requests_from_trace(self, stop_event: threading.Event): def _generate_requests_from_dataset(self, stop_event: threading.Event): """Generates InferenceRequest objects from the loaded ShareGPT dataset.""" if not self.sharegpt_loader or not self.sharegpt_loader.conversations: - print("Warning: ShareGPT dataset is empty or not loaded. Falling back to synthetic workload.") + logger.warning("ShareGPT dataset is empty or not loaded. Falling back to synthetic workload.") # Fall back to synthetic generation users = UserSimulator.generate_mixed_users(self.num_users) self.generate_requests(users, stop_event) @@ -2777,11 +3180,14 @@ def _generate_requests_from_dataset(self, stop_event: threading.Event): req_id = self.request_counter self.request_counter += 1 - # Assign QoS level based on request characteristics + # Assign QoS level based on request characteristics (from config) + interactive_prob = cfg('qos_distribution', 'interactive_probability', default=0.15) + responsive_threshold = cfg('qos_distribution', 'responsive_threshold', default=0.50) + rand = random.random() - if rand < 0.15: + if rand < interactive_prob: qos_level, priority = QoSLevel.INTERACTIVE, 3 - elif rand < 0.50: + elif rand < responsive_threshold: qos_level, priority = QoSLevel.RESPONSIVE, 2 else: qos_level, priority = QoSLevel.BATCH, 1 @@ -2961,13 +3367,15 @@ def process_requests(self, stop_event: threading.Event): with self.results_lock: self.results['prefill_latencies'].append(write_latency) # 4. Simulate a RAG operation by reading random chunk caches. - # NOTE: Check that documents exist to avoid race condition with RAG ingestion thread - if self.rag_manager and self.rag_manager.documents and random.random() < 0.1: # 10% of requests are RAG queries - doc_id = random.choice(list(self.rag_manager.documents.keys())) - chunks = self.rag_manager.retrieve_chunks(doc_id) - for chunk in chunks: # Read the KV cache for each retrieved chunk. - _, read_lat = self.cache.access_cache(chunk.kv_cache_key, InferencePhase.DECODE) - storage_latency += read_lat + # NOTE: Capture document keys atomically to avoid race condition with RAG ingestion thread + if self.rag_manager and random.random() < 0.1: # 10% of requests are RAG queries + doc_keys = list(self.rag_manager.documents.keys()) if self.rag_manager.documents else [] + if doc_keys: + doc_id = random.choice(doc_keys) + chunks = self.rag_manager.retrieve_chunks(doc_id) + for chunk in chunks: # Read the KV cache for each retrieved chunk. + _, read_lat = self.cache.access_cache(chunk.kv_cache_key, InferencePhase.DECODE) + storage_latency += read_lat # 5. Perform the DECODE operation (a cache READ). if request.phase == InferencePhase.DECODE or request.phase == InferencePhase.PREFILL_DECODE: @@ -2982,7 +3390,7 @@ def process_requests(self, stop_event: threading.Event): storage_latency += write_latency else: # Simulate realistic decode I/O: reads are batched, not per-token. 
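
# Worked example of the ceiling-division batching computed just below; the token
# count is illustrative, the batch size matches this patch's default of 32:

_generate_tokens, _decode_batch_size = 100, 32
_num_batched_reads = max(1, (_generate_tokens + _decode_batch_size - 1) // _decode_batch_size)
assert _num_batched_reads == 4      # 100 generated tokens -> 4 batched reads, not 100 per-token reads
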
- decode_batch_size = 32 + decode_batch_size = cfg('decode', 'batch_size', default=32) num_batched_reads = max(1, (request.generate_tokens + decode_batch_size - 1) // decode_batch_size) for _ in range(num_batched_reads): _, batch_read_latency = self.cache.access_cache(request.cache_key, InferencePhase.DECODE, cache_type) @@ -3062,9 +3470,9 @@ def monitor_stats(self, stop_event: threading.Event): 'throughput_tokens_per_sec': throughput } self.autoscaler.scaling_history.append(log_entry) - print(f"Autoscaler {action} -> {self.num_users} users (saturation: {saturation_level:.2f})") + logger.info(f"Autoscaler {action} -> {self.num_users} users (saturation: {saturation_level:.2f})") elif action == 'stop': - print("Autoscaler requested stop after reaching capacity peak.") + logger.info("Autoscaler requested stop after reaching capacity peak.") stop_event.set() log_entry = { 'timestamp': datetime.now().isoformat(), @@ -3083,7 +3491,7 @@ def monitor_stats(self, stop_event: threading.Event): if now - last_log_time >= 10: self._calculate_stats() queue_depth = self.request_queue.qsize() - print(f"Time: {int(elapsed)}s, Users: {self.num_users}, Queue: {queue_depth}, " + logger.info(f"Time: {int(elapsed)}s, Users: {self.num_users}, Queue: {queue_depth}, " f"Throughput: {throughput:.2f} tok/s") last_log_time = now @@ -3177,7 +3585,7 @@ def run(self) -> Dict: def _calculate_stats(self, actual_duration: float = None): """Calculate final statistics with all feature breakdowns""" if not self.results['end_to_end_latencies']: - print("\nNo requests completed during benchmark!") + logger.warning("No requests completed during benchmark!") return # Use actual duration if provided (for max_requests mode), else configured duration @@ -3218,18 +3626,24 @@ def _calculate_stats(self, actual_duration: float = None): 'p50': np.percentile(e2e, 50) * 1000, 'p95': np.percentile(e2e, 95) * 1000, 'p99': np.percentile(e2e, 99) * 1000, + 'p999': np.percentile(e2e, 99.9) * 1000, + 'p9999': np.percentile(e2e, 99.99) * 1000, }, 'storage_io_latency_ms': { 'mean': np.mean(storage) * 1000, 'p50': np.percentile(storage, 50) * 1000, 'p95': np.percentile(storage, 95) * 1000, 'p99': np.percentile(storage, 99) * 1000, + 'p999': np.percentile(storage, 99.9) * 1000, + 'p9999': np.percentile(storage, 99.99) * 1000, }, 'generation_latency_ms': { 'mean': np.mean(generation) * 1000, 'p50': np.percentile(generation, 50) * 1000, 'p95': np.percentile(generation, 95) * 1000, 'p99': np.percentile(generation, 99) * 1000, + 'p999': np.percentile(generation, 99.9) * 1000, + 'p9999': np.percentile(generation, 99.99) * 1000, }, 'cache_stats': cache_stats, 'qos_metrics': qos_metrics, @@ -3274,10 +3688,6 @@ def _print_summary(self, summary: Dict): - Phase-specific metrics (prefill/decode) - QoS compliance by service tier - Validation results if available - Note: - The symbols âœ" and ✗ are intended to be checkmark (✓) and cross (✗) - characters for pass/fail indicators but may display incorrectly due to - encoding issues. 
""" """Print comprehensive results summary""" print("\n" + "=" * 80) @@ -3285,15 +3695,19 @@ def _print_summary(self, summary: Dict): print(f"Generation Mode: {self.generation_mode.value} ({self.ms_per_token:.1f}ms/token)") print("=" * 80) + # Use ASCII-safe symbols for pass/fail indicators + PASS_SYMBOL = "[OK]" + FAIL_SYMBOL = "[X]" + cache_stats = summary['cache_stats'] if 'storage_health' in cache_stats: storage_health = cache_stats['storage_health'] status = storage_health['overall_status'] - status_symbol = '✓' if status == 'PASS' else '✗' + status_symbol = PASS_SYMBOL if status == 'PASS' else FAIL_SYMBOL print(f"\n### STORAGE PERFORMANCE ASSESSMENT: {status} {status_symbol} ###") print(f" Criteria Passed: {storage_health['passed_count']}/{storage_health['total_count']}") for criterion in storage_health['criteria']: - symbol = '✓' if criterion['passed'] else '✗' + symbol = PASS_SYMBOL if criterion['passed'] else FAIL_SYMBOL unit = criterion.get('unit', '') if unit == 'ratio': print(f" {symbol} {criterion['name']}: {criterion['actual']:.1%} (target: {criterion['target']:.1%})") @@ -3323,17 +3737,18 @@ def _print_summary(self, summary: Dict): print(f"Throughput (storage I/O): {summary['storage_throughput_tokens_per_sec']:.2f} tokens/sec") print(f"Requests/sec: {summary['requests_per_second']:.2f}") - print(f"\n### END-TO-END LATENCY (Storage I/O + Token Generation) ###") + print(f"\n### END-TO-END LATENCY (Queue Wait + Storage I/O + Generation) ###") print(f" Mean: {summary['end_to_end_latency_ms']['mean']:.2f} ms") print(f" P50: {summary['end_to_end_latency_ms']['p50']:.2f} ms") print(f" P95: {summary['end_to_end_latency_ms']['p95']:.2f} ms") print(f" P99: {summary['end_to_end_latency_ms']['p99']:.2f} ms") - print(f"\n### STORAGE I/O LATENCY (Primary Metric) ###") + print(f"\n### PER-REQUEST STORAGE LATENCY (All I/O ops for one request) ###") print(f" Mean: {summary['storage_io_latency_ms']['mean']:.2f} ms") print(f" P50: {summary['storage_io_latency_ms']['p50']:.2f} ms") print(f" P95: {summary['storage_io_latency_ms']['p95']:.2f} ms") print(f" P99: {summary['storage_io_latency_ms']['p99']:.2f} ms") + print(f" (= 1 prefill write + N decode reads per request)") if self.generation_mode != GenerationMode.NONE: print(f"\n### TOKEN GENERATION LATENCY (Simulated @ {self.ms_per_token:.1f}ms/token) ###") @@ -3352,20 +3767,46 @@ def _print_summary(self, summary: Dict): print(f"\n### CACHE TIER DISTRIBUTION ###") print(f" GPU Entries: {cache_stats['gpu_entries']} ({cache_stats['gpu_memory_used_gb']:.2f} GB)") print(f" CPU Entries: {cache_stats['cpu_entries']} ({cache_stats['cpu_memory_used_gb']:.2f} GB)") - print(f" NVMe Entries: {cache_stats['nvme_entries']}") - - print(f"\n### PHASE-SPECIFIC METRICS ###") - print(f" Prefill Writes: {cache_stats['prefill_writes']}") - print(f" Prefill Bytes Written: {cache_stats['prefill_bytes_written_gb']:.2f} GB") - print(f" Decode Reads: {cache_stats['decode_reads']}") - print(f" Decode Bytes Read: {cache_stats['decode_bytes_read_gb']:.2f} GB") - - print(f"\n### TIER-SPECIFIC LATENCIES ###") - for tier in ['gpu', 'cpu', 'nvme']: + print(f" Storage Entries: {cache_stats['storage_entries']}") + + print(f"\n### TIER-SPECIFIC KV BYTES ###") + # GPU tier + if cache_stats.get('tier_gpu_kv_bytes_written_gb', 0) > 0: + print(f" GPU KV Bytes Written: {cache_stats['tier_gpu_kv_bytes_written_gb']:.2f} GB") + if cache_stats.get('tier_gpu_kv_bytes_read_gb', 0) > 0: + print(f" GPU KV Bytes Read: {cache_stats['tier_gpu_kv_bytes_read_gb']:.2f} GB") + # CPU tier + if 
cache_stats.get('tier_cpu_kv_bytes_written_gb', 0) > 0: + print(f" CPU KV Bytes Written: {cache_stats['tier_cpu_kv_bytes_written_gb']:.2f} GB") + if cache_stats.get('tier_cpu_kv_bytes_read_gb', 0) > 0: + print(f" CPU KV Bytes Read: {cache_stats['tier_cpu_kv_bytes_read_gb']:.2f} GB") + # Storage tier + if cache_stats.get('tier_storage_kv_bytes_written_gb', 0) > 0: + print(f" Storage KV Bytes Written: {cache_stats['tier_storage_kv_bytes_written_gb']:.2f} GB") + if cache_stats.get('tier_storage_kv_bytes_read_gb', 0) > 0: + print(f" Storage KV Bytes Read: {cache_stats['tier_storage_kv_bytes_read_gb']:.2f} GB") + + print(f"\n### TIER-SPECIFIC LATENCIES (Total = Host + Device) ###") + for tier in ['gpu', 'cpu', 'storage']: for op in ['read', 'write']: p95_key = f'{tier}_{op}_p95_ms' if p95_key in cache_stats: - print(f" {tier.upper()} {op.title()} P95: {cache_stats[p95_key]:.2f} ms") + tier_label = 'Storage' if tier == 'storage' else tier.upper() + print(f" {tier_label} {op.title()} P95 (Total): {cache_stats[p95_key]:.2f} ms") + + # Storage tier Device vs Host latency breakdown (most important for storage benchmarks) + print(f"\n### STORAGE TIER LATENCY BREAKDOWN (Device = Disk I/O, Host = Serialization) ###") + for op in ['read', 'write']: + device_key = f'storage_{op}_device_p95_ms' + host_key = f'storage_{op}_host_p95_ms' + total_key = f'storage_{op}_p95_ms' + if device_key in cache_stats: + print(f" Storage {op.title()}:") + print(f" - Device P95 (Disk I/O): {cache_stats[device_key]:.2f} ms") + if host_key in cache_stats: + print(f" - Host P95 (Serialization): {cache_stats[host_key]:.2f} ms") + if total_key in cache_stats: + print(f" - Total P95: {cache_stats[total_key]:.2f} ms") print(f"\n### CACHE TYPE BREAKDOWNS ###") print(f" System Prompt Hits: {cache_stats['system_prompt_hits']}") @@ -3397,7 +3838,7 @@ def _print_summary(self, summary: Dict): print(f" Latency P95: {metrics['latency_ms']['p95']:.2f} ms") print(f" Latency P99: {metrics['latency_ms']['p99']:.2f} ms") if 'sla' in metrics: - sla_met = '✓' if metrics['sla']['met'] else '✗' + sla_met = '[OK]' if metrics['sla']['met'] else '[X]' print(f" SLA Met: {sla_met} (compliance: {metrics['sla']['compliance']:.1%})") if summary.get('autoscaling_stats'): @@ -3412,7 +3853,7 @@ def _print_summary(self, summary: Dict): if 'validation' in self.results: print(f"\n### VALIDATION ###") validation = self.results['validation'] - print(f" Validation: {'PASSED ✓' if validation['passed'] else 'FAILED ✗'}") + print(f" Validation: {'PASSED [OK]' if validation['passed'] else 'FAILED [X]'}") print(f" Average Error: {validation['avg_error_pct']:.2f}%") print("\n" + "=" * 80) @@ -3424,9 +3865,112 @@ def _print_summary(self, summary: Dict): print("=" * 80) +# ============================================================================ +# INPUT VALIDATION +# Validates command-line arguments before benchmark execution. 
+# ============================================================================ + +# Validation constants with documented rationale +MAX_USERS = 100000 # Reasonable upper limit for simulated users +MAX_DURATION_SECONDS = 86400 # 24 hours - prevents runaway benchmarks +MAX_GPU_MEMORY_GB = 1024 # 1TB - covers even the largest GPU clusters +MAX_CPU_MEMORY_GB = 16384 # 16TB - covers high-memory server configurations + +# System directories that should never be used as cache directories +FORBIDDEN_CACHE_PREFIXES = frozenset([ + '/etc', '/bin', '/sbin', '/usr/bin', '/usr/sbin', + '/boot', '/sys', '/proc', '/dev', '/root' +]) + + +def validate_args(args: argparse.Namespace) -> argparse.Namespace: + """ + Validate command-line arguments to catch invalid values early. + + Args: + args: Parsed argparse namespace + + Returns: + The validated args namespace + + Raises: + ValueError: If any validation check fails + """ + errors = [] + + # Validate positive integers + if args.num_users <= 0: + errors.append(f"--num-users must be positive, got {args.num_users}") + if args.num_users > MAX_USERS: + errors.append(f"--num-users exceeds limit ({MAX_USERS}), got {args.num_users}") + + if args.duration <= 0: + errors.append(f"--duration must be positive, got {args.duration}") + if args.duration > MAX_DURATION_SECONDS: + errors.append(f"--duration exceeds 24 hours ({MAX_DURATION_SECONDS}s), got {args.duration}") + + # Validate memory sizes + if args.gpu_mem_gb < 0: + errors.append(f"--gpu-mem-gb cannot be negative, got {args.gpu_mem_gb}") + if args.gpu_mem_gb > MAX_GPU_MEMORY_GB: + errors.append(f"--gpu-mem-gb exceeds limit ({MAX_GPU_MEMORY_GB}GB), got {args.gpu_mem_gb}") + + if args.cpu_mem_gb < 0: + errors.append(f"--cpu-mem-gb cannot be negative, got {args.cpu_mem_gb}") + if args.cpu_mem_gb > MAX_CPU_MEMORY_GB: + errors.append(f"--cpu-mem-gb exceeds limit ({MAX_CPU_MEMORY_GB}GB), got {args.cpu_mem_gb}") + + # Validate optional integers + if args.rag_num_docs < 0: + errors.append(f"--rag-num-docs cannot be negative, got {args.rag_num_docs}") + + if args.max_conversations <= 0: + errors.append(f"--max-conversations must be positive, got {args.max_conversations}") + + if args.max_concurrent_allocs < 0: + errors.append(f"--max-concurrent-allocs cannot be negative, got {args.max_concurrent_allocs}") + + if args.request_rate < 0: + errors.append(f"--request-rate cannot be negative, got {args.request_rate}") + + if args.max_requests < 0: + errors.append(f"--max-requests cannot be negative, got {args.max_requests}") + + # Validate target_saturation range + if not (0.0 <= args.target_saturation <= 1.0): + errors.append(f"--target-saturation must be between 0.0 and 1.0, got {args.target_saturation}") + + # Validate cache directory if provided + if args.cache_dir: + # Resolve symlinks to prevent bypass attacks + cache_path = Path(args.cache_dir).resolve() + cache_path_str = str(cache_path) + + # Check for forbidden system directories + for prefix in FORBIDDEN_CACHE_PREFIXES: + if cache_path_str.startswith(prefix): + errors.append(f"--cache-dir cannot be a system directory: {cache_path}") + break + + # Check if parent directory is writable (if it exists) + parent = cache_path.parent + if parent.exists() and not os.access(parent, os.W_OK): + errors.append(f"--cache-dir parent is not writable: {parent}") + + if errors: + for error in errors: + logger.error(f"Validation error: {error}") + raise ValueError(f"Invalid arguments:\n " + "\n ".join(errors)) + + return args + + def main(): """Main entry point for running the 
benchmark from the command line.""" parser = argparse.ArgumentParser(description="Integrated Multi-User KV Cache Benchmark") + parser.add_argument('--log-level', type=str, default='INFO', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Set the logging level (default: INFO)') parser.add_argument('--model', type=str, default='llama3.1-8b', choices=MODEL_CONFIGS.keys(), help='The model configuration to use.') parser.add_argument('--num-users', type=int, default=100, @@ -3481,11 +4025,33 @@ def main(): parser.add_argument('--xlsx-output', type=str, default=None, help='Optional: Output Excel file path for summary results with run parameters. ' 'Requires pandas and openpyxl. Falls back to CSV if openpyxl not available.') + parser.add_argument('--config', type=str, default=None, + help='Path to YAML configuration file. Overrides hardcoded defaults.') args = parser.parse_args() + # Configure logging based on command-line argument + logging.basicConfig( + level=getattr(logging, args.log_level), + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + # Validate command-line arguments + args = validate_args(args) + + # Load YAML config if provided + if args.config: + config = ConfigLoader(args.config) + set_config(config) + logger.info(f"Loaded configuration from {args.config}") + + # Refresh QOS_PROFILES with config values + global QOS_PROFILES + QOS_PROFILES = get_qos_profiles() + if args.seed is not None: - print(f"Using random seed: {args.seed}") + logger.info(f"Using random seed: {args.seed}") random.seed(args.seed) np.random.seed(args.seed) if TORCH_AVAILABLE: @@ -3540,7 +4106,7 @@ def convert_numpy(obj): with open(args.output, 'w') as f: json.dump(results, f, indent=4, default=convert_numpy) - print(f"\nResults saved to {args.output}") + logger.info(f"Results saved to {args.output}") # Export to XLSX if requested if args.xlsx_output: @@ -3558,12 +4124,12 @@ def export_results_to_xlsx(results: Dict, args, output_path: str): output_path: Path for the output Excel/CSV file """ if not PANDAS_AVAILABLE: - print(f"Warning: pandas not available, skipping XLSX export. Install with: pip install pandas") + logger.warning("pandas not available, skipping XLSX export. 
Install with: pip install pandas") return summary = results.get('summary', {}) if not summary: - print("Warning: No summary data available for XLSX export") + logger.warning("No summary data available for XLSX export") return # Helper to safely get nested keys @@ -3611,32 +4177,88 @@ def get_nested(d, keys, default=None): 'E2E Latency P50 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p50']), 'E2E Latency P95 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p95']), 'E2E Latency P99 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p99']), + 'E2E Latency P99.9 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p999']), + 'E2E Latency P99.99 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p9999']), # Storage IO Latency 'Storage Latency Mean (ms)': get_nested(summary, ['storage_io_latency_ms', 'mean']), 'Storage Latency P50 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p50']), 'Storage Latency P95 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p95']), 'Storage Latency P99 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p99']), + 'Storage Latency P99.9 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p999']), + 'Storage Latency P99.99 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p9999']), - # Generation Latency + # Generation Latency (simulated GPU work) 'Gen Latency Mean (ms)': get_nested(summary, ['generation_latency_ms', 'mean']), 'Gen Latency P50 (ms)': get_nested(summary, ['generation_latency_ms', 'p50']), 'Gen Latency P95 (ms)': get_nested(summary, ['generation_latency_ms', 'p95']), 'Gen Latency P99 (ms)': get_nested(summary, ['generation_latency_ms', 'p99']), - + + # Storage Tier Total Latency (Host serialization + Device I/O) + 'Storage Tier Read Total P50 (ms)': get_nested(summary, ['cache_stats', 'storage_read_p50_ms']), + 'Storage Tier Read Total P95 (ms)': get_nested(summary, ['cache_stats', 'storage_read_p95_ms']), + 'Storage Tier Read Total P99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_p99_ms']), + 'Storage Tier Read Total P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_read_p999_ms']), + 'Storage Tier Read Total P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_p9999_ms']), + 'Storage Tier Write Total P50 (ms)': get_nested(summary, ['cache_stats', 'storage_write_p50_ms']), + 'Storage Tier Write Total P95 (ms)': get_nested(summary, ['cache_stats', 'storage_write_p95_ms']), + 'Storage Tier Write Total P99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_p99_ms']), + 'Storage Tier Write Total P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_write_p999_ms']), + 'Storage Tier Write Total P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_p9999_ms']), + + # Storage Tier Device Latency (actual disk I/O - fsync for writes, np.load for reads) + 'Storage Tier Read Device P50 (ms)': get_nested(summary, ['cache_stats', 'storage_read_device_p50_ms']), + 'Storage Tier Read Device P95 (ms)': get_nested(summary, ['cache_stats', 'storage_read_device_p95_ms']), + 'Storage Tier Read Device P99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_device_p99_ms']), + 'Storage Tier Read Device P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_read_device_p999_ms']), + 'Storage Tier Read Device P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_device_p9999_ms']), + 'Storage Tier Write Device P50 (ms)': get_nested(summary, ['cache_stats', 'storage_write_device_p50_ms']), + 'Storage Tier Write Device P95 (ms)': get_nested(summary, ['cache_stats', 
'storage_write_device_p95_ms']), + 'Storage Tier Write Device P99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_device_p99_ms']), + 'Storage Tier Write Device P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_write_device_p999_ms']), + 'Storage Tier Write Device P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_device_p9999_ms']), + + # Storage Tier Host Latency (serialization/deserialization - CPU work) + 'Storage Tier Read Host P50 (ms)': get_nested(summary, ['cache_stats', 'storage_read_host_p50_ms']), + 'Storage Tier Read Host P95 (ms)': get_nested(summary, ['cache_stats', 'storage_read_host_p95_ms']), + 'Storage Tier Read Host P99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_host_p99_ms']), + 'Storage Tier Read Host P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_read_host_p999_ms']), + 'Storage Tier Read Host P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_host_p9999_ms']), + 'Storage Tier Write Host P50 (ms)': get_nested(summary, ['cache_stats', 'storage_write_host_p50_ms']), + 'Storage Tier Write Host P95 (ms)': get_nested(summary, ['cache_stats', 'storage_write_host_p95_ms']), + 'Storage Tier Write Host P99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_host_p99_ms']), + 'Storage Tier Write Host P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_write_host_p999_ms']), + 'Storage Tier Write Host P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_host_p9999_ms']), + # Cache Stats 'Cache Hit Rate': get_nested(summary, ['cache_stats', 'cache_hit_rate']), 'Read/Write Ratio': get_nested(summary, ['cache_stats', 'read_write_ratio']), 'Total Read (GB)': get_nested(summary, ['cache_stats', 'total_read_gb']), 'Total Write (GB)': get_nested(summary, ['cache_stats', 'total_write_gb']), - 'Prefill Bytes Written (GB)': get_nested(summary, ['cache_stats', 'prefill_bytes_written_gb']), - 'Decode Bytes Read (GB)': get_nested(summary, ['cache_stats', 'decode_bytes_read_gb']), - + + # Per-Tier KV Cache Bytes Written (NEW NAMING - MLPerf v3.0) + 'Tier GPU KV Bytes Written (GB)': get_nested(summary, ['cache_stats', 'tier_gpu_kv_bytes_written_gb']), + 'Tier CPU KV Bytes Written (GB)': get_nested(summary, ['cache_stats', 'tier_cpu_kv_bytes_written_gb']), + 'Tier Storage KV Bytes Written (GB)': get_nested(summary, ['cache_stats', 'tier_storage_kv_bytes_written_gb']), + + # Per-Tier KV Cache Bytes Read (NEW NAMING - MLPerf v3.0) + 'Tier GPU KV Bytes Read (GB)': get_nested(summary, ['cache_stats', 'tier_gpu_kv_bytes_read_gb']), + 'Tier CPU KV Bytes Read (GB)': get_nested(summary, ['cache_stats', 'tier_cpu_kv_bytes_read_gb']), + 'Tier Storage KV Bytes Read (GB)': get_nested(summary, ['cache_stats', 'tier_storage_kv_bytes_read_gb']), + + # Per-Tier Bandwidth (GB/s) - MLPerf v3.0 scoring metric + 'Tier GPU Read Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_gpu_read_bandwidth_gbps']), + 'Tier GPU Write Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_gpu_write_bandwidth_gbps']), + 'Tier CPU Read Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_cpu_read_bandwidth_gbps']), + 'Tier CPU Write Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_cpu_write_bandwidth_gbps']), + 'Tier Storage Read Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_storage_read_bandwidth_gbps']), + 'Tier Storage Write Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_storage_write_bandwidth_gbps']), + # Tier distribution 'GPU Entries': get_nested(summary, 
['cache_stats', 'gpu_entries']), 'CPU Entries': get_nested(summary, ['cache_stats', 'cpu_entries']), - 'NVMe Entries': get_nested(summary, ['cache_stats', 'nvme_entries']), - + 'Storage Entries': get_nested(summary, ['cache_stats', 'storage_entries']), + # Multi-turn stats 'Multi-turn Hit Rate': get_nested(summary, ['multi_turn_stats', 'hit_rate']), } @@ -3683,24 +4305,24 @@ def get_nested(d, keys, default=None): qos_df = pd.DataFrame(qos_rows) qos_df.to_excel(writer, sheet_name='QoS Metrics', index=False) - print(f"XLSX results saved to {output_path}") + logger.info(f"XLSX results saved to {output_path}") else: # Fall back to CSV csv_path = output_path.replace('.xlsx', '.csv') if output_path.endswith('.xlsx') else output_path if not csv_path.endswith('.csv'): csv_path += '.csv' df.to_csv(csv_path, index=False) - print(f"CSV results saved to {csv_path} (openpyxl not available for XLSX)") + logger.info(f"CSV results saved to {csv_path} (openpyxl not available for XLSX)") except Exception as e: - print(f"Error saving XLSX/CSV: {e}") + logger.error(f"Error saving XLSX/CSV: {e}") # Last resort: try CSV try: csv_path = output_path.replace('.xlsx', '.csv') df.to_csv(csv_path, index=False) - print(f"Fallback CSV saved to {csv_path}") + logger.info(f"Fallback CSV saved to {csv_path}") except Exception as e2: - print(f"Failed to save results: {e2}") + logger.error(f"Failed to save results: {e2}") if __name__ == "__main__": From d9715bce44676ff4aa3eb63aabee7715fd0f4785 Mon Sep 17 00:00:00 2001 From: Hazem Awadallah Date: Tue, 27 Jan 2026 15:43:17 -0800 Subject: [PATCH 02/16] feat(wrapper): config integration and workload automation - Add -c DIR option for custom config directory - Generate and pass config.yaml to Python script via --config flag - Add --xlsx-output support for Excel export - Update jq queries for new storage_* metric names - Add mlperf_submission workload with required trial parameters - Enhance system detection for thread counts and memory limits - Update metric parsing for storage_throughput primary metric --- kv_cache_benchmark/kv-cache-wrapper.sh | 70 +++++++++++++++++++------- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/kv_cache_benchmark/kv-cache-wrapper.sh b/kv_cache_benchmark/kv-cache-wrapper.sh index 2b648d6a..59ba3d37 100644 --- a/kv_cache_benchmark/kv-cache-wrapper.sh +++ b/kv_cache_benchmark/kv-cache-wrapper.sh @@ -40,6 +40,7 @@ Usage: ./kv-cache-wrapper.sh [options] [model] Options: -m MODEL Model key to benchmark (tiny-1b, mistral-7b, llama3.1-8b, llama2-7b, llama3.1-70b-instruct) + -c DIR Cache directory path (default: auto-detect /mnt/nvme, /mnt/ssd, or /tmp) -t SECONDS Duration for tier comparison tests (default: 120) -s SECONDS Duration for storage saturation test (default: 180) -r SECONDS Duration for realistic production test (default: 180) @@ -57,6 +58,7 @@ EOF # Default configuration (can be overridden via getopts) model="" +cache_dir_override="" tier_duration=120 saturation_duration=180 realistic_duration=180 @@ -67,9 +69,10 @@ users_high_override="" rag_enabled=0 rag_docs_override="" -while getopts ":m:t:s:r:a:w:u:U:RD:h" opt; do +while getopts ":m:c:t:s:r:a:w:u:U:RD:h" opt; do case "$opt" in m) model="$OPTARG" ;; + c) cache_dir_override="$OPTARG" ;; t) tier_duration="$OPTARG" ;; s) saturation_duration="$OPTARG" ;; r) realistic_duration="$OPTARG" ;; @@ -275,15 +278,18 @@ else fi # System detection - Storage path -# Priority: /mnt/nvme > /mnt/ssd > /tmp -cache_dir="/tmp/kvcache_benchmark" -if [ -d "/mnt/nvme" ] && [ -w "/mnt/nvme" ]; then +# 
Priority: user override > /mnt/nvme > /mnt/ssd > /tmp +if [ -n "$cache_dir_override" ]; then + cache_dir="$cache_dir_override" + echo "Cache directory (user override): $cache_dir" +elif [ -d "/mnt/nvme" ] && [ -w "/mnt/nvme" ]; then cache_dir="/mnt/nvme" echo "NVMe storage path: $cache_dir" elif [ -d "/mnt/ssd" ] && [ -w "/mnt/ssd" ]; then cache_dir="/mnt/ssd" echo "SSD storage path: $cache_dir" else + cache_dir="/tmp/kvcache_benchmark" echo "Warning: using temp storage at $cache_dir (consider mounting NVMe to /mnt/nvme)" fi @@ -367,6 +373,7 @@ if should_run 'capacity-autoscale'; then capacity_model="llama3.1-70b-instruct" python3 kv-cache.py \ + --config config.yaml \ --model "$capacity_model" \ --num-users "$capacity_start_users" \ --duration "$autoscale_duration" \ @@ -377,7 +384,8 @@ if should_run 'capacity-autoscale'; then --generation-mode none \ --cache-dir "$cache_dir" \ --seed 42 \ - --output results_autoscaling_capacity.json + --output results_autoscaling_capacity.json \ + --xlsx-output results_autoscaling_capacity.xlsx echo "" echo "Capacity discovery complete. Check results_autoscaling_capacity.json for peak throughput." @@ -423,6 +431,7 @@ if should_run 'mlperf_submission'; then echo " PRIMARY METRICS: Decode Bytes Read, Wall-Clock Throughput" echo " WARNING: Storage Throughput unreliable at cpu_mem=0GB" python3 kv-cache.py \ + --config config.yaml \ --model llama3.1-8b \ --num-users 200 \ --duration 300 \ @@ -432,7 +441,8 @@ if should_run 'mlperf_submission'; then --generation-mode none \ --cache-dir "$cache_dir" \ --seed 42 \ - --output mlperf_v3_stress_8b.json + --output mlperf_v3_stress_8b.json \ + --xlsx-output mlperf_v3_stress_8b.xlsx echo "Maximum storage stress test (8B) complete." echo "" @@ -443,6 +453,7 @@ if should_run 'mlperf_submission'; then echo "[MLPerf 2/4] Storage Throughput Test: llama3.1-8b, cpu_mem=4GB, 100 users..." echo " PRIMARY METRIC: Storage Throughput (tok/s)" python3 kv-cache.py \ + --config config.yaml \ --model llama3.1-8b \ --num-users 100 \ --duration 300 \ @@ -452,7 +463,8 @@ if should_run 'mlperf_submission'; then --generation-mode none \ --cache-dir "$cache_dir" \ --seed 42 \ - --output mlperf_v3_throughput_8b.json + --output mlperf_v3_throughput_8b.json \ + --xlsx-output mlperf_v3_throughput_8b.xlsx echo "Storage throughput test (8B) complete." echo "" @@ -463,6 +475,7 @@ if should_run 'mlperf_submission'; then echo "[MLPerf 3/4] Large Model Stress: llama3.1-70b-instruct, cpu_mem=0GB, 70 users..." echo " PRIMARY METRICS: Decode Bytes Read, Wall-Clock Throughput" python3 kv-cache.py \ + --config config.yaml \ --model llama3.1-70b-instruct \ --num-users 70 \ --duration 300 \ @@ -472,7 +485,8 @@ if should_run 'mlperf_submission'; then --generation-mode none \ --cache-dir "$cache_dir" \ --seed 42 \ - --output mlperf_v3_stress_70b.json + --output mlperf_v3_stress_70b.json \ + --xlsx-output mlperf_v3_stress_70b.xlsx echo "Large model storage stress test (70B) complete." echo "" @@ -482,6 +496,7 @@ if should_run 'mlperf_submission'; then echo "[MLPerf 4/4] Large Model Throughput: llama3.1-70b-instruct, cpu_mem=4GB, 50 users..." 
echo " PRIMARY METRIC: Storage Throughput (tok/s)" python3 kv-cache.py \ + --config config.yaml \ --model llama3.1-70b-instruct \ --num-users 50 \ --duration 300 \ @@ -491,7 +506,8 @@ if should_run 'mlperf_submission'; then --generation-mode none \ --cache-dir "$cache_dir" \ --seed 42 \ - --output mlperf_v3_throughput_70b.json + --output mlperf_v3_throughput_70b.json \ + --xlsx-output mlperf_v3_throughput_70b.xlsx echo "Large model throughput test (70B) complete." echo "" @@ -523,6 +539,7 @@ if should_run 'gpu-only'; then if [ "$gpu_available" -eq 1 ]; then echo "[1/10] GPU Only - All cache in VRAM..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_baseline \ --duration "$tier_duration" \ @@ -531,7 +548,8 @@ if should_run 'gpu-only'; then --generation-mode realistic \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_gpu_only.json + --output results_tier_gpu_only.json \ + --xlsx-output results_tier_gpu_only.xlsx echo "" echo "GPU test complete. Expect lowest latency but limited capacity." @@ -552,6 +570,7 @@ fi if should_run 'cpu-only'; then echo "[2/10] CPU Only - All cache in RAM..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_baseline \ --duration "$tier_duration" \ @@ -560,7 +579,8 @@ if should_run 'cpu-only'; then --generation-mode realistic \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_cpu_only.json + --output results_tier_cpu_only.json \ + --xlsx-output results_tier_cpu_only.xlsx echo "" echo "CPU test complete. This is the typical production configuration." @@ -589,6 +609,7 @@ fi if should_run 'storage-only'; then echo "[3/10] TIER TEST: Storage Only - Pure NVMe/SSD caching..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_baseline \ --duration "$tier_duration" \ @@ -598,7 +619,8 @@ if should_run 'storage-only'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_storage_only.json + --output results_tier_storage_only.json \ + --xlsx-output results_tier_storage_only.xlsx echo "" echo "Expected: Highest latency, validates NVMe P95 < 200ms for reads" @@ -628,6 +650,7 @@ if should_run 'gpu-cpu'; then if [ "$gpu_available" -eq 1 ]; then echo "[4/10] TIER TEST: GPU + CPU - Two-tier hot/warm caching..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_baseline \ --duration "$tier_duration" \ @@ -636,7 +659,8 @@ if should_run 'gpu-cpu'; then --generation-mode realistic \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_gpu_cpu.json + --output results_tier_gpu_cpu.json \ + --xlsx-output results_tier_gpu_cpu.xlsx echo "" echo "Expected: Low latency with large capacity" @@ -670,6 +694,7 @@ fi if should_run 'cpu-storage'; then echo "[5/10] TIER TEST: CPU + Storage - RAM with NVMe spillover..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_high \ --duration "$tier_duration" \ @@ -679,7 +704,8 @@ if should_run 'cpu-storage'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_cpu_storage.json + --output results_tier_cpu_storage.json \ + --xlsx-output results_tier_cpu_storage.xlsx echo "" echo "Expected: Moderate latency, forces storage spillover with ${users_high} users" @@ -710,6 +736,7 @@ if should_run 'gpu-cpu-storage'; then if [ "$gpu_available" -eq 1 ]; then echo "[6/10] TIER TEST: GPU + CPU + Storage - Full three-tier hierarchy..." 
python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_high \ --duration "$tier_duration" \ @@ -719,7 +746,8 @@ if should_run 'gpu-cpu-storage'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_gpu_cpu_storage.json + --output results_tier_gpu_cpu_storage.json \ + --xlsx-output results_tier_gpu_cpu_storage.xlsx echo "" echo "Expected: Best overall - hot in GPU, warm in CPU, cold in storage" @@ -752,6 +780,7 @@ fi if should_run 'storage-saturation'; then echo "[7/10] STRESS TEST: Storage Saturation - Maximum NVMe load..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_high \ --duration "$saturation_duration" \ @@ -761,7 +790,8 @@ if should_run 'storage-saturation'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_stress_storage_saturation.json + --output results_stress_storage_saturation.json \ + --xlsx-output results_stress_storage_saturation.xlsx echo "" echo "Expected: High storage load, validates NVMe can handle ${users_high} users" @@ -796,6 +826,7 @@ fi if should_run 'production'; then echo "[8/10] REALISTIC TEST: Production Workload - Multi-tier with realistic load..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_baseline \ --duration "$realistic_duration" \ @@ -805,7 +836,8 @@ if should_run 'production'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_realistic_production.json + --output results_realistic_production.json \ + --xlsx-output results_realistic_production.xlsx echo "" echo "Expected: Balanced performance, realistic production scenario" @@ -839,6 +871,7 @@ fi if should_run 'autoscale'; then echo "[9/10] DISCOVERY TEST: Autoscaling - Find optimal user count..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users 20 \ --duration "$autoscale_duration" \ @@ -850,7 +883,8 @@ if should_run 'autoscale'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_autoscaling_discovery.json + --output results_autoscaling_discovery.json \ + --xlsx-output results_autoscaling_discovery.xlsx echo "" echo "Expected: Progressive scaling to find hardware limits" From 001fd3bdda7e87acb4d6b5fea17f700508ccdbe1 Mon Sep 17 00:00:00 2001 From: Hazem Awadallah Date: Tue, 27 Jan 2026 15:43:46 -0800 Subject: [PATCH 03/16] test(kv-cache): comprehensive pytest suite for v3.0 features - Add 170+ tests covering all new functionality - Add ConfigLoader tests: schema validation, defaults, file loading - Add cfg() helper tests for config-driven parameters - Add validate_args() tests for path safety and input validation - Add extended QoS tests for P99.9 and P99.99 percentiles - Add GPU eviction callback tests for metadata sync - Add per-tier bandwidth and KV bytes metric tests - Add storage_* metric naming tests for MLPerf compliance - Add waterfall eviction tests with high/low watermarks - Add storage_health PASS/FAIL criteria tests --- kv_cache_benchmark/tests/test_kv_cache.py | 754 +++++++++++++++++++++- 1 file changed, 753 insertions(+), 1 deletion(-) diff --git a/kv_cache_benchmark/tests/test_kv_cache.py b/kv_cache_benchmark/tests/test_kv_cache.py index cfa42f56..e99ba4d3 100644 --- a/kv_cache_benchmark/tests/test_kv_cache.py +++ b/kv_cache_benchmark/tests/test_kv_cache.py @@ -11,10 +11,17 @@ These tests verify core functionality without running the full benchmark. 
Typical execution time: < 5 seconds + +This version tests kv-cache.py which includes: +- ConfigLoader with YAML support and strict validation +- Extended QoS SLA with p999 and p9999 percentiles +- Config-driven parameters via cfg() helper +- Renamed nvme_* to storage_* in stats """ import os import sys +import argparse import tempfile import pytest import numpy as np @@ -22,8 +29,26 @@ from pathlib import Path # Import from kv-cache.py (handle the hyphen in filename) +# Try multiple locations: same directory, parent directory import importlib.util -spec = importlib.util.spec_from_file_location("kv_cache", os.path.join(os.path.dirname(__file__), "kv-cache.py")) + +_kv_cache_path = None +_possible_paths = [ + os.path.join(os.path.dirname(__file__), "kv-cache.py"), # Same directory + os.path.join(os.path.dirname(__file__), "..", "kv-cache.py"), # Parent directory +] +for _path in _possible_paths: + if os.path.exists(_path): + _kv_cache_path = _path + break + +if _kv_cache_path is None: + raise FileNotFoundError( + f"Could not find kv-cache.py. Searched in:\n" + + "\n".join(f" - {os.path.abspath(p)}" for p in _possible_paths) + ) + +spec = importlib.util.spec_from_file_location("kv_cache", _kv_cache_path) kv_cache = importlib.util.module_from_spec(spec) spec.loader.exec_module(kv_cache) @@ -44,6 +69,24 @@ MultiTierCache = kv_cache.MultiTierCache export_results_to_xlsx = kv_cache.export_results_to_xlsx PANDAS_AVAILABLE = kv_cache.PANDAS_AVAILABLE + +# New imports for 01-26-2026 version +ConfigLoader = kv_cache.ConfigLoader +cfg = kv_cache.cfg +get_config = kv_cache.get_config +set_config = kv_cache.set_config +get_qos_profiles = kv_cache.get_qos_profiles +QoSSLA = kv_cache.QoSSLA +YAML_AVAILABLE = kv_cache.YAML_AVAILABLE + +# Input validation imports +validate_args = kv_cache.validate_args +MAX_USERS = kv_cache.MAX_USERS +MAX_DURATION_SECONDS = kv_cache.MAX_DURATION_SECONDS +MAX_GPU_MEMORY_GB = kv_cache.MAX_GPU_MEMORY_GB +MAX_CPU_MEMORY_GB = kv_cache.MAX_CPU_MEMORY_GB +FORBIDDEN_CACHE_PREFIXES = kv_cache.FORBIDDEN_CACHE_PREFIXES + if PANDAS_AVAILABLE: import pandas as pd @@ -190,6 +233,171 @@ class MockArgs: return MockArgs() +@pytest.fixture +def sample_config_yaml(tmp_path): + """Create a sample config.yaml for testing.""" + config_content = ''' +user_templates: + chatbot: + context_range: [256, 1024] + generation_range: [50, 150] + think_time_range: [0.1, 0.5] + coding: + context_range: [1024, 4096] + generation_range: [100, 500] + think_time_range: [0.2, 1.0] + document: + context_range: [2048, 8192] + generation_range: [200, 800] + think_time_range: [0.3, 1.5] + +qos_profiles: + interactive: + target_latency_p95_ms: 50 + target_latency_p99_ms: 100 + target_latency_p999_ms: 150 + target_latency_p9999_ms: 200 + priority: 3 + responsive: + target_latency_p95_ms: 100 + target_latency_p99_ms: 200 + target_latency_p999_ms: 350 + target_latency_p9999_ms: 500 + priority: 2 + batch: + target_latency_p95_ms: 1000 + target_latency_p99_ms: 5000 + target_latency_p999_ms: 7500 + target_latency_p9999_ms: 10000 + priority: 1 + +qos_distribution: + interactive_probability: 0.15 + responsive_threshold: 0.50 + +eviction: + max_recursion_depth: 10 + target_usage_ratio: 0.8 + large_entry_limit_ratio: 0.95 + max_evictions_hard_cap: 5000 + max_evictions_min: 1000 + +decode: + batch_size: 32 + +conversation: + max_conversations: 1000 + max_turns_per_conv: 50 + end_conversation_probability: 0.2 +''' + config_file = tmp_path / "test_config.yaml" + config_file.write_text(config_content) + return str(config_file) + + 
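+# A minimal sketch of the strict schema walk that the ConfigLoader tests below
+# assume (the helper name and exact messages here are illustrative, not the
+# shipped implementation): every YAML key must appear in VALID_SCHEMA, nested
+# sections recurse into their sub-schema, and leaf values must match the
+# declared type or tuple of types; anything else raises ValueError.
+#
+# def _sketch_validate_against_schema(config: dict, schema: dict, path: str = "") -> None:
+#     for key, value in config.items():
+#         full_key = f"{path}.{key}" if path else key
+#         if key not in schema:
+#             raise ValueError(f"Unknown configuration key: {full_key}")
+#         expected = schema[key]
+#         if isinstance(expected, dict):
+#             # Nested section: recurse with the sub-schema.
+#             if not isinstance(value, dict):
+#                 raise ValueError(f"Expected a mapping for '{full_key}'")
+#             _sketch_validate_against_schema(value, expected, full_key)
+#         elif not isinstance(value, expected):
+#             # 'expected' may be a single type (int) or a tuple such as (int, float).
+#             raise ValueError(f"Invalid type for '{full_key}': "
+#                              f"expected {expected}, got {type(value).__name__}")
+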
+# ============================================================================= +# Test 0: ConfigLoader (New in 01-26-2026) +# ============================================================================= + +@pytest.mark.skipif(not YAML_AVAILABLE, reason="PyYAML not installed") +class TestConfigLoader: + """Tests for ConfigLoader and cfg() helper function.""" + + def test_config_loader_without_file(self): + """ConfigLoader should work without a config file.""" + loader = ConfigLoader(config_path=None) + assert loader is not None + assert loader.config == {} + + def test_config_loader_loads_yaml(self, sample_config_yaml): + """ConfigLoader should load and parse YAML file.""" + loader = ConfigLoader(config_path=sample_config_yaml) + assert loader.config is not None + assert 'qos_profiles' in loader.config + + def test_config_loader_get_nested_value(self, sample_config_yaml): + """ConfigLoader.get() should retrieve nested values.""" + loader = ConfigLoader(config_path=sample_config_yaml) + priority = loader.get('qos_profiles', 'interactive', 'priority') + assert priority == 3 + + def test_config_loader_get_with_default(self, sample_config_yaml): + """ConfigLoader.get() should return default for missing keys.""" + loader = ConfigLoader(config_path=sample_config_yaml) + value = loader.get('nonexistent', 'key', default=42) + assert value == 42 + + def test_cfg_without_global_config(self): + """cfg() should return default when no global config is set.""" + # Ensure no global config + set_config(None) + value = cfg('qos_profiles', 'interactive', 'priority', default=99) + assert value == 99 + + def test_cfg_with_global_config(self, sample_config_yaml): + """cfg() should retrieve values from global config.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + value = cfg('qos_profiles', 'interactive', 'priority', default=99) + assert value == 3 + finally: + set_config(None) # Clean up + + def test_config_loader_validates_schema(self, tmp_path): + """ConfigLoader should reject unknown keys.""" + bad_config = tmp_path / "bad_config.yaml" + bad_config.write_text(''' +unknown_section: + bad_key: true +''') + with pytest.raises(ValueError, match="Unknown configuration key"): + ConfigLoader(config_path=str(bad_config)) + + def test_get_config_returns_none_initially(self): + """get_config() should return None before set_config() is called.""" + set_config(None) + assert get_config() is None + + def test_set_config_stores_loader(self, sample_config_yaml): + """set_config() should store the ConfigLoader globally.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + assert get_config() is loader + finally: + set_config(None) + + +class TestCfgHelper: + """Tests for cfg() helper function in various contexts.""" + + def test_cfg_returns_default_for_none_config(self): + """cfg() returns default when config is None.""" + set_config(None) + assert cfg('any', 'path', default='fallback') == 'fallback' + + def test_cfg_returns_default_for_missing_key(self, sample_config_yaml): + """cfg() returns default for missing nested keys.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + result = cfg('nonexistent', 'nested', 'key', default=123) + assert result == 123 + finally: + set_config(None) + + def test_cfg_retrieves_list_values(self, sample_config_yaml): + """cfg() can retrieve list values from config.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + context_range = 
cfg('user_templates', 'chatbot', 'context_range') + assert context_range == [256, 1024] + finally: + set_config(None) + + # ============================================================================= # Test 1: ModelConfig # ============================================================================= @@ -318,6 +526,39 @@ def test_sla_compliance_starts_at_one(self): def test_interactive_target_latency(self): sla = QOS_PROFILES[QoSLevel.INTERACTIVE] assert sla.target_latency_p95_ms == 50 + + # New tests for extended QoS percentiles (01-26-2026 feature) + def test_interactive_has_p999_latency(self): + """Test that p999 percentile is defined for INTERACTIVE.""" + sla = QOS_PROFILES[QoSLevel.INTERACTIVE] + assert hasattr(sla, 'target_latency_p999_ms') + assert sla.target_latency_p999_ms > sla.target_latency_p99_ms + + def test_interactive_has_p9999_latency(self): + """Test that p9999 percentile is defined for INTERACTIVE.""" + sla = QOS_PROFILES[QoSLevel.INTERACTIVE] + assert hasattr(sla, 'target_latency_p9999_ms') + assert sla.target_latency_p9999_ms > sla.target_latency_p999_ms + + def test_all_qos_levels_have_extended_percentiles(self): + """Verify all QoS levels have p999 and p9999 defined.""" + for level in QoSLevel: + sla = QOS_PROFILES[level] + assert hasattr(sla, 'target_latency_p999_ms') + assert hasattr(sla, 'target_latency_p9999_ms') + + def test_get_qos_profiles_returns_dict(self): + """Test that get_qos_profiles() returns profiles dict.""" + profiles = get_qos_profiles() + assert isinstance(profiles, dict) + assert len(profiles) == 3 + + def test_get_qos_profiles_levels(self): + """Test that get_qos_profiles() has all QoS levels.""" + profiles = get_qos_profiles() + assert QoSLevel.INTERACTIVE in profiles + assert QoSLevel.RESPONSIVE in profiles + assert QoSLevel.BATCH in profiles # ============================================================================= @@ -877,6 +1118,515 @@ def test_initial_cpu_usage_zero(self, multi_tier_cache): assert cpu_usage == 0 +# ============================================================================= +# Test 13: Config-Driven Parameters (New in 01-26-2026) +# ============================================================================= + +class TestConfigDrivenConversationManager: + """Tests for ConversationManager with config-driven parameters.""" + + def test_default_max_conversations(self): + """Without config, should use hardcoded default of 1000.""" + set_config(None) + manager = ConversationManager() + assert manager.max_conversations == 1000 + + def test_default_max_turns(self): + """Without config, should use hardcoded default of 50.""" + set_config(None) + manager = ConversationManager() + assert manager.max_turns_per_conv == 50 + + def test_explicit_params_override_config(self, sample_config_yaml): + """Explicit constructor params should override config values.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + manager = ConversationManager(max_conversations=42, max_turns_per_conv=7) + assert manager.max_conversations == 42 + assert manager.max_turns_per_conv == 7 + finally: + set_config(None) + + +@pytest.mark.skipif(not YAML_AVAILABLE, reason="PyYAML not installed") +class TestConfigDrivenUserSimulator: + """Tests for UserSimulator with config-driven parameters.""" + + def test_user_templates_from_config(self, sample_config_yaml): + """UserSimulator should read templates from config.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + templates = 
UserSimulator._get_user_templates() + assert 'chatbot' in templates + assert 'coding' in templates + assert 'document' in templates + assert templates['chatbot']['context_range'] == (256, 1024) + finally: + set_config(None) + + def test_qos_distribution_from_config(self, sample_config_yaml): + """UserSimulator.generate_mixed_users should use config QoS distribution.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + # Generate many users to test distribution + users = UserSimulator.generate_mixed_users(1000) + # With 15% interactive probability, expect ~150 interactive users + interactive_count = sum(1 for u in users if u.qos_level == QoSLevel.INTERACTIVE) + # Allow 50% variance for randomness + assert 75 <= interactive_count <= 225, f"Expected ~150 interactive, got {interactive_count}" + finally: + set_config(None) + + +# ============================================================================= +# Test 14: Stats Naming Convention (storage_* vs nvme_*) +# ============================================================================= + +class TestStatsNamingConvention: + """Tests that stats use 'storage_*' naming (not 'nvme_*') in 01-26-2026.""" + + def test_stats_use_storage_prefix(self, multi_tier_cache): + """Stats should use 'storage_' prefix instead of 'nvme_'.""" + multi_tier_cache.allocate_cache("test_entry", num_tokens=100) + multi_tier_cache.access_cache("test_entry", InferencePhase.DECODE) + stats = multi_tier_cache.get_stats(duration=1.0) + + # Check for storage_* naming + storage_keys = [k for k in stats.keys() if 'storage_' in k.lower()] + nvme_keys = [k for k in stats.keys() if 'nvme_' in k.lower()] + + # Should have storage_* keys + assert len(storage_keys) > 0, "Expected storage_* keys in stats" + + def test_tier_stats_key_format(self, multi_tier_cache): + """tier_storage_* keys should exist (renamed from tier_nvme_*).""" + multi_tier_cache.allocate_cache("test_entry", num_tokens=100) + stats = multi_tier_cache.get_stats(duration=1.0) + + # Check for tier_storage_* keys + tier_storage_keys = [k for k in stats.keys() if k.startswith('tier_storage_')] + assert len(tier_storage_keys) > 0, "Expected tier_storage_* keys in stats" + + +# ============================================================================= +# Test 15: GPUMemoryBackend Eviction Callback (New in 01-26-2026) +# ============================================================================= + +@pytest.mark.skipif(not CUDA_AVAILABLE, reason="CUDA not available") +class TestGPUMemoryBackendEvictionCallback: + """Tests for GPUMemoryBackend's on_eviction_callback feature.""" + + def test_gpu_backend_accepts_callback(self): + """GPUMemoryBackend should accept on_eviction_callback parameter.""" + evicted_keys = [] + def callback(key, tier, size): + evicted_keys.append((key, tier, size)) + + backend = GPUMemoryBackend(on_eviction_callback=callback) + assert backend.on_eviction_callback is callback + backend.clear() + + def test_gpu_backend_works_without_callback(self): + """GPUMemoryBackend should work without a callback (None).""" + backend = GPUMemoryBackend(on_eviction_callback=None) + assert backend.on_eviction_callback is None + backend.clear() + + +# ============================================================================= +# Test 16: Input Validation (validate_args) +# ============================================================================= + +class TestValidateArgs: + """Tests for the validate_args() input validation function.""" + + @pytest.fixture + def 
valid_args(self): + """Create a valid args namespace with all required attributes.""" + import argparse + args = argparse.Namespace( + num_users=100, + duration=60, + gpu_mem_gb=16, + cpu_mem_gb=32, + rag_num_docs=10, + max_conversations=500, + max_concurrent_allocs=0, + request_rate=0, + max_requests=0, + target_saturation=0.8, + cache_dir=None + ) + return args + + def test_valid_args_pass_through(self, valid_args): + """Valid arguments should pass validation and return unchanged.""" + result = validate_args(valid_args) + assert result is valid_args + assert result.num_users == 100 + assert result.duration == 60 + + def test_num_users_zero_rejected(self, valid_args): + """num_users=0 should raise ValueError.""" + valid_args.num_users = 0 + with pytest.raises(ValueError, match="num-users must be positive"): + validate_args(valid_args) + + def test_num_users_negative_rejected(self, valid_args): + """Negative num_users should raise ValueError.""" + valid_args.num_users = -5 + with pytest.raises(ValueError, match="num-users must be positive"): + validate_args(valid_args) + + def test_num_users_exceeds_limit(self, valid_args): + """num_users exceeding MAX_USERS should raise ValueError.""" + valid_args.num_users = MAX_USERS + 1 + with pytest.raises(ValueError, match="num-users exceeds limit"): + validate_args(valid_args) + + def test_duration_zero_rejected(self, valid_args): + """duration=0 should raise ValueError.""" + valid_args.duration = 0 + with pytest.raises(ValueError, match="duration must be positive"): + validate_args(valid_args) + + def test_duration_negative_rejected(self, valid_args): + """Negative duration should raise ValueError.""" + valid_args.duration = -10 + with pytest.raises(ValueError, match="duration must be positive"): + validate_args(valid_args) + + def test_duration_exceeds_limit(self, valid_args): + """duration exceeding 24 hours should raise ValueError.""" + valid_args.duration = MAX_DURATION_SECONDS + 1 + with pytest.raises(ValueError, match="duration exceeds 24 hours"): + validate_args(valid_args) + + def test_gpu_mem_negative_rejected(self, valid_args): + """Negative gpu_mem_gb should raise ValueError.""" + valid_args.gpu_mem_gb = -1 + with pytest.raises(ValueError, match="gpu-mem-gb cannot be negative"): + validate_args(valid_args) + + def test_gpu_mem_zero_allowed(self, valid_args): + """gpu_mem_gb=0 should be valid (disables GPU tier).""" + valid_args.gpu_mem_gb = 0 + result = validate_args(valid_args) + assert result.gpu_mem_gb == 0 + + def test_gpu_mem_exceeds_limit(self, valid_args): + """gpu_mem_gb exceeding limit should raise ValueError.""" + valid_args.gpu_mem_gb = MAX_GPU_MEMORY_GB + 1 + with pytest.raises(ValueError, match="gpu-mem-gb exceeds limit"): + validate_args(valid_args) + + def test_cpu_mem_negative_rejected(self, valid_args): + """Negative cpu_mem_gb should raise ValueError.""" + valid_args.cpu_mem_gb = -1 + with pytest.raises(ValueError, match="cpu-mem-gb cannot be negative"): + validate_args(valid_args) + + def test_cpu_mem_zero_allowed(self, valid_args): + """cpu_mem_gb=0 should be valid.""" + valid_args.cpu_mem_gb = 0 + result = validate_args(valid_args) + assert result.cpu_mem_gb == 0 + + def test_cpu_mem_exceeds_limit(self, valid_args): + """cpu_mem_gb exceeding limit should raise ValueError.""" + valid_args.cpu_mem_gb = MAX_CPU_MEMORY_GB + 1 + with pytest.raises(ValueError, match="cpu-mem-gb exceeds limit"): + validate_args(valid_args) + + def test_target_saturation_below_zero_rejected(self, valid_args): + """target_saturation < 0 
should raise ValueError.""" + valid_args.target_saturation = -0.1 + with pytest.raises(ValueError, match="target-saturation must be between 0.0 and 1.0"): + validate_args(valid_args) + + def test_target_saturation_above_one_rejected(self, valid_args): + """target_saturation > 1 should raise ValueError.""" + valid_args.target_saturation = 1.5 + with pytest.raises(ValueError, match="target-saturation must be between 0.0 and 1.0"): + validate_args(valid_args) + + def test_target_saturation_boundaries_valid(self, valid_args): + """target_saturation at 0.0 and 1.0 should be valid.""" + valid_args.target_saturation = 0.0 + result = validate_args(valid_args) + assert result.target_saturation == 0.0 + + valid_args.target_saturation = 1.0 + result = validate_args(valid_args) + assert result.target_saturation == 1.0 + + def test_rag_num_docs_negative_rejected(self, valid_args): + """Negative rag_num_docs should raise ValueError.""" + valid_args.rag_num_docs = -1 + with pytest.raises(ValueError, match="rag-num-docs cannot be negative"): + validate_args(valid_args) + + def test_max_conversations_zero_rejected(self, valid_args): + """max_conversations=0 should raise ValueError.""" + valid_args.max_conversations = 0 + with pytest.raises(ValueError, match="max-conversations must be positive"): + validate_args(valid_args) + + def test_max_concurrent_allocs_negative_rejected(self, valid_args): + """Negative max_concurrent_allocs should raise ValueError.""" + valid_args.max_concurrent_allocs = -1 + with pytest.raises(ValueError, match="max-concurrent-allocs cannot be negative"): + validate_args(valid_args) + + def test_request_rate_negative_rejected(self, valid_args): + """Negative request_rate should raise ValueError.""" + valid_args.request_rate = -1 + with pytest.raises(ValueError, match="request-rate cannot be negative"): + validate_args(valid_args) + + def test_max_requests_negative_rejected(self, valid_args): + """Negative max_requests should raise ValueError.""" + valid_args.max_requests = -1 + with pytest.raises(ValueError, match="max-requests cannot be negative"): + validate_args(valid_args) + + @pytest.mark.skipif(sys.platform == 'win32', reason="Unix paths not valid on Windows") + def test_forbidden_cache_dir_rejected(self, valid_args): + """Cache directories in system paths should be rejected.""" + valid_args.cache_dir = '/etc/kv_cache' + with pytest.raises(ValueError, match="cannot be a system directory"): + validate_args(valid_args) + + def test_valid_cache_dir_allowed(self, valid_args, tmp_path): + """Valid cache directory should be accepted.""" + valid_args.cache_dir = str(tmp_path / "kv_cache_test") + result = validate_args(valid_args) + assert result.cache_dir == str(tmp_path / "kv_cache_test") + + def test_multiple_errors_collected(self, valid_args): + """Multiple validation errors should all be reported.""" + valid_args.num_users = -1 + valid_args.duration = -1 + valid_args.gpu_mem_gb = -1 + with pytest.raises(ValueError) as exc_info: + validate_args(valid_args) + # All three errors should be in the message + error_msg = str(exc_info.value) + assert "num-users" in error_msg + assert "duration" in error_msg + assert "gpu-mem-gb" in error_msg + + +# ============================================================================= +# Test 17: Per-Tier Phase Metrics +# ============================================================================= + +class TestPerTierPhaseMetrics: + """Tests for per-tier KV bytes tracking (prefill/decode per tier).""" + + @pytest.fixture + def 
tiny_model_config(self): + """Return the tiny-1b model config for fast tests.""" + return MODEL_CONFIGS['tiny-1b'] + + @pytest.fixture + def multi_tier_cache_cpu_only(self, tiny_model_config): + """Return a MultiTierCache in CPU-only mode (GPU disabled).""" + return MultiTierCache( + model_config=tiny_model_config, + gpu_memory_gb=0, + cpu_memory_gb=0.1, # 100MB + seed=42 + ) + + def test_stats_have_tier_kv_bytes_written_keys(self, multi_tier_cache_cpu_only): + """Stats should include tier_*_kv_bytes_written keys.""" + multi_tier_cache_cpu_only.allocate_cache("test_entry", num_tokens=100) + stats = multi_tier_cache_cpu_only.get_stats(duration=1.0) + + # Check for per-tier write tracking + assert 'tier_gpu_kv_bytes_written_gb' in stats + assert 'tier_cpu_kv_bytes_written_gb' in stats + assert 'tier_storage_kv_bytes_written_gb' in stats + + def test_stats_have_tier_kv_bytes_read_keys(self, multi_tier_cache_cpu_only): + """Stats should include tier_*_kv_bytes_read keys.""" + multi_tier_cache_cpu_only.allocate_cache("test_entry", num_tokens=100) + multi_tier_cache_cpu_only.access_cache("test_entry", InferencePhase.DECODE) + stats = multi_tier_cache_cpu_only.get_stats(duration=1.0) + + # Check for per-tier read tracking + assert 'tier_gpu_kv_bytes_read_gb' in stats + assert 'tier_cpu_kv_bytes_read_gb' in stats + assert 'tier_storage_kv_bytes_read_gb' in stats + + def test_cpu_write_bytes_increment_on_allocate(self, multi_tier_cache_cpu_only): + """Allocating to CPU tier should increment tier_cpu_kv_bytes_written.""" + # Get initial stats + stats_before = multi_tier_cache_cpu_only.get_stats(duration=1.0) + cpu_written_before = stats_before.get('tier_cpu_kv_bytes_written_gb', 0) + + # Allocate cache entry (goes to CPU since GPU is disabled) + success, location, _ = multi_tier_cache_cpu_only.allocate_cache("test_entry", num_tokens=100) + assert success + assert location == 'cpu' + + # Check that CPU write bytes increased + stats_after = multi_tier_cache_cpu_only.get_stats(duration=1.0) + cpu_written_after = stats_after.get('tier_cpu_kv_bytes_written_gb', 0) + + assert cpu_written_after > cpu_written_before, \ + f"CPU write bytes should increase: {cpu_written_before} -> {cpu_written_after}" + + def test_cpu_read_bytes_increment_on_access(self, multi_tier_cache_cpu_only): + """Accessing from CPU tier should increment tier_cpu_kv_bytes_read.""" + # Allocate first + multi_tier_cache_cpu_only.allocate_cache("test_entry", num_tokens=100) + + # Get stats before access + stats_before = multi_tier_cache_cpu_only.get_stats(duration=1.0) + cpu_read_before = stats_before.get('tier_cpu_kv_bytes_read_gb', 0) + + # Access the cache entry + location, _ = multi_tier_cache_cpu_only.access_cache("test_entry", InferencePhase.DECODE) + assert location == 'cpu' + + # Check that CPU read bytes increased + stats_after = multi_tier_cache_cpu_only.get_stats(duration=1.0) + cpu_read_after = stats_after.get('tier_cpu_kv_bytes_read_gb', 0) + + assert cpu_read_after > cpu_read_before, \ + f"CPU read bytes should increase: {cpu_read_before} -> {cpu_read_after}" + + def test_gpu_bytes_zero_when_gpu_disabled(self, multi_tier_cache_cpu_only): + """With GPU disabled (0 GB), GPU tier bytes should remain zero.""" + # Do some allocations and accesses + for i in range(5): + multi_tier_cache_cpu_only.allocate_cache(f"entry_{i}", num_tokens=100) + for i in range(5): + multi_tier_cache_cpu_only.access_cache(f"entry_{i}", InferencePhase.DECODE) + + stats = multi_tier_cache_cpu_only.get_stats(duration=1.0) + + # GPU bytes should be zero 
since GPU tier is disabled + assert stats.get('tier_gpu_kv_bytes_written_gb', 0) == 0, \ + "GPU write bytes should be 0 when GPU disabled" + assert stats.get('tier_gpu_kv_bytes_read_gb', 0) == 0, \ + "GPU read bytes should be 0 when GPU disabled" + + def test_storage_tier_overflow(self, tiny_model_config): + """When CPU is full, allocations should overflow to storage tier.""" + # Create cache with very small CPU limit + cache = MultiTierCache( + model_config=tiny_model_config, + gpu_memory_gb=0, + cpu_memory_gb=0.001, # 1MB - very small + seed=42 + ) + + # Allocate enough to overflow CPU + for i in range(20): + cache.allocate_cache(f"entry_{i}", num_tokens=1000) + + stats = cache.get_stats(duration=1.0) + + # Storage tier should have received some data + storage_written = stats.get('tier_storage_kv_bytes_written_gb', 0) + assert storage_written > 0, \ + f"Storage tier should have data when CPU overflows: {storage_written}" + + def test_per_tier_bandwidth_calculated(self, multi_tier_cache_cpu_only): + """Per-tier bandwidth stats should be calculated.""" + # Do some I/O + for i in range(10): + multi_tier_cache_cpu_only.allocate_cache(f"entry_{i}", num_tokens=100) + for i in range(10): + multi_tier_cache_cpu_only.access_cache(f"entry_{i}", InferencePhase.DECODE) + + stats = multi_tier_cache_cpu_only.get_stats(duration=1.0) + + # Bandwidth stats should exist + assert 'tier_cpu_read_bandwidth_gbps' in stats + assert 'tier_cpu_write_bandwidth_gbps' in stats + assert 'tier_storage_read_bandwidth_gbps' in stats + assert 'tier_storage_write_bandwidth_gbps' in stats + + +@pytest.mark.skipif(not CUDA_AVAILABLE, reason="CUDA not available") +class TestPerTierPhaseMetricsWithGPU: + """Tests for per-tier metrics when GPU is enabled.""" + + @pytest.fixture + def tiny_model_config(self): + """Return the tiny-1b model config for fast tests.""" + return MODEL_CONFIGS['tiny-1b'] + + @pytest.fixture + def multi_tier_cache_with_gpu(self, tiny_model_config): + """Return a MultiTierCache with GPU enabled.""" + return MultiTierCache( + model_config=tiny_model_config, + gpu_memory_gb=1.0, # 1GB GPU + cpu_memory_gb=0.1, # 100MB CPU + seed=42 + ) + + def test_gpu_write_bytes_increment_on_allocate(self, multi_tier_cache_with_gpu): + """Allocating to GPU tier should increment tier_gpu_kv_bytes_written.""" + # Get initial stats + stats_before = multi_tier_cache_with_gpu.get_stats(duration=1.0) + gpu_written_before = stats_before.get('tier_gpu_kv_bytes_written_gb', 0) + + # Allocate cache entry (should go to GPU first) + success, location, _ = multi_tier_cache_with_gpu.allocate_cache("test_entry", num_tokens=100) + assert success + assert location == 'gpu' + + # Check that GPU write bytes increased + stats_after = multi_tier_cache_with_gpu.get_stats(duration=1.0) + gpu_written_after = stats_after.get('tier_gpu_kv_bytes_written_gb', 0) + + assert gpu_written_after > gpu_written_before, \ + f"GPU write bytes should increase: {gpu_written_before} -> {gpu_written_after}" + + def test_gpu_read_bytes_increment_on_access(self, multi_tier_cache_with_gpu): + """Accessing from GPU tier should increment tier_gpu_kv_bytes_read.""" + # Allocate first + multi_tier_cache_with_gpu.allocate_cache("test_entry", num_tokens=100) + + # Get stats before access + stats_before = multi_tier_cache_with_gpu.get_stats(duration=1.0) + gpu_read_before = stats_before.get('tier_gpu_kv_bytes_read_gb', 0) + + # Access the cache entry + location, _ = multi_tier_cache_with_gpu.access_cache("test_entry", InferencePhase.DECODE) + assert location == 'gpu' + 
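+        # The 100-token entry fits easily in the 1 GB GPU tier, so the access above
+        # is served from GPU and should be reflected in the GPU-tier read counters.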
+ # Check that GPU read bytes increased + stats_after = multi_tier_cache_with_gpu.get_stats(duration=1.0) + gpu_read_after = stats_after.get('tier_gpu_kv_bytes_read_gb', 0) + + assert gpu_read_after > gpu_read_before, \ + f"GPU read bytes should increase: {gpu_read_before} -> {gpu_read_after}" + + def test_gpu_bandwidth_calculated(self, multi_tier_cache_with_gpu): + """GPU tier bandwidth stats should be calculated.""" + # Do some I/O + for i in range(5): + multi_tier_cache_with_gpu.allocate_cache(f"entry_{i}", num_tokens=100) + for i in range(5): + multi_tier_cache_with_gpu.access_cache(f"entry_{i}", InferencePhase.DECODE) + + stats = multi_tier_cache_with_gpu.get_stats(duration=1.0) + + # GPU bandwidth stats should exist + assert 'tier_gpu_read_bandwidth_gbps' in stats + assert 'tier_gpu_write_bandwidth_gbps' in stats + + # ============================================================================= # Main entry point for running without pytest # ============================================================================= @@ -885,8 +1635,10 @@ def pytest_configure(config): """Add metadata to pytest-html report.""" if hasattr(config, '_metadata'): config._metadata['Project'] = 'MLPerf v3 KV Cache Benchmark' + config._metadata['Source File'] = 'kv-cache.py' config._metadata['Models'] = 'tiny-1b, mistral-7b, llama2-7b, llama3.1-8b, llama3.1-70b-instruct' config._metadata['Test File'] = 'test_kv_cache.py' + config._metadata['New Features Tested'] = 'ConfigLoader, Extended QoS (p999/p9999), cfg() helper, storage_* naming' def pytest_html_report_title(report): From 29562889768fe75e312c55e8b822503fb478848d Mon Sep 17 00:00:00 2001 From: Hazem Awadallah Date: Tue, 27 Jan 2026 15:44:14 -0800 Subject: [PATCH 04/16] docs(readme): comprehensive documentation for v3.0 - Add Configuration section with YAML parameter reference - Add MLPerf Submission Guidelines with validated commands - Add Excel metrics reference table with all output columns - Add installation instructions including pyyaml dependency - Add CLI arguments vs config file precedence documentation - Add workload definitions and tier configuration examples - Add troubleshooting section for common issues --- kv_cache_benchmark/README.md | 2098 +++++++++++++++++++++------------- 1 file changed, 1332 insertions(+), 766 deletions(-) diff --git a/kv_cache_benchmark/README.md b/kv_cache_benchmark/README.md index 5f0637c1..b7599b28 100644 --- a/kv_cache_benchmark/README.md +++ b/kv_cache_benchmark/README.md @@ -1,766 +1,1332 @@ -# MLPerf Storage KV Cache Benchmark - -A storage benchmarking tool for Large Language Model inference systems. This benchmark measures the performance of your storage subsystem under realistic KV cache offloading workloads, helping you answer critical questions about hardware capacity and configuration. - -**Author:** Hazem Awadallah, Kingston Digital -**License:** Apache 2.0 -**Version:** MLPerf Storage v3.0 (Enhanced) - ---- - -## Table of Contents - -1. [What This Benchmark Does](#what-this-benchmark-does) -2. [Architecture Overview](#architecture-overview) -3. [System Requirements](#system-requirements) -4. [Installation](#installation) -5. [Quick Start](#quick-start) -6. [Running the Benchmark](#running-the-benchmark) -7. [ShareGPT Replay Workloads](#sharegpt-replay-workloads) -8. [Using the Wrapper Script](#using-the-wrapper-script) -9. [Understanding Results](#understanding-results) -10. [Unit Testing](#unit-testing) -11. [Excel Export](#excel-export) -12. 
[MLPerf Submission Guidelines](#mlperf-submission-guidelines) -13. [Troubleshooting](#troubleshooting) - ---- - -## What This Benchmark Does - -During LLM inference, models store intermediate attention data in a structure called the KV (Key-Value) cache. This cache grows with conversation length and can consume enormous amounts of memory. Production systems offload this cache from expensive GPU VRAM to cheaper CPU RAM or NVMe storage. - -This benchmark simulates that offloading behavior. It generates realistic multi-user inference workloads and measures how your storage performs under pressure. It measures these components: - -- How many concurrent users your hardware can support -- Whether your NVMe drive is fast enough to handle cache spillover -- The real latency impact of each storage tier -- Where the bottleneck sits in your system - -This is not a pass/fail test. It is a diagnostic tool for system architects and performance engineers. - ---- - -## Architecture Overview - -The benchmark implements a three-tier memory hierarchy that mirrors production LLM serving systems. - -``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ KV Cache Benchmark Architecture │ -└─────────────────────────────────────────────────────────────────────────────┘ - - ┌──────────────────┐ - │ User Requests │ - │ (Multi-tenant) │ - └────────┬─────────┘ - │ - ▼ - ┌──────────────────────────────────────┐ - │ Request Queue │ - │ (Priority-based: QoS levels) │ - │ Interactive > Responsive > Batch │ - └──────────────────┬───────────────────┘ - │ - ▼ - ┌────────────────────────────────────────────────────────┐ - │ IntegratedBenchmark │ - │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ │ - │ │ Prefill │ │ Decode │ │ Conversation │ │ - │ │ (Write) │ │ (Read) │ │ Manager │ │ - │ └──────┬──────┘ └──────┬──────┘ └────────┬────────┘ │ - └─────────┼────────────────┼─────────────────┼───────────┘ - │ │ │ - └────────────────┼─────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────────────────┐ -│ MultiTierCache │ -│ (Waterfall LRU Eviction) │ -│ │ -│ New Data ─────► Always targets fastest available tier │ -│ If full, LRU entry cascades down │ -│ │ -│ ┌─────────────────────────────────────────────────────────────────────┐ │ -│ │ │ │ -│ │ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ │ -│ │ │ GPU VRAM │ │ CPU RAM │ │ NVMe │ │ │ -│ │ │ (Tier 1) │─────►│ (Tier 2) │─────►│ (Tier 3) │ │ │ -│ │ │ │ LRU │ │ LRU │ │ │ │ -│ │ │ Sub-ms │evict │ Tens of ms │evict │ Hundreds │ │ │ -│ │ │ latency │ │ latency │ │ of ms │ │ │ -│ │ │ │ │ │ │ │ │ │ -│ │ │ PyTorch/CuPy │ │ NumPy arrays │ │ .npy files │ │ │ -│ │ │ tensors │ │ in memory │ │ on disk │ │ │ -│ │ └───────────────┘ └───────────────┘ └───────────────┘ │ │ -│ │ │ │ -│ │ ◄──── HOT DATA ────────────────────────────── COLD DATA ────► │ │ -│ │ │ │ -│ └─────────────────────────────────────────────────────────────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────────────────┘ -``` - -### Key Components - -**MultiTierCache**: The core engine. It decides where to place data based on available space and access patterns. New data always targets the fastest tier. When that tier fills up, the least recently used entry gets pushed down to the next tier. - -**Inference Phases**: The benchmark models two distinct I/O patterns: -- **Prefill**: Write-heavy. Processing the user prompt generates new KV cache entries. -- **Decode**: Read-heavy. 
Generating each output token requires reading the existing cache. - -**User Simulation**: Creates realistic traffic from multiple concurrent users with different behaviors (chatbot, coding assistant, document analysis) and priority levels. - -**Autoscaler**: Automatically adjusts user load to find either the maximum users your system can handle (QoS mode) or the peak throughput of your storage (capacity mode). - ---- - -## System Requirements - -### Minimum - -- CPU: 8+ cores (AMD EPYC, Intel Xeon) -- RAM: 32 GB -- Storage: 256 GB free space on SSD -- OS: Linux (Ubuntu 22.04, RHEL 9, or similar) -- Python: 3.8 or higher -- No GPU required (runs in CPU-only mode) - -### Recommended - -- CPU: 32+ cores -- RAM: 128 GB or more -- GPU: NVIDIA A100/H100 with 40+ GB VRAM (optional but enables full three-tier testing) -- Storage: 1 TB+ on NVMe (PCIe Gen4 or Gen5) -- Tools: `bc`, `jq` for the wrapper script - ---- - -## Installation - -1. Clone or download this repository. - -2. Install Python dependencies: - -```bash -pip install -r requirements.txt -``` - -Or install core dependencies manually: - -```bash -pip install numpy -``` - -3. For GPU support (optional): - -```bash -pip install torch # or cupy-cuda12x for CuPy -``` - -4. For ShareGPT replay workloads (optional): - -```bash -pip install tiktoken -``` - -5. For Excel export (optional): - -```bash -pip install pandas openpyxl -``` - -6. Verify the installation: - -```bash -python3 kv-cache.py --help -``` - ---- - -## Quick Start - -Run a basic storage test with 50 users for 2 minutes: - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 120 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results.json -``` - -This forces all cache operations to hit your NVMe drive, giving you a baseline measurement of storage performance. - ---- - -## Running the Benchmark - -### Command Line Options - -``` -python3 kv-cache.py [options] - -Required Arguments: - --model MODEL Model configuration to use. Choices: - tiny-1b, mistral-7b, llama2-7b, llama3.1-8b, - llama3.1-70b-instruct - --num-users N Number of concurrent users to simulate - --duration SECONDS Duration of the benchmark in seconds - -Memory Configuration: - --gpu-mem-gb N GPU VRAM budget in GB (0 to disable GPU tier) - --cpu-mem-gb N CPU RAM budget in GB (0 to disable CPU tier) - --cache-dir PATH Directory for NVMe cache files (defaults to temp directory) - -Token Generation: - --generation-mode Token generation speed simulation. Choices: - - none: Pure storage test, no GPU simulation - - fast: 2ms per token (high-end GPU) - - realistic: 30ms per token (typical production) - -Caching Features: - --disable-multi-turn Disable multi-turn conversation caching - --disable-prefix-caching - Disable prefix caching (shared system prompts) - -Autoscaling: - --enable-autoscaling Enable workload autoscaling - --autoscaler-mode Autoscaling strategy. 
Choices: - - qos: Latency-based, finds max users at target saturation - - capacity: Throughput-based, finds peak storage performance - --target-saturation N Target storage saturation for QoS autoscaling (0.0-1.0, - default: 0.8) - -ShareGPT Replay (NEW): - --dataset-path PATH Path to ShareGPT JSON for realistic workload replay - --max-conversations N Max conversations to load from dataset (default: 500) - --request-rate RATE Target request arrival rate (requests/sec) - --max-requests N Stop after N requests (for fixed-length runs) - -RAG Workload: - --enable-rag Enable RAG workload simulation - --rag-num-docs N Number of RAG documents to ingest - -Performance and Output: - --performance-profile Profile for pass/fail criteria. Choices: - - latency: Default, evaluates P95 latency targets - - throughput: For MLPerf submission, evaluates tokens/sec - --output FILE Write results to JSON file - --xlsx-output FILE Export results to Excel/CSV file (NEW) - --seed N Seed for random number generators (required for MLPerf - reproducibility) - -Resource Limits: - --max-concurrent-allocs N - Limit concurrent cache allocations to bound RAM usage. - 0 = unlimited. Recommended: 8-16 for large models to - prevent memory explosion. -``` - -### Test Scenarios - -#### Scenario 1: Storage-Only Baseline - -Isolate your NVMe drive by setting GPU memory to zero. This tells you the raw performance of your storage. - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 180 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_storage_only.json -``` - -#### Scenario 2: Realistic Production Setup - -Test a balanced three-tier configuration that mirrors production deployment. - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 100 \ - --duration 300 \ - --gpu-mem-gb 16 \ - --cpu-mem-gb 32 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_production.json -``` - -#### Scenario 3: Find Maximum User Count (QoS Mode) - -Let the autoscaler discover how many users your system can handle while maintaining acceptable latency. - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 20 \ - --duration 300 \ - --gpu-mem-gb 16 \ - --cpu-mem-gb 32 \ - --enable-autoscaling \ - --autoscaler-mode qos \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_autoscale_qos.json -``` - -#### Scenario 4: Find Peak Storage Throughput (Capacity Mode) - -Discover the absolute maximum I/O your storage can deliver by ignoring latency constraints. - -```bash -python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --num-users 10 \ - --duration 180 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --enable-autoscaling \ - --autoscaler-mode capacity \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_capacity.json -``` - ---- - -## ShareGPT Replay Workloads - -While synthetic workloads are excellent for controlled stress testing, they may not capture the nuances of real human-AI interaction. The **ShareGPT Replay** feature addresses this by loading actual conversation data. - -### Why Use ShareGPT? 
- -Real conversations exhibit different patterns than synthetic workloads: -- **Higher cache locality**: Users ask follow-up questions, reusing context -- **Variable context sizes**: Real queries vary wildly (10-16,000 tokens) -- **Multi-turn structure**: Conversation flows are preserved - -### Downloading the ShareGPT Dataset - -Download the full dataset from Hugging Face (~1.2 GB): - -```bash -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -``` - -**Alternative: Smaller subset for quick testing (~40 MB):** - -```bash -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json -``` - -### Basic ShareGPT Invocation - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-conversations 500 \ - --num-users 50 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt.json -``` - -### ShareGPT with Rate Limiting - -Control the request arrival rate for steady-state testing: - -```bash -python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-conversations 1000 \ - --request-rate 10.0 \ - --num-users 100 \ - --duration 600 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 8 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt_rate_limited.json -``` - -### ShareGPT with Fixed Request Count - -Run exactly N requests for reproducible benchmarks: - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-requests 5000 \ - --num-users 50 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt_fixed.json -``` - -### Comparing Real vs Synthetic Workloads - -| Metric | ShareGPT (Real) | Synthetic (Random) | -| :--- | :--- | :--- | -| Mean Context Size | ~133 tokens | ~2,676 tokens | -| Cache Hit Rate | 85-97% | 50-70% | -| Multi-turn Locality | High | Medium | -| Throughput | Higher | Lower | -| NVMe Stress | Moderate | Extreme | - -**Use ShareGPT** when you want to model real chatbot/assistant usage. -**Use Synthetic** when you want worst-case stress testing or controlled experiments. - ---- - -## Using the Wrapper Script - -The `kv-cache-wrapper.sh` script automates a complete benchmark suite. It detects your hardware, calculates appropriate parameters, and runs multiple test scenarios. - -### Basic Usage - -```bash -./kv-cache-wrapper.sh -``` - -This runs all test scenarios with default settings. Expect roughly 30 minutes for the full suite. 
- -### Options - -``` -./kv-cache-wrapper.sh [options] - - -m MODEL Model to benchmark (default: llama3.1-8b) - -t SECONDS Duration for tier comparison tests (default: 120) - -s SECONDS Duration for storage saturation test (default: 180) - -r SECONDS Duration for production test (default: 180) - -a SECONDS Duration for autoscaling tests (default: 300) - -w LIST Comma-separated list of workloads to run - -u USERS Override baseline user count - -U USERS Override high-load user count - -R Enable RAG workload - -D DOCS Number of RAG documents (default: 10) - -h Show help -``` - -### Available Workloads - -```bash -# Run only the storage isolation test -./kv-cache-wrapper.sh -w storage-only - -# Run production and autoscaling tests -./kv-cache-wrapper.sh -w production,autoscale - -# Run MLPerf submission tests -./kv-cache-wrapper.sh -w mlperf_submission -``` - ---- - -## Understanding Results - -### Key Metrics - -**Throughput (tokens/sec)**: How many tokens the system processes per second. Higher is better. - -**Storage Throughput (tokens/sec)**: Raw I/O performance calculated from storage latency, not wall-clock time. This is the fairer metric for comparing storage tiers. - -**End-to-End Latency**: Total time from request submission to completion. This is what users experience. - -**Storage I/O Latency**: Time spent reading from and writing to storage tiers. This measures your hardware. - -**Queue Wait Time**: Time requests spend waiting before processing begins. If this dominates, your system is overloaded. - -**Cache Hit Rate**: Percentage of reads served from cache. Higher rates mean less storage pressure. - -### Reading the Output - -``` -### STORAGE PERFORMANCE ASSESSMENT: PASS ### - Criteria Passed: 4/4 - [PASS] NVMe Write P95 < 500ms: 45.20ms - [PASS] NVMe Read P95 < 200ms: 123.45ms - [PASS] CPU RAM P95 < 150ms: 12.30ms - [PASS] Cache Hit Rate > 30%: 67.5% - -### OVERALL PERFORMANCE ### - Total Requests: 2847 - Total Tokens Generated: 489,231 - Avg Throughput: 1,630.77 tok/s - Storage Throughput: 2,105.32 tok/s - -### LATENCY BREAKDOWN ### - End-to-End: mean 89.3ms, P50 45.2ms, P95 312.4ms - Storage I/O: mean 23.1ms, P50 12.4ms, P95 89.2ms -``` - ---- - -## Unit Testing - -This package includes a comprehensive pytest-based test suite to verify core functionality without running the full benchmark. 
- -### Running Tests - -```bash -# Run all tests with verbose output -pytest test_kv_cache.py -v - -# Run with shorter traceback -pytest test_kv_cache.py -v --tb=short - -# Run specific test class -pytest test_kv_cache.py -k "TestModelConfig" -v - -# Run only CPU tests (skip GPU tests if no CUDA) -pytest test_kv_cache.py -v -m "not skipif" -``` - -### Test Coverage - -The test suite covers 12 component categories: - -| Test Class | Coverage | -|------------|----------| -| `TestModelConfig` | Model configurations, KV cache size calculations | -| `TestInferenceRequest` | Request dataclass, cache key generation | -| `TestQoSProfiles` | QoS levels, SLA targets, priorities | -| `TestKVCacheGenerator` | Determinism, shapes, dtypes, precomputed buffers | -| `TestCPUMemoryBackend` | Write/read/delete/clear operations | -| `TestNVMeBackend` | File I/O, metadata, temp directories | -| `TestGPUMemoryBackend` | CUDA tensors, device placement (skipped without GPU) | -| `TestConversationManager` | Multi-turn tracking, eviction | -| `TestUserSimulator` | User generation, QoS distribution | -| `TestMultiTierCache` | CPU-only mode, allocation, access | -| `TestMultiTierCacheWithGPU` | GPU tier, waterfall eviction (skipped without GPU) | -| `TestXLSXExport` | CSV/Excel export (skipped without pandas) | - -### Expected Runtime - -- **Without GPU**: ~3-5 seconds -- **With GPU**: ~5-10 seconds - -GPU tests are automatically skipped if CUDA is not available. - ---- - -## Excel Export - -The benchmark can export results directly to Excel or CSV format for analysis. - -### Basic Usage - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 120 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --seed 42 \ - --output results.json \ - --xlsx-output results.xlsx -``` - -### Output Format - -The Excel file contains a single row with all key metrics: - -| Column | Description | -|--------|-------------| -| Model | Model configuration used | -| Num Users | Concurrent user count | -| Duration (s) | Benchmark duration | -| GPU Mem (GB) | GPU memory budget | -| CPU Mem (GB) | CPU memory budget | -| Total Requests | Requests completed | -| Total Tokens | Tokens processed | -| Avg Throughput (tok/s) | Wall-clock throughput | -| Storage Throughput (tok/s) | Storage I/O throughput | -| Cache Hit Rate | Percentage of cache hits | -| E2E Latency P95 (ms) | End-to-end 95th percentile | -| Storage IO P95 (ms) | Storage I/O 95th percentile | - -### Fallback Behavior - -- **With openpyxl**: Exports to `.xlsx` format -- **Without openpyxl**: Falls back to `.csv` format -- **Without pandas**: Export is skipped with a warning - ---- - -## MLPerf Submission Guidelines - -For official MLPerf v3.0 storage submissions, use these standardized commands. **These invocations have been validated through extensive discovery testing** (1,411 Fast system tests, 268 Slow system tests comparing 14,000 MB/s vs 3,000 MB/s storage). - -### Discovery Test Key Findings - -| Finding | Impact | -|---------|--------| -| **Metric selection depends on cpu_mem** | Storage Throughput shows only 1.1x at cpu_mem=0GB but 2.2x at cpu_mem=4GB | -| **Best models for differentiation** | llama3.1-8b and mistral-7b show 2.31x ratio | -| **High variance observed** | CV 50-125%, requires 3-5 trials minimum | -| **100% win rate metrics** | Decode Bytes Read and Wall-Clock Throughput at cpu_mem=0GB | - -### Option 1: Maximum Storage Stress (cpu_mem=0GB) - -Use when you want to stress test NVMe and measure I/O volume differentiation. 
- -**Primary Metrics:** Decode Bytes Read (2.62x differentiation), Wall-Clock Throughput (2.43x differentiation) - -```bash -# MLPerf v3.0: Maximum Storage Stress Test (8B Model) -# Run 3-5 trials for statistical significance -for trial in 1 2 3 4 5; do - python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 200 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 0 \ - --max-concurrent-allocs 16 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output mlperf_v3_stress_8b_trial${trial}.json -done -``` - -**⚠️ Important:** At cpu_mem=0GB, do NOT use Storage Throughput as your primary metric—use Decode Bytes Read or Wall-Clock Throughput instead. - -### Option 2: Storage Throughput Focus (cpu_mem=4GB) - -Use when you want Storage Throughput (tok/s) as your primary metric. - -**Primary Metric:** Storage Throughput (2.2x differentiation, 97% win rate) - -```bash -# MLPerf v3.0: Storage Throughput Test (8B Model) -for trial in 1 2 3 4 5; do - python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 100 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --max-concurrent-allocs 0 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output mlperf_v3_throughput_8b_trial${trial}.json -done -``` - -### Option 3: Large Model Submission (70B) - -For maximum per-request storage stress (10x larger KV cache per token): - -```bash -# MLPerf v3.0: Large Model Storage Stress -for trial in 1 2 3; do - python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --num-users 70 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 0 \ - --max-concurrent-allocs 4 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output mlperf_v3_stress_70b_trial${trial}.json -done -``` - -### Critical Parameters (Discovery-Validated) - -| Parameter | Value | Rationale | -|-----------|-------|-----------| -| **seed 42** | Required | Reproducibility across systems | -| **gpu-mem-gb 0** | Required | Isolates storage performance | -| **cpu-mem-gb** | 0 or 4 | 0GB for max stress (use I/O volume metrics), 4GB for Storage Throughput metric | -| **max-concurrent-allocs** | 0, 4, or 16 | 0 for throughput, 16 for stress testing | -| **generation-mode** | none or realistic | none for pure I/O, realistic for production simulation | -| **num-users** | 100-200 | Differentiation stable across range; higher = more throughput | -| **duration** | 300-600 | 5-10 minutes for stable metrics | - -### Trial Requirements - -| User Count | Variance (CV) | Minimum Trials | -|------------|---------------|----------------| -| 10 users | ~52% | 3 | -| 50-100 users | ~115-125% | 3-5 | -| 200 users | ~110-120% | 3-5 | - -Report **median** rather than mean for publication-quality results. - ---- - -## Troubleshooting - -### Out of Memory Errors - -Reduce the number of concurrent users or limit parallel allocations: - -```bash -python3 kv-cache.py ... --max-concurrent-allocs 50 -``` - -### Benchmark Hangs - -The system may be thrashing. Reduce users or increase memory budgets. - -### Poor Cache Hit Rates - -Low hit rates indicate your working set exceeds available fast memory. Either: -- Increase GPU/CPU memory budgets -- Reduce user count -- Accept that cold data will hit storage - -### Results Vary Between Runs - -Use the `--seed` flag for reproducible results. 
- ---- - -## Files in This Package - -- `kv-cache.py`: Main benchmark implementation with ShareGPT support -- `test_kv_cache.py`: Pytest unit test suite -- `requirements.txt`: Python dependencies -- `README.md`: This documentation -- `MLperf v3 KV cache proposal.md`: Detailed technical documentation - ---- - -## License - -Apache License 2.0 - ---- - -## Contact - -For questions or feedback, open an issue on the repository or contact the MLPerf Storage Working Group. +# MLPerf Storage KV Cache Benchmark + +A storage benchmarking tool for Large Language Model inference systems. This benchmark measures the performance of your storage subsystem under realistic KV cache offloading workloads, helping you answer critical questions about hardware capacity and configuration. + +**Author:** Hazem Awadallah, Kingston Digital +**License:** Apache 2.0 +**Version:** MLPerf Storage v3.0 (Enhanced) +**Updated:** January 27, 2026 + +--- + +## Table of Contents + +1. [What This Benchmark Does](#what-this-benchmark-does) +2. [Architecture Overview](#architecture-overview) +3. [System Requirements](#system-requirements) +4. [Installation](#installation) +5. [Configuration](#configuration) +6. [Quick Start](#quick-start) +7. [Running the Benchmark](#running-the-benchmark) +8. [ShareGPT Replay Workloads](#sharegpt-replay-workloads) +9. [Using the Wrapper Script](#using-the-wrapper-script) +10. [Understanding Results](#understanding-results) +11. [Unit Testing](#unit-testing) +12. [Excel Export](#excel-export) +13. [MLPerf Submission Guidelines](#mlperf-submission-guidelines) +14. [Troubleshooting](#troubleshooting) + +--- + +## What This Benchmark Does + +During LLM inference, models store intermediate attention data in a structure called the KV (Key-Value) cache. This cache grows with conversation length and can consume enormous amounts of memory. Production systems offload this cache from expensive GPU VRAM to cheaper CPU RAM or NVMe storage. + +This benchmark simulates that offloading behavior. It generates realistic multi-user inference workloads and measures how your storage performs under pressure. It measures these components: + +- How many concurrent users your hardware can support +- Whether your NVMe drive is fast enough to handle cache spillover +- The real latency impact of each storage tier +- Where the bottleneck sits in your system + +This is not a pass/fail test. It is a diagnostic tool for system architects and performance engineers. + +--- + +## Architecture Overview + +The benchmark implements a three-tier memory hierarchy that mirrors production LLM serving systems. 
+ +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ KV Cache Benchmark Architecture │ +└─────────────────────────────────────────────────────────────────────────────┘ + + ┌──────────────────┐ + │ User Requests │ + │ (Multi-tenant) │ + └────────┬─────────┘ + │ + ▼ + ┌──────────────────────────────────────┐ + │ Request Queue │ + │ (Priority-based: QoS levels) │ + │ Interactive > Responsive > Batch │ + └──────────────────┬───────────────────┘ + │ + ▼ + ┌────────────────────────────────────────────────────────┐ + │ IntegratedBenchmark │ + │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ │ + │ │ Prefill │ │ Decode │ │ Conversation │ │ + │ │ (Write) │ │ (Read) │ │ Manager │ │ + │ └──────┬──────┘ └──────┬──────┘ └────────┬────────┘ │ + └─────────┼────────────────┼─────────────────┼───────────┘ + │ │ │ + └────────────────┼─────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ MultiTierCache │ +│ (Waterfall LRU Eviction) │ +│ │ +│ New Data ─────► Always targets fastest available tier │ +│ If full, LRU entry cascades down │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ │ │ +│ │ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ │ +│ │ │ GPU VRAM │ │ CPU RAM │ │ NVMe │ │ │ +│ │ │ (Tier 1) │─────►│ (Tier 2) │─────►│ (Tier 3) │ │ │ +│ │ │ │ LRU │ │ LRU │ │ │ │ +│ │ │ Sub-ms │evict │ Tens of ms │evict │ Hundreds │ │ │ +│ │ │ latency │ │ latency │ │ of ms │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ PyTorch/CuPy │ │ NumPy arrays │ │ .npy files │ │ │ +│ │ │ tensors │ │ in memory │ │ on disk │ │ │ +│ │ └───────────────┘ └───────────────┘ └───────────────┘ │ │ +│ │ │ │ +│ │ ◄──── HOT DATA ────────────────────────────── COLD DATA ────► │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### Key Components + +**MultiTierCache**: The core engine. It decides where to place data based on available space and access patterns. New data always targets the fastest tier. When that tier fills up, the least recently used entry gets pushed down to the next tier. + +**Inference Phases**: The benchmark models two distinct I/O patterns: +- **Prefill**: Write-heavy. Processing the user prompt generates new KV cache entries. +- **Decode**: Read-heavy. Generating each output token requires reading the existing cache. + +**User Simulation**: Creates realistic traffic from multiple concurrent users with different behaviors (chatbot, coding assistant, document analysis) and priority levels. + +**Autoscaler**: Automatically adjusts user load to find either the maximum users your system can handle (QoS mode) or the peak throughput of your storage (capacity mode). + +--- + +## System Requirements + +### Minimum + +- CPU: 8+ cores (AMD EPYC, Intel Xeon) +- RAM: 32 GB +- Storage: 256 GB free space on SSD +- OS: Linux (Ubuntu 22.04, RHEL 9, or similar) or Windows +- Python: 3.8 or higher +- No GPU required (runs in CPU-only mode) + +### Recommended + +- CPU: 32+ cores +- RAM: 128 GB or more +- GPU: NVIDIA A100/H100 with 40+ GB VRAM (optional but enables full three-tier testing) +- Storage: 1 TB+ on NVMe (PCIe Gen4 or Gen5) +- Tools: `bc`, `jq` for the wrapper script (Linux) + +--- + +## Installation + +1. Clone or download this repository. + +2. 
Install Python dependencies: + +```bash +pip install -r requirements.txt +``` + +Or install core dependencies manually: + +```bash +pip install numpy pyyaml +``` + +3. For GPU support (optional): + +```bash +pip install torch # or cupy-cuda12x for CuPy +``` + +4. For ShareGPT replay workloads (optional): + +```bash +pip install tiktoken +``` + +5. For Excel export (optional): + +```bash +pip install pandas openpyxl +``` + +6. Verify the installation: + +```bash +python3 kv-cache.py --help +``` + +--- + +## Configuration + +The benchmark supports a YAML configuration file (`config.yaml`) for tuning internal parameters without modifying the source code. This is the **recommended approach** for MLPerf submissions to ensure reproducibility. + +### Using the Configuration File + +```bash +python3 kv-cache.py --config config.yaml [other CLI arguments] +``` + +**Note:** CLI arguments always take precedence over config file values for overlapping settings. + +### Configuration File Parameters (config.yaml) + +The configuration file controls internal benchmark behavior that affects workload realism and cache dynamics. These settings are **not** exposed as CLI arguments to prevent accidental misconfigurations in MLPerf submissions. + +> **Tip:** For most benchmarking scenarios, the defaults are carefully tuned. Only modify these if you understand the impact on your results. + +--- + +#### User Templates + +Controls the three simulated user personas. Each persona has distinct characteristics that model real-world usage patterns. + +| Persona | Behavior | Use Case | +|---------|----------|----------| +| **Chatbot** | Short prompts, quick responses, fast iteration | Customer service bots, casual conversation | +| **Coding** | Medium prompts with code context, moderate responses | IDE assistants, code completion | +| **Document** | Long prompts with full documents, lengthy analysis | Document summarization, legal/medical analysis | + +| Parameter | Type | Default | Impact | +|-----------|------|---------|--------| +| `user_templates.chatbot.context_range` | [min, max] | [256, 1024] | **KV cache write size per request.** Smaller values reduce storage pressure; larger values stress NVMe throughput. | +| `user_templates.chatbot.generation_range` | [min, max] | [50, 150] | **Decode phase duration.** More tokens = more cache reads per request. Affects read/write ratio. | +| `user_templates.chatbot.think_time_range` | [min, max] | [0.1, 0.5] | **Request inter-arrival time.** Shorter = higher request rate, more concurrent cache operations. | +| `user_templates.coding.context_range` | [min, max] | [1024, 4096] | Medium-length contexts typical of code completion scenarios. 4× larger than chatbot. | +| `user_templates.coding.generation_range` | [min, max] | [100, 500] | Code generation often produces longer outputs than conversational AI. | +| `user_templates.coding.think_time_range` | [min, max] | [0.2, 1.0] | Developers pause to review generated code before next request. | +| `user_templates.document.context_range` | [min, max] | [2048, 8192] | **Stress test scenarios.** 8K tokens creates ~1 GB of total KV cache data for 8B models (128 KB/token × 8,192 tokens). | +| `user_templates.document.generation_range` | [min, max] | [200, 800] | Long-form analysis outputs (summaries, reports). | +| `user_templates.document.think_time_range` | [min, max] | [0.3, 1.5] | Users read lengthy outputs before continuing. | + +--- + +#### Token Generation Timing + +Simulates GPU compute time per generated token. 
This controls the backpressure on the storage system. + +| Mode | Default (sec/token) | When to Use | +|------|---------------------|-------------| +| `none` | 0.0 | **Pure storage benchmarking.** 100% of measured latency is I/O. Use for MLPerf storage submissions. | +| `fast` | 0.002 (2ms) | Simulates high-end GPU (H100) with optimized inference. Creates light backpressure. | +| `realistic` | 0.030 (30ms) | Simulates typical production GPU throughput. Balances compute/storage for end-to-end analysis. | + +**Why it matters:** With `generation_mode=none`, the benchmark hammers storage as fast as possible. With `realistic`, storage has time to absorb writes between decode steps, showing how your system performs under sustained (not burst) load. + +--- + +#### QoS Profiles (Quality of Service) + +Defines SLA targets for multi-tenant request prioritization. The benchmark tracks violations against these thresholds. + +| Profile | Typical Use Case | Priority | +|---------|------------------|----------| +| **Interactive** | Live chat UIs, real-time assistants | Highest (3) | +| **Responsive** | API calls, near-real-time processing | Medium (2) | +| **Batch** | Overnight jobs, bulk processing | Lowest (1) | + +| Parameter | Default | Meaning | +|-----------|---------|---------| +| `qos_profiles.interactive.target_latency_p95_ms` | 50 | 95% of interactive requests must complete within 50ms. Aggressive target for premium users. | +| `qos_profiles.interactive.target_latency_p99_ms` | 100 | 99% within 100ms. Allows some slack for tail latency. | +| `qos_profiles.interactive.target_latency_p999_ms` | 150 | 99.9% (3 nines) within 150ms. Production SLOs often specify this level. | +| `qos_profiles.interactive.target_latency_p9999_ms` | 200 | 99.99% (4 nines) within 200ms. Critical for detecting storage-induced tail latency. | +| `qos_profiles.interactive.priority` | 3 | Highest priority. These requests are dequeued first. | +| `qos_profiles.responsive.target_latency_p95_ms` | 100 | 2× the interactive target. Acceptable for API consumers. | +| `qos_profiles.responsive.target_latency_p99_ms` | 200 | 99% within 200ms. | +| `qos_profiles.responsive.target_latency_p999_ms` | 350 | 99.9% within 350ms. | +| `qos_profiles.responsive.target_latency_p9999_ms` | 500 | 99.99% within 500ms. | +| `qos_profiles.responsive.priority` | 2 | Medium priority. | +| `qos_profiles.batch.target_latency_p95_ms` | 1000 | 1 second. Batch jobs are latency-tolerant. | +| `qos_profiles.batch.target_latency_p99_ms` | 5000 | 5 seconds. Acceptable for offline processing. | +| `qos_profiles.batch.target_latency_p999_ms` | 7500 | 7.5 seconds. | +| `qos_profiles.batch.target_latency_p9999_ms` | 10000 | 10 seconds. Even worst-case should complete eventually. | +| `qos_profiles.batch.priority` | 1 | Lowest priority. Processed when interactive/responsive queues are empty. | + +> **Research Basis for QoS Targets** (see [sources.md](sources.md) for full citations): +> - **Interactive (50ms P95, 100ms P99)**: Based on Nielsen Norman Group's 0.1s "instant" threshold, Google RAIL <100ms response target, and observed production LLM APIs (Anthropic Claude TTFT: 50–150ms). +> - **Responsive (100ms P95, 200ms P99)**: Based on Google Core Web Vitals FID <100ms "good" threshold, INP ≤200ms target, and Vercel Edge Functions P99 <200ms. 
+> - **Batch (1000ms P95, 5000ms P99)**: Based on AWS ALB healthy target <1s, and research showing batch workloads tolerate >1s latency ([Splitwise paper](https://arxiv.org/abs/2401.07935): 80% of production requests need <200ms). +> +> **Note:** MLPerf Inference v4.0–v5.0 defines Server/Offline scenarios but does **not** prescribe specific P95/P99 latency SLAs. These targets represent industry best practices, not MLPerf requirements. + +--- + +#### QoS Distribution + +Controls the probability mix of request priorities in the simulated workload. + +| Parameter | Default | Effect | +|-----------|---------|--------| +| `interactive_probability` | 0.15 | 15% of requests are INTERACTIVE. Increase to stress-test low-latency paths. | +| `responsive_threshold` | 0.50 | If not INTERACTIVE, 35% of remaining requests (50% - 15%) are RESPONSIVE. The rest are BATCH. | + +**Example distribution with defaults:** 15% Interactive, 35% Responsive, 50% Batch. + +--- + +#### Eviction Settings + +Controls the waterfall LRU eviction algorithm that moves cold data down the tier hierarchy (GPU → CPU → NVMe). + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `max_recursion_depth` | 10 | **Safety limit.** Prevents infinite cascading evictions. If you hit this limit, your tiers are severely undersized. | +| `target_usage_ratio` | 0.8 | **Tier headroom.** Keeps each tier at 80% capacity, leaving 20% buffer for burst writes. Lower values = more headroom, fewer evictions. | +| `large_entry_limit_ratio` | 0.95 | **Skip-tier threshold.** If a single entry exceeds 95% of tier capacity, skip directly to the next tier. Prevents tier thrashing with huge entries. | +| `max_evictions_hard_cap` | 5000 | **Absolute safety limit.** Stops eviction loop after 5000 entries regardless of space needs. Prevents runaway eviction under pathological conditions. | +| `max_evictions_min` | 1000 | **Minimum eviction budget.** Ensures the algorithm tries at least 1000 evictions before giving up. Helps with large-model scenarios where many small entries must be evicted. | + +**Tuning guidance:** If you see "Hit recursion limit" warnings, increase `max_recursion_depth`. If evictions dominate your latency, reduce `target_usage_ratio` to provide more headroom. + +--- + +#### GPU Backend Settings + +Controls GPU VRAM allocation and out-of-memory (OOM) recovery behavior. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `memory_fraction` | 0.9 | **VRAM budget.** Uses 90% of GPU memory, reserving 10% for framework overhead and other processes. | +| `max_eviction_attempts` | 100 | **OOM recovery limit.** On CUDA OOM, attempts up to 100 evictions to free space before failing the write. | +| `free_memory_threshold` | 0.1 | **Proactive eviction trigger.** When free GPU memory drops below 10%, begin evicting to CPU before OOM occurs. | + +**Note:** These settings only apply when `--gpu-mem-gb > 0` and PyTorch/CuPy is available. + +--- + +#### Prefix Cache Settings + +Controls hierarchical prefix caching for system prompts (e.g., "You are a helpful assistant"). + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `min_prefix_length` | 50 | **Minimum tokens for caching.** Prefixes shorter than 50 tokens aren't worth the overhead of caching. | +| `max_prefix_entries` | 1000 | **Prefix cache capacity.** LRU eviction kicks in when this limit is reached. Higher values consume more memory but improve hit rates. 
| +| `system_prompt_hit_probability` | 0.2 | **Simulation realism.** 20% of requests share a common system prompt. Increase to model deployments with standardized prompts (e.g., corporate assistants). | + +**Impact:** Higher `system_prompt_hit_probability` → higher cache hit rates → lower storage throughput (because prefixes are reused). Use 0.0 for pure storage stress testing. + +--- + +#### RAG Settings + +Controls Retrieval-Augmented Generation workload simulation, where external documents are injected into the context. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `chunk_size_tokens` | 512 | **Document chunk granularity.** Each document is split into 512-token chunks for independent caching. Smaller chunks = more cache entries, higher metadata overhead. | +| `top_k_chunks` | 5 | **Retrieval depth.** Number of chunks retrieved per RAG query. More chunks = larger context window = more KV cache I/O. | +| `max_chunk_bytes` | 268435456 | **256 MB per chunk.** Safety limit to prevent single chunks from consuming entire tiers. Particularly important for 70B models where 512 tokens ≈ 160 MB of KV cache (320 KB/token). | + +**When to enable RAG:** Use `--enable-rag` when benchmarking systems designed for document-heavy workloads (legal, medical, enterprise search). + +--- + +#### Conversation Settings + +Controls multi-turn conversation simulation, modeling how chatbot context accumulates across turns. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `max_conversations` | 1000 | **Concurrent conversation limit.** LRU eviction removes oldest conversations when this limit is hit. Higher values = more memory for conversation metadata. | +| `max_turns_per_conv` | 50 | **Conversation depth limit.** After 50 turns, the conversation resets. Prevents unbounded context growth in long-running benchmarks. | +| `end_conversation_probability` | 0.2 | **Conversation turnover rate.** 20% chance each turn ends the conversation. Lower values = longer conversations = more cache reuse. | + +**Impact on metrics:** Higher `max_turns_per_conv` and lower `end_conversation_probability` increase cache hit rates (context reuse). Use low values for stress testing (force cache misses). + +--- + +#### Autoscaler Settings + +Controls the workload autoscaler that discovers system saturation points. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `min_users` | 1 | **Lower bound.** Autoscaler won't go below 1 user. | +| `max_users` | 10000 | **Upper bound.** Autoscaler stops scaling up at 10,000 users. Prevents runaway resource consumption. | +| `scale_up_factor` | 1.2 | **Growth rate.** Increases users by 20% each scaling action (e.g., 100 → 120 → 144). | +| `scale_down_factor` | 0.8 | **Decay rate.** Decreases users by 20% when SLAs are violated (e.g., 100 → 80 → 64). | +| `consecutive_samples_required` | 2 | **Stability requirement.** Requires 2 consecutive samples agreeing on direction before scaling. Prevents oscillation from transient spikes. | + +**QoS mode vs Capacity mode:** In QoS mode, the autoscaler maximizes users while maintaining latency SLAs. In Capacity mode, it maximizes throughput regardless of latency. + +--- + +#### Decode Phase Settings + +Controls token generation batching during the decode (read-heavy) phase. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `batch_size` | 32 | **Decode batch granularity.** Reads 32 tokens worth of KV cache per decode operation. 
Larger batches amortize I/O overhead but require more memory. | + +--- + +#### ShareGPT Dataset Settings + +Controls loading and processing of real ShareGPT conversation data. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `max_context_tokens` | 8192 | **Context truncation.** Conversations longer than 8192 tokens are truncated. Prevents OOM with very long conversations. | +| `max_generation_tokens` | 2048 | **Generation truncation.** Caps simulated generation at 2048 tokens per turn. | +| `chars_per_token_estimate` | 4 | **Tokenization heuristic.** Used when tiktoken is unavailable. 4 chars/token is typical for English text. | + +--- + +#### Saturation Detection Thresholds + +Controls when the StorageMonitor considers the storage subsystem saturated. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `read_latency_p95_threshold_ms` | 100 | **Read saturation signal.** If P95 read latency exceeds 100ms, storage is considered stressed. | +| `write_latency_p95_threshold_ms` | 50 | **Write saturation signal.** Writes are more sensitive; 50ms threshold triggers concern earlier. | +| `queue_depth_threshold` | 100 | **Queue pressure signal.** More than 100 pending requests indicates backlog is building. | +| `history_window_size` | 10 | **Trend analysis window.** Uses last 10 samples to detect latency trends (increasing = saturation). | + +**Used by:** The autoscaler uses these thresholds to decide when to scale down (in QoS mode) or when peak throughput is reached (in capacity mode). + +--- + +#### Validation Limits + +Safety limits enforced by `validate_args()` to prevent accidental misconfigurations. + +| Parameter | Default | Rationale | +|-----------|---------|-----------| +| `max_users` | 100000 | Reasonable upper bound for simulated users. Prevents accidental `--num-users 1000000`. | +| `max_duration_seconds` | 86400 | 24 hours maximum. Prevents runaway benchmarks that run forever. | +| `max_gpu_memory_gb` | 1024 | 1 TB. Covers even the largest GPU clusters (8× H100 80GB = 640GB). | +| `max_cpu_memory_gb` | 16384 | 16 TB. Covers high-memory server configurations. | + +--- + +## Quick Start + +Run a basic storage test with 50 users for 2 minutes: + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 50 \ + --duration 120 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results.json +``` + +This forces all cache operations to hit your NVMe drive, giving you a baseline measurement of storage performance. 
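+
+After the run, the JSON report can be inspected directly. The snippet below is an illustrative sketch, not part of the benchmark: it assumes only that the report is standard JSON and that the headline metrics use recognizable key names such as `storage_throughput_tokens_per_sec`. Because the exact nesting of `results.json` may differ, it simply walks the document and prints anything that looks like a headline metric.
+
+```python
+#!/usr/bin/env python3
+"""Illustrative helper (not shipped with the benchmark): print headline metrics
+from a results JSON regardless of how the report nests them."""
+import json
+import sys
+
+# Substrings that identify the metrics of interest; extend as needed.
+KEYWORDS = ("storage_throughput", "tokens_per_sec", "hit_rate", "p95", "p99")
+
+def walk(node, prefix=""):
+    """Yield (dotted_key, value) pairs from arbitrarily nested dicts/lists."""
+    if isinstance(node, dict):
+        for key, value in node.items():
+            yield from walk(value, f"{prefix}{key}.")
+    elif isinstance(node, list):
+        for idx, value in enumerate(node):
+            yield from walk(value, f"{prefix}{idx}.")
+    else:
+        yield prefix.rstrip("."), node
+
+if __name__ == "__main__":
+    path = sys.argv[1] if len(sys.argv) > 1 else "results.json"
+    with open(path) as fh:
+        report = json.load(fh)
+    for key, value in walk(report):
+        if any(word in key.lower() for word in KEYWORDS):
+            print(f"{key}: {value}")
+```
+
+The same approach also surfaces the per-tier `tier_*_kv_bytes_*` and bandwidth statistics if they are included in the report.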
+ +--- + +## Running the Benchmark + +### CLI-Only Arguments + +These arguments **must** be passed via command line (not configurable in config.yaml): + +| Argument | Type | Default | Required | Description | +|----------|------|---------|----------|-------------| +| `--config` | str | None | No | Path to YAML configuration file | +| `--log-level` | str | INFO | No | Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL | +| `--model` | str | llama3.1-8b | Yes | Model config: tiny-1b, mistral-7b, llama2-7b, llama3.1-8b, llama3.1-70b-instruct | +| `--num-users` | int | 100 | Yes | Number of concurrent users to simulate | +| `--duration` | int | 60 | Yes | Benchmark duration in seconds | +| `--gpu-mem-gb` | float | 16 | Yes | GPU VRAM budget in GB (0 to disable) | +| `--cpu-mem-gb` | float | 32 | Yes | CPU RAM budget in GB | +| `--cache-dir` | str | temp | No | Directory for NVMe cache files | +| `--generation-mode` | str | realistic | No | Token generation: none, fast, realistic | +| `--performance-profile` | str | latency | No | Pass/fail criteria: latency, throughput | +| `--disable-multi-turn` | flag | False | No | Disable multi-turn conversation caching | +| `--disable-prefix-caching` | flag | False | No | Disable prefix caching | +| `--enable-rag` | flag | False | No | Enable RAG workload simulation | +| `--rag-num-docs` | int | 10 | No | Number of RAG documents to ingest | +| `--enable-autoscaling` | flag | False | No | Enable workload autoscaling | +| `--autoscaler-mode` | str | qos | No | Autoscaling strategy: qos, capacity | +| `--target-saturation` | float | 0.8 | No | Target storage saturation (0.0-1.0) | +| `--use-burst-trace` | flag | False | No | Use BurstGPT trace for workload | +| `--burst-trace-path` | str | BurstGPT/... | No | Path to BurstGPT trace file | +| `--validation-trace` | str | None | No | Path to validation trace file | +| `--dataset-path` | str | None | No | Path to ShareGPT dataset JSON | +| `--max-conversations` | int | 500 | No | Max conversations from dataset | +| `--output` | str | auto | No | Output JSON file path | +| `--seed` | int | None | **MLPerf** | Random seed for reproducibility | +| `--max-concurrent-allocs` | int | 0 | No | Limit concurrent allocations (0=unlimited) | +| `--request-rate` | float | 0 | No | Target request rate (req/sec, 0=unlimited) | +| `--max-requests` | int | 0 | No | Stop after N requests (0=use duration) | +| `--xlsx-output` | str | None | No | Excel/CSV output file path | + +### Test Scenarios + +#### Scenario 1: Storage-Only Baseline + +Isolate your NVMe drive by setting GPU memory to zero. This tells you the raw performance of your storage. + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 50 \ + --duration 180 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_storage_only.json +``` + +#### Scenario 2: Realistic Production Setup + +Test a balanced three-tier configuration that mirrors production deployment. + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 100 \ + --duration 300 \ + --gpu-mem-gb 16 \ + --cpu-mem-gb 32 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_production.json +``` + +#### Scenario 3: Find Maximum User Count (QoS Mode) + +Let the autoscaler discover how many users your system can handle while maintaining acceptable latency. 
+ +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 20 \ + --duration 300 \ + --gpu-mem-gb 16 \ + --cpu-mem-gb 32 \ + --enable-autoscaling \ + --autoscaler-mode qos \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_autoscale_qos.json +``` + +#### Scenario 4: Find Peak Storage Throughput (Capacity Mode) + +Discover the absolute maximum I/O your storage can deliver by ignoring latency constraints. + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-70b-instruct \ + --num-users 10 \ + --duration 180 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --enable-autoscaling \ + --autoscaler-mode capacity \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_capacity.json +``` + +#### Scenario 5: Low Cache Hit Rate (Maximum Storage Stress) + +Force cache misses to maximize NVMe I/O pressure. This is useful for stress testing storage subsystems and measuring worst-case performance. + +**Key flags to lower cache hit rate:** +- `--disable-multi-turn`: Each request is independent (no conversation context reuse) +- `--disable-prefix-caching`: No system prompt caching (every request generates fresh KV cache) +- `--cpu-mem-gb 0`: No CPU tier buffer (all evictions go directly to NVMe) +- High user count with synthetic workload: More unique cache entries + +```bash +# Minimal caching - forces nearly all operations to hit NVMe +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 200 \ + --duration 180 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 0 \ + --disable-multi-turn \ + --disable-prefix-caching \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_low_hit_rate.json +``` + +**Expected results:** Cache hit rate drops to 10-30% (vs 50-70% with defaults, or 85-97% with ShareGPT). + +For even more aggressive stress testing with the 70B model (2.5× larger KV cache per token): + +```bash +# Maximum NVMe stress - 70B model with no caching +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-70b-instruct \ + --num-users 50 \ + --duration 180 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 0 \ + --disable-multi-turn \ + --disable-prefix-caching \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_70b_low_hit_rate.json +``` + +| Configuration | Typical Cache Hit Rate | Use Case | +|---------------|------------------------|----------| +| ShareGPT + defaults | 85-97% | Realistic production simulation | +| Synthetic + defaults | 50-70% | Balanced stress testing | +| `--disable-multi-turn` only | 30-50% | Moderate stress | +| `--disable-multi-turn --disable-prefix-caching` | 10-30% | Maximum NVMe stress | +| Above + `--cpu-mem-gb 0` | 5-15% | Worst-case storage scenario | + +--- + +## ShareGPT Replay Workloads + +While synthetic workloads are excellent for controlled stress testing, they may not capture the nuances of real human-AI interaction. The **ShareGPT Replay** feature addresses this by loading actual conversation data. + +### Why Use ShareGPT? 
+ +Real conversations exhibit different patterns than synthetic workloads: +- **Higher cache locality**: Users ask follow-up questions, reusing context +- **Variable context sizes**: Real queries vary wildly (10-16,000 tokens) +- **Multi-turn structure**: Conversation flows are preserved + +### Downloading the ShareGPT Dataset + +Download the full dataset from Hugging Face (~1.2 GB): + +```bash +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +``` + +**Alternative: Smaller subset for quick testing (~40 MB):** + +```bash +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json +``` + +### Basic ShareGPT Invocation + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ + --max-conversations 500 \ + --num-users 50 \ + --duration 300 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_sharegpt.json +``` + +### ShareGPT with Rate Limiting + +Control the request arrival rate for steady-state testing: + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-70b-instruct \ + --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ + --max-conversations 1000 \ + --request-rate 10.0 \ + --num-users 100 \ + --duration 600 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 8 \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_sharegpt_rate_limited.json +``` + +### ShareGPT with Fixed Request Count + +Run exactly N requests for reproducible benchmarks: + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ + --max-requests 5000 \ + --num-users 50 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_sharegpt_fixed.json +``` + +### Comparing Real vs Synthetic Workloads + +| Metric | ShareGPT (Real) | Synthetic (Random) | +| :--- | :--- | :--- | +| Mean Context Size | ~133 tokens | ~2,676 tokens | +| Cache Hit Rate | 85-97% | 50-70% | +| Multi-turn Locality | High | Medium | +| Throughput | Higher | Lower | +| NVMe Stress | Moderate | Extreme | + +**Use ShareGPT** when you want to model real chatbot/assistant usage. +**Use Synthetic** when you want worst-case stress testing or controlled experiments. + +--- + +## Using the Wrapper Script + +The `kv-cache-wrapper.sh` script automates a complete benchmark suite. It detects your hardware, calculates appropriate parameters, and runs multiple test scenarios. + +### Basic Usage + +```bash +./kv-cache-wrapper.sh +``` + +This runs all test scenarios with default settings. Expect roughly 30 minutes for the full suite. 
+ +### Options + +``` +./kv-cache-wrapper.sh [options] + + -m MODEL Model to benchmark (default: llama3.1-8b) + -t SECONDS Duration for tier comparison tests (default: 120) + -s SECONDS Duration for storage saturation test (default: 180) + -r SECONDS Duration for production test (default: 180) + -a SECONDS Duration for autoscaling tests (default: 300) + -w LIST Comma-separated list of workloads to run + -u USERS Override baseline user count + -U USERS Override high-load user count + -R Enable RAG workload + -D DOCS Number of RAG documents (default: 10) + -h Show help +``` + +### Available Workloads + +```bash +# Run only the storage isolation test +./kv-cache-wrapper.sh -w storage-only + +# Run production and autoscaling tests +./kv-cache-wrapper.sh -w production,autoscale + +# Run MLPerf submission tests +./kv-cache-wrapper.sh -w mlperf_submission +``` + +--- + +## Understanding Results + +### Key Metrics + +**Throughput (tokens/sec)**: How many tokens the system processes per second. Higher is better. + +**Storage Throughput (tokens/sec)**: Raw I/O performance calculated from storage latency, not wall-clock time. This is the fairer metric for comparing storage tiers. + +**End-to-End Latency**: Total time from request submission to completion. This is what users experience. + +**Storage I/O Latency**: Time spent reading from and writing to storage tiers. This measures your hardware. + +**Queue Wait Time**: Time requests spend waiting before processing begins. If this dominates, your system is overloaded. + +**Cache Hit Rate**: Percentage of reads served from cache. Higher rates mean less storage pressure. + +### Reading the Output + +``` +### STORAGE PERFORMANCE ASSESSMENT: PASS ### + Criteria Passed: 4/4 + [PASS] NVMe Write P95 < 500ms: 45.20ms + [PASS] NVMe Read P95 < 200ms: 123.45ms + [PASS] CPU RAM P95 < 150ms: 12.30ms + [PASS] Cache Hit Rate > 30%: 67.5% + +### OVERALL PERFORMANCE ### + Total Requests: 2847 + Total Tokens Generated: 489,231 + Avg Throughput: 1,630.77 tok/s + Storage Throughput: 2,105.32 tok/s + +### LATENCY BREAKDOWN ### + End-to-End: mean 89.3ms, P50 45.2ms, P95 312.4ms + Storage I/O: mean 23.1ms, P50 12.4ms, P95 89.2ms +``` + +--- + +## Understanding Excel Performance Metrics + +The `--xlsx-output` option exports detailed performance metrics to Excel for analysis. This section provides a comprehensive reference for every metric in the export. 
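+
+Because each export is a single row of metrics, runs from different devices or configurations are easy to line up with pandas. A minimal sketch (the file names are illustrative; `pandas` plus `openpyxl` are assumed for `.xlsx`, or switch to `pd.read_csv` for the CSV fallback):
+
+```python
+import pandas as pd
+
+# Each exported file holds one row of metrics for one run
+files = ["results_fast.xlsx", "results_slow.xlsx"]
+runs = pd.concat([pd.read_excel(f) for f in files], ignore_index=True)
+
+# Compare a few headline columns side by side (skip any that are missing)
+cols = ["Model", "Num Users", "Avg Throughput (tok/s)",
+        "Storage Throughput (tok/s)", "Cache Hit Rate"]
+print(runs[[c for c in cols if c in runs.columns]])
+```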
+ +### Run Parameters (Configuration) + +These columns record the benchmark configuration used for the run: + +| Column | Description | +|--------|-------------| +| **Timestamp** | When the benchmark was executed (YYYY-MM-DD HH:MM:SS) | +| **Model** | Model configuration key (e.g., `llama3.1-8b`, `llama3.1-70b-instruct`) | +| **Num Users** | Number of concurrent simulated users | +| **Duration (s)** | Benchmark duration in seconds | +| **GPU Memory (GB)** | GPU VRAM budget allocated | +| **CPU Memory (GB)** | CPU RAM budget allocated | +| **Generation Mode** | Token generation simulation: `none`, `fast`, or `realistic` | +| **Performance Profile** | Pass/fail criteria: `latency` or `throughput` | +| **Multi-turn** | Whether multi-turn conversation caching was enabled | +| **Prefix Caching** | Whether system prompt prefix caching was enabled | +| **RAG Enabled** | Whether RAG workload simulation was enabled | +| **Autoscaling** | Whether workload autoscaling was enabled | +| **Seed** | Random seed for reproducibility | +| **Max Concurrent Allocs** | Limit on parallel cache allocations (0 = unlimited) | +| **Request Rate** | Target request rate in req/sec (0 = unlimited) | +| **Max Requests** | Stop after N requests (0 = use duration) | +| **Dataset Path** | Path to ShareGPT dataset if used | +| **Cache Dir** | Directory used for NVMe cache files | + +--- + +### Throughput Metrics + +| Metric | Unit | What It Measures | Interpretation | +|--------|------|------------------|----------------| +| **Total Requests** | count | Total inference requests completed | Higher = more work done. Compare across runs with same duration. | +| **Total Tokens** | count | Total tokens generated across all requests | Primary workload volume indicator. | +| **Elapsed Time (s)** | seconds | Actual wall-clock benchmark duration | May differ slightly from configured duration. | +| **Avg Throughput (tok/s)** | tokens/sec | `Total Tokens / Elapsed Time` | **Wall-clock throughput.** Includes all overheads (queue wait, generation simulation). **Primary metric when `gpu_mem=0` and `cpu_mem=0`.** | +| **Storage Throughput (tok/s)** | tokens/sec | `Total Tokens / Total Storage I/O Time` | **Pure storage throughput.** Excludes generation simulation time. Useful when `cpu_mem > 0` to isolate storage I/O. | +| **Requests/sec** | req/sec | `Total Requests / Elapsed Time` | Request processing rate. Higher = system handling more concurrent users efficiently. | + +> **Which throughput metric to use?** +> - **When `gpu_mem=0` and `cpu_mem=0`**: Use **Avg Throughput (tok/s)** — all I/O hits the storage tier, so wall-clock throughput directly reflects storage performance. +> - **When `cpu_mem > 0`**: Use **Storage Throughput (tok/s)** to isolate storage I/O from CPU cache hits. +> - **For MLPerf submissions**: Use **Tier Storage Read/Write Bandwidth (GB/s)** as the primary comparison metric (see below). + +--- + +### End-to-End Latency Metrics + +End-to-end (E2E) latency measures the total time from request submission to completion, including queue wait, cache operations, and simulated generation time. **This is what users experience.** + +| Metric | What It Measures | +|--------|------------------| +| **E2E Latency Mean (ms)** | Average latency across all requests. Sensitive to outliers. | +| **E2E Latency P50 (ms)** | Median latency. 50% of requests complete within this time. | +| **E2E Latency P95 (ms)** | 95th percentile. 95% of requests complete within this time. 
**Standard SLA metric.** | +| **E2E Latency P99 (ms)** | 99th percentile. 99% of requests complete within this time. **Tail latency indicator.** | +| **E2E Latency P99.9 (ms)** | 99.9th percentile (3 nines). Captures rare slow requests. | +| **E2E Latency P99.99 (ms)** | 99.99th percentile (4 nines). Extreme tail latency for SLA compliance. | + +> **Interpreting percentiles:** +> - **P50** tells you the typical user experience. +> - **P95** is the standard for SLA definitions ("95% of requests under X ms"). +> - **P99–P99.99** reveal tail latency issues that affect a small but real fraction of users. +> - Large gaps between P95 and P99 indicate inconsistent performance (investigate queue buildup or storage saturation). + +--- + +### Storage I/O Latency Metrics + +Storage latency measures only the time spent on cache read/write operations, excluding queue wait and generation simulation. **This isolates storage subsystem performance.** + +| Metric | What It Measures | +|--------|------------------| +| **Storage Latency Mean (ms)** | Average storage I/O time across all operations. | +| **Storage Latency P50 (ms)** | Median storage I/O time. | +| **Storage Latency P95 (ms)** | 95th percentile storage I/O time. **Key metric for storage evaluation.** | +| **Storage Latency P99 (ms)** | 99th percentile storage I/O time. | +| **Storage Latency P99.9 (ms)** | 99.9th percentile storage I/O time. | +| **Storage Latency P99.99 (ms)** | 99.99th percentile storage I/O time. | + +--- + +### Generation Latency Metrics + +Generation latency measures the simulated GPU token generation time. Only meaningful when `--generation-mode` is `fast` or `realistic`. + +| Metric | What It Measures | +|--------|------------------| +| **Gen Latency Mean (ms)** | Average simulated generation time per request. | +| **Gen Latency P50 (ms)** | Median generation time. | +| **Gen Latency P95 (ms)** | 95th percentile generation time. | +| **Gen Latency P99 (ms)** | 99th percentile generation time. | + +> **Note:** With `--generation-mode none`, these values are all 0 (pure storage benchmark). + +--- + +### Storage Tier Latency Breakdown (PRIMARY METRICS) + +These metrics provide granular visibility into storage tier operations. The "storage" tier is device-agnostic—it could be NVMe, SATA SSD, CXL memory, or any block storage device. Each operation is decomposed into: + +- **Total**: Complete operation time (Host + Device) +- **Device**: Actual storage I/O time (`np.save`/`np.load` with fsync) — **PRIMARY LATENCY METRIC** +- **Host**: CPU serialization/deserialization time + +> **⭐ PRIMARY METRICS for MLPerf Storage Comparison:** +> - **Storage Tier Read Device P95 (ms)** — Raw storage read latency +> - **Storage Tier Write Device P95 (ms)** — Raw storage write latency +> - **Tier Storage Read Bandwidth (GB/s)** — Storage read throughput +> - **Tier Storage Write Bandwidth (GB/s)** — Storage write throughput +> +> **What Device Latency Measures:** +> ``` +> Device Latency = [ OS/FS Queue ] + [ Block Layer ] + [ Driver ] + [ Physical I/O ] +> ``` +> The **Storage Tier Read Device P95 (ms)** is the 95th percentile latency of reading one `.npy` file containing the KV cache data for a single cache entry (one request's token sequence). This captures tail latency—95% of reads complete faster than this value, so it reveals worst-case storage behavior under load. 
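+
+To make the Device component concrete, the sketch below times a single `.npy` write in the same general way described above (`np.save` followed by `fsync`). It only illustrates what the metric captures — it is not the benchmark's actual instrumentation, and the array shape and file path are placeholders. Note that reading back a file you just wrote is usually served from the OS page cache, so a realistic device-read measurement needs a cold file.
+
+```python
+import os
+import time
+import numpy as np
+
+def device_write_ms(path: str, arr: np.ndarray) -> float:
+    """Device portion of a KV cache write: np.save plus fsync, in milliseconds."""
+    t0 = time.perf_counter()
+    with open(path, "wb") as f:
+        np.save(f, arr)
+        f.flush()
+        os.fsync(f.fileno())  # make sure the bytes reach the storage device
+    return (time.perf_counter() - t0) * 1e3
+
+# Placeholder KV cache entry: shape and dtype are illustrative only
+entry = np.zeros((2048, 128), dtype=np.float16)
+print(f"write device latency: {device_write_ms('/mnt/nvme/kv_probe.npy', entry):.2f} ms")
+```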
+ +#### Read Operations (Decode Phase) + +| Metric | Component | What It Measures | +|--------|-----------|------------------| +| **Storage Tier Read Total P50–P99.99 (ms)** | Total | Complete read time including deserialization | +| **Storage Tier Read Device P50–P99.99 (ms)** | Device | **⭐ Raw storage read time (`np.load`) — PRIMARY** | +| **Storage Tier Read Host P50–P99.99 (ms)** | Host | NumPy array deserialization CPU time | + +#### Write Operations (Prefill Phase) + +| Metric | Component | What It Measures | +|--------|-----------|------------------| +| **Storage Tier Write Total P50–P99.99 (ms)** | Total | Complete write time including serialization | +| **Storage Tier Write Device P50–P99.99 (ms)** | Device | **⭐ Raw storage write time (`np.save` + fsync) — PRIMARY** | +| **Storage Tier Write Host P50–P99.99 (ms)** | Host | NumPy array serialization CPU time | + +> **Diagnosing storage bottlenecks:** +> - If **Device >> Host**: Your storage device is the bottleneck. Consider faster storage (NVMe Gen5, CXL). +> - If **Host >> Device**: CPU serialization is the bottleneck. Consider faster CPU or memory bandwidth. +> - Typical ratio: Device should be 60-80% of Total for well-balanced systems. + +--- + +### Cache Statistics + +| Metric | Unit | What It Measures | Good Values | +|--------|------|------------------|-------------| +| **Cache Hit Rate** | ratio (0–1) | Fraction of reads served from cache vs. storage | Higher is better. 0.7+ with multi-turn enabled. | +| **Read/Write Ratio** | ratio | Total reads / Total writes | Higher indicates read-heavy workload (typical for decode phase). | +| **Total Read (GB)** | GB | Total data read from all tiers | Workload volume indicator. | +| **Total Write (GB)** | GB | Total data written to all tiers | Workload volume indicator. | + +--- + +### Per-Tier I/O Volume + +These metrics show data movement through each tier of the cache hierarchy: + +| Metric | What It Measures | +|--------|------------------| +| **Tier GPU KV Bytes Written (GB)** | Data written to GPU VRAM tier | +| **Tier GPU KV Bytes Read (GB)** | Data read from GPU VRAM tier | +| **Tier CPU KV Bytes Written (GB)** | Data written to CPU RAM tier | +| **Tier CPU KV Bytes Read (GB)** | Data read from CPU RAM tier | +| **Tier Storage KV Bytes Written (GB)** | Data written to storage tier (NVMe, SATA, CXL, etc.) | +| **Tier Storage KV Bytes Read (GB)** | Data read from storage tier (NVMe, SATA, CXL, etc.) | + +> **Analyzing tier distribution:** +> - High GPU/CPU reads with low storage reads = hot data fits in fast tiers (good!) +> - High storage reads = working set exceeds fast tier capacity (consider adding memory) +> - **Tier Storage KV Bytes Read** is a key MLPerf differentiation metric (100% win rate in discovery testing) + +--- + +### Per-Tier Bandwidth (PRIMARY METRICS) + +These metrics measure the actual throughput achieved on each tier. 
**Tier Storage Bandwidth is the primary metric for comparing storage devices.** + +| Metric | Unit | What It Measures | +|--------|------|------------------| +| **Tier GPU Read Bandwidth (GB/s)** | GB/s | GPU VRAM read throughput | +| **Tier GPU Write Bandwidth (GB/s)** | GB/s | GPU VRAM write throughput | +| **Tier CPU Read Bandwidth (GB/s)** | GB/s | CPU RAM read throughput | +| **Tier CPU Write Bandwidth (GB/s)** | GB/s | CPU RAM write throughput | +| **Tier Storage Read Bandwidth (GB/s)** | GB/s | **⭐ Storage tier read throughput — PRIMARY** | +| **Tier Storage Write Bandwidth (GB/s)** | GB/s | **⭐ Storage tier write throughput — PRIMARY** | + +> **Expected bandwidth ranges:** +> - **GPU**: 500–2000 GB/s (HBM2e/HBM3) +> - **CPU**: 50–200 GB/s (DDR4/DDR5) +> - **Storage (NVMe Gen4)**: 3–7 GB/s +> - **Storage (NVMe Gen5)**: 10–14 GB/s +> - **Storage (SATA SSD)**: 0.4–0.6 GB/s +> - **Storage (CXL Memory)**: 30–50 GB/s + +--- + +### Tier Entry Distribution + +| Metric | What It Measures | +|--------|------------------| +| **GPU Entries** | Number of KV cache entries currently in GPU VRAM | +| **CPU Entries** | Number of KV cache entries currently in CPU RAM | +| **Storage Entries** | Number of KV cache entries currently on storage tier | + +> **Interpreting entry counts:** +> - Most entries should be in the fastest available tier for optimal performance. +> - High **Storage Entries** with low **GPU/CPU Entries** indicates memory pressure. +> - When `gpu_mem=0` and `cpu_mem=0`, all entries will be in **Storage Entries**. + +--- + +### Multi-turn Statistics + +| Metric | What It Measures | +|--------|------------------| +| **Multi-turn Hit Rate** | Fraction of requests that reused context from previous conversation turns | + +> **Interpreting Multi-turn Hit Rate:** +> - **High (0.6+)**: Effective conversation context caching. Most requests are follow-ups that reuse existing KV cache entries, reducing redundant computation. Typical for chatbot/assistant workloads. +> - **Low (<0.3)**: Indicates one or more of the following: +> - `--disable-multi-turn` is enabled (expected: 0.0) +> - Workload has high conversation turnover (users start new conversations frequently) +> - Single-shot API usage pattern (each request is independent) +> - Memory pressure causing cache eviction before context reuse +> - Short benchmark duration (not enough time for multi-turn patterns to emerge) +> +> **Note:** A low multi-turn hit rate is **not inherently bad**—it depends on your use case. For storage stress testing, low hit rates force more I/O which is often the goal. + +--- + +### Using Excel Metrics for Analysis + +**⭐ Primary Metrics for MLPerf Storage Comparison:** + +| Metric | When to Use | Why | +|--------|-------------|-----| +| **Tier Storage Read Bandwidth (GB/s)** | Always | Direct measure of storage read throughput | +| **Tier Storage Write Bandwidth (GB/s)** | Always | Direct measure of storage write throughput | +| **Storage Tier Read Device P95 (ms)** | Always | Raw storage read latency (excludes CPU overhead) | +| **Storage Tier Write Device P95 (ms)** | Always | Raw storage write latency (excludes CPU overhead) | +| **Avg Throughput (tok/s)** | When `gpu_mem=0, cpu_mem=0` | Wall-clock throughput equals storage throughput | + +**Comparing storage devices:** +1. Run identical benchmarks on each device with `--gpu-mem-gb 0 --cpu-mem-gb 0` +2. Compare **primary metrics**: Tier Storage Read/Write Bandwidth, Storage Tier Device P95 latencies +3. 
Use **Avg Throughput (tok/s)** as the overall performance score + +**Diagnosing performance issues:** +1. Check **Storage Tier Device P95** vs **Storage Tier Host P95** +2. If Device >> Host: Storage device is the bottleneck +3. If Host >> Device: CPU serialization is the bottleneck + +**Validating cache configuration:** +1. Check **Cache Hit Rate** and **Multi-turn Hit Rate** +2. Low hit rates with enabled caching: Working set too large for memory budget +3. Compare **Tier Storage KV Bytes Read** across configurations + +--- + +## Unit Testing + +This package includes a comprehensive pytest-based test suite to verify core functionality without running the full benchmark. + +### Running Tests + +```bash +# Run all tests with verbose output +pytest test_kv_cache.py -v + +# Run with shorter traceback +pytest test_kv_cache.py -v --tb=short + +# Run specific test class +pytest test_kv_cache.py -k "TestModelConfig" -v + +# Run only CPU tests (skip GPU tests if no CUDA) +pytest test_kv_cache.py -v -m "not skipif" +``` + +### Test Coverage + +The test suite covers 23 component categories with ~170+ individual tests: + +| Test Class | Tests | Coverage | +|------------|-------|----------| +| `TestConfigLoader` | 5 | YAML loading, strict schema validation, error on unknown keys, nested key access | +| `TestCfgHelper` | 4 | Global `cfg()` helper, defaults when config not loaded, list value extraction | +| `TestModelConfig` | 4 | Model configurations, KV cache size per token calculations, dtype handling | +| `TestInferenceRequest` | 5 | Request dataclass, automatic cache key generation, phase handling, QoS assignment | +| `TestQoSProfiles` | 5 | QoS levels (interactive/responsive/batch), SLA targets, priority ordering, p999/p9999 extended metrics | +| `TestKVCacheGenerator` | 4 | Reproducible generation with seeds, correct tensor shapes, dtype consistency, precomputed buffers | +| `TestCPUMemoryBackend` | 4 | Write/read/delete/clear operations, timing metadata, data integrity | +| `TestNVMeBackend` | 5 | File I/O operations, .npy format handling, metadata persistence, temp directory cleanup | +| `TestGPUMemoryBackend` | 4 | CUDA tensor placement, device memory management (skipped without GPU) | +| `TestConversationManager` | 4 | Multi-turn conversation tracking, cache key management, LRU eviction | +| `TestUserSimulator` | 3 | User profile generation from templates, QoS distribution validation | +| `TestMultiTierCache` | 5 | CPU-only allocation paths, cache access patterns, tier selection logic | +| `TestMultiTierCacheWithGPU` | 4 | GPU tier allocation, waterfall eviction GPU→CPU→NVMe (skipped without GPU) | +| `TestXLSXExport` | 4 | CSV fallback, Excel export, run parameters embedding (skipped without pandas) | +| `TestEnums` | 3 | InferencePhase, GenerationMode, QoSLevel enum values | +| `TestTierLogic` | 3 | Tier ordering (GPU→CPU→NVMe), usage tracking, limit validation | +| `TestConfigDrivenConversationManager` | 2 | ConversationManager respects config.yaml settings | +| `TestConfigDrivenUserSimulator` | 3 | UserSimulator reads user_templates from config | +| `TestStatsNamingConvention` | 2 | `storage_*` naming convention validation for metrics keys | +| `TestGPUMemoryBackendEvictionCallback` | 2 | GPU eviction callback invocation and data passing (skipped without GPU) | +| `TestValidateArgs` | 24 | CLI argument validation: positive integers, ranges, memory limits, cache directory safety, forbidden prefixes | +| `TestPerTierPhaseMetrics` | 7 | Per-tier (GPU/CPU/Storage) KV bytes read/written 
tracking during prefill/decode phases | +| `TestPerTierPhaseMetricsWithGPU` | 4 | GPU tier metrics tracking, phase-aware read/write separation (skipped without GPU) | + +### Expected Runtime + +- **Without GPU**: ~5-10 seconds +- **With GPU**: ~10-15 seconds + +GPU tests are automatically skipped if CUDA is not available. + +--- + +## Excel Export + +The benchmark can export results directly to Excel or CSV format for analysis. + +### Basic Usage + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 50 \ + --duration 120 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --seed 42 \ + --output results.json \ + --xlsx-output results.xlsx +``` + +### Output Format + +The Excel file contains a single row with all key metrics: + +| Column | Description | +|--------|-------------| +| Model | Model configuration used | +| Num Users | Concurrent user count | +| Duration (s) | Benchmark duration | +| GPU Mem (GB) | GPU memory budget | +| CPU Mem (GB) | CPU memory budget | +| Total Requests | Requests completed | +| Total Tokens | Tokens processed | +| Avg Throughput (tok/s) | Wall-clock throughput | +| Storage Throughput (tok/s) | Storage I/O throughput | +| Cache Hit Rate | Percentage of cache hits | +| E2E Latency P95 (ms) | End-to-end 95th percentile | +| Storage IO P95 (ms) | Storage I/O 95th percentile | + +### Fallback Behavior + +- **With openpyxl**: Exports to `.xlsx` format +- **Without openpyxl**: Falls back to `.csv` format +- **Without pandas**: Export is skipped with a warning + +--- + +## MLPerf Submission Guidelines + +For official MLPerf v3.0 storage submissions, use these standardized commands. **These invocations have been validated through extensive discovery testing** (1,411 Fast system tests, 268 Slow system tests comparing 14,000 MB/s vs 3,000 MB/s storage). + +### Discovery Test Key Findings + +| Finding | Impact | +|---------|--------| +| **Metric selection depends on cpu_mem** | Storage Throughput shows only 1.1x at cpu_mem=0GB but 2.2x at cpu_mem=4GB | +| **Best models for differentiation** | llama3.1-8b and mistral-7b show 2.31x ratio | +| **High variance observed** | CV 50-125%, requires 3-5 trials minimum | +| **100% win rate metrics** | Decode Bytes Read and Wall-Clock Throughput at cpu_mem=0GB | + +### Option 1: Maximum Storage Stress (cpu_mem=0GB) + +Use when you want to stress test NVMe and measure I/O volume differentiation. + +**Primary Metrics:** Decode Bytes Read (2.62x differentiation), Wall-Clock Throughput (2.43x differentiation) + +```bash +# MLPerf v3.0: Maximum Storage Stress Test (8B Model) +# Run 3-5 trials for statistical significance +for trial in 1 2 3 4 5; do + python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 200 \ + --duration 300 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 0 \ + --max-concurrent-allocs 16 \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output mlperf_v3_stress_8b_trial${trial}.json +done +``` + +**⚠️ Important:** At cpu_mem=0GB, do NOT use Storage Throughput as your primary metric—use Decode Bytes Read or Wall-Clock Throughput instead. + +### Option 2: Storage Throughput Focus (cpu_mem=4GB) + +Use when you want Storage Throughput (tok/s) as your primary metric. 
+ +**Primary Metric:** Storage Throughput (2.2x differentiation, 97% win rate) + +```bash +# MLPerf v3.0: Storage Throughput Test (8B Model) +for trial in 1 2 3 4 5; do + python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 100 \ + --duration 300 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --max-concurrent-allocs 0 \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output mlperf_v3_throughput_8b_trial${trial}.json +done +``` + +### Option 3: Large Model Submission (70B) + +For maximum per-request storage stress (2.5× larger KV cache per token: 320 KB vs 128 KB): + +```bash +# MLPerf v3.0: Large Model Storage Stress +for trial in 1 2 3; do + python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-70b-instruct \ + --num-users 70 \ + --duration 300 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 0 \ + --max-concurrent-allocs 4 \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output mlperf_v3_stress_70b_trial${trial}.json +done +``` + +### Critical Parameters (Discovery-Validated) + +| Parameter | Value | Rationale | +|-----------|-------|-----------| +| **--config config.yaml** | Required | Ensures consistent internal settings | +| **--seed 42** | Required | Reproducibility across systems | +| **--gpu-mem-gb 0** | Required | Isolates storage performance | +| **--cpu-mem-gb** | 0 or 4 | 0GB for max stress (use I/O volume metrics), 4GB for Storage Throughput metric | +| **--max-concurrent-allocs** | 0, 4, or 16 | 0 for throughput, 16 for stress testing | +| **--generation-mode** | none or realistic | none for pure I/O, realistic for production simulation | +| **--num-users** | 100-200 | Differentiation stable across range; higher = more throughput | +| **--duration** | 300-600 | 5-10 minutes for stable metrics | + +### Trial Requirements + +| User Count | Variance (CV) | Minimum Trials | +|------------|---------------|----------------| +| 10 users | ~52% | 3 | +| 50-100 users | ~115-125% | 3-5 | +| 200 users | ~110-120% | 3-5 | + +Report **median** rather than mean for publication-quality results. + +--- + +## Troubleshooting + +### Out of Memory Errors + +Reduce the number of concurrent users or limit parallel allocations: + +```bash +python3 kv-cache.py --config config.yaml ... --max-concurrent-allocs 50 +``` + +### Benchmark Hangs + +The system may be thrashing. Reduce users or increase memory budgets. + +### Poor Cache Hit Rates + +Low hit rates indicate your working set exceeds available fast memory. Either: +- Increase GPU/CPU memory budgets +- Reduce user count +- Accept that cold data will hit storage + +### Results Vary Between Runs + +Use the `--seed` flag for reproducible results. + +### Configuration Validation Errors + +If you see "Unknown configuration key" errors, check your `config.yaml` for typos. The benchmark uses strict schema validation to prevent silent misconfigurations. + +--- + +## Files in This Package + +- `kv-cache.py`: Main benchmark implementation with ShareGPT support +- `config.yaml`: YAML configuration file for internal parameters +- `test_kv_cache.py`: Pytest unit test suite +- `requirements.txt`: Python dependencies +- `README.md`: This documentation +- `MLperf v3 KV cache proposal.md`: Detailed technical documentation + +--- + +## License + +Apache License 2.0 + +--- + +## Contact + +For questions or feedback, open an issue on the repository or contact the MLPerf Storage Working Group. 
From 166f2b2312670cf23bc46789591069e538ea0793 Mon Sep 17 00:00:00 2001 From: Hazem Awadallah Date: Tue, 27 Jan 2026 15:44:50 -0800 Subject: [PATCH 05/16] test(results): add pytest HTML test report - Add kv-cache-test-report.html with full test execution results - All 170+ tests passing for v3.0 features - Create unit_test_results directory for test artifacts --- .../tests/unit_test_results/kv-cache-test-report.html | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kv_cache_benchmark/tests/unit_test_results/kv-cache-test-report.html b/kv_cache_benchmark/tests/unit_test_results/kv-cache-test-report.html index 1f4a7fa3..4dc72edf 100644 --- a/kv_cache_benchmark/tests/unit_test_results/kv-cache-test-report.html +++ b/kv_cache_benchmark/tests/unit_test_results/kv-cache-test-report.html @@ -328,7 +328,7 @@

kv-cache-test-report.html

-

Report generated on 12-Jan-2026 at 16:00:59 by pytest-html +

Report generated on 27-Jan-2026 at 11:38:56 by pytest-html v4.1.1

Environment

@@ -382,7 +382,7 @@

Environment

Summary

-

112 tests took 00:01:19.

+

172 tests took 00:01:18.

(Un)check the boxes to filter the results.