From 549c6a8a6b3f0a2d230e450eca01c04c445b5012 Mon Sep 17 00:00:00 2001 From: Curtis Anderson <99758333+FileSystemGuy@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:40:06 -0800 Subject: [PATCH 1/4] Remove unused imports and ShareGPT dataset loader Removed optional dependencies and ShareGPT dataset loader from kv-cache.py. --- kv_cache_benchmark/kv-cache.py | 504 ++------------------------------- 1 file changed, 16 insertions(+), 488 deletions(-) diff --git a/kv_cache_benchmark/kv-cache.py b/kv_cache_benchmark/kv-cache.py index 106418a5..65eb3576 100644 --- a/kv_cache_benchmark/kv-cache.py +++ b/kv_cache_benchmark/kv-cache.py @@ -62,27 +62,8 @@ except ImportError: CUPY_AVAILABLE = False -try: - import tiktoken - TIKTOKEN_AVAILABLE = True -except ImportError: - TIKTOKEN_AVAILABLE = False - -# Optional pandas/openpyxl for XLSX output -try: - import pandas as pd - PANDAS_AVAILABLE = True -except ImportError: - PANDAS_AVAILABLE = False - -try: - import openpyxl - OPENPYXL_AVAILABLE = True -except ImportError: - OPENPYXL_AVAILABLE = False - -# ============================================================================ +# ============================================================================ # CORE DATA MODELS # Defines the basic data structures used throughout the benchmark. # ============================================================================ @@ -1868,7 +1849,6 @@ def get_stats(self, duration: float) -> Dict: 'read_write_ratio': self.stats['total_read_bytes'] / max(self.stats['total_write_bytes'], 1), 'read_iops': self.stats['read_operations'], 'write_iops': self.stats['write_operations'], - 'nvme_tokens_processed': self.stats['nvme_tokens_processed'], } # Add latency percentiles for each tier. @@ -2379,169 +2359,10 @@ def generate_mixed_users(cls, num_users: int) -> List[UserProfile]: return users -# ============================================================================ -# SHAREGPT DATASET LOADER -# Loads ShareGPT conversation data for realistic workload generation. -# ============================================================================ - -class ShareGPTDatasetLoader: - """ - Loads ShareGPT conversation data and provides realistic request patterns. - ShareGPT format has conversations with 'from' (human/gpt) and 'value' (text content). - """ - - def __init__(self, dataset_path: str, max_conversations: int = 1000, seed: Optional[int] = None): - """ - Initialize the ShareGPT dataset loader. 
- - Args: - dataset_path: Path to the ShareGPT JSON file - max_conversations: Maximum number of conversations to load - seed: Random seed for reproducibility - """ - self.dataset_path = dataset_path - self.max_conversations = max_conversations - self.conversations = [] - self.token_stats = {} - - if seed: - random.seed(seed) - np.random.seed(seed) - - self._load_dataset() - - def _load_dataset(self): - """Load and process the ShareGPT dataset.""" - if not os.path.exists(self.dataset_path): - print(f"[ShareGPT] Warning: Dataset not found at {self.dataset_path}") - return - - try: - # Try to initialize tokenizer for accurate token counting - tokenizer = None - if TIKTOKEN_AVAILABLE: - try: - tokenizer = tiktoken.get_encoding("cl100k_base") # GPT-4 tokenizer - except Exception: - pass - - if tokenizer is None: - print("[ShareGPT] Tiktoken not available, using approximate token counting") - - with open(self.dataset_path, 'r', encoding='utf-8') as f: - data = json.load(f) - - # Process conversations - for conv_idx, conversation in enumerate(data[:self.max_conversations]): - if 'conversations' not in conversation: - continue - - conv_data = [] - turns = conversation['conversations'] - - for i in range(0, len(turns) - 1, 2): # Process pairs of human-gpt turns - if i + 1 >= len(turns): - break - - human_turn = turns[i] - gpt_turn = turns[i + 1] - - if human_turn.get('from') != 'human' or gpt_turn.get('from') != 'gpt': - continue - - # Calculate tokens - context_text = human_turn.get('value', '') - generation_text = gpt_turn.get('value', '') - - if tokenizer: - context_tokens = len(tokenizer.encode(context_text)) - generation_tokens = len(tokenizer.encode(generation_text)) - else: - # Approximate: 4 characters per token on average - context_tokens = max(1, len(context_text) // 4) - generation_tokens = max(1, len(generation_text) // 4) - - # Limit extreme values for stability - context_tokens = min(context_tokens, 16384) # Cap at 16K context - generation_tokens = min(generation_tokens, 2048) # Cap at 2K generation - - conv_data.append({ - 'context_tokens': context_tokens, - 'generation_tokens': generation_tokens, - 'turn_number': i // 2 + 1 - }) - - if conv_data: - self.conversations.append({ - 'id': conversation.get('id', f'conv_{conv_idx}'), - 'turns': conv_data - }) - - # Calculate statistics - if self.conversations: - all_context_tokens = [] - all_generation_tokens = [] - - for conv in self.conversations: - for turn in conv['turns']: - all_context_tokens.append(turn['context_tokens']) - all_generation_tokens.append(turn['generation_tokens']) - - self.token_stats = { - 'context_mean': np.mean(all_context_tokens), - 'context_std': np.std(all_context_tokens), - 'context_min': np.min(all_context_tokens), - 'context_max': np.max(all_context_tokens), - 'context_p50': np.percentile(all_context_tokens, 50), - 'context_p95': np.percentile(all_context_tokens, 95), - 'generation_mean': np.mean(all_generation_tokens), - 'generation_std': np.std(all_generation_tokens), - 'generation_min': np.min(all_generation_tokens), - 'generation_max': np.max(all_generation_tokens), - 'generation_p50': np.percentile(all_generation_tokens, 50), - 'generation_p95': np.percentile(all_generation_tokens, 95), - 'total_conversations': len(self.conversations), - 'total_turns': sum(len(c['turns']) for c in self.conversations) - } - - print(f"[ShareGPT] Loaded {len(self.conversations)} conversations with {self.token_stats['total_turns']} turns") - print(f"[ShareGPT] Context tokens: mean={self.token_stats['context_mean']:.1f}, 
p50={self.token_stats['context_p50']:.1f}, p95={self.token_stats['context_p95']:.1f}") - print(f"[ShareGPT] Generation tokens: mean={self.token_stats['generation_mean']:.1f}, p50={self.token_stats['generation_p50']:.1f}, p95={self.token_stats['generation_p95']:.1f}") - - except Exception as e: - print(f"[ShareGPT] Error loading dataset: {e}") - self.conversations = [] - - def get_random_conversation(self) -> Optional[Dict]: - """Get a random conversation from the dataset.""" - if not self.conversations: - return None - return random.choice(self.conversations) - - def get_random_turn(self) -> Optional[Tuple[int, int]]: - """Get random context and generation token counts from the dataset.""" - if not self.conversations: - return None - - conv = self.get_random_conversation() - if conv and conv['turns']: - turn = random.choice(conv['turns']) - return turn['context_tokens'], turn['generation_tokens'] - return None - - def iterate_conversations(self, shuffle: bool = True): - """Iterate through all conversations, optionally shuffled.""" - conversations = self.conversations.copy() - if shuffle: - random.shuffle(conversations) - for conv in conversations: - yield conv - - -# ============================================================================ +# ============================================================================ # INTEGRATED BENCHMARK ORCHESTRATOR # This class wires all the components together and runs the main benchmark loop. -# ============================================================================ +# ============================================================================ class IntegratedBenchmark: """The main orchestrator for the entire benchmark.""" @@ -2565,12 +2386,8 @@ def __init__(self, performance_profile: str = 'latency', use_burst_trace: bool = False, burst_trace_path: Optional[str] = None, - dataset_path: Optional[str] = None, - max_conversations: int = 500, seed: Optional[int] = None, - max_concurrent_allocs: int = 0, - request_rate: float = 0, - max_requests: int = 0): + max_concurrent_allocs: int = 0): self.model_config = model_config self.num_users = num_users @@ -2586,28 +2403,11 @@ def __init__(self, self.performance_profile = performance_profile self.use_burst_trace = use_burst_trace self.burst_trace_path = burst_trace_path - self.dataset_path = dataset_path - self.max_conversations = max_conversations self.seed = seed self.max_concurrent_allocs = max_concurrent_allocs - self.request_rate = request_rate - self.max_requests = max_requests self.burst_requests: List[Tuple[int, int]] = [] - self.sharegpt_loader: Optional[ShareGPTDatasetLoader] = None - - # Load dataset if provided (takes priority over burst trace) - if self.dataset_path: - self.sharegpt_loader = ShareGPTDatasetLoader( - dataset_path=self.dataset_path, - max_conversations=self.max_conversations, - seed=self.seed - ) - self.use_dataset = True - elif self.use_burst_trace: + if self.use_burst_trace: self._load_burst_trace() - self.use_dataset = False - else: - self.use_dataset = False # Initialize components self.cache = MultiTierCache( @@ -2651,7 +2451,6 @@ def __init__(self, 'seed': self.seed, } self.results_lock = threading.Lock() - self.stop_event: Optional[threading.Event] = None # Set during run() self.rag_ingest_done = threading.Event() if self.enable_rag else None def _ingest_rag_documents(self, num_docs: int, stop_event: Optional[threading.Event] = None): @@ -2741,80 +2540,10 @@ def _generate_requests_from_trace(self, stop_event: threading.Event): priority_tuple = 
(-QOS_PROFILES[request.qos_level].priority, time.time()) self.request_queue.put((priority_tuple, request)) - + request_index += 1 time.sleep(0.01) # Simulate request arrival rate - def _generate_requests_from_dataset(self, stop_event: threading.Event): - """Generates InferenceRequest objects from the loaded ShareGPT dataset.""" - if not self.sharegpt_loader or not self.sharegpt_loader.conversations: - print("Warning: ShareGPT dataset is empty or not loaded. Falling back to synthetic workload.") - # Fall back to synthetic generation - users = UserSimulator.generate_mixed_users(self.num_users) - self.generate_requests(users, stop_event) - return - - conversation_iterator = iter(self.sharegpt_loader.iterate_conversations(shuffle=True)) - current_conversation = None - turn_index = 0 - - while not stop_event.is_set(): - # Get next conversation turn - if current_conversation is None or turn_index >= len(current_conversation['turns']): - try: - current_conversation = next(conversation_iterator) - turn_index = 0 - except StopIteration: - # Restart iteration when we run out of conversations - conversation_iterator = iter(self.sharegpt_loader.iterate_conversations(shuffle=True)) - continue - - turn = current_conversation['turns'][turn_index] - context_tokens = turn['context_tokens'] - generate_tokens = turn['generation_tokens'] - - with self.counter_lock: - req_id = self.request_counter - self.request_counter += 1 - - # Assign QoS level based on request characteristics - rand = random.random() - if rand < 0.15: - qos_level, priority = QoSLevel.INTERACTIVE, 3 - elif rand < 0.50: - qos_level, priority = QoSLevel.RESPONSIVE, 2 - else: - qos_level, priority = QoSLevel.BATCH, 1 - - user_id = f"dataset_user_{req_id % self.num_users}" - conv_id = current_conversation['id'] - - # Determine inference phase - phase = InferencePhase.PREFILL if context_tokens >= 10000 else InferencePhase.PREFILL_DECODE - - request = InferenceRequest( - user_id=user_id, - request_id=f"{user_id}_req_{req_id:04d}", - timestamp=datetime.now(), - context_tokens=context_tokens, - generate_tokens=generate_tokens, - priority=priority, - phase=phase, - qos_level=qos_level, - cache_key=f"{conv_id}_turn_{turn['turn_number']}", - conversation_id=conv_id if self.enable_multi_turn else None, - turn_number=turn['turn_number'] if self.enable_multi_turn else None - ) - - priority_tuple = (-QOS_PROFILES[request.qos_level].priority, time.time()) - self.request_queue.put((priority_tuple, request)) - - turn_index += 1 - - # Control request arrival rate (0 = unlimited for storage saturation) - if self.request_rate > 0: - time.sleep(1.0 / self.request_rate) - def generate_requests(self, users: List[UserProfile], stop_event: threading.Event): """Generate requests concurrently for each simulated user.""" @@ -3006,11 +2735,6 @@ def process_requests(self, stop_event: threading.Event): self.results['storage_latencies'].append(storage_latency) self.results['generation_latencies'].append(generation_latency) - # Check if we've hit max_requests limit - if self.max_requests > 0 and self.results['requests_completed'] >= self.max_requests: - if self.stop_event: - self.stop_event.set() - self.qos_monitor.record_request(request) def monitor_stats(self, stop_event: threading.Event): @@ -3106,13 +2830,12 @@ def run(self) -> Dict: print(f" - Mode: {self.autoscaler.mode}") print(f" - QoS Support: Enabled (Interactive/Responsive/Batch)") print(f" - Trace-Driven (BurstGPT): {'Enabled' if self.use_burst_trace else 'Disabled'}") - print(f" - ShareGPT Dataset: {'Enabled' 
if self.use_dataset else 'Disabled'}") if self.max_concurrent_allocs > 0: print(f" - Max Concurrent Allocations: {self.max_concurrent_allocs} (bounds RAM usage)") print("=" * 80) users = [] - if not self.use_burst_trace and not self.use_dataset: + if not self.use_burst_trace: users = UserSimulator.generate_mixed_users(self.num_users) context_lengths = [u.context_length for u in users] print(f"\nUser Context Length Distribution:") @@ -3124,21 +2847,14 @@ def run(self) -> Dict: print(f"\nQoS Distribution:") for level, count in qos_dist.items(): print(f" {level.value}: {count} users") - elif self.use_dataset and self.sharegpt_loader: - print(f"\nShareGPT Dataset Statistics:") - print(f" Conversations: {self.sharegpt_loader.token_stats.get('total_conversations', 0)}") - print(f" Total Turns: {self.sharegpt_loader.token_stats.get('total_turns', 0)}") print(f"\nStarting benchmark...") print("-" * 80) stop_event = threading.Event() - self.stop_event = stop_event # Store for max_requests check threads = [] - if self.use_dataset: - gen_thread = threading.Thread(target=self._generate_requests_from_dataset, args=(stop_event,), daemon=True) - elif self.use_burst_trace: + if self.use_burst_trace: gen_thread = threading.Thread(target=self._generate_requests_from_trace, args=(stop_event,), daemon=True) else: gen_thread = threading.Thread(target=self.generate_requests, args=(users, stop_event), daemon=True) @@ -3158,36 +2874,31 @@ def run(self) -> Dict: threads.append(mon_thread) mon_thread.start() - # Wait for either the configured duration or an earlier stop signal (from max_requests or monitor). - benchmark_start = time.time() + # Wait for either the configured duration or an earlier stop signal from the monitor. stop_event.wait(timeout=self.duration) - actual_duration = time.time() - benchmark_start stop_event.set() for thread in threads: thread.join(timeout=2.0) - self._calculate_stats(actual_duration) + self._calculate_stats() if self.validator: self.results['validation'] = self.validator.validate_benchmark(self.results) return self.results - def _calculate_stats(self, actual_duration: float = None): + def _calculate_stats(self): """Calculate final statistics with all feature breakdowns""" if not self.results['end_to_end_latencies']: print("\nNo requests completed during benchmark!") return - # Use actual duration if provided (for max_requests mode), else configured duration - duration = actual_duration if actual_duration else self.duration - e2e = np.array(self.results['end_to_end_latencies']) storage = np.array(self.results['storage_latencies']) generation = np.array(self.results['generation_latencies']) - cache_stats = self.cache.get_stats(duration) + cache_stats = self.cache.get_stats(self.duration) qos_metrics = self.qos_monitor.get_all_qos_metrics() prefix_stats = self.prefix_cache_manager.stats if self.prefix_cache_manager else {} autoscaling_stats = self.autoscaler.scaling_history if self.autoscaler else [] @@ -3208,11 +2919,8 @@ def _calculate_stats(self, actual_duration: float = None): summary = { 'total_requests': self.results['requests_completed'], 'total_tokens': self.results['total_tokens_generated'], - 'elapsed_time': duration, - 'avg_throughput_tokens_per_sec': self.results['total_tokens_generated'] / duration, - 'total_storage_io_time': self.results['total_storage_io_latency'], - 'storage_throughput_tokens_per_sec': self.results['total_tokens_generated'] / self.results['total_storage_io_latency'] if self.results['total_storage_io_latency'] > 0 else 0, - 'requests_per_second': 
self.results['requests_completed'] / duration, + 'avg_throughput_tokens_per_sec': self.results['total_tokens_generated'] / self.duration, + 'requests_per_second': self.results['requests_completed'] / self.duration, 'end_to_end_latency_ms': { 'mean': np.mean(e2e) * 1000, 'p50': np.percentile(e2e, 50) * 1000, @@ -3319,8 +3027,7 @@ def _print_summary(self, summary: Dict): print(f"\n### OVERALL PERFORMANCE ###") print(f"Requests Completed: {summary['total_requests']}") print(f"Total Tokens Generated: {summary['total_tokens']}") - print(f"Throughput (wall-clock): {summary['avg_throughput_tokens_per_sec']:.2f} tokens/sec") - print(f"Throughput (storage I/O): {summary['storage_throughput_tokens_per_sec']:.2f} tokens/sec") + print(f"Throughput: {summary['avg_throughput_tokens_per_sec']:.2f} tokens/sec") print(f"Requests/sec: {summary['requests_per_second']:.2f}") print(f"\n### END-TO-END LATENCY (Storage I/O + Token Generation) ###") @@ -3461,26 +3168,12 @@ def main(): help='Path to the BurstGPT trace file.') parser.add_argument('--validation-trace', type=str, default=None, help='Path to a real-world trace file for validation.') - parser.add_argument('--dataset-path', type=str, default=None, - help='Path to ShareGPT dataset JSON file for realistic workload generation.') - parser.add_argument('--max-conversations', type=int, default=500, - help='Maximum number of conversations to load from the ShareGPT dataset.') parser.add_argument('--output', type=str, default=f"benchmark_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", help='Output file for results') parser.add_argument('--seed', type=int, default=None, help='Seed for random number generators to ensure reproducibility.') parser.add_argument('--max-concurrent-allocs', type=int, default=0, help='Limit concurrent allocations to bound RAM usage. 0 = unlimited. ' 'Recommended: 8-16 for large models to prevent memory explosion.') - parser.add_argument('--request-rate', type=float, default=0, - help='Target request arrival rate for ShareGPT replay (requests/sec). ' - '0 = unlimited (storage-saturating mode for MLPerf). ' - 'Default: 0. Use 10 for realistic user arrival patterns.') - parser.add_argument('--max-requests', type=int, default=0, - help='Stop after completing N requests (0 = use duration instead). ' - 'Useful for fixed-workload comparisons with vLLM benchmarks.') - parser.add_argument('--xlsx-output', type=str, default=None, - help='Optional: Output Excel file path for summary results with run parameters. ' - 'Requires pandas and openpyxl. Falls back to CSV if openpyxl not available.') args = parser.parse_args() @@ -3515,12 +3208,8 @@ def main(): performance_profile=args.performance_profile, use_burst_trace=args.use_burst_trace, burst_trace_path=args.burst_trace_path, - dataset_path=args.dataset_path, - max_conversations=args.max_conversations, seed=args.seed, - max_concurrent_allocs=args.max_concurrent_allocs, - request_rate=args.request_rate, - max_requests=args.max_requests + max_concurrent_allocs=args.max_concurrent_allocs ) results = benchmark.run() @@ -3542,166 +3231,5 @@ def convert_numpy(obj): print(f"\nResults saved to {args.output}") - # Export to XLSX if requested - if args.xlsx_output: - export_results_to_xlsx(results, args, args.xlsx_output) - - -def export_results_to_xlsx(results: Dict, args, output_path: str): - """ - Export benchmark results to an Excel file with run parameters embedded. - Falls back to CSV if openpyxl is not available. 
- - Args: - results: The benchmark results dictionary - args: The argparse namespace with all CLI parameters - output_path: Path for the output Excel/CSV file - """ - if not PANDAS_AVAILABLE: - print(f"Warning: pandas not available, skipping XLSX export. Install with: pip install pandas") - return - - summary = results.get('summary', {}) - if not summary: - print("Warning: No summary data available for XLSX export") - return - - # Helper to safely get nested keys - def get_nested(d, keys, default=None): - for key in keys: - if isinstance(d, dict): - d = d.get(key, default) - else: - return default - return d - - # Build run parameters row - run_params = { - 'Timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - 'Model': args.model, - 'Num Users': args.num_users, - 'Duration (s)': args.duration, - 'GPU Memory (GB)': args.gpu_mem_gb, - 'CPU Memory (GB)': args.cpu_mem_gb, - 'Generation Mode': args.generation_mode, - 'Performance Profile': args.performance_profile, - 'Multi-turn': not args.disable_multi_turn, - 'Prefix Caching': not args.disable_prefix_caching, - 'RAG Enabled': args.enable_rag, - 'Autoscaling': args.enable_autoscaling, - 'Seed': args.seed, - 'Max Concurrent Allocs': args.max_concurrent_allocs, - 'Request Rate': args.request_rate, - 'Max Requests': args.max_requests, - 'Dataset Path': args.dataset_path or 'N/A', - 'Cache Dir': args.cache_dir or 'temp', - } - - # Build metrics row - metrics = { - 'Total Requests': summary.get('total_requests'), - 'Total Tokens': summary.get('total_tokens'), - 'Elapsed Time (s)': summary.get('elapsed_time'), - 'Avg Throughput (tok/s)': summary.get('avg_throughput_tokens_per_sec'), - 'Storage Throughput (tok/s)': summary.get('storage_throughput_tokens_per_sec'), - 'Requests/sec': summary.get('requests_per_second'), - - # End to End Latency - 'E2E Latency Mean (ms)': get_nested(summary, ['end_to_end_latency_ms', 'mean']), - 'E2E Latency P50 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p50']), - 'E2E Latency P95 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p95']), - 'E2E Latency P99 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p99']), - - # Storage IO Latency - 'Storage Latency Mean (ms)': get_nested(summary, ['storage_io_latency_ms', 'mean']), - 'Storage Latency P50 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p50']), - 'Storage Latency P95 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p95']), - 'Storage Latency P99 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p99']), - - # Generation Latency - 'Gen Latency Mean (ms)': get_nested(summary, ['generation_latency_ms', 'mean']), - 'Gen Latency P50 (ms)': get_nested(summary, ['generation_latency_ms', 'p50']), - 'Gen Latency P95 (ms)': get_nested(summary, ['generation_latency_ms', 'p95']), - 'Gen Latency P99 (ms)': get_nested(summary, ['generation_latency_ms', 'p99']), - - # Cache Stats - 'Cache Hit Rate': get_nested(summary, ['cache_stats', 'cache_hit_rate']), - 'Read/Write Ratio': get_nested(summary, ['cache_stats', 'read_write_ratio']), - 'Total Read (GB)': get_nested(summary, ['cache_stats', 'total_read_gb']), - 'Total Write (GB)': get_nested(summary, ['cache_stats', 'total_write_gb']), - 'Prefill Bytes Written (GB)': get_nested(summary, ['cache_stats', 'prefill_bytes_written_gb']), - 'Decode Bytes Read (GB)': get_nested(summary, ['cache_stats', 'decode_bytes_read_gb']), - - # Tier distribution - 'GPU Entries': get_nested(summary, ['cache_stats', 'gpu_entries']), - 'CPU Entries': get_nested(summary, ['cache_stats', 'cpu_entries']), - 'NVMe 
Entries': get_nested(summary, ['cache_stats', 'nvme_entries']), - - # Multi-turn stats - 'Multi-turn Hit Rate': get_nested(summary, ['multi_turn_stats', 'hit_rate']), - } - - # Combine into single row with all data - combined_row = {**run_params, **metrics} - - # Create DataFrame - df = pd.DataFrame([combined_row]) - - # Determine output format - use_excel = OPENPYXL_AVAILABLE and output_path.endswith('.xlsx') - - try: - if use_excel: - # Create Excel with multiple sheets for better organization - with pd.ExcelWriter(output_path, engine='openpyxl') as writer: - # Sheet 1: Combined summary (single row for easy aggregation) - df.to_excel(writer, sheet_name='Summary', index=False) - - # Sheet 2: Run Parameters (vertical format for readability) - params_df = pd.DataFrame(list(run_params.items()), columns=['Parameter', 'Value']) - params_df.to_excel(writer, sheet_name='Run Parameters', index=False) - - # Sheet 3: Performance Metrics (vertical format) - metrics_df = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Value']) - metrics_df.to_excel(writer, sheet_name='Performance Metrics', index=False) - - # Sheet 4: QoS Metrics if available - qos_metrics = summary.get('qos_metrics', {}) - if qos_metrics: - qos_rows = [] - for level, data in qos_metrics.items(): - if isinstance(data, dict) and not data.get('no_data'): - qos_rows.append({ - 'QoS Level': level, - 'Total Requests': data.get('total_requests'), - 'Latency P95 (ms)': get_nested(data, ['latency_ms', 'p95']), - 'Latency P99 (ms)': get_nested(data, ['latency_ms', 'p99']), - 'SLA Met': get_nested(data, ['sla', 'met']), - 'SLA Compliance': get_nested(data, ['sla', 'compliance']), - }) - if qos_rows: - qos_df = pd.DataFrame(qos_rows) - qos_df.to_excel(writer, sheet_name='QoS Metrics', index=False) - - print(f"XLSX results saved to {output_path}") - else: - # Fall back to CSV - csv_path = output_path.replace('.xlsx', '.csv') if output_path.endswith('.xlsx') else output_path - if not csv_path.endswith('.csv'): - csv_path += '.csv' - df.to_csv(csv_path, index=False) - print(f"CSV results saved to {csv_path} (openpyxl not available for XLSX)") - - except Exception as e: - print(f"Error saving XLSX/CSV: {e}") - # Last resort: try CSV - try: - csv_path = output_path.replace('.xlsx', '.csv') - df.to_csv(csv_path, index=False) - print(f"Fallback CSV saved to {csv_path}") - except Exception as e2: - print(f"Failed to save results: {e2}") - - if __name__ == "__main__": main() From fdd95fdfc4a5243d4d16e2b085ba73c1785342a5 Mon Sep 17 00:00:00 2001 From: Curtis Anderson <99758333+FileSystemGuy@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:40:57 -0800 Subject: [PATCH 2/4] Revise KV Cache Benchmark script for MLPerf updates Updated script for KV Cache Storage Benchmark to reflect new author attribution and modified test parameters for MLPerf submissions. --- kv_cache_benchmark/kv-cache-wrapper.sh | 160 ++++++------------------- 1 file changed, 34 insertions(+), 126 deletions(-) diff --git a/kv_cache_benchmark/kv-cache-wrapper.sh b/kv_cache_benchmark/kv-cache-wrapper.sh index 2b648d6a..b8f52dba 100644 --- a/kv_cache_benchmark/kv-cache-wrapper.sh +++ b/kv_cache_benchmark/kv-cache-wrapper.sh @@ -1,7 +1,7 @@ #!/bin/bash # KV Cache Storage Benchmark - Multi-Tier Performance Comparison -# Kingston Digital, 2025 -# Apache 2.0 license +# Hazem Awadallah, Kingston Digital, 2025 +# Assisted by Github Copilot # This script runs a comprehensive comparison of cache tier configurations for LLM inference workloads. 
# It automatically detects your hardware (GPU, RAM, storage) and runs 9 different test scenarios to show # you exactly where your data ends up and how fast it moves between tiers. @@ -371,7 +371,7 @@ if should_run 'capacity-autoscale'; then --num-users "$capacity_start_users" \ --duration "$autoscale_duration" \ --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ + --cpu-mem-gb 0 \ --enable-autoscaling \ --autoscaler-mode capacity \ --generation-mode none \ @@ -388,129 +388,53 @@ else fi # ============================================================================== -# OFFICIAL MLPERF SUBMISSION WORKLOAD (DISCOVERY-VALIDATED) +# OFFICIAL MLPERF SUBMISSION WORKLOAD # ============================================================================== -# These invocations have been validated through extensive discovery testing: -# - 1,411 Fast system tests (14,000 MB/s NVMe) -# - 268 Slow system tests (3,000 MB/s storage) -# -# KEY FINDINGS FROM DISCOVERY TESTING: -# - Storage Throughput metric is UNRELIABLE at cpu_mem=0GB (only 1.1x differentiation) -# - Decode Bytes Read shows 2.62x differentiation at cpu_mem=0GB (100% win rate) -# - Wall-Clock Throughput shows 2.43x differentiation at cpu_mem=0GB (100% win rate) -# - Storage Throughput works at cpu_mem=4GB (2.2x differentiation, 97% win rate) -# - High variance (CV 50-125%) requires multiple trials +# This is a special workload that runs only the two required scenarios for an +# official MLPerf v3.0 storage submission. It uses fixed, long durations and +# specific user counts to ensure results are standardized and comparable. # -# This workload runs TWO configurations: -# 1. Maximum Storage Stress (cpu_mem=0GB) - Use Decode Bytes Read as primary metric -# 2. Storage Throughput Test (cpu_mem=4GB) - Use Storage Throughput as primary metric +# NOTE: These parameters are intentionally stressful. They use a high user count +# with a small CPU memory budget to force near-constant NVMe access. The goal is +# to saturate the storage device and measure its performance under extreme load. +# Expect very high latencies; this is not a test of user experience, but a +# benchmark of the underlying storage hardware's breaking point. See the +# analysis in `report_analysis.md` for context on why this occurs. # ============================================================================== if should_run 'mlperf_submission'; then echo "============================================================================" - echo "RUNNING OFFICIAL MLPERF SUBMISSION WORKLOAD (DISCOVERY-VALIDATED)" + echo "RUNNING OFFICIAL MLPERF SUBMISSION WORKLOAD" echo "============================================================================" echo "" - echo "NOTE: Discovery testing validated these configurations across 1,679 tests." - echo " See mlperfv3_results_and_metrics_discovery.md for full analysis." - echo "" - # ------------------------------------------------------------------------- - # Test 1: Maximum Storage Stress (cpu_mem=0GB) - # Primary Metrics: Decode Bytes Read (2.62x), Wall-Clock Throughput (2.43x) - # WARNING: Do NOT use Storage Throughput at cpu_mem=0GB (only 1.1x differentiation) - # ------------------------------------------------------------------------- - echo "[MLPerf 1/4] Maximum Storage Stress: llama3.1-8b, cpu_mem=0GB, 200 users..." - echo " PRIMARY METRICS: Decode Bytes Read, Wall-Clock Throughput" - echo " WARNING: Storage Throughput unreliable at cpu_mem=0GB" + echo "[MLPerf 1/2] Standard Submission: llama3.1-8b with 150 users..." 
python3 kv-cache.py \ --model llama3.1-8b \ - --num-users 200 \ - --duration 300 \ + --num-users 150 \ + --duration 600 \ --gpu-mem-gb 0 \ --cpu-mem-gb 0 \ - --max-concurrent-allocs 16 \ - --generation-mode none \ - --cache-dir "$cache_dir" \ - --seed 42 \ - --output mlperf_v3_stress_8b.json - echo "Maximum storage stress test (8B) complete." - echo "" - - # ------------------------------------------------------------------------- - # Test 2: Storage Throughput Test (cpu_mem=4GB) - # Primary Metric: Storage Throughput (2.2x differentiation, 97% win rate) - # ------------------------------------------------------------------------- - echo "[MLPerf 2/4] Storage Throughput Test: llama3.1-8b, cpu_mem=4GB, 100 users..." - echo " PRIMARY METRIC: Storage Throughput (tok/s)" - python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 100 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --max-concurrent-allocs 0 \ - --generation-mode none \ + --generation-mode realistic \ + --performance-profile throughput \ --cache-dir "$cache_dir" \ --seed 42 \ - --output mlperf_v3_throughput_8b.json - echo "Storage throughput test (8B) complete." + --output mlperf_v3_storage_submission_8b.json + echo "Standard submission test complete." echo "" - # ------------------------------------------------------------------------- - # Test 3: Large Model Storage Stress (70B, cpu_mem=0GB) - # 70B model generates ~10x more I/O per token than 8B - # ------------------------------------------------------------------------- - echo "[MLPerf 3/4] Large Model Stress: llama3.1-70b-instruct, cpu_mem=0GB, 70 users..." - echo " PRIMARY METRICS: Decode Bytes Read, Wall-Clock Throughput" + echo "[MLPerf 2/2] Large Model Submission: llama3.1-70b-instruct with 40 users..." python3 kv-cache.py \ --model llama3.1-70b-instruct \ - --num-users 70 \ - --duration 300 \ + --num-users 40 \ + --duration 600 \ --gpu-mem-gb 0 \ --cpu-mem-gb 0 \ - --max-concurrent-allocs 4 \ - --generation-mode none \ - --cache-dir "$cache_dir" \ - --seed 42 \ - --output mlperf_v3_stress_70b.json - echo "Large model storage stress test (70B) complete." - echo "" - - # ------------------------------------------------------------------------- - # Test 4: Large Model Throughput Test (70B, cpu_mem=4GB) - # ------------------------------------------------------------------------- - echo "[MLPerf 4/4] Large Model Throughput: llama3.1-70b-instruct, cpu_mem=4GB, 50 users..." - echo " PRIMARY METRIC: Storage Throughput (tok/s)" - python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --num-users 50 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --max-concurrent-allocs 4 \ - --generation-mode none \ + --generation-mode realistic \ + --performance-profile throughput \ --cache-dir "$cache_dir" \ --seed 42 \ - --output mlperf_v3_throughput_70b.json - echo "Large model throughput test (70B) complete." 
- echo "" - - echo "============================================================================" - echo "MLPERF SUBMISSION WORKLOAD COMPLETE" - echo "============================================================================" - echo "" - echo "METRIC SELECTION GUIDE (based on discovery testing):" - echo "" - echo " For cpu_mem=0GB tests (mlperf_v3_stress_*.json):" - echo " - PRIMARY: Decode Bytes Read (2.62x differentiation, 100% win rate)" - echo " - PRIMARY: Wall-Clock Throughput (2.43x differentiation, 100% win rate)" - echo " - DO NOT USE: Storage Throughput (only 1.1x at cpu_mem=0GB)" - echo "" - echo " For cpu_mem=4GB tests (mlperf_v3_throughput_*.json):" - echo " - PRIMARY: Storage Throughput (2.2x differentiation, 97% win rate)" - echo "" - echo " TRIAL RECOMMENDATION: Run 3-5 trials per configuration (CV 50-125%)" - echo "============================================================================" + --output mlperf_v3_storage_submission_70b.json + echo "Large model submission test complete." echo "" fi @@ -527,7 +451,7 @@ if should_run 'gpu-only'; then --num-users $users_baseline \ --duration "$tier_duration" \ --gpu-mem-gb $gpu_mem_gb \ - --cpu-mem-gb 4 \ + --cpu-mem-gb 0 \ --generation-mode realistic \ "${rag_args[@]}" \ --seed 42 \ @@ -593,7 +517,7 @@ if should_run 'storage-only'; then --num-users $users_baseline \ --duration "$tier_duration" \ --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ + --cpu-mem-gb 0 \ --generation-mode realistic \ --cache-dir $cache_dir \ "${rag_args[@]}" \ @@ -756,7 +680,7 @@ if should_run 'storage-saturation'; then --num-users $users_high \ --duration "$saturation_duration" \ --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ + --cpu-mem-gb 0 \ --generation-mode realistic \ --cache-dir $cache_dir \ "${rag_args[@]}" \ @@ -908,25 +832,15 @@ print("COMPREHENSIVE BENCHMARK ANALYSIS") print("="*100) # Scenario catalog ties each results JSON to a friendly description. -# Updated to reflect discovery-validated MLPerf invocations (Jan 2026) scenarios = [ - # MLPerf Stress Tests (cpu_mem=0GB) - Use Decode Bytes Read / Wall-Clock Throughput - ("mlperf_stress_8b", "mlperf_v3_stress_8b.json", "MLPerf: Storage Stress (8B, cpu_mem=0GB)", "Maximum storage stress test. PRIMARY METRICS: Decode Bytes Read (2.62x), Wall-Clock Throughput (2.43x). WARNING: Storage Throughput unreliable at cpu_mem=0GB."), - ("mlperf_stress_70b", "mlperf_v3_stress_70b.json", "MLPerf: Storage Stress (70B, cpu_mem=0GB)", "Large model storage stress (~10x I/O per token). PRIMARY METRICS: Decode Bytes Read, Wall-Clock Throughput."), - # MLPerf Throughput Tests (cpu_mem=4GB) - Use Storage Throughput - ("mlperf_throughput_8b", "mlperf_v3_throughput_8b.json", "MLPerf: Storage Throughput (8B, cpu_mem=4GB)", "Storage throughput benchmark. PRIMARY METRIC: Storage Throughput (2.2x differentiation, 97% win rate)."), - ("mlperf_throughput_70b", "mlperf_v3_throughput_70b.json", "MLPerf: Storage Throughput (70B, cpu_mem=4GB)", "Large model throughput test. PRIMARY METRIC: Storage Throughput."), - # Legacy MLPerf filenames (for backwards compatibility) - ("mlperf_submission_8b", "mlperf_v3_storage_submission_8b.json", "MLPerf: Legacy Submission (8B)", "Legacy format. Consider using new discovery-validated invocations."), - ("mlperf_submission_70b", "mlperf_v3_storage_submission_70b.json", "MLPerf: Legacy Submission (70B)", "Legacy format. 
Consider using new discovery-validated invocations."), - # Tier tests + ("mlperf_submission_8b", "mlperf_v3_storage_submission_8b.json", "MLPerf: Standard Submission (8B)", "Official MLPerf v3.0 storage submission with llama3.1-8b."), + ("mlperf_submission_70b", "mlperf_v3_storage_submission_70b.json", "MLPerf: Large Model Submission (70B)", "Official MLPerf v3.0 storage submission with llama3.1-70b."), ("gpu-only", "results_tier_gpu_only.json", "Tier: GPU Only", "All KV cache pinned in GPU VRAM for a latency baseline."), ("cpu-only", "results_tier_cpu_only.json", "Tier: CPU Only", "Cache entirely in system RAM (typical production baseline)."), ("storage-only", "results_tier_storage_only.json", "Tier: Storage Only", "Forces every lookup to NVMe/SSD to expose disk behaviour."), ("gpu-cpu", "results_tier_gpu_cpu.json", "Tier: GPU + CPU", "Two-tier hot/warm cache without backing storage."), ("cpu-storage", "results_tier_cpu_storage.json", "Tier: CPU + Storage", "RAM backed by NVMe spillover for larger working sets."), ("gpu-cpu-storage", "results_tier_gpu_cpu_storage.json", "Tier: GPU + CPU + Storage", "Full three-tier hierarchy (VRAM + RAM + NVMe)."), - # Stress tests ("storage-saturation", "results_stress_storage_saturation.json", "Stress: Storage Saturation", "High-concurrency workload with constrained RAM to find NVMe limits."), ("production", "results_realistic_production.json", "Stress: Realistic Production", "Balanced configuration intended to mimic steady-state inference load."), ("autoscale", "results_autoscaling_discovery.json", "Stress: Autoscaling Discovery", "Adaptive user ramp designed to discover sustainable concurrency."), @@ -935,14 +849,8 @@ scenarios = [ selected_env = os.getenv("KVCACHE_SELECTED_WORKLOADS", "") selected_keys = {item.strip() for item in selected_env.split(",") if item.strip()} if selected_env else set() -# If mlperf_submission is selected, add all MLPerf sub-scenarios to the list to be processed. +# If mlperf_submission is selected, add its sub-scenarios to the list to be processed. if "mlperf_submission" in selected_keys: - # New discovery-validated scenarios - selected_keys.add("mlperf_stress_8b") - selected_keys.add("mlperf_stress_70b") - selected_keys.add("mlperf_throughput_8b") - selected_keys.add("mlperf_throughput_70b") - # Legacy scenarios (for backwards compatibility) selected_keys.add("mlperf_submission_8b") selected_keys.add("mlperf_submission_70b") From 01ca82488ee7e1b71e1ef50ea6c5b7b916b7cbd4 Mon Sep 17 00:00:00 2001 From: Curtis Anderson <99758333+FileSystemGuy@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:41:30 -0800 Subject: [PATCH 3/4] Revise README for KV Cache benchmark implementation Updated README to include initial implementation details and enhanced overview of the KV Cache benchmark for MLPerf Storage v3. --- kv_cache_benchmark/README.md | 773 ++--------------------------------- 1 file changed, 23 insertions(+), 750 deletions(-) diff --git a/kv_cache_benchmark/README.md b/kv_cache_benchmark/README.md index 5f0637c1..e432d46b 100644 --- a/kv_cache_benchmark/README.md +++ b/kv_cache_benchmark/README.md @@ -1,766 +1,39 @@ # MLPerf Storage KV Cache Benchmark -A storage benchmarking tool for Large Language Model inference systems. This benchmark measures the performance of your storage subsystem under realistic KV cache offloading workloads, helping you answer critical questions about hardware capacity and configuration. +This directory contains the initial implementation of the KV Cache benchmark for MLPerf Storage v3. 
-**Author:** Hazem Awadallah, Kingston Digital -**License:** Apache 2.0 -**Version:** MLPerf Storage v3.0 (Enhanced) +## Overview ---- +The KV Cache benchmark simulates the storage access patterns of Large Language Model (LLM) inference systems, specifically focusing on key-value cache operations that are critical for multi-turn conversations and long-context processing. -## Table of Contents +## Components -1. [What This Benchmark Does](#what-this-benchmark-does) -2. [Architecture Overview](#architecture-overview) -3. [System Requirements](#system-requirements) -4. [Installation](#installation) -5. [Quick Start](#quick-start) -6. [Running the Benchmark](#running-the-benchmark) -7. [ShareGPT Replay Workloads](#sharegpt-replay-workloads) -8. [Using the Wrapper Script](#using-the-wrapper-script) -9. [Understanding Results](#understanding-results) -10. [Unit Testing](#unit-testing) -11. [Excel Export](#excel-export) -12. [MLPerf Submission Guidelines](#mlperf-submission-guidelines) -13. [Troubleshooting](#troubleshooting) +### Core Scripts ---- +- **kv-cache.py**: Main benchmark implementation for KV cache storage performance testing +- **kv-cache_sharegpt_replay.py**: ShareGPT conversation replay-based benchmark for realistic workload simulation +- **kv-cache-wrapper.sh**: Wrapper script for running benchmark configurations +- **validate.sh**: Validation script for benchmark results -## What This Benchmark Does +### Documentation -During LLM inference, models store intermediate attention data in a structure called the KV (Key-Value) cache. This cache grows with conversation length and can consume enormous amounts of memory. Production systems offload this cache from expensive GPU VRAM to cheaper CPU RAM or NVMe storage. +- **MLperf v3 KV cache proposal.md**: Detailed proposal for KV cache benchmark integration into MLPerf Storage +- **MLperf v3 KV cache proposal.pdf**: PDF version of the proposal +- **sources.md**: References and source documentation -This benchmark simulates that offloading behavior. It generates realistic multi-user inference workloads and measures how your storage performs under pressure. It measures these components: +## Purpose -- How many concurrent users your hardware can support -- Whether your NVMe drive is fast enough to handle cache spillover -- The real latency impact of each storage tier -- Where the bottleneck sits in your system +This benchmark addresses the growing need to measure storage system performance under AI/ML inference workloads, particularly: -This is not a pass/fail test. It is a diagnostic tool for system architects and performance engineers. +- Key-value cache read/write patterns +- Mixed sequential and random access patterns +- Multi-threaded concurrent access +- Realistic conversation-based workload replay ---- +## Getting Started -## Architecture Overview +See the proposal documents for detailed information about the benchmark design, metrics, and validation criteria. -The benchmark implements a three-tier memory hierarchy that mirrors production LLM serving systems. 
+## Status -``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ KV Cache Benchmark Architecture │ -└─────────────────────────────────────────────────────────────────────────────┘ - - ┌──────────────────┐ - │ User Requests │ - │ (Multi-tenant) │ - └────────┬─────────┘ - │ - ▼ - ┌──────────────────────────────────────┐ - │ Request Queue │ - │ (Priority-based: QoS levels) │ - │ Interactive > Responsive > Batch │ - └──────────────────┬───────────────────┘ - │ - ▼ - ┌────────────────────────────────────────────────────────┐ - │ IntegratedBenchmark │ - │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ │ - │ │ Prefill │ │ Decode │ │ Conversation │ │ - │ │ (Write) │ │ (Read) │ │ Manager │ │ - │ └──────┬──────┘ └──────┬──────┘ └────────┬────────┘ │ - └─────────┼────────────────┼─────────────────┼───────────┘ - │ │ │ - └────────────────┼─────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────────────────┐ -│ MultiTierCache │ -│ (Waterfall LRU Eviction) │ -│ │ -│ New Data ─────► Always targets fastest available tier │ -│ If full, LRU entry cascades down │ -│ │ -│ ┌─────────────────────────────────────────────────────────────────────┐ │ -│ │ │ │ -│ │ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ │ -│ │ │ GPU VRAM │ │ CPU RAM │ │ NVMe │ │ │ -│ │ │ (Tier 1) │─────►│ (Tier 2) │─────►│ (Tier 3) │ │ │ -│ │ │ │ LRU │ │ LRU │ │ │ │ -│ │ │ Sub-ms │evict │ Tens of ms │evict │ Hundreds │ │ │ -│ │ │ latency │ │ latency │ │ of ms │ │ │ -│ │ │ │ │ │ │ │ │ │ -│ │ │ PyTorch/CuPy │ │ NumPy arrays │ │ .npy files │ │ │ -│ │ │ tensors │ │ in memory │ │ on disk │ │ │ -│ │ └───────────────┘ └───────────────┘ └───────────────┘ │ │ -│ │ │ │ -│ │ ◄──── HOT DATA ────────────────────────────── COLD DATA ────► │ │ -│ │ │ │ -│ └─────────────────────────────────────────────────────────────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────────────────┘ -``` - -### Key Components - -**MultiTierCache**: The core engine. It decides where to place data based on available space and access patterns. New data always targets the fastest tier. When that tier fills up, the least recently used entry gets pushed down to the next tier. - -**Inference Phases**: The benchmark models two distinct I/O patterns: -- **Prefill**: Write-heavy. Processing the user prompt generates new KV cache entries. -- **Decode**: Read-heavy. Generating each output token requires reading the existing cache. - -**User Simulation**: Creates realistic traffic from multiple concurrent users with different behaviors (chatbot, coding assistant, document analysis) and priority levels. - -**Autoscaler**: Automatically adjusts user load to find either the maximum users your system can handle (QoS mode) or the peak throughput of your storage (capacity mode). - ---- - -## System Requirements - -### Minimum - -- CPU: 8+ cores (AMD EPYC, Intel Xeon) -- RAM: 32 GB -- Storage: 256 GB free space on SSD -- OS: Linux (Ubuntu 22.04, RHEL 9, or similar) -- Python: 3.8 or higher -- No GPU required (runs in CPU-only mode) - -### Recommended - -- CPU: 32+ cores -- RAM: 128 GB or more -- GPU: NVIDIA A100/H100 with 40+ GB VRAM (optional but enables full three-tier testing) -- Storage: 1 TB+ on NVMe (PCIe Gen4 or Gen5) -- Tools: `bc`, `jq` for the wrapper script - ---- - -## Installation - -1. Clone or download this repository. - -2. 
Install Python dependencies: - -```bash -pip install -r requirements.txt -``` - -Or install core dependencies manually: - -```bash -pip install numpy -``` - -3. For GPU support (optional): - -```bash -pip install torch # or cupy-cuda12x for CuPy -``` - -4. For ShareGPT replay workloads (optional): - -```bash -pip install tiktoken -``` - -5. For Excel export (optional): - -```bash -pip install pandas openpyxl -``` - -6. Verify the installation: - -```bash -python3 kv-cache.py --help -``` - ---- - -## Quick Start - -Run a basic storage test with 50 users for 2 minutes: - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 120 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results.json -``` - -This forces all cache operations to hit your NVMe drive, giving you a baseline measurement of storage performance. - ---- - -## Running the Benchmark - -### Command Line Options - -``` -python3 kv-cache.py [options] - -Required Arguments: - --model MODEL Model configuration to use. Choices: - tiny-1b, mistral-7b, llama2-7b, llama3.1-8b, - llama3.1-70b-instruct - --num-users N Number of concurrent users to simulate - --duration SECONDS Duration of the benchmark in seconds - -Memory Configuration: - --gpu-mem-gb N GPU VRAM budget in GB (0 to disable GPU tier) - --cpu-mem-gb N CPU RAM budget in GB (0 to disable CPU tier) - --cache-dir PATH Directory for NVMe cache files (defaults to temp directory) - -Token Generation: - --generation-mode Token generation speed simulation. Choices: - - none: Pure storage test, no GPU simulation - - fast: 2ms per token (high-end GPU) - - realistic: 30ms per token (typical production) - -Caching Features: - --disable-multi-turn Disable multi-turn conversation caching - --disable-prefix-caching - Disable prefix caching (shared system prompts) - -Autoscaling: - --enable-autoscaling Enable workload autoscaling - --autoscaler-mode Autoscaling strategy. Choices: - - qos: Latency-based, finds max users at target saturation - - capacity: Throughput-based, finds peak storage performance - --target-saturation N Target storage saturation for QoS autoscaling (0.0-1.0, - default: 0.8) - -ShareGPT Replay (NEW): - --dataset-path PATH Path to ShareGPT JSON for realistic workload replay - --max-conversations N Max conversations to load from dataset (default: 500) - --request-rate RATE Target request arrival rate (requests/sec) - --max-requests N Stop after N requests (for fixed-length runs) - -RAG Workload: - --enable-rag Enable RAG workload simulation - --rag-num-docs N Number of RAG documents to ingest - -Performance and Output: - --performance-profile Profile for pass/fail criteria. Choices: - - latency: Default, evaluates P95 latency targets - - throughput: For MLPerf submission, evaluates tokens/sec - --output FILE Write results to JSON file - --xlsx-output FILE Export results to Excel/CSV file (NEW) - --seed N Seed for random number generators (required for MLPerf - reproducibility) - -Resource Limits: - --max-concurrent-allocs N - Limit concurrent cache allocations to bound RAM usage. - 0 = unlimited. Recommended: 8-16 for large models to - prevent memory explosion. -``` - -### Test Scenarios - -#### Scenario 1: Storage-Only Baseline - -Isolate your NVMe drive by setting GPU memory to zero. This tells you the raw performance of your storage. 
- -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 180 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_storage_only.json -``` - -#### Scenario 2: Realistic Production Setup - -Test a balanced three-tier configuration that mirrors production deployment. - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 100 \ - --duration 300 \ - --gpu-mem-gb 16 \ - --cpu-mem-gb 32 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_production.json -``` - -#### Scenario 3: Find Maximum User Count (QoS Mode) - -Let the autoscaler discover how many users your system can handle while maintaining acceptable latency. - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 20 \ - --duration 300 \ - --gpu-mem-gb 16 \ - --cpu-mem-gb 32 \ - --enable-autoscaling \ - --autoscaler-mode qos \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_autoscale_qos.json -``` - -#### Scenario 4: Find Peak Storage Throughput (Capacity Mode) - -Discover the absolute maximum I/O your storage can deliver by ignoring latency constraints. - -```bash -python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --num-users 10 \ - --duration 180 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --enable-autoscaling \ - --autoscaler-mode capacity \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_capacity.json -``` - ---- - -## ShareGPT Replay Workloads - -While synthetic workloads are excellent for controlled stress testing, they may not capture the nuances of real human-AI interaction. The **ShareGPT Replay** feature addresses this by loading actual conversation data. - -### Why Use ShareGPT? 
- -Real conversations exhibit different patterns than synthetic workloads: -- **Higher cache locality**: Users ask follow-up questions, reusing context -- **Variable context sizes**: Real queries vary wildly (10-16,000 tokens) -- **Multi-turn structure**: Conversation flows are preserved - -### Downloading the ShareGPT Dataset - -Download the full dataset from Hugging Face (~1.2 GB): - -```bash -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -``` - -**Alternative: Smaller subset for quick testing (~40 MB):** - -```bash -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json -``` - -### Basic ShareGPT Invocation - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-conversations 500 \ - --num-users 50 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt.json -``` - -### ShareGPT with Rate Limiting - -Control the request arrival rate for steady-state testing: - -```bash -python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-conversations 1000 \ - --request-rate 10.0 \ - --num-users 100 \ - --duration 600 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 8 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt_rate_limited.json -``` - -### ShareGPT with Fixed Request Count - -Run exactly N requests for reproducible benchmarks: - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-requests 5000 \ - --num-users 50 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt_fixed.json -``` - -### Comparing Real vs Synthetic Workloads - -| Metric | ShareGPT (Real) | Synthetic (Random) | -| :--- | :--- | :--- | -| Mean Context Size | ~133 tokens | ~2,676 tokens | -| Cache Hit Rate | 85-97% | 50-70% | -| Multi-turn Locality | High | Medium | -| Throughput | Higher | Lower | -| NVMe Stress | Moderate | Extreme | - -**Use ShareGPT** when you want to model real chatbot/assistant usage. -**Use Synthetic** when you want worst-case stress testing or controlled experiments. - ---- - -## Using the Wrapper Script - -The `kv-cache-wrapper.sh` script automates a complete benchmark suite. It detects your hardware, calculates appropriate parameters, and runs multiple test scenarios. - -### Basic Usage - -```bash -./kv-cache-wrapper.sh -``` - -This runs all test scenarios with default settings. Expect roughly 30 minutes for the full suite. 
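Each scenario in the suite leaves behind its own results JSON (for example `results_tier_storage_only.json` or `results_stress_storage_saturation.json`, matching the scenario catalog inside the wrapper). If you want a quick look at the headline numbers before digging into the wrapper's built-in analysis, they can be pulled out with `jq` (listed under recommended tools). The snippet below is only a sketch: the field names assume the `summary` layout that `kv-cache.py` writes (`avg_throughput_tokens_per_sec`, `requests_per_second`, `cache_stats.cache_hit_rate`), so adjust the paths if your output differs.

```bash
# Print headline metrics from each wrapper-generated results file.
# Field names assume the summary structure emitted by kv-cache.py;
# missing files are skipped and missing fields appear as null.
for f in results_tier_*.json results_stress_*.json results_realistic_*.json; do
    [ -f "$f" ] || continue
    echo "== $f =="
    jq '{throughput_tok_s: .summary.avg_throughput_tokens_per_sec,
         requests_per_sec: .summary.requests_per_second,
         cache_hit_rate: .summary.cache_stats.cache_hit_rate}' "$f"
done
```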
- -### Options - -``` -./kv-cache-wrapper.sh [options] - - -m MODEL Model to benchmark (default: llama3.1-8b) - -t SECONDS Duration for tier comparison tests (default: 120) - -s SECONDS Duration for storage saturation test (default: 180) - -r SECONDS Duration for production test (default: 180) - -a SECONDS Duration for autoscaling tests (default: 300) - -w LIST Comma-separated list of workloads to run - -u USERS Override baseline user count - -U USERS Override high-load user count - -R Enable RAG workload - -D DOCS Number of RAG documents (default: 10) - -h Show help -``` - -### Available Workloads - -```bash -# Run only the storage isolation test -./kv-cache-wrapper.sh -w storage-only - -# Run production and autoscaling tests -./kv-cache-wrapper.sh -w production,autoscale - -# Run MLPerf submission tests -./kv-cache-wrapper.sh -w mlperf_submission -``` - ---- - -## Understanding Results - -### Key Metrics - -**Throughput (tokens/sec)**: How many tokens the system processes per second. Higher is better. - -**Storage Throughput (tokens/sec)**: Raw I/O performance calculated from storage latency, not wall-clock time. This is the fairer metric for comparing storage tiers. - -**End-to-End Latency**: Total time from request submission to completion. This is what users experience. - -**Storage I/O Latency**: Time spent reading from and writing to storage tiers. This measures your hardware. - -**Queue Wait Time**: Time requests spend waiting before processing begins. If this dominates, your system is overloaded. - -**Cache Hit Rate**: Percentage of reads served from cache. Higher rates mean less storage pressure. - -### Reading the Output - -``` -### STORAGE PERFORMANCE ASSESSMENT: PASS ### - Criteria Passed: 4/4 - [PASS] NVMe Write P95 < 500ms: 45.20ms - [PASS] NVMe Read P95 < 200ms: 123.45ms - [PASS] CPU RAM P95 < 150ms: 12.30ms - [PASS] Cache Hit Rate > 30%: 67.5% - -### OVERALL PERFORMANCE ### - Total Requests: 2847 - Total Tokens Generated: 489,231 - Avg Throughput: 1,630.77 tok/s - Storage Throughput: 2,105.32 tok/s - -### LATENCY BREAKDOWN ### - End-to-End: mean 89.3ms, P50 45.2ms, P95 312.4ms - Storage I/O: mean 23.1ms, P50 12.4ms, P95 89.2ms -``` - ---- - -## Unit Testing - -This package includes a comprehensive pytest-based test suite to verify core functionality without running the full benchmark. 
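`pytest` itself is not included in the core dependency list from the Installation section, so if it is not already present in your environment, install it first (assuming a standard pip-based setup):

```bash
pip install pytest
```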
- -### Running Tests - -```bash -# Run all tests with verbose output -pytest test_kv_cache.py -v - -# Run with shorter traceback -pytest test_kv_cache.py -v --tb=short - -# Run specific test class -pytest test_kv_cache.py -k "TestModelConfig" -v - -# Run only CPU tests (skip GPU tests if no CUDA) -pytest test_kv_cache.py -v -m "not skipif" -``` - -### Test Coverage - -The test suite covers 12 component categories: - -| Test Class | Coverage | -|------------|----------| -| `TestModelConfig` | Model configurations, KV cache size calculations | -| `TestInferenceRequest` | Request dataclass, cache key generation | -| `TestQoSProfiles` | QoS levels, SLA targets, priorities | -| `TestKVCacheGenerator` | Determinism, shapes, dtypes, precomputed buffers | -| `TestCPUMemoryBackend` | Write/read/delete/clear operations | -| `TestNVMeBackend` | File I/O, metadata, temp directories | -| `TestGPUMemoryBackend` | CUDA tensors, device placement (skipped without GPU) | -| `TestConversationManager` | Multi-turn tracking, eviction | -| `TestUserSimulator` | User generation, QoS distribution | -| `TestMultiTierCache` | CPU-only mode, allocation, access | -| `TestMultiTierCacheWithGPU` | GPU tier, waterfall eviction (skipped without GPU) | -| `TestXLSXExport` | CSV/Excel export (skipped without pandas) | - -### Expected Runtime - -- **Without GPU**: ~3-5 seconds -- **With GPU**: ~5-10 seconds - -GPU tests are automatically skipped if CUDA is not available. - ---- - -## Excel Export - -The benchmark can export results directly to Excel or CSV format for analysis. - -### Basic Usage - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 120 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --seed 42 \ - --output results.json \ - --xlsx-output results.xlsx -``` - -### Output Format - -The Excel file contains a single row with all key metrics: - -| Column | Description | -|--------|-------------| -| Model | Model configuration used | -| Num Users | Concurrent user count | -| Duration (s) | Benchmark duration | -| GPU Mem (GB) | GPU memory budget | -| CPU Mem (GB) | CPU memory budget | -| Total Requests | Requests completed | -| Total Tokens | Tokens processed | -| Avg Throughput (tok/s) | Wall-clock throughput | -| Storage Throughput (tok/s) | Storage I/O throughput | -| Cache Hit Rate | Percentage of cache hits | -| E2E Latency P95 (ms) | End-to-end 95th percentile | -| Storage IO P95 (ms) | Storage I/O 95th percentile | - -### Fallback Behavior - -- **With openpyxl**: Exports to `.xlsx` format -- **Without openpyxl**: Falls back to `.csv` format -- **Without pandas**: Export is skipped with a warning - ---- - -## MLPerf Submission Guidelines - -For official MLPerf v3.0 storage submissions, use these standardized commands. **These invocations have been validated through extensive discovery testing** (1,411 Fast system tests, 268 Slow system tests comparing 14,000 MB/s vs 3,000 MB/s storage). - -### Discovery Test Key Findings - -| Finding | Impact | -|---------|--------| -| **Metric selection depends on cpu_mem** | Storage Throughput shows only 1.1x at cpu_mem=0GB but 2.2x at cpu_mem=4GB | -| **Best models for differentiation** | llama3.1-8b and mistral-7b show 2.31x ratio | -| **High variance observed** | CV 50-125%, requires 3-5 trials minimum | -| **100% win rate metrics** | Decode Bytes Read and Wall-Clock Throughput at cpu_mem=0GB | - -### Option 1: Maximum Storage Stress (cpu_mem=0GB) - -Use when you want to stress test NVMe and measure I/O volume differentiation. 
- -**Primary Metrics:** Decode Bytes Read (2.62x differentiation), Wall-Clock Throughput (2.43x differentiation) - -```bash -# MLPerf v3.0: Maximum Storage Stress Test (8B Model) -# Run 3-5 trials for statistical significance -for trial in 1 2 3 4 5; do - python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 200 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 0 \ - --max-concurrent-allocs 16 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output mlperf_v3_stress_8b_trial${trial}.json -done -``` - -**⚠️ Important:** At cpu_mem=0GB, do NOT use Storage Throughput as your primary metric—use Decode Bytes Read or Wall-Clock Throughput instead. - -### Option 2: Storage Throughput Focus (cpu_mem=4GB) - -Use when you want Storage Throughput (tok/s) as your primary metric. - -**Primary Metric:** Storage Throughput (2.2x differentiation, 97% win rate) - -```bash -# MLPerf v3.0: Storage Throughput Test (8B Model) -for trial in 1 2 3 4 5; do - python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 100 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --max-concurrent-allocs 0 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output mlperf_v3_throughput_8b_trial${trial}.json -done -``` - -### Option 3: Large Model Submission (70B) - -For maximum per-request storage stress (10x larger KV cache per token): - -```bash -# MLPerf v3.0: Large Model Storage Stress -for trial in 1 2 3; do - python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --num-users 70 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 0 \ - --max-concurrent-allocs 4 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output mlperf_v3_stress_70b_trial${trial}.json -done -``` - -### Critical Parameters (Discovery-Validated) - -| Parameter | Value | Rationale | -|-----------|-------|-----------| -| **seed 42** | Required | Reproducibility across systems | -| **gpu-mem-gb 0** | Required | Isolates storage performance | -| **cpu-mem-gb** | 0 or 4 | 0GB for max stress (use I/O volume metrics), 4GB for Storage Throughput metric | -| **max-concurrent-allocs** | 0, 4, or 16 | 0 for throughput, 16 for stress testing | -| **generation-mode** | none or realistic | none for pure I/O, realistic for production simulation | -| **num-users** | 100-200 | Differentiation stable across range; higher = more throughput | -| **duration** | 300-600 | 5-10 minutes for stable metrics | - -### Trial Requirements - -| User Count | Variance (CV) | Minimum Trials | -|------------|---------------|----------------| -| 10 users | ~52% | 3 | -| 50-100 users | ~115-125% | 3-5 | -| 200 users | ~110-120% | 3-5 | - -Report **median** rather than mean for publication-quality results. - ---- - -## Troubleshooting - -### Out of Memory Errors - -Reduce the number of concurrent users or limit parallel allocations: - -```bash -python3 kv-cache.py ... --max-concurrent-allocs 50 -``` - -### Benchmark Hangs - -The system may be thrashing. Reduce users or increase memory budgets. - -### Poor Cache Hit Rates - -Low hit rates indicate your working set exceeds available fast memory. Either: -- Increase GPU/CPU memory budgets -- Reduce user count -- Accept that cold data will hit storage - -### Results Vary Between Runs - -Use the `--seed` flag for reproducible results. 
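-
-Even with a fixed seed, thread scheduling and device state introduce run-to-run variance, which is why the MLPerf guidelines above call for 3-5 trials and a median. A small script along the following lines can summarize a set of trial outputs. It is a sketch, not part of the benchmark: it assumes each result JSON contains an `avg_throughput_tokens_per_sec` value somewhere in its summary, so adjust the metric key and the glob pattern to match your runs.
-
-```python
-# Summarize repeated trials of one configuration (illustrative sketch).
-import glob
-import json
-import statistics
-
-METRIC = "avg_throughput_tokens_per_sec"  # assumed key; adjust to your output format
-
-def find_metric(obj, key):
-    """Recursively search nested dicts/lists for the first occurrence of key."""
-    if isinstance(obj, dict):
-        if key in obj:
-            return obj[key]
-        for value in obj.values():
-            found = find_metric(value, key)
-            if found is not None:
-                return found
-    elif isinstance(obj, list):
-        for item in obj:
-            found = find_metric(item, key)
-            if found is not None:
-                return found
-    return None
-
-values = []
-for path in sorted(glob.glob("mlperf_v3_stress_8b_trial*.json")):
-    with open(path) as f:
-        value = find_metric(json.load(f), METRIC)
-    if value is not None:
-        values.append(float(value))
-        print(f"{path}: {float(value):.1f} tok/s")
-
-if len(values) >= 2:
-    print(f"median={statistics.median(values):.1f} tok/s "
-          f"stdev={statistics.stdev(values):.1f} n={len(values)}")
-```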
- ---- - -## Files in This Package - -- `kv-cache.py`: Main benchmark implementation with ShareGPT support -- `test_kv_cache.py`: Pytest unit test suite -- `requirements.txt`: Python dependencies -- `README.md`: This documentation -- `MLperf v3 KV cache proposal.md`: Detailed technical documentation - ---- - -## License - -Apache License 2.0 - ---- - -## Contact - -For questions or feedback, open an issue on the repository or contact the MLPerf Storage Working Group. +Initial implementation - work in progress for MLPerf Storage v3.0 From 71a79cb6dd55474491476bb6f0aaac204191b88a Mon Sep 17 00:00:00 2001 From: Curtis Anderson <99758333+FileSystemGuy@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:41:56 -0800 Subject: [PATCH 4/4] Enhance KV cache benchmark with ShareGPT integration Updated benchmark commands and metrics for clarity and accuracy. Merged ShareGPT functionality into main script, added unit tests, and introduced Excel export capabilities. --- .../MLperf v3 KV cache proposal.md | 713 ++---------------- 1 file changed, 65 insertions(+), 648 deletions(-) diff --git a/kv_cache_benchmark/MLperf v3 KV cache proposal.md b/kv_cache_benchmark/MLperf v3 KV cache proposal.md index 345b94f3..7504792c 100644 --- a/kv_cache_benchmark/MLperf v3 KV cache proposal.md +++ b/kv_cache_benchmark/MLperf v3 KV cache proposal.md @@ -34,56 +34,33 @@ This is not a simple "pass/fail" test. It's a diagnostic tool. ## 2. Recommended Benchmark Invocations -Here are the specific commands to run for a thorough analysis of your system, **validated through extensive discovery testing** (1,411 Fast system tests, 268 Slow system tests). These examples assume you are testing the `llama3.1-8b` model and have a cache directory at `/mnt/nvme`. +Here are the specific commands to run for a thorough analysis of your system. These examples assume you are testing the `llama3.1-8b` model and have a cache directory at `/mnt/nvme`. -> **Discovery Test Finding:** llama3.1-8b and mistral-7b showed the best storage tier differentiation (2.31x ratio). The 70b model is recommended for maximum per-request storage stress. +### Step 1: Isolate and Test Storage Performance -### Step 1: Isolate and Test Storage Performance (Maximum Stress) - -This command uses **zero CPU RAM** to force all I/O to your NVMe drive. Discovery testing showed this configuration achieves **2.62x differentiation** in I/O volume metrics between fast and slow storage. +This command uses a minimal CPU RAM budget (0.5 GB) to force all I/O to your NVMe drive. It establishes the performance baseline for your storage. Using a fixed `--seed` ensures that the "random" workload is identical every time, making results comparable. ```bash -# Test 1: Storage-Only Maximum Stress Workload +# Test 1: Storage-Only Workload python3 kv-cache.py \ --model llama3.1-8b \ - --num-users 200 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 0 \ - --max-concurrent-allocs 16 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_storage_stress.json -``` -**What to look for:** Check **Decode Bytes Read** and **Wall-Clock Throughput** (100% win rate in discovery testing). **⚠️ Do NOT use Storage Throughput** as your primary metric at cpu_mem=0GB—discovery testing showed it only differentiates storage tiers by 1.1x due to I/O time normalization effects. - -### Step 2: Test Storage Throughput (Traditional Metric) - -To use **Storage Throughput (tok/s)** as your primary metric, set cpu_mem=4GB. 
Discovery testing showed **2.2x differentiation** at this setting. - -```bash -# Test 2: Storage Throughput Test -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 100 \ - --duration 300 \ + --num-users 50 \ + --duration 180 \ --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --max-concurrent-allocs 0 \ - --generation-mode none \ + --cpu-mem-gb 0.5 \ + --generation-mode realistic \ --cache-dir /mnt/nvme \ --seed 42 \ - --output results_storage_throughput.json + --output results_storage_only.json ``` -**What to look for:** The **Storage Throughput** metric in the summary. This configuration provides the traditional tok/s benchmark metric with reliable differentiation. +**What to look for:** Check the **NVMe Throughput** in the `STORAGE PERFORMANCE ASSESSMENT` section of the output. For this saturation test, high latency is expected and acceptable; the key metric is the sustained **tokens/sec** your drive can handle. This value represents your storage's performance ceiling. Compare it across different drives to find the best one for your workload. -### Step 3: Realistic Multi-Tier Configuration +### Step 2: Test a Realistic Multi-Tier Configuration This command simulates a production environment with a full three-tier hierarchy. It uses a larger, more realistic CPU memory budget and enables the GPU if available. ```bash -# Test 3: Full Three-Tier Realistic Workload +# Test 2: Full Three-Tier Realistic Workload # (Set --gpu-mem-gb to your available VRAM, or 0 if none) python3 kv-cache.py \ --model llama3.1-8b \ @@ -98,12 +75,12 @@ python3 kv-cache.py \ ``` **What to look for:** Compare the `end_to_end_latency_ms` from this test to the storage-only test. You should see a dramatic improvement. Also, check the `cache_hit_rate` and tier distribution (`gpu_entries`, `cpu_entries`, `nvme_entries`) to see how effectively your system is using the faster tiers. -### Step 4: Discover Your System's Maximum User Load (QoS Mode) +### Step 3: Discover Your System's Maximum User Load (QoS Mode) This command enables the default **Quality of Service (QoS)** autoscaler. It finds the optimal number of concurrent users your hardware can support *while maintaining acceptable latency*. It starts with a low user count and adds more users until the system's storage latency indicates it is becoming saturated. ```bash -# Test 4: Autoscaling Discovery (QoS Mode) +# Test 3: Autoscaling Discovery (QoS Mode) # (Set --gpu-mem-gb to your available VRAM, or 0 if none) python3 kv-cache.py \ --model llama3.1-8b \ @@ -120,12 +97,12 @@ python3 kv-cache.py \ ``` **What to look for:** The output JSON will contain an `autoscaling_stats` section. The last entry in this list will show the final, stable user count your system settled on. This is your evidence-based maximum user load for a latency-sensitive production environment. -### Step 5: Discover Your System's Peak Throughput (Capacity Mode) +### Step 4: Discover Your System's Peak Throughput (Capacity Mode) This command uses the new **Capacity** autoscaler. Its goal is different: it ignores latency and aggressively adds users to find the absolute maximum I/O throughput (in tokens/sec) your storage hardware can sustain. This is the best way to measure the raw power of your drive. 
```bash -# Test 5: Autoscaling Discovery (Capacity Mode) +# Test 4: Autoscaling Discovery (Capacity Mode) python3 kv-cache.py \ --model llama3.1-70b-instruct \ --num-users 10 \ @@ -141,18 +118,6 @@ python3 kv-cache.py \ ``` **What to look for:** In the `autoscaling_stats` section, look for the `reason` field. The test finishes when it detects that throughput has stopped increasing. The final log will state `Peak capacity found`. The `peak_throughput` value associated with that step is the maximum performance of your storage device. Note the use of `--generation-mode none` to ensure the storage is the only bottleneck. -### Trial Recommendations (Discovery-Validated) - -> **Discovery Finding:** Variance is high (CV 50-125% depending on configuration). Single runs cannot reliably differentiate storage tiers. - -| User Count | Variance (CV) | Minimum Trials | -|------------|---------------|----------------| -| 10 users | ~52% | 3 | -| 50-100 users | ~115-125% | 3-5 | -| 200 users | ~110-120% | 3-5 | - -For publication-quality results, run **5+ trials** and report the **median** rather than mean. - --- ## 3. Hardware Requirements @@ -661,7 +626,7 @@ python3 kv-cache.py \ --num-users 10 \ --duration 120 \ --gpu-mem-gb 24 \ - --cpu-mem-gb 4 \ + --cpu-mem-gb 0 \ --generation-mode deterministic \ --seed 42 \ --output validation_kv_cache_gpu_only.json @@ -742,213 +707,69 @@ To run this validation, you will need: For submitting official results to the MLPerf v3.0 benchmark, it is critical to use a standardized, repeatable methodology that isolates the component being tested. When evaluating a storage device's capability for KV cache offloading, the goal is to measure the performance of the storage subsystem under a consistent and saturating load, even on systems without a high-end GPU. -### Discovery Test Validation Summary - -*Analysis Date: 2026-01-09 | Datasets: 1,411 Fast system tests, 268 Slow system tests* - -Before finalizing these submission guidelines, extensive discovery testing was performed comparing a Fast bare-metal system (14,000 MB/s NVMe) against a Slow virtualized system (3,000 MB/s storage). Key findings that informed the recommendations below: - -| Finding | Details | Impact on Recommendations | -|---------|---------|---------------------------| -| **Storage tier differentiation** | 2.1x-2.6x ratio achieved across all metrics | Benchmark successfully differentiates storage tiers | -| **Metric selection depends on cpu_mem** | Storage Throughput shows only 1.1x at cpu_mem=0GB but 2.2x at cpu_mem=4GB | Different metrics recommended for different configurations | -| **Best differentiation models** | llama3.1-8b and mistral-7b show 2.31x ratio | Recommended for standard submissions | -| **High variance observed** | CV 50-125% depending on configuration | Multiple trials required (minimum 3-5) | -| **100% win rate metrics** | Decode Bytes Read and Wall-Clock Throughput at cpu_mem=0GB | Most reliable for storage stress testing | - ### Recommended Invocations for Storage Submission -Based on discovery testing, two complementary approaches are recommended depending on your benchmarking goal: - ---- - -#### Option 1: Maximum Storage Stress (cpu_mem=0GB) - -**Use when:** You want to stress test NVMe and measure I/O volume differentiation. 
- -**Primary Metrics:** Decode Bytes Read (2.62x differentiation, 100% win rate), Wall-Clock Throughput (2.43x differentiation, 100% win rate) - -**⚠️ Important:** Do NOT use Storage Throughput as your primary metric at cpu_mem=0GB—it shows only 1.1x differentiation due to I/O time normalization effects. See "Understanding Metric Behavior" below. +Two primary scenarios should be submitted to give a comprehensive view of storage performance: a standard test with a medium-sized model (Llama 3.1 8B) and a high-stress test with a large model (Llama 3.1 70B). -##### Standard Submission: `llama3.1-8b` (Maximum Storage Stress) +#### Standard Submission: `llama3.1-8b` -```bash -# MLPerf v3.0: Maximum Storage Stress Test (8B Model) -# Run 3-5 trials for statistical significance -for trial in 1 2 3 4 5; do - python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 200 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 0 \ - --max-concurrent-allocs 16 \ - --generation-mode none \ - --seed 42 \ - --output mlperf_v3_stress_8b_trial${trial}.json -done -``` - -##### Large Model Submission: `llama3.1-70b-instruct` (Maximum Per-Request Stress) - -The 70B model generates ~10x more storage I/O per token, ideal for high-bandwidth storage systems: +This workload provides a baseline for storage performance under typical conditions. **Note:** We set `cpu-mem-gb 0` to disable the caching tier entirely, forcing every token to hit the NVMe drive. This ensures the benchmark measures the storage hardware, not the OS file cache. ```bash -# MLPerf v3.0: Maximum Storage Stress Test (70B Model) -for trial in 1 2 3; do - python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --num-users 70 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 0 \ - --max-concurrent-allocs 4 \ - --generation-mode none \ - --seed 42 \ - --output mlperf_v3_stress_70b_trial${trial}.json -done -``` - ---- - -#### Option 2: Storage Throughput Focus (cpu_mem=4GB) - -**Use when:** You want Storage Throughput (tok/s) as your primary metric—the traditional benchmark metric. 
- -**Primary Metric:** Storage Throughput (2.2x differentiation, 97% win rate at cpu_mem=4GB) - -##### Standard Submission: `llama3.1-8b` (Storage Throughput) - -```bash -# MLPerf v3.0: Storage Throughput Test (8B Model) -# Run 3-5 trials for statistical significance -for trial in 1 2 3 4 5; do - python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 100 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --max-concurrent-allocs 0 \ - --generation-mode none \ - --seed 42 \ - --output mlperf_v3_throughput_8b_trial${trial}.json -done -``` - -##### Large Model Submission: `llama3.1-70b-instruct` (Storage Throughput) - -```bash -# MLPerf v3.0: Storage Throughput Test (70B Model) -for trial in 1 2 3; do - python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --num-users 50 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --max-concurrent-allocs 4 \ - --generation-mode none \ - --seed 42 \ - --output mlperf_v3_throughput_70b_trial${trial}.json -done +# MLPerf v3.0 Recommended Invocation: Storage Saturation Test (8B Model) +python3 kv-cache-waterfall-lru.py \ + --model llama3.1-8b \ + --num-users 150 \ + --duration 600 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 0 \ + --generation-mode realistic \ + --performance-profile throughput \ + --seed 42 \ + --output mlperf_v3_storage_submission_8b.json ``` ---- - -#### Option 3: Realistic Production Simulation +#### Large Model Submission: `llama3.1-70b-instruct` -**Use when:** You want to simulate realistic inference timing including GPU backpressure. +This workload tests the storage's ability to handle a much heavier load, as the KV cache for a 70B model is significantly larger. The user count is reduced to reflect the increased memory pressure per user. ```bash -# MLPerf v3.0: Realistic Production Workload -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 300 \ +# MLPerf v3.0 Recommended Invocation: Storage Saturation Test (70B Model) +python3 kv-cache-waterfall-lru.py \ + --model llama3.1-70b-instruct \ + --num-users 40 \ + --duration 600 \ --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --max-concurrent-allocs 4 \ + --cpu-mem-gb 0 \ --generation-mode realistic \ + --performance-profile throughput \ --seed 42 \ - --output mlperf_v3_realistic_8b.json + --output mlperf_v3_storage_submission_70b.json ``` ---- - -### Understanding Metric Behavior by cpu_mem Setting - -Discovery testing revealed a critical insight: **the choice of primary metric depends on your cpu_mem setting**. - -#### Why Storage Throughput is Misleading at cpu_mem=0GB - -At cpu_mem=0GB, both fast and slow systems are 100% I/O-bound—every token requires NVMe access. This creates a normalization effect: - -| System | Decode Bytes Read | Total I/O Time | Storage Throughput | -|--------|-------------------|----------------|-------------------| -| Fast | 1,195 GB | ~8,000 s | 9.53 tok/s | -| Slow | 447 GB | ~7,100 s | 8.50 tok/s | -| **Ratio** | **2.62x** | **1.13x** | **1.12x** | - -The Fast system reads **2.62x more bytes** but accumulates **more I/O time** (because more operations). These effects cancel out in Storage Throughput, hiding the true performance difference. - -#### Recommended Metrics by Configuration +**Why `cpu-mem-gb 0`?** +In previous versions, a small CPU budget (e.g., 2GB) was allowed. However, analysis showed that operating system file caching (Page Cache) could absorb write bursts within this budget, artificially lowering latency metrics. 
Setting both GPU and CPU memory to 0 forces the "Waterfall" logic to bypass all caching layers and write directly to the NVMe backend, providing the most rigorous and honest assessment of storage I/O performance.
-
-| cpu_mem | Primary Metric | Differentiation | Win Rate | Notes |
-|---------|----------------|-----------------|----------|-------|
-| **0 GB** | Decode Bytes Read | **2.62x** | **100%** | Measures total storage work done |
-| **0 GB** | Wall-Clock Throughput | **2.43x** | **100%** | Measures real-world tokens/sec |
-| **0 GB** | Storage Throughput | 1.12x | 62% | **NOT RECOMMENDED** (misleading) |
-| **4 GB** | Storage Throughput | **2.23x** | **97%** | Traditional metric works at this setting |
-| **4 GB** | Decode Bytes Read | 2.06x | 100% | Also valid secondary metric |
-
-### Key Parameters Explained
-
-* `--num-users`: Discovery testing showed differentiation remains stable (~2.1x-2.2x) across 10-200 users. Higher counts (150-200) maximize aggregate throughput. The 70B model uses fewer users due to larger per-user memory footprint.
-* `--duration 300`: A 5-minute duration provides stable metrics. For official submissions, 10 minutes (600s) recommended.
-* `--gpu-mem-gb 0`: **Critical for storage-focused testing.** Ensures no GPU memory allocation, isolating storage performance.
-* `--cpu-mem-gb`: Choose based on your metric goal:
-  - **0 GB**: Maximum storage stress, use Decode Bytes Read or Wall-Clock Throughput
-  - **4 GB**: Traditional benchmarking, use Storage Throughput
-* `--max-concurrent-allocs`: Controls allocation parallelism. Discovery showed optimal values are 0 (unlimited) for throughput metrics, 16 for stress testing.
-* `--generation-mode`:
-  - **none**: Pure I/O benchmark, no token generation delay. Best for storage characterization.
-  - **realistic**: Adds 30ms/token GPU simulation. Required for production workload simulation.
-* `--seed 42`: **Mandatory for valid submission.** Ensures identical pseudo-random workload across test runs and systems.
-
-### Trial Requirements Due to Variance
-
-Discovery testing revealed significant variance (CV 50-125% depending on configuration):
-
-| Concurrency | Typical CV | Minimum Trials | Recommended Trials |
-|-------------|------------|----------------|-------------------|
-| Low (10 users) | ~52% | 3 | 5 |
-| Medium (50-100 users) | ~115-125% | 3 | 5+ |
-| High (200 users) | ~110-120% | 3 | 5+ |
-
-**For publication-quality results:**
-- Run minimum **3 trials** per configuration
-- Run **5+ trials** for statistical robustness
-- Report **median** rather than mean to reduce outlier impact
-- Report **P95** and **P99** alongside mean for latency metrics
+
+**Key Parameters Explained:**
+* `--num-users 150`: A high, fixed user count is used to ensure the storage device is placed under significant and continuous load (the 70B submission reduces this to 40 users to account for its much larger per-user cache).
+* `--duration 600`: A 10-minute duration ensures the benchmark reaches a stable, steady-state performance level, which is a standard requirement for MLPerf results.
+* `--gpu-mem-gb 0`: **This is the critical parameter for a storage-focused test.** It ensures the benchmark does not allocate any GPU memory, making it suitable for systems without a GPU or for isolating storage performance.
+* `--cpu-mem-gb 0`: Setting the CPU budget to zero disables the intermediate caching tier entirely, forcing the system to offload all KV cache data directly to the NVMe storage (see the note above on why this replaces the earlier 2 GB budget).
+* `--generation-mode realistic`: This is essential for a valid submission.
It adds a 30ms emulated sleep for each token generated, accurately simulating the backpressure from a real GPU's computation time. Without this, the benchmark would incorrectly measure storage performance in an unrealistic, I/O-only scenario. +* `--performance-profile throughput`: This new parameter is crucial for official submissions. It instructs the benchmark to use **throughput (tokens/second) as the sole pass/fail metric**, ignoring latency. This is because the high user count and low memory budget are *designed* to cause high latency to saturate the storage. This profile ensures the benchmark correctly evaluates the storage device's ability to sustain a high data rate under stress, which is the true goal of this test. +* `--seed 42`: **This parameter is mandatory for a valid submission.** It ensures that the pseudo-random workload (user request timings, context lengths, etc.) is identical across all test runs and systems. This removes workload variance as a factor and guarantees a true "apples-to-apples" comparison of hardware performance. The final report will include the seed used. ### Interpreting Throughput: System vs. Storage (Read Amplification) -When you run the benchmark, the summary report presents multiple throughput and I/O metrics that can differ significantly. Understanding these differences—validated by discovery testing—is key to correctly interpreting the results. - -#### Key Metrics Explained - -1. **Wall-Clock Throughput (`total_tokens_per_sec`):** The end-to-end throughput from the user's perspective: tokens generated per second across all users. Discovery testing showed **2.1x-2.4x differentiation** between storage tiers. This metric is reliable at all cpu_mem settings. - -2. **Storage Throughput (`nvme_throughput`):** Tokens processed per unit of NVMe I/O time. **⚠️ Warning:** Discovery testing showed this metric is **unreliable at cpu_mem=0GB** (only 1.1x differentiation) but works well at **cpu_mem=4GB** (2.2x differentiation). +When you run the benchmark with the `throughput` profile, the summary report presents two different throughput numbers that can differ significantly. Understanding this difference is key to correctly interpreting the results. -3. **Decode Bytes Read (GB):** Total bytes read from NVMe during decode phase. Discovery testing showed this is the **most reliable differentiation metric** at cpu_mem=0GB (**2.62x ratio, 100% win rate**). +1. **System Throughput (`total_tokens_per_sec`):** This is the "Overall Performance" metric. It represents the end-to-end throughput of the entire system from the user's perspective: the number of new tokens generated per second across all users. It is a measure of the system's generative capacity. -4. **Prefill Bytes Written (GB):** Total bytes written to NVMe during prefill phase. Shows **2.15x differentiation** at cpu_mem=0GB. +2. **Storage Throughput (`nvme_throughput`):** This is the "Storage Performance Assessment" metric. It represents the raw I/O performance of the NVMe tier, measuring how many tokens' worth of KV cache data are read from or written to the storage device per second. #### Why Are They So Different? The Concept of Read Amplification -Storage metrics are often an order of magnitude higher than System Throughput due to **Read Amplification**—a fundamental characteristic of LLM inference. +The Storage Throughput is often an order of magnitude higher than the System Throughput. This is not a bug; it is a fundamental characteristic of LLM inference called **Read Amplification**. 
During the "decode" phase, to generate a single new token, the model must read the *entire KV cache for all preceding tokens in the conversation*. @@ -958,18 +779,6 @@ During the "decode" phase, to generate a single new token, the model must read t This creates a massive amplification effect where a small amount of user-facing work (generating one token) triggers a large amount of backend I/O (reading the entire history). This is precisely the behavior this benchmark is designed to measure, as it is the primary source of stress on the storage subsystem in a real-world KV cache offloading scenario. -#### Discovery Validation: I/O Volume as Primary Metric - -Discovery testing confirmed that **I/O volume metrics (Decode Bytes Read, Prefill Bytes Written)** are more reliable than time-normalized metrics for comparing storage systems: - -| Metric | cpu_mem=0GB Ratio | cpu_mem=4GB Ratio | Win Rate | -|--------|-------------------|-------------------|----------| -| Decode Bytes Read | **2.62x** | 2.06x | **100%** | -| Wall-Clock Throughput | **2.43x** | 1.79x | **100%** | -| Storage Throughput | 1.12x | **2.23x** | 62% / 97% | - -**Recommendation:** When comparing storage systems under maximum stress (cpu_mem=0GB), use **Decode Bytes Read** or **Wall-Clock Throughput** as your primary metric. Reserve Storage Throughput for cpu_mem≥4GB configurations. - #### Code Snippets **1. System Throughput Calculation:** @@ -1123,49 +932,26 @@ This table clearly illustrates the memory pressure. If you are running the `llam ## 9. Smoke Test: Quick Validation Suite -This section provides a collection of key benchmark invocations that can be used as a "smoke test" to quickly validate different aspects of your system's performance. **These tests have been validated through discovery testing** (1,411 Fast system tests, 268 Slow system tests). For all commands, it is assumed the cache directory is `/mnt/nvme`. - -### Test 1: Maximum Storage Stress (Discovery-Validated) +This section provides a collection of key benchmark invocations that can be used as a "smoke test" to quickly validate different aspects of your system's performance. Each test is designed to isolate a specific component or behavior. For all commands, it is assumed the cache directory is `/mnt/nvme`. -**Purpose:** Establishes the baseline performance of your storage device by forcing **all I/O to NVMe** (cpu_mem=0GB). This configuration showed the strongest storage tier differentiation in discovery testing. +### Test 1: Storage-Only Saturation -**Primary Metrics:** Decode Bytes Read (2.62x differentiation), Wall-Clock Throughput (2.43x differentiation) +**Purpose:** Establishes the baseline performance of your storage device by forcing all I/O to it. This is the best way to measure your drive's raw throughput. ```bash python3 kv-cache.py \ --model llama3.1-8b \ - --num-users 200 \ + --num-users 50 \ --duration 180 \ --gpu-mem-gb 0 \ --cpu-mem-gb 0 \ - --max-concurrent-allocs 16 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_storage_stress.json -``` - -**⚠️ Important:** At cpu_mem=0GB, do NOT use Storage Throughput as your primary metric—use Decode Bytes Read or Wall-Clock Throughput instead (see Section 7 for details). - -### Test 2: Storage Throughput Benchmark (Traditional Metric) - -**Purpose:** Use this configuration when you want **Storage Throughput (tok/s)** as your primary metric. Discovery testing showed this metric works reliably at cpu_mem=4GB (2.2x differentiation). 
- -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 100 \ - --duration 180 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --max-concurrent-allocs 0 \ - --generation-mode none \ + --generation-mode realistic \ --cache-dir /mnt/nvme \ --seed 42 \ - --output results_storage_throughput.json + --output results_storage_only.json ``` -### Test 3: Realistic Three-Tier Workload +### Test 2: Realistic Three-Tier Workload **Purpose:** Simulates a balanced, production-level environment using GPU, CPU, and NVMe tiers. Use this to measure end-to-end latency in a typical setup. @@ -1182,7 +968,7 @@ python3 kv-cache.py \ --output results_realistic_production.json ``` -### Test 4: Autoscaling for Max Users (QoS Mode) +### Test 3: Autoscaling for Max Users (QoS Mode) **Purpose:** **This is the key command for sizing your production environment.** It automatically discovers the maximum number of concurrent users your system can support while maintaining a low-latency user experience (Quality of Service). @@ -1201,7 +987,7 @@ python3 kv-cache.py \ --output results_autoscaling_qos.json ``` -### Test 5: Autoscaling for Peak Throughput (Capacity Mode) +### Test 4: Autoscaling for Peak Throughput (Capacity Mode) **Purpose:** Ignores latency to find the absolute maximum I/O throughput (tokens/sec) your storage hardware can sustain. This is the ultimate test of your drive's raw power. @@ -1211,7 +997,7 @@ python3 kv-cache.py \ --num-users 10 \ --duration 180 \ --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ + --cpu-mem-gb 0 \ --enable-autoscaling \ --autoscaler-mode capacity \ --generation-mode none \ @@ -1416,372 +1202,3 @@ with self.memory_lock: To align with MLPerf requirements, we added a specific counter for `nvme_tokens_processed`. * **Why:** Previously, we tracked raw bytes. However, MLPerf metrics are often in "Tokens per Second." * **How:** The system now tracks the exact number of tokens associated with every read, write, and demotion operation that touches the NVMe drive. This allows us to report a precise "Storage Throughput (tok/s)" metric that accounts for the massive read amplification inherent in LLM inference. ---- - -# CHANGES-01-09-2026: ShareGPT Integration, Unit Testing, and Excel Export - -**Date:** January 9, 2026 -**Subject:** Feature enhancements to support realistic workload replay, automated testing, and streamlined results analysis. - -This update consolidates the ShareGPT replay functionality into the main benchmark script, adds a comprehensive unit test suite, and introduces optional Excel export capabilities. These changes improve usability for both development validation and production benchmarking without introducing any regressions to the core simulation logic. - -## 1. ShareGPT Dataset Integration - -The original repository maintained two separate scripts: `kv-cache.py` for synthetic workloads and `kv-cache_sharegpt_replay.py` for real conversation replay. This created maintenance overhead and confused users about which script to use. We merged the ShareGPT functionality directly into `kv-cache.py`. - -**What Changed:** -* **New Class: `ShareGPTDatasetLoader`** (~150 lines) parses ShareGPT JSON files and uses tiktoken to calculate exact token counts for each conversation turn. -* **New Arguments:** The main script now accepts `--dataset-path`, `--max-conversations`, `--request-rate`, and `--max-requests` for controlling replay behavior. -* **Backward Compatibility:** When no dataset path is provided, the benchmark falls back to its original synthetic workload generation. 
Existing invocations work unchanged. - -**Why This Matters:** -Real human conversations exhibit dramatically different patterns than synthetic workloads. In our validation testing, ShareGPT conversations averaged 133 tokens per context versus 2,676 tokens for synthetic generation—a 20x difference. This affects cache hit rates (85-97% vs 50-70%), throughput measurements, and the validity of capacity planning exercises. - -**Sample Invocation:** -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-conversations 500 \ - --num-users 50 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt.json -``` - -## 2. Understanding the Three Throughput Metrics - -The benchmark reports three different "tokens per second" metrics. Each measures something fundamentally different. Understanding these distinctions is critical for interpreting results correctly. - -### Metric 1: Wall-Clock Throughput (`avg_throughput_tokens_per_sec`) - -**What it measures:** The rate at which **output tokens** are generated, as seen by end users. - -**Formula:** -``` -wall_clock_throughput = total_tokens_generated / elapsed_wall_time -``` - -**Code Location:** `_calculate_stats()` method -```python -summary = { - 'avg_throughput_tokens_per_sec': self.results['total_tokens_generated'] / duration, -} -``` - -**What `total_tokens_generated` contains:** The sum of `request.generate_tokens` for all completed requests. This is the number of **new output tokens** the LLM produced—NOT the KV cache data. - -```python -# In process_requests(), after each request completes: -with self.results_lock: - self.results['total_tokens_generated'] += request.generate_tokens # Output tokens only -``` - -**Use this metric to answer:** "How many tokens per second is my inference system delivering to users?" - ---- - -### Metric 2: Storage I/O Throughput (`storage_throughput_tokens_per_sec`) - -**What it measures:** An **efficiency ratio**—how many output tokens are produced per second of cumulative storage I/O time. - -**Formula:** -``` -storage_io_throughput = total_tokens_generated / total_storage_io_latency -``` - -**Code Location:** `_calculate_stats()` method -```python -'storage_throughput_tokens_per_sec': self.results['total_tokens_generated'] / self.results['total_storage_io_latency'] -``` - -**What `total_storage_io_latency` contains:** The **cumulative** time spent in storage operations across ALL threads. This can exceed wall-clock time because multiple threads perform I/O in parallel. - -```python -# In process_requests(), storage_latency accumulates ALL cache operations for this request: -storage_latency = 0.0 -_, read_lat = self.cache.access_cache(...) # Read from cache -storage_latency += read_lat -_, _, write_lat = self.cache.allocate_cache(...) # Write to cache -storage_latency += write_lat - -# Then recorded: -with self.results_lock: - self.results['total_storage_io_latency'] += storage_latency -``` - -**Important:** This metric uses the same numerator (output tokens) as wall-clock throughput. It does NOT measure storage bandwidth. - -**Use this metric to answer:** "How efficiently does each second of I/O work translate into user-facing output?" 
- -**Interpretation:** -- `storage_io_throughput < wall_clock_throughput` → Storage is a bottleneck (cumulative I/O time exceeds wall time) -- `storage_io_throughput > wall_clock_throughput` → Other factors (GPU simulation, queueing) dominate latency - ---- - -### Metric 3: Storage Assessment Throughput (`nvme_tokens_processed / duration`) - -**What it measures:** The actual **storage bandwidth**—how much KV cache data flows through the NVMe tier. - -**Formula:** -``` -nvme_throughput = nvme_tokens_processed / elapsed_wall_time -``` - -**Code Location:** `_evaluate_storage_performance()` method -```python -if self.performance_profile == 'throughput': - nvme_tokens = self.stats.get('nvme_tokens_processed', 0) - throughput = nvme_tokens / duration if duration > 0 else 0 -``` - -**What `nvme_tokens_processed` contains:** The number of tokens' worth of KV cache data that was **read from or written to NVMe**. This is incremented in three places: - -```python -# 1. When data is WRITTEN directly to NVMe (in allocate_cache): -if allocated_tier == 'nvme': - self.stats['nvme_tokens_processed'] += num_tokens - -# 2. When data is READ from NVMe (in access_cache): -if location == 'nvme': - num_tokens = entry_size / self.model_config.kv_cache_size_per_token - self.stats['nvme_tokens_processed'] += num_tokens - -# 3. When data is EVICTED/DEMOTED to NVMe (in _demote_entry): -if to_tier == 'nvme': - tokens = int(size / bytes_per_token) - self.stats['nvme_tokens_processed'] += tokens -``` - -**Use this metric to answer:** "How much KV cache data is my NVMe drive handling per second?" - -**Why this can be much higher than wall-clock throughput:** Due to **read amplification**. During decode, generating 1 output token requires reading the entire KV cache (potentially thousands of tokens) from storage. - ---- - -### Summary Comparison - -| Metric | Numerator | Denominator | What It Measures | -|--------|-----------|-------------|------------------| -| **Wall-clock** | Output tokens | Wall time | User-facing generation rate | -| **Storage I/O** | Output tokens | Cumulative I/O time | I/O efficiency ratio | -| **NVMe Assessment** | KV cache tokens (R+W) | Wall time | Storage bandwidth | - -### Real-World Example - -From a benchmark run with 70 users on Llama 3.1 70B: - -``` -Total Tokens Generated: 56,108 (output tokens) -Duration: 120 seconds -Total Storage I/O Latency: 603 seconds (cumulative across threads) -NVMe Tokens Processed: 597,991 (KV cache data tokens) -``` - -| Metric | Calculation | Result | -|--------|-------------|--------| -| **Wall-clock** | 56,108 / 120 | **467 tok/s** | -| **Storage I/O** | 56,108 / 603 | **93 tok/s** | -| **NVMe Assessment** | 597,991 / 120 | **4,983 tok/s** | - -**Interpretation:** -- The system delivers **467 output tokens/second** to users -- Storage is a bottleneck (93 < 467), meaning I/O time dominates -- The NVMe drive is handling **4,983 tokens/second** of KV cache I/O (10.6× read amplification) - ---- - -### The ShareGPT Bug Explained - -The Storage Assessment uses `nvme_tokens_processed`, which is **NVMe-specific**. In ShareGPT replay mode with small context sizes (~300 tokens average), all data fits in GPU+CPU memory. No data reaches NVMe, so: - -``` -nvme_tokens_processed = 0 -nvme_throughput = 0 / 120 = 0.00 tok/s → FAIL -``` - -Meanwhile, wall-clock and storage I/O throughput show healthy values because they use `total_tokens_generated` (output tokens), which is always incremented regardless of which cache tier is used. - - - -## 3. 
Unit Test Suite - -We added a comprehensive pytest-based test suite (`test_kv_cache.py`) that validates core functionality without running full benchmarks. This enables rapid development iteration and CI/CD integration. - -**Coverage:** -The test suite includes 12 test classes covering: -* `TestModelConfig`: Validates KV cache size calculations for all 5 model configurations -* `TestInferenceRequest`: Tests cache key generation and latency tracking -* `TestQoSProfiles`: Verifies priority levels and SLA targets -* `TestKVCacheGenerator`: Confirms deterministic generation and precomputed buffer optimization -* `TestCPUMemoryBackend`: Tests write/read/delete/clear operations -* `TestNVMeBackend`: Validates file I/O and metadata tracking -* `TestGPUMemoryBackend`: CUDA tensor operations (auto-skipped without GPU) -* `TestConversationManager`: Multi-turn tracking and LRU eviction -* `TestUserSimulator`: Mixed user generation -* `TestMultiTierCache`: CPU-only mode allocation and access -* `TestMultiTierCacheWithGPU`: Full three-tier hierarchy (auto-skipped without GPU) -* `TestXLSXExport`: CSV/Excel export validation - -**Running Tests:** -```bash -# Full test suite with verbose output -pytest test_kv_cache.py -v - -# Run specific test class -pytest test_kv_cache.py -k "TestModelConfig" -v - -# Skip GPU tests explicitly -pytest test_kv_cache.py -v -m "not skipif" -``` - -**Expected Runtime:** 3-5 seconds without GPU, 5-10 seconds with GPU. - -## 4. Excel Export Capability - -For users who analyze results in spreadsheets, we added optional Excel/CSV export via the `--xlsx-output` argument. - -**Dependencies:** -* `pandas` (required for export) -* `openpyxl` (optional; enables `.xlsx` format; without it, falls back to `.csv`) - -**Graceful Fallback:** -* If pandas is not installed, the export is skipped with a warning -* If openpyxl is not installed, the benchmark writes CSV instead of XLSX -* The benchmark never fails due to missing optional dependencies - -**Output Columns:** -The export includes all key parameters and metrics in a single row: -| Model | Num Users | Duration | GPU Mem | CPU Mem | Total Requests | Total Tokens | Avg Throughput | Storage Throughput | Cache Hit Rate | E2E P95 | Storage IO P95 | - -**Sample Invocation:** -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 120 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --seed 42 \ - --output results.json \ - --xlsx-output results.xlsx -``` - -## 5. Regression Analysis: No Core Logic Changes - -A detailed diff analysis between the original `kv-cache.py` and the enhanced version confirms that the core simulation logic remains identical: - -| Component | Status | -|-----------|--------| -| `MultiTierCache` class | Unchanged | -| `allocate_cache()` eviction logic | Unchanged | -| `access_cache()` read logic | Unchanged | -| `KVCacheGenerator` precomputed buffer | Unchanged | -| `GPUMemoryBackend`, `CPUMemoryBackend`, `NVMeBackend` | Unchanged | -| `UserSimulator` | Unchanged | -| QoS handling | Unchanged | -| Autoscaling | Unchanged | -| RAG workload | Unchanged | -| Prefix caching | Unchanged | - -**What Was Added (Not Modified):** -1. `ShareGPTDatasetLoader` class (new code, doesn't affect simulation) -2. `storage_throughput_tokens_per_sec` metric (additional output, no behavior change) -3. `--max-requests` argument (optional early termination, backward compatible) -4. `--request-rate` argument (optional rate limiting, backward compatible) -5. 
`--xlsx-output` argument (optional export, backward compatible) -6. `export_results_to_xlsx()` function (new code, called after benchmark completes) - -Existing benchmark invocations produce identical results. The seed-based reproducibility guarantee is maintained. - -## 6. Updated Requirements - -The `requirements.txt` file now documents all dependencies: - -``` -# Core (required) -numpy>=1.20.0 - -# GPU support (optional) -torch>=2.0.0 - -# ShareGPT replay (optional) -tiktoken>=0.5.0 - -# Excel export (optional) -pandas>=2.0.0 -openpyxl>=3.1.0 - -# Unit testing (optional) -pytest>=7.0.0 -``` - -## 7. Recommended Invocations: ShareGPT Workloads - -### ShareGPT Storage Validation (8B Model) -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-conversations 1000 \ - --num-users 100 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt_8b.json \ - --xlsx-output results_sharegpt_8b.xlsx -``` - -### ShareGPT High-Stress (70B Model) -```bash -python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-conversations 500 \ - --request-rate 5.0 \ - --num-users 50 \ - --duration 600 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 8 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt_70b.json -``` - -### ShareGPT Fixed Request Count -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-requests 10000 \ - --num-users 75 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt_fixed.json -``` - ---- - -## Summary of January 9, 2026 Changes - -| Feature | Description | Impact | -|---------|-------------|--------| -| ShareGPT Integration | Merged `kv-cache_sharegpt_replay.py` into main script | Real workload validation | -| Storage Throughput Metric | Added `storage_throughput_tokens_per_sec` | Fair tier comparisons | -| Unit Test Suite | 12 pytest test classes, ~80 tests | Development velocity | -| Excel Export | `--xlsx-output` with CSV fallback | Easier analysis | -| Elapsed Time Tracking | Added to summary output | Debugging support | - -No regressions were introduced. All existing invocations and seed-based reproducibility remain intact. \ No newline at end of file