From 875d69e9a9dc4201251aadb3ff4b81156e6633e7 Mon Sep 17 00:00:00 2001
From: Alessandro Sidero <sideroalessandro@gmail.com>
Date: Tue, 30 Dec 2025 17:40:35 +0100
Subject: [PATCH 1/9] Enhance CSV parsing in metrics parser and update
 histogram/occupancy plots for better precision

---
 profiling_tools/parse_metrics.py  | 219 ++++++++++++++++++++----------
 profiling_tools/plot_histogram.gp |   2 +-
 profiling_tools/plot_occupancy.gp |   6 +-
 3 files changed, 154 insertions(+), 73 deletions(-)

diff --git a/profiling_tools/parse_metrics.py b/profiling_tools/parse_metrics.py
index 8a77ef2..ea56a48 100644
--- a/profiling_tools/parse_metrics.py
+++ b/profiling_tools/parse_metrics.py
@@ -72,13 +72,41 @@ def get_unit_multiplier(unit_str):
     return multipliers.get(u, 1.0)
 
 def read_csv_with_units(path):
-    """Read CSV with optional unit row"""
+    """Read CSV with optional unit row, handling nvprof multi-line format"""
     rows = []
     if not os.path.exists(path):
         return rows
     
     with open(path, 'r', newline='', encoding='utf-8', errors='ignore') as f:
-        lines = f.readlines()
+        content = f.read()
+    
+    if not content.strip():
+        return rows
+    
+    # Clean nvprof output: merge lines that are continuations
+    # nvprof sometimes splits long kernel names across lines
+    lines = []
+    buffer = ""
+    for line in content.split('\n'):
+        stripped = line.strip()
+        # Skip nvprof header lines
+        if stripped.startswith('==') or not stripped:
+            continue
+        
+        # Count quotes to detect incomplete lines
+        buffer += line
+        quote_count = buffer.count('"')
+        
+        if quote_count % 2 == 0:
+            # Complete line
+            lines.append(buffer.strip())
+            buffer = ""
+        else:
+            # Incomplete line, continue buffering
+            buffer += " "
+    
+    if buffer.strip():
+        lines.append(buffer.strip())
     
     if not lines:
         return rows
@@ -86,15 +114,16 @@ def read_csv_with_units(path):
     # Find header line
     header_idx = -1
     for i, line in enumerate(lines[:20]):
-        if '"Name"' in line or 'Name' in line or 'Kernel' in line:
+        if '"Type"' in line or '"Name"' in line or 'Name' in line or 'Kernel' in line:
             header_idx = i
             break
     
     if header_idx == -1:
         return rows
     
-    # Parse header
-    keys = [k.strip().replace('"', '') for k in lines[header_idx].strip().split(',')]
+    # Parse header - handle quoted CSV properly
+    header_line = lines[header_idx]
+    keys = parse_csv_line(header_line)
     
     # Check for unit row
     unit_map = {}
@@ -102,9 +131,9 @@ def read_csv_with_units(path):
     
     if len(lines) > header_idx + 1:
         next_line = lines[header_idx + 1]
-        potential_units = [u.strip().replace('"', '') for u in next_line.strip().split(',')]
+        potential_units = parse_csv_line(next_line)
         
-        # Check if this looks like a unit row
+        # Check if this looks like a unit row (contains time units or %)
         if any(u in ['s', 'ms', 'us', 'ns', '%'] for u in potential_units):
             data_start_idx = header_idx + 2
             for i, u in enumerate(potential_units):
@@ -112,33 +141,63 @@ def read_csv_with_units(path):
                     unit_map[keys[i]] = get_unit_multiplier(u)
     
     # Read data rows
-    reader = csv.DictReader(lines[data_start_idx:], fieldnames=keys)
-    for row in reader:
-        clean_row = {}
-        for k, v in row.items():
-            if not v:
-                clean_row[k] = v
-                continue
-            
-            # Apply unit conversion
-            if k in unit_map and unit_map[k] != 1.0:
-                try:
-                    val_clean = re.sub(r'[^0-9\.]', '', v)
-                    clean_row[k] = float(val_clean) * unit_map[k]
-                except:
-                    clean_row[k] = v
-            else:
-                clean_row[k] = v
+    for line in lines[data_start_idx:]:
+        if not line.strip():
+            continue
+        
+        values = parse_csv_line(line)
+        row = {}
+        
+        for i, val in enumerate(values):
+            if i < len(keys):
+                key = keys[i]
+                # Apply unit conversion
+                if key in unit_map and unit_map[key] != 1.0:
+                    try:
+                        val_clean = re.sub(r'[^0-9\.\-eE]', '', val)
+                        if val_clean:
+                            row[key] = float(val_clean) * unit_map[key]
+                        else:
+                            row[key] = val
+                    except:
+                        row[key] = val
+                else:
+                    row[key] = val
         
-        rows.append(clean_row)
+        if row:
+            rows.append(row)
     
     return rows
 
+def parse_csv_line(line):
+    """Split one CSV line on commas outside double quotes; fields are stripped and unquoted."""
+    fields = []
+    current = ""
+    in_quotes = False
+    # A doubled quote ("") toggles the state twice, so escaped quotes are dropped, not kept.
+    for char in line:
+        if char == '"':
+            in_quotes = not in_quotes  # quote chars only toggle state; none is copied into a field
+        elif char == ',' and not in_quotes:
+            fields.append(current.strip().replace('"', ''))
+            current = ""
+        else:
+            current += char
+    
+    fields.append(current.strip().replace('"', ''))  # trailing field; an empty line yields ['']
+    return fields
+
 def extract_kernel_name(full_name, kernel_filter=None):
     """Extract kernel name from full mangled name"""
     if not full_name:
         return None
     
+    # Skip memory operations and API calls
+    skip_patterns = ['[CUDA memcpy', '[CUDA memset', 'cudaLaunch', 'cudaMalloc', 'cudaFree']
+    for skip in skip_patterns:
+        if skip in full_name:
+            return None
+    
     # If filter provided, check if any filter matches
     if kernel_filter:
         for kf in kernel_filter:
@@ -146,18 +205,21 @@ def extract_kernel_name(full_name, kernel_filter=None):
                 return kf
         return None  # No match, skip this kernel
     
-    # Otherwise, extract base kernel name
-    # Try to get meaningful name from mangled C++ names
-    patterns = [
-        r'void\s+(\w+)',  # void kernel_name<...>
-        r'(\w+)(?:<|::)',  # kernel_name<...> or kernel_name::...
-        r'^([a-zA-Z_]\w+)',  # Simple name at start
-    ]
+    # Extract the kernel name from the C++ name reported by the profiler.
+    # Pattern 1: "void kernel_name<...>(...)" - tried first, else Pattern 2 returns "void"
+    match = re.match(r'void\s+(\w+)', full_name)
+    if match:
+        return match.group(1)
     
-    for pattern in patterns:
-        match = re.search(pattern, full_name)
-        if match:
-            return match.group(1)
+    # Pattern 2: "kernel_name(args)" or "kernel_name<template>(args)"
+    match = re.match(r'^([a-zA-Z_][a-zA-Z0-9_]*)', full_name)
+    if match:
+        return match.group(1)
+    
+    # Pattern 3: fallback - first identifier followed by '<', '::' or '('
+    match = re.search(r'(\w+)(?:<|::|\()', full_name)
+    if match:
+        return match.group(1)
     
     return full_name[:50]  # Truncate long names
 
@@ -212,19 +274,32 @@ def ensure_kernel(k):
     
     # Parse summary (time and invocations)
     for row in csv_data.get('summary', []):
-        kname = extract_kernel_name(row.get('Name', ''), kernel_filter)
+        # Handle different column names for kernel name
+        name_val = row.get('Name', row.get('Kernel', ''))
+        kname = extract_kernel_name(name_val, kernel_filter)
         if not kname:
             continue
         
         ensure_kernel(kname)
         
-        # Time
-        for time_field in ['Time', 'GPU Time', 'Duration']:
+        # Time - check various field names and handle already-converted values
+        time_val = None
+        for time_field in ['Time', 'GPU Time', 'Duration', 'Avg']:
             if time_field in row and row[time_field]:
                 try:
-                    kernels[kname]['time_total_s'] = float(row[time_field])
-                    break
-                except:
+                    val = row[time_field]
+                    if isinstance(val, (int, float)):
+                        time_val = float(val)
+                    else:
+                        # Remove non-numeric chars and parse
+                        val_clean = re.sub(r'[^0-9\.\-eE]', '', str(val))
+                        if val_clean:
+                            time_val = float(val_clean)
+                    if time_val is not None and time_val > 0:
+                        # Accumulate time for same kernel (multiple invocations)
+                        kernels[kname]['time_total_s'] += time_val
+                        break
+                except Exception as e:
                     pass
         
         # Invocations
@@ -232,7 +307,8 @@ def ensure_kernel(k):
             if inv_field in row and row[inv_field]:
                 try:
                     inv_str = str(row[inv_field])
-                    kernels[kname]['invocations'] = int(re.sub(r'[^0-9]', '', inv_str) or 1)
+                    inv_val = int(re.sub(r'[^0-9]', '', inv_str) or 1)
+                    kernels[kname]['invocations'] += inv_val
                     break
                 except:
                     pass
@@ -369,9 +445,9 @@ def main():
     
     for summary_file in summary_files:
         base_path = summary_file.replace('_summary.csv', '')
-        version_name = clean_version_name(base_path)
+        exec_name = clean_version_name(base_path)
         
-        print(f"\n--- {version_name} ---")
+        print(f"\n--- {exec_name} ---")
         
         data = parse_dataset(base_path, kernel_filter)
         kernels = data['kernels']
@@ -380,30 +456,35 @@ def main():
             print(f"  No kernels found")
             continue
         
-        # Aggregate metrics
-        total_time = sum(k['time_total_s'] for k in kernels.values())
-        total_flops = sum(k['total_flops'] for k in kernels.values())
-        total_bytes = sum(k['total_bytes'] for k in kernels.values())
-        
-        all_occupancies = []
-        for k in kernels.values():
-            all_occupancies.extend(k['occupancies'])
-        
-        avg_occupancy = statistics.mean(all_occupancies) if all_occupancies else 0.0
-        
-        # Calculate roofline point
-        if total_flops > 0 and total_time > 0:
-            result = calculate_roofline_point(total_flops, total_bytes, total_time, version_name)
-            if result:
-                gflops, ai = result
-                roofline_data.append({
-                    'label': version_name,
-                    'ai': ai,
-                    'gflops': gflops
-                })
-        
-        time_data.append((version_name, total_time))
-        occupancy_data.append((version_name, avg_occupancy))
+        # Generate data for EACH kernel separately
+        for kernel_name, kdata in kernels.items():
+            # Create label: "ExecName_KernelName" or just "KernelName" if single exec
+            if len(summary_files) > 1:
+                label = f"{exec_name}_{kernel_name}"
+            else:
+                label = kernel_name
+            
+            kernel_time = kdata['time_total_s']
+            kernel_flops = kdata['total_flops']
+            kernel_bytes = kdata['total_bytes']
+            
+            # Calculate roofline point for this kernel
+            if kernel_flops > 0 and kernel_time > 0:
+                result = calculate_roofline_point(kernel_flops, kernel_bytes, kernel_time, exec_name, kernel_name)
+                if result:
+                    gflops, ai = result
+                    roofline_data.append({
+                        'label': label,
+                        'ai': ai,
+                        'gflops': gflops
+                    })
+            
+            # Time data per kernel
+            time_data.append((label, kernel_time))
+            
+            # Occupancy per kernel
+            avg_occupancy = statistics.mean(kdata['occupancies']) if kdata['occupancies'] else 0.0
+            occupancy_data.append((label, avg_occupancy))
     
     # Write output files
     with open(os.path.join(results_dir, 'roofline_data.dat'), 'w') as f:
diff --git a/profiling_tools/plot_histogram.gp b/profiling_tools/plot_histogram.gp
index 7357bea..4bd08fa 100644
--- a/profiling_tools/plot_histogram.gp
+++ b/profiling_tools/plot_histogram.gp
@@ -89,7 +89,7 @@ set output OUTDIR.'/histogram_times.png'
 # Plot with conditional coloring (highlight fastest in green)
 plot data_file using 0:($2 == fastest_time ? $2 : 1/0):xtic(1) with boxes ls 2 title 'Best', \
      ''        using 0:($2 != fastest_time ? $2 : 1/0) with boxes ls 1 title 'Others', \
-     ''        using 0:($2 + ymax*0.02):(sprintf("%.3f s", $2)) with labels center font ",9" tc rgb "#333333" notitle
+     ''        using 0:($2 + ymax*0.02):(sprintf("%f s", $2)) with labels center font ",9" tc rgb "#333333" notitle
 
 # SVG output
 set terminal svg size width_px,height_px enhanced font 'Arial,12' background rgb '#fafafa'
diff --git a/profiling_tools/plot_occupancy.gp b/profiling_tools/plot_occupancy.gp
index 5ee3502..57e0da7 100644
--- a/profiling_tools/plot_occupancy.gp
+++ b/profiling_tools/plot_occupancy.gp
@@ -24,7 +24,7 @@ stats data_file using 2 nooutput prefix "OCC"
 n_entries = OCC_records
 avg_occ = OCC_mean
 
-set title sprintf("GPU SM Occupancy Comparison\nAverage: %.1f%%", avg_occ * 100) font ",16" enhanced
+set title sprintf("GPU SM Occupancy Comparison\nAverage: %.2f%%", avg_occ * 100) font ",16" enhanced
 set xlabel "Implementation" font ",13" offset 0,-0.5
 set ylabel "Occupancy (%)" font ",13" offset -1,0
 
@@ -51,7 +51,7 @@ if (n_entries > 6) {
 }
 
 # Format y-axis as percentage
-set ytics format "%.0f%%" font ",10"
+set ytics format "%.2f%%" font ",10"
 set ytics 10
 
 # Color bars by occupancy level (improved colors)
@@ -91,7 +91,7 @@ plot data_file using 0:($2*100 < 30 ? $2*100 : 1/0):xtic(1) with boxes ls 1 titl
      ''        using 0:($2*100 >= 30 && $2*100 < 50 ? $2*100 : 1/0) with boxes ls 2 title 'Low', \
      ''        using 0:($2*100 >= 50 && $2*100 < 70 ? $2*100 : 1/0) with boxes ls 3 title 'Medium', \
      ''        using 0:($2*100 >= 70 ? $2*100 : 1/0) with boxes ls 4 title 'Good', \
-     ''        using 0:($2*100 + 2):(sprintf("%.0f%%", $2*100)) with labels center font ",9" tc rgb "#333333" notitle
+     ''        using 0:($2*100 + 2):(sprintf("%.2f%%", $2*100)) with labels center font ",9" tc rgb "#333333" notitle
 
 # SVG output (same width)
 set terminal svg size width_px,height_px enhanced font 'Arial,12' background rgb '#fafafa'

From 6f464444d0352500e196a483144eac3fb09f72cd Mon Sep 17 00:00:00 2001
From: Alessandro Sidero <sideroalessandro@gmail.com>
Date: Tue, 30 Dec 2025 18:04:04 +0100
Subject: [PATCH 2/9] Add stencil/Makefile

---
 stencil/Makefile | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 stencil/Makefile

diff --git a/stencil/Makefile b/stencil/Makefile
new file mode 100644
index 0000000..4c0fcc8
--- /dev/null
+++ b/stencil/Makefile
@@ -0,0 +1,18 @@
+NVCC ?= nvcc
+NVCCFLAGS ?= -std=c++11 -O2
+TARGET = stencil
+# Default goal: build the stencil binary.
+all: $(TARGET)
+# stencil.cu includes ../common/cli_utils.h, so a header change must trigger a rebuild.
+$(TARGET): stencil.cu ../common/cli_utils.h
+	$(NVCC) $(NVCCFLAGS) -o $@ $<
+
+clean:
+	rm -f $(TARGET) *.o
+
+clean-logs:
+	rm -f *.log
+
+clean-all: clean clean-logs
+
+.PHONY: all clean clean-logs clean-all

From 0711137583f1116b6dd4216ce923ca0aa1cedd94 Mon Sep 17 00:00:00 2001
From: Alessandro Sidero <sideroalessandro@gmail.com>
Date: Tue, 30 Dec 2025 18:04:38 +0100
Subject: [PATCH 3/9] Add stencil/profile_nvprof.sh

---
 stencil/profile_nvprof.sh | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100755 stencil/profile_nvprof.sh

diff --git a/stencil/profile_nvprof.sh b/stencil/profile_nvprof.sh
new file mode 100755
index 0000000..250db57
--- /dev/null
+++ b/stencil/profile_nvprof.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+cd "$(dirname "$0")"
+# From here on, relative paths (./stencil, make, the log file) resolve in the script's directory.
+BIN=./stencil
+# Only a leading -h/--help is handled here; all arguments are otherwise forwarded to the binary.
+if [ "$#" -gt 0 ] && { [ "$1" = "-h" ] || [ "$1" = "--help" ]; }; then
+    cat <<EOF
+Usage: $0 [OPTIONS]
+
+Profile 3D seven-point stencil with nvprof.
+
+Options (passed to stencil):
+  --mode MODE    Kernel: naive|shared|coarsened|register|all (default: all)
+  --nx NX        Grid size in X (default: 256)
+  --ny NY        Grid size in Y (default: 256)
+  --nz NZ        Grid size in Z (default: 256)
+
+Examples:
+  $0 --mode naive --nx 256 --ny 256 --nz 256
+  $0 --mode shared --nx 512 --ny 512 --nz 256
+  $0 --mode register
+  $0 --mode all
+
+Output:
+  Creates nvprof_stencil_TIMESTAMP.log with GPU trace
+EOF
+    exit 0
+fi
+# Build on demand when the binary is missing or not executable.
+if [ ! -x "$BIN" ]; then
+    echo "Building stencil..."
+    make
+fi
+# nvprof writes its GPU trace to a timestamped log file instead of the terminal.
+TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
+OUTFILE="nvprof_stencil_${TIMESTAMP}.log"
+nvprof --print-gpu-trace --log-file "$OUTFILE" "$BIN" "$@"
+echo "Profile saved to: $OUTFILE"

From 7d04a39297d25c0aca09221e55327c7456ec6ac4 Mon Sep 17 00:00:00 2001
From: Alessandro Sidero <sideroalessandro@gmail.com>
Date: Tue, 30 Dec 2025 18:04:46 +0100
Subject: [PATCH 4/9] Add stencil/run.sh

---
 stencil/run.sh | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100755 stencil/run.sh

diff --git a/stencil/run.sh b/stencil/run.sh
new file mode 100755
index 0000000..2c80506
--- /dev/null
+++ b/stencil/run.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+set -euo pipefail
+cd "$(dirname "$0")"
+
+# Runner for the 3D seven-point stencil example: builds ./stencil if needed, then runs it.
+# Relative paths resolve in the script's directory because of the cd above.
+BIN=./stencil
+# Only a leading -h/--help is handled here; all arguments are otherwise forwarded to the binary.
+if [ "$#" -gt 0 ] && { [ "$1" = "-h" ] || [ "$1" = "--help" ]; }; then
+    cat <<EOF
+Usage: $0 [OPTIONS]
+
+Run 3D seven-point stencil example.
+
+Options (passed to stencil):
+  --mode MODE    Kernel: naive|shared|coarsened|register|all (default: all)
+  --nx NX        Grid size in X (default: 256)
+  --ny NY        Grid size in Y (default: 256)
+  --nz NZ        Grid size in Z (default: 256)
+
+Examples:
+  $0 --mode naive --nx 128 --ny 128 --nz 128
+  $0 --mode shared --nx 512 --ny 512 --nz 256
+  $0 --mode register
+  $0 --mode all --nx 256 --ny 256 --nz 256
+EOF
+    exit 0
+fi
+# Build on demand when the binary is missing or not executable.
+if [ ! -x "$BIN" ]; then
+    echo "Building stencil..."
+    make
+fi
+# Forward every command-line argument unchanged.
+"$BIN" "$@"

From a598bd63782d86082c2ff6a1b4fdb3780f952c21 Mon Sep 17 00:00:00 2001
From: Alessandro Sidero <sideroalessandro@gmail.com>
Date: Tue, 30 Dec 2025 18:04:55 +0100
Subject: [PATCH 5/9] Add stencil/README

---
 stencil/README.md | 167 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 stencil/README.md

diff --git a/stencil/README.md b/stencil/README.md
new file mode 100644
index 0000000..e332dc5
--- /dev/null
+++ b/stencil/README.md
@@ -0,0 +1,167 @@
+# 3D Seven-Point Stencil
+
+## Overview
+
+This example demonstrates multiple CUDA implementations of the 3D seven-point stencil, a fundamental computational pattern in scientific computing used for solving PDEs, heat diffusion, and iterative solvers.
+
+The seven-point stencil computes:
+
+```
+out[i,j,k] = c0*in[i,j,k]   + c1*in[i-1,j,k] + c2*in[i+1,j,k] +
+             c3*in[i,j-1,k] + c4*in[i,j+1,k] +
+             c5*in[i,j,k-1] + c6*in[i,j,k+1]
+```
+
+Where coefficients are defined as (discrete Laplacian):
+
+- `c0 = -6.0` (center)
+- `c1 = c2 = c3 = c4 = c5 = c6 = 1.0` (neighbors)
+
+## Kernel Implementations
+
+### 1. Naive (`stencil_naive`)
+- **Strategy**: Direct global memory access
+- **Characteristics**:
+  - Each thread computes one output point
+  - 3D thread block organization (8×8×8)
+  - Simple but memory bandwidth limited
+  - Redundant loads from global memory
+
+### 2. Shared Memory Tiling (`stencil_shared`)
+- **Strategy**: 2D xy-plane tiling with shared memory
+- **Characteristics**:
+  - Loads xy-plane tiles into shared memory (with halo)
+  - Reduces redundant global memory accesses for xy-neighbors
+  - z-neighbors still loaded from global memory
+  - Each thread block handles one xy-tile of a single z-layer
+
+### 3. Thread Coarsening (`stencil_coarsened`)
+- **Strategy**: Each thread processes multiple z-layers
+- **Characteristics**:
+  - Combines shared memory tiling with z-axis coarsening
+  - Reduces thread launch overhead
+  - Better data reuse along z-dimension
+  - Configurable coarsening factor (default: 8)
+
+### 4. Register Tiling (`stencil_register`)
+- **Strategy**: Register caching along z-axis
+- **Characteristics**:
+  - Maintains sliding window of z-values in registers
+  - Maximizes temporal reuse along z-dimension
+  - Combines with xy-plane shared memory tiling
+  - Most efficient memory access pattern
+
+## Build
+
+```bash
+make
+```
+
+## Run
+
+```bash
+# Show help
+./run.sh --help
+
+# Run all kernels (default 256³ grid)
+./run.sh --mode all
+
+# Run specific kernel
+./run.sh --mode naive --nx 128 --ny 128 --nz 128
+./run.sh --mode shared --nx 512 --ny 512 --nz 256
+./run.sh --mode coarsened
+./run.sh --mode register
+```
+
+## Profiling
+
+```bash
+# Profile with nvprof
+./profile_nvprof.sh --mode all --nx 256 --ny 256 --nz 256
+
+# Use with profiling tools
+../profiling_tools/profile_cuda.sh -d . --metrics time,occupancy
+```
+
+## Output Example
+
+```
+=== 3D Seven-Point Stencil ===
+Grid: 256 x 256 x 256 = 16777216 elements
+Interior points: 16003008
+Data size: 64.00 MB
+Mode: all
+
+Computing CPU reference...
+
+Kernel: stencil_naive
+  Time: 12.345 ms
+  Throughput: 1.30 GPoints/s
+  Est. GFLOP/s: 16.87
+  Est. Bandwidth: 58.23 GB/s
+  Verification: PASSED
+
+Kernel: stencil_shared (tile 32x8)
+  Time: 8.234 ms
+  Throughput: 1.94 GPoints/s
+  Est. GFLOP/s: 25.26
+  Est. Bandwidth: 87.12 GB/s
+  Verification: PASSED
+
+Kernel: stencil_coarsened (tile 32x8, coarse 8)
+  Time: 6.123 ms
+  Throughput: 2.61 GPoints/s
+  Est. GFLOP/s: 33.96
+  Est. Bandwidth: 117.11 GB/s
+  Verification: PASSED
+
+Kernel: stencil_register (tile 32x8, register z-sweep)
+  Time: 4.567 ms
+  Throughput: 3.50 GPoints/s
+  Est. GFLOP/s: 45.55
+  Est. Bandwidth: 157.12 GB/s
+  Verification: PASSED
+```
+
+## Optimization Techniques
+
+### 1. Shared Memory Tiling
+- Reduces redundant global memory accesses
+- Threads in a block cooperatively load data once
+- Halo cells hold the neighbouring values needed by points on the tile edge
+
+### 2. Thread Coarsening
+- Amortizes thread launch overhead
+- Better instruction-level parallelism
+- Improves data reuse within a thread
+
+### 3. Register Tiling
+- Exploits temporal locality along sweep direction
+- Registers provide fastest memory access
+- Sliding window technique minimizes register pressure
+
+### 4. Memory Coalescing
+- Row-major layout ensures coalesced accesses in x-direction
+- Tile dimensions chosen for optimal memory access patterns
+
+## Parameters
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `--mode` | `all` | Kernel selection |
+| `--nx` | 256 | Grid size in X |
+| `--ny` | 256 | Grid size in Y |
+| `--nz` | 256 | Grid size in Z |
+
+## Tile Configuration
+
+- **Shared/Coarsened/Register**: 32×8 xy-tile (256 threads)
+- **Coarsening factor**: 8 z-layers per thread
+- **Shared memory**: (32+2)×(8+2)×4 = 1360 bytes per block
+
+## Performance Considerations
+
+1. **Grid Size**: Larger grids improve GPU utilization
+2. **Memory Bandwidth**: Stencil kernels are typically memory-bound
+3. **Occupancy**: Tile size affects SM occupancy
+4. **Register Pressure**: Register tiling may limit occupancy

From 2c7322497521d23d21be9f430ae785ae9997dba2 Mon Sep 17 00:00:00 2001
From: Alessandro Sidero <sideroalessandro@gmail.com>
Date: Tue, 30 Dec 2025 18:05:02 +0100
Subject: [PATCH 6/9] Add stencil kernels

---
 stencil/stencil.cu | 702 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 702 insertions(+)
 create mode 100644 stencil/stencil.cu

diff --git a/stencil/stencil.cu b/stencil/stencil.cu
new file mode 100644
index 0000000..92538b9
--- /dev/null
+++ b/stencil/stencil.cu
@@ -0,0 +1,702 @@
+/*
+ * 3D Seven-Point Stencil in CUDA
+ *
+ * Multiple kernel implementations demonstrating different optimization strategies:
+ *   1. Naive (basic global memory access)
+ *   2. Shared Memory Tiling (2D xy-plane tile cached in shared memory)
+ *   3. Thread Coarsening (each thread processes multiple z-layers)
+ *   4. Register Tiling (register caching along z-axis)
+ *
+ * The seven-point stencil computes:
+ *   out[i,j,k] = c0*in[i,j,k]   + c1*in[i-1,j,k] + c2*in[i+1,j,k] +
+ *                c3*in[i,j-1,k] + c4*in[i,j+1,k] +
+ *                c5*in[i,j,k-1] + c6*in[i,j,k+1]
+ *
+ * Usage:
+ *   stencil [--mode MODE] [--nx NX] [--ny NY] [--nz NZ] [--threads THREADS]
+ *           [--tile-x TX] [--tile-y TY] [--coarse COARSE]
+ *
+ * - mode:    naive|shared|coarsened|register|all (default: all)
+ * - nx/ny/nz: grid dimensions (default: 256x256x256)
+ * - threads:  threads per block for naive kernel (default: 256)
+ * - tile-x/tile-y: tile dimensions for tiled kernels (default: 32x8)
+ * - coarse:  coarsening factor along z-axis (default: 8)
+ *
+ * Host-side verification is included.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <cuda_runtime.h>
+#include "../common/cli_utils.h"
+
+#ifndef CHECK_CUDA
+#define CHECK_CUDA(call)                                                                               \
+    do                                                                                                 \
+    {                                                                                                  \
+        cudaError_t err = (call);                                                                      \
+        if (err != cudaSuccess)                                                                        \
+        {                                                                                              \
+            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
+            exit(EXIT_FAILURE);                                                                        \
+        }                                                                                              \
+    } while (0)
+#endif
+
+#define DEBUG 0
+#define NX_DEFAULT 256
+#define NY_DEFAULT 256
+#define NZ_DEFAULT 256
+#define THREADS_DEFAULT 256
+#define TILE_X_DEFAULT 32
+#define TILE_Y_DEFAULT 8
+#define COARSE_FACTOR_DEFAULT 8
+
+// Stencil coefficients (7-point stencil)
+#define C0 -6.0f // center weight
+#define C1 1.0f  // x-negative neighbor
+#define C2 1.0f  // x-positive neighbor
+#define C3 1.0f  // y-negative neighbor
+#define C4 1.0f  // y-positive neighbor
+#define C5 1.0f  // z-negative neighbor
+#define C6 1.0f  // z-positive neighbor
+
+/* ============================================================================
+ * Kernel 1: Naive 3D Seven-Point Stencil
+ * Direct global memory access. Each thread computes one output point.
+ * Simple but memory bandwidth limited due to redundant loads.
+ * ============================================================================ */
+__global__ void stencil_naive(const float *in, float *out, int nx, int ny, int nz)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x; // global x index
+    int j = blockIdx.y * blockDim.y + threadIdx.y; // global y index
+    int k = blockIdx.z * blockDim.z + threadIdx.z; // global z index
+
+    // Only interior points are computed; boundary entries of out are not written here
+    if (i >= 1 && i < nx - 1 &&
+        j >= 1 && j < ny - 1 &&
+        k >= 1 && k < nz - 1)
+    {
+        // Linear index for 3D array stored in row-major (x fastest, then y, then z)
+        // index = i + j*nx + k*nx*ny
+        int idx = i + j * nx + k * nx * ny;
+        int stride_x = 1;       // step to the next point in x
+        int stride_y = nx;      // step to the next row (y)
+        int stride_z = nx * ny; // step to the next xy-plane (z)
+
+        float center = in[idx];
+        float x_neg = in[idx - stride_x];
+        float x_pos = in[idx + stride_x];
+        float y_neg = in[idx - stride_y];
+        float y_pos = in[idx + stride_y];
+        float z_neg = in[idx - stride_z];
+        float z_pos = in[idx + stride_z];
+
+        out[idx] = C0 * center + C1 * x_neg + C2 * x_pos +
+                   C3 * y_neg + C4 * y_pos + C5 * z_neg + C6 * z_pos;
+    }
+}
+
+/* ============================================================================
+ * Kernel 2: Shared Memory Tiling (xy-plane tiling)
+ * 2D tiles in xy-plane are loaded into shared memory.
+ * For each z-layer, load current plane + halo into shared memory.
+ * Neighbors in z are loaded from global memory.
+ * ============================================================================ */
+template <int TILE_X, int TILE_Y>
+__global__ void stencil_shared(const float *in, float *out, int nx, int ny, int nz)
+{
+    // One xy-tile plus a 1-cell halo on each side in x and y
+    __shared__ float tile[TILE_Y + 2][TILE_X + 2];
+
+    // Global coordinates
+    int i = blockIdx.x * TILE_X + threadIdx.x;
+    int j = blockIdx.y * TILE_Y + threadIdx.y;
+    int k = blockIdx.z + 1; // one z-layer per blockIdx.z; +1 skips the k=0 boundary
+
+    // Local coordinates in shared memory (with halo offset)
+    int li = threadIdx.x + 1;
+    int lj = threadIdx.y + 1;
+
+    int stride_z = nx * ny;
+
+    // Process only valid z-layers (interior); k is the same for every thread of the block
+    if (k >= 1 && k < nz - 1)
+    {
+        // Load center tile
+        if (i < nx && j < ny)
+        {
+            int idx = i + j * nx + k * stride_z;
+            tile[lj][li] = in[idx];
+        }
+
+        // Load x-halo (left and right); NOTE(review): j is not range-checked on these loads
+        if (threadIdx.x == 0 && i > 0)
+        {
+            tile[lj][0] = in[(i - 1) + j * nx + k * stride_z];
+        }
+        if (threadIdx.x == TILE_X - 1 && i < nx - 1)
+        {
+            tile[lj][TILE_X + 1] = in[(i + 1) + j * nx + k * stride_z];
+        }
+        // Only reachable when blockDim.x < TILE_X: the last thread in x loads its right neighbour
+        if (threadIdx.x == blockDim.x - 1 && threadIdx.x < TILE_X - 1)
+        {
+            if (i + 1 < nx)
+                tile[lj][li + 1] = in[(i + 1) + j * nx + k * stride_z];
+        }
+
+        // Load y-halo (top and bottom); NOTE(review): i is not range-checked on these loads
+        if (threadIdx.y == 0 && j > 0)
+        {
+            tile[0][li] = in[i + (j - 1) * nx + k * stride_z];
+        }
+        if (threadIdx.y == TILE_Y - 1 && j < ny - 1)
+        {
+            tile[TILE_Y + 1][li] = in[i + (j + 1) * nx + k * stride_z];
+        }
+        if (threadIdx.y == blockDim.y - 1 && threadIdx.y < TILE_Y - 1)
+        {
+            if (j + 1 < ny)
+                tile[lj + 1][li] = in[i + (j + 1) * nx + k * stride_z];
+        }
+
+        __syncthreads(); // tile and halo must be fully written before neighbours are read
+
+        // Compute stencil for interior points only
+        if (i >= 1 && i < nx - 1 && j >= 1 && j < ny - 1)
+        {
+            int idx = i + j * nx + k * stride_z;
+
+            // xy-plane neighbors from shared memory
+            float center = tile[lj][li];
+            float x_neg = tile[lj][li - 1];
+            float x_pos = tile[lj][li + 1];
+            float y_neg = tile[lj - 1][li];
+            float y_pos = tile[lj + 1][li];
+
+            // z-neighbors from global memory
+            float z_neg = in[idx - stride_z];
+            float z_pos = in[idx + stride_z];
+
+            out[idx] = C0 * center + C1 * x_neg + C2 * x_pos +
+                       C3 * y_neg + C4 * y_pos + C5 * z_neg + C6 * z_pos;
+        }
+    }
+}
+
+/* ============================================================================
+ * Kernel 3: Thread Coarsening (z-axis coarsening)
+ * Each thread processes multiple consecutive z-layers.
+ * Reduces thread launch overhead and improves data reuse along z.
+ * ============================================================================ */
+template <int TILE_X, int TILE_Y, int COARSE>
+__global__ void stencil_coarsened(const float *in, float *out, int nx, int ny, int nz)
+{
+    __shared__ float tile[TILE_Y + 2][TILE_X + 2]; // xy-tile plus 1-cell halo, refilled for each z-layer
+
+    int i = blockIdx.x * TILE_X + threadIdx.x;
+    int j = blockIdx.y * TILE_Y + threadIdx.y;
+    int k_base = blockIdx.z * COARSE + 1; // first z-layer of this block; +1 skips the k=0 boundary
+
+    int li = threadIdx.x + 1;
+    int lj = threadIdx.y + 1;
+
+    int stride_z = nx * ny;
+
+    // Each thread processes COARSE z-layers
+    for (int c = 0; c < COARSE; c++)
+    {
+        int k = k_base + c;
+
+        if (k >= nz - 1)
+            break; // k depends only on blockIdx.z and c, so the whole block exits together
+
+        // Load tile for current z-layer
+        if (i < nx && j < ny)
+        {
+            int idx = i + j * nx + k * stride_z;
+            tile[lj][li] = in[idx];
+        }
+
+        // Load x-halo
+        if (threadIdx.x == 0 && i > 0)
+        {
+            tile[lj][0] = in[(i - 1) + j * nx + k * stride_z];
+        }
+        if (threadIdx.x == TILE_X - 1 && i < nx - 1)
+        {
+            tile[lj][TILE_X + 1] = in[(i + 1) + j * nx + k * stride_z];
+        }
+
+        // Load y-halo
+        if (threadIdx.y == 0 && j > 0)
+        {
+            tile[0][li] = in[i + (j - 1) * nx + k * stride_z];
+        }
+        if (threadIdx.y == TILE_Y - 1 && j < ny - 1)
+        {
+            tile[TILE_Y + 1][li] = in[i + (j + 1) * nx + k * stride_z];
+        }
+
+        __syncthreads(); // tile and halo must be fully written before neighbours are read
+
+        // Compute stencil
+        if (i >= 1 && i < nx - 1 && j >= 1 && j < ny - 1)
+        {
+            int idx = i + j * nx + k * stride_z;
+
+            float center = tile[lj][li];
+            float x_neg = tile[lj][li - 1];
+            float x_pos = tile[lj][li + 1];
+            float y_neg = tile[lj - 1][li];
+            float y_pos = tile[lj + 1][li];
+            float z_neg = in[idx - stride_z]; // z-neighbours are read from in, not from the tile
+            float z_pos = in[idx + stride_z];
+
+            out[idx] = C0 * center + C1 * x_neg + C2 * x_pos +
+                       C3 * y_neg + C4 * y_pos + C5 * z_neg + C6 * z_pos;
+        }
+
+        __syncthreads(); // all reads of the tile must finish before the next layer overwrites it
+    }
+}
+
+/* ============================================================================
+ * Kernel 4: Register Tiling (z-axis register caching)
+ * Each thread sweeps k = 1 .. nz-2 for its (i, j) column and keeps the
+ * previous / current / next z-values in local scalars (intended to live in
+ * registers), so every input element of the column is read from global
+ * memory once for the z-direction. xy-neighbors go through a shared tile.
+ *
+ * Out-of-range threads are predicated instead of returning early: all
+ * threads of the block must reach the __syncthreads() calls inside the loop.
+ * ============================================================================ */
+template <int TILE_X, int TILE_Y>
+__global__ void stencil_register(const float *in, float *out, int nx, int ny, int nz)
+{
+    __shared__ float tile[TILE_Y + 2][TILE_X + 2];
+
+    int i = blockIdx.x * TILE_X + threadIdx.x;
+    int j = blockIdx.y * TILE_Y + threadIdx.y;
+
+    int li = threadIdx.x + 1;
+    int lj = threadIdx.y + 1;
+
+    int stride_z = nx * ny;
+
+    // Block-uniform exit: with fewer than 3 layers there is no interior
+    // layer, and the initial loads below would touch a layer that may not exist.
+    if (nz < 3)
+        return;
+
+    const bool in_bounds = (i < nx && j < ny);
+    const bool interior = (i >= 1 && i < nx - 1 && j >= 1 && j < ny - 1);
+
+    // Register variables for z-values (sliding window)
+    float z_prev = 0.0f, z_curr = 0.0f, z_next = 0.0f;
+
+    // Initialize: load first two z-layers into registers
+    if (in_bounds)
+    {
+        z_prev = in[i + j * nx + 0 * stride_z];
+        z_curr = in[i + j * nx + 1 * stride_z];
+    }
+
+    // Sweep through z-layers from k=1 to k=nz-2
+    for (int k = 1; k < nz - 1; k++)
+    {
+        if (in_bounds)
+        {
+            // Prefetch next z-layer
+            z_next = in[i + j * nx + (k + 1) * stride_z];
+
+            // Load xy-tile for current z into shared memory
+            tile[lj][li] = z_curr;
+
+            // Load x-halo
+            if (threadIdx.x == 0 && i > 0)
+            {
+                tile[lj][0] = in[(i - 1) + j * nx + k * stride_z];
+            }
+            if (threadIdx.x == TILE_X - 1 || threadIdx.x == blockDim.x - 1)
+            {
+                if (i + 1 < nx)
+                    tile[lj][li + 1] = in[(i + 1) + j * nx + k * stride_z];
+            }
+
+            // Load y-halo
+            if (threadIdx.y == 0 && j > 0)
+            {
+                tile[0][li] = in[i + (j - 1) * nx + k * stride_z];
+            }
+            if (threadIdx.y == TILE_Y - 1 || threadIdx.y == blockDim.y - 1)
+            {
+                if (j + 1 < ny)
+                    tile[lj + 1][li] = in[i + (j + 1) * nx + k * stride_z];
+            }
+        }
+
+        __syncthreads();
+
+        // Compute stencil for interior points
+        if (interior)
+        {
+            int idx = i + j * nx + k * stride_z;
+
+            // xy-neighbors from shared memory
+            float x_neg = tile[lj][li - 1];
+            float x_pos = tile[lj][li + 1];
+            float y_neg = tile[lj - 1][li];
+            float y_pos = tile[lj + 1][li];
+
+            // z-neighbors from registers!
+            out[idx] = C0 * z_curr + C1 * x_neg + C2 * x_pos +
+                       C3 * y_neg + C4 * y_pos + C5 * z_prev + C6 * z_next;
+        }
+
+        // The tile is rewritten in the next iteration; wait for all readers.
+        __syncthreads();
+
+        // Slide register window
+        z_prev = z_curr;
+        z_curr = z_next;
+    }
+}
+
+/* ============================================================================
+ * Host Reference Implementation
+ * Sequential seven-point stencil over the interior points
+ * (1 <= i < nx-1, 1 <= j < ny-1, 1 <= k < nz-1). Boundary cells of `out`
+ * are never written.
+ * ============================================================================ */
+void stencil_cpu(const float *in, float *out, int nx, int ny, int nz)
+{
+    // Element distance between neighbors along each axis (x is contiguous).
+    const int stride_x = 1;
+    const int stride_y = nx;
+    const int stride_z = nx * ny;
+
+    for (int k = 1; k < nz - 1; k++)
+    {
+        for (int j = 1; j < ny - 1; j++)
+        {
+            // Flattened offset of the first element of row j in layer k.
+            const int row = j * nx + k * nx * ny;
+
+            for (int i = 1; i < nx - 1; i++)
+            {
+                const int p = i + row;
+
+                out[p] = C0 * in[p] + C1 * in[p - stride_x] + C2 * in[p + stride_x] +
+                         C3 * in[p - stride_y] + C4 * in[p + stride_y] +
+                         C5 * in[p - stride_z] + C6 * in[p + stride_z];
+            }
+        }
+    }
+}
+
+/* ============================================================================
+ * Verification
+ * Compares the interior points of gpu_out against cpu_out.
+ *   tolerance: largest accepted absolute difference per element.
+ * Returns 1 when every interior point matches, 0 otherwise. The first five
+ * mismatches and the total count are reported on stderr. Boundary cells are
+ * not inspected.
+ * ============================================================================ */
+int verify_stencil(const float *gpu_out, const float *cpu_out, int nx, int ny, int nz, float tolerance)
+{
+    int errors = 0;
+    for (int k = 1; k < nz - 1; k++)
+    {
+        for (int j = 1; j < ny - 1; j++)
+        {
+            for (int i = 1; i < nx - 1; i++)
+            {
+                int idx = i + j * nx + k * nx * ny;
+                float diff = fabsf(gpu_out[idx] - cpu_out[idx]);
+                // Written as !(diff <= tolerance) so that a NaN difference
+                // (every comparison with NaN is false) counts as a mismatch
+                // instead of silently passing.
+                if (!(diff <= tolerance))
+                {
+                    if (errors < 5)
+                    {
+                        fprintf(stderr, "Mismatch at [%d,%d,%d]: GPU=%.6f, CPU=%.6f, diff=%.6e\n",
+                                i, j, k, gpu_out[idx], cpu_out[idx], diff);
+                    }
+                    errors++;
+                }
+            }
+        }
+    }
+    if (errors > 0)
+    {
+        fprintf(stderr, "Total mismatches: %d\n", errors);
+    }
+    return (errors == 0);
+}
+
+/* ============================================================================
+ * Timing helper
+ * ============================================================================ */
+// Common signature of the launch_* wrappers:
+// (in, out, nx, ny, nz, start event, stop event).
+typedef void (*kernel_launcher_t)(const float *, float *, int, int, int, cudaEvent_t, cudaEvent_t);
+
+// Zeroes d_out, invokes `launcher` with a fresh pair of CUDA events and
+// returns the time measured between them in milliseconds. A pending CUDA
+// error after the run is reported on stderr, tagged with `name`.
+float run_and_time(const float *d_in, float *d_out, int nx, int ny, int nz,
+                   kernel_launcher_t launcher, const char *name)
+{
+    cudaEvent_t ev_begin, ev_end;
+    CHECK_CUDA(cudaEventCreate(&ev_begin));
+    CHECK_CUDA(cudaEventCreate(&ev_end));
+
+    // Start from a zeroed output buffer so results of a previous kernel
+    // cannot leak into this run.
+    const size_t out_bytes = (size_t)nx * ny * nz * sizeof(float);
+    CHECK_CUDA(cudaMemset(d_out, 0, out_bytes));
+
+    launcher(d_in, d_out, nx, ny, nz, ev_begin, ev_end);
+
+    // Wait for the stop event before reading the elapsed time.
+    CHECK_CUDA(cudaEventSynchronize(ev_end));
+
+    float elapsed_ms = 0;
+    CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, ev_begin, ev_end));
+
+    const cudaError_t status = cudaGetLastError();
+    if (cudaSuccess != status)
+    {
+        fprintf(stderr, "Kernel %s error: %s\n", name, cudaGetErrorString(status));
+    }
+
+    CHECK_CUDA(cudaEventDestroy(ev_begin));
+    CHECK_CUDA(cudaEventDestroy(ev_end));
+
+    return elapsed_ms;
+}
+
+/* ============================================================================
+ * Kernel launchers
+ * Each launcher records `start`, launches its kernel, then records `stop`.
+ * ============================================================================ */
+// Launches stencil_naive with 8x8x8 blocks covering the whole nx*ny*nz grid.
+void launch_naive(const float *d_in, float *d_out, int nx, int ny, int nz,
+                  cudaEvent_t start, cudaEvent_t stop)
+{
+    const dim3 block(8, 8, 8);
+    // Round up so partially filled blocks cover the grid edges.
+    const dim3 grid((nx + block.x - 1) / block.x,
+                    (ny + block.y - 1) / block.y,
+                    (nz + block.z - 1) / block.z);
+
+    CHECK_CUDA(cudaEventRecord(start));
+    stencil_naive<<<grid, block>>>(d_in, d_out, nx, ny, nz);
+    CHECK_CUDA(cudaEventRecord(stop));
+}
+
+#define TILE_X 32
+#define TILE_Y 8
+
+// Launches stencil_shared with one TILE_X x TILE_Y block per xy-tile and one
+// grid slice in z per interior layer (nz - 2 slices).
+void launch_shared(const float *d_in, float *d_out, int nx, int ny, int nz,
+                   cudaEvent_t start, cudaEvent_t stop)
+{
+    const int tiles_x = (nx + TILE_X - 1) / TILE_X;
+    const int tiles_y = (ny + TILE_Y - 1) / TILE_Y;
+    const int interior_layers = nz - 2;
+
+    const dim3 block(TILE_X, TILE_Y, 1);
+    const dim3 grid(tiles_x, tiles_y, interior_layers);
+
+    CHECK_CUDA(cudaEventRecord(start));
+    stencil_shared<TILE_X, TILE_Y><<<grid, block>>>(d_in, d_out, nx, ny, nz);
+    CHECK_CUDA(cudaEventRecord(stop));
+}
+
+#define COARSE_Z 8
+
+// Launches stencil_coarsened: same xy tiling as the shared kernel, but each
+// z-slice of the grid covers COARSE_Z interior layers.
+void launch_coarsened(const float *d_in, float *d_out, int nx, int ny, int nz,
+                      cudaEvent_t start, cudaEvent_t stop)
+{
+    const int tiles_x = (nx + TILE_X - 1) / TILE_X;
+    const int tiles_y = (ny + TILE_Y - 1) / TILE_Y;
+    // Ceiling division of the nz - 2 interior layers by the coarsening factor.
+    const int z_blocks = (nz - 2 + COARSE_Z - 1) / COARSE_Z;
+
+    const dim3 block(TILE_X, TILE_Y, 1);
+    const dim3 grid(tiles_x, tiles_y, z_blocks);
+
+    CHECK_CUDA(cudaEventRecord(start));
+    stencil_coarsened<TILE_X, TILE_Y, COARSE_Z><<<grid, block>>>(d_in, d_out, nx, ny, nz);
+    CHECK_CUDA(cudaEventRecord(stop));
+}
+
+// Launches stencil_register with a 2D grid of xy-tiles; the kernel itself
+// loops over all interior z-layers, so the grid has a single slice in z.
+void launch_register(const float *d_in, float *d_out, int nx, int ny, int nz,
+                     cudaEvent_t start, cudaEvent_t stop)
+{
+    const int tiles_x = (nx + TILE_X - 1) / TILE_X;
+    const int tiles_y = (ny + TILE_Y - 1) / TILE_Y;
+
+    const dim3 block(TILE_X, TILE_Y, 1);
+    const dim3 grid(tiles_x, tiles_y, 1);
+
+    CHECK_CUDA(cudaEventRecord(start));
+    stencil_register<TILE_X, TILE_Y><<<grid, block>>>(d_in, d_out, nx, ny, nz);
+    CHECK_CUDA(cudaEventRecord(stop));
+}
+
+/* ============================================================================
+ * Main
+ * ============================================================================ */
+// Returns 1 when `mode` is one of the selections main() knows how to run.
+static int is_known_mode(const char *mode)
+{
+    static const char *const known[] = {"all", "naive", "shared", "coarsened", "register"};
+    for (size_t m = 0; m < sizeof(known) / sizeof(known[0]); m++)
+    {
+        if (strcmp(mode, known[m]) == 0)
+            return 1;
+    }
+    return 0;
+}
+
+// Reads the optional grid-dimension flag `flag` into *value.
+// Returns 1 when the flag is absent (value untouched) or valid, 0 after
+// printing "Invalid <flag>" when the value is not a positive integer.
+static int parse_grid_dim(int argc, char **argv, const char *flag, int *value)
+{
+    const char *v = cli_find_flag_value(argc, argv, flag);
+    if (!v)
+        return 1;
+    if (!is_positive_integer_str(v))
+    {
+        fprintf(stderr, "Invalid %s\n", flag);
+        return 0;
+    }
+    *value = atoi(v);
+    return 1;
+}
+
+// Prints the metrics derived from the measured time plus the verification
+// verdict for one kernel run.
+static void print_kernel_metrics(float ms, size_t interiorPoints, int ok)
+{
+    // 6 adds + 7 multiplies = 13 floating-point operations per interior point.
+    double gflops = (interiorPoints * 13.0 / 1e9) / (ms / 1e3);
+    // Counts 7 float reads and 1 float write per point; reuse through
+    // shared memory or caches is not modeled.
+    double bandwidth = (interiorPoints * 7 * sizeof(float) + interiorPoints * sizeof(float)) / 1e9 / (ms / 1e3);
+
+    printf("  Time: %.3f ms\n", ms);
+    printf("  Throughput: %.2f GPoints/s\n", (interiorPoints / 1e9) / (ms / 1e3));
+    printf("  Est. GFLOP/s: %.2f\n", gflops);
+    printf("  Est. Bandwidth: %.2f GB/s\n", bandwidth);
+    printf("  Verification: %s\n\n", ok ? "PASSED" : "FAILED");
+}
+
+// Entry point: parses --mode/--nx/--ny/--nz, computes the CPU reference,
+// then runs, times and verifies the selected kernel(s).
+// Returns 0 on completion and 1 on invalid arguments or host allocation failure.
+int main(int argc, char **argv)
+{
+    if (cli_has_help(argc, argv))
+    {
+        printf("Usage: %s [--mode MODE] [--nx NX] [--ny NY] [--nz NZ]\n\n", argv[0]);
+        printf("Options:\n");
+        printf("  --mode MODE    Kernel: naive|shared|coarsened|register|all (default: all)\n");
+        printf("  --nx NX        Grid size in X (default: %d)\n", NX_DEFAULT);
+        printf("  --ny NY        Grid size in Y (default: %d)\n", NY_DEFAULT);
+        printf("  --nz NZ        Grid size in Z (default: %d)\n", NZ_DEFAULT);
+        printf("\n");
+        printf("Stencil coefficients:\n");
+        printf("  c0 = %.2f (center), c1 = %.2f (each neighbor)\n", C0, C1);
+        printf("\n");
+        printf("Tile configuration:\n");
+        printf("  Shared/Coarsened/Register: %dx%d xy-tile\n", TILE_X_DEFAULT, TILE_Y_DEFAULT);
+        printf("  Coarsening factor: %d z-layers per thread\n", COARSE_FACTOR_DEFAULT);
+        return 0;
+    }
+
+    // Parse arguments
+    int nx = NX_DEFAULT;
+    int ny = NY_DEFAULT;
+    int nz = NZ_DEFAULT;
+    const char *mode = "all";
+
+    if (!parse_grid_dim(argc, argv, "nx", &nx) ||
+        !parse_grid_dim(argc, argv, "ny", &ny) ||
+        !parse_grid_dim(argc, argv, "nz", &nz))
+    {
+        return 1;
+    }
+
+    const char *v;
+    if ((v = cli_find_flag_value(argc, argv, "mode")))
+    {
+        mode = v;
+    }
+
+    // Reject unknown modes up front; otherwise no kernel would run and the
+    // program would still exit successfully.
+    if (!is_known_mode(mode))
+    {
+        fprintf(stderr, "Invalid mode '%s' (expected naive|shared|coarsened|register|all)\n", mode);
+        return 1;
+    }
+
+    // Validate minimum size
+    if (nx < 3 || ny < 3 || nz < 3)
+    {
+        fprintf(stderr, "Grid must be at least 3x3x3 for stencil computation\n");
+        return 1;
+    }
+
+    size_t totalElements = (size_t)nx * ny * nz;
+    size_t dataSize = totalElements * sizeof(float);
+    size_t interiorPoints = (size_t)(nx - 2) * (ny - 2) * (nz - 2);
+
+    printf("=== 3D Seven-Point Stencil ===\n");
+    printf("Grid: %d x %d x %d = %zu elements\n", nx, ny, nz, totalElements);
+    printf("Interior points: %zu\n", interiorPoints);
+    printf("Data size: %.2f MB\n", dataSize / (1024.0 * 1024.0));
+    printf("Mode: %s\n\n", mode);
+
+    // Allocate host memory
+    float *h_in = (float *)malloc(dataSize);
+    float *h_out_gpu = (float *)malloc(dataSize);
+    float *h_out_cpu = (float *)malloc(dataSize);
+
+    if (!h_in || !h_out_gpu || !h_out_cpu)
+    {
+        fprintf(stderr, "Failed to allocate host memory\n");
+        // free(NULL) is a no-op, so release whichever buffers were obtained.
+        free(h_in);
+        free(h_out_gpu);
+        free(h_out_cpu);
+        return 1;
+    }
+
+    // Initialize input with random values (fixed seed for reproducible runs)
+    srand(42);
+    for (size_t i = 0; i < totalElements; i++)
+    {
+        h_in[i] = (float)(rand() % 100) / 100.0f;
+    }
+    memset(h_out_cpu, 0, dataSize);
+
+    // Compute CPU reference
+    printf("Computing CPU reference...\n");
+    stencil_cpu(h_in, h_out_cpu, nx, ny, nz);
+
+    // Allocate device memory
+    float *d_in, *d_out;
+    CHECK_CUDA(cudaMalloc(&d_in, dataSize));
+    CHECK_CUDA(cudaMalloc(&d_out, dataSize));
+    CHECK_CUDA(cudaMemcpy(d_in, h_in, dataSize, cudaMemcpyHostToDevice));
+
+    float ms;
+    int runAll = (strcmp(mode, "all") == 0);
+    float tolerance = 1e-5f;
+
+    // Run kernels based on mode
+    if (runAll || strcmp(mode, "naive") == 0)
+    {
+        ms = run_and_time(d_in, d_out, nx, ny, nz, launch_naive, "stencil_naive");
+        CHECK_CUDA(cudaMemcpy(h_out_gpu, d_out, dataSize, cudaMemcpyDeviceToHost));
+        int ok = verify_stencil(h_out_gpu, h_out_cpu, nx, ny, nz, tolerance);
+
+        printf("Kernel: stencil_naive\n");
+        print_kernel_metrics(ms, interiorPoints, ok);
+    }
+
+    if (runAll || strcmp(mode, "shared") == 0)
+    {
+        ms = run_and_time(d_in, d_out, nx, ny, nz, launch_shared, "stencil_shared");
+        CHECK_CUDA(cudaMemcpy(h_out_gpu, d_out, dataSize, cudaMemcpyDeviceToHost));
+        int ok = verify_stencil(h_out_gpu, h_out_cpu, nx, ny, nz, tolerance);
+
+        printf("Kernel: stencil_shared (tile %dx%d)\n", TILE_X, TILE_Y);
+        print_kernel_metrics(ms, interiorPoints, ok);
+    }
+
+    if (runAll || strcmp(mode, "coarsened") == 0)
+    {
+        ms = run_and_time(d_in, d_out, nx, ny, nz, launch_coarsened, "stencil_coarsened");
+        CHECK_CUDA(cudaMemcpy(h_out_gpu, d_out, dataSize, cudaMemcpyDeviceToHost));
+        int ok = verify_stencil(h_out_gpu, h_out_cpu, nx, ny, nz, tolerance);
+
+        printf("Kernel: stencil_coarsened (tile %dx%d, coarse %d)\n", TILE_X, TILE_Y, COARSE_Z);
+        print_kernel_metrics(ms, interiorPoints, ok);
+    }
+
+    if (runAll || strcmp(mode, "register") == 0)
+    {
+        ms = run_and_time(d_in, d_out, nx, ny, nz, launch_register, "stencil_register");
+        CHECK_CUDA(cudaMemcpy(h_out_gpu, d_out, dataSize, cudaMemcpyDeviceToHost));
+        int ok = verify_stencil(h_out_gpu, h_out_cpu, nx, ny, nz, tolerance);
+
+        printf("Kernel: stencil_register (tile %dx%d, register z-sweep)\n", TILE_X, TILE_Y);
+        print_kernel_metrics(ms, interiorPoints, ok);
+    }
+
+    // Cleanup
+    cudaFree(d_in);
+    cudaFree(d_out);
+    free(h_in);
+    free(h_out_gpu);
+    free(h_out_cpu);
+
+    return 0;
+}

From 337d6678813f24a66a400faf019c47901ba66b03 Mon Sep 17 00:00:00 2001
From: Alessandro Sidero <sideroalessandro@gmail.com>
Date: Tue, 30 Dec 2025 18:05:25 +0100
Subject: [PATCH 7/9] Add 'stencil' in .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 03a98b1..f884f21 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@ convolution2D
 vectAdd_errors
 gpu_info
 parallel_histogram
+stencil
 
 # Profiler and logs
 *.nvvp

From 0b50e714ce432eb4812f40b4585025431893c291 Mon Sep 17 00:00:00 2001
From: Alessandro Sidero <sideroalessandro@gmail.com>
Date: Tue, 30 Dec 2025 18:08:12 +0100
Subject: [PATCH 8/9] Enhance kernel name extraction logic in parse_metrics.py

---
 profiling_tools/parse_metrics.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/profiling_tools/parse_metrics.py b/profiling_tools/parse_metrics.py
index ea56a48..d5bc2b9 100644
--- a/profiling_tools/parse_metrics.py
+++ b/profiling_tools/parse_metrics.py
@@ -206,21 +206,40 @@ def extract_kernel_name(full_name, kernel_filter=None):
         return None  # No match, skip this kernel
     
     # Extract kernel name from mangled C++ name
-    # Pattern 1: "kernel_name(args)" or "kernel_name<template>(args)"
-    match = re.match(r'^([a-zA-Z_][a-zA-Z0-9_]*)', full_name)
+    # Pattern 1: "void kernel_name<...>(...)" - template kernels from nvprof
+    match = re.search(r'\bvoid\s+(\w+)\s*<', full_name)
     if match:
         return match.group(1)
     
-    # Pattern 2: "void kernel_name<...>(...)"
-    match = re.search(r'void\s+(\w+)', full_name)
+    # Pattern 2: "void kernel_name(...)" - simple kernels with void prefix
+    match = re.search(r'\bvoid\s+(\w+)\s*\(', full_name)
     if match:
         return match.group(1)
     
-    # Pattern 3: Namespace::kernel_name
-    match = re.search(r'(\w+)(?:<|::|\()', full_name)
+    # Pattern 3: "kernel_name<template>(args)" - template without void
+    match = re.match(r'^([a-zA-Z_][a-zA-Z0-9_]*)\s*<', full_name)
+    if match:
+        return match.group(1)
+    
+    # Pattern 4: "kernel_name(args)" - simple kernel
+    match = re.match(r'^([a-zA-Z_][a-zA-Z0-9_]*)\s*\(', full_name)
     if match:
         return match.group(1)
     
+    # Pattern 5: Namespace::kernel_name
+    match = re.search(r'(\w+)(?:<|::|\()', full_name)
+    if match:
+        name = match.group(1)
+        # Skip if it's just "void"
+        if name.lower() != 'void':
+            return name
+    
+    # Fallback: first word that's not void
+    words = re.findall(r'\b([a-zA-Z_]\w*)\b', full_name)
+    for w in words:
+        if w.lower() != 'void':
+            return w
+    
     return full_name[:50]  # Truncate long names
 
 def clean_version_name(filename):

From 0632ad45da454af6047fd92e4089c1cf60f0bd43 Mon Sep 17 00:00:00 2001
From: Alessandro Sidero <sideroalessandro@gmail.com>
Date: Tue, 30 Dec 2025 18:17:13 +0100
Subject: [PATCH 9/9] Update all README.md

---
 README.md                              | 12 ++---
 convolution/README.md                  |  9 ++--
 device_specification/README.md         | 33 ++++++++++----
 error_handling/README.md               | 62 +++++++++++++++-----------
 image_manip/README.md                  | 38 +++++++++++-----
 matrix_multiplication/README.md        | 43 ++++++++++++------
 matrix_vector_multiplication/README.md | 31 +++++++++----
 parallel_histogram/README.md           | 15 ++++++-
 stencil/README.md                      |  7 +++
 vector_addition/README.md              | 36 +++++++++++----
 10 files changed, 200 insertions(+), 86 deletions(-)

diff --git a/README.md b/README.md
index 1434229..7fcb15c 100644
--- a/README.md
+++ b/README.md
@@ -59,14 +59,16 @@ If you do not have `nvprof`, install the CUDA toolkit, or run the GitHub Actions
 
 Click the folders below for the example README files and more details:
 
-- [`Vector Addition`](vector_addition/) — vector add example
+- [`Vector Addition`](vector_addition/) — vector add example with multiple modes
 - [`Error Handling`](error_handling/) — examples showing CUDA error handling
 - [`Device Specification`](device_specification/) — device query and capability examples
-- [`Image Manipulation`](image_manip/) — image processing examples (blur, grayscale); includes `stb` helper headers
+- [`Image Manipulation`](image_manip/) — image processing examples (blur, grayscale) with libpng
 - [`Matrix-Vector Multiplication`](matrix_vector_multiplication/) — matrix-vector multiplication example
-- [`Matrix Multiplication`](matrix_multiplication/) — matrix multiplication example
-- [`Convolution`](convolution/) — convolution examples (1D & 2D)
-- [`Profiling Tools`](profiling_tools/) — automated GPU profiling suite with roofline analysis, timing histograms, and occupancy visualization
+- [`Matrix Multiplication`](matrix_multiplication/) — matrix multiplication with naive, tiled, and coarsened kernels
+- [`Convolution`](convolution/) — 1D and 2D convolution with constant memory and tiling
+- [`Parallel Histogram`](parallel_histogram/) — parallel histogram with privatization, aggregation, and coarsening
+- [`3D Stencil`](stencil/) — 3D seven-point stencil with shared memory, coarsening, and register tiling
+- [`Profiling Tools`](profiling_tools/) — automated GPU profiling suite with roofline analysis
 
 Each folder includes a `README.md` with per-example instructions.
 
diff --git a/convolution/README.md b/convolution/README.md
index 5518d24..ae7f759 100644
--- a/convolution/README.md
+++ b/convolution/README.md
@@ -111,7 +111,8 @@ Both implementations use a compile-time filter radius defined as:
 
 ## Notes
 
-- Constant memory provides broadcast capability for filter coefficients accessed by all threads
-- Tiling reduces global memory bandwidth by reusing data in shared memory
-- The halo region in tiled implementations handles boundary conditions
-- Use `NVCCFLAGS` in the Makefile to tune compilation flags for your hardware
+- Constant memory provides broadcast capability for filter coefficients accessed by all threads.
+- Tiling reduces global memory bandwidth by reusing data in shared memory.
+- The halo region in tiled implementations handles boundary conditions.
+- Use `NVCCFLAGS` in the Makefile to tune compilation flags for your hardware.
+- Use profiling tools: `../profiling_tools/profile_cuda.sh -d .`
diff --git a/device_specification/README.md b/device_specification/README.md
index c3de657..7f5a864 100644
--- a/device_specification/README.md
+++ b/device_specification/README.md
@@ -1,27 +1,42 @@
 # Device Specification
 
-Purpose: utility to enumerate CUDA devices and print hardware limits and properties useful for tuning kernels and understanding the platform.
+Utility to enumerate CUDA devices and print hardware limits and properties useful for tuning kernels and understanding the platform.
 
-Build:
+## Build
 
 ```bash
 cd device_specification
 make
 ```
 
-Programs and usage:
+## Usage
 
-- `deviceSpec [--device device_index]` : print properties for all devices or for the supplied device index.
+```bash
+./deviceSpec [--device DEVICE_INDEX]
+```
+
+**Options:**
+
+| Flag | Description | Default |
+|------|-------------|---------|
+| `--device` | Device index to query | All devices |
 
-Run examples (local runner):
+## Run
 
 ```bash
-./run.sh --device=0    # print device 0 only
-# No args prints all devices
+# Print all devices
 ./run.sh
+
+# Print device 0 only
+./run.sh --device=0
 ```
 
-Notes:
+## Profiling
+
+This is a query utility; profiling is not typically needed.
+
+## Notes
 
-- Uses `cudaGetDeviceProperties` to collect a broad set of fields: memory, SMPs, registers, warp size, clock rates, compute capability, PCI IDs, ECC and concurrency flags. Useful baseline for kernel tuning.
+- Uses `cudaGetDeviceProperties` to collect a broad set of fields: memory, SMs (streaming multiprocessors), registers, warp size, clock rates, compute capability, PCI IDs, ECC and concurrency flags.
+- Useful baseline for kernel tuning.
 - No external libraries required beyond the CUDA toolkit.
diff --git a/error_handling/README.md b/error_handling/README.md
index 7365b2b..3da8ae8 100644
--- a/error_handling/README.md
+++ b/error_handling/README.md
@@ -1,48 +1,60 @@
-# Error-handling demos for vector addition
+# Error Handling Demos
 
-This directory contains small programs that intentionally trigger common CUDA
-runtime and kernel errors so you can observe runtime messages and test
-profiling/debugging tools such as `nvprof`.
+This directory contains small programs that intentionally trigger common CUDA runtime and kernel errors so you can observe runtime messages and test profiling/debugging tools.
 
-Build:
+## Build
 
 ```bash
 cd error_handling
 make
 ```
 
-Programs and usage:
+## Usage
 
-- `vectAdd_errors [--mode idx]` : vector-add demo with intentional error modes
-  - Mode 0 (or no mode): safe run
-  - Mode 1 : excessive block size (invalid kernel launch configuration)
-  - Mode 2 : invalid host pointer passed to `cudaMemcpy`
-  - Mode 3 : excessive allocation request (forced `cudaMalloc` failure)
-  - Mode 4 : referencing invalid device pointer in kernel (NULL/invalid)
-  - Mode 5 : out-of-bounds global memory write in kernel
+### vectAdd_errors
 
-- `errorCudaMemcpy` : separate demo that demonstrates common cudaMemcpy /
-  memory-management mistakes, including incorrect sizes, nullptr copies and
-  misuse of `cudaMemcpyDeviceToDevice`. This file includes its own checking
-  macros and intentionally triggers runtime/runtime-sticky errors for testing.
+Vector-add demo with intentional error modes:
 
-Run examples (local runner):
+```bash
+./vectAdd_errors [--mode MODE] [--n N]
+```
+
+**Modes:**
+
+| Mode | Description |
+|------|-------------|
+| 0 | Safe run (no errors) |
+| 1 | Excessive block size (invalid launch configuration) |
+| 2 | Invalid host pointer passed to `cudaMemcpy` |
+| 3 | Excessive allocation request (forced `cudaMalloc` failure) |
+| 4 | Referencing invalid device pointer in kernel |
+| 5 | Out-of-bounds global memory write in kernel |
+
+### errorCudaMemcpy
+
+Demonstrates common `cudaMemcpy` and memory-management mistakes:
+- Incorrect sizes
+- nullptr copies
+- Misuse of `cudaMemcpyDeviceToDevice`
+
+## Run
 
 ```bash
-# Run vectAdd_errors in a specific mode:
+# Run vectAdd_errors in a specific mode
 ./run.sh vectAdd_errors --mode 1 --n 1024
-# Run the errorCudaMemcpy demo:
+
+# Run the errorCudaMemcpy demo
 ./run.sh errorCudaMemcpy --n 1024
 ```
 
-Profile with nvprof:
+## Profiling
 
 ```bash
 ./profile_nvprof.sh errorCudaMemcpy
 ```
 
-Notes:
+## Notes
 
-- The examples are intentionally invalid — run them in a controlled environment
-  for learning and debugging. The programs print CUDA error strings produced by
-  the runtime. Use `nvprof` output to inspect kernel activity and memory events.
+- These examples are **intentionally invalid** — run them in a controlled environment for learning and debugging.
+- The programs print CUDA error strings produced by the runtime.
+- Use `nvprof` or `compute-sanitizer` to inspect kernel activity and memory events.
diff --git a/image_manip/README.md b/image_manip/README.md
index 35ff1c3..2e39897 100644
--- a/image_manip/README.md
+++ b/image_manip/README.md
@@ -1,33 +1,49 @@
-# Image Manipulation (libpng)
+# Image Manipulation
 
-Simple CUDA examples that load PNG images with `libpng`, run GPU kernels (blur and grayscale), and write PNG outputs.
+CUDA examples that load PNG images with `libpng`, run GPU kernels (blur and grayscale), and write PNG outputs.
 
-Build:
+## Build
 
 ```bash
 cd image_manip
 make
 ```
 
-Programs and usage:
+Requires `libpng-dev` (or equivalent) installed on the system.
 
-- `imageBlur [--infile IN.png] [--outfile OUT.png]` : apply a small box blur (GPU)
-- `imageToGrayscale [--infile IN.png] [--outfile OUT.png]` : convert to grayscale on GPU
+## Usage
 
-Run examples (local runner):
+### imageBlur
+
+Apply a box blur filter on GPU:
+
+```bash
+./imageBlur [--infile IN.png] [--outfile OUT.png]
+```
+
+### imageToGrayscale
+
+Convert to grayscale on GPU:
+
+```bash
+./imageToGrayscale [--infile IN.png] [--outfile OUT.png]
+```
+
+## Run
 
 ```bash
 ./run.sh imageBlur --infile=input.png --outfile=output.png
+./run.sh imageToGrayscale --infile=input.png --outfile=gray.png
 ```
 
-Profile with nvprof:
+## Profiling
 
 ```bash
 ./profile_nvprof.sh imageBlur --infile=input.png --outfile=output.png
 ```
 
-Notes:
+## Notes
 
-- These examples use `libpng` from the system. Ensure `libpng-dev` (or equivalent) is installed and visible to the compiler.
-- The binaries link with `-lpng -lz`. If your system puts headers/libraries in non-standard locations, update `Makefile` accordingly.
+- Binaries link with `-lpng -lz`. If your system puts headers/libraries in non-standard locations, update `Makefile` accordingly.
 - Outputs keep the same number of channels as the input (RGB/RGBA).
+- Use `NVCCFLAGS` in the `Makefile` to tune compilation flags.
diff --git a/matrix_multiplication/README.md b/matrix_multiplication/README.md
index ce233b3..b6dbc52 100644
--- a/matrix_multiplication/README.md
+++ b/matrix_multiplication/README.md
@@ -1,34 +1,51 @@
-# matrix_multiplication
+# Matrix Multiplication
 
-Examples and microbenchmarks for matrix-matrix multiplication. This subproject includes
-multiple kernel variants (naive, tiled/shared-memory, coarsened and per-row/col variants),
-convenience runner and profiling helpers.
+Multiple kernel implementations for matrix-matrix multiplication demonstrating different optimization strategies: naive, tiled (shared memory), coarsened, and per-row/per-column variants.
 
-**Build**
+## Build
 
 ```bash
 cd matrix_multiplication
 make
 ```
 
-**Programs and usage**
+## Usage
 
-- `matrixMul [--mode MODE] [--M M] [--K K] [--N N] [--threads THREADS] [--tile TILE] [--coarse COARSE]` : Run a single mode.
-- Modes (supported): `naive`, `tiled`, `coarsened`, `perrows`, `percols`.
-	- `coarsened` accepts additional `COARSE` parameter (1..8) as last argument.
+```bash
+./matrixMul [--mode MODE] [--M M] [--K K] [--N N] [--threads T] [--tile TILE] [--coarse C]
+```
+
+**Options:**
 
-**Run (local runner)**
+| Flag | Description | Default |
+|------|-------------|---------|
+| `--mode` | Kernel: `naive`, `tiled`, `coarsened`, `perrows`, `percols`, `all` | `all` |
+| `--M` | Matrix A rows | 1024 |
+| `--K` | Matrix A cols / B rows | 1024 |
+| `--N` | Matrix B cols | 1024 |
+| `--threads` | Threads per block | 256 |
+| `--tile` | Tile dimension for shared memory | 16 |
+| `--coarse` | Coarsening factor (1-8) | 2 |
+
+## Run
 
 ```bash
 ./run.sh --mode=tiled --M=1024 --K=1024 --N=1024 --threads=256 --tile=16
 ```
 
-**Profile with nvprof**
+## Profiling
 
 ```bash
+# Profile with nvprof
 ./profile_nvprof.sh --M 1024 --K 1024 --N 1024 --threads 256
+
+# Use profiling tools
+../profiling_tools/profile_cuda.sh -d .
 ```
 
-Notes
+## Notes
 
-- The CUDA kernels intentionally demonstrate multiple implementation strategies for microbenchmarking; they are not heavily optimized for every GPU. Use the profiling scripts and gnuplot files to collect timings and generate a Roofline / bar chart.
+- The CUDA kernels demonstrate multiple implementation strategies for microbenchmarking.
+- Use the profiling scripts to collect timings and the gnuplot files to generate Roofline / bar charts.
+- Tiled kernel uses shared memory to reduce global memory bandwidth requirements.
+- Coarsened kernel computes multiple output elements per thread.
diff --git a/matrix_vector_multiplication/README.md b/matrix_vector_multiplication/README.md
index a02bdbd..7e56da1 100644
--- a/matrix_vector_multiplication/README.md
+++ b/matrix_vector_multiplication/README.md
@@ -1,31 +1,46 @@
 # Matrix-Vector Multiplication
 
-Purpose: simple example that multiplies a matrix A (height x width) by a vector B (width) producing vector C (height) using a straightforward GPU kernel.
+Simple example that multiplies a matrix A (height × width) by a vector B (width) producing vector C (height) using a straightforward GPU kernel.
 
-Build:
+## Build
 
 ```bash
 cd matrix_vector_multiplication
 make
 ```
 
-Programs and usage:
+## Usage
 
-- `matrixVectMul [--width W] [--height H] [--threads T]` : run the example (defaults to 1024 x 1024 with 256 threads if no args provided).
+```bash
+./matrixVectMul [--width W] [--height H] [--threads T]
+```
+
+**Options:**
 
-Run examples (local runner):
+| Flag | Description | Default |
+|------|-------------|---------|
+| `--width` | Matrix width / vector length | 1024 |
+| `--height` | Matrix height / output length | 1024 |
+| `--threads` | Threads per block | 256 |
+
+## Run
 
 ```bash
 ./run.sh --width=2048 --height=1024 --threads=128
 ```
 
-Profile with nvprof:
+## Profiling
 
 ```bash
+# Profile with nvprof
 ./profile_nvprof.sh --width=2048 --height=1024
+
+# Use profiling tools
+../profiling_tools/profile_cuda.sh -d .
 ```
 
-Notes:
+## Notes
 
-- This implementation is intentionally simple. It demonstrates a per-row parallelization where each thread computes one output element. It does not attempt shared-memory tiling or other optimizations.
+- This implementation demonstrates per-row parallelization where each thread computes one output element.
+- Intentionally simple; does not use shared-memory tiling or other optimizations.
 - Use `NVCCFLAGS` in the `Makefile` to tune compile flags.
diff --git a/parallel_histogram/README.md b/parallel_histogram/README.md
index 1702be6..40b0eba 100644
--- a/parallel_histogram/README.md
+++ b/parallel_histogram/README.md
@@ -101,14 +101,25 @@ Thread i processes: input[i*C], input[i*C+1], ..., input[i*C+(C-1)]
 ./parallelHistogram --bins 64 --n 1000000
 ```
 
-## Run Script
+## Run
 
 ```bash
 ./run.sh [OPTIONS]
 ```
 
-## Profile with nvprof
+## Profiling
 
 ```bash
+# Profile with nvprof
 ./profile_nvprof.sh --mode all --n 10000000
+
+# Use profiling tools
+../profiling_tools/profile_cuda.sh -d .
 ```
+
+## Notes
+
+- The maximum number of bins is limited to 4096 due to shared memory constraints.
+- Privatized kernel provides best performance for typical use cases.
+- Coarsened kernel benefits from better memory coalescing.
+- Host-side verification ensures correctness.
diff --git a/stencil/README.md b/stencil/README.md
index e332dc5..3c0b0bb 100644
--- a/stencil/README.md
+++ b/stencil/README.md
@@ -165,3 +165,10 @@ Kernel: stencil_register (tile 32x8, register z-sweep)
 2. **Memory Bandwidth**: Stencil kernels are typically memory-bound
 3. **Occupancy**: Tile size affects SM occupancy
 4. **Register Pressure**: Register tiling may limit occupancy
+
+## Notes
+
+- Uses 7 independent coefficients (c0-c6) for flexibility.
+- Host-side verification ensures correctness against CPU reference.
+- All kernels include boundary handling (skip boundary points).
+- Use `NVCCFLAGS` in the `Makefile` to tune compilation flags.
diff --git a/vector_addition/README.md b/vector_addition/README.md
index 48f7142..e905a94 100644
--- a/vector_addition/README.md
+++ b/vector_addition/README.md
@@ -2,32 +2,50 @@
 
 Small demos that implement vector addition on the GPU. Variants include a baseline implementation, a grid-stride kernel with a thread granularity parameter, and Unified Memory versions (with and without prefetch).
 
-Build:
+## Build
 
 ```bash
 cd vector_addition
 make
 ```
 
-Programs and usage:
+## Usage
 
-- `vectAdd [--mode M] [--n N] [--threads T] [--granularity G]` : run vectAdd examples
-  - `mode` can select implementation or behavior (see source `vectAdd.cu` or launch with `-h` for available modes).
-  - Examples:
-    - Flag-style: `./vectAdd --mode=2 --n=1000000 --threads=256 --granularity=4`
+```bash
+./vectAdd [--mode M] [--n N] [--threads T] [--granularity G]
+```
+
+**Options:**
+
+| Flag | Description | Default |
+|------|-------------|---------|
+| `--mode` | Implementation variant (see source or `-h`) | 0 |
+| `--n` | Number of elements | 1000000 |
+| `--threads` | Threads per block | 256 |
+| `--granularity` | Elements per thread (grid-stride) | 1 |
 
-Run examples (local runner):
+**Example:**
+```bash
+./vectAdd --mode=2 --n=1000000 --threads=256 --granularity=4
+```
+
+## Run
 
 ```bash
 ./run.sh --mode=2 --n=1000000 --threads=256 --granularity=4
 ```
 
-Profile with nvprof:
+## Profiling
 
 ```bash
+# Profile with nvprof
 ./profile_nvprof.sh --mode=1
+
+# Use profiling tools
+../profiling_tools/profile_cuda.sh -d .
 ```
 
-Notes:
+## Notes
 
 - Use `NVCCFLAGS` in the `Makefile` to tune compilation flags for your hardware.
+- Modes: 0 = basic kernel, 1 = grid-stride, 2 = Unified Memory, 3 = Unified Memory with prefetch.