diff --git a/.gitignore b/.gitignore index 03a98b1..f884f21 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ convolution2D vectAdd_errors gpu_info parallel_histogram +stencil # Profiler and logs *.nvvp diff --git a/README.md b/README.md index 1434229..7fcb15c 100644 --- a/README.md +++ b/README.md @@ -59,14 +59,16 @@ If you do not have `nvprof`, install the CUDA toolkit, or run the GitHub Actions Click the folders below for the example README files and more details: -- [`Vector Addition`](vector_addition/) — vector add example +- [`Vector Addition`](vector_addition/) — vector add example with multiple modes - [`Error Handling`](error_handling/) — examples showing CUDA error handling - [`Device Specification`](device_specification/) — device query and capability examples -- [`Image Manipulation`](image_manip/) — image processing examples (blur, grayscale); includes `stb` helper headers +- [`Image Manipulation`](image_manip/) — image processing examples (blur, grayscale) with libpng - [`Matrix-Vector Multiplication`](matrix_vector_multiplication/) — matrix-vector multiplication example -- [`Matrix Multiplication`](matrix_multiplication/) — matrix multiplication example -- [`Convolution`](convolution/) — convolution examples (1D & 2D) -- [`Profiling Tools`](profiling_tools/) — automated GPU profiling suite with roofline analysis, timing histograms, and occupancy visualization +- [`Matrix Multiplication`](matrix_multiplication/) — matrix multiplication with naive, tiled, and coarsened kernels +- [`Convolution`](convolution/) — 1D and 2D convolution with constant memory and tiling +- [`Parallel Histogram`](parallel_histogram/) — parallel histogram with privatization, aggregation, and coarsening +- [`3D Stencil`](stencil/) — 3D seven-point stencil with shared memory, coarsening, and register tiling +- [`Profiling Tools`](profiling_tools/) — automated GPU profiling suite with roofline analysis Each folder includes a `README.md` with per-example instructions. diff --git a/convolution/README.md b/convolution/README.md index 5518d24..ae7f759 100644 --- a/convolution/README.md +++ b/convolution/README.md @@ -111,7 +111,8 @@ Both implementations use a compile-time filter radius defined as: ## Notes -- Constant memory provides broadcast capability for filter coefficients accessed by all threads -- Tiling reduces global memory bandwidth by reusing data in shared memory -- The halo region in tiled implementations handles boundary conditions -- Use `NVCCFLAGS` in the Makefile to tune compilation flags for your hardware +- Constant memory provides broadcast capability for filter coefficients accessed by all threads. +- Tiling reduces global memory bandwidth by reusing data in shared memory. +- The halo region in tiled implementations handles boundary conditions. +- Use `NVCCFLAGS` in the Makefile to tune compilation flags for your hardware. +- Use profiling tools: `../profiling_tools/profile_cuda.sh -d .` diff --git a/device_specification/README.md b/device_specification/README.md index c3de657..7f5a864 100644 --- a/device_specification/README.md +++ b/device_specification/README.md @@ -1,27 +1,42 @@ # Device Specification -Purpose: utility to enumerate CUDA devices and print hardware limits and properties useful for tuning kernels and understanding the platform. +Utility to enumerate CUDA devices and print hardware limits and properties useful for tuning kernels and understanding the platform. -Build: +## Build ```bash cd device_specification make ``` -Programs and usage: +## Usage -- `deviceSpec [--device device_index]` : print properties for all devices or for the supplied device index. +```bash +./deviceSpec [--device DEVICE_INDEX] +``` + +**Options:** + +| Flag | Description | Default | +|------|-------------|---------| +| `--device` | Device index to query | All devices | -Run examples (local runner): +## Run ```bash -./run.sh --device=0 # print device 0 only -# No args prints all devices +# Print all devices ./run.sh + +# Print device 0 only +./run.sh --device=0 ``` -Notes: +## Profiling + +This is a query utility; profiling is not typically needed. + +## Notes -- Uses `cudaGetDeviceProperties` to collect a broad set of fields: memory, SMPs, registers, warp size, clock rates, compute capability, PCI IDs, ECC and concurrency flags. Useful baseline for kernel tuning. +- Uses `cudaGetDeviceProperties` to collect a broad set of fields: memory, SMPs, registers, warp size, clock rates, compute capability, PCI IDs, ECC and concurrency flags. +- Useful baseline for kernel tuning. - No external libraries required beyond the CUDA toolkit. diff --git a/error_handling/README.md b/error_handling/README.md index 7365b2b..3da8ae8 100644 --- a/error_handling/README.md +++ b/error_handling/README.md @@ -1,48 +1,60 @@ -# Error-handling demos for vector addition +# Error Handling Demos -This directory contains small programs that intentionally trigger common CUDA -runtime and kernel errors so you can observe runtime messages and test -profiling/debugging tools such as `nvprof`. +This directory contains small programs that intentionally trigger common CUDA runtime and kernel errors so you can observe runtime messages and test profiling/debugging tools. -Build: +## Build ```bash cd error_handling make ``` -Programs and usage: +## Usage -- `vectAdd_errors [--mode idx]` : vector-add demo with intentional error modes - - Mode 0 (or no mode): safe run - - Mode 1 : excessive block size (invalid kernel launch configuration) - - Mode 2 : invalid host pointer passed to `cudaMemcpy` - - Mode 3 : excessive allocation request (forced `cudaMalloc` failure) - - Mode 4 : referencing invalid device pointer in kernel (NULL/invalid) - - Mode 5 : out-of-bounds global memory write in kernel +### vectAdd_errors -- `errorCudaMemcpy` : separate demo that demonstrates common cudaMemcpy / - memory-management mistakes, including incorrect sizes, nullptr copies and - misuse of `cudaMemcpyDeviceToDevice`. This file includes its own checking - macros and intentionally triggers runtime/runtime-sticky errors for testing. +Vector-add demo with intentional error modes: -Run examples (local runner): +```bash +./vectAdd_errors [--mode MODE] [--n N] +``` + +**Modes:** + +| Mode | Description | +|------|-------------| +| 0 | Safe run (no errors) | +| 1 | Excessive block size (invalid launch configuration) | +| 2 | Invalid host pointer passed to `cudaMemcpy` | +| 3 | Excessive allocation request (forced `cudaMalloc` failure) | +| 4 | Referencing invalid device pointer in kernel | +| 5 | Out-of-bounds global memory write in kernel | + +### errorCudaMemcpy + +Demonstrates common `cudaMemcpy` and memory-management mistakes: +- Incorrect sizes +- nullptr copies +- Misuse of `cudaMemcpyDeviceToDevice` + +## Run ```bash -# Run vectAdd_errors in a specific mode: +# Run vectAdd_errors in a specific mode ./run.sh vectAdd_errors --mode 1 --n 1024 -# Run the errorCudaMemcpy demo: + +# Run the errorCudaMemcpy demo ./run.sh errorCudaMemcpy --n 1024 ``` -Profile with nvprof: +## Profiling ```bash ./profile_nvprof.sh errorCudaMemcpy ``` -Notes: +## Notes -- The examples are intentionally invalid — run them in a controlled environment - for learning and debugging. The programs print CUDA error strings produced by - the runtime. Use `nvprof` output to inspect kernel activity and memory events. +- These examples are **intentionally invalid** — run them in a controlled environment for learning and debugging. +- The programs print CUDA error strings produced by the runtime. +- Use `nvprof` or `compute-sanitizer` to inspect kernel activity and memory events. diff --git a/image_manip/README.md b/image_manip/README.md index 35ff1c3..2e39897 100644 --- a/image_manip/README.md +++ b/image_manip/README.md @@ -1,33 +1,49 @@ -# Image Manipulation (libpng) +# Image Manipulation -Simple CUDA examples that load PNG images with `libpng`, run GPU kernels (blur and grayscale), and write PNG outputs. +CUDA examples that load PNG images with `libpng`, run GPU kernels (blur and grayscale), and write PNG outputs. -Build: +## Build ```bash cd image_manip make ``` -Programs and usage: +Requires `libpng-dev` (or equivalent) installed on the system. -- `imageBlur [--infile IN.png] [--outfile OUT.png]` : apply a small box blur (GPU) -- `imageToGrayscale [--infile IN.png] [--outfile OUT.png]` : convert to grayscale on GPU +## Usage -Run examples (local runner): +### imageBlur + +Apply a box blur filter on GPU: + +```bash +./imageBlur [--infile IN.png] [--outfile OUT.png] +``` + +### imageToGrayscale + +Convert to grayscale on GPU: + +```bash +./imageToGrayscale [--infile IN.png] [--outfile OUT.png] +``` + +## Run ```bash ./run.sh imageBlur --infile=input.png --outfile=output.png +./run.sh imageToGrayscale --infile=input.png --outfile=gray.png ``` -Profile with nvprof: +## Profiling ```bash ./profile_nvprof.sh imageBlur --infile=input.png --outfile=output.png ``` -Notes: +## Notes -- These examples use `libpng` from the system. Ensure `libpng-dev` (or equivalent) is installed and visible to the compiler. -- The binaries link with `-lpng -lz`. If your system puts headers/libraries in non-standard locations, update `Makefile` accordingly. +- Binaries link with `-lpng -lz`. If your system puts headers/libraries in non-standard locations, update `Makefile` accordingly. - Outputs keep the same number of channels as the input (RGB/RGBA). +- Use `NVCCFLAGS` in the `Makefile` to tune compilation flags. diff --git a/matrix_multiplication/README.md b/matrix_multiplication/README.md index ce233b3..b6dbc52 100644 --- a/matrix_multiplication/README.md +++ b/matrix_multiplication/README.md @@ -1,34 +1,51 @@ -# matrix_multiplication +# Matrix Multiplication -Examples and microbenchmarks for matrix-matrix multiplication. This subproject includes -multiple kernel variants (naive, tiled/shared-memory, coarsened and per-row/col variants), -convenience runner and profiling helpers. +Multiple kernel implementations for matrix-matrix multiplication demonstrating different optimization strategies: naive, tiled (shared memory), coarsened, and per-row/per-column variants. -**Build** +## Build ```bash cd matrix_multiplication make ``` -**Programs and usage** +## Usage -- `matrixMul [--mode MODE] [--M M] [--K K] [--N N] [--threads THREADS] [--tile TILE] [--coarse COARSE]` : Run a single mode. -- Modes (supported): `naive`, `tiled`, `coarsened`, `perrows`, `percols`. - - `coarsened` accepts additional `COARSE` parameter (1..8) as last argument. +```bash +./matrixMul [--mode MODE] [--M M] [--K K] [--N N] [--threads T] [--tile TILE] [--coarse C] +``` + +**Options:** -**Run (local runner)** +| Flag | Description | Default | +|------|-------------|---------| +| `--mode` | Kernel: `naive`, `tiled`, `coarsened`, `perrows`, `percols`, `all` | `all` | +| `--M` | Matrix A rows | 1024 | +| `--K` | Matrix A cols / B rows | 1024 | +| `--N` | Matrix B cols | 1024 | +| `--threads` | Threads per block | 256 | +| `--tile` | Tile dimension for shared memory | 16 | +| `--coarse` | Coarsening factor (1-8) | 2 | + +## Run ```bash ./run.sh --mode=tiled --M=1024 --K=1024 --N=1024 --threads=256 --tile=16 ``` -**Profile with nvprof** +## Profiling ```bash +# Profile with nvprof ./profile_nvprof.sh --M 1024 --K 1024 --N 1024 --threads 256 + +# Use profiling tools +../profiling_tools/profile_cuda.sh -d . ``` -Notes +## Notes -- The CUDA kernels intentionally demonstrate multiple implementation strategies for microbenchmarking; they are not heavily optimized for every GPU. Use the profiling scripts and gnuplot files to collect timings and generate a Roofline / bar chart. +- The CUDA kernels demonstrate multiple implementation strategies for microbenchmarking. +- Use the profiling scripts and gnuplot to collect timings and generate Roofline / bar charts. +- Tiled kernel uses shared memory to reduce global memory bandwidth requirements. +- Coarsened kernel computes multiple output elements per thread. diff --git a/matrix_vector_multiplication/README.md b/matrix_vector_multiplication/README.md index a02bdbd..7e56da1 100644 --- a/matrix_vector_multiplication/README.md +++ b/matrix_vector_multiplication/README.md @@ -1,31 +1,46 @@ # Matrix-Vector Multiplication -Purpose: simple example that multiplies a matrix A (height x width) by a vector B (width) producing vector C (height) using a straightforward GPU kernel. +Simple example that multiplies a matrix A (height × width) by a vector B (width) producing vector C (height) using a straightforward GPU kernel. -Build: +## Build ```bash cd matrix_vector_multiplication make ``` -Programs and usage: +## Usage -- `matrixVectMul [--width W] [--height H] [--threads T]` : run the example (defaults to 1024 x 1024 with 256 threads if no args provided). +```bash +./matrixVectMul [--width W] [--height H] [--threads T] +``` + +**Options:** -Run examples (local runner): +| Flag | Description | Default | +|------|-------------|---------| +| `--width` | Matrix width / vector length | 1024 | +| `--height` | Matrix height / output length | 1024 | +| `--threads` | Threads per block | 256 | + +## Run ```bash ./run.sh --width=2048 --height=1024 --threads=128 ``` -Profile with nvprof: +## Profiling ```bash +# Profile with nvprof ./profile_nvprof.sh --width=2048 --height=1024 + +# Use profiling tools +../profiling_tools/profile_cuda.sh -d . ``` -Notes: +## Notes -- This implementation is intentionally simple. It demonstrates a per-row parallelization where each thread computes one output element. It does not attempt shared-memory tiling or other optimizations. +- This implementation demonstrates per-row parallelization where each thread computes one output element. +- Intentionally simple; does not use shared-memory tiling or other optimizations. - Use `NVCCFLAGS` in the `Makefile` to tune compile flags. diff --git a/parallel_histogram/README.md b/parallel_histogram/README.md index 1702be6..40b0eba 100644 --- a/parallel_histogram/README.md +++ b/parallel_histogram/README.md @@ -101,14 +101,25 @@ Thread i processes: input[i*C], input[i*C+1], ..., input[i*C+(C-1)] ./parallelHistogram --bins 64 --n 1000000 ``` -## Run Script +## Run ```bash ./run.sh [OPTIONS] ``` -## Profile with nvprof +## Profiling ```bash +# Profile with nvprof ./profile_nvprof.sh --mode all --n 10000000 + +# Use profiling tools +../profiling_tools/profile_cuda.sh -d . ``` + +## Notes + +- Maximum bins limited to 4096 due to shared memory constraints. +- Privatized kernel provides best performance for typical use cases. +- Coarsened kernel benefits from better memory coalescing. +- Host-side verification ensures correctness. diff --git a/profiling_tools/parse_metrics.py b/profiling_tools/parse_metrics.py index 8a77ef2..d5bc2b9 100644 --- a/profiling_tools/parse_metrics.py +++ b/profiling_tools/parse_metrics.py @@ -72,13 +72,41 @@ def get_unit_multiplier(unit_str): return multipliers.get(u, 1.0) def read_csv_with_units(path): - """Read CSV with optional unit row""" + """Read CSV with optional unit row, handling nvprof multi-line format""" rows = [] if not os.path.exists(path): return rows with open(path, 'r', newline='', encoding='utf-8', errors='ignore') as f: - lines = f.readlines() + content = f.read() + + if not content.strip(): + return rows + + # Clean nvprof output: merge lines that are continuations + # nvprof sometimes splits long kernel names across lines + lines = [] + buffer = "" + for line in content.split('\n'): + stripped = line.strip() + # Skip nvprof header lines + if stripped.startswith('==') or not stripped: + continue + + # Count quotes to detect incomplete lines + buffer += line + quote_count = buffer.count('"') + + if quote_count % 2 == 0: + # Complete line + lines.append(buffer.strip()) + buffer = "" + else: + # Incomplete line, continue buffering + buffer += " " + + if buffer.strip(): + lines.append(buffer.strip()) if not lines: return rows @@ -86,15 +114,16 @@ def read_csv_with_units(path): # Find header line header_idx = -1 for i, line in enumerate(lines[:20]): - if '"Name"' in line or 'Name' in line or 'Kernel' in line: + if '"Type"' in line or '"Name"' in line or 'Name' in line or 'Kernel' in line: header_idx = i break if header_idx == -1: return rows - # Parse header - keys = [k.strip().replace('"', '') for k in lines[header_idx].strip().split(',')] + # Parse header - handle quoted CSV properly + header_line = lines[header_idx] + keys = parse_csv_line(header_line) # Check for unit row unit_map = {} @@ -102,9 +131,9 @@ def read_csv_with_units(path): if len(lines) > header_idx + 1: next_line = lines[header_idx + 1] - potential_units = [u.strip().replace('"', '') for u in next_line.strip().split(',')] + potential_units = parse_csv_line(next_line) - # Check if this looks like a unit row + # Check if this looks like a unit row (contains time units or %) if any(u in ['s', 'ms', 'us', 'ns', '%'] for u in potential_units): data_start_idx = header_idx + 2 for i, u in enumerate(potential_units): @@ -112,33 +141,63 @@ def read_csv_with_units(path): unit_map[keys[i]] = get_unit_multiplier(u) # Read data rows - reader = csv.DictReader(lines[data_start_idx:], fieldnames=keys) - for row in reader: - clean_row = {} - for k, v in row.items(): - if not v: - clean_row[k] = v - continue - - # Apply unit conversion - if k in unit_map and unit_map[k] != 1.0: - try: - val_clean = re.sub(r'[^0-9\.]', '', v) - clean_row[k] = float(val_clean) * unit_map[k] - except: - clean_row[k] = v - else: - clean_row[k] = v + for line in lines[data_start_idx:]: + if not line.strip(): + continue + + values = parse_csv_line(line) + row = {} + + for i, val in enumerate(values): + if i < len(keys): + key = keys[i] + # Apply unit conversion + if key in unit_map and unit_map[key] != 1.0: + try: + val_clean = re.sub(r'[^0-9\.\-eE]', '', val) + if val_clean: + row[key] = float(val_clean) * unit_map[key] + else: + row[key] = val + except: + row[key] = val + else: + row[key] = val - rows.append(clean_row) + if row: + rows.append(row) return rows +def parse_csv_line(line): + """Parse a CSV line handling quoted fields""" + fields = [] + current = "" + in_quotes = False + + for char in line: + if char == '"': + in_quotes = not in_quotes + elif char == ',' and not in_quotes: + fields.append(current.strip().replace('"', '')) + current = "" + else: + current += char + + fields.append(current.strip().replace('"', '')) + return fields + def extract_kernel_name(full_name, kernel_filter=None): """Extract kernel name from full mangled name""" if not full_name: return None + # Skip memory operations and API calls + skip_patterns = ['[CUDA memcpy', '[CUDA memset', 'cudaLaunch', 'cudaMalloc', 'cudaFree'] + for skip in skip_patterns: + if skip in full_name: + return None + # If filter provided, check if any filter matches if kernel_filter: for kf in kernel_filter: @@ -146,18 +205,40 @@ def extract_kernel_name(full_name, kernel_filter=None): return kf return None # No match, skip this kernel - # Otherwise, extract base kernel name - # Try to get meaningful name from mangled C++ names - patterns = [ - r'void\s+(\w+)', # void kernel_name<...> - r'(\w+)(?:<|::)', # kernel_name<...> or kernel_name::... - r'^([a-zA-Z_]\w+)', # Simple name at start - ] - - for pattern in patterns: - match = re.search(pattern, full_name) - if match: - return match.group(1) + # Extract kernel name from mangled C++ name + # Pattern 1: "void kernel_name<...>(...)" - template kernels from nvprof + match = re.search(r'\bvoid\s+(\w+)\s*<', full_name) + if match: + return match.group(1) + + # Pattern 2: "void kernel_name(...)" - simple kernels with void prefix + match = re.search(r'\bvoid\s+(\w+)\s*\(', full_name) + if match: + return match.group(1) + + # Pattern 3: "kernel_name