diff --git a/Cargo.toml b/Cargo.toml index 96caa18e2..ea2a1f7a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -88,7 +88,7 @@ rstest = "0.26" serial_test = "3.1.1" bytemuck = "1.16.1" -bytes = { version = "1.10", default-features = false } +bytes = { version = "1.11.1", default-features = false } float-ord = "0.3" float4 = "0.1" float8 = { version = "0.7", default-features = false } diff --git a/conventions.md b/conventions.md new file mode 100644 index 000000000..3f9ad5c43 --- /dev/null +++ b/conventions.md @@ -0,0 +1,46 @@ +# CubeCL Naming Conventions + +This document describes the naming conventions used throughout the CubeCL codebase. + +## Tensor Dimensions + +- Use **`axis`** (not `dim` or `dimension`) when referring to a specific dimension of a tensor. + - `tensor.stride(axis)`, `tensor.shape(axis)`, `tensor.coordinate(index, axis)` + +## Counts + +- Use **`_count` suffix** (not `num_` prefix) for quantities. + - `streaming_multiprocessor_count`, `cpu_core_count`, `tensor_core_count` + - `elem_count`, `cube_count`, `meta_count` + - Constants: `SM_COUNT_APPROX` + +## Line Size + +- Use **`line_size`** (not `vectorization` or `vectorization_factor`) for the number of + elements packed into a line. + - `tensor.line_size()`, `find_line_size()`, `tensor_line_size_parallel()` + - `tensor_vectorization_factor()` remains only as a deprecated compatibility alias. + +## Tensor Ordering + +- **`RowMajor`** / **`ColMajor`** are the primary names for matrix layouts. +- **`DecreasingOrder`** / **`IncreasingOrder`** are available as aliases: + - `MatrixLayout::IncreasingOrder` = `MatrixLayout::ColMajor` + - `MatrixLayout::DecreasingOrder` = `MatrixLayout::RowMajor` + +## Coordinates and Offsets + +- Use **`offset`** for linear buffer/slice positions. +- Use **`coordinate`** for multi-dimensional tensor positions. + +## Topology Constants + +- Use **`POS`** suffix for positions: `UNIT_POS`, `CUBE_POS`, `PLANE_POS`. 
+- Use **`DIM`** suffix for topology dimensions: `CUBE_DIM`, `PLANE_DIM`. + +## Type Naming + +- Use **postfix suffixes** for type categories: + - `*Error` for error types: `LineSizeError`, `LaunchError` + - `*Strategy` for strategy types: `ReadingStrategy` + - `*Expand` for expand/meta types: `TensorExpand`, `SliceExpand` diff --git a/crates/cubecl-core/src/codegen/metadata.rs b/crates/cubecl-core/src/codegen/metadata.rs index 3e3183685..9ad491b83 100644 --- a/crates/cubecl-core/src/codegen/metadata.rs +++ b/crates/cubecl-core/src/codegen/metadata.rs @@ -44,32 +44,32 @@ const EXTENDED_LEN: u32 = 3; /// Helper to calculate metadata offsets based on buffer count and position #[derive(Clone, Debug, Default)] pub struct Metadata { - num_meta: u32, - num_extended_meta: u32, + meta_count: u32, + extended_meta_count: u32, } impl Metadata { - pub fn new(num_meta: u32, num_extended_meta: u32) -> Self { + pub fn new(meta_count: u32, extended_meta_count: u32) -> Self { Self { - num_meta, - num_extended_meta, + meta_count, + extended_meta_count, } } fn offset_of(&self, id: u32) -> u32 { - self.num_meta * id + self.meta_count * id } fn base_len(&self) -> u32 { - self.num_meta * BASE_LEN + self.meta_count * BASE_LEN } pub fn static_len(&self) -> u32 { - self.num_meta * BASE_LEN + self.num_extended_meta * EXTENDED_LEN + self.meta_count * BASE_LEN + self.extended_meta_count * EXTENDED_LEN } fn offset_of_extended(&self, id: u32) -> u32 { - self.base_len() + self.num_extended_meta * id + self.base_len() + self.extended_meta_count * id } pub fn buffer_len_index(&self, buffer_idx: u32) -> u32 { @@ -163,11 +163,11 @@ impl MetadataBuilder { /// Build the final serialized metadata struct pub fn finish(&mut self, address_type: AddressType) -> MetadataBinding { fn finish_inner(state: &mut State) -> MetadataBinding { - let num_base = state.buffer_lens.len(); - let num_ext = state.ranks.len(); + let base_count = state.buffer_lens.len(); + let ext_count = state.ranks.len(); // All entries 
have buffer_len and len, extended also have rank, shape_offs, strides_offs - let static_len = num_base * BASE_LEN as usize + num_ext * EXTENDED_LEN as usize; + let static_len = base_count * BASE_LEN as usize + ext_count * EXTENDED_LEN as usize; let dynamic_len = state.shapes.len() + state.strides.len(); let total_len = static_len + dynamic_len; diff --git a/crates/cubecl-core/src/compute/launcher.rs b/crates/cubecl-core/src/compute/launcher.rs index a2dae7224..13f592920 100644 --- a/crates/cubecl-core/src/compute/launcher.rs +++ b/crates/cubecl-core/src/compute/launcher.rs @@ -192,18 +192,16 @@ impl TensorState { } fn process_tensor(&mut self, tensor: &TensorArg<'_, R>) -> Option { - let (tensor, vectorization) = match tensor { + let (tensor, line_size) = match tensor { TensorArg::Handle { - handle, - line_size: vectorization_factor, - .. - } => (handle, vectorization_factor), + handle, line_size, .. + } => (handle, line_size), TensorArg::Alias { .. } => return None, }; - let elem_size = tensor.elem_size * *vectorization; + let elem_size = tensor.elem_size * *line_size; let buffer_len = tensor.handle.size() / elem_size as u64; - let len = tensor.shape.iter().product::() / *vectorization; + let len = tensor.shape.iter().product::() / *line_size; with_metadata(|meta| { meta.register_tensor( tensor.strides.len() as u64, @@ -225,21 +223,19 @@ impl TensorState { } fn process_array(&mut self, array: &ArrayArg<'_, R>) -> Option { - let (array, vectorization) = match array { + let (array, line_size) = match array { ArrayArg::Handle { - handle, - line_size: vectorization_factor, - .. - } => (handle, vectorization_factor), + handle, line_size, .. + } => (handle, line_size), ArrayArg::Alias { .. 
} => return None, }; - let elem_size = array.elem_size * *vectorization; + let elem_size = array.elem_size * *line_size; let buffer_len = array.handle.size() / elem_size as u64; with_metadata(|meta| { meta.register_array( buffer_len, - array.length[0] as u64 / *vectorization as u64, + array.length[0] as u64 / *line_size as u64, self.address_type(), ) }); diff --git a/crates/cubecl-core/src/frontend/container/tensor/base.rs b/crates/cubecl-core/src/frontend/container/tensor/base.rs index 119f5b8c0..ede0c5705 100644 --- a/crates/cubecl-core/src/frontend/container/tensor/base.rs +++ b/crates/cubecl-core/src/frontend/container/tensor/base.rs @@ -37,15 +37,15 @@ mod metadata { #[cube] impl Tensor { - /// Obtain the stride of input at dimension dim + /// Obtain the stride of input at the given axis #[allow(unused_variables)] - pub fn stride(&self, dim: usize) -> usize { + pub fn stride(&self, axis: usize) -> usize { intrinsic!(|scope| { - let dim: ExpandElement = dim.into(); + let axis: ExpandElement = axis.into(); let out = scope.create_local(Type::new(usize::as_type(scope))); scope.register(Instruction::new( Metadata::Stride { - dim: *dim, + axis: *axis, var: self.expand.into(), }, out.clone().into(), @@ -54,15 +54,15 @@ mod metadata { }) } - /// Obtain the shape of input at dimension dim + /// Obtain the shape of input at the given axis #[allow(unused_variables)] - pub fn shape(&self, dim: usize) -> usize { + pub fn shape(&self, axis: usize) -> usize { intrinsic!(|scope| { - let dim: ExpandElement = dim.into(); + let axis: ExpandElement = axis.into(); let out = scope.create_local(Type::new(usize::as_type(scope))); scope.register(Instruction::new( Metadata::Shape { - dim: *dim, + axis: *axis, var: self.expand.into(), }, out.clone().into(), @@ -71,32 +71,32 @@ mod metadata { }) } - /// Obtain the coordinate corresponding to the given `index` of the tensor at dimension `dim`. 
+ /// Obtain the coordinate corresponding to the given `index` of the tensor at the given `axis`. /// /// A coordinate is a list of indices corresponding to the multi-dimensional position of an element in the tensor. - /// The `dim` element in a coordinate is the position along the `dim` dimension of the tensor. + /// The `axis` element in a coordinate is the position along that axis of the tensor. #[allow(unused_variables)] - pub fn coordinate(&self, index: usize, dim: usize) -> usize { + pub fn coordinate(&self, index: usize, axis: usize) -> usize { intrinsic!(|scope| { let index: ExpandElement = index.into(); - let stride = self.clone().__expand_stride_method(scope, dim.clone()); - let shape = self.clone().__expand_shape_method(scope, dim.clone()); + let stride = self.clone().__expand_stride_method(scope, axis.clone()); + let shape = self.clone().__expand_shape_method(scope, axis.clone()); - // Compute `num_strides = index / stride`. - let num_strides = scope.create_local(Type::new(usize::as_type(scope))); + // Compute `stride_count = index / stride`. + let stride_count = scope.create_local(Type::new(usize::as_type(scope))); scope.register(Instruction::new( Arithmetic::Div(BinaryOperator { lhs: *index, rhs: stride.expand.into(), }), - num_strides.clone().into(), + stride_count.clone().into(), )); - // Compute `coordinate = num_strides % shape `. + // Compute `coordinate = stride_count % shape `. let coordinate = scope.create_local(Type::new(usize::as_type(scope))); scope.register(Instruction::new( Arithmetic::Modulo(BinaryOperator { - lhs: *num_strides, + lhs: *stride_count, rhs: shape.expand.into(), }), coordinate.clone().into(), @@ -106,12 +106,12 @@ mod metadata { }) } - /// The number of vectorized elements in the tensor. + /// The number of lined elements in the tensor. /// /// # Warning /// - /// The length will be affected by the vectorization factor. To obtain the number of elements, - /// you should multiply the length by the vectorization factor. 
+ /// The length will be affected by the line size. To obtain the number of elements, + /// you should multiply the length by the line size. #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { intrinsic!(|scope| { @@ -120,12 +120,12 @@ mod metadata { }) } - /// The length of the buffer representing the tensor in terms of vectorized elements. + /// The length of the buffer representing the tensor in terms of lined elements. /// /// # Warning /// - /// The buffer length will be affected by the vectorization factor. To obtain the number of - /// elements, you should multiply the length by the vectorization factor. + /// The buffer length will be affected by the line size. To obtain the number of + /// elements, you should multiply the length by the line size. #[allow(clippy::len_without_is_empty)] pub fn buffer_len(&self) -> usize { intrinsic!(|scope| { diff --git a/crates/cubecl-core/src/frontend/container/tensor/launch.rs b/crates/cubecl-core/src/frontend/container/tensor/launch.rs index 142e1bb1e..564dce568 100644 --- a/crates/cubecl-core/src/frontend/container/tensor/launch.rs +++ b/crates/cubecl-core/src/frontend/container/tensor/launch.rs @@ -21,7 +21,7 @@ pub enum TensorArg<'a, R: Runtime> { Handle { /// The tensor handle. handle: TensorHandleRef<'a, R>, - /// The vectorization factor. + /// The line size. line_size: LineSize, }, /// The tensor is aliasing another input tensor. @@ -118,7 +118,7 @@ impl LaunchArg for Tensor { } impl<'a, R: Runtime> TensorArg<'a, R> { - /// Create a new tensor argument specified with its vectorization factor. + /// Create a new tensor argument specified with its line size. 
/// /// # Safety /// @@ -128,7 +128,7 @@ impl<'a, R: Runtime> TensorArg<'a, R> { handle: &'a cubecl_runtime::server::Handle, strides: &'a [usize], shape: &'a [usize], - factor: LineSize, + line_size: LineSize, ) -> Self { unsafe { Self::Handle { @@ -138,12 +138,12 @@ impl<'a, R: Runtime> TensorArg<'a, R> { shape, E::size().expect("Element should have a size"), ), - line_size: factor, + line_size, } } } - /// Create a new tensor argument specified with its vectorization factor with a manual element + /// Create a new tensor argument specified with its line size with a manual element /// size in bytes. /// /// # Safety @@ -154,13 +154,13 @@ impl<'a, R: Runtime> TensorArg<'a, R> { handle: &'a cubecl_runtime::server::Handle, strides: &'a [usize], shape: &'a [usize], - factor: LineSize, + line_size: LineSize, elem_size: usize, ) -> Self { unsafe { Self::Handle { handle: TensorHandleRef::from_raw_parts(handle, strides, shape, elem_size), - line_size: factor, + line_size, } } } diff --git a/crates/cubecl-core/src/frontend/container/tensor/tensormap.rs b/crates/cubecl-core/src/frontend/container/tensor/tensormap.rs index 49d6fd681..8908b957b 100644 --- a/crates/cubecl-core/src/frontend/container/tensor/tensormap.rs +++ b/crates/cubecl-core/src/frontend/container/tensor/tensormap.rs @@ -317,41 +317,41 @@ mod metadata { unexpanded!() } - /// Obtain the stride of input at dimension dim - pub fn stride(&self, _dim: usize) -> usize { + /// Obtain the stride of input at the given axis + pub fn stride(&self, _axis: usize) -> usize { unexpanded!() } - /// Obtain the shape of input at dimension dim - pub fn shape(&self, _dim: usize) -> usize { + /// Obtain the shape of input at the given axis + pub fn shape(&self, _axis: usize) -> usize { unexpanded!() } - /// Obtain the coordinate corresponding to the given `index` of the tensor at dimension `dim`. + /// Obtain the coordinate corresponding to the given `index` of the tensor at the given `axis`. 
/// /// A coordinate is a list of indices corresponding to the multi-dimensional position of an element in the tensor. - /// The `dim` element in a coordinate is the position along the `dim` dimension of the tensor. - pub fn coordinate(&self, _index: usize, _dim: usize) -> usize { + /// The `axis` element in a coordinate is the position along that axis of the tensor. + pub fn coordinate(&self, _index: usize, _axis: usize) -> usize { unexpanded!() } - /// The number of vectorized elements in the tensor. + /// The number of lined elements in the tensor. /// /// # Warning /// - /// The length will be affected by the vectorization factor. To obtain the number of elements, - /// you should multiply the length by the vectorization factor. + /// The length will be affected by the line size. To obtain the number of elements, + /// you should multiply the length by the line size. #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { unexpanded!() } - /// The length of the buffer representing the tensor in terms of vectorized elements. + /// The length of the buffer representing the tensor in terms of lined elements. /// /// # Warning /// - /// The buffer length will be affected by the vectorization factor. To obtain the number of - /// elements, you should multiply the length by the vectorization factor. + /// The buffer length will be affected by the line size. To obtain the number of + /// elements, you should multiply the length by the line size. #[allow(clippy::len_without_is_empty)] pub fn buffer_len(&self) -> usize { unexpanded!() @@ -382,18 +382,18 @@ mod metadata { pub fn __expand_stride( scope: &mut Scope, expand: ExpandElementTyped>, - dim: ExpandElementTyped, + axis: ExpandElementTyped, ) -> ExpandElementTyped { - expand.__expand_stride_method(scope, dim) + expand.__expand_stride_method(scope, axis) } // Expand function of [shape](TensorMap::shape). 
pub fn __expand_shape( scope: &mut Scope, expand: ExpandElementTyped>, - dim: ExpandElementTyped, + axis: ExpandElementTyped, ) -> ExpandElementTyped { - expand.__expand_shape_method(scope, dim) + expand.__expand_shape_method(scope, axis) } // Expand function of [coordinate](TensorMap::coordinate). @@ -401,9 +401,9 @@ mod metadata { scope: &mut Scope, expand: ExpandElementTyped>, index: ExpandElementTyped, - dim: ExpandElementTyped, + axis: ExpandElementTyped, ) -> ExpandElementTyped { - expand.__expand_coordinate_method(scope, index, dim) + expand.__expand_coordinate_method(scope, index, axis) } // Expand function of [len](TensorMap::len). @@ -449,13 +449,13 @@ mod metadata { pub fn __expand_stride_method( self, scope: &mut Scope, - dim: ExpandElementTyped, + axis: ExpandElementTyped, ) -> ExpandElementTyped { - let dim: ExpandElement = dim.into(); + let axis: ExpandElement = axis.into(); let out = scope.create_local(Type::new(usize::as_type(scope))); scope.register(Instruction::new( Metadata::Stride { - dim: *dim, + axis: *axis, var: self.expand.into(), }, out.clone().into(), @@ -467,13 +467,13 @@ mod metadata { pub fn __expand_shape_method( self, scope: &mut Scope, - dim: ExpandElementTyped, + axis: ExpandElementTyped, ) -> ExpandElementTyped { - let dim: ExpandElement = dim.into(); + let axis: ExpandElement = axis.into(); let out = scope.create_local(Type::new(usize::as_type(scope))); scope.register(Instruction::new( Metadata::Shape { - dim: *dim, + axis: *axis, var: self.expand.into(), }, out.clone().into(), @@ -486,27 +486,27 @@ mod metadata { self, scope: &mut Scope, index: ExpandElementTyped, - dim: ExpandElementTyped, + axis: ExpandElementTyped, ) -> ExpandElementTyped { let index: ExpandElement = index.into(); - let stride = self.clone().__expand_stride_method(scope, dim.clone()); - let shape = self.clone().__expand_shape_method(scope, dim.clone()); + let stride = self.clone().__expand_stride_method(scope, axis.clone()); + let shape = 
self.clone().__expand_shape_method(scope, axis.clone()); - // Compute `num_strides = index / stride`. - let num_strides = scope.create_local(Type::new(usize::as_type(scope))); + // Compute `stride_count = index / stride`. + let stride_count = scope.create_local(Type::new(usize::as_type(scope))); scope.register(Instruction::new( Arithmetic::Div(BinaryOperator { lhs: *index, rhs: stride.expand.into(), }), - num_strides.clone().into(), + stride_count.clone().into(), )); - // Compute `coordinate = num_strides % shape `. + // Compute `coordinate = stride_count % shape `. let coordinate = scope.create_local(Type::new(usize::as_type(scope))); scope.register(Instruction::new( Arithmetic::Modulo(BinaryOperator { - lhs: *num_strides, + lhs: *stride_count, rhs: shape.expand.into(), }), coordinate.clone().into(), diff --git a/crates/cubecl-core/src/frontend/operation/base.rs b/crates/cubecl-core/src/frontend/operation/base.rs index 51cddc881..2498604b5 100644 --- a/crates/cubecl-core/src/frontend/operation/base.rs +++ b/crates/cubecl-core/src/frontend/operation/base.rs @@ -26,7 +26,7 @@ where let item_lhs = lhs.ty; let item_rhs = rhs.ty; - let line_size = find_vectorization(item_lhs, item_rhs); + let line_size = find_line_size(item_lhs, item_rhs); let item = item_lhs.line(line_size); @@ -90,7 +90,7 @@ where let vec = if let Some(line_size) = line_size { line_size } else { - find_vectorization(item_lhs, item_rhs) + find_line_size(item_lhs, item_rhs) }; let item = item_lhs.line(vec); @@ -152,7 +152,7 @@ where let item_lhs = lhs.ty; let item_rhs = rhs.ty; - let line_size = find_vectorization(item_lhs, item_rhs); + let line_size = find_line_size(item_lhs, item_rhs); let out_item = Type::scalar(ElemType::Bool).line(line_size); @@ -254,7 +254,7 @@ where out } -pub(crate) fn find_vectorization(lhs: Type, rhs: Type) -> LineSize { +pub(crate) fn find_line_size(lhs: Type, rhs: Type) -> LineSize { if matches!(lhs, Type::Scalar(_)) && matches!(rhs, Type::Scalar(_)) { 0 } else { diff --git 
a/crates/cubecl-core/src/frontend/operation/binary.rs b/crates/cubecl-core/src/frontend/operation/binary.rs index 583e13a65..a03bb1b16 100644 --- a/crates/cubecl-core/src/frontend/operation/binary.rs +++ b/crates/cubecl-core/src/frontend/operation/binary.rs @@ -45,7 +45,7 @@ pub mod sub { let item_lhs = lhs.expand.ty; let item_rhs = rhs.expand.ty; - let line_size = find_vectorization(item_lhs, item_rhs); + let line_size = find_line_size(item_lhs, item_rhs); let item = item_lhs.line(line_size); let value = (lhs_val - rhs_val).into(); @@ -288,8 +288,8 @@ macro_rules! impl_binary_func { } } -macro_rules! impl_binary_func_fixed_output_vectorization { - ($trait_name:ident, $method_name:ident, $operator:expr, $out_vectorization: expr, $($type:ty),*) => { +macro_rules! impl_binary_func_fixed_output_line_size { + ($trait_name:ident, $method_name:ident, $operator:expr, $out_line_size: expr, $($type:ty),*) => { paste::paste! { pub trait $trait_name: CubePrimitive + CubeType]> + Sized { fn $method_name(self, _rhs: Self) -> Self { @@ -313,7 +313,7 @@ macro_rules! impl_binary_func_fixed_output_vectorization { impl [<$trait_name Expand>] for ExpandElementTyped { fn [<__expand_ $method_name _method>](self, scope: &mut Scope, rhs: Self) -> Self { let lhs: ExpandElement = self.into(); - let item = lhs.ty.line($out_vectorization); + let item = lhs.ty.line($out_line_size); binary_expand_fixed_output(scope, lhs, rhs.into(), item, $operator).into() } } @@ -574,7 +574,7 @@ impl_binary_func!( usize, isize ); -impl_binary_func_fixed_output_vectorization!( +impl_binary_func_fixed_output_line_size!( Dot, dot, Arithmetic::Dot, diff --git a/crates/cubecl-core/src/frontend/operation/unary.rs b/crates/cubecl-core/src/frontend/operation/unary.rs index a17846ddc..b1bbe7593 100644 --- a/crates/cubecl-core/src/frontend/operation/unary.rs +++ b/crates/cubecl-core/src/frontend/operation/unary.rs @@ -72,8 +72,8 @@ impl Exp for f32 { } } -macro_rules! 
impl_unary_func_fixed_out_vectorization { - ($trait_name:ident, $method_name:ident, $operator:expr, $out_vectorization: expr, $($type:ty),*) => { +macro_rules! impl_unary_func_fixed_out_line_size { + ($trait_name:ident, $method_name:ident, $operator:expr, $out_line_size: expr, $($type:ty),*) => { paste::paste! { pub trait $trait_name: CubePrimitive + CubeType]> + Sized { #[allow(unused_variables)] @@ -94,7 +94,7 @@ macro_rules! impl_unary_func_fixed_out_vectorization { impl [<$trait_name Expand>] for ExpandElementTyped { fn [<__expand_ $method_name _method>](self, scope: &mut Scope) -> Self { let expand_element: ExpandElement = self.into(); - let item = expand_element.ty.line($out_vectorization); + let item = expand_element.ty.line($out_line_size); unary_expand_fixed_output(scope, expand_element, item, $operator).into() } } @@ -410,7 +410,7 @@ impl_unary_func!( f32, f64 ); -impl_unary_func_fixed_out_vectorization!( +impl_unary_func_fixed_out_line_size!( Magnitude, magnitude, Arithmetic::Magnitude, diff --git a/crates/cubecl-core/src/lib.rs b/crates/cubecl-core/src/lib.rs index cd3e8d650..1dcac8947 100644 --- a/crates/cubecl-core/src/lib.rs +++ b/crates/cubecl-core/src/lib.rs @@ -58,28 +58,33 @@ pub use id::*; /// assigned to one element. pub fn calculate_cube_count_elemwise( client: &ComputeClient, - num_elems: usize, + elem_count: usize, cube_dim: CubeDim, ) -> CubeCount { - let num_cubes = num_elems.div_ceil(cube_dim.num_elems() as usize); - CubeCountSelection::new(client, num_cubes as u32).cube_count() + let cube_count = elem_count.div_ceil(cube_dim.num_elems() as usize); + CubeCountSelection::new(client, cube_count as u32).cube_count() } +/// Deprecated alias for [`tensor_line_size`]. 
+#[deprecated( + since = "0.10.0-pre.1", + note = "use tensor_line_size or tensor_line_size_parallel" +)] pub fn tensor_vectorization_factor( factors: &[LineSize], shape: &[usize], strides: &[usize], - dim: usize, + axis: usize, ) -> LineSize { - tensor_line_size_parallel(factors.iter().cloned(), shape, strides, dim) + tensor_line_size_parallel(factors.iter().cloned(), shape, strides, axis) } pub fn tensor_line_size( factors: &[LineSize], shape: &[usize], strides: &[usize], - dim: usize, + axis: usize, ) -> LineSize { - tensor_line_size_parallel(factors.iter().cloned(), shape, strides, dim) + tensor_line_size_parallel(factors.iter().cloned(), shape, strides, axis) } #[derive(Debug, Clone)] @@ -89,16 +94,16 @@ pub enum LineSizeError { NoValidLineSize, } -/// Find the maximum line size usable for parallel vectorization along the given axis -/// from the supported line sizes or return 1 if vectorization is impossible. +/// Find the maximum line size usable for parallel line-size selection along the given axis +/// from the supported line sizes or return 1 if a line size above 1 is impossible. /// /// This function is designed to never return a line size above 1 by error, /// but doesn't guarantee to always return the actual maximum possible line size. /// That is, it may be overly strict. /// -/// Currently, this checks that the stride of the axis is 1, that it's shape is +/// Currently, this checks that the stride of the axis is 1, that its shape is /// divisible by a candidate line size and that the smallest stride that is not 1 -/// is divisible by the vectorization. +/// is divisible by the candidate line size. /// The last condition ensure that the current axis is contiguous within the next stride. 
pub fn tensor_line_size_parallel( supported_line_sizes: impl Iterator, @@ -135,8 +140,8 @@ pub fn try_tensor_line_size_parallel( .ok_or(LineSizeError::NoValidLineSize) } -/// Find the maximum line size usable for perpendicular vectorization along the given axis -/// from the supported line sizes or return 1 if vectorization is impossible. +/// Find the maximum line size usable for perpendicular line-size selection along the given axis +/// from the supported line sizes or return 1 if a line size above 1 is impossible. /// /// This function is designed to never return a line size above 1 by error, /// but doesn't guarantee to always return the actual maximum possible line size. diff --git a/crates/cubecl-core/src/runtime_tests/cluster.rs b/crates/cubecl-core/src/runtime_tests/cluster.rs index 3c041d083..681cca20a 100644 --- a/crates/cubecl-core/src/runtime_tests/cluster.rs +++ b/crates/cubecl-core/src/runtime_tests/cluster.rs @@ -33,14 +33,14 @@ pub fn test_cluster_meta(client: ComputeClient) { let cube_count_y = 2; let cube_count_z = 6; let cube_count = CubeCount::new_3d(cube_count_x, cube_count_y, cube_count_z); - let num_cubes = cube_count_x * cube_count_y * cube_count_z; + let total_cube_count = cube_count_x * cube_count_y * cube_count_z; - let handle = client.empty((num_cubes as usize * 4 + 4) * size_of::()); + let handle = client.empty((total_cube_count as usize * 4 + 4) * size_of::()); - let vectorization = 1; + let line_size = 1; cluster_meta_kernel::launch(&client, cube_count, CubeDim::new_single(), unsafe { - ArrayArg::from_raw_parts::(&handle, num_cubes as usize * 8, vectorization) + ArrayArg::from_raw_parts::(&handle, total_cube_count as usize * 8, line_size) }) .unwrap(); diff --git a/crates/cubecl-core/src/runtime_tests/launch.rs b/crates/cubecl-core/src/runtime_tests/launch.rs index 0469c15a5..f52beedbf 100644 --- a/crates/cubecl-core/src/runtime_tests/launch.rs +++ b/crates/cubecl-core/src/runtime_tests/launch.rs @@ -165,7 +165,7 @@ pub fn 
test_kernel_max_shared(client: ComputeClient) { pub fn test_shared_memory_error(client: ComputeClient) { // No real limit on CPU, so ignore - if client.properties().hardware.num_cpu_cores.is_some() { + if client.properties().hardware.cpu_core_count.is_some() { return; } diff --git a/crates/cubecl-cpp/src/shared/base.rs b/crates/cubecl-cpp/src/shared/base.rs index 7d1ae4b2b..2eee0af82 100644 --- a/crates/cubecl-cpp/src/shared/base.rs +++ b/crates/cubecl-cpp/src/shared/base.rs @@ -335,9 +335,9 @@ impl CppCompiler { } } - let num_meta = all_meta.len(); + let meta_count = all_meta.len(); - self.metadata = cubecl_core::Metadata::new(num_meta as u32, num_ext); + self.metadata = cubecl_core::Metadata::new(meta_count as u32, num_ext); } pub(crate) fn ext_meta_position(&self, var: gpu::Variable) -> u32 { @@ -969,23 +969,23 @@ impl CppCompiler { ) -> Instruction { let out = out.unwrap(); match metadata { - gpu::Metadata::Stride { dim, var } => { + gpu::Metadata::Stride { axis, var } => { let position = self.ext_meta_position(var); let offset = self.metadata.stride_offset_index(position); Instruction::ExtendedMetadata { info_offset: self.compile_variable(offset.into()), - dim: self.compile_variable(dim), + dim: self.compile_variable(axis), split_meta: self.compilation_options.supports_features.grid_constants, static_offset: self.metadata.static_len(), out: self.compile_variable(out), } } - gpu::Metadata::Shape { dim, var } => { + gpu::Metadata::Shape { axis, var } => { let position = self.ext_meta_position(var); let offset = self.metadata.shape_offset_index(position); Instruction::ExtendedMetadata { info_offset: self.compile_variable(offset.into()), - dim: self.compile_variable(dim), + dim: self.compile_variable(axis), split_meta: self.compilation_options.supports_features.grid_constants, static_offset: self.metadata.static_len(), out: self.compile_variable(out), diff --git a/crates/cubecl-cpu/src/compiler/visitor/args_manager.rs 
b/crates/cubecl-cpu/src/compiler/visitor/args_manager.rs index 2d97fa28a..9fbd18637 100644 --- a/crates/cubecl-cpu/src/compiler/visitor/args_manager.rs +++ b/crates/cubecl-cpu/src/compiler/visitor/args_manager.rs @@ -62,9 +62,9 @@ impl<'a, 'b> ArgsManagerBuilder<'a, 'b> { } } - let num_meta = all_meta.len(); + let meta_count = all_meta.len(); - let metadata = Metadata::new(num_meta as u32, num_ext); + let metadata = Metadata::new(meta_count as u32, num_ext); let scalars = kernel.scalars.clone(); let mut args = Self { diff --git a/crates/cubecl-cpu/src/compiler/visitor/operation/metadata.rs b/crates/cubecl-cpu/src/compiler/visitor/operation/metadata.rs index 2ff47cf29..a4c0f921e 100644 --- a/crates/cubecl-cpu/src/compiler/visitor/operation/metadata.rs +++ b/crates/cubecl-cpu/src/compiler/visitor/operation/metadata.rs @@ -23,7 +23,7 @@ impl<'a> Visitor<'a> { self.insert_variable(out, result); } - fn append_extended_metadata(&mut self, offset: u32, dim: Variable, out: Variable) { + fn append_extended_metadata(&mut self, offset: u32, axis: Variable, out: Variable) { let metadata_memref = self.args_manager.metadata_memref.unwrap(); let offset = self .block @@ -45,8 +45,9 @@ impl<'a> Visitor<'a> { self.location, )); - let dim = self.get_index(dim, dim.ty, true); - let offset = self.append_operation_with_result(arith::addi(first_rank, dim, self.location)); + let axis = self.get_index(axis, axis.ty, true); + let offset = + self.append_operation_with_result(arith::addi(first_rank, axis, self.location)); let result = self.append_operation_with_result(memref::load( metadata_memref, &[offset], @@ -72,15 +73,15 @@ impl<'a> Visitor<'a> { let offset = self.args_manager.metadata.rank_index(position); self.append_metadata(offset, out); } - Metadata::Shape { dim, var } => { + Metadata::Shape { axis, var } => { let position = self.args_manager.ext_meta_position(*var); let offset = self.args_manager.metadata.shape_offset_index(position); - self.append_extended_metadata(offset, *dim, 
out); + self.append_extended_metadata(offset, *axis, out); } - Metadata::Stride { dim, var } => { + Metadata::Stride { axis, var } => { let position = self.args_manager.ext_meta_position(*var); let offset = self.args_manager.metadata.stride_offset_index(position); - self.append_extended_metadata(offset, *dim, out); + self.append_extended_metadata(offset, *axis, out); } } } diff --git a/crates/cubecl-cpu/src/compiler/visitor/variables.rs b/crates/cubecl-cpu/src/compiler/visitor/variables.rs index 262f72622..0bfc00f6e 100644 --- a/crates/cubecl-cpu/src/compiler/visitor/variables.rs +++ b/crates/cubecl-cpu/src/compiler/visitor/variables.rs @@ -121,13 +121,13 @@ impl<'a> Visitor<'a> { lhs: Variable, rhs: Variable, ) -> (Value<'a, 'a>, Value<'a, 'a>) { - let vectorization_factor = std::cmp::max(lhs.line_size(), rhs.line_size()); + let line_size = std::cmp::max(lhs.line_size(), rhs.line_size()); let (mut lhs_value, mut rhs_value) = (self.get_variable(lhs), self.get_variable(rhs)); if lhs_value.r#type().is_vector() || rhs_value.r#type().is_vector() { if !lhs_value.r#type().is_vector() { let vector_type = Type::vector( - &[vectorization_factor as u64], + &[line_size as u64], lhs.storage_type().to_type(self.context), ); lhs_value = self.append_operation_with_result(vector::splat( @@ -139,7 +139,7 @@ impl<'a> Visitor<'a> { } if !rhs_value.r#type().is_vector() { let vector_type = Type::vector( - &[vectorization_factor as u64], + &[line_size as u64], rhs.storage_type().to_type(self.context), ); rhs_value = self.append_operation_with_result(vector::splat( diff --git a/crates/cubecl-cpu/src/runtime.rs b/crates/cubecl-cpu/src/runtime.rs index 651117bfc..5b2416ba8 100644 --- a/crates/cubecl-cpu/src/runtime.rs +++ b/crates/cubecl-cpu/src/runtime.rs @@ -52,11 +52,11 @@ impl DeviceState for CpuServer { max_bindings: u32::MAX, max_shared_memory_size, max_cube_count, - num_cpu_cores: Some(available_parallelism as u32), + cpu_core_count: Some(available_parallelism as u32), 
max_units_per_cube: u32::MAX, max_cube_dim, - num_streaming_multiprocessors: None, - num_tensor_cores: None, + streaming_multiprocessor_count: None, + tensor_core_count: None, min_tensor_cores_dim: None, }; diff --git a/crates/cubecl-cuda/src/runtime.rs b/crates/cubecl-cuda/src/runtime.rs index c4023e105..a14a22076 100644 --- a/crates/cubecl-cuda/src/runtime.rs +++ b/crates/cubecl-cuda/src/runtime.rs @@ -127,10 +127,10 @@ impl DeviceState for CudaServer { let grid_dim_z = get_attribute(device_ptr, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z).unwrap(); let max_cube_count = (grid_dim_x as u32, grid_dim_y as u32, grid_dim_z as u32); - let num_streaming_multiprocessors = Some( + let streaming_multiprocessor_count = Some( get_attribute(device_ptr, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT).unwrap() as u32, ); - let num_tensor_cores = tensor_cores_per_sm(arch_version); + let tensor_core_count = tensor_cores_per_sm(arch_version); comp_opts.warp_size = warp_size; @@ -143,14 +143,14 @@ impl DeviceState for CudaServer { max_cube_count, max_units_per_cube: max_threads, max_cube_dim, - num_streaming_multiprocessors, - num_tensor_cores, + streaming_multiprocessor_count, + tensor_core_count, min_tensor_cores_dim: if supported_wmma_combinations.is_empty() { None } else { Some(8) }, - num_cpu_cores: None, + cpu_core_count: None, } }; diff --git a/crates/cubecl-hip/src/runtime.rs b/crates/cubecl-hip/src/runtime.rs index 044765e15..623248e88 100644 --- a/crates/cubecl-hip/src/runtime.rs +++ b/crates/cubecl-hip/src/runtime.rs @@ -131,14 +131,14 @@ impl DeviceState for HipServer { max_cube_count, max_units_per_cube: prop_max_threads, max_cube_dim, - num_streaming_multiprocessors: None, - num_tensor_cores: None, + streaming_multiprocessor_count: None, + tensor_core_count: None, min_tensor_cores_dim: if supported_wmma_combinations.is_empty() { None } else { Some(16) }, - num_cpu_cores: None, + cpu_core_count: None, }; let mut device_props = DeviceProperties::new( diff --git 
a/crates/cubecl-ir/src/cmma.rs b/crates/cubecl-ir/src/cmma.rs index d77f6f867..b3bf4aeb9 100644 --- a/crates/cubecl-ir/src/cmma.rs +++ b/crates/cubecl-ir/src/cmma.rs @@ -24,6 +24,15 @@ pub enum MatrixLayout { Undefined, } +impl MatrixLayout { + /// Alias for `ColMajor`. In column-major (increasing) order, strides increase with axis index. + #[allow(non_upper_case_globals)] + pub const IncreasingOrder: Self = Self::ColMajor; + /// Alias for `RowMajor`. In row-major (decreasing) order, strides decrease with axis index. + #[allow(non_upper_case_globals)] + pub const DecreasingOrder: Self = Self::RowMajor; +} + #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(new, Debug, Clone, Copy, TypeHash, PartialEq, Eq, Hash, PartialOrd, Ord)] #[allow(missing_docs)] diff --git a/crates/cubecl-ir/src/metadata.rs b/crates/cubecl-ir/src/metadata.rs index 581ef910a..255d24895 100644 --- a/crates/cubecl-ir/src/metadata.rs +++ b/crates/cubecl-ir/src/metadata.rs @@ -12,10 +12,10 @@ use crate::{OperationReflect, Variable}; pub enum Metadata { /// The rank of an array. Rank { var: Variable }, - /// The stride of an array at the given dimension. - Stride { dim: Variable, var: Variable }, - /// The shape of an array at the given dimension. - Shape { dim: Variable, var: Variable }, + /// The stride of an array at the given axis. + Stride { axis: Variable, var: Variable }, + /// The shape of an array at the given axis. + Shape { axis: Variable, var: Variable }, /// The length of an array. Length { var: Variable }, /// The length of an array's underlying buffer. 
@@ -26,8 +26,8 @@ impl Display for Metadata { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { Metadata::Rank { var } => write!(f, "rank({var})"), - Metadata::Stride { dim, var } => write!(f, "{var}.strides[{dim}]"), - Metadata::Shape { dim, var } => write!(f, "{var}.shape[{dim}]"), + Metadata::Stride { axis, var } => write!(f, "{var}.strides[{axis}]"), + Metadata::Shape { axis, var } => write!(f, "{var}.shape[{axis}]"), Metadata::Length { var } => write!(f, "{var}.len()"), Metadata::BufferLength { var } => write!(f, "buffer_len({var})"), } diff --git a/crates/cubecl-ir/src/properties.rs b/crates/cubecl-ir/src/properties.rs index 235bad710..edac2bd5d 100644 --- a/crates/cubecl-ir/src/properties.rs +++ b/crates/cubecl-ir/src/properties.rs @@ -40,11 +40,11 @@ pub struct HardwareProperties { /// Maximum `CubeDim` in x, y, and z dimensions pub max_cube_dim: (u32, u32, u32), /// Number of streaming multiprocessors (SM), if available - pub num_streaming_multiprocessors: Option<u32>, + pub streaming_multiprocessor_count: Option<u32>, /// Number of available parallel cpu units, if the runtime is CPU. - pub num_cpu_cores: Option<u32>, + pub cpu_core_count: Option<u32>, /// Number of tensor cores per SM, if any - pub num_tensor_cores: Option<u32>, + pub tensor_core_count: Option<u32>, /// The minimum tiling dimension for a single axis in tensor cores. /// /// For a backend that only supports 16x16x16, the value would be 16. 
diff --git a/crates/cubecl-opt/src/instructions.rs b/crates/cubecl-opt/src/instructions.rs index 1cfae9355..a6cc679d4 100644 --- a/crates/cubecl-opt/src/instructions.rs +++ b/crates/cubecl-opt/src/instructions.rs @@ -257,12 +257,12 @@ impl Optimizer { Metadata::Rank { var } => { visit_read(self, var); } - Metadata::Stride { dim, var } => { - visit_read(self, dim); + Metadata::Stride { axis, var } => { + visit_read(self, axis); visit_read(self, var); } - Metadata::Shape { dim, var } => { - visit_read(self, dim); + Metadata::Shape { axis, var } => { + visit_read(self, axis); visit_read(self, var); } Metadata::Length { var } => { diff --git a/crates/cubecl-runtime/src/server.rs b/crates/cubecl-runtime/src/server.rs index e7fd84e1f..1d96cf1eb 100644 --- a/crates/cubecl-runtime/src/server.rs +++ b/crates/cubecl-runtime/src/server.rs @@ -927,7 +927,7 @@ impl CubeDim { let plane_count = Self::calculate_plane_count_per_cube( working_units as u32, plane_size, - properties.hardware.num_cpu_cores, + properties.hardware.cpu_core_count, ); // Make sure it respects the max units per cube (especially on wasm) @@ -939,9 +939,9 @@ impl CubeDim { fn calculate_plane_count_per_cube( working_units: u32, plane_dim: u32, - num_cpu_cores: Option<u32>, + cpu_core_count: Option<u32>, ) -> u32 { - match num_cpu_cores { + match cpu_core_count { Some(num_cores) => core::cmp::min(num_cores, working_units), None => { let plane_count_max = core::cmp::max(1, working_units / plane_dim); diff --git a/crates/cubecl-runtime/tests/dummy/server.rs b/crates/cubecl-runtime/tests/dummy/server.rs index eb0695c92..f083d38c6 100644 --- a/crates/cubecl-runtime/tests/dummy/server.rs +++ b/crates/cubecl-runtime/tests/dummy/server.rs @@ -270,10 +270,10 @@ impl DummyServer { max_cube_count: (u16::MAX as u32, u16::MAX as u32, u16::MAX as u32), max_units_per_cube: 1024, max_cube_dim: (1024, 1024, 64), - num_streaming_multiprocessors: None, - num_tensor_cores: None, + streaming_multiprocessor_count: None, + tensor_core_count: 
None, min_tensor_cores_dim: None, - num_cpu_cores: None, + cpu_core_count: None, }; let features = Features::default(); let timing_method = cubecl_common::profile::TimingMethod::System; diff --git a/crates/cubecl-spirv/src/compiler.rs b/crates/cubecl-spirv/src/compiler.rs index e32da9b7e..0c3bcad72 100644 --- a/crates/cubecl-spirv/src/compiler.rs +++ b/crates/cubecl-spirv/src/compiler.rs @@ -178,7 +178,7 @@ impl Compiler for SpirvCompiler { .collect(); all_meta.sort_by_key(|(id, _)| *id); - let num_meta = all_meta.len(); + let meta_count = all_meta.len(); for (_, has_extended_meta) in all_meta.iter() { ext_meta_pos.push(num_ext); @@ -190,7 +190,7 @@ impl Compiler for SpirvCompiler { self.cube_dim = value.cube_dim; self.mode = mode; self.addr_type = addr_type; - self.metadata = Metadata::new(num_meta as u32, num_ext); + self.metadata = Metadata::new(meta_count as u32, num_ext); self.compilation_options = compilation_options.clone(); self.ext_meta_pos = ext_meta_pos; diff --git a/crates/cubecl-spirv/src/metadata.rs b/crates/cubecl-spirv/src/metadata.rs index 9717757fe..c30b42ca2 100644 --- a/crates/cubecl-spirv/src/metadata.rs +++ b/crates/cubecl-spirv/src/metadata.rs @@ -30,9 +30,9 @@ impl SpirvCompiler { let out = self.compile_variable(out); self.buffer_length(&var, Some(&out), uniform); } - Metadata::Stride { dim, var } => { + Metadata::Stride { axis, var } => { let var = self.compile_variable(var); - let dim = self.compile_variable(dim); + let axis = self.compile_variable(axis); let out = self.compile_variable(out); let ty_id = out.item().id(self); @@ -43,16 +43,16 @@ impl SpirvCompiler { let offs_offset = self.metadata.stride_offset_index(pos); let offset = self.load_const_metadata(offs_offset, None, out.item()); - let dim_id = self.read_as(&dim, &out.item()); + let axis_id = self.read_as(&axis, &out.item()); - let index = self.i_add(ty_id, None, offset, dim_id).unwrap(); + let index = self.i_add(ty_id, None, offset, axis_id).unwrap(); 
self.mark_uniformity(index, uniform); let index = Variable::Raw(index, out.item()); self.load_dyn_metadata(&index, &out, out.item()); } - Metadata::Shape { dim, var } => { + Metadata::Shape { axis, var } => { let var = self.compile_variable(var); - let dim = self.compile_variable(dim); + let axis = self.compile_variable(axis); let out = self.compile_variable(out); let ty_id = out.item().id(self); @@ -63,9 +63,9 @@ impl SpirvCompiler { let offs_offset = self.metadata.shape_offset_index(pos); let offset = self.load_const_metadata(offs_offset, None, out.item()); - let dim_id = self.read_as(&dim, &out.item()); + let axis_id = self.read_as(&axis, &out.item()); - let index = self.i_add(ty_id, None, offset, dim_id).unwrap(); + let index = self.i_add(ty_id, None, offset, axis_id).unwrap(); let index = Variable::Id(index); self.load_dyn_metadata(&index, &out, out.item()); } diff --git a/crates/cubecl-std/src/tensor/contiguous/base.rs b/crates/cubecl-std/src/tensor/contiguous/base.rs index 7646178d9..8ce4c175a 100644 --- a/crates/cubecl-std/src/tensor/contiguous/base.rs +++ b/crates/cubecl-std/src/tensor/contiguous/base.rs @@ -15,7 +15,7 @@ use cubecl_core::{ tensor_line_size_parallel, }; -pub const NUM_SM_APPROX: u32 = 50; +pub const SM_COUNT_APPROX: u32 = 50; /// Returns the offset of the tensor corresponding to the layout tensor. 
#[cube] @@ -23,15 +23,15 @@ pub fn index_offset_with_layout( tensor: &Tensor>, layout: &Tensor>, offset_layout: usize, - dim_start: usize, - dim_end: usize, + axis_start: usize, + axis_end: usize, #[comptime] unroll: bool, ) -> usize { let offset_ref = offset_layout * tensor.line_size(); let mut offset = 0; #[unroll(unroll)] - for i in dim_start..dim_end { + for i in axis_start..axis_end { let ogwl = offset_ref / layout.stride(i); offset += ogwl % tensor.shape(i) * tensor.stride(i); } @@ -55,10 +55,10 @@ pub fn index_offset_contiguous( #[unroll(unroll)] for i in 0..rank { - let dim = rank - i - 1; - let shape = tensor.shape(dim); + let axis = rank - i - 1; + let shape = tensor.shape(axis); let ogwl = remainder % shape; - offset += ogwl * tensor.stride(dim); + offset += ogwl * tensor.stride(axis); remainder /= shape; } @@ -81,10 +81,10 @@ pub fn index_offset_contiguous_fastdivmod( #[unroll] for i in 0..rank { - let dim = rank - i - 1; + let axis = rank - i - 1; - let (rem, ogwl) = shape[dim].div_mod(remainder); - offset += ogwl * stride[dim]; + let (rem, ogwl) = shape[axis].div_mod(remainder); + offset += ogwl * stride[axis]; remainder = rem; } @@ -160,7 +160,7 @@ fn index_packed( tensor: &Tensor, pos: usize, in_shape: &Sequence>, - #[comptime] packed_dim: usize, + #[comptime] packed_axis: usize, #[comptime] packing: usize, #[comptime] rank: usize, ) -> N { @@ -179,14 +179,14 @@ fn index_packed( #[unroll] for i in 0..rank { - let dim = rank - i - 1; - let (rem, mut local_pos) = in_shape[dim].div_mod(remainder); + let axis = rank - i - 1; + let (rem, mut local_pos) = in_shape[axis].div_mod(remainder); remainder = rem; - if dim == packed_dim { + if axis == packed_axis { packing_offset = local_pos % packing; local_pos /= packing; } - offset += local_pos * tensor.stride(dim); + offset += local_pos * tensor.stride(axis); } let packed_val = tensor[offset]; let shift_in = packing_offset * bits_per_elem; @@ -204,7 +204,7 @@ fn copy_kernel_packed( output: &mut Tensor>, 
out_layout: LinearLayout, in_shape: Sequence>, - #[comptime] packed_dim: usize, + #[comptime] packed_axis: usize, #[comptime] packing: usize, #[comptime] rank: usize, #[comptime] elems_per_thread: usize, @@ -230,7 +230,7 @@ fn copy_kernel_packed( for k in 0..line_size { let offset_input = offset_input + offset + k; - reg[k] = index_packed(input, offset_input, &in_shape, packed_dim, packing, rank); + reg[k] = index_packed(input, offset_input, &in_shape, packed_axis, packing, rank); } registers[i] = reg; } @@ -254,7 +254,7 @@ fn copy_kernel_packed( pub fn into_contiguous_packed( client: &ComputeClient, input: &TensorHandleRef<'_, R>, - packed_dim: usize, + packed_axis: usize, shape: &[usize], packing: usize, dtype: StorageType, @@ -274,7 +274,7 @@ pub fn into_contiguous_packed( client, input, &output.as_ref(), - packed_dim, + packed_axis, shape, packing, dtype, @@ -290,9 +290,9 @@ pub fn copy_gpu_ref( output: &TensorHandleRef<'_, R>, dtype: StorageType, ) -> Result<(), LaunchError> { - let num_elems: usize = input.shape.iter().product(); + let elem_count: usize = input.shape.iter().product(); - // Vectorization is only enabled when the last dimension is contiguous. + // Line size is only enabled when the last dimension is contiguous. 
let in_rank = input.strides.len(); let out_rank = output.strides.len(); let line_size_in = tensor_line_size_parallel( @@ -309,38 +309,38 @@ pub fn copy_gpu_ref( ); let line_size = line_size_in.min(line_size_out); - let num_vecs = num_elems / line_size as usize; - let num_sm = client + let vec_count = elem_count / line_size as usize; + let sm_count = client .properties() .hardware - .num_streaming_multiprocessors - .unwrap_or(NUM_SM_APPROX); - let cube_dim = CubeDim::new(client, num_vecs); - let simul_vecs = num_sm * cube_dim.num_elems(); - let mut elems_per_unit = match num_vecs / simul_vecs as usize { + .streaming_multiprocessor_count + .unwrap_or(SM_COUNT_APPROX); + let cube_dim = CubeDim::new(client, vec_count); + let simul_vecs = sm_count * cube_dim.num_elems(); + let mut elems_per_unit = match vec_count / simul_vecs as usize { 0..2 => 1, 2..4 => 2, 4..8 => 4, 8.. => 8, }; - let mut num_elems_per_unit = line_size as usize * elems_per_unit; + let mut elems_per_unit_count = line_size as usize * elems_per_unit; - let last_dim = output.shape[out_rank - 1]; + let last_axis = output.shape[out_rank - 1]; - // If tensor is strided, elems_per_unit must be compatible with last dim - while !last_dim.is_multiple_of(num_elems_per_unit as usize) { + // If tensor is strided, elems_per_unit must be compatible with last axis + while !last_axis.is_multiple_of(elems_per_unit_count as usize) { elems_per_unit /= 2; - num_elems_per_unit /= 2; + elems_per_unit_count /= 2; } let out_vec = if line_size > 1 { line_size } else { - // Recompute because it needs to account for `num_elems_per_unit` + // Recompute because it needs to account for `elems_per_unit_count` client .io_optimized_line_sizes(&dtype) - .filter(|it| num_elems_per_unit.is_multiple_of(*it)) + .filter(|it| elems_per_unit_count.is_multiple_of(*it)) .max() .unwrap_or(1) }; @@ -353,7 +353,7 @@ pub fn copy_gpu_ref( let cube_count = calculate_cube_count_elemwise( client, - num_elems.div_ceil(num_elems_per_unit as usize), + 
elem_count.div_ceil(elems_per_unit_count as usize), cube_dim, ); @@ -381,47 +381,47 @@ pub fn into_contiguous_packed_ref( client: &ComputeClient, input: &TensorHandleRef<'_, R>, output: &TensorHandleRef<'_, R>, - packed_dim: usize, + packed_axis: usize, shape: &[usize], packing: usize, dtype: StorageType, ) -> Result<(), LaunchError> { - let num_elems: usize = input.shape.iter().product(); + let elem_count: usize = input.shape.iter().product(); - // Vectorization is only enabled when the last dimension is contiguous. + // Line size is only enabled when the last dimension is contiguous. let in_rank = input.strides.len(); let out_rank = output.strides.len(); - let in_packed_dim = in_rank - packed_dim - 1; + let in_packed_axis = in_rank - packed_axis - 1; let line_size = tensor_line_size_parallel( client.io_optimized_line_sizes(&dtype), output.shape, output.strides, out_rank - 1, ); - let num_vecs = num_elems / line_size as usize; - let num_sm = client + let vec_count = elem_count / line_size as usize; + let sm_count = client .properties() .hardware - .num_streaming_multiprocessors - .unwrap_or(NUM_SM_APPROX); + .streaming_multiprocessor_count + .unwrap_or(SM_COUNT_APPROX); - let cube_dim = CubeDim::new(client, num_vecs); - let simul_vecs = num_sm * cube_dim.num_elems(); - let mut elems_per_unit = match num_vecs / simul_vecs as usize { + let cube_dim = CubeDim::new(client, vec_count); + let simul_vecs = sm_count * cube_dim.num_elems(); + let mut elems_per_unit = match vec_count / simul_vecs as usize { 0..2 => 1, 2..4 => 2, 4..8 => 4, 8.. 
=> 8, }; - let mut num_elems_per_unit = line_size as usize * elems_per_unit; + let mut elems_per_unit_count = line_size as usize * elems_per_unit; - let last_dim = output.shape[out_rank - 1]; + let last_axis = output.shape[out_rank - 1]; - // If tensor is strided, elems_per_unit must be compatible with last dim - while !last_dim.is_multiple_of(num_elems_per_unit as usize) { + // If tensor is strided, elems_per_unit must be compatible with last axis + while !last_axis.is_multiple_of(elems_per_unit_count as usize) { elems_per_unit /= 2; - num_elems_per_unit /= 2; + elems_per_unit_count /= 2; } let out_layout = LinearLayoutArgs::from_handle(client, output, line_size); @@ -431,7 +431,7 @@ pub fn into_contiguous_packed_ref( .max(output.required_address_type()); let cube_count = calculate_cube_count_elemwise( client, - num_elems.div_ceil(num_elems_per_unit as usize), + elem_count.div_ceil(elems_per_unit_count as usize), cube_dim, ); @@ -449,7 +449,7 @@ pub fn into_contiguous_packed_ref( output.as_tensor_arg(line_size), out_layout, in_shape, - in_packed_dim, + in_packed_axis, packing, in_rank, elems_per_unit, diff --git a/crates/cubecl-std/src/tensor/contiguous/launch.rs b/crates/cubecl-std/src/tensor/contiguous/launch.rs index 23bf55756..bd77eeac0 100644 --- a/crates/cubecl-std/src/tensor/contiguous/launch.rs +++ b/crates/cubecl-std/src/tensor/contiguous/launch.rs @@ -9,9 +9,9 @@ pub fn into_contiguous_ref( input: &TensorHandleRef<'_, R>, dtype: StorageType, ) -> Result, LaunchError> { - let num_elems: usize = input.shape.iter().product(); + let elem_count: usize = input.shape.iter().product(); - let handle = client.empty(num_elems * dtype.size()); + let handle = client.empty(elem_count * dtype.size()); let output = TensorHandle::new_contiguous(input.shape.to_vec(), handle, dtype); copy_into(client, input, &output.as_ref(), dtype)?; @@ -48,7 +48,7 @@ pub fn copy_into( // It's normally faster on all devices, but since it doesn't parallelize on an axis, it // might be 
worst on GPU. Should tune at some point. - let is_cpu = client.properties().hardware.num_cpu_cores.is_some(); + let is_cpu = client.properties().hardware.cpu_core_count.is_some(); if input.strides[rank - 1] != 1 && is_cpu { launch_copy_perpendicular_ref(client, input, output, dtype)?; } else { diff --git a/crates/cubecl-std/src/tensor/contiguous/perpendicular.rs b/crates/cubecl-std/src/tensor/contiguous/perpendicular.rs index 20cbb17a3..6179696bd 100644 --- a/crates/cubecl-std/src/tensor/contiguous/perpendicular.rs +++ b/crates/cubecl-std/src/tensor/contiguous/perpendicular.rs @@ -7,7 +7,7 @@ use cubecl_core::{ use std::cmp::min; /// Kernel for converting a non-contiguous tensor into a contiguous one when -/// the vectorization axis is perpendicular to the last dimension. +/// the line-size axis is perpendicular to the last dimension. /// /// This kernel handles the case where memory is laid out such that the unit-stride /// is not on the last dimension, requiring a "gather-and-transpose" pattern @@ -16,13 +16,13 @@ use std::cmp::min; fn copy_perpendicular( input: &Tensor>, output: &mut Tensor>, - axis_vectorized: usize, + axis_lined: usize, #[define(N)] _elem: StorageType, ) { let line_size = input.line_size(); let last_axis = input.rank() - 1; - // Calculate how many vectorized lines fit into the last dimension's shape. + // Calculate how many line-sized chunks fit into the last axis shape. let num_batch = output.shape(last_axis) / line_size; // Local registers to perform a small in-register transpose. @@ -34,9 +34,9 @@ fn copy_perpendicular( } let channel_input_stride_elem = input.stride(last_axis); - let channel_output_stride_elem = output.stride(axis_vectorized); + let channel_output_stride_elem = output.stride(axis_lined); - // Strides adjusted for vectorization (line_size). + // Strides adjusted for line size. 
let channel_input_stride = channel_input_stride_elem / line_size; let channel_output_stride = channel_output_stride_elem / line_size; @@ -96,8 +96,8 @@ fn copy_perpendicular( /// Launches the perpendicular contiguous kernel. /// /// This is used when the input tensor's memory layout is such that the last dimension -/// is not the one with a stride of 1 (the vectorized dimension). It optimizes -/// the copy by using hardware vectorization (Lines) and an in-register transpose. +/// is not the one with a stride of 1 (the line-size axis). It optimizes +/// the copy by using hardware line sizes (`Line`) and an in-register transpose. pub fn launch_into_contiguous_perpendicular( client: &ComputeClient, input: &TensorHandleRef<'_, R>, @@ -117,8 +117,8 @@ pub fn launch_into_contiguous_perpendicular( /// Launches the perpendicular contiguous kernel. /// /// This is used when the input tensor's memory layout is such that the last dimension -/// is not the one with a stride of 1 (the vectorized dimension). It optimizes -/// the copy by using hardware vectorization (Lines) and an in-register transpose. +/// is not the one with a stride of 1 (the line-size axis). It optimizes +/// the copy by using hardware line sizes (`Line`) and an in-register transpose. 
pub fn launch_copy_perpendicular_ref( client: &ComputeClient, input: &TensorHandleRef<'_, R>, @@ -149,8 +149,8 @@ pub fn launch_copy_perpendicular_ref( ); let line_size = min(line_size_perpendicular, line_size_parallel); - let num_elems = output.shape.iter().product::(); - let working_units = num_elems / (line_size as usize * output.shape[rank - 1]); + let elem_count = output.shape.iter().product::(); + let working_units = elem_count / (line_size as usize * output.shape[rank - 1]); let cube_dim = CubeDim::new(client, working_units); let cube_count = calculate_cube_count_elemwise(client, working_units, cube_dim); let address_type = input diff --git a/crates/cubecl-std/src/tensor/identity.rs b/crates/cubecl-std/src/tensor/identity.rs index 26a843e7c..3e18ad2dc 100644 --- a/crates/cubecl-std/src/tensor/identity.rs +++ b/crates/cubecl-std/src/tensor/identity.rs @@ -54,7 +54,7 @@ pub fn launch_ref( "input should be a square matrix" ); - let vectorization_factor = tensor_line_size_parallel( + let line_size = tensor_line_size_parallel( R::supported_line_sizes().iter().cloned(), output.shape, output.strides, @@ -62,7 +62,7 @@ pub fn launch_ref( ); let cube_dim = CubeDim::new_2d(16, 16); - let lines_x = output.shape[1] as u32 / vectorization_factor as u32; + let lines_x = output.shape[1] as u32 / line_size as u32; let cube_count_x = lines_x.div_ceil(cube_dim.x); let cube_count_y = (output.shape[0] as u32).div_ceil(cube_dim.y); let cube_count = CubeCount::new_2d(cube_count_x, cube_count_y); @@ -77,7 +77,7 @@ pub fn launch_ref( output.handle, output.strides, output.shape, - vectorization_factor, + line_size, dtype.size(), ), ScalarArg::new(output.strides[0] + 1), diff --git a/crates/cubecl-std/src/tensor/virtual.rs b/crates/cubecl-std/src/tensor/virtual.rs index 9aa488511..cf1bbb3bc 100644 --- a/crates/cubecl-std/src/tensor/virtual.rs +++ b/crates/cubecl-std/src/tensor/virtual.rs @@ -329,9 +329,9 @@ impl VirtualTensor { #[cube] impl VirtualTensor { - pub fn coordinate(&self, 
index: usize, dim: usize) -> usize { - let num_strides = index / self.stride(dim); - num_strides % self.shape(dim) + pub fn coordinate(&self, index: usize, axis: usize) -> usize { + let stride_count = index / self.stride(axis); + stride_count % self.shape(axis) } } diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/base.rs b/crates/cubecl-wgpu/src/compiler/wgsl/base.rs index a855d6c98..c179e178c 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/base.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/base.rs @@ -251,10 +251,10 @@ impl Item { } pub fn size(&self) -> usize { - self.elem().size() * self.vectorization_factor() + self.elem().size() * self.line_size() } - pub fn vectorization_factor(&self) -> usize { + pub fn line_size(&self) -> usize { match self { Item::Vec4(_) => 4, Item::Vec3(_) => 3, diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs b/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs index 6127419ab..e7eef4585 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/compiler.rs @@ -104,7 +104,7 @@ impl WgslCompiler { self.strategy = mode; - let num_meta = value.buffers.len(); + let meta_count = value.buffers.len(); self.ext_meta_pos = Vec::new(); let mut num_ext = 0; @@ -116,7 +116,7 @@ impl WgslCompiler { } } - self.metadata = Metadata::new(num_meta as u32, num_ext); + self.metadata = Metadata::new(meta_count as u32, num_ext); let address_type = self.compile_storage_type(address_type); let instructions = self.compile_scope(&mut value.body); @@ -685,21 +685,21 @@ impl WgslCompiler { info_offset: self.compile_variable(offset.into()), } } - cube::Metadata::Stride { dim, var } => { + cube::Metadata::Stride { axis, var } => { let position = self.ext_meta_pos(&var); let offset = self.metadata.stride_offset_index(position); wgsl::Instruction::ExtendedMeta { info_offset: self.compile_variable(offset.into()), - dim: self.compile_variable(dim), + dim: self.compile_variable(axis), out: 
self.compile_variable(out), } } - cube::Metadata::Shape { dim, var } => { + cube::Metadata::Shape { axis, var } => { let position = self.ext_meta_pos(&var); let offset = self.metadata.shape_offset_index(position); wgsl::Instruction::ExtendedMeta { info_offset: self.compile_variable(offset.into()), - dim: self.compile_variable(dim), + dim: self.compile_variable(axis), out: self.compile_variable(out), } } diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/extension.rs b/crates/cubecl-wgpu/src/compiler/wgsl/extension.rs index 370514706..022a6f2f5 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/extension.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/extension.rs @@ -150,7 +150,7 @@ pub fn call_safe_tanh( } fn should_use_scalar_powf(rhs: &Variable) -> bool { - rhs.is_always_scalar() || rhs.item().vectorization_factor() == 1 + rhs.is_always_scalar() || rhs.item().line_size() == 1 } pub fn call_is_nan( @@ -174,7 +174,7 @@ pub fn call_is_inf( } fn construct_vectorized_name(base_name: &str, item: Item) -> String { - let vec_factor = item.vectorization_factor(); + let vec_factor = item.line_size(); let elem = item.elem(); format!("{base_name}_{vec_factor}_{elem}") } @@ -196,7 +196,7 @@ fn construct_vector( output: Item, ) -> core::fmt::Result { let in_item = inputs[0].item; - let vec_factor = output.vectorization_factor(); + let vec_factor = output.line_size(); let function_name = construct_vectorized_name(base_name, in_item); let primitive_name = construct_primitive_name(primitive_name, *in_item.elem()); write!(f, "fn {function_name}(")?; diff --git a/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs b/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs index 83077f095..171979f86 100644 --- a/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs +++ b/crates/cubecl-wgpu/src/compiler/wgsl/instructions.rs @@ -519,7 +519,7 @@ impl Display for Instruction { } } Instruction::And { lhs, rhs, out } => { - let line_size = out.item().vectorization_factor(); + let line_size = 
out.item().line_size(); if out.is_atomic() { assert_eq!(lhs, out, "Can't use regular and on atomic"); writeln!(f, "atomicAnd({out}, {rhs});") @@ -539,7 +539,7 @@ impl Display for Instruction { } } Instruction::Or { lhs, rhs, out } => { - let line_size = out.item().vectorization_factor(); + let line_size = out.item().line_size(); if out.is_atomic() { assert_eq!(lhs, out, "Can't use regular or on atomic"); writeln!(f, "atomicOr({out}, {rhs});") @@ -749,8 +749,8 @@ impl Display for Instruction { Instruction::GreaterEqual { lhs, rhs, out } => comparison(lhs, rhs, out, ">=", f), Instruction::NotEqual { lhs, rhs, out } => comparison(lhs, rhs, out, "!=", f), Instruction::Assign { input, out } => { - let vec_left = out.item().vectorization_factor(); - let vec_right = input.item().vectorization_factor(); + let vec_left = out.item().line_size(); + let vec_right = input.item().line_size(); if out.elem().is_atomic() { if !input.is_atomic() { @@ -1087,7 +1087,7 @@ for (var {i}: {i_ty} = {start}; {i} {cmp} {end}; {increment}) {{ writeln!(f, "{out} = length({input});") } Instruction::Normalize { input, out } => { - if input.item().vectorization_factor() == 1 { + if input.item().line_size() == 1 { // We need a check for vectorization factor 1 here, for compatibility with cuda. // You can almost use sign here, however that does not correctly handle the case for x == 0.0. // Therefore we use normalize with vec2, as there is no way to use a NaN literal in wgsl. @@ -1101,7 +1101,7 @@ for (var {i}: {i_ty} = {start}; {i} {cmp} {end}; {increment}) {{ } Instruction::Dot { lhs, rhs, out } => { let out = out.fmt_left(); - if lhs.item().vectorization_factor() == 1 { + if lhs.item().line_size() == 1 { writeln!(f, "{out} = {lhs} * {rhs};") } else { writeln!(f, "{out} = dot({lhs}, {rhs});") @@ -1176,8 +1176,8 @@ fn index( len: Option<&Variable>, ) -> core::fmt::Result { let is_scalar = match lhs { - Variable::LocalMut { item, .. 
} => item.vectorization_factor() == 1, - Variable::LocalConst { item, .. } => item.vectorization_factor() == 1, + Variable::LocalMut { item, .. } => item.line_size() == 1, + Variable::LocalConst { item, .. } => item.line_size() == 1, Variable::Constant(..) => true, _ => false, }; @@ -1288,14 +1288,14 @@ fn index_assign( let item_out = out.item(); let lhs = IndexOffset::new(lhs, &offset, 0); - let vectorization_factor = item_out.vectorization_factor(); - if vectorization_factor > item_rhs.vectorization_factor() { + let line_size = item_out.line_size(); + if line_size > item_rhs.line_size() { let casting_type = Item::Scalar(*item_out.elem()); - write!(f, "{out}[{lhs}] = vec{vectorization_factor}(")?; - for i in 0..vectorization_factor { + write!(f, "{out}[{lhs}] = vec{line_size}(")?; + for i in 0..line_size { f.write_str(&rhs.index(i).fmt_cast(casting_type))?; - if i < vectorization_factor - 1 { + if i < line_size - 1 { f.write_str(",")?; } } diff --git a/crates/cubecl-wgpu/src/runtime.rs b/crates/cubecl-wgpu/src/runtime.rs index a5bd96707..5bb09d336 100644 --- a/crates/cubecl-wgpu/src/runtime.rs +++ b/crates/cubecl-wgpu/src/runtime.rs @@ -244,10 +244,10 @@ pub(crate) fn create_server(setup: WgpuSetup, options: RuntimeOptions) -> WgpuSe adapter_limits.max_compute_workgroup_size_y, adapter_limits.max_compute_workgroup_size_z, ), - num_streaming_multiprocessors: None, - num_tensor_cores: None, + streaming_multiprocessor_count: None, + tensor_core_count: None, min_tensor_cores_dim: None, - num_cpu_cores: None, // TODO: Check if device is CPU. + cpu_core_count: None, // TODO: Check if device is CPU. }; let mut compilation_options = Default::default();