From c99c7b517619b49cf3b93723ba2547d7b68de84b Mon Sep 17 00:00:00 2001 From: Rishi Yadav Date: Fri, 5 Dec 2025 10:19:27 +0530 Subject: [PATCH 1/7] Update xe_vnni_2d.cpp --- test/unit/cute/intel_xe/xe_vnni_2d.cpp | 371 +++++++++++++++++++++++++ 1 file changed, 371 insertions(+) diff --git a/test/unit/cute/intel_xe/xe_vnni_2d.cpp b/test/unit/cute/intel_xe/xe_vnni_2d.cpp index 2112e474b0..5bc36338b7 100644 --- a/test/unit/cute/intel_xe/xe_vnni_2d.cpp +++ b/test/unit/cute/intel_xe/xe_vnni_2d.cpp @@ -60,6 +60,377 @@ TEST(CuTe_Xe, XE_LOAD_2D_VNNI_API_Declaration) { EXPECT_TRUE(true) << "XE_LOAD_2D_VNNI API types declared successfully"; } +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_MinimalConfigs) { + // Test minimal 8-bit VNNI configurations + using VNNIOp_8bit_1x32 = XE_LOAD_2D_VNNI<8, 1, 32>; + using VNNIOp_8bit_2x32 = XE_LOAD_2D_VNNI<8, 2, 32>; + using VNNIOp_8bit_4x32 = XE_LOAD_2D_VNNI<8, 4, 32>; + + static_assert(VNNIOp_8bit_1x32::CopyBits == 8); + static_assert(VNNIOp_8bit_1x32::AtomHeight == 1); + static_assert(VNNIOp_8bit_1x32::AtomWidth == 32); + + static_assert(VNNIOp_8bit_2x32::CopyBits == 8); + static_assert(VNNIOp_8bit_2x32::AtomHeight == 2); + static_assert(VNNIOp_8bit_2x32::AtomWidth == 32); + + static_assert(VNNIOp_8bit_4x32::CopyBits == 8); + static_assert(VNNIOp_8bit_4x32::AtomHeight == 4); + static_assert(VNNIOp_8bit_4x32::AtomWidth == 32); + + EXPECT_TRUE(true) << "8-bit minimal VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_MediumConfigs) { + // Test medium-sized 8-bit VNNI configurations + using VNNIOp_8bit_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; + using VNNIOp_8bit_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; + + static_assert(VNNIOp_8bit_8x32::CopyBits == 8); + static_assert(VNNIOp_8bit_8x32::AtomHeight == 8); + static_assert(VNNIOp_8bit_8x32::AtomWidth == 32); + + static_assert(VNNIOp_8bit_16x32::CopyBits == 8); + static_assert(VNNIOp_8bit_16x32::AtomHeight == 16); + static_assert(VNNIOp_8bit_16x32::AtomWidth == 32); + + EXPECT_TRUE(true) << "8-bit medium VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_WideConfigs) { + // Test 8-bit VNNI configurations with wider widths + using VNNIOp_8bit_2x64 = XE_LOAD_2D_VNNI<8, 2, 64>; + using VNNIOp_8bit_4x64 = XE_LOAD_2D_VNNI<8, 4, 64>; + using VNNIOp_8bit_8x64 = XE_LOAD_2D_VNNI<8, 8, 64>; + + static_assert(VNNIOp_8bit_2x64::CopyBits == 8); + static_assert(VNNIOp_8bit_2x64::AtomHeight == 2); + static_assert(VNNIOp_8bit_2x64::AtomWidth == 64); + + static_assert(VNNIOp_8bit_4x64::CopyBits == 8); + static_assert(VNNIOp_8bit_4x64::AtomHeight == 4); + static_assert(VNNIOp_8bit_4x64::AtomWidth == 64); + + static_assert(VNNIOp_8bit_8x64::CopyBits == 8); + static_assert(VNNIOp_8bit_8x64::AtomHeight == 8); + static_assert(VNNIOp_8bit_8x64::AtomWidth == 64); + + EXPECT_TRUE(true) << "8-bit wide VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_MinimalConfigs) { + // Test minimal 16-bit VNNI configurations + using VNNIOp_16bit_1x16 = XE_LOAD_2D_VNNI<16, 1, 16>; + using VNNIOp_16bit_2x16 = XE_LOAD_2D_VNNI<16, 2, 16>; + using VNNIOp_16bit_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>; + + static_assert(VNNIOp_16bit_1x16::CopyBits == 16); + static_assert(VNNIOp_16bit_1x16::AtomHeight == 1); + static_assert(VNNIOp_16bit_1x16::AtomWidth == 16); + + static_assert(VNNIOp_16bit_2x16::CopyBits == 16); + static_assert(VNNIOp_16bit_2x16::AtomHeight == 2); + static_assert(VNNIOp_16bit_2x16::AtomWidth == 16); + + static_assert(VNNIOp_16bit_4x16::CopyBits == 16); + static_assert(VNNIOp_16bit_4x16::AtomHeight == 4); + 
static_assert(VNNIOp_16bit_4x16::AtomWidth == 16); + + EXPECT_TRUE(true) << "16-bit minimal VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_MediumConfigs) { + // Test medium-sized 16-bit VNNI configurations + using VNNIOp_16bit_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; + using VNNIOp_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; + + static_assert(VNNIOp_16bit_8x16::CopyBits == 16); + static_assert(VNNIOp_16bit_8x16::AtomHeight == 8); + static_assert(VNNIOp_16bit_8x16::AtomWidth == 16); + + static_assert(VNNIOp_16bit_16x16::CopyBits == 16); + static_assert(VNNIOp_16bit_16x16::AtomHeight == 16); + static_assert(VNNIOp_16bit_16x16::AtomWidth == 16); + + EXPECT_TRUE(true) << "16-bit medium VNNI configurations validated"; +} + + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_WideConfigs) { + // Test 16-bit VNNI configurations with wider widths + using VNNIOp_16bit_2x32 = XE_LOAD_2D_VNNI<16, 2, 32>; + using VNNIOp_16bit_4x32 = XE_LOAD_2D_VNNI<16, 4, 32>; + using VNNIOp_16bit_8x32 = XE_LOAD_2D_VNNI<16, 8, 32>; + + static_assert(VNNIOp_16bit_2x32::CopyBits == 16); + static_assert(VNNIOp_16bit_2x32::AtomHeight == 2); + static_assert(VNNIOp_16bit_2x32::AtomWidth == 32); + + static_assert(VNNIOp_16bit_4x32::CopyBits == 16); + static_assert(VNNIOp_16bit_4x32::AtomHeight == 4); + static_assert(VNNIOp_16bit_4x32::AtomWidth == 32); + + static_assert(VNNIOp_16bit_8x32::CopyBits == 16); + static_assert(VNNIOp_16bit_8x32::AtomHeight == 8); + static_assert(VNNIOp_16bit_8x32::AtomWidth == 32); + + EXPECT_TRUE(true) << "16-bit wide VNNI configurations validated"; +} + + + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_CustomBlockWidth) { + // Test 8-bit VNNI with custom BlockWidth parameter + using VNNIOp_8bit_4x32_bw16 = XE_LOAD_2D_VNNI<8, 4, 32, 16>; + using VNNIOp_8bit_8x32_bw16 = XE_LOAD_2D_VNNI<8, 8, 32, 16>; + using VNNIOp_8bit_16x32_bw16 = XE_LOAD_2D_VNNI<8, 16, 32, 16>; + + static_assert(VNNIOp_8bit_4x32_bw16::CopyBits == 8); + static_assert(VNNIOp_8bit_4x32_bw16::AtomHeight == 4); + static_assert(VNNIOp_8bit_4x32_bw16::AtomWidth == 32); + + static_assert(VNNIOp_8bit_8x32_bw16::CopyBits == 8); + static_assert(VNNIOp_8bit_8x32_bw16::AtomHeight == 8); + static_assert(VNNIOp_8bit_8x32_bw16::AtomWidth == 32); + + static_assert(VNNIOp_8bit_16x32_bw16::CopyBits == 8); + static_assert(VNNIOp_8bit_16x32_bw16::AtomHeight == 16); + static_assert(VNNIOp_8bit_16x32_bw16::AtomWidth == 32); + + EXPECT_TRUE(true) << "8-bit VNNI with custom BlockWidth validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_CustomBlockWidth) { + // Test 16-bit VNNI with custom BlockWidth parameter + using VNNIOp_16bit_4x16_bw8 = XE_LOAD_2D_VNNI<16, 4, 16, 8>; + using VNNIOp_16bit_8x16_bw8 = XE_LOAD_2D_VNNI<16, 8, 16, 8>; + using VNNIOp_16bit_16x16_bw8 = XE_LOAD_2D_VNNI<16, 16, 16, 8>; + + static_assert(VNNIOp_16bit_4x16_bw8::CopyBits == 16); + static_assert(VNNIOp_16bit_4x16_bw8::AtomHeight == 4); + static_assert(VNNIOp_16bit_4x16_bw8::AtomWidth == 16); + + static_assert(VNNIOp_16bit_8x16_bw8::CopyBits == 16); + static_assert(VNNIOp_16bit_8x16_bw8::AtomHeight == 8); + static_assert(VNNIOp_16bit_8x16_bw8::AtomWidth == 16); + + static_assert(VNNIOp_16bit_16x16_bw8::CopyBits == 16); + static_assert(VNNIOp_16bit_16x16_bw8::AtomHeight == 16); + static_assert(VNNIOp_16bit_16x16_bw8::AtomWidth == 16); + + EXPECT_TRUE(true) << "16-bit VNNI with custom BlockWidth validated"; +} + + + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Int8_GEMMConfigs) { + // Test typical int8 GEMM VNNI configurations for K-dimension packing + using GEMM_Int8_4x32 = XE_LOAD_2D_VNNI<8, 4, 
32>; + using GEMM_Int8_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; + using GEMM_Int8_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; + using GEMM_Int8_32x32 = XE_LOAD_2D_VNNI<8, 32, 32>; + + static_assert(GEMM_Int8_4x32::CopyBits == 8); + static_assert(GEMM_Int8_4x32::AtomHeight == 4); + static_assert(GEMM_Int8_4x32::AtomWidth == 32); + + static_assert(GEMM_Int8_8x32::CopyBits == 8); + static_assert(GEMM_Int8_8x32::AtomHeight == 8); + static_assert(GEMM_Int8_8x32::AtomWidth == 32); + + static_assert(GEMM_Int8_16x32::CopyBits == 8); + static_assert(GEMM_Int8_16x32::AtomHeight == 16); + static_assert(GEMM_Int8_16x32::AtomWidth == 32); + + static_assert(GEMM_Int8_32x32::CopyBits == 8); + static_assert(GEMM_Int8_32x32::AtomHeight == 32); + static_assert(GEMM_Int8_32x32::AtomWidth == 32); + + EXPECT_TRUE(true) << "Int8 GEMM VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_BF16_GEMMConfigs) { + // Test typical BF16/FP16 GEMM VNNI configurations for K-dimension packing + using GEMM_BF16_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>; + using GEMM_BF16_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; + using GEMM_BF16_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; + using GEMM_BF16_32x16 = XE_LOAD_2D_VNNI<16, 32, 16>; + + static_assert(GEMM_BF16_4x16::CopyBits == 16); + static_assert(GEMM_BF16_4x16::AtomHeight == 4); + static_assert(GEMM_BF16_4x16::AtomWidth == 16); + + static_assert(GEMM_BF16_8x16::CopyBits == 16); + static_assert(GEMM_BF16_8x16::AtomHeight == 8); + static_assert(GEMM_BF16_8x16::AtomWidth == 16); + + static_assert(GEMM_BF16_16x16::CopyBits == 16); + static_assert(GEMM_BF16_16x16::AtomHeight == 16); + static_assert(GEMM_BF16_16x16::AtomWidth == 16); + + static_assert(GEMM_BF16_32x16::CopyBits == 16); + static_assert(GEMM_BF16_32x16::AtomHeight == 32); + static_assert(GEMM_BF16_32x16::AtomWidth == 16); + + EXPECT_TRUE(true) << "BF16/FP16 GEMM VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MoE_GEMMConfigs) { + // Test VNNI configurations used in MoE (Mixture of Experts) GEMM + // Based on example from 12_bmg_moe_gemm_cute_interface + using MoE_Load_A = XE_LOAD_2D_VNNI<16, 32, 16, 16>; + using MoE_Load_B_Alt1 = XE_LOAD_2D_VNNI<16, 16, 16>; + using MoE_Load_B_Alt2 = XE_LOAD_2D_VNNI<16, 8, 16>; + + static_assert(MoE_Load_A::CopyBits == 16); + static_assert(MoE_Load_A::AtomHeight == 32); + static_assert(MoE_Load_A::AtomWidth == 16); + + static_assert(MoE_Load_B_Alt1::CopyBits == 16); + static_assert(MoE_Load_B_Alt1::AtomHeight == 16); + static_assert(MoE_Load_B_Alt1::AtomWidth == 16); + + static_assert(MoE_Load_B_Alt2::CopyBits == 16); + static_assert(MoE_Load_B_Alt2::AtomHeight == 8); + static_assert(MoE_Load_B_Alt2::AtomWidth == 16); + + EXPECT_TRUE(true) << "MoE GEMM VNNI configurations validated"; +} + + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MixedBlockWidthConfigs) { + // Test various BlockWidth settings for optimization + using VNNIOp_8bit_8x64_bw32 = XE_LOAD_2D_VNNI<8, 8, 64, 32>; + using VNNIOp_8bit_16x64_bw32 = XE_LOAD_2D_VNNI<8, 16, 64, 32>; + using VNNIOp_16bit_8x32_bw16 = XE_LOAD_2D_VNNI<16, 8, 32, 16>; + using VNNIOp_16bit_16x32_bw16 = XE_LOAD_2D_VNNI<16, 16, 32, 16>; + + static_assert(VNNIOp_8bit_8x64_bw32::CopyBits == 8); + static_assert(VNNIOp_8bit_8x64_bw32::AtomHeight == 8); + static_assert(VNNIOp_8bit_8x64_bw32::AtomWidth == 64); + + static_assert(VNNIOp_8bit_16x64_bw32::CopyBits == 8); + static_assert(VNNIOp_8bit_16x64_bw32::AtomHeight == 16); + static_assert(VNNIOp_8bit_16x64_bw32::AtomWidth == 64); + + static_assert(VNNIOp_16bit_8x32_bw16::CopyBits == 16); + 
static_assert(VNNIOp_16bit_8x32_bw16::AtomHeight == 8); + static_assert(VNNIOp_16bit_8x32_bw16::AtomWidth == 32); + + static_assert(VNNIOp_16bit_16x32_bw16::CopyBits == 16); + static_assert(VNNIOp_16bit_16x32_bw16::AtomHeight == 16); + static_assert(VNNIOp_16bit_16x32_bw16::AtomWidth == 32); + + EXPECT_TRUE(true) << "Mixed BlockWidth VNNI configurations validated"; +} + + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_DataType_Consistency) { + // Test that CopyBits correctly reflects the data size + using VNNIOp_8bit_small = XE_LOAD_2D_VNNI<8, 2, 16>; + using VNNIOp_8bit_large = XE_LOAD_2D_VNNI<8, 16, 64>; + using VNNIOp_16bit_small = XE_LOAD_2D_VNNI<16, 2, 16>; + using VNNIOp_16bit_large = XE_LOAD_2D_VNNI<16, 16, 32>; + + // All 8-bit variants should have CopyBits == 8 + static_assert(VNNIOp_8bit_small::CopyBits == 8); + static_assert(VNNIOp_8bit_large::CopyBits == 8); + + // All 16-bit variants should have CopyBits == 16 + static_assert(VNNIOp_16bit_small::CopyBits == 16); + static_assert(VNNIOp_16bit_large::CopyBits == 16); + + EXPECT_TRUE(true) << "Data type consistency validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_BlockWidth_Divisors) { + // Test BlockWidth as divisors of Width + using VNNIOp_8bit_8x32_bw8 = XE_LOAD_2D_VNNI<8, 8, 32, 8>; + using VNNIOp_8bit_8x32_bw16 = XE_LOAD_2D_VNNI<8, 8, 32, 16>; + using VNNIOp_8bit_8x64_bw16 = XE_LOAD_2D_VNNI<8, 8, 64, 16>; + using VNNIOp_8bit_8x64_bw32 = XE_LOAD_2D_VNNI<8, 8, 64, 32>; + using VNNIOp_16bit_8x32_bw8 = XE_LOAD_2D_VNNI<16, 8, 32, 8>; + using VNNIOp_16bit_8x32_bw16 = XE_LOAD_2D_VNNI<16, 8, 32, 16>; + + static_assert(VNNIOp_8bit_8x32_bw8::AtomHeight == 8 && VNNIOp_8bit_8x32_bw8::AtomWidth == 32); + static_assert(VNNIOp_8bit_8x32_bw16::AtomHeight == 8 && VNNIOp_8bit_8x32_bw16::AtomWidth == 32); + static_assert(VNNIOp_8bit_8x64_bw16::AtomHeight == 8 && VNNIOp_8bit_8x64_bw16::AtomWidth == 64); + static_assert(VNNIOp_8bit_8x64_bw32::AtomHeight == 8 && VNNIOp_8bit_8x64_bw32::AtomWidth == 64); + static_assert(VNNIOp_16bit_8x32_bw8::AtomHeight == 8 && VNNIOp_16bit_8x32_bw8::AtomWidth == 32); + static_assert(VNNIOp_16bit_8x32_bw16::AtomHeight == 8 && VNNIOp_16bit_8x32_bw16::AtomWidth == 32); + + EXPECT_TRUE(true) << "BlockWidth divisor configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Symmetric_Configs) { + // Test configurations with matching height and width values + using VNNIOp_8bit_16x16 = XE_LOAD_2D_VNNI<8, 16, 16>; + using VNNIOp_8bit_32x32 = XE_LOAD_2D_VNNI<8, 32, 32>; + using VNNIOp_16bit_8x8 = XE_LOAD_2D_VNNI<16, 8, 8>; + using VNNIOp_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; + using VNNIOp_16bit_32x32 = XE_LOAD_2D_VNNI<16, 32, 32>; + + static_assert(VNNIOp_8bit_16x16::AtomHeight == 16 && VNNIOp_8bit_16x16::AtomWidth == 16); + static_assert(VNNIOp_8bit_32x32::AtomHeight == 32 && VNNIOp_8bit_32x32::AtomWidth == 32); + static_assert(VNNIOp_16bit_8x8::AtomHeight == 8 && VNNIOp_16bit_8x8::AtomWidth == 8); + static_assert(VNNIOp_16bit_16x16::AtomHeight == 16 && VNNIOp_16bit_16x16::AtomWidth == 16); + static_assert(VNNIOp_16bit_32x32::AtomHeight == 32 && VNNIOp_16bit_32x32::AtomWidth == 32); + + EXPECT_TRUE(true) << "Symmetric VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Small_Tiles) { + // Test small tile configurations useful for residual/boundary handling + using VNNIOp_8bit_1x16 = XE_LOAD_2D_VNNI<8, 1, 16>; + using VNNIOp_8bit_2x16 = XE_LOAD_2D_VNNI<8, 2, 16>; + using VNNIOp_8bit_1x32 = XE_LOAD_2D_VNNI<8, 1, 32>; + using VNNIOp_16bit_1x8 = XE_LOAD_2D_VNNI<16, 1, 8>; + using VNNIOp_16bit_2x8 = XE_LOAD_2D_VNNI<16, 
2, 8>; + using VNNIOp_16bit_1x16 = XE_LOAD_2D_VNNI<16, 1, 16>; + + static_assert(VNNIOp_8bit_1x16::AtomHeight == 1 && VNNIOp_8bit_1x16::AtomWidth == 16); + static_assert(VNNIOp_8bit_2x16::AtomHeight == 2 && VNNIOp_8bit_2x16::AtomWidth == 16); + static_assert(VNNIOp_8bit_1x32::AtomHeight == 1 && VNNIOp_8bit_1x32::AtomWidth == 32); + static_assert(VNNIOp_16bit_1x8::AtomHeight == 1 && VNNIOp_16bit_1x8::AtomWidth == 8); + static_assert(VNNIOp_16bit_2x8::AtomHeight == 2 && VNNIOp_16bit_2x8::AtomWidth == 8); + static_assert(VNNIOp_16bit_1x16::AtomHeight == 1 && VNNIOp_16bit_1x16::AtomWidth == 16); + + EXPECT_TRUE(true) << "Small tile VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MatMul_Optimized) { + // Test configurations optimized for matrix multiplication (DPAS integration) + // Based on typical DPAS dimensions: N=16 for all, K varies by data type + using MatMul_8bit_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; // K=32 for int8 + using MatMul_8bit_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; // M=16 tile + using MatMul_16bit_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; // K=16 for bf16/fp16 + using MatMul_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; // M=16, N=16 for bf16/fp16 + using MatMul_16bit_32x16 = XE_LOAD_2D_VNNI<16, 32, 16>; // Larger M tile + + // Verify dimensions match DPAS requirements + static_assert(MatMul_8bit_8x32::CopyBits == 8); + static_assert(MatMul_8bit_8x32::AtomHeight == 8); + static_assert(MatMul_8bit_8x32::AtomWidth == 32); // Matches int8 DPAS K dimension + + static_assert(MatMul_8bit_16x32::CopyBits == 8); + static_assert(MatMul_8bit_16x32::AtomHeight == 16); + static_assert(MatMul_8bit_16x32::AtomWidth == 32); + + static_assert(MatMul_16bit_8x16::CopyBits == 16); + static_assert(MatMul_16bit_8x16::AtomHeight == 8); + static_assert(MatMul_16bit_8x16::AtomWidth == 16); // Matches bf16/fp16 DPAS K dimension + + static_assert(MatMul_16bit_16x16::CopyBits == 16); + static_assert(MatMul_16bit_16x16::AtomHeight == 16); + static_assert(MatMul_16bit_16x16::AtomWidth == 16); + + static_assert(MatMul_16bit_32x16::CopyBits == 16); + static_assert(MatMul_16bit_32x16::AtomHeight == 32); + static_assert(MatMul_16bit_32x16::AtomWidth == 16); + + EXPECT_TRUE(true) << "MatMul-optimized VNNI configurations validated"; +} + #else TEST(CuTe_Xe, XE_LOAD_2D_VNNI_SKIPPED) { From 6f86ced0f86bccad873a9a294d46856e06ffc104 Mon Sep 17 00:00:00 2001 From: Rishi Yadav Date: Fri, 5 Dec 2025 10:23:18 +0530 Subject: [PATCH 2/7] Update xe_copy_prefetch_2d.cpp --- .../cute/intel_xe/xe_copy_prefetch_2d.cpp | 155 +++++++++++++++++- 1 file changed, 151 insertions(+), 4 deletions(-) diff --git a/test/unit/cute/intel_xe/xe_copy_prefetch_2d.cpp b/test/unit/cute/intel_xe/xe_copy_prefetch_2d.cpp index 675b8d1c43..3659af0060 100644 --- a/test/unit/cute/intel_xe/xe_copy_prefetch_2d.cpp +++ b/test/unit/cute/intel_xe/xe_copy_prefetch_2d.cpp @@ -47,8 +47,8 @@ using namespace compat::experimental; #if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) -// Kernel name for unique identification -template +// Kernel name for unique identification - includes Bits to ensure uniqueness +template class XEPrefetch2DKernelName; // Device kernel for XE_PREFETCH_2D testing @@ -106,7 +106,7 @@ void test_xe_prefetch_2d() { // Initialize source with test pattern for (size_t i = 0; i < host_src.size(); ++i) { - host_src[i] = static_cast(i % 256); + host_src[i] = static_cast(static_cast(i % 256)); } // Copy to device @@ -122,7 +122,7 @@ void test_xe_prefetch_2d() { auto gridDim = compat::dim3(1); launch, - 
XEPrefetch2DKernelName>( + XEPrefetch2DKernelName>( launch_policy{ gridDim, blockDim, kernel_properties{sycl_exp::sub_group_size} @@ -150,6 +150,153 @@ TEST(CuTe_Xe, XE_PREFETCH_2D_float) { test_xe_prefetch_2d(); } + +// Test 4: 8-bit Minimal Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_8bit_Minimal) { + test_xe_prefetch_2d(); +} + +// Test 5: 8-bit Small Height +TEST(CuTe_Xe, XE_PREFETCH_2D_8bit_SmallHeight) { + test_xe_prefetch_2d(); +} + +// Test 6: 8-bit Medium Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_8bit_Medium) { + test_xe_prefetch_2d(); +} + +// Test 7: 8-bit Large Height +TEST(CuTe_Xe, XE_PREFETCH_2D_8bit_LargeHeight) { + test_xe_prefetch_2d(); +} + +// Test 8: 8-bit Wide Configuration (respecting 512-bit width limit) +TEST(CuTe_Xe, XE_PREFETCH_2D_8bit_Wide) { + test_xe_prefetch_2d(); // 8*64=512 bits (max) +} + +// Test 9: 16-bit Minimal Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_16bit_Minimal) { + test_xe_prefetch_2d(); +} + +// Test 10: 16-bit Small Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_16bit_Small) { + test_xe_prefetch_2d(); +} + +// Test 11: 16-bit Medium Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_16bit_Medium) { + test_xe_prefetch_2d(); +} + +// Test 12: 16-bit Large Height +TEST(CuTe_Xe, XE_PREFETCH_2D_16bit_LargeHeight) { + test_xe_prefetch_2d(); +} + +// Test 13: 16-bit Wide Configuration (respecting 512-bit width limit) +TEST(CuTe_Xe, XE_PREFETCH_2D_16bit_Wide) { + test_xe_prefetch_2d(); // 16*32=512 bits (max) +} + +// Test 14: 32-bit Minimal Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_32bit_Minimal) { + test_xe_prefetch_2d(); // 32*16=512 bits (max) +} + +// Test 15: 32-bit Small Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_32bit_Small) { + test_xe_prefetch_2d(); +} + +// Test 16: 32-bit Medium Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_32bit_Medium) { + test_xe_prefetch_2d(); +} + +// Test 17: 32-bit Large Height +TEST(CuTe_Xe, XE_PREFETCH_2D_32bit_LargeHeight) { + test_xe_prefetch_2d(); +} + +// Test 18: 32-bit Wide Configuration (respecting 512-bit width limit) +TEST(CuTe_Xe, XE_PREFETCH_2D_32bit_Wide) { + test_xe_prefetch_2d(); // 32*16=512 bits (max) +} + +// Test 19: 64-bit Small Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_64bit_Small) { + test_xe_prefetch_2d(); // 64*8=512 bits (max) +} + +// Test 20: 64-bit Medium Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_64bit_Medium) { + test_xe_prefetch_2d(); // 64*8=512 bits (max) +} + +// Test 21: 64-bit Large Height +TEST(CuTe_Xe, XE_PREFETCH_2D_64bit_LargeHeight) { + test_xe_prefetch_2d(); // 64*8=512 bits (max) +} + +// Test 22: Mixed Data Types - Power of Two Heights +TEST(CuTe_Xe, XE_PREFETCH_2D_PowerOfTwo_Heights) { + // 8-bit with power-of-two heights + test_xe_prefetch_2d(); + test_xe_prefetch_2d(); + + // 16-bit with power-of-two heights + test_xe_prefetch_2d(); + + // 32-bit with power-of-two heights + test_xe_prefetch_2d(); +} + +// Test 23: Various Width Configurations +TEST(CuTe_Xe, XE_PREFETCH_2D_VariousWidths) { + // 8-bit with various widths + test_xe_prefetch_2d(); + test_xe_prefetch_2d(); + + // 16-bit with various widths + test_xe_prefetch_2d(); + test_xe_prefetch_2d(); + + // 32-bit with various widths + test_xe_prefetch_2d(); + test_xe_prefetch_2d(); +} + +// Test 24: Square Tiles +TEST(CuTe_Xe, XE_PREFETCH_2D_SquareTiles) { + // 8-bit square (in memory view) + test_xe_prefetch_2d(); + + // 16-bit square + test_xe_prefetch_2d(); + + // 32-bit square + test_xe_prefetch_2d(); +} + +// Test 25: Tall Tiles (Height > Width) +TEST(CuTe_Xe, XE_PREFETCH_2D_TallTiles) { + test_xe_prefetch_2d(); + 
test_xe_prefetch_2d(); + test_xe_prefetch_2d(); +} + +// Test 26: Cache Line Optimization +TEST(CuTe_Xe, XE_PREFETCH_2D_CacheOptimized) { + // Configurations aligned to cache lines (64 bytes) + test_xe_prefetch_2d(); // 64 bytes per row + test_xe_prefetch_2d(); // 64 bytes per row + test_xe_prefetch_2d(); // 64 bytes per row + test_xe_prefetch_2d(); // 64 bytes per row +} + #else // For the fallback case From b871d05d6d3be2bcd70d8d316063b44ef576381f Mon Sep 17 00:00:00 2001 From: Rishi Yadav Date: Fri, 5 Dec 2025 10:23:56 +0530 Subject: [PATCH 3/7] Update xe_transpose_2d.cpp --- test/unit/cute/intel_xe/xe_transpose_2d.cpp | 367 ++++++++++++++++++++ 1 file changed, 367 insertions(+) diff --git a/test/unit/cute/intel_xe/xe_transpose_2d.cpp b/test/unit/cute/intel_xe/xe_transpose_2d.cpp index d2375d2fc8..bb9e5ea4b4 100644 --- a/test/unit/cute/intel_xe/xe_transpose_2d.cpp +++ b/test/unit/cute/intel_xe/xe_transpose_2d.cpp @@ -91,6 +91,373 @@ TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_Constraints) { EXPECT_TRUE(true) << "XE_LOAD_2D_TRANSPOSE constraint validation successful"; } +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_MinimalConfigs) { + // Test minimal 32-bit transpose configurations + using Transpose_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; + using Transpose_32bit_1x2 = XE_LOAD_2D_TRANSPOSE<32, 1, 2>; + using Transpose_32bit_2x1 = XE_LOAD_2D_TRANSPOSE<32, 2, 1>; + using Transpose_32bit_2x2 = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; + + static_assert(Transpose_32bit_1x1::CopyBits == 32); + static_assert(Transpose_32bit_1x1::AtomHeight == 1 && Transpose_32bit_1x1::AtomWidth == 1); + + static_assert(Transpose_32bit_1x2::CopyBits == 32); + static_assert(Transpose_32bit_1x2::AtomHeight == 1 && Transpose_32bit_1x2::AtomWidth == 2); + + static_assert(Transpose_32bit_2x1::CopyBits == 32); + static_assert(Transpose_32bit_2x1::AtomHeight == 2 && Transpose_32bit_2x1::AtomWidth == 1); + + static_assert(Transpose_32bit_2x2::CopyBits == 32); + static_assert(Transpose_32bit_2x2::AtomHeight == 2 && Transpose_32bit_2x2::AtomWidth == 2); + + EXPECT_TRUE(true) << "32-bit minimal transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width4) { + // Test 32-bit transpose with width = 4 + using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; + using Transpose_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>; + using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; + using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + + static_assert(Transpose_32bit_1x4::AtomHeight == 1 && Transpose_32bit_1x4::AtomWidth == 4); + static_assert(Transpose_32bit_2x4::AtomHeight == 2 && Transpose_32bit_2x4::AtomWidth == 4); + static_assert(Transpose_32bit_4x4::AtomHeight == 4 && Transpose_32bit_4x4::AtomWidth == 4); + static_assert(Transpose_32bit_8x4::AtomHeight == 8 && Transpose_32bit_8x4::AtomWidth == 4); + + EXPECT_TRUE(true) << "32-bit width=4 transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width8_MaxWidth) { + // Test 32-bit transpose with width = 8 (maximum allowed width) + using Transpose_32bit_1x8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; + using Transpose_32bit_2x8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; + using Transpose_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; + using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; + using Transpose_32bit_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; + + static_assert(Transpose_32bit_1x8::AtomHeight == 1 && Transpose_32bit_1x8::AtomWidth == 8); + static_assert(Transpose_32bit_2x8::AtomHeight == 2 && Transpose_32bit_2x8::AtomWidth == 8); + 
static_assert(Transpose_32bit_4x8::AtomHeight == 4 && Transpose_32bit_4x8::AtomWidth == 8); + static_assert(Transpose_32bit_8x8::AtomHeight == 8 && Transpose_32bit_8x8::AtomWidth == 8); + static_assert(Transpose_32bit_16x8::AtomHeight == 16 && Transpose_32bit_16x8::AtomWidth == 8); + + EXPECT_TRUE(true) << "32-bit max width=8 transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_VariousHeights) { + // Test 32-bit transpose with various height values + using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; + using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; + using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; + using Transpose_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; + + static_assert(Transpose_32bit_1x4::AtomHeight == 1); + static_assert(Transpose_32bit_4x4::AtomHeight == 4); + static_assert(Transpose_32bit_8x4::AtomHeight == 8); + static_assert(Transpose_32bit_16x4::AtomHeight == 16); + static_assert(Transpose_32bit_32x4::AtomHeight == 32); + + EXPECT_TRUE(true) << "32-bit various height transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_SquareConfigs) { + // Test 32-bit square transpose configurations + using Transpose_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; + using Transpose_32bit_2x2 = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; + using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; + using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; + + static_assert(Transpose_32bit_1x1::AtomHeight == 1 && Transpose_32bit_1x1::AtomWidth == 1); + static_assert(Transpose_32bit_2x2::AtomHeight == 2 && Transpose_32bit_2x2::AtomWidth == 2); + static_assert(Transpose_32bit_4x4::AtomHeight == 4 && Transpose_32bit_4x4::AtomWidth == 4); + static_assert(Transpose_32bit_8x8::AtomHeight == 8 && Transpose_32bit_8x8::AtomWidth == 8); + + EXPECT_TRUE(true) << "32-bit square transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_TallConfigs) { + // Test 32-bit tall (Height > Width) transpose configurations + using Transpose_32bit_8x1 = XE_LOAD_2D_TRANSPOSE<32, 8, 1>; + using Transpose_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>; + using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; + using Transpose_32bit_32x8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; + + static_assert(Transpose_32bit_8x1::AtomHeight == 8 && Transpose_32bit_8x1::AtomWidth == 1); + static_assert(Transpose_32bit_8x2::AtomHeight == 8 && Transpose_32bit_8x2::AtomWidth == 2); + static_assert(Transpose_32bit_16x4::AtomHeight == 16 && Transpose_32bit_16x4::AtomWidth == 4); + static_assert(Transpose_32bit_32x8::AtomHeight == 32 && Transpose_32bit_32x8::AtomWidth == 8); + + EXPECT_TRUE(true) << "32-bit tall transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_WideConfigs) { + // Test 32-bit wide (Width > Height) transpose configurations + using Transpose_32bit_1x8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; + using Transpose_32bit_2x8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; + using Transpose_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; + using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; + + static_assert(Transpose_32bit_1x8::AtomHeight == 1 && Transpose_32bit_1x8::AtomWidth == 8); + static_assert(Transpose_32bit_2x8::AtomHeight == 2 && Transpose_32bit_2x8::AtomWidth == 8); + static_assert(Transpose_32bit_4x8::AtomHeight == 4 && Transpose_32bit_4x8::AtomWidth == 8); + static_assert(Transpose_32bit_1x4::AtomHeight == 1 && 
Transpose_32bit_1x4::AtomWidth == 4); + + EXPECT_TRUE(true) << "32-bit wide transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_AllValid) { + // Test all valid 64-bit transpose configurations (Height=8, Width<4) + using Transpose_64bit_8x1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; + using Transpose_64bit_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; + using Transpose_64bit_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; + + static_assert(Transpose_64bit_8x1::CopyBits == 64); + static_assert(Transpose_64bit_8x1::AtomHeight == 8 && Transpose_64bit_8x1::AtomWidth == 1); + + static_assert(Transpose_64bit_8x2::CopyBits == 64); + static_assert(Transpose_64bit_8x2::AtomHeight == 8 && Transpose_64bit_8x2::AtomWidth == 2); + + static_assert(Transpose_64bit_8x3::CopyBits == 64); + static_assert(Transpose_64bit_8x3::AtomHeight == 8 && Transpose_64bit_8x3::AtomWidth == 3); + + EXPECT_TRUE(true) << "64-bit all valid transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_Constraints) { + // Test that 64-bit transpose respects its strict constraints + // Valid: Height == 8 && Width < 4 + using Valid_64bit_1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; + using Valid_64bit_2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; + using Valid_64bit_3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; + + // Verify all have correct dimensions + static_assert(Valid_64bit_1::AtomHeight == 8 && Valid_64bit_1::AtomWidth == 1); + static_assert(Valid_64bit_2::AtomHeight == 8 && Valid_64bit_2::AtomWidth == 2); + static_assert(Valid_64bit_3::AtomHeight == 8 && Valid_64bit_3::AtomWidth == 3); + + // Verify all have 64-bit size + static_assert(Valid_64bit_1::CopyBits == 64); + static_assert(Valid_64bit_2::CopyBits == 64); + static_assert(Valid_64bit_3::CopyBits == 64); + + EXPECT_TRUE(true) << "64-bit constraint validation successful"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_PowerOfTwo_Heights) { + // Test 32-bit transpose with power-of-two heights + using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; + using Transpose_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>; + using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; + using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; + using Transpose_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; + + static_assert(Transpose_32bit_1x4::AtomHeight == 1); + static_assert(Transpose_32bit_2x4::AtomHeight == 2); + static_assert(Transpose_32bit_4x4::AtomHeight == 4); + static_assert(Transpose_32bit_8x4::AtomHeight == 8); + static_assert(Transpose_32bit_16x4::AtomHeight == 16); + static_assert(Transpose_32bit_32x4::AtomHeight == 32); + + EXPECT_TRUE(true) << "32-bit power-of-two height transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_AllWidths) { + // Test 32-bit transpose with all valid widths (1-8) and height=8 + using Transpose_32bit_8x1 = XE_LOAD_2D_TRANSPOSE<32, 8, 1>; + using Transpose_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>; + using Transpose_32bit_8x3 = XE_LOAD_2D_TRANSPOSE<32, 8, 3>; + using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + using Transpose_32bit_8x5 = XE_LOAD_2D_TRANSPOSE<32, 8, 5>; + using Transpose_32bit_8x6 = XE_LOAD_2D_TRANSPOSE<32, 8, 6>; + using Transpose_32bit_8x7 = XE_LOAD_2D_TRANSPOSE<32, 8, 7>; + using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; + + static_assert(Transpose_32bit_8x1::AtomWidth == 1); + static_assert(Transpose_32bit_8x2::AtomWidth == 2); + static_assert(Transpose_32bit_8x3::AtomWidth == 3); + 
static_assert(Transpose_32bit_8x4::AtomWidth == 4); + static_assert(Transpose_32bit_8x5::AtomWidth == 5); + static_assert(Transpose_32bit_8x6::AtomWidth == 6); + static_assert(Transpose_32bit_8x7::AtomWidth == 7); + static_assert(Transpose_32bit_8x8::AtomWidth == 8); + + EXPECT_TRUE(true) << "32-bit all width values transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_MatMul_Optimized) { + // Test 32-bit transpose configurations useful for matrix multiplication + // Common for transposing A matrix in row-major to column-major for DPAS + using MatMul_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; + using MatMul_32bit_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; + using MatMul_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + using MatMul_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; + + static_assert(MatMul_32bit_8x8::CopyBits == 32); + static_assert(MatMul_32bit_8x8::AtomHeight == 8 && MatMul_32bit_8x8::AtomWidth == 8); + + static_assert(MatMul_32bit_16x8::CopyBits == 32); + static_assert(MatMul_32bit_16x8::AtomHeight == 16 && MatMul_32bit_16x8::AtomWidth == 8); + + static_assert(MatMul_32bit_8x4::CopyBits == 32); + static_assert(MatMul_32bit_8x4::AtomHeight == 8 && MatMul_32bit_8x4::AtomWidth == 4); + + static_assert(MatMul_32bit_16x4::CopyBits == 32); + static_assert(MatMul_32bit_16x4::AtomHeight == 16 && MatMul_32bit_16x4::AtomWidth == 4); + + EXPECT_TRUE(true) << "32-bit MatMul-optimized transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_SmallTiles) { + // Test 32-bit transpose small tiles for boundary handling + using Small_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; + using Small_32bit_2x1 = XE_LOAD_2D_TRANSPOSE<32, 2, 1>; + using Small_32bit_1x2 = XE_LOAD_2D_TRANSPOSE<32, 1, 2>; + using Small_32bit_4x2 = XE_LOAD_2D_TRANSPOSE<32, 4, 2>; + + static_assert(Small_32bit_1x1::AtomHeight == 1 && Small_32bit_1x1::AtomWidth == 1); + static_assert(Small_32bit_2x1::AtomHeight == 2 && Small_32bit_2x1::AtomWidth == 1); + static_assert(Small_32bit_1x2::AtomHeight == 1 && Small_32bit_1x2::AtomWidth == 2); + static_assert(Small_32bit_4x2::AtomHeight == 4 && Small_32bit_4x2::AtomWidth == 2); + + EXPECT_TRUE(true) << "32-bit small tile transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_DataSize_Consistency) { + // Test that CopyBits correctly reflects 32 or 64 bits + using Op_32bit_Small = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; + using Op_32bit_Large = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; + using Op_64bit_Valid1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; + using Op_64bit_Valid2 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; + + // All 32-bit variants should have CopyBits == 32 + static_assert(Op_32bit_Small::CopyBits == 32); + static_assert(Op_32bit_Large::CopyBits == 32); + + // All 64-bit variants should have CopyBits == 64 + static_assert(Op_64bit_Valid1::CopyBits == 64); + static_assert(Op_64bit_Valid2::CopyBits == 64); + + EXPECT_TRUE(true) << "Transpose data size consistency validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width_Progression) { + // Test 32-bit transpose with progressive widths and fixed height + using Transpose_16x1 = XE_LOAD_2D_TRANSPOSE<32, 16, 1>; + using Transpose_16x2 = XE_LOAD_2D_TRANSPOSE<32, 16, 2>; + using Transpose_16x3 = XE_LOAD_2D_TRANSPOSE<32, 16, 3>; + using Transpose_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; + using Transpose_16x5 = XE_LOAD_2D_TRANSPOSE<32, 16, 5>; + using Transpose_16x6 = XE_LOAD_2D_TRANSPOSE<32, 16, 6>; + using Transpose_16x7 = XE_LOAD_2D_TRANSPOSE<32, 16, 7>; + using Transpose_16x8 = 
XE_LOAD_2D_TRANSPOSE<32, 16, 8>; + + static_assert(Transpose_16x1::AtomHeight == 16 && Transpose_16x1::AtomWidth == 1); + static_assert(Transpose_16x2::AtomHeight == 16 && Transpose_16x2::AtomWidth == 2); + static_assert(Transpose_16x3::AtomHeight == 16 && Transpose_16x3::AtomWidth == 3); + static_assert(Transpose_16x4::AtomHeight == 16 && Transpose_16x4::AtomWidth == 4); + static_assert(Transpose_16x5::AtomHeight == 16 && Transpose_16x5::AtomWidth == 5); + static_assert(Transpose_16x6::AtomHeight == 16 && Transpose_16x6::AtomWidth == 6); + static_assert(Transpose_16x7::AtomHeight == 16 && Transpose_16x7::AtomWidth == 7); + static_assert(Transpose_16x8::AtomHeight == 16 && Transpose_16x8::AtomWidth == 8); + + EXPECT_TRUE(true) << "32-bit progressive width transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_LargeTiles) { + // Test 32-bit transpose with larger tile configurations (Height <= 32 limit) + using Large_32bit_32x8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; + using Large_32bit_32x6 = XE_LOAD_2D_TRANSPOSE<32, 32, 6>; + using Large_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; + using Large_32bit_32x2 = XE_LOAD_2D_TRANSPOSE<32, 32, 2>; + + static_assert(Large_32bit_32x8::CopyBits == 32); + static_assert(Large_32bit_32x8::AtomHeight == 32 && Large_32bit_32x8::AtomWidth == 8); + + static_assert(Large_32bit_32x6::CopyBits == 32); + static_assert(Large_32bit_32x6::AtomHeight == 32 && Large_32bit_32x6::AtomWidth == 6); + + static_assert(Large_32bit_32x4::CopyBits == 32); + static_assert(Large_32bit_32x4::AtomHeight == 32 && Large_32bit_32x4::AtomWidth == 4); + + static_assert(Large_32bit_32x2::CopyBits == 32); + static_assert(Large_32bit_32x2::AtomHeight == 32 && Large_32bit_32x2::AtomWidth == 2); + + EXPECT_TRUE(true) << "32-bit large tile transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_Mixed_AspectRatios) { + // Test various aspect ratios for 32-bit transpose (Height <= 32 limit) + using AspectRatio_1to8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; // 1:8 + using AspectRatio_2to8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; // 1:4 + using AspectRatio_4to8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; // 1:2 + using AspectRatio_8to8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; // 1:1 + using AspectRatio_16to8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; // 2:1 + using AspectRatio_32to8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; // 4:1 + using AspectRatio_32to4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; // 8:1 + + static_assert(AspectRatio_1to8::AtomHeight == 1 && AspectRatio_1to8::AtomWidth == 8); + static_assert(AspectRatio_2to8::AtomHeight == 2 && AspectRatio_2to8::AtomWidth == 8); + static_assert(AspectRatio_4to8::AtomHeight == 4 && AspectRatio_4to8::AtomWidth == 8); + static_assert(AspectRatio_8to8::AtomHeight == 8 && AspectRatio_8to8::AtomWidth == 8); + static_assert(AspectRatio_16to8::AtomHeight == 16 && AspectRatio_16to8::AtomWidth == 8); + static_assert(AspectRatio_32to8::AtomHeight == 32 && AspectRatio_32to8::AtomWidth == 8); + static_assert(AspectRatio_32to4::AtomHeight == 32 && AspectRatio_32to4::AtomWidth == 4); + + EXPECT_TRUE(true) << "Mixed aspect ratio transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_BF16_FP16_UseCase) { + // Test transpose configurations useful for bf16/fp16 data (stored as 32-bit for transpose) + // Transpose allows loading 16-bit data as 32-bit, then converting after transpose + using BF16_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; + using BF16_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; + using BF16_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + using 
BF16_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>;
+
+ // These are 32-bit operations but can be used with 16-bit data
+ static_assert(BF16_8x8::CopyBits == 32);
+ static_assert(BF16_8x8::AtomHeight == 8 && BF16_8x8::AtomWidth == 8);
+
+ static_assert(BF16_16x8::CopyBits == 32);
+ static_assert(BF16_16x8::AtomHeight == 16 && BF16_16x8::AtomWidth == 8);
+
+ static_assert(BF16_8x4::CopyBits == 32);
+ static_assert(BF16_8x4::AtomHeight == 8 && BF16_8x4::AtomWidth == 4);
+
+ static_assert(BF16_16x4::CopyBits == 32);
+ static_assert(BF16_16x4::AtomHeight == 16 && BF16_16x4::AtomWidth == 4);
+
+ EXPECT_TRUE(true) << "BF16/FP16 use case transpose configurations validated";
+}
+
+TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_UseCase) {
+ // Test 64-bit transpose use cases (double precision or two 32-bit values)
+ // Limited to Height=8, Width in {1, 2, 3}
+ using Double_8x1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>;
+ using Double_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>;
+ using Double_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>;
+
+ // All must be 64-bit
+ static_assert(Double_8x1::CopyBits == 64);
+ static_assert(Double_8x2::CopyBits == 64);
+ static_assert(Double_8x3::CopyBits == 64);
+
+ // All must have height=8
+ static_assert(Double_8x1::AtomHeight == 8);
+ static_assert(Double_8x2::AtomHeight == 8);
+ static_assert(Double_8x3::AtomHeight == 8);
+
+ // Widths must be < 4
+ static_assert(Double_8x1::AtomWidth == 1);
+ static_assert(Double_8x2::AtomWidth == 2);
+ static_assert(Double_8x3::AtomWidth == 3);
+
+ EXPECT_TRUE(true) << "64-bit use case transpose configurations validated";
+}
+
 #else

 TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_SKIPPED) {

From 043607b4c97fe2efd3134434ae741dcdf1bf087f Mon Sep 17 00:00:00 2001
From: Rishi Yadav
Date: Sun, 7 Dec 2025 23:47:05 +0530
Subject: [PATCH 4/7] Update xe_transpose_2d.cpp

---
 test/unit/cute/intel_xe/xe_transpose_2d.cpp | 550 ++++++--------------
 1 file changed, 152 insertions(+), 398 deletions(-)

diff --git a/test/unit/cute/intel_xe/xe_transpose_2d.cpp b/test/unit/cute/intel_xe/xe_transpose_2d.cpp
index bb9e5ea4b4..f7a9752f24 100644
--- a/test/unit/cute/intel_xe/xe_transpose_2d.cpp
+++ b/test/unit/cute/intel_xe/xe_transpose_2d.cpp
@@ -25,442 +25,196 @@
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* **************************************************************************************************/ +#include "cutlass/detail/layout.hpp" + #include #include #include #include #include +#include + #include "cutlass_unit_test.h" +#include "utils.hpp" using namespace cute; - -#if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) - -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_API_Declaration) { - // Template: XE_LOAD_2D_TRANSPOSE - // Constraints: Bits == 32 || Bits == 64, Width <= 8 - // For 64-bit: Height == 8 && Width < 4 - - // Test 32-bit transpose operations - using TransposeOp_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>; - using TransposeOp_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; - using TransposeOp_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>; - - // Test 64-bit transpose operations (limited constraints) - using TransposeOp_64bit_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; - using TransposeOp_64bit_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; - - // Test that the operations have the required static members from XE_Copy_Op_2D_Base - static_assert(TransposeOp_32bit_2x4::AtomHeight == 2); - static_assert(TransposeOp_32bit_2x4::AtomWidth == 4); - static_assert(TransposeOp_32bit_2x4::CopyBits == 32); - - static_assert(TransposeOp_32bit_4x8::AtomHeight == 4); - static_assert(TransposeOp_32bit_4x8::AtomWidth == 8); - static_assert(TransposeOp_32bit_4x8::CopyBits == 32); - - static_assert(TransposeOp_64bit_8x2::AtomHeight == 8); - static_assert(TransposeOp_64bit_8x2::AtomWidth == 2); - static_assert(TransposeOp_64bit_8x2::CopyBits == 64); - - EXPECT_TRUE(true) << "XE_LOAD_2D_TRANSPOSE API types declared successfully"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_Constraints) { - // Test that the compile-time constraints are enforced - - // Valid 32-bit operations - using Valid32_1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; - using Valid32_2 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; // Width <= 8 - - // Valid 64-bit operations (Height == 8 && Width < 4) - using Valid64_1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; - using Valid64_2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; - using Valid64_3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; - - static_assert(Valid32_1::CopyBits == 32); - static_assert(Valid32_2::CopyBits == 32); - static_assert(Valid64_1::CopyBits == 64); - static_assert(Valid64_2::CopyBits == 64); - static_assert(Valid64_3::CopyBits == 64); - - EXPECT_TRUE(true) << "XE_LOAD_2D_TRANSPOSE constraint validation successful"; +using namespace cutlass; +using namespace compat::experimental; + +#define SUBGROUP_SIZE (16) + +#if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) + +// Kernel name for unique identification +template class XETranspose2DKernelName; + +// Device kernel for XE_LOAD_2D_TRANSPOSE testing +// Note: Transpose load performs HW-level transpose during load operation +// Memory layout (Height×Width) is transposed to register layout (Width×Height) +template +void xe_transpose_2d_kernel(SrcTensor src, DstTensor dst) { + using namespace cute; + using Element = typename SrcTensor::value_type; + + // Only execute with the first subgroup to avoid race conditions + if (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group(0) == 0) { + // Get thread/subgroup information + auto local_id = int(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_local_id(0)); + + // Create block 2D transpose load inside kernel (device-only operation) + using TransposeOp = XE_LOAD_2D_TRANSPOSE; + auto tiled_transpose = make_block_2d_copy(TransposeOp{}, src); + + // Get thread slice of the tiled 
transpose + auto thr_transpose = tiled_transpose.get_slice(local_id); + + // Create coordinate tensor for a single tile + // Note: coordinates are in memory space (Height×Width) + auto coord_shape = make_shape(Int{}, Int>{}); + Tensor coord_tile = make_identity_tensor(coord_shape); + + // Partition source coordinates for transpose load + auto thr_src_coord = thr_transpose.partition_S(coord_tile); + + // Create destination fragment - transpose changes the layout in registers + auto thr_dst_frag = thr_transpose.partition_fragment_D(coord_tile); + + // Perform the transpose load operation from global memory to registers + // Data is transposed during this operation by hardware + copy(tiled_transpose, thr_src_coord, thr_dst_frag); + + // For verification, we need to store the transposed data back + // Note: Output will be in transposed layout (Width×Height in memory) + // We store to the transposed destination shape + auto dst_coord_shape = make_shape(Int>{}, Int{}); + Tensor dst_coord_tile = make_identity_tensor(dst_coord_shape); + + using StoreOp = XE_STORE_2D; // Swapped dimensions + auto tiled_store = make_block_2d_copy(StoreOp{}, dst); + auto thr_store = tiled_store.get_slice(local_id); + + // Create destination coordinates for the store operation + auto thr_dst_coord = thr_store.partition_D(dst_coord_tile); + auto thr_src_frag = thr_store.partition_fragment_S(dst_coord_tile); + + // Copy from transpose fragment to store fragment + copy(thr_dst_frag, thr_src_frag); + + // Perform the store operation from registers to global memory + copy(tiled_store, thr_src_frag, thr_dst_coord); + + // Synchronize to ensure all threads complete their operations + sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group()); + } } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_MinimalConfigs) { - // Test minimal 32-bit transpose configurations - using Transpose_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; - using Transpose_32bit_1x2 = XE_LOAD_2D_TRANSPOSE<32, 1, 2>; - using Transpose_32bit_2x1 = XE_LOAD_2D_TRANSPOSE<32, 2, 1>; - using Transpose_32bit_2x2 = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; - - static_assert(Transpose_32bit_1x1::CopyBits == 32); - static_assert(Transpose_32bit_1x1::AtomHeight == 1 && Transpose_32bit_1x1::AtomWidth == 1); - - static_assert(Transpose_32bit_1x2::CopyBits == 32); - static_assert(Transpose_32bit_1x2::AtomHeight == 1 && Transpose_32bit_1x2::AtomWidth == 2); - - static_assert(Transpose_32bit_2x1::CopyBits == 32); - static_assert(Transpose_32bit_2x1::AtomHeight == 2 && Transpose_32bit_2x1::AtomWidth == 1); - - static_assert(Transpose_32bit_2x2::CopyBits == 32); - static_assert(Transpose_32bit_2x2::AtomHeight == 2 && Transpose_32bit_2x2::AtomWidth == 2); - - EXPECT_TRUE(true) << "32-bit minimal transpose configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width4) { - // Test 32-bit transpose with width = 4 - using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; - using Transpose_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>; - using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; - using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - - static_assert(Transpose_32bit_1x4::AtomHeight == 1 && Transpose_32bit_1x4::AtomWidth == 4); - static_assert(Transpose_32bit_2x4::AtomHeight == 2 && Transpose_32bit_2x4::AtomWidth == 4); - static_assert(Transpose_32bit_4x4::AtomHeight == 4 && Transpose_32bit_4x4::AtomWidth == 4); - static_assert(Transpose_32bit_8x4::AtomHeight == 8 && Transpose_32bit_8x4::AtomWidth == 4); - - EXPECT_TRUE(true) << 
"32-bit width=4 transpose configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width8_MaxWidth) { - // Test 32-bit transpose with width = 8 (maximum allowed width) - using Transpose_32bit_1x8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; - using Transpose_32bit_2x8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; - using Transpose_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; - using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; - using Transpose_32bit_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; - - static_assert(Transpose_32bit_1x8::AtomHeight == 1 && Transpose_32bit_1x8::AtomWidth == 8); - static_assert(Transpose_32bit_2x8::AtomHeight == 2 && Transpose_32bit_2x8::AtomWidth == 8); - static_assert(Transpose_32bit_4x8::AtomHeight == 4 && Transpose_32bit_4x8::AtomWidth == 8); - static_assert(Transpose_32bit_8x8::AtomHeight == 8 && Transpose_32bit_8x8::AtomWidth == 8); - static_assert(Transpose_32bit_16x8::AtomHeight == 16 && Transpose_32bit_16x8::AtomWidth == 8); - - EXPECT_TRUE(true) << "32-bit max width=8 transpose configurations validated"; -} +// Host test function template for transpose operations +template +void test_xe_transpose_2d() { + using namespace cute; -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_VariousHeights) { - // Test 32-bit transpose with various height values - using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; - using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; - using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - using Transpose_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; - - static_assert(Transpose_32bit_1x4::AtomHeight == 1); - static_assert(Transpose_32bit_4x4::AtomHeight == 4); - static_assert(Transpose_32bit_8x4::AtomHeight == 8); - static_assert(Transpose_32bit_16x4::AtomHeight == 16); - static_assert(Transpose_32bit_32x4::AtomHeight == 32); - - EXPECT_TRUE(true) << "32-bit various height transpose configurations validated"; -} + // Source matrix dimensions (Height×Width in memory) + constexpr int M = Height; + constexpr int N = Width * sizeof_bits_v / Bits; -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_SquareConfigs) { - // Test 32-bit square transpose configurations - using Transpose_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; - using Transpose_32bit_2x2 = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; - using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; - using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; - - static_assert(Transpose_32bit_1x1::AtomHeight == 1 && Transpose_32bit_1x1::AtomWidth == 1); - static_assert(Transpose_32bit_2x2::AtomHeight == 2 && Transpose_32bit_2x2::AtomWidth == 2); - static_assert(Transpose_32bit_4x4::AtomHeight == 4 && Transpose_32bit_4x4::AtomWidth == 4); - static_assert(Transpose_32bit_8x8::AtomHeight == 8 && Transpose_32bit_8x8::AtomWidth == 8); - - EXPECT_TRUE(true) << "32-bit square transpose configurations validated"; -} + // Destination will be transposed (Width×Height in memory) + constexpr int M_dst = N; + constexpr int N_dst = M; -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_TallConfigs) { - // Test 32-bit tall (Height > Width) transpose configurations - using Transpose_32bit_8x1 = XE_LOAD_2D_TRANSPOSE<32, 8, 1>; - using Transpose_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>; - using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - using Transpose_32bit_32x8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; - - static_assert(Transpose_32bit_8x1::AtomHeight == 8 && Transpose_32bit_8x1::AtomWidth == 1); - static_assert(Transpose_32bit_8x2::AtomHeight 
== 8 && Transpose_32bit_8x2::AtomWidth == 2); - static_assert(Transpose_32bit_16x4::AtomHeight == 16 && Transpose_32bit_16x4::AtomWidth == 4); - static_assert(Transpose_32bit_32x8::AtomHeight == 32 && Transpose_32bit_32x8::AtomWidth == 8); - - EXPECT_TRUE(true) << "32-bit tall transpose configurations validated"; -} + // Ensure proper alignment + constexpr int elem_alignment = 16 / sizeof(Element); + constexpr int aligned_N = ((N + elem_alignment - 1) / elem_alignment) * elem_alignment; + constexpr int aligned_M_dst = ((M_dst + elem_alignment - 1) / elem_alignment) * elem_alignment; -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_WideConfigs) { - // Test 32-bit wide (Width > Height) transpose configurations - using Transpose_32bit_1x8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; - using Transpose_32bit_2x8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; - using Transpose_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; - using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; - - static_assert(Transpose_32bit_1x8::AtomHeight == 1 && Transpose_32bit_1x8::AtomWidth == 8); - static_assert(Transpose_32bit_2x8::AtomHeight == 2 && Transpose_32bit_2x8::AtomWidth == 8); - static_assert(Transpose_32bit_4x8::AtomHeight == 4 && Transpose_32bit_4x8::AtomWidth == 8); - static_assert(Transpose_32bit_1x4::AtomHeight == 1 && Transpose_32bit_1x4::AtomWidth == 4); - - EXPECT_TRUE(true) << "32-bit wide transpose configurations validated"; -} + // Allocate host memory + cutlass::host_vector host_src(M * aligned_N); + cutlass::host_vector host_dst(M_dst * aligned_M_dst); -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_AllValid) { - // Test all valid 64-bit transpose configurations (Height=8, Width<4) - using Transpose_64bit_8x1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; - using Transpose_64bit_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; - using Transpose_64bit_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; - - static_assert(Transpose_64bit_8x1::CopyBits == 64); - static_assert(Transpose_64bit_8x1::AtomHeight == 8 && Transpose_64bit_8x1::AtomWidth == 1); - - static_assert(Transpose_64bit_8x2::CopyBits == 64); - static_assert(Transpose_64bit_8x2::AtomHeight == 8 && Transpose_64bit_8x2::AtomWidth == 2); - - static_assert(Transpose_64bit_8x3::CopyBits == 64); - static_assert(Transpose_64bit_8x3::AtomHeight == 8 && Transpose_64bit_8x3::AtomWidth == 3); - - EXPECT_TRUE(true) << "64-bit all valid transpose configurations validated"; -} + // Initialize source with test pattern + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + Element val; + if constexpr (std::is_floating_point_v || + std::is_same_v || + std::is_same_v) { + val = Element(static_cast(i * N + j) / 100.0f); + } else { + val = static_cast((i * N + j) % 256); + } + host_src[i * aligned_N + j] = val; + } + } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_Constraints) { - // Test that 64-bit transpose respects its strict constraints - // Valid: Height == 8 && Width < 4 - using Valid_64bit_1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; - using Valid_64bit_2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; - using Valid_64bit_3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; - - // Verify all have correct dimensions - static_assert(Valid_64bit_1::AtomHeight == 8 && Valid_64bit_1::AtomWidth == 1); - static_assert(Valid_64bit_2::AtomHeight == 8 && Valid_64bit_2::AtomWidth == 2); - static_assert(Valid_64bit_3::AtomHeight == 8 && Valid_64bit_3::AtomWidth == 3); - - // Verify all have 64-bit size - static_assert(Valid_64bit_1::CopyBits == 64); - static_assert(Valid_64bit_2::CopyBits == 64); - static_assert(Valid_64bit_3::CopyBits == 64); - - 
EXPECT_TRUE(true) << "64-bit constraint validation successful"; -} + // Copy to device + cutlass::device_vector device_src = host_src; + cutlass::device_vector device_dst(M_dst * aligned_M_dst); + + // Create source tensor (Height×Width) + Tensor tensor_src = + make_tensor(make_gmem_ptr(device_src.data()), + make_layout(Shape, Int>{}, + Stride, _1>{})); -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_PowerOfTwo_Heights) { - // Test 32-bit transpose with power-of-two heights - using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; - using Transpose_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>; - using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; - using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - using Transpose_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; - - static_assert(Transpose_32bit_1x4::AtomHeight == 1); - static_assert(Transpose_32bit_2x4::AtomHeight == 2); - static_assert(Transpose_32bit_4x4::AtomHeight == 4); - static_assert(Transpose_32bit_8x4::AtomHeight == 8); - static_assert(Transpose_32bit_16x4::AtomHeight == 16); - static_assert(Transpose_32bit_32x4::AtomHeight == 32); - - EXPECT_TRUE(true) << "32-bit power-of-two height transpose configurations validated"; -} + // Create destination tensor (Width×Height) - transposed shape + Tensor tensor_dst = + make_tensor(make_gmem_ptr(device_dst.data()), + make_layout(Shape, Int>{}, + Stride, _1>{})); -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_AllWidths) { - // Test 32-bit transpose with all valid widths (1-8) and height=8 - using Transpose_32bit_8x1 = XE_LOAD_2D_TRANSPOSE<32, 8, 1>; - using Transpose_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>; - using Transpose_32bit_8x3 = XE_LOAD_2D_TRANSPOSE<32, 8, 3>; - using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - using Transpose_32bit_8x5 = XE_LOAD_2D_TRANSPOSE<32, 8, 5>; - using Transpose_32bit_8x6 = XE_LOAD_2D_TRANSPOSE<32, 8, 6>; - using Transpose_32bit_8x7 = XE_LOAD_2D_TRANSPOSE<32, 8, 7>; - using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; - - static_assert(Transpose_32bit_8x1::AtomWidth == 1); - static_assert(Transpose_32bit_8x2::AtomWidth == 2); - static_assert(Transpose_32bit_8x3::AtomWidth == 3); - static_assert(Transpose_32bit_8x4::AtomWidth == 4); - static_assert(Transpose_32bit_8x5::AtomWidth == 5); - static_assert(Transpose_32bit_8x6::AtomWidth == 6); - static_assert(Transpose_32bit_8x7::AtomWidth == 7); - static_assert(Transpose_32bit_8x8::AtomWidth == 8); - - EXPECT_TRUE(true) << "32-bit all width values transpose configurations validated"; -} + // Launch kernel + auto blockDim = compat::dim3(SUBGROUP_SIZE); + auto gridDim = compat::dim3(1); -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_MatMul_Optimized) { - // Test 32-bit transpose configurations useful for matrix multiplication - // Common for transposing A matrix in row-major to column-major for DPAS - using MatMul_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; - using MatMul_32bit_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; - using MatMul_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - using MatMul_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - - static_assert(MatMul_32bit_8x8::CopyBits == 32); - static_assert(MatMul_32bit_8x8::AtomHeight == 8 && MatMul_32bit_8x8::AtomWidth == 8); - - static_assert(MatMul_32bit_16x8::CopyBits == 32); - static_assert(MatMul_32bit_16x8::AtomHeight == 16 && MatMul_32bit_16x8::AtomWidth == 8); - - static_assert(MatMul_32bit_8x4::CopyBits == 32); - static_assert(MatMul_32bit_8x4::AtomHeight == 8 && 
MatMul_32bit_8x4::AtomWidth == 4); - - static_assert(MatMul_32bit_16x4::CopyBits == 32); - static_assert(MatMul_32bit_16x4::AtomHeight == 16 && MatMul_32bit_16x4::AtomWidth == 4); - - EXPECT_TRUE(true) << "32-bit MatMul-optimized transpose configurations validated"; -} + launch, + XETranspose2DKernelName>( + launch_policy{ + gridDim, blockDim, + kernel_properties{sycl_exp::sub_group_size} + }, + tensor_src, tensor_dst); -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_SmallTiles) { - // Test 32-bit transpose small tiles for boundary handling - using Small_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; - using Small_32bit_2x1 = XE_LOAD_2D_TRANSPOSE<32, 2, 1>; - using Small_32bit_1x2 = XE_LOAD_2D_TRANSPOSE<32, 1, 2>; - using Small_32bit_4x2 = XE_LOAD_2D_TRANSPOSE<32, 4, 2>; - - static_assert(Small_32bit_1x1::AtomHeight == 1 && Small_32bit_1x1::AtomWidth == 1); - static_assert(Small_32bit_2x1::AtomHeight == 2 && Small_32bit_2x1::AtomWidth == 1); - static_assert(Small_32bit_1x2::AtomHeight == 1 && Small_32bit_1x2::AtomWidth == 2); - static_assert(Small_32bit_4x2::AtomHeight == 4 && Small_32bit_4x2::AtomWidth == 2); - - EXPECT_TRUE(true) << "32-bit small tile transpose configurations validated"; -} + compat::wait_and_throw(); + host_dst = device_dst; -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_DataSize_Consistency) { - // Test that CopyBits correctly reflects 32 or 64 bits - using Op_32bit_Small = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; - using Op_32bit_Large = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; - using Op_64bit_Valid1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; - using Op_64bit_Valid2 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; - - // All 32-bit variants should have CopyBits == 32 - static_assert(Op_32bit_Small::CopyBits == 32); - static_assert(Op_32bit_Large::CopyBits == 32); - - // All 64-bit variants should have CopyBits == 64 - static_assert(Op_64bit_Valid1::CopyBits == 64); - static_assert(Op_64bit_Valid2::CopyBits == 64); - - EXPECT_TRUE(true) << "Transpose data size consistency validated"; + // Verify transpose: dst[j][i] should equal src[i][j] + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + Element src_val = host_src[i * aligned_N + j]; + Element dst_val = host_dst[j * aligned_M_dst + i]; + EXPECT_EQ(dst_val, src_val) + << "Mismatch at src[" << i << "][" << j << "] vs dst[" << j << "][" << i << "]"; + } + } } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width_Progression) { - // Test 32-bit transpose with progressive widths and fixed height - using Transpose_16x1 = XE_LOAD_2D_TRANSPOSE<32, 16, 1>; - using Transpose_16x2 = XE_LOAD_2D_TRANSPOSE<32, 16, 2>; - using Transpose_16x3 = XE_LOAD_2D_TRANSPOSE<32, 16, 3>; - using Transpose_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - using Transpose_16x5 = XE_LOAD_2D_TRANSPOSE<32, 16, 5>; - using Transpose_16x6 = XE_LOAD_2D_TRANSPOSE<32, 16, 6>; - using Transpose_16x7 = XE_LOAD_2D_TRANSPOSE<32, 16, 7>; - using Transpose_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; - - static_assert(Transpose_16x1::AtomHeight == 16 && Transpose_16x1::AtomWidth == 1); - static_assert(Transpose_16x2::AtomHeight == 16 && Transpose_16x2::AtomWidth == 2); - static_assert(Transpose_16x3::AtomHeight == 16 && Transpose_16x3::AtomWidth == 3); - static_assert(Transpose_16x4::AtomHeight == 16 && Transpose_16x4::AtomWidth == 4); - static_assert(Transpose_16x5::AtomHeight == 16 && Transpose_16x5::AtomWidth == 5); - static_assert(Transpose_16x6::AtomHeight == 16 && Transpose_16x6::AtomWidth == 6); - static_assert(Transpose_16x7::AtomHeight == 16 && Transpose_16x7::AtomWidth == 7); - 
static_assert(Transpose_16x8::AtomHeight == 16 && Transpose_16x8::AtomWidth == 8); - - EXPECT_TRUE(true) << "32-bit progressive width transpose configurations validated"; +// Test 32-bit transpose operations (Width ≤ 8 constraint) +TEST(CuTe_Xe, XE_TRANSPOSE_2D_float_4x8) { + test_xe_transpose_2d(); } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_LargeTiles) { - // Test 32-bit transpose with larger tile configurations (Height <= 32 limit) - using Large_32bit_32x8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; - using Large_32bit_32x6 = XE_LOAD_2D_TRANSPOSE<32, 32, 6>; - using Large_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; - using Large_32bit_32x2 = XE_LOAD_2D_TRANSPOSE<32, 32, 2>; - - static_assert(Large_32bit_32x8::CopyBits == 32); - static_assert(Large_32bit_32x8::AtomHeight == 32 && Large_32bit_32x8::AtomWidth == 8); - - static_assert(Large_32bit_32x6::CopyBits == 32); - static_assert(Large_32bit_32x6::AtomHeight == 32 && Large_32bit_32x6::AtomWidth == 6); - - static_assert(Large_32bit_32x4::CopyBits == 32); - static_assert(Large_32bit_32x4::AtomHeight == 32 && Large_32bit_32x4::AtomWidth == 4); - - static_assert(Large_32bit_32x2::CopyBits == 32); - static_assert(Large_32bit_32x2::AtomHeight == 32 && Large_32bit_32x2::AtomWidth == 2); - - EXPECT_TRUE(true) << "32-bit large tile transpose configurations validated"; +TEST(CuTe_Xe, XE_TRANSPOSE_2D_float_8x8) { + test_xe_transpose_2d(); } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_Mixed_AspectRatios) { - // Test various aspect ratios for 32-bit transpose (Height <= 32 limit) - using AspectRatio_1to8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; // 1:8 - using AspectRatio_2to8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; // 1:4 - using AspectRatio_4to8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; // 1:2 - using AspectRatio_8to8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; // 1:1 - using AspectRatio_16to8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; // 2:1 - using AspectRatio_32to8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; // 4:1 - using AspectRatio_32to4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; // 8:1 - - static_assert(AspectRatio_1to8::AtomHeight == 1 && AspectRatio_1to8::AtomWidth == 8); - static_assert(AspectRatio_2to8::AtomHeight == 2 && AspectRatio_2to8::AtomWidth == 8); - static_assert(AspectRatio_4to8::AtomHeight == 4 && AspectRatio_4to8::AtomWidth == 8); - static_assert(AspectRatio_8to8::AtomHeight == 8 && AspectRatio_8to8::AtomWidth == 8); - static_assert(AspectRatio_16to8::AtomHeight == 16 && AspectRatio_16to8::AtomWidth == 8); - static_assert(AspectRatio_32to8::AtomHeight == 32 && AspectRatio_32to8::AtomWidth == 8); - static_assert(AspectRatio_32to4::AtomHeight == 32 && AspectRatio_32to4::AtomWidth == 4); - - EXPECT_TRUE(true) << "Mixed aspect ratio transpose configurations validated"; +TEST(CuTe_Xe, XE_TRANSPOSE_2D_float_4x4) { + test_xe_transpose_2d(); } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_BF16_FP16_UseCase) { - // Test transpose configurations useful for bf16/fp16 data (stored as 32-bit for transpose) - // Transpose allows loading 16-bit data as 32-bit, then converting after transpose - using BF16_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; - using BF16_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; - using BF16_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - using BF16_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - - // These are 32-bit operations but can be used with 16-bit data - static_assert(BF16_8x8::CopyBits == 32); - static_assert(BF16_8x8::AtomHeight == 8 && BF16_8x8::AtomWidth == 8); - - static_assert(BF16_16x8::CopyBits == 32); - static_assert(BF16_16x8::AtomHeight == 16 && BF16_16x8::AtomWidth == 8); - - 
static_assert(BF16_8x4::CopyBits == 32);
-  static_assert(BF16_8x4::AtomHeight == 8 && BF16_8x4::AtomWidth == 4);
-
-  static_assert(BF16_16x4::CopyBits == 32);
-  static_assert(BF16_16x4::AtomHeight == 16 && BF16_16x4::AtomWidth == 4);
-
-  EXPECT_TRUE(true) << "BF16/FP16 use case transpose configurations validated";
+TEST(CuTe_Xe, XE_TRANSPOSE_2D_int32_4x8) {
+  test_xe_transpose_2d();
 }
-TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_UseCase) {
-  // Test 64-bit transpose use cases (double precision or two 32-bit values)
-  // Limited to Height=8, Width in {1, 2, 3}
-  using Double_8x1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>;
-  using Double_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>;
-  using Double_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>;
-
-  // All must be 64-bit
-  static_assert(Double_8x1::CopyBits == 64);
-  static_assert(Double_8x2::CopyBits == 64);
-  static_assert(Double_8x3::CopyBits == 64);
-
-  // All must have height=8
-  static_assert(Double_8x1::AtomHeight == 8);
-  static_assert(Double_8x2::AtomHeight == 8);
-  static_assert(Double_8x3::AtomHeight == 8);
-
-  // Widths must be < 4
-  static_assert(Double_8x1::AtomWidth == 1);
-  static_assert(Double_8x2::AtomWidth == 2);
-  static_assert(Double_8x3::AtomWidth == 3);
-
-  EXPECT_TRUE(true) << "64-bit use case transpose configurations validated";
+TEST(CuTe_Xe, XE_TRANSPOSE_2D_uint32_4x8) {
+  test_xe_transpose_2d();
 }
 #else
-TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_SKIPPED) {
+TEST(CuTe_Xe, XE_TRANSPOSE_2D_SKIPPED) {
   GTEST_SKIP() << "XE_LOAD_2D_TRANSPOSE tests require IGC version 2.18 or higher. skipped";
 }

From 51393e5635e36fb7b411ef0c34c579d708d44f3a Mon Sep 17 00:00:00 2001
From: Rishi Yadav
Date: Sun, 7 Dec 2025 23:47:46 +0530
Subject: [PATCH 5/7] Update xe_vnni_2d.cpp

---
 test/unit/cute/intel_xe/xe_vnni_2d.cpp | 562 +++++++++----------
 1 file changed, 191 insertions(+), 371 deletions(-)

diff --git a/test/unit/cute/intel_xe/xe_vnni_2d.cpp b/test/unit/cute/intel_xe/xe_vnni_2d.cpp
index 5bc36338b7..45100aa9ff 100644
--- a/test/unit/cute/intel_xe/xe_vnni_2d.cpp
+++ b/test/unit/cute/intel_xe/xe_vnni_2d.cpp
@@ -26,415 +26,235 @@
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF POSSIBILITY OF SUCH DAMAGE.
+ *
 **************************************************************************************************/
+
+/*
+ * VNNI Usage Summary:
+ *
+ * This file demonstrates XE_LOAD_2D_VNNI usage in kernel context.
+ *
+ * Key points:
+ * 1. VNNI is used to load B matrix in GEMM operations
+ * 2. Hardware performs interleaving during load (free transformation)
+ * 3. VNNI data flows directly to DPAS operations
+ * 4. Only 8-bit and 16-bit data types supported
+ * 5. BlockWidth parameter creates multiple blocks (block_count = Width/BlockWidth)
+ *
+ * Real-world usage pattern:
+ *   auto copy_b = make_block_2d_copy_B(XE_LOAD_2D_VNNI<16, 32, 16, 16>{}, mma, gB);
+ *   copy(copy_b, tBgB, tBrB);     // Load in VNNI format
+ *   gemm(mma, tCrA, tBrB, tCrC);  // DPAS consumes VNNI data
+ *
+ * See examples/12_bmg_moe_gemm_cute_interface/ for full GEMM implementation.
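A minimal sketch of that usage pattern, expanded to a single B tile. Only make_block_2d_copy_B, XE_LOAD_2D_VNNI, copy, and gemm are taken from the summary above and from the demonstration kernel later in this patch; the surrounding names (mma, gB, tCrA, tCrC, thread_idx) and the exact partitioning calls are illustrative assumptions, not the code from the examples directory:

  // Illustrative only: load one 32x16 bf16 B tile in VNNI layout and feed it to DPAS.
  template <class TiledMma, class GTensorB, class FragA, class FragC>
  void vnni_b_step(TiledMma mma, GTensorB gB, FragA const& tCrA, FragC& tCrC, int thread_idx) {
    using namespace cute;
    auto copy_b   = make_block_2d_copy_B(XE_LOAD_2D_VNNI<16, 32, 16, 16>{}, mma, gB);
    auto thr_copy = copy_b.get_slice(thread_idx);

    // Coordinate tensor for one atom-sized tile, as the demonstration kernel below does.
    Tensor coord_tile = make_identity_tensor(make_shape(Int<32>{}, Int<16>{}));
    auto tBgB = thr_copy.partition_S(coord_tile);          // per-thread source coordinates
    auto tBrB = thr_copy.partition_fragment_D(coord_tile); // register fragment, VNNI layout

    copy(copy_b, tBgB, tBrB);     // hardware interleaves rows during this load
    gemm(mma, tCrA, tBrB, tCrC);  // DPAS consumes the VNNI fragment directly
  }

Because the interleaving happens inside the load itself, no shuffle or repack step is needed between the copy and the MMA; that is the "free transformation" the summary refers to.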
+ + */ + +#include "cutlass/detail/layout.hpp" + #include #include #include #include +#include +#include #include +#include + #include "cutlass_unit_test.h" +#include "utils.hpp" using namespace cute; +using namespace cutlass; +using namespace compat::experimental; + +#define SUBGROUP_SIZE (16) #if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_API_Declaration) { - // Template: XE_LOAD_2D_VNNI - - // Test that the VNNI operation types can be declared - using VNNIOp_8bit_2x32 = XE_LOAD_2D_VNNI<8, 2, 32>; - using VNNIOp_8bit_4x32 = XE_LOAD_2D_VNNI<8, 4, 32>; - using VNNIOp_16bit_2x16 = XE_LOAD_2D_VNNI<16, 2, 16>; - using VNNIOp_16bit_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>; - - // Test that the operations have the required static members from XE_Copy_Op_2D_Base - static_assert(VNNIOp_8bit_2x32::AtomHeight == 2); - static_assert(VNNIOp_8bit_2x32::AtomWidth == 32); - static_assert(VNNIOp_8bit_2x32::CopyBits == 8); - - static_assert(VNNIOp_16bit_2x16::AtomHeight == 2); - static_assert(VNNIOp_16bit_2x16::AtomWidth == 16); - static_assert(VNNIOp_16bit_2x16::CopyBits == 16); - - EXPECT_TRUE(true) << "XE_LOAD_2D_VNNI API types declared successfully"; -} +// Kernel name for unique identification +template class XEVnniLoadKernelName; -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_MinimalConfigs) { - // Test minimal 8-bit VNNI configurations - using VNNIOp_8bit_1x32 = XE_LOAD_2D_VNNI<8, 1, 32>; - using VNNIOp_8bit_2x32 = XE_LOAD_2D_VNNI<8, 2, 32>; - using VNNIOp_8bit_4x32 = XE_LOAD_2D_VNNI<8, 4, 32>; - - static_assert(VNNIOp_8bit_1x32::CopyBits == 8); - static_assert(VNNIOp_8bit_1x32::AtomHeight == 1); - static_assert(VNNIOp_8bit_1x32::AtomWidth == 32); - - static_assert(VNNIOp_8bit_2x32::CopyBits == 8); - static_assert(VNNIOp_8bit_2x32::AtomHeight == 2); - static_assert(VNNIOp_8bit_2x32::AtomWidth == 32); - - static_assert(VNNIOp_8bit_4x32::CopyBits == 8); - static_assert(VNNIOp_8bit_4x32::AtomHeight == 4); - static_assert(VNNIOp_8bit_4x32::AtomWidth == 32); - - EXPECT_TRUE(true) << "8-bit minimal VNNI configurations validated"; -} +// VNNI load demonstration kernel +// Note: VNNI is designed for B matrix in GEMM context with DPAS consumption +// This simplified test only verifies the load operation executes without errors +template +void xe_vnni_load_kernel(SrcTensor src, DstTensor dst) { + using namespace cute; + using Element = typename SrcTensor::value_type; -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_MediumConfigs) { - // Test medium-sized 8-bit VNNI configurations - using VNNIOp_8bit_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; - using VNNIOp_8bit_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; - - static_assert(VNNIOp_8bit_8x32::CopyBits == 8); - static_assert(VNNIOp_8bit_8x32::AtomHeight == 8); - static_assert(VNNIOp_8bit_8x32::AtomWidth == 32); - - static_assert(VNNIOp_8bit_16x32::CopyBits == 8); - static_assert(VNNIOp_8bit_16x32::AtomHeight == 16); - static_assert(VNNIOp_8bit_16x32::AtomWidth == 32); - - EXPECT_TRUE(true) << "8-bit medium VNNI configurations validated"; + // Only execute with the first subgroup to avoid race conditions + if (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group(0) == 0) { + // Get thread/subgroup information + auto local_id = int(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_local_id(0)); + + // ============================================ + // Use VNNI load instead of regular XE_LOAD_2D + // ============================================ + // Note: VNNI is typically used with make_block_2d_copy_B in GEMM context + // But for 
demonstration, we show the raw VNNI operation + using VnniOp = XE_LOAD_2D_VNNI; // BlockWidth = Width for single block + auto tiled_copy = make_block_2d_copy(VnniOp{}, src); + + // Get thread slice of the tiled copy + auto thr_copy = tiled_copy.get_slice(local_id); + + // Create coordinate tensor for a single tile + auto coord_shape = make_shape(Int{}, Int>{}); + Tensor coord_tile = make_identity_tensor(coord_shape); + + // Partition source coordinates and create destination fragment + auto thr_src_coord = thr_copy.partition_S(coord_tile); + auto thr_dst_frag = thr_copy.partition_fragment_D(coord_tile); + + // ============================================ + // THIS IS THE VNNI LOAD + // Hardware performs interleaving during this load + // Data in thr_dst_frag is now in VNNI interleaved format + // ============================================ + copy(tiled_copy, thr_src_coord, thr_dst_frag); + + // For verification, store back to destination + // Note: In real usage, thr_dst_frag would go directly to gemm(mma, tCrA, thr_dst_frag, tCrC) + using StoreOp = XE_STORE_2D; + auto tiled_store = make_block_2d_copy(StoreOp{}, dst); + auto thr_store = tiled_store.get_slice(local_id); + + // Create destination coordinates for the store operation + auto thr_dst_coord = thr_store.partition_D(coord_tile); + auto thr_src_frag = thr_store.partition_fragment_S(coord_tile); + + // Copy the loaded data from registers to the fragment for storing + copy(thr_dst_frag, thr_src_frag); + + // Perform the store operation from registers to global memory + copy(tiled_store, thr_src_frag, thr_dst_coord); + + // Synchronize to ensure all threads complete their operations + sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group()); + } } -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_WideConfigs) { - // Test 8-bit VNNI configurations with wider widths - using VNNIOp_8bit_2x64 = XE_LOAD_2D_VNNI<8, 2, 64>; - using VNNIOp_8bit_4x64 = XE_LOAD_2D_VNNI<8, 4, 64>; - using VNNIOp_8bit_8x64 = XE_LOAD_2D_VNNI<8, 8, 64>; - - static_assert(VNNIOp_8bit_2x64::CopyBits == 8); - static_assert(VNNIOp_8bit_2x64::AtomHeight == 2); - static_assert(VNNIOp_8bit_2x64::AtomWidth == 64); - - static_assert(VNNIOp_8bit_4x64::CopyBits == 8); - static_assert(VNNIOp_8bit_4x64::AtomHeight == 4); - static_assert(VNNIOp_8bit_4x64::AtomWidth == 64); - - static_assert(VNNIOp_8bit_8x64::CopyBits == 8); - static_assert(VNNIOp_8bit_8x64::AtomHeight == 8); - static_assert(VNNIOp_8bit_8x64::AtomWidth == 64); - - EXPECT_TRUE(true) << "8-bit wide VNNI configurations validated"; -} +// Host test function for VNNI load operation +template +void test_xe_vnni_load() { + using namespace cute; + + // Matrix dimensions - must be compatible with block 2D constraints + constexpr int M = Height; + constexpr int N = Width * sizeof_bits_v / Bits; + + // Ensure proper alignment (required for block 2D operations) + constexpr int elem_alignment = 16 / sizeof(Element); + constexpr int aligned_N = ((N + elem_alignment - 1) / elem_alignment) * elem_alignment; + + // Allocate and initialize host data + cutlass::host_vector host_src(M * aligned_N); + cutlass::host_vector host_dst(M * aligned_N); -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_MinimalConfigs) { - // Test minimal 16-bit VNNI configurations - using VNNIOp_16bit_1x16 = XE_LOAD_2D_VNNI<16, 1, 16>; - using VNNIOp_16bit_2x16 = XE_LOAD_2D_VNNI<16, 2, 16>; - using VNNIOp_16bit_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>; - - static_assert(VNNIOp_16bit_1x16::CopyBits == 16); - static_assert(VNNIOp_16bit_1x16::AtomHeight == 1); - 
static_assert(VNNIOp_16bit_1x16::AtomWidth == 16); - static_assert(VNNIOp_16bit_2x16::CopyBits == 16); - static_assert(VNNIOp_16bit_2x16::AtomHeight == 2); - static_assert(VNNIOp_16bit_2x16::AtomWidth == 16); - - static_assert(VNNIOp_16bit_4x16::CopyBits == 16); - static_assert(VNNIOp_16bit_4x16::AtomHeight == 4); - static_assert(VNNIOp_16bit_4x16::AtomWidth == 16); - - EXPECT_TRUE(true) << "16-bit minimal VNNI configurations validated"; -} + // Initialize source with test pattern + for (size_t i = 0; i < host_src.size(); ++i) { + // Use a safe conversion that works for all numeric types + if constexpr (std::is_floating_point_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + + // For floating-point types, convert through float + float val = static_cast(i % 256) / 255.0f; // Normalize to [0,1] + host_src[i] = Element(val); + } else { + // For integer types (including uint64_t) and char, direct conversion is safe + host_src[i] = static_cast(i % 256); + } + } -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_MediumConfigs) { - // Test medium-sized 16-bit VNNI configurations - using VNNIOp_16bit_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; - using VNNIOp_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; - - static_assert(VNNIOp_16bit_8x16::CopyBits == 16); - static_assert(VNNIOp_16bit_8x16::AtomHeight == 8); - static_assert(VNNIOp_16bit_8x16::AtomWidth == 16); - - static_assert(VNNIOp_16bit_16x16::CopyBits == 16); - static_assert(VNNIOp_16bit_16x16::AtomHeight == 16); - static_assert(VNNIOp_16bit_16x16::AtomWidth == 16); - - EXPECT_TRUE(true) << "16-bit medium VNNI configurations validated"; + // Copy to device + cutlass::device_vector device_src = host_src; + cutlass::device_vector device_dst(M * aligned_N); + + // Create tensors with proper layout + Tensor tensor_src = + make_tensor(make_gmem_ptr(device_src.data()), + make_layout(Shape, Int>{}, Stride, _1>{})); + + Tensor tensor_dst = + make_tensor(make_gmem_ptr(device_dst.data()), + make_layout(Shape, Int>{}, Stride, _1>{})); + + // Launch kernel - VNNI load demonstration + auto blockDim = compat::dim3(SUBGROUP_SIZE); + auto gridDim = compat::dim3(1); + + launch, + XEVnniLoadKernelName>( + launch_policy{ + gridDim, blockDim, + kernel_properties{sycl_exp::sub_group_size} + }, + tensor_src, tensor_dst); + + compat::wait_and_throw(); + + // Note: We do NOT verify data matches because VNNI performs interleaving transformation + // The loaded data is in VNNI format (hardware-interleaved for DPAS consumption) + // When stored back to memory, the interleaved pattern is visible + // In real usage, VNNI data goes directly to gemm()/DPAS, never stored back + // This test verifies that VNNI load operation executes without errors } +// ============================================ +// VNNI Tests - Only 8-bit and 16-bit supported +// ============================================ -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_WideConfigs) { - // Test 16-bit VNNI configurations with wider widths - using VNNIOp_16bit_2x32 = XE_LOAD_2D_VNNI<16, 2, 32>; - using VNNIOp_16bit_4x32 = XE_LOAD_2D_VNNI<16, 4, 32>; - using VNNIOp_16bit_8x32 = XE_LOAD_2D_VNNI<16, 8, 32>; - - static_assert(VNNIOp_16bit_2x32::CopyBits == 16); - static_assert(VNNIOp_16bit_2x32::AtomHeight == 2); - static_assert(VNNIOp_16bit_2x32::AtomWidth == 32); - - static_assert(VNNIOp_16bit_4x32::CopyBits == 16); - static_assert(VNNIOp_16bit_4x32::AtomHeight == 4); - static_assert(VNNIOp_16bit_4x32::AtomWidth == 32); - - static_assert(VNNIOp_16bit_8x32::CopyBits == 16); - static_assert(VNNIOp_16bit_8x32::AtomHeight == 8); 
- static_assert(VNNIOp_16bit_8x32::AtomWidth == 32); - - EXPECT_TRUE(true) << "16-bit wide VNNI configurations validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_uint8) { + // VNNI is used for B matrix in GEMM - typically with BlockWidth creating multiple blocks + test_xe_vnni_load(); // 4 blocks of 16 + test_xe_vnni_load(); // 2 blocks of 32 + test_xe_vnni_load(); // 1 block of 64 } - - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_CustomBlockWidth) { - // Test 8-bit VNNI with custom BlockWidth parameter - using VNNIOp_8bit_4x32_bw16 = XE_LOAD_2D_VNNI<8, 4, 32, 16>; - using VNNIOp_8bit_8x32_bw16 = XE_LOAD_2D_VNNI<8, 8, 32, 16>; - using VNNIOp_8bit_16x32_bw16 = XE_LOAD_2D_VNNI<8, 16, 32, 16>; - - static_assert(VNNIOp_8bit_4x32_bw16::CopyBits == 8); - static_assert(VNNIOp_8bit_4x32_bw16::AtomHeight == 4); - static_assert(VNNIOp_8bit_4x32_bw16::AtomWidth == 32); - - static_assert(VNNIOp_8bit_8x32_bw16::CopyBits == 8); - static_assert(VNNIOp_8bit_8x32_bw16::AtomHeight == 8); - static_assert(VNNIOp_8bit_8x32_bw16::AtomWidth == 32); - - static_assert(VNNIOp_8bit_16x32_bw16::CopyBits == 8); - static_assert(VNNIOp_8bit_16x32_bw16::AtomHeight == 16); - static_assert(VNNIOp_8bit_16x32_bw16::AtomWidth == 32); - - EXPECT_TRUE(true) << "8-bit VNNI with custom BlockWidth validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_int8) { + test_xe_vnni_load(); + test_xe_vnni_load(); + test_xe_vnni_load(); } -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_CustomBlockWidth) { - // Test 16-bit VNNI with custom BlockWidth parameter - using VNNIOp_16bit_4x16_bw8 = XE_LOAD_2D_VNNI<16, 4, 16, 8>; - using VNNIOp_16bit_8x16_bw8 = XE_LOAD_2D_VNNI<16, 8, 16, 8>; - using VNNIOp_16bit_16x16_bw8 = XE_LOAD_2D_VNNI<16, 16, 16, 8>; - - static_assert(VNNIOp_16bit_4x16_bw8::CopyBits == 16); - static_assert(VNNIOp_16bit_4x16_bw8::AtomHeight == 4); - static_assert(VNNIOp_16bit_4x16_bw8::AtomWidth == 16); - - static_assert(VNNIOp_16bit_8x16_bw8::CopyBits == 16); - static_assert(VNNIOp_16bit_8x16_bw8::AtomHeight == 8); - static_assert(VNNIOp_16bit_8x16_bw8::AtomWidth == 16); - - static_assert(VNNIOp_16bit_16x16_bw8::CopyBits == 16); - static_assert(VNNIOp_16bit_16x16_bw8::AtomHeight == 16); - static_assert(VNNIOp_16bit_16x16_bw8::AtomWidth == 16); - - EXPECT_TRUE(true) << "16-bit VNNI with custom BlockWidth validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_uint16) { + test_xe_vnni_load(); // 2 blocks of 16 + test_xe_vnni_load(); // 2 blocks of 16 + test_xe_vnni_load(); // 1 block of 32 } - - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Int8_GEMMConfigs) { - // Test typical int8 GEMM VNNI configurations for K-dimension packing - using GEMM_Int8_4x32 = XE_LOAD_2D_VNNI<8, 4, 32>; - using GEMM_Int8_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; - using GEMM_Int8_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; - using GEMM_Int8_32x32 = XE_LOAD_2D_VNNI<8, 32, 32>; - - static_assert(GEMM_Int8_4x32::CopyBits == 8); - static_assert(GEMM_Int8_4x32::AtomHeight == 4); - static_assert(GEMM_Int8_4x32::AtomWidth == 32); - - static_assert(GEMM_Int8_8x32::CopyBits == 8); - static_assert(GEMM_Int8_8x32::AtomHeight == 8); - static_assert(GEMM_Int8_8x32::AtomWidth == 32); - - static_assert(GEMM_Int8_16x32::CopyBits == 8); - static_assert(GEMM_Int8_16x32::AtomHeight == 16); - static_assert(GEMM_Int8_16x32::AtomWidth == 32); - - static_assert(GEMM_Int8_32x32::CopyBits == 8); - static_assert(GEMM_Int8_32x32::AtomHeight == 32); - static_assert(GEMM_Int8_32x32::AtomWidth == 32); - - EXPECT_TRUE(true) << "Int8 GEMM VNNI configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_BF16_GEMMConfigs) { - // Test typical BF16/FP16 GEMM VNNI 
configurations for K-dimension packing - using GEMM_BF16_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>; - using GEMM_BF16_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; - using GEMM_BF16_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; - using GEMM_BF16_32x16 = XE_LOAD_2D_VNNI<16, 32, 16>; - - static_assert(GEMM_BF16_4x16::CopyBits == 16); - static_assert(GEMM_BF16_4x16::AtomHeight == 4); - static_assert(GEMM_BF16_4x16::AtomWidth == 16); - - static_assert(GEMM_BF16_8x16::CopyBits == 16); - static_assert(GEMM_BF16_8x16::AtomHeight == 8); - static_assert(GEMM_BF16_8x16::AtomWidth == 16); - - static_assert(GEMM_BF16_16x16::CopyBits == 16); - static_assert(GEMM_BF16_16x16::AtomHeight == 16); - static_assert(GEMM_BF16_16x16::AtomWidth == 16); - - static_assert(GEMM_BF16_32x16::CopyBits == 16); - static_assert(GEMM_BF16_32x16::AtomHeight == 32); - static_assert(GEMM_BF16_32x16::AtomWidth == 16); - - EXPECT_TRUE(true) << "BF16/FP16 GEMM VNNI configurations validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_int16) { + test_xe_vnni_load(); + test_xe_vnni_load(); + test_xe_vnni_load(); } -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MoE_GEMMConfigs) { - // Test VNNI configurations used in MoE (Mixture of Experts) GEMM - // Based on example from 12_bmg_moe_gemm_cute_interface - using MoE_Load_A = XE_LOAD_2D_VNNI<16, 32, 16, 16>; - using MoE_Load_B_Alt1 = XE_LOAD_2D_VNNI<16, 16, 16>; - using MoE_Load_B_Alt2 = XE_LOAD_2D_VNNI<16, 8, 16>; - - static_assert(MoE_Load_A::CopyBits == 16); - static_assert(MoE_Load_A::AtomHeight == 32); - static_assert(MoE_Load_A::AtomWidth == 16); - - static_assert(MoE_Load_B_Alt1::CopyBits == 16); - static_assert(MoE_Load_B_Alt1::AtomHeight == 16); - static_assert(MoE_Load_B_Alt1::AtomWidth == 16); - - static_assert(MoE_Load_B_Alt2::CopyBits == 16); - static_assert(MoE_Load_B_Alt2::AtomHeight == 8); - static_assert(MoE_Load_B_Alt2::AtomWidth == 16); - - EXPECT_TRUE(true) << "MoE GEMM VNNI configurations validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_half) { + test_xe_vnni_load(); + test_xe_vnni_load(); + test_xe_vnni_load(); } - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MixedBlockWidthConfigs) { - // Test various BlockWidth settings for optimization - using VNNIOp_8bit_8x64_bw32 = XE_LOAD_2D_VNNI<8, 8, 64, 32>; - using VNNIOp_8bit_16x64_bw32 = XE_LOAD_2D_VNNI<8, 16, 64, 32>; - using VNNIOp_16bit_8x32_bw16 = XE_LOAD_2D_VNNI<16, 8, 32, 16>; - using VNNIOp_16bit_16x32_bw16 = XE_LOAD_2D_VNNI<16, 16, 32, 16>; - - static_assert(VNNIOp_8bit_8x64_bw32::CopyBits == 8); - static_assert(VNNIOp_8bit_8x64_bw32::AtomHeight == 8); - static_assert(VNNIOp_8bit_8x64_bw32::AtomWidth == 64); - - static_assert(VNNIOp_8bit_16x64_bw32::CopyBits == 8); - static_assert(VNNIOp_8bit_16x64_bw32::AtomHeight == 16); - static_assert(VNNIOp_8bit_16x64_bw32::AtomWidth == 64); - - static_assert(VNNIOp_16bit_8x32_bw16::CopyBits == 16); - static_assert(VNNIOp_16bit_8x32_bw16::AtomHeight == 8); - static_assert(VNNIOp_16bit_8x32_bw16::AtomWidth == 32); - - static_assert(VNNIOp_16bit_16x32_bw16::CopyBits == 16); - static_assert(VNNIOp_16bit_16x32_bw16::AtomHeight == 16); - static_assert(VNNIOp_16bit_16x32_bw16::AtomWidth == 32); - - EXPECT_TRUE(true) << "Mixed BlockWidth VNNI configurations validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_bfloat16) { + test_xe_vnni_load(); + test_xe_vnni_load(); + test_xe_vnni_load(); } - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_DataType_Consistency) { - // Test that CopyBits correctly reflects the data size - using VNNIOp_8bit_small = XE_LOAD_2D_VNNI<8, 2, 16>; - using VNNIOp_8bit_large = XE_LOAD_2D_VNNI<8, 16, 64>; - using VNNIOp_16bit_small = XE_LOAD_2D_VNNI<16, 2, 
16>; - using VNNIOp_16bit_large = XE_LOAD_2D_VNNI<16, 16, 32>; - - // All 8-bit variants should have CopyBits == 8 - static_assert(VNNIOp_8bit_small::CopyBits == 8); - static_assert(VNNIOp_8bit_large::CopyBits == 8); - - // All 16-bit variants should have CopyBits == 16 - static_assert(VNNIOp_16bit_small::CopyBits == 16); - static_assert(VNNIOp_16bit_large::CopyBits == 16); - - EXPECT_TRUE(true) << "Data type consistency validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_BlockWidth_Divisors) { - // Test BlockWidth as divisors of Width - using VNNIOp_8bit_8x32_bw8 = XE_LOAD_2D_VNNI<8, 8, 32, 8>; - using VNNIOp_8bit_8x32_bw16 = XE_LOAD_2D_VNNI<8, 8, 32, 16>; - using VNNIOp_8bit_8x64_bw16 = XE_LOAD_2D_VNNI<8, 8, 64, 16>; - using VNNIOp_8bit_8x64_bw32 = XE_LOAD_2D_VNNI<8, 8, 64, 32>; - using VNNIOp_16bit_8x32_bw8 = XE_LOAD_2D_VNNI<16, 8, 32, 8>; - using VNNIOp_16bit_8x32_bw16 = XE_LOAD_2D_VNNI<16, 8, 32, 16>; - - static_assert(VNNIOp_8bit_8x32_bw8::AtomHeight == 8 && VNNIOp_8bit_8x32_bw8::AtomWidth == 32); - static_assert(VNNIOp_8bit_8x32_bw16::AtomHeight == 8 && VNNIOp_8bit_8x32_bw16::AtomWidth == 32); - static_assert(VNNIOp_8bit_8x64_bw16::AtomHeight == 8 && VNNIOp_8bit_8x64_bw16::AtomWidth == 64); - static_assert(VNNIOp_8bit_8x64_bw32::AtomHeight == 8 && VNNIOp_8bit_8x64_bw32::AtomWidth == 64); - static_assert(VNNIOp_16bit_8x32_bw8::AtomHeight == 8 && VNNIOp_16bit_8x32_bw8::AtomWidth == 32); - static_assert(VNNIOp_16bit_8x32_bw16::AtomHeight == 8 && VNNIOp_16bit_8x32_bw16::AtomWidth == 32); - - EXPECT_TRUE(true) << "BlockWidth divisor configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Symmetric_Configs) { - // Test configurations with matching height and width values - using VNNIOp_8bit_16x16 = XE_LOAD_2D_VNNI<8, 16, 16>; - using VNNIOp_8bit_32x32 = XE_LOAD_2D_VNNI<8, 32, 32>; - using VNNIOp_16bit_8x8 = XE_LOAD_2D_VNNI<16, 8, 8>; - using VNNIOp_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; - using VNNIOp_16bit_32x32 = XE_LOAD_2D_VNNI<16, 32, 32>; - - static_assert(VNNIOp_8bit_16x16::AtomHeight == 16 && VNNIOp_8bit_16x16::AtomWidth == 16); - static_assert(VNNIOp_8bit_32x32::AtomHeight == 32 && VNNIOp_8bit_32x32::AtomWidth == 32); - static_assert(VNNIOp_16bit_8x8::AtomHeight == 8 && VNNIOp_16bit_8x8::AtomWidth == 8); - static_assert(VNNIOp_16bit_16x16::AtomHeight == 16 && VNNIOp_16bit_16x16::AtomWidth == 16); - static_assert(VNNIOp_16bit_32x32::AtomHeight == 32 && VNNIOp_16bit_32x32::AtomWidth == 32); - - EXPECT_TRUE(true) << "Symmetric VNNI configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Small_Tiles) { - // Test small tile configurations useful for residual/boundary handling - using VNNIOp_8bit_1x16 = XE_LOAD_2D_VNNI<8, 1, 16>; - using VNNIOp_8bit_2x16 = XE_LOAD_2D_VNNI<8, 2, 16>; - using VNNIOp_8bit_1x32 = XE_LOAD_2D_VNNI<8, 1, 32>; - using VNNIOp_16bit_1x8 = XE_LOAD_2D_VNNI<16, 1, 8>; - using VNNIOp_16bit_2x8 = XE_LOAD_2D_VNNI<16, 2, 8>; - using VNNIOp_16bit_1x16 = XE_LOAD_2D_VNNI<16, 1, 16>; - - static_assert(VNNIOp_8bit_1x16::AtomHeight == 1 && VNNIOp_8bit_1x16::AtomWidth == 16); - static_assert(VNNIOp_8bit_2x16::AtomHeight == 2 && VNNIOp_8bit_2x16::AtomWidth == 16); - static_assert(VNNIOp_8bit_1x32::AtomHeight == 1 && VNNIOp_8bit_1x32::AtomWidth == 32); - static_assert(VNNIOp_16bit_1x8::AtomHeight == 1 && VNNIOp_16bit_1x8::AtomWidth == 8); - static_assert(VNNIOp_16bit_2x8::AtomHeight == 2 && VNNIOp_16bit_2x8::AtomWidth == 8); - static_assert(VNNIOp_16bit_1x16::AtomHeight == 1 && VNNIOp_16bit_1x16::AtomWidth == 16); - - EXPECT_TRUE(true) << "Small tile VNNI 
configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MatMul_Optimized) { - // Test configurations optimized for matrix multiplication (DPAS integration) - // Based on typical DPAS dimensions: N=16 for all, K varies by data type - using MatMul_8bit_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; // K=32 for int8 - using MatMul_8bit_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; // M=16 tile - using MatMul_16bit_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; // K=16 for bf16/fp16 - using MatMul_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; // M=16, N=16 for bf16/fp16 - using MatMul_16bit_32x16 = XE_LOAD_2D_VNNI<16, 32, 16>; // Larger M tile - - // Verify dimensions match DPAS requirements - static_assert(MatMul_8bit_8x32::CopyBits == 8); - static_assert(MatMul_8bit_8x32::AtomHeight == 8); - static_assert(MatMul_8bit_8x32::AtomWidth == 32); // Matches int8 DPAS K dimension - - static_assert(MatMul_8bit_16x32::CopyBits == 8); - static_assert(MatMul_8bit_16x32::AtomHeight == 16); - static_assert(MatMul_8bit_16x32::AtomWidth == 32); - - static_assert(MatMul_16bit_8x16::CopyBits == 16); - static_assert(MatMul_16bit_8x16::AtomHeight == 8); - static_assert(MatMul_16bit_8x16::AtomWidth == 16); // Matches bf16/fp16 DPAS K dimension - - static_assert(MatMul_16bit_16x16::CopyBits == 16); - static_assert(MatMul_16bit_16x16::AtomHeight == 16); - static_assert(MatMul_16bit_16x16::AtomWidth == 16); - - static_assert(MatMul_16bit_32x16::CopyBits == 16); - static_assert(MatMul_16bit_32x16::AtomHeight == 32); - static_assert(MatMul_16bit_32x16::AtomWidth == 16); - - EXPECT_TRUE(true) << "MatMul-optimized VNNI configurations validated"; -} +// Note: 32-bit and 64-bit types are NOT supported by VNNI +// VNNI only works with 8-bit and 16-bit data types #else -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_SKIPPED) { - GTEST_SKIP() << "XE_LOAD_2D_VNNI tests require IGC version 2.18 or higher. skipped"; +// For the fallback case +#include "cutlass_unit_test.h" + +TEST(PVC_CuTe_Xe, XE_VNNI_2D_SKIPPED) { + GTEST_SKIP() << "XE_VNNI_2D tests require IGC version 2.18 or higher. 
skipped"; } #endif From 042620ec182980d0166f9656ee13958640816ee5 Mon Sep 17 00:00:00 2001 From: Rishi Yadav Date: Tue, 9 Dec 2025 20:44:30 +0530 Subject: [PATCH 6/7] Update CMakeLists.txt --- examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt b/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt index 0e79d95484..1b3830649e 100644 --- a/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt +++ b/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt @@ -69,7 +69,7 @@ cutlass_example_add_executable( if(NOT DPCPP_SYCL_TARGET STREQUAL "spir64") # TODO(codeplay): Remove these once IGC block load loop hoisting bug is fixed target_link_options( 02_bmg_gemm_f16_s8_f16_tensorwise PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) - target_link_options( 02_bmg_gemm_bf16_s8_bf16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) + # target_link_options( 02_bmg_gemm_bf16_s8_bf16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) target_link_options( 02_bmg_gemm_f16_u4_f16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) target_link_options( 02_bmg_gemm_f16_u4_s8 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) endif() From 5c2c32bfd8935c12f8e8a6c73cf2409b082c745d Mon Sep 17 00:00:00 2001 From: Rishi Yadav Date: Tue, 9 Dec 2025 22:24:14 +0530 Subject: [PATCH 7/7] Update CMakeLists.txt --- examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt b/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt index 1b3830649e..0e79d95484 100644 --- a/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt +++ b/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt @@ -69,7 +69,7 @@ cutlass_example_add_executable( if(NOT DPCPP_SYCL_TARGET STREQUAL "spir64") # TODO(codeplay): Remove these once IGC block load loop hoisting bug is fixed target_link_options( 02_bmg_gemm_f16_s8_f16_tensorwise PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) - # target_link_options( 02_bmg_gemm_bf16_s8_bf16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) + target_link_options( 02_bmg_gemm_bf16_s8_bf16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) target_link_options( 02_bmg_gemm_f16_u4_f16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) target_link_options( 02_bmg_gemm_f16_u4_s8 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) endif()