From c99c7b517619b49cf3b93723ba2547d7b68de84b Mon Sep 17 00:00:00 2001 From: Rishi Yadav Date: Fri, 5 Dec 2025 10:19:27 +0530 Subject: [PATCH 1/7] Update xe_vnni_2d.cpp --- test/unit/cute/intel_xe/xe_vnni_2d.cpp | 371 +++++++++++++++++++++++++ 1 file changed, 371 insertions(+) diff --git a/test/unit/cute/intel_xe/xe_vnni_2d.cpp b/test/unit/cute/intel_xe/xe_vnni_2d.cpp index 2112e474b0..5bc36338b7 100644 --- a/test/unit/cute/intel_xe/xe_vnni_2d.cpp +++ b/test/unit/cute/intel_xe/xe_vnni_2d.cpp @@ -60,6 +60,377 @@ TEST(CuTe_Xe, XE_LOAD_2D_VNNI_API_Declaration) { EXPECT_TRUE(true) << "XE_LOAD_2D_VNNI API types declared successfully"; } +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_MinimalConfigs) { + // Test minimal 8-bit VNNI configurations + using VNNIOp_8bit_1x32 = XE_LOAD_2D_VNNI<8, 1, 32>; + using VNNIOp_8bit_2x32 = XE_LOAD_2D_VNNI<8, 2, 32>; + using VNNIOp_8bit_4x32 = XE_LOAD_2D_VNNI<8, 4, 32>; + + static_assert(VNNIOp_8bit_1x32::CopyBits == 8); + static_assert(VNNIOp_8bit_1x32::AtomHeight == 1); + static_assert(VNNIOp_8bit_1x32::AtomWidth == 32); + + static_assert(VNNIOp_8bit_2x32::CopyBits == 8); + static_assert(VNNIOp_8bit_2x32::AtomHeight == 2); + static_assert(VNNIOp_8bit_2x32::AtomWidth == 32); + + static_assert(VNNIOp_8bit_4x32::CopyBits == 8); + static_assert(VNNIOp_8bit_4x32::AtomHeight == 4); + static_assert(VNNIOp_8bit_4x32::AtomWidth == 32); + + EXPECT_TRUE(true) << "8-bit minimal VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_MediumConfigs) { + // Test medium-sized 8-bit VNNI configurations + using VNNIOp_8bit_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; + using VNNIOp_8bit_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; + + static_assert(VNNIOp_8bit_8x32::CopyBits == 8); + static_assert(VNNIOp_8bit_8x32::AtomHeight == 8); + static_assert(VNNIOp_8bit_8x32::AtomWidth == 32); + + static_assert(VNNIOp_8bit_16x32::CopyBits == 8); + static_assert(VNNIOp_8bit_16x32::AtomHeight == 16); + static_assert(VNNIOp_8bit_16x32::AtomWidth == 32); + + EXPECT_TRUE(true) << "8-bit medium VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_WideConfigs) { + // Test 8-bit VNNI configurations with wider widths + using VNNIOp_8bit_2x64 = XE_LOAD_2D_VNNI<8, 2, 64>; + using VNNIOp_8bit_4x64 = XE_LOAD_2D_VNNI<8, 4, 64>; + using VNNIOp_8bit_8x64 = XE_LOAD_2D_VNNI<8, 8, 64>; + + static_assert(VNNIOp_8bit_2x64::CopyBits == 8); + static_assert(VNNIOp_8bit_2x64::AtomHeight == 2); + static_assert(VNNIOp_8bit_2x64::AtomWidth == 64); + + static_assert(VNNIOp_8bit_4x64::CopyBits == 8); + static_assert(VNNIOp_8bit_4x64::AtomHeight == 4); + static_assert(VNNIOp_8bit_4x64::AtomWidth == 64); + + static_assert(VNNIOp_8bit_8x64::CopyBits == 8); + static_assert(VNNIOp_8bit_8x64::AtomHeight == 8); + static_assert(VNNIOp_8bit_8x64::AtomWidth == 64); + + EXPECT_TRUE(true) << "8-bit wide VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_MinimalConfigs) { + // Test minimal 16-bit VNNI configurations + using VNNIOp_16bit_1x16 = XE_LOAD_2D_VNNI<16, 1, 16>; + using VNNIOp_16bit_2x16 = XE_LOAD_2D_VNNI<16, 2, 16>; + using VNNIOp_16bit_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>; + + static_assert(VNNIOp_16bit_1x16::CopyBits == 16); + static_assert(VNNIOp_16bit_1x16::AtomHeight == 1); + static_assert(VNNIOp_16bit_1x16::AtomWidth == 16); + + static_assert(VNNIOp_16bit_2x16::CopyBits == 16); + static_assert(VNNIOp_16bit_2x16::AtomHeight == 2); + static_assert(VNNIOp_16bit_2x16::AtomWidth == 16); + + static_assert(VNNIOp_16bit_4x16::CopyBits == 16); + static_assert(VNNIOp_16bit_4x16::AtomHeight == 4); + 
static_assert(VNNIOp_16bit_4x16::AtomWidth == 16); + + EXPECT_TRUE(true) << "16-bit minimal VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_MediumConfigs) { + // Test medium-sized 16-bit VNNI configurations + using VNNIOp_16bit_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; + using VNNIOp_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; + + static_assert(VNNIOp_16bit_8x16::CopyBits == 16); + static_assert(VNNIOp_16bit_8x16::AtomHeight == 8); + static_assert(VNNIOp_16bit_8x16::AtomWidth == 16); + + static_assert(VNNIOp_16bit_16x16::CopyBits == 16); + static_assert(VNNIOp_16bit_16x16::AtomHeight == 16); + static_assert(VNNIOp_16bit_16x16::AtomWidth == 16); + + EXPECT_TRUE(true) << "16-bit medium VNNI configurations validated"; +} + + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_WideConfigs) { + // Test 16-bit VNNI configurations with wider widths + using VNNIOp_16bit_2x32 = XE_LOAD_2D_VNNI<16, 2, 32>; + using VNNIOp_16bit_4x32 = XE_LOAD_2D_VNNI<16, 4, 32>; + using VNNIOp_16bit_8x32 = XE_LOAD_2D_VNNI<16, 8, 32>; + + static_assert(VNNIOp_16bit_2x32::CopyBits == 16); + static_assert(VNNIOp_16bit_2x32::AtomHeight == 2); + static_assert(VNNIOp_16bit_2x32::AtomWidth == 32); + + static_assert(VNNIOp_16bit_4x32::CopyBits == 16); + static_assert(VNNIOp_16bit_4x32::AtomHeight == 4); + static_assert(VNNIOp_16bit_4x32::AtomWidth == 32); + + static_assert(VNNIOp_16bit_8x32::CopyBits == 16); + static_assert(VNNIOp_16bit_8x32::AtomHeight == 8); + static_assert(VNNIOp_16bit_8x32::AtomWidth == 32); + + EXPECT_TRUE(true) << "16-bit wide VNNI configurations validated"; +} + + + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_CustomBlockWidth) { + // Test 8-bit VNNI with custom BlockWidth parameter + using VNNIOp_8bit_4x32_bw16 = XE_LOAD_2D_VNNI<8, 4, 32, 16>; + using VNNIOp_8bit_8x32_bw16 = XE_LOAD_2D_VNNI<8, 8, 32, 16>; + using VNNIOp_8bit_16x32_bw16 = XE_LOAD_2D_VNNI<8, 16, 32, 16>; + + static_assert(VNNIOp_8bit_4x32_bw16::CopyBits == 8); + static_assert(VNNIOp_8bit_4x32_bw16::AtomHeight == 4); + static_assert(VNNIOp_8bit_4x32_bw16::AtomWidth == 32); + + static_assert(VNNIOp_8bit_8x32_bw16::CopyBits == 8); + static_assert(VNNIOp_8bit_8x32_bw16::AtomHeight == 8); + static_assert(VNNIOp_8bit_8x32_bw16::AtomWidth == 32); + + static_assert(VNNIOp_8bit_16x32_bw16::CopyBits == 8); + static_assert(VNNIOp_8bit_16x32_bw16::AtomHeight == 16); + static_assert(VNNIOp_8bit_16x32_bw16::AtomWidth == 32); + + EXPECT_TRUE(true) << "8-bit VNNI with custom BlockWidth validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_CustomBlockWidth) { + // Test 16-bit VNNI with custom BlockWidth parameter + using VNNIOp_16bit_4x16_bw8 = XE_LOAD_2D_VNNI<16, 4, 16, 8>; + using VNNIOp_16bit_8x16_bw8 = XE_LOAD_2D_VNNI<16, 8, 16, 8>; + using VNNIOp_16bit_16x16_bw8 = XE_LOAD_2D_VNNI<16, 16, 16, 8>; + + static_assert(VNNIOp_16bit_4x16_bw8::CopyBits == 16); + static_assert(VNNIOp_16bit_4x16_bw8::AtomHeight == 4); + static_assert(VNNIOp_16bit_4x16_bw8::AtomWidth == 16); + + static_assert(VNNIOp_16bit_8x16_bw8::CopyBits == 16); + static_assert(VNNIOp_16bit_8x16_bw8::AtomHeight == 8); + static_assert(VNNIOp_16bit_8x16_bw8::AtomWidth == 16); + + static_assert(VNNIOp_16bit_16x16_bw8::CopyBits == 16); + static_assert(VNNIOp_16bit_16x16_bw8::AtomHeight == 16); + static_assert(VNNIOp_16bit_16x16_bw8::AtomWidth == 16); + + EXPECT_TRUE(true) << "16-bit VNNI with custom BlockWidth validated"; +} + + + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Int8_GEMMConfigs) { + // Test typical int8 GEMM VNNI configurations for K-dimension packing + using GEMM_Int8_4x32 = XE_LOAD_2D_VNNI<8, 4, 
32>; + using GEMM_Int8_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; + using GEMM_Int8_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; + using GEMM_Int8_32x32 = XE_LOAD_2D_VNNI<8, 32, 32>; + + static_assert(GEMM_Int8_4x32::CopyBits == 8); + static_assert(GEMM_Int8_4x32::AtomHeight == 4); + static_assert(GEMM_Int8_4x32::AtomWidth == 32); + + static_assert(GEMM_Int8_8x32::CopyBits == 8); + static_assert(GEMM_Int8_8x32::AtomHeight == 8); + static_assert(GEMM_Int8_8x32::AtomWidth == 32); + + static_assert(GEMM_Int8_16x32::CopyBits == 8); + static_assert(GEMM_Int8_16x32::AtomHeight == 16); + static_assert(GEMM_Int8_16x32::AtomWidth == 32); + + static_assert(GEMM_Int8_32x32::CopyBits == 8); + static_assert(GEMM_Int8_32x32::AtomHeight == 32); + static_assert(GEMM_Int8_32x32::AtomWidth == 32); + + EXPECT_TRUE(true) << "Int8 GEMM VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_BF16_GEMMConfigs) { + // Test typical BF16/FP16 GEMM VNNI configurations for K-dimension packing + using GEMM_BF16_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>; + using GEMM_BF16_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; + using GEMM_BF16_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; + using GEMM_BF16_32x16 = XE_LOAD_2D_VNNI<16, 32, 16>; + + static_assert(GEMM_BF16_4x16::CopyBits == 16); + static_assert(GEMM_BF16_4x16::AtomHeight == 4); + static_assert(GEMM_BF16_4x16::AtomWidth == 16); + + static_assert(GEMM_BF16_8x16::CopyBits == 16); + static_assert(GEMM_BF16_8x16::AtomHeight == 8); + static_assert(GEMM_BF16_8x16::AtomWidth == 16); + + static_assert(GEMM_BF16_16x16::CopyBits == 16); + static_assert(GEMM_BF16_16x16::AtomHeight == 16); + static_assert(GEMM_BF16_16x16::AtomWidth == 16); + + static_assert(GEMM_BF16_32x16::CopyBits == 16); + static_assert(GEMM_BF16_32x16::AtomHeight == 32); + static_assert(GEMM_BF16_32x16::AtomWidth == 16); + + EXPECT_TRUE(true) << "BF16/FP16 GEMM VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MoE_GEMMConfigs) { + // Test VNNI configurations used in MoE (Mixture of Experts) GEMM + // Based on example from 12_bmg_moe_gemm_cute_interface + using MoE_Load_A = XE_LOAD_2D_VNNI<16, 32, 16, 16>; + using MoE_Load_B_Alt1 = XE_LOAD_2D_VNNI<16, 16, 16>; + using MoE_Load_B_Alt2 = XE_LOAD_2D_VNNI<16, 8, 16>; + + static_assert(MoE_Load_A::CopyBits == 16); + static_assert(MoE_Load_A::AtomHeight == 32); + static_assert(MoE_Load_A::AtomWidth == 16); + + static_assert(MoE_Load_B_Alt1::CopyBits == 16); + static_assert(MoE_Load_B_Alt1::AtomHeight == 16); + static_assert(MoE_Load_B_Alt1::AtomWidth == 16); + + static_assert(MoE_Load_B_Alt2::CopyBits == 16); + static_assert(MoE_Load_B_Alt2::AtomHeight == 8); + static_assert(MoE_Load_B_Alt2::AtomWidth == 16); + + EXPECT_TRUE(true) << "MoE GEMM VNNI configurations validated"; +} + + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MixedBlockWidthConfigs) { + // Test various BlockWidth settings for optimization + using VNNIOp_8bit_8x64_bw32 = XE_LOAD_2D_VNNI<8, 8, 64, 32>; + using VNNIOp_8bit_16x64_bw32 = XE_LOAD_2D_VNNI<8, 16, 64, 32>; + using VNNIOp_16bit_8x32_bw16 = XE_LOAD_2D_VNNI<16, 8, 32, 16>; + using VNNIOp_16bit_16x32_bw16 = XE_LOAD_2D_VNNI<16, 16, 32, 16>; + + static_assert(VNNIOp_8bit_8x64_bw32::CopyBits == 8); + static_assert(VNNIOp_8bit_8x64_bw32::AtomHeight == 8); + static_assert(VNNIOp_8bit_8x64_bw32::AtomWidth == 64); + + static_assert(VNNIOp_8bit_16x64_bw32::CopyBits == 8); + static_assert(VNNIOp_8bit_16x64_bw32::AtomHeight == 16); + static_assert(VNNIOp_8bit_16x64_bw32::AtomWidth == 64); + + static_assert(VNNIOp_16bit_8x32_bw16::CopyBits == 16); + 
static_assert(VNNIOp_16bit_8x32_bw16::AtomHeight == 8); + static_assert(VNNIOp_16bit_8x32_bw16::AtomWidth == 32); + + static_assert(VNNIOp_16bit_16x32_bw16::CopyBits == 16); + static_assert(VNNIOp_16bit_16x32_bw16::AtomHeight == 16); + static_assert(VNNIOp_16bit_16x32_bw16::AtomWidth == 32); + + EXPECT_TRUE(true) << "Mixed BlockWidth VNNI configurations validated"; +} + + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_DataType_Consistency) { + // Test that CopyBits correctly reflects the data size + using VNNIOp_8bit_small = XE_LOAD_2D_VNNI<8, 2, 16>; + using VNNIOp_8bit_large = XE_LOAD_2D_VNNI<8, 16, 64>; + using VNNIOp_16bit_small = XE_LOAD_2D_VNNI<16, 2, 16>; + using VNNIOp_16bit_large = XE_LOAD_2D_VNNI<16, 16, 32>; + + // All 8-bit variants should have CopyBits == 8 + static_assert(VNNIOp_8bit_small::CopyBits == 8); + static_assert(VNNIOp_8bit_large::CopyBits == 8); + + // All 16-bit variants should have CopyBits == 16 + static_assert(VNNIOp_16bit_small::CopyBits == 16); + static_assert(VNNIOp_16bit_large::CopyBits == 16); + + EXPECT_TRUE(true) << "Data type consistency validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_BlockWidth_Divisors) { + // Test BlockWidth as divisors of Width + using VNNIOp_8bit_8x32_bw8 = XE_LOAD_2D_VNNI<8, 8, 32, 8>; + using VNNIOp_8bit_8x32_bw16 = XE_LOAD_2D_VNNI<8, 8, 32, 16>; + using VNNIOp_8bit_8x64_bw16 = XE_LOAD_2D_VNNI<8, 8, 64, 16>; + using VNNIOp_8bit_8x64_bw32 = XE_LOAD_2D_VNNI<8, 8, 64, 32>; + using VNNIOp_16bit_8x32_bw8 = XE_LOAD_2D_VNNI<16, 8, 32, 8>; + using VNNIOp_16bit_8x32_bw16 = XE_LOAD_2D_VNNI<16, 8, 32, 16>; + + static_assert(VNNIOp_8bit_8x32_bw8::AtomHeight == 8 && VNNIOp_8bit_8x32_bw8::AtomWidth == 32); + static_assert(VNNIOp_8bit_8x32_bw16::AtomHeight == 8 && VNNIOp_8bit_8x32_bw16::AtomWidth == 32); + static_assert(VNNIOp_8bit_8x64_bw16::AtomHeight == 8 && VNNIOp_8bit_8x64_bw16::AtomWidth == 64); + static_assert(VNNIOp_8bit_8x64_bw32::AtomHeight == 8 && VNNIOp_8bit_8x64_bw32::AtomWidth == 64); + static_assert(VNNIOp_16bit_8x32_bw8::AtomHeight == 8 && VNNIOp_16bit_8x32_bw8::AtomWidth == 32); + static_assert(VNNIOp_16bit_8x32_bw16::AtomHeight == 8 && VNNIOp_16bit_8x32_bw16::AtomWidth == 32); + + EXPECT_TRUE(true) << "BlockWidth divisor configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Symmetric_Configs) { + // Test configurations with matching height and width values + using VNNIOp_8bit_16x16 = XE_LOAD_2D_VNNI<8, 16, 16>; + using VNNIOp_8bit_32x32 = XE_LOAD_2D_VNNI<8, 32, 32>; + using VNNIOp_16bit_8x8 = XE_LOAD_2D_VNNI<16, 8, 8>; + using VNNIOp_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; + using VNNIOp_16bit_32x32 = XE_LOAD_2D_VNNI<16, 32, 32>; + + static_assert(VNNIOp_8bit_16x16::AtomHeight == 16 && VNNIOp_8bit_16x16::AtomWidth == 16); + static_assert(VNNIOp_8bit_32x32::AtomHeight == 32 && VNNIOp_8bit_32x32::AtomWidth == 32); + static_assert(VNNIOp_16bit_8x8::AtomHeight == 8 && VNNIOp_16bit_8x8::AtomWidth == 8); + static_assert(VNNIOp_16bit_16x16::AtomHeight == 16 && VNNIOp_16bit_16x16::AtomWidth == 16); + static_assert(VNNIOp_16bit_32x32::AtomHeight == 32 && VNNIOp_16bit_32x32::AtomWidth == 32); + + EXPECT_TRUE(true) << "Symmetric VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Small_Tiles) { + // Test small tile configurations useful for residual/boundary handling + using VNNIOp_8bit_1x16 = XE_LOAD_2D_VNNI<8, 1, 16>; + using VNNIOp_8bit_2x16 = XE_LOAD_2D_VNNI<8, 2, 16>; + using VNNIOp_8bit_1x32 = XE_LOAD_2D_VNNI<8, 1, 32>; + using VNNIOp_16bit_1x8 = XE_LOAD_2D_VNNI<16, 1, 8>; + using VNNIOp_16bit_2x8 = XE_LOAD_2D_VNNI<16, 
2, 8>; + using VNNIOp_16bit_1x16 = XE_LOAD_2D_VNNI<16, 1, 16>; + + static_assert(VNNIOp_8bit_1x16::AtomHeight == 1 && VNNIOp_8bit_1x16::AtomWidth == 16); + static_assert(VNNIOp_8bit_2x16::AtomHeight == 2 && VNNIOp_8bit_2x16::AtomWidth == 16); + static_assert(VNNIOp_8bit_1x32::AtomHeight == 1 && VNNIOp_8bit_1x32::AtomWidth == 32); + static_assert(VNNIOp_16bit_1x8::AtomHeight == 1 && VNNIOp_16bit_1x8::AtomWidth == 8); + static_assert(VNNIOp_16bit_2x8::AtomHeight == 2 && VNNIOp_16bit_2x8::AtomWidth == 8); + static_assert(VNNIOp_16bit_1x16::AtomHeight == 1 && VNNIOp_16bit_1x16::AtomWidth == 16); + + EXPECT_TRUE(true) << "Small tile VNNI configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MatMul_Optimized) { + // Test configurations optimized for matrix multiplication (DPAS integration) + // Based on typical DPAS dimensions: N=16 for all, K varies by data type + using MatMul_8bit_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; // K=32 for int8 + using MatMul_8bit_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; // M=16 tile + using MatMul_16bit_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; // K=16 for bf16/fp16 + using MatMul_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; // M=16, N=16 for bf16/fp16 + using MatMul_16bit_32x16 = XE_LOAD_2D_VNNI<16, 32, 16>; // Larger M tile + + // Verify dimensions match DPAS requirements + static_assert(MatMul_8bit_8x32::CopyBits == 8); + static_assert(MatMul_8bit_8x32::AtomHeight == 8); + static_assert(MatMul_8bit_8x32::AtomWidth == 32); // Matches int8 DPAS K dimension + + static_assert(MatMul_8bit_16x32::CopyBits == 8); + static_assert(MatMul_8bit_16x32::AtomHeight == 16); + static_assert(MatMul_8bit_16x32::AtomWidth == 32); + + static_assert(MatMul_16bit_8x16::CopyBits == 16); + static_assert(MatMul_16bit_8x16::AtomHeight == 8); + static_assert(MatMul_16bit_8x16::AtomWidth == 16); // Matches bf16/fp16 DPAS K dimension + + static_assert(MatMul_16bit_16x16::CopyBits == 16); + static_assert(MatMul_16bit_16x16::AtomHeight == 16); + static_assert(MatMul_16bit_16x16::AtomWidth == 16); + + static_assert(MatMul_16bit_32x16::CopyBits == 16); + static_assert(MatMul_16bit_32x16::AtomHeight == 32); + static_assert(MatMul_16bit_32x16::AtomWidth == 16); + + EXPECT_TRUE(true) << "MatMul-optimized VNNI configurations validated"; +} + #else TEST(CuTe_Xe, XE_LOAD_2D_VNNI_SKIPPED) { From 6f86ced0f86bccad873a9a294d46856e06ffc104 Mon Sep 17 00:00:00 2001 From: Rishi Yadav Date: Fri, 5 Dec 2025 10:23:18 +0530 Subject: [PATCH 2/7] Update xe_copy_prefetch_2d.cpp --- .../cute/intel_xe/xe_copy_prefetch_2d.cpp | 155 +++++++++++++++++- 1 file changed, 151 insertions(+), 4 deletions(-) diff --git a/test/unit/cute/intel_xe/xe_copy_prefetch_2d.cpp b/test/unit/cute/intel_xe/xe_copy_prefetch_2d.cpp index 675b8d1c43..3659af0060 100644 --- a/test/unit/cute/intel_xe/xe_copy_prefetch_2d.cpp +++ b/test/unit/cute/intel_xe/xe_copy_prefetch_2d.cpp @@ -47,8 +47,8 @@ using namespace compat::experimental; #if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) -// Kernel name for unique identification -template +// Kernel name for unique identification - includes Bits to ensure uniqueness +template class XEPrefetch2DKernelName; // Device kernel for XE_PREFETCH_2D testing @@ -106,7 +106,7 @@ void test_xe_prefetch_2d() { // Initialize source with test pattern for (size_t i = 0; i < host_src.size(); ++i) { - host_src[i] = static_cast(i % 256); + host_src[i] = static_cast(static_cast(i % 256)); } // Copy to device @@ -122,7 +122,7 @@ void test_xe_prefetch_2d() { auto gridDim = compat::dim3(1); launch, - 
XEPrefetch2DKernelName>( + XEPrefetch2DKernelName>( launch_policy{ gridDim, blockDim, kernel_properties{sycl_exp::sub_group_size} @@ -150,6 +150,153 @@ TEST(CuTe_Xe, XE_PREFETCH_2D_float) { test_xe_prefetch_2d(); } + +// Test 4: 8-bit Minimal Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_8bit_Minimal) { + test_xe_prefetch_2d(); +} + +// Test 5: 8-bit Small Height +TEST(CuTe_Xe, XE_PREFETCH_2D_8bit_SmallHeight) { + test_xe_prefetch_2d(); +} + +// Test 6: 8-bit Medium Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_8bit_Medium) { + test_xe_prefetch_2d(); +} + +// Test 7: 8-bit Large Height +TEST(CuTe_Xe, XE_PREFETCH_2D_8bit_LargeHeight) { + test_xe_prefetch_2d(); +} + +// Test 8: 8-bit Wide Configuration (respecting 512-bit width limit) +TEST(CuTe_Xe, XE_PREFETCH_2D_8bit_Wide) { + test_xe_prefetch_2d(); // 8*64=512 bits (max) +} + +// Test 9: 16-bit Minimal Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_16bit_Minimal) { + test_xe_prefetch_2d(); +} + +// Test 10: 16-bit Small Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_16bit_Small) { + test_xe_prefetch_2d(); +} + +// Test 11: 16-bit Medium Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_16bit_Medium) { + test_xe_prefetch_2d(); +} + +// Test 12: 16-bit Large Height +TEST(CuTe_Xe, XE_PREFETCH_2D_16bit_LargeHeight) { + test_xe_prefetch_2d(); +} + +// Test 13: 16-bit Wide Configuration (respecting 512-bit width limit) +TEST(CuTe_Xe, XE_PREFETCH_2D_16bit_Wide) { + test_xe_prefetch_2d(); // 16*32=512 bits (max) +} + +// Test 14: 32-bit Minimal Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_32bit_Minimal) { + test_xe_prefetch_2d(); // 32*16=512 bits (max) +} + +// Test 15: 32-bit Small Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_32bit_Small) { + test_xe_prefetch_2d(); +} + +// Test 16: 32-bit Medium Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_32bit_Medium) { + test_xe_prefetch_2d(); +} + +// Test 17: 32-bit Large Height +TEST(CuTe_Xe, XE_PREFETCH_2D_32bit_LargeHeight) { + test_xe_prefetch_2d(); +} + +// Test 18: 32-bit Wide Configuration (respecting 512-bit width limit) +TEST(CuTe_Xe, XE_PREFETCH_2D_32bit_Wide) { + test_xe_prefetch_2d(); // 32*16=512 bits (max) +} + +// Test 19: 64-bit Small Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_64bit_Small) { + test_xe_prefetch_2d(); // 64*8=512 bits (max) +} + +// Test 20: 64-bit Medium Configuration +TEST(CuTe_Xe, XE_PREFETCH_2D_64bit_Medium) { + test_xe_prefetch_2d(); // 64*8=512 bits (max) +} + +// Test 21: 64-bit Large Height +TEST(CuTe_Xe, XE_PREFETCH_2D_64bit_LargeHeight) { + test_xe_prefetch_2d(); // 64*8=512 bits (max) +} + +// Test 22: Mixed Data Types - Power of Two Heights +TEST(CuTe_Xe, XE_PREFETCH_2D_PowerOfTwo_Heights) { + // 8-bit with power-of-two heights + test_xe_prefetch_2d(); + test_xe_prefetch_2d(); + + // 16-bit with power-of-two heights + test_xe_prefetch_2d(); + + // 32-bit with power-of-two heights + test_xe_prefetch_2d(); +} + +// Test 23: Various Width Configurations +TEST(CuTe_Xe, XE_PREFETCH_2D_VariousWidths) { + // 8-bit with various widths + test_xe_prefetch_2d(); + test_xe_prefetch_2d(); + + // 16-bit with various widths + test_xe_prefetch_2d(); + test_xe_prefetch_2d(); + + // 32-bit with various widths + test_xe_prefetch_2d(); + test_xe_prefetch_2d(); +} + +// Test 24: Square Tiles +TEST(CuTe_Xe, XE_PREFETCH_2D_SquareTiles) { + // 8-bit square (in memory view) + test_xe_prefetch_2d(); + + // 16-bit square + test_xe_prefetch_2d(); + + // 32-bit square + test_xe_prefetch_2d(); +} + +// Test 25: Tall Tiles (Height > Width) +TEST(CuTe_Xe, XE_PREFETCH_2D_TallTiles) { + test_xe_prefetch_2d(); + 
test_xe_prefetch_2d(); + test_xe_prefetch_2d(); +} + +// Test 26: Cache Line Optimization +TEST(CuTe_Xe, XE_PREFETCH_2D_CacheOptimized) { + // Configurations aligned to cache lines (64 bytes) + test_xe_prefetch_2d(); // 64 bytes per row + test_xe_prefetch_2d(); // 64 bytes per row + test_xe_prefetch_2d(); // 64 bytes per row + test_xe_prefetch_2d(); // 64 bytes per row +} + #else // For the fallback case From b871d05d6d3be2bcd70d8d316063b44ef576381f Mon Sep 17 00:00:00 2001 From: Rishi Yadav Date: Fri, 5 Dec 2025 10:23:56 +0530 Subject: [PATCH 3/7] Update xe_transpose_2d.cpp --- test/unit/cute/intel_xe/xe_transpose_2d.cpp | 367 ++++++++++++++++++++ 1 file changed, 367 insertions(+) diff --git a/test/unit/cute/intel_xe/xe_transpose_2d.cpp b/test/unit/cute/intel_xe/xe_transpose_2d.cpp index d2375d2fc8..bb9e5ea4b4 100644 --- a/test/unit/cute/intel_xe/xe_transpose_2d.cpp +++ b/test/unit/cute/intel_xe/xe_transpose_2d.cpp @@ -91,6 +91,373 @@ TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_Constraints) { EXPECT_TRUE(true) << "XE_LOAD_2D_TRANSPOSE constraint validation successful"; } +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_MinimalConfigs) { + // Test minimal 32-bit transpose configurations + using Transpose_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; + using Transpose_32bit_1x2 = XE_LOAD_2D_TRANSPOSE<32, 1, 2>; + using Transpose_32bit_2x1 = XE_LOAD_2D_TRANSPOSE<32, 2, 1>; + using Transpose_32bit_2x2 = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; + + static_assert(Transpose_32bit_1x1::CopyBits == 32); + static_assert(Transpose_32bit_1x1::AtomHeight == 1 && Transpose_32bit_1x1::AtomWidth == 1); + + static_assert(Transpose_32bit_1x2::CopyBits == 32); + static_assert(Transpose_32bit_1x2::AtomHeight == 1 && Transpose_32bit_1x2::AtomWidth == 2); + + static_assert(Transpose_32bit_2x1::CopyBits == 32); + static_assert(Transpose_32bit_2x1::AtomHeight == 2 && Transpose_32bit_2x1::AtomWidth == 1); + + static_assert(Transpose_32bit_2x2::CopyBits == 32); + static_assert(Transpose_32bit_2x2::AtomHeight == 2 && Transpose_32bit_2x2::AtomWidth == 2); + + EXPECT_TRUE(true) << "32-bit minimal transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width4) { + // Test 32-bit transpose with width = 4 + using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; + using Transpose_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>; + using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; + using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + + static_assert(Transpose_32bit_1x4::AtomHeight == 1 && Transpose_32bit_1x4::AtomWidth == 4); + static_assert(Transpose_32bit_2x4::AtomHeight == 2 && Transpose_32bit_2x4::AtomWidth == 4); + static_assert(Transpose_32bit_4x4::AtomHeight == 4 && Transpose_32bit_4x4::AtomWidth == 4); + static_assert(Transpose_32bit_8x4::AtomHeight == 8 && Transpose_32bit_8x4::AtomWidth == 4); + + EXPECT_TRUE(true) << "32-bit width=4 transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width8_MaxWidth) { + // Test 32-bit transpose with width = 8 (maximum allowed width) + using Transpose_32bit_1x8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; + using Transpose_32bit_2x8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; + using Transpose_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; + using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; + using Transpose_32bit_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; + + static_assert(Transpose_32bit_1x8::AtomHeight == 1 && Transpose_32bit_1x8::AtomWidth == 8); + static_assert(Transpose_32bit_2x8::AtomHeight == 2 && Transpose_32bit_2x8::AtomWidth == 8); + 
static_assert(Transpose_32bit_4x8::AtomHeight == 4 && Transpose_32bit_4x8::AtomWidth == 8); + static_assert(Transpose_32bit_8x8::AtomHeight == 8 && Transpose_32bit_8x8::AtomWidth == 8); + static_assert(Transpose_32bit_16x8::AtomHeight == 16 && Transpose_32bit_16x8::AtomWidth == 8); + + EXPECT_TRUE(true) << "32-bit max width=8 transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_VariousHeights) { + // Test 32-bit transpose with various height values + using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; + using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; + using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; + using Transpose_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; + + static_assert(Transpose_32bit_1x4::AtomHeight == 1); + static_assert(Transpose_32bit_4x4::AtomHeight == 4); + static_assert(Transpose_32bit_8x4::AtomHeight == 8); + static_assert(Transpose_32bit_16x4::AtomHeight == 16); + static_assert(Transpose_32bit_32x4::AtomHeight == 32); + + EXPECT_TRUE(true) << "32-bit various height transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_SquareConfigs) { + // Test 32-bit square transpose configurations + using Transpose_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; + using Transpose_32bit_2x2 = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; + using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; + using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; + + static_assert(Transpose_32bit_1x1::AtomHeight == 1 && Transpose_32bit_1x1::AtomWidth == 1); + static_assert(Transpose_32bit_2x2::AtomHeight == 2 && Transpose_32bit_2x2::AtomWidth == 2); + static_assert(Transpose_32bit_4x4::AtomHeight == 4 && Transpose_32bit_4x4::AtomWidth == 4); + static_assert(Transpose_32bit_8x8::AtomHeight == 8 && Transpose_32bit_8x8::AtomWidth == 8); + + EXPECT_TRUE(true) << "32-bit square transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_TallConfigs) { + // Test 32-bit tall (Height > Width) transpose configurations + using Transpose_32bit_8x1 = XE_LOAD_2D_TRANSPOSE<32, 8, 1>; + using Transpose_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>; + using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; + using Transpose_32bit_32x8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; + + static_assert(Transpose_32bit_8x1::AtomHeight == 8 && Transpose_32bit_8x1::AtomWidth == 1); + static_assert(Transpose_32bit_8x2::AtomHeight == 8 && Transpose_32bit_8x2::AtomWidth == 2); + static_assert(Transpose_32bit_16x4::AtomHeight == 16 && Transpose_32bit_16x4::AtomWidth == 4); + static_assert(Transpose_32bit_32x8::AtomHeight == 32 && Transpose_32bit_32x8::AtomWidth == 8); + + EXPECT_TRUE(true) << "32-bit tall transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_WideConfigs) { + // Test 32-bit wide (Width > Height) transpose configurations + using Transpose_32bit_1x8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; + using Transpose_32bit_2x8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; + using Transpose_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; + using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; + + static_assert(Transpose_32bit_1x8::AtomHeight == 1 && Transpose_32bit_1x8::AtomWidth == 8); + static_assert(Transpose_32bit_2x8::AtomHeight == 2 && Transpose_32bit_2x8::AtomWidth == 8); + static_assert(Transpose_32bit_4x8::AtomHeight == 4 && Transpose_32bit_4x8::AtomWidth == 8); + static_assert(Transpose_32bit_1x4::AtomHeight == 1 && 
Transpose_32bit_1x4::AtomWidth == 4); + + EXPECT_TRUE(true) << "32-bit wide transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_AllValid) { + // Test all valid 64-bit transpose configurations (Height=8, Width<4) + using Transpose_64bit_8x1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; + using Transpose_64bit_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; + using Transpose_64bit_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; + + static_assert(Transpose_64bit_8x1::CopyBits == 64); + static_assert(Transpose_64bit_8x1::AtomHeight == 8 && Transpose_64bit_8x1::AtomWidth == 1); + + static_assert(Transpose_64bit_8x2::CopyBits == 64); + static_assert(Transpose_64bit_8x2::AtomHeight == 8 && Transpose_64bit_8x2::AtomWidth == 2); + + static_assert(Transpose_64bit_8x3::CopyBits == 64); + static_assert(Transpose_64bit_8x3::AtomHeight == 8 && Transpose_64bit_8x3::AtomWidth == 3); + + EXPECT_TRUE(true) << "64-bit all valid transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_Constraints) { + // Test that 64-bit transpose respects its strict constraints + // Valid: Height == 8 && Width < 4 + using Valid_64bit_1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; + using Valid_64bit_2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; + using Valid_64bit_3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; + + // Verify all have correct dimensions + static_assert(Valid_64bit_1::AtomHeight == 8 && Valid_64bit_1::AtomWidth == 1); + static_assert(Valid_64bit_2::AtomHeight == 8 && Valid_64bit_2::AtomWidth == 2); + static_assert(Valid_64bit_3::AtomHeight == 8 && Valid_64bit_3::AtomWidth == 3); + + // Verify all have 64-bit size + static_assert(Valid_64bit_1::CopyBits == 64); + static_assert(Valid_64bit_2::CopyBits == 64); + static_assert(Valid_64bit_3::CopyBits == 64); + + EXPECT_TRUE(true) << "64-bit constraint validation successful"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_PowerOfTwo_Heights) { + // Test 32-bit transpose with power-of-two heights + using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; + using Transpose_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>; + using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; + using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; + using Transpose_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; + + static_assert(Transpose_32bit_1x4::AtomHeight == 1); + static_assert(Transpose_32bit_2x4::AtomHeight == 2); + static_assert(Transpose_32bit_4x4::AtomHeight == 4); + static_assert(Transpose_32bit_8x4::AtomHeight == 8); + static_assert(Transpose_32bit_16x4::AtomHeight == 16); + static_assert(Transpose_32bit_32x4::AtomHeight == 32); + + EXPECT_TRUE(true) << "32-bit power-of-two height transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_AllWidths) { + // Test 32-bit transpose with all valid widths (1-8) and height=8 + using Transpose_32bit_8x1 = XE_LOAD_2D_TRANSPOSE<32, 8, 1>; + using Transpose_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>; + using Transpose_32bit_8x3 = XE_LOAD_2D_TRANSPOSE<32, 8, 3>; + using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + using Transpose_32bit_8x5 = XE_LOAD_2D_TRANSPOSE<32, 8, 5>; + using Transpose_32bit_8x6 = XE_LOAD_2D_TRANSPOSE<32, 8, 6>; + using Transpose_32bit_8x7 = XE_LOAD_2D_TRANSPOSE<32, 8, 7>; + using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; + + static_assert(Transpose_32bit_8x1::AtomWidth == 1); + static_assert(Transpose_32bit_8x2::AtomWidth == 2); + static_assert(Transpose_32bit_8x3::AtomWidth == 3); + 
static_assert(Transpose_32bit_8x4::AtomWidth == 4); + static_assert(Transpose_32bit_8x5::AtomWidth == 5); + static_assert(Transpose_32bit_8x6::AtomWidth == 6); + static_assert(Transpose_32bit_8x7::AtomWidth == 7); + static_assert(Transpose_32bit_8x8::AtomWidth == 8); + + EXPECT_TRUE(true) << "32-bit all width values transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_MatMul_Optimized) { + // Test 32-bit transpose configurations useful for matrix multiplication + // Common for transposing A matrix in row-major to column-major for DPAS + using MatMul_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; + using MatMul_32bit_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; + using MatMul_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + using MatMul_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; + + static_assert(MatMul_32bit_8x8::CopyBits == 32); + static_assert(MatMul_32bit_8x8::AtomHeight == 8 && MatMul_32bit_8x8::AtomWidth == 8); + + static_assert(MatMul_32bit_16x8::CopyBits == 32); + static_assert(MatMul_32bit_16x8::AtomHeight == 16 && MatMul_32bit_16x8::AtomWidth == 8); + + static_assert(MatMul_32bit_8x4::CopyBits == 32); + static_assert(MatMul_32bit_8x4::AtomHeight == 8 && MatMul_32bit_8x4::AtomWidth == 4); + + static_assert(MatMul_32bit_16x4::CopyBits == 32); + static_assert(MatMul_32bit_16x4::AtomHeight == 16 && MatMul_32bit_16x4::AtomWidth == 4); + + EXPECT_TRUE(true) << "32-bit MatMul-optimized transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_SmallTiles) { + // Test 32-bit transpose small tiles for boundary handling + using Small_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; + using Small_32bit_2x1 = XE_LOAD_2D_TRANSPOSE<32, 2, 1>; + using Small_32bit_1x2 = XE_LOAD_2D_TRANSPOSE<32, 1, 2>; + using Small_32bit_4x2 = XE_LOAD_2D_TRANSPOSE<32, 4, 2>; + + static_assert(Small_32bit_1x1::AtomHeight == 1 && Small_32bit_1x1::AtomWidth == 1); + static_assert(Small_32bit_2x1::AtomHeight == 2 && Small_32bit_2x1::AtomWidth == 1); + static_assert(Small_32bit_1x2::AtomHeight == 1 && Small_32bit_1x2::AtomWidth == 2); + static_assert(Small_32bit_4x2::AtomHeight == 4 && Small_32bit_4x2::AtomWidth == 2); + + EXPECT_TRUE(true) << "32-bit small tile transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_DataSize_Consistency) { + // Test that CopyBits correctly reflects 32 or 64 bits + using Op_32bit_Small = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; + using Op_32bit_Large = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; + using Op_64bit_Valid1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; + using Op_64bit_Valid2 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; + + // All 32-bit variants should have CopyBits == 32 + static_assert(Op_32bit_Small::CopyBits == 32); + static_assert(Op_32bit_Large::CopyBits == 32); + + // All 64-bit variants should have CopyBits == 64 + static_assert(Op_64bit_Valid1::CopyBits == 64); + static_assert(Op_64bit_Valid2::CopyBits == 64); + + EXPECT_TRUE(true) << "Transpose data size consistency validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width_Progression) { + // Test 32-bit transpose with progressive widths and fixed height + using Transpose_16x1 = XE_LOAD_2D_TRANSPOSE<32, 16, 1>; + using Transpose_16x2 = XE_LOAD_2D_TRANSPOSE<32, 16, 2>; + using Transpose_16x3 = XE_LOAD_2D_TRANSPOSE<32, 16, 3>; + using Transpose_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; + using Transpose_16x5 = XE_LOAD_2D_TRANSPOSE<32, 16, 5>; + using Transpose_16x6 = XE_LOAD_2D_TRANSPOSE<32, 16, 6>; + using Transpose_16x7 = XE_LOAD_2D_TRANSPOSE<32, 16, 7>; + using Transpose_16x8 = 
XE_LOAD_2D_TRANSPOSE<32, 16, 8>; + + static_assert(Transpose_16x1::AtomHeight == 16 && Transpose_16x1::AtomWidth == 1); + static_assert(Transpose_16x2::AtomHeight == 16 && Transpose_16x2::AtomWidth == 2); + static_assert(Transpose_16x3::AtomHeight == 16 && Transpose_16x3::AtomWidth == 3); + static_assert(Transpose_16x4::AtomHeight == 16 && Transpose_16x4::AtomWidth == 4); + static_assert(Transpose_16x5::AtomHeight == 16 && Transpose_16x5::AtomWidth == 5); + static_assert(Transpose_16x6::AtomHeight == 16 && Transpose_16x6::AtomWidth == 6); + static_assert(Transpose_16x7::AtomHeight == 16 && Transpose_16x7::AtomWidth == 7); + static_assert(Transpose_16x8::AtomHeight == 16 && Transpose_16x8::AtomWidth == 8); + + EXPECT_TRUE(true) << "32-bit progressive width transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_LargeTiles) { + // Test 32-bit transpose with larger tile configurations (Height <= 32 limit) + using Large_32bit_32x8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; + using Large_32bit_32x6 = XE_LOAD_2D_TRANSPOSE<32, 32, 6>; + using Large_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; + using Large_32bit_32x2 = XE_LOAD_2D_TRANSPOSE<32, 32, 2>; + + static_assert(Large_32bit_32x8::CopyBits == 32); + static_assert(Large_32bit_32x8::AtomHeight == 32 && Large_32bit_32x8::AtomWidth == 8); + + static_assert(Large_32bit_32x6::CopyBits == 32); + static_assert(Large_32bit_32x6::AtomHeight == 32 && Large_32bit_32x6::AtomWidth == 6); + + static_assert(Large_32bit_32x4::CopyBits == 32); + static_assert(Large_32bit_32x4::AtomHeight == 32 && Large_32bit_32x4::AtomWidth == 4); + + static_assert(Large_32bit_32x2::CopyBits == 32); + static_assert(Large_32bit_32x2::AtomHeight == 32 && Large_32bit_32x2::AtomWidth == 2); + + EXPECT_TRUE(true) << "32-bit large tile transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_Mixed_AspectRatios) { + // Test various aspect ratios for 32-bit transpose (Height <= 32 limit) + using AspectRatio_1to8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; // 1:8 + using AspectRatio_2to8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; // 1:4 + using AspectRatio_4to8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; // 1:2 + using AspectRatio_8to8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; // 1:1 + using AspectRatio_16to8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; // 2:1 + using AspectRatio_32to8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; // 4:1 + using AspectRatio_32to4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; // 8:1 + + static_assert(AspectRatio_1to8::AtomHeight == 1 && AspectRatio_1to8::AtomWidth == 8); + static_assert(AspectRatio_2to8::AtomHeight == 2 && AspectRatio_2to8::AtomWidth == 8); + static_assert(AspectRatio_4to8::AtomHeight == 4 && AspectRatio_4to8::AtomWidth == 8); + static_assert(AspectRatio_8to8::AtomHeight == 8 && AspectRatio_8to8::AtomWidth == 8); + static_assert(AspectRatio_16to8::AtomHeight == 16 && AspectRatio_16to8::AtomWidth == 8); + static_assert(AspectRatio_32to8::AtomHeight == 32 && AspectRatio_32to8::AtomWidth == 8); + static_assert(AspectRatio_32to4::AtomHeight == 32 && AspectRatio_32to4::AtomWidth == 4); + + EXPECT_TRUE(true) << "Mixed aspect ratio transpose configurations validated"; +} + +TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_BF16_FP16_UseCase) { + // Test transpose configurations useful for bf16/fp16 data (stored as 32-bit for transpose) + // Transpose allows loading 16-bit data as 32-bit, then converting after transpose + using BF16_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; + using BF16_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; + using BF16_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; + using 
BF16_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>;
+
+ // These are 32-bit operations but can be used with 16-bit data
+ static_assert(BF16_8x8::CopyBits == 32);
+ static_assert(BF16_8x8::AtomHeight == 8 && BF16_8x8::AtomWidth == 8);
+
+ static_assert(BF16_16x8::CopyBits == 32);
+ static_assert(BF16_16x8::AtomHeight == 16 && BF16_16x8::AtomWidth == 8);
+
+ static_assert(BF16_8x4::CopyBits == 32);
+ static_assert(BF16_8x4::AtomHeight == 8 && BF16_8x4::AtomWidth == 4);
+
+ static_assert(BF16_16x4::CopyBits == 32);
+ static_assert(BF16_16x4::AtomHeight == 16 && BF16_16x4::AtomWidth == 4);
+
+ EXPECT_TRUE(true) << "BF16/FP16 use case transpose configurations validated";
+}
+
+TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_UseCase) {
+ // Test 64-bit transpose use cases (double precision or two 32-bit values)
+ // Limited to Height=8, Width in {1, 2, 3}
+ using Double_8x1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>;
+ using Double_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>;
+ using Double_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>;
+
+ // All must be 64-bit
+ static_assert(Double_8x1::CopyBits == 64);
+ static_assert(Double_8x2::CopyBits == 64);
+ static_assert(Double_8x3::CopyBits == 64);
+
+ // All must have height=8
+ static_assert(Double_8x1::AtomHeight == 8);
+ static_assert(Double_8x2::AtomHeight == 8);
+ static_assert(Double_8x3::AtomHeight == 8);
+
+ // Widths must be < 4
+ static_assert(Double_8x1::AtomWidth == 1);
+ static_assert(Double_8x2::AtomWidth == 2);
+ static_assert(Double_8x3::AtomWidth == 3);
+
+ EXPECT_TRUE(true) << "64-bit use case transpose configurations validated";
+}
+
 #else

 TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_SKIPPED) {

From 043607b4c97fe2efd3134434ae741dcdf1bf087f Mon Sep 17 00:00:00 2001
From: Rishi Yadav
Date: Sun, 7 Dec 2025 23:47:05 +0530
Subject: [PATCH 4/7] Update xe_transpose_2d.cpp

---
 test/unit/cute/intel_xe/xe_transpose_2d.cpp | 550 ++++++--------------
 1 file changed, 152 insertions(+), 398 deletions(-)

diff --git a/test/unit/cute/intel_xe/xe_transpose_2d.cpp b/test/unit/cute/intel_xe/xe_transpose_2d.cpp
index bb9e5ea4b4..f7a9752f24 100644
--- a/test/unit/cute/intel_xe/xe_transpose_2d.cpp
+++ b/test/unit/cute/intel_xe/xe_transpose_2d.cpp
@@ -25,442 +25,196 @@
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* **************************************************************************************************/ +#include "cutlass/detail/layout.hpp" + #include #include #include #include #include +#include + #include "cutlass_unit_test.h" +#include "utils.hpp" using namespace cute; - -#if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) - -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_API_Declaration) { - // Template: XE_LOAD_2D_TRANSPOSE - // Constraints: Bits == 32 || Bits == 64, Width <= 8 - // For 64-bit: Height == 8 && Width < 4 - - // Test 32-bit transpose operations - using TransposeOp_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>; - using TransposeOp_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; - using TransposeOp_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>; - - // Test 64-bit transpose operations (limited constraints) - using TransposeOp_64bit_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; - using TransposeOp_64bit_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; - - // Test that the operations have the required static members from XE_Copy_Op_2D_Base - static_assert(TransposeOp_32bit_2x4::AtomHeight == 2); - static_assert(TransposeOp_32bit_2x4::AtomWidth == 4); - static_assert(TransposeOp_32bit_2x4::CopyBits == 32); - - static_assert(TransposeOp_32bit_4x8::AtomHeight == 4); - static_assert(TransposeOp_32bit_4x8::AtomWidth == 8); - static_assert(TransposeOp_32bit_4x8::CopyBits == 32); - - static_assert(TransposeOp_64bit_8x2::AtomHeight == 8); - static_assert(TransposeOp_64bit_8x2::AtomWidth == 2); - static_assert(TransposeOp_64bit_8x2::CopyBits == 64); - - EXPECT_TRUE(true) << "XE_LOAD_2D_TRANSPOSE API types declared successfully"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_Constraints) { - // Test that the compile-time constraints are enforced - - // Valid 32-bit operations - using Valid32_1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; - using Valid32_2 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; // Width <= 8 - - // Valid 64-bit operations (Height == 8 && Width < 4) - using Valid64_1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; - using Valid64_2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; - using Valid64_3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; - - static_assert(Valid32_1::CopyBits == 32); - static_assert(Valid32_2::CopyBits == 32); - static_assert(Valid64_1::CopyBits == 64); - static_assert(Valid64_2::CopyBits == 64); - static_assert(Valid64_3::CopyBits == 64); - - EXPECT_TRUE(true) << "XE_LOAD_2D_TRANSPOSE constraint validation successful"; +using namespace cutlass; +using namespace compat::experimental; + +#define SUBGROUP_SIZE (16) + +#if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) + +// Kernel name for unique identification +template class XETranspose2DKernelName; + +// Device kernel for XE_LOAD_2D_TRANSPOSE testing +// Note: Transpose load performs HW-level transpose during load operation +// Memory layout (Height×Width) is transposed to register layout (Width×Height) +template +void xe_transpose_2d_kernel(SrcTensor src, DstTensor dst) { + using namespace cute; + using Element = typename SrcTensor::value_type; + + // Only execute with the first subgroup to avoid race conditions + if (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group(0) == 0) { + // Get thread/subgroup information + auto local_id = int(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_local_id(0)); + + // Create block 2D transpose load inside kernel (device-only operation) + using TransposeOp = XE_LOAD_2D_TRANSPOSE; + auto tiled_transpose = make_block_2d_copy(TransposeOp{}, src); + + // Get thread slice of the tiled 
transpose + auto thr_transpose = tiled_transpose.get_slice(local_id); + + // Create coordinate tensor for a single tile + // Note: coordinates are in memory space (Height×Width) + auto coord_shape = make_shape(Int{}, Int>{}); + Tensor coord_tile = make_identity_tensor(coord_shape); + + // Partition source coordinates for transpose load + auto thr_src_coord = thr_transpose.partition_S(coord_tile); + + // Create destination fragment - transpose changes the layout in registers + auto thr_dst_frag = thr_transpose.partition_fragment_D(coord_tile); + + // Perform the transpose load operation from global memory to registers + // Data is transposed during this operation by hardware + copy(tiled_transpose, thr_src_coord, thr_dst_frag); + + // For verification, we need to store the transposed data back + // Note: Output will be in transposed layout (Width×Height in memory) + // We store to the transposed destination shape + auto dst_coord_shape = make_shape(Int>{}, Int{}); + Tensor dst_coord_tile = make_identity_tensor(dst_coord_shape); + + using StoreOp = XE_STORE_2D; // Swapped dimensions + auto tiled_store = make_block_2d_copy(StoreOp{}, dst); + auto thr_store = tiled_store.get_slice(local_id); + + // Create destination coordinates for the store operation + auto thr_dst_coord = thr_store.partition_D(dst_coord_tile); + auto thr_src_frag = thr_store.partition_fragment_S(dst_coord_tile); + + // Copy from transpose fragment to store fragment + copy(thr_dst_frag, thr_src_frag); + + // Perform the store operation from registers to global memory + copy(tiled_store, thr_src_frag, thr_dst_coord); + + // Synchronize to ensure all threads complete their operations + sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group()); + } } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_MinimalConfigs) { - // Test minimal 32-bit transpose configurations - using Transpose_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; - using Transpose_32bit_1x2 = XE_LOAD_2D_TRANSPOSE<32, 1, 2>; - using Transpose_32bit_2x1 = XE_LOAD_2D_TRANSPOSE<32, 2, 1>; - using Transpose_32bit_2x2 = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; - - static_assert(Transpose_32bit_1x1::CopyBits == 32); - static_assert(Transpose_32bit_1x1::AtomHeight == 1 && Transpose_32bit_1x1::AtomWidth == 1); - - static_assert(Transpose_32bit_1x2::CopyBits == 32); - static_assert(Transpose_32bit_1x2::AtomHeight == 1 && Transpose_32bit_1x2::AtomWidth == 2); - - static_assert(Transpose_32bit_2x1::CopyBits == 32); - static_assert(Transpose_32bit_2x1::AtomHeight == 2 && Transpose_32bit_2x1::AtomWidth == 1); - - static_assert(Transpose_32bit_2x2::CopyBits == 32); - static_assert(Transpose_32bit_2x2::AtomHeight == 2 && Transpose_32bit_2x2::AtomWidth == 2); - - EXPECT_TRUE(true) << "32-bit minimal transpose configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width4) { - // Test 32-bit transpose with width = 4 - using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; - using Transpose_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>; - using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; - using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - - static_assert(Transpose_32bit_1x4::AtomHeight == 1 && Transpose_32bit_1x4::AtomWidth == 4); - static_assert(Transpose_32bit_2x4::AtomHeight == 2 && Transpose_32bit_2x4::AtomWidth == 4); - static_assert(Transpose_32bit_4x4::AtomHeight == 4 && Transpose_32bit_4x4::AtomWidth == 4); - static_assert(Transpose_32bit_8x4::AtomHeight == 8 && Transpose_32bit_8x4::AtomWidth == 4); - - EXPECT_TRUE(true) << 
"32-bit width=4 transpose configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width8_MaxWidth) { - // Test 32-bit transpose with width = 8 (maximum allowed width) - using Transpose_32bit_1x8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; - using Transpose_32bit_2x8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; - using Transpose_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; - using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; - using Transpose_32bit_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; - - static_assert(Transpose_32bit_1x8::AtomHeight == 1 && Transpose_32bit_1x8::AtomWidth == 8); - static_assert(Transpose_32bit_2x8::AtomHeight == 2 && Transpose_32bit_2x8::AtomWidth == 8); - static_assert(Transpose_32bit_4x8::AtomHeight == 4 && Transpose_32bit_4x8::AtomWidth == 8); - static_assert(Transpose_32bit_8x8::AtomHeight == 8 && Transpose_32bit_8x8::AtomWidth == 8); - static_assert(Transpose_32bit_16x8::AtomHeight == 16 && Transpose_32bit_16x8::AtomWidth == 8); - - EXPECT_TRUE(true) << "32-bit max width=8 transpose configurations validated"; -} +// Host test function template for transpose operations +template +void test_xe_transpose_2d() { + using namespace cute; -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_VariousHeights) { - // Test 32-bit transpose with various height values - using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; - using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; - using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - using Transpose_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; - - static_assert(Transpose_32bit_1x4::AtomHeight == 1); - static_assert(Transpose_32bit_4x4::AtomHeight == 4); - static_assert(Transpose_32bit_8x4::AtomHeight == 8); - static_assert(Transpose_32bit_16x4::AtomHeight == 16); - static_assert(Transpose_32bit_32x4::AtomHeight == 32); - - EXPECT_TRUE(true) << "32-bit various height transpose configurations validated"; -} + // Source matrix dimensions (Height×Width in memory) + constexpr int M = Height; + constexpr int N = Width * sizeof_bits_v / Bits; -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_SquareConfigs) { - // Test 32-bit square transpose configurations - using Transpose_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; - using Transpose_32bit_2x2 = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; - using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; - using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; - - static_assert(Transpose_32bit_1x1::AtomHeight == 1 && Transpose_32bit_1x1::AtomWidth == 1); - static_assert(Transpose_32bit_2x2::AtomHeight == 2 && Transpose_32bit_2x2::AtomWidth == 2); - static_assert(Transpose_32bit_4x4::AtomHeight == 4 && Transpose_32bit_4x4::AtomWidth == 4); - static_assert(Transpose_32bit_8x8::AtomHeight == 8 && Transpose_32bit_8x8::AtomWidth == 8); - - EXPECT_TRUE(true) << "32-bit square transpose configurations validated"; -} + // Destination will be transposed (Width×Height in memory) + constexpr int M_dst = N; + constexpr int N_dst = M; -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_TallConfigs) { - // Test 32-bit tall (Height > Width) transpose configurations - using Transpose_32bit_8x1 = XE_LOAD_2D_TRANSPOSE<32, 8, 1>; - using Transpose_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>; - using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - using Transpose_32bit_32x8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; - - static_assert(Transpose_32bit_8x1::AtomHeight == 8 && Transpose_32bit_8x1::AtomWidth == 1); - static_assert(Transpose_32bit_8x2::AtomHeight 
== 8 && Transpose_32bit_8x2::AtomWidth == 2); - static_assert(Transpose_32bit_16x4::AtomHeight == 16 && Transpose_32bit_16x4::AtomWidth == 4); - static_assert(Transpose_32bit_32x8::AtomHeight == 32 && Transpose_32bit_32x8::AtomWidth == 8); - - EXPECT_TRUE(true) << "32-bit tall transpose configurations validated"; -} + // Ensure proper alignment + constexpr int elem_alignment = 16 / sizeof(Element); + constexpr int aligned_N = ((N + elem_alignment - 1) / elem_alignment) * elem_alignment; + constexpr int aligned_M_dst = ((M_dst + elem_alignment - 1) / elem_alignment) * elem_alignment; -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_WideConfigs) { - // Test 32-bit wide (Width > Height) transpose configurations - using Transpose_32bit_1x8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; - using Transpose_32bit_2x8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; - using Transpose_32bit_4x8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; - using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; - - static_assert(Transpose_32bit_1x8::AtomHeight == 1 && Transpose_32bit_1x8::AtomWidth == 8); - static_assert(Transpose_32bit_2x8::AtomHeight == 2 && Transpose_32bit_2x8::AtomWidth == 8); - static_assert(Transpose_32bit_4x8::AtomHeight == 4 && Transpose_32bit_4x8::AtomWidth == 8); - static_assert(Transpose_32bit_1x4::AtomHeight == 1 && Transpose_32bit_1x4::AtomWidth == 4); - - EXPECT_TRUE(true) << "32-bit wide transpose configurations validated"; -} + // Allocate host memory + cutlass::host_vector host_src(M * aligned_N); + cutlass::host_vector host_dst(M_dst * aligned_M_dst); -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_AllValid) { - // Test all valid 64-bit transpose configurations (Height=8, Width<4) - using Transpose_64bit_8x1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; - using Transpose_64bit_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; - using Transpose_64bit_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; - - static_assert(Transpose_64bit_8x1::CopyBits == 64); - static_assert(Transpose_64bit_8x1::AtomHeight == 8 && Transpose_64bit_8x1::AtomWidth == 1); - - static_assert(Transpose_64bit_8x2::CopyBits == 64); - static_assert(Transpose_64bit_8x2::AtomHeight == 8 && Transpose_64bit_8x2::AtomWidth == 2); - - static_assert(Transpose_64bit_8x3::CopyBits == 64); - static_assert(Transpose_64bit_8x3::AtomHeight == 8 && Transpose_64bit_8x3::AtomWidth == 3); - - EXPECT_TRUE(true) << "64-bit all valid transpose configurations validated"; -} + // Initialize source with test pattern + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + Element val; + if constexpr (std::is_floating_point_v || + std::is_same_v || + std::is_same_v) { + val = Element(static_cast(i * N + j) / 100.0f); + } else { + val = static_cast((i * N + j) % 256); + } + host_src[i * aligned_N + j] = val; + } + } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_Constraints) { - // Test that 64-bit transpose respects its strict constraints - // Valid: Height == 8 && Width < 4 - using Valid_64bit_1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; - using Valid_64bit_2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>; - using Valid_64bit_3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; - - // Verify all have correct dimensions - static_assert(Valid_64bit_1::AtomHeight == 8 && Valid_64bit_1::AtomWidth == 1); - static_assert(Valid_64bit_2::AtomHeight == 8 && Valid_64bit_2::AtomWidth == 2); - static_assert(Valid_64bit_3::AtomHeight == 8 && Valid_64bit_3::AtomWidth == 3); - - // Verify all have 64-bit size - static_assert(Valid_64bit_1::CopyBits == 64); - static_assert(Valid_64bit_2::CopyBits == 64); - static_assert(Valid_64bit_3::CopyBits == 64); - - 
EXPECT_TRUE(true) << "64-bit constraint validation successful"; -} + // Copy to device + cutlass::device_vector device_src = host_src; + cutlass::device_vector device_dst(M_dst * aligned_M_dst); + + // Create source tensor (Height×Width) + Tensor tensor_src = + make_tensor(make_gmem_ptr(device_src.data()), + make_layout(Shape, Int>{}, + Stride, _1>{})); -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_PowerOfTwo_Heights) { - // Test 32-bit transpose with power-of-two heights - using Transpose_32bit_1x4 = XE_LOAD_2D_TRANSPOSE<32, 1, 4>; - using Transpose_32bit_2x4 = XE_LOAD_2D_TRANSPOSE<32, 2, 4>; - using Transpose_32bit_4x4 = XE_LOAD_2D_TRANSPOSE<32, 4, 4>; - using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - using Transpose_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - using Transpose_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; - - static_assert(Transpose_32bit_1x4::AtomHeight == 1); - static_assert(Transpose_32bit_2x4::AtomHeight == 2); - static_assert(Transpose_32bit_4x4::AtomHeight == 4); - static_assert(Transpose_32bit_8x4::AtomHeight == 8); - static_assert(Transpose_32bit_16x4::AtomHeight == 16); - static_assert(Transpose_32bit_32x4::AtomHeight == 32); - - EXPECT_TRUE(true) << "32-bit power-of-two height transpose configurations validated"; -} + // Create destination tensor (Width×Height) - transposed shape + Tensor tensor_dst = + make_tensor(make_gmem_ptr(device_dst.data()), + make_layout(Shape, Int>{}, + Stride, _1>{})); -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_AllWidths) { - // Test 32-bit transpose with all valid widths (1-8) and height=8 - using Transpose_32bit_8x1 = XE_LOAD_2D_TRANSPOSE<32, 8, 1>; - using Transpose_32bit_8x2 = XE_LOAD_2D_TRANSPOSE<32, 8, 2>; - using Transpose_32bit_8x3 = XE_LOAD_2D_TRANSPOSE<32, 8, 3>; - using Transpose_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - using Transpose_32bit_8x5 = XE_LOAD_2D_TRANSPOSE<32, 8, 5>; - using Transpose_32bit_8x6 = XE_LOAD_2D_TRANSPOSE<32, 8, 6>; - using Transpose_32bit_8x7 = XE_LOAD_2D_TRANSPOSE<32, 8, 7>; - using Transpose_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; - - static_assert(Transpose_32bit_8x1::AtomWidth == 1); - static_assert(Transpose_32bit_8x2::AtomWidth == 2); - static_assert(Transpose_32bit_8x3::AtomWidth == 3); - static_assert(Transpose_32bit_8x4::AtomWidth == 4); - static_assert(Transpose_32bit_8x5::AtomWidth == 5); - static_assert(Transpose_32bit_8x6::AtomWidth == 6); - static_assert(Transpose_32bit_8x7::AtomWidth == 7); - static_assert(Transpose_32bit_8x8::AtomWidth == 8); - - EXPECT_TRUE(true) << "32-bit all width values transpose configurations validated"; -} + // Launch kernel + auto blockDim = compat::dim3(SUBGROUP_SIZE); + auto gridDim = compat::dim3(1); -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_MatMul_Optimized) { - // Test 32-bit transpose configurations useful for matrix multiplication - // Common for transposing A matrix in row-major to column-major for DPAS - using MatMul_32bit_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; - using MatMul_32bit_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; - using MatMul_32bit_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - using MatMul_32bit_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - - static_assert(MatMul_32bit_8x8::CopyBits == 32); - static_assert(MatMul_32bit_8x8::AtomHeight == 8 && MatMul_32bit_8x8::AtomWidth == 8); - - static_assert(MatMul_32bit_16x8::CopyBits == 32); - static_assert(MatMul_32bit_16x8::AtomHeight == 16 && MatMul_32bit_16x8::AtomWidth == 8); - - static_assert(MatMul_32bit_8x4::CopyBits == 32); - static_assert(MatMul_32bit_8x4::AtomHeight == 8 && 
MatMul_32bit_8x4::AtomWidth == 4); - - static_assert(MatMul_32bit_16x4::CopyBits == 32); - static_assert(MatMul_32bit_16x4::AtomHeight == 16 && MatMul_32bit_16x4::AtomWidth == 4); - - EXPECT_TRUE(true) << "32-bit MatMul-optimized transpose configurations validated"; -} + launch, + XETranspose2DKernelName>( + launch_policy{ + gridDim, blockDim, + kernel_properties{sycl_exp::sub_group_size} + }, + tensor_src, tensor_dst); -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_SmallTiles) { - // Test 32-bit transpose small tiles for boundary handling - using Small_32bit_1x1 = XE_LOAD_2D_TRANSPOSE<32, 1, 1>; - using Small_32bit_2x1 = XE_LOAD_2D_TRANSPOSE<32, 2, 1>; - using Small_32bit_1x2 = XE_LOAD_2D_TRANSPOSE<32, 1, 2>; - using Small_32bit_4x2 = XE_LOAD_2D_TRANSPOSE<32, 4, 2>; - - static_assert(Small_32bit_1x1::AtomHeight == 1 && Small_32bit_1x1::AtomWidth == 1); - static_assert(Small_32bit_2x1::AtomHeight == 2 && Small_32bit_2x1::AtomWidth == 1); - static_assert(Small_32bit_1x2::AtomHeight == 1 && Small_32bit_1x2::AtomWidth == 2); - static_assert(Small_32bit_4x2::AtomHeight == 4 && Small_32bit_4x2::AtomWidth == 2); - - EXPECT_TRUE(true) << "32-bit small tile transpose configurations validated"; -} + compat::wait_and_throw(); + host_dst = device_dst; -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_DataSize_Consistency) { - // Test that CopyBits correctly reflects 32 or 64 bits - using Op_32bit_Small = XE_LOAD_2D_TRANSPOSE<32, 2, 2>; - using Op_32bit_Large = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; - using Op_64bit_Valid1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>; - using Op_64bit_Valid2 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>; - - // All 32-bit variants should have CopyBits == 32 - static_assert(Op_32bit_Small::CopyBits == 32); - static_assert(Op_32bit_Large::CopyBits == 32); - - // All 64-bit variants should have CopyBits == 64 - static_assert(Op_64bit_Valid1::CopyBits == 64); - static_assert(Op_64bit_Valid2::CopyBits == 64); - - EXPECT_TRUE(true) << "Transpose data size consistency validated"; + // Verify transpose: dst[j][i] should equal src[i][j] + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + Element src_val = host_src[i * aligned_N + j]; + Element dst_val = host_dst[j * aligned_M_dst + i]; + EXPECT_EQ(dst_val, src_val) + << "Mismatch at src[" << i << "][" << j << "] vs dst[" << j << "][" << i << "]"; + } + } } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_Width_Progression) { - // Test 32-bit transpose with progressive widths and fixed height - using Transpose_16x1 = XE_LOAD_2D_TRANSPOSE<32, 16, 1>; - using Transpose_16x2 = XE_LOAD_2D_TRANSPOSE<32, 16, 2>; - using Transpose_16x3 = XE_LOAD_2D_TRANSPOSE<32, 16, 3>; - using Transpose_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - using Transpose_16x5 = XE_LOAD_2D_TRANSPOSE<32, 16, 5>; - using Transpose_16x6 = XE_LOAD_2D_TRANSPOSE<32, 16, 6>; - using Transpose_16x7 = XE_LOAD_2D_TRANSPOSE<32, 16, 7>; - using Transpose_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; - - static_assert(Transpose_16x1::AtomHeight == 16 && Transpose_16x1::AtomWidth == 1); - static_assert(Transpose_16x2::AtomHeight == 16 && Transpose_16x2::AtomWidth == 2); - static_assert(Transpose_16x3::AtomHeight == 16 && Transpose_16x3::AtomWidth == 3); - static_assert(Transpose_16x4::AtomHeight == 16 && Transpose_16x4::AtomWidth == 4); - static_assert(Transpose_16x5::AtomHeight == 16 && Transpose_16x5::AtomWidth == 5); - static_assert(Transpose_16x6::AtomHeight == 16 && Transpose_16x6::AtomWidth == 6); - static_assert(Transpose_16x7::AtomHeight == 16 && Transpose_16x7::AtomWidth == 7); - 
static_assert(Transpose_16x8::AtomHeight == 16 && Transpose_16x8::AtomWidth == 8); - - EXPECT_TRUE(true) << "32-bit progressive width transpose configurations validated"; +// Test 32-bit transpose operations (Width ≤ 8 constraint) +TEST(CuTe_Xe, XE_TRANSPOSE_2D_float_4x8) { + test_xe_transpose_2d(); } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_32bit_LargeTiles) { - // Test 32-bit transpose with larger tile configurations (Height <= 32 limit) - using Large_32bit_32x8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; - using Large_32bit_32x6 = XE_LOAD_2D_TRANSPOSE<32, 32, 6>; - using Large_32bit_32x4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; - using Large_32bit_32x2 = XE_LOAD_2D_TRANSPOSE<32, 32, 2>; - - static_assert(Large_32bit_32x8::CopyBits == 32); - static_assert(Large_32bit_32x8::AtomHeight == 32 && Large_32bit_32x8::AtomWidth == 8); - - static_assert(Large_32bit_32x6::CopyBits == 32); - static_assert(Large_32bit_32x6::AtomHeight == 32 && Large_32bit_32x6::AtomWidth == 6); - - static_assert(Large_32bit_32x4::CopyBits == 32); - static_assert(Large_32bit_32x4::AtomHeight == 32 && Large_32bit_32x4::AtomWidth == 4); - - static_assert(Large_32bit_32x2::CopyBits == 32); - static_assert(Large_32bit_32x2::AtomHeight == 32 && Large_32bit_32x2::AtomWidth == 2); - - EXPECT_TRUE(true) << "32-bit large tile transpose configurations validated"; +TEST(CuTe_Xe, XE_TRANSPOSE_2D_float_8x8) { + test_xe_transpose_2d(); } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_Mixed_AspectRatios) { - // Test various aspect ratios for 32-bit transpose (Height <= 32 limit) - using AspectRatio_1to8 = XE_LOAD_2D_TRANSPOSE<32, 1, 8>; // 1:8 - using AspectRatio_2to8 = XE_LOAD_2D_TRANSPOSE<32, 2, 8>; // 1:4 - using AspectRatio_4to8 = XE_LOAD_2D_TRANSPOSE<32, 4, 8>; // 1:2 - using AspectRatio_8to8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; // 1:1 - using AspectRatio_16to8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; // 2:1 - using AspectRatio_32to8 = XE_LOAD_2D_TRANSPOSE<32, 32, 8>; // 4:1 - using AspectRatio_32to4 = XE_LOAD_2D_TRANSPOSE<32, 32, 4>; // 8:1 - - static_assert(AspectRatio_1to8::AtomHeight == 1 && AspectRatio_1to8::AtomWidth == 8); - static_assert(AspectRatio_2to8::AtomHeight == 2 && AspectRatio_2to8::AtomWidth == 8); - static_assert(AspectRatio_4to8::AtomHeight == 4 && AspectRatio_4to8::AtomWidth == 8); - static_assert(AspectRatio_8to8::AtomHeight == 8 && AspectRatio_8to8::AtomWidth == 8); - static_assert(AspectRatio_16to8::AtomHeight == 16 && AspectRatio_16to8::AtomWidth == 8); - static_assert(AspectRatio_32to8::AtomHeight == 32 && AspectRatio_32to8::AtomWidth == 8); - static_assert(AspectRatio_32to4::AtomHeight == 32 && AspectRatio_32to4::AtomWidth == 4); - - EXPECT_TRUE(true) << "Mixed aspect ratio transpose configurations validated"; +TEST(CuTe_Xe, XE_TRANSPOSE_2D_float_4x4) { + test_xe_transpose_2d(); } -TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_BF16_FP16_UseCase) { - // Test transpose configurations useful for bf16/fp16 data (stored as 32-bit for transpose) - // Transpose allows loading 16-bit data as 32-bit, then converting after transpose - using BF16_8x8 = XE_LOAD_2D_TRANSPOSE<32, 8, 8>; - using BF16_16x8 = XE_LOAD_2D_TRANSPOSE<32, 16, 8>; - using BF16_8x4 = XE_LOAD_2D_TRANSPOSE<32, 8, 4>; - using BF16_16x4 = XE_LOAD_2D_TRANSPOSE<32, 16, 4>; - - // These are 32-bit operations but can be used with 16-bit data - static_assert(BF16_8x8::CopyBits == 32); - static_assert(BF16_8x8::AtomHeight == 8 && BF16_8x8::AtomWidth == 8); - - static_assert(BF16_16x8::CopyBits == 32); - static_assert(BF16_16x8::AtomHeight == 16 && BF16_16x8::AtomWidth == 8); - - 
static_assert(BF16_8x4::CopyBits == 32);
-  static_assert(BF16_8x4::AtomHeight == 8 && BF16_8x4::AtomWidth == 4);
-
-  static_assert(BF16_16x4::CopyBits == 32);
-  static_assert(BF16_16x4::AtomHeight == 16 && BF16_16x4::AtomWidth == 4);
-
-  EXPECT_TRUE(true) << "BF16/FP16 use case transpose configurations validated";
+TEST(CuTe_Xe, XE_TRANSPOSE_2D_int32_4x8) {
+  test_xe_transpose_2d();
 }
-TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_64bit_UseCase) {
-  // Test 64-bit transpose use cases (double precision or two 32-bit values)
-  // Limited to Height=8, Width in {1, 2, 3}
-  using Double_8x1 = XE_LOAD_2D_TRANSPOSE<64, 8, 1>;
-  using Double_8x2 = XE_LOAD_2D_TRANSPOSE<64, 8, 2>;
-  using Double_8x3 = XE_LOAD_2D_TRANSPOSE<64, 8, 3>;
-
-  // All must be 64-bit
-  static_assert(Double_8x1::CopyBits == 64);
-  static_assert(Double_8x2::CopyBits == 64);
-  static_assert(Double_8x3::CopyBits == 64);
-
-  // All must have height=8
-  static_assert(Double_8x1::AtomHeight == 8);
-  static_assert(Double_8x2::AtomHeight == 8);
-  static_assert(Double_8x3::AtomHeight == 8);
-
-  // Widths must be < 4
-  static_assert(Double_8x1::AtomWidth == 1);
-  static_assert(Double_8x2::AtomWidth == 2);
-  static_assert(Double_8x3::AtomWidth == 3);
-
-  EXPECT_TRUE(true) << "64-bit use case transpose configurations validated";
+TEST(CuTe_Xe, XE_TRANSPOSE_2D_uint32_4x8) {
+  test_xe_transpose_2d();
 }
 #else
-TEST(CuTe_Xe, XE_LOAD_2D_TRANSPOSE_SKIPPED) {
+TEST(CuTe_Xe, XE_TRANSPOSE_2D_SKIPPED) {
   GTEST_SKIP() << "XE_LOAD_2D_TRANSPOSE tests require IGC version 2.18 or higher. skipped";
 }

From 51393e5635e36fb7b411ef0c34c579d708d44f3a Mon Sep 17 00:00:00 2001
From: Rishi Yadav
Date: Sun, 7 Dec 2025 23:47:46 +0530
Subject: [PATCH 5/7] Update xe_vnni_2d.cpp

---
 test/unit/cute/intel_xe/xe_vnni_2d.cpp | 562 +++++++++----------
 1 file changed, 191 insertions(+), 371 deletions(-)

diff --git a/test/unit/cute/intel_xe/xe_vnni_2d.cpp b/test/unit/cute/intel_xe/xe_vnni_2d.cpp
index 5bc36338b7..45100aa9ff 100644
--- a/test/unit/cute/intel_xe/xe_vnni_2d.cpp
+++ b/test/unit/cute/intel_xe/xe_vnni_2d.cpp
@@ -26,415 +26,235 @@
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF POSSIBILITY OF SUCH DAMAGE.
+ *
 **************************************************************************************************/
+
+/*
+ * VNNI Usage Summary:
+ *
+ * This file demonstrates XE_LOAD_2D_VNNI usage in kernel context.
+ *
+ * Key points:
+ * 1. VNNI is used to load B matrix in GEMM operations
+ * 2. Hardware performs interleaving during load (free transformation)
+ * 3. VNNI data flows directly to DPAS operations
+ * 4. Only 8-bit and 16-bit data types supported
+ * 5. BlockWidth parameter creates multiple blocks (block_count = Width/BlockWidth)
+ *
+ * Real-world usage pattern:
+ *   auto copy_b = make_block_2d_copy_B(XE_LOAD_2D_VNNI<16, 32, 16, 16>{}, mma, gB);
+ *   copy(copy_b, tBgB, tBrB);     // Load in VNNI format
+ *   gemm(mma, tCrA, tBrB, tCrC);  // DPAS consumes VNNI data
+ *
+ * See examples/12_bmg_moe_gemm_cute_interface/ for full GEMM implementation.
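A minimal sketch of that usage pattern, expanded to a single B tile. Only make_block_2d_copy_B, XE_LOAD_2D_VNNI, copy, and gemm are taken from the summary above and from the demonstration kernel later in this patch; the surrounding names (mma, gB, tCrA, tCrC, thread_idx) and the exact partitioning calls are illustrative assumptions, not the code from the examples directory:

  // Illustrative only: load one 32x16 bf16 B tile in VNNI layout and feed it to DPAS.
  template <class TiledMma, class GTensorB, class FragA, class FragC>
  void vnni_b_step(TiledMma mma, GTensorB gB, FragA const& tCrA, FragC& tCrC, int thread_idx) {
    using namespace cute;
    auto copy_b   = make_block_2d_copy_B(XE_LOAD_2D_VNNI<16, 32, 16, 16>{}, mma, gB);
    auto thr_copy = copy_b.get_slice(thread_idx);

    // Coordinate tensor for one atom-sized tile, as the demonstration kernel below does.
    Tensor coord_tile = make_identity_tensor(make_shape(Int<32>{}, Int<16>{}));
    auto tBgB = thr_copy.partition_S(coord_tile);          // per-thread source coordinates
    auto tBrB = thr_copy.partition_fragment_D(coord_tile); // register fragment, VNNI layout

    copy(copy_b, tBgB, tBrB);     // hardware interleaves rows during this load
    gemm(mma, tCrA, tBrB, tCrC);  // DPAS consumes the VNNI fragment directly
  }

Because the interleaving happens inside the load itself, no shuffle or repack step is needed between the copy and the MMA; that is the "free transformation" the summary refers to.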
+ + */ + +#include "cutlass/detail/layout.hpp" + #include #include #include #include +#include +#include #include +#include + #include "cutlass_unit_test.h" +#include "utils.hpp" using namespace cute; +using namespace cutlass; +using namespace compat::experimental; + +#define SUBGROUP_SIZE (16) #if (IGC_VERSION_MAJOR > 2) || (IGC_VERSION_MAJOR == 2 && IGC_VERSION_MINOR >= 18) -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_API_Declaration) { - // Template: XE_LOAD_2D_VNNI - - // Test that the VNNI operation types can be declared - using VNNIOp_8bit_2x32 = XE_LOAD_2D_VNNI<8, 2, 32>; - using VNNIOp_8bit_4x32 = XE_LOAD_2D_VNNI<8, 4, 32>; - using VNNIOp_16bit_2x16 = XE_LOAD_2D_VNNI<16, 2, 16>; - using VNNIOp_16bit_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>; - - // Test that the operations have the required static members from XE_Copy_Op_2D_Base - static_assert(VNNIOp_8bit_2x32::AtomHeight == 2); - static_assert(VNNIOp_8bit_2x32::AtomWidth == 32); - static_assert(VNNIOp_8bit_2x32::CopyBits == 8); - - static_assert(VNNIOp_16bit_2x16::AtomHeight == 2); - static_assert(VNNIOp_16bit_2x16::AtomWidth == 16); - static_assert(VNNIOp_16bit_2x16::CopyBits == 16); - - EXPECT_TRUE(true) << "XE_LOAD_2D_VNNI API types declared successfully"; -} +// Kernel name for unique identification +template class XEVnniLoadKernelName; -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_MinimalConfigs) { - // Test minimal 8-bit VNNI configurations - using VNNIOp_8bit_1x32 = XE_LOAD_2D_VNNI<8, 1, 32>; - using VNNIOp_8bit_2x32 = XE_LOAD_2D_VNNI<8, 2, 32>; - using VNNIOp_8bit_4x32 = XE_LOAD_2D_VNNI<8, 4, 32>; - - static_assert(VNNIOp_8bit_1x32::CopyBits == 8); - static_assert(VNNIOp_8bit_1x32::AtomHeight == 1); - static_assert(VNNIOp_8bit_1x32::AtomWidth == 32); - - static_assert(VNNIOp_8bit_2x32::CopyBits == 8); - static_assert(VNNIOp_8bit_2x32::AtomHeight == 2); - static_assert(VNNIOp_8bit_2x32::AtomWidth == 32); - - static_assert(VNNIOp_8bit_4x32::CopyBits == 8); - static_assert(VNNIOp_8bit_4x32::AtomHeight == 4); - static_assert(VNNIOp_8bit_4x32::AtomWidth == 32); - - EXPECT_TRUE(true) << "8-bit minimal VNNI configurations validated"; -} +// VNNI load demonstration kernel +// Note: VNNI is designed for B matrix in GEMM context with DPAS consumption +// This simplified test only verifies the load operation executes without errors +template +void xe_vnni_load_kernel(SrcTensor src, DstTensor dst) { + using namespace cute; + using Element = typename SrcTensor::value_type; -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_MediumConfigs) { - // Test medium-sized 8-bit VNNI configurations - using VNNIOp_8bit_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; - using VNNIOp_8bit_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; - - static_assert(VNNIOp_8bit_8x32::CopyBits == 8); - static_assert(VNNIOp_8bit_8x32::AtomHeight == 8); - static_assert(VNNIOp_8bit_8x32::AtomWidth == 32); - - static_assert(VNNIOp_8bit_16x32::CopyBits == 8); - static_assert(VNNIOp_8bit_16x32::AtomHeight == 16); - static_assert(VNNIOp_8bit_16x32::AtomWidth == 32); - - EXPECT_TRUE(true) << "8-bit medium VNNI configurations validated"; + // Only execute with the first subgroup to avoid race conditions + if (sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group(0) == 0) { + // Get thread/subgroup information + auto local_id = int(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_local_id(0)); + + // ============================================ + // Use VNNI load instead of regular XE_LOAD_2D + // ============================================ + // Note: VNNI is typically used with make_block_2d_copy_B in GEMM context + // But for 
demonstration, we show the raw VNNI operation + using VnniOp = XE_LOAD_2D_VNNI; // BlockWidth = Width for single block + auto tiled_copy = make_block_2d_copy(VnniOp{}, src); + + // Get thread slice of the tiled copy + auto thr_copy = tiled_copy.get_slice(local_id); + + // Create coordinate tensor for a single tile + auto coord_shape = make_shape(Int{}, Int>{}); + Tensor coord_tile = make_identity_tensor(coord_shape); + + // Partition source coordinates and create destination fragment + auto thr_src_coord = thr_copy.partition_S(coord_tile); + auto thr_dst_frag = thr_copy.partition_fragment_D(coord_tile); + + // ============================================ + // THIS IS THE VNNI LOAD + // Hardware performs interleaving during this load + // Data in thr_dst_frag is now in VNNI interleaved format + // ============================================ + copy(tiled_copy, thr_src_coord, thr_dst_frag); + + // For verification, store back to destination + // Note: In real usage, thr_dst_frag would go directly to gemm(mma, tCrA, thr_dst_frag, tCrC) + using StoreOp = XE_STORE_2D; + auto tiled_store = make_block_2d_copy(StoreOp{}, dst); + auto thr_store = tiled_store.get_slice(local_id); + + // Create destination coordinates for the store operation + auto thr_dst_coord = thr_store.partition_D(coord_tile); + auto thr_src_frag = thr_store.partition_fragment_S(coord_tile); + + // Copy the loaded data from registers to the fragment for storing + copy(thr_dst_frag, thr_src_frag); + + // Perform the store operation from registers to global memory + copy(tiled_store, thr_src_frag, thr_dst_coord); + + // Synchronize to ensure all threads complete their operations + sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_group()); + } } -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_WideConfigs) { - // Test 8-bit VNNI configurations with wider widths - using VNNIOp_8bit_2x64 = XE_LOAD_2D_VNNI<8, 2, 64>; - using VNNIOp_8bit_4x64 = XE_LOAD_2D_VNNI<8, 4, 64>; - using VNNIOp_8bit_8x64 = XE_LOAD_2D_VNNI<8, 8, 64>; - - static_assert(VNNIOp_8bit_2x64::CopyBits == 8); - static_assert(VNNIOp_8bit_2x64::AtomHeight == 2); - static_assert(VNNIOp_8bit_2x64::AtomWidth == 64); - - static_assert(VNNIOp_8bit_4x64::CopyBits == 8); - static_assert(VNNIOp_8bit_4x64::AtomHeight == 4); - static_assert(VNNIOp_8bit_4x64::AtomWidth == 64); - - static_assert(VNNIOp_8bit_8x64::CopyBits == 8); - static_assert(VNNIOp_8bit_8x64::AtomHeight == 8); - static_assert(VNNIOp_8bit_8x64::AtomWidth == 64); - - EXPECT_TRUE(true) << "8-bit wide VNNI configurations validated"; -} +// Host test function for VNNI load operation +template +void test_xe_vnni_load() { + using namespace cute; + + // Matrix dimensions - must be compatible with block 2D constraints + constexpr int M = Height; + constexpr int N = Width * sizeof_bits_v / Bits; + + // Ensure proper alignment (required for block 2D operations) + constexpr int elem_alignment = 16 / sizeof(Element); + constexpr int aligned_N = ((N + elem_alignment - 1) / elem_alignment) * elem_alignment; + + // Allocate and initialize host data + cutlass::host_vector host_src(M * aligned_N); + cutlass::host_vector host_dst(M * aligned_N); -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_MinimalConfigs) { - // Test minimal 16-bit VNNI configurations - using VNNIOp_16bit_1x16 = XE_LOAD_2D_VNNI<16, 1, 16>; - using VNNIOp_16bit_2x16 = XE_LOAD_2D_VNNI<16, 2, 16>; - using VNNIOp_16bit_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>; - - static_assert(VNNIOp_16bit_1x16::CopyBits == 16); - static_assert(VNNIOp_16bit_1x16::AtomHeight == 1); - 
static_assert(VNNIOp_16bit_1x16::AtomWidth == 16); - static_assert(VNNIOp_16bit_2x16::CopyBits == 16); - static_assert(VNNIOp_16bit_2x16::AtomHeight == 2); - static_assert(VNNIOp_16bit_2x16::AtomWidth == 16); - - static_assert(VNNIOp_16bit_4x16::CopyBits == 16); - static_assert(VNNIOp_16bit_4x16::AtomHeight == 4); - static_assert(VNNIOp_16bit_4x16::AtomWidth == 16); - - EXPECT_TRUE(true) << "16-bit minimal VNNI configurations validated"; -} + // Initialize source with test pattern + for (size_t i = 0; i < host_src.size(); ++i) { + // Use a safe conversion that works for all numeric types + if constexpr (std::is_floating_point_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + + // For floating-point types, convert through float + float val = static_cast(i % 256) / 255.0f; // Normalize to [0,1] + host_src[i] = Element(val); + } else { + // For integer types (including uint64_t) and char, direct conversion is safe + host_src[i] = static_cast(i % 256); + } + } -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_MediumConfigs) { - // Test medium-sized 16-bit VNNI configurations - using VNNIOp_16bit_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; - using VNNIOp_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; - - static_assert(VNNIOp_16bit_8x16::CopyBits == 16); - static_assert(VNNIOp_16bit_8x16::AtomHeight == 8); - static_assert(VNNIOp_16bit_8x16::AtomWidth == 16); - - static_assert(VNNIOp_16bit_16x16::CopyBits == 16); - static_assert(VNNIOp_16bit_16x16::AtomHeight == 16); - static_assert(VNNIOp_16bit_16x16::AtomWidth == 16); - - EXPECT_TRUE(true) << "16-bit medium VNNI configurations validated"; + // Copy to device + cutlass::device_vector device_src = host_src; + cutlass::device_vector device_dst(M * aligned_N); + + // Create tensors with proper layout + Tensor tensor_src = + make_tensor(make_gmem_ptr(device_src.data()), + make_layout(Shape, Int>{}, Stride, _1>{})); + + Tensor tensor_dst = + make_tensor(make_gmem_ptr(device_dst.data()), + make_layout(Shape, Int>{}, Stride, _1>{})); + + // Launch kernel - VNNI load demonstration + auto blockDim = compat::dim3(SUBGROUP_SIZE); + auto gridDim = compat::dim3(1); + + launch, + XEVnniLoadKernelName>( + launch_policy{ + gridDim, blockDim, + kernel_properties{sycl_exp::sub_group_size} + }, + tensor_src, tensor_dst); + + compat::wait_and_throw(); + + // Note: We do NOT verify data matches because VNNI performs interleaving transformation + // The loaded data is in VNNI format (hardware-interleaved for DPAS consumption) + // When stored back to memory, the interleaved pattern is visible + // In real usage, VNNI data goes directly to gemm()/DPAS, never stored back + // This test verifies that VNNI load operation executes without errors } +// ============================================ +// VNNI Tests - Only 8-bit and 16-bit supported +// ============================================ -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_WideConfigs) { - // Test 16-bit VNNI configurations with wider widths - using VNNIOp_16bit_2x32 = XE_LOAD_2D_VNNI<16, 2, 32>; - using VNNIOp_16bit_4x32 = XE_LOAD_2D_VNNI<16, 4, 32>; - using VNNIOp_16bit_8x32 = XE_LOAD_2D_VNNI<16, 8, 32>; - - static_assert(VNNIOp_16bit_2x32::CopyBits == 16); - static_assert(VNNIOp_16bit_2x32::AtomHeight == 2); - static_assert(VNNIOp_16bit_2x32::AtomWidth == 32); - - static_assert(VNNIOp_16bit_4x32::CopyBits == 16); - static_assert(VNNIOp_16bit_4x32::AtomHeight == 4); - static_assert(VNNIOp_16bit_4x32::AtomWidth == 32); - - static_assert(VNNIOp_16bit_8x32::CopyBits == 16); - static_assert(VNNIOp_16bit_8x32::AtomHeight == 8); 
- static_assert(VNNIOp_16bit_8x32::AtomWidth == 32); - - EXPECT_TRUE(true) << "16-bit wide VNNI configurations validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_uint8) { + // VNNI is used for B matrix in GEMM - typically with BlockWidth creating multiple blocks + test_xe_vnni_load(); // 4 blocks of 16 + test_xe_vnni_load(); // 2 blocks of 32 + test_xe_vnni_load(); // 1 block of 64 } - - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_8bit_CustomBlockWidth) { - // Test 8-bit VNNI with custom BlockWidth parameter - using VNNIOp_8bit_4x32_bw16 = XE_LOAD_2D_VNNI<8, 4, 32, 16>; - using VNNIOp_8bit_8x32_bw16 = XE_LOAD_2D_VNNI<8, 8, 32, 16>; - using VNNIOp_8bit_16x32_bw16 = XE_LOAD_2D_VNNI<8, 16, 32, 16>; - - static_assert(VNNIOp_8bit_4x32_bw16::CopyBits == 8); - static_assert(VNNIOp_8bit_4x32_bw16::AtomHeight == 4); - static_assert(VNNIOp_8bit_4x32_bw16::AtomWidth == 32); - - static_assert(VNNIOp_8bit_8x32_bw16::CopyBits == 8); - static_assert(VNNIOp_8bit_8x32_bw16::AtomHeight == 8); - static_assert(VNNIOp_8bit_8x32_bw16::AtomWidth == 32); - - static_assert(VNNIOp_8bit_16x32_bw16::CopyBits == 8); - static_assert(VNNIOp_8bit_16x32_bw16::AtomHeight == 16); - static_assert(VNNIOp_8bit_16x32_bw16::AtomWidth == 32); - - EXPECT_TRUE(true) << "8-bit VNNI with custom BlockWidth validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_int8) { + test_xe_vnni_load(); + test_xe_vnni_load(); + test_xe_vnni_load(); } -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_16bit_CustomBlockWidth) { - // Test 16-bit VNNI with custom BlockWidth parameter - using VNNIOp_16bit_4x16_bw8 = XE_LOAD_2D_VNNI<16, 4, 16, 8>; - using VNNIOp_16bit_8x16_bw8 = XE_LOAD_2D_VNNI<16, 8, 16, 8>; - using VNNIOp_16bit_16x16_bw8 = XE_LOAD_2D_VNNI<16, 16, 16, 8>; - - static_assert(VNNIOp_16bit_4x16_bw8::CopyBits == 16); - static_assert(VNNIOp_16bit_4x16_bw8::AtomHeight == 4); - static_assert(VNNIOp_16bit_4x16_bw8::AtomWidth == 16); - - static_assert(VNNIOp_16bit_8x16_bw8::CopyBits == 16); - static_assert(VNNIOp_16bit_8x16_bw8::AtomHeight == 8); - static_assert(VNNIOp_16bit_8x16_bw8::AtomWidth == 16); - - static_assert(VNNIOp_16bit_16x16_bw8::CopyBits == 16); - static_assert(VNNIOp_16bit_16x16_bw8::AtomHeight == 16); - static_assert(VNNIOp_16bit_16x16_bw8::AtomWidth == 16); - - EXPECT_TRUE(true) << "16-bit VNNI with custom BlockWidth validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_uint16) { + test_xe_vnni_load(); // 2 blocks of 16 + test_xe_vnni_load(); // 2 blocks of 16 + test_xe_vnni_load(); // 1 block of 32 } - - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Int8_GEMMConfigs) { - // Test typical int8 GEMM VNNI configurations for K-dimension packing - using GEMM_Int8_4x32 = XE_LOAD_2D_VNNI<8, 4, 32>; - using GEMM_Int8_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; - using GEMM_Int8_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; - using GEMM_Int8_32x32 = XE_LOAD_2D_VNNI<8, 32, 32>; - - static_assert(GEMM_Int8_4x32::CopyBits == 8); - static_assert(GEMM_Int8_4x32::AtomHeight == 4); - static_assert(GEMM_Int8_4x32::AtomWidth == 32); - - static_assert(GEMM_Int8_8x32::CopyBits == 8); - static_assert(GEMM_Int8_8x32::AtomHeight == 8); - static_assert(GEMM_Int8_8x32::AtomWidth == 32); - - static_assert(GEMM_Int8_16x32::CopyBits == 8); - static_assert(GEMM_Int8_16x32::AtomHeight == 16); - static_assert(GEMM_Int8_16x32::AtomWidth == 32); - - static_assert(GEMM_Int8_32x32::CopyBits == 8); - static_assert(GEMM_Int8_32x32::AtomHeight == 32); - static_assert(GEMM_Int8_32x32::AtomWidth == 32); - - EXPECT_TRUE(true) << "Int8 GEMM VNNI configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_BF16_GEMMConfigs) { - // Test typical BF16/FP16 GEMM VNNI 
configurations for K-dimension packing - using GEMM_BF16_4x16 = XE_LOAD_2D_VNNI<16, 4, 16>; - using GEMM_BF16_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; - using GEMM_BF16_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; - using GEMM_BF16_32x16 = XE_LOAD_2D_VNNI<16, 32, 16>; - - static_assert(GEMM_BF16_4x16::CopyBits == 16); - static_assert(GEMM_BF16_4x16::AtomHeight == 4); - static_assert(GEMM_BF16_4x16::AtomWidth == 16); - - static_assert(GEMM_BF16_8x16::CopyBits == 16); - static_assert(GEMM_BF16_8x16::AtomHeight == 8); - static_assert(GEMM_BF16_8x16::AtomWidth == 16); - - static_assert(GEMM_BF16_16x16::CopyBits == 16); - static_assert(GEMM_BF16_16x16::AtomHeight == 16); - static_assert(GEMM_BF16_16x16::AtomWidth == 16); - - static_assert(GEMM_BF16_32x16::CopyBits == 16); - static_assert(GEMM_BF16_32x16::AtomHeight == 32); - static_assert(GEMM_BF16_32x16::AtomWidth == 16); - - EXPECT_TRUE(true) << "BF16/FP16 GEMM VNNI configurations validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_int16) { + test_xe_vnni_load(); + test_xe_vnni_load(); + test_xe_vnni_load(); } -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MoE_GEMMConfigs) { - // Test VNNI configurations used in MoE (Mixture of Experts) GEMM - // Based on example from 12_bmg_moe_gemm_cute_interface - using MoE_Load_A = XE_LOAD_2D_VNNI<16, 32, 16, 16>; - using MoE_Load_B_Alt1 = XE_LOAD_2D_VNNI<16, 16, 16>; - using MoE_Load_B_Alt2 = XE_LOAD_2D_VNNI<16, 8, 16>; - - static_assert(MoE_Load_A::CopyBits == 16); - static_assert(MoE_Load_A::AtomHeight == 32); - static_assert(MoE_Load_A::AtomWidth == 16); - - static_assert(MoE_Load_B_Alt1::CopyBits == 16); - static_assert(MoE_Load_B_Alt1::AtomHeight == 16); - static_assert(MoE_Load_B_Alt1::AtomWidth == 16); - - static_assert(MoE_Load_B_Alt2::CopyBits == 16); - static_assert(MoE_Load_B_Alt2::AtomHeight == 8); - static_assert(MoE_Load_B_Alt2::AtomWidth == 16); - - EXPECT_TRUE(true) << "MoE GEMM VNNI configurations validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_half) { + test_xe_vnni_load(); + test_xe_vnni_load(); + test_xe_vnni_load(); } - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MixedBlockWidthConfigs) { - // Test various BlockWidth settings for optimization - using VNNIOp_8bit_8x64_bw32 = XE_LOAD_2D_VNNI<8, 8, 64, 32>; - using VNNIOp_8bit_16x64_bw32 = XE_LOAD_2D_VNNI<8, 16, 64, 32>; - using VNNIOp_16bit_8x32_bw16 = XE_LOAD_2D_VNNI<16, 8, 32, 16>; - using VNNIOp_16bit_16x32_bw16 = XE_LOAD_2D_VNNI<16, 16, 32, 16>; - - static_assert(VNNIOp_8bit_8x64_bw32::CopyBits == 8); - static_assert(VNNIOp_8bit_8x64_bw32::AtomHeight == 8); - static_assert(VNNIOp_8bit_8x64_bw32::AtomWidth == 64); - - static_assert(VNNIOp_8bit_16x64_bw32::CopyBits == 8); - static_assert(VNNIOp_8bit_16x64_bw32::AtomHeight == 16); - static_assert(VNNIOp_8bit_16x64_bw32::AtomWidth == 64); - - static_assert(VNNIOp_16bit_8x32_bw16::CopyBits == 16); - static_assert(VNNIOp_16bit_8x32_bw16::AtomHeight == 8); - static_assert(VNNIOp_16bit_8x32_bw16::AtomWidth == 32); - - static_assert(VNNIOp_16bit_16x32_bw16::CopyBits == 16); - static_assert(VNNIOp_16bit_16x32_bw16::AtomHeight == 16); - static_assert(VNNIOp_16bit_16x32_bw16::AtomWidth == 32); - - EXPECT_TRUE(true) << "Mixed BlockWidth VNNI configurations validated"; +TEST(PVC_CuTe_Xe, XE_VNNI_2D_bfloat16) { + test_xe_vnni_load(); + test_xe_vnni_load(); + test_xe_vnni_load(); } - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_DataType_Consistency) { - // Test that CopyBits correctly reflects the data size - using VNNIOp_8bit_small = XE_LOAD_2D_VNNI<8, 2, 16>; - using VNNIOp_8bit_large = XE_LOAD_2D_VNNI<8, 16, 64>; - using VNNIOp_16bit_small = XE_LOAD_2D_VNNI<16, 2, 
16>; - using VNNIOp_16bit_large = XE_LOAD_2D_VNNI<16, 16, 32>; - - // All 8-bit variants should have CopyBits == 8 - static_assert(VNNIOp_8bit_small::CopyBits == 8); - static_assert(VNNIOp_8bit_large::CopyBits == 8); - - // All 16-bit variants should have CopyBits == 16 - static_assert(VNNIOp_16bit_small::CopyBits == 16); - static_assert(VNNIOp_16bit_large::CopyBits == 16); - - EXPECT_TRUE(true) << "Data type consistency validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_BlockWidth_Divisors) { - // Test BlockWidth as divisors of Width - using VNNIOp_8bit_8x32_bw8 = XE_LOAD_2D_VNNI<8, 8, 32, 8>; - using VNNIOp_8bit_8x32_bw16 = XE_LOAD_2D_VNNI<8, 8, 32, 16>; - using VNNIOp_8bit_8x64_bw16 = XE_LOAD_2D_VNNI<8, 8, 64, 16>; - using VNNIOp_8bit_8x64_bw32 = XE_LOAD_2D_VNNI<8, 8, 64, 32>; - using VNNIOp_16bit_8x32_bw8 = XE_LOAD_2D_VNNI<16, 8, 32, 8>; - using VNNIOp_16bit_8x32_bw16 = XE_LOAD_2D_VNNI<16, 8, 32, 16>; - - static_assert(VNNIOp_8bit_8x32_bw8::AtomHeight == 8 && VNNIOp_8bit_8x32_bw8::AtomWidth == 32); - static_assert(VNNIOp_8bit_8x32_bw16::AtomHeight == 8 && VNNIOp_8bit_8x32_bw16::AtomWidth == 32); - static_assert(VNNIOp_8bit_8x64_bw16::AtomHeight == 8 && VNNIOp_8bit_8x64_bw16::AtomWidth == 64); - static_assert(VNNIOp_8bit_8x64_bw32::AtomHeight == 8 && VNNIOp_8bit_8x64_bw32::AtomWidth == 64); - static_assert(VNNIOp_16bit_8x32_bw8::AtomHeight == 8 && VNNIOp_16bit_8x32_bw8::AtomWidth == 32); - static_assert(VNNIOp_16bit_8x32_bw16::AtomHeight == 8 && VNNIOp_16bit_8x32_bw16::AtomWidth == 32); - - EXPECT_TRUE(true) << "BlockWidth divisor configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Symmetric_Configs) { - // Test configurations with matching height and width values - using VNNIOp_8bit_16x16 = XE_LOAD_2D_VNNI<8, 16, 16>; - using VNNIOp_8bit_32x32 = XE_LOAD_2D_VNNI<8, 32, 32>; - using VNNIOp_16bit_8x8 = XE_LOAD_2D_VNNI<16, 8, 8>; - using VNNIOp_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; - using VNNIOp_16bit_32x32 = XE_LOAD_2D_VNNI<16, 32, 32>; - - static_assert(VNNIOp_8bit_16x16::AtomHeight == 16 && VNNIOp_8bit_16x16::AtomWidth == 16); - static_assert(VNNIOp_8bit_32x32::AtomHeight == 32 && VNNIOp_8bit_32x32::AtomWidth == 32); - static_assert(VNNIOp_16bit_8x8::AtomHeight == 8 && VNNIOp_16bit_8x8::AtomWidth == 8); - static_assert(VNNIOp_16bit_16x16::AtomHeight == 16 && VNNIOp_16bit_16x16::AtomWidth == 16); - static_assert(VNNIOp_16bit_32x32::AtomHeight == 32 && VNNIOp_16bit_32x32::AtomWidth == 32); - - EXPECT_TRUE(true) << "Symmetric VNNI configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_Small_Tiles) { - // Test small tile configurations useful for residual/boundary handling - using VNNIOp_8bit_1x16 = XE_LOAD_2D_VNNI<8, 1, 16>; - using VNNIOp_8bit_2x16 = XE_LOAD_2D_VNNI<8, 2, 16>; - using VNNIOp_8bit_1x32 = XE_LOAD_2D_VNNI<8, 1, 32>; - using VNNIOp_16bit_1x8 = XE_LOAD_2D_VNNI<16, 1, 8>; - using VNNIOp_16bit_2x8 = XE_LOAD_2D_VNNI<16, 2, 8>; - using VNNIOp_16bit_1x16 = XE_LOAD_2D_VNNI<16, 1, 16>; - - static_assert(VNNIOp_8bit_1x16::AtomHeight == 1 && VNNIOp_8bit_1x16::AtomWidth == 16); - static_assert(VNNIOp_8bit_2x16::AtomHeight == 2 && VNNIOp_8bit_2x16::AtomWidth == 16); - static_assert(VNNIOp_8bit_1x32::AtomHeight == 1 && VNNIOp_8bit_1x32::AtomWidth == 32); - static_assert(VNNIOp_16bit_1x8::AtomHeight == 1 && VNNIOp_16bit_1x8::AtomWidth == 8); - static_assert(VNNIOp_16bit_2x8::AtomHeight == 2 && VNNIOp_16bit_2x8::AtomWidth == 8); - static_assert(VNNIOp_16bit_1x16::AtomHeight == 1 && VNNIOp_16bit_1x16::AtomWidth == 16); - - EXPECT_TRUE(true) << "Small tile VNNI 
configurations validated"; -} - -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_MatMul_Optimized) { - // Test configurations optimized for matrix multiplication (DPAS integration) - // Based on typical DPAS dimensions: N=16 for all, K varies by data type - using MatMul_8bit_8x32 = XE_LOAD_2D_VNNI<8, 8, 32>; // K=32 for int8 - using MatMul_8bit_16x32 = XE_LOAD_2D_VNNI<8, 16, 32>; // M=16 tile - using MatMul_16bit_8x16 = XE_LOAD_2D_VNNI<16, 8, 16>; // K=16 for bf16/fp16 - using MatMul_16bit_16x16 = XE_LOAD_2D_VNNI<16, 16, 16>; // M=16, N=16 for bf16/fp16 - using MatMul_16bit_32x16 = XE_LOAD_2D_VNNI<16, 32, 16>; // Larger M tile - - // Verify dimensions match DPAS requirements - static_assert(MatMul_8bit_8x32::CopyBits == 8); - static_assert(MatMul_8bit_8x32::AtomHeight == 8); - static_assert(MatMul_8bit_8x32::AtomWidth == 32); // Matches int8 DPAS K dimension - - static_assert(MatMul_8bit_16x32::CopyBits == 8); - static_assert(MatMul_8bit_16x32::AtomHeight == 16); - static_assert(MatMul_8bit_16x32::AtomWidth == 32); - - static_assert(MatMul_16bit_8x16::CopyBits == 16); - static_assert(MatMul_16bit_8x16::AtomHeight == 8); - static_assert(MatMul_16bit_8x16::AtomWidth == 16); // Matches bf16/fp16 DPAS K dimension - - static_assert(MatMul_16bit_16x16::CopyBits == 16); - static_assert(MatMul_16bit_16x16::AtomHeight == 16); - static_assert(MatMul_16bit_16x16::AtomWidth == 16); - - static_assert(MatMul_16bit_32x16::CopyBits == 16); - static_assert(MatMul_16bit_32x16::AtomHeight == 32); - static_assert(MatMul_16bit_32x16::AtomWidth == 16); - - EXPECT_TRUE(true) << "MatMul-optimized VNNI configurations validated"; -} +// Note: 32-bit and 64-bit types are NOT supported by VNNI +// VNNI only works with 8-bit and 16-bit data types #else -TEST(CuTe_Xe, XE_LOAD_2D_VNNI_SKIPPED) { - GTEST_SKIP() << "XE_LOAD_2D_VNNI tests require IGC version 2.18 or higher. skipped"; +// For the fallback case +#include "cutlass_unit_test.h" + +TEST(PVC_CuTe_Xe, XE_VNNI_2D_SKIPPED) { + GTEST_SKIP() << "XE_VNNI_2D tests require IGC version 2.18 or higher. 
skipped"; } #endif From 042620ec182980d0166f9656ee13958640816ee5 Mon Sep 17 00:00:00 2001 From: Rishi Yadav Date: Tue, 9 Dec 2025 20:44:30 +0530 Subject: [PATCH 6/7] Update CMakeLists.txt --- examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt b/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt index 0e79d95484..1b3830649e 100644 --- a/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt +++ b/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt @@ -69,7 +69,7 @@ cutlass_example_add_executable( if(NOT DPCPP_SYCL_TARGET STREQUAL "spir64") # TODO(codeplay): Remove these once IGC block load loop hoisting bug is fixed target_link_options( 02_bmg_gemm_f16_s8_f16_tensorwise PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) - target_link_options( 02_bmg_gemm_bf16_s8_bf16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) + # target_link_options( 02_bmg_gemm_bf16_s8_bf16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) target_link_options( 02_bmg_gemm_f16_u4_f16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) target_link_options( 02_bmg_gemm_f16_u4_s8 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) endif() From 5c2c32bfd8935c12f8e8a6c73cf2409b082c745d Mon Sep 17 00:00:00 2001 From: Rishi Yadav Date: Tue, 9 Dec 2025 22:24:14 +0530 Subject: [PATCH 7/7] Update CMakeLists.txt --- examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt b/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt index 1b3830649e..0e79d95484 100644 --- a/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt +++ b/examples/02_bmg_gemm_mixed_dtype/CMakeLists.txt @@ -69,7 +69,7 @@ cutlass_example_add_executable( if(NOT DPCPP_SYCL_TARGET STREQUAL "spir64") # TODO(codeplay): Remove these once IGC block load loop hoisting bug is fixed target_link_options( 02_bmg_gemm_f16_s8_f16_tensorwise PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) - # target_link_options( 02_bmg_gemm_bf16_s8_bf16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) + target_link_options( 02_bmg_gemm_bf16_s8_bf16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) target_link_options( 02_bmg_gemm_f16_u4_f16 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) target_link_options( 02_bmg_gemm_f16_u4_s8 PRIVATE -Xs "-options \"-igc_opts 'allowDecompose2DBlockFuncs=0'\"" ) endif()