From 19eb242fe009f2fe05fae3b64d46341f0f4134f2 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 30 Jun 2022 20:13:19 +0100
Subject: [PATCH 1/4] Explicitly add extension headers in tests.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/BFloat16/bfloat16_builtins.cpp       | 4 ++--
 SYCL/Matrix/element_wise_all_ops_cuda.cpp | 4 ++--
 SYCL/Matrix/element_wise_wi_marray.cpp    | 2 +-
 SYCL/Matrix/joint_matrix_tensorcore.cpp   | 6 +++---
 4 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp
index ff84ecbeb3..76a5012e19 100644
--- a/SYCL/BFloat16/bfloat16_builtins.cpp
+++ b/SYCL/BFloat16/bfloat16_builtins.cpp
@@ -6,14 +6,14 @@
 //
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80
 // RUN: %t.out
-
 #include <sycl/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
 
 #include <cmath>
 #include <vector>
 
 using namespace cl::sycl;
-using sycl::ext::oneapi::experimental::bfloat16;
+using namespace sycl::ext::oneapi::experimental;
 
 constexpr int N = 60; // divisible by all tested array sizes
 constexpr float bf16_eps = 0.00390625;
diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
index 69976fa7e4..553f9e8d99 100644
--- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
@@ -9,8 +9,8 @@
 
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out
 // RUN: %t.out
-
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp
index 6ab3947ed9..5b82ebf9db 100644
--- a/SYCL/Matrix/element_wise_wi_marray.cpp
+++ b/SYCL/Matrix/element_wise_wi_marray.cpp
@@ -13,8 +13,8 @@
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
+using namespace sycl::ext::oneapi::experimental;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using sycl::ext::oneapi::experimental::bfloat16;
 
 #define SG_SZ 32
 
diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp
index 2b5078d415..18e564397a 100644
--- a/SYCL/Matrix/joint_matrix_tensorcore.cpp
+++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp
@@ -6,12 +6,12 @@
 // for the Nvidia case.  DPC++ JIT compilation is not
 // supported for the Nvidia matrix extension, although some JIT optimizations
 // are performed at the level of the PTX assembly code.
-
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
 
 using namespace sycl;
+using namespace sycl::ext::oneapi::experimental;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using sycl::ext::oneapi::experimental::bfloat16;
 constexpr float bf16_eps = 0.00390625;
 
 // Example usage of Nvidia matrix multiply.

From eb82212206671f3c51b33b3a7c2294b71aaae1d3 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 30 Jun 2022 20:47:32 +0100
Subject: [PATCH 2/4] Format: sort #include order in tests

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/BFloat16/bfloat16_builtins.cpp       | 2 +-
 SYCL/Matrix/element_wise_all_ops_cuda.cpp | 2 +-
 SYCL/Matrix/joint_matrix_tensorcore.cpp   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp
index 76a5012e19..41c3920cc6 100644
--- a/SYCL/BFloat16/bfloat16_builtins.cpp
+++ b/SYCL/BFloat16/bfloat16_builtins.cpp
@@ -6,8 +6,8 @@
 //
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80
 // RUN: %t.out
-#include <sycl/sycl.hpp>
 #include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/sycl.hpp>
 
 #include <cmath>
 #include <vector>
diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
index 553f9e8d99..7e58ceb18d 100644
--- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
@@ -9,8 +9,8 @@
 
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out
 // RUN: %t.out
-#include <sycl/sycl.hpp>
 #include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp
index 18e564397a..de93dea56d 100644
--- a/SYCL/Matrix/joint_matrix_tensorcore.cpp
+++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp
@@ -6,8 +6,8 @@
 // for the Nvidia case.  DPC++ JIT compilation is not
 // supported for the Nvidia matrix extension, although some JIT optimizations
 // are performed at the level of the PTX assembly code.
-#include <sycl/sycl.hpp>
 #include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental;

From 35844c95b86850fd982bce243d18f6c5160353af Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 1 Jul 2022 11:55:19 +0100
Subject: [PATCH 3/4] Explicitly include experimental/builtins.hpp, which was
 removed from sycl.hpp

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/BFloat16/bfloat16_builtins.cpp          | 4 ++--
 SYCL/DeviceLib/built-ins/ext_native_math.cpp | 3 ++-
 SYCL/Matrix/element_wise_all_ops_cuda.cpp    | 2 +-
 SYCL/Matrix/element_wise_wi_marray.cpp       | 1 +
 SYCL/Matrix/joint_matrix_tensorcore.cpp      | 2 +-
 SYCL/Printf/char.cpp                         | 3 ++-
 SYCL/Printf/double.cpp                       | 3 ++-
 SYCL/Printf/float.cpp                        | 3 ++-
 SYCL/Printf/int.cpp                          | 3 ++-
 SYCL/Printf/long.cpp                         | 3 ++-
 SYCL/Printf/mixed-address-space.cpp          | 3 ++-
 SYCL/Printf/percent-symbol.cpp               | 3 ++-
 12 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp
index 41c3920cc6..101d811666 100644
--- a/SYCL/BFloat16/bfloat16_builtins.cpp
+++ b/SYCL/BFloat16/bfloat16_builtins.cpp
@@ -6,13 +6,13 @@
 //
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend --cuda-gpu-arch=sm_80
 // RUN: %t.out
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
 #include <sycl/sycl.hpp>
 
 #include <cmath>
 #include <vector>
 
-using namespace cl::sycl;
+using namespace sycl;
 using namespace sycl::ext::oneapi::experimental;
 
 constexpr int N = 60; // divisible by all tested array sizes
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math.cpp b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
index 965bca2a76..a9ff8b0958 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
@@ -7,7 +7,8 @@
 // OpenCL CPU driver does not support cl_khr_fp16 extension for this reason this
 // test is compiled with the -fsycl-device-code-split flag
 
-#include <CL/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
+#include <sycl/sycl.hpp>
 #include <cassert>
 
 template <typename T> void assert_out_of_bound(T val, T lower, T upper) {
diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
index 7e58ceb18d..c73da53888 100644
--- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
@@ -9,7 +9,7 @@
 
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out
 // RUN: %t.out
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp
index 5b82ebf9db..e4188a567a 100644
--- a/SYCL/Matrix/element_wise_wi_marray.cpp
+++ b/SYCL/Matrix/element_wise_wi_marray.cpp
@@ -10,6 +10,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 %s -o %t.out
 // RUN: %t.out
 
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp
index de93dea56d..664944bacd 100644
--- a/SYCL/Matrix/joint_matrix_tensorcore.cpp
+++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp
@@ -6,7 +6,7 @@
 // for the Nvidia case.  DPC++ JIT compilation is not
 // supported for the Nvidia matrix extension, although some JIT optimizations
 // are performed at the level of the PTX assembly code.
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
diff --git a/SYCL/Printf/char.cpp b/SYCL/Printf/char.cpp
index 7cdca04ccf..6469ae25cf 100644
--- a/SYCL/Printf/char.cpp
+++ b/SYCL/Printf/char.cpp
@@ -25,7 +25,8 @@
 // CHECK: literal strings: s=Hello World!
 // CHECK_DISABLED: non-literal strings: s=Hello, World! ls=
 
-#include <CL/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
+#include <sycl/sycl.hpp>
 
 #include <cstring>
 
diff --git a/SYCL/Printf/double.cpp b/SYCL/Printf/double.cpp
index 446e2d7a88..11954860f7 100644
--- a/SYCL/Printf/double.cpp
+++ b/SYCL/Printf/double.cpp
@@ -28,7 +28,8 @@
 
 #include <iostream>
 
-#include <CL/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
+#include <sycl/sycl.hpp>
 
 #include "helper.hpp"
 
diff --git a/SYCL/Printf/float.cpp b/SYCL/Printf/float.cpp
index 7dcaa39276..7e95403c03 100644
--- a/SYCL/Printf/float.cpp
+++ b/SYCL/Printf/float.cpp
@@ -31,7 +31,8 @@
 
 #include <iostream>
 
-#include <CL/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
+#include <sycl/sycl.hpp>
 
 #include "helper.hpp"
 
diff --git a/SYCL/Printf/int.cpp b/SYCL/Printf/int.cpp
index c613bf3fd6..63c702dccc 100644
--- a/SYCL/Printf/int.cpp
+++ b/SYCL/Printf/int.cpp
@@ -20,7 +20,8 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.constant.out %GPU_CHECK_PLACEHOLDER
 // RUN: %ACC_RUN_PLACEHOLDER %t.constant.out %ACC_CHECK_PLACEHOLDER
 
-#include <CL/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
+#include <sycl/sycl.hpp>
 
 #include "helper.hpp"
 
diff --git a/SYCL/Printf/long.cpp b/SYCL/Printf/long.cpp
index b4cb14f411..f518600b79 100644
--- a/SYCL/Printf/long.cpp
+++ b/SYCL/Printf/long.cpp
@@ -20,7 +20,8 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.constant.out %GPU_CHECK_PLACEHOLDER
 // RUN: %ACC_RUN_PLACEHOLDER %t.constant.out %ACC_CHECK_PLACEHOLDER
 
-#include <CL/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
+#include <sycl/sycl.hpp>
 
 #include "helper.hpp"
 
diff --git a/SYCL/Printf/mixed-address-space.cpp b/SYCL/Printf/mixed-address-space.cpp
index 37cb83ce68..430b0275de 100644
--- a/SYCL/Printf/mixed-address-space.cpp
+++ b/SYCL/Printf/mixed-address-space.cpp
@@ -12,7 +12,8 @@
 // CHECK: Constant addrspace literal
 // CHECK: Generic addrspace literal
 
-#include <CL/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
+#include <sycl/sycl.hpp>
 
 #include "helper.hpp"
 
diff --git a/SYCL/Printf/percent-symbol.cpp b/SYCL/Printf/percent-symbol.cpp
index 5a23a7f2bf..8c4ec8e5aa 100644
--- a/SYCL/Printf/percent-symbol.cpp
+++ b/SYCL/Printf/percent-symbol.cpp
@@ -23,7 +23,8 @@
 // CHECK: %c %s %d %i %o %x %X %u
 // CHECK-NEXT: %f %F %e %E %a %A %g %G %n %p
 
-#include <CL/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/builtins.hpp>
+#include <sycl/sycl.hpp>
 
 #include <cstring>
 

From bfab5bc8e410f57993017d76ca3c98efc52716f6 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 1 Jul 2022 12:04:44 +0100
Subject: [PATCH 4/4] Format: sort #include order in ext_native_math.cpp

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/built-ins/ext_native_math.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SYCL/DeviceLib/built-ins/ext_native_math.cpp b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
index a9ff8b0958..083019362d 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
@@ -7,9 +7,9 @@
 // OpenCL CPU driver does not support cl_khr_fp16 extension for this reason this
 // test is compiled with the -fsycl-device-code-split flag
 
+#include <cassert>
 #include <sycl/ext/oneapi/experimental/builtins.hpp>
 #include <sycl/sycl.hpp>
-#include <cassert>
 
 template <typename T> void assert_out_of_bound(T val, T lower, T upper) {
   assert(sycl::all(lower < val && val < upper));