From f95793b9f67f50682d611061b3da853747a568e3 Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Thu, 21 Apr 2022 18:26:54 +0100
Subject: [PATCH 01/20] tests for vec/marray math

sycl::math, native and half_precision cases covered.

Signed-off-by: jack.kirk <jack.kirk@codeplay.com>
---
 SYCL/Basic/half_builtins.cpp                  |   3 +-
 SYCL/DeviceLib/built-ins/ext_native_math.cpp  | 184 ++++++++++++
 .../half_precision_math_test_marray_vec.cpp   | 169 +++++++++++
 SYCL/DeviceLib/math_test_marray_vec.cpp       | 281 ++++++++++++++++++
 .../DeviceLib/native_math_test_marray_vec.cpp | 156 ++++++++++
 5 files changed, 792 insertions(+), 1 deletion(-)
 create mode 100644 SYCL/DeviceLib/built-ins/ext_native_math.cpp
 create mode 100644 SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
 create mode 100644 SYCL/DeviceLib/math_test_marray_vec.cpp
 create mode 100644 SYCL/DeviceLib/native_math_test_marray_vec.cpp
diff --git a/SYCL/Basic/half_builtins.cpp b/SYCL/Basic/half_builtins.cpp
index 604ffb3c3a..e4f47695e1 100644
--- a/SYCL/Basic/half_builtins.cpp
+++ b/SYCL/Basic/half_builtins.cpp
@@ -166,6 +166,7 @@ template <int N> bool check(vec<float, N> a, vec<float, N> b) {
 
 int main() {
   queue q;
+  if (q.get_device().has(sycl::aspect::fp16)) {
   float16 a, b, c, d;
   for (int i = 0; i < SZ_max; i++) {
     a[i] = i / (float)SZ_max;
@@ -193,6 +194,6 @@ int main() {
     });
   }
   assert(err == 0);
-
+}
   return 0;
 }
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math.cpp b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
new file mode 100644
index 0000000000..9c64ec817f
--- /dev/null
+++ b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
@@ -0,0 +1,184 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// OpenCL CPU driver does not support cl_khr_fp16 extension for this reason this
+// test is compiled with the -fsycl-device-code-split flag
+
+#include <CL/sycl.hpp>
+#include <cassert>
+
+template <typename T, size_t N>
+void assert_out_of_bound(sycl::marray<T, N> val, sycl::marray<T, N> lower,
+                         sycl::marray<T, N> upper) {
+  for (int i = 0; i < N; i++) {
+    assert(lower[i] < val[i] && val[i] < upper[i]);
+  }
+}
+
+template <typename T> void assert_out_of_bound(T val, T lower, T upper) {
+  assert(sycl::all(lower < val && val < upper));
+}
+
+template <>
+void assert_out_of_bound<float>(float val, float lower, float upper) {
+  assert(lower < val && val < upper);
+}
+
+template <>
+void assert_out_of_bound<sycl::half>(sycl::half val, sycl::half lower,
+                                     sycl::half upper) {
+  assert(lower < val && val < upper);
+}
+
+template <typename T>
+void native_tanh_tester(sycl::queue q, T val, T up, T lo) {
+  T r = val;
+
+#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
+  {
+    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
+    q.submit([&](sycl::handler &cgh) {
+      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
+      cgh.single_task([=]() {
+        AccR[0] = sycl::ext::oneapi::experimental::native::tanh(AccR[0]);
+      });
+    });
+  }
+
+  assert_out_of_bound(r, up, lo);
+#else
+  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
+#endif
+}
+
+template <typename T>
+void native_exp2_tester(sycl::queue q, T val, T up, T lo) {
+  T r = val;
+
+#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
+  {
+    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
+    q.submit([&](sycl::handler &cgh) {
+      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
+      cgh.single_task([=]() {
+        AccR[0] = sycl::ext::oneapi::experimental::native::exp2(AccR[0]);
+      });
+    });
+  }
+
+  assert_out_of_bound(r, up, lo);
+#else
+  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
+#endif
+}
+
+int main() {
+
+  sycl::queue q;
+
+  const double tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
+                         -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
+  const double tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
+                         -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
+  const double tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
+                         -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
+
+  native_tanh_tester<float>(q, tv[0], tl[0], tu[0]);
+  native_tanh_tester<sycl::float2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
+                                   {tu[0], tu[1]});
+  native_tanh_tester<sycl::float3>(
+      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+  
+  native_tanh_tester<sycl::float4>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                   {tl[0], tl[1], tl[2], tl[3]},
+                                   {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::marray<float, 3>>(q, {tv[0], tv[1], tv[2]},
+                                           {tl[0], tl[1], tl[2]},
+                                           {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                           {tl[0], tl[1], tl[2], tl[3]},
+                                           {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                   {tl[0], tl[1], tl[2], tl[3]},
+                                   {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::float8>(
+      q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
+  native_tanh_tester<sycl::float16>(
+      q,
+      {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
+       tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
+       tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
+       tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
+
+  if (q.get_device().has(sycl::aspect::fp16)) {
+
+    native_tanh_tester<sycl::half>(q, tv[0], tl[0], tu[0]);
+    native_tanh_tester<sycl::half2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
+                                    {tu[0], tu[1]});
+    native_tanh_tester<sycl::half3>(
+        q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+    native_tanh_tester<sycl::marray<sycl::half, 3>>(
+        q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+    native_tanh_tester<sycl::half4>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                    {tl[0], tl[1], tl[2], tl[3]},
+                                    {tu[0], tu[1], tu[2], tu[3]});
+    native_tanh_tester<sycl::marray<sycl::half, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                    {tl[0], tl[1], tl[2], tl[3]},
+                                    {tu[0], tu[1], tu[2], tu[3]});
+    native_tanh_tester<sycl::half8>(
+        q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
+        {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
+        {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
+    native_tanh_tester<sycl::half16>(
+        q,
+        {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
+         tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
+        {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
+         tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
+        {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
+         tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
+
+    const double ev[16] = {-2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0,
+                           -2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0};
+    const double el[16] = {0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9,
+                           0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9};
+    const double eu[16] = {0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1,
+                           0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1};
+
+    native_exp2_tester<sycl::half>(q, ev[0], el[0], eu[0]);
+    native_exp2_tester<sycl::half2>(q, {ev[0], ev[1]}, {el[0], el[1]},
+                                    {eu[0], eu[1]});
+    native_exp2_tester<sycl::half3>(
+        q, {ev[0], ev[1], ev[2]}, {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
+    native_exp2_tester<sycl::half4>(q, {ev[0], ev[1], ev[2], ev[3]},
+                                    {el[0], el[1], el[2], el[3]},
+                                    {eu[0], eu[1], eu[2], eu[3]});
+    native_exp2_tester<sycl::marray<sycl::half, 3>>(q, {ev[0], ev[1], ev[2]},
+                                                {el[0], el[1], el[2]},
+                                                {eu[0], eu[1], eu[2]});
+    native_exp2_tester<sycl::marray<sycl::half, 4>>(q, {ev[0], ev[1], ev[2], ev[3]},
+                                                {el[0], el[1], el[2], el[3]},
+                                                {eu[0], eu[1], eu[2], eu[3]});
+    native_exp2_tester<sycl::half8>(
+        q, {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7]},
+        {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7]},
+        {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7]});
+    native_exp2_tester<sycl::half16>(
+        q,
+        {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7], ev[8], ev[9],
+         ev[10], ev[11], ev[12], ev[13], ev[14], ev[15]},
+        {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7], el[8], el[9],
+         el[10], el[11], el[12], el[13], el[14], el[15]},
+        {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7], eu[8], eu[9],
+         eu[10], eu[11], eu[12], eu[13], eu[14], eu[15]});
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
new file mode 100644
index 0000000000..0cc306a075
--- /dev/null
+++ b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
@@ -0,0 +1,169 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <typename T1, typename T2> class TypeHelper;
+
+template <typename T> bool checkEqual(vec<T, 3> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB;
+}
+
+template <typename T> bool checkEqual(vec<T, 4> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB;
+}
+
+template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
+  for (int i = 0; i < N; i++) {
+    if (A[i] != B) {
+      return false;
+    }
+  }
+  return true;
+}
+
+#define COMMA ,
+
+#define HALF_PRECISION_OPERATOR(NAME)                                          \
+  template <typename T>                                                        \
+  void half_precision_math_test_##NAME(queue &deviceQueue, T result, T input,  \
+                                       size_t ref) {                           \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input, 1);                                         \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input_access(      \
+            buffer2, cgh);                                                     \
+        cgh.single_task<TypeHelper<class half_precision##NAME, T>>([=]() {     \
+          res_access[0] = sycl::half_precision::NAME(input_access[0]);         \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+HALF_PRECISION_OPERATOR(sin)
+HALF_PRECISION_OPERATOR(tan)
+HALF_PRECISION_OPERATOR(cos)
+HALF_PRECISION_OPERATOR(exp)
+HALF_PRECISION_OPERATOR(exp2)
+HALF_PRECISION_OPERATOR(exp10)
+HALF_PRECISION_OPERATOR(log)
+HALF_PRECISION_OPERATOR(log2)
+HALF_PRECISION_OPERATOR(log10)
+HALF_PRECISION_OPERATOR(sqrt)
+HALF_PRECISION_OPERATOR(rsqrt)
+HALF_PRECISION_OPERATOR(recip)
+
+#undef HALF_PRECISION_OPERATOR
+
+#define HALF_PRECISION_OPERATOR_2(NAME)                                        \
+  template <typename T>                                                        \
+  void half_precision_math_test_2_##NAME(queue &deviceQueue, T result,         \
+                                         T input1, T input2, size_t ref) {     \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input1, 1);                                        \
+      buffer<T, 1> buffer3(&input2, 1);                                        \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+            buffer2, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+            buffer3, cgh);                                                     \
+        cgh.single_task<TypeHelper<class half_precision2##NAME, T>>([=]() {    \
+          res_access[0] =                                                      \
+              sycl::half_precision::NAME(input1_access[0], input2_access[0]);  \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+HALF_PRECISION_OPERATOR_2(divide)
+HALF_PRECISION_OPERATOR_2(powr)
+
+#undef HALF_PRECISION_OPERATOR_2
+
+#define HALF_PRECISION_TESTS_3(TYPE)                                           \
+  half_precision_math_test_sin(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0},   \
+                               0);                                             \
+  half_precision_math_test_tan(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0},   \
+                               0);                                             \
+  half_precision_math_test_cos(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0},   \
+                               1);                                             \
+  half_precision_math_test_exp(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0},   \
+                               1);                                             \
+  half_precision_math_test_exp2(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2},  \
+                                4);                                            \
+  half_precision_math_test_exp10(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2}, \
+                                 100);                                         \
+  half_precision_math_test_log(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1},   \
+                               0);                                             \
+  half_precision_math_test_log2(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4},  \
+                                2);                                            \
+  half_precision_math_test_log10(deviceQueue, TYPE{-1, -1, -1},                \
+                                 TYPE{100, 100, 100}, 2);                      \
+  half_precision_math_test_sqrt(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4},  \
+                                2);                                            \
+  half_precision_math_test_rsqrt(deviceQueue, TYPE{-1, -1, -1},                \
+                                 TYPE{0.25, 0.25, 0.25}, 2);                   \
+  half_precision_math_test_recip(deviceQueue, TYPE{-1, -1, -1},                \
+                                 TYPE{0.25, 0.25, 0.25}, 4);                   \
+  half_precision_math_test_2_powr(deviceQueue, TYPE{-1, -1, -1},               \
+                                  TYPE{2, 2, 2}, TYPE{2, 2, 2}, 4);            \
+  half_precision_math_test_2_divide(deviceQueue, TYPE{-1, -1, -1},             \
+                                    TYPE{4, 4, 4}, TYPE{2, 2, 2}, 2);
+
+#define HALF_PRECISION_TESTS_4(TYPE)                                           \
+  half_precision_math_test_sin(deviceQueue, TYPE{-1, -1, -1, -1},              \
+                               TYPE{0, 0, 0, 0}, 0);                           \
+  half_precision_math_test_tan(deviceQueue, TYPE{-1, -1, -1, -1},              \
+                               TYPE{0, 0, 0, 0}, 0);                           \
+  half_precision_math_test_cos(deviceQueue, TYPE{-1, -1, -1, -1},              \
+                               TYPE{0, 0, 0, 0}, 1);                           \
+  half_precision_math_test_exp(deviceQueue, TYPE{-1, -1, -1, -1},              \
+                               TYPE{0, 0, 0, 0}, 1);                           \
+  half_precision_math_test_exp2(deviceQueue, TYPE{-1, -1, -1, -1},             \
+                                TYPE{2, 2, 2, 2}, 4);                          \
+  half_precision_math_test_exp10(deviceQueue, TYPE{-1, -1, -1, -1},            \
+                                 TYPE{2, 2, 2, 2}, 100);                       \
+  half_precision_math_test_log(deviceQueue, TYPE{-1, -1, -1, -1},              \
+                               TYPE{1, 1, 1, 1}, 0);                           \
+  half_precision_math_test_log2(deviceQueue, TYPE{-1, -1, -1, -1},             \
+                                TYPE{4, 4, 4, 4}, 2);                          \
+  half_precision_math_test_log10(deviceQueue, TYPE{-1, -1, -1, -1},            \
+                                 TYPE{100, 100, 100, 100}, 2);                 \
+  half_precision_math_test_sqrt(deviceQueue, TYPE{-1, -1, -1, -1},             \
+                                TYPE{4, 4, 4, 4}, 2);                          \
+  half_precision_math_test_rsqrt(deviceQueue, TYPE{-1, -1, -1, -1},            \
+                                 TYPE{0.25, 0.25, 0.25, 0.25}, 2);             \
+  half_precision_math_test_recip(deviceQueue, TYPE{-1, -1, -1, -1},            \
+                                 TYPE{0.25, 0.25, 0.25, 0.25}, 4);             \
+  half_precision_math_test_2_powr(deviceQueue, TYPE{-1, -1, -1, -1},           \
+                                  TYPE{2, 2, 2, 2}, TYPE{2, 2, 2, 2}, 4);      \
+  half_precision_math_test_2_divide(deviceQueue, TYPE{-1, -1, -1, -1},         \
+                                    TYPE{4, 4, 4, 4}, TYPE{2, 2, 2, 2}, 2);
+
+int main() {
+  queue deviceQueue;
+
+  HALF_PRECISION_TESTS_3(float3)
+  HALF_PRECISION_TESTS_3(marray<float COMMA 3>)
+
+  HALF_PRECISION_TESTS_4(float4)
+  HALF_PRECISION_TESTS_4(marray<float COMMA 4>)
+
+  std::cout << "Pass" << std::endl;
+  return 0;
+}
diff --git a/SYCL/DeviceLib/math_test_marray_vec.cpp b/SYCL/DeviceLib/math_test_marray_vec.cpp
new file mode 100644
index 0000000000..53ca722725
--- /dev/null
+++ b/SYCL/DeviceLib/math_test_marray_vec.cpp
@@ -0,0 +1,281 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <typename T1, typename T2> class TypeHelper;
+
+template <typename T> bool checkEqual(vec<T, 3> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB;
+}
+
+template <typename T> bool checkEqual(vec<T, 4> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB;
+}
+
+template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
+  for (int i = 0; i < N; i++) {
+    if (A[i] != B) {
+      return false;
+    }
+  }
+  return true;
+}
+
+#define COMMA ,
+
+#define OPERATOR(NAME)                                                         \
+  template <typename T>                                                        \
+  void math_test_##NAME(queue &deviceQueue, T result, T input, size_t ref) {   \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input, 1);                                         \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input_access(      \
+            buffer2, cgh);                                                     \
+        cgh.single_task<TypeHelper<class a##NAME, T>>(                         \
+            [=]() { res_access[0] = NAME(input_access[0]); });                 \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+OPERATOR(cos)
+OPERATOR(cospi)
+OPERATOR(sin)
+OPERATOR(sinpi)
+OPERATOR(cosh)
+OPERATOR(sinh)
+OPERATOR(tan)
+OPERATOR(tanpi)
+OPERATOR(atan)
+OPERATOR(atanpi)
+OPERATOR(tanh)
+OPERATOR(acos)
+OPERATOR(acospi)
+OPERATOR(asin)
+OPERATOR(asinpi)
+OPERATOR(acosh)
+OPERATOR(asinh)
+OPERATOR(atanh)
+OPERATOR(cbrt)
+OPERATOR(ceil)
+OPERATOR(exp)
+OPERATOR(exp2)
+OPERATOR(exp10)
+OPERATOR(expm1)
+OPERATOR(tgamma)
+OPERATOR(lgamma)
+OPERATOR(erf)
+OPERATOR(erfc)
+OPERATOR(log)
+OPERATOR(log2)
+OPERATOR(log10)
+OPERATOR(log1p)
+OPERATOR(logb)
+OPERATOR(sqrt)
+OPERATOR(rsqrt)
+OPERATOR(rint)
+OPERATOR(round)
+OPERATOR(trunc)
+
+#undef OPERATOR
+
+#define OPERATOR_2(NAME)                                                       \
+  template <typename T>                                                        \
+  void math_test_2_##NAME(queue &deviceQueue, T result, T input1, T input2,    \
+                          size_t ref) {                                        \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input1, 1);                                        \
+      buffer<T, 1> buffer3(&input2, 1);                                        \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+            buffer2, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+            buffer3, cgh);                                                     \
+        cgh.single_task<TypeHelper<class a##NAME, T>>([=]() {                  \
+          res_access[0] = NAME(input1_access[0], input2_access[0]);            \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+OPERATOR_2(pow)
+OPERATOR_2(powr)
+OPERATOR_2(atan2)
+OPERATOR_2(atan2pi)
+OPERATOR_2(copysign)
+OPERATOR_2(fdim)
+OPERATOR_2(fmin)
+OPERATOR_2(fmax)
+OPERATOR_2(fmod)
+OPERATOR_2(hypot)
+OPERATOR_2(maxmag)
+OPERATOR_2(minmag)
+OPERATOR_2(nextafter)
+OPERATOR_2(remainder)
+
+#undef OPERATOR_2
+
+#define OPERATOR_3(NAME)                                                       \
+  template <typename T>                                                        \
+  void math_test_3_##NAME(queue &deviceQueue, T result, T input1, T input2,    \
+                          T input3, size_t ref) {                              \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input1, 1);                                        \
+      buffer<T, 1> buffer3(&input2, 1);                                        \
+      buffer<T, 1> buffer4(&input3, 1);                                        \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+            buffer2, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+            buffer3, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input3_access(     \
+            buffer4, cgh);                                                     \
+        cgh.single_task<TypeHelper<class a##NAME, T>>([=]() {                  \
+          res_access[0] =                                                      \
+              NAME(input1_access[0], input2_access[0], input3_access[0]);      \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+OPERATOR_3(mad)
+OPERATOR_3(mix)
+OPERATOR_3(fma)
+
+#undef OPERATOR_3
+
+#define TESTS_4(TYPE)                                                          \
+  math_test_tanh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);      \
+  math_test_cosh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 1);      \
+  math_test_sinh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);      \
+  math_test_acos(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 0);      \
+  math_test_acospi(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 0);    \
+  math_test_acosh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 0);     \
+  math_test_asin(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);      \
+  math_test_asinpi(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);    \
+  math_test_asinh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);     \
+  math_test_cbrt(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 1);      \
+  math_test_atan(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);      \
+  math_test_atanpi(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);    \
+  math_test_atanh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);     \
+  math_test_exp(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 1);       \
+  math_test_exp2(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2}, 4);      \
+  math_test_exp10(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2}, 100);   \
+  math_test_expm1(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);     \
+  math_test_ceil(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0.6, 0.6, 0.6, 0.6},  \
+                 1);                                                           \
+  math_test_tgamma(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 1);    \
+  math_test_lgamma(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 0);    \
+  math_test_erf(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);       \
+  math_test_erfc(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 1);      \
+  math_test_2_pow(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},         \
+                  TYPE{2, 2, 2, 2}, 4);                                        \
+  math_test_2_powr(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},        \
+                   TYPE{2, 2, 2, 2}, 4);                                       \
+  math_test_2_atan2(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},       \
+                    TYPE{2, 2, 2, 2}, 0);                                      \
+  math_test_2_atan2pi(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},     \
+                      TYPE{2, 2, 2, 2}, 0);                                    \
+  math_test_2_copysign(deviceQueue, TYPE{-1, -1, -1, -1},                      \
+                       TYPE{-3, -3, -3, -3}, TYPE{2, 2, 2, 2}, 3);             \
+  math_test_2_fmin(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},        \
+                   TYPE{3, 3, 3, 3}, 2);                                       \
+  math_test_2_fmax(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},        \
+                   TYPE{3, 3, 3, 3}, 3);                                       \
+  math_test_2_hypot(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{4, 4, 4, 4},       \
+                    TYPE{3, 3, 3, 3}, 5);                                      \
+  math_test_2_maxmag(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{-2, -2, -2, -2},  \
+                     TYPE{3, 3, 3, 3}, 3);                                     \
+  math_test_2_minmag(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},      \
+                     TYPE{-3, -3, -3, -3}, 2);                                 \
+  math_test_2_remainder(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{5, 5, 5, 5},   \
+                        TYPE{2, 2, 2, 2}, 1);                                  \
+  math_test_2_fdim(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{3, 3, 3, 3},        \
+                   TYPE{3, 3, 3, 3}, 0);                                       \
+  math_test_2_fmod(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{5, 5, 5, 5},        \
+                   TYPE{3, 3, 3, 3}, 2);                                       \
+  math_test_2_nextafter(deviceQueue, TYPE{-1, -1, -1, -1},                     \
+                        TYPE{-0, -0, -0, -0}, TYPE{+0, +0, +0, +0}, 0);        \
+  math_test_3_fma(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},         \
+                  TYPE{2, 2, 2, 2}, TYPE{1, 1, 1, 1}, 5);                      \
+  math_test_3_mad(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},         \
+                  TYPE{2, 2, 2, 2}, TYPE{1, 1, 1, 1}, 5);                      \
+  math_test_3_mix(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{3, 3, 3, 3},         \
+                  TYPE{5, 5, 5, 5}, TYPE{0.5, 0.5, 0.5, 0.5}, 4);
+
+#define TESTS_3(TYPE)                                                          \
+  math_test_tan(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);              \
+  math_test_tanh(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);             \
+  math_test_cos(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);              \
+  math_test_sin(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);              \
+  math_test_cosh(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);             \
+  math_test_sinh(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);             \
+  math_test_acos(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 0);             \
+  math_test_acosh(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 0);            \
+  math_test_asin(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);             \
+  math_test_asinh(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);            \
+  math_test_cbrt(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 1);             \
+  math_test_atan(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);             \
+  math_test_atanh(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);            \
+  math_test_exp(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);              \
+  math_test_exp2(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2}, 4);             \
+  math_test_exp10(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2}, 100);          \
+  math_test_expm1(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);            \
+  math_test_ceil(deviceQueue, TYPE{-1, -1, -1}, TYPE{0.6, 0.6, 0.6}, 1);       \
+  math_test_tgamma(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 1);           \
+  math_test_lgamma(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 0);           \
+  math_test_erf(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);              \
+  math_test_erfc(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);             \
+  math_test_log(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 0);              \
+  math_test_log2(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4}, 2);             \
+  math_test_log10(deviceQueue, TYPE{-1, -1, -1}, TYPE{100, 100, 100}, 2);      \
+  math_test_log1p(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);            \
+  math_test_logb(deviceQueue, TYPE{-1, -1, -1}, TYPE{1.1, 1.1, 1.1}, 0);       \
+  math_test_sqrt(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4}, 2);             \
+  math_test_rsqrt(deviceQueue, TYPE{-1, -1, -1}, TYPE{0.25, 0.25, 0.25}, 2);   \
+  math_test_rint(deviceQueue, TYPE{-1, -1, -1}, TYPE{2.9, 2.9, 2.9}, 3);       \
+  math_test_round(deviceQueue, TYPE{-1, -1, -1}, TYPE{0.5, 0.5, 0.5}, 1);      \
+  math_test_trunc(deviceQueue, TYPE{-1, -1, -1}, TYPE{1.9, 1.9, 1.9}, 1);
+
+int main() {
+  queue deviceQueue;
+
+  TESTS_4(float4)
+  TESTS_4(double4)
+  TESTS_4(marray<float COMMA 4>)
+  TESTS_4(marray<double COMMA 4>)
+
+  TESTS_3(float3)
+  TESTS_3(double3)
+  TESTS_3(marray<float COMMA 3>)
+  TESTS_3(marray<double COMMA 3>)
+
+  if (deviceQueue.get_device().has(sycl::aspect::fp16)) {
+    TESTS_4(half4)
+    TESTS_4(marray<half COMMA 4>)
+    TESTS_3(half3)
+    TESTS_3(marray<half COMMA 3>)
+  }
+
+  std::cout << "Pass" << std::endl;
+  return 0;
+}
diff --git a/SYCL/DeviceLib/native_math_test_marray_vec.cpp b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
new file mode 100644
index 0000000000..e6708f3779
--- /dev/null
+++ b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
@@ -0,0 +1,156 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <typename T1, typename T2> class TypeHelper;
+
+template <typename T> bool checkEqual(vec<T, 3> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB;
+}
+
+template <typename T> bool checkEqual(vec<T, 4> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB;
+}
+
+template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
+  for (int i = 0; i < N; i++) {
+    if (A[i] != B) {
+      return false;
+    }
+  }
+  return true;
+}
+
+#define COMMA ,
+
+#define NATIVE_OPERATOR(NAME)                                                  \
+  template <typename T>                                                        \
+  void native_math_test_##NAME(queue &deviceQueue, T result, T input,          \
+                               size_t ref) {                                   \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input, 1);                                         \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input_access(      \
+            buffer2, cgh);                                                     \
+        cgh.single_task<TypeHelper<class native##NAME, T>>(                    \
+            [=]() { res_access[0] = sycl::native::NAME(input_access[0]); });   \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+NATIVE_OPERATOR(sin)
+NATIVE_OPERATOR(tan)
+NATIVE_OPERATOR(cos)
+NATIVE_OPERATOR(exp)
+NATIVE_OPERATOR(exp2)
+NATIVE_OPERATOR(exp10)
+NATIVE_OPERATOR(log)
+NATIVE_OPERATOR(log2)
+NATIVE_OPERATOR(log10)
+NATIVE_OPERATOR(sqrt)
+NATIVE_OPERATOR(rsqrt)
+NATIVE_OPERATOR(recip)
+
+#undef NATIVE_OPERATOR
+
+#define NATIVE_OPERATOR_2(NAME)                                                \
+  template <typename T>                                                        \
+  void native_math_test_2_##NAME(queue &deviceQueue, T result, T input1,       \
+                                 T input2, size_t ref) {                       \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input1, 1);                                        \
+      buffer<T, 1> buffer3(&input2, 1);                                        \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+            buffer2, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+            buffer3, cgh);                                                     \
+        cgh.single_task<TypeHelper<class native2##NAME, T>>([=]() {            \
+          res_access[0] =                                                      \
+              sycl::native::NAME(input1_access[0], input2_access[0]);          \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+NATIVE_OPERATOR_2(divide)
+NATIVE_OPERATOR_2(powr)
+
+#undef NATIVE_OPERATOR_2
+
+#define NATIVE_TESTS_3(TYPE)                                                   \
+  native_math_test_sin(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);       \
+  native_math_test_tan(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);       \
+  native_math_test_cos(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);       \
+  native_math_test_exp(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);       \
+  native_math_test_exp2(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2}, 4);      \
+  native_math_test_log(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 0);       \
+  native_math_test_log2(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4}, 2);      \
+  native_math_test_log10(deviceQueue, TYPE{-1, -1, -1}, TYPE{100, 100, 100},   \
+                         2);                                                   \
+  native_math_test_sqrt(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4}, 2);      \
+  native_math_test_rsqrt(deviceQueue, TYPE{-1, -1, -1},                        \
+                         TYPE{0.25, 0.25, 0.25}, 2);                           \
+  native_math_test_recip(deviceQueue, TYPE{-1, -1, -1},                        \
+                         TYPE{0.25, 0.25, 0.25}, 4);                           \
+  native_math_test_2_powr(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2},        \
+                          TYPE{2, 2, 2}, 4);                                   \
+  native_math_test_2_divide(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4},      \
+                            TYPE{2, 2, 2}, 2);
+
+#define NATIVE_TESTS_4(TYPE)                                                   \
+  native_math_test_sin(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},    \
+                       0);                                                     \
+  native_math_test_tan(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},    \
+                       0);                                                     \
+  native_math_test_cos(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},    \
+                       1);                                                     \
+  native_math_test_exp(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},    \
+                       1);                                                     \
+  native_math_test_exp2(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},   \
+                        4);                                                    \
+  native_math_test_log(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1},    \
+                       0);                                                     \
+  native_math_test_log2(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{4, 4, 4, 4},   \
+                        2);                                                    \
+  native_math_test_log10(deviceQueue, TYPE{-1, -1, -1, -1},                    \
+                         TYPE{100, 100, 100, 100}, 2);                         \
+  native_math_test_sqrt(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{4, 4, 4, 4},   \
+                        2);                                                    \
+  native_math_test_rsqrt(deviceQueue, TYPE{-1, -1, -1, -1},                    \
+                         TYPE{0.25, 0.25, 0.25, 0.25}, 2);                     \
+  native_math_test_recip(deviceQueue, TYPE{-1, -1, -1, -1},                    \
+                         TYPE{0.25, 0.25, 0.25, 0.25}, 4);                     \
+  native_math_test_2_powr(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2}, \
+                          TYPE{2, 2, 2, 2}, 4);                                \
+  native_math_test_2_divide(deviceQueue, TYPE{-1, -1, -1, -1},                 \
+                            TYPE{4, 4, 4, 4}, TYPE{2, 2, 2, 2}, 2);
+
+int main() {
+  queue deviceQueue;
+
+  NATIVE_TESTS_3(float3)
+  NATIVE_TESTS_3(marray<float COMMA 3>)
+
+  NATIVE_TESTS_4(float4)
+  NATIVE_TESTS_4(marray<float COMMA 4>)
+
+  std::cout << "Pass" << std::endl;
+  return 0;
+}

From c7e2ff884d243f7c3ac2cb1d1524f38abee7522a Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Wed, 11 May 2022 14:11:03 +0100
Subject: [PATCH 02/20] Made template function improvement.

Signed-off-by: jack.kirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/built-ins/ext_native_math.cpp  |   2 +-
 .../half_precision_math_test_marray_vec.cpp   | 122 +++++-----
 SYCL/DeviceLib/math_test_marray_vec.cpp       | 210 +++++++++---------
 .../DeviceLib/native_math_test_marray_vec.cpp |  95 ++++----
 4 files changed, 203 insertions(+), 226 deletions(-)

diff --git a/SYCL/DeviceLib/built-ins/ext_native_math.cpp b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
index 9c64ec817f..913f51070e 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
@@ -181,4 +181,4 @@ int main() {
   }
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
index 0cc306a075..b2f3d152fe 100644
--- a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
@@ -29,8 +29,6 @@ template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
   return true;
 }
 
-#define COMMA ,
-
 #define HALF_PRECISION_OPERATOR(NAME)                                          \
   template <typename T>                                                        \
   void half_precision_math_test_##NAME(queue &deviceQueue, T result, T input,  \
@@ -95,75 +93,67 @@ HALF_PRECISION_OPERATOR_2(powr)
 
 #undef HALF_PRECISION_OPERATOR_2
 
-#define HALF_PRECISION_TESTS_3(TYPE)                                           \
-  half_precision_math_test_sin(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0},   \
-                               0);                                             \
-  half_precision_math_test_tan(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0},   \
-                               0);                                             \
-  half_precision_math_test_cos(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0},   \
-                               1);                                             \
-  half_precision_math_test_exp(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0},   \
-                               1);                                             \
-  half_precision_math_test_exp2(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2},  \
-                                4);                                            \
-  half_precision_math_test_exp10(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2}, \
-                                 100);                                         \
-  half_precision_math_test_log(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1},   \
-                               0);                                             \
-  half_precision_math_test_log2(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4},  \
-                                2);                                            \
-  half_precision_math_test_log10(deviceQueue, TYPE{-1, -1, -1},                \
-                                 TYPE{100, 100, 100}, 2);                      \
-  half_precision_math_test_sqrt(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4},  \
-                                2);                                            \
-  half_precision_math_test_rsqrt(deviceQueue, TYPE{-1, -1, -1},                \
-                                 TYPE{0.25, 0.25, 0.25}, 2);                   \
-  half_precision_math_test_recip(deviceQueue, TYPE{-1, -1, -1},                \
-                                 TYPE{0.25, 0.25, 0.25}, 4);                   \
-  half_precision_math_test_2_powr(deviceQueue, TYPE{-1, -1, -1},               \
-                                  TYPE{2, 2, 2}, TYPE{2, 2, 2}, 4);            \
-  half_precision_math_test_2_divide(deviceQueue, TYPE{-1, -1, -1},             \
-                                    TYPE{4, 4, 4}, TYPE{2, 2, 2}, 2);
-
-#define HALF_PRECISION_TESTS_4(TYPE)                                           \
-  half_precision_math_test_sin(deviceQueue, TYPE{-1, -1, -1, -1},              \
-                               TYPE{0, 0, 0, 0}, 0);                           \
-  half_precision_math_test_tan(deviceQueue, TYPE{-1, -1, -1, -1},              \
-                               TYPE{0, 0, 0, 0}, 0);                           \
-  half_precision_math_test_cos(deviceQueue, TYPE{-1, -1, -1, -1},              \
-                               TYPE{0, 0, 0, 0}, 1);                           \
-  half_precision_math_test_exp(deviceQueue, TYPE{-1, -1, -1, -1},              \
-                               TYPE{0, 0, 0, 0}, 1);                           \
-  half_precision_math_test_exp2(deviceQueue, TYPE{-1, -1, -1, -1},             \
-                                TYPE{2, 2, 2, 2}, 4);                          \
-  half_precision_math_test_exp10(deviceQueue, TYPE{-1, -1, -1, -1},            \
-                                 TYPE{2, 2, 2, 2}, 100);                       \
-  half_precision_math_test_log(deviceQueue, TYPE{-1, -1, -1, -1},              \
-                               TYPE{1, 1, 1, 1}, 0);                           \
-  half_precision_math_test_log2(deviceQueue, TYPE{-1, -1, -1, -1},             \
-                                TYPE{4, 4, 4, 4}, 2);                          \
-  half_precision_math_test_log10(deviceQueue, TYPE{-1, -1, -1, -1},            \
-                                 TYPE{100, 100, 100, 100}, 2);                 \
-  half_precision_math_test_sqrt(deviceQueue, TYPE{-1, -1, -1, -1},             \
-                                TYPE{4, 4, 4, 4}, 2);                          \
-  half_precision_math_test_rsqrt(deviceQueue, TYPE{-1, -1, -1, -1},            \
-                                 TYPE{0.25, 0.25, 0.25, 0.25}, 2);             \
-  half_precision_math_test_recip(deviceQueue, TYPE{-1, -1, -1, -1},            \
-                                 TYPE{0.25, 0.25, 0.25, 0.25}, 4);             \
-  half_precision_math_test_2_powr(deviceQueue, TYPE{-1, -1, -1, -1},           \
-                                  TYPE{2, 2, 2, 2}, TYPE{2, 2, 2, 2}, 4);      \
-  half_precision_math_test_2_divide(deviceQueue, TYPE{-1, -1, -1, -1},         \
-                                    TYPE{4, 4, 4, 4}, TYPE{2, 2, 2, 2}, 2);
+template <typename T> void half_precision_math_tests_3(queue &deviceQueue) {
+  half_precision_math_test_sin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  half_precision_math_test_tan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  half_precision_math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  half_precision_math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  half_precision_math_test_exp2(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 4);
+  half_precision_math_test_exp10(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 100);
+  half_precision_math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  half_precision_math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  half_precision_math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100},
+                                 2);
+  half_precision_math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  half_precision_math_test_rsqrt(deviceQueue, T{-1, -1, -1},
+                                 T{0.25, 0.25, 0.25}, 2);
+  half_precision_math_test_recip(deviceQueue, T{-1, -1, -1},
+                                 T{0.25, 0.25, 0.25}, 4);
+  half_precision_math_test_2_powr(deviceQueue, T{-1, -1, -1}, T{2, 2, 2},
+                                  T{2, 2, 2}, 4);
+  half_precision_math_test_2_divide(deviceQueue, T{-1, -1, -1}, T{4, 4, 4},
+                                    T{2, 2, 2}, 2);
+}
+
+template <typename T> void half_precision_math_tests_4(queue &deviceQueue) {
+  half_precision_math_test_sin(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                               0);
+  half_precision_math_test_tan(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                               0);
+  half_precision_math_test_cos(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                               1);
+  half_precision_math_test_exp(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                               1);
+  half_precision_math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
+                                4);
+  half_precision_math_test_exp10(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
+                                 100);
+  half_precision_math_test_log(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1},
+                               0);
+  half_precision_math_test_log2(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
+                                2);
+  half_precision_math_test_log10(deviceQueue, T{-1, -1, -1, -1},
+                                 T{100, 100, 100, 100}, 2);
+  half_precision_math_test_sqrt(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
+                                2);
+  half_precision_math_test_rsqrt(deviceQueue, T{-1, -1, -1, -1},
+                                 T{0.25, 0.25, 0.25, 0.25}, 2);
+  half_precision_math_test_recip(deviceQueue, T{-1, -1, -1, -1},
+                                 T{0.25, 0.25, 0.25, 0.25}, 4);
+  half_precision_math_test_2_powr(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
+                                  T{2, 2, 2, 2}, 4);
+  half_precision_math_test_2_divide(deviceQueue, T{-1, -1, -1, -1},
+                                    T{4, 4, 4, 4}, T{2, 2, 2, 2}, 2);
+}
 
 int main() {
   queue deviceQueue;
 
-  HALF_PRECISION_TESTS_3(float3)
-  HALF_PRECISION_TESTS_3(marray<float COMMA 3>)
-
-  HALF_PRECISION_TESTS_4(float4)
-  HALF_PRECISION_TESTS_4(marray<float COMMA 4>)
+  half_precision_math_tests_3<float3>(deviceQueue);
+  half_precision_math_tests_3<marray<float, 3>>(deviceQueue);
 
+  half_precision_math_tests_4<float4>(deviceQueue);
+  half_precision_math_tests_4<marray<float, 4>>(deviceQueue);
   std::cout << "Pass" << std::endl;
   return 0;
 }
diff --git a/SYCL/DeviceLib/math_test_marray_vec.cpp b/SYCL/DeviceLib/math_test_marray_vec.cpp
index 53ca722725..17aec48570 100644
--- a/SYCL/DeviceLib/math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/math_test_marray_vec.cpp
@@ -29,8 +29,6 @@ template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
   return true;
 }
 
-#define COMMA ,
-
 #define OPERATOR(NAME)                                                         \
   template <typename T>                                                        \
   void math_test_##NAME(queue &deviceQueue, T result, T input, size_t ref) {   \
@@ -163,117 +161,117 @@ OPERATOR_3(fma)
 
 #undef OPERATOR_3
 
-#define TESTS_4(TYPE)                                                          \
-  math_test_tanh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);      \
-  math_test_cosh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 1);      \
-  math_test_sinh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);      \
-  math_test_acos(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 0);      \
-  math_test_acospi(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 0);    \
-  math_test_acosh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 0);     \
-  math_test_asin(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);      \
-  math_test_asinpi(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);    \
-  math_test_asinh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);     \
-  math_test_cbrt(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 1);      \
-  math_test_atan(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);      \
-  math_test_atanpi(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);    \
-  math_test_atanh(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);     \
-  math_test_exp(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 1);       \
-  math_test_exp2(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2}, 4);      \
-  math_test_exp10(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2}, 100);   \
-  math_test_expm1(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);     \
-  math_test_ceil(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0.6, 0.6, 0.6, 0.6},  \
-                 1);                                                           \
-  math_test_tgamma(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 1);    \
-  math_test_lgamma(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1}, 0);    \
-  math_test_erf(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 0);       \
-  math_test_erfc(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0}, 1);      \
-  math_test_2_pow(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},         \
-                  TYPE{2, 2, 2, 2}, 4);                                        \
-  math_test_2_powr(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},        \
-                   TYPE{2, 2, 2, 2}, 4);                                       \
-  math_test_2_atan2(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},       \
-                    TYPE{2, 2, 2, 2}, 0);                                      \
-  math_test_2_atan2pi(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},     \
-                      TYPE{2, 2, 2, 2}, 0);                                    \
-  math_test_2_copysign(deviceQueue, TYPE{-1, -1, -1, -1},                      \
-                       TYPE{-3, -3, -3, -3}, TYPE{2, 2, 2, 2}, 3);             \
-  math_test_2_fmin(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},        \
-                   TYPE{3, 3, 3, 3}, 2);                                       \
-  math_test_2_fmax(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},        \
-                   TYPE{3, 3, 3, 3}, 3);                                       \
-  math_test_2_hypot(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{4, 4, 4, 4},       \
-                    TYPE{3, 3, 3, 3}, 5);                                      \
-  math_test_2_maxmag(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{-2, -2, -2, -2},  \
-                     TYPE{3, 3, 3, 3}, 3);                                     \
-  math_test_2_minmag(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},      \
-                     TYPE{-3, -3, -3, -3}, 2);                                 \
-  math_test_2_remainder(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{5, 5, 5, 5},   \
-                        TYPE{2, 2, 2, 2}, 1);                                  \
-  math_test_2_fdim(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{3, 3, 3, 3},        \
-                   TYPE{3, 3, 3, 3}, 0);                                       \
-  math_test_2_fmod(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{5, 5, 5, 5},        \
-                   TYPE{3, 3, 3, 3}, 2);                                       \
-  math_test_2_nextafter(deviceQueue, TYPE{-1, -1, -1, -1},                     \
-                        TYPE{-0, -0, -0, -0}, TYPE{+0, +0, +0, +0}, 0);        \
-  math_test_3_fma(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},         \
-                  TYPE{2, 2, 2, 2}, TYPE{1, 1, 1, 1}, 5);                      \
-  math_test_3_mad(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},         \
-                  TYPE{2, 2, 2, 2}, TYPE{1, 1, 1, 1}, 5);                      \
-  math_test_3_mix(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{3, 3, 3, 3},         \
-                  TYPE{5, 5, 5, 5}, TYPE{0.5, 0.5, 0.5, 0.5}, 4);
+template <typename T> void math_tests_4(queue &deviceQueue) {
+  math_test_tanh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_cosh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  math_test_sinh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_acos(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_acospi(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_acosh(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_asin(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_asinpi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_asinh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_cbrt(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 1);
+  math_test_atan(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_atanpi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_atanh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_exp(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 4);
+  math_test_exp10(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 100);
+  math_test_expm1(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_ceil(deviceQueue, T{-1, -1, -1, -1}, T{0.6, 0.6, 0.6, 0.6}, 1);
+  math_test_tgamma(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 1);
+  math_test_lgamma(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_erf(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_erfc(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  math_test_2_pow(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                  4);
+  math_test_2_powr(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                   4);
+  math_test_2_atan2(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                    T{2, 2, 2, 2}, 0);
+  math_test_2_atan2pi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                      T{2, 2, 2, 2}, 0);
+  math_test_2_copysign(deviceQueue, T{-1, -1, -1, -1}, T{-3, -3, -3, -3},
+                       T{2, 2, 2, 2}, 3);
+  math_test_2_fmin(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{3, 3, 3, 3},
+                   2);
+  math_test_2_fmax(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{3, 3, 3, 3},
+                   3);
+  math_test_2_hypot(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
+                    T{3, 3, 3, 3}, 5);
+  math_test_2_maxmag(deviceQueue, T{-1, -1, -1, -1}, T{-2, -2, -2, -2},
+                     T{3, 3, 3, 3}, 3);
+  math_test_2_minmag(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
+                     T{-3, -3, -3, -3}, 2);
+  math_test_2_remainder(deviceQueue, T{-1, -1, -1, -1}, T{5, 5, 5, 5},
+                        T{2, 2, 2, 2}, 1);
+  math_test_2_fdim(deviceQueue, T{-1, -1, -1, -1}, T{3, 3, 3, 3}, T{3, 3, 3, 3},
+                   0);
+  math_test_2_fmod(deviceQueue, T{-1, -1, -1, -1}, T{5, 5, 5, 5}, T{3, 3, 3, 3},
+                   2);
+  math_test_2_nextafter(deviceQueue, T{-1, -1, -1, -1}, T{-0, -0, -0, -0},
+                        T{+0, +0, +0, +0}, 0);
+  math_test_3_fma(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                  T{1, 1, 1, 1}, 5);
+  math_test_3_mad(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                  T{1, 1, 1, 1}, 5);
+  math_test_3_mix(deviceQueue, T{-1, -1, -1, -1}, T{3, 3, 3, 3}, T{5, 5, 5, 5},
+                  T{0.5, 0.5, 0.5, 0.5}, 4);
+}
 
-#define TESTS_3(TYPE)                                                          \
-  math_test_tan(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);              \
-  math_test_tanh(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);             \
-  math_test_cos(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);              \
-  math_test_sin(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);              \
-  math_test_cosh(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);             \
-  math_test_sinh(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);             \
-  math_test_acos(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 0);             \
-  math_test_acosh(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 0);            \
-  math_test_asin(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);             \
-  math_test_asinh(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);            \
-  math_test_cbrt(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 1);             \
-  math_test_atan(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);             \
-  math_test_atanh(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);            \
-  math_test_exp(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);              \
-  math_test_exp2(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2}, 4);             \
-  math_test_exp10(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2}, 100);          \
-  math_test_expm1(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);            \
-  math_test_ceil(deviceQueue, TYPE{-1, -1, -1}, TYPE{0.6, 0.6, 0.6}, 1);       \
-  math_test_tgamma(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 1);           \
-  math_test_lgamma(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 0);           \
-  math_test_erf(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);              \
-  math_test_erfc(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);             \
-  math_test_log(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 0);              \
-  math_test_log2(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4}, 2);             \
-  math_test_log10(deviceQueue, TYPE{-1, -1, -1}, TYPE{100, 100, 100}, 2);      \
-  math_test_log1p(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);            \
-  math_test_logb(deviceQueue, TYPE{-1, -1, -1}, TYPE{1.1, 1.1, 1.1}, 0);       \
-  math_test_sqrt(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4}, 2);             \
-  math_test_rsqrt(deviceQueue, TYPE{-1, -1, -1}, TYPE{0.25, 0.25, 0.25}, 2);   \
-  math_test_rint(deviceQueue, TYPE{-1, -1, -1}, TYPE{2.9, 2.9, 2.9}, 3);       \
-  math_test_round(deviceQueue, TYPE{-1, -1, -1}, TYPE{0.5, 0.5, 0.5}, 1);      \
-  math_test_trunc(deviceQueue, TYPE{-1, -1, -1}, TYPE{1.9, 1.9, 1.9}, 1);
+template <typename T> void math_tests_3(queue &deviceQueue) {
+  math_test_tan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_tanh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_sin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_cosh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_sinh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_acos(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_acosh(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_asin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_asinh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_cbrt(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 1);
+  math_test_atan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_atanh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_exp2(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 4);
+  math_test_exp10(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 100);
+  math_test_expm1(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_ceil(deviceQueue, T{-1, -1, -1}, T{0.6, 0.6, 0.6}, 1);
+  math_test_tgamma(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 1);
+  math_test_lgamma(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_erf(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_erfc(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100}, 2);
+  math_test_log1p(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_logb(deviceQueue, T{-1, -1, -1}, T{1.1, 1.1, 1.1}, 0);
+  math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  math_test_rsqrt(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 2);
+  math_test_rint(deviceQueue, T{-1, -1, -1}, T{2.9, 2.9, 2.9}, 3);
+  math_test_round(deviceQueue, T{-1, -1, -1}, T{0.5, 0.5, 0.5}, 1);
+  math_test_trunc(deviceQueue, T{-1, -1, -1}, T{1.9, 1.9, 1.9}, 1);
+}
 
 int main() {
   queue deviceQueue;
+  math_tests_4<float4>(deviceQueue);
+  math_tests_4<double4>(deviceQueue);
+  math_tests_4<marray<float, 4>>(deviceQueue);
+  math_tests_4<marray<double, 4>>(deviceQueue);
 
-  TESTS_4(float4)
-  TESTS_4(double4)
-  TESTS_4(marray<float COMMA 4>)
-  TESTS_4(marray<double COMMA 4>)
-
-  TESTS_3(float3)
-  TESTS_3(double3)
-  TESTS_3(marray<float COMMA 3>)
-  TESTS_3(marray<double COMMA 3>)
+  math_tests_3<float3>(deviceQueue);
+  math_tests_3<double3>(deviceQueue);
+  math_tests_3<marray<float, 3>>(deviceQueue);
+  math_tests_3<marray<double, 3>>(deviceQueue);
 
   if (deviceQueue.get_device().has(sycl::aspect::fp16)) {
-    TESTS_4(half4)
-    TESTS_4(marray<half COMMA 4>)
-    TESTS_3(half3)
-    TESTS_3(marray<half COMMA 3>)
+    math_tests_4<half4>(deviceQueue);
+    math_tests_4<marray<half, 4>>(deviceQueue);
+    math_tests_3<half3>(deviceQueue);
+    math_tests_3<marray<half, 3>>(deviceQueue);
   }
 
   std::cout << "Pass" << std::endl;
diff --git a/SYCL/DeviceLib/native_math_test_marray_vec.cpp b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
index e6708f3779..f138ced366 100644
--- a/SYCL/DeviceLib/native_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
@@ -29,8 +29,6 @@ template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
   return true;
 }
 
-#define COMMA ,
-
 #define NATIVE_OPERATOR(NAME)                                                  \
   template <typename T>                                                        \
   void native_math_test_##NAME(queue &deviceQueue, T result, T input,          \
@@ -94,62 +92,53 @@ NATIVE_OPERATOR_2(powr)
 
 #undef NATIVE_OPERATOR_2
 
-#define NATIVE_TESTS_3(TYPE)                                                   \
-  native_math_test_sin(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);       \
-  native_math_test_tan(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 0);       \
-  native_math_test_cos(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);       \
-  native_math_test_exp(deviceQueue, TYPE{-1, -1, -1}, TYPE{0, 0, 0}, 1);       \
-  native_math_test_exp2(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2}, 4);      \
-  native_math_test_log(deviceQueue, TYPE{-1, -1, -1}, TYPE{1, 1, 1}, 0);       \
-  native_math_test_log2(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4}, 2);      \
-  native_math_test_log10(deviceQueue, TYPE{-1, -1, -1}, TYPE{100, 100, 100},   \
-                         2);                                                   \
-  native_math_test_sqrt(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4}, 2);      \
-  native_math_test_rsqrt(deviceQueue, TYPE{-1, -1, -1},                        \
-                         TYPE{0.25, 0.25, 0.25}, 2);                           \
-  native_math_test_recip(deviceQueue, TYPE{-1, -1, -1},                        \
-                         TYPE{0.25, 0.25, 0.25}, 4);                           \
-  native_math_test_2_powr(deviceQueue, TYPE{-1, -1, -1}, TYPE{2, 2, 2},        \
-                          TYPE{2, 2, 2}, 4);                                   \
-  native_math_test_2_divide(deviceQueue, TYPE{-1, -1, -1}, TYPE{4, 4, 4},      \
-                            TYPE{2, 2, 2}, 2);
-
-#define NATIVE_TESTS_4(TYPE)                                                   \
-  native_math_test_sin(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},    \
-                       0);                                                     \
-  native_math_test_tan(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},    \
-                       0);                                                     \
-  native_math_test_cos(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},    \
-                       1);                                                     \
-  native_math_test_exp(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{0, 0, 0, 0},    \
-                       1);                                                     \
-  native_math_test_exp2(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2},   \
-                        4);                                                    \
-  native_math_test_log(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{1, 1, 1, 1},    \
-                       0);                                                     \
-  native_math_test_log2(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{4, 4, 4, 4},   \
-                        2);                                                    \
-  native_math_test_log10(deviceQueue, TYPE{-1, -1, -1, -1},                    \
-                         TYPE{100, 100, 100, 100}, 2);                         \
-  native_math_test_sqrt(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{4, 4, 4, 4},   \
-                        2);                                                    \
-  native_math_test_rsqrt(deviceQueue, TYPE{-1, -1, -1, -1},                    \
-                         TYPE{0.25, 0.25, 0.25, 0.25}, 2);                     \
-  native_math_test_recip(deviceQueue, TYPE{-1, -1, -1, -1},                    \
-                         TYPE{0.25, 0.25, 0.25, 0.25}, 4);                     \
-  native_math_test_2_powr(deviceQueue, TYPE{-1, -1, -1, -1}, TYPE{2, 2, 2, 2}, \
-                          TYPE{2, 2, 2, 2}, 4);                                \
-  native_math_test_2_divide(deviceQueue, TYPE{-1, -1, -1, -1},                 \
-                            TYPE{4, 4, 4, 4}, TYPE{2, 2, 2, 2}, 2);
+template <typename T> void native_tests_3(queue &deviceQueue) {
+  native_math_test_sin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  native_math_test_tan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  native_math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  native_math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  native_math_test_exp2(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 4);
+  native_math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  native_math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  native_math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100}, 2);
+  native_math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  native_math_test_rsqrt(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 2);
+  native_math_test_recip(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 4);
+  native_math_test_2_powr(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, T{2, 2, 2},
+                          4);
+  native_math_test_2_divide(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, T{2, 2, 2},
+                            2);
+}
+
+template <typename T> void native_tests_4(queue &deviceQueue) {
+  native_math_test_sin(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  native_math_test_tan(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  native_math_test_cos(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  native_math_test_exp(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  native_math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 4);
+  native_math_test_log(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  native_math_test_log2(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4}, 2);
+  native_math_test_log10(deviceQueue, T{-1, -1, -1, -1}, T{100, 100, 100, 100},
+                         2);
+  native_math_test_sqrt(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4}, 2);
+  native_math_test_rsqrt(deviceQueue, T{-1, -1, -1, -1},
+                         T{0.25, 0.25, 0.25, 0.25}, 2);
+  native_math_test_recip(deviceQueue, T{-1, -1, -1, -1},
+                         T{0.25, 0.25, 0.25, 0.25}, 4);
+  native_math_test_2_powr(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
+                          T{2, 2, 2, 2}, 4);
+  native_math_test_2_divide(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
+                            T{2, 2, 2, 2}, 2);
+}
 
 int main() {
   queue deviceQueue;
 
-  NATIVE_TESTS_3(float3)
-  NATIVE_TESTS_3(marray<float COMMA 3>)
+  native_tests_3<float3>(deviceQueue);
+  native_tests_3<marray<float, 3>>(deviceQueue);
 
-  NATIVE_TESTS_4(float4)
-  NATIVE_TESTS_4(marray<float COMMA 4>)
+  native_tests_4<float4>(deviceQueue);
+  native_tests_4<marray<float, 4>>(deviceQueue);
 
   std::cout << "Pass" << std::endl;
   return 0;

From b589c329071b05c4f393f8767092d16573a9dacf Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Wed, 11 May 2022 14:33:38 +0100
Subject: [PATCH 03/20] format

Signed-off-by: jack.kirk <jack.kirk@codeplay.com>
---
 SYCL/Basic/half_builtins.cpp                 | 52 ++++++++++----------
 SYCL/DeviceLib/built-ins/ext_native_math.cpp | 32 ++++++------
 2 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/SYCL/Basic/half_builtins.cpp b/SYCL/Basic/half_builtins.cpp
index e4f47695e1..f26e58494d 100644
--- a/SYCL/Basic/half_builtins.cpp
+++ b/SYCL/Basic/half_builtins.cpp
@@ -167,33 +167,33 @@ template <int N> bool check(vec<float, N> a, vec<float, N> b) {
 int main() {
   queue q;
   if (q.get_device().has(sycl::aspect::fp16)) {
-  float16 a, b, c, d;
-  for (int i = 0; i < SZ_max; i++) {
-    a[i] = i / (float)SZ_max;
-    b[i] = (SZ_max - i) / (float)SZ_max;
-    c[i] = (float)(3 * i);
-  }
-  int err = 0;
-  {
-    buffer<float16> a_buf(&a, 1);
-    buffer<float16> b_buf(&b, 1);
-    buffer<float16> c_buf(&c, 1);
-    buffer<int> err_buf(&err, 1);
-    q.submit([&](handler &cgh) {
-      auto A = a_buf.get_access<access::mode::read>(cgh);
-      auto B = b_buf.get_access<access::mode::read>(cgh);
-      auto C = c_buf.get_access<access::mode::read>(cgh);
-      auto err = err_buf.get_access<access::mode::write>(cgh);
-      cgh.parallel_for(SZ_max, [=](item<1> index) {
-        size_t i = index.get_id(0);
-        TEST_BUILTIN_1(fabs);
-        TEST_BUILTIN_2(fmin);
-        TEST_BUILTIN_2(fmax);
-        TEST_BUILTIN_3(fma);
+    float16 a, b, c, d;
+    for (int i = 0; i < SZ_max; i++) {
+      a[i] = i / (float)SZ_max;
+      b[i] = (SZ_max - i) / (float)SZ_max;
+      c[i] = (float)(3 * i);
+    }
+    int err = 0;
+    {
+      buffer<float16> a_buf(&a, 1);
+      buffer<float16> b_buf(&b, 1);
+      buffer<float16> c_buf(&c, 1);
+      buffer<int> err_buf(&err, 1);
+      q.submit([&](handler &cgh) {
+        auto A = a_buf.get_access<access::mode::read>(cgh);
+        auto B = b_buf.get_access<access::mode::read>(cgh);
+        auto C = c_buf.get_access<access::mode::read>(cgh);
+        auto err = err_buf.get_access<access::mode::write>(cgh);
+        cgh.parallel_for(SZ_max, [=](item<1> index) {
+          size_t i = index.get_id(0);
+          TEST_BUILTIN_1(fabs);
+          TEST_BUILTIN_2(fmin);
+          TEST_BUILTIN_2(fmax);
+          TEST_BUILTIN_3(fma);
+        });
       });
-    });
+    }
+    assert(err == 0);
   }
-  assert(err == 0);
-}
   return 0;
 }
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math.cpp b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
index 913f51070e..5f2d7e36d1 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
@@ -91,19 +91,18 @@ int main() {
                                    {tu[0], tu[1]});
   native_tanh_tester<sycl::float3>(
       q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
-  
+
   native_tanh_tester<sycl::float4>(q, {tv[0], tv[1], tv[2], tv[3]},
                                    {tl[0], tl[1], tl[2], tl[3]},
                                    {tu[0], tu[1], tu[2], tu[3]});
-  native_tanh_tester<sycl::marray<float, 3>>(q, {tv[0], tv[1], tv[2]},
-                                           {tl[0], tl[1], tl[2]},
-                                           {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::marray<float, 3>>(
+      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
   native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                           {tl[0], tl[1], tl[2], tl[3]},
-                                           {tu[0], tu[1], tu[2], tu[3]});
+                                             {tl[0], tl[1], tl[2], tl[3]},
+                                             {tu[0], tu[1], tu[2], tu[3]});
   native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                   {tl[0], tl[1], tl[2], tl[3]},
-                                   {tu[0], tu[1], tu[2], tu[3]});
+                                             {tl[0], tl[1], tl[2], tl[3]},
+                                             {tu[0], tu[1], tu[2], tu[3]});
   native_tanh_tester<sycl::float8>(
       q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
       {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
@@ -129,9 +128,9 @@ int main() {
     native_tanh_tester<sycl::half4>(q, {tv[0], tv[1], tv[2], tv[3]},
                                     {tl[0], tl[1], tl[2], tl[3]},
                                     {tu[0], tu[1], tu[2], tu[3]});
-    native_tanh_tester<sycl::marray<sycl::half, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                    {tl[0], tl[1], tl[2], tl[3]},
-                                    {tu[0], tu[1], tu[2], tu[3]});
+    native_tanh_tester<sycl::marray<sycl::half, 4>>(
+        q, {tv[0], tv[1], tv[2], tv[3]}, {tl[0], tl[1], tl[2], tl[3]},
+        {tu[0], tu[1], tu[2], tu[3]});
     native_tanh_tester<sycl::half8>(
         q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
         {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
@@ -160,12 +159,11 @@ int main() {
     native_exp2_tester<sycl::half4>(q, {ev[0], ev[1], ev[2], ev[3]},
                                     {el[0], el[1], el[2], el[3]},
                                     {eu[0], eu[1], eu[2], eu[3]});
-    native_exp2_tester<sycl::marray<sycl::half, 3>>(q, {ev[0], ev[1], ev[2]},
-                                                {el[0], el[1], el[2]},
-                                                {eu[0], eu[1], eu[2]});
-    native_exp2_tester<sycl::marray<sycl::half, 4>>(q, {ev[0], ev[1], ev[2], ev[3]},
-                                                {el[0], el[1], el[2], el[3]},
-                                                {eu[0], eu[1], eu[2], eu[3]});
+    native_exp2_tester<sycl::marray<sycl::half, 3>>(
+        q, {ev[0], ev[1], ev[2]}, {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
+    native_exp2_tester<sycl::marray<sycl::half, 4>>(
+        q, {ev[0], ev[1], ev[2], ev[3]}, {el[0], el[1], el[2], el[3]},
+        {eu[0], eu[1], eu[2], eu[3]});
     native_exp2_tester<sycl::half8>(
         q, {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7]},
         {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7]},

From ab479c843b77016acf0863ecc70eb0c0f367f658 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 16 Jun 2022 09:04:32 +0100
Subject: [PATCH 04/20] split fp16 cases into new files.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/built-ins/ext_native_math.cpp  | 210 +++-----------
 .../built-ins/ext_native_math_common.hpp      |  67 +++++
 .../built-ins/ext_native_math_fp16.cpp        |  94 +++++++
 SYCL/DeviceLib/math_test_marray_vec.cpp       | 259 +-----------------
 .../DeviceLib/math_test_marray_vec_common.hpp | 250 +++++++++++++++++
 SYCL/DeviceLib/math_test_marray_vec_fp16.cpp  |  27 ++
 6 files changed, 481 insertions(+), 426 deletions(-)
 create mode 100644 SYCL/DeviceLib/built-ins/ext_native_math_common.hpp
 create mode 100644 SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
 create mode 100644 SYCL/DeviceLib/math_test_marray_vec_common.hpp
 create mode 100644 SYCL/DeviceLib/math_test_marray_vec_fp16.cpp

diff --git a/SYCL/DeviceLib/built-ins/ext_native_math.cpp b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
index 5f2d7e36d1..fa24addd0e 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
@@ -1,182 +1,52 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
-// OpenCL CPU driver does not support cl_khr_fp16 extension for this reason this
-// test is compiled with the -fsycl-device-code-split flag
+// tests oneapi extension native tanh math function for sycl::vec and
+// sycl::marray float cases.
 
-#include <CL/sycl.hpp>
-#include <cassert>
-
-template <typename T, size_t N>
-void assert_out_of_bound(sycl::marray<T, N> val, sycl::marray<T, N> lower,
-                         sycl::marray<T, N> upper) {
-  for (int i = 0; i < N; i++) {
-    assert(lower[i] < val[i] && val[i] < upper[i]);
-  }
-}
-
-template <typename T> void assert_out_of_bound(T val, T lower, T upper) {
-  assert(sycl::all(lower < val && val < upper));
-}
-
-template <>
-void assert_out_of_bound<float>(float val, float lower, float upper) {
-  assert(lower < val && val < upper);
-}
-
-template <>
-void assert_out_of_bound<sycl::half>(sycl::half val, sycl::half lower,
-                                     sycl::half upper) {
-  assert(lower < val && val < upper);
-}
-
-template <typename T>
-void native_tanh_tester(sycl::queue q, T val, T up, T lo) {
-  T r = val;
-
-#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
-  {
-    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
-    q.submit([&](sycl::handler &cgh) {
-      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
-      cgh.single_task([=]() {
-        AccR[0] = sycl::ext::oneapi::experimental::native::tanh(AccR[0]);
-      });
-    });
-  }
-
-  assert_out_of_bound(r, up, lo);
-#else
-  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
-#endif
-}
-
-template <typename T>
-void native_exp2_tester(sycl::queue q, T val, T up, T lo) {
-  T r = val;
-
-#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
-  {
-    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
-    q.submit([&](sycl::handler &cgh) {
-      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
-      cgh.single_task([=]() {
-        AccR[0] = sycl::ext::oneapi::experimental::native::exp2(AccR[0]);
-      });
-    });
-  }
-
-  assert_out_of_bound(r, up, lo);
-#else
-  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
-#endif
-}
+#include "ext_native_math_common.hpp"
 
 int main() {
 
-  sycl::queue q;
-
-  const double tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
-                         -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
-  const double tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
-                         -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
-  const double tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
-                         -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
-
-  native_tanh_tester<float>(q, tv[0], tl[0], tu[0]);
-  native_tanh_tester<sycl::float2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
-                                   {tu[0], tu[1]});
-  native_tanh_tester<sycl::float3>(
-      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
-
-  native_tanh_tester<sycl::float4>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                   {tl[0], tl[1], tl[2], tl[3]},
-                                   {tu[0], tu[1], tu[2], tu[3]});
-  native_tanh_tester<sycl::marray<float, 3>>(
-      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
-  native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                             {tl[0], tl[1], tl[2], tl[3]},
-                                             {tu[0], tu[1], tu[2], tu[3]});
-  native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                             {tl[0], tl[1], tl[2], tl[3]},
-                                             {tu[0], tu[1], tu[2], tu[3]});
-  native_tanh_tester<sycl::float8>(
-      q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
-      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
-      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
-  native_tanh_tester<sycl::float16>(
-      q,
-      {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
-       tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
-      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
-       tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
-      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
-       tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
-
-  if (q.get_device().has(sycl::aspect::fp16)) {
-
-    native_tanh_tester<sycl::half>(q, tv[0], tl[0], tu[0]);
-    native_tanh_tester<sycl::half2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
-                                    {tu[0], tu[1]});
-    native_tanh_tester<sycl::half3>(
-        q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
-    native_tanh_tester<sycl::marray<sycl::half, 3>>(
-        q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
-    native_tanh_tester<sycl::half4>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                    {tl[0], tl[1], tl[2], tl[3]},
-                                    {tu[0], tu[1], tu[2], tu[3]});
-    native_tanh_tester<sycl::marray<sycl::half, 4>>(
-        q, {tv[0], tv[1], tv[2], tv[3]}, {tl[0], tl[1], tl[2], tl[3]},
-        {tu[0], tu[1], tu[2], tu[3]});
-    native_tanh_tester<sycl::half8>(
-        q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
-        {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
-        {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
-    native_tanh_tester<sycl::half16>(
-        q,
-        {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
-         tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
-        {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
-         tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
-        {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
-         tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
-
-    const double ev[16] = {-2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0,
-                           -2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0};
-    const double el[16] = {0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9,
-                           0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9};
-    const double eu[16] = {0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1,
-                           0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1};
-
-    native_exp2_tester<sycl::half>(q, ev[0], el[0], eu[0]);
-    native_exp2_tester<sycl::half2>(q, {ev[0], ev[1]}, {el[0], el[1]},
-                                    {eu[0], eu[1]});
-    native_exp2_tester<sycl::half3>(
-        q, {ev[0], ev[1], ev[2]}, {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
-    native_exp2_tester<sycl::half4>(q, {ev[0], ev[1], ev[2], ev[3]},
-                                    {el[0], el[1], el[2], el[3]},
-                                    {eu[0], eu[1], eu[2], eu[3]});
-    native_exp2_tester<sycl::marray<sycl::half, 3>>(
-        q, {ev[0], ev[1], ev[2]}, {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
-    native_exp2_tester<sycl::marray<sycl::half, 4>>(
-        q, {ev[0], ev[1], ev[2], ev[3]}, {el[0], el[1], el[2], el[3]},
-        {eu[0], eu[1], eu[2], eu[3]});
-    native_exp2_tester<sycl::half8>(
-        q, {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7]},
-        {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7]},
-        {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7]});
-    native_exp2_tester<sycl::half16>(
-        q,
-        {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7], ev[8], ev[9],
-         ev[10], ev[11], ev[12], ev[13], ev[14], ev[15]},
-        {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7], el[8], el[9],
-         el[10], el[11], el[12], el[13], el[14], el[15]},
-        {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7], eu[8], eu[9],
-         eu[10], eu[11], eu[12], eu[13], eu[14], eu[15]});
-  }
+sycl::queue q;
+
+const float tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
+                       -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
+const float tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
+                       -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
+const float tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
+                       -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
+
+native_tanh_tester<float>(q, tv[0], tl[0], tu[0]);
+native_tanh_tester<sycl::float2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
+                                 {tu[0], tu[1]});
+native_tanh_tester<sycl::float3>(q, {tv[0], tv[1], tv[2]},
+                                 {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+
+native_tanh_tester<sycl::float4>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                 {tl[0], tl[1], tl[2], tl[3]},
+                                 {tu[0], tu[1], tu[2], tu[3]});
+native_tanh_tester<sycl::marray<float, 3>>(q, {tv[0], tv[1], tv[2]},
+                                           {tl[0], tl[1], tl[2]},
+                                           {tu[0], tu[1], tu[2]});
+native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                           {tl[0], tl[1], tl[2], tl[3]},
+                                           {tu[0], tu[1], tu[2], tu[3]});
+native_tanh_tester<sycl::float8>(
+    q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
+    {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
+    {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
+native_tanh_tester<sycl::float16>(
+    q,
+    {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
+     tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
+    {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
+     tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
+    {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
+     tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
 
   return 0;
 }
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math_common.hpp b/SYCL/DeviceLib/built-ins/ext_native_math_common.hpp
new file mode 100644
index 0000000000..1ac9203537
--- /dev/null
+++ b/SYCL/DeviceLib/built-ins/ext_native_math_common.hpp
@@ -0,0 +1,67 @@
+#include <sycl/sycl.hpp>
+#include <cassert>
+
+template <typename T, size_t N>
+void assert_out_of_bound(sycl::marray<T, N> val, sycl::marray<T, N> lower,
+                         sycl::marray<T, N> upper) {
+  for (int i = 0; i < N; i++) {
+    assert(lower[i] < val[i] && val[i] < upper[i]);
+  }
+}
+
+template <typename T> void assert_out_of_bound(T val, T lower, T upper) {
+  assert(sycl::all(lower < val && val < upper));
+}
+
+template <>
+void assert_out_of_bound<float>(float val, float lower, float upper) {
+  assert(lower < val && val < upper);
+}
+
+template <>
+void assert_out_of_bound<sycl::half>(sycl::half val, sycl::half lower,
+                                     sycl::half upper) {
+  assert(lower < val && val < upper);
+}
+
+template <typename T>
+void native_tanh_tester(sycl::queue q, T val, T up, T lo) {
+  T r = val;
+
+#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
+  {
+    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
+    q.submit([&](sycl::handler &cgh) {
+      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
+      cgh.single_task([=]() {
+        AccR[0] = sycl::ext::oneapi::experimental::native::tanh(AccR[0]);
+      });
+    });
+  }
+
+  assert_out_of_bound(r, up, lo);
+#else
+  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
+#endif
+}
+
+template <typename T>
+void native_exp2_tester(sycl::queue q, T val, T up, T lo) {
+  T r = val;
+
+#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
+  {
+    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
+    q.submit([&](sycl::handler &cgh) {
+      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
+      cgh.single_task([=]() {
+        AccR[0] = sycl::ext::oneapi::experimental::native::exp2(AccR[0]);
+      });
+    });
+  }
+
+  assert_out_of_bound(r, up, lo);
+#else
+  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
+#endif
+}
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
new file mode 100644
index 0000000000..de52fadd5a
--- /dev/null
+++ b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
@@ -0,0 +1,94 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// OpenCL CPU driver does not support cl_khr_fp16 extension for this reason this
+// test is compiled with the -fsycl-device-code-split flag
+
+// tests oneapi extension native math functions for sycl::vec and sycl::marray
+// fp16 cases.
+
+#include "ext_native_math_common.hpp"
+
+int main() {
+
+  if (!q.get_device().has(sycl::aspect::fp16)) {
+    std::cout << "skipping fp16 tests: requires fp16 device aspect."
+              << std::endl;
+    return 0;
+  }
+
+sycl::queue q;
+
+const half tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
+                       -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
+const half tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
+                       -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
+const half tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
+                       -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
+
+native_tanh_tester<sycl::half>(q, tv[0], tl[0], tu[0]);
+native_tanh_tester<sycl::half2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
+                                {tu[0], tu[1]});
+native_tanh_tester<sycl::half3>(q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]},
+                                {tu[0], tu[1], tu[2]});
+native_tanh_tester<sycl::marray<sycl::half, 3>>(q, {tv[0], tv[1], tv[2]},
+                                                {tl[0], tl[1], tl[2]},
+                                                {tu[0], tu[1], tu[2]});
+native_tanh_tester<sycl::half4>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                {tl[0], tl[1], tl[2], tl[3]},
+                                {tu[0], tu[1], tu[2], tu[3]});
+native_tanh_tester<sycl::marray<sycl::half, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                                {tl[0], tl[1], tl[2], tl[3]},
+                                                {tu[0], tu[1], tu[2], tu[3]});
+native_tanh_tester<sycl::half8>(
+    q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
+    {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
+    {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
+native_tanh_tester<sycl::half16>(
+    q,
+    {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
+     tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
+    {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
+     tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
+    {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
+     tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
+
+const half ev[16] = {-2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0,
+                       -2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0};
+const half el[16] = {0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9,
+                       0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9};
+const half eu[16] = {0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1,
+                       0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1};
+
+native_exp2_tester<sycl::half>(q, ev[0], el[0], eu[0]);
+native_exp2_tester<sycl::half2>(q, {ev[0], ev[1]}, {el[0], el[1]},
+                                {eu[0], eu[1]});
+native_exp2_tester<sycl::half3>(q, {ev[0], ev[1], ev[2]}, {el[0], el[1], el[2]},
+                                {eu[0], eu[1], eu[2]});
+native_exp2_tester<sycl::half4>(q, {ev[0], ev[1], ev[2], ev[3]},
+                                {el[0], el[1], el[2], el[3]},
+                                {eu[0], eu[1], eu[2], eu[3]});
+native_exp2_tester<sycl::marray<sycl::half, 3>>(q, {ev[0], ev[1], ev[2]},
+                                                {el[0], el[1], el[2]},
+                                                {eu[0], eu[1], eu[2]});
+native_exp2_tester<sycl::marray<sycl::half, 4>>(q, {ev[0], ev[1], ev[2], ev[3]},
+                                                {el[0], el[1], el[2], el[3]},
+                                                {eu[0], eu[1], eu[2], eu[3]});
+native_exp2_tester<sycl::half8>(
+    q, {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7]},
+    {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7]},
+    {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7]});
+native_exp2_tester<sycl::half16>(
+    q,
+    {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7], ev[8], ev[9],
+     ev[10], ev[11], ev[12], ev[13], ev[14], ev[15]},
+    {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7], el[8], el[9],
+     el[10], el[11], el[12], el[13], el[14], el[15]},
+    {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7], eu[8], eu[9],
+     eu[10], eu[11], eu[12], eu[13], eu[14], eu[15]});
+
+  return 0;
+}
diff --git a/SYCL/DeviceLib/math_test_marray_vec.cpp b/SYCL/DeviceLib/math_test_marray_vec.cpp
index 17aec48570..de054a2c1f 100644
--- a/SYCL/DeviceLib/math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/math_test_marray_vec.cpp
@@ -4,256 +4,10 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
-#include <sycl/sycl.hpp>
+// tests sycl floating point math functions for sycl::vec and sycl::marray float
+// and double cases.
 
-using namespace sycl;
-
-template <typename T1, typename T2> class TypeHelper;
-
-template <typename T> bool checkEqual(vec<T, 3> A, size_t B) {
-  T TB = B;
-  return A.x() == TB && A.y() == TB && A.z() == TB;
-}
-
-template <typename T> bool checkEqual(vec<T, 4> A, size_t B) {
-  T TB = B;
-  return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB;
-}
-
-template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
-  for (int i = 0; i < N; i++) {
-    if (A[i] != B) {
-      return false;
-    }
-  }
-  return true;
-}
-
-#define OPERATOR(NAME)                                                         \
-  template <typename T>                                                        \
-  void math_test_##NAME(queue &deviceQueue, T result, T input, size_t ref) {   \
-    {                                                                          \
-      buffer<T, 1> buffer1(&result, 1);                                        \
-      buffer<T, 1> buffer2(&input, 1);                                         \
-      deviceQueue.submit([&](handler &cgh) {                                   \
-        accessor<T, 1, access::mode::write, target::device> res_access(        \
-            buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input_access(      \
-            buffer2, cgh);                                                     \
-        cgh.single_task<TypeHelper<class a##NAME, T>>(                         \
-            [=]() { res_access[0] = NAME(input_access[0]); });                 \
-      });                                                                      \
-    }                                                                          \
-    assert(checkEqual(result, ref));                                           \
-  }
-
-OPERATOR(cos)
-OPERATOR(cospi)
-OPERATOR(sin)
-OPERATOR(sinpi)
-OPERATOR(cosh)
-OPERATOR(sinh)
-OPERATOR(tan)
-OPERATOR(tanpi)
-OPERATOR(atan)
-OPERATOR(atanpi)
-OPERATOR(tanh)
-OPERATOR(acos)
-OPERATOR(acospi)
-OPERATOR(asin)
-OPERATOR(asinpi)
-OPERATOR(acosh)
-OPERATOR(asinh)
-OPERATOR(atanh)
-OPERATOR(cbrt)
-OPERATOR(ceil)
-OPERATOR(exp)
-OPERATOR(exp2)
-OPERATOR(exp10)
-OPERATOR(expm1)
-OPERATOR(tgamma)
-OPERATOR(lgamma)
-OPERATOR(erf)
-OPERATOR(erfc)
-OPERATOR(log)
-OPERATOR(log2)
-OPERATOR(log10)
-OPERATOR(log1p)
-OPERATOR(logb)
-OPERATOR(sqrt)
-OPERATOR(rsqrt)
-OPERATOR(rint)
-OPERATOR(round)
-OPERATOR(trunc)
-
-#undef OPERATOR
-
-#define OPERATOR_2(NAME)                                                       \
-  template <typename T>                                                        \
-  void math_test_2_##NAME(queue &deviceQueue, T result, T input1, T input2,    \
-                          size_t ref) {                                        \
-    {                                                                          \
-      buffer<T, 1> buffer1(&result, 1);                                        \
-      buffer<T, 1> buffer2(&input1, 1);                                        \
-      buffer<T, 1> buffer3(&input2, 1);                                        \
-      deviceQueue.submit([&](handler &cgh) {                                   \
-        accessor<T, 1, access::mode::write, target::device> res_access(        \
-            buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input1_access(     \
-            buffer2, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input2_access(     \
-            buffer3, cgh);                                                     \
-        cgh.single_task<TypeHelper<class a##NAME, T>>([=]() {                  \
-          res_access[0] = NAME(input1_access[0], input2_access[0]);            \
-        });                                                                    \
-      });                                                                      \
-    }                                                                          \
-    assert(checkEqual(result, ref));                                           \
-  }
-
-OPERATOR_2(pow)
-OPERATOR_2(powr)
-OPERATOR_2(atan2)
-OPERATOR_2(atan2pi)
-OPERATOR_2(copysign)
-OPERATOR_2(fdim)
-OPERATOR_2(fmin)
-OPERATOR_2(fmax)
-OPERATOR_2(fmod)
-OPERATOR_2(hypot)
-OPERATOR_2(maxmag)
-OPERATOR_2(minmag)
-OPERATOR_2(nextafter)
-OPERATOR_2(remainder)
-
-#undef OPERATOR_2
-
-#define OPERATOR_3(NAME)                                                       \
-  template <typename T>                                                        \
-  void math_test_3_##NAME(queue &deviceQueue, T result, T input1, T input2,    \
-                          T input3, size_t ref) {                              \
-    {                                                                          \
-      buffer<T, 1> buffer1(&result, 1);                                        \
-      buffer<T, 1> buffer2(&input1, 1);                                        \
-      buffer<T, 1> buffer3(&input2, 1);                                        \
-      buffer<T, 1> buffer4(&input3, 1);                                        \
-      deviceQueue.submit([&](handler &cgh) {                                   \
-        accessor<T, 1, access::mode::write, target::device> res_access(        \
-            buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input1_access(     \
-            buffer2, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input2_access(     \
-            buffer3, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input3_access(     \
-            buffer4, cgh);                                                     \
-        cgh.single_task<TypeHelper<class a##NAME, T>>([=]() {                  \
-          res_access[0] =                                                      \
-              NAME(input1_access[0], input2_access[0], input3_access[0]);      \
-        });                                                                    \
-      });                                                                      \
-    }                                                                          \
-    assert(checkEqual(result, ref));                                           \
-  }
-
-OPERATOR_3(mad)
-OPERATOR_3(mix)
-OPERATOR_3(fma)
-
-#undef OPERATOR_3
-
-template <typename T> void math_tests_4(queue &deviceQueue) {
-  math_test_tanh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  math_test_cosh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
-  math_test_sinh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  math_test_acos(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
-  math_test_acospi(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
-  math_test_acosh(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
-  math_test_asin(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  math_test_asinpi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  math_test_asinh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  math_test_cbrt(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 1);
-  math_test_atan(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  math_test_atanpi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  math_test_atanh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  math_test_exp(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
-  math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 4);
-  math_test_exp10(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 100);
-  math_test_expm1(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  math_test_ceil(deviceQueue, T{-1, -1, -1, -1}, T{0.6, 0.6, 0.6, 0.6}, 1);
-  math_test_tgamma(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 1);
-  math_test_lgamma(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
-  math_test_erf(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  math_test_erfc(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
-  math_test_2_pow(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
-                  4);
-  math_test_2_powr(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
-                   4);
-  math_test_2_atan2(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
-                    T{2, 2, 2, 2}, 0);
-  math_test_2_atan2pi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
-                      T{2, 2, 2, 2}, 0);
-  math_test_2_copysign(deviceQueue, T{-1, -1, -1, -1}, T{-3, -3, -3, -3},
-                       T{2, 2, 2, 2}, 3);
-  math_test_2_fmin(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{3, 3, 3, 3},
-                   2);
-  math_test_2_fmax(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{3, 3, 3, 3},
-                   3);
-  math_test_2_hypot(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
-                    T{3, 3, 3, 3}, 5);
-  math_test_2_maxmag(deviceQueue, T{-1, -1, -1, -1}, T{-2, -2, -2, -2},
-                     T{3, 3, 3, 3}, 3);
-  math_test_2_minmag(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
-                     T{-3, -3, -3, -3}, 2);
-  math_test_2_remainder(deviceQueue, T{-1, -1, -1, -1}, T{5, 5, 5, 5},
-                        T{2, 2, 2, 2}, 1);
-  math_test_2_fdim(deviceQueue, T{-1, -1, -1, -1}, T{3, 3, 3, 3}, T{3, 3, 3, 3},
-                   0);
-  math_test_2_fmod(deviceQueue, T{-1, -1, -1, -1}, T{5, 5, 5, 5}, T{3, 3, 3, 3},
-                   2);
-  math_test_2_nextafter(deviceQueue, T{-1, -1, -1, -1}, T{-0, -0, -0, -0},
-                        T{+0, +0, +0, +0}, 0);
-  math_test_3_fma(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
-                  T{1, 1, 1, 1}, 5);
-  math_test_3_mad(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
-                  T{1, 1, 1, 1}, 5);
-  math_test_3_mix(deviceQueue, T{-1, -1, -1, -1}, T{3, 3, 3, 3}, T{5, 5, 5, 5},
-                  T{0.5, 0.5, 0.5, 0.5}, 4);
-}
-
-template <typename T> void math_tests_3(queue &deviceQueue) {
-  math_test_tan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_tanh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
-  math_test_sin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_cosh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
-  math_test_sinh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_acos(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
-  math_test_acosh(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
-  math_test_asin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_asinh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_cbrt(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 1);
-  math_test_atan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_atanh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
-  math_test_exp2(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 4);
-  math_test_exp10(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 100);
-  math_test_expm1(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_ceil(deviceQueue, T{-1, -1, -1}, T{0.6, 0.6, 0.6}, 1);
-  math_test_tgamma(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 1);
-  math_test_lgamma(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
-  math_test_erf(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_erfc(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
-  math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
-  math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
-  math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100}, 2);
-  math_test_log1p(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  math_test_logb(deviceQueue, T{-1, -1, -1}, T{1.1, 1.1, 1.1}, 0);
-  math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
-  math_test_rsqrt(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 2);
-  math_test_rint(deviceQueue, T{-1, -1, -1}, T{2.9, 2.9, 2.9}, 3);
-  math_test_round(deviceQueue, T{-1, -1, -1}, T{0.5, 0.5, 0.5}, 1);
-  math_test_trunc(deviceQueue, T{-1, -1, -1}, T{1.9, 1.9, 1.9}, 1);
-}
+#include "math_test_marray_vec_common.hpp"
 
 int main() {
   queue deviceQueue;
@@ -267,13 +21,6 @@ int main() {
   math_tests_3<marray<float, 3>>(deviceQueue);
   math_tests_3<marray<double, 3>>(deviceQueue);
 
-  if (deviceQueue.get_device().has(sycl::aspect::fp16)) {
-    math_tests_4<half4>(deviceQueue);
-    math_tests_4<marray<half, 4>>(deviceQueue);
-    math_tests_3<half3>(deviceQueue);
-    math_tests_3<marray<half, 3>>(deviceQueue);
-  }
-
   std::cout << "Pass" << std::endl;
   return 0;
 }
diff --git a/SYCL/DeviceLib/math_test_marray_vec_common.hpp b/SYCL/DeviceLib/math_test_marray_vec_common.hpp
new file mode 100644
index 0000000000..cc4b440448
--- /dev/null
+++ b/SYCL/DeviceLib/math_test_marray_vec_common.hpp
@@ -0,0 +1,250 @@
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <typename T1, typename T2> class TypeHelper;
+
+template <typename T> bool checkEqual(vec<T, 3> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB;
+}
+
+template <typename T> bool checkEqual(vec<T, 4> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB;
+}
+
+template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
+  for (int i = 0; i < N; i++) {
+    if (A[i] != B) {
+      return false;
+    }
+  }
+  return true;
+}
+
+#define OPERATOR(NAME)                                                         \
+  template <typename T>                                                        \
+  void math_test_##NAME(queue &deviceQueue, T result, T input, size_t ref) {   \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input, 1);                                         \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input_access(      \
+            buffer2, cgh);                                                     \
+        cgh.single_task<TypeHelper<class a##NAME, T>>(                         \
+            [=]() { res_access[0] = NAME(input_access[0]); });                 \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+OPERATOR(cos)
+OPERATOR(cospi)
+OPERATOR(sin)
+OPERATOR(sinpi)
+OPERATOR(cosh)
+OPERATOR(sinh)
+OPERATOR(tan)
+OPERATOR(tanpi)
+OPERATOR(atan)
+OPERATOR(atanpi)
+OPERATOR(tanh)
+OPERATOR(acos)
+OPERATOR(acospi)
+OPERATOR(asin)
+OPERATOR(asinpi)
+OPERATOR(acosh)
+OPERATOR(asinh)
+OPERATOR(atanh)
+OPERATOR(cbrt)
+OPERATOR(ceil)
+OPERATOR(exp)
+OPERATOR(exp2)
+OPERATOR(exp10)
+OPERATOR(expm1)
+OPERATOR(tgamma)
+OPERATOR(lgamma)
+OPERATOR(erf)
+OPERATOR(erfc)
+OPERATOR(log)
+OPERATOR(log2)
+OPERATOR(log10)
+OPERATOR(log1p)
+OPERATOR(logb)
+OPERATOR(sqrt)
+OPERATOR(rsqrt)
+OPERATOR(rint)
+OPERATOR(round)
+OPERATOR(trunc)
+
+#undef OPERATOR
+
+#define OPERATOR_2(NAME)                                                       \
+  template <typename T>                                                        \
+  void math_test_2_##NAME(queue &deviceQueue, T result, T input1, T input2,    \
+                          size_t ref) {                                        \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input1, 1);                                        \
+      buffer<T, 1> buffer3(&input2, 1);                                        \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+            buffer2, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+            buffer3, cgh);                                                     \
+        cgh.single_task<TypeHelper<class a##NAME, T>>([=]() {                  \
+          res_access[0] = NAME(input1_access[0], input2_access[0]);            \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+OPERATOR_2(pow)
+OPERATOR_2(powr)
+OPERATOR_2(atan2)
+OPERATOR_2(atan2pi)
+OPERATOR_2(copysign)
+OPERATOR_2(fdim)
+OPERATOR_2(fmin)
+OPERATOR_2(fmax)
+OPERATOR_2(fmod)
+OPERATOR_2(hypot)
+OPERATOR_2(maxmag)
+OPERATOR_2(minmag)
+OPERATOR_2(nextafter)
+OPERATOR_2(remainder)
+
+#undef OPERATOR_2
+
+#define OPERATOR_3(NAME)                                                       \
+  template <typename T>                                                        \
+  void math_test_3_##NAME(queue &deviceQueue, T result, T input1, T input2,    \
+                          T input3, size_t ref) {                              \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input1, 1);                                        \
+      buffer<T, 1> buffer3(&input2, 1);                                        \
+      buffer<T, 1> buffer4(&input3, 1);                                        \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+            buffer2, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+            buffer3, cgh);                                                     \
+        accessor<T, 1, access::mode::write, target::device> input3_access(     \
+            buffer4, cgh);                                                     \
+        cgh.single_task<TypeHelper<class a##NAME, T>>([=]() {                  \
+          res_access[0] =                                                      \
+              NAME(input1_access[0], input2_access[0], input3_access[0]);      \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+OPERATOR_3(mad)
+OPERATOR_3(mix)
+OPERATOR_3(fma)
+
+#undef OPERATOR_3
+
+template <typename T> void math_tests_4(queue &deviceQueue) {
+  math_test_tanh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_cosh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  math_test_sinh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_acos(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_acospi(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_acosh(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_asin(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_asinpi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_asinh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_cbrt(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 1);
+  math_test_atan(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_atanpi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_atanh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_exp(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 4);
+  math_test_exp10(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 100);
+  math_test_expm1(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_ceil(deviceQueue, T{-1, -1, -1, -1}, T{0.6, 0.6, 0.6, 0.6}, 1);
+  math_test_tgamma(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 1);
+  math_test_lgamma(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_erf(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_erfc(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  math_test_2_pow(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                  4);
+  math_test_2_powr(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                   4);
+  math_test_2_atan2(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                    T{2, 2, 2, 2}, 0);
+  math_test_2_atan2pi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                      T{2, 2, 2, 2}, 0);
+  math_test_2_copysign(deviceQueue, T{-1, -1, -1, -1}, T{-3, -3, -3, -3},
+                       T{2, 2, 2, 2}, 3);
+  math_test_2_fmin(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{3, 3, 3, 3},
+                   2);
+  math_test_2_fmax(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{3, 3, 3, 3},
+                   3);
+  math_test_2_hypot(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
+                    T{3, 3, 3, 3}, 5);
+  math_test_2_maxmag(deviceQueue, T{-1, -1, -1, -1}, T{-2, -2, -2, -2},
+                     T{3, 3, 3, 3}, 3);
+  math_test_2_minmag(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
+                     T{-3, -3, -3, -3}, 2);
+  math_test_2_remainder(deviceQueue, T{-1, -1, -1, -1}, T{5, 5, 5, 5},
+                        T{2, 2, 2, 2}, 1);
+  math_test_2_fdim(deviceQueue, T{-1, -1, -1, -1}, T{3, 3, 3, 3}, T{3, 3, 3, 3},
+                   0);
+  math_test_2_fmod(deviceQueue, T{-1, -1, -1, -1}, T{5, 5, 5, 5}, T{3, 3, 3, 3},
+                   2);
+  math_test_2_nextafter(deviceQueue, T{-1, -1, -1, -1}, T{-0, -0, -0, -0},
+                        T{+0, +0, +0, +0}, 0);
+  math_test_3_fma(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                  T{1, 1, 1, 1}, 5);
+  math_test_3_mad(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                  T{1, 1, 1, 1}, 5);
+  math_test_3_mix(deviceQueue, T{-1, -1, -1, -1}, T{3, 3, 3, 3}, T{5, 5, 5, 5},
+                  T{0.5, 0.5, 0.5, 0.5}, 4);
+}
+
+template <typename T> void math_tests_3(queue &deviceQueue) {
+  math_test_tan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_tanh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_sin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_cosh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_sinh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_acos(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_acosh(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_asin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_asinh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_cbrt(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 1);
+  math_test_atan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_atanh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_exp2(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 4);
+  math_test_exp10(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 100);
+  math_test_expm1(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_ceil(deviceQueue, T{-1, -1, -1}, T{0.6, 0.6, 0.6}, 1);
+  math_test_tgamma(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 1);
+  math_test_lgamma(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_erf(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_erfc(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100}, 2);
+  math_test_log1p(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_logb(deviceQueue, T{-1, -1, -1}, T{1.1, 1.1, 1.1}, 0);
+  math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  math_test_rsqrt(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 2);
+  math_test_rint(deviceQueue, T{-1, -1, -1}, T{2.9, 2.9, 2.9}, 3);
+  math_test_round(deviceQueue, T{-1, -1, -1}, T{0.5, 0.5, 0.5}, 1);
+  math_test_trunc(deviceQueue, T{-1, -1, -1}, T{1.9, 1.9, 1.9}, 1);
+}
diff --git a/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp b/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp
new file mode 100644
index 0000000000..71740ad335
--- /dev/null
+++ b/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp
@@ -0,0 +1,27 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// tests sycl floating point math functions for sycl::vec and sycl::marray fp16
+// cases.
+
+#include "math_test_marray_vec_common.hpp"
+
+int main() {
+  queue deviceQueue;
+
+  if (!deviceQueue.get_device().has(sycl::aspect::fp16)) {
+    std::cout << "skipping fp16 tests: requires fp16 device aspect."
+              << std::endl;
+    return 0;
+  }
+  math_tests_4<half4>(deviceQueue);
+  math_tests_4<marray<half, 4>>(deviceQueue);
+  math_tests_3<half3>(deviceQueue);
+  math_tests_3<marray<half, 3>>(deviceQueue);
+
+  std::cout << "Pass" << std::endl;
+  return 0;
+}

From 9e2394e99912cf102978cc77b039efd3324e8127 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 16 Jun 2022 09:52:43 +0100
Subject: [PATCH 05/20] fixed queue constructor mistake.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../DeviceLib/built-ins/ext_native_math_fp16.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
index de52fadd5a..f3b0b7623b 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
@@ -14,19 +14,19 @@
 
 int main() {
 
+sycl::queue q;
+
   if (!q.get_device().has(sycl::aspect::fp16)) {
     std::cout << "skipping fp16 tests: requires fp16 device aspect."
               << std::endl;
     return 0;
   }
 
-sycl::queue q;
-
-const half tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
+const sycl::half tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
                        -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
-const half tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
+const sycl::half tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
                        -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
-const half tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
+const sycl::half tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
                        -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
 
 native_tanh_tester<sycl::half>(q, tv[0], tl[0], tu[0]);
@@ -56,11 +56,11 @@ native_tanh_tester<sycl::half16>(
     {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
      tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
 
-const half ev[16] = {-2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0,
+const sycl::half ev[16] = {-2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0,
                        -2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0};
-const half el[16] = {0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9,
+const sycl::half el[16] = {0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9,
                        0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9};
-const half eu[16] = {0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1,
+const sycl::half eu[16] = {0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1,
                        0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1};
 
 native_exp2_tester<sycl::half>(q, ev[0], el[0], eu[0]);

From 80e69940245e252f810d2cedd14c4d788ee86b81 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 16 Jun 2022 10:16:10 +0100
Subject: [PATCH 06/20] format

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/built-ins/ext_native_math.cpp  |  71 +++++-----
 .../built-ins/ext_native_math_common.hpp      |   2 +-
 .../built-ins/ext_native_math_fp16.cpp        | 130 +++++++++---------
 3 files changed, 101 insertions(+), 102 deletions(-)

diff --git a/SYCL/DeviceLib/built-ins/ext_native_math.cpp b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
index fa24addd0e..d5952fd8cf 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
@@ -11,42 +11,41 @@
 
 int main() {
 
-sycl::queue q;
-
-const float tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
-                       -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
-const float tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
-                       -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
-const float tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
-                       -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
-
-native_tanh_tester<float>(q, tv[0], tl[0], tu[0]);
-native_tanh_tester<sycl::float2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
-                                 {tu[0], tu[1]});
-native_tanh_tester<sycl::float3>(q, {tv[0], tv[1], tv[2]},
-                                 {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
-
-native_tanh_tester<sycl::float4>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                 {tl[0], tl[1], tl[2], tl[3]},
-                                 {tu[0], tu[1], tu[2], tu[3]});
-native_tanh_tester<sycl::marray<float, 3>>(q, {tv[0], tv[1], tv[2]},
-                                           {tl[0], tl[1], tl[2]},
-                                           {tu[0], tu[1], tu[2]});
-native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                           {tl[0], tl[1], tl[2], tl[3]},
-                                           {tu[0], tu[1], tu[2], tu[3]});
-native_tanh_tester<sycl::float8>(
-    q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
-    {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
-    {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
-native_tanh_tester<sycl::float16>(
-    q,
-    {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
-     tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
-    {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
-     tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
-    {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
-     tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
+  sycl::queue q;
+
+  const float tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
+                        -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
+  const float tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
+                        -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
+  const float tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
+                        -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
+
+  native_tanh_tester<float>(q, tv[0], tl[0], tu[0]);
+  native_tanh_tester<sycl::float2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
+                                   {tu[0], tu[1]});
+  native_tanh_tester<sycl::float3>(
+      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+
+  native_tanh_tester<sycl::float4>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                   {tl[0], tl[1], tl[2], tl[3]},
+                                   {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::marray<float, 3>>(
+      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                             {tl[0], tl[1], tl[2], tl[3]},
+                                             {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::float8>(
+      q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
+  native_tanh_tester<sycl::float16>(
+      q,
+      {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
+       tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
+       tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
+       tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
 
   return 0;
 }
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math_common.hpp b/SYCL/DeviceLib/built-ins/ext_native_math_common.hpp
index 1ac9203537..7511f03206 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math_common.hpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math_common.hpp
@@ -1,5 +1,5 @@
-#include <sycl/sycl.hpp>
 #include <cassert>
+#include <sycl/sycl.hpp>
 
 template <typename T, size_t N>
 void assert_out_of_bound(sycl::marray<T, N> val, sycl::marray<T, N> lower,
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
index f3b0b7623b..a590b900d8 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
@@ -14,7 +14,7 @@
 
 int main() {
 
-sycl::queue q;
+  sycl::queue q;
 
   if (!q.get_device().has(sycl::aspect::fp16)) {
     std::cout << "skipping fp16 tests: requires fp16 device aspect."
@@ -22,73 +22,73 @@ sycl::queue q;
     return 0;
   }
 
-const sycl::half tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
-                       -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
-const sycl::half tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
-                       -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
-const sycl::half tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
-                       -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
+  const sycl::half tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
+                             -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
+  const sycl::half tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95,  0.89,
+                             0.75,  -0.1,  -0.94, 0.92, -0.84, 0.82,
+                             -1.0,  0.98,  -1.10, 0.98};
+  const sycl::half tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91,
+                             0.77,  0.1,   -0.92, 0.94, -0.82, 0.84,
+                             -0.98, 1.00,  -0.98, 1.10};
 
-native_tanh_tester<sycl::half>(q, tv[0], tl[0], tu[0]);
-native_tanh_tester<sycl::half2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
-                                {tu[0], tu[1]});
-native_tanh_tester<sycl::half3>(q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]},
-                                {tu[0], tu[1], tu[2]});
-native_tanh_tester<sycl::marray<sycl::half, 3>>(q, {tv[0], tv[1], tv[2]},
-                                                {tl[0], tl[1], tl[2]},
-                                                {tu[0], tu[1], tu[2]});
-native_tanh_tester<sycl::half4>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                {tl[0], tl[1], tl[2], tl[3]},
-                                {tu[0], tu[1], tu[2], tu[3]});
-native_tanh_tester<sycl::marray<sycl::half, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                                {tl[0], tl[1], tl[2], tl[3]},
-                                                {tu[0], tu[1], tu[2], tu[3]});
-native_tanh_tester<sycl::half8>(
-    q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
-    {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
-    {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
-native_tanh_tester<sycl::half16>(
-    q,
-    {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
-     tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
-    {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
-     tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
-    {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
-     tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
+  native_tanh_tester<sycl::half>(q, tv[0], tl[0], tu[0]);
+  native_tanh_tester<sycl::half2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
+                                  {tu[0], tu[1]});
+  native_tanh_tester<sycl::half3>(q, {tv[0], tv[1], tv[2]},
+                                  {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::marray<sycl::half, 3>>(
+      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::half4>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                  {tl[0], tl[1], tl[2], tl[3]},
+                                  {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::marray<sycl::half, 4>>(
+      q, {tv[0], tv[1], tv[2], tv[3]}, {tl[0], tl[1], tl[2], tl[3]},
+      {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::half8>(
+      q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
+  native_tanh_tester<sycl::half16>(
+      q,
+      {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
+       tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
+       tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
+       tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
 
-const sycl::half ev[16] = {-2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0,
-                       -2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0};
-const sycl::half el[16] = {0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9,
-                       0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9};
-const sycl::half eu[16] = {0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1,
-                       0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1};
+  const sycl::half ev[16] = {-2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0,
+                             -2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0};
+  const sycl::half el[16] = {0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9,
+                             0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9};
+  const sycl::half eu[16] = {0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1,
+                             0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1};
 
-native_exp2_tester<sycl::half>(q, ev[0], el[0], eu[0]);
-native_exp2_tester<sycl::half2>(q, {ev[0], ev[1]}, {el[0], el[1]},
-                                {eu[0], eu[1]});
-native_exp2_tester<sycl::half3>(q, {ev[0], ev[1], ev[2]}, {el[0], el[1], el[2]},
-                                {eu[0], eu[1], eu[2]});
-native_exp2_tester<sycl::half4>(q, {ev[0], ev[1], ev[2], ev[3]},
-                                {el[0], el[1], el[2], el[3]},
-                                {eu[0], eu[1], eu[2], eu[3]});
-native_exp2_tester<sycl::marray<sycl::half, 3>>(q, {ev[0], ev[1], ev[2]},
-                                                {el[0], el[1], el[2]},
-                                                {eu[0], eu[1], eu[2]});
-native_exp2_tester<sycl::marray<sycl::half, 4>>(q, {ev[0], ev[1], ev[2], ev[3]},
-                                                {el[0], el[1], el[2], el[3]},
-                                                {eu[0], eu[1], eu[2], eu[3]});
-native_exp2_tester<sycl::half8>(
-    q, {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7]},
-    {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7]},
-    {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7]});
-native_exp2_tester<sycl::half16>(
-    q,
-    {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7], ev[8], ev[9],
-     ev[10], ev[11], ev[12], ev[13], ev[14], ev[15]},
-    {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7], el[8], el[9],
-     el[10], el[11], el[12], el[13], el[14], el[15]},
-    {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7], eu[8], eu[9],
-     eu[10], eu[11], eu[12], eu[13], eu[14], eu[15]});
+  native_exp2_tester<sycl::half>(q, ev[0], el[0], eu[0]);
+  native_exp2_tester<sycl::half2>(q, {ev[0], ev[1]}, {el[0], el[1]},
+                                  {eu[0], eu[1]});
+  native_exp2_tester<sycl::half3>(q, {ev[0], ev[1], ev[2]},
+                                  {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
+  native_exp2_tester<sycl::half4>(q, {ev[0], ev[1], ev[2], ev[3]},
+                                  {el[0], el[1], el[2], el[3]},
+                                  {eu[0], eu[1], eu[2], eu[3]});
+  native_exp2_tester<sycl::marray<sycl::half, 3>>(
+      q, {ev[0], ev[1], ev[2]}, {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
+  native_exp2_tester<sycl::marray<sycl::half, 4>>(
+      q, {ev[0], ev[1], ev[2], ev[3]}, {el[0], el[1], el[2], el[3]},
+      {eu[0], eu[1], eu[2], eu[3]});
+  native_exp2_tester<sycl::half8>(
+      q, {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7]},
+      {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7]},
+      {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7]});
+  native_exp2_tester<sycl::half16>(
+      q,
+      {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7], ev[8], ev[9],
+       ev[10], ev[11], ev[12], ev[13], ev[14], ev[15]},
+      {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7], el[8], el[9],
+       el[10], el[11], el[12], el[13], el[14], el[15]},
+      {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7], eu[8], eu[9],
+       eu[10], eu[11], eu[12], eu[13], eu[14], eu[15]});
 
   return 0;
 }

From 1f0edf8cb176f671138c1ece9c5b2b8956737fd8 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 16 Jun 2022 11:01:14 +0100
Subject: [PATCH 07/20] use fp16 aspect in half_builtins.cpp

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/Basic/half_builtins.cpp | 62 ++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 28 deletions(-)

diff --git a/SYCL/Basic/half_builtins.cpp b/SYCL/Basic/half_builtins.cpp
index f26e58494d..e2e06f8e40 100644
--- a/SYCL/Basic/half_builtins.cpp
+++ b/SYCL/Basic/half_builtins.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -166,34 +166,40 @@ template <int N> bool check(vec<float, N> a, vec<float, N> b) {
 
 int main() {
   queue q;
-  if (q.get_device().has(sycl::aspect::fp16)) {
-    float16 a, b, c, d;
-    for (int i = 0; i < SZ_max; i++) {
-      a[i] = i / (float)SZ_max;
-      b[i] = (SZ_max - i) / (float)SZ_max;
-      c[i] = (float)(3 * i);
-    }
-    int err = 0;
-    {
-      buffer<float16> a_buf(&a, 1);
-      buffer<float16> b_buf(&b, 1);
-      buffer<float16> c_buf(&c, 1);
-      buffer<int> err_buf(&err, 1);
-      q.submit([&](handler &cgh) {
-        auto A = a_buf.get_access<access::mode::read>(cgh);
-        auto B = b_buf.get_access<access::mode::read>(cgh);
-        auto C = c_buf.get_access<access::mode::read>(cgh);
-        auto err = err_buf.get_access<access::mode::write>(cgh);
-        cgh.parallel_for(SZ_max, [=](item<1> index) {
-          size_t i = index.get_id(0);
-          TEST_BUILTIN_1(fabs);
-          TEST_BUILTIN_2(fmin);
-          TEST_BUILTIN_2(fmax);
-          TEST_BUILTIN_3(fma);
-        });
+
+  if (!q.get_device().has(sycl::aspect::fp16)) {
+    std::cout << "skipping fp16 tests: requires fp16 device aspect."
+              << std::endl;
+    return 0;
+  }
+
+  float16 a, b, c, d;
+  for (int i = 0; i < SZ_max; i++) {
+    a[i] = i / (float)SZ_max;
+    b[i] = (SZ_max - i) / (float)SZ_max;
+    c[i] = (float)(3 * i);
+  }
+  int err = 0;
+  {
+    buffer<float16> a_buf(&a, 1);
+    buffer<float16> b_buf(&b, 1);
+    buffer<float16> c_buf(&c, 1);
+    buffer<int> err_buf(&err, 1);
+    q.submit([&](handler &cgh) {
+      auto A = a_buf.get_access<access::mode::read>(cgh);
+      auto B = b_buf.get_access<access::mode::read>(cgh);
+      auto C = c_buf.get_access<access::mode::read>(cgh);
+      auto err = err_buf.get_access<access::mode::write>(cgh);
+      cgh.parallel_for(SZ_max, [=](item<1> index) {
+        size_t i = index.get_id(0);
+        TEST_BUILTIN_1(fabs);
+        TEST_BUILTIN_2(fmin);
+        TEST_BUILTIN_2(fmax);
+        TEST_BUILTIN_3(fma);
       });
-    }
-    assert(err == 0);
+    });
   }
+  assert(err == 0);
+
   return 0;
 }

From 499c642b9a85340749df98c34bf54dc7d7e26ef8 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 16 Jun 2022 11:09:30 +0100
Subject: [PATCH 08/20] removed unnecessary
 -fsycl-device-code-split=per_kernel.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/Basic/half_builtins.cpp                      | 2 +-
 SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp | 2 +-
 SYCL/DeviceLib/math_test_marray_vec_fp16.cpp      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/SYCL/Basic/half_builtins.cpp b/SYCL/Basic/half_builtins.cpp
index e2e06f8e40..6b015121d3 100644
--- a/SYCL/Basic/half_builtins.cpp
+++ b/SYCL/Basic/half_builtins.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
index a590b900d8..6a27bff4ad 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
diff --git a/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp b/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp
index 71740ad335..da9df54d80 100644
--- a/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp
+++ b/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out

From 3d039698c9ea2ed9417408984674f21a779deb7c Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 30 Jun 2022 10:27:32 +0100
Subject: [PATCH 09/20] write -> read (superficial change in context of test).

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../half_precision_math_test_marray_vec.cpp          |  6 +++---
 SYCL/DeviceLib/math_test_marray_vec_common.hpp       | 12 ++++++------
 SYCL/DeviceLib/native_math_test_marray_vec.cpp       |  6 +++---
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
index b2f3d152fe..28936bc7d2 100644
--- a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
@@ -39,7 +39,7 @@ template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
       deviceQueue.submit([&](handler &cgh) {                                   \
         accessor<T, 1, access::mode::write, target::device> res_access(        \
             buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input_access(      \
+        accessor<T, 1, access::mode::read, target::device> input_access(       \
             buffer2, cgh);                                                     \
         cgh.single_task<TypeHelper<class half_precision##NAME, T>>([=]() {     \
           res_access[0] = sycl::half_precision::NAME(input_access[0]);         \
@@ -75,9 +75,9 @@ HALF_PRECISION_OPERATOR(recip)
       deviceQueue.submit([&](handler &cgh) {                                   \
         accessor<T, 1, access::mode::write, target::device> res_access(        \
             buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+        accessor<T, 1, access::mode::read, target::device> input1_access(      \
             buffer2, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+        accessor<T, 1, access::mode::read, target::device> input2_access(      \
             buffer3, cgh);                                                     \
         cgh.single_task<TypeHelper<class half_precision2##NAME, T>>([=]() {    \
           res_access[0] =                                                      \
diff --git a/SYCL/DeviceLib/math_test_marray_vec_common.hpp b/SYCL/DeviceLib/math_test_marray_vec_common.hpp
index cc4b440448..1c10543f68 100644
--- a/SYCL/DeviceLib/math_test_marray_vec_common.hpp
+++ b/SYCL/DeviceLib/math_test_marray_vec_common.hpp
@@ -32,7 +32,7 @@ template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
       deviceQueue.submit([&](handler &cgh) {                                   \
         accessor<T, 1, access::mode::write, target::device> res_access(        \
             buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input_access(      \
+        accessor<T, 1, access::mode::read, target::device> input_access(       \
             buffer2, cgh);                                                     \
         cgh.single_task<TypeHelper<class a##NAME, T>>(                         \
             [=]() { res_access[0] = NAME(input_access[0]); });                 \
@@ -93,9 +93,9 @@ OPERATOR(trunc)
       deviceQueue.submit([&](handler &cgh) {                                   \
         accessor<T, 1, access::mode::write, target::device> res_access(        \
             buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+        accessor<T, 1, access::mode::read, target::device> input1_access(      \
             buffer2, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+        accessor<T, 1, access::mode::read, target::device> input2_access(      \
             buffer3, cgh);                                                     \
         cgh.single_task<TypeHelper<class a##NAME, T>>([=]() {                  \
           res_access[0] = NAME(input1_access[0], input2_access[0]);            \
@@ -134,11 +134,11 @@ OPERATOR_2(remainder)
       deviceQueue.submit([&](handler &cgh) {                                   \
         accessor<T, 1, access::mode::write, target::device> res_access(        \
             buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+        accessor<T, 1, access::mode::read, target::device> input1_access(      \
             buffer2, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+        accessor<T, 1, access::mode::read, target::device> input2_access(      \
             buffer3, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input3_access(     \
+        accessor<T, 1, access::mode::read, target::device> input3_access(      \
             buffer4, cgh);                                                     \
         cgh.single_task<TypeHelper<class a##NAME, T>>([=]() {                  \
           res_access[0] =                                                      \
diff --git a/SYCL/DeviceLib/native_math_test_marray_vec.cpp b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
index f138ced366..22f9515eca 100644
--- a/SYCL/DeviceLib/native_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
@@ -39,7 +39,7 @@ template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
       deviceQueue.submit([&](handler &cgh) {                                   \
         accessor<T, 1, access::mode::write, target::device> res_access(        \
             buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input_access(      \
+        accessor<T, 1, access::mode::read, target::device> input_access(       \
             buffer2, cgh);                                                     \
         cgh.single_task<TypeHelper<class native##NAME, T>>(                    \
             [=]() { res_access[0] = sycl::native::NAME(input_access[0]); });   \
@@ -74,9 +74,9 @@ NATIVE_OPERATOR(recip)
       deviceQueue.submit([&](handler &cgh) {                                   \
         accessor<T, 1, access::mode::write, target::device> res_access(        \
             buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input1_access(     \
+        accessor<T, 1, access::mode::read, target::device> input1_access(      \
             buffer2, cgh);                                                     \
-        accessor<T, 1, access::mode::write, target::device> input2_access(     \
+        accessor<T, 1, access::mode::read, target::device> input2_access(      \
             buffer3, cgh);                                                     \
         cgh.single_task<TypeHelper<class native2##NAME, T>>([=]() {            \
           res_access[0] =                                                      \

From 538c64b75ac30cf2f902011619098f60fbcb8e46 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 30 Jun 2022 10:30:02 +0100
Subject: [PATCH 10/20] Removed float3 powr tests.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp | 2 --
 SYCL/DeviceLib/native_math_test_marray_vec.cpp         | 2 --
 2 files changed, 4 deletions(-)

diff --git a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
index 28936bc7d2..ad72883320 100644
--- a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
@@ -109,8 +109,6 @@ template <typename T> void half_precision_math_tests_3(queue &deviceQueue) {
                                  T{0.25, 0.25, 0.25}, 2);
   half_precision_math_test_recip(deviceQueue, T{-1, -1, -1},
                                  T{0.25, 0.25, 0.25}, 4);
-  half_precision_math_test_2_powr(deviceQueue, T{-1, -1, -1}, T{2, 2, 2},
-                                  T{2, 2, 2}, 4);
   half_precision_math_test_2_divide(deviceQueue, T{-1, -1, -1}, T{4, 4, 4},
                                     T{2, 2, 2}, 2);
 }
diff --git a/SYCL/DeviceLib/native_math_test_marray_vec.cpp b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
index 22f9515eca..8943c99c1d 100644
--- a/SYCL/DeviceLib/native_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
@@ -104,8 +104,6 @@ template <typename T> void native_tests_3(queue &deviceQueue) {
   native_math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
   native_math_test_rsqrt(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 2);
   native_math_test_recip(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 4);
-  native_math_test_2_powr(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, T{2, 2, 2},
-                          4);
   native_math_test_2_divide(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, T{2, 2, 2},
                             2);
 }

From e77ba6971d6247e7338a53cd2cc13bb1d1fb204d Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 1 Jul 2022 18:24:20 +0100
Subject: [PATCH 11/20] Removed failing test coverage for existing float3
 functions.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp | 1 -
 SYCL/DeviceLib/native_math_test_marray_vec.cpp         | 1 -
 2 files changed, 2 deletions(-)

diff --git a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
index ad72883320..6ec4ad5b1b 100644
--- a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
@@ -99,7 +99,6 @@ template <typename T> void half_precision_math_tests_3(queue &deviceQueue) {
   half_precision_math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
   half_precision_math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
   half_precision_math_test_exp2(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 4);
-  half_precision_math_test_exp10(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 100);
   half_precision_math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
   half_precision_math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
   half_precision_math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100},
diff --git a/SYCL/DeviceLib/native_math_test_marray_vec.cpp b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
index 8943c99c1d..984126f909 100644
--- a/SYCL/DeviceLib/native_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
@@ -97,7 +97,6 @@ template <typename T> void native_tests_3(queue &deviceQueue) {
   native_math_test_tan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
   native_math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
   native_math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
-  native_math_test_exp2(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 4);
   native_math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
   native_math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
   native_math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100}, 2);

From 4f2dfe53ca1e965399e744e8c2df2dfd30b75912 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Tue, 6 Sep 2022 10:54:19 +0100
Subject: [PATCH 12/20] Remove broken half/native cases.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp | 1 -
 SYCL/DeviceLib/native_math_test_marray_vec.cpp         | 1 -
 2 files changed, 2 deletions(-)

diff --git a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
index 6ec4ad5b1b..65358b8a8e 100644
--- a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
@@ -54,7 +54,6 @@ HALF_PRECISION_OPERATOR(tan)
 HALF_PRECISION_OPERATOR(cos)
 HALF_PRECISION_OPERATOR(exp)
 HALF_PRECISION_OPERATOR(exp2)
-HALF_PRECISION_OPERATOR(exp10)
 HALF_PRECISION_OPERATOR(log)
 HALF_PRECISION_OPERATOR(log2)
 HALF_PRECISION_OPERATOR(log10)
diff --git a/SYCL/DeviceLib/native_math_test_marray_vec.cpp b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
index 984126f909..72a6cf4fff 100644
--- a/SYCL/DeviceLib/native_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
@@ -56,7 +56,6 @@ NATIVE_OPERATOR(exp2)
 NATIVE_OPERATOR(exp10)
 NATIVE_OPERATOR(log)
 NATIVE_OPERATOR(log2)
-NATIVE_OPERATOR(log10)
 NATIVE_OPERATOR(sqrt)
 NATIVE_OPERATOR(rsqrt)
 NATIVE_OPERATOR(recip)

From c1972f09ab7e80fce4f19cb5c31231543393d428 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Wed, 7 Sep 2022 09:07:08 +0100
Subject: [PATCH 13/20] Removed unused cases.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp | 2 --
 SYCL/DeviceLib/native_math_test_marray_vec.cpp         | 3 ---
 2 files changed, 5 deletions(-)

diff --git a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
index 65358b8a8e..c53b70e95f 100644
--- a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
@@ -122,8 +122,6 @@ template <typename T> void half_precision_math_tests_4(queue &deviceQueue) {
                                1);
   half_precision_math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
                                 4);
-  half_precision_math_test_exp10(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
-                                 100);
   half_precision_math_test_log(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1},
                                0);
   half_precision_math_test_log2(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
diff --git a/SYCL/DeviceLib/native_math_test_marray_vec.cpp b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
index 72a6cf4fff..75b24197f0 100644
--- a/SYCL/DeviceLib/native_math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
@@ -98,7 +98,6 @@ template <typename T> void native_tests_3(queue &deviceQueue) {
   native_math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
   native_math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
   native_math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
-  native_math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100}, 2);
   native_math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
   native_math_test_rsqrt(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 2);
   native_math_test_recip(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 4);
@@ -114,8 +113,6 @@ template <typename T> void native_tests_4(queue &deviceQueue) {
   native_math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 4);
   native_math_test_log(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
   native_math_test_log2(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4}, 2);
-  native_math_test_log10(deviceQueue, T{-1, -1, -1, -1}, T{100, 100, 100, 100},
-                         2);
   native_math_test_sqrt(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4}, 2);
   native_math_test_rsqrt(deviceQueue, T{-1, -1, -1, -1},
                          T{0.25, 0.25, 0.25, 0.25}, 2);

From 57f33290af9c35f81f3e029400be71b92b26e3ec Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Mon, 12 Sep 2022 15:28:04 +0100
Subject: [PATCH 14/20] Add marray -fast-math test cases.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/built-ins/fast-math-flag.cpp | 118 +++++++++++++-------
 1 file changed, 76 insertions(+), 42 deletions(-)

diff --git a/SYCL/DeviceLib/built-ins/fast-math-flag.cpp b/SYCL/DeviceLib/built-ins/fast-math-flag.cpp
index e6b7d08238..d90845e58d 100644
--- a/SYCL/DeviceLib/built-ins/fast-math-flag.cpp
+++ b/SYCL/DeviceLib/built-ins/fast-math-flag.cpp
@@ -6,47 +6,64 @@
 #include <cassert>
 #include <sycl/sycl.hpp>
 
-#define __TEST_FFMATH_BINARY(func)                                             \
-  int test_ffmath_##func() {                                                   \
-    sycl::float4 r[2];                                                         \
-    sycl::float4 val[2] = {{1.0004f, 1e-4f, 1.4f, 14.0f},                      \
-                           {1.0004f, 1e-4f, 1.4f, 14.0f}};                     \
+using namespace sycl;
+
+template <typename T> bool checkEqual(vec<T, 4> A, vec<T, 4> B) {
+
+  return sycl::all(A == B);
+}
+
+template <typename T, size_t N>
+bool checkEqual(marray<T, N> A, marray<T, N> B) {
+  for (int i = 0; i < N; i++) {
+    if (A[i] != B[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+#define __TEST_FFMATH_UNARY(func)                                              \
+  template <typename T> void test_ffmath_##func(queue &deviceQueue) {          \
+    T input{1.0004f, 1e-4f, 1.4f, 14.0f};                                      \
+    T res[2] = {{-1, -1, -1, -1}, {-2, -2, -2, -2}};                           \
     {                                                                          \
-      sycl::buffer<sycl::float4, 1> output(&r[0], sycl::range<1>(2));          \
-      sycl::buffer<sycl::float4, 1> input(&val[0], sycl::range<1>(2));         \
-      sycl::queue q;                                                           \
-      q.submit([&](sycl::handler &cgh) {                                       \
-        auto AccO =                                                            \
-            output.template get_access<sycl::access::mode::write>(cgh);        \
-        auto AccI = input.template get_access<sycl::access::mode::read>(cgh);  \
+      buffer<T, 1> input_buff(&input, 1);                                      \
+      buffer<T, 1> res_buff(&res[0], sycl::range<1>(2));                       \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_acc(res_buff,  \
+                                                                    cgh);      \
+        accessor<T, 1, access::mode::read, target::device> input_acc(          \
+            input_buff, cgh);                                                  \
         cgh.single_task([=]() {                                                \
-          AccO[0] = sycl::func(AccI[0], AccI[1]);                              \
-          AccO[1] = sycl::native::func(AccI[0], AccI[1]);                      \
+          res_acc[0] = sycl::native::func(input_acc[0]);                       \
+          res_acc[1] = sycl::func(input_acc[0]);                               \
         });                                                                    \
       });                                                                      \
     }                                                                          \
-    return sycl::all(r[0] == r[1]);                                            \
+    assert(checkEqual(res[0], res[1]));                                        \
   }
 
-#define __TEST_FFMATH_UNARY(func)                                              \
-  int test_ffmath_##func() {                                                   \
-    sycl::float4 val = {1.0004f, 1e-4f, 1.4f, 14.0f};                          \
-    sycl::float4 r[2];                                                         \
+#define __TEST_FFMATH_BINARY(func)                                             \
+  template <typename T> void test_ffmath_##func(queue &deviceQueue) {          \
+    T input[2] = {{1.0004f, 1e-4f, 1.4f, 14.0f},                               \
+                  {1.0004f, 1e-4f, 1.4f, 14.0f}};                              \
+    T res[2] = {{-1, -1, -1, -1}, {-2, -2, -2, -2}};                           \
     {                                                                          \
-      sycl::buffer<sycl::float4, 1> output(&r[0], sycl::range<1>(2));          \
-      sycl::buffer<sycl::float4, 1> input(&val, sycl::range<1>(1));            \
-      sycl::queue q;                                                           \
-      q.submit([&](sycl::handler &cgh) {                                       \
-        auto AccO =                                                            \
-            output.template get_access<sycl::access::mode::write>(cgh);        \
-        auto AccI = input.template get_access<sycl::access::mode::read>(cgh);  \
+      buffer<T, 1> input_buff(&input[0], range<1>(2));                         \
+      buffer<T, 1> res_buff(&res[0], range<1>(2));                             \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_acc(res_buff,  \
+                                                                    cgh);      \
+        accessor<T, 1, access::mode::read, target::device> input_acc(          \
+            input_buff, cgh);                                                  \
         cgh.single_task([=]() {                                                \
-          AccO[0] = sycl::func(AccI[0]);                                       \
-          AccO[1] = sycl::native::func(AccI[0]);                               \
+          res_acc[0] = sycl::native::func(input_acc[0], input_acc[1]);         \
+          res_acc[1] = sycl::func(input_acc[0], input_acc[1]);                 \
         });                                                                    \
       });                                                                      \
     }                                                                          \
-    return sycl::all(r[0] == r[1]);                                            \
+    assert(checkEqual(res[0], res[1]));                                        \
   }
 
 __TEST_FFMATH_UNARY(cos)
@@ -56,26 +73,43 @@ __TEST_FFMATH_UNARY(exp10)
 __TEST_FFMATH_UNARY(log)
 __TEST_FFMATH_UNARY(log2)
 __TEST_FFMATH_UNARY(log10)
-__TEST_FFMATH_BINARY(powr)
 __TEST_FFMATH_UNARY(rsqrt)
 __TEST_FFMATH_UNARY(sin)
 __TEST_FFMATH_UNARY(sqrt)
 __TEST_FFMATH_UNARY(tan)
 
+__TEST_FFMATH_BINARY(powr)
+
 int main() {
 
-  assert(test_ffmath_cos());
-  assert(test_ffmath_exp());
-  assert(test_ffmath_exp2());
-  assert(test_ffmath_exp10());
-  assert(test_ffmath_log());
-  assert(test_ffmath_log2());
-  assert(test_ffmath_log10());
-  assert(test_ffmath_powr());
-  assert(test_ffmath_rsqrt());
-  assert(test_ffmath_sin());
-  assert(test_ffmath_sqrt());
-  assert(test_ffmath_tan());
+  queue q;
+  test_ffmath_cos<marray<float, 4>>(q);
+  test_ffmath_exp<marray<float, 4>>(q);
+  test_ffmath_exp2<marray<float, 4>>(q);
+  test_ffmath_exp10<marray<float, 4>>(q);
+  test_ffmath_log<marray<float, 4>>(q);
+  test_ffmath_log2<marray<float, 4>>(q);
+  test_ffmath_log10<marray<float, 4>>(q);
+  test_ffmath_powr<marray<float, 4>>(q);
+  test_ffmath_rsqrt<marray<float, 4>>(q);
+  test_ffmath_sin<marray<float, 4>>(q);
+  test_ffmath_sqrt<marray<float, 4>>(q);
+  test_ffmath_tan<marray<float, 4>>(q);
+  test_ffmath_powr<marray<float, 4>>(q);
+
+  test_ffmath_cos<float4>(q);
+  test_ffmath_exp<float4>(q);
+  test_ffmath_exp2<float4>(q);
+  test_ffmath_exp10<float4>(q);
+  test_ffmath_log<float4>(q);
+  test_ffmath_log2<float4>(q);
+  test_ffmath_log10<float4>(q);
+  test_ffmath_powr<float4>(q);
+  test_ffmath_rsqrt<float4>(q);
+  test_ffmath_sin<float4>(q);
+  test_ffmath_sqrt<float4>(q);
+  test_ffmath_tan<float4>(q);
+  test_ffmath_powr<float4>(q);
 
   return 0;
 }

From a9efc862b6cdac11a52f09f1835fd01a8938832f Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Tue, 13 Sep 2022 14:45:53 +0100
Subject: [PATCH 15/20] Removed initially proposed native and half_prec tests.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../half_precision_math_test_marray_vec.cpp   | 153 ------------------
 .../DeviceLib/native_math_test_marray_vec.cpp | 138 ----------------
 2 files changed, 291 deletions(-)
 delete mode 100644 SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
 delete mode 100644 SYCL/DeviceLib/native_math_test_marray_vec.cpp

diff --git a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp b/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
deleted file mode 100644
index c53b70e95f..0000000000
--- a/SYCL/DeviceLib/half_precision_math_test_marray_vec.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.out
-
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-template <typename T1, typename T2> class TypeHelper;
-
-template <typename T> bool checkEqual(vec<T, 3> A, size_t B) {
-  T TB = B;
-  return A.x() == TB && A.y() == TB && A.z() == TB;
-}
-
-template <typename T> bool checkEqual(vec<T, 4> A, size_t B) {
-  T TB = B;
-  return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB;
-}
-
-template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
-  for (int i = 0; i < N; i++) {
-    if (A[i] != B) {
-      return false;
-    }
-  }
-  return true;
-}
-
-#define HALF_PRECISION_OPERATOR(NAME)                                          \
-  template <typename T>                                                        \
-  void half_precision_math_test_##NAME(queue &deviceQueue, T result, T input,  \
-                                       size_t ref) {                           \
-    {                                                                          \
-      buffer<T, 1> buffer1(&result, 1);                                        \
-      buffer<T, 1> buffer2(&input, 1);                                         \
-      deviceQueue.submit([&](handler &cgh) {                                   \
-        accessor<T, 1, access::mode::write, target::device> res_access(        \
-            buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::read, target::device> input_access(       \
-            buffer2, cgh);                                                     \
-        cgh.single_task<TypeHelper<class half_precision##NAME, T>>([=]() {     \
-          res_access[0] = sycl::half_precision::NAME(input_access[0]);         \
-        });                                                                    \
-      });                                                                      \
-    }                                                                          \
-    assert(checkEqual(result, ref));                                           \
-  }
-
-HALF_PRECISION_OPERATOR(sin)
-HALF_PRECISION_OPERATOR(tan)
-HALF_PRECISION_OPERATOR(cos)
-HALF_PRECISION_OPERATOR(exp)
-HALF_PRECISION_OPERATOR(exp2)
-HALF_PRECISION_OPERATOR(log)
-HALF_PRECISION_OPERATOR(log2)
-HALF_PRECISION_OPERATOR(log10)
-HALF_PRECISION_OPERATOR(sqrt)
-HALF_PRECISION_OPERATOR(rsqrt)
-HALF_PRECISION_OPERATOR(recip)
-
-#undef HALF_PRECISION_OPERATOR
-
-#define HALF_PRECISION_OPERATOR_2(NAME)                                        \
-  template <typename T>                                                        \
-  void half_precision_math_test_2_##NAME(queue &deviceQueue, T result,         \
-                                         T input1, T input2, size_t ref) {     \
-    {                                                                          \
-      buffer<T, 1> buffer1(&result, 1);                                        \
-      buffer<T, 1> buffer2(&input1, 1);                                        \
-      buffer<T, 1> buffer3(&input2, 1);                                        \
-      deviceQueue.submit([&](handler &cgh) {                                   \
-        accessor<T, 1, access::mode::write, target::device> res_access(        \
-            buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::read, target::device> input1_access(      \
-            buffer2, cgh);                                                     \
-        accessor<T, 1, access::mode::read, target::device> input2_access(      \
-            buffer3, cgh);                                                     \
-        cgh.single_task<TypeHelper<class half_precision2##NAME, T>>([=]() {    \
-          res_access[0] =                                                      \
-              sycl::half_precision::NAME(input1_access[0], input2_access[0]);  \
-        });                                                                    \
-      });                                                                      \
-    }                                                                          \
-    assert(checkEqual(result, ref));                                           \
-  }
-
-HALF_PRECISION_OPERATOR_2(divide)
-HALF_PRECISION_OPERATOR_2(powr)
-
-#undef HALF_PRECISION_OPERATOR_2
-
-template <typename T> void half_precision_math_tests_3(queue &deviceQueue) {
-  half_precision_math_test_sin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  half_precision_math_test_tan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  half_precision_math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
-  half_precision_math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
-  half_precision_math_test_exp2(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 4);
-  half_precision_math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
-  half_precision_math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
-  half_precision_math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100},
-                                 2);
-  half_precision_math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
-  half_precision_math_test_rsqrt(deviceQueue, T{-1, -1, -1},
-                                 T{0.25, 0.25, 0.25}, 2);
-  half_precision_math_test_recip(deviceQueue, T{-1, -1, -1},
-                                 T{0.25, 0.25, 0.25}, 4);
-  half_precision_math_test_2_divide(deviceQueue, T{-1, -1, -1}, T{4, 4, 4},
-                                    T{2, 2, 2}, 2);
-}
-
-template <typename T> void half_precision_math_tests_4(queue &deviceQueue) {
-  half_precision_math_test_sin(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
-                               0);
-  half_precision_math_test_tan(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
-                               0);
-  half_precision_math_test_cos(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
-                               1);
-  half_precision_math_test_exp(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
-                               1);
-  half_precision_math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
-                                4);
-  half_precision_math_test_log(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1},
-                               0);
-  half_precision_math_test_log2(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
-                                2);
-  half_precision_math_test_log10(deviceQueue, T{-1, -1, -1, -1},
-                                 T{100, 100, 100, 100}, 2);
-  half_precision_math_test_sqrt(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
-                                2);
-  half_precision_math_test_rsqrt(deviceQueue, T{-1, -1, -1, -1},
-                                 T{0.25, 0.25, 0.25, 0.25}, 2);
-  half_precision_math_test_recip(deviceQueue, T{-1, -1, -1, -1},
-                                 T{0.25, 0.25, 0.25, 0.25}, 4);
-  half_precision_math_test_2_powr(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
-                                  T{2, 2, 2, 2}, 4);
-  half_precision_math_test_2_divide(deviceQueue, T{-1, -1, -1, -1},
-                                    T{4, 4, 4, 4}, T{2, 2, 2, 2}, 2);
-}
-
-int main() {
-  queue deviceQueue;
-
-  half_precision_math_tests_3<float3>(deviceQueue);
-  half_precision_math_tests_3<marray<float, 3>>(deviceQueue);
-
-  half_precision_math_tests_4<float4>(deviceQueue);
-  half_precision_math_tests_4<marray<float, 4>>(deviceQueue);
-  std::cout << "Pass" << std::endl;
-  return 0;
-}
diff --git a/SYCL/DeviceLib/native_math_test_marray_vec.cpp b/SYCL/DeviceLib/native_math_test_marray_vec.cpp
deleted file mode 100644
index 75b24197f0..0000000000
--- a/SYCL/DeviceLib/native_math_test_marray_vec.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.out
-
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-template <typename T1, typename T2> class TypeHelper;
-
-template <typename T> bool checkEqual(vec<T, 3> A, size_t B) {
-  T TB = B;
-  return A.x() == TB && A.y() == TB && A.z() == TB;
-}
-
-template <typename T> bool checkEqual(vec<T, 4> A, size_t B) {
-  T TB = B;
-  return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB;
-}
-
-template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
-  for (int i = 0; i < N; i++) {
-    if (A[i] != B) {
-      return false;
-    }
-  }
-  return true;
-}
-
-#define NATIVE_OPERATOR(NAME)                                                  \
-  template <typename T>                                                        \
-  void native_math_test_##NAME(queue &deviceQueue, T result, T input,          \
-                               size_t ref) {                                   \
-    {                                                                          \
-      buffer<T, 1> buffer1(&result, 1);                                        \
-      buffer<T, 1> buffer2(&input, 1);                                         \
-      deviceQueue.submit([&](handler &cgh) {                                   \
-        accessor<T, 1, access::mode::write, target::device> res_access(        \
-            buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::read, target::device> input_access(       \
-            buffer2, cgh);                                                     \
-        cgh.single_task<TypeHelper<class native##NAME, T>>(                    \
-            [=]() { res_access[0] = sycl::native::NAME(input_access[0]); });   \
-      });                                                                      \
-    }                                                                          \
-    assert(checkEqual(result, ref));                                           \
-  }
-
-NATIVE_OPERATOR(sin)
-NATIVE_OPERATOR(tan)
-NATIVE_OPERATOR(cos)
-NATIVE_OPERATOR(exp)
-NATIVE_OPERATOR(exp2)
-NATIVE_OPERATOR(exp10)
-NATIVE_OPERATOR(log)
-NATIVE_OPERATOR(log2)
-NATIVE_OPERATOR(sqrt)
-NATIVE_OPERATOR(rsqrt)
-NATIVE_OPERATOR(recip)
-
-#undef NATIVE_OPERATOR
-
-#define NATIVE_OPERATOR_2(NAME)                                                \
-  template <typename T>                                                        \
-  void native_math_test_2_##NAME(queue &deviceQueue, T result, T input1,       \
-                                 T input2, size_t ref) {                       \
-    {                                                                          \
-      buffer<T, 1> buffer1(&result, 1);                                        \
-      buffer<T, 1> buffer2(&input1, 1);                                        \
-      buffer<T, 1> buffer3(&input2, 1);                                        \
-      deviceQueue.submit([&](handler &cgh) {                                   \
-        accessor<T, 1, access::mode::write, target::device> res_access(        \
-            buffer1, cgh);                                                     \
-        accessor<T, 1, access::mode::read, target::device> input1_access(      \
-            buffer2, cgh);                                                     \
-        accessor<T, 1, access::mode::read, target::device> input2_access(      \
-            buffer3, cgh);                                                     \
-        cgh.single_task<TypeHelper<class native2##NAME, T>>([=]() {            \
-          res_access[0] =                                                      \
-              sycl::native::NAME(input1_access[0], input2_access[0]);          \
-        });                                                                    \
-      });                                                                      \
-    }                                                                          \
-    assert(checkEqual(result, ref));                                           \
-  }
-
-NATIVE_OPERATOR_2(divide)
-NATIVE_OPERATOR_2(powr)
-
-#undef NATIVE_OPERATOR_2
-
-template <typename T> void native_tests_3(queue &deviceQueue) {
-  native_math_test_sin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  native_math_test_tan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
-  native_math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
-  native_math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
-  native_math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
-  native_math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
-  native_math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
-  native_math_test_rsqrt(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 2);
-  native_math_test_recip(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 4);
-  native_math_test_2_divide(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, T{2, 2, 2},
-                            2);
-}
-
-template <typename T> void native_tests_4(queue &deviceQueue) {
-  native_math_test_sin(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  native_math_test_tan(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
-  native_math_test_cos(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
-  native_math_test_exp(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
-  native_math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 4);
-  native_math_test_log(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
-  native_math_test_log2(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4}, 2);
-  native_math_test_sqrt(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4}, 2);
-  native_math_test_rsqrt(deviceQueue, T{-1, -1, -1, -1},
-                         T{0.25, 0.25, 0.25, 0.25}, 2);
-  native_math_test_recip(deviceQueue, T{-1, -1, -1, -1},
-                         T{0.25, 0.25, 0.25, 0.25}, 4);
-  native_math_test_2_powr(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
-                          T{2, 2, 2, 2}, 4);
-  native_math_test_2_divide(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
-                            T{2, 2, 2, 2}, 2);
-}
-
-int main() {
-  queue deviceQueue;
-
-  native_tests_3<float3>(deviceQueue);
-  native_tests_3<marray<float, 3>>(deviceQueue);
-
-  native_tests_4<float4>(deviceQueue);
-  native_tests_4<marray<float, 4>>(deviceQueue);
-
-  std::cout << "Pass" << std::endl;
-  return 0;
-}

From f731c935345790082917b5bf76c913d2970cf559 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Tue, 13 Sep 2022 15:00:55 +0100
Subject: [PATCH 16/20] Added back device-code-split.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/built-ins/ext_native_math.cpp      | 4 ++--
 SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/SYCL/DeviceLib/built-ins/ext_native_math.cpp b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
index 889c1baed8..4ab4dcf4d1 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
@@ -3,7 +3,7 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
-// tests oneapi extension native tanh math function for sycl::vec and
+// Tests oneapi extension native tanh math function for sycl::vec and
 // sycl::marray float cases.
 
 #include "ext_native_math_common.hpp"
@@ -47,4 +47,4 @@ int main() {
        tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
index 6a27bff4ad..9b89fb347b 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -7,7 +7,7 @@
 // OpenCL CPU driver does not support cl_khr_fp16 extension for this reason this
 // test is compiled with the -fsycl-device-code-split flag
 
-// tests oneapi extension native math functions for sycl::vec and sycl::marray
+// Tests oneapi extension native math functions for sycl::vec and sycl::marray
 // fp16 cases.
 
 #include "ext_native_math_common.hpp"

From 505064cd1aecd0771fcc58daffedb961255a7997 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Wed, 14 Sep 2022 09:19:55 +0100
Subject: [PATCH 17/20] Remove host_runs.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp | 1 -
 SYCL/DeviceLib/math_test_marray_vec.cpp           | 1 -
 SYCL/DeviceLib/math_test_marray_vec_fp16.cpp      | 1 -
 3 files changed, 3 deletions(-)

diff --git a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
index 9b89fb347b..74bd0f8213 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
@@ -1,5 +1,4 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
diff --git a/SYCL/DeviceLib/math_test_marray_vec.cpp b/SYCL/DeviceLib/math_test_marray_vec.cpp
index de054a2c1f..721db1a559 100644
--- a/SYCL/DeviceLib/math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/math_test_marray_vec.cpp
@@ -1,6 +1,5 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
diff --git a/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp b/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp
index da9df54d80..7db7350ece 100644
--- a/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp
+++ b/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp
@@ -1,6 +1,5 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 

From cfc1e91c3c93ed3330fa1543de8ab2cac8eb55de Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Thu, 22 Sep 2022 11:44:47 +0100
Subject: [PATCH 18/20] windows && level_zero marked unsupported.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/math_test_marray_vec.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/SYCL/DeviceLib/math_test_marray_vec.cpp b/SYCL/DeviceLib/math_test_marray_vec.cpp
index 721db1a559..83c69cd947 100644
--- a/SYCL/DeviceLib/math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/math_test_marray_vec.cpp
@@ -1,3 +1,5 @@
+// TODO fix level_zero on windows failure
+// UNSUPPORTED: (windows && level_zero)
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out

From ea6693d2bf74e2aa450603ffe4df89c6d9541b84 Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Thu, 22 Sep 2022 15:00:54 +0100
Subject: [PATCH 19/20] Mark opencl && windows unsupported.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/math_test_marray_vec.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SYCL/DeviceLib/math_test_marray_vec.cpp b/SYCL/DeviceLib/math_test_marray_vec.cpp
index 83c69cd947..cae8852791 100644
--- a/SYCL/DeviceLib/math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/math_test_marray_vec.cpp
@@ -1,5 +1,5 @@
-// TODO fix level_zero on windows failure
-// UNSUPPORTED: (windows && level_zero)
+// TODO fix windows failures
+// UNSUPPORTED: windows && (level_zero || opencl)
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out

From 512a37ab748843fb0673133ee150f08e3c6fb4c0 Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Fri, 2 Dec 2022 14:43:07 +0000
Subject: [PATCH 20/20] Added fp64 aspect check.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/DeviceLib/math_test_marray_vec.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/SYCL/DeviceLib/math_test_marray_vec.cpp b/SYCL/DeviceLib/math_test_marray_vec.cpp
index cae8852791..18dfc1ae00 100644
--- a/SYCL/DeviceLib/math_test_marray_vec.cpp
+++ b/SYCL/DeviceLib/math_test_marray_vec.cpp
@@ -13,14 +13,18 @@
 int main() {
   queue deviceQueue;
   math_tests_4<float4>(deviceQueue);
-  math_tests_4<double4>(deviceQueue);
   math_tests_4<marray<float, 4>>(deviceQueue);
-  math_tests_4<marray<double, 4>>(deviceQueue);
 
   math_tests_3<float3>(deviceQueue);
-  math_tests_3<double3>(deviceQueue);
   math_tests_3<marray<float, 3>>(deviceQueue);
-  math_tests_3<marray<double, 3>>(deviceQueue);
+
+  if (deviceQueue.get_device().has(sycl::aspect::fp64)) {
+    math_tests_4<double4>(deviceQueue);
+    math_tests_4<marray<double, 4>>(deviceQueue);
+
+    math_tests_3<double3>(deviceQueue);
+    math_tests_3<marray<double, 3>>(deviceQueue);
+  }
 
   std::cout << "Pass" << std::endl;
   return 0;