From 025cf7ecb4373e248841227d3724ad4f94062b8e Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Tue, 25 Jan 2022 14:06:25 +0000
Subject: [PATCH 01/18] Added bfloat16 support for cuda backend. Added bfloat16
 in oneapi experimental namespace.

Signed-off-by: jack.kirk <jack.kirk@codeplay.com>
---
 .../sycl/ext/oneapi/experimental/bfloat16.hpp | 161 ++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
new file mode 100644
index 0000000000000..329094634d9ad
--- /dev/null
+++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
@@ -0,0 +1,161 @@
+//==--------- bfloat16.hpp ------- SYCL bfloat16 conversion ----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <CL/__spirv/spirv_ops.hpp>
+#include <CL/sycl/half_type.hpp>
+
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+namespace ext {
+namespace oneapi {
+namespace experimental {
+
+class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
+  using storage_t = uint16_t;
+  storage_t value;
+
+public:
+  bfloat16() = default;
+  bfloat16(const bfloat16 &) = default;
+  ~bfloat16() = default;
+
+  // Explicit conversion functions
+  static storage_t from_float(const float &a) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+    return __nvvm_f2bf16_rn(a);
+#else
+    return __spirv_ConvertFToBF16INTEL(a);
+#endif
+#else
+    throw exception{errc::feature_not_supported,
+                    "Bfloat16 conversion is not supported on host device"};
+#endif
+  }
+  static float to_float(const storage_t &a) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+    unsigned int y = a;
+    y = y << 16;
+    float *res = reinterpret_cast<float *>(&y);
+    return *res;
+#else
+    return __spirv_ConvertBF16ToFINTEL(a);
+#endif
+#else
+    throw exception{errc::feature_not_supported,
+                    "Bfloat16 conversion is not supported on host device"};
+#endif
+  }
+
+  static bfloat16 from_bits(const storage_t &a) {
+    bfloat16 res;
+    res.value = a;
+    return res;
+  }
+
+  // Implicit conversion from float to bfloat16
+  bfloat16(const float &a) { value = from_float(a); }
+
+  bfloat16 &operator=(const float &rhs) {
+    value = from_float(rhs);
+    return *this;
+  }
+
+  // Implicit conversion from bfloat16 to float
+  operator float() const { return to_float(value); }
+  operator sycl::half() const { return to_float(value); }
+
+  // Get raw bits representation of bfloat16
+  storage_t raw() const { return value; }
+
+  // Logical operators (!,||,&&) are covered if we can cast to bool
+  explicit operator bool() { return to_float(value) != 0.0f; }
+
+  // Unary minus operator overloading
+  friend bfloat16 operator-(bfloat16 &lhs) {
+    return bfloat16{-to_float(lhs.value)};
+  }
+
+// Increment and decrement operators overloading
+#define OP(op)                                                                 \
+  friend bfloat16 &operator op(bfloat16 &lhs) {                                \
+    float f = to_float(lhs.value);                                             \
+    lhs.value = from_float(op f);                                              \
+    return lhs;                                                                \
+  }                                                                            \
+  friend bfloat16 operator op(bfloat16 &lhs, int) {                            \
+    bfloat16 old = lhs;                                                        \
+    operator op(lhs);                                                          \
+    return old;                                                                \
+  }
+  OP(++)
+  OP(--)
+#undef OP
+
+  // Assignment operators overloading
+#define OP(op)                                                                 \
+  friend bfloat16 &operator op(bfloat16 &lhs, const bfloat16 &rhs) {           \
+    float f = static_cast<float>(lhs);                                         \
+    f op static_cast<float>(rhs);                                              \
+    return lhs = f;                                                            \
+  }                                                                            \
+  template <typename T>                                                        \
+  friend bfloat16 &operator op(bfloat16 &lhs, const T &rhs) {                  \
+    float f = static_cast<float>(lhs);                                         \
+    f op static_cast<float>(rhs);                                              \
+    return lhs = f;                                                            \
+  }                                                                            \
+  template <typename T> friend T &operator op(T &lhs, const bfloat16 &rhs) {   \
+    float f = static_cast<float>(lhs);                                         \
+    f op static_cast<float>(rhs);                                              \
+    return lhs = f;                                                            \
+  }
+  OP(+=)
+  OP(-=)
+  OP(*=)
+  OP(/=)
+#undef OP
+
+// Binary operators overloading
+#define OP(type, op)                                                           \
+  friend type operator op(const bfloat16 &lhs, const bfloat16 &rhs) {          \
+    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
+  }                                                                            \
+  template <typename T>                                                        \
+  friend type operator op(const bfloat16 &lhs, const T &rhs) {                 \
+    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
+  }                                                                            \
+  template <typename T>                                                        \
+  friend type operator op(const T &lhs, const bfloat16 &rhs) {                 \
+    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
+  }
+  OP(bfloat16, +)
+  OP(bfloat16, -)
+  OP(bfloat16, *)
+  OP(bfloat16, /)
+  OP(bool, ==)
+  OP(bool, !=)
+  OP(bool, <)
+  OP(bool, >)
+  OP(bool, <=)
+  OP(bool, >=)
+#undef OP
+
+  // Bitwise(|,&,~,^), modulo(%) and shift(<<,>>) operations are not supported
+  // for floating-point types.
+};
+
+} // namespace experimental
+} // namespace intel
+} // namespace ext
+
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)

From 66b4e3344bc7a9e514d857d4931ba26ed192b3f9 Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Tue, 25 Jan 2022 14:13:58 +0000
Subject: [PATCH 02/18] deleted intel namespace bfloat16.

---
 .../sycl/ext/intel/experimental/bfloat16.hpp  | 150 ------------------
 1 file changed, 150 deletions(-)
 delete mode 100644 sycl/include/sycl/ext/intel/experimental/bfloat16.hpp

diff --git a/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp b/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp
deleted file mode 100644
index 5a51f3746e225..0000000000000
--- a/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-//==--------- bfloat16.hpp ------- SYCL bfloat16 conversion ----------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <CL/__spirv/spirv_ops.hpp>
-#include <CL/sycl/half_type.hpp>
-
-__SYCL_INLINE_NAMESPACE(cl) {
-namespace sycl {
-namespace ext {
-namespace intel {
-namespace experimental {
-
-class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
-  using storage_t = uint16_t;
-  storage_t value;
-
-public:
-  bfloat16() = default;
-  bfloat16(const bfloat16 &) = default;
-  ~bfloat16() = default;
-
-  // Explicit conversion functions
-  static storage_t from_float(const float &a) {
-#if defined(__SYCL_DEVICE_ONLY__)
-    return __spirv_ConvertFToBF16INTEL(a);
-#else
-    throw exception{errc::feature_not_supported,
-                    "Bfloat16 conversion is not supported on host device"};
-#endif
-  }
-  static float to_float(const storage_t &a) {
-#if defined(__SYCL_DEVICE_ONLY__)
-    return __spirv_ConvertBF16ToFINTEL(a);
-#else
-    throw exception{errc::feature_not_supported,
-                    "Bfloat16 conversion is not supported on host device"};
-#endif
-  }
-
-  static bfloat16 from_bits(const storage_t &a) {
-    bfloat16 res;
-    res.value = a;
-    return res;
-  }
-
-  // Implicit conversion from float to bfloat16
-  bfloat16(const float &a) { value = from_float(a); }
-
-  bfloat16 &operator=(const float &rhs) {
-    value = from_float(rhs);
-    return *this;
-  }
-
-  // Implicit conversion from bfloat16 to float
-  operator float() const { return to_float(value); }
-  operator sycl::half() const { return to_float(value); }
-
-  // Get raw bits representation of bfloat16
-  storage_t raw() const { return value; }
-
-  // Logical operators (!,||,&&) are covered if we can cast to bool
-  explicit operator bool() { return to_float(value) != 0.0f; }
-
-  // Unary minus operator overloading
-  friend bfloat16 operator-(bfloat16 &lhs) {
-    return bfloat16{-to_float(lhs.value)};
-  }
-
-// Increment and decrement operators overloading
-#define OP(op)                                                                 \
-  friend bfloat16 &operator op(bfloat16 &lhs) {                                \
-    float f = to_float(lhs.value);                                             \
-    lhs.value = from_float(op f);                                              \
-    return lhs;                                                                \
-  }                                                                            \
-  friend bfloat16 operator op(bfloat16 &lhs, int) {                            \
-    bfloat16 old = lhs;                                                        \
-    operator op(lhs);                                                          \
-    return old;                                                                \
-  }
-  OP(++)
-  OP(--)
-#undef OP
-
-  // Assignment operators overloading
-#define OP(op)                                                                 \
-  friend bfloat16 &operator op(bfloat16 &lhs, const bfloat16 &rhs) {           \
-    float f = static_cast<float>(lhs);                                         \
-    f op static_cast<float>(rhs);                                              \
-    return lhs = f;                                                            \
-  }                                                                            \
-  template <typename T>                                                        \
-  friend bfloat16 &operator op(bfloat16 &lhs, const T &rhs) {                  \
-    float f = static_cast<float>(lhs);                                         \
-    f op static_cast<float>(rhs);                                              \
-    return lhs = f;                                                            \
-  }                                                                            \
-  template <typename T> friend T &operator op(T &lhs, const bfloat16 &rhs) {   \
-    float f = static_cast<float>(lhs);                                         \
-    f op static_cast<float>(rhs);                                              \
-    return lhs = f;                                                            \
-  }
-  OP(+=)
-  OP(-=)
-  OP(*=)
-  OP(/=)
-#undef OP
-
-// Binary operators overloading
-#define OP(type, op)                                                           \
-  friend type operator op(const bfloat16 &lhs, const bfloat16 &rhs) {          \
-    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
-  }                                                                            \
-  template <typename T>                                                        \
-  friend type operator op(const bfloat16 &lhs, const T &rhs) {                 \
-    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
-  }                                                                            \
-  template <typename T>                                                        \
-  friend type operator op(const T &lhs, const bfloat16 &rhs) {                 \
-    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
-  }
-  OP(bfloat16, +)
-  OP(bfloat16, -)
-  OP(bfloat16, *)
-  OP(bfloat16, /)
-  OP(bool, ==)
-  OP(bool, !=)
-  OP(bool, <)
-  OP(bool, >)
-  OP(bool, <=)
-  OP(bool, >=)
-#undef OP
-
-  // Bitwise(|,&,~,^), modulo(%) and shift(<<,>>) operations are not supported
-  // for floating-point types.
-};
-
-} // namespace experimental
-} // namespace intel
-} // namespace ext
-
-} // namespace sycl
-} // __SYCL_INLINE_NAMESPACE(cl)

From 2d04406d0198b5321cf2aaa870d395e9f042755b Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Tue, 25 Jan 2022 14:29:32 +0000
Subject: [PATCH 03/18] Format.

---
 sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
index 329094634d9ad..ef1f01d5340ae 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
@@ -154,7 +154,7 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
 };
 
 } // namespace experimental
-} // namespace intel
+} // namespace oneapi
 } // namespace ext
 
 } // namespace sycl

From 9418f74ee1e1a35918f5bcf99a3d0d57f29dac90 Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Tue, 25 Jan 2022 14:35:02 +0000
Subject: [PATCH 04/18] Changed extension macro name.

---
 sycl/include/CL/sycl/feature_test.hpp.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in
index e6053ebf4ff1c..9bd849ca27d8a 100644
--- a/sycl/include/CL/sycl/feature_test.hpp.in
+++ b/sycl/include/CL/sycl/feature_test.hpp.in
@@ -46,7 +46,7 @@ namespace sycl {
 #define SYCL_EXT_ONEAPI_USE_PINNED_HOST_MEMORY_PROPERTY 1
 #define SYCL_EXT_ONEAPI_SRGB 1
 #define SYCL_EXT_ONEAPI_SUB_GROUP 1
-#define SYCL_EXT_INTEL_BF16_CONVERSION 1
+#define SYCL_EXT_ONEAPI_BF16_CONVERSION 1
 #define SYCL_EXT_INTEL_BITCAST 1
 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1
 #ifdef __clang__

From 4d99f3f97c06dc529d1d6b16df645b1af27fa8a6 Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Thu, 17 Feb 2022 10:22:48 +0000
Subject: [PATCH 05/18] fixed test.

---
 sycl/test/extensions/bfloat16.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test/extensions/bfloat16.cpp b/sycl/test/extensions/bfloat16.cpp
index 6be5459642d0c..dd87806942f8a 100644
--- a/sycl/test/extensions/bfloat16.cpp
+++ b/sycl/test/extensions/bfloat16.cpp
@@ -2,10 +2,10 @@
 
 // UNSUPPORTED: cuda || hip_amd
 
-#include <sycl/ext/intel/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
 #include <sycl/sycl.hpp>
 
-using sycl::ext::intel::experimental::bfloat16;
+using sycl::ext::oneapi::experimental::bfloat16;
 
 SYCL_EXTERNAL uint16_t some_bf16_intrinsic(uint16_t x, uint16_t y);
 SYCL_EXTERNAL void foo(long x, sycl::half y);

From 3982001259745c617ad78d57dc67512e8d7ff6e9 Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Fri, 4 Mar 2022 15:41:39 +0000
Subject: [PATCH 06/18] Used neg ptx7.0 builtin for unary minus

---
 clang/include/clang/Basic/BuiltinsNVPTX.def         |  5 +++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td              | 13 +++++++++++++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td            |  9 +++++++++
 .../sycl/ext/oneapi/experimental/bfloat16.hpp       | 13 +++++++++++--
 4 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 449e4d1256944..955dbbaae8f0d 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -182,6 +182,11 @@ BUILTIN(__nvvm_fabs_ftz_f, "ff", "")
 BUILTIN(__nvvm_fabs_f, "ff", "")
 BUILTIN(__nvvm_fabs_d, "dd", "")
 
+// Neg
+
+TARGET_BUILTIN(__nvvm_neg_bf16, "ZUsZUs", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70))
+
 // Round
 
 BUILTIN(__nvvm_round_ftz_f, "ff", "")
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 33ba30d782ff3..b7b0813f05292 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -740,6 +740,19 @@ let TargetPrefix = "nvvm" in {
   def int_nvvm_fabs_d : GCCBuiltin<"__nvvm_fabs_d">,
       DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
 
+//
+// Neg bf16, bf16x2
+//
+
+  foreach unary = ["neg"] in {
+    def int_nvvm_ # unary # _bf16 :
+      GCCBuiltin<!strconcat("__nvvm_", unary, "_bf16")>,
+      DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem]>;
+    def int_nvvm_ # unary # _bf16x2 :
+      GCCBuiltin<!strconcat("__nvvm_", unary, "_bf16x2")>,
+      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+  }
+
 //
 // Round
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index ec004c5923ece..af9e1270bc5f5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -719,6 +719,15 @@ def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
 def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs,
   Float64Regs, int_nvvm_fabs_d>;
 
+//
+// Neg bf16, bf16x2
+//
+
+def INT_NVVM_NEG_BF16 : F_MATH_1<"neg.bf16 \t$dst, $src0;", Int16Regs,
+  Int16Regs, int_nvvm_neg_bf16>;
+def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $src0;", Int32Regs,
+  Int32Regs, int_nvvm_neg_bf16x2>;
+
 //
 // Round
 //
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
index ef1f01d5340ae..3768c65aab6a3 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
@@ -42,7 +42,7 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
   static float to_float(const storage_t &a) {
 #if defined(__SYCL_DEVICE_ONLY__)
 #if defined(__NVPTX__)
-    unsigned int y = a;
+    uint32_t y = a;
     y = y << 16;
     float *res = reinterpret_cast<float *>(&y);
     return *res;
@@ -81,7 +81,16 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
 
   // Unary minus operator overloading
   friend bfloat16 operator-(bfloat16 &lhs) {
-    return bfloat16{-to_float(lhs.value)};
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+    return from_bits(__nvvm_neg_bf16(lhs.value));
+#else
+    return bfloat16{-__spirv_ConvertBF16ToFINTEL(lhs.value)};
+#endif
+#else
+    throw exception{errc::feature_not_supported,
+                    "Bfloat16 unary minus is not supported on host device"};
+#endif
   }
 
 // Increment and decrement operators overloading

From 450e1b57ee67236abce07fa3f460ad59ac434f3a Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney@codeplay.com>
Date: Mon, 7 Mar 2022 15:54:56 +0000
Subject: [PATCH 07/18] Adding fma_relu extension

---
 libclc/generic/include/spirv/spirv_builtins.h |  16 +++
 libclc/generic/libspirv/float16.cl            |  36 ++++++
 .../sycl_ext_oneapi_fma_relu.asciidoc         | 120 ++++++++++++++++++
 sycl/include/CL/__spirv/spirv_ops.hpp         |  51 ++++++++
 sycl/include/CL/sycl/builtins.hpp             |  12 ++
 sycl/include/CL/sycl/detail/builtins.hpp      |   3 +
 .../sycl/ext/oneapi/experimental/builtins.hpp |  13 ++
 sycl/source/detail/builtins_math.cpp          |  22 ++++
 8 files changed, 273 insertions(+)
 create mode 100644 sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc

diff --git a/libclc/generic/include/spirv/spirv_builtins.h b/libclc/generic/include/spirv/spirv_builtins.h
index adeb3a63460d6..7fcf76b8371da 100644
--- a/libclc/generic/include/spirv/spirv_builtins.h
+++ b/libclc/generic/include/spirv/spirv_builtins.h
@@ -14146,6 +14146,22 @@ _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t
     __spirv_ocl_fma(__clc_vec16_fp16_t, __clc_vec16_fp16_t, __clc_vec16_fp16_t);
 #endif
 
+#ifdef cl_khr_fp16
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp16_t __clc_fma_relu(__clc_fp16_t,
+                                                                 __clc_fp16_t,
+                                                                 __clc_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp16_t
+    __clc_fma_relu(__clc_vec2_fp16_t, __clc_vec2_fp16_t, __clc_vec2_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp16_t
+    __clc_fma_relu(__clc_vec3_fp16_t, __clc_vec3_fp16_t, __clc_vec3_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp16_t
+    __clc_fma_relu(__clc_vec4_fp16_t, __clc_vec4_fp16_t, __clc_vec4_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp16_t
+    __clc_fma_relu(__clc_vec8_fp16_t, __clc_vec8_fp16_t, __clc_vec8_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t
+    __clc_fma_relu(__clc_vec16_fp16_t, __clc_vec16_fp16_t, __clc_vec16_fp16_t);
+#endif
+
 _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp32_t
     __spirv_ocl_fmax(__clc_fp32_t, __clc_fp32_t);
 _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t
diff --git a/libclc/generic/libspirv/float16.cl b/libclc/generic/libspirv/float16.cl
index b2cd14e8c63f4..1dff41f274ffc 100644
--- a/libclc/generic/libspirv/float16.cl
+++ b/libclc/generic/libspirv/float16.cl
@@ -4540,6 +4540,42 @@ __spirv_ocl_fma(__clc_vec16_float16_t args_0, __clc_vec16_float16_t args_1,
                          as_half16(args_2));
 }
 
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t __clc_fma_relu(
+    __clc_float16_t args_0, __clc_float16_t args_1, __clc_float16_t args_2) {
+  return __clc_fma_relu(as_half(args_0), as_half(args_1), as_half(args_2));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec2_fp16_t
+__clc_fma_relu(__clc_vec2_float16_t args_0, __clc_vec2_float16_t args_1,
+               __clc_vec2_float16_t args_2) {
+  return __clc_fma_relu(as_half2(args_0), as_half2(args_1), as_half2(args_2));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec3_fp16_t
+__clc_fma_relu(__clc_vec3_float16_t args_0, __clc_vec3_float16_t args_1,
+               __clc_vec3_float16_t args_2) {
+  return __clc_fma_relu(as_half3(args_0), as_half3(args_1), as_half3(args_2));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec4_fp16_t
+__clc_fma_relu(__clc_vec4_float16_t args_0, __clc_vec4_float16_t args_1,
+               __clc_vec4_float16_t args_2) {
+  return __clc_fma_relu(as_half4(args_0), as_half4(args_1), as_half4(args_2));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec8_fp16_t
+__clc_fma_relu(__clc_vec8_float16_t args_0, __clc_vec8_float16_t args_1,
+               __clc_vec8_float16_t args_2) {
+  return __clc_fma_relu(as_half8(args_0), as_half8(args_1), as_half8(args_2));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec16_fp16_t
+__clc_fma_relu(__clc_vec16_float16_t args_0, __clc_vec16_float16_t args_1,
+               __clc_vec16_float16_t args_2) {
+  return __clc_fma_relu(as_half16(args_0), as_half16(args_1),
+                        as_half16(args_2));
+}
+
 _CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t
 __spirv_ocl_fmax(__clc_float16_t args_0, __clc_float16_t args_1) {
   return __spirv_ocl_fmax(as_half(args_0), as_half(args_1));
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
new file mode 100644
index 0000000000000..b471e84087a74
--- /dev/null
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
@@ -0,0 +1,120 @@
+= sycl_ext_oneapi_fma_relu
+
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+:dpcpp: pass:[DPC++]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+
+== Notice
+
+[%hardbreaks]
+Copyright (C) 2022-2022 Intel Corporation.  All rights reserved.
+
+Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
+of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by
+permission by Khronos.
+
+== Contact
+
+To report problems with this extension, please open a new issue at:
+
+https://github.com/intel/llvm/issues
+
+or contact hugh 'dot' delaney 'at' codeplay 'dot' com.
+
+== Dependencies
+
+This extension is written against the SYCL 2020 revision 4 specification.  All
+references below to the "core SYCL specification" or to section numbers in the
+SYCL specification refer to that revision.
+
+For the `bfloat16` cases this extension depends on the following other SYCL
+extensions:
+
+* link:./sycl_ext_intel_bf16_conversion.asciidoc[
+  sycl_ext_*_bf16_conversion]
+
+For the `half` cases this extension requires the runtime aspect 
+`sycl::aspect::fp16`.
+
+== Contributors
+
+* Hugh Delaney
+
+== Status
+
+This is a proposed extension specification, intended to gather community
+feedback.  Interfaces defined in this specification may not be implemented yet
+or may be in a preliminary state.  The specification itself may also change in
+incompatible ways before it is finalized.  *Shipping software products should
+not rely on APIs defined in this specification.*
+
+[NOTE]
+====
+This extension is currently implemented in {dpcpp} only for GPU devices and
+only when using the CUDA backend.  Attempting to use this extension in
+kernels that run on other devices or backends may result in undefined behavior.
+Be aware that the compiler is not able to issue a diagnostic to warn you if
+this happens.
+====
+
+
+== Overview
+
+This extension introduces the `fma_relu` function for datatypes `sycl::half`,
+`bfloat16` and `bfloat16x2`. `bfloat16` and `bfloat16x2` refer to the bfloat16
+class from the `sycl_ext_*_bf16_conversion` extension, and currently use 
+`uint16_t` and `uint32_t`, respectively, as storage types. 
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+
+// Available when T is sycl::half, uint16_t (bfloat16) or uint32_t (bfloat16x2)
+template <typename T>
+T fma_relu(T a, T b, T c);
+}
+```
+
+`fma_relu` returns `a * b + c > 0 ? a * b + c : 0`. 
+
+== Specification
+
+=== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification.  An implementation supporting this extension must predefine the
+macro `SYCL_EXT_ONEAPI_FMA_RELU` to one of the values defined in the table
+below.  Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's features the implementation
+supports.
+
+If `fma_relu` is to be used with either the `bf16` or `bf16x2` datatypes, then
+an implementation must additionally predefine the macro 
+`SYCL_EXT_INTEL_BF16_CONVERSION`, as detailed in 
+link:./sycl_ext_intel_bf16_conversion.asciidoc[
+  sycl_ext_*_bf16_conversion].
+
+
+[%header,cols="1,5"]
+|===
+|Value
+|Description
+
+|1
+|The APIs of this experimental extension are not versioned, so the
+ feature-test macro always has this value.
+|===
+
diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index 4878cc4dd5db8..3ac0eadce7321 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -755,6 +755,57 @@ __spirv_ocl_printf(const __attribute__((opencl_constant)) char *Format, ...);
 extern SYCL_EXTERNAL int __spirv_ocl_printf(const char *Format, ...);
 #endif
 
+extern SYCL_EXTERNAL _Float16 __clc_fma_relu(_Float16, _Float16, _Float16);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 2>
+    __clc_fma_relu(__ocl_vec_t<_Float16, 2>, __ocl_vec_t<_Float16, 2>,
+                   __ocl_vec_t<_Float16, 2>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 3>
+    __clc_fma_relu(__ocl_vec_t<_Float16, 3>, __ocl_vec_t<_Float16, 3>,
+                   __ocl_vec_t<_Float16, 3>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 4>
+    __clc_fma_relu(__ocl_vec_t<_Float16, 4>, __ocl_vec_t<_Float16, 4>,
+                   __ocl_vec_t<_Float16, 4>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 8>
+    __clc_fma_relu(__ocl_vec_t<_Float16, 8>, __ocl_vec_t<_Float16, 8>,
+                   __ocl_vec_t<_Float16, 8>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 16>
+    __clc_fma_relu(__ocl_vec_t<_Float16, 16>, __ocl_vec_t<_Float16, 16>,
+                   __ocl_vec_t<_Float16, 16>);
+
+extern SYCL_EXTERNAL uint16_t __clc_fma_relu(uint16_t, uint16_t, uint16_t);
+extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 2>
+    __clc_fma_relu(__ocl_vec_t<uint16_t, 2>, __ocl_vec_t<uint16_t, 2>,
+                   __ocl_vec_t<uint16_t, 2>);
+extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 3>
+    __clc_fma_relu(__ocl_vec_t<uint16_t, 3>, __ocl_vec_t<uint16_t, 3>,
+                   __ocl_vec_t<uint16_t, 3>);
+extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 4>
+    __clc_fma_relu(__ocl_vec_t<uint16_t, 4>, __ocl_vec_t<uint16_t, 4>,
+                   __ocl_vec_t<uint16_t, 4>);
+extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 8>
+    __clc_fma_relu(__ocl_vec_t<uint16_t, 8>, __ocl_vec_t<uint16_t, 8>,
+                   __ocl_vec_t<uint16_t, 8>);
+extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 16>
+    __clc_fma_relu(__ocl_vec_t<uint16_t, 16>, __ocl_vec_t<uint16_t, 16>,
+                   __ocl_vec_t<uint16_t, 16>);
+
+extern SYCL_EXTERNAL uint32_t __clc_fma_relu(uint32_t, uint32_t, uint32_t);
+extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 2>
+    __clc_fma_relu(__ocl_vec_t<uint32_t, 2>, __ocl_vec_t<uint32_t, 2>,
+                   __ocl_vec_t<uint32_t, 2>);
+extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 3>
+    __clc_fma_relu(__ocl_vec_t<uint32_t, 3>, __ocl_vec_t<uint32_t, 3>,
+                   __ocl_vec_t<uint32_t, 3>);
+extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 4>
+    __clc_fma_relu(__ocl_vec_t<uint32_t, 4>, __ocl_vec_t<uint32_t, 4>,
+                   __ocl_vec_t<uint32_t, 4>);
+extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 8>
+    __clc_fma_relu(__ocl_vec_t<uint32_t, 8>, __ocl_vec_t<uint32_t, 8>,
+                   __ocl_vec_t<uint32_t, 8>);
+extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 16>
+    __clc_fma_relu(__ocl_vec_t<uint32_t, 16>, __ocl_vec_t<uint32_t, 16>,
+                   __ocl_vec_t<uint32_t, 16>);
+
 #else // if !__SYCL_DEVICE_ONLY__
 
 template <typename dataT>
diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp
index 0a9814da3eed0..96196e4776123 100644
--- a/sycl/include/CL/sycl/builtins.hpp
+++ b/sycl/include/CL/sycl/builtins.hpp
@@ -191,6 +191,18 @@ detail::enable_if_t<detail::is_genfloat<T>::value, T> fma(T a, T b,
   return __sycl_std::__invoke_fma<T>(a, b, c);
 }
 
+// genfloath fma_relu (genfloath a, genfloath b, genfloath c)
+// BF16 : uint16_t fma_relu (uint16_t a, uint16_t b, uint16_t c)
+// BF16X2 : uint32_t fma_relu (uint32_t a, uint32_t b, uint32_t c)
+template <typename T>
+detail::enable_if_t<detail::is_genfloath<T>::value ||
+                        std::is_same<T, uint16_t>::value ||
+                        std::is_same<T, uint32_t>::value,
+                    T>
+fma(T a, T b, T c) __NOEXC {
+  return __sycl_std::__invoke_fma<T>(a, b, c);
+}
+
 // genfloat fmax (genfloat x, genfloat y)
 template <typename T>
 detail::enable_if_t<detail::is_genfloat<T>::value, T> fmax(T x, T y) __NOEXC {
diff --git a/sycl/include/CL/sycl/detail/builtins.hpp b/sycl/include/CL/sycl/detail/builtins.hpp
index f4f0475ea905b..a6ca592bacb42 100644
--- a/sycl/include/CL/sycl/detail/builtins.hpp
+++ b/sycl/include/CL/sycl/detail/builtins.hpp
@@ -20,6 +20,7 @@
 #ifdef __SYCL_DEVICE_ONLY__
 #define __FUNC_PREFIX_OCL __spirv_ocl_
 #define __FUNC_PREFIX_CORE __spirv_
+#define __FUNC_PREFIX_GENERIC __clc_
 #define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg1)
 #define __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2)
 #define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg)
@@ -27,6 +28,7 @@
 #else
 #define __FUNC_PREFIX_OCL
 #define __FUNC_PREFIX_CORE
+#define __FUNC_PREFIX_GENERIC
 #define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg)                              \
   extern Ret __SYCL_PPCAT(prefix, call)(Arg)
 #define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg)                         \
@@ -134,6 +136,7 @@ __SYCL_MAKE_CALL_ARG1(fabs, __FUNC_PREFIX_OCL)
 __SYCL_MAKE_CALL_ARG2(fdim, __FUNC_PREFIX_OCL)
 __SYCL_MAKE_CALL_ARG1(floor, __FUNC_PREFIX_OCL)
 __SYCL_MAKE_CALL_ARG3(fma, __FUNC_PREFIX_OCL)
+__SYCL_MAKE_CALL_ARG3(fma_relu, __FUNC_PREFIX_GENERIC)
 __SYCL_MAKE_CALL_ARG2(fmax, __FUNC_PREFIX_OCL)
 __SYCL_MAKE_CALL_ARG2(fmin, __FUNC_PREFIX_OCL)
 __SYCL_MAKE_CALL_ARG2(fmod, __FUNC_PREFIX_OCL)
diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
index e32e1c70a5a97..8d9386c341a85 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -16,12 +16,25 @@
 #define __SYCL_CONSTANT_AS
 #endif
 
+// TODO Decide whether to mark functions with this attribute.
+#define __NOEXC /*noexcept*/
+
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace ext {
 namespace oneapi {
 namespace experimental {
 
+// fma_relu returns a * b + c > 0 ? a * b + c : 0
+template <typename T>
+sycl::detail::enable_if_t<sycl::detail::is_genfloath<T>::value ||
+                              sycl::detail::is_ugenshort<T>::value ||
+                              sycl::detail::is_ugenint<T>::value,
+                          T>
+fma_relu(T a, T b, T c) __NOEXC {
+  return __sycl_std::__invoke_fma_relu<T>(a, b, c);
+}
+
 // Provides functionality to print data from kernels in a C way:
 // - On non-host devices this function is directly mapped to printf from
 //   OpenCL C
diff --git a/sycl/source/detail/builtins_math.cpp b/sycl/source/detail/builtins_math.cpp
index 5a78d6cb80a5b..9c7c914db0df1 100644
--- a/sycl/source/detail/builtins_math.cpp
+++ b/sycl/source/detail/builtins_math.cpp
@@ -359,6 +359,28 @@ MAKE_1V_2V_3V(fma, s::cl_float, s::cl_float, s::cl_float, s::cl_float)
 MAKE_1V_2V_3V(fma, s::cl_double, s::cl_double, s::cl_double, s::cl_double)
 MAKE_1V_2V_3V(fma, s::cl_half, s::cl_half, s::cl_half, s::cl_half)
 
+// fma_relu
+__SYCL_EXPORT s::cl_half fma_relu(s::cl_half a, s::cl_half b,
+                                  s::cl_half c) __NOEXC {
+  auto ans = std::fma(a, b, c);
+  return (ans > 0) ? ans : 0;
+}
+__SYCL_EXPORT s::cl_ushort fma_relu(s::cl_ushort a, s::cl_ushort b,
+                                    s::cl_ushort c) __NOEXC {
+  // TODO implement this once bfloat16 datatype is supported on host
+  throw std::runtime_error(
+      "fma_relu not supported on host for bfloat16 datatype.");
+}
+__SYCL_EXPORT s::cl_uint fma_relu(s::cl_uint a, s::cl_uint b,
+                                  s::cl_uint c) __NOEXC {
+  // TODO implement this once bfloat16x2 datatype is supported on host
+  throw std::runtime_error(
+      "fma_relu not supported on host for bfloat16x2 datatype.");
+}
+MAKE_1V_2V_3V(fma_relu, s::cl_ushort, s::cl_ushort, s::cl_ushort, s::cl_ushort)
+MAKE_1V_2V_3V(fma_relu, s::cl_uint, s::cl_uint, s::cl_uint, s::cl_uint)
+MAKE_1V_2V_3V(fma_relu, s::cl_half, s::cl_half, s::cl_half, s::cl_half)
+
 // fmax
 __SYCL_EXPORT s::cl_float fmax(s::cl_float x, s::cl_float y) __NOEXC {
   return std::fmax(x, y);

From 8d2d11fecfbc1e9caebdc460c7a9cb19b4e18774 Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Mon, 7 Mar 2022 16:47:32 +0000
Subject: [PATCH 08/18] Replaced SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc with
 SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc

---
 .../SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc  | 336 ++++++++++++++++++
 1 file changed, 336 insertions(+)
 create mode 100644 sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc
new file mode 100644
index 0000000000000..bf0a799671ffa
--- /dev/null
+++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc
@@ -0,0 +1,336 @@
+= sycl_oneapi_bf16_conversion
+
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+
+:blank: pass:[ +]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+// This is necessary for asciidoc, but not for asciidoctor
+:cpp: C++
+
+== Notice
+
+IMPORTANT: This specification is a draft.
+
+Copyright (c) 2021-2022 Intel Corporation. All rights reserved.
+
+NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
+trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc.
+used by permission by Khronos.
+
+== Dependencies
+
+This extension is written against the SYCL 2020 specification, Revision 4.
+
+== Status
+
+Draft
+
+This is a preview extension specification, intended to provide early access to
+a feature for review and community feedback. When the feature matures, this
+specification may be released as a formal extension.
+
+Because the interfaces defined by this specification are not final and are
+subject to change they are not intended to be used by shipping software
+products.
+
+== Version
+
+Revision: 4
+
+== Introduction
+
+This extension adds functionality to convert value of single-precision
+floating-point type(`float`) to `bfloat16` type and vice versa. The extension
+doesn't add support for `bfloat16` type as such, instead it uses 16-bit integer
+type(`uint16_t`) as a storage for `bfloat16` values.
+
+The purpose of conversion from float to bfloat16 is to reduce ammount of memory
+required to store floating-point numbers. Computations are expected to be done with
+32-bit floating-point values.
+
+This extension is an optional kernel feature as described in
+https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:optional-kernel-features[section 5.7]
+of the SYCL 2020 spec. Therefore, attempting to submit a kernel using this
+feature to a device that does not support it should cause a synchronous
+`errc::kernel_not_supported` exception to be thrown from the kernel invocation
+command (e.g. from `parallel_for`).
+
+== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification section 6.3.3 "Feature test macros". Therefore, an implementation
+supporting this extension must predefine the macro
+`SYCL_EXT_ONEAPI_BF16_CONVERSION` to one of the values defined in the table
+below. Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro’s
+ value to determine which of the extension’s APIs the implementation supports.
+
+[%header,cols="1,5"]
+|===
+|Value |Description
+|1     |Initial extension version. Base features are supported.
+|===
+
+== Extension to `enum class aspect`
+
+[source]
+----
+namespace sycl {
+enum class aspect {
+  ...
+  ext_oneapi_bf16_conversion
+}
+}
+----
+
+If a SYCL device has the `ext_oneapi_bf16_conversion` aspect, then it natively
+supports conversion of values of `float` type to `bfloat16` and back.
+
+If the device doesn't have the aspect, objects of `bfloat16` class must not be
+used in the device code.
+
+**NOTE**: The `ext_oneapi_bf16_conversion` aspect is not yet supported.  The
+`bfloat16` class is currently supported only on Xe HP GPU and Nvidia A100 GPU.
+
+== New `bfloat16` class
+
+The `bfloat16` class below provides the conversion functionality. Conversion
+from `float` to `bfloat16` is done with round to nearest even(RTE) rounding
+mode.
+
+[source]
+----
+namespace sycl {
+namespace ext {
+namespace oneapi {
+namespace experimental {
+
+class bfloat16 {
+  using storage_t = uint16_t;
+  storage_t value;
+
+public:
+  bfloat16() = default;
+  bfloat16(const bfloat16 &) = default;
+  ~bfloat16() = default;
+
+  // Explicit conversion functions
+  static storage_t from_float(const float &a);
+  static float to_float(const storage_t &a);
+
+  // Convert from float to bfloat16
+  bfloat16(const float &a);
+  bfloat16 &operator=(const float &a);
+
+  // Convert from bfloat16 to float
+  operator float() const;
+
+  // Get bfloat16 as uint16.
+  operator storage_t() const;
+
+  // Convert to bool type
+  explicit operator bool();
+
+  friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }
+
+  // OP is: prefix ++, --
+  friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }
+
+  // OP is: postfix ++, --
+  friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }
+
+  // OP is: +=, -=, *=, /=
+  friend bfloat16 &operatorOP(bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+
+  // OP is +, -, *, /
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+  template <typename T>
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
+  template <typename T>
+  friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
+
+  // OP is ==,!=, <, >, <=, >=
+  friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+  template <typename T>
+  friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
+  template <typename T>
+  friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
+};
+
+} // namespace experimental
+} // namespace oneapi
+} // namespace ext
+} // namespace sycl
+----
+
+Table 1. Member functions of `bfloat16` class.
+|===
+| Member Function | Description
+
+|  `static storage_t from_float(const float &a);`
+|  Explicitly convert from `float` to `bfloat16`.
+
+|  `static float to_float(const storage_t &a);`
+|  Interpret `a` as `bfloat16` and explicitly convert it to `float`.
+
+| `bfloat16(const float& a);`
+| Construct `bfloat16` from `float`. Converts `float` to `bfloat16`.
+
+| `bfloat16 &operator=(const float &a);`
+| Replace the value with `a` converted to `bfloat16`
+
+| `operator float() const;`
+|  Return `bfloat16` value converted to `float`.
+
+| `operator storage_t() const;`
+| Return `uint16_t` value, whose bits represent `bfloat16` value.
+
+| `explicit operator bool() { /* ... */ }`
+| Convert `bfloat16` to `bool` type. Return `false` if the value equals to
+  zero, return `true` otherwise.
+
+| `friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }`
+| Construct new instance of `bfloat16` class with negated value of the `bf`.
+
+| `friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }`
+| Perform an in-place `OP` prefix arithmetic operation on the `bf`,
+  assigning the result to the `bf` and return the `bf`.
+
+  OP is: `++, --`
+
+| `friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }`
+| Perform an in-place `OP` postfix arithmetic operation on `bf`, assigning
+  the result to the `bf` and return a copy of `bf` before the operation is
+  performed.
+
+  OP is: `++, --`
+
+| `friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform an in-place `OP` arithmetic operation between the `lhs` and the `rhs`
+  and return the `lhs`.
+
+  OP is: `+=, -=, *=, /=`
+
+| `friend type operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` `bfloat16` and `rhs` `bfloat16` values.
+
+  OP is `+, -, *, /`
+
+| `template <typename T>
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` `bfloat16` value and `rhs` of template type `T`. Type `T` must be
+  convertible to `float`.
+
+  OP is `+, -, *, /`
+
+| `template <typename T>
+  friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` of template type `T` and `rhs` `bfloat16` value. Type `T` must be
+  convertible to `float`.
+
+  OP is `+, -, *, /`
+
+| `friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` `bfloat16`
+  values and return the result as a boolean value.
+
+OP is `==, !=, <, >, <=, >=`
+
+| `template <typename T>
+  friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` of
+  template type `T` and return the result as a boolean value. Type `T` must be
+  convertible to `float`.
+
+OP is `==, !=, <, >, <=, >=`
+
+| `template <typename T>
+  friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` of template type `T` and `rhs`
+  `bfloat16` value and return the result as a boolean value. Type `T` must be
+  convertible to `float`.
+
+OP is `==, !=, <, >, <=, >=`
+|===
+
+== Example
+
+[source]
+----
+#include <sycl/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+
+using sycl::ext::oneapi::experimental::bfloat16;
+
+bfloat16 operator+(const bfloat16 &lhs, const bfloat16 &rhs) {
+  return static_cast<float>(lhs) + static_cast<float>(rhs);
+}
+
+float foo(float a, float b) {
+  // Convert from float to bfloat16.
+  bfloat16 A {a};
+  bfloat16 B {b};
+
+  // Convert A and B from bfloat16 to float, do addition on floating-pointer
+  // numbers, then convert the result to bfloat16 and store it in C.
+  bfloat16 C = A + B;
+
+  // Return the result converted from bfloat16 to float.
+  return C;
+}
+
+int main (int argc, char *argv[]) {
+  float data[3] = {7.0, 8.1, 0.0};
+  sycl::device dev;
+  sycl::queue deviceQueue{dev};
+  sycl::buffer<float, 1> buf {data, sycl::range<1> {3}};
+
+  if (dev.has(sycl::aspect::ext_oneapi_bf16_conversion)) {
+    deviceQueue.submit ([&] (sycl::handler& cgh) {
+      auto numbers = buf.get_access<sycl::access::mode::read_write> (cgh);
+      cgh.single_task<class simple_kernel> ([=] () {
+        numbers[2] = foo(numbers[0], numbers[1]);
+      });
+    });
+  }
+  return 0;
+}
+----
+
+== Issues
+
+None.
+
+== Revision History
+
+[cols="5,15,15,70"]
+[grid="rows"]
+[options="header"]
+|========================================
+|Rev|Date|Author|Changes
+|1|2021-08-02|Alexey Sotkin |Initial public working draft
+|2|2021-08-17|Alexey Sotkin |Add explicit conversion functions +
+                             Add operator overloadings +
+                             Apply code review suggestions
+|3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor
+|4|2022-03-07|Jack Kirk |Switch from Intel vendor specific to oneapi
+|========================================

From a5145055e3a764b76fe7ff854beb1422c95a5308 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney@codeplay.com>
Date: Mon, 7 Mar 2022 18:13:58 +0000
Subject: [PATCH 09/18] Remove redundant include

---
 sycl/include/CL/sycl/builtins.hpp | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp
index 96196e4776123..0a9814da3eed0 100644
--- a/sycl/include/CL/sycl/builtins.hpp
+++ b/sycl/include/CL/sycl/builtins.hpp
@@ -191,18 +191,6 @@ detail::enable_if_t<detail::is_genfloat<T>::value, T> fma(T a, T b,
   return __sycl_std::__invoke_fma<T>(a, b, c);
 }
 
-// genfloath fma_relu (genfloath a, genfloath b, genfloath c)
-// BF16 : uint16_t fma_relu (uint16_t a, uint16_t b, uint16_t c)
-// BF16X2 : uint32_t fma_relu (uint32_t a, uint32_t b, uint32_t c)
-template <typename T>
-detail::enable_if_t<detail::is_genfloath<T>::value ||
-                        std::is_same<T, uint16_t>::value ||
-                        std::is_same<T, uint32_t>::value,
-                    T>
-fma(T a, T b, T c) __NOEXC {
-  return __sycl_std::__invoke_fma<T>(a, b, c);
-}
-
 // genfloat fmax (genfloat x, genfloat y)
 template <typename T>
 detail::enable_if_t<detail::is_genfloat<T>::value, T> fmax(T x, T y) __NOEXC {

From 37a18d7538ca7ad6b98994efb38cb86bc9dc7a66 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney@codeplay.com>
Date: Fri, 11 Mar 2022 12:18:15 +0000
Subject: [PATCH 10/18] Adding symbols to linux dump

---
 sycl/test/abi/sycl_symbols_linux.dump | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 79d58722b4f4f..c2071e1df17a1 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -2932,6 +2932,27 @@ _ZN2cl10__host_std7u_rhaddEhh
 _ZN2cl10__host_std7u_rhaddEjj
 _ZN2cl10__host_std7u_rhaddEmm
 _ZN2cl10__host_std7u_rhaddEtt
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi3EEES3_S3_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi8EEES3_S3_
+_ZN2cl10__host_std8fma_reluENS_4sycl6detail9half_impl4halfES4_S4_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi2EEES6_S6_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi1EEES6_S6_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi3EEES3_S3_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi16EEES3_S3_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi4EEES3_S3_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi1EEES3_S3_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi16EEES6_S6_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi4EEES6_S6_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi3EEES6_S6_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi16EEES3_S3_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi8EEES6_S6_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi2EEES3_S3_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi1EEES3_S3_
+_ZN2cl10__host_std8fma_reluEttt
+_ZN2cl10__host_std8fma_reluEjjj
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi4EEES3_S3_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi8EEES3_S3_
+_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi2EEES3_S3_
 _ZN2cl10__host_std8IsFiniteENS_4sycl3vecINS1_6detail9half_impl4halfELi16EEE
 _ZN2cl10__host_std8IsFiniteENS_4sycl3vecINS1_6detail9half_impl4halfELi1EEE
 _ZN2cl10__host_std8IsFiniteENS_4sycl3vecINS1_6detail9half_impl4halfELi2EEE

From 7b40302321983fac83a76afaaf0135426dd4afd3 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <46290137+hdelan@users.noreply.github.com>
Date: Mon, 14 Mar 2022 10:12:01 +0000
Subject: [PATCH 11/18] Responding to comments

---
 .../extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
index b471e84087a74..fbfb564dc26bb 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
@@ -78,6 +78,8 @@ This extension introduces the `fma_relu` function for datatypes `sycl::half`,
 class from the `sycl_ext_*_bf16_conversion` extension, and currently use 
 `uint16_t` and `uint32_t`, respectively, as storage types. 
 
+== Specification
+
 ```c++
 namespace sycl::ext::oneapi::experimental {
 
@@ -89,8 +91,6 @@ T fma_relu(T a, T b, T c);
 
 `fma_relu` returns `a * b + c > 0 ? a * b + c : 0`. 
 
-== Specification
-
 === Feature test macro
 
 This extension provides a feature-test macro as described in the core SYCL

From 8a29c4412c06b1246bdcd0fa5954b70957211e36 Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Tue, 15 Mar 2022 14:41:54 +0000
Subject: [PATCH 12/18] Renamed extension to cover all bfloat16 funct.

Removed aspect reference: can be added once the ext_oneapi_bfloat16 aspect is merged.
---
 ....asciidoc => sycl_ext_oneapi_bfloat16.asciidoc} | 14 +++++++-------
 .../sycl/ext/oneapi/experimental/bfloat16.hpp      |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)
 rename sycl/doc/extensions/experimental/{SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc => sycl_ext_oneapi_bfloat16.asciidoc} (96%)

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc
similarity index 96%
rename from sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc
rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc
index bf0a799671ffa..175219e23c47f 100644
--- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc
@@ -1,4 +1,4 @@
-= sycl_oneapi_bf16_conversion
+= sycl_ext_oneapi_bfloat16
 
 :source-highlighter: coderay
 :coderay-linenums-mode: table
@@ -73,7 +73,7 @@ command (e.g. from `parallel_for`).
 This extension provides a feature-test macro as described in the core SYCL
 specification section 6.3.3 "Feature test macros". Therefore, an implementation
 supporting this extension must predefine the macro
-`SYCL_EXT_ONEAPI_BF16_CONVERSION` to one of the values defined in the table
+`SYCL_EXT_ONEAPI_BFLOAT16` to one of the values defined in the table
 below. Applications can test for the existence of this macro to determine if
 the implementation supports this feature, or applications can test the macro’s
  value to determine which of the extension’s APIs the implementation supports.
@@ -91,18 +91,18 @@ the implementation supports this feature, or applications can test the macro’s
 namespace sycl {
 enum class aspect {
   ...
-  ext_oneapi_bf16_conversion
+  ext_oneapi_bfloat16
 }
 }
 ----
 
-If a SYCL device has the `ext_oneapi_bf16_conversion` aspect, then it natively
+If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it natively
 supports conversion of values of `float` type to `bfloat16` and back.
 
 If the device doesn't have the aspect, objects of `bfloat16` class must not be
 used in the device code.
 
-**NOTE**: The `ext_oneapi_bf16_conversion` aspect is not yet supported.  The
+**NOTE**: The `ext_oneapi_bfloat16` aspect is not yet supported.  The
 `bfloat16` class is currently supported only on Xe HP GPU and Nvidia A100 GPU.
 
 == New `bfloat16` class
@@ -304,7 +304,7 @@ int main (int argc, char *argv[]) {
   sycl::queue deviceQueue{dev};
   sycl::buffer<float, 1> buf {data, sycl::range<1> {3}};
 
-  if (dev.has(sycl::aspect::ext_oneapi_bf16_conversion)) {
+  if (dev.has(sycl::aspect::ext_oneapi_bfloat16)) {
     deviceQueue.submit ([&] (sycl::handler& cgh) {
       auto numbers = buf.get_access<sycl::access::mode::read_write> (cgh);
       cgh.single_task<class simple_kernel> ([=] () {
@@ -332,5 +332,5 @@ None.
                              Add operator overloadings +
                              Apply code review suggestions
 |3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor
-|4|2022-03-07|Jack Kirk |Switch from Intel vendor specific to oneapi
+|4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific to oneapi
 |========================================
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
index 3768c65aab6a3..1190c80631928 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp
@@ -17,7 +17,7 @@ namespace ext {
 namespace oneapi {
 namespace experimental {
 
-class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 {
+class bfloat16 {
   using storage_t = uint16_t;
   storage_t value;
 

From 49aca060e44ffb13bae8c727477f91da9a6ad5a5 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney@codeplay.com>
Date: Mon, 4 Apr 2022 15:08:14 +0100
Subject: [PATCH 13/18] Making fma_relu accept the bfloat16 class

---
 .../sycl/ext/oneapi/experimental/builtins.hpp | 37 ++++++++++++-------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
index 98cb47bc52175..8fb32a370cfc7 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -16,6 +16,8 @@
 
 #include <CL/__spirv/spirv_ops.hpp>
 
+#include "bfloat16.hpp"
+
 // TODO Decide whether to mark functions with this attribute.
 #define __NOEXC /*noexcept*/
 
@@ -37,11 +39,18 @@ namespace experimental {
 // fma_relu returns a * b + c > 0 ? a * b + c : 0
 template <typename T>
 sycl::detail::enable_if_t<sycl::detail::is_genfloath<T>::value ||
-                              sycl::detail::is_ugenshort<T>::value ||
-                              sycl::detail::is_ugenint<T>::value,
+                              sycl::detail::is_ugenint<T>::value ||
+                              std::is_same<T, bfloat16>::value,
                           T>
 fma_relu(T a, T b, T c) __NOEXC {
-  return __sycl_std::__invoke_fma_relu<T>(a, b, c);
+  if constexpr (std::is_same<T, bfloat16>::value) {
+    uint16_t tmp = __sycl_std::__invoke_fma_relu<uint16_t>(
+        reinterpret_cast<uint16_t &>(a), reinterpret_cast<uint16_t &>(b),
+        reinterpret_cast<uint16_t &>(c));
+    return reinterpret_cast<bfloat16 &>(tmp);
+  } else {
+    return __sycl_std::__invoke_fma_relu<T>(a, b, c);
+  }
 }
 
 // Provides functionality to print data from kernels in a C way:
@@ -53,9 +62,9 @@ fma_relu(T a, T b, T c) __NOEXC {
 // Please refer to corresponding section in OpenCL C specification to find
 // information about format string and its differences from standard C rules.
 //
-// This function is placed under 'experimental' namespace on purpose, because it
-// has too much caveats you need to be aware of before using it. Please find
-// them below and read carefully before using it:
+// This function is placed under 'experimental' namespace on purpose, because
+// it has too much caveats you need to be aware of before using it. Please
+// find them below and read carefully before using it:
 //
 // - According to the OpenCL spec, the format string must be
 // resolvable at compile time i.e. cannot be dynamically created by the
@@ -65,19 +74,19 @@ fma_relu(T a, T b, T c) __NOEXC {
 // address space. The constant address space declarations might get "tricky",
 // see test/built-ins/printf.cpp for examples.
 // In simple cases (compile-time known string contents, direct declaration of
-// the format literal inside the printf call, etc.), the compiler should handle
-// the automatic address space conversion.
+// the format literal inside the printf call, etc.), the compiler should
+// handle the automatic address space conversion.
 // FIXME: Once the extension to generic address space is fully supported, the
 // constant AS version may need to be deprecated.
 //
-// - The format string is interpreted according to the OpenCL C spec, where all
-// data types has fixed size, opposed to C++ types which doesn't guarantee
+// - The format string is interpreted according to the OpenCL C spec, where
+// all data types has fixed size, opposed to C++ types which doesn't guarantee
 // the exact width of particular data types (except, may be, char). This might
 // lead to unexpected result, for example: %ld in OpenCL C means that printed
-// argument has 'long' type which is 64-bit wide by the OpenCL C spec. However,
-// by C++ spec long is just at least 32-bit wide, so, you need to ensure (by
-// performing a cast, for example) that if you use %ld specifier, you pass
-// 64-bit argument to the cl::sycl::experimental::printf
+// argument has 'long' type which is 64-bit wide by the OpenCL C spec.
+// However, by C++ spec long is just at least 32-bit wide, so, you need to
+// ensure (by performing a cast, for example) that if you use %ld specifier,
+// you pass 64-bit argument to the cl::sycl::experimental::printf
 //
 // - OpenCL spec defines several additional features, like, for example, 'v'
 // modifier which allows to print OpenCL vectors: note that these features are

From 9fb55df057b22e9cb7b7389103452deb125a1744 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney@codeplay.com>
Date: Mon, 4 Apr 2022 15:20:10 +0100
Subject: [PATCH 14/18] Update doc

---
 .../experimental/sycl_ext_oneapi_fma_relu.asciidoc     | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
index fbfb564dc26bb..8232942f24377 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
@@ -74,9 +74,11 @@ this happens.
 == Overview
 
 This extension introduces the `fma_relu` function for datatypes `sycl::half`,
-`bfloat16` and `bfloat16x2`. `bfloat16` and `bfloat16x2` refer to the bfloat16
-class from the `sycl_ext_*_bf16_conversion` extension, and currently use 
-`uint16_t` and `uint32_t`, respectively, as storage types. 
+`bfloat16` and `bfloat16x2`. `bfloat16` refers to the bfloat16 class from 
+the `sycl_ext_oneapi_bfloat16` extension. `bfloat16x2` has not yet been 
+implemented as a class so this `fma_relu` extension uses `uint32_t` as a
+storage type for `bfloat16x2`. This will be changed once the `bfloat16x2` 
+class has been implemented. 
 
 == Specification
 
@@ -103,7 +105,7 @@ supports.
 
 If `fma_relu` is to be used with either the `bf16` or `bf16x2` datatypes, then
 an implementation must additionally predefine the macro 
-`SYCL_EXT_INTEL_BF16_CONVERSION`, as detailed in 
+`SYCL_EXT_ONEAPI_BFLOAT16`, as detailed in 
 link:./sycl_ext_intel_bf16_conversion.asciidoc[
   sycl_ext_*_bf16_conversion].
 

From 358c943cebb97db860b8ba02d09572baa5a8055d Mon Sep 17 00:00:00 2001
From: Hugh Delaney <46290137+hdelan@users.noreply.github.com>
Date: Mon, 4 Apr 2022 15:44:39 +0100
Subject: [PATCH 15/18] Update
 sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc

Co-authored-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
index 8232942f24377..1e79434a51815 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
@@ -107,7 +107,7 @@ If `fma_relu` is to be used with either the `bf16` or `bf16x2` datatypes, then
 an implementation must additionally predefine the macro 
 `SYCL_EXT_ONEAPI_BFLOAT16`, as detailed in 
 link:./sycl_ext_intel_bf16_conversion.asciidoc[
-  sycl_ext_*_bf16_conversion].
+  sycl_ext_oneapi_bfloat16].
 
 
 [%header,cols="1,5"]

From 7c6d72868c9284692b3e13511d0bb8edd83aad3a Mon Sep 17 00:00:00 2001
From: Hugh Delaney <46290137+hdelan@users.noreply.github.com>
Date: Mon, 4 Apr 2022 15:44:51 +0100
Subject: [PATCH 16/18] Update
 sycl/include/sycl/ext/oneapi/experimental/builtins.hpp

Co-authored-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/ext/oneapi/experimental/builtins.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
index 8fb32a370cfc7..bc8b90a85d142 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -16,7 +16,7 @@
 
 #include <CL/__spirv/spirv_ops.hpp>
 
-#include "bfloat16.hpp"
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
 
 // TODO Decide whether to mark functions with this attribute.
 #define __NOEXC /*noexcept*/

From 390ae97c45574233f61655e5805c5a002e8a022f Mon Sep 17 00:00:00 2001
From: Hugh Delaney <46290137+hdelan@users.noreply.github.com>
Date: Mon, 4 Apr 2022 15:45:51 +0100
Subject: [PATCH 17/18] Update
 sycl/include/sycl/ext/oneapi/experimental/builtins.hpp

Co-authored-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/ext/oneapi/experimental/builtins.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
index bc8b90a85d142..e651072589d97 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -44,7 +44,7 @@ sycl::detail::enable_if_t<sycl::detail::is_genfloath<T>::value ||
                           T>
 fma_relu(T a, T b, T c) __NOEXC {
   if constexpr (std::is_same<T, bfloat16>::value) {
-    uint16_t tmp = __sycl_std::__invoke_fma_relu<uint16_t>(
+    return bfloat16::from_bits(__sycl_std::__invoke_fma_relu<uint16_t>(x.raw(), y.raw(), z.raw()));
         reinterpret_cast<uint16_t &>(a), reinterpret_cast<uint16_t &>(b),
         reinterpret_cast<uint16_t &>(c));
     return reinterpret_cast<bfloat16 &>(tmp);

From f08791a588f8864867570fa350f21ec6ecd41125 Mon Sep 17 00:00:00 2001
From: Hugh Delaney <hugh.delaney@codeplay.com>
Date: Mon, 4 Apr 2022 15:50:02 +0100
Subject: [PATCH 18/18] Using bits instead of reinterpret cast

---
 sycl/include/sycl/ext/oneapi/experimental/builtins.hpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
index e651072589d97..5c272e4f8132b 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -44,10 +44,8 @@ sycl::detail::enable_if_t<sycl::detail::is_genfloath<T>::value ||
                           T>
 fma_relu(T a, T b, T c) __NOEXC {
   if constexpr (std::is_same<T, bfloat16>::value) {
-    return bfloat16::from_bits(__sycl_std::__invoke_fma_relu<uint16_t>(x.raw(), y.raw(), z.raw()));
-        reinterpret_cast<uint16_t &>(a), reinterpret_cast<uint16_t &>(b),
-        reinterpret_cast<uint16_t &>(c));
-    return reinterpret_cast<bfloat16 &>(tmp);
+    return bfloat16::from_bits(
+        __sycl_std::__invoke_fma_relu<uint16_t>(a.raw(), b.raw(), c.raw()));
   } else {
     return __sycl_std::__invoke_fma_relu<T>(a, b, c);
   }