diff --git a/CMakeLists.txt b/CMakeLists.txt
index ab43b11..1ab0fa6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,12 +15,12 @@ set( SHIVA_VERSION_PATCHLEVEL 0 )
 # check if Shiva is build as a submodule or a separate project
 get_directory_property( parent_dir PARENT_DIRECTORY )
 if(parent_dir)
-  set( is_submodule ON )
+  set( SHIVA_IS_SUBMODULE ON )
 else()
-  set( is_submodule OFF )
+  set( SHIVA_IS_SUBMODULE OFF )
 endif()
 
-if( NOT is_submodule )
+if( NOT SHIVA_IS_SUBMODULE )
   message( "not a submodule")
   project( Shiva LANGUAGES CXX C )
@@ -66,12 +66,22 @@ include( cmake/Macros.cmake )
 include( cmake/Config.cmake )
 
+set(SHIVA_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
+set(SHIVA_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR} )
+
+message( STATUS "SHIVA_BINARY_DIR: ${SHIVA_BINARY_DIR}" )
+message( STATUS "SHIVA_SOURCE_DIR: ${SHIVA_SOURCE_DIR}" )
+
+
 add_subdirectory( src )
 
-add_subdirectory( tpl/camp )
-target_compile_options( camp PRIVATE "-Wno-shadow")
-configure_file(tpl/camp/include/camp/config.in.hpp
-               ${PROJECT_BINARY_DIR}/include/camp/config.hpp)
+if( SHIVA_ENABLE_CAMP )
+  add_subdirectory( tpl/camp )
+  target_compile_options( camp PRIVATE "-Wno-shadow")
+
+  configure_file(tpl/camp/include/camp/config.in.hpp
+                 ${PROJECT_BINARY_DIR}/include/camp/config.hpp)
+endif()
 
 if( SHIVA_ENABLE_DOCS )
diff --git a/cmake/CMakeBasics.cmake b/cmake/CMakeBasics.cmake
index cf3c275..82a64a7 100644
--- a/cmake/CMakeBasics.cmake
+++ b/cmake/CMakeBasics.cmake
@@ -27,5 +27,20 @@ blt_append_custom_compiler_flag( FLAGS_VAR CMAKE_CXX_FLAGS_DEBUG
                                  CLANG "-fstandalone-debug"
                                 )
-
-set( CAMP_ENABLE_TESTS OFF CACHE BOOL "")
+option( SHIVA_ENABLE_CAMP "Enable the CAMP TPL" OFF )
+option( CAMP_ENABLE_TESTS "Enable the CAMP unit tests" OFF )
+
+
+if( ENABLE_CUDA )
+  if( CUDA_VERSION AND CUDA_VERSION_MAJOR AND CUDA_VERSION_MINOR )
+    set( SHIVA_CUDA_VERSION ${CUDA_VERSION} )
+    set( SHIVA_CUDA_MAJOR ${CUDA_VERSION_MAJOR} )
+    set( SHIVA_CUDA_MINOR ${CUDA_VERSION_MINOR} )
+  else()
+    message(FATAL_ERROR "CUDA_VERSION_MAJOR and CUDA_VERSION_MINOR not defined")
+  endif()
+else()
+  set( SHIVA_CUDA_VERSION 0 )
+  set( SHIVA_CUDA_MAJOR 0 )
+  set( SHIVA_CUDA_MINOR 0 )
+endif()
diff --git a/cmake/Config.cmake b/cmake/Config.cmake
index 80af36e..849955d 100644
--- a/cmake/Config.cmake
+++ b/cmake/Config.cmake
@@ -1,6 +1,7 @@
 #
 set( PREPROCESSOR_DEFINES CUDA
                           HIP
+                          CAMP
                           BOUNDS_CHECK
    )
diff --git a/cmake/blt b/cmake/blt
index fb4246b..9ff7734 160000
--- a/cmake/blt
+++ b/cmake/blt
@@ -1 +1 @@
-Subproject commit fb4246b8bae74c3d7291bef9698fd38863844680
+Subproject commit 9ff77344f0b2a6ee345e452bddd6bfd46cbbfa35
diff --git a/docs/doxygen/ShivaConfig.hpp b/docs/doxygen/ShivaConfig.hpp
index 390e74a..8e64b01 100644
--- a/docs/doxygen/ShivaConfig.hpp
+++ b/docs/doxygen/ShivaConfig.hpp
@@ -14,4 +14,9 @@
 /* #undef SHIVA_USE_CALIPER */
 
+#define SHIVA_USE_CAMP
+
 #define SHIVA_USE_BOUNDS_CHECK
+
+#define SHIVA_CUDA_MAJOR 0
+#define SHIVA_CUDA_MINOR 0
diff --git a/hostconfigs/TTE/maple_rocky9.cmake b/hostconfigs/TTE/maple_rocky9.cmake
new file mode 100644
index 0000000..e3f4a83
--- /dev/null
+++ b/hostconfigs/TTE/maple_rocky9.cmake
@@ -0,0 +1,24 @@
+set(CONFIG_NAME "maple_rocky9" CACHE PATH "")
+
+set(COMPILER_DIR /opt/rh/gcc-toolset-13/root/ )
+set(CMAKE_C_COMPILER ${COMPILER_DIR}/bin/gcc CACHE PATH "")
+set(CMAKE_CXX_COMPILER ${COMPILER_DIR}/bin/g++ CACHE PATH "")
+
+# C++ options
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -mtune=native -march=native" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g ${CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "")
"") + +# Cuda options +set(ENABLE_CUDA ON CACHE BOOL "") +set(CUDA_TOOLKIT_ROOT_DIR /hrtc/apps/cuda/12.6.20/aarch64/rocky9 CACHE STRING "") +set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING "") +set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc CACHE STRING "") +set(CMAKE_CUDA_ARCHITECTURES 90 CACHE STRING "") +set(CMAKE_CUDA_STANDARD 17 CACHE STRING "") +set(CMAKE_CUDA_FLAGS "-restrict --expt-extended-lambda --expt-relaxed-constexpr -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") +#set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") +#set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "") +#set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "") + +set( SHIVA_ENABLE_CAMP OFF CACHE BOOL "Disable CAMP support" FORCE ) \ No newline at end of file diff --git a/scripts/aggregateOrSplit.py b/scripts/aggregateOrSplit.py index 89baa12..251174e 100644 --- a/scripts/aggregateOrSplit.py +++ b/scripts/aggregateOrSplit.py @@ -20,11 +20,13 @@ def create_dependency_graph(self, header, include_paths=None): if include_paths is None: include_paths = [] + header = os.path.abspath(header) # Normalize here + if header in self.dependencies: return # Already processed self.dependencies[header] = set() - base_path = os.path.dirname(os.path.abspath(header)) # Base directory of the current header + base_path = os.path.dirname(header) # Base directory of the current header try: with open(header, 'r') as file: @@ -34,10 +36,10 @@ def create_dependency_graph(self, header, include_paths=None): included_file = include_match.group(1) if included_file != self.config_file: - resolved_path = self.resolve_path( - included_file, base_path, include_paths) + resolved_path = self.resolve_path( included_file, base_path, include_paths) if resolved_path: + resolved_path = os.path.abspath(resolved_path) self.dependencies[header].add(resolved_path) if os.path.exists(resolved_path): @@ -82,16 +84,21 @@ def resolve_path(self, included_file, base_path, include_paths): return None # Return None if no resolution was possible + def generate_header_list(self): remaining_dependencies = self.dependencies.copy() size_of_remaining_dependencies = len(remaining_dependencies) + unique_files = set() # Track unique files by absolute path while size_of_remaining_dependencies > 0: local_included = [] for key in remaining_dependencies: if len(remaining_dependencies[key]) == 0: - self.included_list.append(key) + abs_key = os.path.abspath(key) + if abs_key not in unique_files: + self.included_list.append(abs_key) + unique_files.add(abs_key) local_included.append(key) for included_key in local_included: @@ -111,6 +118,7 @@ def process_header(header_path, output): """ Processes a single header file, commenting out includes and pragmas. 
""" + header_path = os.path.abspath(header_path) if header_path in self.included: return # Avoid duplicate processing self.included.add(header_path) @@ -133,6 +141,7 @@ def process_header(header_path, output): with open(output_file, 'w') as output: for header in headers: + header = os.path.abspath(header) self.create_dependency_graph(header, include_paths) for header in self.dependencies: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ca02250..b680057 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,7 +23,8 @@ blt_add_library( NAME shiva target_include_directories( shiva INTERFACE - $ + $ + $ $ ) install( FILES ${shiva_headers} diff --git a/src/ShivaConfig.hpp.in b/src/ShivaConfig.hpp.in index d7535f8..7103b82 100644 --- a/src/ShivaConfig.hpp.in +++ b/src/ShivaConfig.hpp.in @@ -14,4 +14,9 @@ #cmakedefine SHIVA_USE_CALIPER -#cmakedefine SHIVA_USE_BOUNDS_CHECK \ No newline at end of file +#cmakedefine SHIVA_USE_CAMP + +#cmakedefine SHIVA_USE_BOUNDS_CHECK + +#define SHIVA_CUDA_MAJOR @SHIVA_CUDA_MAJOR@ +#define SHIVA_CUDA_MINOR @SHIVA_CUDA_MINOR@ diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 683c232..bc450a1 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -35,14 +35,14 @@ blt_add_library( NAME common target_include_directories( common INTERFACE - $ - $ + $ + $ $ ) target_include_directories( common SYSTEM INTERFACE - $ - $ ) + $ + $ ) install( FILES ${common_headers} DESTINATION include/common ) diff --git a/src/common/ShivaMacros.hpp b/src/common/ShivaMacros.hpp index c3280e1..f8ddd3d 100644 --- a/src/common/ShivaMacros.hpp +++ b/src/common/ShivaMacros.hpp @@ -105,6 +105,39 @@ void i_g_n_o_r_e( ARGS const & ... ) {} +/** + * @brief This macro is used to detect the presence of builtin functions. + */ +#ifndef SHIVA_HAS_BUILTIN + #ifdef __has_builtin + #define SHIVA_HAS_BUILTIN( x ) __has_builtin( x ) + #else + #define SHIVA_HAS_BUILTIN( x ) 0 + #endif +#endif + +/** + * @brief Define SHIVA_IS_CONST_EVAL() depending on compiler/toolchain + */ +#if defined(__CUDA_ARCH__) +// Device code (nvcc, hipcc): no support in C++17 + #define SHIVA_IS_CONST_EVAL() (false) + +#elif SHIVA_HAS_BUILTIN( __builtin_is_constant_evaluated ) +// GCC / Clang host code + #define SHIVA_IS_CONST_EVAL() (__builtin_is_constant_evaluated()) + +#elif defined(_MSC_VER) +// MSVC + #define SHIVA_IS_CONST_EVAL() (__is_constant_evaluated()) + +#else +// Fallback: always runtime + #define SHIVA_IS_CONST_EVAL() (false) +#endif + + + /** * @brief This macro is used to implement an assertion. * @param cond The condition to assert is true. @@ -113,7 +146,7 @@ void i_g_n_o_r_e( ARGS const & ... ) {} #define SHIVA_ASSERT_MSG( cond, ... 
   do { \
     if ( !(cond)) { \
-      if ( !__builtin_is_constant_evaluated()) { \
+      if ( !SHIVA_IS_CONST_EVAL() ) { \
         shivaAssertionFailed( __FILE__, __LINE__, true, __VA_ARGS__ ); \
       } \
     } \
diff --git a/src/common/pmpl.hpp b/src/common/pmpl.hpp
index 2e0baa1..eb8d3b8 100644
--- a/src/common/pmpl.hpp
+++ b/src/common/pmpl.hpp
@@ -28,25 +28,29 @@ namespace shiva
 {
 
 #if defined(SHIVA_USE_DEVICE)
-  #if defined(SHIVA_USE_CUDA)
-    #define deviceMalloc( PTR, BYTES ) cudaMalloc( PTR, BYTES );
-    #define deviceMallocManaged( PTR, BYTES ) cudaMallocManaged( PTR, BYTES );
-    #define deviceDeviceSynchronize() cudaDeviceSynchronize();
-    #define deviceMemCpy( DST, SRC, BYTES, KIND ) cudaMemcpy( DST, SRC, BYTES, KIND );
-    #define deviceFree( PTR ) cudaFree( PTR );
-    #define deviceError_t cudaError_t
-    #define deviceSuccess cudaSuccess
-    #define deviceGetErrorString cudaGetErrorString
-  #elif defined(SHIVA_USE_HIP)
-    #define deviceMalloc( PTR, BYTES ) hipMalloc( PTR, BYTES );
-    #define deviceMallocManaged( PTR, BYTES ) hipMallocManaged( PTR, BYTES );
-    #define deviceDeviceSynchronize() hipDeviceSynchronize();
-    #define deviceMemCpy( DST, SRC, BYTES, KIND ) hipMemcpy( DST, SRC, BYTES, KIND );
-    #define deviceFree( PTR ) hipFree( PTR );
-    #define deviceError_t hipError_t
-    #define deviceSuccess = hipSuccess;
-    #define deviceGetErrorString hipGetErrorString
-  #endif
+#if defined(SHIVA_USE_CUDA)
+#define deviceMalloc( PTR, BYTES ) cudaMalloc( PTR, BYTES );
+#define deviceMallocManaged( PTR, BYTES ) cudaMallocManaged( PTR, BYTES );
+#define deviceDeviceSynchronize() cudaDeviceSynchronize();
+#define deviceMemCpy( DST, SRC, BYTES, KIND ) cudaMemcpy( DST, SRC, BYTES, KIND );
+#define deviceFree( PTR ) cudaFree( PTR );
+#define deviceError_t cudaError_t
+#define deviceGetErrorString cudaGetErrorString
+#define deviceMemcpyHostToDevice cudaMemcpyHostToDevice
+#define deviceMemcpyDeviceToHost cudaMemcpyDeviceToHost
+constexpr cudaError_t deviceSuccess = cudaSuccess;
+#elif defined(SHIVA_USE_HIP)
+#define deviceMalloc( PTR, BYTES ) hipMalloc( PTR, BYTES );
+#define deviceMallocManaged( PTR, BYTES ) hipMallocManaged( PTR, BYTES );
+#define deviceDeviceSynchronize() hipDeviceSynchronize();
+#define deviceMemCpy( DST, SRC, BYTES, KIND ) hipMemcpy( DST, SRC, BYTES, KIND );
+#define deviceFree( PTR ) hipFree( PTR );
+#define deviceError_t hipError_t
+#define deviceGetErrorString hipGetErrorString
+#define deviceMemcpyHostToDevice hipMemcpyHostToDevice
+#define deviceMemcpyDeviceToHost hipMemcpyDeviceToHost
+constexpr hipError_t deviceSuccess = hipSuccess;
+#endif
 #endif
 
 /**
@@ -100,9 +104,9 @@ void genericKernelWrapper( LAMBDA && func, bool const abortOnError = true )
 #if defined(SHIVA_USE_DEVICE)
 // UNCRUSTIFY-OFF
   genericKernel <<< 1, 1 >>> ( std::forward< LAMBDA >( func ) );
-  //UNCRUSTIFY-ON
+  // UNCRUSTIFY-ON
   deviceError_t err = deviceDeviceSynchronize();
-  if ( err != cudaSuccess )
+  if ( err != deviceSuccess )
   {
     printf( "Kernel failed: %s\n", deviceGetErrorString( err ));
     if ( abortOnError )
@@ -157,13 +161,14 @@ void genericKernelWrapper( int const N, DATA_TYPE * const hostData, LAMBDA && fu
 #if defined(SHIVA_USE_DEVICE)
   DATA_TYPE * deviceData;
   deviceMalloc( &deviceData, N * sizeof(DATA_TYPE) );
+  deviceMemCpy( deviceData, hostData, N * sizeof(DATA_TYPE), deviceMemcpyHostToDevice );
 // UNCRUSTIFY-OFF
   genericKernel <<< 1, 1 >>> ( std::forward< LAMBDA >( func ), deviceData );
 // UNCRUSTIFY-ON
   deviceError_t err = deviceDeviceSynchronize();
-  deviceMemCpy( hostData, deviceData, N * sizeof(DATA_TYPE), cudaMemcpyDeviceToHost );
+  deviceMemCpy( hostData, deviceData, N * sizeof(DATA_TYPE), deviceMemcpyDeviceToHost );
   deviceFree( deviceData );
-  if ( err != cudaSuccess )
+  if ( err != deviceSuccess )
   {
     printf( "Kernel failed: %s\n", deviceGetErrorString( err ));
     if ( abortOnError )
diff --git a/src/common/types.hpp b/src/common/types.hpp
index ff2afd0..9c1ce28 100644
--- a/src/common/types.hpp
+++ b/src/common/types.hpp
@@ -20,7 +20,6 @@
 #include "common/ShivaMacros.hpp"
 
 /// @brief Macro to define whether or not to use camp.
-#define SHIVA_USE_CAMP
 #if defined(SHIVA_USE_CAMP)
   #include <camp/camp.hpp>
 #else
@@ -52,7 +51,9 @@
  * @return A tuple with the elements passed as arguments.
  */
 template< typename ... T >
-SHIVA_CONSTEXPR_HOSTDEVICE_FORCEINLINE auto make_tuple( T && ... t )
+SHIVA_CONSTEXPR_HOSTDEVICE_FORCEINLINE
+auto
+make_tuple( T && ... t )
 {
   return camp::make_tuple( std::forward< T >( t ) ... );
 }
@@ -65,6 +66,7 @@
  */
 template< typename ... T >
 using tuple = cuda::std::tuple< T ... >;
+using cuda::std::get;
 
 /**
  * @brief Wrapper for cuda::std::make_tuple.
@@ -73,6 +75,7 @@ using tuple = cuda::std::tuple< T ... >;
  * @return A tuple with the elements passed as arguments.
  */
 template< typename ... T >
+SHIVA_CONSTEXPR_HOSTDEVICE_FORCEINLINE
 auto make_tuple( T && ... t )
 {
   return cuda::std::make_tuple( std::forward< T >( t ) ... );
@@ -92,7 +95,9 @@ using tuple = std::tuple< T ... >;
  * @return A tuple with the elements passed as arguments.
  */
 template< typename ... T >
-auto make_tuple( T && ... t )
+SHIVA_CONSTEXPR_HOSTDEVICE_FORCEINLINE
+auto
+make_tuple( T && ... t )
 {
   return std::make_tuple( std::forward< T >( t ) ... );
 }
diff --git a/src/common/unitTests/testSequenceUtilities.cpp b/src/common/unitTests/testSequenceUtilities.cpp
index fae3cc1..7764e96 100644
--- a/src/common/unitTests/testSequenceUtilities.cpp
+++ b/src/common/unitTests/testSequenceUtilities.cpp
@@ -72,7 +72,7 @@ void testNestedSequenceExpansionLambdaHelper()
     return ( executeSequence< 10 >
              ( [ h = Data::h, aa = std::integral_constant< int, a >{} ] ( auto const ... b ) constexpr
-    { return ( (h[aa] * h[b]) + ...); }
+              { return ( (h[aa] * h[b]) + ...); }
              ) + ...
           );
   } );
@@ -119,7 +119,7 @@ void testSequenceExpansionTemplateLambdaHelper()
   kernelLaunch( [] SHIVA_HOST_DEVICE ()
   {
     constexpr int staticSum0 =
-      executeSequence< 10 >( [&] < int ... a > () constexpr
+      executeSequence< 10 >( [&]< int ... a > () constexpr
     {
       return (Data::h[a] + ...);
     } );
@@ -139,13 +139,13 @@
 {
   kernelLaunch( [] SHIVA_HOST_DEVICE ()
   {
-    constexpr int staticSum0 = executeSequence< 10 >( [&] < int ... a > () constexpr
+    constexpr int staticSum0 = executeSequence< 10 >( [&]< int ... a > () constexpr
     {
       return ( executeSequence< 10 >
                (
-                 [ h = Data::h, aa = std::integral_constant< int, a >{} ] < int ... b > () constexpr
-    { return ( (h[aa] * h[b]) + ...); }
+                 [ h = Data::h, aa = std::integral_constant< int, a >{} ]< int ... b > () constexpr
+                 { return ( (h[aa] * h[b]) + ...); }
               ) + ...
           );
   } );
@@ -167,7 +167,7 @@ void testForSequenceTemplateLambdaHelper()
   {
     int staticSum0 = 0;
     forSequence< 10 >(
-      [&] < int a > () constexpr
+      [&]< int a > () constexpr
     {
       staticSum0 += h[a];
     } );
diff --git a/src/discretizations/finiteElementMethod/parentElements/ParentElement.hpp b/src/discretizations/finiteElementMethod/parentElements/ParentElement.hpp
index 02c74e6..9d4bdb1 100644
--- a/src/discretizations/finiteElementMethod/parentElements/ParentElement.hpp
+++ b/src/discretizations/finiteElementMethod/parentElements/ParentElement.hpp
@@ -110,9 +110,9 @@ class ParentElement
     REAL_TYPE rval = {0};
     forNestedSequence< BASIS_TYPE::numSupportPoints... >( [&] ( auto const ... ic_indices ) constexpr
-    {
-      rval = rval + ( value< decltype(ic_indices)::value ... >( parentCoord ) * var( decltype(ic_indices)::value ... ) );
-    } );
+      {
+        rval = rval + ( value< decltype(ic_indices)::value ... >( parentCoord ) * var( decltype(ic_indices)::value ... ) );
+      } );
     return rval;
   }
@@ -130,13 +130,13 @@
   {
     CArrayNd< RealType, numDims > rval = {0.0};
     forNestedSequence< BASIS_TYPE::numSupportPoints... >( [&] ( auto const ... ic_indices ) constexpr
-    {
-      CArrayNd< RealType, numDims > const grad = gradient< decltype(ic_indices)::value ... >( parentCoord );
-      forSequence< numDims >( [&] ( auto const a ) constexpr
-      {
-        rval( a ) = rval( a ) + grad( a ) * var( decltype(ic_indices)::value ... );
-      } );
-    } );
+      {
+        CArrayNd< RealType, numDims > const grad = gradient< decltype(ic_indices)::value ... >( parentCoord );
+        forSequence< numDims >( [&] ( auto const a ) constexpr
+        {
+          rval( a ) = rval( a ) + grad( a ) * var( decltype(ic_indices)::value ... );
+        } );
+      } );
     return rval;
   }
diff --git a/src/functions/bases/BasisProduct.hpp b/src/functions/bases/BasisProduct.hpp
index b0f00ab..28f1075 100644
--- a/src/functions/bases/BasisProduct.hpp
+++ b/src/functions/bases/BasisProduct.hpp
@@ -89,21 +89,21 @@ struct BasisProduct
   {
     static_assert( sizeof...(BASIS_FUNCTION_INDICES) == numDims, "Wrong number of basis function indicies specified" );
-    return
+
 #if __cplusplus >= 202002L
-    // expand pack over number of dimensions
-    executeSequence< numDims >( [&] < int ... PRODUCT_TERM_INDEX > () constexpr
-    {
-      return ( BASIS_TYPE::template value< BASIS_FUNCTION_INDICES >( parentCoord[PRODUCT_TERM_INDEX] ) * ... );
-    } );
+    // expand pack over number of dimensions
+    return executeSequence< numDims >( [&]< int ... PRODUCT_TERM_INDEX > () constexpr
+      {
+        return ( BASIS_TYPE::template value< BASIS_FUNCTION_INDICES >( parentCoord[PRODUCT_TERM_INDEX] ) * ... );
+      } );
 #else
-    executeSequence< numDims >( [&] ( auto ... PRODUCT_TERM_INDEX ) constexpr
-    {
-      // fold expression to multiply the value of each BASIS_TYPE in each
-      // dimension. In other words the fold expands on BASIS_TYPE...,
-      // BASIS_FUNCTION_INDICES..., and PRODUCT_TERM_INDEX... together.
-      return ( BASIS_TYPES::template value< BASIS_FUNCTION_INDICES >( parentCoord[decltype(PRODUCT_TERM_INDEX)::value] ) * ... );
-    } );
+    return executeSequence< numDims >( [&] ( auto ... PRODUCT_TERM_INDEX ) constexpr
+      {
+        // fold expression to multiply the value of each BASIS_TYPE in each
+        // dimension. In other words the fold expands on BASIS_TYPE...,
+        // BASIS_FUNCTION_INDICES..., and PRODUCT_TERM_INDEX... together.
+        return ( BASIS_TYPES::template value< BASIS_FUNCTION_INDICES >( parentCoord[decltype(PRODUCT_TERM_INDEX)::value] ) * ... );
+      } );
 #endif
   }
 
@@ -133,40 +133,40 @@ struct BasisProduct
     static_assert( sizeof...(BASIS_FUNCTION_INDICES) == numDims, "Wrong number of basis function indicies specified" );
 
 #if __cplusplus >= 202002L
-    return executeSequence< numDims >( [&] < int ... i > () constexpr->CArrayNd< RealType, numDims >
-    {
-      auto gradientComponent = [&] ( auto const iGrad,
-                                     auto const ... PRODUCT_TERM_INDICES ) constexpr
-      {
-        // Ca
-        return ( gradientComponentHelper< BASIS_TYPES,
-                                          decltype(iGrad)::value,
-                                          BASIS_FUNCTION_INDICES,
-                                          PRODUCT_TERM_INDICES >( parentCoord ) * ... );
-      };
-
-      return { (executeSequence< numDims >( gradientComponent, std::integral_constant< int, i >{} ) )... };
-    } );
+    return executeSequence< numDims >( [&]< int ... i > () constexpr->CArrayNd< RealType, numDims >
+      {
+        auto gradientComponent = [&] ( auto const iGrad,
+                                       auto const ... PRODUCT_TERM_INDICES ) constexpr
+        {
+          // Ca
+          return ( gradientComponentHelper< BASIS_TYPES,
+                                            decltype(iGrad)::value,
+                                            BASIS_FUNCTION_INDICES,
+                                            PRODUCT_TERM_INDICES >( parentCoord ) * ... );
+        };
+
+        return { (executeSequence< numDims >( gradientComponent, std::integral_constant< int, i >{} ) )... };
+      } );
 #else
     // Expand over the dimensions.
     return executeSequence< numDims >( [&] ( auto ... a ) constexpr->CArrayNd< RealType, numDims >
-    {
-      // define a lambda that calculates the gradient of the basis function in
-      // a single dimension/direction.
-      auto gradientComponent = [&] ( auto GRADIENT_COMPONENT, auto ... PRODUCT_TERM_INDICES ) constexpr
-      {
-        // fold expression calling gradientComponentHelper using expanding on
-        // BASIS_TYPE, BASIS_FUNCTION_INDICES, and PRODUCT_TERM_INDICES.
-        return ( gradientComponentHelper< BASIS_TYPES,
-                                          decltype(GRADIENT_COMPONENT)::value,
-                                          BASIS_FUNCTION_INDICES,
-                                          decltype(PRODUCT_TERM_INDICES)::value >( parentCoord ) * ... );
-      };
-
-      // execute the gradientComponent lambda on each direction, expand the
-      // pack on "i" corresponding to each direction of the gradient.
-      return { (executeSequence< numDims >( gradientComponent, a ) )... };
-    } );
+      {
+        // define a lambda that calculates the gradient of the basis function in
+        // a single dimension/direction.
+        auto gradientComponent = [&] ( auto GRADIENT_COMPONENT, auto ... PRODUCT_TERM_INDICES ) constexpr
+        {
+          // fold expression calling gradientComponentHelper using expanding on
+          // BASIS_TYPE, BASIS_FUNCTION_INDICES, and PRODUCT_TERM_INDICES.
+          return ( gradientComponentHelper< BASIS_TYPES,
+                                            decltype(GRADIENT_COMPONENT)::value,
+                                            BASIS_FUNCTION_INDICES,
+                                            decltype(PRODUCT_TERM_INDICES)::value >( parentCoord ) * ... );
+        };
+
+        // execute the gradientComponent lambda on each direction, expand the
+        // pack on "i" corresponding to each direction of the gradient.
+        return { (executeSequence< numDims >( gradientComponent, a ) )... };
+      } );
 #endif
   }
diff --git a/src/functions/bases/LagrangeBasis.hpp b/src/functions/bases/LagrangeBasis.hpp
index ab2b330..e18bb4a 100644
--- a/src/functions/bases/LagrangeBasis.hpp
+++ b/src/functions/bases/LagrangeBasis.hpp
@@ -82,17 +82,17 @@ class LagrangeBasis : public SPACING_TYPE< REAL_TYPE, ORDER + 1 >
   value( REAL_TYPE const & coord )
   {
 #if __cplusplus >= 202002L
-    return executeSequence< numSupportPoints >( [&] < int ... a > () constexpr
-    {
-      // return fold expression that is the product of all the polynomial
-      // factor terms.
-      return ( valueProductTerm< BF_INDEX, a >( coord ) * ... );
-    } );
+    return executeSequence< numSupportPoints >( [&]< int ... a > () constexpr
+      {
+        // return fold expression that is the product of all the polynomial
+        // factor terms.
+        return ( valueProductTerm< BF_INDEX, a >( coord ) * ... );
+      } );
 #else
     return executeSequence< numSupportPoints >( [&] ( auto const ... a ) constexpr
-    {
-      return ( valueProductTerm< BF_INDEX, decltype(a)::value >( coord ) * ... );
-    } );
+      {
+        return ( valueProductTerm< BF_INDEX, decltype(a)::value >( coord ) * ... );
+      } );
 #endif
   }
 
@@ -117,28 +117,28 @@ class LagrangeBasis : public SPACING_TYPE< REAL_TYPE, ORDER + 1 >
   {
 #if __cplusplus >= 202002L
-    return executeSequence< numSupportPoints >( [&coord] < int ... a > () constexpr
-    {
-      auto func = [&coord] < int ... b > ( auto aa ) constexpr
-      {
-        constexpr int aVal = decltype(aa)::value;
-        return gradientOfValueTerm< BF_INDEX, aVal >() * ( valueProductFactor< BF_INDEX, b, aVal >( coord ) * ... );
-      };
-
-      return ( executeSequence< numSupportPoints >( func, std::integral_constant< int, a >{} ) + ... );
-    } );
+    return executeSequence< numSupportPoints >( [&coord]< int ... a > () constexpr
+      {
+        auto func = [&coord]< int ... b > ( auto aa ) constexpr
+        {
+          constexpr int aVal = decltype(aa)::value;
+          return gradientOfValueTerm< BF_INDEX, aVal >() * ( valueProductFactor< BF_INDEX, b, aVal >( coord ) * ... );
+        };
+
+        return ( executeSequence< numSupportPoints >( func, std::integral_constant< int, a >{} ) + ... );
+      } );
 #else
     return executeSequence< numSupportPoints >( [&coord] ( auto const ... a ) constexpr
-    {
-      REAL_TYPE const values[ numSupportPoints ] = { valueProductTerm< BF_INDEX, decltype(a)::value >( coord )... };
-      auto func = [&values] ( auto aa, auto ... b ) constexpr
-      {
-        constexpr int aVal = decltype(aa)::value;
-        return gradientOfValueTerm< BF_INDEX, aVal >() * ( valueProductFactor< decltype(b)::value, aVal >( values ) * ... );
-      };
-
-      return ( executeSequence< numSupportPoints >( func, a ) + ... );
-    } );
+      {
+        REAL_TYPE const values[ numSupportPoints ] = { valueProductTerm< BF_INDEX, decltype(a)::value >( coord )... };
+        auto func = [&values] ( auto aa, auto ... b ) constexpr
+        {
+          constexpr int aVal = decltype(aa)::value;
+          return gradientOfValueTerm< BF_INDEX, aVal >() * ( valueProductFactor< decltype(b)::value, aVal >( values ) * ... );
+        };
+
+        return ( executeSequence< numSupportPoints >( func, a ) + ... );
+      } );
 #endif
   }
diff --git a/src/functions/quadrature/Quadrature.hpp b/src/functions/quadrature/Quadrature.hpp
index d0c466e..1c43301 100644
--- a/src/functions/quadrature/Quadrature.hpp
+++ b/src/functions/quadrature/Quadrature.hpp
@@ -53,13 +53,13 @@ struct QuadratureGaussLegendre : public GaussLegendreSpacing< REAL_TYPE, N >
     }
     else if constexpr ( N == 3 )
     {
-      assert( index >= 0 && index < 3 );
+      //assert( index >= 0 && index < 3 );
       return 0.5555555555555555555555555555555556 + 0.3333333333333333333333333333333333 * ( index & 1 );
     }
     else if constexpr ( N == 4 )
     {
-      assert( index >= 0 && index < 4 );
+      //assert( index >= 0 && index < 4 );
       return 0.5 + ( -1 + ( ( ( index + 1 ) & 2 ) ) ) * 0.15214515486254614262693605077800059277;
     }
     return std::numeric_limits< REAL_TYPE >::max();
@@ -145,17 +145,17 @@ struct QuadratureGaussLobatto : public GaussLobattoSpacing< REAL_TYPE, N >
     }
     else if constexpr ( N == 3 )
     {
-      assert( index >= 0 && index < 3 );
+      //assert( index >= 0 && index < 3 );
       return 0.3333333333333333333333333333333333 + ( index & 1 );
     }
     else if constexpr ( N == 4 )
     {
-      assert( index >= 0 && index < 4 );
+      //assert( index >= 0 && index < 4 );
       return 0.1666666666666666666666666666666667 + ( ((index + 1) & 2) >> 1 ) * 0.6666666666666666666666666666666667;
     }
     else if constexpr ( N == 5 )
     {
-      assert( index >= 0 && index < 5 );
+      //assert( index >= 0 && index < 5 );
       return 0.1 + (index & 1) * 0.4444444444444444444444444444444444 + !( index - 2 ) * 0.6111111111111111111111111111111111;
     }
     return std::numeric_limits< REAL_TYPE >::max();
diff --git a/src/geometry/mapping/LinearTransform.hpp b/src/geometry/mapping/LinearTransform.hpp
index 86b9536..7fde65f 100644
--- a/src/geometry/mapping/LinearTransform.hpp
+++ b/src/geometry/mapping/LinearTransform.hpp
@@ -169,17 +169,17 @@ jacobian( LinearTransform< REAL_TYPE, INTERPOLATED_SHAPE > const & transform,
   auto const & nodeCoords = transform.getData();
 
   InterpolatedShape::supportLoop( [&] ( auto const ... ic_spIndices ) constexpr
-  {
-    CArrayNd< REAL_TYPE, DIMS > const dNadXi = InterpolatedShape::template gradient< decltype(ic_spIndices)::value ... >( pointCoordsParent );
-    // dimensional loop from domain to codomain
-    forNestedSequence< DIMS, DIMS >( [&] ( auto const ici, auto const icj ) constexpr
-    {
-      constexpr int i = decltype(ici)::value;
-      constexpr int j = decltype(icj)::value;
-      J( i, j ) = J( i, j ) + dNadXi( j ) * nodeCoords( decltype(ic_spIndices)::value ..., i );
-    } );
-
-  } );
+    {
+      CArrayNd< REAL_TYPE, DIMS > const dNadXi = InterpolatedShape::template gradient< decltype(ic_spIndices)::value ... >( pointCoordsParent );
+      // dimensional loop from domain to codomain
+      forNestedSequence< DIMS, DIMS >( [&] ( auto const ici, auto const icj ) constexpr
+      {
+        constexpr int i = decltype(ici)::value;
+        constexpr int j = decltype(icj)::value;
+        J( i, j ) = J( i, j ) + dNadXi( j ) * nodeCoords( decltype(ic_spIndices)::value ..., i );
+      } );
+
+    } );
 }
 
@@ -208,35 +208,35 @@ SHIVA_STATIC_CONSTEXPR_HOSTDEVICE_FORCEINLINE void
 jacobian( LinearTransform< REAL_TYPE, INTERPOLATED_SHAPE > const & transform,
           typename LinearTransform< REAL_TYPE, INTERPOLATED_SHAPE >::JacobianType & J )
 {
-  using Transform = std::remove_reference_t< decltype(transform) >;
+  using Transform = LinearTransform< REAL_TYPE, INTERPOLATED_SHAPE >;
   using InterpolatedShape = typename Transform::InterpolatedShape;
   constexpr int DIMS = Transform::numDims;
   auto const & nodeCoords = transform.getData();
 
-  constexpr double qcoords[3] = { ( QUADRATURE::template coordinate< QA >() )... };
+  constexpr REAL_TYPE qcoords[3] = { ( QUADRATURE::template coordinate< QA >() )... };
 
   InterpolatedShape::supportLoop( [&] ( auto const ... ic_spIndices ) constexpr
-  {
-    constexpr CArrayNd< REAL_TYPE, DIMS > dNadXi = InterpolatedShape::template gradient< decltype(ic_spIndices)::value ... >( qcoords );
-
-    // dimensional loop from domain to codomain
-  #if 0
-    forNestedSequence< DIMS, DIMS >( [&] ( auto const ici, auto const icj ) constexpr
-    {
-      constexpr int i = decltype(ici)::value;
-      constexpr int j = decltype(icj)::value;
-      J( j, i ) = J( j, i ) + dNadXi( i ) * nodeCoords( decltype(ic_spIndices)::value ..., j );
-    } );
+    {
+      constexpr CArrayNd< REAL_TYPE, DIMS > dNadXi = InterpolatedShape::template gradient< decltype(ic_spIndices)::value ... >( qcoords );
+
+      // dimensional loop from domain to codomain
+  #if 1
+      forNestedSequence< DIMS, DIMS >( [&] ( auto const ici, auto const icj ) constexpr
+      {
+        constexpr int i = decltype(ici)::value;
+        constexpr int j = decltype(icj)::value;
+        J( j, i ) = J( j, i ) + dNadXi( i ) * nodeCoords( decltype(ic_spIndices)::value ..., j );
+      } );
 #else
-    for ( int j = 0; j < DIMS; ++j )
-    {
-      for ( int i = 0; i < DIMS; ++i )
-      {
-        J( j, i ) = J( j, i ) + dNadXi( i ) * nodeCoords( decltype(ic_spIndices)::value ..., j );
-      }
-    }
+      for ( int j = 0; j < DIMS; ++j )
+      {
+        for ( int i = 0; i < DIMS; ++i )
+        {
+          J( j, i ) = J( j, i ) + dNadXi( i ) * nodeCoords( decltype(ic_spIndices)::value ..., j );
+        }
+      }
 #endif
-  } );
+    } );
 }
diff --git a/src/geometry/mapping/unitTests/testLinearTransform.cpp b/src/geometry/mapping/unitTests/testLinearTransform.cpp
index 9087bb6..5a575bf 100644
--- a/src/geometry/mapping/unitTests/testLinearTransform.cpp
+++ b/src/geometry/mapping/unitTests/testLinearTransform.cpp
@@ -305,7 +305,13 @@ void testInvJacobianFunctionReturnByValueHelper()
   auto cell = makeLinearTransform( Xref );
   for ( int q = 0; q < 8; ++q )
   {
+#if defined(SHIVA_USE_CUDA) && SHIVA_CUDA_MAJOR < 12
+    auto tmp = inverseJacobian( cell, qCoords[q] );
+    auto detJ = shiva::get< 0 >( tmp );
+    auto invJ = shiva::get< 1 >( tmp );
+#else
     auto [ detJ, invJ ] = inverseJacobian( cell, qCoords[q] );
+#endif
     kernelData[ 10 * q ] = detJ;
 
     for ( int i = 0; i < 3; ++i )
diff --git a/src/geometry/mapping/unitTests/testScaling.cpp b/src/geometry/mapping/unitTests/testScaling.cpp
index 91f8129..b83bbb2 100644
--- a/src/geometry/mapping/unitTests/testScaling.cpp
+++ b/src/geometry/mapping/unitTests/testScaling.cpp
@@ -143,7 +143,13 @@ void testInvJacobianFunctionReturnByValueHelper()
 {
   auto cell = makeScaling( h );
 
-  auto [ detJ, invJ ] = inverseJacobian( cell );
+#if defined(SHIVA_USE_CUDA) && SHIVA_CUDA_MAJOR < 12
+  auto tmp = inverseJacobian( cell );
+  auto detJ = shiva::get< 0 >( tmp );
+  auto invJ = shiva::get< 1 >( tmp );
+#else
+  auto [detJ, invJ] = inverseJacobian( cell );
+#endif
   kdata[0] = detJ;
   kdata[1] = invJ( 0 );
   kdata[2] = invJ( 1 );
diff --git a/src/geometry/mapping/unitTests/testUniformScaling.cpp b/src/geometry/mapping/unitTests/testUniformScaling.cpp
index bc3c93a..543ae70 100644
--- a/src/geometry/mapping/unitTests/testUniformScaling.cpp
+++ b/src/geometry/mapping/unitTests/testUniformScaling.cpp
@@ -99,7 +99,14 @@ TEST( testUniformScaling, testInvJacobianFunctionReturnByValue )
   double const h = 3.14;
   auto cell = makeUniformScaling( h );
 
-  auto [ detJ, invJ ] = inverseJacobian( cell );
+  // libcudacxx in CUDA 11 lacks structured-binding support for cuda::std::tuple
+#if defined(SHIVA_USE_CUDA) && SHIVA_CUDA_MAJOR < 12
+  auto tmp = inverseJacobian( cell );
+  auto detJ = shiva::get< 0 >( tmp );
+  auto invJ = shiva::get< 1 >( tmp );
+#else
+  auto [detJ, invJ] = inverseJacobian( cell );
+#endif
   EXPECT_EQ( detJ, 0.125 * h * h * h );
   EXPECT_EQ( invJ( 0 ), ( 2 / h ) );
 }
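
A note on scripts/aggregateOrSplit.py: generate_header_list() is a Kahn-style topological sort. Each pass emits every header whose dependency set is empty (now deduplicated by absolute path), erases those headers from the sets of the remaining entries, and repeats until nothing is left. A minimal C++ sketch of that ordering algorithm, with illustrative names and a cycle bail-out that the Python script itself does not have:

#include <map>
#include <set>
#include <string>
#include <vector>

// Kahn-style ordering: repeatedly emit nodes with no remaining
// dependencies, then erase them from the sets of the remaining nodes.
// Returns headers in an order where every dependency precedes its user.
std::vector< std::string >
orderHeaders( std::map< std::string, std::set< std::string > > remaining )
{
  std::vector< std::string > ordered;
  std::set< std::string > emitted; // dedup, mirroring unique_files in the script
  while( !remaining.empty() )
  {
    std::vector< std::string > ready;
    for( auto const & [ header, deps ] : remaining )
    {
      if( deps.empty() )
      {
        ready.push_back( header );
      }
    }
    if( ready.empty() )
    {
      break; // dependency cycle: bail out rather than loop forever
    }
    for( auto const & header : ready )
    {
      if( emitted.insert( header ).second )
      {
        ordered.push_back( header );
      }
      remaining.erase( header );
      for( auto & [ key, deps ] : remaining )
      {
        deps.erase( header ); // this header is now satisfied everywhere
      }
    }
  }
  return ordered;
}

Normalizing keys to absolute paths before this loop, as the script now does, is what guarantees a header reached via two different relative paths is emitted only once.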
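The SHIVA_IS_CONST_EVAL() macro exists so that SHIVA_ASSERT_MSG can skip its runtime failure handler during constant evaluation, where calling a non-constexpr function such as shivaAssertionFailed would make the enclosing constant expression ill-formed. A standalone C++17 sketch of the same detection-plus-guard pattern (all names here are illustrative, not Shiva's):

#include <cstdio>

// Prefer the compiler builtin when available, fall back to
// "always runtime" otherwise.
#ifdef __has_builtin
  #define HAS_BUILTIN( x ) __has_builtin( x )
#else
  #define HAS_BUILTIN( x ) 0
#endif

#if HAS_BUILTIN( __builtin_is_constant_evaluated )
  #define IS_CONST_EVAL() ( __builtin_is_constant_evaluated() )
#else
  #define IS_CONST_EVAL() ( false )
#endif

// A constexpr function that reports failures at runtime but remains
// usable in constant expressions: the printf branch is skipped when
// evaluation happens at compile time.
constexpr int checkedIncrement( int i )
{
  if( i < 0 )
  {
    if( !IS_CONST_EVAL() )
    {
      std::printf( "assertion failed: i >= 0\n" );
    }
  }
  return i + 1;
}

int main()
{
  constexpr int a = checkedIncrement( 1 ); // evaluated at compile time, no handler call
  int b = checkedIncrement( -1 );          // prints the failure at runtime
  return a + b;
}

On device passes (__CUDA_ARCH__ defined) the Shiva macro hard-codes false, since the builtin is not usable in C++17 device code; the cost is that device-side constant evaluation cannot suppress the handler, which is exactly why the diff routes the check through one macro instead of calling the builtin directly.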
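With the host-to-device copy added above, the array overload of genericKernelWrapper now round-trips data: hostData is copied into a device allocation, the kernel lambda mutates the device copy, and the result is copied back after synchronization (in non-device builds the lambda presumably runs directly on hostData). A hedged usage sketch, assuming the wrapper is reachable through the shiva namespace shown in the pmpl.hpp excerpt and that the lambda receives a pointer of the array's element type:

#include "common/pmpl.hpp"

// Fill a small scratch array on the device (or directly on the host in
// non-device builds) and read the results back through the host array.
void writeSquares()
{
  double data[ 8 ] = { 0 };
  shiva::genericKernelWrapper( 8, data, [] SHIVA_HOST_DEVICE ( double * const kernelData )
  {
    for( int i = 0; i < 8; ++i )
    {
      kernelData[ i ] = static_cast< double >( i ) * i; // device-side copy of data
    }
  } );
  // data[i] == i*i on all backends once the wrapper returns.
}

Before this change the wrapper only copied device-to-host, so a kernel that read its input (rather than purely writing) saw uninitialized device memory; the new deviceMemCpy before the launch closes that gap.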
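The guarded blocks in the three mapping tests all work around the same limitation: the libcudacxx shipped with CUDA 11 cannot destructure cuda::std::tuple with structured bindings, which is what the new SHIVA_CUDA_MAJOR plumbing in CMakeBasics.cmake and ShivaConfig.hpp.in makes detectable at compile time. The fallback as a standalone sketch (unpackPair is a hypothetical helper, not part of Shiva; shiva::get is the using-declaration added to types.hpp in this diff):

#include "common/types.hpp"

// inverseJacobian-style functions return a (detJ, invJ) pair as a
// shiva::tuple. On CUDA 11 builds, unpack with get<>() instead of
// structured bindings.
template< typename TUPLE >
void unpackPair( TUPLE const & t )
{
#if defined(SHIVA_USE_CUDA) && SHIVA_CUDA_MAJOR < 12
  auto detJ = shiva::get< 0 >( t );
  auto invJ = shiva::get< 1 >( t );
#else
  auto [ detJ, invJ ] = t;
#endif
  // ... use detJ and invJ ...
  (void)detJ;
  (void)invJ;
}

Keying the guard on the generated SHIVA_CUDA_MAJOR macro, rather than on SHIVA_USE_CUDA alone, lets CUDA 12+ builds keep the cleaner structured-binding form.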