From 78bd6f2087cca67bc323fb7aa9a9b59d29274497 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 7 Aug 2025 10:18:18 +0200 Subject: [PATCH 1/7] Move some macros that should be in xobjects to it; black --- .gitignore | 1 + Architecture.md | 6 +- LICENSE | 2 +- pyproject.toml | 41 +++++++++++++- tests/test_common.py | 54 ++++++++++++++++++ tests/test_shared_memory.py | 2 +- update_cprght_statement.py | 4 +- xobjects/headers/atomicadd.h | 55 ++++++++++++++++++ xobjects/headers/common.h | 104 +++++++++++++++++++++++++++++++++++ 9 files changed, 261 insertions(+), 8 deletions(-) create mode 100644 tests/test_common.py create mode 100644 xobjects/headers/atomicadd.h create mode 100644 xobjects/headers/common.h diff --git a/.gitignore b/.gitignore index 59d905f..5284f58 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ cov_html .vscode .pytest_cache .coverage +.idea *.c diff --git a/Architecture.md b/Architecture.md index 2615a0a..5e92f48 100644 --- a/Architecture.md +++ b/Architecture.md @@ -40,19 +40,19 @@ Buffer: Types can be composed of: - scalar: numbers, String -- compound: Struct, Array, Ref, UnionRef +- compound: Struct, Array, Ref, UnionRef ### Scalars - examples: Float64, Int64, ... - create: Float64(3.14) - memory layout - - data + - data ### String: - create: String(string_or_int) - memory layout - size - - data + - data ### Struct diff --git a/LICENSE b/LICENSE index f49a4e1..261eeb9 100644 --- a/LICENSE +++ b/LICENSE @@ -198,4 +198,4 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. diff --git a/pyproject.toml b/pyproject.toml index 7959e2a..e5df678 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,43 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "xobjects" +dynamic = ["version"] +description = "In-memory serialization and code generator for CPU and GPU" +readme = "" +authors = [ + { name = "Riccardo De Maria", email = "riccardo.de.maria@cern.ch" } +] +license = { text = "Apache 2.0" } +requires-python = ">=3.7" +dependencies = [ + "numpy", + "cffi", + "scipy" +] +[project.urls] +Homepage = "https://xsuite.readthedocs.io/" +"Bug Tracker" = "https://github.com/xsuite/xsuite/issues" +Documentation = "https://xsuite.readthedocs.io/" +"Source Code" = "https://github.com/xsuite/xobjects" +"Download" = "https://pypi.python.org/pypi/xobjects" + +[project.optional-dependencies] +tests = ["pytest", "pytest-mock"] + +[tool.setuptools.packages.find] +where = ["."] +include = ["xobjects"] + +[tool.setuptools.dynamic] +version = {attr = "xobjects._version.__version__"} + [tool.black] line-length = 79 -target-version = ['py36', 'py37', 'py38'] +target-version = ['py310', 'py311', 'py312'] include = '\.pyi?$' + +[project.entry-points.xobjects] +include = "xobjects" diff --git a/tests/test_common.py b/tests/test_common.py new file mode 100644 index 0000000..4a1ac22 --- /dev/null +++ b/tests/test_common.py @@ -0,0 +1,54 @@ +# copyright ################################# # +# This file is part of the Xobjects Package. # +# Copyright (c) CERN, 2025. 
# +# ########################################### # + +import xobjects as xo +from xobjects.test_helpers import for_all_test_contexts + + +@for_all_test_contexts +def test_common_atomicadd(test_context): + src = r""" + #include + + GPUKERN + double test_atomic_add() + { + int iterations = 1000; + double sum = 0; + VECTORIZE_OVER(i, iterations); + // If on CPU do some work to avoid the loop being optimized out + #if defined(XO_CONTEXT_CPU_OPENMP) + usleep(10); + #endif + atomicAdd(&sum, 1.0); + END_VECTORIZE; + return sum; + } + """ + + n_threads = 1 + if type(test_context).__name__ in {"ContextCupy", "ContextPyopencl"}: + n_threads = 1000 + elif ( + test_context.omp_num_threads == "auto" + or test_context.omp_num_threads > 1 + ): + n_threads = 8 + + test_context.add_kernels( + sources=[src], + kernels={ + "test_atomic_add": xo.Kernel( + args=[], + n_threads=n_threads, + ret=xo.Arg(xo.Float64), + ) + }, + ) + + expected = 1000 + result = test_context.kernels.test_atomic_add() + + assert result == expected diff --git a/tests/test_shared_memory.py b/tests/test_shared_memory.py index 4f2e6fc..31156d6 100644 --- a/tests/test_shared_memory.py +++ b/tests/test_shared_memory.py @@ -53,7 +53,7 @@ class TestElement(xo.HybridClass): // sum s[0] += s[1] if (tid == 0){ - sdata[tid] += sdata[tid + 1]; + sdata[tid] += sdata[tid + 1]; // write sum from shared to global mem atomicAdd(&result[tid], sdata[tid]); diff --git a/update_cprght_statement.py b/update_cprght_statement.py index c3db6d0..0e9c049 100644 --- a/update_cprght_statement.py +++ b/update_cprght_statement.py @@ -6,8 +6,8 @@ import os copyright_statement = """copyright ################################# -This file is part of the Xobjects Package. -Copyright (c) CERN, 2021. +This file is part of the Xobjects Package. +Copyright (c) CERN, 2021. ###########################################""" config = [ diff --git a/xobjects/headers/atomicadd.h b/xobjects/headers/atomicadd.h new file mode 100644 index 0000000..c3d047b --- /dev/null +++ b/xobjects/headers/atomicadd.h @@ -0,0 +1,55 @@ +// copyright ################################# // +// This file is part of the Xfields Package. // +// Copyright (c) CERN, 2021. // +// ########################################### // + +#ifndef _ATOMICADD_H_ +#define _ATOMICADD_H_ + +/* + Atomic add function (double type) for different contexts. + Following the blueprint of CUDA's atomicAdd function, the return + value is the old value of the address before the addition. 
+*/ + +#if defined(XO_CONTEXT_CPU_SERIAL) + inline double atomicAdd(double *addr, double val) + { + double old_val = *addr; + *addr = *addr + val; + return old_val; + } +#elif defined(XO_CONTEXT_CPU_OPENMP) + inline double atomicAdd(double *addr, double val) + { + double old_val = *addr; + #pragma omp atomic + *addr += val; + return old_val; + } +#elif defined(XO_CONTEXT_CL) + #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable + inline double atomicAdd(volatile __global double *addr, double val) + { + union { + long u64; + double f64; + } next, expected, current; + current.f64 = *addr; + do { + expected.f64 = current.f64; + next.f64 = expected.f64 + val; + current.u64 = atom_cmpxchg( + (volatile __global long *)addr, + (long) expected.u64, + (long) next.u64); + } while( current.u64 != expected.u64 ); + return current.f64; + } +#elif defined(XO_CONTEXT_CUDA) + // CUDA already provides this +#else + #error "Atomic add not implemented for this context" +#endif + +#endif // _ATOMICADD_H_ diff --git a/xobjects/headers/common.h b/xobjects/headers/common.h new file mode 100644 index 0000000..e4196c8 --- /dev/null +++ b/xobjects/headers/common.h @@ -0,0 +1,104 @@ +#ifndef XOBJECTS_COMMON_H +#define XOBJECTS_COMMON_H + +#include "xobjects/headers/atomicadd.h" + +/* + Common macros for vectorization and parallelization, as well as common + arithmetic operations. +*/ + +#ifdef XO_CONTEXT_CPU_SERIAL + // We are on CPU, without OpenMP + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ + for (int64_t INDEX_NAME = 0; INDEX_NAME < (COUNT); INDEX_NAME++) { + + #define END_VECTORIZE \ + } +#endif // XO_CONTEXT_CPU_SERIAL + +#ifdef XO_CONTEXT_CPU_OPENMP + // We are on CPU with the OpenMP context switched on + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ + _Pragma("omp parallel for") \ + for (int64_t INDEX_NAME = 0; INDEX_NAME < (COUNT); INDEX_NAME++) { + + #define END_VECTORIZE \ + } + +#endif // XO_CONTEXT_CPU_OPENMP + + +#ifdef XO_CONTEXT_CUDA + // We are on a CUDA GPU + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) { \ + int64_t INDEX_NAME = blockDim.x * blockIdx.x + threadIdx.x; \ + if (INDEX_NAME < (COUNT)) { + + #define END_VECTORIZE \ + } \ + } +#endif // XO_CONTEXT_CUDA + + +#ifdef XO_CONTEXT_CL + // We are on an OpenCL GPU + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ + { \ + int64_t INDEX_NAME = get_global_id(0); + if (INDEX_NAME < (COUNT)) { \ + + #define END_VECTORIZE \ + } \ + } +#endif // XO_CONTEXT_CL + + +/* + Qualifier keywords for GPU and optimisation +*/ + +#ifdef XO_CONTEXT_CPU // for both serial and OpenMP + #define GPUKERN + #define GPUFUN static inline + #define GPUGLMEM + #define RESTRICT restrict +#endif + + +#ifdef XO_CONTEXT_CUDA + #define GPUKERN __global__ + #define GPUFUN __device__ + #define GPUGLMEM + #define RESTRICT +#endif // XO_CONTEXT_CUDA + + +#ifdef XO_CONTEXT_CL + #define GPUKERN __kernel + #define GPUFUN + #define GPUGLMEM __global + #define RESTRICT +#endif // XO_CONTEXT_CL + + +/* + Common maths-related macros +*/ + +#define POW2(X) ((X)*(X)) +#define POW3(X) ((X)*(X)*(X)) +#define POW4(X) ((X)*(X)*(X)*(X)) +#define NONZERO(X) ((X) != 0.0) +#define NONZERO_TOL(X, TOL) (fabs((X)) > (TOL)) + + +#ifndef VECTORIZE_OVER +#error "Unknown context, or the expected context (XO_CONTEXT_*) flag undefined. Try updating Xobjects?" 
+#endif + +#endif // XOBJECTS_COMMON_H From 4649fe42b29a704739b349f8ac4fe5c20b045752 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Mon, 10 Nov 2025 15:39:46 +0100 Subject: [PATCH 2/7] Move the new implementation of atomic add from Xtrack --- tests/test_common.py | 125 +++++++++++- xobjects/headers/atomicadd.h | 385 +++++++++++++++++++++++++++++++---- xobjects/headers/common.h | 9 +- 3 files changed, 470 insertions(+), 49 deletions(-) diff --git a/tests/test_common.py b/tests/test_common.py index 4a1ac22..91eb7d2 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -4,13 +4,17 @@ # ########################################### # import xobjects as xo +import numpy as np +import pytest + from xobjects.test_helpers import for_all_test_contexts @for_all_test_contexts def test_common_atomicadd(test_context): src = r""" - #include + #include "xobjects/headers/common.h" + #include "xobjects/headers/atomicadd.h" GPUKERN double test_atomic_add() @@ -52,3 +56,122 @@ def test_common_atomicadd(test_context): result = test_context.kernels.test_atomic_add() assert result == expected + + +@for_all_test_contexts +@pytest.mark.parametrize( + "overload", [True, False], ids=["overload", "no_overload"] +) +@pytest.mark.parametrize( + "ctype", + [ + xo.Int8, + xo.Int16, + xo.Int32, + xo.Int64, + xo.UInt8, + xo.UInt16, + xo.UInt32, + xo.UInt64, + xo.Float32, + xo.Float64, + ], +) +def test_atomic(overload, ctype, test_context): + if overload: + func_name = "atomicAdd" + else: + func_name = f'atomicAdd_{ctype.__name__.lower()[0]}{ctype.__name__.split("t")[1]}' + + class TestAtomic(xo.Struct): + val = ctype + _extra_c_sources = [ + f""" + #include "xobjects/headers/common.h" + #include "xobjects/headers/atomicadd.h" + + GPUKERN + void run_atomic_test(TestAtomic el, GPUGLMEM {ctype._c_type}* increments, + GPUGLMEM {ctype._c_type}* retvals, int length) {{ + VECTORIZE_OVER(ii, length); + GPUGLMEM {ctype._c_type}* val = TestAtomic_getp_val(el); + {ctype._c_type} ret = {func_name}(val, increments[ii]); + retvals[ii] = ret; + END_VECTORIZE; + }} + """ + ] + + kernels = { + "run_atomic_test": xo.Kernel( + c_name="run_atomic_test", + args=[ + xo.Arg(TestAtomic, name="el"), + xo.Arg(ctype, pointer=True, name="increments"), + xo.Arg(ctype, pointer=True, name="retvals"), + xo.Arg(xo.Int32, name="length"), + ], + n_threads="length", + ) + } + + atomic = TestAtomic(_context=test_context, val=0) + assert atomic.val == 0 + + test_context.add_kernels(kernels=kernels) + + # Test with all increments = 1, so we can check the return values easily. + num_steps = 10000 + if ctype.__name__.startswith("Int") or ctype.__name__.startswith("Uint"): + # Less steps to avoid overflow + num_steps = min(num_steps, 2 ** (8 * ctype._size - 1) - 1) + increments = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) + 1 + retvals = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) + test_context.kernels.run_atomic_test( + el=atomic, increments=increments, retvals=retvals, length=num_steps + ) + assert atomic.val == num_steps + retvals = np.sort(test_context.nparray_from_context_array(retvals)) + assert np.allclose( + retvals, + np.arange(num_steps, dtype=ctype._dtype), + atol=1.0e-15, + rtol=1.0e-15, + ) + + # Test with random increments, where we now only can check the total sum + # (retvals can be anything). Watch out: overflow is undefined behaviour, + # except for unsigned integers, so we skip this test for signed integers. 
+ atomic.val = 0 + retvals = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) + if ctype.__name__.startswith("Uint"): + low = 0 + high = 2 ** (8 * ctype._size) - 1 + increments = np.random.randint( + low, high + 1, size=num_steps, dtype=ctype._dtype + ) + increments = test_context.nparray_to_context_array(increments) + test_context.kernels.run_atomic_test( + el=atomic, increments=increments, retvals=retvals, length=num_steps + ) + increments = test_context.nparray_from_context_array(increments) + assert atomic.val == ( + np.sum(increments).item() % (2 ** (8 * ctype._size)) + ) + + elif ctype.__name__.startswith("Float"): + increments = np.zeros(shape=(num_steps,), dtype=ctype._dtype) + increments += np.random.uniform(0, 10, size=num_steps) + increments = test_context.nparray_to_context_array(increments) + test_context.kernels.run_atomic_test( + el=atomic, increments=increments, retvals=retvals, length=num_steps + ) + increments = test_context.nparray_from_context_array(increments) + if ctype == xo.Float32: + assert np.isclose( + atomic.val, np.sum(increments), atol=10.0, rtol=1.0e-4 + ) + else: + assert np.isclose( + atomic.val, np.sum(increments), atol=1.0e-6, rtol=1.0e-12 + ) diff --git a/xobjects/headers/atomicadd.h b/xobjects/headers/atomicadd.h index c3d047b..6782705 100644 --- a/xobjects/headers/atomicadd.h +++ b/xobjects/headers/atomicadd.h @@ -1,55 +1,350 @@ // copyright ################################# // -// This file is part of the Xfields Package. // -// Copyright (c) CERN, 2021. // +// This file is part of the Xtrack Package. // +// Copyright (c) CERN, 2025. // // ########################################### // #ifndef _ATOMICADD_H_ #define _ATOMICADD_H_ -/* - Atomic add function (double type) for different contexts. - Following the blueprint of CUDA's atomicAdd function, the return - value is the old value of the address before the addition. -*/ - -#if defined(XO_CONTEXT_CPU_SERIAL) - inline double atomicAdd(double *addr, double val) - { - double old_val = *addr; - *addr = *addr + val; - return old_val; - } -#elif defined(XO_CONTEXT_CPU_OPENMP) - inline double atomicAdd(double *addr, double val) - { - double old_val = *addr; - #pragma omp atomic - *addr += val; - return old_val; - } +#include "xobjects/headers/common.h" + + +// Ensure no conflicting atomicAdd macro is defined earlier for CPU or OpenCL +#ifndef XO_CONTEXT_CUDA +#ifdef atomicAdd + #warning "Warning: atomicAdd macro already defined, undefining it." + #undef atomicAdd +#endif // atomicAdd +#endif // XO_CONTEXT_CUDA + + +// ################################# // +// ### CPU Contexts ### // +// ################################# // + +#ifdef XO_CONTEXT_CPU +#include + +#ifdef XO_CONTEXT_CPU_OPENMP + #ifndef _OPENMP + #error "XO_CONTEXT_CPU_OPENMP set, but compiled without -fopenmp" + #endif + // OpenMP atomic capture gives us read+write atomically + #define OMP_ATOMIC_CAPTURE _Pragma("omp atomic capture") +#else + #define OMP_ATOMIC_CAPTURE // No OpenMP: non-atomic fallback +#endif // XO_CONTEXT_CPU_OPENMP + +// Macro to define atomicAdd for different types, will be overloaded via _Generic. 
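+// As a sketch, DEF_ATOMIC_ADD(double, f64) below expands, up to whitespace, to:
+//
+//     GPUFUN double atomicAdd_f64(GPUGLMEM double *addr, double val) {
+//         double old;
+//         const double inc = (double)val;
+//         OMP_ATOMIC_CAPTURE
+//         { old = *addr; *addr = *addr + inc; }
+//         return old;
+//     }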
+#define DEF_ATOMIC_ADD(T, SUF) \ +GPUFUN T atomicAdd_##SUF(GPUGLMEM T *addr, T val) { \ + T old; \ + const T inc = (T)val; \ + OMP_ATOMIC_CAPTURE \ + { old = *addr; *addr = *addr + inc; } \ + return old; \ +} + +DEF_ATOMIC_ADD(int8_t , i8) +DEF_ATOMIC_ADD(int16_t, i16) +DEF_ATOMIC_ADD(int32_t, i32) +DEF_ATOMIC_ADD(int64_t, i64) +DEF_ATOMIC_ADD(uint8_t , u8) +DEF_ATOMIC_ADD(uint16_t, u16) +DEF_ATOMIC_ADD(uint32_t, u32) +DEF_ATOMIC_ADD(uint64_t, u64) +DEF_ATOMIC_ADD(float , f32) +DEF_ATOMIC_ADD(double , f64) + +// Using _Generic to select the right function based on type (since C11). +// See https://en.cppreference.com/w/c/language/generic.html +#define atomicAdd(addr, val) _Generic((addr), \ + int8_t*: atomicAdd_i8, \ + int16_t*: atomicAdd_i16, \ + int32_t*: atomicAdd_i32, \ + int64_t*: atomicAdd_i64, \ + uint8_t*: atomicAdd_u8, \ + uint16_t*: atomicAdd_u16, \ + uint32_t*: atomicAdd_u32, \ + uint64_t*: atomicAdd_u64, \ + float*: atomicAdd_f32, \ + double*: atomicAdd_f64 \ +)(addr, (val)) +#endif // XO_CONTEXT_CPU + + +// ################################# // +// ### GPU Contexts ### // +// ################################# // + +/* -------------- Design notes for GPU atomics --------------------------------- +* We will go for a unified approach for CUDA and OpenCL, with macros where needed. +* We implement 8/16-bit atomic add by atomically CAS'ing the *containing* + * 32-bit word. Only the target byte/halfword lane is modified; the neighbor + * lane is preserved. This is linearisable: each successful CAS is one atomic + * RMW on the 32-bit word. + * Assumptions: little-endian lane layout (true on NVIDIA) and natural alignment + * of the 16-bit addresses (addr % 2 == 0). 8-bit has no alignment requirement. + * Return value matches CUDA semantics: the **old** value at *addr* (fetch-add). + * ---------------------------------------------------------------------------*/ + +// Info: a CAS function (Compare-And-Swap) takes three arguments: a pointer to +// the value to be modified, the expected old value, and the new value. The +// second argument helps to recognise if another thread has modified the value +// in the meantime. The function compares the value at the address with the +// expected values, and if they are the same, it writes the new value at the +// address. In any case, the function returns the actual old value at the +// address. If the returned value is different from the expected value, it +// means that another thread has modified the value in the meantime. + +// Define types and macros for CUDA and OpenCL +// ------------------------------------------- +#if defined(XO_CONTEXT_CUDA) + // CUDA compiler may not have , so define the types if needed. 
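+    // (The header meant above is <stdint.h>: NVRTC cannot see it, as noted
+    //  below, so the fixed-width integer types are typedef'd by hand there.)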
+ #ifdef __CUDACC_RTC__ + // NVRTC (CuPy RawModule default) can’t see , so detect it via __CUDACC_RTC__ + typedef signed char int8_t; + typedef short int16_t; + typedef int int32_t; + typedef long long int64_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef unsigned long long uint64_t; + #else + // Alternatively, NVCC path is fine with host headers + #include + #endif // __CUDACC_RTC__ + #define GPUVOLATILE GPUGLMEM + #define __XT_CAS_U32(ptr, exp, val) atomicCAS((GPUVOLATILE uint32_t*)(ptr), (uint32_t)(exp), (uint32_t)(val)) + #define __XT_CAS_U64(ptr, exp, val) atomicCAS((GPUVOLATILE uint64_t*)(ptr), (uint64_t)(exp), (uint64_t)(val)) + #define __XT_AS_U32_FROM_F32(x) __float_as_uint(x) + #define __XT_AS_F32_FROM_U32(x) __uint_as_float(x) + #define __XT_AS_U64_FROM_F64(x) __double_as_longlong(x) + #define __XT_AS_F64_FROM_U64(x) __longlong_as_double(x) #elif defined(XO_CONTEXT_CL) + // It seems OpenCL already has the types from defined. + // typedef char int8_t; + // typedef short int16_t; + // typedef int int32_t; + // typedef long int64_t; + // typedef unsigned char uint8_t; + // typedef unsigned short uint16_t; + // typedef unsigned int uint32_t; + // typedef unsigned long uint64_t; + #define GPUVOLATILE GPUGLMEM volatile + #if __OPENCL_C_VERSION__ < 110 + // Map 1.0 "atom_*" names to 1.1+ "atomic_*" + #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + #define atomic_add atom_add + #define atomic_cmpxchg atom_cmpxchg + #endif #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable - inline double atomicAdd(volatile __global double *addr, double val) - { - union { - long u64; - double f64; - } next, expected, current; - current.f64 = *addr; + #pragma OPENCL EXTENSION cl_khr_fp64 : enable + #define __XT_CAS_U32(ptr, exp, val) atomic_cmpxchg((GPUVOLATILE uint32_t*)(ptr), (uint32_t)(exp), (uint32_t)(val)) + #define __XT_CAS_U64(ptr, exp, val) atom_cmpxchg((GPUVOLATILE uint64_t*)(ptr), (uint64_t)(exp), (uint64_t)(val)) + #define __XT_AS_U32_FROM_F32(x) as_uint(x) + #define __XT_AS_F32_FROM_U32(x) as_float(x) + #define __XT_AS_U64_FROM_F64(x) as_ulong(x) + #define __XT_AS_F64_FROM_U64(x) as_double(x) +#endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + +// Define atomic functions per type +// -------------------------------- +#if defined(XO_CONTEXT_CUDA) || defined(XO_CONTEXT_CL) + // Helper: compute (base 32-bit word pointer, shift, mask) for a byte in that word. + GPUFUN inline void __xt_lane8(GPUVOLATILE void* addr, GPUVOLATILE uint32_t** w, uint32_t* sh, uint32_t* mask){ + size_t a = (size_t)addr; + *w = (GPUVOLATILE uint32_t*)(a & ~(size_t)3); // word: align down to 4-byte boundary + *sh = (uint32_t)((a & (size_t)3) * 8U); // shift 0,8,16,24 depending on byte lane + *mask = (uint32_t)(0xFFu << *sh); + } + // Helper: same for a halfword (16-bit) in the containing 32-bit word. + GPUFUN inline void __xt_lane16(GPUVOLATILE void* addr, GPUVOLATILE uint32_t** w, uint32_t* sh, uint32_t* mask){ + size_t a = (size_t)addr; + *w = (GPUVOLATILE uint32_t*)(a & ~(size_t)3); // word: align down to 4-byte boundary + *sh = (uint32_t)((a & (size_t)2) ? 
16U : 0U); // shift0 or 16 depending on halfword + *mask = (uint32_t)(0xFFFFU << *sh); + } + + // ---------------- 8-bit: int8_t / uint8_t (CAS on 32-bit word) -------------- + GPUFUN int8_t atomicAdd_i8(GPUVOLATILE int8_t* addr, int8_t val){ + GPUVOLATILE uint32_t *w; + uint32_t sh, mask; + __xt_lane8(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; // byte, newbyte, newword do { - expected.f64 = current.f64; - next.f64 = expected.f64 + val; - current.u64 = atom_cmpxchg( - (volatile __global long *)addr, - (long) expected.u64, - (long) next.u64); - } while( current.u64 != expected.u64 ); - return current.f64; - } -#elif defined(XO_CONTEXT_CUDA) - // CUDA already provides this -#else - #error "Atomic add not implemented for this context" -#endif + assumed = old; + b = (assumed & mask) >> sh; // Extract current 8-bit lane + nb = (uint32_t)((uint8_t)b + (uint8_t)val); // Add in modulo-256 (two's complement) + nw = (assumed & ~mask) | ((nb & 0xFFU) << sh); // Merge back updated lane; leave neighbor lanes intact + // Try to publish; if someone raced us, retry with their value + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (int8_t)((assumed & mask) >> sh); + } + GPUFUN uint8_t atomicAdd_u8(GPUVOLATILE uint8_t* addr, uint8_t val){ + GPUVOLATILE uint32_t *w; + uint32_t sh, mask; + __xt_lane8(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; + do { + assumed = old; + b = (assumed & mask) >> sh; + nb = (uint32_t)(b + val); /* mod 256 */ + nw = (assumed & ~mask) | ((nb & 0xFFU) << sh); + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (uint8_t)((assumed & mask) >> sh); + } + + // ---------------- 16-bit: int16_t / uint16_t (CAS on 32-bit word) ----------- + GPUFUN int16_t atomicAdd_i16(GPUVOLATILE int16_t* addr, int16_t val){ + GPUVOLATILE uint32_t* w; + uint32_t sh, mask; + __xt_lane16(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; + do { + assumed = old; + b = (assumed & mask) >> sh; + nb = (uint32_t)((uint16_t)b + (uint16_t)val); + nw = (assumed & ~mask) | ((nb & 0xFFFFU) << sh); + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (int16_t)((assumed & mask) >> sh); + } + GPUFUN uint16_t atomicAdd_u16(GPUVOLATILE uint16_t* addr, uint16_t val){ + GPUVOLATILE uint32_t* w; + uint32_t sh, mask; + __xt_lane16(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; + do { + assumed = old; + b = (assumed & mask) >> sh; + nb = (uint32_t)(b + val); + nw = (assumed & ~mask) | ((nb & 0xFFFFU) << sh); + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (uint16_t)((assumed & mask) >> sh); + } + + // ---------------- 32-bit: int32_t / uint32_t (built-in) ----------- + GPUFUN int32_t atomicAdd_i32(GPUVOLATILE int32_t* addr, int32_t val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + return atomic_add(addr, val); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + GPUFUN uint32_t atomicAdd_u32(GPUVOLATILE uint32_t* addr, uint32_t val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + return atomic_add(addr, val); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + + // ---------------- 64-bit: int64_t / uint64_t (built-in or CAS on 64-bit word) ----------- + GPUFUN int64_t atomicAdd_i64(GPUVOLATILE int64_t* addr, int64_t val){ + uint64_t old, nw; + do { + old = *addr; + nw = old + val; + } while (__XT_CAS_U64((GPUVOLATILE uint64_t*)addr, old, nw) != old); + return old; + } + GPUFUN uint64_t 
atomicAdd_u64(GPUVOLATILE uint64_t* addr, uint64_t val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + return atom_add(addr, val); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + + // ---------------- 32-bit: float (built-in or CAS on 32-bit word) ----------- + GPUFUN float atomicAdd_f32(GPUVOLATILE float* addr, float val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + uint32_t old, nw; + do { + old = __XT_AS_U32_FROM_F32(*addr); + nw = __XT_AS_U32_FROM_F32(__XT_AS_F32_FROM_U32(old) + val); + } while (__XT_CAS_U32((GPUVOLATILE uint32_t*)addr, old, nw) != old); + return __XT_AS_F32_FROM_U32(old); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + + // ---------------- 64-bit: float (built-in or CAS on 64-bit word) ----------- + GPUFUN double atomicAdd_f64(GPUVOLATILE double* addr, double val){ + #if __CUDA_ARCH__ >= 600 + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL || __CUDA_ARCH__ < 600 + uint64_t old, nw; + do { + old = __XT_AS_U64_FROM_F64(*addr); + nw = __XT_AS_U64_FROM_F64(__XT_AS_F64_FROM_U64(old) + val); + } while (__XT_CAS_U64((GPUVOLATILE uint64_t*)addr, old, nw) != old); + return __XT_AS_F64_FROM_U64(old); + #endif // __CUDA_ARCH__ >= 600 / XO_CONTEXT_CL + } +#endif // defined(XO_CONTEXT_CUDA) || defined(XO_CONTEXT_CL) + +// Define the overloaded function +// ------------------------------ +#ifdef XO_CONTEXT_CUDA + // NVRTC (CuPy RawModule) usually compiles under extern "C". + // In C, function overloading is not possible, but we can cheat by doing it in + // C++ (with a different name to avoid clashes with the built-in atomicAdd). + // This function will then be remapped to atomicAdd() via a macro in C. + #ifdef __cplusplus + extern "C++" { + + GPUFUN int8_t xt_atomicAdd(GPUVOLATILE int8_t* p, int8_t v) { return atomicAdd_i8 (p, v); } + GPUFUN uint8_t xt_atomicAdd(GPUVOLATILE uint8_t* p, uint8_t v) { return atomicAdd_u8 (p, v); } + GPUFUN int16_t xt_atomicAdd(GPUVOLATILE int16_t* p, int16_t v) { return atomicAdd_i16(p, v); } + GPUFUN uint16_t xt_atomicAdd(GPUVOLATILE uint16_t*p, uint16_t v) { return atomicAdd_u16(p, v); } + GPUFUN int64_t xt_atomicAdd(GPUVOLATILE int64_t*p, int64_t v) { return atomicAdd_i64(p, v); } + + // Existing type definitions: forward to CUDA built-ins + GPUFUN int32_t xt_atomicAdd(GPUVOLATILE int32_t* p, int32_t v) { return ::atomicAdd(p, v); } + GPUFUN uint32_t xt_atomicAdd(GPUVOLATILE uint32_t* p, uint32_t v) { return ::atomicAdd(p, v); } + GPUFUN uint64_t xt_atomicAdd(GPUVOLATILE uint64_t* p, uint64_t v) { return ::atomicAdd(p, v); } + GPUFUN float xt_atomicAdd(GPUVOLATILE float* p, float v) { return ::atomicAdd(p, v); } + #if __CUDA_ARCH__ >= 600 + GPUFUN double xt_atomicAdd(GPUVOLATILE double* p, double v) { return ::atomicAdd(p, v); } + #else + GPUFUN double xt_atomicAdd(GPUVOLATILE double* p, double v) { return atomicAdd_f64(p, v); } + #endif + + } + #endif // __cplusplus + + // ---------- Global remap of the public name on device code ---------- + // Define AFTER the wrappers so we don't macro-rewrite our own calls. 
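+    // Illustration (kernel-side, `counter` is a hypothetical variable): a call
+    // written as
+    //     atomicAdd(&counter, 1.0);
+    // is rewritten by the macro below to
+    //     xt_atomicAdd(&counter, 1.0);
+    // and resolved against the extern "C++" overloads defined above.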
+ #ifdef atomicAdd + #undef atomicAdd + #endif + #define atomicAdd(ptr, val) xt_atomicAdd((ptr), (val)) +#endif /* XO_CONTEXT_CUDA */ + +#ifdef XO_CONTEXT_CL + #if !__has_attribute(overloadable) + #error "The current OpenCL compiler/architecture does not support __attribute__((overloadable))" + #endif + #define OCL_OVERLOAD __attribute__((overloadable)) + GPUFUN int8_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int8_t* p, int8_t v) { return atomicAdd_i8 (p, v); } + GPUFUN uint8_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint8_t* p, uint8_t v) { return atomicAdd_u8 (p, v); } + GPUFUN int16_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int16_t* p, int16_t v) { return atomicAdd_i16(p, v); } + GPUFUN uint16_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint16_t*p, uint16_t v) { return atomicAdd_u16(p, v); } + GPUFUN int64_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int64_t*p, int64_t v) { return atomicAdd_i64(p, v); } + GPUFUN float OCL_OVERLOAD atomicAdd(GPUVOLATILE float* p, float v) { return atomicAdd_f32(p, v); } + GPUFUN double OCL_OVERLOAD atomicAdd(GPUVOLATILE double* p, double v) { return atomicAdd_f64(p, v); } + + // Existing type definitions: forward to OpenCL built-ins + GPUFUN int32_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int32_t* p, int32_t v) { return atomic_add(p, v); } + GPUFUN uint32_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint32_t* p, uint32_t v) { return atomic_add(p, v); } + GPUFUN uint64_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint64_t* p, uint64_t v) { return atom_add(p, v); } +#endif // XO_CONTEXT_CL -#endif // _ATOMICADD_H_ +#endif //_ATOMICADD_H_ diff --git a/xobjects/headers/common.h b/xobjects/headers/common.h index e4196c8..f6fd35d 100644 --- a/xobjects/headers/common.h +++ b/xobjects/headers/common.h @@ -1,8 +1,12 @@ +// copyright ################################# // +// This file is part of the Xtrack Package. // +// Copyright (c) CERN, 2025. // +// ########################################### // + + #ifndef XOBJECTS_COMMON_H #define XOBJECTS_COMMON_H -#include "xobjects/headers/atomicadd.h" - /* Common macros for vectorization and parallelization, as well as common arithmetic operations. @@ -27,7 +31,6 @@ #define END_VECTORIZE \ } - #endif // XO_CONTEXT_CPU_OPENMP From 1fb28f2480120ee5dc1409f554cb7266b4906a73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 12 Nov 2025 17:39:47 +0100 Subject: [PATCH 3/7] Add a copy flag to the np.array <-> context array conversion functions --- xobjects/context.py | 26 ++++++++++++++++++++++---- xobjects/context_cpu.py | 19 +++++++++++-------- xobjects/context_cupy.py | 11 +++++++---- xobjects/context_pyopencl.py | 18 ++++++++++-------- 4 files changed, 50 insertions(+), 24 deletions(-) diff --git a/xobjects/context.py b/xobjects/context.py index bcd03db..0ae2b54 100644 --- a/xobjects/context.py +++ b/xobjects/context.py @@ -375,12 +375,30 @@ def get_installed_c_source_paths(self) -> List[str]: return sources @abstractmethod - def nparray_to_context_array(self, arr): - return arr + def nparray_to_context_array(self, arr, copy=False): + """Obtain an array on the context, given a numpy array. + + Args: + arr: numpy array + copy: if True, always create a copy in the context. If False, + try to avoid copy if possible (not guaranteed). + + Returns: + array on the context + """ @abstractmethod - def nparray_from_context_array(self, dev_arr): - return dev_arr + def nparray_from_context_array(self, dev_arr, copy=False): + """Obtain a numpy array, given an array on the context. 
+ + Args: + arr: array on the context + copy: if True, always create a copy in the context. If False, + try to avoid copy if possible (not guaranteed). + + Returns: + Numpy array + """ @property @abstractmethod diff --git a/xobjects/context_cpu.py b/xobjects/context_cpu.py index 4104ed0..c85bbcd 100644 --- a/xobjects/context_cpu.py +++ b/xobjects/context_cpu.py @@ -543,7 +543,7 @@ def cffi_module_for_c_types(c_types, containing_dir="."): return None - def nparray_to_context_array(self, arr): + def nparray_to_context_array(self, arr, copy=False): """ Moves a numpy array to the device memory. No action is performed by this function in the CPU context. The method is provided @@ -551,14 +551,16 @@ def nparray_to_context_array(self, arr): Args: arr (numpy.ndarray): Array to be transferred + copy (bool): If True, a copy of the array is made. Returns: - numpy.ndarray: The same array (no copy!). - + numpy.ndarray: Numpy array with the same data, original or a copy. """ + if copy: + arr = np.copy(arr) return arr - def nparray_from_context_array(self, dev_arr): + def nparray_from_context_array(self, dev_arr, copy=False): """ Moves an array to the device to a numpy array. No action is performed by this function in the CPU context. The method is provided so that the CPU @@ -566,10 +568,13 @@ def nparray_from_context_array(self, dev_arr): Args: dev_arr (numpy.ndarray): Array to be transferred - Returns: - numpy.ndarray: The same array (no copy!) + copy (bool): If True, a copy of the array is made. + Returns: + numpy.ndarray: Numpy array with the same data, original or a copy. """ + if copy: + dev_arr = np.copy(dev_arr) return dev_arr @property @@ -579,7 +584,6 @@ def nplike_lib(self): through ``nplike_lib`` to keep compatibility with the other contexts. """ - return np @property @@ -589,7 +593,6 @@ def splike_lib(self): through ``splike_lib`` to keep compatibility with the other contexts. """ - return sp def synchronize(self): diff --git a/xobjects/context_cupy.py b/xobjects/context_cupy.py index 8b93372..3838376 100644 --- a/xobjects/context_cupy.py +++ b/xobjects/context_cupy.py @@ -487,29 +487,32 @@ def build_kernels( def __str__(self): return f"{type(self).__name__}:{cupy.cuda.get_device_id()}" - def nparray_to_context_array(self, arr): + def nparray_to_context_array(self, arr, copy=False): """ Copies a numpy array to the device memory. Args: arr (numpy.ndarray): Array to be transferred + copy (bool): This parameter is ignored for CUDA, as the data lives + on a different device. Returns: cupy.ndarray:The same array copied to the device. - """ dev_arr = cupy.array(arr) return dev_arr - def nparray_from_context_array(self, dev_arr): + def nparray_from_context_array(self, dev_arr, copy=False): """ Copies an array to the device to a numpy array. Args: dev_arr (cupy.ndarray): Array to be transferred. + copy (bool): This parameter is ignored for CUDA, as the data lives + on a different device. + Returns: numpy.ndarray: The same data copied to a numpy array. - """ return dev_arr.get() diff --git a/xobjects/context_pyopencl.py b/xobjects/context_pyopencl.py index 843bf85..b48e027 100644 --- a/xobjects/context_pyopencl.py +++ b/xobjects/context_pyopencl.py @@ -253,28 +253,30 @@ def __str__(self): device_id = self.platform.get_devices().index(self.device) return f"{type(self).__name__}:{platform_id}.{device_id}" - def nparray_to_context_array(self, arr): - """ - Copies a numpy array to the device memory. + def nparray_to_context_array(self, arr, copy=False): + """Copies a numpy array to the device memory. 
+ Args: arr (numpy.ndarray): Array to be transferred + copy (bool): This parameter is ignored for OpenCL, as the data lives + on a different device. Returns: pyopencl.array.Array:The same array copied to the device. - """ dev_arr = cla.to_device(self.queue, arr) return dev_arr - def nparray_from_context_array(self, dev_arr): - """ - Copies an array to the device to a numpy array. + def nparray_from_context_array(self, dev_arr, copy=False): + """Copies an array to the device to a numpy array. Args: dev_arr (pyopencl.array.Array): Array to be transferred. + copy (bool): This parameter is ignored for OpenCL, as the data lives + on a different device. + Returns: numpy.ndarray: The same data copied to a numpy array. - """ return dev_arr.get() From e5bad86bfca486508ecf7f4ff1abe5c01862dbde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 12 Nov 2025 17:40:55 +0100 Subject: [PATCH 4/7] Bugfix: capi get/set methods broken for nested arrays --- tests/test_capi.py | 129 ++++++++++++++++++++++++++++++++++++++++++++- xobjects/capi.py | 26 ++++----- 2 files changed, 141 insertions(+), 14 deletions(-) diff --git a/tests/test_capi.py b/tests/test_capi.py index df2fcbb..1891072 100644 --- a/tests/test_capi.py +++ b/tests/test_capi.py @@ -11,7 +11,7 @@ import cffi import xobjects as xo - +from xobjects.test_helpers import for_all_test_contexts ffi = cffi.FFI() @@ -576,8 +576,133 @@ def test_gpu_api(): ctx.add_kernels( sources=[src_code], kernels=kernel_descriptions, - # save_src_as=f'_test_{name}.c') save_source_as=None, compile=True, extra_classes=[xo.String[:]], ) + + +@for_all_test_contexts +def test_array_of_arrays(test_context): + cell_ids = [3, 5, 7] + particle_per_cell = [ + [1, 8], + [9, 3, 2], + [4, 5, 6, 7], + ] + + class Cells(xo.Struct): + ids = xo.Int64[:] + particles = xo.Int64[:][:] + + cells = Cells(ids=cell_ids, particles=particle_per_cell) + + # Data layout (displayed as uint64): + # + # [0] 216 (cells size) + # [8] 56 (offset field 2 -- particles field) + # [16] cell_ids data: + # [0] 40 (cell_ids size) + # [8] 3 (cell_ids length) + # [16] {3, 5, 7} (cell_ids elements) + # [56] particles data: + # [0] 160 (particles size) + # [8] 3 (particles length) + # [16] 40 (offset particles[0]) + # [24] 72 (offset particles[1]) + # [32] 112 (offset particles[2]) + # [40] particles[0] data: + # [0] 32 (particles[0] size) + # [8] 2 (particles[0] length) + # [16] {1, 8} (particles[0] elements) + # [72] particles[1] data: + # [0] 40 (particles[1] size) + # [8] 3 (particles[1] length) + # [16] {9, 3, 2} (particles[1 + # [112] particles[2] data: + # [0] 48 (particles[2] size) + # [8] 4 (particles[2] length) + # [16] {4, 5, 6, 7} (particles[2] elements) + + src = r""" + #include "xobjects/headers/common.h" + + int MAX_PARTICLES = 4; + int MAX_CELLS = 3; + + GPUKERN + uint8_t loop_over(Cells cells, uint64_t* out_counts, uint64_t* out_vals) + { + uint8_t success = 1; + int64_t num_cells = Cells_len_ids(cells); + + for (int64_t i = 0; i < num_cells; i++) { + int64_t id = Cells_get_ids(cells, i); + int64_t count = Cells_len1_particles(cells, i); + + printf("Cell ID: %lld\n Particles (count %lld): ", id, count); + + if (i >= MAX_CELLS) { + success = 0; + continue; + } + + out_counts[i] = count; + + ArrNInt64 particles = Cells_getp1_particles(cells, i); + uint32_t num_particles = ArrNInt64_len(particles); + + VECTORIZE_OVER(j, num_particles); + int64_t val = ArrNInt64_get(particles, j); + printf("%lld ", val); + + if (j >= MAX_PARTICLES) { + success = 0; + continue; + } + + out_vals[i 
* MAX_PARTICLES + j] = val; + END_VECTORIZE; + printf("\n"); + } + fflush(stdout); + return success; + } + """ + + kernels = { + "loop_over": xo.Kernel( + args=[ + xo.Arg(Cells, name="cells"), + xo.Arg(xo.UInt64, pointer=True, name="out_counts"), + xo.Arg(xo.UInt64, pointer=True, name="out_vals"), + ], + n_threads="n", + ret=xo.Arg(xo.UInt8), + ) + } + kernels.update(Cells._gen_kernels()) + + test_context.add_kernels( + sources=[src], + kernels=kernels, + ) + + counts = np.zeros(len(cell_ids), dtype=np.uint64) + vals = np.zeros(12, dtype=np.uint64) + + for i, _ in enumerate(particle_per_cell): + for j, expected in enumerate(particle_per_cell[i]): + result = test_context.kernels.Cells_get_particles( + obj=cells, i0=i, i1=j + ) + assert result == expected + + ret = test_context.kernels.loop_over( + cells=cells, + out_counts=counts, + out_vals=vals, + ) + assert ret == 1 + assert np.all(counts == [2, 3, 4]) + assert np.all(vals == [1, 8, 0, 0, 9, 3, 2, 0, 4, 5, 6, 7]) diff --git a/xobjects/capi.py b/xobjects/capi.py index 622803f..4be67e2 100644 --- a/xobjects/capi.py +++ b/xobjects/capi.py @@ -103,9 +103,10 @@ def get_layers(parts): def int_from_obj(offset, conf): - inttype = gen_pointer(conf.get("inttype", "int64_t") + "*", conf) - chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) - return f"*({inttype})(({chartype}) obj+{offset})" + """Generate code to read the integer at location obj + offset (bytes).""" + int_pointer_type = gen_pointer(conf.get("inttype", "int64_t") + "*", conf) + char_pointer_type = gen_pointer(conf.get("chartype", "char") + "*", conf) + return f"*({int_pointer_type})(({char_pointer_type}) obj+{offset})" def Field_get_c_offset(self, conf): @@ -146,7 +147,7 @@ def Index_get_c_offset(part, conf, icount): out.append(f" offset+={soffset};") else: lookup_field_offset = f"offset+{soffset}" - out.append(f" offset={int_from_obj(lookup_field_offset, conf)};") + out.append(f" offset+={int_from_obj(lookup_field_offset, conf)};") return out @@ -206,16 +207,17 @@ def gen_fun_kernel(cls, path, action, const, extra, ret, add_nindex=False): def gen_c_pointed(target: Arg, conf): size = gen_c_size_from_arg(target, conf) ret = gen_c_type_from_arg(target, conf) + if target.pointer or is_compound(target.atype) or is_string(target.atype): chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) return f"({ret})(({chartype}) obj+offset)" - else: - rettype = gen_pointer(ret + "*", conf) - if size == 1: - return f"*(({rettype}) obj+offset)" - else: - chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) - return f"*({rettype})(({chartype}) obj+offset)" + + rettype = gen_pointer(ret + "*", conf) + if size == 1: + return f"*(({rettype}) obj+offset)" + + chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) + return f"*({rettype})(({chartype}) obj+offset)" def gen_method_get(cls, path, conf): @@ -507,7 +509,7 @@ def methods_from_path(cls, path, conf): if is_array(lasttype): out.append(gen_method_len(cls, path, conf)) - # out.append(gen_method_shape(cls, path, conf)) + out.append(gen_method_shape(cls, path, conf)) # out.append(gen_method_nd(cls, path, conf)) # out.append(gen_method_strides(cls, path, conf)) # out.append(gen_method_getpos(cls, path, conf)) From 81c33ebbffd735d5e026bb651160c2dbbf34993d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 13 Nov 2025 09:17:11 +0100 Subject: [PATCH 5/7] Implement the get_shape and get_nd methods in the C API --- tests/test_capi.py | 216 ++++++++++++++++++++++++++------------ 
tests/test_common.py | 20 ++-- xobjects/capi.py | 71 ++++++++++++- xobjects/headers/common.h | 2 +- 4 files changed, 228 insertions(+), 81 deletions(-) diff --git a/tests/test_capi.py b/tests/test_capi.py index 1891072..096750d 100644 --- a/tests/test_capi.py +++ b/tests/test_capi.py @@ -158,6 +158,70 @@ def test_array_dynamic_type_init_get_set(array_cls, example_shape): assert arr[ii].field1[idx_in_field] == 13 * vv +@for_all_test_contexts +@pytest.mark.parametrize( + "array_type", + [ + xo.UInt64[3, 5, 7], + xo.UInt64[:, :, :], + xo.UInt64[:, 5, :], + ], +) +def test_array_get_shape(test_context, array_type): + source = """ + #include "xobjects/headers/common.h" + + GPUKERN void get_nd_and_shape( + ARRAY_TYPE arr, + GPUGLMEM int64_t* out_nd, + GPUGLMEM int64_t* out_shape + ) { + *out_nd = ARRAY_TYPE_nd(arr); + ARRAY_TYPE_shape(arr, out_shape); + } + """.replace( + "ARRAY_TYPE", array_type.__name__ + ) + + kernels = { + "get_nd_and_shape": xo.Kernel( + c_name="get_nd_and_shape", + args=[ + xo.Arg(array_type, name="arr"), + xo.Arg(xo.Int64, pointer=True, name="out_nd"), + xo.Arg(xo.Int64, pointer=True, name="out_shape"), + ], + ), + } + + test_context.add_kernels( + sources=[source], + kernels=kernels, + ) + + instance = array_type( + np.array(range(3 * 5 * 7)).reshape((3, 5, 7)), + _context=test_context, + ) + + expected_nd = 3 + result_nd = test_context.zeros((1,), dtype=np.int64) + + expected_shape = [3, 5, 7] + result_shape = test_context.zeros((expected_nd,), dtype=np.int64) + + test_context.kernels.get_nd_and_shape( + arr=instance, + out_nd=result_nd, + out_shape=result_shape, + ) + + assert result_nd[0] == expected_nd + assert result_shape[0] == expected_shape[0] + assert result_shape[1] == expected_shape[1] + assert result_shape[2] == expected_shape[2] + + def test_struct1(): kernels = Struct1._gen_kernels() ctx = xo.ContextCpu() @@ -539,47 +603,47 @@ def test_getp1_dyn_length_dyn_type_string_array(): assert ord(ffi.cast("char *", s2)[8 + ii]) == ch -def test_gpu_api(): - for ctx in xo.context.get_test_contexts(): - src_code = """ - /*gpufun*/ - void myfun(double x, double y, - double* z){ - z[0] = x * y; - } +@for_all_test_contexts +def test_gpu_api(test_context): + src_code = """ + /*gpufun*/ + void myfun(double x, double y, + double* z){ + z[0] = x * y; + } - /*gpukern*/ - void my_mul(const int n, - /*gpuglmem*/ const double* x1, - /*gpuglmem*/ const double* x2, - /*gpuglmem*/ double* y) { - int tid = 0 //vectorize_over tid n - double z; - myfun(x1[tid], x2[tid], &z); - y[tid] = z; - //end_vectorize - } - """ - - kernel_descriptions = { - "my_mul": xo.Kernel( - args=[ - xo.Arg(xo.Int32, name="n"), - xo.Arg(xo.Float64, pointer=True, const=True, name="x1"), - xo.Arg(xo.Float64, pointer=True, const=True, name="x2"), - xo.Arg(xo.Float64, pointer=True, const=False, name="y"), - ], - n_threads="n", - ), + /*gpukern*/ + void my_mul(const int n, + /*gpuglmem*/ const double* x1, + /*gpuglmem*/ const double* x2, + /*gpuglmem*/ double* y) { + int tid = 0 //vectorize_over tid n + double z; + myfun(x1[tid], x2[tid], &z); + y[tid] = z; + //end_vectorize } + """ - ctx.add_kernels( - sources=[src_code], - kernels=kernel_descriptions, - save_source_as=None, - compile=True, - extra_classes=[xo.String[:]], - ) + kernel_descriptions = { + "my_mul": xo.Kernel( + args=[ + xo.Arg(xo.Int32, name="n"), + xo.Arg(xo.Float64, pointer=True, const=True, name="x1"), + xo.Arg(xo.Float64, pointer=True, const=True, name="x2"), + xo.Arg(xo.Float64, pointer=True, const=False, name="y"), + ], + n_threads="n", + ), 
+ } + + test_context.add_kernels( + sources=[src_code], + kernels=kernel_descriptions, + save_source_as=None, + compile=True, + extra_classes=[xo.String[:]], + ) @for_all_test_contexts @@ -595,7 +659,9 @@ class Cells(xo.Struct): ids = xo.Int64[:] particles = xo.Int64[:][:] - cells = Cells(ids=cell_ids, particles=particle_per_cell) + cells = Cells( + ids=cell_ids, particles=particle_per_cell, _context=test_context + ) # Data layout (displayed as uint64): # @@ -627,23 +693,24 @@ class Cells(xo.Struct): src = r""" #include "xobjects/headers/common.h" - int MAX_PARTICLES = 4; - int MAX_CELLS = 3; + static const int MAX_PARTICLES = 4; + static const int MAX_CELLS = 3; - GPUKERN - uint8_t loop_over(Cells cells, uint64_t* out_counts, uint64_t* out_vals) + GPUKERN void loop_over( + Cells cells, + GPUGLMEM uint64_t* out_counts, + GPUGLMEM uint64_t* out_vals, + GPUGLMEM uint8_t* success + ) { - uint8_t success = 1; int64_t num_cells = Cells_len_ids(cells); for (int64_t i = 0; i < num_cells; i++) { int64_t id = Cells_get_ids(cells, i); int64_t count = Cells_len1_particles(cells, i); - printf("Cell ID: %lld\n Particles (count %lld): ", id, count); - if (i >= MAX_CELLS) { - success = 0; + *success = 0; continue; } @@ -654,19 +721,23 @@ class Cells(xo.Struct): VECTORIZE_OVER(j, num_particles); int64_t val = ArrNInt64_get(particles, j); - printf("%lld ", val); if (j >= MAX_PARTICLES) { - success = 0; - continue; + *success = 0; + } else { + out_vals[i * MAX_PARTICLES + j] = val; } - - out_vals[i * MAX_PARTICLES + j] = val; END_VECTORIZE; - printf("\n"); } - fflush(stdout); - return success; + } + + GPUKERN void kernel_Cells_get_particles( + Cells obj, + int64_t i0, + int64_t i1, + GPUGLMEM int64_t* out + ) { + *out = Cells_get_particles(obj, i0, i1); } """ @@ -676,33 +747,46 @@ class Cells(xo.Struct): xo.Arg(Cells, name="cells"), xo.Arg(xo.UInt64, pointer=True, name="out_counts"), xo.Arg(xo.UInt64, pointer=True, name="out_vals"), + xo.Arg(xo.UInt8, pointer=True, name="success"), ], - n_threads="n", - ret=xo.Arg(xo.UInt8), - ) + n_threads=4, + ), + "kernel_Cells_get_particles": xo.Kernel( + args=[ + xo.Arg(Cells, name="obj"), + xo.Arg(xo.Int64, name="i0"), + xo.Arg(xo.Int64, name="i1"), + xo.Arg(xo.Int64, pointer=True, name="out"), + ], + ), } - kernels.update(Cells._gen_kernels()) test_context.add_kernels( sources=[src], kernels=kernels, ) - counts = np.zeros(len(cell_ids), dtype=np.uint64) - vals = np.zeros(12, dtype=np.uint64) + counts = test_context.zeros(len(cell_ids), dtype=np.uint64) + vals = test_context.zeros(12, dtype=np.uint64) + success = test_context.zeros((1,), dtype=np.uint8) + 1 for i, _ in enumerate(particle_per_cell): for j, expected in enumerate(particle_per_cell[i]): - result = test_context.kernels.Cells_get_particles( - obj=cells, i0=i, i1=j + result = test_context.zeros(shape=(1,), dtype=np.int64) + test_context.kernels.kernel_Cells_get_particles( + obj=cells, i0=i, i1=j, out=result ) - assert result == expected + assert result[0] == expected - ret = test_context.kernels.loop_over( + test_context.kernels.loop_over( cells=cells, out_counts=counts, out_vals=vals, + success=success, ) - assert ret == 1 + counts = test_context.nparray_from_context_array(counts) + vals = test_context.nparray_from_context_array(vals) + + assert success[0] == 1 assert np.all(counts == [2, 3, 4]) assert np.all(vals == [1, 8, 0, 0, 9, 3, 2, 0, 4, 5, 6, 7]) diff --git a/tests/test_common.py b/tests/test_common.py index 91eb7d2..c1a741c 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -16,19 
+16,15 @@ def test_common_atomicadd(test_context): #include "xobjects/headers/common.h" #include "xobjects/headers/atomicadd.h" - GPUKERN - double test_atomic_add() + GPUKERN void test_atomic_add(GPUGLMEM double* out, int32_t iterations) { - int iterations = 1000; - double sum = 0; VECTORIZE_OVER(i, iterations); // If on CPU do some work to avoid the loop being optimized out #if defined(XO_CONTEXT_CPU_OPENMP) usleep(10); #endif - atomicAdd(&sum, 1.0); + atomicAdd(out, 1.0); END_VECTORIZE; - return sum; } """ @@ -45,15 +41,21 @@ def test_common_atomicadd(test_context): sources=[src], kernels={ "test_atomic_add": xo.Kernel( - args=[], + c_name="test_atomic_add", + args=[ + xo.Arg(xo.Float64, pointer=True, name="out"), + xo.Arg(xo.Int32, name="iterations"), + ], n_threads=n_threads, - ret=xo.Arg(xo.Float64), ) }, ) expected = 1000 - result = test_context.kernels.test_atomic_add() + result = np.array([0], dtype=np.float64) + result_ctx = test_context.nparray_to_context_array(result) + test_context.kernels.test_atomic_add(out=result_ctx, iterations=expected) + result = test_context.nparray_from_context_array(result_ctx) assert result == expected diff --git a/xobjects/capi.py b/xobjects/capi.py index 4be67e2..ec09ea6 100644 --- a/xobjects/capi.py +++ b/xobjects/capi.py @@ -5,7 +5,7 @@ from .context import Kernel, Arg -from .scalar import Int64, Void, Int8, is_scalar +from .scalar import UInt32, Int64, Void, is_scalar from .struct import is_field, is_struct from .array import is_index, is_array from .ref import is_unionref, is_ref @@ -366,13 +366,74 @@ def gen_method_size(cls, path, conf): def gen_method_shape(cls, path, conf): - "return shape in an array" - return None, None + array_type = path[-1] + + out_shape_arg = Arg(Int64, name="out_shape", pointer=True) + + kernel = gen_fun_kernel( + cls, + path, + const=False, + action="shape", + extra=[out_shape_arg], + ret=None, + add_nindex=True, + ) + decl = gen_c_decl_from_kernel(kernel, conf) + + lst = [decl + "{"] + + nd = len(array_type._shape) + + if array_type._is_static_shape: + terms = [str(dim) for dim in array_type._shape] + else: + lst.append(gen_method_offset(path, conf)) + arrarg = Arg(Int64, pointer=True) + pointed = gen_c_pointed(arrarg, conf) + typearr = gen_pointer("int64_t*", conf) + lst.append(f" {typearr} arr = {pointed};") + dim_len_idx = 1 + terms = [] + for dim_len in array_type._shape: + if dim_len: + terms.append(str(dim_len)) + else: + terms.append(f"arr[{dim_len_idx}]") + dim_len_idx += 1 + + for dim_idx in range(nd): + lst.append(f" out_shape[{dim_idx}] = {terms[dim_idx]};") + + lst.append("}") + + return "\n".join(lst), kernel def gen_method_nd(cls, path, conf): "return length of shape" - return None, None + array_type = path[-1] + + retarg = Arg(UInt32) + + kernel = gen_fun_kernel( + cls, + path, + const=False, + action="nd", + extra=[], + ret=retarg, + add_nindex=True, + ) + decl = gen_c_decl_from_kernel(kernel, conf) + + lst = [decl + "{"] + + nd = len(array_type._shape) + lst.append(f" return {nd};") + lst.append("}") + + return "\n".join(lst), kernel def gen_method_strides(cls, path, conf): @@ -510,7 +571,7 @@ def methods_from_path(cls, path, conf): if is_array(lasttype): out.append(gen_method_len(cls, path, conf)) out.append(gen_method_shape(cls, path, conf)) - # out.append(gen_method_nd(cls, path, conf)) + out.append(gen_method_nd(cls, path, conf)) # out.append(gen_method_strides(cls, path, conf)) # out.append(gen_method_getpos(cls, path, conf)) diff --git a/xobjects/headers/common.h b/xobjects/headers/common.h 
index f6fd35d..6c7f04c 100644 --- a/xobjects/headers/common.h +++ b/xobjects/headers/common.h @@ -52,7 +52,7 @@ #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ { \ - int64_t INDEX_NAME = get_global_id(0); + int64_t INDEX_NAME = get_global_id(0); \ if (INDEX_NAME < (COUNT)) { \ #define END_VECTORIZE \ From a8a58b6763aa7b180e9f35fe948c1f18644d7f62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Tue, 25 Nov 2025 13:23:54 +0100 Subject: [PATCH 6/7] Cache cl.Context on OpenCL --- xobjects/context.py | 15 ++++++++------- xobjects/context_pyopencl.py | 17 ++++++++++++++--- xobjects/general.py | 3 +-- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/xobjects/context.py b/xobjects/context.py index 0ae2b54..effd95a 100644 --- a/xobjects/context.py +++ b/xobjects/context.py @@ -668,14 +668,15 @@ def get_context_from_string(ctxstr): if ctxstr is None: return xo.ContextCpu() + + ll = ctxstr.split(":") + if len(ll) <= 1: + ctxtype = ll[0] + option = [] else: - ll = ctxstr.split(":") - if len(ll) <= 1: - ctxtype = ll[0] - option = [] - else: - ctxtype, options = ctxstr.split(":") - option = options.split(",") + ctxtype, options = ctxstr.split(":") + option = options.split(",") + if ctxtype == "ContextCpu": if len(option) == 0: return xo.ContextCpu() diff --git a/xobjects/context_pyopencl.py b/xobjects/context_pyopencl.py index b48e027..c61e21e 100644 --- a/xobjects/context_pyopencl.py +++ b/xobjects/context_pyopencl.py @@ -81,6 +81,8 @@ def _build_view(cls, a): class ContextPyopencl(XContext): + context_cache = {} + @property def nplike_array_type(self): return cla.Array @@ -128,20 +130,29 @@ def __init__( super().__init__() # TODO assume one device only - if device is None: + if device in self.context_cache: + self.platform, self.device, self.context = self.context_cache[ + device + ] + elif device is None: self.context = cl.create_some_context(interactive=False) self.device = self.context.devices[0] self.platform = self.device.platform else: if isinstance(device, str): - platform, device = map(int, device.split(".")) + platform, device_ = map(int, device.split(".")) self.platform = cl.get_platforms()[platform] - self.device = self.platform.get_devices()[device] + self.device = self.platform.get_devices()[device_] else: self.device = device self.platform = device.platform self.context = cl.Context([self.device]) + self.context_cache[device] = ( + self.platform, + self.device, + self.context, + ) self.queue = cl.CommandQueue(self.context) diff --git a/xobjects/general.py b/xobjects/general.py index 26c4b01..5ac08b2 100644 --- a/xobjects/general.py +++ b/xobjects/general.py @@ -18,9 +18,8 @@ def __call__(self, *args, **kwargs): def assert_allclose(a, b, rtol=0, atol=0, max_outliers=0): - try: - if a == b: # if this passes we return without raising + if a == b: # if this passes we return without raising return except: pass From af00fb41bad45d1e872290120e0e731138fdecd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 26 Nov 2025 17:55:16 +0100 Subject: [PATCH 7/7] Update pyproject.toml: PEP639, bump Python version --- pyproject.toml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e5df678..f50a370 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,19 @@ [build-system] -requires = ["setuptools>=61.0"] +requires = [ + "setuptools>=77.0" +] build-backend = "setuptools.build_meta" [project] name = "xobjects" dynamic = ["version"] description = "In-memory serialization 
and code generator for CPU and GPU"
-readme = ""
+readme = "README.md"
 authors = [
     { name = "Riccardo De Maria", email = "riccardo.de.maria@cern.ch" }
 ]
-license = { text = "Apache 2.0" }
-requires-python = ">=3.7"
+license = "Apache-2.0"
+requires-python = ">=3.9"
 dependencies = [
     "numpy",
     "cffi",
@@ -41,3 +43,8 @@ include = '\.pyi?$'
 
 [project.entry-points.xobjects]
 include = "xobjects"
+
+[tool.pytest.ini_options]
+markers = [
+    "context_dependent: marks test as one that depends on the execution context",
+]