From 1f8a92b8ad76b9e29b784788e3ad014a775160ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 7 Aug 2025 10:18:18 +0200 Subject: [PATCH 01/27] Move some macros that should be in xobjects to it; black --- .gitignore | 1 + Architecture.md | 6 +- LICENSE | 2 +- pyproject.toml | 41 +++++++++++++- tests/test_common.py | 54 ++++++++++++++++++ tests/test_shared_memory.py | 2 +- update_cprght_statement.py | 4 +- xobjects/headers/atomicadd.h | 55 ++++++++++++++++++ xobjects/headers/common.h | 104 +++++++++++++++++++++++++++++++++++ 9 files changed, 261 insertions(+), 8 deletions(-) create mode 100644 tests/test_common.py create mode 100644 xobjects/headers/atomicadd.h create mode 100644 xobjects/headers/common.h diff --git a/.gitignore b/.gitignore index 59d905f..5284f58 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ cov_html .vscode .pytest_cache .coverage +.idea *.c diff --git a/Architecture.md b/Architecture.md index 2615a0a..5e92f48 100644 --- a/Architecture.md +++ b/Architecture.md @@ -40,19 +40,19 @@ Buffer: Types can be composed of: - scalar: numbers, String -- compound: Struct, Array, Ref, UnionRef +- compound: Struct, Array, Ref, UnionRef ### Scalars - examples: Float64, Int64, ... - create: Float64(3.14) - memory layout - - data + - data ### String: - create: String(string_or_int) - memory layout - size - - data + - data ### Struct diff --git a/LICENSE b/LICENSE index f49a4e1..261eeb9 100644 --- a/LICENSE +++ b/LICENSE @@ -198,4 +198,4 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. diff --git a/pyproject.toml b/pyproject.toml index 7959e2a..e5df678 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,43 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "xobjects" +dynamic = ["version"] +description = "In-memory serialization and code generator for CPU and GPU" +readme = "" +authors = [ + { name = "Riccardo De Maria", email = "riccardo.de.maria@cern.ch" } +] +license = { text = "Apache 2.0" } +requires-python = ">=3.7" +dependencies = [ + "numpy", + "cffi", + "scipy" +] +[project.urls] +Homepage = "https://xsuite.readthedocs.io/" +"Bug Tracker" = "https://github.com/xsuite/xsuite/issues" +Documentation = "https://xsuite.readthedocs.io/" +"Source Code" = "https://github.com/xsuite/xobjects" +"Download" = "https://pypi.python.org/pypi/xobjects" + +[project.optional-dependencies] +tests = ["pytest", "pytest-mock"] + +[tool.setuptools.packages.find] +where = ["."] +include = ["xobjects"] + +[tool.setuptools.dynamic] +version = {attr = "xobjects._version.__version__"} + [tool.black] line-length = 79 -target-version = ['py36', 'py37', 'py38'] +target-version = ['py310', 'py311', 'py312'] include = '\.pyi?$' + +[project.entry-points.xobjects] +include = "xobjects" diff --git a/tests/test_common.py b/tests/test_common.py new file mode 100644 index 0000000..4a1ac22 --- /dev/null +++ b/tests/test_common.py @@ -0,0 +1,54 @@ +# copyright ################################# # +# This file is part of the Xobjects Package. # +# Copyright (c) CERN, 2025. 
# +# ########################################### # + +import xobjects as xo +from xobjects.test_helpers import for_all_test_contexts + + +@for_all_test_contexts +def test_common_atomicadd(test_context): + src = r""" + #include + + GPUKERN + double test_atomic_add() + { + int iterations = 1000; + double sum = 0; + VECTORIZE_OVER(i, iterations); + // If on CPU do some work to avoid the loop being optimized out + #if defined(XO_CONTEXT_CPU_OPENMP) + usleep(10); + #endif + atomicAdd(&sum, 1.0); + END_VECTORIZE; + return sum; + } + """ + + n_threads = 1 + if type(test_context).__name__ in {"ContextCupy", "ContextPyopencl"}: + n_threads = 1000 + elif ( + test_context.omp_num_threads == "auto" + or test_context.omp_num_threads > 1 + ): + n_threads = 8 + + test_context.add_kernels( + sources=[src], + kernels={ + "test_atomic_add": xo.Kernel( + args=[], + n_threads=n_threads, + ret=xo.Arg(xo.Float64), + ) + }, + ) + + expected = 1000 + result = test_context.kernels.test_atomic_add() + + assert result == expected diff --git a/tests/test_shared_memory.py b/tests/test_shared_memory.py index 4f2e6fc..31156d6 100644 --- a/tests/test_shared_memory.py +++ b/tests/test_shared_memory.py @@ -53,7 +53,7 @@ class TestElement(xo.HybridClass): // sum s[0] += s[1] if (tid == 0){ - sdata[tid] += sdata[tid + 1]; + sdata[tid] += sdata[tid + 1]; // write sum from shared to global mem atomicAdd(&result[tid], sdata[tid]); diff --git a/update_cprght_statement.py b/update_cprght_statement.py index c3db6d0..0e9c049 100644 --- a/update_cprght_statement.py +++ b/update_cprght_statement.py @@ -6,8 +6,8 @@ import os copyright_statement = """copyright ################################# -This file is part of the Xobjects Package. -Copyright (c) CERN, 2021. +This file is part of the Xobjects Package. +Copyright (c) CERN, 2021. ###########################################""" config = [ diff --git a/xobjects/headers/atomicadd.h b/xobjects/headers/atomicadd.h new file mode 100644 index 0000000..c3d047b --- /dev/null +++ b/xobjects/headers/atomicadd.h @@ -0,0 +1,55 @@ +// copyright ################################# // +// This file is part of the Xfields Package. // +// Copyright (c) CERN, 2021. // +// ########################################### // + +#ifndef _ATOMICADD_H_ +#define _ATOMICADD_H_ + +/* + Atomic add function (double type) for different contexts. + Following the blueprint of CUDA's atomicAdd function, the return + value is the old value of the address before the addition. 
+*/ + +#if defined(XO_CONTEXT_CPU_SERIAL) + inline double atomicAdd(double *addr, double val) + { + double old_val = *addr; + *addr = *addr + val; + return old_val; + } +#elif defined(XO_CONTEXT_CPU_OPENMP) + inline double atomicAdd(double *addr, double val) + { + double old_val = *addr; + #pragma omp atomic + *addr += val; + return old_val; + } +#elif defined(XO_CONTEXT_CL) + #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable + inline double atomicAdd(volatile __global double *addr, double val) + { + union { + long u64; + double f64; + } next, expected, current; + current.f64 = *addr; + do { + expected.f64 = current.f64; + next.f64 = expected.f64 + val; + current.u64 = atom_cmpxchg( + (volatile __global long *)addr, + (long) expected.u64, + (long) next.u64); + } while( current.u64 != expected.u64 ); + return current.f64; + } +#elif defined(XO_CONTEXT_CUDA) + // CUDA already provides this +#else + #error "Atomic add not implemented for this context" +#endif + +#endif // _ATOMICADD_H_ diff --git a/xobjects/headers/common.h b/xobjects/headers/common.h new file mode 100644 index 0000000..e4196c8 --- /dev/null +++ b/xobjects/headers/common.h @@ -0,0 +1,104 @@ +#ifndef XOBJECTS_COMMON_H +#define XOBJECTS_COMMON_H + +#include "xobjects/headers/atomicadd.h" + +/* + Common macros for vectorization and parallelization, as well as common + arithmetic operations. +*/ + +#ifdef XO_CONTEXT_CPU_SERIAL + // We are on CPU, without OpenMP + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ + for (int64_t INDEX_NAME = 0; INDEX_NAME < (COUNT); INDEX_NAME++) { + + #define END_VECTORIZE \ + } +#endif // XO_CONTEXT_CPU_SERIAL + +#ifdef XO_CONTEXT_CPU_OPENMP + // We are on CPU with the OpenMP context switched on + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ + _Pragma("omp parallel for") \ + for (int64_t INDEX_NAME = 0; INDEX_NAME < (COUNT); INDEX_NAME++) { + + #define END_VECTORIZE \ + } + +#endif // XO_CONTEXT_CPU_OPENMP + + +#ifdef XO_CONTEXT_CUDA + // We are on a CUDA GPU + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) { \ + int64_t INDEX_NAME = blockDim.x * blockIdx.x + threadIdx.x; \ + if (INDEX_NAME < (COUNT)) { + + #define END_VECTORIZE \ + } \ + } +#endif // XO_CONTEXT_CUDA + + +#ifdef XO_CONTEXT_CL + // We are on an OpenCL GPU + + #define VECTORIZE_OVER(INDEX_NAME, COUNT) \ + { \ + int64_t INDEX_NAME = get_global_id(0); + if (INDEX_NAME < (COUNT)) { \ + + #define END_VECTORIZE \ + } \ + } +#endif // XO_CONTEXT_CL + + +/* + Qualifier keywords for GPU and optimisation +*/ + +#ifdef XO_CONTEXT_CPU // for both serial and OpenMP + #define GPUKERN + #define GPUFUN static inline + #define GPUGLMEM + #define RESTRICT restrict +#endif + + +#ifdef XO_CONTEXT_CUDA + #define GPUKERN __global__ + #define GPUFUN __device__ + #define GPUGLMEM + #define RESTRICT +#endif // XO_CONTEXT_CUDA + + +#ifdef XO_CONTEXT_CL + #define GPUKERN __kernel + #define GPUFUN + #define GPUGLMEM __global + #define RESTRICT +#endif // XO_CONTEXT_CL + + +/* + Common maths-related macros +*/ + +#define POW2(X) ((X)*(X)) +#define POW3(X) ((X)*(X)*(X)) +#define POW4(X) ((X)*(X)*(X)*(X)) +#define NONZERO(X) ((X) != 0.0) +#define NONZERO_TOL(X, TOL) (fabs((X)) > (TOL)) + + +#ifndef VECTORIZE_OVER +#error "Unknown context, or the expected context (XO_CONTEXT_*) flag undefined. Try updating Xobjects?" 
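+// (This branch is reached only when none of the XO_CONTEXT_* blocks above
+// defined VECTORIZE_OVER, i.e. the build passed no context flag.)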
+#endif + +#endif // XOBJECTS_COMMON_H From 15af3089d2f5fc5018ff4b58358834727d170702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Mon, 10 Nov 2025 15:39:46 +0100 Subject: [PATCH 02/27] Move the new implementation of atomic add from Xtrack --- tests/test_common.py | 125 +++++++++++- xobjects/headers/atomicadd.h | 385 +++++++++++++++++++++++++++++++---- xobjects/headers/common.h | 8 +- 3 files changed, 470 insertions(+), 48 deletions(-) diff --git a/tests/test_common.py b/tests/test_common.py index 4a1ac22..91eb7d2 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -4,13 +4,17 @@ # ########################################### # import xobjects as xo +import numpy as np +import pytest + from xobjects.test_helpers import for_all_test_contexts @for_all_test_contexts def test_common_atomicadd(test_context): src = r""" - #include + #include "xobjects/headers/common.h" + #include "xobjects/headers/atomicadd.h" GPUKERN double test_atomic_add() @@ -52,3 +56,122 @@ def test_common_atomicadd(test_context): result = test_context.kernels.test_atomic_add() assert result == expected + + +@for_all_test_contexts +@pytest.mark.parametrize( + "overload", [True, False], ids=["overload", "no_overload"] +) +@pytest.mark.parametrize( + "ctype", + [ + xo.Int8, + xo.Int16, + xo.Int32, + xo.Int64, + xo.UInt8, + xo.UInt16, + xo.UInt32, + xo.UInt64, + xo.Float32, + xo.Float64, + ], +) +def test_atomic(overload, ctype, test_context): + if overload: + func_name = "atomicAdd" + else: + func_name = f'atomicAdd_{ctype.__name__.lower()[0]}{ctype.__name__.split("t")[1]}' + + class TestAtomic(xo.Struct): + val = ctype + _extra_c_sources = [ + f""" + #include "xobjects/headers/common.h" + #include "xobjects/headers/atomicadd.h" + + GPUKERN + void run_atomic_test(TestAtomic el, GPUGLMEM {ctype._c_type}* increments, + GPUGLMEM {ctype._c_type}* retvals, int length) {{ + VECTORIZE_OVER(ii, length); + GPUGLMEM {ctype._c_type}* val = TestAtomic_getp_val(el); + {ctype._c_type} ret = {func_name}(val, increments[ii]); + retvals[ii] = ret; + END_VECTORIZE; + }} + """ + ] + + kernels = { + "run_atomic_test": xo.Kernel( + c_name="run_atomic_test", + args=[ + xo.Arg(TestAtomic, name="el"), + xo.Arg(ctype, pointer=True, name="increments"), + xo.Arg(ctype, pointer=True, name="retvals"), + xo.Arg(xo.Int32, name="length"), + ], + n_threads="length", + ) + } + + atomic = TestAtomic(_context=test_context, val=0) + assert atomic.val == 0 + + test_context.add_kernels(kernels=kernels) + + # Test with all increments = 1, so we can check the return values easily. + num_steps = 10000 + if ctype.__name__.startswith("Int") or ctype.__name__.startswith("Uint"): + # Less steps to avoid overflow + num_steps = min(num_steps, 2 ** (8 * ctype._size - 1) - 1) + increments = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) + 1 + retvals = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) + test_context.kernels.run_atomic_test( + el=atomic, increments=increments, retvals=retvals, length=num_steps + ) + assert atomic.val == num_steps + retvals = np.sort(test_context.nparray_from_context_array(retvals)) + assert np.allclose( + retvals, + np.arange(num_steps, dtype=ctype._dtype), + atol=1.0e-15, + rtol=1.0e-15, + ) + + # Test with random increments, where we now only can check the total sum + # (retvals can be anything). Watch out: overflow is undefined behaviour, + # except for unsigned integers, so we skip this test for signed integers. 
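+    # (Example: for uint8, increments summing to 300 must wrap to
+    # 300 % 256 == 44; the modulo assertion below relies on this.)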
+ atomic.val = 0 + retvals = test_context.zeros(shape=(num_steps,), dtype=ctype._dtype) + if ctype.__name__.startswith("Uint"): + low = 0 + high = 2 ** (8 * ctype._size) - 1 + increments = np.random.randint( + low, high + 1, size=num_steps, dtype=ctype._dtype + ) + increments = test_context.nparray_to_context_array(increments) + test_context.kernels.run_atomic_test( + el=atomic, increments=increments, retvals=retvals, length=num_steps + ) + increments = test_context.nparray_from_context_array(increments) + assert atomic.val == ( + np.sum(increments).item() % (2 ** (8 * ctype._size)) + ) + + elif ctype.__name__.startswith("Float"): + increments = np.zeros(shape=(num_steps,), dtype=ctype._dtype) + increments += np.random.uniform(0, 10, size=num_steps) + increments = test_context.nparray_to_context_array(increments) + test_context.kernels.run_atomic_test( + el=atomic, increments=increments, retvals=retvals, length=num_steps + ) + increments = test_context.nparray_from_context_array(increments) + if ctype == xo.Float32: + assert np.isclose( + atomic.val, np.sum(increments), atol=10.0, rtol=1.0e-4 + ) + else: + assert np.isclose( + atomic.val, np.sum(increments), atol=1.0e-6, rtol=1.0e-12 + ) diff --git a/xobjects/headers/atomicadd.h b/xobjects/headers/atomicadd.h index c3d047b..6782705 100644 --- a/xobjects/headers/atomicadd.h +++ b/xobjects/headers/atomicadd.h @@ -1,55 +1,350 @@ // copyright ################################# // -// This file is part of the Xfields Package. // -// Copyright (c) CERN, 2021. // +// This file is part of the Xtrack Package. // +// Copyright (c) CERN, 2025. // // ########################################### // #ifndef _ATOMICADD_H_ #define _ATOMICADD_H_ -/* - Atomic add function (double type) for different contexts. - Following the blueprint of CUDA's atomicAdd function, the return - value is the old value of the address before the addition. -*/ - -#if defined(XO_CONTEXT_CPU_SERIAL) - inline double atomicAdd(double *addr, double val) - { - double old_val = *addr; - *addr = *addr + val; - return old_val; - } -#elif defined(XO_CONTEXT_CPU_OPENMP) - inline double atomicAdd(double *addr, double val) - { - double old_val = *addr; - #pragma omp atomic - *addr += val; - return old_val; - } +#include "xobjects/headers/common.h" + + +// Ensure no conflicting atomicAdd macro is defined earlier for CPU or OpenCL +#ifndef XO_CONTEXT_CUDA +#ifdef atomicAdd + #warning "Warning: atomicAdd macro already defined, undefining it." + #undef atomicAdd +#endif // atomicAdd +#endif // XO_CONTEXT_CUDA + + +// ################################# // +// ### CPU Contexts ### // +// ################################# // + +#ifdef XO_CONTEXT_CPU +#include + +#ifdef XO_CONTEXT_CPU_OPENMP + #ifndef _OPENMP + #error "XO_CONTEXT_CPU_OPENMP set, but compiled without -fopenmp" + #endif + // OpenMP atomic capture gives us read+write atomically + #define OMP_ATOMIC_CAPTURE _Pragma("omp atomic capture") +#else + #define OMP_ATOMIC_CAPTURE // No OpenMP: non-atomic fallback +#endif // XO_CONTEXT_CPU_OPENMP + +// Macro to define atomicAdd for different types, will be overloaded via _Generic. 
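+// For example, DEF_ATOMIC_ADD(double, f64) expands (roughly) to:
+//
+//     GPUFUN double atomicAdd_f64(GPUGLMEM double *addr, double val) {
+//         double old;
+//         const double inc = (double)val;
+//         OMP_ATOMIC_CAPTURE
+//         { old = *addr; *addr = *addr + inc; }
+//         return old;
+//     }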
+#define DEF_ATOMIC_ADD(T, SUF) \ +GPUFUN T atomicAdd_##SUF(GPUGLMEM T *addr, T val) { \ + T old; \ + const T inc = (T)val; \ + OMP_ATOMIC_CAPTURE \ + { old = *addr; *addr = *addr + inc; } \ + return old; \ +} + +DEF_ATOMIC_ADD(int8_t , i8) +DEF_ATOMIC_ADD(int16_t, i16) +DEF_ATOMIC_ADD(int32_t, i32) +DEF_ATOMIC_ADD(int64_t, i64) +DEF_ATOMIC_ADD(uint8_t , u8) +DEF_ATOMIC_ADD(uint16_t, u16) +DEF_ATOMIC_ADD(uint32_t, u32) +DEF_ATOMIC_ADD(uint64_t, u64) +DEF_ATOMIC_ADD(float , f32) +DEF_ATOMIC_ADD(double , f64) + +// Using _Generic to select the right function based on type (since C11). +// See https://en.cppreference.com/w/c/language/generic.html +#define atomicAdd(addr, val) _Generic((addr), \ + int8_t*: atomicAdd_i8, \ + int16_t*: atomicAdd_i16, \ + int32_t*: atomicAdd_i32, \ + int64_t*: atomicAdd_i64, \ + uint8_t*: atomicAdd_u8, \ + uint16_t*: atomicAdd_u16, \ + uint32_t*: atomicAdd_u32, \ + uint64_t*: atomicAdd_u64, \ + float*: atomicAdd_f32, \ + double*: atomicAdd_f64 \ +)(addr, (val)) +#endif // XO_CONTEXT_CPU + + +// ################################# // +// ### GPU Contexts ### // +// ################################# // + +/* -------------- Design notes for GPU atomics --------------------------------- +* We will go for a unified approach for CUDA and OpenCL, with macros where needed. +* We implement 8/16-bit atomic add by atomically CAS'ing the *containing* + * 32-bit word. Only the target byte/halfword lane is modified; the neighbor + * lane is preserved. This is linearisable: each successful CAS is one atomic + * RMW on the 32-bit word. + * Assumptions: little-endian lane layout (true on NVIDIA) and natural alignment + * of the 16-bit addresses (addr % 2 == 0). 8-bit has no alignment requirement. + * Return value matches CUDA semantics: the **old** value at *addr* (fetch-add). + * ---------------------------------------------------------------------------*/ + +// Info: a CAS function (Compare-And-Swap) takes three arguments: a pointer to +// the value to be modified, the expected old value, and the new value. The +// second argument helps to recognise if another thread has modified the value +// in the meantime. The function compares the value at the address with the +// expected values, and if they are the same, it writes the new value at the +// address. In any case, the function returns the actual old value at the +// address. If the returned value is different from the expected value, it +// means that another thread has modified the value in the meantime. + +// Define types and macros for CUDA and OpenCL +// ------------------------------------------- +#if defined(XO_CONTEXT_CUDA) + // CUDA compiler may not have , so define the types if needed. 
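+    // (The header meant above is <stdint.h>; NVRTC compiles without host
+    // headers, hence the explicit typedefs in the branch below.)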
+ #ifdef __CUDACC_RTC__ + // NVRTC (CuPy RawModule default) can’t see , so detect it via __CUDACC_RTC__ + typedef signed char int8_t; + typedef short int16_t; + typedef int int32_t; + typedef long long int64_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef unsigned long long uint64_t; + #else + // Alternatively, NVCC path is fine with host headers + #include + #endif // __CUDACC_RTC__ + #define GPUVOLATILE GPUGLMEM + #define __XT_CAS_U32(ptr, exp, val) atomicCAS((GPUVOLATILE uint32_t*)(ptr), (uint32_t)(exp), (uint32_t)(val)) + #define __XT_CAS_U64(ptr, exp, val) atomicCAS((GPUVOLATILE uint64_t*)(ptr), (uint64_t)(exp), (uint64_t)(val)) + #define __XT_AS_U32_FROM_F32(x) __float_as_uint(x) + #define __XT_AS_F32_FROM_U32(x) __uint_as_float(x) + #define __XT_AS_U64_FROM_F64(x) __double_as_longlong(x) + #define __XT_AS_F64_FROM_U64(x) __longlong_as_double(x) #elif defined(XO_CONTEXT_CL) + // It seems OpenCL already has the types from defined. + // typedef char int8_t; + // typedef short int16_t; + // typedef int int32_t; + // typedef long int64_t; + // typedef unsigned char uint8_t; + // typedef unsigned short uint16_t; + // typedef unsigned int uint32_t; + // typedef unsigned long uint64_t; + #define GPUVOLATILE GPUGLMEM volatile + #if __OPENCL_C_VERSION__ < 110 + // Map 1.0 "atom_*" names to 1.1+ "atomic_*" + #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + #define atomic_add atom_add + #define atomic_cmpxchg atom_cmpxchg + #endif #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable - inline double atomicAdd(volatile __global double *addr, double val) - { - union { - long u64; - double f64; - } next, expected, current; - current.f64 = *addr; + #pragma OPENCL EXTENSION cl_khr_fp64 : enable + #define __XT_CAS_U32(ptr, exp, val) atomic_cmpxchg((GPUVOLATILE uint32_t*)(ptr), (uint32_t)(exp), (uint32_t)(val)) + #define __XT_CAS_U64(ptr, exp, val) atom_cmpxchg((GPUVOLATILE uint64_t*)(ptr), (uint64_t)(exp), (uint64_t)(val)) + #define __XT_AS_U32_FROM_F32(x) as_uint(x) + #define __XT_AS_F32_FROM_U32(x) as_float(x) + #define __XT_AS_U64_FROM_F64(x) as_ulong(x) + #define __XT_AS_F64_FROM_U64(x) as_double(x) +#endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + +// Define atomic functions per type +// -------------------------------- +#if defined(XO_CONTEXT_CUDA) || defined(XO_CONTEXT_CL) + // Helper: compute (base 32-bit word pointer, shift, mask) for a byte in that word. + GPUFUN inline void __xt_lane8(GPUVOLATILE void* addr, GPUVOLATILE uint32_t** w, uint32_t* sh, uint32_t* mask){ + size_t a = (size_t)addr; + *w = (GPUVOLATILE uint32_t*)(a & ~(size_t)3); // word: align down to 4-byte boundary + *sh = (uint32_t)((a & (size_t)3) * 8U); // shift 0,8,16,24 depending on byte lane + *mask = (uint32_t)(0xFFu << *sh); + } + // Helper: same for a halfword (16-bit) in the containing 32-bit word. + GPUFUN inline void __xt_lane16(GPUVOLATILE void* addr, GPUVOLATILE uint32_t** w, uint32_t* sh, uint32_t* mask){ + size_t a = (size_t)addr; + *w = (GPUVOLATILE uint32_t*)(a & ~(size_t)3); // word: align down to 4-byte boundary + *sh = (uint32_t)((a & (size_t)2) ? 
16U : 0U); // shift0 or 16 depending on halfword + *mask = (uint32_t)(0xFFFFU << *sh); + } + + // ---------------- 8-bit: int8_t / uint8_t (CAS on 32-bit word) -------------- + GPUFUN int8_t atomicAdd_i8(GPUVOLATILE int8_t* addr, int8_t val){ + GPUVOLATILE uint32_t *w; + uint32_t sh, mask; + __xt_lane8(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; // byte, newbyte, newword do { - expected.f64 = current.f64; - next.f64 = expected.f64 + val; - current.u64 = atom_cmpxchg( - (volatile __global long *)addr, - (long) expected.u64, - (long) next.u64); - } while( current.u64 != expected.u64 ); - return current.f64; - } -#elif defined(XO_CONTEXT_CUDA) - // CUDA already provides this -#else - #error "Atomic add not implemented for this context" -#endif + assumed = old; + b = (assumed & mask) >> sh; // Extract current 8-bit lane + nb = (uint32_t)((uint8_t)b + (uint8_t)val); // Add in modulo-256 (two's complement) + nw = (assumed & ~mask) | ((nb & 0xFFU) << sh); // Merge back updated lane; leave neighbor lanes intact + // Try to publish; if someone raced us, retry with their value + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (int8_t)((assumed & mask) >> sh); + } + GPUFUN uint8_t atomicAdd_u8(GPUVOLATILE uint8_t* addr, uint8_t val){ + GPUVOLATILE uint32_t *w; + uint32_t sh, mask; + __xt_lane8(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; + do { + assumed = old; + b = (assumed & mask) >> sh; + nb = (uint32_t)(b + val); /* mod 256 */ + nw = (assumed & ~mask) | ((nb & 0xFFU) << sh); + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (uint8_t)((assumed & mask) >> sh); + } + + // ---------------- 16-bit: int16_t / uint16_t (CAS on 32-bit word) ----------- + GPUFUN int16_t atomicAdd_i16(GPUVOLATILE int16_t* addr, int16_t val){ + GPUVOLATILE uint32_t* w; + uint32_t sh, mask; + __xt_lane16(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; + do { + assumed = old; + b = (assumed & mask) >> sh; + nb = (uint32_t)((uint16_t)b + (uint16_t)val); + nw = (assumed & ~mask) | ((nb & 0xFFFFU) << sh); + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (int16_t)((assumed & mask) >> sh); + } + GPUFUN uint16_t atomicAdd_u16(GPUVOLATILE uint16_t* addr, uint16_t val){ + GPUVOLATILE uint32_t* w; + uint32_t sh, mask; + __xt_lane16(addr, &w, &sh, &mask); + uint32_t old = *w, assumed, b, nb, nw; + do { + assumed = old; + b = (assumed & mask) >> sh; + nb = (uint32_t)(b + val); + nw = (assumed & ~mask) | ((nb & 0xFFFFU) << sh); + old = __XT_CAS_U32(w, assumed, nw); + } while (old != assumed); + return (uint16_t)((assumed & mask) >> sh); + } + + // ---------------- 32-bit: int32_t / uint32_t (built-in) ----------- + GPUFUN int32_t atomicAdd_i32(GPUVOLATILE int32_t* addr, int32_t val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + return atomic_add(addr, val); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + GPUFUN uint32_t atomicAdd_u32(GPUVOLATILE uint32_t* addr, uint32_t val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + return atomic_add(addr, val); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + + // ---------------- 64-bit: int64_t / uint64_t (built-in or CAS on 64-bit word) ----------- + GPUFUN int64_t atomicAdd_i64(GPUVOLATILE int64_t* addr, int64_t val){ + uint64_t old, nw; + do { + old = *addr; + nw = old + val; + } while (__XT_CAS_U64((GPUVOLATILE uint64_t*)addr, old, nw) != old); + return old; + } + GPUFUN uint64_t 
atomicAdd_u64(GPUVOLATILE uint64_t* addr, uint64_t val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + return atom_add(addr, val); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + + // ---------------- 32-bit: float (built-in or CAS on 32-bit word) ----------- + GPUFUN float atomicAdd_f32(GPUVOLATILE float* addr, float val){ + #ifdef XO_CONTEXT_CUDA + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL + uint32_t old, nw; + do { + old = __XT_AS_U32_FROM_F32(*addr); + nw = __XT_AS_U32_FROM_F32(__XT_AS_F32_FROM_U32(old) + val); + } while (__XT_CAS_U32((GPUVOLATILE uint32_t*)addr, old, nw) != old); + return __XT_AS_F32_FROM_U32(old); + #endif // XO_CONTEXT_CUDA / XO_CONTEXT_CL + } + + // ---------------- 64-bit: float (built-in or CAS on 64-bit word) ----------- + GPUFUN double atomicAdd_f64(GPUVOLATILE double* addr, double val){ + #if __CUDA_ARCH__ >= 600 + return atomicAdd(addr, val); + #else // XO_CONTEXT_CL || __CUDA_ARCH__ < 600 + uint64_t old, nw; + do { + old = __XT_AS_U64_FROM_F64(*addr); + nw = __XT_AS_U64_FROM_F64(__XT_AS_F64_FROM_U64(old) + val); + } while (__XT_CAS_U64((GPUVOLATILE uint64_t*)addr, old, nw) != old); + return __XT_AS_F64_FROM_U64(old); + #endif // __CUDA_ARCH__ >= 600 / XO_CONTEXT_CL + } +#endif // defined(XO_CONTEXT_CUDA) || defined(XO_CONTEXT_CL) + +// Define the overloaded function +// ------------------------------ +#ifdef XO_CONTEXT_CUDA + // NVRTC (CuPy RawModule) usually compiles under extern "C". + // In C, function overloading is not possible, but we can cheat by doing it in + // C++ (with a different name to avoid clashes with the built-in atomicAdd). + // This function will then be remapped to atomicAdd() via a macro in C. + #ifdef __cplusplus + extern "C++" { + + GPUFUN int8_t xt_atomicAdd(GPUVOLATILE int8_t* p, int8_t v) { return atomicAdd_i8 (p, v); } + GPUFUN uint8_t xt_atomicAdd(GPUVOLATILE uint8_t* p, uint8_t v) { return atomicAdd_u8 (p, v); } + GPUFUN int16_t xt_atomicAdd(GPUVOLATILE int16_t* p, int16_t v) { return atomicAdd_i16(p, v); } + GPUFUN uint16_t xt_atomicAdd(GPUVOLATILE uint16_t*p, uint16_t v) { return atomicAdd_u16(p, v); } + GPUFUN int64_t xt_atomicAdd(GPUVOLATILE int64_t*p, int64_t v) { return atomicAdd_i64(p, v); } + + // Existing type definitions: forward to CUDA built-ins + GPUFUN int32_t xt_atomicAdd(GPUVOLATILE int32_t* p, int32_t v) { return ::atomicAdd(p, v); } + GPUFUN uint32_t xt_atomicAdd(GPUVOLATILE uint32_t* p, uint32_t v) { return ::atomicAdd(p, v); } + GPUFUN uint64_t xt_atomicAdd(GPUVOLATILE uint64_t* p, uint64_t v) { return ::atomicAdd(p, v); } + GPUFUN float xt_atomicAdd(GPUVOLATILE float* p, float v) { return ::atomicAdd(p, v); } + #if __CUDA_ARCH__ >= 600 + GPUFUN double xt_atomicAdd(GPUVOLATILE double* p, double v) { return ::atomicAdd(p, v); } + #else + GPUFUN double xt_atomicAdd(GPUVOLATILE double* p, double v) { return atomicAdd_f64(p, v); } + #endif + + } + #endif // __cplusplus + + // ---------- Global remap of the public name on device code ---------- + // Define AFTER the wrappers so we don't macro-rewrite our own calls. 
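+    // From here on, any device-side atomicAdd(ptr, val) call resolves to
+    // xt_atomicAdd and picks the overload matching ptr's pointee type.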
+ #ifdef atomicAdd + #undef atomicAdd + #endif + #define atomicAdd(ptr, val) xt_atomicAdd((ptr), (val)) +#endif /* XO_CONTEXT_CUDA */ + +#ifdef XO_CONTEXT_CL + #if !__has_attribute(overloadable) + #error "The current OpenCL compiler/architecture does not support __attribute__((overloadable))" + #endif + #define OCL_OVERLOAD __attribute__((overloadable)) + GPUFUN int8_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int8_t* p, int8_t v) { return atomicAdd_i8 (p, v); } + GPUFUN uint8_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint8_t* p, uint8_t v) { return atomicAdd_u8 (p, v); } + GPUFUN int16_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int16_t* p, int16_t v) { return atomicAdd_i16(p, v); } + GPUFUN uint16_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint16_t*p, uint16_t v) { return atomicAdd_u16(p, v); } + GPUFUN int64_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int64_t*p, int64_t v) { return atomicAdd_i64(p, v); } + GPUFUN float OCL_OVERLOAD atomicAdd(GPUVOLATILE float* p, float v) { return atomicAdd_f32(p, v); } + GPUFUN double OCL_OVERLOAD atomicAdd(GPUVOLATILE double* p, double v) { return atomicAdd_f64(p, v); } + + // Existing type definitions: forward to OpenCL built-ins + GPUFUN int32_t OCL_OVERLOAD atomicAdd(GPUVOLATILE int32_t* p, int32_t v) { return atomic_add(p, v); } + GPUFUN uint32_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint32_t* p, uint32_t v) { return atomic_add(p, v); } + GPUFUN uint64_t OCL_OVERLOAD atomicAdd(GPUVOLATILE uint64_t* p, uint64_t v) { return atom_add(p, v); } +#endif // XO_CONTEXT_CL -#endif // _ATOMICADD_H_ +#endif //_ATOMICADD_H_ diff --git a/xobjects/headers/common.h b/xobjects/headers/common.h index e4196c8..84a9f54 100644 --- a/xobjects/headers/common.h +++ b/xobjects/headers/common.h @@ -1,8 +1,12 @@ +// copyright ################################# // +// This file is part of the Xtrack Package. // +// Copyright (c) CERN, 2025. // +// ########################################### // + + #ifndef XOBJECTS_COMMON_H #define XOBJECTS_COMMON_H -#include "xobjects/headers/atomicadd.h" - /* Common macros for vectorization and parallelization, as well as common arithmetic operations. From 1ecb86023bc8183c11652ac1d07ab6014be8c503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 12 Nov 2025 17:39:21 +0100 Subject: [PATCH 03/27] Fix style --- xobjects/headers/common.h | 1 - 1 file changed, 1 deletion(-) diff --git a/xobjects/headers/common.h b/xobjects/headers/common.h index 84a9f54..f6fd35d 100644 --- a/xobjects/headers/common.h +++ b/xobjects/headers/common.h @@ -31,7 +31,6 @@ #define END_VECTORIZE \ } - #endif // XO_CONTEXT_CPU_OPENMP From 34b3c69c2d4f26b6a4069985872f3e79c86fb03a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 12 Nov 2025 17:39:47 +0100 Subject: [PATCH 04/27] Add a copy flag to the np.array <-> context array conversion functions --- xobjects/context.py | 26 ++++++++++++++++++++++---- xobjects/context_cpu.py | 19 +++++++++++-------- xobjects/context_cupy.py | 11 +++++++---- xobjects/context_pyopencl.py | 18 ++++++++++-------- 4 files changed, 50 insertions(+), 24 deletions(-) diff --git a/xobjects/context.py b/xobjects/context.py index bcd03db..0ae2b54 100644 --- a/xobjects/context.py +++ b/xobjects/context.py @@ -375,12 +375,30 @@ def get_installed_c_source_paths(self) -> List[str]: return sources @abstractmethod - def nparray_to_context_array(self, arr): - return arr + def nparray_to_context_array(self, arr, copy=False): + """Obtain an array on the context, given a numpy array. 
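+
+        (Whether ``copy=False`` may alias the input is context-dependent;
+        see the CPU, Cupy and PyOpenCL implementations.)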
+ + Args: + arr: numpy array + copy: if True, always create a copy in the context. If False, + try to avoid copy if possible (not guaranteed). + + Returns: + array on the context + """ @abstractmethod - def nparray_from_context_array(self, dev_arr): - return dev_arr + def nparray_from_context_array(self, dev_arr, copy=False): + """Obtain a numpy array, given an array on the context. + + Args: + arr: array on the context + copy: if True, always create a copy in the context. If False, + try to avoid copy if possible (not guaranteed). + + Returns: + Numpy array + """ @property @abstractmethod diff --git a/xobjects/context_cpu.py b/xobjects/context_cpu.py index 4104ed0..c85bbcd 100644 --- a/xobjects/context_cpu.py +++ b/xobjects/context_cpu.py @@ -543,7 +543,7 @@ def cffi_module_for_c_types(c_types, containing_dir="."): return None - def nparray_to_context_array(self, arr): + def nparray_to_context_array(self, arr, copy=False): """ Moves a numpy array to the device memory. No action is performed by this function in the CPU context. The method is provided @@ -551,14 +551,16 @@ def nparray_to_context_array(self, arr): Args: arr (numpy.ndarray): Array to be transferred + copy (bool): If True, a copy of the array is made. Returns: - numpy.ndarray: The same array (no copy!). - + numpy.ndarray: Numpy array with the same data, original or a copy. """ + if copy: + arr = np.copy(arr) return arr - def nparray_from_context_array(self, dev_arr): + def nparray_from_context_array(self, dev_arr, copy=False): """ Moves an array to the device to a numpy array. No action is performed by this function in the CPU context. The method is provided so that the CPU @@ -566,10 +568,13 @@ def nparray_from_context_array(self, dev_arr): Args: dev_arr (numpy.ndarray): Array to be transferred - Returns: - numpy.ndarray: The same array (no copy!) + copy (bool): If True, a copy of the array is made. + Returns: + numpy.ndarray: Numpy array with the same data, original or a copy. """ + if copy: + dev_arr = np.copy(dev_arr) return dev_arr @property @@ -579,7 +584,6 @@ def nplike_lib(self): through ``nplike_lib`` to keep compatibility with the other contexts. """ - return np @property @@ -589,7 +593,6 @@ def splike_lib(self): through ``splike_lib`` to keep compatibility with the other contexts. """ - return sp def synchronize(self): diff --git a/xobjects/context_cupy.py b/xobjects/context_cupy.py index 8b93372..3838376 100644 --- a/xobjects/context_cupy.py +++ b/xobjects/context_cupy.py @@ -487,29 +487,32 @@ def build_kernels( def __str__(self): return f"{type(self).__name__}:{cupy.cuda.get_device_id()}" - def nparray_to_context_array(self, arr): + def nparray_to_context_array(self, arr, copy=False): """ Copies a numpy array to the device memory. Args: arr (numpy.ndarray): Array to be transferred + copy (bool): This parameter is ignored for CUDA, as the data lives + on a different device. Returns: cupy.ndarray:The same array copied to the device. - """ dev_arr = cupy.array(arr) return dev_arr - def nparray_from_context_array(self, dev_arr): + def nparray_from_context_array(self, dev_arr, copy=False): """ Copies an array to the device to a numpy array. Args: dev_arr (cupy.ndarray): Array to be transferred. + copy (bool): This parameter is ignored for CUDA, as the data lives + on a different device. + Returns: numpy.ndarray: The same data copied to a numpy array. 
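+        (Note: ``dev_arr.get()`` always allocates a fresh host array, so the
+        result never aliases device memory.)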
- """ return dev_arr.get() diff --git a/xobjects/context_pyopencl.py b/xobjects/context_pyopencl.py index 843bf85..b48e027 100644 --- a/xobjects/context_pyopencl.py +++ b/xobjects/context_pyopencl.py @@ -253,28 +253,30 @@ def __str__(self): device_id = self.platform.get_devices().index(self.device) return f"{type(self).__name__}:{platform_id}.{device_id}" - def nparray_to_context_array(self, arr): - """ - Copies a numpy array to the device memory. + def nparray_to_context_array(self, arr, copy=False): + """Copies a numpy array to the device memory. + Args: arr (numpy.ndarray): Array to be transferred + copy (bool): This parameter is ignored for OpenCL, as the data lives + on a different device. Returns: pyopencl.array.Array:The same array copied to the device. - """ dev_arr = cla.to_device(self.queue, arr) return dev_arr - def nparray_from_context_array(self, dev_arr): - """ - Copies an array to the device to a numpy array. + def nparray_from_context_array(self, dev_arr, copy=False): + """Copies an array to the device to a numpy array. Args: dev_arr (pyopencl.array.Array): Array to be transferred. + copy (bool): This parameter is ignored for OpenCL, as the data lives + on a different device. + Returns: numpy.ndarray: The same data copied to a numpy array. - """ return dev_arr.get() From d5606b3efdd5ab623bf513cc7047669506867d49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 12 Nov 2025 17:40:55 +0100 Subject: [PATCH 05/27] Bugfix: capi get/set methods broken for nested arrays --- tests/test_capi.py | 129 ++++++++++++++++++++++++++++++++++++++++++++- xobjects/capi.py | 26 ++++----- 2 files changed, 141 insertions(+), 14 deletions(-) diff --git a/tests/test_capi.py b/tests/test_capi.py index df2fcbb..1891072 100644 --- a/tests/test_capi.py +++ b/tests/test_capi.py @@ -11,7 +11,7 @@ import cffi import xobjects as xo - +from xobjects.test_helpers import for_all_test_contexts ffi = cffi.FFI() @@ -576,8 +576,133 @@ def test_gpu_api(): ctx.add_kernels( sources=[src_code], kernels=kernel_descriptions, - # save_src_as=f'_test_{name}.c') save_source_as=None, compile=True, extra_classes=[xo.String[:]], ) + + +@for_all_test_contexts +def test_array_of_arrays(test_context): + cell_ids = [3, 5, 7] + particle_per_cell = [ + [1, 8], + [9, 3, 2], + [4, 5, 6, 7], + ] + + class Cells(xo.Struct): + ids = xo.Int64[:] + particles = xo.Int64[:][:] + + cells = Cells(ids=cell_ids, particles=particle_per_cell) + + # Data layout (displayed as uint64): + # + # [0] 216 (cells size) + # [8] 56 (offset field 2 -- particles field) + # [16] cell_ids data: + # [0] 40 (cell_ids size) + # [8] 3 (cell_ids length) + # [16] {3, 5, 7} (cell_ids elements) + # [56] particles data: + # [0] 160 (particles size) + # [8] 3 (particles length) + # [16] 40 (offset particles[0]) + # [24] 72 (offset particles[1]) + # [32] 112 (offset particles[2]) + # [40] particles[0] data: + # [0] 32 (particles[0] size) + # [8] 2 (particles[0] length) + # [16] {1, 8} (particles[0] elements) + # [72] particles[1] data: + # [0] 40 (particles[1] size) + # [8] 3 (particles[1] length) + # [16] {9, 3, 2} (particles[1 + # [112] particles[2] data: + # [0] 48 (particles[2] size) + # [8] 4 (particles[2] length) + # [16] {4, 5, 6, 7} (particles[2] elements) + + src = r""" + #include "xobjects/headers/common.h" + + int MAX_PARTICLES = 4; + int MAX_CELLS = 3; + + GPUKERN + uint8_t loop_over(Cells cells, uint64_t* out_counts, uint64_t* out_vals) + { + uint8_t success = 1; + int64_t num_cells = Cells_len_ids(cells); + + for (int64_t i = 0; 
i < num_cells; i++) { + int64_t id = Cells_get_ids(cells, i); + int64_t count = Cells_len1_particles(cells, i); + + printf("Cell ID: %lld\n Particles (count %lld): ", id, count); + + if (i >= MAX_CELLS) { + success = 0; + continue; + } + + out_counts[i] = count; + + ArrNInt64 particles = Cells_getp1_particles(cells, i); + uint32_t num_particles = ArrNInt64_len(particles); + + VECTORIZE_OVER(j, num_particles); + int64_t val = ArrNInt64_get(particles, j); + printf("%lld ", val); + + if (j >= MAX_PARTICLES) { + success = 0; + continue; + } + + out_vals[i * MAX_PARTICLES + j] = val; + END_VECTORIZE; + printf("\n"); + } + fflush(stdout); + return success; + } + """ + + kernels = { + "loop_over": xo.Kernel( + args=[ + xo.Arg(Cells, name="cells"), + xo.Arg(xo.UInt64, pointer=True, name="out_counts"), + xo.Arg(xo.UInt64, pointer=True, name="out_vals"), + ], + n_threads="n", + ret=xo.Arg(xo.UInt8), + ) + } + kernels.update(Cells._gen_kernels()) + + test_context.add_kernels( + sources=[src], + kernels=kernels, + ) + + counts = np.zeros(len(cell_ids), dtype=np.uint64) + vals = np.zeros(12, dtype=np.uint64) + + for i, _ in enumerate(particle_per_cell): + for j, expected in enumerate(particle_per_cell[i]): + result = test_context.kernels.Cells_get_particles( + obj=cells, i0=i, i1=j + ) + assert result == expected + + ret = test_context.kernels.loop_over( + cells=cells, + out_counts=counts, + out_vals=vals, + ) + assert ret == 1 + assert np.all(counts == [2, 3, 4]) + assert np.all(vals == [1, 8, 0, 0, 9, 3, 2, 0, 4, 5, 6, 7]) diff --git a/xobjects/capi.py b/xobjects/capi.py index 622803f..4be67e2 100644 --- a/xobjects/capi.py +++ b/xobjects/capi.py @@ -103,9 +103,10 @@ def get_layers(parts): def int_from_obj(offset, conf): - inttype = gen_pointer(conf.get("inttype", "int64_t") + "*", conf) - chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) - return f"*({inttype})(({chartype}) obj+{offset})" + """Generate code to read the integer at location obj + offset (bytes).""" + int_pointer_type = gen_pointer(conf.get("inttype", "int64_t") + "*", conf) + char_pointer_type = gen_pointer(conf.get("chartype", "char") + "*", conf) + return f"*({int_pointer_type})(({char_pointer_type}) obj+{offset})" def Field_get_c_offset(self, conf): @@ -146,7 +147,7 @@ def Index_get_c_offset(part, conf, icount): out.append(f" offset+={soffset};") else: lookup_field_offset = f"offset+{soffset}" - out.append(f" offset={int_from_obj(lookup_field_offset, conf)};") + out.append(f" offset+={int_from_obj(lookup_field_offset, conf)};") return out @@ -206,16 +207,17 @@ def gen_fun_kernel(cls, path, action, const, extra, ret, add_nindex=False): def gen_c_pointed(target: Arg, conf): size = gen_c_size_from_arg(target, conf) ret = gen_c_type_from_arg(target, conf) + if target.pointer or is_compound(target.atype) or is_string(target.atype): chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) return f"({ret})(({chartype}) obj+offset)" - else: - rettype = gen_pointer(ret + "*", conf) - if size == 1: - return f"*(({rettype}) obj+offset)" - else: - chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) - return f"*({rettype})(({chartype}) obj+offset)" + + rettype = gen_pointer(ret + "*", conf) + if size == 1: + return f"*(({rettype}) obj+offset)" + + chartype = gen_pointer(conf.get("chartype", "char") + "*", conf) + return f"*({rettype})(({chartype}) obj+offset)" def gen_method_get(cls, path, conf): @@ -507,7 +509,7 @@ def methods_from_path(cls, path, conf): if is_array(lasttype): out.append(gen_method_len(cls, 
path, conf)) - # out.append(gen_method_shape(cls, path, conf)) + out.append(gen_method_shape(cls, path, conf)) # out.append(gen_method_nd(cls, path, conf)) # out.append(gen_method_strides(cls, path, conf)) # out.append(gen_method_getpos(cls, path, conf)) From 04301c56c3acdce33a183e85fa058550279fdf02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 13 Nov 2025 09:17:11 +0100 Subject: [PATCH 06/27] Implement the get_shape and get_nd methods in the C API --- tests/test_capi.py | 28 ++++++++++++++++++ xobjects/capi.py | 71 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 94 insertions(+), 5 deletions(-) diff --git a/tests/test_capi.py b/tests/test_capi.py index 1891072..6df6e31 100644 --- a/tests/test_capi.py +++ b/tests/test_capi.py @@ -158,6 +158,34 @@ def test_array_dynamic_type_init_get_set(array_cls, example_shape): assert arr[ii].field1[idx_in_field] == 13 * vv +@for_all_test_contexts +@pytest.mark.parametrize( + "array_type", + [ + xo.UInt64[3, 5, 7], + xo.UInt64[:, :, :], + xo.UInt64[:, 5, :], + ], +) +def test_array_get_shape(test_context, array_type): + kernels = array_type._gen_kernels() + test_context.add_kernels(kernels=kernels) + + instance = array_type(np.array(range(3 * 5 * 7)).reshape((3, 5, 7))) + + nd_function = test_context.kernels[f"{array_type.__name__}_nd"] + nd = nd_function(obj=instance) + assert nd == 3 + + shape = np.zeros(nd, dtype=np.int64) + shape_function = test_context.kernels[f"{array_type.__name__}_shape"] + shape_function(obj=instance, out_shape=shape) + + assert shape[0] == 3 + assert shape[1] == 5 + assert shape[2] == 7 + + def test_struct1(): kernels = Struct1._gen_kernels() ctx = xo.ContextCpu() diff --git a/xobjects/capi.py b/xobjects/capi.py index 4be67e2..ec09ea6 100644 --- a/xobjects/capi.py +++ b/xobjects/capi.py @@ -5,7 +5,7 @@ from .context import Kernel, Arg -from .scalar import Int64, Void, Int8, is_scalar +from .scalar import UInt32, Int64, Void, is_scalar from .struct import is_field, is_struct from .array import is_index, is_array from .ref import is_unionref, is_ref @@ -366,13 +366,74 @@ def gen_method_size(cls, path, conf): def gen_method_shape(cls, path, conf): - "return shape in an array" - return None, None + array_type = path[-1] + + out_shape_arg = Arg(Int64, name="out_shape", pointer=True) + + kernel = gen_fun_kernel( + cls, + path, + const=False, + action="shape", + extra=[out_shape_arg], + ret=None, + add_nindex=True, + ) + decl = gen_c_decl_from_kernel(kernel, conf) + + lst = [decl + "{"] + + nd = len(array_type._shape) + + if array_type._is_static_shape: + terms = [str(dim) for dim in array_type._shape] + else: + lst.append(gen_method_offset(path, conf)) + arrarg = Arg(Int64, pointer=True) + pointed = gen_c_pointed(arrarg, conf) + typearr = gen_pointer("int64_t*", conf) + lst.append(f" {typearr} arr = {pointed};") + dim_len_idx = 1 + terms = [] + for dim_len in array_type._shape: + if dim_len: + terms.append(str(dim_len)) + else: + terms.append(f"arr[{dim_len_idx}]") + dim_len_idx += 1 + + for dim_idx in range(nd): + lst.append(f" out_shape[{dim_idx}] = {terms[dim_idx]};") + + lst.append("}") + + return "\n".join(lst), kernel def gen_method_nd(cls, path, conf): "return length of shape" - return None, None + array_type = path[-1] + + retarg = Arg(UInt32) + + kernel = gen_fun_kernel( + cls, + path, + const=False, + action="nd", + extra=[], + ret=retarg, + add_nindex=True, + ) + decl = gen_c_decl_from_kernel(kernel, conf) + + lst = [decl + "{"] + + nd = len(array_type._shape) + lst.append(f" 
return {nd};") + lst.append("}") + + return "\n".join(lst), kernel def gen_method_strides(cls, path, conf): @@ -510,7 +571,7 @@ def methods_from_path(cls, path, conf): if is_array(lasttype): out.append(gen_method_len(cls, path, conf)) out.append(gen_method_shape(cls, path, conf)) - # out.append(gen_method_nd(cls, path, conf)) + out.append(gen_method_nd(cls, path, conf)) # out.append(gen_method_strides(cls, path, conf)) # out.append(gen_method_getpos(cls, path, conf)) From c2795aa71233264c5c2d90e86fc82b5570c3ef4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 19 Nov 2025 16:50:54 +0100 Subject: [PATCH 07/27] Fixes in tests --- tests/test_capi.py | 10 +++++----- tests/test_common.py | 11 +++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/test_capi.py b/tests/test_capi.py index 6df6e31..c259dab 100644 --- a/tests/test_capi.py +++ b/tests/test_capi.py @@ -659,7 +659,11 @@ class Cells(xo.Struct): int MAX_CELLS = 3; GPUKERN - uint8_t loop_over(Cells cells, uint64_t* out_counts, uint64_t* out_vals) + uint8_t loop_over( + Cells cells, + GPUGLMEM uint64_t* out_counts, + GPUGLMEM uint64_t* out_vals + ) { uint8_t success = 1; int64_t num_cells = Cells_len_ids(cells); @@ -668,8 +672,6 @@ class Cells(xo.Struct): int64_t id = Cells_get_ids(cells, i); int64_t count = Cells_len1_particles(cells, i); - printf("Cell ID: %lld\n Particles (count %lld): ", id, count); - if (i >= MAX_CELLS) { success = 0; continue; @@ -682,7 +684,6 @@ class Cells(xo.Struct): VECTORIZE_OVER(j, num_particles); int64_t val = ArrNInt64_get(particles, j); - printf("%lld ", val); if (j >= MAX_PARTICLES) { success = 0; @@ -691,7 +692,6 @@ class Cells(xo.Struct): out_vals[i * MAX_PARTICLES + j] = val; END_VECTORIZE; - printf("\n"); } fflush(stdout); return success; diff --git a/tests/test_common.py b/tests/test_common.py index 91eb7d2..4b2e882 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -16,8 +16,7 @@ def test_common_atomicadd(test_context): #include "xobjects/headers/common.h" #include "xobjects/headers/atomicadd.h" - GPUKERN - double test_atomic_add() + GPUKERN void test_atomic_add(GPUGLMEM double* out) { int iterations = 1000; double sum = 0; @@ -28,7 +27,7 @@ def test_common_atomicadd(test_context): #endif atomicAdd(&sum, 1.0); END_VECTORIZE; - return sum; + *out = sum; } """ @@ -45,15 +44,15 @@ def test_common_atomicadd(test_context): sources=[src], kernels={ "test_atomic_add": xo.Kernel( - args=[], + args=[xo.Arg(xo.Float64, pointer=True, name="out")], n_threads=n_threads, - ret=xo.Arg(xo.Float64), ) }, ) expected = 1000 - result = test_context.kernels.test_atomic_add() + result = np.array([0], dtype=np.float64) + test_context.kernels.test_atomic_add(out=result) assert result == expected From 84ba92c132b001b8f741e0c2d093697ba80a23c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 19 Nov 2025 16:58:16 +0100 Subject: [PATCH 08/27] Fixes for gpu --- tests/test_common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_common.py b/tests/test_common.py index 4b2e882..13d79de 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -52,7 +52,9 @@ def test_common_atomicadd(test_context): expected = 1000 result = np.array([0], dtype=np.float64) - test_context.kernels.test_atomic_add(out=result) + result_ctx = test_context.nparray_to_context_array(result) + test_context.kernels.test_atomic_add(out=result_ctx) + result = test_context.nparray_form_context_array(result_ctx)[0] assert result == 
expected From 23373aa26c05b617b79656570f09246787646102 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 19 Nov 2025 16:59:29 +0100 Subject: [PATCH 09/27] fixes for gpu --- tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_common.py b/tests/test_common.py index 13d79de..948d9d7 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -54,7 +54,7 @@ def test_common_atomicadd(test_context): result = np.array([0], dtype=np.float64) result_ctx = test_context.nparray_to_context_array(result) test_context.kernels.test_atomic_add(out=result_ctx) - result = test_context.nparray_form_context_array(result_ctx)[0] + result = test_context.nparray_from_context_array(result_ctx)[0] assert result == expected From b30e666e9682299e0f22531802489201b3e5af19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 19 Nov 2025 17:15:00 +0100 Subject: [PATCH 10/27] fix in tests --- tests/test_common.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/test_common.py b/tests/test_common.py index 948d9d7..c427065 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -16,9 +16,8 @@ def test_common_atomicadd(test_context): #include "xobjects/headers/common.h" #include "xobjects/headers/atomicadd.h" - GPUKERN void test_atomic_add(GPUGLMEM double* out) + GPUKERN void test_atomic_add(GPUGLMEM double* out, int32_t iterations) { - int iterations = 1000; double sum = 0; VECTORIZE_OVER(i, iterations); // If on CPU do some work to avoid the loop being optimized out @@ -27,7 +26,7 @@ def test_common_atomicadd(test_context): #endif atomicAdd(&sum, 1.0); END_VECTORIZE; - *out = sum; + out[0] = sum; } """ @@ -44,8 +43,12 @@ def test_common_atomicadd(test_context): sources=[src], kernels={ "test_atomic_add": xo.Kernel( - args=[xo.Arg(xo.Float64, pointer=True, name="out")], - n_threads=n_threads, + c_name="test_atomic_add", + args=[ + xo.Arg(xo.Float64, pointer=True, name="out"), + xo.Arg(xo.Int32, name="iterations"), + ], + n_threads="iterations", ) }, ) @@ -53,8 +56,8 @@ def test_common_atomicadd(test_context): expected = 1000 result = np.array([0], dtype=np.float64) result_ctx = test_context.nparray_to_context_array(result) - test_context.kernels.test_atomic_add(out=result_ctx) - result = test_context.nparray_from_context_array(result_ctx)[0] + test_context.kernels.test_atomic_add(out=result_ctx, iterations=expected) + result = test_context.nparray_from_context_array(result_ctx) assert result == expected From 5e3b8ca6685831a2302cc5a8b56b3f65d3ffa6cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 19 Nov 2025 17:20:35 +0100 Subject: [PATCH 11/27] fixes for gpu --- tests/test_common.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_common.py b/tests/test_common.py index c427065..c1a741c 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -18,15 +18,13 @@ def test_common_atomicadd(test_context): GPUKERN void test_atomic_add(GPUGLMEM double* out, int32_t iterations) { - double sum = 0; VECTORIZE_OVER(i, iterations); // If on CPU do some work to avoid the loop being optimized out #if defined(XO_CONTEXT_CPU_OPENMP) usleep(10); #endif - atomicAdd(&sum, 1.0); + atomicAdd(out, 1.0); END_VECTORIZE; - out[0] = sum; } """ @@ -48,7 +46,7 @@ def test_common_atomicadd(test_context): xo.Arg(xo.Float64, pointer=True, name="out"), xo.Arg(xo.Int32, name="iterations"), ], - n_threads="iterations", + n_threads=n_threads, 
) }, ) From 9d44e27d324df93b697ec7c71403f41489ed3724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 19 Nov 2025 17:26:17 +0100 Subject: [PATCH 12/27] fixes for gpu --- tests/test_capi.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tests/test_capi.py b/tests/test_capi.py index c259dab..f5b528e 100644 --- a/tests/test_capi.py +++ b/tests/test_capi.py @@ -658,14 +658,13 @@ class Cells(xo.Struct): int MAX_PARTICLES = 4; int MAX_CELLS = 3; - GPUKERN - uint8_t loop_over( + GPUKERN void loop_over( Cells cells, GPUGLMEM uint64_t* out_counts, - GPUGLMEM uint64_t* out_vals + GPUGLMEM uint64_t* out_vals, + GPUGLMEM uint8_t* success, ) { - uint8_t success = 1; int64_t num_cells = Cells_len_ids(cells); for (int64_t i = 0; i < num_cells; i++) { @@ -673,7 +672,7 @@ class Cells(xo.Struct): int64_t count = Cells_len1_particles(cells, i); if (i >= MAX_CELLS) { - success = 0; + *success = 0; continue; } @@ -686,15 +685,13 @@ class Cells(xo.Struct): int64_t val = ArrNInt64_get(particles, j); if (j >= MAX_PARTICLES) { - success = 0; + *success = 0; continue; } out_vals[i * MAX_PARTICLES + j] = val; END_VECTORIZE; } - fflush(stdout); - return success; } """ @@ -704,9 +701,9 @@ class Cells(xo.Struct): xo.Arg(Cells, name="cells"), xo.Arg(xo.UInt64, pointer=True, name="out_counts"), xo.Arg(xo.UInt64, pointer=True, name="out_vals"), + xo.Arg(xo.UInt8, pointer=True, name="success"), ], - n_threads="n", - ret=xo.Arg(xo.UInt8), + n_threads=3, ) } kernels.update(Cells._gen_kernels()) @@ -716,8 +713,9 @@ class Cells(xo.Struct): kernels=kernels, ) - counts = np.zeros(len(cell_ids), dtype=np.uint64) - vals = np.zeros(12, dtype=np.uint64) + counts = test_context.zeros(len(cell_ids), dtype=np.uint64) + vals = test_context.zeros(12, dtype=np.uint64) + success = test_context.ones((1,), dtype=np.uint8) for i, _ in enumerate(particle_per_cell): for j, expected in enumerate(particle_per_cell[i]): @@ -730,6 +728,7 @@ class Cells(xo.Struct): cells=cells, out_counts=counts, out_vals=vals, + success=success, ) assert ret == 1 assert np.all(counts == [2, 3, 4]) From bc0e6b26bfba1a46b01c2551e4478fe0613a5cfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 19 Nov 2025 17:28:24 +0100 Subject: [PATCH 13/27] fixes for gpu --- tests/test_capi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_capi.py b/tests/test_capi.py index f5b528e..9e48c00 100644 --- a/tests/test_capi.py +++ b/tests/test_capi.py @@ -655,8 +655,8 @@ class Cells(xo.Struct): src = r""" #include "xobjects/headers/common.h" - int MAX_PARTICLES = 4; - int MAX_CELLS = 3; + static const int MAX_PARTICLES = 4; + static const int MAX_CELLS = 3; GPUKERN void loop_over( Cells cells, From 52a981f3664acea103acb5670b29215576dbb223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 19 Nov 2025 18:04:54 +0100 Subject: [PATCH 14/27] fixes for gpu, built in kernels cannot be passed to add_kernels on gpu --- tests/test_capi.py | 104 ++++++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 43 deletions(-) diff --git a/tests/test_capi.py b/tests/test_capi.py index 9e48c00..975b661 100644 --- a/tests/test_capi.py +++ b/tests/test_capi.py @@ -567,47 +567,47 @@ def test_getp1_dyn_length_dyn_type_string_array(): assert ord(ffi.cast("char *", s2)[8 + ii]) == ch -def test_gpu_api(): - for ctx in xo.context.get_test_contexts(): - src_code = """ - /*gpufun*/ - void myfun(double x, double y, - double* z){ - z[0] = 
x * y;
-    }
+@for_all_test_contexts
+def test_gpu_api(test_context):
+    src_code = """
+    /*gpufun*/
+    void myfun(double x, double y,
+        double* z){
+        z[0] = x * y;
+    }
 
-    /*gpukern*/
-    void my_mul(const int n,
-        /*gpuglmem*/ const double* x1,
-        /*gpuglmem*/ const double* x2,
-        /*gpuglmem*/ double* y) {
-        int tid = 0 //vectorize_over tid n
-        double z;
-        myfun(x1[tid], x2[tid], &z);
-        y[tid] = z;
-        //end_vectorize
-    }
-    """
-
-    kernel_descriptions = {
-        "my_mul": xo.Kernel(
-            args=[
-                xo.Arg(xo.Int32, name="n"),
-                xo.Arg(xo.Float64, pointer=True, const=True, name="x1"),
-                xo.Arg(xo.Float64, pointer=True, const=True, name="x2"),
-                xo.Arg(xo.Float64, pointer=True, const=False, name="y"),
-            ],
-            n_threads="n",
-        ),
+    /*gpukern*/
+    void my_mul(const int n,
+        /*gpuglmem*/ const double* x1,
+        /*gpuglmem*/ const double* x2,
+        /*gpuglmem*/ double* y) {
+        int tid = 0 //vectorize_over tid n
+        double z;
+        myfun(x1[tid], x2[tid], &z);
+        y[tid] = z;
+        //end_vectorize
     }
+    """
 
-    ctx.add_kernels(
-        sources=[src_code],
-        kernels=kernel_descriptions,
-        save_source_as=None,
-        compile=True,
-        extra_classes=[xo.String[:]],
-    )
+    kernel_descriptions = {
+        "my_mul": xo.Kernel(
+            args=[
+                xo.Arg(xo.Int32, name="n"),
+                xo.Arg(xo.Float64, pointer=True, const=True, name="x1"),
+                xo.Arg(xo.Float64, pointer=True, const=True, name="x2"),
+                xo.Arg(xo.Float64, pointer=True, const=False, name="y"),
+            ],
+            n_threads="n",
+        ),
+    }
+
+    test_context.add_kernels(
+        sources=[src_code],
+        kernels=kernel_descriptions,
+        save_source_as=None,
+        compile=True,
+        extra_classes=[xo.String[:]],
+    )
 
 
 @for_all_test_contexts
@@ -662,7 +662,7 @@ class Cells(xo.Struct):
         Cells cells,
         GPUGLMEM uint64_t* out_counts,
         GPUGLMEM uint64_t* out_vals,
-        GPUGLMEM uint8_t* success,
+        GPUGLMEM uint8_t* success
     ) {
 
         int64_t num_cells = Cells_len_ids(cells);
@@ -693,6 +693,15 @@ class Cells(xo.Struct):
             END_VECTORIZE;
         }
     }
+
+    GPUKERN void kernel_Cells_get_particles(
+        Cells obj,
+        int64_t i0,
+        int64_t i1,
+        GPUGLMEM int64_t* out
+    ) {
+        *out = Cells_get_particles(obj, i0, i1);
+    }
     """
 
     kernels = {
@@ -704,9 +713,17 @@ class Cells(xo.Struct):
                 xo.Arg(xo.UInt8, pointer=True, name="success"),
             ],
             n_threads=3,
-        )
+        ),
+        "kernel_Cells_get_particles": xo.Kernel(
+            args=[
+                xo.Arg(Cells, name="cells"),
+                xo.Arg(xo.Int64, pointer=True, name="i0"),
+                xo.Arg(xo.Int64, pointer=True, name="i1"),
+                xo.Arg(xo.Int64, pointer=True, name="out"),
+            ],
+            n_threads=3,
+        ),
     }
-    kernels.update(Cells._gen_kernels())
 
     test_context.add_kernels(
         sources=[src],
@@ -719,8 +736,9 @@ class Cells(xo.Struct):
 
     for i, _ in enumerate(particle_per_cell):
         for j, expected in enumerate(particle_per_cell[i]):
-            result = test_context.kernels.Cells_get_particles(
-                obj=cells, i0=i, i1=j
+            result = test_context.zeros(shape=(1,), dtype=np.int64)
+            test_context.kernels.kernel_Cells_get_particles(
+                obj=cells, i0=i, i1=j, out=result
             )
             assert result == expected
 

From 0ebe91757746a6e7a120b19999fb260642f8857c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:06:30 +0100
Subject: [PATCH 15/27] fixes on gpu

---
 tests/test_capi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index 975b661..0183d3f 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -732,7 +732,7 @@ class Cells(xo.Struct):
 
     counts = test_context.zeros(len(cell_ids), dtype=np.uint64)
    vals = test_context.zeros(12, dtype=np.uint64)
-    success = test_context.ones((1,), dtype=np.uint8)
+    success = test_context.zeros((1,), dtype=np.uint8) + 1
 
     for i, _ in enumerate(particle_per_cell):
         for j, expected in enumerate(particle_per_cell[i]):

From 1c508e8314dae2403770792a7e6729ddab76b182 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:07:20 +0100
Subject: [PATCH 16/27] fixes for gpu

---
 tests/test_capi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index 0183d3f..016e890 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -716,7 +716,7 @@ class Cells(xo.Struct):
         ),
         "kernel_Cells_get_particles": xo.Kernel(
             args=[
-                xo.Arg(Cells, name="cells"),
+                xo.Arg(Cells, name="obj"),
                 xo.Arg(xo.Int64, pointer=True, name="i0"),
                 xo.Arg(xo.Int64, pointer=True, name="i1"),
                 xo.Arg(xo.Int64, pointer=True, name="out"),

From 4e126a34d726f655a51b84fdc9909661ffaa5dff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:08:20 +0100
Subject: [PATCH 17/27] set context

---
 tests/test_capi.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index 016e890..3292561 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -623,7 +623,9 @@ class Cells(xo.Struct):
         ids = xo.Int64[:]
         particles = xo.Int64[:][:]
 
-    cells = Cells(ids=cell_ids, particles=particle_per_cell)
+    cells = Cells(
+        ids=cell_ids, particles=particle_per_cell, _context=test_context
+    )
 
     # Data layout (displayed as uint64):
     #

From 35df65f7d19ab18cafc911adffcf89f6f2f9a765 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:10:25 +0100
Subject: [PATCH 18/27] fixes for gpu

---
 tests/test_capi.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index 3292561..b97f483 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -719,8 +719,8 @@ class Cells(xo.Struct):
         "kernel_Cells_get_particles": xo.Kernel(
             args=[
                 xo.Arg(Cells, name="obj"),
-                xo.Arg(xo.Int64, pointer=True, name="i0"),
-                xo.Arg(xo.Int64, pointer=True, name="i1"),
+                xo.Arg(xo.Int64, name="i0"),
+                xo.Arg(xo.Int64, name="i1"),
                 xo.Arg(xo.Int64, pointer=True, name="out"),
             ],
             n_threads=3,

From 7944487be7586d90c172bf2e0d8c5778b3584cf7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:11:10 +0100
Subject: [PATCH 19/27] fixes for gpu

---
 tests/test_capi.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index b97f483..5ae12a2 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -744,12 +744,12 @@ class Cells(xo.Struct):
         )
         assert result == expected
 
-    ret = test_context.kernels.loop_over(
+    test_context.kernels.loop_over(
         cells=cells,
         out_counts=counts,
         out_vals=vals,
         success=success,
     )
-    assert ret == 1
+    assert success == 1
 
     assert np.all(counts == [2, 3, 4])
     assert np.all(vals == [1, 8, 0, 0, 9, 3, 2, 0, 4, 5, 6, 7])

From 414e8a1cf7ad2f701e8a8edde1cbc42bcc845fea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:20:11 +0100
Subject: [PATCH 20/27] fix test_array_get_shape

---
 tests/test_capi.py | 51 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index 5ae12a2..6727587 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -168,22 +168,51 @@ def test_array_dynamic_type_init_get_set(array_cls, example_shape):
     ],
 )
 def test_array_get_shape(test_context, array_type):
-    kernels = array_type._gen_kernels()
-    test_context.add_kernels(kernels=kernels)
+    source = """
+    GPUKERN void get_nd_and_shape(
+        ARRAY_TYPE arr,
+        GPUGLMEM int64_t* out_nd,
+        GPUGLMEM int64_t* out_shape
+    ) {
+        *out_nd = ARRAY_TYPE_nd(arr);
+        ARRAY_TYPE_shape(arr, out_shape);
+    }
+    """.replace(
+        "ARRAY_TYPE", array_type.__name__
+    )
+
+    kernels = {
+        "get_nd_and_shape": xo.Kernel(
+            c_name="get_nd_and_shape",
+            args=[
+                xo.Arg(array_type, name="arr"),
+                xo.Arg(xo.Int64, pointer=True, name="out_nd"),
+                xo.Arg(xo.Int64, pointer=True, name="out_shape"),
+            ],
+        ),
+    }
+
+    test_context.add_kernels(
+        sources=[src],
+        kernels=kernels,
+    )
 
     instance = array_type(np.array(range(3 * 5 * 7)).reshape((3, 5, 7)))
 
-    nd_function = test_context.kernels[f"{array_type.__name__}_nd"]
-    nd = nd_function(obj=instance)
-    assert nd == 3
+    expected_nd = 3
+    result_nd = test_context.zeros((1,), dtype=np.int64)
+
+    expected_shape = [3, 5, 7]
+    result_shape = test_context.zeros((expected_nd,), dtype=np.int64)
 
-    shape = np.zeros(nd, dtype=np.int64)
-    shape_function = test_context.kernels[f"{array_type.__name__}_shape"]
-    shape_function(obj=instance, out_shape=shape)
+    test_context.kernels.get_nd_and_shape(
+        arr=instance,
+        out_nd=result_nd,
+        out_shape=result_shape,
+    )
 
-    assert shape[0] == 3
-    assert shape[1] == 5
-    assert shape[2] == 7
+    assert result_nd[0] == expected_nd
+    assert np.all(result_shape == expected_shape)
 
 
 def test_struct1():

From 6e7427939651dbc1af6a8a75c7997ae3b4f6f0c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:21:36 +0100
Subject: [PATCH 21/27] more fixes

---
 tests/test_capi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index 6727587..d310101 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -193,7 +193,7 @@ def test_array_get_shape(test_context, array_type):
     }
 
     test_context.add_kernels(
-        sources=[src],
+        sources=[source],
         kernels=kernels,
     )

From 5941a9a7f2c4652110e5a1710a83d03df1dadc5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:22:42 +0100
Subject: [PATCH 22/27] more fixes

---
 tests/test_capi.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index d310101..5bd79c4 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -169,6 +169,8 @@ def test_array_dynamic_type_init_get_set(array_cls, example_shape):
 )
 def test_array_get_shape(test_context, array_type):
     source = """
+    #include "xobjects/headers/common.h"
+
     GPUKERN void get_nd_and_shape(
         ARRAY_TYPE arr,
         GPUGLMEM int64_t* out_nd,

From db420e94c2429bff643bf440dd639547e3e305d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:23:24 +0100
Subject: [PATCH 23/27] right context

---
 tests/test_capi.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index 5bd79c4..72295a2 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -199,7 +199,10 @@ def test_array_get_shape(test_context, array_type):
         kernels=kernels,
     )
 
-    instance = array_type(np.array(range(3 * 5 * 7)).reshape((3, 5, 7)))
+    instance = array_type(
+        np.array(range(3 * 5 * 7)).reshape((3, 5, 7)),
+        _context=test_context,
+    )
 
     expected_nd = 3
     result_nd = test_context.zeros((1,), dtype=np.int64)

From d7156d8a3160ac0e6314efd2c0d1cfb7535655ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:25:01 +0100
Subject: [PATCH 24/27] final fixes

---
 tests/test_capi.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index 72295a2..a287199 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -217,7 +217,9 @@ def test_array_get_shape(test_context, array_type):
     )
 
     assert result_nd[0] == expected_nd
-    assert np.all(result_shape == expected_shape)
+    assert result_shape[0] == expected_shape[0]
+    assert result_shape[1] == expected_shape[1]
+    assert result_shape[2] == expected_shape[2]
 
 
 def test_struct1():

From de1ebd65acaea6c5858c01a994184b2eb25e3bb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Wed, 19 Nov 2025 18:26:50 +0100
Subject: [PATCH 25/27] final final fixes

---
 tests/test_capi.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index a287199..29518eb 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -786,6 +786,9 @@ class Cells(xo.Struct):
         out_vals=vals,
         success=success,
     )
+    counts = test_context.nparray_from_context_array(counts)
+    vals = test_context.nparray_from_context_array(vals)
+
     assert success == 1
     assert np.all(counts == [2, 3, 4])
     assert np.all(vals == [1, 8, 0, 0, 9, 3, 2, 0, 4, 5, 6, 7])

From 153ead296ab77e0fe2de8e50a5ea9e5c8604a8f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Fri, 21 Nov 2025 17:04:28 +0100
Subject: [PATCH 26/27] Fix typo

---
 xobjects/headers/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xobjects/headers/common.h b/xobjects/headers/common.h
index f6fd35d..6c7f04c 100644
--- a/xobjects/headers/common.h
+++ b/xobjects/headers/common.h
@@ -52,7 +52,7 @@
 
 #define VECTORIZE_OVER(INDEX_NAME, COUNT) \
     { \
-        int64_t INDEX_NAME = get_global_id(0);
+        int64_t INDEX_NAME = get_global_id(0); \
         if (INDEX_NAME < (COUNT)) { \
 
 #define END_VECTORIZE \

From f234f8897fe0c8193d5b1a24091524c3662f4848 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20=C5=81opaciuk?=
Date: Fri, 21 Nov 2025 17:22:18 +0100
Subject: [PATCH 27/27] fixes in tests for opencl

---
 tests/test_capi.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tests/test_capi.py b/tests/test_capi.py
index 29518eb..096750d 100644
--- a/tests/test_capi.py
+++ b/tests/test_capi.py
@@ -724,10 +724,9 @@ class Cells(xo.Struct):
 
             if (j >= MAX_PARTICLES) {
                 *success = 0;
-                continue;
+            } else {
+                out_vals[i * MAX_PARTICLES + j] = val;
             }
-
-            out_vals[i * MAX_PARTICLES + j] = val;
             END_VECTORIZE;
         }
     }
@@ -750,7 +749,7 @@ class Cells(xo.Struct):
                 xo.Arg(xo.UInt64, pointer=True, name="out_vals"),
                 xo.Arg(xo.UInt8, pointer=True, name="success"),
             ],
-            n_threads=3,
+            n_threads=4,
         ),
         "kernel_Cells_get_particles": xo.Kernel(
             args=[
@@ -759,7 +758,6 @@ class Cells(xo.Struct):
                 xo.Arg(xo.Int64, name="i1"),
                 xo.Arg(xo.Int64, pointer=True, name="out"),
             ],
-            n_threads=3,
         ),
     }
 
@@ -778,7 +776,7 @@ class Cells(xo.Struct):
             test_context.kernels.kernel_Cells_get_particles(
                 obj=cells, i0=i, i1=j, out=result
             )
-            assert result == expected
+            assert result[0] == expected
 
     test_context.kernels.loop_over(
         cells=cells,
@@ -789,6 +787,6 @@ class Cells(xo.Struct):
     counts = test_context.nparray_from_context_array(counts)
     vals = test_context.nparray_from_context_array(vals)
 
-    assert success == 1
+    assert success[0] == 1
     assert np.all(counts == [2, 3, 4])
     assert np.all(vals == [1, 8, 0, 0, 9, 3, 2, 0, 4, 5, 6, 7])